need to clean this up
This commit is contained in:
parent
10dd6418b7
commit
875890a83a
|
@ -0,0 +1,2 @@
|
|||
__pycache__/
|
||||
*.pyc
|
Binary file not shown.
|
@ -0,0 +1,4 @@
|
|||
import leglib
|
||||
import fitz
|
||||
|
||||
# Open the volunteer book, run the parser over it, and close the document
# again even if parsing raises (the original never closed it).
with fitz.open("YIGVolunteerBook2024.pdf") as document:
    leglib.PdfParser(document).parse()
|
|
@ -0,0 +1,202 @@
|
|||
import fitz
|
||||
import math
|
||||
|
||||
from typing import Any
|
||||
|
||||
class FitzBlockWrapper:
    """Attribute-style view of one block tuple.

    In this file the tuples come from ``page.get_text("blocks")``.  A tuple
    is unpacked positionally as
    ``(x0, y0, x1, y1, text, block_number, block_type)``.  The four
    coordinates, the block number and the block type are converted with
    ``int()`` (fractions are truncated); ``text`` is stored unchanged.
    """

    def __init__(self, block):
        """Unpack *block* into attributes.

        Args:
            block: an iterable of exactly seven items in the order described
                in the class docstring.

        Raises:
            ValueError: if *block* does not hold exactly seven items, or a
                numeric field cannot be converted by ``int()``.
            TypeError: if *block* is not iterable.
        """
        x0, y0, x1, y1, self.text, block_number, block_type = block

        self.x0 = int(x0)
        self.x1 = int(x1)
        self.y0 = int(y0)
        self.y1 = int(y1)
        self.block_number = int(block_number)
        self.block_type = int(block_type)

    def __str__(self):
        """Return the string form of ``(x0, y0, x1, y1, text)``."""
        return str((self.x0, self.y0, self.x1, self.y1, self.text))

    def __repr__(self):
        """Return the same text as ``__str__`` so containers print readably."""
        return self.__str__()

    # The method was originally misspelled ``__repl__``, which Python never
    # calls; keep the old name as an alias in case anything invokes it directly.
    __repl__ = __repr__
|
||||
|
||||
class BillCode:
    """Parsed form of a bill code written roughly as ``"RSB/yy-c(c)-n(n)"``.

    The part before the ``/`` is the assembly code: its first letter selects
    the colour and its second letter the chamber; any further letters are not
    read.  The part after the ``/`` holds three dash-separated integers that
    become ``year``, ``committee`` and ``docketplacement``.

    A letter that is not in the lookup tables is stored as-is, so ``color``
    and ``assembly`` are always set and ``str()`` never fails on a parsed
    code.
    """

    # First letter of the assembly code -> colour name.
    _COLORS = {"R": "red", "W": "white", "B": "blue"}
    # Second letter of the assembly code -> chamber name.
    _ASSEMBLIES = {"S": "senate", "H": "house"}

    def __init__(self, text: str):
        """Parse *text* into its components.

        Args:
            text: the bill code; trailing whitespace is ignored.

        Raises:
            IndexError: if *text* has no ``/``, fewer than two characters
                before it, or fewer than three dash-separated fields after it.
            ValueError: if one of the three numeric fields is not an integer.
        """
        slash_parts = text.rstrip().split('/')
        assembly_code = slash_parts[0]
        number_parts = slash_parts[1].split('-')

        color_letter = assembly_code[0]
        self.color = self._COLORS.get(color_letter, color_letter)

        # Previously an unknown chamber letter left ``assembly`` undefined and
        # ``__str__`` crashed with AttributeError; fall back to the raw letter
        # exactly as the colour does.
        division_letter = assembly_code[1]
        self.assembly = self._ASSEMBLIES.get(division_letter, division_letter)

        self.year = int(number_parts[0])
        self.committee = int(number_parts[1])
        self.docketplacement = int(number_parts[2])

    def __str__(self):
        """Return ``"<color> <assembly> - <year>-<committee>-<docketplacement>"``."""
        return "{} {} - {}-{}-{}".format(
            self.color,
            self.assembly,
            str(self.year),
            str(self.committee),
            str(self.docketplacement),
        )
|
||||
|
||||
class Bill:
    """A bill: its code plus the sponsors, subcommittee and school strings."""

    def __init__(
        self,
        code: str | BillCode,
        sponsors: str,
        subcommittee: str,
        school: str,
    ):
        """Store the bill's fields.

        Args:
            code: either a ready-made ``BillCode`` or a string, which is
                passed to ``BillCode`` for parsing.
            sponsors: sponsor text; trailing whitespace is removed.
            subcommittee: subcommittee text; trailing whitespace is removed.
            school: school text; trailing whitespace is removed.
        """
        # Strings are parsed; anything else is stored unchanged.
        self.code = BillCode(code) if isinstance(code, str) else code

        self.sponsors, self.subcommittee, self.school = (
            sponsors.rstrip(),
            subcommittee.rstrip(),
            school.rstrip(),
        )
|
||||
|
||||
class PdfParser:
    """Pull the text blocks of the "legislative" pages out of a PDF document.

    Section pages are detected by a set of marker words plus an image count.
    The pages strictly between consecutive markers are treated as legislative
    pages, and their text blocks are grouped by splitting on the first block's
    text.
    """

    # A page only counts as a section page when it carries this many images.
    _SECTION_PAGE_IMAGE_COUNT = 3

    def __init__(self, document: fitz.Document):
        """Remember *document*; nothing is read until ``parse`` is called."""
        self.document = document

    @staticmethod
    def _words_in_superstring(words: list[str], superstring: str) -> bool:
        """Return True when every word occurs in *superstring*, ignoring case.

        This is a substring test, not a whole-word test.  An empty *words*
        list yields True.
        """
        # Lower-case the haystack once instead of once per word.
        haystack = str(superstring).lower()
        return all(str(word).lower() in haystack for word in words)

    def _generate_legislative_pages_list(self, sections: list[int]) -> list[int]:
        """Return the page numbers lying strictly between consecutive markers.

        Args:
            sections: an array of section pages plus the last page.

        Returns:
            For each adjacent pair ``(a, b)`` in *sections*, the numbers
            ``a + 1`` up to ``b - 1``, concatenated in order.  Fewer than two
            markers gives an empty list.
        """
        legislative_pages: list[int] = []
        # Walk adjacent pairs directly rather than indexing until IndexError.
        for start, end in zip(sections, sections[1:]):
            legislative_pages.extend(range(start + 1, end))
        return legislative_pages

    def _generate_section_markers(self, document: fitz.Document) -> list[int]:
        """Scan *document* and return the page numbers used as section markers.

        A page is recorded when its text contains all the section marker words
        and it has exactly ``_SECTION_PAGE_IMAGE_COUNT`` images.  A page whose
        text contains "ABCs" is recorded as well, but only once more than two
        markers have already been collected.  Progress is printed per page.
        """
        section_pages: list[int] = []
        for page in document:
            # Match against the text itself; the original encoded it to bytes
            # and then searched the bytes *repr*.
            text = page.get_text()
            is_section_page = self._words_in_superstring(
                words=["Committee", "YMCA", "Tennessee", "Youth", "in"],
                superstring=text,
            )
            is_last_page = self._words_in_superstring(
                words=["ABCs"],
                superstring=text,
            )

            # Query the image list once per page instead of up to three times.
            images = page.get_images()
            has_section_images = len(images) == self._SECTION_PAGE_IMAGE_COUNT

            print("page number {} contains sentinel? {}".format(page.number, is_section_page))
            if has_section_images:
                print("page {} has three images!".format(page.number))
                print(images)

            if is_section_page and has_section_images:
                section_pages.append(page.number)

            if is_last_page and len(section_pages) > 2:
                section_pages.append(page.number)

        return section_pages

    def _get_block_info_from_page(self, page: fitz.Page):
        """Wrap every tuple of ``page.get_text("blocks")`` in a FitzBlockWrapper."""
        return [FitzBlockWrapper(raw) for raw in page.get_text("blocks")]

    @staticmethod
    def _remove_image_blocks(blocks: list[FitzBlockWrapper]) -> list[FitzBlockWrapper]:
        """Keep only the blocks whose ``block_type`` is 0.

        NOTE(review): per the method name the other types are image blocks;
        confirm against the PyMuPDF block-type documentation.
        """
        return [block for block in blocks if block.block_type == 0]

    @staticmethod
    def _remove_coordinate_information(blocks: list[FitzBlockWrapper]) -> list[str]:
        """Return just the ``text`` of each block, in the same order."""
        return [block.text for block in blocks]

    @staticmethod
    def _get_info_from_block(block, lat: int):
        """Return the items of *block* whose first element floors to *lat*.

        Items are indexed with ``[0]``, so they must be sequences such as raw
        block tuples, not FitzBlockWrapper instances.
        """
        return [item for item in block if math.floor(item[0]) == lat]

    @staticmethod
    def _split_list_by_element(arr: list[Any], pivot: Any):
        """Split *arr* into sub-lists at every element equal to *pivot*.

        The pivot elements themselves are dropped.  The result always holds at
        least one list, and a pivot at the start, at the end, or repeated
        back-to-back produces an empty list at that position.
        """
        output = []
        current = []
        for item in arr:
            if item == pivot:
                output.append(current)
                current = []
            else:
                current.append(item)

        output.append(current)
        return output

    def parse(self):
        """Collect the legislative text blocks and group them per bill.

        The text of the very first block is taken to be the header that
        starts every bill, and the block texts are split on it.  The first
        20 groups are printed.

        Returns:
            The list of groups (each a list of block texts), or an empty list
            when no text blocks were found.
        """
        section_pages = self._generate_section_markers(self.document)
        legislative_pages = self._generate_legislative_pages_list(section_pages)

        blocks: list[FitzBlockWrapper] = []
        for page_number in legislative_pages:
            page = self.document.load_page(page_number)
            blocks.extend(self._get_block_info_from_page(page))

        texts = self._remove_coordinate_information(
            self._remove_image_blocks(blocks)
        )
        # Nothing to split; the original crashed here with IndexError.
        if not texts:
            return []

        bill_header = texts[0]
        groups = self._split_list_by_element(texts, bill_header)

        # Preview output: only the first 20 groups are shown.
        for group in groups[:20]:
            print(group)

        return groups
|
Loading…
Reference in New Issue