import logging
import math
from collections.abc import Iterable
from enum import StrEnum, auto  # noqa: F401  (auto kept: other code may import it from here)
from typing import Any, Self

import fitz

logger = logging.getLogger(__name__)


class CCEColors(StrEnum):
    """Colour component of a bill code (its first letter: R, W or B)."""

    Red = "Red"
    White = "White"
    Blue = "Blue"


class CCEAssemblies(StrEnum):
    """Assembly component of a bill code (its second letter: S, H or G)."""

    Senate = "Senate"
    House = "House"
    GeneralAssembly = "GeneralAssembly"


class FitzBlockWrapper:
    """Attribute-style view of one 7-item block tuple.

    The tuple is unpacked positionally as
    ``(x0, y0, x1, y1, text, block_number, block_type)``, which is the shape
    of the items this file obtains from ``page.get_text("blocks")``.
    Coordinates and the two trailing numbers are truncated to ``int``.
    """

    def __init__(self, block):
        x0, y0, x1, y1, text, block_number, block_type = block
        self.x0 = int(x0)
        self.y0 = int(y0)
        self.x1 = int(x1)
        self.y1 = int(y1)
        self.text = text
        self.block_number = int(block_number)
        self.block_type = int(block_type)

    def __str__(self):
        return str((self.x0, self.y0, self.x1, self.y1, self.text))

    def __repr__(self):
        return self.__str__()

    # The method used to be misspelled ``__repl__`` (so ``repr()`` never used
    # it); the old name is kept as an alias for any code that called it.
    __repl__ = __repr__


class BillCode:
    """Parsed form of a bill code such as ``"RSB/23-1-5"``.

    Codes are in this rough format: ``"RSB/yy-c(c)-n(n)"`` -- colour letter,
    assembly letter, a third character that is ignored, a slash, then year,
    committee and docket placement separated by dashes.

    Attributes:
        color: A ``CCEColors`` member, or the raw first letter when it is not
            one of ``R``/``W``/``B`` (unknown colours are tolerated).
        assembly: A ``CCEAssemblies`` member.
        year, committee, docketplacement: The three numeric fields as ``int``.
        stringrep: The code rebuilt from the parsed parts.

    Raises:
        ValueError: If the text does not have the expected structure, the
            assembly letter is unknown, or a numeric field is not an integer.
    """

    _COLORS = {
        "R": CCEColors.Red,
        "W": CCEColors.White,
        "B": CCEColors.Blue,
    }
    _ASSEMBLIES = {
        "S": CCEAssemblies.Senate,
        "H": CCEAssemblies.House,
        "G": CCEAssemblies.GeneralAssembly,
    }

    def __init__(self, text: str):
        text = text.rstrip()
        slash_parts = text.split("/")
        if len(slash_parts) < 2:
            raise ValueError(f"bill code {text!r} has no '/' separator")

        assembly_code = slash_parts[0]
        if len(assembly_code) < 2:
            raise ValueError(
                f"bill code {text!r} needs a colour and an assembly letter"
            )

        # Only the part right after the first slash is used; fields beyond
        # the third dash-separated one are ignored.
        number_fields = slash_parts[1].split("-")
        if len(number_fields) < 3:
            raise ValueError(
                f"bill code {text!r} needs year-committee-docket numbers"
            )

        color_letter = assembly_code[0]
        # Unknown colour letters are kept verbatim rather than rejected.
        self.color = self._COLORS.get(color_letter, color_letter)

        assembly_letter = assembly_code[1]
        try:
            self.assembly = self._ASSEMBLIES[assembly_letter]
        except KeyError:
            raise ValueError(
                f"bill code {text!r} has unknown assembly letter "
                f"{assembly_letter!r}"
            ) from None

        self.year = int(number_fields[0])
        self.committee = int(number_fields[1])
        self.docketplacement = int(number_fields[2])

        self.stringrep = (
            f"{self.color[0].upper()}{self.assembly[0].upper()}"
            f"B/{self.year}-{self.committee}-{self.docketplacement}"
        )

    def __str__(self):
        return (
            f"{self.color} {self.assembly} - "
            f"{self.year}-{self.committee}-{self.docketplacement}"
        )


class Bill:
    """One bill: its code, header fields, title and text lines.

    ``code`` may be given as a string, in which case it is parsed with
    ``BillCode``. Trailing whitespace is stripped from ``sponsors``,
    ``subcommittee`` and ``school``; ``bill_text`` and ``title`` are stored
    unchanged.
    """

    def __init__(
        self,
        code: str | BillCode,
        sponsors: str,
        subcommittee: str,
        school: str,
        bill_text: list[str],
        title: str,
    ):
        self.code = BillCode(code) if isinstance(code, str) else code
        self.sponsors = sponsors.rstrip()
        self.subcommittee = subcommittee.rstrip()
        self.school = school.rstrip()
        self.bill_text = bill_text
        self.title = title


class PdfParser:
    """Extract ``Bill`` objects from a PDF opened with PyMuPDF (``fitz``).

    Call ``parse()`` and then read the result from ``self.bills``.
    """

    # A section page must contain all of these words (case-insensitive) ...
    _SECTION_PAGE_WORDS = ("Committee", "YMCA", "Tennessee", "Youth", "in")
    # ... and exactly this many images.
    _SECTION_PAGE_IMAGE_COUNT = 3
    # A page containing all of these words closes the last section.
    _LAST_PAGE_WORDS = ("ABCs",)
    # U+FFFD REPLACEMENT CHARACTER followed by a space; treated as a line
    # break when splitting the text of a bill.
    _LINE_BREAK_MARKER = "\ufffd "
    # Number of header fields a chunk must have before the bill text starts.
    _MIN_BILL_FIELDS = 6

    def __init__(self, document: fitz.Document):
        self.document = document
        # Filled in by parse(); defined here so the attribute always exists.
        self.bills: list[Bill] = []

    @staticmethod
    def _words_in_superstring(words: Iterable[str], superstring: str) -> bool:
        """Return True if every word occurs in ``superstring``, ignoring case.

        ``bytes`` input is decoded as UTF-8 (undecodable bytes replaced)
        instead of being matched against its ``repr``.
        """
        if isinstance(superstring, (bytes, bytearray)):
            haystack = bytes(superstring).decode("utf8", errors="replace")
        else:
            haystack = str(superstring)
        haystack = haystack.lower()
        return all(str(word).lower() in haystack for word in words)

    def _generate_legislative_pages_list(self, sections: list[int]) -> list[int]:
        """Return the page numbers strictly between consecutive markers.

        ``sections`` is an array of section pages plus the last page. The
        marker pages themselves are excluded. Fewer than two markers yields
        an empty list.
        """
        legislative_pages: list[int] = []
        for start, end in zip(sections, sections[1:]):
            legislative_pages.extend(range(start + 1, end))
        return legislative_pages

    def _generate_section_markers(self, document: fitz.Document) -> list[int]:
        """Return the page numbers that delimit sections of ``document``.

        A page is recorded as a section page when it contains all the section
        words and has exactly ``_SECTION_PAGE_IMAGE_COUNT`` images. A page
        containing the last-page words is recorded as well, but only once
        more than two markers have already been collected.
        """
        section_pages: list[int] = []
        for page in document:
            text = page.get_text()
            image_count = len(page.get_images())
            has_section_words = self._words_in_superstring(
                words=self._SECTION_PAGE_WORDS, superstring=text
            )
            is_last_page = self._words_in_superstring(
                words=self._LAST_PAGE_WORDS, superstring=text
            )
            logger.debug(
                "page %s: section words=%s, last-page words=%s, images=%d",
                page.number, has_section_words, is_last_page, image_count,
            )

            if (
                has_section_words
                and image_count == self._SECTION_PAGE_IMAGE_COUNT
            ):
                section_pages.append(page.number)
            if is_last_page and len(section_pages) > 2:
                section_pages.append(page.number)
        return section_pages

    def _get_block_info_from_page(self, page: fitz.Page) -> list[FitzBlockWrapper]:
        """Wrap every block of ``page.get_text("blocks")``."""
        return [FitzBlockWrapper(block) for block in page.get_text("blocks")]

    @staticmethod
    def _remove_image_blocks(blocks: list[FitzBlockWrapper]) -> list[FitzBlockWrapper]:
        """Keep only the blocks whose ``block_type`` is 0."""
        return [block for block in blocks if block.block_type == 0]

    @staticmethod
    def _remove_coordinate_information(blocks: list[FitzBlockWrapper]) -> list[str]:
        """Return just the ``text`` of each block."""
        return [block.text for block in blocks]

    @staticmethod
    def _get_info_from_block(block, lat: int):
        """Return the items of ``block`` whose first element floors to ``lat``."""
        return [item for item in block if math.floor(item[0]) == lat]

    @staticmethod
    def _split_list_by_element(arr: list[Any], pivot: Any):
        """Split ``arr`` into sub-lists at every element equal to ``pivot``.

        The pivot elements are dropped. The result always has one more
        sub-list than there are pivots, so it may contain empty lists.
        """
        output = []
        current = []
        for item in arr:
            if item == pivot:
                output.append(current)
                current = []
            else:
                current.append(item)
        output.append(current)
        return output

    def parse(self):
        """Parse the document and store the bills found in ``self.bills``.

        The text blocks of all legislative pages are concatenated, then split
        on the first block's text, which is treated as the header repeated at
        the start of every bill. Chunks with fewer than six fields are
        skipped. Returns None.

        Raises:
            ValueError: If a chunk's bill code cannot be parsed by
                ``BillCode``.
        """
        section_pages = self._generate_section_markers(self.document)
        legislative_pages = self._generate_legislative_pages_list(section_pages)

        page_blocks: list[FitzBlockWrapper] = []
        for page_number in legislative_pages:
            page = self.document.load_page(page_number)
            # remove the page number at the end of every page
            page_blocks += self._get_block_info_from_page(page)[:-1]

        texts = self._remove_coordinate_information(
            self._remove_image_blocks(page_blocks)
        )
        if not texts:
            # No text blocks were found, so there is no header to split on.
            self.bills = []
            return

        bill_header = texts[0]
        bills: list[Bill] = []
        for chunk in self._split_list_by_element(texts, bill_header):
            if len(chunk) < self._MIN_BILL_FIELDS:
                # Too short to hold the header fields (this includes the
                # empty chunk before the first header).
                continue
            bill_code, _, _, subcommittee, sponsors, school, *text_parts = chunk
            pretty_printed = self._pretty_print_bill_text(" ".join(text_parts))
            bills.append(Bill(
                code=bill_code,
                subcommittee=subcommittee,
                sponsors=sponsors,
                school=school,
                bill_text=pretty_printed["bill_array"],
                title=pretty_printed["title"],
            ))
        self.bills = bills

    @staticmethod
    def _find_first_line_number(bill_arrays) -> int | None:
        """Return the index of the first item that is a canonical integer.

        An item qualifies when ``str(int(item)) == item`` (so ``"12"`` does,
        ``"012"`` and ``"12."`` do not). Returns None if no item qualifies.
        """
        for index, item in enumerate(bill_arrays):
            try:
                if str(int(item)) == item:
                    return index
            except ValueError:
                continue
        return None

    def _pretty_print_bill_text(self, bill_text: str) -> dict[str, Any]:
        """Split raw bill text into a title and its text lines.

        Everything before the first line number is joined into the title.
        From the first line number on, items are taken to alternate between a
        line number and a line of text, and only the text is kept.

        Returns:
            A dict with ``"title"`` (str) and ``"bill_array"`` (list of str).
            When no line number is found, the whole text becomes the title
            and ``"bill_array"`` is empty.
        """
        pieces = [
            piece.strip()
            for piece in bill_text.replace(self._LINE_BREAK_MARKER, "\n").split("\n")
        ]
        first_line_number = self._find_first_line_number(pieces)
        if first_line_number is None:
            return {"title": " ".join(pieces).lstrip(), "bill_array": []}

        title = " ".join(pieces[:first_line_number])
        # Skip the line number itself, then take every second item.
        bill_lines = pieces[first_line_number + 1::2]
        return {"title": title.lstrip(), "bill_array": bill_lines}

    @classmethod
    def from_filename(cls, filename: str) -> Self:
        """Open ``filename`` with ``fitz.open`` and wrap it in a parser."""
        return cls(fitz.open(filename))