import math
from typing import Any

import fitz

from lib import FitzBlockWrapper
from common import Bill


class HSYIGPdfParser:
    """Extract ``Bill`` records from a PDF opened with PyMuPDF (``fitz``).

    Typical use::

        parser = HSYIGPdfParser.from_filename("book.pdf")
        parser.parse()
        parser.bills  # list[Bill]

    ``parse`` locates the section marker pages, gathers the text blocks of
    the pages lying between consecutive markers, and splits that stream of
    blocks into individual bills.
    """

    def __init__(self, document: fitz.Document):
        """Store the open document; no parsing happens until ``parse``."""
        self.document = document
        # Filled in by parse(). Starts empty so the attribute always exists.
        self.bills: list[Bill] = []

    @staticmethod
    def _words_in_superstring(words: list[str], superstring: str) -> bool:
        """Return True if every word occurs in ``superstring``.

        The comparison is a case-insensitive substring test. Both sides are
        passed through ``str()`` first, so non-string values are compared by
        their string form. An empty ``words`` list yields True.
        """
        # Lower-case the haystack once instead of once per word.
        haystack = str(superstring).lower()
        return all(str(word).lower() in haystack for word in words)

    def _generate_legislative_pages_list(self, sections: list[int]) -> list[int]:
        """Return the page numbers strictly between consecutive markers.

        ``sections`` is an array of section pages plus the last page. For
        each adjacent pair ``(a, b)`` the pages ``a + 1 .. b - 1`` are
        emitted; the marker pages themselves are excluded. Fewer than two
        markers yields an empty list.
        """
        legislative_pages: list[int] = []
        for start, end in zip(sections, sections[1:]):
            legislative_pages.extend(range(start + 1, end))
        return legislative_pages

    def _generate_section_markers(self, document: fitz.Document) -> list[int]:
        """Return the page numbers of section pages and of the last page.

        A page is recorded as a section page when its text contains all of
        the section keywords and it has exactly three images. A page whose
        text contains "ABCs" is recorded as the closing marker once more
        than two markers have been collected.
        """
        section_words = ["Committee", "YMCA", "Tennessee", "Youth", "in"]
        last_page_words = ["ABCs"]

        section_pages: list[int] = []
        for page in document:
            # Match against the text itself. Encoding it to bytes first made
            # the matcher search the bytes repr ("b'...'") instead.
            text = page.get_text()
            is_section_page = self._words_in_superstring(
                words=section_words,
                superstring=text,
            )
            is_last_page = self._words_in_superstring(
                words=last_page_words,
                superstring=text,
            )

            # NOTE(review): presumably section cover pages carry exactly
            # three images, which separates them from ordinary pages that
            # happen to contain the same words -- confirm against the PDF.
            if is_section_page and len(page.get_images()) == 3:
                section_pages.append(page.number)

            # This check runs after the append above, so a page matching
            # both tests can be recorded twice.
            if is_last_page and len(section_pages) > 2:
                section_pages.append(page.number)

        return section_pages

    def _get_block_info_from_page(self, page: fitz.Page) -> list[FitzBlockWrapper]:
        """Wrap every entry of ``page.get_text("blocks")``."""
        return [FitzBlockWrapper(i) for i in page.get_text("blocks")]

    @staticmethod
    def _remove_image_blocks(blocks: list[FitzBlockWrapper]) -> list[FitzBlockWrapper]:
        """Keep only the blocks whose ``block_type`` is 0.

        NOTE(review): presumably ``block_type`` mirrors PyMuPDF's block type
        field, where 0 is text and 1 is image -- confirm in FitzBlockWrapper.
        """
        return [block for block in blocks if block.block_type == 0]

    @staticmethod
    def _remove_coordinate_information(blocks: list[FitzBlockWrapper]) -> list[str]:
        """Reduce each block to its ``text`` attribute."""
        return [block.text for block in blocks]

    @staticmethod
    def _get_info_from_block(block, lat: int):
        """Return the entries of ``block`` whose first element floors to ``lat``."""
        return [i for i in block if math.floor(i[0]) == lat]

    @staticmethod
    def _split_list_by_element(arr: list[Any], pivot: Any):
        """Split ``arr`` into sub-lists at every element equal to ``pivot``.

        The pivot elements are dropped. The result always has one more
        sub-list than there are pivots, so leading, trailing, or adjacent
        pivots produce empty sub-lists.
        """
        output = []
        current = []
        for i in arr:
            if i == pivot:
                output.append(current)
                current = []
            else:
                current.append(i)
        output.append(current)
        return output

    def parse(self) -> None:
        """Parse the document and store the result in ``self.bills``.

        The text of the first block on the legislative pages is treated as
        the header that starts every bill, and the block stream is split on
        it. Each chunk must hold at least six blocks: the bill code, two
        ignored blocks, the subcommittee, the sponsors, and the school.
        Any further blocks form the bill text. Shorter chunks are skipped.
        If no text blocks are found, ``self.bills`` is set to an empty list.
        """
        section_pages = self._generate_section_markers(self.document)
        legislative_pages = self._generate_legislative_pages_list(section_pages)

        blocks: list[FitzBlockWrapper] = []
        for page_number in legislative_pages:
            page = self.document.load_page(page_number)
            block_info = self._get_block_info_from_page(page)
            blocks += block_info[:-1]  # remove the page number at the end of every page

        block_texts = self._remove_coordinate_information(
            self._remove_image_blocks(blocks)
        )

        bills: list[Bill] = []
        if block_texts:
            bill_header = block_texts[0]
            for chunk in self._split_list_by_element(block_texts, bill_header):
                try:
                    bill_code, _, _, subcommittee, sponsors, school, *bill_text = chunk
                except ValueError:
                    # Fewer than six blocks, e.g. the empty chunk that
                    # precedes the first header.
                    continue

                pretty_printed = self._pretty_print_bill_text(' '.join(bill_text))
                bills.append(Bill(
                    code=bill_code,
                    subcommittee=subcommittee,
                    sponsors=sponsors,
                    school=school,
                    bill_text=pretty_printed["bill_array"],
                    title=pretty_printed["title"]
                ))

        self.bills = bills

    @staticmethod
    def _find_first_line_number(bill_arrays):
        """Return the index of the first entry that is a canonical integer.

        An entry qualifies only if it round-trips through ``int`` unchanged,
        so "12" matches while "012", " 12" and "+12" do not. Returns ``None``
        when no entry qualifies.
        """
        for index, entry in enumerate(bill_arrays):
            try:
                if str(int(entry)) == entry:
                    return index
            except ValueError:
                pass
        return None

    def _pretty_print_bill_text(self, bill_text: str) -> dict[str, Any]:
        """Split raw bill text into its title and its body lines.

        Everything before the first line-number entry is joined into the
        title. From that entry onward the text alternates between a line
        number and a line, so every second entry is kept as the body.

        Returns a dict with keys ``"title"`` (str) and ``"bill_array"``
        (list of str). When no line number is found, all of the text becomes
        the title and ``"bill_array"`` is empty.
        """
        # NOTE(review): this separator appears as U+FFFD in the source;
        # confirm it is the character the PDF extraction really emits.
        lines = [i.strip() for i in bill_text.replace("�", "\n").split('\n')]

        first_line_number = self._find_first_line_number(lines)
        if first_line_number is None:
            # Without a line number there is no numbered body to separate.
            return {
                "title": ' '.join(lines).lstrip(),
                "bill_array": []
            }

        title = ' '.join(lines[:first_line_number])
        # Skip the line numbers themselves and keep the entry after each.
        rebuilt = lines[first_line_number + 1::2]
        return {
            "title": title.lstrip(),
            "bill_array": rebuilt
        }

    @classmethod
    def from_filename(cls, filename: str) -> "HSYIGPdfParser":
        """Open ``filename`` with ``fitz.open`` and return a parser for it."""
        return cls(fitz.open(filename))