diff --git a/analyser.py b/analyser.py
index b47d81e..08b5eb4 100644
--- a/analyser.py
+++ b/analyser.py
@@ -1,5 +1,5 @@
 import leglib
-import fitz
 
-leglib.PdfParser(fitz.open("YIGVolunteerBook2024.pdf")).parse()
-leglib.PdfParser.from_filename("YIGVolunteerBook2024.pdf").parse()
+parser = leglib.parsers.HSYIGPdfParser.from_filename("YIGVolunteerBook2024.pdf")
+parser.parse()
+print([i.bill_text for i in parser.bills])
diff --git a/common.py b/common.py
new file mode 100644
index 0000000..2382ecc
--- /dev/null
+++ b/common.py
@@ -0,0 +1,79 @@
+from enum import StrEnum, auto
+
+class CCEColors(StrEnum):
+    Red = "Red"
+    White = "White",
+    Blue = "Blue"
+
+class CCEAssemblies(StrEnum):
+    Senate = "Senate",
+    House = "House",
+    GeneralAssembly = "GeneralAssembly"
+
+class BillCode:
+    def __init__(self, text: str):
+        # try to parse
+        # codes are in this rough format: "RSB/yy-c(c)-n(n)"
+
+        text = text.rstrip()
+        slashsplit = text.split('/')
+        dashsplit = slashsplit[1].split('-')
+
+        assemblycode = slashsplit[0]
+
+        self.color = assemblycode[0]
+        if self.color == "R":
+            self.color = CCEColors.Red
+        elif self.color == "W":
+            self.color = CCEColors.White
+        elif self.color == "B":
+            self.color = CCEColors.Blue
+
+        assemblydivision = assemblycode[1]
+        if assemblydivision == "S":
+            self.assembly = CCEAssemblies.Senate
+        elif assemblydivision == "H":
+            self.assembly = CCEAssemblies.House
+        elif assemblydivision == "G":
+            self.assembly = CCEAssemblies.GeneralAssembly
+
+        self.year = int(dashsplit[0])
+        self.committee = int(dashsplit[1])
+        self.docketplacement = int(dashsplit[2])
+
+        self.stringrep = self.color[0].upper() + \
+            self.assembly[0].upper() + \
+            "B/{}-{}-{}".format(
+                str(self.year),
+                str(self.committee),
+                str(self.docketplacement)
+            )
+
+    def __str__(self):
+        return "{} {} - {}-{}-{}".format(
+            self.color,
+            self.assembly,
+            str(self.year),
+            str(self.committee),
+            str(self.docketplacement)
+        )
+
+class Bill:
+    def __init__(self,
+        code: str | BillCode,
+        sponsors: str,
+        subcommittee: str,
+        school: str,
+        bill_text: list[str],
+        title: str
+    ):
+        if isinstance(code, str):
+            self.code = BillCode(code)
+        else:
+            self.code = code
+
+        self.sponsors = sponsors.rstrip()
+        self.subcommittee = subcommittee.rstrip()
+        self.school = school.rstrip()
+        self.bill_text = bill_text
+        self.title = title
diff --git a/leglib.py b/leglib.py
index 0cabd56..32778c1 100644
--- a/leglib.py
+++ b/leglib.py
@@ -1,274 +1,5 @@
 import fitz
 import math
-from enum import StrEnum, auto
-
-class CCEColors(StrEnum):
-    Red = "Red"
-    White = "White",
-    Blue = "Blue"
-
-class CCEAssemblies(StrEnum):
-    Senate = "Senate",
-    House = "House",
-    GeneralAssembly = "GeneralAssembly"
-
 from typing import Any
 
-
-class FitzBlockWrapper:
-    def __init__(self, block):
-        self.x0, self.y0, self.x1, \
-            self.y1, self.text, \
-            self.block_number, self.block_type = block
-
-        self.x0 = int(self.x0)
-        self.x1 = int(self.x1)
-        self.y0 = int(self.y0)
-        self.y1 = int(self.y1)
-        self.block_number = int(self.block_number)
-        self.block_type = int(self.block_type)
-
-    def __str__(self):
-        return str((
-            self.x0, self.y0, self.x1, self.y1, self.text
-        ))
-
-    def __repl__(self):
-        return self.__str__()
-
-class BillCode:
-    def __init__(self, text: str):
-        # try to parse
-        # codes are in this rough format: "RSB/yy-c(c)-n(n)"
-
-        text = text.rstrip()
-        slashsplit = text.split('/')
-        dashsplit = slashsplit[1].split('-')
-
-        assemblycode = slashsplit[0]
-
-        self.color = assemblycode[0]
-        if self.color == "R":
-            self.color = CCEColors.Red
-        elif self.color == "W":
-            self.color = CCEColors.White
-        elif self.color == "B":
-            self.color = CCEColors.Blue
-
-        assemblydivision = assemblycode[1]
-        if assemblydivision == "S":
-            self.assembly = CCEAssemblies.Senate
-        elif assemblydivision == "H":
-            self.assembly = CCEAssemblies.House
-        elif assemblydivision == "G":
-            self.assembly = CCEAssemblies.GeneralAssembly
-
-        self.year = int(dashsplit[0])
-        self.committee = int(dashsplit[1])
-        self.docketplacement = int(dashsplit[2])
-
-        self.stringrep = self.color[0].upper() + \
-            self.assembly[0].upper() + \
-            "B/{}-{}-{}".format(
-                str(self.year),
-                str(self.committee),
-                str(self.docketplacement)
-            )
-
-    def __str__(self):
-        return "{} {} - {}-{}-{}".format(
-            self.color,
-            self.assembly,
-            str(self.year),
-            str(self.committee),
-            str(self.docketplacement)
-        )
-
-class Bill:
-    def __init__(self,
-        code: str | BillCode,
-        sponsors: str,
-        subcommittee: str,
-        school: str,
-        bill_text: list[str],
-        title: str
-    ):
-        if isinstance(code, str):
-            self.code = BillCode(code)
-        else:
-            self.code = code
-
-        self.sponsors = sponsors.rstrip()
-        self.subcommittee = subcommittee.rstrip()
-        self.school = school.rstrip()
-        self.bill_text = bill_text
-        self.title = title
-
-class PdfParser:
-    def __init__(self, document: fitz.Document):
-        self.document = document
-
-    @staticmethod
-    def _words_in_superstring(words: list[str], superstring: str) -> bool:
-        for word in words:
-            if not str(word).lower() in str(superstring).lower():
-                return False
-        return True
-
-    def _generate_legislative_pages_list(self, sections: list[int]) -> list[int]:
-        """
-        sections is an array of section pages plus the last page.
-        """
-        current = 0
-        legislative_pages: list[int] = []
-        try:
-            while True:
-                legislative_pages += list(
-                    range(
-                        sections[current] + 1,
-                        sections[current + 1],
-                        1
-                    )
-                )
-
-                current += 1
-        except IndexError:
-            pass
-
-        return legislative_pages
-
-    def _generate_section_markers(self, document: fitz.Document) -> list[int]:
-        section_pages = []
-        for page in document:
-            text = page.get_text().encode("utf8")
-            is_section_page = self._words_in_superstring(
-                words=[ "Committee", "YMCA", "Tennessee", "Youth", "in" ],
-                superstring=text
-            )
-            is_last_page = self._words_in_superstring(
-                words=[ "ABCs" ],
-                superstring=text
-            )
-            print("page number {} contains sentintal? {}".format(page.number, is_section_page))
-            if len(page.get_images()) == 3:
-                print("page {} has one image!".format(page.number))
-                print(page.get_images())
-
-            if is_section_page and len(page.get_images()) == 3:
-                section_pages.append(page.number)
-
-            if is_last_page and len(section_pages) > 2:
-                section_pages.append(page.number)
-
-        return section_pages
-
-    def _get_block_info_from_page(self, page: fitz.Page):
-        return [FitzBlockWrapper(i) for i in page.get_text("blocks")]
-
-    @staticmethod
-    def _remove_image_blocks(blocks: list[FitzBlockWrapper]) -> list[FitzBlockWrapper]:
-        to_return: list[FitzBlockWrapper] = []
-        for block in blocks:
-            if block.block_type == 0:
-                to_return.append(block)
-
-        return to_return
-
-    @staticmethod
-    def _remove_coordinate_information(blocks: list[FitzBlockWrapper]) -> list[FitzBlockWrapper]:
-        to_return: list[str] = []
-        for block in blocks:
-            to_return.append(block.text)
-
-        return to_return
-
-    @staticmethod
-    def _get_info_from_block(block, lat: int):
-        to_return = []
-        for i in block:
-            if math.floor(i[0]) == lat:
-                to_return.append(i)
-        return to_return
-
-    @staticmethod
-    def _split_list_by_element(arr: list[Any], pivot: Any):
-        output = []
-        current = []
-        for i in arr:
-            if i == pivot:
-                output.append(current)
-                current = []
-            else:
-                current.append(i)
-
-        output.append(current)
-        return output
-
-    def parse(self):
-        section_pages = self._generate_section_markers(self.document)
-        legislative_pages = self._generate_legislative_pages_list(section_pages)
-        joined_blocks: list[FitzBlockWrapper] = []
-        for page_number in legislative_pages:
-            page = self.document.load_page(page_number)
-            block_info = self._get_block_info_from_page(page)
-
-            joined_blocks += block_info[:-1] # remove the page number at the end of every page
-
-        joined_blocks = self._remove_image_blocks(joined_blocks)
-        joined_blocks = self._remove_coordinate_information(joined_blocks)
-
-        bill_header = joined_blocks[0]
-
-        splitted = self._split_list_by_element(joined_blocks, bill_header)
-
-        bills: list[Bill] = []
-        for splitted_item in splitted:
-            try:
-                bill_code, _, _, subcommittee, sponsors, school, *bill_text = splitted_item
-            except ValueError:
-                continue
-
-            bill_text = ' '.join(bill_text)
-
-            print(type(bill_text))
-
-            pretty_printed = self._pretty_print_bill_text(bill_text)
-            bills.append(Bill(
-                code=bill_code,
-                subcommittee=subcommittee,
-                sponsors=sponsors,
-                school=school,
-                bill_text=pretty_printed["bill_array"],
-                title=pretty_printed["title"]
-            ))
-
-        self.bills = bills
-
-    @staticmethod
-    def _find_first_line_number(bill_arrays):
-        for i in range(len(bill_arrays)):
-            try:
-                if str(int(bill_arrays[i])) == bill_arrays[i]:
-                    return i
-            except ValueError:
-                pass
-
-    def _pretty_print_bill_text(self, bill_text: str):
-        replaced = bill_text.replace("� ", "\n")
-        replaced = replaced.split('\n')
-
-        replaced = [i.rstrip().lstrip() for i in replaced]
-
-        first_line_number = self._find_first_line_number(replaced)
-
-        title = ' '.join(replaced[:first_line_number])
-        rebuilt = replaced[first_line_number:][1::2]
-
-        return {
-            "title": title.lstrip(),
-            "bill_array": rebuilt
-        }
-
-    @classmethod
-    def from_filename(cls, filename: str) -> Any: # TODO: fix this so it shows PdfParser
-        return cls(fitz.open(filename))
+import parsers
diff --git a/lib.py b/lib.py
new file mode 100644
index 0000000..cf38e95
--- /dev/null
+++ b/lib.py
@@ -0,0 +1,21 @@
+class FitzBlockWrapper:
+    """Unpack a 7-item block record into named attributes, casting coordinates and ids to int."""
+    def __init__(self, block):
+        self.x0, self.y0, self.x1, \
+            self.y1, self.text, \
+            self.block_number, self.block_type = block
+
+        self.x0 = int(self.x0)
+        self.x1 = int(self.x1)
+        self.y0 = int(self.y0)
+        self.y1 = int(self.y1)
+        self.block_number = int(self.block_number)
+        self.block_type = int(self.block_type)
+
+    def __str__(self):
+        return str((
+            self.x0, self.y0, self.x1, self.y1, self.text
+        ))
+
+    def __repr__(self):
+        return self.__str__()
diff --git a/parsers.py b/parsers.py
new file mode 100644
index 0000000..d85c2a5
--- /dev/null
+++ b/parsers.py
@@ -0,0 +1,181 @@
+import fitz
+import math
+from typing import Any
+
+from lib import FitzBlockWrapper
+from common import Bill
+
+class HSYIGPdfParser:
+    """Parse the pages between section markers of a fitz document into Bill objects (self.bills)."""
+    def __init__(self, document: fitz.Document):
+        self.document = document
+
+    @staticmethod
+    def _words_in_superstring(words: list[str], superstring: str) -> bool:
+        """Return True when every word occurs, case-insensitively, in str(superstring)."""
+        for word in words:
+            if not str(word).lower() in str(superstring).lower():
+                return False
+        return True
+
+    def _generate_legislative_pages_list(self, sections: list[int]) -> list[int]:
+        """
+        sections is an array of section pages plus the last page.
+        """
+        current = 0
+        legislative_pages: list[int] = []
+        try:
+            while True:
+                legislative_pages += list(
+                    range(
+                        sections[current] + 1,
+                        sections[current + 1],
+                        1
+                    )
+                )
+
+                current += 1
+        except IndexError:
+            pass
+
+        return legislative_pages
+
+    def _generate_section_markers(self, document: fitz.Document) -> list[int]:
+        section_pages = []
+        for page in document:
+            text = page.get_text().encode("utf8")
+            is_section_page = self._words_in_superstring(
+                words=[ "Committee", "YMCA", "Tennessee", "Youth", "in" ],
+                superstring=text
+            )
+            is_last_page = self._words_in_superstring(
+                words=[ "ABCs" ],
+                superstring=text
+            )
+#            print("page number {} contains sentintal? {}".format(page.number, is_section_page))
+#            if len(page.get_images()) == 3:
+#                print("page {} has one image!".format(page.number))
+#                print(page.get_images())
+
+            if is_section_page and len(page.get_images()) == 3:
+                section_pages.append(page.number)
+
+            if is_last_page and len(section_pages) > 2:
+                section_pages.append(page.number)
+
+        return section_pages
+
+    def _get_block_info_from_page(self, page: fitz.Page):
+        return [FitzBlockWrapper(i) for i in page.get_text("blocks")]
+
+    @staticmethod
+    def _remove_image_blocks(blocks: list[FitzBlockWrapper]) -> list[FitzBlockWrapper]:
+        to_return: list[FitzBlockWrapper] = []
+        for block in blocks:
+            if block.block_type == 0:
+                to_return.append(block)
+
+        return to_return
+
+    @staticmethod
+    def _remove_coordinate_information(blocks: list[FitzBlockWrapper]) -> list[str]:
+        to_return: list[str] = []
+        for block in blocks:
+            to_return.append(block.text)
+
+        return to_return
+
+    @staticmethod
+    def _get_info_from_block(block, lat: int):
+        to_return = []
+        for i in block:
+            if math.floor(i[0]) == lat:
+                to_return.append(i)
+        return to_return
+
+    @staticmethod
+    def _split_list_by_element(arr: list[Any], pivot: Any):
+        """Split arr into sub-lists at every element equal to pivot; the pivot itself is dropped."""
+        output = []
+        current = []
+        for i in arr:
+            if i == pivot:
+                output.append(current)
+                current = []
+            else:
+                current.append(i)
+
+        output.append(current)
+        return output
+
+    def parse(self):
+        """Collect text blocks from the legislative pages and store the parsed bills on self.bills."""
+        section_pages = self._generate_section_markers(self.document)
+        legislative_pages = self._generate_legislative_pages_list(section_pages)
+        joined_blocks: list[FitzBlockWrapper] = []
+        for page_number in legislative_pages:
+            page = self.document.load_page(page_number)
+            block_info = self._get_block_info_from_page(page)
+
+            joined_blocks += block_info[:-1] # remove the page number at the end of every page
+
+        joined_blocks = self._remove_image_blocks(joined_blocks)
+        joined_blocks = self._remove_coordinate_information(joined_blocks)
+
+        bill_header = joined_blocks[0]
+
+        splitted = self._split_list_by_element(joined_blocks, bill_header)
+
+        bills: list[Bill] = []
+        for splitted_item in splitted:
+            try:
+                bill_code, _, _, subcommittee, sponsors, school, *bill_text = splitted_item
+            except ValueError:
+                continue
+
+            bill_text = ' '.join(bill_text)
+
+#            print(type(bill_text))
+
+            pretty_printed = self._pretty_print_bill_text(bill_text)
+            bills.append(Bill(
+                code=bill_code,
+                subcommittee=subcommittee,
+                sponsors=sponsors,
+                school=school,
+                bill_text=pretty_printed["bill_array"],
+                title=pretty_printed["title"]
+            ))
+
+        self.bills = bills
+
+    @staticmethod
+    def _find_first_line_number(bill_arrays):
+        """Return the index of the first entry x with str(int(x)) == x, or None if there is none."""
+        for i in range(len(bill_arrays)):
+            try:
+                if str(int(bill_arrays[i])) == bill_arrays[i]:
+                    return i
+            except ValueError:
+                pass
+
+    def _pretty_print_bill_text(self, bill_text: str):
+        replaced = bill_text.replace("�", "\n")
+        replaced = replaced.split('\n')
+
+        replaced = [i.rstrip().lstrip() for i in replaced]
+
+        first_line_number = self._find_first_line_number(replaced)
+
+        title = ' '.join(replaced[:first_line_number])
+        rebuilt = replaced[first_line_number:][1::2]
+
+        return {
+            "title": title.lstrip(),
+            "bill_array": rebuilt
+        }
+
+    @classmethod
+    def from_filename(cls, filename: str) -> "HSYIGPdfParser":
+        """Open filename with fitz.open and return a parser wrapping that document."""
+        return cls(fitz.open(filename))