diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..7a60b85 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +__pycache__/ +*.pyc diff --git a/YIGVolunteerBook2024.pdf b/YIGVolunteerBook2024.pdf new file mode 100644 index 0000000..acbc262 Binary files /dev/null and b/YIGVolunteerBook2024.pdf differ diff --git a/analyser.py b/analyser.py new file mode 100644 index 0000000..6c8ac0c --- /dev/null +++ b/analyser.py @@ -0,0 +1,4 @@ +import leglib +import fitz + +leglib.PdfParser(fitz.open("YIGVolunteerBook2024.pdf")).parse() diff --git a/leglib.py b/leglib.py new file mode 100644 index 0000000..a5d5a25 --- /dev/null +++ b/leglib.py @@ -0,0 +1,202 @@ +import fitz +import math + +from typing import Any + +class FitzBlockWrapper: + def __init__(self, block): + self.x0, self.y0, self.x1, \ + self.y1, self.text, \ + self.block_number, self.block_type = block + + self.x0 = int(self.x0) + self.x1 = int(self.x1) + self.y0 = int(self.y0) + self.y1 = int(self.y1) + self.block_number = int(self.block_number) + self.block_type = int(self.block_type) + + def __str__(self): + return str(( + self.x0, self.y0, self.x1, self.y1, self.text + )) + + def __repl__(self): + return self.__str__() + +class BillCode: + def __init__(self, text: str): + # try to parse + # codes are in this rough format: "RSB/yy-c(c)-n(n)" + + text = text.rstrip() + slashsplit = text.split('/') + dashsplit = slashsplit[1].split('-') + + assemblycode = slashsplit[0] + + self.color = assemblycode[0] + if self.color == "R": + self.color = "red" + elif self.color == "W": + self.color = "white" + elif self.color == "B": + self.color = "blue" + + assemblydivision = assemblycode[1] + if assemblydivision == "S": + self.assembly = "senate" + elif assemblydivision == "H": + self.assembly = "house" + + self.year = int(dashsplit[0]) + self.committee = int(dashsplit[1]) + self.docketplacement = int(dashsplit[2]) + + def __str__(self): + return "{} {} - {}-{}-{}".format( + self.color, + self.assembly, + str(self.year), + str(self.committee), + str(self.docketplacement) + ) + +class Bill: + def __init__(self, + code: str | BillCode, + sponsors: str, + subcommittee: str, + school: str + ): + if isinstance(code, str): + self.code = BillCode(code) + else: + self.code = code + + self.sponsors = sponsors.rstrip() + self.subcommittee = subcommittee.rstrip() + self.school = school.rstrip() + +class PdfParser: + def __init__(self, document: fitz.Document): + self.document = document + + @staticmethod + def _words_in_superstring(words: list[str], superstring: str) -> bool: + for word in words: + if not str(word).lower() in str(superstring).lower(): + return False + return True + + def _generate_legislative_pages_list(self, sections: list[int]) -> list[int]: + """ + sections is an array of section pages plus the last page. + """ + current = 0 + legislative_pages: list[int] = [] + try: + while True: + legislative_pages += list( + range( + sections[current] + 1, + sections[current + 1], + 1 + ) + ) + + current += 1 + except IndexError: + pass + + return legislative_pages + + def _generate_section_markers(self, document: fitz.Document) -> list[int]: + section_pages = [] + for page in document: + text = page.get_text().encode("utf8") + is_section_page = self._words_in_superstring( + words=[ "Committee", "YMCA", "Tennessee", "Youth", "in" ], + superstring=text + ) + is_last_page = self._words_in_superstring( + words=[ "ABCs" ], + superstring=text + ) + print("page number {} contains sentintal? {}".format(page.number, is_section_page)) + if len(page.get_images()) == 3: + print("page {} has one image!".format(page.number)) + print(page.get_images()) + + if is_section_page and len(page.get_images()) == 3: + section_pages.append(page.number) + + if is_last_page and len(section_pages) > 2: + section_pages.append(page.number) + + return section_pages + + def _get_block_info_from_page(self, page: fitz.Page): + return [FitzBlockWrapper(i) for i in page.get_text("blocks")] + + @staticmethod + def _remove_image_blocks(blocks: list[FitzBlockWrapper]) -> list[FitzBlockWrapper]: + to_return: list[FitzBlockWrapper] = [] + for block in blocks: + if block.block_type == 0: + to_return.append(block) + + return to_return + + @staticmethod + def _remove_coordinate_information(blocks: list[FitzBlockWrapper]) -> list[FitzBlockWrapper]: + to_return: list[str] = [] + for block in blocks: + to_return.append(block.text) + + return to_return + + @staticmethod + def _get_info_from_block(block, lat: int): + to_return = [] + for i in block: + if math.floor(i[0]) == lat: + to_return.append(i) + return to_return + + @staticmethod + def _split_list_by_element(arr: list[Any], pivot: Any): + output = [] + current = [] + for i in arr: + if i == pivot: + output.append(current) + current = [] + else: + current.append(i) + + output.append(current) + return output + + def parse(self): + section_pages = self._generate_section_markers(self.document) + legislative_pages = self._generate_legislative_pages_list(section_pages) + joined_blocks: list[FitzBlockWrapper] = [] + for page_number in legislative_pages: + page = self.document.load_page(page_number) + block_info = self._get_block_info_from_page(page) + + joined_blocks += block_info + + joined_blocks = self._remove_image_blocks(joined_blocks) + joined_blocks = self._remove_coordinate_information(joined_blocks) + + bill_header = joined_blocks[0] + + splitted = self._split_list_by_element(joined_blocks, bill_header) + + count = 0 + for i in splitted: + if count < 20: + print(i) + count += 1 diff --git a/shell.nix b/shell.nix new file mode 100644 index 0000000..2c827b4 --- /dev/null +++ b/shell.nix @@ -0,0 +1,7 @@ +{ pkgs ? import {} }: + pkgs.mkShell { + # nativeBuildInputs is usually what you want -- tools you need to run + nativeBuildInputs = with pkgs; [ + buildPackages.python311Packages.pymupdf + ]; +}