split up the parsers and other utilities

2024-05-03 13:49:16 -05:00 · 2024-05-03 13:49:16 -05:00 · 11fbcb474a
parent eabe1c98a0
commit 11fbcb474a
5 changed files with 277 additions and 273 deletions
--- a/analyser.py
+++ b/analyser.py
@ -1,5 +1,5 @@
 import leglib
-import fitz

-leglib.PdfParser(fitz.open("YIGVolunteerBook2024.pdf")).parse()
-leglib.PdfParser.from_filename("YIGVolunteerBook2024.pdf").parse()
+# Parse the volunteer book, then dump the extracted text lines of every bill.
+parser = leglib.parsers.HSYIGPdfParser.from_filename("YIGVolunteerBook2024.pdf")
+parser.parse()
+bill_texts = [bill.bill_text for bill in parser.bills]
+print(bill_texts)
--- a/common.py
+++ b/common.py
@ -0,0 +1,79 @@
+from enum import StrEnum, auto
+
+class CCEColors(StrEnum):
+    """Color component of a bill code (BillCode maps the code's first letter to one of these)."""
+
+    # Every value is a plain string; a trailing comma here would silently
+    # turn the value into a 1-tuple that only works because StrEnum unpacks it.
+    Red = "Red"
+    White = "White"
+    Blue = "Blue"
+
+class CCEAssemblies(StrEnum):
+    """Assembly component of a bill code (BillCode maps the code's second letter to one of these)."""
+
+    # Every value is a plain string; a trailing comma here would silently
+    # turn the value into a 1-tuple that only works because StrEnum unpacks it.
+    Senate = "Senate"
+    House = "House"
+    GeneralAssembly = "GeneralAssembly"
+
+class BillCode:
+    """Parsed form of a bill code such as ``"RSB/24-3-7"``.
+
+    The text before the slash supplies the color letter (first character)
+    and the assembly letter (second character); the text after the slash
+    supplies three dash-separated integers: year, committee and docket
+    placement.
+
+    The constructor sets ``color``, ``assembly``, ``year``, ``committee``,
+    ``docketplacement`` and ``stringrep`` (the code rebuilt in its compact
+    ``XYB/y-c-n`` form).
+    """
+
+    # Letter -> enum member lookups for the two leading characters of a code.
+    _COLOR_BY_LETTER = {
+        "R": CCEColors.Red,
+        "W": CCEColors.White,
+        "B": CCEColors.Blue,
+    }
+    _ASSEMBLY_BY_LETTER = {
+        "S": CCEAssemblies.Senate,
+        "H": CCEAssemblies.House,
+        "G": CCEAssemblies.GeneralAssembly,
+    }
+
+    def __init__(self, text: str):
+        """Parse ``text`` into its components.
+
+        Codes are in this rough format: ``"RSB/yy-c(c)-n(n)"``. Trailing
+        whitespace is ignored.
+
+        Raises:
+            ValueError: if the slash, the two leading letters or the three
+                numeric fields are missing, if a numeric field is not an
+                integer, or if the assembly letter is not S, H or G.
+        """
+        text = text.rstrip()
+        slashsplit = text.split('/')
+        if len(slashsplit) < 2:
+            raise ValueError("bill code {!r} has no '/' separator".format(text))
+
+        assemblycode = slashsplit[0]
+        dashsplit = slashsplit[1].split('-')
+        if len(assemblycode) < 2 or len(dashsplit) < 3:
+            raise ValueError(
+                "bill code {!r} is not of the form 'RSB/yy-c-n'".format(text)
+            )
+
+        # An unrecognised color letter is kept as the raw letter rather than
+        # rejected, so such codes still parse.
+        color_letter = assemblycode[0]
+        self.color = self._COLOR_BY_LETTER.get(color_letter, color_letter)
+
+        # An unrecognised assembly letter has no fallback: without it
+        # ``stringrep`` below could not be built, so fail with a clear message.
+        try:
+            self.assembly = self._ASSEMBLY_BY_LETTER[assemblycode[1]]
+        except KeyError:
+            raise ValueError(
+                "bill code {!r} has unknown assembly letter {!r}".format(
+                    text, assemblycode[1]
+                )
+            ) from None
+
+        self.year = int(dashsplit[0])
+        self.committee = int(dashsplit[1])
+        self.docketplacement = int(dashsplit[2])
+
+        # Compact form: initials of color and assembly, then "B/" and numbers.
+        self.stringrep = "{}{}B/{}-{}-{}".format(
+            self.color[0].upper(),
+            self.assembly[0].upper(),
+            self.year,
+            self.committee,
+            self.docketplacement
+        )
+
+    def __str__(self):
+        """Return the long form, e.g. ``"Red Senate - 24-3-7"``."""
+        return "{} {} - {}-{}-{}".format(
+            self.color,
+            self.assembly,
+            self.year,
+            self.committee,
+            self.docketplacement
+        )
+
+class Bill:
+    """One bill: its code, sponsor/subcommittee/school metadata, title and text lines."""
+
+    def __init__(self,
+        code: str | BillCode,
+        sponsors: str,
+        subcommittee: str,
+        school: str,
+        bill_text: list[str],
+        title: str
+    ):
+        """Store the bill's fields.
+
+        ``code`` may be a ready-made ``BillCode`` or its raw text, which is
+        parsed here. ``sponsors``, ``subcommittee`` and ``school`` have
+        trailing whitespace removed; ``bill_text`` and ``title`` are stored
+        unchanged.
+        """
+        # Accept either a parsed code or the raw string form.
+        self.code = BillCode(code) if isinstance(code, str) else code
+
+        # Only trailing whitespace is trimmed from these three fields.
+        self.sponsors, self.subcommittee, self.school = (
+            field.rstrip() for field in (sponsors, subcommittee, school)
+        )
+
+        self.bill_text = bill_text
+        self.title = title
--- a/leglib.py
+++ b/leglib.py
@ -1,274 +1,5 @@
 import fitz
 import math

-from enum import StrEnum, auto
-
-class CCEColors(StrEnum):
-    Red = "Red"
-    White = "White",
-    Blue = "Blue"
-
-class CCEAssemblies(StrEnum):
-    Senate = "Senate",
-    House = "House",
-    GeneralAssembly = "GeneralAssembly"
-
 from typing import Any
-
-class FitzBlockWrapper:
-    def __init__(self, block):
-        self.x0, self.y0, self.x1, \
-            self.y1, self.text, \
-            self.block_number, self.block_type = block
-
-        self.x0 = int(self.x0)
-        self.x1 = int(self.x1)
-        self.y0 = int(self.y0)
-        self.y1 = int(self.y1)
-        self.block_number = int(self.block_number)
-        self.block_type = int(self.block_type)
-
-    def __str__(self):
-        return str((
-            self.x0, self.y0, self.x1, self.y1, self.text
-        ))
-
-    def __repl__(self):
-        return self.__str__()
-
-class BillCode:
-    def __init__(self, text: str):
-        # try to parse
-        # codes are in this rough format: "RSB/yy-c(c)-n(n)"
-
-        text = text.rstrip()
-        slashsplit = text.split('/')
-        dashsplit = slashsplit[1].split('-')
-
-        assemblycode = slashsplit[0]
-
-        self.color = assemblycode[0]
-        if self.color == "R":
-            self.color = CCEColors.Red
-        elif self.color == "W":
-            self.color = CCEColors.White
-        elif self.color == "B":
-            self.color = CCEColors.Blue
-
-        assemblydivision = assemblycode[1]
-        if assemblydivision == "S":
-            self.assembly = CCEAssemblies.Senate
-        elif assemblydivision == "H":
-            self.assembly = CCEAssemblies.House
-        elif assemblydivision == "G":
-            self.assembly = CCEAssemblies.GeneralAssembly
-
-        self.year = int(dashsplit[0])
-        self.committee = int(dashsplit[1])
-        self.docketplacement = int(dashsplit[2])
-
-        self.stringrep = self.color[0].upper() + \
-            self.assembly[0].upper() + \
-            "B/{}-{}-{}".format(
-                str(self.year),
-                str(self.committee),
-                str(self.docketplacement)
-            )
-
-    def __str__(self):
-        return "{} {} - {}-{}-{}".format(
-            self.color,
-            self.assembly,
-            str(self.year),
-            str(self.committee),
-            str(self.docketplacement)
-        )
-
-class Bill:
-    def __init__(self,
-        code: str | BillCode,
-        sponsors: str,
-        subcommittee: str,
-        school: str,
-        bill_text: list[str],
-        title: str
-    ):
-        if isinstance(code, str):
-            self.code = BillCode(code)
-        else:
-            self.code = code
-
-        self.sponsors = sponsors.rstrip()
-        self.subcommittee = subcommittee.rstrip()
-        self.school = school.rstrip()
-        self.bill_text = bill_text
-        self.title = title
-
-class PdfParser:
-    def __init__(self, document: fitz.Document):
-        self.document = document
-
-    @staticmethod
-    def _words_in_superstring(words: list[str], superstring: str) -> bool:
-        for word in words:
-            if not str(word).lower() in str(superstring).lower():
-                return False
-        return True
-
-    def _generate_legislative_pages_list(self, sections: list[int]) -> list[int]:
-        """
-        sections is an array of section pages plus the last page.
-        """
-        current = 0
-        legislative_pages: list[int] = []
-        try:
-            while True:
-                legislative_pages += list(
-                    range(
-                        sections[current] + 1,
-                        sections[current + 1],
-                        1
-                    )
-                )
-
-                current += 1
-        except IndexError:
-            pass
-
-        return legislative_pages
-
-    def _generate_section_markers(self, document: fitz.Document) -> list[int]:
-        section_pages = []
-        for page in document:
-            text = page.get_text().encode("utf8")
-            is_section_page = self._words_in_superstring(
-                    words=[ "Committee", "YMCA", "Tennessee", "Youth", "in" ],
-                    superstring=text
-            )
-            is_last_page = self._words_in_superstring(
-                    words=[ "ABCs" ],
-                    superstring=text
-            )
-            print("page number {} contains sentintal? {}".format(page.number, is_section_page))
-            if len(page.get_images()) == 3:
-                print("page {} has one image!".format(page.number))
-                print(page.get_images())
-
-            if is_section_page and len(page.get_images()) == 3:
-                section_pages.append(page.number)
-
-            if is_last_page and len(section_pages) > 2:
-                section_pages.append(page.number)
-
-        return section_pages
-
-    def _get_block_info_from_page(self, page: fitz.Page):
-        return [FitzBlockWrapper(i) for i in page.get_text("blocks")]
-
-    @staticmethod
-    def _remove_image_blocks(blocks: list[FitzBlockWrapper]) -> list[FitzBlockWrapper]:
-        to_return: list[FitzBlockWrapper] = []
-        for block in blocks:
-            if block.block_type == 0:
-                to_return.append(block)
-
-        return to_return
-
-    @staticmethod
-    def _remove_coordinate_information(blocks: list[FitzBlockWrapper]) -> list[FitzBlockWrapper]:
-        to_return: list[str] = []
-        for block in blocks:
-            to_return.append(block.text)
-
-        return to_return
-
-    @staticmethod
-    def _get_info_from_block(block, lat: int):
-        to_return = []
-        for i in block:
-            if math.floor(i[0]) == lat:
-                to_return.append(i)
-        return to_return
-
-    @staticmethod
-    def _split_list_by_element(arr: list[Any], pivot: Any):
-        output = []
-        current = []
-        for i in arr:
-            if i == pivot:
-                output.append(current)
-                current = []
-            else:
-                current.append(i)
-
-        output.append(current)
-        return output
-
-    def parse(self):
-        section_pages = self._generate_section_markers(self.document)
-        legislative_pages = self._generate_legislative_pages_list(section_pages)
-        joined_blocks: list[FitzBlockWrapper] = []
-        for page_number in legislative_pages:
-            page = self.document.load_page(page_number)
-            block_info = self._get_block_info_from_page(page)
-
-            joined_blocks += block_info[:-1] # remove the page number at the end of every page
-
-        joined_blocks = self._remove_image_blocks(joined_blocks)
-        joined_blocks = self._remove_coordinate_information(joined_blocks)
-
-        bill_header = joined_blocks[0]
-
-        splitted = self._split_list_by_element(joined_blocks, bill_header)
-
-        bills: list[Bill] = []
-        for splitted_item in splitted:
-            try:
-                bill_code, _, _, subcommittee, sponsors, school, *bill_text = splitted_item
-            except ValueError:
-                continue
-
-            bill_text = ' '.join(bill_text)
-
-            print(type(bill_text))
-
-            pretty_printed = self._pretty_print_bill_text(bill_text)
-            bills.append(Bill(
-                code=bill_code,
-                subcommittee=subcommittee,
-                sponsors=sponsors,
-                school=school,
-                bill_text=pretty_printed["bill_array"],
-                title=pretty_printed["title"]
-            ))
-
-        self.bills = bills
-
-    @staticmethod
-    def _find_first_line_number(bill_arrays):
-        for i in range(len(bill_arrays)):
-            try:
-                if str(int(bill_arrays[i])) == bill_arrays[i]:
-                    return i
-            except ValueError:
-                pass
-
-    def _pretty_print_bill_text(self, bill_text: str):
-        replaced = bill_text.replace("<EFBFBD> ", "\n")
-        replaced = replaced.split('\n')
-
-        replaced = [i.rstrip().lstrip() for i in replaced]
-
-        first_line_number = self._find_first_line_number(replaced)
-
-        title = ' '.join(replaced[:first_line_number])
-        rebuilt = replaced[first_line_number:][1::2]
-
-        return {
-            "title": title.lstrip(),
-            "bill_array": rebuilt
-        }
-
-    @classmethod
-    def from_filename(cls, filename: str) -> Any: # TODO: fix this so it shows PdfParser
-        return cls(fitz.open(filename))
+import parsers
--- a/lib.py
+++ b/lib.py
@ -0,0 +1,20 @@
+class FitzBlockWrapper:
+    """Attribute-style view of one 7-item block tuple.
+
+    The tuple is unpacked in the order ``(x0, y0, x1, y1, text,
+    block_number, block_type)``; every field except ``text`` is coerced
+    to ``int``.
+    """
+
+    def __init__(self, block):
+        """Unpack ``block`` into named attributes.
+
+        Raises:
+            ValueError: if ``block`` does not hold exactly seven items
+                (raised by the tuple unpacking).
+        """
+        x0, y0, x1, y1, text, block_number, block_type = block
+
+        # int() drops the fractional part of the coordinates.
+        self.x0 = int(x0)
+        self.y0 = int(y0)
+        self.x1 = int(x1)
+        self.y1 = int(y1)
+        self.text = text
+        self.block_number = int(block_number)
+        self.block_type = int(block_type)
+
+    def __str__(self):
+        """Return the ``(x0, y0, x1, y1, text)`` tuple rendered as a string."""
+        return str((
+            self.x0, self.y0, self.x1, self.y1, self.text
+        ))
+
+    def __repr__(self):
+        """Return the same text as ``__str__`` so lists of blocks print readably."""
+        return self.__str__()
+
+    # The method above was originally misspelled ``__repl__`` and was therefore
+    # never used by repr(); the old name is kept as an alias for any caller
+    # that invoked it directly.
+    __repl__ = __repr__
--- a/parsers.py
+++ b/parsers.py
@ -0,0 +1,174 @@
+import fitz
+from typing import Any
+
+from lib import FitzBlockWrapper
+from common import Bill
+
+class HSYIGPdfParser:
+    """Extract ``Bill`` objects from a PDF document opened with ``fitz``.
+
+    Build an instance from an open document (or via ``from_filename``),
+    call ``parse()``, then read the results from ``self.bills``.
+    """
+
+    def __init__(self, document: fitz.Document):
+        self.document = document
+        # Filled in by parse(); defined up front so reading it before
+        # parsing yields an empty list instead of an AttributeError.
+        self.bills: list[Bill] = []
+
+    @staticmethod
+    def _words_in_superstring(words: list[str], superstring: str) -> bool:
+        """Return True when every word occurs in ``superstring``, ignoring case."""
+        haystack = str(superstring).lower()
+        return all(str(word).lower() in haystack for word in words)
+
+    def _generate_legislative_pages_list(self, sections: list[int]) -> list[int]:
+        """
+        List the page numbers lying strictly between consecutive markers.
+
+        sections is an array of section pages plus the last page.
+        """
+        legislative_pages: list[int] = []
+        # Walk the markers pairwise; the marker pages themselves are excluded.
+        for start, end in zip(sections, sections[1:]):
+            legislative_pages.extend(range(start + 1, end))
+
+        return legislative_pages
+
+    def _generate_section_markers(self, document: fitz.Document) -> list[int]:
+        """Return the page numbers that delimit the legislative sections.
+
+        A page is recorded as a section page when its text contains all of
+        the sentinel words and it reports exactly three images. A page whose
+        text contains "ABCs" is recorded as the closing marker, but only
+        once more than two markers have already been collected.
+        """
+        section_pages = []
+        for page in document:
+            # Match against the extracted text itself; encoding it to bytes
+            # first would make the helper search the bytes repr ("b'...'").
+            text = page.get_text()
+            is_section_page = self._words_in_superstring(
+                    words=[ "Committee", "YMCA", "Tennessee", "Youth", "in" ],
+                    superstring=text
+            )
+            is_last_page = self._words_in_superstring(
+                    words=[ "ABCs" ],
+                    superstring=text
+            )
+
+            if is_section_page and len(page.get_images()) == 3:
+                section_pages.append(page.number)
+
+            if is_last_page and len(section_pages) > 2:
+                section_pages.append(page.number)
+
+        return section_pages
+
+    def _get_block_info_from_page(self, page: fitz.Page):
+        """Wrap every "blocks" tuple of ``page`` in a ``FitzBlockWrapper``."""
+        return [FitzBlockWrapper(i) for i in page.get_text("blocks")]
+
+    @staticmethod
+    def _remove_image_blocks(blocks: list[FitzBlockWrapper]) -> list[FitzBlockWrapper]:
+        """Keep only the blocks whose ``block_type`` is 0."""
+        return [block for block in blocks if block.block_type == 0]
+
+    @staticmethod
+    def _remove_coordinate_information(blocks: list[FitzBlockWrapper]) -> list[str]:
+        """Reduce each block to its ``text``, dropping position data."""
+        return [block.text for block in blocks]
+
+    @staticmethod
+    def _get_info_from_block(block, lat: int):
+        """Return the items of ``block`` whose first element floors to ``lat``."""
+        # Imported locally because this module has no top-level ``math`` import.
+        import math
+
+        return [i for i in block if math.floor(i[0]) == lat]
+
+    @staticmethod
+    def _split_list_by_element(arr: list[Any], pivot: Any):
+        """Split ``arr`` into sub-lists at every element equal to ``pivot``.
+
+        The pivot elements themselves are dropped. The result always has
+        one more sub-list than there are pivots, so it may contain empty
+        lists.
+        """
+        output = []
+        current = []
+        for item in arr:
+            if item == pivot:
+                output.append(current)
+                current = []
+            else:
+                current.append(item)
+
+        output.append(current)
+        return output
+
+    def parse(self):
+        """Read the legislative pages and store the bills in ``self.bills``.
+
+        The text of the first remaining block is treated as the header that
+        starts every bill, and the block list is split on it. Chunks with
+        fewer than six entries are skipped. When no text blocks are found,
+        ``self.bills`` is set to an empty list.
+        """
+        section_pages = self._generate_section_markers(self.document)
+        legislative_pages = self._generate_legislative_pages_list(section_pages)
+        joined_blocks: list[FitzBlockWrapper] = []
+        for page_number in legislative_pages:
+            page = self.document.load_page(page_number)
+            block_info = self._get_block_info_from_page(page)
+
+            joined_blocks += block_info[:-1] # remove the page number at the end of every page
+
+        joined_blocks = self._remove_image_blocks(joined_blocks)
+        block_texts = self._remove_coordinate_information(joined_blocks)
+
+        bills: list[Bill] = []
+        # Without any text there is no header to split on and nothing to parse.
+        if block_texts:
+            bill_header = block_texts[0]
+
+            for splitted_item in self._split_list_by_element(block_texts, bill_header):
+                try:
+                    bill_code, _, _, subcommittee, sponsors, school, *bill_text = splitted_item
+                except ValueError:
+                    # Too few entries to be a bill (e.g. the empty chunk
+                    # before the first header).
+                    continue
+
+                pretty_printed = self._pretty_print_bill_text(' '.join(bill_text))
+                bills.append(Bill(
+                    code=bill_code,
+                    subcommittee=subcommittee,
+                    sponsors=sponsors,
+                    school=school,
+                    bill_text=pretty_printed["bill_array"],
+                    title=pretty_printed["title"]
+                ))
+
+        self.bills = bills
+
+    @staticmethod
+    def _find_first_line_number(bill_arrays):
+        """Return the index of the first entry that is exactly an integer string.
+
+        Returns ``None`` when no entry qualifies.
+        """
+        for index, entry in enumerate(bill_arrays):
+            try:
+                # Round-tripping through int() rejects padded or signed forms.
+                if str(int(entry)) == entry:
+                    return index
+            except ValueError:
+                pass
+
+        return None
+
+    def _pretty_print_bill_text(self, bill_text: str):
+        """Split raw bill text into a title and a list of text lines.
+
+        Returns a dict with ``"title"`` (everything before the first line
+        number, joined with spaces) and ``"bill_array"`` (every second entry
+        after the first line number, which presumably skips the line
+        numbers themselves).
+        """
+        replaced = bill_text.replace("<EFBFBD>", "\n")
+        lines = [line.strip() for line in replaced.split('\n')]
+
+        # NOTE(review): when no line number is found this is None, so the
+        # whole text becomes the title and every second line is still kept;
+        # confirm whether that fallback is intended.
+        first_line_number = self._find_first_line_number(lines)
+
+        title = ' '.join(lines[:first_line_number])
+        rebuilt = lines[first_line_number:][1::2]
+
+        return {
+            "title": title.lstrip(),
+            "bill_array": rebuilt
+        }
+
+    @classmethod
+    def from_filename(cls, filename: str) -> "HSYIGPdfParser":
+        """Open ``filename`` with ``fitz`` and return a parser for it."""
+        return cls(fitz.open(filename))