split up the parsers and other utilities

2024-05-03 13:49:16 -05:00 · 2024-05-03 13:49:16 -05:00 · 11fbcb474a
commit 11fbcb474a
parent eabe1c98a0
5 changed files with 277 additions and 273 deletions
--- a/analyser.py
+++ b/analyser.py
@ -1,5 +1,5 @@
 import leglib
 import fitz
-leglib.PdfParser(fitz.open("YIGVolunteerBook2024.pdf")).parse()
+parser = leglib.parsers.HSYIGPdfParser.from_filename("YIGVolunteerBook2024.pdf")
-leglib.PdfParser.from_filename("YIGVolunteerBook2024.pdf").parse()
+parser.parse()
 print([i.bill_text for i in parser.bills])
--- a/common.py
+++ b/common.py
@ -0,0 +1,79 @@
 from enum import StrEnum, auto
 class CCEColors(StrEnum):
    Red = "Red"
    White = "White",
    Blue = "Blue"
 class CCEAssemblies(StrEnum):
    Senate = "Senate",
    House = "House",
    GeneralAssembly = "GeneralAssembly"
 class BillCode:
    """Parsed form of a bill code such as ``"RSB/24-3-12"``.

    The part before the slash starts with a colour letter (R/W/B) followed by
    an assembly letter (S/H/G); the part after the slash holds three
    dash-separated integers.

    Attributes:
        color: a CCEColors member, or the raw first letter when it is not
            one of R/W/B.
        assembly: a CCEAssemblies member.
        year, committee, docketplacement: the three integers, in that order.
        stringrep: the code rebuilt as ``<C><A>B/<year>-<committee>-<docket>``
            from the first letters of ``color`` and ``assembly``.
    """
    def __init__(self, text: str):
        """Parse ``text`` (trailing whitespace is ignored).

        Raises:
            IndexError: if the slash, one of the two prefix letters or one of
                the three numbers is missing.
            ValueError: if a number is not an integer, or the assembly letter
                is not S, H or G.
        """
        # codes are in this rough format: "RSB/yy-c(c)-n(n)"
        slash_parts = text.rstrip().split('/')
        numbers = slash_parts[1].split('-')
        assembly_code = slash_parts[0]
        colors = {
            "R": CCEColors.Red,
            "W": CCEColors.White,
            "B": CCEColors.Blue,
        }
        assemblies = {
            "S": CCEAssemblies.Senate,
            "H": CCEAssemblies.House,
            "G": CCEAssemblies.GeneralAssembly,
        }
        color_letter = assembly_code[0]
        # An unrecognised colour letter is stored unchanged, as it always was.
        self.color = colors.get(color_letter, color_letter)
        assembly_letter = assembly_code[1]
        if assembly_letter not in assemblies:
            # Previously self.assembly was simply left unset in this case and
            # building stringrep below failed with an unrelated AttributeError.
            raise ValueError(
                "unknown assembly letter {!r} in bill code {!r}".format(
                    assembly_letter, text
                )
            )
        self.assembly = assemblies[assembly_letter]
        self.year = int(numbers[0])
        self.committee = int(numbers[1])
        self.docketplacement = int(numbers[2])
        self.stringrep = "{}{}B/{}-{}-{}".format(
            self.color[0].upper(),
            self.assembly[0].upper(),
            self.year,
            self.committee,
            self.docketplacement
        )
    def __str__(self):
        """Return e.g. ``"Red Senate - 24-3-12"``."""
        return "{} {} - {}-{}-{}".format(
            self.color,
            self.assembly,
            self.year,
            self.committee,
            self.docketplacement
        )
 class Bill:
    """Record for one bill: its code, header fields, text lines and title."""
    def __init__(self,
        code: str | BillCode,
        sponsors: str,
        subcommittee: str,
        school: str,
        bill_text: list[str],
        title: str
    ):
        """Store the given fields.

        ``code`` may be a ready-made BillCode or its textual form, which is
        parsed with BillCode. ``sponsors``, ``subcommittee`` and ``school``
        have trailing whitespace removed; ``bill_text`` and ``title`` are
        stored as given.
        """
        # Text codes are parsed; anything else is taken as an existing BillCode.
        self.code = BillCode(code) if isinstance(code, str) else code
        self.sponsors = sponsors.rstrip()
        self.subcommittee = subcommittee.rstrip()
        self.school = school.rstrip()
        self.bill_text = bill_text
        self.title = title
--- a/leglib.py
+++ b/leglib.py
@ -1,274 +1,5 @@
 import fitz
 import math
 from enum import StrEnum, auto
 class CCEColors(StrEnum):
    Red = "Red"
    White = "White",
    Blue = "Blue"
 class CCEAssemblies(StrEnum):
    Senate = "Senate",
    House = "House",
    GeneralAssembly = "GeneralAssembly"
 from typing import Any
-
+import parsers
 class FitzBlockWrapper:
    """Attribute-style view of one 7-item block sequence.

    The sequence is unpacked as
    ``(x0, y0, x1, y1, text, block_number, block_type)``; everything except
    ``text`` is converted with ``int()``, so fractional coordinates are
    truncated.
    """
    def __init__(self, block):
        """Unpack ``block``; raises ValueError if it does not have 7 items."""
        x0, y0, x1, y1, self.text, block_number, block_type = block
        self.x0 = int(x0)
        self.x1 = int(x1)
        self.y0 = int(y0)
        self.y1 = int(y1)
        self.block_number = int(block_number)
        self.block_type = int(block_type)
    def __str__(self):
        """Return the string form of the tuple ``(x0, y0, x1, y1, text)``."""
        return str((self.x0, self.y0, self.x1, self.y1, self.text))
    def __repr__(self):
        # This method used to be misspelled ``__repl__``, so repr() silently
        # fell back to the default object representation.
        return self.__str__()
    # Legacy alias for the old misspelled name, kept for backward compatibility.
    __repl__ = __repr__
 class BillCode:
    """Parsed form of a bill code such as ``"RSB/24-3-12"``.

    The part before the slash starts with a colour letter (R/W/B) followed by
    an assembly letter (S/H/G); the part after the slash holds three
    dash-separated integers.

    Attributes:
        color: a CCEColors member, or the raw first letter when it is not
            one of R/W/B.
        assembly: a CCEAssemblies member.
        year, committee, docketplacement: the three integers, in that order.
        stringrep: the code rebuilt as ``<C><A>B/<year>-<committee>-<docket>``
            from the first letters of ``color`` and ``assembly``.
    """
    def __init__(self, text: str):
        """Parse ``text`` (trailing whitespace is ignored).

        Raises:
            IndexError: if the slash, one of the two prefix letters or one of
                the three numbers is missing.
            ValueError: if a number is not an integer, or the assembly letter
                is not S, H or G.
        """
        # codes are in this rough format: "RSB/yy-c(c)-n(n)"
        slash_parts = text.rstrip().split('/')
        numbers = slash_parts[1].split('-')
        assembly_code = slash_parts[0]
        colors = {
            "R": CCEColors.Red,
            "W": CCEColors.White,
            "B": CCEColors.Blue,
        }
        assemblies = {
            "S": CCEAssemblies.Senate,
            "H": CCEAssemblies.House,
            "G": CCEAssemblies.GeneralAssembly,
        }
        color_letter = assembly_code[0]
        # An unrecognised colour letter is stored unchanged, as it always was.
        self.color = colors.get(color_letter, color_letter)
        assembly_letter = assembly_code[1]
        if assembly_letter not in assemblies:
            # Previously self.assembly was simply left unset in this case and
            # building stringrep below failed with an unrelated AttributeError.
            raise ValueError(
                "unknown assembly letter {!r} in bill code {!r}".format(
                    assembly_letter, text
                )
            )
        self.assembly = assemblies[assembly_letter]
        self.year = int(numbers[0])
        self.committee = int(numbers[1])
        self.docketplacement = int(numbers[2])
        self.stringrep = "{}{}B/{}-{}-{}".format(
            self.color[0].upper(),
            self.assembly[0].upper(),
            self.year,
            self.committee,
            self.docketplacement
        )
    def __str__(self):
        """Return e.g. ``"Red Senate - 24-3-12"``."""
        return "{} {} - {}-{}-{}".format(
            self.color,
            self.assembly,
            self.year,
            self.committee,
            self.docketplacement
        )
 class Bill:
    """Record for one bill: its code, header fields, text lines and title."""
    def __init__(self,
        code: str | BillCode,
        sponsors: str,
        subcommittee: str,
        school: str,
        bill_text: list[str],
        title: str
    ):
        """Store the given fields.

        ``code`` may be a ready-made BillCode or its textual form, which is
        parsed with BillCode. ``sponsors``, ``subcommittee`` and ``school``
        have trailing whitespace removed; ``bill_text`` and ``title`` are
        stored as given.
        """
        # Text codes are parsed; anything else is taken as an existing BillCode.
        self.code = BillCode(code) if isinstance(code, str) else code
        self.sponsors = sponsors.rstrip()
        self.subcommittee = subcommittee.rstrip()
        self.school = school.rstrip()
        self.bill_text = bill_text
        self.title = title
 class PdfParser:
    """Builds Bill objects from the text blocks of a PyMuPDF document.

    Call parse() to populate ``self.bills``.
    """
    def __init__(self, document: fitz.Document):
        self.document = document
    @staticmethod
    def _words_in_superstring(words: list[str], superstring: str) -> bool:
        """Return True when every word occurs in superstring, ignoring case."""
        haystack = str(superstring).lower()
        return all(str(word).lower() in haystack for word in words)
    def _generate_legislative_pages_list(self, sections: list[int]) -> list[int]:
        """
        sections is an array of section pages plus the last page.

        Returns every page number lying strictly between each pair of
        consecutive entries; fewer than two entries yield an empty list.
        """
        legislative_pages: list[int] = []
        for start, end in zip(sections, sections[1:]):
            legislative_pages.extend(range(start + 1, end))
        return legislative_pages
    def _generate_section_markers(self, document: fitz.Document) -> list[int]:
        """Return the page numbers that delimit the legislative sections.

        A page counts as a section page when its text contains all sentinel
        words and it carries exactly three images. A page containing "ABCs"
        is appended as well once more than two markers have been collected.
        """
        section_pages = []
        for page in document:
            # Match against the text itself. It used to be encoded to bytes
            # first, so the search ran against the repr of a bytes object.
            text = page.get_text()
            is_section_page = self._words_in_superstring(
                words=["Committee", "YMCA", "Tennessee", "Youth", "in"],
                superstring=text
            )
            is_last_page = self._words_in_superstring(
                words=["ABCs"],
                superstring=text
            )
            if is_section_page and len(page.get_images()) == 3:
                section_pages.append(page.number)
            if is_last_page and len(section_pages) > 2:
                section_pages.append(page.number)
        return section_pages
    def _get_block_info_from_page(self, page: fitz.Page):
        """Wrap every entry of ``page.get_text("blocks")`` in a FitzBlockWrapper."""
        return [FitzBlockWrapper(i) for i in page.get_text("blocks")]
    @staticmethod
    def _remove_image_blocks(blocks: list[FitzBlockWrapper]) -> list[FitzBlockWrapper]:
        """Keep only the blocks whose block_type is 0."""
        return [block for block in blocks if block.block_type == 0]
    @staticmethod
    def _remove_coordinate_information(blocks: list[FitzBlockWrapper]) -> list[str]:
        """Return just the text of each block, in order."""
        return [block.text for block in blocks]
    @staticmethod
    def _get_info_from_block(block, lat: int):
        """Return the items of block whose first element floors to lat."""
        return [i for i in block if math.floor(i[0]) == lat]
    @staticmethod
    def _split_list_by_element(arr: list[Any], pivot: Any):
        """Split arr into sub-lists at every element equal to pivot.

        The pivot elements themselves are dropped; the result always has one
        more sub-list than there are pivots (sub-lists may be empty).
        """
        output = []
        current = []
        for item in arr:
            if item == pivot:
                output.append(current)
                current = []
            else:
                current.append(item)
        output.append(current)
        return output
    def parse(self):
        """Extract all bills from the document into ``self.bills``.

        The first text block of the legislative pages is used as the
        separator between bills. Chunks with fewer than six blocks are
        skipped. When there are no text blocks at all, ``self.bills`` is
        set to an empty list.
        """
        section_pages = self._generate_section_markers(self.document)
        legislative_pages = self._generate_legislative_pages_list(section_pages)
        joined_blocks: list[FitzBlockWrapper] = []
        for page_number in legislative_pages:
            page = self.document.load_page(page_number)
            # remove the page number at the end of every page
            joined_blocks += self._get_block_info_from_page(page)[:-1]
        texts = self._remove_coordinate_information(
            self._remove_image_blocks(joined_blocks)
        )
        bills: list[Bill] = []
        if not texts:
            # Nothing to split; indexing texts[0] below would raise IndexError.
            self.bills = bills
            return
        bill_header = texts[0]
        for chunk in self._split_list_by_element(texts, bill_header):
            try:
                bill_code, _, _, subcommittee, sponsors, school, *bill_text = chunk
            except ValueError:
                # Too few blocks to hold the six header fields.
                continue
            pretty_printed = self._pretty_print_bill_text(' '.join(bill_text))
            bills.append(Bill(
                code=bill_code,
                subcommittee=subcommittee,
                sponsors=sponsors,
                school=school,
                bill_text=pretty_printed["bill_array"],
                title=pretty_printed["title"]
            ))
        self.bills = bills
    @staticmethod
    def _find_first_line_number(bill_arrays):
        """Return the index of the first entry that is exactly an integer string.

        An entry qualifies when ``str(int(entry)) == entry``. Returns None
        when no entry qualifies.
        """
        for index, entry in enumerate(bill_arrays):
            try:
                if str(int(entry)) == entry:
                    return index
            except ValueError:
                continue
        return None
    def _pretty_print_bill_text(self, bill_text: str):
        """Split raw bill text into a title and a list of text lines.

        Returns a dict with keys ``"title"`` and ``"bill_array"``. Everything
        before the first line number forms the title; when no line number is
        found, the whole text becomes the title and ``"bill_array"`` is empty.
        """
        # NOTE(review): this literal may be a viewer's rendering of the
        # U+FFFD replacement character -- confirm against the real file.
        replaced = bill_text.replace("<EFBFBD> ", "\n")
        pieces = [piece.strip() for piece in replaced.split('\n')]
        first_line_number = self._find_first_line_number(pieces)
        if first_line_number is None:
            # Slicing with None used to put every other fragment into
            # bill_array even though no line numbering was present.
            return {
                "title": ' '.join(pieces).lstrip(),
                "bill_array": []
            }
        title = ' '.join(pieces[:first_line_number])
        # Presumably entries alternate line number / line text from here on;
        # keep every second entry, starting right after the first number.
        rebuilt = pieces[first_line_number:][1::2]
        return {
            "title": title.lstrip(),
            "bill_array": rebuilt
        }
    @classmethod
    def from_filename(cls, filename: str) -> "PdfParser":
        """Open filename with ``fitz.open`` and return a parser for it."""
        return cls(fitz.open(filename))
--- a/lib.py
+++ b/lib.py
@ -0,0 +1,20 @@
 class FitzBlockWrapper:
    """Attribute-style view of one 7-item block sequence.

    The sequence is unpacked as
    ``(x0, y0, x1, y1, text, block_number, block_type)``; everything except
    ``text`` is converted with ``int()``, so fractional coordinates are
    truncated.
    """
    def __init__(self, block):
        """Unpack ``block``; raises ValueError if it does not have 7 items."""
        x0, y0, x1, y1, self.text, block_number, block_type = block
        self.x0 = int(x0)
        self.x1 = int(x1)
        self.y0 = int(y0)
        self.y1 = int(y1)
        self.block_number = int(block_number)
        self.block_type = int(block_type)
    def __str__(self):
        """Return the string form of the tuple ``(x0, y0, x1, y1, text)``."""
        return str((self.x0, self.y0, self.x1, self.y1, self.text))
    def __repr__(self):
        # This method used to be misspelled ``__repl__``, so repr() silently
        # fell back to the default object representation.
        return self.__str__()
    # Legacy alias for the old misspelled name, kept for backward compatibility.
    __repl__ = __repr__
--- a/parsers.py
+++ b/parsers.py
@ -0,0 +1,174 @@
 import fitz
 from typing import Any
 from lib import FitzBlockWrapper
 from common import Bill
 class HSYIGPdfParser:
    """Builds Bill objects from the text blocks of a PyMuPDF document.

    Call parse() to populate ``self.bills``.
    """
    def __init__(self, document: fitz.Document):
        self.document = document
    @staticmethod
    def _words_in_superstring(words: list[str], superstring: str) -> bool:
        """Return True when every word occurs in superstring, ignoring case."""
        haystack = str(superstring).lower()
        return all(str(word).lower() in haystack for word in words)
    def _generate_legislative_pages_list(self, sections: list[int]) -> list[int]:
        """
        sections is an array of section pages plus the last page.

        Returns every page number lying strictly between each pair of
        consecutive entries; fewer than two entries yield an empty list.
        """
        legislative_pages: list[int] = []
        for start, end in zip(sections, sections[1:]):
            legislative_pages.extend(range(start + 1, end))
        return legislative_pages
    def _generate_section_markers(self, document: fitz.Document) -> list[int]:
        """Return the page numbers that delimit the legislative sections.

        A page counts as a section page when its text contains all sentinel
        words and it carries exactly three images. A page containing "ABCs"
        is appended as well once more than two markers have been collected.
        """
        section_pages = []
        for page in document:
            # Match against the text itself. It used to be encoded to bytes
            # first, so the search ran against the repr of a bytes object.
            text = page.get_text()
            is_section_page = self._words_in_superstring(
                words=["Committee", "YMCA", "Tennessee", "Youth", "in"],
                superstring=text
            )
            is_last_page = self._words_in_superstring(
                words=["ABCs"],
                superstring=text
            )
            if is_section_page and len(page.get_images()) == 3:
                section_pages.append(page.number)
            if is_last_page and len(section_pages) > 2:
                section_pages.append(page.number)
        return section_pages
    def _get_block_info_from_page(self, page: fitz.Page):
        """Wrap every entry of ``page.get_text("blocks")`` in a FitzBlockWrapper."""
        return [FitzBlockWrapper(i) for i in page.get_text("blocks")]
    @staticmethod
    def _remove_image_blocks(blocks: list[FitzBlockWrapper]) -> list[FitzBlockWrapper]:
        """Keep only the blocks whose block_type is 0."""
        return [block for block in blocks if block.block_type == 0]
    @staticmethod
    def _remove_coordinate_information(blocks: list[FitzBlockWrapper]) -> list[str]:
        """Return just the text of each block, in order."""
        return [block.text for block in blocks]
    @staticmethod
    def _get_info_from_block(block, lat: int):
        """Return the items of block whose first element floors to lat."""
        # Imported here because math is not imported at module level in this
        # file; without it this method raised NameError when called.
        import math
        return [i for i in block if math.floor(i[0]) == lat]
    @staticmethod
    def _split_list_by_element(arr: list[Any], pivot: Any):
        """Split arr into sub-lists at every element equal to pivot.

        The pivot elements themselves are dropped; the result always has one
        more sub-list than there are pivots (sub-lists may be empty).
        """
        output = []
        current = []
        for item in arr:
            if item == pivot:
                output.append(current)
                current = []
            else:
                current.append(item)
        output.append(current)
        return output
    def parse(self):
        """Extract all bills from the document into ``self.bills``.

        The first text block of the legislative pages is used as the
        separator between bills. Chunks with fewer than six blocks are
        skipped. When there are no text blocks at all, ``self.bills`` is
        set to an empty list.
        """
        section_pages = self._generate_section_markers(self.document)
        legislative_pages = self._generate_legislative_pages_list(section_pages)
        joined_blocks: list[FitzBlockWrapper] = []
        for page_number in legislative_pages:
            page = self.document.load_page(page_number)
            # remove the page number at the end of every page
            joined_blocks += self._get_block_info_from_page(page)[:-1]
        texts = self._remove_coordinate_information(
            self._remove_image_blocks(joined_blocks)
        )
        bills: list[Bill] = []
        if not texts:
            # Nothing to split; indexing texts[0] below would raise IndexError.
            self.bills = bills
            return
        bill_header = texts[0]
        for chunk in self._split_list_by_element(texts, bill_header):
            try:
                bill_code, _, _, subcommittee, sponsors, school, *bill_text = chunk
            except ValueError:
                # Too few blocks to hold the six header fields.
                continue
            pretty_printed = self._pretty_print_bill_text(' '.join(bill_text))
            bills.append(Bill(
                code=bill_code,
                subcommittee=subcommittee,
                sponsors=sponsors,
                school=school,
                bill_text=pretty_printed["bill_array"],
                title=pretty_printed["title"]
            ))
        self.bills = bills
    @staticmethod
    def _find_first_line_number(bill_arrays):
        """Return the index of the first entry that is exactly an integer string.

        An entry qualifies when ``str(int(entry)) == entry``. Returns None
        when no entry qualifies.
        """
        for index, entry in enumerate(bill_arrays):
            try:
                if str(int(entry)) == entry:
                    return index
            except ValueError:
                continue
        return None
    def _pretty_print_bill_text(self, bill_text: str):
        """Split raw bill text into a title and a list of text lines.

        Returns a dict with keys ``"title"`` and ``"bill_array"``. Everything
        before the first line number forms the title; when no line number is
        found, the whole text becomes the title and ``"bill_array"`` is empty.
        """
        # NOTE(review): this literal may be a viewer's rendering of the
        # U+FFFD replacement character -- confirm against the real file.
        replaced = bill_text.replace("<EFBFBD>", "\n")
        pieces = [piece.strip() for piece in replaced.split('\n')]
        first_line_number = self._find_first_line_number(pieces)
        if first_line_number is None:
            # Slicing with None used to put every other fragment into
            # bill_array even though no line numbering was present.
            return {
                "title": ' '.join(pieces).lstrip(),
                "bill_array": []
            }
        title = ' '.join(pieces[:first_line_number])
        # Presumably entries alternate line number / line text from here on;
        # keep every second entry, starting right after the first number.
        rebuilt = pieces[first_line_number:][1::2]
        return {
            "title": title.lstrip(),
            "bill_array": rebuilt
        }
    @classmethod
    def from_filename(cls, filename: str) -> "HSYIGPdfParser":
        """Open filename with ``fitz.open`` and return a parser for it."""
        return cls(fitz.open(filename))