diff --git a/analyser.py b/analyser.py index 6c8ac0c..b47d81e 100644 --- a/analyser.py +++ b/analyser.py @@ -2,3 +2,4 @@ import leglib import fitz leglib.PdfParser(fitz.open("YIGVolunteerBook2024.pdf")).parse() +leglib.PdfParser.from_filename("YIGVolunteerBook2024.pdf").parse() diff --git a/leglib.py b/leglib.py index a5d5a25..4ffbf9a 100644 --- a/leglib.py +++ b/leglib.py @@ -53,6 +53,14 @@ class BillCode: self.committee = int(dashsplit[1]) self.docketplacement = int(dashsplit[2]) + self.stringrep = self.color[0].upper() + \ + self.assembly[0].upper() + \ + "B/{}-{}-{}".format( + str(self.year), + str(self.committee), + str(self.docketplacement) + ) + def __str__(self): return "{} {} - {}-{}-{}".format( self.color, @@ -67,7 +75,8 @@ class Bill: code: str | BillCode, sponsors: str, subcommittee: str, - school: str + school: str, + bill_text: str ): if isinstance(code, str): self.code = BillCode(code) @@ -77,6 +86,7 @@ class Bill: self.sponsors = sponsors.rstrip() self.subcommittee = subcommittee.rstrip() self.school = school.rstrip() + self.bill_text = bill_text class PdfParser: def __init__(self, document: fitz.Document): @@ -186,7 +196,7 @@ class PdfParser: page = self.document.load_page(page_number) block_info = self._get_block_info_from_page(page) - joined_blocks += block_info + joined_blocks += block_info[:-1] # remove the page number at the end of every page joined_blocks = self._remove_image_blocks(joined_blocks) joined_blocks = self._remove_coordinate_information(joined_blocks) @@ -195,8 +205,23 @@ class PdfParser: splitted = self._split_list_by_element(joined_blocks, bill_header) - count = 0 - for i in splitted: - if count < 20: - print(i) - count += 1 + bills: list[Bill] = [] + for splitted_item in splitted: + try: + bill_code, _, _, subcommittee, sponsors, school, *bill_text = splitted_item + except ValueError: + continue + bills.append(Bill( + code=bill_code, + subcommittee=subcommittee, + sponsors=sponsors, + school=school, + bill_text=' '.join(bill_text) + )) + + for bill in bills: + print(bill.code) + + @classmethod + def from_filename(cls, filename: str) -> Any: # TODO: fix this so it shows PdfParser + return cls(fitz.open(filename))