to be squashed
This commit is contained in:
parent
875890a83a
commit
a62453bdea
|
@ -2,3 +2,4 @@ import leglib
|
||||||
import fitz
|
import fitz
|
||||||
|
|
||||||
leglib.PdfParser(fitz.open("YIGVolunteerBook2024.pdf")).parse()
|
leglib.PdfParser(fitz.open("YIGVolunteerBook2024.pdf")).parse()
|
||||||
|
leglib.PdfParser.from_filename("YIGVolunteerBook2024.pdf").parse()
|
||||||
|
|
39
leglib.py
39
leglib.py
|
@ -53,6 +53,14 @@ class BillCode:
|
||||||
self.committee = int(dashsplit[1])
|
self.committee = int(dashsplit[1])
|
||||||
self.docketplacement = int(dashsplit[2])
|
self.docketplacement = int(dashsplit[2])
|
||||||
|
|
||||||
|
self.stringrep = self.color[0].upper() + \
|
||||||
|
self.assembly[0].upper() + \
|
||||||
|
"B/{}-{}-{}".format(
|
||||||
|
str(self.year),
|
||||||
|
str(self.committee),
|
||||||
|
str(self.docketplacement)
|
||||||
|
)
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return "{} {} - {}-{}-{}".format(
|
return "{} {} - {}-{}-{}".format(
|
||||||
self.color,
|
self.color,
|
||||||
|
@ -67,7 +75,8 @@ class Bill:
|
||||||
code: str | BillCode,
|
code: str | BillCode,
|
||||||
sponsors: str,
|
sponsors: str,
|
||||||
subcommittee: str,
|
subcommittee: str,
|
||||||
school: str
|
school: str,
|
||||||
|
bill_text: str
|
||||||
):
|
):
|
||||||
if isinstance(code, str):
|
if isinstance(code, str):
|
||||||
self.code = BillCode(code)
|
self.code = BillCode(code)
|
||||||
|
@ -77,6 +86,7 @@ class Bill:
|
||||||
self.sponsors = sponsors.rstrip()
|
self.sponsors = sponsors.rstrip()
|
||||||
self.subcommittee = subcommittee.rstrip()
|
self.subcommittee = subcommittee.rstrip()
|
||||||
self.school = school.rstrip()
|
self.school = school.rstrip()
|
||||||
|
self.bill_text = bill_text
|
||||||
|
|
||||||
class PdfParser:
|
class PdfParser:
|
||||||
def __init__(self, document: fitz.Document):
|
def __init__(self, document: fitz.Document):
|
||||||
|
@ -186,7 +196,7 @@ class PdfParser:
|
||||||
page = self.document.load_page(page_number)
|
page = self.document.load_page(page_number)
|
||||||
block_info = self._get_block_info_from_page(page)
|
block_info = self._get_block_info_from_page(page)
|
||||||
|
|
||||||
joined_blocks += block_info
|
joined_blocks += block_info[:-1] # remove the page number at the end of every page
|
||||||
|
|
||||||
joined_blocks = self._remove_image_blocks(joined_blocks)
|
joined_blocks = self._remove_image_blocks(joined_blocks)
|
||||||
joined_blocks = self._remove_coordinate_information(joined_blocks)
|
joined_blocks = self._remove_coordinate_information(joined_blocks)
|
||||||
|
@ -195,8 +205,23 @@ class PdfParser:
|
||||||
|
|
||||||
splitted = self._split_list_by_element(joined_blocks, bill_header)
|
splitted = self._split_list_by_element(joined_blocks, bill_header)
|
||||||
|
|
||||||
count = 0
|
bills: list[Bill] = []
|
||||||
for i in splitted:
|
for splitted_item in splitted:
|
||||||
if count < 20:
|
try:
|
||||||
print(i)
|
bill_code, _, _, subcommittee, sponsors, school, *bill_text = splitted_item
|
||||||
count += 1
|
except ValueError:
|
||||||
|
continue
|
||||||
|
bills.append(Bill(
|
||||||
|
code=bill_code,
|
||||||
|
subcommittee=subcommittee,
|
||||||
|
sponsors=sponsors,
|
||||||
|
school=school,
|
||||||
|
bill_text=' '.join(bill_text)
|
||||||
|
))
|
||||||
|
|
||||||
|
for bill in bills:
|
||||||
|
print(bill.code)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_filename(cls, filename: str) -> Any: # TODO: fix this so it shows PdfParser
|
||||||
|
return cls(fitz.open(filename))
|
||||||
|
|
Loading…
Reference in New Issue