yig/parsers.py

203 lines
6.4 KiB
Python
Raw Blame History

import math
from dataclasses import dataclass
from itertools import groupby
from typing import Any, Self, ClassVar

import fitz

from common import Bill
from lib import FitzBlockWrapper
@dataclass
class BookParser:
    """Parent class for parsers that work on a conference book document.

    An instance holds an opened PyMuPDF document together with the name of
    the conference the document belongs to. This class defines no parsing
    logic of its own.
    """

    # class variables
    # Short human-readable name for this parser.
    humanname: ClassVar[str] = "Generic BookParser parent class."
    # Longer description of the parser; the value keeps its leading and
    # trailing newline.
    description: ClassVar[str] = (
        "\nA generic description of the abilities of this BookParser.\n"
    )

    # everything else
    # The opened PyMuPDF document to parse.
    document: fitz.Document
    # Name of the conference this document belongs to.
    confname: str

    @classmethod
    def from_filename(cls, filename: str, confname: str) -> Self:
        """Open *filename* with PyMuPDF and wrap it in a parser instance.

        Args:
            filename: Path of the document, passed straight to ``fitz.open``.
            confname: Name of the conference the document belongs to.

        Returns:
            A new instance of ``cls`` holding the opened document.
        """
        # NOTE(review): the document is never closed here; the caller owns
        # its lifetime.
        return cls(document=fitz.open(filename), confname=confname)
class HSYIGPdfParser(BookParser):
    """Extract bills from a conference book PDF split into sections.

    Section divider pages are located by marker words and image count, the
    pages between consecutive dividers are read as text blocks, and the
    resulting block stream is cut into individual bills. Call ``parse()``;
    the result is stored on ``self.bills``.
    """

    @staticmethod
    def _words_in_superstring(words: list[str], superstring: str) -> bool:
        """Return True when every word occurs in *superstring*, ignoring case.

        Matching is by substring, not by whole word. Both sides go through
        ``str()`` first, so non-string values are compared by their string
        form. An empty *words* list yields True.
        """
        haystack = str(superstring).lower()
        return all(str(word).lower() in haystack for word in words)

    def _generate_legislative_pages_list(self, sections: list[int]) -> list[int]:
        """Return the page numbers lying strictly between section markers.

        Args:
            sections: Section page numbers followed by the last page, in
                document order.

        Returns:
            Every page number after one marker and before the next, for each
            consecutive pair of markers. Empty when fewer than two markers
            are given.
        """
        legislative_pages: list[int] = []
        for start, end in zip(sections, sections[1:]):
            # The marker pages themselves are excluded on both ends.
            legislative_pages.extend(range(start + 1, end))
        return legislative_pages

    def _generate_section_markers(self, document: fitz.Document) -> list[int]:
        """Collect the page numbers that delimit the sections of *document*.

        A page is recorded as a section page when its text contains all of
        the marker words and it carries exactly three images. A page whose
        text contains "ABCs" is recorded as the closing page once more than
        two markers have already been collected. A page satisfying both
        conditions is recorded twice.
        """
        section_pages: list[int] = []
        for page in document:
            # Keep the text as str: searching the repr of encoded bytes would
            # match against escape sequences instead of the real characters.
            text = page.get_text()
            is_section_page = self._words_in_superstring(
                words=["Committee", "YMCA", "Tennessee", "Youth", "in"],
                superstring=text,
            )
            is_last_page = self._words_in_superstring(
                words=["ABCs"],
                superstring=text,
            )
            if is_section_page and len(page.get_images()) == 3:
                section_pages.append(page.number)
            if is_last_page and len(section_pages) > 2:
                section_pages.append(page.number)
        return section_pages

    def _get_block_info_from_page(self, page: fitz.Page) -> list[FitzBlockWrapper]:
        """Wrap every entry of ``page.get_text("blocks")`` in a FitzBlockWrapper."""
        return [FitzBlockWrapper(raw_block) for raw_block in page.get_text("blocks")]

    @staticmethod
    def _remove_image_blocks(blocks: list[FitzBlockWrapper]) -> list[FitzBlockWrapper]:
        """Keep only the blocks whose ``block_type`` is 0.

        PyMuPDF reports block type 0 for text blocks and 1 for image blocks.
        """
        return [block for block in blocks if block.block_type == 0]

    @staticmethod
    def _remove_coordinate_information(blocks: list[FitzBlockWrapper]) -> list[str]:
        """Reduce each block to its ``text`` attribute, preserving order."""
        return [block.text for block in blocks]

    @staticmethod
    def _get_info_from_block(block, lat: int):
        """Return the items of *block* whose first element floors to *lat*.

        NOTE(review): the first element is presumably a coordinate of the
        item; confirm against the caller.
        """
        return [item for item in block if math.floor(item[0]) == lat]

    @staticmethod
    def _split_list_by_element(arr: list[Any], pivot: Any):
        """Split *arr* into sub-lists at every element equal to *pivot*.

        The pivot elements themselves are dropped. The result always holds
        one more sub-list than there are pivots, so leading, trailing or
        adjacent pivots produce empty sub-lists.
        """
        output = []
        current = []
        for item in arr:
            if item == pivot:
                output.append(current)
                current = []
            else:
                current.append(item)
        output.append(current)
        return output

    def parse(self):
        """Parse the document and store the resulting bills on ``self.bills``.

        Every bill gets ``self.confname`` assigned to ``code.conference``.
        When the document yields no text blocks, ``self.bills`` is set to an
        empty list.

        Raises:
            ValueError: If a bill's text contains no line numbers (raised by
                ``_pretty_print_bill_text``).
        """
        section_pages = self._generate_section_markers(self.document)
        legislative_pages = self._generate_legislative_pages_list(section_pages)

        page_blocks: list[FitzBlockWrapper] = []
        for page_number in legislative_pages:
            page = self.document.load_page(page_number)
            block_info = self._get_block_info_from_page(page)
            # remove the page number at the end of every page
            page_blocks.extend(block_info[:-1])

        text_blocks = self._remove_coordinate_information(
            self._remove_image_blocks(page_blocks)
        )
        if not text_blocks:
            # Nothing to split: no legislative pages or no text on them.
            self.bills = []
            return

        # The very first block is used as the separator between bills.
        bill_header = text_blocks[0]
        segments = self._split_list_by_element(text_blocks, bill_header)

        bills: list[Bill] = []
        for segment in segments:
            try:
                # A segment needs at least six entries; the second and third
                # are ignored and everything after the sixth is bill text.
                bill_code, _, _, subcommittee, sponsors, school, *text_parts = segment
            except ValueError:
                # Too short to be a bill (e.g. the empty segment before the
                # first header).
                continue
            pretty_printed = self._pretty_print_bill_text(' '.join(text_parts))
            bills.append(Bill(
                code=bill_code,
                subcommittee=subcommittee,
                sponsors=sponsors,
                school=school,
                bill_text=pretty_printed["bill_array"],
                title=pretty_printed["title"]
            ))

        for bill in bills:  # add the conference name to each
            bill.code.conference = self.confname
        self.bills = bills

    @staticmethod
    def _find_first_line_number(bill_arrays: list[str]) -> int | None:
        """Return the index of the first entry that is a canonical integer.

        An entry qualifies when converting it to ``int`` and back to ``str``
        gives the identical string, so "12" matches while "012", " 12" and
        "+12" do not. Returns None when no entry qualifies.
        """
        for index, entry in enumerate(bill_arrays):
            try:
                if str(int(entry)) == entry:
                    return index
            except ValueError:
                continue
        return None

    def _pretty_print_bill_text(self, bill_text: str) -> dict[str, Any]:
        """Split raw bill text into a title and a list of bill lines.

        Args:
            bill_text: The bill's text with entries separated by newlines.

        Returns:
            A dict with "title" (the text before the first line number, with
            whitespace collapsed) and "bill_array" (the list of bill lines).

        Raises:
            ValueError: If no line number can be found in *bill_text*.
        """
        # NOTE(review): '<EFBFBD>' looks like a rendering of the U+FFFD
        # replacement character (UTF-8 bytes EF BF BD) -- confirm the
        # intended literal.
        lines = [
            line.replace('<EFBFBD>', ' ').strip()
            for line in bill_text.split('\n')
        ]
        first_line_number = self._find_first_line_number(lines)
        if first_line_number is None:
            raise ValueError(
                "bill text contains no line numbers; "
                "cannot separate the title from the bill body"
            )
        # The entry right before the first line number is not part of the
        # title. Clamp at 0 so a line number at index 0 gives an empty title
        # instead of a negative slice bound.
        title_end = max(first_line_number - 1, 0)
        # split/join collapses runs of whitespace and trims both ends
        title = ' '.join(' '.join(lines[:title_end]).split())
        # Starting at the first line number, keep every second entry, i.e.
        # the entry that follows each line number.
        rebuilt = lines[first_line_number:][1::2]
        # remove the last line number, it doesn't have a corresponding space at the end
        rebuilt = rebuilt[:-1]
        # remove the first line, as it's the whitespace between the title and the bill text
        rebuilt = rebuilt[1:]
        return {
            "title": title,
            "bill_array": rebuilt
        }