split up the parsers and other utilities

This commit is contained in:
stupidcomputer 2024-05-03 13:49:16 -05:00
parent eabe1c98a0
commit 11fbcb474a
5 changed files with 277 additions and 273 deletions

View File

@ -1,5 +1,5 @@
import leglib
import fitz
# NOTE(review): this hunk header reads "-1,5 +1,5" but seven lines are shown,
# so removed and added lines appear to be interleaved here — confirm which of
# the calls below actually remain in the file.
# Parse the PDF by wrapping a document that was opened with fitz directly.
leglib.PdfParser(fitz.open("YIGVolunteerBook2024.pdf")).parse()
# Same parse, but going through the from_filename constructor.
leglib.PdfParser.from_filename("YIGVolunteerBook2024.pdf").parse()
# Build the parser through leglib.parsers.HSYIGPdfParser instead.
parser = leglib.parsers.HSYIGPdfParser.from_filename("YIGVolunteerBook2024.pdf")
# parse() is called for its side effect; the results are read from parser.bills.
parser.parse()
# Dump the bill_text of every bill that was found.
print([i.bill_text for i in parser.bills])

79
common.py Normal file
View File

@ -0,0 +1,79 @@
from enum import StrEnum, auto
class CCEColors(StrEnum):
Red = "Red"
White = "White",
Blue = "Blue"
class CCEAssemblies(StrEnum):
Senate = "Senate",
House = "House",
GeneralAssembly = "GeneralAssembly"
class BillCode:
    """Parsed representation of a bill code such as ``"RSB/24-1-3"``.

    Attributes:
        color: ``CCEColors`` member chosen by the first letter (R, W or B);
            an unrecognised letter is stored unchanged.
        assembly: ``CCEAssemblies`` member chosen by the second letter
            (S, H or G).
        year: first dash-separated integer after the slash.
        committee: second dash-separated integer after the slash.
        docketplacement: third dash-separated integer after the slash.
        stringrep: the code rebuilt in ``"RSB/yy-c-n"`` form.
    """

    def __init__(self, text: str):
        """Parse *text*, which is in this rough format: ``"RSB/yy-c(c)-n(n)"``.

        Raises:
            ValueError: if the slash, the two-letter prefix or one of the
                three numbers is missing, if a number is not an integer, or
                if the assembly letter is not S, H or G.
        """
        color_by_letter = {
            "R": CCEColors.Red,
            "W": CCEColors.White,
            "B": CCEColors.Blue,
        }
        assembly_by_letter = {
            "S": CCEAssemblies.Senate,
            "H": CCEAssemblies.House,
            "G": CCEAssemblies.GeneralAssembly,
        }

        text = text.rstrip()
        slashsplit = text.split("/")
        if len(slashsplit) < 2:
            raise ValueError(f"bill code {text!r} has no '/' separator")
        assemblycode = slashsplit[0]
        dashsplit = slashsplit[1].split("-")
        if len(assemblycode) < 2:
            raise ValueError(f"bill code {text!r} has no two-letter prefix")
        if len(dashsplit) < 3:
            raise ValueError(
                f"bill code {text!r} needs three dash-separated numbers"
            )

        # An unknown colour letter is kept as-is rather than rejected.
        color_letter = assemblycode[0]
        self.color = color_by_letter.get(color_letter, color_letter)

        # An unknown assembly letter would otherwise leave self.assembly
        # unset and fail later with an unrelated AttributeError.
        try:
            self.assembly = assembly_by_letter[assemblycode[1]]
        except KeyError:
            raise ValueError(
                f"bill code {text!r} has unknown assembly letter "
                f"{assemblycode[1]!r}"
            ) from None

        self.year = int(dashsplit[0])
        self.committee = int(dashsplit[1])
        self.docketplacement = int(dashsplit[2])
        self.stringrep = (
            f"{self.color[0].upper()}{self.assembly[0].upper()}"
            f"B/{self.year}-{self.committee}-{self.docketplacement}"
        )

    def __str__(self):
        """Return ``"<color> <assembly> - <year>-<committee>-<docketplacement>"``."""
        return (
            f"{self.color} {self.assembly} - "
            f"{self.year}-{self.committee}-{self.docketplacement}"
        )
class Bill:
    """One bill: its code, header fields, title and body text."""

    def __init__(
        self,
        code: str | BillCode,
        sponsors: str,
        subcommittee: str,
        school: str,
        bill_text: list[str],
        title: str,
    ):
        """Store the fields, parsing *code* when it is given as a string.

        Trailing whitespace is removed from *sponsors*, *subcommittee* and
        *school*; *bill_text* and *title* are stored as passed.
        """
        # A raw string is parsed; anything else is taken to be a BillCode already.
        self.code = BillCode(code) if isinstance(code, str) else code
        self.sponsors, self.subcommittee, self.school = (
            sponsors.rstrip(),
            subcommittee.rstrip(),
            school.rstrip(),
        )
        self.title = title
        self.bill_text = bill_text

271
leglib.py
View File

@ -1,274 +1,5 @@
import fitz
import math
from enum import StrEnum, auto
class CCEColors(StrEnum):
Red = "Red"
White = "White",
Blue = "Blue"
class CCEAssemblies(StrEnum):
Senate = "Senate",
House = "House",
GeneralAssembly = "GeneralAssembly"
from typing import Any
class FitzBlockWrapper:
    """Attribute-style view of a seven-item block tuple.

    The tuple is unpacked as ``(x0, y0, x1, y1, text, block_number,
    block_type)``. Everything except ``text`` is converted with ``int()``,
    so fractional coordinates are truncated.
    """

    def __init__(self, block):
        """Unpack *block*; raises ``ValueError`` if it does not have seven items."""
        x0, y0, x1, y1, text, block_number, block_type = block
        self.x0 = int(x0)
        self.y0 = int(y0)
        self.x1 = int(x1)
        self.y1 = int(y1)
        self.text = text
        self.block_number = int(block_number)
        self.block_type = int(block_type)

    def __str__(self):
        """Return the ``(x0, y0, x1, y1, text)`` tuple rendered as a string."""
        return str((self.x0, self.y0, self.x1, self.y1, self.text))

    def __repr__(self):
        """Use the same rendering as ``__str__`` (e.g. when shown inside a list)."""
        return self.__str__()

    # The method was previously misspelled ``__repl__``; keep that name
    # working in case anything calls it explicitly.
    __repl__ = __repr__
class BillCode:
    """Parsed representation of a bill code such as ``"RSB/24-1-3"``.

    Attributes:
        color: ``CCEColors`` member chosen by the first letter (R, W or B);
            an unrecognised letter is stored unchanged.
        assembly: ``CCEAssemblies`` member chosen by the second letter
            (S, H or G).
        year: first dash-separated integer after the slash.
        committee: second dash-separated integer after the slash.
        docketplacement: third dash-separated integer after the slash.
        stringrep: the code rebuilt in ``"RSB/yy-c-n"`` form.
    """

    def __init__(self, text: str):
        """Parse *text*, which is in this rough format: ``"RSB/yy-c(c)-n(n)"``.

        Raises:
            ValueError: if the slash, the two-letter prefix or one of the
                three numbers is missing, if a number is not an integer, or
                if the assembly letter is not S, H or G.
        """
        color_by_letter = {
            "R": CCEColors.Red,
            "W": CCEColors.White,
            "B": CCEColors.Blue,
        }
        assembly_by_letter = {
            "S": CCEAssemblies.Senate,
            "H": CCEAssemblies.House,
            "G": CCEAssemblies.GeneralAssembly,
        }

        text = text.rstrip()
        slashsplit = text.split("/")
        if len(slashsplit) < 2:
            raise ValueError(f"bill code {text!r} has no '/' separator")
        assemblycode = slashsplit[0]
        dashsplit = slashsplit[1].split("-")
        if len(assemblycode) < 2:
            raise ValueError(f"bill code {text!r} has no two-letter prefix")
        if len(dashsplit) < 3:
            raise ValueError(
                f"bill code {text!r} needs three dash-separated numbers"
            )

        # An unknown colour letter is kept as-is rather than rejected.
        color_letter = assemblycode[0]
        self.color = color_by_letter.get(color_letter, color_letter)

        # An unknown assembly letter would otherwise leave self.assembly
        # unset and fail later with an unrelated AttributeError.
        try:
            self.assembly = assembly_by_letter[assemblycode[1]]
        except KeyError:
            raise ValueError(
                f"bill code {text!r} has unknown assembly letter "
                f"{assemblycode[1]!r}"
            ) from None

        self.year = int(dashsplit[0])
        self.committee = int(dashsplit[1])
        self.docketplacement = int(dashsplit[2])
        self.stringrep = (
            f"{self.color[0].upper()}{self.assembly[0].upper()}"
            f"B/{self.year}-{self.committee}-{self.docketplacement}"
        )

    def __str__(self):
        """Return ``"<color> <assembly> - <year>-<committee>-<docketplacement>"``."""
        return (
            f"{self.color} {self.assembly} - "
            f"{self.year}-{self.committee}-{self.docketplacement}"
        )
class Bill:
    """One bill: its code, header fields, title and body text."""

    def __init__(
        self,
        code: str | BillCode,
        sponsors: str,
        subcommittee: str,
        school: str,
        bill_text: list[str],
        title: str,
    ):
        """Store the fields, parsing *code* when it is given as a string.

        Trailing whitespace is removed from *sponsors*, *subcommittee* and
        *school*; *bill_text* and *title* are stored as passed.
        """
        # A raw string is parsed; anything else is taken to be a BillCode already.
        self.code = BillCode(code) if isinstance(code, str) else code
        self.sponsors, self.subcommittee, self.school = (
            sponsors.rstrip(),
            subcommittee.rstrip(),
            school.rstrip(),
        )
        self.title = title
        self.bill_text = bill_text
class PdfParser:
    """Extract ``Bill`` objects from a document opened with ``fitz``.

    Call :meth:`parse`; the bills it finds are stored in ``self.bills``.
    """

    def __init__(self, document: "fitz.Document"):
        self.document = document
        # Filled in by parse(); defined up front so it can always be read.
        self.bills = []

    @staticmethod
    def _words_in_superstring(words: list[str], superstring: str) -> bool:
        """Return True when every word occurs, ignoring case, in *superstring*."""
        haystack = str(superstring).lower()
        return all(str(word).lower() in haystack for word in words)

    def _generate_legislative_pages_list(self, sections: list[int]) -> list[int]:
        """
        sections is an array of section pages plus the last page.

        Returns every page number lying strictly between two consecutive
        entries of *sections*.
        """
        legislative_pages: list[int] = []
        for start, end in zip(sections, sections[1:]):
            legislative_pages.extend(range(start + 1, end))
        return legislative_pages

    def _generate_section_markers(self, document: "fitz.Document") -> list[int]:
        """Return the page numbers that delimit the sections of *document*.

        A page is recorded when its text contains all of "Committee", "YMCA",
        "Tennessee", "Youth" and "in" and it has exactly three images. A page
        whose text contains "ABCs" is also recorded once more than two pages
        have been recorded already.
        """
        section_pages: list[int] = []
        for page in document:
            # Match against the text itself; encoding it first would make the
            # search run over the repr of a bytes object.
            text = page.get_text()
            is_section_page = self._words_in_superstring(
                words=["Committee", "YMCA", "Tennessee", "Youth", "in"],
                superstring=text,
            )
            is_last_page = self._words_in_superstring(
                words=["ABCs"],
                superstring=text,
            )
            if is_section_page and len(page.get_images()) == 3:
                section_pages.append(page.number)
            if is_last_page and len(section_pages) > 2:
                section_pages.append(page.number)
        return section_pages

    def _get_block_info_from_page(self, page: "fitz.Page"):
        """Wrap every entry of ``page.get_text("blocks")`` in a FitzBlockWrapper."""
        return [FitzBlockWrapper(block) for block in page.get_text("blocks")]

    @staticmethod
    def _remove_image_blocks(
        blocks: "list[FitzBlockWrapper]",
    ) -> "list[FitzBlockWrapper]":
        """Keep only the blocks whose ``block_type`` is 0."""
        return [block for block in blocks if block.block_type == 0]

    @staticmethod
    def _remove_coordinate_information(
        blocks: "list[FitzBlockWrapper]",
    ) -> list[str]:
        """Return just the ``text`` of each block, in order."""
        return [block.text for block in blocks]

    @staticmethod
    def _get_info_from_block(block, lat: int):
        """Return the items of *block* whose first element floors to *lat*."""
        return [item for item in block if math.floor(item[0]) == lat]

    @staticmethod
    def _split_list_by_element(arr: list[Any], pivot: Any) -> list[list[Any]]:
        """Split *arr* into runs separated by elements equal to *pivot*.

        The pivot elements themselves are dropped; the final run is always
        included, even when it is empty.
        """
        output: list[list[Any]] = []
        current: list[Any] = []
        for item in arr:
            if item == pivot:
                output.append(current)
                current = []
            else:
                current.append(item)
        output.append(current)
        return output

    def parse(self):
        """Read the legislative pages and store the bills found in ``self.bills``.

        The text of the first block is treated as the header that separates
        one bill from the next. Chunks with fewer than six blocks are skipped.
        """
        section_pages = self._generate_section_markers(self.document)
        legislative_pages = self._generate_legislative_pages_list(section_pages)

        page_blocks: list[FitzBlockWrapper] = []
        for page_number in legislative_pages:
            page = self.document.load_page(page_number)
            # remove the page number at the end of every page
            page_blocks += self._get_block_info_from_page(page)[:-1]

        text_blocks = self._remove_coordinate_information(
            self._remove_image_blocks(page_blocks)
        )

        bills: list[Bill] = []
        if text_blocks:  # nothing to split when no text block was found
            bill_header = text_blocks[0]
            for chunk in self._split_list_by_element(text_blocks, bill_header):
                if len(chunk) < 6:
                    # Too short to hold the six leading header fields.
                    continue
                bill_code, _, _, subcommittee, sponsors, school, *body = chunk
                pretty_printed = self._pretty_print_bill_text(" ".join(body))
                bills.append(Bill(
                    code=bill_code,
                    subcommittee=subcommittee,
                    sponsors=sponsors,
                    school=school,
                    bill_text=pretty_printed["bill_array"],
                    title=pretty_printed["title"],
                ))
        self.bills = bills

    @staticmethod
    def _find_first_line_number(bill_arrays):
        """Return the index of the first entry that is a plain integer string.

        An entry qualifies when ``str(int(entry)) == entry``. Returns None
        when no entry qualifies.
        """
        for index, entry in enumerate(bill_arrays):
            try:
                if str(int(entry)) == entry:
                    return index
            except ValueError:
                continue
        return None

    def _pretty_print_bill_text(self, bill_text: str):
        """Split *bill_text* into a title and a list of body lines.

        Everything before the first line number forms the title. From the
        first line number on, every second entry is kept, which drops the
        entries in the line-number positions. Without any line number the
        whole text becomes the title and the body is empty.
        """
        # NOTE(review): the separator literal is kept exactly as found; it may
        # be a mis-rendered U+FFFD replacement character — confirm.
        lines = [
            line.strip()
            for line in bill_text.replace("<EFBFBD> ", "\n").split("\n")
        ]
        first_line_number = self._find_first_line_number(lines)
        if first_line_number is None:
            return {
                "title": " ".join(lines).lstrip(),
                "bill_array": [],
            }
        title = " ".join(lines[:first_line_number])
        rebuilt = lines[first_line_number:][1::2]
        return {
            "title": title.lstrip(),
            "bill_array": rebuilt,
        }

    @classmethod
    def from_filename(cls, filename: str) -> "PdfParser":
        """Open *filename* with ``fitz.open`` and wrap the resulting document."""
        return cls(fitz.open(filename))
import parsers

20
lib.py Normal file
View File

@ -0,0 +1,20 @@
class FitzBlockWrapper:
    """Attribute-style view of a seven-item block tuple.

    The tuple is unpacked as ``(x0, y0, x1, y1, text, block_number,
    block_type)``. Everything except ``text`` is converted with ``int()``,
    so fractional coordinates are truncated.
    """

    def __init__(self, block):
        """Unpack *block*; raises ``ValueError`` if it does not have seven items."""
        x0, y0, x1, y1, text, block_number, block_type = block
        self.x0 = int(x0)
        self.y0 = int(y0)
        self.x1 = int(x1)
        self.y1 = int(y1)
        self.text = text
        self.block_number = int(block_number)
        self.block_type = int(block_type)

    def __str__(self):
        """Return the ``(x0, y0, x1, y1, text)`` tuple rendered as a string."""
        return str((self.x0, self.y0, self.x1, self.y1, self.text))

    def __repr__(self):
        """Use the same rendering as ``__str__`` (e.g. when shown inside a list)."""
        return self.__str__()

    # The method was previously misspelled ``__repl__``; keep that name
    # working in case anything calls it explicitly.
    __repl__ = __repr__

174
parsers.py Normal file
View File

@ -0,0 +1,174 @@
import fitz
from typing import Any
from lib import FitzBlockWrapper
from common import Bill
class HSYIGPdfParser:
    """Extract ``Bill`` objects from a document opened with ``fitz``.

    Call :meth:`parse`; the bills it finds are stored in ``self.bills``.
    """

    def __init__(self, document: "fitz.Document"):
        self.document = document
        # Filled in by parse(); defined up front so it can always be read.
        self.bills = []

    @staticmethod
    def _words_in_superstring(words: list[str], superstring: str) -> bool:
        """Return True when every word occurs, ignoring case, in *superstring*."""
        haystack = str(superstring).lower()
        return all(str(word).lower() in haystack for word in words)

    def _generate_legislative_pages_list(self, sections: list[int]) -> list[int]:
        """
        sections is an array of section pages plus the last page.

        Returns every page number lying strictly between two consecutive
        entries of *sections*.
        """
        legislative_pages: list[int] = []
        for start, end in zip(sections, sections[1:]):
            legislative_pages.extend(range(start + 1, end))
        return legislative_pages

    def _generate_section_markers(self, document: "fitz.Document") -> list[int]:
        """Return the page numbers that delimit the sections of *document*.

        A page is recorded when its text contains all of "Committee", "YMCA",
        "Tennessee", "Youth" and "in" and it has exactly three images. A page
        whose text contains "ABCs" is also recorded once more than two pages
        have been recorded already.
        """
        section_pages: list[int] = []
        for page in document:
            # Match against the text itself; encoding it first would make the
            # search run over the repr of a bytes object.
            text = page.get_text()
            is_section_page = self._words_in_superstring(
                words=["Committee", "YMCA", "Tennessee", "Youth", "in"],
                superstring=text,
            )
            is_last_page = self._words_in_superstring(
                words=["ABCs"],
                superstring=text,
            )
            if is_section_page and len(page.get_images()) == 3:
                section_pages.append(page.number)
            if is_last_page and len(section_pages) > 2:
                section_pages.append(page.number)
        return section_pages

    def _get_block_info_from_page(self, page: "fitz.Page"):
        """Wrap every entry of ``page.get_text("blocks")`` in a FitzBlockWrapper."""
        return [FitzBlockWrapper(block) for block in page.get_text("blocks")]

    @staticmethod
    def _remove_image_blocks(
        blocks: "list[FitzBlockWrapper]",
    ) -> "list[FitzBlockWrapper]":
        """Keep only the blocks whose ``block_type`` is 0."""
        return [block for block in blocks if block.block_type == 0]

    @staticmethod
    def _remove_coordinate_information(
        blocks: "list[FitzBlockWrapper]",
    ) -> list[str]:
        """Return just the ``text`` of each block, in order."""
        return [block.text for block in blocks]

    @staticmethod
    def _get_info_from_block(block, lat: int):
        """Return the items of *block* whose first element floors to *lat*."""
        # Imported here because this module has no top-level ``import math``.
        import math

        return [item for item in block if math.floor(item[0]) == lat]

    @staticmethod
    def _split_list_by_element(arr: list[Any], pivot: Any) -> list[list[Any]]:
        """Split *arr* into runs separated by elements equal to *pivot*.

        The pivot elements themselves are dropped; the final run is always
        included, even when it is empty.
        """
        output: list[list[Any]] = []
        current: list[Any] = []
        for item in arr:
            if item == pivot:
                output.append(current)
                current = []
            else:
                current.append(item)
        output.append(current)
        return output

    def parse(self):
        """Read the legislative pages and store the bills found in ``self.bills``.

        The text of the first block is treated as the header that separates
        one bill from the next. Chunks with fewer than six blocks are skipped.
        """
        section_pages = self._generate_section_markers(self.document)
        legislative_pages = self._generate_legislative_pages_list(section_pages)

        page_blocks: list[FitzBlockWrapper] = []
        for page_number in legislative_pages:
            page = self.document.load_page(page_number)
            # remove the page number at the end of every page
            page_blocks += self._get_block_info_from_page(page)[:-1]

        text_blocks = self._remove_coordinate_information(
            self._remove_image_blocks(page_blocks)
        )

        bills: list[Bill] = []
        if text_blocks:  # nothing to split when no text block was found
            bill_header = text_blocks[0]
            for chunk in self._split_list_by_element(text_blocks, bill_header):
                if len(chunk) < 6:
                    # Too short to hold the six leading header fields.
                    continue
                bill_code, _, _, subcommittee, sponsors, school, *body = chunk
                pretty_printed = self._pretty_print_bill_text(" ".join(body))
                bills.append(Bill(
                    code=bill_code,
                    subcommittee=subcommittee,
                    sponsors=sponsors,
                    school=school,
                    bill_text=pretty_printed["bill_array"],
                    title=pretty_printed["title"],
                ))
        self.bills = bills

    @staticmethod
    def _find_first_line_number(bill_arrays):
        """Return the index of the first entry that is a plain integer string.

        An entry qualifies when ``str(int(entry)) == entry``. Returns None
        when no entry qualifies.
        """
        for index, entry in enumerate(bill_arrays):
            try:
                if str(int(entry)) == entry:
                    return index
            except ValueError:
                continue
        return None

    def _pretty_print_bill_text(self, bill_text: str):
        """Split *bill_text* into a title and a list of body lines.

        Everything before the first line number forms the title. From the
        first line number on, every second entry is kept, which drops the
        entries in the line-number positions. Without any line number the
        whole text becomes the title and the body is empty.
        """
        # NOTE(review): the separator literal is kept exactly as found; it may
        # be a mis-rendered U+FFFD replacement character — confirm.
        lines = [
            line.strip()
            for line in bill_text.replace("<EFBFBD>", "\n").split("\n")
        ]
        first_line_number = self._find_first_line_number(lines)
        if first_line_number is None:
            return {
                "title": " ".join(lines).lstrip(),
                "bill_array": [],
            }
        title = " ".join(lines[:first_line_number])
        rebuilt = lines[first_line_number:][1::2]
        return {
            "title": title.lstrip(),
            "bill_array": rebuilt,
        }

    @classmethod
    def from_filename(cls, filename: str) -> "HSYIGPdfParser":
        """Open *filename* with ``fitz.open`` and wrap the resulting document."""
        return cls(fitz.open(filename))