yig/franklincce/explorer/leglib.py

297 lines
9.6 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from typing import Any, ClassVar
from dataclasses import dataclass
import fitz
class FitzBlockWrapper:
    """Attribute-style view over one 7-item text block tuple.

    The tuple is unpacked positionally as
    ``(x0, y0, x1, y1, text, block_number, block_type)``. The coordinates,
    block number and block type are truncated to ``int`` so blocks can be
    looked up by exact integer position; ``text`` is stored unchanged.
    """

    def __init__(self, block):
        """Unpack *block*; raises ``ValueError`` if it does not have 7 items."""
        x0, y0, x1, y1, text, block_number, block_type = block
        self.x0 = int(x0)
        self.y0 = int(y0)
        self.x1 = int(x1)
        self.y1 = int(y1)
        self.text = text
        self.block_number = int(block_number)
        self.block_type = int(block_type)

    def __str__(self):
        """Return the string form of ``(x0, y0, x1, y1, text)``."""
        return str((
            self.x0, self.y0, self.x1, self.y1, self.text
        ))

    def __repr__(self):
        """Use the same text as ``__str__`` so lists of blocks print readably."""
        return self.__str__()

    # The method was originally misspelled ``__repl__`` (so ``repr()`` never
    # used it); the old name is kept as an alias for backward compatibility.
    __repl__ = __repr__
def words_in_superstring(words: list[str], superstring: str) -> bool:
    """Return True when every word occurs in *superstring*, ignoring case.

    *superstring* (and each word) may also be ``bytes``/``bytearray``; such
    values are decoded as UTF-8 (undecodable bytes are replaced) instead of
    being passed through ``str()``, which would produce the ``b'...'`` repr
    and make non-ASCII words, newlines and quotes impossible to match
    correctly. Any other non-string value is converted with ``str()``.
    An empty *words* list yields True.
    """
    def as_text(value) -> str:
        # Decode raw bytes rather than taking their repr.
        if isinstance(value, (bytes, bytearray)):
            return bytes(value).decode("utf-8", errors="replace")
        return str(value)

    # Lower-case the haystack once instead of once per word.
    haystack = as_text(superstring).lower()
    return all(as_text(word).lower() in haystack for word in words)
def split_by_lambda(arr: list[Any], func):
    """Split *arr* into groups at every item for which *func* is truthy.

    Separator items are dropped. The result always holds one more group
    than there are separators, so groups may be empty (for example before
    a leading separator, or for an empty input, which gives ``[[]]``).
    """
    groups = [[]]
    for element in arr:
        if not func(element):
            # Ordinary item: extend the group currently being built.
            groups[-1].append(element)
            continue
        # Separator: close the current group by opening a fresh one.
        groups.append([])
    return groups
def get_block_by_x_value(arr: list[FitzBlockWrapper], xvalue: int) -> FitzBlockWrapper:
    """Return the first block in *arr* whose ``x0`` equals *xvalue*.

    When no block matches, ``None`` is returned (despite the annotation),
    so callers must be ready for an ``AttributeError`` on the result.
    """
    matching = (block for block in arr if block.x0 == xvalue)
    return next(matching, None)
def remove_block_by_x_value(arr: list[FitzBlockWrapper], xvalue: int) -> list[FitzBlockWrapper]:
    """Return a new list holding the blocks of *arr* whose ``x0`` is not *xvalue*."""
    return list(filter(lambda block: block.x0 != xvalue, arr))
class CCEParserBase():
    """Base parser that pulls legislation records out of a manual PDF.

    Subclasses provide the three class attributes below. Constructing an
    instance immediately runs the whole pipeline and stores the result in
    ``self.output``: a list of dicts with the keys ``code``, ``school``,
    ``sponsors``, ``subcommittee``, ``title`` and ``bill_text``.
    """

    # Words that must all be present on a page for it to count as a section page.
    section_page_words: ClassVar[list[str]]
    # Words that must all be present on the page that closes the last section.
    last_page_words: ClassVar[list[str]]
    # Text whose presence in a block separates one piece of legislation from the next.
    split_leg_pages_needle: ClassVar[str]

    # Placeholder stored when a header field cannot be located on the page.
    _MISSING_FIELD: ClassVar[str] = "you tell me, man"

    def __init__(self, document: fitz.Document):
        """Store *document* and parse it right away."""
        self.document = document
        self.__post_init__()

    def __post_init__(self):
        """Run all the processing steps; fills ``self.output``."""
        self.parse_legislative_metablocks()

    def generate_section_markers(self) -> list[int]:
        """
        In the YIG/MUN manuals, there's section markers that delineate between the different
        committees within the manual. Let's find those, and then the last legislative page.

        Returns the page numbers of every section page, in document order.
        A page matching ``last_page_words`` is appended as well, but only
        once more than two markers have already been collected.
        """
        section_pages = []
        for page in self.document:
            # Match on the text itself; encoding it to bytes first would make
            # the matcher search the ``b'...'`` repr instead of the content.
            text = page.get_text()
            is_section_page = words_in_superstring(
                words=self.section_page_words,
                superstring=text
            )
            is_last_page = words_in_superstring(
                words=self.last_page_words,
                superstring=text
            )
            if is_section_page:
                section_pages.append(page.number)
            if is_last_page and len(section_pages) > 2:
                section_pages.append(page.number)
        return section_pages

    def get_legislative_pages(self):
        """
        Generate the section markers, then fill in the pages between them.

        For each pair of consecutive markers, every page number strictly
        between the two is included. Fewer than two markers gives ``[]``.
        """
        sections = self.generate_section_markers()
        legislative_pages: list[int] = []
        for start, end in zip(sections, sections[1:]):
            legislative_pages.extend(range(start + 1, end))
        return legislative_pages

    def concat_blocks_for_leg_pages(self):
        """
        From the legislative pages, concatenate the "blocks" of text in the PDF.

        Returns one flat list of ``FitzBlockWrapper`` objects in page order.
        """
        blocks = []
        for page_num in self.get_legislative_pages():
            page = self.document.load_page(page_num)
            blocks.extend(
                FitzBlockWrapper(block) for block in page.get_text("blocks")
            )
        return blocks

    def split_leg_pages(self):
        """
        We have the collection of legislative page text blocks. We need
        to split them now. We split on blocks whose text contains
        ``split_leg_pages_needle``.

        Returns a list of block lists; the group that precedes the first
        needle is dropped.
        """
        blocks = self.concat_blocks_for_leg_pages()
        # each item within splitted is called a "legislative meta-block"
        splitted = split_by_lambda(
            blocks, lambda x: self.split_leg_pages_needle in x.text
        )
        return splitted[1:]  # skip whatever came before the first needle

    def handle_the_rest(self, the_rest):
        """Split the text that follows the header into title and bill text.

        The text is cut at every marker character. The first piece, minus
        its last line and with newlines removed, becomes ``title``. From
        each later piece the first line is taken, its first character is
        dropped, and the results are newline-joined into ``bill_text``.

        Returns a dict with the keys ``bill_text`` and ``title``.
        """
        # NOTE(review): this literal is empty as written. If an invisible
        # marker character was intended here, restore it as an explicit
        # ``\u`` escape so it cannot be lost again - TODO confirm.
        weird_character = u''
        another_weird_character = u'\uFFFd'
        # Replacing the empty string would insert the marker between every
        # single character and shred the text, so only normalise when there
        # really is a second marker to fold into the first.
        if weird_character:
            the_rest = the_rest.replace(weird_character, another_weird_character)
        splitted_by_weird = the_rest.split(another_weird_character)
        title_content = ''.join(
            splitted_by_weird[0].split('\n')[:-1]
        ).strip()
        bill_text = [i.split('\n')[0][1:] for i in splitted_by_weird[1:]]
        return {
            "bill_text": '\n'.join(bill_text),
            "title": title_content
        }

    @staticmethod
    def _first_block_text(blocks, x_candidates, default):
        """Return the right-stripped text of the first block found at any of
        the candidate ``x0`` positions (tried in order), else *default*."""
        for xvalue in x_candidates:
            block = get_block_by_x_value(blocks, xvalue)
            if block is not None:
                return block.text.rstrip()
        return default

    def parse_legislative_metablocks(self):
        """Build ``self.output`` from the legislative meta-blocks.

        Raises ``AttributeError`` when a meta-block has no block at x0 == 88
        (the legislation code); the other header fields fall back to a
        placeholder string instead.
        """
        output = []
        splitted = self.split_leg_pages()
        for legislative_text in splitted:
            # there are some blocks that contain just one value
            # and are aligned to some x value on the pdf
            # it's an easy way to extract stuff
            leg_code = get_block_by_x_value(legislative_text, 88).text.rstrip()
            school = self._first_block_text(
                legislative_text, (177, 186), self._MISSING_FIELD
            )
            sponsors = self._first_block_text(
                legislative_text, (163, 166), self._MISSING_FIELD
            )
            subcommittee = self._first_block_text(
                legislative_text, (151, 153), self._MISSING_FIELD
            )
            # Everything from the 13th block on is handed to handle_the_rest;
            # presumably the first 12 blocks are the header - TODO confirm.
            the_rest = ''.join([i.text for i in legislative_text[12:]])
            handled = self.handle_the_rest(the_rest)
            output.append({
                "code": leg_code,
                "school": school,
                "sponsors": sponsors,
                "subcommittee": subcommittee,
                "title": handled["title"],
                "bill_text": handled["bill_text"]
            })
        self.output = output
class HSMUN23(CCEParserBase):
    """Parser settings for the HSMUN manual; all parsing logic is inherited."""

    # Every one of these words must appear on a section page.
    section_page_words = [
        "Committee",
        "Model",
        "United",
        "YMCA",
        "Tennessee",
        "Nations",
    ]
    # Word(s) identifying the page that ends the last section.
    last_page_words = [
        "ABCs",
    ]
    # Blocks containing this text separate consecutive pieces of legislation.
    split_leg_pages_needle = "43rd General Assembly"
class HSYIG24(CCEParserBase):
    """Parser for the HSYIG manual.

    Overrides section detection (a section page must also carry exactly
    three images) and the header extraction (different x positions, page
    number blocks removed first).
    """

    # Every one of these words must appear on a section page.
    section_page_words = ["Committee", "YMCA", "Tennessee", "Youth", "in"]
    # Word(s) identifying the page that ends the last section.
    last_page_words = ["ABCs"]
    # Blocks containing this text separate consecutive pieces of legislation.
    split_leg_pages_needle = "71st General Assembly"

    def generate_section_markers(self) -> list[int]:
        """
        This overrides the regular method because we need to check
        for three images on a section page

        Returns the page numbers of every section page, in document order.
        A page matching ``last_page_words`` is appended as well, but only
        once more than two markers have already been collected.
        """
        section_pages = []
        for page in self.document:
            # Match on the text itself; encoding it to bytes first would make
            # the matcher search the ``b'...'`` repr instead of the content.
            text = page.get_text()
            is_section_page = words_in_superstring(
                words=self.section_page_words,
                superstring=text
            )
            is_last_page = words_in_superstring(
                words=self.last_page_words,
                superstring=text
            )
            if is_section_page and len(page.get_images()) == 3:
                section_pages.append(page.number)
            if is_last_page and len(section_pages) > 2:
                section_pages.append(page.number)
        return section_pages

    def parse_legislative_metablocks(self):
        """
        This is YIG specific code

        Builds ``self.output``, one dict per legislative meta-block. Raises
        ``AttributeError`` when any of the header blocks (x0 of 88, 163,
        152 or 139) is missing, because no fallback is applied here.
        """
        output = []
        splitted = self.split_leg_pages()
        for legislative_text in splitted:
            # there are some blocks that contain just one value
            # and are aligned to some x value on the pdf
            # it's an easy way to extract stuff
            legislative_text = remove_block_by_x_value(legislative_text, 565)  # remove page numbers
            leg_code = get_block_by_x_value(legislative_text, 88).text.rstrip()
            school = get_block_by_x_value(legislative_text, 163).text.rstrip()
            sponsors = get_block_by_x_value(legislative_text, 152).text.rstrip()
            subcommittee = get_block_by_x_value(legislative_text, 139).text.rstrip()
            # Everything from the 7th block on is handed to handle_the_rest;
            # presumably the first 6 blocks are the header - TODO confirm.
            the_rest = ''.join([i.text for i in legislative_text[6:]])
            handled = self.handle_the_rest(the_rest)
            output.append({
                "code": leg_code,
                "school": school,
                "sponsors": sponsors,
                "subcommittee": subcommittee,
                "title": handled["title"],
                "bill_text": handled["bill_text"]
            })
        self.output = output
def main(argv=None):
    """Command-line entry point: parse a manual PDF and print each bill.

    *argv* defaults to ``sys.argv`` and is expected to look like
    ``[program, pdf_path, book_kind]`` where ``book_kind`` is ``"HSYIG"`` or
    ``"HSMUN"``. For every parsed record the title and bill text are
    printed. Returns ``None``; on bad arguments a message is printed and
    nothing is opened.
    """
    # Imported here because the module itself only imports ``sys`` under the
    # ``__main__`` guard, which made main() fail when called after an import.
    import sys

    if argv is None:
        argv = sys.argv
    if len(argv) < 3:
        program = argv[0] if argv else "leglib.py"
        print("usage: {} <pdf-path> <HSYIG|HSMUN>".format(program))
        return

    # Validate the book kind before touching the file.
    if argv[2] == "HSYIG":
        parser_class = HSYIG24
    elif argv[2] == "HSMUN":
        parser_class = HSMUN23
    else:
        print("nonvalid book thing")
        return

    # The parser does all its work in the constructor, so the document can
    # be closed as soon as the instance exists.
    with fitz.open(argv[1]) as document:
        parsed = parser_class(document)

    for text in parsed.output:
        print("{} ---------------------------- {}".format(
            text["title"], text["bill_text"]
        ))
# Script entry point: nothing below runs when the module is merely imported.
if __name__ == "__main__":
    import sys  # only imported when the file is executed as a script

    main()