From 24bf60f245ddf469af87e47cb06e2b8c1b3e2e8b Mon Sep 17 00:00:00 2001 From: stupidcomputer Date: Sat, 29 Jun 2024 13:52:56 -0500 Subject: [PATCH] move all the pdf parsing code into one module, and fix a critical parsing bug in containers --- .../explorer/{lib/HSMUN.py => leglib.py} | 155 +++++++++++++++++- franklincce/explorer/lib/HSYIG.py | 139 ---------------- franklincce/explorer/lib/__init__.py | 1 - franklincce/explorer/lib/common.py | 48 ------ franklincce/explorer/lib/parsers.py | 9 - franklincce/explorer/models.py | 6 +- franklincce/requirements.txt | 6 +- 7 files changed, 153 insertions(+), 211 deletions(-) rename franklincce/explorer/{lib/HSMUN.py => leglib.py} (52%) delete mode 100644 franklincce/explorer/lib/HSYIG.py delete mode 100644 franklincce/explorer/lib/__init__.py delete mode 100644 franklincce/explorer/lib/common.py delete mode 100644 franklincce/explorer/lib/parsers.py diff --git a/franklincce/explorer/lib/HSMUN.py b/franklincce/explorer/leglib.py similarity index 52% rename from franklincce/explorer/lib/HSMUN.py rename to franklincce/explorer/leglib.py index 275bdb6..fddf675 100644 --- a/franklincce/explorer/lib/HSMUN.py +++ b/franklincce/explorer/leglib.py @@ -1,12 +1,60 @@ -from .common import * -from typing import ClassVar +from typing import Any, ClassVar from dataclasses import dataclass import fitz -class HSMUN(): - section_page_words = ["Committee", "Model", "United", "YMCA", "Tennessee", "Nations"] - last_page_words = ["ABCs"] +class FitzBlockWrapper: + def __init__(self, block): + self.x0, self.y0, self.x1, \ + self.y1, self.text, \ + self.block_number, self.block_type = block + + self.x0 = int(self.x0) + self.x1 = int(self.x1) + self.y0 = int(self.y0) + self.y1 = int(self.y1) + self.block_number = int(self.block_number) + self.block_type = int(self.block_type) + + def __str__(self): + return str(( + self.x0, self.y0, self.x1, self.y1, self.text + )) + + def __repl__(self): + return self.__str__() + +def words_in_superstring(words: list[str], superstring: str) -> bool: + for word in words: + if not str(word).lower() in str(superstring).lower(): + return False + return True + +def split_by_lambda(arr: list[Any], func): + output = [] + current = [] + for item in arr: + if func(item): + output.append(current) + current = [] + else: + current.append(item) + + output.append(current) + return output + +def get_block_by_x_value(arr: list[FitzBlockWrapper], xvalue: int) -> FitzBlockWrapper: + for item in arr: + if item.x0 == xvalue: + return item + +def remove_block_by_x_value(arr: list[FitzBlockWrapper], xvalue: int) -> list[FitzBlockWrapper]: + return [i for i in arr if not i.x0 == xvalue] + +class CCEParserBase(): + section_page_words: ClassVar[list[str]] + last_page_words: ClassVar[list[str]] + split_leg_pages_needle: ClassVar[str] def __init__(self, document: fitz.Document): self.document = document @@ -90,13 +138,14 @@ class HSMUN(): blocks = self.concat_blocks_for_leg_pages() # each item within splitted is called a "legislative meta-block" - splitted = split_by_lambda(blocks, lambda x: "43rd General Assembly" in x.text) + splitted = split_by_lambda(blocks, lambda x: self.split_leg_pages_needle in x.text) return splitted[1:] # there's an empty array at the beginning def handle_the_rest(self, the_rest): - weird_character = u'\uFFFd' - splitted_by_weird = the_rest.split(weird_character) + weird_character = u'' + another_weird_character = u'\uFFFd' + splitted_by_weird = the_rest.replace(weird_character, another_weird_character).split(another_weird_character) title_content = ''.join( splitted_by_weird[0].split('\n')[:-1] ).rstrip().lstrip() @@ -156,3 +205,93 @@ class HSMUN(): }) self.output = output + + +class HSMUN23(CCEParserBase): + section_page_words = ["Committee", "Model", "United", "YMCA", "Tennessee", "Nations"] + last_page_words = ["ABCs"] + split_leg_pages_needle = "43rd General Assembly" + +class HSYIG24(CCEParserBase): + section_page_words = [ "Committee", "YMCA", "Tennessee", "Youth", "in" ] + last_page_words = [ "ABCs" ] + split_leg_pages_needle = "71st General Assembly" + + def generate_section_markers(self) -> list[int]: + """ + This overrides the regular method because we need to check + for three images on a section page + """ + section_pages = [] + + for page in self.document: + text = page.get_text().encode("utf8") + is_section_page = words_in_superstring( + words = self.section_page_words, + superstring = text + ) + is_last_page = words_in_superstring( + words = self.last_page_words, + superstring = text + ) + + if is_section_page and len(page.get_images()) == 3: + section_pages.append(page.number) + + if is_last_page and len(section_pages) > 2: + section_pages.append(page.number) + + return section_pages + + def parse_legislative_metablocks(self): + """ + This is YIG specific code + """ + output = [] + splitted = self.split_leg_pages() + for legislative_text in splitted: + # there are some blocks that contain just one value + # and are aligned to some x value on the pdf + + # it's an easy way to extract stuff + legislative_text = remove_block_by_x_value(legislative_text, 565) # remove page numbers + leg_code = get_block_by_x_value(legislative_text, 88).text.rstrip() + school = get_block_by_x_value(legislative_text, 163).text.rstrip() + sponsors = get_block_by_x_value(legislative_text, 152).text.rstrip() + subcommittee = get_block_by_x_value(legislative_text, 139).text.rstrip() + the_rest = ''.join([i.text for i in legislative_text[6:]]) + handled = self.handle_the_rest(the_rest) + title = handled["title"] + bill_text = handled["bill_text"] + + output.append({ + "code": leg_code, + "school": school, + "sponsors": sponsors, + "subcommittee": subcommittee, + "title": title, + "bill_text": bill_text + }) + + self.output = output + +def main(): + argv = sys.argv + doc = fitz.open(argv[1]) + if argv[2] == "HSYIG": + doc = HSYIG24(doc) + elif argv[2] == "HSMUN": + doc = HSMUN23(doc) + else: + print("nonvalid book thing") + return + + for text in doc.output: + print("{} ---------------------------- {}".format( + text["title"], text["bill_text"] + )) + +if __name__ == "__main__": + import sys + + main() \ No newline at end of file diff --git a/franklincce/explorer/lib/HSYIG.py b/franklincce/explorer/lib/HSYIG.py deleted file mode 100644 index 0826aa5..0000000 --- a/franklincce/explorer/lib/HSYIG.py +++ /dev/null @@ -1,139 +0,0 @@ -from .common import * -from typing import ClassVar -from dataclasses import dataclass - -import fitz - -class HSYIG(): - section_page_words = [ "Committee", "YMCA", "Tennessee", "Youth", "in" ] - last_page_words = [ "ABCs" ] - - def __init__(self, document: fitz.Document): - self.document = document - self.__post_init__() - - def __post_init__(self): - # run all the processing steps here - self.parse_legislative_metablocks() - - def generate_section_markers(self) -> list[int]: - """ - In the YIG/MUN manuals, there's section markers that delineate between the different - committees within the manual. Let's find those, and then the last legislative page. - """ - section_pages = [] - - for page in self.document: - text = page.get_text().encode("utf8") - is_section_page = words_in_superstring( - words = self.section_page_words, - superstring = text - ) - is_last_page = words_in_superstring( - words = self.last_page_words, - superstring = text - ) - - print(text, is_section_page, is_last_page) - - if is_section_page and len(page.get_images()) == 3: - section_pages.append(page.number) - - if is_last_page and len(section_pages) > 2: - section_pages.append(page.number) - - return section_pages - - def get_legislative_pages(self): - """ - Generate the section markers, then fill in the pages between them. - """ - - current = 0 - sections = self.generate_section_markers() - legislative_pages: list[int] = [] - try: - while True: - legislative_pages += list( - range( - sections[current] + 1, - sections[current + 1], - 1 - ) - ) - - current += 1 - - except IndexError: - pass - - return legislative_pages - - def concat_blocks_for_leg_pages(self): - """ - From the legislative pages, concatenate the "blocks" of text in the PDF. - """ - blocks = [] - pages = [self.document.load_page(page_num) for page_num in self.get_legislative_pages()] - for page in pages: - block_info = [FitzBlockWrapper(block) for block in page.get_text("blocks")] - - blocks += block_info - - return blocks - - def split_leg_pages(self): - """ - We have the collection of legislative page text blocks. We need - to split them now. We split on the text "71st General Assembly... - Youth in Government" - """ - - blocks = self.concat_blocks_for_leg_pages() - # each item within splitted is called a "legislative meta-block" - splitted = split_by_lambda(blocks, lambda x: "71st General Assembly" in x.text) - - return splitted[1:] # there's an empty array at the beginning - - def handle_the_rest(self, the_rest): - weird_character = u'\uFFFd' - splitted_by_weird = the_rest.split(weird_character) - title_content = ''.join( - splitted_by_weird[0].split('\n')[:-1] - ).rstrip().lstrip() - - bill_text = [i.split('\n')[0][1:] for i in splitted_by_weird[1:]] - - return { - "bill_text": '\n'.join(bill_text), - "title": title_content - } - - def parse_legislative_metablocks(self): - output = [] - splitted = self.split_leg_pages() - for legislative_text in splitted: - # there are some blocks that contain just one value - # and are aligned to some x value on the pdf - - # it's an easy way to extract stuff - legislative_text = remove_block_by_x_value(legislative_text, 565) # remove page numbers - leg_code = get_block_by_x_value(legislative_text, 88).text.rstrip() - school = get_block_by_x_value(legislative_text, 163).text.rstrip() - sponsors = get_block_by_x_value(legislative_text, 152).text.rstrip() - subcommittee = get_block_by_x_value(legislative_text, 139).text.rstrip() - the_rest = ''.join([i.text for i in legislative_text[6:]]) - handled = self.handle_the_rest(the_rest) - title = handled["title"] - bill_text = handled["bill_text"] - - output.append({ - "code": leg_code, - "school": school, - "sponsors": sponsors, - "subcommittee": subcommittee, - "title": title, - "bill_text": bill_text - }) - - self.output = output diff --git a/franklincce/explorer/lib/__init__.py b/franklincce/explorer/lib/__init__.py deleted file mode 100644 index 8b13789..0000000 --- a/franklincce/explorer/lib/__init__.py +++ /dev/null @@ -1 +0,0 @@ - diff --git a/franklincce/explorer/lib/common.py b/franklincce/explorer/lib/common.py deleted file mode 100644 index 4f824ce..0000000 --- a/franklincce/explorer/lib/common.py +++ /dev/null @@ -1,48 +0,0 @@ -from typing import Any -class FitzBlockWrapper: - def __init__(self, block): - self.x0, self.y0, self.x1, \ - self.y1, self.text, \ - self.block_number, self.block_type = block - - self.x0 = int(self.x0) - self.x1 = int(self.x1) - self.y0 = int(self.y0) - self.y1 = int(self.y1) - self.block_number = int(self.block_number) - self.block_type = int(self.block_type) - - def __str__(self): - return str(( - self.x0, self.y0, self.x1, self.y1, self.text - )) - - def __repl__(self): - return self.__str__() - -def words_in_superstring(words: list[str], superstring: str) -> bool: - for word in words: - if not str(word).lower() in str(superstring).lower(): - return False - return True - -def split_by_lambda(arr: list[Any], func): - output = [] - current = [] - for item in arr: - if func(item): - output.append(current) - current = [] - else: - current.append(item) - - output.append(current) - return output - -def get_block_by_x_value(arr: list[FitzBlockWrapper], xvalue: int) -> FitzBlockWrapper: - for item in arr: - if item.x0 == xvalue: - return item - -def remove_block_by_x_value(arr: list[FitzBlockWrapper], xvalue: int) -> list[FitzBlockWrapper]: - return [i for i in arr if not i.x0 == xvalue] diff --git a/franklincce/explorer/lib/parsers.py b/franklincce/explorer/lib/parsers.py deleted file mode 100644 index 16fc58e..0000000 --- a/franklincce/explorer/lib/parsers.py +++ /dev/null @@ -1,9 +0,0 @@ -import fitz - -from .HSYIG import HSYIG -from .HSMUN import HSMUN - -if __name__ == "__main__": - d = fitz.open("MUNB2023.pdf") - res = HSMUN(d) - print(res.output) diff --git a/franklincce/explorer/models.py b/franklincce/explorer/models.py index cd312ed..14e7786 100644 --- a/franklincce/explorer/models.py +++ b/franklincce/explorer/models.py @@ -1,7 +1,7 @@ from django.db import models from django.utils.translation import gettext_lazy as _ -from .lib.parsers import HSYIG, HSMUN +from .leglib import HSYIG24, HSMUN23 import io import fitz @@ -40,9 +40,9 @@ class LegislationBook(models.Model): the_file = io.BytesIO(self.pdf.file.file.read()) the_document = fitz.open(stream=the_file) if self.import_strategy == "HSYIGBookParser": - parsed = HSYIG(the_document) + parsed = HSYIG24(the_document) elif self.import_strategy == "HSMUNBookParser": - parsed = HSMUN(the_document) + parsed = HSMUN23(the_document) else: return diff --git a/franklincce/requirements.txt b/franklincce/requirements.txt index df751af..95d79f8 100644 --- a/franklincce/requirements.txt +++ b/franklincce/requirements.txt @@ -1,3 +1,3 @@ -django -pymupdf -gunicorn +django==4.2.12 +pymupdf==1.23.26 +gunicorn \ No newline at end of file