move all the pdf parsing code into one module, and fix a critical parsing bug in containers

2024-06-29 13:52:56 -05:00 · 2024-06-29 13:52:56 -05:00 · 24bf60f245
parent 07dd23a396
commit 24bf60f245
7 changed files with 153 additions and 211 deletions
--- a/franklincce/explorer/lib/HSMUN.py
+++ b/franklincce/explorer/lib/HSMUN.py
@ -1,12 +1,60 @@
-from .common import *
-from typing import ClassVar
+from typing import Any, ClassVar
 from dataclasses import dataclass

 import fitz

-class HSMUN():
-    section_page_words = ["Committee", "Model", "United", "YMCA", "Tennessee", "Nations"]
-    last_page_words = ["ABCs"]
+class FitzBlockWrapper:
+    """
+    Attribute access to the fields of one text-block tuple.
+
+    The wrapped tuple must hold exactly seven items, in this order:
+    x0, y0, x1, y1, text, block_number, block_type. (Presumably this is
+    what fitz returns from page.get_text("blocks") -- confirm at the
+    call site.) Every field except text is truncated to int, so blocks
+    can be compared against whole-number x positions.
+    """
+
+    def __init__(self, block):
+        # unpacking raises ValueError if the tuple is not exactly 7 long
+        x0, y0, x1, y1, text, block_number, block_type = block
+
+        self.x0 = int(x0)
+        self.y0 = int(y0)
+        self.x1 = int(x1)
+        self.y1 = int(y1)
+        self.text = text
+        self.block_number = int(block_number)
+        self.block_type = int(block_type)
+
+    def __str__(self):
+        return str((
+            self.x0, self.y0, self.x1, self.y1, self.text
+        ))
+
+    def __repr__(self):
+        # this was spelled __repl__, which Python never calls, so repr()
+        # and printed lists of blocks fell back to the default object repr
+        return self.__str__()
+
+    # old misspelled name, kept in case anything calls it directly
+    __repl__ = __repr__
+
+def words_in_superstring(words: list[str], superstring: str) -> bool:
+    """
+    Check that every word occurs somewhere in the superstring.
+
+    Matching is a case-insensitive substring test. Both sides go through
+    str() first, so a non-string argument (bytes, for instance) is matched
+    by its string form. An empty word list matches nothing and gives False.
+    """
+    haystack = str(superstring).lower()
+    # the old loop returned True right after the first word matched, so the
+    # remaining words were never looked at; all() checks each of them
+    return bool(words) and all(str(word).lower() in haystack for word in words)
+
+def split_by_lambda(arr: list[Any], func):
+    """
+    Split arr into groups, cutting wherever func(item) is truthy.
+
+    The items that trigger a cut act as separators and are dropped. The
+    result always holds one more group than there were separators, so a
+    leading, trailing or repeated separator leaves an empty list behind.
+    """
+    groups = [[]]
+    for element in arr:
+        if func(element):
+            # separator: start a fresh group and discard the element
+            groups.append([])
+            continue
+        groups[-1].append(element)
+
+    return groups
+
+def get_block_by_x_value(arr: list[FitzBlockWrapper], xvalue: int) -> FitzBlockWrapper:
+    """
+    Return the first block in arr whose x0 equals xvalue.
+
+    NOTE(review): despite the annotation, None comes back when no block
+    matches, so a caller that goes straight to ``.text`` raises
+    AttributeError when the expected block is missing.
+    """
+    matches = (block for block in arr if block.x0 == xvalue)
+    return next(matches, None)
+
+def remove_block_by_x_value(arr: list[FitzBlockWrapper], xvalue: int) -> list[FitzBlockWrapper]:
+    """
+    Return a new list without the blocks whose x0 equals xvalue.
+
+    arr itself is not modified; the order of the kept blocks is preserved.
+    """
+    kept = []
+    for block in arr:
+        if block.x0 == xvalue:
+            continue
+        kept.append(block)
+    return kept
+
+class CCEParserBase():
+    section_page_words: ClassVar[list[str]]
+    last_page_words: ClassVar[list[str]]
+    split_leg_pages_needle: ClassVar[str]

    def __init__(self, document: fitz.Document):
        self.document = document
@ -90,13 +138,14 @@ class HSMUN():

        blocks = self.concat_blocks_for_leg_pages()
        # each item within splitted is called a "legislative meta-block"
-        splitted = split_by_lambda(blocks, lambda x: "43rd General Assembly" in x.text)
+        splitted = split_by_lambda(blocks, lambda x: self.split_leg_pages_needle in x.text)

        return splitted[1:] # there's an empty array at the beginning

    def handle_the_rest(self, the_rest):
-        weird_character = u'\uFFFd'
-        splitted_by_weird = the_rest.split(weird_character)
+        weird_character = u''
+        another_weird_character = u'\uFFFd'
+        splitted_by_weird = the_rest.replace(weird_character, another_weird_character).split(another_weird_character)
        title_content = ''.join(
            splitted_by_weird[0].split('\n')[:-1]
        ).rstrip().lstrip()
@ -156,3 +205,93 @@ class HSMUN():
            })

        self.output = output
+    
+
+class HSMUN23(CCEParserBase):
+    """
+    CCEParserBase configured with the marker strings of the HSMUN book.
+
+    Only the three class-level settings are supplied here; all parsing
+    behaviour is inherited. (Presumably the 23 refers to the 2023 book --
+    confirm.)
+    """
+    # words that must all appear on a section page
+    section_page_words = [
+        "Committee",
+        "Model",
+        "United",
+        "YMCA",
+        "Tennessee",
+        "Nations",
+    ]
+    # words that identify the last page
+    last_page_words = [
+        "ABCs",
+    ]
+    # blocks containing this text separate one piece of legislation from the next
+    split_leg_pages_needle = "43rd General Assembly"
+    
+class HSYIG24(CCEParserBase):
+    """
+    Parser for the YIG book whose legislation is split on "71st General Assembly".
+
+    Supplies its own marker strings and overrides both section detection
+    and metablock parsing; everything else is inherited from CCEParserBase.
+    """
+    section_page_words = [
+        "Committee",
+        "YMCA",
+        "Tennessee",
+        "Youth",
+        "in",
+    ]
+    last_page_words = [
+        "ABCs",
+    ]
+    split_leg_pages_needle = "71st General Assembly"
+
+    def generate_section_markers(self) -> list[int]:
+        """
+        Collect the page numbers that act as section markers.
+
+        This overrides the regular method because a section page here must
+        also carry exactly three images. A page matching last_page_words
+        is appended too, but only once more than two markers are present.
+        """
+        markers = []
+
+        for page in self.document:
+            # NOTE(review): the text is handed over as UTF-8 bytes and
+            # words_in_superstring str()s it, so matching runs on the
+            # bytes repr rather than on the decoded text
+            page_text = page.get_text().encode("utf8")
+            has_section_words = words_in_superstring(
+                words=self.section_page_words,
+                superstring=page_text,
+            )
+            has_last_words = words_in_superstring(
+                words=self.last_page_words,
+                superstring=page_text,
+            )
+
+            if has_section_words and len(page.get_images()) == 3:
+                markers.append(page.number)
+
+            # evaluated after the append above, so a page can count twice
+            if has_last_words and len(markers) > 2:
+                markers.append(page.number)
+
+        return markers
+
+    def parse_legislative_metablocks(self):
+        """
+        Build self.output: one dict per legislative metablock.
+
+        This is YIG specific code. The header fields are picked out by the
+        x position of their block; the remaining blocks are joined and
+        passed to handle_the_rest for the title and bill text.
+        """
+        bills = []
+        for metablock in self.split_leg_pages():
+            # blocks at x0 == 565 are page numbers; drop them first
+            metablock = remove_block_by_x_value(metablock, 565)
+
+            # there are some blocks that contain just one value and are
+            # aligned to some x value on the pdf, which makes them easy
+            # to extract
+            leg_code = get_block_by_x_value(metablock, 88).text.rstrip()
+            school = get_block_by_x_value(metablock, 163).text.rstrip()
+            sponsors = get_block_by_x_value(metablock, 152).text.rstrip()
+            subcommittee = get_block_by_x_value(metablock, 139).text.rstrip()
+
+            # everything from the seventh block on goes to handle_the_rest
+            # (presumably the first six are the header blocks -- confirm)
+            remainder = ''.join([block.text for block in metablock[6:]])
+            handled = self.handle_the_rest(remainder)
+
+            bills.append({
+                "code": leg_code,
+                "school": school,
+                "sponsors": sponsors,
+                "subcommittee": subcommittee,
+                "title": handled["title"],
+                "bill_text": handled["bill_text"],
+            })
+
+        self.output = bills
+
+def main():
+    """
+    Command-line entry point: parse a book PDF and print its legislation.
+
+    Reads the PDF path from sys.argv[1] and the book type from sys.argv[2],
+    which must be "HSYIG" or "HSMUN". For every parsed item the title and
+    bill text are printed. Returns None in all cases; a usage line is
+    written to stderr when arguments are missing.
+    """
+    # imported here so main() also works when called from an importing
+    # module, not only when this file is run as a script
+    import sys
+
+    argv = sys.argv
+    if len(argv) < 3:
+        print("usage: <script> <book.pdf> <HSYIG|HSMUN>", file=sys.stderr)
+        return
+
+    # validate the book type before touching the file, so a bad type does
+    # not leave an opened document behind
+    book = argv[2]
+    if book not in ("HSYIG", "HSMUN"):
+        print("nonvalid book thing")
+        return
+
+    document = fitz.open(argv[1])
+    if book == "HSYIG":
+        parsed = HSYIG24(document)
+    else:
+        parsed = HSMUN23(document)
+
+    for text in parsed.output:
+        print("{} ---------------------------- {}".format(
+            text["title"], text["bill_text"]
+        ))
+
+if __name__ == "__main__":
+    main()
--- a/franklincce/explorer/lib/HSYIG.py
+++ b/franklincce/explorer/lib/HSYIG.py
@ -1,139 +0,0 @@
-from .common import *
-from typing import ClassVar
-from dataclasses import dataclass
-
-import fitz
-
-class HSYIG():
-    section_page_words = [ "Committee", "YMCA", "Tennessee", "Youth", "in" ]
-    last_page_words = [ "ABCs" ]
-
-    def __init__(self, document: fitz.Document):
-        self.document = document
-        self.__post_init__()
-
-    def __post_init__(self):
-        # run all the processing steps here
-        self.parse_legislative_metablocks()
-
-    def generate_section_markers(self) -> list[int]:
-        """
-        In the YIG/MUN manuals, there's section markers that delineate between the different
-        committees within the manual. Let's find those, and then the last legislative page.
-        """
-        section_pages = []
-
-        for page in self.document:
-            text = page.get_text().encode("utf8")
-            is_section_page = words_in_superstring(
-                words = self.section_page_words,
-                superstring = text
-            )
-            is_last_page = words_in_superstring(
-                words = self.last_page_words,
-                superstring = text
-            )
-
-            print(text, is_section_page, is_last_page)
-
-            if is_section_page and len(page.get_images()) == 3:
-                section_pages.append(page.number)
-
-            if is_last_page and len(section_pages) > 2:
-                section_pages.append(page.number)
-
-        return section_pages
-
-    def get_legislative_pages(self):
-        """
-        Generate the section markers, then fill in the pages between them.
-        """
-
-        current = 0
-        sections = self.generate_section_markers()
-        legislative_pages: list[int] = []
-        try:
-            while True:
-                legislative_pages += list(
-                    range(
-                        sections[current] + 1,
-                        sections[current + 1],
-                        1
-                    )
-                )
-
-                current += 1
-
-        except IndexError:
-            pass
-
-        return legislative_pages
-
-    def concat_blocks_for_leg_pages(self):
-        """
-        From the legislative pages, concatenate the "blocks" of text in the PDF.
-        """
-        blocks = []
-        pages = [self.document.load_page(page_num) for page_num in self.get_legislative_pages()]
-        for page in pages:
-            block_info = [FitzBlockWrapper(block) for block in page.get_text("blocks")]
-
-            blocks += block_info
-
-        return blocks
-
-    def split_leg_pages(self):
-        """
-        We have the collection of legislative page text blocks. We need
-        to split them now. We split on the text "71st General Assembly...
-        Youth in Government"
-        """
-
-        blocks = self.concat_blocks_for_leg_pages()
-        # each item within splitted is called a "legislative meta-block"
-        splitted = split_by_lambda(blocks, lambda x: "71st General Assembly" in x.text)
-
-        return splitted[1:] # there's an empty array at the beginning
-
-    def handle_the_rest(self, the_rest):
-        weird_character = u'\uFFFd'
-        splitted_by_weird = the_rest.split(weird_character)
-        title_content = ''.join(
-            splitted_by_weird[0].split('\n')[:-1]
-        ).rstrip().lstrip()
-
-        bill_text = [i.split('\n')[0][1:] for i in splitted_by_weird[1:]]
-
-        return {
-            "bill_text": '\n'.join(bill_text),
-            "title": title_content
-        }
-
-    def parse_legislative_metablocks(self):
-        output = []
-        splitted = self.split_leg_pages()
-        for legislative_text in splitted:
-            # there are some blocks that contain just one value
-            # and are aligned to some x value on the pdf
-
-            # it's an easy way to extract stuff
-            legislative_text = remove_block_by_x_value(legislative_text, 565) # remove page numbers
-            leg_code = get_block_by_x_value(legislative_text, 88).text.rstrip()
-            school = get_block_by_x_value(legislative_text, 163).text.rstrip()
-            sponsors = get_block_by_x_value(legislative_text, 152).text.rstrip()
-            subcommittee = get_block_by_x_value(legislative_text, 139).text.rstrip()
-            the_rest = ''.join([i.text for i in legislative_text[6:]])
-            handled = self.handle_the_rest(the_rest)
-            title = handled["title"]
-            bill_text = handled["bill_text"]
-
-            output.append({
-                "code": leg_code,
-                "school": school,
-                "sponsors": sponsors,
-                "subcommittee": subcommittee,
-                "title": title,
-                "bill_text": bill_text
-            })
-
-        self.output = output
--- a/franklincce/explorer/lib/init.py
+++ b/franklincce/explorer/lib/init.py
@ -1 +0,0 @@
-
--- a/franklincce/explorer/lib/common.py
+++ b/franklincce/explorer/lib/common.py
@ -1,48 +0,0 @@
-from typing import Any
-class FitzBlockWrapper:
-    def __init__(self, block):
-        self.x0, self.y0, self.x1, \
-            self.y1, self.text, \
-            self.block_number, self.block_type = block
-
-        self.x0 = int(self.x0)
-        self.x1 = int(self.x1)
-        self.y0 = int(self.y0)
-        self.y1 = int(self.y1)
-        self.block_number = int(self.block_number)
-        self.block_type = int(self.block_type)
-
-    def __str__(self):
-        return str((
-            self.x0, self.y0, self.x1, self.y1, self.text
-        ))
-
-    def __repl__(self):
-        return self.__str__()
-
-def words_in_superstring(words: list[str], superstring: str) -> bool:
-    for word in words:
-        if not str(word).lower() in str(superstring).lower():
-            return False
-        return True
-
-def split_by_lambda(arr: list[Any], func):
-    output = []
-    current = []
-    for item in arr:
-        if func(item):
-            output.append(current)
-            current = []
-        else:
-            current.append(item)
-
-    output.append(current)
-    return output
-
-def get_block_by_x_value(arr: list[FitzBlockWrapper], xvalue: int) -> FitzBlockWrapper:
-    for item in arr:
-        if item.x0 == xvalue:
-            return item
-
-def remove_block_by_x_value(arr: list[FitzBlockWrapper], xvalue: int) -> list[FitzBlockWrapper]:
-    return [i for i in arr if not i.x0 == xvalue]
--- a/franklincce/explorer/lib/parsers.py
+++ b/franklincce/explorer/lib/parsers.py
@ -1,9 +0,0 @@
-import fitz
-
-from .HSYIG import HSYIG
-from .HSMUN import HSMUN
-
-if __name__ == "__main__":
-    d = fitz.open("MUNB2023.pdf")
-    res = HSMUN(d)
-    print(res.output)
--- a/franklincce/explorer/models.py
+++ b/franklincce/explorer/models.py
@ -1,7 +1,7 @@
 from django.db import models
 from django.utils.translation import gettext_lazy as _

-from .lib.parsers import HSYIG, HSMUN
+from .leglib import HSYIG24, HSMUN23
 import io
 import fitz

@ -40,9 +40,9 @@ class LegislationBook(models.Model):
        the_file = io.BytesIO(self.pdf.file.file.read())
        the_document = fitz.open(stream=the_file)
        if self.import_strategy == "HSYIGBookParser":
-            parsed = HSYIG(the_document)
+            parsed = HSYIG24(the_document)
        elif self.import_strategy == "HSMUNBookParser":
-            parsed = HSMUN(the_document)
+            parsed = HSMUN23(the_document)
        else:
            return

--- a/franklincce/requirements.txt
+++ b/franklincce/requirements.txt
@ -1,3 +1,3 @@
-django
-pymupdf
+django==4.2.12
+pymupdf==1.23.26
 gunicorn