move all the pdf parsing code into one module, and fix a critical parsing bug in containers

This commit is contained in:
stupidcomputer 2024-06-29 13:52:56 -05:00
parent 07dd23a396
commit 24bf60f245
7 changed files with 153 additions and 211 deletions

View File

@ -1,12 +1,60 @@
from .common import * from typing import Any, ClassVar
from typing import ClassVar
from dataclasses import dataclass from dataclasses import dataclass
import fitz import fitz
class HSMUN(): class FitzBlockWrapper:
section_page_words = ["Committee", "Model", "United", "YMCA", "Tennessee", "Nations"] def __init__(self, block):
last_page_words = ["ABCs"] self.x0, self.y0, self.x1, \
self.y1, self.text, \
self.block_number, self.block_type = block
self.x0 = int(self.x0)
self.x1 = int(self.x1)
self.y0 = int(self.y0)
self.y1 = int(self.y1)
self.block_number = int(self.block_number)
self.block_type = int(self.block_type)
def __str__(self):
return str((
self.x0, self.y0, self.x1, self.y1, self.text
))
def __repl__(self):
return self.__str__()
def words_in_superstring(words: list[str], superstring: str) -> bool:
    """
    Return True when every entry of ``words`` occurs in ``superstring``,
    ignoring case. An empty ``words`` list yields True.

    ``superstring`` may also be UTF-8 encoded bytes (callers in this module
    pass ``page.get_text().encode("utf8")``). Bytes are decoded before the
    search; calling ``str()`` on them would search the bytes *repr*, in which
    non-ASCII characters and newlines show up as escape sequences.
    """
    if isinstance(superstring, (bytes, bytearray)):
        haystack = bytes(superstring).decode("utf8", errors="replace")
    else:
        haystack = str(superstring)

    # lowercase the haystack once instead of once per word
    haystack = haystack.lower()
    return all(str(word).lower() in haystack for word in words)
def split_by_lambda(arr: list[Any], func):
    """
    Split ``arr`` into consecutive groups, cutting wherever ``func(item)`` is truthy.

    The items that trigger a cut are dropped. The result always contains at
    least one group, and groups may be empty (e.g. when ``arr`` starts with a
    separator or two separators are adjacent).
    """
    groups = [[]]
    for element in arr:
        if func(element):
            # separator: close the running group by opening a fresh one
            groups.append([])
        else:
            groups[-1].append(element)
    return groups
def get_block_by_x_value(arr: list[FitzBlockWrapper], xvalue: int) -> FitzBlockWrapper:
    """
    Return the first block in ``arr`` whose ``x0`` equals ``xvalue``.

    When no block matches the result is ``None``, so callers that go straight
    to ``.text`` will raise ``AttributeError`` on a missing block.
    """
    matching = (block for block in arr if block.x0 == xvalue)
    return next(matching, None)
def remove_block_by_x_value(arr: list[FitzBlockWrapper], xvalue: int) -> list[FitzBlockWrapper]:
    """Return a new list with every block whose ``x0`` equals ``xvalue`` left out."""
    def keep(block):
        return block.x0 != xvalue

    return list(filter(keep, arr))
class CCEParserBase():
section_page_words: ClassVar[list[str]]
last_page_words: ClassVar[list[str]]
split_leg_pages_needle: ClassVar[str]
def __init__(self, document: fitz.Document): def __init__(self, document: fitz.Document):
self.document = document self.document = document
@ -90,13 +138,14 @@ class HSMUN():
blocks = self.concat_blocks_for_leg_pages() blocks = self.concat_blocks_for_leg_pages()
# each item within splitted is called a "legislative meta-block" # each item within splitted is called a "legislative meta-block"
splitted = split_by_lambda(blocks, lambda x: "43rd General Assembly" in x.text) splitted = split_by_lambda(blocks, lambda x: self.split_leg_pages_needle in x.text)
return splitted[1:] # there's an empty array at the beginning return splitted[1:] # there's an empty array at the beginning
def handle_the_rest(self, the_rest): def handle_the_rest(self, the_rest):
weird_character = u'\uFFFd' weird_character = u''
splitted_by_weird = the_rest.split(weird_character) another_weird_character = u'\uFFFd'
splitted_by_weird = the_rest.replace(weird_character, another_weird_character).split(another_weird_character)
title_content = ''.join( title_content = ''.join(
splitted_by_weird[0].split('\n')[:-1] splitted_by_weird[0].split('\n')[:-1]
).rstrip().lstrip() ).rstrip().lstrip()
@ -156,3 +205,93 @@ class HSMUN():
}) })
self.output = output self.output = output
class HSMUN23(CCEParserBase):
    """
    Settings for the book type selected with ``HSMUN``.

    Only the class-level search strings are set here; all parsing behaviour
    is inherited from ``CCEParserBase``.
    """

    # words used to recognise a section page
    section_page_words = [
        "Committee",
        "Model",
        "United",
        "YMCA",
        "Tennessee",
        "Nations",
    ]

    # words used to recognise the last page
    last_page_words = ["ABCs"]

    # text that marks where one piece of legislation ends and the next begins
    split_leg_pages_needle = "43rd General Assembly"
class HSYIG24(CCEParserBase):
    """
    Settings and overrides for the book type selected with ``HSYIG``.

    Besides the class-level search strings, this class replaces section-page
    detection (section pages must also carry exactly three images) and the
    extraction of per-bill fields.
    """

    # every one of these words must appear on a page for it to be a section page
    section_page_words = ["Committee", "YMCA", "Tennessee", "Youth", "in"]
    # words used to recognise the last page
    last_page_words = ["ABCs"]
    # text that marks where one piece of legislation ends and the next begins
    split_leg_pages_needle = "71st General Assembly"

    def generate_section_markers(self) -> list[int]:
        """
        Return the page numbers that delimit the sections of the book.

        This overrides the regular method because we need to check
        for three images on a section page.
        """
        section_pages = []
        for page in self.document:
            # Search the decoded page text. Encoding it to bytes first made the
            # helper look at the bytes repr, where non-ASCII characters and
            # newlines appear as escape sequences.
            text = page.get_text()
            is_section_page = words_in_superstring(
                words=self.section_page_words,
                superstring=text,
            )
            is_last_page = words_in_superstring(
                words=self.last_page_words,
                superstring=text,
            )

            if is_section_page and len(page.get_images()) == 3:
                section_pages.append(page.number)

            # the closing page is only recorded once more than two section
            # pages have already been found
            if is_last_page and len(section_pages) > 2:
                section_pages.append(page.number)

        return section_pages

    def parse_legislative_metablocks(self):
        """
        This is YIG specific code.

        Build one dict per legislative meta-block with the keys ``code``,
        ``school``, ``sponsors``, ``subcommittee``, ``title`` and
        ``bill_text``, and store the list in ``self.output``.

        A meta-block that lacks one of the expected x-aligned blocks raises
        ``AttributeError``, because the lookup helper returns ``None`` for it.
        """
        output = []

        for legislative_text in self.split_leg_pages():
            # there are some blocks that contain just one value
            # and are aligned to some x value on the pdf
            # it's an easy way to extract stuff
            legislative_text = remove_block_by_x_value(legislative_text, 565)  # remove page numbers

            leg_code = get_block_by_x_value(legislative_text, 88).text.rstrip()
            school = get_block_by_x_value(legislative_text, 163).text.rstrip()
            sponsors = get_block_by_x_value(legislative_text, 152).text.rstrip()
            subcommittee = get_block_by_x_value(legislative_text, 139).text.rstrip()

            # everything from the seventh remaining block on is treated as
            # title + bill text (presumably the first six are header fields
            # -- confirm against a sample book)
            the_rest = ''.join(block.text for block in legislative_text[6:])
            handled = self.handle_the_rest(the_rest)

            output.append({
                "code": leg_code,
                "school": school,
                "sponsors": sponsors,
                "subcommittee": subcommittee,
                "title": handled["title"],
                "bill_text": handled["bill_text"],
            })

        self.output = output
def main(argv=None):
    """
    Command-line driver: parse a book PDF and print every bill's title and text.

    ``argv`` defaults to ``sys.argv`` and is expected to look like
    ``[program, pdf_path, book_kind]`` where ``book_kind`` is ``"HSYIG"`` or
    ``"HSMUN"``. With too few arguments a usage line is written to stderr;
    with an unknown book kind a message is printed. In both cases the function
    returns without opening the PDF.
    """
    # Imported locally: at module level ``sys`` is only imported inside the
    # ``__main__`` guard, so calling main() from other code would otherwise
    # fail with NameError.
    import sys

    if argv is None:
        argv = sys.argv

    if len(argv) < 3:
        program = argv[0] if argv else "leglib"
        print("usage: {} <pdf path> <HSYIG|HSMUN>".format(program), file=sys.stderr)
        return

    # validate the book kind before touching the file
    book_kind = argv[2]
    if book_kind == "HSYIG":
        parser_class = HSYIG24
    elif book_kind == "HSMUN":
        parser_class = HSMUN23
    else:
        print("nonvalid book thing")
        return

    parsed = parser_class(fitz.open(argv[1]))

    for text in parsed.output:
        print("{} ---------------------------- {}".format(
            text["title"], text["bill_text"]
        ))
# Script entry point: only run the command-line driver when this module is
# executed directly, not when it is imported.
if __name__ == "__main__":
    # bind sys at module level before calling main(), which reads the
    # command line from sys.argv
    import sys
    main()

View File

@ -1,139 +0,0 @@
from .common import *
from typing import ClassVar
from dataclasses import dataclass
import fitz
class HSYIG():
    """
    Extract bills from a legislative book PDF.

    Constructing an instance runs the whole pipeline. The result is stored in
    ``self.output`` as a list of dicts with the keys ``code``, ``school``,
    ``sponsors``, ``subcommittee``, ``title`` and ``bill_text``.
    """

    # every one of these words must appear on a page for it to be a section page
    section_page_words = ["Committee", "YMCA", "Tennessee", "Youth", "in"]
    # words used to recognise the last legislative page
    last_page_words = ["ABCs"]

    def __init__(self, document: fitz.Document):
        self.document = document
        self.__post_init__()

    def __post_init__(self):
        # run all the processing steps here
        self.parse_legislative_metablocks()

    def generate_section_markers(self) -> list[int]:
        """
        In the YIG/MUN manuals, there's section markers that delineate between the different
        committees within the manual. Let's find those, and then the last legislative page.

        A page counts as a section marker when it contains all of
        ``section_page_words`` and exactly three images. The last page is only
        recorded once more than two markers have been found.
        """
        section_pages = []
        for page in self.document:
            # Search the decoded page text. Encoding it to bytes first made the
            # helper look at the bytes repr, where non-ASCII characters and
            # newlines appear as escape sequences.
            text = page.get_text()
            is_section_page = words_in_superstring(
                words=self.section_page_words,
                superstring=text,
            )
            is_last_page = words_in_superstring(
                words=self.last_page_words,
                superstring=text,
            )

            if is_section_page and len(page.get_images()) == 3:
                section_pages.append(page.number)

            if is_last_page and len(section_pages) > 2:
                section_pages.append(page.number)

        return section_pages

    def get_legislative_pages(self):
        """
        Generate the section markers, then fill in the pages between them.

        For consecutive markers ``a`` and ``b`` the pages ``a + 1 .. b - 1``
        are included; the marker pages themselves are not. Fewer than two
        markers yields an empty list.
        """
        sections = self.generate_section_markers()
        legislative_pages: list[int] = []

        # walk the markers pairwise instead of indexing until IndexError
        for start, end in zip(sections, sections[1:]):
            legislative_pages.extend(range(start + 1, end))

        return legislative_pages

    def concat_blocks_for_leg_pages(self):
        """
        From the legislative pages, concatenate the "blocks" of text in the PDF.
        """
        blocks = []
        for page_num in self.get_legislative_pages():
            page = self.document.load_page(page_num)
            blocks.extend(FitzBlockWrapper(block) for block in page.get_text("blocks"))

        return blocks

    def split_leg_pages(self):
        """
        We have the collection of legislative page text blocks. We need
        to split them now. We split on the text "71st General Assembly...
        Youth in Government"
        """
        blocks = self.concat_blocks_for_leg_pages()

        # each item within splitted is called a "legislative meta-block"
        splitted = split_by_lambda(blocks, lambda x: "71st General Assembly" in x.text)

        # drop the group in front of the first marker
        return splitted[1:] # there's an empty array at the beginning

    def handle_the_rest(self, the_rest):
        """
        Split the trailing text of a meta-block into a title and the bill text.

        The text is cut at every U+FFFD character. The part before the first
        one becomes the title: its lines are joined without separator, the
        piece after the last newline is discarded, and the result is stripped.
        From each later part only the first line is kept, minus its first
        character; those lines are joined with newlines.
        """
        weird_character = u'\uFFFd'
        segments = the_rest.split(weird_character)

        title_lines = segments[0].split('\n')[:-1]
        title_content = ''.join(title_lines).strip()

        bill_text = [segment.split('\n')[0][1:] for segment in segments[1:]]

        return {
            "bill_text": '\n'.join(bill_text),
            "title": title_content
        }

    def parse_legislative_metablocks(self):
        """
        Build one dict per legislative meta-block and store the list in
        ``self.output``.

        A meta-block that lacks one of the expected x-aligned blocks raises
        ``AttributeError``, because the lookup helper returns ``None`` for it.
        """
        output = []

        for legislative_text in self.split_leg_pages():
            # there are some blocks that contain just one value
            # and are aligned to some x value on the pdf
            # it's an easy way to extract stuff
            legislative_text = remove_block_by_x_value(legislative_text, 565) # remove page numbers

            leg_code = get_block_by_x_value(legislative_text, 88).text.rstrip()
            school = get_block_by_x_value(legislative_text, 163).text.rstrip()
            sponsors = get_block_by_x_value(legislative_text, 152).text.rstrip()
            subcommittee = get_block_by_x_value(legislative_text, 139).text.rstrip()

            # everything from the seventh remaining block on is title + bill text
            the_rest = ''.join(block.text for block in legislative_text[6:])
            handled = self.handle_the_rest(the_rest)

            output.append({
                "code": leg_code,
                "school": school,
                "sponsors": sponsors,
                "subcommittee": subcommittee,
                "title": handled["title"],
                "bill_text": handled["bill_text"]
            })

        self.output = output

View File

@ -1 +0,0 @@

View File

@ -1,48 +0,0 @@
from typing import Any
class FitzBlockWrapper:
    """
    Attribute-style view of one seven-item text block tuple
    ``(x0, y0, x1, y1, text, block_number, block_type)``.

    All numeric fields are converted with ``int()``, which lets other code
    match blocks on an exact integer ``x0``.
    """

    def __init__(self, block):
        x0, y0, x1, y1, text, block_number, block_type = block

        self.x0 = int(x0)
        self.y0 = int(y0)
        self.x1 = int(x1)
        self.y1 = int(y1)
        self.text = text
        self.block_number = int(block_number)
        self.block_type = int(block_type)

    def __str__(self):
        """Render the block as the string form of ``(x0, y0, x1, y1, text)``."""
        return str((
            self.x0, self.y0, self.x1, self.y1, self.text
        ))

    def __repr__(self):
        """Return the same text as ``__str__`` so blocks are readable inside lists."""
        return self.__str__()

    # Backward-compatible alias. The method used to be spelled ``__repl__``, a
    # name Python never looks up, so ``repr()`` fell back to the default form.
    __repl__ = __repr__
def words_in_superstring(words: list[str], superstring: str) -> bool:
    """
    Return True when every entry of ``words`` occurs in ``superstring``,
    ignoring case. An empty ``words`` list yields True.

    ``superstring`` may also be UTF-8 encoded bytes. Bytes are decoded before
    the search; calling ``str()`` on them would search the bytes *repr*, in
    which non-ASCII characters and newlines show up as escape sequences.
    """
    if isinstance(superstring, (bytes, bytearray)):
        haystack = bytes(superstring).decode("utf8", errors="replace")
    else:
        haystack = str(superstring)

    # lowercase the haystack once instead of once per word
    haystack = haystack.lower()
    return all(str(word).lower() in haystack for word in words)
def split_by_lambda(arr: list[Any], func):
    """
    Split ``arr`` into consecutive groups, cutting wherever ``func(item)`` is truthy.

    The items that trigger a cut are dropped. The result always contains at
    least one group, and groups may be empty.
    """
    groups = [[]]
    for element in arr:
        if func(element):
            # separator: close the running group by opening a fresh one
            groups.append([])
        else:
            groups[-1].append(element)
    return groups
def get_block_by_x_value(arr: list[FitzBlockWrapper], xvalue: int) -> FitzBlockWrapper:
    """
    Return the first block in ``arr`` whose ``x0`` equals ``xvalue``.

    When no block matches the result is ``None``.
    """
    matching = (block for block in arr if block.x0 == xvalue)
    return next(matching, None)
def remove_block_by_x_value(arr: list[FitzBlockWrapper], xvalue: int) -> list[FitzBlockWrapper]:
    """Return a new list with every block whose ``x0`` equals ``xvalue`` left out."""
    def keep(block):
        return block.x0 != xvalue

    return list(filter(keep, arr))

View File

@ -1,9 +0,0 @@
import fitz
from .HSYIG import HSYIG
from .HSMUN import HSMUN
# Ad-hoc manual check: parse one hard-coded sample book and dump the result.
if __name__ == "__main__":
    # relative path, so the PDF has to sit in the current working directory
    d = fitz.open("MUNB2023.pdf")
    res = HSMUN(d)
    # res.output is the list of parsed bill dicts
    print(res.output)

View File

@ -1,7 +1,7 @@
from django.db import models from django.db import models
from django.utils.translation import gettext_lazy as _ from django.utils.translation import gettext_lazy as _
from .lib.parsers import HSYIG, HSMUN from .leglib import HSYIG24, HSMUN23
import io import io
import fitz import fitz
@ -40,9 +40,9 @@ class LegislationBook(models.Model):
the_file = io.BytesIO(self.pdf.file.file.read()) the_file = io.BytesIO(self.pdf.file.file.read())
the_document = fitz.open(stream=the_file) the_document = fitz.open(stream=the_file)
if self.import_strategy == "HSYIGBookParser": if self.import_strategy == "HSYIGBookParser":
parsed = HSYIG(the_document) parsed = HSYIG24(the_document)
elif self.import_strategy == "HSMUNBookParser": elif self.import_strategy == "HSMUNBookParser":
parsed = HSMUN(the_document) parsed = HSMUN23(the_document)
else: else:
return return

View File

@ -1,3 +1,3 @@
django django==4.2.12
pymupdf pymupdf==1.23.26
gunicorn gunicorn