move all the pdf parsing code into one module, and fix a critical parsing bug in containers
This commit is contained in:
parent
07dd23a396
commit
24bf60f245
|
@ -1,12 +1,60 @@
|
||||||
from .common import *
|
from typing import Any, ClassVar
|
||||||
from typing import ClassVar
|
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
|
|
||||||
import fitz
|
import fitz
|
||||||
|
|
||||||
class HSMUN():
|
class FitzBlockWrapper:
    """Attribute-style wrapper around one text-block tuple from PyMuPDF.

    ``block`` is unpacked positionally as
    ``(x0, y0, x1, y1, text, block_number, block_type)``; unpacking fails
    with ``ValueError`` if it does not have exactly seven items.
    """

    def __init__(self, block):
        (
            self.x0, self.y0, self.x1, self.y1,
            self.text,
            self.block_number, self.block_type,
        ) = block

        # int() truncates the coordinates so other helpers can match a
        # block's position with a plain ``==`` against an integer.
        self.x0 = int(self.x0)
        self.x1 = int(self.x1)
        self.y0 = int(self.y0)
        self.y1 = int(self.y1)
        self.block_number = int(self.block_number)
        self.block_type = int(self.block_type)

    def __str__(self):
        """Render the bounding box and text as a tuple string."""
        return str((
            self.x0, self.y0, self.x1, self.y1, self.text
        ))

    def __repr__(self):
        """Use the same rendering as ``__str__`` (e.g. inside lists)."""
        return self.__str__()

    # The method was originally misspelt ``__repl__``, which Python never
    # calls; the old name stays as an alias for any explicit callers.
    __repl__ = __repr__
|
||||||
|
|
||||||
|
def words_in_superstring(words: list[str], superstring: str) -> bool:
    """Return True when every word occurs in ``superstring``, ignoring case.

    ``superstring`` (and each word) may be ``str`` or UTF-8 encoded
    ``bytes``; any other object is converted with ``str()``. An empty
    ``words`` list yields True.
    """
    def _lowered(value) -> str:
        # str(b"...") would produce the "b'...'" repr with escape sequences,
        # so non-ASCII words could never match; decode bytes instead.
        if isinstance(value, (bytes, bytearray)):
            return bytes(value).decode("utf-8", errors="replace").lower()
        return str(value).lower()

    # Lower-case the haystack once instead of once per word.
    haystack = _lowered(superstring)
    return all(_lowered(word) in haystack for word in words)
|
||||||
|
|
||||||
|
def split_by_lambda(arr: list[Any], func):
    """Split ``arr`` into runs separated by the items for which ``func`` is true.

    Separator items are dropped. The result always holds one more run than
    there were separators, so a leading or trailing separator produces an
    empty run and an empty ``arr`` gives ``[[]]``.
    """
    runs = [[]]
    for element in arr:
        if not func(element):
            runs[-1].append(element)
            continue
        # separator: close the current run by opening a fresh one
        runs.append([])
    return runs
|
||||||
|
|
||||||
|
def get_block_by_x_value(arr: "list[FitzBlockWrapper]", xvalue: int) -> "FitzBlockWrapper":
    """Return the first block in ``arr`` whose ``x0`` equals ``xvalue``.

    When no block matches, the result is ``None``.
    """
    candidates = (block for block in arr if block.x0 == xvalue)
    return next(candidates, None)
|
||||||
|
|
||||||
|
def remove_block_by_x_value(arr: "list[FitzBlockWrapper]", xvalue: int) -> "list[FitzBlockWrapper]":
    """Return a new list holding the blocks of ``arr`` whose ``x0`` is not ``xvalue``.

    ``arr`` itself is left unmodified and the order of the kept blocks is
    preserved.
    """
    kept = []
    for block in arr:
        if block.x0 == xvalue:
            continue
        kept.append(block)
    return kept
|
||||||
|
|
||||||
|
class CCEParserBase():
|
||||||
|
section_page_words: ClassVar[list[str]]
|
||||||
|
last_page_words: ClassVar[list[str]]
|
||||||
|
split_leg_pages_needle: ClassVar[str]
|
||||||
|
|
||||||
def __init__(self, document: fitz.Document):
|
def __init__(self, document: fitz.Document):
|
||||||
self.document = document
|
self.document = document
|
||||||
|
@ -90,13 +138,14 @@ class HSMUN():
|
||||||
|
|
||||||
blocks = self.concat_blocks_for_leg_pages()
|
blocks = self.concat_blocks_for_leg_pages()
|
||||||
# each item within splitted is called a "legislative meta-block"
|
# each item within splitted is called a "legislative meta-block"
|
||||||
splitted = split_by_lambda(blocks, lambda x: "43rd General Assembly" in x.text)
|
splitted = split_by_lambda(blocks, lambda x: self.split_leg_pages_needle in x.text)
|
||||||
|
|
||||||
return splitted[1:] # there's an empty array at the beginning
|
return splitted[1:] # there's an empty array at the beginning
|
||||||
|
|
||||||
def handle_the_rest(self, the_rest):
|
def handle_the_rest(self, the_rest):
|
||||||
weird_character = u'\uFFFd'
|
weird_character = u''
|
||||||
splitted_by_weird = the_rest.split(weird_character)
|
another_weird_character = u'\uFFFd'
|
||||||
|
splitted_by_weird = the_rest.replace(weird_character, another_weird_character).split(another_weird_character)
|
||||||
title_content = ''.join(
|
title_content = ''.join(
|
||||||
splitted_by_weird[0].split('\n')[:-1]
|
splitted_by_weird[0].split('\n')[:-1]
|
||||||
).rstrip().lstrip()
|
).rstrip().lstrip()
|
||||||
|
@ -156,3 +205,93 @@ class HSMUN():
|
||||||
})
|
})
|
||||||
|
|
||||||
self.output = output
|
self.output = output
|
||||||
|
|
||||||
|
|
||||||
|
class HSMUN23(CCEParserBase):
    """Parser configuration for the book split on "43rd General Assembly".

    Only the class-level settings are supplied here; the parsing steps come
    from ``CCEParserBase``.
    """

    # NOTE(review): presumably all of these must appear on a page for it to
    # count as a section page -- confirm against the base-class method.
    section_page_words = [
        "Committee",
        "Model",
        "United",
        "YMCA",
        "Tennessee",
        "Nations",
    ]
    last_page_words = ["ABCs"]
    # blocks whose text contains this string separate the legislative
    # meta-blocks
    split_leg_pages_needle = "43rd General Assembly"
|
||||||
|
|
||||||
|
class HSYIG24(CCEParserBase):
    """Parser configuration and YIG-specific overrides.

    Legislative meta-blocks in this book are separated by blocks containing
    "71st General Assembly".
    """

    section_page_words = ["Committee", "YMCA", "Tennessee", "Youth", "in"]
    last_page_words = ["ABCs"]
    split_leg_pages_needle = "71st General Assembly"

    def generate_section_markers(self) -> list[int]:
        """Collect the page numbers that act as section markers.

        Overrides the regular method: a page is only recorded as a section
        page when it contains every section word and also carries exactly
        three images. A page containing the last-page words is recorded as
        well, but only once more than two markers have been collected.
        """
        markers = []

        for page in self.document:
            page_text = page.get_text().encode("utf8")
            has_section_words = words_in_superstring(
                words=self.section_page_words,
                superstring=page_text,
            )
            has_last_words = words_in_superstring(
                words=self.last_page_words,
                superstring=page_text,
            )

            if has_section_words and len(page.get_images()) == 3:
                markers.append(page.number)

            if has_last_words and len(markers) > 2:
                markers.append(page.number)

        return markers

    def parse_legislative_metablocks(self):
        """Build one dict per legislative meta-block and store the list on ``self.output``.

        This is YIG-specific: the header fields are single blocks aligned
        to fixed x positions on the page, so each one is looked up by the
        ``x0`` of its block.
        """
        def field_at(blocks, x_position):
            # text of the block aligned at x_position, trailing whitespace removed
            return get_block_by_x_value(blocks, x_position).text.rstrip()

        bills = []
        for metablock in self.split_leg_pages():
            blocks = remove_block_by_x_value(metablock, 565)  # remove page numbers

            record = {
                "code": field_at(blocks, 88),
                "school": field_at(blocks, 163),
                "sponsors": field_at(blocks, 152),
                "subcommittee": field_at(blocks, 139),
            }

            # NOTE(review): the first six blocks are skipped -- presumably
            # the header blocks; confirm against the PDF layout.
            remainder = ''.join(block.text for block in blocks[6:])
            handled = self.handle_the_rest(remainder)
            record["title"] = handled["title"]
            record["bill_text"] = handled["bill_text"]

            bills.append(record)

        self.output = bills
|
||||||
|
|
||||||
|
def main():
    """Command-line entry point: parse a legislation PDF and print each bill.

    Reads ``sys.argv``: ``argv[1]`` is the path handed to ``fitz.open`` and
    ``argv[2]`` selects the parser ("HSYIG" or "HSMUN"). For every parsed
    entry the title and the bill text are printed. Returns ``None``.
    """
    # Imported locally so main() also works when called from another module;
    # previously ``sys`` only existed when the file ran as a script.
    import sys

    argv = sys.argv
    if len(argv) < 3:
        # Previously this surfaced as a bare IndexError.
        print(
            "usage: {} <pdf-path> <HSYIG|HSMUN>".format(argv[0] if argv else "leglib"),
            file=sys.stderr,
        )
        return

    # Validate the book type before opening the PDF so an invalid choice
    # does not open a document that is then never used.
    if argv[2] == "HSYIG":
        parser_class = HSYIG24
    elif argv[2] == "HSMUN":
        parser_class = HSMUN23
    else:
        print("nonvalid book thing")
        return

    doc = parser_class(fitz.open(argv[1]))

    for text in doc.output:
        print("{} ---------------------------- {}".format(
            text["title"], text["bill_text"]
        ))


if __name__ == "__main__":
    main()
|
|
@ -1,139 +0,0 @@
|
||||||
from .common import *
|
|
||||||
from typing import ClassVar
|
|
||||||
from dataclasses import dataclass
|
|
||||||
|
|
||||||
import fitz
|
|
||||||
|
|
||||||
class HSYIG():
    """Parser for the YIG legislation book.

    Constructing an instance immediately runs the whole pipeline; the
    parsed bills are then available as a list of dicts on ``self.output``.
    """

    section_page_words = [ "Committee", "YMCA", "Tennessee", "Youth", "in" ]
    last_page_words = [ "ABCs" ]

    def __init__(self, document: "fitz.Document"):
        self.document = document
        self.__post_init__()

    def __post_init__(self):
        # run all the processing steps here
        self.parse_legislative_metablocks()

    def generate_section_markers(self) -> list[int]:
        """
        In the YIG/MUN manuals, there's section markers that delineate between the different
        committees within the manual. Let's find those, and then the last legislative page.

        A page counts as a section page when it contains every section word
        and carries exactly three images. A page containing the last-page
        words is recorded once more than two markers have been collected.
        """
        section_pages = []

        for page in self.document:
            text = page.get_text().encode("utf8")
            is_section_page = words_in_superstring(
                words = self.section_page_words,
                superstring = text
            )
            is_last_page = words_in_superstring(
                words = self.last_page_words,
                superstring = text
            )

            # (a leftover debug print of every page's text was removed here)

            if is_section_page and len(page.get_images()) == 3:
                section_pages.append(page.number)

            if is_last_page and len(section_pages) > 2:
                section_pages.append(page.number)

        return section_pages

    def get_legislative_pages(self):
        """
        Generate the section markers, then fill in the pages between them.

        Returns the page numbers lying strictly between each pair of
        consecutive markers; fewer than two markers gives an empty list.
        """
        sections = self.generate_section_markers()
        legislative_pages: list[int] = []

        # Pair each marker with its successor instead of indexing until
        # an IndexError ends the loop.
        for start, end in zip(sections, sections[1:]):
            legislative_pages.extend(range(start + 1, end))

        return legislative_pages

    def concat_blocks_for_leg_pages(self):
        """
        From the legislative pages, concatenate the "blocks" of text in the PDF.
        """
        blocks = []
        for page_num in self.get_legislative_pages():
            page = self.document.load_page(page_num)
            blocks.extend(
                FitzBlockWrapper(block) for block in page.get_text("blocks")
            )

        return blocks

    def split_leg_pages(self):
        """
        We have the collection of legislative page text blocks. We need
        to split them now. We split on the text "71st General Assembly...
        Youth in Government"
        """
        blocks = self.concat_blocks_for_leg_pages()
        # each item within splitted is called a "legislative meta-block"
        splitted = split_by_lambda(blocks, lambda x: "71st General Assembly" in x.text)

        return splitted[1:] # there's an empty array at the beginning

    def handle_the_rest(self, the_rest):
        """
        Split the body text of a meta-block into its title and bill text.

        Everything before the first U+FFFD character, minus its last line,
        is joined into the title. After each U+FFFD, the first line (without
        its leading character) becomes one line of the bill text.
        """
        weird_character = u'\uFFFd'
        splitted_by_weird = the_rest.split(weird_character)
        title_content = ''.join(
            splitted_by_weird[0].split('\n')[:-1]
        ).strip()

        bill_text = [i.split('\n')[0][1:] for i in splitted_by_weird[1:]]

        return {
            "bill_text": '\n'.join(bill_text),
            "title": title_content
        }

    def parse_legislative_metablocks(self):
        """
        Build one dict per legislative meta-block and store the list on
        ``self.output``.
        """
        output = []
        splitted = self.split_leg_pages()
        for legislative_text in splitted:
            # there are some blocks that contain just one value
            # and are aligned to some x value on the pdf

            # it's an easy way to extract stuff
            legislative_text = remove_block_by_x_value(legislative_text, 565) # remove page numbers
            leg_code = get_block_by_x_value(legislative_text, 88).text.rstrip()
            school = get_block_by_x_value(legislative_text, 163).text.rstrip()
            sponsors = get_block_by_x_value(legislative_text, 152).text.rstrip()
            subcommittee = get_block_by_x_value(legislative_text, 139).text.rstrip()
            the_rest = ''.join([i.text for i in legislative_text[6:]])
            handled = self.handle_the_rest(the_rest)
            title = handled["title"]
            bill_text = handled["bill_text"]

            output.append({
                "code": leg_code,
                "school": school,
                "sponsors": sponsors,
                "subcommittee": subcommittee,
                "title": title,
                "bill_text": bill_text
            })

        self.output = output
|
|
|
@ -1 +0,0 @@
|
||||||
|
|
|
@ -1,48 +0,0 @@
|
||||||
from typing import Any
|
|
||||||
class FitzBlockWrapper:
    """Attribute-style wrapper around one text-block tuple from PyMuPDF.

    ``block`` is unpacked positionally as
    ``(x0, y0, x1, y1, text, block_number, block_type)``; unpacking fails
    with ``ValueError`` if it does not have exactly seven items.
    """

    def __init__(self, block):
        (
            self.x0, self.y0, self.x1, self.y1,
            self.text,
            self.block_number, self.block_type,
        ) = block

        # int() truncates the coordinates so other helpers can match a
        # block's position with a plain ``==`` against an integer.
        self.x0 = int(self.x0)
        self.x1 = int(self.x1)
        self.y0 = int(self.y0)
        self.y1 = int(self.y1)
        self.block_number = int(self.block_number)
        self.block_type = int(self.block_type)

    def __str__(self):
        """Render the bounding box and text as a tuple string."""
        return str((
            self.x0, self.y0, self.x1, self.y1, self.text
        ))

    def __repr__(self):
        """Use the same rendering as ``__str__`` (e.g. inside lists)."""
        return self.__str__()

    # The method was originally misspelt ``__repl__``, which Python never
    # calls; the old name stays as an alias for any explicit callers.
    __repl__ = __repr__
|
|
||||||
|
|
||||||
def words_in_superstring(words: list[str], superstring: str) -> bool:
    """Return True when every word occurs in ``superstring``, ignoring case.

    ``superstring`` (and each word) may be ``str`` or UTF-8 encoded
    ``bytes``; any other object is converted with ``str()``. An empty
    ``words`` list yields True.
    """
    def _lowered(value) -> str:
        # str(b"...") would produce the "b'...'" repr with escape sequences,
        # so non-ASCII words could never match; decode bytes instead.
        if isinstance(value, (bytes, bytearray)):
            return bytes(value).decode("utf-8", errors="replace").lower()
        return str(value).lower()

    # Lower-case the haystack once instead of once per word.
    haystack = _lowered(superstring)
    return all(_lowered(word) in haystack for word in words)
|
|
||||||
|
|
||||||
def split_by_lambda(arr: list[Any], func):
    """Split ``arr`` into runs separated by the items for which ``func`` is true.

    Separator items are dropped. The result always holds one more run than
    there were separators, so a leading or trailing separator produces an
    empty run and an empty ``arr`` gives ``[[]]``.
    """
    runs = [[]]
    for element in arr:
        if not func(element):
            runs[-1].append(element)
            continue
        # separator: close the current run by opening a fresh one
        runs.append([])
    return runs
|
|
||||||
|
|
||||||
def get_block_by_x_value(arr: "list[FitzBlockWrapper]", xvalue: int) -> "FitzBlockWrapper":
    """Return the first block in ``arr`` whose ``x0`` equals ``xvalue``.

    When no block matches, the result is ``None``.
    """
    candidates = (block for block in arr if block.x0 == xvalue)
    return next(candidates, None)
|
|
||||||
|
|
||||||
def remove_block_by_x_value(arr: "list[FitzBlockWrapper]", xvalue: int) -> "list[FitzBlockWrapper]":
    """Return a new list holding the blocks of ``arr`` whose ``x0`` is not ``xvalue``.

    ``arr`` itself is left unmodified and the order of the kept blocks is
    preserved.
    """
    kept = []
    for block in arr:
        if block.x0 == xvalue:
            continue
        kept.append(block)
    return kept
|
|
|
@ -1,9 +0,0 @@
|
||||||
import fitz
|
|
||||||
|
|
||||||
from .HSYIG import HSYIG
|
|
||||||
from .HSMUN import HSMUN
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Manual smoke run: parse the hard-coded MUN book and dump the result.
    parsed_book = HSMUN(fitz.open("MUNB2023.pdf"))
    print(parsed_book.output)
|
|
|
@ -1,7 +1,7 @@
|
||||||
from django.db import models
|
from django.db import models
|
||||||
from django.utils.translation import gettext_lazy as _
|
from django.utils.translation import gettext_lazy as _
|
||||||
|
|
||||||
from .lib.parsers import HSYIG, HSMUN
|
from .leglib import HSYIG24, HSMUN23
|
||||||
import io
|
import io
|
||||||
import fitz
|
import fitz
|
||||||
|
|
||||||
|
@ -40,9 +40,9 @@ class LegislationBook(models.Model):
|
||||||
the_file = io.BytesIO(self.pdf.file.file.read())
|
the_file = io.BytesIO(self.pdf.file.file.read())
|
||||||
the_document = fitz.open(stream=the_file)
|
the_document = fitz.open(stream=the_file)
|
||||||
if self.import_strategy == "HSYIGBookParser":
|
if self.import_strategy == "HSYIGBookParser":
|
||||||
parsed = HSYIG(the_document)
|
parsed = HSYIG24(the_document)
|
||||||
elif self.import_strategy == "HSMUNBookParser":
|
elif self.import_strategy == "HSMUNBookParser":
|
||||||
parsed = HSMUN(the_document)
|
parsed = HSMUN23(the_document)
|
||||||
else:
|
else:
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
|
@ -1,3 +1,3 @@
|
||||||
django
|
django==4.2.12
|
||||||
pymupdf
|
pymupdf==1.23.26
|
||||||
gunicorn
|
gunicorn
|
Loading…
Reference in New Issue