web interface version 1
There's a crappy frontend, but the Django admin panel works as expected.
This commit is contained in:
parent
308b0f978f
commit
ca61adc1e6
|
@ -2,5 +2,11 @@ from django.contrib import admin
|
|||
|
||||
from .models import LegislativeText, LegislationBook
|
||||
|
||||
class LegislativeTextAdmin(admin.ModelAdmin):
    """Admin options for ``LegislativeText``."""

    # Changelist columns: the model's string form, then two of its attributes.
    list_display = ('__str__', 'legislation_title', 'school')


class LegislationBookAdmin(admin.ModelAdmin):
    """Admin options for ``LegislationBook``."""

    # Keep the internal import flag out of the admin form.
    exclude = ("has_performed_export",)


# Each model is registered exactly once, together with its ModelAdmin.
# Registering the same model a second time raises AlreadyRegistered.
admin.site.register(LegislativeText, LegislativeTextAdmin)
admin.site.register(LegislationBook, LegislationBookAdmin)
|
||||
|
|
|
@ -0,0 +1,158 @@
|
|||
from .common import *
|
||||
from typing import ClassVar
|
||||
from dataclasses import dataclass
|
||||
|
||||
import fitz
|
||||
|
||||
class HSMUN():
    """
    Parser for a high-school Model United Nations legislation book PDF.

    Instantiating the class runs the whole pipeline. The parsed bills are then
    available on ``self.output`` as a list of dicts with the keys ``code``,
    ``school``, ``sponsors``, ``subcommittee``, ``title`` and ``bill_text``.
    """

    # A page whose text contains every one of these words is a section marker.
    section_page_words = ["Committee", "Model", "United", "YMCA", "Tennessee", "Nations"]
    # A page containing these words closes the last section.
    last_page_words = ["ABCs"]
    # Blocks containing this text separate one piece of legislation from the next.
    split_marker = "43rd General Assembly"
    # Candidate x0 positions (tried in order) of the single-value header blocks.
    leg_code_x_values = (88,)
    school_x_values = (177, 186)
    sponsors_x_values = (163, 166)
    subcommittee_x_values = (151, 153)
    # Blocks from this index onward are joined into the title and bill text.
    body_start_index = 12
    # Stored when an optional header block cannot be found.
    missing_value = "you tell me, man"
    # Character that introduces each kept line of bill text (U+FFFD).
    clause_marker = '\uFFFD'

    # The annotation is quoted so it is not evaluated when the class is defined.
    def __init__(self, document: "fitz.Document"):
        """
        Store the opened PDF and parse it immediately.
        """
        self.document = document
        self.__post_init__()

    def __post_init__(self):
        """
        Run all the processing steps; the result ends up on ``self.output``.
        """
        self.parse_legislative_metablocks()

    def generate_section_markers(self) -> list[int]:
        """
        In the YIG/MUN manuals there are section markers that delineate the different
        committees within the manual. Find those, and then the last legislative page.

        Returns the page numbers in document order. A page matching the
        last-page words is only recorded once more than two markers exist.
        """
        section_pages = []

        for page in self.document:
            # Match against the text itself. Encoding it first made the helper
            # search the repr of a bytes object instead of the page text.
            text = page.get_text()

            if words_in_superstring(words=self.section_page_words, superstring=text):
                section_pages.append(page.number)

            if len(section_pages) > 2 and words_in_superstring(
                words=self.last_page_words,
                superstring=text,
            ):
                section_pages.append(page.number)

        return section_pages

    def get_legislative_pages(self):
        """
        Generate the section markers, then fill in the pages between them.

        The marker pages themselves are excluded. Fewer than two markers
        yields an empty list.
        """
        sections = self.generate_section_markers()
        legislative_pages: list[int] = []

        # Walk consecutive marker pairs and keep the pages strictly between them.
        for start, end in zip(sections, sections[1:]):
            legislative_pages.extend(range(start + 1, end))

        return legislative_pages

    def concat_blocks_for_leg_pages(self):
        """
        From the legislative pages, concatenate the "blocks" of text in the PDF.
        """
        blocks = []
        for page_num in self.get_legislative_pages():
            page = self.document.load_page(page_num)
            blocks.extend(FitzBlockWrapper(block) for block in page.get_text("blocks"))

        return blocks

    def split_leg_pages(self):
        """
        We have the collection of legislative page text blocks. We need
        to split them now. We split on blocks containing ``split_marker``
        ("43rd General Assembly" for this book).

        Each returned item is called a "legislative meta-block".
        """
        blocks = self.concat_blocks_for_leg_pages()
        splitted = split_by_lambda(blocks, lambda x: self.split_marker in x.text)

        # The first chunk is whatever precedes the first marker block; drop it.
        return splitted[1:]

    def handle_the_rest(self, the_rest):
        """
        Split the joined body text into a title and the bill text.

        Everything before the first clause marker is the title: its last line
        is dropped and the remaining lines are joined without a separator.
        After each marker only the first line is kept, minus its first
        character.
        """
        segments = the_rest.split(self.clause_marker)

        title_lines = segments[0].split('\n')[:-1]
        title_content = ''.join(title_lines).strip()

        bill_text = [segment.split('\n')[0][1:] for segment in segments[1:]]

        return {
            "bill_text": '\n'.join(bill_text),
            "title": title_content
        }

    def _block_text(self, blocks, x_values, default=None):
        """
        Return the right-stripped text of the first block found at one of `x_values`.

        The positions are tried in order. When none matches, `default` is
        returned. With no default an AttributeError is raised, the same
        exception type the unguarded lookup produced.
        """
        for x_value in x_values:
            block = get_block_by_x_value(blocks, x_value)
            if block is not None:
                return block.text.rstrip()

        if default is None:
            raise AttributeError(f"no text block found with x0 in {x_values}")
        return default

    def parse_legislative_metablocks(self):
        """
        Turn every legislative meta-block into a dict and store the list on ``self.output``.

        Some blocks contain just one value and are aligned to a fixed x value
        on the PDF, which is an easy way to extract them. The code is
        required; school, sponsors and subcommittee fall back to
        ``missing_value``.
        """
        output = []

        for legislative_text in self.split_leg_pages():
            leg_code = self._block_text(legislative_text, self.leg_code_x_values)
            school = self._block_text(
                legislative_text, self.school_x_values, self.missing_value
            )
            sponsors = self._block_text(
                legislative_text, self.sponsors_x_values, self.missing_value
            )
            subcommittee = self._block_text(
                legislative_text, self.subcommittee_x_values, self.missing_value
            )

            the_rest = ''.join(
                block.text for block in legislative_text[self.body_start_index:]
            )
            handled = self.handle_the_rest(the_rest)

            output.append({
                "code": leg_code,
                "school": school,
                "sponsors": sponsors,
                "subcommittee": subcommittee,
                "title": handled["title"],
                "bill_text": handled["bill_text"]
            })

        self.output = output
|
|
@ -0,0 +1,139 @@
|
|||
from .common import *
|
||||
from typing import ClassVar
|
||||
from dataclasses import dataclass
|
||||
|
||||
import fitz
|
||||
|
||||
class HSYIG():
    """
    Parser for a high-school Youth in Government legislation book PDF.

    Instantiating the class runs the whole pipeline. The parsed bills are then
    available on ``self.output`` as a list of dicts with the keys ``code``,
    ``school``, ``sponsors``, ``subcommittee``, ``title`` and ``bill_text``.
    """

    # A page whose text contains every one of these words (and which carries
    # `section_page_image_count` images) is a section marker.
    section_page_words = [ "Committee", "YMCA", "Tennessee", "Youth", "in" ]
    section_page_image_count = 3
    # A page containing these words closes the last section.
    last_page_words = [ "ABCs" ]
    # Blocks containing this text separate one piece of legislation from the next.
    split_marker = "71st General Assembly"
    # x0 positions of the single-value header blocks.
    page_number_x_value = 565
    leg_code_x_value = 88
    school_x_value = 163
    sponsors_x_value = 152
    subcommittee_x_value = 139
    # Blocks from this index onward are joined into the title and bill text.
    body_start_index = 6
    # Character that introduces each kept line of bill text (U+FFFD).
    clause_marker = '\uFFFD'

    # The annotation is quoted so it is not evaluated when the class is defined.
    def __init__(self, document: "fitz.Document"):
        """
        Store the opened PDF and parse it immediately.
        """
        self.document = document
        self.__post_init__()

    def __post_init__(self):
        """
        Run all the processing steps; the result ends up on ``self.output``.
        """
        self.parse_legislative_metablocks()

    def generate_section_markers(self) -> list[int]:
        """
        In the YIG/MUN manuals there are section markers that delineate the different
        committees within the manual. Find those, and then the last legislative page.

        Returns the page numbers in document order. A page matching the
        last-page words is only recorded once more than two markers exist.
        """
        section_pages = []

        for page in self.document:
            # Match against the text itself. Encoding it first made the helper
            # search the repr of a bytes object instead of the page text.
            text = page.get_text()

            is_section_page = words_in_superstring(
                words=self.section_page_words,
                superstring=text,
            )
            if is_section_page and len(page.get_images()) == self.section_page_image_count:
                section_pages.append(page.number)

            if len(section_pages) > 2 and words_in_superstring(
                words=self.last_page_words,
                superstring=text,
            ):
                section_pages.append(page.number)

        return section_pages

    def get_legislative_pages(self):
        """
        Generate the section markers, then fill in the pages between them.

        The marker pages themselves are excluded. Fewer than two markers
        yields an empty list.
        """
        sections = self.generate_section_markers()
        legislative_pages: list[int] = []

        # Walk consecutive marker pairs and keep the pages strictly between them.
        for start, end in zip(sections, sections[1:]):
            legislative_pages.extend(range(start + 1, end))

        return legislative_pages

    def concat_blocks_for_leg_pages(self):
        """
        From the legislative pages, concatenate the "blocks" of text in the PDF.
        """
        blocks = []
        for page_num in self.get_legislative_pages():
            page = self.document.load_page(page_num)
            blocks.extend(FitzBlockWrapper(block) for block in page.get_text("blocks"))

        return blocks

    def split_leg_pages(self):
        """
        We have the collection of legislative page text blocks. We need
        to split them now. We split on blocks containing ``split_marker``
        ("71st General Assembly... Youth in Government").

        Each returned item is called a "legislative meta-block".
        """
        blocks = self.concat_blocks_for_leg_pages()
        splitted = split_by_lambda(blocks, lambda x: self.split_marker in x.text)

        # The first chunk is whatever precedes the first marker block; drop it.
        return splitted[1:]

    def handle_the_rest(self, the_rest):
        """
        Split the joined body text into a title and the bill text.

        Everything before the first clause marker is the title: its last line
        is dropped and the remaining lines are joined without a separator.
        After each marker only the first line is kept, minus its first
        character.
        """
        segments = the_rest.split(self.clause_marker)

        title_lines = segments[0].split('\n')[:-1]
        title_content = ''.join(title_lines).strip()

        bill_text = [segment.split('\n')[0][1:] for segment in segments[1:]]

        return {
            "bill_text": '\n'.join(bill_text),
            "title": title_content
        }

    def _block_text(self, blocks, x_value):
        """
        Return the right-stripped text of the block whose x0 equals `x_value`.

        Raises AttributeError (the type the unguarded lookup produced) with a
        message naming the missing position when no such block exists.
        """
        block = get_block_by_x_value(blocks, x_value)
        if block is None:
            raise AttributeError(f"no text block found with x0 == {x_value}")
        return block.text.rstrip()

    def parse_legislative_metablocks(self):
        """
        Turn every legislative meta-block into a dict and store the list on ``self.output``.

        Some blocks contain just one value and are aligned to a fixed x value
        on the PDF, which is an easy way to extract them.
        """
        output = []

        for legislative_text in self.split_leg_pages():
            # remove page numbers
            legislative_text = remove_block_by_x_value(
                legislative_text, self.page_number_x_value
            )

            leg_code = self._block_text(legislative_text, self.leg_code_x_value)
            school = self._block_text(legislative_text, self.school_x_value)
            sponsors = self._block_text(legislative_text, self.sponsors_x_value)
            subcommittee = self._block_text(legislative_text, self.subcommittee_x_value)

            the_rest = ''.join(
                block.text for block in legislative_text[self.body_start_index:]
            )
            handled = self.handle_the_rest(the_rest)

            output.append({
                "code": leg_code,
                "school": school,
                "sponsors": sponsors,
                "subcommittee": subcommittee,
                "title": handled["title"],
                "bill_text": handled["bill_text"]
            })

        self.output = output
|
|
@ -0,0 +1 @@
|
|||
|
|
@ -0,0 +1,48 @@
|
|||
from typing import Any
|
||||
class FitzBlockWrapper:
    """
    Attribute-style view of one 7-item text block tuple.

    The tuple is unpacked as
    ``(x0, y0, x1, y1, text, block_number, block_type)``. Every numeric
    field is truncated to ``int`` so blocks can be matched on exact
    coordinates.
    """

    def __init__(self, block):
        x0, y0, x1, y1, text, block_number, block_type = block

        self.x0 = int(x0)
        self.y0 = int(y0)
        self.x1 = int(x1)
        self.y1 = int(y1)
        self.text = text
        self.block_number = int(block_number)
        self.block_type = int(block_type)

    def __str__(self):
        return str((
            self.x0, self.y0, self.x1, self.y1, self.text
        ))

    def __repr__(self):
        # Same rendering as str(), so blocks stay readable inside lists.
        return self.__str__()

    # The method was originally spelled ``__repl__`` and so was never used by
    # repr(). The old name is kept as an alias in case anything calls it.
    __repl__ = __repr__
|
||||
|
||||
def words_in_superstring(words: list[str], superstring: str) -> bool:
    """
    Return True when every word occurs in `superstring`, ignoring case.

    `superstring` may also be ``bytes``/``bytearray``. It is then decoded as
    UTF-8 (undecodable bytes are replaced) rather than passed through
    ``str()``, which would search the ``b'...'`` repr instead of the text.
    An empty `words` list yields True.
    """
    if isinstance(superstring, (bytes, bytearray)):
        haystack = bytes(superstring).decode("utf8", errors="replace")
    else:
        haystack = str(superstring)

    # Lower-case the haystack once instead of once per word.
    haystack = haystack.lower()
    return all(str(word).lower() in haystack for word in words)
|
||||
|
||||
def split_by_lambda(arr: list[Any], func):
    """
    Split `arr` into chunks at every element for which `func` is truthy.

    The matching elements act as delimiters and are dropped. The result
    always holds one more chunk than there were delimiters, so empty chunks
    appear for leading, trailing or adjacent delimiters.
    """
    chunks = [[]]
    for element in arr:
        if func(element):
            # Delimiter: close the running chunk by opening a fresh one.
            chunks.append([])
            continue
        chunks[-1].append(element)

    return chunks
|
||||
|
||||
def get_block_by_x_value(arr: list[FitzBlockWrapper], xvalue: int) -> "FitzBlockWrapper | None":
    """
    Return the first block in `arr` whose ``x0`` equals `xvalue`.

    Returns None when no block matches. The return annotation is a string so
    the optional result can be declared without a new import.
    """
    for item in arr:
        if item.x0 == xvalue:
            return item

    # Made explicit: callers rely on getting None for "not found".
    return None
|
||||
|
||||
def remove_block_by_x_value(arr: list[FitzBlockWrapper], xvalue: int) -> list[FitzBlockWrapper]:
    """
    Return a new list holding the blocks of `arr` whose ``x0`` differs from `xvalue`.

    The input list is left untouched and the order of the kept blocks is preserved.
    """
    kept = []
    for block in arr:
        if block.x0 != xvalue:
            kept.append(block)
    return kept
|
|
@ -0,0 +1,9 @@
|
|||
import fitz
|
||||
|
||||
from .HSYIG import HSYIG
|
||||
from .HSMUN import HSMUN
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke test: parse a local MUN book and dump the result.
    # The imports above are relative, so this has to be run as a module of
    # its package (python -m ...), not as a standalone script.
    # Parsing happens inside the constructor, so the document can be closed
    # as soon as the parser object exists.
    with fitz.open("MUNB2023.pdf") as d:
        res = HSMUN(d)

    print(res.output)
|
|
@ -0,0 +1,23 @@
|
|||
# Generated by Django 4.2.12 on 2024-06-19 17:36
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    # Schema migration for the ``explorer`` app: adds the
    # ``has_performed_export`` flag to ``legislationbook`` and redefines its
    # ``import_strategy`` column with a fixed set of choices and a default.

    # Applied after explorer migration 0002.
    dependencies = [
        ('explorer', '0002_legislativetext_legislation_title'),
    ]

    operations = [
        # New boolean column; existing rows receive False.
        migrations.AddField(
            model_name='legislationbook',
            name='has_performed_export',
            field=models.BooleanField(default=False),
        ),
        # ``import_strategy`` now carries the two parser identifiers as
        # choices, defaulting to the YIG parser.
        migrations.AlterField(
            model_name='legislationbook',
            name='import_strategy',
            field=models.CharField(choices=[('HSYIGBookParser', 'High School YIG Book Parser 1'), ('HSMUNBookParser', 'High School MUN Book Parser 1')], default='HSYIGBookParser', max_length=128),
        ),
    ]
|
|
@ -1,11 +1,19 @@
|
|||
from django.db import models
|
||||
from django.utils.translation import gettext_lazy as _
|
||||
|
||||
from .lib.parsers import HSYIG, HSMUN
|
||||
import io
|
||||
import fitz
|
||||
|
||||
class LegislationBook(models.Model):
|
||||
class ConferenceType(models.TextChoices):
|
||||
MIDDLE = "M", _("Middle School")
|
||||
HIGH = "H", _("High School")
|
||||
|
||||
class ImportStrategy(models.TextChoices):
|
||||
HSYIGA = "HSYIGBookParser", _("High School YIG Book Parser 1")
|
||||
HSMUNA = "HSMUNBookParser", _("High School MUN Book Parser 1")
|
||||
|
||||
conference_type = models.CharField(
|
||||
max_length=1,
|
||||
choices=ConferenceType.choices,
|
||||
|
@ -13,7 +21,51 @@ class LegislationBook(models.Model):
|
|||
)
|
||||
pdf = models.FileField(upload_to="uploads/")
|
||||
name = models.CharField(max_length=256)
|
||||
import_strategy = models.CharField(max_length=128)
|
||||
import_strategy = models.CharField(
|
||||
max_length=128,
|
||||
choices=ImportStrategy.choices,
|
||||
default=ImportStrategy.HSYIGA
|
||||
)
|
||||
has_performed_export = models.BooleanField(default=False)
|
||||
|
||||
def save(self, *args, **kwargs):
    """
    Save the book and, the first time only, import its legislation from the PDF.

    After the normal save, a book whose ``has_performed_export`` flag is
    still False gets the flag set and persisted. Its PDF is then parsed with
    the parser selected by ``import_strategy``, and one ``LegislativeText``
    row is created per parsed entry. An unrecognised strategy imports
    nothing.

    Entry codes are expected to look like ``<assembly>/<yy>-<committee>-<order>``.
    A code that does not fit raises IndexError or ValueError.
    """
    super().save(*args, **kwargs)

    if self.has_performed_export:
        return

    # Persist only the flag. Repeating the caller's arguments (for example
    # force_insert=True) on a second full save would attempt a second INSERT.
    # NOTE(review): the flag is stored before parsing, so a parse failure
    # leaves the book marked as imported. Confirm that this is intended.
    self.has_performed_export = True
    super().save(update_fields=["has_performed_export"], using=kwargs.get("using"))

    parsers = {
        "HSYIGBookParser": HSYIG,
        "HSMUNBookParser": HSMUN,
    }
    parser = parsers.get(self.import_strategy)
    if parser is None:
        return

    # Read the stored upload and close the handle again.
    with self.pdf.open("rb") as pdf_file:
        pdf_bytes = pdf_file.read()

    # The parser does all of its work in the constructor and leaves plain
    # dicts on ``output``, so the document can be closed right afterwards.
    with fitz.open(stream=io.BytesIO(pdf_bytes)) as the_document:
        parsed = parser(the_document)

    for entry in parsed.output:
        codesplit = entry["code"].split('/')
        assembly = codesplit[0]
        dashsplit = codesplit[1].split('-')
        year = 2000 + int(dashsplit[0])  # two-digit year in the code
        committee = int(dashsplit[1])
        docket_order = int(dashsplit[2])

        legislative_text = LegislativeText(
            assembly=assembly,
            year=year,
            committee=committee,
            docket_order=docket_order,
            school=entry["school"],
            sponsors=entry["sponsors"],
            legislation_title=entry["title"],
            text=entry["bill_text"],
            from_book=self
        )
        legislative_text.save()
|
||||
|
||||
def __str__(self):
    """Display the book by its name."""
    return f"{self.name}"
|
||||
|
@ -48,8 +100,9 @@ class LegislativeText(models.Model):
|
|||
legislation_title = models.CharField(max_length=512)
|
||||
|
||||
def __str__(self):
    """
    Render the text as ``<assembly>/<year>-<committee>-<docket_order>``.
    """
    # Only the four-field form is kept. The span also contained the opener
    # of a three-field format call that was never closed.
    return "{}/{}-{}-{}".format(
        self.assembly,
        str(self.year),
        self.committee,
        self.docket_order,
    )
|
||||
|
|
|
@ -1,10 +0,0 @@
|
|||
<form action="{% url 'import_books' %}" method="post" enctype="multipart/form-data">
|
||||
{% csrf_token %}
|
||||
<input type="text" name="bookname">
|
||||
<input type="file" name="bookpdf">
|
||||
<input type="submit" value="Import PDF">
|
||||
</form>
|
||||
|
||||
{% if just_imported %}
|
||||
thanks for the import!
|
||||
{% endif %}
|
|
@ -5,5 +5,4 @@ from . import views
|
|||
urlpatterns = [
|
||||
path("", views.index, name="index"),
|
||||
path("legislation/<int:legislation_id>/", views.view_legislation, name="viewleg"),
|
||||
path("import/", views.import_books, name="import_books"),
|
||||
]
|
||||
|
|
|
@ -4,24 +4,12 @@ from django.http import HttpResponse
|
|||
from .models import LegislativeText, LegislationBook
|
||||
|
||||
def index(request):
    """
    Render the index page with every stored legislative text.

    The template receives the queryset as ``legislative_texts``.
    """
    # The earlier five-row query was dropped: it was overwritten by this one
    # before ever being used.
    legislative_texts = LegislativeText.objects.all()
    context = {
        "legislative_texts": legislative_texts,
    }
    return render(request, "explorer/index.html", context)
|
||||
|
||||
def import_books(request):
    """
    Show the book import form (GET) or store an uploaded book (POST).

    On POST a ``LegislationBook`` is created from the ``bookpdf`` upload and
    the ``bookname`` field, then the form is rendered again with
    ``just_imported`` set. Any other method gets a 405 response.
    """
    # Imported locally because the module-level import is only known to
    # bring in HttpResponse.
    from django.http import HttpResponseNotAllowed

    if request.method == "GET":
        return render(request, "explorer/import.html", {})

    if request.method != "POST":
        # Falling through here used to return None, which is not a response.
        return HttpResponseNotAllowed(["GET", "POST"])

    book = LegislationBook(
        pdf=request.FILES["bookpdf"],
        name=request.POST.get("bookname"),
        conference_type="H",
    )
    book.save()

    # The context key must be a string; the bare name raised NameError.
    return render(request, "explorer/import.html", {"just_imported": True})
|
||||
|
||||
def view_legislation(request, legislation_id):
|
||||
legislation = get_object_or_404(LegislativeText, pk=legislation_id)
|
||||
context = {
|
||||
|
|
Loading…
Reference in New Issue