From ca61adc1e69f6e00e8a9326f2ee88f9b03dc875b Mon Sep 17 00:00:00 2001
From: stupidcomputer
Date: Wed, 19 Jun 2024 12:41:41 -0500
Subject: [PATCH] web interface version 1

There's a crappy frontend, but the Django admin panel works as expected.
---
 franklincce/explorer/admin.py                 |  10 +-
 franklincce/explorer/lib/HSMUN.py             | 158 ++++++++++++++++++
 franklincce/explorer/lib/HSYIG.py             | 139 +++++++++++++++
 franklincce/explorer/lib/__init__.py          |   1 +
 franklincce/explorer/lib/common.py            |  48 ++++++
 franklincce/explorer/lib/parsers.py           |   9 +
 ...ationbook_has_performed_export_and_more.py |  23 +++
 franklincce/explorer/models.py                |  57 ++++++-
 .../explorer/templates/explorer/import.html   |  10 --
 franklincce/explorer/urls.py                  |   1 -
 franklincce/explorer/views.py                 |  14 +-
 shell.nix                                     |   2 +-
 12 files changed, 443 insertions(+), 29 deletions(-)
 create mode 100644 franklincce/explorer/lib/HSMUN.py
 create mode 100644 franklincce/explorer/lib/HSYIG.py
 create mode 100644 franklincce/explorer/lib/__init__.py
 create mode 100644 franklincce/explorer/lib/common.py
 create mode 100644 franklincce/explorer/lib/parsers.py
 create mode 100644 franklincce/explorer/migrations/0003_legislationbook_has_performed_export_and_more.py
 delete mode 100644 franklincce/explorer/templates/explorer/import.html

diff --git a/franklincce/explorer/admin.py b/franklincce/explorer/admin.py
index 2462c9e..3e404c0 100644
--- a/franklincce/explorer/admin.py
+++ b/franklincce/explorer/admin.py
@@ -2,5 +2,11 @@ from django.contrib import admin
 
 from .models import LegislativeText, LegislationBook
 
-admin.site.register(LegislativeText)
-admin.site.register(LegislationBook)
+class LegislativeTextAdmin(admin.ModelAdmin):
+    list_display = ('__str__', 'legislation_title', 'school')
+
+class LegislationBookAdmin(admin.ModelAdmin):
+    exclude = ("has_performed_export",)
+
+admin.site.register(LegislativeText, LegislativeTextAdmin)
+admin.site.register(LegislationBook, LegislationBookAdmin)
diff --git a/franklincce/explorer/lib/HSMUN.py
b/franklincce/explorer/lib/HSMUN.py
new file mode 100644
index 0000000..275bdb6
--- /dev/null
+++ b/franklincce/explorer/lib/HSMUN.py
@@ -0,0 +1,158 @@
+from .common import *
+from typing import ClassVar
+from dataclasses import dataclass
+
+import fitz
+
+class HSMUN():
+    section_page_words = ["Committee", "Model", "United", "YMCA", "Tennessee", "Nations"]
+    last_page_words = ["ABCs"]
+
+    def __init__(self, document: fitz.Document):
+        self.document = document
+        self.__post_init__()
+
+    def __post_init__(self):
+        # run all the processing steps here
+        self.parse_legislative_metablocks()
+
+    def generate_section_markers(self) -> list[int]:
+        """
+        In the YIG/MUN manuals, there's section markers that delineate between the different
+        committees within the manual. Let's find those, and then the last legislative page.
+        """
+        section_pages = []
+
+        for page in self.document:
+            text = page.get_text()
+            is_section_page = words_in_superstring(
+                words = self.section_page_words,
+                superstring = text
+            )
+            is_last_page = words_in_superstring(
+                words = self.last_page_words,
+                superstring = text
+            )
+
+            if is_section_page:
+                section_pages.append(page.number)
+
+            if is_last_page and len(section_pages) > 2:
+                section_pages.append(page.number)
+
+        return section_pages
+
+    def get_legislative_pages(self):
+        """
+        Generate the section markers, then fill in the pages between them.
+        """
+
+        current = 0
+        sections = self.generate_section_markers()
+        legislative_pages: list[int] = []
+        try:
+            while True:
+                legislative_pages += list(
+                    range(
+                        sections[current] + 1,
+                        sections[current + 1],
+                        1
+                    )
+                )
+
+                current += 1
+
+        except IndexError:
+            pass
+
+
+        return legislative_pages
+
+    def concat_blocks_for_leg_pages(self):
+        """
+        From the legislative pages, concatenate the "blocks" of text in the PDF.
+        """
+        blocks = []
+        pages = [self.document.load_page(page_num) for page_num in self.get_legislative_pages()]
+        for page in pages:
+            block_info = [FitzBlockWrapper(block) for block in page.get_text("blocks")]
+
+            blocks += block_info
+
+        return blocks
+
+    def split_leg_pages(self):
+        """
+        We have the collection of legislative page text blocks. We need
+        to split them now. We split on blocks whose text contains
+        "43rd General Assembly".
+        """
+
+        blocks = self.concat_blocks_for_leg_pages()
+        # each item within splitted is called a "legislative meta-block"
+        splitted = split_by_lambda(blocks, lambda x: "43rd General Assembly" in x.text)
+
+        return splitted[1:] # there's an empty array at the beginning
+
+    def handle_the_rest(self, the_rest):
+        weird_character = u'\uFFFd'
+        splitted_by_weird = the_rest.split(weird_character)
+        title_content = ''.join(
+            splitted_by_weird[0].split('\n')[:-1]
+        ).rstrip().lstrip()
+
+        bill_text = [i.split('\n')[0][1:] for i in splitted_by_weird[1:]]
+
+        return {
+            "bill_text": '\n'.join(bill_text),
+            "title": title_content
+        }
+
+    def parse_legislative_metablocks(self):
+        output = []
+        splitted = self.split_leg_pages()
+        for legislative_text in splitted:
+            # there are some blocks that contain just one value
+            # and are aligned to some x value on the pdf
+
+            # it's an easy way to extract stuff
+            leg_code = get_block_by_x_value(legislative_text, 88).text.rstrip()
+
+            try:
+                school = get_block_by_x_value(legislative_text, 177).text.rstrip()
+            except AttributeError:
+                try:
+                    school = get_block_by_x_value(legislative_text, 186).text.rstrip()
+                except AttributeError:
+                    school = "you tell me, man"
+
+            try:
+                sponsors = get_block_by_x_value(legislative_text, 163).text.rstrip()
+            except AttributeError:
+                try:
+                    sponsors = get_block_by_x_value(legislative_text, 166).text.rstrip()
+                except AttributeError:
+                    sponsors = "you tell me, man"
+            try:
+                subcommittee = get_block_by_x_value(legislative_text, 151).text.rstrip()
+            except AttributeError:
+                try:
+                    subcommittee = get_block_by_x_value(legislative_text, 153).text.rstrip()
+                except AttributeError:
+                    subcommittee = "you tell me, man"
+            trailing_text = [i.text for i in legislative_text[12:]]
+            the_rest = ''.join(trailing_text)
+            handled = self.handle_the_rest(the_rest)
+            title = handled["title"]
+            bill_text = handled["bill_text"]
+
+            output.append({
+                "code": leg_code,
+                "school": school,
+                "sponsors": sponsors,
+                "subcommittee": subcommittee,
+                "title": title,
+                "bill_text": bill_text
+            })
+
+        self.output = output
diff --git a/franklincce/explorer/lib/HSYIG.py b/franklincce/explorer/lib/HSYIG.py
new file mode 100644
index 0000000..0826aa5
--- /dev/null
+++ b/franklincce/explorer/lib/HSYIG.py
@@ -0,0 +1,139 @@
+from .common import *
+from typing import ClassVar
+from dataclasses import dataclass
+
+import fitz
+
+class HSYIG():
+    section_page_words = [ "Committee", "YMCA", "Tennessee", "Youth", "in" ]
+    last_page_words = [ "ABCs" ]
+
+    def __init__(self, document: fitz.Document):
+        self.document = document
+        self.__post_init__()
+
+    def __post_init__(self):
+        # run all the processing steps here
+        self.parse_legislative_metablocks()
+
+    def generate_section_markers(self) -> list[int]:
+        """
+        In the YIG/MUN manuals, there's section markers that delineate between the different
+        committees within the manual. Let's find those, and then the last legislative page.
+        """
+        section_pages = []
+
+        for page in self.document:
+            text = page.get_text()
+            is_section_page = words_in_superstring(
+                words = self.section_page_words,
+                superstring = text
+            )
+            is_last_page = words_in_superstring(
+                words = self.last_page_words,
+                superstring = text
+            )
+
+            # a page only counts as a section marker when, besides matching
+            # the section words, it carries exactly three images
+            if is_section_page and len(page.get_images()) == 3:
+                section_pages.append(page.number)
+
+            if is_last_page and len(section_pages) > 2:
+                section_pages.append(page.number)
+
+        return section_pages
+
+    def get_legislative_pages(self):
+        """
+        Generate the section markers, then fill in the pages between them.
+        """
+
+        current = 0
+        sections = self.generate_section_markers()
+        legislative_pages: list[int] = []
+        try:
+            while True:
+                legislative_pages += list(
+                    range(
+                        sections[current] + 1,
+                        sections[current + 1],
+                        1
+                    )
+                )
+
+                current += 1
+
+        except IndexError:
+            pass
+
+        return legislative_pages
+
+    def concat_blocks_for_leg_pages(self):
+        """
+        From the legislative pages, concatenate the "blocks" of text in the PDF.
+        """
+        blocks = []
+        pages = [self.document.load_page(page_num) for page_num in self.get_legislative_pages()]
+        for page in pages:
+            block_info = [FitzBlockWrapper(block) for block in page.get_text("blocks")]
+
+            blocks += block_info
+
+        return blocks
+
+    def split_leg_pages(self):
+        """
+        We have the collection of legislative page text blocks. We need
+        to split them now. We split on the text "71st General Assembly...
+        Youth in Government"
+        """
+
+        blocks = self.concat_blocks_for_leg_pages()
+        # each item within splitted is called a "legislative meta-block"
+        splitted = split_by_lambda(blocks, lambda x: "71st General Assembly" in x.text)
+
+        return splitted[1:] # there's an empty array at the beginning
+
+    def handle_the_rest(self, the_rest):
+        weird_character = u'\uFFFd'
+        splitted_by_weird = the_rest.split(weird_character)
+        title_content = ''.join(
+            splitted_by_weird[0].split('\n')[:-1]
+        ).rstrip().lstrip()
+
+        bill_text = [i.split('\n')[0][1:] for i in splitted_by_weird[1:]]
+
+        return {
+            "bill_text": '\n'.join(bill_text),
+            "title": title_content
+        }
+
+    def parse_legislative_metablocks(self):
+        output = []
+        splitted = self.split_leg_pages()
+        for legislative_text in splitted:
+            # there are some blocks that contain just one value
+            # and are aligned to some x value on the pdf
+
+            # it's an easy way to extract stuff
+            legislative_text = remove_block_by_x_value(legislative_text, 565) # remove page numbers
+            leg_code = get_block_by_x_value(legislative_text, 88).text.rstrip()
+            school = get_block_by_x_value(legislative_text, 163).text.rstrip()
+            sponsors = get_block_by_x_value(legislative_text, 152).text.rstrip()
+            subcommittee = get_block_by_x_value(legislative_text, 139).text.rstrip()
+            the_rest = ''.join([i.text for i in legislative_text[6:]])
+            handled = self.handle_the_rest(the_rest)
+            title = handled["title"]
+            bill_text = handled["bill_text"]
+
+            output.append({
+                "code": leg_code,
+                "school": school,
+                "sponsors": sponsors,
+                "subcommittee": subcommittee,
+                "title": title,
+                "bill_text": bill_text
+            })
+
+        self.output = output
diff --git a/franklincce/explorer/lib/__init__.py b/franklincce/explorer/lib/__init__.py
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/franklincce/explorer/lib/__init__.py
@@ -0,0 +1 @@
+
diff --git a/franklincce/explorer/lib/common.py b/franklincce/explorer/lib/common.py
new file mode 100644
index 0000000..4f824ce
---
/dev/null
+++ b/franklincce/explorer/lib/common.py
@@ -0,0 +1,48 @@
+from typing import Any
+class FitzBlockWrapper:
+    def __init__(self, block):
+        self.x0, self.y0, self.x1, \
+            self.y1, self.text, \
+            self.block_number, self.block_type = block
+
+        self.x0 = int(self.x0)
+        self.x1 = int(self.x1)
+        self.y0 = int(self.y0)
+        self.y1 = int(self.y1)
+        self.block_number = int(self.block_number)
+        self.block_type = int(self.block_type)
+
+    def __str__(self):
+        return str((
+            self.x0, self.y0, self.x1, self.y1, self.text
+        ))
+
+    def __repr__(self):
+        return self.__str__()
+
+def words_in_superstring(words: list[str], superstring: str) -> bool:
+    for word in words:
+        if str(word).lower() not in str(superstring).lower():
+            return False
+    return True
+
+def split_by_lambda(arr: list[Any], func):
+    output = []
+    current = []
+    for item in arr:
+        if func(item):
+            output.append(current)
+            current = []
+        else:
+            current.append(item)
+
+    output.append(current)
+    return output
+
+def get_block_by_x_value(arr: list[FitzBlockWrapper], xvalue: int) -> FitzBlockWrapper | None:
+    for item in arr:
+        if item.x0 == xvalue:
+            return item
+
+def remove_block_by_x_value(arr: list[FitzBlockWrapper], xvalue: int) -> list[FitzBlockWrapper]:
+    return [i for i in arr if i.x0 != xvalue]
diff --git a/franklincce/explorer/lib/parsers.py b/franklincce/explorer/lib/parsers.py
new file mode 100644
index 0000000..16fc58e
--- /dev/null
+++ b/franklincce/explorer/lib/parsers.py
@@ -0,0 +1,9 @@
+import fitz
+
+from .HSYIG import HSYIG
+from .HSMUN import HSMUN
+
+if __name__ == "__main__":
+    d = fitz.open("MUNB2023.pdf")
+    res = HSMUN(d)
+    print(res.output)
diff --git a/franklincce/explorer/migrations/0003_legislationbook_has_performed_export_and_more.py b/franklincce/explorer/migrations/0003_legislationbook_has_performed_export_and_more.py
new file mode 100644
index 0000000..c637710
--- /dev/null
+++ b/franklincce/explorer/migrations/0003_legislationbook_has_performed_export_and_more.py
@@ -0,0 +1,23 @@
+#
Generated by Django 4.2.12 on 2024-06-19 17:36
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('explorer', '0002_legislativetext_legislation_title'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='legislationbook',
+            name='has_performed_export',
+            field=models.BooleanField(default=False),
+        ),
+        migrations.AlterField(
+            model_name='legislationbook',
+            name='import_strategy',
+            field=models.CharField(choices=[('HSYIGBookParser', 'High School YIG Book Parser 1'), ('HSMUNBookParser', 'High School MUN Book Parser 1')], default='HSYIGBookParser', max_length=128),
+        ),
+    ]
diff --git a/franklincce/explorer/models.py b/franklincce/explorer/models.py
index e8e30c5..37a110a 100644
--- a/franklincce/explorer/models.py
+++ b/franklincce/explorer/models.py
@@ -1,11 +1,19 @@
 from django.db import models
 from django.utils.translation import gettext_lazy as _
 
+from .lib.parsers import HSYIG, HSMUN
+import io
+import fitz
+
 class LegislationBook(models.Model):
     class ConferenceType(models.TextChoices):
         MIDDLE = "M", _("Middle School")
         HIGH = "H", _("High School")
 
+    class ImportStrategy(models.TextChoices):
+        HSYIGA = "HSYIGBookParser", _("High School YIG Book Parser 1")
+        HSMUNA = "HSMUNBookParser", _("High School MUN Book Parser 1")
+
     conference_type = models.CharField(
         max_length=1,
         choices=ConferenceType.choices,
@@ -13,7 +21,51 @@ class LegislationBook(models.Model):
     )
     pdf = models.FileField(upload_to="uploads/")
     name = models.CharField(max_length=256)
-    import_strategy = models.CharField(max_length=128)
+    import_strategy = models.CharField(
+        max_length=128,
+        choices=ImportStrategy.choices,
+        default=ImportStrategy.HSYIGA
+    )
+    has_performed_export = models.BooleanField(default=False)
+
+    def save(self, *args, **kwargs):
+        """Save the book; on its first save, also import its legislation."""
+        super().save(*args, **kwargs)
+        if self.has_performed_export:
+            return
+
+        # UPDATE only the flag: replaying kwargs would repeat force_insert=True.
+        self.has_performed_export = True
+        super().save(using=kwargs.get("using"), update_fields=["has_performed_export"])
+
+        the_file = io.BytesIO(self.pdf.file.file.read())
+        the_document = fitz.open(stream=the_file)
+        if self.import_strategy == "HSYIGBookParser":
+            parsed = HSYIG(the_document)
+        elif self.import_strategy == "HSMUNBookParser":
+            parsed = HSMUN(the_document)
+        else:
+            return
+
+        for parsed_text in parsed.output:
+            # "code" is parsed as "<assembly>/<yy>-<committee>-<docket order>"
+            codesplit = parsed_text["code"].split('/')
+            assembly = codesplit[0]
+            dashsplit = codesplit[1].split('-')
+            year = 2000 + int(dashsplit[0])
+            committee = int(dashsplit[1])
+            docket_order = int(dashsplit[2])
+            LegislativeText.objects.create(
+                assembly=assembly,
+                year=year,
+                committee=committee,
+                docket_order=docket_order,
+                school=parsed_text["school"],
+                sponsors=parsed_text["sponsors"],
+                legislation_title=parsed_text["title"],
+                text=parsed_text["bill_text"],
+                from_book=self
+            )
 
     def __str__(self):
         return "{}".format(self.name)
@@ -48,8 +100,9 @@ class LegislativeText(models.Model):
     legislation_title = models.CharField(max_length=512)
 
     def __str__(self):
-        return "{}/{}-{}".format(
+        return "{}/{}-{}-{}".format(
             self.assembly,
+            str(self.year),
             self.committee,
             self.docket_order,
         )
diff --git a/franklincce/explorer/templates/explorer/import.html b/franklincce/explorer/templates/explorer/import.html
deleted file mode 100644
index 8d32763..0000000
--- a/franklincce/explorer/templates/explorer/import.html
+++ /dev/null
@@ -1,10 +0,0 @@
-
-{% csrf_token %} - - - -
- -{% if just_imported %} -thanks for the import! -{% endif %} diff --git a/franklincce/explorer/urls.py b/franklincce/explorer/urls.py index 3487de4..9f2b6b8 100644 --- a/franklincce/explorer/urls.py +++ b/franklincce/explorer/urls.py @@ -5,5 +5,4 @@ from . import views urlpatterns = [ path("", views.index, name="index"), path("legislation//", views.view_legislation, name="viewleg"), - path("import/", views.import_books, name="import_books"), ] diff --git a/franklincce/explorer/views.py b/franklincce/explorer/views.py index 07694b1..04bd7f5 100644 --- a/franklincce/explorer/views.py +++ b/franklincce/explorer/views.py @@ -4,24 +4,12 @@ from django.http import HttpResponse from .models import LegislativeText, LegislationBook def index(request): - legislative_texts = LegislativeText.objects.all()[:5] + legislative_texts = LegislativeText.objects.all() context = { "legislative_texts": legislative_texts, } return render(request, "explorer/index.html", context) -def import_books(request): - if request.method == "GET": - return render(request, "explorer/import.html", {}) - elif request.method == "POST": - book = LegislationBook( - pdf=request.FILES["bookpdf"], - name=request.POST.get("bookname"), - conference_type="H", - ) - book.save() - return render(request, "explorer/import.html", {just_imported: True}) - def view_legislation(request, legislation_id): legislation = get_object_or_404(LegislativeText, pk=legislation_id) context = { diff --git a/shell.nix b/shell.nix index 69bdb54..56a0930 100644 --- a/shell.nix +++ b/shell.nix @@ -1,4 +1,4 @@ { pkgs ? import {} }: pkgs.mkShell { - nativeBuildInputs = with pkgs.python311Packages; [ django ]; + nativeBuildInputs = with pkgs.python311Packages; [ django pymupdf ]; }