web interface version 1

There's a crappy frontend, but the Django admin panel works as expected.
2024-06-19 12:41:41 -05:00 · 2024-06-19 12:41:41 -05:00 · ca61adc1e6
parent 308b0f978f
commit ca61adc1e6
12 changed files with 443 additions and 29 deletions
--- a/franklincce/explorer/admin.py
+++ b/franklincce/explorer/admin.py
@ -2,5 +2,11 @@ from django.contrib import admin

 from .models import LegislativeText, LegislationBook

-admin.site.register(LegislativeText)
-admin.site.register(LegislationBook)
+@admin.register(LegislativeText)
+class LegislativeTextAdmin(admin.ModelAdmin):
+    """Admin options for LegislativeText: extra columns in the change list."""
+    list_display = ('__str__', 'legislation_title', 'school')
+
+
+@admin.register(LegislationBook)
+class LegislationBookAdmin(admin.ModelAdmin):
+    """Admin options for LegislationBook: hides the internal export flag."""
+    exclude = ("has_performed_export",)
--- a/franklincce/explorer/lib/HSMUN.py
+++ b/franklincce/explorer/lib/HSMUN.py
@ -0,0 +1,158 @@
+from .common import *
+from typing import ClassVar
+from dataclasses import dataclass
+
+import fitz
+
+class HSMUN():
+    """
+    Parser for a High School Model UN legislation book PDF.
+
+    Parsing runs eagerly when the object is constructed. The result is stored
+    in `self.output` as a list of dicts with the keys `code`, `school`,
+    `sponsors`, `subcommittee`, `title` and `bill_text`.
+    """
+
+    # A page containing these words is treated as a committee section marker.
+    section_page_words = ["Committee", "Model", "United", "YMCA", "Tennessee", "Nations"]
+    # A page containing these words closes the last legislative section.
+    last_page_words = ["ABCs"]
+    # Text of the block that starts each piece of legislation.
+    bill_header_text = "43rd General Assembly"
+    # Placeholder stored when a field's block is not found at any known x value.
+    missing_field_text = "you tell me, man"
+
+    def __init__(self, document: fitz.Document):
+        self.document = document
+        self.__post_init__()
+
+    def __post_init__(self):
+        # run all the processing steps here
+        self.parse_legislative_metablocks()
+
+    def generate_section_markers(self) -> list[int]:
+        """
+        In the YIG/MUN manuals, there's section markers that delineate between the different
+        committees within the manual. Let's find those, and then the last legislative page.
+
+        Returns the page numbers of the marker pages, in document order.
+        """
+        section_pages = []
+
+        for page in self.document:
+            # Search the text as a str; encoding it to bytes first would make
+            # the helper search the bytes repr instead of the page text.
+            text = page.get_text()
+
+            if words_in_superstring(words=self.section_page_words, superstring=text):
+                section_pages.append(page.number)
+
+            # The closing page only counts once more than two markers have
+            # been collected, so earlier mentions of the word are ignored.
+            is_last_page = words_in_superstring(
+                words=self.last_page_words,
+                superstring=text
+            )
+            if is_last_page and len(section_pages) > 2:
+                section_pages.append(page.number)
+
+        return section_pages
+
+    def get_legislative_pages(self):
+        """
+        Generate the section markers, then fill in the pages between them.
+
+        Returns the page numbers strictly between each pair of consecutive
+        markers; the marker pages themselves are excluded.
+        """
+        sections = self.generate_section_markers()
+        legislative_pages: list[int] = []
+
+        for start, end in zip(sections, sections[1:]):
+            legislative_pages.extend(range(start + 1, end))
+
+        return legislative_pages
+
+    def concat_blocks_for_leg_pages(self):
+        """
+        From the legislative pages, concatenate the "blocks" of text in the PDF.
+
+        Returns one flat list of FitzBlockWrapper objects in page order.
+        """
+        blocks = []
+        for page_num in self.get_legislative_pages():
+            page = self.document.load_page(page_num)
+            blocks.extend(FitzBlockWrapper(block) for block in page.get_text("blocks"))
+
+        return blocks
+
+    def split_leg_pages(self):
+        """
+        We have the collection of legislative page text blocks. We need
+        to split them now. We split on blocks containing the text
+        "43rd General Assembly"; the matching block itself is dropped.
+        """
+        blocks = self.concat_blocks_for_leg_pages()
+        # each item within splitted is called a "legislative meta-block"
+        splitted = split_by_lambda(blocks, lambda x: self.bill_header_text in x.text)
+
+        return splitted[1:] # the first group is whatever came before the first header
+
+    def handle_the_rest(self, the_rest):
+        """
+        Split the text that follows the metadata blocks into title and body.
+
+        Everything before the first U+FFFD character is the title, minus its
+        final line. Each U+FFFD-delimited chunk after that contributes its
+        first line, with the leading character removed, to the bill text.
+
+        Returns a dict with the keys `bill_text` and `title`.
+        """
+        # NOTE(review): U+FFFD presumably is how the PDF's bullet glyph comes
+        # out of text extraction -- confirm against a sample book.
+        weird_character = u'\uFFFd'
+        splitted_by_weird = the_rest.split(weird_character)
+        title_content = ''.join(
+            splitted_by_weird[0].split('\n')[:-1]
+        ).strip()
+
+        bill_text = [i.split('\n')[0][1:] for i in splitted_by_weird[1:]]
+
+        return {
+            "bill_text": '\n'.join(bill_text),
+            "title": title_content
+        }
+
+    def _text_at_first_x(self, blocks, x_values):
+        """
+        Return the right-stripped text of the first block found at any of
+        `x_values` (tried in order), or the placeholder when none match.
+        """
+        for x_value in x_values:
+            block = get_block_by_x_value(blocks, x_value)
+            if block is not None:
+                return block.text.rstrip()
+
+        return self.missing_field_text
+
+    def parse_legislative_metablocks(self):
+        """
+        Turn every legislative meta-block into a dict and store the list in
+        `self.output`.
+
+        Raises AttributeError if a meta-block has no bill-code block at x=88.
+        """
+        output = []
+        for legislative_text in self.split_leg_pages():
+            # there are some blocks that contain just one value
+            # and are aligned to some x value on the pdf
+
+            # it's an easy way to extract stuff
+            leg_code = get_block_by_x_value(legislative_text, 88).text.rstrip()
+
+            # each field has been seen at two slightly different x offsets
+            school = self._text_at_first_x(legislative_text, (177, 186))
+            sponsors = self._text_at_first_x(legislative_text, (163, 166))
+            subcommittee = self._text_at_first_x(legislative_text, (151, 153))
+
+            # NOTE(review): the first 12 blocks are presumably the metadata
+            # labels and values -- confirm against a sample book.
+            the_rest = ''.join(i.text for i in legislative_text[12:])
+            handled = self.handle_the_rest(the_rest)
+
+            output.append({
+                "code": leg_code,
+                "school": school,
+                "sponsors": sponsors,
+                "subcommittee": subcommittee,
+                "title": handled["title"],
+                "bill_text": handled["bill_text"]
+            })
+
+        self.output = output
--- a/franklincce/explorer/lib/HSYIG.py
+++ b/franklincce/explorer/lib/HSYIG.py
@ -0,0 +1,139 @@
+from .common import *
+from typing import ClassVar
+from dataclasses import dataclass
+
+import fitz
+
+class HSYIG():
+    """
+    Parser for a High School Youth in Government legislation book PDF.
+
+    Parsing runs eagerly when the object is constructed. The result is stored
+    in `self.output` as a list of dicts with the keys `code`, `school`,
+    `sponsors`, `subcommittee`, `title` and `bill_text`.
+    """
+
+    # A page containing these words (and exactly three images) is treated as
+    # a committee section marker.
+    section_page_words = [ "Committee", "YMCA", "Tennessee", "Youth", "in" ]
+    # A page containing these words closes the last legislative section.
+    last_page_words = [ "ABCs" ]
+    # Text of the block that starts each piece of legislation.
+    bill_header_text = "71st General Assembly"
+
+    def __init__(self, document: fitz.Document):
+        self.document = document
+        self.__post_init__()
+
+    def __post_init__(self):
+        # run all the processing steps here
+        self.parse_legislative_metablocks()
+
+    def generate_section_markers(self) -> list[int]:
+        """
+        In the YIG/MUN manuals, there's section markers that delineate between the different
+        committees within the manual. Let's find those, and then the last legislative page.
+
+        Returns the page numbers of the marker pages, in document order.
+        """
+        section_pages = []
+
+        for page in self.document:
+            # Search the text as a str; encoding it to bytes first would make
+            # the helper search the bytes repr instead of the page text.
+            text = page.get_text()
+            is_section_page = words_in_superstring(
+                words=self.section_page_words,
+                superstring=text
+            )
+            is_last_page = words_in_superstring(
+                words=self.last_page_words,
+                superstring=text
+            )
+
+            # NOTE(review): the three-image requirement presumably tells real
+            # section title pages from other pages with the same words -- confirm.
+            if is_section_page and len(page.get_images()) == 3:
+                section_pages.append(page.number)
+
+            # The closing page only counts once more than two markers have
+            # been collected, so earlier mentions of the word are ignored.
+            if is_last_page and len(section_pages) > 2:
+                section_pages.append(page.number)
+
+        return section_pages
+
+    def get_legislative_pages(self):
+        """
+        Generate the section markers, then fill in the pages between them.
+
+        Returns the page numbers strictly between each pair of consecutive
+        markers; the marker pages themselves are excluded.
+        """
+        sections = self.generate_section_markers()
+        legislative_pages: list[int] = []
+
+        for start, end in zip(sections, sections[1:]):
+            legislative_pages.extend(range(start + 1, end))
+
+        return legislative_pages
+
+    def concat_blocks_for_leg_pages(self):
+        """
+        From the legislative pages, concatenate the "blocks" of text in the PDF.
+
+        Returns one flat list of FitzBlockWrapper objects in page order.
+        """
+        blocks = []
+        for page_num in self.get_legislative_pages():
+            page = self.document.load_page(page_num)
+            blocks.extend(FitzBlockWrapper(block) for block in page.get_text("blocks"))
+
+        return blocks
+
+    def split_leg_pages(self):
+        """
+        We have the collection of legislative page text blocks. We need
+        to split them now. We split on blocks containing the text
+        "71st General Assembly"; the matching block itself is dropped.
+        """
+        blocks = self.concat_blocks_for_leg_pages()
+        # each item within splitted is called a "legislative meta-block"
+        splitted = split_by_lambda(blocks, lambda x: self.bill_header_text in x.text)
+
+        return splitted[1:] # the first group is whatever came before the first header
+
+    def handle_the_rest(self, the_rest):
+        """
+        Split the text that follows the metadata blocks into title and body.
+
+        Everything before the first U+FFFD character is the title, minus its
+        final line. Each U+FFFD-delimited chunk after that contributes its
+        first line, with the leading character removed, to the bill text.
+
+        Returns a dict with the keys `bill_text` and `title`.
+        """
+        # NOTE(review): U+FFFD presumably is how the PDF's bullet glyph comes
+        # out of text extraction -- confirm against a sample book.
+        weird_character = u'\uFFFd'
+        splitted_by_weird = the_rest.split(weird_character)
+        title_content = ''.join(
+            splitted_by_weird[0].split('\n')[:-1]
+        ).strip()
+
+        bill_text = [i.split('\n')[0][1:] for i in splitted_by_weird[1:]]
+
+        return {
+            "bill_text": '\n'.join(bill_text),
+            "title": title_content
+        }
+
+    def parse_legislative_metablocks(self):
+        """
+        Turn every legislative meta-block into a dict and store the list in
+        `self.output`.
+
+        Raises AttributeError if a meta-block lacks a block at one of the
+        expected x values (88, 163, 152, 139).
+        """
+        output = []
+        for legislative_text in self.split_leg_pages():
+            # there are some blocks that contain just one value
+            # and are aligned to some x value on the pdf
+
+            # it's an easy way to extract stuff
+            legislative_text = remove_block_by_x_value(legislative_text, 565) # remove page numbers
+            leg_code = get_block_by_x_value(legislative_text, 88).text.rstrip()
+            school = get_block_by_x_value(legislative_text, 163).text.rstrip()
+            sponsors = get_block_by_x_value(legislative_text, 152).text.rstrip()
+            subcommittee = get_block_by_x_value(legislative_text, 139).text.rstrip()
+
+            # NOTE(review): the first 6 blocks are presumably the metadata
+            # labels and values -- confirm against a sample book.
+            the_rest = ''.join(i.text for i in legislative_text[6:])
+            handled = self.handle_the_rest(the_rest)
+
+            output.append({
+                "code": leg_code,
+                "school": school,
+                "sponsors": sponsors,
+                "subcommittee": subcommittee,
+                "title": handled["title"],
+                "bill_text": handled["bill_text"]
+            })
+
+        self.output = output
--- a/franklincce/explorer/lib/init.py
+++ b/franklincce/explorer/lib/init.py
@ -0,0 +1 @@
+
--- a/franklincce/explorer/lib/common.py
+++ b/franklincce/explorer/lib/common.py
@ -0,0 +1,48 @@
+from typing import Any
+class FitzBlockWrapper:
+    """
+    Attribute-style view of one 7-item block tuple:
+    (x0, y0, x1, y1, text, block_number, block_type).
+
+    Coordinates, block number and block type are truncated to int so that
+    blocks can be matched on exact x positions.
+    """
+
+    def __init__(self, block):
+        self.x0, self.y0, self.x1, \
+            self.y1, self.text, \
+            self.block_number, self.block_type = block
+
+        self.x0 = int(self.x0)
+        self.x1 = int(self.x1)
+        self.y0 = int(self.y0)
+        self.y1 = int(self.y1)
+        self.block_number = int(self.block_number)
+        self.block_type = int(self.block_type)
+
+    def __str__(self):
+        return str((
+            self.x0, self.y0, self.x1, self.y1, self.text
+        ))
+
+    def __repr__(self):
+        # Same as __str__, so lists of blocks print readably while debugging.
+        return self.__str__()
+
+    # The method was originally misspelled; keep the old name as an alias.
+    __repl__ = __repr__
+
+def words_in_superstring(words: list[str], superstring: str) -> bool:
+    """
+    Report whether every word in `words` occurs in `superstring`,
+    ignoring case.
+
+    `superstring` may also be UTF-8 encoded bytes; it is decoded before the
+    search. An empty `words` list yields False.
+    """
+    if isinstance(superstring, (bytes, bytearray)):
+        # str() on bytes gives the "b'...'" repr, which escapes non-ASCII text
+        haystack = bytes(superstring).decode("utf8", errors="replace")
+    else:
+        haystack = str(superstring)
+    haystack = haystack.lower()
+
+    if not words:
+        return False
+
+    return all(str(word).lower() in haystack for word in words)
+
+def split_by_lambda(arr: list[Any], func):
+    """
+    Split `arr` into groups, starting a new group at every item for which
+    `func(item)` is truthy. The separator items themselves are dropped.
+
+    There is always one more group than there are separators, so a leading
+    or trailing separator yields an empty group.
+    """
+    groups = [[]]
+    for element in arr:
+        if func(element):
+            groups.append([])
+            continue
+        groups[-1].append(element)
+
+    return groups
+
+def get_block_by_x_value(arr: list[FitzBlockWrapper], xvalue: int) -> FitzBlockWrapper:
+    """
+    Return the first block in `arr` whose `x0` equals `xvalue`.
+
+    Returns None when no block matches, so callers must be ready for that.
+    """
+    matches = (block for block in arr if block.x0 == xvalue)
+    return next(matches, None)
+
+def remove_block_by_x_value(arr: list[FitzBlockWrapper], xvalue: int) -> list[FitzBlockWrapper]:
+    """
+    Return a new list holding the blocks of `arr` whose `x0` differs from
+    `xvalue`; `arr` itself is left untouched.
+    """
+    kept = []
+    for block in arr:
+        if block.x0 != xvalue:
+            kept.append(block)
+    return kept
--- a/franklincce/explorer/lib/parsers.py
+++ b/franklincce/explorer/lib/parsers.py
@ -0,0 +1,9 @@
+import fitz
+
+from .HSYIG import HSYIG
+from .HSMUN import HSMUN
+
+if __name__ == "__main__":
+    # Manual smoke test: parse a local MUN book and dump the result.
+    # Parsing finishes inside the constructor, so the document can be
+    # closed as soon as HSMUN() returns.
+    with fitz.open("MUNB2023.pdf") as d:
+        res = HSMUN(d)
+    print(res.output)
--- a/franklincce/explorer/migrations/0003_legislationbook_has_performed_export_and_more.py
+++ b/franklincce/explorer/migrations/0003_legislationbook_has_performed_export_and_more.py
@ -0,0 +1,23 @@
+# Generated by Django 4.2.12 on 2024-06-19 17:36
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+    """
+    Adds the `has_performed_export` flag to LegislationBook and restricts
+    `import_strategy` to the two known parser choices.
+    """
+
+    dependencies = [
+        ('explorer', '0002_legislativetext_legislation_title'),
+    ]
+
+    operations = [
+        # New boolean column; existing rows get False.
+        migrations.AddField(
+            model_name='legislationbook',
+            name='has_performed_export',
+            field=models.BooleanField(default=False),
+        ),
+        # import_strategy gains a fixed choice list and a default.
+        migrations.AlterField(
+            model_name='legislationbook',
+            name='import_strategy',
+            field=models.CharField(choices=[('HSYIGBookParser', 'High School YIG Book Parser 1'), ('HSMUNBookParser', 'High School MUN Book Parser 1')], default='HSYIGBookParser', max_length=128),
+        ),
+    ]
--- a/franklincce/explorer/models.py
+++ b/franklincce/explorer/models.py
@ -1,11 +1,19 @@
 from django.db import models
 from django.utils.translation import gettext_lazy as _

+from .lib.parsers import HSYIG, HSMUN
+import io
+import fitz
+
 class LegislationBook(models.Model):
    class ConferenceType(models.TextChoices):
        MIDDLE = "M", _("Middle School")
        HIGH = "H", _("High School")

+    class ImportStrategy(models.TextChoices):
+        # The stored string values are compared in save() to pick the parser
+        # class, so they must stay in sync with that method.
+        HSYIGA = "HSYIGBookParser", _("High School YIG Book Parser 1")
+        HSMUNA = "HSMUNBookParser", _("High School MUN Book Parser 1")
+
    conference_type = models.CharField(
        max_length=1,
        choices=ConferenceType.choices,
@ -13,7 +21,51 @@ class LegislationBook(models.Model):
    )
    pdf = models.FileField(upload_to="uploads/")
    name = models.CharField(max_length=256)
-    import_strategy = models.CharField(max_length=128)
+    import_strategy = models.CharField(
+        max_length=128,
+        choices=ImportStrategy.choices,
+        default=ImportStrategy.HSYIGA
+    )
+    has_performed_export = models.BooleanField(default=False)
+
+    def save(self, *args, **kwargs):
+        """
+        Save the book and, on its first save, import the legislation it contains.
+
+        The uploaded PDF is parsed with the parser selected by
+        `import_strategy`, and one LegislativeText row is created per parsed
+        bill. `has_performed_export` is set before parsing starts, so later
+        saves (and a failed import) never trigger the import again.
+
+        Raises whatever the parser raises, plus IndexError / ValueError when
+        a bill code does not look like "<assembly>/<yy>-<committee>-<order>".
+        """
+        super().save(*args, **kwargs)
+
+        if self.has_performed_export:
+            return
+
+        # Persist only the flag. Replaying the caller's arguments here would
+        # repeat options such as force_insert on a row that now exists.
+        self.has_performed_export = True
+        super().save(
+            update_fields=["has_performed_export"],
+            using=kwargs.get("using"),
+        )
+
+        if self.import_strategy == "HSYIGBookParser":
+            parser = HSYIG
+        elif self.import_strategy == "HSMUNBookParser":
+            parser = HSMUN
+        else:
+            return
+
+        # open() rewinds the file; the upload was already read once when it
+        # was written to storage by the save above.
+        with self.pdf.open("rb") as pdf_file:
+            pdf_bytes = pdf_file.read()
+
+        # The parser does all its work in the constructor, so the document
+        # can be closed as soon as it returns.
+        with fitz.open(stream=pdf_bytes, filetype="pdf") as the_document:
+            parsed = parser(the_document)
+
+        for entry in parsed.output:
+            codesplit = entry["code"].split('/')
+            assembly = codesplit[0]
+            dashsplit = codesplit[1].split('-')
+            # the book prints a two-digit year
+            year = 2000 + int(dashsplit[0])
+            committee = int(dashsplit[1])
+            docket_order = int(dashsplit[2])
+            legislation = LegislativeText(
+                assembly=assembly,
+                year=year,
+                committee=committee,
+                docket_order=docket_order,
+                school=entry["school"],
+                sponsors=entry["sponsors"],
+                legislation_title=entry["title"],
+                text=entry["bill_text"],
+                from_book=self
+            )
+            legislation.save()

    def __str__(self):
        return "{}".format(self.name)
@ -48,8 +100,9 @@ class LegislativeText(models.Model):
    legislation_title = models.CharField(max_length=512)

    def __str__(self):
-        return "{}/{}-{}".format(
+        return "{}/{}-{}-{}".format(
            self.assembly,
+            str(self.year),
            self.committee,
            self.docket_order,
        )
--- a/franklincce/explorer/templates/explorer/import.html
+++ b/franklincce/explorer/templates/explorer/import.html
@ -1,10 +0,0 @@
-<form action="{% url 'import_books' %}" method="post" enctype="multipart/form-data">
-{% csrf_token %}
-<input type="text" name="bookname">
-<input type="file" name="bookpdf">
-<input type="submit" value="Import PDF">
-</form>
-
-{% if just_imported %}
-thanks for the import!
-{% endif %}
--- a/franklincce/explorer/urls.py
+++ b/franklincce/explorer/urls.py
@ -5,5 +5,4 @@ from . import views
 urlpatterns = [
    path("", views.index, name="index"),
    path("legislation/<int:legislation_id>/", views.view_legislation, name="viewleg"),
-    path("import/", views.import_books, name="import_books"),
 ]
--- a/franklincce/explorer/views.py
+++ b/franklincce/explorer/views.py
@ -4,24 +4,12 @@ from django.http import HttpResponse
 from .models import LegislativeText, LegislationBook

 def index(request):
-    legislative_texts = LegislativeText.objects.all()[:5]
+    legislative_texts = LegislativeText.objects.all()
    context = {
        "legislative_texts": legislative_texts,
    }
    return render(request, "explorer/index.html", context)

-def import_books(request):
-    if request.method == "GET":
-        return render(request, "explorer/import.html", {})
-    elif request.method == "POST":
-        book = LegislationBook(
-            pdf=request.FILES["bookpdf"],
-            name=request.POST.get("bookname"),
-            conference_type="H",
-        )
-        book.save()
-        return render(request, "explorer/import.html", {just_imported: True})
-
 def view_legislation(request, legislation_id):
    legislation = get_object_or_404(LegislativeText, pk=legislation_id)
    context = {
--- a/shell.nix
+++ b/shell.nix
@ -1,4 +1,4 @@
 { pkgs ? import <nixpkgs> {} }:
  pkgs.mkShell {
-    nativeBuildInputs = with pkgs.python311Packages; [ django ];
+    nativeBuildInputs = with pkgs.python311Packages; [ django pymupdf ];
  }