Compare commits

...

5 Commits

Author SHA1 Message Date
stupidcomputer ca61adc1e6 web interface version 1
There's a crappy frontend, but the Django admin panel works as expected.
2024-06-19 12:41:41 -05:00
stupidcomputer 308b0f978f created LegislativeText and LegislationBook models, wired them
- Created the LegislativeText and LegislationBook models, and wired them
  to primitive web interfaces.

- Created an import-like mechanism for users to import books into the
  database with. Need to integrate previous bill-book parsing code.
2024-06-19 05:41:13 -05:00
stupidcomputer 3c73c209a7 add 'explorer' app to 'franklincce' project 2024-06-19 01:29:30 -05:00
stupidcomputer c08636fcad create project 'franklincce' 2024-06-19 01:29:30 -05:00
stupidcomputer 244c71eb59 boilerplate 2024-06-19 01:29:26 -05:00
27 changed files with 825 additions and 0 deletions

7
.gitignore vendored Normal file
View File

@ -0,0 +1,7 @@
*.log
*.pot
*.pyc
__pycache__
db.sqlite3
media
uploads/

4
README.md Normal file
View File

@ -0,0 +1,4 @@
yig
===
A tool to explore past bills, manage delegations, etc.

View File

View File

@ -0,0 +1,12 @@
from django.contrib import admin
from .models import LegislativeText, LegislationBook
class LegislativeTextAdmin(admin.ModelAdmin):
    """Admin options for LegislativeText: list the string form, title and school."""

    list_display = (
        '__str__',
        'legislation_title',
        'school',
    )
class LegislationBookAdmin(admin.ModelAdmin):
    """Admin options for LegislationBook; ``has_performed_export`` is kept out of the form."""

    exclude = [
        "has_performed_export",
    ]
# Expose both explorer models in the Django admin with their custom options.
admin.site.register(LegislativeText, admin_class=LegislativeTextAdmin)
admin.site.register(LegislationBook, admin_class=LegislationBookAdmin)

View File

@ -0,0 +1,6 @@
from django.apps import AppConfig
class ExplorerConfig(AppConfig):
    """Application configuration for the ``explorer`` app."""

    name = 'explorer'
    default_auto_field = 'django.db.models.BigAutoField'

View File

@ -0,0 +1,158 @@
from .common import *
from typing import ClassVar
from dataclasses import dataclass
import fitz
class HSMUN():
    """
    Parser for a High School Model United Nations (MUN) bill book PDF.

    Constructing an instance immediately runs the whole pipeline: the section
    divider pages are located, the pages between them are collected, their text
    blocks are split into one group per bill, and every group is turned into a
    dict. The resulting list is stored in ``self.output``; each dict has the
    keys ``code``, ``school``, ``sponsors``, ``subcommittee``, ``title`` and
    ``bill_text``.
    """

    # A page whose text contains all of these words is treated as a section divider.
    section_page_words = ["Committee", "Model", "United", "YMCA", "Tennessee", "Nations"]
    # A page whose text contains all of these words ends the last section.
    last_page_words = ["ABCs"]
    # Text found in the block that starts every bill; the block list is split on it.
    bill_header_marker = "43rd General Assembly"
    # Value stored when a header field is not found at any of its known x positions.
    missing_field_placeholder = "you tell me, man"
    # Blocks before this index are skipped when building the title/body text
    # (presumably they are the bill's header fields -- TODO confirm).
    header_block_count = 12

    def __init__(self, document: fitz.Document):
        """Keep a reference to *document* and parse it right away."""
        self.document = document
        self.__post_init__()

    def __post_init__(self):
        # run all the processing steps here
        self.parse_legislative_metablocks()

    def generate_section_markers(self) -> list[int]:
        """
        In the YIG/MUN manuals, there's section markers that delineate between the different
        committees within the manual. Let's find those, and then the last legislative page.

        Returns the page numbers of every section divider page, followed by the
        page number of each "last page" seen after more than two dividers.
        """
        section_pages = []
        for page in self.document:
            # words_in_superstring() stringifies and lowercases its input itself,
            # so the extracted text is passed through as-is.
            text = page.get_text()
            is_section_page = words_in_superstring(
                words=self.section_page_words,
                superstring=text,
            )
            is_last_page = words_in_superstring(
                words=self.last_page_words,
                superstring=text,
            )
            if is_section_page:
                section_pages.append(page.number)
            # The closing page only counts once more than two markers exist
            # (presumably to skip earlier mentions of the same words -- TODO confirm).
            if is_last_page and len(section_pages) > 2:
                section_pages.append(page.number)
        return section_pages

    def get_legislative_pages(self):
        """
        Generate the section markers, then fill in the pages between them.

        Returns the page numbers strictly between each pair of consecutive
        markers; the marker pages themselves are excluded.
        """
        sections = self.generate_section_markers()
        legislative_pages: list[int] = []
        for start, end in zip(sections, sections[1:]):
            legislative_pages.extend(range(start + 1, end))
        return legislative_pages

    def concat_blocks_for_leg_pages(self):
        """
        From the legislative pages, concatenate the "blocks" of text in the PDF.

        Returns one flat list of FitzBlockWrapper objects in page order.
        """
        blocks = []
        for page_num in self.get_legislative_pages():
            page = self.document.load_page(page_num)
            blocks.extend(FitzBlockWrapper(block) for block in page.get_text("blocks"))
        return blocks

    def split_leg_pages(self):
        """
        We have the collection of legislative page text blocks. We need
        to split them now. We split on every block whose text contains
        ``bill_header_marker``; the marker block itself is discarded.
        """
        blocks = self.concat_blocks_for_leg_pages()
        # each item within splitted is called a "legislative meta-block"
        splitted = split_by_lambda(blocks, lambda x: self.bill_header_marker in x.text)
        # The first chunk holds whatever came before the first marker, not a bill.
        return splitted[1:]

    def handle_the_rest(self, the_rest):
        """
        Split the text that follows a bill's header into title and body.

        The text is cut at every U+FFFD character. The piece before the first
        one supplies the title: all of its lines except the last, concatenated
        and stripped. Every later piece supplies one body line: its first line
        with the leading character removed.

        Returns a dict with the keys ``bill_text`` and ``title``.
        """
        # NOTE(review): U+FFFD presumably precedes each body line in the
        # extracted text of these PDFs -- confirm against a sample book.
        weird_character = '\uFFFD'
        splitted_by_weird = the_rest.split(weird_character)
        title_lines = splitted_by_weird[0].split('\n')[:-1]
        title_content = ''.join(title_lines).strip()
        bill_text = [piece.split('\n')[0][1:] for piece in splitted_by_weird[1:]]
        return {
            "bill_text": '\n'.join(bill_text),
            "title": title_content
        }

    def _field_text(self, blocks, x_values):
        """Return the right-stripped text of the first block found at one of *x_values*, else the placeholder."""
        for x_value in x_values:
            block = get_block_by_x_value(blocks, x_value)
            if block is not None:
                return block.text.rstrip()
        return self.missing_field_placeholder

    def parse_legislative_metablocks(self):
        """
        Turn every legislative meta-block into a dict and store the list in ``self.output``.

        Raises AttributeError when a meta-block has no block at x == 88 (the
        bill code); the other header fields fall back to a placeholder.
        """
        output = []
        for legislative_text in self.split_leg_pages():
            # there are some blocks that contain just one value
            # and are aligned to some x value on the pdf
            # it's an easy way to extract stuff
            leg_code = get_block_by_x_value(legislative_text, 88).text.rstrip()
            # Each of these fields has been seen at two x positions; try both.
            school = self._field_text(legislative_text, (177, 186))
            sponsors = self._field_text(legislative_text, (163, 166))
            subcommittee = self._field_text(legislative_text, (151, 153))
            the_rest = ''.join(
                block.text for block in legislative_text[self.header_block_count:]
            )
            handled = self.handle_the_rest(the_rest)
            output.append({
                "code": leg_code,
                "school": school,
                "sponsors": sponsors,
                "subcommittee": subcommittee,
                "title": handled["title"],
                "bill_text": handled["bill_text"]
            })
        self.output = output

View File

@ -0,0 +1,139 @@
from .common import *
from typing import ClassVar
from dataclasses import dataclass
import fitz
class HSYIG():
    """
    Parser for a High School Youth in Government (YIG) bill book PDF.

    Constructing an instance immediately runs the whole pipeline: the section
    divider pages are located, the pages between them are collected, their text
    blocks are split into one group per bill, and every group is turned into a
    dict. The resulting list is stored in ``self.output``; each dict has the
    keys ``code``, ``school``, ``sponsors``, ``subcommittee``, ``title`` and
    ``bill_text``.
    """

    # A page whose text contains all of these words (and that carries exactly
    # three images) is treated as a section divider.
    section_page_words = ["Committee", "YMCA", "Tennessee", "Youth", "in"]
    # A page whose text contains all of these words ends the last section.
    last_page_words = ["ABCs"]
    # Text found in the block that starts every bill; the block list is split on it.
    bill_header_marker = "71st General Assembly"
    # Blocks left-aligned at this x value are dropped as page numbers.
    page_number_x = 565
    # Blocks before this index are skipped when building the title/body text
    # (presumably they are the bill's header fields -- TODO confirm).
    header_block_count = 6

    def __init__(self, document: fitz.Document):
        """Keep a reference to *document* and parse it right away."""
        self.document = document
        self.__post_init__()

    def __post_init__(self):
        # run all the processing steps here
        self.parse_legislative_metablocks()

    def generate_section_markers(self) -> list[int]:
        """
        In the YIG/MUN manuals, there's section markers that delineate between the different
        committees within the manual. Let's find those, and then the last legislative page.

        Returns the page numbers of every section divider page, followed by the
        page number of each "last page" seen after more than two dividers.
        """
        section_pages = []
        for page in self.document:
            # words_in_superstring() stringifies and lowercases its input itself,
            # so the extracted text is passed through as-is.
            text = page.get_text()
            is_section_page = words_in_superstring(
                words=self.section_page_words,
                superstring=text,
            )
            is_last_page = words_in_superstring(
                words=self.last_page_words,
                superstring=text,
            )
            # The word list alone is not selective enough here, so a divider
            # page must also carry exactly three images.
            if is_section_page and len(page.get_images()) == 3:
                section_pages.append(page.number)
            # The closing page only counts once more than two markers exist
            # (presumably to skip earlier mentions of the same words -- TODO confirm).
            if is_last_page and len(section_pages) > 2:
                section_pages.append(page.number)
        return section_pages

    def get_legislative_pages(self):
        """
        Generate the section markers, then fill in the pages between them.

        Returns the page numbers strictly between each pair of consecutive
        markers; the marker pages themselves are excluded.
        """
        sections = self.generate_section_markers()
        legislative_pages: list[int] = []
        for start, end in zip(sections, sections[1:]):
            legislative_pages.extend(range(start + 1, end))
        return legislative_pages

    def concat_blocks_for_leg_pages(self):
        """
        From the legislative pages, concatenate the "blocks" of text in the PDF.

        Returns one flat list of FitzBlockWrapper objects in page order.
        """
        blocks = []
        for page_num in self.get_legislative_pages():
            page = self.document.load_page(page_num)
            blocks.extend(FitzBlockWrapper(block) for block in page.get_text("blocks"))
        return blocks

    def split_leg_pages(self):
        """
        We have the collection of legislative page text blocks. We need
        to split them now. We split on every block whose text contains
        ``bill_header_marker``; the marker block itself is discarded.
        """
        blocks = self.concat_blocks_for_leg_pages()
        # each item within splitted is called a "legislative meta-block"
        splitted = split_by_lambda(blocks, lambda x: self.bill_header_marker in x.text)
        # The first chunk holds whatever came before the first marker, not a bill.
        return splitted[1:]

    def handle_the_rest(self, the_rest):
        """
        Split the text that follows a bill's header into title and body.

        The text is cut at every U+FFFD character. The piece before the first
        one supplies the title: all of its lines except the last, concatenated
        and stripped. Every later piece supplies one body line: its first line
        with the leading character removed.

        Returns a dict with the keys ``bill_text`` and ``title``.
        """
        # NOTE(review): U+FFFD presumably precedes each body line in the
        # extracted text of these PDFs -- confirm against a sample book.
        weird_character = '\uFFFD'
        splitted_by_weird = the_rest.split(weird_character)
        title_lines = splitted_by_weird[0].split('\n')[:-1]
        title_content = ''.join(title_lines).strip()
        bill_text = [piece.split('\n')[0][1:] for piece in splitted_by_weird[1:]]
        return {
            "bill_text": '\n'.join(bill_text),
            "title": title_content
        }

    def parse_legislative_metablocks(self):
        """
        Turn every legislative meta-block into a dict and store the list in ``self.output``.

        Raises AttributeError when a meta-block lacks a block at one of the
        expected x positions (88, 163, 152 or 139).
        """
        output = []
        for legislative_text in self.split_leg_pages():
            # there are some blocks that contain just one value
            # and are aligned to some x value on the pdf
            # it's an easy way to extract stuff
            legislative_text = remove_block_by_x_value(
                legislative_text, self.page_number_x
            )  # remove page numbers
            leg_code = get_block_by_x_value(legislative_text, 88).text.rstrip()
            school = get_block_by_x_value(legislative_text, 163).text.rstrip()
            sponsors = get_block_by_x_value(legislative_text, 152).text.rstrip()
            subcommittee = get_block_by_x_value(legislative_text, 139).text.rstrip()
            the_rest = ''.join(
                block.text for block in legislative_text[self.header_block_count:]
            )
            handled = self.handle_the_rest(the_rest)
            output.append({
                "code": leg_code,
                "school": school,
                "sponsors": sponsors,
                "subcommittee": subcommittee,
                "title": handled["title"],
                "bill_text": handled["bill_text"]
            })
        self.output = output

View File

@ -0,0 +1 @@

View File

@ -0,0 +1,48 @@
from typing import Any
class FitzBlockWrapper:
    """
    Attribute-style access to one block tuple.

    *block* must be a 7-item sequence that unpacks as
    ``(x0, y0, x1, y1, text, block_number, block_type)``. Every item except
    ``text`` is converted with ``int()``, so fractional coordinates are
    truncated and can be compared against whole-number positions.
    """

    def __init__(self, block):
        x0, y0, x1, y1, self.text, block_number, block_type = block
        self.x0 = int(x0)
        self.x1 = int(x1)
        self.y0 = int(y0)
        self.y1 = int(y1)
        self.block_number = int(block_number)
        self.block_type = int(block_type)

    def __str__(self):
        """Return the tuple ``(x0, y0, x1, y1, text)`` rendered as a string."""
        return str((
            self.x0, self.y0, self.x1, self.y1, self.text
        ))

    def __repr__(self):
        """Use the same rendering as ``__str__`` so lists of blocks print readably."""
        return self.__str__()

    # The method used to be misspelt ``__repl__``; keep the old name callable.
    __repl__ = __repr__
def words_in_superstring(words: list[str], superstring: str) -> bool:
    """
    Return True when every entry of *words* occurs in *superstring*, ignoring case.

    *superstring* may also be ``bytes``/``bytearray``; it is then decoded as
    UTF-8 (undecodable bytes are replaced) instead of being searched through
    its ``b'...'`` repr. An empty *words* list yields True.
    """
    if isinstance(superstring, (bytes, bytearray)):
        haystack = bytes(superstring).decode("utf8", errors="replace")
    else:
        haystack = str(superstring)
    # Lowercase the haystack once rather than once per word.
    haystack = haystack.lower()
    return all(str(word).lower() in haystack for word in words)
def split_by_lambda(arr: list[Any], func):
    """
    Split *arr* into consecutive chunks at every item for which *func* is truthy.

    The separator items themselves are dropped. The result always holds one
    more chunk than there were separators, so chunks may be empty.
    """
    chunks = [[]]
    for element in arr:
        if func(element):
            # A separator closes the current chunk and opens a fresh one.
            chunks.append([])
            continue
        chunks[-1].append(element)
    return chunks
def get_block_by_x_value(arr: "list[FitzBlockWrapper]", xvalue: int) -> "FitzBlockWrapper | None":
    """
    Return the first item of *arr* whose ``x0`` equals *xvalue*.

    Returns None when no item matches, so callers must handle the miss
    (accessing ``.text`` on the result then raises AttributeError).
    """
    return next((item for item in arr if item.x0 == xvalue), None)
def remove_block_by_x_value(arr: list[FitzBlockWrapper], xvalue: int) -> list[FitzBlockWrapper]:
    """Return a new list with every item whose ``x0`` equals *xvalue* left out."""
    kept = []
    for block in arr:
        if block.x0 == xvalue:
            continue
        kept.append(block)
    return kept

View File

@ -0,0 +1,9 @@
import fitz
from .HSYIG import HSYIG
from .HSMUN import HSMUN
if __name__ == "__main__":
    import sys

    # Parse the PDF named on the command line; without an argument, fall back
    # to the sample book this script was originally hard-wired to.
    pdf_path = sys.argv[1] if len(sys.argv) > 1 else "MUNB2023.pdf"
    d = fitz.open(pdf_path)
    res = HSMUN(d)
    print(res.output)

View File

@ -0,0 +1,39 @@
# Generated by Django 4.2.12 on 2024-06-19 06:53
from django.db import migrations, models
import django.db.models.deletion
class Migration(migrations.Migration):
    """Initial explorer schema: creates the LegislationBook and LegislativeText tables."""

    initial = True

    dependencies = [
    ]

    operations = [
        migrations.CreateModel(
            name='LegislationBook',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('conference_type', models.CharField(choices=[('M', 'Middle School'), ('H', 'High School')], default='H', max_length=1)),
                ('pdf', models.FileField(upload_to='uploads/')),
                ('name', models.CharField(max_length=256)),
                ('import_strategy', models.CharField(max_length=128)),
            ],
        ),
        migrations.CreateModel(
            name='LegislativeText',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('assembly', models.CharField(choices=[('RGA', 'Red General Assembly'), ('BGA', 'Blue General Assembly'), ('WGA', 'White General Assembly'), ('RHB', 'Red House'), ('BHB', 'Blue House'), ('WHB', 'White House'), ('RSB', 'Red Senate'), ('BSB', 'Blue Senate'), ('WSB', 'White Senate'), ('SEN', 'Senate'), ('HOU', 'House'), ('GEN', 'General Assembly')], default='GEN', max_length=3)),
                ('text', models.TextField()),
                ('year', models.IntegerField()),
                ('committee', models.IntegerField()),
                ('docket_order', models.IntegerField()),
                ('school', models.CharField(max_length=256)),
                ('sponsors', models.CharField(max_length=256)),
                # Deleting a book also deletes every text imported from it.
                ('from_book', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='explorer.legislationbook')),
            ],
        ),
    ]

View File

@ -0,0 +1,19 @@
# Generated by Django 4.2.12 on 2024-06-19 07:22
from django.db import migrations, models
class Migration(migrations.Migration):
    """Adds the ``legislation_title`` column to LegislativeText."""

    dependencies = [
        ('explorer', '0001_initial'),
    ]

    operations = [
        migrations.AddField(
            model_name='legislativetext',
            name='legislation_title',
            # 'Sample title' only back-fills existing rows; preserve_default=False
            # means the field itself keeps no default afterwards.
            field=models.CharField(default='Sample title', max_length=512),
            preserve_default=False,
        ),
    ]

View File

@ -0,0 +1,23 @@
# Generated by Django 4.2.12 on 2024-06-19 17:36
from django.db import migrations, models
class Migration(migrations.Migration):
    """Adds ``has_performed_export`` to LegislationBook and gives ``import_strategy`` choices and a default."""

    dependencies = [
        ('explorer', '0002_legislativetext_legislation_title'),
    ]

    operations = [
        migrations.AddField(
            model_name='legislationbook',
            name='has_performed_export',
            field=models.BooleanField(default=False),
        ),
        migrations.AlterField(
            model_name='legislationbook',
            name='import_strategy',
            field=models.CharField(choices=[('HSYIGBookParser', 'High School YIG Book Parser 1'), ('HSMUNBookParser', 'High School MUN Book Parser 1')], default='HSYIGBookParser', max_length=128),
        ),
    ]

View File

@ -0,0 +1,108 @@
from django.db import models
from django.utils.translation import gettext_lazy as _
from .lib.parsers import HSYIG, HSMUN
import io
import fitz
class LegislationBook(models.Model):
    """
    An uploaded bill book PDF together with the parser used to import it.

    The first time a book is saved, its PDF is parsed with the selected
    import strategy and one LegislativeText row is created per parsed bill.
    """

    class ConferenceType(models.TextChoices):
        MIDDLE = "M", _("Middle School")
        HIGH = "H", _("High School")

    class ImportStrategy(models.TextChoices):
        HSYIGA = "HSYIGBookParser", _("High School YIG Book Parser 1")
        HSMUNA = "HSMUNBookParser", _("High School MUN Book Parser 1")

    conference_type = models.CharField(
        max_length=1,
        choices=ConferenceType.choices,
        default=ConferenceType.HIGH,
    )
    pdf = models.FileField(upload_to="uploads/")
    name = models.CharField(max_length=256)
    import_strategy = models.CharField(
        max_length=128,
        choices=ImportStrategy.choices,
        default=ImportStrategy.HSYIGA
    )
    # Set once the one-time import has been started, so later saves skip it.
    has_performed_export = models.BooleanField(default=False)

    def save(self, *args, **kwargs):
        """
        Save the book and, on the first save only, import its bills.

        Accepts the same arguments as ``Model.save``. The import is skipped
        when ``has_performed_export`` is already set or when
        ``import_strategy`` names no known parser. Errors raised while
        reading or parsing the PDF propagate to the caller.
        """
        super().save(*args, **kwargs)
        if self.has_performed_export:
            return

        # Persist the flag before importing. Only this column is written:
        # replaying the caller's arguments (e.g. force_insert=True coming from
        # objects.create()) would attempt a second INSERT of the same row.
        self.has_performed_export = True
        super().save(
            update_fields=["has_performed_export"],
            using=kwargs.get("using"),
        )

        if self.import_strategy == self.ImportStrategy.HSYIGA:
            parser_class = HSYIG
        elif self.import_strategy == self.ImportStrategy.HSMUNA:
            parser_class = HSMUN
        else:
            return

        # (Re)open the stored file so reading starts at the beginning even if
        # the upload was already consumed while it was written to storage.
        self.pdf.open("rb")
        the_file = io.BytesIO(self.pdf.read())
        the_document = fitz.open(stream=the_file)
        parsed = parser_class(the_document)

        for bill in parsed.output:
            assembly, year, committee, docket_order = self._parse_bill_code(bill["code"])
            LegislativeText(
                assembly=assembly,
                year=year,
                committee=committee,
                docket_order=docket_order,
                school=bill["school"],
                sponsors=bill["sponsors"],
                legislation_title=bill["title"],
                text=bill["bill_text"],
                from_book=self
            ).save()

    @staticmethod
    def _parse_bill_code(code):
        """
        Split a bill code of the form ``ASSEMBLY/YY-COMMITTEE-ORDER``.

        Returns ``(assembly, year, committee, docket_order)`` where the two-digit
        year is offset by 2000. Raises IndexError or ValueError when the code
        does not have that shape.
        """
        code_parts = code.split('/')
        assembly = code_parts[0]
        number_parts = code_parts[1].split('-')
        year = 2000 + int(number_parts[0])
        committee = int(number_parts[1])
        docket_order = int(number_parts[2])
        return assembly, year, committee, docket_order

    def __str__(self):
        return "{}".format(self.name)
class LegislativeText(models.Model):
    """A single bill: where it sat on the docket, who sponsored it, and its text."""

    class Assemblies(models.TextChoices):
        RGA = "RGA", _("Red General Assembly")
        BGA = "BGA", _("Blue General Assembly")
        WGA = "WGA", _("White General Assembly")
        RHB = "RHB", _("Red House")
        BHB = "BHB", _("Blue House")
        WHB = "WHB", _("White House")
        RSB = "RSB", _("Red Senate")
        BSB = "BSB", _("Blue Senate")
        WSB = "WSB", _("White Senate")
        SEN = "SEN", _("Senate")
        HOU = "HOU", _("House")
        GEN = "GEN", _("General Assembly")

    assembly = models.CharField(
        choices=Assemblies.choices,
        default=Assemblies.GEN,
        max_length=3,
    )
    text = models.TextField()
    year = models.IntegerField()
    committee = models.IntegerField()
    docket_order = models.IntegerField()
    school = models.CharField(max_length=256)
    sponsors = models.CharField(max_length=256)
    # Removing a book removes every text that was imported from it.
    from_book = models.ForeignKey(LegislationBook, on_delete=models.CASCADE)
    legislation_title = models.CharField(max_length=512)

    def __str__(self):
        """Render the bill as ``assembly/year-committee-docket_order``."""
        return f"{self.assembly}/{self.year}-{self.committee}-{self.docket_order}"

View File

@ -0,0 +1,9 @@
{# Lists every legislative text as a link to its "viewleg" detail page, or a notice when there are none. #}
{% if legislative_texts %}
<ul>
{% for text in legislative_texts %}
<li><a href="{% url 'viewleg' text.id %}">{{ text.legislation_title }}</a></li>
{% endfor %}
</ul>
{% else %}
<p>No texts available</p>
{% endif %}

View File

@ -0,0 +1,9 @@
{# Detail page for one legislative text. #}
<h1>{{ legislation.legislation_title }}</h1>
<i>{{ legislation.assembly }}/{{ legislation.committee }}/{{ legislation.docket_order }}</i>
<p>Sponsored by {{ legislation.sponsors }} of {{ legislation.school }}</p>
<blockquote>
{# The stored text separates bill lines with newlines; render them as <br> so HTML does not collapse them. #}
{{ legislation.text|linebreaksbr }}
</blockquote>

View File

@ -0,0 +1,3 @@
from django.test import TestCase
# Create your tests here.

View File

@ -0,0 +1,8 @@
from django.urls import path
from . import views
# Routes of the explorer app: the bill list and the per-bill detail page.
urlpatterns = [
    path(route="", view=views.index, name="index"),
    path(
        route="legislation/<int:legislation_id>/",
        view=views.view_legislation,
        name="viewleg",
    ),
]

View File

@ -0,0 +1,18 @@
from django.shortcuts import get_object_or_404, render
from django.http import HttpResponse
from .models import LegislativeText, LegislationBook
def index(request):
    """Render ``explorer/index.html`` with every LegislativeText as ``legislative_texts``."""
    return render(
        request,
        "explorer/index.html",
        {"legislative_texts": LegislativeText.objects.all()},
    )
def view_legislation(request, legislation_id):
    """Render ``explorer/legislation.html`` for the given primary key, or respond with 404."""
    found = get_object_or_404(LegislativeText, pk=legislation_id)
    return render(request, "explorer/legislation.html", {"legislation": found})

View File

View File

@ -0,0 +1,16 @@
"""
ASGI config for franklincce project.
It exposes the ASGI callable as a module-level variable named ``application``.
For more information on this file, see
https://docs.djangoproject.com/en/4.2/howto/deployment/asgi/
"""
import os
from django.core.asgi import get_asgi_application
# Point Django at the project settings unless the environment already names a module.
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'franklincce.settings')
# Module-level ASGI callable, created once at import time.
application = get_asgi_application()

View File

@ -0,0 +1,124 @@
"""
Django settings for franklincce project.
Generated by 'django-admin startproject' using Django 4.2.12.
For more information on this file, see
https://docs.djangoproject.com/en/4.2/topics/settings/
For the full list of settings and their values, see
https://docs.djangoproject.com/en/4.2/ref/settings/
"""
import os
from pathlib import Path
# Build paths inside the project like this: BASE_DIR / 'subdir'.
BASE_DIR = Path(__file__).resolve().parent.parent
# Quick-start development settings - unsuitable for production
# See https://docs.djangoproject.com/en/4.2/howto/deployment/checklist/
# SECURITY WARNING: keep the secret key used in production secret!
# DJANGO_SECRET_KEY overrides the committed development key; always set it
# for any deployment that is reachable by others.
SECRET_KEY = os.environ.get(
    'DJANGO_SECRET_KEY',
    'django-insecure-1%p#re)z*_xd9umo0!1foh(yiz&2=*5q#0b4(m42r0^m%kxli#',
)
# SECURITY WARNING: don't run with debug turned on in production!
# Debug stays on by default for development; DJANGO_DEBUG=0 (or false/no/off)
# switches it off.
DEBUG = os.environ.get('DJANGO_DEBUG', '1').strip().lower() not in ('', '0', 'false', 'no', 'off')
# Comma-separated host names in DJANGO_ALLOWED_HOSTS; empty by default.
ALLOWED_HOSTS = [
    host.strip()
    for host in os.environ.get('DJANGO_ALLOWED_HOSTS', '').split(',')
    if host.strip()
]
# Application definition
INSTALLED_APPS = [
'explorer.apps.ExplorerConfig',
'django.contrib.admin',
'django.contrib.auth',
'django.contrib.contenttypes',
'django.contrib.sessions',
'django.contrib.messages',
'django.contrib.staticfiles',
]
MIDDLEWARE = [
'django.middleware.security.SecurityMiddleware',
'django.contrib.sessions.middleware.SessionMiddleware',
'django.middleware.common.CommonMiddleware',
'django.middleware.csrf.CsrfViewMiddleware',
'django.contrib.auth.middleware.AuthenticationMiddleware',
'django.contrib.messages.middleware.MessageMiddleware',
'django.middleware.clickjacking.XFrameOptionsMiddleware',
]
ROOT_URLCONF = 'franklincce.urls'
TEMPLATES = [
{
'BACKEND': 'django.template.backends.django.DjangoTemplates',
'DIRS': [],
'APP_DIRS': True,
'OPTIONS': {
'context_processors': [
'django.template.context_processors.debug',
'django.template.context_processors.request',
'django.contrib.auth.context_processors.auth',
'django.contrib.messages.context_processors.messages',
],
},
},
]
WSGI_APPLICATION = 'franklincce.wsgi.application'
# Database
# https://docs.djangoproject.com/en/4.2/ref/settings/#databases
DATABASES = {
'default': {
'ENGINE': 'django.db.backends.sqlite3',
'NAME': BASE_DIR / 'db.sqlite3',
}
}
# Password validation
# https://docs.djangoproject.com/en/4.2/ref/settings/#auth-password-validators
AUTH_PASSWORD_VALIDATORS = [
{
'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
},
{
'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
},
{
'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
},
{
'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
},
]
# Internationalization
# https://docs.djangoproject.com/en/4.2/topics/i18n/
LANGUAGE_CODE = 'en-us'
TIME_ZONE = 'UTC'
USE_I18N = True
USE_TZ = True
# Static files (CSS, JavaScript, Images)
# https://docs.djangoproject.com/en/4.2/howto/static-files/
STATIC_URL = 'static/'
# Default primary key field type
# https://docs.djangoproject.com/en/4.2/ref/settings/#default-auto-field
DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'

View File

@ -0,0 +1,23 @@
"""
URL configuration for franklincce project.
The `urlpatterns` list routes URLs to views. For more information please see:
https://docs.djangoproject.com/en/4.2/topics/http/urls/
Examples:
Function views
1. Add an import: from my_app import views
2. Add a URL to urlpatterns: path('', views.home, name='home')
Class-based views
1. Add an import: from other_app.views import Home
2. Add a URL to urlpatterns: path('', Home.as_view(), name='home')
Including another URLconf
1. Import the include() function: from django.urls import include, path
2. Add a URL to urlpatterns: path('blog/', include('blog.urls'))
"""
from django.contrib import admin
from django.urls import include, path
# Project routes: the explorer app under /explorer/ and the Django admin under /admin/.
urlpatterns = [
    path(route='explorer/', view=include("explorer.urls")),
    path(route='admin/', view=admin.site.urls),
]

View File

@ -0,0 +1,16 @@
"""
WSGI config for franklincce project.
It exposes the WSGI callable as a module-level variable named ``application``.
For more information on this file, see
https://docs.djangoproject.com/en/4.2/howto/deployment/wsgi/
"""
import os
from django.core.wsgi import get_wsgi_application
# Point Django at the project settings unless the environment already names a module.
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'franklincce.settings')
# Module-level WSGI callable, created once at import time.
application = get_wsgi_application()

22
franklincce/manage.py Executable file
View File

@ -0,0 +1,22 @@
#!/usr/bin/env python
"""Django's command-line utility for administrative tasks."""
import os
import sys


def main():
    """Run administrative tasks."""
    # Default to the project settings; an already-set variable wins.
    os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'franklincce.settings')
    try:
        from django.core.management import execute_from_command_line
    except ImportError as exc:
        raise ImportError(
            "Couldn't import Django. Are you sure it's installed and "
            "available on your PYTHONPATH environment variable? Did you "
            "forget to activate a virtual environment?"
        ) from exc
    execute_from_command_line(sys.argv)


if __name__ == '__main__':
    main()

4
shell.nix Normal file
View File

@ -0,0 +1,4 @@
# Development shell that provides the Python 3.11 builds of Django and PyMuPDF.
{ pkgs ? import <nixpkgs> {} }:
pkgs.mkShell {
  nativeBuildInputs = with pkgs.python311Packages; [ django pymupdf ];
}