Compare commits

...

9 Commits

Author SHA1 Message Date
stupidcomputer 3848e1f777 it's a web interface 2024-05-20 05:21:13 -05:00
stupidcomputer 9680a416da move leglib into a python package 2024-05-19 17:56:26 -05:00
stupidcomputer 9ba154f654 add search to the setup 2024-05-19 17:51:51 -05:00
stupidcomputer 5c11ff4371 parser object overhauls
make conference names part of the parser object in preparation for the
implementation of billdb; refactor class structure for parsers
2024-05-19 16:02:33 -05:00
stupidcomputer dbd9632e16 fix bill parsing in the main parser 2024-05-19 16:02:03 -05:00
stupidcomputer 11fbcb474a split up the parsers and other utilities 2024-05-03 13:49:16 -05:00
stupidcomputer eabe1c98a0 add more changes to ledlib 2024-05-03 13:35:11 -05:00
stupidcomputer a62453bdea to be squashed 2024-05-03 06:42:56 -05:00
stupidcomputer 875890a83a need to clean this up 2024-05-03 05:58:33 -05:00
18 changed files with 788 additions and 0 deletions

2
.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
__pycache__/
*.pyc

BIN
YIGVolunteerBook2024.pdf Normal file

Binary file not shown.

99
cceexplorer/__init__.py Normal file
View File

@ -0,0 +1,99 @@
import secrets
from flask import Flask
from flask import render_template
from flask_bootstrap import Bootstrap
from .leglib.billdb import BillDB, BillQuery, QueryField, QueryAll
from .leglib.parsers import HSYIGPdfParser
# Parse the volunteer book once, at import time, and load the result into a
# module-level BillDB that the routes registered in create_app() read from.
# NOTE(review): the PDF path is relative, so it presumably resolves against the
# working directory of the process -- confirm how the app is launched.
parser = HSYIGPdfParser.from_filename(
    filename="YIGVolunteerBook2024.pdf",
    confname="HSVolunteer"
)
parser.parse()
db = BillDB()
db.add_conference(parser=parser)
def create_app(test_config=None):
    """
    Build the cceexplorer Flask application and register its routes.

    test_config -- optional mapping applied on top of the default
    configuration (for example to pin SECRET_KEY in tests).

    Returns the configured Flask app. Every /legislation/... route answers
    404 when a numeric URL segment is not an integer or when no bill matches.
    """
    # Local imports keep this function self-contained; both come from
    # packages the module already depends on (stdlib and flask).
    import html
    from flask import abort

    app = Flask(__name__, instance_relative_config=True)
    app.config.from_mapping(
        # randbelow(100000000) could only ever produce 10**8 distinct keys,
        # which is brute-forceable; token_hex(32) yields 256 random bits.
        SECRET_KEY=secrets.token_hex(32)
    )
    if test_config is not None:
        app.config.from_mapping(test_config)
    Bootstrap(app)

    def to_int(segment):
        """Convert a URL segment to int, answering 404 rather than crashing."""
        try:
            return int(segment)
        except ValueError:
            abort(404)

    def search_or_404(**fields):
        """Run a BillQuery; abort with 404 when nothing matches."""
        bills = db.search(query=BillQuery(**fields))
        if not bills:
            # the templates index bills[0], so an empty result must not reach them
            abort(404)
        return bills

    @app.route('/')
    def index():
        bills = db.search(query=QueryAll)
        # count the conferences actually present instead of a hard-coded number
        conferences = {bill.code.conference for bill in bills}
        return render_template(
            'index.html',
            number_bills=len(bills),
            number_conferences=len(conferences),
            bills=bills,
        )

    @app.route('/legislation/<conference>/<year>')
    def show_conference(conference=QueryField.Any, year=QueryField.Any):
        # The route supplies <year>, so the view must accept it; without the
        # parameter Flask's call raised TypeError on every request.
        # The segment is user input returned as HTML, hence the escaping.
        return html.escape(conference)

    # NOTE(review): none of the views below filter on <conference>; BillQuery
    # has no conference field, so the segment is currently informational only.

    @app.route('/legislation/<conference>/<color>/<year>')
    def show_color(
        conference=QueryField.Any,
        year=QueryField.Any,
        color=QueryField.Any,
    ):
        bills = search_or_404(
            color=color,
            year=to_int(year),
        )
        return render_template('color.html', bills=bills)

    @app.route('/legislation/<conference>/<color>/<assembly>/<year>')
    def show_assembly(
        conference=QueryField.Any,
        assembly=QueryField.Any,
        color=QueryField.Any,
        year=QueryField.Any,
    ):
        bills = search_or_404(
            color=color,
            assembly=assembly,
            year=to_int(year),
        )
        return render_template('assembly.html', bills=bills)

    @app.route('/legislation/<conference>/<color>/<assembly>/<year>/<committee>')
    def show_committee(
        conference=QueryField.Any,
        assembly=QueryField.Any,
        color=QueryField.Any,
        year=QueryField.Any,
        committee=QueryField.Any,
    ):
        # NOTE(review): the <color> segment is not used as a filter here; the
        # committee page appears to list every colour -- confirm that is intended.
        bills = search_or_404(
            color=QueryField.Any,
            assembly=assembly,
            year=to_int(year),
            committee=to_int(committee),
        )
        return render_template('committee.html', bills=bills)

    @app.route('/legislation/<conference>/<color>/<assembly>/<year>/<committee>/<order>')
    def show_bill(
        conference=QueryField.Any,
        assembly=QueryField.Any,
        color=QueryField.Any,
        year=QueryField.Any,
        committee=QueryField.Any,
        order=QueryField.Any,
    ):
        bills = search_or_404(
            color=color,
            assembly=assembly,
            year=to_int(year),
            committee=to_int(committee),
            order=to_int(order),
        )
        return render_template("bill.html", bill=bills[0])

    return app

29
cceexplorer/analyser.py Normal file
View File

@ -0,0 +1,29 @@
from leglib.billdb import BillDB, BillQuery, QueryField, QueryAll
from leglib.parsers import HSYIGPdfParser
# Load and parse the volunteer book, then report how many bills came out of it.
parser = HSYIGPdfParser.from_filename(
    confname="HSVolunteer",
    filename="YIGVolunteerBook2024.pdf",
)
parser.parse()
print(len(parser.bills))

db = BillDB()
db.add_conference(parser=parser)


def _count(query):
    """Return how many bills in db satisfy query."""
    return len(db.search(query=query))


# Tallies by colour, by assembly, and for one school name.
allbills = _count(QueryAll)
bluelen = _count(BillQuery(color=QueryField.Colors.Blue))
whitelen = _count(BillQuery(color=QueryField.Colors.White))
redlen = _count(BillQuery(color=QueryField.Colors.Red))
senatelen = _count(BillQuery(assembly=QueryField.Assemblies.Senate))
houselen = _count(BillQuery(assembly=QueryField.Assemblies.House))
franklincount = _count(BillQuery(school="Franklin"))

# The per-group sums are printed next to the total so mismatches are easy to spot.
print(allbills)
print(redlen, whitelen, bluelen, redlen + whitelen + bluelen)
print(senatelen, houselen, senatelen + houselen)
print(franklincount)

View File

View File

@ -0,0 +1,128 @@
from .common import Bill, CCEColors, CCEAssemblies
from .parsers import BookParser
from typing import Type, Self
from dataclasses import dataclass
class QueryAny:
    """
    Marker class standing for an Any match on attributes that have no Any
    member of their own.
    """
class SearchNotSatisified(Exception):
    """
    Raised by the BillDB matching helpers when a bill fails one criterion of a query.

    Derives from Exception rather than BaseException: BaseException is meant
    for interpreter-level events such as KeyboardInterrupt and SystemExit, and
    deriving from it let this error slip past ordinary `except Exception` handlers.
    """
class QueryAll:
    """
    Sentinel query: BillDB.search returns every stored bill when it is given
    this class in place of a BillQuery.
    """
class QueryField:
    """
    Namespace of the special values a BillQuery field may hold.
    """
    # unique sentinel object; BillQuery uses it as the default of every field
    Any = object()
    # aliases, so callers can write QueryField.Colors.Red or QueryField.Assemblies.House
    Colors = CCEColors
    Assemblies = CCEAssemblies
@dataclass
class BillQuery:
    """
    Holds a query for the BillDB.

    Every field defaults to QueryField.Any; BillDB.search skips the check for
    a field left at that default.
    """
    # compared against bill.code.color / bill.code.assembly
    color: str | CCEColors | QueryField = QueryField.Any
    assembly: str | CCEAssemblies | QueryField = QueryField.Any
    # compared for equality against bill.code.committee / bill.code.year
    committee: int | QueryField = QueryField.Any
    year: int | QueryField = QueryField.Any
    # compared for equality against bill.code.docketplacement
    order: int | QueryField = QueryField.Any
    # the remaining fields are case-insensitive substring matches on the bill
    subcommittee: str | QueryField = QueryField.Any
    sponsors: str | QueryField = QueryField.Any
    school: str | QueryField = QueryField.Any
    bill_text: str | QueryField = QueryField.Any
    title: str | QueryField = QueryField.Any

    def __post_init__(self):
        # BillDB.search looks up the same attribute name on both the query and
        # the bill, and the bill exposes its text as `bill_text_concat`.
        self.bill_text_concat = self.bill_text # for search compat reasons
class BillDB:
    """
    In-memory collection of bills gathered from one or more parsed conferences.
    """

    def __init__(self):
        self.bills: list[Bill] = []
        # Previously only annotated and never assigned, so reading it raised
        # AttributeError. Nothing in this class populates it yet.
        self.cache: dict = {}

    @staticmethod
    def code_enum_match(bill: Bill, query: BillQuery, attr: str) -> None:
        """
        Check one enum-valued attribute of bill.code against the query.

        This replaces repetitive code like this:
            elif bill.assembly != CCEAssemblies.Any:
                if bill.assembly != query.color:
                    raise SearchNotSatisified()
        with this:
            self.code_enum_match(bill, query, "color")
        string_match follows the same pattern.

        Returns None when the attribute matches or is unconstrained.
        Raises SearchNotSatisified when it does not match.
        """
        wanted = getattr(query, attr)
        if wanted is QueryField.Any:
            return
        actual = getattr(bill.code, attr)
        # The enum's own Any member (e.g. CCEColors.Any) is a wildcard too.
        # A value whose type has no Any member simply gets compared.
        if wanted == getattr(type(actual), "Any", None):
            return
        if actual != wanted:
            raise SearchNotSatisified()

    @staticmethod
    def string_match(bill: Bill, query: BillQuery, attr: str) -> None:
        """
        Case-insensitive substring check of query.<attr> inside bill.<attr>.

        Returns None when the attribute matches or is unconstrained.
        Raises SearchNotSatisified when the query text is not contained.
        """
        needle = getattr(query, attr)
        if needle is QueryField.Any:
            return
        if needle.lower() not in getattr(bill, attr).lower():
            raise SearchNotSatisified()

    def add_conference(self: Self, parser: BookParser) -> None:
        """
        Append every bill of an already-parsed BookParser (any subclass) to the database.
        """
        # this works because each BookParser must insert its self.confname into its self.bills[i].code.conference field.
        self.bills += parser.bills

    def _check(self: Self, bill: Bill, query: BillQuery) -> None:
        """Raise SearchNotSatisified unless bill satisfies every field of query."""
        self.code_enum_match(bill, query, "color")
        self.code_enum_match(bill, query, "assembly")
        if query.committee is not QueryField.Any:
            if query.committee != bill.code.committee:
                raise SearchNotSatisified()
        if query.order is not QueryField.Any:
            if query.order != bill.code.docketplacement:
                raise SearchNotSatisified()
        # This guard used to test query.committee (copy-paste slip). As a
        # result a year-only query ignored the year, and a committee query
        # without a year never matched anything.
        if query.year is not QueryField.Any:
            if query.year != bill.code.year:
                raise SearchNotSatisified()
        self.string_match(bill, query, "subcommittee")
        self.string_match(bill, query, "sponsors")
        self.string_match(bill, query, "school")
        self.string_match(bill, query, "bill_text_concat")
        self.string_match(bill, query, "title")

    def search(self: Self, query: BillQuery | QueryAll | type[QueryAll]) -> list[Bill]:
        """
        Return the bills that satisfy query, in insertion order.

        Passing QueryAll (the class, or an instance of it) returns every bill.
        The returned list is always a new list, so callers cannot mutate the
        database by accident.
        """
        if query is QueryAll or isinstance(query, QueryAll):
            return list(self.bills)
        results = []
        for bill in self.bills:
            try:
                self._check(bill, query)
            except SearchNotSatisified:
                continue
            results.append(bill)
        return results

View File

@ -0,0 +1,161 @@
from enum import StrEnum, auto
class CCEColors(StrEnum):
Red = "Red"
White = "White",
Blue = "Blue",
Undefined = "Undefined", # some conferences don't have assemblies
Any = "Any" # for searching purposes
class CCEAssemblies(StrEnum):
Senate = "Senate",
House = "House",
GeneralAssembly = "GeneralAssembly",
Any = "Any" # for searching purposes
class BillCode:
    """
    A parsed bill code.

    Codes are in this rough format: "RSB/yy-c(c)-n(n)" -- a colour letter, an
    assembly letter, then after the slash the two-digit year, the committee
    number and the docket placement.

    Attributes: color, assembly, year (four digits), committee,
    docketplacement, stringrep (the code rebuilt in canonical form) and
    conference (None until a parser fills it in).
    """

    # first letter of the code -> colour
    _COLORS = {
        "R": CCEColors.Red,
        "W": CCEColors.White,
        "B": CCEColors.Blue,
    }
    # second letter of the code -> assembly
    _ASSEMBLIES = {
        "S": CCEAssemblies.Senate,
        "H": CCEAssemblies.House,
        "G": CCEAssemblies.GeneralAssembly,
    }

    def __init__(self, text: str):
        """
        Parse text into its components.

        Raises ValueError when the code is malformed or names an unknown
        assembly. Previously these surfaced as IndexError or, later, as
        AttributeError on the missing `assembly` attribute.
        """
        text = text.strip()
        try:
            assemblycode, numbers = text.split('/')[:2]
            year, committee, docketplacement = (
                int(part) for part in numbers.split('-')[:3]
            )
            color_letter = assemblycode[0].upper()
            assembly_letter = assemblycode[1].upper()
        except (ValueError, IndexError) as err:
            raise ValueError("malformed bill code: {!r}".format(text)) from err

        # An unrecognised colour letter maps to Undefined rather than staying a
        # bare character, so the value always has the enum's Any member that
        # the search code looks up.
        self.color = self._COLORS.get(color_letter, CCEColors.Undefined)
        try:
            self.assembly = self._ASSEMBLIES[assembly_letter]
        except KeyError:
            raise ValueError(
                "unknown assembly {!r} in bill code {!r}".format(assembly_letter, text)
            ) from None

        # reverse y2k problem; but conference years are stored in YY, not YYYY form
        self.year = year + 2000
        self.committee = committee
        self.docketplacement = docketplacement

        # built from the original letters so it round-trips even for an Undefined colour
        self.stringrep = "{}{}B/{}-{}-{}".format(
            color_letter,
            assembly_letter,
            self.year - 2000,
            self.committee,
            self.docketplacement,
        )
        self.conference: None | str = None  # to be filled in with BookParser and friends

    def __str__(self):
        return "{} {} - {}-{}-{}".format(
            self.color,
            self.assembly,
            self.year,
            self.committee,
            self.docketplacement,
        )
class Bill:
    """
    One piece of legislation: its code plus the metadata and text around it.

    The *_url properties build site paths of the form
    /legislation/<conference>/..., using "defaultconf" when the code has no
    conference set.
    """

    def __init__(self,
        code: "str | BillCode",
        sponsors: str,
        subcommittee: str,
        school: str,
        bill_text: list[str],
        title: str
    ):
        """
        code may be a raw code string (parsed with BillCode) or an
        already-built code object, which is stored as given. Trailing
        whitespace is stripped from sponsors, subcommittee and school.
        """
        if isinstance(code, str):
            self.code = BillCode(code)
        else:
            self.code = code
        self.sponsors = sponsors.rstrip()
        self.subcommittee = subcommittee.rstrip()
        self.school = school.rstrip()
        self.bill_text = bill_text
        self.title = title

    def _legislation_url(self, *segments: str) -> str:
        """Join segments under /legislation/<conference>/ (shared by all *_url properties)."""
        conference = self.code.conference or "defaultconf"
        return "/legislation/" + '/'.join([conference, *segments])

    @property
    def bill_text_concat(self):
        """The bill text lines joined together without a separator."""
        return ''.join(self.bill_text)

    @property
    def url(self):
        """Path of this bill's own page."""
        return self._legislation_url(
            self.code.color,
            self.code.assembly,
            str(self.code.year),
            str(self.code.committee),
            str(self.code.docketplacement),
        )

    @property
    def committee_url(self):
        """Path of the page for this bill's committee."""
        return self._legislation_url(
            self.code.color,
            self.code.assembly,
            str(self.code.year),
            str(self.code.committee),
        )

    @property
    def assembly_url(self):
        """Path of the page for this bill's assembly."""
        return self._legislation_url(
            self.code.color,
            self.code.assembly,
            str(self.code.year),
        )

    @property
    def color_url(self):
        """Path of the page for this bill's colour grouping."""
        return self._legislation_url(
            self.code.color,
            str(self.code.year),
        )

20
cceexplorer/leglib/lib.py Normal file
View File

@ -0,0 +1,20 @@
class FitzBlockWrapper:
    """
    Attribute-style view of a 7-item text block tuple:
    (x0, y0, x1, y1, text, block_number, block_type).

    Coordinates and the two block fields are converted with int(); text is
    kept as given.
    """

    def __init__(self, block):
        x0, y0, x1, y1, self.text, block_number, block_type = block
        self.x0 = int(x0)
        self.y0 = int(y0)
        self.x1 = int(x1)
        self.y1 = int(y1)
        self.block_number = int(block_number)
        self.block_type = int(block_type)

    def __str__(self):
        return str((
            self.x0, self.y0, self.x1, self.y1, self.text
        ))

    def __repr__(self):
        # Was spelled __repl__, so repr() and list printing fell back to the
        # default <... object at 0x...> form.
        return self.__str__()

    # kept so any code calling the old misspelled name keeps working
    __repl__ = __repr__

View File

@ -0,0 +1,202 @@
import fitz
from typing import Any, Self, ClassVar
from itertools import groupby
from dataclasses import dataclass
from .lib import FitzBlockWrapper
from .common import Bill
@dataclass
class BookParser:
    """
    Base class for book parsers: holds the opened document and the name of
    the conference it belongs to.
    """
    # class variables
    humanname: ClassVar[str] = "Generic BookParser parent class."
    description: ClassVar[str] = """
A generic description of the abilities of this BookParser.
"""

    # everything else
    document: fitz.Document
    confname: str

    @classmethod
    def from_filename(cls, filename: str, confname: str):
        """Alternate constructor: open filename with fitz and wrap the result."""
        opened = fitz.open(filename)
        return cls(document=opened, confname=confname)
class HSYIGPdfParser(BookParser):
    """
    BookParser that extracts Bill objects from a PDF book.

    parse() locates the section divider pages, gathers the text blocks of the
    pages between them, splits that stream into bills and stores the result
    in self.bills.
    """

    @staticmethod
    def _words_in_superstring(words: list[str], superstring: str) -> bool:
        """Return True when every word occurs, case-insensitively, in superstring."""
        haystack = str(superstring).lower()
        return all(str(word).lower() in haystack for word in words)

    def _generate_legislative_pages_list(self, sections: list[int]) -> list[int]:
        """
        sections is an array of section pages plus the last page.

        Returns every page number lying strictly between two consecutive
        entries of sections.
        """
        legislative_pages: list[int] = []
        for start, end in zip(sections, sections[1:]):
            legislative_pages.extend(range(start + 1, end))
        return legislative_pages

    def _generate_section_markers(self, document: fitz.Document) -> list[int]:
        """
        Return the page numbers that act as section boundaries.

        A page counts as a section page when it contains all the marker words
        and exactly three images. A page containing "ABCs" is appended as an
        end marker once more than two section pages have been seen.
        """
        section_pages = []
        for page in document:
            # Search the text itself. The previous code searched the repr of
            # the UTF-8 bytes, where escape sequences such as "\xab" followed
            # by "cs" could produce false matches.
            text = page.get_text()
            is_section_page = self._words_in_superstring(
                words=["Committee", "YMCA", "Tennessee", "Youth", "in"],
                superstring=text
            )
            is_last_page = self._words_in_superstring(
                words=["ABCs"],
                superstring=text
            )
            if is_section_page and len(page.get_images()) == 3:
                section_pages.append(page.number)
            if is_last_page and len(section_pages) > 2:
                section_pages.append(page.number)
        return section_pages

    def _get_block_info_from_page(self, page: fitz.Page):
        """Wrap every entry of page.get_text("blocks") in a FitzBlockWrapper."""
        return [FitzBlockWrapper(i) for i in page.get_text("blocks")]

    @staticmethod
    def _remove_image_blocks(blocks: list[FitzBlockWrapper]) -> list[FitzBlockWrapper]:
        """Keep only the blocks whose block_type is 0."""
        return [block for block in blocks if block.block_type == 0]

    @staticmethod
    def _remove_coordinate_information(blocks: list[FitzBlockWrapper]) -> list[str]:
        """Reduce each block to its text."""
        return [block.text for block in blocks]

    @staticmethod
    def _get_info_from_block(block, lat: int):
        """Return the items of block whose first element, floored, equals lat."""
        # math was never imported at file level, so this method used to raise NameError
        import math
        return [i for i in block if math.floor(i[0]) == lat]

    @staticmethod
    def _split_list_by_element(arr: list[Any], pivot: Any):
        """
        Split arr into sub-lists at every element equal to pivot.

        The pivot elements themselves are dropped; the result always contains
        at least one (possibly empty) list.
        """
        output = []
        current = []
        for i in arr:
            if i == pivot:
                output.append(current)
                current = []
            else:
                current.append(i)
        output.append(current)
        return output

    def parse(self):
        """
        Parse self.document and store the resulting list of Bill in self.bills.

        Every bill's code.conference is set to self.confname. A document
        without any legislative text yields an empty list.
        """
        section_pages = self._generate_section_markers(self.document)
        legislative_pages = self._generate_legislative_pages_list(section_pages)

        joined_blocks: list[FitzBlockWrapper] = []
        for page_number in legislative_pages:
            page = self.document.load_page(page_number)
            block_info = self._get_block_info_from_page(page)
            joined_blocks += block_info[:-1]  # remove the page number at the end of every page

        texts = self._remove_coordinate_information(
            self._remove_image_blocks(joined_blocks)
        )
        if not texts:
            # nothing to split; indexing texts[0] below used to raise IndexError
            self.bills = []
            return

        # the first text block is used as the separator between bills
        bill_header = texts[0]

        bills: list[Bill] = []
        for chunk in self._split_list_by_element(texts, bill_header):
            try:
                bill_code, _, _, subcommittee, sponsors, school, *bill_text = chunk
            except ValueError:
                # too few blocks to be a bill
                continue
            pretty_printed = self._pretty_print_bill_text(' '.join(bill_text))
            bill = Bill(
                code=bill_code,
                subcommittee=subcommittee,
                sponsors=sponsors,
                school=school,
                bill_text=pretty_printed["bill_array"],
                title=pretty_printed["title"]
            )
            bill.code.conference = self.confname  # add the conference name to each
            bills.append(bill)
        self.bills = bills

    @staticmethod
    def _find_first_line_number(bill_arrays):
        """
        Return the index of the first entry that is a canonical integer
        string (str(int(x)) == x), or None when there is none.
        """
        for index, line in enumerate(bill_arrays):
            try:
                if str(int(line)) == line:
                    return index
            except ValueError:
                pass
        return None

    def _pretty_print_bill_text(self, bill_text: str):
        """
        Split raw bill text into a title and the list of bill lines.

        Returns {"title": str, "bill_array": list[str]}. When no line number
        is found, the whole text becomes the title and bill_array is empty.
        """
        # NOTE(review): '<EFBFBD>' looks like a rendering of the UTF-8 bytes of
        # U+FFFD (the replacement character); both spellings are blanked out
        # here -- confirm which one the extracted text really contains.
        # The earlier replace-with-newline was dead code: its result was
        # overwritten on the next line.
        lines = [
            line.replace('<EFBFBD>', ' ').replace('\ufffd', ' ').strip()
            for line in bill_text.split('\n')
        ]

        first_line_number = self._find_first_line_number(lines)
        if first_line_number is None:
            # used to crash with TypeError on `None - 1`
            return {
                "title": ' '.join(' '.join(lines).split()),
                "bill_array": []
            }

        # Everything before the line preceding the first line number is the
        # title. max() keeps a leading line number from turning the slice
        # into [:-1].
        title = ' '.join(lines[:max(first_line_number - 1, 0)])
        title = ' '.join(title.split())  # remove double spaces

        # line numbers and text alternate; keep the text entries
        rebuilt = lines[first_line_number:][1::2]
        # remove the last line number, it doesn't have a corresponding space at the end
        rebuilt = rebuilt[:-1]
        # remove the first line, as it's the whitespace between the title and the bill text
        rebuilt = rebuilt[1:]
        return {
            "title": title.lstrip(),
            "bill_array": rebuilt
        }

View File

@ -0,0 +1,14 @@
{# Lists bills under a "<color> <assembly>" heading. Expects `bills`: a list of Bill objects. #}
{% extends "base.html" %}
{% block title %} testing title {% endblock %}
{% block defcontent %}
{# bills[0] on an empty list raises UndefinedError, so guard the heading #}
{% if bills %}
<h1>{{ bills[0].code.color }} {{ bills[0].code.assembly }}</h1>
{% for bill in bills %}
<div class="container border-black">
<a href="{{ bill.url }}">({{bill.code.assembly[0]}}{{bill.code.committee}}/{{bill.code.docketplacement}}) {{ bill.title }}</a>
<p>Sponsors: {{ bill.sponsors }}</p>
<p>School: {{ bill.school }}</p>
</div>
{% endfor %}
{% else %}
<p>No legislation found.</p>
{% endif %}
{% endblock %}

View File

@ -0,0 +1,32 @@
{# Site-wide layout: a navbar plus a `defcontent` block that child templates fill in. #}
{# NOTE(review): the /search, /statistics, /conferences and /scores links have no matching route in the create_app() shown alongside this template -- confirm they are registered elsewhere. #}
{% extends "bootstrap/base.html" %}
{% block navbar %}
<nav class="navbar navbar-default">
<a class="navbar-brand" href="/">cceexplorer</a>
<div class="navbar-nav" id="navbarNav">
<ul class="nav navbar-nav">
<li class="nav-item">
<a class="nav-link" href="/">Home</a>
</li>
<li class="nav-item">
<a class="nav-link" href="/search">Search</a>
</li>
<li class="nav-item">
<a class="nav-link" href="/statistics">Statistics</a>
</li>
<li class="nav-item">
<a class="nav-link" href="/conferences">Conferences</a>
</li>
<li class="nav-item">
<a class="nav-link" href="/scores">Scores</a>
</li>
</ul>
</div>
</nav>
{% endblock %}
{% block content %}
<div class="container-fluid">
{# child templates put their page body here #}
{% block defcontent %}{% endblock %}
</div>
{% endblock %}

View File

@ -0,0 +1,28 @@
{# Detail page for one bill. Expects `bill`: a Bill object (code, title, sponsors, school, subcommittee, bill_text and the *_url properties are read). #}
{% extends "base.html" %}
{% block title %} cceexplorer - {{ bill.title }} {% endblock %}
{% block defcontent %}
<div class="container">
<div class="row">
{# left column: summary and navigation links #}
<div class="col-xs-3 border border-dark rounded">
<h1>{{ bill.code.stringrep }}</h1>
<p><i>{{ bill.title }}</i></p>
<p>Introduced by {{ bill.sponsors }} (of {{ bill.school }}) within the {{ bill.subcommittee }} subcommittee</p>
<hr>
<ul>
<li><a href="{{ bill.committee_url }}">Go to this bill's committee</a></li>
<li><a href="{{ bill.assembly_url }}">Go to this bill's assembly</a></li>
<li><a href="{{ bill.color_url }}">Go to this bill's color grouping</a></li>
</ul>
</div>
{# right column: the bill text, one paragraph per line #}
<div class="col-xs-7 border border-dark rounded">
<br>
{% for line in bill.bill_text %}
{# NOTE(review): this empty-line check has no body, so it currently renders nothing -- confirm what was intended for blank lines #}
{% if line == "" %}
{% endif %}
<p>{{ line }}</p>
{% endfor %}
</div>
</div>
</div>
{% endblock %}

View File

@ -0,0 +1,14 @@
{# Lists all bills of one colour. Expects `bills`: a list of Bill objects. #}
{% extends "base.html" %}
{% block title %} testing title {% endblock %}
{% block defcontent %}
{# bills[0] on an empty list raises UndefinedError, so guard the heading #}
{% if bills %}
<h1>All {{bills[0].code.color}} Legislation</h1>
{% for bill in bills %}
<div class="container border-black">
<a href="{{ bill.url }}">({{bill.code.committee}}/{{bill.code.docketplacement}}) {{ bill.title }}</a>
<p>Sponsors: {{ bill.sponsors }}</p>
<p>School: {{ bill.school }}</p>
</div>
{% endfor %}
{% else %}
<p>No legislation found.</p>
{% endif %}
{% endblock %}

View File

@ -0,0 +1,14 @@
{# Lists the bills of one committee, each labelled with its colour. Expects `bills`: a list of Bill objects. #}
{% extends "base.html" %}
{% block title %} testing title {% endblock %}
{% block defcontent %}
{# bills[0] on an empty list raises UndefinedError, so guard the heading #}
{% if bills %}
<h1>{{ bills[0].code.assembly }} Committee {{ bills[0].code.committee }}</h1>
{% for bill in bills %}
<div class="container border-black">
<a href="{{ bill.url }}">({{bill.code.color}}) {{ bill.title }}</a>
<p>Sponsors: {{ bill.sponsors }}</p>
<p>School: {{ bill.school }}</p>
</div>
{% endfor %}
{% else %}
<p>No legislation found.</p>
{% endif %}
{% endblock %}

View File

@ -0,0 +1,15 @@
{# Lists bills under a "<color> <assembly>" heading. Expects `bills`: a list of Bill objects. #}
{% extends "base.html" %}
{% block title %} testing title {% endblock %}
{% block defcontent %}
{# bills[0] on an empty list raises UndefinedError, so guard the heading #}
{% if bills %}
<h1>{{ bills[0].code.color }} {{ bills[0].code.assembly }}</h1>
{% for bill in bills %}
<div class="container border-black">
<a href="{{ bill.url }}">({{bill.code.assembly[0]}}{{bill.code.committee}}/{{bill.code.docketplacement}}) {{ bill.title }}</a>
<p>Sponsors: {{ bill.sponsors }}</p>
<p>School: {{ bill.school }}</p>
</div>
{% endfor %}
{% else %}
<p>No legislation found.</p>
{% endif %}
{% endblock %}

View File

@ -0,0 +1,15 @@
{# Landing page. Expects `number_bills`, `number_conferences` and `bills` (a list of Bill objects; `url` and `title` are read). #}
{% extends "base.html" %}
{% block title %} testing title {% endblock %}
{% block defcontent %}
<h1>Welcome to cceexplorer</h1>
<p><i>an interactive database with {{ number_bills }} bills and {{ number_conferences }} conferences</i></p>
<p>here's all of them, down here!</p>
<ul>
{# one link per bill, pointing at the bill's own page #}
{% for bill in bills %}
<li>
<a href="{{ bill.url }}">{{ bill.title }}</a>
</li>
{% endfor %}
</ul>
{% endblock %}

View File

@ -0,0 +1,6 @@
{# Placeholder page: renders a fixed heading inside the shared layout and reads no context variables. #}
{% extends "base.html" %}
{% block title %} testing title {% endblock %}
{% block defcontent %}
<h1>testing</h1>
{% endblock %}

9
shell.nix Normal file
View File

@ -0,0 +1,9 @@
# Development shell: `nix-shell` makes the Python 3.11 packages listed below available.
{ pkgs ? import <nixpkgs> {} }:
pkgs.mkShell {
  # nativeBuildInputs is usually what you want -- tools you need to run
  nativeBuildInputs = with pkgs; [
    # PyMuPDF; the parser code imports it as `fitz`
    buildPackages.python311Packages.pymupdf
    # web framework and its Bootstrap integration used by the cceexplorer package
    buildPackages.python311Packages.flask
    buildPackages.python311Packages.flask-bootstrap
  ];
}