Compare commits

...

4 Commits

Author SHA1 Message Date
stupidcomputer 9680a416da move leglib into a python package 2024-05-19 17:56:26 -05:00
stupidcomputer 9ba154f654 add search to the setup 2024-05-19 17:51:51 -05:00
stupidcomputer 5c11ff4371 parser object overhauls
make conference names part of the parser object in preparation for the
implementation of billdb; refactor class structure for parsers
2024-05-19 16:02:33 -05:00
stupidcomputer dbd9632e16 fix bill parsing in the main parser 2024-05-19 16:02:03 -05:00
8 changed files with 207 additions and 26 deletions

View File

@ -1,5 +1,30 @@
import leglib import leglib #billdb import BillDB, BillQuery, QueryField, QueryAll
from leglib.billdb import BillDB, BillQuery, QueryField, QueryAll
from leglib.parsers import HSYIGPdfParser
parser = leglib.parsers.HSYIGPdfParser.from_filename("YIGVolunteerBook2024.pdf") parser = HSYIGPdfParser.from_filename(
filename="YIGVolunteerBook2024.pdf",
confname="HSVolunteer"
)
parser.parse() parser.parse()
print([i.bill_text for i in parser.bills])
print(len(parser.bills))
db = BillDB()
db.add_conference(parser=parser)
allbills = len(db.search(query=QueryAll))
bluelen = len(db.search(query=BillQuery(color=QueryField.Colors.Blue)))
whitelen = len(db.search(query=BillQuery(color=QueryField.Colors.White)))
redlen = len(db.search(query=BillQuery(color=QueryField.Colors.Red)))
senatelen = len(db.search(query=BillQuery(assembly=QueryField.Assemblies.Senate)))
houselen = len(db.search(query=BillQuery(assembly=QueryField.Assemblies.House)))
franklincount = len(db.search(query=BillQuery(school="Franklin")))
print(allbills)
print(redlen, whitelen, bluelen, redlen + whitelen + bluelen)
print(senatelen, houselen, senatelen + houselen)
print(franklincount)

View File

@ -1,5 +0,0 @@
import fitz
import math
from typing import Any
import parsers

0
leglib/__main__.py Normal file
View File

122
leglib/billdb.py Normal file
View File

@ -0,0 +1,122 @@
from .common import Bill, CCEColors, CCEAssemblies
from .parsers import BookParser
from typing import Type, Self
from dataclasses import dataclass
class QueryAny:
    """
    Marker type standing for an "Any" match.

    Intended for attributes whose own type does not provide an Any member.
    """
class SearchNotSatisified(Exception):
    """
    Raised by the matching helpers when a bill fails one criterion of a query.

    Derives from Exception rather than BaseException: BaseException is meant
    for interpreter-level exits such as KeyboardInterrupt and SystemExit, while
    this is an ordinary, recoverable signal. Handlers written as
    `except SearchNotSatisified` or `except BaseException` keep working.
    """
class QueryAll:
    """
    Marker type: passing this class as a query asks for every stored bill.
    """
class QueryField:
    """
    Namespace of the values a query field can be set to.
    """
    # Unique sentinel: a query field left at this value is not filtered on.
    Any = object()

    # Aliases so callers can write QueryField.Colors.Red or
    # QueryField.Assemblies.Senate without importing the enums themselves.
    Colors = CCEColors
    Assemblies = CCEAssemblies
@dataclass
class BillQuery:
    """
    Holds a query for the BillDB.

    Every field defaults to the QueryField.Any sentinel, meaning "do not
    filter on this field". Set only the fields that should restrict the
    search, e.g. BillQuery(color=QueryField.Colors.Blue).
    """
    # NOTE: the default of every field is the QueryField.Any sentinel object,
    # not an instance of QueryField.
    color: CCEColors | QueryField = QueryField.Any
    assembly: CCEAssemblies | QueryField = QueryField.Any
    committee: int | QueryField = QueryField.Any
    year: int | QueryField = QueryField.Any
    subcommittee: str | QueryField = QueryField.Any
    sponsors: str | QueryField = QueryField.Any
    school: str | QueryField = QueryField.Any
    bill_text: str | QueryField = QueryField.Any
    title: str | QueryField = QueryField.Any

    @property
    def bill_text_concat(self):
        """
        Alias of bill_text, kept for search compatibility.

        The string matcher reads the same attribute name from the bill and
        from the query, and the searchable text of a bill is exposed under
        this name. Computing the alias on access (instead of copying it once
        at construction) keeps it in sync if bill_text is reassigned later.
        """
        return self.bill_text
class BillDB:
    """
    In-memory collection of bills that can be filtered with a BillQuery.

    Bills are appended with add_conference() and retrieved with search().
    """

    def __init__(self):
        self.bills: list[Bill] = []
        # Not used by any method of this class yet; initialised so that the
        # attribute actually exists on the instance.
        self.cache: dict = {}

    @staticmethod
    def code_enum_match(bill: Bill, query: BillQuery, attr: str) -> None:
        """
        Check one enum-valued attribute of bill.code against the query.

        This is probably very slow. Maybe replace this with a better solution?

        It replaces repetitive per-attribute code of the form

            if query.color != QueryField.Any and query.color != CCEColors.Any:
                if bill.code.color != query.color:
                    raise SearchNotSatisified()

        with

            self.code_enum_match(bill, query, "color")

        The check passes when the query value is QueryField.Any, when it is
        the `Any` member of the enum class of the bill's value, or when it
        equals the bill's value.

        Raises:
            SearchNotSatisified: the query value differs from bill.code.<attr>.
        """
        wanted = getattr(query, attr)
        if wanted is QueryField.Any:
            return

        actual = getattr(bill.code, attr)
        # The enum's own Any member is a wildcard as well.
        if wanted == type(actual).Any:
            return

        if actual != wanted:
            raise SearchNotSatisified()

    @staticmethod
    def string_match(bill: Bill, query: BillQuery, attr: str) -> None:
        """
        Check one string attribute of the bill against the query.

        The check passes when the query value is QueryField.Any or when it
        occurs, case-insensitively, as a substring of bill.<attr>.

        Raises:
            SearchNotSatisified: the query text is not contained in bill.<attr>.
        """
        wanted = getattr(query, attr)
        if wanted is QueryField.Any:
            return

        if wanted.lower() not in getattr(bill, attr).lower():
            raise SearchNotSatisified()

    def add_conference(self, parser: BookParser) -> None:
        """
        Append every bill held by a parser to this database.

        `parser` is an instance of any BookParser subclass. Its `bills`
        attribute must already be populated (presumably by calling
        parser.parse() first -- verify against the parser in use).
        """
        # this works because each BookParser must insert its self.confname into its self.bills[i].code.conference field.
        self.bills.extend(parser.bills)

    def _matches(self, bill: Bill, query: BillQuery) -> bool:
        """Return True when the bill satisfies every criterion of the query."""
        try:
            self.code_enum_match(bill, query, "color")
            self.code_enum_match(bill, query, "assembly")

            # Exact-match fields on the bill code. Each field is gated on its
            # OWN query value, so filtering on year works without a committee
            # filter and a committee filter does not imply a year filter.
            for attr in ("committee", "year"):
                wanted = getattr(query, attr)
                if wanted is QueryField.Any:
                    continue
                if wanted != getattr(bill.code, attr):
                    return False

            self.string_match(bill, query, "subcommittee")
            self.string_match(bill, query, "sponsors")
            self.string_match(bill, query, "school")
            self.string_match(bill, query, "bill_text_concat")
            self.string_match(bill, query, "title")
        except SearchNotSatisified:
            return False
        return True

    def search(self, query: BillQuery | QueryAll | type[QueryAll]) -> list[Bill]:
        """
        Return the bills that satisfy the query, in insertion order.

        Passing QueryAll (the class itself, or an instance of it) returns
        every stored bill. The returned list is always a new list, so callers
        may sort or modify it without affecting the database.
        """
        if query is QueryAll or isinstance(query, QueryAll):
            return list(self.bills)

        return [bill for bill in self.bills if self._matches(bill, query)]

View File

@ -3,12 +3,15 @@ from enum import StrEnum, auto
class CCEColors(StrEnum): class CCEColors(StrEnum):
Red = "Red" Red = "Red"
White = "White", White = "White",
Blue = "Blue" Blue = "Blue",
Undefined = "Undefined", # some conferences don't have assemblies
Any = "Any" # for searching purposes
class CCEAssemblies(StrEnum): class CCEAssemblies(StrEnum):
Senate = "Senate", Senate = "Senate",
House = "House", House = "House",
GeneralAssembly = "GeneralAssembly" GeneralAssembly = "GeneralAssembly",
Any = "Any" # for searching purposes
class BillCode: class BillCode:
def __init__(self, text: str): def __init__(self, text: str):
@ -37,18 +40,21 @@ class BillCode:
elif assemblydivision == "G": elif assemblydivision == "G":
self.assembly = CCEAssemblies.GeneralAssembly self.assembly = CCEAssemblies.GeneralAssembly
self.year = int(dashsplit[0]) # reverse y2k problem; but conference years are stored in YY, not YYYY form
self.year = int(dashsplit[0]) + 2000
self.committee = int(dashsplit[1]) self.committee = int(dashsplit[1])
self.docketplacement = int(dashsplit[2]) self.docketplacement = int(dashsplit[2])
self.stringrep = self.color[0].upper() + \ self.stringrep = self.color[0].upper() + \
self.assembly[0].upper() + \ self.assembly[0].upper() + \
"B/{}-{}-{}".format( "B/{}-{}-{}".format(
str(self.year), str(self.year - 2000),
str(self.committee), str(self.committee),
str(self.docketplacement) str(self.docketplacement)
) )
self.conference: None | str = None # to be filled in with BookParser and friends
def __str__(self): def __str__(self):
return "{} {} - {}-{}-{}".format( return "{} {} - {}-{}-{}".format(
self.color, self.color,
@ -77,3 +83,7 @@ class Bill:
self.school = school.rstrip() self.school = school.rstrip()
self.bill_text = bill_text self.bill_text = bill_text
self.title = title self.title = title
@property
def bill_text_concat(self):
return ''.join(self.bill_text)

View File

@ -1,13 +1,31 @@
import fitz import fitz
from typing import Any from typing import Any, Self, ClassVar
from itertools import groupby
from dataclasses import dataclass
from lib import FitzBlockWrapper from .lib import FitzBlockWrapper
from common import Bill from .common import Bill
class HSYIGPdfParser: @dataclass
def __init__(self, document: fitz.Document): class BookParser:
self.document = document # class variables
humanname: ClassVar[str] = "Generic BookParser parent class."
description: ClassVar[str] = """
A generic description of the abilities of this BookParser.
"""
# everything else
document: fitz.Document
confname: str
@classmethod
def from_filename(cls, filename: str, confname: str):
return cls(
document=fitz.open(filename),
confname=confname
)
class HSYIGPdfParser(BookParser):
@staticmethod @staticmethod
def _words_in_superstring(words: list[str], superstring: str) -> bool: def _words_in_superstring(words: list[str], superstring: str) -> bool:
for word in words: for word in words:
@ -142,6 +160,9 @@ class HSYIGPdfParser:
title=pretty_printed["title"] title=pretty_printed["title"]
)) ))
for bill in bills: # add the conference name to each
bill.code.conference = self.confname
self.bills = bills self.bills = bills
@staticmethod @staticmethod
@ -155,20 +176,27 @@ class HSYIGPdfParser:
def _pretty_print_bill_text(self, bill_text: str): def _pretty_print_bill_text(self, bill_text: str):
replaced = bill_text.replace("<EFBFBD>", "\n") replaced = bill_text.replace("<EFBFBD>", "\n")
replaced = bill_text
replaced = replaced.split('\n') replaced = replaced.split('\n')
replaced = [
replaced = [i.rstrip().lstrip() for i in replaced] i \
.replace('<EFBFBD>', ' ') \
.rstrip() \
.lstrip() \
for i in replaced
]
first_line_number = self._find_first_line_number(replaced) first_line_number = self._find_first_line_number(replaced)
title = ' '.join(replaced[:(first_line_number - 1)])
title = ' '.join(replaced[:first_line_number]) title = ' '.join(title.split()) # remove double spaces
rebuilt = replaced[first_line_number:][1::2] rebuilt = replaced[first_line_number:][1::2]
# remove the last line number, it doesn't have a corresponding space at the end
rebuilt = rebuilt[:-1]
# remove the first line, as it's the whitespace between the title and the bill text
rebuilt = rebuilt[1:]
return { return {
"title": title.lstrip(), "title": title.lstrip(),
"bill_array": rebuilt "bill_array": rebuilt
} }
@classmethod
def from_filename(cls, filename: str) -> Any: # TODO: fix this so it shows PdfParser
return cls(fitz.open(filename))

View File

@ -3,5 +3,6 @@
# nativeBuildInputs is usually what you want -- tools you need to run # nativeBuildInputs is usually what you want -- tools you need to run
nativeBuildInputs = with pkgs; [ nativeBuildInputs = with pkgs; [
buildPackages.python311Packages.pymupdf buildPackages.python311Packages.pymupdf
buildPackages.python311Packages.flask
]; ];
} }