add search to the setup

This commit is contained in:
stupidcomputer 2024-05-19 17:51:51 -05:00
parent 5c11ff4371
commit 9ba154f654
5 changed files with 160 additions and 12 deletions

View File

@ -1,8 +1,28 @@
import leglib import leglib #billdb import BillDB, BillQuery, QueryField, QueryAll
parser = leglib.parsers.HSYIGPdfParser.from_filename( parser = leglib.parsers.HSYIGPdfParser.from_filename(
filename="YIGVolunteerBook2024.pdf", filename="YIGVolunteerBook2024.pdf",
confname="YIGVolunteer" confname="HSVolunteer"
) )
parser.parse() parser.parse()
# Dump the text of every parsed bill, then how many bills were parsed.
print([parsed.bill_text for parsed in parser.bills])
print(len(parser.bills))

# Load the parsed conference into a fresh database.
db = leglib.billdb.BillDB()
db.add_conference(parser=parser)

_billdb = leglib.billdb
_fields = _billdb.QueryField


def _count(query):
    """Return how many bills in ``db`` satisfy ``query``."""
    return len(db.search(query=query))


# Counts are taken in the same order as before: everything, then by color,
# then by assembly, then by school substring.
allbills = _count(_billdb.QueryAll)
bluelen = _count(_billdb.BillQuery(color=_fields.Colors.Blue))
whitelen = _count(_billdb.BillQuery(color=_fields.Colors.White))
redlen = _count(_billdb.BillQuery(color=_fields.Colors.Red))
senatelen = _count(_billdb.BillQuery(assembly=_fields.Assemblies.Senate))
houselen = _count(_billdb.BillQuery(assembly=_fields.Assemblies.House))
franklincount = _count(_billdb.BillQuery(school="Franklin"))

# The per-color and per-assembly sums are printed next to the parts so they
# can be compared by eye against the overall total.
print(allbills)
print(redlen, whitelen, bluelen, redlen + whitelen + bluelen)
print(senatelen, houselen, senatelen + houselen)
print(franklincount)

122
billdb.py Normal file
View File

@ -0,0 +1,122 @@
from common import Bill, CCEColors, CCEAssemblies
from parsers import BookParser
from typing import Type, Self
from dataclasses import dataclass
class QueryAny:
    """
    Marker class standing for an "Any" (wildcard) match.

    Meant for attributes whose own type does not provide an ``Any`` member.
    """
class SearchNotSatisified(Exception):
    """
    Raised by the matching helpers when a bill fails one criterion of a query.

    Derives from ``Exception`` rather than ``BaseException``: the latter is
    meant for interpreter-level exits such as ``KeyboardInterrupt`` and
    ``SystemExit``, not for application-level signalling.
    """
class QueryAll:
    """
    Sentinel query meaning "return every bill".

    BillDB.search compares its ``query`` argument against this class itself.
    """
class QueryField:
    """
    Namespace of values that a BillQuery field can be set to.
    """

    # Unique sentinel object; BillQuery uses it as the default for every field.
    Any = object()

    # Aliases for the enums from ``common``, reachable as
    # QueryField.Colors.<member> and QueryField.Assemblies.<member>.
    Colors = CCEColors
    Assemblies = CCEAssemblies
@dataclass
class BillQuery:
    """
    Holds a query for the BillDB.

    Every field defaults to ``QueryField.Any``. BillDB.search skips filtering
    on any field left at that value.
    """

    # NOTE(review): the wildcard default is the plain ``object()`` stored in
    # QueryField.Any, not a QueryField instance, so the ``| QueryField`` part
    # of these annotations is only an approximation.
    color: CCEColors | QueryField = QueryField.Any
    assembly: CCEAssemblies | QueryField = QueryField.Any
    committee: int | QueryField = QueryField.Any
    year: int | QueryField = QueryField.Any
    subcommittee: str | QueryField = QueryField.Any
    sponsors: str | QueryField = QueryField.Any
    school: str | QueryField = QueryField.Any
    bill_text: str | QueryField = QueryField.Any
    title: str | QueryField = QueryField.Any

    @property
    def bill_text_concat(self) -> str | QueryField:
        """
        Alias of ``bill_text``, for search compatibility.

        BillDB.search looks this attribute up by name on the query. It is a
        property rather than a copy taken at construction time, so that
        reassigning ``bill_text`` later cannot leave a stale value behind.
        """
        return self.bill_text
class BillDB:
    """
    In-memory list of bills that can be filtered with a BillQuery.
    """

    def __init__(self):
        # Every bill added through add_conference, in insertion order.
        self.bills: list[Bill] = []
        # Was only annotated before (never assigned), so reading it raised
        # AttributeError. No method in this class uses it yet.
        self.cache: dict = {}

    @staticmethod
    def code_enum_match(bill: Bill, query: BillQuery, attr: str) -> None:
        """
        Check one enum-valued attribute of ``bill.code`` against the query.

        This is probably very slow. Maybe replace this with a better solution?

        It replaces repetitive per-attribute code in search with one call:

            self.code_enum_match(bill, query, "color")

        The check passes (returns None) when the query's value is
        ``QueryField.Any``, when it is the ``Any`` member of the bill
        attribute's own enum class, or when it equals the bill's value.

        Raises:
            SearchNotSatisified: the query names a specific value and the
                bill's value differs.
        """
        wanted = getattr(query, attr)
        if wanted is QueryField.Any:
            return

        actual = getattr(bill.code, attr)
        # The enum's own Any member acts as a wildcard too.
        if wanted == type(actual).Any:
            return

        if actual != wanted:
            raise SearchNotSatisified()

    @staticmethod
    def string_match(bill: Bill, query: BillQuery, attr: str) -> None:
        """
        Check one string attribute of ``bill`` against the query.

        The check is a case-insensitive substring test: the query's value
        must occur somewhere inside the bill's value. A query value of
        ``QueryField.Any`` always passes.

        Raises:
            SearchNotSatisified: the query's text does not occur in the
                bill's attribute.
        """
        needle = getattr(query, attr)
        if needle is QueryField.Any:
            return

        if needle.lower() not in getattr(bill, attr).lower():
            raise SearchNotSatisified()

    @staticmethod
    def _code_exact_match(bill: Bill, query: BillQuery, attr: str) -> None:
        """
        Check one attribute of ``bill.code`` for equality with the query.

        Raises:
            SearchNotSatisified: the query's value is not ``QueryField.Any``
                and differs from the bill's value.
        """
        wanted = getattr(query, attr)
        if wanted is not QueryField.Any and wanted != getattr(bill.code, attr):
            raise SearchNotSatisified()

    def add_conference(self, parser: BookParser) -> None:
        """
        Append every bill held by ``parser`` to this database.

        ``parser`` is an instance of BookParser (or of any subclass); only
        its ``bills`` attribute is read.
        """
        # this works because each BookParser must insert its self.confname
        # into its self.bills[i].code.conference field.
        self.bills += parser.bills

    def _matches(self, bill: Bill, query: BillQuery) -> bool:
        """Return True when ``bill`` satisfies every criterion of ``query``."""
        try:
            self.code_enum_match(bill, query, "color")
            self.code_enum_match(bill, query, "assembly")
            self._code_exact_match(bill, query, "committee")
            # The year test used to be guarded by query.committee, so a
            # year-only query was ignored and a committee-only query
            # rejected every bill.
            self._code_exact_match(bill, query, "year")
            self.string_match(bill, query, "subcommittee")
            self.string_match(bill, query, "sponsors")
            self.string_match(bill, query, "school")
            self.string_match(bill, query, "bill_text_concat")
            self.string_match(bill, query, "title")
        except SearchNotSatisified:
            return False
        return True

    def search(self, query: BillQuery | QueryAll | type[QueryAll]) -> list[Bill]:
        """
        Return the bills that satisfy ``query``.

        Passing ``QueryAll`` (the class itself, or an instance of it) returns
        every bill. Otherwise a bill is included only if it passes every
        check made in _matches. The result is always a new list, so callers
        can modify it without touching the database's own storage.
        """
        if query is QueryAll or isinstance(query, QueryAll):
            return list(self.bills)

        return [bill for bill in self.bills if self._matches(bill, query)]

View File

@ -3,12 +3,15 @@ from enum import StrEnum, auto
class CCEColors(StrEnum): class CCEColors(StrEnum):
Red = "Red" Red = "Red"
White = "White", White = "White",
Blue = "Blue" Blue = "Blue",
Undefined = "Undefined", # some conferences don't have assemblies
Any = "Any" # for searching purposes
class CCEAssemblies(StrEnum): class CCEAssemblies(StrEnum):
Senate = "Senate", Senate = "Senate",
House = "House", House = "House",
GeneralAssembly = "GeneralAssembly" GeneralAssembly = "GeneralAssembly",
Any = "Any" # for searching purposes
class BillCode: class BillCode:
def __init__(self, text: str): def __init__(self, text: str):
@ -37,19 +40,20 @@ class BillCode:
elif assemblydivision == "G": elif assemblydivision == "G":
self.assembly = CCEAssemblies.GeneralAssembly self.assembly = CCEAssemblies.GeneralAssembly
self.year = int(dashsplit[0]) # reverse y2k problem; but conference years are stored in YY, not YYYY form
self.year = int(dashsplit[0]) + 2000
self.committee = int(dashsplit[1]) self.committee = int(dashsplit[1])
self.docketplacement = int(dashsplit[2]) self.docketplacement = int(dashsplit[2])
self.stringrep = self.color[0].upper() + \ self.stringrep = self.color[0].upper() + \
self.assembly[0].upper() + \ self.assembly[0].upper() + \
"B/{}-{}-{}".format( "B/{}-{}-{}".format(
str(self.year), str(self.year - 2000),
str(self.committee), str(self.committee),
str(self.docketplacement) str(self.docketplacement)
) )
self.conference: None | str = None # to be filled in with BillDB self.conference: None | str = None # to be filled in with BookParser and friends
def __str__(self): def __str__(self):
return "{} {} - {}-{}-{}".format( return "{} {} - {}-{}-{}".format(
@ -79,3 +83,7 @@ class Bill:
self.school = school.rstrip() self.school = school.rstrip()
self.bill_text = bill_text self.bill_text = bill_text
self.title = title self.title = title
@property
def bill_text_concat(self):
    """
    Return the pieces of ``bill_text`` joined end to end with no separator.
    """
    # presumably bill_text is a sequence of strings; verify against the parser
    joined = "".join(self.bill_text)
    return joined

View File

@ -1,5 +1,2 @@
import fitz
import math
from typing import Any
import parsers import parsers
import billdb

View File

@ -3,5 +3,6 @@
# nativeBuildInputs is usually what you want -- tools you need to run # nativeBuildInputs is usually what you want -- tools you need to run
nativeBuildInputs = with pkgs; [ nativeBuildInputs = with pkgs; [
buildPackages.python311Packages.pymupdf buildPackages.python311Packages.pymupdf
buildPackages.python311Packages.flask
]; ];
} }