diff --git a/analyser.py b/analyser.py index 726c4a2..51ea1cb 100644 --- a/analyser.py +++ b/analyser.py @@ -1,8 +1,28 @@ -import leglib +import leglib #billdb import BillDB, BillQuery, QueryField, QueryAll parser = leglib.parsers.HSYIGPdfParser.from_filename( filename="YIGVolunteerBook2024.pdf", - confname="YIGVolunteer" + confname="HSVolunteer" ) parser.parse() -print([i.bill_text for i in parser.bills]) + +print(len(parser.bills)) + +db = leglib.billdb.BillDB() +db.add_conference(parser=parser) + +allbills = len(db.search(query=leglib.billdb.QueryAll)) + +bluelen = len(db.search(query=leglib.billdb.BillQuery(color=leglib.billdb.QueryField.Colors.Blue))) +whitelen = len(db.search(query=leglib.billdb.BillQuery(color=leglib.billdb.QueryField.Colors.White))) +redlen = len(db.search(query=leglib.billdb.BillQuery(color=leglib.billdb.QueryField.Colors.Red))) + +senatelen = len(db.search(query=leglib.billdb.BillQuery(assembly=leglib.billdb.QueryField.Assemblies.Senate))) +houselen = len(db.search(query=leglib.billdb.BillQuery(assembly=leglib.billdb.QueryField.Assemblies.House))) + +franklincount = len(db.search(query=leglib.billdb.BillQuery(school="Franklin"))) + +print(allbills) +print(redlen, whitelen, bluelen, redlen + whitelen + bluelen) +print(senatelen, houselen, senatelen + houselen) +print(franklincount) diff --git a/billdb.py b/billdb.py new file mode 100644 index 0000000..c558fae --- /dev/null +++ b/billdb.py @@ -0,0 +1,122 @@ +from common import Bill, CCEColors, CCEAssemblies +from parsers import BookParser + +from typing import Type, Self +from dataclasses import dataclass + +class QueryAny: + """ + Use this class to indicate an Any match for attributes without an Any attribute. + """ + pass + +class SearchNotSatisified(Exception): # raised by the BillDB match helpers when a bill fails one query field + pass + +class QueryAll: # pass this class itself to BillDB.search to get every bill + pass + +class QueryField: + Any = object() + Colors = CCEColors + Assemblies = CCEAssemblies + +@dataclass +class BillQuery: + """ + Holds a query for the BillDB. 
+ """ + color: CCEColors | QueryField = QueryField.Any + assembly: CCEAssemblies | QueryField = QueryField.Any + committee: int | QueryField = QueryField.Any + year: int | QueryField = QueryField.Any + subcommittee: str | QueryField = QueryField.Any + sponsors: str | QueryField = QueryField.Any + school: str | QueryField = QueryField.Any + bill_text: str | QueryField = QueryField.Any + title: str | QueryField = QueryField.Any + + def __post_init__(self): + self.bill_text_concat = self.bill_text # for search compat reasons + +class BillDB: + def __init__(self): + self.bills: list[Bill] = [] + self.cache: dict = {} # assigned here because a bare annotation would not create the attribute + + @staticmethod + def code_enum_match(bill: Bill, query: BillQuery, attr: str) -> None: + """ + This is probably very slow. Maybe replace this with a better solution? + + This function replaces repetitive code like this: + + elif query.assembly != CCEAssemblies.Any: + if bill.code.assembly != query.assembly: + raise SearchNotSatisified() + + with this: + + BillDB.code_enum_match(bill, query, "assembly") + + string_match follows the same pattern for text fields. + """ + + if getattr(query, attr) is QueryField.Any: + return + + # an enum-level Any in the query also matches every bill + if getattr(query, attr) != type(getattr(bill.code, attr)).Any: + # reject the bill when its value differs from the query + if getattr(bill.code, attr) != getattr(query, attr): + raise SearchNotSatisified() + + # if we do match, no exception + + @staticmethod + def string_match(bill: Bill, query: BillQuery, attr: str) -> None: + """ + Case-insensitive substring test; raises SearchNotSatisified on a miss. + """ + if getattr(query, attr) is QueryField.Any: + return + + if getattr(query, attr).lower() not in getattr(bill, attr).lower(): + raise SearchNotSatisified() + + def add_conference(self: Self, parser: BookParser) -> None: + """ + Append parser.bills to this database; parser is an instance of any BookParser subclass. + """ + + # this works because each BookParser must insert its self.confname into its self.bills[i].code.conference field. 
+ self.bills += parser.bills + + def search(self: Self, query: BillQuery | QueryAll) -> list[Bill]: # bills matching every field of query that is not Any + if query is QueryAll or isinstance(query, QueryAll): # accept the QueryAll class or an instance of it + return self.bills # NOTE: this is the live internal list, not a copy + results = [] + for bill in self.bills: + try: + self.code_enum_match(bill, query, "color") + self.code_enum_match(bill, query, "assembly") + + if query.committee is not QueryField.Any: + if query.committee != bill.code.committee: + raise SearchNotSatisified() + + if query.year is not QueryField.Any: # year is filtered independently of committee + if query.year != bill.code.year: + raise SearchNotSatisified() + + self.string_match(bill, query, "subcommittee") + self.string_match(bill, query, "sponsors") + self.string_match(bill, query, "school") + self.string_match(bill, query, "bill_text_concat") + self.string_match(bill, query, "title") + + except SearchNotSatisified: + continue + results.append(bill) + + return results diff --git a/common.py b/common.py index 24f4f79..b21f225 100644 --- a/common.py +++ b/common.py @@ -3,12 +3,15 @@ from enum import StrEnum, auto class CCEColors(StrEnum): Red = "Red" White = "White", - Blue = "Blue" + Blue = "Blue", + Undefined = "Undefined", # some conferences don't have assemblies + Any = "Any" # for searching purposes class CCEAssemblies(StrEnum): Senate = "Senate", House = "House", - GeneralAssembly = "GeneralAssembly" + GeneralAssembly = "GeneralAssembly", + Any = "Any" # for searching purposes class BillCode: def __init__(self, text: str): @@ -37,19 +40,20 @@ class BillCode: elif assemblydivision == "G": self.assembly = CCEAssemblies.GeneralAssembly - self.year = int(dashsplit[0]) + # reverse y2k problem; but conference years are stored in YY, not YYYY form + self.year = int(dashsplit[0]) + 2000 self.committee = int(dashsplit[1]) self.docketplacement = int(dashsplit[2]) self.stringrep = self.color[0].upper() + \ self.assembly[0].upper() + \ "B/{}-{}-{}".format( - str(self.year), + str(self.year - 2000), str(self.committee), str(self.docketplacement) ) - self.conference: None | str = None # to be filled in with 
BillDB + self.conference: None | str = None # to be filled in with BookParser and friends def __str__(self): return "{} {} - {}-{}-{}".format( @@ -79,3 +83,7 @@ class Bill: self.school = school.rstrip() self.bill_text = bill_text self.title = title + + @property + def bill_text_concat(self): # bill_text joined into one string so BillDB.string_match can run a substring search on it + return ''.join(self.bill_text) diff --git a/leglib.py b/leglib.py index 32778c1..045c0d3 100644 --- a/leglib.py +++ b/leglib.py @@ -1,5 +1,2 @@ -import fitz -import math - -from typing import Any import parsers +import billdb diff --git a/shell.nix b/shell.nix index 2c827b4..d3892c7 100644 --- a/shell.nix +++ b/shell.nix @@ -3,5 +3,6 @@ # nativeBuildInputs is usually what you want -- tools you need to run nativeBuildInputs = with pkgs; [ buildPackages.python311Packages.pymupdf + buildPackages.python311Packages.flask ]; }