move leglib into a python package

add search to the setup
parser object overhauls
2024-05-19 17:56:26 -05:00 · 2024-05-19 17:51:51 -05:00 · 2024-05-19 16:02:33 -05:00 · 2024-05-19 16:02:03 -05:00
8 changed files with 207 additions and 26 deletions
--- a/analyser.py
+++ b/analyser.py
@ -1,5 +1,30 @@
-import leglib
+import leglib #billdb import BillDB, BillQuery, QueryField, QueryAll
 from leglib.billdb import BillDB, BillQuery, QueryField, QueryAll
 from leglib.parsers import HSYIGPdfParser
-parser = leglib.parsers.HSYIGPdfParser.from_filename("YIGVolunteerBook2024.pdf")
+parser = HSYIGPdfParser.from_filename(
    filename="YIGVolunteerBook2024.pdf",
    confname="HSVolunteer"
 )
 parser.parse()
-print([i.bill_text for i in parser.bills])
+
 print(len(parser.bills))
 db = BillDB()
 db.add_conference(parser=parser)
 allbills = len(db.search(query=QueryAll))
 bluelen = len(db.search(query=BillQuery(color=QueryField.Colors.Blue)))
 whitelen = len(db.search(query=BillQuery(color=QueryField.Colors.White)))
 redlen = len(db.search(query=BillQuery(color=QueryField.Colors.Red)))
 senatelen = len(db.search(query=BillQuery(assembly=QueryField.Assemblies.Senate)))
 houselen = len(db.search(query=BillQuery(assembly=QueryField.Assemblies.House)))
 franklincount = len(db.search(query=BillQuery(school="Franklin")))
 print(allbills)
 print(redlen, whitelen, bluelen, redlen + whitelen + bluelen)
 print(senatelen, houselen, senatelen + houselen)
 print(franklincount)
--- a/leglib.py
+++ b/leglib.py
@ -1,5 +0,0 @@
 import fitz
 import math
 from typing import Any
 import parsers
--- a/leglib/main.py
+++ b/leglib/main.py
--- a/leglib/billdb.py
+++ b/leglib/billdb.py
@ -0,0 +1,122 @@
 from .common import Bill, CCEColors, CCEAssemblies
 from .parsers import BookParser
 from typing import Type, Self
 from dataclasses import dataclass
 class QueryAny:
    """
    Marker class meaning "match anything".

    Intended for attributes whose own type offers no Any member to stand in
    for a wildcard.
    """
 class SearchNotSatisified(Exception):
    """
    Raised by the BillDB matching helpers when a bill fails a query field.

    BillDB.search catches it to skip the current bill.  It derives from
    Exception (not BaseException) so that it behaves like an ordinary
    application error and is not mistaken for an interpreter-exit signal.
    """
 class QueryAll:
    """
    Sentinel query: passing this class to BillDB.search returns every bill.
    """
 class QueryField:
    """
    Namespace of the values a BillQuery field may take besides plain data.
    """
    # aliases for the enums from .common, e.g. QueryField.Colors.Blue
    Colors = CCEColors
    Assemblies = CCEAssemblies
    # unique wildcard sentinel; the default for every BillQuery field
    Any = object()
@dataclass
 class BillQuery:
    """
    Search criteria consumed by BillDB.search.

    Every field defaults to QueryField.Any, which the matching helpers treat
    as "do not filter on this field".
    """
    # color and assembly are checked against bill.code by BillDB.code_enum_match
    color: CCEColors | QueryField = QueryField.Any
    assembly: CCEAssemblies | QueryField = QueryField.Any
    # committee and year are compared for equality against bill.code
    committee: int | QueryField = QueryField.Any
    year: int | QueryField = QueryField.Any
    # the remaining fields are case-insensitive substring matches
    # (see BillDB.string_match)
    subcommittee: str | QueryField = QueryField.Any
    sponsors: str | QueryField = QueryField.Any
    school: str | QueryField = QueryField.Any
    bill_text: str | QueryField = QueryField.Any
    title: str | QueryField = QueryField.Any
    def __post_init__(self):
        # BillDB.search looks up "bill_text_concat" on both the query and the
        # bill, so mirror bill_text under that name here.
        self.bill_text_concat = self.bill_text # for search compat reasons
 class BillDB:
    """
    In-memory store of Bill objects that can be filtered with a BillQuery.
    """
    def __init__(self):
        self.bills: list[Bill] = []
        # annotation only: nothing in this class assigns or reads the cache yet
        self.cache: dict[Bill]
    @staticmethod
    def code_enum_match(bill: Bill, query: BillQuery, attr: str) -> None:
        """
        Check an enum-valued field of bill.code against the query.

        Replaces repetitive code like this:

        if query.color != QueryField.Any:
            if query.color != CCEColors.Any:
                if bill.code.color != query.color:
                    raise SearchNotSatisified()

        with this:

        self.code_enum_match(bill, query, "color")

        Returns None when the query field is QueryField.Any, is the Any member
        of the bill value's own enum class, or equals the bill's value.
        Raises SearchNotSatisified otherwise.
        """
        wanted = getattr(query, attr)
        if wanted == QueryField.Any:
            return
        actual = getattr(bill.code, attr)
        # the enum's own Any member is a wildcard as well
        if wanted != actual.__class__.Any and actual != wanted:
            raise SearchNotSatisified()
        # if we do match, no exception
    @staticmethod
    def string_match(bill: Bill, query: BillQuery, attr: str) -> None:
        """
        Check that the query's string is contained in the bill's attribute.

        The comparison is a case-insensitive substring test on the attribute
        named attr of both objects.  Returns None on a match or when the query
        field is QueryField.Any; raises SearchNotSatisified otherwise.
        """
        needle = getattr(query, attr)
        if needle == QueryField.Any:
            return
        if needle.lower() not in getattr(bill, attr).lower():
            raise SearchNotSatisified()
    def add_conference(self: Self, parser: BookParser) -> None:
        """
        Append every bill held by an already-parsed BookParser (or subclass)
        instance to this database.
        """
        # this works because each BookParser must insert its self.confname into its self.bills[i].code.conference field.
        self.bills += parser.bills
    def search(self: Self, query: BillQuery | QueryAll) -> list[Bill]:
        """
        Return the bills that satisfy every non-wildcard field of query.

        Passing QueryAll (the class itself, or an instance of it) returns the
        database's own list of all bills.
        """
        if query is QueryAll or isinstance(query, QueryAll):
            return self.bills
        results = []
        for bill in self.bills:
            try:
                self.code_enum_match(bill, query, "color")
                self.code_enum_match(bill, query, "assembly")
                if query.committee != QueryField.Any:
                    if query.committee != bill.code.committee:
                        raise SearchNotSatisified()
                # guard on year itself; guarding on committee here made
                # year-only queries a no-op and committee-only queries
                # reject every bill
                if query.year != QueryField.Any:
                    if query.year != bill.code.year:
                        raise SearchNotSatisified()
                self.string_match(bill, query, "subcommittee")
                self.string_match(bill, query, "sponsors")
                self.string_match(bill, query, "school")
                self.string_match(bill, query, "bill_text_concat")
                self.string_match(bill, query, "title")
            except SearchNotSatisified:
                continue
            results.append(bill)
        return results
--- a/leglib/common.py
+++ b/leglib/common.py
@ -3,12 +3,15 @@ from enum import StrEnum, auto
 class CCEColors(StrEnum):
    Red = "Red"
    White = "White",
-    Blue = "Blue"
+    Blue = "Blue",
    Undefined = "Undefined", # some conferences don't have assemblies
    Any = "Any" # for searching purposes
 class CCEAssemblies(StrEnum):
    Senate = "Senate",
    House = "House",
-    GeneralAssembly = "GeneralAssembly"
+    GeneralAssembly = "GeneralAssembly",
    Any = "Any" # for searching purposes
 class BillCode:
    def __init__(self, text: str):
@ -37,18 +40,21 @@ class BillCode:
        elif assemblydivision == "G":
            self.assembly = CCEAssemblies.GeneralAssembly
-        self.year = int(dashsplit[0])
+        # reverse y2k problem; but conference years are stored in YY, not YYYY form
        self.year = int(dashsplit[0]) + 2000
        self.committee = int(dashsplit[1])
        self.docketplacement = int(dashsplit[2])
        self.stringrep = self.color[0].upper() + \
            self.assembly[0].upper() + \
            "B/{}-{}-{}".format(
-                str(self.year),
+                str(self.year - 2000),
                str(self.committee),
                str(self.docketplacement)
            )
        self.conference: None | str = None # to be filled in with BookParser and friends
    def __str__(self):
        return "{} {} - {}-{}-{}".format(
            self.color,
@ -77,3 +83,7 @@ class Bill:
        self.school = school.rstrip()
        self.bill_text = bill_text
        self.title = title
    @property
    def bill_text_concat(self):
        return ''.join(self.bill_text)
--- a/leglib/lib.py
+++ b/leglib/lib.py
--- a/leglib/parsers.py
+++ b/leglib/parsers.py
@ -1,13 +1,31 @@
 import fitz
-from typing import Any
+from typing import Any, Self, ClassVar
 from itertools import groupby
 from dataclasses import dataclass
-from lib import FitzBlockWrapper
+from .lib import FitzBlockWrapper
-from common import Bill
+from .common import Bill
-class HSYIGPdfParser:
+@dataclass
-    def __init__(self, document: fitz.Document):
+class BookParser:
-        self.document = document
+    # class variables
    humanname: ClassVar[str] = "Generic BookParser parent class."
    description: ClassVar[str] = """
        A generic description of the abilities of this BookParser.
    """
    # everything else
    document: fitz.Document
    confname: str
    @classmethod
    def from_filename(cls, filename: str, confname: str):
        return cls(
            document=fitz.open(filename),
            confname=confname
        )
 class HSYIGPdfParser(BookParser):
    @staticmethod
    def _words_in_superstring(words: list[str], superstring: str) -> bool:
        for word in words:
@ -142,6 +160,9 @@ class HSYIGPdfParser:
                title=pretty_printed["title"]
            ))
        for bill in bills: # add the conference name to each
            bill.code.conference = self.confname
        self.bills = bills
    @staticmethod
@ -155,20 +176,27 @@ class HSYIGPdfParser:
    def _pretty_print_bill_text(self, bill_text: str):
        replaced = bill_text.replace("<EFBFBD>", "\n")
        replaced = bill_text
        replaced = replaced.split('\n')
-
+        replaced = [
-        replaced = [i.rstrip().lstrip() for i in replaced]
+            i \
                .replace('<EFBFBD>', ' ') \
                .rstrip() \
                .lstrip() \
            for i in replaced
        ]
        first_line_number = self._find_first_line_number(replaced)
-
+        title = ' '.join(replaced[:(first_line_number - 1)])
-        title = ' '.join(replaced[:first_line_number])
+        title = ' '.join(title.split()) # remove double spaces
        rebuilt = replaced[first_line_number:][1::2]
        # remove the last line number, it doesn't have a corresponding space at the end
        rebuilt = rebuilt[:-1]
        # remove the first line, as it's the whitespace between the title and the bill text
        rebuilt = rebuilt[1:]
        return {
            "title": title.lstrip(),
            "bill_array": rebuilt
        }
    @classmethod
    def from_filename(cls, filename: str) -> Any: # TODO: fix this so it shows PdfParser
        return cls(fitz.open(filename))
--- a/shell.nix
+++ b/shell.nix
@ -3,5 +3,6 @@
    # nativeBuildInputs is usually what you want -- tools you need to run
    nativeBuildInputs = with pkgs; [
        buildPackages.python311Packages.pymupdf
        buildPackages.python311Packages.flask
    ];
 }
Author	SHA1	Message	Date
stupidcomputer	9680a416da	move leglib into a python package	2024-05-19 17:56:26 -05:00
stupidcomputer	9ba154f654	add search to the setup	2024-05-19 17:51:51 -05:00
stupidcomputer	5c11ff4371	parser object overhauls make conference names part of the parser object in preparation for the implementation of billdb; refactor class structure for parsers	2024-05-19 16:02:33 -05:00
stupidcomputer	dbd9632e16	fix bill parsing in the main parser	2024-05-19 16:02:03 -05:00