Compare commits
4 Commits
11fbcb474a
...
9680a416da
Author | SHA1 | Date |
---|---|---|
stupidcomputer | 9680a416da | |
stupidcomputer | 9ba154f654 | |
stupidcomputer | 5c11ff4371 | |
stupidcomputer | dbd9632e16 |
31
analyser.py
31
analyser.py
|
@ -1,5 +1,30 @@
|
|||
import leglib
|
||||
import leglib #billdb import BillDB, BillQuery, QueryField, QueryAll
|
||||
from leglib.billdb import BillDB, BillQuery, QueryField, QueryAll
|
||||
from leglib.parsers import HSYIGPdfParser
|
||||
|
||||
# Parse the 2024 volunteer book. from_filename takes both the PDF path and the
# conference name; the earlier one-argument call no longer matches that
# signature and its result was overwritten immediately, so it is gone.
parser = HSYIGPdfParser.from_filename(
    filename="YIGVolunteerBook2024.pdf",
    confname="HSVolunteer",
)
parser.parse()

print(len(parser.bills))

db = BillDB()
db.add_conference(parser=parser)


def count(query) -> int:
    """Return how many bills in ``db`` satisfy ``query``."""
    return len(db.search(query=query))


allbills = count(QueryAll)

bluelen = count(BillQuery(color=QueryField.Colors.Blue))
whitelen = count(BillQuery(color=QueryField.Colors.White))
redlen = count(BillQuery(color=QueryField.Colors.Red))

senatelen = count(BillQuery(assembly=QueryField.Assemblies.Senate))
houselen = count(BillQuery(assembly=QueryField.Assemblies.House))

franklincount = count(BillQuery(school="Franklin"))

# Each breakdown is printed next to its sum so it can be eyeballed against
# the overall total on the first line.
print(allbills)
print(redlen, whitelen, bluelen, redlen + whitelen + bluelen)
print(senatelen, houselen, senatelen + houselen)
print(franklincount)
|
||||
|
|
|
@ -0,0 +1,122 @@
|
|||
from .common import Bill, CCEColors, CCEAssemblies
|
||||
from .parsers import BookParser
|
||||
|
||||
from typing import Type, Self
|
||||
from dataclasses import dataclass
|
||||
|
||||
class QueryAny:
    """
    Marker type that stands for an "Any" (wildcard) match.

    Meant for attributes whose own type has no ``Any`` member to use instead.
    """
|
||||
|
||||
class SearchNotSatisified(Exception):
    """
    Raised by the BillDB match helpers when a bill fails one query criterion.

    Derives from ``Exception`` rather than ``BaseException``: the latter is
    meant for interpreter-level exits (KeyboardInterrupt, SystemExit), and an
    exception rooted there slips past ordinary ``except Exception`` handlers.
    """
|
||||
|
||||
class QueryAll:
    """
    Sentinel passed to ``BillDB.search`` to ask for every stored bill.

    The class object itself is what gets passed and compared against.
    """
|
||||
|
||||
class QueryField:
    """
    Namespace for the values a ``BillQuery`` field may be set to.
    """

    # Unique wildcard sentinel: BillQuery fields default to it, and the BillDB
    # match helpers skip any field still equal to it.
    Any = object()

    # Aliases for the enums, so callers can write QueryField.Colors.Red or
    # QueryField.Assemblies.Senate without importing them separately.
    Assemblies = CCEAssemblies
    Colors = CCEColors
|
||||
|
||||
@dataclass
class BillQuery:
    """
    Holds a query for the BillDB.

    Every field defaults to ``QueryField.Any``, which the BillDB match helpers
    treat as "do not filter on this attribute". Set only the fields that
    should constrain the search.
    """

    color: CCEColors | QueryField = QueryField.Any
    assembly: CCEAssemblies | QueryField = QueryField.Any
    committee: int | QueryField = QueryField.Any
    year: int | QueryField = QueryField.Any
    subcommittee: str | QueryField = QueryField.Any
    sponsors: str | QueryField = QueryField.Any
    school: str | QueryField = QueryField.Any
    bill_text: str | QueryField = QueryField.Any
    title: str | QueryField = QueryField.Any

    @property
    def bill_text_concat(self):
        """
        Expose ``bill_text`` under the attribute name the search looks up.

        BillDB.search matches on "bill_text_concat" for both the bill and the
        query. Computing the alias on access, instead of copying it once in
        ``__post_init__``, keeps it correct when ``bill_text`` is reassigned
        after construction.
        """
        return self.bill_text
|
||||
|
||||
class BillDB:
    """
    In-memory store of bills collected from book parsers, with simple search.
    """

    def __init__(self):
        self.bills: list[Bill] = []
        # Previously only annotated and never assigned, so reading it raised
        # AttributeError. No method here uses it yet; key type is undecided.
        self.cache: dict = {}

    @staticmethod
    def code_enum_match(bill: Bill, query: BillQuery, attr: str) -> None:
        """
        Check one enum-valued attribute of ``bill.code`` against the query.

        Returns None when the criterion is satisfied: the query field is
        ``QueryField.Any``, is the ``Any`` member of the bill attribute's own
        enum class, or equals the bill's value.

        Raises:
            SearchNotSatisified: the query names a value the bill lacks.
        """
        wanted = getattr(query, attr)
        if wanted is QueryField.Any:
            return

        actual = getattr(bill.code, attr)
        # The enum's own Any member is a wildcard too.
        if wanted == type(actual).Any:
            return

        if actual != wanted:
            raise SearchNotSatisified()

    @staticmethod
    def _code_exact_match(bill: Bill, query: BillQuery, attr: str) -> None:
        """Require ``bill.code.<attr> == query.<attr>`` unless the query field is Any."""
        wanted = getattr(query, attr)
        if wanted is QueryField.Any:
            return

        if wanted != getattr(bill.code, attr):
            raise SearchNotSatisified()

    @staticmethod
    def string_match(bill: Bill, query: BillQuery, attr: str) -> None:
        """
        Case-insensitive substring check of ``query.<attr>`` in ``bill.<attr>``.

        Returns None when the query field is ``QueryField.Any`` or is
        contained in the bill's value.

        Raises:
            SearchNotSatisified: the query text does not occur in the bill's.
        """
        needle = getattr(query, attr)
        if needle is QueryField.Any:
            return

        if needle.lower() not in getattr(bill, attr).lower():
            raise SearchNotSatisified()

    def add_conference(self: Self, parser: BookParser) -> None:
        """
        Append every bill held by ``parser`` to this database.

        ``parser`` is an instance of a BookParser subclass whose ``bills``
        list is already populated.
        """
        # this works because each BookParser must insert its self.confname
        # into its self.bills[i].code.conference field.
        self.bills.extend(parser.bills)

    def _matches(self: Self, bill: Bill, query: BillQuery) -> bool:
        """Return True when ``bill`` satisfies every non-Any field of ``query``."""
        try:
            self.code_enum_match(bill, query, "color")
            self.code_enum_match(bill, query, "assembly")

            self._code_exact_match(bill, query, "committee")
            # The year filter used to be guarded by query.committee, so it was
            # skipped whenever committee was Any and compared the Any sentinel
            # with the bill's year whenever only committee was set.
            self._code_exact_match(bill, query, "year")

            self.string_match(bill, query, "subcommittee")
            self.string_match(bill, query, "sponsors")
            self.string_match(bill, query, "school")
            self.string_match(bill, query, "bill_text_concat")
            self.string_match(bill, query, "title")
        except SearchNotSatisified:
            return False
        return True

    def search(self: Self, query: BillQuery | QueryAll | type[QueryAll]) -> list[Bill]:
        """
        Return the bills that satisfy ``query``.

        Passing ``QueryAll`` (the class, or an instance of it) returns every
        bill. The result is always a new list, so callers can mutate it
        without touching the database's own storage.
        """
        if query is QueryAll or isinstance(query, QueryAll):
            return list(self.bills)

        return [bill for bill in self.bills if self._matches(bill, query)]
|
|
@ -3,12 +3,15 @@ from enum import StrEnum, auto
|
|||
class CCEColors(StrEnum):
|
||||
Red = "Red"
|
||||
White = "White",
|
||||
Blue = "Blue"
|
||||
Blue = "Blue",
|
||||
Undefined = "Undefined", # some conferences don't have assemblies
|
||||
Any = "Any" # for searching purposes
|
||||
|
||||
class CCEAssemblies(StrEnum):
|
||||
Senate = "Senate",
|
||||
House = "House",
|
||||
GeneralAssembly = "GeneralAssembly"
|
||||
GeneralAssembly = "GeneralAssembly",
|
||||
Any = "Any" # for searching purposes
|
||||
|
||||
class BillCode:
|
||||
def __init__(self, text: str):
|
||||
|
@ -37,18 +40,21 @@ class BillCode:
|
|||
elif assemblydivision == "G":
|
||||
self.assembly = CCEAssemblies.GeneralAssembly
|
||||
|
||||
self.year = int(dashsplit[0])
|
||||
# reverse y2k problem; but conference years are stored in YY, not YYYY form
|
||||
self.year = int(dashsplit[0]) + 2000
|
||||
self.committee = int(dashsplit[1])
|
||||
self.docketplacement = int(dashsplit[2])
|
||||
|
||||
self.stringrep = self.color[0].upper() + \
|
||||
self.assembly[0].upper() + \
|
||||
"B/{}-{}-{}".format(
|
||||
str(self.year),
|
||||
str(self.year - 2000),
|
||||
str(self.committee),
|
||||
str(self.docketplacement)
|
||||
)
|
||||
|
||||
self.conference: None | str = None # to be filled in with BookParser and friends
|
||||
|
||||
def __str__(self):
|
||||
return "{} {} - {}-{}-{}".format(
|
||||
self.color,
|
||||
|
@ -77,3 +83,7 @@ class Bill:
|
|||
self.school = school.rstrip()
|
||||
self.bill_text = bill_text
|
||||
self.title = title
|
||||
|
||||
@property
def bill_text_concat(self):
    """Return the pieces of ``self.bill_text`` joined with no separator."""
    pieces = self.bill_text
    return "".join(pieces)
|
|
@ -1,13 +1,31 @@
|
|||
import fitz
|
||||
from typing import Any
|
||||
from typing import Any, Self, ClassVar
|
||||
from itertools import groupby
|
||||
from dataclasses import dataclass
|
||||
|
||||
from lib import FitzBlockWrapper
|
||||
from common import Bill
|
||||
from .lib import FitzBlockWrapper
|
||||
from .common import Bill
|
||||
|
||||
class HSYIGPdfParser:
|
||||
def __init__(self, document: fitz.Document):
|
||||
self.document = document
|
||||
@dataclass
class BookParser:
    """
    Base dataclass for conference book parsers.

    Holds the opened document and the conference name. ``humanname`` and
    ``description`` are class-level labels and not constructor arguments.
    """

    # class variables
    humanname: ClassVar[str] = "Generic BookParser parent class."
    description: ClassVar[str] = """
    A generic description of the abilities of this BookParser.
    """

    # everything else
    document: fitz.Document
    confname: str

    @classmethod
    def from_filename(cls, filename: str, confname: str):
        """Open ``filename`` with ``fitz.open`` and build a parser for ``confname``."""
        opened = fitz.open(filename)
        return cls(document=opened, confname=confname)
|
||||
|
||||
class HSYIGPdfParser(BookParser):
|
||||
@staticmethod
|
||||
def _words_in_superstring(words: list[str], superstring: str) -> bool:
|
||||
for word in words:
|
||||
|
@ -142,6 +160,9 @@ class HSYIGPdfParser:
|
|||
title=pretty_printed["title"]
|
||||
))
|
||||
|
||||
for bill in bills: # add the conference name to each
|
||||
bill.code.conference = self.confname
|
||||
|
||||
self.bills = bills
|
||||
|
||||
@staticmethod
|
||||
|
@ -155,20 +176,27 @@ class HSYIGPdfParser:
|
|||
|
||||
def _pretty_print_bill_text(self, bill_text: str):
|
||||
replaced = bill_text.replace("<EFBFBD>", "\n")
|
||||
replaced = bill_text
|
||||
replaced = replaced.split('\n')
|
||||
|
||||
replaced = [i.rstrip().lstrip() for i in replaced]
|
||||
replaced = [
|
||||
i \
|
||||
.replace('<EFBFBD>', ' ') \
|
||||
.rstrip() \
|
||||
.lstrip() \
|
||||
for i in replaced
|
||||
]
|
||||
|
||||
first_line_number = self._find_first_line_number(replaced)
|
||||
|
||||
title = ' '.join(replaced[:first_line_number])
|
||||
title = ' '.join(replaced[:(first_line_number - 1)])
|
||||
title = ' '.join(title.split()) # remove double spaces
|
||||
rebuilt = replaced[first_line_number:][1::2]
|
||||
# remove the last line number, it doesn't have a cooresponding space at the end
|
||||
rebuilt = rebuilt[:-1]
|
||||
|
||||
# remove the first line, as it's the whitespace between the title and the bill text
|
||||
rebuilt = rebuilt[1:]
|
||||
|
||||
return {
|
||||
"title": title.lstrip(),
|
||||
"bill_array": rebuilt
|
||||
}
|
||||
|
||||
@classmethod
|
||||
def from_filename(cls, filename: str) -> Any: # TODO: fix this so it shows PdfParser
|
||||
return cls(fitz.open(filename))
|
Loading…
Reference in New Issue