Compare commits
4 Commits
11fbcb474a
...
9680a416da
Author | SHA1 | Date |
---|---|---|
stupidcomputer | 9680a416da | |
stupidcomputer | 9ba154f654 | |
stupidcomputer | 5c11ff4371 | |
stupidcomputer | dbd9632e16 |
31
analyser.py
31
analyser.py
|
@ -1,5 +1,30 @@
|
||||||
import leglib
|
import leglib #billdb import BillDB, BillQuery, QueryField, QueryAll
|
||||||
|
from leglib.billdb import BillDB, BillQuery, QueryField, QueryAll
|
||||||
|
from leglib.parsers import HSYIGPdfParser
|
||||||
|
|
||||||
parser = leglib.parsers.HSYIGPdfParser.from_filename("YIGVolunteerBook2024.pdf")
|
parser = HSYIGPdfParser.from_filename(
|
||||||
|
filename="YIGVolunteerBook2024.pdf",
|
||||||
|
confname="HSVolunteer"
|
||||||
|
)
|
||||||
parser.parse()
|
parser.parse()
|
||||||
print([i.bill_text for i in parser.bills])
|
|
||||||
|
print(len(parser.bills))
|
||||||
|
|
||||||
|
db = BillDB()
|
||||||
|
db.add_conference(parser=parser)
|
||||||
|
|
||||||
|
allbills = len(db.search(query=QueryAll))
|
||||||
|
|
||||||
|
bluelen = len(db.search(query=BillQuery(color=QueryField.Colors.Blue)))
|
||||||
|
whitelen = len(db.search(query=BillQuery(color=QueryField.Colors.White)))
|
||||||
|
redlen = len(db.search(query=BillQuery(color=QueryField.Colors.Red)))
|
||||||
|
|
||||||
|
senatelen = len(db.search(query=BillQuery(assembly=QueryField.Assemblies.Senate)))
|
||||||
|
houselen = len(db.search(query=BillQuery(assembly=QueryField.Assemblies.House)))
|
||||||
|
|
||||||
|
franklincount = len(db.search(query=BillQuery(school="Franklin")))
|
||||||
|
|
||||||
|
print(allbills)
|
||||||
|
print(redlen, whitelen, bluelen, redlen + whitelen + bluelen)
|
||||||
|
print(senatelen, houselen, senatelen + houselen)
|
||||||
|
print(franklincount)
|
||||||
|
|
|
@ -0,0 +1,122 @@
|
||||||
|
from .common import Bill, CCEColors, CCEAssemblies
|
||||||
|
from .parsers import BookParser
|
||||||
|
|
||||||
|
from typing import Type, Self
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
class QueryAny:
    """
    Marker class standing for an "Any" match on attributes whose own type
    has no Any member.

    It carries no state or behaviour; only the class itself is meaningful.
    """
|
||||||
|
|
||||||
|
class SearchNotSatisified(Exception):
    """
    Raised by the BillDB matchers when a bill fails one criterion of a query.

    BillDB.search catches it to skip the current bill. It derives from
    Exception rather than BaseException because it is ordinary control flow
    inside the library: BaseException is meant for interpreter-level signals
    such as KeyboardInterrupt and SystemExit, and exceptions derived directly
    from it slip past generic ``except Exception`` handlers.
    """
|
||||||
|
|
||||||
|
class QueryAll:
    """
    Sentinel passed to BillDB.search in place of a BillQuery to request
    every stored bill. The class object itself is what gets passed.
    """
|
||||||
|
|
||||||
|
class QueryField:
    """
    Namespace of the special values a BillQuery field can be given.
    """
    # Short aliases so callers can write QueryField.Colors.Blue or
    # QueryField.Assemblies.Senate without importing the enums themselves.
    Colors, Assemblies = CCEColors, CCEAssemblies

    # Unique sentinel meaning "do not filter on this field". It is the default
    # of every BillQuery field and is what the BillDB matchers check for.
    Any = object()
|
||||||
|
|
||||||
|
@dataclass
class BillQuery:
    """
    Holds a query for the BillDB.

    Every field defaults to QueryField.Any, which means "do not filter on
    this field". Set only the fields you want to constrain, e.g.
    ``BillQuery(color=QueryField.Colors.Blue)``.
    """
    # QueryField.Any is a plain object() sentinel, not a QueryField instance,
    # so the "unset" half of each annotation is ``object``.
    color: CCEColors | object = QueryField.Any
    assembly: CCEAssemblies | object = QueryField.Any
    committee: int | object = QueryField.Any
    year: int | object = QueryField.Any
    subcommittee: str | object = QueryField.Any
    sponsors: str | object = QueryField.Any
    school: str | object = QueryField.Any
    bill_text: str | object = QueryField.Any
    title: str | object = QueryField.Any

    @property
    def bill_text_concat(self):
        """
        Alias of ``bill_text`` for search compatibility: BillDB looks up the
        same attribute name ("bill_text_concat") on both the query and the
        bill.

        This is a live view rather than a copy taken at construction time, so
        it cannot go stale when ``bill_text`` is reassigned later.
        """
        return self.bill_text

    @bill_text_concat.setter
    def bill_text_concat(self, value):
        # Assigning to the alias was possible when it was a plain attribute;
        # keep that working by writing through to the real field.
        self.bill_text = value
|
||||||
|
|
||||||
|
class BillDB:
    """
    In-memory store of bills collected from one or more BookParsers.

    Bills are kept in insertion order in ``self.bills`` and are filtered
    with ``search``.
    """

    def __init__(self):
        self.bills: list[Bill] = []
        # Nothing in this class reads or writes the cache yet. It is
        # initialised (instead of only annotated) so the attribute exists.
        self.cache: dict = {}

    @staticmethod
    def code_enum_match(bill: Bill, query: BillQuery, attr: str) -> None:
        """
        Check the enum-valued field ``attr`` of ``bill.code`` against the
        same-named field of ``query``.

        The field is accepted when the query holds QueryField.Any, when it
        holds the ``Any`` member of the bill field's own enum class, or when
        both values are equal.

        Returns None on a match; raises SearchNotSatisified otherwise.

        Signalling a mismatch by exception is probably slow for large
        databases; maybe replace this with a better solution.
        """
        wanted = getattr(query, attr)
        if wanted is QueryField.Any:
            return

        actual = getattr(bill.code, attr)
        # The enum's own Any member (e.g. CCEColors.Any) is a wildcard too.
        if wanted == type(actual).Any:
            return

        if actual != wanted:
            raise SearchNotSatisified()

    @staticmethod
    def _exact_code_match(bill: Bill, query: BillQuery, attr: str) -> None:
        """
        Require ``bill.code.<attr>`` to equal ``query.<attr>`` unless the
        query field is QueryField.Any. Raises SearchNotSatisified on mismatch.
        """
        wanted = getattr(query, attr)
        if wanted is QueryField.Any:
            return

        if wanted != getattr(bill.code, attr):
            raise SearchNotSatisified()

    @staticmethod
    def string_match(bill: Bill, query: BillQuery, attr: str) -> None:
        """
        Case-insensitive substring test: ``query.<attr>`` must occur inside
        ``bill.<attr>`` unless the query field is QueryField.Any.

        Returns None on a match; raises SearchNotSatisified otherwise.
        """
        needle = getattr(query, attr)
        if needle is QueryField.Any:
            return

        if needle.lower() not in getattr(bill, attr).lower():
            raise SearchNotSatisified()

    def add_conference(self: Self, parser: BookParser) -> None:
        """
        Append every bill of an already-parsed BookParser (any subclass
        instance) to the database.

        ``parser.bills`` must already exist when this is called.
        """
        # this works because each BookParser must insert its self.confname
        # into its self.bills[i].code.conference field.
        self.bills.extend(parser.bills)

    def search(self: Self, query: BillQuery | QueryAll | type[QueryAll]) -> list[Bill]:
        """
        Return the bills that satisfy every constrained field of ``query``.

        Passing QueryAll (the class, or an instance of it) returns all bills.
        The result is always a new list, so mutating it does not alter the
        database.
        """
        if query is QueryAll or isinstance(query, QueryAll):
            return list(self.bills)

        results = []
        for bill in self.bills:
            try:
                self.code_enum_match(bill, query, "color")
                self.code_enum_match(bill, query, "assembly")

                self._exact_code_match(bill, query, "committee")
                # The year test used to be guarded by query.committee, so a
                # year-only query was ignored and a committee-only query
                # rejected every bill. It is now guarded by query.year.
                self._exact_code_match(bill, query, "year")

                self.string_match(bill, query, "subcommittee")
                self.string_match(bill, query, "sponsors")
                self.string_match(bill, query, "school")
                self.string_match(bill, query, "bill_text_concat")
                self.string_match(bill, query, "title")
            except SearchNotSatisified:
                continue
            results.append(bill)

        return results
|
|
@ -3,12 +3,15 @@ from enum import StrEnum, auto
|
||||||
class CCEColors(StrEnum):
|
class CCEColors(StrEnum):
|
||||||
Red = "Red"
|
Red = "Red"
|
||||||
White = "White",
|
White = "White",
|
||||||
Blue = "Blue"
|
Blue = "Blue",
|
||||||
|
Undefined = "Undefined", # some conferences don't have assemblies
|
||||||
|
Any = "Any" # for searching purposes
|
||||||
|
|
||||||
class CCEAssemblies(StrEnum):
|
class CCEAssemblies(StrEnum):
|
||||||
Senate = "Senate",
|
Senate = "Senate",
|
||||||
House = "House",
|
House = "House",
|
||||||
GeneralAssembly = "GeneralAssembly"
|
GeneralAssembly = "GeneralAssembly",
|
||||||
|
Any = "Any" # for searching purposes
|
||||||
|
|
||||||
class BillCode:
|
class BillCode:
|
||||||
def __init__(self, text: str):
|
def __init__(self, text: str):
|
||||||
|
@ -37,18 +40,21 @@ class BillCode:
|
||||||
elif assemblydivision == "G":
|
elif assemblydivision == "G":
|
||||||
self.assembly = CCEAssemblies.GeneralAssembly
|
self.assembly = CCEAssemblies.GeneralAssembly
|
||||||
|
|
||||||
self.year = int(dashsplit[0])
|
# reverse y2k problem; but conference years are stored in YY, not YYYY form
|
||||||
|
self.year = int(dashsplit[0]) + 2000
|
||||||
self.committee = int(dashsplit[1])
|
self.committee = int(dashsplit[1])
|
||||||
self.docketplacement = int(dashsplit[2])
|
self.docketplacement = int(dashsplit[2])
|
||||||
|
|
||||||
self.stringrep = self.color[0].upper() + \
|
self.stringrep = self.color[0].upper() + \
|
||||||
self.assembly[0].upper() + \
|
self.assembly[0].upper() + \
|
||||||
"B/{}-{}-{}".format(
|
"B/{}-{}-{}".format(
|
||||||
str(self.year),
|
str(self.year - 2000),
|
||||||
str(self.committee),
|
str(self.committee),
|
||||||
str(self.docketplacement)
|
str(self.docketplacement)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
self.conference: None | str = None # to be filled in with BookParser and friends
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return "{} {} - {}-{}-{}".format(
|
return "{} {} - {}-{}-{}".format(
|
||||||
self.color,
|
self.color,
|
||||||
|
@ -77,3 +83,7 @@ class Bill:
|
||||||
self.school = school.rstrip()
|
self.school = school.rstrip()
|
||||||
self.bill_text = bill_text
|
self.bill_text = bill_text
|
||||||
self.title = title
|
self.title = title
|
||||||
|
|
||||||
|
@property
def bill_text_concat(self):
    """
    The bill text as one string: the items of ``self.bill_text`` joined
    with no separator.
    """
    pieces = self.bill_text
    return "".join(pieces)
|
|
@ -1,13 +1,31 @@
|
||||||
import fitz
|
import fitz
|
||||||
from typing import Any
|
from typing import Any, Self, ClassVar
|
||||||
|
from itertools import groupby
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
from lib import FitzBlockWrapper
|
from .lib import FitzBlockWrapper
|
||||||
from common import Bill
|
from .common import Bill
|
||||||
|
|
||||||
class HSYIGPdfParser:
|
@dataclass
|
||||||
def __init__(self, document: fitz.Document):
|
class BookParser:
|
||||||
self.document = document
|
# class variables
|
||||||
|
humanname: ClassVar[str] = "Generic BookParser parent class."
|
||||||
|
description: ClassVar[str] = """
|
||||||
|
A generic description of the abilities of this BookParser.
|
||||||
|
"""
|
||||||
|
|
||||||
|
# everything else
|
||||||
|
document: fitz.Document
|
||||||
|
confname: str
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_filename(cls, filename: str, confname: str):
|
||||||
|
return cls(
|
||||||
|
document=fitz.open(filename),
|
||||||
|
confname=confname
|
||||||
|
)
|
||||||
|
|
||||||
|
class HSYIGPdfParser(BookParser):
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _words_in_superstring(words: list[str], superstring: str) -> bool:
|
def _words_in_superstring(words: list[str], superstring: str) -> bool:
|
||||||
for word in words:
|
for word in words:
|
||||||
|
@ -142,6 +160,9 @@ class HSYIGPdfParser:
|
||||||
title=pretty_printed["title"]
|
title=pretty_printed["title"]
|
||||||
))
|
))
|
||||||
|
|
||||||
|
for bill in bills: # add the conference name to each
|
||||||
|
bill.code.conference = self.confname
|
||||||
|
|
||||||
self.bills = bills
|
self.bills = bills
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
@ -155,20 +176,27 @@ class HSYIGPdfParser:
|
||||||
|
|
||||||
def _pretty_print_bill_text(self, bill_text: str):
|
def _pretty_print_bill_text(self, bill_text: str):
|
||||||
replaced = bill_text.replace("<EFBFBD>", "\n")
|
replaced = bill_text.replace("<EFBFBD>", "\n")
|
||||||
|
replaced = bill_text
|
||||||
replaced = replaced.split('\n')
|
replaced = replaced.split('\n')
|
||||||
|
replaced = [
|
||||||
replaced = [i.rstrip().lstrip() for i in replaced]
|
i \
|
||||||
|
.replace('<EFBFBD>', ' ') \
|
||||||
|
.rstrip() \
|
||||||
|
.lstrip() \
|
||||||
|
for i in replaced
|
||||||
|
]
|
||||||
|
|
||||||
first_line_number = self._find_first_line_number(replaced)
|
first_line_number = self._find_first_line_number(replaced)
|
||||||
|
title = ' '.join(replaced[:(first_line_number - 1)])
|
||||||
title = ' '.join(replaced[:first_line_number])
|
title = ' '.join(title.split()) # remove double spaces
|
||||||
rebuilt = replaced[first_line_number:][1::2]
|
rebuilt = replaced[first_line_number:][1::2]
|
||||||
|
# remove the last line number, it doesn't have a corresponding space at the end
|
||||||
|
rebuilt = rebuilt[:-1]
|
||||||
|
|
||||||
|
# remove the first line, as it's the whitespace between the title and the bill text
|
||||||
|
rebuilt = rebuilt[1:]
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"title": title.lstrip(),
|
"title": title.lstrip(),
|
||||||
"bill_array": rebuilt
|
"bill_array": rebuilt
|
||||||
}
|
}
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_filename(cls, filename: str) -> Any: # TODO: fix this so it shows PdfParser
|
|
||||||
return cls(fitz.open(filename))
|
|
|
@ -3,5 +3,6 @@
|
||||||
# nativeBuildInputs is usually what you want -- tools you need to run
|
# nativeBuildInputs is usually what you want -- tools you need to run
|
||||||
nativeBuildInputs = with pkgs; [
|
nativeBuildInputs = with pkgs; [
|
||||||
buildPackages.python311Packages.pymupdf
|
buildPackages.python311Packages.pymupdf
|
||||||
|
buildPackages.python311Packages.flask
|
||||||
];
|
];
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue