add country and category fields; simplify serialization into legislation objects

This commit is contained in:
stupidcomputer 2024-06-29 22:27:05 -05:00
parent a1c722295b
commit 761b00eb49
3 changed files with 82 additions and 44 deletions

View File

@ -163,45 +163,63 @@ class CCEParserBase():
for legislative_text in splitted: for legislative_text in splitted:
# there are some blocks that contain just one value # there are some blocks that contain just one value
# and are aligned to some x value on the pdf # and are aligned to some x value on the pdf
# it's an easy way to extract stuff # it's an easy way to extract stuff
try:
country = get_block_by_x_value(legislative_text, 139).text.rstrip()
country = country.replace("Sponsor: ", "").lstrip()
except AttributeError:
country = None # this is a yig bill
try:
category = get_block_by_x_value(legislative_text, 151).text.rstrip().lstrip()
except AttributeError:
try:
category = get_block_by_x_value(legislative_text, 153).text.rstrip().lstrip()
except AttributeError:
print([(i.text, i.x0) for i in legislative_text])
leg_code = get_block_by_x_value(legislative_text, 88).text.rstrip() leg_code = get_block_by_x_value(legislative_text, 88).text.rstrip()
try: try:
school = get_block_by_x_value(legislative_text, 177).text.rstrip() school = get_block_by_x_value(legislative_text, 177).text.rstrip().lstrip()
except AttributeError: except AttributeError:
try: try:
school = get_block_by_x_value(legislative_text, 186).text.rstrip() school = get_block_by_x_value(legislative_text, 186).text.rstrip().lstrip()
except AttributeError: except AttributeError:
school = "you tell me, man" school = "you tell me, man"
try: try:
sponsors = get_block_by_x_value(legislative_text, 163).text.rstrip() sponsors = get_block_by_x_value(legislative_text, 163).text.rstrip().lstrip()
except AttributeError: except AttributeError:
try: try:
sponsors = get_block_by_x_value(legislative_text, 166).text.rstrip() sponsors = get_block_by_x_value(legislative_text, 166).text.rstrip().lstrip()
except AttributeError: except AttributeError:
sponsors = "you tell me, man" sponsors = "you tell me, man"
try:
subcommittee = get_block_by_x_value(legislative_text, 151).text.rstrip()
except AttributeError:
try:
subcommittee = get_block_by_x_value(legislative_text, 153).text.rstrip()
except AttributeError:
subcommittee = "you tell me, man"
the_rest = ''.join([i.text for i in legislative_text[12:]]) the_rest = ''.join([i.text for i in legislative_text[12:]])
print([i.text for i in legislative_text[12:]])
handled = self.handle_the_rest(the_rest) handled = self.handle_the_rest(the_rest)
title = handled["title"] title = handled["title"]
bill_text = handled["bill_text"] bill_text = handled["bill_text"]
codesplit = leg_code.split('/')
assembly = codesplit[0]
dashsplit = codesplit[1].split('-')
year = 2000 + int(dashsplit[0])
committee = int(dashsplit[1])
docket_order = int(dashsplit[2])
output.append({ output.append({
"code": leg_code, "assembly": assembly,
"year": year,
"committee": committee,
"docket_order": docket_order,
"category": category,
"country": country,
"school": school, "school": school,
"sponsors": sponsors, "sponsors": sponsors,
"subcommittee": subcommittee, "legislation_title": title,
"title": title, "text": bill_text
"bill_text": bill_text
}) })
self.output = output self.output = output
@ -255,22 +273,33 @@ class HSYIG24(CCEParserBase):
# it's an easy way to extract stuff # it's an easy way to extract stuff
legislative_text = remove_block_by_x_value(legislative_text, 565) # remove page numbers legislative_text = remove_block_by_x_value(legislative_text, 565) # remove page numbers
category = get_block_by_x_value(legislative_text, 139).text.rstrip().lstrip()
leg_code = get_block_by_x_value(legislative_text, 88).text.rstrip() leg_code = get_block_by_x_value(legislative_text, 88).text.rstrip()
school = get_block_by_x_value(legislative_text, 163).text.rstrip() school = get_block_by_x_value(legislative_text, 163).text.rstrip().lstrip()
sponsors = get_block_by_x_value(legislative_text, 152).text.rstrip() sponsors = get_block_by_x_value(legislative_text, 152).text.rstrip().lstrip()
subcommittee = get_block_by_x_value(legislative_text, 139).text.rstrip()
the_rest = ''.join([i.text for i in legislative_text[6:]]) the_rest = ''.join([i.text for i in legislative_text[6:]])
handled = self.handle_the_rest(the_rest) handled = self.handle_the_rest(the_rest)
title = handled["title"] title = handled["title"]
bill_text = handled["bill_text"] bill_text = handled["bill_text"]
codesplit = leg_code.split('/')
assembly = codesplit[0]
dashsplit = codesplit[1].split('-')
year = 2000 + int(dashsplit[0])
committee = int(dashsplit[1])
docket_order = int(dashsplit[2])
output.append({ output.append({
"code": leg_code, "assembly": assembly,
"year": year,
"committee": committee,
"docket_order": docket_order,
"category": category,
"country": None, # this is a yig bill
"school": school, "school": school,
"sponsors": sponsors, "sponsors": sponsors,
"subcommittee": subcommittee, "legislation_title": title,
"title": title, "text": bill_text
"bill_text": bill_text
}) })
self.output = output self.output = output
@ -287,8 +316,9 @@ def main():
return return
for text in doc.output: for text in doc.output:
print("{} ---------------------------- {}".format( print("{} {} {} ---------------------------- {}".format(
text["title"], text["bill_text"] text["country"], text["category"],
text["legislation_title"], text["text"]
)) ))
if __name__ == "__main__": if __name__ == "__main__":

View File

@ -0,0 +1,24 @@
# Generated by Django 4.2.12 on 2024-06-30 03:14
from django.db import migrations, models
class Migration(migrations.Migration):
    """Add the ``category`` and ``country`` fields to ``legislativetext``."""

    # Applies on top of the previous migration of the ``explorer`` app.
    dependencies = [
        ("explorer", "0006_remove_legislationclassification_obvious_change"),
    ]

    operations = [
        # Required text field. The empty-string default is supplied only for
        # this AddField operation (preserve_default=False).
        migrations.AddField(
            model_name="legislativetext",
            name="category",
            field=models.CharField(max_length=256, default=""),
            preserve_default=False,
        ),
        # Optional text field: declared with both null=True and blank=True.
        migrations.AddField(
            model_name="legislativetext",
            name="country",
            field=models.CharField(max_length=512, null=True, blank=True),
        ),
    ]

View File

@ -47,24 +47,7 @@ class LegislationBook(models.Model):
return return
for text in parsed.output: for text in parsed.output:
print(text["code"]) text = LegislativeText(**text, from_book=self)
codesplit = text["code"].split('/')
assembly = codesplit[0]
dashsplit = codesplit[1].split('-')
year = 2000 + int(dashsplit[0])
committee = int(dashsplit[1])
docket_order = int(dashsplit[2])
text = LegislativeText(
assembly=assembly,
year=year,
committee=committee,
docket_order=docket_order,
school=text["school"],
sponsors=text["sponsors"],
legislation_title=text["title"],
text=text["bill_text"],
from_book=self
)
text.save() text.save()
def __str__(self): def __str__(self):
@ -93,6 +76,7 @@ class LegislativeText(models.Model):
text = models.TextField() text = models.TextField()
year = models.IntegerField() year = models.IntegerField()
committee = models.IntegerField() committee = models.IntegerField()
category = models.CharField(max_length=256)
docket_order = models.IntegerField() docket_order = models.IntegerField()
school = models.CharField(max_length=256) school = models.CharField(max_length=256)
sponsors = models.CharField(max_length=256) sponsors = models.CharField(max_length=256)