from dataclasses import dataclass
import logging
from typing import ClassVar

import fitz

from .common import *

_logger = logging.getLogger(__name__)


class HSYIG:
    """Extract legislation records from a YIG/MUN conference manual PDF.

    Constructing an instance immediately runs the whole pipeline (see
    ``__post_init__``). The result is stored on ``self.output`` as a list of
    dicts with the keys ``code``, ``school``, ``sponsors``, ``subcommittee``,
    ``title`` and ``bill_text``.

    The class-level constants below describe one particular manual layout;
    override them on a subclass (or on the class) for a different layout.
    """

    # Every one of these words must be found on a page for it to be treated
    # as a committee section divider.
    section_page_words: ClassVar[list[str]] = [
        "Committee",
        "YMCA",
        "Tennessee",
        "Youth",
        "in",
    ]
    # Every one of these words must be found on the page that closes the
    # legislative part of the manual.
    last_page_words: ClassVar[list[str]] = [
        "ABCs",
    ]

    # A section divider page must carry exactly this many images.
    section_page_image_count: ClassVar[int] = 3
    # A closing page is only recorded once more than this many markers exist.
    min_markers_before_last_page: ClassVar[int] = 2

    # Text that starts a new piece of legislation. It is session specific,
    # so it is kept as an overridable attribute.
    metablock_marker: ClassVar[str] = "71st General Assembly"

    # x values handed to the ``*_block_by_x_value`` helpers.
    # NOTE(review): presumably the left edge of each block in PDF points;
    # confirm against the helpers in ``.common``.
    page_number_x: ClassVar[int] = 565
    leg_code_x: ClassVar[int] = 88
    school_x: ClassVar[int] = 163
    sponsors_x: ClassVar[int] = 152
    subcommittee_x: ClassVar[int] = 139

    # Number of leading blocks in a meta-block that are skipped before the
    # title and bill text are read.
    header_block_count: ClassVar[int] = 6

    # Filled in by parse_legislative_metablocks().
    output: list[dict[str, str]]

    def __init__(self, document: fitz.Document):
        """Store *document* and parse it straight away."""
        self.document = document
        self.__post_init__()

    def __post_init__(self):
        """Run all the processing steps."""
        self.parse_legislative_metablocks()

    def generate_section_markers(self) -> list[int]:
        """Return the page numbers that delimit the legislative sections.

        In the YIG/MUN manuals there are section marker pages that separate
        the committees. This collects those pages and then the page(s) that
        match ``last_page_words``.

        Returns:
            Page numbers in document order.
        """
        section_pages: list[int] = []

        for page in self.document:
            # NOTE(review): the text is handed over as UTF-8 bytes while the
            # word lists are str; whether that is correct depends on
            # words_in_superstring in ``.common`` -- confirm there.
            text = page.get_text().encode("utf8")
            is_section_page = words_in_superstring(
                words=self.section_page_words,
                superstring=text,
            )
            is_last_page = words_in_superstring(
                words=self.last_page_words,
                superstring=text,
            )
            _logger.debug(
                "page %s: section_page=%s last_page=%s",
                page.number,
                is_section_page,
                is_last_page,
            )

            if (
                is_section_page
                and len(page.get_images()) == self.section_page_image_count
            ):
                section_pages.append(page.number)
            if (
                is_last_page
                and len(section_pages) > self.min_markers_before_last_page
            ):
                section_pages.append(page.number)

        return section_pages

    def get_legislative_pages(self) -> list[int]:
        """Return every page number lying strictly between adjacent markers.

        The marker pages themselves are excluded. Fewer than two markers
        yields an empty list.
        """
        sections = self.generate_section_markers()
        legislative_pages: list[int] = []
        # Walk the markers pairwise: (m0, m1), (m1, m2), ...
        for start, end in zip(sections, sections[1:]):
            legislative_pages.extend(range(start + 1, end))
        return legislative_pages

    def concat_blocks_for_leg_pages(self):
        """Return the wrapped text blocks of all legislative pages, in order.

        Each raw block from ``page.get_text("blocks")`` is wrapped in a
        ``FitzBlockWrapper``.
        """
        blocks = []
        for page_num in self.get_legislative_pages():
            page = self.document.load_page(page_num)
            blocks.extend(
                FitzBlockWrapper(block) for block in page.get_text("blocks")
            )
        return blocks

    def split_leg_pages(self):
        """Split the legislative blocks into one group per bill.

        The split happens at every block whose text contains
        ``metablock_marker`` (by default "71st General Assembly", the
        heading line above each bill). Each resulting group is called a
        "legislative meta-block".
        """
        blocks = self.concat_blocks_for_leg_pages()
        marker = self.metablock_marker
        splitted = split_by_lambda(blocks, lambda block: marker in block.text)
        # The first group is dropped: there's an empty array at the beginning.
        return splitted[1:]

    def handle_the_rest(self, the_rest):
        """Separate a bill's title from its body text.

        *the_rest* is split on U+FFFD (the Unicode replacement character).
        The first piece, minus its final line, becomes the title; its lines
        are joined without a separator and surrounding whitespace is
        stripped. From each later piece only the first line is kept, with
        its first character removed.

        Args:
            the_rest: Concatenated text following a bill's header blocks.

        Returns:
            A dict with the keys ``"bill_text"`` (lines joined by newlines)
            and ``"title"``.
        """
        weird_character = "\ufffd"
        head, *body_pieces = the_rest.split(weird_character)

        title_content = "".join(head.split("\n")[:-1]).strip()
        bill_lines = [piece.split("\n")[0][1:] for piece in body_pieces]

        return {
            "bill_text": "\n".join(bill_lines),
            "title": title_content,
        }

    def parse_legislative_metablocks(self):
        """Parse every legislative meta-block and store the result.

        Sets ``self.output`` to a list with one dict per bill. Returns
        nothing.
        """
        output = []
        for metablock in self.split_leg_pages():
            # Some blocks contain just one value and are aligned to a fixed
            # x value on the PDF, which makes them easy to pick out.
            # First, remove page numbers.
            cleaned = remove_block_by_x_value(metablock, self.page_number_x)

            leg_code = get_block_by_x_value(
                cleaned, self.leg_code_x
            ).text.rstrip()
            school = get_block_by_x_value(
                cleaned, self.school_x
            ).text.rstrip()
            sponsors = get_block_by_x_value(
                cleaned, self.sponsors_x
            ).text.rstrip()
            subcommittee = get_block_by_x_value(
                cleaned, self.subcommittee_x
            ).text.rstrip()

            # Everything after the leading header blocks is title + bill text.
            the_rest = "".join(
                block.text for block in cleaned[self.header_block_count:]
            )
            handled = self.handle_the_rest(the_rest)

            output.append({
                "code": leg_code,
                "school": school,
                "sponsors": sponsors,
                "subcommittee": subcommittee,
                "title": handled["title"],
                "bill_text": handled["bill_text"],
            })

        self.output = output