# yig/franklincce/explorer/lib/HSYIG.py

import logging
from dataclasses import dataclass
from typing import ClassVar

import fitz

from .common import *

class HSYIG:
    """
    Parser that pulls the legislation out of a Youth in Government manual PDF.

    Constructing an instance runs the whole pipeline immediately. Afterwards
    ``self.output`` holds one dict per piece of legislation with the keys
    ``code``, ``school``, ``sponsors``, ``subcommittee``, ``title`` and
    ``bill_text``.
    """

    # Every one of these words must be found on a page for it to be
    # considered a committee section divider.
    section_page_words: ClassVar[list[str]] = [ "Committee", "YMCA", "Tennessee", "Youth", "in" ]
    # Words identifying the page that follows the last legislative page.
    last_page_words: ClassVar[list[str]] = [ "ABCs" ]
    # Text that marks the start of each piece of legislation. Kept as a class
    # attribute so a subclass can adapt it to a different year's manual.
    bill_header_text: ClassVar[str] = "71st General Assembly"

    def __init__(self, document: "fitz.Document"):
        """
        Keep a reference to ``document`` and parse it right away.
        """
        self.document = document
        self.__post_init__()

    def __post_init__(self):
        # run all the processing steps here
        self.parse_legislative_metablocks()

    def generate_section_markers(self) -> list[int]:
        """
        In the YIG/MUN manuals, there's section markers that delineate between the different
        committees within the manual. Let's find those, and then the last legislative page.

        Returns the page numbers of the marker pages, in document order.
        """
        log = logging.getLogger(__name__)
        section_pages: list[int] = []

        for page in self.document:
            # NOTE(review): the page text is handed to words_in_superstring as
            # UTF-8 bytes while the word lists are str -- confirm the helper
            # expects that before changing this.
            text = page.get_text().encode("utf8")
            is_section_page = words_in_superstring(
                words = self.section_page_words,
                superstring = text
            )
            is_last_page = words_in_superstring(
                words = self.last_page_words,
                superstring = text
            )

            # Debug-level logging instead of print(), so parsing a manual
            # does not dump every page onto stdout.
            log.debug(
                "page %s: is_section_page=%s is_last_page=%s",
                page.number, is_section_page, is_last_page
            )

            # A divider page must match the words AND carry exactly three images.
            if is_section_page and len(page.get_images()) == 3:
                section_pages.append(page.number)

            # The closing page only counts once more than two markers were seen.
            if is_last_page and len(section_pages) > 2:
                section_pages.append(page.number)

        return section_pages

    def get_legislative_pages(self) -> list[int]:
        """
        Generate the section markers, then fill in the pages between them.

        The marker pages themselves are excluded; fewer than two markers
        yields an empty list.
        """
        sections = self.generate_section_markers()
        legislative_pages: list[int] = []

        # Walk consecutive marker pairs and collect the pages strictly between them.
        for start, end in zip(sections, sections[1:]):
            legislative_pages.extend(range(start + 1, end))

        return legislative_pages

    def concat_blocks_for_leg_pages(self):
        """
        From the legislative pages, concatenate the "blocks" of text in the PDF.

        Returns a flat list of FitzBlockWrapper objects in page order.
        """
        blocks = []
        for page_num in self.get_legislative_pages():
            page = self.document.load_page(page_num)
            blocks.extend(FitzBlockWrapper(block) for block in page.get_text("blocks"))

        return blocks

    def split_leg_pages(self):
        """
        We have the collection of legislative page text blocks. We need
        to split them now. We split on the text "71st General Assembly...
        Youth in Government" (see ``bill_header_text``).
        """
        blocks = self.concat_blocks_for_leg_pages()
        marker = self.bill_header_text
        # each item within splitted is called a "legislative meta-block"
        splitted = split_by_lambda(blocks, lambda block: marker in block.text)

        return splitted[1:] # there's an empty array at the beginning

    def handle_the_rest(self, the_rest) -> dict[str, str]:
        """
        Split the text of a meta-block into its title and its bill text.

        The text is cut at every U+FFFD character. Everything before the
        first one, minus its final line, is joined (without separator) and
        stripped to form the title. From each later piece only the first
        line is kept, with its leading character dropped.

        Returns a dict with the keys ``bill_text`` and ``title``.
        """
        replacement_char = '\ufffd'
        pieces = the_rest.split(replacement_char)

        title_lines = pieces[0].split('\n')[:-1]
        title_content = ''.join(title_lines).strip()

        bill_lines = [piece.split('\n')[0][1:] for piece in pieces[1:]]

        return {
            "bill_text": '\n'.join(bill_lines),
            "title": title_content
        }

    def parse_legislative_metablocks(self):
        """
        Turn every legislative meta-block into a dict and store the list
        of them on ``self.output``.
        """
        output = []
        for legislative_text in self.split_leg_pages():
            # there are some blocks that contain just one value
            # and are aligned to some x value on the pdf

            # it's an easy way to extract stuff
            legislative_text = remove_block_by_x_value(legislative_text, 565) # remove page numbers
            leg_code = get_block_by_x_value(legislative_text, 88).text.rstrip()
            school = get_block_by_x_value(legislative_text, 163).text.rstrip()
            sponsors = get_block_by_x_value(legislative_text, 152).text.rstrip()
            subcommittee = get_block_by_x_value(legislative_text, 139).text.rstrip()

            # presumably the first six blocks are the header fields read
            # above -- TODO confirm against a real manual
            the_rest = ''.join([block.text for block in legislative_text[6:]])
            handled = self.handle_the_rest(the_rest)

            output.append({
                "code": leg_code,
                "school": school,
                "sponsors": sponsors,
                "subcommittee": subcommittee,
                "title": handled["title"],
                "bill_text": handled["bill_text"]
            })

        self.output = output
web interface version 1 There's a crappy frontend, but the Django admin panel works as expected. 2024-06-19 12:41:41 -05:00			`from .common import *`
			`from typing import ClassVar`
			`from dataclasses import dataclass`

			`import fitz`

			`class HSYIG():`
			`section_page_words = [ "Committee", "YMCA", "Tennessee", "Youth", "in" ]`
			`last_page_words = [ "ABCs" ]`

			`def __init__(self, document: fitz.Document):`
			`self.document = document`
			`self.__post_init__()`

			`def __post_init__(self):`
			`# run all the processing steps here`
			`self.parse_legislative_metablocks()`

			`def generate_section_markers(self) -> list[int]:`
			`"""`
			`In the YIG/MUN manuals, there's section markers that delineate between the different`
			`committees within the manual. Let's find those, and then the last legislative page.`
			`"""`
			`section_pages = []`

			`for page in self.document:`
			`text = page.get_text().encode("utf8")`
			`is_section_page = words_in_superstring(`
			`words = self.section_page_words,`
			`superstring = text`
			`)`
			`is_last_page = words_in_superstring(`
			`words = self.last_page_words,`
			`superstring = text`
			`)`

			`print(text, is_section_page, is_last_page)`

			`if is_section_page and len(page.get_images()) == 3:`
			`section_pages.append(page.number)`

			`if is_last_page and len(section_pages) > 2:`
			`section_pages.append(page.number)`

			`return section_pages`

			`def get_legislative_pages(self):`
			`"""`
			`Generate the section markers, then fill in the pages between them.`
			`"""`

			`current = 0`
			`sections = self.generate_section_markers()`
			`legislative_pages: list[int] = []`
			`try:`
			`while True:`
			`legislative_pages += list(`
			`range(`
			`sections[current] + 1,`
			`sections[current + 1],`
			`1`
			`)`
			`)`

			`current += 1`

			`except IndexError:`
			`pass`

			`return legislative_pages`

			`def concat_blocks_for_leg_pages(self):`
			`"""`
			`From the legislative pages, concatenate the "blocks" of text in the PDF.`
			`"""`
			`blocks = []`
			`pages = [self.document.load_page(page_num) for page_num in self.get_legislative_pages()]`
			`for page in pages:`
			`block_info = [FitzBlockWrapper(block) for block in page.get_text("blocks")]`

			`blocks += block_info`

			`return blocks`

			`def split_leg_pages(self):`
			`"""`
			`We have the collection of legislative page text blocks. We need`
			`to split them now. We split on the text "71st General Assembly...`
			`Youth in Government"`
			`"""`

			`blocks = self.concat_blocks_for_leg_pages()`
			`# each item within splitted is called a "legislative meta-block"`
			`splitted = split_by_lambda(blocks, lambda x: "71st General Assembly" in x.text)`

			`return splitted[1:] # there's an empty array at the beginning`

			`def handle_the_rest(self, the_rest):`
			`weird_character = u'\uFFFd'`
			`splitted_by_weird = the_rest.split(weird_character)`
			`title_content = ''.join(`
			`splitted_by_weird[0].split('\n')[:-1]`
			`).rstrip().lstrip()`

			`bill_text = [i.split('\n')[0][1:] for i in splitted_by_weird[1:]]`

			`return {`
			`"bill_text": '\n'.join(bill_text),`
			`"title": title_content`
			`}`

			`def parse_legislative_metablocks(self):`
			`output = []`
			`splitted = self.split_leg_pages()`
			`for legislative_text in splitted:`
			`# there are some blocks that contain just one value`
			`# and are aligned to some x value on the pdf`

			`# it's an easy way to extract stuff`
			`legislative_text = remove_block_by_x_value(legislative_text, 565) # remove page numbers`
			`leg_code = get_block_by_x_value(legislative_text, 88).text.rstrip()`
			`school = get_block_by_x_value(legislative_text, 163).text.rstrip()`
			`sponsors = get_block_by_x_value(legislative_text, 152).text.rstrip()`
			`subcommittee = get_block_by_x_value(legislative_text, 139).text.rstrip()`
			`the_rest = ''.join([i.text for i in legislative_text[6:]])`
			`handled = self.handle_the_rest(the_rest)`
			`title = handled["title"]`
			`bill_text = handled["bill_text"]`

			`output.append({`
			`"code": leg_code,`
			`"school": school,`
			`"sponsors": sponsors,`
			`"subcommittee": subcommittee,`
			`"title": title,`
			`"bill_text": bill_text`
			`})`

			`self.output = output`