# yig/franklincce/explorer/leglib.py

from typing import Any, ClassVar
from dataclasses import dataclass

import fitz

class FitzBlockWrapper:
    """
    Attribute-style view of one block tuple as produced by
    ``page.get_text("blocks")``.

    ``block`` must unpack into exactly seven values, in this order:
    ``x0, y0, x1, y1, text, block_number, block_type``. Everything except
    ``text`` is converted with ``int()`` (which truncates floats), so that
    positions can be compared against whole-number x values. ``text`` is
    stored unchanged.

    Raises ``ValueError`` if ``block`` does not hold exactly seven items.
    """

    def __init__(self, block):
        x0, y0, x1, y1, self.text, block_number, block_type = block

        self.x0 = int(x0)
        self.y0 = int(y0)
        self.x1 = int(x1)
        self.y1 = int(y1)
        self.block_number = int(block_number)
        self.block_type = int(block_type)

    def __str__(self):
        """Return the tuple ``(x0, y0, x1, y1, text)`` rendered as a string."""
        return str((self.x0, self.y0, self.x1, self.y1, self.text))

    def __repr__(self):
        # Containers and the debugger use repr(), so mirror __str__ to keep
        # lists of blocks readable when printed.
        return self.__str__()

def words_in_superstring(words: list[str], superstring: str) -> bool:
    """
    Report whether every entry of ``words`` occurs in ``superstring``.

    The comparison is a case-insensitive substring search. An empty
    ``words`` list is trivially satisfied and yields ``True``.

    ``superstring`` may also be ``bytes``/``bytearray``; it is then decoded
    as UTF-8 (undecodable bytes are replaced) so the search runs on the
    actual text rather than on the ``b'...'`` repr that ``str()`` would give.
    """
    if isinstance(superstring, (bytes, bytearray)):
        haystack = bytes(superstring).decode("utf8", errors="replace")
    else:
        haystack = str(superstring)

    # lower-case the haystack once instead of once per word
    haystack = haystack.lower()
    return all(str(word).lower() in haystack for word in words)

def split_by_lambda(arr: list[Any], func):
    """
    Break ``arr`` into consecutive groups, cutting at every item for which
    ``func(item)`` is truthy.

    The items that trigger a cut are dropped. The result always holds one
    more group than there were cuts, so leading, trailing or adjacent
    separators produce empty groups.
    """
    groups = [[]]
    for element in arr:
        if func(element):
            # separator: open a new group and discard the separator itself
            groups.append([])
            continue
        groups[-1].append(element)

    return groups

def get_block_by_x_value(arr: list[FitzBlockWrapper], xvalue: int) -> FitzBlockWrapper:
    """
    Return the first block in ``arr`` whose ``x0`` equals ``xvalue``.

    When no block matches, ``None`` is returned.
    """
    return next((block for block in arr if block.x0 == xvalue), None)

def remove_block_by_x_value(arr: list[FitzBlockWrapper], xvalue: int) -> list[FitzBlockWrapper]:
    """
    Return a new list holding the blocks of ``arr`` whose ``x0`` differs
    from ``xvalue``; ``arr`` itself is left untouched and order is kept.
    """
    kept = []
    for block in arr:
        if block.x0 == xvalue:
            continue
        kept.append(block)
    return kept

class CCEParserBase():
    """
    Shared machinery for pulling legislation out of a conference manual PDF.

    Subclasses provide the three class-level settings declared below.
    Creating an instance parses ``document`` right away and leaves the
    result on ``self.output``: a list of dicts with the keys ``code``,
    ``school``, ``sponsors``, ``subcommittee``, ``title`` and ``bill_text``.
    """
    # words looked for (via words_in_superstring) to recognise a section page
    section_page_words: ClassVar[list[str]]
    # words looked for to recognise the page that closes the last section
    last_page_words: ClassVar[list[str]]
    # a block whose text contains this string starts a new meta-block
    split_leg_pages_needle: ClassVar[str]

    # placeholder stored when a header field cannot be located
    _UNKNOWN_FIELD = "you tell me, man"

    def __init__(self, document: fitz.Document):
        self.document = document
        self.__post_init__()

    def __post_init__(self):
        # run all the processing steps here
        self.parse_legislative_metablocks()

    def generate_section_markers(self) -> list[int]:
        """
        In the YIG/MUN manuals, there's section markers that delineate between the different
        committees within the manual. Let's find those, and then the last legislative page.

        Returns the page numbers of the marker pages in document order. A
        page matching ``last_page_words`` is only recorded once more than
        two markers have been collected.
        """
        section_pages = []

        for page in self.document:
            # Search the text itself: str() of encoded bytes would be the
            # b'...' repr with escape sequences, not the page content.
            text = page.get_text()
            is_section_page = words_in_superstring(
                words=self.section_page_words,
                superstring=text
            )
            is_last_page = words_in_superstring(
                words=self.last_page_words,
                superstring=text
            )

            if is_section_page:
                section_pages.append(page.number)

            if is_last_page and len(section_pages) > 2:
                section_pages.append(page.number)

        return section_pages

    def get_legislative_pages(self):
        """
        Generate the section markers, then fill in the pages between them.

        For each pair of neighbouring markers the page numbers strictly
        between them are collected; the marker pages themselves are left
        out. Fewer than two markers gives an empty list.
        """
        sections = self.generate_section_markers()
        legislative_pages: list[int] = []
        for start, end in zip(sections, sections[1:]):
            legislative_pages.extend(range(start + 1, end))

        return legislative_pages

    def concat_blocks_for_leg_pages(self):
        """
        From the legislative pages, concatenate the "blocks" of text in the PDF.

        Returns one flat list of FitzBlockWrapper objects in page order.
        """
        blocks = []
        for page_num in self.get_legislative_pages():
            page = self.document.load_page(page_num)
            blocks.extend(
                FitzBlockWrapper(block) for block in page.get_text("blocks")
            )

        return blocks

    def split_leg_pages(self):
        """
        We have the collection of legislative page text blocks. We need
        to split them now. We split on every block whose text contains
        ``split_leg_pages_needle``; those blocks themselves are dropped.

        Returns a list of "legislative meta-blocks" (lists of blocks).
        """
        blocks = self.concat_blocks_for_leg_pages()
        # each item within splitted is called a "legislative meta-block"
        splitted = split_by_lambda(
            blocks, lambda x: self.split_leg_pages_needle in x.text
        )

        # the first group is whatever came before the first needle block
        # (expected to be empty), so it is discarded
        return splitted[1:]

    def handle_the_rest(self, the_rest):
        """
        Split the text that follows the header blocks into a title and
        the bill text.

        Every ``weird_character`` is first turned into U+FFFD and the text is
        then split on U+FFFD. The first piece becomes the title: its lines
        minus the last one, joined without separator and stripped. From each
        later piece only the first line is kept, minus its first character;
        these lines are joined with newlines to form the bill text.

        Returns a dict with the keys ``bill_text`` and ``title``.
        """
        # NOTE(review): in this view the literal below shows as an empty
        # string. With an empty pattern str.replace() inserts the replacement
        # between every character - confirm the file really holds a
        # (possibly non-printing) character here.
        weird_character = u''
        another_weird_character = u'\uFFFd'
        splitted_by_weird = the_rest.replace(weird_character, another_weird_character).split(another_weird_character)
        title_content = ''.join(
            splitted_by_weird[0].split('\n')[:-1]
        ).rstrip().lstrip()

        bill_text = [i.split('\n')[0][1:] for i in splitted_by_weird[1:]]

        return {
            "bill_text": '\n'.join(bill_text),
            "title": title_content
        }

    def _first_block_text(self, blocks, x_values):
        """
        Return the right-stripped text of the first block found at any of
        ``x_values`` (tried in order), or the placeholder if none matches.
        """
        for x_value in x_values:
            block = get_block_by_x_value(blocks, x_value)
            if block is not None:
                return block.text.rstrip()

        return self._UNKNOWN_FIELD

    def parse_legislative_metablocks(self):
        """
        Turn every legislative meta-block into a dict and store the list
        on ``self.output``.

        Raises ``AttributeError`` when a meta-block has no block at x0 == 88,
        because the legislation code has no fallback.
        """
        output = []
        for legislative_text in self.split_leg_pages():
            # there are some blocks that contain just one value
            # and are aligned to some x value on the pdf

            # it's an easy way to extract stuff
            leg_code = get_block_by_x_value(legislative_text, 88).text.rstrip()

            # each of these fields is looked up at two alternative x values
            school = self._first_block_text(legislative_text, (177, 186))
            sponsors = self._first_block_text(legislative_text, (163, 166))
            subcommittee = self._first_block_text(legislative_text, (151, 153))

            # presumably the first 12 blocks form the header - TODO confirm
            the_rest = ''.join(i.text for i in legislative_text[12:])
            handled = self.handle_the_rest(the_rest)

            output.append({
                "code": leg_code,
                "school": school,
                "sponsors": sponsors,
                "subcommittee": subcommittee,
                "title": handled["title"],
                "bill_text": handled["bill_text"]
            })

        self.output = output


class HSMUN23(CCEParserBase):
    """
    Settings for the MUN manual; all parsing behaviour is inherited.
    """
    section_page_words = [
        "Committee",
        "Model",
        "United",
        "YMCA",
        "Tennessee",
        "Nations",
    ]
    last_page_words = ["ABCs"]
    split_leg_pages_needle = "43rd General Assembly"

class HSYIG24(CCEParserBase):
    """
    Settings and overrides for the YIG (Youth in Government) manual.

    Section pages must additionally carry exactly three images, and the
    header fields sit at different x positions than in the base class.
    """
    section_page_words = [ "Committee", "YMCA", "Tennessee", "Youth", "in" ]
    last_page_words = [ "ABCs" ]
    split_leg_pages_needle = "71st General Assembly"

    def generate_section_markers(self) -> list[int]:
        """
        This overrides the regular method because we need to check
        for three images on a section page

        Returns the page numbers of the marker pages in document order. A
        page matching ``last_page_words`` is only recorded once more than
        two markers have been collected.
        """
        section_pages = []

        for page in self.document:
            # Search the text itself: str() of encoded bytes would be the
            # b'...' repr with escape sequences, not the page content.
            text = page.get_text()
            is_section_page = words_in_superstring(
                words=self.section_page_words,
                superstring=text
            )
            is_last_page = words_in_superstring(
                words=self.last_page_words,
                superstring=text
            )

            if is_section_page and len(page.get_images()) == 3:
                section_pages.append(page.number)

            if is_last_page and len(section_pages) > 2:
                section_pages.append(page.number)

        return section_pages

    def parse_legislative_metablocks(self):
        """
        This is YIG specific code

        Turns every legislative meta-block into a dict and stores the list
        on ``self.output``. Raises ``AttributeError`` when a meta-block lacks
        a block at one of the expected x positions; no field has a fallback.
        """
        output = []
        for legislative_text in self.split_leg_pages():
            # there are some blocks that contain just one value
            # and are aligned to some x value on the pdf

            # it's an easy way to extract stuff
            legislative_text = remove_block_by_x_value(legislative_text, 565) # remove page numbers
            leg_code = get_block_by_x_value(legislative_text, 88).text.rstrip()
            school = get_block_by_x_value(legislative_text, 163).text.rstrip()
            sponsors = get_block_by_x_value(legislative_text, 152).text.rstrip()
            subcommittee = get_block_by_x_value(legislative_text, 139).text.rstrip()

            # presumably the first 6 remaining blocks form the header - TODO confirm
            the_rest = ''.join(i.text for i in legislative_text[6:])
            handled = self.handle_the_rest(the_rest)

            output.append({
                "code": leg_code,
                "school": school,
                "sponsors": sponsors,
                "subcommittee": subcommittee,
                "title": handled["title"],
                "bill_text": handled["bill_text"]
            })

        self.output = output

def main():
    """
    Command-line entry point: ``<script> <pdf-path> <HSYIG|HSMUN>``.

    Parses the given PDF with the parser selected by the second argument
    and prints the title and bill text of every piece of legislation.
    Prints a message and returns without opening the PDF when arguments
    are missing or the book name is not recognised.
    """
    # imported locally so main() also works when this module is imported
    # and the __main__ guard (which imports sys) never ran
    import sys

    argv = sys.argv
    if len(argv) < 3:
        program = argv[0] if argv else "leglib.py"
        print(
            "usage: {} <pdf-path> <HSYIG|HSMUN>".format(program),
            file=sys.stderr
        )
        return

    # pick the parser before touching the file, so a bad book name
    # does not open a document that is never used
    book = argv[2]
    if book == "HSYIG":
        parser_class = HSYIG24
    elif book == "HSMUN":
        parser_class = HSMUN23
    else:
        print("nonvalid book thing")
        return

    parsed = parser_class(fitz.open(argv[1]))

    for text in parsed.output:
        print("{} ---------------------------- {}".format(
            text["title"], text["bill_text"]
        ))

# Script entry point: only runs when the file is executed directly.
if __name__ == "__main__":
    # make sys available at module level, then hand over to main()
    import sys

    main()