fix bill parsing in the main parser
This commit is contained in:
parent
11fbcb474a
commit
dbd9632e16
23
parsers.py
23
parsers.py
|
@ -155,20 +155,27 @@ class HSYIGPdfParser:
|
||||||
|
|
||||||
def _pretty_print_bill_text(self, bill_text: str):
|
def _pretty_print_bill_text(self, bill_text: str):
|
||||||
replaced = bill_text.replace("<EFBFBD>", "\n")
|
replaced = bill_text.replace("<EFBFBD>", "\n")
|
||||||
|
replaced = bill_text
|
||||||
replaced = replaced.split('\n')
|
replaced = replaced.split('\n')
|
||||||
|
replaced = [
|
||||||
replaced = [i.rstrip().lstrip() for i in replaced]
|
i \
|
||||||
|
.replace('<EFBFBD>', ' ') \
|
||||||
|
.rstrip() \
|
||||||
|
.lstrip() \
|
||||||
|
for i in replaced
|
||||||
|
]
|
||||||
|
|
||||||
first_line_number = self._find_first_line_number(replaced)
|
first_line_number = self._find_first_line_number(replaced)
|
||||||
|
title = ' '.join(replaced[:(first_line_number - 1)])
|
||||||
title = ' '.join(replaced[:first_line_number])
|
title = ' '.join(title.split()) # remove double spaces
|
||||||
rebuilt = replaced[first_line_number:][1::2]
|
rebuilt = replaced[first_line_number:][1::2]
|
||||||
|
# remove the last line number, it doesn't have a corresponding space at the end
|
||||||
|
rebuilt = rebuilt[:-1]
|
||||||
|
|
||||||
|
# remove the first line, as it's the whitespace between the title and the bill text
|
||||||
|
rebuilt = rebuilt[1:]
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"title": title.lstrip(),
|
"title": title.lstrip(),
|
||||||
"bill_array": rebuilt
|
"bill_array": rebuilt
|
||||||
}
|
}
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_filename(cls, filename: str) -> Any: # TODO: fix this so it shows PdfParser
|
|
||||||
return cls(fitz.open(filename))
|
|
||||||
|
|
Loading…
Reference in New Issue