Compare commits

...

2 Commits

Author SHA1 Message Date
stupidcomputer b8e65edccd add a simple frontend that works via console.log 2024-05-09 00:50:11 -05:00
stupidcomputer a4809fa95a npm boilerplate 2024-05-08 17:58:14 -05:00
2 changed files with 127 additions and 0 deletions

11
index.html Normal file
View File

@ -0,0 +1,11 @@
<html>
<body>
<!-- pdf.js library, loaded as an ES module from mozilla.github.io. -->
<script src="https://mozilla.github.io/pdf.js/build/pdf.mjs" type="module"></script>
<!-- Application code; defines onFileUpload(), which the button below invokes. -->
<script src="main.js"></script>
<!-- Empty inline script block; it currently contains no code. -->
<script>
</script>
<!-- Empty element; NOTE(review): presumably reserved for status output, but main.js does not write to it yet. -->
<div id="message"></div>
<!-- File picker; main.js reads the first selected file from this element by its id. -->
<input type="file" id="fileupload">
<!-- Starts reading and parsing the selected file. -->
<button onclick="onFileUpload()">click me</button>
</body>
</html>

116
main.js Normal file
View File

@ -0,0 +1,116 @@
// Handle to the pdf.js namespace object, read from the global scope.
// NOTE(review): this is deliberately left as a top-level `var`; in a classic
// script such a binding shares the global property of the same name, so it
// presumably picks up the value the pdf.js module publishes later. Confirm
// before switching it to `const`/`let`.
var pdfjsLib = globalThis.pdfjsLib;
/**
 * Join the `str` field of every text item into a single string.
 *
 * @param {Array<{str: string}>} lines - Text items (objects exposing `str`).
 * @returns {string} All `str` values concatenated in order, with no separator.
 */
function concatLines(lines) {
  // Declared locally: assigning to an undeclared `output` created an implicit
  // global shared with the other helpers (and throws in strict/module code).
  let output = "";
  for (const item of lines) {
    output += item.str;
  }
  return output;
}
/**
 * List the page indices that lie between the first and last entry of `pages`
 * but are not themselves listed in `pages`.
 *
 * @param {Array<number>} pages - Page indices; the first entry is the
 *   (inclusive) start of the range and the last entry is the (exclusive) end.
 * @returns {Array<number>} Ascending indices in that range that are absent
 *   from `pages`. Empty when `pages` is empty or the range is empty.
 */
function fillInCommitteePages(pages) {
  // A Set gives constant-time membership checks instead of rescanning
  // `pages` for every candidate index.
  const listed = new Set(pages);
  const first = pages[0];
  const last = pages[pages.length - 1];
  const output = [];
  for (let i = first; i < last; i++) {
    if (!listed.has(i)) {
      output.push(i);
    }
  }
  return output;
}
/**
 * Gather the text items of the selected pages into one flat array.
 *
 * @param {Array<number>} committeePages - Indices into `allLines`.
 * @param {Array<Array<Object>>} allLines - Per-page arrays of text items.
 * @returns {Array<Object>} Items of the selected pages, in the order the
 *   page indices are given.
 */
function concatAllCommitteePages(committeePages, allLines) {
  // Local accumulator; previously this leaked into an implicit global.
  let output = [];
  for (const pageIndex of committeePages) {
    output = output.concat(allLines[pageIndex]);
  }
  return output;
}
/**
 * Split a flat list of text items into groups, using every item whose `str`
 * equals `billHeader` as a delimiter.
 *
 * The delimiter items themselves are not included in any group. Items that
 * appear before the first delimiter are discarded, and items after the last
 * delimiter are only emitted if another delimiter follows them.
 *
 * @param {Array<{str: string}>} committeeLines - Text items to split.
 * @param {string} billHeader - Exact `str` value that marks a boundary.
 * @returns {Array<Array<{str: string}>>} Groups of items found between
 *   consecutive delimiters.
 */
function splitByBillHeader(committeeLines, billHeader) {
  // Both accumulators are local now; they used to be implicit globals.
  const output = [];
  let current = [];
  for (const item of committeeLines) {
    if (item.str === billHeader) {
      output.push(current);
      current = [];
    } else {
      current.push(item);
    }
  }
  // The first group holds whatever preceded the first header; drop it.
  output.shift();
  return output;
}
/**
 * Build one summary record per bill from its list of text items.
 *
 * Fields are read from fixed positions inside each bill's item list.
 * NOTE(review): the positions presumably mirror the layout of the source
 * document's bill pages; verify them against a real document.
 *
 * @param {Array<Array<{str: string}>>} splittedByBillHeader - One array of
 *   text items per bill; each must contain at least 36 items.
 * @returns {Array<{subcommittee: string, sponsors: string, school: string,
 *   billcode: string}>} One record per bill, in input order.
 * @throws {TypeError} If a bill has no item at one of the expected positions.
 */
function extractBillInformation(splittedByBillHeader) {
  // Item positions of the fields within a single bill.
  const SUBCOMMITTEE_INDEX = 31;
  const SPONSORS_INDEX = 33;
  const SCHOOL_INDEX = 35;
  const BILLCODE_INDEX = 5;

  const output = [];
  // `current` is scoped to the loop; it used to leak as an implicit global.
  for (const current of splittedByBillHeader) {
    console.log(current);
    output.push({
      subcommittee: current[SUBCOMMITTEE_INDEX].str,
      sponsors: current[SPONSORS_INDEX].str,
      school: current[SCHOOL_INDEX].str,
      billcode: current[BILLCODE_INDEX].str,
    });
  }
  return output;
}
/**
 * Locate the committee section of a document, split it into bills and log
 * the extracted bill records.
 *
 * A page whose concatenated text contains both "COMMITTEE" and "GOVERNMENT"
 * is treated as a committee page; otherwise, a page containing "ABCs" is
 * recorded as the end page (the last such page wins). The pages between
 * those markers are merged, split on the text of their first item, and
 * passed on for field extraction.
 *
 * @param {Array<Array<{str: string}>>} lines - Per-page arrays of text items.
 * @returns {Array<Object>} The extracted bill records (also logged). Empty
 *   when the required marker pages or any content between them are missing.
 */
function processLines(lines) {
  const committeePages = [];
  let endPage = null;
  for (let i = 0; i < lines.length; i++) {
    const pageText = concatLines(lines[i]);
    if (pageText.includes("COMMITTEE") && pageText.includes("GOVERNMENT")) {
      // we have a committee page
      committeePages.push(i);
    } else if (pageText.includes("ABCs")) {
      endPage = i;
    }
  }

  // Without both markers there is no page range to work on; bail out instead
  // of failing later with an opaque TypeError.
  if (committeePages.length === 0 || endPage === null) {
    console.warn("processLines: committee pages or end page not found");
    return [];
  }
  committeePages.push(endPage);

  const committeeLines = concatAllCommitteePages(
    fillInCommitteePages(committeePages),
    lines
  );
  if (committeeLines.length === 0) {
    console.warn("processLines: no content between the committee pages");
    return [];
  }

  // The first item of the section is used as the per-bill delimiter. A copy
  // is appended so the final bill is closed off by a trailing delimiter.
  const billHeader = committeeLines[0].str;
  committeeLines.push({ str: billHeader });
  const splittedByBillHeader = splitByBillHeader(committeeLines, billHeader);
  const billInfo = extractBillInformation(splittedByBillHeader);
  console.log(billInfo);
  return billInfo;
}
/**
 * Click handler: read the file chosen in the `fileupload` input, extract the
 * text items of every page with pdf.js and hand them to `processLines`.
 *
 * Failures (no file chosen, unreadable file, PDF parsing errors) are reported
 * with `console.error` instead of surfacing as uncaught errors or unhandled
 * promise rejections.
 */
function onFileUpload() {
  const file = document.getElementById("fileupload").files[0];
  if (!file) {
    console.error("No file selected.");
    return;
  }

  const reader = new FileReader();
  reader.onerror = function () {
    console.error("Could not read the selected file:", reader.error);
  };
  reader.onload = function (evt) {
    // Pass the bytes as a typed array rather than a binary string.
    const contents = new Uint8Array(evt.target.result);
    // Explicit https, matching how index.html loads the library itself; a
    // protocol-relative URL does not resolve when the page is opened from disk.
    pdfjsLib.GlobalWorkerOptions.workerSrc =
      "https://mozilla.github.io/pdf.js/build/pdf.worker.mjs";

    pdfjsLib
      .getDocument({ data: contents })
      .promise.then(function (pdf) {
        const pagePromises = [];
        // The loop starts at 1 and runs through `numPages` inclusive.
        for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
          pagePromises.push(
            pdf
              .getPage(pageNumber)
              .then(function (page) {
                return page.getTextContent();
              })
              .then(function (text) {
                return text.items;
              })
          );
        }
        // Returned so that a failure on any page reaches the catch below.
        return Promise.all(pagePromises);
      })
      .then(function (lines) {
        processLines(lines);
      })
      .catch(function (err) {
        console.error("Failed to extract text from the PDF:", err);
      });
  };
  reader.readAsArrayBuffer(file);
}