add a simple frontend that works via console.log
This commit is contained in:
parent
a4809fa95a
commit
b8e65edccd
|
@ -0,0 +1,11 @@
|
|||
<html>
|
||||
<body>
|
||||
<script src="https://mozilla.github.io/pdf.js/build/pdf.mjs" type="module"></script>
|
||||
<script src="main.js"></script>
|
||||
<script>
|
||||
</script>
|
||||
<div id="message"></div>
|
||||
<input type="file" id="fileupload">
|
||||
<button onclick="onFileUpload()">click me</button>
|
||||
</body>
|
||||
</html>
|
|
@ -0,0 +1,116 @@
|
|||
// Grab the pdf.js API object from the global scope (presumably published there
// by the pdf.mjs <script> on the page — confirm).
// NOTE(review): this destructuring runs once, when this file is evaluated. If
// pdf.mjs is loaded as a deferred `type="module"` script and this file as a
// classic script, `globalThis.pdfjsLib` may still be undefined at this point,
// leaving `pdfjsLib` undefined for the lifetime of the page — verify load order.
var {pdfjsLib} = globalThis;
|
||||
|
||||
/**
 * Join the text of every text item on a page into a single string.
 *
 * @param {Array<{str: string}>} lines - Text items for one page; only the
 *     `str` property of each item is read.
 * @returns {string} Every `str` value concatenated in order, with no separator.
 */
function concatLines(lines) {
    // Declared locally: the previous bare assignment created an implicit
    // global `output` (and is a ReferenceError in strict/module code).
    let output = "";
    for (const line of lines) {
        output += line.str;
    }
    return output;
}
|
||||
|
||||
/**
 * List the page indices that lie strictly between the first and last entries
 * of `pages` and are not themselves listed in `pages`.
 *
 * @param {Array<number|null>} pages - Marker page indices in ascending order.
 *     The first entry is the (exclusive) start and the last entry is the
 *     (exclusive) end of the range that is scanned.
 * @returns {number[]} The unlisted indices in ascending order; empty when
 *     `pages` is empty or when the last entry is not greater than the first
 *     (a `null` last entry compares as 0, so it yields no pages).
 */
function fillInCommitteePages(pages) {
    // A Set gives O(1) membership tests instead of rescanning `pages` for
    // every candidate index.
    const markerPages = new Set(pages);
    const output = [];
    for (let i = pages[0]; i < pages[pages.length - 1]; i++) {
        if (!markerPages.has(i)) {
            output.push(i);
        }
    }
    return output;
}
|
||||
|
||||
/**
 * Collect the text items of the selected pages into one flat array.
 *
 * @param {number[]} committeePages - Indices into `allLines`, in the order
 *     their items should appear in the result.
 * @param {Array<Array<object>>} allLines - Per-page arrays of text items.
 * @returns {Array<object>} The items of every selected page, concatenated in
 *     order. The input arrays are not modified.
 */
function concatAllCommitteePages(committeePages, allLines) {
    // flatMap flattens exactly one level, matching repeated Array#concat,
    // and avoids the implicit global accumulator the loop version used.
    return committeePages.flatMap((pageIndex) => allLines[pageIndex]);
}
|
||||
|
||||
/**
 * Split a flat run of text items into groups, using items whose text equals
 * `billHeader` as separators.
 *
 * A group is emitted each time a header item is encountered, and the first
 * emitted group (everything before the first header) is discarded. Items
 * that follow the final header are therefore NOT returned unless the caller
 * appends one more header item to the end of `committeeLines`.
 *
 * @param {Array<{str: string}>} committeeLines - Text items to split.
 * @param {string} billHeader - Exact text that marks the start of a bill.
 * @returns {Array<Array<{str: string}>>} One array of items per bill; header
 *     items themselves are not included.
 */
function splitByBillHeader(committeeLines, billHeader) {
    // Both accumulators are local; the bare assignments used before leaked
    // `output` and `current` into the global scope.
    const output = [];
    let current = [];
    for (const line of committeeLines) {
        if (line.str === billHeader) {
            output.push(current);
            current = [];
        } else {
            current.push(line);
        }
    }
    // Drop whatever preceded the first header.
    output.shift();
    return output;
}
|
||||
|
||||
/**
 * Pull the known fields out of each bill's run of text items.
 *
 * Fields are read from fixed positions within each bill's item array, so
 * this only works for documents whose layout puts them at those positions.
 *
 * @param {Array<Array<{str: string}>>} splittedByBillHeader - One array of
 *     text items per bill.
 * @returns {Array<{subcommittee: string, sponsors: string, school: string,
 *     billcode: string}>} One record per bill, in input order.
 * @throws {Error} If a bill has too few text items to contain every field.
 */
function extractBillInformation(splittedByBillHeader) {
    // Position of each field inside one bill's item array.
    const FIELD_INDEX = {
        billcode: 5,
        subcommittee: 31,
        sponsors: 33,
        school: 35,
    };
    const minItems = Math.max(...Object.values(FIELD_INDEX)) + 1;

    return splittedByBillHeader.map((current, billNumber) => {
        // Debug aid: dump the raw items for this bill.
        console.log(current);

        // Fail with a message that names the problem instead of an opaque
        // "cannot read properties of undefined".
        if (current.length < minItems) {
            throw new Error(
                `bill ${billNumber} has ${current.length} text items; expected at least ${minItems}`
            );
        }

        return {
            subcommittee: current[FIELD_INDEX.subcommittee].str,
            sponsors: current[FIELD_INDEX.sponsors].str,
            school: current[FIELD_INDEX.school].str,
            billcode: current[FIELD_INDEX.billcode].str,
        };
    });
}
|
||||
|
||||
/**
 * Locate the bill pages in a parsed document, extract each bill's fields,
 * log the result to the console and return it.
 *
 * A page whose text contains both "COMMITTEE" and "GOVERNMENT" is treated as
 * a committee marker page; otherwise, a page containing "ABCs" is recorded as
 * the end page (the last such page wins). The pages strictly between the
 * first marker and the end page, excluding the markers themselves, are
 * flattened into one run of items. The first item of that run is taken to be
 * the bill header, and the run is split on every occurrence of it.
 *
 * @param {Array<Array<{str: string}>>} lines - Text items per page, indexed
 *     by zero-based page position.
 * @returns {Array<object>} The extracted bill records (also logged); an empty
 *     array when no bill pages could be identified.
 */
function processLines(lines) {
    const committeePages = [];
    let endPage = null;

    lines.forEach((pageItems, pageIndex) => {
        const pageText = concatLines(pageItems);
        if (pageText.includes("COMMITTEE") && pageText.includes("GOVERNMENT")) { // we have a committee page
            committeePages.push(pageIndex);
        } else if (pageText.includes("ABCs")) {
            endPage = pageIndex;
        }
    });
    // The end page closes the range scanned by fillInCommitteePages.
    committeePages.push(endPage);

    const committeeLines = concatAllCommitteePages(
        fillInCommitteePages(committeePages),
        lines
    );

    // No marker/end page found (or nothing between them): there is no first
    // item to use as the bill header, so report an empty result instead of
    // throwing on `committeeLines[0].str`.
    if (committeeLines.length === 0) {
        const noBills = [];
        console.log(noBills);
        return noBills;
    }

    const billHeader = committeeLines[0].str;
    // Trailing sentinel header so the final bill is flushed by the splitter.
    committeeLines.push({str: billHeader});

    const splittedByBillHeader = splitByBillHeader(committeeLines, billHeader);
    const billInfo = extractBillInformation(splittedByBillHeader);
    console.log(billInfo);
    return billInfo;
}
|
||||
|
||||
/**
 * Click handler: read the PDF chosen in the `#fileupload` input, extract the
 * text items of every page with pdf.js, and hand them to `processLines`.
 *
 * Failures (no file chosen, pdf.js not loaded, unreadable file, unparsable
 * PDF) are reported with `console.error` rather than thrown.
 *
 * @returns {void}
 */
function onFileUpload() {
    const file = document.getElementById("fileupload").files[0];
    if (!file) {
        console.error("onFileUpload: no file selected");
        return;
    }

    // Look the library up at click time: pdf.mjs is a deferred module script,
    // so a binding captured while this file was first evaluated can be
    // undefined even though the library is available by now.
    const pdfjs = globalThis.pdfjsLib;
    if (!pdfjs) {
        console.error("onFileUpload: pdf.js has not finished loading");
        return;
    }
    // Explicit https: a protocol-relative URL inherits the page's scheme and
    // cannot resolve when the page is opened from file://.
    pdfjs.GlobalWorkerOptions.workerSrc = 'https://mozilla.github.io/pdf.js/build/pdf.worker.mjs';

    const reader = new FileReader();
    reader.onerror = () => {
        console.error("onFileUpload: could not read file", reader.error);
    };
    reader.onload = (evt) => {
        // pdf.js takes binary data as a typed array.
        const data = new Uint8Array(evt.target.result);
        pdfjs.getDocument({data}).promise
            .then((pdf) => {
                const pagePromises = [];
                // pdf.js page numbers are 1-based.
                for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
                    pagePromises.push(
                        pdf.getPage(pageNumber)
                            .then((page) => page.getTextContent())
                            .then((text) => text.items)
                    );
                }
                // Returned so rejections reach the .catch below.
                return Promise.all(pagePromises);
            })
            .then((lines) => processLines(lines))
            .catch((err) => {
                console.error("onFileUpload: failed to process PDF", err);
            });
    };

    // readAsBinaryString is deprecated; read raw bytes instead.
    reader.readAsArrayBuffer(file);
}
|
15
package.json
15
package.json
|
@ -1,15 +0,0 @@
|
|||
{
|
||||
"name": "yig",
|
||||
"version": "1.0.0",
|
||||
"description": "parser and explorer for YMCA CCE legislative materials",
|
||||
"main": "main",
|
||||
"scripts": {
|
||||
"test": "echo \"Error: no test specified\" && exit 1"
|
||||
},
|
||||
"repository": {
|
||||
"type": "git",
|
||||
"url": "https://git.beepboop.systems/yig"
|
||||
},
|
||||
"author": "Ryan Marina",
|
||||
"license": "GPL-3.0"
|
||||
}
|
Loading…
Reference in New Issue