Honestly, way too much went into this single commit. I am so sorry, future me.

This commit is contained in:
Emma
2023-06-09 00:54:00 -06:00
parent cd3f653f3f
commit 42c0004150
67 changed files with 4617 additions and 92 deletions

0
40kParsing/index.ts Normal file
View File

24
40kParsing/pdfParsing.ts Normal file
View File

@@ -0,0 +1,24 @@
import { PDFDocumentProxy, PDFPageProxy, getDocument } from 'npm:pdfjs-dist';
/**
 * Extracts the plain text of every page in a PDF document.
 *
 * Pages are read in order. Within a page, the `str` of each text item is
 * joined with a single space; pages are then joined with a newline.
 * Content entries that carry no `str` (pdf.js marked-content markers) are
 * skipped so they do not leave empty slots in the joined text.
 *
 * @param pdfPath - Source handed straight to pdf.js `getDocument`
 *   (presumably a file path or URL — confirm against callers).
 * @returns The text of all pages, with pages separated by `\n`.
 * @throws Rejects with whatever error pdf.js raises while loading or
 *   reading the document.
 */
export async function parsePDF(pdfPath: string): Promise<string> {
  const loadingTask = getDocument(pdfPath);
  try {
    const pdf = await loadingTask.promise;
    const pageTexts: string[] = [];

    // pdf.js page numbers are 1-based. Pages are read one at a time so only
    // a single page's content is held while it is being flattened.
    for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
      const page = await pdf.getPage(pageNumber);
      const { items } = await page.getTextContent();

      const strings: string[] = [];
      for (const item of items) {
        // Only real text items have `str`; the `in` check also narrows the type.
        if ('str' in item) {
          strings.push(item.str);
        }
      }
      pageTexts.push(strings.join(' '));
    }

    return pageTexts.join('\n');
  } finally {
    // Release the document and its worker whether extraction succeeded or not.
    await loadingTask.destroy();
  }
}