Honestly, way too much went into this single commit. I am so sorry, future me.

2023-06-09 00:54:00 -06:00
parent cd3f653f3f
commit 42c0004150
67 changed files with 4617 additions and 92 deletions
--- a/40kParsing/index.ts
+++ b/40kParsing/index.ts
--- a/40kParsing/pdfParsing.ts
+++ b/40kParsing/pdfParsing.ts
@@ -0,0 +1,24 @@
+import { PDFDocumentProxy, PDFPageProxy, getDocument } from 'npm:pdfjs-dist';
+
+/**
+ * Extracts the plain text of every page in a PDF document.
+ *
+ * Text items within a page are joined with a single space; pages are joined
+ * with a newline, in page order.
+ *
+ * @param pdfPath - Source handed directly to pdfjs-dist's `getDocument`.
+ * @returns The concatenated text of all pages.
+ */
+export async function parsePDF(pdfPath: string): Promise<string> {
+  const loadingTask = getDocument(pdfPath);
+
+  try {
+    const pdf: PDFDocumentProxy = await loadingTask.promise;
+    const textContent: string[] = [];
+
+    // Pages are requested by number, starting at 1 and ending at numPages.
+    for (let i = 1; i <= pdf.numPages; i++) {
+      const page: PDFPageProxy = await pdf.getPage(i);
+      const pageContent = await page.getTextContent();
+
+      // `items` can also hold marked-content entries, which carry no `str`.
+      // Emit an empty string for those, matching what the unguarded access
+      // produced once joined.
+      const pageText = pageContent.items
+        .map(item => ('str' in item ? item.str : ''))
+        .join(' ');
+      textContent.push(pageText);
+    }
+
+    return textContent.join('\n');
+  } finally {
+    // Release the document and its worker whether or not extraction succeeded.
+    await loadingTask.destroy();
+  }
+}
+