Honestly, way too much went into this single commit. I am so sorry, future me.
This commit is contained in:
0
40kParsing/index.ts
Normal file
0
40kParsing/index.ts
Normal file
24
40kParsing/pdfParsing.ts
Normal file
24
40kParsing/pdfParsing.ts
Normal file
@@ -0,0 +1,24 @@
|
||||
import { PDFDocumentProxy, PDFPageProxy, getDocument } from 'npm:pdfjs-dist';
|
||||
|
||||
/**
 * Extracts the plain text of every page of a PDF document.
 *
 * Pages are read in order, from page 1 through `numPages`. Within a page the
 * strings of the text items are joined with a single space; the per-page
 * strings are then joined with a newline.
 *
 * @param pdfPath - Location of the PDF, passed unchanged to pdf.js `getDocument`.
 * @returns The text of all pages, one line per page.
 * @throws Propagates any rejection from loading the document, fetching a page,
 *   or reading its text content.
 */
export async function parsePDF(pdfPath: string): Promise<string> {
  // Load the PDF file
  const loadingTask = getDocument(pdfPath);

  try {
    const pdf: PDFDocumentProxy = await loadingTask.promise;
    const textContent: string[] = [];

    // Iterate over each page (pdf.js page numbers start at 1) and extract text
    for (let i = 1; i <= pdf.numPages; i++) {
      const page: PDFPageProxy = await pdf.getPage(i);
      const pageContent = await page.getTextContent();

      // `items` is typed as text items and marked-content markers; only text
      // items have `str`. Marker items contribute an empty string, which is
      // what joining their missing `str` produced before, so output is unchanged.
      const pageText = pageContent.items
        .map((item) => ('str' in item ? item.str : ''))
        .join(' ');
      textContent.push(pageText);
    }

    // Combine the text content from all pages
    return textContent.join('\n');
  } finally {
    // Release the document and its worker whether extraction succeeded or
    // failed, so repeated calls do not accumulate loaded PDFs.
    await loadingTask.destroy();
  }
}
|
||||
|
Reference in New Issue
Block a user