battlelog/40kParsing/pdfParsing.ts

25 lines
780 B
TypeScript

import { PDFDocumentProxy, PDFPageProxy, getDocument } from 'npm:pdfjs-dist';
/**
 * Extracts the text of every page of a PDF document.
 *
 * Pages are read sequentially, from page 1 up to `numPages`. Within a page the
 * `str` values of the text-content items are joined with a single space; the
 * resulting page strings are joined with a newline.
 *
 * @param pdfPath - Passed unchanged to pdf.js `getDocument` as the document source.
 * @returns The concatenated text, one line per page.
 * @throws Rejects with the underlying pdf.js error if the document cannot be
 *   loaded or one of its pages cannot be read.
 */
export async function parsePDF(pdfPath: string): Promise<string> {
  const loadingTask = getDocument(pdfPath);

  try {
    const pdf: PDFDocumentProxy = await loadingTask.promise;
    const pageTexts: string[] = [];

    for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
      const page: PDFPageProxy = await pdf.getPage(pageNumber);
      const { items } = await page.getTextContent();

      // Entries without a `str` field (the pdf.js typings allow marked-content
      // entries here) contribute an empty string. That is the same output
      // `join` produced for the `undefined` values before, but it type-checks.
      const pageText = items
        .map((item) => ('str' in item ? item.str : ''))
        .join(' ');
      pageTexts.push(pageText);
    }

    return pageTexts.join('\n');
  } finally {
    // Release the document (and whatever pdf.js allocated for it) on both the
    // success and the failure path; previously it was never destroyed.
    await loadingTask.destroy();
  }
}