25 lines
780 B
TypeScript
25 lines
780 B
TypeScript
import { PDFDocumentProxy, PDFPageProxy, getDocument } from 'npm:pdfjs-dist';
|
|
|
|
/**
 * Extracts the plain text of every page of a PDF document.
 *
 * The text runs of one page are joined with a single space; pages are joined
 * with a newline, in page order (1..numPages).
 *
 * @param pdfPath - Source handed unchanged to pdf.js `getDocument`.
 * @returns The concatenated text of all pages (empty string for a document
 *   whose pages contain no text runs).
 * @throws Rejects with whatever error pdf.js raises while loading the document
 *   or reading a page; the loading task is destroyed before the rejection
 *   reaches the caller.
 */
export async function parsePDF(pdfPath: string): Promise<string> {
  const loadingTask = getDocument(pdfPath);

  try {
    const pdf: PDFDocumentProxy = await loadingTask.promise;
    const pageTexts: string[] = [];

    // pdf.js page numbers are 1-based.
    for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
      const page: PDFPageProxy = await pdf.getPage(pageNumber);

      try {
        const { items } = await page.getTextContent();

        // `items` mixes text runs with marked-content markers; only text runs
        // carry a `str` field, so everything else is skipped instead of being
        // joined in as an empty field (which produced doubled spaces).
        const runs: string[] = [];
        for (const item of items) {
          if ('str' in item) {
            runs.push(item.str);
          }
        }

        pageTexts.push(runs.join(' '));
      } finally {
        // Drop per-page resources as soon as the page has been read so memory
        // does not grow with the page count.
        page.cleanup();
      }
    }

    return pageTexts.join('\n');
  } finally {
    // Always tear down the loading task (and with it the document/worker),
    // on success and on failure alike.
    await loadingTask.destroy();
  }
}
|
|
|