PDF-Only Post-Processor

import init, { registerPostProcessor, extractBytes } from "kreuzberg-wasm";

await init();

// Define a PDF-only post-processor.
//
// `process` hands non-PDF results back untouched (the very same object). For a
// PDF result it returns a shallow copy whose metadata gains `pdfProcessed: true`
// and a `pageCount` fallback, and whose `text` has runs of blank lines collapsed.
const pdfOnlyProcessor = {
  // NOTE(review): plugin registries typically identify a processor through a
  // name() accessor — confirm against the kreuzberg-wasm post-processor protocol.
  name: () => "pdf-only-processor",
  // NOTE(review): confirm "post-extraction" is a stage identifier the registry accepts.
  processingStage: () => "post-extraction",
  /**
   * @param {object} extractionResult - Result object; `metadata` and `text` are optional.
   * @returns {object} The input itself for non-PDF results, otherwise a processed copy.
   */
  process: (extractionResult) => {
    const metadata = extractionResult.metadata;
    const mimeType = metadata?.mimeType;
    const source = metadata?.source;

    // Compare in lower case so "REPORT.PDF" or "Application/PDF" still count as
    // PDFs. The typeof guards keep a non-string metadata value from throwing.
    const isPdf =
      (typeof mimeType === "string" && mimeType.toLowerCase() === "application/pdf") ||
      (typeof source === "string" && source.toLowerCase().endsWith(".pdf"));

    if (!isPdf) {
      // Skip processing for non-PDF documents
      return extractionResult;
    }

    // Apply PDF-specific processing
    return {
      ...extractionResult,
      metadata: {
        ...metadata,
        pdfProcessed: true,
        // Any falsy pageCount (missing, 0, NaN) is reported as a single page.
        pageCount: metadata?.pageCount || 1,
      },
      // Normalize text for PDFs: three or more consecutive newlines become
      // exactly two, then leading/trailing whitespace is trimmed.
      text: (extractionResult.text || "").replace(/\n{3,}/g, "\n\n").trim(),
    };
  },
};

// Register the processor. A registration failure is reported on stderr and the
// script keeps going instead of terminating.
try {
  registerPostProcessor(pdfOnlyProcessor);
  console.log("PDF-only post-processor registered");
} catch (registrationError) {
  console.error("Failed to register post-processor:", registrationError);
}

// Test with various documents.
// The byte arrays below are empty placeholders (see the inline markers);
// presumably they are meant to be filled with real document bytes.
const testDocs = [
  {
    bytes: new Uint8Array([
      /* PDF */
    ]),
    type: "application/pdf",
  },
  {
    bytes: new Uint8Array([
      /* HTML */
    ]),
    type: "text/html",
  },
];

for (const doc of testDocs) {
  // Handle failures per document, like the registration step does, so one bad
  // input cannot reject the top-level await and skip the remaining documents.
  try {
    const result = await extractBytes(doc.bytes, doc.type, {});
    // Results the processor skipped carry no flag; report that as false.
    const pdfProcessed = result.metadata?.pdfProcessed ?? false;
    console.log(`${doc.type}: PDF-specific processing applied:`, pdfProcessed);
  } catch (error) {
    console.error(`${doc.type}: extraction failed:`, error);
  }
}

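This processor applies PDF-specific transformations (collapsing runs of blank lines in the text and tagging the metadata with `pdfProcessed` and a `pageCount` fallback) only to PDF documents; results for every other document type are returned unchanged.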

1.7 KiB Raw Blame History

PDF-Only Post-Processor

1.7 KiB

Raw Blame History