Files
fil/docs/snippets/wasm/plugins/word_count_processor.md
Henrik Jess Nielsen b4c07d3693
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s
Nomad changes
2026-06-01 23:40:55 +02:00

2.5 KiB

Word Count Post-Processor

Register a post-processor that computes word count and other text statistics.

import init, { registerPostProcessor, extractBytes } from "kreuzberg-wasm";

await init();

// Define a word count post-processor
const wordCountProcessor = {
  processingStage: () => "post-extraction",
  process: (extractionResult) => {
    const text = extractionResult.text || "";

    // Compute statistics
    const words = text
      .trim()
      .split(/\s+/)
      .filter((w) => w.length > 0);
    const lines = text.split(/\n/).filter((l) => l.trim().length > 0);
    const paragraphs = text.split(/\n{2,}/).filter((p) => p.trim().length > 0);
    const sentences = text.split(/[.!?]+/).filter((s) => s.trim().length > 0);

    // Calculate reading time (average 200 words per minute)
    const readingTimeMinutes = Math.ceil(words.length / 200);

    // Compute character statistics
    const chars = text.length;
    const charsNoSpaces = text.replace(/\s/g, "").length;

    // Enrich metadata with text statistics
    const enriched = {
      ...extractionResult,
      metadata: {
        ...extractionResult.metadata,
        statistics: {
          wordCount: words.length,
          lineCount: lines.length,
          paragraphCount: paragraphs.length,
          sentenceCount: sentences.length,
          charCount: chars,
          charsNoSpaces: charsNoSpaces,
          averageWordLength: words.length > 0 ? Math.round(charsNoSpaces / words.length) : 0,
          averageLineLength: lines.length > 0 ? Math.round(words.length / lines.length) : 0,
          readingTimeMinutes: readingTimeMinutes,
        },
      },
    };

    return enriched;
  },
};

try {
  registerPostProcessor(wordCountProcessor);
  console.log("Word count post-processor registered");
} catch (error) {
  console.error("Failed to register post-processor:", error);
}

// Extract with word counting
async function extractAndAnalyze(fileBytes, mimeType) {
  const result = await extractBytes(fileBytes, mimeType, {});
  const stats = result.metadata?.statistics;

  console.log("Text Analysis:", {
    words: stats?.wordCount,
    lines: stats?.lineCount,
    paragraphs: stats?.paragraphCount,
    sentences: stats?.sentenceCount,
    readingTime: `${stats?.readingTimeMinutes} min`,
  });

  return result;
}

const pdfBytes = new Uint8Array([
  /* PDF content */
]);
await extractAndAnalyze(pdfBytes, "application/pdf");

This processor analyzes text and provides readability metrics.