Files
fil/docs/snippets/wasm/advanced/keyword_extraction_config.md
Henrik Jess Nielsen b4c07d3693
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s
Nomad changes
2026-06-01 23:40:55 +02:00

2.4 KiB

import init, { extractBytes } from "kreuzberg-wasm";

await init();

// Note: Keyword extraction requires the 'keywords' feature,
// which may not be available in all WASM builds.
// This example shows the configuration structure.

const config = {
  // Extraction configuration
  outputFormat: "markdown",
};

const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);

// Keyword extraction would be performed on the extracted text
// using external libraries or post-processing
console.log(`Extracted text: ${result.content.substring(0, 100)}...`);

// Example post-processing to extract keywords
// (requires external keyword extraction library)
const keywords = new Set<string>();
const words = result.content
  .toLowerCase()
  .split(/\s+/)
  .filter((w) => w.length > 4); // Simple heuristic: words > 4 chars

words.forEach((word) => {
  keywords.add(word);
});

console.log(`Extracted keywords: ${Array.from(keywords).slice(0, 10).join(", ")}`);
import init, { extractBytes } from "kreuzberg-wasm";

await init();

const config = {
  outputFormat: "markdown",
};

const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);

// Common stopwords to exclude
const stopwords = new Set([
  "the",
  "a",
  "an",
  "and",
  "or",
  "but",
  "in",
  "on",
  "at",
  "to",
  "for",
  "of",
  "with",
  "from",
  "by",
  "is",
  "are",
  "was",
  "were",
  "be",
  "been",
  "have",
  "has",
  "had",
  "do",
  "does",
  "did",
  "will",
  "would",
  "could",
  "should",
]);

// Extract and filter keywords
const text = result.content.toLowerCase();
const words = text.split(/\s+/);
const keywordCounts = new Map<string, number>();

words.forEach((word) => {
  const cleaned = word.replace(/[^\w]/g, "");
  if (cleaned.length > 4 && !stopwords.has(cleaned)) {
    keywordCounts.set(cleaned, (keywordCounts.get(cleaned) || 0) + 1);
  }
});

// Get top keywords by frequency
const topKeywords = Array.from(keywordCounts.entries())
  .sort((a, b) => b[1] - a[1])
  .slice(0, 10)
  .map(([word, count]) => `${word} (${count})`);

console.log(`Top keywords: ${topKeywords.join(", ")}`);
- Native keyword extraction requires the `keywords` feature which may not be compiled into WASM builds.