docs/snippets/wasm/advanced/keyword_extraction_example.md

```typescript title="WASM - Extract and Score Keywords"
import init, { extractBytes } from "kreuzberg-wasm";

await init();

// Ask the extractor for Markdown-formatted content
const config = { outputFormat: "markdown" };

// `buffer` is expected to already hold the PDF's raw bytes; it is not defined in this snippet
const result = await extractBytes(new Uint8Array(buffer), "application/pdf", config);

// TF-IDF style keyword extraction (simplified)

/** One candidate keyword together with its occurrence statistics. */
interface Keyword {
  /** Lower-cased token with punctuation removed. */
  term: string;
  /** Number of times the term occurs in the document. */
  frequency: number;
  /** Word indices (into `words`) at which the term occurs. */
  uniquePositions: number[];
  /** frequency * ln(totalWords / frequency); higher ranks first. */
  score: number;
}

const text = result.content.toLowerCase();
// Match runs of non-whitespace instead of splitting on whitespace: splitting
// yields empty tokens for leading/trailing whitespace, which would inflate the
// total word count used in the score below.
const words: string[] = text.match(/\S+/g) ?? [];
const tokenMap = new Map<string, number[]>();

// Record word positions
words.forEach((word, idx) => {
  // Keep letters, combining marks, digits and "_" from any script. The
  // ASCII-only \w class would strip accented and non-Latin characters.
  const cleaned = word.replace(/[^\p{L}\p{M}\p{N}_]/gu, "");
  if (cleaned.length <= 3) {
    return; // Ignore short tokens (3 characters or fewer)
  }

  const positions = tokenMap.get(cleaned);
  if (positions === undefined) {
    tokenMap.set(cleaned, [idx]);
  } else {
    positions.push(idx);
  }
});

// Calculate keyword scores
const keywords: Keyword[] = Array.from(tokenMap.entries()).map(([term, positions]) => ({
  term,
  frequency: positions.length,
  uniquePositions: positions,
  // positions.length is always >= 1 here, so the division is safe
  score: positions.length * Math.log(words.length / positions.length),
}));

// Sort by score (TF-IDF approximation), highest first
keywords.sort((a, b) => b.score - a.score);

// Top 15 keywords
const topKeywords = keywords.slice(0, 15);
console.log("Top Keywords:");
topKeywords.forEach((kw) => {
  console.log(`  ${kw.term}: ${kw.frequency} occurrences (score: ${kw.score.toFixed(2)})`);
});
```

```typescript title="WASM - Keyword Context Window"
import init, { extractBytes } from "kreuzberg-wasm";

await init();

// Markdown output plus chunking options; the rest of this snippet reads only
// `result.content`
const chunking = { maxChars: 1000, chunkOverlap: 200 };
const config = { outputFormat: "markdown", chunking };

// `buffer` is expected to already hold the PDF's raw bytes; it is not defined in this snippet
const result = await extractBytes(new Uint8Array(buffer), "application/pdf", config);

/** Context snippets collected for a single keyword. */
interface KeywordContext {
  /** The keyword that was searched for. */
  keyword: string;
  /** One text excerpt per occurrence of the keyword. */
  contexts: string[];
}

/**
 * Find keyword occurrences with surrounding context.
 *
 * The keyword is matched literally (regex metacharacters are escaped) and
 * case-insensitively. Each excerpt is clamped to the bounds of `text`.
 *
 * @param text - Text to search.
 * @param keyword - Literal keyword to look for; an empty keyword yields no results.
 * @param contextWindow - Number of characters to include on each side of a match.
 * @returns One excerpt per occurrence, in order of appearance.
 */
function extractKeywordContexts(
  text: string,
  keyword: string,
  contextWindow: number = 50,
): string[] {
  const contexts: string[] = [];

  // An empty pattern matches at every index without advancing `lastIndex`,
  // which would make the loop below spin forever.
  if (keyword.length === 0) {
    return contexts;
  }

  // Escape regex metacharacters so keywords such as "c++" or "." are treated
  // as plain text rather than as pattern syntax.
  const escaped = keyword.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const regex = new RegExp(escaped, "gi");
  let match: RegExpExecArray | null;

  while ((match = regex.exec(text)) !== null) {
    const start = Math.max(0, match.index - contextWindow);
    const end = Math.min(text.length, match.index + match[0].length + contextWindow);
    contexts.push(text.substring(start, end));
  }

  return contexts;
}

// Gather the surrounding text for a fixed set of sample keywords
const topKeywords = ["document", "analysis", "results"];
const keywordContexts: KeywordContext[] = [];
for (const keyword of topKeywords) {
  keywordContexts.push({
    keyword,
    contexts: extractKeywordContexts(result.content, keyword, 40),
  });
}

// Print the hit count and at most the first two excerpts for each keyword
for (const { keyword, contexts } of keywordContexts) {
  console.log(`\n"${keyword}" appears ${contexts.length} times:`);
  for (const [position, snippet] of contexts.slice(0, 2).entries()) {
    console.log(`  [${position + 1}] ...${snippet}...`);
  }
}
```

<!-- snippet:syntax-only --> - Keyword extraction without native YAKE/RAKE requires manual text processing.
Nomad changes 2026-06-01 23:40:55 +02:00

```typescript title="WASM - Extract and Score Keywords"
import init, { extractBytes } from "kreuzberg-wasm";

await init();

const config = {
  outputFormat: "markdown",
};

const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);

// TF-IDF style keyword extraction (simplified)
interface Keyword {
  term: string;
  frequency: number;
  uniquePositions: number[];
  score: number;
}

const text = result.content.toLowerCase();
const words = text.split(/[\s\n\t]+/);
const tokenMap = new Map<string, number[]>();

// Record word positions
words.forEach((word, idx) => {
  const cleaned = word.replace(/[^\w]/g, "");
  if (cleaned.length > 3) {
    if (!tokenMap.has(cleaned)) {
      tokenMap.set(cleaned, []);
    }
    tokenMap.get(cleaned)!.push(idx);
  }
});

// Calculate keyword scores
const keywords: Keyword[] = Array.from(tokenMap.entries()).map(([term, positions]) => ({
  term,
  frequency: positions.length,
  uniquePositions: positions,
  score: positions.length * Math.log(words.length / positions.length),
}));

// Sort by score (TF-IDF approximation)
keywords.sort((a, b) => b.score - a.score);

// Top 15 keywords
const topKeywords = keywords.slice(0, 15);
console.log("Top Keywords:");
topKeywords.forEach((kw) => {
  console.log(`  ${kw.term}: ${kw.frequency} occurrences (score: ${kw.score.toFixed(2)})`);
});
```

```typescript title="WASM - Keyword Context Window"
import init, { extractBytes } from "kreuzberg-wasm";

await init();

const config = {
  outputFormat: "markdown",
  chunking: {
    maxChars: 1000,
    chunkOverlap: 200,
  },
};

const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);

interface KeywordContext {
  keyword: string;
  contexts: string[];
}

// Find keyword occurrences with surrounding context
function extractKeywordContexts(
  text: string,
  keyword: string,
  contextWindow: number = 50,
): string[] {
  const contexts: string[] = [];
  const regex = new RegExp(keyword, "gi");
  let match;

  while ((match = regex.exec(text)) !== null) {
    const start = Math.max(0, match.index - contextWindow);
    const end = Math.min(text.length, match.index + keyword.length + contextWindow);
    contexts.push(text.substring(start, end));
  }

  return contexts;
}

// Extract context for top keywords
const topKeywords = ["document", "analysis", "results"];
const keywordContexts: KeywordContext[] = topKeywords.map((kw) => ({
  keyword: kw,
  contexts: extractKeywordContexts(result.content, kw, 40),
}));

keywordContexts.forEach((kc) => {
  console.log(`\n"${kc.keyword}" appears ${kc.contexts.length} times:`);
  kc.contexts.slice(0, 2).forEach((ctx, idx) => {
    console.log(`  [${idx + 1}] ...${ctx}...`);
  });
});
```

<!-- snippet:syntax-only --> - Keyword extraction without native YAKE/RAKE requires manual text processing.