Files
fil/docs/snippets/wasm/advanced/keyword_extraction_example.md

111 lines
2.8 KiB
Markdown
Raw Permalink Normal View History

2026-06-01 23:40:55 +02:00
```typescript title="WASM - Extract and Score Keywords"
import init, { extractBytes } from "kreuzberg-wasm";
await init();

// Ask the extractor for Markdown output.
const config = {
  outputFormat: "markdown",
};

// NOTE: `buffer` is not defined in this snippet; it is assumed to hold the raw PDF bytes.
const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);

// TF-IDF style keyword extraction (simplified)
interface Keyword {
  /** Normalized (lower-cased, punctuation-free) token. */
  term: string;
  /** How many times the term occurs. */
  frequency: number;
  /** Index of every occurrence within the whitespace-split word list. */
  uniquePositions: number[];
  /** frequency * ln(total words / frequency). */
  score: number;
}

const text = result.content.toLowerCase();
const words = text.split(/\s+/);
const tokenMap = new Map<string, number[]>();

// Record word positions
for (const [idx, word] of words.entries()) {
  // Strip everything except letters, combining marks, digits and "_".
  // Unicode property escapes are used instead of the ASCII-only `\w`, which
  // would mangle non-English terms (for example "café" would become "caf").
  const cleaned = word.replace(/[^\p{L}\p{M}\p{N}_]/gu, "");
  // Ignore tokens of three characters or fewer.
  if (cleaned.length <= 3) {
    continue;
  }
  const positions = tokenMap.get(cleaned);
  if (positions === undefined) {
    tokenMap.set(cleaned, [idx]);
  } else {
    positions.push(idx);
  }
}

// Calculate keyword scores
const keywords: Keyword[] = Array.from(tokenMap.entries()).map(([term, positions]) => ({
  term,
  frequency: positions.length,
  uniquePositions: positions,
  // Term frequency weighted by how rare the term is relative to the word count.
  score: positions.length * Math.log(words.length / positions.length),
}));

// Sort by score (TF-IDF approximation), highest first
keywords.sort((a, b) => b.score - a.score);

// Top 15 keywords
const topKeywords = keywords.slice(0, 15);
console.log("Top Keywords:");
for (const kw of topKeywords) {
  console.log(` ${kw.term}: ${kw.frequency} occurrences (score: ${kw.score.toFixed(2)})`);
}
```
```typescript title="WASM - Keyword Context Window"
import init, { extractBytes } from "kreuzberg-wasm";
// init() is awaited before extractBytes is called.
await init();
// Extraction options: Markdown output plus the chunking settings below.
const config = {
outputFormat: "markdown",
chunking: {
maxChars: 1000,
chunkOverlap: 200,
},
};
// NOTE: `buffer` is not defined in this snippet; it is assumed to hold the raw PDF bytes.
const bytes = new Uint8Array(buffer);
// The extracted text is read from `result.content` further down.
const result = await extractBytes(bytes, "application/pdf", config);
/** A keyword paired with the text excerpts in which it was found. */
interface KeywordContext {
/** The keyword that was searched for. */
keyword: string;
/** One excerpt of surrounding text per occurrence of the keyword. */
contexts: string[];
}
/**
 * Find keyword occurrences with surrounding context.
 *
 * The keyword is matched literally and case-insensitively; regular-expression
 * metacharacters in it (such as "." or "+") have no special meaning.
 *
 * @param text - Text to search.
 * @param keyword - Literal keyword to look for. An empty keyword yields no results.
 * @param contextWindow - Number of characters to include before and after each match.
 * @returns One excerpt per occurrence, in order of appearance, clipped to the
 *   bounds of `text`.
 */
function extractKeywordContexts(
  text: string,
  keyword: string,
  contextWindow: number = 50,
): string[] {
  const contexts: string[] = [];

  // An empty pattern matches at every position without advancing `lastIndex`,
  // which would make the exec() loop below spin forever.
  if (keyword.length === 0) {
    return contexts;
  }

  // Escape metacharacters so the keyword is searched for verbatim; otherwise
  // "c++" throws a SyntaxError and "a.b" also matches "axb".
  const escapedKeyword = keyword.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const regex = new RegExp(escapedKeyword, "gi");

  let match: RegExpExecArray | null;
  while ((match = regex.exec(text)) !== null) {
    const start = Math.max(0, match.index - contextWindow);
    // Measure the text that actually matched rather than the search term.
    const end = Math.min(text.length, match.index + match[0].length + contextWindow);
    contexts.push(text.substring(start, end));
  }
  return contexts;
}
// Collect the surrounding text (40 characters each side) for a fixed keyword list.
const topKeywords = ["document", "analysis", "results"];
const keywordContexts: KeywordContext[] = [];
for (const keyword of topKeywords) {
  keywordContexts.push({
    keyword,
    contexts: extractKeywordContexts(result.content, keyword, 40),
  });
}

// Print each keyword's occurrence count followed by at most two sample excerpts.
for (const { keyword, contexts } of keywordContexts) {
  console.log(`\n"${keyword}" appears ${contexts.length} times:`);
  const samples = contexts.slice(0, 2);
  for (let i = 0; i < samples.length; i++) {
    console.log(` [${i + 1}] ...${samples[i]}...`);
  }
}
```
<!-- snippet:syntax-only -->

- Keyword extraction without native YAKE/RAKE requires manual text processing.