Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/docs/snippets/wasm/advanced/keyword_extraction_example.md
+++ b/docs/snippets/wasm/advanced/keyword_extraction_example.md
@@ -0,0 +1,110 @@
+```typescript title="WASM - Extract and Score Keywords"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const config = {
+  outputFormat: "markdown",
+};
+
+const bytes = new Uint8Array(buffer);
+const result = await extractBytes(bytes, "application/pdf", config);
+
+// TF-IDF style keyword extraction (simplified)
+interface Keyword {
+  term: string;
+  frequency: number;
+  uniquePositions: number[];
+  score: number;
+}
+
+// Lowercase once so "Report" and "report" are counted as the same term.
+const text = result.content.toLowerCase();
+// Splitting on whitespace can leave empty tokens at the edges of the text;
+// drop them so they do not inflate the token count used in the score below.
+const words = text.split(/\s+/).filter((word) => word.length > 0);
+const tokenMap = new Map<string, number[]>();
+
+// Record word positions
+words.forEach((word, idx) => {
+  // Strip punctuation but keep Unicode letters, marks and digits so that
+  // accented or non-Latin terms are not mangled (plain \w is ASCII-only).
+  const cleaned = word.replace(/[^\p{L}\p{M}\p{N}_]/gu, "");
+  // Skip short tokens (3 characters or fewer).
+  if (cleaned.length <= 3) {
+    return;
+  }
+  const positions = tokenMap.get(cleaned);
+  if (positions === undefined) {
+    tokenMap.set(cleaned, [idx]);
+  } else {
+    positions.push(idx);
+  }
+});
+
+// Calculate keyword scores: frequency weighted by how rare the term is
+// relative to the total number of tokens in this document.
+const keywords: Keyword[] = Array.from(tokenMap.entries()).map(([term, positions]) => ({
+  term,
+  frequency: positions.length,
+  uniquePositions: positions,
+  score: positions.length * Math.log(words.length / positions.length),
+}));
+
+// Sort by score (TF-IDF approximation), highest first
+keywords.sort((a, b) => b.score - a.score);
+
+// Top 15 keywords
+const topKeywords = keywords.slice(0, 15);
+console.log("Top Keywords:");
+topKeywords.forEach((kw) => {
+  console.log(`  ${kw.term}: ${kw.frequency} occurrences (score: ${kw.score.toFixed(2)})`);
+});
+```
+
+```typescript title="WASM - Keyword Context Window"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+// Shape of one report entry: a keyword plus the text snippets found around it.
+interface KeywordContext {
+  keyword: string;
+  contexts: string[];
+}
+
+// Initialise the module before running the extraction.
+await init();
+
+// Markdown output with chunking options (1000-char chunks, 200-char overlap).
+const chunkedConfig = {
+  outputFormat: "markdown",
+  chunking: {
+    maxChars: 1000,
+    chunkOverlap: 200,
+  },
+};
+
+const pdfBytes = new Uint8Array(buffer);
+const result = await extractBytes(pdfBytes, "application/pdf", chunkedConfig);
+
+/**
+ * Collect a snippet of surrounding text for every occurrence of a keyword.
+ *
+ * The keyword is matched literally and case-insensitively; regex
+ * metacharacters in it (e.g. "c++", "3.5") carry no special meaning.
+ *
+ * @param text - Text to search.
+ * @param keyword - Term to look for; an empty string yields no contexts.
+ * @param contextWindow - Number of characters to keep on each side of a match.
+ * @returns One substring per occurrence, clamped to the bounds of `text`.
+ */
+function extractKeywordContexts(
+  text: string,
+  keyword: string,
+  contextWindow: number = 50,
+): string[] {
+  const contexts: string[] = [];
+
+  // An empty pattern matches at every index without advancing the global
+  // regex, which would make the loop below spin forever.
+  if (keyword.length === 0) {
+    return contexts;
+  }
+
+  // Escape regex metacharacters so the keyword is searched for verbatim.
+  const literal = keyword.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
+  const regex = new RegExp(literal, "gi");
+  let match: RegExpExecArray | null;
+
+  while ((match = regex.exec(text)) !== null) {
+    const start = Math.max(0, match.index - contextWindow);
+    // Use the length of what actually matched rather than the keyword length.
+    const end = Math.min(text.length, match.index + match[0].length + contextWindow);
+    contexts.push(text.substring(start, end));
+  }
+
+  return contexts;
+}
+
+// Gather up to 40 characters of context on each side for a fixed keyword list.
+const topKeywords = ["document", "analysis", "results"];
+const keywordContexts: KeywordContext[] = [];
+for (const term of topKeywords) {
+  keywordContexts.push({
+    keyword: term,
+    contexts: extractKeywordContexts(result.content, term, 40),
+  });
+}
+
+// Print the occurrence count and at most the first two snippets per keyword.
+for (const { keyword, contexts } of keywordContexts) {
+  console.log(`\n"${keyword}" appears ${contexts.length} times:`);
+  for (const [position, snippet] of contexts.slice(0, 2).entries()) {
+    console.log(`  [${position + 1}] ...${snippet}...`);
+  }
+}
+```
+
+<!-- snippet:syntax-only --> - Without native YAKE/RAKE support, keyword extraction requires manual text processing, as shown in the snippets above.