Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,105 @@
```typescript title="WASM - Keyword Extraction Setup"
import init, { extractBytes } from "kreuzberg-wasm";
await init();
// Note: Keyword extraction requires the 'keywords' feature,
// which may not be available in all WASM builds.
// This example shows the configuration structure.
const config = {
// Extraction configuration
outputFormat: "markdown",
};
const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);
// Keyword extraction would be performed on the extracted text
// using external libraries or post-processing
console.log(`Extracted text: ${result.content.substring(0, 100)}...`);
// Example post-processing to extract keywords
// (requires external keyword extraction library)
const keywords = new Set<string>();
const words = result.content
.toLowerCase()
.split(/\s+/)
.filter((w) => w.length > 4); // Simple heuristic: words > 4 chars
words.forEach((word) => {
keywords.add(word);
});
console.log(`Extracted keywords: ${Array.from(keywords).slice(0, 10).join(", ")}`);
```
```typescript title="WASM - Keyword Filtering"
import init, { extractBytes } from "kreuzberg-wasm";
await init();
const config = {
outputFormat: "markdown",
};
const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);
// Common stopwords to exclude
const stopwords = new Set([
"the",
"a",
"an",
"and",
"or",
"but",
"in",
"on",
"at",
"to",
"for",
"of",
"with",
"from",
"by",
"is",
"are",
"was",
"were",
"be",
"been",
"have",
"has",
"had",
"do",
"does",
"did",
"will",
"would",
"could",
"should",
]);
// Extract and filter keywords
const text = result.content.toLowerCase();
const words = text.split(/\s+/);
const keywordCounts = new Map<string, number>();
words.forEach((word) => {
const cleaned = word.replace(/[^\w]/g, "");
if (cleaned.length > 4 && !stopwords.has(cleaned)) {
keywordCounts.set(cleaned, (keywordCounts.get(cleaned) || 0) + 1);
}
});
// Get top keywords by frequency
const topKeywords = Array.from(keywordCounts.entries())
.sort((a, b) => b[1] - a[1])
.slice(0, 10)
.map(([word, count]) => `${word} (${count})`);
console.log(`Top keywords: ${topKeywords.join(", ")}`);
```
<!-- snippet:syntax-only --> - Native keyword extraction requires the `keywords` feature which may not be compiled into WASM builds.