Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,35 @@
```typescript title="WASM"
import { extractFromFile, initWasm } from "@kreuzberg/wasm";
// The WASM module has to be initialized once before any extraction call.
await initWasm();

const config = {
  ocr: {
    backend: "tesseract-wasm",
    language: "eng",
  },
  chunking: {
    maxChars: 1000,
    chunkOverlap: 100,
  },
  // Every other option in this config is camelCase; the previous snake_case
  // `enable_language_detection` / `enable_quality` keys match no option name
  // used anywhere else in these examples, so they were presumably ignored.
  // NOTE(review): confirm both names against the ExtractionConfig reference.
  languageDetection: {
    enabled: true,
  },
  enableQualityProcessing: true,
};

// Read the first file selected in the element with id "file", if any.
const fileInput = document.getElementById("file") as HTMLInputElement;
const file = fileInput.files?.[0];
if (file) {
  const result = await extractFromFile(file, file.type, config);
  // `chunks` may be absent, so fall back to an empty list.
  for (const chunk of result.chunks ?? []) {
    console.log(`Chunk: ${chunk.content.substring(0, 100)}...`);
  }
  if (result.detectedLanguages) {
    console.log(`Languages: ${result.detectedLanguages.join(", ")}`);
  }
}
```

View File

@@ -0,0 +1,29 @@
import type { ExtractionConfig } from "@kreuzberg/wasm";
import { extractBytes, initWasm } from "@kreuzberg/wasm";
/**
 * Loads "document.pdf", extracts it with OCR, image extraction and chunking
 * configured, and logs the length of the extracted content.
 */
async function extractWithConfig() {
  await initWasm();

  const response = await fetch("document.pdf");
  const pdfBytes = new Uint8Array(await response.arrayBuffer());

  const extractionConfig: ExtractionConfig = {
    ocr: { backend: "tesseract-wasm", language: "eng" },
    images: { extractImages: true, targetDpi: 200 },
    chunking: { maxChars: 1000, chunkOverlap: 100 },
  };

  const { content } = await extractBytes(pdfBytes, "application/pdf", extractionConfig);
  console.log("Extraction complete");
  console.log("Content length:", content.length);
}

// Run the example; any failure is reported on the console.
extractWithConfig().catch(console.error);

View File

@@ -0,0 +1,66 @@
import type { ExtractionConfig } from "@kreuzberg/wasm";
import { extractBytes, initWasm } from "@kreuzberg/wasm";
/**
 * Extracts "book.pdf" with character-based chunking and logs the total chunk
 * count plus, for the first three chunks, the character range and a
 * 100-character preview.
 *
 * The heading-context variant of this configuration is demonstrated by
 * extractWithPrependHeadingContext; the unused copy that used to live here
 * was dead code and has been removed.
 */
async function extractWithChunking() {
  await initWasm();

  const response = await fetch("book.pdf");
  const bytes = new Uint8Array(await response.arrayBuffer());

  const config: ExtractionConfig = {
    chunking: {
      maxChars: 800,
      chunkOverlap: 150,
      splitOnNewlines: true,
      splitOnSentences: true,
    },
  };

  const result = await extractBytes(bytes, "application/pdf", config);
  if (result.chunks) {
    console.log(`Total chunks: ${result.chunks.length}`);
    // Only preview the first three chunks to keep the output short.
    result.chunks.slice(0, 3).forEach((chunk, i) => {
      console.log(`\nChunk ${i}:`);
      console.log(`Chars: ${chunk.metadata.charStart}-${chunk.metadata.charEnd}`);
      console.log(`Content: ${chunk.content.substring(0, 100)}...`);
    });
  }
}
/**
 * Extracts "document.md" with the markdown chunker and
 * `prependHeadingContext` enabled, then previews the first three chunks.
 */
async function extractWithPrependHeadingContext() {
  await initWasm();

  const response = await fetch("document.md");
  const markdownBytes = new Uint8Array(await response.arrayBuffer());

  const headingConfig: ExtractionConfig = {
    chunking: { chunkerType: "markdown", maxChars: 800, prependHeadingContext: true },
  };

  const { chunks } = await extractBytes(markdownBytes, "text/markdown", headingConfig);
  if (!chunks) {
    return;
  }

  console.log(`Total chunks: ${chunks.length}`);
  for (const [i, chunk] of chunks.slice(0, 3).entries()) {
    // Each chunk's content is prefixed with its heading breadcrumb
    console.log(`\nChunk ${i}: ${chunk.content.substring(0, 100)}...`);
  }
}

// Kick off both examples; failures are reported on the console.
extractWithChunking().catch(console.error);
extractWithPrependHeadingContext().catch(console.error);

View File

@@ -0,0 +1,71 @@
```typescript title="WASM"
import { initWasm, extractBytes } from "@kreuzberg/wasm";
// Initialize the WASM module before extracting.
await initWasm();

const config = {
  chunking: {
    maxChars: 1000,
    chunkOverlap: 100,
  },
};

// `buffer` is expected to be provided by the surrounding code.
const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);

result.chunks?.forEach((chunk, idx) => {
  console.log(`Chunk ${idx}: ${chunk.content.substring(0, 50)}...`);
  // Chunk metadata is read in camelCase everywhere else in these examples
  // (charStart, headingContext); the former `token_count` was inconsistent.
  // NOTE(review): confirm the field name against the ChunkMetadata type.
  console.log(`Tokens: ${chunk.metadata?.tokenCount}`);
});
```
```typescript title="WASM - Markdown with Heading Context"
import { initWasm, extractBytes } from "@kreuzberg/wasm";
// Initialize the WASM module before extracting.
await initWasm();

const markdownConfig = {
  chunking: {
    chunkerType: "markdown",
    maxChars: 2000,
    // Note: Token-based sizing is not available in WASM builds.
    // Use character-based sizing instead.
  },
};

const markdownBytes = new Uint8Array(buffer);
const extraction = await extractBytes(markdownBytes, "text/markdown", markdownConfig);

// Print a preview of every chunk, followed by its heading trail when present.
for (const [idx, chunk] of (extraction.chunks ?? []).entries()) {
  console.log(`Chunk ${idx}: ${chunk.content.substring(0, 50)}...`);

  const headings = chunk.metadata?.headingContext?.headings;
  if (headings) {
    console.log("Headings:");
    for (const h of headings) {
      console.log(` Level ${h.level}: ${h.text}`);
    }
  }
}
```
```typescript title="WASM - Prepend Heading Context"
import { initWasm, extractBytes } from "@kreuzberg/wasm";
// Initialize the WASM module before extracting.
await initWasm();

const headingConfig = {
  chunking: { chunkerType: "markdown", maxChars: 2000, prependHeadingContext: true },
};

const markdownBytes = new Uint8Array(buffer);
const { chunks } = await extractBytes(markdownBytes, "text/markdown", headingConfig);

for (const [idx, chunk] of (chunks ?? []).entries()) {
  // Each chunk's content is prefixed with its heading breadcrumb
  console.log(`Chunk ${idx}: ${chunk.content.substring(0, 80)}...`);
}
```

View File

@@ -0,0 +1,34 @@
import type { ExtractionConfig } from "@kreuzberg/wasm";
import { extractBytes, initWasm } from "@kreuzberg/wasm";
/**
 * Extracts "complex.pdf" with OCR (German), chunking and image extraction
 * configured, then prints a summary of the result.
 *
 * Optional result fields are summarized defensively so that a missing field
 * prints 0 / "none" rather than throwing or printing the text "undefined".
 */
async function extractWithFullConfig() {
  await initWasm();

  const response = await fetch("complex.pdf");
  const bytes = new Uint8Array(await response.arrayBuffer());

  const config: ExtractionConfig = {
    ocr: {
      backend: "tesseract-wasm",
      language: "deu",
    },
    chunking: {
      maxChars: 1000,
      chunkOverlap: 200,
    },
    images: {
      extractImages: true,
      targetDpi: 200,
    },
  };

  const result = await extractBytes(bytes, "application/pdf", config);

  console.log("=== Extraction Results ===");
  console.log(`Content: ${result.content.length} chars`);
  console.log(`Chunks: ${result.chunks?.length ?? 0}`);
  console.log(`Images: ${result.images?.length ?? 0}`);
  // Guarded like the other optional collections.
  console.log(`Tables: ${result.tables?.length ?? 0}`);
  // Previously printed "Languages: undefined" when nothing was detected.
  console.log(`Languages: ${result.detectedLanguages?.join(", ") ?? "none"}`);
}

extractWithFullConfig().catch(console.error);

View File

@@ -0,0 +1,34 @@
import type { ExtractionConfig } from "@kreuzberg/wasm";
import { extractBytes, initWasm } from "@kreuzberg/wasm";
/**
 * Builds an extraction config from the given file size and extracts
 * "document.pdf" with it.
 *
 * - Larger than 10 MiB: chunking is configured with small chunks.
 * - Smaller than 1 MiB: image extraction at 300 DPI is configured.
 * - OCR `enabled` is true only for sizes below 50 MiB.
 *
 * @param fileSize Size in bytes used to pick the configuration.
 * @returns The extraction result for "document.pdf".
 */
async function extractWithConditionalConfig(fileSize: number) {
  const MIB = 1024 * 1024;

  await initWasm();

  const config: ExtractionConfig = {};

  if (fileSize > 10 * MIB) {
    config.chunking = {
      maxChars: 500,
      chunkOverlap: 50,
    };
  }

  if (fileSize < 1 * MIB) {
    config.images = {
      extractImages: true,
      targetDpi: 300,
    };
  }

  config.ocr = {
    enabled: fileSize < 50 * MIB,
  };

  const response = await fetch("document.pdf");
  const bytes = new Uint8Array(await response.arrayBuffer());
  return extractBytes(bytes, "application/pdf", config);
}

// Report failures instead of leaving an unhandled promise rejection.
extractWithConditionalConfig(5 * 1024 * 1024)
  .then((_r) => console.log("Done"))
  .catch(console.error);

View File

@@ -0,0 +1,19 @@
```typescript title="WASM"
import { initWasm, extractBytes } from "@kreuzberg/wasm";
// Initialize the WASM module before extracting.
await initWasm();

const ocrAndImagesConfig = {
  ocr: { backend: "tesseract-wasm", language: "eng" },
  images: { extractImages: true },
};

const pdfBytes = new Uint8Array(buffer);
const { content } = await extractBytes(pdfBytes, "application/pdf", ocrAndImagesConfig);
console.log(content);
```

View File

@@ -0,0 +1,18 @@
```typescript title="WASM"
import { initWasm, extractBytes } from "@kreuzberg/wasm";
// Initialize the WASM module before extracting.
await initWasm();

const cachedOcrConfig = {
  use_cache: true,
  enable_quality_processing: true,
  ocr: { backend: "tesseract-wasm", language: "eng" },
};

const pdfBytes = new Uint8Array(buffer);
const { content } = await extractBytes(pdfBytes, "application/pdf", cachedOcrConfig);
console.log(content);
```

View File

@@ -0,0 +1,18 @@
```typescript title="WASM"
import init, { extractBytes } from "kreuzberg-wasm";
// Initialize the WASM module before extracting.
await init();

const response = await fetch("scanned.pdf");
const pdfBytes = new Uint8Array(await response.arrayBuffer());

const ocrConfig = {
  ocr: { backend: "tesseract", language: "eng" },
};

const extraction = await extractBytes(pdfBytes, "application/pdf", ocrConfig);
console.log(`Content length: ${extraction.content.length}`);
console.log(`Tables detected: ${extraction.tables?.length ?? 0}`);
```

View File

@@ -0,0 +1,26 @@
```typescript title="WASM"
import init, { extractBytes } from "kreuzberg-wasm";
// Initialize the WASM module before extracting.
await init();

const response = await fetch("document.pdf");
const pdfBytes = new Uint8Array(await response.arrayBuffer());

// Caching, multi-language OCR, chunking and quality processing combined.
const combinedConfig = {
  use_cache: true,
  ocr: {
    backend: "tesseract",
    language: "eng+deu",
    tesseract_config: { psm: 6 },
  },
  chunking: { max_characters: 1000, overlap: 200 },
  enable_quality_processing: true,
};

const { content } = await extractBytes(pdfBytes, "application/pdf", combinedConfig);
console.log(`Content length: ${content.length}`);
```

View File

@@ -0,0 +1,15 @@
```typescript title="Document Structure Config (WASM)"
import { extractBytes } from "kreuzberg-wasm";

const config = {
  includeDocumentStructure: true,
};

// extractBytes is awaited at every other call site in these examples.
// Without `await`, `result` is a pending Promise, `result.document` is always
// undefined and the loop below never runs.
const result = await extractBytes(fileBuffer, "application/pdf", config);

if (result.document) {
  // Print the node type of every node in the document structure.
  for (const node of result.document.nodes) {
    console.log(`[${node.content.nodeType}]`);
  }
}
```

View File

@@ -0,0 +1,35 @@
```typescript title="Element-Based Output (WASM)"
// ExtractionConfig is only used as a type, so it is imported with the `type`
// modifier (required under isolatedModules / verbatimModuleSyntax).
import { type ExtractionConfig, extractFileSync } from "kreuzberg-wasm";

// Configure element-based output
const config: ExtractionConfig = {
  outputFormat: "element_based",
};

// Extract document
const result = extractFileSync(fileBuffer, "application/pdf", config);

// Access elements
for (const element of result.elements) {
  console.log(`Type: ${element.elementType}`);
  console.log(`Text: ${element.text.slice(0, 100)}`);
  if (element.metadata.pageNumber) {
    console.log(`Page: ${element.metadata.pageNumber}`);
  }
  if (element.metadata.coordinates) {
    const coords = element.metadata.coordinates;
    console.log(`Coords: (${coords.left}, ${coords.top}) - (${coords.right}, ${coords.bottom})`);
  }
  console.log("---");
}

// Filter by element type
const titles = result.elements.filter((e) => e.elementType === "title");
for (const title of titles) {
  // `??` only falls back when the level is missing; `||` would also replace
  // legitimate falsy values such as 0 or "".
  const level = title.metadata.additional?.level ?? "unknown";
  console.log(`[${level}] ${title.text}`);
}
```

View File

@@ -0,0 +1,105 @@
import {
type ExtractionConfig,
type HierarchyConfig,
Kreuzberg,
type PdfConfig,
} from "kreuzberg-wasm";
// Example 1: Basic hierarchy extraction
// Enabled with default kClusters=6 for standard H1-H6 heading hierarchy.
// Extract bounding box information for spatial layout awareness.
// Each example builds three nested objects: HierarchyConfig -> PdfConfig
// (as `hierarchy`) -> ExtractionConfig (as `pdfOptions`).
const hierarchyConfigBasic: HierarchyConfig = {
enabled: true,
kClusters: 6, // Default: creates 6 font size clusters (H1-H6 structure)
includeBbox: true, // Include bounding box coordinates
ocrCoverageThreshold: undefined, // No OCR coverage threshold
};
const pdfConfigBasic: PdfConfig = {
hierarchy: hierarchyConfigBasic,
};
// This is the config passed to the Kreuzberg constructor in
// _extractWithHierarchy further down in this file.
const extractionConfigBasic: ExtractionConfig = {
pdfOptions: pdfConfigBasic,
};
// const kreuzberg = new Kreuzberg(extractionConfigBasic);
// const result = await kreuzberg.extractFile("document.pdf");
// Example 2: Custom kClusters for minimal structure
// Use 3 clusters for simpler hierarchy with minimal structure.
// Useful when you only need major section divisions (Main, Subsection, Detail).
const hierarchyConfigMinimal: HierarchyConfig = {
enabled: true,
kClusters: 3, // Minimal clustering: just 3 levels
includeBbox: true,
ocrCoverageThreshold: undefined,
};
const pdfConfigMinimal: PdfConfig = {
hierarchy: hierarchyConfigMinimal,
};
// Underscore-prefixed: not referenced by any other code visible in this file.
const _extractionConfigMinimal: ExtractionConfig = {
pdfOptions: pdfConfigMinimal,
};
// const result = await kreuzberg.extractFile("document.pdf");
// Example 3: With OCR coverage threshold
// Trigger OCR if less than 50% of text has font data.
// Useful for documents with mixed digital and scanned content.
const hierarchyConfigOcr: HierarchyConfig = {
enabled: true,
kClusters: 6,
includeBbox: true,
ocrCoverageThreshold: 0.5, // Trigger OCR if text coverage < 50%
};
const pdfConfigOcr: PdfConfig = {
hierarchy: hierarchyConfigOcr,
};
// Underscore-prefixed: not referenced by any other code visible in this file.
const _extractionConfigOcr: ExtractionConfig = {
pdfOptions: pdfConfigOcr,
};
// const result = await kreuzberg.extractFile("document.pdf");
// Integration with Kreuzberg WASM instance
/**
 * Creates a Kreuzberg instance from `extractionConfigBasic`, extracts
 * "document.pdf" and logs either the result or the failure.
 */
async function _extractWithHierarchy(): Promise<void> {
  const extractor = new Kreuzberg(extractionConfigBasic);

  try {
    // Extract from file (requires file input or fetch)
    const extraction = await extractor.extractFile("document.pdf");
    console.log("Extraction complete:", extraction);
  } catch (failure) {
    console.error("Extraction failed:", failure);
  }
}
// Field descriptions:
//
// enabled: boolean (default: true)
// - Enable or disable hierarchy extraction
// - When false, hierarchy structure is not analyzed
//
// kClusters: number (default: 6, valid: 1-7)
// - Number of font size clusters for hierarchy levels
// - 6 provides H1-H6 heading levels with body text
// - Higher values create more fine-grained hierarchy
// - Lower values create simpler structure
//
// includeBbox: boolean (default: true)
// - Include bounding box coordinates in hierarchy blocks
// - Required for spatial layout awareness and document structure
// - Set to false only if space optimization is critical
//
// ocrCoverageThreshold: number | undefined (default: undefined)
// - Range: 0.0 to 1.0
// - Triggers OCR when text block coverage falls below this fraction
// - Example: 0.5 means "run OCR if less than 50% of page has text data"
// - undefined means no OCR coverage-based triggering
//

View File

@@ -0,0 +1,17 @@
```typescript title="WASM"
import init, { extractBytes } from "kreuzberg-wasm";
// Initialize the WASM module before extracting.
await init();

const response = await fetch("document.pdf");
const pdfBytes = new Uint8Array(await response.arrayBuffer());

const htmlConfig = {
  output_format: "html",
  html_output: { theme: "github" },
};

const { content } = await extractBytes(pdfBytes, "application/pdf", htmlConfig);
console.log(content); // HTML with kb-* classes
```

View File

@@ -0,0 +1,31 @@
import type { ExtractionConfig } from "@kreuzberg/wasm";
import { extractBytes, initWasm } from "@kreuzberg/wasm";
/**
 * Extracts "document.pdf" with image extraction configured (300 DPI target,
 * 2048 max dimension, aspect ratio preserved) and logs the size, format and
 * description of every extracted image.
 */
async function extractImagesWithConfig() {
  await initWasm();

  const response = await fetch("document.pdf");
  const bytes = new Uint8Array(await response.arrayBuffer());

  const config: ExtractionConfig = {
    images: {
      extractImages: true,
      targetDpi: 300,
      maxDimension: 2048,
      preserveAspectRatio: true,
    },
  };

  const result = await extractBytes(bytes, "application/pdf", config);
  if (result.images) {
    console.log(`Extracted ${result.images.length} images`);
    for (const image of result.images) {
      // The value printed is `image.description`; the label previously read
      // "DPI", which misreported what was being shown.
      console.log(
        `Image: ${image.width}x${image.height}, Format: ${image.format}, Description: ${image.description ?? "n/a"}`,
      );
    }
  }
}

extractImagesWithConfig().catch(console.error);

View File

@@ -0,0 +1,65 @@
import { type ExtractionConfig, extractFile } from "@kreuzberg/wasm";
// Example 1: Basic YAKE configuration
// YAKE with language "en" and no algorithm-specific parameter overrides.
async function basicYake(): Promise<void> {
  const yakeConfig: ExtractionConfig = {
    keywords: {
      algorithm: "yake",
      maxKeywords: 10,
      minScore: 0.0,
      ngramRange: [1, 3],
      language: "en",
      yakeParams: null,
      rakeParams: null,
    },
  };

  const { keywords } = await extractFile("document.pdf", null, yakeConfig);
  console.log("Keywords:", keywords);
}
// Example 2: Advanced YAKE with custom parameters
// Same algorithm as example 1, but with an explicit yakeParams.windowSize.
async function _advancedYake(): Promise<void> {
  const tunedYakeConfig: ExtractionConfig = {
    keywords: {
      algorithm: "yake",
      maxKeywords: 15,
      minScore: 0.1,
      ngramRange: [1, 2],
      language: "en",
      yakeParams: { windowSize: 1 },
      rakeParams: null,
    },
  };

  const { keywords } = await extractFile("document.pdf", null, tunedYakeConfig);
  console.log("Keywords:", keywords);
}
// Example 3: RAKE configuration
// Switches the algorithm to RAKE and sets word-length / phrase-length limits.
async function _rakeConfig(): Promise<void> {
  const rakeConfig: ExtractionConfig = {
    keywords: {
      algorithm: "rake",
      maxKeywords: 10,
      minScore: 5.0,
      ngramRange: [1, 3],
      language: "en",
      yakeParams: null,
      rakeParams: { minWordLength: 1, maxWordsPerPhrase: 3 },
    },
  };

  const { keywords } = await extractFile("document.pdf", null, rakeConfig);
  console.log("Keywords:", keywords);
}
// Report failures instead of leaving an unhandled promise rejection.
basicYake().catch(console.error);

View File

@@ -0,0 +1,20 @@
```typescript title="WASM"
import init, { extractBytes } from "kreuzberg-wasm";
// Initialize the WASM module before extracting.
await init();

const response = await fetch("document.pdf");
const pdfBytes = new Uint8Array(await response.arrayBuffer());

const keywordConfig = {
  keywords: {
    algorithm: "yake",
    max_keywords: 10,
    min_score: 0.1,
    ngram_range: [1, 3],
    language: "en",
  },
};

const { keywords } = await extractBytes(pdfBytes, "application/pdf", keywordConfig);
console.log(`Keywords: ${JSON.stringify(keywords)}`);
```

View File

@@ -0,0 +1,19 @@
```typescript title="WASM"
import init, { extractBytes } from "kreuzberg-wasm";
// Initialize the WASM module before extracting.
await init();

const response = await fetch("document.pdf");
const pdfBytes = new Uint8Array(await response.arrayBuffer());

const detectionConfig = {
  language_detection: {
    enabled: true,
    min_confidence: 0.8,
    detect_multiple: true,
  },
};

const extraction = await extractBytes(pdfBytes, "application/pdf", detectionConfig);
console.log(`Detected language: ${extraction.language}`);
console.log(`Confidence: ${extraction.language_confidence}`);
```

View File

@@ -0,0 +1,26 @@
import type { ExtractionConfig } from "@kreuzberg/wasm";
import { extractBytes, initWasm } from "@kreuzberg/wasm";
/**
 * Extracts "scanned.pdf" with the tesseract-wasm OCR backend (English) and
 * logs the recognized text plus any detected languages.
 */
async function extractWithOcr() {
  await initWasm();

  const response = await fetch("scanned.pdf");
  const scannedBytes = new Uint8Array(await response.arrayBuffer());

  const ocrConfig: ExtractionConfig = {
    ocr: { backend: "tesseract-wasm", language: "eng" },
  };

  const { content, detectedLanguages } = await extractBytes(
    scannedBytes,
    "application/pdf",
    ocrConfig,
  );

  console.log("Extracted text from scanned document:");
  console.log(content);
  if (detectedLanguages) {
    console.log("Detected languages:", detectedLanguages);
  }
}

// Run the example; any failure is reported on the console.
extractWithOcr().catch(console.error);

View File

@@ -0,0 +1,21 @@
```typescript title="WASM"
import init, { extractBytes } from "kreuzberg-wasm";
// Initialize the WASM module before extracting.
await init();

const response = await fetch("document.pdf");
const pdfBytes = new Uint8Array(await response.arrayBuffer());

const imageConfig = {
  images: {
    extract_images: true,
    target_dpi: 300,
    max_image_dimension: 4096,
    auto_adjust_dpi: true,
    min_dpi: 150,
    max_dpi: 600,
  },
};

const { images } = await extractBytes(pdfBytes, "application/pdf", imageConfig);
console.log(`Extracted images: ${images?.length ?? 0}`);
```

View File

@@ -0,0 +1,20 @@
```typescript title="WASM"
import init, { extractBytes } from "kreuzberg-wasm";
// Initialize the WASM module before extracting.
await init();

const response = await fetch("encrypted.pdf");
const pdfBytes = new Uint8Array(await response.arrayBuffer());

const pdfConfig = {
  pdf_options: {
    extract_images: true,
    passwords: ["password123"],
    extract_metadata: true,
    hierarchy: {},
  },
};

const { metadata } = await extractBytes(pdfBytes, "application/pdf", pdfConfig);
console.log(`Title: ${metadata?.title}`);
console.log(`Authors: ${metadata?.authors}`);
```

View File

@@ -0,0 +1,22 @@
```typescript title="WASM"
import init, { extractBytes } from "kreuzberg-wasm";
// Initialize the WASM module before extracting.
await init();

const response = await fetch("document.pdf");
const pdfBytes = new Uint8Array(await response.arrayBuffer());

const hierarchyConfig = {
  pdf_options: {
    hierarchy: {
      enabled: true,
      detection_threshold: 0.75,
      ocr_coverage_threshold: 0.8,
      min_level: 1,
      max_level: 5,
    },
  },
};

const extraction = await extractBytes(pdfBytes, "application/pdf", hierarchyConfig);
console.log(`Hierarchy levels: ${extraction.hierarchy?.length ?? 0}`);
```

View File

@@ -0,0 +1,17 @@
```typescript title="WASM"
import init, { extractBytes } from "kreuzberg-wasm";
// Initialize the WASM module before extracting.
await init();

const response = await fetch("document.pdf");
const pdfBytes = new Uint8Array(await response.arrayBuffer());

const postprocessConfig = {
  postprocessor: {
    enabled: true,
    enabled_processors: ["whitespace_normalizer", "unicode_normalizer"],
  },
};

const { content } = await extractBytes(pdfBytes, "application/pdf", postprocessConfig);
console.log(`Processed content: ${content}`);
```

View File

@@ -0,0 +1,16 @@
```typescript title="WASM"
import init, { extractBytes } from "kreuzberg-wasm";
// Initialize the WASM module before extracting.
await init();

const response = await fetch("document.pdf");
const pdfBytes = new Uint8Array(await response.arrayBuffer());

const qualityConfig = { enable_quality_processing: true, use_cache: true };

const extraction = await extractBytes(pdfBytes, "application/pdf", qualityConfig);
console.log(`Quality score: ${extraction.quality_score}`);
console.log(`Processing time: ${extraction.processing_time}`);
```

View File

@@ -0,0 +1,21 @@
```typescript title="WASM"
import init, { extractBytes } from "kreuzberg-wasm";
// Initialize the WASM module before extracting.
await init();

const response = await fetch("scanned.pdf");
const scannedBytes = new Uint8Array(await response.arrayBuffer());

const tesseractConfig = {
  ocr: {
    backend: "tesseract",
    language: "eng+deu",
    tesseract_config: { psm: 6, oem: 3 },
  },
};

const { content } = await extractBytes(scannedBytes, "application/pdf", tesseractConfig);
console.log(`OCR text: ${content}`);
```

View File

@@ -0,0 +1,18 @@
```typescript title="WASM"
import init, { extractBytes } from "kreuzberg-wasm";
// Initialize the WASM module before extracting.
await init();

const response = await fetch("document.pdf");
const pdfBytes = new Uint8Array(await response.arrayBuffer());

const reductionConfig = {
  token_reduction: { mode: "moderate", preserve_important_words: true },
};

const extraction = await extractBytes(pdfBytes, "application/pdf", reductionConfig);
console.log(`Original tokens: ${extraction.token_count}`);
console.log(`Reduced content: ${extraction.content}`);
```