Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,16 @@
import { extractFileSync } from '@kreuzberg/node';

// Extract with chunking (maxChars 500, maxOverlap 50) and page extraction enabled.
const result = extractFileSync('document.pdf', null, {
  chunking: { maxChars: 500, maxOverlap: 50 },
  pages: { extractPages: true },
});

// Chunks are only walked when the result actually carries them.
if (result.chunks) {
  for (const chunk of result.chunks) {
    // Only chunks whose metadata has a (truthy) first page are reported.
    if (chunk.metadata.firstPage) {
      // A chunk confined to one page prints "Page N"; otherwise "Pages N-M".
      const pageRange = chunk.metadata.firstPage === chunk.metadata.lastPage
        ? `Page ${chunk.metadata.firstPage}`
        : `Pages ${chunk.metadata.firstPage}-${chunk.metadata.lastPage}`;
      console.log(`Chunk: ${chunk.content.substring(0, 50)}... (${pageRange})`);
    }
  }
}

View File

@@ -0,0 +1,47 @@
```typescript title="TypeScript"
import { extractFile } from "@kreuzberg/node";
const config = {
chunking: {
maxChars: 1000,
maxOverlap: 200,
},
};
const result = await extractFile("document.pdf", null, config);
console.log(`Total chunks: ${result.chunks?.length ?? 0}`);
```
```typescript title="TypeScript - Semantic"
import { extractFile } from "@kreuzberg/node";
const config = {
chunking: {
chunkerType: "semantic",
},
};
const result = await extractFile("document.pdf", null, config);
for (const chunk of result.chunks ?? []) {
console.log(`Content: ${chunk.content.slice(0, 100)}...`);
}
```
```typescript title="TypeScript - Prepend Heading Context"
import { extractFile } from "@kreuzberg/node";
const config = {
chunking: {
chunkerType: "markdown",
maxChars: 500,
maxOverlap: 50,
prependHeadingContext: true,
},
};
const result = await extractFile("document.md", null, config);
for (const chunk of result.chunks ?? []) {
// Each chunk's content is prefixed with its heading breadcrumb
console.log(`Content: ${chunk.content.slice(0, 100)}...`);
}
```

View File

@@ -0,0 +1,25 @@
```typescript title="TypeScript"
import { extractFile } from "@kreuzberg/node";
const config = {
chunking: {
maxChars: 500,
maxOverlap: 50,
embedding: {
model: { type: "preset", name: "balanced" },
normalize: true,
},
},
};
const result = await extractFile("research_paper.pdf", null, config);
for (const chunk of result.chunks ?? []) {
console.log(`Chunk ${chunk.metadata.chunkIndex + 1}/${chunk.metadata.totalChunks}`);
console.log(`Position: ${chunk.metadata.byteStart}-${chunk.metadata.byteEnd}`);
console.log(`Content: ${chunk.content.slice(0, 100)}...`);
if (chunk.embedding) {
console.log(`Embedding: ${chunk.embedding.length} dimensions`);
}
}
```

View File

@@ -0,0 +1,18 @@
```typescript title="TypeScript"
import { extractFile, type ExtractionConfig, type ExtractionResult } from "@kreuzberg/node";
/**
* Note: Custom cache backends are not supported in TypeScript v4.0.
* Caching is handled internally by the Rust core.
*
* This example demonstrates the config structure.
* To enable caching, use the useCache flag.
*/
// Usage with built-in cache
const config: ExtractionConfig = {
useCache: true, // Enable internal Rust cache
};
const result = await extractFile("document.pdf", null, config);
```

View File

@@ -0,0 +1,19 @@
```typescript title="TypeScript"
import { listDocumentExtractors, unregisterDocumentExtractor } from "@kreuzberg/node";
/**
* Note: Custom document extractors are not directly supported in TypeScript v4.0.
* Document extraction logic lives in the Rust core.
*
* You can list and unregister built-in extractors, but cannot add custom ones
* from TypeScript. For custom extractors, implement them in Rust.
*/
// List all registered document extractors
const extractors = listDocumentExtractors();
console.log("Available extractors:", extractors);
// Example output: ['PDFExtractor', 'ImageExtractor', 'OfficeExtractor', ...]
// Unregister a built-in extractor (use with caution)
unregisterDocumentExtractor("SomeExtractor");
```

View File

@@ -0,0 +1,78 @@
```typescript title="TypeScript"
import { registerOcrBackend, type OcrBackendProtocol } from "@kreuzberg/node";
/**
 * Custom OCR backend implementation.
 * Posts each image to `${apiUrl}/ocr` and maps the JSON reply onto the
 * result object returned by `processImage`.
 * @example
 * const backend = new CustomOcrBackend("http://localhost:8000");
 * await backend.initialize();
 * registerOcrBackend(backend);
 */
class CustomOcrBackend implements OcrBackendProtocol {
  private apiUrl: string;

  /**
   * @param apiUrl Base URL of the OCR service; `/ocr` is appended per request.
   */
  constructor(apiUrl: string) {
    this.apiUrl = apiUrl;
  }

  /** Identifier this backend reports for itself. */
  name(): string {
    return "custom-ocr-backend";
  }

  /** Language codes this backend advertises. */
  supportedLanguages(): string[] {
    return ["en", "de", "fr", "es"];
  }

  /** Lifecycle hook; this example only logs the configured URL. */
  async initialize(): Promise<void> {
    console.log(`Initializing custom OCR backend at ${this.apiUrl}`);
  }

  /** Lifecycle hook; this example only logs. */
  async shutdown(): Promise<void> {
    console.log("Shutting down custom OCR backend");
  }

  /**
   * Process image and extract text via OCR.
   *
   * @param imageData Raw image bytes, or a base64-encoded string of them.
   * @param language Language code forwarded to the service and echoed in metadata.
   * @returns The recognised text plus the confidence and tables reported by the service.
   * @throws Error when the HTTP response is not OK, or when the JSON body has no string `text`.
   */
  async processImage(
    imageData: Uint8Array | string,
    language: string,
  ): Promise<{
    content: string;
    mime_type: string;
    metadata: Record<string, unknown>;
    tables: unknown[];
  }> {
    // Strings are treated as base64; byte arrays are copied into a Buffer.
    const buffer =
      typeof imageData === "string" ? Buffer.from(imageData, "base64") : Buffer.from(imageData);

    const formData = new FormData();
    const blob = new Blob([buffer], { type: "image/png" });
    formData.append("image", blob);
    formData.append("language", language);

    const response = await fetch(`${this.apiUrl}/ocr`, {
      method: "POST",
      body: formData,
    });
    if (!response.ok) {
      throw new Error(`OCR service failed: ${response.statusText}`);
    }

    // The body is external input: validate it before promising `content: string`.
    const payload: unknown = await response.json();
    if (
      typeof payload !== "object" ||
      payload === null ||
      typeof (payload as { text?: unknown }).text !== "string"
    ) {
      throw new Error("OCR service returned an invalid response: missing text");
    }
    const { text, confidence, tables } = payload as {
      text: string;
      confidence?: unknown;
      tables?: unknown;
    };

    return {
      content: text,
      mime_type: "text/plain",
      metadata: { confidence, language },
      // Anything that is not an array cannot satisfy `unknown[]`; fall back to none.
      tables: Array.isArray(tables) ? tables : [],
    };
  }
}
// Register custom OCR backend
const backend = new CustomOcrBackend("http://localhost:8000");
await backend.initialize();
registerOcrBackend(backend);
```

View File

@@ -0,0 +1,63 @@
```typescript title="TypeScript"
import {
registerPostProcessor,
unregisterPostProcessor,
type PostProcessorProtocol,
type ExtractionResult,
} from "@kreuzberg/node";
/**
 * Custom post-processor for cleaning extraction results.
 * Collapses runs of horizontal whitespace, applies two OCR character
 * substitutions, trims every line and drops blank lines, keeping line breaks.
 * @example
 * const processor = new CleaningPostProcessor();
 * registerPostProcessor(processor);
 */
class CleaningPostProcessor implements PostProcessorProtocol {
  /** Identifier this processor reports for itself. */
  name(): string {
    return "cleaning-postprocessor";
  }

  /** Stage reported for this processor. */
  processingStage(): "early" | "middle" | "late" {
    return "middle";
  }

  /**
   * Process extraction result for cleanup.
   *
   * @param result Extraction result whose `content` should be cleaned.
   * @returns A shallow copy of `result` with cleaned `content`; the input is not mutated.
   */
  process(result: ExtractionResult): ExtractionResult {
    return {
      ...result,
      content: this.cleanContent(result.content),
    };
  }

  /**
   * Remove artifacts and normalize whitespace.
   *
   * @param content Raw extracted text.
   * @returns Text with single spaces inside lines and no blank or padded lines.
   */
  private cleanContent(content: string): string {
    return (
      content
        // Collapse whitespace runs but leave "\n" alone: matching newlines here
        // would flatten the text to one line and make the per-line pass below a no-op.
        .replace(/[^\S\n]+/g, " ")
        // Character substitutions for common OCR artifacts.
        .replace(/\|/g, "l")
        .replace(/0O/g, "00")
        // Trim each line and discard the ones that end up empty.
        .split("\n")
        .map((line) => line.trim())
        .filter((line) => line.length > 0)
        .join("\n")
    );
  }
}
// Register the post-processor
const processor = new CleaningPostProcessor();
registerPostProcessor(processor);
// Later, unregister if needed
// unregisterPostProcessor("cleaning-postprocessor");
```

View File

@@ -0,0 +1,86 @@
```typescript title="TypeScript"
import {
registerValidator,
unregisterValidator,
extractFile,
type ValidatorProtocol,
type ExtractionResult,
ValidationError,
} from "@kreuzberg/node";
/**
 * Custom validator for quality checking.
 * Rejects results whose content is blank, shorter than 50 characters, or
 * that carry no metadata, by throwing `ValidationError`.
 * @example
 * const validator = new QualityValidator();
 * registerValidator(validator);
 */
class QualityValidator implements ValidatorProtocol {
  /** Identifier this validator reports for itself. */
  name(): string {
    return "quality-validator";
  }

  /** Priority value reported for this validator. */
  priority(): number {
    return 10;
  }

  /**
   * Validate extraction result meets quality standards.
   *
   * @param result Extraction result to check.
   * @throws ValidationError on the first failed check.
   */
  validate(result: ExtractionResult): void {
    // Blank content is checked first; otherwise the length check would always
    // fire for an empty string and the more specific "empty" error could never be seen.
    this.checkEmptyContent(result);
    this.checkMinimumLength(result);
    this.checkMetadata(result);
  }

  /**
   * Ensure minimum content length.
   *
   * @throws ValidationError when `content.length` is below 50.
   */
  private checkMinimumLength(result: ExtractionResult): void {
    const minLength = 50;
    if (result.content.length < minLength) {
      // `String.prototype.length` counts UTF-16 code units, not bytes.
      throw new ValidationError(
        `Content too short: ${result.content.length} characters (minimum ${minLength})`,
      );
    }
  }

  /**
   * Ensure content is not empty.
   *
   * @throws ValidationError when the content is empty after trimming.
   */
  private checkEmptyContent(result: ExtractionResult): void {
    const trimmed = result.content.trim();
    if (trimmed.length === 0) {
      throw new ValidationError("Extracted content is empty");
    }
  }

  /**
   * Validate metadata is present.
   *
   * @throws ValidationError when `metadata` is missing or has no own keys.
   */
  private checkMetadata(result: ExtractionResult): void {
    if (!result.metadata || Object.keys(result.metadata).length === 0) {
      throw new ValidationError("Missing extraction metadata");
    }
  }
}
// Register the validator
const validator = new QualityValidator();
registerValidator(validator);
// Usage with error handling (must use async extraction for custom validators)
try {
const result = await extractFile("document.pdf");
console.log(`Validated content length: ${result.content.length} characters`);
} catch (error) {
if (error instanceof ValidationError) {
console.error(`Validation failed: ${error.message}`);
} else {
throw error;
}
}
// Later, unregister if needed
// unregisterValidator("quality-validator");
```

View File

@@ -0,0 +1,14 @@
```typescript title="TypeScript"
const config = {
chunking: {
maxChars: 1024,
maxOverlap: 100,
embedding: {
model: { type: "preset", name: "balanced" },
normalize: true,
batchSize: 32,
showDownloadProgress: false,
},
},
};
```

View File

@@ -0,0 +1,11 @@
```typescript title="TypeScript"
const config = {
keywords: {
algorithm: "yake",
maxKeywords: 10,
minScore: 0.3,
ngramRange: [1, 3],
language: "en",
},
};
```

View File

@@ -0,0 +1,17 @@
```typescript title="TypeScript"
import { extractFile } from "@kreuzberg/node";
const config = {
keywords: {
algorithm: "yake",
maxKeywords: 10,
minScore: 0.3,
},
};
const result = await extractFile("research_paper.pdf", null, config);
for (const keyword of result.extractedKeywords ?? []) {
console.log(`${keyword.text}: ${keyword.score.toFixed(3)}`);
}
```

View File

@@ -0,0 +1,9 @@
```typescript title="TypeScript"
const config = {
languageDetection: {
enabled: true,
minConfidence: 0.8,
detectMultiple: false,
},
};
```

View File

@@ -0,0 +1,15 @@
```typescript title="TypeScript"
import { extractFile } from "@kreuzberg/node";
const config = {
languageDetection: {
enabled: true,
minConfidence: 0.8,
detectMultiple: true,
},
};
const result = await extractFile("multilingual_document.pdf", null, config);
console.log(`Detected languages: ${result.detectedLanguages?.join(", ") ?? ""}`);
```

View File

@@ -0,0 +1,128 @@
```typescript title="TypeScript"
import {
registerPostProcessor,
registerValidator,
registerOcrBackend,
listPostProcessors,
listValidators,
listOcrBackends,
unregisterPostProcessor,
unregisterValidator,
clearPostProcessors,
clearValidators,
clearOcrBackends,
type PostProcessorProtocol,
type ValidatorProtocol,
type OcrBackendProtocol,
} from "@kreuzberg/node";
/**
* Plugin registry and lifecycle management
* Demonstrates how to register, list, and unregister plugins
* @example
* const registry = new PluginRegistry();
* registry.registerAll();
* registry.listAll();
*/
class PluginRegistry {
  private postProcessors: PostProcessorProtocol[] = [];
  private validators: ValidatorProtocol[] = [];
  private ocrBackends: OcrBackendProtocol[] = [];

  /** Queue a post-processor for the next `registerAll()` call. */
  addPostProcessor(processor: PostProcessorProtocol): void {
    this.postProcessors.push(processor);
  }

  /** Queue a validator for the next `registerAll()` call. */
  addValidator(validator: ValidatorProtocol): void {
    this.validators.push(validator);
  }

  /** Queue an OCR backend for the next `registerAll()` call. */
  addOcrBackend(backend: OcrBackendProtocol): void {
    this.ocrBackends.push(backend);
  }

  /**
   * Register every queued plugin: post-processors first, then validators,
   * then OCR backends, logging each one after it is registered.
   */
  registerAll(): void {
    console.log("Registering all plugins...");

    for (const queuedProcessor of this.postProcessors) {
      registerPostProcessor(queuedProcessor);
      console.log(`Registered post-processor: ${queuedProcessor.name()}`);
    }

    for (const queuedValidator of this.validators) {
      registerValidator(queuedValidator);
      console.log(`Registered validator: ${queuedValidator.name()}`);
    }

    for (const queuedBackend of this.ocrBackends) {
      registerOcrBackend(queuedBackend);
      console.log(`Registered OCR backend: ${queuedBackend.name()}`);
    }
  }

  /** Print the names returned by the three list functions, one line per plugin kind. */
  listAll(): void {
    const processorNames = listPostProcessors();
    const validatorNames = listValidators();
    const backendNames = listOcrBackends();

    const report = [
      "Registered plugins:",
      ` Post-processors: ${processorNames.join(", ")}`,
      ` Validators: ${validatorNames.join(", ")}`,
      ` OCR backends: ${backendNames.join(", ")}`,
    ];
    for (const line of report) {
      console.log(line);
    }
  }

  /**
   * Unregister one plugin by name.
   *
   * @param name Name the plugin was registered under.
   * @param type Which kind of plugin to remove; other values are ignored.
   */
  unregisterPlugin(name: string, type: "processor" | "validator"): void {
    switch (type) {
      case "processor":
        unregisterPostProcessor(name);
        console.log(`Unregistered post-processor: ${name}`);
        break;
      case "validator":
        unregisterValidator(name);
        console.log(`Unregistered validator: ${name}`);
        break;
    }
  }

  /** Clear post-processors, validators and OCR backends, then log once. */
  clearAll(): void {
    clearPostProcessors();
    clearValidators();
    clearOcrBackends();
    console.log("Cleared all plugins");
  }
}
// Usage
const registry = new PluginRegistry();
// Add plugins to registry
// registry.addPostProcessor(customProcessor);
// registry.addValidator(customValidator);
// Register all plugins
registry.registerAll();
// List registered plugins
registry.listAll();
// Unregister specific plugin
// registry.unregisterPlugin("custom-processor", "processor");
// Clear all plugins
// registry.clearAll();
```

View File

@@ -0,0 +1,5 @@
```typescript title="TypeScript"
const config = {
enableQualityProcessing: true,
};
```

View File

@@ -0,0 +1,17 @@
```typescript title="TypeScript"
import { extractFile } from "@kreuzberg/node";
const config = {
enableQualityProcessing: true,
};
const result = await extractFile("scanned_document.pdf", null, config);
if (result.qualityScore !== null && result.qualityScore !== undefined) {
if (result.qualityScore < 0.5) {
console.warn(`Warning: Low quality extraction (${result.qualityScore.toFixed(2)})`);
} else {
console.log(`Quality score: ${result.qualityScore.toFixed(2)}`);
}
}
```

View File

@@ -0,0 +1,10 @@
```typescript title="TypeScript"
const config = {
tokenReduction: {
level: "Moderate",
preserveMarkdown: true,
preserveCode: true,
languageHint: "eng",
},
};
```

View File

@@ -0,0 +1,14 @@
```typescript title="TypeScript"
import { extractFile } from "@kreuzberg/node";
const config = {
tokenReduction: {
level: "Moderate",
preserveMarkdown: true,
},
};
const result = await extractFile("verbose_document.pdf", null, config);
console.log(`Reduced content length: ${result.content?.length ?? 0} chars`);
```

View File

@@ -0,0 +1,49 @@
```typescript title="TypeScript"
import { extractFile } from "@kreuzberg/node";
/** One row ready to be written to a vector store. */
interface VectorRecord {
  id: string;
  content: string;
  embedding: number[];
  metadata: Record<string, string>;
}

/**
 * Extract a document with chunking and embeddings enabled and turn every
 * chunk that carries an embedding into a `VectorRecord`.
 *
 * @param documentPath Path handed to `extractFile`.
 * @param documentId Identifier used in each record id and in its metadata.
 * @returns Records for the chunks that have an embedding; chunks without one are skipped.
 */
async function extractAndVectorize(
  documentPath: string,
  documentId: string,
): Promise<VectorRecord[]> {
  const extraction = await extractFile(documentPath, null, {
    chunking: {
      maxChars: 512,
      maxOverlap: 50,
      embedding: {
        model: { type: "preset", name: "balanced" },
        normalize: true,
        batchSize: 32,
      },
    },
  });

  const chunks = extraction.chunks ?? [];
  const vectors: VectorRecord[] = [];

  // The position counts every chunk, including skipped ones, so ids stay
  // aligned with the chunk order of the extraction result.
  for (let position = 0; position < chunks.length; position++) {
    const piece = chunks[position];
    if (piece.embedding) {
      vectors.push({
        id: `${documentId}_chunk_${position}`,
        content: piece.content,
        embedding: piece.embedding,
        metadata: {
          document_id: documentId,
          chunk_index: String(position),
          content_length: String(piece.content.length),
        },
      });
    }
  }

  return vectors;
}
await extractAndVectorize("document.pdf", "doc_001");
```

View File

@@ -0,0 +1,50 @@
# Worker Thread Pool Example
Use worker pools for CPU-bound batch processing to maximize parallelism and throughput:
```typescript title="worker_pool.ts"
import {
createWorkerPool,
extractFileInWorker,
batchExtractFilesInWorker,
closeWorkerPool,
} from "@kreuzberg/node";
// Create a pool with 4 worker threads
const pool = createWorkerPool(4);
try {
// Extract single file in worker
const result = await extractFileInWorker(pool, "document.pdf", null, {
useCache: true,
});
console.log(result.content);
// Extract multiple files concurrently
const files = ["doc1.pdf", "doc2.docx", "doc3.xlsx"];
const results = await batchExtractFilesInWorker(pool, files, {
useCache: true,
});
results.forEach((result, i) => {
console.log(`File ${i + 1}: ${result.content.length} characters`);
});
} finally {
// Always close the pool when done
await closeWorkerPool(pool);
}
```
**Performance Benefits:**
- **Parallel Processing**: Multiple documents extracted simultaneously
- **CPU Usage**: Maximizes multi-core CPU usage for large batches
- **Queue Management**: Automatically distributes work across available workers
- **Resource Control**: Prevents thread exhaustion with configurable pool size
**Best Practices:**
- Use worker pools for batches of 10+ documents
- Set pool size to number of CPU cores (default behavior)
- Always close pools with `closeWorkerPool()` to prevent resource leaks
- Reuse pools across multiple batch operations for efficiency

View File

@@ -0,0 +1,18 @@
```typescript title="TypeScript"
import { batchExtractBytesSync } from "kreuzberg";
import { readFileSync } from "fs";
const doc1 = readFileSync("doc1.pdf");
const doc2 = readFileSync("doc2.pdf");
const items = [
{ content: doc1, mimeType: "application/pdf", config: undefined },
{ content: doc2, mimeType: "application/pdf", config: undefined },
];
const results = batchExtractBytesSync(items);
results.forEach((result, i) => {
console.log(`Document ${i}: ${result.content.length} chars`);
});
```

View File

@@ -0,0 +1,15 @@
```typescript title="TypeScript"
import { batchExtractFilesSync } from "kreuzberg";
const items = [
{ path: "doc1.pdf", config: undefined },
{ path: "doc2.docx", config: undefined },
{ path: "report.pdf", config: undefined },
];
const results = batchExtractFilesSync(items);
results.forEach((result, i) => {
console.log(`File ${i}: ${result.content.length} chars`);
});
```

View File

@@ -0,0 +1,23 @@
```typescript title="TypeScript"
const response = await fetch("http://localhost:8000/chunk", {
method: "POST",
headers: { "Content-Type": "application/json" },
body: JSON.stringify({
text: "Your long text content here...",
chunker_type: "text",
config: {
max_characters: 1000,
overlap: 50,
trim: true,
},
}),
});
const result = await response.json();
console.log(`Created ${result.chunk_count} chunks`);
result.chunks.forEach((chunk: { content: string; chunk_index: number }) => {
const preview = chunk.content.substring(0, 50);
console.log(`Chunk ${chunk.chunk_index}: ${preview}...`);
});
```

View File

@@ -0,0 +1,32 @@
```typescript title="TypeScript"
import { readFileSync } from "fs";
async function extractViaClient() {
const formData = new FormData();
const fileData = readFileSync("document.pdf");
formData.append("files", new Blob([fileData]), "document.pdf");
try {
const response = await fetch("http://localhost:8000/extract", {
method: "POST",
body: formData,
});
if (!response.ok) {
const error = await response.json();
console.error(`Error: ${error.error_type}: ${error.message}`);
return;
}
const results = await response.json();
console.log(`Extracted ${results.length} document(s)`);
console.log(results[0].content);
} catch (error: unknown) {
if (error instanceof Error) {
console.error(`Request failed: ${error.message}`);
}
}
}
extractViaClient();
```

View File

@@ -0,0 +1,45 @@
```typescript title="TypeScript"
import { extractFileSync } from "kreuzberg";
// NOTE(review): this snippet uses snake_case keys and imports from "kreuzberg",
// while other TypeScript snippets in this file use camelCase keys (useCache,
// outputFormat, includeDocumentStructure, ...) and "@kreuzberg/node" — confirm
// which binding/version this example targets.
const config = {
// OCR: Tesseract backend with language "eng"; force_ocr is left false
force_ocr: false,
ocr: {
backend: "tesseract",
language: "eng",
},
// Chunking: "markdown" chunker, max_characters 800, overlap 100, heading context prepended
chunking: {
max_characters: 800,
overlap: 100,
chunker_type: "markdown",
prepend_heading_context: true,
},
// Output: markdown format with document structure included
output_format: "markdown",
include_document_structure: true,
// Images: extract embedded images
images: {
extract_images: true,
},
// Enable the result cache and quality processing
use_cache: true,
enable_quality_processing: true,
};
// Synchronous extraction; no MIME type is passed (second argument is undefined).
const result = extractFileSync("report.pdf", undefined, config);
// Print the total content length followed by the first 200 characters.
console.log(`Content (${result.content.length} chars):`);
console.log(result.content.slice(0, 200));
// The remaining fields are printed only when present on the result.
if (result.chunks) {
console.log(`\nChunks: ${result.chunks.length}`);
}
console.log(`Tables: ${result.tables?.length ?? 0}`);
if (result.detected_languages) {
console.log(`Languages: ${result.detected_languages}`);
}
if (result.extraction_method) {
console.log(`Extraction method: ${result.extraction_method}`);
}
```

View File

@@ -0,0 +1,13 @@
```typescript title="TypeScript"
import { extractFileSync } from "kreuzberg";
try {
const result = extractFileSync("missing.pdf");
console.log(result.content);
} catch (error: unknown) {
if (error instanceof Error) {
console.error(`Extraction failed: ${error.message}`);
}
throw error;
}
```

View File

@@ -0,0 +1,31 @@
```typescript title="TypeScript"
import { readFileSync } from "node:fs";
async function extractDocument(): Promise<void> {
const formData = new FormData();
const fileData = readFileSync("document.pdf");
formData.append("files", new Blob([fileData]), "document.pdf");
try {
const response = await fetch("http://localhost:8000/extract", {
method: "POST",
body: formData,
});
if (!response.ok) {
const error = await response.json();
console.error(`Error: ${error.error_type}: ${error.message}`);
return;
}
const results = await response.json();
console.log(`Extracted ${results.length} documents`);
} catch (error: unknown) {
if (error instanceof Error) {
console.error(`Request failed: ${error.message}`);
}
}
}
extractDocument();
```

View File

@@ -0,0 +1,14 @@
```typescript title="TypeScript"
import { extractBytes } from "kreuzberg";
import { readFileSync } from "fs";
async function main() {
const content = readFileSync("document.pdf");
const result = await extractBytes(content, "application/pdf");
console.log(result.content);
console.log(`Tables: ${result.tables?.length ?? 0}`);
}
main();
```

View File

@@ -0,0 +1,10 @@
```typescript title="TypeScript"
import { extractBytesSync } from "kreuzberg";
import { readFileSync } from "fs";
const content = readFileSync("document.pdf");
const result = extractBytesSync(content, "application/pdf");
console.log(result.content);
console.log(`Tables: ${result.tables?.length ?? 0}`);
```

View File

@@ -0,0 +1,12 @@
```typescript title="TypeScript"
import { extractFile } from "kreuzberg";
async function main() {
const result = await extractFile("document.pdf");
console.log(result.content);
console.log(`Tables: ${result.tables?.length ?? 0}`);
}
main();
```

View File

@@ -0,0 +1,9 @@
```typescript title="TypeScript"
import { extractFileSync } from "kreuzberg";
const result = extractFileSync("document.pdf");
console.log(result.content);
console.log(`MIME type: ${result.mime_type}`);
console.log(`Tables: ${result.tables?.length ?? 0}`);
```

View File

@@ -0,0 +1,11 @@
```typescript title="TypeScript"
import { extractFileSync } from "kreuzberg";
const result = extractFileSync("document.pdf");
result.tables?.forEach((table) => {
console.log(`Table with ${table.cells?.length ?? 0} rows`);
console.log(table.markdown);
table.cells?.forEach((row) => console.log(row.join(" | ")));
});
```

View File

@@ -0,0 +1,49 @@
```typescript title="simple_benchmark.ts"
/* oxlint-disable */
import { extractFile, type ExtractionConfig } from "@kreuzberg/node";
/**
 * Times `extractFile` on "document.pdf" in three phases and logs the results:
 * 1. `numRuns` calls awaited one after another with the cache disabled,
 * 2. `numRuns` calls started together and awaited via `Promise.all`,
 * 3. two back-to-back calls with `useCache: true`.
 *
 * NOTE(review): phase 1 is labelled "Sync" in the output, but it awaits the
 * async `extractFile` sequentially rather than calling a synchronous API.
 */
async function benchmarkExtractions(): Promise<void> {
const config: ExtractionConfig = { useCache: false };
const filePath = "document.pdf";
const numRuns = 10;
// Phase 1: sequential — each call finishes before the next one starts.
let start = performance.now();
for (let i = 0; i < numRuns; i++) {
await extractFile(filePath, null, config);
}
// performance.now() is in milliseconds; divide by 1000 to report seconds.
const syncDuration = (performance.now() - start) / 1000;
const avgSync = syncDuration / numRuns;
console.log("Sync extraction (" + numRuns + " runs):");
console.log(" - Total time: " + syncDuration.toFixed(3) + "s");
console.log(" - Average: " + avgSync.toFixed(3) + "s per extraction");
// Phase 2: all calls are started before any is awaited.
start = performance.now();
const tasks = Array(numRuns)
.fill(null)
.map(() => extractFile(filePath, null, config));
await Promise.all(tasks);
const asyncDuration = (performance.now() - start) / 1000;
console.log("\nAsync extraction (" + numRuns + " parallel runs):");
console.log(" - Total time: " + asyncDuration.toFixed(3) + "s");
console.log(" - Average: " + (asyncDuration / numRuns).toFixed(3) + "s per extraction");
// Speedup is the ratio of phase 1 total time to phase 2 total time.
console.log(" - Speedup: " + (syncDuration / asyncDuration).toFixed(1) + "x");
// Phase 3: same file twice with useCache enabled; only elapsed times are used,
// the extraction results themselves (_result1, _result2) are discarded.
const cacheConfig: ExtractionConfig = { useCache: true };
console.log("\nFirst extraction (populates cache)...");
start = performance.now();
const _result1 = await extractFile(filePath, null, cacheConfig);
const firstDuration = (performance.now() - start) / 1000;
console.log(" - Time: " + firstDuration.toFixed(3) + "s");
console.log("Second extraction (from cache)...");
start = performance.now();
const _result2 = await extractFile(filePath, null, cacheConfig);
const cachedDuration = (performance.now() - start) / 1000;
console.log(" - Time: " + cachedDuration.toFixed(3) + "s");
console.log(" - Cache speedup: " + (firstDuration / cachedDuration).toFixed(1) + "x");
}
benchmarkExtractions().catch(console.error);
```

View File

@@ -0,0 +1,20 @@
```typescript title="disk_cache.ts"
/* oxlint-disable */
import { extractFile, type ExtractionConfig } from "@kreuzberg/node";
const config: ExtractionConfig = { useCache: true };
(async () => {
console.log("First extraction (will be cached)...");
const result1 = await extractFile("document.pdf", null, config);
const length1 = result1.content.length;
console.log(" - Content length: " + length1);
console.log("\nSecond extraction (from cache)...");
const result2 = await extractFile("document.pdf", null, config);
const length2 = result2.content.length;
console.log(" - Content length: " + length2);
const isIdentical = result1.content === result2.content;
console.log("\nResults are identical: " + isIdentical);
})();
```

View File

@@ -0,0 +1,51 @@
```typescript title="basic_cli.ts"
/* oxlint-disable */
import { spawn } from "child_process";
/** Shape of the JSON object this example reads from the CLI's `--format json` output. */
interface ExtractionResult {
  content: string;
  format?: string;
  languages?: string[];
}

/**
 * Run `kreuzberg extract <filePath> --format <outputFormat>` and collect its output.
 *
 * @param filePath Document path passed to the CLI.
 * @param outputFormat Value for `--format`; `"json"` makes the result a parsed object.
 * @returns Raw stdout as a string, or the parsed JSON when `outputFormat` is `"json"`.
 * @throws Error (as a rejection) when the process cannot be started, exits with a
 *   non-zero code, or prints output that is not valid JSON in `"json"` mode.
 */
async function extractWithCli(
  filePath: string,
  outputFormat: string = "text"
): Promise<string | ExtractionResult> {
  return new Promise((resolve, reject) => {
    const child = spawn("kreuzberg", ["extract", filePath, "--format", outputFormat]);
    let stdout = "";
    let stderr = "";

    // Let the streams decode UTF-8 statefully; calling toString() per chunk
    // corrupts multi-byte characters that straddle a chunk boundary.
    child.stdout.setEncoding("utf8");
    child.stderr.setEncoding("utf8");
    child.stdout.on("data", (chunk: string) => {
      stdout += chunk;
    });
    child.stderr.on("data", (chunk: string) => {
      stderr += chunk;
    });

    // A spawn failure (e.g. the binary is not on PATH) is reported through the
    // "error" event. Without a listener Node throws it as an uncaught exception
    // and this promise would never settle.
    child.on("error", reject);

    child.on("close", (code) => {
      if (code !== 0) {
        reject(new Error("CLI exited with code " + code + ": " + stderr));
        return;
      }
      if (outputFormat !== "json") {
        resolve(stdout);
        return;
      }
      // JSON.parse runs inside an event callback: turn a parse failure into a
      // rejection instead of letting it escape as an uncaught exception.
      try {
        resolve(JSON.parse(stdout));
      } catch (error) {
        reject(error);
      }
    });
  });
}
const document = "document.pdf";
const textOutput = await extractWithCli(document, "text");
console.log(`Extracted: ${(textOutput as string).length} characters`);
const jsonOutput = (await extractWithCli(document, "json")) as ExtractionResult;
console.log(`Format: ${jsonOutput.format}`);
```

View File

@@ -0,0 +1,57 @@
```typescript title="cli_with_config.ts"
/* oxlint-disable */
import { spawn } from "child_process";
/** Shape of the JSON object this example reads from the CLI's `--format json` output. */
interface ExtractionResult {
  content: string;
  format?: string;
  languages?: string[];
}

/**
 * Run `kreuzberg extract <filePath> --config <configPath> --format json` and
 * parse its stdout.
 *
 * @param filePath Document path passed to the CLI.
 * @param configPath Configuration file path passed via `--config`.
 * @returns The JSON object printed by the CLI.
 * @throws Error (as a rejection) when the process cannot be started, exits with a
 *   non-zero code, or prints output that is not valid JSON.
 */
async function extractWithConfig(
  filePath: string,
  configPath: string
): Promise<ExtractionResult> {
  return new Promise((resolve, reject) => {
    const child = spawn("kreuzberg", [
      "extract",
      filePath,
      "--config",
      configPath,
      "--format",
      "json",
    ]);
    let stdout = "";
    let stderr = "";

    // Let the streams decode UTF-8 statefully; calling toString() per chunk
    // corrupts multi-byte characters that straddle a chunk boundary.
    child.stdout.setEncoding("utf8");
    child.stderr.setEncoding("utf8");
    child.stdout.on("data", (chunk: string) => {
      stdout += chunk;
    });
    child.stderr.on("data", (chunk: string) => {
      stderr += chunk;
    });

    // A spawn failure (e.g. the binary is not on PATH) is reported through the
    // "error" event. Without a listener Node throws it as an uncaught exception
    // and this promise would never settle.
    child.on("error", reject);

    child.on("close", (code) => {
      if (code !== 0) {
        reject(new Error("CLI exited with code " + code + ": " + stderr));
        return;
      }
      // JSON.parse runs inside an event callback: turn a parse failure into a
      // rejection instead of letting it escape as an uncaught exception.
      try {
        resolve(JSON.parse(stdout));
      } catch (error) {
        reject(error);
      }
    });
  });
}
const configFile = "kreuzberg.toml";
const document = "document.pdf";
console.log("Extracting " + document + " with config " + configFile);
const result = await extractWithConfig(document, configFile);
console.log("Content length: " + result.content.length);
console.log("Format: " + result.format);
const languages = result.languages ? result.languages.join(", ") : "";
console.log("Languages: " + languages);
```

View File

@@ -0,0 +1,35 @@
```typescript title="TypeScript"
import { extractFileSync } from "@kreuzberg/node";
// One config object combining OCR, chunking, token reduction, language
// detection, caching and quality processing.
const config = {
ocr: {
backend: "tesseract",
language: "eng+deu",
},
chunking: {
maxChars: 1000,
maxOverlap: 100,
},
// NOTE(review): other snippets in this file configure token reduction with
// `level: "Moderate"`; here the key is `mode` with a lowercase value — confirm
// which field name and casing the binding expects.
tokenReduction: {
mode: "aggressive",
},
languageDetection: {
enabled: true,
detectMultiple: true,
},
useCache: true,
enableQualityProcessing: true,
};
// Synchronous extraction; no MIME type is passed (second argument is null).
const result = extractFileSync("document.pdf", null, config);
// Print the first 100 characters of every chunk, when chunks are present.
if (result.chunks) {
for (const chunk of result.chunks) {
console.log(`Chunk: ${chunk.content.substring(0, 100)}...`);
}
}
// Print the detected languages as a comma-separated list, when present.
if (result.detectedLanguages) {
console.log(`Languages: ${result.detectedLanguages.join(", ")}`);
}
```

View File

@@ -0,0 +1,70 @@
```typescript title="TypeScript"
import { extractFile } from "@kreuzberg/node";
const config = {
chunking: {
maxChars: 1000,
maxOverlap: 200,
},
};
const result = await extractFile("document.pdf", null, config);
console.log(`Total chunks: ${result.chunks?.length ?? 0}`);
```
```typescript title="TypeScript - Markdown with Heading Context"
import { extractFile } from "@kreuzberg/node";
const config = {
chunking: {
chunkerType: "markdown",
maxChars: 500,
maxOverlap: 50,
sizingType: "tokenizer",
sizingModel: "Xenova/gpt-4o",
},
};
const result = await extractFile("document.md", null, config);
for (const chunk of result.chunks ?? []) {
const headings = chunk.metadata?.headingContext?.headings ?? [];
for (const heading of headings) {
console.log(`Heading L${heading.level}: ${heading.text}`);
}
console.log(`Content: ${chunk.content.slice(0, 100)}...`);
}
```
```typescript title="TypeScript - Semantic"
import { extractFile } from "@kreuzberg/node";
const config = {
chunking: {
chunkerType: "semantic",
},
};
const result = await extractFile("document.pdf", null, config);
for (const chunk of result.chunks ?? []) {
console.log(`Content: ${chunk.content.slice(0, 100)}...`);
}
```
```typescript title="TypeScript - Prepend Heading Context"
import { extractFile } from "@kreuzberg/node";
const config = {
chunking: {
chunkerType: "markdown",
maxChars: 500,
maxOverlap: 50,
prependHeadingContext: true,
},
};
const result = await extractFile("document.md", null, config);
for (const chunk of result.chunks ?? []) {
// Each chunk's content is prefixed with its heading breadcrumb
console.log(`Content: ${chunk.content.slice(0, 100)}...`);
}
```

View File

@@ -0,0 +1,11 @@
```typescript title="TypeScript"
import { extractFile } from "@kreuzberg/node";
const config = {
useCache: true,
enableQualityProcessing: true,
};
const result = await extractFile("document.pdf", null, config);
console.log(result.content);
```

View File

@@ -0,0 +1,11 @@
```typescript title="TypeScript"
import { extractFile, ExtractionConfig } from "@kreuzberg/node";
const config = ExtractionConfig.discover();
if (config) {
const result = await extractFile("document.pdf", null, config);
console.log(result.content);
} else {
console.log("No configuration file found");
}
```

View File

@@ -0,0 +1,20 @@
# Configuration Discovery Example
Use `ExtractionConfig.discover()` to automatically find and load configuration files from the current directory or parent directories:
```typescript title="config_discovery.ts"
import { ExtractionConfig, extractFile } from "@kreuzberg/node";
const config = ExtractionConfig.discover();
if (config) {
console.log("Found configuration file");
const result = await extractFile("document.pdf", null, config);
console.log(result.content);
} else {
console.log("No configuration file found, using defaults");
const result = await extractFile("document.pdf");
console.log(result.content);
}
```
The discovery method looks for `kreuzberg.toml`, `kreuzberg.yaml`, or `kreuzberg.json` files starting in the current directory and searching parent directories up to the filesystem root.

View File

@@ -0,0 +1,16 @@
```typescript title="TypeScript"
import { extractFile } from "@kreuzberg/node";
// Tesseract backend, languages "eng+fra", page segmentation mode (psm) 3.
const tesseractConfig = { psm: 3 };
const config = {
  ocr: { backend: "tesseract", language: "eng+fra", tesseractConfig },
};
const { content } = await extractFile("document.pdf", null, config);
console.log(content);
```

View File

@@ -0,0 +1,22 @@
```typescript title="TypeScript"
import { extractFileSync, type ExtractionConfig } from "@kreuzberg/node";
// One typed config combining caching, OCR, chunking and quality processing.
const config: ExtractionConfig = {
  useCache: true,
  enableQualityProcessing: true,
  ocr: {
    backend: "tesseract",
    language: "eng+deu",
    tesseractConfig: { psm: 6 },
  },
  chunking: { maxChars: 1000, maxOverlap: 200 },
};
const { content } = extractFileSync("document.pdf", null, config);
console.log(`Content length: ${content.length}`);
```

View File

@@ -0,0 +1,15 @@
```typescript title="Document Structure Config (TypeScript)"
import { extractFileSync, ExtractionConfig } from "@kreuzberg/node";
// Ask the extractor to include the document structure in the result.
const config: ExtractionConfig = { includeDocumentStructure: true };
const { document: structure } = extractFileSync("document.pdf", undefined, config);
// `document` is only present when structure was produced; print one line per node.
for (const node of structure ? structure.nodes : []) {
  console.log(`[${node.content.nodeType}] ${node.content.text ?? ""}`);
}
```

View File

@@ -0,0 +1,35 @@
```typescript title="Element-Based Output (TypeScript)"
import { extractFileSync, ExtractionConfig } from "@kreuzberg/node";
// Configure element-based output
const config: ExtractionConfig = {
  outputFormat: "element_based",
};
// Extract document
const result = extractFileSync("document.pdf", null, config);
// Guard against a missing `elements` field so neither the loop nor the
// filter below runs on null/undefined.
// NOTE(review): confirm against the typings whether `elements` is optional.
const elements = result.elements ?? [];
// Access elements
for (const element of elements) {
  console.log(`Type: ${element.elementType}`);
  console.log(`Text: ${element.text.slice(0, 100)}`);
  if (element.metadata.pageNumber) {
    console.log(`Page: ${element.metadata.pageNumber}`);
  }
  if (element.metadata.coordinates) {
    const coords = element.metadata.coordinates;
    console.log(`Coords: (${coords.left}, ${coords.top}) - (${coords.right}, ${coords.bottom})`);
  }
  console.log("---");
}
// Filter by element type
const titles = elements.filter((e) => e.elementType === "title");
for (const title of titles) {
  // `??` only substitutes the placeholder when the level is actually absent.
  const level = title.metadata.additional?.level ?? "unknown";
  console.log(`[${level}] ${title.text}`);
}
```

View File

@@ -0,0 +1,17 @@
```typescript title="TypeScript"
import { extractFile } from "@kreuzberg/node";
// Chunk the document and request embeddings using the "quality" preset.
const config = {
  chunking: { maxChars: 1000, embedding: { preset: "quality" } },
};
const result = await extractFile("document.pdf", null, config);
const chunks = result.chunks || [];
if (chunks.length > 0) {
  // Report the vector size of the first chunk (0 when it has no embedding).
  const dimensions = chunks[0].embedding?.length ?? 0;
  console.log(`Chunk embeddings: ${dimensions} dimensions`);
}
```

View File

@@ -0,0 +1,91 @@
import type { ChunkingConfig, EmbeddingConfig, ExtractionConfig } from "@kreuzberg/node";
// Example 1: Preset model (recommended)
// Selects a named preset (see the preset list below) rather than a model id.
const _embeddingConfig: EmbeddingConfig = {
  model: { type: "preset", name: "balanced" },
  batchSize: 32,
  normalize: true,
  showDownloadProgress: true,
  cacheDir: "~/.cache/kreuzberg/embeddings",
};
// Available presets:
// - "fast" (384 dims): Quick prototyping, development, resource-constrained
// - "balanced" (768 dims): Production, general-purpose RAG, English documents
// - "quality" (1024 dims): Complex documents, maximum accuracy
// - "multilingual" (768 dims): International documents, 100+ languages
// Example 2: Custom ONNX model (requires embeddings feature)
// Names a specific HuggingFace ONNX model and states its vector dimensions.
const _embeddingConfig2: EmbeddingConfig = {
  model: { type: "custom", modelId: "BAAI/bge-small-en-v1.5", dimensions: 384 },
  batchSize: 32,
  normalize: true,
  showDownloadProgress: true,
  // Leaving this undefined uses the default: .kreuzberg/embeddings/
  cacheDir: undefined,
};
// Popular ONNX-compatible models:
// - "BAAI/bge-small-en-v1.5" (384 dims): Fast, efficient
// - "BAAI/bge-base-en-v1.5" (768 dims): Balanced quality/speed
// - "BAAI/bge-large-en-v1.5" (1024 dims): High quality, slower
// - "sentence-transformers/paraphrase-multilingual-mpnet-base-v2" (768 dims): Multilingual support
// Example 3: Alternative Custom ONNX Model
// Same shape as Example 2, pointed at a different ONNX embedding model.
const _embeddingConfig3: EmbeddingConfig = {
  model: {
    type: "custom",
    modelId: "sentence-transformers/all-mpnet-base-v2",
    dimensions: 768,
  },
  // Larger model requires smaller batch size
  batchSize: 16,
  normalize: true,
  showDownloadProgress: true,
  cacheDir: "/var/cache/embeddings",
};
// Integration with ChunkingConfig
// Embedding settings live under the `embedding` key of the chunking config,
// which in turn is attached to the extraction config.
const chunkingWithEmbeddings: ChunkingConfig = {
  maxChars: 1024,
  maxOverlap: 100,
  preset: "balanced",
  embedding: {
    model: { type: "preset", name: "balanced" },
    batchSize: 32,
    normalize: true,
  },
};
const _extractionConfig: ExtractionConfig = { chunking: chunkingWithEmbeddings };
// Key parameter explanations:
//
// batchSize: Number of texts to embed at once (32-128 typical)
// - Larger batches are faster but use more memory
// - Smaller batches for resource-constrained environments
//
// normalize: Whether to normalize vectors (L2 norm)
// - true (recommended): Enables cosine similarity in vector DBs
// - false: Raw embedding values
//
// cacheDir: Where to store downloaded models
// - undefined: Uses .kreuzberg/embeddings/ in current directory
// - string: Custom directory for model storage
//
// showDownloadProgress: Display download progress bar
// - Useful for monitoring large model downloads

View File

@@ -0,0 +1,89 @@
import {
type ExtractionConfig,
extractFileSync,
type HierarchyConfig,
type PdfConfig,
} from "@kreuzberg/node";
// Example 1: Basic hierarchy extraction
// Enabled with default kClusters=6 for standard H1-H6 heading hierarchy.
// Extract bounding box information for spatial layout awareness.
const hierarchyConfigBasic: HierarchyConfig = {
  enabled: true,
  kClusters: 6, // Default: creates 6 font size clusters (H1-H6 structure)
  includeBbox: true, // Include bounding box coordinates
  ocrCoverageThreshold: undefined, // No OCR coverage threshold
};
const pdfConfigBasic: PdfConfig = {
  hierarchy: hierarchyConfigBasic,
};
const extractionConfigBasic: ExtractionConfig = {
  pdfOptions: pdfConfigBasic,
};
// Pass the config itself as the third positional argument (second left null).
// Wrapping it as `{ config: ... }` would nest the settings under an unknown key.
const _result = extractFileSync("document.pdf", null, extractionConfigBasic);
// Example 2: Custom kClusters for minimal structure
// Use 3 clusters for simpler hierarchy with minimal structure.
// Useful when you only need major section divisions (Main, Subsection, Detail).
const hierarchyConfigMinimal: HierarchyConfig = {
  enabled: true,
  kClusters: 3, // Minimal clustering: just 3 levels
  includeBbox: true,
  ocrCoverageThreshold: undefined,
};
const pdfConfigMinimal: PdfConfig = {
  hierarchy: hierarchyConfigMinimal,
};
const extractionConfigMinimal: ExtractionConfig = {
  pdfOptions: pdfConfigMinimal,
};
// Pass the config itself as the third positional argument (second left null).
// Wrapping it as `{ config: ... }` would nest the settings under an unknown key.
const _resultMinimal = extractFileSync("document.pdf", null, extractionConfigMinimal);
// Example 3: With OCR coverage threshold
// Trigger OCR when text coverage falls below 50%.
// Useful for documents with mixed digital and scanned content.
const hierarchyConfigOcr: HierarchyConfig = {
  enabled: true,
  kClusters: 6,
  includeBbox: true,
  ocrCoverageThreshold: 0.5, // Trigger OCR if text coverage < 50%
};
const pdfConfigOcr: PdfConfig = {
  hierarchy: hierarchyConfigOcr,
};
const extractionConfigOcr: ExtractionConfig = {
  pdfOptions: pdfConfigOcr,
};
// Pass the config itself as the third positional argument (second left null).
// Wrapping it as `{ config: ... }` would nest the settings under an unknown key.
const _resultOcr = extractFileSync("document.pdf", null, extractionConfigOcr);
// Field descriptions:
//
// enabled: boolean (default: true)
// - Enable or disable hierarchy extraction
// - When false, hierarchy structure is not analyzed
//
// kClusters: number (default: 6, valid: 1-7)
// - Number of font size clusters for hierarchy levels
// - 6 provides H1-H6 heading levels with body text
// - Higher values create more fine-grained hierarchy
// - Lower values create simpler structure
//
// includeBbox: boolean (default: true)
// - Include bounding box coordinates in hierarchy blocks
// - Required for spatial layout awareness and document structure
// - Set to false only if space optimization is critical
//
// ocrCoverageThreshold: number | undefined (default: undefined)
// - Range: 0.0 to 1.0
// - Triggers OCR when text block coverage falls below this fraction
// - Example: 0.5 means "run OCR if less than 50% of page has text data"
// - undefined means no OCR coverage-based triggering

View File

@@ -0,0 +1,12 @@
```typescript title="TypeScript"
import { extractFile } from "@kreuzberg/node";
// Request HTML output with the "github" theme and CSS embedded.
// The config goes in the third argument; the second is left null.
const result = await extractFile("document.pdf", null, {
  outputFormat: "html",
  htmlOutput: {
    theme: "github",
    embedCss: true,
  },
});
console.log(result.content); // HTML with kb-* classes
```

View File

@@ -0,0 +1,70 @@
import {
type ExtractionConfig,
extractFile,
type RakeParams,
type YakeParams,
} from "@kreuzberg/node";
// Example 1: Basic YAKE configuration
// YAKE with no algorithm-specific overrides (yakeParams/rakeParams are null)
// and the language set to "en".
async function basicYake(): Promise<void> {
  const { keywords } = await extractFile("document.pdf", null, {
    keywords: {
      algorithm: "yake",
      maxKeywords: 10,
      minScore: 0.0,
      ngramRange: [1, 3],
      language: "en",
      yakeParams: null,
      rakeParams: null,
    },
  });
  console.log("Keywords:", keywords);
}
// Example 2: Advanced YAKE with custom parameters
// Overrides the YAKE window size and narrows n-grams to 1-2 words.
async function _advancedYake(): Promise<void> {
  const yakeParams = { windowSize: 1 } as YakeParams;
  const tunedConfig: ExtractionConfig = {
    keywords: {
      algorithm: "yake",
      maxKeywords: 15,
      minScore: 0.1,
      ngramRange: [1, 2],
      language: "en",
      yakeParams,
      rakeParams: null,
    },
  };
  const { keywords } = await extractFile("document.pdf", null, tunedConfig);
  console.log("Keywords:", keywords);
}
// Example 3: RAKE configuration
// Switches the algorithm to RAKE and constrains word and phrase length.
async function _rakeConfig(): Promise<void> {
  const rakeParams = { minWordLength: 1, maxWordsPerPhrase: 3 } as RakeParams;
  const rakeConfig: ExtractionConfig = {
    keywords: {
      algorithm: "rake",
      maxKeywords: 10,
      minScore: 5.0,
      ngramRange: [1, 3],
      language: "en",
      yakeParams: null,
      rakeParams,
    },
  };
  const { keywords } = await extractFile("document.pdf", null, rakeConfig);
  console.log("Keywords:", keywords);
}
// Handle rejection explicitly rather than leaving a floating promise.
basicYake().catch((error: unknown) => {
  console.error("Keyword extraction failed:", error);
  process.exitCode = 1;
});

View File

@@ -0,0 +1,16 @@
```typescript title="TypeScript"
import { extractFile } from "@kreuzberg/node";
// YAKE keyword settings: at most 10 keywords, n-grams of 1-3, minimum score 0.3.
const keywords = {
  algorithm: "yake",
  maxKeywords: 10,
  minScore: 0.3,
  ngramRange: [1, 3],
  language: "en",
};
const result = await extractFile("document.pdf", null, { keywords });
console.log(`Content: ${result.content}`);
```

View File

@@ -0,0 +1,16 @@
```typescript title="TypeScript"
import { extractFile } from "@kreuzberg/node";
// Language detection with a 0.8 confidence floor and multi-detection off.
const languageDetection = { enabled: true, minConfidence: 0.8, detectMultiple: false };
const { detectedLanguages } = await extractFile("document.pdf", null, { languageDetection });
if (detectedLanguages) {
  console.log(`Detected languages: ${detectedLanguages.join(", ")}`);
}
```

View File

@@ -0,0 +1,17 @@
```typescript title="TypeScript"
import { extractFileSync } from "@kreuzberg/node";
// Tesseract OCR with image preprocessing targeting 300 DPI.
const preprocessing = { targetDpi: 300 };
const config = {
  ocr: { backend: "tesseract", tesseractConfig: { preprocessing } },
};
const { content } = extractFileSync("scanned.pdf", null, config);
console.log(`content length: ${content.length}`);
```

View File

@@ -0,0 +1,15 @@
```typescript title="TypeScript"
import { extractFile } from "@kreuzberg/node";
// PDF options: images, metadata, candidate passwords and hierarchy settings.
const hierarchy = { enabled: true, kClusters: 6, includeBbox: true };
const config = {
  pdfOptions: {
    extractImages: true,
    extractMetadata: true,
    passwords: ["password1", "password2"],
    hierarchy,
  },
};
const { content } = await extractFile("document.pdf", null, config);
console.log(content);
```

View File

@@ -0,0 +1,23 @@
```typescript title="TypeScript"
import { extractFile } from "@kreuzberg/node";
// PDF hierarchy extraction with an OCR coverage threshold of 0.8.
const hierarchy = {
  enabled: true,
  kClusters: 6,
  includeBbox: true,
  ocrCoverageThreshold: 0.8,
};
const config = { pdfOptions: { extractMetadata: true, hierarchy } };
const { pages } = await extractFile("document.pdf", null, config);
// `pages` may be absent; print a short preview of each page otherwise.
for (const page of pages ? pages : []) {
  console.log(`Page ${page.pageNumber}:`);
  console.log(` Content: ${page.content.substring(0, 100)}...`);
}
```

View File

@@ -0,0 +1,14 @@
```typescript title="TypeScript"
import { extractFile } from "@kreuzberg/node";
// Post-processing: name the processors to run and the ones to skip.
const postprocessor = {
  enabled: true,
  enabledProcessors: ["deduplication", "whitespace_normalization"],
  disabledProcessors: ["mojibake_fix"],
};
const { content } = await extractFile("document.pdf", null, { postprocessor });
console.log(content);
```

View File

@@ -0,0 +1,10 @@
```typescript title="TypeScript"
import { extractFile } from "@kreuzberg/node";
// Only the `enableQualityProcessing` option is set here.
const config = { enableQualityProcessing: true };
const { content } = await extractFile("document.pdf", null, config);
console.log(content);
```

View File

@@ -0,0 +1,18 @@
```typescript title="TypeScript"
import { extractFile } from "@kreuzberg/node";
// Tesseract tuning: psm 6, a character whitelist and table detection.
const tesseractConfig = {
  psm: 6,
  tesseditCharWhitelist: "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789 .,!?",
  enableTableDetection: true,
};
const config = {
  ocr: { backend: "tesseract", language: "eng+fra+deu", tesseractConfig },
};
const { content } = await extractFile("document.pdf", null, config);
console.log(content);
```

View File

@@ -0,0 +1,13 @@
```typescript title="TypeScript"
import { extractFile } from "@kreuzberg/node";
// Token reduction in "moderate" mode with `preserveImportantWords` on.
const tokenReduction = { mode: "moderate", preserveImportantWords: true };
const { content } = await extractFile("document.pdf", null, { tokenReduction });
console.log(content);
```

View File

@@ -0,0 +1,66 @@
```typescript title="usage.ts"
import { exec } from "child_process";
import { promisify } from "util";
import axios from "axios";
import { readFileSync } from "fs";
import { basename } from "path";
import FormData from "form-data";
const execAsync = promisify(exec);
/**
 * Small helper that runs a Kreuzberg container through the `docker` CLI and
 * sends documents to it over HTTP.
 */
class DockerKreuzbergClient {
  private containerName: string;
  private containerImage: string;
  private apiPort: number;

  /**
   * @param containerName - Name passed to `docker run --name`.
   * @param containerImage - Image reference passed to `docker run`.
   * @param apiPort - Host port published to the container's port 8000.
   */
  constructor(
    containerName: string = "kreuzberg-api",
    containerImage: string = "kreuzberg:latest",
    apiPort: number = 8000
  ) {
    this.containerName = containerName;
    this.containerImage = containerImage;
    this.apiPort = apiPort;
  }

  /**
   * Start the container detached. Rejects if the `docker run` command fails.
   */
  async startContainer(): Promise<void> {
    console.log("Starting Kreuzberg Docker container...");
    // NOTE: these values are interpolated into a shell command line, so they
    // must come from trusted configuration, never from user input.
    const cmd = `docker run -d --name ${this.containerName} -p ${this.apiPort}:8000 ${this.containerImage}`;
    await execAsync(cmd);
    console.log(`Container started on http://localhost:${this.apiPort}`);
  }

  /**
   * Upload one file and return its extracted text.
   *
   * Posts to `/extract` using the multipart field `files`; the response is
   * read as an array of results, one per uploaded file.
   *
   * @param filePath - Path of the document to upload.
   * @returns The `content` of the first (only) result.
   * @throws Error when the response holds no result for the file.
   */
  async extractFile(filePath: string): Promise<string> {
    const form = new FormData();
    form.append("files", readFileSync(filePath), basename(filePath));
    const response = await axios.post(`http://localhost:${this.apiPort}/extract`, form, {
      headers: form.getHeaders(),
    });
    const results: unknown = response.data;
    if (!Array.isArray(results) || results.length === 0) {
      throw new Error(`No extraction result returned for ${filePath}`);
    }
    return results[0].content;
  }

  /**
   * Stop and then remove the container. Rejects if either command fails.
   */
  async stopContainer(): Promise<void> {
    console.log("Stopping Kreuzberg Docker container...");
    await execAsync(`docker stop ${this.containerName}`);
    await execAsync(`docker rm ${this.containerName}`);
    console.log("Container stopped and removed");
  }
}
// Usage: start the container, extract one document, always attempt cleanup.
(async () => {
  const dockerClient = new DockerKreuzbergClient();
  try {
    await dockerClient.startContainer();
    // Give the server a moment to start listening before the first request.
    await new Promise((resolve) => setTimeout(resolve, 2000));
    const content = await dockerClient.extractFile("document.pdf");
    console.log(`Extracted content:\n${content}`);
  } finally {
    // A cleanup failure (e.g. the container never started) must not replace
    // the error that brought us here, so log it instead of rethrowing.
    try {
      await dockerClient.stopContainer();
    } catch (cleanupError) {
      console.error("Failed to stop container:", cleanupError);
    }
  }
})().catch((error: unknown) => {
  // Without this handler a failure would surface as an unhandled rejection.
  console.error("Docker extraction failed:", error);
  process.exitCode = 1;
});
```

View File

@@ -0,0 +1,13 @@
```typescript title="TypeScript"
import { extractFileSync } from "@kreuzberg/node";
// Synchronous extraction with `useCache` and `enableQualityProcessing` on.
const config = { useCache: true, enableQualityProcessing: true };
const { content, mimeType } = extractFileSync("document.pdf", null, config);
console.log(content);
console.log(`MIME Type: ${mimeType}`);
```

View File

@@ -0,0 +1,17 @@
```typescript title="TypeScript"
import { batchExtractBytesSync } from "@kreuzberg/node";
import { readFileSync } from "fs";
// The byte buffers and MIME types are passed as two index-aligned arrays.
const files = ["doc1.pdf", "doc2.docx"];
const mimeTypes = [
  "application/pdf",
  "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
];
const dataList = files.map((path) => readFileSync(path));
const results = batchExtractBytesSync(dataList, mimeTypes);
for (const [index, result] of results.entries()) {
  console.log(`Document ${index + 1}: ${result.content.length} characters`);
}
```

View File

@@ -0,0 +1,10 @@
```typescript title="TypeScript"
import { batchExtractFilesSync } from "@kreuzberg/node";
// Extract several files in one synchronous batch call.
const files = ["doc1.pdf", "doc2.docx", "doc3.pptx"];
const results = batchExtractFilesSync(files);
for (const [index, result] of results.entries()) {
  console.log(`File ${index + 1}: ${result.content.length} characters`);
}
```

View File

@@ -0,0 +1,12 @@
```typescript title="TypeScript"
// Multiple files: every selected file is appended under the same "files" field
const multipleFiles = new FormData();
Array.from(fileInput.files).forEach((file) => {
  multipleFiles.append("files", file);
});
const requestInit = { method: "POST", body: multipleFiles };
const response3 = await fetch("http://localhost:8000/extract", requestInit);
```

View File

@@ -0,0 +1,13 @@
```typescript title="TypeScript"
// Using fetch API
const selectedFile = fileInput.files?.[0];
if (!selectedFile) {
  // Appending `undefined` would upload the literal string "undefined".
  throw new Error("No file selected");
}
const formData = new FormData();
formData.append("files", selectedFile);
const response = await fetch("http://localhost:8000/extract", {
  method: "POST",
  body: formData,
});
// fetch() only rejects on network failure; HTTP error statuses still resolve,
// so check the status before trusting the body.
if (!response.ok) {
  throw new Error(`Extraction request failed: ${response.status} ${response.statusText}`);
}
const results = await response.json();
console.log(results[0].content);
```

View File

@@ -0,0 +1,17 @@
```typescript title="TypeScript"
// With configuration: the extraction config travels as a JSON string in the
// "config" form field next to the uploaded file.
const extractionOptions = {
  ocr: { language: "eng" },
  force_ocr: true,
};
const formDataWithConfig = new FormData();
formDataWithConfig.append("files", fileInput.files[0]);
formDataWithConfig.append("config", JSON.stringify(extractionOptions));
const response2 = await fetch("http://localhost:8000/extract", {
  method: "POST",
  body: formDataWithConfig,
});
```

View File

@@ -0,0 +1,36 @@
```typescript title="TypeScript"
import { extractFile } from "@kreuzberg/node";
// Several features in one config: quality processing, language detection,
// token reduction, chunking with embeddings, and keyword extraction.
const config = {
  enableQualityProcessing: true,
  languageDetection: { enabled: true, detectMultiple: true },
  tokenReduction: { mode: "moderate", preserveImportantWords: true },
  chunking: {
    maxChars: 512,
    maxOverlap: 50,
    embedding: { preset: "balanced" },
  },
  keywords: { algorithm: "yake", maxKeywords: 10 },
};
const { content, detectedLanguages, chunks } = await extractFile("document.pdf", null, config);
console.log(`Content length: ${content.length}`);
if (detectedLanguages) {
  console.log(`Languages: ${detectedLanguages.join(", ")}`);
}
// Only report chunks when at least one was produced.
if (chunks && chunks.length > 0) {
  console.log(`Chunks: ${chunks.length}`);
}
```

View File

@@ -0,0 +1,48 @@
```typescript title="TypeScript"
import { extractFile } from "@kreuzberg/node";
// Full configuration example, grouped by feature area.
const config = {
  // General switches
  useCache: true,
  enableQualityProcessing: true,
  forceOcr: false,
  // OCR
  ocr: {
    backend: "tesseract",
    language: "eng+fra",
    tesseractConfig: { psm: 3, enableTableDetection: true },
  },
  // PDF and image handling
  pdfOptions: { extractImages: true, extractMetadata: true },
  images: { extractImages: true, targetDpi: 150, maxImageDimension: 2048 },
  // Chunking and embeddings
  chunking: {
    maxChars: 1000,
    maxOverlap: 200,
    embedding: { preset: "balanced" },
  },
  // Text post-processing
  tokenReduction: { mode: "moderate", preserveImportantWords: true },
  languageDetection: { enabled: true, minConfidence: 0.8, detectMultiple: false },
  postprocessor: { enabled: true },
};
const { content } = await extractFile("document.pdf", null, config);
console.log(`Extracted content length: ${content.length}`);
```

View File

@@ -0,0 +1,8 @@
```typescript title="TypeScript"
import { extractBytes } from "@kreuzberg/node";
import { readFile } from "fs/promises";
// Read the file into memory, then extract from the bytes with an explicit MIME type.
const data = await readFile("document.pdf");
const { content } = await extractBytes(data, "application/pdf");
console.log(content);
```

View File

@@ -0,0 +1,8 @@
```typescript title="TypeScript"
import { extractBytesSync } from "@kreuzberg/node";
import { readFileSync } from "fs";
// Synchronous variant: read the bytes, then extract with an explicit MIME type.
const data = readFileSync("document.pdf");
const { content } = extractBytesSync(data, "application/pdf");
console.log(content);
```

View File

@@ -0,0 +1,9 @@
```typescript title="TypeScript"
import { extractFile } from "@kreuzberg/node";
// Extract with default settings and print text, table count and metadata.
const { content, tables, metadata } = await extractFile("document.pdf");
console.log(content);
console.log(`Tables: ${tables.length}`);
console.log(`Metadata: ${JSON.stringify(metadata)}`);
```

View File

@@ -0,0 +1,6 @@
```typescript title="TypeScript"
import { extractFile } from "@kreuzberg/node";
// Smallest async example: extract and print the text.
const { content } = await extractFile("document.pdf");
console.log(content);
```

View File

@@ -0,0 +1,9 @@
```typescript title="TypeScript"
import { extractFileSync } from "@kreuzberg/node";
// Synchronous extraction; print text, table count and metadata.
const { content, tables, metadata } = extractFileSync("document.pdf");
console.log(content);
console.log(`Tables: ${tables.length}`);
console.log(`Metadata: ${JSON.stringify(metadata)}`);
```

View File

@@ -0,0 +1,16 @@
```typescript title="TypeScript"
import { extractFileSync } from "@kreuzberg/node";
// Force OCR with the Tesseract backend and language "eng".
const ocr = { backend: "tesseract", language: "eng" };
const config = { forceOcr: true, ocr };
const { content, detectedLanguages } = extractFileSync("scanned.pdf", null, config);
console.log(content);
// Prints "none" when no languages were reported.
console.log(`Detected Languages: ${detectedLanguages?.join(", ") ?? "none"}`);
```

View File

@@ -0,0 +1,6 @@
```typescript title="TypeScript"
import { extractFileSync } from "@kreuzberg/node";
// Smallest sync example: extract and print the text.
const { content } = extractFileSync("document.pdf");
console.log(content);
```

View File

@@ -0,0 +1,9 @@
```typescript title="TypeScript"
import { getVersion, extractFileSync } from "@kreuzberg/node";
// Print the library version, then run one extraction as a smoke test.
console.log(`Kreuzberg version: ${getVersion()}`);
const { success } = extractFileSync("document.pdf");
console.log(`Extraction successful: ${success}`);
```

View File

@@ -0,0 +1,13 @@
```typescript title="TypeScript"
import { extractFileSync } from "@kreuzberg/node";
// Extract once and report the main result fields.
const { content, success, metadata } = extractFileSync("document.pdf");
console.log(`Content: ${content}`);
console.log(`Success: ${success}`);
console.log(`Content Length: ${content.length}`);
// The page count is printed only when the metadata carries a truthy value.
if (metadata.page_count) {
  console.log(`Pages: ${metadata.page_count}`);
}
```

View File

@@ -0,0 +1,25 @@
```typescript title="TypeScript"
import { extractFileSync } from "@kreuzberg/node";
// JSON Schema describing the object the LLM must return.
const schema = {
  type: "object",
  properties: {
    title: { type: "string" },
    authors: { type: "array", items: { type: "string" } },
    date: { type: "string" },
  },
  required: ["title", "authors", "date"],
  additionalProperties: false,
};
const config = {
  structuredExtraction: {
    schema,
    llm: { model: "openai/gpt-4o-mini" },
    strict: true,
  },
};
const { structuredOutput } = extractFileSync("paper.pdf", null, config);
console.log(structuredOutput);
```

View File

@@ -0,0 +1,16 @@
```typescript title="TypeScript"
import { extractFileSync } from "@kreuzberg/node";
// Force OCR through the "vlm" backend using the given model id.
const vlmConfig = { model: "openai/gpt-4o-mini" };
const config = {
  forceOcr: true,
  ocr: { backend: "vlm", vlmConfig },
};
const { content } = extractFileSync("scan.pdf", null, config);
console.log(content);
```

View File

@@ -0,0 +1,174 @@
```typescript title="TypeScript"
import { spawn } from "node:child_process";
import * as readline from "node:readline";
/**
 * MCP Client for Kreuzberg
 * Communicates with Kreuzberg MCP server via stdio, exchanging one JSON
 * message per line and matching responses to requests by `id`.
 * @example
 * const client = new KreuzbergMcpClient();
 * await client.connect();
 * const result = await client.callTool("extract_file", { path: "doc.pdf" });
 */
class KreuzbergMcpClient {
  private process: ReturnType<typeof spawn> | null = null;
  private rl: readline.Interface | null = null;
  private requestId: number = 0;
  // Requests awaiting a response, keyed by request id.
  private pendingRequests: Map<
    number,
    {
      resolve: (value: unknown) => void;
      reject: (error: Error) => void;
    }
  > = new Map();

  /**
   * Connect to MCP server
   *
   * Spawns `kreuzberg mcp`, wires up response handling, and resolves once a
   * first `tools/list` round trip has succeeded.
   * @throws Error when the process cannot be started or ends before answering.
   */
  async connect(): Promise<void> {
    const child = spawn("kreuzberg", ["mcp"]);
    this.process = child;
    const { stdin, stdout } = child;
    if (!stdin || !stdout) {
      this.disconnect();
      throw new Error("MCP server process has no stdio pipes");
    }

    // Only fail requests if this child is still the active connection.
    const failIfCurrent = (error: Error): void => {
      if (this.process === child) {
        this.rejectAllPending(error);
      }
    };
    // A failed spawn (e.g. binary not on PATH) arrives as an "error" event;
    // without a listener it would crash the process and strand callers.
    child.on("error", failIfCurrent);
    // "close" fires after the process has ended and its stdio has drained.
    child.on("close", () => failIfCurrent(new Error("MCP server process closed")));
    // Writing to a dead process surfaces as a stream error on stdin.
    stdin.on("error", failIfCurrent);

    this.rl = readline.createInterface({
      input: stdout,
      output: stdin,
      terminal: false,
    });

    // Handle incoming responses
    this.rl.on("line", (line) => {
      try {
        const response = JSON.parse(line) as {
          id: number;
          result?: unknown;
          error?: { message: string };
        };
        const pending = this.pendingRequests.get(response.id);
        if (pending) {
          this.pendingRequests.delete(response.id);
          if (response.error) {
            pending.reject(new Error(response.error.message));
          } else {
            pending.resolve(response.result);
          }
        }
      } catch (error) {
        console.error("Failed to parse response:", error);
      }
    });

    // Handle errors
    child.stderr?.on("data", (data) => {
      console.error("MCP server error:", data.toString());
    });

    // Wait for initialization
    await this.listTools();
  }

  /**
   * List available tools
   *
   * Accepts either a bare array or an object wrapping the list in a `tools`
   * property (the result shape defined by the MCP specification).
   * @throws Error when the response matches neither shape.
   */
  async listTools(): Promise<Array<{ name: string; description: string }>> {
    const result = await this.sendRequest("tools/list", {});
    if (Array.isArray(result)) {
      return result as Array<{ name: string; description: string }>;
    }
    const wrapped = (result as { tools?: unknown } | null | undefined)?.tools;
    if (Array.isArray(wrapped)) {
      return wrapped as Array<{ name: string; description: string }>;
    }
    throw new Error("Unexpected tools/list response shape");
  }

  /**
   * Call a tool on the server
   * @param toolName - Tool name sent as `params.name`.
   * @param args - Tool arguments sent as `params.arguments`.
   */
  async callTool(toolName: string, args: Record<string, unknown>): Promise<unknown> {
    return this.sendRequest("tools/call", {
      name: toolName,
      arguments: args,
    });
  }

  /**
   * Extract file from path
   * @param path - File path forwarded to the `extract_file` tool.
   * @param async - Forwarded as the tool's `async` argument.
   */
  async extractFile(path: string, async: boolean = false): Promise<Record<string, unknown>> {
    return this.callTool("extract_file", {
      path,
      async,
    }) as Promise<Record<string, unknown>>;
  }

  /**
   * Extract from bytes
   * @param data - Raw bytes; sent base64-encoded.
   * @param mimeType - MIME type forwarded to the `extract_bytes` tool.
   * @param async - Forwarded as the tool's `async` argument.
   */
  async extractBytes(
    data: Uint8Array,
    mimeType: string,
    async: boolean = false,
  ): Promise<Record<string, unknown>> {
    const base64 = Buffer.from(data).toString("base64");
    return this.callTool("extract_bytes", {
      data: base64,
      mimeType,
      async,
    }) as Promise<Record<string, unknown>>;
  }

  /**
   * Send request to server
   *
   * Rejects immediately when there is no writable connection, so callers
   * never wait on a request that could not be sent.
   */
  private sendRequest(method: string, params: Record<string, unknown>): Promise<unknown> {
    return new Promise((resolve, reject) => {
      const stdin = this.process?.stdin;
      if (!stdin || !stdin.writable) {
        reject(new Error("Not connected to MCP server; call connect() first"));
        return;
      }
      const id = ++this.requestId;
      this.pendingRequests.set(id, { resolve, reject });
      const request = {
        jsonrpc: "2.0",
        id,
        method,
        params,
      };
      stdin.write(JSON.stringify(request) + "\n", (error) => {
        // A failed write means no response will ever arrive for this id.
        if (error && this.pendingRequests.delete(id)) {
          reject(error);
        }
      });
    });
  }

  /** Reject every outstanding request with `error` and clear the table. */
  private rejectAllPending(error: Error): void {
    const waiting = [...this.pendingRequests.values()];
    this.pendingRequests.clear();
    for (const { reject } of waiting) {
      reject(error);
    }
  }

  /**
   * Disconnect from server
   *
   * Safe to call when not connected. Outstanding requests are rejected.
   */
  disconnect(): void {
    this.rl?.close();
    this.rl = null;
    this.process?.kill();
    this.process = null;
    this.rejectAllPending(new Error("Disconnected from MCP server"));
  }
}
// Usage example: connect, list the tool names, extract one file, disconnect.
async function main(): Promise<void> {
  const client = new KreuzbergMcpClient();
  try {
    await client.connect();
    console.log("Connected to Kreuzberg MCP server");

    const toolNames = (await client.listTools()).map((tool) => tool.name);
    console.log("Available tools:", toolNames);

    const extraction = await client.extractFile("document.pdf", true);
    console.log("Extraction result:", extraction);
  } catch (failure) {
    console.error("Error:", failure);
  } finally {
    // Runs on both success and failure.
    client.disconnect();
  }
}
// Run only when this file is the entry script, not when it is imported.
if (require.main === module) {
  main();
}
export { KreuzbergMcpClient };
```

View File

@@ -0,0 +1,35 @@
```typescript title="TypeScript"
import { spawn } from "child_process";
import * as readline from "readline";
// Launch the MCP server and exchange one request/response over stdio.
const mcpProcess = spawn("kreuzberg", ["mcp"]);
// Register the failure handler before any I/O so a spawn error is reported.
mcpProcess.on("error", (err) => {
  console.error("Failed to start MCP process:", err);
});
const rl = readline.createInterface({
  input: mcpProcess.stdout,
  output: mcpProcess.stdin,
  terminal: false,
});
// JSON-RPC 2.0 request. `jsonrpc` and `id` are required for a reply: a
// message without an `id` is a notification, which servers do not answer.
const request = {
  jsonrpc: "2.0",
  id: 1,
  method: "tools/call",
  params: {
    name: "extract_file",
    arguments: {
      path: "document.pdf",
      async: true,
    },
  },
};
// Attach the listener before writing so the response cannot be missed.
rl.on("line", (line) => {
  try {
    const response = JSON.parse(line);
    console.log(response);
  } catch (error) {
    console.error("Failed to parse response:", error);
  } finally {
    // One-shot script: stop the server after the first line of output.
    mcpProcess.kill();
  }
});
mcpProcess.stdin.write(JSON.stringify(request) + "\n");
```

View File

@@ -0,0 +1,17 @@
```typescript title="TypeScript"
import { spawn } from "child_process";
// Start the MCP server and mirror its output streams to the console.
const mcpProcess = spawn("kreuzberg", ["mcp"]);
const { stdout, stderr } = mcpProcess;
stdout.on("data", (chunk) => {
  console.log(`MCP Server: ${chunk}`);
});
stderr.on("data", (chunk) => {
  console.error(`MCP Error: ${chunk}`);
});
// Emitted when the process could not be spawned.
mcpProcess.on("error", (spawnError) => {
  console.error(`Failed to start MCP server: ${spawnError.message}`);
});
```

View File

@@ -0,0 +1,193 @@
```typescript title="TypeScript"
import {
extractFile,
extractFileSync,
extractBytes,
extractBytesSync,
type ExtractionResult,
type ExtractionConfig,
} from "@kreuzberg/node";
import * as readline from "node:readline";
/**
 * MCP Server for Kreuzberg
 * Exposes document extraction as MCP tools. Reads one JSON request per line
 * from stdin and writes one JSON response per line to stdout; diagnostics go
 * to stderr so they never mix with protocol output.
 * @example
 * const server = new KreuzbergMcpServer();
 * await server.start();
 */
class KreuzbergMcpServer {
  private config?: ExtractionConfig;
  private rl: readline.Interface;

  /**
   * @param config - Default extraction config, used when a tool call does
   *   not supply its own `config` argument.
   */
  constructor(config?: ExtractionConfig) {
    this.config = config;
    this.rl = readline.createInterface({
      input: process.stdin,
      output: process.stdout,
      terminal: false,
    });
  }

  /**
   * Start MCP server
   */
  async start(): Promise<void> {
    console.error("[MCP Server] Starting Kreuzberg MCP server");
    this.rl.on("line", (line) => {
      // processLine handles its own errors, so the promise never rejects.
      void this.processLine(line);
    });
  }

  /**
   * Parse one input line, dispatch it, and write exactly one response.
   * Failures are reported under the id of the request that caused them, so
   * the caller can match the error to its request.
   */
  private async processLine(line: string): Promise<void> {
    if (line.trim() === "") {
      return;
    }
    // Stays null when the line cannot be parsed far enough to read an id.
    let requestId: number | null = null;
    try {
      const request = JSON.parse(line) as {
        id: number;
        method: string;
        params: Record<string, unknown>;
      };
      requestId = request.id ?? null;
      const response = await this.handleRequest(request);
      this.send(response);
    } catch (error) {
      const errorMessage = error instanceof Error ? error.message : "Unknown error";
      this.send({
        id: requestId,
        error: { message: errorMessage },
      });
    }
  }

  /** Write one response line to stdout, tagged as JSON-RPC 2.0. */
  private send(message: Record<string, unknown>): void {
    process.stdout.write(JSON.stringify({ jsonrpc: "2.0", ...message }) + "\n");
  }

  /**
   * Handle incoming MCP request
   * @throws Error for an unknown method.
   */
  private async handleRequest(request: {
    id: number;
    method: string;
    params: Record<string, unknown>;
  }): Promise<Record<string, unknown>> {
    const { id, method } = request;
    // `params` may be omitted by the caller (e.g. for tools/list).
    const params = request.params ?? {};

    if (method === "tools/list") {
      return {
        id,
        result: this.listTools(),
      };
    }

    if (method === "tools/call") {
      const toolName = KreuzbergMcpServer.requireString(params, "name");
      const toolArgs = (params.arguments ?? {}) as Record<string, unknown>;
      const result = await this.callTool(toolName, toolArgs);
      return {
        id,
        result,
      };
    }

    throw new Error(`Unknown method: ${method}`);
  }

  /**
   * List available tools
   */
  private listTools(): Array<{
    name: string;
    description: string;
    inputSchema: Record<string, unknown>;
  }> {
    return [
      {
        name: "extract_file",
        description: "Extract content from a file by path",
        inputSchema: {
          type: "object",
          properties: {
            path: { type: "string", description: "Path to file" },
            async: { type: "boolean", description: "Use async extraction" },
            config: {
              type: "object",
              description: "Optional extraction config",
            },
          },
          required: ["path"],
        },
      },
      {
        name: "extract_bytes",
        description: "Extract content from raw bytes",
        inputSchema: {
          type: "object",
          properties: {
            data: { type: "string", description: "Base64-encoded data" },
            mimeType: { type: "string", description: "MIME type" },
            async: { type: "boolean", description: "Use async extraction" },
            config: {
              type: "object",
              description: "Optional extraction config",
            },
          },
          required: ["data", "mimeType"],
        },
      },
    ];
  }

  /** Read a required string argument, failing with a clear message. */
  private static requireString(args: Record<string, unknown>, key: string): string {
    const value = args[key];
    if (typeof value !== "string") {
      throw new Error(`Missing or invalid "${key}" argument`);
    }
    return value;
  }

  /**
   * Call tool
   *
   * `async` defaults to true when it is not given as a boolean; a per-call
   * `config` takes precedence over the server's default config.
   * @throws Error for an unknown tool or a missing required argument.
   */
  private async callTool(name: string, args: Record<string, unknown>): Promise<unknown> {
    const useAsync = typeof args.async === "boolean" ? args.async : true;
    const config = (args.config as ExtractionConfig) ?? this.config;

    if (name === "extract_file") {
      const path = KreuzbergMcpServer.requireString(args, "path");
      return useAsync ? extractFile(path, null, config) : extractFileSync(path, null, config);
    }

    if (name === "extract_bytes") {
      const data = Buffer.from(KreuzbergMcpServer.requireString(args, "data"), "base64");
      const mimeType = KreuzbergMcpServer.requireString(args, "mimeType");
      return useAsync
        ? extractBytes(data, mimeType, config)
        : extractBytesSync(data, mimeType, config);
    }

    throw new Error(`Unknown tool: ${name}`);
  }
}
/**
 * Main entry point: create a server whose default config uses the
 * Tesseract OCR backend, then start it.
 */
async function main(): Promise<void> {
  const defaultConfig: ExtractionConfig = {
    ocr: { enabled: true, backend: "tesseract" },
  };
  await new KreuzbergMcpServer(defaultConfig).start();
}
// Start server when invoked as MCP (first CLI argument is "mcp")
if (process.argv[2] === "mcp") {
  main().catch((startupError) => {
    console.error("Server error:", startupError);
    process.exit(1);
  });
}
export { KreuzbergMcpServer };
```

View File

@@ -0,0 +1,14 @@
```typescript title="TypeScript"
import { extractFile } from "@kreuzberg/node";
// Language detection: 0.9 confidence floor, multiple languages allowed.
const languageDetection = { enabled: true, minConfidence: 0.9, detectMultiple: true };
const { content } = await extractFile("document.pdf", null, { languageDetection });
console.log(content);
```

View File

@@ -0,0 +1,16 @@
```typescript title="TypeScript"
import { extractFile } from "@kreuzberg/node";
// Detect multiple languages, with a 0.8 confidence floor.
const languageDetection = { enabled: true, minConfidence: 0.8, detectMultiple: true };
const { detectedLanguages } = await extractFile("multilingual_document.pdf", null, {
  languageDetection,
});
if (detectedLanguages) {
  console.log(`Detected languages: ${detectedLanguages.join(", ")}`);
}
```

View File

@@ -0,0 +1,83 @@
```typescript title="TypeScript"
import { extractFileSync } from "@kreuzberg/node";
// PDF: dump the full metadata object, then the page count when it is set.
const pdfResult = extractFileSync("document.pdf");
console.log(`Metadata: ${JSON.stringify(pdfResult.metadata)}`);
const { pageCount } = pdfResult.metadata;
if (pageCount) {
  console.log(`Pages: ${pageCount}`);
}

// HTML: dump the full metadata object, then walk the individual fields.
const { metadata: htmlMeta } = extractFileSync("page.html");
console.log(`HTML Metadata: ${JSON.stringify(htmlMeta)}`);

if (htmlMeta.title) {
  console.log(`Title: ${htmlMeta.title}`);
}

// Keywords are exposed as an array.
if (htmlMeta.keywords?.length) {
  console.log(`Keywords: ${htmlMeta.keywords.join(", ")}`);
}

// Canonical URL (renamed from canonical).
if (htmlMeta.canonicalUrl) {
  console.log(`Canonical URL: ${htmlMeta.canonicalUrl}`);
}

// Open Graph fields live in a map keyed by field name.
const { openGraph, twitterCard } = htmlMeta;
if (openGraph) {
  const openGraphFields = [
    ["image", "Open Graph Image"],
    ["title", "Open Graph Title"],
    ["type", "Open Graph Type"],
  ] as const;
  for (const [key, label] of openGraphFields) {
    const value = openGraph[key];
    if (value) {
      console.log(`${label}: ${value}`);
    }
  }
}

// Twitter Card fields live in a map as well.
if (twitterCard) {
  const twitterFields = [
    ["card", "Twitter Card Type"],
    ["creator", "Twitter Creator"],
  ] as const;
  for (const [key, label] of twitterFields) {
    const value = twitterCard[key];
    if (value) {
      console.log(`${label}: ${value}`);
    }
  }
}

// Newer fields.
if (htmlMeta.language) {
  console.log(`Language: ${htmlMeta.language}`);
}
if (htmlMeta.textDirection) {
  console.log(`Text Direction: ${htmlMeta.textDirection}`);
}

// Headers: print their text on a single line.
if (htmlMeta.headers?.length) {
  const headerTexts = htmlMeta.headers.map((header) => header.text);
  console.log(`Headers: ${headerTexts.join(", ")}`);
}

// Links: one line per link.
for (const link of htmlMeta.links ?? []) {
  console.log(`Link: ${link.href} (${link.text})`);
}

// Images: one line per image source.
for (const image of htmlMeta.images ?? []) {
  console.log(`Image: ${image.src}`);
}

// Structured data: only the item count is reported.
if (htmlMeta.structuredData?.length) {
  console.log(`Structured data items: ${htmlMeta.structuredData.length}`);
}
```

View File

@@ -0,0 +1,18 @@
import { extractFileSync } from '@kreuzberg/node';

const result = extractFileSync('document.pdf');

// Boundaries are applied as byte offsets into the UTF-8 encoding of the content,
// so the text is encoded first and sliced as bytes rather than as a string.
const boundaries = result.metadata.page_structure?.boundaries;
if (boundaries) {
  const contentBytes = new TextEncoder().encode(result.content);
  const decoder = new TextDecoder();

  // Preview only the first three pages.
  for (const boundary of boundaries.slice(0, 3)) {
    const pageBytes = contentBytes.slice(boundary.byteStart, boundary.byteEnd);
    const pageText = decoder.decode(pageBytes);

    console.log(`Page ${boundary.pageNumber}:`);
    console.log(` Byte range: ${boundary.byteStart}-${boundary.byteEnd}`);
    console.log(` Preview: ${pageText.substring(0, 100)}...`);
  }
}

View File

@@ -0,0 +1,12 @@
import { extractFileSync } from '@kreuzberg/node';

// Request per-page extraction so that `result.pages` is available.
const result = extractFileSync('document.pdf', null, { pages: { extractPages: true } });

if (result.pages) {
  // Print a short summary for every extracted page.
  for (const page of result.pages) {
    console.log(`Page ${page.pageNumber}:`);
    console.log(` Content: ${page.content.length} chars`);
    console.log(` Tables: ${page.tables.length}`);
    console.log(` Images: ${page.images.length}`);
  }
}

View File

@@ -0,0 +1,22 @@
```typescript title="TypeScript"
import { extractFileSync } from "@kreuzberg/node";
// Print every extracted table: row count, markdown rendering (if any), then raw rows.
const { tables } = extractFileSync("document.pdf");

for (const table of tables ?? []) {
  const rows = table.cells ?? [];
  console.log(`Table with ${rows.length} rows`);

  if (table.markdown) {
    console.log(table.markdown);
  }

  for (const row of rows) {
    console.log(row);
  }
}
```

View File

@@ -0,0 +1,49 @@
```typescript title="TypeScript"
import { extractFile } from "@kreuzberg/node";
/** One embedded chunk, shaped for insertion into a vector store. */
interface VectorRecord {
  /** Record identifier; built below as `${documentId}_chunk_${index}`. */
  id: string;
  /** Text content of the chunk. */
  content: string;
  /** Embedding vector taken from the chunk. */
  embedding: number[];
  /** String-valued attributes (document id, chunk index, content length). */
  metadata: Record<string, string>;
}
/**
 * Extract a document with chunking and embeddings enabled, then convert every
 * chunk that carries an embedding into a `VectorRecord`.
 *
 * @param documentPath - Path of the document to extract.
 * @param documentId - Identifier used as the prefix of each record id and
 *   stored in each record's metadata.
 * @returns One record per embedded chunk; chunks without an embedding are skipped.
 */
async function extractAndVectorize(
  documentPath: string,
  documentId: string,
): Promise<VectorRecord[]> {
  const config = {
    chunking: {
      // camelCase keys, matching `batchSize` below and the other chunking examples.
      maxChars: 512,
      maxOverlap: 50,
      embedding: {
        model: { type: "preset", name: "balanced" },
        normalize: true,
        batchSize: 32,
      },
    },
  };

  const result = await extractFile(documentPath, null, config);
  const records: VectorRecord[] = [];

  // Iterate with the original index so ids stay stable even when a chunk is skipped.
  (result.chunks ?? []).forEach((chunk, index) => {
    if (!chunk.embedding) {
      return;
    }
    records.push({
      id: `${documentId}_chunk_${index}`,
      content: chunk.content,
      embedding: chunk.embedding,
      metadata: {
        document_id: documentId,
        chunk_index: index.toString(),
        content_length: chunk.content.length.toString(),
      },
    });
  });

  return records;
}
```

View File

@@ -0,0 +1,29 @@
```typescript title="TypeScript"
import { registerOcrBackend, extractFileSync } from "@kreuzberg/node";
const supportedLangs = ["eng", "deu", "fra"];

// Minimal custom OCR backend that would delegate to a cloud OCR service.
// NOTE(review): these member names are snake_case while the config keys in the
// other examples are camelCase — confirm against the backend interface type.
const cloudBackend = {
  name: () => "cloud-ocr",
  version: () => "1.0.0",
  initialize: () => {},
  shutdown: () => {},
  process_image: async (imageBytes: Uint8Array, config: { language?: string }) => {
    // Call your cloud OCR API with imageBytes and config.language.
    return { content: "Extracted text", mime_type: "text/plain" };
  },
  supports_language: (lang: string) => supportedLangs.includes(lang),
  backend_type: () => "Custom",
  supported_languages: () => supportedLangs,
};

registerOcrBackend(cloudBackend);

// Select the registered backend by the name it reports. The config goes in the
// third argument, with `null` in the second slot, as in the other examples.
const result = extractFileSync("scanned.pdf", null, {
  ocr: {
    backend: "cloud-ocr",
    language: "eng",
  },
});
console.log(result.content);
```

View File

@@ -0,0 +1,16 @@
```typescript title="TypeScript"
import { extractFileSync } from "@kreuzberg/node";
// Image extraction settings.
const config = {
  images: {
    extractImages: true,
    targetDpi: 200,
    maxImageDimension: 2048,
    injectPlaceholders: true, // set to false to extract images without markdown references
    autoAdjustDpi: true,
  },
};

// The config goes in the third argument, with `null` in the second slot,
// as in the other examples.
const result = extractFileSync("document.pdf", null, config);
console.log(`content length: ${result.content.length}`);
```

View File

@@ -0,0 +1,22 @@
```typescript title="TypeScript"
import { extractFileSync } from "@kreuzberg/node";
// Tesseract OCR with image preprocessing options.
const config = {
  ocr: {
    backend: "tesseract",
    language: "eng",
    tesseractConfig: {
      preprocessing: {
        targetDpi: 300,
        denoise: true,
        deskew: true,
        contrastEnhance: true,
        binarizationMethod: "otsu",
      },
    },
  },
};

// The config goes in the third argument, with `null` in the second slot,
// as in the other examples.
const result = extractFileSync("document.pdf", null, config);
console.log(`content length: ${result.content.length}`);
```

View File

@@ -0,0 +1,15 @@
```typescript title="TypeScript"
import { extractFileSync } from "@kreuzberg/node";
// Tesseract OCR combined with PDF image extraction.
const scanConfig = {
  ocr: { backend: "tesseract" },
  pdfOptions: { extractImages: true },
};

const { content } = extractFileSync("scanned.pdf", null, scanConfig);
console.log(content);
```

View File

@@ -0,0 +1 @@
EasyOCR is only available in Python.

View File

@@ -0,0 +1,24 @@
```typescript title="TypeScript"
import { extractFileSync } from "@kreuzberg/node";
// Run PaddleOCR and print each recognised element followed by a blank line.
const paddleConfig = {
  ocr: { backend: "paddle-ocr", language: "en" },
};

const { ocrElements } = extractFileSync("scanned.pdf", null, paddleConfig);

for (const element of ocrElements ?? []) {
  const { text, confidence, geometry, rotation } = element;

  console.log(`Text: ${text}`);
  console.log(`Confidence: ${confidence.recognition.toFixed(2)}`);
  console.log(`Geometry:`, geometry);
  // Rotation is only printed when the element carries one.
  if (rotation) {
    console.log(`Rotation: ${rotation.angle}°`);
  }
  console.log();
}
```

View File

@@ -0,0 +1,13 @@
```typescript title="TypeScript"
import { extractFileSync } from "@kreuzberg/node";
// Tesseract OCR with the language set to "eng".
const tesseractConfig = {
  ocr: { backend: "tesseract", language: "eng" },
};

const { content } = extractFileSync("scanned.pdf", null, tesseractConfig);
console.log(content);
```

View File

@@ -0,0 +1,13 @@
```typescript title="TypeScript"
import { extractFileSync } from "@kreuzberg/node";
// Tesseract backend with the `forceOcr` flag switched on.
const forcedOcrConfig = {
  ocr: { backend: "tesseract" },
  forceOcr: true,
};

const { content } = extractFileSync("document.pdf", null, forcedOcrConfig);
console.log(content);
```

View File

@@ -0,0 +1,13 @@
```typescript title="TypeScript"
import { extractFileSync } from "@kreuzberg/node";
// Tesseract with several languages joined by "+" in a single language string.
const multiLangConfig = {
  ocr: { backend: "tesseract", language: "eng+deu+fra" },
};

const { content } = extractFileSync("multilingual.pdf", null, multiLangConfig);
console.log(content);
```

View File

@@ -0,0 +1,14 @@
```typescript title="TypeScript"
import { extractFileSync } from "@kreuzberg/node";
// PaddleOCR with the language set to "en".
const paddleConfig = {
  ocr: {
    backend: "paddle-ocr",
    language: "en",
    // modelTier: 'server', // for max accuracy
  },
};

const { content } = extractFileSync("scanned.pdf", null, paddleConfig);
console.log(content);
```

Some files were not shown because too many files have changed in this diff Show More