This commit is contained in:
16
docs/snippets/typescript/advanced/chunk_page_mapping.md
Normal file
16
docs/snippets/typescript/advanced/chunk_page_mapping.md
Normal file
@@ -0,0 +1,16 @@
|
||||
import { extractFileSync } from '@kreuzberg/node';

// Request chunking together with page extraction; the loop below reads the
// per-chunk page range from chunk.metadata.
const result = extractFileSync('document.pdf', null, { chunking: { maxChars: 500, maxOverlap: 50 }, pages: { extractPages: true } });

if (result.chunks) {
  for (const chunk of result.chunks) {
    // Chunks without a (truthy) first page are not reported.
    if (chunk.metadata.firstPage) {
      // Single-page chunks print "Page N"; multi-page chunks print "Pages N-M".
      const pageRange = chunk.metadata.firstPage === chunk.metadata.lastPage
        ? `Page ${chunk.metadata.firstPage}`
        : `Pages ${chunk.metadata.firstPage}-${chunk.metadata.lastPage}`;

      console.log(`Chunk: ${chunk.content.substring(0, 50)}... (${pageRange})`);
    }
  }
}
|
||||
47
docs/snippets/typescript/advanced/chunking_config.md
Normal file
47
docs/snippets/typescript/advanced/chunking_config.md
Normal file
@@ -0,0 +1,47 @@
|
||||
```typescript title="TypeScript"
|
||||
import { extractFile } from "@kreuzberg/node";
|
||||
|
||||
const config = {
|
||||
chunking: {
|
||||
maxChars: 1000,
|
||||
maxOverlap: 200,
|
||||
},
|
||||
};
|
||||
|
||||
const result = await extractFile("document.pdf", null, config);
|
||||
console.log(`Total chunks: ${result.chunks?.length ?? 0}`);
|
||||
```
|
||||
|
||||
```typescript title="TypeScript - Semantic"
|
||||
import { extractFile } from "@kreuzberg/node";
|
||||
|
||||
const config = {
|
||||
chunking: {
|
||||
chunkerType: "semantic",
|
||||
},
|
||||
};
|
||||
|
||||
const result = await extractFile("document.pdf", null, config);
|
||||
for (const chunk of result.chunks ?? []) {
|
||||
console.log(`Content: ${chunk.content.slice(0, 100)}...`);
|
||||
}
|
||||
```
|
||||
|
||||
```typescript title="TypeScript - Prepend Heading Context"
|
||||
import { extractFile } from "@kreuzberg/node";
|
||||
|
||||
const config = {
|
||||
chunking: {
|
||||
chunkerType: "markdown",
|
||||
maxChars: 500,
|
||||
maxOverlap: 50,
|
||||
prependHeadingContext: true,
|
||||
},
|
||||
};
|
||||
|
||||
const result = await extractFile("document.md", null, config);
|
||||
for (const chunk of result.chunks ?? []) {
|
||||
// Each chunk's content is prefixed with its heading breadcrumb
|
||||
console.log(`Content: ${chunk.content.slice(0, 100)}...`);
|
||||
}
|
||||
```
|
||||
25
docs/snippets/typescript/advanced/chunking_rag.md
Normal file
25
docs/snippets/typescript/advanced/chunking_rag.md
Normal file
@@ -0,0 +1,25 @@
|
||||
```typescript title="TypeScript"
|
||||
import { extractFile } from "@kreuzberg/node";
|
||||
|
||||
const config = {
|
||||
chunking: {
|
||||
maxChars: 500,
|
||||
maxOverlap: 50,
|
||||
embedding: {
|
||||
model: { type: "preset", name: "balanced" },
|
||||
normalize: true,
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
const result = await extractFile("research_paper.pdf", null, config);
|
||||
|
||||
for (const chunk of result.chunks ?? []) {
|
||||
console.log(`Chunk ${chunk.metadata.chunkIndex + 1}/${chunk.metadata.totalChunks}`);
|
||||
console.log(`Position: ${chunk.metadata.byteStart}-${chunk.metadata.byteEnd}`);
|
||||
console.log(`Content: ${chunk.content.slice(0, 100)}...`);
|
||||
if (chunk.embedding) {
|
||||
console.log(`Embedding: ${chunk.embedding.length} dimensions`);
|
||||
}
|
||||
}
|
||||
```
|
||||
18
docs/snippets/typescript/advanced/custom_cache.md
Normal file
18
docs/snippets/typescript/advanced/custom_cache.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```typescript title="TypeScript"
|
||||
import { extractFile, type ExtractionConfig, type ExtractionResult } from "@kreuzberg/node";
|
||||
|
||||
/**
|
||||
* Note: Custom cache backends are not supported in TypeScript v4.0.
|
||||
* Caching is handled internally by the Rust core.
|
||||
*
|
||||
* This example demonstrates the config structure.
|
||||
* To enable caching, use the useCache flag.
|
||||
*/
|
||||
|
||||
// Usage with built-in cache
|
||||
const config: ExtractionConfig = {
|
||||
useCache: true, // Enable internal Rust cache
|
||||
};
|
||||
|
||||
const result = await extractFile("document.pdf", null, config);
|
||||
```
|
||||
19
docs/snippets/typescript/advanced/custom_extractor.md
Normal file
19
docs/snippets/typescript/advanced/custom_extractor.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```typescript title="TypeScript"
|
||||
import { listDocumentExtractors, unregisterDocumentExtractor } from "@kreuzberg/node";
|
||||
|
||||
/**
|
||||
* Note: Custom document extractors are not directly supported in TypeScript v4.0.
|
||||
* Document extraction logic lives in the Rust core.
|
||||
*
|
||||
* You can list and unregister built-in extractors, but cannot add custom ones
|
||||
* from TypeScript. For custom extractors, implement them in Rust.
|
||||
*/
|
||||
|
||||
// List all registered document extractors
|
||||
const extractors = listDocumentExtractors();
|
||||
console.log("Available extractors:", extractors);
|
||||
// Example output: ['PDFExtractor', 'ImageExtractor', 'OfficeExtractor', ...]
|
||||
|
||||
// Unregister a built-in extractor (use with caution)
|
||||
unregisterDocumentExtractor("SomeExtractor");
|
||||
```
|
||||
78
docs/snippets/typescript/advanced/custom_ocr_backend.md
Normal file
78
docs/snippets/typescript/advanced/custom_ocr_backend.md
Normal file
@@ -0,0 +1,78 @@
|
||||
```typescript title="TypeScript"
|
||||
import { registerOcrBackend, type OcrBackendProtocol } from "@kreuzberg/node";
|
||||
|
||||
/**
 * Custom OCR backend implementation.
 * Forwards images to an HTTP OCR service so a custom engine can be plugged in.
 *
 * @example
 * const backend = new CustomOcrBackend("http://localhost:8000");
 * await backend.initialize();
 * registerOcrBackend(backend);
 */
class CustomOcrBackend implements OcrBackendProtocol {
  private apiUrl: string;

  /**
   * @param apiUrl Base URL of the OCR service; images are POSTed to `${apiUrl}/ocr`.
   */
  constructor(apiUrl: string) {
    this.apiUrl = apiUrl;
  }

  /** Identifier reported for this backend. */
  name(): string {
    return "custom-ocr-backend";
  }

  /** Language codes this backend advertises. */
  supportedLanguages(): string[] {
    return ["en", "de", "fr", "es"];
  }

  /** Lifecycle hook; this example only logs the configured service URL. */
  async initialize(): Promise<void> {
    console.log(`Initializing custom OCR backend at ${this.apiUrl}`);
  }

  /** Lifecycle hook; this example only logs. */
  async shutdown(): Promise<void> {
    console.log("Shutting down custom OCR backend");
  }

  /**
   * Process image and extract text via OCR.
   *
   * @param imageData Raw image bytes, or a base64-encoded string of them.
   * @param language Language code forwarded to the service as a form field.
   * @returns Recognised text plus the confidence/language metadata and any tables.
   * @throws Error when the service answers with a non-2xx status or with a
   *   payload that has no string `text` field.
   */
  async processImage(
    imageData: Uint8Array | string,
    language: string,
  ): Promise<{
    content: string;
    mime_type: string;
    metadata: Record<string, unknown>;
    tables: unknown[];
  }> {
    // Strings are treated as base64; byte arrays are copied as-is.
    const buffer =
      typeof imageData === "string" ? Buffer.from(imageData, "base64") : Buffer.from(imageData);

    const formData = new FormData();
    const blob = new Blob([buffer], { type: "image/png" });
    formData.append("image", blob);
    formData.append("language", language);

    const response = await fetch(`${this.apiUrl}/ocr`, {
      method: "POST",
      body: formData,
    });

    if (!response.ok) {
      throw new Error(`OCR service failed: ${response.statusText}`);
    }

    const result = await response.json();

    // The payload is untyped JSON: without this guard a malformed response
    // would be returned with `content: undefined`, violating the return type.
    if (typeof result?.text !== "string") {
      throw new Error("OCR service returned a response without a text field");
    }

    return {
      content: result.text,
      mime_type: "text/plain",
      metadata: { confidence: result.confidence, language },
      // Anything that is not an array (missing, null, wrong type) becomes [].
      tables: Array.isArray(result.tables) ? result.tables : [],
    };
  }
}
|
||||
|
||||
// Register custom OCR backend
|
||||
const backend = new CustomOcrBackend("http://localhost:8000");
|
||||
await backend.initialize();
|
||||
registerOcrBackend(backend);
|
||||
```
|
||||
63
docs/snippets/typescript/advanced/custom_postprocessor.md
Normal file
63
docs/snippets/typescript/advanced/custom_postprocessor.md
Normal file
@@ -0,0 +1,63 @@
|
||||
```typescript title="TypeScript"
|
||||
import {
|
||||
registerPostProcessor,
|
||||
unregisterPostProcessor,
|
||||
type PostProcessorProtocol,
|
||||
type ExtractionResult,
|
||||
} from "@kreuzberg/node";
|
||||
|
||||
/**
 * Custom post-processor for cleaning extraction results.
 * Normalizes whitespace line by line and rewrites a few common OCR artifacts.
 *
 * @example
 * const processor = new CleaningPostProcessor();
 * registerPostProcessor(processor);
 */
class CleaningPostProcessor implements PostProcessorProtocol {
  /** Identifier reported for this post-processor. */
  name(): string {
    return "cleaning-postprocessor";
  }

  /** Stage reported for this post-processor. */
  processingStage(): "early" | "middle" | "late" {
    return "middle";
  }

  /**
   * Process extraction result for cleanup.
   *
   * @param result Extraction result to clean; it is not mutated.
   * @returns A shallow copy of `result` whose `content` has been cleaned.
   */
  process(result: ExtractionResult): ExtractionResult {
    return {
      ...result,
      content: this.cleanContent(result.content),
    };
  }

  /**
   * Remove artifacts and normalize whitespace.
   *
   * @param content Raw extracted text.
   * @returns Text with horizontal whitespace collapsed, every line trimmed and
   *   blank lines dropped.
   */
  private cleanContent(content: string): string {
    // Collapse runs of whitespace *within* a line only. Matching `\s+` here
    // would also swallow "\n", flattening the text to one line and turning the
    // per-line cleanup below into a no-op.
    let cleaned = content.replace(/[^\S\n]+/g, " ");

    // Rewrite common OCR artifacts.
    // NOTE(review): this replaces every "|", including legitimate ones (e.g.
    // Markdown table separators) — confirm that is acceptable for your input.
    cleaned = cleaned.replace(/\|/g, "l");
    cleaned = cleaned.replace(/0O/g, "00");

    // Trim each line and drop the ones that end up empty. The joined result
    // cannot start or end with whitespace, so no final trim is needed.
    return cleaned
      .split("\n")
      .map((line) => line.trim())
      .filter((line) => line.length > 0)
      .join("\n");
  }
}
|
||||
|
||||
// Register the post-processor
|
||||
const processor = new CleaningPostProcessor();
|
||||
registerPostProcessor(processor);
|
||||
|
||||
// Later, unregister if needed
|
||||
// unregisterPostProcessor("cleaning-postprocessor");
|
||||
```
|
||||
86
docs/snippets/typescript/advanced/custom_validator.md
Normal file
86
docs/snippets/typescript/advanced/custom_validator.md
Normal file
@@ -0,0 +1,86 @@
|
||||
```typescript title="TypeScript"
|
||||
import {
|
||||
registerValidator,
|
||||
unregisterValidator,
|
||||
extractFile,
|
||||
type ValidatorProtocol,
|
||||
type ExtractionResult,
|
||||
ValidationError,
|
||||
} from "@kreuzberg/node";
|
||||
|
||||
/**
 * Custom validator for quality checking.
 * Rejects extraction results whose content is blank or shorter than 50
 * characters, or that carry no metadata.
 *
 * @example
 * const validator = new QualityValidator();
 * registerValidator(validator);
 */
class QualityValidator implements ValidatorProtocol {
  /** Identifier reported for this validator. */
  name(): string {
    return "quality-validator";
  }

  /** Priority value reported for this validator. */
  priority(): number {
    return 10;
  }

  /**
   * Validate extraction result meets quality standards.
   *
   * @param result Extraction result to inspect.
   * @throws ValidationError on the first failed check.
   */
  validate(result: ExtractionResult): void {
    // Blank content is checked first: with the length check in front, empty
    // or short blank content was always reported as "too short" and never
    // reached the dedicated empty-content error.
    this.checkEmptyContent(result);
    this.checkMinimumLength(result);
    this.checkMetadata(result);
  }

  /**
   * Ensure minimum content length.
   *
   * @throws ValidationError when `content.length` is below 50.
   */
  private checkMinimumLength(result: ExtractionResult): void {
    const minLength = 50;
    if (result.content.length < minLength) {
      // String.length counts characters (UTF-16 code units), not bytes.
      throw new ValidationError(
        `Content too short: ${result.content.length} characters (minimum ${minLength})`,
      );
    }
  }

  /**
   * Ensure content is not empty.
   *
   * @throws ValidationError when the content is empty or whitespace-only.
   */
  private checkEmptyContent(result: ExtractionResult): void {
    const trimmed = result.content.trim();
    if (trimmed.length === 0) {
      throw new ValidationError("Extracted content is empty");
    }
  }

  /**
   * Validate metadata is present.
   *
   * @throws ValidationError when `metadata` is missing or has no keys.
   */
  private checkMetadata(result: ExtractionResult): void {
    if (!result.metadata || Object.keys(result.metadata).length === 0) {
      throw new ValidationError("Missing extraction metadata");
    }
  }
}
|
||||
|
||||
// Register the validator
|
||||
const validator = new QualityValidator();
|
||||
registerValidator(validator);
|
||||
|
||||
// Usage with error handling (must use async extraction for custom validators)
|
||||
try {
|
||||
const result = await extractFile("document.pdf");
|
||||
console.log(`Validated content length: ${result.content.length} characters`);
|
||||
} catch (error) {
|
||||
if (error instanceof ValidationError) {
|
||||
console.error(`Validation failed: ${error.message}`);
|
||||
} else {
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
// Later, unregister if needed
|
||||
// unregisterValidator("quality-validator");
|
||||
```
|
||||
14
docs/snippets/typescript/advanced/embedding_with_chunking.md
Normal file
14
docs/snippets/typescript/advanced/embedding_with_chunking.md
Normal file
@@ -0,0 +1,14 @@
|
||||
```typescript title="TypeScript"
|
||||
const config = {
|
||||
chunking: {
|
||||
maxChars: 1024,
|
||||
maxOverlap: 100,
|
||||
embedding: {
|
||||
model: { type: "preset", name: "balanced" },
|
||||
normalize: true,
|
||||
batchSize: 32,
|
||||
showDownloadProgress: false,
|
||||
},
|
||||
},
|
||||
};
|
||||
```
|
||||
@@ -0,0 +1,11 @@
|
||||
```typescript title="TypeScript"
|
||||
const config = {
|
||||
keywords: {
|
||||
algorithm: "yake",
|
||||
maxKeywords: 10,
|
||||
minScore: 0.3,
|
||||
ngramRange: [1, 3],
|
||||
language: "en",
|
||||
},
|
||||
};
|
||||
```
|
||||
@@ -0,0 +1,17 @@
|
||||
```typescript title="TypeScript"
|
||||
import { extractFile } from "@kreuzberg/node";
|
||||
|
||||
const config = {
|
||||
keywords: {
|
||||
algorithm: "yake",
|
||||
maxKeywords: 10,
|
||||
minScore: 0.3,
|
||||
},
|
||||
};
|
||||
|
||||
const result = await extractFile("research_paper.pdf", null, config);
|
||||
|
||||
for (const keyword of result.extractedKeywords ?? []) {
|
||||
console.log(`${keyword.text}: ${keyword.score.toFixed(3)}`);
|
||||
}
|
||||
```
|
||||
@@ -0,0 +1,9 @@
|
||||
```typescript title="TypeScript"
|
||||
const config = {
|
||||
languageDetection: {
|
||||
enabled: true,
|
||||
minConfidence: 0.8,
|
||||
detectMultiple: false,
|
||||
},
|
||||
};
|
||||
```
|
||||
@@ -0,0 +1,15 @@
|
||||
```typescript title="TypeScript"
|
||||
import { extractFile } from "@kreuzberg/node";
|
||||
|
||||
const config = {
|
||||
languageDetection: {
|
||||
enabled: true,
|
||||
minConfidence: 0.8,
|
||||
detectMultiple: true,
|
||||
},
|
||||
};
|
||||
|
||||
const result = await extractFile("multilingual_document.pdf", null, config);
|
||||
|
||||
console.log(`Detected languages: ${result.detectedLanguages?.join(", ") ?? ""}`);
|
||||
```
|
||||
128
docs/snippets/typescript/advanced/plugin_registry.md
Normal file
128
docs/snippets/typescript/advanced/plugin_registry.md
Normal file
@@ -0,0 +1,128 @@
|
||||
```typescript title="TypeScript"
|
||||
import {
|
||||
registerPostProcessor,
|
||||
registerValidator,
|
||||
registerOcrBackend,
|
||||
listPostProcessors,
|
||||
listValidators,
|
||||
listOcrBackends,
|
||||
unregisterPostProcessor,
|
||||
unregisterValidator,
|
||||
clearPostProcessors,
|
||||
clearValidators,
|
||||
clearOcrBackends,
|
||||
type PostProcessorProtocol,
|
||||
type ValidatorProtocol,
|
||||
type OcrBackendProtocol,
|
||||
} from "@kreuzberg/node";
|
||||
|
||||
/**
 * Plugin registry and lifecycle management.
 * Queues plugin instances locally and registers, lists, unregisters, or clears
 * them through the library functions imported by this snippet.
 *
 * @example
 * const registry = new PluginRegistry();
 * registry.registerAll();
 * registry.listAll();
 */
class PluginRegistry {
  private postProcessors: PostProcessorProtocol[] = [];
  private validators: ValidatorProtocol[] = [];
  private ocrBackends: OcrBackendProtocol[] = [];

  /** Queue a post-processor for the next `registerAll()` call. */
  addPostProcessor(processor: PostProcessorProtocol): void {
    this.postProcessors.push(processor);
  }

  /** Queue a validator for the next `registerAll()` call. */
  addValidator(validator: ValidatorProtocol): void {
    this.validators.push(validator);
  }

  /** Queue an OCR backend for the next `registerAll()` call. */
  addOcrBackend(backend: OcrBackendProtocol): void {
    this.ocrBackends.push(backend);
  }

  /**
   * Register every queued plugin: post-processors first, then validators,
   * then OCR backends, logging each one after it is registered.
   */
  registerAll(): void {
    console.log("Registering all plugins...");

    for (const queuedProcessor of this.postProcessors) {
      registerPostProcessor(queuedProcessor);
      console.log(`Registered post-processor: ${queuedProcessor.name()}`);
    }

    for (const queuedValidator of this.validators) {
      registerValidator(queuedValidator);
      console.log(`Registered validator: ${queuedValidator.name()}`);
    }

    for (const queuedBackend of this.ocrBackends) {
      registerOcrBackend(queuedBackend);
      console.log(`Registered OCR backend: ${queuedBackend.name()}`);
    }
  }

  /** Print the names returned by the three `list*` functions. */
  listAll(): void {
    const processorNames = listPostProcessors();
    const validatorNames = listValidators();
    const backendNames = listOcrBackends();

    console.log("Registered plugins:");
    console.log(` Post-processors: ${processorNames.join(", ")}`);
    console.log(` Validators: ${validatorNames.join(", ")}`);
    console.log(` OCR backends: ${backendNames.join(", ")}`);
  }

  /**
   * Unregister a single plugin by name.
   *
   * @param name Name the plugin was registered under.
   * @param type Which kind of plugin to unregister.
   */
  unregisterPlugin(name: string, type: "processor" | "validator"): void {
    switch (type) {
      case "processor":
        unregisterPostProcessor(name);
        console.log(`Unregistered post-processor: ${name}`);
        break;
      case "validator":
        unregisterValidator(name);
        console.log(`Unregistered validator: ${name}`);
        break;
    }
  }

  /** Clear post-processors, validators, and OCR backends, then log once. */
  clearAll(): void {
    clearPostProcessors();
    clearValidators();
    clearOcrBackends();
    console.log("Cleared all plugins");
  }
}
|
||||
|
||||
// Usage
|
||||
const registry = new PluginRegistry();
|
||||
|
||||
// Add plugins to registry
|
||||
// registry.addPostProcessor(customProcessor);
|
||||
// registry.addValidator(customValidator);
|
||||
|
||||
// Register all plugins
|
||||
registry.registerAll();
|
||||
|
||||
// List registered plugins
|
||||
registry.listAll();
|
||||
|
||||
// Unregister specific plugin
|
||||
// registry.unregisterPlugin("custom-processor", "processor");
|
||||
|
||||
// Clear all plugins
|
||||
// registry.clearAll();
|
||||
```
|
||||
@@ -0,0 +1,5 @@
|
||||
```typescript title="TypeScript"
|
||||
const config = {
|
||||
enableQualityProcessing: true,
|
||||
};
|
||||
```
|
||||
@@ -0,0 +1,17 @@
|
||||
```typescript title="TypeScript"
|
||||
import { extractFile } from "@kreuzberg/node";
|
||||
|
||||
const config = {
|
||||
enableQualityProcessing: true,
|
||||
};
|
||||
|
||||
const result = await extractFile("scanned_document.pdf", null, config);
|
||||
|
||||
if (result.qualityScore !== null && result.qualityScore !== undefined) {
|
||||
if (result.qualityScore < 0.5) {
|
||||
console.warn(`Warning: Low quality extraction (${result.qualityScore.toFixed(2)})`);
|
||||
} else {
|
||||
console.log(`Quality score: ${result.qualityScore.toFixed(2)}`);
|
||||
}
|
||||
}
|
||||
```
|
||||
10
docs/snippets/typescript/advanced/token_reduction_config.md
Normal file
10
docs/snippets/typescript/advanced/token_reduction_config.md
Normal file
@@ -0,0 +1,10 @@
|
||||
```typescript title="TypeScript"
|
||||
const config = {
|
||||
tokenReduction: {
|
||||
level: "Moderate",
|
||||
preserveMarkdown: true,
|
||||
preserveCode: true,
|
||||
languageHint: "eng",
|
||||
},
|
||||
};
|
||||
```
|
||||
14
docs/snippets/typescript/advanced/token_reduction_example.md
Normal file
14
docs/snippets/typescript/advanced/token_reduction_example.md
Normal file
@@ -0,0 +1,14 @@
|
||||
```typescript title="TypeScript"
|
||||
import { extractFile } from "@kreuzberg/node";
|
||||
|
||||
const config = {
|
||||
tokenReduction: {
|
||||
level: "Moderate",
|
||||
preserveMarkdown: true,
|
||||
},
|
||||
};
|
||||
|
||||
const result = await extractFile("verbose_document.pdf", null, config);
|
||||
|
||||
console.log(`Reduced content length: ${result.content?.length ?? 0} chars`);
|
||||
```
|
||||
@@ -0,0 +1,49 @@
|
||||
```typescript title="TypeScript"
|
||||
import { extractFile } from "@kreuzberg/node";
|
||||
|
||||
interface VectorRecord {
|
||||
id: string;
|
||||
content: string;
|
||||
embedding: number[];
|
||||
metadata: Record<string, string>;
|
||||
}
|
||||
|
||||
/**
 * Extract a document with chunking and embeddings enabled, then turn every
 * chunk that carries an embedding into a vector record.
 *
 * @param documentPath Path handed to `extractFile`.
 * @param documentId Identifier used in record ids and in record metadata.
 * @returns One record per embedded chunk; ids use the chunk's position in the
 *   full chunk list, so skipped chunks leave gaps in the numbering.
 */
async function extractAndVectorize(
  documentPath: string,
  documentId: string,
): Promise<VectorRecord[]> {
  const embedding = {
    model: { type: "preset", name: "balanced" },
    normalize: true,
    batchSize: 32,
  };
  const chunking = { maxChars: 512, maxOverlap: 50, embedding };

  const result = await extractFile(documentPath, null, { chunking });
  const allChunks = result.chunks ?? [];

  const records: VectorRecord[] = [];
  for (const [position, chunk] of allChunks.entries()) {
    // Chunks without an embedding cannot be stored as vectors.
    if (chunk.embedding) {
      const metadata = {
        document_id: documentId,
        chunk_index: String(position),
        content_length: String(chunk.content.length),
      };
      records.push({
        id: `${documentId}_chunk_${position}`,
        content: chunk.content,
        embedding: chunk.embedding,
        metadata,
      });
    }
  }
  return records;
}
|
||||
|
||||
await extractAndVectorize("document.pdf", "doc_001");
|
||||
```
|
||||
50
docs/snippets/typescript/advanced/worker_pool.md
Normal file
50
docs/snippets/typescript/advanced/worker_pool.md
Normal file
@@ -0,0 +1,50 @@
|
||||
# Worker Thread Pool Example
|
||||
|
||||
Use worker pools for CPU-bound batch processing to maximize parallelism and throughput:
|
||||
|
||||
```typescript title="worker_pool.ts"
|
||||
import {
|
||||
createWorkerPool,
|
||||
extractFileInWorker,
|
||||
batchExtractFilesInWorker,
|
||||
closeWorkerPool,
|
||||
} from "@kreuzberg/node";
|
||||
|
||||
// Create a pool with 4 worker threads
|
||||
const pool = createWorkerPool(4);
|
||||
|
||||
try {
|
||||
// Extract single file in worker
|
||||
const result = await extractFileInWorker(pool, "document.pdf", null, {
|
||||
useCache: true,
|
||||
});
|
||||
console.log(result.content);
|
||||
|
||||
// Extract multiple files concurrently
|
||||
const files = ["doc1.pdf", "doc2.docx", "doc3.xlsx"];
|
||||
const results = await batchExtractFilesInWorker(pool, files, {
|
||||
useCache: true,
|
||||
});
|
||||
|
||||
results.forEach((result, i) => {
|
||||
console.log(`File ${i + 1}: ${result.content.length} characters`);
|
||||
});
|
||||
} finally {
|
||||
// Always close the pool when done
|
||||
await closeWorkerPool(pool);
|
||||
}
|
||||
```
|
||||
|
||||
**Performance Benefits:**
|
||||
|
||||
- **Parallel Processing**: Multiple documents extracted simultaneously
|
||||
- **CPU Usage**: Maximizes multi-core CPU usage for large batches
|
||||
- **Queue Management**: Automatically distributes work across available workers
|
||||
- **Resource Control**: Prevents thread exhaustion with configurable pool size
|
||||
|
||||
**Best Practices:**
|
||||
|
||||
- Use worker pools for batches of 10+ documents
|
||||
- Set pool size to number of CPU cores (default behavior)
|
||||
- Always close pools with `closeWorkerPool()` to prevent resource leaks
|
||||
- Reuse pools across multiple batch operations for efficiency
|
||||
18
docs/snippets/typescript/api/batch_extract_bytes_sync.md
Normal file
18
docs/snippets/typescript/api/batch_extract_bytes_sync.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```typescript title="TypeScript"
|
||||
import { batchExtractBytesSync } from "kreuzberg";
|
||||
import { readFileSync } from "fs";
|
||||
|
||||
const doc1 = readFileSync("doc1.pdf");
|
||||
const doc2 = readFileSync("doc2.pdf");
|
||||
|
||||
const items = [
|
||||
{ content: doc1, mimeType: "application/pdf", config: undefined },
|
||||
{ content: doc2, mimeType: "application/pdf", config: undefined },
|
||||
];
|
||||
|
||||
const results = batchExtractBytesSync(items);
|
||||
|
||||
results.forEach((result, i) => {
|
||||
console.log(`Document ${i}: ${result.content.length} chars`);
|
||||
});
|
||||
```
|
||||
15
docs/snippets/typescript/api/batch_extract_files_sync.md
Normal file
15
docs/snippets/typescript/api/batch_extract_files_sync.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```typescript title="TypeScript"
|
||||
import { batchExtractFilesSync } from "kreuzberg";
|
||||
|
||||
const items = [
|
||||
{ path: "doc1.pdf", config: undefined },
|
||||
{ path: "doc2.docx", config: undefined },
|
||||
{ path: "report.pdf", config: undefined },
|
||||
];
|
||||
|
||||
const results = batchExtractFilesSync(items);
|
||||
|
||||
results.forEach((result, i) => {
|
||||
console.log(`File ${i}: ${result.content.length} chars`);
|
||||
});
|
||||
```
|
||||
23
docs/snippets/typescript/api/client_chunk_text.md
Normal file
23
docs/snippets/typescript/api/client_chunk_text.md
Normal file
@@ -0,0 +1,23 @@
|
||||
```typescript title="TypeScript"
|
||||
// POST the text to the local chunking endpoint as JSON.
const response = await fetch("http://localhost:8000/chunk", {
  method: "POST",
  headers: { "Content-Type": "application/json" },
  body: JSON.stringify({
    text: "Your long text content here...",
    chunker_type: "text",
    config: {
      max_characters: 1000,
      overlap: 50,
      trim: true,
    },
  }),
});

// Fail fast on HTTP errors instead of reading chunk fields from an error body.
if (!response.ok) {
  throw new Error(`Chunk request failed: ${response.status} ${response.statusText}`);
}

const result = await response.json();

console.log(`Created ${result.chunk_count} chunks`);
result.chunks.forEach((chunk: { content: string; chunk_index: number }) => {
  // Show only the first 50 characters of each chunk.
  const preview = chunk.content.substring(0, 50);
  console.log(`Chunk ${chunk.chunk_index}: ${preview}...`);
});
|
||||
```
|
||||
32
docs/snippets/typescript/api/client_extract_single_file.md
Normal file
32
docs/snippets/typescript/api/client_extract_single_file.md
Normal file
@@ -0,0 +1,32 @@
|
||||
```typescript title="TypeScript"
|
||||
import { readFileSync } from "fs";
|
||||
|
||||
/**
 * Upload `document.pdf` to the local extraction server and print the result
 * count and the first document's content.
 *
 * Failures (unreadable file, network error, HTTP error, malformed response)
 * are reported through `console.error` instead of rejecting, because the
 * snippet calls this function without awaiting it.
 */
async function extractViaClient() {
  try {
    // Reading the file inside the try block keeps a missing or unreadable
    // file from surfacing as an unhandled promise rejection.
    const fileData = readFileSync("document.pdf");
    const formData = new FormData();
    formData.append("files", new Blob([fileData]), "document.pdf");

    const response = await fetch("http://localhost:8000/extract", {
      method: "POST",
      body: formData,
    });

    if (!response.ok) {
      // The error body is read as JSON with `error_type` and `message` fields.
      const error = await response.json();
      console.error(`Error: ${error.error_type}: ${error.message}`);
      return;
    }

    const results = await response.json();
    console.log(`Extracted ${results.length} document(s)`);
    console.log(results[0].content);
  } catch (error: unknown) {
    // Report non-Error throwables too rather than dropping them silently.
    const reason = error instanceof Error ? error.message : String(error);
    console.error(`Request failed: ${reason}`);
  }
}
|
||||
|
||||
extractViaClient();
|
||||
```
|
||||
45
docs/snippets/typescript/api/combining_all_features.md
Normal file
45
docs/snippets/typescript/api/combining_all_features.md
Normal file
@@ -0,0 +1,45 @@
|
||||
```typescript title="TypeScript"
|
||||
import { extractFileSync } from "@kreuzberg/node";

// NOTE(review): names below use the camelCase spelling found in the other
// TypeScript snippets (maxChars, maxOverlap, chunkerType, prependHeadingContext,
// useCache, enableQualityProcessing, detectedLanguages). forceOcr, outputFormat,
// includeDocumentStructure, images.extractImages and extractionMethod are
// assumed to follow the same convention — confirm against the package types.
const config = {
  // OCR: Tesseract backend with English text; OCR is not forced
  forceOcr: false,
  ocr: {
    backend: "tesseract",
    language: "eng",
  },
  // Chunking: markdown chunks of up to 800 chars, 100-char overlap
  chunking: {
    maxChars: 800,
    maxOverlap: 100,
    chunkerType: "markdown",
    prependHeadingContext: true,
  },
  // Output: markdown, including document structure
  outputFormat: "markdown",
  includeDocumentStructure: true,
  // Images: extract embedded images
  images: {
    extractImages: true,
  },
  // Enable result caching and quality processing
  useCache: true,
  enableQualityProcessing: true,
};

const result = extractFileSync("report.pdf", undefined, config);

console.log(`Content (${result.content.length} chars):`);
console.log(result.content.slice(0, 200));

if (result.chunks) {
  console.log(`\nChunks: ${result.chunks.length}`);
}
console.log(`Tables: ${result.tables?.length ?? 0}`);
if (result.detectedLanguages) {
  console.log(`Languages: ${result.detectedLanguages.join(", ")}`);
}
if (result.extractionMethod) {
  console.log(`Extraction method: ${result.extractionMethod}`);
}
|
||||
```
|
||||
13
docs/snippets/typescript/api/error_handling.md
Normal file
13
docs/snippets/typescript/api/error_handling.md
Normal file
@@ -0,0 +1,13 @@
|
||||
```typescript title="TypeScript"
|
||||
import { extractFileSync } from "@kreuzberg/node";

try {
  const result = extractFileSync("missing.pdf");
  console.log(result.content);
} catch (error: unknown) {
  // Log a readable message for real Error objects, then rethrow so the
  // failure still reaches the caller.
  if (error instanceof Error) {
    console.error(`Extraction failed: ${error.message}`);
  }
  throw error;
}
|
||||
```
|
||||
31
docs/snippets/typescript/api/error_handling_extract.md
Normal file
31
docs/snippets/typescript/api/error_handling_extract.md
Normal file
@@ -0,0 +1,31 @@
|
||||
```typescript title="TypeScript"
|
||||
import { readFileSync } from "node:fs";
|
||||
|
||||
/**
 * Upload `document.pdf` to the local extraction server and print how many
 * documents were extracted.
 *
 * Failures (unreadable file, network error, HTTP error, malformed response)
 * are reported through `console.error` instead of rejecting, because the
 * snippet calls this function without awaiting it.
 */
async function extractDocument(): Promise<void> {
  try {
    // Reading the file inside the try block keeps a missing or unreadable
    // file from surfacing as an unhandled promise rejection.
    const fileData = readFileSync("document.pdf");
    const formData = new FormData();
    formData.append("files", new Blob([fileData]), "document.pdf");

    const response = await fetch("http://localhost:8000/extract", {
      method: "POST",
      body: formData,
    });

    if (!response.ok) {
      // The error body is read as JSON with `error_type` and `message` fields.
      const error = await response.json();
      console.error(`Error: ${error.error_type}: ${error.message}`);
      return;
    }

    const results = await response.json();
    console.log(`Extracted ${results.length} documents`);
  } catch (error: unknown) {
    // Report non-Error throwables too rather than dropping them silently.
    const reason = error instanceof Error ? error.message : String(error);
    console.error(`Request failed: ${reason}`);
  }
}
|
||||
|
||||
extractDocument();
|
||||
```
|
||||
14
docs/snippets/typescript/api/extract_bytes_async.md
Normal file
14
docs/snippets/typescript/api/extract_bytes_async.md
Normal file
@@ -0,0 +1,14 @@
|
||||
```typescript title="TypeScript"
|
||||
import { extractBytes } from "@kreuzberg/node";
import { readFileSync } from "fs";

/** Read document.pdf into memory, extract it as a PDF, and print the content and table count. */
async function main() {
  const content = readFileSync("document.pdf");
  const result = await extractBytes(content, "application/pdf");

  console.log(result.content);
  console.log(`Tables: ${result.tables?.length ?? 0}`);
}

// Handle the rejection explicitly instead of leaving the promise floating.
main().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});
|
||||
```
|
||||
10
docs/snippets/typescript/api/extract_bytes_sync.md
Normal file
10
docs/snippets/typescript/api/extract_bytes_sync.md
Normal file
@@ -0,0 +1,10 @@
|
||||
```typescript title="TypeScript"
|
||||
import { extractBytesSync } from "@kreuzberg/node";
import { readFileSync } from "fs";

// Read the document into memory and extract it as a PDF.
const content = readFileSync("document.pdf");
const result = extractBytesSync(content, "application/pdf");

console.log(result.content);
// `tables` may be absent, so fall back to 0.
console.log(`Tables: ${result.tables?.length ?? 0}`);
|
||||
```
|
||||
12
docs/snippets/typescript/api/extract_file_async.md
Normal file
12
docs/snippets/typescript/api/extract_file_async.md
Normal file
@@ -0,0 +1,12 @@
|
||||
```typescript title="TypeScript"
|
||||
import { extractFile } from "@kreuzberg/node";

/** Extract document.pdf and print its content and table count. */
async function main() {
  const result = await extractFile("document.pdf");

  console.log(result.content);
  console.log(`Tables: ${result.tables?.length ?? 0}`);
}

// Handle the rejection explicitly instead of leaving the promise floating.
main().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});
|
||||
```
|
||||
9
docs/snippets/typescript/api/extract_file_sync.md
Normal file
9
docs/snippets/typescript/api/extract_file_sync.md
Normal file
@@ -0,0 +1,9 @@
|
||||
```typescript title="TypeScript"
|
||||
import { extractFileSync } from "@kreuzberg/node";

const result = extractFileSync("document.pdf");

console.log(result.content);
// NOTE(review): other result fields in these snippets are camelCase (e.g.
// detectedLanguages) — confirm whether this should be `result.mimeType`.
console.log(`MIME type: ${result.mime_type}`);
// `tables` may be absent, so fall back to 0.
console.log(`Tables: ${result.tables?.length ?? 0}`);
|
||||
```
|
||||
11
docs/snippets/typescript/api/tables.md
Normal file
11
docs/snippets/typescript/api/tables.md
Normal file
@@ -0,0 +1,11 @@
|
||||
```typescript title="TypeScript"
|
||||
import { extractFileSync } from "@kreuzberg/node";

const result = extractFileSync("document.pdf");

// `tables` may be absent, so every access is optional-chained.
result.tables?.forEach((table) => {
  console.log(`Table with ${table.cells?.length ?? 0} rows`);
  // Print the table's markdown rendering, followed by its raw cells row by row.
  console.log(table.markdown);
  table.cells?.forEach((row) => console.log(row.join(" | ")));
});
|
||||
```
|
||||
49
docs/snippets/typescript/benchmarking/simple_benchmark.ts
Normal file
49
docs/snippets/typescript/benchmarking/simple_benchmark.ts
Normal file
@@ -0,0 +1,49 @@
|
||||
```typescript title="simple_benchmark.ts"
|
||||
/* oxlint-disable */
|
||||
import { extractFile, type ExtractionConfig } from "@kreuzberg/node";
|
||||
|
||||
/**
 * Times `extractFile` on `document.pdf` in three scenarios and logs the results:
 * back-to-back uncached runs, the same number of uncached runs started in
 * parallel, and a first/second pair of runs with `useCache` enabled.
 */
async function benchmarkExtractions(): Promise<void> {
  const filePath = "document.pdf";
  const numRuns = 10;
  const config: ExtractionConfig = { useCache: false };

  // Seconds elapsed since a `performance.now()` timestamp.
  const secondsSince = (startedAt: number): number => (performance.now() - startedAt) / 1000;

  // Scenario 1: one extraction at a time.
  let startedAt = performance.now();
  for (let run = 0; run < numRuns; run += 1) {
    await extractFile(filePath, null, config);
  }
  const syncDuration = secondsSince(startedAt);

  console.log(`Sync extraction (${numRuns} runs):`);
  console.log(` - Total time: ${syncDuration.toFixed(3)}s`);
  console.log(` - Average: ${(syncDuration / numRuns).toFixed(3)}s per extraction`);

  // Scenario 2: start every extraction up front and wait for all of them.
  startedAt = performance.now();
  await Promise.all(Array.from({ length: numRuns }, () => extractFile(filePath, null, config)));
  const asyncDuration = secondsSince(startedAt);

  console.log(`\nAsync extraction (${numRuns} parallel runs):`);
  console.log(` - Total time: ${asyncDuration.toFixed(3)}s`);
  console.log(` - Average: ${(asyncDuration / numRuns).toFixed(3)}s per extraction`);
  console.log(` - Speedup: ${(syncDuration / asyncDuration).toFixed(1)}x`);

  // Scenario 3: the same file twice with caching enabled.
  const cacheConfig: ExtractionConfig = { useCache: true };

  console.log("\nFirst extraction (populates cache)...");
  startedAt = performance.now();
  await extractFile(filePath, null, cacheConfig);
  const firstDuration = secondsSince(startedAt);
  console.log(` - Time: ${firstDuration.toFixed(3)}s`);

  console.log("Second extraction (from cache)...");
  startedAt = performance.now();
  await extractFile(filePath, null, cacheConfig);
  const cachedDuration = secondsSince(startedAt);
  console.log(` - Time: ${cachedDuration.toFixed(3)}s`);
  console.log(` - Cache speedup: ${(firstDuration / cachedDuration).toFixed(1)}x`);
}
|
||||
|
||||
benchmarkExtractions().catch(console.error);
|
||||
20
docs/snippets/typescript/cache/disk_cache.ts
vendored
Normal file
20
docs/snippets/typescript/cache/disk_cache.ts
vendored
Normal file
@@ -0,0 +1,20 @@
|
||||
```typescript title="disk_cache.ts"
|
||||
/* oxlint-disable */
|
||||
import { extractFile, type ExtractionConfig } from "@kreuzberg/node";
|
||||
|
||||
const config: ExtractionConfig = { useCache: true };

/**
 * Extracts `document.pdf` twice with caching enabled and logs each content
 * length plus whether the two extracted texts are identical.
 */
async function compareCachedRuns(): Promise<void> {
  console.log("First extraction (will be cached)...");
  const first = await extractFile("document.pdf", null, config);
  console.log(` - Content length: ${first.content.length}`);

  console.log("\nSecond extraction (from cache)...");
  const second = await extractFile("document.pdf", null, config);
  console.log(` - Content length: ${second.content.length}`);

  console.log(`\nResults are identical: ${first.content === second.content}`);
}

compareCachedRuns();
|
||||
51
docs/snippets/typescript/cli/basic_cli.ts
Normal file
51
docs/snippets/typescript/cli/basic_cli.ts
Normal file
@@ -0,0 +1,51 @@
|
||||
```typescript title="basic_cli.ts"
|
||||
/* oxlint-disable */
|
||||
import { spawn } from "child_process";
|
||||
|
||||
/** Shape this snippet assumes for the CLI's JSON output. */
interface ExtractionResult {
  content: string;
  format?: string;
  languages?: string[];
}

/**
 * Runs `kreuzberg extract <filePath> --format <outputFormat>` and collects its output.
 *
 * @param filePath - Document path handed to the CLI.
 * @param outputFormat - Value for `--format`. `"json"` output is parsed; any
 *   other format is returned as raw stdout.
 * @returns The raw stdout text, or the parsed JSON when `outputFormat` is `"json"`.
 * @throws Rejects with an `Error` when the process cannot be spawned, exits
 *   with a non-zero code, or prints output that is not valid JSON in JSON mode.
 */
async function extractWithCli(
  filePath: string,
  outputFormat: string = "text"
): Promise<string | ExtractionResult> {
  return new Promise((resolve, reject) => {
    const child = spawn("kreuzberg", ["extract", filePath, "--format", outputFormat]);

    let stdout = "";
    let stderr = "";

    // Decode at the stream level so a multi-byte character split across two
    // chunks is not corrupted by decoding each chunk on its own.
    child.stdout.setEncoding("utf8");
    child.stderr.setEncoding("utf8");

    child.stdout.on("data", (data) => {
      stdout += data;
    });

    child.stderr.on("data", (data) => {
      stderr += data;
    });

    // Without an "error" listener a failed spawn (for example the binary is
    // not on PATH) is thrown as an uncaught exception instead of rejecting.
    child.on("error", reject);

    child.on("close", (code) => {
      if (code !== 0) {
        reject(new Error("CLI exited with code " + code + ": " + stderr));
        return;
      }

      if (outputFormat !== "json") {
        resolve(stdout);
        return;
      }

      // A throw inside this event handler would escape the promise, so turn
      // malformed JSON into a rejection explicitly.
      try {
        resolve(JSON.parse(stdout));
      } catch (error) {
        reject(error);
      }
    });
  });
}
|
||||
|
||||
const document = "document.pdf";

// Plain-text mode: the helper resolves with the CLI's raw stdout.
const textOutput = await extractWithCli(document, "text");
const extractedLength = (textOutput as string).length;
console.log(`Extracted: ${extractedLength} characters`);

// JSON mode: the helper resolves with the parsed result object.
const jsonOutput = (await extractWithCli(document, "json")) as ExtractionResult;
const { format } = jsonOutput;
console.log(`Format: ${format}`);
|
||||
```
|
||||
57
docs/snippets/typescript/cli/cli_with_config.ts
Normal file
57
docs/snippets/typescript/cli/cli_with_config.ts
Normal file
@@ -0,0 +1,57 @@
|
||||
```typescript title="cli_with_config.ts"
|
||||
/* oxlint-disable */
|
||||
import { spawn } from "child_process";
|
||||
|
||||
/** Shape this snippet assumes for the CLI's JSON output. */
interface ExtractionResult {
  content: string;
  format?: string;
  languages?: string[];
}

/**
 * Runs `kreuzberg extract <filePath> --config <configPath> --format json` and
 * parses the JSON it prints.
 *
 * @param filePath - Document path handed to the CLI.
 * @param configPath - Configuration file passed via `--config`.
 * @returns The parsed JSON output.
 * @throws Rejects with an `Error` when the process cannot be spawned, exits
 *   with a non-zero code, or prints output that is not valid JSON.
 */
async function extractWithConfig(
  filePath: string,
  configPath: string
): Promise<ExtractionResult> {
  return new Promise((resolve, reject) => {
    const args = ["extract", filePath, "--config", configPath, "--format", "json"];
    const child = spawn("kreuzberg", args);

    let stdout = "";
    let stderr = "";

    // Decode at the stream level so a multi-byte character split across two
    // chunks is not corrupted by decoding each chunk on its own.
    child.stdout.setEncoding("utf8");
    child.stderr.setEncoding("utf8");

    child.stdout.on("data", (data) => {
      stdout += data;
    });

    child.stderr.on("data", (data) => {
      stderr += data;
    });

    // Without an "error" listener a failed spawn (for example the binary is
    // not on PATH) is thrown as an uncaught exception instead of rejecting.
    child.on("error", reject);

    child.on("close", (code) => {
      if (code !== 0) {
        reject(new Error("CLI exited with code " + code + ": " + stderr));
        return;
      }

      // A throw inside this event handler would escape the promise, so turn
      // malformed JSON into a rejection explicitly.
      try {
        resolve(JSON.parse(stdout));
      } catch (error) {
        reject(error);
      }
    });
  });
}
|
||||
|
||||
const configFile = "kreuzberg.toml";
const document = "document.pdf";

console.log(`Extracting ${document} with config ${configFile}`);
const result = await extractWithConfig(document, configFile);

console.log(`Content length: ${result.content.length}`);
console.log(`Format: ${result.format}`);
// `languages` is optional; print an empty list when it is missing.
const languages = result.languages?.join(", ") ?? "";
console.log(`Languages: ${languages}`);
|
||||
35
docs/snippets/typescript/config/advanced_config.md
Normal file
35
docs/snippets/typescript/config/advanced_config.md
Normal file
@@ -0,0 +1,35 @@
|
||||
```typescript title="TypeScript"
|
||||
import { extractFileSync } from "@kreuzberg/node";
|
||||
|
||||
const config = {
|
||||
ocr: {
|
||||
backend: "tesseract",
|
||||
language: "eng+deu",
|
||||
},
|
||||
chunking: {
|
||||
maxChars: 1000,
|
||||
maxOverlap: 100,
|
||||
},
|
||||
tokenReduction: {
|
||||
mode: "aggressive",
|
||||
},
|
||||
languageDetection: {
|
||||
enabled: true,
|
||||
detectMultiple: true,
|
||||
},
|
||||
useCache: true,
|
||||
enableQualityProcessing: true,
|
||||
};
|
||||
|
||||
const result = extractFileSync("document.pdf", null, config);
|
||||
|
||||
if (result.chunks) {
|
||||
for (const chunk of result.chunks) {
|
||||
console.log(`Chunk: ${chunk.content.substring(0, 100)}...`);
|
||||
}
|
||||
}
|
||||
|
||||
if (result.detectedLanguages) {
|
||||
console.log(`Languages: ${result.detectedLanguages.join(", ")}`);
|
||||
}
|
||||
```
|
||||
70
docs/snippets/typescript/config/chunking_config.md
Normal file
70
docs/snippets/typescript/config/chunking_config.md
Normal file
@@ -0,0 +1,70 @@
|
||||
```typescript title="TypeScript"
|
||||
import { extractFile } from "@kreuzberg/node";
|
||||
|
||||
const config = {
|
||||
chunking: {
|
||||
maxChars: 1000,
|
||||
maxOverlap: 200,
|
||||
},
|
||||
};
|
||||
|
||||
const result = await extractFile("document.pdf", null, config);
|
||||
console.log(`Total chunks: ${result.chunks?.length ?? 0}`);
|
||||
```
|
||||
|
||||
```typescript title="TypeScript - Markdown with Heading Context"
|
||||
import { extractFile } from "@kreuzberg/node";
|
||||
|
||||
const config = {
|
||||
chunking: {
|
||||
chunkerType: "markdown",
|
||||
maxChars: 500,
|
||||
maxOverlap: 50,
|
||||
sizingType: "tokenizer",
|
||||
sizingModel: "Xenova/gpt-4o",
|
||||
},
|
||||
};
|
||||
|
||||
const result = await extractFile("document.md", null, config);
|
||||
for (const chunk of result.chunks ?? []) {
|
||||
const headings = chunk.metadata?.headingContext?.headings ?? [];
|
||||
for (const heading of headings) {
|
||||
console.log(`Heading L${heading.level}: ${heading.text}`);
|
||||
}
|
||||
console.log(`Content: ${chunk.content.slice(0, 100)}...`);
|
||||
}
|
||||
```
|
||||
|
||||
```typescript title="TypeScript - Semantic"
|
||||
import { extractFile } from "@kreuzberg/node";
|
||||
|
||||
const config = {
|
||||
chunking: {
|
||||
chunkerType: "semantic",
|
||||
},
|
||||
};
|
||||
|
||||
const result = await extractFile("document.pdf", null, config);
|
||||
for (const chunk of result.chunks ?? []) {
|
||||
console.log(`Content: ${chunk.content.slice(0, 100)}...`);
|
||||
}
|
||||
```
|
||||
|
||||
```typescript title="TypeScript - Prepend Heading Context"
|
||||
import { extractFile } from "@kreuzberg/node";
|
||||
|
||||
const config = {
|
||||
chunking: {
|
||||
chunkerType: "markdown",
|
||||
maxChars: 500,
|
||||
maxOverlap: 50,
|
||||
prependHeadingContext: true,
|
||||
},
|
||||
};
|
||||
|
||||
const result = await extractFile("document.md", null, config);
|
||||
for (const chunk of result.chunks ?? []) {
|
||||
// Each chunk's content is prefixed with its heading breadcrumb
|
||||
console.log(`Content: ${chunk.content.slice(0, 100)}...`);
|
||||
}
|
||||
```
|
||||
11
docs/snippets/typescript/config/config_basic.md
Normal file
11
docs/snippets/typescript/config/config_basic.md
Normal file
@@ -0,0 +1,11 @@
|
||||
```typescript title="TypeScript"
|
||||
import { extractFile } from "@kreuzberg/node";
|
||||
|
||||
const config = {
|
||||
useCache: true,
|
||||
enableQualityProcessing: true,
|
||||
};
|
||||
|
||||
const result = await extractFile("document.pdf", null, config);
|
||||
console.log(result.content);
|
||||
```
|
||||
11
docs/snippets/typescript/config/config_discover.md
Normal file
11
docs/snippets/typescript/config/config_discover.md
Normal file
@@ -0,0 +1,11 @@
|
||||
```typescript title="TypeScript"
|
||||
import { extractFile, ExtractionConfig } from "@kreuzberg/node";
|
||||
|
||||
const config = ExtractionConfig.discover();
|
||||
if (config) {
|
||||
const result = await extractFile("document.pdf", null, config);
|
||||
console.log(result.content);
|
||||
} else {
|
||||
console.log("No configuration file found");
|
||||
}
|
||||
```
|
||||
20
docs/snippets/typescript/config/config_discovery.md
Normal file
20
docs/snippets/typescript/config/config_discovery.md
Normal file
@@ -0,0 +1,20 @@
|
||||
# Configuration Discovery Example
|
||||
|
||||
Use `ExtractionConfig.discover()` to automatically find and load configuration files from the current directory or parent directories:
|
||||
|
||||
```typescript title="config_discovery.ts"
|
||||
import { ExtractionConfig, extractFile } from "@kreuzberg/node";
|
||||
|
||||
const config = ExtractionConfig.discover();
|
||||
if (config) {
|
||||
console.log("Found configuration file");
|
||||
const result = await extractFile("document.pdf", null, config);
|
||||
console.log(result.content);
|
||||
} else {
|
||||
console.log("No configuration file found, using defaults");
|
||||
const result = await extractFile("document.pdf");
|
||||
console.log(result.content);
|
||||
}
|
||||
```
|
||||
|
||||
The discovery method looks for `kreuzberg.toml`, `kreuzberg.yaml`, or `kreuzberg.json` files starting in the current directory and searching parent directories up to the filesystem root.
|
||||
16
docs/snippets/typescript/config/config_ocr.md
Normal file
16
docs/snippets/typescript/config/config_ocr.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```typescript title="TypeScript"
|
||||
import { extractFile } from "@kreuzberg/node";
|
||||
|
||||
const config = {
|
||||
ocr: {
|
||||
backend: "tesseract",
|
||||
language: "eng+fra",
|
||||
tesseractConfig: {
|
||||
psm: 3,
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
const result = await extractFile("document.pdf", null, config);
|
||||
console.log(result.content);
|
||||
```
|
||||
22
docs/snippets/typescript/config/config_programmatic.md
Normal file
22
docs/snippets/typescript/config/config_programmatic.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```typescript title="TypeScript"
|
||||
import { extractFileSync, type ExtractionConfig } from "@kreuzberg/node";
|
||||
|
||||
const config: ExtractionConfig = {
|
||||
useCache: true,
|
||||
ocr: {
|
||||
backend: "tesseract",
|
||||
language: "eng+deu",
|
||||
tesseractConfig: {
|
||||
psm: 6,
|
||||
},
|
||||
},
|
||||
chunking: {
|
||||
maxChars: 1000,
|
||||
maxOverlap: 200,
|
||||
},
|
||||
enableQualityProcessing: true,
|
||||
};
|
||||
|
||||
const result = extractFileSync("document.pdf", null, config);
|
||||
console.log(`Content length: ${result.content.length}`);
|
||||
```
|
||||
15
docs/snippets/typescript/config/document_structure_config.md
Normal file
15
docs/snippets/typescript/config/document_structure_config.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```typescript title="Document Structure Config (TypeScript)"
|
||||
import { extractFileSync, ExtractionConfig } from "@kreuzberg/node";
|
||||
|
||||
const config: ExtractionConfig = {
|
||||
includeDocumentStructure: true,
|
||||
};
|
||||
|
||||
const result = extractFileSync("document.pdf", undefined, config);
|
||||
|
||||
if (result.document) {
|
||||
for (const node of result.document.nodes) {
|
||||
console.log(`[${node.content.nodeType}] ${node.content.text ?? ""}`);
|
||||
}
|
||||
}
|
||||
```
|
||||
35
docs/snippets/typescript/config/element_based_output.md
Normal file
35
docs/snippets/typescript/config/element_based_output.md
Normal file
@@ -0,0 +1,35 @@
|
||||
```typescript title="Element-Based Output (TypeScript)"
|
||||
import { extractFileSync, ExtractionConfig } from "@kreuzberg/node";
|
||||
|
||||
// Configure element-based output
|
||||
const config: ExtractionConfig = {
  outputFormat: "element_based",
};

// Extract document
const result = extractFileSync("document.pdf", null, config);

// Guard against a missing `elements` list, the same way the other snippets
// guard `chunks`, `tables` and `pages`.
const elements = result.elements ?? [];

// Access elements
for (const element of elements) {
  console.log(`Type: ${element.elementType}`);
  console.log(`Text: ${element.text.slice(0, 100)}`);

  if (element.metadata.pageNumber) {
    console.log(`Page: ${element.metadata.pageNumber}`);
  }

  if (element.metadata.coordinates) {
    const coords = element.metadata.coordinates;
    console.log(`Coords: (${coords.left}, ${coords.top}) - (${coords.right}, ${coords.bottom})`);
  }

  console.log("---");
}

// Filter by element type
const titles = elements.filter((e) => e.elementType === "title");
for (const title of titles) {
  // Fall back to "unknown" when no level is recorded in the additional metadata.
  const level = title.metadata.additional?.level || "unknown";
  console.log(`[${level}] ${title.text}`);
}
|
||||
```
|
||||
17
docs/snippets/typescript/config/embedding_config.md
Normal file
17
docs/snippets/typescript/config/embedding_config.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```typescript title="TypeScript"
|
||||
import { extractFile } from "@kreuzberg/node";
|
||||
|
||||
const config = {
|
||||
chunking: {
|
||||
maxChars: 1000,
|
||||
embedding: {
|
||||
preset: "quality",
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
const result = await extractFile("document.pdf", null, config);
|
||||
if (result.chunks && result.chunks.length > 0) {
|
||||
console.log(`Chunk embeddings: ${result.chunks[0].embedding?.length ?? 0} dimensions`);
|
||||
}
|
||||
```
|
||||
91
docs/snippets/typescript/config/embedding_config.ts
Normal file
91
docs/snippets/typescript/config/embedding_config.ts
Normal file
@@ -0,0 +1,91 @@
|
||||
import type { ChunkingConfig, EmbeddingConfig, ExtractionConfig } from "@kreuzberg/node";
|
||||
|
||||
// Example 1: Preset model (recommended)
|
||||
// Fast, balanced, or quality preset configurations optimized for common use cases.
|
||||
const _embeddingConfig: EmbeddingConfig = {
|
||||
model: {
|
||||
type: "preset",
|
||||
name: "balanced",
|
||||
},
|
||||
batchSize: 32,
|
||||
normalize: true,
|
||||
showDownloadProgress: true,
|
||||
cacheDir: "~/.cache/kreuzberg/embeddings",
|
||||
};
|
||||
|
||||
// Available presets:
|
||||
// - "fast" (384 dims): Quick prototyping, development, resource-constrained
|
||||
// - "balanced" (768 dims): Production, general-purpose RAG, English documents
|
||||
// - "quality" (1024 dims): Complex documents, maximum accuracy
|
||||
// - "multilingual" (768 dims): International documents, 100+ languages
|
||||
|
||||
// Example 2: Custom ONNX model (requires embeddings feature)
|
||||
// Direct access to specific ONNX embedding models from HuggingFace with custom dimensions.
|
||||
const _embeddingConfig2: EmbeddingConfig = {
|
||||
model: {
|
||||
type: "custom",
|
||||
modelId: "BAAI/bge-small-en-v1.5",
|
||||
dimensions: 384,
|
||||
},
|
||||
batchSize: 32,
|
||||
normalize: true,
|
||||
showDownloadProgress: true,
|
||||
cacheDir: undefined, // Uses default: .kreuzberg/embeddings/
|
||||
};
|
||||
|
||||
// Popular ONNX-compatible models:
|
||||
// - "BAAI/bge-small-en-v1.5" (384 dims): Fast, efficient
|
||||
// - "BAAI/bge-base-en-v1.5" (768 dims): Balanced quality/speed
|
||||
// - "BAAI/bge-large-en-v1.5" (1024 dims): High quality, slower
|
||||
// - "sentence-transformers/paraphrase-multilingual-mpnet-base-v2" (768 dims): Multilingual support
|
||||
|
||||
// Example 3: Alternative Custom ONNX Model
|
||||
// For advanced users wanting different ONNX embedding models.
|
||||
const _embeddingConfig3: EmbeddingConfig = {
|
||||
model: {
|
||||
type: "custom",
|
||||
modelId: "sentence-transformers/all-mpnet-base-v2",
|
||||
dimensions: 768,
|
||||
},
|
||||
batchSize: 16, // Larger model requires smaller batch size
|
||||
normalize: true,
|
||||
showDownloadProgress: true,
|
||||
cacheDir: "/var/cache/embeddings",
|
||||
};
|
||||
|
||||
// Integration with ChunkingConfig
|
||||
// Add embeddings to your chunking configuration:
|
||||
const chunkingWithEmbeddings: ChunkingConfig = {
|
||||
maxChars: 1024,
|
||||
maxOverlap: 100,
|
||||
preset: "balanced",
|
||||
embedding: {
|
||||
model: {
|
||||
type: "preset",
|
||||
name: "balanced",
|
||||
},
|
||||
batchSize: 32,
|
||||
normalize: true,
|
||||
},
|
||||
};
|
||||
|
||||
const _extractionConfig: ExtractionConfig = {
|
||||
chunking: chunkingWithEmbeddings,
|
||||
};
|
||||
|
||||
// Key parameter explanations:
|
||||
//
|
||||
// batchSize: Number of texts to embed at once (32-128 typical)
|
||||
// - Larger batches are faster but use more memory
|
||||
// - Smaller batches for resource-constrained environments
|
||||
//
|
||||
// normalize: Whether to normalize vectors (L2 norm)
|
||||
// - true (recommended): Enables cosine similarity in vector DBs
|
||||
// - false: Raw embedding values
|
||||
//
|
||||
// cacheDir: Where to store downloaded models
|
||||
// - undefined: Uses .kreuzberg/embeddings/ in current directory
|
||||
// - string: Custom directory for model storage
|
||||
//
|
||||
// showDownloadProgress: Display download progress bar
|
||||
// - Useful for monitoring large model downloads
|
||||
89
docs/snippets/typescript/config/hierarchy_config.ts
Normal file
89
docs/snippets/typescript/config/hierarchy_config.ts
Normal file
@@ -0,0 +1,89 @@
|
||||
import {
|
||||
type ExtractionConfig,
|
||||
extractFileSync,
|
||||
type HierarchyConfig,
|
||||
type PdfConfig,
|
||||
} from "@kreuzberg/node";
|
||||
|
||||
// Example 1: Basic hierarchy extraction
|
||||
// Enabled with default kClusters=6 for standard H1-H6 heading hierarchy.
|
||||
// Extract bounding box information for spatial layout awareness.
|
||||
const hierarchyConfigBasic: HierarchyConfig = {
  enabled: true,
  kClusters: 6, // Default: creates 6 font size clusters (H1-H6 structure)
  includeBbox: true, // Include bounding box coordinates
  ocrCoverageThreshold: undefined, // No OCR coverage threshold
};

const pdfConfigBasic: PdfConfig = {
  hierarchy: hierarchyConfigBasic,
};

const extractionConfigBasic: ExtractionConfig = {
  pdfOptions: pdfConfigBasic,
};

// Pass the config itself as the third positional argument (second left as
// null), matching the other snippets; it is not wrapped in `{ config }`.
const _result = extractFileSync("document.pdf", null, extractionConfigBasic);
|
||||
|
||||
// Example 2: Custom kClusters for minimal structure
|
||||
// Use 3 clusters for simpler hierarchy with minimal structure.
|
||||
// Useful when you only need major section divisions (Main, Subsection, Detail).
|
||||
const hierarchyConfigMinimal: HierarchyConfig = {
  enabled: true,
  kClusters: 3, // Minimal clustering: just 3 levels
  includeBbox: true,
  ocrCoverageThreshold: undefined,
};

const pdfConfigMinimal: PdfConfig = {
  hierarchy: hierarchyConfigMinimal,
};

const extractionConfigMinimal: ExtractionConfig = {
  pdfOptions: pdfConfigMinimal,
};

// Pass the config itself as the third positional argument (second left as
// null), matching the other snippets; it is not wrapped in `{ config }`.
const _resultMinimal = extractFileSync("document.pdf", null, extractionConfigMinimal);
|
||||
|
||||
// Example 3: With OCR coverage threshold
|
||||
// Trigger OCR if less than 50% of text has font data.
|
||||
// Useful for documents with mixed digital and scanned content.
|
||||
const hierarchyConfigOcr: HierarchyConfig = {
  enabled: true,
  kClusters: 6,
  includeBbox: true,
  ocrCoverageThreshold: 0.5, // Trigger OCR if text coverage < 50%
};

const pdfConfigOcr: PdfConfig = {
  hierarchy: hierarchyConfigOcr,
};

const extractionConfigOcr: ExtractionConfig = {
  pdfOptions: pdfConfigOcr,
};

// Pass the config itself as the third positional argument (second left as
// null), matching the other snippets; it is not wrapped in `{ config }`.
const _resultOcr = extractFileSync("document.pdf", null, extractionConfigOcr);
|
||||
|
||||
// Field descriptions:
|
||||
//
|
||||
// enabled: boolean (default: true)
|
||||
// - Enable or disable hierarchy extraction
|
||||
// - When false, hierarchy structure is not analyzed
|
||||
//
|
||||
// kClusters: number (default: 6, valid: 1-7)
|
||||
// - Number of font size clusters for hierarchy levels
|
||||
// - 6 provides H1-H6 heading levels with body text
|
||||
// - Higher values create more fine-grained hierarchy
|
||||
// - Lower values create simpler structure
|
||||
//
|
||||
// includeBbox: boolean (default: true)
|
||||
// - Include bounding box coordinates in hierarchy blocks
|
||||
// - Required for spatial layout awareness and document structure
|
||||
// - Set to false only if space optimization is critical
|
||||
//
|
||||
// ocrCoverageThreshold: number | undefined (default: undefined)
|
||||
// - Range: 0.0 to 1.0
|
||||
// - Triggers OCR when text block coverage falls below this fraction
|
||||
// - Example: 0.5 means "run OCR if less than 50% of page has text data"
|
||||
// - undefined means no OCR coverage-based triggering
|
||||
12
docs/snippets/typescript/config/html_output.md
Normal file
12
docs/snippets/typescript/config/html_output.md
Normal file
@@ -0,0 +1,12 @@
|
||||
```typescript title="TypeScript"
|
||||
import { extractFile } from "@kreuzberg/node";

// Config goes in the third positional argument (second left as null), the
// same call shape the other snippets use.
const result = await extractFile("document.pdf", null, {
  outputFormat: "html",
  htmlOutput: {
    theme: "github",
    embedCss: true,
  },
});

console.log(result.content); // HTML with kb-* classes
|
||||
```
|
||||
70
docs/snippets/typescript/config/keyword_config.ts
Normal file
70
docs/snippets/typescript/config/keyword_config.ts
Normal file
@@ -0,0 +1,70 @@
|
||||
import {
|
||||
type ExtractionConfig,
|
||||
extractFile,
|
||||
type RakeParams,
|
||||
type YakeParams,
|
||||
} from "@kreuzberg/node";
|
||||
|
||||
// Example 1: Basic YAKE configuration
|
||||
// Uses YAKE algorithm with default parameters and English stopword filtering
|
||||
/**
 * Extracts `document.pdf` with YAKE keyword extraction and no algorithm-specific
 * parameter overrides, then logs the keywords from the result.
 */
async function basicYake(): Promise<void> {
  const keywords: ExtractionConfig["keywords"] = {
    algorithm: "yake",
    maxKeywords: 10,
    minScore: 0.0,
    ngramRange: [1, 3],
    language: "en",
    // Neither algorithm gets custom parameters in this example.
    yakeParams: null,
    rakeParams: null,
  };
  const config: ExtractionConfig = { keywords };

  const extraction = await extractFile("document.pdf", null, config);
  console.log("Keywords:", extraction.keywords);
}
|
||||
|
||||
// Example 2: Advanced YAKE with custom parameters
|
||||
// Fine-tunes YAKE with custom window size for co-occurrence analysis
|
||||
/**
 * Extracts `document.pdf` with YAKE keyword extraction using a custom
 * `windowSize`, then logs the keywords from the result.
 */
async function _advancedYake(): Promise<void> {
  // YAKE-specific override; RAKE parameters stay unset.
  const yakeParams = { windowSize: 1 } as YakeParams;

  const keywords: ExtractionConfig["keywords"] = {
    algorithm: "yake",
    maxKeywords: 15,
    minScore: 0.1,
    ngramRange: [1, 2],
    language: "en",
    yakeParams,
    rakeParams: null,
  };
  const config: ExtractionConfig = { keywords };

  const extraction = await extractFile("document.pdf", null, config);
  console.log("Keywords:", extraction.keywords);
}
|
||||
|
||||
// Example 3: RAKE configuration
|
||||
// Uses RAKE algorithm for rapid keyword extraction with phrase constraints
|
||||
/**
 * Extracts `document.pdf` with RAKE keyword extraction using custom word-length
 * and phrase-length limits, then logs the keywords from the result.
 */
async function _rakeConfig(): Promise<void> {
  // RAKE-specific overrides; YAKE parameters stay unset.
  const rakeParams = {
    minWordLength: 1,
    maxWordsPerPhrase: 3,
  } as RakeParams;

  const keywords: ExtractionConfig["keywords"] = {
    algorithm: "rake",
    maxKeywords: 10,
    minScore: 5.0,
    ngramRange: [1, 3],
    language: "en",
    yakeParams: null,
    rakeParams,
  };
  const config: ExtractionConfig = { keywords };

  const extraction = await extractFile("document.pdf", null, config);
  console.log("Keywords:", extraction.keywords);
}
|
||||
|
||||
basicYake();
|
||||
16
docs/snippets/typescript/config/keyword_extraction_config.md
Normal file
16
docs/snippets/typescript/config/keyword_extraction_config.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```typescript title="TypeScript"
|
||||
import { extractFile } from "@kreuzberg/node";
|
||||
|
||||
const config = {
|
||||
keywords: {
|
||||
algorithm: "yake",
|
||||
maxKeywords: 10,
|
||||
minScore: 0.3,
|
||||
ngramRange: [1, 3],
|
||||
language: "en",
|
||||
},
|
||||
};
|
||||
|
||||
const result = await extractFile("document.pdf", null, config);
|
||||
console.log(`Content: ${result.content}`);
|
||||
```
|
||||
16
docs/snippets/typescript/config/language_detection_config.md
Normal file
16
docs/snippets/typescript/config/language_detection_config.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```typescript title="TypeScript"
|
||||
import { extractFile } from "@kreuzberg/node";
|
||||
|
||||
const config = {
|
||||
languageDetection: {
|
||||
enabled: true,
|
||||
minConfidence: 0.8,
|
||||
detectMultiple: false,
|
||||
},
|
||||
};
|
||||
|
||||
const result = await extractFile("document.pdf", null, config);
|
||||
if (result.detectedLanguages) {
|
||||
console.log(`Detected languages: ${result.detectedLanguages.join(", ")}`);
|
||||
}
|
||||
```
|
||||
17
docs/snippets/typescript/config/ocr_dpi_config.md
Normal file
17
docs/snippets/typescript/config/ocr_dpi_config.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```typescript title="TypeScript"
|
||||
import { extractFileSync } from "@kreuzberg/node";
|
||||
|
||||
const config = {
|
||||
ocr: {
|
||||
backend: "tesseract",
|
||||
tesseractConfig: {
|
||||
preprocessing: {
|
||||
targetDpi: 300,
|
||||
},
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
const result = extractFileSync("scanned.pdf", null, config);
|
||||
console.log(`content length: ${result.content.length}`);
|
||||
```
|
||||
15
docs/snippets/typescript/config/pdf_config.md
Normal file
15
docs/snippets/typescript/config/pdf_config.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```typescript title="TypeScript"
|
||||
import { extractFile } from "@kreuzberg/node";
|
||||
|
||||
const config = {
|
||||
pdfOptions: {
|
||||
extractImages: true,
|
||||
extractMetadata: true,
|
||||
passwords: ["password1", "password2"],
|
||||
hierarchy: { enabled: true, kClusters: 6, includeBbox: true },
|
||||
},
|
||||
};
|
||||
|
||||
const result = await extractFile("document.pdf", null, config);
|
||||
console.log(result.content);
|
||||
```
|
||||
23
docs/snippets/typescript/config/pdf_hierarchy_config.md
Normal file
23
docs/snippets/typescript/config/pdf_hierarchy_config.md
Normal file
@@ -0,0 +1,23 @@
|
||||
```typescript title="TypeScript"
|
||||
import { extractFile } from "@kreuzberg/node";
|
||||
|
||||
const config = {
|
||||
pdfOptions: {
|
||||
extractMetadata: true,
|
||||
hierarchy: {
|
||||
enabled: true,
|
||||
kClusters: 6,
|
||||
includeBbox: true,
|
||||
ocrCoverageThreshold: 0.8,
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
const result = await extractFile("document.pdf", null, config);
|
||||
if (result.pages) {
|
||||
result.pages.forEach((page) => {
|
||||
console.log(`Page ${page.pageNumber}:`);
|
||||
console.log(` Content: ${page.content.substring(0, 100)}...`);
|
||||
});
|
||||
}
|
||||
```
|
||||
14
docs/snippets/typescript/config/postprocessor_config.md
Normal file
14
docs/snippets/typescript/config/postprocessor_config.md
Normal file
@@ -0,0 +1,14 @@
|
||||
```typescript title="TypeScript"
|
||||
import { extractFile } from "@kreuzberg/node";
|
||||
|
||||
const config = {
|
||||
postprocessor: {
|
||||
enabled: true,
|
||||
enabledProcessors: ["deduplication", "whitespace_normalization"],
|
||||
disabledProcessors: ["mojibake_fix"],
|
||||
},
|
||||
};
|
||||
|
||||
const result = await extractFile("document.pdf", null, config);
|
||||
console.log(result.content);
|
||||
```
|
||||
10
docs/snippets/typescript/config/quality_processing_config.md
Normal file
10
docs/snippets/typescript/config/quality_processing_config.md
Normal file
@@ -0,0 +1,10 @@
|
||||
```typescript title="TypeScript"
|
||||
import { extractFile } from "@kreuzberg/node";
|
||||
|
||||
const config = {
|
||||
enableQualityProcessing: true,
|
||||
};
|
||||
|
||||
const result = await extractFile("document.pdf", null, config);
|
||||
console.log(result.content);
|
||||
```
|
||||
18
docs/snippets/typescript/config/tesseract_config.md
Normal file
18
docs/snippets/typescript/config/tesseract_config.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```typescript title="TypeScript"
|
||||
import { extractFile } from "@kreuzberg/node";
|
||||
|
||||
const config = {
|
||||
ocr: {
|
||||
backend: "tesseract",
|
||||
language: "eng+fra+deu",
|
||||
tesseractConfig: {
|
||||
psm: 6,
|
||||
tesseditCharWhitelist: "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789 .,!?",
|
||||
enableTableDetection: true,
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
const result = await extractFile("document.pdf", null, config);
|
||||
console.log(result.content);
|
||||
```
|
||||
13
docs/snippets/typescript/config/token_reduction_config.md
Normal file
13
docs/snippets/typescript/config/token_reduction_config.md
Normal file
@@ -0,0 +1,13 @@
|
||||
```typescript title="TypeScript"
|
||||
import { extractFile } from "@kreuzberg/node";
|
||||
|
||||
const config = {
|
||||
tokenReduction: {
|
||||
mode: "moderate",
|
||||
preserveImportantWords: true,
|
||||
},
|
||||
};
|
||||
|
||||
const result = await extractFile("document.pdf", null, config);
|
||||
console.log(result.content);
|
||||
```
|
||||
66
docs/snippets/typescript/docker/usage.ts
Normal file
66
docs/snippets/typescript/docker/usage.ts
Normal file
@@ -0,0 +1,66 @@
|
||||
```typescript title="usage.ts"
|
||||
import { exec } from "child_process";
|
||||
import { promisify } from "util";
|
||||
import axios from "axios";
|
||||
import { readFileSync } from "fs";
|
||||
import { basename } from "path";
|
||||
import FormData from "form-data";
|
||||
|
||||
const execAsync = promisify(exec);
|
||||
|
||||
/**
 * Small helper that runs the Kreuzberg API server in a Docker container and
 * sends documents to it over HTTP.
 *
 * The container's port 8000 is published on `apiPort` of the host.
 */
class DockerKreuzbergClient {
  private readonly containerName: string;
  private readonly containerImage: string;
  private readonly apiPort: number;

  /**
   * @param containerName - name given to the container (`docker run --name`)
   * @param containerImage - image to run
   * @param apiPort - host port mapped to the container's port 8000
   */
  constructor(
    containerName: string = "kreuzberg-api",
    containerImage: string = "kreuzberg:latest",
    apiPort: number = 8000,
  ) {
    this.containerName = containerName;
    this.containerImage = containerImage;
    this.apiPort = apiPort;
  }

  /** Start the container in detached mode; rejects if `docker run` fails. */
  async startContainer(): Promise<void> {
    console.log("Starting Kreuzberg Docker container...");
    const cmd = `docker run -d --name ${this.containerName} -p ${this.apiPort}:8000 ${this.containerImage}`;
    await execAsync(cmd);
    console.log(`Container started on http://localhost:${this.apiPort}`);
  }

  /**
   * Upload one file to the running container and return its extracted text.
   *
   * Uses the same endpoint and multipart field as the HTTP API examples
   * (`POST /extract`, field `files`), which answer with one result per file.
   *
   * @param filePath - path of the document to upload
   * @returns the `content` of the first extraction result
   * @throws Error when the response carries no result with a string `content`
   */
  async extractFile(filePath: string): Promise<string> {
    const fileContent = readFileSync(filePath);
    const form = new FormData();
    form.append("files", fileContent, basename(filePath));

    const response = await axios.post(`http://localhost:${this.apiPort}/extract`, form, {
      headers: form.getHeaders(),
    });

    // The endpoint returns an array of results; tolerate a single object too.
    const payload: unknown = response.data;
    const first: unknown = Array.isArray(payload) ? payload[0] : payload;
    if (
      typeof first !== "object" ||
      first === null ||
      typeof (first as { content?: unknown }).content !== "string"
    ) {
      throw new Error(`Unexpected response from Kreuzberg API for ${filePath}`);
    }

    return (first as { content: string }).content;
  }

  /** Stop the container and remove it so the name can be reused. */
  async stopContainer(): Promise<void> {
    console.log("Stopping Kreuzberg Docker container...");
    await execAsync(`docker stop ${this.containerName}`);
    await execAsync(`docker rm ${this.containerName}`);
    console.log("Container stopped and removed");
  }
}
|
||||
|
||||
// Demo run: start the container, extract one document, and always clean up.
// The trailing `.catch` reports any failure instead of leaving an unhandled
// promise rejection, and marks the process as failed.
void (async () => {
  const dockerClient = new DockerKreuzbergClient();

  try {
    await dockerClient.startContainer();
    // Give the API server inside the container a moment to start listening.
    await new Promise((resolve) => setTimeout(resolve, 2000));

    const content = await dockerClient.extractFile("document.pdf");
    console.log(`Extracted content:\n${content}`);
  } finally {
    await dockerClient.stopContainer();
  }
})().catch((error: unknown) => {
  console.error("Docker extraction failed:", error);
  process.exitCode = 1;
});
|
||||
```
|
||||
13
docs/snippets/typescript/getting-started/basic_usage.md
Normal file
13
docs/snippets/typescript/getting-started/basic_usage.md
Normal file
@@ -0,0 +1,13 @@
|
||||
```typescript title="TypeScript"
|
||||
import { extractFileSync } from "@kreuzberg/node";
|
||||
|
||||
const config = {
|
||||
useCache: true,
|
||||
enableQualityProcessing: true,
|
||||
};
|
||||
|
||||
const result = extractFileSync("document.pdf", null, config);
|
||||
|
||||
console.log(result.content);
|
||||
console.log(`MIME Type: ${result.mimeType}`);
|
||||
```
|
||||
@@ -0,0 +1,17 @@
|
||||
```typescript title="TypeScript"
|
||||
import { batchExtractBytesSync } from "@kreuzberg/node";
|
||||
import { readFileSync } from "fs";
|
||||
|
||||
const files = ["doc1.pdf", "doc2.docx"];
|
||||
const dataList = files.map((f) => readFileSync(f));
|
||||
const mimeTypes = [
|
||||
"application/pdf",
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
];
|
||||
|
||||
const results = batchExtractBytesSync(dataList, mimeTypes);
|
||||
|
||||
results.forEach((result, i) => {
|
||||
console.log(`Document ${i + 1}: ${result.content.length} characters`);
|
||||
});
|
||||
```
|
||||
@@ -0,0 +1,10 @@
|
||||
```typescript title="TypeScript"
|
||||
import { batchExtractFilesSync } from "@kreuzberg/node";
|
||||
|
||||
const files = ["doc1.pdf", "doc2.docx", "doc3.pptx"];
|
||||
const results = batchExtractFilesSync(files);
|
||||
|
||||
results.forEach((result, i) => {
|
||||
console.log(`File ${i + 1}: ${result.content.length} characters`);
|
||||
});
|
||||
```
|
||||
@@ -0,0 +1,12 @@
|
||||
```typescript title="TypeScript"
|
||||
// Multiple files
|
||||
const multipleFiles = new FormData();
|
||||
for (const file of fileInput.files) {
|
||||
multipleFiles.append("files", file);
|
||||
}
|
||||
|
||||
const response3 = await fetch("http://localhost:8000/extract", {
|
||||
method: "POST",
|
||||
body: multipleFiles,
|
||||
});
|
||||
```
|
||||
@@ -0,0 +1,13 @@
|
||||
```typescript title="TypeScript"
|
||||
// Using fetch API
|
||||
const formData = new FormData();
|
||||
formData.append("files", fileInput.files[0]);
|
||||
|
||||
const response = await fetch("http://localhost:8000/extract", {
|
||||
method: "POST",
|
||||
body: formData,
|
||||
});
|
||||
|
||||
const results = await response.json();
|
||||
console.log(results[0].content);
|
||||
```
|
||||
@@ -0,0 +1,17 @@
|
||||
```typescript title="TypeScript"
|
||||
// With configuration
|
||||
const formDataWithConfig = new FormData();
|
||||
formDataWithConfig.append("files", fileInput.files[0]);
|
||||
formDataWithConfig.append(
|
||||
"config",
|
||||
JSON.stringify({
|
||||
ocr: { language: "eng" },
|
||||
force_ocr: true,
|
||||
}),
|
||||
);
|
||||
|
||||
const response2 = await fetch("http://localhost:8000/extract", {
|
||||
method: "POST",
|
||||
body: formDataWithConfig,
|
||||
});
|
||||
```
|
||||
@@ -0,0 +1,36 @@
|
||||
```typescript title="TypeScript"
|
||||
import { extractFile } from "@kreuzberg/node";
|
||||
|
||||
const config = {
|
||||
enableQualityProcessing: true,
|
||||
languageDetection: {
|
||||
enabled: true,
|
||||
detectMultiple: true,
|
||||
},
|
||||
tokenReduction: {
|
||||
mode: "moderate",
|
||||
preserveImportantWords: true,
|
||||
},
|
||||
chunking: {
|
||||
maxChars: 512,
|
||||
maxOverlap: 50,
|
||||
embedding: {
|
||||
preset: "balanced",
|
||||
},
|
||||
},
|
||||
keywords: {
|
||||
algorithm: "yake",
|
||||
maxKeywords: 10,
|
||||
},
|
||||
};
|
||||
|
||||
const result = await extractFile("document.pdf", null, config);
|
||||
|
||||
console.log(`Content length: ${result.content.length}`);
|
||||
if (result.detectedLanguages) {
|
||||
console.log(`Languages: ${result.detectedLanguages.join(", ")}`);
|
||||
}
|
||||
if (result.chunks && result.chunks.length > 0) {
|
||||
console.log(`Chunks: ${result.chunks.length}`);
|
||||
}
|
||||
```
|
||||
48
docs/snippets/typescript/getting-started/complete_example.md
Normal file
48
docs/snippets/typescript/getting-started/complete_example.md
Normal file
@@ -0,0 +1,48 @@
|
||||
```typescript title="TypeScript"
|
||||
import { extractFile } from "@kreuzberg/node";
|
||||
|
||||
const config = {
|
||||
useCache: true,
|
||||
enableQualityProcessing: true,
|
||||
forceOcr: false,
|
||||
ocr: {
|
||||
backend: "tesseract",
|
||||
language: "eng+fra",
|
||||
tesseractConfig: {
|
||||
psm: 3,
|
||||
enableTableDetection: true,
|
||||
},
|
||||
},
|
||||
pdfOptions: {
|
||||
extractImages: true,
|
||||
extractMetadata: true,
|
||||
},
|
||||
images: {
|
||||
extractImages: true,
|
||||
targetDpi: 150,
|
||||
maxImageDimension: 2048,
|
||||
},
|
||||
chunking: {
|
||||
maxChars: 1000,
|
||||
maxOverlap: 200,
|
||||
embedding: {
|
||||
preset: "balanced",
|
||||
},
|
||||
},
|
||||
tokenReduction: {
|
||||
mode: "moderate",
|
||||
preserveImportantWords: true,
|
||||
},
|
||||
languageDetection: {
|
||||
enabled: true,
|
||||
minConfidence: 0.8,
|
||||
detectMultiple: false,
|
||||
},
|
||||
postprocessor: {
|
||||
enabled: true,
|
||||
},
|
||||
};
|
||||
|
||||
const result = await extractFile("document.pdf", null, config);
|
||||
console.log(`Extracted content length: ${result.content.length}`);
|
||||
```
|
||||
@@ -0,0 +1,8 @@
|
||||
```typescript title="TypeScript"
|
||||
import { extractBytes } from "@kreuzberg/node";
|
||||
import { readFile } from "fs/promises";
|
||||
|
||||
const data = await readFile("document.pdf");
|
||||
const result = await extractBytes(data, "application/pdf");
|
||||
console.log(result.content);
|
||||
```
|
||||
@@ -0,0 +1,8 @@
|
||||
```typescript title="TypeScript"
|
||||
import { extractBytesSync } from "@kreuzberg/node";
|
||||
import { readFileSync } from "fs";
|
||||
|
||||
const data = readFileSync("document.pdf");
|
||||
const result = extractBytesSync(data, "application/pdf");
|
||||
console.log(result.content);
|
||||
```
|
||||
9
docs/snippets/typescript/getting-started/extract_file.md
Normal file
9
docs/snippets/typescript/getting-started/extract_file.md
Normal file
@@ -0,0 +1,9 @@
|
||||
```typescript title="TypeScript"
|
||||
import { extractFile } from "@kreuzberg/node";
|
||||
|
||||
const result = await extractFile("document.pdf");
|
||||
|
||||
console.log(result.content);
|
||||
console.log(`Tables: ${result.tables.length}`);
|
||||
console.log(`Metadata: ${JSON.stringify(result.metadata)}`);
|
||||
```
|
||||
@@ -0,0 +1,6 @@
|
||||
```typescript title="TypeScript"
|
||||
import { extractFile } from "@kreuzberg/node";
|
||||
|
||||
const result = await extractFile("document.pdf");
|
||||
console.log(result.content);
|
||||
```
|
||||
@@ -0,0 +1,9 @@
|
||||
```typescript title="TypeScript"
|
||||
import { extractFileSync } from "@kreuzberg/node";
|
||||
|
||||
const result = extractFileSync("document.pdf");
|
||||
|
||||
console.log(result.content);
|
||||
console.log(`Tables: ${result.tables.length}`);
|
||||
console.log(`Metadata: ${JSON.stringify(result.metadata)}`);
|
||||
```
|
||||
16
docs/snippets/typescript/getting-started/extract_with_ocr.md
Normal file
16
docs/snippets/typescript/getting-started/extract_with_ocr.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```typescript title="TypeScript"
|
||||
import { extractFileSync } from "@kreuzberg/node";
|
||||
|
||||
const config = {
|
||||
forceOcr: true,
|
||||
ocr: {
|
||||
backend: "tesseract",
|
||||
language: "eng",
|
||||
},
|
||||
};
|
||||
|
||||
const result = extractFileSync("scanned.pdf", null, config);
|
||||
|
||||
console.log(result.content);
|
||||
console.log(`Detected Languages: ${result.detectedLanguages?.join(", ") ?? "none"}`);
|
||||
```
|
||||
6
docs/snippets/typescript/getting-started/hello_world.md
Normal file
6
docs/snippets/typescript/getting-started/hello_world.md
Normal file
@@ -0,0 +1,6 @@
|
||||
```typescript title="TypeScript"
|
||||
import { extractFileSync } from "@kreuzberg/node";
|
||||
|
||||
const result = extractFileSync("document.pdf");
|
||||
console.log(result.content);
|
||||
```
|
||||
@@ -0,0 +1,9 @@
|
||||
```typescript title="TypeScript"
|
||||
import { getVersion, extractFileSync } from "@kreuzberg/node";
|
||||
|
||||
const version = getVersion();
|
||||
console.log(`Kreuzberg version: ${version}`);
|
||||
|
||||
const result = extractFileSync("document.pdf");
|
||||
console.log(`Extraction successful: ${result.success}`);
|
||||
```
|
||||
13
docs/snippets/typescript/getting-started/read_content.md
Normal file
13
docs/snippets/typescript/getting-started/read_content.md
Normal file
@@ -0,0 +1,13 @@
|
||||
```typescript title="TypeScript"
|
||||
import { extractFileSync } from "@kreuzberg/node";
|
||||
|
||||
const result = extractFileSync("document.pdf");
|
||||
|
||||
console.log(`Content: ${result.content}`);
|
||||
console.log(`Success: ${result.success}`);
|
||||
console.log(`Content Length: ${result.content.length}`);
|
||||
|
||||
if (result.metadata.page_count) {
|
||||
console.log(`Pages: ${result.metadata.page_count}`);
|
||||
}
|
||||
```
|
||||
25
docs/snippets/typescript/llm/structured_extraction.md
Normal file
25
docs/snippets/typescript/llm/structured_extraction.md
Normal file
@@ -0,0 +1,25 @@
|
||||
```typescript title="TypeScript"
|
||||
import { extractFileSync } from "@kreuzberg/node";
|
||||
|
||||
const config = {
|
||||
structuredExtraction: {
|
||||
schema: {
|
||||
type: "object",
|
||||
properties: {
|
||||
title: { type: "string" },
|
||||
authors: { type: "array", items: { type: "string" } },
|
||||
date: { type: "string" },
|
||||
},
|
||||
required: ["title", "authors", "date"],
|
||||
additionalProperties: false,
|
||||
},
|
||||
llm: {
|
||||
model: "openai/gpt-4o-mini",
|
||||
},
|
||||
strict: true,
|
||||
},
|
||||
};
|
||||
|
||||
const result = extractFileSync("paper.pdf", null, config);
|
||||
console.log(result.structuredOutput);
|
||||
```
|
||||
16
docs/snippets/typescript/llm/vlm_ocr.md
Normal file
16
docs/snippets/typescript/llm/vlm_ocr.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```typescript title="TypeScript"
|
||||
import { extractFileSync } from "@kreuzberg/node";
|
||||
|
||||
const config = {
|
||||
forceOcr: true,
|
||||
ocr: {
|
||||
backend: "vlm",
|
||||
vlmConfig: {
|
||||
model: "openai/gpt-4o-mini",
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
const result = extractFileSync("scan.pdf", null, config);
|
||||
console.log(result.content);
|
||||
```
|
||||
174
docs/snippets/typescript/mcp/client.md
Normal file
174
docs/snippets/typescript/mcp/client.md
Normal file
@@ -0,0 +1,174 @@
|
||||
```typescript title="TypeScript"
|
||||
import { spawn } from "node:child_process";
|
||||
import * as readline from "node:readline";
|
||||
|
||||
/**
 * MCP Client for Kreuzberg
 * Communicates with Kreuzberg MCP server via stdio: one JSON-RPC message per
 * line is written to the child's stdin and read back from its stdout.
 * @example
 * const client = new KreuzbergMcpClient();
 * await client.connect();
 * const result = await client.callTool("extract_file", { path: "doc.pdf" });
 */
class KreuzbergMcpClient {
  private process: ReturnType<typeof spawn> | null = null;
  private rl: readline.Interface | null = null;
  private requestId = 0;
  /** Requests that were sent and are still waiting for a response, by id. */
  private readonly pendingRequests = new Map<
    number,
    {
      resolve: (value: unknown) => void;
      reject: (error: Error) => void;
    }
  >();

  /**
   * Connect to MCP server
   *
   * Spawns `kreuzberg mcp`, wires up the response reader and resolves once a
   * first `tools/list` round-trip has succeeded.
   *
   * NOTE(review): the MCP specification has clients send an `initialize`
   * request before any other call; this example goes straight to
   * `tools/list` - confirm the Kreuzberg server accepts that.
   */
  async connect(): Promise<void> {
    // Keep the child in a local: here its stdio streams are typed as non-null,
    // whereas the `process` field has to allow for null streams.
    const child = spawn("kreuzberg", ["mcp"]);
    this.process = child;

    const rl = readline.createInterface({
      input: child.stdout,
      output: child.stdin,
      terminal: false,
    });
    this.rl = rl;

    // Handle incoming responses
    rl.on("line", (line) => this.handleLine(line));

    // Handle errors
    child.stderr.on("data", (data: Buffer) => {
      console.error("MCP server error:", data.toString());
    });

    // Without these listeners a failed spawn would surface as an uncaught
    // 'error' event, and callers would wait forever on requests that can no
    // longer be answered.
    child.on("error", (error) => this.rejectAll(error));
    child.stdin.on("error", (error) => this.rejectAll(error));
    child.on("exit", () => this.rejectAll(new Error("MCP server exited")));

    // Wait for initialization
    await this.listTools();
  }

  /**
   * List available tools
   *
   * Accepts both a bare array result and the `{ tools: [...] }` envelope;
   * any other result yields an empty list.
   */
  async listTools(): Promise<Array<{ name: string; description: string }>> {
    const result = await this.sendRequest("tools/list", {});
    if (Array.isArray(result)) {
      return result as Array<{ name: string; description: string }>;
    }

    const tools = (result as { tools?: unknown } | null | undefined)?.tools;
    return Array.isArray(tools) ? (tools as Array<{ name: string; description: string }>) : [];
  }

  /**
   * Call a tool on the server
   */
  async callTool(toolName: string, args: Record<string, unknown>): Promise<unknown> {
    return this.sendRequest("tools/call", {
      name: toolName,
      arguments: args,
    });
  }

  /**
   * Extract file from path
   */
  async extractFile(path: string, async: boolean = false): Promise<Record<string, unknown>> {
    return this.callTool("extract_file", {
      path,
      async,
    }) as Promise<Record<string, unknown>>;
  }

  /**
   * Extract from bytes
   *
   * The bytes are sent base64-encoded in the `data` argument.
   */
  async extractBytes(
    data: Uint8Array,
    mimeType: string,
    async: boolean = false,
  ): Promise<Record<string, unknown>> {
    const base64 = Buffer.from(data).toString("base64");
    return this.callTool("extract_bytes", {
      data: base64,
      mimeType,
      async,
    }) as Promise<Record<string, unknown>>;
  }

  /**
   * Send request to server
   *
   * Rejects immediately when the client is not connected, and rejects when
   * the write to the server fails, so callers never wait on a request that
   * was not delivered.
   */
  private sendRequest(method: string, params: Record<string, unknown>): Promise<unknown> {
    return new Promise((resolve, reject) => {
      const stdin = this.process?.stdin;
      if (!stdin) {
        reject(new Error("MCP client is not connected"));
        return;
      }

      const id = ++this.requestId;
      this.pendingRequests.set(id, { resolve, reject });

      const request = {
        jsonrpc: "2.0",
        id,
        method,
        params,
      };

      stdin.write(JSON.stringify(request) + "\n", (error) => {
        // Only reject if the request is still pending (not already settled).
        if (error && this.pendingRequests.delete(id)) {
          reject(error);
        }
      });
    });
  }

  /** Decode one line from the server and settle the request it answers. */
  private handleLine(line: string): void {
    let message: unknown;
    try {
      message = JSON.parse(line);
    } catch (error) {
      console.error("Failed to parse response:", error);
      return;
    }

    if (typeof message !== "object" || message === null) {
      return;
    }

    const response = message as {
      id?: unknown;
      result?: unknown;
      error?: { message: string };
    };
    // Messages without a numeric id do not answer any request of ours.
    if (typeof response.id !== "number") {
      return;
    }

    const pending = this.pendingRequests.get(response.id);
    if (!pending) {
      return;
    }

    this.pendingRequests.delete(response.id);
    if (response.error) {
      pending.reject(new Error(response.error.message));
    } else {
      pending.resolve(response.result);
    }
  }

  /** Reject every request that is still waiting for a response. */
  private rejectAll(error: Error): void {
    for (const { reject } of this.pendingRequests.values()) {
      reject(error);
    }
    this.pendingRequests.clear();
  }

  /**
   * Disconnect from server
   *
   * Closes the reader, kills the child process and rejects outstanding
   * requests.
   */
  disconnect(): void {
    this.rl?.close();
    this.rl = null;
    this.process?.kill();
    this.process = null;
    this.rejectAll(new Error("MCP client disconnected"));
  }
}
|
||||
|
||||
/**
 * Usage example: connect, print the tool names, extract one document and
 * always disconnect afterwards. Failures are logged, not rethrown.
 */
async function main(): Promise<void> {
  const mcp = new KreuzbergMcpClient();

  try {
    await mcp.connect();
    console.log("Connected to Kreuzberg MCP server");

    const available = await mcp.listTools();
    const toolNames = available.map((tool) => tool.name);
    console.log("Available tools:", toolNames);

    const extraction = await mcp.extractFile("document.pdf", true);
    console.log("Extraction result:", extraction);
  } catch (error) {
    console.error("Error:", error);
  } finally {
    mcp.disconnect();
  }
}
|
||||
|
||||
// Run if executed directly
|
||||
if (require.main === module) {
|
||||
main();
|
||||
}
|
||||
|
||||
export { KreuzbergMcpClient };
|
||||
```
|
||||
35
docs/snippets/typescript/mcp/mcp_custom_client.md
Normal file
35
docs/snippets/typescript/mcp/mcp_custom_client.md
Normal file
@@ -0,0 +1,35 @@
|
||||
```typescript title="TypeScript"
|
||||
import { spawn } from "child_process";
|
||||
import * as readline from "readline";
|
||||
|
||||
const mcpProcess = spawn("kreuzberg", ["mcp"]);
|
||||
|
||||
const rl = readline.createInterface({
|
||||
input: mcpProcess.stdout,
|
||||
output: mcpProcess.stdin,
|
||||
terminal: false,
|
||||
});
|
||||
|
||||
// JSON-RPC 2.0 request. `jsonrpc` marks the protocol version and `id` makes
// this a call that expects a response; a message without an `id` is a
// notification and is never answered.
const request = {
  jsonrpc: "2.0",
  id: 1,
  method: "tools/call",
  params: {
    name: "extract_file",
    arguments: {
      path: "document.pdf",
      async: true,
    },
  },
};
|
||||
|
||||
mcpProcess.stdin.write(JSON.stringify(request) + "\n");
|
||||
|
||||
rl.on("line", (line) => {
|
||||
const response = JSON.parse(line);
|
||||
console.log(response);
|
||||
mcpProcess.kill();
|
||||
});
|
||||
|
||||
mcpProcess.on("error", (err) => {
|
||||
console.error("Failed to start MCP process:", err);
|
||||
});
|
||||
```
|
||||
17
docs/snippets/typescript/mcp/mcp_server_start.md
Normal file
17
docs/snippets/typescript/mcp/mcp_server_start.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```typescript title="TypeScript"
|
||||
import { spawn } from "child_process";
|
||||
|
||||
const mcpProcess = spawn("kreuzberg", ["mcp"]);
|
||||
|
||||
mcpProcess.stdout.on("data", (data) => {
|
||||
console.log(`MCP Server: ${data}`);
|
||||
});
|
||||
|
||||
mcpProcess.stderr.on("data", (data) => {
|
||||
console.error(`MCP Error: ${data}`);
|
||||
});
|
||||
|
||||
mcpProcess.on("error", (err) => {
|
||||
console.error(`Failed to start MCP server: ${err.message}`);
|
||||
});
|
||||
```
|
||||
193
docs/snippets/typescript/mcp/server.md
Normal file
193
docs/snippets/typescript/mcp/server.md
Normal file
@@ -0,0 +1,193 @@
|
||||
```typescript title="TypeScript"
|
||||
import {
|
||||
extractFile,
|
||||
extractFileSync,
|
||||
extractBytes,
|
||||
extractBytesSync,
|
||||
type ExtractionResult,
|
||||
type ExtractionConfig,
|
||||
} from "@kreuzberg/node";
|
||||
import * as readline from "node:readline";
|
||||
|
||||
/**
 * MCP Server for Kreuzberg
 * Exposes document extraction as MCP tools. Requests are read from stdin one
 * JSON message per line and responses are written to stdout; log output goes
 * to stderr.
 * @example
 * const server = new KreuzbergMcpServer();
 * await server.start();
 */
class KreuzbergMcpServer {
  private readonly config?: ExtractionConfig;
  private readonly rl: readline.Interface;

  /**
   * @param config - default extraction config, used when a tool call does not
   *   bring its own `config` argument
   */
  constructor(config?: ExtractionConfig) {
    this.config = config;
    this.rl = readline.createInterface({
      input: process.stdin,
      output: process.stdout,
      terminal: false,
    });
  }

  /**
   * Start MCP server
   *
   * Registers the line handler and returns; requests are then served as they
   * arrive on stdin.
   */
  async start(): Promise<void> {
    console.error("[MCP Server] Starting Kreuzberg MCP server");

    this.rl.on("line", (line) => {
      // handleLine reports its own failures as error responses.
      void this.handleLine(line);
    });
  }

  /**
   * Decode one request line, dispatch it and write the response.
   *
   * Error responses carry the id of the failing request so the caller can
   * match them; the id is `null` when the line could not be decoded far
   * enough to read one.
   */
  private async handleLine(line: string): Promise<void> {
    let id: number | null = null;
    let code = -32700; // JSON-RPC "Parse error" until the line is decoded

    try {
      const request = JSON.parse(line) as {
        id: number;
        method: string;
        params: Record<string, unknown>;
      };
      code = -32603; // JSON-RPC "Internal error" for anything after parsing
      if (typeof request.id === "number") {
        id = request.id;
      }

      const response = await this.handleRequest(request);
      process.stdout.write(JSON.stringify(response) + "\n");
    } catch (error) {
      const errorMessage = error instanceof Error ? error.message : "Unknown error";
      process.stdout.write(
        JSON.stringify({
          jsonrpc: "2.0",
          id,
          error: { code, message: errorMessage },
        }) + "\n",
      );
    }
  }

  /**
   * Handle incoming MCP request
   *
   * @returns the success response for `tools/list` or `tools/call`
   * @throws Error for any other method, or when the tool call fails
   */
  private async handleRequest(request: {
    id: number;
    method: string;
    params: Record<string, unknown>;
  }): Promise<Record<string, unknown>> {
    const { id, method, params } = request;

    if (method === "tools/list") {
      return {
        jsonrpc: "2.0",
        id,
        result: this.listTools(),
      };
    }

    if (method === "tools/call") {
      const result = await this.callTool(
        params.name as string,
        (params.arguments ?? {}) as Record<string, unknown>,
      );
      return {
        jsonrpc: "2.0",
        id,
        result,
      };
    }

    throw new Error(`Unknown method: ${method}`);
  }

  /**
   * List available tools
   */
  private listTools(): Array<{
    name: string;
    description: string;
    inputSchema: Record<string, unknown>;
  }> {
    return [
      {
        name: "extract_file",
        description: "Extract content from a file by path",
        inputSchema: {
          type: "object",
          properties: {
            path: { type: "string", description: "Path to file" },
            async: { type: "boolean", description: "Use async extraction" },
            config: {
              type: "object",
              description: "Optional extraction config",
            },
          },
          required: ["path"],
        },
      },
      {
        name: "extract_bytes",
        description: "Extract content from raw bytes",
        inputSchema: {
          type: "object",
          properties: {
            data: { type: "string", description: "Base64-encoded data" },
            mimeType: { type: "string", description: "MIME type" },
            async: { type: "boolean", description: "Use async extraction" },
            config: {
              type: "object",
              description: "Optional extraction config",
            },
          },
          required: ["data", "mimeType"],
        },
      },
    ];
  }

  /**
   * Resolve the options shared by both tools: `async` defaults to true and
   * `config` falls back to the server-wide config.
   */
  private resolveOptions(args: Record<string, unknown>): {
    useAsync: boolean;
    config: ExtractionConfig | undefined;
  } {
    return {
      useAsync: typeof args.async === "boolean" ? args.async : true,
      config: (args.config as ExtractionConfig | undefined) ?? this.config,
    };
  }

  /**
   * Call tool
   *
   * @throws Error when a required argument is missing or not a string, or
   *   when the tool name is unknown
   */
  private async callTool(name: string, args: Record<string, unknown>): Promise<unknown> {
    if (name === "extract_file") {
      const path = args.path;
      if (typeof path !== "string") {
        throw new Error("extract_file requires a string 'path' argument");
      }
      const { useAsync, config } = this.resolveOptions(args);

      return useAsync ? extractFile(path, null, config) : extractFileSync(path, null, config);
    }

    if (name === "extract_bytes") {
      const encoded = args.data;
      const mimeType = args.mimeType;
      if (typeof encoded !== "string" || typeof mimeType !== "string") {
        throw new Error("extract_bytes requires string 'data' and 'mimeType' arguments");
      }
      const data = Buffer.from(encoded, "base64");
      const { useAsync, config } = this.resolveOptions(args);

      return useAsync
        ? extractBytes(data, mimeType, config)
        : extractBytesSync(data, mimeType, config);
    }

    throw new Error(`Unknown tool: ${name}`);
  }
}
|
||||
|
||||
/**
|
||||
* Main entry point
|
||||
*/
|
||||
async function main(): Promise<void> {
|
||||
const config: ExtractionConfig = {
|
||||
ocr: {
|
||||
enabled: true,
|
||||
backend: "tesseract",
|
||||
},
|
||||
};
|
||||
|
||||
const server = new KreuzbergMcpServer(config);
|
||||
await server.start();
|
||||
}
|
||||
|
||||
// Start server when invoked as MCP
|
||||
if (process.argv[2] === "mcp") {
|
||||
main().catch((error) => {
|
||||
console.error("Server error:", error);
|
||||
process.exit(1);
|
||||
});
|
||||
}
|
||||
|
||||
export { KreuzbergMcpServer };
|
||||
```
|
||||
14
docs/snippets/typescript/metadata/language_detection.md
Normal file
14
docs/snippets/typescript/metadata/language_detection.md
Normal file
@@ -0,0 +1,14 @@
|
||||
```typescript title="TypeScript"
|
||||
import { extractFile } from "@kreuzberg/node";
|
||||
|
||||
const config = {
|
||||
languageDetection: {
|
||||
enabled: true,
|
||||
minConfidence: 0.9,
|
||||
detectMultiple: true,
|
||||
},
|
||||
};
|
||||
|
||||
const result = await extractFile("document.pdf", null, config);
|
||||
console.log(result.content);
|
||||
```
|
||||
@@ -0,0 +1,16 @@
|
||||
```typescript title="TypeScript"
|
||||
import { extractFile } from "@kreuzberg/node";
|
||||
|
||||
const config = {
|
||||
languageDetection: {
|
||||
enabled: true,
|
||||
minConfidence: 0.8,
|
||||
detectMultiple: true,
|
||||
},
|
||||
};
|
||||
|
||||
const result = await extractFile("multilingual_document.pdf", null, config);
|
||||
if (result.detectedLanguages) {
|
||||
console.log(`Detected languages: ${result.detectedLanguages.join(", ")}`);
|
||||
}
|
||||
```
|
||||
83
docs/snippets/typescript/metadata/metadata.md
Normal file
83
docs/snippets/typescript/metadata/metadata.md
Normal file
@@ -0,0 +1,83 @@
|
||||
```typescript title="TypeScript"
|
||||
import { extractFileSync } from "@kreuzberg/node";
|
||||
|
||||
const result = extractFileSync("document.pdf");
|
||||
console.log(`Metadata: ${JSON.stringify(result.metadata)}`);
|
||||
if (result.metadata.pageCount) {
|
||||
console.log(`Pages: ${result.metadata.pageCount}`);
|
||||
}
|
||||
|
||||
const htmlResult = extractFileSync("page.html");
|
||||
console.log(`HTML Metadata: ${JSON.stringify(htmlResult.metadata)}`);
|
||||
|
||||
const htmlMeta = htmlResult.metadata;
|
||||
if (htmlMeta.title) {
|
||||
console.log(`Title: ${htmlMeta.title}`);
|
||||
}
|
||||
|
||||
// Access keywords as array
|
||||
if (htmlMeta.keywords && htmlMeta.keywords.length > 0) {
|
||||
console.log(`Keywords: ${htmlMeta.keywords.join(", ")}`);
|
||||
}
|
||||
|
||||
// Access canonical URL (renamed from canonical)
|
||||
if (htmlMeta.canonicalUrl) {
|
||||
console.log(`Canonical URL: ${htmlMeta.canonicalUrl}`);
|
||||
}
|
||||
|
||||
// Access Open Graph fields from map
|
||||
if (htmlMeta.openGraph) {
|
||||
if (htmlMeta.openGraph["image"]) {
|
||||
console.log(`Open Graph Image: ${htmlMeta.openGraph["image"]}`);
|
||||
}
|
||||
if (htmlMeta.openGraph["title"]) {
|
||||
console.log(`Open Graph Title: ${htmlMeta.openGraph["title"]}`);
|
||||
}
|
||||
if (htmlMeta.openGraph["type"]) {
|
||||
console.log(`Open Graph Type: ${htmlMeta.openGraph["type"]}`);
|
||||
}
|
||||
}
|
||||
|
||||
// Access Twitter Card fields from map
|
||||
if (htmlMeta.twitterCard) {
|
||||
if (htmlMeta.twitterCard["card"]) {
|
||||
console.log(`Twitter Card Type: ${htmlMeta.twitterCard["card"]}`);
|
||||
}
|
||||
if (htmlMeta.twitterCard["creator"]) {
|
||||
console.log(`Twitter Creator: ${htmlMeta.twitterCard["creator"]}`);
|
||||
}
|
||||
}
|
||||
|
||||
// Access new fields
|
||||
if (htmlMeta.language) {
|
||||
console.log(`Language: ${htmlMeta.language}`);
|
||||
}
|
||||
|
||||
if (htmlMeta.textDirection) {
|
||||
console.log(`Text Direction: ${htmlMeta.textDirection}`);
|
||||
}
|
||||
|
||||
// Access headers
|
||||
if (htmlMeta.headers && htmlMeta.headers.length > 0) {
|
||||
console.log(`Headers: ${htmlMeta.headers.map((h) => h.text).join(", ")}`);
|
||||
}
|
||||
|
||||
// Access links
|
||||
if (htmlMeta.links && htmlMeta.links.length > 0) {
|
||||
htmlMeta.links.forEach((link) => {
|
||||
console.log(`Link: ${link.href} (${link.text})`);
|
||||
});
|
||||
}
|
||||
|
||||
// Access images
|
||||
if (htmlMeta.images && htmlMeta.images.length > 0) {
|
||||
htmlMeta.images.forEach((image) => {
|
||||
console.log(`Image: ${image.src}`);
|
||||
});
|
||||
}
|
||||
|
||||
// Access structured data
|
||||
if (htmlMeta.structuredData && htmlMeta.structuredData.length > 0) {
|
||||
console.log(`Structured data items: ${htmlMeta.structuredData.length}`);
|
||||
}
|
||||
```
|
||||
18
docs/snippets/typescript/metadata/page_boundaries.md
Normal file
18
docs/snippets/typescript/metadata/page_boundaries.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```typescript title="TypeScript"
import { extractFileSync } from '@kreuzberg/node';

const result = extractFileSync('document.pdf');

if (result.metadata.page_structure?.boundaries) {
  // The boundary offsets are applied to the UTF-8 encoded content, not to the
  // string itself, so encode once and slice the bytes.
  const encoder = new TextEncoder();
  const decoder = new TextDecoder();
  const contentBytes = encoder.encode(result.content);

  // Show the first three pages only.
  for (const boundary of result.metadata.page_structure.boundaries.slice(0, 3)) {
    const pageBytes = contentBytes.slice(boundary.byteStart, boundary.byteEnd);
    const pageText = decoder.decode(pageBytes);

    console.log(`Page ${boundary.pageNumber}:`);
    console.log(` Byte range: ${boundary.byteStart}-${boundary.byteEnd}`);
    console.log(` Preview: ${pageText.substring(0, 100)}...`);
  }
}
```
|
||||
12
docs/snippets/typescript/metadata/page_tracking_basic.md
Normal file
12
docs/snippets/typescript/metadata/page_tracking_basic.md
Normal file
@@ -0,0 +1,12 @@
|
||||
```typescript title="TypeScript"
import { extractFileSync } from '@kreuzberg/node';

// Ask for per-page results in addition to the document-level content.
const result = extractFileSync('document.pdf', null, { pages: { extractPages: true } });

if (result.pages) {
  for (const page of result.pages) {
    console.log(`Page ${page.pageNumber}:`);
    console.log(` Content: ${page.content.length} chars`);
    console.log(` Tables: ${page.tables.length}`);
    console.log(` Images: ${page.images.length}`);
  }
}
```
|
||||
22
docs/snippets/typescript/metadata/tables.md
Normal file
22
docs/snippets/typescript/metadata/tables.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```typescript title="TypeScript"
|
||||
import { extractFileSync } from "@kreuzberg/node";
|
||||
|
||||
const result = extractFileSync("document.pdf");
|
||||
|
||||
if (result.tables) {
|
||||
for (const table of result.tables) {
|
||||
const rowCount = table.cells?.length ?? 0;
|
||||
console.log(`Table with ${rowCount} rows`);
|
||||
|
||||
if (table.markdown) {
|
||||
console.log(table.markdown);
|
||||
}
|
||||
|
||||
if (table.cells) {
|
||||
for (const row of table.cells) {
|
||||
console.log(row);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
@@ -0,0 +1,49 @@
|
||||
```typescript title="TypeScript"
|
||||
import { extractFile } from "@kreuzberg/node";
|
||||
|
||||
interface VectorRecord {
|
||||
id: string;
|
||||
content: string;
|
||||
embedding: number[];
|
||||
metadata: Record<string, string>;
|
||||
}
|
||||
|
||||
/**
 * Extracts a document into chunks and turns every chunk that carries an
 * embedding into a `VectorRecord`.
 *
 * @param documentPath - Path of the file handed to `extractFile`.
 * @param documentId - Identifier used as the prefix of each record id and
 *   stored in each record's metadata.
 * @returns One record per chunk that has an embedding. Chunks without an
 *   embedding are skipped, so record ids may have gaps in their index.
 */
async function extractAndVectorize(
  documentPath: string,
  documentId: string,
): Promise<VectorRecord[]> {
  const config = {
    chunking: {
      // Config keys are camelCase (like `batchSize` below); snake_case keys
      // such as `max_chars` would not match the chunking options.
      maxChars: 512,
      maxOverlap: 50,
      embedding: {
        model: { type: "preset", name: "balanced" },
        normalize: true,
        batchSize: 32,
      },
    },
  };

  const result = await extractFile(documentPath, null, config);

  const records: VectorRecord[] = [];
  if (result.chunks) {
    result.chunks.forEach((chunk, index) => {
      // Only chunks that actually received an embedding can be stored.
      if (chunk.embedding) {
        records.push({
          // `index` is the chunk's position in `result.chunks`.
          id: `${documentId}_chunk_${index}`,
          content: chunk.content,
          embedding: chunk.embedding,
          // Metadata values are stringified to satisfy Record<string, string>.
          metadata: {
            document_id: documentId,
            chunk_index: index.toString(),
            content_length: chunk.content.length.toString(),
          },
        });
      }
    });
  }

  return records;
}
|
||||
```
|
||||
29
docs/snippets/typescript/ocr/cloud_ocr_backend.md
Normal file
29
docs/snippets/typescript/ocr/cloud_ocr_backend.md
Normal file
@@ -0,0 +1,29 @@
|
||||
```typescript title="TypeScript"
|
||||
import { registerOcrBackend, extractFileSync } from "@kreuzberg/node";
|
||||
|
||||
const supportedLangs = ["eng", "deu", "fra"];
|
||||
|
||||
const cloudBackend = {
|
||||
name: () => "cloud-ocr",
|
||||
version: () => "1.0.0",
|
||||
initialize: () => {},
|
||||
shutdown: () => {},
|
||||
process_image: async (imageBytes: Uint8Array, config: { language?: string }) => {
|
||||
// Call your cloud OCR API with imageBytes and config.language.
|
||||
return { content: "Extracted text", mime_type: "text/plain" };
|
||||
},
|
||||
supports_language: (lang: string) => supportedLangs.includes(lang),
|
||||
backend_type: () => "Custom",
|
||||
supported_languages: () => supportedLangs,
|
||||
};
|
||||
|
||||
registerOcrBackend(cloudBackend);
|
||||
|
||||
const result = extractFileSync("scanned.pdf", {
|
||||
ocr: {
|
||||
backend: "cloud-ocr",
|
||||
language: "eng",
|
||||
},
|
||||
});
|
||||
console.log(result.content);
|
||||
```
|
||||
16
docs/snippets/typescript/ocr/image_extraction.md
Normal file
16
docs/snippets/typescript/ocr/image_extraction.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```typescript title="TypeScript"
|
||||
import { extractFileSync } from "@kreuzberg/node";
|
||||
|
||||
const config = {
|
||||
images: {
|
||||
extractImages: true,
|
||||
targetDpi: 200,
|
||||
maxImageDimension: 2048,
|
||||
injectPlaceholders: true, // set to false to extract images without markdown references
|
||||
autoAdjustDpi: true,
|
||||
},
|
||||
};
|
||||
|
||||
const result = extractFileSync("document.pdf", config);
|
||||
console.log(`content length: ${result.content.length}`);
|
||||
```
|
||||
22
docs/snippets/typescript/ocr/image_preprocessing.md
Normal file
22
docs/snippets/typescript/ocr/image_preprocessing.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```typescript title="TypeScript"
|
||||
import { extractFileSync } from "@kreuzberg/node";
|
||||
|
||||
const config = {
|
||||
ocr: {
|
||||
backend: "tesseract",
|
||||
language: "eng",
|
||||
tesseractConfig: {
|
||||
preprocessing: {
|
||||
targetDpi: 300,
|
||||
denoise: true,
|
||||
deskew: true,
|
||||
contrastEnhance: true,
|
||||
binarizationMethod: "otsu",
|
||||
},
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
const result = extractFileSync("document.pdf", config);
|
||||
console.log(`content length: ${result.content.length}`);
|
||||
```
|
||||
15
docs/snippets/typescript/ocr/ocr_dpi_config.md
Normal file
15
docs/snippets/typescript/ocr/ocr_dpi_config.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```typescript title="TypeScript"
|
||||
import { extractFileSync } from "@kreuzberg/node";
|
||||
|
||||
const config = {
|
||||
ocr: {
|
||||
backend: "tesseract",
|
||||
},
|
||||
pdfOptions: {
|
||||
extractImages: true,
|
||||
},
|
||||
};
|
||||
|
||||
const result = extractFileSync("scanned.pdf", null, config);
|
||||
console.log(result.content);
|
||||
```
|
||||
1
docs/snippets/typescript/ocr/ocr_easyocr.md
Normal file
1
docs/snippets/typescript/ocr/ocr_easyocr.md
Normal file
@@ -0,0 +1 @@
|
||||
EasyOCR is only available in Python.
|
||||
24
docs/snippets/typescript/ocr/ocr_elements.md
Normal file
24
docs/snippets/typescript/ocr/ocr_elements.md
Normal file
@@ -0,0 +1,24 @@
|
||||
```typescript title="TypeScript"
|
||||
import { extractFileSync } from "@kreuzberg/node";
|
||||
|
||||
const config = {
|
||||
ocr: {
|
||||
backend: "paddle-ocr",
|
||||
language: "en",
|
||||
},
|
||||
};
|
||||
|
||||
const result = extractFileSync("scanned.pdf", null, config);
|
||||
|
||||
if (result.ocrElements) {
|
||||
for (const element of result.ocrElements) {
|
||||
console.log(`Text: ${element.text}`);
|
||||
console.log(`Confidence: ${element.confidence.recognition.toFixed(2)}`);
|
||||
console.log(`Geometry:`, element.geometry);
|
||||
if (element.rotation) {
|
||||
console.log(`Rotation: ${element.rotation.angle}°`);
|
||||
}
|
||||
console.log();
|
||||
}
|
||||
}
|
||||
```
|
||||
13
docs/snippets/typescript/ocr/ocr_extraction.md
Normal file
13
docs/snippets/typescript/ocr/ocr_extraction.md
Normal file
@@ -0,0 +1,13 @@
|
||||
```typescript title="TypeScript"
|
||||
import { extractFileSync } from "@kreuzberg/node";
|
||||
|
||||
const config = {
|
||||
ocr: {
|
||||
backend: "tesseract",
|
||||
language: "eng",
|
||||
},
|
||||
};
|
||||
|
||||
const result = extractFileSync("scanned.pdf", null, config);
|
||||
console.log(result.content);
|
||||
```
|
||||
13
docs/snippets/typescript/ocr/ocr_force_all_pages.md
Normal file
13
docs/snippets/typescript/ocr/ocr_force_all_pages.md
Normal file
@@ -0,0 +1,13 @@
|
||||
```typescript title="TypeScript"
|
||||
import { extractFileSync } from "@kreuzberg/node";
|
||||
|
||||
const config = {
|
||||
ocr: {
|
||||
backend: "tesseract",
|
||||
},
|
||||
forceOcr: true,
|
||||
};
|
||||
|
||||
const result = extractFileSync("document.pdf", null, config);
|
||||
console.log(result.content);
|
||||
```
|
||||
13
docs/snippets/typescript/ocr/ocr_multi_language.md
Normal file
13
docs/snippets/typescript/ocr/ocr_multi_language.md
Normal file
@@ -0,0 +1,13 @@
|
||||
```typescript title="TypeScript"
|
||||
import { extractFileSync } from "@kreuzberg/node";
|
||||
|
||||
const config = {
|
||||
ocr: {
|
||||
backend: "tesseract",
|
||||
language: "eng+deu+fra",
|
||||
},
|
||||
};
|
||||
|
||||
const result = extractFileSync("multilingual.pdf", null, config);
|
||||
console.log(result.content);
|
||||
```
|
||||
14
docs/snippets/typescript/ocr/ocr_paddleocr.md
Normal file
14
docs/snippets/typescript/ocr/ocr_paddleocr.md
Normal file
@@ -0,0 +1,14 @@
|
||||
```typescript title="TypeScript"
|
||||
import { extractFileSync } from "@kreuzberg/node";
|
||||
|
||||
const config = {
|
||||
ocr: {
|
||||
backend: "paddle-ocr",
|
||||
language: "en",
|
||||
// modelTier: 'server', // for max accuracy
|
||||
},
|
||||
};
|
||||
|
||||
const result = extractFileSync("scanned.pdf", null, config);
|
||||
console.log(result.content);
|
||||
```
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user