Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/docs/snippets/typescript/advanced/chunk_page_mapping.md
+++ b/docs/snippets/typescript/advanced/chunk_page_mapping.md
@@ -0,0 +1,16 @@
+```typescript title="TypeScript"
+import { extractFileSync } from '@kreuzberg/node';
+
+const result = extractFileSync('document.pdf', null, { chunking: { maxChars: 500, maxOverlap: 50 }, pages: { extractPages: true } });
+
+// Report which page(s) each chunk came from; chunks without a first page are skipped.
+for (const chunk of result.chunks ?? []) {
+  const { firstPage, lastPage } = chunk.metadata;
+  if (firstPage) {
+    const pageRange = firstPage === lastPage
+      ? `Page ${firstPage}`
+      : `Pages ${firstPage}-${lastPage}`;
+    console.log(`Chunk: ${chunk.content.substring(0, 50)}... (${pageRange})`);
+  }
+}
+```
--- a/docs/snippets/typescript/advanced/chunking_config.md
+++ b/docs/snippets/typescript/advanced/chunking_config.md
@@ -0,0 +1,47 @@
+```typescript title="TypeScript"
+import { extractFile } from "@kreuzberg/node";
+
+// Chunking limits: maxChars 1000, maxOverlap 200
+const chunking = { maxChars: 1000, maxOverlap: 200 };
+const config = { chunking };
+
+const { chunks } = await extractFile("document.pdf", null, config);
+
+// `chunks` may be absent, so fall back to a count of zero
+const totalChunks = chunks?.length ?? 0;
+console.log(`Total chunks: ${totalChunks}`);
+```
+
+```typescript title="TypeScript - Semantic"
+import { extractFile } from "@kreuzberg/node";
+
+// Select the semantic chunker; no other chunking option is set
+const config = { chunking: { chunkerType: "semantic" } };
+
+const extraction = await extractFile("document.pdf", null, config);
+const chunks = extraction.chunks ?? [];
+
+// Print the first 100 characters of every chunk
+chunks.forEach((chunk) => {
+  console.log(`Content: ${chunk.content.slice(0, 100)}...`);
+});
+```
+
+```typescript title="TypeScript - Prepend Heading Context"
+import { extractFile } from "@kreuzberg/node";
+
+// Markdown chunker with prependHeadingContext switched on
+const chunking = {
+  chunkerType: "markdown",
+  maxChars: 500,
+  maxOverlap: 50,
+  prependHeadingContext: true,
+};
+const config = { chunking };
+
+const extraction = await extractFile("document.md", null, config);
+for (const { content } of extraction.chunks ?? []) {
+  // Each chunk's content is prefixed with its heading breadcrumb
+  console.log(`Content: ${content.slice(0, 100)}...`);
+}
+```
--- a/docs/snippets/typescript/advanced/chunking_rag.md
+++ b/docs/snippets/typescript/advanced/chunking_rag.md
@@ -0,0 +1,25 @@
+```typescript title="TypeScript"
+import { extractFile } from "@kreuzberg/node";
+
+// Embedding settings: the "balanced" preset model with normalize enabled
+const embedding = {
+  model: { type: "preset", name: "balanced" },
+  normalize: true,
+};
+const config = {
+  chunking: { maxChars: 500, maxOverlap: 50, embedding },
+};
+
+const extraction = await extractFile("research_paper.pdf", null, config);
+
+// Log each chunk's 1-based position, byte range, a preview, and embedding size
+for (const { metadata, content, embedding: vector } of extraction.chunks ?? []) {
+  const { chunkIndex, totalChunks, byteStart, byteEnd } = metadata;
+  console.log(`Chunk ${chunkIndex + 1}/${totalChunks}`);
+  console.log(`Position: ${byteStart}-${byteEnd}`);
+  console.log(`Content: ${content.slice(0, 100)}...`);
+  if (vector) {
+    console.log(`Embedding: ${vector.length} dimensions`);
+  }
+}
+```
--- a/docs/snippets/typescript/advanced/custom_cache.md
+++ b/docs/snippets/typescript/advanced/custom_cache.md
@@ -0,0 +1,18 @@
+```typescript title="TypeScript"
+import { extractFile, type ExtractionConfig, type ExtractionResult } from "@kreuzberg/node";
+
+/**
+ * Custom cache backends cannot be plugged in from TypeScript (v4.0);
+ * the Rust core handles caching internally.
+ *
+ * The only cache-related setting shown here is the `useCache` flag
+ * on the extraction config.
+ */
+
+// Enable the built-in (Rust-side) cache
+const useCache = true;
+const config: ExtractionConfig = { useCache };
+
+// Extract with the cache-enabled config
+const result = await extractFile("document.pdf", null, config);
+```
--- a/docs/snippets/typescript/advanced/custom_extractor.md
+++ b/docs/snippets/typescript/advanced/custom_extractor.md
@@ -0,0 +1,19 @@
+```typescript title="TypeScript"
+import { listDocumentExtractors, unregisterDocumentExtractor } from "@kreuzberg/node";
+
+/**
+ * Document extractors are implemented in the Rust core; TypeScript v4.0 has
+ * no direct support for registering custom ones.
+ *
+ * From TypeScript you can only list and unregister the built-in extractors.
+ * Write custom extractors in Rust instead.
+ */
+
+// Print every registered document extractor
+const registeredExtractors = listDocumentExtractors();
+console.log("Available extractors:", registeredExtractors);
+// Example output: ['PDFExtractor', 'ImageExtractor', 'OfficeExtractor', ...]
+
+// Remove a built-in extractor by name (use with caution)
+unregisterDocumentExtractor("SomeExtractor");
+```
--- a/docs/snippets/typescript/advanced/custom_ocr_backend.md
+++ b/docs/snippets/typescript/advanced/custom_ocr_backend.md
@@ -0,0 +1,78 @@
+```typescript title="TypeScript"
+import { registerOcrBackend, type OcrBackendProtocol } from "@kreuzberg/node";
+
+/**
+ * Custom OCR backend implementation
+ * Forwards images to an HTTP OCR service at `${apiUrl}/ocr`
+ * @example
+ * const backend = new CustomOcrBackend("http://localhost:8000");
+ * await backend.initialize();
+ * registerOcrBackend(backend);
+ */
+class CustomOcrBackend implements OcrBackendProtocol {
+  /** @param apiUrl Base URL of the OCR service; `/ocr` is appended per request. */
+  constructor(private readonly apiUrl: string) {}
+
+  name(): string {
+    return "custom-ocr-backend";
+  }
+
+  supportedLanguages(): string[] {
+    return ["en", "de", "fr", "es"];
+  }
+
+  async initialize(): Promise<void> {
+    console.log(`Initializing custom OCR backend at ${this.apiUrl}`);
+  }
+
+  async shutdown(): Promise<void> {
+    console.log("Shutting down custom OCR backend");
+  }
+
+  /**
+   * POST the image to the OCR service and map its JSON reply to a result.
+   * String input is decoded as base64.
+   * @throws Error when the service responds with a non-2xx status
+   */
+  async processImage(
+    imageData: Uint8Array | string,
+    language: string,
+  ): Promise<{
+    content: string;
+    mime_type: string;
+    metadata: Record<string, unknown>;
+    tables: unknown[];
+  }> {
+    const buffer =
+      typeof imageData === "string" ? Buffer.from(imageData, "base64") : Buffer.from(imageData);
+
+    const formData = new FormData();
+    const blob = new Blob([buffer], { type: "image/png" });
+    formData.append("image", blob);
+    formData.append("language", language);
+
+    const response = await fetch(`${this.apiUrl}/ocr`, {
+      method: "POST",
+      body: formData,
+    });
+
+    if (!response.ok) {
+      throw new Error(`OCR service failed: ${response.status} ${response.statusText}`);
+    }
+
+    // The reply shape is defined by the OCR service, so treat every field as optional
+    const result = (await response.json()) as { text?: string; confidence?: number; tables?: unknown[] };
+    return {
+      content: result.text ?? "",
+      mime_type: "text/plain",
+      metadata: { confidence: result.confidence, language },
+      tables: result.tables ?? [],
+    };
+  }
+}
+
+// Create, initialize, then register the custom OCR backend
+const ocrBackend = new CustomOcrBackend("http://localhost:8000");
+await ocrBackend.initialize();
+registerOcrBackend(ocrBackend);
+```
--- a/docs/snippets/typescript/advanced/custom_postprocessor.md
+++ b/docs/snippets/typescript/advanced/custom_postprocessor.md
@@ -0,0 +1,63 @@
+```typescript title="TypeScript"
+import {
+  registerPostProcessor,
+  unregisterPostProcessor,
+  type PostProcessorProtocol,
+  type ExtractionResult,
+} from "@kreuzberg/node";
+
+/**
+ * Custom post-processor for cleaning extraction results
+ * Removes common artifacts and normalizes whitespace
+ * @example
+ * const processor = new CleaningPostProcessor();
+ * registerPostProcessor(processor);
+ */
+class CleaningPostProcessor implements PostProcessorProtocol {
+  name(): string {
+    return "cleaning-postprocessor";
+  }
+
+  processingStage(): "early" | "middle" | "late" {
+    return "middle";
+  }
+
+  /**
+   * Return a copy of the result with its content cleaned; other fields are kept as-is
+   */
+  process(result: ExtractionResult): ExtractionResult {
+    return {
+      ...result,
+      content: this.cleanContent(result.content),
+    };
+  }
+
+  /**
+   * Remove artifacts and normalize whitespace while keeping line breaks
+   */
+  private cleanContent(content: string): string {
+    // Collapse horizontal whitespace only; \s would also eat the newlines split on below
+    let cleaned = content.replace(/[^\S\r\n]+/g, " ");
+
+    // Replace common OCR artifacts
+    cleaned = cleaned.replace(/\|/g, "l");
+    cleaned = cleaned.replace(/0O/g, "00");
+
+    // Trim each line and drop the ones that end up empty
+    cleaned = cleaned
+      .split(/\r?\n/)
+      .map((line) => line.trim())
+      .filter((line) => line.length > 0)
+      .join("\n");
+
+    return cleaned.trim();
+  }
+}
+
+// Register an instance of the post-processor
+const cleaningProcessor = new CleaningPostProcessor();
+registerPostProcessor(cleaningProcessor);
+
+// To remove it again later, unregister it by name:
+// unregisterPostProcessor("cleaning-postprocessor");
+```
--- a/docs/snippets/typescript/advanced/custom_validator.md
+++ b/docs/snippets/typescript/advanced/custom_validator.md
@@ -0,0 +1,86 @@
+```typescript title="TypeScript"
+import {
+  registerValidator,
+  unregisterValidator,
+  extractFile,
+  type ValidatorProtocol,
+  type ExtractionResult,
+  ValidationError,
+} from "@kreuzberg/node";
+
+/**
+ * Custom validator for quality checking
+ * Ensures extracted content meets minimum quality standards
+ * @example
+ * const validator = new QualityValidator();
+ * registerValidator(validator);
+ */
+class QualityValidator implements ValidatorProtocol {
+  name(): string {
+    return "quality-validator";
+  }
+
+  priority(): number {
+    return 10;
+  }
+
+  /**
+   * Run every quality check; the first failing check throws a ValidationError
+   */
+  validate(result: ExtractionResult): void {
+    // Check emptiness first so blank content is reported as empty, not as "too short"
+    this.checkEmptyContent(result);
+    this.checkMinimumLength(result);
+    this.checkMetadata(result);
+  }
+
+  /**
+   * Ensure minimum content length
+   */
+  private checkMinimumLength(result: ExtractionResult): void {
+    const minLength = 50;
+    if (result.content.length < minLength) {
+      throw new ValidationError(
+        `Content too short: ${result.content.length} characters (minimum ${minLength})`,
+      );
+    }
+  }
+
+  /**
+   * Ensure content is not empty or whitespace-only
+   */
+  private checkEmptyContent(result: ExtractionResult): void {
+    if (result.content.trim().length === 0) {
+      throw new ValidationError("Extracted content is empty");
+    }
+  }
+
+  /**
+   * Validate metadata is present and has at least one key
+   */
+  private checkMetadata(result: ExtractionResult): void {
+    if (!result.metadata || Object.keys(result.metadata).length === 0) {
+      throw new ValidationError("Missing extraction metadata");
+    }
+  }
+}
+
+// Register an instance of the validator
+const qualityValidator = new QualityValidator();
+registerValidator(qualityValidator);
+
+// Custom validators require the async extraction API, so await extractFile here
+try {
+  const { content } = await extractFile("document.pdf");
+  console.log(`Validated content length: ${content.length} characters`);
+} catch (error) {
+  // Anything other than a validation failure is re-thrown untouched
+  if (!(error instanceof ValidationError)) {
+    throw error;
+  }
+  console.error(`Validation failed: ${error.message}`);
+}
+
+// To remove the validator again later, unregister it by name:
+// unregisterValidator("quality-validator");
+```
--- a/docs/snippets/typescript/advanced/embedding_with_chunking.md
+++ b/docs/snippets/typescript/advanced/embedding_with_chunking.md
@@ -0,0 +1,14 @@
+```typescript title="TypeScript"
+// Embedding options, kept separate from the chunk sizing below
+const embedding = {
+  model: { type: "preset", name: "balanced" },
+  normalize: true,
+  batchSize: 32,
+  showDownloadProgress: false,
+};
+
+// Chunk sizing plus the embedding options above
+const config = {
+  chunking: { maxChars: 1024, maxOverlap: 100, embedding },
+};
+```
--- a/docs/snippets/typescript/advanced/keyword_extraction_config.md
+++ b/docs/snippets/typescript/advanced/keyword_extraction_config.md
@@ -0,0 +1,11 @@
+```typescript title="TypeScript"
+// Keyword extraction options using the "yake" algorithm
+const keywords = {
+  algorithm: "yake",
+  maxKeywords: 10,
+  minScore: 0.3,
+  ngramRange: [1, 3],
+  language: "en",
+};
+const config = { keywords };
+```
--- a/docs/snippets/typescript/advanced/keyword_extraction_example.md
+++ b/docs/snippets/typescript/advanced/keyword_extraction_example.md
@@ -0,0 +1,17 @@
+```typescript title="TypeScript"
+import { extractFile } from "@kreuzberg/node";
+
+// Keyword extraction options using the "yake" algorithm
+const config = {
+  keywords: { algorithm: "yake", maxKeywords: 10, minScore: 0.3 },
+};
+
+const extraction = await extractFile("research_paper.pdf", null, config);
+
+const extractedKeywords = extraction.extractedKeywords ?? [];
+
+// Print every extracted keyword with its score to three decimals
+extractedKeywords.forEach(({ text, score }) => {
+  console.log(`${text}: ${score.toFixed(3)}`);
+});
+```
--- a/docs/snippets/typescript/advanced/language_detection_config.md
+++ b/docs/snippets/typescript/advanced/language_detection_config.md
@@ -0,0 +1,9 @@
+```typescript title="TypeScript"
+// Language detection enabled, with detectMultiple switched off
+const languageDetection = {
+  enabled: true,
+  minConfidence: 0.8,
+  detectMultiple: false,
+};
+const config = { languageDetection };
+```
--- a/docs/snippets/typescript/advanced/language_detection_multilingual.md
+++ b/docs/snippets/typescript/advanced/language_detection_multilingual.md
@@ -0,0 +1,15 @@
+```typescript title="TypeScript"
+import { extractFile } from "@kreuzberg/node";
+
+// Language detection enabled, with detectMultiple switched on
+const languageDetection = {
+  enabled: true,
+  minConfidence: 0.8,
+  detectMultiple: true,
+};
+const config = { languageDetection };
+
+const { detectedLanguages } = await extractFile("multilingual_document.pdf", null, config);
+
+console.log(`Detected languages: ${detectedLanguages?.join(", ") ?? ""}`);
+```
--- a/docs/snippets/typescript/advanced/plugin_registry.md
+++ b/docs/snippets/typescript/advanced/plugin_registry.md
@@ -0,0 +1,128 @@
+```typescript title="TypeScript"
+import {
+  registerPostProcessor,
+  registerValidator,
+  registerOcrBackend,
+  listPostProcessors,
+  listValidators,
+  listOcrBackends,
+  unregisterPostProcessor,
+  unregisterValidator,
+  clearPostProcessors,
+  clearValidators,
+  clearOcrBackends,
+  type PostProcessorProtocol,
+  type ValidatorProtocol,
+  type OcrBackendProtocol,
+} from "@kreuzberg/node";
+
+/**
+ * Plugin registry and lifecycle management
+ * Collects plugins locally, then registers, lists, unregisters, or clears them
+ * @example
+ * const registry = new PluginRegistry();
+ * registry.registerAll();
+ * registry.listAll();
+ */
+class PluginRegistry {
+  private postProcessors: PostProcessorProtocol[] = [];
+  private validators: ValidatorProtocol[] = [];
+  private ocrBackends: OcrBackendProtocol[] = [];
+
+  /**
+   * Register every plugin that was added to this registry, logging each one
+   */
+  registerAll(): void {
+    console.log("Registering all plugins...");
+
+    // Post-processors first
+    for (const processor of this.postProcessors) {
+      registerPostProcessor(processor);
+      console.log(`Registered post-processor: ${processor.name()}`);
+    }
+
+    // Then validators
+    for (const validator of this.validators) {
+      registerValidator(validator);
+      console.log(`Registered validator: ${validator.name()}`);
+    }
+
+    // OCR backends last
+    for (const backend of this.ocrBackends) {
+      registerOcrBackend(backend);
+      console.log(`Registered OCR backend: ${backend.name()}`);
+    }
+  }
+
+  /**
+   * Print what the three list functions report, grouped by plugin kind
+   */
+  listAll(): void {
+    // Query all three registries before printing anything
+    const processorNames = listPostProcessors().join(", ");
+    const validatorNames = listValidators().join(", ");
+    const backendNames = listOcrBackends().join(", ");
+    console.log("Registered plugins:");
+    console.log(`  Post-processors: ${processorNames}`);
+    console.log(`  Validators: ${validatorNames}`);
+    console.log(`  OCR backends: ${backendNames}`);
+  }
+
+  /**
+   * Unregister one plugin by name; `type` selects post-processor or validator
+   */
+  unregisterPlugin(name: string, type: "processor" | "validator"): void {
+    if (type === "processor") {
+      unregisterPostProcessor(name);
+      console.log(`Unregistered post-processor: ${name}`);
+      return;
+    }
+    if (type === "validator") {
+      unregisterValidator(name);
+      console.log(`Unregistered validator: ${name}`);
+    }
+  }
+
+  /**
+   * Clear post-processors, validators, and OCR backends, in that order
+   */
+  clearAll(): void {
+    [clearPostProcessors, clearValidators, clearOcrBackends].forEach((clear) => clear());
+    console.log("Cleared all plugins");
+  }
+
+  /** Add a post-processor to the list that registerAll() registers */
+  addPostProcessor(processor: PostProcessorProtocol): void {
+    this.postProcessors.push(processor);
+  }
+
+  /** Add a validator to the list that registerAll() registers */
+  addValidator(validator: ValidatorProtocol): void {
+    this.validators.push(validator);
+  }
+
+  /** Add an OCR backend to the list that registerAll() registers */
+  addOcrBackend(backend: OcrBackendProtocol): void {
+    this.ocrBackends.push(backend);
+  }
+}
+
+// Usage: start with an empty registry
+const pluginRegistry = new PluginRegistry();
+
+// Add your plugins before registering, for example:
+// pluginRegistry.addPostProcessor(customProcessor);
+// pluginRegistry.addValidator(customValidator);
+
+// Register everything that was added
+pluginRegistry.registerAll();
+
+// Print the registered plugins
+pluginRegistry.listAll();
+
+// Remove a single plugin by name and kind
+// pluginRegistry.unregisterPlugin("custom-processor", "processor");
+
+// Remove every plugin
+// pluginRegistry.clearAll();
+```
--- a/docs/snippets/typescript/advanced/quality_processing_config.md
+++ b/docs/snippets/typescript/advanced/quality_processing_config.md
@@ -0,0 +1,5 @@
+```typescript title="TypeScript"
+// Turn on quality processing for extraction
+const enableQualityProcessing = true;
+const config = { enableQualityProcessing };
+```
--- a/docs/snippets/typescript/advanced/quality_processing_example.md
+++ b/docs/snippets/typescript/advanced/quality_processing_example.md
@@ -0,0 +1,17 @@
+```typescript title="TypeScript"
+import { extractFile } from "@kreuzberg/node";
+
+// Turn on quality processing for this extraction
+const config = { enableQualityProcessing: true };
+
+const { qualityScore } = await extractFile("scanned_document.pdf", null, config);
+
+if (qualityScore != null) {
+  const formattedScore = qualityScore.toFixed(2);
+  if (qualityScore < 0.5) {
+    console.warn(`Warning: Low quality extraction (${formattedScore})`);
+  } else {
+    console.log(`Quality score: ${formattedScore}`);
+  }
+}
+```
--- a/docs/snippets/typescript/advanced/token_reduction_config.md
+++ b/docs/snippets/typescript/advanced/token_reduction_config.md
@@ -0,0 +1,10 @@
+```typescript title="TypeScript"
+// Token reduction options: "Moderate" level, preserving markdown and code
+const tokenReduction = {
+  level: "Moderate",
+  preserveMarkdown: true,
+  preserveCode: true,
+  languageHint: "eng",
+};
+const config = { tokenReduction };
+```
--- a/docs/snippets/typescript/advanced/token_reduction_example.md
+++ b/docs/snippets/typescript/advanced/token_reduction_example.md
@@ -0,0 +1,14 @@
+```typescript title="TypeScript"
+import { extractFile } from "@kreuzberg/node";
+
+// Token reduction options: "Moderate" level, preserving markdown
+const tokenReduction = { level: "Moderate", preserveMarkdown: true };
+const config = { tokenReduction };
+
+const { content } = await extractFile("verbose_document.pdf", null, config);
+
+// Missing content is reported as a length of zero
+const reducedLength = content?.length ?? 0;
+
+console.log(`Reduced content length: ${reducedLength} chars`);
+```
--- a/docs/snippets/typescript/advanced/vector_database_integration.md
+++ b/docs/snippets/typescript/advanced/vector_database_integration.md
@@ -0,0 +1,49 @@
+```typescript title="TypeScript"
+import { extractFile } from "@kreuzberg/node";
+
+/** Shape of one embedded chunk as returned by extractAndVectorize. */
+interface VectorRecord {
+  id: string;
+  content: string;
+  embedding: number[];
+  metadata: Record<string, string>;
+}
+
+/**
+ * Extract `documentPath` with chunking and embeddings enabled and build one
+ * record per chunk that has an embedding; chunks without one are skipped.
+ * Record ids use the chunk's index among all chunks, not among kept records.
+ */
+async function extractAndVectorize(
+  documentPath: string,
+  documentId: string,
+): Promise<VectorRecord[]> {
+  const embedding = {
+    model: { type: "preset", name: "balanced" },
+    normalize: true,
+    batchSize: 32,
+  };
+  const config = { chunking: { maxChars: 512, maxOverlap: 50, embedding } };
+
+  const extraction = await extractFile(documentPath, null, config);
+
+  return (extraction.chunks ?? []).flatMap((chunk, index): VectorRecord[] => {
+    if (!chunk.embedding) {
+      return [];
+    }
+    const record: VectorRecord = {
+      id: `${documentId}_chunk_${index}`,
+      content: chunk.content,
+      embedding: chunk.embedding,
+      metadata: {
+        document_id: documentId,
+        chunk_index: String(index),
+        content_length: String(chunk.content.length),
+      },
+    };
+    return [record];
+  });
+}
+
+await extractAndVectorize("document.pdf", "doc_001");
+```
--- a/docs/snippets/typescript/advanced/worker_pool.md
+++ b/docs/snippets/typescript/advanced/worker_pool.md
@@ -0,0 +1,50 @@
+# Worker Thread Pool Example
+
+Use worker pools for CPU-bound batch processing to maximize parallelism and throughput:
+
+```typescript title="worker_pool.ts"
+import {
+  createWorkerPool,
+  extractFileInWorker,
+  batchExtractFilesInWorker,
+  closeWorkerPool,
+} from "@kreuzberg/node";
+
+// Pool of 4 worker threads shared by both extractions below
+const pool = createWorkerPool(4);
+
+try {
+  // One document, extracted on a worker
+  const single = await extractFileInWorker(pool, "document.pdf", null, {
+    useCache: true,
+  });
+  console.log(single.content);
+
+  // Several documents, extracted as one batch on the pool
+  const paths = ["doc1.pdf", "doc2.docx", "doc3.xlsx"];
+  const batch = await batchExtractFilesInWorker(pool, paths, {
+    useCache: true,
+  });
+
+  for (const [position, extraction] of batch.entries()) {
+    console.log(`File ${position + 1}: ${extraction.content.length} characters`);
+  }
+} finally {
+  // Runs on success and on failure, so the pool is always closed
+  await closeWorkerPool(pool);
+}
+```
+
+**Performance Benefits:**
+
+- **Parallel Processing**: Multiple documents extracted simultaneously
+- **CPU Usage**: Maximizes multi-core CPU usage for large batches
+- **Queue Management**: Automatically distributes work across available workers
+- **Resource Control**: Prevents thread exhaustion with configurable pool size
+
+**Best Practices:**
+
+- Use worker pools for batches of 10+ documents
+- Set the pool size to the number of CPU cores (the default behavior)
+- Always close pools with `closeWorkerPool()` to prevent resource leaks
+- Reuse pools across multiple batch operations for efficiency
--- a/docs/snippets/typescript/api/batch_extract_bytes_sync.md
+++ b/docs/snippets/typescript/api/batch_extract_bytes_sync.md
@@ -0,0 +1,18 @@
+```typescript title="TypeScript"
+import { batchExtractBytesSync } from "kreuzberg";
+import { readFileSync } from "fs";
+
+// Read both PDFs into memory and describe each one as a batch item
+const pdfPaths = ["doc1.pdf", "doc2.pdf"];
+const items = pdfPaths.map((path) => ({
+  content: readFileSync(path),
+  mimeType: "application/pdf",
+  config: undefined,
+}));
+
+const extractions = batchExtractBytesSync(items);
+
+for (const [index, extraction] of extractions.entries()) {
+  console.log(`Document ${index}: ${extraction.content.length} chars`);
+}
+```
--- a/docs/snippets/typescript/api/batch_extract_files_sync.md
+++ b/docs/snippets/typescript/api/batch_extract_files_sync.md
@@ -0,0 +1,15 @@
+```typescript title="TypeScript"
+import { batchExtractFilesSync } from "kreuzberg";
+
+// Build one batch item per path; no item carries its own config,
+// so `config` is left undefined on each
+const paths = ["doc1.pdf", "doc2.docx", "report.pdf"];
+const items = paths.map((path) => ({ path, config: undefined }));
+
+const extractions = batchExtractFilesSync(items);
+
+// Log each result's content length next to its zero-based position
+extractions.forEach((extraction, position) => {
+  console.log(`File ${position}: ${extraction.content.length} chars`);
+});
+```
--- a/docs/snippets/typescript/api/client_chunk_text.md
+++ b/docs/snippets/typescript/api/client_chunk_text.md
@@ -0,0 +1,23 @@
+```typescript title="TypeScript"
+// JSON payload for the /chunk endpoint: the text plus chunker settings
+const payload = {
+  text: "Your long text content here...",
+  chunker_type: "text",
+  config: { max_characters: 1000, overlap: 50, trim: true },
+};
+
+const response = await fetch("http://localhost:8000/chunk", {
+  method: "POST",
+  headers: { "Content-Type": "application/json" },
+  body: JSON.stringify(payload),
+});
+
+const chunkResponse = await response.json();
+
+// Report the chunk count, then a 50-character preview of every chunk
+console.log(`Created ${chunkResponse.chunk_count} chunks`);
+for (const chunk of chunkResponse.chunks as { content: string; chunk_index: number }[]) {
+  const preview = chunk.content.substring(0, 50);
+  console.log(`Chunk ${chunk.chunk_index}: ${preview}...`);
+}
+```
--- a/docs/snippets/typescript/api/client_extract_single_file.md
+++ b/docs/snippets/typescript/api/client_extract_single_file.md
@@ -0,0 +1,32 @@
+```typescript title="TypeScript"
+import { readFileSync } from "fs";
+
+/**
+ * Upload document.pdf to the local /extract endpoint and print the result.
+ * Error responses and Errors thrown inside the try block are logged, not rethrown.
+ */
+async function extractViaClient() {
+  const upload = new FormData();
+  upload.append("files", new Blob([readFileSync("document.pdf")]), "document.pdf");
+
+  try {
+    const request = { method: "POST", body: upload };
+    const response = await fetch("http://localhost:8000/extract", request);
+
+    if (response.ok) {
+      const documents = await response.json();
+      console.log(`Extracted ${documents.length} document(s)`);
+      console.log(documents[0].content);
+    } else {
+      const failure = await response.json();
+      console.error(`Error: ${failure.error_type}: ${failure.message}`);
+    }
+  } catch (error: unknown) {
+    if (error instanceof Error) {
+      console.error(`Request failed: ${error.message}`);
+    }
+  }
+}
+
+extractViaClient();
+```
--- a/docs/snippets/typescript/api/combining_all_features.md
+++ b/docs/snippets/typescript/api/combining_all_features.md
@@ -0,0 +1,45 @@
+```typescript title="TypeScript"
+import { extractFileSync } from "kreuzberg";
+
+const config = {
+  // OCR: Tesseract backend with language "eng"; force_ocr is disabled
+  force_ocr: false,
+  ocr: {
+    backend: "tesseract",
+    language: "eng",
+  },
+  // Chunking: markdown chunker, max 800 characters per chunk, overlap of 100
+  chunking: {
+    max_characters: 800,
+    overlap: 100,
+    chunker_type: "markdown",
+    prepend_heading_context: true,
+  },
+  // Output: markdown format, with document structure included
+  output_format: "markdown",
+  include_document_structure: true,
+  // Images: extract_images enabled
+  images: {
+    extract_images: true,
+  },
+  // Enable the cache and quality processing
+  use_cache: true,
+  enable_quality_processing: true,
+};
+
+const result = extractFileSync("report.pdf", undefined, config);
+
+console.log(`Content (${result.content.length} chars):`);
+console.log(result.content.slice(0, 200));
+
+if (result.chunks) {
+  console.log(`\nChunks: ${result.chunks.length}`);
+}
+console.log(`Tables: ${result.tables?.length ?? 0}`);
+if (result.detected_languages) {
+  console.log(`Languages: ${result.detected_languages}`);
+}
+if (result.extraction_method) {
+  console.log(`Extraction method: ${result.extraction_method}`);
+}
+```
--- a/docs/snippets/typescript/api/error_handling.md
+++ b/docs/snippets/typescript/api/error_handling.md
@@ -0,0 +1,13 @@
+```typescript title="TypeScript"
+import { extractFileSync } from "kreuzberg";
+
+// Log Error instances with context, then re-throw whatever was caught
+try {
+  const { content } = extractFileSync("missing.pdf");
+  console.log(content);
+} catch (error: unknown) {
+  // Non-Error values are re-thrown without a log line
+  if (error instanceof Error) console.error(`Extraction failed: ${error.message}`);
+  throw error;
+}
+```
--- a/docs/snippets/typescript/api/error_handling_extract.md
+++ b/docs/snippets/typescript/api/error_handling_extract.md
@@ -0,0 +1,31 @@
+```typescript title="TypeScript"
+import { readFileSync } from "node:fs";
+
+// Upload document.pdf to the /extract endpoint; error responses and Errors
+// thrown inside the try block are logged instead of propagating.
+async function extractDocument(): Promise<void> {
+  const body = new FormData();
+  body.append("files", new Blob([readFileSync("document.pdf")]), "document.pdf");
+
+  try {
+    const response = await fetch("http://localhost:8000/extract", {
+      method: "POST",
+      body,
+    });
+    const payload = await response.json();
+
+    // Both outcomes carry a JSON body: error details on failure, results on success
+    if (!response.ok) {
+      console.error(`Error: ${payload.error_type}: ${payload.message}`);
+      return;
+    }
+    console.log(`Extracted ${payload.length} documents`);
+  } catch (error: unknown) {
+    if (error instanceof Error) {
+      console.error(`Request failed: ${error.message}`);
+    }
+  }
+}
+
+extractDocument();
+```
--- a/docs/snippets/typescript/api/extract_bytes_async.md
+++ b/docs/snippets/typescript/api/extract_bytes_async.md
@@ -0,0 +1,14 @@
+```typescript title="TypeScript"
+import { extractBytes } from "kreuzberg";
+import { readFileSync } from "fs";
+
+// Read the PDF from disk, extract it from memory, and print content and table count
+async function main() {
+  const pdfBytes = readFileSync("document.pdf");
+  const { content, tables } = await extractBytes(pdfBytes, "application/pdf");
+  console.log(content);
+  console.log(`Tables: ${tables?.length ?? 0}`);
+}
+
+main();
+```
--- a/docs/snippets/typescript/api/extract_bytes_sync.md
+++ b/docs/snippets/typescript/api/extract_bytes_sync.md
@@ -0,0 +1,10 @@
+```typescript title="TypeScript"
+import { extractBytesSync } from "kreuzberg";
+import { readFileSync } from "fs";
+
+const pdfBytes = readFileSync("document.pdf");
+const { content, tables } = extractBytesSync(pdfBytes, "application/pdf");
+// Print the extracted text and the number of tables (0 when absent)
+console.log(content);
+console.log(`Tables: ${tables?.length ?? 0}`);
+```
--- a/docs/snippets/typescript/api/extract_file_async.md
+++ b/docs/snippets/typescript/api/extract_file_async.md
@@ -0,0 +1,12 @@
+```typescript title="TypeScript"
+import { extractFile } from "kreuzberg";
+
+// Extract the document asynchronously and print its content and table count
+async function main() {
+  const { content, tables } = await extractFile("document.pdf");
+  console.log(content);
+  console.log(`Tables: ${tables?.length ?? 0}`);
+}
+
+main();
+```
--- a/docs/snippets/typescript/api/extract_file_sync.md
+++ b/docs/snippets/typescript/api/extract_file_sync.md
@@ -0,0 +1,9 @@
+```typescript title="TypeScript"
+import { extractFileSync } from "kreuzberg";
+
+const { content, mime_type: mimeType, tables } = extractFileSync("document.pdf");
+
+console.log(content);
+console.log(`MIME type: ${mimeType}`);
+console.log(`Tables: ${tables?.length ?? 0}`);
+```
--- a/docs/snippets/typescript/api/tables.md
+++ b/docs/snippets/typescript/api/tables.md
@@ -0,0 +1,11 @@
+```typescript title="TypeScript"
+import { extractFileSync } from "kreuzberg";
+
+const { tables } = extractFileSync("document.pdf");
+
+for (const { cells, markdown } of tables ?? []) {
+  console.log(`Table with ${cells?.length ?? 0} rows`);
+  console.log(markdown);
+  for (const row of cells ?? []) console.log(row.join(" | "));
+}
+```
--- a/docs/snippets/typescript/benchmarking/simple_benchmark.ts
+++ b/docs/snippets/typescript/benchmarking/simple_benchmark.ts
@@ -0,0 +1,49 @@
+```typescript title="simple_benchmark.ts"
+/* oxlint-disable */
+import { extractFile, type ExtractionConfig } from "@kreuzberg/node";
+
+/** Run `task` once and return how long it took, in seconds. */
+async function timeSeconds(task: () => Promise<unknown>): Promise<number> {
+  const start = performance.now();
+  await task();
+  return (performance.now() - start) / 1000;
+}
+
+/** Benchmark sequential vs. parallel extraction, then first vs. repeated extraction with the cache on. */
+async function benchmarkExtractions(): Promise<void> {
+  const config: ExtractionConfig = { useCache: false };
+  const filePath = "document.pdf";
+  const numRuns = 10;
+
+  // Sequential baseline: each extraction starts only after the previous one ends
+  const sequentialDuration = await timeSeconds(async () => {
+    for (let i = 0; i < numRuns; i++) {
+      await extractFile(filePath, null, config);
+    }
+  });
+  console.log("Sequential extraction (" + numRuns + " runs):");
+  console.log("  - Total time: " + sequentialDuration.toFixed(3) + "s");
+  console.log("  - Average: " + (sequentialDuration / numRuns).toFixed(3) + "s per extraction");
+
+  // Parallel: start every extraction before awaiting any of them
+  const parallelDuration = await timeSeconds(() =>
+    Promise.all(Array.from({ length: numRuns }, () => extractFile(filePath, null, config))),
+  );
+  console.log("\nParallel extraction (" + numRuns + " runs):");
+  console.log("  - Total time: " + parallelDuration.toFixed(3) + "s");
+  console.log("  - Average: " + (parallelDuration / numRuns).toFixed(3) + "s per extraction");
+  console.log("  - Speedup: " + (sequentialDuration / parallelDuration).toFixed(1) + "x");
+
+  const cacheConfig: ExtractionConfig = { useCache: true };
+  console.log("\nFirst extraction (populates cache)...");
+  const firstDuration = await timeSeconds(() => extractFile(filePath, null, cacheConfig));
+  console.log("  - Time: " + firstDuration.toFixed(3) + "s");
+
+  console.log("Second extraction (from cache)...");
+  const cachedDuration = await timeSeconds(() => extractFile(filePath, null, cacheConfig));
+  console.log("  - Time: " + cachedDuration.toFixed(3) + "s");
+  console.log("  - Cache speedup: " + (firstDuration / cachedDuration).toFixed(1) + "x");
+}
+
+benchmarkExtractions().catch(console.error);
+```
--- a/docs/snippets/typescript/cache/disk_cache.ts
+++ b/docs/snippets/typescript/cache/disk_cache.ts
@@ -0,0 +1,20 @@
+```typescript title="disk_cache.ts"
+/* oxlint-disable */
+import { extractFile, type ExtractionConfig } from "@kreuzberg/node";
+
+// Caching is switched on for both calls below.
+const config: ExtractionConfig = { useCache: true };
+
+// Extract the same document twice with the same config and compare the text of
+// the two results. The trailing `.catch` reports a failed extraction instead of
+// leaving the promise rejection unhandled.
+(async () => {
+  console.log("First extraction (will be cached)...");
+  const result1 = await extractFile("document.pdf", null, config);
+  const length1 = result1.content.length;
+  console.log("  - Content length: " + length1);
+
+  console.log("\nSecond extraction (from cache)...");
+  const result2 = await extractFile("document.pdf", null, config);
+  const length2 = result2.content.length;
+  console.log("  - Content length: " + length2);
+
+  const isIdentical = result1.content === result2.content;
+  console.log("\nResults are identical: " + isIdentical);
+})().catch(console.error);
+```
--- a/docs/snippets/typescript/cli/basic_cli.ts
+++ b/docs/snippets/typescript/cli/basic_cli.ts
@@ -0,0 +1,51 @@
+```typescript title="basic_cli.ts"
+/* oxlint-disable */
+import { spawn } from "child_process";
+
+/**
+ * Shape this snippet assumes for the CLI's parsed JSON output.
+ * Only `content` is required; `format` and `languages` may be absent.
+ */
+interface ExtractionResult {
+  content: string;
+  format?: string;
+  languages?: string[];
+}
+
+/**
+ * Runs `kreuzberg extract <filePath> --format <outputFormat>` as a child process
+ * and collects what it writes to stdout.
+ *
+ * @param filePath - Path handed to the CLI as the file to extract.
+ * @param outputFormat - Value for `--format`; defaults to `"text"`.
+ * @returns The raw stdout string, or the parsed JSON when `outputFormat` is `"json"`.
+ * @throws Rejects when the process cannot be started, exits with a non-zero
+ *   code (stderr is included in the message), or prints invalid JSON.
+ */
+async function extractWithCli(
+  filePath: string,
+  outputFormat: string = "text"
+): Promise<string | ExtractionResult> {
+  return new Promise((resolve, reject) => {
+    const child = spawn("kreuzberg", ["extract", filePath, "--format", outputFormat]);
+
+    let stdout = "";
+    let stderr = "";
+
+    child.stdout.on("data", (data) => {
+      stdout += data.toString();
+    });
+
+    child.stderr.on("data", (data) => {
+      stderr += data.toString();
+    });
+
+    // Without this listener a failed spawn (for example, the binary is not on
+    // PATH) is raised as an unhandled "error" event and the promise never settles.
+    child.on("error", reject);
+
+    child.on("close", (code) => {
+      if (code !== 0) {
+        const errMsg = "CLI exited with code " + code + ": " + stderr;
+        reject(new Error(errMsg));
+        return;
+      }
+
+      if (outputFormat !== "json") {
+        resolve(stdout);
+        return;
+      }
+
+      // A throw inside an event handler would bypass the promise, so convert a
+      // JSON syntax error into a rejection explicitly.
+      try {
+        resolve(JSON.parse(stdout));
+      } catch (err) {
+        reject(err);
+      }
+    });
+  });
+}
+
+// Run the CLI twice on the same file: once for plain text, once for JSON.
+const document = "document.pdf";
+const textOutput = await extractWithCli(document, "text");
+// With the "text" format the helper resolves to the raw stdout string.
+console.log(`Extracted: ${(textOutput as string).length} characters`);
+
+// With the "json" format the helper resolves to the parsed object.
+const jsonOutput = (await extractWithCli(document, "json")) as ExtractionResult;
+console.log(`Format: ${jsonOutput.format}`);
+```
--- a/docs/snippets/typescript/cli/cli_with_config.ts
+++ b/docs/snippets/typescript/cli/cli_with_config.ts
@@ -0,0 +1,57 @@
+```typescript title="cli_with_config.ts"
+/* oxlint-disable */
+import { spawn } from "child_process";
+
+/**
+ * Shape this snippet assumes for the CLI's parsed JSON output.
+ * Only `content` is required; `format` and `languages` may be absent.
+ */
+interface ExtractionResult {
+  content: string;
+  format?: string;
+  languages?: string[];
+}
+
+/**
+ * Runs `kreuzberg extract <filePath> --config <configPath> --format json` as a
+ * child process and parses its stdout as JSON.
+ *
+ * @param filePath - Path handed to the CLI as the file to extract.
+ * @param configPath - Path handed to the CLI via `--config`.
+ * @returns The parsed JSON output.
+ * @throws Rejects when the process cannot be started, exits with a non-zero
+ *   code (stderr is included in the message), or prints invalid JSON.
+ */
+async function extractWithConfig(
+  filePath: string,
+  configPath: string
+): Promise<ExtractionResult> {
+  return new Promise((resolve, reject) => {
+    const child = spawn("kreuzberg", [
+      "extract",
+      filePath,
+      "--config",
+      configPath,
+      "--format",
+      "json",
+    ]);
+
+    let stdout = "";
+    let stderr = "";
+
+    child.stdout.on("data", (data) => {
+      stdout += data.toString();
+    });
+
+    child.stderr.on("data", (data) => {
+      stderr += data.toString();
+    });
+
+    // Without this listener a failed spawn (for example, the binary is not on
+    // PATH) is raised as an unhandled "error" event and the promise never settles.
+    child.on("error", reject);
+
+    child.on("close", (code) => {
+      if (code !== 0) {
+        const errMsg = "CLI exited with code " + code + ": " + stderr;
+        reject(new Error(errMsg));
+        return;
+      }
+
+      // A throw inside an event handler would bypass the promise, so convert a
+      // JSON syntax error into a rejection explicitly.
+      try {
+        resolve(JSON.parse(stdout));
+      } catch (err) {
+        reject(err);
+      }
+    });
+  });
+}
+
+// Extract one document through the CLI using a config file, then print a summary.
+const configFile = "kreuzberg.toml";
+const document = "document.pdf";
+
+console.log("Extracting " + document + " with config " + configFile);
+const result = await extractWithConfig(document, configFile);
+
+console.log("Content length: " + result.content.length);
+console.log("Format: " + result.format);
+// `languages` is optional on the parsed result; print an empty string when it is absent.
+const languages = result.languages ? result.languages.join(", ") : "";
+console.log("Languages: " + languages);
+```
--- a/docs/snippets/typescript/config/advanced_config.md
+++ b/docs/snippets/typescript/config/advanced_config.md
@@ -0,0 +1,35 @@
+```typescript title="TypeScript"
+import { extractFileSync } from "@kreuzberg/node";
+
+// Options object handed to extractFileSync as its third argument.
+const config = {
+  ocr: {
+    backend: "tesseract",
+    language: "eng+deu",
+  },
+  chunking: {
+    maxChars: 1000,
+    maxOverlap: 100,
+  },
+  tokenReduction: {
+    mode: "aggressive",
+  },
+  languageDetection: {
+    enabled: true,
+    detectMultiple: true,
+  },
+  useCache: true,
+  enableQualityProcessing: true,
+};
+
+const { chunks, detectedLanguages } = extractFileSync("document.pdf", null, config);
+
+// Preview the first 100 characters of every chunk; nothing is printed when
+// no chunks came back.
+for (const { content } of chunks ?? []) {
+  console.log(`Chunk: ${content.substring(0, 100)}...`);
+}
+
+// Languages are listed only when the result carries them.
+if (detectedLanguages) {
+  console.log(`Languages: ${detectedLanguages.join(", ")}`);
+}
+```
--- a/docs/snippets/typescript/config/chunking_config.md
+++ b/docs/snippets/typescript/config/chunking_config.md
@@ -0,0 +1,70 @@
+```typescript title="TypeScript"
+import { extractFile } from "@kreuzberg/node";
+
+// Chunking options, passed to extractFile as part of the config object.
+const config = {
+  chunking: {
+    maxChars: 1000,
+    maxOverlap: 200,
+  },
+};
+
+// Run the extraction and report how many chunks came back (0 when `chunks` is absent).
+const result = await extractFile("document.pdf", null, config);
+console.log(`Total chunks: ${result.chunks?.length ?? 0}`);
+```
+
+```typescript title="TypeScript - Markdown with Heading Context"
+import { extractFile } from "@kreuzberg/node";
+
+// Chunking options for the markdown chunker.
+const config = {
+  chunking: {
+    chunkerType: "markdown",
+    maxChars: 500,
+    maxOverlap: 50,
+    sizingType: "tokenizer",
+    sizingModel: "Xenova/gpt-4o",
+  },
+};
+
+const { chunks } = await extractFile("document.md", null, config);
+
+// For every chunk, list the headings recorded in its metadata (if any),
+// then preview the first 100 characters of the chunk text.
+(chunks ?? []).forEach((chunk) => {
+  const headingList = chunk.metadata?.headingContext?.headings ?? [];
+  headingList.forEach(({ level, text }) => {
+    console.log(`Heading L${level}: ${text}`);
+  });
+  console.log(`Content: ${chunk.content.slice(0, 100)}...`);
+});
+```
+
+```typescript title="TypeScript - Semantic"
+import { extractFile } from "@kreuzberg/node";
+
+// Select the "semantic" chunker; no other chunking options are set here.
+const config = {
+  chunking: {
+    chunkerType: "semantic",
+  },
+};
+
+// Preview the first 100 characters of each chunk (no output when `chunks` is absent).
+const result = await extractFile("document.pdf", null, config);
+for (const chunk of result.chunks ?? []) {
+  console.log(`Content: ${chunk.content.slice(0, 100)}...`);
+}
+```
+
+```typescript title="TypeScript - Prepend Heading Context"
+import { extractFile } from "@kreuzberg/node";
+
+// Markdown chunker with `prependHeadingContext` switched on.
+const config = {
+  chunking: {
+    chunkerType: "markdown",
+    maxChars: 500,
+    maxOverlap: 50,
+    prependHeadingContext: true,
+  },
+};
+
+// Preview the first 100 characters of each chunk (no output when `chunks` is absent).
+const result = await extractFile("document.md", null, config);
+for (const chunk of result.chunks ?? []) {
+  // Each chunk's content is prefixed with its heading breadcrumb
+  console.log(`Content: ${chunk.content.slice(0, 100)}...`);
+}
+```
--- a/docs/snippets/typescript/config/config_basic.md
+++ b/docs/snippets/typescript/config/config_basic.md
@@ -0,0 +1,11 @@
+```typescript title="TypeScript"
+import { extractFile } from "@kreuzberg/node";
+
+// Minimal config: two top-level boolean flags.
+const config = {
+  useCache: true,
+  enableQualityProcessing: true,
+};
+
+// Extract the document with that config and print its text.
+const result = await extractFile("document.pdf", null, config);
+console.log(result.content);
+```
--- a/docs/snippets/typescript/config/config_discover.md
+++ b/docs/snippets/typescript/config/config_discover.md
@@ -0,0 +1,11 @@
+```typescript title="TypeScript"
+import { extractFile, ExtractionConfig } from "@kreuzberg/node";
+
+// `discover()` yields a falsy value when nothing was found, hence the check below.
+const config = ExtractionConfig.discover();
+if (config) {
+  const result = await extractFile("document.pdf", null, config);
+  console.log(result.content);
+} else {
+  // No extraction is attempted in this branch.
+  console.log("No configuration file found");
+}
+```
--- a/docs/snippets/typescript/config/config_discovery.md
+++ b/docs/snippets/typescript/config/config_discovery.md
@@ -0,0 +1,20 @@
+# Configuration Discovery Example
+
+Use `ExtractionConfig.discover()` to automatically find and load configuration files from the current directory or parent directories:
+
+```typescript title="config_discovery.ts"
+import { ExtractionConfig, extractFile } from "@kreuzberg/node";
+
+// Look for a configuration file; the value is falsy when none was found.
+const discovered = ExtractionConfig.discover();
+console.log(
+  discovered ? "Found configuration file" : "No configuration file found, using defaults",
+);
+
+// With a discovered config, pass it along; otherwise call extractFile with the path only.
+const result = discovered
+  ? await extractFile("document.pdf", null, discovered)
+  : await extractFile("document.pdf");
+console.log(result.content);
+```
+
+The discovery method looks for `kreuzberg.toml`, `kreuzberg.yaml`, or `kreuzberg.json` files starting in the current directory and searching parent directories up to the filesystem root.
--- a/docs/snippets/typescript/config/config_ocr.md
+++ b/docs/snippets/typescript/config/config_ocr.md
@@ -0,0 +1,16 @@
+```typescript title="TypeScript"
+import { extractFile } from "@kreuzberg/node";
+
+// OCR options: backend, language string, and a nested Tesseract-specific block.
+const config = {
+  ocr: {
+    backend: "tesseract",
+    language: "eng+fra",
+    tesseractConfig: {
+      psm: 3,
+    },
+  },
+};
+
+// Extract the document with that config and print its text.
+const result = await extractFile("document.pdf", null, config);
+console.log(result.content);
+```
--- a/docs/snippets/typescript/config/config_programmatic.md
+++ b/docs/snippets/typescript/config/config_programmatic.md
@@ -0,0 +1,22 @@
+```typescript title="TypeScript"
+import { extractFileSync, type ExtractionConfig } from "@kreuzberg/node";
+
+// Config built in code and annotated as ExtractionConfig so the compiler checks its shape.
+const config: ExtractionConfig = {
+  useCache: true,
+  ocr: {
+    backend: "tesseract",
+    language: "eng+deu",
+    tesseractConfig: {
+      psm: 6,
+    },
+  },
+  chunking: {
+    maxChars: 1000,
+    maxOverlap: 200,
+  },
+  enableQualityProcessing: true,
+};
+
+// Synchronous extraction; prints the length of the extracted text.
+const result = extractFileSync("document.pdf", null, config);
+console.log(`Content length: ${result.content.length}`);
+```
--- a/docs/snippets/typescript/config/document_structure_config.md
+++ b/docs/snippets/typescript/config/document_structure_config.md
@@ -0,0 +1,15 @@
+```typescript title="Document Structure Config (TypeScript)"
+import { extractFileSync, ExtractionConfig } from "@kreuzberg/node";
+
+// Ask for the document structure to be included in the result.
+const config: ExtractionConfig = {
+  includeDocumentStructure: true,
+};
+
+const result = extractFileSync("document.pdf", undefined, config);
+
+// `document` may be absent, so guard before walking its nodes.
+if (result.document) {
+  for (const node of result.document.nodes) {
+    // Print the node type followed by its text (empty string when the node has none).
+    console.log(`[${node.content.nodeType}] ${node.content.text ?? ""}`);
+  }
+}
+```
--- a/docs/snippets/typescript/config/element_based_output.md
+++ b/docs/snippets/typescript/config/element_based_output.md
@@ -0,0 +1,35 @@
+```typescript title="Element-Based Output (TypeScript)"
+import { extractFileSync, ExtractionConfig } from "@kreuzberg/node";
+
+// Configure element-based output
+const config: ExtractionConfig = {
+  outputFormat: "element_based",
+};
+
+// Extract document
+const result = extractFileSync("document.pdf", null, config);
+
+// Fall back to an empty list so the loop and filter below cannot fail when the
+// result carries no `elements`.
+const elements = result.elements ?? [];
+
+// Access elements
+for (const element of elements) {
+  console.log(`Type: ${element.elementType}`);
+  console.log(`Text: ${element.text.slice(0, 100)}`);
+
+  if (element.metadata.pageNumber) {
+    console.log(`Page: ${element.metadata.pageNumber}`);
+  }
+
+  if (element.metadata.coordinates) {
+    const coords = element.metadata.coordinates;
+    console.log(`Coords: (${coords.left}, ${coords.top}) - (${coords.right}, ${coords.bottom})`);
+  }
+
+  console.log("---");
+}
+
+// Filter by element type
+const titles = elements.filter((e) => e.elementType === "title");
+for (const title of titles) {
+  // "unknown" is printed when no level is recorded for the title.
+  const level = title.metadata.additional?.level || "unknown";
+  console.log(`[${level}] ${title.text}`);
+}
+```
--- a/docs/snippets/typescript/config/embedding_config.md
+++ b/docs/snippets/typescript/config/embedding_config.md
@@ -0,0 +1,17 @@
+```typescript title="TypeScript"
+import { extractFile } from "@kreuzberg/node";
+
+// Chunking with embeddings. The embedding model is selected through `model`
+// as a { type, name } pair, the same shape the typed EmbeddingConfig examples use.
+const config = {
+  chunking: {
+    maxChars: 1000,
+    embedding: {
+      model: {
+        type: "preset",
+        name: "quality",
+      },
+    },
+  },
+};
+
+const result = await extractFile("document.pdf", null, config);
+// Report the vector length of the first chunk's embedding (0 when it has none).
+if (result.chunks && result.chunks.length > 0) {
+  console.log(`Chunk embeddings: ${result.chunks[0].embedding?.length ?? 0} dimensions`);
+}
+```
--- a/docs/snippets/typescript/config/embedding_config.ts
+++ b/docs/snippets/typescript/config/embedding_config.ts
@@ -0,0 +1,91 @@
+import type { ChunkingConfig, EmbeddingConfig, ExtractionConfig } from "@kreuzberg/node";
+
+// Example 1: Preset model (recommended)
+// Fast, balanced, or quality preset configurations optimized for common use cases.
+// The binding is not referenced elsewhere in this snippet; it only illustrates the shape.
+const _embeddingConfig: EmbeddingConfig = {
+  model: {
+    type: "preset",
+    name: "balanced",
+  },
+  batchSize: 32,
+  normalize: true,
+  showDownloadProgress: true,
+  // NOTE(review): Node itself does not expand "~" in paths; confirm the library resolves it.
+  cacheDir: "~/.cache/kreuzberg/embeddings",
+};
+
+// Available presets:
+// - "fast" (384 dims): Quick prototyping, development, resource-constrained
+// - "balanced" (768 dims): Production, general-purpose RAG, English documents
+// - "quality" (1024 dims): Complex documents, maximum accuracy
+// - "multilingual" (768 dims): International documents, 100+ languages
+
+// Example 2: Custom ONNX model (requires embeddings feature)
+// Direct access to specific ONNX embedding models from HuggingFace with custom dimensions.
+// The binding is not referenced elsewhere in this snippet; it only illustrates the shape.
+const _embeddingConfig2: EmbeddingConfig = {
+  model: {
+    type: "custom",
+    modelId: "BAAI/bge-small-en-v1.5",
+    dimensions: 384,
+  },
+  batchSize: 32,
+  normalize: true,
+  showDownloadProgress: true,
+  cacheDir: undefined, // Uses default: .kreuzberg/embeddings/
+};
+
+// Popular ONNX-compatible models:
+// - "BAAI/bge-small-en-v1.5" (384 dims): Fast, efficient
+// - "BAAI/bge-base-en-v1.5" (768 dims): Balanced quality/speed
+// - "BAAI/bge-large-en-v1.5" (1024 dims): High quality, slower
+// - "sentence-transformers/paraphrase-multilingual-mpnet-base-v2" (768 dims): Multilingual support
+
+// Example 3: Alternative Custom ONNX Model
+// For advanced users wanting different ONNX embedding models.
+// The binding is not referenced elsewhere in this snippet; it only illustrates the shape.
+const _embeddingConfig3: EmbeddingConfig = {
+  model: {
+    type: "custom",
+    modelId: "sentence-transformers/all-mpnet-base-v2",
+    dimensions: 768,
+  },
+  batchSize: 16, // Larger model requires smaller batch size
+  normalize: true,
+  showDownloadProgress: true,
+  cacheDir: "/var/cache/embeddings",
+};
+
+// Integration with ChunkingConfig
+// Add embeddings to your chunking configuration:
+const chunkingWithEmbeddings: ChunkingConfig = {
+  maxChars: 1024,
+  maxOverlap: 100,
+  // NOTE(review): `preset` is set here in addition to `embedding.model.name`;
+  // confirm whether both are needed or one of them is redundant.
+  preset: "balanced",
+  embedding: {
+    model: {
+      type: "preset",
+      name: "balanced",
+    },
+    batchSize: 32,
+    normalize: true,
+  },
+};
+
+// The chunking block is attached to the extraction config under `chunking`.
+const _extractionConfig: ExtractionConfig = {
+  chunking: chunkingWithEmbeddings,
+};
+
+// Key parameter explanations:
+//
+// batchSize: Number of texts to embed at once (32-128 typical)
+//   - Larger batches are faster but use more memory
+//   - Smaller batches for resource-constrained environments
+//
+// normalize: Whether to normalize vectors (L2 norm)
+//   - true (recommended): Enables cosine similarity in vector DBs
+//   - false: Raw embedding values
+//
+// cacheDir: Where to store downloaded models
+//   - undefined: Uses .kreuzberg/embeddings/ in current directory
+//   - string: Custom directory for model storage
+//
+// showDownloadProgress: Display download progress bar
+//   - Useful for monitoring large model downloads
--- a/docs/snippets/typescript/config/hierarchy_config.ts
+++ b/docs/snippets/typescript/config/hierarchy_config.ts
@@ -0,0 +1,89 @@
+import {
+  type ExtractionConfig,
+  extractFileSync,
+  type HierarchyConfig,
+  type PdfConfig,
+} from "@kreuzberg/node";
+
+// Example 1: Basic hierarchy extraction
+// Enabled with default kClusters=6 for standard H1-H6 heading hierarchy.
+// Extract bounding box information for spatial layout awareness.
+const hierarchyConfigBasic: HierarchyConfig = {
+  enabled: true,
+  kClusters: 6, // Default: creates 6 font size clusters (H1-H6 structure)
+  includeBbox: true, // Include bounding box coordinates
+  ocrCoverageThreshold: undefined, // No OCR coverage threshold
+};
+
+// The hierarchy settings are nested: HierarchyConfig -> PdfConfig -> ExtractionConfig.
+const pdfConfigBasic: PdfConfig = {
+  hierarchy: hierarchyConfigBasic,
+};
+
+const extractionConfigBasic: ExtractionConfig = {
+  pdfOptions: pdfConfigBasic,
+};
+
+// The config goes in the third position; the second argument is passed as null.
+const _result = extractFileSync("document.pdf", null, extractionConfigBasic);
+
+// Example 2: Custom kClusters for minimal structure
+// Use 3 clusters for simpler hierarchy with minimal structure.
+// Useful when you only need major section divisions (Main, Subsection, Detail).
+const hierarchyConfigMinimal: HierarchyConfig = {
+  enabled: true,
+  kClusters: 3, // Minimal clustering: just 3 levels
+  includeBbox: true,
+  ocrCoverageThreshold: undefined,
+};
+
+// The hierarchy settings are nested: HierarchyConfig -> PdfConfig -> ExtractionConfig.
+const pdfConfigMinimal: PdfConfig = {
+  hierarchy: hierarchyConfigMinimal,
+};
+
+const extractionConfigMinimal: ExtractionConfig = {
+  pdfOptions: pdfConfigMinimal,
+};
+
+// The config goes in the third position; the second argument is passed as null.
+const _resultMinimal = extractFileSync("document.pdf", null, extractionConfigMinimal);
+
+// Example 3: With OCR coverage threshold
+// Trigger OCR if less than 50% of text has font data.
+// Useful for documents with mixed digital and scanned content.
+const hierarchyConfigOcr: HierarchyConfig = {
+  enabled: true,
+  kClusters: 6,
+  includeBbox: true,
+  ocrCoverageThreshold: 0.5, // Trigger OCR if text coverage < 50%
+};
+
+// The hierarchy settings are nested: HierarchyConfig -> PdfConfig -> ExtractionConfig.
+const pdfConfigOcr: PdfConfig = {
+  hierarchy: hierarchyConfigOcr,
+};
+
+const extractionConfigOcr: ExtractionConfig = {
+  pdfOptions: pdfConfigOcr,
+};
+
+// The config goes in the third position; the second argument is passed as null.
+const _resultOcr = extractFileSync("document.pdf", null, extractionConfigOcr);
+
+// Field descriptions:
+//
+// enabled: boolean (default: true)
+//   - Enable or disable hierarchy extraction
+//   - When false, hierarchy structure is not analyzed
+//
+// kClusters: number (default: 6, valid: 1-7)
+//   - Number of font size clusters for hierarchy levels
+//   - 6 provides H1-H6 heading levels with body text
+//   - Higher values create more fine-grained hierarchy
+//   - Lower values create simpler structure
+//
+// includeBbox: boolean (default: true)
+//   - Include bounding box coordinates in hierarchy blocks
+//   - Required for spatial layout awareness and document structure
+//   - Set to false only if space optimization is critical
+//
+// ocrCoverageThreshold: number | undefined (default: undefined)
+//   - Range: 0.0 to 1.0
+//   - Triggers OCR when text block coverage falls below this fraction
+//   - Example: 0.5 means "run OCR if less than 50% of page has text data"
+//   - undefined means no OCR coverage-based triggering
--- a/docs/snippets/typescript/config/html_output.md
+++ b/docs/snippets/typescript/config/html_output.md
@@ -0,0 +1,12 @@
+```typescript title="TypeScript"
+import { extractFile } from "@kreuzberg/node";
+
+// Request HTML output with a theme and embedded CSS. The config is the third
+// argument; the second argument is passed as null.
+const result = await extractFile("document.pdf", null, {
+  outputFormat: "html",
+  htmlOutput: {
+    theme: "github",
+    embedCss: true,
+  },
+});
+console.log(result.content); // HTML with kb-* classes
+```
--- a/docs/snippets/typescript/config/keyword_config.ts
+++ b/docs/snippets/typescript/config/keyword_config.ts
@@ -0,0 +1,70 @@
+import {
+  type ExtractionConfig,
+  extractFile,
+  type RakeParams,
+  type YakeParams,
+} from "@kreuzberg/node";
+
+// Example 1: Basic YAKE configuration
+// Uses YAKE algorithm with default parameters and English stopword filtering
+/**
+ * Extracts "document.pdf" with the keyword settings below and logs
+ * `result.keywords`. Both algorithm-specific parameter blocks are left as null.
+ */
+async function basicYake(): Promise<void> {
+  const config: ExtractionConfig = {
+    keywords: {
+      algorithm: "yake",
+      maxKeywords: 10,
+      minScore: 0.0,
+      ngramRange: [1, 3],
+      language: "en",
+      yakeParams: null,
+      rakeParams: null,
+    },
+  };
+
+  const result = await extractFile("document.pdf", null, config);
+  console.log("Keywords:", result.keywords);
+}
+
+// Example 2: Advanced YAKE with custom parameters
+// Fine-tunes YAKE with custom window size for co-occurrence analysis
+/**
+ * Extracts "document.pdf" with YAKE and an explicit `yakeParams` block, then
+ * logs `result.keywords`. Not called anywhere in this snippet.
+ */
+async function _advancedYake(): Promise<void> {
+  const config: ExtractionConfig = {
+    keywords: {
+      algorithm: "yake",
+      maxKeywords: 15,
+      minScore: 0.1,
+      ngramRange: [1, 2],
+      language: "en",
+      // NOTE(review): the `as YakeParams` assertion bypasses excess/missing-property
+      // checks; confirm `windowSize` is the only field required.
+      yakeParams: {
+        windowSize: 1,
+      } as YakeParams,
+      rakeParams: null,
+    },
+  };
+
+  const result = await extractFile("document.pdf", null, config);
+  console.log("Keywords:", result.keywords);
+}
+
+// Example 3: RAKE configuration
+// Uses RAKE algorithm for rapid keyword extraction with phrase constraints
+/**
+ * Extracts "document.pdf" with RAKE and an explicit `rakeParams` block, then
+ * logs `result.keywords`. Not called anywhere in this snippet.
+ */
+async function _rakeConfig(): Promise<void> {
+  const config: ExtractionConfig = {
+    keywords: {
+      algorithm: "rake",
+      maxKeywords: 10,
+      minScore: 5.0,
+      ngramRange: [1, 3],
+      language: "en",
+      yakeParams: null,
+      // NOTE(review): the `as RakeParams` assertion bypasses excess/missing-property
+      // checks; confirm these two fields are all that is required.
+      rakeParams: {
+        minWordLength: 1,
+        maxWordsPerPhrase: 3,
+      } as RakeParams,
+    },
+  };
+
+  const result = await extractFile("document.pdf", null, config);
+  console.log("Keywords:", result.keywords);
+}
+
+// Run the first example; report a failed extraction instead of leaving the
+// returned promise unhandled.
+basicYake().catch(console.error);
--- a/docs/snippets/typescript/config/keyword_extraction_config.md
+++ b/docs/snippets/typescript/config/keyword_extraction_config.md
@@ -0,0 +1,16 @@
+```typescript title="TypeScript"
+import { extractFile } from "@kreuzberg/node";
+
+// Keyword extraction settings passed under the `keywords` key.
+const config = {
+  keywords: {
+    algorithm: "yake",
+    maxKeywords: 10,
+    minScore: 0.3,
+    ngramRange: [1, 3],
+    language: "en",
+  },
+};
+
+const result = await extractFile("document.pdf", null, config);
+console.log(`Content: ${result.content}`);
+// Also show the extracted keywords, which is what this config is for.
+console.log("Keywords:", result.keywords);
+```
--- a/docs/snippets/typescript/config/language_detection_config.md
+++ b/docs/snippets/typescript/config/language_detection_config.md
@@ -0,0 +1,16 @@
+```typescript title="TypeScript"
+import { extractFile } from "@kreuzberg/node";
+
+// Language detection settings passed under the `languageDetection` key.
+const config = {
+  languageDetection: {
+    enabled: true,
+    minConfidence: 0.8,
+    detectMultiple: false,
+  },
+};
+
+const result = await extractFile("document.pdf", null, config);
+// `detectedLanguages` may be absent, so guard before joining.
+if (result.detectedLanguages) {
+  console.log(`Detected languages: ${result.detectedLanguages.join(", ")}`);
+}
+```
--- a/docs/snippets/typescript/config/ocr_dpi_config.md
+++ b/docs/snippets/typescript/config/ocr_dpi_config.md
@@ -0,0 +1,17 @@
+```typescript title="TypeScript"
+import { extractFileSync } from "@kreuzberg/node";
+
+// The DPI setting is nested under ocr -> tesseractConfig -> preprocessing.
+const config = {
+  ocr: {
+    backend: "tesseract",
+    tesseractConfig: {
+      preprocessing: {
+        targetDpi: 300,
+      },
+    },
+  },
+};
+
+// Synchronous extraction; prints the length of the extracted text.
+const result = extractFileSync("scanned.pdf", null, config);
+console.log(`content length: ${result.content.length}`);
+```
--- a/docs/snippets/typescript/config/pdf_config.md
+++ b/docs/snippets/typescript/config/pdf_config.md
@@ -0,0 +1,15 @@
+```typescript title="TypeScript"
+import { extractFile } from "@kreuzberg/node";
+
+// PDF-specific options passed under the `pdfOptions` key.
+const config = {
+  pdfOptions: {
+    extractImages: true,
+    extractMetadata: true,
+    // NOTE(review): presumably tried in order for encrypted PDFs; confirm against the library docs.
+    passwords: ["password1", "password2"],
+    hierarchy: { enabled: true, kClusters: 6, includeBbox: true },
+  },
+};
+
+// Extract the document with that config and print its text.
+const result = await extractFile("document.pdf", null, config);
+console.log(result.content);
+```
--- a/docs/snippets/typescript/config/pdf_hierarchy_config.md
+++ b/docs/snippets/typescript/config/pdf_hierarchy_config.md
@@ -0,0 +1,23 @@
+```typescript title="TypeScript"
+import { extractFile } from "@kreuzberg/node";
+
+// PDF options with the hierarchy block spelled out field by field.
+const config = {
+  pdfOptions: {
+    extractMetadata: true,
+    hierarchy: {
+      enabled: true,
+      kClusters: 6,
+      includeBbox: true,
+      ocrCoverageThreshold: 0.8,
+    },
+  },
+};
+
+const { pages } = await extractFile("document.pdf", null, config);
+
+// Print each page number with a 100-character preview; skipped entirely when
+// the result has no `pages`.
+if (pages) {
+  for (const { pageNumber, content } of pages) {
+    console.log(`Page ${pageNumber}:`);
+    console.log(`  Content: ${content.substring(0, 100)}...`);
+  }
+}
+```
--- a/docs/snippets/typescript/config/postprocessor_config.md
+++ b/docs/snippets/typescript/config/postprocessor_config.md
@@ -0,0 +1,14 @@
+```typescript title="TypeScript"
+import { extractFile } from "@kreuzberg/node";
+
+// Post-processing settings: an on/off flag plus explicit enable and disable lists.
+const config = {
+  postprocessor: {
+    enabled: true,
+    enabledProcessors: ["deduplication", "whitespace_normalization"],
+    disabledProcessors: ["mojibake_fix"],
+  },
+};
+
+// Extract the document with that config and print its text.
+const result = await extractFile("document.pdf", null, config);
+console.log(result.content);
+```
--- a/docs/snippets/typescript/config/quality_processing_config.md
+++ b/docs/snippets/typescript/config/quality_processing_config.md
@@ -0,0 +1,10 @@
+```typescript title="TypeScript"
+import { extractFile } from "@kreuzberg/node";
+
+// Single top-level flag; everything else is left unset.
+const config = {
+  enableQualityProcessing: true,
+};
+
+// Extract the document with that config and print its text.
+const result = await extractFile("document.pdf", null, config);
+console.log(result.content);
+```
--- a/docs/snippets/typescript/config/tesseract_config.md
+++ b/docs/snippets/typescript/config/tesseract_config.md
@@ -0,0 +1,18 @@
+```typescript title="TypeScript"
+import { extractFile } from "@kreuzberg/node";
+
+// OCR options with a Tesseract-specific block nested under `tesseractConfig`.
+const config = {
+  ocr: {
+    backend: "tesseract",
+    language: "eng+fra+deu",
+    tesseractConfig: {
+      psm: 6,
+      // Letters, digits, space, and the punctuation . , ! ?
+      tesseditCharWhitelist: "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789 .,!?",
+      enableTableDetection: true,
+    },
+  },
+};
+
+// Extract the document with that config and print its text.
+const result = await extractFile("document.pdf", null, config);
+console.log(result.content);
+```
--- a/docs/snippets/typescript/config/token_reduction_config.md
+++ b/docs/snippets/typescript/config/token_reduction_config.md
@@ -0,0 +1,13 @@
+```typescript title="TypeScript"
+import { extractFile } from "@kreuzberg/node";
+
+// Token reduction settings passed under the `tokenReduction` key.
+const config = {
+  tokenReduction: {
+    mode: "moderate",
+    preserveImportantWords: true,
+  },
+};
+
+// Extract the document with that config and print its text.
+const result = await extractFile("document.pdf", null, config);
+console.log(result.content);
+```
--- a/docs/snippets/typescript/docker/usage.ts
+++ b/docs/snippets/typescript/docker/usage.ts
@@ -0,0 +1,66 @@
+```typescript title="usage.ts"
+import { exec } from "child_process";
+import { promisify } from "util";
+import axios from "axios";
+import { readFileSync } from "fs";
+import { basename } from "path";
+import FormData from "form-data";
+
+const execAsync = promisify(exec);
+
+/**
+ * Small helper that starts the Kreuzberg image with `docker run`, posts a file
+ * to the HTTP API published on `apiPort`, and removes the container afterwards.
+ */
+class DockerKreuzbergClient {
+  private readonly containerName: string;
+  private readonly containerImage: string;
+  private readonly apiPort: number;
+
+  /**
+   * @param containerName - Name given to the container (`--name`).
+   * @param containerImage - Image reference passed to `docker run`.
+   * @param apiPort - Host port mapped to port 8000 inside the container.
+   */
+  constructor(
+    containerName: string = "kreuzberg-api",
+    containerImage: string = "kreuzberg:latest",
+    apiPort: number = 8000
+  ) {
+    this.containerName = containerName;
+    this.containerImage = containerImage;
+    this.apiPort = apiPort;
+  }
+
+  /** Starts the container detached; rejects if the `docker run` command fails. */
+  async startContainer(): Promise<void> {
+    console.log("Starting Kreuzberg Docker container...");
+    const cmd = `docker run -d --name ${this.containerName} -p ${this.apiPort}:8000 ${this.containerImage}`;
+    await execAsync(cmd);
+    console.log(`Container started on http://localhost:${this.apiPort}`);
+  }
+
+  /**
+   * Uploads one file to the API and returns its extracted text.
+   *
+   * @param filePath - Local path of the file to upload.
+   * @returns The `content` field of the first (only) result.
+   */
+  async extractFile(filePath: string): Promise<string> {
+    const fileContent = readFileSync(filePath);
+    const form = new FormData();
+    // POST /extract takes uploads under the `files` field and answers with one
+    // result per uploaded file, so the single result is at index 0.
+    form.append("files", fileContent, basename(filePath));
+
+    const response = await axios.post(`http://localhost:${this.apiPort}/extract`, form, {
+      headers: form.getHeaders(),
+    });
+
+    return response.data[0].content;
+  }
+
+  /** Stops and then removes the container; rejects if either docker command fails. */
+  async stopContainer(): Promise<void> {
+    console.log("Stopping Kreuzberg Docker container...");
+    await execAsync(`docker stop ${this.containerName}`);
+    await execAsync(`docker rm ${this.containerName}`);
+    console.log("Container stopped and removed");
+  }
+}
+
+// Start the container, extract one document, and always stop the container
+// again. The trailing `.catch` reports failures instead of leaving the promise
+// rejection unhandled.
+(async () => {
+  const dockerClient = new DockerKreuzbergClient();
+
+  try {
+    await dockerClient.startContainer();
+    // Fixed two-second pause before the first request.
+    await new Promise((resolve) => setTimeout(resolve, 2000));
+
+    const content = await dockerClient.extractFile("document.pdf");
+    console.log(`Extracted content:\n${content}`);
+  } finally {
+    await dockerClient.stopContainer();
+  }
+})().catch(console.error);
+```
--- a/docs/snippets/typescript/getting-started/basic_usage.md
+++ b/docs/snippets/typescript/getting-started/basic_usage.md
@@ -0,0 +1,13 @@
+```typescript title="TypeScript"
+import { extractFileSync } from "@kreuzberg/node";
+
+// Minimal config: two top-level boolean flags.
+const config = {
+  useCache: true,
+  enableQualityProcessing: true,
+};
+
+// Synchronous extraction; the config is the third argument.
+const result = extractFileSync("document.pdf", null, config);
+
+// Print the extracted text followed by the MIME type reported on the result.
+console.log(result.content);
+console.log(`MIME Type: ${result.mimeType}`);
+```
--- a/docs/snippets/typescript/getting-started/batch_extract_bytes_sync.md
+++ b/docs/snippets/typescript/getting-started/batch_extract_bytes_sync.md
@@ -0,0 +1,17 @@
+```typescript title="TypeScript"
+import { batchExtractBytesSync } from "@kreuzberg/node";
+import { readFileSync } from "fs";
+
+// Read every file into memory; `mimeTypes` lists one MIME type per file, in the same order.
+const files = ["doc1.pdf", "doc2.docx"];
+const dataList = files.map((path) => readFileSync(path));
+const mimeTypes = [
+  "application/pdf",
+  "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+];
+
+const results = batchExtractBytesSync(dataList, mimeTypes);
+
+// Report the text length of each result, numbered from 1.
+for (const [index, { content }] of results.entries()) {
+  console.log(`Document ${index + 1}: ${content.length} characters`);
+}
+```
--- a/docs/snippets/typescript/getting-started/batch_extract_files_sync.md
+++ b/docs/snippets/typescript/getting-started/batch_extract_files_sync.md
@@ -0,0 +1,10 @@
+```typescript title="TypeScript"
+import { batchExtractFilesSync } from "@kreuzberg/node";
+
+// Extract several files in one synchronous batch call.
+const files = ["doc1.pdf", "doc2.docx", "doc3.pptx"];
+const results = batchExtractFilesSync(files);
+
+// Report the text length of each result, numbered from 1.
+for (const [index, { content }] of results.entries()) {
+  console.log(`File ${index + 1}: ${content.length} characters`);
+}
+```
--- a/docs/snippets/typescript/getting-started/client_extract_multiple_files.md
+++ b/docs/snippets/typescript/getting-started/client_extract_multiple_files.md
@@ -0,0 +1,12 @@
+```typescript title="TypeScript"
+// Multiple files
+// `fileInput` is not defined in this fragment.
+// NOTE(review): presumably an <input type="file" multiple> element; confirm against the surrounding page.
+const multipleFiles = new FormData();
+for (const file of fileInput.files) {
+  // Every file is appended under the same "files" field name.
+  multipleFiles.append("files", file);
+}
+
+const response3 = await fetch("http://localhost:8000/extract", {
+  method: "POST",
+  body: multipleFiles,
+});
+```
--- a/docs/snippets/typescript/getting-started/client_extract_single_file.md
+++ b/docs/snippets/typescript/getting-started/client_extract_single_file.md
@@ -0,0 +1,13 @@
+```typescript title="TypeScript"
+// Using fetch API
+// `fileInput` is not defined in this fragment.
+// NOTE(review): presumably an <input type="file"> element; confirm against the surrounding page.
+const formData = new FormData();
+formData.append("files", fileInput.files[0]);
+
+const response = await fetch("http://localhost:8000/extract", {
+  method: "POST",
+  body: formData,
+});
+
+// The response body is read as an array; the single upload's result is at index 0.
+const results = await response.json();
+console.log(results[0].content);
+```
--- a/docs/snippets/typescript/getting-started/client_extract_with_config.md
+++ b/docs/snippets/typescript/getting-started/client_extract_with_config.md
@@ -0,0 +1,17 @@
+```typescript title="TypeScript"
+// With configuration
+// `fileInput` is not defined in this fragment.
+// NOTE(review): presumably an <input type="file"> element; confirm against the surrounding page.
+const formDataWithConfig = new FormData();
+formDataWithConfig.append("files", fileInput.files[0]);
+// The extraction config travels as a JSON string in a separate "config" form field.
+formDataWithConfig.append(
+  "config",
+  JSON.stringify({
+    ocr: { language: "eng" },
+    force_ocr: true,
+  }),
+);
+
+const response2 = await fetch("http://localhost:8000/extract", {
+  method: "POST",
+  body: formDataWithConfig,
+});
+```
--- a/docs/snippets/typescript/getting-started/combining_all_features.md
+++ b/docs/snippets/typescript/getting-started/combining_all_features.md
@@ -0,0 +1,36 @@
+```typescript title="TypeScript"
+import { extractFile } from "@kreuzberg/node";
+
+// One config that switches on quality processing, language detection, token
+// reduction, chunking with embeddings, and keyword extraction together.
+const config = {
+  enableQualityProcessing: true,
+  languageDetection: {
+    enabled: true,
+    detectMultiple: true,
+  },
+  tokenReduction: {
+    mode: "moderate",
+    preserveImportantWords: true,
+  },
+  chunking: {
+    maxChars: 512,
+    maxOverlap: 50,
+    embedding: {
+      // The embedding model is selected through `model` as a { type, name }
+      // pair, the same shape the typed EmbeddingConfig examples use.
+      model: {
+        type: "preset",
+        name: "balanced",
+      },
+    },
+  },
+  keywords: {
+    algorithm: "yake",
+    maxKeywords: 10,
+  },
+};
+
+const result = await extractFile("document.pdf", null, config);
+
+// Summarize the result; the optional parts are printed only when present.
+console.log(`Content length: ${result.content.length}`);
+if (result.detectedLanguages) {
+  console.log(`Languages: ${result.detectedLanguages.join(", ")}`);
+}
+if (result.chunks && result.chunks.length > 0) {
+  console.log(`Chunks: ${result.chunks.length}`);
+}
+```
--- a/docs/snippets/typescript/getting-started/complete_example.md
+++ b/docs/snippets/typescript/getting-started/complete_example.md
@@ -0,0 +1,48 @@
+```typescript title="TypeScript"
+import { extractFile } from "@kreuzberg/node";
+
+const config = {
+  useCache: true,
+  enableQualityProcessing: true,
+  forceOcr: false,
+  ocr: {
+    backend: "tesseract",
+    language: "eng+fra",
+    tesseractConfig: {
+      psm: 3,
+      enableTableDetection: true,
+    },
+  },
+  pdfOptions: {
+    extractImages: true,
+    extractMetadata: true,
+  },
+  images: {
+    extractImages: true,
+    targetDpi: 150,
+    maxImageDimension: 2048,
+  },
+  chunking: {
+    maxChars: 1000,
+    maxOverlap: 200,
+    embedding: {
+      preset: "balanced",
+    },
+  },
+  tokenReduction: {
+    mode: "moderate",
+    preserveImportantWords: true,
+  },
+  languageDetection: {
+    enabled: true,
+    minConfidence: 0.8,
+    detectMultiple: false,
+  },
+  postprocessor: {
+    enabled: true,
+  },
+};
+
+const result = await extractFile("document.pdf", null, config);
+console.log(`Extracted content length: ${result.content.length}`);
+```
--- a/docs/snippets/typescript/getting-started/extract_bytes_async.md
+++ b/docs/snippets/typescript/getting-started/extract_bytes_async.md
@@ -0,0 +1,8 @@
+```typescript title="TypeScript"
+import { extractBytes } from "@kreuzberg/node";
+import { readFile } from "fs/promises";
+
+const data = await readFile("document.pdf");
+const result = await extractBytes(data, "application/pdf");
+console.log(result.content);
+```
--- a/docs/snippets/typescript/getting-started/extract_bytes_sync.md
+++ b/docs/snippets/typescript/getting-started/extract_bytes_sync.md
@@ -0,0 +1,8 @@
+```typescript title="TypeScript"
+import { extractBytesSync } from "@kreuzberg/node";
+import { readFileSync } from "fs";
+
+const data = readFileSync("document.pdf");
+const result = extractBytesSync(data, "application/pdf");
+console.log(result.content);
+```
--- a/docs/snippets/typescript/getting-started/extract_file.md
+++ b/docs/snippets/typescript/getting-started/extract_file.md
@@ -0,0 +1,9 @@
+```typescript title="TypeScript"
+import { extractFile } from "@kreuzberg/node";
+
+const result = await extractFile("document.pdf");
+
+console.log(result.content);
+console.log(`Tables: ${result.tables.length}`);
+console.log(`Metadata: ${JSON.stringify(result.metadata)}`);
+```
--- a/docs/snippets/typescript/getting-started/extract_file_async.md
+++ b/docs/snippets/typescript/getting-started/extract_file_async.md
@@ -0,0 +1,6 @@
+```typescript title="TypeScript"
+import { extractFile } from "@kreuzberg/node";
+
+const result = await extractFile("document.pdf");
+console.log(result.content);
+```
--- a/docs/snippets/typescript/getting-started/extract_file_sync.md
+++ b/docs/snippets/typescript/getting-started/extract_file_sync.md
@@ -0,0 +1,9 @@
+```typescript title="TypeScript"
+import { extractFileSync } from "@kreuzberg/node";
+
+const result = extractFileSync("document.pdf");
+
+console.log(result.content);
+console.log(`Tables: ${result.tables.length}`);
+console.log(`Metadata: ${JSON.stringify(result.metadata)}`);
+```
--- a/docs/snippets/typescript/getting-started/extract_with_ocr.md
+++ b/docs/snippets/typescript/getting-started/extract_with_ocr.md
@@ -0,0 +1,16 @@
+```typescript title="TypeScript"
+import { extractFileSync } from "@kreuzberg/node";
+
+const config = {
+  forceOcr: true,
+  ocr: {
+    backend: "tesseract",
+    language: "eng",
+  },
+};
+
+const result = extractFileSync("scanned.pdf", null, config);
+
+console.log(result.content);
+console.log(`Detected Languages: ${result.detectedLanguages?.join(", ") ?? "none"}`);
+```
--- a/docs/snippets/typescript/getting-started/hello_world.md
+++ b/docs/snippets/typescript/getting-started/hello_world.md
@@ -0,0 +1,6 @@
+```typescript title="TypeScript"
+import { extractFileSync } from "@kreuzberg/node";
+
+const result = extractFileSync("document.pdf");
+console.log(result.content);
+```
--- a/docs/snippets/typescript/getting-started/install_verify.md
+++ b/docs/snippets/typescript/getting-started/install_verify.md
@@ -0,0 +1,9 @@
+```typescript title="TypeScript"
+import { getVersion, extractFileSync } from "@kreuzberg/node";
+
+const version = getVersion();
+console.log(`Kreuzberg version: ${version}`);
+
+const result = extractFileSync("document.pdf");
+console.log(`Extraction successful: ${result.success}`);
+```
--- a/docs/snippets/typescript/getting-started/read_content.md
+++ b/docs/snippets/typescript/getting-started/read_content.md
@@ -0,0 +1,13 @@
+```typescript title="TypeScript"
+import { extractFileSync } from "@kreuzberg/node";
+
+const result = extractFileSync("document.pdf");
+
+console.log(`Content: ${result.content}`);
+console.log(`Success: ${result.success}`);
+console.log(`Content Length: ${result.content.length}`);
+
+if (result.metadata.page_count) {
+  console.log(`Pages: ${result.metadata.page_count}`);
+}
+```
--- a/docs/snippets/typescript/llm/structured_extraction.md
+++ b/docs/snippets/typescript/llm/structured_extraction.md
@@ -0,0 +1,25 @@
+```typescript title="TypeScript"
+import { extractFileSync } from "@kreuzberg/node";
+
+const config = {
+  structuredExtraction: {
+    schema: {
+      type: "object",
+      properties: {
+        title: { type: "string" },
+        authors: { type: "array", items: { type: "string" } },
+        date: { type: "string" },
+      },
+      required: ["title", "authors", "date"],
+      additionalProperties: false,
+    },
+    llm: {
+      model: "openai/gpt-4o-mini",
+    },
+    strict: true,
+  },
+};
+
+const result = extractFileSync("paper.pdf", null, config);
+console.log(result.structuredOutput);
+```
--- a/docs/snippets/typescript/llm/vlm_ocr.md
+++ b/docs/snippets/typescript/llm/vlm_ocr.md
@@ -0,0 +1,16 @@
+```typescript title="TypeScript"
+import { extractFileSync } from "@kreuzberg/node";
+
+const config = {
+  forceOcr: true,
+  ocr: {
+    backend: "vlm",
+    vlmConfig: {
+      model: "openai/gpt-4o-mini",
+    },
+  },
+};
+
+const result = extractFileSync("scan.pdf", null, config);
+console.log(result.content);
+```
--- a/docs/snippets/typescript/mcp/client.md
+++ b/docs/snippets/typescript/mcp/client.md
@@ -0,0 +1,174 @@
+```typescript title="TypeScript"
+import { spawn } from "node:child_process";
+import * as readline from "node:readline";
+
+/**
+ * MCP Client for Kreuzberg
+ * Communicates with Kreuzberg MCP server via stdio, exchanging one JSON
+ * message per line and matching responses to requests by numeric id.
+ * @example
+ * const client = new KreuzbergMcpClient();
+ * await client.connect();
+ * const result = await client.callTool("extract_file", { path: "doc.pdf" });
+ */
+class KreuzbergMcpClient {
+  private process: ReturnType<typeof spawn> | null = null;
+  private rl: readline.Interface | null = null;
+  private requestId: number = 0;
+  private pendingRequests: Map<
+    number,
+    {
+      resolve: (value: unknown) => void;
+      reject: (error: Error) => void;
+    }
+  > = new Map();
+
+  /**
+   * Connect to MCP server
+   *
+   * Spawns `kreuzberg mcp`, wires up response handling and resolves after the
+   * first `tools/list` round trip. Rejects if the server cannot be started or
+   * exits before answering.
+   */
+  async connect(): Promise<void> {
+    // Keep the spawned child in a local: spawn() without stdio options returns
+    // a child whose stdin/stdout/stderr are non-null, which the wider field
+    // type cannot express.
+    const child = spawn("kreuzberg", ["mcp"]);
+    this.process = child;
+
+    // Fail in-flight requests instead of leaving them pending forever. The
+    // identity check ignores late events from a child that was replaced or
+    // disconnected.
+    const failIfCurrent = (error: Error): void => {
+      if (this.process === child) {
+        this.rejectAllPending(error);
+      }
+    };
+    child.on("error", failIfCurrent);
+    child.on("exit", () => failIfCurrent(new Error("MCP server exited")));
+    child.stdin.on("error", failIfCurrent);
+
+    this.rl = readline.createInterface({
+      input: child.stdout,
+      output: child.stdin,
+      terminal: false,
+    });
+
+    // Handle incoming responses
+    this.rl.on("line", (line) => {
+      try {
+        const response = JSON.parse(line) as {
+          id: number;
+          result?: unknown;
+          error?: { message: string };
+        };
+        const pending = this.pendingRequests.get(response.id);
+
+        // Lines without a matching pending id are ignored.
+        if (pending) {
+          if (response.error) {
+            pending.reject(new Error(response.error.message));
+          } else {
+            pending.resolve(response.result);
+          }
+          this.pendingRequests.delete(response.id);
+        }
+      } catch (error) {
+        console.error("Failed to parse response:", error);
+      }
+    });
+
+    // Handle errors
+    child.stderr.on("data", (data) => {
+      console.error("MCP server error:", data.toString());
+    });
+
+    // Wait for initialization
+    // NOTE(review): the MCP specification defines an `initialize` handshake
+    // before other requests; confirm whether the target server requires it.
+    await this.listTools();
+  }
+
+  /**
+   * List available tools
+   *
+   * Accepts both a bare array result and the `{ tools: [...] }` envelope
+   * defined by the MCP specification; any other shape yields an empty list.
+   */
+  async listTools(): Promise<Array<{ name: string; description: string }>> {
+    const result = await this.sendRequest("tools/list", {});
+    if (Array.isArray(result)) {
+      return result as Array<{ name: string; description: string }>;
+    }
+
+    const tools = (result as { tools?: unknown } | null | undefined)?.tools;
+    return Array.isArray(tools) ? (tools as Array<{ name: string; description: string }>) : [];
+  }
+
+  /**
+   * Call a tool on the server
+   * @param toolName - Name of the tool to invoke
+   * @param args - Arguments forwarded to the tool unchanged
+   * @returns The `result` member of the server response
+   */
+  async callTool(toolName: string, args: Record<string, unknown>): Promise<unknown> {
+    return this.sendRequest("tools/call", {
+      name: toolName,
+      arguments: args,
+    });
+  }
+
+  /**
+   * Extract file from path
+   * @param path - Path of the document to extract
+   * @param async - Forwarded as the tool's `async` argument
+   */
+  async extractFile(path: string, async: boolean = false): Promise<Record<string, unknown>> {
+    return this.callTool("extract_file", {
+      path,
+      async,
+    }) as Promise<Record<string, unknown>>;
+  }
+
+  /**
+   * Extract from bytes
+   * @param data - Raw document bytes; sent to the server base64-encoded
+   * @param mimeType - MIME type of the document
+   * @param async - Forwarded as the tool's `async` argument
+   */
+  async extractBytes(
+    data: Uint8Array,
+    mimeType: string,
+    async: boolean = false,
+  ): Promise<Record<string, unknown>> {
+    const base64 = Buffer.from(data).toString("base64");
+    return this.callTool("extract_bytes", {
+      data: base64,
+      mimeType,
+      async,
+    }) as Promise<Record<string, unknown>>;
+  }
+
+  /**
+   * Send request to server
+   *
+   * Rejects immediately when the client is not connected; otherwise the
+   * promise settles when a response with the same id arrives, or when the
+   * server fails or the client disconnects.
+   */
+  private sendRequest(method: string, params: Record<string, unknown>): Promise<unknown> {
+    return new Promise((resolve, reject) => {
+      const stdin = this.process?.stdin;
+      if (!stdin) {
+        reject(new Error("MCP client is not connected"));
+        return;
+      }
+
+      const id = ++this.requestId;
+      this.pendingRequests.set(id, { resolve, reject });
+
+      const request = {
+        jsonrpc: "2.0",
+        id,
+        method,
+        params,
+      };
+
+      stdin.write(JSON.stringify(request) + "\n");
+    });
+  }
+
+  /**
+   * Reject every in-flight request with the given error and forget them
+   */
+  private rejectAllPending(error: Error): void {
+    this.pendingRequests.forEach((pending) => pending.reject(error));
+    this.pendingRequests.clear();
+  }
+
+  /**
+   * Disconnect from server
+   *
+   * Closes the line reader, kills the child process and rejects any request
+   * that is still waiting for a response.
+   */
+  disconnect(): void {
+    this.rl?.close();
+    this.process?.kill();
+    this.rl = null;
+    this.process = null;
+    this.rejectAllPending(new Error("MCP client disconnected"));
+  }
+}
+
+// Usage example
+async function main(): Promise<void> {
+  const mcp = new KreuzbergMcpClient();
+
+  try {
+    // Connect to MCP server
+    await mcp.connect();
+    console.log("Connected to Kreuzberg MCP server");
+
+    // List available tools, printing only their names
+    const toolNames = (await mcp.listTools()).map((tool) => tool.name);
+    console.log("Available tools:", toolNames);
+
+    // Extract file
+    const extraction = await mcp.extractFile("document.pdf", true);
+    console.log("Extraction result:", extraction);
+  } catch (error) {
+    console.error("Error:", error);
+  } finally {
+    // Runs on success and on failure alike
+    mcp.disconnect();
+  }
+}
+
+// Run if executed directly
+if (require.main === module) {
+  main();
+}
+
+export { KreuzbergMcpClient };
+```
--- a/docs/snippets/typescript/mcp/mcp_custom_client.md
+++ b/docs/snippets/typescript/mcp/mcp_custom_client.md
@@ -0,0 +1,35 @@
+```typescript title="TypeScript"
+import { spawn } from "child_process";
+import * as readline from "readline";
+
+// Start the Kreuzberg MCP server as a child process and talk to it over stdio.
+const mcpProcess = spawn("kreuzberg", ["mcp"]);
+
+const rl = readline.createInterface({
+  input: mcpProcess.stdout,
+  output: mcpProcess.stdin,
+  terminal: false,
+});
+
+// JSON-RPC 2.0 requires the `jsonrpc` member, and a request without an `id`
+// is a notification that the server does not answer.
+const request = {
+  jsonrpc: "2.0",
+  id: 1,
+  method: "tools/call",
+  params: {
+    name: "extract_file",
+    arguments: {
+      path: "document.pdf",
+      async: true,
+    },
+  },
+};
+
+mcpProcess.stdin.write(JSON.stringify(request) + "\n");
+
+rl.on("line", (line) => {
+  let response: { id?: unknown };
+  try {
+    response = JSON.parse(line);
+  } catch (err) {
+    // Skip output that is not a JSON message instead of crashing the handler.
+    console.error("Ignoring non-JSON output:", line);
+    return;
+  }
+
+  // Only the reply to our request ends the session; other messages are ignored.
+  if (response.id !== request.id) {
+    return;
+  }
+
+  console.log(response);
+  mcpProcess.kill();
+});
+
+mcpProcess.on("error", (err) => {
+  console.error("Failed to start MCP process:", err);
+});
+```
--- a/docs/snippets/typescript/mcp/mcp_server_start.md
+++ b/docs/snippets/typescript/mcp/mcp_server_start.md
@@ -0,0 +1,17 @@
+```typescript title="TypeScript"
+import { spawn } from "child_process";
+
+const mcpProcess = spawn("kreuzberg", ["mcp"]);
+
+mcpProcess.stdout.on("data", (data) => {
+  console.log(`MCP Server: ${data}`);
+});
+
+mcpProcess.stderr.on("data", (data) => {
+  console.error(`MCP Error: ${data}`);
+});
+
+mcpProcess.on("error", (err) => {
+  console.error(`Failed to start MCP server: ${err.message}`);
+});
+```
--- a/docs/snippets/typescript/mcp/server.md
+++ b/docs/snippets/typescript/mcp/server.md
@@ -0,0 +1,193 @@
+```typescript title="TypeScript"
+import {
+  extractFile,
+  extractFileSync,
+  extractBytes,
+  extractBytesSync,
+  type ExtractionResult,
+  type ExtractionConfig,
+} from "@kreuzberg/node";
+import * as readline from "node:readline";
+
+/**
+ * MCP Server for Kreuzberg
+ * Exposes document extraction as MCP tools, reading one JSON request per
+ * line from stdin and writing one JSON response per line to stdout.
+ * @example
+ * const server = new KreuzbergMcpServer();
+ * await server.start();
+ */
+class KreuzbergMcpServer {
+  private config?: ExtractionConfig;
+  private rl: readline.Interface;
+
+  /**
+   * @param config - Default extraction config, used when a tool call does not
+   * supply its own `config` argument
+   */
+  constructor(config?: ExtractionConfig) {
+    this.config = config;
+    this.rl = readline.createInterface({
+      input: process.stdin,
+      output: process.stdout,
+      terminal: false,
+    });
+  }
+
+  /**
+   * Start MCP server
+   *
+   * Registers the line handler and returns immediately. Diagnostics go to
+   * stderr because stdout carries the protocol messages.
+   */
+  async start(): Promise<void> {
+    console.error("[MCP Server] Starting Kreuzberg MCP server");
+
+    this.rl.on("line", async (line) => {
+      // Remember the request id outside the try block so a failure response
+      // can be matched to its request; it stays null when the line could not
+      // be parsed.
+      let requestId: number | null = null;
+
+      try {
+        const request = JSON.parse(line) as {
+          id: number;
+          method: string;
+          params: Record<string, unknown>;
+        };
+        requestId = typeof request.id === "number" ? request.id : null;
+
+        const response = await this.handleRequest(request);
+        process.stdout.write(JSON.stringify(response) + "\n");
+      } catch (error) {
+        const errorMessage = error instanceof Error ? error.message : "Unknown error";
+        process.stdout.write(
+          JSON.stringify({
+            jsonrpc: "2.0",
+            id: requestId,
+            error: { message: errorMessage },
+          }) + "\n",
+        );
+      }
+    });
+  }
+
+  /**
+   * Handle incoming MCP request
+   * @returns A response object echoing the request id
+   * @throws Error when the method is not `tools/list` or `tools/call`
+   */
+  private async handleRequest(request: {
+    id: number;
+    method: string;
+    params: Record<string, unknown>;
+  }): Promise<Record<string, unknown>> {
+    const { id, method, params } = request;
+
+    if (method === "tools/list") {
+      return {
+        jsonrpc: "2.0",
+        id,
+        result: this.listTools(),
+      };
+    }
+
+    if (method === "tools/call") {
+      const result = await this.callTool(
+        params.name as string,
+        params.arguments as Record<string, unknown>,
+      );
+      return {
+        jsonrpc: "2.0",
+        id,
+        result,
+      };
+    }
+
+    throw new Error(`Unknown method: ${method}`);
+  }
+
+  /**
+   * List available tools
+   * @returns Name, description and JSON Schema of the accepted arguments for
+   * each tool
+   */
+  private listTools(): Array<{
+    name: string;
+    description: string;
+    inputSchema: Record<string, unknown>;
+  }> {
+    return [
+      {
+        name: "extract_file",
+        description: "Extract content from a file by path",
+        inputSchema: {
+          type: "object",
+          properties: {
+            path: { type: "string", description: "Path to file" },
+            async: { type: "boolean", description: "Use async extraction" },
+            config: {
+              type: "object",
+              description: "Optional extraction config",
+            },
+          },
+          required: ["path"],
+        },
+      },
+      {
+        name: "extract_bytes",
+        description: "Extract content from raw bytes",
+        inputSchema: {
+          type: "object",
+          properties: {
+            data: { type: "string", description: "Base64-encoded data" },
+            mimeType: { type: "string", description: "MIME type" },
+            async: { type: "boolean", description: "Use async extraction" },
+            config: {
+              type: "object",
+              description: "Optional extraction config",
+            },
+          },
+          required: ["data", "mimeType"],
+        },
+      },
+    ];
+  }
+
+  /**
+   * Call tool
+   *
+   * Dispatches to the matching extraction function. `async` defaults to true
+   * and `config` falls back to the config given to the constructor.
+   * @throws Error when the tool name is not recognised
+   */
+  private async callTool(name: string, args: Record<string, unknown>): Promise<unknown> {
+    if (name === "extract_file") {
+      const path = args.path as string;
+      const useAsync = (args.async as boolean) ?? true;
+      const config = (args.config as ExtractionConfig) ?? this.config;
+
+      if (useAsync) {
+        return extractFile(path, null, config);
+      } else {
+        return extractFileSync(path, null, config);
+      }
+    }
+
+    if (name === "extract_bytes") {
+      // The payload arrives base64-encoded because the transport is JSON text.
+      const data = Buffer.from(args.data as string, "base64");
+      const mimeType = args.mimeType as string;
+      const useAsync = (args.async as boolean) ?? true;
+      const config = (args.config as ExtractionConfig) ?? this.config;
+
+      if (useAsync) {
+        return extractBytes(data, mimeType, config);
+      } else {
+        return extractBytesSync(data, mimeType, config);
+      }
+    }
+
+    throw new Error(`Unknown tool: ${name}`);
+  }
+}
+
+/**
+ * Main entry point: builds the default extraction config and starts the
+ * server with it.
+ */
+async function main(): Promise<void> {
+  const defaultConfig: ExtractionConfig = {
+    ocr: { enabled: true, backend: "tesseract" },
+  };
+
+  await new KreuzbergMcpServer(defaultConfig).start();
+}
+
+// Start server when invoked as MCP
+if (process.argv[2] === "mcp") {
+  main().catch((error) => {
+    console.error("Server error:", error);
+    process.exit(1);
+  });
+}
+
+export { KreuzbergMcpServer };
+```
--- a/docs/snippets/typescript/metadata/language_detection.md
+++ b/docs/snippets/typescript/metadata/language_detection.md
@@ -0,0 +1,14 @@
+```typescript title="TypeScript"
+import { extractFile } from "@kreuzberg/node";
+
+const config = {
+  languageDetection: {
+    enabled: true,
+    minConfidence: 0.9,
+    detectMultiple: true,
+  },
+};
+
+const result = await extractFile("document.pdf", null, config);
+console.log(result.content);
+```
--- a/docs/snippets/typescript/metadata/language_detection_multilingual.md
+++ b/docs/snippets/typescript/metadata/language_detection_multilingual.md
@@ -0,0 +1,16 @@
+```typescript title="TypeScript"
+import { extractFile } from "@kreuzberg/node";
+
+const config = {
+  languageDetection: {
+    enabled: true,
+    minConfidence: 0.8,
+    detectMultiple: true,
+  },
+};
+
+const result = await extractFile("multilingual_document.pdf", null, config);
+if (result.detectedLanguages) {
+  console.log(`Detected languages: ${result.detectedLanguages.join(", ")}`);
+}
+```
--- a/docs/snippets/typescript/metadata/metadata.md
+++ b/docs/snippets/typescript/metadata/metadata.md
@@ -0,0 +1,83 @@
+```typescript title="TypeScript"
+import { extractFileSync } from "@kreuzberg/node";
+
+const result = extractFileSync("document.pdf");
+console.log(`Metadata: ${JSON.stringify(result.metadata)}`);
+if (result.metadata.pageCount) {
+  console.log(`Pages: ${result.metadata.pageCount}`);
+}
+
+const htmlResult = extractFileSync("page.html");
+console.log(`HTML Metadata: ${JSON.stringify(htmlResult.metadata)}`);
+
+const htmlMeta = htmlResult.metadata;
+if (htmlMeta.title) {
+  console.log(`Title: ${htmlMeta.title}`);
+}
+
+// Access keywords as array
+if (htmlMeta.keywords && htmlMeta.keywords.length > 0) {
+  console.log(`Keywords: ${htmlMeta.keywords.join(", ")}`);
+}
+
+// Access canonical URL (renamed from canonical)
+if (htmlMeta.canonicalUrl) {
+  console.log(`Canonical URL: ${htmlMeta.canonicalUrl}`);
+}
+
+// Access Open Graph fields from map
+if (htmlMeta.openGraph) {
+  if (htmlMeta.openGraph["image"]) {
+    console.log(`Open Graph Image: ${htmlMeta.openGraph["image"]}`);
+  }
+  if (htmlMeta.openGraph["title"]) {
+    console.log(`Open Graph Title: ${htmlMeta.openGraph["title"]}`);
+  }
+  if (htmlMeta.openGraph["type"]) {
+    console.log(`Open Graph Type: ${htmlMeta.openGraph["type"]}`);
+  }
+}
+
+// Access Twitter Card fields from map
+if (htmlMeta.twitterCard) {
+  if (htmlMeta.twitterCard["card"]) {
+    console.log(`Twitter Card Type: ${htmlMeta.twitterCard["card"]}`);
+  }
+  if (htmlMeta.twitterCard["creator"]) {
+    console.log(`Twitter Creator: ${htmlMeta.twitterCard["creator"]}`);
+  }
+}
+
+// Access new fields
+if (htmlMeta.language) {
+  console.log(`Language: ${htmlMeta.language}`);
+}
+
+if (htmlMeta.textDirection) {
+  console.log(`Text Direction: ${htmlMeta.textDirection}`);
+}
+
+// Access headers
+if (htmlMeta.headers && htmlMeta.headers.length > 0) {
+  console.log(`Headers: ${htmlMeta.headers.map((h) => h.text).join(", ")}`);
+}
+
+// Access links
+if (htmlMeta.links && htmlMeta.links.length > 0) {
+  htmlMeta.links.forEach((link) => {
+    console.log(`Link: ${link.href} (${link.text})`);
+  });
+}
+
+// Access images
+if (htmlMeta.images && htmlMeta.images.length > 0) {
+  htmlMeta.images.forEach((image) => {
+    console.log(`Image: ${image.src}`);
+  });
+}
+
+// Access structured data
+if (htmlMeta.structuredData && htmlMeta.structuredData.length > 0) {
+  console.log(`Structured data items: ${htmlMeta.structuredData.length}`);
+}
+```
--- a/docs/snippets/typescript/metadata/page_boundaries.md
+++ b/docs/snippets/typescript/metadata/page_boundaries.md
@@ -0,0 +1,18 @@
+```typescript title="TypeScript"
+import { extractFileSync } from '@kreuzberg/node';
+
+const result = extractFileSync('document.pdf');
+
+if (result.metadata.page_structure?.boundaries) {
+  // Boundaries are byte offsets, so slice the UTF-8 bytes rather than the string.
+  const encoder = new TextEncoder();
+  const decoder = new TextDecoder();
+  const contentBytes = encoder.encode(result.content);
+
+  // Preview the first three pages only.
+  for (const boundary of result.metadata.page_structure.boundaries.slice(0, 3)) {
+    const pageBytes = contentBytes.slice(boundary.byteStart, boundary.byteEnd);
+    const pageText = decoder.decode(pageBytes);
+
+    console.log(`Page ${boundary.pageNumber}:`);
+    console.log(`  Byte range: ${boundary.byteStart}-${boundary.byteEnd}`);
+    console.log(`  Preview: ${pageText.substring(0, 100)}...`);
+  }
+}
+```
--- a/docs/snippets/typescript/metadata/page_tracking_basic.md
+++ b/docs/snippets/typescript/metadata/page_tracking_basic.md
@@ -0,0 +1,12 @@
+```typescript title="TypeScript"
+import { extractFileSync } from '@kreuzberg/node';
+
+// Enable page extraction in the config passed to the extractor.
+const result = extractFileSync('document.pdf', null, { pages: { extractPages: true } });
+
+if (result.pages) {
+  for (const page of result.pages) {
+    console.log(`Page ${page.pageNumber}:`);
+    console.log(`  Content: ${page.content.length} chars`);
+    console.log(`  Tables: ${page.tables.length}`);
+    console.log(`  Images: ${page.images.length}`);
+  }
+}
+```
--- a/docs/snippets/typescript/metadata/tables.md
+++ b/docs/snippets/typescript/metadata/tables.md
@@ -0,0 +1,22 @@
+```typescript title="TypeScript"
+import { extractFileSync } from "@kreuzberg/node";
+
+const result = extractFileSync("document.pdf");
+
+if (result.tables) {
+  for (const table of result.tables) {
+    const rowCount = table.cells?.length ?? 0;
+    console.log(`Table with ${rowCount} rows`);
+
+    if (table.markdown) {
+      console.log(table.markdown);
+    }
+
+    if (table.cells) {
+      for (const row of table.cells) {
+        console.log(row);
+      }
+    }
+  }
+}
+```
--- a/docs/snippets/typescript/metadata/vector_database_integration.md
+++ b/docs/snippets/typescript/metadata/vector_database_integration.md
@@ -0,0 +1,49 @@
+```typescript title="TypeScript"
+import { extractFile } from "@kreuzberg/node";
+
+interface VectorRecord {
+  id: string;
+  content: string;
+  embedding: number[];
+  metadata: Record<string, string>;
+}
+
+/**
+ * Extract a document, chunk it with embeddings enabled, and convert each
+ * embedded chunk into a record ready for insertion into a vector store.
+ *
+ * @param documentPath - Path of the document to extract
+ * @param documentId - Identifier used to build record ids and metadata
+ * @returns One record per chunk that carries an embedding; chunks without an
+ * embedding are skipped
+ */
+async function extractAndVectorize(
+  documentPath: string,
+  documentId: string,
+): Promise<VectorRecord[]> {
+  const config = {
+    chunking: {
+      // camelCase keys, matching the other chunking examples and `batchSize` below
+      maxChars: 512,
+      maxOverlap: 50,
+      embedding: {
+        model: { type: "preset", name: "balanced" },
+        normalize: true,
+        batchSize: 32,
+      },
+    },
+  };
+
+  const result = await extractFile(documentPath, null, config);
+
+  const records: VectorRecord[] = [];
+  if (result.chunks) {
+    result.chunks.forEach((chunk, index) => {
+      if (chunk.embedding) {
+        records.push({
+          // The index is the chunk's position in the full chunk list, so ids
+          // stay stable even when some chunks have no embedding.
+          id: `${documentId}_chunk_${index}`,
+          content: chunk.content,
+          embedding: chunk.embedding,
+          metadata: {
+            document_id: documentId,
+            chunk_index: index.toString(),
+            content_length: chunk.content.length.toString(),
+          },
+        });
+      }
+    });
+  }
+
+  return records;
+}
+```
--- a/docs/snippets/typescript/ocr/cloud_ocr_backend.md
+++ b/docs/snippets/typescript/ocr/cloud_ocr_backend.md
@@ -0,0 +1,29 @@
+```typescript title="TypeScript"
+import { registerOcrBackend, extractFileSync } from "@kreuzberg/node";
+
+const supportedLangs = ["eng", "deu", "fra"];
+
+const cloudBackend = {
+  name: () => "cloud-ocr",
+  version: () => "1.0.0",
+  initialize: () => {},
+  shutdown: () => {},
+  process_image: async (imageBytes: Uint8Array, config: { language?: string }) => {
+    // Call your cloud OCR API with imageBytes and config.language.
+    return { content: "Extracted text", mime_type: "text/plain" };
+  },
+  supports_language: (lang: string) => supportedLangs.includes(lang),
+  backend_type: () => "Custom",
+  supported_languages: () => supportedLangs,
+};
+
+registerOcrBackend(cloudBackend);
+
+const result = extractFileSync("scanned.pdf", {
+  ocr: {
+    backend: "cloud-ocr",
+    language: "eng",
+  },
+});
+console.log(result.content);
+```
--- a/docs/snippets/typescript/ocr/image_extraction.md
+++ b/docs/snippets/typescript/ocr/image_extraction.md
@@ -0,0 +1,16 @@
+```typescript title="TypeScript"
+import { extractFileSync } from "@kreuzberg/node";
+
+const config = {
+  images: {
+    extractImages: true,
+    targetDpi: 200,
+    maxImageDimension: 2048,
+    injectPlaceholders: true, // set to false to extract images without markdown references
+    autoAdjustDpi: true,
+  },
+};
+
+const result = extractFileSync("document.pdf", config);
+console.log(`content length: ${result.content.length}`);
+```
--- a/docs/snippets/typescript/ocr/image_preprocessing.md
+++ b/docs/snippets/typescript/ocr/image_preprocessing.md
@@ -0,0 +1,22 @@
+```typescript title="TypeScript"
+import { extractFileSync } from "@kreuzberg/node";
+
+const config = {
+  ocr: {
+    backend: "tesseract",
+    language: "eng",
+    tesseractConfig: {
+      preprocessing: {
+        targetDpi: 300,
+        denoise: true,
+        deskew: true,
+        contrastEnhance: true,
+        binarizationMethod: "otsu",
+      },
+    },
+  },
+};
+
+const result = extractFileSync("document.pdf", config);
+console.log(`content length: ${result.content.length}`);
+```
--- a/docs/snippets/typescript/ocr/ocr_dpi_config.md
+++ b/docs/snippets/typescript/ocr/ocr_dpi_config.md
@@ -0,0 +1,15 @@
+```typescript title="TypeScript"
+import { extractFileSync } from "@kreuzberg/node";
+
+const config = {
+  ocr: {
+    backend: "tesseract",
+  },
+  pdfOptions: {
+    extractImages: true,
+  },
+};
+
+const result = extractFileSync("scanned.pdf", null, config);
+console.log(result.content);
+```
--- a/docs/snippets/typescript/ocr/ocr_easyocr.md
+++ b/docs/snippets/typescript/ocr/ocr_easyocr.md
@@ -0,0 +1 @@
+EasyOCR is only available in Python.
--- a/docs/snippets/typescript/ocr/ocr_elements.md
+++ b/docs/snippets/typescript/ocr/ocr_elements.md
@@ -0,0 +1,24 @@
+```typescript title="TypeScript"
+import { extractFileSync } from "@kreuzberg/node";
+
+const config = {
+  ocr: {
+    backend: "paddle-ocr",
+    language: "en",
+  },
+};
+
+const result = extractFileSync("scanned.pdf", null, config);
+
+if (result.ocrElements) {
+  for (const element of result.ocrElements) {
+    console.log(`Text: ${element.text}`);
+    console.log(`Confidence: ${element.confidence.recognition.toFixed(2)}`);
+    console.log(`Geometry:`, element.geometry);
+    if (element.rotation) {
+      console.log(`Rotation: ${element.rotation.angle}°`);
+    }
+    console.log();
+  }
+}
+```
--- a/docs/snippets/typescript/ocr/ocr_extraction.md
+++ b/docs/snippets/typescript/ocr/ocr_extraction.md
@@ -0,0 +1,13 @@
+```typescript title="TypeScript"
+import { extractFileSync } from "@kreuzberg/node";
+
+const config = {
+  ocr: {
+    backend: "tesseract",
+    language: "eng",
+  },
+};
+
+const result = extractFileSync("scanned.pdf", null, config);
+console.log(result.content);
+```
--- a/docs/snippets/typescript/ocr/ocr_force_all_pages.md
+++ b/docs/snippets/typescript/ocr/ocr_force_all_pages.md
@@ -0,0 +1,13 @@
+```typescript title="TypeScript"
+import { extractFileSync } from "@kreuzberg/node";
+
+const config = {
+  ocr: {
+    backend: "tesseract",
+  },
+  forceOcr: true,
+};
+
+const result = extractFileSync("document.pdf", null, config);
+console.log(result.content);
+```
--- a/docs/snippets/typescript/ocr/ocr_multi_language.md
+++ b/docs/snippets/typescript/ocr/ocr_multi_language.md
@@ -0,0 +1,13 @@
+```typescript title="TypeScript"
+import { extractFileSync } from "@kreuzberg/node";
+
+const config = {
+  ocr: {
+    backend: "tesseract",
+    language: "eng+deu+fra",
+  },
+};
+
+const result = extractFileSync("multilingual.pdf", null, config);
+console.log(result.content);
+```
--- a/docs/snippets/typescript/ocr/ocr_paddleocr.md
+++ b/docs/snippets/typescript/ocr/ocr_paddleocr.md
@@ -0,0 +1,14 @@
+```typescript title="TypeScript"
+import { extractFileSync } from "@kreuzberg/node";
+
+const config = {
+  ocr: {
+    backend: "paddle-ocr",
+    language: "en",
+    // modelTier: 'server', // for max accuracy
+  },
+};
+
+const result = extractFileSync("scanned.pdf", null, config);
+console.log(result.content);
+```
--- a/Show More
+++ b/Show More