Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/docs/snippets/rust/advanced/advanced_config.md
+++ b/docs/snippets/rust/advanced/advanced_config.md
@@ -0,0 +1,46 @@
+```rust title="Rust"
+use kreuzberg::{
+    extract_file_sync, ChunkingConfig, ExtractionConfig, LanguageDetectionConfig, OcrConfig,
+};
+
+fn main() -> kreuzberg::Result<()> {
+    // OCR, chunking and language detection are each switched on by supplying `Some(..)`.
+    let config = ExtractionConfig {
+        use_cache: true,
+        enable_quality_processing: true,
+
+        ocr: Some(OcrConfig {
+            backend: String::from("tesseract"),
+            language: String::from("eng+deu"),
+            ..Default::default()
+        }),
+
+        chunking: Some(ChunkingConfig {
+            max_characters: 1000,
+            overlap: 100,
+            ..Default::default()
+        }),
+
+        language_detection: Some(LanguageDetectionConfig {
+            enabled: true,
+            detect_multiple: true,
+            ..Default::default()
+        }),
+
+        ..Default::default()
+    };
+
+    let extraction = extract_file_sync("document.pdf", None, &config)?;
+
+    // Print the first 100 characters of every chunk; prints nothing when `chunks` is `None`.
+    for piece in extraction.chunks.into_iter().flatten() {
+        let head = piece.content.chars().take(100).collect::<String>();
+        println!("Chunk: {}...", head);
+    }
+
+    if let Some(found) = extraction.detected_languages {
+        println!("Languages: {:?}", found);
+    }
+    Ok(())
+}
+```
--- a/docs/snippets/rust/advanced/chunk_page_mapping.md
+++ b/docs/snippets/rust/advanced/chunk_page_mapping.md
@@ -0,0 +1,29 @@
+Use Kreuzberg::{extract_file_sync, ExtractionConfig, ChunkingConfig, PageConfig};
+
+Let config = ExtractionConfig {
+chunking: Some(ChunkingConfig {
+max_characters: 500,
+overlap: 50,
+..Default::default()
+}),
+pages: Some(PageConfig {
+extract_pages: true,
+..Default::default()
+}),
+..Default::default()
+};
+
+Let result = extract_file_sync("document.pdf", None, &config)?;
+
+If let Some(chunks) = result.chunks {
+for chunk in chunks {
+if let (Some(first), Some(last)) = (chunk.metadata.first_page, chunk.metadata.last_page) {
+let page_range = if first == last {
+format!("Page {}", first)
+} else {
+format!("Pages {}-{}", first, last)
+};
+println!("Chunk: {}... ({})", chunk.content.chars().take(50).collect::<String>(), page_range);
+}
+}
+}
--- a/docs/snippets/rust/advanced/chunking.md
+++ b/docs/snippets/rust/advanced/chunking.md
@@ -0,0 +1,21 @@
+```rust title="Rust"
+use kreuzberg::{ChunkingConfig, EmbeddingConfig, EmbeddingModelType, ExtractionConfig};
+
+fn main() {
+    let config = ExtractionConfig {
+        chunking: Some(ChunkingConfig {
+            embedding: Some(EmbeddingConfig {
+                model: EmbeddingModelType::Preset {
+                    name: String::from("text-embedding-all-minilm-l6-v2"),
+                },
+                ..EmbeddingConfig::default()
+            }),
+            overlap: 200,
+            max_characters: 1500,
+            ..ChunkingConfig::default()
+        }),
+        ..ExtractionConfig::default()
+    };
+    println!("{:?}", config.chunking); // Debug-print only the chunking section
+}
+```
--- a/docs/snippets/rust/advanced/chunking_config.md
+++ b/docs/snippets/rust/advanced/chunking_config.md
@@ -0,0 +1,39 @@
+```rust title="Rust"
+use kreuzberg::{ExtractionConfig, ChunkingConfig};
+
+let config = ExtractionConfig {
+    chunking: Some(ChunkingConfig {
+        max_characters: 1000,
+        overlap: 200,
+        ..Default::default() // required: the struct has more fields (e.g. `chunker_type`)
+    }),
+    ..Default::default()
+};
+```
+
+```rust title="Rust - Semantic"
+use kreuzberg::{ExtractionConfig, ChunkingConfig, ChunkerType};
+
+let config = ExtractionConfig {
+    chunking: Some(ChunkingConfig {
+        chunker_type: ChunkerType::Semantic, // every other chunking option keeps its default
+        ..ChunkingConfig::default()
+    }),
+    ..ExtractionConfig::default()
+};
+```
+
+```rust title="Rust - Prepend Heading Context"
+use kreuzberg::{ExtractionConfig, ChunkingConfig, ChunkerType};
+
+let config = ExtractionConfig {
+    chunking: Some(ChunkingConfig {
+        chunker_type: ChunkerType::Markdown,
+        prepend_heading_context: true,
+        max_characters: 500,
+        overlap: 50,
+        ..ChunkingConfig::default()
+    }),
+    ..ExtractionConfig::default()
+};
+```
--- a/docs/snippets/rust/advanced/chunking_rag.md
+++ b/docs/snippets/rust/advanced/chunking_rag.md
@@ -0,0 +1,36 @@
+```rust title="Rust"
+use kreuzberg::{extract_file, ExtractionConfig, ChunkingConfig, EmbeddingConfig, EmbeddingModelType};
+
+let config = ExtractionConfig {
+    chunking: Some(ChunkingConfig {
+        max_characters: 500,
+        overlap: 50,
+        embedding: Some(EmbeddingConfig {
+            model: EmbeddingModelType::Preset { name: "balanced".to_string() }, // preset chosen by name
+            normalize: true,
+            ..Default::default()
+        }),
+        ..Default::default()
+    }),
+    ..Default::default()
+};
+
+let result = extract_file("research_paper.pdf", None, &config).await?;
+
+if let Some(chunks) = result.chunks {
+    for chunk in chunks {
+        println!("Chunk {}/{}",
+            chunk.metadata.chunk_index + 1,
+            chunk.metadata.total_chunks
+        );
+        println!("Position: {}-{}",
+            chunk.metadata.byte_start,
+            chunk.metadata.byte_end
+        );
+        println!("Content: {}...", chunk.content.chars().take(100).collect::<String>()); // char-safe: byte slicing can panic mid-character
+        if let Some(embedding) = chunk.embedding {
+            println!("Embedding: {} dimensions", embedding.len());
+        }
+    }
+}
+```
--- a/docs/snippets/rust/advanced/embedding_config.md
+++ b/docs/snippets/rust/advanced/embedding_config.md
@@ -0,0 +1,23 @@
+```rust title="Rust"
+use kreuzberg::{ChunkingConfig, EmbeddingConfig, EmbeddingModelType, ExtractionConfig};
+
+fn main() {
+    let config = ExtractionConfig {
+        chunking: Some(ChunkingConfig {
+            embedding: Some(EmbeddingConfig {
+                batch_size: 16,
+                normalize: true,
+                show_download_progress: true,
+                model: EmbeddingModelType::Preset {
+                    name: String::from("all-mpnet-base-v2"),
+                },
+                ..EmbeddingConfig::default()
+            }),
+            max_characters: 1000,
+            ..ChunkingConfig::default()
+        }),
+        ..ExtractionConfig::default()
+    };
+    println!("{:?}", config.chunking); // Debug-print only the chunking section
+}
+```
--- a/docs/snippets/rust/advanced/embedding_with_chunking.md
+++ b/docs/snippets/rust/advanced/embedding_with_chunking.md
@@ -0,0 +1,19 @@
+```rust title="Rust"
+use kreuzberg::{ExtractionConfig, ChunkingConfig, EmbeddingConfig, EmbeddingModelType};
+
+let config = ExtractionConfig {
+    chunking: Some(ChunkingConfig {
+        max_characters: 1024,
+        overlap: 100,
+        embedding: Some(EmbeddingConfig {
+            model: EmbeddingModelType::Preset { name: "balanced".to_string() }, // preset chosen by name
+            normalize: true,
+            batch_size: 32,
+            show_download_progress: false,
+            ..Default::default()
+        }),
+        ..Default::default()
+    }),
+    ..Default::default()
+};
+```
--- a/docs/snippets/rust/advanced/keyword_extraction_config.md
+++ b/docs/snippets/rust/advanced/keyword_extraction_config.md
@@ -0,0 +1,15 @@
+```rust title="Rust"
+use kreuzberg::{ExtractionConfig, KeywordConfig, KeywordAlgorithm};
+
+let config = ExtractionConfig {
+    keywords: Some(KeywordConfig {
+        language: Some(String::from("en")),
+        ngram_range: (1, 3),
+        min_score: 0.3,
+        max_keywords: 10,
+        algorithm: KeywordAlgorithm::Yake,
+        ..KeywordConfig::default()
+    }),
+    ..ExtractionConfig::default()
+};
+```
--- a/docs/snippets/rust/advanced/keyword_extraction_example.md
+++ b/docs/snippets/rust/advanced/keyword_extraction_example.md
@@ -0,0 +1,19 @@
+```rust title="Rust"
+use kreuzberg::{extract_file, ExtractionConfig, KeywordConfig, KeywordAlgorithm};
+
+let config = ExtractionConfig {
+    keywords: Some(KeywordConfig {
+        min_score: 0.3,
+        max_keywords: 10,
+        algorithm: KeywordAlgorithm::Yake,
+        ..KeywordConfig::default()
+    }),
+    ..ExtractionConfig::default()
+};
+
+let extraction = extract_file("research_paper.pdf", None, &config).await?;
+
+if let Some(found) = extraction.extracted_keywords.as_ref() {
+    println!("Keywords: {:?}", found);
+}
+```
--- a/docs/snippets/rust/advanced/language_detection.md
+++ b/docs/snippets/rust/advanced/language_detection.md
@@ -0,0 +1,15 @@
+```rust title="Rust"
+use kreuzberg::{ExtractionConfig, LanguageDetectionConfig};
+
+fn main() {
+    let config = ExtractionConfig {
+        language_detection: Some(LanguageDetectionConfig {
+            enabled: true, // fields take plain values, not `Option`s
+            min_confidence: 0.9,
+            detect_multiple: true,
+        }),
+        ..Default::default()
+    };
+    println!("{:?}", config.language_detection); // Debug-print only the language detection section
+}
+```
--- a/docs/snippets/rust/advanced/language_detection_config.md
+++ b/docs/snippets/rust/advanced/language_detection_config.md
@@ -0,0 +1,12 @@
+```rust title="Rust"
+use kreuzberg::{ExtractionConfig, LanguageDetectionConfig};
+
+let config = ExtractionConfig {
+    language_detection: Some(LanguageDetectionConfig {
+        detect_multiple: false,
+        min_confidence: 0.8,
+        enabled: true,
+    }),
+    ..ExtractionConfig::default()
+};
+```
--- a/docs/snippets/rust/advanced/language_detection_multilingual.md
+++ b/docs/snippets/rust/advanced/language_detection_multilingual.md
@@ -0,0 +1,16 @@
+```rust title="Rust"
+use kreuzberg::{extract_file, ExtractionConfig, LanguageDetectionConfig};
+
+let config = ExtractionConfig {
+    language_detection: Some(LanguageDetectionConfig {
+        detect_multiple: true,
+        min_confidence: 0.8,
+        enabled: true,
+    }),
+    ..ExtractionConfig::default()
+};
+
+let extraction = extract_file("multilingual_document.pdf", None, &config).await?;
+
+println!("Detected languages: {:?}", extraction.detected_languages);
+```
--- a/docs/snippets/rust/advanced/postprocessor_config.md
+++ b/docs/snippets/rust/advanced/postprocessor_config.md
@@ -0,0 +1,18 @@
+```rust title="Rust"
+use kreuzberg::{ExtractionConfig, PostProcessorConfig};
+
+fn main() {
+    let config = ExtractionConfig {
+        postprocessor: Some(PostProcessorConfig {
+            enabled: true, // plain `bool` flag, not an `Option`
+            enabled_processors: Some(vec![
+                "deduplication".to_string(),
+                "whitespace_normalization".to_string(),
+            ]),
+            disabled_processors: Some(vec!["mojibake_fix".to_string()]),
+        }),
+        ..Default::default()
+    };
+    println!("{:?}", config.postprocessor); // Debug-print only the post-processor section
+}
+```
--- a/docs/snippets/rust/advanced/quality_processing_config.md
+++ b/docs/snippets/rust/advanced/quality_processing_config.md
@@ -0,0 +1,8 @@
+```rust title="Rust"
+use kreuzberg::ExtractionConfig;
+
+let config = ExtractionConfig {
+    enable_quality_processing: true, // all other options stay at their defaults
+    ..ExtractionConfig::default()
+};
+```
--- a/docs/snippets/rust/advanced/quality_processing_example.md
+++ b/docs/snippets/rust/advanced/quality_processing_example.md
@@ -0,0 +1,17 @@
+```rust title="Rust"
+use kreuzberg::{extract_file, ExtractionConfig};
+
+let config = ExtractionConfig {
+    enable_quality_processing: true,
+    ..ExtractionConfig::default()
+};
+let extraction = extract_file("scanned_document.pdf", None, &config).await?;
+
+match extraction.quality_score {
+    Some(score) if score < 0.5 => {
+        println!("Warning: Low quality extraction ({:.2})", score);
+    }
+    Some(score) => println!("Quality score: {:.2}", score),
+    None => {}
+}
+```
--- a/docs/snippets/rust/advanced/token_reduction.md
+++ b/docs/snippets/rust/advanced/token_reduction.md
@@ -0,0 +1,14 @@
+```rust title="Rust"
+use kreuzberg::{ExtractionConfig, TokenReductionConfig};
+
+fn main() {
+    let config = ExtractionConfig {
+        token_reduction: Some(TokenReductionConfig {
+            preserve_important_words: true,
+            mode: String::from("moderate"),
+        }),
+        ..ExtractionConfig::default()
+    };
+    println!("{:?}", config.token_reduction); // Debug-print only the token reduction section
+}
+```
--- a/docs/snippets/rust/advanced/token_reduction_config.md
+++ b/docs/snippets/rust/advanced/token_reduction_config.md
@@ -0,0 +1,14 @@
+```rust title="Rust"
+use kreuzberg::{ExtractionConfig, TokenReductionConfig};
+
+let config = ExtractionConfig {
+    token_reduction: Some(TokenReductionConfig {
+        language_hint: Some(String::from("eng")),
+        preserve_code: true,
+        preserve_markdown: true,
+        mode: String::from("moderate"),
+        ..TokenReductionConfig::default()
+    }),
+    ..ExtractionConfig::default()
+};
+```
--- a/docs/snippets/rust/advanced/token_reduction_example.md
+++ b/docs/snippets/rust/advanced/token_reduction_example.md
@@ -0,0 +1,21 @@
+```rust title="Rust"
+use kreuzberg::{extract_file, ExtractionConfig, TokenReductionConfig};
+
+let config = ExtractionConfig {
+    token_reduction: Some(TokenReductionConfig {
+        preserve_markdown: true,
+        mode: String::from("moderate"),
+        ..TokenReductionConfig::default()
+    }),
+    ..ExtractionConfig::default()
+};
+
+let extraction = extract_file("verbose_document.pdf", None, &config).await?;
+
+if let Some(before) = extraction.original_token_count {
+    println!("Original tokens: {}", before);
+}
+if let Some(after) = extraction.reduced_token_count {
+    println!("Reduced tokens: {}", after);
+}
+```
--- a/docs/snippets/rust/advanced/vector_database_integration.md
+++ b/docs/snippets/rust/advanced/vector_database_integration.md
@@ -0,0 +1,55 @@
+```rust title="Rust"
+use kreuzberg::{extract_file, ExtractionConfig, ChunkingConfig, EmbeddingConfig};
+
+struct VectorRecord { // one embedded chunk, as built by `extract_and_vectorize`
+    id: String, // "<document_id>_chunk_<index>"
+    content: String, // chunk text
+    embedding: Vec<f32>, // embedding vector cloned from the chunk
+    metadata: std::collections::HashMap<String, String>, // document_id, chunk_index, content_length
+}
+
+async fn extract_and_vectorize(
+    document_path: &str,
+    document_id: &str,
+) -> Result<Vec<VectorRecord>, Box<dyn std::error::Error>> {
+    let config = ExtractionConfig {
+        chunking: Some(ChunkingConfig {
+            embedding: Some(EmbeddingConfig {
+                model: kreuzberg::EmbeddingModelType::Preset {
+                    name: String::from("balanced"),
+                },
+                batch_size: 32,
+                normalize: true,
+                ..Default::default()
+            }),
+            max_characters: 512,
+            overlap: 50,
+            ..Default::default()
+        }),
+        ..Default::default()
+    };
+
+    let extraction = extract_file(document_path, None, &config).await?;
+    let chunks = extraction.chunks.unwrap_or_default();
+    // Chunks without an embedding are skipped; `position` still counts every chunk.
+    let records = chunks
+        .iter()
+        .enumerate()
+        .filter_map(|(position, chunk)| {
+            let vector = chunk.embedding.as_ref()?;
+            let metadata = std::collections::HashMap::from([
+                ("document_id".to_string(), document_id.to_string()),
+                ("chunk_index".to_string(), position.to_string()),
+                ("content_length".to_string(), chunk.content.len().to_string()),
+            ]);
+            Some(VectorRecord {
+                id: format!("{}_chunk_{}", document_id, position),
+                content: chunk.content.clone(),
+                embedding: vector.clone(),
+                metadata,
+            })
+        })
+        .collect();
+    Ok(records)
+}
+```
--- a/docs/snippets/rust/api/batch_extract_bytes_sync.md
+++ b/docs/snippets/rust/api/batch_extract_bytes_sync.md
@@ -0,0 +1,25 @@
+```rust title="Rust"
+use kreuzberg::{batch_extract_bytes_sync, BatchBytesItem, ExtractionConfig};
+
+fn main() -> kreuzberg::Result<()> {
+    let plain_text = BatchBytesItem {
+        content: b"Hello, world!".to_vec(),
+        mime_type: String::from("text/plain"),
+        config: None,
+    };
+    let markdown = BatchBytesItem {
+        content: b"# Heading\n\nParagraph text.".to_vec(),
+        mime_type: String::from("text/markdown"),
+        config: None,
+    };
+
+    let config = ExtractionConfig::default();
+    let results = batch_extract_bytes_sync(vec![plain_text, markdown], &config)?;
+
+    // Results are reported by their position in the returned collection.
+    for (position, extraction) in results.iter().enumerate() {
+        println!("Item {}: {} chars", position, extraction.content.len());
+    }
+    Ok(())
+}
+```
--- a/docs/snippets/rust/api/batch_extract_files_sync.md
+++ b/docs/snippets/rust/api/batch_extract_files_sync.md
@@ -0,0 +1,18 @@
+```rust title="Rust"
+use kreuzberg::{batch_extract_files_sync, BatchFileItem, ExtractionConfig};
+
+fn main() -> kreuzberg::Result<()> {
+    // Build one batch item per path, each with `config: None`.
+    let items: Vec<BatchFileItem> = ["doc1.pdf", "doc2.docx", "report.pdf"]
+        .iter()
+        .map(|&path| BatchFileItem { path: path.into(), config: None })
+        .collect();
+    let config = ExtractionConfig::default();
+    let results = batch_extract_files_sync(items, &config)?;
+
+    for (position, extraction) in results.iter().enumerate() {
+        println!("File {}: {} chars", position, extraction.content.len());
+    }
+    Ok(())
+}
+```
--- a/docs/snippets/rust/api/client_chunk_text.md
+++ b/docs/snippets/rust/api/client_chunk_text.md
@@ -0,0 +1,29 @@
+```rust title="Rust"
+#[tokio::main]
+async fn main() -> Result<(), Box<dyn std::error::Error>> {
+    // Upload the PDF as the `file` part; chunking options go along as a JSON text field.
+    let pdf_bytes = tokio::fs::read("document.pdf").await?;
+    let file_part = reqwest::multipart::Part::bytes(pdf_bytes)
+        .file_name("document.pdf")
+        .mime_str("application/pdf")?;
+    let form = reqwest::multipart::Form::new()
+        .part("file", file_part)
+        .text("chunking", r#"{"max_characters":800,"overlap":100}"#);
+
+    let response = reqwest::Client::new()
+        .post("http://localhost:8000/extract")
+        .multipart(form)
+        .send()
+        .await?;
+
+    let body: serde_json::Value = response.error_for_status()?.json().await?;
+    if let Some(chunks) = body["chunks"].as_array() {
+        println!("{} chunks", chunks.len());
+        for chunk in chunks {
+            let text = chunk["content"].as_str().unwrap_or("");
+            println!("  {} chars", text.len());
+        }
+    }
+    Ok(())
+}
+```
--- a/docs/snippets/rust/api/client_extract_single_file.md
+++ b/docs/snippets/rust/api/client_extract_single_file.md
@@ -0,0 +1,28 @@
+```rust title="Rust"
+use std::path::Path;
+
+#[tokio::main]
+async fn main() -> Result<(), Box<dyn std::error::Error>> {
+    // Fall back to the literal name if the path yields no UTF-8 file name component.
+    let upload_name = Path::new("document.pdf")
+        .file_name()
+        .and_then(|name| name.to_str())
+        .unwrap_or("document.pdf");
+    let pdf_bytes = tokio::fs::read("document.pdf").await?;
+
+    let file_part = reqwest::multipart::Part::bytes(pdf_bytes)
+        .file_name(upload_name.to_string())
+        .mime_str("application/pdf")?;
+    let form = reqwest::multipart::Form::new().part("file", file_part);
+
+    let response = reqwest::Client::new()
+        .post("http://localhost:8000/extract")
+        .multipart(form)
+        .send()
+        .await?;
+
+    let body: serde_json::Value = response.error_for_status()?.json().await?;
+    println!("{}", body["content"].as_str().unwrap_or(""));
+    Ok(())
+}
+```
--- a/docs/snippets/rust/api/combining_all_features.md
+++ b/docs/snippets/rust/api/combining_all_features.md
@@ -0,0 +1,55 @@
+```rust title="Rust"
+use kreuzberg::{
+    ChunkingConfig, ChunkerType, ExtractionConfig, ImageExtractionConfig,
+    OcrConfig, OutputFormat, extract_file_sync,
+};
+
+fn main() -> kreuzberg::Result<()> {
+    let config = ExtractionConfig {
+        // OCR: Tesseract with English text; `force_ocr` stays off, so OCR is not forced on all pages
+        force_ocr: false,
+        ocr: Some(OcrConfig {
+            backend: "tesseract".to_string(),
+            language: "eng".to_string(),
+            ..Default::default()
+        }),
+        // Chunking: semantic markdown chunks of ~800 chars, 100-char overlap
+        chunking: Some(ChunkingConfig {
+            max_characters: 800,
+            overlap: 100,
+            chunker_type: ChunkerType::Markdown,
+            prepend_heading_context: true,
+            ..Default::default()
+        }),
+        // Output: include document structure and tables
+        output_format: OutputFormat::Markdown,
+        include_document_structure: true,
+        // Images: extract embedded images
+        images: Some(ImageExtractionConfig {
+            extract_images: true,
+            ..Default::default()
+        }),
+        // Cache extracted results on disk
+        use_cache: true,
+        enable_quality_processing: true,
+        ..Default::default()
+    };
+
+    let result = extract_file_sync("report.pdf", None, &config)?;
+
+    println!("Content ({} chars):", result.content.len());
+    println!("{}", result.content.chars().take(200).collect::<String>()); // char-safe preview
+
+    if let Some(chunks) = &result.chunks {
+        println!("\nChunks: {}", chunks.len());
+    }
+    println!("Tables: {}", result.tables.len());
+    if let Some(langs) = &result.detected_languages {
+        println!("Languages: {:?}", langs);
+    }
+    if let Some(method) = result.extraction_method {
+        println!("Extraction method: {:?}", method);
+    }
+    Ok(())
+}
+```
--- a/docs/snippets/rust/api/error_handling.md
+++ b/docs/snippets/rust/api/error_handling.md
@@ -0,0 +1,21 @@
+```rust title="Rust"
+use kreuzberg::{extract_file_sync, ExtractionConfig, KreuzbergError};
+
+fn main() {
+    let config = ExtractionConfig::default();
+    let outcome = extract_file_sync("document.pdf", None, &config);
+    // Specific variants get tailored messages; anything else falls through to the last arm.
+    match outcome {
+        Ok(extraction) => println!("{}", extraction.content),
+        Err(KreuzbergError::Io(io_err)) => eprintln!("File error: {io_err}"),
+        Err(KreuzbergError::UnsupportedFormat(mime)) => eprintln!("Unsupported format: {mime}"),
+        Err(KreuzbergError::Parsing { message, .. }) => {
+            eprintln!("Corrupt or invalid document: {message}");
+        }
+        Err(KreuzbergError::MissingDependency(dep)) => {
+            eprintln!("Missing dependency — install {dep}");
+        }
+        Err(other) => eprintln!("Extraction failed: {other}"),
+    }
+}
+```
--- a/docs/snippets/rust/api/error_handling_extract.md
+++ b/docs/snippets/rust/api/error_handling_extract.md
@@ -0,0 +1,23 @@
+```rust title="Rust"
+use kreuzberg::{extract_bytes_sync, ExtractionConfig, KreuzbergError, Result};
+
+fn extract_text(bytes: &[u8], mime_type: &str) -> Result<String> {
+    // Uses the default configuration and hands back only the extracted text.
+    let extraction = extract_bytes_sync(bytes, mime_type, &ExtractionConfig::default())?;
+    Ok(extraction.content)
+}
+
+fn main() {
+    // A failed read is reported as an I/O error instead of silently extracting empty bytes.
+    let outcome = std::fs::read("document.pdf")
+        .map_err(KreuzbergError::from)
+        .and_then(|bytes| extract_text(&bytes, "application/pdf"));
+    match outcome {
+        Ok(text) => println!("Extracted {} chars", text.len()),
+        Err(KreuzbergError::Io(e)) => eprintln!("File error: {e}"),
+        Err(KreuzbergError::UnsupportedFormat(mime)) => eprintln!("Format not supported: {mime}"),
+        Err(KreuzbergError::Ocr { message, .. }) => eprintln!("OCR failed: {message}"),
+        Err(e) => eprintln!("Error: {e}"),
+    }
+}
+```
--- a/docs/snippets/rust/api/extract_bytes_async.md
+++ b/docs/snippets/rust/api/extract_bytes_async.md
@@ -0,0 +1,14 @@
+```rust title="Rust"
+use kreuzberg::{extract_bytes, ExtractionConfig};
+
+#[tokio::main]
+async fn main() -> kreuzberg::Result<()> {
+    let pdf_bytes = tokio::fs::read("document.pdf").await?;
+    let extraction =
+        extract_bytes(&pdf_bytes, "application/pdf", &ExtractionConfig::default()).await?;
+
+    println!("{}", extraction.content);
+    println!("Tables: {}", extraction.tables.len());
+    Ok(())
+}
+```
--- a/docs/snippets/rust/api/extract_bytes_sync.md
+++ b/docs/snippets/rust/api/extract_bytes_sync.md
@@ -0,0 +1,13 @@
+```rust title="Rust"
+use kreuzberg::{extract_bytes_sync, ExtractionConfig};
+
+fn main() -> kreuzberg::Result<()> {
+    let pdf_bytes = std::fs::read("document.pdf")?;
+    let extraction =
+        extract_bytes_sync(&pdf_bytes, "application/pdf", &ExtractionConfig::default())?;
+
+    println!("{}", extraction.content);
+    println!("Tables: {}", extraction.tables.len());
+    Ok(())
+}
+```
--- a/docs/snippets/rust/api/extract_file_async.md
+++ b/docs/snippets/rust/api/extract_file_async.md
@@ -0,0 +1,14 @@
+```rust title="Rust"
+use kreuzberg::{extract_file, ExtractionConfig};
+
+#[tokio::main]
+async fn main() -> kreuzberg::Result<()> {
+    // The second argument is left as `None`; the configuration is the default one.
+    let extraction =
+        extract_file("document.pdf", None::<&str>, &ExtractionConfig::default()).await?;
+    println!("{}", extraction.content);
+    println!("MIME type: {}", extraction.mime_type);
+    println!("Tables: {}", extraction.tables.len());
+    Ok(())
+}
+```
--- a/docs/snippets/rust/api/extract_file_sync.md
+++ b/docs/snippets/rust/api/extract_file_sync.md
@@ -0,0 +1,13 @@
+```rust title="Rust"
+use kreuzberg::{extract_file_sync, ExtractionConfig};
+
+fn main() -> kreuzberg::Result<()> {
+    let extraction = extract_file_sync("document.pdf", None, &ExtractionConfig::default())?;
+
+    // Report the text, the MIME type stored on the result, and the table count.
+    println!("{}", extraction.content);
+    println!("MIME type: {}", extraction.mime_type);
+    println!("Tables: {}", extraction.tables.len());
+    Ok(())
+}
+```
--- a/docs/snippets/rust/benchmarking/simple_benchmark.rs
+++ b/docs/snippets/rust/benchmarking/simple_benchmark.rs
@@ -0,0 +1,61 @@
+```rust title="simple_benchmark.rs"
+use kreuzberg::{extract_file_sync, extract_file, ExtractionConfig};
+use std::time::Instant;
+
+#[tokio::main]
+async fn main() -> kreuzberg::Result<()> { // times sync, concurrent async, and cached extraction of one file
+    let config = ExtractionConfig {
+        use_cache: false, // caching off for the sync/async timing runs
+        ..Default::default()
+    };
+    let file_path = "document.pdf";
+    let num_runs = 10;
+
+    let start = Instant::now(); // phase 1: sequential sync runs
+    for _ in 0..num_runs {
+        let _ = extract_file_sync(file_path, None, &config)?;
+    }
+    let sync_duration = start.elapsed().as_secs_f64();
+    let avg_sync = sync_duration / num_runs as f64;
+
+    println!("Sync extraction ({} runs):", num_runs);
+    println!("  - Total time: {:.3}s", sync_duration);
+    println!("  - Average: {:.3}s per extraction", avg_sync);
+
+    let start = Instant::now(); // phase 2: async runs awaited together
+    let mut tasks = vec![];
+    for _ in 0..num_runs {
+        tasks.push(extract_file(file_path, None, &config)); // futures are only collected here, not awaited
+    }
+    let results = futures::future::join_all(tasks).await;
+    for result in results {
+        result?; // propagate the first failed extraction
+    }
+    let async_duration = start.elapsed().as_secs_f64();
+
+    println!("\nAsync extraction ({} parallel runs):", num_runs);
+    println!("  - Total time: {:.3}s", async_duration);
+    println!("  - Average: {:.3}s per extraction", async_duration / num_runs as f64);
+    println!("  - Speedup: {:.1}x", sync_duration / async_duration);
+
+    let config_cached = ExtractionConfig { // phase 3: same file with caching enabled
+        use_cache: true,
+        ..Default::default()
+    };
+
+    println!("\nFirst extraction (populates cache)...");
+    let start = Instant::now();
+    let _result1 = extract_file_sync(file_path, None, &config_cached)?;
+    let first_duration = start.elapsed().as_secs_f64();
+    println!("  - Time: {:.3}s", first_duration);
+
+    println!("Second extraction (from cache)...");
+    let start = Instant::now();
+    let _result2 = extract_file_sync(file_path, None, &config_cached)?;
+    let cached_duration = start.elapsed().as_secs_f64();
+    println!("  - Time: {:.3}s", cached_duration);
+    println!("  - Cache speedup: {:.1}x", first_duration / cached_duration); // ratio of first to second run
+
+    Ok(())
+}
+```
--- a/docs/snippets/rust/cache/disk_cache.rs
+++ b/docs/snippets/rust/cache/disk_cache.rs
@@ -0,0 +1,28 @@
+```rust title="disk_cache.rs"
+use kreuzberg::{extract_file_sync, ExtractionConfig};
+
+fn main() -> kreuzberg::Result<()> {
+    // First CLI argument that is neither empty nor "--"; otherwise "document.pdf".
+    let path = std::env::args()
+        .skip(1)
+        .find(|arg| !(arg.is_empty() || arg == "--"))
+        .unwrap_or_else(|| String::from("document.pdf"));
+
+    // Enable caching (default: true). The Rust crate uses an internal disk cache.
+    let config = ExtractionConfig {
+        use_cache: true,
+        ..ExtractionConfig::default()
+    };
+
+    println!("First extraction (will be cached)...");
+    let first = extract_file_sync(&path, None, &config)?;
+    println!("  - Content length: {}", first.content.len());
+
+    println!("\nSecond extraction (from cache when available)...");
+    let second = extract_file_sync(&path, None, &config)?;
+    println!("  - Content length: {}", second.content.len());
+
+    println!("\nResults are identical: {}", first.content == second.content);
+    Ok(())
+}
+```
--- a/docs/snippets/rust/cli/basic_cli.rs
+++ b/docs/snippets/rust/cli/basic_cli.rs
@@ -0,0 +1,30 @@
+```rust title="basic_cli.rs"
+use std::process::Command;
+use anyhow::Result;
+
+fn extract_with_cli(file_path: &str, output_format: &str) -> Result<String> {
+    // Runs `kreuzberg extract <file> --format <fmt>` and captures stdout and stderr.
+    let output = Command::new("kreuzberg")
+        .arg("extract")
+        .arg(file_path)
+        .args(["--format", output_format])
+        .output()?;
+    if output.status.success() {
+        return Ok(String::from_utf8(output.stdout)?);
+    }
+    anyhow::bail!("CLI error: {}", String::from_utf8_lossy(&output.stderr))
+}
+
+fn main() -> Result<()> {
+    let document = "document.pdf";
+
+    let text_output = extract_with_cli(document, "text")?;
+    println!("Extracted: {} characters", text_output.len());
+
+    let json_output = extract_with_cli(document, "json")?;
+    let parsed: serde_json::Value = serde_json::from_str(&json_output)?;
+    println!("Format: {}", parsed["format"].as_str().unwrap_or("unknown")); // bare string, no JSON quotes
+
+    Ok(())
+}
+```
--- a/docs/snippets/rust/cli/cli_with_config.rs
+++ b/docs/snippets/rust/cli/cli_with_config.rs
@@ -0,0 +1,39 @@
+```rust title="cli_with_config.rs"
+use std::process::Command;
+use anyhow::Result;
+
+fn extract_with_config(file_path: &str, config_path: &str) -> Result<serde_json::Value> {
+    // Invokes `kreuzberg extract <file> --config <cfg> --format json`.
+    let mut command = Command::new("kreuzberg");
+    command
+        .arg("extract")
+        .arg(file_path)
+        .args(["--config", config_path])
+        .args(["--format", "json"]);
+
+    let output = command.output()?;
+
+    if output.status.success() {
+        // stdout is parsed as JSON; a parse failure propagates to the caller.
+        let parsed: serde_json::Value = serde_json::from_slice(&output.stdout)?;
+        return Ok(parsed);
+    }
+
+    let stderr = String::from_utf8_lossy(&output.stderr);
+    anyhow::bail!("CLI error: {}", stderr)
+}
+
+fn main() -> Result<()> {
+    let (document, config_file) = ("document.pdf", "kreuzberg.toml");
+    println!("Extracting {} with config {}", document, config_file);
+
+    let extracted = extract_with_config(document, config_file)?;
+    let content = extracted["content"].as_str().unwrap_or("");
+    let format = extracted["format"].as_str().unwrap_or("unknown");
+    // `languages` is printed as raw JSON, whatever its shape.
+    println!("Content length: {}", content.len());
+    println!("Format: {}", format);
+    println!("Languages: {}", extracted["languages"]);
+    Ok(())
+}
+```
--- a/docs/snippets/rust/config/advanced_config.md
+++ b/docs/snippets/rust/config/advanced_config.md
@@ -0,0 +1,63 @@
+```rust title="Rust"
+use kreuzberg::{extract_file, ExtractionConfig, OcrConfig, ChunkingConfig, LanguageDetectionConfig, TokenReductionConfig, PostProcessorConfig, EmbeddingConfig, EmbeddingModelType};
+use kreuzberg::keywords::{KeywordConfig, KeywordAlgorithm};
+
+#[tokio::main]
+async fn main() -> kreuzberg::Result<()> {
+    let config = ExtractionConfig {
+        use_cache: true,
+        enable_quality_processing: true,
+        ocr: Some(OcrConfig {
+            backend: String::from("tesseract"),
+            language: String::from("eng"),
+            output_format: None,
+            tesseract_config: None,
+            paddle_ocr_config: None,
+            element_config: None,
+        }),
+        chunking: Some(ChunkingConfig {
+            embedding: Some(EmbeddingConfig {
+                model: EmbeddingModelType::Preset { name: String::from("balanced") },
+                normalize: true,
+                batch_size: 32,
+                cache_dir: None,
+                show_download_progress: false,
+            }),
+            max_characters: 1000,
+            overlap: 200,
+            ..ChunkingConfig::default()
+        }),
+        language_detection: Some(LanguageDetectionConfig {
+            detect_multiple: false,
+            min_confidence: 0.8,
+            enabled: true,
+        }),
+        keywords: Some(KeywordConfig {
+            language: Some(String::from("en")),
+            ngram_range: (1, 3),
+            min_score: 0.1,
+            max_keywords: 10,
+            algorithm: KeywordAlgorithm::Yake,
+            ..KeywordConfig::default()
+        }),
+        token_reduction: Some(TokenReductionConfig {
+            preserve_important_words: true,
+            mode: String::from("moderate"),
+        }),
+        postprocessor: Some(PostProcessorConfig {
+            disabled_processors: None,
+            enabled_processors: None,
+            enabled: true,
+        }),
+        ..ExtractionConfig::default()
+    };
+
+    let extraction = extract_file("document.pdf", None::<&str>, &config).await?;
+    println!("Content: {}", extraction.content);
+    if let Some(found) = extraction.detected_languages.as_ref() {
+        println!("Languages: {:?}", found);
+    }
+    println!("Chunks: {}", extraction.chunks.as_ref().map_or(0, |chunks| chunks.len()));
+    Ok(())
+}
+```
--- a/docs/snippets/rust/config/chunking_config.md
+++ b/docs/snippets/rust/config/chunking_config.md
@@ -0,0 +1,79 @@
+```rust title="Rust"
+use kreuzberg::{extract_file, ExtractionConfig, ChunkingConfig};
+
+/// Extracts `document.pdf` with 1000-character chunks overlapping by 200 and
+/// prints the number of chunks followed by each chunk's length in bytes.
+#[tokio::main]
+async fn main() -> kreuzberg::Result<()> {
+    let config = ExtractionConfig {
+        chunking: Some(ChunkingConfig {
+            max_characters: 1000,
+            overlap: 200,
+            ..Default::default()
+        }),
+        ..Default::default()
+    };
+
+    let result = extract_file("document.pdf", None::<&str>, &config).await?;
+    // `chunks` is optional on the result; treat a missing value as no chunks.
+    let chunks = result.chunks.as_deref().unwrap_or_default();
+    println!("Chunks: {}", chunks.len());
+    for chunk in chunks {
+        println!("Length: {}", chunk.content.len());
+    }
+    Ok(())
+}
+```
+
+```rust title="Rust - Markdown with Heading Context"
+use kreuzberg::{extract_file, ExtractionConfig, ChunkingConfig, ChunkerType, ChunkSizing};
+
+/// Chunks `document.md` with the Markdown chunker and `ChunkSizing::Tokenizer`
+/// sizing, then prints each chunk's headings and a short content preview.
+#[tokio::main]
+async fn main() -> kreuzberg::Result<()> {
+    let config = ExtractionConfig {
+        chunking: Some(ChunkingConfig {
+            max_characters: 500,
+            overlap: 50,
+            chunker_type: ChunkerType::Markdown,
+            sizing: ChunkSizing::Tokenizer {
+                model: "Xenova/gpt-4o".into(),
+                cache_dir: None,
+            },
+            ..Default::default()
+        }),
+        ..Default::default()
+    };
+
+    let result = extract_file("document.md", None::<&str>, &config).await?;
+    // `chunks` is optional; flattening the `Option` skips the loop when it is `None`.
+    for chunk in result.chunks.iter().flatten() {
+        if let Some(heading_context) = &chunk.metadata.heading_context {
+            for heading in &heading_context.headings {
+                println!("Heading L{}: {}", heading.level, heading.text);
+            }
+        }
+        // Truncate by characters: slicing at a fixed byte offset panics when the
+        // offset lands inside a multi-byte UTF-8 sequence.
+        let preview: String = chunk.content.chars().take(100).collect();
+        println!("Content: {}...", preview);
+    }
+    Ok(())
+}
+```
+
+```rust title="Rust - Prepend Heading Context"
+use kreuzberg::{extract_file, ExtractionConfig, ChunkingConfig, ChunkerType};
+
+/// Chunks `document.md` with the Markdown chunker and `prepend_heading_context`
+/// enabled, then prints a short preview of every chunk.
+#[tokio::main]
+async fn main() -> kreuzberg::Result<()> {
+    let config = ExtractionConfig {
+        chunking: Some(ChunkingConfig {
+            max_characters: 500,
+            overlap: 50,
+            chunker_type: ChunkerType::Markdown,
+            prepend_heading_context: true,
+            ..Default::default()
+        }),
+        ..Default::default()
+    };
+
+    let result = extract_file("document.md", None::<&str>, &config).await?;
+    // `chunks` is optional; flattening the `Option` skips the loop when it is `None`.
+    for chunk in result.chunks.iter().flatten() {
+        // Each chunk's content is prefixed with its heading breadcrumb.
+        // Preview by characters: a fixed byte-offset slice panics when it splits
+        // a multi-byte UTF-8 character.
+        let preview: String = chunk.content.chars().take(100).collect();
+        println!("Content: {}...", preview);
+    }
+    Ok(())
+}
+```
--- a/docs/snippets/rust/config/config_basic.md
+++ b/docs/snippets/rust/config/config_basic.md
@@ -0,0 +1,16 @@
+```rust title="Rust"
+use kreuzberg::{extract_file, ExtractionConfig};
+
+/// Extracts `document.pdf` asynchronously with caching and quality processing
+/// switched on, then prints the extracted text.
+#[tokio::main]
+async fn main() -> kreuzberg::Result<()> {
+    let config = ExtractionConfig {
+        enable_quality_processing: true,
+        use_cache: true,
+        ..Default::default()
+    };
+
+    let extraction = extract_file("document.pdf", None, &config).await?;
+    let text = &extraction.content;
+    println!("{}", text);
+    Ok(())
+}
+```
--- a/docs/snippets/rust/config/config_discover.md
+++ b/docs/snippets/rust/config/config_discover.md
@@ -0,0 +1,11 @@
+```rust title="Rust"
+use kreuzberg::{extract_file, ExtractionConfig};
+
+/// Obtains a configuration from `ExtractionConfig::discover()`, falling back to
+/// the default when it yields no value, and extracts `document.pdf` with it.
+#[tokio::main]
+async fn main() -> kreuzberg::Result<()> {
+    let discovered = ExtractionConfig::discover()?;
+    let config = discovered.unwrap_or_default();
+    let extraction = extract_file("document.pdf", None, &config).await?;
+    println!("{}", extraction.content);
+    Ok(())
+}
+```
--- a/docs/snippets/rust/config/config_ocr.md
+++ b/docs/snippets/rust/config/config_ocr.md
@@ -0,0 +1,20 @@
+```rust title="Rust"
+use kreuzberg::{extract_file_sync, ExtractionConfig, OcrConfig};
+
+/// Extracts `scanned.pdf` with the `tesseract` OCR backend set to `eng` and
+/// prints the byte length of the text and the number of tables returned.
+fn main() -> kreuzberg::Result<()> {
+    let ocr = OcrConfig {
+        backend: "tesseract".to_string(),
+        language: "eng".to_string(),
+        tesseract_config: None,
+        ..Default::default()
+    };
+    let config = ExtractionConfig {
+        ocr: Some(ocr),
+        ..Default::default()
+    };
+
+    let extraction = extract_file_sync("scanned.pdf", None, &config)?;
+    let text_len = extraction.content.len();
+    let table_count = extraction.tables.len();
+    println!("Content length: {}", text_len);
+    println!("Tables detected: {}", table_count);
+    Ok(())
+}
+```
--- a/docs/snippets/rust/config/config_programmatic.md
+++ b/docs/snippets/rust/config/config_programmatic.md
@@ -0,0 +1,29 @@
+```rust title="Rust"
+use kreuzberg::{extract_file_sync, ChunkingConfig, ExtractionConfig, OcrConfig, TesseractConfig};
+
+/// Builds an extraction configuration in code (cache, `tesseract` OCR for
+/// `eng+deu` with `psm` 6, 1000/200 chunking, quality processing) and prints
+/// the byte length of the text extracted from `document.pdf`.
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+    let tesseract = TesseractConfig {
+        psm: 6,
+        ..Default::default()
+    };
+    let ocr = OcrConfig {
+        backend: "tesseract".to_string(),
+        language: "eng+deu".to_string(),
+        tesseract_config: Some(tesseract),
+        ..Default::default()
+    };
+    let chunking = ChunkingConfig {
+        max_characters: 1000,
+        overlap: 200,
+        ..Default::default()
+    };
+    let config = ExtractionConfig {
+        use_cache: true,
+        ocr: Some(ocr),
+        chunking: Some(chunking),
+        enable_quality_processing: true,
+        ..Default::default()
+    };
+
+    let extraction = extract_file_sync("document.pdf", None, &config)?;
+    println!("Content length: {}", extraction.content.len());
+    Ok(())
+}
+```
--- a/docs/snippets/rust/config/document_structure_config.md
+++ b/docs/snippets/rust/config/document_structure_config.md
@@ -0,0 +1,17 @@
+```rust title="Document Structure Config (Rust)"
+use kreuzberg::{extract_file_sync, ExtractionConfig};
+
+// Enable `include_document_structure` so that `result.document` can be read below.
+let config = ExtractionConfig {
+    include_document_structure: true,
+    ..Default::default()
+};
+
+let result = extract_file_sync("document.pdf", None, &config)?;
+
+if let Some(document) = &result.document {
+    for node in &document.nodes {
+        let text = node.content.text().unwrap_or("");
+        // Truncate by characters: slicing at byte 80 panics when that offset
+        // falls inside a multi-byte UTF-8 sequence.
+        let preview: String = text.chars().take(80).collect();
+        println!("[{}] {}", node.content.node_type_str(), preview);
+    }
+}
+```
--- a/docs/snippets/rust/config/element_based_output.md
+++ b/docs/snippets/rust/config/element_based_output.md
@@ -0,0 +1,48 @@
+```rust title="Element-Based Output (Rust)"
+use kreuzberg::{extract_file_sync, ExtractionConfig};
+use kreuzberg::types::OutputFormat as ResultFormat;
+
+/// Extracts `document.pdf` in element-based mode, prints every element with its
+/// optional page number and coordinates, then lists the title elements.
+fn main() -> kreuzberg::Result<()> {
+    // Configure element-based output (result_format controls Unified vs ElementBased)
+    let config = ExtractionConfig {
+        result_format: ResultFormat::ElementBased,
+        ..Default::default()
+    };
+
+    // Extract document
+    let result = extract_file_sync("document.pdf", None, &config)?;
+
+    // Access elements
+    if let Some(elements) = result.elements {
+        for element in &elements {
+            println!("Type: {:?}", element.element_type);
+            // Preview the first 100 characters; a fixed byte-offset slice would
+            // panic when it splits a multi-byte UTF-8 character.
+            let preview: String = element.text.chars().take(100).collect();
+            println!("Text: {}", preview);
+
+            if let Some(page) = element.metadata.page_number {
+                println!("Page: {}", page);
+            }
+
+            if let Some(coords) = &element.metadata.coordinates {
+                println!("Coords: ({}, {}) - ({}, {})",
+                    coords.x0, coords.y0, coords.x1, coords.y1);
+            }
+
+            println!("---");
+        }
+
+        // Filter by element type
+        let titles: Vec<_> = elements.iter()
+            .filter(|e| matches!(e.element_type, kreuzberg::types::ElementType::Title))
+            .collect();
+
+        for title in titles {
+            // Fall back to "unknown" when the element carries no "level" entry.
+            let level = title.metadata.additional.get("level")
+                .map(|v| v.as_ref())
+                .unwrap_or("unknown");
+            println!("[{}] {}", level, title.text);
+        }
+    }
+
+    Ok(())
+}
+```
--- a/docs/snippets/rust/config/embedding_config.md
+++ b/docs/snippets/rust/config/embedding_config.md
@@ -0,0 +1,28 @@
+```rust title="Rust"
+use kreuzberg::{extract_file, ExtractionConfig, ChunkingConfig, EmbeddingConfig, EmbeddingModelType};
+
+/// Extracts `document.pdf` with chunking and the "balanced" embedding preset,
+/// then prints how many chunks were produced.
+#[tokio::main]
+async fn main() -> kreuzberg::Result<()> {
+    let config = ExtractionConfig {
+        chunking: Some(ChunkingConfig {
+            max_characters: 1000,
+            overlap: 200,
+            embedding: Some(EmbeddingConfig {
+                model: EmbeddingModelType::Preset {
+                    name: "balanced".to_string(),
+                },
+                batch_size: 16,
+                normalize: true,
+                show_download_progress: true,
+                cache_dir: None,
+                // Other EmbeddingConfig examples also set `acceleration`; leave
+                // any field not named here at its default.
+                ..Default::default()
+            }),
+            ..Default::default()
+        }),
+        ..Default::default()
+    };
+
+    let result = extract_file("document.pdf", None::<&str>, &config).await?;
+    println!("Chunks with embeddings: {}", result.chunks.as_ref().map(|c| c.len()).unwrap_or(0));
+    Ok(())
+}
+```
--- a/docs/snippets/rust/config/embedding_config.rs
+++ b/docs/snippets/rust/config/embedding_config.rs
@@ -0,0 +1,85 @@
+// Example 1: Preset model (recommended)
+// Fast, balanced, or quality preset configurations optimized for common use cases.
+let embedding_config = EmbeddingConfig {
+    model: EmbeddingModelType::Preset {
+        name: "balanced".to_string(),
+    },
+    batch_size: 32,
+    normalize: true,
+    show_download_progress: true,
+    // `PathBuf` does not expand `~`, so build the path from $HOME explicitly.
+    // When HOME is unset this is `None`, i.e. the default cache location.
+    cache_dir: std::env::var_os("HOME")
+        .map(|home| std::path::PathBuf::from(home).join(".cache/kreuzberg/embeddings")),
+    acceleration: None,
+};
+
+// Available presets:
+// - "fast" (384 dims): Quick prototyping, development, resource-constrained
+// - "balanced" (768 dims): Production, general-purpose RAG, English documents
+// - "quality" (1024 dims): Complex documents, maximum accuracy
+// - "multilingual" (768 dims): International documents, 100+ languages
+
+// Example 2: Custom ONNX model (requires the embeddings feature)
+// Selects a specific ONNX embedding model from HuggingFace by id and states its dimensions.
+let embedding_config = EmbeddingConfig {
+    model: EmbeddingModelType::Custom {
+        model_id: "BAAI/bge-small-en-v1.5".to_string(),  // HuggingFace model id
+        dimensions: 384,  // 384 dims for this model, per the model list in this file
+    },
+    batch_size: 32,
+    normalize: true,
+    show_download_progress: true,
+    cache_dir: None,  // Uses default: .kreuzberg/embeddings/
+    acceleration: None,
+};
+
+// Popular ONNX-compatible models:
+// - "BAAI/bge-small-en-v1.5" (384 dims): Fast, efficient
+// - "BAAI/bge-base-en-v1.5" (768 dims): Balanced quality/speed
+// - "BAAI/bge-large-en-v1.5" (1024 dims): High quality, slower
+// - "sentence-transformers/paraphrase-multilingual-mpnet-base-v2" (768 dims): Multilingual support
+
+// Example 3: Alternative custom ONNX model
+// For advanced users who want a different ONNX embedding model.
+let embedding_config = EmbeddingConfig {
+    model: EmbeddingModelType::Custom {
+        model_id: "sentence-transformers/all-mpnet-base-v2".to_string(),
+        dimensions: 768,  // NOTE(review): presumably must equal the model's output size; confirm
+    },
+    batch_size: 16,  // Larger model requires smaller batch size
+    normalize: true,
+    show_download_progress: true,
+    cache_dir: Some(std::path::PathBuf::from("/var/cache/embeddings")),  // Custom model directory
+    acceleration: None,
+};
+
+// Integration with ChunkingConfig
+// Add embeddings to your chunking configuration:
+use kreuzberg::{ChunkingConfig, EmbeddingConfig, EmbeddingModelType, ExtractionConfig};
+
+let chunking_with_embeddings = ChunkingConfig {
+    max_characters: 1024,
+    overlap: 100,
+    preset: Some("balanced".to_string()),
+    embedding: Some(EmbeddingConfig::default()),  // Uses balanced preset
+    // ChunkingConfig has more fields than the ones named here; keep their defaults.
+    ..Default::default()
+};
+
+let extraction_config = ExtractionConfig {
+    chunking: Some(chunking_with_embeddings),
+    ..Default::default()
+};
+
+// Key parameter explanations:
+//
+// batch_size: Number of texts to embed at once (32-128 typical)
+//   - Larger batches are faster but use more memory
+//   - Smaller batches for resource-constrained environments
+//
+// normalize: Whether to normalize vectors (L2 norm)
+//   - true (recommended): Enables cosine similarity in vector DBs
+//   - false: Raw embedding values
+//
+// cache_dir: Where to store downloaded models
+//   - None: Uses .kreuzberg/embeddings/ in current directory
+//   - Some(path): Custom directory for model storage
+//
+// show_download_progress: Display download progress bar
+//   - Useful for monitoring large model downloads
--- a/docs/snippets/rust/config/hierarchy_config.rs
+++ b/docs/snippets/rust/config/hierarchy_config.rs
@@ -0,0 +1,90 @@
+use kreuzberg::{ExtractionConfig, HierarchyConfig, PdfConfig};
+
+// Example 1: Basic hierarchy extraction
+// Enabled with default k_clusters=6 for standard H1-H6 heading hierarchy.
+// Extract bounding box information for spatial layout awareness.
+let hierarchy_config_basic = HierarchyConfig {
+    enabled: true,
+    k_clusters: 6,  // Default: creates 6 font size clusters (H1-H6 structure)
+    include_bbox: true,  // Include bounding box coordinates
+    ocr_coverage_threshold: None,  // No OCR coverage threshold
+};
+
+let pdf_config_basic = PdfConfig {
+    hierarchy: Some(hierarchy_config_basic),
+    ..Default::default()
+};
+
+let extraction_config_basic = ExtractionConfig {
+    pdf_options: Some(pdf_config_basic),
+    ..Default::default()
+};
+
+// Use with extract_file_sync or extract_bytes_sync (the config is passed by reference)
+// let result = extract_file_sync("document.pdf", None, &extraction_config_basic)?;
+
+
+// Example 2: Custom k_clusters for minimal structure
+// Use 3 clusters for a simpler hierarchy with minimal structure.
+// Useful when you only need major section divisions (Main, Subsection, Detail).
+let hierarchy_config_minimal = HierarchyConfig {
+    enabled: true,
+    k_clusters: 3,  // Minimal clustering: just 3 levels
+    include_bbox: true,  // Include bounding box coordinates
+    ocr_coverage_threshold: None,  // No OCR coverage threshold
+};
+
+let pdf_config_minimal = PdfConfig {
+    hierarchy: Some(hierarchy_config_minimal),  // Attach the hierarchy settings to the PDF options
+    ..Default::default()
+};
+
+let extraction_config_minimal = ExtractionConfig {
+    pdf_options: Some(pdf_config_minimal),
+    ..Default::default()
+};
+
+
+// Example 3: With OCR coverage threshold
+// Trigger OCR if less than 50% of text has font data.
+// Useful for documents with mixed digital and scanned content.
+let hierarchy_config_ocr = HierarchyConfig {
+    enabled: true,
+    k_clusters: 6,  // Same cluster count as the basic example
+    include_bbox: true,  // Include bounding box coordinates
+    ocr_coverage_threshold: Some(0.5),  // Trigger OCR if text coverage < 50%
+};
+
+let pdf_config_ocr = PdfConfig {
+    hierarchy: Some(hierarchy_config_ocr),  // Attach the hierarchy settings to the PDF options
+    ..Default::default()
+};
+
+let extraction_config_ocr = ExtractionConfig {
+    pdf_options: Some(pdf_config_ocr),
+    ..Default::default()
+};
+
+
+// Field descriptions:
+//
+// enabled: bool (default: true)
+//   - Enable or disable hierarchy extraction
+//   - When false, hierarchy structure is not analyzed
+//
+// k_clusters: usize (default: 6, valid: 1-7)
+//   - Number of font size clusters for hierarchy levels
+//   - 6 provides H1-H6 heading levels with body text
+//   - Higher values create more fine-grained hierarchy
+//   - Lower values create simpler structure
+//
+// include_bbox: bool (default: true)
+//   - Include bounding box coordinates in hierarchy blocks
+//   - Required for spatial layout awareness and document structure
+//   - Set to false only if space optimization is critical
+//
+// ocr_coverage_threshold: Option<f32> (default: None)
+//   - Range: 0.0 to 1.0
+//   - Triggers OCR when text block coverage falls below this fraction
+//   - Example: Some(0.5) means "run OCR if less than 50% of page has text data"
+//   - None means no OCR coverage-based triggering
--- a/docs/snippets/rust/config/html_output.md
+++ b/docs/snippets/rust/config/html_output.md
@@ -0,0 +1,14 @@
+```rust title="Rust"
+use kreuzberg::{extract_file_sync, ExtractionConfig, HtmlOutputConfig, HtmlTheme, OutputFormat};
+
+// Request HTML output styled with the GitHub theme and print the result.
+let html_options = HtmlOutputConfig {
+    theme: HtmlTheme::GitHub,
+    ..Default::default()
+};
+let config = ExtractionConfig {
+    output_format: OutputFormat::Html,
+    html_output: Some(html_options),
+    ..Default::default()
+};
+let extraction = extract_file_sync("document.pdf", None, &config).unwrap();
+println!("{}", extraction.content); // HTML with kb-* classes
+```
--- a/docs/snippets/rust/config/keyword_config.rs
+++ b/docs/snippets/rust/config/keyword_config.rs
@@ -0,0 +1,72 @@
+use kreuzberg::{extract_file, ExtractionConfig};
+use kreuzberg::keywords::{KeywordConfig, KeywordAlgorithm, YakeParams, RakeParams};
+
+// Example 1: Basic YAKE configuration
+// Uses the YAKE algorithm with English as the language and leaves the
+// algorithm-specific parameters at their defaults.
+pub async fn basic_yake() -> kreuzberg::Result<()> {
+    let config = ExtractionConfig {
+        keywords: Some(KeywordConfig {
+            algorithm: KeywordAlgorithm::Yake,
+            max_keywords: 10,
+            min_score: 0.0,
+            ngram_range: (1, 3),
+            language: Some("en".to_string()),
+            // Struct-update syntax instead of naming `yake_params`/`rake_params`:
+            // the sibling examples are gated on separate `keywords-*` features, so
+            // naming both fields would presumably tie this example to both.
+            ..Default::default()
+        }),
+        ..Default::default()
+    };
+
+    let result = extract_file("document.pdf", None::<&str>, &config).await?;
+    println!("Keywords: {:?}", result.keywords);
+    Ok(())
+}
+
+// Example 2: Advanced YAKE with custom parameters
+// Fine-tunes YAKE with a custom window size for co-occurrence analysis
+#[cfg(feature = "keywords-yake")]
+pub async fn advanced_yake() -> kreuzberg::Result<()> {
+    let config = ExtractionConfig {
+        keywords: Some(KeywordConfig {
+            algorithm: KeywordAlgorithm::Yake,
+            max_keywords: 15,
+            min_score: 0.1,
+            ngram_range: (1, 2),
+            language: Some("en".to_string()),
+            yake_params: Some(YakeParams {
+                window_size: 1,
+            }),
+            // Do not name `rake_params`: this function is gated on the YAKE feature only.
+            ..Default::default()
+        }),
+        ..Default::default()
+    };
+
+    let result = extract_file("document.pdf", None::<&str>, &config).await?;
+    println!("Keywords: {:?}", result.keywords);
+    Ok(())
+}
+
+// Example 3: RAKE configuration
+// Uses the RAKE algorithm with constraints on word length and phrase size
+#[cfg(feature = "keywords-rake")]
+pub async fn rake_config() -> kreuzberg::Result<()> {
+    let config = ExtractionConfig {
+        keywords: Some(KeywordConfig {
+            algorithm: KeywordAlgorithm::Rake,
+            max_keywords: 10,
+            min_score: 5.0,
+            ngram_range: (1, 3),
+            language: Some("en".to_string()),
+            rake_params: Some(RakeParams {
+                min_word_length: 1,
+                max_words_per_phrase: 3,
+            }),
+            // Do not name `yake_params`: this function is gated on the RAKE feature only.
+            ..Default::default()
+        }),
+        ..Default::default()
+    };
+
+    let result = extract_file("document.pdf", None::<&str>, &config).await?;
+    println!("Keywords: {:?}", result.keywords);
+    Ok(())
+}
--- a/docs/snippets/rust/config/keyword_extraction_config.md
+++ b/docs/snippets/rust/config/keyword_extraction_config.md
@@ -0,0 +1,23 @@
+```rust title="Rust"
+use kreuzberg::{extract_file, ExtractionConfig};
+use kreuzberg::keywords::{KeywordConfig, KeywordAlgorithm};
+
+/// Extracts `document.pdf` with YAKE keyword extraction (up to 10 keywords,
+/// minimum score 0.1, n-grams of 1 to 3 words) and prints the keywords.
+#[tokio::main]
+async fn main() -> kreuzberg::Result<()> {
+    let keywords = KeywordConfig {
+        algorithm: KeywordAlgorithm::Yake,
+        max_keywords: 10,
+        min_score: 0.1,
+        ngram_range: (1, 3),
+        language: Some("en".to_string()),
+        ..Default::default()
+    };
+    let config = ExtractionConfig {
+        keywords: Some(keywords),
+        ..Default::default()
+    };
+
+    let extraction = extract_file("document.pdf", None::<&str>, &config).await?;
+    println!("Keywords: {:?}", extraction.keywords);
+    Ok(())
+}
+```
--- a/docs/snippets/rust/config/language_detection_config.md
+++ b/docs/snippets/rust/config/language_detection_config.md
@@ -0,0 +1,20 @@
+```rust title="Rust"
+use kreuzberg::{extract_file, ExtractionConfig, LanguageDetectionConfig};
+
+/// Extracts `document.pdf` with language detection enabled (minimum confidence
+/// 0.8, multiple languages allowed) and prints the detected languages.
+#[tokio::main]
+async fn main() -> kreuzberg::Result<()> {
+    let config = ExtractionConfig {
+        language_detection: Some(LanguageDetectionConfig {
+            enabled: true,
+            min_confidence: 0.8,
+            detect_multiple: true,
+        }),
+        ..Default::default()
+    };
+
+    let result = extract_file("document.pdf", None::<&str>, &config).await?;
+    // The other snippets read detection output from the optional
+    // `detected_languages` field, so this one does the same.
+    match &result.detected_languages {
+        Some(languages) => println!("Detected languages: {:?}", languages),
+        None => println!("No language detected"),
+    }
+    Ok(())
+}
+```
--- a/docs/snippets/rust/config/ocr_dpi_config.md
+++ b/docs/snippets/rust/config/ocr_dpi_config.md
@@ -0,0 +1,21 @@
+```rust title="Rust"
+use kreuzberg::{extract_file_sync, ExtractionConfig, ImageExtractionConfig};
+
+/// Extracts `document.pdf` with image extraction configured for a 300 DPI
+/// target (auto-adjusted between 150 and 600) and prints the image count.
+fn main() -> kreuzberg::Result<()> {
+    let images = ImageExtractionConfig {
+        extract_images: true,
+        target_dpi: 300,
+        max_image_dimension: 4096,
+        auto_adjust_dpi: true,
+        min_dpi: 150,
+        max_dpi: 600,
+    };
+    let config = ExtractionConfig {
+        images: Some(images),
+        ..Default::default()
+    };
+
+    let extraction = extract_file_sync("document.pdf", None::<&str>, &config)?;
+    let image_count = extraction.images.len();
+    println!("Extracted images: {}", image_count);
+    Ok(())
+}
+```
--- a/docs/snippets/rust/config/pdf_config.md
+++ b/docs/snippets/rust/config/pdf_config.md
@@ -0,0 +1,20 @@
+```rust title="Rust"
+use kreuzberg::{extract_file_sync, ExtractionConfig, PdfConfig, HierarchyConfig};
+
+/// Extracts the password-protected `encrypted.pdf` with image and metadata
+/// extraction plus default hierarchy settings, then prints title and authors.
+fn main() -> kreuzberg::Result<()> {
+    let pdf_options = PdfConfig {
+        extract_images: true,
+        passwords: Some(vec!["password123".to_string()]),
+        extract_metadata: true,
+        hierarchy: Some(HierarchyConfig::default()),
+    };
+    let config = ExtractionConfig {
+        pdf_options: Some(pdf_options),
+        ..Default::default()
+    };
+
+    let extraction = extract_file_sync("encrypted.pdf", None, &config)?;
+    let metadata = &extraction.metadata;
+    println!("Title: {:?}", metadata.title);
+    println!("Authors: {:?}", metadata.authors);
+    Ok(())
+}
+```
--- a/docs/snippets/rust/config/pdf_hierarchy_config.md
+++ b/docs/snippets/rust/config/pdf_hierarchy_config.md
@@ -0,0 +1,23 @@
+```rust title="Rust"
+use kreuzberg::{extract_file_sync, ExtractionConfig, PdfConfig, HierarchyConfig};
+
+/// Extracts `document.pdf` with PDF hierarchy detection enabled and prints the
+/// number of hierarchy levels reported.
+fn main() -> kreuzberg::Result<()> {
+    let config = ExtractionConfig {
+        pdf_options: Some(PdfConfig {
+            // Field set matches the HierarchyConfig reference snippet:
+            // enabled, k_clusters, include_bbox, ocr_coverage_threshold.
+            hierarchy: Some(HierarchyConfig {
+                enabled: true,
+                k_clusters: 6,
+                include_bbox: true,
+                ocr_coverage_threshold: Some(0.8),
+            }),
+            ..Default::default()
+        }),
+        ..Default::default()
+    };
+
+    let result = extract_file_sync("document.pdf", None::<&str>, &config)?;
+    // NOTE(review): confirm that the result exposes a top-level `hierarchy` collection.
+    println!("Hierarchy levels: {}", result.hierarchy.len());
+    Ok(())
+}
+```
--- a/docs/snippets/rust/config/postprocessor_config.md
+++ b/docs/snippets/rust/config/postprocessor_config.md
@@ -0,0 +1,22 @@
+```rust title="Rust"
+use kreuzberg::{extract_file, ExtractionConfig, PostProcessorConfig};
+
+/// Extracts `document.pdf` with post-processing restricted to the whitespace
+/// and unicode normalizers, then prints the processed text.
+#[tokio::main]
+async fn main() -> kreuzberg::Result<()> {
+    let enabled_processors = vec![
+        "whitespace_normalizer".to_string(),
+        "unicode_normalizer".to_string(),
+    ];
+    let postprocessor = PostProcessorConfig {
+        enabled: true,
+        enabled_processors: Some(enabled_processors),
+        disabled_processors: None,
+    };
+    let config = ExtractionConfig {
+        postprocessor: Some(postprocessor),
+        ..Default::default()
+    };
+
+    let extraction = extract_file("document.pdf", None::<&str>, &config).await?;
+    println!("Processed content: {}", extraction.content);
+    Ok(())
+}
+```
--- a/docs/snippets/rust/config/quality_processing_config.md
+++ b/docs/snippets/rust/config/quality_processing_config.md
@@ -0,0 +1,16 @@
+```rust title="Rust"
+use kreuzberg::{extract_file_sync, ExtractionConfig};
+
+/// Extracts `document.pdf` with quality processing and caching enabled, then
+/// prints the quality score and processing time read from the result.
+fn main() -> kreuzberg::Result<()> {
+    let config = ExtractionConfig {
+        use_cache: true,
+        enable_quality_processing: true,
+        ..Default::default()
+    };
+
+    let extraction = extract_file_sync("document.pdf", None::<&str>, &config)?;
+    let score = &extraction.quality_score;
+    let elapsed = &extraction.processing_time;
+    println!("Quality score: {}", score);
+    println!("Processing time: {:?}", elapsed);
+    Ok(())
+}
+```
--- a/docs/snippets/rust/config/tesseract_config.md
+++ b/docs/snippets/rust/config/tesseract_config.md
@@ -0,0 +1,23 @@
+```rust title="Rust"
+use kreuzberg::{extract_file_sync, ExtractionConfig, OcrConfig};
+use kreuzberg::types::TesseractConfig;
+
+/// Runs OCR on `scanned.pdf` with the `tesseract` backend for `eng+deu`, using
+/// `psm` 6 and `oem` 3, then prints the recognized text.
+fn main() -> kreuzberg::Result<()> {
+    let config = ExtractionConfig {
+        ocr: Some(OcrConfig {
+            backend: "tesseract".to_string(),
+            language: "eng+deu".to_string(),
+            tesseract_config: Some(TesseractConfig {
+                // Plain integers, matching the programmatic-config snippet (`psm: 6`).
+                psm: 6,
+                oem: 3,
+                ..Default::default()
+            }),
+            // OcrConfig has more fields than the three named here; keep their defaults.
+            ..Default::default()
+        }),
+        ..Default::default()
+    };
+
+    let result = extract_file_sync("scanned.pdf", None::<&str>, &config)?;
+    println!("OCR text: {}", result.content);
+    Ok(())
+}
+```
--- a/docs/snippets/rust/config/token_reduction_config.md
+++ b/docs/snippets/rust/config/token_reduction_config.md
@@ -0,0 +1,19 @@
+```rust title="Rust"
+use kreuzberg::{extract_file, ExtractionConfig, TokenReductionConfig};
+
+/// Extracts `document.pdf` with token reduction in "moderate" mode while
+/// preserving important words, then prints the token count and the content.
+#[tokio::main]
+async fn main() -> kreuzberg::Result<()> {
+    let token_reduction = TokenReductionConfig {
+        mode: "moderate".to_string(),
+        preserve_important_words: true,
+    };
+    let config = ExtractionConfig {
+        token_reduction: Some(token_reduction),
+        ..Default::default()
+    };
+
+    let extraction = extract_file("document.pdf", None::<&str>, &config).await?;
+    let tokens = &extraction.token_count;
+    println!("Original tokens: {}", tokens);
+    println!("Reduced content: {}", extraction.content);
+    Ok(())
+}
+```
--- a/docs/snippets/rust/docker/usage.rs
+++ b/docs/snippets/rust/docker/usage.rs
@@ -0,0 +1,86 @@
+```rust title="usage.rs"
+use std::process::Command;
+use anyhow::Result;
+use reqwest::multipart;
+use std::fs;
+
+struct DockerKreuzbergClient { // Handle for one Kreuzberg API container driven through the `docker` CLI.
+    container_name: String, // Passed to `docker run --name`, `docker stop` and `docker rm`.
+    api_port: u16, // Host port mapped to the container's port 8000 and used for HTTP requests.
+}
+
+impl DockerKreuzbergClient {
+    /// Creates a client for the container named `container_name`, whose API is
+    /// reached on host port `api_port`.
+    fn new(container_name: &str, api_port: u16) -> Self {
+        Self {
+            container_name: container_name.to_string(),
+            api_port,
+        }
+    }
+
+    /// Starts `image` detached via `docker run`, mapping `api_port` on the host
+    /// to port 8000 in the container.
+    ///
+    /// # Errors
+    /// Fails when `docker` cannot be spawned or exits with a non-zero status.
+    fn start_container(&self, image: &str) -> Result<()> {
+        println!("Starting Kreuzberg Docker container...");
+        let port_mapping = format!("{}:8000", self.api_port);
+        let status = Command::new("docker")
+            .args([
+                "run",
+                "-d",
+                "--name",
+                self.container_name.as_str(),
+                "-p",
+                port_mapping.as_str(),
+                image,
+            ])
+            .status()?;
+
+        if !status.success() {
+            anyhow::bail!("Failed to start Docker container");
+        }
+
+        println!("Container started on http://localhost:{}", self.api_port);
+        Ok(())
+    }
+
+    /// Uploads the file at `file_path` as multipart field `file` to the
+    /// container's `/api/extract` endpoint and returns the `content` string of
+    /// the JSON reply (empty when that key is absent or not a string).
+    ///
+    /// # Errors
+    /// Fails when the file cannot be read, the request fails, the server
+    /// answers with an error status, or the body is not valid JSON.
+    async fn extract_file(&self, file_path: &str) -> Result<String> {
+        let file_bytes = fs::read(file_path)?;
+        let file_part = multipart::Part::bytes(file_bytes)
+            .file_name(file_path.to_string());
+        let form = multipart::Form::new()
+            .part("file", file_part);
+
+        let client = reqwest::Client::new();
+        let response = client
+            .post(format!("http://localhost:{}/api/extract", self.api_port))
+            .multipart(form)
+            .send()
+            .await?
+            // Turn 4xx/5xx replies into errors instead of parsing an error body
+            // and silently returning an empty string.
+            .error_for_status()?;
+
+        let json: serde_json::Value = response.json().await?;
+        Ok(json["content"].as_str().unwrap_or("").to_string())
+    }
+
+    /// Runs `docker stop` followed by `docker rm` for this container.
+    ///
+    /// Only a failure to spawn `docker` is reported; the exit statuses of the
+    /// two commands are not inspected, so `rm` is attempted even if `stop` failed.
+    fn stop_container(&self) -> Result<()> {
+        println!("Stopping Kreuzberg Docker container...");
+        Command::new("docker")
+            .args(["stop", self.container_name.as_str()])
+            .status()?;
+        Command::new("docker")
+            .args(["rm", self.container_name.as_str()])
+            .status()?;
+        println!("Container stopped and removed");
+        Ok(())
+    }
+}
+
+/// Starts the `kreuzberg:latest` container, extracts `document.pdf` through its
+/// HTTP API, prints the content, and always tears the container down again.
+#[tokio::main]
+async fn main() -> Result<()> {
+    let docker_client = DockerKreuzbergClient::new("kreuzberg-api", 8000);
+
+    docker_client.start_container("kreuzberg:latest")?;
+    // Give the API a moment to come up before sending the request.
+    tokio::time::sleep(tokio::time::Duration::from_secs(2)).await;
+
+    let outcome = docker_client.extract_file("document.pdf").await;
+    if let Ok(content) = &outcome {
+        println!("Extracted content:\n{}", content);
+    }
+
+    // Stop the container before propagating an extraction error, so a failed
+    // request does not leave it running; the extraction error takes priority.
+    let stopped = docker_client.stop_container();
+    outcome?;
+    stopped
+}
+```
--- a/docs/snippets/rust/getting-started/basic_usage.md
+++ b/docs/snippets/rust/getting-started/basic_usage.md
@@ -0,0 +1,16 @@
+```rust title="Rust"
+use kreuzberg::{extract_file_sync, ExtractionConfig};
+
+/// Extracts `document.pdf` with caching and quality processing enabled, then
+/// prints the text followed by the detected MIME type.
+fn main() -> kreuzberg::Result<()> {
+    let config = ExtractionConfig {
+        enable_quality_processing: true,
+        use_cache: true,
+        ..Default::default()
+    };
+
+    let extraction = extract_file_sync("document.pdf", None, &config)?;
+    let mime_type = &extraction.mime_type;
+    println!("{}", extraction.content);
+    println!("MIME Type: {}", mime_type);
+    Ok(())
+}
+```
--- a/docs/snippets/rust/getting-started/extract_file.md
+++ b/docs/snippets/rust/getting-started/extract_file.md
@@ -0,0 +1,12 @@
+```rust title="Rust"
+use kreuzberg::extract_file_sync;
+
+/// Extracts `document.pdf` with the default configuration and prints the text,
+/// the number of tables and the format recorded in the metadata.
+fn main() -> kreuzberg::Result<()> {
+    let result = extract_file_sync("document.pdf", None, &Default::default())?;
+
+    println!("Extracted content: {}", result.content);
+    println!("Tables found: {}", result.tables.len());
+    // `metadata` is read directly (as in the PDF config snippet's
+    // `result.metadata.title`), not through an `Option`.
+    println!("Format: {:?}", result.metadata.format);
+    Ok(())
+}
+```
--- a/docs/snippets/rust/getting-started/extract_with_ocr.md
+++ b/docs/snippets/rust/getting-started/extract_with_ocr.md
@@ -0,0 +1,20 @@
+```rust title="Rust"
+use kreuzberg::{extract_file_sync, ExtractionConfig, OcrConfig};
+
+/// Forces OCR on `scanned.pdf` with the `tesseract` backend set to `eng` and
+/// prints the text together with the detected languages.
+fn main() -> kreuzberg::Result<()> {
+    let ocr = OcrConfig {
+        backend: "tesseract".to_string(),
+        language: "eng".to_string(),
+        ..Default::default()
+    };
+    let config = ExtractionConfig {
+        force_ocr: true,
+        ocr: Some(ocr),
+        ..Default::default()
+    };
+
+    let extraction = extract_file_sync("scanned.pdf", None, &config)?;
+    println!("{}", extraction.content);
+    println!("Detected languages: {:?}", extraction.detected_languages);
+    Ok(())
+}
+```
--- a/docs/snippets/rust/getting-started/hello_world.md
+++ b/docs/snippets/rust/getting-started/hello_world.md
@@ -0,0 +1,9 @@
+```rust title="Rust"
+use kreuzberg::extract_file_sync;
+
+/// Extracts `document.pdf` with the default configuration and prints its text.
+fn main() -> kreuzberg::Result<()> {
+    let config = Default::default();
+    let extraction = extract_file_sync("document.pdf", None, &config)?;
+    println!("{}", extraction.content);
+    Ok(())
+}
+```
--- a/docs/snippets/rust/getting-started/install_verify.md
+++ b/docs/snippets/rust/getting-started/install_verify.md
@@ -0,0 +1,10 @@
+```rust title="Rust"
+use kreuzberg::extract_file_sync;
+
+/// Smoke test for an installation: extracts `document.pdf` with the default
+/// configuration and reports whether any text came back and how long it is.
+fn main() -> kreuzberg::Result<()> {
+    let result = extract_file_sync("document.pdf", None, &Default::default())?;
+    println!("Extraction successful: {}", !result.content.is_empty());
+    // `String::len` counts bytes; count `char`s so the "characters" label is accurate.
+    println!("Content length: {} characters", result.content.chars().count());
+    Ok(())
+}
+```
--- a/docs/snippets/rust/getting-started/read_content.md
+++ b/docs/snippets/rust/getting-started/read_content.md
@@ -0,0 +1,14 @@
+```rust title="Rust"
+use std::fs;
+use kreuzberg::extract_bytes_sync;
+
+/// Reads `document.pdf` into memory, extracts it from the bytes as
+/// `application/pdf`, and prints the text and its length in characters.
+fn main() -> kreuzberg::Result<()> {
+    let data = fs::read("document.pdf")?;
+    let result = extract_bytes_sync(&data, "application/pdf", &Default::default())?;
+
+    println!("{}", result.content);
+    println!("Success: true");
+    // `String::len` counts bytes; count `char`s so the "characters" label is accurate.
+    println!("Content length: {} characters", result.content.chars().count());
+    Ok(())
+}
+```
--- a/docs/snippets/rust/llm/structured_extraction.md
+++ b/docs/snippets/rust/llm/structured_extraction.md
@@ -0,0 +1,44 @@
+```rust title="Rust"
+use kreuzberg::{
+    extract_file, ExtractionConfig, LlmConfig, StructuredExtractionConfig,
+};
+use serde_json::json;
+
+/// Extracts `paper.pdf` and prints the structured output produced for a JSON schema.
+#[tokio::main]
+async fn main() -> kreuzberg::Result<()> {
+    // JSON Schema for the requested object: title, author list, and date are all required.
+    let schema = json!({
+        "type": "object",
+        "properties": {
+            "title": { "type": "string" },
+            "authors": { "type": "array", "items": { "type": "string" } },
+            "date": { "type": "string" }
+        },
+        "required": ["title", "authors", "date"],
+        "additionalProperties": false
+    });
+
+    let llm = LlmConfig {
+        model: "openai/gpt-4o-mini".to_string(),
+        ..Default::default()
+    };
+
+    let config = ExtractionConfig {
+        structured_extraction: Some(StructuredExtractionConfig {
+            schema,
+            llm,
+            strict: true,
+            ..Default::default()
+        }),
+        ..Default::default()
+    };
+
+    let result = extract_file("paper.pdf", None, &config).await?;
+    // `structured_output` is optional; print it only when present.
+    if let Some(structured) = result.structured_output.as_ref() {
+        println!("{}", structured);
+    }
+    Ok(())
+}
+```
--- a/docs/snippets/rust/mcp/mcp_custom_client.md
+++ b/docs/snippets/rust/mcp/mcp_custom_client.md
@@ -0,0 +1,46 @@
+```rust title="Rust"
+use serde_json::json;
+use std::io::{BufRead, BufReader, Write};
+use std::process::{Command, Stdio};
+
+/// Spawns `kreuzberg mcp`, sends one `tools/call` request over stdin, and prints the
+/// first line the server writes to stdout.
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+    let mut child = Command::new("kreuzberg")
+        .arg("mcp")
+        .stdin(Stdio::piped())
+        .stdout(Stdio::piped())
+        .spawn()?;
+
+    {
+        let stdin = child.stdin.as_mut().ok_or("Failed to open stdin")?;
+        // MCP messages are JSON-RPC 2.0: without `jsonrpc` and `id` the message is not a
+        // request that can be answered, and the read below would wait indefinitely.
+        // NOTE(review): a spec-compliant client also performs the `initialize` handshake
+        // before `tools/call` — confirm whether this server requires it.
+        let request = json!({
+            "jsonrpc": "2.0",
+            "id": 1,
+            "method": "tools/call",
+            "params": {
+                "name": "extract_file",
+                "arguments": {
+                    "path": "document.pdf",
+                    "async": true
+                }
+            }
+        });
+        stdin.write_all(request.to_string().as_bytes())?;
+        stdin.write_all(b"\n")?;
+    }
+
+    let stdout = child.stdout.take().ok_or("Failed to open stdout")?;
+    // Only the first response line is needed; `?` surfaces a read error instead of skipping it.
+    if let Some(line) = BufReader::new(stdout).lines().next() {
+        println!("{}", line?);
+    }
+
+    child.wait()?;
+    Ok(())
+}
+```
--- a/docs/snippets/rust/mcp/mcp_server_start.md
+++ b/docs/snippets/rust/mcp/mcp_server_start.md
@@ -0,0 +1,11 @@
+```rust title="Rust"
+use kreuzberg::{ExtractionConfig, mcp::start_mcp_server_with_config};
+
+/// Starts the MCP server with a discovered extraction config.
+#[tokio::main]
+async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
+    // Discovery is fallible; a failure is returned before the server is started.
+    start_mcp_server_with_config(ExtractionConfig::discover()?).await?;
+    Ok(())
+}
+```
--- a/docs/snippets/rust/metadata/image_extraction.md
+++ b/docs/snippets/rust/metadata/image_extraction.md
@@ -0,0 +1,19 @@
+```rust title="Rust"
+use kreuzberg::{ExtractionConfig, ImageExtractionConfig};
+
+/// Builds an extraction config with image extraction enabled and prints the image settings.
+fn main() {
+    let config = ExtractionConfig {
+        images: Some(ImageExtractionConfig {
+            extract_images: true,
+            target_dpi: 200,
+            max_image_dimension: 2048,
+            inject_placeholders: true, // set to false to extract images without markdown references
+            auto_adjust_dpi: true,
+            ..Default::default()
+        }),
+        ..Default::default()
+    };
+    println!("{:?}", config.images);
+}
+```
--- a/docs/snippets/rust/metadata/image_preprocessing.md
+++ b/docs/snippets/rust/metadata/image_preprocessing.md
@@ -0,0 +1,31 @@
+```rust title="Rust"
+use kreuzberg::{ExtractionConfig, ImagePreprocessingConfig, OcrConfig, TesseractConfig};
+
+/// Builds an OCR config whose Tesseract settings carry image preprocessing options.
+fn main() {
+    // Preprocessing options; fields not listed keep their defaults.
+    let preprocessing = ImagePreprocessingConfig {
+        target_dpi: 300,
+        denoise: true,
+        deskew: true,
+        contrast_enhance: true,
+        binarization_method: "otsu".to_string(),
+        ..Default::default()
+    };
+
+    let tesseract = TesseractConfig {
+        preprocessing: Some(preprocessing),
+        ..Default::default()
+    };
+
+    let config = ExtractionConfig {
+        ocr: Some(OcrConfig {
+            tesseract_config: Some(tesseract),
+            ..Default::default()
+        }),
+        ..Default::default()
+    };
+
+    println!("{:?}", config.ocr);
+}
+```
--- a/docs/snippets/rust/metadata/language_detection.md
+++ b/docs/snippets/rust/metadata/language_detection.md
@@ -0,0 +1,18 @@
+```rust title="Rust"
+use kreuzberg::{ExtractionConfig, LanguageDetectionConfig};
+
+/// Builds a config with language detection enabled and prints the detection settings.
+fn main() {
+    let config = ExtractionConfig {
+        language_detection: Some(LanguageDetectionConfig {
+            enabled: true,
+            min_confidence: 0.9,
+            detect_multiple: true,
+            // Default any remaining fields so the literal survives new fields being added.
+            ..Default::default()
+        }),
+        ..Default::default()
+    };
+    println!("{:?}", config.language_detection);
+}
+```
--- a/docs/snippets/rust/metadata/language_detection_multilingual.md
+++ b/docs/snippets/rust/metadata/language_detection_multilingual.md
@@ -0,0 +1,22 @@
+```rust title="Rust"
+use kreuzberg::{extract_file, ExtractionConfig, LanguageDetectionConfig};
+
+/// Extracts a multilingual document and prints the languages that were detected.
+#[tokio::main]
+async fn main() -> kreuzberg::Result<()> {
+    let config = ExtractionConfig {
+        language_detection: Some(LanguageDetectionConfig {
+            enabled: true,
+            min_confidence: 0.8,
+            detect_multiple: true,
+            ..Default::default()
+        }),
+        ..Default::default()
+    };
+
+    let result = extract_file("multilingual_document.pdf", None, &config).await?;
+
+    println!("Detected languages: {:?}", result.detected_languages);
+    Ok(())
+}
+```
--- a/docs/snippets/rust/metadata/metadata.md
+++ b/docs/snippets/rust/metadata/metadata.md
@@ -0,0 +1,75 @@
+```rust title="Rust"
+use kreuzberg::{extract_file_sync, ExtractionConfig};
+
+/// Prints PDF metadata for `document.pdf`, then HTML metadata for `page.html`.
+fn main() -> kreuzberg::Result<()> {
+    let config = ExtractionConfig::default();
+
+    // PDF metadata: each field is optional and printed only when present.
+    let result = extract_file_sync("document.pdf", None, &config)?;
+    if let Some(pdf_meta) = &result.metadata.pdf {
+        if let Some(pages) = &pdf_meta.page_count {
+            println!("Pages: {}", pages);
+        }
+        if let Some(author) = &pdf_meta.author {
+            println!("Author: {}", author);
+        }
+        if let Some(title) = &pdf_meta.title {
+            println!("Title: {}", title);
+        }
+    }
+
+    let html_result = extract_file_sync("page.html", None, &config)?;
+    let Some(html_meta) = &html_result.metadata.html else {
+        return Ok(());
+    };
+
+    if let Some(title) = &html_meta.title {
+        println!("Title: {}", title);
+    }
+    if let Some(desc) = &html_meta.description {
+        println!("Description: {}", desc);
+    }
+
+    // Keywords are printed with `Debug` formatting.
+    println!("Keywords: {:?}", html_meta.keywords);
+
+    // Canonical URL (field renamed from `canonical`).
+    if let Some(canonical) = &html_meta.canonical_url {
+        println!("Canonical URL: {}", canonical);
+    }
+
+    // Open Graph and Twitter Card values are looked up by key.
+    if let Some(og_image) = html_meta.open_graph.get("image") {
+        println!("Open Graph Image: {}", og_image);
+    }
+    if let Some(og_title) = html_meta.open_graph.get("title") {
+        println!("Open Graph Title: {}", og_title);
+    }
+    if let Some(twitter_card) = html_meta.twitter_card.get("card") {
+        println!("Twitter Card Type: {}", twitter_card);
+    }
+
+    if let Some(lang) = &html_meta.language {
+        println!("Language: {}", lang);
+    }
+
+    // Looping over an empty collection prints nothing, so no emptiness guard is needed.
+    for header in &html_meta.headers {
+        println!("Header (level {}): {}", header.level, header.text);
+    }
+    for link in &html_meta.links {
+        println!("Link: {} ({})", link.href, link.text);
+    }
+    for image in &html_meta.images {
+        println!("Image: {}", image.src);
+    }
+
+    // Structured data: only the item count is reported.
+    let structured_items = html_meta.structured_data.len();
+    if structured_items > 0 {
+        println!("Structured data items: {}", structured_items);
+    }
+    Ok(())
+}
+```
--- a/docs/snippets/rust/metadata/page_boundaries.md
+++ b/docs/snippets/rust/metadata/page_boundaries.md
@@ -0,0 +1,32 @@
+```rust title="Rust"
+use kreuzberg::{extract_file_sync, ExtractionConfig};
+
+/// Prints the byte range and a short preview for the first three page boundaries.
+fn main() -> kreuzberg::Result<()> {
+    let config = ExtractionConfig::default();
+    let result = extract_file_sync("document.pdf", None, &config)?;
+
+    let Some(pages) = &result.metadata.pages else {
+        return Ok(());
+    };
+    let Some(boundaries) = &pages.boundaries else {
+        return Ok(());
+    };
+
+    for boundary in boundaries.iter().take(3) {
+        // `str::get` returns `None` for a range that is out of bounds or not on a `char`
+        // boundary, so a bad offset skips the page instead of panicking.
+        let Some(page_text) = result.content.get(boundary.byte_start..boundary.byte_end) else {
+            continue;
+        };
+        // Take 100 `char`s: slicing at byte 100 panics when it falls inside a multi-byte char.
+        let preview: String = page_text.chars().take(100).collect();
+
+        println!("Page {}:", boundary.page_number);
+        println!("  Byte range: {}-{}", boundary.byte_start, boundary.byte_end);
+        println!("  Preview: {}...", preview);
+    }
+
+    Ok(())
+}
+```
--- a/docs/snippets/rust/metadata/page_tracking_basic.md
+++ b/docs/snippets/rust/metadata/page_tracking_basic.md
@@ -0,0 +1,20 @@
+Use Kreuzberg::{extract_file_sync, ExtractionConfig, PageConfig};
+
+Let config = ExtractionConfig {
+pages: Some(PageConfig {
+extract_pages: true,
+..Default::default()
+}),
+..Default::default()
+};
+
+Let result = extract_file_sync("document.pdf", &config)?;
+
+If let Some(pages) = result.pages {
+for page in pages {
+println!("Page {}:", page.page_number);
+println!(" Content: {} chars", page.content.len());
+println!(" Tables: {}", page.tables.len());
+println!(" Images: {}", page.images.len());
+}
+}
--- a/docs/snippets/rust/metadata/pdf_metadata_extractor.md
+++ b/docs/snippets/rust/metadata/pdf_metadata_extractor.md
@@ -0,0 +1,89 @@
+```rust title="Rust"
+use kreuzberg::plugins::{Plugin, PostProcessor, ProcessingStage};
+// NOTE(review): `ProcessingWarning` was used below without being imported; the path here
+// assumes it lives in `kreuzberg::types` — confirm against the crate's exports.
+use kreuzberg::types::ProcessingWarning;
+use kreuzberg::{Result, ExtractionResult, ExtractionConfig};
+use async_trait::async_trait;
+use std::sync::atomic::{AtomicUsize, Ordering};
+
+/// Post-processor that appends a processing warning to the PDF results it handles.
+struct PdfMetadataExtractor {
+    /// Number of results passed through `process`.
+    processed_count: AtomicUsize,
+}
+
+impl PdfMetadataExtractor {
+    fn new() -> Self {
+        Self {
+            processed_count: AtomicUsize::new(0),
+        }
+    }
+}
+
+impl Plugin for PdfMetadataExtractor {
+    fn name(&self) -> &str { "pdf-metadata-extractor" }
+    fn version(&self) -> String { "1.0.0".to_string() }
+    fn description(&self) -> &str {
+        "Extracts and enriches PDF metadata"
+    }
+    fn initialize(&self) -> Result<()> {
+        log::info!("PDF metadata extractor initialized");
+        Ok(())
+    }
+    fn shutdown(&self) -> Result<()> {
+        // Log how many results went through `process`.
+        let count = self.processed_count.load(Ordering::Acquire);
+        log::info!("Processed {} PDFs", count);
+        Ok(())
+    }
+}
+
+#[async_trait]
+impl PostProcessor for PdfMetadataExtractor {
+    /// Bumps the counter and appends a warning entry to the result.
+    async fn process(
+        &self,
+        result: &mut ExtractionResult,
+        _config: &ExtractionConfig,
+    ) -> Result<()> {
+        self.processed_count.fetch_add(1, Ordering::AcqRel);
+
+        result.processing_warnings.push(ProcessingWarning {
+            source: "pdf-metadata-extractor".to_string(),
+            message: "PDF metadata extracted successfully".to_string(),
+        });
+
+        Ok(())
+    }
+
+    fn processing_stage(&self) -> ProcessingStage {
+        ProcessingStage::Early
+    }
+
+    /// Restricts this processor to results whose MIME type is `application/pdf`.
+    fn should_process(
+        &self,
+        result: &ExtractionResult,
+        _config: &ExtractionConfig,
+    ) -> bool {
+        result.mime_type == "application/pdf"
+    }
+
+    fn estimated_duration_ms(&self, _result: &ExtractionResult) -> u64 {
+        10
+    }
+}
+
+use kreuzberg::plugins::registry::get_post_processor_registry;
+use std::sync::Arc;
+
+/// Registers the processor with the registry returned by `get_post_processor_registry`.
+fn register() -> Result<()> {
+    let processor = Arc::new(PdfMetadataExtractor::new());
+    let registry = get_post_processor_registry();
+    // The second argument is passed as 50. NOTE(review): presumably a priority — confirm.
+    registry.register(processor, 50)?;
+    Ok(())
+}
+```
--- a/docs/snippets/rust/metadata/pdf_only_processor.md
+++ b/docs/snippets/rust/metadata/pdf_only_processor.md
@@ -0,0 +1,26 @@
+```rust title="Rust"
+#[async_trait]
+impl PostProcessor for PdfOnlyProcessor {
+    async fn process(
+        &self,
+        _result: &mut ExtractionResult,
+        _config: &ExtractionConfig
+    ) -> Result<()> {
+        // Intentionally a no-op: this snippet only demonstrates the `should_process` filter.
+        Ok(())
+    }
+
+    fn processing_stage(&self) -> ProcessingStage {
+        ProcessingStage::Middle
+    }
+
+    fn should_process(
+        &self,
+        result: &ExtractionResult,
+        _config: &ExtractionConfig
+    ) -> bool {
+        // Accept only results whose MIME type is exactly `application/pdf`.
+        result.mime_type == "application/pdf"
+    }
+}
+```
--- a/docs/snippets/rust/metadata/tables.md
+++ b/docs/snippets/rust/metadata/tables.md
@@ -0,0 +1,20 @@
+```rust title="Rust"
+use kreuzberg::{extract_file_sync, ExtractionConfig};
+
+/// Prints every extracted table: its row count, its markdown form, and each row of cells.
+fn main() -> kreuzberg::Result<()> {
+    let result = extract_file_sync("document.pdf", None, &ExtractionConfig::default())?;
+
+    for table in result.tables.iter() {
+        let rows = &table.cells;
+        println!("Table with {} rows", rows.len());
+        println!("{}", table.markdown);
+
+        // Rows are printed with `Debug` formatting, one per line.
+        for row in rows {
+            println!("{:?}", row);
+        }
+    }
+    Ok(())
+}
+```
--- a/docs/snippets/rust/metadata/vector_database_integration.md
+++ b/docs/snippets/rust/metadata/vector_database_integration.md
@@ -0,0 +1,65 @@
+```rust title="Rust"
+use kreuzberg::{extract_file, ExtractionConfig, ChunkingConfig, EmbeddingConfig};
+
+/// One row destined for a vector store: chunk text, its embedding, and string metadata.
+struct VectorRecord {
+    id: String,
+    content: String,
+    embedding: Vec<f32>,
+    metadata: std::collections::HashMap<String, String>,
+}
+
+/// Extracts `document_path` with chunking and embeddings configured and returns one
+/// `VectorRecord` per chunk that carries an embedding.
+///
+/// Record ids have the form `{document_id}_chunk_{index}`, where `index` is the chunk's
+/// position among all chunks (chunks without an embedding are skipped, not renumbered).
+async fn extract_and_vectorize(
+    document_path: &str,
+    document_id: &str,
+) -> Result<Vec<VectorRecord>, Box<dyn std::error::Error>> {
+    let embedding = EmbeddingConfig {
+        model: kreuzberg::EmbeddingModelType::Preset {
+            name: "balanced".to_string(),
+        },
+        normalize: true,
+        batch_size: 32,
+        ..Default::default()
+    };
+    let config = ExtractionConfig {
+        chunking: Some(ChunkingConfig {
+            max_characters: 512,
+            overlap: 50,
+            embedding: Some(embedding),
+            ..Default::default()
+        }),
+        ..Default::default()
+    };
+
+    let result = extract_file(document_path, None, &config).await?;
+
+    // `enumerate` runs before the filter so indices refer to the original chunk order.
+    let records = result
+        .chunks
+        .iter()
+        .flatten()
+        .enumerate()
+        .filter_map(|(index, chunk)| {
+            let embedding = chunk.embedding.as_ref()?;
+            let metadata = std::collections::HashMap::from([
+                ("document_id".to_string(), document_id.to_string()),
+                ("chunk_index".to_string(), index.to_string()),
+                ("content_length".to_string(), chunk.content.len().to_string()),
+            ]);
+            Some(VectorRecord {
+                id: format!("{}_chunk_{}", document_id, index),
+                content: chunk.content.clone(),
+                embedding: embedding.clone(),
+                metadata,
+            })
+        })
+        .collect();
+
+    Ok(records)
+}
+```
--- a/docs/snippets/rust/ocr/cloud_ocr_backend.md
+++ b/docs/snippets/rust/ocr/cloud_ocr_backend.md
@@ -0,0 +1,67 @@
+```rust title="Rust"
+use kreuzberg::plugins::{Plugin, OcrBackend, OcrBackendType};
+use kreuzberg::{Result, ExtractionResult, OcrConfig, Metadata};
+use async_trait::async_trait;
+use std::path::Path;
+
+/// Example OCR backend that forwards images to a (stubbed) cloud API.
+struct CloudOcrBackend {
+    api_key: String,
+    supported_langs: Vec<String>,
+}
+
+impl Plugin for CloudOcrBackend {
+    fn name(&self) -> &str { "cloud-ocr" }
+    fn version(&self) -> String { "1.0.0".to_string() }
+    fn initialize(&self) -> Result<()> { Ok(()) }
+    fn shutdown(&self) -> Result<()> { Ok(()) }
+}
+
+#[async_trait]
+impl OcrBackend for CloudOcrBackend {
+    /// Passes the image to `call_cloud_api` and wraps the returned text as `text/plain` content.
+    async fn process_image(
+        &self,
+        image_bytes: &[u8],
+        config: &OcrConfig,
+    ) -> Result<ExtractionResult> {
+        let text = self.call_cloud_api(image_bytes, &config.language).await?;
+
+        // Default every field not set here: a literal that must name each field stops
+        // compiling as soon as `ExtractionResult` has fields beyond the ones listed.
+        Ok(ExtractionResult {
+            content: text,
+            mime_type: "text/plain".to_string(),
+            metadata: Metadata::default(),
+            tables: vec![],
+            detected_languages: None,
+            chunks: None,
+            images: None,
+            ..Default::default()
+        })
+    }
+
+    fn supports_language(&self, lang: &str) -> bool {
+        self.supported_langs.iter().any(|l| l == lang)
+    }
+
+    fn backend_type(&self) -> OcrBackendType {
+        OcrBackendType::Custom
+    }
+
+    fn supported_languages(&self) -> Vec<String> {
+        self.supported_langs.clone()
+    }
+}
+
+impl CloudOcrBackend {
+    /// Placeholder for the real cloud request; ignores its inputs and returns fixed text.
+    async fn call_cloud_api(
+        &self,
+        _image: &[u8],
+        _language: &str
+    ) -> Result<String> {
+        Ok("Extracted text".to_string())
+    }
+}
+```
--- a/docs/snippets/rust/ocr/config_ocr.md
+++ b/docs/snippets/rust/ocr/config_ocr.md
@@ -0,0 +1,21 @@
+```rust title="Rust"
+use kreuzberg::{ExtractionConfig, OcrConfig, TesseractConfig};
+
+/// Builds an OCR config for Tesseract with two languages and an explicit `psm` value.
+fn main() {
+    let config = ExtractionConfig {
+        ocr: Some(OcrConfig {
+            backend: "tesseract".to_string(),
+            language: "eng+fra".to_string(),
+            tesseract_config: Some(TesseractConfig {
+                psm: 3,
+                ..Default::default()
+            }),
+            ..Default::default()
+        }),
+        ..Default::default()
+    };
+    // Print the settings so the example has visible output and `config` is not unused.
+    println!("{:?}", config.ocr);
+}
+```
--- a/docs/snippets/rust/ocr/image_extraction.md
+++ b/docs/snippets/rust/ocr/image_extraction.md
@@ -0,0 +1,20 @@
+```rust title="Rust"
+use kreuzberg::{extract_file_sync, ExtractionConfig, ImageExtractionConfig};
+
+/// Extracts `document.pdf` with image extraction configured and prints the content length.
+fn main() -> kreuzberg::Result<()> {
+    let images = ImageExtractionConfig {
+        extract_images: true,
+        target_dpi: 200,
+        max_image_dimension: 2048,
+        inject_placeholders: true, // set to false to extract images without markdown references
+        auto_adjust_dpi: true,
+        ..Default::default()
+    };
+    let config = ExtractionConfig { images: Some(images), ..Default::default() };
+
+    let result = extract_file_sync("document.pdf", None, &config)?;
+    println!("content length: {}", result.content.len());
+    Ok(())
+}
+```
--- a/docs/snippets/rust/ocr/image_preprocessing.md
+++ b/docs/snippets/rust/ocr/image_preprocessing.md
@@ -0,0 +1,33 @@
+```rust title="Rust"
+use kreuzberg::{
+    extract_file_sync, ExtractionConfig, ImagePreprocessingConfig, OcrConfig, TesseractConfig,
+};
+
+/// Extracts `document.pdf` using Tesseract with image preprocessing options set.
+fn main() -> kreuzberg::Result<()> {
+    // Preprocessing is nested inside the Tesseract-specific settings.
+    let tesseract = TesseractConfig {
+        preprocessing: Some(ImagePreprocessingConfig {
+            target_dpi: 300,
+            denoise: true,
+            deskew: true,
+            contrast_enhance: true,
+            binarization_method: "otsu".to_string(),
+            ..Default::default()
+        }),
+        ..Default::default()
+    };
+
+    let ocr = OcrConfig {
+        backend: "tesseract".to_string(),
+        language: "eng".to_string(),
+        tesseract_config: Some(tesseract),
+        ..Default::default()
+    };
+    let config = ExtractionConfig { ocr: Some(ocr), ..Default::default() };
+
+    let result = extract_file_sync("document.pdf", None, &config)?;
+    println!("content length: {}", result.content.len());
+    Ok(())
+}
+```
--- a/docs/snippets/rust/ocr/ocr_dpi_config.md
+++ b/docs/snippets/rust/ocr/ocr_dpi_config.md
@@ -0,0 +1,23 @@
+```rust title="Rust"
+use kreuzberg::{extract_file_sync, ExtractionConfig, OcrConfig, PdfConfig};
+
+/// Extracts `scanned.pdf` with Tesseract OCR and a PDF `dpi` of 300, then prints the text.
+fn main() -> kreuzberg::Result<()> {
+    let config = ExtractionConfig {
+        ocr: Some(OcrConfig {
+            backend: "tesseract".to_string(),
+            ..Default::default()
+        }),
+        pdf_options: Some(PdfConfig {
+            dpi: Some(300),
+            ..Default::default()
+        }),
+        ..Default::default()
+    };
+
+    let result = extract_file_sync("scanned.pdf", None, &config)?;
+    // Print the text so the example has visible output and `result` is not unused.
+    println!("{}", result.content);
+    Ok(())
+}
+```
--- a/docs/snippets/rust/ocr/ocr_easyocr.md
+++ b/docs/snippets/rust/ocr/ocr_easyocr.md
@@ -0,0 +1,18 @@
+```rust title="Rust"
+use kreuzberg::{extract_file, ExtractionConfig, OcrConfig};
+
+/// Extracts `document.pdf` with the "easyocr" backend and prints the text.
+#[tokio::main]
+async fn main() -> kreuzberg::Result<()> {
+    let ocr = OcrConfig {
+        backend: "easyocr".to_string(),
+        language: "en".to_string(),
+        ..Default::default()
+    };
+    let config = ExtractionConfig { ocr: Some(ocr), ..Default::default() };
+
+    let result = extract_file("document.pdf", None, &config).await?;
+    println!("Extracted text: {}", result.content);
+    Ok(())
+}
+```
--- a/docs/snippets/rust/ocr/ocr_elements.md
+++ b/docs/snippets/rust/ocr/ocr_elements.md
@@ -0,0 +1,36 @@
+```rust title="Rust"
+use kreuzberg::{extract_file, ExtractionConfig, OcrConfig};
+use kreuzberg::types::OcrElementConfig;
+
+/// Runs OCR with element output requested and prints each returned element.
+#[tokio::main]
+async fn main() -> kreuzberg::Result<()> {
+    let element_config = OcrElementConfig {
+        include_elements: true,
+        ..Default::default()
+    };
+    let config = ExtractionConfig {
+        ocr: Some(OcrConfig {
+            backend: "paddleocr".to_string(),
+            language: "en".to_string(),
+            element_config: Some(element_config),
+            ..Default::default()
+        }),
+        ..Default::default()
+    };
+
+    let result = extract_file("scanned.pdf", None, &config).await?;
+
+    // `ocr_elements` is optional; flattening yields nothing when it is `None`.
+    for element in result.ocr_elements.iter().flatten() {
+        println!("Text: {}", element.text);
+        println!("Confidence: {:.2}", element.confidence.recognition);
+        println!("Geometry: {:?}", element.geometry);
+        if let Some(rotation) = element.rotation.as_ref() {
+            println!("Rotation: {}°", rotation.angle_degrees);
+        }
+        println!();
+    }
+    Ok(())
+}
+```
--- a/docs/snippets/rust/ocr/ocr_extraction.md
+++ b/docs/snippets/rust/ocr/ocr_extraction.md
@@ -0,0 +1,17 @@
+```rust title="Rust"
+use kreuzberg::{extract_file_sync, ExtractionConfig, OcrConfig};
+
+/// Extracts `scanned.pdf` with the Tesseract OCR backend and prints the text.
+fn main() -> kreuzberg::Result<()> {
+    let ocr = OcrConfig {
+        backend: "tesseract".to_string(),
+        language: "eng".to_string(),
+        ..Default::default()
+    };
+    let config = ExtractionConfig { ocr: Some(ocr), ..Default::default() };
+
+    let result = extract_file_sync("scanned.pdf", None, &config)?;
+    println!("{}", result.content);
+    Ok(())
+}
+```
--- a/docs/snippets/rust/ocr/ocr_force_all_pages.md
+++ b/docs/snippets/rust/ocr/ocr_force_all_pages.md
@@ -0,0 +1,17 @@
+```rust title="Rust"
+use kreuzberg::{extract_file_sync, ExtractionConfig, OcrConfig};
+
+/// Extracts `document.pdf` with `force_ocr` enabled and prints the text.
+fn main() -> kreuzberg::Result<()> {
+    let ocr = OcrConfig {
+        backend: "tesseract".to_string(),
+        ..Default::default()
+    };
+    // `force_ocr` sits on the extraction config, next to the OCR settings.
+    let config = ExtractionConfig { ocr: Some(ocr), force_ocr: true, ..Default::default() };
+
+    let result = extract_file_sync("document.pdf", None, &config)?;
+    println!("{}", result.content);
+    Ok(())
+}
+```
--- a/docs/snippets/rust/ocr/ocr_multi_language.md
+++ b/docs/snippets/rust/ocr/ocr_multi_language.md
@@ -0,0 +1,18 @@
+```rust title="Rust"
+use kreuzberg::{extract_file_sync, ExtractionConfig, OcrConfig};
+
+/// Extracts `multilingual.pdf` with several OCR languages configured and prints the text.
+fn main() -> kreuzberg::Result<()> {
+    let ocr = OcrConfig {
+        backend: "tesseract".to_string(),
+        // Three language codes joined with `+`.
+        language: "eng+deu+fra".to_string(),
+        ..Default::default()
+    };
+    let config = ExtractionConfig { ocr: Some(ocr), ..Default::default() };
+
+    let result = extract_file_sync("multilingual.pdf", None, &config)?;
+    println!("{}", result.content);
+    Ok(())
+}
+```
--- a/docs/snippets/rust/ocr/ocr_paddleocr.md
+++ b/docs/snippets/rust/ocr/ocr_paddleocr.md
@@ -0,0 +1,19 @@
+```rust title="Rust"
+use kreuzberg::{extract_file, ExtractionConfig, OcrConfig};
+
+/// Extracts `document.pdf` with the "paddleocr" backend and prints the text.
+#[tokio::main]
+async fn main() -> kreuzberg::Result<()> {
+    let ocr = OcrConfig {
+        backend: "paddleocr".to_string(),
+        language: "en".to_string(),
+        // paddle_ocr_config: Some(serde_json::json!({"model_tier": "server"})), // for max accuracy
+        ..Default::default()
+    };
+    let config = ExtractionConfig { ocr: Some(ocr), ..Default::default() };
+
+    let result = extract_file("document.pdf", None, &config).await?;
+    println!("Extracted text: {}", result.content);
+    Ok(())
+}
+```
--- a/docs/snippets/rust/ocr/pdf_config.md
+++ b/docs/snippets/rust/ocr/pdf_config.md
@@ -0,0 +1,18 @@
+```rust title="Rust"
+use kreuzberg::{ExtractionConfig, PdfConfig};
+
+/// Builds PDF options (image and metadata extraction, candidate passwords) and prints them.
+fn main() {
+    let config = ExtractionConfig {
+        pdf_options: Some(PdfConfig {
+            extract_images: Some(true),
+            extract_metadata: Some(true),
+            passwords: Some(vec!["password1".to_string(), "password2".to_string()]),
+            // `PdfConfig` has fields not set here (e.g. `dpi`), so default the rest.
+            ..Default::default()
+        }),
+        ..Default::default()
+    };
+    println!("{:?}", config.pdf_options);
+}
+```
--- a/docs/snippets/rust/ocr/tesseract_config.md
+++ b/docs/snippets/rust/ocr/tesseract_config.md
@@ -0,0 +1,26 @@
+```rust title="Rust"
+use kreuzberg::{ExtractionConfig, OcrConfig, TesseractConfig};
+
+/// Builds an OCR config with explicit Tesseract settings and prints it.
+fn main() {
+    let tesseract = TesseractConfig {
+        psm: 6,
+        oem: 1,
+        min_confidence: 0.8,
+        // Whitelist: ASCII letters, digits, space, and the punctuation `.,!?`.
+        tessedit_char_whitelist: "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789 .,!?".to_string(),
+        enable_table_detection: true,
+        ..Default::default()
+    };
+
+    let config = ExtractionConfig {
+        ocr: Some(OcrConfig {
+            language: "eng+fra+deu".to_string(),
+            tesseract_config: Some(tesseract),
+            ..Default::default()
+        }),
+        ..Default::default()
+    };
+    println!("{:?}", config.ocr);
+}
+```
--- a/docs/snippets/rust/plugins/clear_plugins.md
+++ b/docs/snippets/rust/plugins/clear_plugins.md
@@ -0,0 +1,15 @@
+```rust title="Rust"
+use kreuzberg::{
+    clear_document_extractors, clear_ocr_backends, clear_post_processors, clear_validators,
+};
+
+/// Calls each of the four `clear_*` functions in turn, then prints a confirmation.
+fn main() {
+    clear_document_extractors();
+    clear_post_processors();
+    clear_ocr_backends();
+    clear_validators();
+
+    println!("All plugins cleared");
+}
+```
--- a/docs/snippets/rust/plugins/embedding_backend.md
+++ b/docs/snippets/rust/plugins/embedding_backend.md
@@ -0,0 +1,52 @@
+```rust title="Rust"
+use kreuzberg::plugins::{EmbeddingBackend, Plugin, register_embedding_backend};
+use kreuzberg::{EmbeddingConfig, EmbeddingModelType, Result, embed_texts};
+use async_trait::async_trait;
+use std::sync::Arc;
+
+/// Length of every vector produced by `MyEmbedder`.
+const EMBEDDING_DIMENSIONS: usize = 768;
+
+// Wrap an already-loaded embedder (e.g. a tuned ONNX session or any host-language
+// embedder) so kreuzberg can call back into it during chunking and standalone
+// embed requests.
+struct MyEmbedder {
+    // Hold whatever model handles the host already owns.
+}
+
+impl Plugin for MyEmbedder {
+    fn name(&self) -> &str { "my-embedder" }
+    fn version(&self) -> String { "1.0.0".to_string() }
+    fn initialize(&self) -> Result<()> { Ok(()) }
+    fn shutdown(&self) -> Result<()> { Ok(()) }
+}
+
+#[async_trait]
+impl EmbeddingBackend for MyEmbedder {
+    // Captured once at registration; used for shape validation on every dispatch.
+    fn dimensions(&self) -> usize { EMBEDDING_DIMENSIONS }
+
+    async fn embed(&self, texts: Vec<String>) -> Result<Vec<Vec<f32>>> {
+        // Delegate to the already-loaded host model. The placeholder vectors are sized
+        // from `dimensions()` so the advertised and actual widths cannot drift apart.
+        let width = self.dimensions();
+        Ok(texts.iter().map(|_| vec![0.0; width]).collect())
+    }
+}
+
+fn main() -> Result<()> {
+    // Register once at startup.
+    register_embedding_backend(Arc::new(MyEmbedder {}))?;
+
+    let config = EmbeddingConfig {
+        model: EmbeddingModelType::Plugin { name: "my-embedder".to_string() },
+        // Optional: bound the wait on a hung backend (default 60s; `None` disables).
+        max_embed_duration_secs: Some(30),
+        ..Default::default()
+    };
+
+    let vectors = embed_texts(&["Hello, world!", "Second text"], &config)?;
+    assert_eq!(vectors.len(), 2);
+    Ok(())
+}
+```
--- a/docs/snippets/rust/plugins/extractor_registration.md
+++ b/docs/snippets/rust/plugins/extractor_registration.md
@@ -0,0 +1,16 @@
+```rust title="Rust"
+use kreuzberg::plugins::registry::get_document_extractor_registry;
+use std::sync::Arc;
+
+/// Registers a `CustomJsonExtractor` with the document extractor registry.
+/// `CustomJsonExtractor` is not defined in this snippet; it must be provided by the caller.
+fn register_custom_extractor() -> kreuzberg::Result<()> {
+    let registry = get_document_extractor_registry();
+    // A poisoned lock means another thread panicked while holding it; say so on panic.
+    registry
+        .write()
+        .expect("document extractor registry lock poisoned")
+        .register(Arc::new(CustomJsonExtractor))?;
+    Ok(())
+}
+```
--- a/docs/snippets/rust/plugins/list_plugins.md
+++ b/docs/snippets/rust/plugins/list_plugins.md
@@ -0,0 +1,23 @@
+```rust title="Rust"
+use kreuzberg::plugins::registry::*;
+
+/// Prints the entries listed by each of the four plugin registries.
+fn main() -> kreuzberg::Result<()> {
+    let registry = get_document_extractor_registry();
+    let extractors = registry.list()?;
+    println!("Registered extractors: {:?}", extractors);
+
+    let registry = get_post_processor_registry();
+    let processors = registry.list()?;
+    println!("Registered processors: {:?}", processors);
+
+    let registry = get_ocr_backend_registry();
+    let backends = registry.list()?;
+    println!("Registered OCR backends: {:?}", backends);
+
+    let registry = get_validator_registry();
+    let validators = registry.list()?;
+    println!("Registered validators: {:?}", validators);
+    Ok(())
+}
+```
--- a/docs/snippets/rust/plugins/min_length_validator.md
+++ b/docs/snippets/rust/plugins/min_length_validator.md
@@ -0,0 +1,43 @@
+```rust title="Rust"
+use kreuzberg::plugins::{Plugin, Validator};
+use kreuzberg::{Result, ExtractionResult, ExtractionConfig, KreuzbergError};
+use async_trait::async_trait;
+
+/// Validator that rejects results whose content is shorter than `min_length` characters.
+struct MinLengthValidator {
+    min_length: usize,
+}
+
+impl Plugin for MinLengthValidator {
+    fn name(&self) -> &str { "min-length-validator" }
+    fn version(&self) -> String { "1.0.0".to_string() }
+    fn initialize(&self) -> Result<()> { Ok(()) }
+    fn shutdown(&self) -> Result<()> { Ok(()) }
+}
+
+#[async_trait]
+impl Validator for MinLengthValidator {
+    /// Returns a validation error when the content has fewer than `min_length` characters.
+    async fn validate(
+        &self,
+        result: &ExtractionResult,
+        _config: &ExtractionConfig,
+    ) -> Result<()> {
+        // Count `char`s, not bytes: `String::len` over-counts non-ASCII text, which would
+        // contradict the "characters" wording of the error message.
+        let length = result.content.chars().count();
+        if length < self.min_length {
+            return Err(KreuzbergError::validation(format!(
+                "Content too short: {} < {} characters",
+                length,
+                self.min_length
+            )));
+        }
+        Ok(())
+    }
+
+    fn priority(&self) -> i32 {
+        100
+    }
+}
+```
--- a/docs/snippets/rust/plugins/pdf_metadata_extractor.md
+++ b/docs/snippets/rust/plugins/pdf_metadata_extractor.md
@@ -0,0 +1,69 @@
+```rust title="Rust"
+use kreuzberg::plugins::{Plugin, PostProcessor, ProcessingStage, register_post_processor};
+use kreuzberg::{Result, ExtractionResult, ExtractionConfig};
+use async_trait::async_trait;
+use std::sync::Arc;
+use std::sync::atomic::{AtomicUsize, Ordering};
+use serde_json::json;
+
+/// Post-processor plugin that adds extra metadata entries to PDF extraction results.
+struct PdfMetadataExtractor {
+    /// Number of PDF results handled so far; atomic so it can be updated
+    /// through the `&self` receiver used by `process`.
+    processed_count: AtomicUsize,
+}
+
+// Plugin identity and lifecycle hooks for `PdfMetadataExtractor`.
+impl Plugin for PdfMetadataExtractor {
+    /// Identifier this plugin reports for itself.
+    fn name(&self) -> &str {
+        "pdf-metadata-extractor"
+    }
+
+    /// Version string this plugin reports.
+    fn version(&self) -> String {
+        String::from("1.0.0")
+    }
+
+    /// Resets the processed-PDF counter to zero.
+    fn initialize(&self) -> Result<()> {
+        self.processed_count.store(0, Ordering::Release);
+        Ok(())
+    }
+
+    /// Performs no teardown; always returns `Ok(())`.
+    fn shutdown(&self) -> Result<()> {
+        Ok(())
+    }
+}
+
+// Post-processing behaviour for `PdfMetadataExtractor`.
+#[async_trait]
+impl PostProcessor for PdfMetadataExtractor {
+    /// Annotates a PDF result with processing metadata.
+    ///
+    /// Non-PDF results are left untouched. For PDFs the counter is bumped and
+    /// four keys are written into `metadata.additional`: `pdf_processed`,
+    /// `pdf_order`, `content_length` and `pdf_processor_version`.
+    async fn process(
+        &self,
+        result: &mut ExtractionResult,
+        _config: &ExtractionConfig,
+    ) -> Result<()> {
+        if result.mime_type == "application/pdf" {
+            // `fetch_add` yields the previous value, so `+ 1` gives the
+            // 1-based position of this document.
+            let sequence_number = self.processed_count.fetch_add(1, Ordering::AcqRel) + 1;
+            // `len()` is the UTF-8 byte length of the content.
+            let content_bytes = result.content.len();
+
+            let extras = &mut result.metadata.additional;
+            extras.insert("pdf_processed".to_string(), json!(true));
+            extras.insert("pdf_order".to_string(), json!(sequence_number));
+            extras.insert("content_length".to_string(), json!(content_bytes));
+            extras.insert("pdf_processor_version".to_string(), json!("1.0.0"));
+        }
+
+        Ok(())
+    }
+
+    /// Declares this processor for the `Early` processing stage.
+    fn processing_stage(&self) -> ProcessingStage {
+        ProcessingStage::Early
+    }
+
+    /// Returns `true` only for results whose MIME type is `application/pdf`.
+    fn should_process(
+        &self,
+        result: &ExtractionResult,
+        _config: &ExtractionConfig,
+    ) -> bool {
+        let is_pdf = result.mime_type == "application/pdf";
+        is_pdf
+    }
+}
+
+/// Registers a `PdfMetadataExtractor` (counter starting at zero) as a post-processor.
+fn main() -> Result<()> {
+    let extractor = PdfMetadataExtractor {
+        processed_count: AtomicUsize::new(0),
+    };
+    register_post_processor(Arc::new(extractor))?;
+    Ok(())
+}
+```
--- a/docs/snippets/rust/plugins/pdf_only_processor.md
+++ b/docs/snippets/rust/plugins/pdf_only_processor.md
@@ -0,0 +1,50 @@
+```rust title="Rust"
+use kreuzberg::plugins::{Plugin, PostProcessor, ProcessingStage, register_post_processor};
+use kreuzberg::{Result, ExtractionResult, ExtractionConfig};
+use async_trait::async_trait;
+use std::sync::Arc;
+use serde_json::json;
+
+/// Stateless post-processor plugin that tags PDF extraction results.
+struct PdfOnlyProcessor;
+
+// Plugin identity and lifecycle hooks for `PdfOnlyProcessor`.
+impl Plugin for PdfOnlyProcessor {
+    /// Identifier this plugin reports for itself.
+    fn name(&self) -> &str {
+        "pdf-only-processor"
+    }
+
+    /// Version string this plugin reports.
+    fn version(&self) -> String {
+        String::from("1.0.0")
+    }
+
+    /// Performs no setup; always returns `Ok(())`.
+    fn initialize(&self) -> Result<()> {
+        Ok(())
+    }
+
+    /// Performs no teardown; always returns `Ok(())`.
+    fn shutdown(&self) -> Result<()> {
+        Ok(())
+    }
+}
+
+// Post-processing behaviour for `PdfOnlyProcessor`.
+#[async_trait]
+impl PostProcessor for PdfOnlyProcessor {
+    /// Sets `pdf_processed = true` in `metadata.additional` for PDF results;
+    /// any other MIME type is left unchanged.
+    async fn process(
+        &self,
+        result: &mut ExtractionResult,
+        _config: &ExtractionConfig,
+    ) -> Result<()> {
+        let is_pdf = result.mime_type == "application/pdf";
+        if is_pdf {
+            result
+                .metadata
+                .additional
+                .insert("pdf_processed".to_string(), json!(true));
+        }
+
+        Ok(())
+    }
+
+    /// Declares this processor for the `Early` processing stage.
+    fn processing_stage(&self) -> ProcessingStage {
+        ProcessingStage::Early
+    }
+
+    /// Returns `true` only for results whose MIME type is `application/pdf`.
+    fn should_process(
+        &self,
+        result: &ExtractionResult,
+        _config: &ExtractionConfig,
+    ) -> bool {
+        let is_pdf = result.mime_type == "application/pdf";
+        is_pdf
+    }
+}
+
+/// Registers a `PdfOnlyProcessor` as a post-processor.
+fn main() -> Result<()> {
+    let processor = Arc::new(PdfOnlyProcessor);
+    register_post_processor(processor)?;
+    Ok(())
+}
+```
--- a/docs/snippets/rust/plugins/plugin_extractor.md
+++ b/docs/snippets/rust/plugins/plugin_extractor.md
@@ -0,0 +1,53 @@
+```rust title="Rust"
+use kreuzberg::plugins::{DocumentExtractor, Plugin};
+use kreuzberg::{Result, ExtractionResult, ExtractionConfig, Metadata};
+use async_trait::async_trait;
+use std::path::Path;
+
+/// Stateless document extractor plugin that turns JSON input into plain text.
+struct CustomJsonExtractor;
+
+// Plugin identity and lifecycle hooks for `CustomJsonExtractor`.
+impl Plugin for CustomJsonExtractor {
+    /// Identifier this plugin reports for itself.
+    fn name(&self) -> &str {
+        "custom-json-extractor"
+    }
+
+    /// Version string this plugin reports.
+    fn version(&self) -> String {
+        String::from("1.0.0")
+    }
+
+    /// Performs no setup; always returns `Ok(())`.
+    fn initialize(&self) -> Result<()> {
+        Ok(())
+    }
+
+    /// Performs no teardown; always returns `Ok(())`.
+    fn shutdown(&self) -> Result<()> {
+        Ok(())
+    }
+}
+
+// Extraction behaviour for `CustomJsonExtractor`.
+#[async_trait]
+impl DocumentExtractor for CustomJsonExtractor {
+    /// Parses `content` as JSON and returns its string values as text.
+    ///
+    /// The result always reports `application/json` as its MIME type,
+    /// regardless of the `_mime_type` argument. JSON parse failures are
+    /// propagated to the caller through `?`.
+    async fn extract_bytes(
+        &self,
+        content: &[u8],
+        _mime_type: &str,
+        _config: &ExtractionConfig,
+    ) -> Result<ExtractionResult> {
+        let parsed = serde_json::from_slice::<serde_json::Value>(content)?;
+
+        let extraction = ExtractionResult {
+            content: extract_text_from_json(&parsed),
+            mime_type: String::from("application/json"),
+            metadata: Metadata::default(),
+            tables: Vec::new(),
+            detected_languages: None,
+            chunks: None,
+            images: None,
+        };
+        Ok(extraction)
+    }
+
+    /// MIME types this extractor declares support for.
+    fn supported_mime_types(&self) -> &[&str] {
+        const MIME_TYPES: &[&str] = &["application/json", "text/json"];
+        MIME_TYPES
+    }
+
+    /// Priority value reported for this extractor.
+    ///
+    /// NOTE(review): how priorities are ordered is defined by the
+    /// `DocumentExtractor` trait and is not visible in this snippet.
+    fn priority(&self) -> i32 {
+        50
+    }
+}
+
+/// Collects every string found in `value` into one newline-terminated text.
+///
+/// Arrays and objects are walked recursively in iteration order; each string
+/// value is emitted followed by `\n`. All other JSON values (numbers,
+/// booleans, null) contribute nothing.
+fn extract_text_from_json(value: &serde_json::Value) -> String {
+    // Depth-first walk that appends into a single shared buffer.
+    fn append_strings(node: &serde_json::Value, out: &mut String) {
+        match node {
+            serde_json::Value::String(text) => {
+                out.push_str(text);
+                out.push('\n');
+            }
+            serde_json::Value::Array(items) => {
+                for item in items {
+                    append_strings(item, out);
+                }
+            }
+            serde_json::Value::Object(fields) => {
+                for child in fields.values() {
+                    append_strings(child, out);
+                }
+            }
+            _ => {}
+        }
+    }
+
+    let mut collected = String::new();
+    append_strings(value, &mut collected);
+    collected
+}
+```
--- a/docs/snippets/rust/plugins/plugin_logging.md
+++ b/docs/snippets/rust/plugins/plugin_logging.md
@@ -0,0 +1,35 @@
+```rust title="Rust"
+use log::{info, warn, error};
+
+// Lifecycle hooks for `MyPlugin` that log at `info` level.
+// NOTE(review): `name` (called below) is not defined in this impl; presumably
+// it is omitted from the snippet for brevity.
+impl Plugin for MyPlugin {
+    /// Logs that the plugin is initializing, then returns `Ok(())`.
+    fn initialize(&self) -> Result<()> {
+        info!("Initializing plugin: {}", self.name());
+        Ok(())
+    }
+
+    /// Logs that the plugin is shutting down, then returns `Ok(())`.
+    fn shutdown(&self) -> Result<()> {
+        info!("Shutting down plugin: {}", self.name());
+        Ok(())
+    }
+}
+
+// Extraction entry point for `MyPlugin`, instrumented with log output.
+#[async_trait]
+impl DocumentExtractor for MyPlugin {
+    /// Logs the incoming MIME type and byte count, builds a default
+    /// `ExtractionResult`, and warns when its content is empty.
+    async fn extract_bytes(
+        &self,
+        content: &[u8],
+        mime_type: &str,
+        _config: &ExtractionConfig,
+    ) -> Result<ExtractionResult> {
+        info!("Extracting {} ({} bytes)", mime_type, content.len());
+
+        // Placeholder: this snippet performs no real extraction.
+        let extraction = ExtractionResult::default();
+
+        if extraction.content.is_empty() {
+            warn!("Extraction resulted in empty content");
+        }
+
+        Ok(extraction)
+    }
+}
+```
--- a/docs/snippets/rust/plugins/plugin_testing.md
+++ b/docs/snippets/rust/plugins/plugin_testing.md
@@ -0,0 +1,22 @@
+```rust title="Rust"
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Feeds a one-field JSON document through `CustomJsonExtractor` and checks
+    /// both the extracted text and the reported MIME type.
+    #[tokio::test]
+    async fn test_custom_extractor() {
+        let config = ExtractionConfig::default();
+        let payload = br#"{"message": "Hello, world!"}"#;
+        let extractor = CustomJsonExtractor;
+
+        let extraction = extractor
+            .extract_bytes(payload, "application/json", &config)
+            .await
+            .expect("Extraction failed");
+
+        assert_eq!(extraction.mime_type, "application/json");
+        assert!(extraction.content.contains("Hello, world!"));
+    }
+}
+```
--- a/Show More
+++ b/Show More