Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/docs/snippets/rust/config/advanced_config.md
+++ b/docs/snippets/rust/config/advanced_config.md
@@ -0,0 +1,63 @@
+```rust title="Rust"
+use kreuzberg::{extract_file, ExtractionConfig, OcrConfig, ChunkingConfig, LanguageDetectionConfig, TokenReductionConfig, PostProcessorConfig, EmbeddingConfig, EmbeddingModelType};
+use kreuzberg::keywords::{KeywordConfig, KeywordAlgorithm};
+
+#[tokio::main]
+async fn main() -> kreuzberg::Result<()> {
+    let config = ExtractionConfig {
+        use_cache: true,
+        enable_quality_processing: true,
+        ocr: Some(OcrConfig {
+            backend: "tesseract".to_string(),
+            language: "eng".to_string(),
+            tesseract_config: None,
+            output_format: None,
+            paddle_ocr_config: None,
+            element_config: None,
+        }),
+        chunking: Some(ChunkingConfig {
+            max_characters: 1000,
+            overlap: 200,
+            embedding: Some(EmbeddingConfig {
+                model: EmbeddingModelType::Preset { name: "balanced".to_string() },
+                batch_size: 32,
+                normalize: true,
+                show_download_progress: false,
+                ..Default::default() // remaining fields (e.g. cache_dir, acceleration) keep their defaults
+            }),
+            ..Default::default()
+        }),
+        language_detection: Some(LanguageDetectionConfig {
+            enabled: true,
+            min_confidence: 0.8,
+            detect_multiple: false,
+        }),
+        keywords: Some(KeywordConfig {
+            algorithm: KeywordAlgorithm::Yake,
+            max_keywords: 10,
+            min_score: 0.1,
+            ngram_range: (1, 3),
+            language: Some("en".to_string()),
+            ..Default::default()
+        }),
+        token_reduction: Some(TokenReductionConfig {
+            mode: "moderate".to_string(),
+            preserve_important_words: true,
+        }),
+        postprocessor: Some(PostProcessorConfig {
+            enabled: true,
+            enabled_processors: None,
+            disabled_processors: None,
+        }),
+        ..Default::default()
+    };
+
+    let result = extract_file("document.pdf", None::<&str>, &config).await?;
+    println!("Content: {}", result.content);
+    if let Some(langs) = &result.detected_languages {
+        println!("Languages: {:?}", langs);
+    }
+    println!("Chunks: {}", result.chunks.as_ref().map(|c| c.len()).unwrap_or(0));
+    Ok(())
+}
+```
--- a/docs/snippets/rust/config/chunking_config.md
+++ b/docs/snippets/rust/config/chunking_config.md
@@ -0,0 +1,79 @@
+```rust title="Rust"
+use kreuzberg::{extract_file, ExtractionConfig, ChunkingConfig};
+
+#[tokio::main]
+async fn main() -> kreuzberg::Result<()> {
+    let config = ExtractionConfig {
+        chunking: Some(ChunkingConfig {
+            max_characters: 1000,
+            overlap: 200,
+            ..Default::default()
+        }),
+        ..Default::default()
+    };
+
+    let result = extract_file("document.pdf", None::<&str>, &config).await?;
+    println!("Chunks: {}", result.chunks.as_ref().map_or(0, Vec::len)); // `chunks` is an Option
+    for chunk in result.chunks.iter().flatten() {
+        println!("Length: {}", chunk.content.len());
+    }
+    Ok(())
+}
+```
+
+```rust title="Rust - Markdown with Heading Context"
+use kreuzberg::{extract_file, ExtractionConfig, ChunkingConfig, ChunkerType, ChunkSizing};
+
+#[tokio::main]
+async fn main() -> kreuzberg::Result<()> {
+    let config = ExtractionConfig {
+        chunking: Some(ChunkingConfig {
+            max_characters: 500,
+            overlap: 50,
+            chunker_type: ChunkerType::Markdown,
+            sizing: ChunkSizing::Tokenizer {
+                model: "Xenova/gpt-4o".into(),
+                cache_dir: None,
+            },
+            ..Default::default()
+        }),
+        ..Default::default()
+    };
+
+    let result = extract_file("document.md", None::<&str>, &config).await?;
+    for chunk in result.chunks.iter().flatten() { // `chunks` is an Option; flatten to visit each chunk
+        if let Some(heading_context) = &chunk.metadata.heading_context {
+            for heading in &heading_context.headings {
+                println!("Heading L{}: {}", heading.level, heading.text);
+            }
+        }
+        // Preview by characters: slicing by byte index panics inside a multi-byte character.
+        println!("Content: {}...", chunk.content.chars().take(100).collect::<String>());
+    }
+    Ok(())
+}
+```
+
+```rust title="Rust - Prepend Heading Context"
+use kreuzberg::{extract_file, ExtractionConfig, ChunkingConfig, ChunkerType};
+
+#[tokio::main]
+async fn main() -> kreuzberg::Result<()> {
+    let config = ExtractionConfig {
+        chunking: Some(ChunkingConfig {
+            max_characters: 500,
+            overlap: 50,
+            chunker_type: ChunkerType::Markdown,
+            prepend_heading_context: true,
+            ..Default::default()
+        }),
+        ..Default::default()
+    };
+
+    let result = extract_file("document.md", None::<&str>, &config).await?;
+    for chunk in result.chunks.iter().flatten() {
+        // Each chunk's content is prefixed with its heading breadcrumb (preview is char-based, not byte-based)
+        println!("Content: {}...", chunk.content.chars().take(100).collect::<String>());
+    }
+    Ok(())
+}
+```
--- a/docs/snippets/rust/config/config_basic.md
+++ b/docs/snippets/rust/config/config_basic.md
@@ -0,0 +1,16 @@
+```rust title="Rust"
+use kreuzberg::{extract_file, ExtractionConfig};
+
+#[tokio::main]
+async fn main() -> kreuzberg::Result<()> {
+    let config = ExtractionConfig {
+        use_cache: true,
+        enable_quality_processing: true,
+        ..Default::default()
+    };
+
+    let result = extract_file("document.pdf", None, &config).await?;
+    println!("{}", result.content);
+    Ok(())
+}
+```
--- a/docs/snippets/rust/config/config_discover.md
+++ b/docs/snippets/rust/config/config_discover.md
@@ -0,0 +1,11 @@
+```rust title="Rust"
+use kreuzberg::{extract_file, ExtractionConfig};
+
+#[tokio::main]
+async fn main() -> kreuzberg::Result<()> {
+    let config = ExtractionConfig::discover()?.unwrap_or_default();
+    let result = extract_file("document.pdf", None, &config).await?;
+    println!("{}", result.content);
+    Ok(())
+}
+```
--- a/docs/snippets/rust/config/config_ocr.md
+++ b/docs/snippets/rust/config/config_ocr.md
@@ -0,0 +1,20 @@
+```rust title="Rust"
+use kreuzberg::{extract_file_sync, ExtractionConfig, OcrConfig};
+
+fn main() -> kreuzberg::Result<()> {
+    let config = ExtractionConfig {
+        ocr: Some(OcrConfig {
+            backend: "tesseract".to_string(),
+            language: "eng".to_string(),
+            tesseract_config: None,
+            ..Default::default()
+        }),
+        ..Default::default()
+    };
+
+    let result = extract_file_sync("scanned.pdf", None, &config)?;
+    println!("Content length: {}", result.content.len());
+    println!("Tables detected: {}", result.tables.len());
+    Ok(())
+}
+```
--- a/docs/snippets/rust/config/config_programmatic.md
+++ b/docs/snippets/rust/config/config_programmatic.md
@@ -0,0 +1,29 @@
+```rust title="Rust"
+use kreuzberg::{extract_file_sync, ChunkingConfig, ExtractionConfig, OcrConfig, TesseractConfig};
+
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+    let config = ExtractionConfig {
+        use_cache: true,
+        ocr: Some(OcrConfig {
+            backend: "tesseract".to_string(),
+            language: "eng+deu".to_string(),
+            tesseract_config: Some(TesseractConfig {
+                psm: 6,
+                ..Default::default()
+            }),
+            ..Default::default()
+        }),
+        chunking: Some(ChunkingConfig {
+            max_characters: 1000,
+            overlap: 200,
+            ..Default::default()
+        }),
+        enable_quality_processing: true,
+        ..Default::default()
+    };
+
+    let result = extract_file_sync("document.pdf", None, &config)?;
+    println!("Content length: {}", result.content.len());
+    Ok(())
+}
+```
--- a/docs/snippets/rust/config/document_structure_config.md
+++ b/docs/snippets/rust/config/document_structure_config.md
@@ -0,0 +1,17 @@
+```rust title="Document Structure Config (Rust)"
+use kreuzberg::{extract_file_sync, ExtractionConfig};
+
+let config = ExtractionConfig {
+    include_document_structure: true,
+    ..Default::default()
+};
+
+let result = extract_file_sync("document.pdf", None, &config)?;
+
+if let Some(document) = &result.document {
+    for node in &document.nodes {
+        let preview: String = node.content.text().unwrap_or("").chars().take(80).collect(); // chars, not bytes
+        println!("[{}] {}", node.content.node_type_str(), preview);
+    }
+}
+```
--- a/docs/snippets/rust/config/element_based_output.md
+++ b/docs/snippets/rust/config/element_based_output.md
@@ -0,0 +1,48 @@
+```rust title="Element-Based Output (Rust)"
+use kreuzberg::{extract_file_sync, ExtractionConfig};
+use kreuzberg::types::OutputFormat as ResultFormat;
+
+fn main() -> kreuzberg::Result<()> {
+    // Configure element-based output (result_format controls Unified vs ElementBased)
+    let config = ExtractionConfig {
+        result_format: ResultFormat::ElementBased,
+        ..Default::default()
+    };
+
+    // Extract document
+    let result = extract_file_sync("document.pdf", None, &config)?;
+
+    // Access elements
+    if let Some(elements) = result.elements {
+        for element in &elements {
+            println!("Type: {:?}", element.element_type);
+            println!("Text: {}", element.text.chars().take(100).collect::<String>()); // char-safe preview
+
+            if let Some(page) = element.metadata.page_number {
+                println!("Page: {}", page);
+            }
+
+            if let Some(coords) = &element.metadata.coordinates {
+                println!("Coords: ({}, {}) - ({}, {})",
+                    coords.x0, coords.y0, coords.x1, coords.y1);
+            }
+
+            println!("---");
+        }
+
+        // Filter by element type
+        let titles: Vec<_> = elements.iter()
+            .filter(|e| matches!(e.element_type, kreuzberg::types::ElementType::Title))
+            .collect();
+
+        for title in titles {
+            let level = title.metadata.additional.get("level")
+                .map(|v| v.as_ref())
+                .unwrap_or("unknown");
+            println!("[{}] {}", level, title.text);
+        }
+    }
+
+    Ok(())
+}
+```
--- a/docs/snippets/rust/config/embedding_config.md
+++ b/docs/snippets/rust/config/embedding_config.md
@@ -0,0 +1,28 @@
+```rust title="Rust"
+use kreuzberg::{extract_file, ExtractionConfig, ChunkingConfig, EmbeddingConfig, EmbeddingModelType};
+
+#[tokio::main]
+async fn main() -> kreuzberg::Result<()> {
+    let config = ExtractionConfig {
+        chunking: Some(ChunkingConfig {
+            max_characters: 1000,
+            overlap: 200,
+            embedding: Some(EmbeddingConfig {
+                model: EmbeddingModelType::Preset {
+                    name: "balanced".to_string(),
+                },
+                batch_size: 16,
+                normalize: true,
+                show_download_progress: true,
+                ..Default::default() // remaining fields (e.g. cache_dir, acceleration) keep their defaults
+            }),
+            ..Default::default()
+        }),
+        ..Default::default()
+    };
+
+    let result = extract_file("document.pdf", None::<&str>, &config).await?;
+    println!("Chunks with embeddings: {}", result.chunks.as_ref().map(|c| c.len()).unwrap_or(0));
+    Ok(())
+}
+```
--- a/docs/snippets/rust/config/embedding_config.rs
+++ b/docs/snippets/rust/config/embedding_config.rs
@@ -0,0 +1,87 @@
+// Example 1: Preset model (recommended)
+// Fast, balanced, or quality preset configurations optimized for common use cases.
+let embedding_config = EmbeddingConfig {
+    model: EmbeddingModelType::Preset {
+        name: "balanced".to_string(),
+    },
+    batch_size: 32,
+    normalize: true,
+    show_download_progress: true,
+    // `PathBuf` does not expand `~`; resolve the home directory explicitly (falls back to None if HOME is unset).
+    cache_dir: std::env::var_os("HOME")
+        .map(|home| std::path::PathBuf::from(home).join(".cache/kreuzberg/embeddings")),
+    acceleration: None,
+};
+
+// Available presets:
+// - "fast" (384 dims): Quick prototyping, development, resource-constrained
+// - "balanced" (768 dims): Production, general-purpose RAG, English documents
+// - "quality" (1024 dims): Complex documents, maximum accuracy
+// - "multilingual" (768 dims): International documents, 100+ languages
+
+// Example 2: Custom ONNX model (requires embeddings feature)
+// Direct access to specific ONNX embedding models from HuggingFace with custom dimensions.
+let embedding_config = EmbeddingConfig {
+    model: EmbeddingModelType::Custom {
+        model_id: "BAAI/bge-small-en-v1.5".to_string(),
+        dimensions: 384,
+    },
+    batch_size: 32,
+    normalize: true,
+    show_download_progress: true,
+    cache_dir: None,  // Uses default: .kreuzberg/embeddings/
+    acceleration: None,
+};
+
+// Popular ONNX-compatible models:
+// - "BAAI/bge-small-en-v1.5" (384 dims): Fast, efficient
+// - "BAAI/bge-base-en-v1.5" (768 dims): Balanced quality/speed
+// - "BAAI/bge-large-en-v1.5" (1024 dims): High quality, slower
+// - "sentence-transformers/paraphrase-multilingual-mpnet-base-v2" (768 dims): Multilingual support
+
+// Example 3: Alternative Custom ONNX Model
+// For advanced users wanting different ONNX embedding models.
+let embedding_config = EmbeddingConfig {
+    model: EmbeddingModelType::Custom {
+        model_id: "sentence-transformers/all-mpnet-base-v2".to_string(),
+        dimensions: 768,
+    },
+    batch_size: 16,  // Larger model requires smaller batch size
+    normalize: true,
+    show_download_progress: true,
+    cache_dir: Some(std::path::PathBuf::from("/var/cache/embeddings")),
+    acceleration: None,
+};
+
+// Integration with ChunkingConfig: add embeddings to your chunking configuration.
+use kreuzberg::{ChunkingConfig, EmbeddingConfig, EmbeddingModelType, ExtractionConfig};
+
+let chunking_with_embeddings = ChunkingConfig {
+    max_characters: 1024,
+    overlap: 100,
+    preset: Some("balanced".to_string()),
+    embedding: Some(EmbeddingConfig::default()),  // Uses balanced preset
+    ..Default::default()  // remaining chunking fields keep their defaults
+};
+
+let extraction_config = ExtractionConfig {
+    chunking: Some(chunking_with_embeddings),
+    ..Default::default()
+};
+
+// Key parameter explanations:
+//
+// batch_size: Number of texts to embed at once (32-128 typical)
+//   - Larger batches are faster but use more memory
+//   - Smaller batches for resource-constrained environments
+//
+// normalize: Whether to normalize vectors (L2 norm)
+//   - true (recommended): Enables cosine similarity in vector DBs
+//   - false: Raw embedding values
+//
+// cache_dir: Where to store downloaded models
+//   - None: Uses .kreuzberg/embeddings/ in current directory
+//   - Some(path): Custom directory for model storage
+//
+// show_download_progress: Display download progress bar
+//   - Useful for monitoring large model downloads
--- a/docs/snippets/rust/config/hierarchy_config.rs
+++ b/docs/snippets/rust/config/hierarchy_config.rs
@@ -0,0 +1,90 @@
+use kreuzberg::{ExtractionConfig, HierarchyConfig, PdfConfig};
+
+// Example 1: Basic hierarchy extraction
+// Enabled with default k_clusters=6 for standard H1-H6 heading hierarchy.
+// Extract bounding box information for spatial layout awareness.
+let hierarchy_config_basic = HierarchyConfig {
+    enabled: true,
+    k_clusters: 6,  // Default: creates 6 font size clusters (H1-H6 structure)
+    include_bbox: true,  // Include bounding box coordinates
+    ocr_coverage_threshold: None,  // No OCR coverage threshold
+};
+
+let pdf_config_basic = PdfConfig {
+    hierarchy: Some(hierarchy_config_basic),
+    ..Default::default()
+};
+
+let extraction_config_basic = ExtractionConfig {
+    pdf_options: Some(pdf_config_basic),
+    ..Default::default()
+};
+
+// Use with extract_file_sync or extract_bytes_sync
+// let result = extract_file_sync("document.pdf", None, &extraction_config_basic)?;
+
+
+// Example 2: Custom k_clusters for minimal structure
+// Use 3 clusters for simpler hierarchy with minimal structure.
+// Useful when you only need major section divisions (Main, Subsection, Detail).
+let hierarchy_config_minimal = HierarchyConfig {
+    enabled: true,
+    k_clusters: 3,  // Minimal clustering: just 3 levels
+    include_bbox: true,
+    ocr_coverage_threshold: None,
+};
+
+let pdf_config_minimal = PdfConfig {
+    hierarchy: Some(hierarchy_config_minimal),
+    ..Default::default()
+};
+
+let extraction_config_minimal = ExtractionConfig {
+    pdf_options: Some(pdf_config_minimal),
+    ..Default::default()
+};
+
+
+// Example 3: With OCR coverage threshold
+// Trigger OCR if less than 50% of text has font data.
+// Useful for documents with mixed digital and scanned content.
+let hierarchy_config_ocr = HierarchyConfig {
+    enabled: true,
+    k_clusters: 6,
+    include_bbox: true,
+    ocr_coverage_threshold: Some(0.5),  // Trigger OCR if text coverage < 50%
+};
+
+let pdf_config_ocr = PdfConfig {
+    hierarchy: Some(hierarchy_config_ocr),
+    ..Default::default()
+};
+
+let extraction_config_ocr = ExtractionConfig {
+    pdf_options: Some(pdf_config_ocr),
+    ..Default::default()
+};
+
+
+// Field descriptions:
+//
+// enabled: bool (default: true)
+//   - Enable or disable hierarchy extraction
+//   - When false, hierarchy structure is not analyzed
+//
+// k_clusters: usize (default: 6, valid: 1-7)
+//   - Number of font size clusters for hierarchy levels
+//   - 6 provides H1-H6 heading levels with body text
+//   - Higher values create more fine-grained hierarchy
+//   - Lower values create simpler structure
+//
+// include_bbox: bool (default: true)
+//   - Include bounding box coordinates in hierarchy blocks
+//   - Required for spatial layout awareness and document structure
+//   - Set to false only if space optimization is critical
+//
+// ocr_coverage_threshold: Option<f32> (default: None)
+//   - Range: 0.0 to 1.0
+//   - Triggers OCR when text block coverage falls below this fraction
+//   - Example: Some(0.5) means "run OCR if less than 50% of page has text data"
+//   - None means no OCR coverage-based triggering
--- a/docs/snippets/rust/config/html_output.md
+++ b/docs/snippets/rust/config/html_output.md
@@ -0,0 +1,14 @@
+```rust title="Rust"
+use kreuzberg::{extract_file_sync, ExtractionConfig, HtmlOutputConfig, HtmlTheme, OutputFormat};
+
+let config = ExtractionConfig {
+    output_format: OutputFormat::Html,
+    html_output: Some(HtmlOutputConfig {
+        theme: HtmlTheme::GitHub,
+        ..Default::default()
+    }),
+    ..Default::default()
+};
+let result = extract_file_sync("document.pdf", None, &config).unwrap();
+println!("{}", result.content); // HTML with kb-* classes
+```
--- a/docs/snippets/rust/config/keyword_config.rs
+++ b/docs/snippets/rust/config/keyword_config.rs
@@ -0,0 +1,72 @@
+use kreuzberg::{extract_file, ExtractionConfig};
+use kreuzberg::keywords::{KeywordConfig, KeywordAlgorithm, YakeParams, RakeParams};
+
+// Example 1: Basic YAKE configuration
+// Uses YAKE algorithm with default parameters and English stopword filtering
+pub async fn basic_yake() -> kreuzberg::Result<()> {
+    let config = ExtractionConfig {
+        keywords: Some(KeywordConfig {
+            algorithm: KeywordAlgorithm::Yake,
+            max_keywords: 10,
+            min_score: 0.0,
+            ngram_range: (1, 3),
+            language: Some("en".to_string()),
+            yake_params: None,
+            rake_params: None,
+        }),
+        ..Default::default()
+    };
+
+    let result = extract_file("document.pdf", None::<&str>, &config).await?;
+    println!("Keywords: {:?}", result.keywords);
+    Ok(())
+}
+
+// Example 2: Advanced YAKE with custom parameters
+// Fine-tunes YAKE with custom window size for co-occurrence analysis
+#[cfg(feature = "keywords-yake")]
+pub async fn advanced_yake() -> kreuzberg::Result<()> {
+    let config = ExtractionConfig {
+        keywords: Some(KeywordConfig {
+            algorithm: KeywordAlgorithm::Yake,
+            max_keywords: 15,
+            min_score: 0.1,
+            ngram_range: (1, 2),
+            language: Some("en".to_string()),
+            yake_params: Some(YakeParams {
+                window_size: 1,
+            }),
+            rake_params: None,
+        }),
+        ..Default::default()
+    };
+
+    let result = extract_file("document.pdf", None::<&str>, &config).await?;
+    println!("Keywords: {:?}", result.keywords);
+    Ok(())
+}
+
+// Example 3: RAKE configuration
+// Uses RAKE algorithm for rapid keyword extraction with phrase constraints
+#[cfg(feature = "keywords-rake")]
+pub async fn rake_config() -> kreuzberg::Result<()> {
+    let config = ExtractionConfig {
+        keywords: Some(KeywordConfig {
+            algorithm: KeywordAlgorithm::Rake,
+            max_keywords: 10,
+            min_score: 5.0,
+            ngram_range: (1, 3),
+            language: Some("en".to_string()),
+            yake_params: None,
+            rake_params: Some(RakeParams {
+                min_word_length: 1,
+                max_words_per_phrase: 3,
+            }),
+        }),
+        ..Default::default()
+    };
+
+    let result = extract_file("document.pdf", None::<&str>, &config).await?;
+    println!("Keywords: {:?}", result.keywords);
+    Ok(())
+}
--- a/docs/snippets/rust/config/keyword_extraction_config.md
+++ b/docs/snippets/rust/config/keyword_extraction_config.md
@@ -0,0 +1,23 @@
+```rust title="Rust"
+use kreuzberg::{extract_file, ExtractionConfig};
+use kreuzberg::keywords::{KeywordConfig, KeywordAlgorithm};
+
+#[tokio::main]
+async fn main() -> kreuzberg::Result<()> {
+    let config = ExtractionConfig {
+        keywords: Some(KeywordConfig {
+            algorithm: KeywordAlgorithm::Yake,
+            max_keywords: 10,
+            min_score: 0.1,
+            ngram_range: (1, 3),
+            language: Some("en".to_string()),
+            ..Default::default()
+        }),
+        ..Default::default()
+    };
+
+    let result = extract_file("document.pdf", None::<&str>, &config).await?;
+    println!("Keywords: {:?}", result.keywords);
+    Ok(())
+}
+```
--- a/docs/snippets/rust/config/language_detection_config.md
+++ b/docs/snippets/rust/config/language_detection_config.md
@@ -0,0 +1,20 @@
+```rust title="Rust"
+use kreuzberg::{extract_file, ExtractionConfig, LanguageDetectionConfig};
+
+#[tokio::main]
+async fn main() -> kreuzberg::Result<()> {
+    let config = ExtractionConfig {
+        language_detection: Some(LanguageDetectionConfig {
+            enabled: true,
+            min_confidence: 0.8,
+            detect_multiple: true,
+        }),
+        ..Default::default()
+    };
+
+    let result = extract_file("document.pdf", None::<&str>, &config).await?;
+    let langs = result.detected_languages.as_deref().unwrap_or_default(); // Option -> empty slice when absent
+    println!("Detected languages: {:?}", langs);
+    Ok(())
+}
+```
--- a/docs/snippets/rust/config/ocr_dpi_config.md
+++ b/docs/snippets/rust/config/ocr_dpi_config.md
@@ -0,0 +1,21 @@
+```rust title="Rust"
+use kreuzberg::{extract_file_sync, ExtractionConfig, ImageExtractionConfig};
+
+fn main() -> kreuzberg::Result<()> {
+    let config = ExtractionConfig {
+        images: Some(ImageExtractionConfig {
+            extract_images: true,
+            target_dpi: 300,
+            max_image_dimension: 4096,
+            auto_adjust_dpi: true,
+            min_dpi: 150,
+            max_dpi: 600,
+        }),
+        ..Default::default()
+    };
+
+    let result = extract_file_sync("document.pdf", None::<&str>, &config)?;
+    println!("Extracted images: {}", result.images.len());
+    Ok(())
+}
+```
--- a/docs/snippets/rust/config/pdf_config.md
+++ b/docs/snippets/rust/config/pdf_config.md
@@ -0,0 +1,20 @@
+```rust title="Rust"
+use kreuzberg::{extract_file_sync, ExtractionConfig, PdfConfig, HierarchyConfig};
+
+fn main() -> kreuzberg::Result<()> {
+    let config = ExtractionConfig {
+        pdf_options: Some(PdfConfig {
+            extract_images: true,
+            passwords: Some(vec!["password123".to_string()]),
+            extract_metadata: true,
+            hierarchy: Some(HierarchyConfig::default()),
+        }),
+        ..Default::default()
+    };
+
+    let result = extract_file_sync("encrypted.pdf", None, &config)?;
+    println!("Title: {:?}", result.metadata.title);
+    println!("Authors: {:?}", result.metadata.authors);
+    Ok(())
+}
+```
--- a/docs/snippets/rust/config/pdf_hierarchy_config.md
+++ b/docs/snippets/rust/config/pdf_hierarchy_config.md
@@ -0,0 +1,23 @@
+```rust title="Rust"
+use kreuzberg::{extract_file_sync, ExtractionConfig, PdfConfig, HierarchyConfig};
+
+fn main() -> kreuzberg::Result<()> {
+    let config = ExtractionConfig {
+        pdf_options: Some(PdfConfig {
+            // Heading levels are derived from font-size clusters.
+            hierarchy: Some(HierarchyConfig {
+                enabled: true,
+                k_clusters: 6,
+                include_bbox: true,
+                ocr_coverage_threshold: Some(0.8),
+            }),
+            ..Default::default()
+        }),
+        ..Default::default()
+    };
+
+    let result = extract_file_sync("document.pdf", None::<&str>, &config)?;
+    println!("Hierarchy levels: {}", result.hierarchy.len());
+    Ok(())
+}
+```
--- a/docs/snippets/rust/config/postprocessor_config.md
+++ b/docs/snippets/rust/config/postprocessor_config.md
@@ -0,0 +1,22 @@
+```rust title="Rust"
+use kreuzberg::{extract_file, ExtractionConfig, PostProcessorConfig};
+
+#[tokio::main]
+async fn main() -> kreuzberg::Result<()> {
+    let config = ExtractionConfig {
+        postprocessor: Some(PostProcessorConfig {
+            enabled: true,
+            enabled_processors: Some(vec![
+                "whitespace_normalizer".to_string(),
+                "unicode_normalizer".to_string(),
+            ]),
+            disabled_processors: None,
+        }),
+        ..Default::default()
+    };
+
+    let result = extract_file("document.pdf", None::<&str>, &config).await?;
+    println!("Processed content: {}", result.content);
+    Ok(())
+}
+```
--- a/docs/snippets/rust/config/quality_processing_config.md
+++ b/docs/snippets/rust/config/quality_processing_config.md
@@ -0,0 +1,16 @@
+```rust title="Rust"
+use kreuzberg::{extract_file_sync, ExtractionConfig};
+
+fn main() -> kreuzberg::Result<()> {
+    let config = ExtractionConfig {
+        enable_quality_processing: true,
+        use_cache: true,
+        ..Default::default()
+    };
+
+    let result = extract_file_sync("document.pdf", None::<&str>, &config)?;
+    println!("Quality score: {}", result.quality_score);
+    println!("Processing time: {:?}", result.processing_time);
+    Ok(())
+}
+```
--- a/docs/snippets/rust/config/tesseract_config.md
+++ b/docs/snippets/rust/config/tesseract_config.md
@@ -0,0 +1,24 @@
+```rust title="Rust"
+use kreuzberg::{extract_file_sync, ExtractionConfig, OcrConfig};
+use kreuzberg::types::TesseractConfig;
+
+fn main() -> kreuzberg::Result<()> {
+    let config = ExtractionConfig {
+        ocr: Some(OcrConfig {
+            backend: "tesseract".to_string(),
+            language: "eng+deu".to_string(),
+            tesseract_config: Some(TesseractConfig {
+                psm: 6,
+                oem: 3,
+                ..Default::default()
+            }),
+            ..Default::default() // OcrConfig has further fields (output_format, paddle_ocr_config, ...)
+        }),
+        ..Default::default()
+    };
+
+    let result = extract_file_sync("scanned.pdf", None::<&str>, &config)?;
+    println!("OCR text: {}", result.content);
+    Ok(())
+}
+```
--- a/docs/snippets/rust/config/token_reduction_config.md
+++ b/docs/snippets/rust/config/token_reduction_config.md
@@ -0,0 +1,19 @@
+```rust title="Rust"
+use kreuzberg::{extract_file, ExtractionConfig, TokenReductionConfig};
+
+#[tokio::main]
+async fn main() -> kreuzberg::Result<()> {
+    let config = ExtractionConfig {
+        token_reduction: Some(TokenReductionConfig {
+            mode: "moderate".to_string(),
+            preserve_important_words: true,
+        }),
+        ..Default::default()
+    };
+
+    let result = extract_file("document.pdf", None::<&str>, &config).await?;
+    println!("Original tokens: {}", result.token_count);
+    println!("Reduced content: {}", result.content);
+    Ok(())
+}
+```