Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/docs/snippets/rust/metadata/image_extraction.md
+++ b/docs/snippets/rust/metadata/image_extraction.md
@@ -0,0 +1,18 @@
+```rust title="Rust"
+use kreuzberg::{ExtractionConfig, ImageExtractionConfig};
+
+fn main() {
+    // Image-extraction options. The fields set here are wrapped in `Some`;
+    // anything not listed falls back to `Default::default()`.
+    let image_settings = ImageExtractionConfig {
+        extract_images: Some(true),
+        target_dpi: Some(200),
+        max_image_dimension: Some(2048),
+        inject_placeholders: Some(true), // set to false to extract images without markdown references
+        auto_adjust_dpi: Some(true),
+        ..Default::default()
+    };
+
+    // Attach the image options to an otherwise default extraction config.
+    let config = ExtractionConfig {
+        images: Some(image_settings),
+        ..Default::default()
+    };
+
+    println!("{:?}", config.images);
+}
+```
--- a/docs/snippets/rust/metadata/image_preprocessing.md
+++ b/docs/snippets/rust/metadata/image_preprocessing.md
@@ -0,0 +1,25 @@
+```rust title="Rust"
+use kreuzberg::{ExtractionConfig, ImagePreprocessingConfig, OcrConfig, TesseractConfig};
+
+fn main() {
+    // Preprocessing options, nested under the Tesseract settings below.
+    let preprocessing = ImagePreprocessingConfig {
+        target_dpi: 300,
+        denoise: true,
+        deskew: true,
+        contrast_enhance: true,
+        binarization_method: String::from("otsu"),
+        ..Default::default()
+    };
+
+    // Each layer only sets the one field it needs and defaults the rest.
+    let tesseract = TesseractConfig {
+        preprocessing: Some(preprocessing),
+        ..Default::default()
+    };
+    let ocr = OcrConfig {
+        tesseract_config: Some(tesseract),
+        ..Default::default()
+    };
+    let config = ExtractionConfig {
+        ocr: Some(ocr),
+        ..Default::default()
+    };
+
+    println!("{:?}", config.ocr);
+}
+```
--- a/docs/snippets/rust/metadata/language_detection.md
+++ b/docs/snippets/rust/metadata/language_detection.md
@@ -0,0 +1,15 @@
+```rust title="Rust"
+use kreuzberg::{ExtractionConfig, LanguageDetectionConfig};
+
+fn main() {
+    // All three detection fields are given explicitly, so this literal needs
+    // no `..Default::default()` tail.
+    let detection = LanguageDetectionConfig {
+        enabled: true,
+        min_confidence: 0.9,
+        detect_multiple: true,
+    };
+
+    let config = ExtractionConfig {
+        language_detection: Some(detection),
+        ..Default::default()
+    };
+
+    println!("{:?}", config.language_detection);
+}
+```
--- a/docs/snippets/rust/metadata/language_detection_multilingual.md
+++ b/docs/snippets/rust/metadata/language_detection_multilingual.md
@@ -0,0 +1,16 @@
+```rust title="Rust"
+use kreuzberg::{extract_file, ExtractionConfig, LanguageDetectionConfig};
+
+// Detection settings: enabled, `min_confidence` of 0.8, `detect_multiple` on.
+let detection = LanguageDetectionConfig {
+    enabled: true,
+    min_confidence: 0.8,
+    detect_multiple: true,
+};
+let config = ExtractionConfig {
+    language_detection: Some(detection),
+    ..Default::default()
+};
+
+// `.await?` means this fragment belongs inside an async function that returns a `Result`.
+let result = extract_file("multilingual_document.pdf", None, &config).await?;
+
+println!("Detected languages: {:?}", result.detected_languages);
+```
--- a/docs/snippets/rust/metadata/metadata.md
+++ b/docs/snippets/rust/metadata/metadata.md
@@ -0,0 +1,82 @@
+```rust title="Rust"
+use kreuzberg::{extract_file_sync, ExtractionConfig};
+
+fn main() -> kreuzberg::Result<()> {
+    // One default configuration is shared by both extractions below.
+    let config = ExtractionConfig::default();
+
+    // --- PDF metadata -------------------------------------------------------
+    let pdf_result = extract_file_sync("document.pdf", None, &config)?;
+    if let Some(pdf_meta) = &pdf_result.metadata.pdf {
+        if let Some(pages) = &pdf_meta.page_count {
+            println!("Pages: {pages}");
+        }
+        if let Some(author) = &pdf_meta.author {
+            println!("Author: {author}");
+        }
+        if let Some(title) = &pdf_meta.title {
+            println!("Title: {title}");
+        }
+    }
+
+    // --- HTML metadata ------------------------------------------------------
+    let html_result = extract_file_sync("page.html", None, &config)?;
+    let Some(html_meta) = &html_result.metadata.html else {
+        // No HTML metadata on the result: nothing further to print.
+        return Ok(());
+    };
+
+    if let Some(title) = &html_meta.title {
+        println!("Title: {title}");
+    }
+    if let Some(description) = &html_meta.description {
+        println!("Description: {description}");
+    }
+
+    // Keywords are printed with their Debug representation.
+    println!("Keywords: {:?}", html_meta.keywords);
+
+    // The canonical URL lives in the `canonical_url` field.
+    if let Some(canonical) = &html_meta.canonical_url {
+        println!("Canonical URL: {canonical}");
+    }
+
+    // Open Graph values are looked up by key.
+    if let Some(og_image) = html_meta.open_graph.get("image") {
+        println!("Open Graph Image: {og_image}");
+    }
+    if let Some(og_title) = html_meta.open_graph.get("title") {
+        println!("Open Graph Title: {og_title}");
+    }
+
+    // Twitter Card values are looked up by key as well.
+    if let Some(card_type) = html_meta.twitter_card.get("card") {
+        println!("Twitter Card Type: {card_type}");
+    }
+
+    if let Some(language) = &html_meta.language {
+        println!("Language: {language}");
+    }
+
+    // Iterating an empty collection prints nothing, so the loops need no
+    // emptiness check in front of them.
+    for header in &html_meta.headers {
+        println!("Header (level {}): {}", header.level, header.text);
+    }
+    for link in &html_meta.links {
+        println!("Link: {} ({})", link.href, link.text);
+    }
+    for image in &html_meta.images {
+        println!("Image: {}", image.src);
+    }
+
+    // The structured-data count is only reported when there is at least one item.
+    if !html_meta.structured_data.is_empty() {
+        println!("Structured data items: {}", html_meta.structured_data.len());
+    }
+
+    Ok(())
+}
+```
--- a/docs/snippets/rust/metadata/page_boundaries.md
+++ b/docs/snippets/rust/metadata/page_boundaries.md
@@ -0,0 +1,26 @@
+```rust title="Rust"
+use kreuzberg::{extract_file_sync, ExtractionConfig};
+
+fn main() -> kreuzberg::Result<()> {
+    // Upper bound, in bytes, on the text previewed for each page.
+    const PREVIEW_BYTES: usize = 100;
+
+    let config = ExtractionConfig::default();
+    let result = extract_file_sync("document.pdf", None, &config)?;
+
+    // Nothing to show when the result carries no page metadata or no boundaries.
+    let Some(pages) = &result.metadata.pages else {
+        return Ok(());
+    };
+    let Some(boundaries) = &pages.boundaries else {
+        return Ok(());
+    };
+
+    // Only the first three pages are previewed.
+    for boundary in boundaries.iter().take(3) {
+        // Each boundary is used as a byte range into the extracted content.
+        let page_text = &result.content[boundary.byte_start..boundary.byte_end];
+
+        // The byte limit may land inside a multi-byte UTF-8 character, and
+        // slicing a `str` off a character boundary panics. Back up to the
+        // nearest boundary; index 0 always qualifies, so the loop terminates.
+        let mut preview_end = PREVIEW_BYTES.min(page_text.len());
+        while !page_text.is_char_boundary(preview_end) {
+            preview_end -= 1;
+        }
+
+        println!("Page {}:", boundary.page_number);
+        println!("  Byte range: {}-{}", boundary.byte_start, boundary.byte_end);
+        println!("  Preview: {}...", &page_text[..preview_end]);
+    }
+
+    Ok(())
+}
+```
--- a/docs/snippets/rust/metadata/page_tracking_basic.md
+++ b/docs/snippets/rust/metadata/page_tracking_basic.md
@@ -0,0 +1,20 @@
+Use Kreuzberg::{extract_file_sync, ExtractionConfig, PageConfig};
+
+Let config = ExtractionConfig {
+pages: Some(PageConfig {
+extract_pages: true,
+..Default::default()
+}),
+..Default::default()
+};
+
+Let result = extract_file_sync("document.pdf", &config)?;
+
+If let Some(pages) = result.pages {
+for page in pages {
+println!("Page {}:", page.page_number);
+println!(" Content: {} chars", page.content.len());
+println!(" Tables: {}", page.tables.len());
+println!(" Images: {}", page.images.len());
+}
+}
--- a/docs/snippets/rust/metadata/pdf_metadata_extractor.md
+++ b/docs/snippets/rust/metadata/pdf_metadata_extractor.md
@@ -0,0 +1,79 @@
+```rust title="Rust"
+use kreuzberg::plugins::{Plugin, PostProcessor, ProcessingStage};
+use kreuzberg::{Result, ExtractionResult, ExtractionConfig};
+use async_trait::async_trait;
+use std::sync::atomic::{AtomicUsize, Ordering};
+
+/// Plugin state for the PDF metadata example.
+struct PdfMetadataExtractor {
+    /// Atomic counter, so it can be updated through a shared `&self`.
+    processed_count: AtomicUsize,
+}
+
+impl PdfMetadataExtractor {
+    /// Creates an extractor whose counter starts at zero.
+    fn new() -> Self {
+        let processed_count = AtomicUsize::new(0);
+        Self { processed_count }
+    }
+}
+
+/// Identity and lifecycle hooks for the extractor.
+impl Plugin for PdfMetadataExtractor {
+    /// Name reported by this plugin.
+    fn name(&self) -> &str {
+        "pdf-metadata-extractor"
+    }
+
+    /// Version string reported by this plugin.
+    fn version(&self) -> String {
+        String::from("1.0.0")
+    }
+
+    /// One-line description of what the plugin does.
+    fn description(&self) -> &str {
+        "Extracts and enriches PDF metadata"
+    }
+
+    /// Logs that the plugin is ready; no other setup is performed.
+    fn initialize(&self) -> Result<()> {
+        log::info!("PDF metadata extractor initialized");
+        Ok(())
+    }
+
+    /// Logs the current value of the processed counter.
+    fn shutdown(&self) -> Result<()> {
+        log::info!("Processed {} PDFs", self.processed_count.load(Ordering::Acquire));
+        Ok(())
+    }
+}
+
+/// Post-processing hook: `should_process` accepts PDF results only, and
+/// `process` records a warning entry on each result it handles.
+#[async_trait]
+impl PostProcessor for PdfMetadataExtractor {
+    /// Increments the processed counter and appends a warning entry to `result`.
+    async fn process(
+        &self,
+        result: &mut ExtractionResult,
+        _config: &ExtractionConfig,
+    ) -> Result<()> {
+        self.processed_count.fetch_add(1, Ordering::AcqRel);
+
+        // `ProcessingWarning` is not among this snippet's `use` lines, so it is
+        // named by its crate path here.
+        // NOTE(review): assumes the type is re-exported at the crate root like
+        // `ExtractionResult` -- confirm against the kreuzberg API docs.
+        result.processing_warnings.push(kreuzberg::ProcessingWarning {
+            source: "pdf-metadata-extractor".to_string(),
+            message: "PDF metadata extracted successfully".to_string(),
+        });
+
+        Ok(())
+    }
+
+    /// This processor reports the `Early` stage.
+    fn processing_stage(&self) -> ProcessingStage {
+        ProcessingStage::Early
+    }
+
+    /// Only results whose MIME type is exactly `application/pdf` are processed.
+    fn should_process(
+        &self,
+        result: &ExtractionResult,
+        _config: &ExtractionConfig,
+    ) -> bool {
+        result.mime_type == "application/pdf"
+    }
+
+    /// Fixed estimate of 10 ms, independent of the result.
+    fn estimated_duration_ms(&self, _result: &ExtractionResult) -> u64 {
+        10
+    }
+}
+
+use kreuzberg::plugins::registry::get_post_processor_registry;
+use std::sync::Arc;
+
+/// Wraps a fresh `PdfMetadataExtractor` in an `Arc` and hands it to the
+/// post-processor registry.
+fn register() -> Result<()> {
+    // NOTE(review): 50 looks like a priority value -- confirm against the registry API.
+    let priority = 50;
+    let extractor = Arc::new(PdfMetadataExtractor::new());
+    get_post_processor_registry().register(extractor, priority)?;
+    Ok(())
+}
+```
--- a/docs/snippets/rust/metadata/pdf_only_processor.md
+++ b/docs/snippets/rust/metadata/pdf_only_processor.md
@@ -0,0 +1,23 @@
+```rust title="Rust"
+// The `async fn` below needs the `async_trait` attribute on the impl. It is
+// written with its full path because this fragment has no `use` lines.
+#[async_trait::async_trait]
+impl PostProcessor for PdfOnlyProcessor {
+    /// Does nothing: this example only demonstrates the filtering in `should_process`.
+    async fn process(
+        &self,
+        _result: &mut ExtractionResult,
+        _config: &ExtractionConfig
+    ) -> Result<()> {
+        Ok(())
+    }
+
+    /// This processor reports the `Middle` stage.
+    fn processing_stage(&self) -> ProcessingStage {
+        ProcessingStage::Middle
+    }
+
+    /// Only results whose MIME type is exactly `application/pdf` are processed.
+    fn should_process(
+        &self,
+        result: &ExtractionResult,
+        _config: &ExtractionConfig
+    ) -> bool {
+        result.mime_type == "application/pdf"
+    }
+}
+```
--- a/docs/snippets/rust/metadata/tables.md
+++ b/docs/snippets/rust/metadata/tables.md
@@ -0,0 +1,17 @@
+```rust title="Rust"
+use kreuzberg::{extract_file_sync, ExtractionConfig};
+
+fn main() -> kreuzberg::Result<()> {
+    let config = ExtractionConfig::default();
+    let result = extract_file_sync("document.pdf", None, &config)?;
+
+    for table in &result.tables {
+        let rows = &table.cells;
+
+        // Row count first, then the table's markdown rendering.
+        println!("Table with {} rows", rows.len());
+        println!("{}", table.markdown);
+
+        // Finally dump every row with its Debug representation.
+        for row in rows {
+            println!("{row:?}");
+        }
+    }
+
+    Ok(())
+}
+```
--- a/docs/snippets/rust/metadata/vector_database_integration.md
+++ b/docs/snippets/rust/metadata/vector_database_integration.md
@@ -0,0 +1,55 @@
+```rust title="Rust"
+use kreuzberg::{extract_file, ExtractionConfig, ChunkingConfig, EmbeddingConfig};
+
+/// One entry for a vector store: a chunk's text, its embedding, and
+/// string-valued bookkeeping fields.
+struct VectorRecord {
+    /// Record identifier.
+    id: String,
+    /// Text of the chunk.
+    content: String,
+    /// Embedding vector for `content`.
+    embedding: Vec<f32>,
+    /// Free-form string key/value pairs stored alongside the record.
+    metadata: std::collections::HashMap<String, String>,
+}
+
+/// Extracts `document_path` with chunking and embeddings enabled and returns
+/// one `VectorRecord` per chunk that carries an embedding.
+///
+/// Record ids have the form `{document_id}_chunk_{index}`, where `index` is the
+/// chunk's position among all chunks, including those skipped for having no
+/// embedding.
+///
+/// # Errors
+///
+/// Returns the error from `extract_file` if extraction fails.
+async fn extract_and_vectorize(
+    document_path: &str,
+    document_id: &str,
+) -> Result<Vec<VectorRecord>, Box<dyn std::error::Error>> {
+    let config = ExtractionConfig {
+        chunking: Some(ChunkingConfig {
+            max_characters: 512,
+            overlap: 50,
+            embedding: Some(EmbeddingConfig {
+                model: kreuzberg::EmbeddingModelType::Preset {
+                    name: "balanced".to_string(),
+                },
+                normalize: true,
+                batch_size: 32,
+                ..Default::default()
+            }),
+            ..Default::default()
+        }),
+        ..Default::default()
+    };
+
+    let result = extract_file(document_path, None, &config).await?;
+
+    // The result is owned, so chunks are consumed and their text and embedding
+    // are moved into the records instead of being cloned.
+    let records = result
+        .chunks
+        .into_iter()
+        .flatten()
+        // Enumerate before filtering so indices match the chunk positions.
+        .enumerate()
+        .filter_map(|(index, chunk)| {
+            // Chunks without an embedding produce no record.
+            let embedding = chunk.embedding?;
+
+            let metadata = std::collections::HashMap::from([
+                ("document_id".to_string(), document_id.to_string()),
+                ("chunk_index".to_string(), index.to_string()),
+                ("content_length".to_string(), chunk.content.len().to_string()),
+            ]);
+
+            Some(VectorRecord {
+                id: format!("{document_id}_chunk_{index}"),
+                content: chunk.content,
+                embedding,
+                metadata,
+            })
+        })
+        .collect();
+
+    Ok(records)
+}
+```