Files
fil/docs/snippets/rust/api/combining_all_features.md
Henrik Jess Nielsen b4c07d3693
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s
Nomad changes
2026-06-01 23:40:55 +02:00

1.8 KiB

use kreuzberg::{
    ChunkingConfig, ChunkerType, ExtractionConfig, ImageExtractionConfig,
    OcrConfig, OutputFormat, extract_file_sync,
};

/// Extracts `report.pdf` with OCR, chunking, structured Markdown output,
/// image extraction, caching, and quality processing all enabled in a single
/// `ExtractionConfig`, then prints a short summary of the result.
///
/// # Errors
///
/// Returns the error from `extract_file_sync` if extraction fails.
fn main() -> kreuzberg::Result<()> {
    /// Maximum number of characters of extracted content shown in the preview.
    const PREVIEW_CHARS: usize = 200;

    let config = ExtractionConfig {
        // OCR: force Tesseract on all pages with English text
        force_ocr: true,
        ocr: Some(OcrConfig {
            backend: "tesseract".to_string(),
            language: "eng".to_string(),
            ..Default::default()
        }),
        // Chunking: semantic markdown chunks of ~800 chars, 100-char overlap
        chunking: Some(ChunkingConfig {
            max_characters: 800,
            overlap: 100,
            chunker_type: ChunkerType::Markdown,
            prepend_heading_context: true,
            ..Default::default()
        }),
        // Output: include document structure and tables
        output_format: OutputFormat::Markdown,
        include_document_structure: true,
        // Images: extract embedded images
        images: Some(ImageExtractionConfig {
            extract_images: true,
            ..Default::default()
        }),
        // Cache extracted results on disk
        use_cache: true,
        enable_quality_processing: true,
        ..Default::default()
    };

    let result = extract_file_sync("report.pdf", None, &config)?;
    let content = &result.content;

    // Count characters, not bytes, so the "chars" label stays accurate for
    // non-ASCII text.
    println!("Content ({} chars):", content.chars().count());

    // Cut the preview on a character boundary: slicing a `str` at an arbitrary
    // byte offset panics when the offset falls inside a multi-byte character.
    let preview_end = content
        .char_indices()
        .nth(PREVIEW_CHARS)
        .map_or(content.len(), |(byte_idx, _)| byte_idx);
    println!("{}", &content[..preview_end]);

    if let Some(chunks) = &result.chunks {
        println!("\nChunks: {}", chunks.len());
    }
    println!("Tables: {}", result.tables.len());
    if let Some(langs) = &result.detected_languages {
        println!("Languages: {:?}", langs);
    }
    if let Some(method) = &result.extraction_method {
        println!("Extraction method: {:?}", method);
    }
    Ok(())
}