This commit is contained in:
55
docs/snippets/rust/api/combining_all_features.md
Normal file
55
docs/snippets/rust/api/combining_all_features.md
Normal file
@@ -0,0 +1,55 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{
|
||||
ChunkingConfig, ChunkerType, ExtractionConfig, ImageExtractionConfig,
|
||||
OcrConfig, OutputFormat, extract_file_sync,
|
||||
};
|
||||
|
||||
fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig {
|
||||
// OCR: force Tesseract on all pages with English text
|
||||
force_ocr: false,
|
||||
ocr: Some(OcrConfig {
|
||||
backend: "tesseract".to_string(),
|
||||
language: "eng".to_string(),
|
||||
..Default::default()
|
||||
}),
|
||||
// Chunking: semantic markdown chunks of ~800 chars, 100-char overlap
|
||||
chunking: Some(ChunkingConfig {
|
||||
max_characters: 800,
|
||||
overlap: 100,
|
||||
chunker_type: ChunkerType::Markdown,
|
||||
prepend_heading_context: true,
|
||||
..Default::default()
|
||||
}),
|
||||
// Output: include document structure and tables
|
||||
output_format: OutputFormat::Markdown,
|
||||
include_document_structure: true,
|
||||
// Images: extract embedded images
|
||||
images: Some(ImageExtractionConfig {
|
||||
extract_images: true,
|
||||
..Default::default()
|
||||
}),
|
||||
// Cache extracted results on disk
|
||||
use_cache: true,
|
||||
enable_quality_processing: true,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file_sync("report.pdf", None, &config)?;
|
||||
|
||||
println!("Content ({} chars):", result.content.len());
|
||||
println!("{}", &result.content[..result.content.len().min(200)]);
|
||||
|
||||
if let Some(chunks) = &result.chunks {
|
||||
println!("\nChunks: {}", chunks.len());
|
||||
}
|
||||
println!("Tables: {}", result.tables.len());
|
||||
if let Some(langs) = &result.detected_languages {
|
||||
println!("Languages: {:?}", langs);
|
||||
}
|
||||
if let Some(method) = result.extraction_method {
|
||||
println!("Extraction method: {:?}", method);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
Reference in New Issue
Block a user