This commit is contained in:
46
docs/snippets/rust/advanced/advanced_config.md
Normal file
46
docs/snippets/rust/advanced/advanced_config.md
Normal file
@@ -0,0 +1,46 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{
|
||||
extract_file_sync, ChunkingConfig, ExtractionConfig, LanguageDetectionConfig, OcrConfig,
|
||||
};
|
||||
|
||||
fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig {
|
||||
ocr: Some(OcrConfig {
|
||||
backend: "tesseract".to_string(),
|
||||
language: "eng+deu".to_string(),
|
||||
..Default::default()
|
||||
}),
|
||||
|
||||
chunking: Some(ChunkingConfig {
|
||||
max_characters: 1000,
|
||||
overlap: 100,
|
||||
..Default::default()
|
||||
}),
|
||||
|
||||
language_detection: Some(LanguageDetectionConfig {
|
||||
enabled: true,
|
||||
detect_multiple: true,
|
||||
..Default::default()
|
||||
}),
|
||||
|
||||
use_cache: true,
|
||||
enable_quality_processing: true,
|
||||
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file_sync("document.pdf", None, &config)?;
|
||||
|
||||
if let Some(chunks) = result.chunks {
|
||||
for chunk in chunks {
|
||||
let preview: String = chunk.content.chars().take(100).collect();
|
||||
println!("Chunk: {}...", preview);
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(languages) = result.detected_languages {
|
||||
println!("Languages: {:?}", languages);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
29
docs/snippets/rust/advanced/chunk_page_mapping.md
Normal file
29
docs/snippets/rust/advanced/chunk_page_mapping.md
Normal file
@@ -0,0 +1,29 @@
|
||||
Use Kreuzberg::{extract_file_sync, ExtractionConfig, ChunkingConfig, PageConfig};
|
||||
|
||||
Let config = ExtractionConfig {
|
||||
chunking: Some(ChunkingConfig {
|
||||
max_characters: 500,
|
||||
overlap: 50,
|
||||
..Default::default()
|
||||
}),
|
||||
pages: Some(PageConfig {
|
||||
extract_pages: true,
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
Let result = extract_file_sync("document.pdf", None, &config)?;
|
||||
|
||||
If let Some(chunks) = result.chunks {
|
||||
for chunk in chunks {
|
||||
if let (Some(first), Some(last)) = (chunk.metadata.first_page, chunk.metadata.last_page) {
|
||||
let page_range = if first == last {
|
||||
format!("Page {}", first)
|
||||
} else {
|
||||
format!("Pages {}-{}", first, last)
|
||||
};
|
||||
println!("Chunk: {}... ({})", chunk.content.chars().take(50).collect::<String>(), page_range);
|
||||
}
|
||||
}
|
||||
}
|
||||
21
docs/snippets/rust/advanced/chunking.md
Normal file
21
docs/snippets/rust/advanced/chunking.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{ChunkingConfig, EmbeddingConfig, EmbeddingModelType, ExtractionConfig};
|
||||
|
||||
fn main() {
|
||||
let config = ExtractionConfig {
|
||||
chunking: Some(ChunkingConfig {
|
||||
max_characters: 1500,
|
||||
overlap: 200,
|
||||
embedding: Some(EmbeddingConfig {
|
||||
model: EmbeddingModelType::Preset {
|
||||
name: "text-embedding-all-minilm-l6-v2".to_string(),
|
||||
},
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
println!("{:?}", config.chunking);
|
||||
}
|
||||
```
|
||||
39
docs/snippets/rust/advanced/chunking_config.md
Normal file
39
docs/snippets/rust/advanced/chunking_config.md
Normal file
@@ -0,0 +1,39 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{ExtractionConfig, ChunkingConfig};
|
||||
|
||||
// Basic chunking: chunks of up to 1000 characters with an overlap of 200, no embeddings.
let config = ExtractionConfig {
    chunking: Some(ChunkingConfig {
        max_characters: 1000,
        overlap: 200,
        embedding: None,
        // `ChunkingConfig` has further fields (e.g. `chunker_type`,
        // `prepend_heading_context`); take their defaults so the literal is complete.
        ..Default::default()
    }),
    ..Default::default()
};
|
||||
```
|
||||
|
||||
```rust title="Rust - Semantic"
|
||||
use kreuzberg::{ExtractionConfig, ChunkingConfig, ChunkerType};
|
||||
|
||||
let config = ExtractionConfig {
|
||||
chunking: Some(ChunkingConfig {
|
||||
chunker_type: ChunkerType::Semantic,
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
```
|
||||
|
||||
```rust title="Rust - Prepend Heading Context"
|
||||
use kreuzberg::{ExtractionConfig, ChunkingConfig, ChunkerType};
|
||||
|
||||
let config = ExtractionConfig {
|
||||
chunking: Some(ChunkingConfig {
|
||||
max_characters: 500,
|
||||
overlap: 50,
|
||||
chunker_type: ChunkerType::Markdown,
|
||||
prepend_heading_context: true,
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
```
|
||||
36
docs/snippets/rust/advanced/chunking_rag.md
Normal file
36
docs/snippets/rust/advanced/chunking_rag.md
Normal file
@@ -0,0 +1,36 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file, ExtractionConfig, ChunkingConfig, EmbeddingConfig};
|
||||
|
||||
let config = ExtractionConfig {
|
||||
chunking: Some(ChunkingConfig {
|
||||
max_characters: 500,
|
||||
overlap: 50,
|
||||
embedding: Some(EmbeddingConfig {
|
||||
model: "balanced".to_string(),
|
||||
normalize: true,
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file("research_paper.pdf", None, &config).await?;
|
||||
|
||||
if let Some(chunks) = result.chunks {
|
||||
for chunk in chunks {
|
||||
println!("Chunk {}/{}",
|
||||
chunk.metadata.chunk_index + 1,
|
||||
chunk.metadata.total_chunks
|
||||
);
|
||||
println!("Position: {}-{}",
|
||||
chunk.metadata.byte_start,
|
||||
chunk.metadata.byte_end
|
||||
);
|
||||
println!("Content: {}...", &chunk.content[..100.min(chunk.content.len())]);
|
||||
if let Some(embedding) = chunk.embedding {
|
||||
println!("Embedding: {} dimensions", embedding.len());
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
23
docs/snippets/rust/advanced/embedding_config.md
Normal file
23
docs/snippets/rust/advanced/embedding_config.md
Normal file
@@ -0,0 +1,23 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{ChunkingConfig, EmbeddingConfig, EmbeddingModelType, ExtractionConfig};
|
||||
|
||||
fn main() {
|
||||
let config = ExtractionConfig {
|
||||
chunking: Some(ChunkingConfig {
|
||||
max_characters: 1000,
|
||||
embedding: Some(EmbeddingConfig {
|
||||
model: EmbeddingModelType::Preset {
|
||||
name: "all-mpnet-base-v2".to_string(),
|
||||
},
|
||||
batch_size: 16,
|
||||
normalize: true,
|
||||
show_download_progress: true,
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
println!("{:?}", config.chunking);
|
||||
}
|
||||
```
|
||||
19
docs/snippets/rust/advanced/embedding_with_chunking.md
Normal file
19
docs/snippets/rust/advanced/embedding_with_chunking.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{ExtractionConfig, ChunkingConfig, EmbeddingConfig};
|
||||
|
||||
let config = ExtractionConfig {
|
||||
chunking: Some(ChunkingConfig {
|
||||
max_characters: 1024,
|
||||
overlap: 100,
|
||||
embedding: Some(EmbeddingConfig {
|
||||
model: "balanced".to_string(),
|
||||
normalize: true,
|
||||
batch_size: 32,
|
||||
show_download_progress: false,
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
```
|
||||
15
docs/snippets/rust/advanced/keyword_extraction_config.md
Normal file
15
docs/snippets/rust/advanced/keyword_extraction_config.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{ExtractionConfig, KeywordConfig, KeywordAlgorithm};
|
||||
|
||||
let config = ExtractionConfig {
|
||||
keywords: Some(KeywordConfig {
|
||||
algorithm: KeywordAlgorithm::Yake,
|
||||
max_keywords: 10,
|
||||
min_score: 0.3,
|
||||
ngram_range: (1, 3),
|
||||
language: Some("en".to_string()),
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
```
|
||||
19
docs/snippets/rust/advanced/keyword_extraction_example.md
Normal file
19
docs/snippets/rust/advanced/keyword_extraction_example.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file, ExtractionConfig, KeywordConfig, KeywordAlgorithm};
|
||||
|
||||
let config = ExtractionConfig {
|
||||
keywords: Some(KeywordConfig {
|
||||
algorithm: KeywordAlgorithm::Yake,
|
||||
max_keywords: 10,
|
||||
min_score: 0.3,
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file("research_paper.pdf", None, &config).await?;
|
||||
|
||||
if let Some(keywords) = &result.extracted_keywords {
|
||||
println!("Keywords: {:?}", keywords);
|
||||
}
|
||||
```
|
||||
15
docs/snippets/rust/advanced/language_detection.md
Normal file
15
docs/snippets/rust/advanced/language_detection.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{ExtractionConfig, LanguageDetectionConfig};
|
||||
|
||||
fn main() {
|
||||
let config = ExtractionConfig {
|
||||
language_detection: Some(LanguageDetectionConfig {
|
||||
enabled: Some(true),
|
||||
min_confidence: Some(0.9),
|
||||
detect_multiple: Some(true),
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
println!("{:?}", config.language_detection);
|
||||
}
|
||||
```
|
||||
12
docs/snippets/rust/advanced/language_detection_config.md
Normal file
12
docs/snippets/rust/advanced/language_detection_config.md
Normal file
@@ -0,0 +1,12 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{ExtractionConfig, LanguageDetectionConfig};
|
||||
|
||||
let config = ExtractionConfig {
|
||||
language_detection: Some(LanguageDetectionConfig {
|
||||
enabled: true,
|
||||
min_confidence: 0.8,
|
||||
detect_multiple: false,
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
```
|
||||
@@ -0,0 +1,16 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file, ExtractionConfig, LanguageDetectionConfig};
|
||||
|
||||
let config = ExtractionConfig {
|
||||
language_detection: Some(LanguageDetectionConfig {
|
||||
enabled: true,
|
||||
min_confidence: 0.8,
|
||||
detect_multiple: true,
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file("multilingual_document.pdf", None, &config).await?;
|
||||
|
||||
println!("Detected languages: {:?}", result.detected_languages);
|
||||
```
|
||||
18
docs/snippets/rust/advanced/postprocessor_config.md
Normal file
18
docs/snippets/rust/advanced/postprocessor_config.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{ExtractionConfig, PostProcessorConfig};
|
||||
|
||||
fn main() {
|
||||
let config = ExtractionConfig {
|
||||
postprocessor: Some(PostProcessorConfig {
|
||||
enabled: Some(true),
|
||||
enabled_processors: Some(vec![
|
||||
"deduplication".to_string(),
|
||||
"whitespace_normalization".to_string(),
|
||||
]),
|
||||
disabled_processors: Some(vec!["mojibake_fix".to_string()]),
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
println!("{:?}", config.postprocessor);
|
||||
}
|
||||
```
|
||||
8
docs/snippets/rust/advanced/quality_processing_config.md
Normal file
8
docs/snippets/rust/advanced/quality_processing_config.md
Normal file
@@ -0,0 +1,8 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::ExtractionConfig;
|
||||
|
||||
let config = ExtractionConfig {
|
||||
enable_quality_processing: true,
|
||||
..Default::default()
|
||||
};
|
||||
```
|
||||
17
docs/snippets/rust/advanced/quality_processing_example.md
Normal file
17
docs/snippets/rust/advanced/quality_processing_example.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file, ExtractionConfig};
|
||||
|
||||
let config = ExtractionConfig {
|
||||
enable_quality_processing: true,
|
||||
..Default::default()
|
||||
};
|
||||
let result = extract_file("scanned_document.pdf", None, &config).await?;
|
||||
|
||||
if let Some(score) = result.quality_score {
|
||||
if score < 0.5 {
|
||||
println!("Warning: Low quality extraction ({:.2})", score);
|
||||
} else {
|
||||
println!("Quality score: {:.2}", score);
|
||||
}
|
||||
}
|
||||
```
|
||||
14
docs/snippets/rust/advanced/token_reduction.md
Normal file
14
docs/snippets/rust/advanced/token_reduction.md
Normal file
@@ -0,0 +1,14 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{ExtractionConfig, TokenReductionConfig};
|
||||
|
||||
fn main() {
|
||||
let config = ExtractionConfig {
|
||||
token_reduction: Some(TokenReductionConfig {
|
||||
mode: "moderate".to_string(),
|
||||
preserve_important_words: true,
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
println!("{:?}", config.token_reduction);
|
||||
}
|
||||
```
|
||||
14
docs/snippets/rust/advanced/token_reduction_config.md
Normal file
14
docs/snippets/rust/advanced/token_reduction_config.md
Normal file
@@ -0,0 +1,14 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{ExtractionConfig, TokenReductionConfig};
|
||||
|
||||
let config = ExtractionConfig {
|
||||
token_reduction: Some(TokenReductionConfig {
|
||||
mode: "moderate".to_string(),
|
||||
preserve_markdown: true,
|
||||
preserve_code: true,
|
||||
language_hint: Some("eng".to_string()),
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
```
|
||||
21
docs/snippets/rust/advanced/token_reduction_example.md
Normal file
21
docs/snippets/rust/advanced/token_reduction_example.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file, ExtractionConfig, TokenReductionConfig};
|
||||
|
||||
let config = ExtractionConfig {
|
||||
token_reduction: Some(TokenReductionConfig {
|
||||
mode: "moderate".to_string(),
|
||||
preserve_markdown: true,
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file("verbose_document.pdf", None, &config).await?;
|
||||
|
||||
if let Some(original) = result.original_token_count {
|
||||
println!("Original tokens: {}", original);
|
||||
}
|
||||
if let Some(reduced) = result.reduced_token_count {
|
||||
println!("Reduced tokens: {}", reduced);
|
||||
}
|
||||
```
|
||||
55
docs/snippets/rust/advanced/vector_database_integration.md
Normal file
55
docs/snippets/rust/advanced/vector_database_integration.md
Normal file
@@ -0,0 +1,55 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file, ExtractionConfig, ChunkingConfig, EmbeddingConfig};
|
||||
|
||||
struct VectorRecord {
|
||||
id: String,
|
||||
content: String,
|
||||
embedding: Vec<f32>,
|
||||
metadata: std::collections::HashMap<String, String>,
|
||||
}
|
||||
|
||||
async fn extract_and_vectorize(
|
||||
document_path: &str,
|
||||
document_id: &str,
|
||||
) -> Result<Vec<VectorRecord>, Box<dyn std::error::Error>> {
|
||||
let config = ExtractionConfig {
|
||||
chunking: Some(ChunkingConfig {
|
||||
max_characters: 512,
|
||||
overlap: 50,
|
||||
embedding: Some(EmbeddingConfig {
|
||||
model: kreuzberg::EmbeddingModelType::Preset {
|
||||
name: "balanced".to_string(),
|
||||
},
|
||||
normalize: true,
|
||||
batch_size: 32,
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file(document_path, None, &config).await?;
|
||||
|
||||
let mut records = Vec::new();
|
||||
if let Some(chunks) = result.chunks {
|
||||
for (index, chunk) in chunks.iter().enumerate() {
|
||||
if let Some(embedding) = &chunk.embedding {
|
||||
let mut metadata = std::collections::HashMap::new();
|
||||
metadata.insert("document_id".to_string(), document_id.to_string());
|
||||
metadata.insert("chunk_index".to_string(), index.to_string());
|
||||
metadata.insert("content_length".to_string(), chunk.content.len().to_string());
|
||||
|
||||
records.push(VectorRecord {
|
||||
id: format!("{}_chunk_{}", document_id, index),
|
||||
content: chunk.content.clone(),
|
||||
embedding: embedding.clone(),
|
||||
metadata,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(records)
|
||||
}
|
||||
```
|
||||
25
docs/snippets/rust/api/batch_extract_bytes_sync.md
Normal file
25
docs/snippets/rust/api/batch_extract_bytes_sync.md
Normal file
@@ -0,0 +1,25 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{batch_extract_bytes_sync, BatchBytesItem, ExtractionConfig};
|
||||
|
||||
fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig::default();
|
||||
let items = vec![
|
||||
BatchBytesItem {
|
||||
content: b"Hello, world!".to_vec(),
|
||||
mime_type: "text/plain".to_string(),
|
||||
config: None,
|
||||
},
|
||||
BatchBytesItem {
|
||||
content: b"# Heading\n\nParagraph text.".to_vec(),
|
||||
mime_type: "text/markdown".to_string(),
|
||||
config: None,
|
||||
},
|
||||
];
|
||||
let results = batch_extract_bytes_sync(items, &config)?;
|
||||
|
||||
for (i, result) in results.iter().enumerate() {
|
||||
println!("Item {}: {} chars", i, result.content.len());
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
18
docs/snippets/rust/api/batch_extract_files_sync.md
Normal file
18
docs/snippets/rust/api/batch_extract_files_sync.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{batch_extract_files_sync, BatchFileItem, ExtractionConfig};
|
||||
|
||||
fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig::default();
|
||||
let items = vec![
|
||||
BatchFileItem { path: "doc1.pdf".into(), config: None },
|
||||
BatchFileItem { path: "doc2.docx".into(), config: None },
|
||||
BatchFileItem { path: "report.pdf".into(), config: None },
|
||||
];
|
||||
let results = batch_extract_files_sync(items, &config)?;
|
||||
|
||||
for (i, result) in results.iter().enumerate() {
|
||||
println!("File {}: {} chars", i, result.content.len());
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
29
docs/snippets/rust/api/client_chunk_text.md
Normal file
29
docs/snippets/rust/api/client_chunk_text.md
Normal file
@@ -0,0 +1,29 @@
|
||||
```rust title="Rust"
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
let client = reqwest::Client::new();
|
||||
let bytes = tokio::fs::read("document.pdf").await?;
|
||||
|
||||
let part = reqwest::multipart::Part::bytes(bytes)
|
||||
.file_name("document.pdf")
|
||||
.mime_str("application/pdf")?;
|
||||
let form = reqwest::multipart::Form::new()
|
||||
.part("file", part)
|
||||
.text("chunking", r#"{"max_characters":800,"overlap":100}"#);
|
||||
|
||||
let response = client
|
||||
.post("http://localhost:8000/extract")
|
||||
.multipart(form)
|
||||
.send()
|
||||
.await?;
|
||||
|
||||
let result: serde_json::Value = response.error_for_status()?.json().await?;
|
||||
if let Some(chunks) = result["chunks"].as_array() {
|
||||
println!("{} chunks", chunks.len());
|
||||
for chunk in chunks {
|
||||
println!(" {} chars", chunk["content"].as_str().unwrap_or("").len());
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
28
docs/snippets/rust/api/client_extract_single_file.md
Normal file
28
docs/snippets/rust/api/client_extract_single_file.md
Normal file
@@ -0,0 +1,28 @@
|
||||
```rust title="Rust"
|
||||
use std::path::Path;
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
let client = reqwest::Client::new();
|
||||
let bytes = tokio::fs::read("document.pdf").await?;
|
||||
let file_name = Path::new("document.pdf")
|
||||
.file_name()
|
||||
.and_then(|n| n.to_str())
|
||||
.unwrap_or("document.pdf");
|
||||
|
||||
let part = reqwest::multipart::Part::bytes(bytes)
|
||||
.file_name(file_name.to_string())
|
||||
.mime_str("application/pdf")?;
|
||||
let form = reqwest::multipart::Form::new().part("file", part);
|
||||
|
||||
let response = client
|
||||
.post("http://localhost:8000/extract")
|
||||
.multipart(form)
|
||||
.send()
|
||||
.await?;
|
||||
|
||||
let result: serde_json::Value = response.error_for_status()?.json().await?;
|
||||
println!("{}", result["content"].as_str().unwrap_or(""));
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
55
docs/snippets/rust/api/combining_all_features.md
Normal file
55
docs/snippets/rust/api/combining_all_features.md
Normal file
@@ -0,0 +1,55 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{
|
||||
ChunkingConfig, ChunkerType, ExtractionConfig, ImageExtractionConfig,
|
||||
OcrConfig, OutputFormat, extract_file_sync,
|
||||
};
|
||||
|
||||
fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig {
|
||||
// OCR: force Tesseract on all pages with English text
|
||||
force_ocr: false,
|
||||
ocr: Some(OcrConfig {
|
||||
backend: "tesseract".to_string(),
|
||||
language: "eng".to_string(),
|
||||
..Default::default()
|
||||
}),
|
||||
// Chunking: semantic markdown chunks of ~800 chars, 100-char overlap
|
||||
chunking: Some(ChunkingConfig {
|
||||
max_characters: 800,
|
||||
overlap: 100,
|
||||
chunker_type: ChunkerType::Markdown,
|
||||
prepend_heading_context: true,
|
||||
..Default::default()
|
||||
}),
|
||||
// Output: include document structure and tables
|
||||
output_format: OutputFormat::Markdown,
|
||||
include_document_structure: true,
|
||||
// Images: extract embedded images
|
||||
images: Some(ImageExtractionConfig {
|
||||
extract_images: true,
|
||||
..Default::default()
|
||||
}),
|
||||
// Cache extracted results on disk
|
||||
use_cache: true,
|
||||
enable_quality_processing: true,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file_sync("report.pdf", None, &config)?;
|
||||
|
||||
println!("Content ({} chars):", result.content.len());
|
||||
println!("{}", &result.content[..result.content.len().min(200)]);
|
||||
|
||||
if let Some(chunks) = &result.chunks {
|
||||
println!("\nChunks: {}", chunks.len());
|
||||
}
|
||||
println!("Tables: {}", result.tables.len());
|
||||
if let Some(langs) = &result.detected_languages {
|
||||
println!("Languages: {:?}", langs);
|
||||
}
|
||||
if let Some(method) = result.extraction_method {
|
||||
println!("Extraction method: {:?}", method);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
21
docs/snippets/rust/api/error_handling.md
Normal file
21
docs/snippets/rust/api/error_handling.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file_sync, ExtractionConfig, KreuzbergError};
|
||||
|
||||
fn main() {
|
||||
let config = ExtractionConfig::default();
|
||||
match extract_file_sync("document.pdf", None, &config) {
|
||||
Ok(result) => println!("{}", result.content),
|
||||
Err(KreuzbergError::Io(e)) => eprintln!("File error: {e}"),
|
||||
Err(KreuzbergError::UnsupportedFormat(mime)) => {
|
||||
eprintln!("Unsupported format: {mime}");
|
||||
}
|
||||
Err(KreuzbergError::Parsing { message, .. }) => {
|
||||
eprintln!("Corrupt or invalid document: {message}");
|
||||
}
|
||||
Err(KreuzbergError::MissingDependency(dep)) => {
|
||||
eprintln!("Missing dependency — install {dep}");
|
||||
}
|
||||
Err(e) => eprintln!("Extraction failed: {e}"),
|
||||
}
|
||||
}
|
||||
```
|
||||
23
docs/snippets/rust/api/error_handling_extract.md
Normal file
23
docs/snippets/rust/api/error_handling_extract.md
Normal file
@@ -0,0 +1,23 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_bytes_sync, ExtractionConfig, KreuzbergError, Result};
|
||||
|
||||
fn extract_text(bytes: &[u8], mime_type: &str) -> Result<String> {
|
||||
let config = ExtractionConfig::default();
|
||||
let result = extract_bytes_sync(bytes, mime_type, &config)?;
|
||||
Ok(result.content)
|
||||
}
|
||||
|
||||
fn main() {
|
||||
let bytes = std::fs::read("document.pdf").unwrap_or_default();
|
||||
match extract_text(&bytes, "application/pdf") {
|
||||
Ok(text) => println!("Extracted {} chars", text.len()),
|
||||
Err(KreuzbergError::UnsupportedFormat(mime)) => {
|
||||
eprintln!("Format not supported: {mime}");
|
||||
}
|
||||
Err(KreuzbergError::Ocr { message, .. }) => {
|
||||
eprintln!("OCR failed: {message}");
|
||||
}
|
||||
Err(e) => eprintln!("Error: {e}"),
|
||||
}
|
||||
}
|
||||
```
|
||||
14
docs/snippets/rust/api/extract_bytes_async.md
Normal file
14
docs/snippets/rust/api/extract_bytes_async.md
Normal file
@@ -0,0 +1,14 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_bytes, ExtractionConfig};
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> kreuzberg::Result<()> {
|
||||
let content = tokio::fs::read("document.pdf").await?;
|
||||
let config = ExtractionConfig::default();
|
||||
let result = extract_bytes(&content, "application/pdf", &config).await?;
|
||||
|
||||
println!("{}", result.content);
|
||||
println!("Tables: {}", result.tables.len());
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
13
docs/snippets/rust/api/extract_bytes_sync.md
Normal file
13
docs/snippets/rust/api/extract_bytes_sync.md
Normal file
@@ -0,0 +1,13 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_bytes_sync, ExtractionConfig};
|
||||
|
||||
fn main() -> kreuzberg::Result<()> {
|
||||
let content = std::fs::read("document.pdf")?;
|
||||
let config = ExtractionConfig::default();
|
||||
let result = extract_bytes_sync(&content, "application/pdf", &config)?;
|
||||
|
||||
println!("{}", result.content);
|
||||
println!("Tables: {}", result.tables.len());
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
14
docs/snippets/rust/api/extract_file_async.md
Normal file
14
docs/snippets/rust/api/extract_file_async.md
Normal file
@@ -0,0 +1,14 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file, ExtractionConfig};
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig::default();
|
||||
let result = extract_file("document.pdf", None::<&str>, &config).await?;
|
||||
|
||||
println!("{}", result.content);
|
||||
println!("MIME type: {}", result.mime_type);
|
||||
println!("Tables: {}", result.tables.len());
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
13
docs/snippets/rust/api/extract_file_sync.md
Normal file
13
docs/snippets/rust/api/extract_file_sync.md
Normal file
@@ -0,0 +1,13 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file_sync, ExtractionConfig};
|
||||
|
||||
fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig::default();
|
||||
let result = extract_file_sync("document.pdf", None, &config)?;
|
||||
|
||||
println!("{}", result.content);
|
||||
println!("MIME type: {}", result.mime_type);
|
||||
println!("Tables: {}", result.tables.len());
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
61
docs/snippets/rust/benchmarking/simple_benchmark.rs
Normal file
61
docs/snippets/rust/benchmarking/simple_benchmark.rs
Normal file
@@ -0,0 +1,61 @@
|
||||
```rust title="simple_benchmark.rs"
|
||||
use kreuzberg::{extract_file_sync, extract_file, ExtractionConfig};
|
||||
use std::time::Instant;
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig {
|
||||
use_cache: false,
|
||||
..Default::default()
|
||||
};
|
||||
let file_path = "document.pdf";
|
||||
let num_runs = 10;
|
||||
|
||||
let start = Instant::now();
|
||||
for _ in 0..num_runs {
|
||||
let _ = extract_file_sync(file_path, None, &config)?;
|
||||
}
|
||||
let sync_duration = start.elapsed().as_secs_f64();
|
||||
let avg_sync = sync_duration / num_runs as f64;
|
||||
|
||||
println!("Sync extraction ({} runs):", num_runs);
|
||||
println!(" - Total time: {:.3}s", sync_duration);
|
||||
println!(" - Average: {:.3}s per extraction", avg_sync);
|
||||
|
||||
let start = Instant::now();
|
||||
let mut tasks = vec![];
|
||||
for _ in 0..num_runs {
|
||||
tasks.push(extract_file(file_path, None, &config));
|
||||
}
|
||||
let results = futures::future::join_all(tasks).await;
|
||||
for result in results {
|
||||
result?;
|
||||
}
|
||||
let async_duration = start.elapsed().as_secs_f64();
|
||||
|
||||
println!("\nAsync extraction ({} parallel runs):", num_runs);
|
||||
println!(" - Total time: {:.3}s", async_duration);
|
||||
println!(" - Average: {:.3}s per extraction", async_duration / num_runs as f64);
|
||||
println!(" - Speedup: {:.1}x", sync_duration / async_duration);
|
||||
|
||||
let config_cached = ExtractionConfig {
|
||||
use_cache: true,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
println!("\nFirst extraction (populates cache)...");
|
||||
let start = Instant::now();
|
||||
let _result1 = extract_file_sync(file_path, None, &config_cached)?;
|
||||
let first_duration = start.elapsed().as_secs_f64();
|
||||
println!(" - Time: {:.3}s", first_duration);
|
||||
|
||||
println!("Second extraction (from cache)...");
|
||||
let start = Instant::now();
|
||||
let _result2 = extract_file_sync(file_path, None, &config_cached)?;
|
||||
let cached_duration = start.elapsed().as_secs_f64();
|
||||
println!(" - Time: {:.3}s", cached_duration);
|
||||
println!(" - Cache speedup: {:.1}x", first_duration / cached_duration);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
28
docs/snippets/rust/cache/disk_cache.rs
vendored
Normal file
28
docs/snippets/rust/cache/disk_cache.rs
vendored
Normal file
@@ -0,0 +1,28 @@
|
||||
```rust title="disk_cache.rs"
|
||||
use kreuzberg::{extract_file_sync, ExtractionConfig};
|
||||
|
||||
fn main() -> kreuzberg::Result<()> {
|
||||
let path = std::env::args()
|
||||
.skip(1)
|
||||
.find(|a| !a.is_empty() && a != "--")
|
||||
.unwrap_or_else(|| "document.pdf".to_string());
|
||||
|
||||
// Enable caching (default: true). The Rust crate uses an internal disk cache.
|
||||
let config = ExtractionConfig {
|
||||
use_cache: true,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
println!("First extraction (will be cached)...");
|
||||
let result1 = extract_file_sync(&path, None, &config)?;
|
||||
println!(" - Content length: {}", result1.content.len());
|
||||
|
||||
println!("\nSecond extraction (from cache when available)...");
|
||||
let result2 = extract_file_sync(&path, None, &config)?;
|
||||
println!(" - Content length: {}", result2.content.len());
|
||||
|
||||
println!("\nResults are identical: {}", result1.content == result2.content);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
30
docs/snippets/rust/cli/basic_cli.rs
Normal file
30
docs/snippets/rust/cli/basic_cli.rs
Normal file
@@ -0,0 +1,30 @@
|
||||
```rust title="basic_cli.rs"
|
||||
use std::process::Command;
|
||||
use anyhow::Result;
|
||||
|
||||
fn extract_with_cli(file_path: &str, output_format: &str) -> Result<String> {
|
||||
let output = Command::new("kreuzberg")
|
||||
.args(&["extract", file_path, "--format", output_format])
|
||||
.output()?;
|
||||
|
||||
if !output.status.success() {
|
||||
let stderr = String::from_utf8_lossy(&output.stderr);
|
||||
anyhow::bail!("CLI error: {}", stderr);
|
||||
}
|
||||
|
||||
Ok(String::from_utf8(output.stdout)?)
|
||||
}
|
||||
|
||||
fn main() -> Result<()> {
|
||||
let document = "document.pdf";
|
||||
|
||||
let text_output = extract_with_cli(document, "text")?;
|
||||
println!("Extracted: {} characters", text_output.len());
|
||||
|
||||
let json_output = extract_with_cli(document, "json")?;
|
||||
let parsed: serde_json::Value = serde_json::from_str(&json_output)?;
|
||||
println!("Format: {}", parsed.get("format").unwrap_or(&"unknown".into()));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
39
docs/snippets/rust/cli/cli_with_config.rs
Normal file
39
docs/snippets/rust/cli/cli_with_config.rs
Normal file
@@ -0,0 +1,39 @@
|
||||
```rust title="cli_with_config.rs"
|
||||
use std::process::Command;
|
||||
use anyhow::Result;
|
||||
|
||||
fn extract_with_config(file_path: &str, config_path: &str) -> Result<serde_json::Value> {
|
||||
let output = Command::new("kreuzberg")
|
||||
.args(&[
|
||||
"extract",
|
||||
file_path,
|
||||
"--config",
|
||||
config_path,
|
||||
"--format",
|
||||
"json",
|
||||
])
|
||||
.output()?;
|
||||
|
||||
if !output.status.success() {
|
||||
let stderr = String::from_utf8_lossy(&output.stderr);
|
||||
anyhow::bail!("CLI error: {}", stderr);
|
||||
}
|
||||
|
||||
let result: serde_json::Value = serde_json::from_slice(&output.stdout)?;
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
fn main() -> Result<()> {
|
||||
let config_file = "kreuzberg.toml";
|
||||
let document = "document.pdf";
|
||||
|
||||
println!("Extracting {} with config {}", document, config_file);
|
||||
let result = extract_with_config(document, config_file)?;
|
||||
|
||||
println!("Content length: {}", result["content"].as_str().unwrap_or("").len());
|
||||
println!("Format: {}", result["format"].as_str().unwrap_or("unknown"));
|
||||
println!("Languages: {}", result["languages"].to_string());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
63
docs/snippets/rust/config/advanced_config.md
Normal file
63
docs/snippets/rust/config/advanced_config.md
Normal file
@@ -0,0 +1,63 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file, ExtractionConfig, OcrConfig, ChunkingConfig, LanguageDetectionConfig, TokenReductionConfig, PostProcessorConfig, EmbeddingConfig, EmbeddingModelType};
|
||||
use kreuzberg::keywords::{KeywordConfig, KeywordAlgorithm};
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig {
|
||||
use_cache: true,
|
||||
enable_quality_processing: true,
|
||||
ocr: Some(OcrConfig {
|
||||
backend: "tesseract".to_string(),
|
||||
language: "eng".to_string(),
|
||||
tesseract_config: None,
|
||||
output_format: None,
|
||||
paddle_ocr_config: None,
|
||||
element_config: None,
|
||||
}),
|
||||
chunking: Some(ChunkingConfig {
|
||||
max_characters: 1000,
|
||||
overlap: 200,
|
||||
embedding: Some(EmbeddingConfig {
|
||||
model: EmbeddingModelType::Preset { name: "balanced".to_string() },
|
||||
batch_size: 32,
|
||||
normalize: true,
|
||||
show_download_progress: false,
|
||||
cache_dir: None,
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
language_detection: Some(LanguageDetectionConfig {
|
||||
enabled: true,
|
||||
min_confidence: 0.8,
|
||||
detect_multiple: false,
|
||||
}),
|
||||
keywords: Some(KeywordConfig {
|
||||
algorithm: KeywordAlgorithm::Yake,
|
||||
max_keywords: 10,
|
||||
min_score: 0.1,
|
||||
ngram_range: (1, 3),
|
||||
language: Some("en".to_string()),
|
||||
..Default::default()
|
||||
}),
|
||||
token_reduction: Some(TokenReductionConfig {
|
||||
mode: "moderate".to_string(),
|
||||
preserve_important_words: true,
|
||||
}),
|
||||
postprocessor: Some(PostProcessorConfig {
|
||||
enabled: true,
|
||||
enabled_processors: None,
|
||||
disabled_processors: None,
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file("document.pdf", None::<&str>, &config).await?;
|
||||
println!("Content: {}", result.content);
|
||||
if let Some(langs) = &result.detected_languages {
|
||||
println!("Languages: {:?}", langs);
|
||||
}
|
||||
println!("Chunks: {}", result.chunks.as_ref().map(|c| c.len()).unwrap_or(0));
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
79
docs/snippets/rust/config/chunking_config.md
Normal file
79
docs/snippets/rust/config/chunking_config.md
Normal file
@@ -0,0 +1,79 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file, ExtractionConfig, ChunkingConfig};
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig {
|
||||
chunking: Some(ChunkingConfig {
|
||||
max_characters: 1000,
|
||||
overlap: 200,
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file("document.pdf", None::<&str>, &config).await?;
|
||||
println!("Chunks: {}", result.chunks.len());
|
||||
for chunk in &result.chunks {
|
||||
println!("Length: {}", chunk.content.len());
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
|
||||
```rust title="Rust - Markdown with Heading Context"
|
||||
use kreuzberg::{extract_file, ExtractionConfig, ChunkingConfig, ChunkerType, ChunkSizing};
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig {
|
||||
chunking: Some(ChunkingConfig {
|
||||
max_characters: 500,
|
||||
overlap: 50,
|
||||
chunker_type: ChunkerType::Markdown,
|
||||
sizing: ChunkSizing::Tokenizer {
|
||||
model: "Xenova/gpt-4o".into(),
|
||||
cache_dir: None,
|
||||
},
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file("document.md", None::<&str>, &config).await?;
|
||||
for chunk in &result.chunks {
|
||||
if let Some(heading_context) = &chunk.metadata.heading_context {
|
||||
for heading in &heading_context.headings {
|
||||
println!("Heading L{}: {}", heading.level, heading.text);
|
||||
}
|
||||
}
|
||||
println!("Content: {}...", &chunk.content[..100.min(chunk.content.len())]);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
|
||||
```rust title="Rust - Prepend Heading Context"
|
||||
use kreuzberg::{extract_file, ExtractionConfig, ChunkingConfig, ChunkerType};
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig {
|
||||
chunking: Some(ChunkingConfig {
|
||||
max_characters: 500,
|
||||
overlap: 50,
|
||||
chunker_type: ChunkerType::Markdown,
|
||||
prepend_heading_context: true,
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file("document.md", None::<&str>, &config).await?;
|
||||
for chunk in &result.chunks {
|
||||
// Each chunk's content is prefixed with its heading breadcrumb
|
||||
println!("Content: {}...", &chunk.content[..100.min(chunk.content.len())]);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
16
docs/snippets/rust/config/config_basic.md
Normal file
16
docs/snippets/rust/config/config_basic.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file, ExtractionConfig};
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig {
|
||||
use_cache: true,
|
||||
enable_quality_processing: true,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file("document.pdf", None, &config).await?;
|
||||
println!("{}", result.content);
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
11
docs/snippets/rust/config/config_discover.md
Normal file
11
docs/snippets/rust/config/config_discover.md
Normal file
@@ -0,0 +1,11 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file, ExtractionConfig};
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig::discover()?.unwrap_or_default();
|
||||
let result = extract_file("document.pdf", None, &config).await?;
|
||||
println!("{}", result.content);
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
20
docs/snippets/rust/config/config_ocr.md
Normal file
20
docs/snippets/rust/config/config_ocr.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file_sync, ExtractionConfig, OcrConfig};
|
||||
|
||||
fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig {
|
||||
ocr: Some(OcrConfig {
|
||||
backend: "tesseract".to_string(),
|
||||
language: "eng".to_string(),
|
||||
tesseract_config: None,
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file_sync("scanned.pdf", None, &config)?;
|
||||
println!("Content length: {}", result.content.len());
|
||||
println!("Tables detected: {}", result.tables.len());
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
29
docs/snippets/rust/config/config_programmatic.md
Normal file
29
docs/snippets/rust/config/config_programmatic.md
Normal file
@@ -0,0 +1,29 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file_sync, ChunkingConfig, ExtractionConfig, OcrConfig, TesseractConfig};
|
||||
|
||||
fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
let config = ExtractionConfig {
|
||||
use_cache: true,
|
||||
ocr: Some(OcrConfig {
|
||||
backend: "tesseract".to_string(),
|
||||
language: "eng+deu".to_string(),
|
||||
tesseract_config: Some(TesseractConfig {
|
||||
psm: 6,
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
chunking: Some(ChunkingConfig {
|
||||
max_characters: 1000,
|
||||
overlap: 200,
|
||||
..Default::default()
|
||||
}),
|
||||
enable_quality_processing: true,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file_sync("document.pdf", None, &config)?;
|
||||
println!("Content length: {}", result.content.len());
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
17
docs/snippets/rust/config/document_structure_config.md
Normal file
17
docs/snippets/rust/config/document_structure_config.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```rust title="Document Structure Config (Rust)"
|
||||
use kreuzberg::{extract_file_sync, ExtractionConfig};
|
||||
|
||||
// Ask the extractor to include the document structure in the result.
let config = ExtractionConfig {
    include_document_structure: true,
    ..Default::default()
};

let result = extract_file_sync("document.pdf", None, &config)?;

// Print every node's type followed by a short preview of its text.
if let Some(document) = &result.document {
    for node in &document.nodes {
        let text = node.content.text().unwrap_or("");
        // Truncate by characters: slicing at a fixed byte offset panics when it
        // lands inside a multi-byte UTF-8 character.
        let preview: String = text.chars().take(80).collect();
        println!("[{}] {}", node.content.node_type_str(), preview);
    }
}
|
||||
```
|
||||
48
docs/snippets/rust/config/element_based_output.md
Normal file
48
docs/snippets/rust/config/element_based_output.md
Normal file
@@ -0,0 +1,48 @@
|
||||
```rust title="Element-Based Output (Rust)"
|
||||
use kreuzberg::{extract_file_sync, ExtractionConfig};
|
||||
use kreuzberg::types::OutputFormat as ResultFormat;
|
||||
|
||||
fn main() -> kreuzberg::Result<()> {
|
||||
// Configure element-based output (result_format controls Unified vs ElementBased)
|
||||
let config = ExtractionConfig {
|
||||
result_format: ResultFormat::ElementBased,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
// Extract document
|
||||
let result = extract_file_sync("document.pdf", None, &config)?;
|
||||
|
||||
// Access elements
|
||||
if let Some(elements) = result.elements {
|
||||
for element in &elements {
|
||||
println!("Type: {:?}", element.element_type);
|
||||
println!("Text: {}", &element.text[..100.min(element.text.len())]);
|
||||
|
||||
if let Some(page) = element.metadata.page_number {
|
||||
println!("Page: {}", page);
|
||||
}
|
||||
|
||||
if let Some(coords) = &element.metadata.coordinates {
|
||||
println!("Coords: ({}, {}) - ({}, {})",
|
||||
coords.x0, coords.y0, coords.x1, coords.y1);
|
||||
}
|
||||
|
||||
println!("---");
|
||||
}
|
||||
|
||||
// Filter by element type
|
||||
let titles: Vec<_> = elements.iter()
|
||||
.filter(|e| matches!(e.element_type, kreuzberg::types::ElementType::Title))
|
||||
.collect();
|
||||
|
||||
for title in titles {
|
||||
let level = title.metadata.additional.get("level")
|
||||
.map(|v| v.as_ref())
|
||||
.unwrap_or("unknown");
|
||||
println!("[{}] {}", level, title.text);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
28
docs/snippets/rust/config/embedding_config.md
Normal file
28
docs/snippets/rust/config/embedding_config.md
Normal file
@@ -0,0 +1,28 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file, ExtractionConfig, ChunkingConfig, EmbeddingConfig, EmbeddingModelType};
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig {
|
||||
chunking: Some(ChunkingConfig {
|
||||
max_characters: 1000,
|
||||
overlap: 200,
|
||||
embedding: Some(EmbeddingConfig {
|
||||
model: EmbeddingModelType::Preset {
|
||||
name: "balanced".to_string(),
|
||||
},
|
||||
batch_size: 16,
|
||||
normalize: true,
|
||||
show_download_progress: true,
|
||||
cache_dir: None,
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file("document.pdf", None::<&str>, &config).await?;
|
||||
println!("Chunks with embeddings: {}", result.chunks.as_ref().map(|c| c.len()).unwrap_or(0));
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
85
docs/snippets/rust/config/embedding_config.rs
Normal file
85
docs/snippets/rust/config/embedding_config.rs
Normal file
@@ -0,0 +1,85 @@
|
||||
// Example 1: Preset model (recommended)
|
||||
// Fast, balanced, or quality preset configurations optimized for common use cases.
|
||||
let embedding_config = EmbeddingConfig {
|
||||
model: EmbeddingModelType::Preset {
|
||||
name: "balanced".to_string(),
|
||||
},
|
||||
batch_size: 32,
|
||||
normalize: true,
|
||||
show_download_progress: true,
|
||||
cache_dir: Some(std::path::PathBuf::from("~/.cache/kreuzberg/embeddings")),
|
||||
acceleration: None,
|
||||
};
|
||||
|
||||
// Available presets:
|
||||
// - "fast" (384 dims): Quick prototyping, development, resource-constrained
|
||||
// - "balanced" (768 dims): Production, general-purpose RAG, English documents
|
||||
// - "quality" (1024 dims): Complex documents, maximum accuracy
|
||||
// - "multilingual" (768 dims): International documents, 100+ languages
|
||||
|
||||
// Example 2: Custom ONNX model (requires embeddings feature)
|
||||
// Direct access to specific ONNX embedding models from HuggingFace with custom dimensions.
|
||||
let embedding_config = EmbeddingConfig {
|
||||
model: EmbeddingModelType::Custom {
|
||||
model_id: "BAAI/bge-small-en-v1.5".to_string(),
|
||||
dimensions: 384,
|
||||
},
|
||||
batch_size: 32,
|
||||
normalize: true,
|
||||
show_download_progress: true,
|
||||
cache_dir: None, // Uses default: .kreuzberg/embeddings/
|
||||
acceleration: None,
|
||||
};
|
||||
|
||||
// Popular ONNX-compatible models:
|
||||
// - "BAAI/bge-small-en-v1.5" (384 dims): Fast, efficient
|
||||
// - "BAAI/bge-base-en-v1.5" (768 dims): Balanced quality/speed
|
||||
// - "BAAI/bge-large-en-v1.5" (1024 dims): High quality, slower
|
||||
// - "sentence-transformers/paraphrase-multilingual-mpnet-base-v2" (768 dims): Multilingual support
|
||||
|
||||
// Example 3: Alternative Custom ONNX Model
|
||||
// For advanced users wanting different ONNX embedding models.
|
||||
let embedding_config = EmbeddingConfig {
|
||||
model: EmbeddingModelType::Custom {
|
||||
model_id: "sentence-transformers/all-mpnet-base-v2".to_string(),
|
||||
dimensions: 768,
|
||||
},
|
||||
batch_size: 16, // Larger model requires smaller batch size
|
||||
normalize: true,
|
||||
show_download_progress: true,
|
||||
cache_dir: Some(std::path::PathBuf::from("/var/cache/embeddings")),
|
||||
acceleration: None,
|
||||
};
|
||||
|
||||
// Integration with ChunkingConfig
|
||||
// Add embeddings to your chunking configuration:
|
||||
use kreuzberg::{ChunkingConfig, ExtractionConfig};
|
||||
|
||||
let chunking_with_embeddings = ChunkingConfig {
|
||||
max_characters: 1024,
|
||||
overlap: 100,
|
||||
preset: Some("balanced".to_string()),
|
||||
embedding: Some(EmbeddingConfig::default()), // Uses balanced preset
|
||||
};
|
||||
|
||||
let extraction_config = ExtractionConfig {
|
||||
chunking: Some(chunking_with_embeddings),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
// Key parameter explanations:
|
||||
//
|
||||
// batch_size: Number of texts to embed at once (32-128 typical)
|
||||
// - Larger batches are faster but use more memory
|
||||
// - Smaller batches for resource-constrained environments
|
||||
//
|
||||
// normalize: Whether to normalize vectors (L2 norm)
|
||||
// - true (recommended): Enables cosine similarity in vector DBs
|
||||
// - false: Raw embedding values
|
||||
//
|
||||
// cache_dir: Where to store downloaded models
|
||||
// - None: Uses .kreuzberg/embeddings/ in current directory
|
||||
// - Some(path): Custom directory for model storage
|
||||
//
|
||||
// show_download_progress: Display download progress bar
|
||||
// - Useful for monitoring large model downloads
|
||||
90
docs/snippets/rust/config/hierarchy_config.rs
Normal file
90
docs/snippets/rust/config/hierarchy_config.rs
Normal file
@@ -0,0 +1,90 @@
|
||||
use kreuzberg::{ExtractionConfig, HierarchyConfig, PdfConfig};
|
||||
|
||||
// Example 1: Basic hierarchy extraction
|
||||
// Enabled with default k_clusters=6 for standard H1-H6 heading hierarchy.
|
||||
// Extract bounding box information for spatial layout awareness.
|
||||
let hierarchy_config_basic = HierarchyConfig {
|
||||
enabled: true,
|
||||
k_clusters: 6, // Default: creates 6 font size clusters (H1-H6 structure)
|
||||
include_bbox: true, // Include bounding box coordinates
|
||||
ocr_coverage_threshold: None, // No OCR coverage threshold
|
||||
};
|
||||
|
||||
let pdf_config_basic = PdfConfig {
|
||||
hierarchy: Some(hierarchy_config_basic),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let extraction_config_basic = ExtractionConfig {
|
||||
pdf_options: Some(pdf_config_basic),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
// Use with extract_file_sync or extract_bytes_sync
|
||||
// let result = extract_file_sync("document.pdf", None, &extraction_config_basic)?;
|
||||
|
||||
|
||||
// Example 2: Custom k_clusters for minimal structure
|
||||
// Use 3 clusters for simpler hierarchy with minimal structure.
|
||||
// Useful when you only need major section divisions (Main, Subsection, Detail).
|
||||
let hierarchy_config_minimal = HierarchyConfig {
|
||||
enabled: true,
|
||||
k_clusters: 3, // Minimal clustering: just 3 levels
|
||||
include_bbox: true,
|
||||
ocr_coverage_threshold: None,
|
||||
};
|
||||
|
||||
let pdf_config_minimal = PdfConfig {
|
||||
hierarchy: Some(hierarchy_config_minimal),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let extraction_config_minimal = ExtractionConfig {
|
||||
pdf_options: Some(pdf_config_minimal),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
|
||||
// Example 3: With OCR coverage threshold
|
||||
// Trigger OCR if less than 50% of text has font data.
|
||||
// Useful for documents with mixed digital and scanned content.
|
||||
let hierarchy_config_ocr = HierarchyConfig {
|
||||
enabled: true,
|
||||
k_clusters: 6,
|
||||
include_bbox: true,
|
||||
ocr_coverage_threshold: Some(0.5), // Trigger OCR if text coverage < 50%
|
||||
};
|
||||
|
||||
let pdf_config_ocr = PdfConfig {
|
||||
hierarchy: Some(hierarchy_config_ocr),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let extraction_config_ocr = ExtractionConfig {
|
||||
pdf_options: Some(pdf_config_ocr),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
|
||||
// Field descriptions:
|
||||
//
|
||||
// enabled: bool (default: true)
|
||||
// - Enable or disable hierarchy extraction
|
||||
// - When false, hierarchy structure is not analyzed
|
||||
//
|
||||
// k_clusters: usize (default: 6, valid: 1-7)
|
||||
// - Number of font size clusters for hierarchy levels
|
||||
// - 6 provides H1-H6 heading levels with body text
|
||||
// - Higher values create more fine-grained hierarchy
|
||||
// - Lower values create simpler structure
|
||||
//
|
||||
// include_bbox: bool (default: true)
|
||||
// - Include bounding box coordinates in hierarchy blocks
|
||||
// - Required for spatial layout awareness and document structure
|
||||
// - Set to false only if space optimization is critical
|
||||
//
|
||||
// ocr_coverage_threshold: Option<f32> (default: None)
|
||||
// - Range: 0.0 to 1.0
|
||||
// - Triggers OCR when text block coverage falls below this fraction
|
||||
// - Example: Some(0.5) means "run OCR if less than 50% of page has text data"
|
||||
// - None means no OCR coverage-based triggering
|
||||
14
docs/snippets/rust/config/html_output.md
Normal file
14
docs/snippets/rust/config/html_output.md
Normal file
@@ -0,0 +1,14 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file_sync, ExtractionConfig, HtmlOutputConfig, HtmlTheme, OutputFormat};
|
||||
|
||||
// Render the extracted document as HTML using the GitHub theme.
let config = ExtractionConfig {
    output_format: OutputFormat::Html,
    html_output: Some(HtmlOutputConfig {
        theme: HtmlTheme::GitHub,
        ..Default::default()
    }),
    ..Default::default()
};
// Propagate extraction failures instead of panicking; this expects the
// enclosing function to return `kreuzberg::Result`.
let result = extract_file_sync("document.pdf", None, &config)?;
println!("{}", result.content); // HTML with kb-* classes
|
||||
```
|
||||
72
docs/snippets/rust/config/keyword_config.rs
Normal file
72
docs/snippets/rust/config/keyword_config.rs
Normal file
@@ -0,0 +1,72 @@
|
||||
use kreuzberg::{extract_file, ExtractionConfig};
|
||||
use kreuzberg::keywords::{KeywordConfig, KeywordAlgorithm, YakeParams, RakeParams};
|
||||
|
||||
// Example 1: Basic YAKE configuration
|
||||
// Uses YAKE algorithm with default parameters and English stopword filtering
|
||||
pub async fn basic_yake() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig {
|
||||
keywords: Some(KeywordConfig {
|
||||
algorithm: KeywordAlgorithm::Yake,
|
||||
max_keywords: 10,
|
||||
min_score: 0.0,
|
||||
ngram_range: (1, 3),
|
||||
language: Some("en".to_string()),
|
||||
yake_params: None,
|
||||
rake_params: None,
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file("document.pdf", None::<&str>, &config).await?;
|
||||
println!("Keywords: {:?}", result.keywords);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Example 2: Advanced YAKE with custom parameters.
///
/// Fine-tunes YAKE with a custom window size for co-occurrence analysis, then
/// prints the keywords found in `document.pdf`.
#[cfg(feature = "keywords-yake")]
pub async fn advanced_yake() -> kreuzberg::Result<()> {
    let keyword_settings = KeywordConfig {
        algorithm: KeywordAlgorithm::Yake,
        max_keywords: 15,
        min_score: 0.1,
        ngram_range: (1, 2),
        language: Some("en".to_string()),
        yake_params: Some(YakeParams { window_size: 1 }),
        rake_params: None,
    };
    let settings = ExtractionConfig {
        keywords: Some(keyword_settings),
        ..Default::default()
    };

    let extraction = extract_file("document.pdf", None::<&str>, &settings).await?;
    println!("Keywords: {:?}", extraction.keywords);

    Ok(())
}
|
||||
|
||||
/// Example 3: RAKE configuration.
///
/// Uses the RAKE algorithm for rapid keyword extraction with phrase
/// constraints, then prints the keywords found in `document.pdf`.
#[cfg(feature = "keywords-rake")]
pub async fn rake_config() -> kreuzberg::Result<()> {
    let keyword_settings = KeywordConfig {
        algorithm: KeywordAlgorithm::Rake,
        max_keywords: 10,
        min_score: 5.0,
        ngram_range: (1, 3),
        language: Some("en".to_string()),
        yake_params: None,
        rake_params: Some(RakeParams {
            min_word_length: 1,
            max_words_per_phrase: 3,
        }),
    };
    let settings = ExtractionConfig {
        keywords: Some(keyword_settings),
        ..Default::default()
    };

    let extraction = extract_file("document.pdf", None::<&str>, &settings).await?;
    println!("Keywords: {:?}", extraction.keywords);

    Ok(())
}
|
||||
23
docs/snippets/rust/config/keyword_extraction_config.md
Normal file
23
docs/snippets/rust/config/keyword_extraction_config.md
Normal file
@@ -0,0 +1,23 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file, ExtractionConfig};
|
||||
use kreuzberg::keywords::{KeywordConfig, KeywordAlgorithm};
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig {
|
||||
keywords: Some(KeywordConfig {
|
||||
algorithm: KeywordAlgorithm::Yake,
|
||||
max_keywords: 10,
|
||||
min_score: 0.1,
|
||||
ngram_range: (1, 3),
|
||||
language: Some("en".to_string()),
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file("document.pdf", None::<&str>, &config).await?;
|
||||
println!("Keywords: {:?}", result.keywords);
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
20
docs/snippets/rust/config/language_detection_config.md
Normal file
20
docs/snippets/rust/config/language_detection_config.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file, ExtractionConfig, LanguageDetectionConfig};
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig {
|
||||
language_detection: Some(LanguageDetectionConfig {
|
||||
enabled: true,
|
||||
min_confidence: 0.8,
|
||||
detect_multiple: true,
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file("document.pdf", None::<&str>, &config).await?;
|
||||
println!("Detected language: {}", result.language);
|
||||
println!("Confidence: {}", result.language_confidence);
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
21
docs/snippets/rust/config/ocr_dpi_config.md
Normal file
21
docs/snippets/rust/config/ocr_dpi_config.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file_sync, ExtractionConfig, ImageExtractionConfig};
|
||||
|
||||
fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig {
|
||||
images: Some(ImageExtractionConfig {
|
||||
extract_images: true,
|
||||
target_dpi: 300,
|
||||
max_image_dimension: 4096,
|
||||
auto_adjust_dpi: true,
|
||||
min_dpi: 150,
|
||||
max_dpi: 600,
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file_sync("document.pdf", None::<&str>, &config)?;
|
||||
println!("Extracted images: {}", result.images.len());
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
20
docs/snippets/rust/config/pdf_config.md
Normal file
20
docs/snippets/rust/config/pdf_config.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file_sync, ExtractionConfig, PdfConfig, HierarchyConfig};
|
||||
|
||||
fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig {
|
||||
pdf_options: Some(PdfConfig {
|
||||
extract_images: true,
|
||||
passwords: Some(vec!["password123".to_string()]),
|
||||
extract_metadata: true,
|
||||
hierarchy: Some(HierarchyConfig::default()),
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file_sync("encrypted.pdf", None, &config)?;
|
||||
println!("Title: {:?}", result.metadata.title);
|
||||
println!("Authors: {:?}", result.metadata.authors);
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
23
docs/snippets/rust/config/pdf_hierarchy_config.md
Normal file
23
docs/snippets/rust/config/pdf_hierarchy_config.md
Normal file
@@ -0,0 +1,23 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file_sync, ExtractionConfig, PdfConfig, HierarchyConfig};
|
||||
|
||||
fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig {
|
||||
pdf_options: Some(PdfConfig {
|
||||
hierarchy: Some(HierarchyConfig {
|
||||
enabled: true,
|
||||
detection_threshold: Some(0.75),
|
||||
ocr_coverage_threshold: Some(0.8),
|
||||
min_level: Some(1),
|
||||
max_level: Some(5),
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file_sync("document.pdf", None::<&str>, &config)?;
|
||||
println!("Hierarchy levels: {}", result.hierarchy.len());
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
22
docs/snippets/rust/config/postprocessor_config.md
Normal file
22
docs/snippets/rust/config/postprocessor_config.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file, ExtractionConfig, PostProcessorConfig};
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig {
|
||||
postprocessor: Some(PostProcessorConfig {
|
||||
enabled: true,
|
||||
enabled_processors: Some(vec![
|
||||
"whitespace_normalizer".to_string(),
|
||||
"unicode_normalizer".to_string(),
|
||||
]),
|
||||
disabled_processors: None,
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file("document.pdf", None::<&str>, &config).await?;
|
||||
println!("Processed content: {}", result.content);
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
16
docs/snippets/rust/config/quality_processing_config.md
Normal file
16
docs/snippets/rust/config/quality_processing_config.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file_sync, ExtractionConfig};
|
||||
|
||||
fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig {
|
||||
enable_quality_processing: true,
|
||||
use_cache: true,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file_sync("document.pdf", None::<&str>, &config)?;
|
||||
println!("Quality score: {}", result.quality_score);
|
||||
println!("Processing time: {:?}", result.processing_time);
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
23
docs/snippets/rust/config/tesseract_config.md
Normal file
23
docs/snippets/rust/config/tesseract_config.md
Normal file
@@ -0,0 +1,23 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file_sync, ExtractionConfig, OcrConfig};
|
||||
use kreuzberg::types::TesseractConfig;
|
||||
|
||||
fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig {
|
||||
ocr: Some(OcrConfig {
|
||||
backend: "tesseract".to_string(),
|
||||
language: "eng+deu".to_string(),
|
||||
tesseract_config: Some(TesseractConfig {
|
||||
psm: Some(6),
|
||||
oem: Some(3),
|
||||
..Default::default()
|
||||
}),
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file_sync("scanned.pdf", None::<&str>, &config)?;
|
||||
println!("OCR text: {}", result.content);
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
19
docs/snippets/rust/config/token_reduction_config.md
Normal file
19
docs/snippets/rust/config/token_reduction_config.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file, ExtractionConfig, TokenReductionConfig};
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig {
|
||||
token_reduction: Some(TokenReductionConfig {
|
||||
mode: "moderate".to_string(),
|
||||
preserve_important_words: true,
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file("document.pdf", None::<&str>, &config).await?;
|
||||
println!("Original tokens: {}", result.token_count);
|
||||
println!("Reduced content: {}", result.content);
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
86
docs/snippets/rust/docker/usage.rs
Normal file
86
docs/snippets/rust/docker/usage.rs
Normal file
@@ -0,0 +1,86 @@
|
||||
```rust title="usage.rs"
|
||||
use std::process::Command;
|
||||
use anyhow::Result;
|
||||
use reqwest::multipart;
|
||||
use std::fs;
|
||||
|
||||
struct DockerKreuzbergClient {
|
||||
container_name: String,
|
||||
api_port: u16,
|
||||
}
|
||||
|
||||
impl DockerKreuzbergClient {
|
||||
fn new(container_name: &str, api_port: u16) -> Self {
|
||||
Self {
|
||||
container_name: container_name.to_string(),
|
||||
api_port,
|
||||
}
|
||||
}
|
||||
|
||||
fn start_container(&self, image: &str) -> Result<()> {
|
||||
println!("Starting Kreuzberg Docker container...");
|
||||
let status = Command::new("docker")
|
||||
.args(&[
|
||||
"run",
|
||||
"-d",
|
||||
"--name",
|
||||
&self.container_name,
|
||||
"-p",
|
||||
&format!("{}:8000", self.api_port),
|
||||
image,
|
||||
])
|
||||
.status()?;
|
||||
|
||||
if !status.success() {
|
||||
anyhow::bail!("Failed to start Docker container");
|
||||
}
|
||||
|
||||
println!("Container started on http://localhost:{}", self.api_port);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn extract_file(&self, file_path: &str) -> Result<String> {
|
||||
let file_bytes = fs::read(file_path)?;
|
||||
let file_part = multipart::Part::bytes(file_bytes)
|
||||
.file_name(file_path.to_string());
|
||||
let form = multipart::Form::new()
|
||||
.part("file", file_part);
|
||||
|
||||
let client = reqwest::Client::new();
|
||||
let response = client
|
||||
.post(&format!("http://localhost:{}/api/extract", self.api_port))
|
||||
.multipart(form)
|
||||
.send()
|
||||
.await?;
|
||||
|
||||
let json: serde_json::Value = response.json().await?;
|
||||
Ok(json["content"].as_str().unwrap_or("").to_string())
|
||||
}
|
||||
|
||||
fn stop_container(&self) -> Result<()> {
|
||||
println!("Stopping Kreuzberg Docker container...");
|
||||
Command::new("docker")
|
||||
.args(&["stop", &self.container_name])
|
||||
.status()?;
|
||||
Command::new("docker")
|
||||
.args(&["rm", &self.container_name])
|
||||
.status()?;
|
||||
println!("Container stopped and removed");
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<()> {
|
||||
let docker_client = DockerKreuzbergClient::new("kreuzberg-api", 8000);
|
||||
|
||||
docker_client.start_container("kreuzberg:latest")?;
|
||||
tokio::time::sleep(tokio::time::Duration::from_secs(2)).await;
|
||||
|
||||
let content = docker_client.extract_file("document.pdf").await?;
|
||||
println!("Extracted content:\n{}", content);
|
||||
|
||||
docker_client.stop_container()?;
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
16
docs/snippets/rust/getting-started/basic_usage.md
Normal file
16
docs/snippets/rust/getting-started/basic_usage.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file_sync, ExtractionConfig};
|
||||
|
||||
fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig {
|
||||
use_cache: true,
|
||||
enable_quality_processing: true,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file_sync("document.pdf", None, &config)?;
|
||||
println!("{}", result.content);
|
||||
println!("MIME Type: {}", result.mime_type);
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
12
docs/snippets/rust/getting-started/extract_file.md
Normal file
12
docs/snippets/rust/getting-started/extract_file.md
Normal file
@@ -0,0 +1,12 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::extract_file_sync;
|
||||
|
||||
fn main() -> kreuzberg::Result<()> {
|
||||
let result = extract_file_sync("document.pdf", None, &Default::default())?;
|
||||
|
||||
println!("Extracted content: {}", result.content);
|
||||
println!("Tables found: {}", result.tables.len());
|
||||
println!("Format: {:?}", result.metadata.as_ref().and_then(|m| m.format.as_ref()));
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
20
docs/snippets/rust/getting-started/extract_with_ocr.md
Normal file
20
docs/snippets/rust/getting-started/extract_with_ocr.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file_sync, ExtractionConfig, OcrConfig};
|
||||
|
||||
fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig {
|
||||
force_ocr: true,
|
||||
ocr: Some(OcrConfig {
|
||||
backend: "tesseract".to_string(),
|
||||
language: "eng".to_string(),
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file_sync("scanned.pdf", None, &config)?;
|
||||
println!("{}", result.content);
|
||||
println!("Detected languages: {:?}", result.detected_languages);
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
9
docs/snippets/rust/getting-started/hello_world.md
Normal file
9
docs/snippets/rust/getting-started/hello_world.md
Normal file
@@ -0,0 +1,9 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::extract_file_sync;
|
||||
|
||||
fn main() -> kreuzberg::Result<()> {
|
||||
let result = extract_file_sync("document.pdf", None, &Default::default())?;
|
||||
println!("{}", result.content);
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
10
docs/snippets/rust/getting-started/install_verify.md
Normal file
10
docs/snippets/rust/getting-started/install_verify.md
Normal file
@@ -0,0 +1,10 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::extract_file_sync;
|
||||
|
||||
fn main() -> kreuzberg::Result<()> {
|
||||
let result = extract_file_sync("document.pdf", None, &Default::default())?;
|
||||
println!("Extraction successful: {}", !result.content.is_empty());
|
||||
println!("Content length: {} characters", result.content.len());
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
14
docs/snippets/rust/getting-started/read_content.md
Normal file
14
docs/snippets/rust/getting-started/read_content.md
Normal file
@@ -0,0 +1,14 @@
|
||||
```rust title="Rust"
|
||||
use std::fs;
|
||||
use kreuzberg::extract_bytes_sync;
|
||||
|
||||
fn main() -> kreuzberg::Result<()> {
|
||||
let data = fs::read("document.pdf")?;
|
||||
let result = extract_bytes_sync(&data, "application/pdf", &Default::default())?;
|
||||
|
||||
println!("{}", result.content);
|
||||
println!("Success: true");
|
||||
println!("Content length: {} characters", result.content.len());
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
37
docs/snippets/rust/llm/structured_extraction.md
Normal file
37
docs/snippets/rust/llm/structured_extraction.md
Normal file
@@ -0,0 +1,37 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{
|
||||
extract_file, ExtractionConfig, LlmConfig, StructuredExtractionConfig,
|
||||
};
|
||||
use serde_json::json;
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig {
|
||||
structured_extraction: Some(StructuredExtractionConfig {
|
||||
schema: json!({
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"title": { "type": "string" },
|
||||
"authors": { "type": "array", "items": { "type": "string" } },
|
||||
"date": { "type": "string" }
|
||||
},
|
||||
"required": ["title", "authors", "date"],
|
||||
"additionalProperties": false
|
||||
}),
|
||||
llm: LlmConfig {
|
||||
model: "openai/gpt-4o-mini".to_string(),
|
||||
..Default::default()
|
||||
},
|
||||
strict: true,
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file("paper.pdf", None, &config).await?;
|
||||
if let Some(structured) = &result.structured_output {
|
||||
println!("{}", structured);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
41
docs/snippets/rust/mcp/mcp_custom_client.md
Normal file
41
docs/snippets/rust/mcp/mcp_custom_client.md
Normal file
@@ -0,0 +1,41 @@
|
||||
```rust title="Rust"
|
||||
use serde_json::json;
|
||||
use std::io::{BufRead, BufReader, Write};
|
||||
use std::process::{Command, Stdio};
|
||||
|
||||
fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
let mut child = Command::new("kreuzberg")
|
||||
.arg("mcp")
|
||||
.stdin(Stdio::piped())
|
||||
.stdout(Stdio::piped())
|
||||
.spawn()?;
|
||||
|
||||
{
|
||||
let stdin = child.stdin.as_mut().ok_or("Failed to open stdin")?;
|
||||
let request = json!({
|
||||
"method": "tools/call",
|
||||
"params": {
|
||||
"name": "extract_file",
|
||||
"arguments": {
|
||||
"path": "document.pdf",
|
||||
"async": true
|
||||
}
|
||||
}
|
||||
});
|
||||
stdin.write_all(request.to_string().as_bytes())?;
|
||||
stdin.write_all(b"\n")?;
|
||||
}
|
||||
|
||||
let stdout = child.stdout.take().ok_or("Failed to open stdout")?;
|
||||
let reader = BufReader::new(stdout);
|
||||
for line in reader.lines() {
|
||||
if let Ok(line) = line {
|
||||
println!("{}", line);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
child.wait()?;
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
10
docs/snippets/rust/mcp/mcp_server_start.md
Normal file
10
docs/snippets/rust/mcp/mcp_server_start.md
Normal file
@@ -0,0 +1,10 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{ExtractionConfig, mcp::start_mcp_server_with_config};
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
|
||||
let config = ExtractionConfig::discover()?;
|
||||
start_mcp_server_with_config(config).await?;
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
18
docs/snippets/rust/metadata/image_extraction.md
Normal file
18
docs/snippets/rust/metadata/image_extraction.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{ExtractionConfig, ImageExtractionConfig};
|
||||
|
||||
fn main() {
|
||||
let config = ExtractionConfig {
|
||||
images: Some(ImageExtractionConfig {
|
||||
extract_images: Some(true),
|
||||
target_dpi: Some(200),
|
||||
max_image_dimension: Some(2048),
|
||||
inject_placeholders: Some(true), // set to false to extract images without markdown references
|
||||
auto_adjust_dpi: Some(true),
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
println!("{:?}", config.images);
|
||||
}
|
||||
```
|
||||
25
docs/snippets/rust/metadata/image_preprocessing.md
Normal file
25
docs/snippets/rust/metadata/image_preprocessing.md
Normal file
@@ -0,0 +1,25 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{ExtractionConfig, ImagePreprocessingConfig, OcrConfig, TesseractConfig};
|
||||
|
||||
fn main() {
|
||||
let config = ExtractionConfig {
|
||||
ocr: Some(OcrConfig {
|
||||
tesseract_config: Some(TesseractConfig {
|
||||
preprocessing: Some(ImagePreprocessingConfig {
|
||||
target_dpi: 300,
|
||||
denoise: true,
|
||||
deskew: true,
|
||||
contrast_enhance: true,
|
||||
binarization_method: "otsu".to_string(),
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
println!("{:?}", config.ocr);
|
||||
}
|
||||
```
|
||||
15
docs/snippets/rust/metadata/language_detection.md
Normal file
15
docs/snippets/rust/metadata/language_detection.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{ExtractionConfig, LanguageDetectionConfig};
|
||||
|
||||
fn main() {
|
||||
let config = ExtractionConfig {
|
||||
language_detection: Some(LanguageDetectionConfig {
|
||||
enabled: true,
|
||||
min_confidence: 0.9,
|
||||
detect_multiple: true,
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
println!("{:?}", config.language_detection);
|
||||
}
|
||||
```
|
||||
@@ -0,0 +1,16 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file, ExtractionConfig, LanguageDetectionConfig};
|
||||
|
||||
// Enable detection of several languages with a 0.8 minimum confidence.
let detection = LanguageDetectionConfig {
    enabled: true,
    min_confidence: 0.8,
    detect_multiple: true,
};

let config = ExtractionConfig {
    language_detection: Some(detection),
    ..Default::default()
};

let result = extract_file("multilingual_document.pdf", None, &config).await?;

println!("Detected languages: {:?}", result.detected_languages);
|
||||
```
|
||||
82
docs/snippets/rust/metadata/metadata.md
Normal file
82
docs/snippets/rust/metadata/metadata.md
Normal file
@@ -0,0 +1,82 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file_sync, ExtractionConfig};
|
||||
|
||||
fn main() -> kreuzberg::Result<()> {
|
||||
let result = extract_file_sync("document.pdf", None, &ExtractionConfig::default())?;
|
||||
|
||||
if let Some(pdf_meta) = result.metadata.pdf {
|
||||
if let Some(pages) = pdf_meta.page_count {
|
||||
println!("Pages: {}", pages);
|
||||
}
|
||||
if let Some(author) = pdf_meta.author {
|
||||
println!("Author: {}", author);
|
||||
}
|
||||
if let Some(title) = pdf_meta.title {
|
||||
println!("Title: {}", title);
|
||||
}
|
||||
}
|
||||
|
||||
let html_result = extract_file_sync("page.html", None, &ExtractionConfig::default())?;
|
||||
if let Some(html_meta) = html_result.metadata.html {
|
||||
if let Some(title) = html_meta.title {
|
||||
println!("Title: {}", title);
|
||||
}
|
||||
if let Some(desc) = html_meta.description {
|
||||
println!("Description: {}", desc);
|
||||
}
|
||||
|
||||
// Access keywords array
|
||||
println!("Keywords: {:?}", html_meta.keywords);
|
||||
|
||||
// Access canonical URL (renamed from canonical)
|
||||
if let Some(canonical) = html_meta.canonical_url {
|
||||
println!("Canonical URL: {}", canonical);
|
||||
}
|
||||
|
||||
// Access Open Graph fields as a map
|
||||
if let Some(og_image) = html_meta.open_graph.get("image") {
|
||||
println!("Open Graph Image: {}", og_image);
|
||||
}
|
||||
if let Some(og_title) = html_meta.open_graph.get("title") {
|
||||
println!("Open Graph Title: {}", og_title);
|
||||
}
|
||||
|
||||
// Access Twitter Card fields as a map
|
||||
if let Some(twitter_card) = html_meta.twitter_card.get("card") {
|
||||
println!("Twitter Card Type: {}", twitter_card);
|
||||
}
|
||||
|
||||
// Access new fields
|
||||
if let Some(lang) = html_meta.language {
|
||||
println!("Language: {}", lang);
|
||||
}
|
||||
|
||||
// Access headers
|
||||
if !html_meta.headers.is_empty() {
|
||||
for header in &html_meta.headers {
|
||||
println!("Header (level {}): {}", header.level, header.text);
|
||||
}
|
||||
}
|
||||
|
||||
// Access links
|
||||
if !html_meta.links.is_empty() {
|
||||
for link in &html_meta.links {
|
||||
println!("Link: {} ({})", link.href, link.text);
|
||||
}
|
||||
}
|
||||
|
||||
// Access images
|
||||
if !html_meta.images.is_empty() {
|
||||
for image in &html_meta.images {
|
||||
println!("Image: {}", image.src);
|
||||
}
|
||||
}
|
||||
|
||||
// Access structured data
|
||||
if !html_meta.structured_data.is_empty() {
|
||||
println!("Structured data items: {}", html_meta.structured_data.len());
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
26
docs/snippets/rust/metadata/page_boundaries.md
Normal file
26
docs/snippets/rust/metadata/page_boundaries.md
Normal file
@@ -0,0 +1,26 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file_sync, ExtractionConfig};
|
||||
|
||||
fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig::default();
|
||||
let result = extract_file_sync("document.pdf", None, &config)?;
|
||||
|
||||
let Some(pages) = &result.metadata.pages else {
|
||||
return Ok(());
|
||||
};
|
||||
let Some(boundaries) = &pages.boundaries else {
|
||||
return Ok(());
|
||||
};
|
||||
|
||||
for boundary in boundaries.iter().take(3) {
|
||||
let page_text = &result.content[boundary.byte_start..boundary.byte_end];
|
||||
let preview_end = 100.min(page_text.len());
|
||||
|
||||
println!("Page {}:", boundary.page_number);
|
||||
println!(" Byte range: {}-{}", boundary.byte_start, boundary.byte_end);
|
||||
println!(" Preview: {}...", &page_text[..preview_end]);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
20
docs/snippets/rust/metadata/page_tracking_basic.md
Normal file
20
docs/snippets/rust/metadata/page_tracking_basic.md
Normal file
@@ -0,0 +1,20 @@
|
||||
Use Kreuzberg::{extract_file_sync, ExtractionConfig, PageConfig};
|
||||
|
||||
Let config = ExtractionConfig {
|
||||
pages: Some(PageConfig {
|
||||
extract_pages: true,
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
Let result = extract_file_sync("document.pdf", &config)?;
|
||||
|
||||
If let Some(pages) = result.pages {
|
||||
for page in pages {
|
||||
println!("Page {}:", page.page_number);
|
||||
println!(" Content: {} chars", page.content.len());
|
||||
println!(" Tables: {}", page.tables.len());
|
||||
println!(" Images: {}", page.images.len());
|
||||
}
|
||||
}
|
||||
79
docs/snippets/rust/metadata/pdf_metadata_extractor.md
Normal file
79
docs/snippets/rust/metadata/pdf_metadata_extractor.md
Normal file
@@ -0,0 +1,79 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::plugins::{Plugin, PostProcessor, ProcessingStage};
|
||||
use kreuzberg::{Result, ExtractionResult, ExtractionConfig};
|
||||
use async_trait::async_trait;
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
|
||||
/// Post-processor plugin that keeps a running count of the results it has handled.
struct PdfMetadataExtractor {
    /// Number of `process` calls so far; atomic because it is updated through `&self`.
    processed_count: AtomicUsize,
}
|
||||
|
||||
impl PdfMetadataExtractor {
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
processed_count: AtomicUsize::new(0),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Plugin for PdfMetadataExtractor {
|
||||
fn name(&self) -> &str { "pdf-metadata-extractor" }
|
||||
fn version(&self) -> String { "1.0.0".to_string() }
|
||||
fn description(&self) -> &str {
|
||||
"Extracts and enriches PDF metadata"
|
||||
}
|
||||
fn initialize(&self) -> Result<()> {
|
||||
log::info!("PDF metadata extractor initialized");
|
||||
Ok(())
|
||||
}
|
||||
fn shutdown(&self) -> Result<()> {
|
||||
let count = self.processed_count.load(Ordering::Acquire);
|
||||
log::info!("Processed {} PDFs", count);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl PostProcessor for PdfMetadataExtractor {
|
||||
async fn process(
|
||||
&self,
|
||||
result: &mut ExtractionResult,
|
||||
_config: &ExtractionConfig,
|
||||
) -> Result<()> {
|
||||
self.processed_count.fetch_add(1, Ordering::AcqRel);
|
||||
|
||||
result.processing_warnings.push(ProcessingWarning {
|
||||
source: "pdf-metadata-extractor".to_string(),
|
||||
message: "PDF metadata extracted successfully".to_string()
|
||||
});
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn processing_stage(&self) -> ProcessingStage {
|
||||
ProcessingStage::Early
|
||||
}
|
||||
|
||||
fn should_process(
|
||||
&self,
|
||||
result: &ExtractionResult,
|
||||
_config: &ExtractionConfig,
|
||||
) -> bool {
|
||||
result.mime_type == "application/pdf"
|
||||
}
|
||||
|
||||
fn estimated_duration_ms(&self, _result: &ExtractionResult) -> u64 {
|
||||
10
|
||||
}
|
||||
}
|
||||
|
||||
use kreuzberg::plugins::registry::get_post_processor_registry;
|
||||
use std::sync::Arc;
|
||||
|
||||
fn register() -> Result<()> {
|
||||
let processor = Arc::new(PdfMetadataExtractor::new());
|
||||
let registry = get_post_processor_registry();
|
||||
registry.register(processor, 50)?;
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
23
docs/snippets/rust/metadata/pdf_only_processor.md
Normal file
23
docs/snippets/rust/metadata/pdf_only_processor.md
Normal file
@@ -0,0 +1,23 @@
|
||||
```rust title="Rust"
|
||||
impl PostProcessor for PdfOnlyProcessor {
|
||||
async fn process(
|
||||
&self,
|
||||
result: &mut ExtractionResult,
|
||||
_config: &ExtractionConfig
|
||||
) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn processing_stage(&self) -> ProcessingStage {
|
||||
ProcessingStage::Middle
|
||||
}
|
||||
|
||||
fn should_process(
|
||||
&self,
|
||||
result: &ExtractionResult,
|
||||
_config: &ExtractionConfig
|
||||
) -> bool {
|
||||
result.mime_type == "application/pdf"
|
||||
}
|
||||
}
|
||||
```
|
||||
17
docs/snippets/rust/metadata/tables.md
Normal file
17
docs/snippets/rust/metadata/tables.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file_sync, ExtractionConfig};
|
||||
|
||||
fn main() -> kreuzberg::Result<()> {
|
||||
let result = extract_file_sync("document.pdf", None, &ExtractionConfig::default())?;
|
||||
|
||||
for table in &result.tables {
|
||||
println!("Table with {} rows", table.cells.len());
|
||||
println!("{}", table.markdown);
|
||||
|
||||
for row in &table.cells {
|
||||
println!("{:?}", row);
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
55
docs/snippets/rust/metadata/vector_database_integration.md
Normal file
55
docs/snippets/rust/metadata/vector_database_integration.md
Normal file
@@ -0,0 +1,55 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file, ExtractionConfig, ChunkingConfig, EmbeddingConfig};
|
||||
|
||||
/// One row destined for a vector database: a chunk of text plus its embedding.
struct VectorRecord {
    /// Record identifier, built as `<document_id>_chunk_<index>`.
    id: String,
    /// Text of the chunk.
    content: String,
    /// Embedding vector of the chunk.
    embedding: Vec<f32>,
    /// String key/value pairs stored next to the vector.
    metadata: std::collections::HashMap<String, String>,
}
|
||||
|
||||
async fn extract_and_vectorize(
|
||||
document_path: &str,
|
||||
document_id: &str,
|
||||
) -> Result<Vec<VectorRecord>, Box<dyn std::error::Error>> {
|
||||
let config = ExtractionConfig {
|
||||
chunking: Some(ChunkingConfig {
|
||||
max_characters: 512,
|
||||
overlap: 50,
|
||||
embedding: Some(EmbeddingConfig {
|
||||
model: kreuzberg::EmbeddingModelType::Preset {
|
||||
name: "balanced".to_string(),
|
||||
},
|
||||
normalize: true,
|
||||
batch_size: 32,
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file(document_path, None, &config).await?;
|
||||
|
||||
let mut records = Vec::new();
|
||||
if let Some(chunks) = result.chunks {
|
||||
for (index, chunk) in chunks.iter().enumerate() {
|
||||
if let Some(embedding) = &chunk.embedding {
|
||||
let mut metadata = std::collections::HashMap::new();
|
||||
metadata.insert("document_id".to_string(), document_id.to_string());
|
||||
metadata.insert("chunk_index".to_string(), index.to_string());
|
||||
metadata.insert("content_length".to_string(), chunk.content.len().to_string());
|
||||
|
||||
records.push(VectorRecord {
|
||||
id: format!("{}_chunk_{}", document_id, index),
|
||||
content: chunk.content.clone(),
|
||||
embedding: embedding.clone(),
|
||||
metadata,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(records)
|
||||
}
|
||||
```
|
||||
61
docs/snippets/rust/ocr/cloud_ocr_backend.md
Normal file
61
docs/snippets/rust/ocr/cloud_ocr_backend.md
Normal file
@@ -0,0 +1,61 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::plugins::{Plugin, OcrBackend, OcrBackendType};
|
||||
use kreuzberg::{Result, ExtractionResult, OcrConfig, Metadata};
|
||||
use async_trait::async_trait;
|
||||
use std::path::Path;
|
||||
|
||||
/// Custom OCR backend that delegates recognition to a cloud service.
struct CloudOcrBackend {
    /// Credential for the cloud service.
    api_key: String,
    /// Language codes this backend reports as supported.
    supported_langs: Vec<String>,
}
|
||||
|
||||
impl Plugin for CloudOcrBackend {
    /// Identifier of this plugin.
    fn name(&self) -> &str {
        "cloud-ocr"
    }

    /// Version string of this plugin.
    fn version(&self) -> String {
        String::from("1.0.0")
    }

    /// No setup is needed.
    fn initialize(&self) -> Result<()> {
        Ok(())
    }

    /// No teardown is needed.
    fn shutdown(&self) -> Result<()> {
        Ok(())
    }
}
|
||||
|
||||
#[async_trait]
|
||||
impl OcrBackend for CloudOcrBackend {
|
||||
async fn process_image(
|
||||
&self,
|
||||
image_bytes: &[u8],
|
||||
config: &OcrConfig,
|
||||
) -> Result<ExtractionResult> {
|
||||
let text = self.call_cloud_api(image_bytes, &config.language).await?;
|
||||
|
||||
Ok(ExtractionResult {
|
||||
content: text,
|
||||
mime_type: "text/plain".to_string(),
|
||||
metadata: Metadata::default(),
|
||||
tables: vec![],
|
||||
detected_languages: None,
|
||||
chunks: None,
|
||||
images: None,
|
||||
})
|
||||
}
|
||||
|
||||
fn supports_language(&self, lang: &str) -> bool {
|
||||
self.supported_langs.iter().any(|l| l == lang)
|
||||
}
|
||||
|
||||
fn backend_type(&self) -> OcrBackendType {
|
||||
OcrBackendType::Custom
|
||||
}
|
||||
|
||||
fn supported_languages(&self) -> Vec<String> {
|
||||
self.supported_langs.clone()
|
||||
}
|
||||
}
|
||||
|
||||
impl CloudOcrBackend {
|
||||
async fn call_cloud_api(
|
||||
&self,
|
||||
image: &[u8],
|
||||
language: &str
|
||||
) -> Result<String> {
|
||||
Ok("Extracted text".to_string())
|
||||
}
|
||||
}
|
||||
```
|
||||
18
docs/snippets/rust/ocr/config_ocr.md
Normal file
18
docs/snippets/rust/ocr/config_ocr.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{ExtractionConfig, OcrConfig, TesseractConfig};
|
||||
|
||||
fn main() {
|
||||
let config = ExtractionConfig {
|
||||
ocr: Some(OcrConfig {
|
||||
backend: "tesseract".to_string(),
|
||||
language: "eng+fra".to_string(),
|
||||
tesseract_config: Some(TesseractConfig {
|
||||
psm: 3,
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
}
|
||||
```
|
||||
21
docs/snippets/rust/ocr/image_extraction.md
Normal file
21
docs/snippets/rust/ocr/image_extraction.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file_sync, ExtractionConfig, ImageExtractionConfig};
|
||||
|
||||
fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig {
|
||||
images: Some(ImageExtractionConfig {
|
||||
extract_images: true,
|
||||
target_dpi: 200,
|
||||
max_image_dimension: 2048,
|
||||
inject_placeholders: true, // set to false to extract images without markdown references
|
||||
auto_adjust_dpi: true,
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file_sync("document.pdf", None, &config)?;
|
||||
println!("content length: {}", result.content.len());
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
33
docs/snippets/rust/ocr/image_preprocessing.md
Normal file
33
docs/snippets/rust/ocr/image_preprocessing.md
Normal file
@@ -0,0 +1,33 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{
|
||||
extract_file_sync, ExtractionConfig, ImagePreprocessingConfig, OcrConfig, TesseractConfig,
|
||||
};
|
||||
|
||||
fn main() -> kreuzberg::Result<()> {
|
||||
let preprocessing = ImagePreprocessingConfig {
|
||||
target_dpi: 300,
|
||||
denoise: true,
|
||||
deskew: true,
|
||||
contrast_enhance: true,
|
||||
binarization_method: "otsu".to_string(),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let config = ExtractionConfig {
|
||||
ocr: Some(OcrConfig {
|
||||
backend: "tesseract".to_string(),
|
||||
language: "eng".to_string(),
|
||||
tesseract_config: Some(TesseractConfig {
|
||||
preprocessing: Some(preprocessing),
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file_sync("document.pdf", None, &config)?;
|
||||
println!("content length: {}", result.content.len());
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
20
docs/snippets/rust/ocr/ocr_dpi_config.md
Normal file
20
docs/snippets/rust/ocr/ocr_dpi_config.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file_sync, ExtractionConfig, OcrConfig, PdfConfig};
|
||||
|
||||
fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig {
|
||||
ocr: Some(OcrConfig {
|
||||
backend: "tesseract".to_string(),
|
||||
..Default::default()
|
||||
}),
|
||||
pdf_options: Some(PdfConfig {
|
||||
dpi: Some(300),
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file_sync("scanned.pdf", None, &config)?;
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
19
docs/snippets/rust/ocr/ocr_easyocr.md
Normal file
19
docs/snippets/rust/ocr/ocr_easyocr.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file, ExtractionConfig, OcrConfig};
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig {
|
||||
ocr: Some(OcrConfig {
|
||||
backend: "easyocr".to_string(),
|
||||
language: "en".to_string(),
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file("document.pdf", None, &config).await?;
|
||||
println!("Extracted text: {}", result.content);
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
35
docs/snippets/rust/ocr/ocr_elements.md
Normal file
35
docs/snippets/rust/ocr/ocr_elements.md
Normal file
@@ -0,0 +1,35 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file, ExtractionConfig, OcrConfig};
|
||||
use kreuzberg::types::OcrElementConfig;
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig {
|
||||
ocr: Some(OcrConfig {
|
||||
backend: "paddleocr".to_string(),
|
||||
language: "en".to_string(),
|
||||
element_config: Some(OcrElementConfig {
|
||||
include_elements: true,
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file("scanned.pdf", None, &config).await?;
|
||||
|
||||
if let Some(elements) = &result.ocr_elements {
|
||||
for element in elements {
|
||||
println!("Text: {}", element.text);
|
||||
println!("Confidence: {:.2}", element.confidence.recognition);
|
||||
println!("Geometry: {:?}", element.geometry);
|
||||
if let Some(rotation) = &element.rotation {
|
||||
println!("Rotation: {}°", rotation.angle_degrees);
|
||||
}
|
||||
println!();
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
18
docs/snippets/rust/ocr/ocr_extraction.md
Normal file
18
docs/snippets/rust/ocr/ocr_extraction.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file_sync, ExtractionConfig, OcrConfig};
|
||||
|
||||
fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig {
|
||||
ocr: Some(OcrConfig {
|
||||
backend: "tesseract".to_string(),
|
||||
language: "eng".to_string(),
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file_sync("scanned.pdf", None, &config)?;
|
||||
println!("{}", result.content);
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
18
docs/snippets/rust/ocr/ocr_force_all_pages.md
Normal file
18
docs/snippets/rust/ocr/ocr_force_all_pages.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file_sync, ExtractionConfig, OcrConfig};
|
||||
|
||||
fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig {
|
||||
ocr: Some(OcrConfig {
|
||||
backend: "tesseract".to_string(),
|
||||
..Default::default()
|
||||
}),
|
||||
force_ocr: true,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file_sync("document.pdf", None, &config)?;
|
||||
println!("{}", result.content);
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
18
docs/snippets/rust/ocr/ocr_multi_language.md
Normal file
18
docs/snippets/rust/ocr/ocr_multi_language.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file_sync, ExtractionConfig, OcrConfig};
|
||||
|
||||
fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig {
|
||||
ocr: Some(OcrConfig {
|
||||
backend: "tesseract".to_string(),
|
||||
language: "eng+deu+fra".to_string(),
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file_sync("multilingual.pdf", None, &config)?;
|
||||
println!("{}", result.content);
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
20
docs/snippets/rust/ocr/ocr_paddleocr.md
Normal file
20
docs/snippets/rust/ocr/ocr_paddleocr.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file, ExtractionConfig, OcrConfig};
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig {
|
||||
ocr: Some(OcrConfig {
|
||||
backend: "paddleocr".to_string(),
|
||||
language: "en".to_string(),
|
||||
// paddle_ocr_config: Some(serde_json::json!({"model_tier": "server"})), // for max accuracy
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file("document.pdf", None, &config).await?;
|
||||
println!("Extracted text: {}", result.content);
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
15
docs/snippets/rust/ocr/pdf_config.md
Normal file
15
docs/snippets/rust/ocr/pdf_config.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{ExtractionConfig, PdfConfig};
|
||||
|
||||
fn main() {
|
||||
let config = ExtractionConfig {
|
||||
pdf_options: Some(PdfConfig {
|
||||
extract_images: Some(true),
|
||||
extract_metadata: Some(true),
|
||||
passwords: Some(vec!["password1".to_string(), "password2".to_string()]),
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
println!("{:?}", config.pdf_options);
|
||||
}
|
||||
```
|
||||
22
docs/snippets/rust/ocr/tesseract_config.md
Normal file
22
docs/snippets/rust/ocr/tesseract_config.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{ExtractionConfig, OcrConfig, TesseractConfig};
|
||||
|
||||
fn main() {
|
||||
let config = ExtractionConfig {
|
||||
ocr: Some(OcrConfig {
|
||||
language: "eng+fra+deu".to_string(),
|
||||
tesseract_config: Some(TesseractConfig {
|
||||
psm: 6,
|
||||
oem: 1,
|
||||
min_confidence: 0.8,
|
||||
tessedit_char_whitelist: "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789 .,!?".to_string(),
|
||||
enable_table_detection: true,
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
println!("{:?}", config.ocr);
|
||||
}
|
||||
```
|
||||
12
docs/snippets/rust/plugins/clear_plugins.md
Normal file
12
docs/snippets/rust/plugins/clear_plugins.md
Normal file
@@ -0,0 +1,12 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{clear_document_extractors, clear_post_processors, clear_ocr_backends, clear_validators};
|
||||
|
||||
fn main() {
|
||||
clear_document_extractors();
|
||||
clear_post_processors();
|
||||
clear_ocr_backends();
|
||||
clear_validators();
|
||||
|
||||
println!("All plugins cleared");
|
||||
}
|
||||
```
|
||||
47
docs/snippets/rust/plugins/embedding_backend.md
Normal file
47
docs/snippets/rust/plugins/embedding_backend.md
Normal file
@@ -0,0 +1,47 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::plugins::{EmbeddingBackend, Plugin, register_embedding_backend};
|
||||
use kreuzberg::{EmbeddingConfig, EmbeddingModelType, Result, embed_texts};
|
||||
use async_trait::async_trait;
|
||||
use std::sync::Arc;
|
||||
|
||||
/// Adapter around an embedder that is already loaded (e.g. a tuned ONNX session
/// or any host-language embedder), so kreuzberg can call back into it during
/// chunking and standalone embed requests.
struct MyEmbedder {
    // Whatever model handles the host already owns would live here.
}
|
||||
|
||||
/// Plugin identity and lifecycle hooks for [`MyEmbedder`].
impl Plugin for MyEmbedder {
    /// Name under which this plugin is known.
    fn name(&self) -> &str {
        "my-embedder"
    }

    /// Version string reported by this plugin.
    fn version(&self) -> String {
        String::from("1.0.0")
    }

    /// Nothing to set up; always succeeds.
    fn initialize(&self) -> Result<()> {
        Ok(())
    }

    /// Nothing to tear down; always succeeds.
    fn shutdown(&self) -> Result<()> {
        Ok(())
    }
}
|
||||
|
||||
#[async_trait]
|
||||
impl EmbeddingBackend for MyEmbedder {
|
||||
// Captured once at registration; used for shape validation on every dispatch.
|
||||
fn dimensions(&self) -> usize { 768 }
|
||||
|
||||
async fn embed(&self, texts: Vec<String>) -> Result<Vec<Vec<f32>>> {
|
||||
// Delegate to the already-loaded host model.
|
||||
Ok(texts.iter().map(|_| vec![0.0; 768]).collect())
|
||||
}
|
||||
}
|
||||
|
||||
fn main() -> Result<()> {
|
||||
// Register once at startup.
|
||||
register_embedding_backend(Arc::new(MyEmbedder {}))?;
|
||||
|
||||
let config = EmbeddingConfig {
|
||||
model: EmbeddingModelType::Plugin { name: "my-embedder".to_string() },
|
||||
// Optional: bound the wait on a hung backend (default 60s; `None` disables).
|
||||
max_embed_duration_secs: Some(30),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let vectors = embed_texts(&["Hello, world!", "Second text"], &config)?;
|
||||
assert_eq!(vectors.len(), 2);
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
11
docs/snippets/rust/plugins/extractor_registration.md
Normal file
11
docs/snippets/rust/plugins/extractor_registration.md
Normal file
@@ -0,0 +1,11 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::plugins::registry::get_document_extractor_registry;
|
||||
use std::sync::Arc;
|
||||
|
||||
fn register_custom_extractor() -> kreuzberg::Result<()> {
|
||||
let extractor = Arc::new(CustomJsonExtractor);
|
||||
let registry = get_document_extractor_registry();
|
||||
registry.write().unwrap().register(extractor)?;
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
19
docs/snippets/rust/plugins/list_plugins.md
Normal file
19
docs/snippets/rust/plugins/list_plugins.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::plugins::registry::*;
|
||||
|
||||
// Document extractors.
let extractor_registry = get_document_extractor_registry();
let extractors = extractor_registry.list()?;
println!("Registered extractors: {:?}", extractors);

// Post-processors.
let post_processor_registry = get_post_processor_registry();
let processors = post_processor_registry.list()?;
println!("Registered processors: {:?}", processors);

// OCR backends.
let ocr_backend_registry = get_ocr_backend_registry();
let backends = ocr_backend_registry.list()?;
println!("Registered OCR backends: {:?}", backends);

// Validators.
let validator_registry = get_validator_registry();
let validators = validator_registry.list()?;
println!("Registered validators: {:?}", validators);
|
||||
```
|
||||
38
docs/snippets/rust/plugins/min_length_validator.md
Normal file
38
docs/snippets/rust/plugins/min_length_validator.md
Normal file
@@ -0,0 +1,38 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::plugins::{Plugin, Validator};
|
||||
use kreuzberg::{Result, ExtractionResult, ExtractionConfig, KreuzbergError};
|
||||
use async_trait::async_trait;
|
||||
|
||||
/// Validator that rejects extraction results whose content is too short.
struct MinLengthValidator {
    /// Minimum content length a result must reach to pass validation.
    min_length: usize,
}
|
||||
|
||||
/// Plugin identity and lifecycle hooks for [`MinLengthValidator`].
impl Plugin for MinLengthValidator {
    /// Name under which this plugin is known.
    fn name(&self) -> &str {
        "min-length-validator"
    }

    /// Version string reported by this plugin.
    fn version(&self) -> String {
        String::from("1.0.0")
    }

    /// Nothing to set up; always succeeds.
    fn initialize(&self) -> Result<()> {
        Ok(())
    }

    /// Nothing to tear down; always succeeds.
    fn shutdown(&self) -> Result<()> {
        Ok(())
    }
}
|
||||
|
||||
#[async_trait]
|
||||
impl Validator for MinLengthValidator {
|
||||
async fn validate(
|
||||
&self,
|
||||
result: &ExtractionResult,
|
||||
_config: &ExtractionConfig,
|
||||
) -> Result<()> {
|
||||
if result.content.len() < self.min_length {
|
||||
return Err(KreuzbergError::validation(format!(
|
||||
"Content too short: {} < {} characters",
|
||||
result.content.len(),
|
||||
self.min_length
|
||||
)));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn priority(&self) -> i32 {
|
||||
100
|
||||
}
|
||||
}
|
||||
```
|
||||
69
docs/snippets/rust/plugins/pdf_metadata_extractor.md
Normal file
69
docs/snippets/rust/plugins/pdf_metadata_extractor.md
Normal file
@@ -0,0 +1,69 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::plugins::{Plugin, PostProcessor, ProcessingStage, register_post_processor};
|
||||
use kreuzberg::{Result, ExtractionResult, ExtractionConfig};
|
||||
use async_trait::async_trait;
|
||||
use std::sync::Arc;
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
use serde_json::json;
|
||||
|
||||
/// Post-processor that annotates PDF results with extra metadata entries.
struct PdfMetadataExtractor {
    /// Counter incremented once per processed PDF result; reset to zero by `initialize`.
    processed_count: AtomicUsize,
}
|
||||
|
||||
/// Plugin identity and lifecycle hooks for [`PdfMetadataExtractor`].
impl Plugin for PdfMetadataExtractor {
    /// Name under which this plugin is known.
    fn name(&self) -> &str {
        "pdf-metadata-extractor"
    }

    /// Version string reported by this plugin.
    fn version(&self) -> String {
        String::from("1.0.0")
    }

    /// Resets the processed-document counter to zero.
    fn initialize(&self) -> Result<()> {
        self.processed_count.store(0, Ordering::Release);
        Ok(())
    }

    /// Nothing to tear down; always succeeds.
    fn shutdown(&self) -> Result<()> {
        Ok(())
    }
}
|
||||
|
||||
#[async_trait]
|
||||
impl PostProcessor for PdfMetadataExtractor {
|
||||
async fn process(
|
||||
&self,
|
||||
result: &mut ExtractionResult,
|
||||
_config: &ExtractionConfig,
|
||||
) -> Result<()> {
|
||||
if result.mime_type != "application/pdf" {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let order = self.processed_count.fetch_add(1, Ordering::AcqRel) + 1;
|
||||
|
||||
result.metadata.additional.insert("pdf_processed".to_string(), json!(true));
|
||||
result.metadata.additional.insert("pdf_order".to_string(), json!(order));
|
||||
result.metadata.additional.insert(
|
||||
"content_length".to_string(),
|
||||
json!(result.content.len()),
|
||||
);
|
||||
result.metadata.additional.insert(
|
||||
"pdf_processor_version".to_string(),
|
||||
json!("1.0.0"),
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn processing_stage(&self) -> ProcessingStage {
|
||||
ProcessingStage::Early
|
||||
}
|
||||
|
||||
fn should_process(
|
||||
&self,
|
||||
result: &ExtractionResult,
|
||||
_config: &ExtractionConfig,
|
||||
) -> bool {
|
||||
result.mime_type == "application/pdf"
|
||||
}
|
||||
}
|
||||
|
||||
fn main() -> Result<()> {
|
||||
register_post_processor(Arc::new(PdfMetadataExtractor {
|
||||
processed_count: AtomicUsize::new(0),
|
||||
}))?;
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
50
docs/snippets/rust/plugins/pdf_only_processor.md
Normal file
50
docs/snippets/rust/plugins/pdf_only_processor.md
Normal file
@@ -0,0 +1,50 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::plugins::{Plugin, PostProcessor, ProcessingStage, register_post_processor};
|
||||
use kreuzberg::{Result, ExtractionResult, ExtractionConfig};
|
||||
use async_trait::async_trait;
|
||||
use std::sync::Arc;
|
||||
use serde_json::json;
|
||||
|
||||
struct PdfOnlyProcessor;
|
||||
|
||||
impl Plugin for PdfOnlyProcessor {
|
||||
fn name(&self) -> &str { "pdf-only-processor" }
|
||||
fn version(&self) -> String { "1.0.0".to_string() }
|
||||
fn initialize(&self) -> Result<()> { Ok(()) }
|
||||
fn shutdown(&self) -> Result<()> { Ok(()) }
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl PostProcessor for PdfOnlyProcessor {
|
||||
async fn process(
|
||||
&self,
|
||||
result: &mut ExtractionResult,
|
||||
_config: &ExtractionConfig,
|
||||
) -> Result<()> {
|
||||
if result.mime_type != "application/pdf" {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
result.metadata.additional.insert("pdf_processed".to_string(), json!(true));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn processing_stage(&self) -> ProcessingStage {
|
||||
ProcessingStage::Early
|
||||
}
|
||||
|
||||
fn should_process(
|
||||
&self,
|
||||
result: &ExtractionResult,
|
||||
_config: &ExtractionConfig,
|
||||
) -> bool {
|
||||
result.mime_type == "application/pdf"
|
||||
}
|
||||
}
|
||||
|
||||
fn main() -> Result<()> {
|
||||
register_post_processor(Arc::new(PdfOnlyProcessor))?;
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
53
docs/snippets/rust/plugins/plugin_extractor.md
Normal file
53
docs/snippets/rust/plugins/plugin_extractor.md
Normal file
@@ -0,0 +1,53 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::plugins::{DocumentExtractor, Plugin};
|
||||
use kreuzberg::{Result, ExtractionResult, ExtractionConfig, Metadata};
|
||||
use async_trait::async_trait;
|
||||
use std::path::Path;
|
||||
|
||||
struct CustomJsonExtractor;
|
||||
|
||||
impl Plugin for CustomJsonExtractor {
|
||||
fn name(&self) -> &str { "custom-json-extractor" }
|
||||
fn version(&self) -> String { "1.0.0".to_string() }
|
||||
fn initialize(&self) -> Result<()> { Ok(()) }
|
||||
fn shutdown(&self) -> Result<()> { Ok(()) }
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl DocumentExtractor for CustomJsonExtractor {
|
||||
async fn extract_bytes(
|
||||
&self,
|
||||
content: &[u8],
|
||||
_mime_type: &str,
|
||||
_config: &ExtractionConfig,
|
||||
) -> Result<ExtractionResult> {
|
||||
let json: serde_json::Value = serde_json::from_slice(content)?;
|
||||
let text = extract_text_from_json(&json);
|
||||
|
||||
Ok(ExtractionResult {
|
||||
content: text,
|
||||
mime_type: "application/json".to_string(),
|
||||
metadata: Metadata::default(),
|
||||
tables: vec![],
|
||||
detected_languages: None,
|
||||
chunks: None,
|
||||
images: None,
|
||||
})
|
||||
}
|
||||
|
||||
fn supported_mime_types(&self) -> &[&str] {
|
||||
&["application/json", "text/json"]
|
||||
}
|
||||
|
||||
fn priority(&self) -> i32 { 50 }
|
||||
}
|
||||
|
||||
/// Recursively collects every string found in `value`.
///
/// Each string value contributes its text followed by a newline. Arrays and
/// objects contribute the concatenation of their elements / member values in
/// iteration order. Every other JSON value contributes nothing.
fn extract_text_from_json(value: &serde_json::Value) -> String {
    let mut text = String::new();

    match value {
        serde_json::Value::String(s) => {
            text.push_str(s);
            text.push('\n');
        }
        serde_json::Value::Array(items) => {
            for item in items {
                text.push_str(&extract_text_from_json(item));
            }
        }
        serde_json::Value::Object(members) => {
            for member in members.values() {
                text.push_str(&extract_text_from_json(member));
            }
        }
        // Numbers, booleans and null carry no text.
        _ => {}
    }

    text
}
|
||||
```
|
||||
35
docs/snippets/rust/plugins/plugin_logging.md
Normal file
35
docs/snippets/rust/plugins/plugin_logging.md
Normal file
@@ -0,0 +1,35 @@
|
||||
```rust title="Rust"
|
||||
use log::{info, warn, error};
|
||||
|
||||
impl Plugin for MyPlugin {
|
||||
fn initialize(&self) -> Result<()> {
|
||||
info!("Initializing plugin: {}", self.name());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn shutdown(&self) -> Result<()> {
|
||||
info!("Shutting down plugin: {}", self.name());
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl DocumentExtractor for MyPlugin {
|
||||
async fn extract_bytes(
|
||||
&self,
|
||||
content: &[u8],
|
||||
mime_type: &str,
|
||||
_config: &ExtractionConfig,
|
||||
) -> Result<ExtractionResult> {
|
||||
info!("Extracting {} ({} bytes)", mime_type, content.len());
|
||||
|
||||
let result = ExtractionResult::default();
|
||||
|
||||
if result.content.is_empty() {
|
||||
warn!("Extraction resulted in empty content");
|
||||
}
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
}
|
||||
```
|
||||
22
docs/snippets/rust/plugins/plugin_testing.md
Normal file
22
docs/snippets/rust/plugins/plugin_testing.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```rust title="Rust"
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Feeds a one-field JSON document through `CustomJsonExtractor` and checks
    /// both the extracted text and the reported MIME type.
    #[tokio::test]
    async fn test_custom_extractor() {
        let payload = br#"{"message": "Hello, world!"}"#;
        let settings = ExtractionConfig::default();
        let json_extractor = CustomJsonExtractor;

        let extracted = json_extractor
            .extract_bytes(payload, "application/json", &settings)
            .await
            .expect("Extraction failed");

        assert_eq!(extracted.mime_type, "application/json");
        assert!(extracted.content.contains("Hello, world!"));
    }
}
|
||||
```
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user