Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,46 @@
```rust title="Rust"
use kreuzberg::{
extract_file_sync, ChunkingConfig, ExtractionConfig, LanguageDetectionConfig, OcrConfig,
};
/// Extracts `document.pdf` with OCR, chunking and language detection configured,
/// then prints a preview of every chunk and the detected languages.
fn main() -> kreuzberg::Result<()> {
    // Build each feature section separately so the top-level config stays readable.
    let ocr = OcrConfig {
        backend: "tesseract".to_string(),
        language: "eng+deu".to_string(),
        ..Default::default()
    };
    let chunking = ChunkingConfig {
        max_characters: 1000,
        overlap: 100,
        ..Default::default()
    };
    let language_detection = LanguageDetectionConfig {
        enabled: true,
        detect_multiple: true,
        ..Default::default()
    };
    let config = ExtractionConfig {
        ocr: Some(ocr),
        chunking: Some(chunking),
        language_detection: Some(language_detection),
        use_cache: true,
        enable_quality_processing: true,
        ..Default::default()
    };

    let result = extract_file_sync("document.pdf", None, &config)?;

    // `chunks` is optional; flattening yields nothing when it is absent.
    for chunk in result.chunks.into_iter().flatten() {
        // Take characters, not bytes, so the preview never splits a code point.
        let preview: String = chunk.content.chars().take(100).collect();
        println!("Chunk: {}...", preview);
    }

    if let Some(languages) = result.detected_languages {
        println!("Languages: {:?}", languages);
    }

    Ok(())
}
```

View File

@@ -0,0 +1,29 @@
Use Kreuzberg::{extract_file_sync, ExtractionConfig, ChunkingConfig, PageConfig};
Let config = ExtractionConfig {
chunking: Some(ChunkingConfig {
max_characters: 500,
overlap: 50,
..Default::default()
}),
pages: Some(PageConfig {
extract_pages: true,
..Default::default()
}),
..Default::default()
};
Let result = extract_file_sync("document.pdf", None, &config)?;
If let Some(chunks) = result.chunks {
for chunk in chunks {
if let (Some(first), Some(last)) = (chunk.metadata.first_page, chunk.metadata.last_page) {
let page_range = if first == last {
format!("Page {}", first)
} else {
format!("Pages {}-{}", first, last)
};
println!("Chunk: {}... ({})", chunk.content.chars().take(50).collect::<String>(), page_range);
}
}
}

View File

@@ -0,0 +1,21 @@
```rust title="Rust"
use kreuzberg::{ChunkingConfig, EmbeddingConfig, EmbeddingModelType, ExtractionConfig};
/// Builds a chunking configuration with a preset embedding model and prints it.
fn main() {
    let embedding = EmbeddingConfig {
        model: EmbeddingModelType::Preset {
            name: String::from("text-embedding-all-minilm-l6-v2"),
        },
        ..Default::default()
    };
    let chunking = ChunkingConfig {
        max_characters: 1500,
        overlap: 200,
        embedding: Some(embedding),
        ..Default::default()
    };
    let config = ExtractionConfig {
        chunking: Some(chunking),
        ..Default::default()
    };

    println!("{:?}", config.chunking);
}
```

View File

@@ -0,0 +1,39 @@
```rust title="Rust"
// Basic chunking: 1000-character chunks with 200 characters of overlap, no embeddings.
use kreuzberg::{ExtractionConfig, ChunkingConfig};

let config = ExtractionConfig {
    chunking: Some(ChunkingConfig {
        max_characters: 1000,
        overlap: 200,
        embedding: None,
        // The struct has further fields (e.g. `chunker_type`); keep their defaults.
        ..Default::default()
    }),
    ..Default::default()
};
```
```rust title="Rust - Semantic"
// Semantic chunking: only the chunker type is overridden, everything else is default.
use kreuzberg::{ChunkerType, ChunkingConfig, ExtractionConfig};

let config = ExtractionConfig {
    chunking: Some(ChunkingConfig {
        chunker_type: ChunkerType::Semantic,
        ..Default::default()
    }),
    ..Default::default()
};
```
```rust title="Rust - Prepend Heading Context"
// Markdown chunking with `prepend_heading_context` switched on.
use kreuzberg::{ChunkerType, ChunkingConfig, ExtractionConfig};

let config = ExtractionConfig {
    chunking: Some(ChunkingConfig {
        max_characters: 500,
        overlap: 50,
        chunker_type: ChunkerType::Markdown,
        prepend_heading_context: true,
        ..Default::default()
    }),
    ..Default::default()
};
```

View File

@@ -0,0 +1,36 @@
```rust title="Rust"
use kreuzberg::{
    extract_file, ChunkingConfig, EmbeddingConfig, EmbeddingModelType, ExtractionConfig,
};

// Chunk the document and request an embedding for every chunk.
let config = ExtractionConfig {
    chunking: Some(ChunkingConfig {
        max_characters: 500,
        overlap: 50,
        embedding: Some(EmbeddingConfig {
            // `model` takes an `EmbeddingModelType`, not a bare string.
            model: EmbeddingModelType::Preset {
                name: "balanced".to_string(),
            },
            normalize: true,
            ..Default::default()
        }),
        ..Default::default()
    }),
    ..Default::default()
};

let result = extract_file("research_paper.pdf", None, &config).await?;

if let Some(chunks) = result.chunks {
    for chunk in chunks {
        println!(
            "Chunk {}/{}",
            chunk.metadata.chunk_index + 1,
            chunk.metadata.total_chunks
        );
        println!(
            "Position: {}-{}",
            chunk.metadata.byte_start,
            chunk.metadata.byte_end
        );
        // Preview by characters: slicing at a fixed byte offset panics when the
        // offset falls inside a multi-byte UTF-8 character.
        let preview: String = chunk.content.chars().take(100).collect();
        println!("Content: {}...", preview);
        if let Some(embedding) = chunk.embedding {
            println!("Embedding: {} dimensions", embedding.len());
        }
    }
}
```

View File

@@ -0,0 +1,23 @@
```rust title="Rust"
use kreuzberg::{ChunkingConfig, EmbeddingConfig, EmbeddingModelType, ExtractionConfig};
/// Builds a chunking configuration with explicit embedding options and prints it.
fn main() {
    let embedding = EmbeddingConfig {
        model: EmbeddingModelType::Preset {
            name: String::from("all-mpnet-base-v2"),
        },
        batch_size: 16,
        normalize: true,
        show_download_progress: true,
        ..Default::default()
    };
    let chunking = ChunkingConfig {
        max_characters: 1000,
        embedding: Some(embedding),
        ..Default::default()
    };
    let config = ExtractionConfig {
        chunking: Some(chunking),
        ..Default::default()
    };

    println!("{:?}", config.chunking);
}
```

View File

@@ -0,0 +1,19 @@
```rust title="Rust"
use kreuzberg::{ChunkingConfig, EmbeddingConfig, EmbeddingModelType, ExtractionConfig};

// Chunking with embeddings generated by the "balanced" preset.
let config = ExtractionConfig {
    chunking: Some(ChunkingConfig {
        max_characters: 1024,
        overlap: 100,
        embedding: Some(EmbeddingConfig {
            // `model` takes an `EmbeddingModelType`, not a bare string.
            model: EmbeddingModelType::Preset {
                name: "balanced".to_string(),
            },
            normalize: true,
            batch_size: 32,
            show_download_progress: false,
            ..Default::default()
        }),
        ..Default::default()
    }),
    ..Default::default()
};
```

View File

@@ -0,0 +1,15 @@
```rust title="Rust"
// Keyword extraction settings: YAKE, at most 10 keywords, 1- to 3-word n-grams.
use kreuzberg::{ExtractionConfig, KeywordAlgorithm, KeywordConfig};

let config = ExtractionConfig {
    keywords: Some(KeywordConfig {
        algorithm: KeywordAlgorithm::Yake,
        max_keywords: 10,
        min_score: 0.3,
        ngram_range: (1, 3),
        language: Some(String::from("en")),
        ..Default::default()
    }),
    ..Default::default()
};
```

View File

@@ -0,0 +1,19 @@
```rust title="Rust"
// Extract a document with keyword extraction configured and print the keywords, if any.
use kreuzberg::{extract_file, ExtractionConfig, KeywordAlgorithm, KeywordConfig};

let keywords = KeywordConfig {
    algorithm: KeywordAlgorithm::Yake,
    max_keywords: 10,
    min_score: 0.3,
    ..Default::default()
};
let config = ExtractionConfig {
    keywords: Some(keywords),
    ..Default::default()
};

let result = extract_file("research_paper.pdf", None, &config).await?;

if let Some(keywords) = &result.extracted_keywords {
    println!("Keywords: {:?}", keywords);
}
```

View File

@@ -0,0 +1,15 @@
```rust title="Rust"
use kreuzberg::{ExtractionConfig, LanguageDetectionConfig};
/// Builds a language-detection configuration and prints it.
fn main() {
    let config = ExtractionConfig {
        language_detection: Some(LanguageDetectionConfig {
            // Plain values, not `Option`s.
            // NOTE(review): confirm against the crate's `LanguageDetectionConfig` definition.
            enabled: true,
            min_confidence: 0.9,
            detect_multiple: true,
        }),
        ..Default::default()
    };
    println!("{:?}", config.language_detection);
}
```

View File

@@ -0,0 +1,12 @@
```rust title="Rust"
// Language detection for a single language with a 0.8 confidence threshold.
use kreuzberg::{ExtractionConfig, LanguageDetectionConfig};

let language_detection = LanguageDetectionConfig {
    enabled: true,
    min_confidence: 0.8,
    detect_multiple: false,
};
let config = ExtractionConfig {
    language_detection: Some(language_detection),
    ..Default::default()
};
```

View File

@@ -0,0 +1,16 @@
```rust title="Rust"
// Extract a document with multi-language detection and print what was detected.
use kreuzberg::{extract_file, ExtractionConfig, LanguageDetectionConfig};

let language_detection = LanguageDetectionConfig {
    enabled: true,
    min_confidence: 0.8,
    detect_multiple: true,
};
let config = ExtractionConfig {
    language_detection: Some(language_detection),
    ..Default::default()
};

let result = extract_file("multilingual_document.pdf", None, &config).await?;
println!("Detected languages: {:?}", result.detected_languages);
```

View File

@@ -0,0 +1,18 @@
```rust title="Rust"
use kreuzberg::{ExtractionConfig, PostProcessorConfig};
/// Builds a post-processor configuration with explicit enabled/disabled lists and prints it.
fn main() {
    let config = ExtractionConfig {
        postprocessor: Some(PostProcessorConfig {
            // `enabled` is a plain flag; only the processor lists are optional.
            // NOTE(review): confirm against the crate's `PostProcessorConfig` definition.
            enabled: true,
            enabled_processors: Some(vec![
                "deduplication".to_string(),
                "whitespace_normalization".to_string(),
            ]),
            disabled_processors: Some(vec!["mojibake_fix".to_string()]),
        }),
        ..Default::default()
    };
    println!("{:?}", config.postprocessor);
}
```

View File

@@ -0,0 +1,8 @@
```rust title="Rust"
// Turn on quality processing; every other setting keeps its default.
use kreuzberg::ExtractionConfig;

let config = ExtractionConfig {
    enable_quality_processing: true,
    ..Default::default()
};
```

View File

@@ -0,0 +1,17 @@
```rust title="Rust"
// Extract with quality processing and report the quality score when one is present.
use kreuzberg::{extract_file, ExtractionConfig};

let config = ExtractionConfig {
    enable_quality_processing: true,
    ..Default::default()
};

let result = extract_file("scanned_document.pdf", None, &config).await?;

match result.quality_score {
    // Scores below 0.5 are reported as a warning.
    Some(score) if score < 0.5 => {
        println!("Warning: Low quality extraction ({:.2})", score);
    }
    Some(score) => println!("Quality score: {:.2}", score),
    None => {}
}
```

View File

@@ -0,0 +1,14 @@
```rust title="Rust"
use kreuzberg::{ExtractionConfig, TokenReductionConfig};
/// Builds a token-reduction configuration and prints it.
fn main() {
    let token_reduction = TokenReductionConfig {
        mode: String::from("moderate"),
        preserve_important_words: true,
    };
    let config = ExtractionConfig {
        token_reduction: Some(token_reduction),
        ..Default::default()
    };

    println!("{:?}", config.token_reduction);
}
```

View File

@@ -0,0 +1,14 @@
```rust title="Rust"
// Token reduction in "moderate" mode with the markdown and code preservation flags set.
use kreuzberg::{ExtractionConfig, TokenReductionConfig};

let token_reduction = TokenReductionConfig {
    mode: String::from("moderate"),
    preserve_markdown: true,
    preserve_code: true,
    language_hint: Some(String::from("eng")),
    ..Default::default()
};
let config = ExtractionConfig {
    token_reduction: Some(token_reduction),
    ..Default::default()
};
```

View File

@@ -0,0 +1,21 @@
```rust title="Rust"
// Extract with token reduction and print the token counts that are available.
use kreuzberg::{extract_file, ExtractionConfig, TokenReductionConfig};

let token_reduction = TokenReductionConfig {
    mode: String::from("moderate"),
    preserve_markdown: true,
    ..Default::default()
};
let config = ExtractionConfig {
    token_reduction: Some(token_reduction),
    ..Default::default()
};

let result = extract_file("verbose_document.pdf", None, &config).await?;

if let Some(original) = result.original_token_count {
    println!("Original tokens: {}", original);
}
if let Some(reduced) = result.reduced_token_count {
    println!("Reduced tokens: {}", reduced);
}
```

View File

@@ -0,0 +1,55 @@
```rust title="Rust"
use kreuzberg::{extract_file, ExtractionConfig, ChunkingConfig, EmbeddingConfig};
/// One chunk of a document together with its embedding.
struct VectorRecord {
    /// Identifier of the form `<document_id>_chunk_<index>`.
    id: String,
    /// Text of the chunk.
    content: String,
    /// Embedding vector copied from the chunk.
    embedding: Vec<f32>,
    /// String key/value metadata describing the chunk.
    metadata: std::collections::HashMap<String, String>,
}
/// Extracts `document_path`, chunks it with embeddings enabled, and returns one
/// [`VectorRecord`] per chunk that carries an embedding.
///
/// Chunks without an embedding are skipped; `chunk_index` is the chunk's position
/// among all chunks, not among the returned records.
///
/// # Errors
///
/// Propagates any error returned by `extract_file`.
async fn extract_and_vectorize(
    document_path: &str,
    document_id: &str,
) -> Result<Vec<VectorRecord>, Box<dyn std::error::Error>> {
    let embedding = EmbeddingConfig {
        model: kreuzberg::EmbeddingModelType::Preset {
            name: "balanced".to_string(),
        },
        normalize: true,
        batch_size: 32,
        ..Default::default()
    };
    let config = ExtractionConfig {
        chunking: Some(ChunkingConfig {
            max_characters: 512,
            overlap: 50,
            embedding: Some(embedding),
            ..Default::default()
        }),
        ..Default::default()
    };

    let result = extract_file(document_path, None, &config).await?;

    let records = result
        .chunks
        .iter()
        .flatten()
        // Enumerate before filtering so indices refer to the original chunk order.
        .enumerate()
        .filter_map(|(index, chunk)| {
            let embedding = chunk.embedding.as_ref()?;
            let metadata = std::collections::HashMap::from([
                ("document_id".to_string(), document_id.to_string()),
                ("chunk_index".to_string(), index.to_string()),
                ("content_length".to_string(), chunk.content.len().to_string()),
            ]);
            Some(VectorRecord {
                id: format!("{}_chunk_{}", document_id, index),
                content: chunk.content.clone(),
                embedding: embedding.clone(),
                metadata,
            })
        })
        .collect();

    Ok(records)
}
```

View File

@@ -0,0 +1,25 @@
```rust title="Rust"
use kreuzberg::{batch_extract_bytes_sync, BatchBytesItem, ExtractionConfig};
/// Extracts two in-memory documents in one batch and prints the length of each result.
fn main() -> kreuzberg::Result<()> {
    let plain_text = BatchBytesItem {
        content: b"Hello, world!".to_vec(),
        mime_type: "text/plain".to_string(),
        config: None,
    };
    let markdown = BatchBytesItem {
        content: b"# Heading\n\nParagraph text.".to_vec(),
        mime_type: "text/markdown".to_string(),
        config: None,
    };
    let config = ExtractionConfig::default();

    let results = batch_extract_bytes_sync(vec![plain_text, markdown], &config)?;

    results
        .iter()
        .enumerate()
        .for_each(|(i, result)| println!("Item {}: {} chars", i, result.content.len()));

    Ok(())
}
```

View File

@@ -0,0 +1,18 @@
```rust title="Rust"
use kreuzberg::{batch_extract_files_sync, BatchFileItem, ExtractionConfig};
fn main() -> kreuzberg::Result<()> {
let config = ExtractionConfig::default();
let items = vec![
BatchFileItem { path: "doc1.pdf".into(), config: None },
BatchFileItem { path: "doc2.docx".into(), config: None },
BatchFileItem { path: "report.pdf".into(), config: None },
];
let results = batch_extract_files_sync(items, &config)?;
for (i, result) in results.iter().enumerate() {
println!("File {}: {} chars", i, result.content.len());
}
Ok(())
}
```

View File

@@ -0,0 +1,29 @@
```rust title="Rust"
/// Uploads `document.pdf` to the local `/extract` endpoint with a chunking
/// override and prints the size of every returned chunk.
#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    let bytes = tokio::fs::read("document.pdf").await?;

    let part = reqwest::multipart::Part::bytes(bytes)
        .file_name("document.pdf")
        .mime_str("application/pdf")?;
    // The chunking options travel as a JSON string in a multipart text field.
    let form = reqwest::multipart::Form::new()
        .part("file", part)
        .text("chunking", r#"{"max_characters":800,"overlap":100}"#);

    let response = reqwest::Client::new()
        .post("http://localhost:8000/extract")
        .multipart(form)
        .send()
        .await?;
    // Turn non-success HTTP statuses into errors before decoding the body.
    let result: serde_json::Value = response.error_for_status()?.json().await?;

    if let Some(chunks) = result["chunks"].as_array() {
        println!("{} chunks", chunks.len());
        for chunk in chunks {
            let chars = chunk["content"].as_str().map_or(0, str::len);
            println!(" {} chars", chars);
        }
    }

    Ok(())
}
```

View File

@@ -0,0 +1,28 @@
```rust title="Rust"
use std::path::Path;
/// Uploads `document.pdf` to the local `/extract` endpoint and prints the extracted content.
#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    let bytes = tokio::fs::read("document.pdf").await?;

    // Use the final path component as the upload name, falling back to the literal name.
    let file_name = match Path::new("document.pdf").file_name().and_then(|n| n.to_str()) {
        Some(name) => name,
        None => "document.pdf",
    };

    let part = reqwest::multipart::Part::bytes(bytes)
        .file_name(file_name.to_string())
        .mime_str("application/pdf")?;
    let form = reqwest::multipart::Form::new().part("file", part);

    let response = reqwest::Client::new()
        .post("http://localhost:8000/extract")
        .multipart(form)
        .send()
        .await?;
    // Turn non-success HTTP statuses into errors before decoding the body.
    let result: serde_json::Value = response.error_for_status()?.json().await?;

    println!("{}", result["content"].as_str().unwrap_or(""));
    Ok(())
}
```

View File

@@ -0,0 +1,55 @@
```rust title="Rust"
use kreuzberg::{
ChunkingConfig, ChunkerType, ExtractionConfig, ImageExtractionConfig,
OcrConfig, OutputFormat, extract_file_sync,
};
/// Extracts `report.pdf` with OCR, Markdown chunking, image extraction and
/// caching configured, then prints a summary of the result.
fn main() -> kreuzberg::Result<()> {
    let config = ExtractionConfig {
        // OCR: not forced (`force_ocr: false`); Tesseract with English is the configured backend
        force_ocr: false,
        ocr: Some(OcrConfig {
            backend: "tesseract".to_string(),
            language: "eng".to_string(),
            ..Default::default()
        }),
        // Chunking: Markdown chunker, up to 800 chars per chunk, 100-char overlap,
        // heading context prepended
        chunking: Some(ChunkingConfig {
            max_characters: 800,
            overlap: 100,
            chunker_type: ChunkerType::Markdown,
            prepend_heading_context: true,
            ..Default::default()
        }),
        // Output: Markdown, with the document structure included
        output_format: OutputFormat::Markdown,
        include_document_structure: true,
        // Images: extract embedded images
        images: Some(ImageExtractionConfig {
            extract_images: true,
            ..Default::default()
        }),
        // Cache extracted results
        use_cache: true,
        enable_quality_processing: true,
        ..Default::default()
    };

    let result = extract_file_sync("report.pdf", None, &config)?;

    println!("Content ({} chars):", result.content.len());
    // Preview by characters: slicing at byte 200 panics when that offset falls
    // inside a multi-byte UTF-8 character.
    let preview: String = result.content.chars().take(200).collect();
    println!("{}", preview);

    if let Some(chunks) = &result.chunks {
        println!("\nChunks: {}", chunks.len());
    }
    println!("Tables: {}", result.tables.len());
    if let Some(langs) = &result.detected_languages {
        println!("Languages: {:?}", langs);
    }
    if let Some(method) = result.extraction_method {
        println!("Extraction method: {:?}", method);
    }

    Ok(())
}
```

View File

@@ -0,0 +1,21 @@
```rust title="Rust"
use kreuzberg::{extract_file_sync, ExtractionConfig, KreuzbergError};
fn main() {
let config = ExtractionConfig::default();
match extract_file_sync("document.pdf", None, &config) {
Ok(result) => println!("{}", result.content),
Err(KreuzbergError::Io(e)) => eprintln!("File error: {e}"),
Err(KreuzbergError::UnsupportedFormat(mime)) => {
eprintln!("Unsupported format: {mime}");
}
Err(KreuzbergError::Parsing { message, .. }) => {
eprintln!("Corrupt or invalid document: {message}");
}
Err(KreuzbergError::MissingDependency(dep)) => {
eprintln!("Missing dependency — install {dep}");
}
Err(e) => eprintln!("Extraction failed: {e}"),
}
}
```

View File

@@ -0,0 +1,23 @@
```rust title="Rust"
use kreuzberg::{extract_bytes_sync, ExtractionConfig, KreuzbergError, Result};
/// Extracts the text content of `bytes`, interpreted as `mime_type`, using the default configuration.
///
/// # Errors
///
/// Returns whatever error `extract_bytes_sync` reports.
fn extract_text(bytes: &[u8], mime_type: &str) -> Result<String> {
    extract_bytes_sync(bytes, mime_type, &ExtractionConfig::default())
        .map(|extraction| extraction.content)
}
/// Reads `document.pdf`, extracts its text and reports failures by category.
fn main() {
    // Report a failed read explicitly instead of silently extracting from empty input.
    let bytes = match std::fs::read("document.pdf") {
        Ok(bytes) => bytes,
        Err(e) => {
            eprintln!("Could not read document.pdf: {e}");
            return;
        }
    };

    match extract_text(&bytes, "application/pdf") {
        Ok(text) => println!("Extracted {} chars", text.len()),
        Err(KreuzbergError::UnsupportedFormat(mime)) => {
            eprintln!("Format not supported: {mime}");
        }
        Err(KreuzbergError::Ocr { message, .. }) => {
            eprintln!("OCR failed: {message}");
        }
        Err(e) => eprintln!("Error: {e}"),
    }
}
```

View File

@@ -0,0 +1,14 @@
```rust title="Rust"
use kreuzberg::{extract_bytes, ExtractionConfig};
/// Reads `document.pdf` asynchronously, extracts it from memory and prints
/// the content and table count.
#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
    let content = tokio::fs::read("document.pdf").await?;

    let result = extract_bytes(&content, "application/pdf", &ExtractionConfig::default()).await?;

    println!("{}", result.content);
    println!("Tables: {}", result.tables.len());
    Ok(())
}
```

View File

@@ -0,0 +1,13 @@
```rust title="Rust"
use kreuzberg::{extract_bytes_sync, ExtractionConfig};
/// Reads `document.pdf`, extracts it from memory and prints the content and table count.
fn main() -> kreuzberg::Result<()> {
    let content = std::fs::read("document.pdf")?;

    let result = extract_bytes_sync(&content, "application/pdf", &ExtractionConfig::default())?;

    println!("{}", result.content);
    println!("Tables: {}", result.tables.len());
    Ok(())
}
```

View File

@@ -0,0 +1,14 @@
```rust title="Rust"
use kreuzberg::{extract_file, ExtractionConfig};
/// Extracts `document.pdf` asynchronously and prints its content, MIME type and table count.
#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
    // No MIME hint is passed (`None`); the turbofish pins the hint's type parameter.
    let result = extract_file("document.pdf", None::<&str>, &ExtractionConfig::default()).await?;

    println!("{}", result.content);
    println!("MIME type: {}", result.mime_type);
    println!("Tables: {}", result.tables.len());
    Ok(())
}
```

View File

@@ -0,0 +1,13 @@
```rust title="Rust"
use kreuzberg::{extract_file_sync, ExtractionConfig};
/// Extracts `document.pdf` synchronously and prints its content, MIME type and table count.
fn main() -> kreuzberg::Result<()> {
    let result = extract_file_sync("document.pdf", None, &ExtractionConfig::default())?;

    println!("{}", result.content);
    println!("MIME type: {}", result.mime_type);
    println!("Tables: {}", result.tables.len());
    Ok(())
}
```

View File

@@ -0,0 +1,61 @@
```rust title="simple_benchmark.rs"
use kreuzberg::{extract_file_sync, extract_file, ExtractionConfig};
use std::time::Instant;
/// Times repeated synchronous extraction, the same number of concurrent async
/// extractions, and a cold-versus-cached pair of extractions of `document.pdf`.
#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
    let file_path = "document.pdf";
    let num_runs = 10;

    // Caching is off for the first two measurements.
    let config = ExtractionConfig {
        use_cache: false,
        ..Default::default()
    };

    // --- Sequential synchronous runs ---
    let start = Instant::now();
    for _ in 0..num_runs {
        extract_file_sync(file_path, None, &config)?;
    }
    let sync_duration = start.elapsed().as_secs_f64();
    println!("Sync extraction ({} runs):", num_runs);
    println!(" - Total time: {:.3}s", sync_duration);
    println!(" - Average: {:.3}s per extraction", sync_duration / num_runs as f64);

    // --- Async runs awaited together ---
    let start = Instant::now();
    let tasks: Vec<_> = (0..num_runs)
        .map(|_| extract_file(file_path, None, &config))
        .collect();
    for outcome in futures::future::join_all(tasks).await {
        outcome?;
    }
    let async_duration = start.elapsed().as_secs_f64();
    println!("\nAsync extraction ({} parallel runs):", num_runs);
    println!(" - Total time: {:.3}s", async_duration);
    println!(" - Average: {:.3}s per extraction", async_duration / num_runs as f64);
    println!(" - Speedup: {:.1}x", sync_duration / async_duration);

    // --- Cold versus cached ---
    let config_cached = ExtractionConfig {
        use_cache: true,
        ..Default::default()
    };

    println!("\nFirst extraction (populates cache)...");
    let start = Instant::now();
    extract_file_sync(file_path, None, &config_cached)?;
    let first_duration = start.elapsed().as_secs_f64();
    println!(" - Time: {:.3}s", first_duration);

    println!("Second extraction (from cache)...");
    let start = Instant::now();
    extract_file_sync(file_path, None, &config_cached)?;
    let cached_duration = start.elapsed().as_secs_f64();
    println!(" - Time: {:.3}s", cached_duration);
    println!(" - Cache speedup: {:.1}x", first_duration / cached_duration);

    Ok(())
}
```

28
docs/snippets/rust/cache/disk_cache.rs vendored Normal file
View File

@@ -0,0 +1,28 @@
```rust title="disk_cache.rs"
use kreuzberg::{extract_file_sync, ExtractionConfig};
/// Extracts the same file twice with caching enabled and checks that both
/// runs produce identical content.
///
/// The file is the first non-empty command-line argument other than `--`,
/// defaulting to `document.pdf`.
fn main() -> kreuzberg::Result<()> {
    let path = std::env::args()
        .skip(1) // skip the program name
        .find(|arg| !(arg.is_empty() || arg == "--"))
        .unwrap_or_else(|| String::from("document.pdf"));

    // Enable caching (default: true). The Rust crate uses an internal disk cache.
    let config = ExtractionConfig {
        use_cache: true,
        ..Default::default()
    };

    println!("First extraction (will be cached)...");
    let first = extract_file_sync(&path, None, &config)?;
    println!(" - Content length: {}", first.content.len());

    println!("\nSecond extraction (from cache when available)...");
    let second = extract_file_sync(&path, None, &config)?;
    println!(" - Content length: {}", second.content.len());

    let identical = first.content == second.content;
    println!("\nResults are identical: {}", identical);
    Ok(())
}
```

View File

@@ -0,0 +1,30 @@
```rust title="basic_cli.rs"
use std::process::Command;
use anyhow::Result;
fn extract_with_cli(file_path: &str, output_format: &str) -> Result<String> {
let output = Command::new("kreuzberg")
.args(&["extract", file_path, "--format", output_format])
.output()?;
if !output.status.success() {
let stderr = String::from_utf8_lossy(&output.stderr);
anyhow::bail!("CLI error: {}", stderr);
}
Ok(String::from_utf8(output.stdout)?)
}
/// Extracts `document.pdf` through the CLI as text and as JSON, printing a short summary of each.
fn main() -> Result<()> {
    let document = "document.pdf";

    let text_output = extract_with_cli(document, "text")?;
    println!("Extracted: {} characters", text_output.len());

    let json_output = extract_with_cli(document, "json")?;
    let parsed: serde_json::Value = serde_json::from_str(&json_output)?;
    // Read the field as a string so it prints without JSON quotes; a missing or
    // non-string field falls back to "unknown".
    let format = parsed
        .get("format")
        .and_then(serde_json::Value::as_str)
        .unwrap_or("unknown");
    println!("Format: {}", format);

    Ok(())
}
```

View File

@@ -0,0 +1,39 @@
```rust title="cli_with_config.rs"
use std::process::Command;
use anyhow::Result;
fn extract_with_config(file_path: &str, config_path: &str) -> Result<serde_json::Value> {
let output = Command::new("kreuzberg")
.args(&[
"extract",
file_path,
"--config",
config_path,
"--format",
"json",
])
.output()?;
if !output.status.success() {
let stderr = String::from_utf8_lossy(&output.stderr);
anyhow::bail!("CLI error: {}", stderr);
}
let result: serde_json::Value = serde_json::from_slice(&output.stdout)?;
Ok(result)
}
/// Extracts `document.pdf` through the CLI using `kreuzberg.toml` and prints a summary of the JSON result.
fn main() -> Result<()> {
    let config_file = "kreuzberg.toml";
    let document = "document.pdf";
    println!("Extracting {} with config {}", document, config_file);

    let result = extract_with_config(document, config_file)?;

    println!("Content length: {}", result["content"].as_str().unwrap_or("").len());
    println!("Format: {}", result["format"].as_str().unwrap_or("unknown"));
    // `Value` implements `Display`, so no intermediate `String` is needed.
    println!("Languages: {}", result["languages"]);
    Ok(())
}
```

View File

@@ -0,0 +1,63 @@
```rust title="Rust"
use kreuzberg::{extract_file, ExtractionConfig, OcrConfig, ChunkingConfig, LanguageDetectionConfig, TokenReductionConfig, PostProcessorConfig, EmbeddingConfig, EmbeddingModelType};
use kreuzberg::keywords::{KeywordConfig, KeywordAlgorithm};
/// Extracts `document.pdf` with every optional feature section configured and
/// prints the content, detected languages and chunk count.
#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
    let ocr = OcrConfig {
        backend: "tesseract".to_string(),
        language: "eng".to_string(),
        tesseract_config: None,
        output_format: None,
        paddle_ocr_config: None,
        element_config: None,
    };

    let embedding = EmbeddingConfig {
        model: EmbeddingModelType::Preset { name: "balanced".to_string() },
        batch_size: 32,
        normalize: true,
        show_download_progress: false,
        cache_dir: None,
    };
    let chunking = ChunkingConfig {
        max_characters: 1000,
        overlap: 200,
        embedding: Some(embedding),
        ..Default::default()
    };

    let language_detection = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.8,
        detect_multiple: false,
    };

    let keywords = KeywordConfig {
        algorithm: KeywordAlgorithm::Yake,
        max_keywords: 10,
        min_score: 0.1,
        ngram_range: (1, 3),
        language: Some("en".to_string()),
        ..Default::default()
    };

    let token_reduction = TokenReductionConfig {
        mode: "moderate".to_string(),
        preserve_important_words: true,
    };

    let postprocessor = PostProcessorConfig {
        enabled: true,
        enabled_processors: None,
        disabled_processors: None,
    };

    let config = ExtractionConfig {
        use_cache: true,
        enable_quality_processing: true,
        ocr: Some(ocr),
        chunking: Some(chunking),
        language_detection: Some(language_detection),
        keywords: Some(keywords),
        token_reduction: Some(token_reduction),
        postprocessor: Some(postprocessor),
        ..Default::default()
    };

    let result = extract_file("document.pdf", None::<&str>, &config).await?;

    println!("Content: {}", result.content);
    if let Some(langs) = &result.detected_languages {
        println!("Languages: {:?}", langs);
    }
    // A missing chunk list is reported as zero chunks.
    println!("Chunks: {}", result.chunks.as_ref().map_or(0, |chunks| chunks.len()));
    Ok(())
}
```

View File

@@ -0,0 +1,79 @@
```rust title="Rust"
use kreuzberg::{extract_file, ExtractionConfig, ChunkingConfig};
/// Extracts `document.pdf` with chunking configured and prints the number and length of the chunks.
#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
    let config = ExtractionConfig {
        chunking: Some(ChunkingConfig {
            max_characters: 1000,
            overlap: 200,
            ..Default::default()
        }),
        ..Default::default()
    };

    let result = extract_file("document.pdf", None::<&str>, &config).await?;

    // `chunks` is optional; treat a missing list as empty.
    let chunks = result.chunks.as_deref().unwrap_or_default();
    println!("Chunks: {}", chunks.len());
    for chunk in chunks {
        println!("Length: {}", chunk.content.len());
    }
    Ok(())
}
```
```rust title="Rust - Markdown with Heading Context"
use kreuzberg::{extract_file, ExtractionConfig, ChunkingConfig, ChunkerType, ChunkSizing};
/// Chunks `document.md` with the Markdown chunker and tokenizer-based sizing,
/// then prints each chunk's heading context and a content preview.
#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
    let config = ExtractionConfig {
        chunking: Some(ChunkingConfig {
            max_characters: 500,
            overlap: 50,
            chunker_type: ChunkerType::Markdown,
            sizing: ChunkSizing::Tokenizer {
                model: "Xenova/gpt-4o".into(),
                cache_dir: None,
            },
            ..Default::default()
        }),
        ..Default::default()
    };

    let result = extract_file("document.md", None::<&str>, &config).await?;

    // `chunks` is optional; treat a missing list as empty.
    for chunk in result.chunks.as_deref().unwrap_or_default() {
        if let Some(heading_context) = &chunk.metadata.heading_context {
            for heading in &heading_context.headings {
                println!("Heading L{}: {}", heading.level, heading.text);
            }
        }
        // Preview by characters: slicing at a fixed byte offset panics when the
        // offset falls inside a multi-byte UTF-8 character.
        let preview: String = chunk.content.chars().take(100).collect();
        println!("Content: {}...", preview);
    }
    Ok(())
}
```
```rust title="Rust - Prepend Heading Context"
use kreuzberg::{extract_file, ExtractionConfig, ChunkingConfig, ChunkerType};
/// Chunks `document.md` with `prepend_heading_context` enabled and prints a preview of each chunk.
#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
    let config = ExtractionConfig {
        chunking: Some(ChunkingConfig {
            max_characters: 500,
            overlap: 50,
            chunker_type: ChunkerType::Markdown,
            prepend_heading_context: true,
            ..Default::default()
        }),
        ..Default::default()
    };

    let result = extract_file("document.md", None::<&str>, &config).await?;

    // `chunks` is optional; treat a missing list as empty.
    for chunk in result.chunks.as_deref().unwrap_or_default() {
        // Each chunk's content is prefixed with its heading breadcrumb.
        // Preview by characters: slicing at a fixed byte offset panics when the
        // offset falls inside a multi-byte UTF-8 character.
        let preview: String = chunk.content.chars().take(100).collect();
        println!("Content: {}...", preview);
    }
    Ok(())
}
```

View File

@@ -0,0 +1,16 @@
```rust title="Rust"
use kreuzberg::{extract_file, ExtractionConfig};
/// Extracts `document.pdf` with caching and quality processing enabled and prints the content.
#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
    let config = ExtractionConfig {
        use_cache: true,
        enable_quality_processing: true,
        ..Default::default()
    };

    let extraction = extract_file("document.pdf", None, &config).await?;
    println!("{}", extraction.content);

    Ok(())
}
```

View File

@@ -0,0 +1,11 @@
```rust title="Rust"
use kreuzberg::{extract_file, ExtractionConfig};
/// Extracts `document.pdf` using a discovered configuration, or the defaults
/// when discovery yields nothing.
#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
    // `discover()` can fail and can also succeed with no config.
    let config = ExtractionConfig::discover()?.unwrap_or_default();

    let extraction = extract_file("document.pdf", None, &config).await?;
    println!("{}", extraction.content);

    Ok(())
}
```

View File

@@ -0,0 +1,20 @@
```rust title="Rust"
use kreuzberg::{extract_file_sync, ExtractionConfig, OcrConfig};
/// Extracts `scanned.pdf` with Tesseract OCR (English) configured and prints
/// the content length and table count.
fn main() -> kreuzberg::Result<()> {
    let ocr = OcrConfig {
        backend: String::from("tesseract"),
        language: String::from("eng"),
        tesseract_config: None,
        ..Default::default()
    };
    let config = ExtractionConfig {
        ocr: Some(ocr),
        ..Default::default()
    };

    let result = extract_file_sync("scanned.pdf", None, &config)?;

    println!("Content length: {}", result.content.len());
    println!("Tables detected: {}", result.tables.len());
    Ok(())
}
```

View File

@@ -0,0 +1,29 @@
```rust title="Rust"
use kreuzberg::{extract_file_sync, ChunkingConfig, ExtractionConfig, OcrConfig, TesseractConfig};
/// Extracts `document.pdf` with caching, Tesseract OCR (English and German,
/// PSM 6), chunking and quality processing, then prints the content length.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    let tesseract = TesseractConfig {
        psm: 6,
        ..Default::default()
    };
    let ocr = OcrConfig {
        backend: String::from("tesseract"),
        language: String::from("eng+deu"),
        tesseract_config: Some(tesseract),
        ..Default::default()
    };
    let chunking = ChunkingConfig {
        max_characters: 1000,
        overlap: 200,
        ..Default::default()
    };
    let config = ExtractionConfig {
        use_cache: true,
        ocr: Some(ocr),
        chunking: Some(chunking),
        enable_quality_processing: true,
        ..Default::default()
    };

    let result = extract_file_sync("document.pdf", None, &config)?;
    println!("Content length: {}", result.content.len());

    Ok(())
}
```

View File

@@ -0,0 +1,17 @@
```rust title="Document Structure Config (Rust)"
// Extract with document structure enabled and print each node's type with a text preview.
use kreuzberg::{extract_file_sync, ExtractionConfig};

let config = ExtractionConfig {
    include_document_structure: true,
    ..Default::default()
};

let result = extract_file_sync("document.pdf", None, &config)?;

if let Some(document) = &result.document {
    for node in &document.nodes {
        // Nodes without text are shown with an empty preview.
        let text = node.content.text().unwrap_or("");
        // Preview by characters: slicing at byte 80 panics when that offset
        // falls inside a multi-byte UTF-8 character.
        let preview: String = text.chars().take(80).collect();
        println!("[{}] {}", node.content.node_type_str(), preview);
    }
}
```

View File

@@ -0,0 +1,48 @@
```rust title="Element-Based Output (Rust)"
use kreuzberg::{extract_file_sync, ExtractionConfig};
use kreuzberg::types::OutputFormat as ResultFormat;
/// Extracts `document.pdf` in element-based mode, prints every element, then
/// lists the title elements with their `level` metadata.
fn main() -> kreuzberg::Result<()> {
    // Configure element-based output (result_format controls Unified vs ElementBased)
    let config = ExtractionConfig {
        result_format: ResultFormat::ElementBased,
        ..Default::default()
    };

    // Extract document
    let result = extract_file_sync("document.pdf", None, &config)?;

    // Access elements
    if let Some(elements) = result.elements {
        for element in &elements {
            println!("Type: {:?}", element.element_type);
            // Preview by characters: slicing at a fixed byte offset panics when
            // the offset falls inside a multi-byte UTF-8 character.
            let preview: String = element.text.chars().take(100).collect();
            println!("Text: {}", preview);
            if let Some(page) = element.metadata.page_number {
                println!("Page: {}", page);
            }
            if let Some(coords) = &element.metadata.coordinates {
                println!(
                    "Coords: ({}, {}) - ({}, {})",
                    coords.x0, coords.y0, coords.x1, coords.y1
                );
            }
            println!("---");
        }

        // Filter by element type
        let titles: Vec<_> = elements
            .iter()
            .filter(|e| matches!(e.element_type, kreuzberg::types::ElementType::Title))
            .collect();
        for title in titles {
            // Titles without a "level" entry are labelled "unknown".
            let level = title
                .metadata
                .additional
                .get("level")
                .map(|v| v.as_ref())
                .unwrap_or("unknown");
            println!("[{}] {}", level, title.text);
        }
    }

    Ok(())
}
```

View File

@@ -0,0 +1,28 @@
```rust title="Rust"
use kreuzberg::{extract_file, ExtractionConfig, ChunkingConfig, EmbeddingConfig, EmbeddingModelType};
/// Extracts `document.pdf` with chunk embeddings configured and prints how many chunks came back.
#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
    let embedding = EmbeddingConfig {
        model: EmbeddingModelType::Preset {
            name: String::from("balanced"),
        },
        batch_size: 16,
        normalize: true,
        show_download_progress: true,
        cache_dir: None,
    };
    let chunking = ChunkingConfig {
        max_characters: 1000,
        overlap: 200,
        embedding: Some(embedding),
        ..Default::default()
    };
    let config = ExtractionConfig {
        chunking: Some(chunking),
        ..Default::default()
    };

    let result = extract_file("document.pdf", None::<&str>, &config).await?;

    // A missing chunk list is reported as zero chunks.
    let chunk_count = result.chunks.as_ref().map_or(0, |chunks| chunks.len());
    println!("Chunks with embeddings: {}", chunk_count);
    Ok(())
}
```

View File

@@ -0,0 +1,85 @@
// Example 1: Preset model (recommended)
// Fast, balanced, or quality preset configurations optimized for common use cases.
let embedding_config = EmbeddingConfig {
    model: EmbeddingModelType::Preset {
        name: "balanced".to_string(),
    },
    batch_size: 32,
    normalize: true,
    show_download_progress: true,
    // Rust does not expand `~`, so the path is built from $HOME.
    // When HOME is unset this is `None`.
    cache_dir: std::env::var_os("HOME")
        .map(|home| std::path::PathBuf::from(home).join(".cache/kreuzberg/embeddings")),
    acceleration: None,
};
// Available presets:
// - "fast" (384 dims): Quick prototyping, development, resource-constrained
// - "balanced" (768 dims): Production, general-purpose RAG, English documents
// - "quality" (1024 dims): Complex documents, maximum accuracy
// - "multilingual" (768 dims): International documents, 100+ languages
// Example 2: Custom ONNX model (requires embeddings feature)
// Direct access to specific ONNX embedding models from HuggingFace with custom dimensions.
let custom_model = EmbeddingModelType::Custom {
    model_id: String::from("BAAI/bge-small-en-v1.5"),
    dimensions: 384,
};
let embedding_config = EmbeddingConfig {
    model: custom_model,
    batch_size: 32,
    normalize: true,
    show_download_progress: true,
    cache_dir: None, // Uses default: .kreuzberg/embeddings/
    acceleration: None,
};
// Popular ONNX-compatible models:
// - "BAAI/bge-small-en-v1.5" (384 dims): Fast, efficient
// - "BAAI/bge-base-en-v1.5" (768 dims): Balanced quality/speed
// - "BAAI/bge-large-en-v1.5" (1024 dims): High quality, slower
// - "sentence-transformers/paraphrase-multilingual-mpnet-base-v2" (768 dims): Multilingual support
// Example 3: Alternative Custom ONNX Model
// Same shape as the previous example, pointed at a different model with its own cache directory.
let mpnet_model = EmbeddingModelType::Custom {
    model_id: String::from("sentence-transformers/all-mpnet-base-v2"),
    dimensions: 768,
};
let model_cache = std::path::PathBuf::from("/var/cache/embeddings");
let embedding_config = EmbeddingConfig {
    model: mpnet_model,
    batch_size: 16, // Larger model requires smaller batch size
    normalize: true,
    show_download_progress: true,
    cache_dir: Some(model_cache),
    acceleration: None,
};
// Integration with ChunkingConfig
// Attach an embedding configuration to the chunking settings, then pass those to the extractor:
use kreuzberg::{ChunkingConfig, ExtractionConfig};
let default_embedding = EmbeddingConfig::default(); // Uses balanced preset
let chunking_with_embeddings = ChunkingConfig {
    max_characters: 1024,
    overlap: 100,
    preset: Some(String::from("balanced")),
    embedding: Some(default_embedding),
};
let extraction_config = ExtractionConfig {
    chunking: Some(chunking_with_embeddings),
    ..ExtractionConfig::default()
};
// Key parameter explanations:
//
// batch_size: Number of texts to embed at once (32-128 typical)
// - Larger batches are faster but use more memory
// - Smaller batches for resource-constrained environments
//
// normalize: Whether to normalize vectors (L2 norm)
// - true (recommended): Enables cosine similarity in vector DBs
// - false: Raw embedding values
//
// cache_dir: Where to store downloaded models
// - None: Uses .kreuzberg/embeddings/ in current directory
// - Some(path): Custom directory for model storage
//
// show_download_progress: Display download progress bar
// - Useful for monitoring large model downloads

View File

@@ -0,0 +1,90 @@
use kreuzberg::{ExtractionConfig, HierarchyConfig, PdfConfig};
// Example 1: Basic hierarchy extraction
// Enabled with default k_clusters=6 for standard H1-H6 heading hierarchy.
// Extract bounding box information for spatial layout awareness.
let hierarchy_config_basic = HierarchyConfig {
    enabled: true,
    k_clusters: 6,                // Default: creates 6 font size clusters (H1-H6 structure)
    include_bbox: true,           // Include bounding box coordinates
    ocr_coverage_threshold: None, // No OCR coverage threshold
};
let pdf_config_basic = PdfConfig {
    hierarchy: Some(hierarchy_config_basic),
    ..Default::default()
};
let extraction_config_basic = ExtractionConfig {
    pdf_options: Some(pdf_config_basic),
    ..Default::default()
};
// Use with extract_file_sync or extract_bytes_sync.
// The config is passed by reference as the last argument, after the second argument (`None`):
// let result = extract_file_sync("document.pdf", None, &extraction_config_basic)?;
// Example 2: Custom k_clusters for minimal structure
// Three clusters give a shallow hierarchy: enough for major section divisions
// (Main, Subsection, Detail) and nothing finer.
let hierarchy_config_minimal = HierarchyConfig {
    k_clusters: 3, // Minimal clustering: just 3 levels
    enabled: true,
    include_bbox: true,
    ocr_coverage_threshold: None,
};
let pdf_config_minimal = PdfConfig {
    hierarchy: Some(hierarchy_config_minimal),
    ..PdfConfig::default()
};
let extraction_config_minimal = ExtractionConfig {
    pdf_options: Some(pdf_config_minimal),
    ..ExtractionConfig::default()
};
// Example 3: With OCR coverage threshold
// Trigger OCR if less than 50% of text has font data.
// Intended for documents that mix digital and scanned content.
let hierarchy_config_ocr = HierarchyConfig {
    ocr_coverage_threshold: Some(0.5), // Trigger OCR if text coverage < 50%
    enabled: true,
    k_clusters: 6,
    include_bbox: true,
};
let pdf_config_ocr = PdfConfig {
    hierarchy: Some(hierarchy_config_ocr),
    ..PdfConfig::default()
};
let extraction_config_ocr = ExtractionConfig {
    pdf_options: Some(pdf_config_ocr),
    ..ExtractionConfig::default()
};
// Field descriptions:
//
// enabled: bool (default: true)
// - Enable or disable hierarchy extraction
// - When false, hierarchy structure is not analyzed
//
// k_clusters: usize (default: 6, valid: 1-7)
// - Number of font size clusters for hierarchy levels
// - 6 provides H1-H6 heading levels with body text
// - Higher values create more fine-grained hierarchy
// - Lower values create simpler structure
//
// include_bbox: bool (default: true)
// - Include bounding box coordinates in hierarchy blocks
// - Required for spatial layout awareness and document structure
// - Set to false only if space optimization is critical
//
// ocr_coverage_threshold: Option<f32> (default: None)
// - Range: 0.0 to 1.0
// - Triggers OCR when text block coverage falls below this fraction
// - Example: Some(0.5) means "run OCR if less than 50% of page has text data"
// - None means no OCR coverage-based triggering

View File

@@ -0,0 +1,14 @@
```rust title="Rust"
use kreuzberg::{extract_file_sync, ExtractionConfig, HtmlOutputConfig, HtmlTheme, OutputFormat};
// Request HTML output styled with the GitHub theme.
let config = ExtractionConfig {
    output_format: OutputFormat::Html,
    html_output: Some(HtmlOutputConfig {
        theme: HtmlTheme::GitHub,
        ..Default::default()
    }),
    ..Default::default()
};
// Propagate extraction failures with `?` (as the other top-level snippets do)
// instead of panicking on a missing or unreadable file.
let result = extract_file_sync("document.pdf", None, &config)?;
println!("{}", result.content); // HTML with kb-* classes
```

View File

@@ -0,0 +1,72 @@
use kreuzberg::{extract_file, ExtractionConfig};
use kreuzberg::keywords::{KeywordConfig, KeywordAlgorithm, YakeParams, RakeParams};
// Example 1: Basic YAKE configuration
// Uses YAKE algorithm with default parameters and English stopword filtering
pub async fn basic_yake() -> kreuzberg::Result<()> {
let config = ExtractionConfig {
keywords: Some(KeywordConfig {
algorithm: KeywordAlgorithm::Yake,
max_keywords: 10,
min_score: 0.0,
ngram_range: (1, 3),
language: Some("en".to_string()),
yake_params: None,
rake_params: None,
}),
..Default::default()
};
let result = extract_file("document.pdf", None::<&str>, &config).await?;
println!("Keywords: {:?}", result.keywords);
Ok(())
}
// Example 2: Advanced YAKE with custom parameters
// Overrides the YAKE window size used for co-occurrence analysis.
#[cfg(feature = "keywords-yake")]
pub async fn advanced_yake() -> kreuzberg::Result<()> {
    let tuned_yake = YakeParams { window_size: 1 };
    let keyword_config = KeywordConfig {
        algorithm: KeywordAlgorithm::Yake,
        max_keywords: 15,
        min_score: 0.1,
        ngram_range: (1, 2),
        language: Some(String::from("en")),
        yake_params: Some(tuned_yake),
        rake_params: None,
    };
    let config = ExtractionConfig {
        keywords: Some(keyword_config),
        ..ExtractionConfig::default()
    };
    let result = extract_file("document.pdf", None::<&str>, &config).await?;
    println!("Keywords: {:?}", result.keywords);
    Ok(())
}
// Example 3: RAKE configuration
// Selects the RAKE algorithm and constrains word length and words per phrase.
#[cfg(feature = "keywords-rake")]
pub async fn rake_config() -> kreuzberg::Result<()> {
    let phrase_limits = RakeParams {
        min_word_length: 1,
        max_words_per_phrase: 3,
    };
    let keyword_config = KeywordConfig {
        algorithm: KeywordAlgorithm::Rake,
        max_keywords: 10,
        min_score: 5.0,
        ngram_range: (1, 3),
        language: Some(String::from("en")),
        yake_params: None,
        rake_params: Some(phrase_limits),
    };
    let config = ExtractionConfig {
        keywords: Some(keyword_config),
        ..ExtractionConfig::default()
    };
    let result = extract_file("document.pdf", None::<&str>, &config).await?;
    println!("Keywords: {:?}", result.keywords);
    Ok(())
}

View File

@@ -0,0 +1,23 @@
```rust title="Rust"
use kreuzberg::{extract_file, ExtractionConfig};
use kreuzberg::keywords::{KeywordConfig, KeywordAlgorithm};
#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
let config = ExtractionConfig {
keywords: Some(KeywordConfig {
algorithm: KeywordAlgorithm::Yake,
max_keywords: 10,
min_score: 0.1,
ngram_range: (1, 3),
language: Some("en".to_string()),
..Default::default()
}),
..Default::default()
};
let result = extract_file("document.pdf", None::<&str>, &config).await?;
println!("Keywords: {:?}", result.keywords);
Ok(())
}
```

View File

@@ -0,0 +1,20 @@
```rust title="Rust"
use kreuzberg::{extract_file, ExtractionConfig, LanguageDetectionConfig};
/// Extracts `document.pdf` with language detection enabled and prints what was detected.
#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
    let config = ExtractionConfig {
        language_detection: Some(LanguageDetectionConfig {
            enabled: true,
            min_confidence: 0.8,
            detect_multiple: true,
        }),
        ..Default::default()
    };
    let result = extract_file("document.pdf", None::<&str>, &config).await?;
    // The extraction result exposes detection output as the optional
    // `detected_languages` list, so print that (Debug-formatted, since it is an `Option`).
    // NOTE(review): no per-language confidence field is visible on the result type;
    // confirm against the API docs before documenting one.
    println!("Detected languages: {:?}", result.detected_languages);
    Ok(())
}
```

View File

@@ -0,0 +1,21 @@
```rust title="Rust"
use kreuzberg::{extract_file_sync, ExtractionConfig, ImageExtractionConfig};
/// Extracts `document.pdf` with image extraction configured and prints the image count.
fn main() -> kreuzberg::Result<()> {
    let config = ExtractionConfig {
        images: Some(ImageExtractionConfig {
            extract_images: true,
            target_dpi: 300,
            max_image_dimension: 4096,
            auto_adjust_dpi: true,
            min_dpi: 150,
            max_dpi: 600,
            // The struct has further options (for example `inject_placeholders`);
            // leave them at their defaults so the literal is complete.
            ..Default::default()
        }),
        ..Default::default()
    };
    let result = extract_file_sync("document.pdf", None::<&str>, &config)?;
    // `images` is optional on the result, so count through the `Option`.
    let image_count = result.images.as_ref().map_or(0, |images| images.len());
    println!("Extracted images: {}", image_count);
    Ok(())
}
```

View File

@@ -0,0 +1,20 @@
```rust title="Rust"
use kreuzberg::{extract_file_sync, ExtractionConfig, PdfConfig, HierarchyConfig};
/// Extracts a password-protected PDF and prints its title and authors.
fn main() -> kreuzberg::Result<()> {
    let config = ExtractionConfig {
        pdf_options: Some(PdfConfig {
            extract_images: true,
            passwords: Some(vec!["password123".to_string()]),
            extract_metadata: true,
            hierarchy: Some(HierarchyConfig::default()),
            // Any PDF option not listed above keeps its default value.
            ..Default::default()
        }),
        ..Default::default()
    };
    let result = extract_file_sync("encrypted.pdf", None, &config)?;
    println!("Title: {:?}", result.metadata.title);
    println!("Authors: {:?}", result.metadata.authors);
    Ok(())
}
```

View File

@@ -0,0 +1,23 @@
```rust title="Rust"
use kreuzberg::{extract_file_sync, ExtractionConfig, PdfConfig, HierarchyConfig};
/// Extracts `document.pdf` with PDF hierarchy detection configured.
fn main() -> kreuzberg::Result<()> {
    let config = ExtractionConfig {
        pdf_options: Some(PdfConfig {
            // `HierarchyConfig` is made of `enabled`, `k_clusters`, `include_bbox` and
            // `ocr_coverage_threshold`; depth is controlled through `k_clusters`.
            hierarchy: Some(HierarchyConfig {
                enabled: true,
                k_clusters: 6,
                include_bbox: true,
                ocr_coverage_threshold: Some(0.8),
            }),
            ..Default::default()
        }),
        ..Default::default()
    };
    let result = extract_file_sync("document.pdf", None::<&str>, &config)?;
    // NOTE(review): confirm that the result exposes a top-level `hierarchy` collection;
    // it is not visible elsewhere in these examples.
    println!("Hierarchy levels: {}", result.hierarchy.len());
    Ok(())
}
```

View File

@@ -0,0 +1,22 @@
```rust title="Rust"
use kreuzberg::{extract_file, ExtractionConfig, PostProcessorConfig};
#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
let config = ExtractionConfig {
postprocessor: Some(PostProcessorConfig {
enabled: true,
enabled_processors: Some(vec![
"whitespace_normalizer".to_string(),
"unicode_normalizer".to_string(),
]),
disabled_processors: None,
}),
..Default::default()
};
let result = extract_file("document.pdf", None::<&str>, &config).await?;
println!("Processed content: {}", result.content);
Ok(())
}
```

View File

@@ -0,0 +1,16 @@
```rust title="Rust"
use kreuzberg::{extract_file_sync, ExtractionConfig};
fn main() -> kreuzberg::Result<()> {
let config = ExtractionConfig {
enable_quality_processing: true,
use_cache: true,
..Default::default()
};
let result = extract_file_sync("document.pdf", None::<&str>, &config)?;
println!("Quality score: {}", result.quality_score);
println!("Processing time: {:?}", result.processing_time);
Ok(())
}
```

View File

@@ -0,0 +1,23 @@
```rust title="Rust"
use kreuzberg::{extract_file_sync, ExtractionConfig, OcrConfig};
use kreuzberg::types::TesseractConfig;
/// Runs Tesseract OCR (English and German) on `scanned.pdf` and prints the recognised text.
fn main() -> kreuzberg::Result<()> {
    let config = ExtractionConfig {
        ocr: Some(OcrConfig {
            backend: "tesseract".to_string(),
            language: "eng+deu".to_string(),
            tesseract_config: Some(TesseractConfig {
                // Plain integers, matching the other Tesseract examples.
                psm: 6, // Tesseract page segmentation mode 6: single uniform block of text
                oem: 3, // Tesseract OCR engine mode 3: default engine selection
                ..Default::default()
            }),
            // `OcrConfig` has further options (for example `element_config`);
            // keep them at their defaults.
            ..Default::default()
        }),
        ..Default::default()
    };
    let result = extract_file_sync("scanned.pdf", None::<&str>, &config)?;
    println!("OCR text: {}", result.content);
    Ok(())
}
```

View File

@@ -0,0 +1,19 @@
```rust title="Rust"
use kreuzberg::{extract_file, ExtractionConfig, TokenReductionConfig};
#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
let config = ExtractionConfig {
token_reduction: Some(TokenReductionConfig {
mode: "moderate".to_string(),
preserve_important_words: true,
}),
..Default::default()
};
let result = extract_file("document.pdf", None::<&str>, &config).await?;
println!("Original tokens: {}", result.token_count);
println!("Reduced content: {}", result.content);
Ok(())
}
```

View File

@@ -0,0 +1,86 @@
```rust title="usage.rs"
use std::process::Command;
use anyhow::Result;
use reqwest::multipart;
use std::fs;
/// Runs the Kreuzberg API server in a Docker container and talks to it over HTTP.
struct DockerKreuzbergClient {
    /// Name passed to `docker run --name`, reused to stop and remove the container.
    container_name: String,
    /// Host port published to the container's port 8000.
    api_port: u16,
}

impl DockerKreuzbergClient {
    /// Creates a client for a container named `container_name` published on `api_port`.
    fn new(container_name: &str, api_port: u16) -> Self {
        Self {
            container_name: container_name.to_string(),
            api_port,
        }
    }

    /// Starts `image` detached with the API port published.
    ///
    /// # Errors
    /// Fails when `docker` cannot be spawned or exits with a non-zero status.
    fn start_container(&self, image: &str) -> Result<()> {
        println!("Starting Kreuzberg Docker container...");
        let port_mapping = format!("{}:8000", self.api_port);
        let status = Command::new("docker")
            .args([
                "run",
                "-d",
                "--name",
                self.container_name.as_str(),
                "-p",
                port_mapping.as_str(),
                image,
            ])
            .status()?;
        if !status.success() {
            anyhow::bail!("Failed to start Docker container");
        }
        println!("Container started on http://localhost:{}", self.api_port);
        Ok(())
    }

    /// Uploads `file_path` as multipart form data to `/api/extract` and returns the
    /// `content` string of the JSON response (empty when that key is absent or not a string).
    ///
    /// # Errors
    /// Fails when the file cannot be read, the request fails, the server answers with
    /// an HTTP error status, or the body is not valid JSON.
    async fn extract_file(&self, file_path: &str) -> Result<String> {
        let file_bytes = fs::read(file_path)?;
        let file_part = multipart::Part::bytes(file_bytes).file_name(file_path.to_string());
        let form = multipart::Form::new().part("file", file_part);
        let endpoint = format!("http://localhost:{}/api/extract", self.api_port);
        let response = reqwest::Client::new()
            .post(&endpoint)
            .multipart(form)
            .send()
            .await?
            // Turn 4xx/5xx answers into errors instead of parsing an error body as a result.
            .error_for_status()?;
        let json: serde_json::Value = response.json().await?;
        Ok(json["content"].as_str().unwrap_or("").to_string())
    }

    /// Stops and then removes the container.
    ///
    /// Only a failure to spawn `docker` is reported; the exit status of each command
    /// is not inspected, so removal is still attempted when `docker stop` reports failure.
    fn stop_container(&self) -> Result<()> {
        println!("Stopping Kreuzberg Docker container...");
        Command::new("docker")
            .args(["stop", self.container_name.as_str()])
            .status()?;
        Command::new("docker")
            .args(["rm", self.container_name.as_str()])
            .status()?;
        println!("Container stopped and removed");
        Ok(())
    }
}
/// Starts the container, extracts `document.pdf` through its HTTP API, and always
/// tears the container down again, including when the extraction fails.
#[tokio::main]
async fn main() -> Result<()> {
    let docker_client = DockerKreuzbergClient::new("kreuzberg-api", 8000);
    docker_client.start_container("kreuzberg:latest")?;
    // Give the server inside the container a moment to start listening.
    tokio::time::sleep(tokio::time::Duration::from_secs(2)).await;
    // Hold on to the outcome instead of returning early with `?`, so a failed
    // extraction does not leave the container running.
    let extraction = docker_client.extract_file("document.pdf").await;
    if let Ok(content) = &extraction {
        println!("Extracted content:\n{}", content);
    }
    docker_client.stop_container()?;
    extraction.map(|_| ())
}
```

View File

@@ -0,0 +1,16 @@
```rust title="Rust"
use kreuzberg::{extract_file_sync, ExtractionConfig};
fn main() -> kreuzberg::Result<()> {
let config = ExtractionConfig {
use_cache: true,
enable_quality_processing: true,
..Default::default()
};
let result = extract_file_sync("document.pdf", None, &config)?;
println!("{}", result.content);
println!("MIME Type: {}", result.mime_type);
Ok(())
}
```

View File

@@ -0,0 +1,12 @@
```rust title="Rust"
use kreuzberg::extract_file_sync;
/// Extracts `document.pdf` with the default configuration and prints the content,
/// the number of tables, and the format metadata.
fn main() -> kreuzberg::Result<()> {
    let result = extract_file_sync("document.pdf", None, &Default::default())?;
    println!("Extracted content: {}", result.content);
    println!("Tables found: {}", result.tables.len());
    // `metadata` is a plain struct on the result (not an `Option`), so read the
    // optional `format` field from it directly.
    println!("Format: {:?}", result.metadata.format.as_ref());
    Ok(())
}
```

View File

@@ -0,0 +1,20 @@
```rust title="Rust"
use kreuzberg::{extract_file_sync, ExtractionConfig, OcrConfig};
fn main() -> kreuzberg::Result<()> {
let config = ExtractionConfig {
force_ocr: true,
ocr: Some(OcrConfig {
backend: "tesseract".to_string(),
language: "eng".to_string(),
..Default::default()
}),
..Default::default()
};
let result = extract_file_sync("scanned.pdf", None, &config)?;
println!("{}", result.content);
println!("Detected languages: {:?}", result.detected_languages);
Ok(())
}
```

View File

@@ -0,0 +1,9 @@
```rust title="Rust"
use kreuzberg::extract_file_sync;
/// Smallest possible example: extract `document.pdf` with defaults and print the content.
fn main() -> kreuzberg::Result<()> {
    let default_config = Default::default();
    let extraction = extract_file_sync("document.pdf", None, &default_config)?;
    println!("{}", extraction.content);
    Ok(())
}
```

View File

@@ -0,0 +1,10 @@
```rust title="Rust"
use kreuzberg::extract_file_sync;
/// Extracts `document.pdf` with defaults and reports whether any content came back
/// and how long it is.
fn main() -> kreuzberg::Result<()> {
    let result = extract_file_sync("document.pdf", None, &Default::default())?;
    println!("Extraction successful: {}", !result.content.is_empty());
    // `String::len` counts bytes; the message promises characters, so count `char`s.
    println!("Content length: {} characters", result.content.chars().count());
    Ok(())
}
```

View File

@@ -0,0 +1,14 @@
```rust title="Rust"
use std::fs;
use kreuzberg::extract_bytes_sync;
/// Reads `document.pdf` into memory and extracts it from the byte buffer,
/// passing the MIME type explicitly.
fn main() -> kreuzberg::Result<()> {
    let data = fs::read("document.pdf")?;
    let result = extract_bytes_sync(&data, "application/pdf", &Default::default())?;
    println!("{}", result.content);
    println!("Success: true");
    // `String::len` counts bytes; the message promises characters, so count `char`s.
    println!("Content length: {} characters", result.content.chars().count());
    Ok(())
}
```

View File

@@ -0,0 +1,37 @@
```rust title="Rust"
use kreuzberg::{
extract_file, ExtractionConfig, LlmConfig, StructuredExtractionConfig,
};
use serde_json::json;
#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
let config = ExtractionConfig {
structured_extraction: Some(StructuredExtractionConfig {
schema: json!({
"type": "object",
"properties": {
"title": { "type": "string" },
"authors": { "type": "array", "items": { "type": "string" } },
"date": { "type": "string" }
},
"required": ["title", "authors", "date"],
"additionalProperties": false
}),
llm: LlmConfig {
model: "openai/gpt-4o-mini".to_string(),
..Default::default()
},
strict: true,
..Default::default()
}),
..Default::default()
};
let result = extract_file("paper.pdf", None, &config).await?;
if let Some(structured) = &result.structured_output {
println!("{}", structured);
}
Ok(())
}
```

View File

@@ -0,0 +1,41 @@
```rust title="Rust"
use serde_json::json;
use std::io::{BufRead, BufReader, Write};
use std::process::{Command, Stdio};
/// Spawns `kreuzberg mcp`, sends one `tools/call` request for `extract_file` on its
/// stdin as a single JSON line, and prints the first line the server writes to stdout.
///
/// # Errors
/// Fails when the process cannot be spawned, its pipes are unavailable, or any
/// read/write on them fails.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    let mut child = Command::new("kreuzberg")
        .arg("mcp")
        .stdin(Stdio::piped())
        .stdout(Stdio::piped())
        .spawn()?;
    {
        let stdin = child.stdin.as_mut().ok_or("Failed to open stdin")?;
        // JSON-RPC 2.0 requests carry a `jsonrpc` version and an `id`; a message
        // without an `id` is a notification and gets no response.
        // NOTE(review): MCP servers normally expect an `initialize` exchange before
        // `tools/call`; confirm whether `kreuzberg mcp` accepts a bare call.
        let request = json!({
            "jsonrpc": "2.0",
            "id": 1,
            "method": "tools/call",
            "params": {
                "name": "extract_file",
                "arguments": {
                    "path": "document.pdf",
                    "async": true
                }
            }
        });
        stdin.write_all(request.to_string().as_bytes())?;
        stdin.write_all(b"\n")?;
        stdin.flush()?;
    }
    let stdout = child.stdout.take().ok_or("Failed to open stdout")?;
    // Only the first response line is needed; a read error is reported instead of skipped.
    if let Some(line) = BufReader::new(stdout).lines().next() {
        println!("{}", line?);
    }
    child.wait()?;
    Ok(())
}
```

View File

@@ -0,0 +1,10 @@
```rust title="Rust"
use kreuzberg::{ExtractionConfig, mcp::start_mcp_server_with_config};
/// Obtains an extraction configuration via `ExtractionConfig::discover` and starts the
/// MCP server with it.
#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
    let discovered = ExtractionConfig::discover()?;
    start_mcp_server_with_config(discovered).await?;
    Ok(())
}
```

View File

@@ -0,0 +1,18 @@
```rust title="Rust"
use kreuzberg::{ExtractionConfig, ImageExtractionConfig};
/// Builds an image-extraction configuration and prints it.
fn main() {
    let config = ExtractionConfig {
        images: Some(ImageExtractionConfig {
            // These options are plain values (not `Option`s), as in the other
            // image-extraction examples.
            extract_images: true,
            target_dpi: 200,
            max_image_dimension: 2048,
            inject_placeholders: true, // set to false to extract images without markdown references
            auto_adjust_dpi: true,
            ..Default::default()
        }),
        ..Default::default()
    };
    println!("{:?}", config.images);
}
```

View File

@@ -0,0 +1,25 @@
```rust title="Rust"
use kreuzberg::{ExtractionConfig, ImagePreprocessingConfig, OcrConfig, TesseractConfig};
fn main() {
let config = ExtractionConfig {
ocr: Some(OcrConfig {
tesseract_config: Some(TesseractConfig {
preprocessing: Some(ImagePreprocessingConfig {
target_dpi: 300,
denoise: true,
deskew: true,
contrast_enhance: true,
binarization_method: "otsu".to_string(),
..Default::default()
}),
..Default::default()
}),
..Default::default()
}),
..Default::default()
};
println!("{:?}", config.ocr);
}
```

View File

@@ -0,0 +1,15 @@
```rust title="Rust"
use kreuzberg::{ExtractionConfig, LanguageDetectionConfig};
fn main() {
let config = ExtractionConfig {
language_detection: Some(LanguageDetectionConfig {
enabled: true,
min_confidence: 0.9,
detect_multiple: true,
}),
..Default::default()
};
println!("{:?}", config.language_detection);
}
```

View File

@@ -0,0 +1,16 @@
```rust title="Rust"
use kreuzberg::{extract_file, ExtractionConfig, LanguageDetectionConfig};
// Enable detection of multiple languages with a 0.8 confidence floor.
let detection = LanguageDetectionConfig {
    enabled: true,
    min_confidence: 0.8,
    detect_multiple: true,
};
let config = ExtractionConfig {
    language_detection: Some(detection),
    ..ExtractionConfig::default()
};
let result = extract_file("multilingual_document.pdf", None, &config).await?;
println!("Detected languages: {:?}", result.detected_languages);
```

View File

@@ -0,0 +1,82 @@
```rust title="Rust"
use kreuzberg::{extract_file_sync, ExtractionConfig};
/// Extracts a PDF and an HTML page with the default configuration and prints the
/// format-specific metadata that each result carries.
fn main() -> kreuzberg::Result<()> {
    let defaults = ExtractionConfig::default();

    // PDF metadata: every field is optional, so print only what is present.
    let pdf_result = extract_file_sync("document.pdf", None, &defaults)?;
    if let Some(pdf) = pdf_result.metadata.pdf {
        if let Some(page_count) = pdf.page_count {
            println!("Pages: {}", page_count);
        }
        if let Some(author) = pdf.author {
            println!("Author: {}", author);
        }
        if let Some(title) = pdf.title {
            println!("Title: {}", title);
        }
    }

    // HTML metadata: nothing further to print when the section is absent.
    let html_result = extract_file_sync("page.html", None, &defaults)?;
    let Some(html) = html_result.metadata.html else {
        return Ok(());
    };
    if let Some(title) = html.title {
        println!("Title: {}", title);
    }
    if let Some(description) = html.description {
        println!("Description: {}", description);
    }
    // Keywords are printed as a whole collection.
    println!("Keywords: {:?}", html.keywords);
    // Canonical URL (field renamed from `canonical`).
    if let Some(canonical) = html.canonical_url {
        println!("Canonical URL: {}", canonical);
    }
    // Open Graph fields are looked up by key in a map.
    if let Some(og_image) = html.open_graph.get("image") {
        println!("Open Graph Image: {}", og_image);
    }
    if let Some(og_title) = html.open_graph.get("title") {
        println!("Open Graph Title: {}", og_title);
    }
    // Twitter Card fields are looked up the same way.
    if let Some(card_type) = html.twitter_card.get("card") {
        println!("Twitter Card Type: {}", card_type);
    }
    if let Some(language) = html.language {
        println!("Language: {}", language);
    }
    // Iterating an empty collection prints nothing, so no emptiness guard is needed.
    for header in &html.headers {
        println!("Header (level {}): {}", header.level, header.text);
    }
    for link in &html.links {
        println!("Link: {} ({})", link.href, link.text);
    }
    for image in &html.images {
        println!("Image: {}", image.src);
    }
    // The structured-data count is reported only when there is at least one item.
    let structured_items = html.structured_data.len();
    if structured_items > 0 {
        println!("Structured data items: {}", structured_items);
    }
    Ok(())
}
```

View File

@@ -0,0 +1,26 @@
```rust title="Rust"
use kreuzberg::{extract_file_sync, ExtractionConfig};
/// Extracts `document.pdf` and, for the first three page boundaries, prints the page
/// number, its byte range within the content, and a short preview of the page text.
///
/// Returns early without printing when the result carries no page metadata or no
/// boundaries.
fn main() -> kreuzberg::Result<()> {
    let config = ExtractionConfig::default();
    let result = extract_file_sync("document.pdf", None, &config)?;
    let Some(pages) = &result.metadata.pages else {
        return Ok(());
    };
    let Some(boundaries) = &pages.boundaries else {
        return Ok(());
    };
    for boundary in boundaries.iter().take(3) {
        let page_text = &result.content[boundary.byte_start..boundary.byte_end];
        // Truncate by characters: slicing at a fixed byte offset panics when that
        // offset falls inside a multi-byte UTF-8 sequence.
        let preview: String = page_text.chars().take(100).collect();
        println!("Page {}:", boundary.page_number);
        println!(" Byte range: {}-{}", boundary.byte_start, boundary.byte_end);
        println!(" Preview: {}...", preview);
    }
    Ok(())
}
```

View File

@@ -0,0 +1,20 @@
Use Kreuzberg::{extract_file_sync, ExtractionConfig, PageConfig};
Let config = ExtractionConfig {
pages: Some(PageConfig {
extract_pages: true,
..Default::default()
}),
..Default::default()
};
Let result = extract_file_sync("document.pdf", &config)?;
If let Some(pages) = result.pages {
for page in pages {
println!("Page {}:", page.page_number);
println!(" Content: {} chars", page.content.len());
println!(" Tables: {}", page.tables.len());
println!(" Images: {}", page.images.len());
}
}

View File

@@ -0,0 +1,79 @@
```rust title="Rust"
use kreuzberg::plugins::{Plugin, PostProcessor, ProcessingStage};
use kreuzberg::{Result, ExtractionResult, ExtractionConfig};
use async_trait::async_trait;
use std::sync::atomic::{AtomicUsize, Ordering};
/// Post-processor state: counts how many results have been processed.
struct PdfMetadataExtractor {
    /// Number of processed results; atomic so it can be updated through `&self`.
    processed_count: AtomicUsize,
}

impl PdfMetadataExtractor {
    /// Creates an extractor whose counter starts at zero.
    fn new() -> Self {
        let processed_count = AtomicUsize::new(0);
        Self { processed_count }
    }
}
/// Plugin identity and lifecycle hooks for the metadata extractor.
impl Plugin for PdfMetadataExtractor {
    fn name(&self) -> &str {
        "pdf-metadata-extractor"
    }

    fn version(&self) -> String {
        String::from("1.0.0")
    }

    fn description(&self) -> &str {
        "Extracts and enriches PDF metadata"
    }

    /// Logs that the plugin is ready; there is nothing else to set up.
    fn initialize(&self) -> Result<()> {
        log::info!("PDF metadata extractor initialized");
        Ok(())
    }

    /// Logs the number of processed results recorded in the counter.
    fn shutdown(&self) -> Result<()> {
        let processed = self.processed_count.load(Ordering::Acquire);
        log::info!("Processed {} PDFs", processed);
        Ok(())
    }
}
/// Post-processing behaviour: runs early, only for PDF results, and records a
/// processing warning on each result it handles.
#[async_trait]
impl PostProcessor for PdfMetadataExtractor {
    /// Bumps the processed counter and appends an informational warning entry.
    async fn process(
        &self,
        result: &mut ExtractionResult,
        _config: &ExtractionConfig,
    ) -> Result<()> {
        self.processed_count.fetch_add(1, Ordering::AcqRel);
        // NOTE(review): `ProcessingWarning` is not among this snippet's imports;
        // confirm its path and add the `use`.
        let note = ProcessingWarning {
            source: "pdf-metadata-extractor".to_string(),
            message: "PDF metadata extracted successfully".to_string(),
        };
        result.processing_warnings.push(note);
        Ok(())
    }

    fn processing_stage(&self) -> ProcessingStage {
        ProcessingStage::Early
    }

    /// Restricts this processor to results whose MIME type is exactly `application/pdf`.
    fn should_process(&self, result: &ExtractionResult, _config: &ExtractionConfig) -> bool {
        let is_pdf = result.mime_type == "application/pdf";
        is_pdf
    }

    /// Fixed estimate of 10 ms, independent of the result.
    fn estimated_duration_ms(&self, _result: &ExtractionResult) -> u64 {
        10
    }
}
use kreuzberg::plugins::registry::get_post_processor_registry;
use std::sync::Arc;
/// Registers a fresh `PdfMetadataExtractor` with the post-processor registry,
/// passing 50 as the second argument to `register`.
fn register() -> Result<()> {
    let registry = get_post_processor_registry();
    let extractor = PdfMetadataExtractor::new();
    registry.register(Arc::new(extractor), 50)?;
    Ok(())
}
```

View File

@@ -0,0 +1,23 @@
```rust title="Rust"
impl PostProcessor for PdfOnlyProcessor {
async fn process(
&self,
result: &mut ExtractionResult,
_config: &ExtractionConfig
) -> Result<()> {
Ok(())
}
fn processing_stage(&self) -> ProcessingStage {
ProcessingStage::Middle
}
fn should_process(
&self,
result: &ExtractionResult,
_config: &ExtractionConfig
) -> bool {
result.mime_type == "application/pdf"
}
}
```

View File

@@ -0,0 +1,17 @@
```rust title="Rust"
use kreuzberg::{extract_file_sync, ExtractionConfig};
/// Extracts `document.pdf` and, for each table found, prints its row count, its
/// markdown rendering, and then every row of cells.
fn main() -> kreuzberg::Result<()> {
    let config = ExtractionConfig::default();
    let extraction = extract_file_sync("document.pdf", None, &config)?;
    for table in extraction.tables.iter() {
        let row_count = table.cells.len();
        println!("Table with {} rows", row_count);
        println!("{}", table.markdown);
        table.cells.iter().for_each(|row| println!("{:?}", row));
    }
    Ok(())
}
```

View File

@@ -0,0 +1,55 @@
```rust title="Rust"
use kreuzberg::{extract_file, ExtractionConfig, ChunkingConfig, EmbeddingConfig};
/// One document chunk paired with its embedding and descriptive metadata, in the
/// shape produced by `extract_and_vectorize`.
struct VectorRecord {
/// Record identifier.
id: String,
/// Text of the chunk.
content: String,
/// Embedding vector for the chunk text.
embedding: Vec<f32>,
/// Free-form string key/value pairs describing the record.
metadata: std::collections::HashMap<String, String>,
}
async fn extract_and_vectorize(
document_path: &str,
document_id: &str,
) -> Result<Vec<VectorRecord>, Box<dyn std::error::Error>> {
let config = ExtractionConfig {
chunking: Some(ChunkingConfig {
max_characters: 512,
overlap: 50,
embedding: Some(EmbeddingConfig {
model: kreuzberg::EmbeddingModelType::Preset {
name: "balanced".to_string(),
},
normalize: true,
batch_size: 32,
..Default::default()
}),
..Default::default()
}),
..Default::default()
};
let result = extract_file(document_path, None, &config).await?;
let mut records = Vec::new();
if let Some(chunks) = result.chunks {
for (index, chunk) in chunks.iter().enumerate() {
if let Some(embedding) = &chunk.embedding {
let mut metadata = std::collections::HashMap::new();
metadata.insert("document_id".to_string(), document_id.to_string());
metadata.insert("chunk_index".to_string(), index.to_string());
metadata.insert("content_length".to_string(), chunk.content.len().to_string());
records.push(VectorRecord {
id: format!("{}_chunk_{}", document_id, index),
content: chunk.content.clone(),
embedding: embedding.clone(),
metadata,
});
}
}
}
Ok(records)
}
```

View File

@@ -0,0 +1,61 @@
```rust title="Rust"
use kreuzberg::plugins::{Plugin, OcrBackend, OcrBackendType};
use kreuzberg::{Result, ExtractionResult, OcrConfig, Metadata};
use async_trait::async_trait;
use std::path::Path;
/// Example custom OCR backend that delegates recognition to a cloud service.
struct CloudOcrBackend {
/// Credential for the cloud service (presumably; it is not read by the code shown here).
api_key: String,
/// Language codes this backend reports as supported.
supported_langs: Vec<String>,
}
/// Plugin identity for the cloud backend; both lifecycle hooks are no-ops.
impl Plugin for CloudOcrBackend {
    fn name(&self) -> &str {
        "cloud-ocr"
    }

    fn version(&self) -> String {
        String::from("1.0.0")
    }

    fn initialize(&self) -> Result<()> {
        Ok(())
    }

    fn shutdown(&self) -> Result<()> {
        Ok(())
    }
}
/// OCR behaviour: forwards the image to the cloud API and wraps the returned text in
/// a plain-text extraction result.
#[async_trait]
impl OcrBackend for CloudOcrBackend {
    /// Recognises text in `image_bytes` using the language from `config`.
    async fn process_image(
        &self,
        image_bytes: &[u8],
        config: &OcrConfig,
    ) -> Result<ExtractionResult> {
        let recognized = self.call_cloud_api(image_bytes, &config.language).await?;
        let extraction = ExtractionResult {
            content: recognized,
            mime_type: String::from("text/plain"),
            metadata: Metadata::default(),
            tables: Vec::new(),
            detected_languages: None,
            chunks: None,
            images: None,
        };
        Ok(extraction)
    }

    /// Exact-match lookup of `lang` in the configured language list.
    fn supports_language(&self, lang: &str) -> bool {
        self.supported_langs
            .iter()
            .any(|known| known.as_str() == lang)
    }

    fn backend_type(&self) -> OcrBackendType {
        OcrBackendType::Custom
    }

    /// Returns a copy of the configured language list.
    fn supported_languages(&self) -> Vec<String> {
        self.supported_langs.to_vec()
    }
}
impl CloudOcrBackend {
    /// Placeholder for the real cloud request: ignores both arguments and returns a
    /// fixed string.
    async fn call_cloud_api(&self, image: &[u8], language: &str) -> Result<String> {
        let placeholder = String::from("Extracted text");
        Ok(placeholder)
    }
}
```

View File

@@ -0,0 +1,18 @@
```rust title="Rust"
use kreuzberg::{ExtractionConfig, OcrConfig, TesseractConfig};
fn main() {
let config = ExtractionConfig {
ocr: Some(OcrConfig {
backend: "tesseract".to_string(),
language: "eng+fra".to_string(),
tesseract_config: Some(TesseractConfig {
psm: 3,
..Default::default()
}),
..Default::default()
}),
..Default::default()
};
}
```

View File

@@ -0,0 +1,21 @@
```rust title="Rust"
use kreuzberg::{extract_file_sync, ExtractionConfig, ImageExtractionConfig};
fn main() -> kreuzberg::Result<()> {
let config = ExtractionConfig {
images: Some(ImageExtractionConfig {
extract_images: true,
target_dpi: 200,
max_image_dimension: 2048,
inject_placeholders: true, // set to false to extract images without markdown references
auto_adjust_dpi: true,
..Default::default()
}),
..Default::default()
};
let result = extract_file_sync("document.pdf", None, &config)?;
println!("content length: {}", result.content.len());
Ok(())
}
```

View File

@@ -0,0 +1,33 @@
```rust title="Rust"
use kreuzberg::{
extract_file_sync, ExtractionConfig, ImagePreprocessingConfig, OcrConfig, TesseractConfig,
};
fn main() -> kreuzberg::Result<()> {
let preprocessing = ImagePreprocessingConfig {
target_dpi: 300,
denoise: true,
deskew: true,
contrast_enhance: true,
binarization_method: "otsu".to_string(),
..Default::default()
};
let config = ExtractionConfig {
ocr: Some(OcrConfig {
backend: "tesseract".to_string(),
language: "eng".to_string(),
tesseract_config: Some(TesseractConfig {
preprocessing: Some(preprocessing),
..Default::default()
}),
..Default::default()
}),
..Default::default()
};
let result = extract_file_sync("document.pdf", None, &config)?;
println!("content length: {}", result.content.len());
Ok(())
}
```

View File

@@ -0,0 +1,20 @@
```rust title="Rust"
use kreuzberg::{extract_file_sync, ExtractionConfig, OcrConfig, PdfConfig};
/// Runs Tesseract OCR on `scanned.pdf` with the PDF DPI option set to 300 and prints
/// the recognised text.
fn main() -> kreuzberg::Result<()> {
    let config = ExtractionConfig {
        ocr: Some(OcrConfig {
            backend: "tesseract".to_string(),
            ..Default::default()
        }),
        pdf_options: Some(PdfConfig {
            dpi: Some(300),
            ..Default::default()
        }),
        ..Default::default()
    };
    let result = extract_file_sync("scanned.pdf", None, &config)?;
    // Use the result (as the sibling OCR examples do) instead of leaving it unused.
    println!("{}", result.content);
    Ok(())
}
```

View File

@@ -0,0 +1,19 @@
```rust title="Rust"
use kreuzberg::{extract_file, ExtractionConfig, OcrConfig};
#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
let config = ExtractionConfig {
ocr: Some(OcrConfig {
backend: "easyocr".to_string(),
language: "en".to_string(),
..Default::default()
}),
..Default::default()
};
let result = extract_file("document.pdf", None, &config).await?;
println!("Extracted text: {}", result.content);
Ok(())
}
```

View File

@@ -0,0 +1,35 @@
```rust title="Rust"
use kreuzberg::{extract_file, ExtractionConfig, OcrConfig};
use kreuzberg::types::OcrElementConfig;
/// Asynchronously extracts `scanned.pdf` using the `paddleocr` backend with
/// OCR element output enabled, then prints each element's text, recognition
/// confidence, geometry and (when present) rotation angle.
#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
    let element_config = OcrElementConfig {
        include_elements: true,
        ..Default::default()
    };
    let ocr = OcrConfig {
        backend: String::from("paddleocr"),
        language: String::from("en"),
        element_config: Some(element_config),
        ..Default::default()
    };
    let config = ExtractionConfig {
        ocr: Some(ocr),
        ..Default::default()
    };

    let result = extract_file("scanned.pdf", None, &config).await?;

    // `ocr_elements` is optional; flattening the `Option` makes the loop a
    // no-op when it is `None`.
    for element in result.ocr_elements.iter().flatten() {
        println!("Text: {}", element.text);
        println!("Confidence: {:.2}", element.confidence.recognition);
        println!("Geometry: {:?}", element.geometry);
        if let Some(rotation) = element.rotation.as_ref() {
            println!("Rotation: {}°", rotation.angle_degrees);
        }
        // Blank line between elements.
        println!();
    }
    Ok(())
}
```

View File

@@ -0,0 +1,18 @@
```rust title="Rust"
use kreuzberg::{extract_file_sync, ExtractionConfig, OcrConfig};
/// Extracts `scanned.pdf` with the Tesseract backend (language `eng`) and
/// prints the extracted content.
fn main() -> kreuzberg::Result<()> {
    let ocr = OcrConfig {
        backend: String::from("tesseract"),
        language: String::from("eng"),
        ..Default::default()
    };
    let config = ExtractionConfig {
        ocr: Some(ocr),
        ..Default::default()
    };

    let extraction = extract_file_sync("scanned.pdf", None, &config)?;
    println!("{}", extraction.content);
    Ok(())
}
```

View File

@@ -0,0 +1,18 @@
```rust title="Rust"
use kreuzberg::{extract_file_sync, ExtractionConfig, OcrConfig};
/// Extracts `document.pdf` with the Tesseract backend and `force_ocr` set to
/// `true`, then prints the extracted content.
fn main() -> kreuzberg::Result<()> {
    let ocr = OcrConfig {
        backend: String::from("tesseract"),
        ..Default::default()
    };
    let config = ExtractionConfig {
        ocr: Some(ocr),
        force_ocr: true,
        ..Default::default()
    };

    let extraction = extract_file_sync("document.pdf", None, &config)?;
    println!("{}", extraction.content);
    Ok(())
}
```

View File

@@ -0,0 +1,18 @@
```rust title="Rust"
use kreuzberg::{extract_file_sync, ExtractionConfig, OcrConfig};
/// Extracts `multilingual.pdf` with the Tesseract backend using the combined
/// language string `eng+deu+fra`, then prints the extracted content.
fn main() -> kreuzberg::Result<()> {
    let ocr = OcrConfig {
        backend: String::from("tesseract"),
        language: String::from("eng+deu+fra"),
        ..Default::default()
    };
    let config = ExtractionConfig {
        ocr: Some(ocr),
        ..Default::default()
    };

    let extraction = extract_file_sync("multilingual.pdf", None, &config)?;
    println!("{}", extraction.content);
    Ok(())
}
```

View File

@@ -0,0 +1,20 @@
```rust title="Rust"
use kreuzberg::{extract_file, ExtractionConfig, OcrConfig};
/// Asynchronously extracts `document.pdf` using the `paddleocr` backend with
/// language `en` and prints the extracted text.
#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
    let ocr = OcrConfig {
        backend: String::from("paddleocr"),
        language: String::from("en"),
        // paddle_ocr_config: Some(serde_json::json!({"model_tier": "server"})), // for max accuracy
        ..Default::default()
    };
    let config = ExtractionConfig {
        ocr: Some(ocr),
        ..Default::default()
    };

    let extraction = extract_file("document.pdf", None, &config).await?;
    println!("Extracted text: {}", extraction.content);
    Ok(())
}
```

View File

@@ -0,0 +1,15 @@
```rust title="Rust"
use kreuzberg::{ExtractionConfig, PdfConfig};
/// Builds an `ExtractionConfig` carrying PDF-specific options (image and
/// metadata extraction flags plus candidate passwords) and prints those options.
fn main() {
    let config = ExtractionConfig {
        pdf_options: Some(PdfConfig {
            extract_images: Some(true),
            extract_metadata: Some(true),
            passwords: Some(vec!["password1".to_string(), "password2".to_string()]),
            // Fill every `PdfConfig` field not listed above with its default so
            // the literal is complete regardless of what other fields exist.
            ..Default::default()
        }),
        ..Default::default()
    };
    println!("{:?}", config.pdf_options);
}
```

View File

@@ -0,0 +1,22 @@
```rust title="Rust"
use kreuzberg::{ExtractionConfig, OcrConfig, TesseractConfig};
/// Builds an `ExtractionConfig` with a customised `TesseractConfig` (page
/// segmentation mode, engine mode, confidence threshold, character whitelist,
/// table detection) and prints the resulting OCR configuration.
fn main() {
    let tesseract = TesseractConfig {
        psm: 6,
        oem: 1,
        min_confidence: 0.8,
        tessedit_char_whitelist: String::from(
            "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789 .,!?",
        ),
        enable_table_detection: true,
        ..Default::default()
    };
    let ocr = OcrConfig {
        language: String::from("eng+fra+deu"),
        tesseract_config: Some(tesseract),
        ..Default::default()
    };
    let config = ExtractionConfig {
        ocr: Some(ocr),
        ..Default::default()
    };

    println!("{:?}", config.ocr);
}
```

View File

@@ -0,0 +1,12 @@
```rust title="Rust"
use kreuzberg::{clear_document_extractors, clear_post_processors, clear_ocr_backends, clear_validators};
// Calls each of the four `clear_*` functions in turn (document extractors,
// post-processors, OCR backends, validators) and then prints a confirmation.
// NOTE(review): the return values of the `clear_*` calls are discarded; if they
// return `Result`, failures would go unnoticed here — confirm against the API.
fn main() {
clear_document_extractors();
clear_post_processors();
clear_ocr_backends();
clear_validators();
println!("All plugins cleared");
}
```

View File

@@ -0,0 +1,47 @@
```rust title="Rust"
use kreuzberg::plugins::{EmbeddingBackend, Plugin, register_embedding_backend};
use kreuzberg::{EmbeddingConfig, EmbeddingModelType, Result, embed_texts};
use async_trait::async_trait;
use std::sync::Arc;
// Example embedding backend plugin.
//
// Wraps an already-loaded embedder (e.g. a tuned ONNX session or any
// host-language embedder) so kreuzberg can call back into it during chunking
// and standalone embed requests.
struct MyEmbedder {
// Hold whatever model handles the host already owns (none in this example).
}
/// Plugin identity and lifecycle hooks for `MyEmbedder`; both hooks are no-ops.
impl Plugin for MyEmbedder {
    /// Name the backend is referred to by (`"my-embedder"`).
    fn name(&self) -> &str {
        "my-embedder"
    }

    /// Version string reported for this plugin.
    fn version(&self) -> String {
        String::from("1.0.0")
    }

    /// Nothing to set up.
    fn initialize(&self) -> Result<()> {
        Ok(())
    }

    /// Nothing to tear down.
    fn shutdown(&self) -> Result<()> {
        Ok(())
    }
}
#[async_trait]
impl EmbeddingBackend for MyEmbedder {
    /// Length of every vector produced by `embed`.
    // Captured once at registration; used for shape validation on every dispatch.
    fn dimensions(&self) -> usize {
        768
    }

    /// Returns one all-zero vector of `dimensions()` floats per input text.
    ///
    /// A real backend would delegate to the already-loaded host model here.
    async fn embed(&self, texts: Vec<String>) -> Result<Vec<Vec<f32>>> {
        let placeholder = vec![0.0; self.dimensions()];
        Ok(vec![placeholder; texts.len()])
    }
}
/// Registers `MyEmbedder`, selects it by name in an `EmbeddingConfig`, embeds
/// two sample strings and checks that two vectors come back.
fn main() -> Result<()> {
    // Register once at startup.
    let backend = Arc::new(MyEmbedder {});
    register_embedding_backend(backend)?;

    let config = EmbeddingConfig {
        model: EmbeddingModelType::Plugin {
            name: String::from("my-embedder"),
        },
        // Optional: bound the wait on a hung backend (default 60s; `None` disables).
        max_embed_duration_secs: Some(30),
        ..Default::default()
    };

    let texts = ["Hello, world!", "Second text"];
    let vectors = embed_texts(&texts, &config)?;
    assert_eq!(vectors.len(), 2);
    Ok(())
}
```

View File

@@ -0,0 +1,11 @@
```rust title="Rust"
use kreuzberg::plugins::registry::get_document_extractor_registry;
use std::sync::Arc;
/// Registers a `CustomJsonExtractor` with the registry returned by
/// `get_document_extractor_registry()`.
///
/// # Errors
/// Propagates any error returned by the registry's `register` call.
///
/// # Panics
/// Panics if write access to the registry cannot be acquired.
fn register_custom_extractor() -> kreuzberg::Result<()> {
    let extractor = Arc::new(CustomJsonExtractor);
    let registry = get_document_extractor_registry();
    registry
        .write()
        // Failing to take the lock is a broken invariant, not a recoverable
        // condition, so state it explicitly instead of a bare `unwrap()`.
        .expect("failed to acquire write access to the document extractor registry")
        .register(extractor)?;
    Ok(())
}
```

View File

@@ -0,0 +1,19 @@
```rust title="Rust"
use kreuzberg::plugins::registry::*;
// Fetch each plugin registry, call `list()` on it and print the result.
// NOTE(review): presumably the registry handle needs a read guard before
// `list()` can be called — confirm against the registry API.
let extractor_registry = get_document_extractor_registry();
let extractors = extractor_registry.list()?;
println!("Registered extractors: {:?}", extractors);

let post_processor_registry = get_post_processor_registry();
let processors = post_processor_registry.list()?;
println!("Registered processors: {:?}", processors);

let ocr_backend_registry = get_ocr_backend_registry();
let backends = ocr_backend_registry.list()?;
println!("Registered OCR backends: {:?}", backends);

let validator_registry = get_validator_registry();
let validators = validator_registry.list()?;
println!("Registered validators: {:?}", validators);
```

View File

@@ -0,0 +1,38 @@
```rust title="Rust"
use kreuzberg::plugins::{Plugin, Validator};
use kreuzberg::{Result, ExtractionResult, ExtractionConfig, KreuzbergError};
use async_trait::async_trait;
// Example validator plugin that rejects extraction results whose content is
// shorter than a configured minimum.
struct MinLengthValidator {
// Minimum accepted content length; shorter results fail validation.
min_length: usize,
}
/// Plugin identity and lifecycle hooks for `MinLengthValidator`; both hooks
/// are no-ops.
impl Plugin for MinLengthValidator {
    /// Name reported for this plugin.
    fn name(&self) -> &str {
        "min-length-validator"
    }

    /// Version string reported for this plugin.
    fn version(&self) -> String {
        String::from("1.0.0")
    }

    /// Nothing to set up.
    fn initialize(&self) -> Result<()> {
        Ok(())
    }

    /// Nothing to tear down.
    fn shutdown(&self) -> Result<()> {
        Ok(())
    }
}
#[async_trait]
impl Validator for MinLengthValidator {
    /// Fails validation when the extracted content has fewer than
    /// `min_length` characters.
    ///
    /// # Errors
    /// Returns `KreuzbergError::validation` with the actual and required
    /// character counts when the content is too short.
    async fn validate(
        &self,
        result: &ExtractionResult,
        _config: &ExtractionConfig,
    ) -> Result<()> {
        // Count Unicode scalar values rather than bytes: `len()` reports UTF-8
        // bytes, which over-counts non-ASCII text and contradicts the
        // "characters" wording of the error message.
        let char_count = result.content.chars().count();
        if char_count < self.min_length {
            return Err(KreuzbergError::validation(format!(
                "Content too short: {} < {} characters",
                char_count, self.min_length
            )));
        }
        Ok(())
    }

    /// Priority value reported for this validator.
    fn priority(&self) -> i32 {
        100
    }
}
```

View File

@@ -0,0 +1,69 @@
```rust title="Rust"
use kreuzberg::plugins::{Plugin, PostProcessor, ProcessingStage, register_post_processor};
use kreuzberg::{Result, ExtractionResult, ExtractionConfig};
use async_trait::async_trait;
use std::sync::Arc;
use std::sync::atomic::{AtomicUsize, Ordering};
use serde_json::json;
// Example post-processor plugin that annotates PDF extraction results with
// extra metadata.
struct PdfMetadataExtractor {
// Number of PDF results processed so far; atomic because the trait methods
// only receive `&self`.
processed_count: AtomicUsize,
}
/// Plugin identity and lifecycle hooks for `PdfMetadataExtractor`.
impl Plugin for PdfMetadataExtractor {
    /// Name reported for this plugin.
    fn name(&self) -> &str {
        "pdf-metadata-extractor"
    }

    /// Version string reported for this plugin.
    fn version(&self) -> String {
        String::from("1.0.0")
    }

    /// Resets the processed-document counter to zero.
    fn initialize(&self) -> Result<()> {
        self.processed_count.store(0, Ordering::Release);
        Ok(())
    }

    /// Nothing to tear down.
    fn shutdown(&self) -> Result<()> {
        Ok(())
    }
}
#[async_trait]
impl PostProcessor for PdfMetadataExtractor {
    /// Adds bookkeeping entries to `result.metadata.additional` for PDF
    /// results; any other MIME type is left untouched.
    ///
    /// Entries written: `pdf_processed`, `pdf_order` (1-based processing
    /// order), `content_length` and `pdf_processor_version`.
    async fn process(
        &self,
        result: &mut ExtractionResult,
        _config: &ExtractionConfig,
    ) -> Result<()> {
        if result.mime_type == "application/pdf" {
            // `fetch_add` returns the previous value, so add one for a 1-based order.
            let order = self.processed_count.fetch_add(1, Ordering::AcqRel) + 1;
            let content_length = result.content.len();

            let entries = [
                ("pdf_processed", json!(true)),
                ("pdf_order", json!(order)),
                ("content_length", json!(content_length)),
                ("pdf_processor_version", json!("1.0.0")),
            ];
            for (key, value) in entries {
                result.metadata.additional.insert(key.to_string(), value);
            }
        }
        Ok(())
    }

    /// Stage this processor reports: `ProcessingStage::Early`.
    fn processing_stage(&self) -> ProcessingStage {
        ProcessingStage::Early
    }

    /// Returns `true` only for results whose MIME type is `application/pdf`.
    fn should_process(
        &self,
        result: &ExtractionResult,
        _config: &ExtractionConfig,
    ) -> bool {
        let is_pdf = result.mime_type == "application/pdf";
        is_pdf
    }
}
/// Registers a `PdfMetadataExtractor` whose counter starts at zero.
fn main() -> Result<()> {
    let processor = PdfMetadataExtractor {
        processed_count: AtomicUsize::new(0),
    };
    register_post_processor(Arc::new(processor))?;
    Ok(())
}
```

View File

@@ -0,0 +1,50 @@
```rust title="Rust"
use kreuzberg::plugins::{Plugin, PostProcessor, ProcessingStage, register_post_processor};
use kreuzberg::{Result, ExtractionResult, ExtractionConfig};
use async_trait::async_trait;
use std::sync::Arc;
use serde_json::json;
// Example stateless post-processor plugin that only acts on PDF results.
struct PdfOnlyProcessor;
/// Plugin identity and lifecycle hooks for `PdfOnlyProcessor`; both hooks are
/// no-ops.
impl Plugin for PdfOnlyProcessor {
    /// Name reported for this plugin.
    fn name(&self) -> &str {
        "pdf-only-processor"
    }

    /// Version string reported for this plugin.
    fn version(&self) -> String {
        String::from("1.0.0")
    }

    /// Nothing to set up.
    fn initialize(&self) -> Result<()> {
        Ok(())
    }

    /// Nothing to tear down.
    fn shutdown(&self) -> Result<()> {
        Ok(())
    }
}
#[async_trait]
impl PostProcessor for PdfOnlyProcessor {
    /// Marks PDF results by inserting `pdf_processed = true` into
    /// `result.metadata.additional`; other MIME types are left untouched.
    async fn process(
        &self,
        result: &mut ExtractionResult,
        _config: &ExtractionConfig,
    ) -> Result<()> {
        let is_pdf = result.mime_type == "application/pdf";
        if is_pdf {
            result
                .metadata
                .additional
                .insert(String::from("pdf_processed"), json!(true));
        }
        Ok(())
    }

    /// Stage this processor reports: `ProcessingStage::Early`.
    fn processing_stage(&self) -> ProcessingStage {
        ProcessingStage::Early
    }

    /// Returns `true` only for results whose MIME type is `application/pdf`.
    fn should_process(
        &self,
        result: &ExtractionResult,
        _config: &ExtractionConfig,
    ) -> bool {
        let is_pdf = result.mime_type == "application/pdf";
        is_pdf
    }
}
/// Registers a `PdfOnlyProcessor` as a post-processor.
fn main() -> Result<()> {
    let processor = Arc::new(PdfOnlyProcessor);
    register_post_processor(processor)?;
    Ok(())
}
```

View File

@@ -0,0 +1,53 @@
```rust title="Rust"
use kreuzberg::plugins::{DocumentExtractor, Plugin};
use kreuzberg::{Result, ExtractionResult, ExtractionConfig, Metadata};
use async_trait::async_trait;
use std::path::Path;
// Example stateless document extractor plugin for JSON input.
struct CustomJsonExtractor;
/// Plugin identity and lifecycle hooks for `CustomJsonExtractor`; both hooks
/// are no-ops.
impl Plugin for CustomJsonExtractor {
    /// Name reported for this plugin.
    fn name(&self) -> &str {
        "custom-json-extractor"
    }

    /// Version string reported for this plugin.
    fn version(&self) -> String {
        String::from("1.0.0")
    }

    /// Nothing to set up.
    fn initialize(&self) -> Result<()> {
        Ok(())
    }

    /// Nothing to tear down.
    fn shutdown(&self) -> Result<()> {
        Ok(())
    }
}
#[async_trait]
impl DocumentExtractor for CustomJsonExtractor {
    /// Parses `content` as JSON and returns an `ExtractionResult` whose
    /// content is the text gathered by `extract_text_from_json`.
    ///
    /// The incoming `_mime_type` and `_config` are ignored; the result is
    /// always tagged `application/json`.
    ///
    /// # Errors
    /// Propagates the parse error (via `?`) when `content` is not valid JSON.
    async fn extract_bytes(
        &self,
        content: &[u8],
        _mime_type: &str,
        _config: &ExtractionConfig,
    ) -> Result<ExtractionResult> {
        let json: serde_json::Value = serde_json::from_slice(content)?;
        let text = extract_text_from_json(&json);
        Ok(ExtractionResult {
            content: text,
            mime_type: "application/json".to_string(),
            metadata: Metadata::default(),
            tables: vec![],
            detected_languages: None,
            chunks: None,
            images: None,
            // `ExtractionResult` has more fields than the ones listed above;
            // take the rest from `Default` so the literal is complete.
            ..Default::default()
        })
    }

    /// MIME types this extractor declares support for.
    fn supported_mime_types(&self) -> &[&str] {
        &["application/json", "text/json"]
    }

    /// Priority value reported for this extractor.
    fn priority(&self) -> i32 {
        50
    }
}
fn extract_text_from_json(value: &serde_json::Value) -> String {
match value {
serde_json::Value::String(s) => format!("{}\n", s),
serde_json::Value::Array(arr) => arr.iter().map(extract_text_from_json).collect(),
serde_json::Value::Object(obj) => obj.values().map(extract_text_from_json).collect(),
_ => String::new(),
}
}
```

View File

@@ -0,0 +1,35 @@
```rust title="Rust"
use log::{info, warn, error};
// Lifecycle hooks for `MyPlugin` that log at `info` level when the plugin is
// initialized and shut down.
// NOTE(review): `name()` is called but not defined in this snippet —
// presumably it and the other trait methods are elided for brevity; confirm.
impl Plugin for MyPlugin {
// Logs the plugin name, then reports success.
fn initialize(&self) -> Result<()> {
info!("Initializing plugin: {}", self.name());
Ok(())
}
// Logs the plugin name, then reports success.
fn shutdown(&self) -> Result<()> {
info!("Shutting down plugin: {}", self.name());
Ok(())
}
}
#[async_trait]
impl DocumentExtractor for MyPlugin {
    /// Logs the MIME type and input size, then returns a default
    /// `ExtractionResult`, warning when its content is empty.
    ///
    /// This is a logging illustration: the result is never filled from
    /// `content`.
    async fn extract_bytes(
        &self,
        content: &[u8],
        mime_type: &str,
        _config: &ExtractionConfig,
    ) -> Result<ExtractionResult> {
        info!("Extracting {} ({} bytes)", mime_type, content.len());

        let extracted = ExtractionResult::default();
        let nothing_extracted = extracted.content.is_empty();
        if nothing_extracted {
            warn!("Extraction resulted in empty content");
        }
        Ok(extracted)
    }
}
```

View File

@@ -0,0 +1,22 @@
```rust title="Rust"
#[cfg(test)]
mod tests {
    use super::*;

    /// Feeds a small JSON document through `CustomJsonExtractor` and checks
    /// both the extracted text and the reported MIME type.
    #[tokio::test]
    async fn test_custom_extractor() {
        let payload: &[u8] = br#"{"message": "Hello, world!"}"#;
        let config = ExtractionConfig::default();
        let extractor = CustomJsonExtractor;

        let outcome = extractor
            .extract_bytes(payload, "application/json", &config)
            .await;
        let result = outcome.expect("Extraction failed");

        assert!(result.content.contains("Hello, world!"));
        assert_eq!(result.mime_type, "application/json");
    }
}
```

Some files were not shown because too many files have changed in this diff Show More