Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,63 @@
```rust title="Rust"
use kreuzberg::{extract_file, ExtractionConfig, OcrConfig, ChunkingConfig, LanguageDetectionConfig, TokenReductionConfig, PostProcessorConfig, EmbeddingConfig, EmbeddingModelType};
use kreuzberg::keywords::{KeywordConfig, KeywordAlgorithm};
/// Extracts `document.pdf` with every optional stage configured (OCR,
/// chunking with embeddings, language detection, keywords, token reduction
/// and post-processing) and prints a short summary of the result.
#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
    let config = ExtractionConfig {
        use_cache: true,
        enable_quality_processing: true,
        ocr: Some(OcrConfig {
            backend: "tesseract".to_string(),
            language: "eng".to_string(),
            tesseract_config: None,
            output_format: None,
            paddle_ocr_config: None,
            element_config: None,
        }),
        chunking: Some(ChunkingConfig {
            max_characters: 1000,
            overlap: 200,
            embedding: Some(EmbeddingConfig {
                model: EmbeddingModelType::Preset { name: "balanced".to_string() },
                batch_size: 32,
                normalize: true,
                show_download_progress: false,
                cache_dir: None,
                // Keep the literal valid when `EmbeddingConfig` carries further
                // fields (for example `acceleration`).
                ..Default::default()
            }),
            ..Default::default()
        }),
        language_detection: Some(LanguageDetectionConfig {
            enabled: true,
            min_confidence: 0.8,
            detect_multiple: false,
        }),
        keywords: Some(KeywordConfig {
            algorithm: KeywordAlgorithm::Yake,
            max_keywords: 10,
            min_score: 0.1,
            ngram_range: (1, 3),
            language: Some("en".to_string()),
            ..Default::default()
        }),
        token_reduction: Some(TokenReductionConfig {
            mode: "moderate".to_string(),
            preserve_important_words: true,
        }),
        postprocessor: Some(PostProcessorConfig {
            enabled: true,
            enabled_processors: None,
            disabled_processors: None,
        }),
        ..Default::default()
    };

    let result = extract_file("document.pdf", None::<&str>, &config).await?;
    println!("Content: {}", result.content);
    if let Some(langs) = &result.detected_languages {
        println!("Languages: {:?}", langs);
    }
    // An absent chunk list is reported as zero chunks.
    println!("Chunks: {}", result.chunks.as_ref().map(|c| c.len()).unwrap_or(0));
    Ok(())
}
```

View File

@@ -0,0 +1,79 @@
```rust title="Rust"
use kreuzberg::{extract_file, ExtractionConfig, ChunkingConfig};
/// Extracts `document.pdf` with chunking enabled and prints the number of
/// chunks followed by the byte length of each chunk's content.
#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
    let config = ExtractionConfig {
        chunking: Some(ChunkingConfig {
            max_characters: 1000,
            overlap: 200,
            ..Default::default()
        }),
        ..Default::default()
    };

    let result = extract_file("document.pdf", None::<&str>, &config).await?;

    // `chunks` is handled as an `Option`; an absent list is treated as empty.
    let chunks = result.chunks.as_deref().unwrap_or_default();
    println!("Chunks: {}", chunks.len());
    for chunk in chunks {
        println!("Length: {}", chunk.content.len());
    }
    Ok(())
}
```
```rust title="Rust - Markdown with Heading Context"
use kreuzberg::{extract_file, ExtractionConfig, ChunkingConfig, ChunkerType, ChunkSizing};
/// Chunks `document.md` with the Markdown chunker and tokenizer-based sizing,
/// then prints each chunk's heading context and a short content preview.
#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
    let config = ExtractionConfig {
        chunking: Some(ChunkingConfig {
            max_characters: 500,
            overlap: 50,
            chunker_type: ChunkerType::Markdown,
            sizing: ChunkSizing::Tokenizer {
                model: "Xenova/gpt-4o".into(),
                cache_dir: None,
            },
            ..Default::default()
        }),
        ..Default::default()
    };

    let result = extract_file("document.md", None::<&str>, &config).await?;

    // `chunks` is handled as an `Option`; an absent list is treated as empty.
    for chunk in result.chunks.as_deref().unwrap_or_default() {
        if let Some(heading_context) = &chunk.metadata.heading_context {
            for heading in &heading_context.headings {
                println!("Heading L{}: {}", heading.level, heading.text);
            }
        }
        // Truncate by characters: slicing a `str` at a fixed byte offset panics
        // when the offset falls inside a multi-byte UTF-8 sequence.
        let preview: String = chunk.content.chars().take(100).collect();
        println!("Content: {}...", preview);
    }
    Ok(())
}
```
```rust title="Rust - Prepend Heading Context"
use kreuzberg::{extract_file, ExtractionConfig, ChunkingConfig, ChunkerType};
/// Chunks `document.md` with the Markdown chunker and
/// `prepend_heading_context` enabled, then prints a preview of every chunk.
#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
    let config = ExtractionConfig {
        chunking: Some(ChunkingConfig {
            max_characters: 500,
            overlap: 50,
            chunker_type: ChunkerType::Markdown,
            prepend_heading_context: true,
            ..Default::default()
        }),
        ..Default::default()
    };

    let result = extract_file("document.md", None::<&str>, &config).await?;

    // `chunks` is handled as an `Option`; an absent list is treated as empty.
    for chunk in result.chunks.as_deref().unwrap_or_default() {
        // Each chunk's content is prefixed with its heading breadcrumb.
        // Truncate by characters: slicing a `str` at a fixed byte offset panics
        // when the offset falls inside a multi-byte UTF-8 sequence.
        let preview: String = chunk.content.chars().take(100).collect();
        println!("Content: {}...", preview);
    }
    Ok(())
}
```

View File

@@ -0,0 +1,16 @@
```rust title="Rust"
use kreuzberg::{extract_file, ExtractionConfig};
/// Extracts `document.pdf` with `use_cache` and `enable_quality_processing`
/// set, then prints the extracted content.
#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
    let settings = ExtractionConfig {
        enable_quality_processing: true,
        use_cache: true,
        ..Default::default()
    };

    let extraction = extract_file("document.pdf", None, &settings).await?;
    let text = &extraction.content;
    println!("{}", text);

    Ok(())
}
```

View File

@@ -0,0 +1,11 @@
```rust title="Rust"
use kreuzberg::{extract_file, ExtractionConfig};
/// Extracts `document.pdf` using a discovered configuration, falling back to
/// the default configuration when discovery yields nothing.
#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
    // `discover()` is fallible and its success value is optional.
    let discovered = ExtractionConfig::discover()?;
    let settings = discovered.unwrap_or_default();

    let extraction = extract_file("document.pdf", None, &settings).await?;
    println!("{}", extraction.content);

    Ok(())
}
```

View File

@@ -0,0 +1,20 @@
```rust title="Rust"
use kreuzberg::{extract_file_sync, ExtractionConfig, OcrConfig};
/// Runs a synchronous extraction of `scanned.pdf` with the Tesseract OCR
/// backend and prints the content length and the number of tables.
fn main() -> kreuzberg::Result<()> {
    // OCR settings are built first so the top-level config stays short.
    let ocr_settings = OcrConfig {
        backend: String::from("tesseract"),
        language: String::from("eng"),
        tesseract_config: None,
        ..Default::default()
    };
    let settings = ExtractionConfig {
        ocr: Some(ocr_settings),
        ..Default::default()
    };

    let extraction = extract_file_sync("scanned.pdf", None, &settings)?;
    let content_len = extraction.content.len();
    let table_count = extraction.tables.len();
    println!("Content length: {}", content_len);
    println!("Tables detected: {}", table_count);

    Ok(())
}
```

View File

@@ -0,0 +1,29 @@
```rust title="Rust"
use kreuzberg::{extract_file_sync, ChunkingConfig, ExtractionConfig, OcrConfig, TesseractConfig};
/// Runs a synchronous extraction of `document.pdf` with caching, quality
/// processing, Tesseract OCR (`eng+deu`, PSM 6) and chunking configured, then
/// prints the content length.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    let tesseract = TesseractConfig {
        psm: 6,
        ..Default::default()
    };
    let ocr_settings = OcrConfig {
        backend: String::from("tesseract"),
        language: String::from("eng+deu"),
        tesseract_config: Some(tesseract),
        ..Default::default()
    };
    let chunk_settings = ChunkingConfig {
        max_characters: 1000,
        overlap: 200,
        ..Default::default()
    };

    let settings = ExtractionConfig {
        use_cache: true,
        enable_quality_processing: true,
        ocr: Some(ocr_settings),
        chunking: Some(chunk_settings),
        ..Default::default()
    };

    let extraction = extract_file_sync("document.pdf", None, &settings)?;
    println!("Content length: {}", extraction.content.len());

    Ok(())
}
```

View File

@@ -0,0 +1,17 @@
```rust title="Document Structure Config (Rust)"
use kreuzberg::{extract_file_sync, ExtractionConfig};
// Request the document structure alongside the extracted content.
let config = ExtractionConfig {
    include_document_structure: true,
    ..Default::default()
};
let result = extract_file_sync("document.pdf", None, &config)?;

// Print each node's type and the first 80 characters of its text, if any.
if let Some(document) = &result.document {
    for node in &document.nodes {
        let text = node.content.text().unwrap_or("");
        // Truncate by characters: slicing a `str` at a fixed byte offset panics
        // when the offset falls inside a multi-byte UTF-8 sequence.
        let preview: String = text.chars().take(80).collect();
        println!("[{}] {}", node.content.node_type_str(), preview);
    }
}
```

View File

@@ -0,0 +1,48 @@
```rust title="Element-Based Output (Rust)"
use kreuzberg::{extract_file_sync, ExtractionConfig};
use kreuzberg::types::OutputFormat as ResultFormat;
/// Extracts `document.pdf` in element-based mode, prints every element with
/// its page number and coordinates when present, then lists the title elements.
fn main() -> kreuzberg::Result<()> {
    // Configure element-based output (result_format controls Unified vs ElementBased)
    let config = ExtractionConfig {
        result_format: ResultFormat::ElementBased,
        ..Default::default()
    };

    // Extract document
    let result = extract_file_sync("document.pdf", None, &config)?;

    // Access elements
    if let Some(elements) = result.elements {
        for element in &elements {
            println!("Type: {:?}", element.element_type);
            // Truncate by characters: slicing a `str` at a fixed byte offset
            // panics when the offset falls inside a multi-byte UTF-8 sequence.
            let preview: String = element.text.chars().take(100).collect();
            println!("Text: {}", preview);
            if let Some(page) = element.metadata.page_number {
                println!("Page: {}", page);
            }
            if let Some(coords) = &element.metadata.coordinates {
                println!(
                    "Coords: ({}, {}) - ({}, {})",
                    coords.x0, coords.y0, coords.x1, coords.y1
                );
            }
            println!("---");
        }

        // Filter by element type; iterate lazily instead of collecting first.
        let titles = elements
            .iter()
            .filter(|e| matches!(e.element_type, kreuzberg::types::ElementType::Title));
        for title in titles {
            // The heading level is looked up under the "level" key of the
            // element's additional metadata; "unknown" is printed when absent.
            let level = title
                .metadata
                .additional
                .get("level")
                .map(|v| v.as_ref())
                .unwrap_or("unknown");
            println!("[{}] {}", level, title.text);
        }
    }
    Ok(())
}
```

View File

@@ -0,0 +1,28 @@
```rust title="Rust"
use kreuzberg::{extract_file, ExtractionConfig, ChunkingConfig, EmbeddingConfig, EmbeddingModelType};
/// Extracts `document.pdf` with chunking and the "balanced" embedding preset,
/// then prints how many chunks were produced.
#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
    let config = ExtractionConfig {
        chunking: Some(ChunkingConfig {
            max_characters: 1000,
            overlap: 200,
            embedding: Some(EmbeddingConfig {
                model: EmbeddingModelType::Preset {
                    name: "balanced".to_string(),
                },
                batch_size: 16,
                normalize: true,
                show_download_progress: true,
                cache_dir: None,
                // Keep the literal valid when `EmbeddingConfig` carries further
                // fields (for example `acceleration`).
                ..Default::default()
            }),
            ..Default::default()
        }),
        ..Default::default()
    };

    let result = extract_file("document.pdf", None::<&str>, &config).await?;
    // An absent chunk list is reported as zero chunks.
    println!(
        "Chunks with embeddings: {}",
        result.chunks.as_ref().map(|c| c.len()).unwrap_or(0)
    );
    Ok(())
}
```

View File

@@ -0,0 +1,85 @@
// Example 1: Preset model (recommended)
// Fast, balanced, or quality preset configurations optimized for common use cases.
let embedding_config = EmbeddingConfig {
    model: EmbeddingModelType::Preset {
        name: "balanced".to_string(),
    },
    batch_size: 32,
    normalize: true,
    show_download_progress: true,
    // `PathBuf` does not expand `~`, so the home directory is resolved from the
    // `HOME` environment variable. When `HOME` is unset this is `None`.
    cache_dir: std::env::var_os("HOME")
        .map(|home| std::path::PathBuf::from(home).join(".cache/kreuzberg/embeddings")),
    acceleration: None,
};
// Available presets:
// - "fast" (384 dims): Quick prototyping, development, resource-constrained
// - "balanced" (768 dims): Production, general-purpose RAG, English documents
// - "quality" (1024 dims): Complex documents, maximum accuracy
// - "multilingual" (768 dims): International documents, 100+ languages
// Example 2: Custom ONNX model (requires embeddings feature)
// Selects a specific ONNX embedding model from HuggingFace by id and states
// its embedding dimensions explicitly.
let bge_small = EmbeddingModelType::Custom {
    model_id: String::from("BAAI/bge-small-en-v1.5"),
    dimensions: 384,
};
let embedding_config = EmbeddingConfig {
    model: bge_small,
    batch_size: 32,
    normalize: true,
    show_download_progress: true,
    // Uses default: .kreuzberg/embeddings/
    cache_dir: None,
    acceleration: None,
};
// Popular ONNX-compatible models:
// - "BAAI/bge-small-en-v1.5" (384 dims): Fast, efficient
// - "BAAI/bge-base-en-v1.5" (768 dims): Balanced quality/speed
// - "BAAI/bge-large-en-v1.5" (1024 dims): High quality, slower
// - "sentence-transformers/paraphrase-multilingual-mpnet-base-v2" (768 dims): Multilingual support
// Example 3: Alternative Custom ONNX Model
// A different custom ONNX embedding model, with its own cache directory.
let mpnet_base = EmbeddingModelType::Custom {
    model_id: String::from("sentence-transformers/all-mpnet-base-v2"),
    dimensions: 768,
};
let model_cache = std::path::PathBuf::from("/var/cache/embeddings");
let embedding_config = EmbeddingConfig {
    model: mpnet_base,
    // Larger model requires smaller batch size
    batch_size: 16,
    normalize: true,
    show_download_progress: true,
    cache_dir: Some(model_cache),
    acceleration: None,
};
// Integration with ChunkingConfig
// Add embeddings to your chunking configuration:
// (`EmbeddingConfig` and `EmbeddingModelType` are imported here because this
// file uses them but has no other `use` line for them.)
use kreuzberg::{ChunkingConfig, EmbeddingConfig, EmbeddingModelType, ExtractionConfig};
let chunking_with_embeddings = ChunkingConfig {
    max_characters: 1024,
    overlap: 100,
    preset: Some("balanced".to_string()),
    embedding: Some(EmbeddingConfig::default()), // Uses balanced preset
    // Leave every other chunking field at its default so the literal does not
    // have to list them all.
    ..Default::default()
};
let extraction_config = ExtractionConfig {
    chunking: Some(chunking_with_embeddings),
    ..Default::default()
};
// Key parameter explanations:
//
// batch_size: Number of texts to embed at once (32-128 typical)
// - Larger batches are faster but use more memory
// - Smaller batches for resource-constrained environments
//
// normalize: Whether to normalize vectors (L2 norm)
// - true (recommended): Enables cosine similarity in vector DBs
// - false: Raw embedding values
//
// cache_dir: Where to store downloaded models
// - None: Uses .kreuzberg/embeddings/ in current directory
// - Some(path): Custom directory for model storage
//
// show_download_progress: Display download progress bar
// - Useful for monitoring large model downloads

View File

@@ -0,0 +1,90 @@
use kreuzberg::{ExtractionConfig, HierarchyConfig, PdfConfig};
// Example 1: Basic hierarchy extraction
// Enabled with k_clusters=6 for a standard H1-H6 heading hierarchy.
// Bounding boxes are included for spatial layout awareness.
let hierarchy_config_basic = HierarchyConfig {
    enabled: true,
    k_clusters: 6, // Default: creates 6 font size clusters (H1-H6 structure)
    include_bbox: true, // Include bounding box coordinates
    ocr_coverage_threshold: None, // No OCR coverage threshold
};
let pdf_config_basic = PdfConfig {
    hierarchy: Some(hierarchy_config_basic),
    ..Default::default()
};
let extraction_config_basic = ExtractionConfig {
    pdf_options: Some(pdf_config_basic),
    ..Default::default()
};
// Use with extract_file_sync or extract_bytes_sync. The config is passed by
// reference as the third argument; the second argument is the optional MIME type.
// let result = extract_file_sync("document.pdf", None, &extraction_config_basic)?;
// Example 2: Custom k_clusters for minimal structure
// Three clusters give a simpler hierarchy, which is enough when only the major
// section divisions (Main, Subsection, Detail) are needed.
let extraction_config_minimal = ExtractionConfig {
    pdf_options: Some(PdfConfig {
        hierarchy: Some(HierarchyConfig {
            enabled: true,
            // Minimal clustering: just 3 levels
            k_clusters: 3,
            include_bbox: true,
            ocr_coverage_threshold: None,
        }),
        ..Default::default()
    }),
    ..Default::default()
};
// Example 3: With OCR coverage threshold
// Sets a 0.5 coverage threshold, intended for documents that mix digital and
// scanned content.
let extraction_config_ocr = ExtractionConfig {
    pdf_options: Some(PdfConfig {
        hierarchy: Some(HierarchyConfig {
            enabled: true,
            k_clusters: 6,
            include_bbox: true,
            // Trigger OCR if text coverage < 50%
            ocr_coverage_threshold: Some(0.5),
        }),
        ..Default::default()
    }),
    ..Default::default()
};
// Field descriptions:
//
// enabled: bool (default: true)
// - Enable or disable hierarchy extraction
// - When false, hierarchy structure is not analyzed
//
// k_clusters: usize (default: 6, valid: 1-7)
// - Number of font size clusters for hierarchy levels
// - 6 provides H1-H6 heading levels with body text
// - Higher values create more fine-grained hierarchy
// - Lower values create simpler structure
//
// include_bbox: bool (default: true)
// - Include bounding box coordinates in hierarchy blocks
// - Required for spatial layout awareness and document structure
// - Set to false only if space optimization is critical
//
// ocr_coverage_threshold: Option<f32> (default: None)
// - Range: 0.0 to 1.0
// - Triggers OCR when text block coverage falls below this fraction
// - Example: Some(0.5) means "run OCR if less than 50% of page has text data"
// - None means no OCR coverage-based triggering

View File

@@ -0,0 +1,14 @@
```rust title="Rust"
use kreuzberg::{extract_file_sync, ExtractionConfig, HtmlOutputConfig, HtmlTheme, OutputFormat};
// Ask for HTML output rendered with the GitHub theme.
let config = ExtractionConfig {
    output_format: OutputFormat::Html,
    html_output: Some(HtmlOutputConfig {
        theme: HtmlTheme::GitHub,
        ..Default::default()
    }),
    ..Default::default()
};
// Propagate extraction errors to the caller instead of panicking on them.
let result = extract_file_sync("document.pdf", None, &config)?;
println!("{}", result.content); // HTML with kb-* classes
```

View File

@@ -0,0 +1,72 @@
use kreuzberg::{extract_file, ExtractionConfig};
use kreuzberg::keywords::{KeywordConfig, KeywordAlgorithm, YakeParams, RakeParams};
// Example 1: Basic YAKE configuration
/// Extracts keywords from `document.pdf` with the YAKE algorithm and no
/// algorithm-specific parameter overrides, then prints them.
pub async fn basic_yake() -> kreuzberg::Result<()> {
    let keyword_settings = KeywordConfig {
        algorithm: KeywordAlgorithm::Yake,
        max_keywords: 10,
        min_score: 0.0,
        ngram_range: (1, 3),
        language: Some(String::from("en")),
        yake_params: None,
        rake_params: None,
    };
    let settings = ExtractionConfig {
        keywords: Some(keyword_settings),
        ..Default::default()
    };

    let extraction = extract_file("document.pdf", None::<&str>, &settings).await?;
    println!("Keywords: {:?}", extraction.keywords);

    Ok(())
}
// Example 2: Advanced YAKE with custom parameters
/// Extracts keywords from `document.pdf` with YAKE and an explicit
/// `window_size`, then prints them. Only compiled with the `keywords-yake`
/// feature.
#[cfg(feature = "keywords-yake")]
pub async fn advanced_yake() -> kreuzberg::Result<()> {
    let yake_tuning = YakeParams { window_size: 1 };
    let keyword_settings = KeywordConfig {
        algorithm: KeywordAlgorithm::Yake,
        max_keywords: 15,
        min_score: 0.1,
        ngram_range: (1, 2),
        language: Some(String::from("en")),
        yake_params: Some(yake_tuning),
        rake_params: None,
    };
    let settings = ExtractionConfig {
        keywords: Some(keyword_settings),
        ..Default::default()
    };

    let extraction = extract_file("document.pdf", None::<&str>, &settings).await?;
    println!("Keywords: {:?}", extraction.keywords);

    Ok(())
}
// Example 3: RAKE configuration
/// Extracts keywords from `document.pdf` with the RAKE algorithm, limiting
/// phrases via `RakeParams`, then prints them. Only compiled with the
/// `keywords-rake` feature.
#[cfg(feature = "keywords-rake")]
pub async fn rake_config() -> kreuzberg::Result<()> {
    let rake_tuning = RakeParams {
        min_word_length: 1,
        max_words_per_phrase: 3,
    };
    let keyword_settings = KeywordConfig {
        algorithm: KeywordAlgorithm::Rake,
        max_keywords: 10,
        min_score: 5.0,
        ngram_range: (1, 3),
        language: Some(String::from("en")),
        yake_params: None,
        rake_params: Some(rake_tuning),
    };
    let settings = ExtractionConfig {
        keywords: Some(keyword_settings),
        ..Default::default()
    };

    let extraction = extract_file("document.pdf", None::<&str>, &settings).await?;
    println!("Keywords: {:?}", extraction.keywords);

    Ok(())
}

View File

@@ -0,0 +1,23 @@
```rust title="Rust"
use kreuzberg::{extract_file, ExtractionConfig};
use kreuzberg::keywords::{KeywordConfig, KeywordAlgorithm};
/// Extracts `document.pdf` with YAKE keyword extraction enabled and prints the
/// resulting keywords.
#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
    let keyword_settings = KeywordConfig {
        algorithm: KeywordAlgorithm::Yake,
        max_keywords: 10,
        min_score: 0.1,
        ngram_range: (1, 3),
        language: Some(String::from("en")),
        ..Default::default()
    };
    let settings = ExtractionConfig {
        keywords: Some(keyword_settings),
        ..Default::default()
    };

    let extraction = extract_file("document.pdf", None::<&str>, &settings).await?;
    println!("Keywords: {:?}", extraction.keywords);

    Ok(())
}
```

View File

@@ -0,0 +1,20 @@
```rust title="Rust"
use kreuzberg::{extract_file, ExtractionConfig, LanguageDetectionConfig};
/// Extracts `document.pdf` with language detection enabled (multiple
/// languages allowed) and prints the detected languages, if any.
#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
    let config = ExtractionConfig {
        language_detection: Some(LanguageDetectionConfig {
            enabled: true,
            min_confidence: 0.8,
            detect_multiple: true,
        }),
        ..Default::default()
    };

    let result = extract_file("document.pdf", None::<&str>, &config).await?;

    // Detection results are read from the optional `detected_languages` list.
    // NOTE(review): no per-result confidence value is printed; confirm whether
    // the result type exposes one before adding it back.
    match &result.detected_languages {
        Some(langs) => println!("Detected languages: {:?}", langs),
        None => println!("No languages detected"),
    }
    Ok(())
}
```

View File

@@ -0,0 +1,21 @@
```rust title="Rust"
use kreuzberg::{extract_file_sync, ExtractionConfig, ImageExtractionConfig};
/// Runs a synchronous extraction of `document.pdf` with image extraction
/// configured and prints the number of extracted images.
fn main() -> kreuzberg::Result<()> {
    let image_settings = ImageExtractionConfig {
        extract_images: true,
        target_dpi: 300,
        max_image_dimension: 4096,
        auto_adjust_dpi: true,
        min_dpi: 150,
        max_dpi: 600,
    };
    let settings = ExtractionConfig {
        images: Some(image_settings),
        ..Default::default()
    };

    let extraction = extract_file_sync("document.pdf", None::<&str>, &settings)?;
    // NOTE(review): assumes `images` is a plain collection rather than an
    // `Option` — confirm against the crate version in use.
    println!("Extracted images: {}", extraction.images.len());

    Ok(())
}
```

View File

@@ -0,0 +1,20 @@
```rust title="Rust"
use kreuzberg::{extract_file_sync, ExtractionConfig, PdfConfig, HierarchyConfig};
/// Runs a synchronous extraction of the password-protected `encrypted.pdf`
/// and prints the title and authors from its metadata.
fn main() -> kreuzberg::Result<()> {
    let config = ExtractionConfig {
        pdf_options: Some(PdfConfig {
            extract_images: true,
            // Example password only; do not hard-code real credentials.
            passwords: Some(vec!["password123".to_string()]),
            extract_metadata: true,
            hierarchy: Some(HierarchyConfig::default()),
            // Keep the literal valid if `PdfConfig` has further fields.
            ..Default::default()
        }),
        ..Default::default()
    };

    let result = extract_file_sync("encrypted.pdf", None, &config)?;
    println!("Title: {:?}", result.metadata.title);
    println!("Authors: {:?}", result.metadata.authors);
    Ok(())
}
```

View File

@@ -0,0 +1,23 @@
```rust title="Rust"
use kreuzberg::{extract_file_sync, ExtractionConfig, PdfConfig, HierarchyConfig};
/// Runs a synchronous extraction of `document.pdf` with PDF hierarchy
/// detection configured and prints the number of hierarchy entries.
fn main() -> kreuzberg::Result<()> {
    let config = ExtractionConfig {
        pdf_options: Some(PdfConfig {
            // Uses the `HierarchyConfig` fields `enabled`, `k_clusters`,
            // `include_bbox` and `ocr_coverage_threshold`.
            hierarchy: Some(HierarchyConfig {
                enabled: true,
                k_clusters: 6,
                include_bbox: true,
                ocr_coverage_threshold: Some(0.8),
            }),
            ..Default::default()
        }),
        ..Default::default()
    };

    let result = extract_file_sync("document.pdf", None::<&str>, &config)?;
    // NOTE(review): confirm that the result exposes a top-level `hierarchy`
    // collection in the crate version in use.
    println!("Hierarchy levels: {}", result.hierarchy.len());
    Ok(())
}
```

View File

@@ -0,0 +1,22 @@
```rust title="Rust"
use kreuzberg::{extract_file, ExtractionConfig, PostProcessorConfig};
/// Extracts `document.pdf` with two named post-processors enabled and prints
/// the processed content.
#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
    let active_processors = vec![
        String::from("whitespace_normalizer"),
        String::from("unicode_normalizer"),
    ];
    let postprocessing = PostProcessorConfig {
        enabled: true,
        enabled_processors: Some(active_processors),
        disabled_processors: None,
    };
    let settings = ExtractionConfig {
        postprocessor: Some(postprocessing),
        ..Default::default()
    };

    let extraction = extract_file("document.pdf", None::<&str>, &settings).await?;
    println!("Processed content: {}", extraction.content);

    Ok(())
}
```

View File

@@ -0,0 +1,16 @@
```rust title="Rust"
use kreuzberg::{extract_file_sync, ExtractionConfig};
/// Runs a synchronous extraction of `document.pdf` with quality processing
/// and caching enabled, then prints the quality score and processing time.
fn main() -> kreuzberg::Result<()> {
    let settings = ExtractionConfig {
        use_cache: true,
        enable_quality_processing: true,
        ..Default::default()
    };

    let extraction = extract_file_sync("document.pdf", None::<&str>, &settings)?;
    // NOTE(review): confirm `quality_score` and `processing_time` exist on the
    // result type (and that `quality_score` implements `Display`).
    println!("Quality score: {}", extraction.quality_score);
    println!("Processing time: {:?}", extraction.processing_time);

    Ok(())
}
```

View File

@@ -0,0 +1,23 @@
```rust title="Rust"
use kreuzberg::{extract_file_sync, ExtractionConfig, OcrConfig};
use kreuzberg::types::TesseractConfig;
/// Runs a synchronous OCR extraction of `scanned.pdf` with Tesseract
/// (`eng+deu`, PSM 6, OEM 3) and prints the recognised text.
fn main() -> kreuzberg::Result<()> {
    let config = ExtractionConfig {
        ocr: Some(OcrConfig {
            backend: "tesseract".to_string(),
            language: "eng+deu".to_string(),
            tesseract_config: Some(TesseractConfig {
                // Plain integers, matching the `psm: 6` form.
                // NOTE(review): confirm `psm`/`oem` are not `Option`s in the
                // crate version in use.
                psm: 6,
                oem: 3,
                ..Default::default()
            }),
            // `OcrConfig` has more fields than the three set here
            // (e.g. `output_format`, `paddle_ocr_config`, `element_config`).
            ..Default::default()
        }),
        ..Default::default()
    };

    let result = extract_file_sync("scanned.pdf", None::<&str>, &config)?;
    println!("OCR text: {}", result.content);
    Ok(())
}
```

View File

@@ -0,0 +1,19 @@
```rust title="Rust"
use kreuzberg::{extract_file, ExtractionConfig, TokenReductionConfig};
#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
let config = ExtractionConfig {
token_reduction: Some(TokenReductionConfig {
mode: "moderate".to_string(),
preserve_important_words: true,
}),
..Default::default()
};
let result = extract_file("document.pdf", None::<&str>, &config).await?;
println!("Original tokens: {}", result.token_count);
println!("Reduced content: {}", result.content);
Ok(())
}
```