Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,46 @@
```rust title="Rust"
use kreuzberg::{
extract_file_sync, ChunkingConfig, ExtractionConfig, LanguageDetectionConfig, OcrConfig,
};
/// Extracts `document.pdf` synchronously with OCR, chunking and language
/// detection configured, then prints a preview of every chunk and the
/// detected languages (when present).
///
/// # Errors
/// Returns whatever error `extract_file_sync` reports.
fn main() -> kreuzberg::Result<()> {
    // Each sub-config only overrides the fields shown; the rest come from `Default`.
    let ocr = OcrConfig {
        backend: "tesseract".to_string(),
        language: "eng+deu".to_string(),
        ..Default::default()
    };
    let chunking = ChunkingConfig {
        max_characters: 1000,
        overlap: 100,
        ..Default::default()
    };
    let language_detection = LanguageDetectionConfig {
        enabled: true,
        detect_multiple: true,
        ..Default::default()
    };
    let config = ExtractionConfig {
        ocr: Some(ocr),
        chunking: Some(chunking),
        language_detection: Some(language_detection),
        use_cache: true,
        enable_quality_processing: true,
        ..Default::default()
    };
    let result = extract_file_sync("document.pdf", None, &config)?;
    // `chunks` is optional; flattening the Option visits zero chunks when it is None.
    for piece in result.chunks.into_iter().flatten() {
        // Take characters (not bytes) so the preview never splits a code point.
        let head: String = piece.content.chars().take(100).collect();
        println!("Chunk: {}...", head);
    }
    if let Some(found) = result.detected_languages {
        println!("Languages: {:?}", found);
    }
    Ok(())
}
```

View File

@@ -0,0 +1,29 @@
Use Kreuzberg::{extract_file_sync, ExtractionConfig, ChunkingConfig, PageConfig};
Let config = ExtractionConfig {
chunking: Some(ChunkingConfig {
max_characters: 500,
overlap: 50,
..Default::default()
}),
pages: Some(PageConfig {
extract_pages: true,
..Default::default()
}),
..Default::default()
};
Let result = extract_file_sync("document.pdf", None, &config)?;
If let Some(chunks) = result.chunks {
for chunk in chunks {
if let (Some(first), Some(last)) = (chunk.metadata.first_page, chunk.metadata.last_page) {
let page_range = if first == last {
format!("Page {}", first)
} else {
format!("Pages {}-{}", first, last)
};
println!("Chunk: {}... ({})", chunk.content.chars().take(50).collect::<String>(), page_range);
}
}
}

View File

@@ -0,0 +1,21 @@
```rust title="Rust"
use kreuzberg::{ChunkingConfig, EmbeddingConfig, EmbeddingModelType, ExtractionConfig};
// Builds an ExtractionConfig whose chunking section requests embeddings from a
// named preset model, then prints that chunking section with Debug formatting.
fn main() {
let config = ExtractionConfig {
chunking: Some(ChunkingConfig {
max_characters: 1500,
overlap: 200,
embedding: Some(EmbeddingConfig {
// The model is chosen by preset name; the string is passed through as-is.
model: EmbeddingModelType::Preset {
name: "text-embedding-all-minilm-l6-v2".to_string(),
},
// Remaining embedding fields keep their Default values.
..Default::default()
}),
..Default::default()
}),
..Default::default()
};
println!("{:?}", config.chunking);
}
```

View File

@@ -0,0 +1,39 @@
```rust title="Rust"
use kreuzberg::{ExtractionConfig, ChunkingConfig};
// Basic chunking: explicit size and overlap, no embeddings.
let config = ExtractionConfig {
    chunking: Some(ChunkingConfig {
        max_characters: 1000,
        overlap: 200,
        embedding: None,
        // ChunkingConfig has more fields than the three set here (for example
        // `chunker_type`), so the rest must be filled from `Default`.
        ..Default::default()
    }),
    ..Default::default()
};
```
```rust title="Rust - Semantic"
use kreuzberg::{ExtractionConfig, ChunkingConfig, ChunkerType};
// Select the Semantic chunker variant; every other chunking and extraction
// field keeps its Default value.
let config = ExtractionConfig {
chunking: Some(ChunkingConfig {
chunker_type: ChunkerType::Semantic,
..Default::default()
}),
..Default::default()
};
```
```rust title="Rust - Prepend Heading Context"
use kreuzberg::{ExtractionConfig, ChunkingConfig, ChunkerType};
// Markdown chunker with `prepend_heading_context` switched on.
// NOTE(review): presumably this prefixes each chunk with its enclosing
// heading(s) — confirm against the ChunkingConfig documentation.
let config = ExtractionConfig {
chunking: Some(ChunkingConfig {
max_characters: 500,
overlap: 50,
chunker_type: ChunkerType::Markdown,
prepend_heading_context: true,
..Default::default()
}),
..Default::default()
};
```

View File

@@ -0,0 +1,36 @@
```rust title="Rust"
use kreuzberg::{
    extract_file, ChunkingConfig, EmbeddingConfig, EmbeddingModelType, ExtractionConfig,
};
// Chunk the document and request an embedding for every chunk.
let config = ExtractionConfig {
    chunking: Some(ChunkingConfig {
        max_characters: 500,
        overlap: 50,
        embedding: Some(EmbeddingConfig {
            // Select the model by preset name, as the other embedding examples do.
            model: EmbeddingModelType::Preset {
                name: "balanced".to_string(),
            },
            normalize: true,
            ..Default::default()
        }),
        ..Default::default()
    }),
    ..Default::default()
};
let result = extract_file("research_paper.pdf", None, &config).await?;
if let Some(chunks) = result.chunks {
    for chunk in chunks {
        // `chunk_index` is printed 1-based for readability.
        println!(
            "Chunk {}/{}",
            chunk.metadata.chunk_index + 1,
            chunk.metadata.total_chunks
        );
        println!(
            "Position: {}-{}",
            chunk.metadata.byte_start,
            chunk.metadata.byte_end
        );
        // Preview by characters: slicing the String at a fixed byte offset
        // panics when that offset falls inside a multi-byte UTF-8 sequence.
        let preview: String = chunk.content.chars().take(100).collect();
        println!("Content: {}...", preview);
        if let Some(embedding) = chunk.embedding {
            println!("Embedding: {} dimensions", embedding.len());
        }
    }
}
```

View File

@@ -0,0 +1,23 @@
```rust title="Rust"
use kreuzberg::{ChunkingConfig, EmbeddingConfig, EmbeddingModelType, ExtractionConfig};
// Builds an ExtractionConfig with an explicitly tuned EmbeddingConfig and
// prints the resulting chunking section with Debug formatting.
fn main() {
let config = ExtractionConfig {
chunking: Some(ChunkingConfig {
max_characters: 1000,
embedding: Some(EmbeddingConfig {
// Model selected by preset name.
model: EmbeddingModelType::Preset {
name: "all-mpnet-base-v2".to_string(),
},
batch_size: 16,
normalize: true,
show_download_progress: true,
// Any embedding field not listed above keeps its Default value.
..Default::default()
}),
..Default::default()
}),
..Default::default()
};
println!("{:?}", config.chunking);
}
```

View File

@@ -0,0 +1,19 @@
```rust title="Rust"
use kreuzberg::{ChunkingConfig, EmbeddingConfig, EmbeddingModelType, ExtractionConfig};
// Chunking with embeddings: preset model, normalisation on, batches of 32,
// download progress output off.
let config = ExtractionConfig {
    chunking: Some(ChunkingConfig {
        max_characters: 1024,
        overlap: 100,
        embedding: Some(EmbeddingConfig {
            // Select the model by preset name, as the other embedding examples do.
            model: EmbeddingModelType::Preset {
                name: "balanced".to_string(),
            },
            normalize: true,
            batch_size: 32,
            show_download_progress: false,
            ..Default::default()
        }),
        ..Default::default()
    }),
    ..Default::default()
};
```

View File

@@ -0,0 +1,15 @@
```rust title="Rust"
use kreuzberg::{ExtractionConfig, KeywordConfig, KeywordAlgorithm};
// Keyword extraction settings using the Yake algorithm variant.
let config = ExtractionConfig {
keywords: Some(KeywordConfig {
algorithm: KeywordAlgorithm::Yake,
max_keywords: 10,
min_score: 0.3,
// NOTE(review): presumably (min, max) n-gram length — confirm against KeywordConfig docs.
ngram_range: (1, 3),
language: Some("en".to_string()),
// Remaining keyword fields keep their Default values.
..Default::default()
}),
..Default::default()
};
```

View File

@@ -0,0 +1,19 @@
```rust title="Rust"
use kreuzberg::{extract_file, ExtractionConfig, KeywordConfig, KeywordAlgorithm};
// Configure keyword extraction, run the async extraction, and print the
// keywords when the result carries any.
let config = ExtractionConfig {
keywords: Some(KeywordConfig {
algorithm: KeywordAlgorithm::Yake,
max_keywords: 10,
min_score: 0.3,
..Default::default()
}),
..Default::default()
};
let result = extract_file("research_paper.pdf", None, &config).await?;
// `extracted_keywords` is optional; it is borrowed here so `result` stays usable.
if let Some(keywords) = &result.extracted_keywords {
println!("Keywords: {:?}", keywords);
}
```

View File

@@ -0,0 +1,15 @@
```rust title="Rust"
use kreuzberg::{ExtractionConfig, LanguageDetectionConfig};
// Builds an ExtractionConfig with language detection configured and prints
// the language_detection section with Debug formatting.
fn main() {
let config = ExtractionConfig {
language_detection: Some(LanguageDetectionConfig {
// NOTE(review): these fields are wrapped in `Some(..)` here, while other
// examples assign plain `true` / `0.8` to the same field names — confirm
// which form matches the current LanguageDetectionConfig definition.
enabled: Some(true),
min_confidence: Some(0.9),
detect_multiple: Some(true),
}),
..Default::default()
};
println!("{:?}", config.language_detection);
}
```

View File

@@ -0,0 +1,12 @@
```rust title="Rust"
use kreuzberg::{ExtractionConfig, LanguageDetectionConfig};
// Language detection enabled with a 0.8 confidence threshold and
// `detect_multiple` turned off; other extraction fields keep their defaults.
let config = ExtractionConfig {
language_detection: Some(LanguageDetectionConfig {
enabled: true,
min_confidence: 0.8,
detect_multiple: false,
}),
..Default::default()
};
```

View File

@@ -0,0 +1,16 @@
```rust title="Rust"
use kreuzberg::{extract_file, ExtractionConfig, LanguageDetectionConfig};
// Enable language detection with `detect_multiple` on, run the async
// extraction, and print whatever `detected_languages` holds.
let config = ExtractionConfig {
language_detection: Some(LanguageDetectionConfig {
enabled: true,
min_confidence: 0.8,
detect_multiple: true,
}),
..Default::default()
};
let result = extract_file("multilingual_document.pdf", None, &config).await?;
// Printed with Debug formatting, so an absent value is shown as well.
println!("Detected languages: {:?}", result.detected_languages);
```

View File

@@ -0,0 +1,18 @@
```rust title="Rust"
use kreuzberg::{ExtractionConfig, PostProcessorConfig};
// Builds an ExtractionConfig with an explicit post-processor selection and
// prints the postprocessor section with Debug formatting.
fn main() {
let config = ExtractionConfig {
postprocessor: Some(PostProcessorConfig {
enabled: Some(true),
// Processors are referred to by string name.
enabled_processors: Some(vec![
"deduplication".to_string(),
"whitespace_normalization".to_string(),
]),
disabled_processors: Some(vec!["mojibake_fix".to_string()]),
}),
..Default::default()
};
println!("{:?}", config.postprocessor);
}
```

View File

@@ -0,0 +1,8 @@
```rust title="Rust"
use kreuzberg::ExtractionConfig;
// Turn on quality processing; every other option keeps its Default value.
let config = ExtractionConfig {
enable_quality_processing: true,
..Default::default()
};
```

View File

@@ -0,0 +1,17 @@
```rust title="Rust"
use kreuzberg::{extract_file, ExtractionConfig};
// Turn on quality processing; every other option keeps its Default value.
let config = ExtractionConfig {
    enable_quality_processing: true,
    ..Default::default()
};
let result = extract_file("scanned_document.pdf", None, &config).await?;
// `quality_score` is optional: print it when present, as a warning below 0.5.
match result.quality_score {
    Some(score) if score < 0.5 => {
        println!("Warning: Low quality extraction ({:.2})", score);
    }
    Some(score) => {
        println!("Quality score: {:.2}", score);
    }
    None => {}
}
```

View File

@@ -0,0 +1,14 @@
```rust title="Rust"
use kreuzberg::{ExtractionConfig, TokenReductionConfig};
// Builds an ExtractionConfig with token reduction configured and prints the
// token_reduction section with Debug formatting.
fn main() {
let config = ExtractionConfig {
// NOTE(review): this literal lists every field explicitly (no
// `..Default::default()`); other examples set different fields on
// TokenReductionConfig — confirm the struct's current field set.
token_reduction: Some(TokenReductionConfig {
mode: "moderate".to_string(),
preserve_important_words: true,
}),
..Default::default()
};
println!("{:?}", config.token_reduction);
}
```

View File

@@ -0,0 +1,14 @@
```rust title="Rust"
use kreuzberg::{ExtractionConfig, TokenReductionConfig};
// Token reduction in "moderate" mode with the markdown/code preservation
// flags set and a language hint supplied.
let config = ExtractionConfig {
token_reduction: Some(TokenReductionConfig {
mode: "moderate".to_string(),
preserve_markdown: true,
preserve_code: true,
language_hint: Some("eng".to_string()),
// Remaining token-reduction fields keep their Default values.
..Default::default()
}),
..Default::default()
};
```

View File

@@ -0,0 +1,21 @@
```rust title="Rust"
use kreuzberg::{extract_file, ExtractionConfig, TokenReductionConfig};
// Configure token reduction, run the async extraction, and print the token
// counts that the result carries.
let config = ExtractionConfig {
token_reduction: Some(TokenReductionConfig {
mode: "moderate".to_string(),
preserve_markdown: true,
..Default::default()
}),
..Default::default()
};
let result = extract_file("verbose_document.pdf", None, &config).await?;
// Both counts are optional and are printed independently of each other.
if let Some(original) = result.original_token_count {
println!("Original tokens: {}", original);
}
if let Some(reduced) = result.reduced_token_count {
println!("Reduced tokens: {}", reduced);
}
```

View File

@@ -0,0 +1,55 @@
```rust title="Rust"
use kreuzberg::{extract_file, ExtractionConfig, ChunkingConfig, EmbeddingConfig};
/// One chunk of a document together with its embedding, in the shape built
/// by `extract_and_vectorize`.
struct VectorRecord {
/// Identifier formatted as `{document_id}_chunk_{index}`.
id: String,
/// Text content of the chunk.
content: String,
/// Embedding vector copied from the chunk.
embedding: Vec<f32>,
/// String key/value pairs; `extract_and_vectorize` stores `document_id`,
/// `chunk_index` and `content_length` here.
metadata: std::collections::HashMap<String, String>,
}
/// Extracts `document_path` with chunking and embeddings enabled and converts
/// every chunk that carries an embedding into a `VectorRecord`.
///
/// Record ids have the form `{document_id}_chunk_{index}`, where `index` is the
/// chunk's position among all returned chunks; chunks without an embedding are
/// skipped but still counted.
///
/// # Errors
/// Propagates any error returned by `extract_file`.
async fn extract_and_vectorize(
    document_path: &str,
    document_id: &str,
) -> Result<Vec<VectorRecord>, Box<dyn std::error::Error>> {
    let embedding_settings = EmbeddingConfig {
        model: kreuzberg::EmbeddingModelType::Preset {
            name: "balanced".to_string(),
        },
        normalize: true,
        batch_size: 32,
        ..Default::default()
    };
    let chunk_settings = ChunkingConfig {
        max_characters: 512,
        overlap: 50,
        embedding: Some(embedding_settings),
        ..Default::default()
    };
    let config = ExtractionConfig {
        chunking: Some(chunk_settings),
        ..Default::default()
    };
    let extraction = extract_file(document_path, None, &config).await?;
    // Enumerate before filtering so each index reflects the chunk's position in
    // the full chunk list, not its position among embedded chunks only.
    let records: Vec<VectorRecord> = extraction
        .chunks
        .iter()
        .flatten()
        .enumerate()
        .filter_map(|(position, piece)| {
            let vector = piece.embedding.as_ref()?;
            let metadata = std::collections::HashMap::from([
                ("document_id".to_string(), document_id.to_string()),
                ("chunk_index".to_string(), position.to_string()),
                // Length of the content in bytes, as reported by `String::len`.
                ("content_length".to_string(), piece.content.len().to_string()),
            ]);
            Some(VectorRecord {
                id: format!("{}_chunk_{}", document_id, position),
                content: piece.content.clone(),
                embedding: vector.clone(),
                metadata,
            })
        })
        .collect();
    Ok(records)
}
```