This commit is contained in:
46
docs/snippets/rust/advanced/advanced_config.md
Normal file
46
docs/snippets/rust/advanced/advanced_config.md
Normal file
@@ -0,0 +1,46 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{
|
||||
extract_file_sync, ChunkingConfig, ExtractionConfig, LanguageDetectionConfig, OcrConfig,
|
||||
};
|
||||
|
||||
fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig {
|
||||
ocr: Some(OcrConfig {
|
||||
backend: "tesseract".to_string(),
|
||||
language: "eng+deu".to_string(),
|
||||
..Default::default()
|
||||
}),
|
||||
|
||||
chunking: Some(ChunkingConfig {
|
||||
max_characters: 1000,
|
||||
overlap: 100,
|
||||
..Default::default()
|
||||
}),
|
||||
|
||||
language_detection: Some(LanguageDetectionConfig {
|
||||
enabled: true,
|
||||
detect_multiple: true,
|
||||
..Default::default()
|
||||
}),
|
||||
|
||||
use_cache: true,
|
||||
enable_quality_processing: true,
|
||||
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file_sync("document.pdf", None, &config)?;
|
||||
|
||||
if let Some(chunks) = result.chunks {
|
||||
for chunk in chunks {
|
||||
let preview: String = chunk.content.chars().take(100).collect();
|
||||
println!("Chunk: {}...", preview);
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(languages) = result.detected_languages {
|
||||
println!("Languages: {:?}", languages);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
29
docs/snippets/rust/advanced/chunk_page_mapping.md
Normal file
29
docs/snippets/rust/advanced/chunk_page_mapping.md
Normal file
@@ -0,0 +1,29 @@
|
||||
use kreuzberg::{extract_file_sync, ExtractionConfig, ChunkingConfig, PageConfig};
|
||||
|
||||
let config = ExtractionConfig {
|
||||
chunking: Some(ChunkingConfig {
|
||||
max_characters: 500,
|
||||
overlap: 50,
|
||||
..Default::default()
|
||||
}),
|
||||
pages: Some(PageConfig {
|
||||
extract_pages: true,
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file_sync("document.pdf", None, &config)?;
|
||||
|
||||
if let Some(chunks) = result.chunks {
|
||||
for chunk in chunks {
|
||||
if let (Some(first), Some(last)) = (chunk.metadata.first_page, chunk.metadata.last_page) {
|
||||
let page_range = if first == last {
|
||||
format!("Page {}", first)
|
||||
} else {
|
||||
format!("Pages {}-{}", first, last)
|
||||
};
|
||||
println!("Chunk: {}... ({})", chunk.content.chars().take(50).collect::<String>(), page_range);
|
||||
}
|
||||
}
|
||||
}
|
||||
21
docs/snippets/rust/advanced/chunking.md
Normal file
21
docs/snippets/rust/advanced/chunking.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{ChunkingConfig, EmbeddingConfig, EmbeddingModelType, ExtractionConfig};
|
||||
|
||||
fn main() {
|
||||
let config = ExtractionConfig {
|
||||
chunking: Some(ChunkingConfig {
|
||||
max_characters: 1500,
|
||||
overlap: 200,
|
||||
embedding: Some(EmbeddingConfig {
|
||||
model: EmbeddingModelType::Preset {
|
||||
name: "text-embedding-all-minilm-l6-v2".to_string(),
|
||||
},
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
println!("{:?}", config.chunking);
|
||||
}
|
||||
```
|
||||
39
docs/snippets/rust/advanced/chunking_config.md
Normal file
39
docs/snippets/rust/advanced/chunking_config.md
Normal file
@@ -0,0 +1,39 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{ExtractionConfig, ChunkingConfig};
|
||||
|
||||
let config = ExtractionConfig {
|
||||
chunking: Some(ChunkingConfig {
|
||||
max_characters: 1000,
|
||||
overlap: 200,
|
||||
embedding: None,
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
```
|
||||
|
||||
```rust title="Rust - Semantic"
|
||||
use kreuzberg::{ExtractionConfig, ChunkingConfig, ChunkerType};
|
||||
|
||||
let config = ExtractionConfig {
|
||||
chunking: Some(ChunkingConfig {
|
||||
chunker_type: ChunkerType::Semantic,
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
```
|
||||
|
||||
```rust title="Rust - Prepend Heading Context"
|
||||
use kreuzberg::{ExtractionConfig, ChunkingConfig, ChunkerType};
|
||||
|
||||
let config = ExtractionConfig {
|
||||
chunking: Some(ChunkingConfig {
|
||||
max_characters: 500,
|
||||
overlap: 50,
|
||||
chunker_type: ChunkerType::Markdown,
|
||||
prepend_heading_context: true,
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
```
|
||||
36
docs/snippets/rust/advanced/chunking_rag.md
Normal file
36
docs/snippets/rust/advanced/chunking_rag.md
Normal file
@@ -0,0 +1,36 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file, ExtractionConfig, ChunkingConfig, EmbeddingConfig};
|
||||
|
||||
let config = ExtractionConfig {
|
||||
chunking: Some(ChunkingConfig {
|
||||
max_characters: 500,
|
||||
overlap: 50,
|
||||
embedding: Some(EmbeddingConfig {
|
||||
model: kreuzberg::EmbeddingModelType::Preset { name: "balanced".to_string() },
|
||||
normalize: true,
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file("research_paper.pdf", None, &config).await?;
|
||||
|
||||
if let Some(chunks) = result.chunks {
|
||||
for chunk in chunks {
|
||||
println!("Chunk {}/{}",
|
||||
chunk.metadata.chunk_index + 1,
|
||||
chunk.metadata.total_chunks
|
||||
);
|
||||
println!("Position: {}-{}",
|
||||
chunk.metadata.byte_start,
|
||||
chunk.metadata.byte_end
|
||||
);
|
||||
println!("Content: {}...", chunk.content.chars().take(100).collect::<String>());
|
||||
if let Some(embedding) = chunk.embedding {
|
||||
println!("Embedding: {} dimensions", embedding.len());
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
23
docs/snippets/rust/advanced/embedding_config.md
Normal file
23
docs/snippets/rust/advanced/embedding_config.md
Normal file
@@ -0,0 +1,23 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{ChunkingConfig, EmbeddingConfig, EmbeddingModelType, ExtractionConfig};
|
||||
|
||||
fn main() {
|
||||
let config = ExtractionConfig {
|
||||
chunking: Some(ChunkingConfig {
|
||||
max_characters: 1000,
|
||||
embedding: Some(EmbeddingConfig {
|
||||
model: EmbeddingModelType::Preset {
|
||||
name: "all-mpnet-base-v2".to_string(),
|
||||
},
|
||||
batch_size: 16,
|
||||
normalize: true,
|
||||
show_download_progress: true,
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
println!("{:?}", config.chunking);
|
||||
}
|
||||
```
|
||||
19
docs/snippets/rust/advanced/embedding_with_chunking.md
Normal file
19
docs/snippets/rust/advanced/embedding_with_chunking.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{ExtractionConfig, ChunkingConfig, EmbeddingConfig};
|
||||
|
||||
let config = ExtractionConfig {
|
||||
chunking: Some(ChunkingConfig {
|
||||
max_characters: 1024,
|
||||
overlap: 100,
|
||||
embedding: Some(EmbeddingConfig {
|
||||
model: kreuzberg::EmbeddingModelType::Preset { name: "balanced".to_string() },
|
||||
normalize: true,
|
||||
batch_size: 32,
|
||||
show_download_progress: false,
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
```
|
||||
15
docs/snippets/rust/advanced/keyword_extraction_config.md
Normal file
15
docs/snippets/rust/advanced/keyword_extraction_config.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{ExtractionConfig, KeywordConfig, KeywordAlgorithm};
|
||||
|
||||
let config = ExtractionConfig {
|
||||
keywords: Some(KeywordConfig {
|
||||
algorithm: KeywordAlgorithm::Yake,
|
||||
max_keywords: 10,
|
||||
min_score: 0.3,
|
||||
ngram_range: (1, 3),
|
||||
language: Some("en".to_string()),
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
```
|
||||
19
docs/snippets/rust/advanced/keyword_extraction_example.md
Normal file
19
docs/snippets/rust/advanced/keyword_extraction_example.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file, ExtractionConfig, KeywordConfig, KeywordAlgorithm};
|
||||
|
||||
let config = ExtractionConfig {
|
||||
keywords: Some(KeywordConfig {
|
||||
algorithm: KeywordAlgorithm::Yake,
|
||||
max_keywords: 10,
|
||||
min_score: 0.3,
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file("research_paper.pdf", None, &config).await?;
|
||||
|
||||
if let Some(keywords) = &result.extracted_keywords {
|
||||
println!("Keywords: {:?}", keywords);
|
||||
}
|
||||
```
|
||||
15
docs/snippets/rust/advanced/language_detection.md
Normal file
15
docs/snippets/rust/advanced/language_detection.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{ExtractionConfig, LanguageDetectionConfig};
|
||||
|
||||
fn main() {
|
||||
let config = ExtractionConfig {
|
||||
language_detection: Some(LanguageDetectionConfig {
|
||||
enabled: true,
|
||||
min_confidence: 0.9,
|
||||
detect_multiple: true,
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
println!("{:?}", config.language_detection);
|
||||
}
|
||||
```
|
||||
12
docs/snippets/rust/advanced/language_detection_config.md
Normal file
12
docs/snippets/rust/advanced/language_detection_config.md
Normal file
@@ -0,0 +1,12 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{ExtractionConfig, LanguageDetectionConfig};
|
||||
|
||||
let config = ExtractionConfig {
|
||||
language_detection: Some(LanguageDetectionConfig {
|
||||
enabled: true,
|
||||
min_confidence: 0.8,
|
||||
detect_multiple: false,
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
```
|
||||
@@ -0,0 +1,16 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file, ExtractionConfig, LanguageDetectionConfig};
|
||||
|
||||
let config = ExtractionConfig {
|
||||
language_detection: Some(LanguageDetectionConfig {
|
||||
enabled: true,
|
||||
min_confidence: 0.8,
|
||||
detect_multiple: true,
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file("multilingual_document.pdf", None, &config).await?;
|
||||
|
||||
println!("Detected languages: {:?}", result.detected_languages);
|
||||
```
|
||||
18
docs/snippets/rust/advanced/postprocessor_config.md
Normal file
18
docs/snippets/rust/advanced/postprocessor_config.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{ExtractionConfig, PostProcessorConfig};
|
||||
|
||||
fn main() {
|
||||
let config = ExtractionConfig {
|
||||
postprocessor: Some(PostProcessorConfig {
|
||||
enabled: Some(true),
|
||||
enabled_processors: Some(vec![
|
||||
"deduplication".to_string(),
|
||||
"whitespace_normalization".to_string(),
|
||||
]),
|
||||
disabled_processors: Some(vec!["mojibake_fix".to_string()]),
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
println!("{:?}", config.postprocessor);
|
||||
}
|
||||
```
|
||||
8
docs/snippets/rust/advanced/quality_processing_config.md
Normal file
8
docs/snippets/rust/advanced/quality_processing_config.md
Normal file
@@ -0,0 +1,8 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::ExtractionConfig;
|
||||
|
||||
let config = ExtractionConfig {
|
||||
enable_quality_processing: true,
|
||||
..Default::default()
|
||||
};
|
||||
```
|
||||
17
docs/snippets/rust/advanced/quality_processing_example.md
Normal file
17
docs/snippets/rust/advanced/quality_processing_example.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file, ExtractionConfig};
|
||||
|
||||
let config = ExtractionConfig {
|
||||
enable_quality_processing: true,
|
||||
..Default::default()
|
||||
};
|
||||
let result = extract_file("scanned_document.pdf", None, &config).await?;
|
||||
|
||||
if let Some(score) = result.quality_score {
|
||||
if score < 0.5 {
|
||||
println!("Warning: Low quality extraction ({:.2})", score);
|
||||
} else {
|
||||
println!("Quality score: {:.2}", score);
|
||||
}
|
||||
}
|
||||
```
|
||||
14
docs/snippets/rust/advanced/token_reduction.md
Normal file
14
docs/snippets/rust/advanced/token_reduction.md
Normal file
@@ -0,0 +1,14 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{ExtractionConfig, TokenReductionConfig};
|
||||
|
||||
fn main() {
|
||||
let config = ExtractionConfig {
|
||||
token_reduction: Some(TokenReductionConfig {
|
||||
mode: "moderate".to_string(),
|
||||
preserve_important_words: true,
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
println!("{:?}", config.token_reduction);
|
||||
}
|
||||
```
|
||||
14
docs/snippets/rust/advanced/token_reduction_config.md
Normal file
14
docs/snippets/rust/advanced/token_reduction_config.md
Normal file
@@ -0,0 +1,14 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{ExtractionConfig, TokenReductionConfig};
|
||||
|
||||
let config = ExtractionConfig {
|
||||
token_reduction: Some(TokenReductionConfig {
|
||||
mode: "moderate".to_string(),
|
||||
preserve_markdown: true,
|
||||
preserve_code: true,
|
||||
language_hint: Some("eng".to_string()),
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
```
|
||||
21
docs/snippets/rust/advanced/token_reduction_example.md
Normal file
21
docs/snippets/rust/advanced/token_reduction_example.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file, ExtractionConfig, TokenReductionConfig};
|
||||
|
||||
let config = ExtractionConfig {
|
||||
token_reduction: Some(TokenReductionConfig {
|
||||
mode: "moderate".to_string(),
|
||||
preserve_markdown: true,
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file("verbose_document.pdf", None, &config).await?;
|
||||
|
||||
if let Some(original) = result.original_token_count {
|
||||
println!("Original tokens: {}", original);
|
||||
}
|
||||
if let Some(reduced) = result.reduced_token_count {
|
||||
println!("Reduced tokens: {}", reduced);
|
||||
}
|
||||
```
|
||||
55
docs/snippets/rust/advanced/vector_database_integration.md
Normal file
55
docs/snippets/rust/advanced/vector_database_integration.md
Normal file
@@ -0,0 +1,55 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file, ExtractionConfig, ChunkingConfig, EmbeddingConfig};
|
||||
|
||||
struct VectorRecord {
|
||||
id: String,
|
||||
content: String,
|
||||
embedding: Vec<f32>,
|
||||
metadata: std::collections::HashMap<String, String>,
|
||||
}
|
||||
|
||||
async fn extract_and_vectorize(
|
||||
document_path: &str,
|
||||
document_id: &str,
|
||||
) -> Result<Vec<VectorRecord>, Box<dyn std::error::Error>> {
|
||||
let config = ExtractionConfig {
|
||||
chunking: Some(ChunkingConfig {
|
||||
max_characters: 512,
|
||||
overlap: 50,
|
||||
embedding: Some(EmbeddingConfig {
|
||||
model: kreuzberg::EmbeddingModelType::Preset {
|
||||
name: "balanced".to_string(),
|
||||
},
|
||||
normalize: true,
|
||||
batch_size: 32,
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file(document_path, None, &config).await?;
|
||||
|
||||
let mut records = Vec::new();
|
||||
if let Some(chunks) = result.chunks {
|
||||
for (index, chunk) in chunks.iter().enumerate() {
|
||||
if let Some(embedding) = &chunk.embedding {
|
||||
let mut metadata = std::collections::HashMap::new();
|
||||
metadata.insert("document_id".to_string(), document_id.to_string());
|
||||
metadata.insert("chunk_index".to_string(), index.to_string());
|
||||
metadata.insert("content_length".to_string(), chunk.content.len().to_string());
|
||||
|
||||
records.push(VectorRecord {
|
||||
id: format!("{}_chunk_{}", document_id, index),
|
||||
content: chunk.content.clone(),
|
||||
embedding: embedding.clone(),
|
||||
metadata,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(records)
|
||||
}
|
||||
```
|
||||
Reference in New Issue
Block a user