This commit is contained in:
63
docs/snippets/rust/config/advanced_config.md
Normal file
63
docs/snippets/rust/config/advanced_config.md
Normal file
@@ -0,0 +1,63 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file, ExtractionConfig, OcrConfig, ChunkingConfig, LanguageDetectionConfig, TokenReductionConfig, PostProcessorConfig, EmbeddingConfig, EmbeddingModelType};
|
||||
use kreuzberg::keywords::{KeywordConfig, KeywordAlgorithm};
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig {
|
||||
use_cache: true,
|
||||
enable_quality_processing: true,
|
||||
ocr: Some(OcrConfig {
|
||||
backend: "tesseract".to_string(),
|
||||
language: "eng".to_string(),
|
||||
tesseract_config: None,
|
||||
output_format: None,
|
||||
paddle_ocr_config: None,
|
||||
element_config: None,
|
||||
}),
|
||||
chunking: Some(ChunkingConfig {
|
||||
max_characters: 1000,
|
||||
overlap: 200,
|
||||
embedding: Some(EmbeddingConfig {
|
||||
model: EmbeddingModelType::Preset { name: "balanced".to_string() },
|
||||
batch_size: 32,
|
||||
normalize: true,
|
||||
show_download_progress: false,
|
||||
cache_dir: None,
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
language_detection: Some(LanguageDetectionConfig {
|
||||
enabled: true,
|
||||
min_confidence: 0.8,
|
||||
detect_multiple: false,
|
||||
}),
|
||||
keywords: Some(KeywordConfig {
|
||||
algorithm: KeywordAlgorithm::Yake,
|
||||
max_keywords: 10,
|
||||
min_score: 0.1,
|
||||
ngram_range: (1, 3),
|
||||
language: Some("en".to_string()),
|
||||
..Default::default()
|
||||
}),
|
||||
token_reduction: Some(TokenReductionConfig {
|
||||
mode: "moderate".to_string(),
|
||||
preserve_important_words: true,
|
||||
}),
|
||||
postprocessor: Some(PostProcessorConfig {
|
||||
enabled: true,
|
||||
enabled_processors: None,
|
||||
disabled_processors: None,
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file("document.pdf", None::<&str>, &config).await?;
|
||||
println!("Content: {}", result.content);
|
||||
if let Some(langs) = &result.detected_languages {
|
||||
println!("Languages: {:?}", langs);
|
||||
}
|
||||
println!("Chunks: {}", result.chunks.as_ref().map(|c| c.len()).unwrap_or(0));
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
79
docs/snippets/rust/config/chunking_config.md
Normal file
79
docs/snippets/rust/config/chunking_config.md
Normal file
@@ -0,0 +1,79 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file, ExtractionConfig, ChunkingConfig};
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig {
|
||||
chunking: Some(ChunkingConfig {
|
||||
max_characters: 1000,
|
||||
overlap: 200,
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file("document.pdf", None::<&str>, &config).await?;
|
||||
println!("Chunks: {}", result.chunks.len());
|
||||
for chunk in &result.chunks {
|
||||
println!("Length: {}", chunk.content.len());
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
|
||||
```rust title="Rust - Markdown with Heading Context"
|
||||
use kreuzberg::{extract_file, ExtractionConfig, ChunkingConfig, ChunkerType, ChunkSizing};
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig {
|
||||
chunking: Some(ChunkingConfig {
|
||||
max_characters: 500,
|
||||
overlap: 50,
|
||||
chunker_type: ChunkerType::Markdown,
|
||||
sizing: ChunkSizing::Tokenizer {
|
||||
model: "Xenova/gpt-4o".into(),
|
||||
cache_dir: None,
|
||||
},
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file("document.md", None::<&str>, &config).await?;
|
||||
for chunk in &result.chunks {
|
||||
if let Some(heading_context) = &chunk.metadata.heading_context {
|
||||
for heading in &heading_context.headings {
|
||||
println!("Heading L{}: {}", heading.level, heading.text);
|
||||
}
|
||||
}
|
||||
println!("Content: {}...", &chunk.content[..100.min(chunk.content.len())]);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
|
||||
```rust title="Rust - Prepend Heading Context"
|
||||
use kreuzberg::{extract_file, ExtractionConfig, ChunkingConfig, ChunkerType};
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig {
|
||||
chunking: Some(ChunkingConfig {
|
||||
max_characters: 500,
|
||||
overlap: 50,
|
||||
chunker_type: ChunkerType::Markdown,
|
||||
prepend_heading_context: true,
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file("document.md", None::<&str>, &config).await?;
|
||||
for chunk in &result.chunks {
|
||||
// Each chunk's content is prefixed with its heading breadcrumb
|
||||
println!("Content: {}...", &chunk.content[..100.min(chunk.content.len())]);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
16
docs/snippets/rust/config/config_basic.md
Normal file
16
docs/snippets/rust/config/config_basic.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file, ExtractionConfig};
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig {
|
||||
use_cache: true,
|
||||
enable_quality_processing: true,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file("document.pdf", None, &config).await?;
|
||||
println!("{}", result.content);
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
11
docs/snippets/rust/config/config_discover.md
Normal file
11
docs/snippets/rust/config/config_discover.md
Normal file
@@ -0,0 +1,11 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file, ExtractionConfig};
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig::discover()?.unwrap_or_default();
|
||||
let result = extract_file("document.pdf", None, &config).await?;
|
||||
println!("{}", result.content);
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
20
docs/snippets/rust/config/config_ocr.md
Normal file
20
docs/snippets/rust/config/config_ocr.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file_sync, ExtractionConfig, OcrConfig};
|
||||
|
||||
fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig {
|
||||
ocr: Some(OcrConfig {
|
||||
backend: "tesseract".to_string(),
|
||||
language: "eng".to_string(),
|
||||
tesseract_config: None,
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file_sync("scanned.pdf", None, &config)?;
|
||||
println!("Content length: {}", result.content.len());
|
||||
println!("Tables detected: {}", result.tables.len());
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
29
docs/snippets/rust/config/config_programmatic.md
Normal file
29
docs/snippets/rust/config/config_programmatic.md
Normal file
@@ -0,0 +1,29 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file_sync, ChunkingConfig, ExtractionConfig, OcrConfig, TesseractConfig};
|
||||
|
||||
fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
let config = ExtractionConfig {
|
||||
use_cache: true,
|
||||
ocr: Some(OcrConfig {
|
||||
backend: "tesseract".to_string(),
|
||||
language: "eng+deu".to_string(),
|
||||
tesseract_config: Some(TesseractConfig {
|
||||
psm: 6,
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
chunking: Some(ChunkingConfig {
|
||||
max_characters: 1000,
|
||||
overlap: 200,
|
||||
..Default::default()
|
||||
}),
|
||||
enable_quality_processing: true,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file_sync("document.pdf", None, &config)?;
|
||||
println!("Content length: {}", result.content.len());
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
17
docs/snippets/rust/config/document_structure_config.md
Normal file
17
docs/snippets/rust/config/document_structure_config.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```rust title="Document Structure Config (Rust)"
|
||||
use kreuzberg::{extract_file_sync, ExtractionConfig};
|
||||
|
||||
// Request the structured document tree alongside the regular extraction output.
let config = ExtractionConfig {
    include_document_structure: true,
    ..Default::default()
};

let result = extract_file_sync("document.pdf", None, &config)?;

// `document` is optional on the result; print each node's type and a preview.
if let Some(document) = &result.document {
    for node in &document.nodes {
        let text = node.content.text().unwrap_or("");
        // Truncate by characters, not bytes: slicing `[..80]` panics when byte
        // 80 falls inside a multi-byte UTF-8 character.
        let preview: String = text.chars().take(80).collect();
        println!("[{}] {}", node.content.node_type_str(), preview);
    }
}
|
||||
```
|
||||
48
docs/snippets/rust/config/element_based_output.md
Normal file
48
docs/snippets/rust/config/element_based_output.md
Normal file
@@ -0,0 +1,48 @@
|
||||
```rust title="Element-Based Output (Rust)"
|
||||
use kreuzberg::{extract_file_sync, ExtractionConfig};
|
||||
use kreuzberg::types::OutputFormat as ResultFormat;
|
||||
|
||||
fn main() -> kreuzberg::Result<()> {
|
||||
// Configure element-based output (result_format controls Unified vs ElementBased)
|
||||
let config = ExtractionConfig {
|
||||
result_format: ResultFormat::ElementBased,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
// Extract document
|
||||
let result = extract_file_sync("document.pdf", None, &config)?;
|
||||
|
||||
// Access elements
|
||||
if let Some(elements) = result.elements {
|
||||
for element in &elements {
|
||||
println!("Type: {:?}", element.element_type);
|
||||
println!("Text: {}", &element.text[..100.min(element.text.len())]);
|
||||
|
||||
if let Some(page) = element.metadata.page_number {
|
||||
println!("Page: {}", page);
|
||||
}
|
||||
|
||||
if let Some(coords) = &element.metadata.coordinates {
|
||||
println!("Coords: ({}, {}) - ({}, {})",
|
||||
coords.x0, coords.y0, coords.x1, coords.y1);
|
||||
}
|
||||
|
||||
println!("---");
|
||||
}
|
||||
|
||||
// Filter by element type
|
||||
let titles: Vec<_> = elements.iter()
|
||||
.filter(|e| matches!(e.element_type, kreuzberg::types::ElementType::Title))
|
||||
.collect();
|
||||
|
||||
for title in titles {
|
||||
let level = title.metadata.additional.get("level")
|
||||
.map(|v| v.as_ref())
|
||||
.unwrap_or("unknown");
|
||||
println!("[{}] {}", level, title.text);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
28
docs/snippets/rust/config/embedding_config.md
Normal file
28
docs/snippets/rust/config/embedding_config.md
Normal file
@@ -0,0 +1,28 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file, ExtractionConfig, ChunkingConfig, EmbeddingConfig, EmbeddingModelType};
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig {
|
||||
chunking: Some(ChunkingConfig {
|
||||
max_characters: 1000,
|
||||
overlap: 200,
|
||||
embedding: Some(EmbeddingConfig {
|
||||
model: EmbeddingModelType::Preset {
|
||||
name: "balanced".to_string(),
|
||||
},
|
||||
batch_size: 16,
|
||||
normalize: true,
|
||||
show_download_progress: true,
|
||||
cache_dir: None,
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file("document.pdf", None::<&str>, &config).await?;
|
||||
println!("Chunks with embeddings: {}", result.chunks.as_ref().map(|c| c.len()).unwrap_or(0));
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
85
docs/snippets/rust/config/embedding_config.rs
Normal file
85
docs/snippets/rust/config/embedding_config.rs
Normal file
@@ -0,0 +1,85 @@
|
||||
// Example 1: Preset model (recommended)
|
||||
// Fast, balanced, or quality preset configurations optimized for common use cases.
|
||||
let embedding_config = EmbeddingConfig {
    model: EmbeddingModelType::Preset {
        name: "balanced".to_string(),
    },
    batch_size: 32,
    normalize: true,
    show_download_progress: true,
    // `PathBuf` does not expand "~" (that is a shell feature), so the home
    // directory is resolved explicitly. When HOME is unset this is `None`.
    cache_dir: std::env::var_os("HOME")
        .map(|home| std::path::PathBuf::from(home).join(".cache/kreuzberg/embeddings")),
    acceleration: None,
};
|
||||
|
||||
// Available presets:
|
||||
// - "fast" (384 dims): Quick prototyping, development, resource-constrained
|
||||
// - "balanced" (768 dims): Production, general-purpose RAG, English documents
|
||||
// - "quality" (1024 dims): Complex documents, maximum accuracy
|
||||
// - "multilingual" (768 dims): International documents, 100+ languages
|
||||
|
||||
// Example 2: Custom ONNX model (requires embeddings feature)
|
||||
// Direct access to specific ONNX embedding models from HuggingFace with custom dimensions.
|
||||
let embedding_config = {
    // Custom variant: the model id plus the dimensionality of its vectors.
    let model = EmbeddingModelType::Custom {
        model_id: "BAAI/bge-small-en-v1.5".to_string(),
        dimensions: 384,
    };

    EmbeddingConfig {
        model,
        batch_size: 32,
        normalize: true,
        show_download_progress: true,
        cache_dir: None, // Uses default: .kreuzberg/embeddings/
        acceleration: None,
    }
};
|
||||
|
||||
// Popular ONNX-compatible models:
|
||||
// - "BAAI/bge-small-en-v1.5" (384 dims): Fast, efficient
|
||||
// - "BAAI/bge-base-en-v1.5" (768 dims): Balanced quality/speed
|
||||
// - "BAAI/bge-large-en-v1.5" (1024 dims): High quality, slower
|
||||
// - "sentence-transformers/paraphrase-multilingual-mpnet-base-v2" (768 dims): Multilingual support
|
||||
|
||||
// Example 3: Alternative Custom ONNX Model
|
||||
// For advanced users wanting different ONNX embedding models.
|
||||
let embedding_config = {
    // Custom variant pointing at a 768-dimensional model.
    let model = EmbeddingModelType::Custom {
        model_id: "sentence-transformers/all-mpnet-base-v2".to_string(),
        dimensions: 768,
    };
    // Models are stored under an explicit, absolute cache directory.
    let model_cache = std::path::PathBuf::from("/var/cache/embeddings");

    EmbeddingConfig {
        model,
        batch_size: 16, // Larger model requires smaller batch size
        normalize: true,
        show_download_progress: true,
        cache_dir: Some(model_cache),
        acceleration: None,
    }
};
|
||||
|
||||
// Integration with ChunkingConfig
|
||||
// Add embeddings to your chunking configuration:
|
||||
use kreuzberg::{ChunkingConfig, ExtractionConfig};
|
||||
|
||||
// Chunking settings with an embedding configuration attached.
let chunking_with_embeddings = ChunkingConfig {
    max_characters: 1024,
    overlap: 100,
    preset: Some("balanced".to_string()),
    embedding: Some(EmbeddingConfig::default()), // Uses balanced preset
    // ChunkingConfig has more fields than the ones named here; leave the
    // rest at their defaults so the literal is complete.
    ..Default::default()
};

// Plug the chunking settings into the top-level extraction configuration.
let extraction_config = ExtractionConfig {
    chunking: Some(chunking_with_embeddings),
    ..Default::default()
};
|
||||
|
||||
// Key parameter explanations:
|
||||
//
|
||||
// batch_size: Number of texts to embed at once (32-128 typical)
|
||||
// - Larger batches are faster but use more memory
|
||||
// - Smaller batches for resource-constrained environments
|
||||
//
|
||||
// normalize: Whether to normalize vectors (L2 norm)
|
||||
// - true (recommended): Enables cosine similarity in vector DBs
|
||||
// - false: Raw embedding values
|
||||
//
|
||||
// cache_dir: Where to store downloaded models
|
||||
// - None: Uses .kreuzberg/embeddings/ in current directory
|
||||
// - Some(path): Custom directory for model storage
|
||||
//
|
||||
// show_download_progress: Display download progress bar
|
||||
// - Useful for monitoring large model downloads
|
||||
90
docs/snippets/rust/config/hierarchy_config.rs
Normal file
90
docs/snippets/rust/config/hierarchy_config.rs
Normal file
@@ -0,0 +1,90 @@
|
||||
use kreuzberg::{ExtractionConfig, HierarchyConfig, PdfConfig};
|
||||
|
||||
// Example 1: Basic hierarchy extraction
|
||||
// Enabled with default k_clusters=6 for standard H1-H6 heading hierarchy.
|
||||
// Extract bounding box information for spatial layout awareness.
|
||||
// Hierarchy settings: six clusters, bounding boxes on, no OCR threshold.
let hierarchy_config_basic = HierarchyConfig {
    ocr_coverage_threshold: None, // No OCR coverage threshold
    include_bbox: true,           // Include bounding box coordinates
    k_clusters: 6,                // Default: creates 6 font size clusters (H1-H6 structure)
    enabled: true,
};

// Attach the hierarchy settings to the PDF options ...
let pdf_config_basic = PdfConfig {
    hierarchy: Some(hierarchy_config_basic),
    ..Default::default()
};

// ... and the PDF options to the extraction configuration.
let extraction_config_basic = ExtractionConfig {
    pdf_options: Some(pdf_config_basic),
    ..Default::default()
};
|
||||
|
||||
// Use with extract_file_sync or extract_bytes_sync
|
||||
// let result = extract_file_sync("document.pdf", None, &extraction_config_basic)?;
|
||||
|
||||
|
||||
// Example 2: Custom k_clusters for minimal structure
|
||||
// Use 3 clusters for simpler hierarchy with minimal structure.
|
||||
// Useful when you only need major section divisions (Main, Subsection, Detail).
|
||||
// Hierarchy settings reduced to three clusters; everything else as in the
// basic example.
let hierarchy_config_minimal = HierarchyConfig {
    ocr_coverage_threshold: None,
    include_bbox: true,
    k_clusters: 3, // Minimal clustering: just 3 levels
    enabled: true,
};

// Wrap the hierarchy settings in PDF options, then in the extraction config.
let pdf_config_minimal = PdfConfig {
    hierarchy: Some(hierarchy_config_minimal),
    ..Default::default()
};

let extraction_config_minimal = ExtractionConfig {
    pdf_options: Some(pdf_config_minimal),
    ..Default::default()
};
|
||||
|
||||
|
||||
// Example 3: With OCR coverage threshold
|
||||
// Trigger OCR if less than 50% of text has font data.
|
||||
// Useful for documents with mixed digital and scanned content.
|
||||
// Hierarchy settings with an OCR coverage threshold of 0.5.
let hierarchy_config_ocr = HierarchyConfig {
    ocr_coverage_threshold: Some(0.5), // Trigger OCR if text coverage < 50%
    include_bbox: true,
    k_clusters: 6,
    enabled: true,
};

// Wrap the hierarchy settings in PDF options, then in the extraction config.
let pdf_config_ocr = PdfConfig {
    hierarchy: Some(hierarchy_config_ocr),
    ..Default::default()
};

let extraction_config_ocr = ExtractionConfig {
    pdf_options: Some(pdf_config_ocr),
    ..Default::default()
};
|
||||
|
||||
|
||||
// Field descriptions:
|
||||
//
|
||||
// enabled: bool (default: true)
|
||||
// - Enable or disable hierarchy extraction
|
||||
// - When false, hierarchy structure is not analyzed
|
||||
//
|
||||
// k_clusters: usize (default: 6, valid: 1-7)
|
||||
// - Number of font size clusters for hierarchy levels
|
||||
// - 6 provides H1-H6 heading levels with body text
|
||||
// - Higher values create more fine-grained hierarchy
|
||||
// - Lower values create simpler structure
|
||||
//
|
||||
// include_bbox: bool (default: true)
|
||||
// - Include bounding box coordinates in hierarchy blocks
|
||||
// - Required for spatial layout awareness and document structure
|
||||
// - Set to false only if space optimization is critical
|
||||
//
|
||||
// ocr_coverage_threshold: Option<f32> (default: None)
|
||||
// - Range: 0.0 to 1.0
|
||||
// - Triggers OCR when text block coverage falls below this fraction
|
||||
// - Example: Some(0.5) means "run OCR if less than 50% of page has text data"
|
||||
// - None means no OCR coverage-based triggering
|
||||
14
docs/snippets/rust/config/html_output.md
Normal file
14
docs/snippets/rust/config/html_output.md
Normal file
@@ -0,0 +1,14 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file_sync, ExtractionConfig, HtmlOutputConfig, HtmlTheme, OutputFormat};
|
||||
|
||||
// HTML rendering options: pick the GitHub theme, keep the rest at defaults.
let html_options = HtmlOutputConfig {
    theme: HtmlTheme::GitHub,
    ..Default::default()
};

// Ask for HTML output and attach the rendering options.
let config = ExtractionConfig {
    output_format: OutputFormat::Html,
    html_output: Some(html_options),
    ..Default::default()
};

let extraction = extract_file_sync("document.pdf", None, &config).unwrap();
println!("{}", extraction.content); // HTML with kb-* classes
|
||||
```
|
||||
72
docs/snippets/rust/config/keyword_config.rs
Normal file
72
docs/snippets/rust/config/keyword_config.rs
Normal file
@@ -0,0 +1,72 @@
|
||||
use kreuzberg::{extract_file, ExtractionConfig};
|
||||
use kreuzberg::keywords::{KeywordConfig, KeywordAlgorithm, YakeParams, RakeParams};
|
||||
|
||||
// Example 1: Basic YAKE configuration
|
||||
// Uses YAKE algorithm with default parameters and English stopword filtering
|
||||
pub async fn basic_yake() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig {
|
||||
keywords: Some(KeywordConfig {
|
||||
algorithm: KeywordAlgorithm::Yake,
|
||||
max_keywords: 10,
|
||||
min_score: 0.0,
|
||||
ngram_range: (1, 3),
|
||||
language: Some("en".to_string()),
|
||||
yake_params: None,
|
||||
rake_params: None,
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file("document.pdf", None::<&str>, &config).await?;
|
||||
println!("Keywords: {:?}", result.keywords);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Example 2: Advanced YAKE with custom parameters
|
||||
// Fine-tunes YAKE with custom window size for co-occurrence analysis
|
||||
#[cfg(feature = "keywords-yake")]
pub async fn advanced_yake() -> kreuzberg::Result<()> {
    // YAKE-specific tuning: a window size of 1.
    let yake_tuning = YakeParams { window_size: 1 };

    let keyword_settings = KeywordConfig {
        algorithm: KeywordAlgorithm::Yake,
        max_keywords: 15,
        min_score: 0.1,
        ngram_range: (1, 2),
        language: Some("en".to_string()),
        yake_params: Some(yake_tuning),
        rake_params: None,
    };
    let config = ExtractionConfig {
        keywords: Some(keyword_settings),
        ..Default::default()
    };

    let extraction = extract_file("document.pdf", None::<&str>, &config).await?;
    println!("Keywords: {:?}", extraction.keywords);

    Ok(())
}
|
||||
|
||||
// Example 3: RAKE configuration
|
||||
// Uses RAKE algorithm for rapid keyword extraction with phrase constraints
|
||||
#[cfg(feature = "keywords-rake")]
pub async fn rake_config() -> kreuzberg::Result<()> {
    // RAKE-specific tuning: minimum word length and maximum words per phrase.
    let rake_tuning = RakeParams {
        min_word_length: 1,
        max_words_per_phrase: 3,
    };

    let keyword_settings = KeywordConfig {
        algorithm: KeywordAlgorithm::Rake,
        max_keywords: 10,
        min_score: 5.0,
        ngram_range: (1, 3),
        language: Some("en".to_string()),
        yake_params: None,
        rake_params: Some(rake_tuning),
    };
    let config = ExtractionConfig {
        keywords: Some(keyword_settings),
        ..Default::default()
    };

    let extraction = extract_file("document.pdf", None::<&str>, &config).await?;
    println!("Keywords: {:?}", extraction.keywords);

    Ok(())
}
|
||||
23
docs/snippets/rust/config/keyword_extraction_config.md
Normal file
23
docs/snippets/rust/config/keyword_extraction_config.md
Normal file
@@ -0,0 +1,23 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file, ExtractionConfig};
|
||||
use kreuzberg::keywords::{KeywordConfig, KeywordAlgorithm};
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig {
|
||||
keywords: Some(KeywordConfig {
|
||||
algorithm: KeywordAlgorithm::Yake,
|
||||
max_keywords: 10,
|
||||
min_score: 0.1,
|
||||
ngram_range: (1, 3),
|
||||
language: Some("en".to_string()),
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file("document.pdf", None::<&str>, &config).await?;
|
||||
println!("Keywords: {:?}", result.keywords);
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
20
docs/snippets/rust/config/language_detection_config.md
Normal file
20
docs/snippets/rust/config/language_detection_config.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file, ExtractionConfig, LanguageDetectionConfig};
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig {
|
||||
language_detection: Some(LanguageDetectionConfig {
|
||||
enabled: true,
|
||||
min_confidence: 0.8,
|
||||
detect_multiple: true,
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file("document.pdf", None::<&str>, &config).await?;
|
||||
println!("Detected language: {}", result.language);
|
||||
println!("Confidence: {}", result.language_confidence);
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
21
docs/snippets/rust/config/ocr_dpi_config.md
Normal file
21
docs/snippets/rust/config/ocr_dpi_config.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file_sync, ExtractionConfig, ImageExtractionConfig};
|
||||
|
||||
fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig {
|
||||
images: Some(ImageExtractionConfig {
|
||||
extract_images: true,
|
||||
target_dpi: 300,
|
||||
max_image_dimension: 4096,
|
||||
auto_adjust_dpi: true,
|
||||
min_dpi: 150,
|
||||
max_dpi: 600,
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file_sync("document.pdf", None::<&str>, &config)?;
|
||||
println!("Extracted images: {}", result.images.len());
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
20
docs/snippets/rust/config/pdf_config.md
Normal file
20
docs/snippets/rust/config/pdf_config.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file_sync, ExtractionConfig, PdfConfig, HierarchyConfig};
|
||||
|
||||
fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig {
|
||||
pdf_options: Some(PdfConfig {
|
||||
extract_images: true,
|
||||
passwords: Some(vec!["password123".to_string()]),
|
||||
extract_metadata: true,
|
||||
hierarchy: Some(HierarchyConfig::default()),
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file_sync("encrypted.pdf", None, &config)?;
|
||||
println!("Title: {:?}", result.metadata.title);
|
||||
println!("Authors: {:?}", result.metadata.authors);
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
23
docs/snippets/rust/config/pdf_hierarchy_config.md
Normal file
23
docs/snippets/rust/config/pdf_hierarchy_config.md
Normal file
@@ -0,0 +1,23 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file_sync, ExtractionConfig, PdfConfig, HierarchyConfig};
|
||||
|
||||
fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig {
|
||||
pdf_options: Some(PdfConfig {
|
||||
hierarchy: Some(HierarchyConfig {
|
||||
enabled: true,
|
||||
detection_threshold: Some(0.75),
|
||||
ocr_coverage_threshold: Some(0.8),
|
||||
min_level: Some(1),
|
||||
max_level: Some(5),
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file_sync("document.pdf", None::<&str>, &config)?;
|
||||
println!("Hierarchy levels: {}", result.hierarchy.len());
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
22
docs/snippets/rust/config/postprocessor_config.md
Normal file
22
docs/snippets/rust/config/postprocessor_config.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file, ExtractionConfig, PostProcessorConfig};
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig {
|
||||
postprocessor: Some(PostProcessorConfig {
|
||||
enabled: true,
|
||||
enabled_processors: Some(vec![
|
||||
"whitespace_normalizer".to_string(),
|
||||
"unicode_normalizer".to_string(),
|
||||
]),
|
||||
disabled_processors: None,
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file("document.pdf", None::<&str>, &config).await?;
|
||||
println!("Processed content: {}", result.content);
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
16
docs/snippets/rust/config/quality_processing_config.md
Normal file
16
docs/snippets/rust/config/quality_processing_config.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file_sync, ExtractionConfig};
|
||||
|
||||
fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig {
|
||||
enable_quality_processing: true,
|
||||
use_cache: true,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file_sync("document.pdf", None::<&str>, &config)?;
|
||||
println!("Quality score: {}", result.quality_score);
|
||||
println!("Processing time: {:?}", result.processing_time);
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
23
docs/snippets/rust/config/tesseract_config.md
Normal file
23
docs/snippets/rust/config/tesseract_config.md
Normal file
@@ -0,0 +1,23 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file_sync, ExtractionConfig, OcrConfig};
|
||||
use kreuzberg::types::TesseractConfig;
|
||||
|
||||
fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig {
|
||||
ocr: Some(OcrConfig {
|
||||
backend: "tesseract".to_string(),
|
||||
language: "eng+deu".to_string(),
|
||||
tesseract_config: Some(TesseractConfig {
|
||||
psm: Some(6),
|
||||
oem: Some(3),
|
||||
..Default::default()
|
||||
}),
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file_sync("scanned.pdf", None::<&str>, &config)?;
|
||||
println!("OCR text: {}", result.content);
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
19
docs/snippets/rust/config/token_reduction_config.md
Normal file
19
docs/snippets/rust/config/token_reduction_config.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file, ExtractionConfig, TokenReductionConfig};
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig {
|
||||
token_reduction: Some(TokenReductionConfig {
|
||||
mode: "moderate".to_string(),
|
||||
preserve_important_words: true,
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file("document.pdf", None::<&str>, &config).await?;
|
||||
println!("Original tokens: {}", result.token_count);
|
||||
println!("Reduced content: {}", result.content);
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
Reference in New Issue
Block a user