This commit is contained in:
16
docs/snippets/rust/utils/chunking.md
Normal file
16
docs/snippets/rust/utils/chunking.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{ChunkingConfig, EmbeddingConfig, EmbeddingModelType, ExtractionConfig};
|
||||
|
||||
// Embedding settings for the chunks: only the model preset is chosen here,
// every other field keeps its default.
let embedding = EmbeddingConfig {
    model: EmbeddingModelType::Preset {
        name: String::from("all-minilm-l6-v2"),
    },
    ..Default::default()
};

// Chunking settings: `max_characters` is 1500 and `overlap` is 200; the
// embedding settings above are attached to the chunking step.
let chunking = ChunkingConfig {
    max_characters: 1500,
    overlap: 200,
    embedding: Some(embedding),
    ..Default::default()
};

// Extraction configuration with chunking switched on.
let config = ExtractionConfig {
    chunking: Some(chunking),
    ..Default::default()
};
|
||||
```
|
||||
34
docs/snippets/rust/utils/chunking_rag.md
Normal file
34
docs/snippets/rust/utils/chunking_rag.md
Normal file
@@ -0,0 +1,34 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{
|
||||
extract_file, ChunkingConfig, EmbeddingConfig, EmbeddingModelType, ExtractionConfig,
|
||||
};
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig {
|
||||
chunking: Some(ChunkingConfig {
|
||||
max_characters: 500,
|
||||
overlap: 50,
|
||||
embedding: Some(EmbeddingConfig {
|
||||
model: EmbeddingModelType::Preset { name: "balanced".to_string() },
|
||||
normalize: true,
|
||||
batch_size: 16,
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file("research_paper.pdf", None, &config).await?;
|
||||
|
||||
let mut chunks_with_embeddings = 0usize;
|
||||
for chunk in result.chunks.unwrap_or_default() {
|
||||
if chunk.embedding.is_some() {
|
||||
chunks_with_embeddings += 1;
|
||||
}
|
||||
}
|
||||
println!("Chunks with embeddings: {chunks_with_embeddings}");
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
19
docs/snippets/rust/utils/embedding_with_chunking.md
Normal file
19
docs/snippets/rust/utils/embedding_with_chunking.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{ChunkingConfig, EmbeddingConfig, EmbeddingModelType, ExtractionConfig};
|
||||
|
||||
// Embedding settings: the "balanced" preset with explicit values for
// normalization, batch size and download-progress output.
let embedding = EmbeddingConfig {
    model: EmbeddingModelType::Preset {
        name: String::from("balanced"),
    },
    normalize: true,
    batch_size: 32,
    show_download_progress: false,
    ..Default::default()
};

// Chunking settings: `max_characters` is 1024 and `overlap` is 100; the
// embedding settings above are attached to the chunking step.
let chunking = ChunkingConfig {
    max_characters: 1024,
    overlap: 100,
    embedding: Some(embedding),
    ..Default::default()
};

// Extraction configuration with chunking switched on.
let config = ExtractionConfig {
    chunking: Some(chunking),
    ..Default::default()
};
|
||||
```
|
||||
23
docs/snippets/rust/utils/keyword_extraction_example.md
Normal file
23
docs/snippets/rust/utils/keyword_extraction_example.md
Normal file
@@ -0,0 +1,23 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file, ExtractionConfig, KeywordAlgorithm, KeywordConfig};
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig {
|
||||
keywords: Some(KeywordConfig {
|
||||
algorithm: KeywordAlgorithm::Yake,
|
||||
max_keywords: 10,
|
||||
min_score: 0.3,
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file("research_paper.pdf", None, &config).await?;
|
||||
|
||||
for kw in result.extracted_keywords.unwrap_or_default() {
|
||||
println!("{}: {:.3}", kw.text, kw.score);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
22
docs/snippets/rust/utils/quality_processing_example.md
Normal file
22
docs/snippets/rust/utils/quality_processing_example.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file, ExtractionConfig};
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig {
|
||||
enable_quality_processing: true,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file("scanned_document.pdf", None, &config).await?;
|
||||
let quality_score = result.quality_score.unwrap_or(0.0);
|
||||
|
||||
if quality_score < 0.5 {
|
||||
println!("Warning: Low quality extraction ({quality_score:.2})");
|
||||
println!("Consider re-scanning with higher DPI or adjusting OCR settings");
|
||||
} else {
|
||||
println!("Quality score: {quality_score:.2}");
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
15
docs/snippets/rust/utils/standalone_embed.md
Normal file
15
docs/snippets/rust/utils/standalone_embed.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{EmbeddingConfig, EmbeddingModelType, embed_texts};
|
||||
|
||||
// Embedding settings: the "balanced" preset with normalization turned on;
// every other field keeps its default.
let config = EmbeddingConfig {
    model: EmbeddingModelType::Preset {
        name: String::from("balanced"),
    },
    normalize: true,
    ..Default::default()
};

let texts = vec!["Hello, world!", "Kreuzberg is fast"];

// `?` propagates a failure, so this fragment belongs inside a function that
// returns a compatible `Result`.
let embeddings = embed_texts(&texts, &config)?;

// Expect one vector per input text, the first having 768 components.
assert_eq!(embeddings.len(), 2);
assert_eq!(embeddings[0].len(), 768);
|
||||
```
|
||||
18
docs/snippets/rust/utils/token_reduction.md
Normal file
18
docs/snippets/rust/utils/token_reduction.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file, ExtractionConfig, TokenReductionOptions};
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig {
|
||||
token_reduction: Some(TokenReductionOptions {
|
||||
mode: "moderate".to_string(),
|
||||
preserve_important_words: true,
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file("document.pdf", None, &config).await?;
|
||||
println!("Content length: {}", result.content.len());
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
39
docs/snippets/rust/utils/token_reduction_example.md
Normal file
39
docs/snippets/rust/utils/token_reduction_example.md
Normal file
@@ -0,0 +1,39 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file, ExtractionConfig, TokenReductionOptions};
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig {
|
||||
token_reduction: Some(TokenReductionOptions {
|
||||
mode: "moderate".to_string(),
|
||||
preserve_important_words: true,
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file("verbose_document.pdf", None, &config).await?;
|
||||
|
||||
let original = result
|
||||
.metadata
|
||||
.additional
|
||||
.get("original_token_count")
|
||||
.and_then(|v| v.as_u64())
|
||||
.unwrap_or(0);
|
||||
let reduced = result
|
||||
.metadata
|
||||
.additional
|
||||
.get("token_count")
|
||||
.and_then(|v| v.as_u64())
|
||||
.unwrap_or(0);
|
||||
let ratio = result
|
||||
.metadata
|
||||
.additional
|
||||
.get("token_reduction_ratio")
|
||||
.and_then(|v| v.as_f64())
|
||||
.unwrap_or(0.0);
|
||||
|
||||
println!("Reduced from {original} to {reduced} tokens");
|
||||
println!("Reduction: {:.1}%", ratio * 100.0);
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
27
docs/snippets/rust/utils/vector_database_integration.md
Normal file
27
docs/snippets/rust/utils/vector_database_integration.md
Normal file
@@ -0,0 +1,27 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file, ExtractionConfig, ChunkingConfig, EmbeddingConfig};
|
||||
|
||||
let config = ExtractionConfig {
|
||||
chunking: Some(ChunkingConfig {
|
||||
max_characters: 512,
|
||||
overlap: 50,
|
||||
embedding: Some(EmbeddingConfig {
|
||||
model: kreuzberg::EmbeddingModelType::Preset { name: "balanced".to_string() },
|
||||
normalize: true,
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file("document.pdf", None, &config).await?;
|
||||
|
||||
if let Some(chunks) = result.chunks {
|
||||
for (i, chunk) in chunks.iter().enumerate() {
|
||||
if let Some(embedding) = &chunk.embedding {
|
||||
println!("Chunk {}: {} dimensions", i, embedding.len());
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
Reference in New Issue
Block a user