Files
fil/docs/snippets/rust/config/chunking_config.md

80 lines
2.4 KiB
Markdown
Raw Normal View History

2026-06-01 23:40:55 +02:00
```rust title="Rust"
use kreuzberg::{extract_file, ExtractionConfig, ChunkingConfig};
/// Extracts `document.pdf` with chunking enabled (`max_characters = 1000`,
/// `overlap = 200`) and prints the chunk count followed by each chunk's length.
#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
    // Build the chunking settings on their own; every field not listed here
    // falls back to its `Default` value.
    let chunking = ChunkingConfig {
        max_characters: 1000,
        overlap: 200,
        ..Default::default()
    };
    let config = ExtractionConfig {
        chunking: Some(chunking),
        ..Default::default()
    };

    let extraction = extract_file("document.pdf", None::<&str>, &config).await?;
    let chunks = &extraction.chunks;

    println!("Chunks: {}", chunks.len());
    for piece in chunks {
        let content_len = piece.content.len();
        println!("Length: {}", content_len);
    }

    Ok(())
}
```
```rust title="Rust - Markdown with Heading Context"
use kreuzberg::{extract_file, ExtractionConfig, ChunkingConfig, ChunkerType, ChunkSizing};
/// Extracts `document.md` using the Markdown chunker with tokenizer-based
/// sizing, then prints each chunk's heading context (when present) followed by
/// a short preview of the chunk content.
#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
    let config = ExtractionConfig {
        chunking: Some(ChunkingConfig {
            max_characters: 500,
            overlap: 50,
            chunker_type: ChunkerType::Markdown,
            sizing: ChunkSizing::Tokenizer {
                model: "Xenova/gpt-4o".into(),
                cache_dir: None,
            },
            ..Default::default()
        }),
        ..Default::default()
    };
    let result = extract_file("document.md", None::<&str>, &config).await?;
    for chunk in &result.chunks {
        // Heading context is optional; print it only when the chunk has one.
        if let Some(heading_context) = &chunk.metadata.heading_context {
            for heading in &heading_context.headings {
                println!("Heading L{}: {}", heading.level, heading.text);
            }
        }
        // Preview the first 100 characters. Cutting at a fixed byte offset
        // (`&content[..100]`) panics when byte 100 falls inside a multi-byte
        // UTF-8 character, so locate a valid boundary with `char_indices`.
        let preview_end = chunk
            .content
            .char_indices()
            .nth(100)
            .map_or(chunk.content.len(), |(idx, _)| idx);
        println!("Content: {}...", &chunk.content[..preview_end]);
    }
    Ok(())
}
```
```rust title="Rust - Prepend Heading Context"
use kreuzberg::{extract_file, ExtractionConfig, ChunkingConfig, ChunkerType};
/// Extracts `document.md` using the Markdown chunker with
/// `prepend_heading_context` enabled and prints a short preview of every chunk.
#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
    let config = ExtractionConfig {
        chunking: Some(ChunkingConfig {
            max_characters: 500,
            overlap: 50,
            chunker_type: ChunkerType::Markdown,
            prepend_heading_context: true,
            ..Default::default()
        }),
        ..Default::default()
    };
    let result = extract_file("document.md", None::<&str>, &config).await?;
    for chunk in &result.chunks {
        // Each chunk's content is prefixed with its heading breadcrumb.
        // Preview the first 100 characters. Cutting at a fixed byte offset
        // (`&content[..100]`) panics when byte 100 falls inside a multi-byte
        // UTF-8 character, so locate a valid boundary with `char_indices`.
        let preview_end = chunk
            .content
            .char_indices()
            .nth(100)
            .map_or(chunk.content.len(), |(idx, _)| idx);
        println!("Content: {}...", &chunk.content[..preview_end]);
    }
    Ok(())
}
```