Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,16 @@
```rust title="Rust"
use kreuzberg::{ChunkingConfig, EmbeddingConfig, EmbeddingModelType, ExtractionConfig};
// Embedding options: the model is chosen through the `Preset` variant; every
// other embedding field keeps its default value.
let embedding = EmbeddingConfig {
    model: EmbeddingModelType::Preset {
        name: String::from("all-minilm-l6-v2"),
    },
    ..Default::default()
};

// Chunking options: `max_characters` and `overlap` are set explicitly and the
// embedding options above are attached; remaining fields keep their defaults.
let chunking = ChunkingConfig {
    max_characters: 1500,
    overlap: 200,
    embedding: Some(embedding),
    ..Default::default()
};

// Top-level extraction configuration with chunking switched on.
let config = ExtractionConfig {
    chunking: Some(chunking),
    ..Default::default()
};
```

View File

@@ -0,0 +1,34 @@
```rust title="Rust"
use kreuzberg::{
extract_file, ChunkingConfig, EmbeddingConfig, EmbeddingModelType, ExtractionConfig,
};
/// Extracts `research_paper.pdf` with chunking and per-chunk embeddings
/// configured, then prints how many of the returned chunks carry an embedding.
#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
    // Embedding options attached to the chunking configuration below.
    let embedding = EmbeddingConfig {
        model: EmbeddingModelType::Preset {
            name: String::from("balanced"),
        },
        normalize: true,
        batch_size: 16,
        ..Default::default()
    };

    let config = ExtractionConfig {
        chunking: Some(ChunkingConfig {
            max_characters: 500,
            overlap: 50,
            embedding: Some(embedding),
            ..Default::default()
        }),
        ..Default::default()
    };

    let result = extract_file("research_paper.pdf", None, &config).await?;

    // A missing chunk list is treated as empty, so the count is then zero.
    let chunks_with_embeddings = result
        .chunks
        .unwrap_or_default()
        .into_iter()
        .filter(|chunk| chunk.embedding.is_some())
        .count();

    println!("Chunks with embeddings: {chunks_with_embeddings}");
    Ok(())
}
```

View File

@@ -0,0 +1,19 @@
```rust title="Rust"
use kreuzberg::{ChunkingConfig, EmbeddingConfig, EmbeddingModelType, ExtractionConfig};
// Embedding options: "balanced" preset, `normalize` on, `batch_size` of 32 and
// `show_download_progress` off; any remaining fields keep their defaults.
let embedding = EmbeddingConfig {
    model: EmbeddingModelType::Preset {
        name: String::from("balanced"),
    },
    normalize: true,
    batch_size: 32,
    show_download_progress: false,
    ..Default::default()
};

// Chunking options carrying the embedding options above.
let chunking = ChunkingConfig {
    max_characters: 1024,
    overlap: 100,
    embedding: Some(embedding),
    ..Default::default()
};

// Top-level extraction configuration with chunking switched on.
let config = ExtractionConfig {
    chunking: Some(chunking),
    ..Default::default()
};
```

View File

@@ -0,0 +1,23 @@
```rust title="Rust"
use kreuzberg::{extract_file, ExtractionConfig, KeywordAlgorithm, KeywordConfig};
/// Extracts `research_paper.pdf` with keyword extraction configured and prints
/// each returned keyword's text together with its score (three decimals).
#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
    // Keyword options: YAKE algorithm, `max_keywords` of 10, `min_score` of 0.3.
    let keyword_options = KeywordConfig {
        algorithm: KeywordAlgorithm::Yake,
        max_keywords: 10,
        min_score: 0.3,
        ..Default::default()
    };

    let config = ExtractionConfig {
        keywords: Some(keyword_options),
        ..Default::default()
    };

    let result = extract_file("research_paper.pdf", None, &config).await?;

    // Nothing is printed when the result carries no keyword list.
    if let Some(keywords) = result.extracted_keywords {
        for keyword in keywords {
            println!("{}: {:.3}", keyword.text, keyword.score);
        }
    }

    Ok(())
}
```

View File

@@ -0,0 +1,22 @@
```rust title="Rust"
use kreuzberg::{extract_file, ExtractionConfig};
/// Extracts `scanned_document.pdf` with quality processing enabled and reports
/// the quality score attached to the result.
///
/// Three outcomes are distinguished: a score below 0.5 prints a low-quality
/// warning, any other score is printed as-is, and an absent score is reported
/// as unavailable.
#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
    let config = ExtractionConfig {
        enable_quality_processing: true,
        ..Default::default()
    };

    let result = extract_file("scanned_document.pdf", None, &config).await?;

    // Match on the optional score directly: defaulting a missing score to 0.0
    // would print a "low quality" warning for a document that was never scored.
    match result.quality_score {
        Some(quality_score) if quality_score < 0.5 => {
            println!("Warning: Low quality extraction ({quality_score:.2})");
            println!("Consider re-scanning with higher DPI or adjusting OCR settings");
        }
        Some(quality_score) => println!("Quality score: {quality_score:.2}"),
        None => println!("Quality score unavailable"),
    }

    Ok(())
}
```

View File

@@ -0,0 +1,15 @@
```rust title="Rust"
use kreuzberg::{EmbeddingConfig, EmbeddingModelType, embed_texts};
// Select the embedding model through the "balanced" preset.
let model = EmbeddingModelType::Preset {
    name: String::from("balanced"),
};
let config = EmbeddingConfig {
    model,
    normalize: true,
    ..Default::default()
};

// Embed two sample strings directly, without running a document extraction.
let inputs = vec!["Hello, world!", "Kreuzberg is fast"];
let vectors = embed_texts(&inputs, &config)?;

// One vector per input is expected, and 768 entries in the first vector.
assert_eq!(vectors.len(), 2);
assert_eq!(vectors[0].len(), 768);
```

View File

@@ -0,0 +1,18 @@
```rust title="Rust"
use kreuzberg::{extract_file, ExtractionConfig, TokenReductionOptions};
/// Extracts `document.pdf` with token reduction configured and prints the
/// length of the resulting content.
#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
    // Token-reduction options: "moderate" mode with `preserve_important_words` on.
    let reduction = TokenReductionOptions {
        mode: String::from("moderate"),
        preserve_important_words: true,
    };

    let config = ExtractionConfig {
        token_reduction: Some(reduction),
        ..Default::default()
    };

    let result = extract_file("document.pdf", None, &config).await?;

    let content_length = result.content.len();
    println!("Content length: {content_length}");

    Ok(())
}
```

View File

@@ -0,0 +1,39 @@
```rust title="Rust"
use kreuzberg::{extract_file, ExtractionConfig, TokenReductionOptions};
/// Extracts `verbose_document.pdf` with token reduction configured and prints
/// the token statistics found in the result's additional metadata.
///
/// Each statistic falls back to zero when its key is missing or does not hold
/// a value of the expected numeric kind.
#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
    let reduction = TokenReductionOptions {
        mode: String::from("moderate"),
        preserve_important_words: true,
    };

    let config = ExtractionConfig {
        token_reduction: Some(reduction),
        ..Default::default()
    };

    let result = extract_file("verbose_document.pdf", None, &config).await?;

    // All three statistics are read from the same metadata map; borrow it once.
    let extras = &result.metadata.additional;

    // Looks up an unsigned integer entry, defaulting to 0.
    let count_of = |key: &str| {
        extras
            .get(key)
            .and_then(|value| value.as_u64())
            .unwrap_or(0)
    };

    let original = count_of("original_token_count");
    let reduced = count_of("token_count");
    let ratio = extras
        .get("token_reduction_ratio")
        .and_then(|value| value.as_f64())
        .unwrap_or(0.0);

    println!("Reduced from {original} to {reduced} tokens");
    println!("Reduction: {:.1}%", ratio * 100.0);

    Ok(())
}
```

View File

@@ -0,0 +1,27 @@
```rust title="Rust"
use kreuzberg::{extract_file, ExtractionConfig, ChunkingConfig, EmbeddingConfig};
let config = ExtractionConfig {
chunking: Some(ChunkingConfig {
max_characters: 512,
overlap: 50,
embedding: Some(EmbeddingConfig {
model: kreuzberg::EmbeddingModelType::Preset { name: "balanced".to_string() },
normalize: true,
..Default::default()
}),
..Default::default()
}),
..Default::default()
};
let result = extract_file("document.pdf", None, &config).await?;
if let Some(chunks) = result.chunks {
for (i, chunk) in chunks.iter().enumerate() {
if let Some(embedding) = &chunk.embedding {
println!("Chunk {}: {} dimensions", i, embedding.len());
}
}
}
```