Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,18 @@
```rust title="Rust"
use kreuzberg::{ExtractionConfig, ImageExtractionConfig};
/// Builds an `ExtractionConfig` with image extraction options set and prints
/// the resulting image settings.
fn main() {
    // Image-specific options; every field not listed keeps its default value.
    let image_options = ImageExtractionConfig {
        extract_images: Some(true),
        target_dpi: Some(200),
        max_image_dimension: Some(2048),
        // Set to `false` to extract images without markdown references.
        inject_placeholders: Some(true),
        auto_adjust_dpi: Some(true),
        ..Default::default()
    };

    let config = ExtractionConfig {
        images: Some(image_options),
        ..Default::default()
    };

    println!("{:?}", config.images);
}
```

View File

@@ -0,0 +1,25 @@
```rust title="Rust"
use kreuzberg::{ExtractionConfig, ImagePreprocessingConfig, OcrConfig, TesseractConfig};
/// Builds an `ExtractionConfig` whose Tesseract OCR settings carry explicit
/// image-preprocessing options, then prints the OCR section.
fn main() {
    // The configs are assembled inside-out: preprocessing -> Tesseract -> OCR.
    let preprocessing = ImagePreprocessingConfig {
        target_dpi: 300,
        denoise: true,
        deskew: true,
        contrast_enhance: true,
        binarization_method: "otsu".to_string(),
        ..Default::default()
    };

    let tesseract = TesseractConfig {
        preprocessing: Some(preprocessing),
        ..Default::default()
    };

    let ocr = OcrConfig {
        tesseract_config: Some(tesseract),
        ..Default::default()
    };

    let config = ExtractionConfig {
        ocr: Some(ocr),
        ..Default::default()
    };

    println!("{:?}", config.ocr);
}
```

View File

@@ -0,0 +1,15 @@
```rust title="Rust"
use kreuzberg::{ExtractionConfig, LanguageDetectionConfig};
/// Builds an `ExtractionConfig` with language detection switched on and
/// prints the language-detection section.
fn main() {
    // All three fields are given explicitly, so no struct-update default is needed here.
    let detection = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.9,
        detect_multiple: true,
    };

    let config = ExtractionConfig {
        language_detection: Some(detection),
        ..Default::default()
    };

    println!("{:?}", config.language_detection);
}
```

View File

@@ -0,0 +1,16 @@
```rust title="Rust"
use kreuzberg::{extract_file, ExtractionConfig, LanguageDetectionConfig};
// Language-detection settings used for this extraction.
let detection = LanguageDetectionConfig {
    enabled: true,
    min_confidence: 0.8,
    detect_multiple: true,
};
let config = ExtractionConfig {
    language_detection: Some(detection),
    ..Default::default()
};

// Fragment: `.await?` needs an enclosing async function that returns a compatible `Result`.
let extraction = extract_file("multilingual_document.pdf", None, &config).await?;
println!("Detected languages: {:?}", extraction.detected_languages);
```

View File

@@ -0,0 +1,82 @@
```rust title="Rust"
use kreuzberg::{extract_file_sync, ExtractionConfig};
/// Extracts a PDF and an HTML page with default settings and prints the
/// format-specific metadata found on each result.
///
/// # Errors
/// Propagates any error returned by `extract_file_sync`.
fn main() -> kreuzberg::Result<()> {
    let defaults = ExtractionConfig::default();

    // PDF metadata: every field is optional and printed only when present.
    let pdf_result = extract_file_sync("document.pdf", None, &defaults)?;
    if let Some(pdf) = &pdf_result.metadata.pdf {
        if let Some(page_total) = &pdf.page_count {
            println!("Pages: {}", page_total);
        }
        if let Some(author) = &pdf.author {
            println!("Author: {}", author);
        }
        if let Some(title) = &pdf.title {
            println!("Title: {}", title);
        }
    }

    // HTML metadata: nothing further to print when the HTML section is absent.
    let html_result = extract_file_sync("page.html", None, &defaults)?;
    let Some(html) = &html_result.metadata.html else {
        return Ok(());
    };

    if let Some(title) = &html.title {
        println!("Title: {}", title);
    }
    if let Some(description) = &html.description {
        println!("Description: {}", description);
    }

    // Keywords are printed as a whole collection.
    println!("Keywords: {:?}", html.keywords);

    // Canonical URL (renamed from canonical).
    if let Some(canonical_url) = &html.canonical_url {
        println!("Canonical URL: {}", canonical_url);
    }

    // Open Graph fields are looked up by key in a map.
    if let Some(og_image) = html.open_graph.get("image") {
        println!("Open Graph Image: {}", og_image);
    }
    if let Some(og_title) = html.open_graph.get("title") {
        println!("Open Graph Title: {}", og_title);
    }

    // Twitter Card fields are looked up by key in a map.
    if let Some(card_type) = html.twitter_card.get("card") {
        println!("Twitter Card Type: {}", card_type);
    }

    // Newer fields.
    if let Some(language) = &html.language {
        println!("Language: {}", language);
    }

    // Iterating an empty collection prints nothing, so no emptiness guard is needed.
    for header in &html.headers {
        println!("Header (level {}): {}", header.level, header.text);
    }
    for link in &html.links {
        println!("Link: {} ({})", link.href, link.text);
    }
    for image in &html.images {
        println!("Image: {}", image.src);
    }

    // The structured-data count is reported only when there is at least one item.
    if !html.structured_data.is_empty() {
        println!("Structured data items: {}", html.structured_data.len());
    }

    Ok(())
}
```

View File

@@ -0,0 +1,26 @@
```rust title="Rust"
use kreuzberg::{extract_file_sync, ExtractionConfig};
/// Extracts a PDF with default settings and prints, for the first three page
/// boundaries, the page number, its byte range, and a short text preview.
///
/// # Errors
/// Propagates any error returned by `extract_file_sync`.
fn main() -> kreuzberg::Result<()> {
    let config = ExtractionConfig::default();
    let result = extract_file_sync("document.pdf", None, &config)?;

    // Nothing to print when page metadata or boundaries are absent.
    let Some(pages) = &result.metadata.pages else {
        return Ok(());
    };
    let Some(boundaries) = &pages.boundaries else {
        return Ok(());
    };

    for boundary in boundaries.iter().take(3) {
        let page_text = &result.content[boundary.byte_start..boundary.byte_end];
        println!("Page {}:", boundary.page_number);
        println!(" Byte range: {}-{}", boundary.byte_start, boundary.byte_end);
        println!(" Preview: {}...", preview_prefix(page_text, 100));
    }
    Ok(())
}

/// Returns the longest prefix of `text` that is at most `max_bytes` bytes long
/// and ends on a UTF-8 character boundary.
///
/// Slicing a `&str` at an arbitrary byte offset panics when the offset falls
/// inside a multi-byte character, so the cut point is moved backwards until it
/// is valid.
fn preview_prefix(text: &str, max_bytes: usize) -> &str {
    let mut end = max_bytes.min(text.len());
    // Offset 0 is always a char boundary, so this loop terminates.
    while !text.is_char_boundary(end) {
        end -= 1;
    }
    &text[..end]
}
```

View File

@@ -0,0 +1,20 @@
Use Kreuzberg::{extract_file_sync, ExtractionConfig, PageConfig};
Let config = ExtractionConfig {
pages: Some(PageConfig {
extract_pages: true,
..Default::default()
}),
..Default::default()
};
Let result = extract_file_sync("document.pdf", &config)?;
If let Some(pages) = result.pages {
for page in pages {
println!("Page {}:", page.page_number);
println!(" Content: {} chars", page.content.len());
println!(" Tables: {}", page.tables.len());
println!(" Images: {}", page.images.len());
}
}

View File

@@ -0,0 +1,79 @@
```rust title="Rust"
use kreuzberg::plugins::{Plugin, PostProcessor, ProcessingStage};
use kreuzberg::{Result, ExtractionResult, ExtractionConfig};
use async_trait::async_trait;
use std::sync::atomic::{AtomicUsize, Ordering};
/// Example post-processor plugin that keeps a running count of the documents
/// it has handled.
struct PdfMetadataExtractor {
    /// Counter incremented once per `process` call and reported at shutdown.
    processed_count: AtomicUsize,
}

impl PdfMetadataExtractor {
    /// Creates an extractor whose counter starts at zero.
    fn new() -> Self {
        let processed_count = AtomicUsize::new(0);
        PdfMetadataExtractor { processed_count }
    }
}
impl Plugin for PdfMetadataExtractor {
    /// Identifier of this plugin.
    fn name(&self) -> &str {
        "pdf-metadata-extractor"
    }

    /// Version string of this plugin.
    fn version(&self) -> String {
        String::from("1.0.0")
    }

    /// Short human-readable summary of what the plugin does.
    fn description(&self) -> &str {
        "Extracts and enriches PDF metadata"
    }

    /// Logs that the plugin is ready; there is no other setup work.
    fn initialize(&self) -> Result<()> {
        log::info!("PDF metadata extractor initialized");
        Ok(())
    }

    /// Logs how many documents were processed over the plugin's lifetime.
    fn shutdown(&self) -> Result<()> {
        log::info!(
            "Processed {} PDFs",
            self.processed_count.load(Ordering::Acquire)
        );
        Ok(())
    }
}
#[async_trait]
impl PostProcessor for PdfMetadataExtractor {
async fn process(
&self,
result: &mut ExtractionResult,
_config: &ExtractionConfig,
) -> Result<()> {
self.processed_count.fetch_add(1, Ordering::AcqRel);
result.processing_warnings.push(ProcessingWarning {
source: "pdf-metadata-extractor".to_string(),
message: "PDF metadata extracted successfully".to_string()
});
Ok(())
}
fn processing_stage(&self) -> ProcessingStage {
ProcessingStage::Early
}
fn should_process(
&self,
result: &ExtractionResult,
_config: &ExtractionConfig,
) -> bool {
result.mime_type == "application/pdf"
}
fn estimated_duration_ms(&self, _result: &ExtractionResult) -> u64 {
10
}
}
use kreuzberg::plugins::registry::get_post_processor_registry;
use std::sync::Arc;
/// Registers a fresh `PdfMetadataExtractor` with the post-processor registry,
/// passing `50` as the second argument to `register`.
///
/// # Errors
/// Propagates any error returned by the registry's `register` call.
fn register() -> Result<()> {
    let extractor = Arc::new(PdfMetadataExtractor::new());
    let post_processors = get_post_processor_registry();
    post_processors.register(extractor, 50)?;
    Ok(())
}
```

View File

@@ -0,0 +1,23 @@
```rust title="Rust"
impl PostProcessor for PdfOnlyProcessor {
async fn process(
&self,
result: &mut ExtractionResult,
_config: &ExtractionConfig
) -> Result<()> {
Ok(())
}
fn processing_stage(&self) -> ProcessingStage {
ProcessingStage::Middle
}
fn should_process(
&self,
result: &ExtractionResult,
_config: &ExtractionConfig
) -> bool {
result.mime_type == "application/pdf"
}
}
```

View File

@@ -0,0 +1,17 @@
```rust title="Rust"
use kreuzberg::{extract_file_sync, ExtractionConfig};
/// Extracts a PDF with default settings and prints every table found: its row
/// count, its markdown rendering, and each row of cells in debug form.
///
/// # Errors
/// Propagates any error returned by `extract_file_sync`.
fn main() -> kreuzberg::Result<()> {
    let config = ExtractionConfig::default();
    let extraction = extract_file_sync("document.pdf", None, &config)?;

    for table in extraction.tables.iter() {
        println!("Table with {} rows", table.cells.len());
        println!("{}", table.markdown);
        table.cells.iter().for_each(|row| println!("{:?}", row));
    }

    Ok(())
}
```

View File

@@ -0,0 +1,55 @@
```rust title="Rust"
use kreuzberg::{extract_file, ExtractionConfig, ChunkingConfig, EmbeddingConfig};
/// One chunk of a document, ready to be stored in a vector database.
#[derive(Debug, Clone, PartialEq)]
struct VectorRecord {
    /// Unique identifier of the record.
    id: String,
    /// Text content of the chunk.
    content: String,
    /// Embedding vector computed for `content`.
    embedding: Vec<f32>,
    /// Free-form string metadata attached to the record.
    metadata: std::collections::HashMap<String, String>,
}
/// Extracts `document_path` with chunking and embeddings enabled and turns
/// every chunk that carries an embedding into a `VectorRecord`.
///
/// Record ids have the form `<document_id>_chunk_<index>`, where `index` is
/// the chunk's position among all chunks, including those without an
/// embedding. Each record's metadata holds the document id, the chunk index,
/// and the content length.
///
/// # Errors
/// Propagates any error returned by `extract_file`.
async fn extract_and_vectorize(
    document_path: &str,
    document_id: &str,
) -> Result<Vec<VectorRecord>, Box<dyn std::error::Error>> {
    let embedding_settings = EmbeddingConfig {
        model: kreuzberg::EmbeddingModelType::Preset {
            name: "balanced".to_string(),
        },
        normalize: true,
        batch_size: 32,
        ..Default::default()
    };
    let chunking_settings = ChunkingConfig {
        max_characters: 512,
        overlap: 50,
        embedding: Some(embedding_settings),
        ..Default::default()
    };
    let config = ExtractionConfig {
        chunking: Some(chunking_settings),
        ..Default::default()
    };

    let extraction = extract_file(document_path, None, &config).await?;

    // `iter().flatten()` walks the chunks when present and yields nothing otherwise.
    let records: Vec<VectorRecord> = extraction
        .chunks
        .iter()
        .flatten()
        .enumerate()
        .filter_map(|(position, chunk)| {
            // Chunks without an embedding are skipped but still consume an index.
            let vector = chunk.embedding.as_ref()?;
            let metadata = std::collections::HashMap::from([
                ("document_id".to_string(), document_id.to_string()),
                ("chunk_index".to_string(), position.to_string()),
                ("content_length".to_string(), chunk.content.len().to_string()),
            ]);
            Some(VectorRecord {
                id: format!("{}_chunk_{}", document_id, position),
                content: chunk.content.clone(),
                embedding: vector.clone(),
                metadata,
            })
        })
        .collect();

    Ok(records)
}
```