Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,61 @@
```rust title="Rust"
use kreuzberg::plugins::{Plugin, OcrBackend, OcrBackendType};
use kreuzberg::{Result, ExtractionResult, OcrConfig, Metadata};
use async_trait::async_trait;
use std::path::Path;
/// Example custom OCR backend used to demonstrate the plugin traits.
struct CloudOcrBackend {
    /// API key for the cloud service; the stubbed request in this example never reads it.
    api_key: String,
    /// Language codes this backend reports as supported.
    supported_langs: Vec<String>,
}
/// Plugin identity and lifecycle hooks; this example needs no setup or teardown.
impl Plugin for CloudOcrBackend {
    /// Name under which this plugin identifies itself.
    fn name(&self) -> &str {
        "cloud-ocr"
    }

    /// Version string reported for this plugin.
    fn version(&self) -> String {
        String::from("1.0.0")
    }

    /// No-op initialization: always succeeds.
    fn initialize(&self) -> Result<()> {
        Ok(())
    }

    /// No-op shutdown: always succeeds.
    fn shutdown(&self) -> Result<()> {
        Ok(())
    }
}
/// OCR entry points for the example cloud backend.
#[async_trait]
impl OcrBackend for CloudOcrBackend {
    /// Runs OCR on `image_bytes` via `call_cloud_api`, passing `config.language`,
    /// and wraps the returned text in a `text/plain` `ExtractionResult` with
    /// default metadata and no tables, detected languages, chunks, or images.
    ///
    /// Any error from the cloud call is propagated to the caller.
    async fn process_image(
        &self,
        image_bytes: &[u8],
        config: &OcrConfig,
    ) -> Result<ExtractionResult> {
        let recognized = self.call_cloud_api(image_bytes, &config.language).await?;
        let extraction = ExtractionResult {
            content: recognized,
            mime_type: String::from("text/plain"),
            metadata: Metadata::default(),
            tables: Vec::new(),
            detected_languages: None,
            chunks: None,
            images: None,
        };
        Ok(extraction)
    }

    /// Returns `true` when `lang` exactly matches one of the configured language codes.
    fn supports_language(&self, lang: &str) -> bool {
        self.supported_langs
            .iter()
            .any(|known| known.as_str() == lang)
    }

    /// Reports this backend as a custom implementation.
    fn backend_type(&self) -> OcrBackendType {
        OcrBackendType::Custom
    }

    /// Returns an owned copy of the configured language codes.
    fn supported_languages(&self) -> Vec<String> {
        self.supported_langs.to_vec()
    }
}
impl CloudOcrBackend {
    /// Placeholder for the remote OCR request.
    ///
    /// The arguments are currently ignored and a fixed string is returned;
    /// replace the body with a real API call.
    async fn call_cloud_api(&self, image: &[u8], language: &str) -> Result<String> {
        let placeholder = String::from("Extracted text");
        Ok(placeholder)
    }
}
```

View File

@@ -0,0 +1,18 @@
```rust title="Rust"
use kreuzberg::{ExtractionConfig, OcrConfig, TesseractConfig};
/// Builds an `ExtractionConfig` that selects the `tesseract` backend with
/// language `"eng+fra"` and `psm` set to 3, then prints the OCR section.
fn main() {
    let config = ExtractionConfig {
        ocr: Some(OcrConfig {
            backend: "tesseract".to_string(),
            language: "eng+fra".to_string(),
            tesseract_config: Some(TesseractConfig {
                psm: 3,
                ..Default::default()
            }),
            ..Default::default()
        }),
        ..Default::default()
    };
    // Use the config so the example shows its result instead of leaving an
    // unused variable behind.
    println!("{:?}", config.ocr);
}
```

View File

@@ -0,0 +1,21 @@
```rust title="Rust"
use kreuzberg::{extract_file_sync, ExtractionConfig, ImageExtractionConfig};
/// Extracts `document.pdf` with image extraction configured and prints the
/// length of the extracted content.
fn main() -> kreuzberg::Result<()> {
    let image_settings = ImageExtractionConfig {
        extract_images: true,
        target_dpi: 200,
        max_image_dimension: 2048,
        // Set to false to extract images without markdown references.
        inject_placeholders: true,
        auto_adjust_dpi: true,
        ..Default::default()
    };
    let config = ExtractionConfig {
        images: Some(image_settings),
        ..Default::default()
    };

    let extraction = extract_file_sync("document.pdf", None, &config)?;
    println!("content length: {}", extraction.content.len());
    Ok(())
}
```

View File

@@ -0,0 +1,33 @@
```rust title="Rust"
use kreuzberg::{
extract_file_sync, ExtractionConfig, ImagePreprocessingConfig, OcrConfig, TesseractConfig,
};
/// Extracts `document.pdf` using Tesseract with image preprocessing enabled
/// and prints the length of the extracted content.
fn main() -> kreuzberg::Result<()> {
    // Preprocessing is nested inside the Tesseract-specific settings.
    let tesseract = TesseractConfig {
        preprocessing: Some(ImagePreprocessingConfig {
            target_dpi: 300,
            denoise: true,
            deskew: true,
            contrast_enhance: true,
            binarization_method: String::from("otsu"),
            ..Default::default()
        }),
        ..Default::default()
    };
    let ocr = OcrConfig {
        backend: String::from("tesseract"),
        language: String::from("eng"),
        tesseract_config: Some(tesseract),
        ..Default::default()
    };
    let config = ExtractionConfig {
        ocr: Some(ocr),
        ..Default::default()
    };

    let extraction = extract_file_sync("document.pdf", None, &config)?;
    println!("content length: {}", extraction.content.len());
    Ok(())
}
```

View File

@@ -0,0 +1,20 @@
```rust title="Rust"
use kreuzberg::{extract_file_sync, ExtractionConfig, OcrConfig, PdfConfig};
/// Extracts `scanned.pdf` with the `tesseract` backend and PDF `dpi` set to
/// 300, then prints the extracted content.
fn main() -> kreuzberg::Result<()> {
    let config = ExtractionConfig {
        ocr: Some(OcrConfig {
            backend: "tesseract".to_string(),
            ..Default::default()
        }),
        pdf_options: Some(PdfConfig {
            dpi: Some(300),
            ..Default::default()
        }),
        ..Default::default()
    };
    let result = extract_file_sync("scanned.pdf", None, &config)?;
    // Show the extraction output; previously the result was dropped unused.
    println!("{}", result.content);
    Ok(())
}
```

View File

@@ -0,0 +1,19 @@
```rust title="Rust"
use kreuzberg::{extract_file, ExtractionConfig, OcrConfig};
/// Extracts `document.pdf` asynchronously with the `easyocr` backend and
/// prints the extracted text.
#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
    let ocr = OcrConfig {
        backend: String::from("easyocr"),
        language: String::from("en"),
        ..Default::default()
    };
    let config = ExtractionConfig {
        ocr: Some(ocr),
        ..Default::default()
    };

    let extraction = extract_file("document.pdf", None, &config).await?;
    println!("Extracted text: {}", extraction.content);
    Ok(())
}
```

View File

@@ -0,0 +1,35 @@
```rust title="Rust"
use kreuzberg::{extract_file, ExtractionConfig, OcrConfig};
use kreuzberg::types::OcrElementConfig;
/// Extracts `scanned.pdf` asynchronously with the `paddleocr` backend, with
/// `include_elements` enabled, and prints each returned OCR element.
#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
    let element_settings = OcrElementConfig {
        include_elements: true,
        ..Default::default()
    };
    let ocr = OcrConfig {
        backend: String::from("paddleocr"),
        language: String::from("en"),
        element_config: Some(element_settings),
        ..Default::default()
    };
    let config = ExtractionConfig {
        ocr: Some(ocr),
        ..Default::default()
    };

    let extraction = extract_file("scanned.pdf", None, &config).await?;

    // `ocr_elements` is optional; flattening yields nothing when it is `None`.
    for item in extraction.ocr_elements.iter().flatten() {
        println!("Text: {}", item.text);
        println!("Confidence: {:.2}", item.confidence.recognition);
        println!("Geometry: {:?}", item.geometry);
        // Rotation is only printed for elements that carry it.
        if let Some(rotation) = &item.rotation {
            println!("Rotation: {}°", rotation.angle_degrees);
        }
        println!();
    }
    Ok(())
}
```

View File

@@ -0,0 +1,18 @@
```rust title="Rust"
use kreuzberg::{extract_file_sync, ExtractionConfig, OcrConfig};
/// Extracts `scanned.pdf` with the `tesseract` backend and language `"eng"`,
/// then prints the extracted content.
fn main() -> kreuzberg::Result<()> {
    let ocr = OcrConfig {
        backend: "tesseract".to_owned(),
        language: "eng".to_owned(),
        ..Default::default()
    };
    let config = ExtractionConfig {
        ocr: Some(ocr),
        ..Default::default()
    };

    let extraction = extract_file_sync("scanned.pdf", None, &config)?;
    println!("{}", extraction.content);
    Ok(())
}
```

View File

@@ -0,0 +1,18 @@
```rust title="Rust"
use kreuzberg::{extract_file_sync, ExtractionConfig, OcrConfig};
/// Extracts `document.pdf` with the `tesseract` backend and `force_ocr` set,
/// then prints the extracted content.
fn main() -> kreuzberg::Result<()> {
    let ocr = OcrConfig {
        backend: String::from("tesseract"),
        ..Default::default()
    };
    let config = ExtractionConfig {
        ocr: Some(ocr),
        force_ocr: true,
        ..Default::default()
    };

    let extraction = extract_file_sync("document.pdf", None, &config)?;
    println!("{}", extraction.content);
    Ok(())
}
```

View File

@@ -0,0 +1,18 @@
```rust title="Rust"
use kreuzberg::{extract_file_sync, ExtractionConfig, OcrConfig};
/// Extracts `multilingual.pdf` with the `tesseract` backend and the combined
/// language string `"eng+deu+fra"`, then prints the extracted content.
fn main() -> kreuzberg::Result<()> {
    let ocr = OcrConfig {
        backend: String::from("tesseract"),
        language: String::from("eng+deu+fra"),
        ..Default::default()
    };
    let config = ExtractionConfig {
        ocr: Some(ocr),
        ..Default::default()
    };

    let extraction = extract_file_sync("multilingual.pdf", None, &config)?;
    println!("{}", extraction.content);
    Ok(())
}
```

View File

@@ -0,0 +1,20 @@
```rust title="Rust"
use kreuzberg::{extract_file, ExtractionConfig, OcrConfig};
/// Extracts `document.pdf` asynchronously with the `paddleocr` backend and
/// prints the extracted text.
#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
    let ocr = OcrConfig {
        backend: String::from("paddleocr"),
        language: String::from("en"),
        // For max accuracy, also set:
        // paddle_ocr_config: Some(serde_json::json!({"model_tier": "server"})),
        ..Default::default()
    };
    let config = ExtractionConfig {
        ocr: Some(ocr),
        ..Default::default()
    };

    let extraction = extract_file("document.pdf", None, &config).await?;
    println!("Extracted text: {}", extraction.content);
    Ok(())
}
```

View File

@@ -0,0 +1,15 @@
```rust title="Rust"
use kreuzberg::{ExtractionConfig, PdfConfig};
/// Builds an `ExtractionConfig` whose PDF options enable image and metadata
/// extraction and supply candidate passwords, then prints those options.
fn main() {
    let config = ExtractionConfig {
        pdf_options: Some(PdfConfig {
            extract_images: Some(true),
            extract_metadata: Some(true),
            passwords: Some(vec!["password1".to_string(), "password2".to_string()]),
            // `PdfConfig` has fields beyond these three (e.g. `dpi`), so the
            // remaining ones must come from `Default` for the literal to compile.
            ..Default::default()
        }),
        ..Default::default()
    };
    println!("{:?}", config.pdf_options);
}
```

View File

@@ -0,0 +1,22 @@
```rust title="Rust"
use kreuzberg::{ExtractionConfig, OcrConfig, TesseractConfig};
/// Builds an `ExtractionConfig` with detailed Tesseract settings and prints
/// the resulting OCR section.
fn main() {
    // Characters Tesseract is restricted to: letters, digits, space, and basic punctuation.
    let whitelist =
        String::from("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789 .,!?");
    let tesseract = TesseractConfig {
        psm: 6,
        oem: 1,
        min_confidence: 0.8,
        tessedit_char_whitelist: whitelist,
        enable_table_detection: true,
        ..Default::default()
    };
    let ocr = OcrConfig {
        language: String::from("eng+fra+deu"),
        tesseract_config: Some(tesseract),
        ..Default::default()
    };
    let config = ExtractionConfig {
        ocr: Some(ocr),
        ..Default::default()
    };
    println!("{:?}", config.ocr);
}
```