This commit is contained in:
61
docs/snippets/rust/ocr/cloud_ocr_backend.md
Normal file
61
docs/snippets/rust/ocr/cloud_ocr_backend.md
Normal file
@@ -0,0 +1,61 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::plugins::{Plugin, OcrBackend, OcrBackendType};
|
||||
use kreuzberg::{Result, ExtractionResult, OcrConfig, Metadata};
|
||||
use async_trait::async_trait;
|
||||
use std::path::Path;
|
||||
|
||||
/// Example custom OCR backend plugin.
struct CloudOcrBackend {
    /// API key for the backend; the stubbed request in this snippet never reads it.
    api_key: String,
    /// Language codes reported by `supported_languages` and checked by `supports_language`.
    supported_langs: Vec<String>,
}
|
||||
|
||||
/// Plugin identity and lifecycle hooks for the example backend.
impl Plugin for CloudOcrBackend {
    /// Name this plugin reports.
    fn name(&self) -> &str {
        "cloud-ocr"
    }

    /// Version string this plugin reports.
    fn version(&self) -> String {
        String::from("1.0.0")
    }

    /// No setup is performed; always returns `Ok(())`.
    fn initialize(&self) -> Result<()> {
        Ok(())
    }

    /// No teardown is performed; always returns `Ok(())`.
    fn shutdown(&self) -> Result<()> {
        Ok(())
    }
}
|
||||
|
||||
#[async_trait]
|
||||
impl OcrBackend for CloudOcrBackend {
|
||||
async fn process_image(
|
||||
&self,
|
||||
image_bytes: &[u8],
|
||||
config: &OcrConfig,
|
||||
) -> Result<ExtractionResult> {
|
||||
let text = self.call_cloud_api(image_bytes, &config.language).await?;
|
||||
|
||||
Ok(ExtractionResult {
|
||||
content: text,
|
||||
mime_type: "text/plain".to_string(),
|
||||
metadata: Metadata::default(),
|
||||
tables: vec![],
|
||||
detected_languages: None,
|
||||
chunks: None,
|
||||
images: None,
|
||||
})
|
||||
}
|
||||
|
||||
fn supports_language(&self, lang: &str) -> bool {
|
||||
self.supported_langs.iter().any(|l| l == lang)
|
||||
}
|
||||
|
||||
fn backend_type(&self) -> OcrBackendType {
|
||||
OcrBackendType::Custom
|
||||
}
|
||||
|
||||
fn supported_languages(&self) -> Vec<String> {
|
||||
self.supported_langs.clone()
|
||||
}
|
||||
}
|
||||
|
||||
impl CloudOcrBackend {
|
||||
async fn call_cloud_api(
|
||||
&self,
|
||||
image: &[u8],
|
||||
language: &str
|
||||
) -> Result<String> {
|
||||
Ok("Extracted text".to_string())
|
||||
}
|
||||
}
|
||||
```
|
||||
18
docs/snippets/rust/ocr/config_ocr.md
Normal file
18
docs/snippets/rust/ocr/config_ocr.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{ExtractionConfig, OcrConfig, TesseractConfig};
|
||||
|
||||
fn main() {
|
||||
let config = ExtractionConfig {
|
||||
ocr: Some(OcrConfig {
|
||||
backend: "tesseract".to_string(),
|
||||
language: "eng+fra".to_string(),
|
||||
tesseract_config: Some(TesseractConfig {
|
||||
psm: 3,
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
}
|
||||
```
|
||||
21
docs/snippets/rust/ocr/image_extraction.md
Normal file
21
docs/snippets/rust/ocr/image_extraction.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file_sync, ExtractionConfig, ImageExtractionConfig};
|
||||
|
||||
fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig {
|
||||
images: Some(ImageExtractionConfig {
|
||||
extract_images: true,
|
||||
target_dpi: 200,
|
||||
max_image_dimension: 2048,
|
||||
inject_placeholders: true, // set to false to extract images without markdown references
|
||||
auto_adjust_dpi: true,
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file_sync("document.pdf", None, &config)?;
|
||||
println!("content length: {}", result.content.len());
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
33
docs/snippets/rust/ocr/image_preprocessing.md
Normal file
33
docs/snippets/rust/ocr/image_preprocessing.md
Normal file
@@ -0,0 +1,33 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{
|
||||
extract_file_sync, ExtractionConfig, ImagePreprocessingConfig, OcrConfig, TesseractConfig,
|
||||
};
|
||||
|
||||
fn main() -> kreuzberg::Result<()> {
|
||||
let preprocessing = ImagePreprocessingConfig {
|
||||
target_dpi: 300,
|
||||
denoise: true,
|
||||
deskew: true,
|
||||
contrast_enhance: true,
|
||||
binarization_method: "otsu".to_string(),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let config = ExtractionConfig {
|
||||
ocr: Some(OcrConfig {
|
||||
backend: "tesseract".to_string(),
|
||||
language: "eng".to_string(),
|
||||
tesseract_config: Some(TesseractConfig {
|
||||
preprocessing: Some(preprocessing),
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file_sync("document.pdf", None, &config)?;
|
||||
println!("content length: {}", result.content.len());
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
20
docs/snippets/rust/ocr/ocr_dpi_config.md
Normal file
20
docs/snippets/rust/ocr/ocr_dpi_config.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file_sync, ExtractionConfig, OcrConfig, PdfConfig};
|
||||
|
||||
fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig {
|
||||
ocr: Some(OcrConfig {
|
||||
backend: "tesseract".to_string(),
|
||||
..Default::default()
|
||||
}),
|
||||
pdf_options: Some(PdfConfig {
|
||||
dpi: Some(300),
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file_sync("scanned.pdf", None, &config)?;
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
19
docs/snippets/rust/ocr/ocr_easyocr.md
Normal file
19
docs/snippets/rust/ocr/ocr_easyocr.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file, ExtractionConfig, OcrConfig};
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig {
|
||||
ocr: Some(OcrConfig {
|
||||
backend: "easyocr".to_string(),
|
||||
language: "en".to_string(),
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file("document.pdf", None, &config).await?;
|
||||
println!("Extracted text: {}", result.content);
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
35
docs/snippets/rust/ocr/ocr_elements.md
Normal file
35
docs/snippets/rust/ocr/ocr_elements.md
Normal file
@@ -0,0 +1,35 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file, ExtractionConfig, OcrConfig};
|
||||
use kreuzberg::types::OcrElementConfig;
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig {
|
||||
ocr: Some(OcrConfig {
|
||||
backend: "paddleocr".to_string(),
|
||||
language: "en".to_string(),
|
||||
element_config: Some(OcrElementConfig {
|
||||
include_elements: true,
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file("scanned.pdf", None, &config).await?;
|
||||
|
||||
if let Some(elements) = &result.ocr_elements {
|
||||
for element in elements {
|
||||
println!("Text: {}", element.text);
|
||||
println!("Confidence: {:.2}", element.confidence.recognition);
|
||||
println!("Geometry: {:?}", element.geometry);
|
||||
if let Some(rotation) = &element.rotation {
|
||||
println!("Rotation: {}°", rotation.angle_degrees);
|
||||
}
|
||||
println!();
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
18
docs/snippets/rust/ocr/ocr_extraction.md
Normal file
18
docs/snippets/rust/ocr/ocr_extraction.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file_sync, ExtractionConfig, OcrConfig};
|
||||
|
||||
fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig {
|
||||
ocr: Some(OcrConfig {
|
||||
backend: "tesseract".to_string(),
|
||||
language: "eng".to_string(),
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file_sync("scanned.pdf", None, &config)?;
|
||||
println!("{}", result.content);
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
18
docs/snippets/rust/ocr/ocr_force_all_pages.md
Normal file
18
docs/snippets/rust/ocr/ocr_force_all_pages.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file_sync, ExtractionConfig, OcrConfig};
|
||||
|
||||
fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig {
|
||||
ocr: Some(OcrConfig {
|
||||
backend: "tesseract".to_string(),
|
||||
..Default::default()
|
||||
}),
|
||||
force_ocr: true,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file_sync("document.pdf", None, &config)?;
|
||||
println!("{}", result.content);
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
18
docs/snippets/rust/ocr/ocr_multi_language.md
Normal file
18
docs/snippets/rust/ocr/ocr_multi_language.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file_sync, ExtractionConfig, OcrConfig};
|
||||
|
||||
fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig {
|
||||
ocr: Some(OcrConfig {
|
||||
backend: "tesseract".to_string(),
|
||||
language: "eng+deu+fra".to_string(),
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file_sync("multilingual.pdf", None, &config)?;
|
||||
println!("{}", result.content);
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
20
docs/snippets/rust/ocr/ocr_paddleocr.md
Normal file
20
docs/snippets/rust/ocr/ocr_paddleocr.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file, ExtractionConfig, OcrConfig};
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig {
|
||||
ocr: Some(OcrConfig {
|
||||
backend: "paddleocr".to_string(),
|
||||
language: "en".to_string(),
|
||||
// paddle_ocr_config: Some(serde_json::json!({"model_tier": "server"})), // for max accuracy
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file("document.pdf", None, &config).await?;
|
||||
println!("Extracted text: {}", result.content);
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
15
docs/snippets/rust/ocr/pdf_config.md
Normal file
15
docs/snippets/rust/ocr/pdf_config.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{ExtractionConfig, PdfConfig};
|
||||
|
||||
fn main() {
|
||||
let config = ExtractionConfig {
|
||||
pdf_options: Some(PdfConfig {
|
||||
extract_images: Some(true),
|
||||
extract_metadata: Some(true),
|
||||
passwords: Some(vec!["password1".to_string(), "password2".to_string()]),
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
println!("{:?}", config.pdf_options);
|
||||
}
|
||||
```
|
||||
22
docs/snippets/rust/ocr/tesseract_config.md
Normal file
22
docs/snippets/rust/ocr/tesseract_config.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{ExtractionConfig, OcrConfig, TesseractConfig};
|
||||
|
||||
fn main() {
|
||||
let config = ExtractionConfig {
|
||||
ocr: Some(OcrConfig {
|
||||
language: "eng+fra+deu".to_string(),
|
||||
tesseract_config: Some(TesseractConfig {
|
||||
psm: 6,
|
||||
oem: 1,
|
||||
min_confidence: 0.8,
|
||||
tessedit_char_whitelist: "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789 .,!?".to_string(),
|
||||
enable_table_detection: true,
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
println!("{:?}", config.ocr);
|
||||
}
|
||||
```
|
||||
Reference in New Issue
Block a user