This commit is contained in:
18
docs/snippets/rust/metadata/image_extraction.md
Normal file
18
docs/snippets/rust/metadata/image_extraction.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{ExtractionConfig, ImageExtractionConfig};
|
||||
|
||||
fn main() {
|
||||
let config = ExtractionConfig {
|
||||
images: Some(ImageExtractionConfig {
|
||||
extract_images: Some(true),
|
||||
target_dpi: Some(200),
|
||||
max_image_dimension: Some(2048),
|
||||
inject_placeholders: Some(true), // set to false to extract images without markdown references
|
||||
auto_adjust_dpi: Some(true),
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
println!("{:?}", config.images);
|
||||
}
|
||||
```
|
||||
25
docs/snippets/rust/metadata/image_preprocessing.md
Normal file
25
docs/snippets/rust/metadata/image_preprocessing.md
Normal file
@@ -0,0 +1,25 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{ExtractionConfig, ImagePreprocessingConfig, OcrConfig, TesseractConfig};
|
||||
|
||||
fn main() {
|
||||
let config = ExtractionConfig {
|
||||
ocr: Some(OcrConfig {
|
||||
tesseract_config: Some(TesseractConfig {
|
||||
preprocessing: Some(ImagePreprocessingConfig {
|
||||
target_dpi: 300,
|
||||
denoise: true,
|
||||
deskew: true,
|
||||
contrast_enhance: true,
|
||||
binarization_method: "otsu".to_string(),
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
println!("{:?}", config.ocr);
|
||||
}
|
||||
```
|
||||
15
docs/snippets/rust/metadata/language_detection.md
Normal file
15
docs/snippets/rust/metadata/language_detection.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{ExtractionConfig, LanguageDetectionConfig};
|
||||
|
||||
fn main() {
|
||||
let config = ExtractionConfig {
|
||||
language_detection: Some(LanguageDetectionConfig {
|
||||
enabled: true,
|
||||
min_confidence: 0.9,
|
||||
detect_multiple: true,
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
println!("{:?}", config.language_detection);
|
||||
}
|
||||
```
|
||||
@@ -0,0 +1,16 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file, ExtractionConfig, LanguageDetectionConfig};
|
||||
|
||||
// Fragment: meant to run inside an async function that returns a `Result`
// (it uses `.await?`).

// Language-detection options, built separately from the outer config.
let detection = LanguageDetectionConfig {
    enabled: true,
    min_confidence: 0.8,
    detect_multiple: true,
};

let config = ExtractionConfig {
    language_detection: Some(detection),
    ..Default::default()
};

// Run the extraction and show whatever languages were reported.
let result = extract_file("multilingual_document.pdf", None, &config).await?;
println!("Detected languages: {:?}", result.detected_languages);
|
||||
```
|
||||
82
docs/snippets/rust/metadata/metadata.md
Normal file
82
docs/snippets/rust/metadata/metadata.md
Normal file
@@ -0,0 +1,82 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file_sync, ExtractionConfig};
|
||||
|
||||
fn main() -> kreuzberg::Result<()> {
|
||||
let result = extract_file_sync("document.pdf", None, &ExtractionConfig::default())?;
|
||||
|
||||
if let Some(pdf_meta) = result.metadata.pdf {
|
||||
if let Some(pages) = pdf_meta.page_count {
|
||||
println!("Pages: {}", pages);
|
||||
}
|
||||
if let Some(author) = pdf_meta.author {
|
||||
println!("Author: {}", author);
|
||||
}
|
||||
if let Some(title) = pdf_meta.title {
|
||||
println!("Title: {}", title);
|
||||
}
|
||||
}
|
||||
|
||||
let html_result = extract_file_sync("page.html", None, &ExtractionConfig::default())?;
|
||||
if let Some(html_meta) = html_result.metadata.html {
|
||||
if let Some(title) = html_meta.title {
|
||||
println!("Title: {}", title);
|
||||
}
|
||||
if let Some(desc) = html_meta.description {
|
||||
println!("Description: {}", desc);
|
||||
}
|
||||
|
||||
// Access keywords array
|
||||
println!("Keywords: {:?}", html_meta.keywords);
|
||||
|
||||
// Access canonical URL (renamed from canonical)
|
||||
if let Some(canonical) = html_meta.canonical_url {
|
||||
println!("Canonical URL: {}", canonical);
|
||||
}
|
||||
|
||||
// Access Open Graph fields as a map
|
||||
if let Some(og_image) = html_meta.open_graph.get("image") {
|
||||
println!("Open Graph Image: {}", og_image);
|
||||
}
|
||||
if let Some(og_title) = html_meta.open_graph.get("title") {
|
||||
println!("Open Graph Title: {}", og_title);
|
||||
}
|
||||
|
||||
// Access Twitter Card fields as a map
|
||||
if let Some(twitter_card) = html_meta.twitter_card.get("card") {
|
||||
println!("Twitter Card Type: {}", twitter_card);
|
||||
}
|
||||
|
||||
// Access new fields
|
||||
if let Some(lang) = html_meta.language {
|
||||
println!("Language: {}", lang);
|
||||
}
|
||||
|
||||
// Access headers
|
||||
if !html_meta.headers.is_empty() {
|
||||
for header in &html_meta.headers {
|
||||
println!("Header (level {}): {}", header.level, header.text);
|
||||
}
|
||||
}
|
||||
|
||||
// Access links
|
||||
if !html_meta.links.is_empty() {
|
||||
for link in &html_meta.links {
|
||||
println!("Link: {} ({})", link.href, link.text);
|
||||
}
|
||||
}
|
||||
|
||||
// Access images
|
||||
if !html_meta.images.is_empty() {
|
||||
for image in &html_meta.images {
|
||||
println!("Image: {}", image.src);
|
||||
}
|
||||
}
|
||||
|
||||
// Access structured data
|
||||
if !html_meta.structured_data.is_empty() {
|
||||
println!("Structured data items: {}", html_meta.structured_data.len());
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
26
docs/snippets/rust/metadata/page_boundaries.md
Normal file
26
docs/snippets/rust/metadata/page_boundaries.md
Normal file
@@ -0,0 +1,26 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file_sync, ExtractionConfig};
|
||||
|
||||
fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig::default();
|
||||
let result = extract_file_sync("document.pdf", None, &config)?;
|
||||
|
||||
let Some(pages) = &result.metadata.pages else {
|
||||
return Ok(());
|
||||
};
|
||||
let Some(boundaries) = &pages.boundaries else {
|
||||
return Ok(());
|
||||
};
|
||||
|
||||
for boundary in boundaries.iter().take(3) {
|
||||
let page_text = &result.content[boundary.byte_start..boundary.byte_end];
|
||||
let preview_end = 100.min(page_text.len());
|
||||
|
||||
println!("Page {}:", boundary.page_number);
|
||||
println!(" Byte range: {}-{}", boundary.byte_start, boundary.byte_end);
|
||||
println!(" Preview: {}...", &page_text[..preview_end]);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
20
docs/snippets/rust/metadata/page_tracking_basic.md
Normal file
20
docs/snippets/rust/metadata/page_tracking_basic.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```rust title="Rust"
use kreuzberg::{extract_file_sync, ExtractionConfig, PageConfig};

// Turn on per-page extraction; every other option keeps its default.
let config = ExtractionConfig {
    pages: Some(PageConfig {
        extract_pages: true,
        ..Default::default()
    }),
    ..Default::default()
};

// The second argument is left as `None`, as in the other
// `extract_file_sync` examples.
let result = extract_file_sync("document.pdf", None, &config)?;

// Pages are only present when page extraction produced them.
if let Some(pages) = result.pages {
    for page in pages {
        println!("Page {}:", page.page_number);
        println!("  Content: {} chars", page.content.len());
        println!("  Tables: {}", page.tables.len());
        println!("  Images: {}", page.images.len());
    }
}
```
|
||||
79
docs/snippets/rust/metadata/pdf_metadata_extractor.md
Normal file
79
docs/snippets/rust/metadata/pdf_metadata_extractor.md
Normal file
@@ -0,0 +1,79 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::plugins::{Plugin, PostProcessor, ProcessingStage};
|
||||
use kreuzberg::{Result, ExtractionResult, ExtractionConfig};
|
||||
use async_trait::async_trait;
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
|
||||
/// Example post-processor state: a counter of processed results.
///
/// The counter is atomic so it can be updated through `&self`.
#[derive(Debug, Default)]
struct PdfMetadataExtractor {
    /// Number of results counted so far; starts at zero.
    processed_count: AtomicUsize,
}

impl PdfMetadataExtractor {
    /// Creates an extractor whose counter starts at zero.
    ///
    /// Equivalent to `PdfMetadataExtractor::default()`.
    fn new() -> Self {
        Self::default()
    }
}
|
||||
|
||||
impl Plugin for PdfMetadataExtractor {
|
||||
fn name(&self) -> &str { "pdf-metadata-extractor" }
|
||||
fn version(&self) -> String { "1.0.0".to_string() }
|
||||
fn description(&self) -> &str {
|
||||
"Extracts and enriches PDF metadata"
|
||||
}
|
||||
fn initialize(&self) -> Result<()> {
|
||||
log::info!("PDF metadata extractor initialized");
|
||||
Ok(())
|
||||
}
|
||||
fn shutdown(&self) -> Result<()> {
|
||||
let count = self.processed_count.load(Ordering::Acquire);
|
||||
log::info!("Processed {} PDFs", count);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl PostProcessor for PdfMetadataExtractor {
|
||||
async fn process(
|
||||
&self,
|
||||
result: &mut ExtractionResult,
|
||||
_config: &ExtractionConfig,
|
||||
) -> Result<()> {
|
||||
self.processed_count.fetch_add(1, Ordering::AcqRel);
|
||||
|
||||
result.processing_warnings.push(ProcessingWarning {
|
||||
source: "pdf-metadata-extractor".to_string(),
|
||||
message: "PDF metadata extracted successfully".to_string()
|
||||
});
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn processing_stage(&self) -> ProcessingStage {
|
||||
ProcessingStage::Early
|
||||
}
|
||||
|
||||
fn should_process(
|
||||
&self,
|
||||
result: &ExtractionResult,
|
||||
_config: &ExtractionConfig,
|
||||
) -> bool {
|
||||
result.mime_type == "application/pdf"
|
||||
}
|
||||
|
||||
fn estimated_duration_ms(&self, _result: &ExtractionResult) -> u64 {
|
||||
10
|
||||
}
|
||||
}
|
||||
|
||||
use kreuzberg::plugins::registry::get_post_processor_registry;
|
||||
use std::sync::Arc;
|
||||
|
||||
fn register() -> Result<()> {
|
||||
let processor = Arc::new(PdfMetadataExtractor::new());
|
||||
let registry = get_post_processor_registry();
|
||||
registry.register(processor, 50)?;
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
23
docs/snippets/rust/metadata/pdf_only_processor.md
Normal file
23
docs/snippets/rust/metadata/pdf_only_processor.md
Normal file
@@ -0,0 +1,23 @@
|
||||
```rust title="Rust"
|
||||
impl PostProcessor for PdfOnlyProcessor {
|
||||
async fn process(
|
||||
&self,
|
||||
result: &mut ExtractionResult,
|
||||
_config: &ExtractionConfig
|
||||
) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn processing_stage(&self) -> ProcessingStage {
|
||||
ProcessingStage::Middle
|
||||
}
|
||||
|
||||
fn should_process(
|
||||
&self,
|
||||
result: &ExtractionResult,
|
||||
_config: &ExtractionConfig
|
||||
) -> bool {
|
||||
result.mime_type == "application/pdf"
|
||||
}
|
||||
}
|
||||
```
|
||||
17
docs/snippets/rust/metadata/tables.md
Normal file
17
docs/snippets/rust/metadata/tables.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file_sync, ExtractionConfig};
|
||||
|
||||
fn main() -> kreuzberg::Result<()> {
|
||||
let result = extract_file_sync("document.pdf", None, &ExtractionConfig::default())?;
|
||||
|
||||
for table in &result.tables {
|
||||
println!("Table with {} rows", table.cells.len());
|
||||
println!("{}", table.markdown);
|
||||
|
||||
for row in &table.cells {
|
||||
println!("{:?}", row);
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
55
docs/snippets/rust/metadata/vector_database_integration.md
Normal file
55
docs/snippets/rust/metadata/vector_database_integration.md
Normal file
@@ -0,0 +1,55 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{extract_file, ExtractionConfig, ChunkingConfig, EmbeddingConfig};
|
||||
|
||||
/// One row destined for a vector store: a chunk of text, its embedding, and
/// string metadata describing where the chunk came from.
#[derive(Debug, Clone, PartialEq)]
struct VectorRecord {
    /// Record identifier.
    id: String,
    /// Text of the chunk.
    content: String,
    /// Embedding vector for `content`.
    embedding: Vec<f32>,
    /// Free-form string key/value metadata.
    metadata: std::collections::HashMap<String, String>,
}
|
||||
|
||||
async fn extract_and_vectorize(
|
||||
document_path: &str,
|
||||
document_id: &str,
|
||||
) -> Result<Vec<VectorRecord>, Box<dyn std::error::Error>> {
|
||||
let config = ExtractionConfig {
|
||||
chunking: Some(ChunkingConfig {
|
||||
max_characters: 512,
|
||||
overlap: 50,
|
||||
embedding: Some(EmbeddingConfig {
|
||||
model: kreuzberg::EmbeddingModelType::Preset {
|
||||
name: "balanced".to_string(),
|
||||
},
|
||||
normalize: true,
|
||||
batch_size: 32,
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file(document_path, None, &config).await?;
|
||||
|
||||
let mut records = Vec::new();
|
||||
if let Some(chunks) = result.chunks {
|
||||
for (index, chunk) in chunks.iter().enumerate() {
|
||||
if let Some(embedding) = &chunk.embedding {
|
||||
let mut metadata = std::collections::HashMap::new();
|
||||
metadata.insert("document_id".to_string(), document_id.to_string());
|
||||
metadata.insert("chunk_index".to_string(), index.to_string());
|
||||
metadata.insert("content_length".to_string(), chunk.content.len().to_string());
|
||||
|
||||
records.push(VectorRecord {
|
||||
id: format!("{}_chunk_{}", document_id, index),
|
||||
content: chunk.content.clone(),
|
||||
embedding: embedding.clone(),
|
||||
metadata,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(records)
|
||||
}
|
||||
```
|
||||
Reference in New Issue
Block a user