Files
fil/docs/snippets/rust/metadata/pdf_metadata_extractor.md

80 lines
2.0 KiB
Markdown
Raw Normal View History

2026-06-01 23:40:55 +02:00
```rust title="Rust"
use async_trait::async_trait;
use kreuzberg::plugins::{Plugin, PostProcessor, ProcessingStage};
// NOTE(review): module path for `ProcessingWarning` assumed to sit next to `ExtractionResult` — confirm against the crate docs.
use kreuzberg::types::ProcessingWarning;
use kreuzberg::{Result, ExtractionResult, ExtractionConfig};
use std::sync::atomic::{AtomicUsize, Ordering};
/// Post-processor state: a counter of how many results have been handled.
struct PdfMetadataExtractor {
    /// Incremented once per `process` call; atomic so it can be updated through `&self`.
    processed_count: AtomicUsize,
}

impl PdfMetadataExtractor {
    /// Creates an extractor whose processed counter starts at zero.
    fn new() -> Self {
        let processed_count = AtomicUsize::new(0);
        Self { processed_count }
    }
}
impl Plugin for PdfMetadataExtractor {
fn name(&self) -> &str { "pdf-metadata-extractor" }
fn version(&self) -> String { "1.0.0".to_string() }
fn description(&self) -> &str {
"Extracts and enriches PDF metadata"
}
fn initialize(&self) -> Result<()> {
log::info!("PDF metadata extractor initialized");
Ok(())
}
fn shutdown(&self) -> Result<()> {
let count = self.processed_count.load(Ordering::Acquire);
log::info!("Processed {} PDFs", count);
Ok(())
}
}
// Post-processing hooks: count each handled result and attach a note to it.
#[async_trait]
impl PostProcessor for PdfMetadataExtractor {
    /// Bumps the processed counter and appends one `ProcessingWarning` to
    /// `result.processing_warnings`. Always returns `Ok(())`.
    async fn process(
        &self,
        result: &mut ExtractionResult,
        _config: &ExtractionConfig,
    ) -> Result<()> {
        self.processed_count.fetch_add(1, Ordering::AcqRel);

        result.processing_warnings.push(ProcessingWarning {
            // Take the source from `Plugin::name` so the two strings cannot drift apart.
            source: self.name().to_string(),
            message: "PDF metadata extracted successfully".to_string(),
        });

        Ok(())
    }

    /// Reports `ProcessingStage::Early` as the stage for this processor.
    fn processing_stage(&self) -> ProcessingStage {
        ProcessingStage::Early
    }

    /// Returns `true` only when the result's MIME type is exactly `application/pdf`.
    fn should_process(
        &self,
        result: &ExtractionResult,
        _config: &ExtractionConfig,
    ) -> bool {
        result.mime_type == "application/pdf"
    }

    /// Fixed duration estimate of 10 ms, independent of the result.
    fn estimated_duration_ms(&self, _result: &ExtractionResult) -> u64 {
        10
    }
}
use kreuzberg::plugins::registry::get_post_processor_registry;
use std::sync::Arc;
/// Builds a `PdfMetadataExtractor`, wraps it in an `Arc`, and hands it to the
/// post-processor registry.
///
/// Any error returned by the registry's `register` call is propagated.
fn register() -> Result<()> {
    let extractor = Arc::new(PdfMetadataExtractor::new());
    // NOTE(review): the second argument (50) is presumably a priority — confirm against the registry docs.
    get_post_processor_registry().register(extractor, 50)?;
    Ok(())
}
```