21 KiB
Kreuzberg Rust API Reference
Complete API reference for the Kreuzberg document extraction library in Rust.
Setup
Add to your Cargo.toml:
[dependencies]
kreuzberg = { version = "4", features = [
"tokio-runtime",
"pdf",
"ocr",
"chunking",
"embeddings",
"language-detection",
"keywords-yake",
"keywords-rake",
"api",
"mcp"
] }
tokio = { version = "1", features = ["full"] }
Core Features
- tokio-runtime: Enables async/sync extraction (default). Required for
extract_file_sync,batch_extract_file_sync,batch_extract_file - pdf: PDF extraction with PDFium
- ocr: Tesseract-based OCR for scanned documents
- chunking: Text chunking for RAG pipelines
- embeddings: Vector embeddings generation
- language-detection: Detect document language
- keywords-yake / keywords-rake: Extract keywords using YAKE or RAKE
- api: HTTP API with Axum
- mcp: Model Context Protocol support
Core Extraction Functions
extract_file (async)
Extract content from a file path.
pub async fn extract_file(
path: impl AsRef<Path>,
mime_type: Option<&str>,
config: &ExtractionConfig,
) -> Result<ExtractionResult>
Always available. Requires async context (#[tokio::main], tokio::spawn, etc.).
use kreuzberg::{extract_file, ExtractionConfig};
use std::path::Path;
#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
let config = ExtractionConfig::default();
let result = extract_file("document.pdf", None, &config).await?;
println!("Content: {}", result.content);
Ok(())
}
extract_bytes (async)
Extract content from byte data.
pub async fn extract_bytes(
data: &[u8],
mime_type: &str,
config: &ExtractionConfig,
) -> Result<ExtractionResult>
Always available. Requires async context.
#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
let config = ExtractionConfig::default();
let pdf_bytes = std::fs::read("document.pdf")?;
let result = extract_bytes(&pdf_bytes, "application/pdf", &config).await?;
Ok(())
}
extract_file_sync (sync)
Synchronous wrapper around extract_file.
pub fn extract_file_sync(
path: impl AsRef<Path>,
mime_type: Option<&str>,
config: &ExtractionConfig,
) -> Result<ExtractionResult>
Requires tokio-runtime feature. Blocks the current thread using a global Tokio runtime.
use kreuzberg::{extract_file_sync, ExtractionConfig};
fn main() -> kreuzberg::Result<()> {
let config = ExtractionConfig::default();
let result = extract_file_sync("document.pdf", None, &config)?;
println!("Content: {}", result.content);
Ok(())
}
extract_bytes_sync (sync)
Synchronous wrapper around extract_bytes.
pub fn extract_bytes_sync(
content: &[u8],
mime_type: &str,
config: &ExtractionConfig,
) -> Result<ExtractionResult>
Always available. Works in sync and async contexts.
fn main() -> kreuzberg::Result<()> {
let config = ExtractionConfig::default();
let bytes = b"Hello, world!";
let result = extract_bytes_sync(bytes, "text/plain", &config)?;
Ok(())
}
batch_extract_file (async, parallel)
Extract multiple files concurrently.
pub async fn batch_extract_file(
paths: Vec<impl AsRef<Path>>,
config: &ExtractionConfig,
) -> Result<Vec<ExtractionResult>>
Requires tokio-runtime feature. Processes files in parallel with automatic concurrency management (defaults to num_cpus * 1.5).
#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
let config = ExtractionConfig::default();
let paths = vec!["doc1.pdf", "doc2.pdf", "doc3.pdf"];
let results = batch_extract_file(paths, &config).await?;
println!("Processed {} files", results.len());
Ok(())
}
batch_extract_bytes (async, parallel)
Extract multiple byte arrays concurrently.
pub async fn batch_extract_bytes(
contents: Vec<(Vec<u8>, String)>,
config: &ExtractionConfig,
) -> Result<Vec<ExtractionResult>>
Requires tokio-runtime feature. Each tuple is (bytes, mime_type).
#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
let config = ExtractionConfig::default();
let contents = vec![
(b"PDF content".to_vec(), "application/pdf".to_string()),
(b"Text content".to_vec(), "text/plain".to_string()),
];
let results = batch_extract_bytes(contents, &config).await?;
Ok(())
}
batch_extract_file_sync (sync, parallel)
Synchronous wrapper for batch file extraction.
pub fn batch_extract_file_sync(
paths: Vec<impl AsRef<Path>>,
config: &ExtractionConfig,
) -> Result<Vec<ExtractionResult>>
Requires tokio-runtime feature. Uses global runtime for concurrency.
fn main() -> kreuzberg::Result<()> {
let config = ExtractionConfig::default();
let paths = vec!["doc1.pdf", "doc2.pdf"];
let results = batch_extract_file_sync(paths, &config)?;
Ok(())
}
batch_extract_bytes_sync (sync, parallel)
Synchronous wrapper for batch byte extraction.
pub fn batch_extract_bytes_sync(
contents: Vec<(Vec<u8>, String)>,
config: &ExtractionConfig,
) -> Result<Vec<ExtractionResult>>
Always available. Each tuple is (bytes, mime_type).
fn main() -> kreuzberg::Result<()> {
let config = ExtractionConfig::default();
let contents = vec![
(b"content 1".to_vec(), "text/plain".to_string()),
(b"content 2".to_vec(), "text/plain".to_string()),
];
let results = batch_extract_bytes_sync(contents, &config)?;
Ok(())
}
FileExtractionConfig
Per-file overrides for batch operations, passed as an optional parameter to batch_extract_file / batch_extract_bytes (and their sync variants). All fields Option<T> — None = use batch default.
Note (v4.5.0): The separate
batch_extract_file_with_configs/batch_extract_bytes_with_configsfunctions have been removed. Per-file configs are now an optional parameter on the unified batch functions.
pub struct FileExtractionConfig {
pub enable_quality_processing: Option<bool>,
pub ocr: Option<OcrConfig>,
pub force_ocr: Option<bool>,
pub chunking: Option<ChunkingConfig>,
pub images: Option<ImageExtractionConfig>,
pub pdf_options: Option<PdfConfig>,
pub token_reduction: Option<TokenReductionConfig>,
pub language_detection: Option<LanguageDetectionConfig>,
pub pages: Option<PageConfig>,
pub postprocessor: Option<PostProcessorConfig>,
pub output_format: Option<OutputFormat>,
pub include_document_structure: Option<bool>,
}
Excluded batch-level fields: max_concurrent_extractions, use_cache, acceleration, security_limits.
Configuration
ExtractionConfig
Main configuration struct for all extraction operations.
pub struct ExtractionConfig {
/// Enable caching (default: true)
pub use_cache: bool,
/// Enable quality post-processing (default: true)
pub enable_quality_processing: bool,
/// OCR configuration (None = OCR disabled)
pub ocr: Option<OcrConfig>,
/// Force OCR even for searchable PDFs (default: false)
pub force_ocr: bool,
/// Text chunking configuration (None = disabled)
pub chunking: Option<ChunkingConfig>,
/// Image extraction configuration (None = disabled)
pub images: Option<ImageExtractionConfig>,
/// PDF-specific options (requires pdf feature)
#[cfg(feature = "pdf")]
pub pdf_options: Option<PdfConfig>,
/// Token reduction configuration (None = disabled)
pub token_reduction: Option<TokenReductionConfig>,
/// Language detection configuration (None = disabled)
pub language_detection: Option<LanguageDetectionConfig>,
/// Page extraction configuration (None = disabled)
pub pages: Option<PageConfig>,
/// Keyword extraction configuration (requires keywords-yake or keywords-rake)
#[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
pub keywords: Option<KeywordConfig>,
/// Post-processor configuration (None = use defaults)
pub postprocessor: Option<PostProcessorConfig>,
/// HTML to Markdown conversion options (requires html feature)
#[cfg(feature = "html")]
pub html_options: Option<ConversionOptions>,
/// Maximum concurrent extractions in batch (None = num_cpus * 1.5)
pub max_concurrent_extractions: Option<usize>,
/// Result structure format (default: Unified)
/// Uses types::OutputFormat (Unified | ElementBased)
pub result_format: types::OutputFormat,
/// Security limits for archives (requires archives feature)
#[cfg(feature = "archives")]
pub security_limits: Option<SecurityLimits>,
/// Content output format (default: Plain)
/// Uses config::OutputFormat (Plain | Markdown | Djot | Html)
pub output_format: OutputFormat,
}
Creating Configs
use kreuzberg::{ExtractionConfig, OcrConfig, ChunkingConfig, OutputFormat};
// Default configuration
let config = ExtractionConfig::default();
// With OCR
let config = ExtractionConfig {
ocr: Some(OcrConfig {
backend: "tesseract".to_string(),
..Default::default()
}),
..Default::default()
};
// With chunking
let config = ExtractionConfig {
chunking: Some(ChunkingConfig {
max_characters: 512,
overlap: 50,
..Default::default()
}),
output_format: OutputFormat::Markdown,
..Default::default()
};
Output Formats
There are two separate enums both named OutputFormat in different modules:
Content OutputFormat (core::config::formats::OutputFormat)
Controls the format of the content field text. Used by ExtractionConfig::output_format.
pub enum OutputFormat {
/// Plain text (default)
Plain,
/// Markdown formatted
Markdown,
/// Djot markup format
Djot,
/// HTML format
Html,
}
Result OutputFormat (types::extraction::OutputFormat)
Controls the result structure. Used by ExtractionConfig::result_format.
pub enum OutputFormat {
/// Unified format with all content in `content` field (default)
Unified,
/// Element-based format with semantic element extraction
ElementBased,
}
use kreuzberg::{ExtractionConfig, OutputFormat};
let config = ExtractionConfig {
output_format: OutputFormat::Markdown, // content format (Plain/Markdown/Djot/Html)
// result_format uses types::OutputFormat (Unified/ElementBased) — defaults to Unified
..Default::default()
};
Extraction Result
ExtractionResult
Result returned by all extraction functions.
pub struct ExtractionResult {
/// Main extracted content
pub content: String,
/// Document MIME type
pub mime_type: Cow<'static, str>,
/// Metadata about extraction
pub metadata: Metadata,
/// Extracted tables (HTML/Markdown)
pub tables: Vec<Table>,
/// Detected languages (if language-detection enabled)
pub detected_languages: Option<Vec<String>>,
/// Text chunks (if chunking enabled)
pub chunks: Option<Vec<Chunk>>,
/// Extracted images (if image extraction enabled)
pub images: Option<Vec<ExtractedImage>>,
/// Per-page content (if page extraction enabled)
pub pages: Option<Vec<PageContent>>,
/// Semantic elements (if element-based format enabled)
pub elements: Option<Vec<Element>>,
/// Djot document structure (if extracting Djot)
pub djot_content: Option<DjotContent>,
/// Extracted keywords with relevance scores (if keyword extraction enabled)
pub extracted_keywords: Option<Vec<ExtractedKeyword>>,
/// Quality score for extraction result (0.0-1.0)
pub quality_score: Option<f64>,
/// Non-fatal warnings during processing pipeline
pub processing_warnings: Vec<ProcessingWarning>,
}
ExtractedKeyword
Extracted keyword with relevance score and position information.
pub struct ExtractedKeyword {
/// Keyword text
pub text: String,
/// Relevance score (0.0-1.0)
pub score: f32,
/// Algorithm used for extraction ("tfidf", "textrank", "yake", etc.)
pub algorithm: String,
/// Character positions in content (if available)
pub positions: Option<Vec<usize>>,
}
ProcessingWarning
Non-fatal warning encountered during document processing.
pub struct ProcessingWarning {
/// Component that generated the warning
pub source: String,
/// Warning message describing the issue
pub message: String,
}
Chunk
Text chunk with optional embedding.
pub struct Chunk {
/// Chunk text content
pub content: String,
/// Optional embedding vector
pub embedding: Option<Vec<f32>>,
/// Chunk metadata
pub metadata: ChunkMetadata,
}
pub struct ChunkMetadata {
pub byte_start: usize,
pub byte_end: usize,
pub token_count: Option<usize>,
pub chunk_index: usize,
pub total_chunks: usize,
pub first_page: Option<usize>,
pub last_page: Option<usize>,
}
ExtractedImage
Image extracted from document.
pub struct ExtractedImage {
/// Raw image bytes
pub data: Bytes,
/// Format: "jpeg", "png", "webp", etc.
pub format: Cow<'static, str>,
/// Zero-indexed position
pub image_index: usize,
/// Page number (1-indexed)
pub page_number: Option<usize>,
/// Image dimensions
pub width: Option<u32>,
pub height: Option<u32>,
/// Colorspace: "RGB", "CMYK", "Gray"
pub colorspace: Option<String>,
/// Bits per component
pub bits_per_component: Option<u32>,
/// Whether this is a mask image
pub is_mask: bool,
/// Image description
pub description: Option<String>,
/// Nested OCR result (if OCRed)
pub ocr_result: Option<Box<ExtractionResult>>,
}
Error Handling
KreuzbergError enum
pub enum KreuzbergError {
/// File system errors (always bubble up)
Io(std::io::Error),
/// Document parsing errors
Parsing {
message: String,
source: Option<Box<dyn std::error::Error + Send + Sync>>,
},
/// OCR processing errors
Ocr {
message: String,
source: Option<Box<dyn std::error::Error + Send + Sync>>,
},
/// Configuration/input validation errors
Validation {
message: String,
source: Option<Box<dyn std::error::Error + Send + Sync>>,
},
/// Cache operation errors
Cache {
message: String,
source: Option<Box<dyn std::error::Error + Send + Sync>>,
},
/// Image processing errors
ImageProcessing {
message: String,
source: Option<Box<dyn std::error::Error + Send + Sync>>,
},
/// Serialization errors (JSON, MessagePack)
Serialization {
message: String,
source: Option<Box<dyn std::error::Error + Send + Sync>>,
},
/// Missing system dependency (e.g. Tesseract)
MissingDependency(String),
/// Plugin-specific errors
Plugin {
message: String,
plugin_name: String,
},
/// Mutex/RwLock poisoning
LockPoisoned(String),
/// Unsupported MIME type or format
UnsupportedFormat(String),
/// Other errors
Other(String),
}
Error Constructors
use kreuzberg::KreuzbergError;
// Create errors
let err = KreuzbergError::parsing("invalid PDF");
let err = KreuzbergError::ocr("Tesseract failed");
let err = KreuzbergError::validation("config invalid");
let err = KreuzbergError::unsupported_format("application/unknown");
let err = KreuzbergError::missing_dependency("tesseract");
// With source
let source = std::io::Error::new(std::io::ErrorKind::NotFound, "file missing");
let err = KreuzbergError::parsing_with_source("corrupt PDF", source);
Handling Errors
use kreuzberg::extract_file;
match extract_file("doc.pdf", None, &config).await {
Ok(result) => println!("Success: {}", result.content),
Err(kreuzberg::KreuzbergError::Io(e)) => {
println!("File error: {}", e);
}
Err(kreuzberg::KreuzbergError::UnsupportedFormat(fmt)) => {
println!("Unsupported: {}", fmt);
}
Err(e) => println!("Other error: {}", e),
}
MIME Type Detection
detect_mime_type
Detect MIME type from file path.
pub fn detect_mime_type(path: impl AsRef<Path>) -> Result<String>
use kreuzberg::detect_mime_type;
let mime = detect_mime_type("document.pdf")?;
assert_eq!(mime, "application/pdf");
detect_mime_type_from_bytes
Detect MIME type from byte data.
pub fn detect_mime_type_from_bytes(data: &[u8]) -> Result<String>
validate_mime_type
Check if a MIME type is supported.
pub fn validate_mime_type(mime_type: &str) -> Result<()>
use kreuzberg::validate_mime_type;
validate_mime_type("application/pdf")?; // OK
validate_mime_type("application/unknown")?; // Error
get_extensions_for_mime
Get file extensions for a MIME type.
pub fn get_extensions_for_mime(mime_type: &str) -> Vec<String>
use kreuzberg::get_extensions_for_mime;
let exts = get_extensions_for_mime("application/pdf");
// ["pdf"]
let exts = get_extensions_for_mime("text/plain");
// ["txt", "text"]
MIME Type Constants
use kreuzberg::{
PDF_MIME_TYPE,
PLAIN_TEXT_MIME_TYPE,
HTML_MIME_TYPE,
MARKDOWN_MIME_TYPE,
JSON_MIME_TYPE,
XML_MIME_TYPE,
DOCX_MIME_TYPE,
POWER_POINT_MIME_TYPE,
EXCEL_MIME_TYPE,
};
assert_eq!(PDF_MIME_TYPE, "application/pdf");
assert_eq!(PLAIN_TEXT_MIME_TYPE, "text/plain");
Plugin Registry
Access extractors, OCR backends, and validators.
get_document_extractor_registry
Get all available document extractors.
pub fn get_document_extractor_registry() -> Arc<RwLock<DocumentExtractorRegistry>>
get_ocr_backend_registry
Get all available OCR backends.
pub fn get_ocr_backend_registry() -> Arc<RwLock<OcrBackendRegistry>>
get_post_processor_registry
Get all available post-processors.
pub fn get_post_processor_registry() -> Arc<RwLock<PostProcessorRegistry>>
get_validator_registry
Get all available validators.
pub fn get_validator_registry() -> Arc<RwLock<ValidatorRegistry>>
Complete Example
use kreuzberg::{
extract_file, ExtractionConfig, OutputFormat,
ChunkingConfig, OcrConfig, LanguageDetectionConfig,
};
#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
// Configure extraction
let config = ExtractionConfig {
output_format: OutputFormat::Markdown,
chunking: Some(ChunkingConfig {
max_characters: 512,
overlap: 50,
..Default::default()
}),
language_detection: Some(LanguageDetectionConfig::default()),
ocr: Some(OcrConfig {
backend: "tesseract".to_string(),
..Default::default()
}),
force_ocr: false,
..Default::default()
};
// Extract from file
let result = extract_file("document.pdf", None, &config).await?;
// Use results
println!("Content:\n{}", result.content);
println!("MIME: {}", result.mime_type);
if let Some(langs) = result.detected_languages {
println!("Languages: {:?}", langs);
}
if let Some(chunks) = result.chunks {
println!("Chunks: {}", chunks.len());
for chunk in chunks {
println!(" - {}", &chunk.content[..50.min(chunk.content.len())]);
}
}
if let Some(images) = result.images {
println!("Images: {}", images.len());
}
if let Some(pages) = result.pages {
println!("Pages: {}", pages.len());
}
Ok(())
}
Result Type Alias
pub type Result<T> = std::result::Result<T, KreuzbergError>;
All fallible operations return Result<T> where errors are KreuzbergError.
Feature Flags Summary
| Feature | Availability | Dependencies |
|---|---|---|
| tokio-runtime | Default | Tokio runtime for async/sync |
| Default | PDFium | |
| ocr | Optional | Tesseract |
| chunking | Optional | text-splitter |
| embeddings | Optional | FastEmbed, requires tokio-runtime |
| language-detection | Optional | whatlang |
| keywords-yake | Optional | yake-rust |
| keywords-rake | Optional | rake |
| api | Optional | Axum, requires tokio-runtime |
| mcp | Optional | Model Context Protocol, requires tokio-runtime |
Version
This reference is for Kreuzberg 4.x.