This commit is contained in:
478
crates/kreuzberg/src/core/pipeline/mod.rs
Normal file
478
crates/kreuzberg/src/core/pipeline/mod.rs
Normal file
@@ -0,0 +1,478 @@
|
||||
//! Post-processing pipeline orchestration.
|
||||
//!
|
||||
//! This module orchestrates the post-processing pipeline, executing validators,
|
||||
//! quality processing, chunking, and custom hooks in the correct order.
|
||||
|
||||
mod cache;
|
||||
mod execution;
|
||||
mod features;
|
||||
mod format;
|
||||
mod initialization;
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests;
|
||||
|
||||
pub use cache::clear_processor_cache;
|
||||
pub use format::apply_output_format;
|
||||
|
||||
use crate::Result;
|
||||
use crate::core::config::ExtractionConfig;
|
||||
use crate::types::ExtractionResult;
|
||||
use crate::types::internal::InternalDocument;
|
||||
|
||||
use execution::{execute_processors, execute_validators};
|
||||
use features::{execute_chunking, execute_language_detection, execute_token_reduction};
|
||||
use initialization::{get_processors_from_cache, initialize_features, initialize_processor_cache};
|
||||
|
||||
/// Run the post-processing pipeline on an `InternalDocument`.
|
||||
///
|
||||
/// Derives `ExtractionResult` from `InternalDocument` via the derivation pipeline,
|
||||
/// then executes post-processing in the following order:
|
||||
/// 1. Post-Processors - Execute by stage (Early, Middle, Late) to modify/enhance the result
|
||||
/// 2. Quality Processing - Text cleaning and quality scoring
|
||||
/// 3. Chunking - Text splitting if enabled
|
||||
/// 4. Validators - Run validation hooks on the processed result (can fail fast)
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `doc` - The internal document produced by the extractor
|
||||
/// * `config` - Extraction configuration
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// The processed extraction result.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// - Validator errors bubble up immediately
|
||||
/// - Post-processor errors are caught and recorded in metadata
|
||||
/// - System errors (IO, RuntimeError equivalents) always bubble up
|
||||
#[cfg_attr(feature = "otel", tracing::instrument(
|
||||
skip(doc, config),
|
||||
fields(
|
||||
pipeline.stage = "post_processing",
|
||||
content.element_count = doc.elements.len(),
|
||||
)
|
||||
))]
|
||||
#[cfg_attr(alef, alef(skip))]
|
||||
pub async fn run_pipeline(mut doc: InternalDocument, config: &ExtractionConfig) -> Result<ExtractionResult> {
|
||||
// Propagate rendering preferences from config into the document.
|
||||
doc.ocr_text_only = config.images.as_ref().map(|i| i.ocr_text_only).unwrap_or(false);
|
||||
doc.append_ocr_text = config.images.as_ref().map(|i| i.append_ocr_text).unwrap_or(false);
|
||||
|
||||
// 1. Process extracted images with OCR if configured
|
||||
#[cfg(all(feature = "ocr", feature = "tokio-runtime"))]
|
||||
let image_ocr_enabled = config.images.as_ref().map(|i| i.run_ocr_on_images).unwrap_or(true);
|
||||
#[cfg(all(feature = "ocr", feature = "tokio-runtime"))]
|
||||
if image_ocr_enabled && config.ocr.is_some() && !doc.images.is_empty() {
|
||||
let images_to_process = std::mem::take(&mut doc.images);
|
||||
match crate::extraction::image_ocr::process_images_with_ocr(
|
||||
images_to_process,
|
||||
config,
|
||||
&mut doc.processing_warnings,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(processed) => {
|
||||
doc.images = processed;
|
||||
}
|
||||
Err(e) => {
|
||||
doc.processing_warnings.push(crate::types::ProcessingWarning {
|
||||
source: std::borrow::Cow::Borrowed("image_ocr"),
|
||||
message: std::borrow::Cow::Owned(format!("Image OCR failed: {e}")),
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
replace_embedded_image_markdown_with_ocr(&mut doc);
|
||||
append_embedded_image_ocr_text(&mut doc);
|
||||
|
||||
// Pre-render markdown for the chunker's heading context resolution when:
|
||||
// - Markdown chunking is configured
|
||||
// - Output format is not already Markdown (which would produce formatted_content anyway)
|
||||
// Plain-text rendering strips heading markers, so the markdown chunker needs
|
||||
// a separate markdown rendering to build the heading hierarchy for chunk metadata.
|
||||
#[cfg(feature = "chunking")]
|
||||
let chunker_heading_source = {
|
||||
let needs_markdown = config.chunking.as_ref().is_some_and(|c| {
|
||||
c.chunker_type == crate::core::config::ChunkerType::Markdown
|
||||
|| c.resolve_preset().chunker_type == crate::core::config::ChunkerType::Markdown
|
||||
}) && config.output_format == crate::core::config::OutputFormat::Plain;
|
||||
if needs_markdown {
|
||||
Some(crate::rendering::render_markdown(&doc))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
};
|
||||
|
||||
// Pre-render styled HTML before `doc` is consumed by `derive_extraction_result`.
|
||||
// When `html` is active and the caller has configured `html_output`, we
|
||||
// render the document here and inject the result after derivation.
|
||||
#[cfg(feature = "html")]
|
||||
let styled_html_prerender: Option<String> = {
|
||||
use crate::plugins::Renderer as _;
|
||||
if config.output_format == crate::core::config::OutputFormat::Html {
|
||||
config.html_output.as_ref().and_then(|html_cfg| {
|
||||
match crate::rendering::StyledHtmlRenderer::new(html_cfg.clone()) {
|
||||
Ok(renderer) => match renderer.render(&doc) {
|
||||
Ok(html) => Some(html),
|
||||
Err(e) => {
|
||||
tracing::warn!("StyledHtmlRenderer render failed, falling back to default HTML: {e}");
|
||||
None
|
||||
}
|
||||
},
|
||||
Err(e) => {
|
||||
tracing::warn!("StyledHtmlRenderer construction failed, falling back to default HTML: {e}");
|
||||
None
|
||||
}
|
||||
}
|
||||
})
|
||||
} else {
|
||||
None
|
||||
}
|
||||
};
|
||||
|
||||
// 2. Derive ExtractionResult from InternalDocument
|
||||
let include_structure = config.include_document_structure;
|
||||
let mut result =
|
||||
crate::extraction::derive::derive_extraction_result(doc, include_structure, config.output_format.clone());
|
||||
|
||||
// Inject pre-rendered styled HTML (overrides the default render_html output).
|
||||
#[cfg(feature = "html")]
|
||||
if let Some(html) = styled_html_prerender {
|
||||
result.formatted_content = Some(html);
|
||||
}
|
||||
|
||||
// Temporarily store pre-rendered markdown for chunker heading context.
|
||||
// Tracked separately so we can remove it after chunking — apply_output_format
|
||||
// must not swap this into result.content when output_format is Plain.
|
||||
#[cfg(feature = "chunking")]
|
||||
let chunker_only_markdown = result.formatted_content.is_none();
|
||||
#[cfg(feature = "chunking")]
|
||||
if chunker_only_markdown && let Some(md) = chunker_heading_source {
|
||||
result.formatted_content = Some(md);
|
||||
}
|
||||
|
||||
// 2. Run post-processing pipeline
|
||||
let pp_config = config.postprocessor.as_ref();
|
||||
let postprocessing_enabled = pp_config.is_none_or(|c| c.enabled);
|
||||
|
||||
if postprocessing_enabled {
|
||||
initialize_features();
|
||||
initialize_processor_cache()?;
|
||||
|
||||
let (early_processors, middle_processors, late_processors) = get_processors_from_cache()?;
|
||||
|
||||
execute_processors(
|
||||
&mut result,
|
||||
config,
|
||||
&pp_config,
|
||||
early_processors,
|
||||
middle_processors,
|
||||
late_processors,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
|
||||
execute_chunking(&mut result, config)?;
|
||||
|
||||
// Clear temporary markdown if it was only stored for chunker heading context.
|
||||
// This prevents apply_output_format from swapping it into result.content.
|
||||
#[cfg(feature = "chunking")]
|
||||
if chunker_only_markdown {
|
||||
result.formatted_content = None;
|
||||
}
|
||||
|
||||
execute_language_detection(&mut result, config)?;
|
||||
execute_token_reduction(&mut result, config)?;
|
||||
execute_validators(&result, config).await?;
|
||||
|
||||
apply_element_transform(&mut result, config);
|
||||
normalize_nfc(&mut result);
|
||||
|
||||
// Run LLM-based structured extraction BEFORE output formatting
|
||||
// so extraction sees plain text, not markdown/HTML
|
||||
// TODO(wasm-llm): hosted structured extraction should run on wasm through
|
||||
// liter-llm's wasm-http backend once browser/runtime support is wired.
|
||||
#[cfg(all(feature = "liter-llm", not(target_os = "windows"), not(target_arch = "wasm32")))]
|
||||
if let Some(ref structured_config) = config.structured_extraction {
|
||||
match crate::llm::structured::extract_structured(&result.content, structured_config).await {
|
||||
Ok((output, usage)) => {
|
||||
result.structured_output = Some(output);
|
||||
crate::llm::usage::push_llm_usage(&mut result, usage);
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::warn!("Structured extraction failed: {e}");
|
||||
result.processing_warnings.push(crate::types::ProcessingWarning {
|
||||
source: std::borrow::Cow::Borrowed("structured_extraction"),
|
||||
message: std::borrow::Cow::Owned(format!("Structured extraction failed: {e}")),
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TODO(wasm-llm): keep wasm in the fallback branch until structured
|
||||
// extraction has an async wasm-compatible runtime path.
|
||||
#[cfg(any(not(feature = "liter-llm"), target_os = "windows", target_arch = "wasm32"))]
|
||||
if config.structured_extraction.is_some() {
|
||||
result.processing_warnings.push(crate::types::ProcessingWarning {
|
||||
source: std::borrow::Cow::Borrowed("structured_extraction"),
|
||||
message: std::borrow::Cow::Borrowed("Structured extraction requires the 'liter-llm' feature"),
|
||||
});
|
||||
}
|
||||
|
||||
// Apply output format conversion as the final step
|
||||
result = apply_output_format(result, config.output_format.clone());
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
/// Run the post-processing pipeline synchronously (WASM-compatible version).
|
||||
///
|
||||
/// This is a synchronous implementation for WASM and non-async contexts.
|
||||
/// It performs a subset of the full async pipeline, excluding async post-processors
|
||||
/// and validators.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `doc` - The internal document produced by the extractor
|
||||
/// * `config` - Extraction configuration
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// The processed extraction result.
|
||||
///
|
||||
/// # Notes
|
||||
///
|
||||
/// This function is only available when the `tokio-runtime` feature is disabled.
|
||||
/// It handles:
|
||||
/// - Quality processing (if enabled)
|
||||
/// - Chunking (if enabled)
|
||||
/// - Language detection (if enabled)
|
||||
///
|
||||
/// It does NOT handle:
|
||||
/// - Async post-processors
|
||||
/// - Async validators
|
||||
#[cfg(not(feature = "tokio-runtime"))]
|
||||
#[cfg_attr(alef, alef(skip))]
|
||||
pub fn run_pipeline_sync(doc: InternalDocument, config: &ExtractionConfig) -> Result<ExtractionResult> {
|
||||
// Pre-render markdown for chunker heading context (same logic as async path).
|
||||
#[cfg(feature = "chunking")]
|
||||
let chunker_heading_source = {
|
||||
let needs_markdown = config.chunking.as_ref().is_some_and(|c| {
|
||||
c.chunker_type == crate::core::config::ChunkerType::Markdown
|
||||
|| c.resolve_preset().chunker_type == crate::core::config::ChunkerType::Markdown
|
||||
}) && config.output_format == crate::core::config::OutputFormat::Plain;
|
||||
if needs_markdown {
|
||||
Some(crate::rendering::render_markdown(&doc))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
};
|
||||
|
||||
// Pre-render styled HTML before `doc` is consumed (mirrors async path).
|
||||
#[cfg(feature = "html")]
|
||||
let styled_html_prerender: Option<String> = {
|
||||
use crate::plugins::Renderer as _;
|
||||
if config.output_format == crate::core::config::OutputFormat::Html {
|
||||
config.html_output.as_ref().and_then(|html_cfg| {
|
||||
match crate::rendering::StyledHtmlRenderer::new(html_cfg.clone()) {
|
||||
Ok(renderer) => match renderer.render(&doc) {
|
||||
Ok(html) => Some(html),
|
||||
Err(e) => {
|
||||
tracing::warn!("StyledHtmlRenderer render failed, falling back to default HTML: {e}");
|
||||
None
|
||||
}
|
||||
},
|
||||
Err(e) => {
|
||||
tracing::warn!("StyledHtmlRenderer construction failed, falling back to default HTML: {e}");
|
||||
None
|
||||
}
|
||||
}
|
||||
})
|
||||
} else {
|
||||
None
|
||||
}
|
||||
};
|
||||
|
||||
// 1. Derive ExtractionResult from InternalDocument
|
||||
let include_structure = config.include_document_structure;
|
||||
let mut result =
|
||||
crate::extraction::derive::derive_extraction_result(doc, include_structure, config.output_format.clone());
|
||||
|
||||
// Inject pre-rendered styled HTML.
|
||||
#[cfg(feature = "html")]
|
||||
if let Some(html) = styled_html_prerender {
|
||||
result.formatted_content = Some(html);
|
||||
}
|
||||
|
||||
#[cfg(feature = "chunking")]
|
||||
let chunker_only_markdown = result.formatted_content.is_none();
|
||||
#[cfg(feature = "chunking")]
|
||||
if chunker_only_markdown && let Some(md) = chunker_heading_source {
|
||||
result.formatted_content = Some(md);
|
||||
}
|
||||
|
||||
// 2. Run synchronous post-processing
|
||||
execute_chunking(&mut result, config)?;
|
||||
|
||||
#[cfg(feature = "chunking")]
|
||||
if chunker_only_markdown {
|
||||
result.formatted_content = None;
|
||||
}
|
||||
|
||||
execute_language_detection(&mut result, config)?;
|
||||
execute_token_reduction(&mut result, config)?;
|
||||
|
||||
apply_element_transform(&mut result, config);
|
||||
normalize_nfc(&mut result);
|
||||
|
||||
// Apply output format conversion as the final step
|
||||
result = apply_output_format(result, config.output_format.clone());
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
/// Transform to element-based output if requested by the config.
|
||||
fn apply_element_transform(result: &mut ExtractionResult, config: &ExtractionConfig) {
|
||||
if config.result_format == crate::types::ResultFormat::ElementBased {
|
||||
result.elements = Some(crate::extraction::transform::transform_extraction_result_to_elements(
|
||||
result,
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
/// Replace inline markdown image references with OCR text for formats (e.g. PPTX)
|
||||
/// that bake placeholders into paragraph text rather than using `ElementKind::Image`.
|
||||
fn replace_embedded_image_markdown_with_ocr(doc: &mut InternalDocument) {
|
||||
if !doc.ocr_text_only || doc.images.is_empty() {
|
||||
return;
|
||||
}
|
||||
|
||||
let mut image_idx = 0usize;
|
||||
|
||||
for elem in &mut doc.elements {
|
||||
if !matches!(elem.kind, crate::types::internal::ElementKind::Paragraph) {
|
||||
continue;
|
||||
}
|
||||
if !is_markdown_image_reference(&elem.text) {
|
||||
continue;
|
||||
}
|
||||
if let Some(img) = doc.images.get(image_idx)
|
||||
&& let Some(ocr) = &img.ocr_result
|
||||
&& !ocr.content.is_empty()
|
||||
{
|
||||
elem.text = ocr.content.clone();
|
||||
image_idx += 1;
|
||||
continue;
|
||||
}
|
||||
image_idx += 1;
|
||||
}
|
||||
|
||||
for table in &mut doc.tables {
|
||||
for row in &mut table.cells {
|
||||
for cell in row {
|
||||
if !is_markdown_image_reference(cell) {
|
||||
continue;
|
||||
}
|
||||
if let Some(img) = doc.images.get(image_idx)
|
||||
&& let Some(ocr) = &img.ocr_result
|
||||
&& !ocr.content.is_empty()
|
||||
{
|
||||
*cell = ocr.content.clone();
|
||||
image_idx += 1;
|
||||
continue;
|
||||
}
|
||||
image_idx += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Append OCR text after inline markdown image references for formats (e.g. PPTX)
|
||||
/// that bake placeholders into paragraph text. Only runs when `append_ocr_text` is
|
||||
/// `true` and `ocr_text_only` is `false`.
|
||||
fn append_embedded_image_ocr_text(doc: &mut InternalDocument) {
|
||||
if doc.ocr_text_only || !doc.append_ocr_text || doc.images.is_empty() {
|
||||
return;
|
||||
}
|
||||
|
||||
let mut image_idx = 0usize;
|
||||
let mut new_elements = Vec::with_capacity(doc.elements.len() * 2);
|
||||
|
||||
for elem in &doc.elements {
|
||||
new_elements.push(elem.clone());
|
||||
|
||||
if matches!(elem.kind, crate::types::internal::ElementKind::Paragraph)
|
||||
&& is_markdown_image_reference(&elem.text)
|
||||
{
|
||||
if let Some(img) = doc.images.get(image_idx)
|
||||
&& let Some(ocr) = &img.ocr_result
|
||||
&& !ocr.content.is_empty()
|
||||
{
|
||||
let ocr_elem = crate::types::internal::InternalElement::text(
|
||||
crate::types::internal::ElementKind::Paragraph,
|
||||
ocr.content.clone(),
|
||||
0,
|
||||
);
|
||||
new_elements.push(ocr_elem);
|
||||
}
|
||||
image_idx += 1;
|
||||
}
|
||||
}
|
||||
|
||||
doc.elements = new_elements;
|
||||
|
||||
for table in &mut doc.tables {
|
||||
for row in &mut table.cells {
|
||||
for cell in row {
|
||||
if !is_markdown_image_reference(cell) {
|
||||
continue;
|
||||
}
|
||||
if let Some(img) = doc.images.get(image_idx)
|
||||
&& let Some(ocr) = &img.ocr_result
|
||||
&& !ocr.content.is_empty()
|
||||
{
|
||||
*cell = format!("{}\n\n{}", cell.trim(), ocr.content);
|
||||
}
|
||||
image_idx += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns `true` if `text` is exactly a markdown image reference (`![alt](url)`).
///
/// Surrounding whitespace is ignored. The text must start with `![`, contain
/// the `](` separator, and end with `)`; the alt text and URL may be empty.
fn is_markdown_image_reference(text: &str) -> bool {
    let t = text.trim();
    if !t.starts_with("![") {
        return false;
    }
    let Some(bracket_end) = t.find("](") else {
        return false;
    };
    // Defensive: the separator can never precede the two-byte `![` prefix.
    if bracket_end < 2 {
        return false;
    }
    // `](` is two ASCII bytes, so this offset is always a char boundary.
    let after = &t[bracket_end + 2..];
    after.ends_with(')')
}
|
||||
|
||||
/// Apply NFC unicode normalization to all text content.
|
||||
///
|
||||
/// Ensures consistent representation of composed characters (e.g., é vs e+combining accent)
|
||||
/// across all extraction backends (PDF, OCR, DOCX, HTML, etc.).
|
||||
fn normalize_nfc(result: &mut ExtractionResult) {
|
||||
#[cfg(feature = "quality")]
|
||||
{
|
||||
use unicode_normalization::UnicodeNormalization;
|
||||
result.content = result.content.nfc().collect();
|
||||
if let Some(pages) = result.pages.as_mut() {
|
||||
for page in pages.iter_mut() {
|
||||
page.content = page.content.nfc().collect();
|
||||
}
|
||||
}
|
||||
}
|
||||
// Suppress unused variable warning when quality feature is disabled
|
||||
let _ = result;
|
||||
}
|
||||
Reference in New Issue
Block a user