479 lines
18 KiB
Rust
479 lines
18 KiB
Rust
//! Post-processing pipeline orchestration.
|
|
//!
|
|
//! This module orchestrates the post-processing pipeline, executing validators,
|
|
//! quality processing, chunking, and custom hooks in the correct order.
|
|
|
|
mod cache;
|
|
mod execution;
|
|
mod features;
|
|
mod format;
|
|
mod initialization;
|
|
|
|
#[cfg(test)]
|
|
mod tests;
|
|
|
|
pub use cache::clear_processor_cache;
|
|
pub use format::apply_output_format;
|
|
|
|
use crate::Result;
|
|
use crate::core::config::ExtractionConfig;
|
|
use crate::types::ExtractionResult;
|
|
use crate::types::internal::InternalDocument;
|
|
|
|
use execution::{execute_processors, execute_validators};
|
|
use features::{execute_chunking, execute_language_detection, execute_token_reduction};
|
|
use initialization::{get_processors_from_cache, initialize_features, initialize_processor_cache};
|
|
|
|
/// Run the post-processing pipeline on an `InternalDocument`.
|
|
///
|
|
/// Derives `ExtractionResult` from `InternalDocument` via the derivation pipeline,
|
|
/// then executes post-processing in the following order:
|
|
/// 1. Post-Processors - Execute by stage (Early, Middle, Late) to modify/enhance the result
|
|
/// 2. Quality Processing - Text cleaning and quality scoring
|
|
/// 3. Chunking - Text splitting if enabled
|
|
/// 4. Validators - Run validation hooks on the processed result (can fail fast)
|
|
///
|
|
/// # Arguments
|
|
///
|
|
/// * `doc` - The internal document produced by the extractor
|
|
/// * `config` - Extraction configuration
|
|
///
|
|
/// # Returns
|
|
///
|
|
/// The processed extraction result.
|
|
///
|
|
/// # Errors
|
|
///
|
|
/// - Validator errors bubble up immediately
|
|
/// - Post-processor errors are caught and recorded in metadata
|
|
/// - System errors (IO, RuntimeError equivalents) always bubble up
|
|
#[cfg_attr(feature = "otel", tracing::instrument(
|
|
skip(doc, config),
|
|
fields(
|
|
pipeline.stage = "post_processing",
|
|
content.element_count = doc.elements.len(),
|
|
)
|
|
))]
|
|
#[cfg_attr(alef, alef(skip))]
|
|
pub async fn run_pipeline(mut doc: InternalDocument, config: &ExtractionConfig) -> Result<ExtractionResult> {
|
|
// Propagate rendering preferences from config into the document.
|
|
doc.ocr_text_only = config.images.as_ref().map(|i| i.ocr_text_only).unwrap_or(false);
|
|
doc.append_ocr_text = config.images.as_ref().map(|i| i.append_ocr_text).unwrap_or(false);
|
|
|
|
// 1. Process extracted images with OCR if configured
|
|
#[cfg(all(feature = "ocr", feature = "tokio-runtime"))]
|
|
let image_ocr_enabled = config.images.as_ref().map(|i| i.run_ocr_on_images).unwrap_or(true);
|
|
#[cfg(all(feature = "ocr", feature = "tokio-runtime"))]
|
|
if image_ocr_enabled && config.ocr.is_some() && !doc.images.is_empty() {
|
|
let images_to_process = std::mem::take(&mut doc.images);
|
|
match crate::extraction::image_ocr::process_images_with_ocr(
|
|
images_to_process,
|
|
config,
|
|
&mut doc.processing_warnings,
|
|
)
|
|
.await
|
|
{
|
|
Ok(processed) => {
|
|
doc.images = processed;
|
|
}
|
|
Err(e) => {
|
|
doc.processing_warnings.push(crate::types::ProcessingWarning {
|
|
source: std::borrow::Cow::Borrowed("image_ocr"),
|
|
message: std::borrow::Cow::Owned(format!("Image OCR failed: {e}")),
|
|
});
|
|
}
|
|
}
|
|
}
|
|
|
|
replace_embedded_image_markdown_with_ocr(&mut doc);
|
|
append_embedded_image_ocr_text(&mut doc);
|
|
|
|
// Pre-render markdown for the chunker's heading context resolution when:
|
|
// - Markdown chunking is configured
|
|
// - Output format is not already Markdown (which would produce formatted_content anyway)
|
|
// Plain-text rendering strips heading markers, so the markdown chunker needs
|
|
// a separate markdown rendering to build the heading hierarchy for chunk metadata.
|
|
#[cfg(feature = "chunking")]
|
|
let chunker_heading_source = {
|
|
let needs_markdown = config.chunking.as_ref().is_some_and(|c| {
|
|
c.chunker_type == crate::core::config::ChunkerType::Markdown
|
|
|| c.resolve_preset().chunker_type == crate::core::config::ChunkerType::Markdown
|
|
}) && config.output_format == crate::core::config::OutputFormat::Plain;
|
|
if needs_markdown {
|
|
Some(crate::rendering::render_markdown(&doc))
|
|
} else {
|
|
None
|
|
}
|
|
};
|
|
|
|
// Pre-render styled HTML before `doc` is consumed by `derive_extraction_result`.
|
|
// When `html` is active and the caller has configured `html_output`, we
|
|
// render the document here and inject the result after derivation.
|
|
#[cfg(feature = "html")]
|
|
let styled_html_prerender: Option<String> = {
|
|
use crate::plugins::Renderer as _;
|
|
if config.output_format == crate::core::config::OutputFormat::Html {
|
|
config.html_output.as_ref().and_then(|html_cfg| {
|
|
match crate::rendering::StyledHtmlRenderer::new(html_cfg.clone()) {
|
|
Ok(renderer) => match renderer.render(&doc) {
|
|
Ok(html) => Some(html),
|
|
Err(e) => {
|
|
tracing::warn!("StyledHtmlRenderer render failed, falling back to default HTML: {e}");
|
|
None
|
|
}
|
|
},
|
|
Err(e) => {
|
|
tracing::warn!("StyledHtmlRenderer construction failed, falling back to default HTML: {e}");
|
|
None
|
|
}
|
|
}
|
|
})
|
|
} else {
|
|
None
|
|
}
|
|
};
|
|
|
|
// 2. Derive ExtractionResult from InternalDocument
|
|
let include_structure = config.include_document_structure;
|
|
let mut result =
|
|
crate::extraction::derive::derive_extraction_result(doc, include_structure, config.output_format.clone());
|
|
|
|
// Inject pre-rendered styled HTML (overrides the default render_html output).
|
|
#[cfg(feature = "html")]
|
|
if let Some(html) = styled_html_prerender {
|
|
result.formatted_content = Some(html);
|
|
}
|
|
|
|
// Temporarily store pre-rendered markdown for chunker heading context.
|
|
// Tracked separately so we can remove it after chunking — apply_output_format
|
|
// must not swap this into result.content when output_format is Plain.
|
|
#[cfg(feature = "chunking")]
|
|
let chunker_only_markdown = result.formatted_content.is_none();
|
|
#[cfg(feature = "chunking")]
|
|
if chunker_only_markdown && let Some(md) = chunker_heading_source {
|
|
result.formatted_content = Some(md);
|
|
}
|
|
|
|
// 2. Run post-processing pipeline
|
|
let pp_config = config.postprocessor.as_ref();
|
|
let postprocessing_enabled = pp_config.is_none_or(|c| c.enabled);
|
|
|
|
if postprocessing_enabled {
|
|
initialize_features();
|
|
initialize_processor_cache()?;
|
|
|
|
let (early_processors, middle_processors, late_processors) = get_processors_from_cache()?;
|
|
|
|
execute_processors(
|
|
&mut result,
|
|
config,
|
|
&pp_config,
|
|
early_processors,
|
|
middle_processors,
|
|
late_processors,
|
|
)
|
|
.await?;
|
|
}
|
|
|
|
execute_chunking(&mut result, config)?;
|
|
|
|
// Clear temporary markdown if it was only stored for chunker heading context.
|
|
// This prevents apply_output_format from swapping it into result.content.
|
|
#[cfg(feature = "chunking")]
|
|
if chunker_only_markdown {
|
|
result.formatted_content = None;
|
|
}
|
|
|
|
execute_language_detection(&mut result, config)?;
|
|
execute_token_reduction(&mut result, config)?;
|
|
execute_validators(&result, config).await?;
|
|
|
|
apply_element_transform(&mut result, config);
|
|
normalize_nfc(&mut result);
|
|
|
|
// Run LLM-based structured extraction BEFORE output formatting
|
|
// so extraction sees plain text, not markdown/HTML
|
|
// TODO(wasm-llm): hosted structured extraction should run on wasm through
|
|
// liter-llm's wasm-http backend once browser/runtime support is wired.
|
|
#[cfg(all(feature = "liter-llm", not(target_os = "windows"), not(target_arch = "wasm32")))]
|
|
if let Some(ref structured_config) = config.structured_extraction {
|
|
match crate::llm::structured::extract_structured(&result.content, structured_config).await {
|
|
Ok((output, usage)) => {
|
|
result.structured_output = Some(output);
|
|
crate::llm::usage::push_llm_usage(&mut result, usage);
|
|
}
|
|
Err(e) => {
|
|
tracing::warn!("Structured extraction failed: {e}");
|
|
result.processing_warnings.push(crate::types::ProcessingWarning {
|
|
source: std::borrow::Cow::Borrowed("structured_extraction"),
|
|
message: std::borrow::Cow::Owned(format!("Structured extraction failed: {e}")),
|
|
});
|
|
}
|
|
}
|
|
}
|
|
|
|
// TODO(wasm-llm): keep wasm in the fallback branch until structured
|
|
// extraction has an async wasm-compatible runtime path.
|
|
#[cfg(any(not(feature = "liter-llm"), target_os = "windows", target_arch = "wasm32"))]
|
|
if config.structured_extraction.is_some() {
|
|
result.processing_warnings.push(crate::types::ProcessingWarning {
|
|
source: std::borrow::Cow::Borrowed("structured_extraction"),
|
|
message: std::borrow::Cow::Borrowed("Structured extraction requires the 'liter-llm' feature"),
|
|
});
|
|
}
|
|
|
|
// Apply output format conversion as the final step
|
|
result = apply_output_format(result, config.output_format.clone());
|
|
|
|
Ok(result)
|
|
}
|
|
|
|
/// Run the post-processing pipeline synchronously (WASM-compatible version).
|
|
///
|
|
/// This is a synchronous implementation for WASM and non-async contexts.
|
|
/// It performs a subset of the full async pipeline, excluding async post-processors
|
|
/// and validators.
|
|
///
|
|
/// # Arguments
|
|
///
|
|
/// * `doc` - The internal document produced by the extractor
|
|
/// * `config` - Extraction configuration
|
|
///
|
|
/// # Returns
|
|
///
|
|
/// The processed extraction result.
|
|
///
|
|
/// # Notes
|
|
///
|
|
/// This function is only available when the `tokio-runtime` feature is disabled.
|
|
/// It handles:
|
|
/// - Quality processing (if enabled)
|
|
/// - Chunking (if enabled)
|
|
/// - Language detection (if enabled)
|
|
///
|
|
/// It does NOT handle:
|
|
/// - Async post-processors
|
|
/// - Async validators
|
|
#[cfg(not(feature = "tokio-runtime"))]
|
|
#[cfg_attr(alef, alef(skip))]
|
|
pub fn run_pipeline_sync(doc: InternalDocument, config: &ExtractionConfig) -> Result<ExtractionResult> {
|
|
// Pre-render markdown for chunker heading context (same logic as async path).
|
|
#[cfg(feature = "chunking")]
|
|
let chunker_heading_source = {
|
|
let needs_markdown = config.chunking.as_ref().is_some_and(|c| {
|
|
c.chunker_type == crate::core::config::ChunkerType::Markdown
|
|
|| c.resolve_preset().chunker_type == crate::core::config::ChunkerType::Markdown
|
|
}) && config.output_format == crate::core::config::OutputFormat::Plain;
|
|
if needs_markdown {
|
|
Some(crate::rendering::render_markdown(&doc))
|
|
} else {
|
|
None
|
|
}
|
|
};
|
|
|
|
// Pre-render styled HTML before `doc` is consumed (mirrors async path).
|
|
#[cfg(feature = "html")]
|
|
let styled_html_prerender: Option<String> = {
|
|
use crate::plugins::Renderer as _;
|
|
if config.output_format == crate::core::config::OutputFormat::Html {
|
|
config.html_output.as_ref().and_then(|html_cfg| {
|
|
match crate::rendering::StyledHtmlRenderer::new(html_cfg.clone()) {
|
|
Ok(renderer) => match renderer.render(&doc) {
|
|
Ok(html) => Some(html),
|
|
Err(e) => {
|
|
tracing::warn!("StyledHtmlRenderer render failed, falling back to default HTML: {e}");
|
|
None
|
|
}
|
|
},
|
|
Err(e) => {
|
|
tracing::warn!("StyledHtmlRenderer construction failed, falling back to default HTML: {e}");
|
|
None
|
|
}
|
|
}
|
|
})
|
|
} else {
|
|
None
|
|
}
|
|
};
|
|
|
|
// 1. Derive ExtractionResult from InternalDocument
|
|
let include_structure = config.include_document_structure;
|
|
let mut result =
|
|
crate::extraction::derive::derive_extraction_result(doc, include_structure, config.output_format.clone());
|
|
|
|
// Inject pre-rendered styled HTML.
|
|
#[cfg(feature = "html")]
|
|
if let Some(html) = styled_html_prerender {
|
|
result.formatted_content = Some(html);
|
|
}
|
|
|
|
#[cfg(feature = "chunking")]
|
|
let chunker_only_markdown = result.formatted_content.is_none();
|
|
#[cfg(feature = "chunking")]
|
|
if chunker_only_markdown && let Some(md) = chunker_heading_source {
|
|
result.formatted_content = Some(md);
|
|
}
|
|
|
|
// 2. Run synchronous post-processing
|
|
execute_chunking(&mut result, config)?;
|
|
|
|
#[cfg(feature = "chunking")]
|
|
if chunker_only_markdown {
|
|
result.formatted_content = None;
|
|
}
|
|
|
|
execute_language_detection(&mut result, config)?;
|
|
execute_token_reduction(&mut result, config)?;
|
|
|
|
apply_element_transform(&mut result, config);
|
|
normalize_nfc(&mut result);
|
|
|
|
// Apply output format conversion as the final step
|
|
result = apply_output_format(result, config.output_format.clone());
|
|
|
|
Ok(result)
|
|
}
|
|
|
|
/// Transform to element-based output if requested by the config.
|
|
fn apply_element_transform(result: &mut ExtractionResult, config: &ExtractionConfig) {
|
|
if config.result_format == crate::types::ResultFormat::ElementBased {
|
|
result.elements = Some(crate::extraction::transform::transform_extraction_result_to_elements(
|
|
result,
|
|
));
|
|
}
|
|
}
|
|
|
|
/// Replace inline markdown image references with OCR text for formats (e.g. PPTX)
|
|
/// that bake placeholders into paragraph text rather than using `ElementKind::Image`.
|
|
fn replace_embedded_image_markdown_with_ocr(doc: &mut InternalDocument) {
|
|
if !doc.ocr_text_only || doc.images.is_empty() {
|
|
return;
|
|
}
|
|
|
|
let mut image_idx = 0usize;
|
|
|
|
for elem in &mut doc.elements {
|
|
if !matches!(elem.kind, crate::types::internal::ElementKind::Paragraph) {
|
|
continue;
|
|
}
|
|
if !is_markdown_image_reference(&elem.text) {
|
|
continue;
|
|
}
|
|
if let Some(img) = doc.images.get(image_idx)
|
|
&& let Some(ocr) = &img.ocr_result
|
|
&& !ocr.content.is_empty()
|
|
{
|
|
elem.text = ocr.content.clone();
|
|
image_idx += 1;
|
|
continue;
|
|
}
|
|
image_idx += 1;
|
|
}
|
|
|
|
for table in &mut doc.tables {
|
|
for row in &mut table.cells {
|
|
for cell in row {
|
|
if !is_markdown_image_reference(cell) {
|
|
continue;
|
|
}
|
|
if let Some(img) = doc.images.get(image_idx)
|
|
&& let Some(ocr) = &img.ocr_result
|
|
&& !ocr.content.is_empty()
|
|
{
|
|
*cell = ocr.content.clone();
|
|
image_idx += 1;
|
|
continue;
|
|
}
|
|
image_idx += 1;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Append OCR text after inline markdown image references for formats (e.g. PPTX)
|
|
/// that bake placeholders into paragraph text. Only runs when `append_ocr_text` is
|
|
/// `true` and `ocr_text_only` is `false`.
|
|
fn append_embedded_image_ocr_text(doc: &mut InternalDocument) {
|
|
if doc.ocr_text_only || !doc.append_ocr_text || doc.images.is_empty() {
|
|
return;
|
|
}
|
|
|
|
let mut image_idx = 0usize;
|
|
let mut new_elements = Vec::with_capacity(doc.elements.len() * 2);
|
|
|
|
for elem in &doc.elements {
|
|
new_elements.push(elem.clone());
|
|
|
|
if matches!(elem.kind, crate::types::internal::ElementKind::Paragraph)
|
|
&& is_markdown_image_reference(&elem.text)
|
|
{
|
|
if let Some(img) = doc.images.get(image_idx)
|
|
&& let Some(ocr) = &img.ocr_result
|
|
&& !ocr.content.is_empty()
|
|
{
|
|
let ocr_elem = crate::types::internal::InternalElement::text(
|
|
crate::types::internal::ElementKind::Paragraph,
|
|
ocr.content.clone(),
|
|
0,
|
|
);
|
|
new_elements.push(ocr_elem);
|
|
}
|
|
image_idx += 1;
|
|
}
|
|
}
|
|
|
|
doc.elements = new_elements;
|
|
|
|
for table in &mut doc.tables {
|
|
for row in &mut table.cells {
|
|
for cell in row {
|
|
if !is_markdown_image_reference(cell) {
|
|
continue;
|
|
}
|
|
if let Some(img) = doc.images.get(image_idx)
|
|
&& let Some(ocr) = &img.ocr_result
|
|
&& !ocr.content.is_empty()
|
|
{
|
|
*cell = format!("{}\n\n{}", cell.trim(), ocr.content);
|
|
}
|
|
image_idx += 1;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Returns `true` if `text` is a markdown image reference (`![alt](url)`).
///
/// Surrounding whitespace is ignored. The trimmed text must start with `![`,
/// contain `](`, and end with `)`. The check is purely structural: the alt text
/// and URL are not validated.
fn is_markdown_image_reference(text: &str) -> bool {
    let t = text.trim();
    if !t.starts_with("![") {
        return false;
    }
    let Some(bracket_end) = t.find("](") else {
        return false;
    };
    // Defensive: after the `![` prefix check the earliest possible `](` is at
    // byte index 2, so this never fires for valid input.
    if bracket_end < 2 {
        return false;
    }
    // `](` is two ASCII bytes, so this slice always starts on a char boundary.
    let after = &t[bracket_end + 2..];
    after.ends_with(')')
}
|
|
|
|
/// Apply NFC unicode normalization to all text content.
|
|
///
|
|
/// Ensures consistent representation of composed characters (e.g., é vs e+combining accent)
|
|
/// across all extraction backends (PDF, OCR, DOCX, HTML, etc.).
|
|
fn normalize_nfc(result: &mut ExtractionResult) {
|
|
#[cfg(feature = "quality")]
|
|
{
|
|
use unicode_normalization::UnicodeNormalization;
|
|
result.content = result.content.nfc().collect();
|
|
if let Some(pages) = result.pages.as_mut() {
|
|
for page in pages.iter_mut() {
|
|
page.content = page.content.nfc().collect();
|
|
}
|
|
}
|
|
}
|
|
// Suppress unused variable warning when quality feature is disabled
|
|
let _ = result;
|
|
}
|