fil/crates/kreuzberg/src/core/pipeline/mod.rs

//! Post-processing pipeline orchestration.
//!
//! This module orchestrates the post-processing pipeline, executing validators,
//! quality processing, chunking, and custom hooks in the correct order.

mod cache;
mod execution;
mod features;
mod format;
mod initialization;

#[cfg(test)]
mod tests;

pub use cache::clear_processor_cache;
pub use format::apply_output_format;

use crate::Result;
use crate::core::config::ExtractionConfig;
use crate::types::ExtractionResult;
use crate::types::internal::InternalDocument;

use execution::{execute_processors, execute_validators};
use features::{execute_chunking, execute_language_detection, execute_token_reduction};
use initialization::{get_processors_from_cache, initialize_features, initialize_processor_cache};

/// Run the post-processing pipeline on an `InternalDocument`.
///
/// The body performs the following steps, in this order:
/// 1. Copies the `ocr_text_only` / `append_ocr_text` preferences from `config.images`
///    onto `doc` (both become `false` when `config.images` is `None`).
/// 2. With the `ocr` and `tokio-runtime` features: runs OCR over `doc.images` when
///    `run_ocr_on_images` is not disabled, `config.ocr` is set and there are images.
/// 3. Rewrites embedded markdown image placeholders (replace with / append OCR text).
/// 4. Pre-renders markdown for the chunker (`chunking` feature) and styled HTML
///    (`html` feature) while `doc` is still available.
/// 5. Derives the `ExtractionResult` from `doc` (this consumes `doc`).
/// 6. Post-processors - skipped only when `config.postprocessor` is set with
///    `enabled == false`.
/// 7. Chunking, language detection and token reduction.
/// 8. Validators - run on the processed result.
/// 9. Element-based transform (if requested) and NFC normalization.
/// 10. LLM structured extraction (`liter-llm` feature, non-Windows, non-wasm32),
///     run before output formatting.
/// 11. Output format conversion as the final step.
///
/// # Arguments
///
/// * `doc` - The internal document produced by the extractor
/// * `config` - Extraction configuration
///
/// # Returns
///
/// The processed extraction result.
///
/// # Errors
///
/// - Errors returned by processor-cache initialization/lookup, `execute_processors`,
///   `execute_chunking`, `execute_language_detection`, `execute_token_reduction` and
///   `execute_validators` are propagated to the caller via `?`.
/// - Image OCR failures and structured-extraction failures are NOT returned; they are
///   recorded as `ProcessingWarning` entries instead.
/// - Styled HTML renderer failures are logged with `tracing::warn!` and the injected
///   styled HTML is simply skipped.
/// - NOTE(review): earlier documentation stated that post-processor errors are caught
///   and recorded in metadata; if so, that happens inside `execute_processors`, which is
///   not visible from this function - confirm.
#[cfg_attr(feature = "otel", tracing::instrument(
    skip(doc, config),
    fields(
        pipeline.stage = "post_processing",
        content.element_count = doc.elements.len(),
    )
))]
#[cfg_attr(alef, alef(skip))]
pub async fn run_pipeline(mut doc: InternalDocument, config: &ExtractionConfig) -> Result<ExtractionResult> {
    // Propagate rendering preferences from config into the document.
    doc.ocr_text_only = config.images.as_ref().map(|i| i.ocr_text_only).unwrap_or(false);
    doc.append_ocr_text = config.images.as_ref().map(|i| i.append_ocr_text).unwrap_or(false);

    // 1. Process extracted images with OCR if configured.
    // Image OCR defaults to enabled when `config.images` is absent.
    #[cfg(all(feature = "ocr", feature = "tokio-runtime"))]
    let image_ocr_enabled = config.images.as_ref().map(|i| i.run_ocr_on_images).unwrap_or(true);
    #[cfg(all(feature = "ocr", feature = "tokio-runtime"))]
    if image_ocr_enabled && config.ocr.is_some() && !doc.images.is_empty() {
        // Move the images out so they can be passed by value while
        // `doc.processing_warnings` is borrowed mutably in the same call.
        let images_to_process = std::mem::take(&mut doc.images);
        match crate::extraction::image_ocr::process_images_with_ocr(
            images_to_process,
            config,
            &mut doc.processing_warnings,
        )
        .await
        {
            Ok(processed) => {
                doc.images = processed;
            }
            Err(e) => {
                // Best-effort: an OCR failure becomes a warning, not a pipeline error.
                // Note that `doc.images` stays empty in this branch because the
                // images were moved out above and are not handed back on error.
                doc.processing_warnings.push(crate::types::ProcessingWarning {
                    source: std::borrow::Cow::Borrowed("image_ocr"),
                    message: std::borrow::Cow::Owned(format!("Image OCR failed: {e}")),
                });
            }
        }
    }

    // Both helpers are no-ops unless the matching preference is set and images exist.
    replace_embedded_image_markdown_with_ocr(&mut doc);
    append_embedded_image_ocr_text(&mut doc);

    // Pre-render markdown for the chunker's heading context resolution when:
    // - Markdown chunking is configured (directly or via the resolved preset)
    // - Output format is Plain
    // Plain-text rendering strips heading markers, so the markdown chunker needs
    // a separate markdown rendering to build the heading hierarchy for chunk metadata.
    #[cfg(feature = "chunking")]
    let chunker_heading_source = {
        let needs_markdown = config.chunking.as_ref().is_some_and(|c| {
            c.chunker_type == crate::core::config::ChunkerType::Markdown
                || c.resolve_preset().chunker_type == crate::core::config::ChunkerType::Markdown
        }) && config.output_format == crate::core::config::OutputFormat::Plain;
        if needs_markdown {
            Some(crate::rendering::render_markdown(&doc))
        } else {
            None
        }
    };

    // Pre-render styled HTML before `doc` is consumed by `derive_extraction_result`.
    // When `html` is active, the output format is Html and the caller has configured
    // `html_output`, we render the document here and inject the result after derivation.
    // Any renderer failure is logged and yields `None`, leaving the derived output as-is.
    #[cfg(feature = "html")]
    let styled_html_prerender: Option<String> = {
        use crate::plugins::Renderer as _;
        if config.output_format == crate::core::config::OutputFormat::Html {
            config.html_output.as_ref().and_then(|html_cfg| {
                match crate::rendering::StyledHtmlRenderer::new(html_cfg.clone()) {
                    Ok(renderer) => match renderer.render(&doc) {
                        Ok(html) => Some(html),
                        Err(e) => {
                            tracing::warn!("StyledHtmlRenderer render failed, falling back to default HTML: {e}");
                            None
                        }
                    },
                    Err(e) => {
                        tracing::warn!("StyledHtmlRenderer construction failed, falling back to default HTML: {e}");
                        None
                    }
                }
            })
        } else {
            None
        }
    };

    // 2. Derive ExtractionResult from InternalDocument (consumes `doc`).
    let include_structure = config.include_document_structure;
    let mut result =
        crate::extraction::derive::derive_extraction_result(doc, include_structure, config.output_format.clone());

    // Inject pre-rendered styled HTML (overrides the default render_html output).
    #[cfg(feature = "html")]
    if let Some(html) = styled_html_prerender {
        result.formatted_content = Some(html);
    }

    // Temporarily store pre-rendered markdown for chunker heading context.
    // Tracked separately so we can remove it after chunking — apply_output_format
    // must not swap this into result.content when output_format is Plain.
    // The flag is true whenever `formatted_content` is empty at this point, even if
    // no markdown was pre-rendered; clearing it later is then a harmless no-op.
    #[cfg(feature = "chunking")]
    let chunker_only_markdown = result.formatted_content.is_none();
    #[cfg(feature = "chunking")]
    if chunker_only_markdown && let Some(md) = chunker_heading_source {
        result.formatted_content = Some(md);
    }

    // 3. Run post-processing pipeline.
    // Post-processing is on by default: it is skipped only when a postprocessor
    // config exists and has `enabled == false`.
    let pp_config = config.postprocessor.as_ref();
    let postprocessing_enabled = pp_config.is_none_or(|c| c.enabled);

    if postprocessing_enabled {
        initialize_features();
        initialize_processor_cache()?;

        let (early_processors, middle_processors, late_processors) = get_processors_from_cache()?;

        execute_processors(
            &mut result,
            config,
            &pp_config,
            early_processors,
            middle_processors,
            late_processors,
        )
        .await?;
    }

    execute_chunking(&mut result, config)?;

    // Clear temporary markdown if it was only stored for chunker heading context.
    // This prevents apply_output_format from swapping it into result.content.
    #[cfg(feature = "chunking")]
    if chunker_only_markdown {
        result.formatted_content = None;
    }

    execute_language_detection(&mut result, config)?;
    execute_token_reduction(&mut result, config)?;
    // Validators only read the result; a validator error aborts the pipeline here.
    execute_validators(&result, config).await?;

    apply_element_transform(&mut result, config);
    normalize_nfc(&mut result);

    // Run LLM-based structured extraction BEFORE output formatting
    // so extraction sees plain text, not markdown/HTML
    // TODO(wasm-llm): hosted structured extraction should run on wasm through
    // liter-llm's wasm-http backend once browser/runtime support is wired.
    #[cfg(all(feature = "liter-llm", not(target_os = "windows"), not(target_arch = "wasm32")))]
    if let Some(ref structured_config) = config.structured_extraction {
        match crate::llm::structured::extract_structured(&result.content, structured_config).await {
            Ok((output, usage)) => {
                result.structured_output = Some(output);
                crate::llm::usage::push_llm_usage(&mut result, usage);
            }
            Err(e) => {
                // Best-effort: log and record a warning rather than failing the pipeline.
                tracing::warn!("Structured extraction failed: {e}");
                result.processing_warnings.push(crate::types::ProcessingWarning {
                    source: std::borrow::Cow::Borrowed("structured_extraction"),
                    message: std::borrow::Cow::Owned(format!("Structured extraction failed: {e}")),
                });
            }
        }
    }

    // Fallback for builds where structured extraction is unavailable: the request
    // is not silently ignored, a warning is attached to the result instead.
    // TODO(wasm-llm): keep wasm in the fallback branch until structured
    // extraction has an async wasm-compatible runtime path.
    #[cfg(any(not(feature = "liter-llm"), target_os = "windows", target_arch = "wasm32"))]
    if config.structured_extraction.is_some() {
        result.processing_warnings.push(crate::types::ProcessingWarning {
            source: std::borrow::Cow::Borrowed("structured_extraction"),
            message: std::borrow::Cow::Borrowed("Structured extraction requires the 'liter-llm' feature"),
        });
    }

    // Apply output format conversion as the final step
    result = apply_output_format(result, config.output_format.clone());

    Ok(result)
}

/// Run the post-processing pipeline synchronously (WASM-compatible version).
///
/// This is a synchronous implementation for WASM and non-async contexts.
/// It performs a subset of the full async pipeline, excluding async post-processors
/// and validators.
///
/// # Arguments
///
/// * `doc` - The internal document produced by the extractor
/// * `config` - Extraction configuration
///
/// # Returns
///
/// The processed extraction result.
///
/// # Errors
///
/// Errors from `execute_chunking`, `execute_language_detection` and
/// `execute_token_reduction` are propagated via `?`. Styled HTML renderer failures
/// are only logged with `tracing::warn!`.
///
/// # Notes
///
/// This function is only available when the `tokio-runtime` feature is disabled.
/// It handles, in order:
/// - Pre-rendering markdown for the chunker (`chunking` feature) and styled HTML
///   (`html` feature)
/// - Deriving the `ExtractionResult` from `doc`
/// - Chunking
/// - Language detection
/// - Token reduction
/// - Element-based transform (if requested) and NFC normalization
/// - Output format conversion
///
/// It does NOT handle:
/// - Async post-processors
/// - Async validators
/// - Image OCR and LLM structured extraction (neither is invoked in this body)
///
/// NOTE(review): unlike `run_pipeline`, this path does not copy `ocr_text_only` /
/// `append_ocr_text` from `config.images` onto `doc`, and does not call the
/// embedded-image markdown helpers - confirm this is intentional.
#[cfg(not(feature = "tokio-runtime"))]
#[cfg_attr(alef, alef(skip))]
pub fn run_pipeline_sync(doc: InternalDocument, config: &ExtractionConfig) -> Result<ExtractionResult> {
    // Pre-render markdown for chunker heading context (same logic as async path):
    // only when a Markdown chunker is configured and the output format is Plain.
    #[cfg(feature = "chunking")]
    let chunker_heading_source = {
        let needs_markdown = config.chunking.as_ref().is_some_and(|c| {
            c.chunker_type == crate::core::config::ChunkerType::Markdown
                || c.resolve_preset().chunker_type == crate::core::config::ChunkerType::Markdown
        }) && config.output_format == crate::core::config::OutputFormat::Plain;
        if needs_markdown {
            Some(crate::rendering::render_markdown(&doc))
        } else {
            None
        }
    };

    // Pre-render styled HTML before `doc` is consumed (mirrors async path).
    // Renderer construction/render failures are logged and yield `None`.
    #[cfg(feature = "html")]
    let styled_html_prerender: Option<String> = {
        use crate::plugins::Renderer as _;
        if config.output_format == crate::core::config::OutputFormat::Html {
            config.html_output.as_ref().and_then(|html_cfg| {
                match crate::rendering::StyledHtmlRenderer::new(html_cfg.clone()) {
                    Ok(renderer) => match renderer.render(&doc) {
                        Ok(html) => Some(html),
                        Err(e) => {
                            tracing::warn!("StyledHtmlRenderer render failed, falling back to default HTML: {e}");
                            None
                        }
                    },
                    Err(e) => {
                        tracing::warn!("StyledHtmlRenderer construction failed, falling back to default HTML: {e}");
                        None
                    }
                }
            })
        } else {
            None
        }
    };

    // 1. Derive ExtractionResult from InternalDocument (consumes `doc`).
    let include_structure = config.include_document_structure;
    let mut result =
        crate::extraction::derive::derive_extraction_result(doc, include_structure, config.output_format.clone());

    // Inject pre-rendered styled HTML.
    #[cfg(feature = "html")]
    if let Some(html) = styled_html_prerender {
        result.formatted_content = Some(html);
    }

    // Temporarily park the pre-rendered markdown in `formatted_content` so the
    // chunker can use it; the flag remembers that it must be cleared afterwards.
    #[cfg(feature = "chunking")]
    let chunker_only_markdown = result.formatted_content.is_none();
    #[cfg(feature = "chunking")]
    if chunker_only_markdown && let Some(md) = chunker_heading_source {
        result.formatted_content = Some(md);
    }

    // 2. Run synchronous post-processing
    execute_chunking(&mut result, config)?;

    // Remove the chunker-only markdown so the final output-format step does not see it.
    #[cfg(feature = "chunking")]
    if chunker_only_markdown {
        result.formatted_content = None;
    }

    execute_language_detection(&mut result, config)?;
    execute_token_reduction(&mut result, config)?;

    apply_element_transform(&mut result, config);
    normalize_nfc(&mut result);

    // Apply output format conversion as the final step
    result = apply_output_format(result, config.output_format.clone());

    Ok(result)
}

/// Populate `result.elements` when the config asks for element-based output.
///
/// Does nothing for any other `result_format`.
fn apply_element_transform(result: &mut ExtractionResult, config: &ExtractionConfig) {
    if config.result_format != crate::types::ResultFormat::ElementBased {
        return;
    }

    let elements = crate::extraction::transform::transform_extraction_result_to_elements(result);
    result.elements = Some(elements);
}

/// Swap inline markdown image placeholders for the OCR text of the matching image.
///
/// Some formats (e.g. PPTX) bake image placeholders into paragraph text or table
/// cells rather than using `ElementKind::Image`. Placeholders are paired with
/// `doc.images` positionally: paragraph elements are visited first, then table
/// cells, and every placeholder consumes one image slot whether or not that image
/// has non-empty OCR text. Only runs when `ocr_text_only` is set and images exist.
fn replace_embedded_image_markdown_with_ocr(doc: &mut InternalDocument) {
    if doc.images.is_empty() || !doc.ocr_text_only {
        return;
    }

    let images = &doc.images;
    let mut next_image = 0usize;

    for element in doc.elements.iter_mut() {
        let is_placeholder = matches!(element.kind, crate::types::internal::ElementKind::Paragraph)
            && is_markdown_image_reference(&element.text);
        if !is_placeholder {
            continue;
        }

        let recognized = images
            .get(next_image)
            .and_then(|image| image.ocr_result.as_ref())
            .filter(|ocr| !ocr.content.is_empty());
        if let Some(ocr) = recognized {
            element.text = ocr.content.clone();
        }
        // The slot is consumed even when there was nothing to substitute.
        next_image += 1;
    }

    let table_cells = doc
        .tables
        .iter_mut()
        .flat_map(|table| table.cells.iter_mut())
        .flatten();
    for cell in table_cells {
        if !is_markdown_image_reference(cell) {
            continue;
        }

        let recognized = images
            .get(next_image)
            .and_then(|image| image.ocr_result.as_ref())
            .filter(|ocr| !ocr.content.is_empty());
        if let Some(ocr) = recognized {
            *cell = ocr.content.clone();
        }
        next_image += 1;
    }
}

/// Append OCR text after inline markdown image references for formats (e.g. PPTX)
/// that bake placeholders into paragraph text. Only runs when `append_ocr_text` is
/// `true`, `ocr_text_only` is `false` and the document has images.
///
/// Placeholders are paired with `doc.images` positionally: paragraph elements are
/// visited first, then table cells, and every placeholder consumes one image slot
/// whether or not that image has non-empty OCR text.
///
/// - For a placeholder paragraph, a new paragraph element holding the OCR text is
///   inserted directly after it.
/// - For a placeholder table cell, the OCR text is appended to the (trimmed) cell
///   text after a blank line.
fn append_embedded_image_ocr_text(doc: &mut InternalDocument) {
    if doc.ocr_text_only || !doc.append_ocr_text || doc.images.is_empty() {
        return;
    }

    let mut image_idx = 0usize;

    // Move the existing elements out instead of cloning each one: the old vector
    // is discarded anyway once the rebuilt list is stored back.
    let original_elements = std::mem::take(&mut doc.elements);
    // At most one extra paragraph is inserted per image, so this is an upper bound.
    let mut new_elements = Vec::with_capacity(original_elements.len() + doc.images.len());

    for elem in original_elements {
        // Decide before pushing, because `push` takes ownership of `elem`.
        let is_image_ref = matches!(elem.kind, crate::types::internal::ElementKind::Paragraph)
            && is_markdown_image_reference(&elem.text);
        new_elements.push(elem);

        if !is_image_ref {
            continue;
        }

        if let Some(img) = doc.images.get(image_idx)
            && let Some(ocr) = &img.ocr_result
            && !ocr.content.is_empty()
        {
            let ocr_elem = crate::types::internal::InternalElement::text(
                crate::types::internal::ElementKind::Paragraph,
                ocr.content.clone(),
                0,
            );
            new_elements.push(ocr_elem);
        }
        // The image slot is consumed even when no OCR text was available.
        image_idx += 1;
    }

    doc.elements = new_elements;

    for table in &mut doc.tables {
        for row in &mut table.cells {
            for cell in row {
                if !is_markdown_image_reference(cell) {
                    continue;
                }
                if let Some(img) = doc.images.get(image_idx)
                    && let Some(ocr) = &img.ocr_result
                    && !ocr.content.is_empty()
                {
                    *cell = format!("{}\n\n{}", cell.trim(), ocr.content);
                }
                image_idx += 1;
            }
        }
    }
}

/// Returns `true` if `text` is exactly a markdown image reference (`![alt](url)`).
///
/// Surrounding whitespace is ignored. The text must start with `![`, contain `](`,
/// and the parenthesis that closes the destination must be the final character.
/// Parentheses inside the destination are matched as balanced pairs (so
/// `![a](img_(1).png)` is accepted) and a backslash escapes the following character.
///
/// Text that merely ends with `)` after the reference, such as
/// `![a](b) see (note)`, is rejected: callers overwrite the whole text with OCR
/// output, so accepting it would discard the trailing content.
fn is_markdown_image_reference(text: &str) -> bool {
    let Some(rest) = text.trim().strip_prefix("![") else {
        return false;
    };
    let Some((_alt, destination)) = rest.split_once("](") else {
        return false;
    };

    // Depth starts at 1 for the `(` consumed by `split_once` above.
    let mut depth = 1usize;
    let mut escaped = false;
    for (idx, ch) in destination.char_indices() {
        if escaped {
            escaped = false;
            continue;
        }
        match ch {
            '\\' => escaped = true,
            '(' => depth += 1,
            ')' => {
                depth -= 1;
                if depth == 0 {
                    // `)` is one byte, so the reference is exact only if nothing follows.
                    return idx + 1 == destination.len();
                }
            }
            _ => {}
        }
    }

    // The destination was never closed.
    false
}

/// Normalize the result's text to Unicode NFC.
///
/// With the `quality` feature enabled, `result.content` and the `content` of every
/// page (when pages are present) are rewritten in NFC form, so composed characters
/// (e.g., é vs e+combining accent) are represented consistently regardless of the
/// extraction backend. Without the feature this function does nothing.
fn normalize_nfc(result: &mut ExtractionResult) {
    #[cfg(feature = "quality")]
    {
        use unicode_normalization::UnicodeNormalization;

        let normalized_content = result.content.nfc().collect();
        result.content = normalized_content;

        if let Some(pages) = &mut result.pages {
            pages
                .iter_mut()
                .for_each(|page| page.content = page.content.nfc().collect());
        }
    }

    // Without the `quality` feature the parameter would otherwise be unused.
    #[cfg(not(feature = "quality"))]
    let _ = result;
}