Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/crates/kreuzberg/src/core/pipeline/mod.rs
+++ b/crates/kreuzberg/src/core/pipeline/mod.rs
@@ -0,0 +1,478 @@
+//! Post-processing pipeline orchestration.
+//!
+//! This module orchestrates the post-processing pipeline, executing validators,
+//! quality processing, chunking, and custom hooks in the correct order.
+
+mod cache;
+mod execution;
+mod features;
+mod format;
+mod initialization;
+
+#[cfg(test)]
+mod tests;
+
+pub use cache::clear_processor_cache;
+pub use format::apply_output_format;
+
+use crate::Result;
+use crate::core::config::ExtractionConfig;
+use crate::types::ExtractionResult;
+use crate::types::internal::InternalDocument;
+
+use execution::{execute_processors, execute_validators};
+use features::{execute_chunking, execute_language_detection, execute_token_reduction};
+use initialization::{get_processors_from_cache, initialize_features, initialize_processor_cache};
+
+/// Run the post-processing pipeline on an `InternalDocument`.
+///
+/// Derives `ExtractionResult` from `InternalDocument` via the derivation pipeline
+/// and post-processes it. The steps run in the following order:
+/// 1. Image OCR - OCR extracted images when the `ocr` and `tokio-runtime` features
+///    are enabled, image OCR is not disabled, an OCR config is present and the
+///    document has images
+/// 2. Embedded image text - Replace or append OCR text at inline markdown image
+///    references, depending on `ocr_text_only` / `append_ocr_text`
+/// 3. Derivation - Build the `ExtractionResult` (consumes `doc`)
+/// 4. Post-Processors - Execute by stage (Early, Middle, Late) to modify/enhance the
+///    result; skipped when `config.postprocessor` is present with `enabled == false`
+/// 5. Chunking, language detection and token reduction
+/// 6. Validators - Run validation hooks on the processed result (can fail fast)
+/// 7. Element transform and NFC normalization
+/// 8. Structured extraction - LLM-based, when configured and supported by the build
+/// 9. Output format conversion
+///
+/// NOTE(review): quality processing (text cleaning and quality scoring) is not
+/// invoked directly in this function; presumably it runs as part of the
+/// post-processor stage - confirm.
+///
+/// # Arguments
+///
+/// * `doc` - The internal document produced by the extractor
+/// * `config` - Extraction configuration
+///
+/// # Returns
+///
+/// The processed extraction result.
+///
+/// # Errors
+///
+/// - Validator errors bubble up immediately
+/// - Post-processor errors are caught and recorded in metadata
+/// - System errors (IO, RuntimeError equivalents) always bubble up
+/// - Image OCR and structured extraction failures do not fail the pipeline; they are
+///   recorded in `processing_warnings`
+#[cfg_attr(feature = "otel", tracing::instrument(
+    skip(doc, config),
+    fields(
+        pipeline.stage = "post_processing",
+        content.element_count = doc.elements.len(),
+    )
+))]
+#[cfg_attr(alef, alef(skip))]
+pub async fn run_pipeline(mut doc: InternalDocument, config: &ExtractionConfig) -> Result<ExtractionResult> {
+    // Propagate rendering preferences from config into the document.
+    // Both flags default to `false` when no `images` config is present.
+    doc.ocr_text_only = config.images.as_ref().map(|i| i.ocr_text_only).unwrap_or(false);
+    doc.append_ocr_text = config.images.as_ref().map(|i| i.append_ocr_text).unwrap_or(false);
+
+    // 1. Process extracted images with OCR if configured
+    // Image OCR defaults to enabled when no `images` config is present.
+    #[cfg(all(feature = "ocr", feature = "tokio-runtime"))]
+    let image_ocr_enabled = config.images.as_ref().map(|i| i.run_ocr_on_images).unwrap_or(true);
+    #[cfg(all(feature = "ocr", feature = "tokio-runtime"))]
+    if image_ocr_enabled && config.ocr.is_some() && !doc.images.is_empty() {
+        // Move the images out so they can be handed over by value while
+        // `doc.processing_warnings` is borrowed mutably in the same call.
+        let images_to_process = std::mem::take(&mut doc.images);
+        match crate::extraction::image_ocr::process_images_with_ocr(
+            images_to_process,
+            config,
+            &mut doc.processing_warnings,
+        )
+        .await
+        {
+            Ok(processed) => {
+                doc.images = processed;
+            }
+            // Best-effort: on failure `doc.images` stays empty and a warning is recorded.
+            Err(e) => {
+                doc.processing_warnings.push(crate::types::ProcessingWarning {
+                    source: std::borrow::Cow::Borrowed("image_ocr"),
+                    message: std::borrow::Cow::Owned(format!("Image OCR failed: {e}")),
+                });
+            }
+        }
+    }
+
+    // 2. Substitute or append OCR text at inline markdown image references.
+    // Each helper checks the document flags itself and returns early when not applicable.
+    replace_embedded_image_markdown_with_ocr(&mut doc);
+    append_embedded_image_ocr_text(&mut doc);
+
+    // Pre-render markdown for the chunker's heading context resolution when:
+    // - Markdown chunking is configured (directly or through the resolved preset)
+    // - Output format is Plain
+    // Plain-text rendering strips heading markers, so the markdown chunker needs
+    // a separate markdown rendering to build the heading hierarchy for chunk metadata.
+    #[cfg(feature = "chunking")]
+    let chunker_heading_source = {
+        let needs_markdown = config.chunking.as_ref().is_some_and(|c| {
+            c.chunker_type == crate::core::config::ChunkerType::Markdown
+                || c.resolve_preset().chunker_type == crate::core::config::ChunkerType::Markdown
+        }) && config.output_format == crate::core::config::OutputFormat::Plain;
+        if needs_markdown {
+            Some(crate::rendering::render_markdown(&doc))
+        } else {
+            None
+        }
+    };
+
+    // Pre-render styled HTML before `doc` is consumed by `derive_extraction_result`.
+    // When `html` is active and the caller has configured `html_output`, we
+    // render the document here and inject the result after derivation.
+    // Renderer construction or render failures are logged and yield `None`, so the
+    // default HTML produced by derivation is kept.
+    #[cfg(feature = "html")]
+    let styled_html_prerender: Option<String> = {
+        use crate::plugins::Renderer as _;
+        if config.output_format == crate::core::config::OutputFormat::Html {
+            config.html_output.as_ref().and_then(|html_cfg| {
+                match crate::rendering::StyledHtmlRenderer::new(html_cfg.clone()) {
+                    Ok(renderer) => match renderer.render(&doc) {
+                        Ok(html) => Some(html),
+                        Err(e) => {
+                            tracing::warn!("StyledHtmlRenderer render failed, falling back to default HTML: {e}");
+                            None
+                        }
+                    },
+                    Err(e) => {
+                        tracing::warn!("StyledHtmlRenderer construction failed, falling back to default HTML: {e}");
+                        None
+                    }
+                }
+            })
+        } else {
+            None
+        }
+    };
+
+    // 3. Derive ExtractionResult from InternalDocument
+    let include_structure = config.include_document_structure;
+    let mut result =
+        crate::extraction::derive::derive_extraction_result(doc, include_structure, config.output_format.clone());
+
+    // Inject pre-rendered styled HTML (overrides the default render_html output).
+    #[cfg(feature = "html")]
+    if let Some(html) = styled_html_prerender {
+        result.formatted_content = Some(html);
+    }
+
+    // Temporarily store pre-rendered markdown for chunker heading context.
+    // Tracked separately so we can remove it after chunking — apply_output_format
+    // must not swap this into result.content when output_format is Plain.
+    // The flag is captured before the markdown is stored, so it records whether
+    // `formatted_content` was empty to begin with.
+    #[cfg(feature = "chunking")]
+    let chunker_only_markdown = result.formatted_content.is_none();
+    #[cfg(feature = "chunking")]
+    if chunker_only_markdown && let Some(md) = chunker_heading_source {
+        result.formatted_content = Some(md);
+    }
+
+    // 4. Run post-processing pipeline
+    // Post-processing is enabled when no postprocessor config is given.
+    let pp_config = config.postprocessor.as_ref();
+    let postprocessing_enabled = pp_config.is_none_or(|c| c.enabled);
+
+    if postprocessing_enabled {
+        initialize_features();
+        initialize_processor_cache()?;
+
+        let (early_processors, middle_processors, late_processors) = get_processors_from_cache()?;
+
+        execute_processors(
+            &mut result,
+            config,
+            &pp_config,
+            early_processors,
+            middle_processors,
+            late_processors,
+        )
+        .await?;
+    }
+
+    // 5. Chunking, language detection and token reduction
+    execute_chunking(&mut result, config)?;
+
+    // Clear temporary markdown if it was only stored for chunker heading context.
+    // This prevents apply_output_format from swapping it into result.content.
+    #[cfg(feature = "chunking")]
+    if chunker_only_markdown {
+        result.formatted_content = None;
+    }
+
+    execute_language_detection(&mut result, config)?;
+    execute_token_reduction(&mut result, config)?;
+
+    // 6. Validators see the result before the element transform and NFC normalization.
+    execute_validators(&result, config).await?;
+
+    // 7. Element transform and NFC normalization
+    apply_element_transform(&mut result, config);
+    normalize_nfc(&mut result);
+
+    // 8. Run LLM-based structured extraction BEFORE output formatting
+    // so extraction sees plain text, not markdown/HTML
+    // TODO(wasm-llm): hosted structured extraction should run on wasm through
+    // liter-llm's wasm-http backend once browser/runtime support is wired.
+    #[cfg(all(feature = "liter-llm", not(target_os = "windows"), not(target_arch = "wasm32")))]
+    if let Some(ref structured_config) = config.structured_extraction {
+        match crate::llm::structured::extract_structured(&result.content, structured_config).await {
+            Ok((output, usage)) => {
+                result.structured_output = Some(output);
+                crate::llm::usage::push_llm_usage(&mut result, usage);
+            }
+            // Best-effort: log the failure and record it as a warning instead of failing.
+            Err(e) => {
+                tracing::warn!("Structured extraction failed: {e}");
+                result.processing_warnings.push(crate::types::ProcessingWarning {
+                    source: std::borrow::Cow::Borrowed("structured_extraction"),
+                    message: std::borrow::Cow::Owned(format!("Structured extraction failed: {e}")),
+                });
+            }
+        }
+    }
+
+    // TODO(wasm-llm): keep wasm in the fallback branch until structured
+    // extraction has an async wasm-compatible runtime path.
+    // This cfg is the exact complement of the one above: builds that cannot run
+    // structured extraction record a warning when it was requested.
+    #[cfg(any(not(feature = "liter-llm"), target_os = "windows", target_arch = "wasm32"))]
+    if config.structured_extraction.is_some() {
+        result.processing_warnings.push(crate::types::ProcessingWarning {
+            source: std::borrow::Cow::Borrowed("structured_extraction"),
+            message: std::borrow::Cow::Borrowed("Structured extraction requires the 'liter-llm' feature"),
+        });
+    }
+
+    // 9. Apply output format conversion as the final step
+    result = apply_output_format(result, config.output_format.clone());
+
+    Ok(result)
+}
+
+/// Run the post-processing pipeline synchronously (WASM-compatible version).
+///
+/// This is a synchronous implementation for WASM and non-async contexts.
+/// It performs a subset of the full async pipeline, excluding async post-processors
+/// and validators.
+///
+/// # Arguments
+///
+/// * `doc` - The internal document produced by the extractor
+/// * `config` - Extraction configuration
+///
+/// # Returns
+///
+/// The processed extraction result.
+///
+/// # Errors
+///
+/// Errors returned by chunking, language detection or token reduction are propagated.
+///
+/// # Notes
+///
+/// This function is only available when the `tokio-runtime` feature is disabled.
+/// It handles:
+/// - Derivation of the `ExtractionResult` (including styled HTML pre-rendering)
+/// - Chunking (if enabled)
+/// - Language detection (if enabled)
+/// - Token reduction
+/// - Element transform, NFC normalization and output format conversion
+///
+/// It does NOT handle:
+/// - Async post-processors
+/// - Async validators
+/// - Image OCR and embedded image OCR text substitution
+/// - LLM-based structured extraction
+///
+/// NOTE(review): no quality-processing step is called directly in this function -
+/// confirm whether it is expected on this path.
+/// NOTE(review): unlike `run_pipeline`, `ocr_text_only` / `append_ocr_text` are not
+/// copied from `config` into `doc` on this path - confirm this is intentional.
+#[cfg(not(feature = "tokio-runtime"))]
+#[cfg_attr(alef, alef(skip))]
+pub fn run_pipeline_sync(doc: InternalDocument, config: &ExtractionConfig) -> Result<ExtractionResult> {
+    // Pre-render markdown for chunker heading context (same logic as async path):
+    // only when markdown chunking is configured and the output format is Plain.
+    #[cfg(feature = "chunking")]
+    let chunker_heading_source = {
+        let needs_markdown = config.chunking.as_ref().is_some_and(|c| {
+            c.chunker_type == crate::core::config::ChunkerType::Markdown
+                || c.resolve_preset().chunker_type == crate::core::config::ChunkerType::Markdown
+        }) && config.output_format == crate::core::config::OutputFormat::Plain;
+        if needs_markdown {
+            Some(crate::rendering::render_markdown(&doc))
+        } else {
+            None
+        }
+    };
+
+    // Pre-render styled HTML before `doc` is consumed (mirrors async path).
+    // Renderer construction or render failures are logged and yield `None`, so the
+    // default HTML produced by derivation is kept.
+    #[cfg(feature = "html")]
+    let styled_html_prerender: Option<String> = {
+        use crate::plugins::Renderer as _;
+        if config.output_format == crate::core::config::OutputFormat::Html {
+            config.html_output.as_ref().and_then(|html_cfg| {
+                match crate::rendering::StyledHtmlRenderer::new(html_cfg.clone()) {
+                    Ok(renderer) => match renderer.render(&doc) {
+                        Ok(html) => Some(html),
+                        Err(e) => {
+                            tracing::warn!("StyledHtmlRenderer render failed, falling back to default HTML: {e}");
+                            None
+                        }
+                    },
+                    Err(e) => {
+                        tracing::warn!("StyledHtmlRenderer construction failed, falling back to default HTML: {e}");
+                        None
+                    }
+                }
+            })
+        } else {
+            None
+        }
+    };
+
+    // 1. Derive ExtractionResult from InternalDocument
+    let include_structure = config.include_document_structure;
+    let mut result =
+        crate::extraction::derive::derive_extraction_result(doc, include_structure, config.output_format.clone());
+
+    // Inject pre-rendered styled HTML.
+    #[cfg(feature = "html")]
+    if let Some(html) = styled_html_prerender {
+        result.formatted_content = Some(html);
+    }
+
+    // Temporarily store the pre-rendered markdown in `formatted_content` for the
+    // chunker. The flag is captured first so the temporary value can be cleared
+    // again once chunking is done.
+    #[cfg(feature = "chunking")]
+    let chunker_only_markdown = result.formatted_content.is_none();
+    #[cfg(feature = "chunking")]
+    if chunker_only_markdown && let Some(md) = chunker_heading_source {
+        result.formatted_content = Some(md);
+    }
+
+    // 2. Run synchronous post-processing
+    execute_chunking(&mut result, config)?;
+
+    // Clear the temporary markdown so apply_output_format cannot pick it up.
+    #[cfg(feature = "chunking")]
+    if chunker_only_markdown {
+        result.formatted_content = None;
+    }
+
+    execute_language_detection(&mut result, config)?;
+    execute_token_reduction(&mut result, config)?;
+
+    apply_element_transform(&mut result, config);
+    normalize_nfc(&mut result);
+
+    // Apply output format conversion as the final step
+    result = apply_output_format(result, config.output_format.clone());
+
+    Ok(result)
+}
+
+/// Populate `result.elements` when the caller asked for element-based output.
+///
+/// Does nothing unless `config.result_format` is `ResultFormat::ElementBased`.
+fn apply_element_transform(result: &mut ExtractionResult, config: &ExtractionConfig) {
+    let wants_elements = config.result_format == crate::types::ResultFormat::ElementBased;
+    if !wants_elements {
+        return;
+    }
+
+    let elements = crate::extraction::transform::transform_extraction_result_to_elements(result);
+    result.elements = Some(elements);
+}
+
+/// Swap inline markdown image placeholders for the OCR text of the matching image.
+///
+/// Some formats (e.g. PPTX) bake image placeholders into paragraph text or table
+/// cells rather than using `ElementKind::Image`. Placeholders are paired with
+/// `doc.images` by position: paragraph elements are visited first, then table
+/// cells, and every placeholder consumes one image slot whether or not that image
+/// has usable OCR text.
+///
+/// Does nothing unless `doc.ocr_text_only` is set and the document has images.
+fn replace_embedded_image_markdown_with_ocr(doc: &mut InternalDocument) {
+    if !doc.ocr_text_only || doc.images.is_empty() {
+        return;
+    }
+
+    // Index of the image that belongs to the next placeholder encountered.
+    let mut next_image = 0usize;
+
+    for elem in &mut doc.elements {
+        let is_placeholder = matches!(elem.kind, crate::types::internal::ElementKind::Paragraph)
+            && is_markdown_image_reference(&elem.text);
+        if !is_placeholder {
+            continue;
+        }
+
+        // Claim the slot up front; it is consumed even when no text is substituted.
+        let candidate = doc.images.get(next_image);
+        next_image += 1;
+
+        if let Some(img) = candidate
+            && let Some(ocr) = &img.ocr_result
+            && !ocr.content.is_empty()
+        {
+            elem.text = ocr.content.clone();
+        }
+    }
+
+    for table in &mut doc.tables {
+        for row in &mut table.cells {
+            for cell in row {
+                if !is_markdown_image_reference(cell) {
+                    continue;
+                }
+
+                let candidate = doc.images.get(next_image);
+                next_image += 1;
+
+                if let Some(img) = candidate
+                    && let Some(ocr) = &img.ocr_result
+                    && !ocr.content.is_empty()
+                {
+                    *cell = ocr.content.clone();
+                }
+            }
+        }
+    }
+}
+
+/// Append OCR text after inline markdown image references for formats (e.g. PPTX)
+/// that bake placeholders into paragraph text. Only runs when `append_ocr_text` is
+/// `true`, `ocr_text_only` is `false` and the document has images.
+///
+/// Placeholders are paired with `doc.images` by position: paragraph elements are
+/// visited first, then table cells, and every placeholder consumes one image slot
+/// whether or not that image has usable OCR text. For paragraphs the OCR text is
+/// inserted as a new paragraph element right after the placeholder; for table
+/// cells it is appended to the trimmed cell text after a blank line.
+fn append_embedded_image_ocr_text(doc: &mut InternalDocument) {
+    if doc.ocr_text_only || !doc.append_ocr_text || doc.images.is_empty() {
+        return;
+    }
+
+    let mut image_idx = 0usize;
+
+    // Move the elements out instead of cloning each one into the rebuilt vector.
+    let original_elements = std::mem::take(&mut doc.elements);
+    // At most one OCR paragraph is inserted per image, which bounds the final length.
+    let mut new_elements = Vec::with_capacity(original_elements.len() + doc.images.len());
+
+    for elem in original_elements {
+        // Decide before the element is moved into the output vector.
+        let is_placeholder = matches!(elem.kind, crate::types::internal::ElementKind::Paragraph)
+            && is_markdown_image_reference(&elem.text);
+        new_elements.push(elem);
+
+        if !is_placeholder {
+            continue;
+        }
+
+        if let Some(img) = doc.images.get(image_idx)
+            && let Some(ocr) = &img.ocr_result
+            && !ocr.content.is_empty()
+        {
+            let ocr_elem = crate::types::internal::InternalElement::text(
+                crate::types::internal::ElementKind::Paragraph,
+                ocr.content.clone(),
+                0,
+            );
+            new_elements.push(ocr_elem);
+        }
+        // The slot is consumed even when the image has no usable OCR text.
+        image_idx += 1;
+    }
+
+    doc.elements = new_elements;
+
+    for table in &mut doc.tables {
+        for row in &mut table.cells {
+            for cell in row {
+                if !is_markdown_image_reference(cell) {
+                    continue;
+                }
+                if let Some(img) = doc.images.get(image_idx)
+                    && let Some(ocr) = &img.ocr_result
+                    && !ocr.content.is_empty()
+                {
+                    *cell = format!("{}\n\n{}", cell.trim(), ocr.content);
+                }
+                image_idx += 1;
+            }
+        }
+    }
+}
+
+/// Returns `true` if `text` looks like a lone markdown image reference (`![alt](url)`).
+///
+/// This is a lightweight heuristic, not a markdown parser: after trimming
+/// surrounding whitespace the text must start with `![`, contain `](`, and end with
+/// `)` somewhere after the first `](`. Text such as `![a](b) see (c)` therefore
+/// also matches.
+fn is_markdown_image_reference(text: &str) -> bool {
+    let Some(after_bang) = text.trim().strip_prefix("![") else {
+        return false;
+    };
+    // Only the first `](` matters; everything after it must close with `)`.
+    match after_bang.split_once("](") {
+        Some((_alt, target)) => target.ends_with(')'),
+        None => false,
+    }
+}
+
+/// Apply NFC unicode normalization to the result's text.
+///
+/// Ensures consistent representation of composed characters (e.g., é vs e+combining accent)
+/// across all extraction backends (PDF, OCR, DOCX, HTML, etc.).
+///
+/// Normalizes `result.content` and the `content` of every page when pages are
+/// present. This is a no-op unless the `quality` feature is enabled.
+fn normalize_nfc(result: &mut ExtractionResult) {
+    #[cfg(feature = "quality")]
+    {
+        use unicode_normalization::UnicodeNormalization;
+
+        result.content = result.content.nfc().collect();
+
+        // `Option::iter_mut` yields the page collection zero or one time.
+        for page in result.pages.iter_mut().flat_map(|pages| pages.iter_mut()) {
+            page.content = page.content.nfc().collect();
+        }
+    }
+    // Keeps `result` used when the `quality` feature is disabled.
+    let _ = result;
+}