// packages/java/dev/kreuzberg/ExtractionConfig.java

// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;

import java.util.List;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
import org.jspecify.annotations.Nullable;

/**
 * Main extraction configuration.
 *
 * This record contains all configuration options for the extraction process.
 * It can be loaded from TOML, YAML, or JSON files, or created programmatically.
 *
 * <p>Instances are created through {@link Builder} (see {@link #builder()}); Jackson
 * deserialization is routed through the same builder. Every component is nullable, and
 * because of {@code JsonInclude.Include.NON_ABSENT} a {@code null} component is omitted
 * from the serialized JSON.
 */
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = ExtractionConfig.Builder.class)
public record ExtractionConfig(
    /**
     * Enable caching of extraction results
     */
    @Nullable @JsonProperty("use_cache") Boolean useCache,
    /**
     * Enable quality post-processing
     */
    @Nullable @JsonProperty("enable_quality_processing") Boolean enableQualityProcessing,
    /**
     * OCR configuration (null = OCR disabled)
     */
    @Nullable @JsonProperty("ocr") OcrConfig ocr,
    /**
     * Force OCR even for searchable PDFs
     */
    @Nullable @JsonProperty("force_ocr") Boolean forceOcr,
    /**
     * Force OCR on specific pages only (1-indexed page numbers, must be &gt;= 1).
     *
     * When set, only the listed pages are OCR'd regardless of text layer quality.
     * Unlisted pages use native text extraction. Ignored when {@code force_ocr} is {@code true}.
     * Only applies to PDF documents. Duplicates are automatically deduplicated.
     * An {@code ocr} config is recommended for backend/language selection; defaults are used if absent.
     *
     * A non-null list is defensively copied on construction, so the stored list is
     * unmodifiable and must not contain {@code null} elements.
     */
    @Nullable @JsonProperty("force_ocr_pages") List<Integer> forceOcrPages,
    /**
     * Disable OCR entirely, even for images.
     *
     * When {@code true}, OCR is skipped for all document types. Images return metadata
     * only (dimensions, format, EXIF) without text extraction. PDFs use only
     * native text extraction without OCR fallback.
     *
     * Cannot be {@code true} simultaneously with {@code force_ocr}.
     *
     * Added in v4.7.0.
     */
    @Nullable @JsonProperty("disable_ocr") Boolean disableOcr,
    /**
     * Text chunking configuration (null = chunking disabled)
     */
    @Nullable @JsonProperty("chunking") ChunkingConfig chunking,
    /**
     * Content filtering configuration (null = use extractor defaults).
     *
     * Controls whether document "furniture" (headers, footers, watermarks,
     * repeating text) is included in or stripped from extraction results.
     * See ContentFilterConfig for per-field documentation.
     */
    @Nullable @JsonProperty("content_filter") ContentFilterConfig contentFilter,
    /**
     * Image extraction configuration (null = no image extraction)
     */
    @Nullable @JsonProperty("images") ImageExtractionConfig images,
    /**
     * PDF-specific options (null = use defaults)
     */
    @Nullable @JsonProperty("pdf_options") PdfConfig pdfOptions,
    /**
     * Token reduction configuration (null = no token reduction)
     */
    @Nullable @JsonProperty("token_reduction") TokenReductionOptions tokenReduction,
    /**
     * Language detection configuration (null = no language detection)
     */
    @Nullable @JsonProperty("language_detection") LanguageDetectionConfig languageDetection,
    /**
     * Page extraction configuration (null = no page tracking)
     */
    @Nullable @JsonProperty("pages") PageConfig pages,
    /**
     * Keyword extraction configuration (null = no keyword extraction)
     */
    @Nullable @JsonProperty("keywords") KeywordConfig keywords,
    /**
     * Post-processor configuration (null = use defaults)
     */
    @Nullable @JsonProperty("postprocessor") PostProcessorConfig postprocessor,
    /**
     * HTML to Markdown conversion options (null = use defaults)
     *
     * Configure how HTML documents are converted to Markdown, including heading styles,
     * list formatting, code block styles, and preprocessing options.
     *
     * NOTE(review): exposed here as a plain {@code String} rather than a typed options
     * object; the expected string format is not visible in this file — confirm against
     * the native binding.
     */
    @Nullable @JsonProperty("html_options") String htmlOptions,
    /**
     * Styled HTML output configuration.
     *
     * When set alongside {@code output_format = OutputFormat.Html}, the extraction
     * pipeline uses StyledHtmlRenderer(crate.rendering.StyledHtmlRenderer)
     * which emits stable {@code kb-*} CSS class hooks on every structural element
     * and optionally embeds theme CSS or user-supplied CSS in a {@code &lt;style&gt;} block.
     *
     * When {@code None}, the existing plain comrak-based HTML renderer is used.
     */
    @Nullable @JsonProperty("html_output") HtmlOutputConfig htmlOutput,
    /**
     * Default per-file timeout in seconds for batch extraction.
     *
     * When set, each file in a batch will be canceled after this duration
     * unless overridden by FileExtractionConfig.timeout_secs.
     *
     * Defaults to {@code Some(60)} to prevent pathological files (e.g. deeply
     * nested archives, documents with millions of cells) from running
     * indefinitely and exhausting caller resources. Set to {@code None} to
     * disable the timeout for trusted input or long-running workloads.
     */
    @Nullable @JsonProperty("extraction_timeout_secs") Long extractionTimeoutSecs,
    /**
     * Maximum concurrent extractions in batch operations (null = (num_cpus × 1.5).ceil()).
     *
     * Limits parallelism to prevent resource exhaustion when processing
     * large batches. Defaults to (num_cpus × 1.5).ceil() when not set.
     */
    @Nullable @JsonProperty("max_concurrent_extractions") Long maxConcurrentExtractions,
    /**
     * Result structure format
     *
     * Controls whether results are returned in unified format (default) with all
     * content in the {@code content} field, or element-based format with semantic
     * elements (for Unstructured-compatible output).
     */
    @Nullable @JsonProperty("result_format") ResultFormat resultFormat,
    /**
     * Security limits for archive extraction.
     *
     * Controls maximum archive size, compression ratio, file count, and other
     * security thresholds to prevent decompression bomb attacks. Also caps
     * nesting depth, iteration count, entity / token length, total
     * content size, and table cell count for every extraction path that
     * ingests user-controlled bytes.
     * When {@code None}, default limits are used.
     */
    @Nullable @JsonProperty("security_limits") SecurityLimits securityLimits,
    /**
     * Maximum uncompressed size in bytes for a single embedded file before
     * recursive extraction is attempted (default: 50 MiB).
     *
     * Applies to embedded objects inside OOXML containers (DOCX, PPTX) and
     * to email attachments processed via recursive extraction. Files that
     * exceed this limit are skipped with a {@code ProcessingWarning} rather than
     * passed to the extraction pipeline, preventing a single oversized
     * embedded object from consuming unbounded memory or time.
     *
     * Set to {@code None} to disable the per-embedded-file cap (falls back to
     * {@code security_limits.max_archive_size} as the only guard).
     */
    @Nullable @JsonProperty("max_embedded_file_bytes") Long maxEmbeddedFileBytes,
    /**
     * Content text format (default: Plain).
     *
     * Controls the format of the extracted content:
     * - {@code Plain}: Raw extracted text (default)
     * - {@code Markdown}: Markdown formatted output
     * - {@code Djot}: Djot markup format (requires djot feature)
     * - {@code Html}: HTML formatted output
     *
     * When set to a structured format, extraction results will include
     * formatted output. The {@code formatted_content} field may be populated
     * when format conversion is applied.
     */
    @Nullable @JsonProperty("output_format") OutputFormat outputFormat,
    /**
     * Layout detection configuration (null = layout detection disabled).
     *
     * When set, PDF pages and images are analyzed for document structure
     * (headings, code, formulas, tables, figures, etc.) using RT-DETR models
     * via ONNX Runtime. For PDFs, layout hints override paragraph classification
     * in the markdown pipeline. For images, per-region OCR is performed with
     * markdown formatting based on detected layout classes.
     * Requires the {@code layout-detection} feature to run inference; the field is
     * present whenever the {@code layout-types} feature is active (which includes
     * {@code layout-detection} as well as the no-ORT target groups).
     */
    @Nullable @JsonProperty("layout") LayoutDetectionConfig layout,
    /**
     * Run layout detection on the non-OCR PDF markdown path.
     *
     * When {@code true} and {@code layout} is {@code Some(_)}, layout regions inform heading,
     * table, list, and figure detection in the structure pipeline that would
     * otherwise rely on font-clustering heuristics alone. Significantly
     * improves SF1 (structural F1) at the cost of inference latency
     * (~150-300ms/page CPU, ~20-50ms/page GPU). Default: {@code false}.
     * Requires the {@code layout-detection} feature.
     */
    @Nullable @JsonProperty("use_layout_for_markdown") Boolean useLayoutForMarkdown,
    /**
     * Enable structured document tree output.
     *
     * When true, populates the {@code document} field on {@code ExtractionResult} with a
     * hierarchical {@code DocumentStructure} containing heading-driven section nesting,
     * table grids, content layer classification, and inline annotations.
     *
     * Independent of {@code result_format} — can be combined with Unified or ElementBased.
     */
    @Nullable @JsonProperty("include_document_structure") Boolean includeDocumentStructure,
    /**
     * Hardware acceleration configuration for ONNX Runtime models.
     *
     * Controls execution provider selection for layout detection and embedding
     * models. When {@code None}, uses platform defaults (CoreML on macOS, CUDA on
     * Linux, CPU on Windows).
     */
    @Nullable @JsonProperty("acceleration") AccelerationConfig acceleration,
    /**
     * Cache namespace for tenant isolation.
     *
     * When set, cache entries are stored under {@code {cache_dir}/{namespace}/}.
     * Must be alphanumeric, hyphens, or underscores only (max 64 chars).
     * Different namespaces have isolated cache spaces on the same filesystem.
     */
    @Nullable @JsonProperty("cache_namespace") String cacheNamespace,
    /**
     * Per-request cache TTL in seconds.
     *
     * Overrides the global {@code max_age_days} for this specific extraction.
     * When {@code 0}, caching is completely skipped (no read or write).
     * When {@code None}, the global TTL applies.
     */
    @Nullable @JsonProperty("cache_ttl_secs") Long cacheTtlSecs,
    /**
     * Email extraction configuration (null = use defaults).
     *
     * Currently supports configuring the fallback codepage for MSG files
     * that do not specify one. See {@code EmailConfig} for details.
     */
    @Nullable @JsonProperty("email") EmailConfig email,
    /**
     * Concurrency limits for constrained environments (null = use defaults).
     *
     * Controls Rayon thread pool size, ONNX Runtime intra-op threads, and
     * (when {@code max_concurrent_extractions} is unset) the batch concurrency
     * semaphore. See {@code ConcurrencyConfig} for details.
     *
     * NOTE(review): exposed here as a plain {@code String} rather than a typed
     * {@code ConcurrencyConfig}; the expected string format is not visible in this
     * file — confirm against the native binding.
     */
    @Nullable @JsonProperty("concurrency") String concurrency,
    /**
     * Maximum recursion depth for archive extraction (default: 3).
     * Set to 0 to disable recursive extraction (legacy behavior).
     */
    @Nullable @JsonProperty("max_archive_depth") Long maxArchiveDepth,
    /**
     * Tree-sitter language pack configuration (null = tree-sitter disabled).
     *
     * When set, enables code file extraction using tree-sitter parsers.
     * Controls grammar download behavior and code analysis options.
     */
    @Nullable @JsonProperty("tree_sitter") TreeSitterConfig treeSitter,
    /**
     * Structured extraction via LLM (null = disabled).
     *
     * When set, the extracted document content is sent to an LLM with the
     * provided JSON schema. The structured response is stored in
     * {@code ExtractionResult.structured_output}.
     */
    @Nullable @JsonProperty("structured_extraction") StructuredExtractionConfig structuredExtraction,
    /**
     * Cancellation token for this extraction (null = no external cancellation).
     *
     * Pass a CancellationToken clone here and call CancellationToken.cancel
     * from another thread / task to abort the extraction in progress. The extractor
     * checks the token at safe checkpoints (before lock acquisition, between pages,
     * between batch items) and returns KreuzbergError.Cancelled when set.
     *
     * NOTE(review): the upstream documentation states that this field is excluded from
     * serialization because {@code CancellationToken} is a runtime handle, not a
     * configuration value. In this binding, however, it is a {@code String} mapped to the
     * JSON property {@code cancel_token}, so a non-null value is serialized — confirm
     * which behavior is intended.
     */
    @Nullable @JsonProperty("cancel_token") String cancelToken
) {
    /**
     * Compact canonical constructor.
     *
     * <p>Takes an unmodifiable snapshot of {@code forceOcrPages} so that later changes to
     * the caller's list cannot alter this record (or its {@code equals}/{@code hashCode})
     * after construction. All other components are stored as given.
     *
     * @throws NullPointerException if {@code forceOcrPages} is non-null and contains a
     *     {@code null} element
     */
    public ExtractionConfig {
        if (forceOcrPages != null) {
            forceOcrPages = List.copyOf(forceOcrPages);
        }
    }

    /**
     * Creates a new, empty {@link Builder}.
     *
     * @return a fresh builder holding the builder's initial values
     */
    public static Builder builder() {
        return new Builder();
    }

    // CPD-OFF
    /**
     * Mutable builder for {@link ExtractionConfig}, also used by Jackson for deserialization.
     *
     * <p>Every field starts as {@code null}, except {@code resultFormat}
     * ({@code ResultFormat.Unified}) and {@code outputFormat} ({@code OutputFormat.Plain}).
     * Each {@code withXxx} method stores the given value (which may be {@code null}) and
     * returns this builder so calls can be chained.
     */
    @JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
    public static final class Builder {

        @JsonProperty("use_cache")
        @Nullable private Boolean useCache = null;
        @JsonProperty("enable_quality_processing")
        @Nullable private Boolean enableQualityProcessing = null;
        @Nullable private OcrConfig ocr = null;
        @JsonProperty("force_ocr")
        @Nullable private Boolean forceOcr = null;
        @JsonProperty("force_ocr_pages")
        @Nullable private List<Integer> forceOcrPages = null;
        @JsonProperty("disable_ocr")
        @Nullable private Boolean disableOcr = null;
        @Nullable private ChunkingConfig chunking = null;
        @JsonProperty("content_filter")
        @Nullable private ContentFilterConfig contentFilter = null;
        @Nullable private ImageExtractionConfig images = null;
        @JsonProperty("pdf_options")
        @Nullable private PdfConfig pdfOptions = null;
        @JsonProperty("token_reduction")
        @Nullable private TokenReductionOptions tokenReduction = null;
        @JsonProperty("language_detection")
        @Nullable private LanguageDetectionConfig languageDetection = null;
        @Nullable private PageConfig pages = null;
        @Nullable private KeywordConfig keywords = null;
        @Nullable private PostProcessorConfig postprocessor = null;
        @JsonProperty("html_options")
        @Nullable private String htmlOptions = null;
        @JsonProperty("html_output")
        @Nullable private HtmlOutputConfig htmlOutput = null;
        @JsonProperty("extraction_timeout_secs")
        @Nullable private Long extractionTimeoutSecs = null;
        @JsonProperty("max_concurrent_extractions")
        @Nullable private Long maxConcurrentExtractions = null;
        @JsonProperty("result_format")
        @Nullable private ResultFormat resultFormat = ResultFormat.Unified;
        @JsonProperty("security_limits")
        @Nullable private SecurityLimits securityLimits = null;
        @JsonProperty("max_embedded_file_bytes")
        @Nullable private Long maxEmbeddedFileBytes = null;
        @JsonProperty("output_format")
        @Nullable private OutputFormat outputFormat = OutputFormat.Plain;
        @Nullable private LayoutDetectionConfig layout = null;
        @JsonProperty("use_layout_for_markdown")
        @Nullable private Boolean useLayoutForMarkdown = null;
        @JsonProperty("include_document_structure")
        @Nullable private Boolean includeDocumentStructure = null;
        @Nullable private AccelerationConfig acceleration = null;
        @JsonProperty("cache_namespace")
        @Nullable private String cacheNamespace = null;
        @JsonProperty("cache_ttl_secs")
        @Nullable private Long cacheTtlSecs = null;
        @Nullable private EmailConfig email = null;
        @Nullable private String concurrency = null;
        @JsonProperty("max_archive_depth")
        @Nullable private Long maxArchiveDepth = null;
        @JsonProperty("tree_sitter")
        @Nullable private TreeSitterConfig treeSitter = null;
        @JsonProperty("structured_extraction")
        @Nullable private StructuredExtractionConfig structuredExtraction = null;
        @JsonProperty("cancel_token")
        @Nullable private String cancelToken = null;

        /** Sets the useCache field. */
        @JsonProperty("use_cache")
        public Builder withUseCache(final @Nullable Boolean value) {
            this.useCache = value;
            return this;
        }

        /** Sets the enableQualityProcessing field. */
        @JsonProperty("enable_quality_processing")
        public Builder withEnableQualityProcessing(final @Nullable Boolean value) {
            this.enableQualityProcessing = value;
            return this;
        }

        /** Sets the ocr field. */
        @JsonProperty("ocr")
        public Builder withOcr(final @Nullable OcrConfig value) {
            this.ocr = value;
            return this;
        }

        /** Sets the forceOcr field. */
        @JsonProperty("force_ocr")
        public Builder withForceOcr(final @Nullable Boolean value) {
            this.forceOcr = value;
            return this;
        }

        /**
         * Sets the forceOcrPages field. The list is stored by reference here and is
         * snapshotted when {@link #build()} constructs the record.
         */
        @JsonProperty("force_ocr_pages")
        public Builder withForceOcrPages(final @Nullable List<Integer> value) {
            this.forceOcrPages = value;
            return this;
        }

        /** Sets the disableOcr field. */
        @JsonProperty("disable_ocr")
        public Builder withDisableOcr(final @Nullable Boolean value) {
            this.disableOcr = value;
            return this;
        }

        /** Sets the chunking field. */
        @JsonProperty("chunking")
        public Builder withChunking(final @Nullable ChunkingConfig value) {
            this.chunking = value;
            return this;
        }

        /** Sets the contentFilter field. */
        @JsonProperty("content_filter")
        public Builder withContentFilter(final @Nullable ContentFilterConfig value) {
            this.contentFilter = value;
            return this;
        }

        /** Sets the images field. */
        @JsonProperty("images")
        public Builder withImages(final @Nullable ImageExtractionConfig value) {
            this.images = value;
            return this;
        }

        /** Sets the pdfOptions field. */
        @JsonProperty("pdf_options")
        public Builder withPdfOptions(final @Nullable PdfConfig value) {
            this.pdfOptions = value;
            return this;
        }

        /** Sets the tokenReduction field. */
        @JsonProperty("token_reduction")
        public Builder withTokenReduction(final @Nullable TokenReductionOptions value) {
            this.tokenReduction = value;
            return this;
        }

        /** Sets the languageDetection field. */
        @JsonProperty("language_detection")
        public Builder withLanguageDetection(final @Nullable LanguageDetectionConfig value) {
            this.languageDetection = value;
            return this;
        }

        /** Sets the pages field. */
        @JsonProperty("pages")
        public Builder withPages(final @Nullable PageConfig value) {
            this.pages = value;
            return this;
        }

        /** Sets the keywords field. */
        @JsonProperty("keywords")
        public Builder withKeywords(final @Nullable KeywordConfig value) {
            this.keywords = value;
            return this;
        }

        /** Sets the postprocessor field. */
        @JsonProperty("postprocessor")
        public Builder withPostprocessor(final @Nullable PostProcessorConfig value) {
            this.postprocessor = value;
            return this;
        }

        /** Sets the htmlOptions field. */
        @JsonProperty("html_options")
        public Builder withHtmlOptions(final @Nullable String value) {
            this.htmlOptions = value;
            return this;
        }

        /** Sets the htmlOutput field. */
        @JsonProperty("html_output")
        public Builder withHtmlOutput(final @Nullable HtmlOutputConfig value) {
            this.htmlOutput = value;
            return this;
        }

        /** Sets the extractionTimeoutSecs field. */
        @JsonProperty("extraction_timeout_secs")
        public Builder withExtractionTimeoutSecs(final @Nullable Long value) {
            this.extractionTimeoutSecs = value;
            return this;
        }

        /** Sets the maxConcurrentExtractions field. */
        @JsonProperty("max_concurrent_extractions")
        public Builder withMaxConcurrentExtractions(final @Nullable Long value) {
            this.maxConcurrentExtractions = value;
            return this;
        }

        /** Sets the resultFormat field, replacing the builder's initial {@code ResultFormat.Unified}. */
        @JsonProperty("result_format")
        public Builder withResultFormat(final @Nullable ResultFormat value) {
            this.resultFormat = value;
            return this;
        }

        /** Sets the securityLimits field. */
        @JsonProperty("security_limits")
        public Builder withSecurityLimits(final @Nullable SecurityLimits value) {
            this.securityLimits = value;
            return this;
        }

        /** Sets the maxEmbeddedFileBytes field. */
        @JsonProperty("max_embedded_file_bytes")
        public Builder withMaxEmbeddedFileBytes(final @Nullable Long value) {
            this.maxEmbeddedFileBytes = value;
            return this;
        }

        /** Sets the outputFormat field, replacing the builder's initial {@code OutputFormat.Plain}. */
        @JsonProperty("output_format")
        public Builder withOutputFormat(final @Nullable OutputFormat value) {
            this.outputFormat = value;
            return this;
        }

        /** Sets the layout field. */
        @JsonProperty("layout")
        public Builder withLayout(final @Nullable LayoutDetectionConfig value) {
            this.layout = value;
            return this;
        }

        /** Sets the useLayoutForMarkdown field. */
        @JsonProperty("use_layout_for_markdown")
        public Builder withUseLayoutForMarkdown(final @Nullable Boolean value) {
            this.useLayoutForMarkdown = value;
            return this;
        }

        /** Sets the includeDocumentStructure field. */
        @JsonProperty("include_document_structure")
        public Builder withIncludeDocumentStructure(final @Nullable Boolean value) {
            this.includeDocumentStructure = value;
            return this;
        }

        /** Sets the acceleration field. */
        @JsonProperty("acceleration")
        public Builder withAcceleration(final @Nullable AccelerationConfig value) {
            this.acceleration = value;
            return this;
        }

        /** Sets the cacheNamespace field. */
        @JsonProperty("cache_namespace")
        public Builder withCacheNamespace(final @Nullable String value) {
            this.cacheNamespace = value;
            return this;
        }

        /** Sets the cacheTtlSecs field. */
        @JsonProperty("cache_ttl_secs")
        public Builder withCacheTtlSecs(final @Nullable Long value) {
            this.cacheTtlSecs = value;
            return this;
        }

        /** Sets the email field. */
        @JsonProperty("email")
        public Builder withEmail(final @Nullable EmailConfig value) {
            this.email = value;
            return this;
        }

        /** Sets the concurrency field. */
        @JsonProperty("concurrency")
        public Builder withConcurrency(final @Nullable String value) {
            this.concurrency = value;
            return this;
        }

        /** Sets the maxArchiveDepth field. */
        @JsonProperty("max_archive_depth")
        public Builder withMaxArchiveDepth(final @Nullable Long value) {
            this.maxArchiveDepth = value;
            return this;
        }

        /** Sets the treeSitter field. */
        @JsonProperty("tree_sitter")
        public Builder withTreeSitter(final @Nullable TreeSitterConfig value) {
            this.treeSitter = value;
            return this;
        }

        /** Sets the structuredExtraction field. */
        @JsonProperty("structured_extraction")
        public Builder withStructuredExtraction(final @Nullable StructuredExtractionConfig value) {
            this.structuredExtraction = value;
            return this;
        }

        /** Sets the cancelToken field. */
        @JsonProperty("cancel_token")
        public Builder withCancelToken(final @Nullable String value) {
            this.cancelToken = value;
            return this;
        }

        /**
         * Builds the ExtractionConfig instance from the builder's current values.
         *
         * <p>Arguments are passed in record-component declaration order. The builder can be
         * reused afterwards; the record keeps its own snapshot of {@code forceOcrPages}.
         *
         * @return a new {@link ExtractionConfig}
         * @throws NullPointerException if {@code forceOcrPages} contains a {@code null} element
         */
        public ExtractionConfig build() {
            return new ExtractionConfig(
                useCache,
                enableQualityProcessing,
                ocr,
                forceOcr,
                forceOcrPages,
                disableOcr,
                chunking,
                contentFilter,
                images,
                pdfOptions,
                tokenReduction,
                languageDetection,
                pages,
                keywords,
                postprocessor,
                htmlOptions,
                htmlOutput,
                extractionTimeoutSecs,
                maxConcurrentExtractions,
                resultFormat,
                securityLimits,
                maxEmbeddedFileBytes,
                outputFormat,
                layout,
                useLayoutForMarkdown,
                includeDocumentStructure,
                acceleration,
                cacheNamespace,
                cacheTtlSecs,
                email,
                concurrency,
                maxArchiveDepth,
                treeSitter,
                structuredExtraction,
                cancelToken
            );
        }
    }
    // CPD-ON

    /**
     * Placeholder for obtaining the default configuration; not implemented in this binding.
     *
     * @return never returns normally
     * @throws UnsupportedOperationException always; use {@link #builder()} instead
     */
    public static ExtractionConfig defaultInstance() {
        throw new UnsupportedOperationException("defaultInstance is not yet bridged via JNI; use the Builder instead.");
    }
}