// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;

import java.util.List;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
import org.jspecify.annotations.Nullable;

/**
 * Per-file extraction configuration overrides for batch processing.
 *
 * <p>All fields are optional ({@code Option<T>} in the upstream documentation, nullable
 * components here) — {@code null} means "use the batch-level default."
 * This type is used with {@code batch_extract_files} and
 * {@code batch_extract_bytes} to allow heterogeneous
 * extraction settings within a single batch.
 *
 * <p>Instances are serialized with {@code JsonInclude.Include.NON_ABSENT} and deserialized
 * through {@link Builder}.
 *
 * <h2>Excluded Fields</h2>
 *
 * <p>The following {@code ExtractionConfig} fields are batch-level only and
 * cannot be overridden per file:
 * <ul>
 *   <li>{@code max_concurrent_extractions} — controls batch parallelism</li>
 *   <li>{@code use_cache} — global caching policy</li>
 *   <li>{@code acceleration} — shared ONNX execution provider</li>
 *   <li>{@code security_limits} — global archive security policy</li>
 * </ul>
 */
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = FileExtractionConfig.Builder.class)
public record FileExtractionConfig(
    /**
     * Override quality post-processing for this file.
     */
    @Nullable @JsonProperty("enable_quality_processing") Boolean enableQualityProcessing,
    /**
     * Override OCR configuration for this file ({@code null} = use batch default).
     */
    @Nullable @JsonProperty("ocr") OcrConfig ocr,
    /**
     * Override force OCR for this file.
     */
    @Nullable @JsonProperty("force_ocr") Boolean forceOcr,
    /**
     * Override force OCR pages for this file (1-indexed page numbers).
     *
     * NOTE(review): declared as a raw {@code List}; the element type is not visible in this
     * file — confirm it before parameterizing.
     */
    @Nullable @JsonProperty("force_ocr_pages") List forceOcrPages,
    /**
     * Override disable OCR for this file.
     */
    @Nullable @JsonProperty("disable_ocr") Boolean disableOcr,
    /**
     * Override chunking configuration for this file.
     */
    @Nullable @JsonProperty("chunking") ChunkingConfig chunking,
    /**
     * Override content filtering configuration for this file.
     */
    @Nullable @JsonProperty("content_filter") ContentFilterConfig contentFilter,
    /**
     * Override image extraction configuration for this file.
     */
    @Nullable @JsonProperty("images") ImageExtractionConfig images,
    /**
     * Override PDF options for this file.
     */
    @Nullable @JsonProperty("pdf_options") PdfConfig pdfOptions,
    /**
     * Override token reduction for this file.
     */
    @Nullable @JsonProperty("token_reduction") TokenReductionOptions tokenReduction,
    /**
     * Override language detection for this file.
     */
    @Nullable @JsonProperty("language_detection") LanguageDetectionConfig languageDetection,
    /**
     * Override page extraction for this file.
     */
    @Nullable @JsonProperty("pages") PageConfig pages,
    /**
     * Override keyword extraction for this file.
     */
    @Nullable @JsonProperty("keywords") KeywordConfig keywords,
    /**
     * Override post-processor for this file.
     */
    @Nullable @JsonProperty("postprocessor") PostProcessorConfig postprocessor,
    /**
     * Override HTML conversion options for this file.
     */
    @Nullable @JsonProperty("html_options") String htmlOptions,
    /**
     * Override result format for this file.
     */
    @Nullable @JsonProperty("result_format") ResultFormat resultFormat,
    /**
     * Override output content format for this file.
     */
    @Nullable @JsonProperty("output_format") OutputFormat outputFormat,
    /**
     * Override document structure output for this file.
     */
    @Nullable @JsonProperty("include_document_structure") Boolean includeDocumentStructure,
    /**
     * Override layout detection for this file.
     */
    @Nullable @JsonProperty("layout") LayoutDetectionConfig layout,
    /**
     * Override per-file extraction timeout in seconds.
     *
     * When set, the extraction for this file will be canceled after the
     * specified duration. A timed-out file produces an error result without
     * affecting other files in the batch.
     */
    @Nullable @JsonProperty("timeout_secs") Long timeoutSecs,
    /**
     * Override tree-sitter configuration for this file.
     */
    @Nullable @JsonProperty("tree_sitter") TreeSitterConfig treeSitter,
    /**
     * Override structured extraction configuration for this file.
     *
     * When set, enables LLM-based structured extraction with a JSON schema
     * for this specific file. The extracted content is sent to a VLM/LLM
     * and the response is parsed according to the provided schema.
     */
    @Nullable @JsonProperty("structured_extraction") StructuredExtractionConfig structuredExtraction
) {

    /**
     * Creates a builder in which every override is unset ({@code null}).
     *
     * @return a new, empty {@link Builder}
     */
    public static Builder builder() {
        return new Builder();
    }

    // CPD-OFF

    /**
     * Mutable builder for {@link FileExtractionConfig}; also the Jackson deserialization
     * entry point (setter prefix {@code with}, build method {@code build}).
     *
     * <p>Every field starts out {@code null}, i.e. "use the batch-level default".
     *
     * <p>The scalar overrides ({@code Boolean}/{@code Long}) have two setters each. The boxed
     * overload carries the JSON binding so that an explicit JSON {@code null} stays
     * {@code null}; bound to a primitive parameter, Jackson's default null handling would
     * substitute {@code false}/{@code 0} and turn "use the default" into an explicit
     * override. The primitive overload is kept for existing callers and is hidden from
     * Jackson with {@code @JsonIgnore}.
     */
    @JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
    public static final class Builder {

        @JsonProperty("enable_quality_processing")
        private @Nullable Boolean enableQualityProcessing;
        private @Nullable OcrConfig ocr;
        @JsonProperty("force_ocr")
        private @Nullable Boolean forceOcr;
        @JsonProperty("force_ocr_pages")
        private @Nullable List forceOcrPages;
        @JsonProperty("disable_ocr")
        private @Nullable Boolean disableOcr;
        private @Nullable ChunkingConfig chunking;
        @JsonProperty("content_filter")
        private @Nullable ContentFilterConfig contentFilter;
        private @Nullable ImageExtractionConfig images;
        @JsonProperty("pdf_options")
        private @Nullable PdfConfig pdfOptions;
        @JsonProperty("token_reduction")
        private @Nullable TokenReductionOptions tokenReduction;
        @JsonProperty("language_detection")
        private @Nullable LanguageDetectionConfig languageDetection;
        private @Nullable PageConfig pages;
        private @Nullable KeywordConfig keywords;
        private @Nullable PostProcessorConfig postprocessor;
        @JsonProperty("html_options")
        private @Nullable String htmlOptions;
        @JsonProperty("result_format")
        private @Nullable ResultFormat resultFormat;
        @JsonProperty("output_format")
        private @Nullable OutputFormat outputFormat;
        @JsonProperty("include_document_structure")
        private @Nullable Boolean includeDocumentStructure;
        private @Nullable LayoutDetectionConfig layout;
        @JsonProperty("timeout_secs")
        private @Nullable Long timeoutSecs;
        @JsonProperty("tree_sitter")
        private @Nullable TreeSitterConfig treeSitter;
        @JsonProperty("structured_extraction")
        private @Nullable StructuredExtractionConfig structuredExtraction;

        /**
         * Sets the enableQualityProcessing field.
         *
         * @param value the override, or {@code null} to use the batch-level default
         * @return this builder
         */
        @JsonProperty("enable_quality_processing")
        public Builder withEnableQualityProcessing(final @Nullable Boolean value) {
            this.enableQualityProcessing = value;
            return this;
        }

        /**
         * Sets the enableQualityProcessing field from a primitive value.
         *
         * @param value the override
         * @return this builder
         */
        @JsonIgnore
        public Builder withEnableQualityProcessing(final boolean value) {
            return withEnableQualityProcessing(Boolean.valueOf(value));
        }

        /**
         * Sets the ocr field.
         *
         * @param value the override, or {@code null} to use the batch-level default
         * @return this builder
         */
        @JsonProperty("ocr")
        public Builder withOcr(final @Nullable OcrConfig value) {
            this.ocr = value;
            return this;
        }

        /**
         * Sets the forceOcr field.
         *
         * @param value the override, or {@code null} to use the batch-level default
         * @return this builder
         */
        @JsonProperty("force_ocr")
        public Builder withForceOcr(final @Nullable Boolean value) {
            this.forceOcr = value;
            return this;
        }

        /**
         * Sets the forceOcr field from a primitive value.
         *
         * @param value the override
         * @return this builder
         */
        @JsonIgnore
        public Builder withForceOcr(final boolean value) {
            return withForceOcr(Boolean.valueOf(value));
        }

        /**
         * Sets the forceOcrPages field.
         *
         * @param value the override, or {@code null} to use the batch-level default
         * @return this builder
         */
        @JsonProperty("force_ocr_pages")
        public Builder withForceOcrPages(final @Nullable List value) {
            this.forceOcrPages = value;
            return this;
        }

        /**
         * Sets the disableOcr field.
         *
         * @param value the override, or {@code null} to use the batch-level default
         * @return this builder
         */
        @JsonProperty("disable_ocr")
        public Builder withDisableOcr(final @Nullable Boolean value) {
            this.disableOcr = value;
            return this;
        }

        /**
         * Sets the disableOcr field from a primitive value.
         *
         * @param value the override
         * @return this builder
         */
        @JsonIgnore
        public Builder withDisableOcr(final boolean value) {
            return withDisableOcr(Boolean.valueOf(value));
        }

        /**
         * Sets the chunking field.
         *
         * @param value the override, or {@code null} to use the batch-level default
         * @return this builder
         */
        @JsonProperty("chunking")
        public Builder withChunking(final @Nullable ChunkingConfig value) {
            this.chunking = value;
            return this;
        }

        /**
         * Sets the contentFilter field.
         *
         * @param value the override, or {@code null} to use the batch-level default
         * @return this builder
         */
        @JsonProperty("content_filter")
        public Builder withContentFilter(final @Nullable ContentFilterConfig value) {
            this.contentFilter = value;
            return this;
        }

        /**
         * Sets the images field.
         *
         * @param value the override, or {@code null} to use the batch-level default
         * @return this builder
         */
        @JsonProperty("images")
        public Builder withImages(final @Nullable ImageExtractionConfig value) {
            this.images = value;
            return this;
        }

        /**
         * Sets the pdfOptions field.
         *
         * @param value the override, or {@code null} to use the batch-level default
         * @return this builder
         */
        @JsonProperty("pdf_options")
        public Builder withPdfOptions(final @Nullable PdfConfig value) {
            this.pdfOptions = value;
            return this;
        }

        /**
         * Sets the tokenReduction field.
         *
         * @param value the override, or {@code null} to use the batch-level default
         * @return this builder
         */
        @JsonProperty("token_reduction")
        public Builder withTokenReduction(final @Nullable TokenReductionOptions value) {
            this.tokenReduction = value;
            return this;
        }

        /**
         * Sets the languageDetection field.
         *
         * @param value the override, or {@code null} to use the batch-level default
         * @return this builder
         */
        @JsonProperty("language_detection")
        public Builder withLanguageDetection(final @Nullable LanguageDetectionConfig value) {
            this.languageDetection = value;
            return this;
        }

        /**
         * Sets the pages field.
         *
         * @param value the override, or {@code null} to use the batch-level default
         * @return this builder
         */
        @JsonProperty("pages")
        public Builder withPages(final @Nullable PageConfig value) {
            this.pages = value;
            return this;
        }

        /**
         * Sets the keywords field.
         *
         * @param value the override, or {@code null} to use the batch-level default
         * @return this builder
         */
        @JsonProperty("keywords")
        public Builder withKeywords(final @Nullable KeywordConfig value) {
            this.keywords = value;
            return this;
        }

        /**
         * Sets the postprocessor field.
         *
         * @param value the override, or {@code null} to use the batch-level default
         * @return this builder
         */
        @JsonProperty("postprocessor")
        public Builder withPostprocessor(final @Nullable PostProcessorConfig value) {
            this.postprocessor = value;
            return this;
        }

        /**
         * Sets the htmlOptions field.
         *
         * @param value the override, or {@code null} to use the batch-level default
         * @return this builder
         */
        @JsonProperty("html_options")
        public Builder withHtmlOptions(final @Nullable String value) {
            this.htmlOptions = value;
            return this;
        }

        /**
         * Sets the resultFormat field.
         *
         * @param value the override, or {@code null} to use the batch-level default
         * @return this builder
         */
        @JsonProperty("result_format")
        public Builder withResultFormat(final @Nullable ResultFormat value) {
            this.resultFormat = value;
            return this;
        }

        /**
         * Sets the outputFormat field.
         *
         * @param value the override, or {@code null} to use the batch-level default
         * @return this builder
         */
        @JsonProperty("output_format")
        public Builder withOutputFormat(final @Nullable OutputFormat value) {
            this.outputFormat = value;
            return this;
        }

        /**
         * Sets the includeDocumentStructure field.
         *
         * @param value the override, or {@code null} to use the batch-level default
         * @return this builder
         */
        @JsonProperty("include_document_structure")
        public Builder withIncludeDocumentStructure(final @Nullable Boolean value) {
            this.includeDocumentStructure = value;
            return this;
        }

        /**
         * Sets the includeDocumentStructure field from a primitive value.
         *
         * @param value the override
         * @return this builder
         */
        @JsonIgnore
        public Builder withIncludeDocumentStructure(final boolean value) {
            return withIncludeDocumentStructure(Boolean.valueOf(value));
        }

        /**
         * Sets the layout field.
         *
         * @param value the override, or {@code null} to use the batch-level default
         * @return this builder
         */
        @JsonProperty("layout")
        public Builder withLayout(final @Nullable LayoutDetectionConfig value) {
            this.layout = value;
            return this;
        }

        /**
         * Sets the timeoutSecs field.
         *
         * @param value the timeout in seconds, or {@code null} to use the batch-level default
         * @return this builder
         */
        @JsonProperty("timeout_secs")
        public Builder withTimeoutSecs(final @Nullable Long value) {
            this.timeoutSecs = value;
            return this;
        }

        /**
         * Sets the timeoutSecs field from a primitive value.
         *
         * <p>Kept so that calls passing an {@code int} or {@code long} (for example
         * {@code withTimeoutSecs(30)}) continue to compile; an {@code int} does not box to
         * {@code Long}.
         *
         * @param value the timeout in seconds
         * @return this builder
         */
        @JsonIgnore
        public Builder withTimeoutSecs(final long value) {
            return withTimeoutSecs(Long.valueOf(value));
        }

        /**
         * Sets the treeSitter field.
         *
         * @param value the override, or {@code null} to use the batch-level default
         * @return this builder
         */
        @JsonProperty("tree_sitter")
        public Builder withTreeSitter(final @Nullable TreeSitterConfig value) {
            this.treeSitter = value;
            return this;
        }

        /**
         * Sets the structuredExtraction field.
         *
         * @param value the override, or {@code null} to use the batch-level default
         * @return this builder
         */
        @JsonProperty("structured_extraction")
        public Builder withStructuredExtraction(final @Nullable StructuredExtractionConfig value) {
            this.structuredExtraction = value;
            return this;
        }

        /**
         * Builds the FileExtractionConfig instance.
         *
         * @return a new record holding the current builder values; unset fields are {@code null}
         */
        public FileExtractionConfig build() {
            return new FileExtractionConfig(
                enableQualityProcessing,
                ocr,
                forceOcr,
                forceOcrPages,
                disableOcr,
                chunking,
                contentFilter,
                images,
                pdfOptions,
                tokenReduction,
                languageDetection,
                pages,
                keywords,
                postprocessor,
                htmlOptions,
                resultFormat,
                outputFormat,
                includeDocumentStructure,
                layout,
                timeoutSecs,
                treeSitter,
                structuredExtraction
            );
        }
    }
    // CPD-ON
}