Files
fil/packages/java/dev/kreuzberg/FileExtractionConfig.java

362 lines
12 KiB
Java
Raw Normal View History

2026-06-01 23:40:55 +02:00
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import java.util.List;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
import org.jspecify.annotations.Nullable;
/**
 * Per-file extraction configuration overrides for batch processing.
 *
 * <p>All fields are {@code Option<T>} &mdash; {@code None} (a {@code null} component in
 * this record) means "use the batch-level default."
 * This type is used with {@code batch_extract_files} and
 * {@code batch_extract_bytes} to allow heterogeneous
 * extraction settings within a single batch.
 *
 * <h2>Excluded Fields</h2>
 *
 * <p>The following {@code ExtractionConfig} fields are batch-level only and
 * cannot be overridden per file:
 * <ul>
 *   <li>{@code max_concurrent_extractions} &mdash; controls batch parallelism</li>
 *   <li>{@code use_cache} &mdash; global caching policy</li>
 *   <li>{@code acceleration} &mdash; shared ONNX execution provider</li>
 *   <li>{@code security_limits} &mdash; global archive security policy</li>
 * </ul>
 *
 * <p>Instances are created either through {@link #builder()} or by Jackson, which is
 * directed to {@link Builder} by the {@code @JsonDeserialize} annotation on this record.
 */
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = FileExtractionConfig.Builder.class)
public record FileExtractionConfig(
    /**
     * Override quality post-processing for this file.
     */
    @Nullable @JsonProperty("enable_quality_processing") Boolean enableQualityProcessing,
    /**
     * Override OCR configuration for this file ({@code null} = use batch default).
     */
    @Nullable @JsonProperty("ocr") OcrConfig ocr,
    /**
     * Override force OCR for this file.
     */
    @Nullable @JsonProperty("force_ocr") Boolean forceOcr,
    /**
     * Override force OCR pages for this file (1-indexed page numbers).
     */
    @Nullable @JsonProperty("force_ocr_pages") List<Integer> forceOcrPages,
    /**
     * Override disable OCR for this file.
     */
    @Nullable @JsonProperty("disable_ocr") Boolean disableOcr,
    /**
     * Override chunking configuration for this file.
     */
    @Nullable @JsonProperty("chunking") ChunkingConfig chunking,
    /**
     * Override content filtering configuration for this file.
     */
    @Nullable @JsonProperty("content_filter") ContentFilterConfig contentFilter,
    /**
     * Override image extraction configuration for this file.
     */
    @Nullable @JsonProperty("images") ImageExtractionConfig images,
    /**
     * Override PDF options for this file.
     */
    @Nullable @JsonProperty("pdf_options") PdfConfig pdfOptions,
    /**
     * Override token reduction for this file.
     */
    @Nullable @JsonProperty("token_reduction") TokenReductionOptions tokenReduction,
    /**
     * Override language detection for this file.
     */
    @Nullable @JsonProperty("language_detection") LanguageDetectionConfig languageDetection,
    /**
     * Override page extraction for this file.
     */
    @Nullable @JsonProperty("pages") PageConfig pages,
    /**
     * Override keyword extraction for this file.
     */
    @Nullable @JsonProperty("keywords") KeywordConfig keywords,
    /**
     * Override post-processor for this file.
     */
    @Nullable @JsonProperty("postprocessor") PostProcessorConfig postprocessor,
    /**
     * Override HTML conversion options for this file.
     */
    @Nullable @JsonProperty("html_options") String htmlOptions,
    /**
     * Override result format for this file.
     */
    @Nullable @JsonProperty("result_format") ResultFormat resultFormat,
    /**
     * Override output content format for this file.
     */
    @Nullable @JsonProperty("output_format") OutputFormat outputFormat,
    /**
     * Override document structure output for this file.
     */
    @Nullable @JsonProperty("include_document_structure") Boolean includeDocumentStructure,
    /**
     * Override layout detection for this file.
     */
    @Nullable @JsonProperty("layout") LayoutDetectionConfig layout,
    /**
     * Override per-file extraction timeout in seconds.
     *
     * When set, the extraction for this file will be canceled after the
     * specified duration. A timed-out file produces an error result without
     * affecting other files in the batch.
     */
    @Nullable @JsonProperty("timeout_secs") Long timeoutSecs,
    /**
     * Override tree-sitter configuration for this file.
     */
    @Nullable @JsonProperty("tree_sitter") TreeSitterConfig treeSitter,
    /**
     * Override structured extraction configuration for this file.
     *
     * When set, enables LLM-based structured extraction with a JSON schema
     * for this specific file. The extracted content is sent to a VLM/LLM
     * and the response is parsed according to the provided schema.
     */
    @Nullable @JsonProperty("structured_extraction") StructuredExtractionConfig structuredExtraction
) {

    /**
     * Creates an empty builder in which every override is unset ({@code null}).
     *
     * @return a new {@link Builder}
     */
    public static Builder builder() {
        return new Builder();
    }

    // CPD-OFF
    /**
     * Mutable builder for {@link FileExtractionConfig}.
     *
     * <p>Every field starts out as {@code null}. Each {@code with*} setter stores its
     * argument and returns this builder so calls can be chained; {@link #build()} passes
     * the collected values to the record constructor.
     *
     * <p>For the {@code Boolean} and {@code Long} fields there are two overloads: a boxed,
     * {@code @Nullable} one that carries the {@code @JsonProperty} annotation, and a
     * primitive one kept for callers written against the earlier primitive-only signature.
     * The boxed overload exists so that {@code null} can actually be passed; a primitive
     * parameter cannot represent "unset".
     * NOTE(review): the boxed overload is the one intended for JSON binding, so that an
     * explicit JSON {@code null} stays {@code null} instead of becoming {@code false}/{@code 0}
     * &mdash; confirm the overload selection against the Jackson version in use.
     */
    @JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
    public static final class Builder {
        @JsonProperty("enable_quality_processing")
        private @Nullable Boolean enableQualityProcessing;
        private @Nullable OcrConfig ocr;
        @JsonProperty("force_ocr")
        private @Nullable Boolean forceOcr;
        @JsonProperty("force_ocr_pages")
        private @Nullable List<Integer> forceOcrPages;
        @JsonProperty("disable_ocr")
        private @Nullable Boolean disableOcr;
        private @Nullable ChunkingConfig chunking;
        @JsonProperty("content_filter")
        private @Nullable ContentFilterConfig contentFilter;
        private @Nullable ImageExtractionConfig images;
        @JsonProperty("pdf_options")
        private @Nullable PdfConfig pdfOptions;
        @JsonProperty("token_reduction")
        private @Nullable TokenReductionOptions tokenReduction;
        @JsonProperty("language_detection")
        private @Nullable LanguageDetectionConfig languageDetection;
        private @Nullable PageConfig pages;
        private @Nullable KeywordConfig keywords;
        private @Nullable PostProcessorConfig postprocessor;
        @JsonProperty("html_options")
        private @Nullable String htmlOptions;
        @JsonProperty("result_format")
        private @Nullable ResultFormat resultFormat;
        @JsonProperty("output_format")
        private @Nullable OutputFormat outputFormat;
        @JsonProperty("include_document_structure")
        private @Nullable Boolean includeDocumentStructure;
        private @Nullable LayoutDetectionConfig layout;
        @JsonProperty("timeout_secs")
        private @Nullable Long timeoutSecs;
        @JsonProperty("tree_sitter")
        private @Nullable TreeSitterConfig treeSitter;
        @JsonProperty("structured_extraction")
        private @Nullable StructuredExtractionConfig structuredExtraction;

        /**
         * Sets the enableQualityProcessing field.
         *
         * @param value the override, or {@code null} to leave it unset
         * @return this builder
         */
        @JsonProperty("enable_quality_processing")
        public Builder withEnableQualityProcessing(final @Nullable Boolean value) {
            this.enableQualityProcessing = value;
            return this;
        }

        /**
         * Primitive overload of {@link #withEnableQualityProcessing(Boolean)}.
         *
         * @param value the override
         * @return this builder
         */
        public Builder withEnableQualityProcessing(final boolean value) {
            return withEnableQualityProcessing(Boolean.valueOf(value));
        }

        /**
         * Sets the ocr field.
         *
         * @param value the override, or {@code null} to leave it unset
         * @return this builder
         */
        @JsonProperty("ocr")
        public Builder withOcr(final @Nullable OcrConfig value) {
            this.ocr = value;
            return this;
        }

        /**
         * Sets the forceOcr field.
         *
         * @param value the override, or {@code null} to leave it unset
         * @return this builder
         */
        @JsonProperty("force_ocr")
        public Builder withForceOcr(final @Nullable Boolean value) {
            this.forceOcr = value;
            return this;
        }

        /**
         * Primitive overload of {@link #withForceOcr(Boolean)}.
         *
         * @param value the override
         * @return this builder
         */
        public Builder withForceOcr(final boolean value) {
            return withForceOcr(Boolean.valueOf(value));
        }

        /**
         * Sets the forceOcrPages field.
         *
         * @param value the override, or {@code null} to leave it unset
         * @return this builder
         */
        @JsonProperty("force_ocr_pages")
        public Builder withForceOcrPages(final @Nullable List<Integer> value) {
            this.forceOcrPages = value;
            return this;
        }

        /**
         * Sets the disableOcr field.
         *
         * @param value the override, or {@code null} to leave it unset
         * @return this builder
         */
        @JsonProperty("disable_ocr")
        public Builder withDisableOcr(final @Nullable Boolean value) {
            this.disableOcr = value;
            return this;
        }

        /**
         * Primitive overload of {@link #withDisableOcr(Boolean)}.
         *
         * @param value the override
         * @return this builder
         */
        public Builder withDisableOcr(final boolean value) {
            return withDisableOcr(Boolean.valueOf(value));
        }

        /**
         * Sets the chunking field.
         *
         * @param value the override, or {@code null} to leave it unset
         * @return this builder
         */
        @JsonProperty("chunking")
        public Builder withChunking(final @Nullable ChunkingConfig value) {
            this.chunking = value;
            return this;
        }

        /**
         * Sets the contentFilter field.
         *
         * @param value the override, or {@code null} to leave it unset
         * @return this builder
         */
        @JsonProperty("content_filter")
        public Builder withContentFilter(final @Nullable ContentFilterConfig value) {
            this.contentFilter = value;
            return this;
        }

        /**
         * Sets the images field.
         *
         * @param value the override, or {@code null} to leave it unset
         * @return this builder
         */
        @JsonProperty("images")
        public Builder withImages(final @Nullable ImageExtractionConfig value) {
            this.images = value;
            return this;
        }

        /**
         * Sets the pdfOptions field.
         *
         * @param value the override, or {@code null} to leave it unset
         * @return this builder
         */
        @JsonProperty("pdf_options")
        public Builder withPdfOptions(final @Nullable PdfConfig value) {
            this.pdfOptions = value;
            return this;
        }

        /**
         * Sets the tokenReduction field.
         *
         * @param value the override, or {@code null} to leave it unset
         * @return this builder
         */
        @JsonProperty("token_reduction")
        public Builder withTokenReduction(final @Nullable TokenReductionOptions value) {
            this.tokenReduction = value;
            return this;
        }

        /**
         * Sets the languageDetection field.
         *
         * @param value the override, or {@code null} to leave it unset
         * @return this builder
         */
        @JsonProperty("language_detection")
        public Builder withLanguageDetection(final @Nullable LanguageDetectionConfig value) {
            this.languageDetection = value;
            return this;
        }

        /**
         * Sets the pages field.
         *
         * @param value the override, or {@code null} to leave it unset
         * @return this builder
         */
        @JsonProperty("pages")
        public Builder withPages(final @Nullable PageConfig value) {
            this.pages = value;
            return this;
        }

        /**
         * Sets the keywords field.
         *
         * @param value the override, or {@code null} to leave it unset
         * @return this builder
         */
        @JsonProperty("keywords")
        public Builder withKeywords(final @Nullable KeywordConfig value) {
            this.keywords = value;
            return this;
        }

        /**
         * Sets the postprocessor field.
         *
         * @param value the override, or {@code null} to leave it unset
         * @return this builder
         */
        @JsonProperty("postprocessor")
        public Builder withPostprocessor(final @Nullable PostProcessorConfig value) {
            this.postprocessor = value;
            return this;
        }

        /**
         * Sets the htmlOptions field.
         *
         * @param value the override, or {@code null} to leave it unset
         * @return this builder
         */
        @JsonProperty("html_options")
        public Builder withHtmlOptions(final @Nullable String value) {
            this.htmlOptions = value;
            return this;
        }

        /**
         * Sets the resultFormat field.
         *
         * @param value the override, or {@code null} to leave it unset
         * @return this builder
         */
        @JsonProperty("result_format")
        public Builder withResultFormat(final @Nullable ResultFormat value) {
            this.resultFormat = value;
            return this;
        }

        /**
         * Sets the outputFormat field.
         *
         * @param value the override, or {@code null} to leave it unset
         * @return this builder
         */
        @JsonProperty("output_format")
        public Builder withOutputFormat(final @Nullable OutputFormat value) {
            this.outputFormat = value;
            return this;
        }

        /**
         * Sets the includeDocumentStructure field.
         *
         * @param value the override, or {@code null} to leave it unset
         * @return this builder
         */
        @JsonProperty("include_document_structure")
        public Builder withIncludeDocumentStructure(final @Nullable Boolean value) {
            this.includeDocumentStructure = value;
            return this;
        }

        /**
         * Primitive overload of {@link #withIncludeDocumentStructure(Boolean)}.
         *
         * @param value the override
         * @return this builder
         */
        public Builder withIncludeDocumentStructure(final boolean value) {
            return withIncludeDocumentStructure(Boolean.valueOf(value));
        }

        /**
         * Sets the layout field.
         *
         * @param value the override, or {@code null} to leave it unset
         * @return this builder
         */
        @JsonProperty("layout")
        public Builder withLayout(final @Nullable LayoutDetectionConfig value) {
            this.layout = value;
            return this;
        }

        /**
         * Sets the timeoutSecs field.
         *
         * @param value the timeout in seconds, or {@code null} to leave it unset
         * @return this builder
         */
        @JsonProperty("timeout_secs")
        public Builder withTimeoutSecs(final @Nullable Long value) {
            this.timeoutSecs = value;
            return this;
        }

        /**
         * Primitive overload of {@link #withTimeoutSecs(Long)}; it also keeps calls that
         * pass an {@code int} expression (for example a literal {@code 30}) compiling,
         * which the boxed signature alone would reject.
         *
         * @param value the timeout in seconds
         * @return this builder
         */
        public Builder withTimeoutSecs(final long value) {
            return withTimeoutSecs(Long.valueOf(value));
        }

        /**
         * Sets the treeSitter field.
         *
         * @param value the override, or {@code null} to leave it unset
         * @return this builder
         */
        @JsonProperty("tree_sitter")
        public Builder withTreeSitter(final @Nullable TreeSitterConfig value) {
            this.treeSitter = value;
            return this;
        }

        /**
         * Sets the structuredExtraction field.
         *
         * @param value the override, or {@code null} to leave it unset
         * @return this builder
         */
        @JsonProperty("structured_extraction")
        public Builder withStructuredExtraction(final @Nullable StructuredExtractionConfig value) {
            this.structuredExtraction = value;
            return this;
        }

        /**
         * Builds the FileExtractionConfig instance.
         *
         * <p>Arguments are passed positionally, so their order must match the record's
         * component order exactly.
         *
         * @return a new {@link FileExtractionConfig} holding the values set on this builder
         */
        public FileExtractionConfig build() {
            return new FileExtractionConfig(
                enableQualityProcessing,
                ocr,
                forceOcr,
                forceOcrPages,
                disableOcr,
                chunking,
                contentFilter,
                images,
                pdfOptions,
                tokenReduction,
                languageDetection,
                pages,
                keywords,
                postprocessor,
                htmlOptions,
                resultFormat,
                outputFormat,
                includeDocumentStructure,
                layout,
                timeoutSecs,
                treeSitter,
                structuredExtraction
            );
        }
    }
    // CPD-ON
}