// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
|
||
|
|
package dev.kreuzberg;
|
||
|
|
|
||
|
|
import java.util.List;
|
||
|
|
import com.fasterxml.jackson.annotation.JsonProperty;
|
||
|
|
import com.fasterxml.jackson.annotation.JsonInclude;
|
||
|
|
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
|
||
|
|
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
|
||
|
|
import org.jspecify.annotations.Nullable;
|
||
|
|
|
||
|
|
/**
|
||
|
|
* Per-file extraction configuration overrides for batch processing.
|
||
|
|
*
|
||
|
|
* All fields are {@code Option<T>} — {@code None} means "use the batch-level default."
|
||
|
|
* This type is used with {@code batch_extract_files} and
|
||
|
|
* {@code batch_extract_bytes} to allow heterogeneous
|
||
|
|
* extraction settings within a single batch.
|
||
|
|
*
|
||
|
|
* # Excluded Fields
|
||
|
|
*
|
||
|
|
* The following {@code ExtractionConfig} fields are batch-level only and
|
||
|
|
* cannot be overridden per file:
|
||
|
|
* - {@code max_concurrent_extractions} — controls batch parallelism
|
||
|
|
* - {@code use_cache} — global caching policy
|
||
|
|
* - {@code acceleration} — shared ONNX execution provider
|
||
|
|
* - {@code security_limits} — global archive security policy
|
||
|
|
*/
|
||
|
|
@JsonInclude(JsonInclude.Include.NON_ABSENT)
|
||
|
|
@JsonDeserialize(builder = FileExtractionConfig.Builder.class)
|
||
|
|
public record FileExtractionConfig(
|
||
|
|
/**
|
||
|
|
* Override quality post-processing for this file.
|
||
|
|
*/
|
||
|
|
@Nullable @JsonProperty("enable_quality_processing") Boolean enableQualityProcessing,
|
||
|
|
/**
|
||
|
|
* Override OCR configuration for this file (null in the Option = use batch default).
|
||
|
|
*/
|
||
|
|
@Nullable @JsonProperty("ocr") OcrConfig ocr,
|
||
|
|
/**
|
||
|
|
* Override force OCR for this file.
|
||
|
|
*/
|
||
|
|
@Nullable @JsonProperty("force_ocr") Boolean forceOcr,
|
||
|
|
/**
|
||
|
|
* Override force OCR pages for this file (1-indexed page numbers).
|
||
|
|
*/
|
||
|
|
@Nullable @JsonProperty("force_ocr_pages") List<Integer> forceOcrPages,
|
||
|
|
/**
|
||
|
|
* Override disable OCR for this file.
|
||
|
|
*/
|
||
|
|
@Nullable @JsonProperty("disable_ocr") Boolean disableOcr,
|
||
|
|
/**
|
||
|
|
* Override chunking configuration for this file.
|
||
|
|
*/
|
||
|
|
@Nullable @JsonProperty("chunking") ChunkingConfig chunking,
|
||
|
|
/**
|
||
|
|
* Override content filtering configuration for this file.
|
||
|
|
*/
|
||
|
|
@Nullable @JsonProperty("content_filter") ContentFilterConfig contentFilter,
|
||
|
|
/**
|
||
|
|
* Override image extraction configuration for this file.
|
||
|
|
*/
|
||
|
|
@Nullable @JsonProperty("images") ImageExtractionConfig images,
|
||
|
|
/**
|
||
|
|
* Override PDF options for this file.
|
||
|
|
*/
|
||
|
|
@Nullable @JsonProperty("pdf_options") PdfConfig pdfOptions,
|
||
|
|
/**
|
||
|
|
* Override token reduction for this file.
|
||
|
|
*/
|
||
|
|
@Nullable @JsonProperty("token_reduction") TokenReductionOptions tokenReduction,
|
||
|
|
/**
|
||
|
|
* Override language detection for this file.
|
||
|
|
*/
|
||
|
|
@Nullable @JsonProperty("language_detection") LanguageDetectionConfig languageDetection,
|
||
|
|
/**
|
||
|
|
* Override page extraction for this file.
|
||
|
|
*/
|
||
|
|
@Nullable @JsonProperty("pages") PageConfig pages,
|
||
|
|
/**
|
||
|
|
* Override keyword extraction for this file.
|
||
|
|
*/
|
||
|
|
@Nullable @JsonProperty("keywords") KeywordConfig keywords,
|
||
|
|
/**
|
||
|
|
* Override post-processor for this file.
|
||
|
|
*/
|
||
|
|
@Nullable @JsonProperty("postprocessor") PostProcessorConfig postprocessor,
|
||
|
|
/**
|
||
|
|
* Override HTML conversion options for this file.
|
||
|
|
*/
|
||
|
|
@Nullable @JsonProperty("html_options") String htmlOptions,
|
||
|
|
/**
|
||
|
|
* Override result format for this file.
|
||
|
|
*/
|
||
|
|
@Nullable @JsonProperty("result_format") ResultFormat resultFormat,
|
||
|
|
/**
|
||
|
|
* Override output content format for this file.
|
||
|
|
*/
|
||
|
|
@Nullable @JsonProperty("output_format") OutputFormat outputFormat,
|
||
|
|
/**
|
||
|
|
* Override document structure output for this file.
|
||
|
|
*/
|
||
|
|
@Nullable @JsonProperty("include_document_structure") Boolean includeDocumentStructure,
|
||
|
|
/**
|
||
|
|
* Override layout detection for this file.
|
||
|
|
*/
|
||
|
|
@Nullable @JsonProperty("layout") LayoutDetectionConfig layout,
|
||
|
|
/**
|
||
|
|
* Override per-file extraction timeout in seconds.
|
||
|
|
*
|
||
|
|
* When set, the extraction for this file will be canceled after the
|
||
|
|
* specified duration. A timed-out file produces an error result without
|
||
|
|
* affecting other files in the batch.
|
||
|
|
*/
|
||
|
|
@Nullable @JsonProperty("timeout_secs") Long timeoutSecs,
|
||
|
|
/**
|
||
|
|
* Override tree-sitter configuration for this file.
|
||
|
|
*/
|
||
|
|
@Nullable @JsonProperty("tree_sitter") TreeSitterConfig treeSitter,
|
||
|
|
/**
|
||
|
|
* Override structured extraction configuration for this file.
|
||
|
|
*
|
||
|
|
* When set, enables LLM-based structured extraction with a JSON schema
|
||
|
|
* for this specific file. The extracted content is sent to a VLM/LLM
|
||
|
|
* and the response is parsed according to the provided schema.
|
||
|
|
*/
|
||
|
|
@Nullable @JsonProperty("structured_extraction") StructuredExtractionConfig structuredExtraction
|
||
|
|
) {
|
||
|
|
public static Builder builder() {
|
||
|
|
return new Builder();
|
||
|
|
}
|
||
|
|
|
||
|
|
// CPD-OFF
|
||
|
|
@JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
|
||
|
|
public static final class Builder {
|
||
|
|
|
||
|
|
@JsonProperty("enable_quality_processing")
|
||
|
|
private Boolean enableQualityProcessing = null;
|
||
|
|
private OcrConfig ocr = null;
|
||
|
|
@JsonProperty("force_ocr")
|
||
|
|
private Boolean forceOcr = null;
|
||
|
|
@JsonProperty("force_ocr_pages")
|
||
|
|
private List<Integer> forceOcrPages = null;
|
||
|
|
@JsonProperty("disable_ocr")
|
||
|
|
private Boolean disableOcr = null;
|
||
|
|
private ChunkingConfig chunking = null;
|
||
|
|
@JsonProperty("content_filter")
|
||
|
|
private ContentFilterConfig contentFilter = null;
|
||
|
|
private ImageExtractionConfig images = null;
|
||
|
|
@JsonProperty("pdf_options")
|
||
|
|
private PdfConfig pdfOptions = null;
|
||
|
|
@JsonProperty("token_reduction")
|
||
|
|
private TokenReductionOptions tokenReduction = null;
|
||
|
|
@JsonProperty("language_detection")
|
||
|
|
private LanguageDetectionConfig languageDetection = null;
|
||
|
|
private PageConfig pages = null;
|
||
|
|
private KeywordConfig keywords = null;
|
||
|
|
private PostProcessorConfig postprocessor = null;
|
||
|
|
@JsonProperty("html_options")
|
||
|
|
private String htmlOptions = null;
|
||
|
|
@JsonProperty("result_format")
|
||
|
|
private ResultFormat resultFormat = null;
|
||
|
|
@JsonProperty("output_format")
|
||
|
|
private OutputFormat outputFormat = null;
|
||
|
|
@JsonProperty("include_document_structure")
|
||
|
|
private Boolean includeDocumentStructure = null;
|
||
|
|
private LayoutDetectionConfig layout = null;
|
||
|
|
@JsonProperty("timeout_secs")
|
||
|
|
private Long timeoutSecs = null;
|
||
|
|
@JsonProperty("tree_sitter")
|
||
|
|
private TreeSitterConfig treeSitter = null;
|
||
|
|
@JsonProperty("structured_extraction")
|
||
|
|
private StructuredExtractionConfig structuredExtraction = null;
|
||
|
|
|
||
|
|
/** Sets the enableQualityProcessing field. */
|
||
|
|
@JsonProperty("enable_quality_processing")
|
||
|
|
public Builder withEnableQualityProcessing(final @Nullable boolean value) {
|
||
|
|
this.enableQualityProcessing = value;
|
||
|
|
return this;
|
||
|
|
}
|
||
|
|
|
||
|
|
/** Sets the ocr field. */
|
||
|
|
@JsonProperty("ocr")
|
||
|
|
public Builder withOcr(final @Nullable OcrConfig value) {
|
||
|
|
this.ocr = value;
|
||
|
|
return this;
|
||
|
|
}
|
||
|
|
|
||
|
|
/** Sets the forceOcr field. */
|
||
|
|
@JsonProperty("force_ocr")
|
||
|
|
public Builder withForceOcr(final @Nullable boolean value) {
|
||
|
|
this.forceOcr = value;
|
||
|
|
return this;
|
||
|
|
}
|
||
|
|
|
||
|
|
/** Sets the forceOcrPages field. */
|
||
|
|
@JsonProperty("force_ocr_pages")
|
||
|
|
public Builder withForceOcrPages(final @Nullable List<Integer> value) {
|
||
|
|
this.forceOcrPages = value;
|
||
|
|
return this;
|
||
|
|
}
|
||
|
|
|
||
|
|
/** Sets the disableOcr field. */
|
||
|
|
@JsonProperty("disable_ocr")
|
||
|
|
public Builder withDisableOcr(final @Nullable boolean value) {
|
||
|
|
this.disableOcr = value;
|
||
|
|
return this;
|
||
|
|
}
|
||
|
|
|
||
|
|
/** Sets the chunking field. */
|
||
|
|
@JsonProperty("chunking")
|
||
|
|
public Builder withChunking(final @Nullable ChunkingConfig value) {
|
||
|
|
this.chunking = value;
|
||
|
|
return this;
|
||
|
|
}
|
||
|
|
|
||
|
|
/** Sets the contentFilter field. */
|
||
|
|
@JsonProperty("content_filter")
|
||
|
|
public Builder withContentFilter(final @Nullable ContentFilterConfig value) {
|
||
|
|
this.contentFilter = value;
|
||
|
|
return this;
|
||
|
|
}
|
||
|
|
|
||
|
|
/** Sets the images field. */
|
||
|
|
@JsonProperty("images")
|
||
|
|
public Builder withImages(final @Nullable ImageExtractionConfig value) {
|
||
|
|
this.images = value;
|
||
|
|
return this;
|
||
|
|
}
|
||
|
|
|
||
|
|
/** Sets the pdfOptions field. */
|
||
|
|
@JsonProperty("pdf_options")
|
||
|
|
public Builder withPdfOptions(final @Nullable PdfConfig value) {
|
||
|
|
this.pdfOptions = value;
|
||
|
|
return this;
|
||
|
|
}
|
||
|
|
|
||
|
|
/** Sets the tokenReduction field. */
|
||
|
|
@JsonProperty("token_reduction")
|
||
|
|
public Builder withTokenReduction(final @Nullable TokenReductionOptions value) {
|
||
|
|
this.tokenReduction = value;
|
||
|
|
return this;
|
||
|
|
}
|
||
|
|
|
||
|
|
/** Sets the languageDetection field. */
|
||
|
|
@JsonProperty("language_detection")
|
||
|
|
public Builder withLanguageDetection(final @Nullable LanguageDetectionConfig value) {
|
||
|
|
this.languageDetection = value;
|
||
|
|
return this;
|
||
|
|
}
|
||
|
|
|
||
|
|
/** Sets the pages field. */
|
||
|
|
@JsonProperty("pages")
|
||
|
|
public Builder withPages(final @Nullable PageConfig value) {
|
||
|
|
this.pages = value;
|
||
|
|
return this;
|
||
|
|
}
|
||
|
|
|
||
|
|
/** Sets the keywords field. */
|
||
|
|
@JsonProperty("keywords")
|
||
|
|
public Builder withKeywords(final @Nullable KeywordConfig value) {
|
||
|
|
this.keywords = value;
|
||
|
|
return this;
|
||
|
|
}
|
||
|
|
|
||
|
|
/** Sets the postprocessor field. */
|
||
|
|
@JsonProperty("postprocessor")
|
||
|
|
public Builder withPostprocessor(final @Nullable PostProcessorConfig value) {
|
||
|
|
this.postprocessor = value;
|
||
|
|
return this;
|
||
|
|
}
|
||
|
|
|
||
|
|
/** Sets the htmlOptions field. */
|
||
|
|
@JsonProperty("html_options")
|
||
|
|
public Builder withHtmlOptions(final @Nullable String value) {
|
||
|
|
this.htmlOptions = value;
|
||
|
|
return this;
|
||
|
|
}
|
||
|
|
|
||
|
|
/** Sets the resultFormat field. */
|
||
|
|
@JsonProperty("result_format")
|
||
|
|
public Builder withResultFormat(final @Nullable ResultFormat value) {
|
||
|
|
this.resultFormat = value;
|
||
|
|
return this;
|
||
|
|
}
|
||
|
|
|
||
|
|
/** Sets the outputFormat field. */
|
||
|
|
@JsonProperty("output_format")
|
||
|
|
public Builder withOutputFormat(final @Nullable OutputFormat value) {
|
||
|
|
this.outputFormat = value;
|
||
|
|
return this;
|
||
|
|
}
|
||
|
|
|
||
|
|
/** Sets the includeDocumentStructure field. */
|
||
|
|
@JsonProperty("include_document_structure")
|
||
|
|
public Builder withIncludeDocumentStructure(final @Nullable boolean value) {
|
||
|
|
this.includeDocumentStructure = value;
|
||
|
|
return this;
|
||
|
|
}
|
||
|
|
|
||
|
|
/** Sets the layout field. */
|
||
|
|
@JsonProperty("layout")
|
||
|
|
public Builder withLayout(final @Nullable LayoutDetectionConfig value) {
|
||
|
|
this.layout = value;
|
||
|
|
return this;
|
||
|
|
}
|
||
|
|
|
||
|
|
/** Sets the timeoutSecs field. */
|
||
|
|
@JsonProperty("timeout_secs")
|
||
|
|
public Builder withTimeoutSecs(final @Nullable long value) {
|
||
|
|
this.timeoutSecs = value;
|
||
|
|
return this;
|
||
|
|
}
|
||
|
|
|
||
|
|
/** Sets the treeSitter field. */
|
||
|
|
@JsonProperty("tree_sitter")
|
||
|
|
public Builder withTreeSitter(final @Nullable TreeSitterConfig value) {
|
||
|
|
this.treeSitter = value;
|
||
|
|
return this;
|
||
|
|
}
|
||
|
|
|
||
|
|
/** Sets the structuredExtraction field. */
|
||
|
|
@JsonProperty("structured_extraction")
|
||
|
|
public Builder withStructuredExtraction(final @Nullable StructuredExtractionConfig value) {
|
||
|
|
this.structuredExtraction = value;
|
||
|
|
return this;
|
||
|
|
}
|
||
|
|
|
||
|
|
/** Builds the FileExtractionConfig instance. */
|
||
|
|
public FileExtractionConfig build() {
|
||
|
|
return new FileExtractionConfig(
|
||
|
|
enableQualityProcessing,
|
||
|
|
ocr,
|
||
|
|
forceOcr,
|
||
|
|
forceOcrPages,
|
||
|
|
disableOcr,
|
||
|
|
chunking,
|
||
|
|
contentFilter,
|
||
|
|
images,
|
||
|
|
pdfOptions,
|
||
|
|
tokenReduction,
|
||
|
|
languageDetection,
|
||
|
|
pages,
|
||
|
|
keywords,
|
||
|
|
postprocessor,
|
||
|
|
htmlOptions,
|
||
|
|
resultFormat,
|
||
|
|
outputFormat,
|
||
|
|
includeDocumentStructure,
|
||
|
|
layout,
|
||
|
|
timeoutSecs,
|
||
|
|
treeSitter,
|
||
|
|
structuredExtraction
|
||
|
|
);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
// CPD-ON
|
||
|
|
}
|