This commit is contained in:
361
packages/java/dev/kreuzberg/FileExtractionConfig.java
generated
Normal file
361
packages/java/dev/kreuzberg/FileExtractionConfig.java
generated
Normal file
@@ -0,0 +1,361 @@
|
||||
// This file is auto-generated by alef — DO NOT EDIT.
|
||||
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
// To regenerate: alef generate
|
||||
// To verify freshness: alef verify --exit-code
|
||||
// Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
package dev.kreuzberg;
|
||||
|
||||
import java.util.List;
|
||||
import com.fasterxml.jackson.annotation.JsonProperty;
|
||||
import com.fasterxml.jackson.annotation.JsonInclude;
|
||||
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
|
||||
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
|
||||
import org.jspecify.annotations.Nullable;
|
||||
|
||||
/**
|
||||
* Per-file extraction configuration overrides for batch processing.
|
||||
*
|
||||
* All fields are {@code Option<T>} — {@code None} means "use the batch-level default."
|
||||
* This type is used with {@code batch_extract_files} and
|
||||
* {@code batch_extract_bytes} to allow heterogeneous
|
||||
* extraction settings within a single batch.
|
||||
*
|
||||
* # Excluded Fields
|
||||
*
|
||||
* The following {@code ExtractionConfig} fields are batch-level only and
|
||||
* cannot be overridden per file:
|
||||
* - {@code max_concurrent_extractions} — controls batch parallelism
|
||||
* - {@code use_cache} — global caching policy
|
||||
* - {@code acceleration} — shared ONNX execution provider
|
||||
* - {@code security_limits} — global archive security policy
|
||||
*/
|
||||
@JsonInclude(JsonInclude.Include.NON_ABSENT)
|
||||
@JsonDeserialize(builder = FileExtractionConfig.Builder.class)
|
||||
public record FileExtractionConfig(
|
||||
/**
|
||||
* Override quality post-processing for this file.
|
||||
*/
|
||||
@Nullable @JsonProperty("enable_quality_processing") Boolean enableQualityProcessing,
|
||||
/**
|
||||
* Override OCR configuration for this file (null in the Option = use batch default).
|
||||
*/
|
||||
@Nullable @JsonProperty("ocr") OcrConfig ocr,
|
||||
/**
|
||||
* Override force OCR for this file.
|
||||
*/
|
||||
@Nullable @JsonProperty("force_ocr") Boolean forceOcr,
|
||||
/**
|
||||
* Override force OCR pages for this file (1-indexed page numbers).
|
||||
*/
|
||||
@Nullable @JsonProperty("force_ocr_pages") List<Integer> forceOcrPages,
|
||||
/**
|
||||
* Override disable OCR for this file.
|
||||
*/
|
||||
@Nullable @JsonProperty("disable_ocr") Boolean disableOcr,
|
||||
/**
|
||||
* Override chunking configuration for this file.
|
||||
*/
|
||||
@Nullable @JsonProperty("chunking") ChunkingConfig chunking,
|
||||
/**
|
||||
* Override content filtering configuration for this file.
|
||||
*/
|
||||
@Nullable @JsonProperty("content_filter") ContentFilterConfig contentFilter,
|
||||
/**
|
||||
* Override image extraction configuration for this file.
|
||||
*/
|
||||
@Nullable @JsonProperty("images") ImageExtractionConfig images,
|
||||
/**
|
||||
* Override PDF options for this file.
|
||||
*/
|
||||
@Nullable @JsonProperty("pdf_options") PdfConfig pdfOptions,
|
||||
/**
|
||||
* Override token reduction for this file.
|
||||
*/
|
||||
@Nullable @JsonProperty("token_reduction") TokenReductionOptions tokenReduction,
|
||||
/**
|
||||
* Override language detection for this file.
|
||||
*/
|
||||
@Nullable @JsonProperty("language_detection") LanguageDetectionConfig languageDetection,
|
||||
/**
|
||||
* Override page extraction for this file.
|
||||
*/
|
||||
@Nullable @JsonProperty("pages") PageConfig pages,
|
||||
/**
|
||||
* Override keyword extraction for this file.
|
||||
*/
|
||||
@Nullable @JsonProperty("keywords") KeywordConfig keywords,
|
||||
/**
|
||||
* Override post-processor for this file.
|
||||
*/
|
||||
@Nullable @JsonProperty("postprocessor") PostProcessorConfig postprocessor,
|
||||
/**
|
||||
* Override HTML conversion options for this file.
|
||||
*/
|
||||
@Nullable @JsonProperty("html_options") String htmlOptions,
|
||||
/**
|
||||
* Override result format for this file.
|
||||
*/
|
||||
@Nullable @JsonProperty("result_format") ResultFormat resultFormat,
|
||||
/**
|
||||
* Override output content format for this file.
|
||||
*/
|
||||
@Nullable @JsonProperty("output_format") OutputFormat outputFormat,
|
||||
/**
|
||||
* Override document structure output for this file.
|
||||
*/
|
||||
@Nullable @JsonProperty("include_document_structure") Boolean includeDocumentStructure,
|
||||
/**
|
||||
* Override layout detection for this file.
|
||||
*/
|
||||
@Nullable @JsonProperty("layout") LayoutDetectionConfig layout,
|
||||
/**
|
||||
* Override per-file extraction timeout in seconds.
|
||||
*
|
||||
* When set, the extraction for this file will be canceled after the
|
||||
* specified duration. A timed-out file produces an error result without
|
||||
* affecting other files in the batch.
|
||||
*/
|
||||
@Nullable @JsonProperty("timeout_secs") Long timeoutSecs,
|
||||
/**
|
||||
* Override tree-sitter configuration for this file.
|
||||
*/
|
||||
@Nullable @JsonProperty("tree_sitter") TreeSitterConfig treeSitter,
|
||||
/**
|
||||
* Override structured extraction configuration for this file.
|
||||
*
|
||||
* When set, enables LLM-based structured extraction with a JSON schema
|
||||
* for this specific file. The extracted content is sent to a VLM/LLM
|
||||
* and the response is parsed according to the provided schema.
|
||||
*/
|
||||
@Nullable @JsonProperty("structured_extraction") StructuredExtractionConfig structuredExtraction
|
||||
) {
|
||||
public static Builder builder() {
|
||||
return new Builder();
|
||||
}
|
||||
|
||||
// CPD-OFF
|
||||
@JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
|
||||
public static final class Builder {
|
||||
|
||||
@JsonProperty("enable_quality_processing")
|
||||
private Boolean enableQualityProcessing = null;
|
||||
private OcrConfig ocr = null;
|
||||
@JsonProperty("force_ocr")
|
||||
private Boolean forceOcr = null;
|
||||
@JsonProperty("force_ocr_pages")
|
||||
private List<Integer> forceOcrPages = null;
|
||||
@JsonProperty("disable_ocr")
|
||||
private Boolean disableOcr = null;
|
||||
private ChunkingConfig chunking = null;
|
||||
@JsonProperty("content_filter")
|
||||
private ContentFilterConfig contentFilter = null;
|
||||
private ImageExtractionConfig images = null;
|
||||
@JsonProperty("pdf_options")
|
||||
private PdfConfig pdfOptions = null;
|
||||
@JsonProperty("token_reduction")
|
||||
private TokenReductionOptions tokenReduction = null;
|
||||
@JsonProperty("language_detection")
|
||||
private LanguageDetectionConfig languageDetection = null;
|
||||
private PageConfig pages = null;
|
||||
private KeywordConfig keywords = null;
|
||||
private PostProcessorConfig postprocessor = null;
|
||||
@JsonProperty("html_options")
|
||||
private String htmlOptions = null;
|
||||
@JsonProperty("result_format")
|
||||
private ResultFormat resultFormat = null;
|
||||
@JsonProperty("output_format")
|
||||
private OutputFormat outputFormat = null;
|
||||
@JsonProperty("include_document_structure")
|
||||
private Boolean includeDocumentStructure = null;
|
||||
private LayoutDetectionConfig layout = null;
|
||||
@JsonProperty("timeout_secs")
|
||||
private Long timeoutSecs = null;
|
||||
@JsonProperty("tree_sitter")
|
||||
private TreeSitterConfig treeSitter = null;
|
||||
@JsonProperty("structured_extraction")
|
||||
private StructuredExtractionConfig structuredExtraction = null;
|
||||
|
||||
/** Sets the enableQualityProcessing field. */
|
||||
@JsonProperty("enable_quality_processing")
|
||||
public Builder withEnableQualityProcessing(final @Nullable boolean value) {
|
||||
this.enableQualityProcessing = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the ocr field. */
|
||||
@JsonProperty("ocr")
|
||||
public Builder withOcr(final @Nullable OcrConfig value) {
|
||||
this.ocr = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the forceOcr field. */
|
||||
@JsonProperty("force_ocr")
|
||||
public Builder withForceOcr(final @Nullable boolean value) {
|
||||
this.forceOcr = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the forceOcrPages field. */
|
||||
@JsonProperty("force_ocr_pages")
|
||||
public Builder withForceOcrPages(final @Nullable List<Integer> value) {
|
||||
this.forceOcrPages = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the disableOcr field. */
|
||||
@JsonProperty("disable_ocr")
|
||||
public Builder withDisableOcr(final @Nullable boolean value) {
|
||||
this.disableOcr = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the chunking field. */
|
||||
@JsonProperty("chunking")
|
||||
public Builder withChunking(final @Nullable ChunkingConfig value) {
|
||||
this.chunking = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the contentFilter field. */
|
||||
@JsonProperty("content_filter")
|
||||
public Builder withContentFilter(final @Nullable ContentFilterConfig value) {
|
||||
this.contentFilter = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the images field. */
|
||||
@JsonProperty("images")
|
||||
public Builder withImages(final @Nullable ImageExtractionConfig value) {
|
||||
this.images = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the pdfOptions field. */
|
||||
@JsonProperty("pdf_options")
|
||||
public Builder withPdfOptions(final @Nullable PdfConfig value) {
|
||||
this.pdfOptions = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the tokenReduction field. */
|
||||
@JsonProperty("token_reduction")
|
||||
public Builder withTokenReduction(final @Nullable TokenReductionOptions value) {
|
||||
this.tokenReduction = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the languageDetection field. */
|
||||
@JsonProperty("language_detection")
|
||||
public Builder withLanguageDetection(final @Nullable LanguageDetectionConfig value) {
|
||||
this.languageDetection = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the pages field. */
|
||||
@JsonProperty("pages")
|
||||
public Builder withPages(final @Nullable PageConfig value) {
|
||||
this.pages = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the keywords field. */
|
||||
@JsonProperty("keywords")
|
||||
public Builder withKeywords(final @Nullable KeywordConfig value) {
|
||||
this.keywords = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the postprocessor field. */
|
||||
@JsonProperty("postprocessor")
|
||||
public Builder withPostprocessor(final @Nullable PostProcessorConfig value) {
|
||||
this.postprocessor = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the htmlOptions field. */
|
||||
@JsonProperty("html_options")
|
||||
public Builder withHtmlOptions(final @Nullable String value) {
|
||||
this.htmlOptions = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the resultFormat field. */
|
||||
@JsonProperty("result_format")
|
||||
public Builder withResultFormat(final @Nullable ResultFormat value) {
|
||||
this.resultFormat = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the outputFormat field. */
|
||||
@JsonProperty("output_format")
|
||||
public Builder withOutputFormat(final @Nullable OutputFormat value) {
|
||||
this.outputFormat = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the includeDocumentStructure field. */
|
||||
@JsonProperty("include_document_structure")
|
||||
public Builder withIncludeDocumentStructure(final @Nullable boolean value) {
|
||||
this.includeDocumentStructure = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the layout field. */
|
||||
@JsonProperty("layout")
|
||||
public Builder withLayout(final @Nullable LayoutDetectionConfig value) {
|
||||
this.layout = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the timeoutSecs field. */
|
||||
@JsonProperty("timeout_secs")
|
||||
public Builder withTimeoutSecs(final @Nullable long value) {
|
||||
this.timeoutSecs = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the treeSitter field. */
|
||||
@JsonProperty("tree_sitter")
|
||||
public Builder withTreeSitter(final @Nullable TreeSitterConfig value) {
|
||||
this.treeSitter = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the structuredExtraction field. */
|
||||
@JsonProperty("structured_extraction")
|
||||
public Builder withStructuredExtraction(final @Nullable StructuredExtractionConfig value) {
|
||||
this.structuredExtraction = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Builds the FileExtractionConfig instance. */
|
||||
public FileExtractionConfig build() {
|
||||
return new FileExtractionConfig(
|
||||
enableQualityProcessing,
|
||||
ocr,
|
||||
forceOcr,
|
||||
forceOcrPages,
|
||||
disableOcr,
|
||||
chunking,
|
||||
contentFilter,
|
||||
images,
|
||||
pdfOptions,
|
||||
tokenReduction,
|
||||
languageDetection,
|
||||
pages,
|
||||
keywords,
|
||||
postprocessor,
|
||||
htmlOptions,
|
||||
resultFormat,
|
||||
outputFormat,
|
||||
includeDocumentStructure,
|
||||
layout,
|
||||
timeoutSecs,
|
||||
treeSitter,
|
||||
structuredExtraction
|
||||
);
|
||||
}
|
||||
}
|
||||
// CPD-ON
|
||||
}
|
||||
Reference in New Issue
Block a user