// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
|
|||
|
|
|
|||
|
|
import java.util.List;
|
|||
|
|
import com.fasterxml.jackson.annotation.JsonProperty;
|
|||
|
|
import com.fasterxml.jackson.annotation.JsonInclude;
|
|||
|
|
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
|
|||
|
|
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
|
|||
|
|
import org.jspecify.annotations.Nullable;
|
|||
|
|
|
|||
|
|
/**
|
|||
|
|
* Main extraction configuration.
|
|||
|
|
*
|
|||
|
|
* This struct contains all configuration options for the extraction process.
|
|||
|
|
* It can be loaded from TOML, YAML, or JSON files, or created programmatically.
|
|||
|
|
*/
|
|||
|
|
@JsonInclude(JsonInclude.Include.NON_ABSENT)
|
|||
|
|
@JsonDeserialize(builder = ExtractionConfig.Builder.class)
|
|||
|
|
public record ExtractionConfig(
|
|||
|
|
/**
|
|||
|
|
* Enable caching of extraction results
|
|||
|
|
*/
|
|||
|
|
@Nullable @JsonProperty("use_cache") Boolean useCache,
|
|||
|
|
/**
|
|||
|
|
* Enable quality post-processing
|
|||
|
|
*/
|
|||
|
|
@Nullable @JsonProperty("enable_quality_processing") Boolean enableQualityProcessing,
|
|||
|
|
/**
|
|||
|
|
* OCR configuration (null = OCR disabled)
|
|||
|
|
*/
|
|||
|
|
@Nullable @JsonProperty("ocr") OcrConfig ocr,
|
|||
|
|
/**
|
|||
|
|
* Force OCR even for searchable PDFs
|
|||
|
|
*/
|
|||
|
|
@Nullable @JsonProperty("force_ocr") Boolean forceOcr,
|
|||
|
|
/**
|
|||
|
|
* Force OCR on specific pages only (1-indexed page numbers, must be >= 1).
|
|||
|
|
*
|
|||
|
|
* When set, only the listed pages are OCR'd regardless of text layer quality.
|
|||
|
|
* Unlisted pages use native text extraction. Ignored when {@code force_ocr} is {@code true}.
|
|||
|
|
* Only applies to PDF documents. Duplicates are automatically deduplicated.
|
|||
|
|
* An {@code ocr} config is recommended for backend/language selection; defaults are used if absent.
|
|||
|
|
*/
|
|||
|
|
@Nullable @JsonProperty("force_ocr_pages") List<Integer> forceOcrPages,
|
|||
|
|
/**
|
|||
|
|
* Disable OCR entirely, even for images.
|
|||
|
|
*
|
|||
|
|
* When {@code true}, OCR is skipped for all document types. Images return metadata
|
|||
|
|
* only (dimensions, format, EXIF) without text extraction. PDFs use only
|
|||
|
|
* native text extraction without OCR fallback.
|
|||
|
|
*
|
|||
|
|
* Cannot be {@code true} simultaneously with {@code force_ocr}.
|
|||
|
|
*
|
|||
|
|
* *Added in v4.7.0.*
|
|||
|
|
*/
|
|||
|
|
@Nullable @JsonProperty("disable_ocr") Boolean disableOcr,
|
|||
|
|
/**
|
|||
|
|
* Text chunking configuration (null = chunking disabled)
|
|||
|
|
*/
|
|||
|
|
@Nullable @JsonProperty("chunking") ChunkingConfig chunking,
|
|||
|
|
/**
|
|||
|
|
* Content filtering configuration (null = use extractor defaults).
|
|||
|
|
*
|
|||
|
|
* Controls whether document "furniture" (headers, footers, watermarks,
|
|||
|
|
* repeating text) is included in or stripped from extraction results.
|
|||
|
|
* See ContentFilterConfig for per-field documentation.
|
|||
|
|
*/
|
|||
|
|
@Nullable @JsonProperty("content_filter") ContentFilterConfig contentFilter,
|
|||
|
|
/**
|
|||
|
|
* Image extraction configuration (null = no image extraction)
|
|||
|
|
*/
|
|||
|
|
@Nullable @JsonProperty("images") ImageExtractionConfig images,
|
|||
|
|
/**
|
|||
|
|
* PDF-specific options (null = use defaults)
|
|||
|
|
*/
|
|||
|
|
@Nullable @JsonProperty("pdf_options") PdfConfig pdfOptions,
|
|||
|
|
/**
|
|||
|
|
* Token reduction configuration (null = no token reduction)
|
|||
|
|
*/
|
|||
|
|
@Nullable @JsonProperty("token_reduction") TokenReductionOptions tokenReduction,
|
|||
|
|
/**
|
|||
|
|
* Language detection configuration (null = no language detection)
|
|||
|
|
*/
|
|||
|
|
@Nullable @JsonProperty("language_detection") LanguageDetectionConfig languageDetection,
|
|||
|
|
/**
|
|||
|
|
* Page extraction configuration (null = no page tracking)
|
|||
|
|
*/
|
|||
|
|
@Nullable @JsonProperty("pages") PageConfig pages,
|
|||
|
|
/**
|
|||
|
|
* Keyword extraction configuration (null = no keyword extraction)
|
|||
|
|
*/
|
|||
|
|
@Nullable @JsonProperty("keywords") KeywordConfig keywords,
|
|||
|
|
/**
|
|||
|
|
* Post-processor configuration (null = use defaults)
|
|||
|
|
*/
|
|||
|
|
@Nullable @JsonProperty("postprocessor") PostProcessorConfig postprocessor,
|
|||
|
|
/**
|
|||
|
|
* HTML to Markdown conversion options (null = use defaults)
|
|||
|
|
*
|
|||
|
|
* Configure how HTML documents are converted to Markdown, including heading styles,
|
|||
|
|
* list formatting, code block styles, and preprocessing options.
|
|||
|
|
*/
|
|||
|
|
@Nullable @JsonProperty("html_options") String htmlOptions,
|
|||
|
|
/**
|
|||
|
|
* Styled HTML output configuration.
|
|||
|
|
*
|
|||
|
|
* When set alongside {@code output_format = OutputFormat.Html}, the extraction
|
|||
|
|
* pipeline uses StyledHtmlRenderer(crate.rendering.StyledHtmlRenderer)
|
|||
|
|
* which emits stable {@code kb-*} CSS class hooks on every structural element
|
|||
|
|
* and optionally embeds theme CSS or user-supplied CSS in a {@code <style>} block.
|
|||
|
|
*
|
|||
|
|
* When {@code None}, the existing plain comrak-based HTML renderer is used.
|
|||
|
|
*/
|
|||
|
|
@Nullable @JsonProperty("html_output") HtmlOutputConfig htmlOutput,
|
|||
|
|
/**
|
|||
|
|
* Default per-file timeout in seconds for batch extraction.
|
|||
|
|
*
|
|||
|
|
* When set, each file in a batch will be canceled after this duration
|
|||
|
|
* unless overridden by FileExtractionConfig.timeout_secs.
|
|||
|
|
*
|
|||
|
|
* Defaults to {@code Some(60)} to prevent pathological files (e.g. deeply
|
|||
|
|
* nested archives, documents with millions of cells) from running
|
|||
|
|
* indefinitely and exhausting caller resources. Set to {@code None} to
|
|||
|
|
* disable the timeout for trusted input or long-running workloads.
|
|||
|
|
*/
|
|||
|
|
@Nullable @JsonProperty("extraction_timeout_secs") Long extractionTimeoutSecs,
|
|||
|
|
/**
|
|||
|
|
* Maximum concurrent extractions in batch operations (null = (num_cpus × 1.5).ceil()).
|
|||
|
|
*
|
|||
|
|
* Limits parallelism to prevent resource exhaustion when processing
|
|||
|
|
* large batches. Defaults to (num_cpus × 1.5).ceil() when not set.
|
|||
|
|
*/
|
|||
|
|
@Nullable @JsonProperty("max_concurrent_extractions") Long maxConcurrentExtractions,
|
|||
|
|
/**
|
|||
|
|
* Result structure format
|
|||
|
|
*
|
|||
|
|
* Controls whether results are returned in unified format (default) with all
|
|||
|
|
* content in the {@code content} field, or element-based format with semantic
|
|||
|
|
* elements (for Unstructured-compatible output).
|
|||
|
|
*/
|
|||
|
|
@Nullable @JsonProperty("result_format") ResultFormat resultFormat,
|
|||
|
|
/**
|
|||
|
|
* Security limits for archive extraction.
|
|||
|
|
*
|
|||
|
|
* Controls maximum archive size, compression ratio, file count, and other
|
|||
|
|
* security thresholds to prevent decompression bomb attacks. Also caps
|
|||
|
|
* nesting depth, iteration count, entity / token length, total
|
|||
|
|
* content size, and table cell count for every extraction path that
|
|||
|
|
* ingests user-controlled bytes.
|
|||
|
|
* When {@code None}, default limits are used.
|
|||
|
|
*/
|
|||
|
|
@Nullable @JsonProperty("security_limits") SecurityLimits securityLimits,
|
|||
|
|
/**
|
|||
|
|
* Maximum uncompressed size in bytes for a single embedded file before
|
|||
|
|
* recursive extraction is attempted (default: 50 MiB).
|
|||
|
|
*
|
|||
|
|
* Applies to embedded objects inside OOXML containers (DOCX, PPTX) and
|
|||
|
|
* to email attachments processed via recursive extraction. Files that
|
|||
|
|
* exceed this limit are skipped with a {@code ProcessingWarning} rather than
|
|||
|
|
* passed to the extraction pipeline, preventing a single oversized
|
|||
|
|
* embedded object from consuming unbounded memory or time.
|
|||
|
|
*
|
|||
|
|
* Set to {@code None} to disable the per-embedded-file cap (falls back to
|
|||
|
|
* {@code security_limits.max_archive_size} as the only guard).
|
|||
|
|
*/
|
|||
|
|
@Nullable @JsonProperty("max_embedded_file_bytes") Long maxEmbeddedFileBytes,
|
|||
|
|
/**
|
|||
|
|
* Content text format (default: Plain).
|
|||
|
|
*
|
|||
|
|
* Controls the format of the extracted content:
|
|||
|
|
* - {@code Plain}: Raw extracted text (default)
|
|||
|
|
* - {@code Markdown}: Markdown formatted output
|
|||
|
|
* - {@code Djot}: Djot markup format (requires djot feature)
|
|||
|
|
* - {@code Html}: HTML formatted output
|
|||
|
|
*
|
|||
|
|
* When set to a structured format, extraction results will include
|
|||
|
|
* formatted output. The {@code formatted_content} field may be populated
|
|||
|
|
* when format conversion is applied.
|
|||
|
|
*/
|
|||
|
|
@Nullable @JsonProperty("output_format") OutputFormat outputFormat,
|
|||
|
|
/**
|
|||
|
|
* Layout detection configuration (null = layout detection disabled).
|
|||
|
|
*
|
|||
|
|
* When set, PDF pages and images are analyzed for document structure
|
|||
|
|
* (headings, code, formulas, tables, figures, etc.) using RT-DETR models
|
|||
|
|
* via ONNX Runtime. For PDFs, layout hints override paragraph classification
|
|||
|
|
* in the markdown pipeline. For images, per-region OCR is performed with
|
|||
|
|
* markdown formatting based on detected layout classes.
|
|||
|
|
* Requires the {@code layout-detection} feature to run inference; the field is
|
|||
|
|
* present whenever the {@code layout-types} feature is active (which includes
|
|||
|
|
* {@code layout-detection} as well as the no-ORT target groups).
|
|||
|
|
*/
|
|||
|
|
@Nullable @JsonProperty("layout") LayoutDetectionConfig layout,
|
|||
|
|
/**
|
|||
|
|
* Run layout detection on the non-OCR PDF markdown path.
|
|||
|
|
*
|
|||
|
|
* When {@code true} and {@code layout} is {@code Some(_)}, layout regions inform heading,
|
|||
|
|
* table, list, and figure detection in the structure pipeline that would
|
|||
|
|
* otherwise rely on font-clustering heuristics alone. Significantly
|
|||
|
|
* improves SF1 (structural F1) at the cost of inference latency
|
|||
|
|
* (~150-300ms/page CPU, ~20-50ms/page GPU). Default: {@code false}.
|
|||
|
|
* Requires the {@code layout-detection} feature.
|
|||
|
|
*/
|
|||
|
|
@Nullable @JsonProperty("use_layout_for_markdown") Boolean useLayoutForMarkdown,
|
|||
|
|
/**
|
|||
|
|
* Enable structured document tree output.
|
|||
|
|
*
|
|||
|
|
* When true, populates the {@code document} field on {@code ExtractionResult} with a
|
|||
|
|
* hierarchical {@code DocumentStructure} containing heading-driven section nesting,
|
|||
|
|
* table grids, content layer classification, and inline annotations.
|
|||
|
|
*
|
|||
|
|
* Independent of {@code result_format} — can be combined with Unified or ElementBased.
|
|||
|
|
*/
|
|||
|
|
@Nullable @JsonProperty("include_document_structure") Boolean includeDocumentStructure,
|
|||
|
|
/**
|
|||
|
|
* Hardware acceleration configuration for ONNX Runtime models.
|
|||
|
|
*
|
|||
|
|
* Controls execution provider selection for layout detection and embedding
|
|||
|
|
* models. When {@code None}, uses platform defaults (CoreML on macOS, CUDA on
|
|||
|
|
* Linux, CPU on Windows).
|
|||
|
|
*/
|
|||
|
|
@Nullable @JsonProperty("acceleration") AccelerationConfig acceleration,
|
|||
|
|
/**
|
|||
|
|
* Cache namespace for tenant isolation.
|
|||
|
|
*
|
|||
|
|
* When set, cache entries are stored under {@code {cache_dir}/{namespace}/}.
|
|||
|
|
* Must be alphanumeric, hyphens, or underscores only (max 64 chars).
|
|||
|
|
* Different namespaces have isolated cache spaces on the same filesystem.
|
|||
|
|
*/
|
|||
|
|
@Nullable @JsonProperty("cache_namespace") String cacheNamespace,
|
|||
|
|
/**
|
|||
|
|
* Per-request cache TTL in seconds.
|
|||
|
|
*
|
|||
|
|
* Overrides the global {@code max_age_days} for this specific extraction.
|
|||
|
|
* When {@code 0}, caching is completely skipped (no read or write).
|
|||
|
|
* When {@code None}, the global TTL applies.
|
|||
|
|
*/
|
|||
|
|
@Nullable @JsonProperty("cache_ttl_secs") Long cacheTtlSecs,
|
|||
|
|
/**
|
|||
|
|
* Email extraction configuration (null = use defaults).
|
|||
|
|
*
|
|||
|
|
* Currently supports configuring the fallback codepage for MSG files
|
|||
|
|
* that do not specify one. See {@code EmailConfig} for details.
|
|||
|
|
*/
|
|||
|
|
@Nullable @JsonProperty("email") EmailConfig email,
|
|||
|
|
/**
|
|||
|
|
* Concurrency limits for constrained environments (null = use defaults).
|
|||
|
|
*
|
|||
|
|
* Controls Rayon thread pool size, ONNX Runtime intra-op threads, and
|
|||
|
|
* (when {@code max_concurrent_extractions} is unset) the batch concurrency
|
|||
|
|
* semaphore. See {@code ConcurrencyConfig} for details.
|
|||
|
|
*/
|
|||
|
|
@Nullable @JsonProperty("concurrency") String concurrency,
|
|||
|
|
/**
|
|||
|
|
* Maximum recursion depth for archive extraction (default: 3).
|
|||
|
|
* Set to 0 to disable recursive extraction (legacy behavior).
|
|||
|
|
*/
|
|||
|
|
@Nullable @JsonProperty("max_archive_depth") Long maxArchiveDepth,
|
|||
|
|
/**
|
|||
|
|
* Tree-sitter language pack configuration (null = tree-sitter disabled).
|
|||
|
|
*
|
|||
|
|
* When set, enables code file extraction using tree-sitter parsers.
|
|||
|
|
* Controls grammar download behavior and code analysis options.
|
|||
|
|
*/
|
|||
|
|
@Nullable @JsonProperty("tree_sitter") TreeSitterConfig treeSitter,
|
|||
|
|
/**
|
|||
|
|
* Structured extraction via LLM (null = disabled).
|
|||
|
|
*
|
|||
|
|
* When set, the extracted document content is sent to an LLM with the
|
|||
|
|
* provided JSON schema. The structured response is stored in
|
|||
|
|
* {@code ExtractionResult.structured_output}.
|
|||
|
|
*/
|
|||
|
|
@Nullable @JsonProperty("structured_extraction") StructuredExtractionConfig structuredExtraction,
|
|||
|
|
/**
|
|||
|
|
* Cancellation token for this extraction (null = no external cancellation).
|
|||
|
|
*
|
|||
|
|
* Pass a CancellationToken clone here and call CancellationToken.cancel
|
|||
|
|
* from another thread / task to abort the extraction in progress. The extractor
|
|||
|
|
* checks the token at safe checkpoints (before lock acquisition, between pages,
|
|||
|
|
* between batch items) and returns KreuzbergError.Cancelled when set.
|
|||
|
|
*
|
|||
|
|
* The field is excluded from serialization because {@code CancellationToken} is a
|
|||
|
|
* runtime handle, not a configuration value.
|
|||
|
|
*/
|
|||
|
|
@Nullable @JsonProperty("cancel_token") String cancelToken
|
|||
|
|
) {
|
|||
|
|
public static Builder builder() {
|
|||
|
|
return new Builder();
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// CPD-OFF
|
|||
|
|
@JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
|
|||
|
|
public static final class Builder {
|
|||
|
|
|
|||
|
|
@JsonProperty("use_cache")
|
|||
|
|
private Boolean useCache = null;
|
|||
|
|
@JsonProperty("enable_quality_processing")
|
|||
|
|
private Boolean enableQualityProcessing = null;
|
|||
|
|
@Nullable private OcrConfig ocr = null;
|
|||
|
|
@JsonProperty("force_ocr")
|
|||
|
|
private Boolean forceOcr = null;
|
|||
|
|
@JsonProperty("force_ocr_pages")
|
|||
|
|
private List<Integer> forceOcrPages = null;
|
|||
|
|
@JsonProperty("disable_ocr")
|
|||
|
|
private Boolean disableOcr = null;
|
|||
|
|
@Nullable private ChunkingConfig chunking = null;
|
|||
|
|
@JsonProperty("content_filter")
|
|||
|
|
@Nullable private ContentFilterConfig contentFilter = null;
|
|||
|
|
@Nullable private ImageExtractionConfig images = null;
|
|||
|
|
@JsonProperty("pdf_options")
|
|||
|
|
@Nullable private PdfConfig pdfOptions = null;
|
|||
|
|
@JsonProperty("token_reduction")
|
|||
|
|
@Nullable private TokenReductionOptions tokenReduction = null;
|
|||
|
|
@JsonProperty("language_detection")
|
|||
|
|
@Nullable private LanguageDetectionConfig languageDetection = null;
|
|||
|
|
@Nullable private PageConfig pages = null;
|
|||
|
|
@Nullable private KeywordConfig keywords = null;
|
|||
|
|
@Nullable private PostProcessorConfig postprocessor = null;
|
|||
|
|
@JsonProperty("html_options")
|
|||
|
|
private String htmlOptions = null;
|
|||
|
|
@JsonProperty("html_output")
|
|||
|
|
@Nullable private HtmlOutputConfig htmlOutput = null;
|
|||
|
|
@JsonProperty("extraction_timeout_secs")
|
|||
|
|
private Long extractionTimeoutSecs = null;
|
|||
|
|
@JsonProperty("max_concurrent_extractions")
|
|||
|
|
private Long maxConcurrentExtractions = null;
|
|||
|
|
@JsonProperty("result_format")
|
|||
|
|
@Nullable private ResultFormat resultFormat = ResultFormat.Unified;
|
|||
|
|
@JsonProperty("security_limits")
|
|||
|
|
@Nullable private SecurityLimits securityLimits = null;
|
|||
|
|
@JsonProperty("max_embedded_file_bytes")
|
|||
|
|
private Long maxEmbeddedFileBytes = null;
|
|||
|
|
@JsonProperty("output_format")
|
|||
|
|
@Nullable private OutputFormat outputFormat = OutputFormat.Plain;
|
|||
|
|
@Nullable private LayoutDetectionConfig layout = null;
|
|||
|
|
@JsonProperty("use_layout_for_markdown")
|
|||
|
|
private Boolean useLayoutForMarkdown = null;
|
|||
|
|
@JsonProperty("include_document_structure")
|
|||
|
|
private Boolean includeDocumentStructure = null;
|
|||
|
|
@Nullable private AccelerationConfig acceleration = null;
|
|||
|
|
@JsonProperty("cache_namespace")
|
|||
|
|
private String cacheNamespace = null;
|
|||
|
|
@JsonProperty("cache_ttl_secs")
|
|||
|
|
private Long cacheTtlSecs = null;
|
|||
|
|
@Nullable private EmailConfig email = null;
|
|||
|
|
private String concurrency = null;
|
|||
|
|
@JsonProperty("max_archive_depth")
|
|||
|
|
private Long maxArchiveDepth = null;
|
|||
|
|
@JsonProperty("tree_sitter")
|
|||
|
|
@Nullable private TreeSitterConfig treeSitter = null;
|
|||
|
|
@JsonProperty("structured_extraction")
|
|||
|
|
@Nullable private StructuredExtractionConfig structuredExtraction = null;
|
|||
|
|
@JsonProperty("cancel_token")
|
|||
|
|
private String cancelToken = null;
|
|||
|
|
|
|||
|
|
/** Sets the useCache field. */
|
|||
|
|
@JsonProperty("use_cache")
|
|||
|
|
public Builder withUseCache(final @Nullable Boolean value) {
|
|||
|
|
this.useCache = value;
|
|||
|
|
return this;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/** Sets the enableQualityProcessing field. */
|
|||
|
|
@JsonProperty("enable_quality_processing")
|
|||
|
|
public Builder withEnableQualityProcessing(final @Nullable Boolean value) {
|
|||
|
|
this.enableQualityProcessing = value;
|
|||
|
|
return this;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/** Sets the ocr field. */
|
|||
|
|
@JsonProperty("ocr")
|
|||
|
|
public Builder withOcr(final @Nullable OcrConfig value) {
|
|||
|
|
this.ocr = value;
|
|||
|
|
return this;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/** Sets the forceOcr field. */
|
|||
|
|
@JsonProperty("force_ocr")
|
|||
|
|
public Builder withForceOcr(final @Nullable Boolean value) {
|
|||
|
|
this.forceOcr = value;
|
|||
|
|
return this;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/** Sets the forceOcrPages field. */
|
|||
|
|
@JsonProperty("force_ocr_pages")
|
|||
|
|
public Builder withForceOcrPages(final @Nullable List<Integer> value) {
|
|||
|
|
this.forceOcrPages = value;
|
|||
|
|
return this;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/** Sets the disableOcr field. */
|
|||
|
|
@JsonProperty("disable_ocr")
|
|||
|
|
public Builder withDisableOcr(final @Nullable Boolean value) {
|
|||
|
|
this.disableOcr = value;
|
|||
|
|
return this;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/** Sets the chunking field. */
|
|||
|
|
@JsonProperty("chunking")
|
|||
|
|
public Builder withChunking(final @Nullable ChunkingConfig value) {
|
|||
|
|
this.chunking = value;
|
|||
|
|
return this;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/** Sets the contentFilter field. */
|
|||
|
|
@JsonProperty("content_filter")
|
|||
|
|
public Builder withContentFilter(final @Nullable ContentFilterConfig value) {
|
|||
|
|
this.contentFilter = value;
|
|||
|
|
return this;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/** Sets the images field. */
|
|||
|
|
@JsonProperty("images")
|
|||
|
|
public Builder withImages(final @Nullable ImageExtractionConfig value) {
|
|||
|
|
this.images = value;
|
|||
|
|
return this;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/** Sets the pdfOptions field. */
|
|||
|
|
@JsonProperty("pdf_options")
|
|||
|
|
public Builder withPdfOptions(final @Nullable PdfConfig value) {
|
|||
|
|
this.pdfOptions = value;
|
|||
|
|
return this;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/** Sets the tokenReduction field. */
|
|||
|
|
@JsonProperty("token_reduction")
|
|||
|
|
public Builder withTokenReduction(final @Nullable TokenReductionOptions value) {
|
|||
|
|
this.tokenReduction = value;
|
|||
|
|
return this;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/** Sets the languageDetection field. */
|
|||
|
|
@JsonProperty("language_detection")
|
|||
|
|
public Builder withLanguageDetection(final @Nullable LanguageDetectionConfig value) {
|
|||
|
|
this.languageDetection = value;
|
|||
|
|
return this;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/** Sets the pages field. */
|
|||
|
|
@JsonProperty("pages")
|
|||
|
|
public Builder withPages(final @Nullable PageConfig value) {
|
|||
|
|
this.pages = value;
|
|||
|
|
return this;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/** Sets the keywords field. */
|
|||
|
|
@JsonProperty("keywords")
|
|||
|
|
public Builder withKeywords(final @Nullable KeywordConfig value) {
|
|||
|
|
this.keywords = value;
|
|||
|
|
return this;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/** Sets the postprocessor field. */
|
|||
|
|
@JsonProperty("postprocessor")
|
|||
|
|
public Builder withPostprocessor(final @Nullable PostProcessorConfig value) {
|
|||
|
|
this.postprocessor = value;
|
|||
|
|
return this;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/** Sets the htmlOptions field. */
|
|||
|
|
@JsonProperty("html_options")
|
|||
|
|
public Builder withHtmlOptions(final @Nullable String value) {
|
|||
|
|
this.htmlOptions = value;
|
|||
|
|
return this;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/** Sets the htmlOutput field. */
|
|||
|
|
@JsonProperty("html_output")
|
|||
|
|
public Builder withHtmlOutput(final @Nullable HtmlOutputConfig value) {
|
|||
|
|
this.htmlOutput = value;
|
|||
|
|
return this;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/** Sets the extractionTimeoutSecs field. */
|
|||
|
|
@JsonProperty("extraction_timeout_secs")
|
|||
|
|
public Builder withExtractionTimeoutSecs(final @Nullable Long value) {
|
|||
|
|
this.extractionTimeoutSecs = value;
|
|||
|
|
return this;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/** Sets the maxConcurrentExtractions field. */
|
|||
|
|
@JsonProperty("max_concurrent_extractions")
|
|||
|
|
public Builder withMaxConcurrentExtractions(final @Nullable Long value) {
|
|||
|
|
this.maxConcurrentExtractions = value;
|
|||
|
|
return this;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/** Sets the resultFormat field. */
|
|||
|
|
@JsonProperty("result_format")
|
|||
|
|
public Builder withResultFormat(final @Nullable ResultFormat value) {
|
|||
|
|
this.resultFormat = value;
|
|||
|
|
return this;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/** Sets the securityLimits field. */
|
|||
|
|
@JsonProperty("security_limits")
|
|||
|
|
public Builder withSecurityLimits(final @Nullable SecurityLimits value) {
|
|||
|
|
this.securityLimits = value;
|
|||
|
|
return this;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/** Sets the maxEmbeddedFileBytes field. */
|
|||
|
|
@JsonProperty("max_embedded_file_bytes")
|
|||
|
|
public Builder withMaxEmbeddedFileBytes(final @Nullable Long value) {
|
|||
|
|
this.maxEmbeddedFileBytes = value;
|
|||
|
|
return this;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/** Sets the outputFormat field. */
|
|||
|
|
@JsonProperty("output_format")
|
|||
|
|
public Builder withOutputFormat(final @Nullable OutputFormat value) {
|
|||
|
|
this.outputFormat = value;
|
|||
|
|
return this;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/** Sets the layout field. */
|
|||
|
|
@JsonProperty("layout")
|
|||
|
|
public Builder withLayout(final @Nullable LayoutDetectionConfig value) {
|
|||
|
|
this.layout = value;
|
|||
|
|
return this;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/** Sets the useLayoutForMarkdown field. */
|
|||
|
|
@JsonProperty("use_layout_for_markdown")
|
|||
|
|
public Builder withUseLayoutForMarkdown(final @Nullable Boolean value) {
|
|||
|
|
this.useLayoutForMarkdown = value;
|
|||
|
|
return this;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/** Sets the includeDocumentStructure field. */
|
|||
|
|
@JsonProperty("include_document_structure")
|
|||
|
|
public Builder withIncludeDocumentStructure(final @Nullable Boolean value) {
|
|||
|
|
this.includeDocumentStructure = value;
|
|||
|
|
return this;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/** Sets the acceleration field. */
|
|||
|
|
@JsonProperty("acceleration")
|
|||
|
|
public Builder withAcceleration(final @Nullable AccelerationConfig value) {
|
|||
|
|
this.acceleration = value;
|
|||
|
|
return this;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/** Sets the cacheNamespace field. */
|
|||
|
|
@JsonProperty("cache_namespace")
|
|||
|
|
public Builder withCacheNamespace(final @Nullable String value) {
|
|||
|
|
this.cacheNamespace = value;
|
|||
|
|
return this;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/** Sets the cacheTtlSecs field. */
|
|||
|
|
@JsonProperty("cache_ttl_secs")
|
|||
|
|
public Builder withCacheTtlSecs(final @Nullable Long value) {
|
|||
|
|
this.cacheTtlSecs = value;
|
|||
|
|
return this;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/** Sets the email field. */
|
|||
|
|
@JsonProperty("email")
|
|||
|
|
public Builder withEmail(final @Nullable EmailConfig value) {
|
|||
|
|
this.email = value;
|
|||
|
|
return this;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/** Sets the concurrency field. */
|
|||
|
|
@JsonProperty("concurrency")
|
|||
|
|
public Builder withConcurrency(final @Nullable String value) {
|
|||
|
|
this.concurrency = value;
|
|||
|
|
return this;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/** Sets the maxArchiveDepth field. */
|
|||
|
|
@JsonProperty("max_archive_depth")
|
|||
|
|
public Builder withMaxArchiveDepth(final @Nullable Long value) {
|
|||
|
|
this.maxArchiveDepth = value;
|
|||
|
|
return this;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/** Sets the treeSitter field. */
|
|||
|
|
@JsonProperty("tree_sitter")
|
|||
|
|
public Builder withTreeSitter(final @Nullable TreeSitterConfig value) {
|
|||
|
|
this.treeSitter = value;
|
|||
|
|
return this;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/** Sets the structuredExtraction field. */
|
|||
|
|
@JsonProperty("structured_extraction")
|
|||
|
|
public Builder withStructuredExtraction(final @Nullable StructuredExtractionConfig value) {
|
|||
|
|
this.structuredExtraction = value;
|
|||
|
|
return this;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/** Sets the cancelToken field. */
|
|||
|
|
@JsonProperty("cancel_token")
|
|||
|
|
public Builder withCancelToken(final @Nullable String value) {
|
|||
|
|
this.cancelToken = value;
|
|||
|
|
return this;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/** Builds the ExtractionConfig instance. */
|
|||
|
|
public ExtractionConfig build() {
|
|||
|
|
return new ExtractionConfig(
|
|||
|
|
useCache,
|
|||
|
|
enableQualityProcessing,
|
|||
|
|
ocr,
|
|||
|
|
forceOcr,
|
|||
|
|
forceOcrPages,
|
|||
|
|
disableOcr,
|
|||
|
|
chunking,
|
|||
|
|
contentFilter,
|
|||
|
|
images,
|
|||
|
|
pdfOptions,
|
|||
|
|
tokenReduction,
|
|||
|
|
languageDetection,
|
|||
|
|
pages,
|
|||
|
|
keywords,
|
|||
|
|
postprocessor,
|
|||
|
|
htmlOptions,
|
|||
|
|
htmlOutput,
|
|||
|
|
extractionTimeoutSecs,
|
|||
|
|
maxConcurrentExtractions,
|
|||
|
|
resultFormat,
|
|||
|
|
securityLimits,
|
|||
|
|
maxEmbeddedFileBytes,
|
|||
|
|
outputFormat,
|
|||
|
|
layout,
|
|||
|
|
useLayoutForMarkdown,
|
|||
|
|
includeDocumentStructure,
|
|||
|
|
acceleration,
|
|||
|
|
cacheNamespace,
|
|||
|
|
cacheTtlSecs,
|
|||
|
|
email,
|
|||
|
|
concurrency,
|
|||
|
|
maxArchiveDepth,
|
|||
|
|
treeSitter,
|
|||
|
|
structuredExtraction,
|
|||
|
|
cancelToken
|
|||
|
|
);
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
// CPD-ON
|
|||
|
|
public static ExtractionConfig defaultInstance() {
|
|||
|
|
throw new UnsupportedOperationException("defaultInstance is not yet bridged via JNI; use the Builder instead.");
|
|||
|
|
}
|
|||
|
|
}
|