// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
|
||
|
||
import java.util.List;
|
||
import com.fasterxml.jackson.annotation.JsonProperty;
|
||
import com.fasterxml.jackson.annotation.JsonInclude;
|
||
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
|
||
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
|
||
import org.jspecify.annotations.Nullable;
|
||
|
||
/**
|
||
* Main extraction configuration.
|
||
*
|
||
* This struct contains all configuration options for the extraction process.
|
||
* It can be loaded from TOML, YAML, or JSON files, or created programmatically.
|
||
*/
|
||
@JsonInclude(JsonInclude.Include.NON_ABSENT)
|
||
@JsonDeserialize(builder = ExtractionConfig.Builder.class)
|
||
public record ExtractionConfig(
|
||
/**
|
||
* Enable caching of extraction results
|
||
*/
|
||
@Nullable @JsonProperty("use_cache") Boolean useCache,
|
||
/**
|
||
* Enable quality post-processing
|
||
*/
|
||
@Nullable @JsonProperty("enable_quality_processing") Boolean enableQualityProcessing,
|
||
/**
|
||
* OCR configuration (null = OCR disabled)
|
||
*/
|
||
@Nullable @JsonProperty("ocr") OcrConfig ocr,
|
||
/**
|
||
* Force OCR even for searchable PDFs
|
||
*/
|
||
@Nullable @JsonProperty("force_ocr") Boolean forceOcr,
|
||
/**
|
||
* Force OCR on specific pages only (1-indexed page numbers, must be >= 1).
|
||
*
|
||
* When set, only the listed pages are OCR'd regardless of text layer quality.
|
||
* Unlisted pages use native text extraction. Ignored when {@code force_ocr} is {@code true}.
|
||
* Only applies to PDF documents. Duplicates are automatically deduplicated.
|
||
* An {@code ocr} config is recommended for backend/language selection; defaults are used if absent.
|
||
*/
|
||
@Nullable @JsonProperty("force_ocr_pages") List<Integer> forceOcrPages,
|
||
/**
|
||
* Disable OCR entirely, even for images.
|
||
*
|
||
* When {@code true}, OCR is skipped for all document types. Images return metadata
|
||
* only (dimensions, format, EXIF) without text extraction. PDFs use only
|
||
* native text extraction without OCR fallback.
|
||
*
|
||
* Cannot be {@code true} simultaneously with {@code force_ocr}.
|
||
*
|
||
* *Added in v4.7.0.*
|
||
*/
|
||
@Nullable @JsonProperty("disable_ocr") Boolean disableOcr,
|
||
/**
|
||
* Text chunking configuration (null = chunking disabled)
|
||
*/
|
||
@Nullable @JsonProperty("chunking") ChunkingConfig chunking,
|
||
/**
|
||
* Content filtering configuration (null = use extractor defaults).
|
||
*
|
||
* Controls whether document "furniture" (headers, footers, watermarks,
|
||
* repeating text) is included in or stripped from extraction results.
|
||
* See ContentFilterConfig for per-field documentation.
|
||
*/
|
||
@Nullable @JsonProperty("content_filter") ContentFilterConfig contentFilter,
|
||
/**
|
||
* Image extraction configuration (null = no image extraction)
|
||
*/
|
||
@Nullable @JsonProperty("images") ImageExtractionConfig images,
|
||
/**
|
||
* PDF-specific options (null = use defaults)
|
||
*/
|
||
@Nullable @JsonProperty("pdf_options") PdfConfig pdfOptions,
|
||
/**
|
||
* Token reduction configuration (null = no token reduction)
|
||
*/
|
||
@Nullable @JsonProperty("token_reduction") TokenReductionOptions tokenReduction,
|
||
/**
|
||
* Language detection configuration (null = no language detection)
|
||
*/
|
||
@Nullable @JsonProperty("language_detection") LanguageDetectionConfig languageDetection,
|
||
/**
|
||
* Page extraction configuration (null = no page tracking)
|
||
*/
|
||
@Nullable @JsonProperty("pages") PageConfig pages,
|
||
/**
|
||
* Keyword extraction configuration (null = no keyword extraction)
|
||
*/
|
||
@Nullable @JsonProperty("keywords") KeywordConfig keywords,
|
||
/**
|
||
* Post-processor configuration (null = use defaults)
|
||
*/
|
||
@Nullable @JsonProperty("postprocessor") PostProcessorConfig postprocessor,
|
||
/**
|
||
* HTML to Markdown conversion options (null = use defaults)
|
||
*
|
||
* Configure how HTML documents are converted to Markdown, including heading styles,
|
||
* list formatting, code block styles, and preprocessing options.
|
||
*/
|
||
@Nullable @JsonProperty("html_options") String htmlOptions,
|
||
/**
|
||
* Styled HTML output configuration.
|
||
*
|
||
* When set alongside {@code output_format = OutputFormat.Html}, the extraction
|
||
* pipeline uses StyledHtmlRenderer(crate.rendering.StyledHtmlRenderer)
|
||
* which emits stable {@code kb-*} CSS class hooks on every structural element
|
||
* and optionally embeds theme CSS or user-supplied CSS in a {@code <style>} block.
|
||
*
|
||
* When {@code None}, the existing plain comrak-based HTML renderer is used.
|
||
*/
|
||
@Nullable @JsonProperty("html_output") HtmlOutputConfig htmlOutput,
|
||
/**
|
||
* Default per-file timeout in seconds for batch extraction.
|
||
*
|
||
* When set, each file in a batch will be canceled after this duration
|
||
* unless overridden by FileExtractionConfig.timeout_secs.
|
||
*
|
||
* Defaults to {@code Some(60)} to prevent pathological files (e.g. deeply
|
||
* nested archives, documents with millions of cells) from running
|
||
* indefinitely and exhausting caller resources. Set to {@code None} to
|
||
* disable the timeout for trusted input or long-running workloads.
|
||
*/
|
||
@Nullable @JsonProperty("extraction_timeout_secs") Long extractionTimeoutSecs,
|
||
/**
|
||
* Maximum concurrent extractions in batch operations (null = (num_cpus × 1.5).ceil()).
|
||
*
|
||
* Limits parallelism to prevent resource exhaustion when processing
|
||
* large batches. Defaults to (num_cpus × 1.5).ceil() when not set.
|
||
*/
|
||
@Nullable @JsonProperty("max_concurrent_extractions") Long maxConcurrentExtractions,
|
||
/**
|
||
* Result structure format
|
||
*
|
||
* Controls whether results are returned in unified format (default) with all
|
||
* content in the {@code content} field, or element-based format with semantic
|
||
* elements (for Unstructured-compatible output).
|
||
*/
|
||
@Nullable @JsonProperty("result_format") ResultFormat resultFormat,
|
||
/**
|
||
* Security limits for archive extraction.
|
||
*
|
||
* Controls maximum archive size, compression ratio, file count, and other
|
||
* security thresholds to prevent decompression bomb attacks. Also caps
|
||
* nesting depth, iteration count, entity / token length, total
|
||
* content size, and table cell count for every extraction path that
|
||
* ingests user-controlled bytes.
|
||
* When {@code None}, default limits are used.
|
||
*/
|
||
@Nullable @JsonProperty("security_limits") SecurityLimits securityLimits,
|
||
/**
|
||
* Maximum uncompressed size in bytes for a single embedded file before
|
||
* recursive extraction is attempted (default: 50 MiB).
|
||
*
|
||
* Applies to embedded objects inside OOXML containers (DOCX, PPTX) and
|
||
* to email attachments processed via recursive extraction. Files that
|
||
* exceed this limit are skipped with a {@code ProcessingWarning} rather than
|
||
* passed to the extraction pipeline, preventing a single oversized
|
||
* embedded object from consuming unbounded memory or time.
|
||
*
|
||
* Set to {@code None} to disable the per-embedded-file cap (falls back to
|
||
* {@code security_limits.max_archive_size} as the only guard).
|
||
*/
|
||
@Nullable @JsonProperty("max_embedded_file_bytes") Long maxEmbeddedFileBytes,
|
||
/**
|
||
* Content text format (default: Plain).
|
||
*
|
||
* Controls the format of the extracted content:
|
||
* - {@code Plain}: Raw extracted text (default)
|
||
* - {@code Markdown}: Markdown formatted output
|
||
* - {@code Djot}: Djot markup format (requires djot feature)
|
||
* - {@code Html}: HTML formatted output
|
||
*
|
||
* When set to a structured format, extraction results will include
|
||
* formatted output. The {@code formatted_content} field may be populated
|
||
* when format conversion is applied.
|
||
*/
|
||
@Nullable @JsonProperty("output_format") OutputFormat outputFormat,
|
||
/**
|
||
* Layout detection configuration (null = layout detection disabled).
|
||
*
|
||
* When set, PDF pages and images are analyzed for document structure
|
||
* (headings, code, formulas, tables, figures, etc.) using RT-DETR models
|
||
* via ONNX Runtime. For PDFs, layout hints override paragraph classification
|
||
* in the markdown pipeline. For images, per-region OCR is performed with
|
||
* markdown formatting based on detected layout classes.
|
||
* Requires the {@code layout-detection} feature to run inference; the field is
|
||
* present whenever the {@code layout-types} feature is active (which includes
|
||
* {@code layout-detection} as well as the no-ORT target groups).
|
||
*/
|
||
@Nullable @JsonProperty("layout") LayoutDetectionConfig layout,
|
||
/**
|
||
* Run layout detection on the non-OCR PDF markdown path.
|
||
*
|
||
* When {@code true} and {@code layout} is {@code Some(_)}, layout regions inform heading,
|
||
* table, list, and figure detection in the structure pipeline that would
|
||
* otherwise rely on font-clustering heuristics alone. Significantly
|
||
* improves SF1 (structural F1) at the cost of inference latency
|
||
* (~150-300ms/page CPU, ~20-50ms/page GPU). Default: {@code false}.
|
||
* Requires the {@code layout-detection} feature.
|
||
*/
|
||
@Nullable @JsonProperty("use_layout_for_markdown") Boolean useLayoutForMarkdown,
|
||
/**
|
||
* Enable structured document tree output.
|
||
*
|
||
* When true, populates the {@code document} field on {@code ExtractionResult} with a
|
||
* hierarchical {@code DocumentStructure} containing heading-driven section nesting,
|
||
* table grids, content layer classification, and inline annotations.
|
||
*
|
||
* Independent of {@code result_format} — can be combined with Unified or ElementBased.
|
||
*/
|
||
@Nullable @JsonProperty("include_document_structure") Boolean includeDocumentStructure,
|
||
/**
|
||
* Hardware acceleration configuration for ONNX Runtime models.
|
||
*
|
||
* Controls execution provider selection for layout detection and embedding
|
||
* models. When {@code None}, uses platform defaults (CoreML on macOS, CUDA on
|
||
* Linux, CPU on Windows).
|
||
*/
|
||
@Nullable @JsonProperty("acceleration") AccelerationConfig acceleration,
|
||
/**
|
||
* Cache namespace for tenant isolation.
|
||
*
|
||
* When set, cache entries are stored under {@code {cache_dir}/{namespace}/}.
|
||
* Must be alphanumeric, hyphens, or underscores only (max 64 chars).
|
||
* Different namespaces have isolated cache spaces on the same filesystem.
|
||
*/
|
||
@Nullable @JsonProperty("cache_namespace") String cacheNamespace,
|
||
/**
|
||
* Per-request cache TTL in seconds.
|
||
*
|
||
* Overrides the global {@code max_age_days} for this specific extraction.
|
||
* When {@code 0}, caching is completely skipped (no read or write).
|
||
* When {@code None}, the global TTL applies.
|
||
*/
|
||
@Nullable @JsonProperty("cache_ttl_secs") Long cacheTtlSecs,
|
||
/**
|
||
* Email extraction configuration (null = use defaults).
|
||
*
|
||
* Currently supports configuring the fallback codepage for MSG files
|
||
* that do not specify one. See {@code EmailConfig} for details.
|
||
*/
|
||
@Nullable @JsonProperty("email") EmailConfig email,
|
||
/**
|
||
* Concurrency limits for constrained environments (null = use defaults).
|
||
*
|
||
* Controls Rayon thread pool size, ONNX Runtime intra-op threads, and
|
||
* (when {@code max_concurrent_extractions} is unset) the batch concurrency
|
||
* semaphore. See {@code ConcurrencyConfig} for details.
|
||
*/
|
||
@Nullable @JsonProperty("concurrency") String concurrency,
|
||
/**
|
||
* Maximum recursion depth for archive extraction (default: 3).
|
||
* Set to 0 to disable recursive extraction (legacy behavior).
|
||
*/
|
||
@Nullable @JsonProperty("max_archive_depth") Long maxArchiveDepth,
|
||
/**
|
||
* Tree-sitter language pack configuration (null = tree-sitter disabled).
|
||
*
|
||
* When set, enables code file extraction using tree-sitter parsers.
|
||
* Controls grammar download behavior and code analysis options.
|
||
*/
|
||
@Nullable @JsonProperty("tree_sitter") TreeSitterConfig treeSitter,
|
||
/**
|
||
* Structured extraction via LLM (null = disabled).
|
||
*
|
||
* When set, the extracted document content is sent to an LLM with the
|
||
* provided JSON schema. The structured response is stored in
|
||
* {@code ExtractionResult.structured_output}.
|
||
*/
|
||
@Nullable @JsonProperty("structured_extraction") StructuredExtractionConfig structuredExtraction,
|
||
/**
|
||
* Cancellation token for this extraction (null = no external cancellation).
|
||
*
|
||
* Pass a CancellationToken clone here and call CancellationToken.cancel
|
||
* from another thread / task to abort the extraction in progress. The extractor
|
||
* checks the token at safe checkpoints (before lock acquisition, between pages,
|
||
* between batch items) and returns KreuzbergError.Cancelled when set.
|
||
*
|
||
* The field is excluded from serialization because {@code CancellationToken} is a
|
||
* runtime handle, not a configuration value.
|
||
*/
|
||
@Nullable @JsonProperty("cancel_token") String cancelToken
|
||
) {
|
||
public static Builder builder() {
|
||
return new Builder();
|
||
}
|
||
|
||
// CPD-OFF
|
||
@JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
|
||
public static final class Builder {
|
||
|
||
@JsonProperty("use_cache")
|
||
private Boolean useCache = null;
|
||
@JsonProperty("enable_quality_processing")
|
||
private Boolean enableQualityProcessing = null;
|
||
@Nullable private OcrConfig ocr = null;
|
||
@JsonProperty("force_ocr")
|
||
private Boolean forceOcr = null;
|
||
@JsonProperty("force_ocr_pages")
|
||
private List<Integer> forceOcrPages = null;
|
||
@JsonProperty("disable_ocr")
|
||
private Boolean disableOcr = null;
|
||
@Nullable private ChunkingConfig chunking = null;
|
||
@JsonProperty("content_filter")
|
||
@Nullable private ContentFilterConfig contentFilter = null;
|
||
@Nullable private ImageExtractionConfig images = null;
|
||
@JsonProperty("pdf_options")
|
||
@Nullable private PdfConfig pdfOptions = null;
|
||
@JsonProperty("token_reduction")
|
||
@Nullable private TokenReductionOptions tokenReduction = null;
|
||
@JsonProperty("language_detection")
|
||
@Nullable private LanguageDetectionConfig languageDetection = null;
|
||
@Nullable private PageConfig pages = null;
|
||
@Nullable private KeywordConfig keywords = null;
|
||
@Nullable private PostProcessorConfig postprocessor = null;
|
||
@JsonProperty("html_options")
|
||
private String htmlOptions = null;
|
||
@JsonProperty("html_output")
|
||
@Nullable private HtmlOutputConfig htmlOutput = null;
|
||
@JsonProperty("extraction_timeout_secs")
|
||
private Long extractionTimeoutSecs = null;
|
||
@JsonProperty("max_concurrent_extractions")
|
||
private Long maxConcurrentExtractions = null;
|
||
@JsonProperty("result_format")
|
||
@Nullable private ResultFormat resultFormat = ResultFormat.Unified;
|
||
@JsonProperty("security_limits")
|
||
@Nullable private SecurityLimits securityLimits = null;
|
||
@JsonProperty("max_embedded_file_bytes")
|
||
private Long maxEmbeddedFileBytes = null;
|
||
@JsonProperty("output_format")
|
||
@Nullable private OutputFormat outputFormat = OutputFormat.Plain;
|
||
@Nullable private LayoutDetectionConfig layout = null;
|
||
@JsonProperty("use_layout_for_markdown")
|
||
private Boolean useLayoutForMarkdown = null;
|
||
@JsonProperty("include_document_structure")
|
||
private Boolean includeDocumentStructure = null;
|
||
@Nullable private AccelerationConfig acceleration = null;
|
||
@JsonProperty("cache_namespace")
|
||
private String cacheNamespace = null;
|
||
@JsonProperty("cache_ttl_secs")
|
||
private Long cacheTtlSecs = null;
|
||
@Nullable private EmailConfig email = null;
|
||
private String concurrency = null;
|
||
@JsonProperty("max_archive_depth")
|
||
private Long maxArchiveDepth = null;
|
||
@JsonProperty("tree_sitter")
|
||
@Nullable private TreeSitterConfig treeSitter = null;
|
||
@JsonProperty("structured_extraction")
|
||
@Nullable private StructuredExtractionConfig structuredExtraction = null;
|
||
@JsonProperty("cancel_token")
|
||
private String cancelToken = null;
|
||
|
||
/** Sets the useCache field. */
|
||
@JsonProperty("use_cache")
|
||
public Builder withUseCache(final @Nullable Boolean value) {
|
||
this.useCache = value;
|
||
return this;
|
||
}
|
||
|
||
/** Sets the enableQualityProcessing field. */
|
||
@JsonProperty("enable_quality_processing")
|
||
public Builder withEnableQualityProcessing(final @Nullable Boolean value) {
|
||
this.enableQualityProcessing = value;
|
||
return this;
|
||
}
|
||
|
||
/** Sets the ocr field. */
|
||
@JsonProperty("ocr")
|
||
public Builder withOcr(final @Nullable OcrConfig value) {
|
||
this.ocr = value;
|
||
return this;
|
||
}
|
||
|
||
/** Sets the forceOcr field. */
|
||
@JsonProperty("force_ocr")
|
||
public Builder withForceOcr(final @Nullable Boolean value) {
|
||
this.forceOcr = value;
|
||
return this;
|
||
}
|
||
|
||
/** Sets the forceOcrPages field. */
|
||
@JsonProperty("force_ocr_pages")
|
||
public Builder withForceOcrPages(final @Nullable List<Integer> value) {
|
||
this.forceOcrPages = value;
|
||
return this;
|
||
}
|
||
|
||
/** Sets the disableOcr field. */
|
||
@JsonProperty("disable_ocr")
|
||
public Builder withDisableOcr(final @Nullable Boolean value) {
|
||
this.disableOcr = value;
|
||
return this;
|
||
}
|
||
|
||
/** Sets the chunking field. */
|
||
@JsonProperty("chunking")
|
||
public Builder withChunking(final @Nullable ChunkingConfig value) {
|
||
this.chunking = value;
|
||
return this;
|
||
}
|
||
|
||
/** Sets the contentFilter field. */
|
||
@JsonProperty("content_filter")
|
||
public Builder withContentFilter(final @Nullable ContentFilterConfig value) {
|
||
this.contentFilter = value;
|
||
return this;
|
||
}
|
||
|
||
/** Sets the images field. */
|
||
@JsonProperty("images")
|
||
public Builder withImages(final @Nullable ImageExtractionConfig value) {
|
||
this.images = value;
|
||
return this;
|
||
}
|
||
|
||
/** Sets the pdfOptions field. */
|
||
@JsonProperty("pdf_options")
|
||
public Builder withPdfOptions(final @Nullable PdfConfig value) {
|
||
this.pdfOptions = value;
|
||
return this;
|
||
}
|
||
|
||
/** Sets the tokenReduction field. */
|
||
@JsonProperty("token_reduction")
|
||
public Builder withTokenReduction(final @Nullable TokenReductionOptions value) {
|
||
this.tokenReduction = value;
|
||
return this;
|
||
}
|
||
|
||
/** Sets the languageDetection field. */
|
||
@JsonProperty("language_detection")
|
||
public Builder withLanguageDetection(final @Nullable LanguageDetectionConfig value) {
|
||
this.languageDetection = value;
|
||
return this;
|
||
}
|
||
|
||
/** Sets the pages field. */
|
||
@JsonProperty("pages")
|
||
public Builder withPages(final @Nullable PageConfig value) {
|
||
this.pages = value;
|
||
return this;
|
||
}
|
||
|
||
/** Sets the keywords field. */
|
||
@JsonProperty("keywords")
|
||
public Builder withKeywords(final @Nullable KeywordConfig value) {
|
||
this.keywords = value;
|
||
return this;
|
||
}
|
||
|
||
/** Sets the postprocessor field. */
|
||
@JsonProperty("postprocessor")
|
||
public Builder withPostprocessor(final @Nullable PostProcessorConfig value) {
|
||
this.postprocessor = value;
|
||
return this;
|
||
}
|
||
|
||
/** Sets the htmlOptions field. */
|
||
@JsonProperty("html_options")
|
||
public Builder withHtmlOptions(final @Nullable String value) {
|
||
this.htmlOptions = value;
|
||
return this;
|
||
}
|
||
|
||
/** Sets the htmlOutput field. */
|
||
@JsonProperty("html_output")
|
||
public Builder withHtmlOutput(final @Nullable HtmlOutputConfig value) {
|
||
this.htmlOutput = value;
|
||
return this;
|
||
}
|
||
|
||
/** Sets the extractionTimeoutSecs field. */
|
||
@JsonProperty("extraction_timeout_secs")
|
||
public Builder withExtractionTimeoutSecs(final @Nullable Long value) {
|
||
this.extractionTimeoutSecs = value;
|
||
return this;
|
||
}
|
||
|
||
/** Sets the maxConcurrentExtractions field. */
|
||
@JsonProperty("max_concurrent_extractions")
|
||
public Builder withMaxConcurrentExtractions(final @Nullable Long value) {
|
||
this.maxConcurrentExtractions = value;
|
||
return this;
|
||
}
|
||
|
||
/** Sets the resultFormat field. */
|
||
@JsonProperty("result_format")
|
||
public Builder withResultFormat(final @Nullable ResultFormat value) {
|
||
this.resultFormat = value;
|
||
return this;
|
||
}
|
||
|
||
/** Sets the securityLimits field. */
|
||
@JsonProperty("security_limits")
|
||
public Builder withSecurityLimits(final @Nullable SecurityLimits value) {
|
||
this.securityLimits = value;
|
||
return this;
|
||
}
|
||
|
||
/** Sets the maxEmbeddedFileBytes field. */
|
||
@JsonProperty("max_embedded_file_bytes")
|
||
public Builder withMaxEmbeddedFileBytes(final @Nullable Long value) {
|
||
this.maxEmbeddedFileBytes = value;
|
||
return this;
|
||
}
|
||
|
||
/** Sets the outputFormat field. */
|
||
@JsonProperty("output_format")
|
||
public Builder withOutputFormat(final @Nullable OutputFormat value) {
|
||
this.outputFormat = value;
|
||
return this;
|
||
}
|
||
|
||
/** Sets the layout field. */
|
||
@JsonProperty("layout")
|
||
public Builder withLayout(final @Nullable LayoutDetectionConfig value) {
|
||
this.layout = value;
|
||
return this;
|
||
}
|
||
|
||
/** Sets the useLayoutForMarkdown field. */
|
||
@JsonProperty("use_layout_for_markdown")
|
||
public Builder withUseLayoutForMarkdown(final @Nullable Boolean value) {
|
||
this.useLayoutForMarkdown = value;
|
||
return this;
|
||
}
|
||
|
||
/** Sets the includeDocumentStructure field. */
|
||
@JsonProperty("include_document_structure")
|
||
public Builder withIncludeDocumentStructure(final @Nullable Boolean value) {
|
||
this.includeDocumentStructure = value;
|
||
return this;
|
||
}
|
||
|
||
/** Sets the acceleration field. */
|
||
@JsonProperty("acceleration")
|
||
public Builder withAcceleration(final @Nullable AccelerationConfig value) {
|
||
this.acceleration = value;
|
||
return this;
|
||
}
|
||
|
||
/** Sets the cacheNamespace field. */
|
||
@JsonProperty("cache_namespace")
|
||
public Builder withCacheNamespace(final @Nullable String value) {
|
||
this.cacheNamespace = value;
|
||
return this;
|
||
}
|
||
|
||
/** Sets the cacheTtlSecs field. */
|
||
@JsonProperty("cache_ttl_secs")
|
||
public Builder withCacheTtlSecs(final @Nullable Long value) {
|
||
this.cacheTtlSecs = value;
|
||
return this;
|
||
}
|
||
|
||
/** Sets the email field. */
|
||
@JsonProperty("email")
|
||
public Builder withEmail(final @Nullable EmailConfig value) {
|
||
this.email = value;
|
||
return this;
|
||
}
|
||
|
||
/** Sets the concurrency field. */
|
||
@JsonProperty("concurrency")
|
||
public Builder withConcurrency(final @Nullable String value) {
|
||
this.concurrency = value;
|
||
return this;
|
||
}
|
||
|
||
/** Sets the maxArchiveDepth field. */
|
||
@JsonProperty("max_archive_depth")
|
||
public Builder withMaxArchiveDepth(final @Nullable Long value) {
|
||
this.maxArchiveDepth = value;
|
||
return this;
|
||
}
|
||
|
||
/** Sets the treeSitter field. */
|
||
@JsonProperty("tree_sitter")
|
||
public Builder withTreeSitter(final @Nullable TreeSitterConfig value) {
|
||
this.treeSitter = value;
|
||
return this;
|
||
}
|
||
|
||
/** Sets the structuredExtraction field. */
|
||
@JsonProperty("structured_extraction")
|
||
public Builder withStructuredExtraction(final @Nullable StructuredExtractionConfig value) {
|
||
this.structuredExtraction = value;
|
||
return this;
|
||
}
|
||
|
||
/** Sets the cancelToken field. */
|
||
@JsonProperty("cancel_token")
|
||
public Builder withCancelToken(final @Nullable String value) {
|
||
this.cancelToken = value;
|
||
return this;
|
||
}
|
||
|
||
/** Builds the ExtractionConfig instance. */
|
||
public ExtractionConfig build() {
|
||
return new ExtractionConfig(
|
||
useCache,
|
||
enableQualityProcessing,
|
||
ocr,
|
||
forceOcr,
|
||
forceOcrPages,
|
||
disableOcr,
|
||
chunking,
|
||
contentFilter,
|
||
images,
|
||
pdfOptions,
|
||
tokenReduction,
|
||
languageDetection,
|
||
pages,
|
||
keywords,
|
||
postprocessor,
|
||
htmlOptions,
|
||
htmlOutput,
|
||
extractionTimeoutSecs,
|
||
maxConcurrentExtractions,
|
||
resultFormat,
|
||
securityLimits,
|
||
maxEmbeddedFileBytes,
|
||
outputFormat,
|
||
layout,
|
||
useLayoutForMarkdown,
|
||
includeDocumentStructure,
|
||
acceleration,
|
||
cacheNamespace,
|
||
cacheTtlSecs,
|
||
email,
|
||
concurrency,
|
||
maxArchiveDepth,
|
||
treeSitter,
|
||
structuredExtraction,
|
||
cancelToken
|
||
);
|
||
}
|
||
}
|
||
// CPD-ON
|
||
public static ExtractionConfig defaultInstance() {
|
||
throw new UnsupportedOperationException("defaultInstance is not yet bridged via JNI; use the Builder instead.");
|
||
}
|
||
}
|