// This file is auto-generated by alef — DO NOT EDIT. // alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75 // To regenerate: alef generate // To verify freshness: alef verify --exit-code // Issues & docs: https://github.com/kreuzberg-dev/alef package dev.kreuzberg; import com.fasterxml.jackson.annotation.JsonProperty; import com.fasterxml.jackson.annotation.JsonInclude; import com.fasterxml.jackson.databind.annotation.JsonDeserialize; import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder; import org.jspecify.annotations.Nullable; /** * Tesseract OCR configuration. * * Provides fine-grained control over Tesseract OCR engine parameters. * Most users can use the defaults, but these settings allow optimization * for specific document types (invoices, handwriting, etc.). */ @JsonInclude(JsonInclude.Include.NON_ABSENT) @JsonDeserialize(builder = TesseractConfig.Builder.class) public record TesseractConfig( /** * Language code (e.g., "eng", "deu", "fra") */ @JsonProperty("language") String language, /** * Page Segmentation Mode (0-13). * * Common values: * - 3: Fully automatic page segmentation (native default) * - 6: Assume a single uniform block of text (WASM default — avoids layout-analysis hang) * - 11: Sparse text with no particular order */ @JsonProperty("psm") int psm, /** * Output format ("text" or "markdown") */ @JsonProperty("output_format") String outputFormat, /** * OCR Engine Mode (0-3). * * - 0: Legacy engine only * - 1: Neural nets (LSTM) only (usually best) * - 2: Legacy + LSTM * - 3: Default (based on what's available) */ @JsonProperty("oem") int oem, /** * Minimum confidence threshold (0.0-100.0). * * Words with confidence below this threshold may be rejected or flagged. */ @JsonProperty("min_confidence") double minConfidence, /** * Image preprocessing configuration. * * Controls how images are preprocessed before OCR. Can significantly * improve quality for scanned documents or low-quality images. */ @Nullable @JsonProperty("preprocessing") ImagePreprocessingConfig preprocessing, /** * Enable automatic table detection and reconstruction */ @JsonProperty("enable_table_detection") boolean enableTableDetection, /** * Minimum confidence threshold for table detection (0.0-1.0) */ @JsonProperty("table_min_confidence") double tableMinConfidence, /** * Column threshold for table detection (pixels) */ @JsonProperty("table_column_threshold") int tableColumnThreshold, /** * Row threshold ratio for table detection (0.0-1.0) */ @JsonProperty("table_row_threshold_ratio") double tableRowThresholdRatio, /** * Enable OCR result caching */ @JsonProperty("use_cache") boolean useCache, /** * Use pre-adapted templates for character classification */ @JsonProperty("classify_use_pre_adapted_templates") boolean classifyUsePreAdaptedTemplates, /** * Enable N-gram language model */ @JsonProperty("language_model_ngram_on") boolean languageModelNgramOn, /** * Don't reject good words during block-level processing */ @JsonProperty("tessedit_dont_blkrej_good_wds") boolean tesseditDontBlkrejGoodWds, /** * Don't reject good words during row-level processing */ @JsonProperty("tessedit_dont_rowrej_good_wds") boolean tesseditDontRowrejGoodWds, /** * Enable dictionary correction */ @JsonProperty("tessedit_enable_dict_correction") boolean tesseditEnableDictCorrection, /** * Whitelist of allowed characters (empty = all allowed) */ @JsonProperty("tessedit_char_whitelist") String tesseditCharWhitelist, /** * Blacklist of forbidden characters (empty = none forbidden) */ @JsonProperty("tessedit_char_blacklist") String tesseditCharBlacklist, /** * Use primary language params model */ @JsonProperty("tessedit_use_primary_params_model") boolean tesseditUsePrimaryParamsModel, /** * Variable-width space detection */ @JsonProperty("textord_space_size_is_variable") boolean textordSpaceSizeIsVariable, /** * Use adaptive thresholding method */ @JsonProperty("thresholding_method") boolean thresholdingMethod ) { public static Builder builder() { return new Builder(); } public TesseractConfig{ if (psm == 0) psm = 3; if (oem == 0) oem = 3; if (tableColumnThreshold == 0) tableColumnThreshold = 50; } // CPD-OFF @JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build") public static final class Builder { private String language = "eng"; private int psm = 0; @JsonProperty("output_format") private String outputFormat = "markdown"; private int oem = 0; @JsonProperty("min_confidence") private double minConfidence = 0.0; private ImagePreprocessingConfig preprocessing = null; @JsonProperty("enable_table_detection") private boolean enableTableDetection = true; @JsonProperty("table_min_confidence") private double tableMinConfidence = 0.0; @JsonProperty("table_column_threshold") private int tableColumnThreshold = 0; @JsonProperty("table_row_threshold_ratio") private double tableRowThresholdRatio = 0.0; @JsonProperty("use_cache") private boolean useCache = true; @JsonProperty("classify_use_pre_adapted_templates") private boolean classifyUsePreAdaptedTemplates = true; @JsonProperty("language_model_ngram_on") private boolean languageModelNgramOn = false; @JsonProperty("tessedit_dont_blkrej_good_wds") private boolean tesseditDontBlkrejGoodWds = true; @JsonProperty("tessedit_dont_rowrej_good_wds") private boolean tesseditDontRowrejGoodWds = true; @JsonProperty("tessedit_enable_dict_correction") private boolean tesseditEnableDictCorrection = true; @JsonProperty("tessedit_char_whitelist") private String tesseditCharWhitelist = ""; @JsonProperty("tessedit_char_blacklist") private String tesseditCharBlacklist = ""; @JsonProperty("tessedit_use_primary_params_model") private boolean tesseditUsePrimaryParamsModel = true; @JsonProperty("textord_space_size_is_variable") private boolean textordSpaceSizeIsVariable = true; @JsonProperty("thresholding_method") private boolean thresholdingMethod = false; /** Sets the language field. */ @JsonProperty("language") public Builder withLanguage(final String value) { this.language = value; return this; } /** Sets the psm field. */ @JsonProperty("psm") public Builder withPsm(final int value) { this.psm = value; return this; } /** Sets the outputFormat field. */ @JsonProperty("output_format") public Builder withOutputFormat(final String value) { this.outputFormat = value; return this; } /** Sets the oem field. */ @JsonProperty("oem") public Builder withOem(final int value) { this.oem = value; return this; } /** Sets the minConfidence field. */ @JsonProperty("min_confidence") public Builder withMinConfidence(final double value) { this.minConfidence = value; return this; } /** Sets the preprocessing field. */ @JsonProperty("preprocessing") public Builder withPreprocessing(final @Nullable ImagePreprocessingConfig value) { this.preprocessing = value; return this; } /** Sets the enableTableDetection field. */ @JsonProperty("enable_table_detection") public Builder withEnableTableDetection(final boolean value) { this.enableTableDetection = value; return this; } /** Sets the tableMinConfidence field. */ @JsonProperty("table_min_confidence") public Builder withTableMinConfidence(final double value) { this.tableMinConfidence = value; return this; } /** Sets the tableColumnThreshold field. */ @JsonProperty("table_column_threshold") public Builder withTableColumnThreshold(final int value) { this.tableColumnThreshold = value; return this; } /** Sets the tableRowThresholdRatio field. */ @JsonProperty("table_row_threshold_ratio") public Builder withTableRowThresholdRatio(final double value) { this.tableRowThresholdRatio = value; return this; } /** Sets the useCache field. */ @JsonProperty("use_cache") public Builder withUseCache(final boolean value) { this.useCache = value; return this; } /** Sets the classifyUsePreAdaptedTemplates field. */ @JsonProperty("classify_use_pre_adapted_templates") public Builder withClassifyUsePreAdaptedTemplates(final boolean value) { this.classifyUsePreAdaptedTemplates = value; return this; } /** Sets the languageModelNgramOn field. */ @JsonProperty("language_model_ngram_on") public Builder withLanguageModelNgramOn(final boolean value) { this.languageModelNgramOn = value; return this; } /** Sets the tesseditDontBlkrejGoodWds field. */ @JsonProperty("tessedit_dont_blkrej_good_wds") public Builder withTesseditDontBlkrejGoodWds(final boolean value) { this.tesseditDontBlkrejGoodWds = value; return this; } /** Sets the tesseditDontRowrejGoodWds field. */ @JsonProperty("tessedit_dont_rowrej_good_wds") public Builder withTesseditDontRowrejGoodWds(final boolean value) { this.tesseditDontRowrejGoodWds = value; return this; } /** Sets the tesseditEnableDictCorrection field. */ @JsonProperty("tessedit_enable_dict_correction") public Builder withTesseditEnableDictCorrection(final boolean value) { this.tesseditEnableDictCorrection = value; return this; } /** Sets the tesseditCharWhitelist field. */ @JsonProperty("tessedit_char_whitelist") public Builder withTesseditCharWhitelist(final String value) { this.tesseditCharWhitelist = value; return this; } /** Sets the tesseditCharBlacklist field. */ @JsonProperty("tessedit_char_blacklist") public Builder withTesseditCharBlacklist(final String value) { this.tesseditCharBlacklist = value; return this; } /** Sets the tesseditUsePrimaryParamsModel field. */ @JsonProperty("tessedit_use_primary_params_model") public Builder withTesseditUsePrimaryParamsModel(final boolean value) { this.tesseditUsePrimaryParamsModel = value; return this; } /** Sets the textordSpaceSizeIsVariable field. */ @JsonProperty("textord_space_size_is_variable") public Builder withTextordSpaceSizeIsVariable(final boolean value) { this.textordSpaceSizeIsVariable = value; return this; } /** Sets the thresholdingMethod field. */ @JsonProperty("thresholding_method") public Builder withThresholdingMethod(final boolean value) { this.thresholdingMethod = value; return this; } /** Builds the TesseractConfig instance. */ public TesseractConfig build() { return new TesseractConfig( language, psm, outputFormat, oem, minConfidence, preprocessing, enableTableDetection, tableMinConfidence, tableColumnThreshold, tableRowThresholdRatio, useCache, classifyUsePreAdaptedTemplates, languageModelNgramOn, tesseditDontBlkrejGoodWds, tesseditDontRowrejGoodWds, tesseditEnableDictCorrection, tesseditCharWhitelist, tesseditCharBlacklist, tesseditUsePrimaryParamsModel, textordSpaceSizeIsVariable, thresholdingMethod ); } } // CPD-ON public static TesseractConfig defaultInstance() { throw new UnsupportedOperationException("defaultInstance is not yet bridged via JNI; use the Builder instead."); } }