Files
fil/packages/java/dev/kreuzberg/TesseractConfig.java

355 lines
13 KiB
Java
Raw Permalink Normal View History

2026-06-01 23:40:55 +02:00
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
import org.jspecify.annotations.Nullable;
/**
* Tesseract OCR configuration.
*
* Provides fine-grained control over Tesseract OCR engine parameters.
* Most users can use the defaults, but these settings allow optimization
* for specific document types (invoices, handwriting, etc.).
*/
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = TesseractConfig.Builder.class)
public record TesseractConfig(
/**
* Language code (e.g., "eng", "deu", "fra")
*/
@JsonProperty("language") String language,
/**
* Page Segmentation Mode (0-13).
*
* Common values:
* - 3: Fully automatic page segmentation (native default)
* - 6: Assume a single uniform block of text (WASM default; avoids a layout-analysis hang)
* - 11: Sparse text with no particular order
*
* Note: this record's constructor treats 0 as "unset" and replaces it with 3.
*/
@JsonProperty("psm") int psm,
/**
* Output format ("text" or "markdown")
*/
@JsonProperty("output_format") String outputFormat,
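/**
* OCR Engine Mode (0-3).
*
* - 0: Legacy engine only
* - 1: Neural nets (LSTM) only (usually best)
* - 2: Legacy + LSTM
* - 3: Default (based on what's available)
*
* Note: this record's constructor treats 0 as "unset" and replaces it with 3,
* so mode 0 cannot be selected through this type.
*/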
@JsonProperty("oem") int oem,
/**
* Minimum confidence threshold (0.0-100.0).
*
* Words with confidence below this threshold may be rejected or flagged.
*/
@JsonProperty("min_confidence") double minConfidence,
/**
* Image preprocessing configuration.
*
* Controls how images are preprocessed before OCR. Can significantly
* improve quality for scanned documents or low-quality images.
*/
@Nullable @JsonProperty("preprocessing") ImagePreprocessingConfig preprocessing,
/**
* Enable automatic table detection and reconstruction
*/
@JsonProperty("enable_table_detection") boolean enableTableDetection,
/**
* Minimum confidence threshold for table detection (0.0-1.0)
*/
@JsonProperty("table_min_confidence") double tableMinConfidence,
/**
* Column threshold for table detection (pixels)
*/
@JsonProperty("table_column_threshold") int tableColumnThreshold,
/**
* Row threshold ratio for table detection (0.0-1.0)
*/
@JsonProperty("table_row_threshold_ratio") double tableRowThresholdRatio,
/**
* Enable OCR result caching
*/
@JsonProperty("use_cache") boolean useCache,
/**
* Use pre-adapted templates for character classification
*/
@JsonProperty("classify_use_pre_adapted_templates") boolean classifyUsePreAdaptedTemplates,
/**
* Enable N-gram language model
*/
@JsonProperty("language_model_ngram_on") boolean languageModelNgramOn,
/**
* Don't reject good words during block-level processing
*/
@JsonProperty("tessedit_dont_blkrej_good_wds") boolean tesseditDontBlkrejGoodWds,
/**
* Don't reject good words during row-level processing
*/
@JsonProperty("tessedit_dont_rowrej_good_wds") boolean tesseditDontRowrejGoodWds,
/**
* Enable dictionary correction
*/
@JsonProperty("tessedit_enable_dict_correction") boolean tesseditEnableDictCorrection,
/**
* Whitelist of allowed characters (empty = all allowed)
*/
@JsonProperty("tessedit_char_whitelist") String tesseditCharWhitelist,
/**
* Blacklist of forbidden characters (empty = none forbidden)
*/
@JsonProperty("tessedit_char_blacklist") String tesseditCharBlacklist,
/**
* Use primary language params model
*/
@JsonProperty("tessedit_use_primary_params_model") boolean tesseditUsePrimaryParamsModel,
/**
* Variable-width space detection
*/
@JsonProperty("textord_space_size_is_variable") boolean textordSpaceSizeIsVariable,
/**
* Use adaptive thresholding method
*/
@JsonProperty("thresholding_method") boolean thresholdingMethod
) {
/**
 * Creates a fresh {@link Builder} for assembling a {@code TesseractConfig}.
 *
 * @return a new builder instance, independent of any previously returned one
 */
public static Builder builder() {
    final Builder fresh = new Builder();
    return fresh;
}
/**
 * Compact canonical constructor.
 *
 * <p>A value of {@code 0} for {@code psm}, {@code oem} or {@code tableColumnThreshold}
 * is treated as "not set" and replaced with {@code 3}, {@code 3} and {@code 50}
 * respectively. Every other value, and every other component, is stored unchanged.
 */
public TesseractConfig {
    psm = (psm == 0) ? 3 : psm;
    oem = (oem == 0) ? 3 : oem;
    tableColumnThreshold = (tableColumnThreshold == 0) ? 50 : tableColumnThreshold;
}
// CPD-OFF
/**
 * Fluent, mutable builder for {@link TesseractConfig}.
 *
 * <p>Annotated for Jackson as a POJO builder whose setters use the {@code with} prefix
 * and whose terminal method is {@link #build()}. Each field starts at the initial value
 * declared below; {@code psm}, {@code oem} and {@code tableColumnThreshold} start at
 * {@code 0}.
 */
@JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
public static final class Builder {

    // ---- Core OCR settings ----
    private String language = "eng";
    private int psm = 0;
    @JsonProperty("output_format")
    private String outputFormat = "markdown";
    private int oem = 0;
    @JsonProperty("min_confidence")
    private double minConfidence = 0.0;
    private ImagePreprocessingConfig preprocessing = null;

    // ---- Table detection ----
    @JsonProperty("enable_table_detection")
    private boolean enableTableDetection = true;
    @JsonProperty("table_min_confidence")
    private double tableMinConfidence = 0.0;
    @JsonProperty("table_column_threshold")
    private int tableColumnThreshold = 0;
    @JsonProperty("table_row_threshold_ratio")
    private double tableRowThresholdRatio = 0.0;

    // ---- Caching ----
    @JsonProperty("use_cache")
    private boolean useCache = true;

    // ---- Engine tuning flags ----
    @JsonProperty("classify_use_pre_adapted_templates")
    private boolean classifyUsePreAdaptedTemplates = true;
    @JsonProperty("language_model_ngram_on")
    private boolean languageModelNgramOn = false;
    @JsonProperty("tessedit_dont_blkrej_good_wds")
    private boolean tesseditDontBlkrejGoodWds = true;
    @JsonProperty("tessedit_dont_rowrej_good_wds")
    private boolean tesseditDontRowrejGoodWds = true;
    @JsonProperty("tessedit_enable_dict_correction")
    private boolean tesseditEnableDictCorrection = true;
    @JsonProperty("tessedit_char_whitelist")
    private String tesseditCharWhitelist = "";
    @JsonProperty("tessedit_char_blacklist")
    private String tesseditCharBlacklist = "";
    @JsonProperty("tessedit_use_primary_params_model")
    private boolean tesseditUsePrimaryParamsModel = true;
    @JsonProperty("textord_space_size_is_variable")
    private boolean textordSpaceSizeIsVariable = true;
    @JsonProperty("thresholding_method")
    private boolean thresholdingMethod = false;

    /**
     * Stores the value for the {@code language} component.
     *
     * @param language the value to store
     * @return this builder, for chaining
     */
    @JsonProperty("language")
    public Builder withLanguage(final String language) {
        this.language = language;
        return this;
    }

    /**
     * Stores the value for the {@code psm} component.
     *
     * @param psm the value to store
     * @return this builder, for chaining
     */
    @JsonProperty("psm")
    public Builder withPsm(final int psm) {
        this.psm = psm;
        return this;
    }

    /**
     * Stores the value for the {@code outputFormat} component.
     *
     * @param outputFormat the value to store
     * @return this builder, for chaining
     */
    @JsonProperty("output_format")
    public Builder withOutputFormat(final String outputFormat) {
        this.outputFormat = outputFormat;
        return this;
    }

    /**
     * Stores the value for the {@code oem} component.
     *
     * @param oem the value to store
     * @return this builder, for chaining
     */
    @JsonProperty("oem")
    public Builder withOem(final int oem) {
        this.oem = oem;
        return this;
    }

    /**
     * Stores the value for the {@code minConfidence} component.
     *
     * @param minConfidence the value to store
     * @return this builder, for chaining
     */
    @JsonProperty("min_confidence")
    public Builder withMinConfidence(final double minConfidence) {
        this.minConfidence = minConfidence;
        return this;
    }

    /**
     * Stores the value for the {@code preprocessing} component.
     *
     * @param preprocessing the value to store; may be {@code null}
     * @return this builder, for chaining
     */
    @JsonProperty("preprocessing")
    public Builder withPreprocessing(final @Nullable ImagePreprocessingConfig preprocessing) {
        this.preprocessing = preprocessing;
        return this;
    }

    /**
     * Stores the value for the {@code enableTableDetection} component.
     *
     * @param enableTableDetection the value to store
     * @return this builder, for chaining
     */
    @JsonProperty("enable_table_detection")
    public Builder withEnableTableDetection(final boolean enableTableDetection) {
        this.enableTableDetection = enableTableDetection;
        return this;
    }

    /**
     * Stores the value for the {@code tableMinConfidence} component.
     *
     * @param tableMinConfidence the value to store
     * @return this builder, for chaining
     */
    @JsonProperty("table_min_confidence")
    public Builder withTableMinConfidence(final double tableMinConfidence) {
        this.tableMinConfidence = tableMinConfidence;
        return this;
    }

    /**
     * Stores the value for the {@code tableColumnThreshold} component.
     *
     * @param tableColumnThreshold the value to store
     * @return this builder, for chaining
     */
    @JsonProperty("table_column_threshold")
    public Builder withTableColumnThreshold(final int tableColumnThreshold) {
        this.tableColumnThreshold = tableColumnThreshold;
        return this;
    }

    /**
     * Stores the value for the {@code tableRowThresholdRatio} component.
     *
     * @param tableRowThresholdRatio the value to store
     * @return this builder, for chaining
     */
    @JsonProperty("table_row_threshold_ratio")
    public Builder withTableRowThresholdRatio(final double tableRowThresholdRatio) {
        this.tableRowThresholdRatio = tableRowThresholdRatio;
        return this;
    }

    /**
     * Stores the value for the {@code useCache} component.
     *
     * @param useCache the value to store
     * @return this builder, for chaining
     */
    @JsonProperty("use_cache")
    public Builder withUseCache(final boolean useCache) {
        this.useCache = useCache;
        return this;
    }

    /**
     * Stores the value for the {@code classifyUsePreAdaptedTemplates} component.
     *
     * @param classifyUsePreAdaptedTemplates the value to store
     * @return this builder, for chaining
     */
    @JsonProperty("classify_use_pre_adapted_templates")
    public Builder withClassifyUsePreAdaptedTemplates(final boolean classifyUsePreAdaptedTemplates) {
        this.classifyUsePreAdaptedTemplates = classifyUsePreAdaptedTemplates;
        return this;
    }

    /**
     * Stores the value for the {@code languageModelNgramOn} component.
     *
     * @param languageModelNgramOn the value to store
     * @return this builder, for chaining
     */
    @JsonProperty("language_model_ngram_on")
    public Builder withLanguageModelNgramOn(final boolean languageModelNgramOn) {
        this.languageModelNgramOn = languageModelNgramOn;
        return this;
    }

    /**
     * Stores the value for the {@code tesseditDontBlkrejGoodWds} component.
     *
     * @param tesseditDontBlkrejGoodWds the value to store
     * @return this builder, for chaining
     */
    @JsonProperty("tessedit_dont_blkrej_good_wds")
    public Builder withTesseditDontBlkrejGoodWds(final boolean tesseditDontBlkrejGoodWds) {
        this.tesseditDontBlkrejGoodWds = tesseditDontBlkrejGoodWds;
        return this;
    }

    /**
     * Stores the value for the {@code tesseditDontRowrejGoodWds} component.
     *
     * @param tesseditDontRowrejGoodWds the value to store
     * @return this builder, for chaining
     */
    @JsonProperty("tessedit_dont_rowrej_good_wds")
    public Builder withTesseditDontRowrejGoodWds(final boolean tesseditDontRowrejGoodWds) {
        this.tesseditDontRowrejGoodWds = tesseditDontRowrejGoodWds;
        return this;
    }

    /**
     * Stores the value for the {@code tesseditEnableDictCorrection} component.
     *
     * @param tesseditEnableDictCorrection the value to store
     * @return this builder, for chaining
     */
    @JsonProperty("tessedit_enable_dict_correction")
    public Builder withTesseditEnableDictCorrection(final boolean tesseditEnableDictCorrection) {
        this.tesseditEnableDictCorrection = tesseditEnableDictCorrection;
        return this;
    }

    /**
     * Stores the value for the {@code tesseditCharWhitelist} component.
     *
     * @param tesseditCharWhitelist the value to store
     * @return this builder, for chaining
     */
    @JsonProperty("tessedit_char_whitelist")
    public Builder withTesseditCharWhitelist(final String tesseditCharWhitelist) {
        this.tesseditCharWhitelist = tesseditCharWhitelist;
        return this;
    }

    /**
     * Stores the value for the {@code tesseditCharBlacklist} component.
     *
     * @param tesseditCharBlacklist the value to store
     * @return this builder, for chaining
     */
    @JsonProperty("tessedit_char_blacklist")
    public Builder withTesseditCharBlacklist(final String tesseditCharBlacklist) {
        this.tesseditCharBlacklist = tesseditCharBlacklist;
        return this;
    }

    /**
     * Stores the value for the {@code tesseditUsePrimaryParamsModel} component.
     *
     * @param tesseditUsePrimaryParamsModel the value to store
     * @return this builder, for chaining
     */
    @JsonProperty("tessedit_use_primary_params_model")
    public Builder withTesseditUsePrimaryParamsModel(final boolean tesseditUsePrimaryParamsModel) {
        this.tesseditUsePrimaryParamsModel = tesseditUsePrimaryParamsModel;
        return this;
    }

    /**
     * Stores the value for the {@code textordSpaceSizeIsVariable} component.
     *
     * @param textordSpaceSizeIsVariable the value to store
     * @return this builder, for chaining
     */
    @JsonProperty("textord_space_size_is_variable")
    public Builder withTextordSpaceSizeIsVariable(final boolean textordSpaceSizeIsVariable) {
        this.textordSpaceSizeIsVariable = textordSpaceSizeIsVariable;
        return this;
    }

    /**
     * Stores the value for the {@code thresholdingMethod} component.
     *
     * @param thresholdingMethod the value to store
     * @return this builder, for chaining
     */
    @JsonProperty("thresholding_method")
    public Builder withThresholdingMethod(final boolean thresholdingMethod) {
        this.thresholdingMethod = thresholdingMethod;
        return this;
    }

    /**
     * Creates a {@link TesseractConfig} from the values currently held by this builder.
     *
     * <p>Arguments are passed in the record's component declaration order.
     *
     * @return a new {@code TesseractConfig}
     */
    public TesseractConfig build() {
        return new TesseractConfig(
                this.language,
                this.psm,
                this.outputFormat,
                this.oem,
                this.minConfidence,
                this.preprocessing,
                this.enableTableDetection,
                this.tableMinConfidence,
                this.tableColumnThreshold,
                this.tableRowThresholdRatio,
                this.useCache,
                this.classifyUsePreAdaptedTemplates,
                this.languageModelNgramOn,
                this.tesseditDontBlkrejGoodWds,
                this.tesseditDontRowrejGoodWds,
                this.tesseditEnableDictCorrection,
                this.tesseditCharWhitelist,
                this.tesseditCharBlacklist,
                this.tesseditUsePrimaryParamsModel,
                this.textordSpaceSizeIsVariable,
                this.thresholdingMethod);
    }
}
// CPD-ON
/**
 * Placeholder for obtaining a default configuration; currently unimplemented.
 *
 * @return never returns normally
 * @throws UnsupportedOperationException always
 */
public static TesseractConfig defaultInstance() {
    final String reason = "defaultInstance is not yet bridged via JNI; use the Builder instead.";
    throw new UnsupportedOperationException(reason);
}
}