// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
|
|
|
|
import com.fasterxml.jackson.annotation.JsonProperty;
|
|
import com.fasterxml.jackson.annotation.JsonInclude;
|
|
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
|
|
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
|
|
import org.jspecify.annotations.Nullable;
|
|
|
|
/**
|
|
* Tesseract OCR configuration.
|
|
*
|
|
* Provides fine-grained control over Tesseract OCR engine parameters.
|
|
* Most users can use the defaults, but these settings allow optimization
|
|
* for specific document types (invoices, handwriting, etc.).
|
|
*/
|
|
@JsonInclude(JsonInclude.Include.NON_ABSENT)
|
|
@JsonDeserialize(builder = TesseractConfig.Builder.class)
|
|
public record TesseractConfig(
|
|
/**
|
|
* Language code (e.g., "eng", "deu", "fra")
|
|
*/
|
|
@JsonProperty("language") String language,
|
|
/**
|
|
* Page Segmentation Mode (0-13).
|
|
*
|
|
* Common values:
|
|
* - 3: Fully automatic page segmentation (native default)
|
|
* - 6: Assume a single uniform block of text (WASM default — avoids layout-analysis hang)
|
|
* - 11: Sparse text with no particular order
|
|
*/
|
|
@JsonProperty("psm") int psm,
|
|
/**
|
|
* Output format ("text" or "markdown")
|
|
*/
|
|
@JsonProperty("output_format") String outputFormat,
|
|
/**
|
|
* OCR Engine Mode (0-3).
|
|
*
|
|
* - 0: Legacy engine only
|
|
* - 1: Neural nets (LSTM) only (usually best)
|
|
* - 2: Legacy + LSTM
|
|
* - 3: Default (based on what's available)
|
|
*/
|
|
@JsonProperty("oem") int oem,
|
|
/**
|
|
* Minimum confidence threshold (0.0-100.0).
|
|
*
|
|
* Words with confidence below this threshold may be rejected or flagged.
|
|
*/
|
|
@JsonProperty("min_confidence") double minConfidence,
|
|
/**
|
|
* Image preprocessing configuration.
|
|
*
|
|
* Controls how images are preprocessed before OCR. Can significantly
|
|
* improve quality for scanned documents or low-quality images.
|
|
*/
|
|
@Nullable @JsonProperty("preprocessing") ImagePreprocessingConfig preprocessing,
|
|
/**
|
|
* Enable automatic table detection and reconstruction
|
|
*/
|
|
@JsonProperty("enable_table_detection") boolean enableTableDetection,
|
|
/**
|
|
* Minimum confidence threshold for table detection (0.0-1.0)
|
|
*/
|
|
@JsonProperty("table_min_confidence") double tableMinConfidence,
|
|
/**
|
|
* Column threshold for table detection (pixels)
|
|
*/
|
|
@JsonProperty("table_column_threshold") int tableColumnThreshold,
|
|
/**
|
|
* Row threshold ratio for table detection (0.0-1.0)
|
|
*/
|
|
@JsonProperty("table_row_threshold_ratio") double tableRowThresholdRatio,
|
|
/**
|
|
* Enable OCR result caching
|
|
*/
|
|
@JsonProperty("use_cache") boolean useCache,
|
|
/**
|
|
* Use pre-adapted templates for character classification
|
|
*/
|
|
@JsonProperty("classify_use_pre_adapted_templates") boolean classifyUsePreAdaptedTemplates,
|
|
/**
|
|
* Enable N-gram language model
|
|
*/
|
|
@JsonProperty("language_model_ngram_on") boolean languageModelNgramOn,
|
|
/**
|
|
* Don't reject good words during block-level processing
|
|
*/
|
|
@JsonProperty("tessedit_dont_blkrej_good_wds") boolean tesseditDontBlkrejGoodWds,
|
|
/**
|
|
* Don't reject good words during row-level processing
|
|
*/
|
|
@JsonProperty("tessedit_dont_rowrej_good_wds") boolean tesseditDontRowrejGoodWds,
|
|
/**
|
|
* Enable dictionary correction
|
|
*/
|
|
@JsonProperty("tessedit_enable_dict_correction") boolean tesseditEnableDictCorrection,
|
|
/**
|
|
* Whitelist of allowed characters (empty = all allowed)
|
|
*/
|
|
@JsonProperty("tessedit_char_whitelist") String tesseditCharWhitelist,
|
|
/**
|
|
* Blacklist of forbidden characters (empty = none forbidden)
|
|
*/
|
|
@JsonProperty("tessedit_char_blacklist") String tesseditCharBlacklist,
|
|
/**
|
|
* Use primary language params model
|
|
*/
|
|
@JsonProperty("tessedit_use_primary_params_model") boolean tesseditUsePrimaryParamsModel,
|
|
/**
|
|
* Variable-width space detection
|
|
*/
|
|
@JsonProperty("textord_space_size_is_variable") boolean textordSpaceSizeIsVariable,
|
|
/**
|
|
* Use adaptive thresholding method
|
|
*/
|
|
@JsonProperty("thresholding_method") boolean thresholdingMethod
|
|
) {
|
|
public static Builder builder() {
|
|
return new Builder();
|
|
}
|
|
public TesseractConfig{
|
|
if (psm == 0) psm = 3;
|
|
if (oem == 0) oem = 3;
|
|
if (tableColumnThreshold == 0) tableColumnThreshold = 50;
|
|
}
|
|
|
|
// CPD-OFF
|
|
@JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
|
|
public static final class Builder {
|
|
|
|
private String language = "eng";
|
|
private int psm = 0;
|
|
@JsonProperty("output_format")
|
|
private String outputFormat = "markdown";
|
|
private int oem = 0;
|
|
@JsonProperty("min_confidence")
|
|
private double minConfidence = 0.0;
|
|
private ImagePreprocessingConfig preprocessing = null;
|
|
@JsonProperty("enable_table_detection")
|
|
private boolean enableTableDetection = true;
|
|
@JsonProperty("table_min_confidence")
|
|
private double tableMinConfidence = 0.0;
|
|
@JsonProperty("table_column_threshold")
|
|
private int tableColumnThreshold = 0;
|
|
@JsonProperty("table_row_threshold_ratio")
|
|
private double tableRowThresholdRatio = 0.0;
|
|
@JsonProperty("use_cache")
|
|
private boolean useCache = true;
|
|
@JsonProperty("classify_use_pre_adapted_templates")
|
|
private boolean classifyUsePreAdaptedTemplates = true;
|
|
@JsonProperty("language_model_ngram_on")
|
|
private boolean languageModelNgramOn = false;
|
|
@JsonProperty("tessedit_dont_blkrej_good_wds")
|
|
private boolean tesseditDontBlkrejGoodWds = true;
|
|
@JsonProperty("tessedit_dont_rowrej_good_wds")
|
|
private boolean tesseditDontRowrejGoodWds = true;
|
|
@JsonProperty("tessedit_enable_dict_correction")
|
|
private boolean tesseditEnableDictCorrection = true;
|
|
@JsonProperty("tessedit_char_whitelist")
|
|
private String tesseditCharWhitelist = "";
|
|
@JsonProperty("tessedit_char_blacklist")
|
|
private String tesseditCharBlacklist = "";
|
|
@JsonProperty("tessedit_use_primary_params_model")
|
|
private boolean tesseditUsePrimaryParamsModel = true;
|
|
@JsonProperty("textord_space_size_is_variable")
|
|
private boolean textordSpaceSizeIsVariable = true;
|
|
@JsonProperty("thresholding_method")
|
|
private boolean thresholdingMethod = false;
|
|
|
|
/** Sets the language field. */
|
|
@JsonProperty("language")
|
|
public Builder withLanguage(final String value) {
|
|
this.language = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the psm field. */
|
|
@JsonProperty("psm")
|
|
public Builder withPsm(final int value) {
|
|
this.psm = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the outputFormat field. */
|
|
@JsonProperty("output_format")
|
|
public Builder withOutputFormat(final String value) {
|
|
this.outputFormat = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the oem field. */
|
|
@JsonProperty("oem")
|
|
public Builder withOem(final int value) {
|
|
this.oem = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the minConfidence field. */
|
|
@JsonProperty("min_confidence")
|
|
public Builder withMinConfidence(final double value) {
|
|
this.minConfidence = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the preprocessing field. */
|
|
@JsonProperty("preprocessing")
|
|
public Builder withPreprocessing(final @Nullable ImagePreprocessingConfig value) {
|
|
this.preprocessing = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the enableTableDetection field. */
|
|
@JsonProperty("enable_table_detection")
|
|
public Builder withEnableTableDetection(final boolean value) {
|
|
this.enableTableDetection = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the tableMinConfidence field. */
|
|
@JsonProperty("table_min_confidence")
|
|
public Builder withTableMinConfidence(final double value) {
|
|
this.tableMinConfidence = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the tableColumnThreshold field. */
|
|
@JsonProperty("table_column_threshold")
|
|
public Builder withTableColumnThreshold(final int value) {
|
|
this.tableColumnThreshold = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the tableRowThresholdRatio field. */
|
|
@JsonProperty("table_row_threshold_ratio")
|
|
public Builder withTableRowThresholdRatio(final double value) {
|
|
this.tableRowThresholdRatio = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the useCache field. */
|
|
@JsonProperty("use_cache")
|
|
public Builder withUseCache(final boolean value) {
|
|
this.useCache = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the classifyUsePreAdaptedTemplates field. */
|
|
@JsonProperty("classify_use_pre_adapted_templates")
|
|
public Builder withClassifyUsePreAdaptedTemplates(final boolean value) {
|
|
this.classifyUsePreAdaptedTemplates = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the languageModelNgramOn field. */
|
|
@JsonProperty("language_model_ngram_on")
|
|
public Builder withLanguageModelNgramOn(final boolean value) {
|
|
this.languageModelNgramOn = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the tesseditDontBlkrejGoodWds field. */
|
|
@JsonProperty("tessedit_dont_blkrej_good_wds")
|
|
public Builder withTesseditDontBlkrejGoodWds(final boolean value) {
|
|
this.tesseditDontBlkrejGoodWds = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the tesseditDontRowrejGoodWds field. */
|
|
@JsonProperty("tessedit_dont_rowrej_good_wds")
|
|
public Builder withTesseditDontRowrejGoodWds(final boolean value) {
|
|
this.tesseditDontRowrejGoodWds = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the tesseditEnableDictCorrection field. */
|
|
@JsonProperty("tessedit_enable_dict_correction")
|
|
public Builder withTesseditEnableDictCorrection(final boolean value) {
|
|
this.tesseditEnableDictCorrection = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the tesseditCharWhitelist field. */
|
|
@JsonProperty("tessedit_char_whitelist")
|
|
public Builder withTesseditCharWhitelist(final String value) {
|
|
this.tesseditCharWhitelist = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the tesseditCharBlacklist field. */
|
|
@JsonProperty("tessedit_char_blacklist")
|
|
public Builder withTesseditCharBlacklist(final String value) {
|
|
this.tesseditCharBlacklist = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the tesseditUsePrimaryParamsModel field. */
|
|
@JsonProperty("tessedit_use_primary_params_model")
|
|
public Builder withTesseditUsePrimaryParamsModel(final boolean value) {
|
|
this.tesseditUsePrimaryParamsModel = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the textordSpaceSizeIsVariable field. */
|
|
@JsonProperty("textord_space_size_is_variable")
|
|
public Builder withTextordSpaceSizeIsVariable(final boolean value) {
|
|
this.textordSpaceSizeIsVariable = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the thresholdingMethod field. */
|
|
@JsonProperty("thresholding_method")
|
|
public Builder withThresholdingMethod(final boolean value) {
|
|
this.thresholdingMethod = value;
|
|
return this;
|
|
}
|
|
|
|
/** Builds the TesseractConfig instance. */
|
|
public TesseractConfig build() {
|
|
return new TesseractConfig(
|
|
language,
|
|
psm,
|
|
outputFormat,
|
|
oem,
|
|
minConfidence,
|
|
preprocessing,
|
|
enableTableDetection,
|
|
tableMinConfidence,
|
|
tableColumnThreshold,
|
|
tableRowThresholdRatio,
|
|
useCache,
|
|
classifyUsePreAdaptedTemplates,
|
|
languageModelNgramOn,
|
|
tesseditDontBlkrejGoodWds,
|
|
tesseditDontRowrejGoodWds,
|
|
tesseditEnableDictCorrection,
|
|
tesseditCharWhitelist,
|
|
tesseditCharBlacklist,
|
|
tesseditUsePrimaryParamsModel,
|
|
textordSpaceSizeIsVariable,
|
|
thresholdingMethod
|
|
);
|
|
}
|
|
}
|
|
// CPD-ON
|
|
public static TesseractConfig defaultInstance() {
|
|
throw new UnsupportedOperationException("defaultInstance is not yet bridged via JNI; use the Builder instead.");
|
|
}
|
|
}
|