This commit is contained in:
354
packages/java/dev/kreuzberg/TesseractConfig.java
generated
Normal file
354
packages/java/dev/kreuzberg/TesseractConfig.java
generated
Normal file
@@ -0,0 +1,354 @@
|
||||
// This file is auto-generated by alef — DO NOT EDIT.
|
||||
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
// To regenerate: alef generate
|
||||
// To verify freshness: alef verify --exit-code
|
||||
// Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
package dev.kreuzberg;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonProperty;
|
||||
import com.fasterxml.jackson.annotation.JsonInclude;
|
||||
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
|
||||
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
|
||||
import org.jspecify.annotations.Nullable;
|
||||
|
||||
/**
|
||||
* Tesseract OCR configuration.
|
||||
*
|
||||
* Provides fine-grained control over Tesseract OCR engine parameters.
|
||||
* Most users can use the defaults, but these settings allow optimization
|
||||
* for specific document types (invoices, handwriting, etc.).
|
||||
*/
|
||||
@JsonInclude(JsonInclude.Include.NON_ABSENT)
|
||||
@JsonDeserialize(builder = TesseractConfig.Builder.class)
|
||||
public record TesseractConfig(
|
||||
/**
|
||||
* Language code (e.g., "eng", "deu", "fra")
|
||||
*/
|
||||
@JsonProperty("language") String language,
|
||||
/**
|
||||
* Page Segmentation Mode (0-13).
|
||||
*
|
||||
* Common values:
|
||||
* - 3: Fully automatic page segmentation (native default)
|
||||
* - 6: Assume a single uniform block of text (WASM default — avoids layout-analysis hang)
|
||||
* - 11: Sparse text with no particular order
|
||||
*/
|
||||
@JsonProperty("psm") int psm,
|
||||
/**
|
||||
* Output format ("text" or "markdown")
|
||||
*/
|
||||
@JsonProperty("output_format") String outputFormat,
|
||||
/**
|
||||
* OCR Engine Mode (0-3).
|
||||
*
|
||||
* - 0: Legacy engine only
|
||||
* - 1: Neural nets (LSTM) only (usually best)
|
||||
* - 2: Legacy + LSTM
|
||||
* - 3: Default (based on what's available)
|
||||
*/
|
||||
@JsonProperty("oem") int oem,
|
||||
/**
|
||||
* Minimum confidence threshold (0.0-100.0).
|
||||
*
|
||||
* Words with confidence below this threshold may be rejected or flagged.
|
||||
*/
|
||||
@JsonProperty("min_confidence") double minConfidence,
|
||||
/**
|
||||
* Image preprocessing configuration.
|
||||
*
|
||||
* Controls how images are preprocessed before OCR. Can significantly
|
||||
* improve quality for scanned documents or low-quality images.
|
||||
*/
|
||||
@Nullable @JsonProperty("preprocessing") ImagePreprocessingConfig preprocessing,
|
||||
/**
|
||||
* Enable automatic table detection and reconstruction
|
||||
*/
|
||||
@JsonProperty("enable_table_detection") boolean enableTableDetection,
|
||||
/**
|
||||
* Minimum confidence threshold for table detection (0.0-1.0)
|
||||
*/
|
||||
@JsonProperty("table_min_confidence") double tableMinConfidence,
|
||||
/**
|
||||
* Column threshold for table detection (pixels)
|
||||
*/
|
||||
@JsonProperty("table_column_threshold") int tableColumnThreshold,
|
||||
/**
|
||||
* Row threshold ratio for table detection (0.0-1.0)
|
||||
*/
|
||||
@JsonProperty("table_row_threshold_ratio") double tableRowThresholdRatio,
|
||||
/**
|
||||
* Enable OCR result caching
|
||||
*/
|
||||
@JsonProperty("use_cache") boolean useCache,
|
||||
/**
|
||||
* Use pre-adapted templates for character classification
|
||||
*/
|
||||
@JsonProperty("classify_use_pre_adapted_templates") boolean classifyUsePreAdaptedTemplates,
|
||||
/**
|
||||
* Enable N-gram language model
|
||||
*/
|
||||
@JsonProperty("language_model_ngram_on") boolean languageModelNgramOn,
|
||||
/**
|
||||
* Don't reject good words during block-level processing
|
||||
*/
|
||||
@JsonProperty("tessedit_dont_blkrej_good_wds") boolean tesseditDontBlkrejGoodWds,
|
||||
/**
|
||||
* Don't reject good words during row-level processing
|
||||
*/
|
||||
@JsonProperty("tessedit_dont_rowrej_good_wds") boolean tesseditDontRowrejGoodWds,
|
||||
/**
|
||||
* Enable dictionary correction
|
||||
*/
|
||||
@JsonProperty("tessedit_enable_dict_correction") boolean tesseditEnableDictCorrection,
|
||||
/**
|
||||
* Whitelist of allowed characters (empty = all allowed)
|
||||
*/
|
||||
@JsonProperty("tessedit_char_whitelist") String tesseditCharWhitelist,
|
||||
/**
|
||||
* Blacklist of forbidden characters (empty = none forbidden)
|
||||
*/
|
||||
@JsonProperty("tessedit_char_blacklist") String tesseditCharBlacklist,
|
||||
/**
|
||||
* Use primary language params model
|
||||
*/
|
||||
@JsonProperty("tessedit_use_primary_params_model") boolean tesseditUsePrimaryParamsModel,
|
||||
/**
|
||||
* Variable-width space detection
|
||||
*/
|
||||
@JsonProperty("textord_space_size_is_variable") boolean textordSpaceSizeIsVariable,
|
||||
/**
|
||||
* Use adaptive thresholding method
|
||||
*/
|
||||
@JsonProperty("thresholding_method") boolean thresholdingMethod
|
||||
) {
|
||||
public static Builder builder() {
|
||||
return new Builder();
|
||||
}
|
||||
public TesseractConfig{
|
||||
if (psm == 0) psm = 3;
|
||||
if (oem == 0) oem = 3;
|
||||
if (tableColumnThreshold == 0) tableColumnThreshold = 50;
|
||||
}
|
||||
|
||||
// CPD-OFF
|
||||
@JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
|
||||
public static final class Builder {
|
||||
|
||||
private String language = "eng";
|
||||
private int psm = 0;
|
||||
@JsonProperty("output_format")
|
||||
private String outputFormat = "markdown";
|
||||
private int oem = 0;
|
||||
@JsonProperty("min_confidence")
|
||||
private double minConfidence = 0.0;
|
||||
private ImagePreprocessingConfig preprocessing = null;
|
||||
@JsonProperty("enable_table_detection")
|
||||
private boolean enableTableDetection = true;
|
||||
@JsonProperty("table_min_confidence")
|
||||
private double tableMinConfidence = 0.0;
|
||||
@JsonProperty("table_column_threshold")
|
||||
private int tableColumnThreshold = 0;
|
||||
@JsonProperty("table_row_threshold_ratio")
|
||||
private double tableRowThresholdRatio = 0.0;
|
||||
@JsonProperty("use_cache")
|
||||
private boolean useCache = true;
|
||||
@JsonProperty("classify_use_pre_adapted_templates")
|
||||
private boolean classifyUsePreAdaptedTemplates = true;
|
||||
@JsonProperty("language_model_ngram_on")
|
||||
private boolean languageModelNgramOn = false;
|
||||
@JsonProperty("tessedit_dont_blkrej_good_wds")
|
||||
private boolean tesseditDontBlkrejGoodWds = true;
|
||||
@JsonProperty("tessedit_dont_rowrej_good_wds")
|
||||
private boolean tesseditDontRowrejGoodWds = true;
|
||||
@JsonProperty("tessedit_enable_dict_correction")
|
||||
private boolean tesseditEnableDictCorrection = true;
|
||||
@JsonProperty("tessedit_char_whitelist")
|
||||
private String tesseditCharWhitelist = "";
|
||||
@JsonProperty("tessedit_char_blacklist")
|
||||
private String tesseditCharBlacklist = "";
|
||||
@JsonProperty("tessedit_use_primary_params_model")
|
||||
private boolean tesseditUsePrimaryParamsModel = true;
|
||||
@JsonProperty("textord_space_size_is_variable")
|
||||
private boolean textordSpaceSizeIsVariable = true;
|
||||
@JsonProperty("thresholding_method")
|
||||
private boolean thresholdingMethod = false;
|
||||
|
||||
/** Sets the language field. */
|
||||
@JsonProperty("language")
|
||||
public Builder withLanguage(final String value) {
|
||||
this.language = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the psm field. */
|
||||
@JsonProperty("psm")
|
||||
public Builder withPsm(final int value) {
|
||||
this.psm = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the outputFormat field. */
|
||||
@JsonProperty("output_format")
|
||||
public Builder withOutputFormat(final String value) {
|
||||
this.outputFormat = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the oem field. */
|
||||
@JsonProperty("oem")
|
||||
public Builder withOem(final int value) {
|
||||
this.oem = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the minConfidence field. */
|
||||
@JsonProperty("min_confidence")
|
||||
public Builder withMinConfidence(final double value) {
|
||||
this.minConfidence = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the preprocessing field. */
|
||||
@JsonProperty("preprocessing")
|
||||
public Builder withPreprocessing(final @Nullable ImagePreprocessingConfig value) {
|
||||
this.preprocessing = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the enableTableDetection field. */
|
||||
@JsonProperty("enable_table_detection")
|
||||
public Builder withEnableTableDetection(final boolean value) {
|
||||
this.enableTableDetection = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the tableMinConfidence field. */
|
||||
@JsonProperty("table_min_confidence")
|
||||
public Builder withTableMinConfidence(final double value) {
|
||||
this.tableMinConfidence = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the tableColumnThreshold field. */
|
||||
@JsonProperty("table_column_threshold")
|
||||
public Builder withTableColumnThreshold(final int value) {
|
||||
this.tableColumnThreshold = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the tableRowThresholdRatio field. */
|
||||
@JsonProperty("table_row_threshold_ratio")
|
||||
public Builder withTableRowThresholdRatio(final double value) {
|
||||
this.tableRowThresholdRatio = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the useCache field. */
|
||||
@JsonProperty("use_cache")
|
||||
public Builder withUseCache(final boolean value) {
|
||||
this.useCache = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the classifyUsePreAdaptedTemplates field. */
|
||||
@JsonProperty("classify_use_pre_adapted_templates")
|
||||
public Builder withClassifyUsePreAdaptedTemplates(final boolean value) {
|
||||
this.classifyUsePreAdaptedTemplates = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the languageModelNgramOn field. */
|
||||
@JsonProperty("language_model_ngram_on")
|
||||
public Builder withLanguageModelNgramOn(final boolean value) {
|
||||
this.languageModelNgramOn = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the tesseditDontBlkrejGoodWds field. */
|
||||
@JsonProperty("tessedit_dont_blkrej_good_wds")
|
||||
public Builder withTesseditDontBlkrejGoodWds(final boolean value) {
|
||||
this.tesseditDontBlkrejGoodWds = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the tesseditDontRowrejGoodWds field. */
|
||||
@JsonProperty("tessedit_dont_rowrej_good_wds")
|
||||
public Builder withTesseditDontRowrejGoodWds(final boolean value) {
|
||||
this.tesseditDontRowrejGoodWds = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the tesseditEnableDictCorrection field. */
|
||||
@JsonProperty("tessedit_enable_dict_correction")
|
||||
public Builder withTesseditEnableDictCorrection(final boolean value) {
|
||||
this.tesseditEnableDictCorrection = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the tesseditCharWhitelist field. */
|
||||
@JsonProperty("tessedit_char_whitelist")
|
||||
public Builder withTesseditCharWhitelist(final String value) {
|
||||
this.tesseditCharWhitelist = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the tesseditCharBlacklist field. */
|
||||
@JsonProperty("tessedit_char_blacklist")
|
||||
public Builder withTesseditCharBlacklist(final String value) {
|
||||
this.tesseditCharBlacklist = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the tesseditUsePrimaryParamsModel field. */
|
||||
@JsonProperty("tessedit_use_primary_params_model")
|
||||
public Builder withTesseditUsePrimaryParamsModel(final boolean value) {
|
||||
this.tesseditUsePrimaryParamsModel = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the textordSpaceSizeIsVariable field. */
|
||||
@JsonProperty("textord_space_size_is_variable")
|
||||
public Builder withTextordSpaceSizeIsVariable(final boolean value) {
|
||||
this.textordSpaceSizeIsVariable = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the thresholdingMethod field. */
|
||||
@JsonProperty("thresholding_method")
|
||||
public Builder withThresholdingMethod(final boolean value) {
|
||||
this.thresholdingMethod = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Builds the TesseractConfig instance. */
|
||||
public TesseractConfig build() {
|
||||
return new TesseractConfig(
|
||||
language,
|
||||
psm,
|
||||
outputFormat,
|
||||
oem,
|
||||
minConfidence,
|
||||
preprocessing,
|
||||
enableTableDetection,
|
||||
tableMinConfidence,
|
||||
tableColumnThreshold,
|
||||
tableRowThresholdRatio,
|
||||
useCache,
|
||||
classifyUsePreAdaptedTemplates,
|
||||
languageModelNgramOn,
|
||||
tesseditDontBlkrejGoodWds,
|
||||
tesseditDontRowrejGoodWds,
|
||||
tesseditEnableDictCorrection,
|
||||
tesseditCharWhitelist,
|
||||
tesseditCharBlacklist,
|
||||
tesseditUsePrimaryParamsModel,
|
||||
textordSpaceSizeIsVariable,
|
||||
thresholdingMethod
|
||||
);
|
||||
}
|
||||
}
|
||||
// CPD-ON
|
||||
public static TesseractConfig defaultInstance() {
|
||||
throw new UnsupportedOperationException("defaultInstance is not yet bridged via JNI; use the Builder instead.");
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user