Files
fil/packages/java/dev/kreuzberg/OcrQualityThresholds.java

280 lines
11 KiB
Java
Raw Permalink Normal View History

2026-06-01 23:40:55 +02:00
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
import org.jspecify.annotations.Nullable;
/**
 * Quality thresholds for OCR fallback decisions and pipeline quality gating.
 *
 * Every component is nullable. The canonical constructor substitutes a built-in
 * default for each integer-valued ({@code Long}) threshold that is {@code null};
 * the floating-point ({@code Double}) thresholds are stored exactly as supplied
 * and may therefore stay {@code null}.
 * NOTE(review): the built-in defaults are described upstream as matching the
 * previously hardcoded behavior, and the floating-point defaults are presumably
 * filled in by whoever consumes this record - confirm both.
 *
 * Instances are created through {@link #builder()}. {@link #defaultInstance()}
 * currently always throws.
 *
 * JSON mapping uses the snake_case names given on each component. Components that
 * are {@code null} (or absent) are left out when serializing, and deserialization
 * is routed through {@link Builder}.
 *
 * @param minTotalNonWhitespace minimum total non-whitespace characters to consider
 *     text substantive; {@code 64} when {@code null}
 * @param minNonWhitespacePerPage minimum non-whitespace characters per page on average
 * @param minMeaningfulWordLen minimum character count for a word to be "meaningful";
 *     {@code 4} when {@code null}
 * @param minMeaningfulWords minimum count of meaningful words before text is accepted;
 *     {@code 3} when {@code null}
 * @param minAlnumRatio minimum alphanumeric ratio (non-whitespace chars that are
 *     alphanumeric)
 * @param minGarbageChars minimum Unicode replacement characters (U+FFFD) to trigger
 *     OCR fallback; {@code 5} when {@code null}
 * @param maxFragmentedWordRatio maximum fraction of short (1-2 char) words before text
 *     is considered fragmented
 * @param criticalFragmentedWordRatio critical fragmentation threshold that triggers OCR
 *     regardless of meaningful words; normal English text has ~20-30% short words,
 *     80%+ is definitive garbage
 * @param minAvgWordLength minimum average word length; below this with enough words
 *     indicates garbled extraction
 * @param minWordsForAvgLengthCheck minimum word count before the average word length
 *     check applies; {@code 50} when {@code null}
 * @param minConsecutiveRepeatRatio minimum consecutive word repetition ratio to detect
 *     column scrambling
 * @param minWordsForRepeatCheck minimum word count before the consecutive repetition
 *     check is applied; {@code 50} when {@code null}
 * @param substantiveMinChars minimum character count for the "substantive markdown"
 *     OCR skip gate; {@code 100} when {@code null}
 * @param nonTextMinChars minimum character count for the "non-text content" OCR skip
 *     gate; {@code 20} when {@code null}
 * @param alnumWsRatioThreshold alphanumeric+whitespace ratio threshold for skip decisions
 * @param pipelineMinQuality minimum quality score (0.0-1.0) for a pipeline stage result
 *     to be accepted; if the result from a backend scores below this, the next backend
 *     is tried
 */
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = OcrQualityThresholds.Builder.class)
public record OcrQualityThresholds(
    @Nullable @JsonProperty("min_total_non_whitespace")
    Long minTotalNonWhitespace,

    @Nullable @JsonProperty("min_non_whitespace_per_page")
    Double minNonWhitespacePerPage,

    @Nullable @JsonProperty("min_meaningful_word_len")
    Long minMeaningfulWordLen,

    @Nullable @JsonProperty("min_meaningful_words")
    Long minMeaningfulWords,

    @Nullable @JsonProperty("min_alnum_ratio")
    Double minAlnumRatio,

    @Nullable @JsonProperty("min_garbage_chars")
    Long minGarbageChars,

    @Nullable @JsonProperty("max_fragmented_word_ratio")
    Double maxFragmentedWordRatio,

    @Nullable @JsonProperty("critical_fragmented_word_ratio")
    Double criticalFragmentedWordRatio,

    @Nullable @JsonProperty("min_avg_word_length")
    Double minAvgWordLength,

    @Nullable @JsonProperty("min_words_for_avg_length_check")
    Long minWordsForAvgLengthCheck,

    @Nullable @JsonProperty("min_consecutive_repeat_ratio")
    Double minConsecutiveRepeatRatio,

    @Nullable @JsonProperty("min_words_for_repeat_check")
    Long minWordsForRepeatCheck,

    @Nullable @JsonProperty("substantive_min_chars")
    Long substantiveMinChars,

    @Nullable @JsonProperty("non_text_min_chars")
    Long nonTextMinChars,

    @Nullable @JsonProperty("alnum_ws_ratio_threshold")
    Double alnumWsRatioThreshold,

    @Nullable @JsonProperty("pipeline_min_quality")
    Double pipelineMinQuality
) {

    /**
     * Compact canonical constructor.
     *
     * Replaces each {@code null} integer-valued threshold with its built-in default.
     * The floating-point thresholds are not touched here.
     */
    public OcrQualityThresholds {
        if (minTotalNonWhitespace == null) {
            minTotalNonWhitespace = 64L;
        }
        if (minMeaningfulWordLen == null) {
            minMeaningfulWordLen = 4L;
        }
        if (minMeaningfulWords == null) {
            minMeaningfulWords = 3L;
        }
        if (minGarbageChars == null) {
            minGarbageChars = 5L;
        }
        if (minWordsForAvgLengthCheck == null) {
            minWordsForAvgLengthCheck = 50L;
        }
        if (minWordsForRepeatCheck == null) {
            minWordsForRepeatCheck = 50L;
        }
        if (substantiveMinChars == null) {
            substantiveMinChars = 100L;
        }
        if (nonTextMinChars == null) {
            nonTextMinChars = 20L;
        }
    }

    /**
     * Creates an empty {@link Builder} in which every threshold is unset.
     *
     * @return a new builder
     */
    public static Builder builder() {
        return new Builder();
    }

    /**
     * Placeholder for a default-instance factory; it is not implemented and always throws.
     *
     * @return never returns normally
     * @throws UnsupportedOperationException always
     */
    public static OcrQualityThresholds defaultInstance() {
        final String message = "defaultInstance is not yet bridged via JNI; use the Builder instead.";
        throw new UnsupportedOperationException(message);
    }

    // CPD-OFF
    /**
     * Mutable builder for {@link OcrQualityThresholds}.
     *
     * Each {@code with...} method stores the given value (which may be {@code null})
     * and returns this builder so calls can be chained. It is also the entry point
     * used for JSON deserialization: setters carry the {@code with} prefix and
     * {@link #build()} produces the record.
     */
    @JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
    public static final class Builder {

        // Pending values. All of them start out as null, meaning "not set".
        @JsonProperty("min_total_non_whitespace")
        private Long minTotalNonWhitespace;

        @JsonProperty("min_non_whitespace_per_page")
        private Double minNonWhitespacePerPage;

        @JsonProperty("min_meaningful_word_len")
        private Long minMeaningfulWordLen;

        @JsonProperty("min_meaningful_words")
        private Long minMeaningfulWords;

        @JsonProperty("min_alnum_ratio")
        private Double minAlnumRatio;

        @JsonProperty("min_garbage_chars")
        private Long minGarbageChars;

        @JsonProperty("max_fragmented_word_ratio")
        private Double maxFragmentedWordRatio;

        @JsonProperty("critical_fragmented_word_ratio")
        private Double criticalFragmentedWordRatio;

        @JsonProperty("min_avg_word_length")
        private Double minAvgWordLength;

        @JsonProperty("min_words_for_avg_length_check")
        private Long minWordsForAvgLengthCheck;

        @JsonProperty("min_consecutive_repeat_ratio")
        private Double minConsecutiveRepeatRatio;

        @JsonProperty("min_words_for_repeat_check")
        private Long minWordsForRepeatCheck;

        @JsonProperty("substantive_min_chars")
        private Long substantiveMinChars;

        @JsonProperty("non_text_min_chars")
        private Long nonTextMinChars;

        @JsonProperty("alnum_ws_ratio_threshold")
        private Double alnumWsRatioThreshold;

        @JsonProperty("pipeline_min_quality")
        private Double pipelineMinQuality;

        /**
         * Stores the minimum total non-whitespace character count.
         *
         * @param value the threshold, or {@code null} to leave it unset
         * @return this builder
         */
        @JsonProperty("min_total_non_whitespace")
        public Builder withMinTotalNonWhitespace(final @Nullable Long value) {
            this.minTotalNonWhitespace = value;
            return this;
        }

        /**
         * Stores the minimum average non-whitespace character count per page.
         *
         * @param value the threshold, or {@code null} to leave it unset
         * @return this builder
         */
        @JsonProperty("min_non_whitespace_per_page")
        public Builder withMinNonWhitespacePerPage(final @Nullable Double value) {
            this.minNonWhitespacePerPage = value;
            return this;
        }

        /**
         * Stores the minimum length of a "meaningful" word.
         *
         * @param value the threshold, or {@code null} to leave it unset
         * @return this builder
         */
        @JsonProperty("min_meaningful_word_len")
        public Builder withMinMeaningfulWordLen(final @Nullable Long value) {
            this.minMeaningfulWordLen = value;
            return this;
        }

        /**
         * Stores the minimum number of meaningful words.
         *
         * @param value the threshold, or {@code null} to leave it unset
         * @return this builder
         */
        @JsonProperty("min_meaningful_words")
        public Builder withMinMeaningfulWords(final @Nullable Long value) {
            this.minMeaningfulWords = value;
            return this;
        }

        /**
         * Stores the minimum alphanumeric ratio.
         *
         * @param value the threshold, or {@code null} to leave it unset
         * @return this builder
         */
        @JsonProperty("min_alnum_ratio")
        public Builder withMinAlnumRatio(final @Nullable Double value) {
            this.minAlnumRatio = value;
            return this;
        }

        /**
         * Stores the minimum count of Unicode replacement characters.
         *
         * @param value the threshold, or {@code null} to leave it unset
         * @return this builder
         */
        @JsonProperty("min_garbage_chars")
        public Builder withMinGarbageChars(final @Nullable Long value) {
            this.minGarbageChars = value;
            return this;
        }

        /**
         * Stores the maximum fraction of short words.
         *
         * @param value the threshold, or {@code null} to leave it unset
         * @return this builder
         */
        @JsonProperty("max_fragmented_word_ratio")
        public Builder withMaxFragmentedWordRatio(final @Nullable Double value) {
            this.maxFragmentedWordRatio = value;
            return this;
        }

        /**
         * Stores the critical fragmentation threshold.
         *
         * @param value the threshold, or {@code null} to leave it unset
         * @return this builder
         */
        @JsonProperty("critical_fragmented_word_ratio")
        public Builder withCriticalFragmentedWordRatio(final @Nullable Double value) {
            this.criticalFragmentedWordRatio = value;
            return this;
        }

        /**
         * Stores the minimum average word length.
         *
         * @param value the threshold, or {@code null} to leave it unset
         * @return this builder
         */
        @JsonProperty("min_avg_word_length")
        public Builder withMinAvgWordLength(final @Nullable Double value) {
            this.minAvgWordLength = value;
            return this;
        }

        /**
         * Stores the minimum word count for the average word length check.
         *
         * @param value the threshold, or {@code null} to leave it unset
         * @return this builder
         */
        @JsonProperty("min_words_for_avg_length_check")
        public Builder withMinWordsForAvgLengthCheck(final @Nullable Long value) {
            this.minWordsForAvgLengthCheck = value;
            return this;
        }

        /**
         * Stores the minimum consecutive word repetition ratio.
         *
         * @param value the threshold, or {@code null} to leave it unset
         * @return this builder
         */
        @JsonProperty("min_consecutive_repeat_ratio")
        public Builder withMinConsecutiveRepeatRatio(final @Nullable Double value) {
            this.minConsecutiveRepeatRatio = value;
            return this;
        }

        /**
         * Stores the minimum word count for the consecutive repetition check.
         *
         * @param value the threshold, or {@code null} to leave it unset
         * @return this builder
         */
        @JsonProperty("min_words_for_repeat_check")
        public Builder withMinWordsForRepeatCheck(final @Nullable Long value) {
            this.minWordsForRepeatCheck = value;
            return this;
        }

        /**
         * Stores the minimum character count for the "substantive markdown" gate.
         *
         * @param value the threshold, or {@code null} to leave it unset
         * @return this builder
         */
        @JsonProperty("substantive_min_chars")
        public Builder withSubstantiveMinChars(final @Nullable Long value) {
            this.substantiveMinChars = value;
            return this;
        }

        /**
         * Stores the minimum character count for the "non-text content" gate.
         *
         * @param value the threshold, or {@code null} to leave it unset
         * @return this builder
         */
        @JsonProperty("non_text_min_chars")
        public Builder withNonTextMinChars(final @Nullable Long value) {
            this.nonTextMinChars = value;
            return this;
        }

        /**
         * Stores the alphanumeric+whitespace ratio threshold.
         *
         * @param value the threshold, or {@code null} to leave it unset
         * @return this builder
         */
        @JsonProperty("alnum_ws_ratio_threshold")
        public Builder withAlnumWsRatioThreshold(final @Nullable Double value) {
            this.alnumWsRatioThreshold = value;
            return this;
        }

        /**
         * Stores the minimum pipeline quality score.
         *
         * @param value the threshold, or {@code null} to leave it unset
         * @return this builder
         */
        @JsonProperty("pipeline_min_quality")
        public Builder withPipelineMinQuality(final @Nullable Double value) {
            this.pipelineMinQuality = value;
            return this;
        }

        /**
         * Creates an {@link OcrQualityThresholds} from the values held by this builder.
         *
         * Values are passed to the record constructor in component order; that
         * constructor replaces unset integer-valued thresholds with their defaults.
         *
         * @return the new record instance
         */
        public OcrQualityThresholds build() {
            final OcrQualityThresholds thresholds = new OcrQualityThresholds(
                minTotalNonWhitespace,
                minNonWhitespacePerPage,
                minMeaningfulWordLen,
                minMeaningfulWords,
                minAlnumRatio,
                minGarbageChars,
                maxFragmentedWordRatio,
                criticalFragmentedWordRatio,
                minAvgWordLength,
                minWordsForAvgLengthCheck,
                minConsecutiveRepeatRatio,
                minWordsForRepeatCheck,
                substantiveMinChars,
                nonTextMinChars,
                alnumWsRatioThreshold,
                pipelineMinQuality
            );
            return thresholds;
        }
    }
    // CPD-ON
}