// This file is auto-generated by alef — DO NOT EDIT. // alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75 // To regenerate: alef generate // To verify freshness: alef verify --exit-code // Issues & docs: https://github.com/kreuzberg-dev/alef package dev.kreuzberg; import com.fasterxml.jackson.annotation.JsonProperty; import com.fasterxml.jackson.annotation.JsonInclude; import com.fasterxml.jackson.databind.annotation.JsonDeserialize; import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder; import org.jspecify.annotations.Nullable; /** * Quality thresholds for OCR fallback decisions and pipeline quality gating. * * All fields default to the values that match the previous hardcoded behavior, * so {@code OcrQualityThresholds.default()} preserves existing semantics exactly. */ @JsonInclude(JsonInclude.Include.NON_ABSENT) @JsonDeserialize(builder = OcrQualityThresholds.Builder.class) public record OcrQualityThresholds( /** * Minimum total non-whitespace characters to consider text substantive. */ @Nullable @JsonProperty("min_total_non_whitespace") Long minTotalNonWhitespace, /** * Minimum non-whitespace characters per page on average. */ @Nullable @JsonProperty("min_non_whitespace_per_page") Double minNonWhitespacePerPage, /** * Minimum character count for a word to be "meaningful". */ @Nullable @JsonProperty("min_meaningful_word_len") Long minMeaningfulWordLen, /** * Minimum count of meaningful words before text is accepted. */ @Nullable @JsonProperty("min_meaningful_words") Long minMeaningfulWords, /** * Minimum alphanumeric ratio (non-whitespace chars that are alphanumeric). */ @Nullable @JsonProperty("min_alnum_ratio") Double minAlnumRatio, /** * Minimum Unicode replacement characters (U+FFFD) to trigger OCR fallback. */ @Nullable @JsonProperty("min_garbage_chars") Long minGarbageChars, /** * Maximum fraction of short (1-2 char) words before text is considered fragmented. */ @Nullable @JsonProperty("max_fragmented_word_ratio") Double maxFragmentedWordRatio, /** * Critical fragmentation threshold — triggers OCR regardless of meaningful words. * Normal English text has ~20-30% short words. 80%+ is definitive garbage. */ @Nullable @JsonProperty("critical_fragmented_word_ratio") Double criticalFragmentedWordRatio, /** * Minimum average word length. Below this with enough words indicates garbled extraction. */ @Nullable @JsonProperty("min_avg_word_length") Double minAvgWordLength, /** * Minimum word count before average word length check applies. */ @Nullable @JsonProperty("min_words_for_avg_length_check") Long minWordsForAvgLengthCheck, /** * Minimum consecutive word repetition ratio to detect column scrambling. */ @Nullable @JsonProperty("min_consecutive_repeat_ratio") Double minConsecutiveRepeatRatio, /** * Minimum word count before consecutive repetition check is applied. */ @Nullable @JsonProperty("min_words_for_repeat_check") Long minWordsForRepeatCheck, /** * Minimum character count for "substantive markdown" OCR skip gate. */ @Nullable @JsonProperty("substantive_min_chars") Long substantiveMinChars, /** * Minimum character count for "non-text content" OCR skip gate. */ @Nullable @JsonProperty("non_text_min_chars") Long nonTextMinChars, /** * Alphanumeric+whitespace ratio threshold for skip decisions. */ @Nullable @JsonProperty("alnum_ws_ratio_threshold") Double alnumWsRatioThreshold, /** * Minimum quality score (0.0-1.0) for a pipeline stage result to be accepted. * If the result from a backend scores below this, try the next backend. */ @Nullable @JsonProperty("pipeline_min_quality") Double pipelineMinQuality ) { public static Builder builder() { return new Builder(); } public OcrQualityThresholds{ if (minTotalNonWhitespace == null) minTotalNonWhitespace = 64L; if (minMeaningfulWordLen == null) minMeaningfulWordLen = 4L; if (minMeaningfulWords == null) minMeaningfulWords = 3L; if (minGarbageChars == null) minGarbageChars = 5L; if (minWordsForAvgLengthCheck == null) minWordsForAvgLengthCheck = 50L; if (minWordsForRepeatCheck == null) minWordsForRepeatCheck = 50L; if (substantiveMinChars == null) substantiveMinChars = 100L; if (nonTextMinChars == null) nonTextMinChars = 20L; } // CPD-OFF @JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build") public static final class Builder { @JsonProperty("min_total_non_whitespace") private Long minTotalNonWhitespace = null; @JsonProperty("min_non_whitespace_per_page") private Double minNonWhitespacePerPage = null; @JsonProperty("min_meaningful_word_len") private Long minMeaningfulWordLen = null; @JsonProperty("min_meaningful_words") private Long minMeaningfulWords = null; @JsonProperty("min_alnum_ratio") private Double minAlnumRatio = null; @JsonProperty("min_garbage_chars") private Long minGarbageChars = null; @JsonProperty("max_fragmented_word_ratio") private Double maxFragmentedWordRatio = null; @JsonProperty("critical_fragmented_word_ratio") private Double criticalFragmentedWordRatio = null; @JsonProperty("min_avg_word_length") private Double minAvgWordLength = null; @JsonProperty("min_words_for_avg_length_check") private Long minWordsForAvgLengthCheck = null; @JsonProperty("min_consecutive_repeat_ratio") private Double minConsecutiveRepeatRatio = null; @JsonProperty("min_words_for_repeat_check") private Long minWordsForRepeatCheck = null; @JsonProperty("substantive_min_chars") private Long substantiveMinChars = null; @JsonProperty("non_text_min_chars") private Long nonTextMinChars = null; @JsonProperty("alnum_ws_ratio_threshold") private Double alnumWsRatioThreshold = null; @JsonProperty("pipeline_min_quality") private Double pipelineMinQuality = null; /** Sets the minTotalNonWhitespace field. */ @JsonProperty("min_total_non_whitespace") public Builder withMinTotalNonWhitespace(final @Nullable Long value) { this.minTotalNonWhitespace = value; return this; } /** Sets the minNonWhitespacePerPage field. */ @JsonProperty("min_non_whitespace_per_page") public Builder withMinNonWhitespacePerPage(final @Nullable Double value) { this.minNonWhitespacePerPage = value; return this; } /** Sets the minMeaningfulWordLen field. */ @JsonProperty("min_meaningful_word_len") public Builder withMinMeaningfulWordLen(final @Nullable Long value) { this.minMeaningfulWordLen = value; return this; } /** Sets the minMeaningfulWords field. */ @JsonProperty("min_meaningful_words") public Builder withMinMeaningfulWords(final @Nullable Long value) { this.minMeaningfulWords = value; return this; } /** Sets the minAlnumRatio field. */ @JsonProperty("min_alnum_ratio") public Builder withMinAlnumRatio(final @Nullable Double value) { this.minAlnumRatio = value; return this; } /** Sets the minGarbageChars field. */ @JsonProperty("min_garbage_chars") public Builder withMinGarbageChars(final @Nullable Long value) { this.minGarbageChars = value; return this; } /** Sets the maxFragmentedWordRatio field. */ @JsonProperty("max_fragmented_word_ratio") public Builder withMaxFragmentedWordRatio(final @Nullable Double value) { this.maxFragmentedWordRatio = value; return this; } /** Sets the criticalFragmentedWordRatio field. */ @JsonProperty("critical_fragmented_word_ratio") public Builder withCriticalFragmentedWordRatio(final @Nullable Double value) { this.criticalFragmentedWordRatio = value; return this; } /** Sets the minAvgWordLength field. */ @JsonProperty("min_avg_word_length") public Builder withMinAvgWordLength(final @Nullable Double value) { this.minAvgWordLength = value; return this; } /** Sets the minWordsForAvgLengthCheck field. */ @JsonProperty("min_words_for_avg_length_check") public Builder withMinWordsForAvgLengthCheck(final @Nullable Long value) { this.minWordsForAvgLengthCheck = value; return this; } /** Sets the minConsecutiveRepeatRatio field. */ @JsonProperty("min_consecutive_repeat_ratio") public Builder withMinConsecutiveRepeatRatio(final @Nullable Double value) { this.minConsecutiveRepeatRatio = value; return this; } /** Sets the minWordsForRepeatCheck field. */ @JsonProperty("min_words_for_repeat_check") public Builder withMinWordsForRepeatCheck(final @Nullable Long value) { this.minWordsForRepeatCheck = value; return this; } /** Sets the substantiveMinChars field. */ @JsonProperty("substantive_min_chars") public Builder withSubstantiveMinChars(final @Nullable Long value) { this.substantiveMinChars = value; return this; } /** Sets the nonTextMinChars field. */ @JsonProperty("non_text_min_chars") public Builder withNonTextMinChars(final @Nullable Long value) { this.nonTextMinChars = value; return this; } /** Sets the alnumWsRatioThreshold field. */ @JsonProperty("alnum_ws_ratio_threshold") public Builder withAlnumWsRatioThreshold(final @Nullable Double value) { this.alnumWsRatioThreshold = value; return this; } /** Sets the pipelineMinQuality field. */ @JsonProperty("pipeline_min_quality") public Builder withPipelineMinQuality(final @Nullable Double value) { this.pipelineMinQuality = value; return this; } /** Builds the OcrQualityThresholds instance. */ public OcrQualityThresholds build() { return new OcrQualityThresholds( minTotalNonWhitespace, minNonWhitespacePerPage, minMeaningfulWordLen, minMeaningfulWords, minAlnumRatio, minGarbageChars, maxFragmentedWordRatio, criticalFragmentedWordRatio, minAvgWordLength, minWordsForAvgLengthCheck, minConsecutiveRepeatRatio, minWordsForRepeatCheck, substantiveMinChars, nonTextMinChars, alnumWsRatioThreshold, pipelineMinQuality ); } } // CPD-ON public static OcrQualityThresholds defaultInstance() { throw new UnsupportedOperationException("defaultInstance is not yet bridged via JNI; use the Builder instead."); } }