// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
|
|
|
|
import com.fasterxml.jackson.annotation.JsonProperty;
|
|
import com.fasterxml.jackson.annotation.JsonInclude;
|
|
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
|
|
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
|
|
import org.jspecify.annotations.Nullable;
|
|
|
|
/**
|
|
* Quality thresholds for OCR fallback decisions and pipeline quality gating.
|
|
*
|
|
* All fields default to the values that match the previous hardcoded behavior,
|
|
* so {@code OcrQualityThresholds.default()} preserves existing semantics exactly.
|
|
*/
|
|
@JsonInclude(JsonInclude.Include.NON_ABSENT)
|
|
@JsonDeserialize(builder = OcrQualityThresholds.Builder.class)
|
|
public record OcrQualityThresholds(
|
|
/**
|
|
* Minimum total non-whitespace characters to consider text substantive.
|
|
*/
|
|
@Nullable @JsonProperty("min_total_non_whitespace") Long minTotalNonWhitespace,
|
|
/**
|
|
* Minimum non-whitespace characters per page on average.
|
|
*/
|
|
@Nullable @JsonProperty("min_non_whitespace_per_page") Double minNonWhitespacePerPage,
|
|
/**
|
|
* Minimum character count for a word to be "meaningful".
|
|
*/
|
|
@Nullable @JsonProperty("min_meaningful_word_len") Long minMeaningfulWordLen,
|
|
/**
|
|
* Minimum count of meaningful words before text is accepted.
|
|
*/
|
|
@Nullable @JsonProperty("min_meaningful_words") Long minMeaningfulWords,
|
|
/**
|
|
* Minimum alphanumeric ratio (non-whitespace chars that are alphanumeric).
|
|
*/
|
|
@Nullable @JsonProperty("min_alnum_ratio") Double minAlnumRatio,
|
|
/**
|
|
* Minimum Unicode replacement characters (U+FFFD) to trigger OCR fallback.
|
|
*/
|
|
@Nullable @JsonProperty("min_garbage_chars") Long minGarbageChars,
|
|
/**
|
|
* Maximum fraction of short (1-2 char) words before text is considered fragmented.
|
|
*/
|
|
@Nullable @JsonProperty("max_fragmented_word_ratio") Double maxFragmentedWordRatio,
|
|
/**
|
|
* Critical fragmentation threshold — triggers OCR regardless of meaningful words.
|
|
* Normal English text has ~20-30% short words. 80%+ is definitive garbage.
|
|
*/
|
|
@Nullable @JsonProperty("critical_fragmented_word_ratio") Double criticalFragmentedWordRatio,
|
|
/**
|
|
* Minimum average word length. Below this with enough words indicates garbled extraction.
|
|
*/
|
|
@Nullable @JsonProperty("min_avg_word_length") Double minAvgWordLength,
|
|
/**
|
|
* Minimum word count before average word length check applies.
|
|
*/
|
|
@Nullable @JsonProperty("min_words_for_avg_length_check") Long minWordsForAvgLengthCheck,
|
|
/**
|
|
* Minimum consecutive word repetition ratio to detect column scrambling.
|
|
*/
|
|
@Nullable @JsonProperty("min_consecutive_repeat_ratio") Double minConsecutiveRepeatRatio,
|
|
/**
|
|
* Minimum word count before consecutive repetition check is applied.
|
|
*/
|
|
@Nullable @JsonProperty("min_words_for_repeat_check") Long minWordsForRepeatCheck,
|
|
/**
|
|
* Minimum character count for "substantive markdown" OCR skip gate.
|
|
*/
|
|
@Nullable @JsonProperty("substantive_min_chars") Long substantiveMinChars,
|
|
/**
|
|
* Minimum character count for "non-text content" OCR skip gate.
|
|
*/
|
|
@Nullable @JsonProperty("non_text_min_chars") Long nonTextMinChars,
|
|
/**
|
|
* Alphanumeric+whitespace ratio threshold for skip decisions.
|
|
*/
|
|
@Nullable @JsonProperty("alnum_ws_ratio_threshold") Double alnumWsRatioThreshold,
|
|
/**
|
|
* Minimum quality score (0.0-1.0) for a pipeline stage result to be accepted.
|
|
* If the result from a backend scores below this, try the next backend.
|
|
*/
|
|
@Nullable @JsonProperty("pipeline_min_quality") Double pipelineMinQuality
|
|
) {
|
|
public static Builder builder() {
|
|
return new Builder();
|
|
}
|
|
public OcrQualityThresholds{
|
|
if (minTotalNonWhitespace == null) minTotalNonWhitespace = 64L;
|
|
if (minMeaningfulWordLen == null) minMeaningfulWordLen = 4L;
|
|
if (minMeaningfulWords == null) minMeaningfulWords = 3L;
|
|
if (minGarbageChars == null) minGarbageChars = 5L;
|
|
if (minWordsForAvgLengthCheck == null) minWordsForAvgLengthCheck = 50L;
|
|
if (minWordsForRepeatCheck == null) minWordsForRepeatCheck = 50L;
|
|
if (substantiveMinChars == null) substantiveMinChars = 100L;
|
|
if (nonTextMinChars == null) nonTextMinChars = 20L;
|
|
}
|
|
|
|
// CPD-OFF
|
|
@JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
|
|
public static final class Builder {
|
|
|
|
@JsonProperty("min_total_non_whitespace")
|
|
private Long minTotalNonWhitespace = null;
|
|
@JsonProperty("min_non_whitespace_per_page")
|
|
private Double minNonWhitespacePerPage = null;
|
|
@JsonProperty("min_meaningful_word_len")
|
|
private Long minMeaningfulWordLen = null;
|
|
@JsonProperty("min_meaningful_words")
|
|
private Long minMeaningfulWords = null;
|
|
@JsonProperty("min_alnum_ratio")
|
|
private Double minAlnumRatio = null;
|
|
@JsonProperty("min_garbage_chars")
|
|
private Long minGarbageChars = null;
|
|
@JsonProperty("max_fragmented_word_ratio")
|
|
private Double maxFragmentedWordRatio = null;
|
|
@JsonProperty("critical_fragmented_word_ratio")
|
|
private Double criticalFragmentedWordRatio = null;
|
|
@JsonProperty("min_avg_word_length")
|
|
private Double minAvgWordLength = null;
|
|
@JsonProperty("min_words_for_avg_length_check")
|
|
private Long minWordsForAvgLengthCheck = null;
|
|
@JsonProperty("min_consecutive_repeat_ratio")
|
|
private Double minConsecutiveRepeatRatio = null;
|
|
@JsonProperty("min_words_for_repeat_check")
|
|
private Long minWordsForRepeatCheck = null;
|
|
@JsonProperty("substantive_min_chars")
|
|
private Long substantiveMinChars = null;
|
|
@JsonProperty("non_text_min_chars")
|
|
private Long nonTextMinChars = null;
|
|
@JsonProperty("alnum_ws_ratio_threshold")
|
|
private Double alnumWsRatioThreshold = null;
|
|
@JsonProperty("pipeline_min_quality")
|
|
private Double pipelineMinQuality = null;
|
|
|
|
/** Sets the minTotalNonWhitespace field. */
|
|
@JsonProperty("min_total_non_whitespace")
|
|
public Builder withMinTotalNonWhitespace(final @Nullable Long value) {
|
|
this.minTotalNonWhitespace = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the minNonWhitespacePerPage field. */
|
|
@JsonProperty("min_non_whitespace_per_page")
|
|
public Builder withMinNonWhitespacePerPage(final @Nullable Double value) {
|
|
this.minNonWhitespacePerPage = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the minMeaningfulWordLen field. */
|
|
@JsonProperty("min_meaningful_word_len")
|
|
public Builder withMinMeaningfulWordLen(final @Nullable Long value) {
|
|
this.minMeaningfulWordLen = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the minMeaningfulWords field. */
|
|
@JsonProperty("min_meaningful_words")
|
|
public Builder withMinMeaningfulWords(final @Nullable Long value) {
|
|
this.minMeaningfulWords = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the minAlnumRatio field. */
|
|
@JsonProperty("min_alnum_ratio")
|
|
public Builder withMinAlnumRatio(final @Nullable Double value) {
|
|
this.minAlnumRatio = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the minGarbageChars field. */
|
|
@JsonProperty("min_garbage_chars")
|
|
public Builder withMinGarbageChars(final @Nullable Long value) {
|
|
this.minGarbageChars = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the maxFragmentedWordRatio field. */
|
|
@JsonProperty("max_fragmented_word_ratio")
|
|
public Builder withMaxFragmentedWordRatio(final @Nullable Double value) {
|
|
this.maxFragmentedWordRatio = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the criticalFragmentedWordRatio field. */
|
|
@JsonProperty("critical_fragmented_word_ratio")
|
|
public Builder withCriticalFragmentedWordRatio(final @Nullable Double value) {
|
|
this.criticalFragmentedWordRatio = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the minAvgWordLength field. */
|
|
@JsonProperty("min_avg_word_length")
|
|
public Builder withMinAvgWordLength(final @Nullable Double value) {
|
|
this.minAvgWordLength = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the minWordsForAvgLengthCheck field. */
|
|
@JsonProperty("min_words_for_avg_length_check")
|
|
public Builder withMinWordsForAvgLengthCheck(final @Nullable Long value) {
|
|
this.minWordsForAvgLengthCheck = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the minConsecutiveRepeatRatio field. */
|
|
@JsonProperty("min_consecutive_repeat_ratio")
|
|
public Builder withMinConsecutiveRepeatRatio(final @Nullable Double value) {
|
|
this.minConsecutiveRepeatRatio = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the minWordsForRepeatCheck field. */
|
|
@JsonProperty("min_words_for_repeat_check")
|
|
public Builder withMinWordsForRepeatCheck(final @Nullable Long value) {
|
|
this.minWordsForRepeatCheck = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the substantiveMinChars field. */
|
|
@JsonProperty("substantive_min_chars")
|
|
public Builder withSubstantiveMinChars(final @Nullable Long value) {
|
|
this.substantiveMinChars = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the nonTextMinChars field. */
|
|
@JsonProperty("non_text_min_chars")
|
|
public Builder withNonTextMinChars(final @Nullable Long value) {
|
|
this.nonTextMinChars = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the alnumWsRatioThreshold field. */
|
|
@JsonProperty("alnum_ws_ratio_threshold")
|
|
public Builder withAlnumWsRatioThreshold(final @Nullable Double value) {
|
|
this.alnumWsRatioThreshold = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the pipelineMinQuality field. */
|
|
@JsonProperty("pipeline_min_quality")
|
|
public Builder withPipelineMinQuality(final @Nullable Double value) {
|
|
this.pipelineMinQuality = value;
|
|
return this;
|
|
}
|
|
|
|
/** Builds the OcrQualityThresholds instance. */
|
|
public OcrQualityThresholds build() {
|
|
return new OcrQualityThresholds(
|
|
minTotalNonWhitespace,
|
|
minNonWhitespacePerPage,
|
|
minMeaningfulWordLen,
|
|
minMeaningfulWords,
|
|
minAlnumRatio,
|
|
minGarbageChars,
|
|
maxFragmentedWordRatio,
|
|
criticalFragmentedWordRatio,
|
|
minAvgWordLength,
|
|
minWordsForAvgLengthCheck,
|
|
minConsecutiveRepeatRatio,
|
|
minWordsForRepeatCheck,
|
|
substantiveMinChars,
|
|
nonTextMinChars,
|
|
alnumWsRatioThreshold,
|
|
pipelineMinQuality
|
|
);
|
|
}
|
|
}
|
|
// CPD-ON
|
|
public static OcrQualityThresholds defaultInstance() {
|
|
throw new UnsupportedOperationException("defaultInstance is not yet bridged via JNI; use the Builder instead.");
|
|
}
|
|
}
|