110 lines
3.8 KiB
Java
Generated
110 lines
3.8 KiB
Java
Generated
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
|
|
package dev.kreuzberg;
|
|
|
|
import com.fasterxml.jackson.annotation.JsonProperty;
|
|
import com.fasterxml.jackson.annotation.JsonInclude;
|
|
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
|
|
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
|
|
import org.jspecify.annotations.Nullable;
|
|
|
|
/**
|
|
* Hierarchy extraction configuration for PDF text structure analysis.
|
|
*
|
|
* Enables extraction of document hierarchy levels (H1-H6) based on font size
|
|
* clustering and semantic analysis. When enabled, hierarchical blocks are
|
|
* included in page content.
|
|
*/
|
|
@JsonInclude(JsonInclude.Include.NON_ABSENT)
|
|
@JsonDeserialize(builder = HierarchyConfig.Builder.class)
|
|
public record HierarchyConfig(
|
|
/**
|
|
* Enable hierarchy extraction
|
|
*/
|
|
@Nullable @JsonProperty("enabled") Boolean enabled,
|
|
/**
|
|
* Number of font size clusters to use for hierarchy levels (1-7)
|
|
*
|
|
* Default: 6, which provides H1-H6 heading levels with body text.
|
|
* Larger values create more fine-grained hierarchy levels.
|
|
*/
|
|
@Nullable @JsonProperty("k_clusters") Long kClusters,
|
|
/**
|
|
* Include bounding box information in hierarchy blocks
|
|
*/
|
|
@Nullable @JsonProperty("include_bbox") Boolean includeBbox,
|
|
/**
|
|
* OCR coverage threshold for smart OCR triggering (0.0-1.0)
|
|
*
|
|
* Determines when OCR should be triggered based on text block coverage.
|
|
* OCR is triggered when text blocks cover less than this fraction of the page.
|
|
* Default: 0.5 (trigger OCR if less than 50% of page has text)
|
|
*/
|
|
@Nullable @JsonProperty("ocr_coverage_threshold") Float ocrCoverageThreshold
|
|
) {
|
|
public static Builder builder() {
|
|
return new Builder();
|
|
}
|
|
public HierarchyConfig{
|
|
if (kClusters == null) kClusters = 3L;
|
|
}
|
|
|
|
// CPD-OFF
|
|
@JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
|
|
public static final class Builder {
|
|
|
|
private Boolean enabled = null;
|
|
@JsonProperty("k_clusters")
|
|
private Long kClusters = null;
|
|
@JsonProperty("include_bbox")
|
|
private Boolean includeBbox = null;
|
|
@JsonProperty("ocr_coverage_threshold")
|
|
private Float ocrCoverageThreshold = null;
|
|
|
|
/** Sets the enabled field. */
|
|
@JsonProperty("enabled")
|
|
public Builder withEnabled(final @Nullable Boolean value) {
|
|
this.enabled = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the kClusters field. */
|
|
@JsonProperty("k_clusters")
|
|
public Builder withKClusters(final @Nullable Long value) {
|
|
this.kClusters = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the includeBbox field. */
|
|
@JsonProperty("include_bbox")
|
|
public Builder withIncludeBbox(final @Nullable Boolean value) {
|
|
this.includeBbox = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the ocrCoverageThreshold field. */
|
|
@JsonProperty("ocr_coverage_threshold")
|
|
public Builder withOcrCoverageThreshold(final @Nullable Float value) {
|
|
this.ocrCoverageThreshold = value;
|
|
return this;
|
|
}
|
|
|
|
/** Builds the HierarchyConfig instance. */
|
|
public HierarchyConfig build() {
|
|
return new HierarchyConfig(
|
|
enabled,
|
|
kClusters,
|
|
includeBbox,
|
|
ocrCoverageThreshold
|
|
);
|
|
}
|
|
}
|
|
// CPD-ON
|
|
public static HierarchyConfig defaultInstance() {
|
|
throw new UnsupportedOperationException("defaultInstance is not yet bridged via JNI; use the Builder instead.");
|
|
}
|
|
}
|