199 lines
7.4 KiB
Java
199 lines
7.4 KiB
Java
|
|
// This file is auto-generated by alef — DO NOT EDIT.
|
|||
|
|
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
|||
|
|
// To regenerate: alef generate
|
|||
|
|
// To verify freshness: alef verify --exit-code
|
|||
|
|
// Issues & docs: https://github.com/kreuzberg-dev/alef
|
|||
|
|
package dev.kreuzberg;
|
|||
|
|
|
|||
|
|
import java.util.List;
|
|||
|
|
import com.fasterxml.jackson.annotation.JsonProperty;
|
|||
|
|
import com.fasterxml.jackson.annotation.JsonInclude;
|
|||
|
|
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
|
|||
|
|
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
|
|||
|
|
import org.jspecify.annotations.Nullable;
|
|||
|
|
|
|||
|
|
/**
|
|||
|
|
* PDF-specific configuration.
|
|||
|
|
*/
|
|||
|
|
@JsonInclude(JsonInclude.Include.NON_ABSENT)
|
|||
|
|
@JsonDeserialize(builder = PdfConfig.Builder.class)
|
|||
|
|
public record PdfConfig(
|
|||
|
|
/**
|
|||
|
|
* Extract images from PDF
|
|||
|
|
*/
|
|||
|
|
@Nullable @JsonProperty("extract_images") Boolean extractImages,
|
|||
|
|
/**
|
|||
|
|
* Extract tables from PDF.
|
|||
|
|
*
|
|||
|
|
* When {@code true} (default), runs pdf_oxide's native grid detector and, if it
|
|||
|
|
* finds nothing, falls back to the heuristic text-layer reconstruction in
|
|||
|
|
* {@code pdf.oxide.table.extract_tables_heuristic}. Set to {@code false} to skip
|
|||
|
|
* both passes — {@code tables} will then be empty in the result.
|
|||
|
|
*/
|
|||
|
|
@Nullable @JsonProperty("extract_tables") Boolean extractTables,
|
|||
|
|
/**
|
|||
|
|
* List of passwords to try when opening encrypted PDFs
|
|||
|
|
*/
|
|||
|
|
@Nullable @JsonProperty("passwords") List<String> passwords,
|
|||
|
|
/**
|
|||
|
|
* Extract PDF metadata
|
|||
|
|
*/
|
|||
|
|
@Nullable @JsonProperty("extract_metadata") Boolean extractMetadata,
|
|||
|
|
/**
|
|||
|
|
* Hierarchy extraction configuration (null = hierarchy extraction disabled)
|
|||
|
|
*/
|
|||
|
|
@Nullable @JsonProperty("hierarchy") HierarchyConfig hierarchy,
|
|||
|
|
/**
|
|||
|
|
* Extract PDF annotations (text notes, highlights, links, stamps).
|
|||
|
|
* Default: false
|
|||
|
|
*/
|
|||
|
|
@Nullable @JsonProperty("extract_annotations") Boolean extractAnnotations,
|
|||
|
|
/**
|
|||
|
|
* Top margin fraction (0.0–1.0) of page height to exclude headers/running heads.
|
|||
|
|
* Default: 0.06 (6%)
|
|||
|
|
*/
|
|||
|
|
@Nullable @JsonProperty("top_margin_fraction") Float topMarginFraction,
|
|||
|
|
/**
|
|||
|
|
* Bottom margin fraction (0.0–1.0) of page height to exclude footers/page numbers.
|
|||
|
|
* Default: 0.05 (5%)
|
|||
|
|
*/
|
|||
|
|
@Nullable @JsonProperty("bottom_margin_fraction") Float bottomMarginFraction,
|
|||
|
|
/**
|
|||
|
|
* Allow single-column pseudo tables in extraction results.
|
|||
|
|
*
|
|||
|
|
* By default, tables with fewer than 2 columns (layout-guided) or 3 columns
|
|||
|
|
* (heuristic) are rejected. When {@code true}, the minimum column count is relaxed
|
|||
|
|
* to 1, allowing single-column structured data (glossaries, itemized lists)
|
|||
|
|
* to be emitted as tables. Other quality filters (density, sparsity, prose
|
|||
|
|
* detection) still apply.
|
|||
|
|
*/
|
|||
|
|
@Nullable @JsonProperty("allow_single_column_tables") Boolean allowSingleColumnTables,
|
|||
|
|
/**
|
|||
|
|
* Perform OCR on inline images extracted from PDF pages and attach the
|
|||
|
|
* recognized text to each {@code ExtractedImage.ocr_result}. Requires Tesseract
|
|||
|
|
* to be available; if {@code ExtractionConfig.ocr} is {@code None} the extractor
|
|||
|
|
* falls back to {@code TesseractConfig.default()}. Per-image failures degrade
|
|||
|
|
* gracefully (the image is returned without OCR text rather than failing
|
|||
|
|
* the whole extraction). Default: {@code false}.
|
|||
|
|
*/
|
|||
|
|
@Nullable @JsonProperty("ocr_inline_images") Boolean ocrInlineImages
|
|||
|
|
) {
|
|||
|
|
public static Builder builder() {
|
|||
|
|
return new Builder();
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// CPD-OFF
|
|||
|
|
@JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
|
|||
|
|
public static final class Builder {
|
|||
|
|
|
|||
|
|
@JsonProperty("extract_images")
|
|||
|
|
private Boolean extractImages = null;
|
|||
|
|
@JsonProperty("extract_tables")
|
|||
|
|
private Boolean extractTables = null;
|
|||
|
|
private List<String> passwords = null;
|
|||
|
|
@JsonProperty("extract_metadata")
|
|||
|
|
private Boolean extractMetadata = null;
|
|||
|
|
@Nullable private HierarchyConfig hierarchy = null;
|
|||
|
|
@JsonProperty("extract_annotations")
|
|||
|
|
private Boolean extractAnnotations = null;
|
|||
|
|
@JsonProperty("top_margin_fraction")
|
|||
|
|
private Float topMarginFraction = null;
|
|||
|
|
@JsonProperty("bottom_margin_fraction")
|
|||
|
|
private Float bottomMarginFraction = null;
|
|||
|
|
@JsonProperty("allow_single_column_tables")
|
|||
|
|
private Boolean allowSingleColumnTables = null;
|
|||
|
|
@JsonProperty("ocr_inline_images")
|
|||
|
|
private Boolean ocrInlineImages = null;
|
|||
|
|
|
|||
|
|
/** Sets the extractImages field. */
|
|||
|
|
@JsonProperty("extract_images")
|
|||
|
|
public Builder withExtractImages(final @Nullable Boolean value) {
|
|||
|
|
this.extractImages = value;
|
|||
|
|
return this;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/** Sets the extractTables field. */
|
|||
|
|
@JsonProperty("extract_tables")
|
|||
|
|
public Builder withExtractTables(final @Nullable Boolean value) {
|
|||
|
|
this.extractTables = value;
|
|||
|
|
return this;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/** Sets the passwords field. */
|
|||
|
|
@JsonProperty("passwords")
|
|||
|
|
public Builder withPasswords(final @Nullable List<String> value) {
|
|||
|
|
this.passwords = value;
|
|||
|
|
return this;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/** Sets the extractMetadata field. */
|
|||
|
|
@JsonProperty("extract_metadata")
|
|||
|
|
public Builder withExtractMetadata(final @Nullable Boolean value) {
|
|||
|
|
this.extractMetadata = value;
|
|||
|
|
return this;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/** Sets the hierarchy field. */
|
|||
|
|
@JsonProperty("hierarchy")
|
|||
|
|
public Builder withHierarchy(final @Nullable HierarchyConfig value) {
|
|||
|
|
this.hierarchy = value;
|
|||
|
|
return this;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/** Sets the extractAnnotations field. */
|
|||
|
|
@JsonProperty("extract_annotations")
|
|||
|
|
public Builder withExtractAnnotations(final @Nullable Boolean value) {
|
|||
|
|
this.extractAnnotations = value;
|
|||
|
|
return this;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/** Sets the topMarginFraction field. */
|
|||
|
|
@JsonProperty("top_margin_fraction")
|
|||
|
|
public Builder withTopMarginFraction(final @Nullable Float value) {
|
|||
|
|
this.topMarginFraction = value;
|
|||
|
|
return this;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/** Sets the bottomMarginFraction field. */
|
|||
|
|
@JsonProperty("bottom_margin_fraction")
|
|||
|
|
public Builder withBottomMarginFraction(final @Nullable Float value) {
|
|||
|
|
this.bottomMarginFraction = value;
|
|||
|
|
return this;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/** Sets the allowSingleColumnTables field. */
|
|||
|
|
@JsonProperty("allow_single_column_tables")
|
|||
|
|
public Builder withAllowSingleColumnTables(final @Nullable Boolean value) {
|
|||
|
|
this.allowSingleColumnTables = value;
|
|||
|
|
return this;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/** Sets the ocrInlineImages field. */
|
|||
|
|
@JsonProperty("ocr_inline_images")
|
|||
|
|
public Builder withOcrInlineImages(final @Nullable Boolean value) {
|
|||
|
|
this.ocrInlineImages = value;
|
|||
|
|
return this;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/** Builds the PdfConfig instance. */
|
|||
|
|
public PdfConfig build() {
|
|||
|
|
return new PdfConfig(
|
|||
|
|
extractImages,
|
|||
|
|
extractTables,
|
|||
|
|
passwords,
|
|||
|
|
extractMetadata,
|
|||
|
|
hierarchy,
|
|||
|
|
extractAnnotations,
|
|||
|
|
topMarginFraction,
|
|||
|
|
bottomMarginFraction,
|
|||
|
|
allowSingleColumnTables,
|
|||
|
|
ocrInlineImages
|
|||
|
|
);
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
// CPD-ON
|
|||
|
|
public static PdfConfig defaultInstance() {
|
|||
|
|
throw new UnsupportedOperationException("defaultInstance is not yet bridged via JNI; use the Builder instead.");
|
|||
|
|
}
|
|||
|
|
}
|