// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
|
||
|
||
import java.util.List;
|
||
import com.fasterxml.jackson.annotation.JsonProperty;
|
||
import com.fasterxml.jackson.annotation.JsonInclude;
|
||
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
|
||
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
|
||
import org.jspecify.annotations.Nullable;
|
||
|
||
/**
|
||
* PDF-specific configuration.
|
||
*/
|
||
@JsonInclude(JsonInclude.Include.NON_ABSENT)
|
||
@JsonDeserialize(builder = PdfConfig.Builder.class)
|
||
public record PdfConfig(
|
||
/**
|
||
* Extract images from PDF
|
||
*/
|
||
@Nullable @JsonProperty("extract_images") Boolean extractImages,
|
||
/**
|
||
* Extract tables from PDF.
|
||
*
|
||
* When {@code true} (default), runs pdf_oxide's native grid detector and, if it
|
||
* finds nothing, falls back to the heuristic text-layer reconstruction in
|
||
* {@code pdf.oxide.table.extract_tables_heuristic}. Set to {@code false} to skip
|
||
* both passes — {@code tables} will then be empty in the result.
|
||
*/
|
||
@Nullable @JsonProperty("extract_tables") Boolean extractTables,
|
||
/**
|
||
* List of passwords to try when opening encrypted PDFs
|
||
*/
|
||
@Nullable @JsonProperty("passwords") List<String> passwords,
|
||
/**
|
||
* Extract PDF metadata
|
||
*/
|
||
@Nullable @JsonProperty("extract_metadata") Boolean extractMetadata,
|
||
/**
|
||
* Hierarchy extraction configuration (null = hierarchy extraction disabled)
|
||
*/
|
||
@Nullable @JsonProperty("hierarchy") HierarchyConfig hierarchy,
|
||
/**
|
||
* Extract PDF annotations (text notes, highlights, links, stamps).
|
||
* Default: false
|
||
*/
|
||
@Nullable @JsonProperty("extract_annotations") Boolean extractAnnotations,
|
||
/**
|
||
* Top margin fraction (0.0–1.0) of page height to exclude headers/running heads.
|
||
* Default: 0.06 (6%)
|
||
*/
|
||
@Nullable @JsonProperty("top_margin_fraction") Float topMarginFraction,
|
||
/**
|
||
* Bottom margin fraction (0.0–1.0) of page height to exclude footers/page numbers.
|
||
* Default: 0.05 (5%)
|
||
*/
|
||
@Nullable @JsonProperty("bottom_margin_fraction") Float bottomMarginFraction,
|
||
/**
|
||
* Allow single-column pseudo tables in extraction results.
|
||
*
|
||
* By default, tables with fewer than 2 columns (layout-guided) or 3 columns
|
||
* (heuristic) are rejected. When {@code true}, the minimum column count is relaxed
|
||
* to 1, allowing single-column structured data (glossaries, itemized lists)
|
||
* to be emitted as tables. Other quality filters (density, sparsity, prose
|
||
* detection) still apply.
|
||
*/
|
||
@Nullable @JsonProperty("allow_single_column_tables") Boolean allowSingleColumnTables,
|
||
/**
|
||
* Perform OCR on inline images extracted from PDF pages and attach the
|
||
* recognized text to each {@code ExtractedImage.ocr_result}. Requires Tesseract
|
||
* to be available; if {@code ExtractionConfig.ocr} is {@code None} the extractor
|
||
* falls back to {@code TesseractConfig.default()}. Per-image failures degrade
|
||
* gracefully (the image is returned without OCR text rather than failing
|
||
* the whole extraction). Default: {@code false}.
|
||
*/
|
||
@Nullable @JsonProperty("ocr_inline_images") Boolean ocrInlineImages
|
||
) {
|
||
public static Builder builder() {
|
||
return new Builder();
|
||
}
|
||
|
||
// CPD-OFF
|
||
@JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
|
||
public static final class Builder {
|
||
|
||
@JsonProperty("extract_images")
|
||
private Boolean extractImages = null;
|
||
@JsonProperty("extract_tables")
|
||
private Boolean extractTables = null;
|
||
private List<String> passwords = null;
|
||
@JsonProperty("extract_metadata")
|
||
private Boolean extractMetadata = null;
|
||
@Nullable private HierarchyConfig hierarchy = null;
|
||
@JsonProperty("extract_annotations")
|
||
private Boolean extractAnnotations = null;
|
||
@JsonProperty("top_margin_fraction")
|
||
private Float topMarginFraction = null;
|
||
@JsonProperty("bottom_margin_fraction")
|
||
private Float bottomMarginFraction = null;
|
||
@JsonProperty("allow_single_column_tables")
|
||
private Boolean allowSingleColumnTables = null;
|
||
@JsonProperty("ocr_inline_images")
|
||
private Boolean ocrInlineImages = null;
|
||
|
||
/** Sets the extractImages field. */
|
||
@JsonProperty("extract_images")
|
||
public Builder withExtractImages(final @Nullable Boolean value) {
|
||
this.extractImages = value;
|
||
return this;
|
||
}
|
||
|
||
/** Sets the extractTables field. */
|
||
@JsonProperty("extract_tables")
|
||
public Builder withExtractTables(final @Nullable Boolean value) {
|
||
this.extractTables = value;
|
||
return this;
|
||
}
|
||
|
||
/** Sets the passwords field. */
|
||
@JsonProperty("passwords")
|
||
public Builder withPasswords(final @Nullable List<String> value) {
|
||
this.passwords = value;
|
||
return this;
|
||
}
|
||
|
||
/** Sets the extractMetadata field. */
|
||
@JsonProperty("extract_metadata")
|
||
public Builder withExtractMetadata(final @Nullable Boolean value) {
|
||
this.extractMetadata = value;
|
||
return this;
|
||
}
|
||
|
||
/** Sets the hierarchy field. */
|
||
@JsonProperty("hierarchy")
|
||
public Builder withHierarchy(final @Nullable HierarchyConfig value) {
|
||
this.hierarchy = value;
|
||
return this;
|
||
}
|
||
|
||
/** Sets the extractAnnotations field. */
|
||
@JsonProperty("extract_annotations")
|
||
public Builder withExtractAnnotations(final @Nullable Boolean value) {
|
||
this.extractAnnotations = value;
|
||
return this;
|
||
}
|
||
|
||
/** Sets the topMarginFraction field. */
|
||
@JsonProperty("top_margin_fraction")
|
||
public Builder withTopMarginFraction(final @Nullable Float value) {
|
||
this.topMarginFraction = value;
|
||
return this;
|
||
}
|
||
|
||
/** Sets the bottomMarginFraction field. */
|
||
@JsonProperty("bottom_margin_fraction")
|
||
public Builder withBottomMarginFraction(final @Nullable Float value) {
|
||
this.bottomMarginFraction = value;
|
||
return this;
|
||
}
|
||
|
||
/** Sets the allowSingleColumnTables field. */
|
||
@JsonProperty("allow_single_column_tables")
|
||
public Builder withAllowSingleColumnTables(final @Nullable Boolean value) {
|
||
this.allowSingleColumnTables = value;
|
||
return this;
|
||
}
|
||
|
||
/** Sets the ocrInlineImages field. */
|
||
@JsonProperty("ocr_inline_images")
|
||
public Builder withOcrInlineImages(final @Nullable Boolean value) {
|
||
this.ocrInlineImages = value;
|
||
return this;
|
||
}
|
||
|
||
/** Builds the PdfConfig instance. */
|
||
public PdfConfig build() {
|
||
return new PdfConfig(
|
||
extractImages,
|
||
extractTables,
|
||
passwords,
|
||
extractMetadata,
|
||
hierarchy,
|
||
extractAnnotations,
|
||
topMarginFraction,
|
||
bottomMarginFraction,
|
||
allowSingleColumnTables,
|
||
ocrInlineImages
|
||
);
|
||
}
|
||
}
|
||
// CPD-ON
|
||
public static PdfConfig defaultInstance() {
|
||
throw new UnsupportedOperationException("defaultInstance is not yet bridged via JNI; use the Builder instead.");
|
||
}
|
||
}
|