Files
fil/packages/java/dev/kreuzberg/PdfConfig.java

199 lines
7.4 KiB
Java
Raw Normal View History

2026-06-01 23:40:55 +02:00
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import java.util.List;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
import org.jspecify.annotations.Nullable;
/**
* PDF-specific configuration.
*/
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = PdfConfig.Builder.class)
public record PdfConfig(
/**
* Extract images from PDF
*/
@Nullable @JsonProperty("extract_images") Boolean extractImages,
/**
* Extract tables from PDF.
*
* When {@code true} (default), runs pdf_oxide's native grid detector and, if it
* finds nothing, falls back to the heuristic text-layer reconstruction in
* {@code pdf.oxide.table.extract_tables_heuristic}. Set to {@code false} to skip
* both passes {@code tables} will then be empty in the result.
*/
@Nullable @JsonProperty("extract_tables") Boolean extractTables,
/**
* List of passwords to try when opening encrypted PDFs
*/
@Nullable @JsonProperty("passwords") List<String> passwords,
/**
* Extract PDF metadata
*/
@Nullable @JsonProperty("extract_metadata") Boolean extractMetadata,
/**
* Hierarchy extraction configuration (null = hierarchy extraction disabled)
*/
@Nullable @JsonProperty("hierarchy") HierarchyConfig hierarchy,
/**
* Extract PDF annotations (text notes, highlights, links, stamps).
* Default: false
*/
@Nullable @JsonProperty("extract_annotations") Boolean extractAnnotations,
/**
* Top margin fraction (0.01.0) of page height to exclude headers/running heads.
* Default: 0.06 (6%)
*/
@Nullable @JsonProperty("top_margin_fraction") Float topMarginFraction,
/**
* Bottom margin fraction (0.01.0) of page height to exclude footers/page numbers.
* Default: 0.05 (5%)
*/
@Nullable @JsonProperty("bottom_margin_fraction") Float bottomMarginFraction,
/**
* Allow single-column pseudo tables in extraction results.
*
* By default, tables with fewer than 2 columns (layout-guided) or 3 columns
* (heuristic) are rejected. When {@code true}, the minimum column count is relaxed
* to 1, allowing single-column structured data (glossaries, itemized lists)
* to be emitted as tables. Other quality filters (density, sparsity, prose
* detection) still apply.
*/
@Nullable @JsonProperty("allow_single_column_tables") Boolean allowSingleColumnTables,
/**
* Perform OCR on inline images extracted from PDF pages and attach the
* recognized text to each {@code ExtractedImage.ocr_result}. Requires Tesseract
* to be available; if {@code ExtractionConfig.ocr} is {@code None} the extractor
* falls back to {@code TesseractConfig.default()}. Per-image failures degrade
* gracefully (the image is returned without OCR text rather than failing
* the whole extraction). Default: {@code false}.
*/
@Nullable @JsonProperty("ocr_inline_images") Boolean ocrInlineImages
) {
public static Builder builder() {
return new Builder();
}
// CPD-OFF
@JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
public static final class Builder {
@JsonProperty("extract_images")
private Boolean extractImages = null;
@JsonProperty("extract_tables")
private Boolean extractTables = null;
private List<String> passwords = null;
@JsonProperty("extract_metadata")
private Boolean extractMetadata = null;
@Nullable private HierarchyConfig hierarchy = null;
@JsonProperty("extract_annotations")
private Boolean extractAnnotations = null;
@JsonProperty("top_margin_fraction")
private Float topMarginFraction = null;
@JsonProperty("bottom_margin_fraction")
private Float bottomMarginFraction = null;
@JsonProperty("allow_single_column_tables")
private Boolean allowSingleColumnTables = null;
@JsonProperty("ocr_inline_images")
private Boolean ocrInlineImages = null;
/** Sets the extractImages field. */
@JsonProperty("extract_images")
public Builder withExtractImages(final @Nullable Boolean value) {
this.extractImages = value;
return this;
}
/** Sets the extractTables field. */
@JsonProperty("extract_tables")
public Builder withExtractTables(final @Nullable Boolean value) {
this.extractTables = value;
return this;
}
/** Sets the passwords field. */
@JsonProperty("passwords")
public Builder withPasswords(final @Nullable List<String> value) {
this.passwords = value;
return this;
}
/** Sets the extractMetadata field. */
@JsonProperty("extract_metadata")
public Builder withExtractMetadata(final @Nullable Boolean value) {
this.extractMetadata = value;
return this;
}
/** Sets the hierarchy field. */
@JsonProperty("hierarchy")
public Builder withHierarchy(final @Nullable HierarchyConfig value) {
this.hierarchy = value;
return this;
}
/** Sets the extractAnnotations field. */
@JsonProperty("extract_annotations")
public Builder withExtractAnnotations(final @Nullable Boolean value) {
this.extractAnnotations = value;
return this;
}
/** Sets the topMarginFraction field. */
@JsonProperty("top_margin_fraction")
public Builder withTopMarginFraction(final @Nullable Float value) {
this.topMarginFraction = value;
return this;
}
/** Sets the bottomMarginFraction field. */
@JsonProperty("bottom_margin_fraction")
public Builder withBottomMarginFraction(final @Nullable Float value) {
this.bottomMarginFraction = value;
return this;
}
/** Sets the allowSingleColumnTables field. */
@JsonProperty("allow_single_column_tables")
public Builder withAllowSingleColumnTables(final @Nullable Boolean value) {
this.allowSingleColumnTables = value;
return this;
}
/** Sets the ocrInlineImages field. */
@JsonProperty("ocr_inline_images")
public Builder withOcrInlineImages(final @Nullable Boolean value) {
this.ocrInlineImages = value;
return this;
}
/** Builds the PdfConfig instance. */
public PdfConfig build() {
return new PdfConfig(
extractImages,
extractTables,
passwords,
extractMetadata,
hierarchy,
extractAnnotations,
topMarginFraction,
bottomMarginFraction,
allowSingleColumnTables,
ocrInlineImages
);
}
}
// CPD-ON
public static PdfConfig defaultInstance() {
throw new UnsupportedOperationException("defaultInstance is not yet bridged via JNI; use the Builder instead.");
}
}