This commit is contained in:
198
packages/java/dev/kreuzberg/PdfConfig.java
generated
Normal file
198
packages/java/dev/kreuzberg/PdfConfig.java
generated
Normal file
@@ -0,0 +1,198 @@
|
||||
// This file is auto-generated by alef — DO NOT EDIT.
|
||||
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
// To regenerate: alef generate
|
||||
// To verify freshness: alef verify --exit-code
|
||||
// Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
package dev.kreuzberg;
|
||||
|
||||
import java.util.List;
|
||||
import com.fasterxml.jackson.annotation.JsonProperty;
|
||||
import com.fasterxml.jackson.annotation.JsonInclude;
|
||||
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
|
||||
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
|
||||
import org.jspecify.annotations.Nullable;
|
||||
|
||||
/**
|
||||
* PDF-specific configuration.
|
||||
*/
|
||||
@JsonInclude(JsonInclude.Include.NON_ABSENT)
|
||||
@JsonDeserialize(builder = PdfConfig.Builder.class)
|
||||
public record PdfConfig(
|
||||
/**
|
||||
* Extract images from PDF
|
||||
*/
|
||||
@Nullable @JsonProperty("extract_images") Boolean extractImages,
|
||||
/**
|
||||
* Extract tables from PDF.
|
||||
*
|
||||
* When {@code true} (default), runs pdf_oxide's native grid detector and, if it
|
||||
* finds nothing, falls back to the heuristic text-layer reconstruction in
|
||||
* {@code pdf.oxide.table.extract_tables_heuristic}. Set to {@code false} to skip
|
||||
* both passes — {@code tables} will then be empty in the result.
|
||||
*/
|
||||
@Nullable @JsonProperty("extract_tables") Boolean extractTables,
|
||||
/**
|
||||
* List of passwords to try when opening encrypted PDFs
|
||||
*/
|
||||
@Nullable @JsonProperty("passwords") List<String> passwords,
|
||||
/**
|
||||
* Extract PDF metadata
|
||||
*/
|
||||
@Nullable @JsonProperty("extract_metadata") Boolean extractMetadata,
|
||||
/**
|
||||
* Hierarchy extraction configuration (null = hierarchy extraction disabled)
|
||||
*/
|
||||
@Nullable @JsonProperty("hierarchy") HierarchyConfig hierarchy,
|
||||
/**
|
||||
* Extract PDF annotations (text notes, highlights, links, stamps).
|
||||
* Default: false
|
||||
*/
|
||||
@Nullable @JsonProperty("extract_annotations") Boolean extractAnnotations,
|
||||
/**
|
||||
* Top margin fraction (0.0–1.0) of page height to exclude headers/running heads.
|
||||
* Default: 0.06 (6%)
|
||||
*/
|
||||
@Nullable @JsonProperty("top_margin_fraction") Float topMarginFraction,
|
||||
/**
|
||||
* Bottom margin fraction (0.0–1.0) of page height to exclude footers/page numbers.
|
||||
* Default: 0.05 (5%)
|
||||
*/
|
||||
@Nullable @JsonProperty("bottom_margin_fraction") Float bottomMarginFraction,
|
||||
/**
|
||||
* Allow single-column pseudo tables in extraction results.
|
||||
*
|
||||
* By default, tables with fewer than 2 columns (layout-guided) or 3 columns
|
||||
* (heuristic) are rejected. When {@code true}, the minimum column count is relaxed
|
||||
* to 1, allowing single-column structured data (glossaries, itemized lists)
|
||||
* to be emitted as tables. Other quality filters (density, sparsity, prose
|
||||
* detection) still apply.
|
||||
*/
|
||||
@Nullable @JsonProperty("allow_single_column_tables") Boolean allowSingleColumnTables,
|
||||
/**
|
||||
* Perform OCR on inline images extracted from PDF pages and attach the
|
||||
* recognized text to each {@code ExtractedImage.ocr_result}. Requires Tesseract
|
||||
* to be available; if {@code ExtractionConfig.ocr} is {@code None} the extractor
|
||||
* falls back to {@code TesseractConfig.default()}. Per-image failures degrade
|
||||
* gracefully (the image is returned without OCR text rather than failing
|
||||
* the whole extraction). Default: {@code false}.
|
||||
*/
|
||||
@Nullable @JsonProperty("ocr_inline_images") Boolean ocrInlineImages
|
||||
) {
|
||||
public static Builder builder() {
|
||||
return new Builder();
|
||||
}
|
||||
|
||||
// CPD-OFF
|
||||
@JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
|
||||
public static final class Builder {
|
||||
|
||||
@JsonProperty("extract_images")
|
||||
private Boolean extractImages = null;
|
||||
@JsonProperty("extract_tables")
|
||||
private Boolean extractTables = null;
|
||||
private List<String> passwords = null;
|
||||
@JsonProperty("extract_metadata")
|
||||
private Boolean extractMetadata = null;
|
||||
@Nullable private HierarchyConfig hierarchy = null;
|
||||
@JsonProperty("extract_annotations")
|
||||
private Boolean extractAnnotations = null;
|
||||
@JsonProperty("top_margin_fraction")
|
||||
private Float topMarginFraction = null;
|
||||
@JsonProperty("bottom_margin_fraction")
|
||||
private Float bottomMarginFraction = null;
|
||||
@JsonProperty("allow_single_column_tables")
|
||||
private Boolean allowSingleColumnTables = null;
|
||||
@JsonProperty("ocr_inline_images")
|
||||
private Boolean ocrInlineImages = null;
|
||||
|
||||
/** Sets the extractImages field. */
|
||||
@JsonProperty("extract_images")
|
||||
public Builder withExtractImages(final @Nullable Boolean value) {
|
||||
this.extractImages = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the extractTables field. */
|
||||
@JsonProperty("extract_tables")
|
||||
public Builder withExtractTables(final @Nullable Boolean value) {
|
||||
this.extractTables = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the passwords field. */
|
||||
@JsonProperty("passwords")
|
||||
public Builder withPasswords(final @Nullable List<String> value) {
|
||||
this.passwords = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the extractMetadata field. */
|
||||
@JsonProperty("extract_metadata")
|
||||
public Builder withExtractMetadata(final @Nullable Boolean value) {
|
||||
this.extractMetadata = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the hierarchy field. */
|
||||
@JsonProperty("hierarchy")
|
||||
public Builder withHierarchy(final @Nullable HierarchyConfig value) {
|
||||
this.hierarchy = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the extractAnnotations field. */
|
||||
@JsonProperty("extract_annotations")
|
||||
public Builder withExtractAnnotations(final @Nullable Boolean value) {
|
||||
this.extractAnnotations = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the topMarginFraction field. */
|
||||
@JsonProperty("top_margin_fraction")
|
||||
public Builder withTopMarginFraction(final @Nullable Float value) {
|
||||
this.topMarginFraction = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the bottomMarginFraction field. */
|
||||
@JsonProperty("bottom_margin_fraction")
|
||||
public Builder withBottomMarginFraction(final @Nullable Float value) {
|
||||
this.bottomMarginFraction = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the allowSingleColumnTables field. */
|
||||
@JsonProperty("allow_single_column_tables")
|
||||
public Builder withAllowSingleColumnTables(final @Nullable Boolean value) {
|
||||
this.allowSingleColumnTables = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the ocrInlineImages field. */
|
||||
@JsonProperty("ocr_inline_images")
|
||||
public Builder withOcrInlineImages(final @Nullable Boolean value) {
|
||||
this.ocrInlineImages = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Builds the PdfConfig instance. */
|
||||
public PdfConfig build() {
|
||||
return new PdfConfig(
|
||||
extractImages,
|
||||
extractTables,
|
||||
passwords,
|
||||
extractMetadata,
|
||||
hierarchy,
|
||||
extractAnnotations,
|
||||
topMarginFraction,
|
||||
bottomMarginFraction,
|
||||
allowSingleColumnTables,
|
||||
ocrInlineImages
|
||||
);
|
||||
}
|
||||
}
|
||||
// CPD-ON
|
||||
public static PdfConfig defaultInstance() {
|
||||
throw new UnsupportedOperationException("defaultInstance is not yet bridged via JNI; use the Builder instead.");
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user