Files
fil/packages/java/dev/kreuzberg/ImageExtractionConfig.java

256 lines
9.5 KiB
Java
Raw Normal View History

2026-06-01 23:40:55 +02:00
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
import org.jspecify.annotations.Nullable;
/**
* Image extraction configuration.
*/
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = ImageExtractionConfig.Builder.class)
public record ImageExtractionConfig(
/**
* Extract images from documents
*/
@Nullable @JsonProperty("extract_images") Boolean extractImages,
/**
* Target DPI for image normalization
*/
@Nullable @JsonProperty("target_dpi") Integer targetDpi,
/**
* Maximum dimension for images (width or height)
*/
@Nullable @JsonProperty("max_image_dimension") Integer maxImageDimension,
/**
* Whether to inject image reference placeholders into markdown output.
* When {@code true} (default), image references like {@code ![Image 1](embedded:p1_i0)}
* are appended to the markdown. Set to {@code false} to extract images as data
* without polluting the markdown output.
*/
@Nullable @JsonProperty("inject_placeholders") Boolean injectPlaceholders,
/**
* Automatically adjust DPI based on image content
*/
@Nullable @JsonProperty("auto_adjust_dpi") Boolean autoAdjustDpi,
/**
* Minimum DPI threshold
*/
@Nullable @JsonProperty("min_dpi") Integer minDpi,
/**
* Maximum DPI threshold
*/
@Nullable @JsonProperty("max_dpi") Integer maxDpi,
/**
* Maximum number of image objects to extract per PDF page.
*
* Some PDFs (e.g. technical diagrams stored as thousands of raster fragments)
* can trigger extremely long or indefinite extraction times when every image
* object on a dense page is decoded individually via the PDF extractor. Setting this
* limit causes kreuzberg to stop collecting individual images once the count
* per page reaches the cap and emit a warning instead.
*
* {@code None} (default) means no limit all images are extracted.
*/
@Nullable @JsonProperty("max_images_per_page") Integer maxImagesPerPage,
/**
* When {@code true} (default), extracted images are classified by kind and grouped
* into clusters where they appear to belong to one figure.
*/
@Nullable @JsonProperty("classify") Boolean classify,
/**
* When {@code true}, full-page renders produced during OCR preprocessing are captured
* and returned as {@code ImageKind.PageRaster} entries in {@code ExtractionResult.images}.
*
* **PDF + OCR only.** No rasters are captured for non-PDF inputs or when the
* document-level OCR bypass is active (whole-document backend). When OCR is
* enabled and this flag is set but the active backend skips per-page rendering,
* a {@code ProcessingWarning} is emitted in {@code ExtractionResult.processing_warnings}.
*
* Defaults to {@code false}. Enable when downstream consumers need page thumbnails
* (e.g. citation previews, visual grounding).
*/
@Nullable @JsonProperty("include_page_rasters") Boolean includePageRasters,
/**
* Run OCR on extracted images and include the recognized text in the document content.
*
* When {@code true} (default) and {@code ExtractionConfig.ocr} is configured, extracted images
* are processed with the configured OCR backend. Set to {@code false} to extract images
* without OCR processing, even when OCR is enabled.
*/
@Nullable @JsonProperty("run_ocr_on_images") Boolean runOcrOnImages,
/**
* When {@code true}, image OCR results are rendered as plain text without the
* {@code ![...](...)} markdown placeholder. Only takes effect when {@code run_ocr_on_images}
* is also {@code true}.
*/
@Nullable @JsonProperty("ocr_text_only") Boolean ocrTextOnly,
/**
* When {@code true} and {@code ocr_text_only} is {@code false}, append the OCR text after
* the image placeholder in the rendered output.
*/
@Nullable @JsonProperty("append_ocr_text") Boolean appendOcrText
) {
public static Builder builder() {
return new Builder();
}
public ImageExtractionConfig{
if (targetDpi == null) targetDpi = 300;
if (maxImageDimension == null) maxImageDimension = 4096;
if (minDpi == null) minDpi = 72;
if (maxDpi == null) maxDpi = 600;
}
// CPD-OFF
@JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
public static final class Builder {
@JsonProperty("extract_images")
private Boolean extractImages = null;
@JsonProperty("target_dpi")
private Integer targetDpi = null;
@JsonProperty("max_image_dimension")
private Integer maxImageDimension = null;
@JsonProperty("inject_placeholders")
private Boolean injectPlaceholders = null;
@JsonProperty("auto_adjust_dpi")
private Boolean autoAdjustDpi = null;
@JsonProperty("min_dpi")
private Integer minDpi = null;
@JsonProperty("max_dpi")
private Integer maxDpi = null;
@JsonProperty("max_images_per_page")
private Integer maxImagesPerPage = null;
private Boolean classify = null;
@JsonProperty("include_page_rasters")
private Boolean includePageRasters = null;
@JsonProperty("run_ocr_on_images")
private Boolean runOcrOnImages = null;
@JsonProperty("ocr_text_only")
private Boolean ocrTextOnly = null;
@JsonProperty("append_ocr_text")
private Boolean appendOcrText = null;
/** Sets the extractImages field. */
@JsonProperty("extract_images")
public Builder withExtractImages(final @Nullable Boolean value) {
this.extractImages = value;
return this;
}
/** Sets the targetDpi field. */
@JsonProperty("target_dpi")
public Builder withTargetDpi(final @Nullable Integer value) {
this.targetDpi = value;
return this;
}
/** Sets the maxImageDimension field. */
@JsonProperty("max_image_dimension")
public Builder withMaxImageDimension(final @Nullable Integer value) {
this.maxImageDimension = value;
return this;
}
/** Sets the injectPlaceholders field. */
@JsonProperty("inject_placeholders")
public Builder withInjectPlaceholders(final @Nullable Boolean value) {
this.injectPlaceholders = value;
return this;
}
/** Sets the autoAdjustDpi field. */
@JsonProperty("auto_adjust_dpi")
public Builder withAutoAdjustDpi(final @Nullable Boolean value) {
this.autoAdjustDpi = value;
return this;
}
/** Sets the minDpi field. */
@JsonProperty("min_dpi")
public Builder withMinDpi(final @Nullable Integer value) {
this.minDpi = value;
return this;
}
/** Sets the maxDpi field. */
@JsonProperty("max_dpi")
public Builder withMaxDpi(final @Nullable Integer value) {
this.maxDpi = value;
return this;
}
/** Sets the maxImagesPerPage field. */
@JsonProperty("max_images_per_page")
public Builder withMaxImagesPerPage(final @Nullable Integer value) {
this.maxImagesPerPage = value;
return this;
}
/** Sets the classify field. */
@JsonProperty("classify")
public Builder withClassify(final @Nullable Boolean value) {
this.classify = value;
return this;
}
/** Sets the includePageRasters field. */
@JsonProperty("include_page_rasters")
public Builder withIncludePageRasters(final @Nullable Boolean value) {
this.includePageRasters = value;
return this;
}
/** Sets the runOcrOnImages field. */
@JsonProperty("run_ocr_on_images")
public Builder withRunOcrOnImages(final @Nullable Boolean value) {
this.runOcrOnImages = value;
return this;
}
/** Sets the ocrTextOnly field. */
@JsonProperty("ocr_text_only")
public Builder withOcrTextOnly(final @Nullable Boolean value) {
this.ocrTextOnly = value;
return this;
}
/** Sets the appendOcrText field. */
@JsonProperty("append_ocr_text")
public Builder withAppendOcrText(final @Nullable Boolean value) {
this.appendOcrText = value;
return this;
}
/** Builds the ImageExtractionConfig instance. */
public ImageExtractionConfig build() {
return new ImageExtractionConfig(
extractImages,
targetDpi,
maxImageDimension,
injectPlaceholders,
autoAdjustDpi,
minDpi,
maxDpi,
maxImagesPerPage,
classify,
includePageRasters,
runOcrOnImages,
ocrTextOnly,
appendOcrText
);
}
}
// CPD-ON
public static ImageExtractionConfig defaultInstance() {
throw new UnsupportedOperationException("defaultInstance is not yet bridged via JNI; use the Builder instead.");
}
}