This commit is contained in:
255
packages/java/dev/kreuzberg/ImageExtractionConfig.java
generated
Normal file
255
packages/java/dev/kreuzberg/ImageExtractionConfig.java
generated
Normal file
@@ -0,0 +1,255 @@
|
||||
// This file is auto-generated by alef — DO NOT EDIT.
|
||||
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
// To regenerate: alef generate
|
||||
// To verify freshness: alef verify --exit-code
|
||||
// Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
package dev.kreuzberg;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonProperty;
|
||||
import com.fasterxml.jackson.annotation.JsonInclude;
|
||||
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
|
||||
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
|
||||
import org.jspecify.annotations.Nullable;
|
||||
|
||||
/**
|
||||
* Image extraction configuration.
|
||||
*/
|
||||
@JsonInclude(JsonInclude.Include.NON_ABSENT)
|
||||
@JsonDeserialize(builder = ImageExtractionConfig.Builder.class)
|
||||
public record ImageExtractionConfig(
|
||||
/**
|
||||
* Extract images from documents
|
||||
*/
|
||||
@Nullable @JsonProperty("extract_images") Boolean extractImages,
|
||||
/**
|
||||
* Target DPI for image normalization
|
||||
*/
|
||||
@Nullable @JsonProperty("target_dpi") Integer targetDpi,
|
||||
/**
|
||||
* Maximum dimension for images (width or height)
|
||||
*/
|
||||
@Nullable @JsonProperty("max_image_dimension") Integer maxImageDimension,
|
||||
/**
|
||||
* Whether to inject image reference placeholders into markdown output.
|
||||
* When {@code true} (default), image references like {@code }
|
||||
* are appended to the markdown. Set to {@code false} to extract images as data
|
||||
* without polluting the markdown output.
|
||||
*/
|
||||
@Nullable @JsonProperty("inject_placeholders") Boolean injectPlaceholders,
|
||||
/**
|
||||
* Automatically adjust DPI based on image content
|
||||
*/
|
||||
@Nullable @JsonProperty("auto_adjust_dpi") Boolean autoAdjustDpi,
|
||||
/**
|
||||
* Minimum DPI threshold
|
||||
*/
|
||||
@Nullable @JsonProperty("min_dpi") Integer minDpi,
|
||||
/**
|
||||
* Maximum DPI threshold
|
||||
*/
|
||||
@Nullable @JsonProperty("max_dpi") Integer maxDpi,
|
||||
/**
|
||||
* Maximum number of image objects to extract per PDF page.
|
||||
*
|
||||
* Some PDFs (e.g. technical diagrams stored as thousands of raster fragments)
|
||||
* can trigger extremely long or indefinite extraction times when every image
|
||||
* object on a dense page is decoded individually via the PDF extractor. Setting this
|
||||
* limit causes kreuzberg to stop collecting individual images once the count
|
||||
* per page reaches the cap and emit a warning instead.
|
||||
*
|
||||
* {@code None} (default) means no limit — all images are extracted.
|
||||
*/
|
||||
@Nullable @JsonProperty("max_images_per_page") Integer maxImagesPerPage,
|
||||
/**
|
||||
* When {@code true} (default), extracted images are classified by kind and grouped
|
||||
* into clusters where they appear to belong to one figure.
|
||||
*/
|
||||
@Nullable @JsonProperty("classify") Boolean classify,
|
||||
/**
|
||||
* When {@code true}, full-page renders produced during OCR preprocessing are captured
|
||||
* and returned as {@code ImageKind.PageRaster} entries in {@code ExtractionResult.images}.
|
||||
*
|
||||
* **PDF + OCR only.** No rasters are captured for non-PDF inputs or when the
|
||||
* document-level OCR bypass is active (whole-document backend). When OCR is
|
||||
* enabled and this flag is set but the active backend skips per-page rendering,
|
||||
* a {@code ProcessingWarning} is emitted in {@code ExtractionResult.processing_warnings}.
|
||||
*
|
||||
* Defaults to {@code false}. Enable when downstream consumers need page thumbnails
|
||||
* (e.g. citation previews, visual grounding).
|
||||
*/
|
||||
@Nullable @JsonProperty("include_page_rasters") Boolean includePageRasters,
|
||||
/**
|
||||
* Run OCR on extracted images and include the recognized text in the document content.
|
||||
*
|
||||
* When {@code true} (default) and {@code ExtractionConfig.ocr} is configured, extracted images
|
||||
* are processed with the configured OCR backend. Set to {@code false} to extract images
|
||||
* without OCR processing, even when OCR is enabled.
|
||||
*/
|
||||
@Nullable @JsonProperty("run_ocr_on_images") Boolean runOcrOnImages,
|
||||
/**
|
||||
* When {@code true}, image OCR results are rendered as plain text without the
|
||||
* {@code } markdown placeholder. Only takes effect when {@code run_ocr_on_images}
|
||||
* is also {@code true}.
|
||||
*/
|
||||
@Nullable @JsonProperty("ocr_text_only") Boolean ocrTextOnly,
|
||||
/**
|
||||
* When {@code true} and {@code ocr_text_only} is {@code false}, append the OCR text after
|
||||
* the image placeholder in the rendered output.
|
||||
*/
|
||||
@Nullable @JsonProperty("append_ocr_text") Boolean appendOcrText
|
||||
) {
|
||||
public static Builder builder() {
|
||||
return new Builder();
|
||||
}
|
||||
public ImageExtractionConfig{
|
||||
if (targetDpi == null) targetDpi = 300;
|
||||
if (maxImageDimension == null) maxImageDimension = 4096;
|
||||
if (minDpi == null) minDpi = 72;
|
||||
if (maxDpi == null) maxDpi = 600;
|
||||
}
|
||||
|
||||
// CPD-OFF
|
||||
@JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
|
||||
public static final class Builder {
|
||||
|
||||
@JsonProperty("extract_images")
|
||||
private Boolean extractImages = null;
|
||||
@JsonProperty("target_dpi")
|
||||
private Integer targetDpi = null;
|
||||
@JsonProperty("max_image_dimension")
|
||||
private Integer maxImageDimension = null;
|
||||
@JsonProperty("inject_placeholders")
|
||||
private Boolean injectPlaceholders = null;
|
||||
@JsonProperty("auto_adjust_dpi")
|
||||
private Boolean autoAdjustDpi = null;
|
||||
@JsonProperty("min_dpi")
|
||||
private Integer minDpi = null;
|
||||
@JsonProperty("max_dpi")
|
||||
private Integer maxDpi = null;
|
||||
@JsonProperty("max_images_per_page")
|
||||
private Integer maxImagesPerPage = null;
|
||||
private Boolean classify = null;
|
||||
@JsonProperty("include_page_rasters")
|
||||
private Boolean includePageRasters = null;
|
||||
@JsonProperty("run_ocr_on_images")
|
||||
private Boolean runOcrOnImages = null;
|
||||
@JsonProperty("ocr_text_only")
|
||||
private Boolean ocrTextOnly = null;
|
||||
@JsonProperty("append_ocr_text")
|
||||
private Boolean appendOcrText = null;
|
||||
|
||||
/** Sets the extractImages field. */
|
||||
@JsonProperty("extract_images")
|
||||
public Builder withExtractImages(final @Nullable Boolean value) {
|
||||
this.extractImages = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the targetDpi field. */
|
||||
@JsonProperty("target_dpi")
|
||||
public Builder withTargetDpi(final @Nullable Integer value) {
|
||||
this.targetDpi = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the maxImageDimension field. */
|
||||
@JsonProperty("max_image_dimension")
|
||||
public Builder withMaxImageDimension(final @Nullable Integer value) {
|
||||
this.maxImageDimension = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the injectPlaceholders field. */
|
||||
@JsonProperty("inject_placeholders")
|
||||
public Builder withInjectPlaceholders(final @Nullable Boolean value) {
|
||||
this.injectPlaceholders = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the autoAdjustDpi field. */
|
||||
@JsonProperty("auto_adjust_dpi")
|
||||
public Builder withAutoAdjustDpi(final @Nullable Boolean value) {
|
||||
this.autoAdjustDpi = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the minDpi field. */
|
||||
@JsonProperty("min_dpi")
|
||||
public Builder withMinDpi(final @Nullable Integer value) {
|
||||
this.minDpi = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the maxDpi field. */
|
||||
@JsonProperty("max_dpi")
|
||||
public Builder withMaxDpi(final @Nullable Integer value) {
|
||||
this.maxDpi = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the maxImagesPerPage field. */
|
||||
@JsonProperty("max_images_per_page")
|
||||
public Builder withMaxImagesPerPage(final @Nullable Integer value) {
|
||||
this.maxImagesPerPage = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the classify field. */
|
||||
@JsonProperty("classify")
|
||||
public Builder withClassify(final @Nullable Boolean value) {
|
||||
this.classify = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the includePageRasters field. */
|
||||
@JsonProperty("include_page_rasters")
|
||||
public Builder withIncludePageRasters(final @Nullable Boolean value) {
|
||||
this.includePageRasters = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the runOcrOnImages field. */
|
||||
@JsonProperty("run_ocr_on_images")
|
||||
public Builder withRunOcrOnImages(final @Nullable Boolean value) {
|
||||
this.runOcrOnImages = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the ocrTextOnly field. */
|
||||
@JsonProperty("ocr_text_only")
|
||||
public Builder withOcrTextOnly(final @Nullable Boolean value) {
|
||||
this.ocrTextOnly = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the appendOcrText field. */
|
||||
@JsonProperty("append_ocr_text")
|
||||
public Builder withAppendOcrText(final @Nullable Boolean value) {
|
||||
this.appendOcrText = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Builds the ImageExtractionConfig instance. */
|
||||
public ImageExtractionConfig build() {
|
||||
return new ImageExtractionConfig(
|
||||
extractImages,
|
||||
targetDpi,
|
||||
maxImageDimension,
|
||||
injectPlaceholders,
|
||||
autoAdjustDpi,
|
||||
minDpi,
|
||||
maxDpi,
|
||||
maxImagesPerPage,
|
||||
classify,
|
||||
includePageRasters,
|
||||
runOcrOnImages,
|
||||
ocrTextOnly,
|
||||
appendOcrText
|
||||
);
|
||||
}
|
||||
}
|
||||
// CPD-ON
|
||||
public static ImageExtractionConfig defaultInstance() {
|
||||
throw new UnsupportedOperationException("defaultInstance is not yet bridged via JNI; use the Builder instead.");
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user