Files
fil/packages/java/dev/kreuzberg/PageContent.java

215 lines
7.4 KiB
Java
Raw Normal View History

2026-06-01 23:40:55 +02:00
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import java.util.List;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
import org.jspecify.annotations.Nullable;
/**
 * Content for a single page/slide.
 *
 * <p>When page extraction is enabled, documents are split into per-page content
 * with associated tables and images mapped to each page.
 *
 * <h2>Performance</h2>
 *
 * <p>NOTE(review): the notes below use Rust type names and appear to describe the
 * native type this generated record mirrors; the Java record itself holds plain
 * {@link List} values.
 *
 * <p>Uses Arc-wrapped tables and images for memory efficiency:
 * <ul>
 *   <li>{@code Vec<Arc<Table>>} enables zero-copy sharing of table data</li>
 *   <li>{@code Vec<Arc<ExtractedImage>>} enables zero-copy sharing of image data</li>
 *   <li>Maintains exact JSON compatibility via custom Serialize/Deserialize</li>
 * </ul>
 *
 * <p>This reduces memory overhead for documents with shared tables/images
 * by avoiding redundant copies during serialization.
 *
 * <p>JSON mapping: absent/{@code null} components are omitted on serialization
 * ({@code NON_ABSENT}); deserialization goes through {@link Builder}.
 */
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = PageContent.Builder.class)
public record PageContent(
    /**
     * Page number (1-indexed)
     */
    @JsonProperty("page_number") int pageNumber,
    /**
     * Text content for this page
     */
    @JsonProperty("content") String content,
    /**
     * Tables found on this page (uses Arc for memory efficiency)
     *
     * Serializes as Table[] for JSON compatibility while maintaining
     * Arc semantics in-memory for zero-copy sharing.
     */
    @Nullable @JsonProperty("tables") List<Table> tables,
    /**
     * Indices into {@code ExtractionResult.images} for images found on this page.
     *
     * Each value is a zero-based index into the top-level {@code images} collection.
     * Only populated when {@code extract_images = true} in the extraction config.
     */
    @Nullable @JsonProperty("image_indices") List<Integer> imageIndices,
    /**
     * Hierarchy information for the page (when hierarchy extraction is enabled)
     *
     * Contains text hierarchy levels (H1-H6) extracted from the page content.
     */
    @Nullable @JsonProperty("hierarchy") PageHierarchy hierarchy,
    /**
     * Whether this page is blank (no meaningful text content)
     *
     * Determined during extraction based on text content analysis.
     * A page is blank if it has fewer than 3 non-whitespace characters
     * and contains no tables or images.
     */
    @Nullable @JsonProperty("is_blank") Boolean isBlank,
    /**
     * Layout detection regions for this page (when layout detection is enabled).
     *
     * Contains detected layout regions with class, confidence, bounding box,
     * and area fraction. Only populated when layout detection is configured.
     */
    @Nullable @JsonProperty("layout_regions") List<LayoutRegion> layoutRegions,
    /**
     * Speaker notes for this slide (PPTX only).
     *
     * Contains the text from the slide's notes pane ({@code ppt/notesSlides/notesSlide{N}.xml}).
     * Only populated when the source is a PPTX file and notes are present.
     */
    @Nullable @JsonProperty("speaker_notes") String speakerNotes,
    /**
     * Section name this slide belongs to (PPTX only).
     *
     * PowerPoint sections group slides into logical chapters ({@code <p:sectionLst>} in
     * {@code ppt/presentation.xml}). Only populated when the source is a PPTX file and
     * the slide belongs to a named section.
     */
    @Nullable @JsonProperty("section_name") String sectionName,
    /**
     * Sheet name for this page (XLSX/ODS only).
     *
     * Each spreadsheet sheet maps to one {@code PageContent} entry. This field carries the
     * sheet's display name as it appears in the workbook. {@code null} for all non-spreadsheet
     * formats and for sheets with an empty name.
     */
    @Nullable @JsonProperty("sheet_name") String sheetName
) {

    /**
     * Creates a new, empty {@link Builder}.
     *
     * @return a builder whose fields hold their defaults (page number {@code 0},
     *         empty content, every optional field {@code null})
     */
    public static Builder builder() {
        return new Builder();
    }

    // CPD-OFF
    /**
     * Mutable builder for {@link PageContent}; also the Jackson deserialization
     * entry point (setter prefix {@code with}, build method {@code build}).
     *
     * <p>Setters store the given value as-is (no validation or copying) and
     * return this builder for chaining.
     */
    @JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
    public static final class Builder {
        @JsonProperty("page_number")
        private int pageNumber = 0;
        private String content = "";
        private @Nullable List<Table> tables = null;
        @JsonProperty("image_indices")
        private @Nullable List<Integer> imageIndices = null;
        private @Nullable PageHierarchy hierarchy = null;
        @JsonProperty("is_blank")
        private @Nullable Boolean isBlank = null;
        @JsonProperty("layout_regions")
        private @Nullable List<LayoutRegion> layoutRegions = null;
        @JsonProperty("speaker_notes")
        private @Nullable String speakerNotes = null;
        @JsonProperty("section_name")
        private @Nullable String sectionName = null;
        @JsonProperty("sheet_name")
        private @Nullable String sheetName = null;

        /** Sets the pageNumber field. */
        @JsonProperty("page_number")
        public Builder withPageNumber(final int value) {
            this.pageNumber = value;
            return this;
        }

        /** Sets the content field. */
        @JsonProperty("content")
        public Builder withContent(final String value) {
            this.content = value;
            return this;
        }

        /** Sets the tables field. */
        @JsonProperty("tables")
        public Builder withTables(final @Nullable List<Table> value) {
            this.tables = value;
            return this;
        }

        /** Sets the imageIndices field. */
        @JsonProperty("image_indices")
        public Builder withImageIndices(final @Nullable List<Integer> value) {
            this.imageIndices = value;
            return this;
        }

        /** Sets the hierarchy field. */
        @JsonProperty("hierarchy")
        public Builder withHierarchy(final @Nullable PageHierarchy value) {
            this.hierarchy = value;
            return this;
        }

        /**
         * Sets the isBlank field.
         *
         * <p>Takes the boxed type, matching the nullable record component, so that
         * {@code null} can be stored. With a primitive parameter a JSON
         * {@code "is_blank": null} could not be represented and would be coerced to
         * {@code false} (or rejected) by Jackson.
         *
         * @param value {@code true}/{@code false}, or {@code null} when not set
         */
        @JsonProperty("is_blank")
        public Builder withIsBlank(final @Nullable Boolean value) {
            this.isBlank = value;
            return this;
        }

        /** Sets the layoutRegions field. */
        @JsonProperty("layout_regions")
        public Builder withLayoutRegions(final @Nullable List<LayoutRegion> value) {
            this.layoutRegions = value;
            return this;
        }

        /** Sets the speakerNotes field. */
        @JsonProperty("speaker_notes")
        public Builder withSpeakerNotes(final @Nullable String value) {
            this.speakerNotes = value;
            return this;
        }

        /** Sets the sectionName field. */
        @JsonProperty("section_name")
        public Builder withSectionName(final @Nullable String value) {
            this.sectionName = value;
            return this;
        }

        /** Sets the sheetName field. */
        @JsonProperty("sheet_name")
        public Builder withSheetName(final @Nullable String value) {
            this.sheetName = value;
            return this;
        }

        /**
         * Builds the PageContent instance.
         *
         * @return a new record populated from the builder's current field values,
         *         passed in the record's component order
         */
        public PageContent build() {
            return new PageContent(
                pageNumber,
                content,
                tables,
                imageIndices,
                hierarchy,
                isBlank,
                layoutRegions,
                speakerNotes,
                sectionName,
                sheetName
            );
        }
    }
    // CPD-ON
}