This commit is contained in:
214
packages/java/dev/kreuzberg/PageContent.java
generated
Normal file
214
packages/java/dev/kreuzberg/PageContent.java
generated
Normal file
@@ -0,0 +1,214 @@
|
||||
// This file is auto-generated by alef — DO NOT EDIT.
|
||||
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
// To regenerate: alef generate
|
||||
// To verify freshness: alef verify --exit-code
|
||||
// Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
package dev.kreuzberg;
|
||||
|
||||
import java.util.List;
|
||||
import com.fasterxml.jackson.annotation.JsonProperty;
|
||||
import com.fasterxml.jackson.annotation.JsonInclude;
|
||||
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
|
||||
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
|
||||
import org.jspecify.annotations.Nullable;
|
||||
|
||||
/**
|
||||
* Content for a single page/slide.
|
||||
*
|
||||
* When page extraction is enabled, documents are split into per-page content
|
||||
* with associated tables and images mapped to each page.
|
||||
*
|
||||
* # Performance
|
||||
*
|
||||
* Uses Arc-wrapped tables and images for memory efficiency:
|
||||
* - {@code Vec<Arc<Table>>} enables zero-copy sharing of table data
|
||||
* - {@code Vec<Arc<ExtractedImage>>} enables zero-copy sharing of image data
|
||||
* - Maintains exact JSON compatibility via custom Serialize/Deserialize
|
||||
*
|
||||
* This reduces memory overhead for documents with shared tables/images
|
||||
* by avoiding redundant copies during serialization.
|
||||
*/
|
||||
@JsonInclude(JsonInclude.Include.NON_ABSENT)
|
||||
@JsonDeserialize(builder = PageContent.Builder.class)
|
||||
public record PageContent(
|
||||
/**
|
||||
* Page number (1-indexed)
|
||||
*/
|
||||
@JsonProperty("page_number") int pageNumber,
|
||||
/**
|
||||
* Text content for this page
|
||||
*/
|
||||
@JsonProperty("content") String content,
|
||||
/**
|
||||
* Tables found on this page (uses Arc for memory efficiency)
|
||||
*
|
||||
* Serializes as Table[] for JSON compatibility while maintaining
|
||||
* Arc semantics in-memory for zero-copy sharing.
|
||||
*/
|
||||
@Nullable @JsonProperty("tables") List<Table> tables,
|
||||
/**
|
||||
* Indices into {@code ExtractionResult.images} for images found on this page.
|
||||
*
|
||||
* Each value is a zero-based index into the top-level {@code images} collection.
|
||||
* Only populated when {@code extract_images = true} in the extraction config.
|
||||
*/
|
||||
@Nullable @JsonProperty("image_indices") List<Integer> imageIndices,
|
||||
/**
|
||||
* Hierarchy information for the page (when hierarchy extraction is enabled)
|
||||
*
|
||||
* Contains text hierarchy levels (H1-H6) extracted from the page content.
|
||||
*/
|
||||
@Nullable @JsonProperty("hierarchy") PageHierarchy hierarchy,
|
||||
/**
|
||||
* Whether this page is blank (no meaningful text content)
|
||||
*
|
||||
* Determined during extraction based on text content analysis.
|
||||
* A page is blank if it has fewer than 3 non-whitespace characters
|
||||
* and contains no tables or images.
|
||||
*/
|
||||
@Nullable @JsonProperty("is_blank") Boolean isBlank,
|
||||
/**
|
||||
* Layout detection regions for this page (when layout detection is enabled).
|
||||
*
|
||||
* Contains detected layout regions with class, confidence, bounding box,
|
||||
* and area fraction. Only populated when layout detection is configured.
|
||||
*/
|
||||
@Nullable @JsonProperty("layout_regions") List<LayoutRegion> layoutRegions,
|
||||
/**
|
||||
* Speaker notes for this slide (PPTX only).
|
||||
*
|
||||
* Contains the text from the slide's notes pane ({@code ppt/notesSlides/notesSlide{N}.xml}).
|
||||
* Only populated when the source is a PPTX file and notes are present.
|
||||
*/
|
||||
@Nullable @JsonProperty("speaker_notes") String speakerNotes,
|
||||
/**
|
||||
* Section name this slide belongs to (PPTX only).
|
||||
*
|
||||
* PowerPoint sections group slides into logical chapters ({@code <p:sectionLst>} in
|
||||
* {@code ppt/presentation.xml}). Only populated when the source is a PPTX file and
|
||||
* the slide belongs to a named section.
|
||||
*/
|
||||
@Nullable @JsonProperty("section_name") String sectionName,
|
||||
/**
|
||||
* Sheet name for this page (XLSX/ODS only).
|
||||
*
|
||||
* Each spreadsheet sheet maps to one {@code PageContent} entry. This field carries the
|
||||
* sheet's display name as it appears in the workbook. {@code None} for all non-spreadsheet
|
||||
* formats and for sheets with an empty name.
|
||||
*/
|
||||
@Nullable @JsonProperty("sheet_name") String sheetName
|
||||
) {
|
||||
public static Builder builder() {
|
||||
return new Builder();
|
||||
}
|
||||
|
||||
// CPD-OFF
|
||||
@JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
|
||||
public static final class Builder {
|
||||
|
||||
@JsonProperty("page_number")
|
||||
private int pageNumber = 0;
|
||||
private String content = "";
|
||||
private List<Table> tables = null;
|
||||
@JsonProperty("image_indices")
|
||||
private List<Integer> imageIndices = null;
|
||||
private PageHierarchy hierarchy = null;
|
||||
@JsonProperty("is_blank")
|
||||
private Boolean isBlank = null;
|
||||
@JsonProperty("layout_regions")
|
||||
private List<LayoutRegion> layoutRegions = null;
|
||||
@JsonProperty("speaker_notes")
|
||||
private String speakerNotes = null;
|
||||
@JsonProperty("section_name")
|
||||
private String sectionName = null;
|
||||
@JsonProperty("sheet_name")
|
||||
private String sheetName = null;
|
||||
|
||||
/** Sets the pageNumber field. */
|
||||
@JsonProperty("page_number")
|
||||
public Builder withPageNumber(final int value) {
|
||||
this.pageNumber = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the content field. */
|
||||
@JsonProperty("content")
|
||||
public Builder withContent(final String value) {
|
||||
this.content = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the tables field. */
|
||||
@JsonProperty("tables")
|
||||
public Builder withTables(final @Nullable List<Table> value) {
|
||||
this.tables = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the imageIndices field. */
|
||||
@JsonProperty("image_indices")
|
||||
public Builder withImageIndices(final @Nullable List<Integer> value) {
|
||||
this.imageIndices = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the hierarchy field. */
|
||||
@JsonProperty("hierarchy")
|
||||
public Builder withHierarchy(final @Nullable PageHierarchy value) {
|
||||
this.hierarchy = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the isBlank field. */
|
||||
@JsonProperty("is_blank")
|
||||
public Builder withIsBlank(final @Nullable boolean value) {
|
||||
this.isBlank = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the layoutRegions field. */
|
||||
@JsonProperty("layout_regions")
|
||||
public Builder withLayoutRegions(final @Nullable List<LayoutRegion> value) {
|
||||
this.layoutRegions = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the speakerNotes field. */
|
||||
@JsonProperty("speaker_notes")
|
||||
public Builder withSpeakerNotes(final @Nullable String value) {
|
||||
this.speakerNotes = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the sectionName field. */
|
||||
@JsonProperty("section_name")
|
||||
public Builder withSectionName(final @Nullable String value) {
|
||||
this.sectionName = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the sheetName field. */
|
||||
@JsonProperty("sheet_name")
|
||||
public Builder withSheetName(final @Nullable String value) {
|
||||
this.sheetName = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Builds the PageContent instance. */
|
||||
public PageContent build() {
|
||||
return new PageContent(
|
||||
pageNumber,
|
||||
content,
|
||||
tables,
|
||||
imageIndices,
|
||||
hierarchy,
|
||||
isBlank,
|
||||
layoutRegions,
|
||||
speakerNotes,
|
||||
sectionName,
|
||||
sheetName
|
||||
);
|
||||
}
|
||||
}
|
||||
// CPD-ON
|
||||
}
|
||||
Reference in New Issue
Block a user