// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
|
||
|
|
package dev.kreuzberg;
|
||
|
|
|
||
|
|
import java.util.List;
|
||
|
|
import com.fasterxml.jackson.annotation.JsonProperty;
|
||
|
|
import com.fasterxml.jackson.annotation.JsonInclude;
|
||
|
|
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
|
||
|
|
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
|
||
|
|
import org.jspecify.annotations.Nullable;
|
||
|
|
|
||
|
|
/**
|
||
|
|
* Content for a single page/slide.
|
||
|
|
*
|
||
|
|
* When page extraction is enabled, documents are split into per-page content
|
||
|
|
* with associated tables and images mapped to each page.
|
||
|
|
*
|
||
|
|
* # Performance
|
||
|
|
*
|
||
|
|
* Uses Arc-wrapped tables and images for memory efficiency:
|
||
|
|
* - {@code Vec<Arc<Table>>} enables zero-copy sharing of table data
|
||
|
|
* - {@code Vec<Arc<ExtractedImage>>} enables zero-copy sharing of image data
|
||
|
|
* - Maintains exact JSON compatibility via custom Serialize/Deserialize
|
||
|
|
*
|
||
|
|
* This reduces memory overhead for documents with shared tables/images
|
||
|
|
* by avoiding redundant copies during serialization.
|
||
|
|
*/
|
||
|
|
@JsonInclude(JsonInclude.Include.NON_ABSENT)
|
||
|
|
@JsonDeserialize(builder = PageContent.Builder.class)
|
||
|
|
public record PageContent(
|
||
|
|
/**
|
||
|
|
* Page number (1-indexed)
|
||
|
|
*/
|
||
|
|
@JsonProperty("page_number") int pageNumber,
|
||
|
|
/**
|
||
|
|
* Text content for this page
|
||
|
|
*/
|
||
|
|
@JsonProperty("content") String content,
|
||
|
|
/**
|
||
|
|
* Tables found on this page (uses Arc for memory efficiency)
|
||
|
|
*
|
||
|
|
* Serializes as Table[] for JSON compatibility while maintaining
|
||
|
|
* Arc semantics in-memory for zero-copy sharing.
|
||
|
|
*/
|
||
|
|
@Nullable @JsonProperty("tables") List<Table> tables,
|
||
|
|
/**
|
||
|
|
* Indices into {@code ExtractionResult.images} for images found on this page.
|
||
|
|
*
|
||
|
|
* Each value is a zero-based index into the top-level {@code images} collection.
|
||
|
|
* Only populated when {@code extract_images = true} in the extraction config.
|
||
|
|
*/
|
||
|
|
@Nullable @JsonProperty("image_indices") List<Integer> imageIndices,
|
||
|
|
/**
|
||
|
|
* Hierarchy information for the page (when hierarchy extraction is enabled)
|
||
|
|
*
|
||
|
|
* Contains text hierarchy levels (H1-H6) extracted from the page content.
|
||
|
|
*/
|
||
|
|
@Nullable @JsonProperty("hierarchy") PageHierarchy hierarchy,
|
||
|
|
/**
|
||
|
|
* Whether this page is blank (no meaningful text content)
|
||
|
|
*
|
||
|
|
* Determined during extraction based on text content analysis.
|
||
|
|
* A page is blank if it has fewer than 3 non-whitespace characters
|
||
|
|
* and contains no tables or images.
|
||
|
|
*/
|
||
|
|
@Nullable @JsonProperty("is_blank") Boolean isBlank,
|
||
|
|
/**
|
||
|
|
* Layout detection regions for this page (when layout detection is enabled).
|
||
|
|
*
|
||
|
|
* Contains detected layout regions with class, confidence, bounding box,
|
||
|
|
* and area fraction. Only populated when layout detection is configured.
|
||
|
|
*/
|
||
|
|
@Nullable @JsonProperty("layout_regions") List<LayoutRegion> layoutRegions,
|
||
|
|
/**
|
||
|
|
* Speaker notes for this slide (PPTX only).
|
||
|
|
*
|
||
|
|
* Contains the text from the slide's notes pane ({@code ppt/notesSlides/notesSlide{N}.xml}).
|
||
|
|
* Only populated when the source is a PPTX file and notes are present.
|
||
|
|
*/
|
||
|
|
@Nullable @JsonProperty("speaker_notes") String speakerNotes,
|
||
|
|
/**
|
||
|
|
* Section name this slide belongs to (PPTX only).
|
||
|
|
*
|
||
|
|
* PowerPoint sections group slides into logical chapters ({@code <p:sectionLst>} in
|
||
|
|
* {@code ppt/presentation.xml}). Only populated when the source is a PPTX file and
|
||
|
|
* the slide belongs to a named section.
|
||
|
|
*/
|
||
|
|
@Nullable @JsonProperty("section_name") String sectionName,
|
||
|
|
/**
|
||
|
|
* Sheet name for this page (XLSX/ODS only).
|
||
|
|
*
|
||
|
|
* Each spreadsheet sheet maps to one {@code PageContent} entry. This field carries the
|
||
|
|
* sheet's display name as it appears in the workbook. {@code None} for all non-spreadsheet
|
||
|
|
* formats and for sheets with an empty name.
|
||
|
|
*/
|
||
|
|
@Nullable @JsonProperty("sheet_name") String sheetName
|
||
|
|
) {
|
||
|
|
public static Builder builder() {
|
||
|
|
return new Builder();
|
||
|
|
}
|
||
|
|
|
||
|
|
// CPD-OFF
|
||
|
|
@JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
|
||
|
|
public static final class Builder {
|
||
|
|
|
||
|
|
@JsonProperty("page_number")
|
||
|
|
private int pageNumber = 0;
|
||
|
|
private String content = "";
|
||
|
|
private List<Table> tables = null;
|
||
|
|
@JsonProperty("image_indices")
|
||
|
|
private List<Integer> imageIndices = null;
|
||
|
|
private PageHierarchy hierarchy = null;
|
||
|
|
@JsonProperty("is_blank")
|
||
|
|
private Boolean isBlank = null;
|
||
|
|
@JsonProperty("layout_regions")
|
||
|
|
private List<LayoutRegion> layoutRegions = null;
|
||
|
|
@JsonProperty("speaker_notes")
|
||
|
|
private String speakerNotes = null;
|
||
|
|
@JsonProperty("section_name")
|
||
|
|
private String sectionName = null;
|
||
|
|
@JsonProperty("sheet_name")
|
||
|
|
private String sheetName = null;
|
||
|
|
|
||
|
|
/** Sets the pageNumber field. */
|
||
|
|
@JsonProperty("page_number")
|
||
|
|
public Builder withPageNumber(final int value) {
|
||
|
|
this.pageNumber = value;
|
||
|
|
return this;
|
||
|
|
}
|
||
|
|
|
||
|
|
/** Sets the content field. */
|
||
|
|
@JsonProperty("content")
|
||
|
|
public Builder withContent(final String value) {
|
||
|
|
this.content = value;
|
||
|
|
return this;
|
||
|
|
}
|
||
|
|
|
||
|
|
/** Sets the tables field. */
|
||
|
|
@JsonProperty("tables")
|
||
|
|
public Builder withTables(final @Nullable List<Table> value) {
|
||
|
|
this.tables = value;
|
||
|
|
return this;
|
||
|
|
}
|
||
|
|
|
||
|
|
/** Sets the imageIndices field. */
|
||
|
|
@JsonProperty("image_indices")
|
||
|
|
public Builder withImageIndices(final @Nullable List<Integer> value) {
|
||
|
|
this.imageIndices = value;
|
||
|
|
return this;
|
||
|
|
}
|
||
|
|
|
||
|
|
/** Sets the hierarchy field. */
|
||
|
|
@JsonProperty("hierarchy")
|
||
|
|
public Builder withHierarchy(final @Nullable PageHierarchy value) {
|
||
|
|
this.hierarchy = value;
|
||
|
|
return this;
|
||
|
|
}
|
||
|
|
|
||
|
|
/** Sets the isBlank field. */
|
||
|
|
@JsonProperty("is_blank")
|
||
|
|
public Builder withIsBlank(final @Nullable boolean value) {
|
||
|
|
this.isBlank = value;
|
||
|
|
return this;
|
||
|
|
}
|
||
|
|
|
||
|
|
/** Sets the layoutRegions field. */
|
||
|
|
@JsonProperty("layout_regions")
|
||
|
|
public Builder withLayoutRegions(final @Nullable List<LayoutRegion> value) {
|
||
|
|
this.layoutRegions = value;
|
||
|
|
return this;
|
||
|
|
}
|
||
|
|
|
||
|
|
/** Sets the speakerNotes field. */
|
||
|
|
@JsonProperty("speaker_notes")
|
||
|
|
public Builder withSpeakerNotes(final @Nullable String value) {
|
||
|
|
this.speakerNotes = value;
|
||
|
|
return this;
|
||
|
|
}
|
||
|
|
|
||
|
|
/** Sets the sectionName field. */
|
||
|
|
@JsonProperty("section_name")
|
||
|
|
public Builder withSectionName(final @Nullable String value) {
|
||
|
|
this.sectionName = value;
|
||
|
|
return this;
|
||
|
|
}
|
||
|
|
|
||
|
|
/** Sets the sheetName field. */
|
||
|
|
@JsonProperty("sheet_name")
|
||
|
|
public Builder withSheetName(final @Nullable String value) {
|
||
|
|
this.sheetName = value;
|
||
|
|
return this;
|
||
|
|
}
|
||
|
|
|
||
|
|
/** Builds the PageContent instance. */
|
||
|
|
public PageContent build() {
|
||
|
|
return new PageContent(
|
||
|
|
pageNumber,
|
||
|
|
content,
|
||
|
|
tables,
|
||
|
|
imageIndices,
|
||
|
|
hierarchy,
|
||
|
|
isBlank,
|
||
|
|
layoutRegions,
|
||
|
|
speakerNotes,
|
||
|
|
sectionName,
|
||
|
|
sheetName
|
||
|
|
);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
// CPD-ON
|
||
|
|
}
|