// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
|
||
|
|
|
||
|
|
import java.util.List;
|
||
|
|
import com.fasterxml.jackson.annotation.JsonProperty;
|
||
|
|
import com.fasterxml.jackson.annotation.JsonInclude;
|
||
|
|
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
|
||
|
|
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
|
||
|
|
import org.jspecify.annotations.Nullable;
|
||
|
|
|
||
|
|
/**
|
||
|
|
* Metadata about a chunk's position in the original document.
|
||
|
|
*/
|
||
|
|
@JsonInclude(JsonInclude.Include.NON_ABSENT)
|
||
|
|
@JsonDeserialize(builder = ChunkMetadata.Builder.class)
|
||
|
|
public record ChunkMetadata(
|
||
|
|
/**
|
||
|
|
* Byte offset where this chunk starts in the original text (UTF-8 valid boundary).
|
||
|
|
*/
|
||
|
|
@JsonProperty("byte_start") long byteStart,
|
||
|
|
/**
|
||
|
|
* Byte offset where this chunk ends in the original text (UTF-8 valid boundary).
|
||
|
|
*/
|
||
|
|
@JsonProperty("byte_end") long byteEnd,
|
||
|
|
/**
|
||
|
|
* Number of tokens in this chunk (if available).
|
||
|
|
*
|
||
|
|
* This is calculated by the embedding model's tokenizer if embeddings are enabled.
|
||
|
|
*/
|
||
|
|
@Nullable @JsonProperty("token_count") Long tokenCount,
|
||
|
|
/**
|
||
|
|
* Zero-based index of this chunk in the document.
|
||
|
|
*/
|
||
|
|
@JsonProperty("chunk_index") long chunkIndex,
|
||
|
|
/**
|
||
|
|
* Total number of chunks in the document.
|
||
|
|
*/
|
||
|
|
@JsonProperty("total_chunks") long totalChunks,
|
||
|
|
/**
|
||
|
|
* First page number this chunk spans (1-indexed).
|
||
|
|
*
|
||
|
|
* Only populated when page tracking is enabled in extraction configuration.
|
||
|
|
*/
|
||
|
|
@Nullable @JsonProperty("first_page") Integer firstPage,
|
||
|
|
/**
|
||
|
|
* Last page number this chunk spans (1-indexed, equal to first_page for single-page chunks).
|
||
|
|
*
|
||
|
|
* Only populated when page tracking is enabled in extraction configuration.
|
||
|
|
*/
|
||
|
|
@Nullable @JsonProperty("last_page") Integer lastPage,
|
||
|
|
/**
|
||
|
|
* Heading context when using Markdown chunker.
|
||
|
|
*
|
||
|
|
* Contains the heading hierarchy this chunk falls under.
|
||
|
|
* Only populated when {@code ChunkerType.Markdown} is used.
|
||
|
|
*/
|
||
|
|
@Nullable @JsonProperty("heading_context") HeadingContext headingContext,
|
||
|
|
/**
|
||
|
|
* Indices into {@code ExtractionResult.images} for images on pages covered by this chunk.
|
||
|
|
*
|
||
|
|
* Contains zero-based indices into the top-level {@code images} collection for every
|
||
|
|
* image whose {@code page_number} falls within {@code [first_page, last_page]}.
|
||
|
|
* Empty when image extraction is disabled or the chunk spans no pages with images.
|
||
|
|
*/
|
||
|
|
@Nullable @JsonProperty("image_indices") List<Integer> imageIndices
|
||
|
|
) {
|
||
|
|
public static Builder builder() {
|
||
|
|
return new Builder();
|
||
|
|
}
|
||
|
|
|
||
|
|
// CPD-OFF
|
||
|
|
@JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
|
||
|
|
public static final class Builder {
|
||
|
|
|
||
|
|
@JsonProperty("byte_start")
|
||
|
|
private long byteStart = 0;
|
||
|
|
@JsonProperty("byte_end")
|
||
|
|
private long byteEnd = 0;
|
||
|
|
@JsonProperty("token_count")
|
||
|
|
private Long tokenCount = null;
|
||
|
|
@JsonProperty("chunk_index")
|
||
|
|
private long chunkIndex = 0;
|
||
|
|
@JsonProperty("total_chunks")
|
||
|
|
private long totalChunks = 0;
|
||
|
|
@JsonProperty("first_page")
|
||
|
|
private Integer firstPage = null;
|
||
|
|
@JsonProperty("last_page")
|
||
|
|
private Integer lastPage = null;
|
||
|
|
@JsonProperty("heading_context")
|
||
|
|
@Nullable private HeadingContext headingContext = null;
|
||
|
|
@JsonProperty("image_indices")
|
||
|
|
private List<Integer> imageIndices = null;
|
||
|
|
|
||
|
|
/** Sets the byteStart field. */
|
||
|
|
@JsonProperty("byte_start")
|
||
|
|
public Builder withByteStart(final long value) {
|
||
|
|
this.byteStart = value;
|
||
|
|
return this;
|
||
|
|
}
|
||
|
|
|
||
|
|
/** Sets the byteEnd field. */
|
||
|
|
@JsonProperty("byte_end")
|
||
|
|
public Builder withByteEnd(final long value) {
|
||
|
|
this.byteEnd = value;
|
||
|
|
return this;
|
||
|
|
}
|
||
|
|
|
||
|
|
/** Sets the tokenCount field. */
|
||
|
|
@JsonProperty("token_count")
|
||
|
|
public Builder withTokenCount(final @Nullable long value) {
|
||
|
|
this.tokenCount = value;
|
||
|
|
return this;
|
||
|
|
}
|
||
|
|
|
||
|
|
/** Sets the chunkIndex field. */
|
||
|
|
@JsonProperty("chunk_index")
|
||
|
|
public Builder withChunkIndex(final long value) {
|
||
|
|
this.chunkIndex = value;
|
||
|
|
return this;
|
||
|
|
}
|
||
|
|
|
||
|
|
/** Sets the totalChunks field. */
|
||
|
|
@JsonProperty("total_chunks")
|
||
|
|
public Builder withTotalChunks(final long value) {
|
||
|
|
this.totalChunks = value;
|
||
|
|
return this;
|
||
|
|
}
|
||
|
|
|
||
|
|
/** Sets the firstPage field. */
|
||
|
|
@JsonProperty("first_page")
|
||
|
|
public Builder withFirstPage(final @Nullable int value) {
|
||
|
|
this.firstPage = value;
|
||
|
|
return this;
|
||
|
|
}
|
||
|
|
|
||
|
|
/** Sets the lastPage field. */
|
||
|
|
@JsonProperty("last_page")
|
||
|
|
public Builder withLastPage(final @Nullable int value) {
|
||
|
|
this.lastPage = value;
|
||
|
|
return this;
|
||
|
|
}
|
||
|
|
|
||
|
|
/** Sets the headingContext field. */
|
||
|
|
@JsonProperty("heading_context")
|
||
|
|
public Builder withHeadingContext(final @Nullable HeadingContext value) {
|
||
|
|
this.headingContext = value;
|
||
|
|
return this;
|
||
|
|
}
|
||
|
|
|
||
|
|
/** Sets the imageIndices field. */
|
||
|
|
@JsonProperty("image_indices")
|
||
|
|
public Builder withImageIndices(final @Nullable List<Integer> value) {
|
||
|
|
this.imageIndices = value;
|
||
|
|
return this;
|
||
|
|
}
|
||
|
|
|
||
|
|
/** Builds the ChunkMetadata instance. */
|
||
|
|
public ChunkMetadata build() {
|
||
|
|
return new ChunkMetadata(
|
||
|
|
byteStart,
|
||
|
|
byteEnd,
|
||
|
|
tokenCount,
|
||
|
|
chunkIndex,
|
||
|
|
totalChunks,
|
||
|
|
firstPage,
|
||
|
|
lastPage,
|
||
|
|
headingContext,
|
||
|
|
imageIndices
|
||
|
|
);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
// CPD-ON
|
||
|
|
}
|