204 lines
6.9 KiB
Java
Generated
204 lines
6.9 KiB
Java
Generated
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
|
|
|
|
import com.fasterxml.jackson.annotation.JsonProperty;
|
|
import com.fasterxml.jackson.annotation.JsonInclude;
|
|
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
|
|
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
|
|
import org.jspecify.annotations.Nullable;
|
|
|
|
/**
|
|
* Chunking configuration.
|
|
*
|
|
* Configures text chunking for document content, including chunk size,
|
|
* overlap, trimming behavior, and optional embeddings.
|
|
*
|
|
* Use {@code ..Default.default()} when constructing to allow for future field additions:
|
|
*
|
|
* let config = ChunkingConfig {
|
|
* max_characters: 500,
|
|
* ..Default.default()
|
|
* };
|
|
*
|
|
*/
|
|
@JsonInclude(JsonInclude.Include.NON_ABSENT)
|
|
@JsonDeserialize(builder = ChunkingConfig.Builder.class)
|
|
public record ChunkingConfig(
|
|
/**
|
|
* Maximum size per chunk (in units determined by {@code sizing}).
|
|
*
|
|
* When {@code sizing} is {@code Characters} (default), this is the max character count.
|
|
* When using token-based sizing, this is the max token count.
|
|
*
|
|
* Default: 1000
|
|
*/
|
|
@Nullable @JsonProperty("max_chars") Long maxCharacters,
|
|
/**
|
|
* Overlap between chunks (in units determined by {@code sizing}).
|
|
*
|
|
* Default: 200
|
|
*/
|
|
@Nullable @JsonProperty("max_overlap") Long overlap,
|
|
/**
|
|
* Whether to trim whitespace from chunk boundaries.
|
|
*
|
|
* Default: true
|
|
*/
|
|
@Nullable @JsonProperty("trim") Boolean trim,
|
|
/**
|
|
* Type of chunker to use (Text or Markdown).
|
|
*
|
|
* Default: Text
|
|
*/
|
|
@Nullable @JsonProperty("chunker_type") ChunkerType chunkerType,
|
|
/**
|
|
* Optional embedding configuration for chunk embeddings.
|
|
*/
|
|
@Nullable @JsonProperty("embedding") EmbeddingConfig embedding,
|
|
/**
|
|
* Use a preset configuration (overrides individual settings if provided).
|
|
*/
|
|
@Nullable @JsonProperty("preset") String preset,
|
|
/**
|
|
* How to measure chunk size.
|
|
*
|
|
* Default: {@code Characters} (Unicode character count).
|
|
* Enable {@code chunking-tiktoken} or {@code chunking-tokenizers} features for token-based sizing.
|
|
*/
|
|
@Nullable @JsonProperty("sizing") ChunkSizing sizing,
|
|
/**
|
|
* When {@code true} and {@code chunker_type} is {@code Markdown}, prepend the heading hierarchy
|
|
* path (e.g. {@code "# Title > ## Section\n\n"}) to each chunk's content string.
|
|
*
|
|
* This is useful for RAG pipelines where each chunk needs self-contained
|
|
* context about its position in the document structure.
|
|
*
|
|
* Default: {@code false}
|
|
*/
|
|
@Nullable @JsonProperty("prepend_heading_context") Boolean prependHeadingContext,
|
|
/**
|
|
* Optional cosine similarity threshold for semantic topic boundary detection.
|
|
*
|
|
* Only used when {@code chunker_type} is {@code Semantic} and an {@code EmbeddingConfig} is
|
|
* provided. You almost never need to set this. When omitted, defaults to
|
|
* {@code 0.75} which works well for most documents. Lower values detect more
|
|
* topic boundaries (more, smaller chunks); higher values detect fewer.
|
|
* Range: {@code 0.0..=1.0}.
|
|
*/
|
|
@Nullable @JsonProperty("topic_threshold") Float topicThreshold
|
|
) {
|
|
public static Builder builder() {
|
|
return new Builder();
|
|
}
|
|
public ChunkingConfig{
|
|
if (maxCharacters == null) maxCharacters = 1000L;
|
|
if (overlap == null) overlap = 200L;
|
|
}
|
|
|
|
// CPD-OFF
|
|
@JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
|
|
public static final class Builder {
|
|
|
|
@JsonProperty("max_chars")
|
|
private Long maxCharacters = null;
|
|
@JsonProperty("max_overlap")
|
|
private Long overlap = null;
|
|
private Boolean trim = null;
|
|
@JsonProperty("chunker_type")
|
|
@Nullable private ChunkerType chunkerType = ChunkerType.Text;
|
|
private EmbeddingConfig embedding = null;
|
|
private String preset = null;
|
|
@Nullable private ChunkSizing sizing = new ChunkSizing.Characters();
|
|
@JsonProperty("prepend_heading_context")
|
|
private Boolean prependHeadingContext = null;
|
|
@JsonProperty("topic_threshold")
|
|
private Float topicThreshold = null;
|
|
|
|
/** Sets the maxCharacters field. */
|
|
@JsonProperty("max_chars")
|
|
public Builder withMaxCharacters(final @Nullable Long value) {
|
|
this.maxCharacters = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the overlap field. */
|
|
@JsonProperty("max_overlap")
|
|
public Builder withOverlap(final @Nullable Long value) {
|
|
this.overlap = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the trim field. */
|
|
@JsonProperty("trim")
|
|
public Builder withTrim(final @Nullable Boolean value) {
|
|
this.trim = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the chunkerType field. */
|
|
@JsonProperty("chunker_type")
|
|
public Builder withChunkerType(final @Nullable ChunkerType value) {
|
|
this.chunkerType = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the embedding field. */
|
|
@JsonProperty("embedding")
|
|
public Builder withEmbedding(final @Nullable EmbeddingConfig value) {
|
|
this.embedding = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the preset field. */
|
|
@JsonProperty("preset")
|
|
public Builder withPreset(final @Nullable String value) {
|
|
this.preset = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the sizing field. */
|
|
@JsonProperty("sizing")
|
|
public Builder withSizing(final @Nullable ChunkSizing value) {
|
|
this.sizing = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the prependHeadingContext field. */
|
|
@JsonProperty("prepend_heading_context")
|
|
public Builder withPrependHeadingContext(final @Nullable Boolean value) {
|
|
this.prependHeadingContext = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the topicThreshold field. */
|
|
@JsonProperty("topic_threshold")
|
|
public Builder withTopicThreshold(final @Nullable Float value) {
|
|
this.topicThreshold = value;
|
|
return this;
|
|
}
|
|
|
|
/** Builds the ChunkingConfig instance. */
|
|
public ChunkingConfig build() {
|
|
return new ChunkingConfig(
|
|
maxCharacters,
|
|
overlap,
|
|
trim,
|
|
chunkerType,
|
|
embedding,
|
|
preset,
|
|
sizing,
|
|
prependHeadingContext,
|
|
topicThreshold
|
|
);
|
|
}
|
|
}
|
|
// CPD-ON
|
|
public static ChunkingConfig defaultInstance() {
|
|
throw new UnsupportedOperationException("defaultInstance is not yet bridged via JNI; use the Builder instead.");
|
|
}
|
|
}
|