Files
fil/packages/java/dev/kreuzberg/ChunkingConfig.java

204 lines
6.9 KiB
Java
Raw Normal View History

2026-06-01 23:40:55 +02:00
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
import org.jspecify.annotations.Nullable;
/**
 * Chunking configuration.
 *
 * <p>Configures text chunking for document content, including chunk size,
 * overlap, trimming behavior, and optional embeddings.
 *
 * <p>Prefer the {@link Builder} over the canonical constructor so that call sites keep
 * compiling when components are added in the future:
 *
 * <pre>{@code
 * ChunkingConfig config = ChunkingConfig.builder()
 *     .withMaxCharacters(500L)
 *     .build();
 * }</pre>
 *
 * <p>Defaults applied by this class: the canonical constructor replaces a {@code null}
 * {@code maxCharacters} with {@code 1000} and a {@code null} {@code overlap} with {@code 200}.
 * A fresh {@link Builder} additionally starts with {@code chunkerType = ChunkerType.Text} and
 * {@code sizing = new ChunkSizing.Characters()}; the canonical constructor does not apply those
 * two. Every other component stays {@code null} unless set explicitly, and {@code null}
 * components are left out of the serialized JSON ({@code JsonInclude.Include.NON_ABSENT}).
 *
 * @param maxCharacters maximum size per chunk, in units determined by {@code sizing}. When
 *     {@code sizing} is {@code Characters} (default), this is the max character count; when
 *     using token-based sizing, this is the max token count. A {@code null} argument is replaced
 *     with {@code 1000}. JSON property: {@code max_chars}.
 * @param overlap overlap between chunks, in units determined by {@code sizing}. A {@code null}
 *     argument is replaced with {@code 200}. JSON property: {@code max_overlap}.
 * @param trim whether to trim whitespace from chunk boundaries. Documented default:
 *     {@code true}; this class leaves a {@code null} value untouched.
 * @param chunkerType type of chunker to use (Text or Markdown). Documented default: Text; a
 *     fresh {@link Builder} starts with {@code ChunkerType.Text}, while the canonical
 *     constructor leaves a {@code null} value untouched.
 * @param embedding optional embedding configuration for chunk embeddings.
 * @param preset use a preset configuration (overrides individual settings if provided).
 * @param sizing how to measure chunk size. Documented default: {@code Characters} (Unicode
 *     character count); a fresh {@link Builder} starts with {@code new ChunkSizing.Characters()}.
 *     Enable {@code chunking-tiktoken} or {@code chunking-tokenizers} features for token-based
 *     sizing.
 * @param prependHeadingContext when {@code true} and {@code chunker_type} is {@code Markdown},
 *     prepend the heading hierarchy path (e.g. {@code "# Title > ## Section\n\n"}) to each
 *     chunk's content string. This is useful for RAG pipelines where each chunk needs
 *     self-contained context about its position in the document structure. Documented default:
 *     {@code false}.
 * @param topicThreshold optional cosine similarity threshold for semantic topic boundary
 *     detection. Only used when {@code chunker_type} is {@code Semantic} and an
 *     {@code EmbeddingConfig} is provided. You almost never need to set this. When omitted,
 *     defaults to {@code 0.75}, which works well for most documents. Lower values detect more
 *     topic boundaries (more, smaller chunks); higher values detect fewer. Range:
 *     {@code 0.0..=1.0}. This class performs no range check.
 */
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = ChunkingConfig.Builder.class)
public record ChunkingConfig(
    @Nullable @JsonProperty("max_chars") Long maxCharacters,
    @Nullable @JsonProperty("max_overlap") Long overlap,
    @Nullable @JsonProperty("trim") Boolean trim,
    @Nullable @JsonProperty("chunker_type") ChunkerType chunkerType,
    @Nullable @JsonProperty("embedding") EmbeddingConfig embedding,
    @Nullable @JsonProperty("preset") String preset,
    @Nullable @JsonProperty("sizing") ChunkSizing sizing,
    @Nullable @JsonProperty("prepend_heading_context") Boolean prependHeadingContext,
    @Nullable @JsonProperty("topic_threshold") Float topicThreshold
) {
    /** Value substituted when {@code maxCharacters} is passed as {@code null}. */
    private static final long DEFAULT_MAX_CHARACTERS = 1000L;

    /** Value substituted when {@code overlap} is passed as {@code null}. */
    private static final long DEFAULT_OVERLAP = 200L;

    /**
     * Creates a builder pre-populated with the builder-side defaults.
     *
     * @return a new, independent {@link Builder}
     */
    public static Builder builder() {
        return new Builder();
    }

    /**
     * Canonical constructor. Substitutes the defaults for {@code maxCharacters} and
     * {@code overlap} when they are {@code null}; all other components are stored as given.
     */
    public ChunkingConfig {
        if (maxCharacters == null) {
            maxCharacters = DEFAULT_MAX_CHARACTERS;
        }
        if (overlap == null) {
            overlap = DEFAULT_OVERLAP;
        }
    }

    // CPD-OFF
    /**
     * Mutable builder for {@link ChunkingConfig}; also used by Jackson for deserialization
     * (setter prefix {@code with}, terminal method {@code build}).
     */
    @JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
    public static final class Builder {
        // Left null on purpose: the record's canonical constructor supplies the default.
        @JsonProperty("max_chars")
        @Nullable private Long maxCharacters;
        // Left null on purpose: the record's canonical constructor supplies the default.
        @JsonProperty("max_overlap")
        @Nullable private Long overlap;
        @Nullable private Boolean trim;
        @JsonProperty("chunker_type")
        @Nullable private ChunkerType chunkerType = ChunkerType.Text;
        @Nullable private EmbeddingConfig embedding;
        @Nullable private String preset;
        @Nullable private ChunkSizing sizing = new ChunkSizing.Characters();
        @JsonProperty("prepend_heading_context")
        @Nullable private Boolean prependHeadingContext;
        @JsonProperty("topic_threshold")
        @Nullable private Float topicThreshold;

        /**
         * Sets the maximum chunk size (JSON property {@code max_chars}).
         *
         * @param value the new value; {@code null} makes {@link #build()} fall back to
         *     {@code 1000}
         * @return this builder
         */
        @JsonProperty("max_chars")
        public Builder withMaxCharacters(final @Nullable Long value) {
            this.maxCharacters = value;
            return this;
        }

        /**
         * Sets the overlap between chunks (JSON property {@code max_overlap}).
         *
         * @param value the new value; {@code null} makes {@link #build()} fall back to
         *     {@code 200}
         * @return this builder
         */
        @JsonProperty("max_overlap")
        public Builder withOverlap(final @Nullable Long value) {
            this.overlap = value;
            return this;
        }

        /**
         * Sets whether whitespace is trimmed from chunk boundaries.
         *
         * @param value the new value, or {@code null} to leave the field unset
         * @return this builder
         */
        @JsonProperty("trim")
        public Builder withTrim(final @Nullable Boolean value) {
            this.trim = value;
            return this;
        }

        /**
         * Sets the chunker type (JSON property {@code chunker_type}).
         *
         * @param value the new value; {@code null} clears the builder's initial
         *     {@code ChunkerType.Text}
         * @return this builder
         */
        @JsonProperty("chunker_type")
        public Builder withChunkerType(final @Nullable ChunkerType value) {
            this.chunkerType = value;
            return this;
        }

        /**
         * Sets the embedding configuration.
         *
         * @param value the new value, or {@code null} to leave the field unset
         * @return this builder
         */
        @JsonProperty("embedding")
        public Builder withEmbedding(final @Nullable EmbeddingConfig value) {
            this.embedding = value;
            return this;
        }

        /**
         * Sets the preset name.
         *
         * @param value the new value, or {@code null} to leave the field unset
         * @return this builder
         */
        @JsonProperty("preset")
        public Builder withPreset(final @Nullable String value) {
            this.preset = value;
            return this;
        }

        /**
         * Sets how chunk size is measured.
         *
         * @param value the new value; {@code null} clears the builder's initial
         *     {@code ChunkSizing.Characters}
         * @return this builder
         */
        @JsonProperty("sizing")
        public Builder withSizing(final @Nullable ChunkSizing value) {
            this.sizing = value;
            return this;
        }

        /**
         * Sets whether the heading hierarchy path is prepended to each chunk
         * (JSON property {@code prepend_heading_context}).
         *
         * @param value the new value, or {@code null} to leave the field unset
         * @return this builder
         */
        @JsonProperty("prepend_heading_context")
        public Builder withPrependHeadingContext(final @Nullable Boolean value) {
            this.prependHeadingContext = value;
            return this;
        }

        /**
         * Sets the semantic topic-boundary threshold (JSON property {@code topic_threshold}).
         *
         * @param value the new value, or {@code null} to leave the field unset
         * @return this builder
         */
        @JsonProperty("topic_threshold")
        public Builder withTopicThreshold(final @Nullable Float value) {
            this.topicThreshold = value;
            return this;
        }

        /**
         * Builds the {@link ChunkingConfig} from the builder's current state.
         *
         * @return a new record instance; {@code maxCharacters} and {@code overlap} are never
         *     {@code null} in the result because the canonical constructor defaults them
         */
        public ChunkingConfig build() {
            return new ChunkingConfig(
                maxCharacters,
                overlap,
                trim,
                chunkerType,
                embedding,
                preset,
                sizing,
                prependHeadingContext,
                topicThreshold
            );
        }
    }
    // CPD-ON

    /**
     * Placeholder for obtaining the default configuration; not implemented.
     *
     * @return never returns normally
     * @throws UnsupportedOperationException always; use {@link #builder()} instead
     */
    public static ChunkingConfig defaultInstance() {
        throw new UnsupportedOperationException("defaultInstance is not yet bridged via JNI; use the Builder instead.");
    }
}