// This file is auto-generated by alef — DO NOT EDIT. // alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75 // To regenerate: alef generate // To verify freshness: alef verify --exit-code // Issues & docs: https://github.com/kreuzberg-dev/alef package dev.kreuzberg; import com.fasterxml.jackson.annotation.JsonProperty; import com.fasterxml.jackson.annotation.JsonInclude; import com.fasterxml.jackson.databind.annotation.JsonDeserialize; import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder; import org.jspecify.annotations.Nullable; /** * Chunking configuration. * * Configures text chunking for document content, including chunk size, * overlap, trimming behavior, and optional embeddings. * * Use {@code ..Default.default()} when constructing to allow for future field additions: * * let config = ChunkingConfig { * max_characters: 500, * ..Default.default() * }; * */ @JsonInclude(JsonInclude.Include.NON_ABSENT) @JsonDeserialize(builder = ChunkingConfig.Builder.class) public record ChunkingConfig( /** * Maximum size per chunk (in units determined by {@code sizing}). * * When {@code sizing} is {@code Characters} (default), this is the max character count. * When using token-based sizing, this is the max token count. * * Default: 1000 */ @Nullable @JsonProperty("max_chars") Long maxCharacters, /** * Overlap between chunks (in units determined by {@code sizing}). * * Default: 200 */ @Nullable @JsonProperty("max_overlap") Long overlap, /** * Whether to trim whitespace from chunk boundaries. * * Default: true */ @Nullable @JsonProperty("trim") Boolean trim, /** * Type of chunker to use (Text or Markdown). * * Default: Text */ @Nullable @JsonProperty("chunker_type") ChunkerType chunkerType, /** * Optional embedding configuration for chunk embeddings. */ @Nullable @JsonProperty("embedding") EmbeddingConfig embedding, /** * Use a preset configuration (overrides individual settings if provided). */ @Nullable @JsonProperty("preset") String preset, /** * How to measure chunk size. * * Default: {@code Characters} (Unicode character count). * Enable {@code chunking-tiktoken} or {@code chunking-tokenizers} features for token-based sizing. */ @Nullable @JsonProperty("sizing") ChunkSizing sizing, /** * When {@code true} and {@code chunker_type} is {@code Markdown}, prepend the heading hierarchy * path (e.g. {@code "# Title > ## Section\n\n"}) to each chunk's content string. * * This is useful for RAG pipelines where each chunk needs self-contained * context about its position in the document structure. * * Default: {@code false} */ @Nullable @JsonProperty("prepend_heading_context") Boolean prependHeadingContext, /** * Optional cosine similarity threshold for semantic topic boundary detection. * * Only used when {@code chunker_type} is {@code Semantic} and an {@code EmbeddingConfig} is * provided. You almost never need to set this. When omitted, defaults to * {@code 0.75} which works well for most documents. Lower values detect more * topic boundaries (more, smaller chunks); higher values detect fewer. * Range: {@code 0.0..=1.0}. */ @Nullable @JsonProperty("topic_threshold") Float topicThreshold ) { public static Builder builder() { return new Builder(); } public ChunkingConfig{ if (maxCharacters == null) maxCharacters = 1000L; if (overlap == null) overlap = 200L; } // CPD-OFF @JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build") public static final class Builder { @JsonProperty("max_chars") private Long maxCharacters = null; @JsonProperty("max_overlap") private Long overlap = null; private Boolean trim = null; @JsonProperty("chunker_type") @Nullable private ChunkerType chunkerType = ChunkerType.Text; private EmbeddingConfig embedding = null; private String preset = null; @Nullable private ChunkSizing sizing = new ChunkSizing.Characters(); @JsonProperty("prepend_heading_context") private Boolean prependHeadingContext = null; @JsonProperty("topic_threshold") private Float topicThreshold = null; /** Sets the maxCharacters field. */ @JsonProperty("max_chars") public Builder withMaxCharacters(final @Nullable Long value) { this.maxCharacters = value; return this; } /** Sets the overlap field. */ @JsonProperty("max_overlap") public Builder withOverlap(final @Nullable Long value) { this.overlap = value; return this; } /** Sets the trim field. */ @JsonProperty("trim") public Builder withTrim(final @Nullable Boolean value) { this.trim = value; return this; } /** Sets the chunkerType field. */ @JsonProperty("chunker_type") public Builder withChunkerType(final @Nullable ChunkerType value) { this.chunkerType = value; return this; } /** Sets the embedding field. */ @JsonProperty("embedding") public Builder withEmbedding(final @Nullable EmbeddingConfig value) { this.embedding = value; return this; } /** Sets the preset field. */ @JsonProperty("preset") public Builder withPreset(final @Nullable String value) { this.preset = value; return this; } /** Sets the sizing field. */ @JsonProperty("sizing") public Builder withSizing(final @Nullable ChunkSizing value) { this.sizing = value; return this; } /** Sets the prependHeadingContext field. */ @JsonProperty("prepend_heading_context") public Builder withPrependHeadingContext(final @Nullable Boolean value) { this.prependHeadingContext = value; return this; } /** Sets the topicThreshold field. */ @JsonProperty("topic_threshold") public Builder withTopicThreshold(final @Nullable Float value) { this.topicThreshold = value; return this; } /** Builds the ChunkingConfig instance. */ public ChunkingConfig build() { return new ChunkingConfig( maxCharacters, overlap, trim, chunkerType, embedding, preset, sizing, prependHeadingContext, topicThreshold ); } } // CPD-ON public static ChunkingConfig defaultInstance() { throw new UnsupportedOperationException("defaultInstance is not yet bridged via JNI; use the Builder instead."); } }