This commit is contained in:
203
packages/java/dev/kreuzberg/ChunkingConfig.java
generated
Normal file
203
packages/java/dev/kreuzberg/ChunkingConfig.java
generated
Normal file
@@ -0,0 +1,203 @@
|
||||
// This file is auto-generated by alef — DO NOT EDIT.
|
||||
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
// To regenerate: alef generate
|
||||
// To verify freshness: alef verify --exit-code
|
||||
// Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
package dev.kreuzberg;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonProperty;
|
||||
import com.fasterxml.jackson.annotation.JsonInclude;
|
||||
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
|
||||
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
|
||||
import org.jspecify.annotations.Nullable;
|
||||
|
||||
/**
|
||||
* Chunking configuration.
|
||||
*
|
||||
* Configures text chunking for document content, including chunk size,
|
||||
* overlap, trimming behavior, and optional embeddings.
|
||||
*
|
||||
* Use {@code ..Default.default()} when constructing to allow for future field additions:
|
||||
*
|
||||
* let config = ChunkingConfig {
|
||||
* max_characters: 500,
|
||||
* ..Default.default()
|
||||
* };
|
||||
*
|
||||
*/
|
||||
@JsonInclude(JsonInclude.Include.NON_ABSENT)
|
||||
@JsonDeserialize(builder = ChunkingConfig.Builder.class)
|
||||
public record ChunkingConfig(
|
||||
/**
|
||||
* Maximum size per chunk (in units determined by {@code sizing}).
|
||||
*
|
||||
* When {@code sizing} is {@code Characters} (default), this is the max character count.
|
||||
* When using token-based sizing, this is the max token count.
|
||||
*
|
||||
* Default: 1000
|
||||
*/
|
||||
@Nullable @JsonProperty("max_chars") Long maxCharacters,
|
||||
/**
|
||||
* Overlap between chunks (in units determined by {@code sizing}).
|
||||
*
|
||||
* Default: 200
|
||||
*/
|
||||
@Nullable @JsonProperty("max_overlap") Long overlap,
|
||||
/**
|
||||
* Whether to trim whitespace from chunk boundaries.
|
||||
*
|
||||
* Default: true
|
||||
*/
|
||||
@Nullable @JsonProperty("trim") Boolean trim,
|
||||
/**
|
||||
* Type of chunker to use (Text or Markdown).
|
||||
*
|
||||
* Default: Text
|
||||
*/
|
||||
@Nullable @JsonProperty("chunker_type") ChunkerType chunkerType,
|
||||
/**
|
||||
* Optional embedding configuration for chunk embeddings.
|
||||
*/
|
||||
@Nullable @JsonProperty("embedding") EmbeddingConfig embedding,
|
||||
/**
|
||||
* Use a preset configuration (overrides individual settings if provided).
|
||||
*/
|
||||
@Nullable @JsonProperty("preset") String preset,
|
||||
/**
|
||||
* How to measure chunk size.
|
||||
*
|
||||
* Default: {@code Characters} (Unicode character count).
|
||||
* Enable {@code chunking-tiktoken} or {@code chunking-tokenizers} features for token-based sizing.
|
||||
*/
|
||||
@Nullable @JsonProperty("sizing") ChunkSizing sizing,
|
||||
/**
|
||||
* When {@code true} and {@code chunker_type} is {@code Markdown}, prepend the heading hierarchy
|
||||
* path (e.g. {@code "# Title > ## Section\n\n"}) to each chunk's content string.
|
||||
*
|
||||
* This is useful for RAG pipelines where each chunk needs self-contained
|
||||
* context about its position in the document structure.
|
||||
*
|
||||
* Default: {@code false}
|
||||
*/
|
||||
@Nullable @JsonProperty("prepend_heading_context") Boolean prependHeadingContext,
|
||||
/**
|
||||
* Optional cosine similarity threshold for semantic topic boundary detection.
|
||||
*
|
||||
* Only used when {@code chunker_type} is {@code Semantic} and an {@code EmbeddingConfig} is
|
||||
* provided. You almost never need to set this. When omitted, defaults to
|
||||
* {@code 0.75} which works well for most documents. Lower values detect more
|
||||
* topic boundaries (more, smaller chunks); higher values detect fewer.
|
||||
* Range: {@code 0.0..=1.0}.
|
||||
*/
|
||||
@Nullable @JsonProperty("topic_threshold") Float topicThreshold
|
||||
) {
|
||||
public static Builder builder() {
|
||||
return new Builder();
|
||||
}
|
||||
public ChunkingConfig{
|
||||
if (maxCharacters == null) maxCharacters = 1000L;
|
||||
if (overlap == null) overlap = 200L;
|
||||
}
|
||||
|
||||
// CPD-OFF
|
||||
@JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
|
||||
public static final class Builder {
|
||||
|
||||
@JsonProperty("max_chars")
|
||||
private Long maxCharacters = null;
|
||||
@JsonProperty("max_overlap")
|
||||
private Long overlap = null;
|
||||
private Boolean trim = null;
|
||||
@JsonProperty("chunker_type")
|
||||
@Nullable private ChunkerType chunkerType = ChunkerType.Text;
|
||||
private EmbeddingConfig embedding = null;
|
||||
private String preset = null;
|
||||
@Nullable private ChunkSizing sizing = new ChunkSizing.Characters();
|
||||
@JsonProperty("prepend_heading_context")
|
||||
private Boolean prependHeadingContext = null;
|
||||
@JsonProperty("topic_threshold")
|
||||
private Float topicThreshold = null;
|
||||
|
||||
/** Sets the maxCharacters field. */
|
||||
@JsonProperty("max_chars")
|
||||
public Builder withMaxCharacters(final @Nullable Long value) {
|
||||
this.maxCharacters = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the overlap field. */
|
||||
@JsonProperty("max_overlap")
|
||||
public Builder withOverlap(final @Nullable Long value) {
|
||||
this.overlap = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the trim field. */
|
||||
@JsonProperty("trim")
|
||||
public Builder withTrim(final @Nullable Boolean value) {
|
||||
this.trim = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the chunkerType field. */
|
||||
@JsonProperty("chunker_type")
|
||||
public Builder withChunkerType(final @Nullable ChunkerType value) {
|
||||
this.chunkerType = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the embedding field. */
|
||||
@JsonProperty("embedding")
|
||||
public Builder withEmbedding(final @Nullable EmbeddingConfig value) {
|
||||
this.embedding = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the preset field. */
|
||||
@JsonProperty("preset")
|
||||
public Builder withPreset(final @Nullable String value) {
|
||||
this.preset = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the sizing field. */
|
||||
@JsonProperty("sizing")
|
||||
public Builder withSizing(final @Nullable ChunkSizing value) {
|
||||
this.sizing = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the prependHeadingContext field. */
|
||||
@JsonProperty("prepend_heading_context")
|
||||
public Builder withPrependHeadingContext(final @Nullable Boolean value) {
|
||||
this.prependHeadingContext = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the topicThreshold field. */
|
||||
@JsonProperty("topic_threshold")
|
||||
public Builder withTopicThreshold(final @Nullable Float value) {
|
||||
this.topicThreshold = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Builds the ChunkingConfig instance. */
|
||||
public ChunkingConfig build() {
|
||||
return new ChunkingConfig(
|
||||
maxCharacters,
|
||||
overlap,
|
||||
trim,
|
||||
chunkerType,
|
||||
embedding,
|
||||
preset,
|
||||
sizing,
|
||||
prependHeadingContext,
|
||||
topicThreshold
|
||||
);
|
||||
}
|
||||
}
|
||||
// CPD-ON
|
||||
public static ChunkingConfig defaultInstance() {
|
||||
throw new UnsupportedOperationException("defaultInstance is not yet bridged via JNI; use the Builder instead.");
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user