// This file is auto-generated by alef — DO NOT EDIT. // alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75 // To regenerate: alef generate // To verify freshness: alef verify --exit-code // Issues & docs: https://github.com/kreuzberg-dev/alef package dev.kreuzberg; import com.fasterxml.jackson.annotation.JsonProperty; import com.fasterxml.jackson.annotation.JsonSubTypes; import com.fasterxml.jackson.annotation.JsonTypeInfo; import java.util.Optional; /** * How chunk size is measured. * * Defaults to {@code Characters} (Unicode character count). When using token-based sizing, * chunks are sized by token count according to the specified tokenizer. * * Token-based sizing uses HuggingFace tokenizers loaded at runtime. Any tokenizer * available on HuggingFace Hub can be used, including OpenAI-compatible tokenizers * (e.g., {@code Xenova/gpt-4o}, {@code Xenova/cl100k_base}). */ @JsonTypeInfo(use = JsonTypeInfo.Id.NAME, property = "type", visible = false) @JsonSubTypes({ @JsonSubTypes.Type(value = ChunkSizing.Characters.class, name = "characters"), @JsonSubTypes.Type(value = ChunkSizing.Tokenizer.class, name = "tokenizer") }) @com.fasterxml.jackson.annotation.JsonIgnoreProperties(ignoreUnknown = true) public sealed interface ChunkSizing { /** Size measured in Unicode characters (default). */ record Characters() implements ChunkSizing { } /** Size measured in tokens from a HuggingFace tokenizer. */ record Tokenizer( @JsonProperty("model") String model, @JsonProperty("cache_dir") Optional cacheDir ) implements ChunkSizing { } }