26 lines
1.0 KiB
Elixir
26 lines
1.0 KiB
Elixir
|
|
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||
|
|
defmodule Kreuzberg.ChunkSizing do
  @moduledoc """
  How chunk size is measured.

  Defaults to `Characters` (Unicode character count). When using token-based sizing,
  chunks are sized by token count according to the specified tokenizer.

  Token-based sizing uses HuggingFace tokenizers loaded at runtime. Any tokenizer
  available on HuggingFace Hub can be used, including OpenAI-compatible tokenizers
  (e.g., `Xenova/gpt-4o`, `Xenova/cl100k_base`).

  ## Types

    * `t:characters/0` - the `:characters` atom; size is counted in Unicode characters.
    * `t:tokenizer/0` - a map tagged with `type: :tokenizer` that carries the
      `:model` and `:cache_dir` settings for token-based sizing.
    * `t:t/0` - the general chunk-sizing value accepted by this module's callers.
  """

  # NOTE(review): `t` is declared as `term()`, so it does not reference the two
  # variant types defined below. Confirm against the generator whether it is
  # meant to be `characters() | tokenizer()` before narrowing it.
  @typedoc "How chunk size is measured."
  @type t :: term()

  @typedoc "Size measured in Unicode characters (default)."
  @type characters :: :characters

  # NOTE(review): as written, both `:model` and `:cache_dir` are required keys
  # holding strings. Presumably `:model` is a HuggingFace model identifier such as
  # the examples in the moduledoc — confirm, and confirm whether `:cache_dir`
  # may be absent or `nil` in practice.
  @typedoc "Size measured in tokens from a HuggingFace tokenizer."
  @type tokenizer :: %{type: :tokenizer, model: String.t(), cache_dir: String.t()}
end
|