Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

276
packages/elixir/lib/kreuzberg.ex generated Normal file
View File

@@ -0,0 +1,276 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg do
  @moduledoc """
  High-level API for kreuzberg.

  Each function here forwards its arguments to the same-named function in
  `Kreuzberg.Native` and returns whatever that call returns.
  """

  alias Kreuzberg.Native

  # --- Extraction -----------------------------------------------------------

  @doc """
  Extract content from a byte array.

  Calls the native function with `nil` as the configuration.
  """
  @spec extract_bytes_async(binary(), String.t()) ::
          {:ok, map()} | {:error, atom(), String.t()}
  def extract_bytes_async(content, mime_type),
    do: Native.extract_bytes_async(content, mime_type, nil)

  @doc """
  Extract content from a byte array.

  `config` is handed to the native function unchanged; it may be `nil`.
  """
  @spec extract_bytes_async(binary(), String.t(), String.t() | nil) ::
          {:ok, map()} | {:error, atom(), String.t()}
  defdelegate extract_bytes_async(content, mime_type, config), to: Native

  @doc """
  Extract content from a file.

  ## Options

    * `:mime_type` - passed as the second native argument (`nil` when absent)
    * `:config` - passed as the third native argument (`nil` when absent)
  """
  @spec extract_file_async(String.t(), keyword()) ::
          {:ok, map()} | {:error, atom(), String.t()}
  def extract_file_async(path, opts \\ []) do
    mime_type = Keyword.get(opts, :mime_type)
    config = Keyword.get(opts, :config)
    Native.extract_file_async(path, mime_type, config)
  end

  @doc """
  Synchronous wrapper for `extract_file`.

  ## Options

    * `:mime_type` - passed as the second native argument (`nil` when absent)
    * `:config` - passed as the third native argument (`nil` when absent)
  """
  @spec extract_file_sync(String.t(), keyword()) ::
          {:ok, map()} | {:error, atom(), String.t()}
  def extract_file_sync(path, opts \\ []) do
    mime_type = Keyword.get(opts, :mime_type)
    config = Keyword.get(opts, :config)
    Native.extract_file_sync(path, mime_type, config)
  end

  @doc """
  Synchronous wrapper for `extract_bytes`.

  Calls the native function with `nil` as the configuration.
  """
  @spec extract_bytes_sync(binary(), String.t()) ::
          {:ok, map()} | {:error, atom(), String.t()}
  def extract_bytes_sync(content, mime_type),
    do: Native.extract_bytes_sync(content, mime_type, nil)

  @doc """
  Synchronous wrapper for `extract_bytes`.

  `config` is handed to the native function unchanged; it may be `nil`.
  """
  @spec extract_bytes_sync(binary(), String.t(), String.t() | nil) ::
          {:ok, map()} | {:error, atom(), String.t()}
  defdelegate extract_bytes_sync(content, mime_type, config), to: Native

  # --- MIME helpers ---------------------------------------------------------

  @doc "Detect MIME type from raw file bytes."
  @spec detect_mime_type_from_bytes(binary()) ::
          {:ok, String.t()} | {:error, atom(), String.t()}
  defdelegate detect_mime_type_from_bytes(content), to: Native

  @doc "Get file extensions for a given MIME type."
  @spec get_extensions_for_mime(String.t()) ::
          {:ok, [String.t()]} | {:error, atom(), String.t()}
  defdelegate get_extensions_for_mime(mime_type), to: Native

  # --- Registry listing / clearing -----------------------------------------

  @doc "Clear all embedding backends from the global registry."
  @spec clear_embedding_backends() :: {:ok, nil} | {:error, atom(), String.t()}
  defdelegate clear_embedding_backends(), to: Native

  @doc "List the names of all registered embedding backends."
  @spec list_embedding_backends() :: {:ok, [String.t()]} | {:error, atom(), String.t()}
  defdelegate list_embedding_backends(), to: Native

  @doc "List names of all registered document extractors."
  @spec list_document_extractors() :: {:ok, [String.t()]} | {:error, atom(), String.t()}
  defdelegate list_document_extractors(), to: Native

  @doc "Clear all document extractors from the global registry."
  @spec clear_document_extractors() :: {:ok, nil} | {:error, atom(), String.t()}
  defdelegate clear_document_extractors(), to: Native

  @doc "List all registered OCR backends."
  @spec list_ocr_backends() :: {:ok, [String.t()]} | {:error, atom(), String.t()}
  defdelegate list_ocr_backends(), to: Native

  @doc "Clear all OCR backends from the global registry."
  @spec clear_ocr_backends() :: {:ok, nil} | {:error, atom(), String.t()}
  defdelegate clear_ocr_backends(), to: Native

  @doc "List all registered post-processor names."
  @spec list_post_processors() :: {:ok, [String.t()]} | {:error, atom(), String.t()}
  defdelegate list_post_processors(), to: Native

  @doc "Remove all registered post-processors."
  @spec clear_post_processors() :: {:ok, nil} | {:error, atom(), String.t()}
  defdelegate clear_post_processors(), to: Native

  @doc "List names of all registered renderers."
  @spec list_renderers() :: {:ok, [String.t()]} | {:error, atom(), String.t()}
  defdelegate list_renderers(), to: Native

  @doc "Clear all renderers from the global registry."
  @spec clear_renderers() :: {:ok, nil} | {:error, atom(), String.t()}
  defdelegate clear_renderers(), to: Native

  @doc "List names of all registered validators."
  @spec list_validators() :: {:ok, [String.t()]} | {:error, atom(), String.t()}
  defdelegate list_validators(), to: Native

  @doc "Remove all registered validators."
  @spec clear_validators() :: {:ok, nil} | {:error, atom(), String.t()}
  defdelegate clear_validators(), to: Native

  # --- Diffing, embeddings, rendering ---------------------------------------

  @doc """
  Compare two extraction results and return a structured diff.

  The `:a`, `:b` and `:opts` entries of the keyword list are read (each is
  `nil` when absent) and passed, in that order, to the native function.
  """
  @spec compare(keyword()) :: map()
  def compare(opts \\ []) do
    left = Keyword.get(opts, :a)
    right = Keyword.get(opts, :b)
    diff_opts = Keyword.get(opts, :opts)
    Native.compare(left, right, diff_opts)
  end

  @doc """
  Generate embeddings asynchronously for a list of text strings.

  Calls the native function with `nil` as the configuration.
  """
  @spec embed_texts_async([String.t()]) :: {:ok, [[float()]]} | {:error, atom(), String.t()}
  def embed_texts_async(texts), do: Native.embed_texts_async(texts, nil)

  @doc """
  Generate embeddings asynchronously for a list of text strings.

  `config` is handed to the native function unchanged; it may be `nil`.
  """
  @spec embed_texts_async([String.t()], String.t() | nil) ::
          {:ok, [[float()]]} | {:error, atom(), String.t()}
  defdelegate embed_texts_async(texts, config), to: Native

  @doc """
  Render a single PDF page to PNG bytes.

  ## Options

    * `:dpi` - passed as the third native argument (`nil` when absent)
    * `:password` - passed as the fourth native argument (`nil` when absent)
  """
  @spec render_pdf_page_to_png(binary(), non_neg_integer(), keyword()) ::
          {:ok, binary()} | {:error, atom(), String.t()}
  def render_pdf_page_to_png(pdf_bytes, page_index, opts \\ []) do
    dpi = Keyword.get(opts, :dpi)
    password = Keyword.get(opts, :password)
    Native.render_pdf_page_to_png(pdf_bytes, page_index, dpi, password)
  end

  @doc "Detect the MIME type of a file at the given path."
  @spec detect_mime_type(String.t(), boolean()) ::
          {:ok, String.t()} | {:error, atom(), String.t()}
  defdelegate detect_mime_type(path, check_exists), to: Native

  @doc """
  Embed a list of texts using the configured embedding model.

  Calls the native function with `nil` as the configuration.
  """
  @spec embed_texts([String.t()]) :: {:ok, [[float()]]} | {:error, atom(), String.t()}
  def embed_texts(texts), do: Native.embed_texts(texts, nil)

  @doc """
  Embed a list of texts using the configured embedding model.

  `config` is handed to the native function unchanged; it may be `nil`.
  """
  @spec embed_texts([String.t()], String.t() | nil) ::
          {:ok, [[float()]]} | {:error, atom(), String.t()}
  defdelegate embed_texts(texts, config), to: Native

  @doc "Get an embedding preset by name."
  @spec get_embedding_preset(String.t()) :: map() | nil
  defdelegate get_embedding_preset(name), to: Native

  @doc "List the names of all available embedding presets."
  @spec list_embedding_presets() :: [String.t()]
  defdelegate list_embedding_presets(), to: Native

  # --- Plugin registration --------------------------------------------------

  @doc "Register an OcrBackend plugin with a GenServer PID and name."
  @spec register_ocr_backend(pid(), String.t()) :: :ok | :error
  defdelegate register_ocr_backend(genserver_pid, plugin_name), to: Native

  @doc "Unregister a previously registered OcrBackend plugin by name."
  @spec unregister_ocr_backend(String.t()) :: :ok | :error
  defdelegate unregister_ocr_backend(name), to: Native

  @doc "Register a PostProcessor plugin with a GenServer PID and name."
  @spec register_post_processor(pid(), String.t()) :: :ok | :error
  defdelegate register_post_processor(genserver_pid, plugin_name), to: Native

  @doc "Unregister a previously registered PostProcessor plugin by name."
  @spec unregister_post_processor(String.t()) :: :ok | :error
  defdelegate unregister_post_processor(name), to: Native

  @doc "Register a Validator plugin with a GenServer PID and name."
  @spec register_validator(pid(), String.t()) :: :ok | :error
  defdelegate register_validator(genserver_pid, plugin_name), to: Native

  @doc "Unregister a previously registered Validator plugin by name."
  @spec unregister_validator(String.t()) :: :ok | :error
  defdelegate unregister_validator(name), to: Native

  @doc "Register an EmbeddingBackend plugin with a GenServer PID and name."
  @spec register_embedding_backend(pid(), String.t()) :: :ok | :error
  defdelegate register_embedding_backend(genserver_pid, plugin_name), to: Native

  @doc "Unregister a previously registered EmbeddingBackend plugin by name."
  @spec unregister_embedding_backend(String.t()) :: :ok | :error
  defdelegate unregister_embedding_backend(name), to: Native

  @doc "Register a DocumentExtractor plugin with a GenServer PID and name."
  @spec register_document_extractor(pid(), String.t()) :: :ok | :error
  defdelegate register_document_extractor(genserver_pid, plugin_name), to: Native

  @doc "Unregister a previously registered DocumentExtractor plugin by name."
  @spec unregister_document_extractor(String.t()) :: :ok | :error
  defdelegate unregister_document_extractor(name), to: Native

  @doc "Register a Renderer plugin with a GenServer PID and name."
  @spec register_renderer(pid(), String.t()) :: :ok | :error
  defdelegate register_renderer(genserver_pid, plugin_name), to: Native

  @doc "Unregister a previously registered Renderer plugin by name."
  @spec unregister_renderer(String.t()) :: :ok | :error
  defdelegate unregister_renderer(name), to: Native
end

View File

@@ -0,0 +1,48 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.AccelerationConfig do
  @moduledoc """
  Hardware acceleration configuration for ONNX Runtime models.

  Controls which execution provider (CPU, CoreML, CUDA, TensorRT) is used
  for inference in layout detection and embedding generation.

  ## Example

      # Struct defaults: `provider: :auto`, `device_id: 0`
      config = %Kreuzberg.AccelerationConfig{}

  ## Rust equivalent

  The upstream Rust crate documents the type as follows:

  ```rust
  use kreuzberg::AccelerationConfig;
  // Auto-select: CoreML on macOS, CUDA on Linux, CPU elsewhere
  let config = AccelerationConfig::default();
  // Force CPU only
  let config = AccelerationConfig {
      provider: kreuzberg::ExecutionProviderType::Cpu,
      ..Default::default()
  };
  ```
  """

  @typedoc """
  Hardware acceleration configuration for ONNX Runtime models.

  `provider` accepts an atom as well as a string because the struct default
  is the atom `:auto`.
  """
  @type t :: %__MODULE__{
          provider: atom() | String.t() | nil,
          device_id: non_neg_integer()
        }

  defstruct provider: :auto,
            device_id: 0

  defimpl Jason.Encoder do
    @doc false
    def encode(value, opts) do
      # Fields set to nil are dropped so they are omitted from the JSON
      # object instead of being emitted as `null`.
      value
      |> Map.from_struct()
      |> Enum.reject(fn {_key, field} -> is_nil(field) end)
      |> Map.new()
      |> Jason.Encoder.encode(opts)
    end
  end
end

View File

@@ -0,0 +1,28 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.AnnotationKind do
  @moduledoc """
  Types of inline text annotations.

  This module only declares types; it defines no functions.
  """

  @typedoc "Types of inline text annotations."
  @type t :: term()

  # Variants without a payload are represented by a bare atom.

  @typedoc "The atom `:bold`."
  @type bold :: :bold

  @typedoc "The atom `:italic`."
  @type italic :: :italic

  @typedoc "The atom `:underline`."
  @type underline :: :underline

  @typedoc "The atom `:strikethrough`."
  @type strikethrough :: :strikethrough

  @typedoc "The atom `:code`."
  @type code :: :code

  @typedoc "The atom `:subscript`."
  @type subscript :: :subscript

  @typedoc "The atom `:superscript`."
  @type superscript :: :superscript

  @typedoc "Highlighted text (PDF highlights, HTML `<mark>`)."
  @type highlight :: :highlight

  # Variants with a payload are maps tagged through their `:type` key.

  @typedoc "A map tagged `type: :link` carrying `:url` and `:title` strings."
  @type link :: %{type: :link, url: String.t(), title: String.t()}

  @typedoc ~S|Text color (CSS-compatible value, e.g. "#ff0000", "red").|
  @type color :: %{type: :color, value: String.t()}

  @typedoc ~S|Font size with units (e.g. "12pt", "1.2em", "16px").|
  @type font_size :: %{type: :font_size, value: String.t()}

  @typedoc "Extensible annotation for format-specific styling."
  @type custom :: %{type: :custom, name: String.t(), value: String.t()}
end

View File

@@ -0,0 +1,24 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.ArchiveEntry do
  @moduledoc """
  A single file extracted from an archive.

  When archives (ZIP, TAR, 7Z, GZIP) are extracted with recursive extraction
  enabled, each processable file produces its own full `ExtractionResult`.

  All three fields default to `nil`.
  """

  @typedoc """
  A single file extracted from an archive.

  `result` is nilable because a bare `%Kreuzberg.ArchiveEntry{}` leaves it
  as `nil`.
  """
  @type t :: %__MODULE__{
          path: String.t() | nil,
          mime_type: String.t() | nil,
          result: map() | nil
        }

  defstruct path: nil,
            mime_type: nil,
            result: nil
end

View File

@@ -0,0 +1,38 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.ArchiveMetadata do
  @moduledoc """
  Archive (ZIP/TAR/7Z) metadata.

  Extracted from compressed archive files containing file lists and size information.
  """

  defstruct format: nil,
            file_count: 0,
            file_list: [],
            total_size: 0,
            compressed_size: nil

  @typedoc "Archive (ZIP/TAR/7Z) metadata."
  @type t :: %__MODULE__{
          format: String.t() | nil,
          file_count: non_neg_integer(),
          file_list: [String.t()],
          total_size: non_neg_integer(),
          compressed_size: non_neg_integer() | nil
        }

  defimpl Jason.Encoder do
    @doc false
    def encode(struct, opts) do
      # Keep only the fields that hold a value; nil fields are left out of the JSON.
      present =
        for {field, val} <- Map.from_struct(struct), not is_nil(val), into: %{} do
          {field, val}
        end

      Jason.Encoder.encode(present, opts)
    end
  end
end

21
packages/elixir/lib/kreuzberg/b_box.ex generated Normal file
View File

@@ -0,0 +1,21 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.BBox do
  @moduledoc """
  Bounding box in original image coordinates (x1, y1) top-left, (x2, y2) bottom-right.

  Every coordinate defaults to `0.0`.
  """

  defstruct [x1: 0.0, y1: 0.0, x2: 0.0, y2: 0.0]

  @typedoc "Bounding box in original image coordinates (x1, y1) top-left, (x2, y2) bottom-right."
  @type t :: %__MODULE__{x1: float(), y1: float(), x2: float(), y2: float()}
end

View File

@@ -0,0 +1,24 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.BatchBytesItem do
  @moduledoc """
  Batch item for byte array extraction.

  Used with `batch_extract_bytes` and `batch_extract_bytes_sync`
  to represent a single item in a batch extraction job.

  `content` defaults to an empty binary; `mime_type` and `config` default to `nil`.
  """

  defstruct [content: <<>>, mime_type: nil, config: nil]

  @typedoc "Batch item for byte array extraction."
  @type t :: %__MODULE__{
          content: binary(),
          mime_type: String.t() | nil,
          config: map() | nil
        }
end

View File

@@ -0,0 +1,22 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.BatchFileItem do
  @moduledoc """
  Batch item for file extraction.

  Used with `batch_extract_files` and `batch_extract_files_sync`
  to represent a single file in a batch extraction job.

  Both fields default to `nil`.
  """

  defstruct [path: nil, config: nil]

  @typedoc "Batch item for file extraction."
  @type t :: %__MODULE__{path: String.t() | nil, config: map() | nil}
end

View File

@@ -0,0 +1,34 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.BibtexMetadata do
  @moduledoc "BibTeX bibliography metadata."

  defstruct entry_count: 0,
            citation_keys: [],
            authors: [],
            year_range: nil,
            entry_types: nil

  @typedoc "BibTeX bibliography metadata."
  @type t :: %__MODULE__{
          entry_count: non_neg_integer(),
          citation_keys: [String.t()],
          authors: [String.t()],
          year_range: map() | nil,
          entry_types: map() | nil
        }

  defimpl Jason.Encoder do
    @doc false
    def encode(struct, opts) do
      # Keep only the fields that hold a value; nil fields are left out of the JSON.
      present =
        for {field, val} <- Map.from_struct(struct), not is_nil(val), into: %{} do
          {field, val}
        end

      Jason.Encoder.encode(present, opts)
    end
  end
end

View File

@@ -0,0 +1,77 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.BlockType do
  @moduledoc """
  Types of block-level elements in Djot.

  Each variant is an atom, and each has a zero-arity function of the same
  name that returns it.
  """

  @typedoc "Types of block-level elements in Djot."
  @type t ::
          :paragraph
          | :heading
          | :blockquote
          | :code_block
          | :list_item
          | :ordered_list
          | :bullet_list
          | :task_list
          | :definition_list
          | :definition_term
          | :definition_description
          | :div
          | :section
          | :thematic_break
          | :raw_block
          | :math_display

  @doc "Returns `:paragraph`."
  @spec paragraph() :: t()
  def paragraph, do: :paragraph

  @doc "Returns `:heading`."
  @spec heading() :: t()
  def heading, do: :heading

  @doc "Returns `:blockquote`."
  @spec blockquote() :: t()
  def blockquote, do: :blockquote

  @doc "Returns `:code_block`."
  @spec code_block() :: t()
  def code_block, do: :code_block

  @doc "Returns `:list_item`."
  @spec list_item() :: t()
  def list_item, do: :list_item

  @doc "Returns `:ordered_list`."
  @spec ordered_list() :: t()
  def ordered_list, do: :ordered_list

  @doc "Returns `:bullet_list`."
  @spec bullet_list() :: t()
  def bullet_list, do: :bullet_list

  @doc "Returns `:task_list`."
  @spec task_list() :: t()
  def task_list, do: :task_list

  @doc "Returns `:definition_list`."
  @spec definition_list() :: t()
  def definition_list, do: :definition_list

  @doc "Returns `:definition_term`."
  @spec definition_term() :: t()
  def definition_term, do: :definition_term

  @doc "Returns `:definition_description`."
  @spec definition_description() :: t()
  def definition_description, do: :definition_description

  @doc "Returns `:div`."
  @spec div() :: t()
  def div, do: :div

  @doc "Returns `:section`."
  @spec section() :: t()
  def section, do: :section

  @doc "Returns `:thematic_break`."
  @spec thematic_break() :: t()
  def thematic_break, do: :thematic_break

  @doc "Returns `:raw_block`."
  @spec raw_block() :: t()
  def raw_block, do: :raw_block

  @doc "Returns `:math_display`."
  @spec math_display() :: t()
  def math_display, do: :math_display
end

View File

@@ -0,0 +1,32 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.BoundingBox do
  @moduledoc """
  Bounding box coordinates for element positioning.

  Every coordinate defaults to `0.0`.
  """

  defstruct [x0: 0.0, y0: 0.0, x1: 0.0, y1: 0.0]

  @typedoc "Bounding box coordinates for element positioning."
  @type t :: %__MODULE__{x0: float(), y0: float(), x1: float(), y1: float()}

  defimpl Jason.Encoder do
    @doc false
    def encode(struct, opts) do
      # Keep only the fields that hold a value; nil fields are left out of the JSON.
      present =
        for {field, val} <- Map.from_struct(struct), not is_nil(val), into: %{} do
          {field, val}
        end

      Jason.Encoder.encode(present, opts)
    end
  end
end

View File

@@ -0,0 +1,22 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.CacheStats do
  @moduledoc false

  # Plain data struct; every field starts at zero (`0` for the file count,
  # `0.0` for the float-valued size and age fields).
  defstruct [
    total_files: 0,
    total_size_mb: 0.0,
    available_space_mb: 0.0,
    oldest_file_age_days: 0.0,
    newest_file_age_days: 0.0
  ]

  @type t :: %__MODULE__{
          total_files: non_neg_integer(),
          total_size_mb: float(),
          available_space_mb: float(),
          oldest_file_age_days: float(),
          newest_file_age_days: float()
        }
end

View File

@@ -0,0 +1,27 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.CellChange do
  @moduledoc """
  A single changed cell within a table.

  Defined here (rather than only in `crate::diff`) so `RevisionDelta` can
  reference it unconditionally, without requiring the `diff` Cargo feature.
  `crate::diff` re-exports this type verbatim.

  `row` and `col` default to `0`; `from` and `to` default to `nil`.
  """

  defstruct [row: 0, col: 0, from: nil, to: nil]

  @typedoc "A single changed cell within a table."
  @type t :: %__MODULE__{
          row: non_neg_integer(),
          col: non_neg_integer(),
          from: String.t() | nil,
          to: String.t() | nil
        }
end

27
packages/elixir/lib/kreuzberg/chunk.ex generated Normal file
View File

@@ -0,0 +1,27 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.Chunk do
  @moduledoc """
  A text chunk with optional embedding and metadata.

  Chunks are created when chunking is enabled in `ExtractionConfig`. Each chunk
  contains the text content, optional embedding vector (if embedding generation
  is configured), and metadata about its position in the document.

  A bare `%Kreuzberg.Chunk{}` has `chunk_type: :unknown` and `nil` for every
  other field.
  """

  @typedoc """
  A text chunk with optional embedding and metadata.

  `chunk_type` accepts the atoms of `t:Kreuzberg.ChunkType.t/0` in addition to
  strings because the struct default is `:unknown`. `metadata` is nilable
  because the struct default is `nil`.
  """
  @type t :: %__MODULE__{
          content: String.t() | nil,
          chunk_type: Kreuzberg.ChunkType.t() | String.t() | nil,
          embedding: [float()] | nil,
          metadata: map() | nil
        }

  defstruct content: nil,
            chunk_type: :unknown,
            embedding: nil,
            metadata: nil
end

View File

@@ -0,0 +1,31 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.ChunkMetadata do
  @moduledoc """
  Metadata about a chunk's position in the original document.

  Counter and offset fields default to `0`, `image_indices` to `[]`, and the
  optional fields to `nil`.
  """

  defstruct [
    byte_start: 0,
    byte_end: 0,
    token_count: nil,
    chunk_index: 0,
    total_chunks: 0,
    first_page: nil,
    last_page: nil,
    heading_context: nil,
    image_indices: []
  ]

  @typedoc "Metadata about a chunk's position in the original document."
  @type t :: %__MODULE__{
          byte_start: non_neg_integer(),
          byte_end: non_neg_integer(),
          token_count: non_neg_integer() | nil,
          chunk_index: non_neg_integer(),
          total_chunks: non_neg_integer(),
          first_page: non_neg_integer() | nil,
          last_page: non_neg_integer() | nil,
          heading_context: map() | nil,
          image_indices: [non_neg_integer()]
        }
end

View File

@@ -0,0 +1,25 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.ChunkSizing do
  @moduledoc """
  How chunk size is measured.

  Defaults to `Characters` (Unicode character count). When using token-based sizing,
  chunks are sized by token count according to the specified tokenizer.

  Token-based sizing uses HuggingFace tokenizers loaded at runtime. Any tokenizer
  available on HuggingFace Hub can be used, including OpenAI-compatible tokenizers
  (e.g., `Xenova/gpt-4o`, `Xenova/cl100k_base`).

  This module only declares types; it defines no functions.
  """

  @typedoc "How chunk size is measured."
  @type t :: term()

  @typedoc "Size measured in Unicode characters (default)."
  @type characters :: :characters

  @typedoc "Size measured in tokens from a HuggingFace tokenizer."
  @type tokenizer :: %{
          type: :tokenizer,
          model: String.t(),
          cache_dir: String.t()
        }
end

View File

@@ -0,0 +1,84 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.ChunkType do
  @moduledoc """
  Semantic structural classification of a text chunk.

  Assigned by the heuristic classifier in `chunking::classifier`.
  Defaults to `Unknown` when no rule matches.
  Designed to be extended in future versions without breaking changes.

  Each variant is an atom, and each has a zero-arity function of the same
  name that returns it.
  """

  @typedoc "Semantic structural classification of a text chunk."
  @type t ::
          :heading
          | :party_list
          | :definitions
          | :operative_clause
          | :signature_block
          | :schedule
          | :table_like
          | :formula
          | :code_block
          | :image
          | :org_chart
          | :diagram
          | :unknown

  @doc "Section heading or document title."
  @spec heading() :: t()
  def heading, do: :heading

  @doc "Party list: names, addresses, and signatories."
  @spec party_list() :: t()
  def party_list, do: :party_list

  @doc ~S|Definition clause ("X means…", "X shall mean…").|
  @spec definitions() :: t()
  def definitions, do: :definitions

  @doc "Operative clause containing legal/contractual action verbs."
  @spec operative_clause() :: t()
  def operative_clause, do: :operative_clause

  @doc "Signature block with signatures, names, and dates."
  @spec signature_block() :: t()
  def signature_block, do: :signature_block

  @doc "Schedule, annex, appendix, or exhibit section."
  @spec schedule() :: t()
  def schedule, do: :schedule

  @doc "Table-like content with aligned columns or repeated patterns."
  @spec table_like() :: t()
  def table_like, do: :table_like

  @doc "Mathematical formula or equation."
  @spec formula() :: t()
  def formula, do: :formula

  @doc "Code block or preformatted content."
  @spec code_block() :: t()
  def code_block, do: :code_block

  @doc "Embedded or referenced image content."
  @spec image() :: t()
  def image, do: :image

  @doc "Organizational chart or hierarchy diagram."
  @spec org_chart() :: t()
  def org_chart, do: :org_chart

  @doc "Diagram, figure, or visual illustration."
  @spec diagram() :: t()
  def diagram, do: :diagram

  @doc "Unclassified or mixed content."
  @spec unknown() :: t()
  def unknown, do: :unknown
end

View File

@@ -0,0 +1,40 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.ChunkerType do
  @moduledoc """
  Type of text chunker to use.

  ## Variants

    * `Text` - Generic text splitter, splits on whitespace and punctuation
    * `Markdown` - Markdown-aware splitter, preserves formatting and structure
    * `Yaml` - YAML-aware splitter, creates one chunk per top-level key
    * `Semantic` - Topic-aware chunker. With an `EmbeddingConfig`, splits at
      embedding-based topic shifts tuned by `topic_threshold` (default 0.75,
      lower = more splits). Without an embedding, falls back to a
      structural-boundary heuristic (ALL-CAPS headers, numbered sections,
      blank-line paragraphs) and merges groups into chunks capped at
      `max_characters` (default 1000). `topic_threshold` has no effect in the
      fallback path. For best results, pair with an embedding model.

  Each variant is an atom, and each has a zero-arity function of the same
  name that returns it.
  """

  @typedoc "Type of text chunker to use."
  @type t :: :text | :markdown | :yaml | :semantic

  @doc "Returns `:text`."
  @spec text() :: t()
  def text, do: :text

  @doc "Returns `:markdown`."
  @spec markdown() :: t()
  def markdown, do: :markdown

  @doc "Returns `:yaml`."
  @spec yaml() :: t()
  def yaml, do: :yaml

  @doc "Returns `:semantic`."
  @spec semantic() :: t()
  def semantic, do: :semantic
end

View File

@@ -0,0 +1,55 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.ChunkingConfig do
  @moduledoc """
  Chunking configuration.

  Configures text chunking for document content, including chunk size,
  overlap, trimming behavior, and optional embeddings.

  Set only the fields you need; the rest keep their struct defaults, which
  also leaves room for future field additions:

      config = %Kreuzberg.ChunkingConfig{max_characters: 500}

  ## Defaults

    * `max_characters` - `1_000`
    * `overlap` - `200`
    * `trim` - `true`
    * `chunker_type` - `:text`
    * `sizing` - `:characters`
    * `prepend_heading_context` - `false`
    * `embedding`, `preset`, `topic_threshold` - `nil`
  """

  @typedoc """
  Chunking configuration.

  `chunker_type` and `sizing` accept the values of
  `t:Kreuzberg.ChunkerType.t/0` and `t:Kreuzberg.ChunkSizing.t/0` in addition
  to strings because their struct defaults are the atoms `:text` and
  `:characters`.
  """
  @type t :: %__MODULE__{
          max_characters: non_neg_integer(),
          overlap: non_neg_integer(),
          trim: boolean(),
          chunker_type: Kreuzberg.ChunkerType.t() | String.t() | nil,
          embedding: map() | nil,
          preset: String.t() | nil,
          sizing: Kreuzberg.ChunkSizing.t() | String.t() | nil,
          prepend_heading_context: boolean(),
          topic_threshold: float() | nil
        }

  defstruct max_characters: 1_000,
            overlap: 200,
            trim: true,
            chunker_type: :text,
            embedding: nil,
            preset: nil,
            sizing: :characters,
            prepend_heading_context: false,
            topic_threshold: nil

  defimpl Jason.Encoder do
    @doc false
    def encode(value, opts) do
      # Fields set to nil are dropped so they are omitted from the JSON
      # object instead of being emitted as `null`.
      value
      |> Map.from_struct()
      |> Enum.reject(fn {_key, field} -> is_nil(field) end)
      |> Map.new()
      |> Jason.Encoder.encode(opts)
    end
  end
end

View File

@@ -0,0 +1,36 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.CitationMetadata do
  @moduledoc "Citation file metadata (RIS, PubMed, EndNote)."

  defstruct citation_count: 0,
            format: nil,
            authors: [],
            year_range: nil,
            dois: [],
            keywords: []

  @typedoc "Citation file metadata (RIS, PubMed, EndNote)."
  @type t :: %__MODULE__{
          citation_count: non_neg_integer(),
          format: String.t() | nil,
          authors: [String.t()],
          year_range: map() | nil,
          dois: [String.t()],
          keywords: [String.t()]
        }

  defimpl Jason.Encoder do
    @doc false
    def encode(struct, opts) do
      # Keep only the fields that hold a value; nil fields are left out of the JSON.
      present =
        for {field, val} <- Map.from_struct(struct), not is_nil(val), into: %{} do
          {field, val}
        end

      Jason.Encoder.encode(present, opts)
    end
  end
end

View File

@@ -0,0 +1,30 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.CodeContentMode do
  @moduledoc """
  Content rendering mode for code extraction.

  Controls how extracted code content is represented in the `content` field
  of `ExtractionResult`.

  Each variant is an atom, and each has a zero-arity function of the same
  name that returns it.
  """

  @typedoc "Content rendering mode for code extraction."
  @type t :: :chunks | :raw | :structure

  @doc "Use TSLP semantic chunks as content (default)."
  @spec chunks() :: t()
  def chunks, do: :chunks

  @doc "Use raw source code as content."
  @spec raw() :: t()
  def raw, do: :raw

  @doc "Emit function/class headings + docstrings (no code bodies)."
  @spec structure() :: t()
  def structure, do: :structure
end

View File

@@ -0,0 +1,42 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.ContentFilterConfig do
  @moduledoc """
  Cross-extractor content filtering configuration.

  Controls whether "furniture" content (headers, footers, page numbers,
  watermarks, repeating text) is included in or stripped from extraction
  results. Applies across all extractors (PDF, DOCX, RTF, ODT, HTML, etc.)
  with format-specific implementation.

  When `None` on `ExtractionConfig`, each extractor uses its current
  default behavior unchanged.

  Struct defaults: the three `include_*` flags are `false` and
  `strip_repeating_text` is `true`.
  """

  defstruct [
    include_headers: false,
    include_footers: false,
    strip_repeating_text: true,
    include_watermarks: false
  ]

  @typedoc "Cross-extractor content filtering configuration."
  @type t :: %__MODULE__{
          include_headers: boolean(),
          include_footers: boolean(),
          strip_repeating_text: boolean(),
          include_watermarks: boolean()
        }

  defimpl Jason.Encoder do
    @doc false
    def encode(struct, opts) do
      # Keep only the fields that hold a value; nil fields are left out of the JSON.
      present =
        for {field, val} <- Map.from_struct(struct), not is_nil(val), into: %{} do
          {field, val}
        end

      Jason.Encoder.encode(present, opts)
    end
  end
end

View File

@@ -0,0 +1,33 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.ContentLayer do
  @moduledoc """
  Classification of the content layer a document node belongs to.

  Replaces separate body/furniture arrays with per-node granularity.
  Each layer is a plain atom exposed through a zero-arity accessor.
  """

  @typedoc "Content layer a document node belongs to."
  @type t :: :body | :header | :footer | :footnote

  @doc "Layer for main document body content."
  @spec body() :: t()
  def body do
    :body
  end

  @doc "Layer for a page/section header (running header)."
  @spec header() :: t()
  def header do
    :header
  end

  @doc "Layer for a page/section footer (running footer)."
  @spec footer() :: t()
  def footer do
    :footer
  end

  @doc "Layer for footnote content."
  @spec footnote() :: t()
  def footnote do
    :footnote
  end
end

View File

@@ -0,0 +1,17 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.ContributorRole do
  @moduledoc """
  A JATS contributor together with its role.

  Both `name` and `role` are optional strings that default to `nil`.
  """

  @typedoc "A JATS contributor together with its role."
  @type t :: %__MODULE__{
          name: String.t() | nil,
          role: String.t() | nil
        }

  # Bare atoms in a struct definition default to nil.
  defstruct [:name, :role]
end

View File

@@ -0,0 +1,59 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.CoreProperties do
  @moduledoc """
  Dublin Core metadata read from `docProps/core.xml`.

  Holds the standard metadata fields defined by the Dublin Core standard
  and Office-specific extensions. Every field is an optional string that
  defaults to `nil`; the Jason encoder omits fields that are `nil`.
  """

  @typedoc "Dublin Core metadata read from `docProps/core.xml`."
  @type t :: %__MODULE__{
          title: String.t() | nil,
          subject: String.t() | nil,
          creator: String.t() | nil,
          keywords: String.t() | nil,
          description: String.t() | nil,
          last_modified_by: String.t() | nil,
          revision: String.t() | nil,
          created: String.t() | nil,
          modified: String.t() | nil,
          category: String.t() | nil,
          content_status: String.t() | nil,
          language: String.t() | nil,
          identifier: String.t() | nil,
          version: String.t() | nil,
          last_printed: String.t() | nil
        }

  # Bare atoms in a struct definition default to nil; order matches the type above.
  defstruct ~w(
    title
    subject
    creator
    keywords
    description
    last_modified_by
    revision
    created
    modified
    category
    content_status
    language
    identifier
    version
    last_printed
  )a

  defimpl Jason.Encoder do
    @doc false
    def encode(value, opts) do
      # Only properties that are actually set end up in the JSON object.
      present =
        for {key, field} <- Map.from_struct(value), field != nil, into: %{} do
          {key, field}
        end

      Jason.Encoder.encode(present, opts)
    end
  end
end

View File

@@ -0,0 +1,34 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.CsvMetadata do
  @moduledoc """
  Metadata describing a CSV/TSV file.

  Defaults: zero rows and columns, no header, and `nil` for `delimiter` and
  `column_types`. The Jason encoder omits fields that are `nil`.
  """

  @typedoc "Metadata describing a CSV/TSV file."
  @type t :: %__MODULE__{
          row_count: non_neg_integer(),
          column_count: non_neg_integer(),
          delimiter: String.t() | nil,
          has_header: boolean(),
          column_types: [String.t()] | nil
        }

  defstruct [
    row_count: 0,
    column_count: 0,
    delimiter: nil,
    has_header: false,
    column_types: nil
  ]

  defimpl Jason.Encoder do
    @doc false
    def encode(value, opts) do
      # Strip nil fields, then hand the remaining map to Jason.
      present =
        for {key, field} <- Map.from_struct(value), field != nil, into: %{} do
          {key, field}
        end

      Jason.Encoder.encode(present, opts)
    end
  end
end

View File

@@ -0,0 +1,17 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.DbfFieldInfo do
  @moduledoc """
  Information about a single dBASE field.

  Both `name` and `field_type` are optional strings that default to `nil`.
  """

  @typedoc "Information about a single dBASE field."
  @type t :: %__MODULE__{
          name: String.t() | nil,
          field_type: String.t() | nil
        }

  # Bare atoms in a struct definition default to nil.
  defstruct [:name, :field_type]
end

View File

@@ -0,0 +1,30 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.DbfMetadata do
  @moduledoc """
  Metadata describing a dBASE (DBF) file.

  Defaults to zero records, zero fields and an empty `fields` list.
  The Jason encoder omits fields that are `nil`.
  """

  @typedoc "Metadata describing a dBASE (DBF) file."
  @type t :: %__MODULE__{
          record_count: non_neg_integer(),
          field_count: non_neg_integer(),
          fields: [map()]
        }

  defstruct [
    record_count: 0,
    field_count: 0,
    fields: []
  ]

  defimpl Jason.Encoder do
    @doc false
    def encode(value, opts) do
      # Strip nil fields, then hand the remaining map to Jason.
      present =
        for {key, field} <- Map.from_struct(value), field != nil, into: %{} do
          {key, field}
        end

      Jason.Encoder.encode(present, opts)
    end
  end
end

View File

@@ -0,0 +1,17 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.DetectResponse do
  @moduledoc """
  Response of a MIME type detection.

  Both `mime_type` and `filename` are optional strings that default to `nil`.
  """

  @typedoc "Response of a MIME type detection."
  @type t :: %__MODULE__{
          mime_type: String.t() | nil,
          filename: String.t() | nil
        }

  # Bare atoms in a struct definition default to nil.
  defstruct [:mime_type, :filename]
end

View File

@@ -0,0 +1,19 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.DetectionResult do
  @moduledoc """
  Page-level detection result holding all detections plus page metadata.

  Defaults to a zero page width and height and an empty `detections` list.
  """

  @typedoc "Page-level detection result holding all detections plus page metadata."
  @type t :: %__MODULE__{
          page_width: non_neg_integer(),
          page_height: non_neg_integer(),
          detections: [map()]
        }

  defstruct [
    page_width: 0,
    page_height: 0,
    detections: []
  ]
end

View File

@@ -0,0 +1,23 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.DiffHunk do
  @moduledoc """
  One contiguous hunk of a unified diff.

  Defaults to zero for all four line/count fields and an empty `lines` list.
  """

  @typedoc "One contiguous hunk of a unified diff."
  @type t :: %__MODULE__{
          from_line: non_neg_integer(),
          from_count: non_neg_integer(),
          to_line: non_neg_integer(),
          to_count: non_neg_integer(),
          lines: [String.t() | nil]
        }

  defstruct [
    from_line: 0,
    from_count: 0,
    to_line: 0,
    to_count: 0,
    lines: []
  ]
end

View File

@@ -0,0 +1,24 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.DiffLine do
  @moduledoc """
  One line inside a unified-diff hunk.

  Defined here (rather than only in `crate::diff`) so `RevisionDelta` can
  reference it unconditionally, without requiring the `diff` Cargo feature.
  `crate::diff` re-exports this type verbatim.

  The module only declares types; it defines no functions.
  """

  @typedoc "Context line that is unchanged."
  @type context :: %{type: :context, value: String.t()}

  @typedoc "Line that was added in the \"after\" version."
  @type added :: %{type: :added, value: String.t()}

  @typedoc "Line that was removed from the \"before\" version."
  @type removed :: %{type: :removed, value: String.t()}

  @typedoc "One line inside a unified-diff hunk."
  @type t :: term()
end

View File

@@ -0,0 +1,30 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.DiffOptions do
  @moduledoc """
  Options that control how two `ExtractionResult` values are compared.

  By default both `include_metadata` and `include_embedded` are `true` and
  `max_content_chars` is `nil`. The Jason encoder omits fields that are `nil`.
  """

  @typedoc "Options that control how two `ExtractionResult` values are compared."
  @type t :: %__MODULE__{
          include_metadata: boolean(),
          include_embedded: boolean(),
          max_content_chars: non_neg_integer() | nil
        }

  defstruct [
    include_metadata: true,
    include_embedded: true,
    max_content_chars: nil
  ]

  defimpl Jason.Encoder do
    @doc false
    def encode(value, opts) do
      # Strip nil fields, then hand the remaining map to Jason.
      present =
        for {key, field} <- Map.from_struct(value), field != nil, into: %{} do
          {key, field}
        end

      Jason.Encoder.encode(present, opts)
    end
  end
end

View File

@@ -0,0 +1,41 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.DjotContent do
  @moduledoc """
  Comprehensive Djot document structure with semantic preservation.

  This type captures the full richness of Djot markup, including:

  - Block-level structures (headings, lists, blockquotes, code blocks, etc.)
  - Inline formatting (emphasis, strong, highlight, subscript, superscript, etc.)
  - Attributes (classes, IDs, key-value pairs)
  - Links, images, footnotes
  - Math expressions (inline and display)
  - Tables with full structure

  Available when the `djot` feature is enabled.

  ## Defaults

  A bare `%Kreuzberg.DjotContent{}` has `nil` for `plain_text` and `metadata`
  and an empty list for every other field.
  """

  @typedoc "Comprehensive Djot document structure with semantic preservation."
  @type t :: %__MODULE__{
          plain_text: String.t() | nil,
          blocks: [map()],
          # The struct default is nil, so the type must admit it.
          metadata: map() | nil,
          tables: [map()],
          images: [map()],
          links: [map()],
          footnotes: [map()],
          attributes: [String.t()]
        }

  defstruct plain_text: nil,
            blocks: [],
            metadata: nil,
            tables: [],
            images: [],
            links: [],
            footnotes: [],
            attributes: []
end

View File

@@ -0,0 +1,21 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.DjotImage do
  @moduledoc """
  An image element in Djot.

  All four fields are optional strings that default to `nil`.
  """

  @typedoc "An image element in Djot."
  @type t :: %__MODULE__{
          src: String.t() | nil,
          alt: String.t() | nil,
          title: String.t() | nil,
          attributes: String.t() | nil
        }

  # Bare atoms in a struct definition default to nil.
  defstruct [:src, :alt, :title, :attributes]
end

View File

@@ -0,0 +1,21 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.DjotLink do
  @moduledoc """
  A link element in Djot.

  All four fields are optional strings that default to `nil`.
  """

  @typedoc "A link element in Djot."
  @type t :: %__MODULE__{
          url: String.t() | nil,
          text: String.t() | nil,
          title: String.t() | nil,
          attributes: String.t() | nil
        }

  # Bare atoms in a struct definition default to nil.
  defstruct [:url, :text, :title, :attributes]
end

View File

@@ -0,0 +1,74 @@
defmodule KreuzbergDocumentExtractorBridge do
  @moduledoc """
  GenServer bridge for DocumentExtractor implementation in kreuzberg.

  Handles incoming trait method calls from Rust and dispatches them to an implementation module.

  The server state is the implementation module itself. The process is
  registered under this module's name, so `start_link/1` only succeeds while
  no other process holds that name.
  """

  use GenServer
  require Logger

  @doc """
  Start a GenServer linked to the current process.

  impl_module should be a module that implements the DocumentExtractor trait methods.
  The process is registered locally as `#{inspect(__MODULE__)}`.
  """
  def start_link(impl_module) do
    GenServer.start_link(__MODULE__, impl_module, name: __MODULE__)
  end

  @impl GenServer
  def init(impl_module) do
    {:ok, impl_module}
  end

  @doc """
  Handle an incoming trait call message.

  Message format: {:trait_call, method_atom, args_json, reply_id}

  The JSON arguments are decoded, put into call order, and applied to the
  implementation module. The JSON-encoded result is handed to
  `Kreuzberg.Native.complete_trait_call/2`. If an exception is raised along
  the way, it is logged and its message is handed to
  `Kreuzberg.Native.fail_trait_call/2` instead. The state is never changed.
  """
  @impl GenServer
  def handle_info({:trait_call, method, args_json, reply_id}, impl_module) do
    try do
      args = Jason.decode!(args_json)
      method_name = to_string(method)
      ordered_args = ordered_args(impl_module, method_name, args)
      # Dispatch to the implementation module.
      # to_existing_atom/1 never creates a new atom from the incoming name.
      result = apply(impl_module, String.to_existing_atom(method_name), ordered_args)
      # Send result back to Rust
      Kreuzberg.Native.complete_trait_call(reply_id, Jason.encode!(result))
    rescue
      e ->
        # Interpolation needs `#{}`; plain `{...}` would log the template text verbatim.
        Logger.error("Error calling #{inspect(impl_module)}.#{method}: #{Exception.message(e)}")
        Kreuzberg.Native.fail_trait_call(reply_id, Exception.message(e))
    end

    {:noreply, impl_module}
  end

  # Turns decoded JSON arguments into a positional argument list.
  #
  # A JSON object is ordered by the implementation's `__alef_arg_order__/1`
  # when it exports one, otherwise by sorted key. A JSON array is already
  # positional and is passed through unchanged.
  defp ordered_args(impl_module, method_name, args) when is_map(args) do
    if function_exported?(impl_module, :__alef_arg_order__, 1) do
      impl_module.__alef_arg_order__(method_name)
      |> Enum.map(&Map.fetch!(args, &1))
    else
      args
      |> Map.keys()
      |> Enum.sort()
      |> Enum.map(&Map.fetch!(args, &1))
    end
  end

  defp ordered_args(_impl_module, _method_name, args) when is_list(args), do: args

  @doc """
  Register an implementation module, starting a GenServer to handle trait calls.

  Reads the plugin name from `impl_module.name/0`, starts the bridge and
  passes its pid and the name to `Kreuzberg.Native.register_document_extractor/2`.
  Raises a `MatchError` if `start_link/1` does not return `{:ok, pid}`.
  """
  def register(impl_module) do
    plugin_name = impl_module.name()
    {:ok, pid} = start_link(impl_module)
    Kreuzberg.Native.register_document_extractor(pid, plugin_name)
  end
end

View File

@@ -0,0 +1,38 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.DocumentNode do
  @moduledoc """
  A single node in the document tree.

  Each node has deterministic `id`, typed `content`, optional `parent`/`children`
  for tree structure, and metadata like page number, bounding box, and content layer.

  ## Defaults

  A bare `%Kreuzberg.DocumentNode{}` has `content: :title`,
  `content_layer: :body`, empty `children` and `annotations` lists, and `nil`
  for every other field.
  """

  @typedoc "A single node in the document tree."
  @type t :: %__MODULE__{
          id: String.t() | nil,
          # The struct default is the atom :title, so atoms must be admitted
          # (atom() also covers nil).
          content: String.t() | atom(),
          parent: non_neg_integer() | nil,
          children: [non_neg_integer()],
          # The struct default :body is one of the Kreuzberg.ContentLayer atoms.
          content_layer: Kreuzberg.ContentLayer.t() | String.t() | nil,
          page: non_neg_integer() | nil,
          page_end: non_neg_integer() | nil,
          bbox: map() | nil,
          annotations: [map()],
          attributes: map() | nil
        }

  defstruct id: nil,
            content: :title,
            parent: nil,
            children: [],
            content_layer: :body,
            page: nil,
            page_end: nil,
            bbox: nil,
            annotations: [],
            attributes: nil
end

View File

@@ -0,0 +1,19 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.DocumentRelationship do
  @moduledoc """
  A resolved relationship between two nodes in the document tree.

  ## Defaults

  A bare `%Kreuzberg.DocumentRelationship{}` has `source: 0`, `target: 0` and
  `kind: :footnote_reference`.
  """

  @typedoc "A resolved relationship between two nodes in the document tree."
  @type t :: %__MODULE__{
          source: non_neg_integer(),
          target: non_neg_integer(),
          # The struct default is the atom :footnote_reference, so atoms must
          # be admitted (atom() also covers nil).
          kind: String.t() | atom()
        }

  defstruct source: 0,
            target: 0,
            kind: :footnote_reference
end

View File

@@ -0,0 +1,32 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.DocumentRevision do
  @moduledoc """
  A single tracked change embedded in a document.

  Populated by per-format extractors that understand change-tracking metadata
  (DOCX `w:ins`/`w:del`/`w:rPrChange`, ODT `text:change-*`, …). Every
  extractor defaults to `ExtractionResult.revisions = None` until a
  format-specific implementation is added.

  ## Defaults

  A bare `%Kreuzberg.DocumentRevision{}` has `kind: :insertion` and `nil` for
  every other field.
  """

  @typedoc "A single tracked change embedded in a document."
  @type t :: %__MODULE__{
          revision_id: String.t() | nil,
          author: String.t() | nil,
          timestamp: String.t() | nil,
          # The struct default is the atom :insertion, so atoms must be
          # admitted (atom() also covers nil).
          kind: String.t() | atom(),
          anchor: String.t() | nil,
          # The struct default is nil, so the type must admit it.
          delta: map() | nil
        }

  defstruct revision_id: nil,
            author: nil,
            timestamp: nil,
            kind: :insertion,
            anchor: nil,
            delta: nil
end

View File

@@ -0,0 +1,43 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.DocumentStructure do
  @moduledoc """
  Top-level structured representation of a document.

  The structure is a flat array of nodes whose index-based parent/child
  references form a tree. Root-level nodes have `parent: None`. Use
  `body_roots()` and `furniture_roots()` to iterate over top-level content
  by layer.

  # Validation

  Call `validate()` after construction to verify all node indices are in bounds
  and parent-child relationships are bidirectionally consistent.

  # Defaults and encoding

  All list fields default to `[]` and `source_format` defaults to `nil`.
  The Jason encoder omits fields that are `nil`.
  """

  @typedoc "Top-level structured representation of a document."
  @type t :: %__MODULE__{
          nodes: [map()],
          source_format: String.t() | nil,
          relationships: [map()],
          node_types: [String.t()]
        }

  defstruct [
    nodes: [],
    source_format: nil,
    relationships: [],
    node_types: []
  ]

  defimpl Jason.Encoder do
    @doc false
    def encode(value, opts) do
      # Strip nil fields, then hand the remaining map to Jason.
      present =
        for {key, field} <- Map.from_struct(value), field != nil, into: %{} do
          {key, field}
        end

      Jason.Encoder.encode(present, opts)
    end
  end
end

View File

@@ -0,0 +1,60 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.DocxAppProperties do
  @moduledoc """
  Application properties read from `docProps/app.xml` of a DOCX file.

  Contains Word-specific document statistics and metadata. Every field is
  optional and defaults to `nil`; the Jason encoder omits fields that are `nil`.
  """

  @typedoc "Application properties read from `docProps/app.xml` of a DOCX file."
  @type t :: %__MODULE__{
          application: String.t() | nil,
          app_version: String.t() | nil,
          template: String.t() | nil,
          total_time: integer() | nil,
          pages: integer() | nil,
          words: integer() | nil,
          characters: integer() | nil,
          characters_with_spaces: integer() | nil,
          lines: integer() | nil,
          paragraphs: integer() | nil,
          company: String.t() | nil,
          doc_security: integer() | nil,
          scale_crop: boolean() | nil,
          links_up_to_date: boolean() | nil,
          shared_doc: boolean() | nil,
          hyperlinks_changed: boolean() | nil
        }

  # Bare atoms in a struct definition default to nil; order matches the type above.
  defstruct ~w(
    application
    app_version
    template
    total_time
    pages
    words
    characters
    characters_with_spaces
    lines
    paragraphs
    company
    doc_security
    scale_crop
    links_up_to_date
    shared_doc
    hyperlinks_changed
  )a

  defimpl Jason.Encoder do
    @doc false
    def encode(value, opts) do
      # Only properties that are actually set end up in the JSON object.
      present =
        for {key, field} <- Map.from_struct(value), field != nil, into: %{} do
          {key, field}
        end

      Jason.Encoder.encode(present, opts)
    end
  end
end

View File

@@ -0,0 +1,35 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.DocxMetadata do
  @moduledoc """
  Metadata of a Word document.

  Extracted from DOCX files using shared Office Open XML metadata extraction.
  Integrates with `office_metadata` module for core/app/custom properties.

  Each of the three property groups is an optional map that defaults to
  `nil`; the Jason encoder omits groups that are `nil`.
  """

  @typedoc "Metadata of a Word document."
  @type t :: %__MODULE__{
          core_properties: map() | nil,
          app_properties: map() | nil,
          custom_properties: map() | nil
        }

  # Bare atoms in a struct definition default to nil.
  defstruct [:core_properties, :app_properties, :custom_properties]

  defimpl Jason.Encoder do
    @doc false
    def encode(value, opts) do
      # Strip nil fields, then hand the remaining map to Jason.
      present =
        for {key, field} <- Map.from_struct(value), field != nil, into: %{} do
          {key, field}
        end

      Jason.Encoder.encode(present, opts)
    end
  end
end

26
packages/elixir/lib/kreuzberg/element.ex generated Normal file
View File

@@ -0,0 +1,26 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.Element do
  @moduledoc """
  Semantic element extracted from document.

  Represents a logical unit of content with semantic classification,
  unique identifier, and metadata for tracking origin and position.

  ## Defaults

  A bare `%Kreuzberg.Element{}` has `element_type: :title` and `nil` for
  `element_id`, `text` and `metadata`.
  """

  @typedoc "Semantic element extracted from document."
  @type t :: %__MODULE__{
          element_id: String.t() | nil,
          # The struct default :title is one of the Kreuzberg.ElementType atoms.
          element_type: Kreuzberg.ElementType.t() | String.t() | nil,
          text: String.t() | nil,
          # The struct default is nil, so the type must admit it.
          metadata: map() | nil
        }

  defstruct element_id: nil,
            element_type: :title,
            text: nil,
            metadata: nil
end

View File

@@ -0,0 +1,23 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.ElementMetadata do
  @moduledoc """
  Metadata attached to a semantic element.

  `additional` defaults to an empty map; every other field defaults to `nil`.
  """

  @typedoc "Metadata attached to a semantic element."
  @type t :: %__MODULE__{
          page_number: non_neg_integer() | nil,
          filename: String.t() | nil,
          coordinates: map() | nil,
          element_index: non_neg_integer() | nil,
          additional: map()
        }

  # Bare atoms default to nil; only `additional` carries an explicit default.
  defstruct [:page_number, :filename, :coordinates, :element_index, additional: %{}]
end

View File

@@ -0,0 +1,73 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.ElementType do
  @moduledoc """
  Classification of semantic element types.

  Categorizes text content into semantic units for downstream processing.
  Supports the element types commonly found in Unstructured documents.
  Each type is a plain atom exposed through a zero-arity accessor.
  """

  @typedoc "Classification of semantic element types."
  @type t ::
          :title
          | :narrative_text
          | :heading
          | :list_item
          | :table
          | :image
          | :page_break
          | :code_block
          | :block_quote
          | :footer
          | :header

  @doc "Element type for the document title."
  @spec title() :: t()
  def title do
    :title
  end

  @doc "Element type for the main narrative text body."
  @spec narrative_text() :: t()
  def narrative_text do
    :narrative_text
  end

  @doc "Element type for a section heading."
  @spec heading() :: t()
  def heading do
    :heading
  end

  @doc "Element type for a list item (bullet, numbered, etc.)."
  @spec list_item() :: t()
  def list_item do
    :list_item
  end

  @doc "Element type for a table."
  @spec table() :: t()
  def table do
    :table
  end

  @doc "Element type for an image."
  @spec image() :: t()
  def image do
    :image
  end

  @doc "Element type for a page break marker."
  @spec page_break() :: t()
  def page_break do
    :page_break
  end

  @doc "Element type for a code block."
  @spec code_block() :: t()
  def code_block do
    :code_block
  end

  @doc "Element type for a block quote."
  @spec block_quote() :: t()
  def block_quote do
    :block_quote
  end

  @doc "Element type for footer text."
  @spec footer() :: t()
  def footer do
    :footer
  end

  @doc "Element type for header text."
  @spec header() :: t()
  def header do
    :header
  end
end

View File

@@ -0,0 +1,29 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.EmailAttachment do
  @moduledoc """
  Representation of an email attachment.

  Contains metadata and optionally the content of an email attachment.
  `is_image` defaults to `false`; every other field defaults to `nil`.
  """

  @typedoc "Representation of an email attachment."
  @type t :: %__MODULE__{
          name: String.t() | nil,
          filename: String.t() | nil,
          mime_type: String.t() | nil,
          size: non_neg_integer() | nil,
          is_image: boolean(),
          data: binary() | nil
        }

  defstruct [
    name: nil,
    filename: nil,
    mime_type: nil,
    size: nil,
    is_image: false,
    data: nil
  ]
end

View File

@@ -0,0 +1,26 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.EmailConfig do
  @moduledoc """
  Configuration for email extraction.

  The single field `msg_fallback_codepage` defaults to `nil`; the Jason
  encoder omits it while it is `nil`.
  """

  @typedoc "Configuration for email extraction."
  @type t :: %__MODULE__{
          msg_fallback_codepage: non_neg_integer() | nil
        }

  # Bare atoms in a struct definition default to nil.
  defstruct [:msg_fallback_codepage]

  defimpl Jason.Encoder do
    @doc false
    def encode(value, opts) do
      # Strip nil fields, then hand the remaining map to Jason.
      present =
        for {key, field} <- Map.from_struct(value), field != nil, into: %{} do
          {key, field}
        end

      Jason.Encoder.encode(present, opts)
    end
  end
end

View File

@@ -0,0 +1,42 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.EmailExtractionResult do
  @moduledoc """
  Result of extracting an email.

  Complete representation of an extracted email message (.eml or .msg)
  including headers, body content, and attachments.

  List fields default to `[]`, `metadata` defaults to an empty map, and
  every other field defaults to `nil`.
  """

  @typedoc "Result of extracting an email."
  @type t :: %__MODULE__{
          subject: String.t() | nil,
          from_email: String.t() | nil,
          to_emails: [String.t()],
          cc_emails: [String.t()],
          bcc_emails: [String.t()],
          date: String.t() | nil,
          message_id: String.t() | nil,
          plain_text: String.t() | nil,
          html_content: String.t() | nil,
          content: String.t() | nil,
          attachments: [map()],
          metadata: map()
        }

  defstruct [
    subject: nil,
    from_email: nil,
    to_emails: [],
    cc_emails: [],
    bcc_emails: [],
    date: nil,
    message_id: nil,
    plain_text: nil,
    html_content: nil,
    content: nil,
    attachments: [],
    metadata: %{}
  ]
end

View File

@@ -0,0 +1,42 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.EmailMetadata do
  @moduledoc """
  Metadata extracted from .eml and .msg email files.

  Includes sender/recipient information, message ID, and attachment list.
  List fields default to `[]` and the remaining fields to `nil`; the Jason
  encoder omits fields that are `nil`.
  """

  @typedoc "Metadata extracted from .eml and .msg email files."
  @type t :: %__MODULE__{
          from_email: String.t() | nil,
          from_name: String.t() | nil,
          to_emails: [String.t()],
          cc_emails: [String.t()],
          bcc_emails: [String.t()],
          message_id: String.t() | nil,
          attachments: [String.t()]
        }

  defstruct [
    from_email: nil,
    from_name: nil,
    to_emails: [],
    cc_emails: [],
    bcc_emails: [],
    message_id: nil,
    attachments: []
  ]

  defimpl Jason.Encoder do
    @doc false
    def encode(value, opts) do
      # Strip nil fields, then hand the remaining map to Jason.
      present =
        for {key, field} <- Map.from_struct(value), field != nil, into: %{} do
          {key, field}
        end

      Jason.Encoder.encode(present, opts)
    end
  end
end

View File

@@ -0,0 +1,19 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.EmbeddedChanges do
  @moduledoc """
  Changes to embedded archive children between two results.

  The `added`, `removed` and `changed` lists all default to `[]`.
  """

  @typedoc "Changes to embedded archive children between two results."
  @type t :: %__MODULE__{
          added: [map()],
          removed: [map()],
          changed: [map()]
        }

  defstruct [
    added: [],
    removed: [],
    changed: []
  ]
end

View File

@@ -0,0 +1,17 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.EmbeddedDiff do
  @moduledoc """
  Diff for a single embedded archive entry that appears in both results.

  ## Defaults

  A bare `%Kreuzberg.EmbeddedDiff{}` has `nil` for both `path` and `diff`.
  """

  @typedoc "Diff for a single embedded archive entry that appears in both results."
  @type t :: %__MODULE__{
          path: String.t() | nil,
          # The struct default is nil, so the type must admit it.
          diff: map() | nil
        }

  defstruct path: nil,
            diff: nil
end

View File

@@ -0,0 +1,21 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.EmbeddedFile do
  @moduledoc """
  Descriptor of an embedded file extracted from the PDF name tree.

  Defaults: `nil` for `name` and `mime_type`, an empty binary for `data`,
  and `0` for `compressed_size`.
  """

  @typedoc "Descriptor of an embedded file extracted from the PDF name tree."
  @type t :: %__MODULE__{
          name: String.t() | nil,
          data: binary(),
          compressed_size: non_neg_integer(),
          mime_type: String.t() | nil
        }

  defstruct [
    name: nil,
    data: <<>>,
    compressed_size: 0,
    mime_type: nil
  ]
end

View File

@@ -0,0 +1,74 @@
defmodule KreuzbergEmbeddingBackendBridge do
  @moduledoc """
  GenServer bridge for EmbeddingBackend implementation in kreuzberg.

  Handles incoming trait method calls from Rust and dispatches them to an implementation module.

  The server state is the implementation module itself. The process is
  registered under this module's name, so `start_link/1` only succeeds while
  no other process holds that name.
  """

  use GenServer
  require Logger

  @doc """
  Start a GenServer linked to the current process.

  impl_module should be a module that implements the EmbeddingBackend trait methods.
  The process is registered locally as `#{inspect(__MODULE__)}`.
  """
  def start_link(impl_module) do
    GenServer.start_link(__MODULE__, impl_module, name: __MODULE__)
  end

  @impl GenServer
  def init(impl_module) do
    {:ok, impl_module}
  end

  @doc """
  Handle an incoming trait call message.

  Message format: {:trait_call, method_atom, args_json, reply_id}

  The JSON arguments are decoded, put into call order, and applied to the
  implementation module. The JSON-encoded result is handed to
  `Kreuzberg.Native.complete_trait_call/2`. If an exception is raised along
  the way, it is logged and its message is handed to
  `Kreuzberg.Native.fail_trait_call/2` instead. The state is never changed.
  """
  @impl GenServer
  def handle_info({:trait_call, method, args_json, reply_id}, impl_module) do
    try do
      args = Jason.decode!(args_json)
      method_name = to_string(method)
      ordered_args = ordered_args(impl_module, method_name, args)
      # Dispatch to the implementation module.
      # to_existing_atom/1 never creates a new atom from the incoming name.
      result = apply(impl_module, String.to_existing_atom(method_name), ordered_args)
      # Send result back to Rust
      Kreuzberg.Native.complete_trait_call(reply_id, Jason.encode!(result))
    rescue
      e ->
        # Interpolation needs `#{}`; plain `{...}` would log the template text verbatim.
        Logger.error("Error calling #{inspect(impl_module)}.#{method}: #{Exception.message(e)}")
        Kreuzberg.Native.fail_trait_call(reply_id, Exception.message(e))
    end

    {:noreply, impl_module}
  end

  # Turns decoded JSON arguments into a positional argument list.
  #
  # A JSON object is ordered by the implementation's `__alef_arg_order__/1`
  # when it exports one, otherwise by sorted key. A JSON array is already
  # positional and is passed through unchanged.
  defp ordered_args(impl_module, method_name, args) when is_map(args) do
    if function_exported?(impl_module, :__alef_arg_order__, 1) do
      impl_module.__alef_arg_order__(method_name)
      |> Enum.map(&Map.fetch!(args, &1))
    else
      args
      |> Map.keys()
      |> Enum.sort()
      |> Enum.map(&Map.fetch!(args, &1))
    end
  end

  defp ordered_args(_impl_module, _method_name, args) when is_list(args), do: args

  @doc """
  Register an implementation module, starting a GenServer to handle trait calls.

  Reads the plugin name from `impl_module.name/0`, starts the bridge and
  passes its pid and the name to `Kreuzberg.Native.register_embedding_backend/2`.
  Raises a `MatchError` if `start_link/1` does not return `{:ok, pid}`.
  """
  def register(impl_module) do
    plugin_name = impl_module.name()
    {:ok, pid} = start_link(impl_module)
    Kreuzberg.Native.register_embedding_backend(pid, plugin_name)
  end
end

View File

@@ -0,0 +1,43 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.EmbeddingConfig do
  @moduledoc """
  Embedding configuration for text chunks.

  Configures embedding generation using ONNX models via the vendored embedding engine.
  Requires the `embeddings` feature to be enabled.

  ## Defaults

  A bare `%Kreuzberg.EmbeddingConfig{}` has `model: :preset`,
  `normalize: true`, `batch_size: 32`, `show_download_progress: false`, and
  `nil` for `cache_dir`, `acceleration` and `max_embed_duration_secs`.

  ## JSON encoding

  The Jason encoder omits fields whose value is `nil`.
  """

  @typedoc "Embedding configuration for text chunks."
  @type t :: %__MODULE__{
          # The struct default is the atom :preset, so atoms must be admitted
          # (atom() also covers nil).
          model: String.t() | atom(),
          normalize: boolean(),
          batch_size: non_neg_integer(),
          show_download_progress: boolean(),
          cache_dir: String.t() | nil,
          acceleration: map() | nil,
          max_embed_duration_secs: non_neg_integer() | nil
        }

  defstruct model: :preset,
            normalize: true,
            batch_size: 32,
            show_download_progress: false,
            cache_dir: nil,
            acceleration: nil,
            max_embed_duration_secs: nil

  defimpl Jason.Encoder do
    @doc false
    def encode(value, opts) do
      value
      |> Map.from_struct()
      # Unset optionals are omitted from the JSON object rather than sent as null.
      |> Enum.reject(fn {_k, v} -> v == nil end)
      |> Enum.into(%{})
      |> Jason.Encoder.encode(opts)
    end
  end
end

View File

@@ -0,0 +1,20 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.EmbeddingModelType do
  @moduledoc """
  Embedding model types supported by Kreuzberg.

  This module only declares types. Each variant is a map tagged by its `:type` key.
  """

  @typedoc "Embedding model types supported by Kreuzberg."
  @type t :: term()

  @typedoc "Use a preset model configuration (recommended)."
  @type preset :: %{type: :preset, name: String.t()}

  @typedoc "Use a custom ONNX model from HuggingFace."
  @type custom :: %{type: :custom, model_id: String.t(), dimensions: non_neg_integer()}

  @typedoc "Provider-hosted embedding model via liter-llm."
  @type llm :: %{type: :llm, llm: Kreuzberg.LlmConfig.t()}

  @typedoc "In-process embedding backend registered via the plugin system."
  @type plugin :: %{type: :plugin, name: String.t()}
end

View File

@@ -0,0 +1,37 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.EmbeddingPreset do
  @moduledoc """
  Preset configurations for common RAG use cases.

  Each preset combines chunk size, overlap, and embedding model
  to provide an optimized configuration for specific scenarios.

  All string fields are owned `String` for FFI compatibility — instances
  are safe to clone and pass across language boundaries.
  """

  defstruct [
    name: nil,
    chunk_size: 0,
    overlap: 0,
    model_repo: nil,
    pooling: nil,
    model_file: nil,
    dimensions: 0,
    description: nil
  ]

  @typedoc "Preset configurations for common RAG use cases."
  @type t :: %__MODULE__{
          name: String.t() | nil,
          chunk_size: non_neg_integer(),
          overlap: non_neg_integer(),
          model_repo: String.t() | nil,
          pooling: String.t() | nil,
          model_file: String.t() | nil,
          dimensions: non_neg_integer(),
          description: String.t() | nil
        }
end

View File

@@ -0,0 +1,36 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.EpubMetadata do
  @moduledoc """
  EPUB metadata (Dublin Core extensions).

  When encoded with `Jason`, fields whose value is `nil` are omitted.
  """

  defstruct [
    coverage: nil,
    dc_format: nil,
    relation: nil,
    source: nil,
    dc_type: nil,
    cover_image: nil
  ]

  @typedoc "EPUB metadata (Dublin Core extensions)."
  @type t :: %__MODULE__{
          coverage: String.t() | nil,
          dc_format: String.t() | nil,
          relation: String.t() | nil,
          source: String.t() | nil,
          dc_type: String.t() | nil,
          cover_image: String.t() | nil
        }

  defimpl Jason.Encoder do
    @doc false
    def encode(meta, opts) do
      # Only fields that are actually set are serialized.
      present =
        for {key, val} <- Map.from_struct(meta), not is_nil(val), into: %{} do
          {key, val}
        end

      Jason.Encoder.encode(present, opts)
    end
  end
end

View File

@@ -0,0 +1,17 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.ErrorMetadata do
  @moduledoc "Error metadata (for batch operations)."

  defstruct [error_type: nil, message: nil]

  @typedoc "Error metadata (for batch operations)."
  @type t :: %__MODULE__{
          error_type: String.t() | nil,
          message: String.t() | nil
        }
end

View File

@@ -0,0 +1,33 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.ExcelMetadata do
  @moduledoc """
  Excel/spreadsheet format metadata.

  Identifies the document as a spreadsheet source via the `FormatMetadata::Excel`
  discriminant. Sheet count and sheet names are stored inside this struct.

  When encoded with `Jason`, fields whose value is `nil` are omitted.
  """

  defstruct [sheet_count: nil, sheet_names: nil]

  @typedoc "Excel/spreadsheet format metadata."
  @type t :: %__MODULE__{
          sheet_count: non_neg_integer() | nil,
          sheet_names: [String.t()] | nil
        }

  defimpl Jason.Encoder do
    @doc false
    def encode(meta, opts) do
      # Only fields that are actually set are serialized.
      present =
        for {key, val} <- Map.from_struct(meta), not is_nil(val), into: %{} do
          {key, val}
        end

      Jason.Encoder.encode(present, opts)
    end
  end
end

View File

@@ -0,0 +1,30 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.ExcelSheet do
  @moduledoc """
  Single Excel worksheet.

  Represents one sheet from an Excel workbook with its content
  converted to Markdown format and dimensional statistics.
  """

  defstruct [
    name: nil,
    markdown: nil,
    row_count: 0,
    col_count: 0,
    cell_count: 0,
    table_cells: nil
  ]

  @typedoc "Single Excel worksheet."
  @type t :: %__MODULE__{
          name: String.t() | nil,
          markdown: String.t() | nil,
          row_count: non_neg_integer(),
          col_count: non_neg_integer(),
          cell_count: non_neg_integer(),
          table_cells: [[String.t()]] | nil
        }
end

View File

@@ -0,0 +1,24 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.ExcelWorkbook do
  @moduledoc """
  Excel workbook representation.

  Contains all sheets from an Excel file (.xlsx, .xls, etc.) with
  extracted content and metadata.
  """

  defstruct [sheets: [], metadata: %{}, revisions: nil]

  @typedoc "Excel workbook representation."
  @type t :: %__MODULE__{
          sheets: [map()],
          metadata: map(),
          revisions: [map()] | nil
        }
end

View File

@@ -0,0 +1,38 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.ExecutionProviderType do
  @moduledoc """
  ONNX Runtime execution provider type.

  Determines which hardware backend is used for model inference.
  `Auto` (default) selects the best available provider per platform.

  Every variant is a plain atom; each function below returns one of them.
  """

  @typedoc "ONNX Runtime execution provider type."
  @type t :: :auto | :cpu | :core_ml | :cuda | :tensor_rt

  @doc "Auto-select: CoreML on macOS, CUDA on Linux, CPU elsewhere."
  @spec auto() :: t()
  def auto, do: :auto

  @doc "CPU execution provider (always available)."
  @spec cpu() :: t()
  def cpu, do: :cpu

  @doc "Apple CoreML (macOS/iOS Neural Engine + GPU)."
  @spec core_ml() :: t()
  def core_ml, do: :core_ml

  @doc "NVIDIA CUDA GPU acceleration."
  @spec cuda() :: t()
  def cuda, do: :cuda

  @doc "NVIDIA TensorRT (optimized CUDA inference)."
  @spec tensor_rt() :: t()
  def tensor_rt, do: :tensor_rt
end

View File

@@ -0,0 +1,51 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.ExtractedImage do
  @moduledoc """
  Extracted image from a document.

  Contains raw image data, metadata, and optional nested OCR results.
  Raw bytes allow cross-language compatibility - users can convert to
  PIL.Image (Python), Sharp (Node.js), or other formats as needed.
  """

  defstruct [
    data: <<>>,
    format: nil,
    image_index: 0,
    page_number: nil,
    width: nil,
    height: nil,
    colorspace: nil,
    bits_per_component: nil,
    is_mask: false,
    description: nil,
    ocr_result: nil,
    bounding_box: nil,
    source_path: nil,
    image_kind: nil,
    kind_confidence: nil,
    cluster_id: nil
  ]

  @typedoc "Extracted image from a document."
  @type t :: %__MODULE__{
          data: binary(),
          format: String.t() | nil,
          image_index: non_neg_integer(),
          page_number: non_neg_integer() | nil,
          width: non_neg_integer() | nil,
          height: non_neg_integer() | nil,
          colorspace: String.t() | nil,
          bits_per_component: non_neg_integer() | nil,
          is_mask: boolean(),
          description: String.t() | nil,
          ocr_result: map() | nil,
          bounding_box: map() | nil,
          source_path: String.t() | nil,
          image_kind: String.t() | nil,
          kind_confidence: float() | nil,
          cluster_id: non_neg_integer() | nil
        }
end

View File

@@ -0,0 +1,27 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.ExtractedUri do
  @moduledoc """
  A URI extracted from a document.

  Represents any link, reference, or resource pointer found during extraction.
  The `kind` field classifies the URI semantically, while `label` carries
  optional human-readable display text.
  """

  @typedoc """
  A URI extracted from a document.

  `kind` defaults to the atom `:hyperlink`, so atoms are part of its type.
  """
  @type t :: %__MODULE__{
          url: String.t() | nil,
          label: String.t() | nil,
          page: non_neg_integer() | nil,
          kind: atom() | String.t() | nil
        }

  defstruct url: nil,
            label: nil,
            page: nil,
            kind: :hyperlink
end

View File

@@ -0,0 +1,111 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.ExtractionConfig do
  @moduledoc """
  Main extraction configuration.

  This struct contains all configuration options for the extraction process.
  It can be loaded from TOML, YAML, or JSON files, or created programmatically.

  When encoded with `Jason`, fields whose value is `nil` are omitted.

  ## Example

  ```rust
  use kreuzberg::core::config::ExtractionConfig;
  // Create with defaults
  let config = ExtractionConfig::default();
  // Load from TOML file
  // let config = ExtractionConfig::from_toml_file("kreuzberg.toml")?;
  ```
  """

  @typedoc """
  Main extraction configuration.

  `result_format` and `output_format` default to the atoms `:unified` and
  `:plain`, so atoms are part of their types.
  """
  @type t :: %__MODULE__{
          use_cache: boolean(),
          enable_quality_processing: boolean(),
          ocr: map() | nil,
          force_ocr: boolean(),
          force_ocr_pages: [non_neg_integer()] | nil,
          disable_ocr: boolean(),
          chunking: map() | nil,
          content_filter: map() | nil,
          images: map() | nil,
          pdf_options: map() | nil,
          token_reduction: map() | nil,
          language_detection: map() | nil,
          pages: map() | nil,
          keywords: map() | nil,
          postprocessor: map() | nil,
          html_options: String.t() | nil,
          html_output: map() | nil,
          extraction_timeout_secs: non_neg_integer() | nil,
          max_concurrent_extractions: non_neg_integer() | nil,
          result_format: atom() | String.t() | nil,
          security_limits: map() | nil,
          max_embedded_file_bytes: non_neg_integer() | nil,
          output_format: atom() | String.t() | nil,
          layout: map() | nil,
          use_layout_for_markdown: boolean(),
          include_document_structure: boolean(),
          acceleration: map() | nil,
          cache_namespace: String.t() | nil,
          cache_ttl_secs: non_neg_integer() | nil,
          email: map() | nil,
          concurrency: String.t() | nil,
          max_archive_depth: non_neg_integer(),
          tree_sitter: map() | nil,
          structured_extraction: map() | nil,
          cancel_token: String.t() | nil
        }

  defstruct use_cache: true,
            enable_quality_processing: true,
            ocr: nil,
            force_ocr: false,
            force_ocr_pages: nil,
            disable_ocr: false,
            chunking: nil,
            content_filter: nil,
            images: nil,
            pdf_options: nil,
            token_reduction: nil,
            language_detection: nil,
            pages: nil,
            keywords: nil,
            postprocessor: nil,
            html_options: nil,
            html_output: nil,
            extraction_timeout_secs: nil,
            max_concurrent_extractions: nil,
            result_format: :unified,
            security_limits: nil,
            max_embedded_file_bytes: nil,
            output_format: :plain,
            layout: nil,
            use_layout_for_markdown: false,
            include_document_structure: false,
            acceleration: nil,
            cache_namespace: nil,
            cache_ttl_secs: nil,
            email: nil,
            concurrency: nil,
            max_archive_depth: 0,
            tree_sitter: nil,
            structured_extraction: nil,
            cancel_token: nil

  defimpl Jason.Encoder do
    @doc false
    # Unset (`nil`) fields are dropped so only populated keys are serialized.
    def encode(value, opts) do
      value
      |> Map.from_struct()
      |> Enum.reject(fn {_k, v} -> is_nil(v) end)
      |> Map.new()
      |> Jason.Encoder.encode(opts)
    end
  end
end

View File

@@ -0,0 +1,25 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.ExtractionDiff do
  @moduledoc "The complete diff between two `ExtractionResult` values."

  @typedoc """
  The complete diff between two `ExtractionResult` values.

  `embedded_changes` defaults to `nil`, so `nil` is part of its type.
  """
  @type t :: %__MODULE__{
          content_diff: [map()],
          tables_added: [map()],
          tables_removed: [map()],
          tables_changed: [map()],
          metadata_changed: String.t() | nil,
          embedded_changes: map() | nil
        }

  defstruct content_diff: [],
            tables_added: [],
            tables_removed: [],
            tables_changed: [],
            metadata_changed: nil,
            embedded_changes: nil
end

View File

@@ -0,0 +1,22 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.ExtractionMethod do
  @moduledoc """
  How the extracted text was produced.

  Every variant is a plain atom; each function below returns one of them.
  """

  @typedoc "How the extracted text was produced."
  @type t :: :native | :ocr | :mixed

  @doc "Returns the `:native` variant."
  @spec native() :: t()
  def native, do: :native

  @doc "Returns the `:ocr` variant."
  @spec ocr() :: t()
  def ocr, do: :ocr

  @doc "Returns the `:mixed` variant."
  @spec mixed() :: t()
  def mixed, do: :mixed
end

View File

@@ -0,0 +1,78 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.ExtractionResult do
  @moduledoc """
  General extraction result used by the core extraction API.

  This is the main result type returned by all extraction functions.

  When encoded with `Jason`, fields whose value is `nil` are omitted.
  """

  @typedoc """
  General extraction result used by the core extraction API.

  `metadata` defaults to `nil`, so `nil` is part of its type.
  """
  @type t :: %__MODULE__{
          content: String.t() | nil,
          mime_type: String.t() | nil,
          metadata: map() | nil,
          extraction_method: String.t() | nil,
          tables: [map()],
          detected_languages: [String.t()] | nil,
          chunks: [map()] | nil,
          images: [map()] | nil,
          pages: [map()] | nil,
          elements: [map()] | nil,
          djot_content: map() | nil,
          ocr_elements: [map()] | nil,
          document: map() | nil,
          extracted_keywords: [map()] | nil,
          quality_score: float() | nil,
          processing_warnings: [map()],
          annotations: [map()] | nil,
          children: [map()] | nil,
          uris: [map()] | nil,
          revisions: [map()] | nil,
          structured_output: String.t() | nil,
          code_intelligence: String.t() | nil,
          llm_usage: [map()] | nil,
          formatted_content: String.t() | nil,
          ocr_internal_document: String.t() | nil
        }

  defstruct content: nil,
            mime_type: nil,
            metadata: nil,
            extraction_method: nil,
            tables: [],
            detected_languages: nil,
            chunks: nil,
            images: nil,
            pages: nil,
            elements: nil,
            djot_content: nil,
            ocr_elements: nil,
            document: nil,
            extracted_keywords: nil,
            quality_score: nil,
            processing_warnings: [],
            annotations: nil,
            children: nil,
            uris: nil,
            revisions: nil,
            structured_output: nil,
            code_intelligence: nil,
            llm_usage: nil,
            formatted_content: nil,
            ocr_internal_document: nil

  defimpl Jason.Encoder do
    @doc false
    # Unset (`nil`) fields are dropped so only populated keys are serialized.
    def encode(value, opts) do
      value
      |> Map.from_struct()
      |> Enum.reject(fn {_k, v} -> is_nil(v) end)
      |> Map.new()
      |> Jason.Encoder.encode(opts)
    end
  end
end

View File

@@ -0,0 +1,30 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.FictionBookMetadata do
  @moduledoc """
  FictionBook (FB2) metadata.

  When encoded with `Jason`, fields whose value is `nil` are omitted.
  """

  defstruct [genres: [], sequences: [], annotation: nil]

  @typedoc "FictionBook (FB2) metadata."
  @type t :: %__MODULE__{
          genres: [String.t()],
          sequences: [String.t()],
          annotation: String.t() | nil
        }

  defimpl Jason.Encoder do
    @doc false
    def encode(meta, opts) do
      # Only fields that are actually set are serialized.
      present =
        for {key, val} <- Map.from_struct(meta), not is_nil(val), into: %{} do
          {key, val}
        end

      Jason.Encoder.encode(present, opts)
    end
  end
end

View File

@@ -0,0 +1,96 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.FileExtractionConfig do
  @moduledoc """
  Per-file extraction configuration overrides for batch processing.

  All fields are `Option<T>` — `None` means "use the batch-level default."
  This type is used with `batch_extract_files` and
  `batch_extract_bytes` to allow heterogeneous
  extraction settings within a single batch.

  When encoded with `Jason`, fields whose value is `nil` are omitted.

  ## Excluded Fields

  The following `ExtractionConfig` fields are batch-level only and
  cannot be overridden per file:

  - `max_concurrent_extractions` — controls batch parallelism
  - `use_cache` — global caching policy
  - `acceleration` — shared ONNX execution provider
  - `security_limits` — global archive security policy

  ## Example

  ```rust
  use kreuzberg::FileExtractionConfig;
  // Override just OCR forcing for a specific file
  let config = FileExtractionConfig {
  force_ocr: Some(true),
  ..Default::default()
  };
  ```
  """

  defstruct [
    enable_quality_processing: nil,
    ocr: nil,
    force_ocr: nil,
    force_ocr_pages: nil,
    disable_ocr: nil,
    chunking: nil,
    content_filter: nil,
    images: nil,
    pdf_options: nil,
    token_reduction: nil,
    language_detection: nil,
    pages: nil,
    keywords: nil,
    postprocessor: nil,
    html_options: nil,
    result_format: nil,
    output_format: nil,
    include_document_structure: nil,
    layout: nil,
    timeout_secs: nil,
    tree_sitter: nil,
    structured_extraction: nil
  ]

  @typedoc "Per-file extraction configuration overrides for batch processing."
  @type t :: %__MODULE__{
          enable_quality_processing: boolean() | nil,
          ocr: map() | nil,
          force_ocr: boolean() | nil,
          force_ocr_pages: [non_neg_integer()] | nil,
          disable_ocr: boolean() | nil,
          chunking: map() | nil,
          content_filter: map() | nil,
          images: map() | nil,
          pdf_options: map() | nil,
          token_reduction: map() | nil,
          language_detection: map() | nil,
          pages: map() | nil,
          keywords: map() | nil,
          postprocessor: map() | nil,
          html_options: String.t() | nil,
          result_format: String.t() | nil,
          output_format: String.t() | nil,
          include_document_structure: boolean() | nil,
          layout: map() | nil,
          timeout_secs: non_neg_integer() | nil,
          tree_sitter: map() | nil,
          structured_extraction: map() | nil
        }

  defimpl Jason.Encoder do
    @doc false
    def encode(config, opts) do
      # Only overrides that are actually set are serialized.
      present =
        for {key, val} <- Map.from_struct(config), not is_nil(val), into: %{} do
          {key, val}
        end

      Jason.Encoder.encode(present, opts)
    end
  end
end

View File

@@ -0,0 +1,17 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.Footnote do
  @moduledoc "Footnote in Djot."

  defstruct [label: nil, content: []]

  @typedoc "Footnote in Djot."
  @type t :: %__MODULE__{
          label: String.t() | nil,
          content: [map()]
        }
end

View File

@@ -0,0 +1,37 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.FormatMetadata do
  @moduledoc """
  Format-specific metadata (discriminated union).

  Only one format type can exist per extraction result. This provides
  type-safe, clean metadata without nested optionals.

  This module only declares types. Each variant is a map tagged by its `:type`
  key; all variants carry their payload under `:metadata`, except `code`, which
  carries a string under `:value`.
  """

  @typedoc "Format-specific metadata (discriminated union)."
  @type t :: term()

  # Document formats
  @type pdf :: %{type: :pdf, metadata: Kreuzberg.PdfMetadata.t()}
  @type docx :: %{type: :docx, metadata: Kreuzberg.DocxMetadata.t()}
  @type excel :: %{type: :excel, metadata: Kreuzberg.ExcelMetadata.t()}
  @type email :: %{type: :email, metadata: Kreuzberg.EmailMetadata.t()}
  @type pptx :: %{type: :pptx, metadata: Kreuzberg.PptxMetadata.t()}
  @type archive :: %{type: :archive, metadata: Kreuzberg.ArchiveMetadata.t()}
  @type image :: %{type: :image, metadata: Kreuzberg.ImageMetadata.t()}

  # Markup, text and OCR
  @type xml :: %{type: :xml, metadata: Kreuzberg.XmlMetadata.t()}
  @type text :: %{type: :text, metadata: Kreuzberg.TextMetadata.t()}
  @type html :: %{type: :html, metadata: Kreuzberg.HtmlMetadata.t()}
  @type ocr :: %{type: :ocr, metadata: Kreuzberg.OcrMetadata.t()}

  # Remaining variants
  @type csv :: %{type: :csv, metadata: Kreuzberg.CsvMetadata.t()}
  @type bibtex :: %{type: :bibtex, metadata: Kreuzberg.BibtexMetadata.t()}
  @type citation :: %{type: :citation, metadata: Kreuzberg.CitationMetadata.t()}
  @type fiction_book :: %{type: :fiction_book, metadata: Kreuzberg.FictionBookMetadata.t()}
  @type dbf :: %{type: :dbf, metadata: Kreuzberg.DbfMetadata.t()}
  @type jats :: %{type: :jats, metadata: Kreuzberg.JatsMetadata.t()}
  @type epub :: %{type: :epub, metadata: Kreuzberg.EpubMetadata.t()}
  @type pst :: %{type: :pst, metadata: Kreuzberg.PstMetadata.t()}

  # The only variant whose payload is a plain string under `:value`.
  @type code :: %{type: :code, value: String.t()}
end

View File

@@ -0,0 +1,31 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.FormattedBlock do
  @moduledoc """
  Block-level element in a Djot document.

  Represents structural elements like headings, paragraphs, lists, code blocks, etc.
  """

  @typedoc """
  Block-level element in a Djot document.

  `block_type` defaults to the atom `:paragraph`, so atoms are part of its type.
  """
  @type t :: %__MODULE__{
          block_type: atom() | String.t() | nil,
          level: non_neg_integer() | nil,
          inline_content: [map()],
          attributes: String.t() | nil,
          language: String.t() | nil,
          code: String.t() | nil,
          children: [map()]
        }

  defstruct block_type: :paragraph,
            level: nil,
            inline_content: [],
            attributes: nil,
            language: nil,
            code: nil,
            children: []
end

View File

@@ -0,0 +1,27 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.GridCell do
  @moduledoc "Individual grid cell with position and span metadata."

  defstruct [
    content: nil,
    row: 0,
    col: 0,
    row_span: 0,
    col_span: 0,
    is_header: false,
    bbox: nil
  ]

  @typedoc "Individual grid cell with position and span metadata."
  @type t :: %__MODULE__{
          content: String.t() | nil,
          row: non_neg_integer(),
          col: non_neg_integer(),
          row_span: non_neg_integer(),
          col_span: non_neg_integer(),
          is_header: boolean(),
          bbox: map() | nil
        }
end

View File

@@ -0,0 +1,29 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.HeaderMetadata do
  @moduledoc "Header/heading element metadata."

  defstruct [level: 0, text: nil, id: nil, depth: 0, html_offset: 0]

  @typedoc "Header/heading element metadata."
  @type t :: %__MODULE__{
          level: non_neg_integer(),
          text: String.t() | nil,
          id: String.t() | nil,
          depth: non_neg_integer(),
          html_offset: non_neg_integer()
        }

  @doc """
  Validate that the header level is within valid range (1-6).

  Returns `true` when `level` is between 1 and 6 inclusive, `false` otherwise.
  A struct built with the default `level` of `0` is therefore not valid.
  """
  @spec valid?(t()) :: boolean()
  def valid?(%__MODULE__{level: level}) when level >= 1 and level <= 6, do: true
  def valid?(%__MODULE__{}), do: false
end

View File

@@ -0,0 +1,19 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.HeadingContext do
  @moduledoc """
  Heading context for a chunk within a Markdown document.

  Contains the heading hierarchy from document root to this chunk's section.
  """

  defstruct [headings: []]

  @typedoc "Heading context for a chunk within a Markdown document."
  @type t :: %__MODULE__{headings: [map()]}
end

View File

@@ -0,0 +1,17 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.HeadingLevel do
  @moduledoc "A single heading in the hierarchy."

  defstruct [level: 0, text: nil]

  @typedoc "A single heading in the hierarchy."
  @type t :: %__MODULE__{
          level: non_neg_integer(),
          text: String.t() | nil
        }
end

View File

@@ -0,0 +1,26 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.HierarchicalBlock do
  @moduledoc """
  A text block with hierarchy level assignment.

  Represents a block of text with semantic heading information extracted from
  font size clustering and hierarchical analysis.
  """

  defstruct [text: nil, font_size: 0.0, level: nil, bbox: nil]

  @typedoc "A text block with hierarchy level assignment."
  @type t :: %__MODULE__{
          text: String.t() | nil,
          font_size: float(),
          level: String.t() | nil,
          bbox: [float()] | nil
        }
end

View File

@@ -0,0 +1,38 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.HierarchyConfig do
  @moduledoc """
  Hierarchy extraction configuration for PDF text structure analysis.

  Enables extraction of document hierarchy levels (H1-H6) based on font size
  clustering and semantic analysis. When enabled, hierarchical blocks are
  included in page content.

  When encoded with `Jason`, fields whose value is `nil` are omitted.
  """

  defstruct [
    enabled: true,
    k_clusters: 3,
    include_bbox: true,
    ocr_coverage_threshold: nil
  ]

  @typedoc "Hierarchy extraction configuration for PDF text structure analysis."
  @type t :: %__MODULE__{
          enabled: boolean(),
          k_clusters: non_neg_integer(),
          include_bbox: boolean(),
          ocr_coverage_threshold: float() | nil
        }

  defimpl Jason.Encoder do
    @doc false
    def encode(config, opts) do
      # Only fields that are actually set are serialized.
      present =
        for {key, val} <- Map.from_struct(config), not is_nil(val), into: %{} do
          {key, val}
        end

      Jason.Encoder.encode(present, opts)
    end
  end
end

View File

@@ -0,0 +1,59 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.HtmlMetadata do
  @moduledoc """
  HTML metadata extracted from HTML documents.

  Includes document-level metadata, Open Graph data, Twitter Card metadata,
  and extracted structural elements (headers, links, images, structured data).

  When encoded with `Jason`, fields whose value is `nil` are omitted.
  """

  defstruct [
    title: nil,
    description: nil,
    keywords: [],
    author: nil,
    canonical_url: nil,
    base_href: nil,
    language: nil,
    text_direction: nil,
    open_graph: %{},
    twitter_card: %{},
    meta_tags: %{},
    headers: [],
    links: [],
    images: [],
    structured_data: []
  ]

  @typedoc "HTML metadata extracted from HTML documents."
  @type t :: %__MODULE__{
          title: String.t() | nil,
          description: String.t() | nil,
          keywords: [String.t()],
          author: String.t() | nil,
          canonical_url: String.t() | nil,
          base_href: String.t() | nil,
          language: String.t() | nil,
          text_direction: String.t() | nil,
          open_graph: map(),
          twitter_card: map(),
          meta_tags: map(),
          headers: [map()],
          links: [map()],
          images: [map()],
          structured_data: [map()]
        }

  defimpl Jason.Encoder do
    @doc false
    def encode(meta, opts) do
      # Only fields that are actually set are serialized; empty lists and maps are kept.
      present =
        for {key, val} <- Map.from_struct(meta), not is_nil(val), into: %{} do
          {key, val}
        end

      Jason.Encoder.encode(present, opts)
    end
  end
end

View File

@@ -0,0 +1,53 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.HtmlOutputConfig do
  @moduledoc """
  Configuration for styled HTML output.

  When set on [`ExtractionConfig::html_output`] alongside
  `output_format = OutputFormat::Html`, the pipeline builds a
  [`StyledHtmlRenderer`](crate::rendering::StyledHtmlRenderer) instead of
  the plain comrak-based renderer.

  When encoded with `Jason`, fields whose value is `nil` are omitted.

  ## Example

  ```rust
  use kreuzberg::core::config::{HtmlOutputConfig, HtmlTheme};
  let config = HtmlOutputConfig {
  theme: HtmlTheme::GitHub,
  css: Some(".kb-p { font-size: 1.1rem; }".to_string()),
  ..Default::default()
  };
  ```
  """

  @typedoc """
  Configuration for styled HTML output.

  `theme` defaults to the atom `:unstyled`, one of the `Kreuzberg.HtmlTheme`
  values, so those atoms are part of its type.
  """
  @type t :: %__MODULE__{
          css: String.t() | nil,
          css_file: String.t() | nil,
          theme: Kreuzberg.HtmlTheme.t() | String.t() | nil,
          class_prefix: String.t() | nil,
          embed_css: boolean()
        }

  defstruct css: nil,
            css_file: nil,
            theme: :unstyled,
            class_prefix: nil,
            embed_css: true

  defimpl Jason.Encoder do
    @doc false
    # Unset (`nil`) fields are dropped so only populated keys are serialized.
    def encode(value, opts) do
      value
      |> Map.from_struct()
      |> Enum.reject(fn {_k, v} -> is_nil(v) end)
      |> Map.new()
      |> Jason.Encoder.encode(opts)
    end
  end
end

View File

@@ -0,0 +1,33 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.HtmlTheme do
  @moduledoc """
  Built-in HTML theme selection.

  Every theme is a plain atom; each function below returns one of them.
  """

  @typedoc "Built-in HTML theme selection."
  @type t :: :default | :git_hub | :dark | :light | :unstyled

  @doc "Sensible defaults: system font stack, neutral colours, readable line measure. CSS custom properties (`--kb-*`) are all defined so user CSS can override individual values."
  @spec default() :: t()
  def default, do: :default

  @doc "GitHub Markdown-inspired palette and spacing."
  @spec git_hub() :: t()
  def git_hub, do: :git_hub

  @doc "Dark background, light text."
  @spec dark() :: t()
  def dark, do: :dark

  @doc "Minimal light theme with generous whitespace."
  @spec light() :: t()
  def light, do: :light

  @doc "No built-in stylesheet emitted. CSS custom properties are still defined on `:root` so user stylesheets can reference `var(--kb-*)` tokens."
  @spec unstyled() :: t()
  def unstyled, do: :unstyled
end

View File

@@ -0,0 +1,50 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.ImageExtractionConfig do
  @moduledoc """
  Image extraction configuration.

  When encoded with `Jason`, fields whose value is `nil` are omitted.
  """

  defstruct [
    extract_images: true,
    target_dpi: 300,
    max_image_dimension: 4096,
    inject_placeholders: true,
    auto_adjust_dpi: true,
    min_dpi: 72,
    max_dpi: 600,
    max_images_per_page: nil,
    classify: true,
    include_page_rasters: false,
    run_ocr_on_images: true,
    ocr_text_only: false,
    append_ocr_text: false
  ]

  @typedoc "Image extraction configuration."
  @type t :: %__MODULE__{
          extract_images: boolean(),
          target_dpi: integer(),
          max_image_dimension: integer(),
          inject_placeholders: boolean(),
          auto_adjust_dpi: boolean(),
          min_dpi: integer(),
          max_dpi: integer(),
          max_images_per_page: non_neg_integer() | nil,
          classify: boolean(),
          include_page_rasters: boolean(),
          run_ocr_on_images: boolean(),
          ocr_text_only: boolean(),
          append_ocr_text: boolean()
        }

  defimpl Jason.Encoder do
    @doc false
    def encode(config, opts) do
      # Only fields that are actually set are serialized; `false` values are kept.
      present =
        for {key, val} <- Map.from_struct(config), not is_nil(val), into: %{} do
          {key, val}
        end

      Jason.Encoder.encode(present, opts)
    end
  end
end

View File

@@ -0,0 +1,73 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.ImageKind do
  @moduledoc """
  Heuristic classification of what an image likely depicts.

  Each kind is a plain atom; the zero-arity functions below return them.
  """

  @typedoc "Atom naming what an image was heuristically classified as."
  @type t ::
          :photograph
          | :diagram
          | :chart
          | :drawing
          | :text_block
          | :decoration
          | :logo
          | :icon
          | :tile_fragment
          | :mask
          | :page_raster
          | :unknown

  @doc "Returns `:photograph` — photographic image (natural scene, photograph)."
  @spec photograph() :: t()
  def photograph do
    :photograph
  end

  @doc "Returns `:diagram` — technical or schematic diagram."
  @spec diagram() :: t()
  def diagram do
    :diagram
  end

  @doc "Returns `:chart` — chart, graph, or plot."
  @spec chart() :: t()
  def chart do
    :chart
  end

  @doc "Returns `:drawing` — freehand or technical drawing."
  @spec drawing() :: t()
  def drawing do
    :drawing
  end

  @doc "Returns `:text_block` — text-heavy image (scanned text, document)."
  @spec text_block() :: t()
  def text_block do
    :text_block
  end

  @doc "Returns `:decoration` — decorative element or border."
  @spec decoration() :: t()
  def decoration do
    :decoration
  end

  @doc "Returns `:logo` — logo or brand mark."
  @spec logo() :: t()
  def logo do
    :logo
  end

  @doc "Returns `:icon` — small icon."
  @spec icon() :: t()
  def icon do
    :icon
  end

  @doc "Returns `:tile_fragment` — fragment of a larger tiled image (tile of a technical drawing)."
  @spec tile_fragment() :: t()
  def tile_fragment do
    :tile_fragment
  end

  @doc "Returns `:mask` — mask or transparency map."
  @spec mask() :: t()
  def mask do
    :mask
  end

  @doc """
  Returns `:page_raster` — full-page render produced during OCR preprocessing;
  used as a citation thumbnail.
  """
  @spec page_raster() :: t()
  def page_raster do
    :page_raster
  end

  @doc "Returns `:unknown` — could not classify with reasonable confidence."
  @spec unknown() :: t()
  def unknown do
    :unknown
  end
end

View File

@@ -0,0 +1,36 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.ImageMetadata do
  @moduledoc """
  Image metadata extracted from image files.

  Includes dimensions, format, and EXIF data. When encoded with Jason, fields
  whose value is `nil` are left out of the resulting JSON object.
  """

  defstruct [
    width: 0,
    height: 0,
    format: nil,
    exif: %{}
  ]

  @typedoc "Image metadata struct: dimensions, format, and EXIF data."
  @type t :: %__MODULE__{
          width: non_neg_integer(),
          height: non_neg_integer(),
          format: String.t() | nil,
          exif: map()
        }

  defimpl Jason.Encoder do
    # Encodes the struct as a JSON object, skipping every field that is nil.
    @doc false
    def encode(metadata, opts) do
      present =
        for {field, datum} <- Map.from_struct(metadata), not is_nil(datum), into: %{} do
          {field, datum}
        end

      Jason.Encoder.encode(present, opts)
    end
  end
end

View File

@@ -0,0 +1,25 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.ImageMetadataType do
  @moduledoc """
  Image element metadata.

  A plain struct: `src`, `alt`, `title` and `dimensions` default to `nil`,
  `attributes` defaults to an empty list, and `image_type` defaults to the atom
  `:data_uri`.
  """
  @typedoc """
  Image element metadata.

  `image_type` admits atoms as well as strings, because the struct's own
  default value is the atom `:data_uri`.
  """
  @type t :: %__MODULE__{
          src: String.t() | nil,
          alt: String.t() | nil,
          title: String.t() | nil,
          dimensions: [non_neg_integer()] | nil,
          image_type: atom() | String.t() | nil,
          attributes: [[String.t()]]
        }
  # NOTE(review): the `:data_uri` default below is spelled differently from the
  # `:"data-uri"` atom returned by `Kreuzberg.ImageType.data_uri/0`. The default
  # is left untouched here; confirm which spelling consumers of this struct expect.
  defstruct src: nil,
            alt: nil,
            title: nil,
            dimensions: nil,
            image_type: :data_uri,
            attributes: []
end

View File

@@ -0,0 +1,44 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.ImagePreprocessingConfig do
  @moduledoc """
  Image preprocessing configuration for OCR.

  These settings control how images are preprocessed before OCR to improve
  text recognition quality. Different preprocessing strategies work better
  for different document types.

  When encoded with Jason, fields whose value is `nil` are left out of the
  resulting JSON object.
  """

  defstruct [
    target_dpi: 300,
    auto_rotate: true,
    deskew: true,
    denoise: false,
    contrast_enhance: false,
    binarization_method: "otsu",
    invert_colors: false
  ]

  @typedoc "OCR image preprocessing configuration struct."
  @type t :: %__MODULE__{
          target_dpi: integer(),
          auto_rotate: boolean(),
          deskew: boolean(),
          denoise: boolean(),
          contrast_enhance: boolean(),
          binarization_method: String.t() | nil,
          invert_colors: boolean()
        }

  defimpl Jason.Encoder do
    # Encodes the struct as a JSON object, skipping every field that is nil.
    @doc false
    def encode(config, opts) do
      present =
        for {field, setting} <- Map.from_struct(config), not is_nil(setting), into: %{} do
          {field, setting}
        end

      Jason.Encoder.encode(present, opts)
    end
  end
end

View File

@@ -0,0 +1,42 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.ImagePreprocessingMetadata do
  @moduledoc """
  Image preprocessing metadata.

  Tracks the transformations applied to an image during OCR preprocessing,
  including DPI normalization, resizing, and resampling.
  """

  defstruct [
    original_dimensions: [],
    original_dpi: [],
    target_dpi: 0,
    scale_factor: 0.0,
    auto_adjusted: false,
    final_dpi: 0,
    # The remaining optional fields start out as nil.
    new_dimensions: nil,
    resample_method: nil,
    dimension_clamped: false,
    calculated_dpi: nil,
    skipped_resize: false,
    resize_error: nil
  ]

  @typedoc "Struct recording what OCR preprocessing did to an image."
  @type t :: %__MODULE__{
          original_dimensions: [non_neg_integer()],
          original_dpi: [float()],
          target_dpi: integer(),
          scale_factor: float(),
          auto_adjusted: boolean(),
          final_dpi: integer(),
          new_dimensions: [non_neg_integer()] | nil,
          resample_method: String.t() | nil,
          dimension_clamped: boolean(),
          calculated_dpi: integer() | nil,
          skipped_resize: boolean(),
          resize_error: String.t() | nil
        }
end

View File

@@ -0,0 +1,29 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.ImageType do
  @moduledoc """
  Image type classification.

  Each type is a plain atom; note that two of them contain a hyphen and are
  therefore written with the quoted-atom syntax (`:"data-uri"`, `:"inline-svg"`).
  """

  @typedoc "Atom naming how an image is referenced or embedded."
  @type t :: :"data-uri" | :"inline-svg" | :external | :relative

  @doc "Returns `:\"data-uri\"` — data URI image."
  @spec data_uri() :: t()
  def data_uri do
    :"data-uri"
  end

  @doc "Returns `:\"inline-svg\"` — inline SVG."
  @spec inline_svg() :: t()
  def inline_svg do
    :"inline-svg"
  end

  @doc "Returns `:external` — external image URL."
  @spec external() :: t()
  def external do
    :external
  end

  @doc "Returns `:relative` — relative path image."
  @spec relative() :: t()
  def relative do
    :relative
  end
end

View File

@@ -0,0 +1,25 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.InlineElement do
  @moduledoc """
  Inline element within a block.

  Represents text with formatting, links, images, etc. A freshly built struct
  has `element_type: :text` and `nil` for every other field.
  """
  @typedoc """
  Inline element within a block.

  `element_type` admits the `Kreuzberg.InlineType` atoms as well as strings,
  because the struct's own default value is the atom `:text`.
  """
  @type t :: %__MODULE__{
          element_type: Kreuzberg.InlineType.t() | String.t() | nil,
          content: String.t() | nil,
          attributes: String.t() | nil,
          metadata: map() | nil
        }
  defstruct element_type: :text,
            content: nil,
            attributes: nil,
            metadata: nil
end

View File

@@ -0,0 +1,77 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.InlineType do
  @moduledoc """
  Types of inline elements in Djot.

  Each type is a plain atom; the zero-arity functions below return them.
  """

  @typedoc "Atom naming one kind of Djot inline element."
  @type t ::
          :text
          | :strong
          | :emphasis
          | :highlight
          | :subscript
          | :superscript
          | :insert
          | :delete
          | :code
          | :link
          | :image
          | :span
          | :math
          | :raw_inline
          | :footnote_ref
          | :symbol

  @doc "Returns the `:text` inline type."
  @spec text() :: t()
  def text, do: :text

  @doc "Returns the `:strong` inline type."
  @spec strong() :: t()
  def strong, do: :strong

  @doc "Returns the `:emphasis` inline type."
  @spec emphasis() :: t()
  def emphasis, do: :emphasis

  @doc "Returns the `:highlight` inline type."
  @spec highlight() :: t()
  def highlight, do: :highlight

  @doc "Returns the `:subscript` inline type."
  @spec subscript() :: t()
  def subscript, do: :subscript

  @doc "Returns the `:superscript` inline type."
  @spec superscript() :: t()
  def superscript, do: :superscript

  @doc "Returns the `:insert` inline type."
  @spec insert() :: t()
  def insert, do: :insert

  @doc "Returns the `:delete` inline type."
  @spec delete() :: t()
  def delete, do: :delete

  @doc "Returns the `:code` inline type."
  @spec code() :: t()
  def code, do: :code

  @doc "Returns the `:link` inline type."
  @spec link() :: t()
  def link, do: :link

  @doc "Returns the `:image` inline type."
  @spec image() :: t()
  def image, do: :image

  @doc "Returns the `:span` inline type."
  @spec span() :: t()
  def span, do: :span

  @doc "Returns the `:math` inline type."
  @spec math() :: t()
  def math, do: :math

  @doc "Returns the `:raw_inline` inline type."
  @spec raw_inline() :: t()
  def raw_inline, do: :raw_inline

  @doc "Returns the `:footnote_ref` inline type."
  @spec footnote_ref() :: t()
  def footnote_ref, do: :footnote_ref

  @doc "Returns the `:symbol` inline type."
  @spec symbol() :: t()
  def symbol, do: :symbol
end

View File

@@ -0,0 +1,32 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.JatsMetadata do
  @moduledoc """
  JATS (Journal Article Tag Suite) metadata.

  When encoded with Jason, fields whose value is `nil` are left out of the
  resulting JSON object.
  """

  defstruct [
    copyright: nil,
    license: nil,
    history_dates: %{},
    contributor_roles: []
  ]

  @typedoc "JATS (Journal Article Tag Suite) metadata struct."
  @type t :: %__MODULE__{
          copyright: String.t() | nil,
          license: String.t() | nil,
          history_dates: map(),
          contributor_roles: [map()]
        }

  defimpl Jason.Encoder do
    # Encodes the struct as a JSON object, skipping every field that is nil.
    @doc false
    def encode(metadata, opts) do
      present =
        for {field, datum} <- Map.from_struct(metadata), not is_nil(datum), into: %{} do
          {field, datum}
        end

      Jason.Encoder.encode(present, opts)
    end
  end
end

21
packages/elixir/lib/kreuzberg/keyword.ex generated Normal file
View File

@@ -0,0 +1,21 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.Keyword do
  @moduledoc """
  Extracted keyword with metadata.

  A freshly built struct has `score: 0.0`, `algorithm: :yake`, and `nil` for
  `text` and `positions`.
  """
  @typedoc """
  Extracted keyword with metadata.

  `algorithm` admits the `Kreuzberg.KeywordAlgorithm` atoms as well as strings,
  because the struct's own default value is the atom `:yake`.
  """
  @type t :: %__MODULE__{
          text: String.t() | nil,
          score: float(),
          algorithm: Kreuzberg.KeywordAlgorithm.t() | String.t() | nil,
          positions: [non_neg_integer()] | nil
        }
  defstruct text: nil,
            score: 0.0,
            algorithm: :yake,
            positions: nil
end

View File

@@ -0,0 +1,21 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.KeywordAlgorithm do
  @moduledoc """
  Keyword algorithm selection.

  Each algorithm is a plain atom; the zero-arity functions below return them.
  """

  @typedoc "Atom naming a keyword extraction algorithm."
  @type t :: :yake | :rake

  @doc "Returns `:yake` — YAKE (Yet Another Keyword Extractor), a statistical approach."
  @spec yake() :: t()
  def yake do
    :yake
  end

  @doc "Returns `:rake` — RAKE (Rapid Automatic Keyword Extraction), co-occurrence based."
  @spec rake() :: t()
  def rake do
    :rake
  end
end

View File

@@ -0,0 +1,38 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.KeywordConfig do
  @moduledoc """
  Keyword extraction configuration.

  Every field has a default, so `%Kreuzberg.KeywordConfig{}` is a complete
  configuration. When encoded with Jason, fields whose value is `nil` are left
  out of the resulting JSON object.
  """
  @typedoc """
  Keyword extraction configuration.

  `algorithm` admits the `Kreuzberg.KeywordAlgorithm` atoms as well as strings,
  because the struct's own default value is the atom `:yake`.
  """
  @type t :: %__MODULE__{
          algorithm: Kreuzberg.KeywordAlgorithm.t() | String.t() | nil,
          max_keywords: non_neg_integer(),
          min_score: float(),
          ngram_range: [non_neg_integer()],
          language: String.t() | nil,
          yake_params: map() | nil,
          rake_params: map() | nil
        }
  # `min_score` defaults to the float 0.0 (not the integer 0) so that the
  # default value satisfies the `float()` type declared above.
  defstruct algorithm: :yake,
            max_keywords: 10,
            min_score: 0.0,
            ngram_range: [],
            language: nil,
            yake_params: nil,
            rake_params: nil

  defimpl Jason.Encoder do
    # Encodes the struct as a JSON object, skipping every field that is nil.
    @doc false
    def encode(value, opts) do
      value
      |> Map.from_struct()
      |> Enum.reject(fn {_k, v} -> v == nil end)
      |> Enum.into(%{})
      |> Jason.Encoder.encode(opts)
    end
  end
end

View File

@@ -0,0 +1,30 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.LanguageDetectionConfig do
  @moduledoc """
  Language detection configuration.

  Every field has a default, so `%Kreuzberg.LanguageDetectionConfig{}` is a
  complete configuration. When encoded with Jason, fields whose value is `nil`
  are left out of the resulting JSON object.
  """

  defstruct [
    enabled: true,
    min_confidence: 0.8,
    detect_multiple: false
  ]

  @typedoc "Language detection configuration struct."
  @type t :: %__MODULE__{
          enabled: boolean(),
          min_confidence: float(),
          detect_multiple: boolean()
        }

  defimpl Jason.Encoder do
    # Encodes the struct as a JSON object, skipping every field that is nil.
    @doc false
    def encode(config, opts) do
      present =
        for {field, setting} <- Map.from_struct(config), not is_nil(setting), into: %{} do
          {field, setting}
        end

      Jason.Encoder.encode(present, opts)
    end
  end
end

View File

@@ -0,0 +1,89 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.LayoutClass do
  @moduledoc """
  The 17 canonical document layout classes.

  All model backends (RT-DETR, YOLO, etc.) map their native class IDs
  to this shared set. Models with fewer classes (DocLayNet: 11, PubLayNet: 5)
  map to the closest equivalent.

  Wire format is snake_case in all serializers (JSON, TOML, YAML).

  Each class is a plain atom; the zero-arity functions below return them.
  """

  @typedoc "Atom naming one of the 17 canonical document layout classes."
  @type t ::
          :caption
          | :footnote
          | :formula
          | :list_item
          | :page_footer
          | :page_header
          | :picture
          | :section_header
          | :table
          | :text
          | :title
          | :document_index
          | :code
          | :checkbox_selected
          | :checkbox_unselected
          | :form
          | :key_value_region

  @doc "Returns the `:caption` layout class."
  @spec caption() :: t()
  def caption, do: :caption

  @doc "Returns the `:footnote` layout class."
  @spec footnote() :: t()
  def footnote, do: :footnote

  @doc "Returns the `:formula` layout class."
  @spec formula() :: t()
  def formula, do: :formula

  @doc "Returns the `:list_item` layout class."
  @spec list_item() :: t()
  def list_item, do: :list_item

  @doc "Returns the `:page_footer` layout class."
  @spec page_footer() :: t()
  def page_footer, do: :page_footer

  @doc "Returns the `:page_header` layout class."
  @spec page_header() :: t()
  def page_header, do: :page_header

  @doc "Returns the `:picture` layout class."
  @spec picture() :: t()
  def picture, do: :picture

  @doc "Returns the `:section_header` layout class."
  @spec section_header() :: t()
  def section_header, do: :section_header

  @doc "Returns the `:table` layout class."
  @spec table() :: t()
  def table, do: :table

  @doc "Returns the `:text` layout class."
  @spec text() :: t()
  def text, do: :text

  @doc "Returns the `:title` layout class."
  @spec title() :: t()
  def title, do: :title

  @doc "Returns the `:document_index` layout class."
  @spec document_index() :: t()
  def document_index, do: :document_index

  @doc "Returns the `:code` layout class."
  @spec code() :: t()
  def code, do: :code

  @doc "Returns the `:checkbox_selected` layout class."
  @spec checkbox_selected() :: t()
  def checkbox_selected, do: :checkbox_selected

  @doc "Returns the `:checkbox_unselected` layout class."
  @spec checkbox_unselected() :: t()
  def checkbox_unselected, do: :checkbox_unselected

  @doc "Returns the `:form` layout class."
  @spec form() :: t()
  def form, do: :form

  @doc "Returns the `:key_value_region` layout class."
  @spec key_value_region() :: t()
  def key_value_region, do: :key_value_region
end

View File

@@ -0,0 +1,19 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.LayoutDetection do
  @moduledoc """
  A single layout detection result.

  A freshly built struct has `class_name: :caption`, `confidence: 0.0` and
  `bbox: nil`.
  """
  @typedoc """
  A single layout detection result.

  `class_name` admits the `Kreuzberg.LayoutClass` atoms as well as strings,
  because the struct's own default value is the atom `:caption`. `bbox` admits
  `nil` because that is its default value.
  """
  @type t :: %__MODULE__{
          class_name: Kreuzberg.LayoutClass.t() | String.t() | nil,
          confidence: float(),
          bbox: map() | nil
        }
  defstruct class_name: :caption,
            confidence: 0.0,
            bbox: nil
end

View File

@@ -0,0 +1,38 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
defmodule Kreuzberg.LayoutDetectionConfig do
  @moduledoc """
  Layout detection configuration.

  Controls layout detection behavior in the extraction pipeline.
  When set on `ExtractionConfig`, layout detection is enabled for PDF
  extraction.

  When encoded with Jason, fields whose value is `nil` are left out of the
  resulting JSON object.
  """
  @typedoc """
  Layout detection configuration.

  `table_model` admits atoms as well as strings, because the struct's own
  default value is the atom `:tatr`.
  """
  @type t :: %__MODULE__{
          confidence_threshold: float() | nil,
          apply_heuristics: boolean(),
          table_model: atom() | String.t() | nil,
          acceleration: map() | nil
        }
  defstruct confidence_threshold: nil,
            apply_heuristics: true,
            table_model: :tatr,
            acceleration: nil

  defimpl Jason.Encoder do
    # Encodes the struct as a JSON object, skipping every field that is nil.
    @doc false
    def encode(value, opts) do
      value
      |> Map.from_struct()
      |> Enum.reject(fn {_k, v} -> v == nil end)
      |> Enum.into(%{})
      |> Jason.Encoder.encode(opts)
    end
  end
end

Some files were not shown because too many files have changed in this diff Show More