This commit is contained in:
276
packages/elixir/lib/kreuzberg.ex
generated
Normal file
276
packages/elixir/lib/kreuzberg.ex
generated
Normal file
@@ -0,0 +1,276 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg do
  @moduledoc "High-level API for kreuzberg"

  alias Kreuzberg.Native

  # Every function in this module forwards to `Kreuzberg.Native`. Pure
  # pass-throughs are expressed with `defdelegate`; functions that fill in a
  # `nil` argument or unpack a keyword list keep an explicit body.

  @doc "Extract content from a byte array, passing `nil` as the config."
  @spec extract_bytes_async(binary(), String.t()) ::
          {:ok, map()} | {:error, atom(), String.t()}
  def extract_bytes_async(content, mime_type),
    do: Native.extract_bytes_async(content, mime_type, nil)

  @doc "Extract content from a byte array with an explicit config."
  @spec extract_bytes_async(binary(), String.t(), String.t() | nil) ::
          {:ok, map()} | {:error, atom(), String.t()}
  defdelegate extract_bytes_async(content, mime_type, config), to: Native

  @doc """
  Extract content from a file.

  Reads `:mime_type` and `:config` from `opts`; a missing key is passed on as `nil`.
  """
  @spec extract_file_async(String.t(), keyword()) ::
          {:ok, map()} | {:error, atom(), String.t()}
  def extract_file_async(path, opts \\ []) do
    mime_type = Keyword.get(opts, :mime_type)
    config = Keyword.get(opts, :config)
    Native.extract_file_async(path, mime_type, config)
  end

  @doc """
  Synchronous wrapper for file extraction.

  Reads `:mime_type` and `:config` from `opts`; a missing key is passed on as `nil`.
  """
  @spec extract_file_sync(String.t(), keyword()) ::
          {:ok, map()} | {:error, atom(), String.t()}
  def extract_file_sync(path, opts \\ []) do
    mime_type = Keyword.get(opts, :mime_type)
    config = Keyword.get(opts, :config)
    Native.extract_file_sync(path, mime_type, config)
  end

  @doc "Synchronous wrapper for byte-array extraction, passing `nil` as the config."
  @spec extract_bytes_sync(binary(), String.t()) ::
          {:ok, map()} | {:error, atom(), String.t()}
  def extract_bytes_sync(content, mime_type),
    do: Native.extract_bytes_sync(content, mime_type, nil)

  @doc "Synchronous wrapper for byte-array extraction with an explicit config."
  @spec extract_bytes_sync(binary(), String.t(), String.t() | nil) ::
          {:ok, map()} | {:error, atom(), String.t()}
  defdelegate extract_bytes_sync(content, mime_type, config), to: Native

  @doc "Detect MIME type from raw file bytes."
  @spec detect_mime_type_from_bytes(binary()) ::
          {:ok, String.t()} | {:error, atom(), String.t()}
  defdelegate detect_mime_type_from_bytes(content), to: Native

  @doc "Get file extensions for a given MIME type."
  @spec get_extensions_for_mime(String.t()) ::
          {:ok, [String.t()]} | {:error, atom(), String.t()}
  defdelegate get_extensions_for_mime(mime_type), to: Native

  @doc "Clear all embedding backends from the global registry."
  @spec clear_embedding_backends() :: {:ok, nil} | {:error, atom(), String.t()}
  defdelegate clear_embedding_backends(), to: Native

  @doc "List the names of all registered embedding backends."
  @spec list_embedding_backends() :: {:ok, [String.t()]} | {:error, atom(), String.t()}
  defdelegate list_embedding_backends(), to: Native

  @doc "List names of all registered document extractors."
  @spec list_document_extractors() :: {:ok, [String.t()]} | {:error, atom(), String.t()}
  defdelegate list_document_extractors(), to: Native

  @doc "Clear all document extractors from the global registry."
  @spec clear_document_extractors() :: {:ok, nil} | {:error, atom(), String.t()}
  defdelegate clear_document_extractors(), to: Native

  @doc "List all registered OCR backends."
  @spec list_ocr_backends() :: {:ok, [String.t()]} | {:error, atom(), String.t()}
  defdelegate list_ocr_backends(), to: Native

  @doc "Clear all OCR backends from the global registry."
  @spec clear_ocr_backends() :: {:ok, nil} | {:error, atom(), String.t()}
  defdelegate clear_ocr_backends(), to: Native

  @doc "List all registered post-processor names."
  @spec list_post_processors() :: {:ok, [String.t()]} | {:error, atom(), String.t()}
  defdelegate list_post_processors(), to: Native

  @doc "Remove all registered post-processors."
  @spec clear_post_processors() :: {:ok, nil} | {:error, atom(), String.t()}
  defdelegate clear_post_processors(), to: Native

  @doc "List names of all registered renderers."
  @spec list_renderers() :: {:ok, [String.t()]} | {:error, atom(), String.t()}
  defdelegate list_renderers(), to: Native

  @doc "Clear all renderers from the global registry."
  @spec clear_renderers() :: {:ok, nil} | {:error, atom(), String.t()}
  defdelegate clear_renderers(), to: Native

  @doc "List names of all registered validators."
  @spec list_validators() :: {:ok, [String.t()]} | {:error, atom(), String.t()}
  defdelegate list_validators(), to: Native

  @doc "Remove all registered validators."
  @spec clear_validators() :: {:ok, nil} | {:error, atom(), String.t()}
  defdelegate clear_validators(), to: Native

  @doc """
  Compare two extraction results and return a structured diff.

  Reads `:a`, `:b` and `:opts` from the keyword list; a missing key is passed
  on as `nil`.
  """
  @spec compare(keyword()) :: map()
  def compare(opts \\ []) do
    left = Keyword.get(opts, :a)
    right = Keyword.get(opts, :b)
    compare_opts = Keyword.get(opts, :opts)
    Native.compare(left, right, compare_opts)
  end

  @doc "Generate embeddings asynchronously for a list of text strings, passing `nil` as the config."
  @spec embed_texts_async([String.t()]) :: {:ok, [[float()]]} | {:error, atom(), String.t()}
  def embed_texts_async(texts), do: Native.embed_texts_async(texts, nil)

  @doc "Generate embeddings asynchronously for a list of text strings with an explicit config."
  @spec embed_texts_async([String.t()], String.t() | nil) ::
          {:ok, [[float()]]} | {:error, atom(), String.t()}
  defdelegate embed_texts_async(texts, config), to: Native

  @doc """
  Render a single PDF page to PNG bytes.

  Reads `:dpi` and `:password` from `opts`; a missing key is passed on as `nil`.
  """
  @spec render_pdf_page_to_png(binary(), non_neg_integer(), keyword()) ::
          {:ok, binary()} | {:error, atom(), String.t()}
  def render_pdf_page_to_png(pdf_bytes, page_index, opts \\ []) do
    dpi = Keyword.get(opts, :dpi)
    password = Keyword.get(opts, :password)
    Native.render_pdf_page_to_png(pdf_bytes, page_index, dpi, password)
  end

  @doc "Detect the MIME type of a file at the given path."
  @spec detect_mime_type(String.t(), boolean()) ::
          {:ok, String.t()} | {:error, atom(), String.t()}
  defdelegate detect_mime_type(path, check_exists), to: Native

  @doc "Embed a list of texts using the configured embedding model, passing `nil` as the config."
  @spec embed_texts([String.t()]) :: {:ok, [[float()]]} | {:error, atom(), String.t()}
  def embed_texts(texts), do: Native.embed_texts(texts, nil)

  @doc "Embed a list of texts using the configured embedding model with an explicit config."
  @spec embed_texts([String.t()], String.t() | nil) ::
          {:ok, [[float()]]} | {:error, atom(), String.t()}
  defdelegate embed_texts(texts, config), to: Native

  @doc "Get an embedding preset by name."
  @spec get_embedding_preset(String.t()) :: map() | nil
  defdelegate get_embedding_preset(name), to: Native

  @doc "List the names of all available embedding presets."
  @spec list_embedding_presets() :: [String.t()]
  defdelegate list_embedding_presets(), to: Native

  @doc "Register an OcrBackend plugin with a GenServer PID and name."
  @spec register_ocr_backend(pid(), String.t()) :: :ok | :error
  defdelegate register_ocr_backend(genserver_pid, plugin_name), to: Native

  @doc "Unregister a previously registered OcrBackend plugin by name."
  @spec unregister_ocr_backend(String.t()) :: :ok | :error
  defdelegate unregister_ocr_backend(name), to: Native

  @doc "Register a PostProcessor plugin with a GenServer PID and name."
  @spec register_post_processor(pid(), String.t()) :: :ok | :error
  defdelegate register_post_processor(genserver_pid, plugin_name), to: Native

  @doc "Unregister a previously registered PostProcessor plugin by name."
  @spec unregister_post_processor(String.t()) :: :ok | :error
  defdelegate unregister_post_processor(name), to: Native

  @doc "Register a Validator plugin with a GenServer PID and name."
  @spec register_validator(pid(), String.t()) :: :ok | :error
  defdelegate register_validator(genserver_pid, plugin_name), to: Native

  @doc "Unregister a previously registered Validator plugin by name."
  @spec unregister_validator(String.t()) :: :ok | :error
  defdelegate unregister_validator(name), to: Native

  @doc "Register an EmbeddingBackend plugin with a GenServer PID and name."
  @spec register_embedding_backend(pid(), String.t()) :: :ok | :error
  defdelegate register_embedding_backend(genserver_pid, plugin_name), to: Native

  @doc "Unregister a previously registered EmbeddingBackend plugin by name."
  @spec unregister_embedding_backend(String.t()) :: :ok | :error
  defdelegate unregister_embedding_backend(name), to: Native

  @doc "Register a DocumentExtractor plugin with a GenServer PID and name."
  @spec register_document_extractor(pid(), String.t()) :: :ok | :error
  defdelegate register_document_extractor(genserver_pid, plugin_name), to: Native

  @doc "Unregister a previously registered DocumentExtractor plugin by name."
  @spec unregister_document_extractor(String.t()) :: :ok | :error
  defdelegate unregister_document_extractor(name), to: Native

  @doc "Register a Renderer plugin with a GenServer PID and name."
  @spec register_renderer(pid(), String.t()) :: :ok | :error
  defdelegate register_renderer(genserver_pid, plugin_name), to: Native

  @doc "Unregister a previously registered Renderer plugin by name."
  @spec unregister_renderer(String.t()) :: :ok | :error
  defdelegate unregister_renderer(name), to: Native
end
|
||||
48
packages/elixir/lib/kreuzberg/acceleration_config.ex
generated
Normal file
48
packages/elixir/lib/kreuzberg/acceleration_config.ex
generated
Normal file
@@ -0,0 +1,48 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.AccelerationConfig do
  @moduledoc """
  Hardware acceleration configuration for ONNX Runtime models.

  Controls which execution provider (CPU, CoreML, CUDA, TensorRT) is used
  for inference in layout detection and embedding generation.

  ## Fields

    * `:provider` - execution provider selector; defaults to `:auto`.
    * `:device_id` - non-negative device identifier; defaults to `0`.

  ## Example

  Equivalent usage in the Rust library:

  ```rust
  use kreuzberg::AccelerationConfig;

  // Auto-select: CoreML on macOS, CUDA on Linux, CPU elsewhere
  let config = AccelerationConfig::default();

  // Force CPU only
  let config = AccelerationConfig {
      provider: kreuzberg::ExecutionProviderType::Cpu,
      ..Default::default()
  };
  ```
  """

  # The struct default for `:provider` is the atom `:auto`, so the type must
  # admit atoms as well as strings; otherwise `%__MODULE__{}` itself violates `t()`.
  @typedoc "Hardware acceleration configuration for ONNX Runtime models."
  @type t :: %__MODULE__{
          provider: atom() | String.t() | nil,
          device_id: non_neg_integer()
        }

  defstruct provider: :auto,
            device_id: 0

  defimpl Jason.Encoder do
    @doc false
    def encode(value, opts) do
      # Fields whose value is `nil` are left out of the encoded JSON object.
      value
      |> Map.from_struct()
      |> Enum.reject(fn {_k, v} -> is_nil(v) end)
      |> Map.new()
      |> Jason.Encoder.encode(opts)
    end
  end
end
|
||||
28
packages/elixir/lib/kreuzberg/annotation_kind.ex
generated
Normal file
28
packages/elixir/lib/kreuzberg/annotation_kind.ex
generated
Normal file
@@ -0,0 +1,28 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.AnnotationKind do
  @moduledoc """
  Types of inline text annotations.

  This module only declares types. Simple annotations are bare atoms, while
  annotations that carry data are maps tagged with a `:type` key.
  """

  @typedoc "Types of inline text annotations."
  @type t :: term()

  # Atom-only annotations.

  @typedoc "The `:bold` annotation."
  @type bold :: :bold

  @typedoc "The `:italic` annotation."
  @type italic :: :italic

  @typedoc "The `:underline` annotation."
  @type underline :: :underline

  @typedoc "The `:strikethrough` annotation."
  @type strikethrough :: :strikethrough

  @typedoc "The `:code` annotation."
  @type code :: :code

  @typedoc "The `:subscript` annotation."
  @type subscript :: :subscript

  @typedoc "The `:superscript` annotation."
  @type superscript :: :superscript

  @typedoc "A link annotation carrying a `:url` and a `:title`."
  @type link :: %{type: :link, url: String.t(), title: String.t()}

  @typedoc "Highlighted text (PDF highlights, HTML `<mark>`)."
  @type highlight :: :highlight

  # Map-shaped annotations that carry a value.

  @typedoc "Text color (CSS-compatible value, e.g. \"#ff0000\", \"red\")."
  @type color :: %{type: :color, value: String.t()}

  @typedoc "Font size with units (e.g. \"12pt\", \"1.2em\", \"16px\")."
  @type font_size :: %{type: :font_size, value: String.t()}

  @typedoc "Extensible annotation for format-specific styling."
  @type custom :: %{type: :custom, name: String.t(), value: String.t()}
end
|
||||
24
packages/elixir/lib/kreuzberg/archive_entry.ex
generated
Normal file
24
packages/elixir/lib/kreuzberg/archive_entry.ex
generated
Normal file
@@ -0,0 +1,24 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.ArchiveEntry do
  @moduledoc """
  A single file extracted from an archive.

  When archives (ZIP, TAR, 7Z, GZIP) are extracted with recursive extraction
  enabled, each processable file produces its own full `ExtractionResult`.

  ## Fields

    * `:path` - string path of the entry, or `nil` (the default).
    * `:mime_type` - MIME type string, or `nil` (the default).
    * `:result` - extraction result map, or `nil` (the default).
  """

  # `:result` defaults to `nil` in the struct, so the type admits `nil`;
  # a bare `map()` would make the default struct violate its own type.
  @typedoc "A single file extracted from an archive."
  @type t :: %__MODULE__{
          path: String.t() | nil,
          mime_type: String.t() | nil,
          result: map() | nil
        }

  defstruct path: nil,
            mime_type: nil,
            result: nil
end
|
||||
38
packages/elixir/lib/kreuzberg/archive_metadata.ex
generated
Normal file
38
packages/elixir/lib/kreuzberg/archive_metadata.ex
generated
Normal file
@@ -0,0 +1,38 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.ArchiveMetadata do
  @moduledoc """
  Archive (ZIP/TAR/7Z) metadata.

  Extracted from compressed archive files containing file lists and size information.
  """

  defstruct [
    format: nil,
    file_count: 0,
    file_list: [],
    total_size: 0,
    compressed_size: nil
  ]

  @typedoc "Archive (ZIP/TAR/7Z) metadata."
  @type t :: %__MODULE__{
          format: String.t() | nil,
          file_count: non_neg_integer(),
          file_list: [String.t()],
          total_size: non_neg_integer(),
          compressed_size: non_neg_integer() | nil
        }

  defimpl Jason.Encoder do
    @doc false
    def encode(metadata, opts) do
      # Keep only the fields that hold a non-nil value, then encode as a plain map.
      present =
        for {key, val} <- Map.from_struct(metadata), not is_nil(val), into: %{} do
          {key, val}
        end

      Jason.Encoder.encode(present, opts)
    end
  end
end
|
||||
21
packages/elixir/lib/kreuzberg/b_box.ex
generated
Normal file
21
packages/elixir/lib/kreuzberg/b_box.ex
generated
Normal file
@@ -0,0 +1,21 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.BBox do
  @moduledoc """
  Bounding box in original image coordinates (x1, y1) top-left, (x2, y2) bottom-right.

  All four coordinates are floats and default to `0.0`.
  """

  defstruct [x1: 0.0, y1: 0.0, x2: 0.0, y2: 0.0]

  @typedoc "Bounding box in original image coordinates (x1, y1) top-left, (x2, y2) bottom-right."
  @type t :: %__MODULE__{
          x1: float(),
          y1: float(),
          x2: float(),
          y2: float()
        }
end
|
||||
24
packages/elixir/lib/kreuzberg/batch_bytes_item.ex
generated
Normal file
24
packages/elixir/lib/kreuzberg/batch_bytes_item.ex
generated
Normal file
@@ -0,0 +1,24 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.BatchBytesItem do
  @moduledoc """
  Batch item for byte array extraction.

  Used with `batch_extract_bytes` and `batch_extract_bytes_sync`
  to represent a single item in a batch extraction job.

  `:content` defaults to an empty binary; `:mime_type` and `:config` default to `nil`.
  """

  defstruct [content: <<>>, mime_type: nil, config: nil]

  @typedoc "Batch item for byte array extraction."
  @type t :: %__MODULE__{
          content: binary(),
          mime_type: String.t() | nil,
          config: map() | nil
        }
end
|
||||
22
packages/elixir/lib/kreuzberg/batch_file_item.ex
generated
Normal file
22
packages/elixir/lib/kreuzberg/batch_file_item.ex
generated
Normal file
@@ -0,0 +1,22 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.BatchFileItem do
  @moduledoc """
  Batch item for file extraction.

  Used with `batch_extract_files` and `batch_extract_files_sync`
  to represent a single file in a batch extraction job.

  Both `:path` and `:config` default to `nil`.
  """

  defstruct [path: nil, config: nil]

  @typedoc "Batch item for file extraction."
  @type t :: %__MODULE__{
          path: String.t() | nil,
          config: map() | nil
        }
end
|
||||
34
packages/elixir/lib/kreuzberg/bibtex_metadata.ex
generated
Normal file
34
packages/elixir/lib/kreuzberg/bibtex_metadata.ex
generated
Normal file
@@ -0,0 +1,34 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.BibtexMetadata do
  @moduledoc "BibTeX bibliography metadata."

  defstruct [
    entry_count: 0,
    citation_keys: [],
    authors: [],
    year_range: nil,
    entry_types: nil
  ]

  @typedoc "BibTeX bibliography metadata."
  @type t :: %__MODULE__{
          entry_count: non_neg_integer(),
          citation_keys: [String.t()],
          authors: [String.t()],
          year_range: map() | nil,
          entry_types: map() | nil
        }

  defimpl Jason.Encoder do
    @doc false
    def encode(metadata, opts) do
      # Keep only the fields that hold a non-nil value, then encode as a plain map.
      present =
        for {key, val} <- Map.from_struct(metadata), not is_nil(val), into: %{} do
          {key, val}
        end

      Jason.Encoder.encode(present, opts)
    end
  end
end
|
||||
77
packages/elixir/lib/kreuzberg/block_type.ex
generated
Normal file
77
packages/elixir/lib/kreuzberg/block_type.ex
generated
Normal file
@@ -0,0 +1,77 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.BlockType do
  @moduledoc """
  Types of block-level elements in Djot.

  Each variant is an atom, and the module exposes one zero-arity function per
  variant that returns that atom (for example `paragraph/0` returns `:paragraph`).
  """

  @typedoc "Types of block-level elements in Djot."
  @type t ::
          :paragraph
          | :heading
          | :blockquote
          | :code_block
          | :list_item
          | :ordered_list
          | :bullet_list
          | :task_list
          | :definition_list
          | :definition_term
          | :definition_description
          | :div
          | :section
          | :thematic_break
          | :raw_block
          | :math_display

  # One accessor per variant, generated at compile time; the list mirrors `t()`.
  for variant <- [
        :paragraph,
        :heading,
        :blockquote,
        :code_block,
        :list_item,
        :ordered_list,
        :bullet_list,
        :task_list,
        :definition_list,
        :definition_term,
        :definition_description,
        :div,
        :section,
        :thematic_break,
        :raw_block,
        :math_display
      ] do
    @spec unquote(variant)() :: t()
    def unquote(variant)(), do: unquote(variant)
  end
end
|
||||
32
packages/elixir/lib/kreuzberg/bounding_box.ex
generated
Normal file
32
packages/elixir/lib/kreuzberg/bounding_box.ex
generated
Normal file
@@ -0,0 +1,32 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.BoundingBox do
  @moduledoc """
  Bounding box coordinates for element positioning.

  All four coordinates are floats and default to `0.0`.
  """

  defstruct [x0: 0.0, y0: 0.0, x1: 0.0, y1: 0.0]

  @typedoc "Bounding box coordinates for element positioning."
  @type t :: %__MODULE__{
          x0: float(),
          y0: float(),
          x1: float(),
          y1: float()
        }

  defimpl Jason.Encoder do
    @doc false
    def encode(box, opts) do
      # Keep only the fields that hold a non-nil value, then encode as a plain map.
      present =
        for {key, val} <- Map.from_struct(box), not is_nil(val), into: %{} do
          {key, val}
        end

      Jason.Encoder.encode(present, opts)
    end
  end
end
|
||||
22
packages/elixir/lib/kreuzberg/cache_stats.ex
generated
Normal file
22
packages/elixir/lib/kreuzberg/cache_stats.ex
generated
Normal file
@@ -0,0 +1,22 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.CacheStats do
  @moduledoc false

  # Struct of cache statistics: a file count plus four float measurements.
  # Every field defaults to zero (`0` for the count, `0.0` for the floats).
  defstruct [
    total_files: 0,
    total_size_mb: 0.0,
    available_space_mb: 0.0,
    oldest_file_age_days: 0.0,
    newest_file_age_days: 0.0
  ]

  @type t :: %__MODULE__{
          total_files: non_neg_integer(),
          total_size_mb: float(),
          available_space_mb: float(),
          oldest_file_age_days: float(),
          newest_file_age_days: float()
        }
end
|
||||
27
packages/elixir/lib/kreuzberg/cell_change.ex
generated
Normal file
27
packages/elixir/lib/kreuzberg/cell_change.ex
generated
Normal file
@@ -0,0 +1,27 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.CellChange do
  @moduledoc """
  A single changed cell within a table.

  Defined here (rather than only in `crate::diff`) so `RevisionDelta` can
  reference it unconditionally, without requiring the `diff` Cargo feature.
  `crate::diff` re-exports this type verbatim.

  `:row` and `:col` default to `0`; `:from` and `:to` default to `nil`.
  """

  defstruct [row: 0, col: 0, from: nil, to: nil]

  @typedoc "A single changed cell within a table."
  @type t :: %__MODULE__{
          row: non_neg_integer(),
          col: non_neg_integer(),
          from: String.t() | nil,
          to: String.t() | nil
        }
end
|
||||
27
packages/elixir/lib/kreuzberg/chunk.ex
generated
Normal file
27
packages/elixir/lib/kreuzberg/chunk.ex
generated
Normal file
@@ -0,0 +1,27 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.Chunk do
  @moduledoc """
  A text chunk with optional embedding and metadata.

  Chunks are created when chunking is enabled in `ExtractionConfig`. Each chunk
  contains the text content, optional embedding vector (if embedding generation
  is configured), and metadata about its position in the document.

  ## Fields

    * `:content` - chunk text, or `nil` (the default).
    * `:chunk_type` - chunk classification; defaults to `:unknown`.
    * `:embedding` - list of floats, or `nil` (the default).
    * `:metadata` - metadata map, or `nil` (the default).
  """

  # The struct defaults are `chunk_type: :unknown` and `metadata: nil`, so the
  # type admits an atom for `:chunk_type` and `nil` for `:metadata`; otherwise
  # the default struct would violate its own type.
  @typedoc "A text chunk with optional embedding and metadata."
  @type t :: %__MODULE__{
          content: String.t() | nil,
          chunk_type: atom() | String.t() | nil,
          embedding: [float()] | nil,
          metadata: map() | nil
        }

  defstruct content: nil,
            chunk_type: :unknown,
            embedding: nil,
            metadata: nil
end
|
||||
31
packages/elixir/lib/kreuzberg/chunk_metadata.ex
generated
Normal file
31
packages/elixir/lib/kreuzberg/chunk_metadata.ex
generated
Normal file
@@ -0,0 +1,31 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.ChunkMetadata do
  @moduledoc """
  Metadata about a chunk's position in the original document.

  Integer counters and offsets default to `0`, optional fields default to
  `nil`, and `:image_indices` defaults to an empty list.
  """

  defstruct [
    byte_start: 0,
    byte_end: 0,
    token_count: nil,
    chunk_index: 0,
    total_chunks: 0,
    first_page: nil,
    last_page: nil,
    heading_context: nil,
    image_indices: []
  ]

  @typedoc "Metadata about a chunk's position in the original document."
  @type t :: %__MODULE__{
          byte_start: non_neg_integer(),
          byte_end: non_neg_integer(),
          token_count: non_neg_integer() | nil,
          chunk_index: non_neg_integer(),
          total_chunks: non_neg_integer(),
          first_page: non_neg_integer() | nil,
          last_page: non_neg_integer() | nil,
          heading_context: map() | nil,
          image_indices: [non_neg_integer()]
        }
end
|
||||
25
packages/elixir/lib/kreuzberg/chunk_sizing.ex
generated
Normal file
25
packages/elixir/lib/kreuzberg/chunk_sizing.ex
generated
Normal file
@@ -0,0 +1,25 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.ChunkSizing do
  @moduledoc """
  How chunk size is measured.

  Defaults to `Characters` (Unicode character count). When using token-based sizing,
  chunks are sized by token count according to the specified tokenizer.

  Token-based sizing uses HuggingFace tokenizers loaded at runtime. Any tokenizer
  available on HuggingFace Hub can be used, including OpenAI-compatible tokenizers
  (e.g., `Xenova/gpt-4o`, `Xenova/cl100k_base`).

  This module only declares types: `t:characters/0` is a bare atom, and
  `t:tokenizer/0` is a map tagged with `type: :tokenizer`.
  """

  @typedoc "How chunk size is measured."
  @type t :: term()

  @typedoc "Size measured in Unicode characters (default)."
  @type characters :: :characters

  @typedoc "Size measured in tokens from a HuggingFace tokenizer."
  @type tokenizer :: %{
          type: :tokenizer,
          model: String.t(),
          cache_dir: String.t()
        }
end
|
||||
84
packages/elixir/lib/kreuzberg/chunk_type.ex
generated
Normal file
84
packages/elixir/lib/kreuzberg/chunk_type.ex
generated
Normal file
@@ -0,0 +1,84 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.ChunkType do
  @moduledoc """
  Structural category assigned to a text chunk.

  Chunks are labelled by the heuristic classifier in `chunking::classifier`.
  When none of its rules match, the label is `Unknown`.
  The set of categories is designed to grow in future versions without breaking changes.
  """

  @typedoc "Semantic structural classification of a text chunk."
  @type t ::
          :heading
          | :party_list
          | :definitions
          | :operative_clause
          | :signature_block
          | :schedule
          | :table_like
          | :formula
          | :code_block
          | :image
          | :org_chart
          | :diagram
          | :unknown

  # Each accessor below returns the atom for one variant of `t()`.

  @doc "Section heading or document title."
  @spec heading() :: t()
  def heading, do: :heading

  @doc "Party list: names, addresses, and signatories."
  @spec party_list() :: t()
  def party_list, do: :party_list

  @doc "Definition clause (\"X means…\", \"X shall mean…\")."
  @spec definitions() :: t()
  def definitions, do: :definitions

  @doc "Operative clause containing legal/contractual action verbs."
  @spec operative_clause() :: t()
  def operative_clause, do: :operative_clause

  @doc "Signature block with signatures, names, and dates."
  @spec signature_block() :: t()
  def signature_block, do: :signature_block

  @doc "Schedule, annex, appendix, or exhibit section."
  @spec schedule() :: t()
  def schedule, do: :schedule

  @doc "Table-like content with aligned columns or repeated patterns."
  @spec table_like() :: t()
  def table_like, do: :table_like

  @doc "Mathematical formula or equation."
  @spec formula() :: t()
  def formula, do: :formula

  @doc "Code block or preformatted content."
  @spec code_block() :: t()
  def code_block, do: :code_block

  @doc "Embedded or referenced image content."
  @spec image() :: t()
  def image, do: :image

  @doc "Organizational chart or hierarchy diagram."
  @spec org_chart() :: t()
  def org_chart, do: :org_chart

  @doc "Diagram, figure, or visual illustration."
  @spec diagram() :: t()
  def diagram, do: :diagram

  @doc "Unclassified or mixed content."
  @spec unknown() :: t()
  def unknown, do: :unknown
end
|
||||
40
packages/elixir/lib/kreuzberg/chunker_type.ex
generated
Normal file
40
packages/elixir/lib/kreuzberg/chunker_type.ex
generated
Normal file
@@ -0,0 +1,40 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.ChunkerType do
  @moduledoc """
  Type of text chunker to use.

  ## Variants

  * `Text` - Generic text splitter, splits on whitespace and punctuation
  * `Markdown` - Markdown-aware splitter, preserves formatting and structure
  * `Yaml` - YAML-aware splitter, creates one chunk per top-level key
  * `Semantic` - Topic-aware chunker. With an `EmbeddingConfig`, splits at
    embedding-based topic shifts tuned by `topic_threshold` (default 0.75,
    lower = more splits). Without an embedding, falls back to a
    structural-boundary heuristic (ALL-CAPS headers, numbered sections,
    blank-line paragraphs) and merges groups into chunks capped at
    `max_characters` (default 1000). `topic_threshold` has no effect in the
    fallback path. For best results, pair with an embedding model.
  """

  @typedoc "Type of text chunker to use."
  @type t :: :text | :markdown | :yaml | :semantic

  @doc "Returns the atom for the `Text` variant (generic text splitter)."
  @spec text() :: t()
  def text, do: :text

  @doc "Returns the atom for the `Markdown` variant (Markdown-aware splitter)."
  @spec markdown() :: t()
  def markdown, do: :markdown

  @doc "Returns the atom for the `Yaml` variant (YAML-aware splitter)."
  @spec yaml() :: t()
  def yaml, do: :yaml

  @doc "Returns the atom for the `Semantic` variant (topic-aware chunker)."
  @spec semantic() :: t()
  def semantic, do: :semantic
end
|
||||
55
packages/elixir/lib/kreuzberg/chunking_config.ex
generated
Normal file
55
packages/elixir/lib/kreuzberg/chunking_config.ex
generated
Normal file
@@ -0,0 +1,55 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.ChunkingConfig do
  @moduledoc """
  Chunking configuration.

  Configures text chunking for document content, including chunk size,
  overlap, trimming behavior, and optional embeddings.

  Every field has a default, so only the fields that differ need to be given
  when building a config. This also keeps call sites valid if fields are added later:

      %Kreuzberg.ChunkingConfig{max_characters: 500}
  """

  # `chunker_type` and `sizing` default to atoms (`:text`, `:characters`), so their
  # types accept the enum atoms in addition to the string form declared originally.
  @typedoc "Chunking configuration."
  @type t :: %__MODULE__{
          max_characters: non_neg_integer(),
          overlap: non_neg_integer(),
          trim: boolean(),
          chunker_type: Kreuzberg.ChunkerType.t() | String.t() | nil,
          embedding: map() | nil,
          preset: String.t() | nil,
          sizing: Kreuzberg.ChunkSizing.t() | String.t() | nil,
          prepend_heading_context: boolean(),
          topic_threshold: float() | nil
        }

  defstruct max_characters: 1_000,
            overlap: 200,
            trim: true,
            chunker_type: :text,
            embedding: nil,
            preset: nil,
            sizing: :characters,
            prepend_heading_context: false,
            topic_threshold: nil

  defimpl Jason.Encoder do
    # Encodes the struct as a JSON object, leaving out every field whose value is nil.
    @doc false
    def encode(value, opts) do
      value
      |> Map.from_struct()
      |> Enum.reject(fn {_k, v} -> v == nil end)
      |> Enum.into(%{})
      |> Jason.Encoder.encode(opts)
    end
  end
end
|
||||
36
packages/elixir/lib/kreuzberg/citation_metadata.ex
generated
Normal file
36
packages/elixir/lib/kreuzberg/citation_metadata.ex
generated
Normal file
@@ -0,0 +1,36 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.CitationMetadata do
  @moduledoc "Metadata for citation files (RIS, PubMed, EndNote)."

  @typedoc "Citation file metadata (RIS, PubMed, EndNote)."
  @type t :: %__MODULE__{
          citation_count: non_neg_integer(),
          format: String.t() | nil,
          authors: [String.t()],
          year_range: map() | nil,
          dois: [String.t()],
          keywords: [String.t()]
        }

  defstruct [
    citation_count: 0,
    format: nil,
    authors: [],
    year_range: nil,
    dois: [],
    keywords: []
  ]

  defimpl Jason.Encoder do
    # Encodes the struct as a JSON object, leaving out every field whose value is nil.
    @doc false
    def encode(value, opts) do
      present =
        for {key, field} <- Map.from_struct(value), field != nil, into: %{} do
          {key, field}
        end

      Jason.Encoder.encode(present, opts)
    end
  end
end
|
||||
30
packages/elixir/lib/kreuzberg/code_content_mode.ex
generated
Normal file
30
packages/elixir/lib/kreuzberg/code_content_mode.ex
generated
Normal file
@@ -0,0 +1,30 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.CodeContentMode do
  @moduledoc """
  Content rendering mode for code extraction.

  Selects how extracted code is represented in the `content` field of
  `ExtractionResult`.
  """

  @typedoc "Content rendering mode for code extraction."
  @type t :: :chunks | :raw | :structure

  @doc "Use TSLP semantic chunks as content (default)."
  @spec chunks() :: t()
  def chunks, do: :chunks

  @doc "Use raw source code as content."
  @spec raw() :: t()
  def raw, do: :raw

  @doc "Emit function/class headings + docstrings (no code bodies)."
  @spec structure() :: t()
  def structure, do: :structure
end
|
||||
42
packages/elixir/lib/kreuzberg/content_filter_config.ex
generated
Normal file
42
packages/elixir/lib/kreuzberg/content_filter_config.ex
generated
Normal file
@@ -0,0 +1,42 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.ContentFilterConfig do
  @moduledoc """
  Cross-extractor content filtering configuration.

  Decides whether "furniture" content (headers, footers, page numbers,
  watermarks, repeating text) is kept in or stripped from extraction results.
  It applies across all extractors (PDF, DOCX, RTF, ODT, HTML, etc.), each with
  a format-specific implementation.

  When `None` on `ExtractionConfig`, each extractor uses its current
  default behavior unchanged.
  """

  @typedoc "Cross-extractor content filtering configuration."
  @type t :: %__MODULE__{
          include_headers: boolean(),
          include_footers: boolean(),
          strip_repeating_text: boolean(),
          include_watermarks: boolean()
        }

  defstruct [
    include_headers: false,
    include_footers: false,
    strip_repeating_text: true,
    include_watermarks: false
  ]

  defimpl Jason.Encoder do
    # Encodes the struct as a JSON object, leaving out every field whose value is nil.
    @doc false
    def encode(value, opts) do
      present =
        for {key, field} <- Map.from_struct(value), field != nil, into: %{} do
          {key, field}
        end

      Jason.Encoder.encode(present, opts)
    end
  end
end
|
||||
33
packages/elixir/lib/kreuzberg/content_layer.ex
generated
Normal file
33
packages/elixir/lib/kreuzberg/content_layer.ex
generated
Normal file
@@ -0,0 +1,33 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.ContentLayer do
  @moduledoc """
  Content layer classification for document nodes.

  Gives each node its own layer, replacing separate body/furniture arrays.
  """

  @typedoc "Content layer classification for document nodes."
  @type t :: :body | :header | :footer | :footnote

  @doc "Main document body content."
  @spec body() :: t()
  def body, do: :body

  @doc "Page/section header (running header)."
  @spec header() :: t()
  def header, do: :header

  @doc "Page/section footer (running footer)."
  @spec footer() :: t()
  def footer, do: :footer

  @doc "Footnote content."
  @spec footnote() :: t()
  def footnote, do: :footnote
end
|
||||
17
packages/elixir/lib/kreuzberg/contributor_role.ex
generated
Normal file
17
packages/elixir/lib/kreuzberg/contributor_role.ex
generated
Normal file
@@ -0,0 +1,17 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.ContributorRole do
  @moduledoc "A JATS contributor together with their role."

  @typedoc "JATS contributor with role."
  @type t :: %__MODULE__{
          name: String.t() | nil,
          role: String.t() | nil
        }

  # Bare atom keys default to nil.
  defstruct [:name, :role]
end
|
||||
59
packages/elixir/lib/kreuzberg/core_properties.ex
generated
Normal file
59
packages/elixir/lib/kreuzberg/core_properties.ex
generated
Normal file
@@ -0,0 +1,59 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.CoreProperties do
  @moduledoc """
  Dublin Core metadata from docProps/core.xml

  Holds the standard metadata fields defined by the Dublin Core standard,
  plus Office-specific extensions.
  """

  @typedoc "Dublin Core metadata from docProps/core.xml"
  @type t :: %__MODULE__{
          title: String.t() | nil,
          subject: String.t() | nil,
          creator: String.t() | nil,
          keywords: String.t() | nil,
          description: String.t() | nil,
          last_modified_by: String.t() | nil,
          revision: String.t() | nil,
          created: String.t() | nil,
          modified: String.t() | nil,
          category: String.t() | nil,
          content_status: String.t() | nil,
          language: String.t() | nil,
          identifier: String.t() | nil,
          version: String.t() | nil,
          last_printed: String.t() | nil
        }

  # Bare atom keys default to nil.
  defstruct [
    :title,
    :subject,
    :creator,
    :keywords,
    :description,
    :last_modified_by,
    :revision,
    :created,
    :modified,
    :category,
    :content_status,
    :language,
    :identifier,
    :version,
    :last_printed
  ]

  defimpl Jason.Encoder do
    # Encodes the struct as a JSON object, leaving out every field whose value is nil.
    @doc false
    def encode(value, opts) do
      present =
        for {key, field} <- Map.from_struct(value), field != nil, into: %{} do
          {key, field}
        end

      Jason.Encoder.encode(present, opts)
    end
  end
end
|
||||
34
packages/elixir/lib/kreuzberg/csv_metadata.ex
generated
Normal file
34
packages/elixir/lib/kreuzberg/csv_metadata.ex
generated
Normal file
@@ -0,0 +1,34 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.CsvMetadata do
  @moduledoc "Metadata for CSV/TSV files."

  @typedoc "CSV/TSV file metadata."
  @type t :: %__MODULE__{
          row_count: non_neg_integer(),
          column_count: non_neg_integer(),
          delimiter: String.t() | nil,
          has_header: boolean(),
          column_types: [String.t()] | nil
        }

  defstruct [
    row_count: 0,
    column_count: 0,
    delimiter: nil,
    has_header: false,
    column_types: nil
  ]

  defimpl Jason.Encoder do
    # Encodes the struct as a JSON object, leaving out every field whose value is nil.
    @doc false
    def encode(value, opts) do
      present =
        for {key, field} <- Map.from_struct(value), field != nil, into: %{} do
          {key, field}
        end

      Jason.Encoder.encode(present, opts)
    end
  end
end
|
||||
17
packages/elixir/lib/kreuzberg/dbf_field_info.ex
generated
Normal file
17
packages/elixir/lib/kreuzberg/dbf_field_info.ex
generated
Normal file
@@ -0,0 +1,17 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.DbfFieldInfo do
  @moduledoc "Information about a dBASE field."

  @typedoc "dBASE field information."
  @type t :: %__MODULE__{
          name: String.t() | nil,
          field_type: String.t() | nil
        }

  # Bare atom keys default to nil.
  defstruct [:name, :field_type]
end
|
||||
30
packages/elixir/lib/kreuzberg/dbf_metadata.ex
generated
Normal file
30
packages/elixir/lib/kreuzberg/dbf_metadata.ex
generated
Normal file
@@ -0,0 +1,30 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.DbfMetadata do
  @moduledoc "Metadata for dBASE (DBF) files."

  @typedoc "dBASE (DBF) file metadata."
  @type t :: %__MODULE__{
          record_count: non_neg_integer(),
          field_count: non_neg_integer(),
          fields: [map()]
        }

  defstruct [record_count: 0, field_count: 0, fields: []]

  defimpl Jason.Encoder do
    # Encodes the struct as a JSON object, leaving out every field whose value is nil.
    @doc false
    def encode(value, opts) do
      present =
        for {key, field} <- Map.from_struct(value), field != nil, into: %{} do
          {key, field}
        end

      Jason.Encoder.encode(present, opts)
    end
  end
end
|
||||
17
packages/elixir/lib/kreuzberg/detect_response.ex
generated
Normal file
17
packages/elixir/lib/kreuzberg/detect_response.ex
generated
Normal file
@@ -0,0 +1,17 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.DetectResponse do
  @moduledoc "Response returned by MIME type detection."

  @typedoc "MIME type detection response."
  @type t :: %__MODULE__{
          mime_type: String.t() | nil,
          filename: String.t() | nil
        }

  # Bare atom keys default to nil.
  defstruct [:mime_type, :filename]
end
|
||||
19
packages/elixir/lib/kreuzberg/detection_result.ex
generated
Normal file
19
packages/elixir/lib/kreuzberg/detection_result.ex
generated
Normal file
@@ -0,0 +1,19 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.DetectionResult do
  @moduledoc "Detection result for one page: all of its detections plus page metadata."

  @typedoc "Page-level detection result containing all detections and page metadata."
  @type t :: %__MODULE__{
          page_width: non_neg_integer(),
          page_height: non_neg_integer(),
          detections: [map()]
        }

  defstruct [page_width: 0, page_height: 0, detections: []]
end
|
||||
23
packages/elixir/lib/kreuzberg/diff_hunk.ex
generated
Normal file
23
packages/elixir/lib/kreuzberg/diff_hunk.ex
generated
Normal file
@@ -0,0 +1,23 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.DiffHunk do
  @moduledoc "One contiguous hunk of a unified diff."

  @typedoc "A single contiguous hunk in a unified diff."
  @type t :: %__MODULE__{
          from_line: non_neg_integer(),
          from_count: non_neg_integer(),
          to_line: non_neg_integer(),
          to_count: non_neg_integer(),
          lines: [String.t() | nil]
        }

  defstruct [
    from_line: 0,
    from_count: 0,
    to_line: 0,
    to_count: 0,
    lines: []
  ]
end
|
||||
24
packages/elixir/lib/kreuzberg/diff_line.ex
generated
Normal file
24
packages/elixir/lib/kreuzberg/diff_line.ex
generated
Normal file
@@ -0,0 +1,24 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.DiffLine do
  @moduledoc """
  A single line in a unified-diff hunk.

  The type lives here (rather than only in `crate::diff`) so that `RevisionDelta`
  can reference it unconditionally, without requiring the `diff` Cargo feature.
  `crate::diff` re-exports this type verbatim.
  """

  @typedoc "Unchanged context line."
  @type context :: %{type: :context, value: String.t()}

  @typedoc "Line added in the \"after\" version."
  @type added :: %{type: :added, value: String.t()}

  @typedoc "Line removed from the \"before\" version."
  @type removed :: %{type: :removed, value: String.t()}

  @typedoc "A single line in a unified-diff hunk."
  @type t :: term()
end
|
||||
30
packages/elixir/lib/kreuzberg/diff_options.ex
generated
Normal file
30
packages/elixir/lib/kreuzberg/diff_options.ex
generated
Normal file
@@ -0,0 +1,30 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.DiffOptions do
  @moduledoc "Options that control how two `ExtractionResult` values are compared."

  @typedoc "Options controlling how two `ExtractionResult` values are compared."
  @type t :: %__MODULE__{
          include_metadata: boolean(),
          include_embedded: boolean(),
          max_content_chars: non_neg_integer() | nil
        }

  defstruct [include_metadata: true, include_embedded: true, max_content_chars: nil]

  defimpl Jason.Encoder do
    # Encodes the struct as a JSON object, leaving out every field whose value is nil.
    @doc false
    def encode(value, opts) do
      present =
        for {key, field} <- Map.from_struct(value), field != nil, into: %{} do
          {key, field}
        end

      Jason.Encoder.encode(present, opts)
    end
  end
end
|
||||
41
packages/elixir/lib/kreuzberg/djot_content.ex
generated
Normal file
41
packages/elixir/lib/kreuzberg/djot_content.ex
generated
Normal file
@@ -0,0 +1,41 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.DjotContent do
  @moduledoc """
  Comprehensive Djot document structure with semantic preservation.

  This type captures the full richness of Djot markup, including:
  - Block-level structures (headings, lists, blockquotes, code blocks, etc.)
  - Inline formatting (emphasis, strong, highlight, subscript, superscript, etc.)
  - Attributes (classes, IDs, key-value pairs)
  - Links, images, footnotes
  - Math expressions (inline and display)
  - Tables with full structure

  Available when the `djot` feature is enabled.
  """

  # `metadata` defaults to nil in the struct, so its type has to admit nil.
  @typedoc "Comprehensive Djot document structure with semantic preservation."
  @type t :: %__MODULE__{
          plain_text: String.t() | nil,
          blocks: [map()],
          metadata: map() | nil,
          tables: [map()],
          images: [map()],
          links: [map()],
          footnotes: [map()],
          attributes: [String.t()]
        }

  defstruct plain_text: nil,
            blocks: [],
            metadata: nil,
            tables: [],
            images: [],
            links: [],
            footnotes: [],
            attributes: []
end
|
||||
21
packages/elixir/lib/kreuzberg/djot_image.ex
generated
Normal file
21
packages/elixir/lib/kreuzberg/djot_image.ex
generated
Normal file
@@ -0,0 +1,21 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.DjotImage do
  @moduledoc "An image element in a Djot document."

  @typedoc "Image element in Djot."
  @type t :: %__MODULE__{
          src: String.t() | nil,
          alt: String.t() | nil,
          title: String.t() | nil,
          attributes: String.t() | nil
        }

  # Bare atom keys default to nil.
  defstruct [:src, :alt, :title, :attributes]
end
|
||||
21
packages/elixir/lib/kreuzberg/djot_link.ex
generated
Normal file
21
packages/elixir/lib/kreuzberg/djot_link.ex
generated
Normal file
@@ -0,0 +1,21 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.DjotLink do
  @moduledoc "A link element in a Djot document."

  @typedoc "Link element in Djot."
  @type t :: %__MODULE__{
          url: String.t() | nil,
          text: String.t() | nil,
          title: String.t() | nil,
          attributes: String.t() | nil
        }

  # Bare atom keys default to nil.
  defstruct [:url, :text, :title, :attributes]
end
|
||||
74
packages/elixir/lib/kreuzberg/document_extractor_bridge.ex
generated
Normal file
74
packages/elixir/lib/kreuzberg/document_extractor_bridge.ex
generated
Normal file
@@ -0,0 +1,74 @@
|
||||
defmodule KreuzbergDocumentExtractorBridge do
  @moduledoc """
  GenServer bridge for DocumentExtractor implementation in kreuzberg.

  Handles incoming trait method calls from Rust and dispatches them to an implementation
  module. The implementation module is the server state. The outcome of each call is
  reported through `Kreuzberg.Native.complete_trait_call/2` on success or
  `Kreuzberg.Native.fail_trait_call/2` on failure.
  """

  use GenServer

  require Logger

  @doc """
  Start a GenServer linked to the current process.

  `impl_module` should be a module that implements the DocumentExtractor trait methods.
  The server is registered under this module's name.
  """
  def start_link(impl_module) do
    GenServer.start_link(__MODULE__, impl_module, name: __MODULE__)
  end

  @impl GenServer
  def init(impl_module) do
    {:ok, impl_module}
  end

  @doc """
  Handle an incoming message.

  Trait calls have the format `{:trait_call, method_atom, args_json, reply_id}`. The JSON
  arguments are decoded, put in call order, and applied to the implementation module.
  The JSON-encoded result, or the failure message, is sent back to Rust under `reply_id`.

  Any other message is logged and ignored, so a stray message cannot crash the bridge.
  """
  @impl GenServer
  def handle_info({:trait_call, method, args_json, reply_id}, impl_module) do
    try do
      args = Jason.decode!(args_json)
      method_name = to_string(method)
      ordered_args = ordered_args(impl_module, method_name, args)

      # Dispatch to the implementation module
      result = apply(impl_module, String.to_existing_atom(method_name), ordered_args)

      # Send result back to Rust
      Kreuzberg.Native.complete_trait_call(reply_id, Jason.encode!(result))
    rescue
      e ->
        message = Exception.message(e)
        Logger.error("Error calling #{inspect(impl_module)}.#{method}: #{message}")
        Kreuzberg.Native.fail_trait_call(reply_id, message)
    catch
      # Throws and exits are not exceptions and bypass `rescue`; report them as
      # failures too, so every trait call is answered.
      kind, reason ->
        message = Exception.format_banner(kind, reason)
        Logger.error("Error calling #{inspect(impl_module)}.#{method}: #{message}")
        Kreuzberg.Native.fail_trait_call(reply_id, message)
    end

    {:noreply, impl_module}
  end

  def handle_info(message, impl_module) do
    Logger.warning("#{inspect(__MODULE__)} ignoring unexpected message: #{inspect(message)}")
    {:noreply, impl_module}
  end

  # Turns decoded JSON arguments into the positional argument list for `apply/3`.
  #
  # For a JSON object, the values are ordered by the key list returned from the
  # implementation's `__alef_arg_order__/1` when it exports one, and by sorted key
  # otherwise. A JSON array is already positional and is passed through as-is.
  defp ordered_args(impl_module, method_name, args) when is_map(args) do
    if function_exported?(impl_module, :__alef_arg_order__, 1) do
      impl_module.__alef_arg_order__(method_name)
      |> Enum.map(&Map.fetch!(args, &1))
    else
      args
      |> Map.keys()
      |> Enum.sort()
      |> Enum.map(&Map.fetch!(args, &1))
    end
  end

  defp ordered_args(_impl_module, _method_name, args) when is_list(args), do: args

  @doc """
  Register an implementation module, starting a GenServer to handle trait calls.

  The plugin name is taken from `impl_module.name()`. The started server's pid is handed
  to `Kreuzberg.Native.register_document_extractor/2`, whose result is returned.
  """
  # NOTE(review): `start_link/1` registers a fixed name, so calling this while a bridge is
  # already running fails the `{:ok, pid}` match. Confirm whether registering more than one
  # extractor is meant to be supported.
  def register(impl_module) do
    plugin_name = impl_module.name()
    {:ok, pid} = start_link(impl_module)
    Kreuzberg.Native.register_document_extractor(pid, plugin_name)
  end
end
|
||||
38
packages/elixir/lib/kreuzberg/document_node.ex
generated
Normal file
38
packages/elixir/lib/kreuzberg/document_node.ex
generated
Normal file
@@ -0,0 +1,38 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.DocumentNode do
  @moduledoc """
  A single node in the document tree.

  Each node has deterministic `id`, typed `content`, optional `parent`/`children`
  for tree structure, and metadata like page number, bounding box, and content layer.
  """

  # `content` and `content_layer` default to atoms (`:title`, `:body`), so their types
  # accept atoms in addition to the string form declared originally.
  # NOTE(review): the full shape of `content` is not visible here; widen the type further
  # if some variants carry data.
  @typedoc "A single node in the document tree."
  @type t :: %__MODULE__{
          id: String.t() | nil,
          content: atom() | String.t() | nil,
          parent: non_neg_integer() | nil,
          children: [non_neg_integer()],
          content_layer: Kreuzberg.ContentLayer.t() | String.t() | nil,
          page: non_neg_integer() | nil,
          page_end: non_neg_integer() | nil,
          bbox: map() | nil,
          annotations: [map()],
          attributes: map() | nil
        }

  defstruct id: nil,
            content: :title,
            parent: nil,
            children: [],
            content_layer: :body,
            page: nil,
            page_end: nil,
            bbox: nil,
            annotations: [],
            attributes: nil
end
|
||||
19
packages/elixir/lib/kreuzberg/document_relationship.ex
generated
Normal file
19
packages/elixir/lib/kreuzberg/document_relationship.ex
generated
Normal file
@@ -0,0 +1,19 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.DocumentRelationship do
  @moduledoc "A resolved relationship between two nodes in the document tree."

  # `kind` defaults to the atom `:footnote_reference`, so its type accepts atoms in
  # addition to the string form declared originally.
  @typedoc "A resolved relationship between two nodes in the document tree."
  @type t :: %__MODULE__{
          source: non_neg_integer(),
          target: non_neg_integer(),
          kind: atom() | String.t() | nil
        }

  defstruct source: 0,
            target: 0,
            kind: :footnote_reference
end
|
||||
32
packages/elixir/lib/kreuzberg/document_revision.ex
generated
Normal file
32
packages/elixir/lib/kreuzberg/document_revision.ex
generated
Normal file
@@ -0,0 +1,32 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.DocumentRevision do
  @moduledoc """
  A single tracked change embedded in a document.

  Populated by per-format extractors that understand change-tracking metadata
  (DOCX `w:ins`/`w:del`/`w:rPrChange`, ODT `text:change-*`, …). Every
  extractor defaults to `ExtractionResult.revisions = None` until a
  format-specific implementation is added.
  """

  # `kind` defaults to the atom `:insertion` and `delta` defaults to nil, so both types
  # are widened to admit their own defaults. The duplicated `| nil` on `anchor` is dropped.
  @typedoc "A single tracked change embedded in a document."
  @type t :: %__MODULE__{
          revision_id: String.t() | nil,
          author: String.t() | nil,
          timestamp: String.t() | nil,
          kind: atom() | String.t() | nil,
          anchor: String.t() | nil,
          delta: map() | nil
        }

  defstruct revision_id: nil,
            author: nil,
            timestamp: nil,
            kind: :insertion,
            anchor: nil,
            delta: nil
end
|
||||
43
packages/elixir/lib/kreuzberg/document_structure.ex
generated
Normal file
43
packages/elixir/lib/kreuzberg/document_structure.ex
generated
Normal file
@@ -0,0 +1,43 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.DocumentStructure do
  @moduledoc """
  Top-level structured document representation.

  A flat array of nodes with index-based parent/child references forming a tree.
  Root-level nodes have `parent: None`. Use `body_roots()` and `furniture_roots()`
  to iterate over top-level content by layer.

  ## Validation

  Call `validate()` after construction to verify all node indices are in bounds
  and parent-child relationships are bidirectionally consistent.
  """

  defstruct [nodes: [], source_format: nil, relationships: [], node_types: []]

  @typedoc "Top-level structured document representation."
  @type t :: %__MODULE__{
          nodes: [map()],
          source_format: String.t() | nil,
          relationships: [map()],
          node_types: [String.t()]
        }

  defimpl Jason.Encoder do
    # Encodes the struct as a plain map, leaving out every field whose value is nil.
    @doc false
    def encode(value, opts) do
      present =
        for {key, val} <- Map.from_struct(value), not is_nil(val), into: %{}, do: {key, val}

      Jason.Encoder.encode(present, opts)
    end
  end
end
|
||||
60
packages/elixir/lib/kreuzberg/docx_app_properties.ex
generated
Normal file
60
packages/elixir/lib/kreuzberg/docx_app_properties.ex
generated
Normal file
@@ -0,0 +1,60 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.DocxAppProperties do
  @moduledoc """
  Application properties from docProps/app.xml for DOCX

  Contains Word-specific document statistics and metadata.
  """

  # Every field defaults to nil.
  defstruct [
    :application,
    :app_version,
    :template,
    :total_time,
    :pages,
    :words,
    :characters,
    :characters_with_spaces,
    :lines,
    :paragraphs,
    :company,
    :doc_security,
    :scale_crop,
    :links_up_to_date,
    :shared_doc,
    :hyperlinks_changed
  ]

  @typedoc "Application properties from docProps/app.xml for DOCX"
  @type t :: %__MODULE__{
          application: String.t() | nil,
          app_version: String.t() | nil,
          template: String.t() | nil,
          total_time: integer() | nil,
          pages: integer() | nil,
          words: integer() | nil,
          characters: integer() | nil,
          characters_with_spaces: integer() | nil,
          lines: integer() | nil,
          paragraphs: integer() | nil,
          company: String.t() | nil,
          doc_security: integer() | nil,
          scale_crop: boolean() | nil,
          links_up_to_date: boolean() | nil,
          shared_doc: boolean() | nil,
          hyperlinks_changed: boolean() | nil
        }

  defimpl Jason.Encoder do
    # Encodes the struct as a plain map, leaving out every field whose value is nil.
    @doc false
    def encode(value, opts) do
      present =
        for {key, val} <- Map.from_struct(value), not is_nil(val), into: %{}, do: {key, val}

      Jason.Encoder.encode(present, opts)
    end
  end
end
|
||||
35
packages/elixir/lib/kreuzberg/docx_metadata.ex
generated
Normal file
35
packages/elixir/lib/kreuzberg/docx_metadata.ex
generated
Normal file
@@ -0,0 +1,35 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.DocxMetadata do
  @moduledoc """
  Word document metadata.

  Extracted from DOCX files using shared Office Open XML metadata extraction.
  Integrates with `office_metadata` module for core/app/custom properties.
  """

  # Every field defaults to nil.
  defstruct [:core_properties, :app_properties, :custom_properties]

  @typedoc "Word document metadata."
  @type t :: %__MODULE__{
          core_properties: map() | nil,
          app_properties: map() | nil,
          custom_properties: map() | nil
        }

  defimpl Jason.Encoder do
    # Encodes the struct as a plain map, leaving out every field whose value is nil.
    @doc false
    def encode(value, opts) do
      present =
        for {key, val} <- Map.from_struct(value), not is_nil(val), into: %{}, do: {key, val}

      Jason.Encoder.encode(present, opts)
    end
  end
end
|
||||
26
packages/elixir/lib/kreuzberg/element.ex
generated
Normal file
26
packages/elixir/lib/kreuzberg/element.ex
generated
Normal file
@@ -0,0 +1,26 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.Element do
  @moduledoc """
  Semantic element extracted from document.

  Represents a logical unit of content with semantic classification,
  unique identifier, and metadata for tracking origin and position.
  """

  @typedoc """
  Semantic element extracted from document.

  `element_type` defaults to `:title`, one of the `Kreuzberg.ElementType.t()`
  atoms, so the type admits those atoms alongside strings. `metadata` admits
  `nil` because that is its struct default.
  """
  @type t :: %__MODULE__{
          element_id: String.t() | nil,
          element_type: Kreuzberg.ElementType.t() | String.t() | nil,
          text: String.t() | nil,
          metadata: map() | nil
        }

  defstruct element_id: nil,
            element_type: :title,
            text: nil,
            metadata: nil
end
|
||||
23
packages/elixir/lib/kreuzberg/element_metadata.ex
generated
Normal file
23
packages/elixir/lib/kreuzberg/element_metadata.ex
generated
Normal file
@@ -0,0 +1,23 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.ElementMetadata do
  @moduledoc "Metadata for a semantic element."

  defstruct [
    page_number: nil,
    filename: nil,
    coordinates: nil,
    element_index: nil,
    additional: %{}
  ]

  @typedoc "Metadata for a semantic element."
  @type t :: %__MODULE__{
          page_number: non_neg_integer() | nil,
          filename: String.t() | nil,
          coordinates: map() | nil,
          element_index: non_neg_integer() | nil,
          additional: map()
        }
end
|
||||
73
packages/elixir/lib/kreuzberg/element_type.ex
generated
Normal file
73
packages/elixir/lib/kreuzberg/element_type.ex
generated
Normal file
@@ -0,0 +1,73 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.ElementType do
  @moduledoc """
  Semantic element type classification.

  Categorizes text content into semantic units for downstream processing.
  Supports the element types commonly found in Unstructured documents.
  """

  @typedoc "Semantic element type classification."
  @type t ::
          :title
          | :narrative_text
          | :heading
          | :list_item
          | :table
          | :image
          | :page_break
          | :code_block
          | :block_quote
          | :footer
          | :header

  # Each accessor returns the atom that represents its variant.

  @doc "Document title"
  @spec title() :: t()
  def title, do: :title

  @doc "Main narrative text body"
  @spec narrative_text() :: t()
  def narrative_text, do: :narrative_text

  @doc "Section heading"
  @spec heading() :: t()
  def heading, do: :heading

  @doc "List item (bullet, numbered, etc.)"
  @spec list_item() :: t()
  def list_item, do: :list_item

  @doc "Table element"
  @spec table() :: t()
  def table, do: :table

  @doc "Image element"
  @spec image() :: t()
  def image, do: :image

  @doc "Page break marker"
  @spec page_break() :: t()
  def page_break, do: :page_break

  @doc "Code block"
  @spec code_block() :: t()
  def code_block, do: :code_block

  @doc "Block quote"
  @spec block_quote() :: t()
  def block_quote, do: :block_quote

  @doc "Footer text"
  @spec footer() :: t()
  def footer, do: :footer

  @doc "Header text"
  @spec header() :: t()
  def header, do: :header
end
|
||||
29
packages/elixir/lib/kreuzberg/email_attachment.ex
generated
Normal file
29
packages/elixir/lib/kreuzberg/email_attachment.ex
generated
Normal file
@@ -0,0 +1,29 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.EmailAttachment do
  @moduledoc """
  Email attachment representation.

  Contains metadata and optionally the content of an email attachment.
  """

  defstruct [
    name: nil,
    filename: nil,
    mime_type: nil,
    size: nil,
    is_image: false,
    data: nil
  ]

  @typedoc "Email attachment representation."
  @type t :: %__MODULE__{
          name: String.t() | nil,
          filename: String.t() | nil,
          mime_type: String.t() | nil,
          size: non_neg_integer() | nil,
          is_image: boolean(),
          data: binary() | nil
        }
end
|
||||
26
packages/elixir/lib/kreuzberg/email_config.ex
generated
Normal file
26
packages/elixir/lib/kreuzberg/email_config.ex
generated
Normal file
@@ -0,0 +1,26 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.EmailConfig do
  @moduledoc "Configuration for email extraction."

  # The single field defaults to nil.
  defstruct [:msg_fallback_codepage]

  @typedoc "Configuration for email extraction."
  @type t :: %__MODULE__{
          msg_fallback_codepage: non_neg_integer() | nil
        }

  defimpl Jason.Encoder do
    # Encodes the struct as a plain map, leaving out every field whose value is nil.
    @doc false
    def encode(value, opts) do
      present =
        for {key, val} <- Map.from_struct(value), not is_nil(val), into: %{}, do: {key, val}

      Jason.Encoder.encode(present, opts)
    end
  end
end
|
||||
42
packages/elixir/lib/kreuzberg/email_extraction_result.ex
generated
Normal file
42
packages/elixir/lib/kreuzberg/email_extraction_result.ex
generated
Normal file
@@ -0,0 +1,42 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.EmailExtractionResult do
  @moduledoc """
  Email extraction result.

  Complete representation of an extracted email message (.eml or .msg)
  including headers, body content, and attachments.
  """

  defstruct [
    subject: nil,
    from_email: nil,
    to_emails: [],
    cc_emails: [],
    bcc_emails: [],
    date: nil,
    message_id: nil,
    plain_text: nil,
    html_content: nil,
    content: nil,
    attachments: [],
    metadata: %{}
  ]

  @typedoc "Email extraction result."
  @type t :: %__MODULE__{
          subject: String.t() | nil,
          from_email: String.t() | nil,
          to_emails: [String.t()],
          cc_emails: [String.t()],
          bcc_emails: [String.t()],
          date: String.t() | nil,
          message_id: String.t() | nil,
          plain_text: String.t() | nil,
          html_content: String.t() | nil,
          content: String.t() | nil,
          attachments: [map()],
          metadata: map()
        }
end
|
||||
42
packages/elixir/lib/kreuzberg/email_metadata.ex
generated
Normal file
42
packages/elixir/lib/kreuzberg/email_metadata.ex
generated
Normal file
@@ -0,0 +1,42 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.EmailMetadata do
  @moduledoc """
  Email metadata extracted from .eml and .msg files.

  Includes sender/recipient information, message ID, and attachment list.
  """

  defstruct [
    from_email: nil,
    from_name: nil,
    to_emails: [],
    cc_emails: [],
    bcc_emails: [],
    message_id: nil,
    attachments: []
  ]

  @typedoc "Email metadata extracted from .eml and .msg files."
  @type t :: %__MODULE__{
          from_email: String.t() | nil,
          from_name: String.t() | nil,
          to_emails: [String.t()],
          cc_emails: [String.t()],
          bcc_emails: [String.t()],
          message_id: String.t() | nil,
          attachments: [String.t()]
        }

  defimpl Jason.Encoder do
    # Encodes the struct as a plain map, leaving out every field whose value is nil.
    @doc false
    def encode(value, opts) do
      present =
        for {key, val} <- Map.from_struct(value), not is_nil(val), into: %{}, do: {key, val}

      Jason.Encoder.encode(present, opts)
    end
  end
end
|
||||
19
packages/elixir/lib/kreuzberg/embedded_changes.ex
generated
Normal file
19
packages/elixir/lib/kreuzberg/embedded_changes.ex
generated
Normal file
@@ -0,0 +1,19 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.EmbeddedChanges do
  @moduledoc "Changes to embedded archive children between two results."

  # All three fields default to an empty list.
  defstruct [added: [], removed: [], changed: []]

  @typedoc "Changes to embedded archive children between two results."
  @type t :: %__MODULE__{
          added: [map()],
          removed: [map()],
          changed: [map()]
        }
end
|
||||
17
packages/elixir/lib/kreuzberg/embedded_diff.ex
generated
Normal file
17
packages/elixir/lib/kreuzberg/embedded_diff.ex
generated
Normal file
@@ -0,0 +1,17 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.EmbeddedDiff do
  @moduledoc "Diff for a single embedded archive entry that appears in both results."

  @typedoc """
  Diff for a single embedded archive entry that appears in both results.

  `diff` admits `nil` because that is its struct default.
  """
  @type t :: %__MODULE__{
          path: String.t() | nil,
          diff: map() | nil
        }

  defstruct path: nil,
            diff: nil
end
|
||||
21
packages/elixir/lib/kreuzberg/embedded_file.ex
generated
Normal file
21
packages/elixir/lib/kreuzberg/embedded_file.ex
generated
Normal file
@@ -0,0 +1,21 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.EmbeddedFile do
  @moduledoc "Embedded file descriptor extracted from the PDF name tree."

  defstruct [name: nil, data: <<>>, compressed_size: 0, mime_type: nil]

  @typedoc "Embedded file descriptor extracted from the PDF name tree."
  @type t :: %__MODULE__{
          name: String.t() | nil,
          data: binary(),
          compressed_size: non_neg_integer(),
          mime_type: String.t() | nil
        }
end
|
||||
74
packages/elixir/lib/kreuzberg/embedding_backend_bridge.ex
generated
Normal file
74
packages/elixir/lib/kreuzberg/embedding_backend_bridge.ex
generated
Normal file
@@ -0,0 +1,74 @@
|
||||
defmodule KreuzbergEmbeddingBackendBridge do
  @moduledoc """
  GenServer bridge for EmbeddingBackend implementation in kreuzberg.

  Handles incoming trait method calls from Rust and dispatches them to an implementation module.
  """

  use GenServer

  require Logger

  @doc """
  Start a GenServer linked to the current process.

  impl_module should be a module that implements the EmbeddingBackend trait methods.
  The process is registered under this module's name and keeps `impl_module`
  as its state.
  """
  def start_link(impl_module) do
    GenServer.start_link(__MODULE__, impl_module, name: __MODULE__)
  end

  @impl GenServer
  def init(impl_module) do
    {:ok, impl_module}
  end

  @doc """
  Handle an incoming trait call message.

  Message format: {:trait_call, method_atom, args_json, reply_id}

  The JSON arguments are decoded, put in call order, and applied to the
  implementation module. The JSON-encoded result is passed to
  `Kreuzberg.Native.complete_trait_call/2`. If an exception is raised along the
  way, it is logged and its message is passed to
  `Kreuzberg.Native.fail_trait_call/2` instead.
  """
  @impl GenServer
  def handle_info({:trait_call, method, args_json, reply_id}, impl_module) do
    try do
      args = Jason.decode!(args_json)
      method_name = to_string(method)
      ordered_args = ordered_args(impl_module, method_name, args)

      # Dispatch to the implementation module
      result = apply(impl_module, String.to_existing_atom(method_name), ordered_args)

      # Send result back to Rust
      Kreuzberg.Native.complete_trait_call(reply_id, Jason.encode!(result))
    rescue
      e ->
        # Interpolate the values; the placeholders were previously logged as literal text.
        Logger.error("Error calling #{inspect(impl_module)}.#{method}: #{Exception.message(e)}")
        Kreuzberg.Native.fail_trait_call(reply_id, Exception.message(e))
    end

    {:noreply, impl_module}
  end

  # Turns decoded arguments into a positional argument list.
  #
  # For a map, the order comes from `impl_module.__alef_arg_order__/1` when the
  # module exports it; otherwise the values are taken in sorted-key order.
  # A list is already positional and is returned unchanged.
  defp ordered_args(impl_module, method_name, args) when is_map(args) do
    if function_exported?(impl_module, :__alef_arg_order__, 1) do
      impl_module.__alef_arg_order__(method_name)
      |> Enum.map(&Map.fetch!(args, &1))
    else
      args
      |> Map.keys()
      |> Enum.sort()
      |> Enum.map(&Map.fetch!(args, &1))
    end
  end

  defp ordered_args(_impl_module, _method_name, args) when is_list(args), do: args

  @doc """
  Register an implementation module, starting a GenServer to handle trait calls.

  The plugin name is read from `impl_module.name/0` and passed, together with
  the new process's pid, to `Kreuzberg.Native.register_embedding_backend/2`.
  """
  def register(impl_module) do
    plugin_name = impl_module.name()
    {:ok, pid} = start_link(impl_module)
    Kreuzberg.Native.register_embedding_backend(pid, plugin_name)
  end
end
|
||||
43
packages/elixir/lib/kreuzberg/embedding_config.ex
generated
Normal file
43
packages/elixir/lib/kreuzberg/embedding_config.ex
generated
Normal file
@@ -0,0 +1,43 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.EmbeddingConfig do
  @moduledoc """
  Embedding configuration for text chunks.

  Configures embedding generation using ONNX models via the vendored embedding engine.
  Requires the `embeddings` feature to be enabled.
  """

  @typedoc """
  Embedding configuration for text chunks.

  `model` is typed as `Kreuzberg.EmbeddingModelType.t()` rather than a string
  because the struct default is the atom `:preset`.
  """
  # NOTE(review): presumably the native layer expects one of the
  # `Kreuzberg.EmbeddingModelType` variant maps here — confirm against the NIF.
  @type t :: %__MODULE__{
          model: Kreuzberg.EmbeddingModelType.t() | nil,
          normalize: boolean(),
          batch_size: non_neg_integer(),
          show_download_progress: boolean(),
          cache_dir: String.t() | nil,
          acceleration: map() | nil,
          max_embed_duration_secs: non_neg_integer() | nil
        }

  defstruct model: :preset,
            normalize: true,
            batch_size: 32,
            show_download_progress: false,
            cache_dir: nil,
            acceleration: nil,
            max_embed_duration_secs: nil

  defimpl Jason.Encoder do
    # Encodes the struct as a plain map, leaving out every field whose value is nil.
    @doc false
    def encode(value, opts) do
      value
      |> Map.from_struct()
      |> Enum.reject(fn {_k, v} -> v == nil end)
      |> Enum.into(%{})
      |> Jason.Encoder.encode(opts)
    end
  end
end
|
||||
20
packages/elixir/lib/kreuzberg/embedding_model_type.ex
generated
Normal file
20
packages/elixir/lib/kreuzberg/embedding_model_type.ex
generated
Normal file
@@ -0,0 +1,20 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.EmbeddingModelType do
  @moduledoc "Embedding model types supported by Kreuzberg."

  @typedoc "Embedding model types supported by Kreuzberg."
  @type t :: term()

  # One map-shaped type per variant; the `:type` key identifies the variant.

  @typedoc "Use a preset model configuration (recommended)"
  @type preset :: %{
          type: :preset,
          name: String.t()
        }

  @typedoc "Use a custom ONNX model from HuggingFace"
  @type custom :: %{
          type: :custom,
          model_id: String.t(),
          dimensions: non_neg_integer()
        }

  @typedoc "Provider-hosted embedding model via liter-llm."
  @type llm :: %{
          type: :llm,
          llm: Kreuzberg.LlmConfig.t()
        }

  @typedoc "In-process embedding backend registered via the plugin system."
  @type plugin :: %{
          type: :plugin,
          name: String.t()
        }
end
|
||||
37
packages/elixir/lib/kreuzberg/embedding_preset.ex
generated
Normal file
37
packages/elixir/lib/kreuzberg/embedding_preset.ex
generated
Normal file
@@ -0,0 +1,37 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.EmbeddingPreset do
  @moduledoc """
  Preset configurations for common RAG use cases.

  Each preset combines chunk size, overlap, and embedding model
  to provide an optimized configuration for specific scenarios.

  All string fields are owned `String` for FFI compatibility — instances
  are safe to clone and pass across language boundaries.
  """

  defstruct [
    name: nil,
    chunk_size: 0,
    overlap: 0,
    model_repo: nil,
    pooling: nil,
    model_file: nil,
    dimensions: 0,
    description: nil
  ]

  @typedoc "Preset configurations for common RAG use cases."
  @type t :: %__MODULE__{
          name: String.t() | nil,
          chunk_size: non_neg_integer(),
          overlap: non_neg_integer(),
          model_repo: String.t() | nil,
          pooling: String.t() | nil,
          model_file: String.t() | nil,
          dimensions: non_neg_integer(),
          description: String.t() | nil
        }
end
|
||||
36
packages/elixir/lib/kreuzberg/epub_metadata.ex
generated
Normal file
36
packages/elixir/lib/kreuzberg/epub_metadata.ex
generated
Normal file
@@ -0,0 +1,36 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.EpubMetadata do
  @moduledoc "EPUB metadata (Dublin Core extensions)."

  # Every field defaults to nil.
  defstruct [:coverage, :dc_format, :relation, :source, :dc_type, :cover_image]

  @typedoc "EPUB metadata (Dublin Core extensions)."
  @type t :: %__MODULE__{
          coverage: String.t() | nil,
          dc_format: String.t() | nil,
          relation: String.t() | nil,
          source: String.t() | nil,
          dc_type: String.t() | nil,
          cover_image: String.t() | nil
        }

  defimpl Jason.Encoder do
    # Encodes the struct as a plain map, leaving out every field whose value is nil.
    @doc false
    def encode(value, opts) do
      present =
        for {key, val} <- Map.from_struct(value), not is_nil(val), into: %{}, do: {key, val}

      Jason.Encoder.encode(present, opts)
    end
  end
end
|
||||
17
packages/elixir/lib/kreuzberg/error_metadata.ex
generated
Normal file
17
packages/elixir/lib/kreuzberg/error_metadata.ex
generated
Normal file
@@ -0,0 +1,17 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.ErrorMetadata do
  @moduledoc "Error metadata (for batch operations)."

  # Both fields default to nil.
  defstruct [:error_type, :message]

  @typedoc "Error metadata (for batch operations)."
  @type t :: %__MODULE__{
          error_type: String.t() | nil,
          message: String.t() | nil
        }
end
|
||||
33
packages/elixir/lib/kreuzberg/excel_metadata.ex
generated
Normal file
33
packages/elixir/lib/kreuzberg/excel_metadata.ex
generated
Normal file
@@ -0,0 +1,33 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.ExcelMetadata do
  @moduledoc """
  Excel/spreadsheet format metadata.

  Identifies the document as a spreadsheet source via the `FormatMetadata::Excel`
  discriminant. Sheet count and sheet names are stored inside this struct.
  """

  # Both fields default to nil.
  defstruct [:sheet_count, :sheet_names]

  @typedoc "Excel/spreadsheet format metadata."
  @type t :: %__MODULE__{
          sheet_count: non_neg_integer() | nil,
          sheet_names: [String.t()] | nil
        }

  defimpl Jason.Encoder do
    # Encodes the struct as a plain map, leaving out every field whose value is nil.
    @doc false
    def encode(value, opts) do
      present =
        for {key, val} <- Map.from_struct(value), not is_nil(val), into: %{}, do: {key, val}

      Jason.Encoder.encode(present, opts)
    end
  end
end
|
||||
30
packages/elixir/lib/kreuzberg/excel_sheet.ex
generated
Normal file
30
packages/elixir/lib/kreuzberg/excel_sheet.ex
generated
Normal file
@@ -0,0 +1,30 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.ExcelSheet do
  @moduledoc """
  Single Excel worksheet.

  Represents one sheet from an Excel workbook with its content
  converted to Markdown format and dimensional statistics.
  """

  defstruct [
    name: nil,
    markdown: nil,
    row_count: 0,
    col_count: 0,
    cell_count: 0,
    table_cells: nil
  ]

  @typedoc "Single Excel worksheet."
  @type t :: %__MODULE__{
          name: String.t() | nil,
          markdown: String.t() | nil,
          row_count: non_neg_integer(),
          col_count: non_neg_integer(),
          cell_count: non_neg_integer(),
          table_cells: [[String.t()]] | nil
        }
end
|
||||
24
packages/elixir/lib/kreuzberg/excel_workbook.ex
generated
Normal file
24
packages/elixir/lib/kreuzberg/excel_workbook.ex
generated
Normal file
@@ -0,0 +1,24 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.ExcelWorkbook do
  @moduledoc """
  Excel workbook representation.

  Contains all sheets from an Excel file (.xlsx, .xls, etc.) with
  extracted content and metadata.
  """

  defstruct [sheets: [], metadata: %{}, revisions: nil]

  @typedoc "Excel workbook representation."
  @type t :: %__MODULE__{
          sheets: [map()],
          metadata: map(),
          revisions: [map()] | nil
        }
end
|
||||
38
packages/elixir/lib/kreuzberg/execution_provider_type.ex
generated
Normal file
38
packages/elixir/lib/kreuzberg/execution_provider_type.ex
generated
Normal file
@@ -0,0 +1,38 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.ExecutionProviderType do
  @moduledoc """
  ONNX Runtime execution provider type.

  Determines which hardware backend is used for model inference.
  `Auto` (default) selects the best available provider per platform.
  """

  @typedoc "ONNX Runtime execution provider type."
  @type t ::
          :auto
          | :cpu
          | :core_ml
          | :cuda
          | :tensor_rt

  # Each accessor returns the atom that represents its variant.

  @doc "Auto-select: CoreML on macOS, CUDA on Linux, CPU elsewhere."
  @spec auto() :: t()
  def auto, do: :auto

  @doc "CPU execution provider (always available)."
  @spec cpu() :: t()
  def cpu, do: :cpu

  @doc "Apple CoreML (macOS/iOS Neural Engine + GPU)."
  @spec core_ml() :: t()
  def core_ml, do: :core_ml

  @doc "NVIDIA CUDA GPU acceleration."
  @spec cuda() :: t()
  def cuda, do: :cuda

  @doc "NVIDIA TensorRT (optimized CUDA inference)."
  @spec tensor_rt() :: t()
  def tensor_rt, do: :tensor_rt
end
|
||||
51
packages/elixir/lib/kreuzberg/extracted_image.ex
generated
Normal file
51
packages/elixir/lib/kreuzberg/extracted_image.ex
generated
Normal file
@@ -0,0 +1,51 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.ExtractedImage do
  @moduledoc """
  Extracted image from a document.

  Contains raw image data, metadata, and optional nested OCR results.
  Raw bytes allow cross-language compatibility - users can convert to
  PIL.Image (Python), Sharp (Node.js), or other formats as needed.
  """

  @typedoc """
  Extracted image from a document.

  `:data` holds the raw image bytes (empty binary by default), `:image_index`
  defaults to `0` and `:is_mask` to `false`; every other field is optional and
  defaults to `nil`.
  """
  @type t :: %__MODULE__{
          data: binary(),
          format: String.t() | nil,
          image_index: non_neg_integer(),
          page_number: non_neg_integer() | nil,
          width: non_neg_integer() | nil,
          height: non_neg_integer() | nil,
          colorspace: String.t() | nil,
          bits_per_component: non_neg_integer() | nil,
          is_mask: boolean(),
          description: String.t() | nil,
          ocr_result: map() | nil,
          bounding_box: map() | nil,
          source_path: String.t() | nil,
          image_kind: String.t() | nil,
          kind_confidence: float() | nil,
          cluster_id: non_neg_integer() | nil
        }

  defstruct data: <<>>,
            format: nil,
            image_index: 0,
            page_number: nil,
            width: nil,
            height: nil,
            colorspace: nil,
            bits_per_component: nil,
            is_mask: false,
            description: nil,
            ocr_result: nil,
            bounding_box: nil,
            source_path: nil,
            image_kind: nil,
            kind_confidence: nil,
            cluster_id: nil
end
|
||||
27
packages/elixir/lib/kreuzberg/extracted_uri.ex
generated
Normal file
27
packages/elixir/lib/kreuzberg/extracted_uri.ex
generated
Normal file
@@ -0,0 +1,27 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.ExtractedUri do
  @moduledoc """
  A URI extracted from a document.

  Represents any link, reference, or resource pointer found during extraction.
  The `kind` field classifies the URI semantically, while `label` carries
  optional human-readable display text.
  """

  @typedoc """
  A URI extracted from a document.

  `:kind` defaults to the atom `:hyperlink`, so its type admits atoms as well
  as strings.
  """
  @type t :: %__MODULE__{
          url: String.t() | nil,
          label: String.t() | nil,
          page: non_neg_integer() | nil,
          # `nil` is itself an atom, so an unset kind is still covered.
          kind: atom() | String.t()
        }

  defstruct url: nil,
            label: nil,
            page: nil,
            kind: :hyperlink
end
|
||||
111
packages/elixir/lib/kreuzberg/extraction_config.ex
generated
Normal file
111
packages/elixir/lib/kreuzberg/extraction_config.ex
generated
Normal file
@@ -0,0 +1,111 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.ExtractionConfig do
  @moduledoc """
  Main extraction configuration.

  This struct contains all configuration options for the extraction process.
  Every field has a default, so a configuration can be created programmatically
  by overriding only the fields of interest. (The upstream library documents
  loading the configuration from TOML, YAML, or JSON files; this module itself
  only defines the struct and its JSON encoding.)

  When encoded through `Jason.Encoder`, fields whose value is `nil` are omitted
  from the resulting JSON object.

  ## Example

      # Start from the defaults and force OCR
      config = %Kreuzberg.ExtractionConfig{force_ocr: true}
  """

  @typedoc """
  Main extraction configuration.

  `:result_format` and `:output_format` default to the atoms `:unified` and
  `:plain`, so both are typed to admit atoms in addition to strings.
  """
  @type t :: %__MODULE__{
          use_cache: boolean(),
          enable_quality_processing: boolean(),
          ocr: map() | nil,
          force_ocr: boolean(),
          force_ocr_pages: [non_neg_integer()] | nil,
          disable_ocr: boolean(),
          chunking: map() | nil,
          content_filter: map() | nil,
          images: map() | nil,
          pdf_options: map() | nil,
          token_reduction: map() | nil,
          language_detection: map() | nil,
          pages: map() | nil,
          keywords: map() | nil,
          postprocessor: map() | nil,
          html_options: String.t() | nil,
          html_output: map() | nil,
          extraction_timeout_secs: non_neg_integer() | nil,
          max_concurrent_extractions: non_neg_integer() | nil,
          # `nil` is itself an atom, so an unset format is still covered.
          result_format: atom() | String.t(),
          security_limits: map() | nil,
          max_embedded_file_bytes: non_neg_integer() | nil,
          output_format: atom() | String.t(),
          layout: map() | nil,
          use_layout_for_markdown: boolean(),
          include_document_structure: boolean(),
          acceleration: map() | nil,
          cache_namespace: String.t() | nil,
          cache_ttl_secs: non_neg_integer() | nil,
          email: map() | nil,
          concurrency: String.t() | nil,
          max_archive_depth: non_neg_integer(),
          tree_sitter: map() | nil,
          structured_extraction: map() | nil,
          cancel_token: String.t() | nil
        }

  defstruct use_cache: true,
            enable_quality_processing: true,
            ocr: nil,
            force_ocr: false,
            force_ocr_pages: nil,
            disable_ocr: false,
            chunking: nil,
            content_filter: nil,
            images: nil,
            pdf_options: nil,
            token_reduction: nil,
            language_detection: nil,
            pages: nil,
            keywords: nil,
            postprocessor: nil,
            html_options: nil,
            html_output: nil,
            extraction_timeout_secs: nil,
            max_concurrent_extractions: nil,
            result_format: :unified,
            security_limits: nil,
            max_embedded_file_bytes: nil,
            output_format: :plain,
            layout: nil,
            use_layout_for_markdown: false,
            include_document_structure: false,
            acceleration: nil,
            cache_namespace: nil,
            cache_ttl_secs: nil,
            email: nil,
            concurrency: nil,
            max_archive_depth: 0,
            tree_sitter: nil,
            structured_extraction: nil,
            cancel_token: nil

  defimpl Jason.Encoder do
    @doc false
    def encode(value, opts) do
      # Drop unset (`nil`) fields so only explicitly configured options are
      # serialized, then delegate to the plain-map encoder.
      value
      |> Map.from_struct()
      |> Enum.reject(fn {_k, v} -> v == nil end)
      |> Enum.into(%{})
      |> Jason.Encoder.encode(opts)
    end
  end
end
|
||||
25
packages/elixir/lib/kreuzberg/extraction_diff.ex
generated
Normal file
25
packages/elixir/lib/kreuzberg/extraction_diff.ex
generated
Normal file
@@ -0,0 +1,25 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.ExtractionDiff do
  @moduledoc "The complete diff between two `ExtractionResult` values."

  @typedoc """
  The complete diff between two `ExtractionResult` values.

  The list fields default to `[]`; `:metadata_changed` and `:embedded_changes`
  default to `nil`, so both are typed as optional.
  """
  @type t :: %__MODULE__{
          content_diff: [map()],
          tables_added: [map()],
          tables_removed: [map()],
          tables_changed: [map()],
          metadata_changed: String.t() | nil,
          embedded_changes: map() | nil
        }

  defstruct content_diff: [],
            tables_added: [],
            tables_removed: [],
            tables_changed: [],
            metadata_changed: nil,
            embedded_changes: nil
end
|
||||
22
packages/elixir/lib/kreuzberg/extraction_method.ex
generated
Normal file
22
packages/elixir/lib/kreuzberg/extraction_method.ex
generated
Normal file
@@ -0,0 +1,22 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.ExtractionMethod do
  @moduledoc "Describes how the extracted text was produced."

  @typedoc "How the extracted text was produced."
  @type t :: :native | :ocr | :mixed

  @doc "Returns the `:native` extraction-method variant."
  @spec native() :: t()
  def native, do: :native

  @doc "Returns the `:ocr` extraction-method variant."
  @spec ocr() :: t()
  def ocr, do: :ocr

  @doc "Returns the `:mixed` extraction-method variant."
  @spec mixed() :: t()
  def mixed, do: :mixed
end
|
||||
78
packages/elixir/lib/kreuzberg/extraction_result.ex
generated
Normal file
78
packages/elixir/lib/kreuzberg/extraction_result.ex
generated
Normal file
@@ -0,0 +1,78 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.ExtractionResult do
  @moduledoc """
  General extraction result used by the core extraction API.

  This is the main result type returned by all extraction functions.

  When encoded through `Jason.Encoder`, fields whose value is `nil` are omitted
  from the resulting JSON object.
  """

  @typedoc """
  General extraction result used by the core extraction API.

  `:tables` and `:processing_warnings` default to `[]`; every other field,
  including `:metadata`, defaults to `nil`.
  """
  @type t :: %__MODULE__{
          content: String.t() | nil,
          mime_type: String.t() | nil,
          metadata: map() | nil,
          extraction_method: String.t() | nil,
          tables: [map()],
          detected_languages: [String.t()] | nil,
          chunks: [map()] | nil,
          images: [map()] | nil,
          pages: [map()] | nil,
          elements: [map()] | nil,
          djot_content: map() | nil,
          ocr_elements: [map()] | nil,
          document: map() | nil,
          extracted_keywords: [map()] | nil,
          quality_score: float() | nil,
          processing_warnings: [map()],
          annotations: [map()] | nil,
          children: [map()] | nil,
          uris: [map()] | nil,
          revisions: [map()] | nil,
          structured_output: String.t() | nil,
          code_intelligence: String.t() | nil,
          llm_usage: [map()] | nil,
          formatted_content: String.t() | nil,
          ocr_internal_document: String.t() | nil
        }

  defstruct content: nil,
            mime_type: nil,
            metadata: nil,
            extraction_method: nil,
            tables: [],
            detected_languages: nil,
            chunks: nil,
            images: nil,
            pages: nil,
            elements: nil,
            djot_content: nil,
            ocr_elements: nil,
            document: nil,
            extracted_keywords: nil,
            quality_score: nil,
            processing_warnings: [],
            annotations: nil,
            children: nil,
            uris: nil,
            revisions: nil,
            structured_output: nil,
            code_intelligence: nil,
            llm_usage: nil,
            formatted_content: nil,
            ocr_internal_document: nil

  defimpl Jason.Encoder do
    @doc false
    def encode(value, opts) do
      # Drop unset (`nil`) fields before delegating to the plain-map encoder.
      value
      |> Map.from_struct()
      |> Enum.reject(fn {_k, v} -> v == nil end)
      |> Enum.into(%{})
      |> Jason.Encoder.encode(opts)
    end
  end
end
|
||||
30
packages/elixir/lib/kreuzberg/fiction_book_metadata.ex
generated
Normal file
30
packages/elixir/lib/kreuzberg/fiction_book_metadata.ex
generated
Normal file
@@ -0,0 +1,30 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.FictionBookMetadata do
  @moduledoc """
  Metadata for FictionBook (FB2) documents.

  When encoded through `Jason.Encoder`, fields whose value is `nil` are omitted.
  """

  defstruct genres: [], sequences: [], annotation: nil

  @typedoc """
  FictionBook (FB2) metadata.

    * `:genres` - list of strings; defaults to `[]`
    * `:sequences` - list of strings; defaults to `[]`
    * `:annotation` - string, or `nil` (the default)
  """
  @type t :: %__MODULE__{
          genres: [String.t()],
          sequences: [String.t()],
          annotation: String.t() | nil
        }

  defimpl Jason.Encoder do
    @doc false
    def encode(metadata, opts) do
      # Keep only the fields that carry a value, then encode as a plain map.
      present_fields =
        for {field, field_value} <- Map.from_struct(metadata),
            not is_nil(field_value),
            into: %{} do
          {field, field_value}
        end

      Jason.Encoder.encode(present_fields, opts)
    end
  end
end
|
||||
96
packages/elixir/lib/kreuzberg/file_extraction_config.ex
generated
Normal file
96
packages/elixir/lib/kreuzberg/file_extraction_config.ex
generated
Normal file
@@ -0,0 +1,96 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.FileExtractionConfig do
  @moduledoc """
  Per-file extraction configuration overrides for batch processing.

  All fields are optional — `nil` means "use the batch-level default."
  This type is used with `batch_extract_files` and
  `batch_extract_bytes` to allow heterogeneous
  extraction settings within a single batch.

  When encoded through `Jason.Encoder`, fields whose value is `nil` are omitted
  from the resulting JSON object.

  ## Excluded Fields

  The following `ExtractionConfig` fields are batch-level only and
  cannot be overridden per file:

  - `max_concurrent_extractions` — controls batch parallelism
  - `use_cache` — global caching policy
  - `acceleration` — shared ONNX execution provider
  - `security_limits` — global archive security policy

  ## Example

      # Override just OCR forcing for a specific file
      config = %Kreuzberg.FileExtractionConfig{force_ocr: true}
  """

  @typedoc """
  Per-file extraction configuration overrides for batch processing.

  Every field defaults to `nil`.
  """
  @type t :: %__MODULE__{
          enable_quality_processing: boolean() | nil,
          ocr: map() | nil,
          force_ocr: boolean() | nil,
          force_ocr_pages: [non_neg_integer()] | nil,
          disable_ocr: boolean() | nil,
          chunking: map() | nil,
          content_filter: map() | nil,
          images: map() | nil,
          pdf_options: map() | nil,
          token_reduction: map() | nil,
          language_detection: map() | nil,
          pages: map() | nil,
          keywords: map() | nil,
          postprocessor: map() | nil,
          html_options: String.t() | nil,
          result_format: String.t() | nil,
          output_format: String.t() | nil,
          include_document_structure: boolean() | nil,
          layout: map() | nil,
          timeout_secs: non_neg_integer() | nil,
          tree_sitter: map() | nil,
          structured_extraction: map() | nil
        }

  defstruct enable_quality_processing: nil,
            ocr: nil,
            force_ocr: nil,
            force_ocr_pages: nil,
            disable_ocr: nil,
            chunking: nil,
            content_filter: nil,
            images: nil,
            pdf_options: nil,
            token_reduction: nil,
            language_detection: nil,
            pages: nil,
            keywords: nil,
            postprocessor: nil,
            html_options: nil,
            result_format: nil,
            output_format: nil,
            include_document_structure: nil,
            layout: nil,
            timeout_secs: nil,
            tree_sitter: nil,
            structured_extraction: nil

  defimpl Jason.Encoder do
    @doc false
    def encode(value, opts) do
      # Only explicitly set overrides are serialized; `nil` fields are dropped
      # before delegating to the plain-map encoder.
      value
      |> Map.from_struct()
      |> Enum.reject(fn {_k, v} -> v == nil end)
      |> Enum.into(%{})
      |> Jason.Encoder.encode(opts)
    end
  end
end
|
||||
17
packages/elixir/lib/kreuzberg/footnote.ex
generated
Normal file
17
packages/elixir/lib/kreuzberg/footnote.ex
generated
Normal file
@@ -0,0 +1,17 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.Footnote do
  @moduledoc "A footnote in a Djot document."

  defstruct label: nil, content: []

  @typedoc """
  Footnote in Djot.

    * `:label` - string, or `nil` (the default)
    * `:content` - list of maps; defaults to `[]`
  """
  @type t :: %__MODULE__{
          label: String.t() | nil,
          content: [map()]
        }
end
|
||||
37
packages/elixir/lib/kreuzberg/format_metadata.ex
generated
Normal file
37
packages/elixir/lib/kreuzberg/format_metadata.ex
generated
Normal file
@@ -0,0 +1,37 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.FormatMetadata do
  @moduledoc """
  Format-specific metadata, modelled as a discriminated union.

  Only one format type can exist per extraction result. This provides
  type-safe, clean metadata without nested optionals.

  Each variant type below is a map tagged by its `:type` atom. All variants
  carry their payload under `:metadata`, except `code`, which carries a string
  under `:value`.
  """

  @typedoc "Format-specific metadata (discriminated union)."
  @type t :: term()

  @typedoc "Variant tagged `:pdf`."
  @type pdf :: %{type: :pdf, metadata: Kreuzberg.PdfMetadata.t()}

  @typedoc "Variant tagged `:docx`."
  @type docx :: %{type: :docx, metadata: Kreuzberg.DocxMetadata.t()}

  @typedoc "Variant tagged `:excel`."
  @type excel :: %{type: :excel, metadata: Kreuzberg.ExcelMetadata.t()}

  @typedoc "Variant tagged `:email`."
  @type email :: %{type: :email, metadata: Kreuzberg.EmailMetadata.t()}

  @typedoc "Variant tagged `:pptx`."
  @type pptx :: %{type: :pptx, metadata: Kreuzberg.PptxMetadata.t()}

  @typedoc "Variant tagged `:archive`."
  @type archive :: %{type: :archive, metadata: Kreuzberg.ArchiveMetadata.t()}

  @typedoc "Variant tagged `:image`."
  @type image :: %{type: :image, metadata: Kreuzberg.ImageMetadata.t()}

  @typedoc "Variant tagged `:xml`."
  @type xml :: %{type: :xml, metadata: Kreuzberg.XmlMetadata.t()}

  @typedoc "Variant tagged `:text`."
  @type text :: %{type: :text, metadata: Kreuzberg.TextMetadata.t()}

  @typedoc "Variant tagged `:html`."
  @type html :: %{type: :html, metadata: Kreuzberg.HtmlMetadata.t()}

  @typedoc "Variant tagged `:ocr`."
  @type ocr :: %{type: :ocr, metadata: Kreuzberg.OcrMetadata.t()}

  @typedoc "Variant tagged `:csv`."
  @type csv :: %{type: :csv, metadata: Kreuzberg.CsvMetadata.t()}

  @typedoc "Variant tagged `:bibtex`."
  @type bibtex :: %{type: :bibtex, metadata: Kreuzberg.BibtexMetadata.t()}

  @typedoc "Variant tagged `:citation`."
  @type citation :: %{type: :citation, metadata: Kreuzberg.CitationMetadata.t()}

  @typedoc "Variant tagged `:fiction_book`."
  @type fiction_book :: %{type: :fiction_book, metadata: Kreuzberg.FictionBookMetadata.t()}

  @typedoc "Variant tagged `:dbf`."
  @type dbf :: %{type: :dbf, metadata: Kreuzberg.DbfMetadata.t()}

  @typedoc "Variant tagged `:jats`."
  @type jats :: %{type: :jats, metadata: Kreuzberg.JatsMetadata.t()}

  @typedoc "Variant tagged `:epub`."
  @type epub :: %{type: :epub, metadata: Kreuzberg.EpubMetadata.t()}

  @typedoc "Variant tagged `:pst`."
  @type pst :: %{type: :pst, metadata: Kreuzberg.PstMetadata.t()}

  @typedoc "Variant tagged `:code`; carries a string `:value` instead of `:metadata`."
  @type code :: %{type: :code, value: String.t()}
end
|
||||
31
packages/elixir/lib/kreuzberg/formatted_block.ex
generated
Normal file
31
packages/elixir/lib/kreuzberg/formatted_block.ex
generated
Normal file
@@ -0,0 +1,31 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.FormattedBlock do
  @moduledoc """
  Block-level element in a Djot document.

  Represents structural elements like headings, paragraphs, lists, code blocks, etc.
  """

  @typedoc """
  Block-level element in a Djot document.

  `:block_type` defaults to the atom `:paragraph`, so its type admits atoms as
  well as strings. `:inline_content` and `:children` default to `[]`; the
  remaining fields default to `nil`.
  """
  @type t :: %__MODULE__{
          # `nil` is itself an atom, so an unset block type is still covered.
          block_type: atom() | String.t(),
          level: non_neg_integer() | nil,
          inline_content: [map()],
          attributes: String.t() | nil,
          language: String.t() | nil,
          code: String.t() | nil,
          children: [map()]
        }

  defstruct block_type: :paragraph,
            level: nil,
            inline_content: [],
            attributes: nil,
            language: nil,
            code: nil,
            children: []
end
|
||||
27
packages/elixir/lib/kreuzberg/grid_cell.ex
generated
Normal file
27
packages/elixir/lib/kreuzberg/grid_cell.ex
generated
Normal file
@@ -0,0 +1,27 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.GridCell do
  @moduledoc "A single grid cell, with its position and span metadata."

  defstruct content: nil,
            row: 0,
            col: 0,
            row_span: 0,
            col_span: 0,
            is_header: false,
            bbox: nil

  @typedoc """
  Individual grid cell with position and span metadata.

  The numeric fields (`:row`, `:col`, `:row_span`, `:col_span`) default to `0`,
  `:is_header` defaults to `false`, and `:content` / `:bbox` default to `nil`.
  """
  @type t :: %__MODULE__{
          content: String.t() | nil,
          row: non_neg_integer(),
          col: non_neg_integer(),
          row_span: non_neg_integer(),
          col_span: non_neg_integer(),
          is_header: boolean(),
          bbox: map() | nil
        }
end
|
||||
29
packages/elixir/lib/kreuzberg/header_metadata.ex
generated
Normal file
29
packages/elixir/lib/kreuzberg/header_metadata.ex
generated
Normal file
@@ -0,0 +1,29 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.HeaderMetadata do
  @moduledoc "Metadata describing a header/heading element."

  defstruct level: 0, text: nil, id: nil, depth: 0, html_offset: 0

  @typedoc """
  Header/heading element metadata.

  `:level`, `:depth` and `:html_offset` default to `0`; `:text` and `:id`
  default to `nil`.
  """
  @type t :: %__MODULE__{
          level: non_neg_integer(),
          text: String.t() | nil,
          id: String.t() | nil,
          depth: non_neg_integer(),
          html_offset: non_neg_integer()
        }

  @doc """
  Reports whether the header's `level` lies within the valid range 1–6 (inclusive).

  A struct built with the default `level: 0` is therefore not valid.
  """
  @spec valid?(t()) :: boolean()
  def valid?(%__MODULE__{level: level}) when level >= 1 and level <= 6, do: true
  def valid?(%__MODULE__{}), do: false
end
|
||||
19
packages/elixir/lib/kreuzberg/heading_context.ex
generated
Normal file
19
packages/elixir/lib/kreuzberg/heading_context.ex
generated
Normal file
@@ -0,0 +1,19 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.HeadingContext do
  @moduledoc """
  Heading context of a chunk inside a Markdown document.

  Carries the heading hierarchy leading from the document root down to the
  section this chunk belongs to.
  """

  defstruct [headings: []]

  @typedoc """
  Heading context for a chunk within a Markdown document.

  `:headings` is a list of maps and defaults to `[]`.
  """
  @type t :: %__MODULE__{headings: [map()]}
end
|
||||
17
packages/elixir/lib/kreuzberg/heading_level.ex
generated
Normal file
17
packages/elixir/lib/kreuzberg/heading_level.ex
generated
Normal file
@@ -0,0 +1,17 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.HeadingLevel do
  @moduledoc "One heading within the heading hierarchy."

  defstruct level: 0, text: nil

  @typedoc """
  A single heading in the hierarchy.

    * `:level` - non-negative integer; defaults to `0`
    * `:text` - string, or `nil` (the default)
  """
  @type t :: %__MODULE__{
          level: non_neg_integer(),
          text: String.t() | nil
        }
end
|
||||
26
packages/elixir/lib/kreuzberg/hierarchical_block.ex
generated
Normal file
26
packages/elixir/lib/kreuzberg/hierarchical_block.ex
generated
Normal file
@@ -0,0 +1,26 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.HierarchicalBlock do
  @moduledoc """
  Text block that has been assigned a hierarchy level.

  Describes a block of text together with the semantic heading information
  derived from font size clustering and hierarchical analysis.
  """

  defstruct text: nil, font_size: 0.0, level: nil, bbox: nil

  @typedoc """
  A text block with hierarchy level assignment.

  `:font_size` defaults to `0.0`; `:text`, `:level` and `:bbox` default to `nil`.
  """
  @type t :: %__MODULE__{
          text: String.t() | nil,
          font_size: float(),
          level: String.t() | nil,
          bbox: [float()] | nil
        }
end
|
||||
38
packages/elixir/lib/kreuzberg/hierarchy_config.ex
generated
Normal file
38
packages/elixir/lib/kreuzberg/hierarchy_config.ex
generated
Normal file
@@ -0,0 +1,38 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.HierarchyConfig do
  @moduledoc """
  Configuration of hierarchy extraction for PDF text structure analysis.

  Turns on extraction of document hierarchy levels (H1-H6) based on font size
  clustering and semantic analysis. When enabled, hierarchical blocks are
  included in page content.

  When encoded through `Jason.Encoder`, fields whose value is `nil` are omitted.
  """

  defstruct enabled: true, k_clusters: 3, include_bbox: true, ocr_coverage_threshold: nil

  @typedoc """
  Hierarchy extraction configuration for PDF text structure analysis.

  Defaults: `enabled: true`, `k_clusters: 3`, `include_bbox: true`,
  `ocr_coverage_threshold: nil`.
  """
  @type t :: %__MODULE__{
          enabled: boolean(),
          k_clusters: non_neg_integer(),
          include_bbox: boolean(),
          ocr_coverage_threshold: float() | nil
        }

  defimpl Jason.Encoder do
    @doc false
    def encode(config, opts) do
      # Keep only the fields that carry a value, then encode as a plain map.
      present_fields =
        for {field, field_value} <- Map.from_struct(config),
            not is_nil(field_value),
            into: %{} do
          {field, field_value}
        end

      Jason.Encoder.encode(present_fields, opts)
    end
  end
end
|
||||
59
packages/elixir/lib/kreuzberg/html_metadata.ex
generated
Normal file
59
packages/elixir/lib/kreuzberg/html_metadata.ex
generated
Normal file
@@ -0,0 +1,59 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.HtmlMetadata do
  @moduledoc """
  HTML metadata extracted from HTML documents.

  Includes document-level metadata, Open Graph data, Twitter Card metadata,
  and extracted structural elements (headers, links, images, structured data).

  When encoded through `Jason.Encoder`, fields whose value is `nil` are omitted
  from the resulting JSON object.
  """

  @typedoc """
  HTML metadata extracted from HTML documents.

  List fields default to `[]`, map fields to `%{}`, and the remaining scalar
  fields to `nil`.
  """
  @type t :: %__MODULE__{
          title: String.t() | nil,
          description: String.t() | nil,
          keywords: [String.t()],
          author: String.t() | nil,
          canonical_url: String.t() | nil,
          base_href: String.t() | nil,
          language: String.t() | nil,
          text_direction: String.t() | nil,
          open_graph: map(),
          twitter_card: map(),
          meta_tags: map(),
          headers: [map()],
          links: [map()],
          images: [map()],
          structured_data: [map()]
        }

  defstruct title: nil,
            description: nil,
            keywords: [],
            author: nil,
            canonical_url: nil,
            base_href: nil,
            language: nil,
            text_direction: nil,
            open_graph: %{},
            twitter_card: %{},
            meta_tags: %{},
            headers: [],
            links: [],
            images: [],
            structured_data: []

  defimpl Jason.Encoder do
    @doc false
    def encode(value, opts) do
      # Drop unset (`nil`) fields before delegating to the plain-map encoder.
      value
      |> Map.from_struct()
      |> Enum.reject(fn {_k, v} -> v == nil end)
      |> Enum.into(%{})
      |> Jason.Encoder.encode(opts)
    end
  end
end
|
||||
53
packages/elixir/lib/kreuzberg/html_output_config.ex
generated
Normal file
53
packages/elixir/lib/kreuzberg/html_output_config.ex
generated
Normal file
@@ -0,0 +1,53 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.HtmlOutputConfig do
  @moduledoc """
  Configuration for styled HTML output.

  When set on the `html_output` field of `Kreuzberg.ExtractionConfig`
  alongside an HTML `output_format`, the pipeline builds a styled HTML
  renderer instead of the plain comrak-based renderer.

  When encoded through `Jason.Encoder`, fields whose value is `nil` are omitted
  from the resulting JSON object.

  ## Example

      config = %Kreuzberg.HtmlOutputConfig{
        theme: Kreuzberg.HtmlTheme.git_hub(),
        css: ".kb-p { font-size: 1.1rem; }"
      }
  """

  @typedoc """
  Configuration for styled HTML output.

  `:theme` defaults to the atom `:unstyled`, so its type admits the
  `Kreuzberg.HtmlTheme` atoms as well as strings. `:embed_css` defaults to
  `true`; the remaining fields default to `nil`.
  """
  @type t :: %__MODULE__{
          css: String.t() | nil,
          css_file: String.t() | nil,
          theme: Kreuzberg.HtmlTheme.t() | String.t() | nil,
          class_prefix: String.t() | nil,
          embed_css: boolean()
        }

  defstruct css: nil,
            css_file: nil,
            theme: :unstyled,
            class_prefix: nil,
            embed_css: true

  defimpl Jason.Encoder do
    @doc false
    def encode(value, opts) do
      # Drop unset (`nil`) fields before delegating to the plain-map encoder.
      value
      |> Map.from_struct()
      |> Enum.reject(fn {_k, v} -> v == nil end)
      |> Enum.into(%{})
      |> Jason.Encoder.encode(opts)
    end
  end
end
|
||||
33
packages/elixir/lib/kreuzberg/html_theme.ex
generated
Normal file
33
packages/elixir/lib/kreuzberg/html_theme.ex
generated
Normal file
@@ -0,0 +1,33 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.HtmlTheme do
  @moduledoc "Selection of the built-in HTML themes."

  @typedoc "Built-in HTML theme selection."
  @type t :: :default | :git_hub | :dark | :light | :unstyled

  @doc """
  Returns `:default`.

  Sensible defaults: system font stack, neutral colours, readable line measure.
  CSS custom properties (`--kb-*`) are all defined so user CSS can override
  individual values.
  """
  @spec default() :: t()
  def default, do: :default

  @doc "Returns `:git_hub` — a GitHub Markdown-inspired palette and spacing."
  @spec git_hub() :: t()
  def git_hub, do: :git_hub

  @doc "Returns `:dark` — dark background, light text."
  @spec dark() :: t()
  def dark, do: :dark

  @doc "Returns `:light` — a minimal light theme with generous whitespace."
  @spec light() :: t()
  def light, do: :light

  @doc """
  Returns `:unstyled`.

  No built-in stylesheet is emitted. CSS custom properties are still defined on
  `:root` so user stylesheets can reference `var(--kb-*)` tokens.
  """
  @spec unstyled() :: t()
  def unstyled, do: :unstyled
end
|
||||
50
packages/elixir/lib/kreuzberg/image_extraction_config.ex
generated
Normal file
50
packages/elixir/lib/kreuzberg/image_extraction_config.ex
generated
Normal file
@@ -0,0 +1,50 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.ImageExtractionConfig do
  @moduledoc """
  Image extraction configuration.

  A bare `%Kreuzberg.ImageExtractionConfig{}` carries the defaults declared in
  `defstruct`. When the struct is encoded with Jason, fields whose value is
  `nil` are omitted from the output.
  """

  @typedoc "Image extraction configuration."
  @type t :: %__MODULE__{
          extract_images: boolean(),
          target_dpi: integer(),
          max_image_dimension: integer(),
          inject_placeholders: boolean(),
          auto_adjust_dpi: boolean(),
          min_dpi: integer(),
          max_dpi: integer(),
          max_images_per_page: non_neg_integer() | nil,
          classify: boolean(),
          include_page_rasters: boolean(),
          run_ocr_on_images: boolean(),
          ocr_text_only: boolean(),
          append_ocr_text: boolean()
        }

  defstruct [
    extract_images: true,
    target_dpi: 300,
    max_image_dimension: 4096,
    inject_placeholders: true,
    auto_adjust_dpi: true,
    min_dpi: 72,
    max_dpi: 600,
    max_images_per_page: nil,
    classify: true,
    include_page_rasters: false,
    run_ocr_on_images: true,
    ocr_text_only: false,
    append_ocr_text: false
  ]

  defimpl Jason.Encoder do
    @doc false
    def encode(value, opts) do
      # Only fields that hold a value are encoded; `nil` fields are dropped.
      present =
        for {field, field_value} <- Map.from_struct(value), field_value != nil, into: %{} do
          {field, field_value}
        end

      Jason.Encoder.encode(present, opts)
    end
  end
end
|
||||
73
packages/elixir/lib/kreuzberg/image_kind.ex
generated
Normal file
73
packages/elixir/lib/kreuzberg/image_kind.ex
generated
Normal file
@@ -0,0 +1,73 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.ImageKind do
  @moduledoc """
  Heuristic classification of what an image likely depicts.

  Each kind is a plain atom, and one zero-arity function per kind returns it
  (for example `photograph/0` returns `:photograph`).
  """

  @typedoc "Heuristic classification of what an image likely depicts."
  @type t ::
          :photograph
          | :diagram
          | :chart
          | :drawing
          | :text_block
          | :decoration
          | :logo
          | :icon
          | :tile_fragment
          | :mask
          | :page_raster
          | :unknown

  # One `{kind, doc}` pair per variant; the comprehension below turns each pair
  # into a documented zero-arity function that returns the atom itself.
  kinds = [
    photograph: "Photographic image (natural scene, photograph)",
    diagram: "Technical or schematic diagram",
    chart: "Chart, graph, or plot",
    drawing: "Freehand or technical drawing",
    text_block: "Text-heavy image (scanned text, document)",
    decoration: "Decorative element or border",
    logo: "Logo or brand mark",
    icon: "Small icon",
    tile_fragment: "Fragment of a larger tiled image (tile of a technical drawing)",
    mask: "Mask or transparency map",
    page_raster:
      "Full-page render produced during OCR preprocessing; used as a citation thumbnail.",
    unknown: "Could not classify with reasonable confidence"
  ]

  for {kind, summary} <- kinds do
    @doc summary
    @spec unquote(kind)() :: t()
    def unquote(kind)(), do: unquote(kind)
  end
end
|
||||
36
packages/elixir/lib/kreuzberg/image_metadata.ex
generated
Normal file
36
packages/elixir/lib/kreuzberg/image_metadata.ex
generated
Normal file
@@ -0,0 +1,36 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.ImageMetadata do
  @moduledoc """
  Image metadata extracted from image files.

  Includes dimensions, format, and EXIF data. When the struct is encoded with
  Jason, fields whose value is `nil` are omitted from the output.
  """

  @typedoc "Image metadata extracted from image files."
  @type t :: %__MODULE__{
          width: non_neg_integer(),
          height: non_neg_integer(),
          format: String.t() | nil,
          exif: map()
        }

  defstruct [width: 0, height: 0, format: nil, exif: %{}]

  defimpl Jason.Encoder do
    @doc false
    def encode(value, opts) do
      # Only fields that hold a value are encoded; `nil` fields are dropped.
      present =
        for {field, field_value} <- Map.from_struct(value), field_value != nil, into: %{} do
          {field, field_value}
        end

      Jason.Encoder.encode(present, opts)
    end
  end
end
|
||||
25
packages/elixir/lib/kreuzberg/image_metadata_type.ex
generated
Normal file
25
packages/elixir/lib/kreuzberg/image_metadata_type.ex
generated
Normal file
@@ -0,0 +1,25 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.ImageMetadataType do
  @moduledoc """
  Image element metadata.

  A bare struct has `nil` for `src`, `alt`, `title` and `dimensions`, the atom
  `:data_uri` for `image_type`, and an empty `attributes` list.
  """

  @typedoc """
  Image element metadata.

  `image_type` admits an atom as well as a string, because the struct default
  is the atom `:data_uri`.
  """
  @type t :: %__MODULE__{
          src: String.t() | nil,
          alt: String.t() | nil,
          title: String.t() | nil,
          dimensions: [non_neg_integer()] | nil,
          image_type: atom() | String.t() | nil,
          attributes: [[String.t()]]
        }

  # NOTE(review): the default `:data_uri` is spelled differently from the atom
  # returned by `Kreuzberg.ImageType.data_uri/0` (`:"data-uri"`). Confirm which
  # spelling the native layer produces before relying on equality checks.
  defstruct src: nil,
            alt: nil,
            title: nil,
            dimensions: nil,
            image_type: :data_uri,
            attributes: []
end
|
||||
44
packages/elixir/lib/kreuzberg/image_preprocessing_config.ex
generated
Normal file
44
packages/elixir/lib/kreuzberg/image_preprocessing_config.ex
generated
Normal file
@@ -0,0 +1,44 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.ImagePreprocessingConfig do
  @moduledoc """
  Image preprocessing configuration for OCR.

  These settings control how images are preprocessed before OCR to improve
  text recognition quality. Different preprocessing strategies work better
  for different document types.

  When the struct is encoded with Jason, fields whose value is `nil` are
  omitted from the output.
  """

  @typedoc "Image preprocessing configuration for OCR."
  @type t :: %__MODULE__{
          target_dpi: integer(),
          auto_rotate: boolean(),
          deskew: boolean(),
          denoise: boolean(),
          contrast_enhance: boolean(),
          binarization_method: String.t() | nil,
          invert_colors: boolean()
        }

  defstruct [
    target_dpi: 300,
    auto_rotate: true,
    deskew: true,
    denoise: false,
    contrast_enhance: false,
    binarization_method: "otsu",
    invert_colors: false
  ]

  defimpl Jason.Encoder do
    @doc false
    def encode(value, opts) do
      # Only fields that hold a value are encoded; `nil` fields are dropped.
      present =
        for {field, field_value} <- Map.from_struct(value), field_value != nil, into: %{} do
          {field, field_value}
        end

      Jason.Encoder.encode(present, opts)
    end
  end
end
|
||||
42
packages/elixir/lib/kreuzberg/image_preprocessing_metadata.ex
generated
Normal file
42
packages/elixir/lib/kreuzberg/image_preprocessing_metadata.ex
generated
Normal file
@@ -0,0 +1,42 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.ImagePreprocessingMetadata do
  @moduledoc """
  Image preprocessing metadata.

  Tracks the transformations applied to an image during OCR preprocessing,
  including DPI normalization, resizing, and resampling.

  A bare struct holds empty lists, zeros, `false` and `nil` as placeholders;
  see `defstruct` for the exact default of each field.
  """

  @typedoc "Image preprocessing metadata."
  @type t :: %__MODULE__{
          original_dimensions: [non_neg_integer()],
          original_dpi: [float()],
          target_dpi: integer(),
          scale_factor: float(),
          auto_adjusted: boolean(),
          final_dpi: integer(),
          new_dimensions: [non_neg_integer()] | nil,
          resample_method: String.t() | nil,
          dimension_clamped: boolean(),
          calculated_dpi: integer() | nil,
          skipped_resize: boolean(),
          resize_error: String.t() | nil
        }

  defstruct [
    original_dimensions: [],
    original_dpi: [],
    target_dpi: 0,
    scale_factor: 0.0,
    auto_adjusted: false,
    final_dpi: 0,
    new_dimensions: nil,
    resample_method: nil,
    dimension_clamped: false,
    calculated_dpi: nil,
    skipped_resize: false,
    resize_error: nil
  ]
end
|
||||
29
packages/elixir/lib/kreuzberg/image_type.ex
generated
Normal file
29
packages/elixir/lib/kreuzberg/image_type.ex
generated
Normal file
@@ -0,0 +1,29 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.ImageType do
  @moduledoc """
  Image type classification.

  Each type is a plain atom. Two of them contain a hyphen and therefore use the
  quoted atom syntax (`:"data-uri"`, `:"inline-svg"`).
  """

  @typedoc "Image type classification."
  @type t :: :"data-uri" | :"inline-svg" | :external | :relative

  @doc "Data URI image"
  @spec data_uri() :: t()
  def data_uri do
    :"data-uri"
  end

  @doc "Inline SVG"
  @spec inline_svg() :: t()
  def inline_svg do
    :"inline-svg"
  end

  @doc "External image URL"
  @spec external() :: t()
  def external do
    :external
  end

  @doc "Relative path image"
  @spec relative() :: t()
  def relative do
    :relative
  end
end
|
||||
25
packages/elixir/lib/kreuzberg/inline_element.ex
generated
Normal file
25
packages/elixir/lib/kreuzberg/inline_element.ex
generated
Normal file
@@ -0,0 +1,25 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.InlineElement do
  @moduledoc """
  Inline element within a block.

  Represents text with formatting, links, images, etc.
  """

  @typedoc """
  Inline element within a block.

  `element_type` admits an inline-type atom (see `Kreuzberg.InlineType`) as
  well as a string, because the struct default is the atom `:text`.
  """
  @type t :: %__MODULE__{
          element_type: Kreuzberg.InlineType.t() | String.t() | nil,
          content: String.t() | nil,
          attributes: String.t() | nil,
          metadata: map() | nil
        }

  # A bare struct is a `:text` element with no content, attributes or metadata.
  defstruct element_type: :text,
            content: nil,
            attributes: nil,
            metadata: nil
end
|
||||
77
packages/elixir/lib/kreuzberg/inline_type.ex
generated
Normal file
77
packages/elixir/lib/kreuzberg/inline_type.ex
generated
Normal file
@@ -0,0 +1,77 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.InlineType do
  @moduledoc """
  Types of inline elements in Djot.

  Each type is a plain atom, and one zero-arity function per type returns it
  (for example `strong/0` returns `:strong`).
  """

  @typedoc "Types of inline elements in Djot."
  @type t ::
          :text
          | :strong
          | :emphasis
          | :highlight
          | :subscript
          | :superscript
          | :insert
          | :delete
          | :code
          | :link
          | :image
          | :span
          | :math
          | :raw_inline
          | :footnote_ref
          | :symbol

  # The accessor functions are generated from this list, one per atom.
  @inline_types [
    :text,
    :strong,
    :emphasis,
    :highlight,
    :subscript,
    :superscript,
    :insert,
    :delete,
    :code,
    :link,
    :image,
    :span,
    :math,
    :raw_inline,
    :footnote_ref,
    :symbol
  ]

  for inline_type <- @inline_types do
    @doc "Returns the `#{inspect(inline_type)}` inline type atom."
    @spec unquote(inline_type)() :: t()
    def unquote(inline_type)(), do: unquote(inline_type)
  end
end
|
||||
32
packages/elixir/lib/kreuzberg/jats_metadata.ex
generated
Normal file
32
packages/elixir/lib/kreuzberg/jats_metadata.ex
generated
Normal file
@@ -0,0 +1,32 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.JatsMetadata do
  @moduledoc """
  JATS (Journal Article Tag Suite) metadata.

  When the struct is encoded with Jason, fields whose value is `nil` are
  omitted from the output.
  """

  @typedoc "JATS (Journal Article Tag Suite) metadata."
  @type t :: %__MODULE__{
          copyright: String.t() | nil,
          license: String.t() | nil,
          history_dates: map(),
          contributor_roles: [map()]
        }

  defstruct [copyright: nil, license: nil, history_dates: %{}, contributor_roles: []]

  defimpl Jason.Encoder do
    @doc false
    def encode(value, opts) do
      # Only fields that hold a value are encoded; `nil` fields are dropped.
      present =
        for {field, field_value} <- Map.from_struct(value), field_value != nil, into: %{} do
          {field, field_value}
        end

      Jason.Encoder.encode(present, opts)
    end
  end
end
|
||||
21
packages/elixir/lib/kreuzberg/keyword.ex
generated
Normal file
21
packages/elixir/lib/kreuzberg/keyword.ex
generated
Normal file
@@ -0,0 +1,21 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.Keyword do
  @moduledoc """
  Extracted keyword with metadata.

  Note that this module is `Kreuzberg.Keyword`, not Elixir's built-in
  `Keyword` module.
  """

  @typedoc """
  Extracted keyword with metadata.

  `algorithm` admits an algorithm atom (see `Kreuzberg.KeywordAlgorithm`) as
  well as a string, because the struct default is the atom `:yake`.
  """
  @type t :: %__MODULE__{
          text: String.t() | nil,
          score: float(),
          algorithm: Kreuzberg.KeywordAlgorithm.t() | String.t() | nil,
          positions: [non_neg_integer()] | nil
        }

  # A bare struct has no text or positions, a score of 0.0 and the `:yake`
  # algorithm.
  defstruct text: nil,
            score: 0.0,
            algorithm: :yake,
            positions: nil
end
|
||||
21
packages/elixir/lib/kreuzberg/keyword_algorithm.ex
generated
Normal file
21
packages/elixir/lib/kreuzberg/keyword_algorithm.ex
generated
Normal file
@@ -0,0 +1,21 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.KeywordAlgorithm do
  @moduledoc """
  Keyword algorithm selection.

  Each algorithm is a plain atom: `:yake` or `:rake`.
  """

  @typedoc "Keyword algorithm selection."
  @type t :: :yake | :rake

  @doc "YAKE (Yet Another Keyword Extractor) - statistical approach"
  @spec yake() :: t()
  def yake do
    :yake
  end

  @doc "RAKE (Rapid Automatic Keyword Extraction) - co-occurrence based"
  @spec rake() :: t()
  def rake do
    :rake
  end
end
|
||||
38
packages/elixir/lib/kreuzberg/keyword_config.ex
generated
Normal file
38
packages/elixir/lib/kreuzberg/keyword_config.ex
generated
Normal file
@@ -0,0 +1,38 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.KeywordConfig do
  @moduledoc """
  Keyword extraction configuration.

  A bare struct uses `algorithm: :yake`, `max_keywords: 10`, `min_score: 0.0`
  and an empty `ngram_range`; the remaining fields are `nil`. When the struct
  is encoded with Jason, fields whose value is `nil` are omitted from the
  output.
  """

  @typedoc """
  Keyword extraction configuration.

  `algorithm` admits an algorithm atom (see `Kreuzberg.KeywordAlgorithm`) as
  well as a string, because the struct default is the atom `:yake`.
  """
  @type t :: %__MODULE__{
          algorithm: Kreuzberg.KeywordAlgorithm.t() | String.t() | nil,
          max_keywords: non_neg_integer(),
          min_score: float(),
          ngram_range: [non_neg_integer()],
          language: String.t() | nil,
          yake_params: map() | nil,
          rake_params: map() | nil
        }

  # `min_score` defaults to the float 0.0 (not the integer 0) so that the
  # default value satisfies the `float()` type declared above.
  defstruct algorithm: :yake,
            max_keywords: 10,
            min_score: 0.0,
            ngram_range: [],
            language: nil,
            yake_params: nil,
            rake_params: nil

  defimpl Jason.Encoder do
    @doc false
    def encode(value, opts) do
      # Only fields that hold a value are encoded; `nil` fields are dropped.
      value
      |> Map.from_struct()
      |> Enum.reject(fn {_k, v} -> v == nil end)
      |> Enum.into(%{})
      |> Jason.Encoder.encode(opts)
    end
  end
end
|
||||
30
packages/elixir/lib/kreuzberg/language_detection_config.ex
generated
Normal file
30
packages/elixir/lib/kreuzberg/language_detection_config.ex
generated
Normal file
@@ -0,0 +1,30 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.LanguageDetectionConfig do
  @moduledoc """
  Language detection configuration.

  A bare struct has `enabled: true`, `min_confidence: 0.8` and
  `detect_multiple: false`. When the struct is encoded with Jason, fields whose
  value is `nil` are omitted from the output.
  """

  @typedoc "Language detection configuration."
  @type t :: %__MODULE__{
          enabled: boolean(),
          min_confidence: float(),
          detect_multiple: boolean()
        }

  defstruct [enabled: true, min_confidence: 0.8, detect_multiple: false]

  defimpl Jason.Encoder do
    @doc false
    def encode(value, opts) do
      # Only fields that hold a value are encoded; `nil` fields are dropped.
      present =
        for {field, field_value} <- Map.from_struct(value), field_value != nil, into: %{} do
          {field, field_value}
        end

      Jason.Encoder.encode(present, opts)
    end
  end
end
|
||||
89
packages/elixir/lib/kreuzberg/layout_class.ex
generated
Normal file
89
packages/elixir/lib/kreuzberg/layout_class.ex
generated
Normal file
@@ -0,0 +1,89 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.LayoutClass do
  @moduledoc """
  The 17 canonical document layout classes.

  All model backends (RT-DETR, YOLO, etc.) map their native class IDs
  to this shared set. Models with fewer classes (DocLayNet: 11, PubLayNet: 5)
  map to the closest equivalent.

  Wire format is snake_case in all serializers (JSON, TOML, YAML).

  Each class is a plain atom, and one zero-arity function per class returns it
  (for example `caption/0` returns `:caption`).
  """

  @typedoc "The 17 canonical document layout classes."
  @type t ::
          :caption
          | :footnote
          | :formula
          | :list_item
          | :page_footer
          | :page_header
          | :picture
          | :section_header
          | :table
          | :text
          | :title
          | :document_index
          | :code
          | :checkbox_selected
          | :checkbox_unselected
          | :form
          | :key_value_region

  # The accessor functions are generated from this list, one per atom.
  @layout_classes [
    :caption,
    :footnote,
    :formula,
    :list_item,
    :page_footer,
    :page_header,
    :picture,
    :section_header,
    :table,
    :text,
    :title,
    :document_index,
    :code,
    :checkbox_selected,
    :checkbox_unselected,
    :form,
    :key_value_region
  ]

  for layout_class <- @layout_classes do
    @doc "Returns the `#{inspect(layout_class)}` layout class atom."
    @spec unquote(layout_class)() :: t()
    def unquote(layout_class)(), do: unquote(layout_class)
  end
end
|
||||
19
packages/elixir/lib/kreuzberg/layout_detection.ex
generated
Normal file
19
packages/elixir/lib/kreuzberg/layout_detection.ex
generated
Normal file
@@ -0,0 +1,19 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.LayoutDetection do
  @moduledoc "A single layout detection result."

  @typedoc """
  A single layout detection result.

  `class_name` admits a layout-class atom (see `Kreuzberg.LayoutClass`) as well
  as a string, because the struct default is the atom `:caption`. `bbox` may be
  `nil`, which is its value in a bare struct.
  """
  @type t :: %__MODULE__{
          class_name: Kreuzberg.LayoutClass.t() | String.t() | nil,
          confidence: float(),
          bbox: map() | nil
        }

  # A bare struct is a `:caption` detection with confidence 0.0 and no bbox.
  defstruct class_name: :caption,
            confidence: 0.0,
            bbox: nil
end
|
||||
38
packages/elixir/lib/kreuzberg/layout_detection_config.ex
generated
Normal file
38
packages/elixir/lib/kreuzberg/layout_detection_config.ex
generated
Normal file
@@ -0,0 +1,38 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.LayoutDetectionConfig do
  @moduledoc """
  Layout detection configuration.

  Controls layout detection behavior in the extraction pipeline.
  When set on `ExtractionConfig`, layout detection is enabled for PDF
  extraction.

  When the struct is encoded with Jason, fields whose value is `nil` are
  omitted from the output.
  """

  @typedoc """
  Layout detection configuration.

  `table_model` admits an atom as well as a string, because the struct default
  is the atom `:tatr`.
  """
  @type t :: %__MODULE__{
          confidence_threshold: float() | nil,
          apply_heuristics: boolean(),
          table_model: atom() | String.t() | nil,
          acceleration: map() | nil
        }

  defstruct confidence_threshold: nil,
            apply_heuristics: true,
            table_model: :tatr,
            acceleration: nil

  defimpl Jason.Encoder do
    @doc false
    def encode(value, opts) do
      # Only fields that hold a value are encoded; `nil` fields are dropped.
      value
      |> Map.from_struct()
      |> Enum.reject(fn {_k, v} -> v == nil end)
      |> Enum.into(%{})
      |> Jason.Encoder.encode(opts)
    end
  end
end
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user