This commit is contained in:
31
packages/elixir/.credo.exs
generated
Normal file
31
packages/elixir/.credo.exs
generated
Normal file
@@ -0,0 +1,31 @@
|
||||
# Credo configuration for the Elixir package.
%{
  configs: [
    %{
      name: "default",
      # Strict mode is switched on for this config.
      strict: true,
      # NOTE(review): presumably milliseconds — confirm against the Credo docs.
      parse_timeout: 5000,
      files: %{
        # Source roots for both a single-app layout and an umbrella (`apps/*`) layout.
        included: ~w(lib/ src/ test/ web/ apps/*/lib/ apps/*/src/ apps/*/test/ apps/*/web/),
        # Build output and fetched dependencies are left out of analysis.
        excluded: [
          ~r{/_build/},
          ~r{/deps/},
          ~r{/node_modules/}
        ]
      },
      checks: %{
        enabled: [
          # Cyclomatic-complexity check configured with a maximum of 16.
          {Credo.Check.Refactor.CyclomaticComplexity, max_complexity: 16}
        ]
      }
    }
  ]
}
|
||||
5
packages/elixir/.formatter.exs
generated
Normal file
5
packages/elixir/.formatter.exs
generated
Normal file
@@ -0,0 +1,5 @@
|
||||
# `mix format` settings for the Elixir package.
[
  # Reuse the formatter configuration exported by the :rustler dependency.
  import_deps: [:rustler],
  # Globs selecting the files the formatter processes.
  inputs: [
    "{mix,.formatter}.exs",
    "{config,lib,test}/**/*.{ex,exs}"
  ],
  # Lines may be up to 120 columns wide.
  line_length: 120
]
|
||||
93
packages/elixir/LICENSE
generated
Normal file
93
packages/elixir/LICENSE
generated
Normal file
@@ -0,0 +1,93 @@
|
||||
Elastic License 2.0 (ELv2)
|
||||
|
||||
Copyright 2025-2026 Kreuzberg, Inc.
|
||||
|
||||
Acceptance
|
||||
|
||||
By using the software, you agree to all of the terms and conditions below.
|
||||
|
||||
Copyright License
|
||||
|
||||
The licensor grants you a non-exclusive, royalty-free, worldwide,
|
||||
non-sublicensable, non-transferable license to use, copy, distribute, make
|
||||
available, and prepare derivative works of the software, in each case subject to
|
||||
the limitations and conditions below.
|
||||
|
||||
Limitations
|
||||
|
||||
You may not provide the software to third parties as a hosted or managed
|
||||
service, where the service provides users with access to any substantial set of
|
||||
the features or functionality of the software.
|
||||
|
||||
You may not move, change, disable, or circumvent the license key functionality
|
||||
in the software, and you may not remove or obscure any functionality in the
|
||||
software that is protected by the license key.
|
||||
|
||||
You may not alter, remove, or obscure any licensing, copyright, or other notices
|
||||
of the licensor in the software. Any use of the licensor's trademarks is subject
|
||||
to applicable law.
|
||||
|
||||
Patents
|
||||
|
||||
The licensor grants you a license, under any patent claims the licensor can
|
||||
license, or becomes able to license, to make, have made, use, sell, offer for
|
||||
sale, import and have imported the software, in each case subject to the
|
||||
limitations and conditions in this license. This license does not cover any
|
||||
patent claims that you cause to be infringed by modifications or additions to the
|
||||
software. If you or your company make any written claim that the software
|
||||
infringes or contributes to infringement of any patent, your patent license for
|
||||
the software granted under these terms ends immediately. If your company makes
|
||||
such a claim, your patent license ends immediately for work on behalf of your
|
||||
company.
|
||||
|
||||
Notices
|
||||
|
||||
You must ensure that anyone who gets a copy of any part of the software from you
|
||||
also gets a copy of these terms.
|
||||
|
||||
If you modify the software, you must include in any modified copies of the
|
||||
software prominent notices stating that you have modified the software.
|
||||
|
||||
No Other Rights
|
||||
|
||||
These terms do not imply any licenses other than those expressly granted in
|
||||
these terms.
|
||||
|
||||
Termination
|
||||
|
||||
If you use the software in violation of these terms, such use is not licensed,
|
||||
and your licenses will automatically terminate. If the licensor provides you with
|
||||
a notice of your violation, and you cease all violation of this license no later
|
||||
than 30 days after you receive that notice, your licenses will be reinstated
|
||||
retroactively. However, if you violate these terms after such reinstatement, any
|
||||
additional violation of these terms will cause your licenses to terminate
|
||||
automatically and permanently.
|
||||
|
||||
No Liability
|
||||
|
||||
As far as the law allows, the software comes as is, without any warranty or
|
||||
condition, and the licensor will not be liable to you for any damages arising out
|
||||
of these terms or the use or nature of the software, under any kind of legal
|
||||
claim.
|
||||
|
||||
Definitions
|
||||
|
||||
The licensor is the entity offering these terms, and the software is the
|
||||
software the licensor makes available under these terms, including any portion
|
||||
of it.
|
||||
|
||||
you refers to the individual or entity agreeing to these terms.
|
||||
|
||||
your company is any legal entity, sole proprietorship, or other kind of
|
||||
organization that you work for, plus all organizations that have control over,
|
||||
are under the control of, or are under common control with that organization.
|
||||
control means ownership of substantially all the assets of an entity, or the
|
||||
power to direct its management and policies by vote, contract, or otherwise.
|
||||
Control can be direct or indirect.
|
||||
|
||||
your licenses are all the licenses granted to you for the software under these
|
||||
terms.
|
||||
|
||||
use means anything you do with the software requiring one of your licenses.
|
||||
|
||||
trademark means trademarks, service marks, and similar rights.
|
||||
509
packages/elixir/README.md
generated
Normal file
509
packages/elixir/README.md
generated
Normal file
@@ -0,0 +1,509 @@
|
||||
# Elixir
|
||||
|
||||
<div align="center" style="display: flex; flex-wrap: wrap; gap: 8px; justify-content: center; margin: 20px 0;">
|
||||
<a href="https://github.com/kreuzberg-dev/alef">
|
||||
<img src="https://img.shields.io/badge/Bindings-alef%20%D7%90-007ec6" alt="Bindings">
|
||||
</a>
|
||||
<!-- Language Bindings -->
|
||||
<a href="https://crates.io/crates/kreuzberg">
|
||||
<img src="https://img.shields.io/crates/v/kreuzberg?label=Rust&color=007ec6" alt="Rust">
|
||||
</a>
|
||||
<a href="https://pypi.org/project/kreuzberg/">
|
||||
<img src="https://img.shields.io/pypi/v/kreuzberg?label=Python&color=007ec6" alt="Python">
|
||||
</a>
|
||||
<a href="https://www.npmjs.com/package/@kreuzberg/node">
|
||||
<img src="https://img.shields.io/npm/v/@kreuzberg/node?label=Node.js&color=007ec6" alt="Node.js">
|
||||
</a>
|
||||
<a href="https://www.npmjs.com/package/@kreuzberg/wasm">
|
||||
<img src="https://img.shields.io/npm/v/@kreuzberg/wasm?label=WASM&color=007ec6" alt="WASM">
|
||||
</a>
|
||||
<a href="https://central.sonatype.com/artifact/dev.kreuzberg/kreuzberg">
|
||||
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
||||
</a>
|
||||
<a href="https://github.com/kreuzberg-dev/kreuzberg/tree/main/packages/go/v5">
|
||||
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v5*" alt="Go">
|
||||
</a>
|
||||
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
||||
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
||||
</a>
|
||||
<a href="https://packagist.org/packages/kreuzberg/kreuzberg">
|
||||
<img src="https://img.shields.io/packagist/v/kreuzberg/kreuzberg?label=PHP&color=007ec6" alt="PHP">
|
||||
</a>
|
||||
<a href="https://rubygems.org/gems/kreuzberg">
|
||||
<img src="https://img.shields.io/gem/v/kreuzberg?label=Ruby&color=007ec6" alt="Ruby">
|
||||
</a>
|
||||
<a href="https://hex.pm/packages/kreuzberg">
|
||||
<img src="https://img.shields.io/hexpm/v/kreuzberg?label=Elixir&color=007ec6" alt="Elixir">
|
||||
</a>
|
||||
<a href="https://kreuzberg-dev.r-universe.dev/kreuzberg">
|
||||
<img src="https://img.shields.io/badge/R-kreuzberg-007ec6" alt="R">
|
||||
</a>
|
||||
<a href="https://pub.dev/packages/kreuzberg">
|
||||
<img src="https://img.shields.io/pub/v/kreuzberg?label=Dart&color=007ec6" alt="Dart">
|
||||
</a>
|
||||
<a href="https://central.sonatype.com/artifact/dev.kreuzberg/kreuzberg-android">
|
||||
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg-android?label=Kotlin&color=007ec6" alt="Kotlin">
|
||||
</a>
|
||||
<a href="https://github.com/kreuzberg-dev/kreuzberg/tree/main/packages/swift">
|
||||
<img src="https://img.shields.io/badge/Swift-SPM-007ec6" alt="Swift">
|
||||
</a>
|
||||
<a href="https://github.com/kreuzberg-dev/kreuzberg/tree/main/packages/zig">
|
||||
<img src="https://img.shields.io/badge/Zig-package-007ec6" alt="Zig">
|
||||
</a>
|
||||
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
||||
<img src="https://img.shields.io/badge/C-FFI-007ec6" alt="C FFI">
|
||||
</a>
|
||||
<a href="https://github.com/kreuzberg-dev/kreuzberg/pkgs/container/kreuzberg">
|
||||
<img src="https://img.shields.io/badge/Docker-ghcr.io-007ec6?logo=docker&logoColor=white" alt="Docker">
|
||||
</a>
|
||||
<a href="https://github.com/kreuzberg-dev/kreuzberg/pkgs/container/charts%2Fkreuzberg">
|
||||
<img src="https://img.shields.io/badge/Helm-ghcr.io-007ec6?logo=helm&logoColor=white" alt="Helm">
|
||||
</a>
|
||||
|
||||
<!-- Project Info -->
|
||||
<a href="https://github.com/kreuzberg-dev/kreuzberg/blob/main/LICENSE">
|
||||
<img src="https://img.shields.io/badge/License-Elastic--2.0-007ec6" alt="License">
|
||||
</a>
|
||||
<a href="https://docs.kreuzberg.dev">
|
||||
<img src="https://img.shields.io/badge/Docs-kreuzberg-007ec6" alt="Documentation">
|
||||
</a>
|
||||
<a href="https://huggingface.co/Kreuzberg">
|
||||
<img src="https://img.shields.io/badge/Hugging%20Face-Kreuzberg-007ec6" alt="Hugging Face">
|
||||
</a>
|
||||
</div>
|
||||
|
||||
<div align="center" style="margin: 24px 0 0;">
|
||||
<a href="https://kreuzberg.dev">
|
||||
<img alt="Kreuzberg" src="https://github.com/user-attachments/assets/419fc06c-8313-4324-b159-4b4d3cfce5c0" />
|
||||
</a>
|
||||
</div>
|
||||
|
||||
<div align="center" style="display: flex; flex-wrap: wrap; gap: 12px; justify-content: center; margin: 28px 0 24px;">
|
||||
<a href="https://discord.gg/xt9WY3GnKR">
|
||||
<img height="22" src="https://img.shields.io/badge/Discord-Chat-007ec6?logo=discord&logoColor=white" alt="Join Discord">
|
||||
</a>
|
||||
<a href="https://docs.kreuzberg.dev/demo.html">
|
||||
<img height="22" src="https://img.shields.io/badge/Live%20Demo-Open-007ec6?logo=webassembly&logoColor=white" alt="Live Demo">
|
||||
</a>
|
||||
</div>
|
||||
|
||||
Extract text, tables, images, and metadata from 90+ file formats — including PDF, Office documents, and images — and analyze source code in 300+ programming languages. Elixir bindings with native BEAM concurrency, OTP integration, and an idiomatic Elixir API.
|
||||
|
||||
## What This Package Provides
|
||||
|
||||
- **Document intelligence core** — extract text, tables, images, metadata, entities, keywords, and code intelligence from one API.
|
||||
- **Format coverage** — PDF, Office, images, HTML/XML, email, archives, notebooks, citations, scientific formats, and plain text.
|
||||
- **OCR choices** — Tesseract, PaddleOCR, EasyOCR where supported, VLM OCR through liter-llm, and plugin hooks for custom backends.
|
||||
- **Same engine as every binding** — Rust, Python, Node.js, Go, Java, PHP, Ruby, .NET, Elixir, R, WASM, Kotlin Android, Swift, Dart, Zig, and C FFI share the same Rust implementation.
|
||||
- **BEAM package** — Rustler NIF binding for OTP pipelines.
|
||||
|
||||
## Installation
|
||||
|
||||
### Package Installation
|
||||
|
||||
Add to your `mix.exs` dependencies:
|
||||
|
||||
```elixir
|
||||
def deps do
|
||||
[
|
||||
{:kreuzberg, "~> 5.0.0-rc.3"}
|
||||
]
|
||||
end
|
||||
```
|
||||
|
||||
Then run:
|
||||
|
||||
```bash
|
||||
mix deps.get
|
||||
```
|
||||
|
||||
### System Requirements
|
||||
- **Elixir 1.14+** and **Erlang/OTP 26+** required
|
||||
- Pre-compiled NIFs bundled via `rustler_precompiled` for macOS (arm64, x64), Linux (x64, arm64), and Windows (x64)
|
||||
- Optional: [ONNX Runtime](https://github.com/microsoft/onnxruntime/releases) version 1.22.x for embeddings support
|
||||
- Optional: [Tesseract OCR](https://github.com/tesseract-ocr/tesseract) for OCR functionality
|
||||
|
||||
## Quick Start
|
||||
|
||||
### Basic Extraction
|
||||
|
||||
Extract text, metadata, and structure from any supported document format:
|
||||
|
||||
```elixir title="Elixir"
# Basic document extraction workflow
# Load file -> extract -> access results

{:ok, result} = Kreuzberg.extract_file("document.pdf")

IO.puts("Extracted Content:")
IO.puts(result.content)

IO.puts("\nMetadata:")
IO.puts("Format: #{inspect(result.metadata.format)}")
IO.puts("Tables found: #{length(result.tables)}")
```
|
||||
|
||||
### Common Use Cases
|
||||
|
||||
#### Extract with Custom Configuration
|
||||
|
||||
Most use cases benefit from configuration to control extraction behavior:
|
||||
|
||||
**With OCR (for scanned documents):**
|
||||
|
||||
```elixir title="Elixir"
alias Kreuzberg.ExtractionConfig

config = %ExtractionConfig{
  ocr: %{"enabled" => true, "backend" => "tesseract"}
}

{:ok, result} = Kreuzberg.extract_file("scanned_document.pdf", nil, config)

content = result.content
IO.puts("OCR Extracted content:")
IO.puts(content)
IO.puts("Metadata: #{inspect(result.metadata)}")
```
|
||||
|
||||
#### Table Extraction
|
||||
|
||||
See [Configuration Guide](https://docs.kreuzberg.dev/guides/configuration/) for table extraction options.
|
||||
|
||||
#### Processing Multiple Files
|
||||
|
||||
```elixir title="Elixir"
file_paths = ["document1.pdf", "document2.pdf", "document3.pdf"]

{:ok, results} = Kreuzberg.batch_extract_files(file_paths)

Enum.each(results, fn result ->
  IO.puts("File: #{result.mime_type}")
  IO.puts("Content length: #{byte_size(result.content)} bytes")
  IO.puts("Tables: #{length(result.tables)}")
  IO.puts("---")
end)

IO.puts("Total files processed: #{length(results)}")
```
|
||||
|
||||
#### Async Processing
|
||||
|
||||
For non-blocking document processing:
|
||||
|
||||
```elixir title="Elixir"
# Extract from different file types (PDF, DOCX, etc.)

case Kreuzberg.extract_file("document.pdf") do
  {:ok, result} ->
    IO.puts("Content: #{result.content}")
    IO.puts("Format: #{inspect(result.metadata.format)}")
    IO.puts("Tables: #{length(result.tables)}")

  {:error, reason} ->
    IO.puts("Extraction failed: #{inspect(reason)}")
end
```
|
||||
|
||||
### Next Steps
|
||||
|
||||
- **[Installation Guide](https://docs.kreuzberg.dev/getting-started/installation/)** - Platform-specific setup
|
||||
- **[API Documentation](https://docs.kreuzberg.dev/reference/api-python/)** - Complete API reference
|
||||
- **[Examples & Guides](https://docs.kreuzberg.dev/)** - Full code examples and usage guides
|
||||
- **[Configuration Guide](https://docs.kreuzberg.dev/guides/configuration/)** - Advanced configuration options
|
||||
|
||||
## Features
|
||||
|
||||
### Supported File Formats (90+)
|
||||
|
||||
90+ file formats across 8 major categories with intelligent format detection and comprehensive metadata extraction.
|
||||
|
||||
#### Office Documents
|
||||
|
||||
| Category | Formats | Capabilities |
|
||||
|----------|---------|--------------|
|
||||
| **Word Processing** | `.docx`, `.docm`, `.dotx`, `.dotm`, `.dot`, `.odt` | Full text, tables, images, metadata, styles |
|
||||
| **Spreadsheets** | `.xlsx`, `.xlsm`, `.xlsb`, `.xls`, `.xla`, `.xlam`, `.xltm`, `.xltx`, `.xlt`, `.ods` | Sheet data, formulas, cell metadata, charts |
|
||||
| **Presentations** | `.pptx`, `.pptm`, `.ppsx`, `.potx`, `.potm`, `.pot`, `.ppt` | Slides, speaker notes, images, metadata |
|
||||
| **PDF** | `.pdf` | Text, tables, images, metadata, OCR support |
|
||||
| **eBooks** | `.epub`, `.fb2` | Chapters, metadata, embedded resources |
|
||||
| **Database** | `.dbf` | Table data extraction, field type support |
|
||||
| **Hangul** | `.hwp`, `.hwpx` | Korean document format, text extraction |
|
||||
|
||||
#### Images (OCR-Enabled)
|
||||
|
||||
| Category | Formats | Features |
|
||||
|----------|---------|----------|
|
||||
| **Raster** | `.png`, `.jpg`, `.jpeg`, `.gif`, `.webp`, `.bmp`, `.tiff`, `.tif` | OCR, table detection, EXIF metadata, dimensions, color space |
|
||||
| **Advanced** | `.jp2`, `.jpx`, `.jpm`, `.mj2`, `.jbig2`, `.jb2`, `.pnm`, `.pbm`, `.pgm`, `.ppm` | OCR via hayro-jpeg2000 (pure Rust decoder), JBIG2 support, table detection, format-specific metadata |
|
||||
| **Vector** | `.svg` | DOM parsing, embedded text, graphics metadata |
|
||||
|
||||
#### Web & Data
|
||||
|
||||
| Category | Formats | Features |
|
||||
|----------|---------|----------|
|
||||
| **Markup** | `.html`, `.htm`, `.xhtml`, `.xml`, `.svg` | DOM parsing, metadata (Open Graph, Twitter Card), link extraction |
|
||||
| **Structured Data** | `.json`, `.yaml`, `.yml`, `.toml`, `.csv`, `.tsv` | Schema detection, nested structures, validation |
|
||||
| **Text & Markdown** | `.txt`, `.md`, `.markdown`, `.djot`, `.rst`, `.org`, `.rtf` | CommonMark, GFM, Djot, reStructuredText, Org Mode |
|
||||
|
||||
#### Email & Archives
|
||||
|
||||
| Category | Formats | Features |
|
||||
|----------|---------|----------|
|
||||
| **Email** | `.eml`, `.msg` | Headers, body (HTML/plain), attachments, threading |
|
||||
| **Archives** | `.zip`, `.tar`, `.tgz`, `.gz`, `.7z` | File listing, nested archives, metadata |
|
||||
|
||||
#### Academic & Scientific
|
||||
|
||||
| Category | Formats | Features |
|
||||
|----------|---------|----------|
|
||||
| **Citations** | `.bib`, `.biblatex`, `.ris`, `.nbib`, `.enw`, `.csl` | Structured parsing: RIS (structured), PubMed/MEDLINE, EndNote XML (structured), BibTeX, CSL JSON |
|
||||
| **Scientific** | `.tex`, `.latex`, `.typst`, `.jats`, `.ipynb`, `.docbook` | LaTeX, Jupyter notebooks, PubMed JATS |
|
||||
| **Documentation** | `.opml`, `.pod`, `.mdoc`, `.troff` | Technical documentation formats |
|
||||
|
||||
#### Code Intelligence (300+ Languages)
|
||||
|
||||
| Feature | Description |
|
||||
|---------|-------------|
|
||||
| **Structure Extraction** | Functions, classes, methods, structs, interfaces, enums |
|
||||
| **Import/Export Analysis** | Module dependencies, re-exports, wildcard imports |
|
||||
| **Symbol Extraction** | Variables, constants, type aliases, properties |
|
||||
| **Docstring Parsing** | Google, NumPy, Sphinx, JSDoc, RustDoc, and 10+ formats |
|
||||
| **Diagnostics** | Parse errors with line/column positions |
|
||||
| **Syntax-Aware Chunking** | Split code by semantic boundaries, not arbitrary byte offsets |
|
||||
|
||||
Powered by [tree-sitter-language-pack](https://github.com/kreuzberg-dev/tree-sitter-language-pack) — [documentation](https://docs.tree-sitter-language-pack.kreuzberg.dev).
|
||||
|
||||
**[Complete Format Reference](https://docs.kreuzberg.dev/reference/formats/)**
|
||||
|
||||
### Key Capabilities
|
||||
|
||||
- **Text Extraction** - Extract all text content with position and formatting information
|
||||
- **Metadata Extraction** - Retrieve document properties, creation date, author, etc.
|
||||
- **Table Extraction** - Parse tables with structure and cell content preservation
|
||||
- **Image Extraction** - Extract embedded images and render page previews
|
||||
- **OCR Support** - Integrate multiple OCR backends for scanned documents
|
||||
- **Async/Await** - Non-blocking document processing with concurrent operations
|
||||
- **Plugin System** - Extensible post-processing for custom text transformation
|
||||
- **Embeddings** - Generate vector embeddings using ONNX Runtime models
|
||||
- **Batch Processing** - Efficiently process multiple documents in parallel
|
||||
- **Memory Efficient** - Stream large files without loading entirely into memory
|
||||
- **Language Detection** - Detect and support multiple languages in documents
|
||||
- **Code Intelligence** - Extract structure, imports, exports, symbols, and docstrings from [300+ programming languages](https://docs.tree-sitter-language-pack.kreuzberg.dev) via tree-sitter
|
||||
- **Configuration** - Fine-grained control over extraction behavior
|
||||
|
||||
### Performance Characteristics
|
||||
|
||||
| Format | Speed | Memory | Notes |
|
||||
|--------|-------|--------|-------|
|
||||
| **PDF (text)** | 10-100 MB/s | ~50MB per doc | Fastest extraction |
|
||||
| **Office docs** | 20-200 MB/s | ~100MB per doc | DOCX, XLSX, PPTX |
|
||||
| **Images (OCR)** | 1-5 MB/s | Variable | Depends on OCR backend |
|
||||
| **Archives** | 5-50 MB/s | ~200MB per doc | ZIP, TAR, etc. |
|
||||
| **Web formats** | 50-200 MB/s | Streaming | HTML, XML, JSON |
|
||||
|
||||
## OCR Support
|
||||
|
||||
Kreuzberg supports multiple OCR backends for extracting text from scanned documents and images:
|
||||
|
||||
- **Tesseract**
|
||||
|
||||
- **PaddleOCR**
|
||||
|
||||
### OCR Configuration Example
|
||||
|
||||
```elixir title="Elixir"
alias Kreuzberg.ExtractionConfig

config = %ExtractionConfig{
  ocr: %{"enabled" => true, "backend" => "tesseract"}
}

{:ok, result} = Kreuzberg.extract_file("scanned_document.pdf", nil, config)

content = result.content
IO.puts("OCR Extracted content:")
IO.puts(content)
IO.puts("Metadata: #{inspect(result.metadata)}")
```
|
||||
|
||||
## Async Support
|
||||
|
||||
This binding provides full async/await support for non-blocking document processing:
|
||||
|
||||
```elixir title="Elixir"
# Extract from different file types (PDF, DOCX, etc.)

case Kreuzberg.extract_file("document.pdf") do
  {:ok, result} ->
    IO.puts("Content: #{result.content}")
    IO.puts("Format: #{inspect(result.metadata.format)}")
    IO.puts("Tables: #{length(result.tables)}")

  {:error, reason} ->
    IO.puts("Extraction failed: #{inspect(reason)}")
end
```
|
||||
|
||||
## Plugin System
|
||||
|
||||
Kreuzberg supports extensible post-processing plugins for custom text transformation and filtering.
|
||||
|
||||
For detailed plugin documentation, visit [Plugin System Guide](https://docs.kreuzberg.dev/guides/plugins/).
|
||||
|
||||
### Plugin Example
|
||||
|
||||
```elixir title="Elixir"
alias Kreuzberg.Plugin

# Word Count Post-Processor Plugin
# This post-processor automatically counts words in extracted content
# and adds the word count to the metadata.

defmodule MyApp.Plugins.WordCountProcessor do
  @behaviour Kreuzberg.Plugin.PostProcessor
  require Logger

  @impl true
  def name do
    "WordCountProcessor"
  end

  @impl true
  def processing_stage do
    :post
  end

  @impl true
  def version do
    "1.0.0"
  end

  @impl true
  def initialize do
    :ok
  end

  @impl true
  def shutdown do
    :ok
  end

  @impl true
  def process(result, _options) do
    content = result["content"] || ""
    word_count = content
      |> String.split(~r/\s+/, trim: true)
      |> length()

    # Update metadata with word count
    metadata = Map.get(result, "metadata", %{})
    updated_metadata = Map.put(metadata, "word_count", word_count)

    {:ok, Map.put(result, "metadata", updated_metadata)}
  end
end

# Register the word count post-processor
Plugin.register_post_processor(:word_count_processor, MyApp.Plugins.WordCountProcessor)

# Example usage
result = %{
  "content" => "The quick brown fox jumps over the lazy dog. This is a sample document with multiple words.",
  "metadata" => %{
    "source" => "document.pdf",
    "pages" => 1
  }
}

case MyApp.Plugins.WordCountProcessor.process(result, %{}) do
  {:ok, processed_result} ->
    word_count = processed_result["metadata"]["word_count"]
    IO.puts("Word count added: #{word_count} words")
    IO.inspect(processed_result, label: "Processed Result")

  {:error, reason} ->
    IO.puts("Processing failed: #{reason}")
end

# List all registered post-processors
{:ok, processors} = Plugin.list_post_processors()
IO.inspect(processors, label: "Registered Post-Processors")
```
|
||||
|
||||
## Embeddings Support
|
||||
|
||||
Generate vector embeddings for extracted text using the built-in ONNX Runtime support. Requires ONNX Runtime installation.
|
||||
|
||||
**[Embeddings Guide](https://docs.kreuzberg.dev/features/#embeddings)**
|
||||
|
||||
## Batch Processing
|
||||
|
||||
Process multiple documents efficiently:
|
||||
|
||||
```elixir title="Elixir"
file_paths = ["document1.pdf", "document2.pdf", "document3.pdf"]

{:ok, results} = Kreuzberg.batch_extract_files(file_paths)

Enum.each(results, fn result ->
  IO.puts("File: #{result.mime_type}")
  IO.puts("Content length: #{byte_size(result.content)} bytes")
  IO.puts("Tables: #{length(result.tables)}")
  IO.puts("---")
end)

IO.puts("Total files processed: #{length(results)}")
```
|
||||
|
||||
## Configuration
|
||||
|
||||
For advanced configuration options including language detection, table extraction, OCR settings, and more:
|
||||
|
||||
**[Configuration Guide](https://docs.kreuzberg.dev/guides/configuration/)**
|
||||
|
||||
## Documentation
|
||||
|
||||
- **[Official Documentation](https://docs.kreuzberg.dev/)**
|
||||
- **[API Reference](https://docs.kreuzberg.dev/reference/api-python/)**
|
||||
- **[Examples & Guides](https://docs.kreuzberg.dev/)**
|
||||
|
||||
## Contributing
|
||||
|
||||
Contributions are welcome! See [Contributing Guide](https://github.com/kreuzberg-dev/kreuzberg/blob/main/CONTRIBUTING.md).
|
||||
|
||||
## Part of Kreuzberg.dev
|
||||
|
||||
- [Kreuzberg Cloud](https://github.com/kreuzberg-dev/kreuzberg-cloud) — managed extraction API with SDKs, dashboards, and observability.
|
||||
- [kreuzcrawl](https://github.com/kreuzberg-dev/kreuzcrawl) — web crawling and scraping with HTML→Markdown and headless-Chrome fallback.
|
||||
- [html-to-markdown](https://github.com/kreuzberg-dev/html-to-markdown) — fast, lossless HTML→Markdown engine.
|
||||
- [liter-llm](https://github.com/kreuzberg-dev/liter-llm) — universal LLM API client with native bindings for 14 languages and 143 providers.
|
||||
- [tree-sitter-language-pack](https://github.com/kreuzberg-dev/tree-sitter-language-pack) — tree-sitter grammars and code-intelligence primitives.
|
||||
- [alef](https://github.com/kreuzberg-dev/alef) — the polyglot binding generator that produces this README and all per-language bindings.
|
||||
- [Discord](https://discord.gg/xt9WY3GnKR) — community, roadmap, announcements.
|
||||
|
||||
## License
|
||||
|
||||
Elastic-2.0 License — see [LICENSE](../../LICENSE) for details.
|
||||
|
||||
## Support
|
||||
|
||||
- **Discord Community**: [Join our Discord](https://discord.gg/xt9WY3GnKR)
|
||||
- **GitHub Issues**: [Report bugs](https://github.com/kreuzberg-dev/kreuzberg/issues)
|
||||
- **Discussions**: [Ask questions](https://github.com/kreuzberg-dev/kreuzberg/discussions)
|
||||
276
packages/elixir/lib/kreuzberg.ex
generated
Normal file
276
packages/elixir/lib/kreuzberg.ex
generated
Normal file
@@ -0,0 +1,276 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg do
|
||||
@moduledoc "High-level API for kreuzberg"
|
||||
|
||||
@doc """
Extracts content from in-memory bytes without an explicit config.

Forwards `content` and `mime_type` to `Kreuzberg.Native.extract_bytes_async/3`,
passing `nil` as the config argument, and returns the native result unchanged.
"""
@spec extract_bytes_async(binary(), String.t()) :: {:ok, map()} | {:error, atom(), String.t()}
def extract_bytes_async(content, mime_type),
  do: Kreuzberg.Native.extract_bytes_async(content, mime_type, nil)
|
||||
|
||||
@doc """
Extracts content from in-memory bytes.

Delegates to `Kreuzberg.Native.extract_bytes_async/3` with the arguments
unchanged. Per the spec, `config` is a string or `nil`.
"""
@spec extract_bytes_async(binary(), String.t(), String.t() | nil) ::
        {:ok, map()} | {:error, atom(), String.t()}
defdelegate extract_bytes_async(content, mime_type, config), to: Kreuzberg.Native
|
||||
|
||||
@doc """
Extracts content from the file at `path`.

## Options

  * `:mime_type` - passed as the second argument to the native call (`nil` when absent).
  * `:config` - passed as the third argument to the native call (`nil` when absent).

The result of `Kreuzberg.Native.extract_file_async/3` is returned unchanged.
"""
@spec extract_file_async(String.t(), keyword()) :: {:ok, map()} | {:error, atom(), String.t()}
def extract_file_async(path, opts \\ []) do
  mime_type = Keyword.get(opts, :mime_type)
  config = Keyword.get(opts, :config)

  Kreuzberg.Native.extract_file_async(path, mime_type, config)
end
|
||||
|
||||
@doc """
Synchronous wrapper for `extract_file`.

## Options

  * `:mime_type` - passed as the second argument to the native call (`nil` when absent).
  * `:config` - passed as the third argument to the native call (`nil` when absent).

The result of `Kreuzberg.Native.extract_file_sync/3` is returned unchanged.
"""
@spec extract_file_sync(String.t(), keyword()) :: {:ok, map()} | {:error, atom(), String.t()}
def extract_file_sync(path, opts \\ []) do
  mime_type = Keyword.get(opts, :mime_type)
  config = Keyword.get(opts, :config)

  Kreuzberg.Native.extract_file_sync(path, mime_type, config)
end
|
||||
|
||||
@doc """
Synchronous wrapper for `extract_bytes`, called without an explicit config.

Forwards `content` and `mime_type` to `Kreuzberg.Native.extract_bytes_sync/3`,
passing `nil` as the config argument, and returns the native result unchanged.
"""
@spec extract_bytes_sync(binary(), String.t()) :: {:ok, map()} | {:error, atom(), String.t()}
def extract_bytes_sync(content, mime_type),
  do: Kreuzberg.Native.extract_bytes_sync(content, mime_type, nil)
|
||||
|
||||
@doc """
Synchronous wrapper for `extract_bytes`.

Delegates to `Kreuzberg.Native.extract_bytes_sync/3` with the arguments
unchanged. Per the spec, `config` is a string or `nil`.
"""
@spec extract_bytes_sync(binary(), String.t(), String.t() | nil) ::
        {:ok, map()} | {:error, atom(), String.t()}
defdelegate extract_bytes_sync(content, mime_type, config), to: Kreuzberg.Native
|
||||
|
||||
@doc """
Detects the MIME type from raw file bytes.

Delegates to `Kreuzberg.Native.detect_mime_type_from_bytes/1`.
"""
@spec detect_mime_type_from_bytes(binary()) :: {:ok, String.t()} | {:error, atom(), String.t()}
defdelegate detect_mime_type_from_bytes(content), to: Kreuzberg.Native
|
||||
|
||||
@doc """
Returns the file extensions associated with the given MIME type.

Delegates to `Kreuzberg.Native.get_extensions_for_mime/1`.
"""
@spec get_extensions_for_mime(String.t()) :: {:ok, [String.t()]} | {:error, atom(), String.t()}
defdelegate get_extensions_for_mime(mime_type), to: Kreuzberg.Native
|
||||
|
||||
@doc """
Clears all embedding backends from the global registry.

Delegates to `Kreuzberg.Native.clear_embedding_backends/0`.
"""
@spec clear_embedding_backends() :: {:ok, nil} | {:error, atom(), String.t()}
defdelegate clear_embedding_backends(), to: Kreuzberg.Native
|
||||
|
||||
@doc """
Lists the names of all registered embedding backends.

Delegates to `Kreuzberg.Native.list_embedding_backends/0`.
"""
@spec list_embedding_backends() :: {:ok, [String.t()]} | {:error, atom(), String.t()}
defdelegate list_embedding_backends(), to: Kreuzberg.Native
|
||||
|
||||
@doc """
Lists the names of all registered document extractors.

Delegates to `Kreuzberg.Native.list_document_extractors/0`.
"""
@spec list_document_extractors() :: {:ok, [String.t()]} | {:error, atom(), String.t()}
defdelegate list_document_extractors(), to: Kreuzberg.Native
|
||||
|
||||
@doc """
Clears all document extractors from the global registry.

Delegates to `Kreuzberg.Native.clear_document_extractors/0`.
"""
@spec clear_document_extractors() :: {:ok, nil} | {:error, atom(), String.t()}
defdelegate clear_document_extractors(), to: Kreuzberg.Native
|
||||
|
||||
@doc """
List all registered OCR backends.

Delegates to `Kreuzberg.Native.list_ocr_backends/0`.
"""
@spec list_ocr_backends() :: {:ok, [String.t()]} | {:error, atom(), String.t()}
def list_ocr_backends, do: Kreuzberg.Native.list_ocr_backends()
|
||||
|
||||
@doc """
Clear all OCR backends from the global registry.

Delegates to `Kreuzberg.Native.clear_ocr_backends/0`.
"""
@spec clear_ocr_backends() :: {:ok, nil} | {:error, atom(), String.t()}
def clear_ocr_backends, do: Kreuzberg.Native.clear_ocr_backends()
|
||||
|
||||
@doc """
List all registered post-processor names.

Delegates to `Kreuzberg.Native.list_post_processors/0`.
"""
@spec list_post_processors() :: {:ok, [String.t()]} | {:error, atom(), String.t()}
def list_post_processors, do: Kreuzberg.Native.list_post_processors()
|
||||
|
||||
@doc """
Remove all registered post-processors.

Delegates to `Kreuzberg.Native.clear_post_processors/0`.
"""
@spec clear_post_processors() :: {:ok, nil} | {:error, atom(), String.t()}
def clear_post_processors, do: Kreuzberg.Native.clear_post_processors()
|
||||
|
||||
@doc """
List names of all registered renderers.

Delegates to `Kreuzberg.Native.list_renderers/0`.
"""
@spec list_renderers() :: {:ok, [String.t()]} | {:error, atom(), String.t()}
def list_renderers, do: Kreuzberg.Native.list_renderers()
|
||||
|
||||
@doc """
Clear all renderers from the global registry.

Delegates to `Kreuzberg.Native.clear_renderers/0`.
"""
@spec clear_renderers() :: {:ok, nil} | {:error, atom(), String.t()}
def clear_renderers, do: Kreuzberg.Native.clear_renderers()
|
||||
|
||||
@doc """
List names of all registered validators.

Delegates to `Kreuzberg.Native.list_validators/0`.
"""
@spec list_validators() :: {:ok, [String.t()]} | {:error, atom(), String.t()}
def list_validators, do: Kreuzberg.Native.list_validators()
|
||||
|
||||
@doc """
Remove all registered validators.

Delegates to `Kreuzberg.Native.clear_validators/0`.
"""
@spec clear_validators() :: {:ok, nil} | {:error, atom(), String.t()}
def clear_validators, do: Kreuzberg.Native.clear_validators()
|
||||
|
||||
@doc """
Compare two extraction results and return a structured diff.

## Options

  * `:a` - first value passed to `Kreuzberg.Native.compare/3`
  * `:b` - second value passed to `Kreuzberg.Native.compare/3`
  * `:opts` - third value passed to `Kreuzberg.Native.compare/3`

Any key that is absent from `opts` is forwarded as `nil`.
"""
@spec compare(keyword()) :: map()
def compare(opts \\ []) do
  left = Keyword.get(opts, :a)
  right = Keyword.get(opts, :b)
  diff_opts = Keyword.get(opts, :opts)

  Kreuzberg.Native.compare(left, right, diff_opts)
end
|
||||
|
||||
@doc """
Generate embeddings asynchronously for a list of text strings.

Calls `Kreuzberg.Native.embed_texts_async/2` with `nil` as the config argument.
"""
@spec embed_texts_async([String.t()]) :: {:ok, [[float()]]} | {:error, atom(), String.t()}
def embed_texts_async(texts), do: Kreuzberg.Native.embed_texts_async(texts, nil)
|
||||
|
||||
@doc """
Generate embeddings asynchronously for a list of text strings.

Forwards `texts` and `config` unchanged to
`Kreuzberg.Native.embed_texts_async/2`.
"""
@spec embed_texts_async([String.t()], String.t() | nil) ::
        {:ok, [[float()]]} | {:error, atom(), String.t()}
def embed_texts_async(texts, config),
  do: Kreuzberg.Native.embed_texts_async(texts, config)
|
||||
|
||||
@doc """
Render a single PDF page to PNG bytes.

## Options

  * `:dpi` - forwarded as the third argument of
    `Kreuzberg.Native.render_pdf_page_to_png/4`
  * `:password` - forwarded as the fourth argument

Any option that is absent from `opts` is forwarded as `nil`.
"""
@spec render_pdf_page_to_png(binary(), non_neg_integer(), keyword()) ::
        {:ok, binary()} | {:error, atom(), String.t()}
def render_pdf_page_to_png(pdf_bytes, page_index, opts \\ []) do
  dpi = Keyword.get(opts, :dpi)
  password = Keyword.get(opts, :password)

  Kreuzberg.Native.render_pdf_page_to_png(pdf_bytes, page_index, dpi, password)
end
|
||||
|
||||
@doc """
Detect the MIME type of a file at the given path.

Forwards `path` and `check_exists` unchanged to
`Kreuzberg.Native.detect_mime_type/2`.
"""
@spec detect_mime_type(String.t(), boolean()) :: {:ok, String.t()} | {:error, atom(), String.t()}
def detect_mime_type(path, check_exists),
  do: Kreuzberg.Native.detect_mime_type(path, check_exists)
|
||||
|
||||
@doc """
Embed a list of texts using the configured embedding model.

Calls `Kreuzberg.Native.embed_texts/2` with `nil` as the config argument.
"""
@spec embed_texts([String.t()]) :: {:ok, [[float()]]} | {:error, atom(), String.t()}
def embed_texts(texts), do: Kreuzberg.Native.embed_texts(texts, nil)
|
||||
|
||||
@doc """
Embed a list of texts using the configured embedding model.

Forwards `texts` and `config` unchanged to `Kreuzberg.Native.embed_texts/2`.
"""
@spec embed_texts([String.t()], String.t() | nil) ::
        {:ok, [[float()]]} | {:error, atom(), String.t()}
def embed_texts(texts, config), do: Kreuzberg.Native.embed_texts(texts, config)
|
||||
|
||||
@doc """
Get an embedding preset by name.

Delegates to `Kreuzberg.Native.get_embedding_preset/1`; the declared spec
allows a map or `nil` as the result.
"""
@spec get_embedding_preset(String.t()) :: map() | nil
def get_embedding_preset(name), do: Kreuzberg.Native.get_embedding_preset(name)
|
||||
|
||||
@doc """
List the names of all available embedding presets.

Delegates to `Kreuzberg.Native.list_embedding_presets/0`.
"""
@spec list_embedding_presets() :: [String.t()]
def list_embedding_presets, do: Kreuzberg.Native.list_embedding_presets()
|
||||
|
||||
@doc """
Register an OcrBackend plugin with a GenServer PID and name.

Forwards both arguments to `Kreuzberg.Native.register_ocr_backend/2`.
"""
@spec register_ocr_backend(pid(), String.t()) :: :ok | :error
def register_ocr_backend(genserver_pid, plugin_name),
  do: Kreuzberg.Native.register_ocr_backend(genserver_pid, plugin_name)
|
||||
|
||||
@doc """
Unregister a previously registered OcrBackend plugin by name.

Delegates to `Kreuzberg.Native.unregister_ocr_backend/1`.
"""
@spec unregister_ocr_backend(String.t()) :: :ok | :error
def unregister_ocr_backend(name), do: Kreuzberg.Native.unregister_ocr_backend(name)
|
||||
|
||||
@doc """
Register a PostProcessor plugin with a GenServer PID and name.

Forwards both arguments to `Kreuzberg.Native.register_post_processor/2`.
"""
@spec register_post_processor(pid(), String.t()) :: :ok | :error
def register_post_processor(genserver_pid, plugin_name),
  do: Kreuzberg.Native.register_post_processor(genserver_pid, plugin_name)
|
||||
|
||||
@doc """
Unregister a previously registered PostProcessor plugin by name.

Delegates to `Kreuzberg.Native.unregister_post_processor/1`.
"""
@spec unregister_post_processor(String.t()) :: :ok | :error
def unregister_post_processor(name), do: Kreuzberg.Native.unregister_post_processor(name)
|
||||
|
||||
@doc """
Register a Validator plugin with a GenServer PID and name.

Forwards both arguments to `Kreuzberg.Native.register_validator/2`.
"""
@spec register_validator(pid(), String.t()) :: :ok | :error
def register_validator(genserver_pid, plugin_name),
  do: Kreuzberg.Native.register_validator(genserver_pid, plugin_name)
|
||||
|
||||
@doc """
Unregister a previously registered Validator plugin by name.

Delegates to `Kreuzberg.Native.unregister_validator/1`.
"""
@spec unregister_validator(String.t()) :: :ok | :error
def unregister_validator(name), do: Kreuzberg.Native.unregister_validator(name)
|
||||
|
||||
@doc """
Register an EmbeddingBackend plugin with a GenServer PID and name.

Forwards both arguments to `Kreuzberg.Native.register_embedding_backend/2`.
"""
@spec register_embedding_backend(pid(), String.t()) :: :ok | :error
def register_embedding_backend(genserver_pid, plugin_name),
  do: Kreuzberg.Native.register_embedding_backend(genserver_pid, plugin_name)
|
||||
|
||||
@doc """
Unregister a previously registered EmbeddingBackend plugin by name.

Delegates to `Kreuzberg.Native.unregister_embedding_backend/1`.
"""
@spec unregister_embedding_backend(String.t()) :: :ok | :error
def unregister_embedding_backend(name),
  do: Kreuzberg.Native.unregister_embedding_backend(name)
|
||||
|
||||
@doc """
Register a DocumentExtractor plugin with a GenServer PID and name.

Forwards both arguments to `Kreuzberg.Native.register_document_extractor/2`.
"""
@spec register_document_extractor(pid(), String.t()) :: :ok | :error
def register_document_extractor(genserver_pid, plugin_name),
  do: Kreuzberg.Native.register_document_extractor(genserver_pid, plugin_name)
|
||||
|
||||
@doc """
Unregister a previously registered DocumentExtractor plugin by name.

Delegates to `Kreuzberg.Native.unregister_document_extractor/1`.
"""
@spec unregister_document_extractor(String.t()) :: :ok | :error
def unregister_document_extractor(name),
  do: Kreuzberg.Native.unregister_document_extractor(name)
|
||||
|
||||
@doc """
Register a Renderer plugin with a GenServer PID and name.

Forwards both arguments to `Kreuzberg.Native.register_renderer/2`.
"""
@spec register_renderer(pid(), String.t()) :: :ok | :error
def register_renderer(genserver_pid, plugin_name),
  do: Kreuzberg.Native.register_renderer(genserver_pid, plugin_name)
|
||||
|
||||
@doc """
Unregister a previously registered Renderer plugin by name.

Delegates to `Kreuzberg.Native.unregister_renderer/1`.
"""
@spec unregister_renderer(String.t()) :: :ok | :error
def unregister_renderer(name), do: Kreuzberg.Native.unregister_renderer(name)
|
||||
end
|
||||
48
packages/elixir/lib/kreuzberg/acceleration_config.ex
generated
Normal file
48
packages/elixir/lib/kreuzberg/acceleration_config.ex
generated
Normal file
@@ -0,0 +1,48 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.AccelerationConfig do
  @moduledoc """
  Hardware acceleration configuration for ONNX Runtime models.

  Controls which execution provider (CPU, CoreML, CUDA, TensorRT) is used
  for inference in layout detection and embedding generation.

  ## Fields

    * `:provider` - execution provider selector; the struct default is `:auto`
    * `:device_id` - device index; the struct default is `0`

  ## Example (Rust core API)

  ```rust
  use kreuzberg::AccelerationConfig;

  // Auto-select: CoreML on macOS, CUDA on Linux, CPU elsewhere
  let config = AccelerationConfig::default();

  // Force CPU only
  let config = AccelerationConfig {
  provider: kreuzberg::ExecutionProviderType::Cpu,
  ..Default::default()
  };
  ```
  """

  @typedoc "Hardware acceleration configuration for ONNX Runtime models."
  # `provider` accepts atoms as well as strings: the struct default below is
  # the atom `:auto`, which a `String.t() | nil` type would reject.
  @type t :: %__MODULE__{
          provider: atom() | String.t() | nil,
          device_id: non_neg_integer()
        }

  defstruct provider: :auto,
            device_id: 0

  defimpl Jason.Encoder do
    @doc false
    # Encodes the struct as a plain JSON object, omitting fields whose value is nil.
    def encode(value, opts) do
      value
      |> Map.from_struct()
      |> Enum.reject(fn {_k, v} -> v == nil end)
      |> Enum.into(%{})
      |> Jason.Encoder.encode(opts)
    end
  end
end
|
||||
28
packages/elixir/lib/kreuzberg/annotation_kind.ex
generated
Normal file
28
packages/elixir/lib/kreuzberg/annotation_kind.ex
generated
Normal file
@@ -0,0 +1,28 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.AnnotationKind do
  @moduledoc """
  Types of inline text annotations.

  Each variant is exposed as its own named type: plain variants are atoms,
  variants that carry data are maps tagged with a `:type` key.
  """

  @typedoc "Types of inline text annotations."
  @type t :: term()

  # Atom-only variants.
  @type bold :: :bold
  @type italic :: :italic
  @type underline :: :underline
  @type strikethrough :: :strikethrough
  @type code :: :code
  @type subscript :: :subscript
  @type superscript :: :superscript

  @typedoc """
  Highlighted text (PDF highlights, HTML `<mark>`).
  """
  @type highlight :: :highlight

  # Map variants, tagged by `:type`.
  @type link :: %{type: :link, url: String.t(), title: String.t()}

  @typedoc """
  Text color (CSS-compatible value, e.g. "#ff0000", "red").
  """
  @type color :: %{type: :color, value: String.t()}

  @typedoc """
  Font size with units (e.g. "12pt", "1.2em", "16px").
  """
  @type font_size :: %{type: :font_size, value: String.t()}

  @typedoc """
  Extensible annotation for format-specific styling.
  """
  @type custom :: %{type: :custom, name: String.t(), value: String.t()}
end
|
||||
24
packages/elixir/lib/kreuzberg/archive_entry.ex
generated
Normal file
24
packages/elixir/lib/kreuzberg/archive_entry.ex
generated
Normal file
@@ -0,0 +1,24 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.ArchiveEntry do
  @moduledoc """
  A single file extracted from an archive.

  When archives (ZIP, TAR, 7Z, GZIP) are extracted with recursive extraction
  enabled, each processable file produces its own full `ExtractionResult`.

  ## Fields

    * `:path` - defaults to `nil`
    * `:mime_type` - defaults to `nil`
    * `:result` - defaults to `nil`
  """

  @typedoc "A single file extracted from an archive."
  # `result` is nullable because the struct default below is `nil`;
  # a bare `map()` would make `%Kreuzberg.ArchiveEntry{}` violate its own type.
  @type t :: %__MODULE__{
          path: String.t() | nil,
          mime_type: String.t() | nil,
          result: map() | nil
        }

  defstruct path: nil,
            mime_type: nil,
            result: nil
end
|
||||
38
packages/elixir/lib/kreuzberg/archive_metadata.ex
generated
Normal file
38
packages/elixir/lib/kreuzberg/archive_metadata.ex
generated
Normal file
@@ -0,0 +1,38 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.ArchiveMetadata do
  @moduledoc """
  Archive (ZIP/TAR/7Z) metadata.

  Extracted from compressed archive files containing file lists and size information.
  """

  defstruct format: nil,
            file_count: 0,
            file_list: [],
            total_size: 0,
            compressed_size: nil

  @typedoc "Archive (ZIP/TAR/7Z) metadata."
  @type t :: %__MODULE__{
          format: String.t() | nil,
          file_count: non_neg_integer(),
          file_list: [String.t()],
          total_size: non_neg_integer(),
          compressed_size: non_neg_integer() | nil
        }

  defimpl Jason.Encoder do
    @doc false
    # Serializes the struct as a plain map, leaving out every nil-valued field.
    def encode(value, opts) do
      present =
        for {key, field} <- Map.from_struct(value), not is_nil(field), into: %{} do
          {key, field}
        end

      Jason.Encoder.encode(present, opts)
    end
  end
end
|
||||
21
packages/elixir/lib/kreuzberg/b_box.ex
generated
Normal file
21
packages/elixir/lib/kreuzberg/b_box.ex
generated
Normal file
@@ -0,0 +1,21 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.BBox do
  @moduledoc """
  Bounding box in original image coordinates (x1, y1) top-left, (x2, y2) bottom-right.
  """

  # Every coordinate defaults to 0.0.
  defstruct x1: 0.0, y1: 0.0, x2: 0.0, y2: 0.0

  @typedoc """
  Bounding box in original image coordinates (x1, y1) top-left, (x2, y2) bottom-right.
  """
  @type t :: %__MODULE__{x1: float(), y1: float(), x2: float(), y2: float()}
end
|
||||
24
packages/elixir/lib/kreuzberg/batch_bytes_item.ex
generated
Normal file
24
packages/elixir/lib/kreuzberg/batch_bytes_item.ex
generated
Normal file
@@ -0,0 +1,24 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.BatchBytesItem do
  @moduledoc """
  Batch item for byte array extraction.

  Used with `batch_extract_bytes` and `batch_extract_bytes_sync`
  to represent a single item in a batch extraction job.
  """

  # `content` starts as the empty binary; the other fields start as nil.
  defstruct content: <<>>, mime_type: nil, config: nil

  @typedoc "Batch item for byte array extraction."
  @type t :: %__MODULE__{
          content: binary(),
          mime_type: String.t() | nil,
          config: map() | nil
        }
end
|
||||
22
packages/elixir/lib/kreuzberg/batch_file_item.ex
generated
Normal file
22
packages/elixir/lib/kreuzberg/batch_file_item.ex
generated
Normal file
@@ -0,0 +1,22 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.BatchFileItem do
  @moduledoc """
  Batch item for file extraction.

  Used with `batch_extract_files` and `batch_extract_files_sync`
  to represent a single file in a batch extraction job.
  """

  # Both fields default to nil.
  defstruct path: nil, config: nil

  @typedoc "Batch item for file extraction."
  @type t :: %__MODULE__{path: String.t() | nil, config: map() | nil}
end
|
||||
34
packages/elixir/lib/kreuzberg/bibtex_metadata.ex
generated
Normal file
34
packages/elixir/lib/kreuzberg/bibtex_metadata.ex
generated
Normal file
@@ -0,0 +1,34 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.BibtexMetadata do
  @moduledoc "BibTeX bibliography metadata."

  defstruct entry_count: 0,
            citation_keys: [],
            authors: [],
            year_range: nil,
            entry_types: nil

  @typedoc "BibTeX bibliography metadata."
  @type t :: %__MODULE__{
          entry_count: non_neg_integer(),
          citation_keys: [String.t()],
          authors: [String.t()],
          year_range: map() | nil,
          entry_types: map() | nil
        }

  defimpl Jason.Encoder do
    @doc false
    # Serializes the struct as a plain map, leaving out every nil-valued field.
    def encode(value, opts) do
      present =
        for {key, field} <- Map.from_struct(value), not is_nil(field), into: %{} do
          {key, field}
        end

      Jason.Encoder.encode(present, opts)
    end
  end
end
|
||||
77
packages/elixir/lib/kreuzberg/block_type.ex
generated
Normal file
77
packages/elixir/lib/kreuzberg/block_type.ex
generated
Normal file
@@ -0,0 +1,77 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.BlockType do
  @moduledoc """
  Types of block-level elements in Djot.

  Each variant is an atom; a zero-arity function of the same name returns it.
  """

  @typedoc "Types of block-level elements in Djot."
  @type t ::
          :paragraph
          | :heading
          | :blockquote
          | :code_block
          | :list_item
          | :ordered_list
          | :bullet_list
          | :task_list
          | :definition_list
          | :definition_term
          | :definition_description
          | :div
          | :section
          | :thematic_break
          | :raw_block
          | :math_display

  # Each accessor returns its variant atom directly.

  @spec paragraph() :: t()
  def paragraph, do: :paragraph

  @spec heading() :: t()
  def heading, do: :heading

  @spec blockquote() :: t()
  def blockquote, do: :blockquote

  @spec code_block() :: t()
  def code_block, do: :code_block

  @spec list_item() :: t()
  def list_item, do: :list_item

  @spec ordered_list() :: t()
  def ordered_list, do: :ordered_list

  @spec bullet_list() :: t()
  def bullet_list, do: :bullet_list

  @spec task_list() :: t()
  def task_list, do: :task_list

  @spec definition_list() :: t()
  def definition_list, do: :definition_list

  @spec definition_term() :: t()
  def definition_term, do: :definition_term

  @spec definition_description() :: t()
  def definition_description, do: :definition_description

  @spec div() :: t()
  def div, do: :div

  @spec section() :: t()
  def section, do: :section

  @spec thematic_break() :: t()
  def thematic_break, do: :thematic_break

  @spec raw_block() :: t()
  def raw_block, do: :raw_block

  @spec math_display() :: t()
  def math_display, do: :math_display
end
|
||||
32
packages/elixir/lib/kreuzberg/bounding_box.ex
generated
Normal file
32
packages/elixir/lib/kreuzberg/bounding_box.ex
generated
Normal file
@@ -0,0 +1,32 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.BoundingBox do
  @moduledoc "Bounding box coordinates for element positioning."

  # Every coordinate defaults to 0.0.
  defstruct x0: 0.0, y0: 0.0, x1: 0.0, y1: 0.0

  @typedoc "Bounding box coordinates for element positioning."
  @type t :: %__MODULE__{x0: float(), y0: float(), x1: float(), y1: float()}

  defimpl Jason.Encoder do
    @doc false
    # Serializes the struct as a plain map, leaving out every nil-valued field.
    def encode(value, opts) do
      present =
        for {key, field} <- Map.from_struct(value), not is_nil(field), into: %{} do
          {key, field}
        end

      Jason.Encoder.encode(present, opts)
    end
  end
end
|
||||
22
packages/elixir/lib/kreuzberg/cache_stats.ex
generated
Normal file
22
packages/elixir/lib/kreuzberg/cache_stats.ex
generated
Normal file
@@ -0,0 +1,22 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.CacheStats do
  @moduledoc false

  # Struct with one integer counter and four float fields; all default to zero.
  defstruct total_files: 0,
            total_size_mb: 0.0,
            available_space_mb: 0.0,
            oldest_file_age_days: 0.0,
            newest_file_age_days: 0.0

  @type t :: %__MODULE__{
          total_files: non_neg_integer(),
          total_size_mb: float(),
          available_space_mb: float(),
          oldest_file_age_days: float(),
          newest_file_age_days: float()
        }
end
|
||||
27
packages/elixir/lib/kreuzberg/cell_change.ex
generated
Normal file
27
packages/elixir/lib/kreuzberg/cell_change.ex
generated
Normal file
@@ -0,0 +1,27 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.CellChange do
  @moduledoc """
  A single changed cell within a table.

  Defined here (rather than only in `crate::diff`) so `RevisionDelta` can
  reference it unconditionally, without requiring the `diff` Cargo feature.
  `crate::diff` re-exports this type verbatim.
  """

  # `row` and `col` default to 0; `from` and `to` default to nil.
  defstruct row: 0, col: 0, from: nil, to: nil

  @typedoc "A single changed cell within a table."
  @type t :: %__MODULE__{
          row: non_neg_integer(),
          col: non_neg_integer(),
          from: String.t() | nil,
          to: String.t() | nil
        }
end
|
||||
27
packages/elixir/lib/kreuzberg/chunk.ex
generated
Normal file
27
packages/elixir/lib/kreuzberg/chunk.ex
generated
Normal file
@@ -0,0 +1,27 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.Chunk do
  @moduledoc """
  A text chunk with optional embedding and metadata.

  Chunks are created when chunking is enabled in `ExtractionConfig`. Each chunk
  contains the text content, optional embedding vector (if embedding generation
  is configured), and metadata about its position in the document.

  ## Fields

    * `:content` - defaults to `nil`
    * `:chunk_type` - defaults to `:unknown`
    * `:embedding` - defaults to `nil`
    * `:metadata` - defaults to `nil`
  """

  @typedoc "A text chunk with optional embedding and metadata."
  # `chunk_type` admits atoms because the struct default is `:unknown`, and
  # `metadata` is nullable because its default is `nil`; the narrower
  # `String.t() | nil` / `map()` types were contradicted by those defaults.
  @type t :: %__MODULE__{
          content: String.t() | nil,
          chunk_type: atom() | String.t() | nil,
          embedding: [float()] | nil,
          metadata: map() | nil
        }

  defstruct content: nil,
            chunk_type: :unknown,
            embedding: nil,
            metadata: nil
end
|
||||
31
packages/elixir/lib/kreuzberg/chunk_metadata.ex
generated
Normal file
31
packages/elixir/lib/kreuzberg/chunk_metadata.ex
generated
Normal file
@@ -0,0 +1,31 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.ChunkMetadata do
  @moduledoc "Metadata about a chunk's position in the original document."

  # Counters and offsets default to 0, optional fields to nil, and
  # `image_indices` to the empty list.
  defstruct byte_start: 0,
            byte_end: 0,
            token_count: nil,
            chunk_index: 0,
            total_chunks: 0,
            first_page: nil,
            last_page: nil,
            heading_context: nil,
            image_indices: []

  @typedoc "Metadata about a chunk's position in the original document."
  @type t :: %__MODULE__{
          byte_start: non_neg_integer(),
          byte_end: non_neg_integer(),
          token_count: non_neg_integer() | nil,
          chunk_index: non_neg_integer(),
          total_chunks: non_neg_integer(),
          first_page: non_neg_integer() | nil,
          last_page: non_neg_integer() | nil,
          heading_context: map() | nil,
          image_indices: [non_neg_integer()]
        }
end
|
||||
25
packages/elixir/lib/kreuzberg/chunk_sizing.ex
generated
Normal file
25
packages/elixir/lib/kreuzberg/chunk_sizing.ex
generated
Normal file
@@ -0,0 +1,25 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.ChunkSizing do
  @moduledoc """
  How chunk size is measured.

  Defaults to `Characters` (Unicode character count). When using token-based sizing,
  chunks are sized by token count according to the specified tokenizer.

  Token-based sizing uses HuggingFace tokenizers loaded at runtime. Any tokenizer
  available on HuggingFace Hub can be used, including OpenAI-compatible tokenizers
  (e.g., `Xenova/gpt-4o`, `Xenova/cl100k_base`).
  """

  @typedoc "How chunk size is measured."
  @type t :: term()

  @typedoc """
  Size measured in Unicode characters (default).
  """
  @type characters :: :characters

  @typedoc """
  Size measured in tokens from a HuggingFace tokenizer.
  """
  @type tokenizer :: %{
          type: :tokenizer,
          model: String.t(),
          cache_dir: String.t()
        }
end
|
||||
84
packages/elixir/lib/kreuzberg/chunk_type.ex
generated
Normal file
84
packages/elixir/lib/kreuzberg/chunk_type.ex
generated
Normal file
@@ -0,0 +1,84 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.ChunkType do
  @moduledoc """
  Semantic structural classification of a text chunk.

  Assigned by the heuristic classifier in `chunking::classifier`.
  Defaults to `Unknown` when no rule matches.
  Designed to be extended in future versions without breaking changes.
  """

  @typedoc "Semantic structural classification of a text chunk."
  @type t ::
          :heading
          | :party_list
          | :definitions
          | :operative_clause
          | :signature_block
          | :schedule
          | :table_like
          | :formula
          | :code_block
          | :image
          | :org_chart
          | :diagram
          | :unknown

  # Each accessor returns its variant atom directly.

  @doc "Section heading or document title."
  @spec heading() :: t()
  def heading, do: :heading

  @doc "Party list: names, addresses, and signatories."
  @spec party_list() :: t()
  def party_list, do: :party_list

  @doc "Definition clause (\"X means…\", \"X shall mean…\")."
  @spec definitions() :: t()
  def definitions, do: :definitions

  @doc "Operative clause containing legal/contractual action verbs."
  @spec operative_clause() :: t()
  def operative_clause, do: :operative_clause

  @doc "Signature block with signatures, names, and dates."
  @spec signature_block() :: t()
  def signature_block, do: :signature_block

  @doc "Schedule, annex, appendix, or exhibit section."
  @spec schedule() :: t()
  def schedule, do: :schedule

  @doc "Table-like content with aligned columns or repeated patterns."
  @spec table_like() :: t()
  def table_like, do: :table_like

  @doc "Mathematical formula or equation."
  @spec formula() :: t()
  def formula, do: :formula

  @doc "Code block or preformatted content."
  @spec code_block() :: t()
  def code_block, do: :code_block

  @doc "Embedded or referenced image content."
  @spec image() :: t()
  def image, do: :image

  @doc "Organizational chart or hierarchy diagram."
  @spec org_chart() :: t()
  def org_chart, do: :org_chart

  @doc "Diagram, figure, or visual illustration."
  @spec diagram() :: t()
  def diagram, do: :diagram

  @doc "Unclassified or mixed content."
  @spec unknown() :: t()
  def unknown, do: :unknown
end
|
||||
40
packages/elixir/lib/kreuzberg/chunker_type.ex
generated
Normal file
40
packages/elixir/lib/kreuzberg/chunker_type.ex
generated
Normal file
@@ -0,0 +1,40 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.ChunkerType do
  @moduledoc """
  Type of text chunker to use.

  ## Variants

  * `Text` - Generic text splitter, splits on whitespace and punctuation
  * `Markdown` - Markdown-aware splitter, preserves formatting and structure
  * `Yaml` - YAML-aware splitter, creates one chunk per top-level key
  * `Semantic` - Topic-aware chunker. With an `EmbeddingConfig`, splits at
    embedding-based topic shifts tuned by `topic_threshold` (default 0.75,
    lower = more splits). Without an embedding, falls back to a
    structural-boundary heuristic (ALL-CAPS headers, numbered sections,
    blank-line paragraphs) and merges groups into chunks capped at
    `max_characters` (default 1000). `topic_threshold` has no effect in the
    fallback path. For best results, pair with an embedding model.
  """

  @typedoc "Type of text chunker to use."
  @type t :: :text | :markdown | :yaml | :semantic

  # Each accessor returns its variant atom directly.

  @spec text() :: t()
  def text, do: :text

  @spec markdown() :: t()
  def markdown, do: :markdown

  @spec yaml() :: t()
  def yaml, do: :yaml

  @spec semantic() :: t()
  def semantic, do: :semantic
end
|
||||
55
packages/elixir/lib/kreuzberg/chunking_config.ex
generated
Normal file
55
packages/elixir/lib/kreuzberg/chunking_config.ex
generated
Normal file
@@ -0,0 +1,55 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.ChunkingConfig do
  @moduledoc """
  Chunking configuration.

  Configures text chunking for document content, including chunk size,
  overlap, trimming behavior, and optional embeddings.

  Set only the fields you need; every other field keeps its struct default,
  which leaves room for future field additions:

      %Kreuzberg.ChunkingConfig{max_characters: 500}

  When encoded with `Jason`, fields whose value is `nil` are omitted.
  """

  # `chunker_type` and `sizing` default to atoms, so the type must admit atoms
  # as well as the strings it previously declared.
  # NOTE(review): the full set of valid `sizing` values is not visible here — confirm.
  @typedoc "Chunking configuration."
  @type t :: %__MODULE__{
          max_characters: non_neg_integer(),
          overlap: non_neg_integer(),
          trim: boolean(),
          chunker_type: Kreuzberg.ChunkerType.t() | String.t() | nil,
          embedding: map() | nil,
          preset: String.t() | nil,
          sizing: atom() | String.t() | nil,
          prepend_heading_context: boolean(),
          topic_threshold: float() | nil
        }

  defstruct max_characters: 1_000,
            overlap: 200,
            trim: true,
            chunker_type: :text,
            embedding: nil,
            preset: nil,
            sizing: :characters,
            prepend_heading_context: false,
            topic_threshold: nil

  defimpl Jason.Encoder do
    @doc false
    def encode(value, opts) do
      # Drop nil fields so unset options are absent from the JSON object.
      value
      |> Map.from_struct()
      |> Enum.reject(fn {_k, v} -> v == nil end)
      |> Enum.into(%{})
      |> Jason.Encoder.encode(opts)
    end
  end
end
|
||||
36
packages/elixir/lib/kreuzberg/citation_metadata.ex
generated
Normal file
36
packages/elixir/lib/kreuzberg/citation_metadata.ex
generated
Normal file
@@ -0,0 +1,36 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.CitationMetadata do
  @moduledoc """
  Citation file metadata (RIS, PubMed, EndNote).

  JSON encoding through `Jason` leaves out every field that is `nil`.
  """

  @typedoc "Citation file metadata (RIS, PubMed, EndNote)."
  @type t :: %__MODULE__{
          citation_count: non_neg_integer(),
          format: String.t() | nil,
          authors: [String.t()],
          year_range: map() | nil,
          dois: [String.t()],
          keywords: [String.t()]
        }

  defstruct [
    citation_count: 0,
    format: nil,
    authors: [],
    year_range: nil,
    dois: [],
    keywords: []
  ]

  defimpl Jason.Encoder do
    @doc false
    def encode(data, opts) do
      # Keep only the fields that carry a value.
      present =
        for {field, val} <- Map.from_struct(data), not is_nil(val), into: %{} do
          {field, val}
        end

      Jason.Encoder.encode(present, opts)
    end
  end
end
|
||||
30
packages/elixir/lib/kreuzberg/code_content_mode.ex
generated
Normal file
30
packages/elixir/lib/kreuzberg/code_content_mode.ex
generated
Normal file
@@ -0,0 +1,30 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.CodeContentMode do
  @moduledoc """
  Content rendering mode for code extraction.

  Selects how extracted code content is represented in the `content` field
  of `ExtractionResult`. Each mode is an atom returned by the matching
  zero-arity function.
  """

  @typedoc "Content rendering mode for code extraction."
  @type t :: :chunks | :raw | :structure

  @doc "Use TSLP semantic chunks as content (default)."
  @spec chunks() :: t()
  def chunks do
    :chunks
  end

  @doc "Use raw source code as content."
  @spec raw() :: t()
  def raw do
    :raw
  end

  @doc "Emit function/class headings + docstrings (no code bodies)."
  @spec structure() :: t()
  def structure do
    :structure
  end
end
|
||||
42
packages/elixir/lib/kreuzberg/content_filter_config.ex
generated
Normal file
42
packages/elixir/lib/kreuzberg/content_filter_config.ex
generated
Normal file
@@ -0,0 +1,42 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.ContentFilterConfig do
  @moduledoc """
  Cross-extractor content filtering configuration.

  Controls whether "furniture" content (headers, footers, page numbers,
  watermarks, repeating text) is included in or stripped from extraction
  results. Applies across all extractors (PDF, DOCX, RTF, ODT, HTML, etc.)
  with format-specific implementation.

  When this configuration is not set on `ExtractionConfig`, each extractor
  uses its current default behavior unchanged.
  """

  @typedoc "Cross-extractor content filtering configuration."
  @type t :: %__MODULE__{
          include_headers: boolean(),
          include_footers: boolean(),
          strip_repeating_text: boolean(),
          include_watermarks: boolean()
        }

  defstruct [
    include_headers: false,
    include_footers: false,
    strip_repeating_text: true,
    include_watermarks: false
  ]

  defimpl Jason.Encoder do
    @doc false
    def encode(data, opts) do
      # Keep only the fields that carry a value.
      present =
        for {field, val} <- Map.from_struct(data), not is_nil(val), into: %{} do
          {field, val}
        end

      Jason.Encoder.encode(present, opts)
    end
  end
end
|
||||
33
packages/elixir/lib/kreuzberg/content_layer.ex
generated
Normal file
33
packages/elixir/lib/kreuzberg/content_layer.ex
generated
Normal file
@@ -0,0 +1,33 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.ContentLayer do
  @moduledoc """
  Content layer classification for document nodes.

  Replaces separate body/furniture arrays with per-node granularity. Each
  layer is an atom returned by the matching zero-arity function.
  """

  @typedoc "Content layer classification for document nodes."
  @type t :: :body | :header | :footer | :footnote

  @doc "Main document body content."
  @spec body() :: t()
  def body do
    :body
  end

  @doc "Page/section header (running header)."
  @spec header() :: t()
  def header do
    :header
  end

  @doc "Page/section footer (running footer)."
  @spec footer() :: t()
  def footer do
    :footer
  end

  @doc "Footnote content."
  @spec footnote() :: t()
  def footnote do
    :footnote
  end
end
|
||||
17
packages/elixir/lib/kreuzberg/contributor_role.ex
generated
Normal file
17
packages/elixir/lib/kreuzberg/contributor_role.ex
generated
Normal file
@@ -0,0 +1,17 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.ContributorRole do
  @moduledoc """
  JATS contributor with role.

  Both fields default to `nil`.
  """

  @typedoc "JATS contributor with role."
  @type t :: %__MODULE__{
          name: String.t() | nil,
          role: String.t() | nil
        }

  defstruct [:name, :role]
end
|
||||
59
packages/elixir/lib/kreuzberg/core_properties.ex
generated
Normal file
59
packages/elixir/lib/kreuzberg/core_properties.ex
generated
Normal file
@@ -0,0 +1,59 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.CoreProperties do
  @moduledoc """
  Dublin Core metadata from docProps/core.xml

  Contains standard metadata fields defined by the Dublin Core standard
  and Office-specific extensions. Every field defaults to `nil`, and `nil`
  fields are left out when the struct is encoded with `Jason`.
  """

  @typedoc "Dublin Core metadata from docProps/core.xml"
  @type t :: %__MODULE__{
          title: String.t() | nil,
          subject: String.t() | nil,
          creator: String.t() | nil,
          keywords: String.t() | nil,
          description: String.t() | nil,
          last_modified_by: String.t() | nil,
          revision: String.t() | nil,
          created: String.t() | nil,
          modified: String.t() | nil,
          category: String.t() | nil,
          content_status: String.t() | nil,
          language: String.t() | nil,
          identifier: String.t() | nil,
          version: String.t() | nil,
          last_printed: String.t() | nil
        }

  defstruct [
    :title,
    :subject,
    :creator,
    :keywords,
    :description,
    :last_modified_by,
    :revision,
    :created,
    :modified,
    :category,
    :content_status,
    :language,
    :identifier,
    :version,
    :last_printed
  ]

  defimpl Jason.Encoder do
    @doc false
    def encode(data, opts) do
      # Keep only the fields that carry a value.
      present =
        for {field, val} <- Map.from_struct(data), not is_nil(val), into: %{} do
          {field, val}
        end

      Jason.Encoder.encode(present, opts)
    end
  end
end
|
||||
34
packages/elixir/lib/kreuzberg/csv_metadata.ex
generated
Normal file
34
packages/elixir/lib/kreuzberg/csv_metadata.ex
generated
Normal file
@@ -0,0 +1,34 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.CsvMetadata do
  @moduledoc """
  CSV/TSV file metadata.

  JSON encoding through `Jason` leaves out every field that is `nil`.
  """

  @typedoc "CSV/TSV file metadata."
  @type t :: %__MODULE__{
          row_count: non_neg_integer(),
          column_count: non_neg_integer(),
          delimiter: String.t() | nil,
          has_header: boolean(),
          column_types: [String.t()] | nil
        }

  defstruct [
    row_count: 0,
    column_count: 0,
    delimiter: nil,
    has_header: false,
    column_types: nil
  ]

  defimpl Jason.Encoder do
    @doc false
    def encode(data, opts) do
      # Keep only the fields that carry a value.
      present =
        for {field, val} <- Map.from_struct(data), not is_nil(val), into: %{} do
          {field, val}
        end

      Jason.Encoder.encode(present, opts)
    end
  end
end
|
||||
17
packages/elixir/lib/kreuzberg/dbf_field_info.ex
generated
Normal file
17
packages/elixir/lib/kreuzberg/dbf_field_info.ex
generated
Normal file
@@ -0,0 +1,17 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.DbfFieldInfo do
  @moduledoc """
  dBASE field information.

  Both fields default to `nil`.
  """

  @typedoc "dBASE field information."
  @type t :: %__MODULE__{
          name: String.t() | nil,
          field_type: String.t() | nil
        }

  defstruct [:name, :field_type]
end
|
||||
30
packages/elixir/lib/kreuzberg/dbf_metadata.ex
generated
Normal file
30
packages/elixir/lib/kreuzberg/dbf_metadata.ex
generated
Normal file
@@ -0,0 +1,30 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.DbfMetadata do
  @moduledoc """
  dBASE (DBF) file metadata.

  JSON encoding through `Jason` leaves out every field that is `nil`.
  """

  @typedoc "dBASE (DBF) file metadata."
  @type t :: %__MODULE__{
          record_count: non_neg_integer(),
          field_count: non_neg_integer(),
          fields: [map()]
        }

  defstruct [record_count: 0, field_count: 0, fields: []]

  defimpl Jason.Encoder do
    @doc false
    def encode(data, opts) do
      # Keep only the fields that carry a value.
      present =
        for {field, val} <- Map.from_struct(data), not is_nil(val), into: %{} do
          {field, val}
        end

      Jason.Encoder.encode(present, opts)
    end
  end
end
|
||||
17
packages/elixir/lib/kreuzberg/detect_response.ex
generated
Normal file
17
packages/elixir/lib/kreuzberg/detect_response.ex
generated
Normal file
@@ -0,0 +1,17 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.DetectResponse do
  @moduledoc """
  MIME type detection response.

  Both fields default to `nil`.
  """

  @typedoc "MIME type detection response."
  @type t :: %__MODULE__{
          mime_type: String.t() | nil,
          filename: String.t() | nil
        }

  defstruct [:mime_type, :filename]
end
|
||||
19
packages/elixir/lib/kreuzberg/detection_result.ex
generated
Normal file
19
packages/elixir/lib/kreuzberg/detection_result.ex
generated
Normal file
@@ -0,0 +1,19 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.DetectionResult do
  @moduledoc """
  Page-level detection result containing all detections and page metadata.

  A fresh struct has zero page dimensions and an empty `detections` list.
  """

  @typedoc "Page-level detection result containing all detections and page metadata."
  @type t :: %__MODULE__{
          page_width: non_neg_integer(),
          page_height: non_neg_integer(),
          detections: [map()]
        }

  defstruct [page_width: 0, page_height: 0, detections: []]
end
|
||||
23
packages/elixir/lib/kreuzberg/diff_hunk.ex
generated
Normal file
23
packages/elixir/lib/kreuzberg/diff_hunk.ex
generated
Normal file
@@ -0,0 +1,23 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.DiffHunk do
  @moduledoc """
  A single contiguous hunk in a unified diff.

  A fresh struct has all line numbers and counts at zero and no lines.
  """

  # NOTE(review): `lines` is typed as nil-able strings although a
  # `Kreuzberg.DiffLine` type exists — confirm the actual element shape.
  @typedoc "A single contiguous hunk in a unified diff."
  @type t :: %__MODULE__{
          from_line: non_neg_integer(),
          from_count: non_neg_integer(),
          to_line: non_neg_integer(),
          to_count: non_neg_integer(),
          lines: [String.t() | nil]
        }

  defstruct [
    from_line: 0,
    from_count: 0,
    to_line: 0,
    to_count: 0,
    lines: []
  ]
end
|
||||
24
packages/elixir/lib/kreuzberg/diff_line.ex
generated
Normal file
24
packages/elixir/lib/kreuzberg/diff_line.ex
generated
Normal file
@@ -0,0 +1,24 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.DiffLine do
  @moduledoc """
  A single line in a unified-diff hunk.

  Defined here (rather than only in `crate::diff`) so `RevisionDelta` can
  reference it unconditionally, without requiring the `diff` Cargo feature.
  `crate::diff` re-exports this type verbatim.

  This module defines types only; it has no functions.
  """

  # `t()` is the union of the three variant shapes declared below rather than
  # `term()`, so Dialyzer and readers can see which shapes are expected.
  # NOTE(review): assumes values crossing the NIF boundary use exactly these
  # atom-keyed maps — confirm against the native decoder.
  @typedoc "A single line in a unified-diff hunk."
  @type t :: context() | added() | removed()

  @typedoc "Unchanged context line."
  @type context :: %{type: :context, value: String.t()}

  @typedoc "Line added in the \"after\" version."
  @type added :: %{type: :added, value: String.t()}

  @typedoc "Line removed from the \"before\" version."
  @type removed :: %{type: :removed, value: String.t()}
end
|
||||
30
packages/elixir/lib/kreuzberg/diff_options.ex
generated
Normal file
30
packages/elixir/lib/kreuzberg/diff_options.ex
generated
Normal file
@@ -0,0 +1,30 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.DiffOptions do
  @moduledoc """
  Options controlling how two `ExtractionResult` values are compared.

  JSON encoding through `Jason` leaves out every field that is `nil`.
  """

  @typedoc "Options controlling how two `ExtractionResult` values are compared."
  @type t :: %__MODULE__{
          include_metadata: boolean(),
          include_embedded: boolean(),
          max_content_chars: non_neg_integer() | nil
        }

  defstruct [include_metadata: true, include_embedded: true, max_content_chars: nil]

  defimpl Jason.Encoder do
    @doc false
    def encode(data, opts) do
      # Keep only the fields that carry a value.
      present =
        for {field, val} <- Map.from_struct(data), not is_nil(val), into: %{} do
          {field, val}
        end

      Jason.Encoder.encode(present, opts)
    end
  end
end
|
||||
41
packages/elixir/lib/kreuzberg/djot_content.ex
generated
Normal file
41
packages/elixir/lib/kreuzberg/djot_content.ex
generated
Normal file
@@ -0,0 +1,41 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.DjotContent do
  @moduledoc """
  Comprehensive Djot document structure with semantic preservation.

  This type captures the full richness of Djot markup, including:
  - Block-level structures (headings, lists, blockquotes, code blocks, etc.)
  - Inline formatting (emphasis, strong, highlight, subscript, superscript, etc.)
  - Attributes (classes, IDs, key-value pairs)
  - Links, images, footnotes
  - Math expressions (inline and display)
  - Tables with full structure

  Available when the `djot` feature is enabled.
  """

  # `metadata` defaults to `nil` in the struct, so the type admits `nil`.
  @typedoc "Comprehensive Djot document structure with semantic preservation."
  @type t :: %__MODULE__{
          plain_text: String.t() | nil,
          blocks: [map()],
          metadata: map() | nil,
          tables: [map()],
          images: [map()],
          links: [map()],
          footnotes: [map()],
          attributes: [String.t()]
        }

  defstruct plain_text: nil,
            blocks: [],
            metadata: nil,
            tables: [],
            images: [],
            links: [],
            footnotes: [],
            attributes: []
end
|
||||
21
packages/elixir/lib/kreuzberg/djot_image.ex
generated
Normal file
21
packages/elixir/lib/kreuzberg/djot_image.ex
generated
Normal file
@@ -0,0 +1,21 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.DjotImage do
  @moduledoc """
  Image element in Djot.

  All fields default to `nil`.
  """

  @typedoc "Image element in Djot."
  @type t :: %__MODULE__{
          src: String.t() | nil,
          alt: String.t() | nil,
          title: String.t() | nil,
          attributes: String.t() | nil
        }

  defstruct [:src, :alt, :title, :attributes]
end
|
||||
21
packages/elixir/lib/kreuzberg/djot_link.ex
generated
Normal file
21
packages/elixir/lib/kreuzberg/djot_link.ex
generated
Normal file
@@ -0,0 +1,21 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.DjotLink do
  @moduledoc """
  Link element in Djot.

  All fields default to `nil`.
  """

  @typedoc "Link element in Djot."
  @type t :: %__MODULE__{
          url: String.t() | nil,
          text: String.t() | nil,
          title: String.t() | nil,
          attributes: String.t() | nil
        }

  defstruct [:url, :text, :title, :attributes]
end
|
||||
74
packages/elixir/lib/kreuzberg/document_extractor_bridge.ex
generated
Normal file
74
packages/elixir/lib/kreuzberg/document_extractor_bridge.ex
generated
Normal file
@@ -0,0 +1,74 @@
|
||||
defmodule KreuzbergDocumentExtractorBridge do
  @moduledoc """
  GenServer bridge for DocumentExtractor implementation in kreuzberg.

  Handles incoming trait method calls from Rust and dispatches them to an implementation module.

  The server state is the implementation module itself. Every
  `{:trait_call, method, args_json, reply_id}` message is answered through
  `Kreuzberg.Native.complete_trait_call/2` on success, or through
  `Kreuzberg.Native.fail_trait_call/2` when an exception is raised while
  decoding, dispatching or encoding.
  """

  use GenServer

  require Logger

  @doc """
  Start a GenServer linked to the current process.

  impl_module should be a module that implements the DocumentExtractor trait methods.

  The process is registered under this module's name, so a second call while
  the first server is still running returns `{:error, {:already_started, pid}}`.
  """
  def start_link(impl_module) do
    GenServer.start_link(__MODULE__, impl_module, name: __MODULE__)
  end

  @impl GenServer
  def init(impl_module) do
    {:ok, impl_module}
  end

  @doc """
  Handle an incoming trait call message.

  Message format: {:trait_call, method_atom, args_json, reply_id}

  Any other message is logged and ignored so that a stray message cannot
  crash the bridge.
  """
  @impl GenServer
  def handle_info({:trait_call, method, args_json, reply_id}, impl_module) do
    try do
      args = Jason.decode!(args_json)
      method_name = to_string(method)
      ordered_args = ordered_args(impl_module, method_name, args)

      # Dispatch to the implementation module. `String.to_existing_atom/1`
      # never creates a new atom from data received over the bridge.
      result = apply(impl_module, String.to_existing_atom(method_name), ordered_args)

      # Send result back to Rust
      Kreuzberg.Native.complete_trait_call(reply_id, Jason.encode!(result))
    rescue
      e ->
        # Interpolate with `#{}`; plain `{...}` would be logged literally.
        Logger.error("Error calling #{inspect(impl_module)}.#{method}: #{Exception.message(e)}")
        Kreuzberg.Native.fail_trait_call(reply_id, Exception.message(e))
    end

    {:noreply, impl_module}
  end

  def handle_info(message, impl_module) do
    # Defining handle_info/2 replaces the default injected by `use GenServer`,
    # so without this clause an unexpected message raises FunctionClauseError.
    Logger.warning("#{inspect(__MODULE__)} ignoring unexpected message: #{inspect(message)}")
    {:noreply, impl_module}
  end

  # Turns decoded JSON arguments into the positional list passed to `apply/3`.
  # For a map, the implementation module's `__alef_arg_order__/1` decides the
  # order when it is exported; otherwise values are ordered by sorted key.
  defp ordered_args(impl_module, method_name, args) when is_map(args) do
    if function_exported?(impl_module, :__alef_arg_order__, 1) do
      impl_module.__alef_arg_order__(method_name)
      |> Enum.map(&Map.fetch!(args, &1))
    else
      args
      |> Map.keys()
      |> Enum.sort()
      |> Enum.map(&Map.fetch!(args, &1))
    end
  end

  # A JSON array is already positional.
  defp ordered_args(_impl_module, _method_name, args) when is_list(args), do: args

  @doc """
  Register an implementation module, starting a GenServer to handle trait calls.

  Calls `impl_module.name()` for the plugin name, starts the bridge and hands
  its pid to `Kreuzberg.Native.register_document_extractor/2`. Raises a
  `MatchError` if the bridge cannot be started, for example when one is
  already running under this module's name.
  """
  def register(impl_module) do
    plugin_name = impl_module.name()
    {:ok, pid} = start_link(impl_module)
    Kreuzberg.Native.register_document_extractor(pid, plugin_name)
  end
end
|
||||
38
packages/elixir/lib/kreuzberg/document_node.ex
generated
Normal file
38
packages/elixir/lib/kreuzberg/document_node.ex
generated
Normal file
@@ -0,0 +1,38 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.DocumentNode do
  @moduledoc """
  A single node in the document tree.

  Each node has deterministic `id`, typed `content`, optional `parent`/`children`
  for tree structure, and metadata like page number, bounding box, and content layer.
  """

  # `content` and `content_layer` default to atoms, so the type must admit
  # atoms as well as the strings it previously declared.
  # NOTE(review): the full set of valid `content` values is not visible here — confirm.
  @typedoc "A single node in the document tree."
  @type t :: %__MODULE__{
          id: String.t() | nil,
          content: atom() | String.t() | nil,
          parent: non_neg_integer() | nil,
          children: [non_neg_integer()],
          content_layer: Kreuzberg.ContentLayer.t() | String.t() | nil,
          page: non_neg_integer() | nil,
          page_end: non_neg_integer() | nil,
          bbox: map() | nil,
          annotations: [map()],
          attributes: map() | nil
        }

  defstruct id: nil,
            content: :title,
            parent: nil,
            children: [],
            content_layer: :body,
            page: nil,
            page_end: nil,
            bbox: nil,
            annotations: [],
            attributes: nil
end
|
||||
19
packages/elixir/lib/kreuzberg/document_relationship.ex
generated
Normal file
19
packages/elixir/lib/kreuzberg/document_relationship.ex
generated
Normal file
@@ -0,0 +1,19 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.DocumentRelationship do
  @moduledoc "A resolved relationship between two nodes in the document tree."

  # `kind` defaults to an atom, so the type must admit atoms as well as the
  # strings it previously declared.
  # NOTE(review): the full set of valid `kind` values is not visible here — confirm.
  @typedoc "A resolved relationship between two nodes in the document tree."
  @type t :: %__MODULE__{
          source: non_neg_integer(),
          target: non_neg_integer(),
          kind: atom() | String.t() | nil
        }

  defstruct source: 0,
            target: 0,
            kind: :footnote_reference
end
|
||||
32
packages/elixir/lib/kreuzberg/document_revision.ex
generated
Normal file
32
packages/elixir/lib/kreuzberg/document_revision.ex
generated
Normal file
@@ -0,0 +1,32 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.DocumentRevision do
  @moduledoc """
  A single tracked change embedded in a document.

  Populated by per-format extractors that understand change-tracking metadata
  (DOCX `w:ins`/`w:del`/`w:rPrChange`, ODT `text:change-*`, …). Every
  extractor defaults to `ExtractionResult.revisions = None` until a
  format-specific implementation is added.
  """

  # The type follows the struct defaults: `kind` defaults to an atom, `delta`
  # defaults to `nil`, and the duplicated `| nil` on `anchor` is removed.
  # NOTE(review): the full set of valid `kind` values is not visible here — confirm.
  @typedoc "A single tracked change embedded in a document."
  @type t :: %__MODULE__{
          revision_id: String.t() | nil,
          author: String.t() | nil,
          timestamp: String.t() | nil,
          kind: atom() | String.t() | nil,
          anchor: String.t() | nil,
          delta: map() | nil
        }

  defstruct revision_id: nil,
            author: nil,
            timestamp: nil,
            kind: :insertion,
            anchor: nil,
            delta: nil
end
|
||||
43
packages/elixir/lib/kreuzberg/document_structure.ex
generated
Normal file
43
packages/elixir/lib/kreuzberg/document_structure.ex
generated
Normal file
@@ -0,0 +1,43 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.DocumentStructure do
  @moduledoc """
  Top-level structured document representation.

  A flat array of nodes with index-based parent/child references forming a tree.
  Root-level nodes have `parent: None`. Use `body_roots()` and `furniture_roots()`
  to iterate over top-level content by layer.

  ## Validation

  Call `validate()` after construction to verify all node indices are in bounds
  and parent-child relationships are bidirectionally consistent.
  """

  @typedoc "Top-level structured document representation."
  @type t :: %__MODULE__{
          nodes: [map()],
          source_format: String.t() | nil,
          relationships: [map()],
          node_types: [String.t()]
        }

  defstruct [nodes: [], source_format: nil, relationships: [], node_types: []]

  defimpl Jason.Encoder do
    @doc false
    def encode(data, opts) do
      # Keep only the fields that carry a value.
      present =
        for {field, val} <- Map.from_struct(data), not is_nil(val), into: %{} do
          {field, val}
        end

      Jason.Encoder.encode(present, opts)
    end
  end
end
|
||||
60
packages/elixir/lib/kreuzberg/docx_app_properties.ex
generated
Normal file
60
packages/elixir/lib/kreuzberg/docx_app_properties.ex
generated
Normal file
@@ -0,0 +1,60 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.DocxAppProperties do
  @moduledoc """
  Application properties from docProps/app.xml for DOCX

  Contains Word-specific document statistics and metadata. Every field
  defaults to `nil`, and `nil` fields are left out when the struct is encoded
  with `Jason`.
  """

  @typedoc "Application properties from docProps/app.xml for DOCX"
  @type t :: %__MODULE__{
          application: String.t() | nil,
          app_version: String.t() | nil,
          template: String.t() | nil,
          total_time: integer() | nil,
          pages: integer() | nil,
          words: integer() | nil,
          characters: integer() | nil,
          characters_with_spaces: integer() | nil,
          lines: integer() | nil,
          paragraphs: integer() | nil,
          company: String.t() | nil,
          doc_security: integer() | nil,
          scale_crop: boolean() | nil,
          links_up_to_date: boolean() | nil,
          shared_doc: boolean() | nil,
          hyperlinks_changed: boolean() | nil
        }

  defstruct [
    :application,
    :app_version,
    :template,
    :total_time,
    :pages,
    :words,
    :characters,
    :characters_with_spaces,
    :lines,
    :paragraphs,
    :company,
    :doc_security,
    :scale_crop,
    :links_up_to_date,
    :shared_doc,
    :hyperlinks_changed
  ]

  defimpl Jason.Encoder do
    @doc false
    def encode(data, opts) do
      # Keep only the fields that carry a value.
      present =
        for {field, val} <- Map.from_struct(data), not is_nil(val), into: %{} do
          {field, val}
        end

      Jason.Encoder.encode(present, opts)
    end
  end
end
|
||||
35
packages/elixir/lib/kreuzberg/docx_metadata.ex
generated
Normal file
35
packages/elixir/lib/kreuzberg/docx_metadata.ex
generated
Normal file
@@ -0,0 +1,35 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.DocxMetadata do
  @moduledoc """
  Word document metadata.

  Extracted from DOCX files using shared Office Open XML metadata extraction.
  Integrates with `office_metadata` module for core/app/custom properties.
  """

  @typedoc "Word document metadata."
  @type t :: %__MODULE__{
          core_properties: map() | nil,
          app_properties: map() | nil,
          custom_properties: map() | nil
        }

  # All three property groups are optional and default to nil.
  defstruct [:core_properties, :app_properties, :custom_properties]

  defimpl Jason.Encoder do
    @doc false
    def encode(value, opts) do
      # Drop nil property groups before delegating to the map encoder.
      present =
        for {key, field} <- Map.from_struct(value), not is_nil(field), into: %{} do
          {key, field}
        end

      Jason.Encoder.encode(present, opts)
    end
  end
end
|
||||
26
packages/elixir/lib/kreuzberg/element.ex
generated
Normal file
26
packages/elixir/lib/kreuzberg/element.ex
generated
Normal file
@@ -0,0 +1,26 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.Element do
  @moduledoc """
  Semantic element extracted from document.

  Represents a logical unit of content with semantic classification,
  unique identifier, and metadata for tracking origin and position.
  """

  @typedoc """
  Semantic element extracted from document.

  `element_type` defaults to the atom `:title`, so the type admits the
  `Kreuzberg.ElementType` atoms in addition to strings. `metadata` defaults
  to `nil` and is therefore nullable.
  """
  @type t :: %__MODULE__{
          element_id: String.t() | nil,
          element_type: Kreuzberg.ElementType.t() | String.t() | nil,
          text: String.t() | nil,
          metadata: map() | nil
        }

  defstruct element_id: nil,
            element_type: :title,
            text: nil,
            metadata: nil
end
|
||||
23
packages/elixir/lib/kreuzberg/element_metadata.ex
generated
Normal file
23
packages/elixir/lib/kreuzberg/element_metadata.ex
generated
Normal file
@@ -0,0 +1,23 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.ElementMetadata do
  @moduledoc "Metadata for a semantic element."

  @typedoc "Metadata for a semantic element."
  @type t :: %__MODULE__{
          page_number: non_neg_integer() | nil,
          filename: String.t() | nil,
          coordinates: map() | nil,
          element_index: non_neg_integer() | nil,
          additional: map()
        }

  # Positional fields start out unset; `additional` starts as an empty map.
  defstruct [:page_number, :filename, :coordinates, :element_index, additional: %{}]
end
|
||||
73
packages/elixir/lib/kreuzberg/element_type.ex
generated
Normal file
73
packages/elixir/lib/kreuzberg/element_type.ex
generated
Normal file
@@ -0,0 +1,73 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.ElementType do
  @moduledoc """
  Semantic element type classification.

  Categorizes text content into semantic units for downstream processing.
  Supports the element types commonly found in Unstructured documents.
  """

  @typedoc "Semantic element type classification."
  @type t ::
          :title
          | :narrative_text
          | :heading
          | :list_item
          | :table
          | :image
          | :page_break
          | :code_block
          | :block_quote
          | :footer
          | :header

  # Each accessor returns the atom for one variant of `t()`.

  @doc "Document title"
  @spec title() :: t()
  def title, do: :title

  @doc "Main narrative text body"
  @spec narrative_text() :: t()
  def narrative_text, do: :narrative_text

  @doc "Section heading"
  @spec heading() :: t()
  def heading, do: :heading

  @doc "List item (bullet, numbered, etc.)"
  @spec list_item() :: t()
  def list_item, do: :list_item

  @doc "Table element"
  @spec table() :: t()
  def table, do: :table

  @doc "Image element"
  @spec image() :: t()
  def image, do: :image

  @doc "Page break marker"
  @spec page_break() :: t()
  def page_break, do: :page_break

  @doc "Code block"
  @spec code_block() :: t()
  def code_block, do: :code_block

  @doc "Block quote"
  @spec block_quote() :: t()
  def block_quote, do: :block_quote

  @doc "Footer text"
  @spec footer() :: t()
  def footer, do: :footer

  @doc "Header text"
  @spec header() :: t()
  def header, do: :header
end
|
||||
29
packages/elixir/lib/kreuzberg/email_attachment.ex
generated
Normal file
29
packages/elixir/lib/kreuzberg/email_attachment.ex
generated
Normal file
@@ -0,0 +1,29 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.EmailAttachment do
  @moduledoc """
  Email attachment representation.

  Contains metadata and optionally the content of an email attachment.
  """

  @typedoc "Email attachment representation."
  @type t :: %__MODULE__{
          name: String.t() | nil,
          filename: String.t() | nil,
          mime_type: String.t() | nil,
          size: non_neg_integer() | nil,
          is_image: boolean(),
          data: binary() | nil
        }

  # Everything is unset by default except `is_image`, which starts as false.
  defstruct [:name, :filename, :mime_type, :size, is_image: false, data: nil]
end
|
||||
26
packages/elixir/lib/kreuzberg/email_config.ex
generated
Normal file
26
packages/elixir/lib/kreuzberg/email_config.ex
generated
Normal file
@@ -0,0 +1,26 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.EmailConfig do
  @moduledoc "Configuration for email extraction."

  @typedoc "Configuration for email extraction."
  @type t :: %__MODULE__{
          msg_fallback_codepage: non_neg_integer() | nil
        }

  # The single option is unset (nil) by default.
  defstruct [:msg_fallback_codepage]

  defimpl Jason.Encoder do
    @doc false
    def encode(value, opts) do
      # An unset option is omitted from the JSON object rather than sent as null.
      present =
        for {key, field} <- Map.from_struct(value), not is_nil(field), into: %{} do
          {key, field}
        end

      Jason.Encoder.encode(present, opts)
    end
  end
end
|
||||
42
packages/elixir/lib/kreuzberg/email_extraction_result.ex
generated
Normal file
42
packages/elixir/lib/kreuzberg/email_extraction_result.ex
generated
Normal file
@@ -0,0 +1,42 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.EmailExtractionResult do
  @moduledoc """
  Email extraction result.

  Complete representation of an extracted email message (.eml or .msg)
  including headers, body content, and attachments.
  """

  @typedoc "Email extraction result."
  @type t :: %__MODULE__{
          subject: String.t() | nil,
          from_email: String.t() | nil,
          to_emails: [String.t()],
          cc_emails: [String.t()],
          bcc_emails: [String.t()],
          date: String.t() | nil,
          message_id: String.t() | nil,
          plain_text: String.t() | nil,
          html_content: String.t() | nil,
          content: String.t() | nil,
          attachments: [map()],
          metadata: map()
        }

  # Scalar fields default to nil, list fields to [], and `metadata` to an empty map.
  defstruct [
    subject: nil,
    from_email: nil,
    to_emails: [],
    cc_emails: [],
    bcc_emails: [],
    date: nil,
    message_id: nil,
    plain_text: nil,
    html_content: nil,
    content: nil,
    attachments: [],
    metadata: %{}
  ]
end
|
||||
42
packages/elixir/lib/kreuzberg/email_metadata.ex
generated
Normal file
42
packages/elixir/lib/kreuzberg/email_metadata.ex
generated
Normal file
@@ -0,0 +1,42 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.EmailMetadata do
  @moduledoc """
  Email metadata extracted from .eml and .msg files.

  Includes sender/recipient information, message ID, and attachment list.
  """

  @typedoc "Email metadata extracted from .eml and .msg files."
  @type t :: %__MODULE__{
          from_email: String.t() | nil,
          from_name: String.t() | nil,
          to_emails: [String.t()],
          cc_emails: [String.t()],
          bcc_emails: [String.t()],
          message_id: String.t() | nil,
          attachments: [String.t()]
        }

  # Address lists and the attachment list default to []; the rest default to nil.
  defstruct [
    from_email: nil,
    from_name: nil,
    to_emails: [],
    cc_emails: [],
    bcc_emails: [],
    message_id: nil,
    attachments: []
  ]

  defimpl Jason.Encoder do
    @doc false
    def encode(value, opts) do
      # nil fields are dropped; empty lists are kept because they are not nil.
      present =
        for {key, field} <- Map.from_struct(value), not is_nil(field), into: %{} do
          {key, field}
        end

      Jason.Encoder.encode(present, opts)
    end
  end
end
|
||||
19
packages/elixir/lib/kreuzberg/embedded_changes.ex
generated
Normal file
19
packages/elixir/lib/kreuzberg/embedded_changes.ex
generated
Normal file
@@ -0,0 +1,19 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.EmbeddedChanges do
  @moduledoc "Changes to embedded archive children between two results."

  @typedoc "Changes to embedded archive children between two results."
  @type t :: %__MODULE__{
          added: [map()],
          removed: [map()],
          changed: [map()]
        }

  # All three change lists start out empty.
  defstruct [added: [], removed: [], changed: []]
end
|
||||
17
packages/elixir/lib/kreuzberg/embedded_diff.ex
generated
Normal file
17
packages/elixir/lib/kreuzberg/embedded_diff.ex
generated
Normal file
@@ -0,0 +1,17 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.EmbeddedDiff do
  @moduledoc "Diff for a single embedded archive entry that appears in both results."

  @typedoc """
  Diff for a single embedded archive entry that appears in both results.

  `diff` defaults to `nil` in the struct, so it is typed as nullable.
  """
  @type t :: %__MODULE__{
          path: String.t() | nil,
          diff: map() | nil
        }

  defstruct path: nil,
            diff: nil
end
|
||||
21
packages/elixir/lib/kreuzberg/embedded_file.ex
generated
Normal file
21
packages/elixir/lib/kreuzberg/embedded_file.ex
generated
Normal file
@@ -0,0 +1,21 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.EmbeddedFile do
  @moduledoc "Embedded file descriptor extracted from the PDF name tree."

  @typedoc "Embedded file descriptor extracted from the PDF name tree."
  @type t :: %__MODULE__{
          name: String.t() | nil,
          data: binary(),
          compressed_size: non_neg_integer(),
          mime_type: String.t() | nil
        }

  # `data` defaults to an empty binary and `compressed_size` to 0; the names default to nil.
  defstruct [name: nil, data: <<>>, compressed_size: 0, mime_type: nil]
end
|
||||
74
packages/elixir/lib/kreuzberg/embedding_backend_bridge.ex
generated
Normal file
74
packages/elixir/lib/kreuzberg/embedding_backend_bridge.ex
generated
Normal file
@@ -0,0 +1,74 @@
|
||||
defmodule KreuzbergEmbeddingBackendBridge do
  @moduledoc """
  GenServer bridge for EmbeddingBackend implementation in kreuzberg.

  Handles incoming trait method calls from Rust and dispatches them to an implementation module.
  """

  use GenServer

  require Logger

  @doc """
  Start a GenServer linked to the current process.

  impl_module should be a module that implements the EmbeddingBackend trait methods.
  The server is registered under this module's name and keeps `impl_module` as its state.
  """
  def start_link(impl_module) do
    GenServer.start_link(__MODULE__, impl_module, name: __MODULE__)
  end

  @impl GenServer
  def init(impl_module) do
    {:ok, impl_module}
  end

  @doc """
  Handle an incoming trait call message.

  Message format: {:trait_call, method_atom, args_json, reply_id}

  The JSON arguments are decoded, put into call order, and applied to the
  implementation module. The JSON-encoded result is passed to
  `Kreuzberg.Native.complete_trait_call/2`. If any step raises, the error is
  logged and reported through `Kreuzberg.Native.fail_trait_call/2` instead.
  """
  @impl GenServer
  def handle_info({:trait_call, method, args_json, reply_id}, impl_module) do
    try do
      args = Jason.decode!(args_json)
      method_name = to_string(method)
      ordered_args = ordered_args(impl_module, method_name, args)

      # Dispatch to the implementation module. `to_existing_atom` avoids
      # creating new atoms from the incoming method name.
      result = apply(impl_module, String.to_existing_atom(method_name), ordered_args)

      # Send result back to Rust
      Kreuzberg.Native.complete_trait_call(reply_id, Jason.encode!(result))
    rescue
      e ->
        message = Exception.message(e)
        # Interpolate with `#{}` so the log names the actual module, method and error.
        Logger.error("Error calling #{inspect(impl_module)}.#{method}: #{message}")
        Kreuzberg.Native.fail_trait_call(reply_id, message)
    end

    {:noreply, impl_module}
  end

  # Turns decoded JSON arguments into a positional argument list.
  # For a map, the order comes from `impl_module.__alef_arg_order__/1` when the
  # module exports it; otherwise the values are taken in sorted-key order.
  defp ordered_args(impl_module, method_name, args) when is_map(args) do
    if function_exported?(impl_module, :__alef_arg_order__, 1) do
      impl_module.__alef_arg_order__(method_name)
      |> Enum.map(&Map.fetch!(args, &1))
    else
      args
      |> Map.keys()
      |> Enum.sort()
      |> Enum.map(&Map.fetch!(args, &1))
    end
  end

  # A JSON array is already positional and is used as-is.
  defp ordered_args(_impl_module, _method_name, args) when is_list(args), do: args

  @doc """
  Register an implementation module, starting a GenServer to handle trait calls.

  Calls `impl_module.name()` for the plugin name, starts the bridge, and hands
  the pid and name to `Kreuzberg.Native.register_embedding_backend/2`. Raises a
  `MatchError` if `start_link/1` does not return `{:ok, pid}`.
  """
  def register(impl_module) do
    plugin_name = impl_module.name()
    {:ok, pid} = start_link(impl_module)
    Kreuzberg.Native.register_embedding_backend(pid, plugin_name)
  end
end
|
||||
43
packages/elixir/lib/kreuzberg/embedding_config.ex
generated
Normal file
43
packages/elixir/lib/kreuzberg/embedding_config.ex
generated
Normal file
@@ -0,0 +1,43 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.EmbeddingConfig do
  @moduledoc """
  Embedding configuration for text chunks.

  Configures embedding generation using ONNX models via the vendored embedding engine.
  Requires the `embeddings` feature to be enabled.
  """

  @typedoc """
  Embedding configuration for text chunks.

  `model` defaults to the atom `:preset`, which the previous `String.t() | nil`
  type did not admit; it is now typed as `Kreuzberg.EmbeddingModelType.t() | nil`.
  """
  # NOTE(review): the default `:preset` is a bare atom, while the variants
  # documented on Kreuzberg.EmbeddingModelType are maps - confirm with the generator.
  @type t :: %__MODULE__{
          model: Kreuzberg.EmbeddingModelType.t() | nil,
          normalize: boolean(),
          batch_size: non_neg_integer(),
          show_download_progress: boolean(),
          cache_dir: String.t() | nil,
          acceleration: map() | nil,
          max_embed_duration_secs: non_neg_integer() | nil
        }

  defstruct model: :preset,
            normalize: true,
            batch_size: 32,
            show_download_progress: false,
            cache_dir: nil,
            acceleration: nil,
            max_embed_duration_secs: nil

  defimpl Jason.Encoder do
    @doc false
    def encode(value, opts) do
      # nil fields are omitted from the JSON object; all other fields are encoded as-is.
      value
      |> Map.from_struct()
      |> Enum.reject(fn {_k, v} -> v == nil end)
      |> Enum.into(%{})
      |> Jason.Encoder.encode(opts)
    end
  end
end
|
||||
20
packages/elixir/lib/kreuzberg/embedding_model_type.ex
generated
Normal file
20
packages/elixir/lib/kreuzberg/embedding_model_type.ex
generated
Normal file
@@ -0,0 +1,20 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.EmbeddingModelType do
  @moduledoc "Embedding model types supported by Kreuzberg."

  # Variant shapes. Each is a map tagged by its `:type` key.

  @typedoc "Use a preset model configuration (recommended)"
  @type preset :: %{type: :preset, name: String.t()}

  @typedoc "Use a custom ONNX model from HuggingFace"
  @type custom :: %{type: :custom, model_id: String.t(), dimensions: non_neg_integer()}

  @typedoc "Provider-hosted embedding model via liter-llm."
  @type llm :: %{type: :llm, llm: Kreuzberg.LlmConfig.t()}

  @typedoc "In-process embedding backend registered via the plugin system."
  @type plugin :: %{type: :plugin, name: String.t()}

  @typedoc "Embedding model types supported by Kreuzberg."
  @type t :: term()
end
|
||||
37
packages/elixir/lib/kreuzberg/embedding_preset.ex
generated
Normal file
37
packages/elixir/lib/kreuzberg/embedding_preset.ex
generated
Normal file
@@ -0,0 +1,37 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.EmbeddingPreset do
  @moduledoc """
  Preset configurations for common RAG use cases.

  Each preset combines chunk size, overlap, and embedding model
  to provide an optimized configuration for specific scenarios.

  All string fields are owned `String` for FFI compatibility — instances
  are safe to clone and pass across language boundaries.
  """

  @typedoc "Preset configurations for common RAG use cases."
  @type t :: %__MODULE__{
          name: String.t() | nil,
          chunk_size: non_neg_integer(),
          overlap: non_neg_integer(),
          model_repo: String.t() | nil,
          pooling: String.t() | nil,
          model_file: String.t() | nil,
          dimensions: non_neg_integer(),
          description: String.t() | nil
        }

  # Numeric fields default to 0; string fields default to nil.
  defstruct [
    name: nil,
    chunk_size: 0,
    overlap: 0,
    model_repo: nil,
    pooling: nil,
    model_file: nil,
    dimensions: 0,
    description: nil
  ]
end
|
||||
36
packages/elixir/lib/kreuzberg/epub_metadata.ex
generated
Normal file
36
packages/elixir/lib/kreuzberg/epub_metadata.ex
generated
Normal file
@@ -0,0 +1,36 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.EpubMetadata do
  @moduledoc "EPUB metadata (Dublin Core extensions)."

  @typedoc "EPUB metadata (Dublin Core extensions)."
  @type t :: %__MODULE__{
          coverage: String.t() | nil,
          dc_format: String.t() | nil,
          relation: String.t() | nil,
          source: String.t() | nil,
          dc_type: String.t() | nil,
          cover_image: String.t() | nil
        }

  # Every field is optional and defaults to nil.
  defstruct [:coverage, :dc_format, :relation, :source, :dc_type, :cover_image]

  defimpl Jason.Encoder do
    @doc false
    def encode(value, opts) do
      # Emit only the fields that are set.
      present =
        for {key, field} <- Map.from_struct(value), not is_nil(field), into: %{} do
          {key, field}
        end

      Jason.Encoder.encode(present, opts)
    end
  end
end
|
||||
17
packages/elixir/lib/kreuzberg/error_metadata.ex
generated
Normal file
17
packages/elixir/lib/kreuzberg/error_metadata.ex
generated
Normal file
@@ -0,0 +1,17 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.ErrorMetadata do
  @moduledoc "Error metadata (for batch operations)."

  @typedoc "Error metadata (for batch operations)."
  @type t :: %__MODULE__{
          error_type: String.t() | nil,
          message: String.t() | nil
        }

  # Both fields default to nil.
  defstruct [:error_type, :message]
end
|
||||
33
packages/elixir/lib/kreuzberg/excel_metadata.ex
generated
Normal file
33
packages/elixir/lib/kreuzberg/excel_metadata.ex
generated
Normal file
@@ -0,0 +1,33 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.ExcelMetadata do
  @moduledoc """
  Excel/spreadsheet format metadata.

  Identifies the document as a spreadsheet source via the `FormatMetadata::Excel`
  discriminant. Sheet count and sheet names are stored inside this struct.
  """

  @typedoc "Excel/spreadsheet format metadata."
  @type t :: %__MODULE__{
          sheet_count: non_neg_integer() | nil,
          sheet_names: [String.t()] | nil
        }

  # Both fields default to nil.
  defstruct [:sheet_count, :sheet_names]

  defimpl Jason.Encoder do
    @doc false
    def encode(value, opts) do
      # Unset fields are left out of the encoded object.
      present =
        for {key, field} <- Map.from_struct(value), not is_nil(field), into: %{} do
          {key, field}
        end

      Jason.Encoder.encode(present, opts)
    end
  end
end
|
||||
30
packages/elixir/lib/kreuzberg/excel_sheet.ex
generated
Normal file
30
packages/elixir/lib/kreuzberg/excel_sheet.ex
generated
Normal file
@@ -0,0 +1,30 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.ExcelSheet do
  @moduledoc """
  Single Excel worksheet.

  Represents one sheet from an Excel workbook with its content
  converted to Markdown format and dimensional statistics.
  """

  @typedoc "Single Excel worksheet."
  @type t :: %__MODULE__{
          name: String.t() | nil,
          markdown: String.t() | nil,
          row_count: non_neg_integer(),
          col_count: non_neg_integer(),
          cell_count: non_neg_integer(),
          table_cells: [[String.t()]] | nil
        }

  # The three counters default to 0; the remaining fields default to nil.
  defstruct [
    name: nil,
    markdown: nil,
    row_count: 0,
    col_count: 0,
    cell_count: 0,
    table_cells: nil
  ]
end
|
||||
24
packages/elixir/lib/kreuzberg/excel_workbook.ex
generated
Normal file
24
packages/elixir/lib/kreuzberg/excel_workbook.ex
generated
Normal file
@@ -0,0 +1,24 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.ExcelWorkbook do
  @moduledoc """
  Excel workbook representation.

  Contains all sheets from an Excel file (.xlsx, .xls, etc.) with
  extracted content and metadata.
  """

  @typedoc "Excel workbook representation."
  @type t :: %__MODULE__{
          sheets: [map()],
          metadata: map(),
          revisions: [map()] | nil
        }

  # `sheets` starts empty, `metadata` is an empty map, and `revisions` is unset.
  defstruct [sheets: [], metadata: %{}, revisions: nil]
end
|
||||
38
packages/elixir/lib/kreuzberg/execution_provider_type.ex
generated
Normal file
38
packages/elixir/lib/kreuzberg/execution_provider_type.ex
generated
Normal file
@@ -0,0 +1,38 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.ExecutionProviderType do
  @moduledoc """
  ONNX Runtime execution provider type.

  Determines which hardware backend is used for model inference.
  `Auto` (default) selects the best available provider per platform.
  """

  @typedoc "ONNX Runtime execution provider type."
  @type t :: :auto | :cpu | :core_ml | :cuda | :tensor_rt

  # Each accessor returns the atom for one variant of `t()`.

  @doc "Auto-select: CoreML on macOS, CUDA on Linux, CPU elsewhere."
  @spec auto() :: t()
  def auto, do: :auto

  @doc "CPU execution provider (always available)."
  @spec cpu() :: t()
  def cpu, do: :cpu

  @doc "Apple CoreML (macOS/iOS Neural Engine + GPU)."
  @spec core_ml() :: t()
  def core_ml, do: :core_ml

  @doc "NVIDIA CUDA GPU acceleration."
  @spec cuda() :: t()
  def cuda, do: :cuda

  @doc "NVIDIA TensorRT (optimized CUDA inference)."
  @spec tensor_rt() :: t()
  def tensor_rt, do: :tensor_rt
end
|
||||
51
packages/elixir/lib/kreuzberg/extracted_image.ex
generated
Normal file
51
packages/elixir/lib/kreuzberg/extracted_image.ex
generated
Normal file
@@ -0,0 +1,51 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.ExtractedImage do
  @moduledoc """
  Extracted image from a document.

  Contains raw image data, metadata, and optional nested OCR results.
  Raw bytes allow cross-language compatibility - users can convert to
  PIL.Image (Python), Sharp (Node.js), or other formats as needed.
  """

  @typedoc "Extracted image from a document."
  @type t :: %__MODULE__{
          data: binary(),
          format: String.t() | nil,
          image_index: non_neg_integer(),
          page_number: non_neg_integer() | nil,
          width: non_neg_integer() | nil,
          height: non_neg_integer() | nil,
          colorspace: String.t() | nil,
          bits_per_component: non_neg_integer() | nil,
          is_mask: boolean(),
          description: String.t() | nil,
          ocr_result: map() | nil,
          bounding_box: map() | nil,
          source_path: String.t() | nil,
          # The duplicated `| nil` in the original union was redundant and is removed.
          image_kind: String.t() | nil,
          kind_confidence: float() | nil,
          cluster_id: non_neg_integer() | nil
        }

  defstruct data: <<>>,
            format: nil,
            image_index: 0,
            page_number: nil,
            width: nil,
            height: nil,
            colorspace: nil,
            bits_per_component: nil,
            is_mask: false,
            description: nil,
            ocr_result: nil,
            bounding_box: nil,
            source_path: nil,
            image_kind: nil,
            kind_confidence: nil,
            cluster_id: nil
end
|
||||
27
packages/elixir/lib/kreuzberg/extracted_uri.ex
generated
Normal file
27
packages/elixir/lib/kreuzberg/extracted_uri.ex
generated
Normal file
@@ -0,0 +1,27 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.ExtractedUri do
  @moduledoc """
  A URI extracted from a document.

  Represents any link, reference, or resource pointer found during extraction.
  The `kind` field classifies the URI semantically, while `label` carries
  optional human-readable display text.
  """

  @typedoc """
  A URI extracted from a document.

  `kind` defaults to the atom `:hyperlink`, so the type admits atoms as well
  as strings.
  """
  @type t :: %__MODULE__{
          url: String.t() | nil,
          label: String.t() | nil,
          page: non_neg_integer() | nil,
          kind: atom() | String.t() | nil
        }

  defstruct url: nil,
            label: nil,
            page: nil,
            kind: :hyperlink
end
|
||||
111
packages/elixir/lib/kreuzberg/extraction_config.ex
generated
Normal file
111
packages/elixir/lib/kreuzberg/extraction_config.ex
generated
Normal file
@@ -0,0 +1,111 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.ExtractionConfig do
  @moduledoc """
  Main extraction configuration.

  This struct contains all configuration options for the extraction process.
  It can be loaded from TOML, YAML, or JSON files, or created programmatically.

  ## Example

      # Create with defaults
      config = %Kreuzberg.ExtractionConfig{}

      # Override selected options
      config = %Kreuzberg.ExtractionConfig{use_cache: false, force_ocr: true}
  """

  @typedoc """
  Main extraction configuration.

  `result_format` and `output_format` admit atoms as well as strings because
  the struct defaults are the atoms `:unified` and `:plain`.
  """
  @type t :: %__MODULE__{
          use_cache: boolean(),
          enable_quality_processing: boolean(),
          ocr: map() | nil,
          force_ocr: boolean(),
          force_ocr_pages: [non_neg_integer()] | nil,
          disable_ocr: boolean(),
          chunking: map() | nil,
          content_filter: map() | nil,
          images: map() | nil,
          pdf_options: map() | nil,
          token_reduction: map() | nil,
          language_detection: map() | nil,
          pages: map() | nil,
          keywords: map() | nil,
          postprocessor: map() | nil,
          html_options: String.t() | nil,
          html_output: map() | nil,
          extraction_timeout_secs: non_neg_integer() | nil,
          max_concurrent_extractions: non_neg_integer() | nil,
          result_format: atom() | String.t() | nil,
          security_limits: map() | nil,
          max_embedded_file_bytes: non_neg_integer() | nil,
          output_format: atom() | String.t() | nil,
          layout: map() | nil,
          use_layout_for_markdown: boolean(),
          include_document_structure: boolean(),
          acceleration: map() | nil,
          cache_namespace: String.t() | nil,
          cache_ttl_secs: non_neg_integer() | nil,
          email: map() | nil,
          concurrency: String.t() | nil,
          max_archive_depth: non_neg_integer(),
          tree_sitter: map() | nil,
          structured_extraction: map() | nil,
          cancel_token: String.t() | nil
        }

  defstruct use_cache: true,
            enable_quality_processing: true,
            ocr: nil,
            force_ocr: false,
            force_ocr_pages: nil,
            disable_ocr: false,
            chunking: nil,
            content_filter: nil,
            images: nil,
            pdf_options: nil,
            token_reduction: nil,
            language_detection: nil,
            pages: nil,
            keywords: nil,
            postprocessor: nil,
            html_options: nil,
            html_output: nil,
            extraction_timeout_secs: nil,
            max_concurrent_extractions: nil,
            result_format: :unified,
            security_limits: nil,
            max_embedded_file_bytes: nil,
            output_format: :plain,
            layout: nil,
            use_layout_for_markdown: false,
            include_document_structure: false,
            acceleration: nil,
            cache_namespace: nil,
            cache_ttl_secs: nil,
            email: nil,
            concurrency: nil,
            max_archive_depth: 0,
            tree_sitter: nil,
            structured_extraction: nil,
            cancel_token: nil

  defimpl Jason.Encoder do
    @doc false
    def encode(value, opts) do
      # Fields whose value is nil are left out of the encoded JSON object
      # entirely instead of being emitted as `null`.
      value
      |> Map.from_struct()
      |> Enum.reject(fn {_k, v} -> v == nil end)
      |> Enum.into(%{})
      |> Jason.Encoder.encode(opts)
    end
  end
end
|
||||
25
packages/elixir/lib/kreuzberg/extraction_diff.ex
generated
Normal file
25
packages/elixir/lib/kreuzberg/extraction_diff.ex
generated
Normal file
@@ -0,0 +1,25 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.ExtractionDiff do
  @moduledoc """
  The complete diff between two `ExtractionResult` values.

  A freshly built struct has empty lists for the content and table diffs and
  `nil` for `metadata_changed` and `embedded_changes`.
  """

  @typedoc """
  The complete diff between two `ExtractionResult` values.

  `embedded_changes` is nullable because the struct default is `nil`.
  """
  @type t :: %__MODULE__{
          content_diff: [map()],
          tables_added: [map()],
          tables_removed: [map()],
          tables_changed: [map()],
          metadata_changed: String.t() | nil,
          embedded_changes: map() | nil
        }

  defstruct content_diff: [],
            tables_added: [],
            tables_removed: [],
            tables_changed: [],
            metadata_changed: nil,
            embedded_changes: nil
end
|
||||
22
packages/elixir/lib/kreuzberg/extraction_method.ex
generated
Normal file
22
packages/elixir/lib/kreuzberg/extraction_method.ex
generated
Normal file
@@ -0,0 +1,22 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.ExtractionMethod do
  @moduledoc "How the extracted text was produced."

  @typedoc "How the extracted text was produced."
  @type t :: :native | :ocr | :mixed

  @doc "Returns the `:native` extraction method tag."
  @spec native() :: t()
  def native do
    :native
  end

  @doc "Returns the `:ocr` extraction method tag."
  @spec ocr() :: t()
  def ocr do
    :ocr
  end

  @doc "Returns the `:mixed` extraction method tag."
  @spec mixed() :: t()
  def mixed do
    :mixed
  end
end
|
||||
78
packages/elixir/lib/kreuzberg/extraction_result.ex
generated
Normal file
78
packages/elixir/lib/kreuzberg/extraction_result.ex
generated
Normal file
@@ -0,0 +1,78 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.ExtractionResult do
  @moduledoc """
  General extraction result used by the core extraction API.

  This is the main result type returned by all extraction functions.
  """

  @typedoc """
  General extraction result used by the core extraction API.

  `metadata` is nullable because the struct default is `nil`.
  """
  @type t :: %__MODULE__{
          content: String.t() | nil,
          mime_type: String.t() | nil,
          metadata: map() | nil,
          extraction_method: String.t() | nil,
          tables: [map()],
          detected_languages: [String.t()] | nil,
          chunks: [map()] | nil,
          images: [map()] | nil,
          pages: [map()] | nil,
          elements: [map()] | nil,
          djot_content: map() | nil,
          ocr_elements: [map()] | nil,
          document: map() | nil,
          extracted_keywords: [map()] | nil,
          quality_score: float() | nil,
          processing_warnings: [map()],
          annotations: [map()] | nil,
          children: [map()] | nil,
          uris: [map()] | nil,
          revisions: [map()] | nil,
          structured_output: String.t() | nil,
          code_intelligence: String.t() | nil,
          llm_usage: [map()] | nil,
          formatted_content: String.t() | nil,
          ocr_internal_document: String.t() | nil
        }

  defstruct content: nil,
            mime_type: nil,
            metadata: nil,
            extraction_method: nil,
            tables: [],
            detected_languages: nil,
            chunks: nil,
            images: nil,
            pages: nil,
            elements: nil,
            djot_content: nil,
            ocr_elements: nil,
            document: nil,
            extracted_keywords: nil,
            quality_score: nil,
            processing_warnings: [],
            annotations: nil,
            children: nil,
            uris: nil,
            revisions: nil,
            structured_output: nil,
            code_intelligence: nil,
            llm_usage: nil,
            formatted_content: nil,
            ocr_internal_document: nil

  defimpl Jason.Encoder do
    @doc false
    def encode(value, opts) do
      # Fields whose value is nil are left out of the encoded JSON object
      # entirely instead of being emitted as `null`.
      value
      |> Map.from_struct()
      |> Enum.reject(fn {_k, v} -> v == nil end)
      |> Enum.into(%{})
      |> Jason.Encoder.encode(opts)
    end
  end
end
|
||||
30
packages/elixir/lib/kreuzberg/fiction_book_metadata.ex
generated
Normal file
30
packages/elixir/lib/kreuzberg/fiction_book_metadata.ex
generated
Normal file
@@ -0,0 +1,30 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.FictionBookMetadata do
  @moduledoc "FictionBook (FB2) metadata."

  defstruct genres: [], sequences: [], annotation: nil

  @typedoc "FictionBook (FB2) metadata."
  @type t :: %__MODULE__{
          genres: [String.t()],
          sequences: [String.t()],
          annotation: String.t() | nil
        }

  defimpl Jason.Encoder do
    @doc false
    def encode(metadata, opts) do
      # Keep only the fields that carry a value; nil fields are not encoded.
      present =
        for {field, val} <- Map.from_struct(metadata), not is_nil(val), into: %{} do
          {field, val}
        end

      Jason.Encoder.encode(present, opts)
    end
  end
end
|
||||
96
packages/elixir/lib/kreuzberg/file_extraction_config.ex
generated
Normal file
96
packages/elixir/lib/kreuzberg/file_extraction_config.ex
generated
Normal file
@@ -0,0 +1,96 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.FileExtractionConfig do
  @moduledoc """
  Per-file extraction configuration overrides for batch processing.

  All fields default to `nil`, which means "use the batch-level default."
  This type is used with `batch_extract_files` and
  `batch_extract_bytes` to allow heterogeneous
  extraction settings within a single batch.

  ## Excluded Fields

  The following `ExtractionConfig` fields are batch-level only and
  cannot be overridden per file:
  - `max_concurrent_extractions` — controls batch parallelism
  - `use_cache` — global caching policy
  - `acceleration` — shared ONNX execution provider
  - `security_limits` — global archive security policy

  ## Example

      # Override just OCR forcing for a specific file
      config = %Kreuzberg.FileExtractionConfig{force_ocr: true}
  """

  @typedoc """
  Per-file extraction configuration overrides for batch processing.

  `result_format` and `output_format` admit atoms as well as strings, matching
  the atom defaults (`:unified`, `:plain`) used for the same fields in
  `Kreuzberg.ExtractionConfig`.
  """
  @type t :: %__MODULE__{
          enable_quality_processing: boolean() | nil,
          ocr: map() | nil,
          force_ocr: boolean() | nil,
          force_ocr_pages: [non_neg_integer()] | nil,
          disable_ocr: boolean() | nil,
          chunking: map() | nil,
          content_filter: map() | nil,
          images: map() | nil,
          pdf_options: map() | nil,
          token_reduction: map() | nil,
          language_detection: map() | nil,
          pages: map() | nil,
          keywords: map() | nil,
          postprocessor: map() | nil,
          html_options: String.t() | nil,
          result_format: atom() | String.t() | nil,
          output_format: atom() | String.t() | nil,
          include_document_structure: boolean() | nil,
          layout: map() | nil,
          timeout_secs: non_neg_integer() | nil,
          tree_sitter: map() | nil,
          structured_extraction: map() | nil
        }

  defstruct enable_quality_processing: nil,
            ocr: nil,
            force_ocr: nil,
            force_ocr_pages: nil,
            disable_ocr: nil,
            chunking: nil,
            content_filter: nil,
            images: nil,
            pdf_options: nil,
            token_reduction: nil,
            language_detection: nil,
            pages: nil,
            keywords: nil,
            postprocessor: nil,
            html_options: nil,
            result_format: nil,
            output_format: nil,
            include_document_structure: nil,
            layout: nil,
            timeout_secs: nil,
            tree_sitter: nil,
            structured_extraction: nil

  defimpl Jason.Encoder do
    @doc false
    def encode(value, opts) do
      # Fields whose value is nil are left out of the encoded JSON object
      # entirely, so only explicit overrides are serialized.
      value
      |> Map.from_struct()
      |> Enum.reject(fn {_k, v} -> v == nil end)
      |> Enum.into(%{})
      |> Jason.Encoder.encode(opts)
    end
  end
end
|
||||
17
packages/elixir/lib/kreuzberg/footnote.ex
generated
Normal file
17
packages/elixir/lib/kreuzberg/footnote.ex
generated
Normal file
@@ -0,0 +1,17 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.Footnote do
  @moduledoc "Footnote in Djot."

  defstruct [:label, content: []]

  @typedoc "Footnote in Djot."
  @type t :: %__MODULE__{label: String.t() | nil, content: [map()]}
end
|
||||
37
packages/elixir/lib/kreuzberg/format_metadata.ex
generated
Normal file
37
packages/elixir/lib/kreuzberg/format_metadata.ex
generated
Normal file
@@ -0,0 +1,37 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.FormatMetadata do
  @moduledoc """
  Format-specific metadata (discriminated union).

  Only one format type can exist per extraction result. This provides
  type-safe, clean metadata without nested optionals.

  Each variant type below is a map tagged by its `:type` atom. Every variant
  except `code` carries its payload under `:metadata`; `code` carries a string
  under `:value`.
  """

  @typedoc "Format-specific metadata (discriminated union)."
  @type t :: term()

  @typedoc "Variant tagged `:pdf`."
  @type pdf :: %{type: :pdf, metadata: Kreuzberg.PdfMetadata.t()}

  @typedoc "Variant tagged `:docx`."
  @type docx :: %{type: :docx, metadata: Kreuzberg.DocxMetadata.t()}

  @typedoc "Variant tagged `:excel`."
  @type excel :: %{type: :excel, metadata: Kreuzberg.ExcelMetadata.t()}

  @typedoc "Variant tagged `:email`."
  @type email :: %{type: :email, metadata: Kreuzberg.EmailMetadata.t()}

  @typedoc "Variant tagged `:pptx`."
  @type pptx :: %{type: :pptx, metadata: Kreuzberg.PptxMetadata.t()}

  @typedoc "Variant tagged `:archive`."
  @type archive :: %{type: :archive, metadata: Kreuzberg.ArchiveMetadata.t()}

  @typedoc "Variant tagged `:image`."
  @type image :: %{type: :image, metadata: Kreuzberg.ImageMetadata.t()}

  @typedoc "Variant tagged `:xml`."
  @type xml :: %{type: :xml, metadata: Kreuzberg.XmlMetadata.t()}

  @typedoc "Variant tagged `:text`."
  @type text :: %{type: :text, metadata: Kreuzberg.TextMetadata.t()}

  @typedoc "Variant tagged `:html`."
  @type html :: %{type: :html, metadata: Kreuzberg.HtmlMetadata.t()}

  @typedoc "Variant tagged `:ocr`."
  @type ocr :: %{type: :ocr, metadata: Kreuzberg.OcrMetadata.t()}

  @typedoc "Variant tagged `:csv`."
  @type csv :: %{type: :csv, metadata: Kreuzberg.CsvMetadata.t()}

  @typedoc "Variant tagged `:bibtex`."
  @type bibtex :: %{type: :bibtex, metadata: Kreuzberg.BibtexMetadata.t()}

  @typedoc "Variant tagged `:citation`."
  @type citation :: %{type: :citation, metadata: Kreuzberg.CitationMetadata.t()}

  @typedoc "Variant tagged `:fiction_book`."
  @type fiction_book :: %{type: :fiction_book, metadata: Kreuzberg.FictionBookMetadata.t()}

  @typedoc "Variant tagged `:dbf`."
  @type dbf :: %{type: :dbf, metadata: Kreuzberg.DbfMetadata.t()}

  @typedoc "Variant tagged `:jats`."
  @type jats :: %{type: :jats, metadata: Kreuzberg.JatsMetadata.t()}

  @typedoc "Variant tagged `:epub`."
  @type epub :: %{type: :epub, metadata: Kreuzberg.EpubMetadata.t()}

  @typedoc "Variant tagged `:pst`."
  @type pst :: %{type: :pst, metadata: Kreuzberg.PstMetadata.t()}

  @typedoc "Variant tagged `:code`; its payload is a string under `:value`."
  @type code :: %{type: :code, value: String.t()}
end
|
||||
31
packages/elixir/lib/kreuzberg/formatted_block.ex
generated
Normal file
31
packages/elixir/lib/kreuzberg/formatted_block.ex
generated
Normal file
@@ -0,0 +1,31 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.FormattedBlock do
  @moduledoc """
  Block-level element in a Djot document.

  Represents structural elements like headings, paragraphs, lists, code blocks, etc.

  A freshly built struct has `block_type: :paragraph`, empty `inline_content`
  and `children` lists, and `nil` for the remaining fields.
  """

  @typedoc """
  Block-level element in a Djot document.

  `block_type` admits atoms as well as strings because the struct default is
  the atom `:paragraph`.
  """
  @type t :: %__MODULE__{
          block_type: atom() | String.t() | nil,
          level: non_neg_integer() | nil,
          inline_content: [map()],
          attributes: String.t() | nil,
          language: String.t() | nil,
          code: String.t() | nil,
          children: [map()]
        }

  defstruct block_type: :paragraph,
            level: nil,
            inline_content: [],
            attributes: nil,
            language: nil,
            code: nil,
            children: []
end
|
||||
27
packages/elixir/lib/kreuzberg/grid_cell.ex
generated
Normal file
27
packages/elixir/lib/kreuzberg/grid_cell.ex
generated
Normal file
@@ -0,0 +1,27 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.GridCell do
  @moduledoc "Individual grid cell with position and span metadata."

  defstruct [:content, row: 0, col: 0, row_span: 0, col_span: 0, is_header: false, bbox: nil]

  @typedoc "Individual grid cell with position and span metadata."
  @type t :: %__MODULE__{
          content: String.t() | nil,
          row: non_neg_integer(),
          col: non_neg_integer(),
          row_span: non_neg_integer(),
          col_span: non_neg_integer(),
          is_header: boolean(),
          bbox: map() | nil
        }
end
|
||||
29
packages/elixir/lib/kreuzberg/header_metadata.ex
generated
Normal file
29
packages/elixir/lib/kreuzberg/header_metadata.ex
generated
Normal file
@@ -0,0 +1,29 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.HeaderMetadata do
  @moduledoc "Header/heading element metadata."

  defstruct level: 0, text: nil, id: nil, depth: 0, html_offset: 0

  @typedoc "Header/heading element metadata."
  @type t :: %__MODULE__{
          level: non_neg_integer(),
          text: String.t() | nil,
          id: String.t() | nil,
          depth: non_neg_integer(),
          html_offset: non_neg_integer()
        }

  @doc "Validate that the header level is within valid range (1-6)."
  @spec valid?(t()) :: boolean()
  def valid?(%__MODULE__{level: level}) when level >= 1 and level <= 6, do: true
  def valid?(%__MODULE__{}), do: false
end
|
||||
19
packages/elixir/lib/kreuzberg/heading_context.ex
generated
Normal file
19
packages/elixir/lib/kreuzberg/heading_context.ex
generated
Normal file
@@ -0,0 +1,19 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.HeadingContext do
  @moduledoc """
  Heading context for a chunk within a Markdown document.

  Contains the heading hierarchy from document root to this chunk's section.
  """

  defstruct [headings: []]

  @typedoc "Heading context for a chunk within a Markdown document."
  @type t :: %__MODULE__{headings: [map()]}
end
|
||||
17
packages/elixir/lib/kreuzberg/heading_level.ex
generated
Normal file
17
packages/elixir/lib/kreuzberg/heading_level.ex
generated
Normal file
@@ -0,0 +1,17 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.HeadingLevel do
  @moduledoc "A single heading in the hierarchy."

  defstruct level: 0, text: nil

  @typedoc "A single heading in the hierarchy."
  @type t :: %__MODULE__{level: non_neg_integer(), text: String.t() | nil}
end
|
||||
26
packages/elixir/lib/kreuzberg/hierarchical_block.ex
generated
Normal file
26
packages/elixir/lib/kreuzberg/hierarchical_block.ex
generated
Normal file
@@ -0,0 +1,26 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.HierarchicalBlock do
  @moduledoc """
  A text block with hierarchy level assignment.

  Represents a block of text with semantic heading information extracted from
  font size clustering and hierarchical analysis.
  """

  defstruct [:text, font_size: 0.0, level: nil, bbox: nil]

  @typedoc "A text block with hierarchy level assignment."
  @type t :: %__MODULE__{
          text: String.t() | nil,
          font_size: float(),
          level: String.t() | nil,
          bbox: [float()] | nil
        }
end
|
||||
38
packages/elixir/lib/kreuzberg/hierarchy_config.ex
generated
Normal file
38
packages/elixir/lib/kreuzberg/hierarchy_config.ex
generated
Normal file
@@ -0,0 +1,38 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.HierarchyConfig do
  @moduledoc """
  Hierarchy extraction configuration for PDF text structure analysis.

  Enables extraction of document hierarchy levels (H1-H6) based on font size
  clustering and semantic analysis. When enabled, hierarchical blocks are
  included in page content.
  """

  defstruct enabled: true, k_clusters: 3, include_bbox: true, ocr_coverage_threshold: nil

  @typedoc "Hierarchy extraction configuration for PDF text structure analysis."
  @type t :: %__MODULE__{
          enabled: boolean(),
          k_clusters: non_neg_integer(),
          include_bbox: boolean(),
          ocr_coverage_threshold: float() | nil
        }

  defimpl Jason.Encoder do
    @doc false
    def encode(config, opts) do
      # Keep only the fields that carry a value; nil fields are not encoded.
      present =
        for {field, val} <- Map.from_struct(config), not is_nil(val), into: %{} do
          {field, val}
        end

      Jason.Encoder.encode(present, opts)
    end
  end
end
|
||||
59
packages/elixir/lib/kreuzberg/html_metadata.ex
generated
Normal file
59
packages/elixir/lib/kreuzberg/html_metadata.ex
generated
Normal file
@@ -0,0 +1,59 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.HtmlMetadata do
  @moduledoc """
  HTML metadata extracted from HTML documents.

  Includes document-level metadata, Open Graph data, Twitter Card metadata,
  and extracted structural elements (headers, links, images, structured data).
  """

  @typedoc "HTML metadata extracted from HTML documents."
  @type t :: %__MODULE__{
          title: String.t() | nil,
          description: String.t() | nil,
          keywords: [String.t()],
          author: String.t() | nil,
          canonical_url: String.t() | nil,
          base_href: String.t() | nil,
          language: String.t() | nil,
          text_direction: String.t() | nil,
          open_graph: map(),
          twitter_card: map(),
          meta_tags: map(),
          headers: [map()],
          links: [map()],
          images: [map()],
          structured_data: [map()]
        }

  defstruct title: nil,
            description: nil,
            keywords: [],
            author: nil,
            canonical_url: nil,
            base_href: nil,
            language: nil,
            text_direction: nil,
            open_graph: %{},
            twitter_card: %{},
            meta_tags: %{},
            headers: [],
            links: [],
            images: [],
            structured_data: []

  defimpl Jason.Encoder do
    @doc false
    def encode(value, opts) do
      # Fields whose value is nil are left out of the encoded JSON object
      # entirely; empty lists and empty maps are still encoded.
      value
      |> Map.from_struct()
      |> Enum.reject(fn {_k, v} -> v == nil end)
      |> Enum.into(%{})
      |> Jason.Encoder.encode(opts)
    end
  end
end
|
||||
53
packages/elixir/lib/kreuzberg/html_output_config.ex
generated
Normal file
53
packages/elixir/lib/kreuzberg/html_output_config.ex
generated
Normal file
@@ -0,0 +1,53 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.HtmlOutputConfig do
  @moduledoc """
  Configuration for styled HTML output.

  When set as the `html_output` field of `Kreuzberg.ExtractionConfig` alongside
  an HTML output format, the pipeline builds a `StyledHtmlRenderer` instead of
  the plain comrak-based renderer.

  ## Example

      config = %Kreuzberg.HtmlOutputConfig{
        theme: :git_hub,
        css: ".kb-p { font-size: 1.1rem; }"
      }
  """

  @typedoc """
  Configuration for styled HTML output.

  `theme` admits the `Kreuzberg.HtmlTheme` atoms as well as strings because the
  struct default is the atom `:unstyled`.
  """
  @type t :: %__MODULE__{
          css: String.t() | nil,
          css_file: String.t() | nil,
          theme: Kreuzberg.HtmlTheme.t() | String.t() | nil,
          class_prefix: String.t() | nil,
          embed_css: boolean()
        }

  defstruct css: nil,
            css_file: nil,
            theme: :unstyled,
            class_prefix: nil,
            embed_css: true

  defimpl Jason.Encoder do
    @doc false
    def encode(value, opts) do
      # Fields whose value is nil are left out of the encoded JSON object
      # entirely instead of being emitted as `null`.
      value
      |> Map.from_struct()
      |> Enum.reject(fn {_k, v} -> v == nil end)
      |> Enum.into(%{})
      |> Jason.Encoder.encode(opts)
    end
  end
end
|
||||
33
packages/elixir/lib/kreuzberg/html_theme.ex
generated
Normal file
33
packages/elixir/lib/kreuzberg/html_theme.ex
generated
Normal file
@@ -0,0 +1,33 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.HtmlTheme do
  @moduledoc "Built-in HTML theme selection."

  @typedoc "Built-in HTML theme selection."
  @type t :: :default | :git_hub | :dark | :light | :unstyled

  @doc """
  Sensible defaults: system font stack, neutral colours, readable line measure.

  CSS custom properties (`--kb-*`) are all defined so user CSS can override
  individual values.
  """
  @spec default() :: t()
  def default do
    :default
  end

  @doc "GitHub Markdown-inspired palette and spacing."
  @spec git_hub() :: t()
  def git_hub do
    :git_hub
  end

  @doc "Dark background, light text."
  @spec dark() :: t()
  def dark do
    :dark
  end

  @doc "Minimal light theme with generous whitespace."
  @spec light() :: t()
  def light do
    :light
  end

  @doc """
  No built-in stylesheet emitted.

  CSS custom properties are still defined on `:root` so user stylesheets can
  reference `var(--kb-*)` tokens.
  """
  @spec unstyled() :: t()
  def unstyled do
    :unstyled
  end
end
|
||||
50
packages/elixir/lib/kreuzberg/image_extraction_config.ex
generated
Normal file
50
packages/elixir/lib/kreuzberg/image_extraction_config.ex
generated
Normal file
@@ -0,0 +1,50 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.ImageExtractionConfig do
  @moduledoc "Image extraction configuration."

  defstruct [
    extract_images: true,
    target_dpi: 300,
    max_image_dimension: 4096,
    inject_placeholders: true,
    auto_adjust_dpi: true,
    min_dpi: 72,
    max_dpi: 600,
    max_images_per_page: nil,
    classify: true,
    include_page_rasters: false,
    run_ocr_on_images: true,
    ocr_text_only: false,
    append_ocr_text: false
  ]

  @typedoc "Image extraction configuration."
  @type t :: %__MODULE__{
          extract_images: boolean(),
          target_dpi: integer(),
          max_image_dimension: integer(),
          inject_placeholders: boolean(),
          auto_adjust_dpi: boolean(),
          min_dpi: integer(),
          max_dpi: integer(),
          max_images_per_page: non_neg_integer() | nil,
          classify: boolean(),
          include_page_rasters: boolean(),
          run_ocr_on_images: boolean(),
          ocr_text_only: boolean(),
          append_ocr_text: boolean()
        }

  defimpl Jason.Encoder do
    @doc false
    def encode(config, opts) do
      # Keep only the fields that carry a value; nil fields are not encoded.
      present =
        for {field, val} <- Map.from_struct(config), not is_nil(val), into: %{} do
          {field, val}
        end

      Jason.Encoder.encode(present, opts)
    end
  end
end
|
||||
73
packages/elixir/lib/kreuzberg/image_kind.ex
generated
Normal file
73
packages/elixir/lib/kreuzberg/image_kind.ex
generated
Normal file
@@ -0,0 +1,73 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.ImageKind do
  @moduledoc "Heuristic classification of what an image likely depicts."

  @typedoc "Heuristic classification of what an image likely depicts."
  @type t ::
          :photograph
          | :diagram
          | :chart
          | :drawing
          | :text_block
          | :decoration
          | :logo
          | :icon
          | :tile_fragment
          | :mask
          | :page_raster
          | :unknown

  @doc "Photographic image (natural scene, photograph)"
  @spec photograph() :: t()
  def photograph do
    :photograph
  end

  @doc "Technical or schematic diagram"
  @spec diagram() :: t()
  def diagram do
    :diagram
  end

  @doc "Chart, graph, or plot"
  @spec chart() :: t()
  def chart do
    :chart
  end

  @doc "Freehand or technical drawing"
  @spec drawing() :: t()
  def drawing do
    :drawing
  end

  @doc "Text-heavy image (scanned text, document)"
  @spec text_block() :: t()
  def text_block do
    :text_block
  end

  @doc "Decorative element or border"
  @spec decoration() :: t()
  def decoration do
    :decoration
  end

  @doc "Logo or brand mark"
  @spec logo() :: t()
  def logo do
    :logo
  end

  @doc "Small icon"
  @spec icon() :: t()
  def icon do
    :icon
  end

  @doc "Fragment of a larger tiled image (tile of a technical drawing)"
  @spec tile_fragment() :: t()
  def tile_fragment do
    :tile_fragment
  end

  @doc "Mask or transparency map"
  @spec mask() :: t()
  def mask do
    :mask
  end

  @doc "Full-page render produced during OCR preprocessing; used as a citation thumbnail."
  @spec page_raster() :: t()
  def page_raster do
    :page_raster
  end

  @doc "Could not classify with reasonable confidence"
  @spec unknown() :: t()
  def unknown do
    :unknown
  end
end
|
||||
36
packages/elixir/lib/kreuzberg/image_metadata.ex
generated
Normal file
36
packages/elixir/lib/kreuzberg/image_metadata.ex
generated
Normal file
@@ -0,0 +1,36 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.ImageMetadata do
  @moduledoc """
  Metadata extracted from an image file.

  Covers the image dimensions, its format, and EXIF data.
  """

  defstruct width: 0, height: 0, format: nil, exif: %{}

  @typedoc "Image metadata extracted from image files."
  @type t :: %__MODULE__{
          width: non_neg_integer(),
          height: non_neg_integer(),
          format: String.t() | nil,
          exif: map()
        }

  defimpl Jason.Encoder do
    # Encodes the struct as a JSON object, leaving out every field whose value is nil.
    @doc false
    def encode(value, opts) do
      present = for {key, val} <- Map.from_struct(value), not is_nil(val), into: %{}, do: {key, val}

      Jason.Encoder.encode(present, opts)
    end
  end
end
|
||||
25
packages/elixir/lib/kreuzberg/image_metadata_type.ex
generated
Normal file
25
packages/elixir/lib/kreuzberg/image_metadata_type.ex
generated
Normal file
@@ -0,0 +1,25 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.ImageMetadataType do
  @moduledoc """
  Image element metadata.

  A freshly built struct has `image_type` set to the atom `:data_uri`, `attributes`
  set to an empty list, and every other field set to `nil`.
  """

  @typedoc """
  Image element metadata.

  `image_type` accepts atoms as well as strings because the struct default is the
  atom `:data_uri`.
  """
  @type t :: %__MODULE__{
          src: String.t() | nil,
          alt: String.t() | nil,
          title: String.t() | nil,
          dimensions: [non_neg_integer()] | nil,
          image_type: atom() | String.t() | nil,
          attributes: [[String.t()]]
        }

  # NOTE(review): `Kreuzberg.ImageType` spells this variant `:"data-uri"`, while the
  # default below is `:data_uri`. The default is left untouched here; confirm which
  # spelling the consumer of this struct expects.
  defstruct src: nil,
            alt: nil,
            title: nil,
            dimensions: nil,
            image_type: :data_uri,
            attributes: []
end
|
||||
44
packages/elixir/lib/kreuzberg/image_preprocessing_config.ex
generated
Normal file
44
packages/elixir/lib/kreuzberg/image_preprocessing_config.ex
generated
Normal file
@@ -0,0 +1,44 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.ImagePreprocessingConfig do
  @moduledoc """
  Configuration for preprocessing images ahead of OCR.

  The settings decide how an image is prepared before OCR runs, with the aim of
  improving text recognition quality. Which preprocessing strategy works best
  depends on the document type.
  """

  defstruct target_dpi: 300,
            auto_rotate: true,
            deskew: true,
            denoise: false,
            contrast_enhance: false,
            binarization_method: "otsu",
            invert_colors: false

  @typedoc "Image preprocessing configuration for OCR."
  @type t :: %__MODULE__{
          target_dpi: integer(),
          auto_rotate: boolean(),
          deskew: boolean(),
          denoise: boolean(),
          contrast_enhance: boolean(),
          binarization_method: String.t() | nil,
          invert_colors: boolean()
        }

  defimpl Jason.Encoder do
    # Encodes the struct as a JSON object, leaving out every field whose value is nil.
    @doc false
    def encode(value, opts) do
      present = for {key, val} <- Map.from_struct(value), not is_nil(val), into: %{}, do: {key, val}

      Jason.Encoder.encode(present, opts)
    end
  end
end
|
||||
42
packages/elixir/lib/kreuzberg/image_preprocessing_metadata.ex
generated
Normal file
42
packages/elixir/lib/kreuzberg/image_preprocessing_metadata.ex
generated
Normal file
@@ -0,0 +1,42 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.ImagePreprocessingMetadata do
  @moduledoc """
  Metadata describing image preprocessing.

  Records which transformations were applied to an image during OCR preprocessing,
  including DPI normalization, resizing, and resampling.
  """

  defstruct [
    {:original_dimensions, []},
    {:original_dpi, []},
    {:target_dpi, 0},
    {:scale_factor, 0.0},
    {:auto_adjusted, false},
    {:final_dpi, 0},
    {:new_dimensions, nil},
    {:resample_method, nil},
    {:dimension_clamped, false},
    {:calculated_dpi, nil},
    {:skipped_resize, false},
    {:resize_error, nil}
  ]

  @typedoc "Image preprocessing metadata."
  @type t :: %__MODULE__{
          original_dimensions: [non_neg_integer()],
          original_dpi: [float()],
          target_dpi: integer(),
          scale_factor: float(),
          auto_adjusted: boolean(),
          final_dpi: integer(),
          new_dimensions: [non_neg_integer()] | nil,
          resample_method: String.t() | nil,
          dimension_clamped: boolean(),
          calculated_dpi: integer() | nil,
          skipped_resize: boolean(),
          resize_error: String.t() | nil
        }
end
|
||||
29
packages/elixir/lib/kreuzberg/image_type.ex
generated
Normal file
29
packages/elixir/lib/kreuzberg/image_type.ex
generated
Normal file
@@ -0,0 +1,29 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.ImageType do
  @moduledoc """
  Image type classification.

  Each variant is a plain atom; note that two of them contain a hyphen and therefore
  need the quoted-atom syntax (`:"data-uri"`, `:"inline-svg"`).
  """

  @typedoc "Image type classification."
  @type t :: :"data-uri" | :"inline-svg" | :external | :relative

  @doc "Returns `:\"data-uri\"` - a data URI image."
  @spec data_uri() :: t()
  def data_uri do
    :"data-uri"
  end

  @doc "Returns `:\"inline-svg\"` - an inline SVG."
  @spec inline_svg() :: t()
  def inline_svg do
    :"inline-svg"
  end

  @doc "Returns `:external` - an external image URL."
  @spec external() :: t()
  def external do
    :external
  end

  @doc "Returns `:relative` - a relative path image."
  @spec relative() :: t()
  def relative do
    :relative
  end
end
|
||||
25
packages/elixir/lib/kreuzberg/inline_element.ex
generated
Normal file
25
packages/elixir/lib/kreuzberg/inline_element.ex
generated
Normal file
@@ -0,0 +1,25 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.InlineElement do
  @moduledoc """
  Inline element within a block.

  Represents text with formatting, links, images, etc. A freshly built struct has
  `element_type` set to the atom `:text` and every other field set to `nil`.
  """

  @typedoc """
  Inline element within a block.

  `element_type` accepts atoms as well as strings because the struct default is the
  atom `:text`.
  """
  @type t :: %__MODULE__{
          element_type: atom() | String.t() | nil,
          content: String.t() | nil,
          attributes: String.t() | nil,
          metadata: map() | nil
        }

  defstruct element_type: :text,
            content: nil,
            attributes: nil,
            metadata: nil
end
|
||||
77
packages/elixir/lib/kreuzberg/inline_type.ex
generated
Normal file
77
packages/elixir/lib/kreuzberg/inline_type.ex
generated
Normal file
@@ -0,0 +1,77 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.InlineType do
  @moduledoc """
  Types of inline elements in Djot.

  Each variant is a plain atom; the zero-arity functions below return those atoms so
  callers do not have to spell them by hand.
  """

  @typedoc "Types of inline elements in Djot."
  @type t ::
          :text
          | :strong
          | :emphasis
          | :highlight
          | :subscript
          | :superscript
          | :insert
          | :delete
          | :code
          | :link
          | :image
          | :span
          | :math
          | :raw_inline
          | :footnote_ref
          | :symbol

  @text :text
  @strong :strong
  @emphasis :emphasis
  @highlight :highlight
  @subscript :subscript
  @superscript :superscript
  @insert :insert
  @delete :delete
  @code :code
  @link :link
  @image :image
  @span :span
  @math :math
  @raw_inline :raw_inline
  @footnote_ref :footnote_ref
  @symbol :symbol

  @doc "Returns the `:text` inline type."
  @spec text() :: t()
  def text, do: @text

  @doc "Returns the `:strong` inline type."
  @spec strong() :: t()
  def strong, do: @strong

  @doc "Returns the `:emphasis` inline type."
  @spec emphasis() :: t()
  def emphasis, do: @emphasis

  @doc "Returns the `:highlight` inline type."
  @spec highlight() :: t()
  def highlight, do: @highlight

  @doc "Returns the `:subscript` inline type."
  @spec subscript() :: t()
  def subscript, do: @subscript

  @doc "Returns the `:superscript` inline type."
  @spec superscript() :: t()
  def superscript, do: @superscript

  @doc "Returns the `:insert` inline type."
  @spec insert() :: t()
  def insert, do: @insert

  @doc "Returns the `:delete` inline type."
  @spec delete() :: t()
  def delete, do: @delete

  @doc "Returns the `:code` inline type."
  @spec code() :: t()
  def code, do: @code

  @doc "Returns the `:link` inline type."
  @spec link() :: t()
  def link, do: @link

  @doc "Returns the `:image` inline type."
  @spec image() :: t()
  def image, do: @image

  @doc "Returns the `:span` inline type."
  @spec span() :: t()
  def span, do: @span

  @doc "Returns the `:math` inline type."
  @spec math() :: t()
  def math, do: @math

  @doc "Returns the `:raw_inline` inline type."
  @spec raw_inline() :: t()
  def raw_inline, do: @raw_inline

  @doc "Returns the `:footnote_ref` inline type."
  @spec footnote_ref() :: t()
  def footnote_ref, do: @footnote_ref

  @doc "Returns the `:symbol` inline type."
  @spec symbol() :: t()
  def symbol, do: @symbol
end
|
||||
32
packages/elixir/lib/kreuzberg/jats_metadata.ex
generated
Normal file
32
packages/elixir/lib/kreuzberg/jats_metadata.ex
generated
Normal file
@@ -0,0 +1,32 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.JatsMetadata do
  @moduledoc "Metadata for JATS (Journal Article Tag Suite) documents."

  defstruct copyright: nil, license: nil, history_dates: %{}, contributor_roles: []

  @typedoc "JATS (Journal Article Tag Suite) metadata."
  @type t :: %__MODULE__{
          copyright: String.t() | nil,
          license: String.t() | nil,
          history_dates: map(),
          contributor_roles: [map()]
        }

  defimpl Jason.Encoder do
    # Encodes the struct as a JSON object, leaving out every field whose value is nil.
    @doc false
    def encode(value, opts) do
      present = for {key, val} <- Map.from_struct(value), not is_nil(val), into: %{}, do: {key, val}

      Jason.Encoder.encode(present, opts)
    end
  end
end
|
||||
21
packages/elixir/lib/kreuzberg/keyword.ex
generated
Normal file
21
packages/elixir/lib/kreuzberg/keyword.ex
generated
Normal file
@@ -0,0 +1,21 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.Keyword do
  @moduledoc """
  Extracted keyword with metadata.

  A freshly built struct has `algorithm` set to the atom `:yake`, `score` set to
  `0.0`, and `text` and `positions` set to `nil`.
  """

  @typedoc """
  Extracted keyword with metadata.

  `algorithm` accepts atoms as well as strings because the struct default is the
  atom `:yake`.
  """
  @type t :: %__MODULE__{
          text: String.t() | nil,
          score: float(),
          algorithm: atom() | String.t() | nil,
          positions: [non_neg_integer()] | nil
        }

  defstruct text: nil,
            score: 0.0,
            algorithm: :yake,
            positions: nil
end
|
||||
21
packages/elixir/lib/kreuzberg/keyword_algorithm.ex
generated
Normal file
21
packages/elixir/lib/kreuzberg/keyword_algorithm.ex
generated
Normal file
@@ -0,0 +1,21 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.KeywordAlgorithm do
  @moduledoc """
  Keyword algorithm selection.

  Each variant is a plain atom returned by the matching zero-arity function.
  """

  @typedoc "Keyword algorithm selection."
  @type t :: :yake | :rake

  @doc "Returns `:yake` - YAKE (Yet Another Keyword Extractor), a statistical approach."
  @spec yake() :: t()
  def yake do
    :yake
  end

  @doc "Returns `:rake` - RAKE (Rapid Automatic Keyword Extraction), co-occurrence based."
  @spec rake() :: t()
  def rake do
    :rake
  end
end
|
||||
38
packages/elixir/lib/kreuzberg/keyword_config.ex
generated
Normal file
38
packages/elixir/lib/kreuzberg/keyword_config.ex
generated
Normal file
@@ -0,0 +1,38 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
defmodule Kreuzberg.KeywordConfig do
  @moduledoc """
  Keyword extraction configuration.

  A freshly built struct selects the `:yake` algorithm, allows up to 10 keywords,
  uses a minimum score of `0.0`, has an empty `ngram_range`, and leaves `language`,
  `yake_params`, and `rake_params` as `nil`.
  """

  @typedoc """
  Keyword extraction configuration.

  `algorithm` accepts atoms as well as strings because the struct default is the
  atom `:yake`.
  """
  @type t :: %__MODULE__{
          algorithm: atom() | String.t() | nil,
          max_keywords: non_neg_integer(),
          min_score: float(),
          ngram_range: [non_neg_integer()],
          language: String.t() | nil,
          yake_params: map() | nil,
          rake_params: map() | nil
        }

  # `min_score` defaults to the float 0.0 (not the integer 0) so the default value
  # satisfies the `float()` type declared above.
  defstruct algorithm: :yake,
            max_keywords: 10,
            min_score: 0.0,
            ngram_range: [],
            language: nil,
            yake_params: nil,
            rake_params: nil

  defimpl Jason.Encoder do
    # Encodes the struct as a JSON object, leaving out every field whose value is nil.
    @doc false
    def encode(value, opts) do
      value
      |> Map.from_struct()
      |> Enum.reject(fn {_k, v} -> is_nil(v) end)
      |> Map.new()
      |> Jason.Encoder.encode(opts)
    end
  end
end
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user