Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

70
packages/kotlin-android/.editorconfig generated Normal file
View File

@@ -0,0 +1,70 @@
# Generated by alef. Do not edit by hand.
root = true
[*]
charset = utf-8
end_of_line = lf
insert_final_newline = true
trim_trailing_whitespace = true
[*.kt]
indent_style = space
indent_size = 4
# Java packages derived from hyphenated GitHub orgs (e.g. `sample-org` →
# `com.github.sample_org`) carry underscores; allow them for the
# co-located Java facade and its Kotlin wrappers.
ktlint_standard_package-name = disabled
# ktfmt and ktlint disagree on class/function/parameter splitting heuristics
# and continuation indent. ktfmt is the canonical formatter for the emitted
# code, so we disable the ktlint rules that conflict.
ktlint_standard_class-signature = disabled
ktlint_standard_function-signature = disabled
ktlint_standard_function-expression-body = disabled
ktlint_standard_no-empty-class-body = disabled
ktlint_standard_no-empty-first-line-in-method-block = disabled
ktlint_standard_indent = disabled
# `string-template-indent` depends on `indent`; must be disabled together.
ktlint_standard_string-template-indent = disabled
ktlint_standard_filename = disabled
# ktfmt collapses `fun foo(): T = callbackFlow { ... }` to one line for
# expression-bodied functions; ktlint's multiline-expression-wrapping then
# rejects it. Defer to ktfmt — disable the ktlint rule for the generated tree.
ktlint_standard_multiline-expression-wrapping = disabled
# ktlint --format and ktfmt fight on chained-call layout (`.foo()\n.bar()`
# vs `.foo().bar()`) and on multi-line `if/else` bodies. ktfmt is the
# canonical formatter for emitted code, so disable the ktlint counter-rules.
ktlint_standard_chain-method-continuation = disabled
ktlint_standard_multiline-if-else = disabled
# ktfmt strips trailing commas before `)` on multiline call sites; ktlint
# `trailing-comma-on-call-site` demands them. The two fight in a loop.
# Defer to ktfmt — disable the ktlint rule.
ktlint_standard_trailing-comma-on-call-site = disabled
ktlint_standard_trailing-comma-on-declaration-site = disabled
[*.gradle.kts]
indent_style = space
indent_size = 4
# ktfmt and ktlint disagree on class/function/parameter splitting heuristics
# and continuation indent. ktfmt is the canonical formatter for the emitted
# code, so we disable the ktlint rules that conflict.
ktlint_standard_class-signature = disabled
ktlint_standard_function-signature = disabled
ktlint_standard_function-expression-body = disabled
ktlint_standard_no-empty-class-body = disabled
ktlint_standard_no-empty-first-line-in-method-block = disabled
ktlint_standard_indent = disabled
# `string-template-indent` depends on `indent`; must be disabled together.
ktlint_standard_string-template-indent = disabled
ktlint_standard_filename = disabled
ktlint_standard_multiline-expression-wrapping = disabled
ktlint_standard_chain-method-continuation = disabled
ktlint_standard_multiline-if-else = disabled
# ktfmt strips trailing commas before `)` on multiline call sites; ktlint
# `trailing-comma-on-call-site` demands them. The two fight in a loop.
# Defer to ktfmt — disable the ktlint rule.
ktlint_standard_trailing-comma-on-call-site = disabled
ktlint_standard_trailing-comma-on-declaration-site = disabled
[*.{xml,pro,gitignore}]
indent_style = space
indent_size = 2

6
packages/kotlin-android/.gitignore generated vendored Normal file
View File

@@ -0,0 +1,6 @@
# Generated by alef. Do not edit by hand.
build/
.gradle/
.idea/
*.iml
local.properties

93
packages/kotlin-android/LICENSE generated Normal file
View File

@@ -0,0 +1,93 @@
Elastic License 2.0 (ELv2)
Copyright 2025-2026 Kreuzberg, Inc.
Acceptance
By using the software, you agree to all of the terms and conditions below.
Copyright License
The licensor grants you a non-exclusive, royalty-free, worldwide,
non-sublicensable, non-transferable license to use, copy, distribute, make
available, and prepare derivative works of the software, in each case subject to
the limitations and conditions below.
Limitations
You may not provide the software to third parties as a hosted or managed
service, where the service provides users with access to any substantial set of
the features or functionality of the software.
You may not move, change, disable, or circumvent the license key functionality
in the software, and you may not remove or obscure any functionality in the
software that is protected by the license key.
You may not alter, remove, or obscure any licensing, copyright, or other notices
of the licensor in the software. Any use of the licensor's trademarks is subject
to applicable law.
Patents
The licensor grants you a license, under any patent claims the licensor can
license, or becomes able to license, to make, have made, use, sell, offer for
sale, import and have imported the software, in each case subject to the
limitations and conditions in this license. This license does not cover any
patent claims that you cause to be infringed by modifications or additions to the
software. If you or your company make any written claim that the software
infringes or contributes to infringement of any patent, your patent license for
the software granted under these terms ends immediately. If your company makes
such a claim, your patent license ends immediately for work on behalf of your
company.
Notices
You must ensure that anyone who gets a copy of any part of the software from you
also gets a copy of these terms.
If you modify the software, you must include in any modified copies of the
software prominent notices stating that you have modified the software.
No Other Rights
These terms do not imply any licenses other than those expressly granted in
these terms.
Termination
If you use the software in violation of these terms, such use is not licensed,
and your licenses will automatically terminate. If the licensor provides you with
a notice of your violation, and you cease all violation of this license no later
than 30 days after you receive that notice, your licenses will be reinstated
retroactively. However, if you violate these terms after such reinstatement, any
additional violation of these terms will cause your licenses to terminate
automatically and permanently.
No Liability
As far as the law allows, the software comes as is, without any warranty or
condition, and the licensor will not be liable to you for any damages arising out
of these terms or the use or nature of the software, under any kind of legal
claim.
Definitions
The licensor is the entity offering these terms, and the software is the
software the licensor makes available under these terms, including any portion
of it.
you refers to the individual or entity agreeing to these terms.
your company is any legal entity, sole proprietorship, or other kind of
organization that you work for, plus all organizations that have control over,
are under the control of, or are under common control with that organization.
control means ownership of substantially all the assets of an entity, or the
power to direct its management and policies by vote, contract, or otherwise.
Control can be direct or indirect.
your licenses are all the licenses granted to you for the software under these
terms.
use means anything you do with the software requiring one of your licenses.
trademark means trademarks, service marks, and similar rights.

328
packages/kotlin-android/README.md generated Normal file
View File

@@ -0,0 +1,328 @@
# Kotlin (Android)
<div align="center" style="display: flex; flex-wrap: wrap; gap: 8px; justify-content: center; margin: 20px 0;">
<a href="https://github.com/kreuzberg-dev/alef">
<img src="https://img.shields.io/badge/Bindings-alef%20%D7%90-007ec6" alt="Bindings">
</a>
<!-- Language Bindings -->
<a href="https://crates.io/crates/kreuzberg">
<img src="https://img.shields.io/crates/v/kreuzberg?label=Rust&color=007ec6" alt="Rust">
</a>
<a href="https://pypi.org/project/kreuzberg/">
<img src="https://img.shields.io/pypi/v/kreuzberg?label=Python&color=007ec6" alt="Python">
</a>
<a href="https://www.npmjs.com/package/@kreuzberg/node">
<img src="https://img.shields.io/npm/v/@kreuzberg/node?label=Node.js&color=007ec6" alt="Node.js">
</a>
<a href="https://www.npmjs.com/package/@kreuzberg/wasm">
<img src="https://img.shields.io/npm/v/@kreuzberg/wasm?label=WASM&color=007ec6" alt="WASM">
</a>
<a href="https://central.sonatype.com/artifact/dev.kreuzberg/kreuzberg">
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
</a>
<a href="https://github.com/kreuzberg-dev/kreuzberg/tree/main/packages/go/v5">
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v5*" alt="Go">
</a>
<a href="https://www.nuget.org/packages/Kreuzberg/">
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
</a>
<a href="https://packagist.org/packages/kreuzberg/kreuzberg">
<img src="https://img.shields.io/packagist/v/kreuzberg/kreuzberg?label=PHP&color=007ec6" alt="PHP">
</a>
<a href="https://rubygems.org/gems/kreuzberg">
<img src="https://img.shields.io/gem/v/kreuzberg?label=Ruby&color=007ec6" alt="Ruby">
</a>
<a href="https://hex.pm/packages/kreuzberg">
<img src="https://img.shields.io/hexpm/v/kreuzberg?label=Elixir&color=007ec6" alt="Elixir">
</a>
<a href="https://kreuzberg-dev.r-universe.dev/kreuzberg">
<img src="https://img.shields.io/badge/R-kreuzberg-007ec6" alt="R">
</a>
<a href="https://pub.dev/packages/kreuzberg">
<img src="https://img.shields.io/pub/v/kreuzberg?label=Dart&color=007ec6" alt="Dart">
</a>
<a href="https://central.sonatype.com/artifact/dev.kreuzberg/kreuzberg-android">
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg-android?label=Kotlin&color=007ec6" alt="Kotlin">
</a>
<a href="https://github.com/kreuzberg-dev/kreuzberg/tree/main/packages/swift">
<img src="https://img.shields.io/badge/Swift-SPM-007ec6" alt="Swift">
</a>
<a href="https://github.com/kreuzberg-dev/kreuzberg/tree/main/packages/zig">
<img src="https://img.shields.io/badge/Zig-package-007ec6" alt="Zig">
</a>
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
<img src="https://img.shields.io/badge/C-FFI-007ec6" alt="C FFI">
</a>
<a href="https://github.com/kreuzberg-dev/kreuzberg/pkgs/container/kreuzberg">
<img src="https://img.shields.io/badge/Docker-ghcr.io-007ec6?logo=docker&logoColor=white" alt="Docker">
</a>
<a href="https://github.com/kreuzberg-dev/kreuzberg/pkgs/container/charts%2Fkreuzberg">
<img src="https://img.shields.io/badge/Helm-ghcr.io-007ec6?logo=helm&logoColor=white" alt="Helm">
</a>
<!-- Project Info -->
<a href="https://github.com/kreuzberg-dev/kreuzberg/blob/main/LICENSE">
<img src="https://img.shields.io/badge/License-Elastic--2.0-007ec6" alt="License">
</a>
<a href="https://docs.kreuzberg.dev">
<img src="https://img.shields.io/badge/Docs-kreuzberg-007ec6" alt="Documentation">
</a>
<a href="https://huggingface.co/Kreuzberg">
<img src="https://img.shields.io/badge/Hugging%20Face-Kreuzberg-007ec6" alt="Hugging Face">
</a>
</div>
<div align="center" style="margin: 24px 0 0;">
<a href="https://kreuzberg.dev">
<img alt="Kreuzberg" src="https://github.com/user-attachments/assets/419fc06c-8313-4324-b159-4b4d3cfce5c0" />
</a>
</div>
<div align="center" style="display: flex; flex-wrap: wrap; gap: 12px; justify-content: center; margin: 28px 0 24px;">
<a href="https://discord.gg/xt9WY3GnKR">
<img height="22" src="https://img.shields.io/badge/Discord-Chat-007ec6?logo=discord&logoColor=white" alt="Join Discord">
</a>
<a href="https://docs.kreuzberg.dev/demo.html">
<img height="22" src="https://img.shields.io/badge/Live%20Demo-Open-007ec6?logo=webassembly&logoColor=white" alt="Live Demo">
</a>
</div>
Extract text, tables, images, and metadata from 90+ file formats and 300+ programming languages including PDF, Office documents, and images. Android library (AAR) with bundled jniLibs/arm64-v8a and jniLibs/x86_64 — Gradle automatically picks up the native cdylib for emulator and device builds. Server-side Kotlin/JVM consumers can use the Java binding directly via standard Kotlin/Java interop.
## What This Package Provides
- **Document intelligence core** — extract text, tables, images, metadata, entities, keywords, and code intelligence from one API.
- **Format coverage** — PDF, Office, images, HTML/XML, email, archives, notebooks, citations, scientific formats, and plain text.
- **OCR choices** — Tesseract, PaddleOCR, EasyOCR where supported, VLM OCR through liter-llm, and plugin hooks for custom backends.
- **Same engine as every binding** — Rust, Python, Node.js, Go, Java, PHP, Ruby, .NET, Elixir, R, WASM, Kotlin Android, Swift, Dart, Zig, and C FFI share the same Rust implementation.
- **Android AAR** — JNI-backed package for mobile extraction workloads.
## Installation
### Package Installation
Kotlin DSL (`build.gradle.kts`):
```kotlin
implementation("dev.kreuzberg:kreuzberg-android:5.0.0-rc.3")
```
Groovy DSL (`build.gradle`):
```groovy
implementation 'dev.kreuzberg:kreuzberg-android:5.0.0-rc.3'
```
Add to your `pom.xml`:
```xml
<dependency>
<groupId>dev.kreuzberg</groupId>
<artifactId>kreuzberg-android</artifactId>
<version>5.0.0-rc.3</version>
</dependency>
```
### System Requirements
- See [Installation Guide](https://docs.kreuzberg.dev/getting-started/installation/) for requirements
## Quick Start
### Basic Extraction
Extract text, metadata, and structure from any supported document format:
<!-- snippet not found: api/extract_file_sync.md -->
### Common Use Cases
#### Extract with Custom Configuration
Most use cases benefit from configuration to control extraction behavior:
**With OCR (for scanned documents):**
<!-- snippet not found: ocr/ocr_extraction.md -->
#### Table Extraction
See [Configuration Guide](https://docs.kreuzberg.dev/guides/configuration/) for table extraction options.
#### Processing Multiple Files
<!-- snippet not found: api/batch_extract_files_sync.md -->
#### Async Processing
For non-blocking document processing:
<!-- snippet not found: api/extract_file_async.md -->
### Next Steps
- **[Installation Guide](https://docs.kreuzberg.dev/getting-started/installation/)** - Platform-specific setup
- **[API Documentation](https://docs.kreuzberg.dev/reference/api-python/)** - Complete API reference
- **[Examples & Guides](https://docs.kreuzberg.dev/)** - Full code examples and usage guides
- **[Configuration Guide](https://docs.kreuzberg.dev/guides/configuration/)** - Advanced configuration options
## Features
### Supported File Formats (90+)
90+ file formats across 8 major categories with intelligent format detection and comprehensive metadata extraction.
#### Office Documents
| Category | Formats | Capabilities |
|----------|---------|--------------|
| **Word Processing** | `.docx`, `.docm`, `.dotx`, `.dotm`, `.dot`, `.odt` | Full text, tables, images, metadata, styles |
| **Spreadsheets** | `.xlsx`, `.xlsm`, `.xlsb`, `.xls`, `.xla`, `.xlam`, `.xltm`, `.xltx`, `.xlt`, `.ods` | Sheet data, formulas, cell metadata, charts |
| **Presentations** | `.pptx`, `.pptm`, `.ppsx`, `.potx`, `.potm`, `.pot`, `.ppt` | Slides, speaker notes, images, metadata |
| **PDF** | `.pdf` | Text, tables, images, metadata, OCR support |
| **eBooks** | `.epub`, `.fb2` | Chapters, metadata, embedded resources |
| **Database** | `.dbf` | Table data extraction, field type support |
| **Hangul** | `.hwp`, `.hwpx` | Korean document format, text extraction |
#### Images (OCR-Enabled)
| Category | Formats | Features |
|----------|---------|----------|
| **Raster** | `.png`, `.jpg`, `.jpeg`, `.gif`, `.webp`, `.bmp`, `.tiff`, `.tif` | OCR, table detection, EXIF metadata, dimensions, color space |
| **Advanced** | `.jp2`, `.jpx`, `.jpm`, `.mj2`, `.jbig2`, `.jb2`, `.pnm`, `.pbm`, `.pgm`, `.ppm` | OCR via hayro-jpeg2000 (pure Rust decoder), JBIG2 support, table detection, format-specific metadata |
| **Vector** | `.svg` | DOM parsing, embedded text, graphics metadata |
#### Web & Data
| Category | Formats | Features |
|----------|---------|----------|
| **Markup** | `.html`, `.htm`, `.xhtml`, `.xml`, `.svg` | DOM parsing, metadata (Open Graph, Twitter Card), link extraction |
| **Structured Data** | `.json`, `.yaml`, `.yml`, `.toml`, `.csv`, `.tsv` | Schema detection, nested structures, validation |
| **Text & Markdown** | `.txt`, `.md`, `.markdown`, `.djot`, `.rst`, `.org`, `.rtf` | CommonMark, GFM, Djot, reStructuredText, Org Mode |
#### Email & Archives
| Category | Formats | Features |
|----------|---------|----------|
| **Email** | `.eml`, `.msg` | Headers, body (HTML/plain), attachments, threading |
| **Archives** | `.zip`, `.tar`, `.tgz`, `.gz`, `.7z` | File listing, nested archives, metadata |
#### Academic & Scientific
| Category | Formats | Features |
|----------|---------|----------|
| **Citations** | `.bib`, `.biblatex`, `.ris`, `.nbib`, `.enw`, `.csl` | Structured parsing: RIS (structured), PubMed/MEDLINE, EndNote XML (structured), BibTeX, CSL JSON |
| **Scientific** | `.tex`, `.latex`, `.typst`, `.jats`, `.ipynb`, `.docbook` | LaTeX, Jupyter notebooks, PubMed JATS |
| **Documentation** | `.opml`, `.pod`, `.mdoc`, `.troff` | Technical documentation formats |
#### Code Intelligence (300+ Languages)
| Feature | Description |
|---------|-------------|
| **Structure Extraction** | Functions, classes, methods, structs, interfaces, enums |
| **Import/Export Analysis** | Module dependencies, re-exports, wildcard imports |
| **Symbol Extraction** | Variables, constants, type aliases, properties |
| **Docstring Parsing** | Google, NumPy, Sphinx, JSDoc, RustDoc, and 10+ formats |
| **Diagnostics** | Parse errors with line/column positions |
| **Syntax-Aware Chunking** | Split code by semantic boundaries, not arbitrary byte offsets |
Powered by [tree-sitter-language-pack](https://github.com/kreuzberg-dev/tree-sitter-language-pack) — [documentation](https://docs.tree-sitter-language-pack.kreuzberg.dev).
**[Complete Format Reference](https://docs.kreuzberg.dev/reference/formats/)**
### Key Capabilities
- **Text Extraction** - Extract all text content with position and formatting information
- **Metadata Extraction** - Retrieve document properties, creation date, author, etc.
- **Table Extraction** - Parse tables with structure and cell content preservation
- **Image Extraction** - Extract embedded images and render page previews
- **OCR Support** - Integrate multiple OCR backends for scanned documents
- **Async/Await** - Non-blocking document processing with concurrent operations
- **Plugin System** - Extensible post-processing for custom text transformation
- **Embeddings** - Generate vector embeddings using ONNX Runtime models
- **Batch Processing** - Efficiently process multiple documents in parallel
- **Memory Efficient** - Stream large files without loading entirely into memory
- **Language Detection** - Detect and support multiple languages in documents
- **Code Intelligence** - Extract structure, imports, exports, symbols, and docstrings from [300+ programming languages](https://docs.tree-sitter-language-pack.kreuzberg.dev) via tree-sitter
- **Configuration** - Fine-grained control over extraction behavior
### Performance Characteristics
| Format | Speed | Memory | Notes |
|--------|-------|--------|-------|
| **PDF (text)** | 10-100 MB/s | ~50MB per doc | Fastest extraction |
| **Office docs** | 20-200 MB/s | ~100MB per doc | DOCX, XLSX, PPTX |
| **Images (OCR)** | 1-5 MB/s | Variable | Depends on OCR backend |
| **Archives** | 5-50 MB/s | ~200MB per doc | ZIP, TAR, etc. |
| **Web formats** | 50-200 MB/s | Streaming | HTML, XML, JSON |
## OCR Support
Kreuzberg supports multiple OCR backends for extracting text from scanned documents and images:
- **Tesseract**
- **PaddleOCR**
### OCR Configuration Example
<!-- snippet not found: ocr/ocr_extraction.md -->
## Async Support
This binding exposes non-blocking document processing through Kotlin `suspend` functions and `Flow` wrappers (kotlinx.coroutines):
<!-- snippet not found: api/extract_file_async.md -->
## Plugin System
Kreuzberg supports extensible post-processing plugins for custom text transformation and filtering.
For detailed plugin documentation, visit [Plugin System Guide](https://docs.kreuzberg.dev/guides/plugins/).
## Embeddings Support
Generate vector embeddings for extracted text using the built-in ONNX Runtime support. Requires ONNX Runtime installation.
**[Embeddings Guide](https://docs.kreuzberg.dev/features/#embeddings)**
## Batch Processing
Process multiple documents efficiently:
<!-- snippet not found: api/batch_extract_files_sync.md -->
## Configuration
For advanced configuration options including language detection, table extraction, OCR settings, and more:
**[Configuration Guide](https://docs.kreuzberg.dev/guides/configuration/)**
## Documentation
- **[Official Documentation](https://docs.kreuzberg.dev/)**
- **[API Reference](https://docs.kreuzberg.dev/reference/api-python/)**
- **[Examples & Guides](https://docs.kreuzberg.dev/)**
## Contributing
Contributions are welcome! See [Contributing Guide](https://github.com/kreuzberg-dev/kreuzberg/blob/main/CONTRIBUTING.md).
## Part of Kreuzberg.dev
- [Kreuzberg Cloud](https://github.com/kreuzberg-dev/kreuzberg-cloud) — managed extraction API with SDKs, dashboards, and observability.
- [kreuzcrawl](https://github.com/kreuzberg-dev/kreuzcrawl) — web crawling and scraping with HTML→Markdown and headless-Chrome fallback.
- [html-to-markdown](https://github.com/kreuzberg-dev/html-to-markdown) — fast, lossless HTML→Markdown engine.
- [liter-llm](https://github.com/kreuzberg-dev/liter-llm) — universal LLM API client with native bindings for 14 languages and 143 providers.
- [tree-sitter-language-pack](https://github.com/kreuzberg-dev/tree-sitter-language-pack) — tree-sitter grammars and code-intelligence primitives.
- [alef](https://github.com/kreuzberg-dev/alef) — the polyglot binding generator that produces this README and all per-language bindings.
- [Discord](https://discord.gg/xt9WY3GnKR) — community, roadmap, announcements.
## License
Elastic-2.0 License — see [LICENSE](../../LICENSE) for details.
## Support
- **Discord Community**: [Join our Discord](https://discord.gg/xt9WY3GnKR)
- **GitHub Issues**: [Report bugs](https://github.com/kreuzberg-dev/kreuzberg/issues)
- **Discussions**: [Ask questions](https://github.com/kreuzberg-dev/kreuzberg/discussions)

110
packages/kotlin-android/build.gradle.kts generated Normal file
View File

@@ -0,0 +1,110 @@
// Generated by alef. Do not edit by hand.
import com.vanniktech.maven.publish.AndroidSingleVariantLibrary
import org.jetbrains.kotlin.gradle.dsl.JvmTarget
// Adds the vanniktech Maven publish plugin artifact to the build script classpath.
// NOTE(review): the same plugin and version are also declared in the `plugins {}` block of this
// script, which normally makes the plugin classes available to the script — confirm whether
// this explicit classpath entry is still needed.
buildscript {
dependencies {
classpath("com.vanniktech:gradle-maven-publish-plugin:0.36.0")
}
}
// Gradle plugins applied to this module, each pinned to an exact version:
// Android library build, Kotlin for Android, Maven publishing, ktlint linting,
// and the ben-manes dependency-versions plugin.
plugins {
id("com.android.library") version "8.13.0"
kotlin("android") version "2.3.21"
id("com.vanniktech.maven.publish") version "0.36.0"
id("org.jlleitschuh.gradle.ktlint") version "13.1.0"
id("com.github.ben-manes.versions") version "0.52.0"
}
// Android library configuration for the AAR.
android {
// Namespace for the module; matches the Kotlin package `dev.kreuzberg`.
namespace = "dev.kreuzberg"
compileSdk = 35
defaultConfig {
minSdk = 21
// Rules in consumer-rules.pro are shipped with the AAR and applied to consuming apps.
consumerProguardFiles("consumer-rules.pro")
}
// Java sources are compiled at language level 17 (kept in sync with the Kotlin jvmTarget).
compileOptions {
sourceCompatibility = JavaVersion.VERSION_17
targetCompatibility = JavaVersion.VERSION_17
}
sourceSets {
getByName("main") {
// Native libraries bundled into the AAR are read from src/main/jniLibs.
jniLibs.srcDirs("src/main/jniLibs")
}
}
}
// Kotlin compiler emits JVM 17 bytecode, matching the Java 17 compileOptions of the android block.
kotlin {
compilerOptions {
jvmTarget.set(JvmTarget.JVM_17)
}
}
// ktlint plugin settings: pins the ktlint engine version, enables the Android code style,
// and keeps lint violations as build failures (ignoreFailures = false).
ktlint {
version.set("1.8.0")
android.set(true)
ignoreFailures.set(false)
}
// Runtime and test dependencies of the generated Android binding.
dependencies {
implementation("org.jetbrains.kotlin:kotlin-stdlib")
// Generated Kotlin facade uses suspend functions and Flow wrappers, both of
// which require kotlinx-coroutines-android (transitively pulls -core).
implementation("org.jetbrains.kotlinx:kotlinx-coroutines-android:1.11.0")
// Generated sealed-class DTOs use Jackson @JsonDeserialize for polymorphic
// serde-tagged unions; jackson-module-kotlin is required for Kotlin
// data-class deserialization (handles nullable, default values, etc.).
// jackson-datatype-jdk8 is required because the generated DefaultClient.kt
// registers Jdk8Module for Optional<T> / java.util.Optional support.
// NOTE(review): Jackson 2.14+ is documented as requiring Android SDK 26+, while this
// module declares minSdk = 21 — confirm the supported API level before release.
// All three Jackson artifacts are pinned to the same version.
implementation("com.fasterxml.jackson.core:jackson-databind:2.21.3")
implementation("com.fasterxml.jackson.module:jackson-module-kotlin:2.21.3")
implementation("com.fasterxml.jackson.datatype:jackson-datatype-jdk8:2.21.3")
// Local JVM unit tests.
testImplementation("junit:junit:4.13.2")
// Instrumented (on-device/emulator) tests.
androidTestImplementation("androidx.test.ext:junit:1.3.0")
androidTestImplementation("androidx.test.espresso:espresso-core:3.7.0")
}
// Maven Central publication of the AAR: coordinates, signing, and POM metadata.
mavenPublishing {
    // Publish only the `release` variant, with a sources JAR and an empty javadoc JAR.
    configure(AndroidSingleVariantLibrary(
        variant = "release",
        sourcesJar = com.vanniktech.maven.publish.SourcesJar.Sources(),
        javadocJar = com.vanniktech.maven.publish.JavadocJar.Empty(),
    ))
    publishToMavenCentral()
    signAllPublications()
    coordinates(
        groupId = "dev.kreuzberg",
        artifactId = "kreuzberg-android",
        version = "5.0.0-rc.3",
    )
    pom {
        name.set("kreuzberg-android")
        description.set("High-performance document intelligence library")
        url.set("https://github.com/kreuzberg-dev/kreuzberg")
        licenses {
            license {
                name.set("Elastic-2.0")
                url.set("https://www.elastic.co/licensing/elastic-license")
            }
        }
        developers {
            developer {
                // Plain apostrophe: this is a Kotlin string, not XML. A pre-escaped `&apos;`
                // would be escaped again when the POM is written and published literally.
                name.set("Na'aman Hirschfeld")
                email.set("naaman@kreuzberg.dev")
            }
        }
        scm {
            url.set("https://github.com/kreuzberg-dev/kreuzberg")
            connection.set("scm:git:git://github.com/kreuzberg-dev/kreuzberg.git")
            // URL-style `ssh://` takes `/` after the host; `host:path` is the scp-like
            // syntax and is parsed as a port when combined with `ssh://`.
            developerConnection.set("scm:git:ssh://git@github.com/kreuzberg-dev/kreuzberg.git")
        }
    }
}

View File

@@ -0,0 +1,3 @@
# Generated by alef. Do not edit by hand.
# Keep generated public API for consumers using R8/ProGuard.
-keep class dev.kreuzberg.** { *; }

2
packages/kotlin-android/proguard-rules.pro generated vendored Normal file
View File

@@ -0,0 +1,2 @@
# Generated by alef. Do not edit by hand.
# Add project-specific ProGuard rules for this AAR module here.

View File

@@ -0,0 +1,19 @@
// Generated by alef. Do not edit by hand.
// Repositories used to resolve the Gradle plugins referenced by the build scripts.
pluginManagement {
repositories {
google()
mavenCentral()
gradlePluginPortal()
}
}
// Repositories used to resolve project dependencies. FAIL_ON_PROJECT_REPOS makes the build
// fail if a project build script declares repositories of its own.
dependencyResolutionManagement {
repositoriesMode.set(RepositoriesMode.FAIL_ON_PROJECT_REPOS)
repositories {
google()
mavenCentral()
}
}
// Root project name; matches the published artifactId.
rootProject.name = "kreuzberg-android"

View File

@@ -0,0 +1,5 @@
<?xml version="1.0" encoding="utf-8"?>
<!-- Generated by alef. Do not edit by hand. -->
<!-- The package name is supplied by `android.namespace` in build.gradle.kts; Android Gradle
     Plugin 8+ no longer supports the `package` attribute in the source manifest. -->
<manifest xmlns:android="http://schemas.android.com/apk/res/android">
</manifest>

View File

@@ -0,0 +1,36 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Hardware acceleration configuration for ONNX Runtime models.
 *
 * Controls which execution provider (CPU, CoreML, CUDA, TensorRT) is used
 * for inference in layout detection and embedding generation.
 *
 * Both properties have defaults, so `AccelerationConfig()` is equivalent to
 * `AccelerationConfig(ExecutionProviderType.AUTO, 0)`.
 */
data class AccelerationConfig(
/** Execution provider to use for ONNX inference. Defaults to [ExecutionProviderType.AUTO]. */
val provider: ExecutionProviderType = ExecutionProviderType.AUTO,
/** GPU device ID (for CUDA/TensorRT). Ignored for CPU/CoreML/Auto. Defaults to `0`. */
val deviceId: Int = 0,
)

View File

@@ -0,0 +1,172 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Types of inline text annotations.
 *
 * JSON form is a single object whose `annotation_type` field selects the variant; variants
 * that carry data store their properties as sibling fields of that object. Tag values handled
 * by `AnnotationKindDeserializer` in this file: `bold`, `italic`, `underline`, `strikethrough`,
 * `code`, `subscript`, `superscript`, `link`, `highlight`, `color`, `font_size`, `custom`.
 *
 * The data-carrying variants are annotated with `JsonDeserializer.None` / `JsonSerializer.None`.
 * This is intended to opt them out of the custom (de)serializer declared on this sealed class,
 * so that binding a variant's payload uses Jackson's default handling instead of re-entering
 * the sealed-class (de)serializer.
 */
@com.fasterxml.jackson.databind.annotation.JsonDeserialize(using = AnnotationKindDeserializer::class)
@com.fasterxml.jackson.databind.annotation.JsonSerialize(using = AnnotationKindSerializer::class)
sealed class AnnotationKind {
// Variants without data are singletons; only the tag is serialized for them.
object Bold : AnnotationKind()
object Italic : AnnotationKind()
object Underline : AnnotationKind()
object Strikethrough : AnnotationKind()
object Code : AnnotationKind()
object Subscript : AnnotationKind()
object Superscript : AnnotationKind()
/** Link annotation carrying a target [url] and an optional [title] (may be `null`). */
@com.fasterxml.jackson.databind.annotation.JsonDeserialize(using = com.fasterxml.jackson.databind.JsonDeserializer.None::class)
@com.fasterxml.jackson.databind.annotation.JsonSerialize(using = com.fasterxml.jackson.databind.JsonSerializer.None::class)
data class Link(
val url: String,
val title: String?,
) : AnnotationKind()
/** Highlighted text (PDF highlights, HTML `<mark>`). */
object Highlight : AnnotationKind()
/** Text color (CSS-compatible value, e.g. "#ff0000", "red"). */
@com.fasterxml.jackson.databind.annotation.JsonDeserialize(using = com.fasterxml.jackson.databind.JsonDeserializer.None::class)
@com.fasterxml.jackson.databind.annotation.JsonSerialize(using = com.fasterxml.jackson.databind.JsonSerializer.None::class)
data class Color(
val value: String,
) : AnnotationKind()
/** Font size with units (e.g. "12pt", "1.2em", "16px"). */
@com.fasterxml.jackson.databind.annotation.JsonDeserialize(using = com.fasterxml.jackson.databind.JsonDeserializer.None::class)
@com.fasterxml.jackson.databind.annotation.JsonSerialize(using = com.fasterxml.jackson.databind.JsonSerializer.None::class)
data class FontSize(
val value: String,
) : AnnotationKind()
/** Extensible annotation for format-specific styling. [value] may be `null`. */
@com.fasterxml.jackson.databind.annotation.JsonDeserialize(using = com.fasterxml.jackson.databind.JsonDeserializer.None::class)
@com.fasterxml.jackson.databind.annotation.JsonSerialize(using = com.fasterxml.jackson.databind.JsonSerializer.None::class)
data class Custom(
val name: String,
val value: String?,
) : AnnotationKind()
}
/**
 * Jackson deserializer for the internally tagged [AnnotationKind] union.
 *
 * Reads the value as a JSON tree, uses the `annotation_type` field to select the variant,
 * and binds the remaining fields to the variant's data class. Variants without data resolve
 * to their singleton object.
 */
private class AnnotationKindDeserializer : com.fasterxml.jackson.databind.deser.std.StdDeserializer<AnnotationKind>(AnnotationKind::class.java) {
    /**
     * Deserializes one [AnnotationKind] from [parser].
     *
     * @throws com.fasterxml.jackson.databind.exc.MismatchedInputException if the value is not a JSON object.
     * @throws com.fasterxml.jackson.databind.exc.InvalidFormatException if `annotation_type` is absent or unknown.
     */
    @Suppress("LongMethod")
    override fun deserialize(
        parser: com.fasterxml.jackson.core.JsonParser,
        ctx: com.fasterxml.jackson.databind.DeserializationContext,
    ): AnnotationKind {
        // Read as a generic node first: casting straight to ObjectNode would surface a
        // ClassCastException (or NPE on empty input) instead of a Jackson mapping error.
        val tree = parser.codec.readTree<com.fasterxml.jackson.databind.JsonNode>(parser)
        val node = tree as? com.fasterxml.jackson.databind.node.ObjectNode
            ?: throw com.fasterxml.jackson.databind.exc.MismatchedInputException.from(
                parser,
                AnnotationKind::class.java,
                "Expected a JSON object for AnnotationKind",
            )
        val tag = node.get("annotation_type")?.asText()
        return when (tag) {
            "bold" -> AnnotationKind.Bold
            "italic" -> AnnotationKind.Italic
            "underline" -> AnnotationKind.Underline
            "strikethrough" -> AnnotationKind.Strikethrough
            "code" -> AnnotationKind.Code
            "subscript" -> AnnotationKind.Subscript
            "superscript" -> AnnotationKind.Superscript
            "link" -> ctx.readTreeAsValue<AnnotationKind.Link>(payloadOf(node), AnnotationKind.Link::class.java)
            "highlight" -> AnnotationKind.Highlight
            "color" -> ctx.readTreeAsValue<AnnotationKind.Color>(payloadOf(node), AnnotationKind.Color::class.java)
            "font_size" -> ctx.readTreeAsValue<AnnotationKind.FontSize>(payloadOf(node), AnnotationKind.FontSize::class.java)
            "custom" -> ctx.readTreeAsValue<AnnotationKind.Custom>(payloadOf(node), AnnotationKind.Custom::class.java)
            else -> throw com.fasterxml.jackson.databind.exc.InvalidFormatException(
                parser, "Unknown AnnotationKind tag", tag, AnnotationKind::class.java,
            )
        }
    }

    /** Returns a copy of [node] without the `annotation_type` discriminator; built only for variants that carry data. */
    private fun payloadOf(
        node: com.fasterxml.jackson.databind.node.ObjectNode,
    ): com.fasterxml.jackson.databind.node.ObjectNode =
        node.deepCopy().apply { remove("annotation_type") }
}
/**
 * Jackson serializer for [AnnotationKind].
 *
 * Every variant is written as a JSON object carrying an `annotation_type` tag; variants
 * with fields are first converted to a tree and the tag is then added to that tree.
 */
private class AnnotationKindSerializer : com.fasterxml.jackson.databind.ser.std.StdSerializer<AnnotationKind>(AnnotationKind::class.java) {
    /**
     * Writes [value] to [gen] as a tagged JSON object.
     *
     * The generator's codec is used when it is an `ObjectMapper`; otherwise a new mapper
     * with discovered modules is created for this call.
     */
    @Suppress("LongMethod")
    override fun serialize(
        value: AnnotationKind,
        gen: com.fasterxml.jackson.core.JsonGenerator,
        provider: com.fasterxml.jackson.databind.SerializerProvider,
    ) {
        val mapper = (gen.codec as? com.fasterxml.jackson.databind.ObjectMapper)
            ?: com.fasterxml.jackson.databind.ObjectMapper().findAndRegisterModules()

        // Tag-only object for variants without fields.
        fun tagOnly(tag: String): com.fasterxml.jackson.databind.node.ObjectNode =
            mapper.createObjectNode().apply { put("annotation_type", tag) }

        // Variant fields first, tag appended afterwards.
        fun withFields(tag: String): com.fasterxml.jackson.databind.node.ObjectNode =
            mapper.valueToTree<com.fasterxml.jackson.databind.node.ObjectNode>(value).apply { put("annotation_type", tag) }

        val node: com.fasterxml.jackson.databind.node.ObjectNode = when (value) {
            is AnnotationKind.Bold -> tagOnly("bold")
            is AnnotationKind.Italic -> tagOnly("italic")
            is AnnotationKind.Underline -> tagOnly("underline")
            is AnnotationKind.Strikethrough -> tagOnly("strikethrough")
            is AnnotationKind.Code -> tagOnly("code")
            is AnnotationKind.Subscript -> tagOnly("subscript")
            is AnnotationKind.Superscript -> tagOnly("superscript")
            is AnnotationKind.Link -> withFields("link")
            is AnnotationKind.Highlight -> tagOnly("highlight")
            is AnnotationKind.Color -> withFields("color")
            is AnnotationKind.FontSize -> withFields("font_size")
            is AnnotationKind.Custom -> withFields("custom")
        }
        mapper.writeTree(gen, node)
    }
}

View File

@@ -0,0 +1,38 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * One file pulled out of an archive.
 *
 * With recursive extraction enabled, every processable file inside an archive
 * (ZIP, TAR, 7Z, GZIP) yields its own complete `ExtractionResult`.
 *
 * @property path Archive-relative file path (e.g. "folder/document.pdf").
 * @property mimeType Detected MIME type of the file.
 * @property result Full extraction result for this file.
 */
data class ArchiveEntry(
    val path: String,
    val mimeType: String,
    val result: ExtractionResult,
)

View File

@@ -0,0 +1,41 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Metadata describing an archive (ZIP/TAR/7Z).
 *
 * Extracted from compressed archive files; holds the file listing and size information.
 *
 * @property format Archive format ("ZIP", "TAR", "7Z", etc.).
 * @property fileCount Total number of files in the archive.
 * @property fileList File paths contained in the archive.
 * @property totalSize Total uncompressed size in bytes.
 * @property compressedSize Compressed size in bytes, or `null` when not available.
 */
data class ArchiveMetadata(
    val format: String = "",
    val fileCount: Int = 0,
    val fileList: List<String> = emptyList(),
    val totalSize: Long = 0L,
    val compressedSize: Long? = null,
)

View File

@@ -0,0 +1,26 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Bounding box expressed in original image coordinates.
 *
 * @property x1 X coordinate of the top-left corner.
 * @property y1 Y coordinate of the top-left corner.
 * @property x2 X coordinate of the bottom-right corner.
 * @property y2 Y coordinate of the bottom-right corner.
 */
data class BBox(
    val x1: Float,
    val y1: Float,
    val x2: Float,
    val y2: Float,
)

View File

@@ -0,0 +1,38 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Batch item for byte array extraction.
 *
 * Used with `batch_extract_bytes` and `batch_extract_bytes_sync`
 * to represent a single item in a batch extraction job.
 *
 * Equality and hashing compare [content] by its bytes rather than by array identity,
 * so two items holding equal bytes, MIME type and config are equal.
 */
data class BatchBytesItem(
    /** The content bytes to extract from */
    val content: ByteArray,
    /** MIME type of the content (e.g., "application/pdf", "text/html") */
    val mimeType: String,
    /** Per-item configuration overrides (`null` uses batch-level defaults) */
    val config: FileExtractionConfig? = null,
) {
    // The generated data-class equals() would compare the array by reference; compare its contents instead.
    override fun equals(other: Any?): Boolean {
        if (this === other) return true
        if (other !is BatchBytesItem) return false
        return content.contentEquals(other.content) &&
            mimeType == other.mimeType &&
            config == other.config
    }

    // Must agree with equals(): hash the array contents, not the array reference.
    override fun hashCode(): Int {
        var result = content.contentHashCode()
        result = 31 * result + mimeType.hashCode()
        result = 31 * result + (config?.hashCode() ?: 0)
        return result
    }
}

View File

@@ -0,0 +1,38 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
import java.nio.file.Path
/**
 * A single file in a batch extraction job.
 *
 * Used with `batch_extract_files` and `batch_extract_files_sync`.
 *
 * @property path Path to the file to extract from.
 * @property config Per-file configuration overrides; `null` uses batch-level defaults.
 */
data class BatchFileItem(
    val path: java.nio.file.Path,
    val config: FileExtractionConfig? = null,
)

View File

@@ -0,0 +1,33 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Metadata for a BibTeX bibliography.
 *
 * @property entryCount Number of entries in the bibliography.
 * @property citationKeys Defaults to an empty list.
 * @property authors Defaults to an empty list.
 * @property yearRange Defaults to `null`.
 * @property entryTypes Defaults to `null`. NOTE(review): presumably maps an entry type
 *   to a count — confirm against the producer.
 */
data class BibtexMetadata(
    val entryCount: Long = 0L,
    val citationKeys: List<String> = emptyList(),
    val authors: List<String> = emptyList(),
    val yearRange: YearRange? = null,
    val entryTypes: Map<String, Long>? = null,
)

View File

@@ -0,0 +1,103 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/** Kinds of block-level element that can occur in Djot. */
enum class BlockType {
    @com.fasterxml.jackson.annotation.JsonProperty("paragraph")
    PARAGRAPH,

    @com.fasterxml.jackson.annotation.JsonProperty("heading")
    HEADING,

    @com.fasterxml.jackson.annotation.JsonProperty("blockquote")
    BLOCKQUOTE,

    @com.fasterxml.jackson.annotation.JsonProperty("code_block")
    CODE_BLOCK,

    @com.fasterxml.jackson.annotation.JsonProperty("list_item")
    LIST_ITEM,

    @com.fasterxml.jackson.annotation.JsonProperty("ordered_list")
    ORDERED_LIST,

    @com.fasterxml.jackson.annotation.JsonProperty("bullet_list")
    BULLET_LIST,

    @com.fasterxml.jackson.annotation.JsonProperty("task_list")
    TASK_LIST,

    @com.fasterxml.jackson.annotation.JsonProperty("definition_list")
    DEFINITION_LIST,

    @com.fasterxml.jackson.annotation.JsonProperty("definition_term")
    DEFINITION_TERM,

    @com.fasterxml.jackson.annotation.JsonProperty("definition_description")
    DEFINITION_DESCRIPTION,

    @com.fasterxml.jackson.annotation.JsonProperty("div")
    DIV,

    @com.fasterxml.jackson.annotation.JsonProperty("section")
    SECTION,

    @com.fasterxml.jackson.annotation.JsonProperty("thematic_break")
    THEMATIC_BREAK,

    @com.fasterxml.jackson.annotation.JsonProperty("raw_block")
    RAW_BLOCK,

    @com.fasterxml.jackson.annotation.JsonProperty("math_display")
    MATH_DISPLAY;

    /** Returns the wire (JSON) spelling of this constant. */
    @com.fasterxml.jackson.annotation.JsonValue
    fun toWire(): String =
        when (this) {
            PARAGRAPH -> "paragraph"
            HEADING -> "heading"
            BLOCKQUOTE -> "blockquote"
            CODE_BLOCK -> "code_block"
            LIST_ITEM -> "list_item"
            ORDERED_LIST -> "ordered_list"
            BULLET_LIST -> "bullet_list"
            TASK_LIST -> "task_list"
            DEFINITION_LIST -> "definition_list"
            DEFINITION_TERM -> "definition_term"
            DEFINITION_DESCRIPTION -> "definition_description"
            DIV -> "div"
            SECTION -> "section"
            THEMATIC_BREAK -> "thematic_break"
            RAW_BLOCK -> "raw_block"
            MATH_DISPLAY -> "math_display"
        }

    companion object {
        /**
         * Resolves a wire (JSON) spelling back to its constant by matching it against
         * [toWire] of every constant.
         *
         * @throws IllegalArgumentException if [value] matches no constant.
         */
        @com.fasterxml.jackson.annotation.JsonCreator
        @JvmStatic
        fun fromWire(value: String): BlockType =
            values().firstOrNull { candidate -> candidate.toWire() == value }
                ?: throw IllegalArgumentException("Unknown BlockType value: $value")
    }
}

View File

@@ -0,0 +1,35 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Coordinates of a bounding box used for element positioning.
 *
 * @property x0 Left x-coordinate.
 * @property y0 Bottom y-coordinate.
 * @property x1 Right x-coordinate.
 * @property y1 Top y-coordinate.
 */
data class BoundingBox(
    val x0: Double = 0.0,
    val y0: Double = 0.0,
    val x1: Double = 0.0,
    val y1: Double = 0.0,
)

View File

@@ -0,0 +1,31 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Value holder for cache statistics.
 *
 * NOTE(review): the meaning and units of each figure are inferred from the property
 * names only (`Mb`, `Days`) — confirm against the code that produces these values.
 *
 * @property totalFiles File count.
 * @property totalSizeMb Total size; presumably megabytes.
 * @property availableSpaceMb Available space; presumably megabytes.
 * @property oldestFileAgeDays Age of the oldest file; presumably days.
 * @property newestFileAgeDays Age of the newest file; presumably days.
 */
data class CacheStats(
    val totalFiles: Long,
    val totalSizeMb: Double,
    val availableSpaceMb: Double,
    val oldestFileAgeDays: Double,
    val newestFileAgeDays: Double,
)

View File

@@ -0,0 +1,41 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * One changed cell inside a table.
 *
 * Declared here (rather than only in `crate.diff`) so that `RevisionDelta` can refer to
 * it unconditionally, without the `diff` Cargo feature being required. `crate.diff`
 * re-exports this type verbatim.
 *
 * @property row Zero-based row index.
 * @property col Zero-based column index.
 * @property from Value before the change.
 * @property to Value after the change.
 */
data class CellChange(
    val row: Long,
    val col: Long,
    val from: String,
    val to: String,
)

View File

@@ -0,0 +1,51 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * A piece of text together with an optional embedding and its metadata.
 *
 * Chunks exist when chunking is enabled in `ExtractionConfig`. Each one holds its text,
 * an embedding vector when embedding generation is configured, and metadata about where
 * it sits in the document.
 *
 * @property content The text content of this chunk.
 * @property chunkType Semantic structural classification of this chunk, assigned by the
 *   heuristic classifier from content patterns and heading context. Defaults to
 *   `ChunkType.Unknown` when no rule matches.
 * @property embedding Optional embedding vector. Only populated when `EmbeddingConfig` is
 *   provided in the chunking configuration; its dimensionality depends on the chosen
 *   embedding model.
 * @property metadata Metadata about this chunk's position and properties.
 */
data class Chunk(
    val content: String,
    val chunkType: ChunkType,
    val embedding: List<Float>? = null,
    val metadata: ChunkMetadata,
)

View File

@@ -0,0 +1,68 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Describes where a chunk sits in the original document.
 *
 * @property byteStart Byte offset where this chunk starts in the original text
 *   (UTF-8 valid boundary).
 * @property byteEnd Byte offset where this chunk ends in the original text
 *   (UTF-8 valid boundary).
 * @property tokenCount Number of tokens in this chunk, if available. Calculated by the
 *   embedding model's tokenizer when embeddings are enabled.
 * @property chunkIndex Zero-based index of this chunk in the document.
 * @property totalChunks Total number of chunks in the document.
 * @property firstPage First page this chunk spans (1-indexed). Only populated when page
 *   tracking is enabled in the extraction configuration.
 * @property lastPage Last page this chunk spans (1-indexed; equal to `first_page` for
 *   single-page chunks). Only populated when page tracking is enabled in the extraction
 *   configuration.
 * @property headingContext Heading hierarchy this chunk falls under. Only populated when
 *   `ChunkerType.Markdown` is used.
 * @property imageIndices Zero-based indices into the top-level `ExtractionResult.images`
 *   collection for every image whose `page_number` falls within
 *   `[first_page, last_page]`. Empty when image extraction is disabled or the chunk spans
 *   no pages with images.
 */
data class ChunkMetadata(
    val byteStart: Long,
    val byteEnd: Long,
    val tokenCount: Long? = null,
    val chunkIndex: Long,
    val totalChunks: Long,
    val firstPage: Int? = null,
    val lastPage: Int? = null,
    val headingContext: HeadingContext? = null,
    val imageIndices: List<Int> = emptyList(),
)

View File

@@ -0,0 +1,93 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Unit in which chunk size is measured.
 *
 * The default is `Characters` (Unicode character count). With token-based sizing, chunks
 * are sized by token count according to the given tokenizer.
 *
 * Token-based sizing relies on HuggingFace tokenizers loaded at runtime; any tokenizer
 * published on HuggingFace Hub may be used, including OpenAI-compatible ones
 * (e.g., `Xenova/gpt-4o`, `Xenova/cl100k_base`).
 */
@com.fasterxml.jackson.databind.annotation.JsonDeserialize(using = ChunkSizingDeserializer::class)
@com.fasterxml.jackson.databind.annotation.JsonSerialize(using = ChunkSizingSerializer::class)
sealed class ChunkSizing {
    /** Size measured in Unicode characters (default). */
    object Characters : ChunkSizing()

    /**
     * Size measured in tokens from a HuggingFace tokenizer.
     *
     * The two annotations reset this variant to Jackson's standard handling instead of
     * the custom (de)serializers declared on [ChunkSizing].
     */
    @com.fasterxml.jackson.databind.annotation.JsonDeserialize(using = com.fasterxml.jackson.databind.JsonDeserializer.None::class)
    @com.fasterxml.jackson.databind.annotation.JsonSerialize(using = com.fasterxml.jackson.databind.JsonSerializer.None::class)
    data class Tokenizer(
        val model: String,
        val cacheDir: java.nio.file.Path?,
    ) : ChunkSizing()
}
/**
 * Jackson deserializer for [ChunkSizing], which is encoded as a JSON object whose `type`
 * field selects the variant.
 */
private class ChunkSizingDeserializer : com.fasterxml.jackson.databind.deser.std.StdDeserializer<ChunkSizing>(ChunkSizing::class.java) {
    /**
     * Reads one [ChunkSizing] from [parser].
     *
     * @param parser parser positioned on the value to read.
     * @param ctx context used to bind the `tokenizer` variant.
     * @return [ChunkSizing.Characters] for `"characters"`, a [ChunkSizing.Tokenizer] for `"tokenizer"`.
     * @throws com.fasterxml.jackson.databind.exc.MismatchedInputException if the value is not a JSON object.
     * @throws com.fasterxml.jackson.databind.exc.InvalidFormatException if the tag is absent or not recognised.
     */
    @Suppress("LongMethod")
    override fun deserialize(
        parser: com.fasterxml.jackson.core.JsonParser,
        ctx: com.fasterxml.jackson.databind.DeserializationContext,
    ): ChunkSizing {
        // Read as a generic tree first: asking readTree for an ObjectNode directly would fail
        // with a raw ClassCastException on scalar/array input instead of a Jackson mapping error.
        val tree = parser.codec.readTree<com.fasterxml.jackson.databind.JsonNode>(parser)
        val node = tree as? com.fasterxml.jackson.databind.node.ObjectNode
            ?: throw com.fasterxml.jackson.databind.exc.MismatchedInputException.from(
                parser, ChunkSizing::class.java, "Expected a JSON object for ChunkSizing",
            )
        val tag = node.get("type")?.asText()
        // Bind the variant from a copy without the discriminator so it is not treated as a property.
        @Suppress("UNCHECKED_CAST")
        val payload = (node.deepCopy() as com.fasterxml.jackson.databind.node.ObjectNode).apply { remove("type") }
        return when (tag) {
            "characters" -> ChunkSizing.Characters
            "tokenizer" -> ctx.readTreeAsValue<ChunkSizing.Tokenizer>(payload, ChunkSizing.Tokenizer::class.java)
            else -> throw com.fasterxml.jackson.databind.exc.InvalidFormatException(
                parser, "Unknown ChunkSizing tag", tag, ChunkSizing::class.java,
            )
        }
    }
}
/**
 * Jackson serializer for [ChunkSizing].
 *
 * Each variant is written as a JSON object carrying a `type` tag; the tokenizer variant
 * is first converted to a tree and the tag is then added to that tree.
 */
private class ChunkSizingSerializer : com.fasterxml.jackson.databind.ser.std.StdSerializer<ChunkSizing>(ChunkSizing::class.java) {
    /**
     * Writes [value] to [gen] as a tagged JSON object.
     *
     * The generator's codec is used when it is an `ObjectMapper`; otherwise a new mapper
     * with discovered modules is created for this call.
     */
    @Suppress("LongMethod")
    override fun serialize(
        value: ChunkSizing,
        gen: com.fasterxml.jackson.core.JsonGenerator,
        provider: com.fasterxml.jackson.databind.SerializerProvider,
    ) {
        val mapper = (gen.codec as? com.fasterxml.jackson.databind.ObjectMapper)
            ?: com.fasterxml.jackson.databind.ObjectMapper().findAndRegisterModules()
        val node: com.fasterxml.jackson.databind.node.ObjectNode = when (value) {
            is ChunkSizing.Characters ->
                mapper.createObjectNode().apply { put("type", "characters") }
            is ChunkSizing.Tokenizer ->
                mapper.valueToTree<com.fasterxml.jackson.databind.node.ObjectNode>(value).apply { put("type", "tokenizer") }
        }
        mapper.writeTree(gen, node)
    }
}

View File

@@ -0,0 +1,110 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Semantic structural class of a text chunk.
 *
 * Assigned by the heuristic classifier in `chunking.classifier`; `Unknown` is used when
 * no rule matches. Designed to be extended in future versions without breaking changes.
 */
enum class ChunkType {
    /** Section heading or document title. */
    @com.fasterxml.jackson.annotation.JsonProperty("heading")
    HEADING,

    /** Party list: names, addresses, and signatories. */
    @com.fasterxml.jackson.annotation.JsonProperty("party_list")
    PARTY_LIST,

    /** Definition clause ("X means…", "X shall mean…"). */
    @com.fasterxml.jackson.annotation.JsonProperty("definitions")
    DEFINITIONS,

    /** Operative clause containing legal/contractual action verbs. */
    @com.fasterxml.jackson.annotation.JsonProperty("operative_clause")
    OPERATIVE_CLAUSE,

    /** Signature block with signatures, names, and dates. */
    @com.fasterxml.jackson.annotation.JsonProperty("signature_block")
    SIGNATURE_BLOCK,

    /** Schedule, annex, appendix, or exhibit section. */
    @com.fasterxml.jackson.annotation.JsonProperty("schedule")
    SCHEDULE,

    /** Table-like content with aligned columns or repeated patterns. */
    @com.fasterxml.jackson.annotation.JsonProperty("table_like")
    TABLE_LIKE,

    /** Mathematical formula or equation. */
    @com.fasterxml.jackson.annotation.JsonProperty("formula")
    FORMULA,

    /** Code block or preformatted content. */
    @com.fasterxml.jackson.annotation.JsonProperty("code_block")
    CODE_BLOCK,

    /** Embedded or referenced image content. */
    @com.fasterxml.jackson.annotation.JsonProperty("image")
    IMAGE,

    /** Organizational chart or hierarchy diagram. */
    @com.fasterxml.jackson.annotation.JsonProperty("org_chart")
    ORG_CHART,

    /** Diagram, figure, or visual illustration. */
    @com.fasterxml.jackson.annotation.JsonProperty("diagram")
    DIAGRAM,

    /** Unclassified or mixed content. */
    @com.fasterxml.jackson.annotation.JsonProperty("unknown")
    UNKNOWN;

    /** Returns the wire (JSON) spelling of this constant. */
    @com.fasterxml.jackson.annotation.JsonValue
    fun toWire(): String =
        when (this) {
            HEADING -> "heading"
            PARTY_LIST -> "party_list"
            DEFINITIONS -> "definitions"
            OPERATIVE_CLAUSE -> "operative_clause"
            SIGNATURE_BLOCK -> "signature_block"
            SCHEDULE -> "schedule"
            TABLE_LIKE -> "table_like"
            FORMULA -> "formula"
            CODE_BLOCK -> "code_block"
            IMAGE -> "image"
            ORG_CHART -> "org_chart"
            DIAGRAM -> "diagram"
            UNKNOWN -> "unknown"
        }

    companion object {
        /**
         * Resolves a wire (JSON) spelling back to its constant by matching it against
         * [toWire] of every constant.
         *
         * @throws IllegalArgumentException if [value] matches no constant.
         */
        @com.fasterxml.jackson.annotation.JsonCreator
        @JvmStatic
        fun fromWire(value: String): ChunkType =
            values().firstOrNull { candidate -> candidate.toWire() == value }
                ?: throw IllegalArgumentException("Unknown ChunkType value: $value")
    }
}

View File

@@ -0,0 +1,70 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Which text chunker to use.
 *
 * - `Text`: generic text splitter; splits on whitespace and punctuation.
 * - `Markdown`: Markdown-aware splitter; preserves formatting and structure.
 * - `Yaml`: YAML-aware splitter; creates one chunk per top-level key.
 * - `Semantic`: topic-aware chunker. With an `EmbeddingConfig`, it splits at
 *   embedding-based topic shifts tuned by `topic_threshold` (default 0.75, lower = more
 *   splits). Without an embedding, it falls back to a structural-boundary heuristic
 *   (ALL-CAPS headers, numbered sections, blank-line paragraphs) and merges groups into
 *   chunks capped at `max_characters` (default 1000); `topic_threshold` has no effect in
 *   that fallback path. For best results, pair with an embedding model.
 */
enum class ChunkerType {
    @com.fasterxml.jackson.annotation.JsonProperty("text")
    TEXT,

    @com.fasterxml.jackson.annotation.JsonProperty("markdown")
    MARKDOWN,

    @com.fasterxml.jackson.annotation.JsonProperty("yaml")
    YAML,

    @com.fasterxml.jackson.annotation.JsonProperty("semantic")
    SEMANTIC;

    /** Returns the wire (JSON) spelling of this constant. */
    @com.fasterxml.jackson.annotation.JsonValue
    fun toWire(): String =
        when (this) {
            TEXT -> "text"
            MARKDOWN -> "markdown"
            YAML -> "yaml"
            SEMANTIC -> "semantic"
        }

    companion object {
        /**
         * Resolves a wire (JSON) spelling back to its constant by matching it against
         * [toWire] of every constant.
         *
         * @throws IllegalArgumentException if [value] matches no constant.
         */
        @com.fasterxml.jackson.annotation.JsonCreator
        @JvmStatic
        fun fromWire(value: String): ChunkerType =
            values().firstOrNull { candidate -> candidate.toWire() == value }
                ?: throw IllegalArgumentException("Unknown ChunkerType value: $value")
    }
}

View File

@@ -0,0 +1,95 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Chunking configuration.
 *
 * Configures text chunking for document content, including chunk size,
 * overlap, trimming behavior, and optional embeddings.
 *
 * Every property has a default, so prefer named arguments and override only what you
 * need; call sites written that way keep compiling when fields are added later.
 */
data class ChunkingConfig(
    /**
     * Maximum size per chunk (in units determined by `sizing`).
     *
     * When `sizing` is `Characters` (default), this is the max character count.
     * When using token-based sizing, this is the max token count.
     *
     * Default: 1000
     */
    @com.fasterxml.jackson.annotation.JsonProperty("max_chars")
    val maxCharacters: Long = 1000L,
    /**
     * Overlap between chunks (in units determined by `sizing`).
     *
     * Default: 200
     */
    @com.fasterxml.jackson.annotation.JsonProperty("max_overlap")
    val overlap: Long = 200L,
    /**
     * Whether to trim whitespace from chunk boundaries.
     *
     * Default: true
     */
    val trim: Boolean = true,
    /**
     * Type of chunker to use (see [ChunkerType] for the available variants).
     *
     * Default: Text
     */
    val chunkerType: ChunkerType = ChunkerType.TEXT,
    /** Optional embedding configuration for chunk embeddings. */
    val embedding: EmbeddingConfig? = null,
    /** Use a preset configuration (overrides individual settings if provided). */
    val preset: String? = null,
    /**
     * How to measure chunk size.
     *
     * Default: `Characters` (Unicode character count).
     * Enable `chunking-tiktoken` or `chunking-tokenizers` features for token-based sizing.
     */
    // The documented default is now an actual default, so `ChunkingConfig()` compiles.
    @field:com.fasterxml.jackson.databind.annotation.JsonSerialize(`as` = ChunkSizing::class)
    val sizing: ChunkSizing = ChunkSizing.Characters,
    /**
     * When `true` and `chunker_type` is `Markdown`, prepend the heading hierarchy
     * path (e.g. `"# Title > ## Section\n\n"`) to each chunk's content string.
     *
     * This is useful for RAG pipelines where each chunk needs self-contained
     * context about its position in the document structure.
     *
     * Default: `false`
     */
    val prependHeadingContext: Boolean = false,
    /**
     * Optional cosine similarity threshold for semantic topic boundary detection.
     *
     * Only used when `chunker_type` is `Semantic` and an `EmbeddingConfig` is
     * provided. You almost never need to set this. When omitted, defaults to
     * `0.75` which works well for most documents. Lower values detect more
     * topic boundaries (more, smaller chunks); higher values detect fewer.
     * Range: `0.0..=1.0`.
     */
    val topicThreshold: Float? = null,
)

View File

@@ -0,0 +1,33 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Citation file metadata (RIS, PubMed, EndNote).
 *
 * Every field has a default, so `CitationMetadata()` yields an empty record.
 */
data class CitationMetadata(
    /** Citation count; defaults to `0`. */
    val citationCount: Long = 0L,

    /** Citation format label, or `null` when not set. */
    val format: String? = null,

    /** Author names; empty by default. */
    val authors: List<String> = emptyList(),

    /** Year range, or `null` when not set. */
    val yearRange: YearRange? = null,

    /** DOI strings; empty by default. */
    val dois: List<String> = emptyList(),

    /** Keywords; empty by default. */
    val keywords: List<String> = emptyList(),
)

View File

@@ -0,0 +1,59 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Content rendering mode for code extraction.
 *
 * Selects how extracted code is represented in the `content` field of
 * `ExtractionResult`.
 */
enum class CodeContentMode {
    /** TSLP semantic chunks become the content (default). */
    @com.fasterxml.jackson.annotation.JsonProperty("chunks")
    CHUNKS,

    /** Raw source code becomes the content. */
    @com.fasterxml.jackson.annotation.JsonProperty("raw")
    RAW,

    /** Function/class headings plus docstrings are emitted, without code bodies. */
    @com.fasterxml.jackson.annotation.JsonProperty("structure")
    STRUCTURE;

    /** Returns the wire string for this mode. */
    @com.fasterxml.jackson.annotation.JsonValue
    fun toWire(): String {
        return when (this) {
            CHUNKS -> "chunks"
            RAW -> "raw"
            STRUCTURE -> "structure"
        }
    }

    companion object {
        /**
         * Resolves a wire string to its mode (exact, case-sensitive match).
         *
         * @throws IllegalArgumentException if [value] matches no mode.
         */
        @com.fasterxml.jackson.annotation.JsonCreator
        @JvmStatic
        fun fromWire(value: String): CodeContentMode {
            val match = values().firstOrNull { mode -> mode.toWire() == value }
            return match ?: throw IllegalArgumentException("Unknown CodeContentMode value: $value")
        }
    }
}

View File

@@ -0,0 +1,89 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Cross-extractor content filtering configuration.
 *
 * Decides whether "furniture" content (headers, footers, page numbers,
 * watermarks, repeating text) is kept in or stripped from extraction results.
 * It applies to all extractors (PDF, DOCX, RTF, ODT, HTML, etc.), each with a
 * format-specific implementation.
 *
 * When this is `null` on `ExtractionConfig`, every extractor keeps its current
 * default behavior unchanged.
 */
data class ContentFilterConfig(
    /**
     * Include running headers in extraction output.
     *
     * - PDF: disables top-margin furniture stripping and prevents the layout
     *   model from treating `PageHeader`-classified regions as furniture.
     * - DOCX: includes document headers in text output.
     * - RTF/ODT: headers are already included; this is a no-op when true.
     * - HTML/EPUB: keeps `<header>` element content.
     *
     * Default: `false` (headers are stripped or excluded).
     */
    val includeHeaders: Boolean = false,

    /**
     * Include running footers in extraction output.
     *
     * - PDF: disables bottom-margin furniture stripping and prevents the layout
     *   model from treating `PageFooter`-classified regions as furniture.
     * - DOCX: includes document footers in text output.
     * - RTF/ODT: footers are already included; this is a no-op when true.
     * - HTML/EPUB: keeps `<footer>` element content.
     *
     * Default: `false` (footers are stripped or excluded).
     */
    val includeFooters: Boolean = false,

    /**
     * Enable the heuristic cross-page repeating text detector.
     *
     * When `true` (default), text repeated verbatim across a supermajority of
     * pages is classified as furniture and stripped. Turn this off if brand
     * names or repeated headings are being removed incorrectly.
     *
     * Note: when a layout-detection model is active, the model may
     * independently classify page-header / page-footer regions as furniture on
     * a per-page basis. To preserve those regions, also set [includeHeaders],
     * [includeFooters], or both to `true` in addition to disabling this flag.
     *
     * Primarily affects PDF extraction.
     *
     * Default: `true`.
     */
    val stripRepeatingText: Boolean = true,

    /**
     * Include watermark text in extraction output.
     *
     * - PDF: keeps watermark artifacts and arXiv identifiers.
     * - Other formats: no effect currently.
     *
     * Default: `false` (watermarks are stripped).
     */
    val includeWatermarks: Boolean = false,
)

View File

@@ -0,0 +1,63 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Content layer classification for document nodes.
 *
 * Gives each node its own layer instead of keeping separate body/furniture arrays.
 */
enum class ContentLayer {
    /** Main document body content. */
    @com.fasterxml.jackson.annotation.JsonProperty("body")
    BODY,

    /** Page/section header (running header). */
    @com.fasterxml.jackson.annotation.JsonProperty("header")
    HEADER,

    /** Page/section footer (running footer). */
    @com.fasterxml.jackson.annotation.JsonProperty("footer")
    FOOTER,

    /** Footnote content. */
    @com.fasterxml.jackson.annotation.JsonProperty("footnote")
    FOOTNOTE;

    /** Returns the wire string for this layer. */
    @com.fasterxml.jackson.annotation.JsonValue
    fun toWire(): String {
        return when (this) {
            BODY -> "body"
            HEADER -> "header"
            FOOTER -> "footer"
            FOOTNOTE -> "footnote"
        }
    }

    companion object {
        /**
         * Resolves a wire string to its layer (exact, case-sensitive match).
         *
         * @throws IllegalArgumentException if [value] matches no layer.
         */
        @com.fasterxml.jackson.annotation.JsonCreator
        @JvmStatic
        fun fromWire(value: String): ContentLayer {
            val match = values().firstOrNull { layer -> layer.toWire() == value }
            return match ?: throw IllegalArgumentException("Unknown ContentLayer value: $value")
        }
    }
}

View File

@@ -0,0 +1,26 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/** JATS contributor with role. */
data class ContributorRole(
    /** Contributor name (required). */
    val name: String,

    /** Contributor role, or `null` when not set. */
    val role: String? = null,
)

View File

@@ -0,0 +1,62 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Dublin Core metadata from docProps/core.xml.
 *
 * Holds the standard metadata fields defined by the Dublin Core standard
 * together with Office-specific extensions. Every field is optional and
 * defaults to `null`.
 */
data class CoreProperties(
    /** Document title. */
    val title: String? = null,

    /** Document subject/topic. */
    val subject: String? = null,

    /** Document creator/author. */
    val creator: String? = null,

    /** Keywords or tags. */
    val keywords: String? = null,

    /** Document description/abstract. */
    val description: String? = null,

    /** User who last modified the document. */
    val lastModifiedBy: String? = null,

    /** Revision number. */
    val revision: String? = null,

    /** Creation timestamp (ISO 8601). */
    val created: String? = null,

    /** Last modification timestamp (ISO 8601). */
    val modified: String? = null,

    /** Document category. */
    val category: String? = null,

    /** Content status (Draft, Final, etc.). */
    val contentStatus: String? = null,

    /** Document language. */
    val language: String? = null,

    /** Unique identifier. */
    val identifier: String? = null,

    /** Document version. */
    val version: String? = null,

    /** Last print timestamp (ISO 8601). */
    val lastPrinted: String? = null,
)

View File

@@ -0,0 +1,32 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * CSV/TSV file metadata.
 *
 * Every field has a default, so `CsvMetadata()` yields an empty record.
 */
data class CsvMetadata(
    /** Row count; defaults to `0`. */
    val rowCount: Int = 0,

    /** Column count; defaults to `0`. */
    val columnCount: Int = 0,

    /** Delimiter string, or `null` when not set. */
    val delimiter: String? = null,

    /** Header flag; defaults to `false`. */
    val hasHeader: Boolean = false,

    /** Column type labels, or `null` when not set. */
    val columnTypes: List<String>? = null,
)

View File

@@ -0,0 +1,26 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/** dBASE field information. */
data class DbfFieldInfo(
    /** Field name (required). */
    val name: String,

    /** Field type label (required). */
    val fieldType: String,
)

View File

@@ -0,0 +1,30 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * dBASE (DBF) file metadata.
 *
 * Every field has a default, so `DbfMetadata()` yields an empty record.
 */
data class DbfMetadata(
    /** Record count; defaults to `0`. */
    val recordCount: Long = 0L,

    /** Field count; defaults to `0`. */
    val fieldCount: Long = 0L,

    /** Field descriptions; empty by default. */
    val fields: List<DbfFieldInfo> = emptyList(),
)

View File

@@ -0,0 +1,33 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:max-line-length",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:annotation",
"MaxLineLength",
"TooManyFunctions",
"LongParameterList",
"LongMethod",
)
package dev.kreuzberg
import com.fasterxml.jackson.core.type.TypeReference
/**
 * Wrapper around a native document handle.
 *
 * [close] releases the handle through `KreuzbergBridge.nativeFreeDocument`;
 * prefer `use { }` so the native resource is released deterministically.
 *
 * The closed flag makes [close] idempotent and lets methods fail fast after
 * closing. It does not make it safe to call a method on one thread while
 * another thread is closing the same instance.
 */
@Suppress("TooManyFunctions")
class Document internal constructor(internal val handle: Long) : AutoCloseable {
    // Flipped exactly once by close(); prevents freeing the native handle twice.
    private val closed = java.util.concurrent.atomic.AtomicBoolean(false)

    /**
     * Returns the 1-based page number for each top-level table in the document.
     *
     * @throws IllegalStateException if this document has already been closed.
     */
    fun tablePageNumbers(): List<Long> {
        // Never hand a freed handle to native code.
        check(!closed.get()) { "Document is closed" }
        val responseJson = KreuzbergBridge.nativeDocumentTablePageNumbers(handle)
        return MAPPER.readValue(responseJson, object : TypeReference<List<Long>>() {})
    }

    /** Frees the native handle. Calling this more than once is a no-op. */
    override fun close() {
        if (closed.compareAndSet(false, true)) {
            KreuzbergBridge.nativeFreeDocument(handle)
        }
    }

    companion object {
        // Shared mapper for decoding JSON returned by the native bridge (snake_case property names).
        private val MAPPER = com.fasterxml.jackson.databind.ObjectMapper()
            .registerModule(com.fasterxml.jackson.datatype.jdk8.Jdk8Module())
            .findAndRegisterModules()
            .setPropertyNamingStrategy(com.fasterxml.jackson.databind.PropertyNamingStrategies.SNAKE_CASE)
    }
}

View File

@@ -0,0 +1,31 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/** MIME type detection response. */
data class DetectResponse(
    /** Detected MIME type (required). */
    val mimeType: String,

    /** Original filename, if one was provided; otherwise `null`. */
    val filename: String? = null,
)

View File

@@ -0,0 +1,30 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/** Page-level detection result containing all detections and page metadata. */
data class DetectionResult(
    /** Page width (required). */
    val pageWidth: Int,

    /** Page height (required). */
    val pageHeight: Int,

    /** Detections found on the page; empty by default. */
    val detections: List<LayoutDetection> = emptyList(),
)

View File

@@ -0,0 +1,38 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/** A single contiguous hunk in a unified diff. */
data class DiffHunk(
    /** Starting line number in the old content (0-indexed). */
    val fromLine: Long,

    /** Number of lines taken from the old content in this hunk. */
    val fromCount: Long,

    /** Starting line number in the new content (0-indexed). */
    val toLine: Long,

    /** Number of lines taken from the new content in this hunk. */
    val toCount: Long,

    /** Lines that make up this hunk; empty by default. */
    @field:com.fasterxml.jackson.databind.annotation.JsonSerialize(contentAs = DiffLine::class)
    val lines: List<DiffLine> = emptyList(),
)

View File

@@ -0,0 +1,95 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * A single line in a unified-diff hunk.
 *
 * Defined here (rather than only in `crate.diff`) so `RevisionDelta` can
 * reference it unconditionally, without requiring the `diff` Cargo feature.
 * `crate.diff` re-exports this type verbatim.
 *
 * JSON conversion is delegated to `DiffLineDeserializer` and
 * `DiffLineSerializer` through the annotations below.
 */
@com.fasterxml.jackson.databind.annotation.JsonDeserialize(using = DiffLineDeserializer::class)
@com.fasterxml.jackson.databind.annotation.JsonSerialize(using = DiffLineSerializer::class)
sealed class DiffLine {
    /** Unchanged context line. */
    data class Context(val value: String) : DiffLine()

    /** Line added in the "after" version. */
    data class Added(val value: String) : DiffLine()

    /** Line removed from the "before" version. */
    data class Removed(val value: String) : DiffLine()
}
/**
 * Jackson deserializer for [DiffLine].
 *
 * Reads a JSON object, uses its `kind` field to select the variant, and takes
 * the variant's text from the remaining payload field.
 *
 * The previous implementation handed the whole remaining object to
 * `readTreeAsValue(..., String::class.java)`; Jackson rejects an object where
 * a string is expected, so no input could be decoded.
 *
 * NOTE(review): the payload field name is not visible in this file. A field
 * named `value` is preferred (it matches the Kotlin property); otherwise the
 * single non-`kind` field is used. Confirm against the native wire format.
 */
private class DiffLineDeserializer : com.fasterxml.jackson.databind.deser.std.StdDeserializer<DiffLine>(DiffLine::class.java) {
    /**
     * @throws com.fasterxml.jackson.databind.exc.InvalidFormatException if `kind` is
     *   missing or unknown, or if no single string payload field can be found.
     */
    override fun deserialize(
        parser: com.fasterxml.jackson.core.JsonParser,
        ctx: com.fasterxml.jackson.databind.DeserializationContext,
    ): DiffLine {
        // Read as a generic node so non-object input yields a Jackson error rather than a ClassCastException.
        val node: com.fasterxml.jackson.databind.JsonNode = parser.codec.readTree(parser)
        val tag = node.get("kind")?.asText()
        // The tag is validated before the payload, so an unknown tag is always reported as such.
        return when (tag) {
            "context" -> DiffLine.Context(payloadText(parser, node))
            "added" -> DiffLine.Added(payloadText(parser, node))
            "removed" -> DiffLine.Removed(payloadText(parser, node))
            else -> throw com.fasterxml.jackson.databind.exc.InvalidFormatException(
                parser, "Unknown DiffLine tag", tag, DiffLine::class.java,
            )
        }
    }

    /** Extracts the variant's text: the `value` field if present, else the only field besides `kind`. */
    private fun payloadText(
        parser: com.fasterxml.jackson.core.JsonParser,
        node: com.fasterxml.jackson.databind.JsonNode,
    ): String {
        val payloadKeys = node.fieldNames().asSequence().filter { it != "kind" }.toList()
        val payload = node.get("value") ?: payloadKeys.singleOrNull()?.let { node.get(it) }
        if (payload == null || !payload.isTextual) {
            throw com.fasterxml.jackson.databind.exc.InvalidFormatException(
                parser, "DiffLine payload must be a single string field", node, DiffLine::class.java,
            )
        }
        return payload.asText()
    }
}
/**
 * Jackson serializer for [DiffLine].
 *
 * Writes each variant as a JSON object with a `kind` discriminator
 * (`context`, `added` or `removed`) and the line text.
 *
 * The previous implementation called `valueToTree` on the variant's string and
 * cast the result to `ObjectNode`. A string converts to a text node, so the
 * cast threw `ClassCastException` for every value.
 *
 * NOTE(review): the text is written under the key `value`, matching the Kotlin
 * property name. The key expected by the native side is not visible in this
 * file — confirm it before relying on serialized output.
 */
private class DiffLineSerializer : com.fasterxml.jackson.databind.ser.std.StdSerializer<DiffLine>(DiffLine::class.java) {
    override fun serialize(
        value: DiffLine,
        gen: com.fasterxml.jackson.core.JsonGenerator,
        provider: com.fasterxml.jackson.databind.SerializerProvider,
    ) {
        // Exhaustive over the sealed hierarchy: a new variant fails compilation here.
        val (kind, text) = when (value) {
            is DiffLine.Context -> "context" to value.value
            is DiffLine.Added -> "added" to value.value
            is DiffLine.Removed -> "removed" to value.value
        }
        // Written through the streaming API, so no ObjectMapper lookup or fallback instance is needed.
        gen.writeStartObject()
        gen.writeStringField("kind", kind)
        gen.writeStringField("value", text)
        gen.writeEndObject()
    }
}

View File

@@ -0,0 +1,38 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/** Options controlling how two `ExtractionResult` values are compared. */
data class DiffOptions(
    /** Whether metadata changes are included in the diff. Default: `true`. */
    val includeMetadata: Boolean = true,

    /** Whether embedded-children changes are included in the diff. Default: `true`. */
    val includeEmbedded: Boolean = true,

    /**
     * Truncate content to this many characters before diffing.
     *
     * Useful for very large documents where only the first N characters matter.
     * `null` (the default) means no truncation.
     */
    val maxContentChars: Long? = null,
)

View File

@@ -0,0 +1,56 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Comprehensive Djot document structure with semantic preservation.
 *
 * Captures the full richness of Djot markup, including:
 *
 * - Block-level structures (headings, lists, blockquotes, code blocks, etc.)
 * - Inline formatting (emphasis, strong, highlight, subscript, superscript, etc.)
 * - Attributes (classes, IDs, key-value pairs)
 * - Links, images, footnotes
 * - Math expressions (inline and display)
 * - Tables with full structure
 *
 * Available when the `djot` feature is enabled.
 */
data class DjotContent(
    /** Plain text representation, kept for backwards compatibility (required). */
    val plainText: String,

    /** Structured block-level content; empty by default. */
    val blocks: List<FormattedBlock> = emptyList(),

    /** Metadata from YAML frontmatter (required; no default is declared). */
    val metadata: Metadata,

    /** Extracted tables as structured data; empty by default. */
    val tables: List<Table> = emptyList(),

    /** Extracted images with metadata; empty by default. */
    val images: List<DjotImage> = emptyList(),

    /** Extracted links with URLs; empty by default. */
    val links: List<DjotLink> = emptyList(),

    /** Footnote definitions; empty by default. */
    val footnotes: List<Footnote> = emptyList(),

    /**
     * Attributes mapped by element identifier (if present); empty by default.
     *
     * NOTE(review): described as a mapping but typed as a list of strings
     * here — confirm the format of each element.
     */
    val attributes: List<String> = emptyList(),
)

View File

@@ -0,0 +1,35 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/** Image element in Djot. */
data class DjotImage(
    /** Image source URL or path (required). */
    val src: String,

    /** Alternative text (required). */
    val alt: String,

    /** Optional title; `null` by default. */
    val title: String? = null,

    /** Element attributes; `null` by default. */
    val attributes: String? = null,
)

View File

@@ -0,0 +1,35 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/** Link element in Djot. */
data class DjotLink(
    /** Link URL (required). */
    val url: String,

    /** Link text content (required). */
    val text: String,

    /** Optional title; `null` by default. */
    val title: String? = null,

    /** Element attributes; `null` by default. */
    val attributes: String? = null,
)

View File

@@ -0,0 +1,25 @@
// Generated by alef. Do not edit by hand.
package dev.kreuzberg
/**
 * Registry of [IDocumentExtractor] implementations registered with the native layer.
 *
 * Each operation calls the matching `KreuzbergBridge` function and mirrors the
 * result in a name-keyed map. The native call always runs first: if it throws,
 * the map is left untouched, so [getAll] never lists an extractor whose
 * registration failed and never drops one whose removal failed.
 *
 * NOTE(review): the backing map is a plain mutable map and no synchronization
 * is visible here — confirm callers do not register/unregister concurrently.
 */
object DocumentExtractorBridge {
    private val registered = mutableMapOf<String, IDocumentExtractor>()

    /** Registers [impl] natively, then records it under the name it reports via `name()`. */
    fun register(impl: IDocumentExtractor) {
        val name = impl.name()
        KreuzbergBridge.nativeRegisterDocumentExtractor(impl)
        registered[name] = impl
    }

    /** Unregisters the extractor called [name] natively, then removes it from the map. */
    fun unregister(name: String) {
        KreuzbergBridge.nativeUnregisterDocumentExtractor(name)
        registered.remove(name)
    }

    /** Clears all native registrations, then empties the map. */
    fun clearAll() {
        KreuzbergBridge.nativeClearDocumentExtractors()
        registered.clear()
    }

    /** Returns a snapshot copy of the currently recorded extractors, keyed by name. */
    fun getAll(): Map<String, IDocumentExtractor> = registered.toMap()
}

View File

@@ -0,0 +1,62 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * A single node in the document tree.
 *
 * A node carries a deterministic `id`, typed `content`, optional
 * `parent`/`children` links for tree structure, and metadata such as page
 * number, bounding box, and content layer.
 */
data class DocumentNode(
    /** Deterministic identifier (hash of content + position). */
    val id: String,

    /** Node content — tagged enum, type-specific data only. */
    @field:com.fasterxml.jackson.databind.annotation.JsonSerialize(`as` = NodeContent::class)
    val content: NodeContent,

    /** Parent node index (`null` = root-level node). */
    val parent: Int? = null,

    /** Child node indices in reading order; empty by default. */
    val children: List<Int> = emptyList(),

    /** Content layer classification (required; no default is declared). */
    val contentLayer: ContentLayer,

    /** Page number where this node starts (1-indexed). */
    val page: Int? = null,

    /** Page number where this node ends (for multi-page tables/sections). */
    val pageEnd: Int? = null,

    /** Bounding box in document coordinates. */
    val bbox: BoundingBox? = null,

    /**
     * Inline annotations (formatting, links) on this node's text content.
     *
     * Only meaningful for text-carrying nodes; empty for containers.
     */
    val annotations: List<TextAnnotation> = emptyList(),

    /**
     * Format-specific key-value attributes.
     *
     * Extensible bag for miscellaneous data that has no dedicated typed field:
     * CSS classes, LaTeX environment names, Excel cell formulas, slide layout
     * names, etc.
     */
    val attributes: Map<String, String>? = null,
)

View File

@@ -0,0 +1,33 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/** A resolved relationship between two nodes in the document tree. */
data class DocumentRelationship(
    /** Index of the source node (the referencing node). */
    val source: Int,

    /** Index of the target node (the referenced node). */
    val target: Int,

    /** Semantic kind of the relationship. */
    val kind: RelationshipKind,
)

View File

@@ -0,0 +1,66 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * One tracked change recorded inside a document.
 *
 * Filled in by per-format extractors that can read change-tracking metadata
 * (DOCX `w:ins`/`w:del`/`w:rPrChange`, ODT `text:change-*`, …). Until a
 * format-specific implementation exists, an extractor leaves
 * `ExtractionResult.revisions` unset (`None` in the upstream documentation).
 */
data class DocumentRevision(
    /**
     * Revision identifier; its shape depends on the source format.
     *
     * DOCX uses the `w:id` attribute of the change element (e.g. `"42"`).
     * If that attribute is missing, a synthetic fallback such as
     * `"docx-ins-0"` or `"docx-del-3"` is generated instead.
     */
    val revisionId: String,
    /** Display name of the change's author, or `null` when unknown. */
    val author: String? = null,
    /**
     * ISO-8601 timestamp of the change, or `null` when unknown.
     *
     * Kept as a plain string so the type stays FFI-friendly and does not
     * depend on the optional `chrono` dependency. DOCX fills this from the
     * `w:date` attribute (e.g. `"2024-03-15T10:30:00Z"`).
     */
    val timestamp: String? = null,
    /** Semantic kind of this revision. */
    val kind: RevisionKind,
    /**
     * Best-effort location of the revision within the document.
     *
     * How the location is resolved depends on the format. It is `null` when
     * no location could be determined (for example, changes inside table
     * cells before table-cell anchor support is added).
     */
    @field:com.fasterxml.jackson.databind.annotation.JsonSerialize(`as` = RevisionAnchor::class)
    val anchor: RevisionAnchor? = null,
    /** Content changes that make up this revision. */
    val delta: RevisionDelta,
)

View File

@@ -0,0 +1,65 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Top-level structured representation of a document.
 *
 * Nodes live in one flat list and refer to their parent and children by
 * index, which together form a tree. Root-level nodes have no parent.
 *
 * The upstream documentation names `body_roots()` / `furniture_roots()` for
 * iterating top-level content by layer and `validate()` for checking that
 * indices are in bounds and parent/child links are bidirectionally consistent.
 * NOTE(review): confirm whether these helpers are exposed by this binding.
 */
data class DocumentStructure(
    /** Every node, in document/reading order. */
    val nodes: List<DocumentNode> = emptyList(),
    /**
     * Identifier of the origin format (e.g. "docx", "pptx", "html", "pdf").
     *
     * Lets renderers apply format-aware heuristics when converting the tree
     * to an output format.
     */
    val sourceFormat: String? = null,
    /**
     * Resolved relationships between nodes (footnote refs, citations, anchor links, etc.).
     *
     * Populated while the structure is derived from the internal document
     * representation; empty when no relationships were detected.
     */
    val relationships: List<DocumentRelationship> = emptyList(),
    /**
     * Sorted, deduplicated names of the node types present in this document.
     *
     * Each entry is the snake_case `node_type` tag of a `NodeContent` variant
     * (e.g. `"paragraph"`, `"heading"`, `"table"`, …).
     *
     * Derived from `nodes` by `DocumentStructure.finalize_node_types`, and
     * empty until that step has run (internal construction paths run it at
     * the end of derivation).
     */
    val nodeTypes: List<String> = emptyList(),
)

View File

@@ -0,0 +1,63 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * DOCX application properties read from `docProps/app.xml`.
 *
 * Holds Word-specific document statistics and metadata. Every field is
 * optional and defaults to `null`.
 */
data class DocxAppProperties(
    /** Name of the producing application (e.g., "Microsoft Office Word"). */
    val application: String? = null,
    /** Version of the producing application. */
    val appVersion: String? = null,
    /** Filename of the template. */
    val template: String? = null,
    /** Total editing time, in minutes. */
    val totalTime: Int? = null,
    /** Page count. */
    val pages: Int? = null,
    /** Word count. */
    val words: Int? = null,
    /** Character count, excluding spaces. */
    val characters: Int? = null,
    /** Character count, including spaces. */
    val charactersWithSpaces: Int? = null,
    /** Line count. */
    val lines: Int? = null,
    /** Paragraph count. */
    val paragraphs: Int? = null,
    /** Company name. */
    val company: String? = null,
    /** Document security level. */
    val docSecurity: Int? = null,
    /** Scale-crop flag. */
    val scaleCrop: Boolean? = null,
    /** Links-up-to-date flag. */
    val linksUpToDate: Boolean? = null,
    /** Shared-document flag. */
    val sharedDoc: Boolean? = null,
    /** Hyperlinks-changed flag. */
    val hyperlinksChanged: Boolean? = null,
)

View File

@@ -0,0 +1,53 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Metadata of a Word document.
 *
 * Extracted from DOCX files through the shared Office Open XML metadata
 * extraction; integrates with the `office_metadata` module for core, app and
 * custom properties.
 */
data class DocxMetadata(
    /**
     * Core properties from `docProps/core.xml` (Dublin Core metadata).
     *
     * Covers title, creator, subject, keywords, dates, etc. The format is
     * shared across DOCX/PPTX/XLSX documents.
     */
    val coreProperties: CoreProperties? = null,
    /**
     * Application properties from `docProps/app.xml` (Word-specific statistics).
     *
     * Covers word count, page count, paragraph count, editing time, etc.
     * This is the DOCX-specific variant of the Office application properties.
     */
    val appProperties: DocxAppProperties? = null,
    /**
     * Custom properties from `docProps/custom.xml` (user-defined properties).
     *
     * Key-value pairs defined by users or applications; values can be
     * strings, numbers, booleans, or dates.
     */
    val customProperties: Map<String, Any>? = null,
)

View File

@@ -0,0 +1,29 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
)
package dev.kreuzberg
/**
 * Placement mode of a drawing: inline or anchored.
 */
sealed class DrawingType {
    /** The drawing is placed inline. */
    object Inline : DrawingType()

    /**
     * The drawing is anchored.
     *
     * NOTE(review): the meaning of [value] is not visible here — confirm
     * against the producer of this type.
     */
    data class Anchored(val value: String) : DrawingType()
}

View File

@@ -0,0 +1,40 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * A semantic element extracted from a document.
 *
 * Models one logical unit of content: its semantic classification, a unique
 * identifier, and metadata that tracks origin and position.
 */
data class Element(
    /** Unique identifier of the element. */
    val elementId: String,
    /** Semantic type assigned to the element. */
    val elementType: ElementType,
    /** Text content carried by the element. */
    val text: String,
    /** Metadata describing the element. */
    val metadata: ElementMetadata,
)

View File

@@ -0,0 +1,37 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Metadata attached to a semantic element.
 *
 * All fields are optional; [additional] defaults to an empty map.
 */
data class ElementMetadata(
    /** 1-indexed page number. */
    val pageNumber: Int? = null,
    /** Source filename or document name. */
    val filename: String? = null,
    /** Bounding box coordinates, when available. */
    val coordinates: BoundingBox? = null,
    /** Position of the element within the element sequence. */
    val elementIndex: Long? = null,
    /** Extra custom metadata entries. */
    val additional: Map<String, String> = emptyMap(),
)

View File

@@ -0,0 +1,99 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Classification of a semantic element.
 *
 * Sorts text content into semantic units for downstream processing and covers
 * the element types commonly found in Unstructured documents. Each constant
 * maps to a snake_case wire name used for JSON.
 */
enum class ElementType {
    /** Document title. */
    @com.fasterxml.jackson.annotation.JsonProperty("title")
    TITLE,

    /** Main narrative text body. */
    @com.fasterxml.jackson.annotation.JsonProperty("narrative_text")
    NARRATIVE_TEXT,

    /** Section heading. */
    @com.fasterxml.jackson.annotation.JsonProperty("heading")
    HEADING,

    /** List item (bullet, numbered, etc.). */
    @com.fasterxml.jackson.annotation.JsonProperty("list_item")
    LIST_ITEM,

    /** Table element. */
    @com.fasterxml.jackson.annotation.JsonProperty("table")
    TABLE,

    /** Image element. */
    @com.fasterxml.jackson.annotation.JsonProperty("image")
    IMAGE,

    /** Page break marker. */
    @com.fasterxml.jackson.annotation.JsonProperty("page_break")
    PAGE_BREAK,

    /** Code block. */
    @com.fasterxml.jackson.annotation.JsonProperty("code_block")
    CODE_BLOCK,

    /** Block quote. */
    @com.fasterxml.jackson.annotation.JsonProperty("block_quote")
    BLOCK_QUOTE,

    /** Footer text. */
    @com.fasterxml.jackson.annotation.JsonProperty("footer")
    FOOTER,

    /** Header text. */
    @com.fasterxml.jackson.annotation.JsonProperty("header")
    HEADER;

    /** Returns the wire name of this constant. */
    @com.fasterxml.jackson.annotation.JsonValue
    fun toWire(): String = when (this) {
        TITLE -> "title"
        NARRATIVE_TEXT -> "narrative_text"
        HEADING -> "heading"
        LIST_ITEM -> "list_item"
        TABLE -> "table"
        IMAGE -> "image"
        PAGE_BREAK -> "page_break"
        CODE_BLOCK -> "code_block"
        BLOCK_QUOTE -> "block_quote"
        FOOTER -> "footer"
        HEADER -> "header"
    }

    companion object {
        /**
         * Resolves a wire name back to its constant.
         *
         * The lookup goes through [toWire], so both directions share one table.
         *
         * @throws IllegalArgumentException if [value] matches no constant.
         */
        @com.fasterxml.jackson.annotation.JsonCreator
        @JvmStatic
        fun fromWire(value: String): ElementType =
            values().firstOrNull { candidate -> candidate.toWire() == value }
                ?: throw IllegalArgumentException("Unknown ElementType value: $value")
    }
}

View File

@@ -0,0 +1,46 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Representation of an email attachment.
 *
 * Carries the attachment's metadata and, optionally, its content.
 *
 * Equality and hashing compare [data] by content. The `equals`/`hashCode`
 * that a data class generates would compare the array by reference, so two
 * attachments holding identical bytes would never be equal.
 */
data class EmailAttachment(
    /** Attachment name (from the Content-Disposition header). */
    val name: String? = null,
    /** Filename of the attachment. */
    val filename: String? = null,
    /** MIME type of the attachment. */
    val mimeType: String? = null,
    /** Size in bytes. */
    val size: Long? = null,
    /** Whether this attachment is an image. */
    val isImage: Boolean,
    /**
     * Attachment data (if extracted).
     * The upstream type is `bytes.Bytes`, chosen for cheap cloning of large buffers.
     */
    val data: ByteArray? = null,
) {
    /** Structural equality over all properties; [data] is compared byte-by-byte. */
    override fun equals(other: Any?): Boolean {
        if (this === other) return true
        if (other !is EmailAttachment) return false
        return name == other.name &&
            filename == other.filename &&
            mimeType == other.mimeType &&
            size == other.size &&
            isImage == other.isImage &&
            data.contentEquals(other.data)
    }

    /** Hash consistent with [equals]; [data] contributes its content hash. */
    override fun hashCode(): Int {
        var result = name?.hashCode() ?: 0
        result = 31 * result + (filename?.hashCode() ?: 0)
        result = 31 * result + (mimeType?.hashCode() ?: 0)
        result = 31 * result + (size?.hashCode() ?: 0)
        result = 31 * result + isImage.hashCode()
        result = 31 * result + (data?.contentHashCode() ?: 0)
        return result
    }
}

View File

@@ -0,0 +1,49 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Settings that control email extraction.
 */
data class EmailConfig(
    /**
     * Windows codepage number applied when an MSG file carries no codepage property.
     * The default is `null`, which falls back to windows-1252.
     *
     * An unrecognized or invalid codepage number (including 0) silently falls
     * back to windows-1252 as well — the same handling as an MSG file that
     * itself contains an unrecognized codepage. No error or warning is
     * emitted, so verify the output when supplying unusual values.
     *
     * Common values:
     *
     * - 1250: Central European (Polish, Czech, Hungarian, etc.)
     * - 1251: Cyrillic (Russian, Ukrainian, Bulgarian, etc.)
     * - 1252: Western European (default)
     * - 1253: Greek
     * - 1254: Turkish
     * - 1255: Hebrew
     * - 1256: Arabic
     * - 932: Japanese (Shift-JIS)
     * - 936: Simplified Chinese (GBK)
     */
    val msgFallbackCodepage: Int? = null,
)

View File

@@ -0,0 +1,56 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Result of extracting an email.
 *
 * Full representation of an extracted email message (.eml or .msg):
 * headers, body content, and attachments.
 */
data class EmailExtractionResult(
    /** Subject line. */
    val subject: String? = null,
    /** Email address of the sender. */
    val fromEmail: String? = null,
    /** Email addresses of the primary recipients. */
    val toEmails: List<String> = emptyList(),
    /** Email addresses of the CC recipients. */
    val ccEmails: List<String> = emptyList(),
    /** Email addresses of the BCC recipients. */
    val bccEmails: List<String> = emptyList(),
    /** Date/timestamp of the email. */
    val date: String? = null,
    /** Value of the Message-ID header. */
    val messageId: String? = null,
    /** Plain-text version of the body. */
    val plainText: String? = null,
    /** HTML version of the body. */
    val htmlContent: String? = null,
    /** Cleaned/processed text content. Aliased as `cleaned_text` for back-compat. */
    val content: String,
    /** Attachments of the email. */
    val attachments: List<EmailAttachment> = emptyList(),
    /** Additional email headers and metadata. */
    val metadata: Map<String, String> = emptyMap(),
)

View File

@@ -0,0 +1,45 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Metadata extracted from .eml and .msg email files.
 *
 * Covers sender/recipient information, the message ID, and the attachment list.
 */
data class EmailMetadata(
    /** Email address of the sender. */
    val fromEmail: String? = null,
    /** Display name of the sender. */
    val fromName: String? = null,
    /** Primary recipients. */
    val toEmails: List<String> = emptyList(),
    /** CC recipients. */
    val ccEmails: List<String> = emptyList(),
    /** BCC recipients. */
    val bccEmails: List<String> = emptyList(),
    /** Value of the Message-ID header. */
    val messageId: String? = null,
    /** Filenames of the attachments. */
    val attachments: List<String> = emptyList(),
)

View File

@@ -0,0 +1,37 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Differences in embedded archive children between two results, `a` and `b`.
 *
 * Children are matched by `path`.
 */
data class EmbeddedChanges(
    /** Children found in `b` but missing from `a`. */
    val added: List<ArchiveEntry> = emptyList(),
    /** Children found in `a` but missing from `b`. */
    val removed: List<ArchiveEntry> = emptyList(),
    /**
     * Children found in both results whose content differs.
     *
     * Each entry carries the diff of the nested `ExtractionResult`.
     */
    val changed: List<EmbeddedDiff> = emptyList(),
)

View File

@@ -0,0 +1,31 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Diff of one embedded archive entry that is present in both results.
 */
data class EmbeddedDiff(
    /** Archive-relative path that identifies the entry. */
    val path: String,
    /** Recursive diff of the entry's extraction result. */
    val diff: ExtractionDiff,
)

View File

@@ -0,0 +1,40 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Descriptor of a file embedded in a PDF, extracted from the PDF name tree.
 *
 * Equality and hashing compare [data] by content. The `equals`/`hashCode`
 * that a data class generates would compare the array by reference, so two
 * descriptors holding identical bytes would never be equal.
 */
data class EmbeddedFile(
    /** The filename as stored in the PDF name tree. */
    val name: String,
    /** Raw file bytes from the embedded stream (already decompressed by lopdf). */
    val data: ByteArray,
    /**
     * Compressed byte count of the original stream (before decompression).
     *
     * Callers use it to compute the decompression ratio and detect
     * zip-bomb-style attacks that embed a tiny compressed stream expanding
     * to gigabytes of data.
     */
    val compressedSize: Long,
    /** MIME type if specified in the filespec, otherwise `null`. */
    val mimeType: String? = null,
) {
    /** Structural equality over all properties; [data] is compared byte-by-byte. */
    override fun equals(other: Any?): Boolean {
        if (this === other) return true
        if (other !is EmbeddedFile) return false
        return name == other.name &&
            compressedSize == other.compressedSize &&
            mimeType == other.mimeType &&
            data.contentEquals(other.data)
    }

    /** Hash consistent with [equals]; [data] contributes its content hash. */
    override fun hashCode(): Int {
        var result = name.hashCode()
        result = 31 * result + data.contentHashCode()
        result = 31 * result + compressedSize.hashCode()
        result = 31 * result + (mimeType?.hashCode() ?: 0)
        return result
    }
}

View File

@@ -0,0 +1,25 @@
// Generated by alef. Do not edit by hand.
package dev.kreuzberg
/**
 * Singleton registry of embedding backends.
 *
 * Every mutating call first updates the local name-to-backend map and then
 * forwards the same change to the native layer through `KreuzbergBridge`.
 */
object EmbeddingBackendBridge {
    private val registered = mutableMapOf<String, IEmbeddingBackend>()

    /** Stores [impl] under the name it reports, then registers it natively. */
    fun register(impl: IEmbeddingBackend) {
        registered[impl.name()] = impl
        KreuzbergBridge.nativeRegisterEmbeddingBackend(impl)
    }

    /** Forgets the backend stored under [name], then unregisters it natively. */
    fun unregister(name: String) {
        registered -= name
        KreuzbergBridge.nativeUnregisterEmbeddingBackend(name)
    }

    /** Forgets every stored backend, then clears the native registrations. */
    fun clearAll() {
        registered.clear()
        KreuzbergBridge.nativeClearEmbeddingBackends()
    }

    /** Returns a copy of the name-to-backend map as it is right now. */
    fun getAll(): Map<String, IEmbeddingBackend> = registered.toMap()
}

View File

@@ -0,0 +1,71 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
import java.nio.file.Path
/**
 * Configuration of embedding generation for text chunks.
 *
 * Embeddings are produced with ONNX models through the vendored embedding
 * engine; the `embeddings` feature must be enabled.
 */
data class EmbeddingConfig(
    /**
     * Embedding model to use.
     *
     * This constructor has no default for it.
     * NOTE(review): the upstream documentation mentions a fallback to the
     * "balanced" preset when unspecified — confirm whether that applies to
     * this binding.
     */
    @field:com.fasterxml.jackson.databind.annotation.JsonSerialize(`as` = EmbeddingModelType::class)
    val model: EmbeddingModelType,
    /** Whether embedding vectors are normalized (recommended for cosine similarity). */
    val normalize: Boolean = true,
    /** Batch size used for embedding generation. */
    val batchSize: Long = 32L,
    /** Whether model download progress is shown. */
    val showDownloadProgress: Boolean = false,
    /**
     * Custom cache directory for model files.
     *
     * When not specified, `~/.cache/kreuzberg/embeddings/` is used. Gives
     * full control over where models are downloaded.
     */
    val cacheDir: java.nio.file.Path? = null,
    /**
     * Hardware acceleration for the embedding ONNX model.
     *
     * When set, selects the execution provider (CPU, CUDA, CoreML, TensorRT)
     * used for inference. The default `null` auto-selects per platform.
     */
    val acceleration: AccelerationConfig? = null,
    /**
     * Maximum wall-clock duration (in seconds) of a single `embed()` call
     * when `EmbeddingModelType.Plugin` is used.
     *
     * Only the in-process plugin path is affected. The limit guards against
     * hung host-language backends (e.g. a Python callback deadlocked on the
     * GIL, a model stuck on CUDA OOM retries, etc.). On timeout the
     * dispatcher returns a `Plugin` error instead of blocking forever.
     *
     * `null` disables the timeout, and `null` is this constructor's default.
     * NOTE(review): the upstream documentation describes a conservative
     * 60-second default — confirm which value takes effect when this field
     * is left `null`. Increase the limit for large batches on slow hardware.
     */
    val maxEmbedDurationSecs: Long? = null,
)

View File

@@ -0,0 +1,140 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Embedding model types supported by Kreuzberg.
 *
 * The hierarchy is (de)serialized by the custom serializer and deserializer
 * named in the annotations below. Each variant resets both to Jackson's
 * `None` marker so that it is bound with the default handling.
 */
@com.fasterxml.jackson.databind.annotation.JsonDeserialize(using = EmbeddingModelTypeDeserializer::class)
@com.fasterxml.jackson.databind.annotation.JsonSerialize(using = EmbeddingModelTypeSerializer::class)
sealed class EmbeddingModelType {
    /** A preset model configuration, selected by [name] (recommended). */
    @com.fasterxml.jackson.databind.annotation.JsonDeserialize(using = com.fasterxml.jackson.databind.JsonDeserializer.None::class)
    @com.fasterxml.jackson.databind.annotation.JsonSerialize(using = com.fasterxml.jackson.databind.JsonSerializer.None::class)
    data class Preset(
        val name: String,
    ) : EmbeddingModelType()

    /** A custom ONNX model from HuggingFace. */
    @com.fasterxml.jackson.databind.annotation.JsonDeserialize(using = com.fasterxml.jackson.databind.JsonDeserializer.None::class)
    @com.fasterxml.jackson.databind.annotation.JsonSerialize(using = com.fasterxml.jackson.databind.JsonSerializer.None::class)
    data class Custom(
        val modelId: String,
        val dimensions: Long,
    ) : EmbeddingModelType()

    /**
     * A provider-hosted embedding model accessed via liter-llm.
     *
     * The model is the one named in the nested `LlmConfig` (e.g.,
     * `"openai/text-embedding-3-small"`).
     */
    @com.fasterxml.jackson.databind.annotation.JsonDeserialize(using = com.fasterxml.jackson.databind.JsonDeserializer.None::class)
    @com.fasterxml.jackson.databind.annotation.JsonSerialize(using = com.fasterxml.jackson.databind.JsonSerializer.None::class)
    data class Llm(
        val llm: LlmConfig,
    ) : EmbeddingModelType()

    /**
     * An in-process embedding backend registered through the plugin system.
     *
     * The caller registers an `EmbeddingBackend` once (e.g. a wrapper around
     * an already-loaded `llama-cpp-python`, `sentence-transformers`, or tuned
     * ONNX model) and then refers to it by name in the config. Kreuzberg
     * calls back into that backend during chunking and standalone embed
     * requests — no HuggingFace download, no ONNX Runtime requirement, no
     * HTTP sidecar.
     *
     * With this variant only two `EmbeddingConfig` fields apply: `normalize`
     * (post-call L2 normalization) and `max_embed_duration_secs` (dispatcher
     * timeout). The model-loading fields (`batch_size`, `cache_dir`,
     * `show_download_progress`, `acceleration`) are ignored, because the
     * host owns the model lifecycle.
     *
     * Semantic chunking falls back to `ChunkingConfig.max_characters` here,
     * since there is no preset from which to look up a chunk-size ceiling —
     * size the context window via `max_characters` directly.
     *
     * See `register_embedding_backend`.
     */
    @com.fasterxml.jackson.databind.annotation.JsonDeserialize(using = com.fasterxml.jackson.databind.JsonDeserializer.None::class)
    @com.fasterxml.jackson.databind.annotation.JsonSerialize(using = com.fasterxml.jackson.databind.JsonSerializer.None::class)
    data class Plugin(
        val name: String,
    ) : EmbeddingModelType()
}
/**
 * Jackson deserializer for the tagged JSON form of [EmbeddingModelType].
 *
 * The `"type"` field selects the variant (`"preset"`, `"custom"`, `"llm"` or
 * `"plugin"`); the remaining fields are bound to that variant's data class.
 */
private class EmbeddingModelTypeDeserializer : com.fasterxml.jackson.databind.deser.std.StdDeserializer<EmbeddingModelType>(EmbeddingModelType::class.java) {
    /**
     * Reads one JSON object and dispatches on its `"type"` tag.
     *
     * @throws com.fasterxml.jackson.databind.exc.MismatchedInputException if the input is not a JSON object.
     * @throws com.fasterxml.jackson.databind.exc.InvalidFormatException if the tag is missing or not recognized.
     */
    @Suppress("LongMethod")
    override fun deserialize(
        parser: com.fasterxml.jackson.core.JsonParser,
        ctx: com.fasterxml.jackson.databind.DeserializationContext,
    ): EmbeddingModelType {
        // Read as a generic tree and narrow explicitly: asking readTree for an
        // ObjectNode directly would surface non-object input (string, array,
        // number, or no content) as ClassCastException/NullPointerException
        // instead of a Jackson mapping error.
        val tree = parser.codec.readTree<com.fasterxml.jackson.core.TreeNode>(parser)
        val node = tree as? com.fasterxml.jackson.databind.node.ObjectNode
            ?: throw com.fasterxml.jackson.databind.exc.MismatchedInputException.from(
                parser, EmbeddingModelType::class.java, "Expected a JSON object for EmbeddingModelType",
            )
        val tag = node.get("type")?.asText()
        // The variant classes have no "type" property, so strip the tag from a copy before binding.
        val payload = node.deepCopy().apply { remove("type") }
        return when (tag) {
            "preset" -> ctx.readTreeAsValue<EmbeddingModelType.Preset>(payload, EmbeddingModelType.Preset::class.java)
            "custom" -> ctx.readTreeAsValue<EmbeddingModelType.Custom>(payload, EmbeddingModelType.Custom::class.java)
            "llm" -> ctx.readTreeAsValue<EmbeddingModelType.Llm>(payload, EmbeddingModelType.Llm::class.java)
            "plugin" -> ctx.readTreeAsValue<EmbeddingModelType.Plugin>(payload, EmbeddingModelType.Plugin::class.java)
            else -> throw com.fasterxml.jackson.databind.exc.InvalidFormatException(
                parser, "Unknown EmbeddingModelType tag", tag, EmbeddingModelType::class.java,
            )
        }
    }
}
/**
 * Jackson serializer for the tagged [EmbeddingModelType] hierarchy.
 *
 * Converts the variant to a JSON object tree, adds a `type` field naming the
 * variant, and writes the tree to the generator.
 */
private class EmbeddingModelTypeSerializer : com.fasterxml.jackson.databind.ser.std.StdSerializer<EmbeddingModelType>(EmbeddingModelType::class.java) {
    /**
     * Writes [value] as a JSON object carrying a `type` discriminator.
     *
     * @param value variant to serialize
     * @param gen generator receiving the output; its codec is reused when it is an `ObjectMapper`
     * @param provider unused
     */
    @Suppress("LongMethod")
    override fun serialize(
        value: EmbeddingModelType,
        gen: com.fasterxml.jackson.core.JsonGenerator,
        provider: com.fasterxml.jackson.databind.SerializerProvider,
    ) {
        // Fall back to a fresh mapper only when the generator's codec is not an ObjectMapper.
        val objectMapper = (gen.codec as? com.fasterxml.jackson.databind.ObjectMapper)
            ?: com.fasterxml.jackson.databind.ObjectMapper().findAndRegisterModules()
        // Exhaustive over the sealed hierarchy: no else branch, so a new variant fails to compile.
        val discriminator = when (value) {
            is EmbeddingModelType.Preset -> "preset"
            is EmbeddingModelType.Custom -> "custom"
            is EmbeddingModelType.Llm -> "llm"
            is EmbeddingModelType.Plugin -> "plugin"
        }
        val tree = objectMapper.valueToTree<com.fasterxml.jackson.databind.node.ObjectNode>(value)
        tree.put("type", discriminator)
        objectMapper.writeTree(gen, tree)
    }
}

View File

@@ -0,0 +1,46 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
* Preset configurations for common RAG use cases.
*
* Each preset combines chunk size, overlap, and embedding model
* to provide an optimized configuration for specific scenarios.
*
* All string fields are owned `String` for FFI compatibility — instances
* are safe to clone and pass across language boundaries.
*/
data class EmbeddingPreset(
/** Preset name. NOTE(review): presumably the identifier used to select this preset — confirm. */
val name: String,
/** Chunk size for this preset. NOTE(review): the unit (characters vs. tokens) is not stated here — confirm. */
val chunkSize: Long,
/** Overlap for this preset. NOTE(review): presumably in the same unit as `chunkSize` — confirm. */
val overlap: Long,
/** HuggingFace repository name for the model. */
val modelRepo: String,
/** Pooling strategy: "cls" or "mean". */
val pooling: String,
/** Path to the ONNX model file within the repo. */
val modelFile: String,
/** NOTE(review): presumably the length of the embedding vectors the model produces — confirm. */
val dimensions: Long,
/** Free-text description of the preset. */
val description: String,
)

View File

@@ -0,0 +1,33 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
* EPUB metadata (Dublin Core extensions).
*
* Every property is optional and defaults to `null`.
*
* NOTE(review): `dcFormat` and `dcType` presumably carry the Dublin Core `format` and `type`
* elements under prefixed names, and `coverImage` presumably references the cover image
* rather than holding its bytes — confirm against the extractor.
*/
data class EpubMetadata(
val coverage: String? = null,
val dcFormat: String? = null,
val relation: String? = null,
val source: String? = null,
val dcType: String? = null,
val coverImage: String? = null,
)

View File

@@ -0,0 +1,26 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
* Error metadata (for batch operations).
*
* @property errorType NOTE(review): presumably a machine-readable error category — confirm.
* @property message Error message text.
*/
data class ErrorMetadata(val errorType: String, val message: String)

View File

@@ -0,0 +1,36 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
* Excel/spreadsheet format metadata.
*
* Identifies the document as a spreadsheet source via the `FormatMetadata.Excel`
* discriminant. Sheet count and sheet names are stored inside this class.
*
* Both properties are optional and default to `null`.
*/
data class ExcelMetadata(
/** Number of sheets in the workbook. */
val sheetCount: Int? = null,
/** Names of all sheets in the workbook. */
val sheetNames: List<String>? = null,
)

View File

@@ -0,0 +1,48 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
* Single Excel worksheet.
*
* Represents one sheet from an Excel workbook with its content
* converted to Markdown format and dimensional statistics.
*/
data class ExcelSheet(
/** Sheet name as it appears in Excel */
val name: String,
/** Sheet content converted to Markdown tables */
val markdown: String,
/** Number of rows */
val rowCount: Long,
/** Number of columns */
val colCount: Long,
/** Total number of non-empty cells */
val cellCount: Long,
/**
* Pre-extracted table cells (nested list of cell values).
* Populated during markdown generation to avoid re-parsing markdown.
* `null` for empty sheets.
*/
val tableCells: List<List<String>>? = null,
)

View File

@@ -0,0 +1,47 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
* Excel workbook representation.
*
* Contains all sheets from an Excel file (.xlsx, .xls, etc.) with
* extracted content and metadata.
*
* Every property has a default, so `ExcelWorkbook()` yields a workbook with no
* sheets, no metadata and no revisions.
*/
data class ExcelWorkbook(
/** All sheets in the workbook */
val sheets: List<ExcelSheet> = emptyList(),
/** Workbook-level metadata (author, creation date, etc.) */
val metadata: Map<String, String> = emptyMap(),
/**
* Collaborative-edit revision headers from `xl/revisions/revisionHeaders.xml`.
*
* Populated for legacy shared-workbook `.xlsx` files that contain the
* `xl/revisions/` directory. Each `<header>` element maps to one
* `DocumentRevision { kind: FormatChange }` carrying the header's `guid`
* (→ `revision_id`), `userName` (→ `author`), and `dateTime` (→ `timestamp`).
* `anchor` and `delta` are `null`/empty for v1 (per-cell log parsing is a
* follow-up). `null` when `xl/revisions/revisionHeaders.xml` is absent.
*/
val revisions: List<DocumentRevision>? = null,
)

View File

@@ -0,0 +1,69 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * ONNX Runtime execution provider type.
 *
 * Selects the hardware backend used for model inference. `Auto` (default)
 * picks the best available provider per platform. Each constant has a
 * lowercase wire name, exposed through [toWire] and resolved by [fromWire].
 */
enum class ExecutionProviderType {
    /** Auto-select: CoreML on macOS, CUDA on Linux, CPU elsewhere. */
    @com.fasterxml.jackson.annotation.JsonProperty("auto")
    AUTO,

    /** CPU execution provider (always available). */
    @com.fasterxml.jackson.annotation.JsonProperty("cpu")
    CPU,

    /** Apple CoreML (macOS/iOS Neural Engine + GPU). */
    @com.fasterxml.jackson.annotation.JsonProperty("coreml")
    CORE_ML,

    /** NVIDIA CUDA GPU acceleration. */
    @com.fasterxml.jackson.annotation.JsonProperty("cuda")
    CUDA,

    /** NVIDIA TensorRT (optimized CUDA inference). */
    @com.fasterxml.jackson.annotation.JsonProperty("tensorrt")
    TENSOR_RT;

    /** Returns the wire name of this provider; this is the single mapping table for the enum. */
    @com.fasterxml.jackson.annotation.JsonValue
    fun toWire(): String {
        return when (this) {
            AUTO -> "auto"
            CPU -> "cpu"
            CORE_ML -> "coreml"
            CUDA -> "cuda"
            TENSOR_RT -> "tensorrt"
        }
    }

    companion object {
        /**
         * Resolves a wire name to its constant by matching against [toWire].
         *
         * @param value wire name, matched exactly (case-sensitive)
         * @throws IllegalArgumentException if [value] is not a known wire name
         */
        @com.fasterxml.jackson.annotation.JsonCreator
        @JvmStatic
        fun fromWire(value: String): ExecutionProviderType {
            val match = values().firstOrNull { candidate -> candidate.toWire() == value }
            return match ?: throw IllegalArgumentException("Unknown ExecutionProviderType value: $value")
        }
    }
}

View File

@@ -0,0 +1,88 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Extracted image from a document.
 *
 * Contains raw image data, metadata, and optional nested OCR results.
 * Raw bytes allow cross-language compatibility - users can convert to
 * PIL.Image (Python), Sharp (Node.js), or other formats as needed.
 *
 * Equality and hashing compare [data] by content. The `equals`/`hashCode` that a
 * data class generates would compare the `ByteArray` by reference, so two images
 * with identical bytes would never be equal.
 */
data class ExtractedImage(
    /**
     * Raw image data (PNG, JPEG, WebP, etc. bytes).
     * Uses `bytes.Bytes` for cheap cloning of large buffers.
     */
    val data: ByteArray,
    /**
     * Image format (e.g., "jpeg", "png", "webp")
     * Uses Cow<'static, str> to avoid allocation for static literals.
     */
    val format: String,
    /** Zero-indexed position of this image in the document/page */
    val imageIndex: Int,
    /** Page/slide number where image was found (1-indexed) */
    val pageNumber: Int? = null,
    /** Image width in pixels */
    val width: Int? = null,
    /** Image height in pixels */
    val height: Int? = null,
    /** Colorspace information (e.g., "RGB", "CMYK", "Gray") */
    val colorspace: String? = null,
    /** Bits per color component (e.g., 8, 16) */
    val bitsPerComponent: Int? = null,
    /** Whether this image is a mask image */
    val isMask: Boolean,
    /** Optional description of the image */
    val description: String? = null,
    /**
     * Nested OCR extraction result (if image was OCRed)
     *
     * When OCR is performed on this image, the result is embedded here
     * rather than in a separate collection, making the relationship explicit.
     */
    val ocrResult: ExtractionResult? = null,
    /**
     * Bounding box of the image on the page (PDF coordinates: x0=left, y0=bottom, x1=right, y1=top).
     * Only populated for PDF-extracted images when position data is available from the PDF extractor.
     */
    val boundingBox: BoundingBox? = null,
    /**
     * Original source path of the image within the document archive (e.g., "media/image1.png" in DOCX).
     * Used for rendering image references when the binary data is not extracted.
     */
    val sourcePath: String? = null,
    /**
     * Heuristic classification of what this image likely depicts.
     * `null` if classification was disabled or inconclusive.
     */
    val imageKind: ImageKind? = null,
    /** Confidence score for `image_kind`, in the range 0.0 to 1.0. */
    val kindConfidence: Float? = null,
    /**
     * Identifier shared across images that form a single logical figure
     * (e.g. all raster tiles of one technical drawing). `null` for singletons.
     */
    val clusterId: Int? = null,
) {
    /**
     * Structural equality over every property, with [data] compared by content.
     *
     * @param other object to compare with
     * @return `true` when [other] is an [ExtractedImage] whose properties all match
     */
    override fun equals(other: Any?): Boolean {
        if (this === other) return true
        if (other !is ExtractedImage) return false
        // Float.equals (rather than ==) keeps the semantics of a generated data-class
        // equals for boxed floats: NaN equals NaN, 0.0f differs from -0.0f.
        val sameConfidence = kindConfidence?.equals(other.kindConfidence) ?: (other.kindConfidence == null)
        // Cheap scalar comparisons run first; the byte-by-byte check is last.
        return format == other.format &&
            imageIndex == other.imageIndex &&
            pageNumber == other.pageNumber &&
            width == other.width &&
            height == other.height &&
            colorspace == other.colorspace &&
            bitsPerComponent == other.bitsPerComponent &&
            isMask == other.isMask &&
            description == other.description &&
            ocrResult == other.ocrResult &&
            boundingBox == other.boundingBox &&
            sourcePath == other.sourcePath &&
            imageKind == other.imageKind &&
            sameConfidence &&
            clusterId == other.clusterId &&
            data.contentEquals(other.data)
    }

    /** Hash consistent with [equals]: [data] contributes its content hash, not its identity hash. */
    override fun hashCode(): Int {
        var result = data.contentHashCode()
        result = 31 * result + format.hashCode()
        result = 31 * result + imageIndex
        result = 31 * result + (pageNumber ?: 0)
        result = 31 * result + (width ?: 0)
        result = 31 * result + (height ?: 0)
        result = 31 * result + (colorspace?.hashCode() ?: 0)
        result = 31 * result + (bitsPerComponent ?: 0)
        result = 31 * result + isMask.hashCode()
        result = 31 * result + (description?.hashCode() ?: 0)
        result = 31 * result + (ocrResult?.hashCode() ?: 0)
        result = 31 * result + (boundingBox?.hashCode() ?: 0)
        result = 31 * result + (sourcePath?.hashCode() ?: 0)
        result = 31 * result + (imageKind?.hashCode() ?: 0)
        result = 31 * result + (kindConfidence?.hashCode() ?: 0)
        result = 31 * result + (clusterId ?: 0)
        return result
    }
}

View File

@@ -0,0 +1,35 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/** Image metadata extracted from an image file. */
data class ExtractedImageMetadata(
/** Image width in pixels */
val width: Int,
/** Image height in pixels */
val height: Int,
/** Image format (e.g., "PNG", "JPEG") */
val format: String,
/** EXIF data if available; defaults to an empty map. */
val exifData: Map<String, String> = emptyMap(),
)

View File

@@ -0,0 +1,41 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
* A URI extracted from a document.
*
* Represents any link, reference, or resource pointer found during extraction.
* The `kind` field classifies the URI semantically, while `label` carries
* optional human-readable display text.
*
* `url` and `kind` are required; `label` and `page` default to `null`.
*/
data class ExtractedUri(
/** The URL or path string. */
val url: String,
/** Optional display text / label for the link. */
val label: String? = null,
/** Optional page number where the URI was found (1-indexed). */
val page: Int? = null,
/** Semantic classification of the URI. */
val kind: UriKind,
)

View File

@@ -0,0 +1,275 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Main extraction configuration.
 *
 * This struct contains all configuration options for the extraction process.
 * It can be loaded from TOML, YAML, or JSON files, or created programmatically.
 */
data class ExtractionConfig(
    /** Enable caching of extraction results */
    val useCache: Boolean = true,
    /** Enable quality post-processing */
    val enableQualityProcessing: Boolean = true,
    /** OCR configuration (None = OCR disabled) */
    val ocr: OcrConfig? = null,
    /** Force OCR even for searchable PDFs */
    val forceOcr: Boolean = false,
    /**
     * Force OCR on specific pages only (1-indexed page numbers, must be >= 1).
     *
     * When set, only the listed pages are OCR'd regardless of text layer quality.
     * Unlisted pages use native text extraction. Ignored when `force_ocr` is `true`.
     * Only applies to PDF documents. Duplicates are automatically deduplicated.
     * An `ocr` config is recommended for backend/language selection; defaults are used if absent.
     */
    val forceOcrPages: List<Int>? = null,
    /**
     * Disable OCR entirely, even for images.
     *
     * When `true`, OCR is skipped for all document types. Images return metadata
     * only (dimensions, format, EXIF) without text extraction. PDFs use only
     * native text extraction without OCR fallback.
     *
     * Cannot be `true` simultaneously with `force_ocr`.
     *
     * *Added in v4.7.0.*
     */
    val disableOcr: Boolean = false,
    /** Text chunking configuration (None = chunking disabled) */
    val chunking: ChunkingConfig? = null,
    /**
     * Content filtering configuration (None = use extractor defaults).
     *
     * Controls whether document "furniture" (headers, footers, watermarks,
     * repeating text) is included in or stripped from extraction results.
     * See `ContentFilterConfig` for per-field documentation.
     */
    val contentFilter: ContentFilterConfig? = null,
    /** Image extraction configuration (None = no image extraction) */
    val images: ImageExtractionConfig? = null,
    /** PDF-specific options (None = use defaults) */
    val pdfOptions: PdfConfig? = null,
    /** Token reduction configuration (None = no token reduction) */
    val tokenReduction: TokenReductionOptions? = null,
    /** Language detection configuration (None = no language detection) */
    val languageDetection: LanguageDetectionConfig? = null,
    /** Page extraction configuration (None = no page tracking) */
    val pages: PageConfig? = null,
    /** Keyword extraction configuration (None = no keyword extraction) */
    val keywords: KeywordConfig? = null,
    /** Post-processor configuration (None = use defaults) */
    val postprocessor: PostProcessorConfig? = null,
    /**
     * HTML to Markdown conversion options (None = use defaults)
     *
     * Configure how HTML documents are converted to Markdown, including heading styles,
     * list formatting, code block styles, and preprocessing options.
     */
    val htmlOptions: String? = null,
    /**
     * Styled HTML output configuration.
     *
     * When set alongside `output_format = OutputFormat.Html`, the extraction
     * pipeline uses `StyledHtmlRenderer`
     * which emits stable `kb-*` CSS class hooks on every structural element
     * and optionally embeds theme CSS or user-supplied CSS in a `<style>` block.
     *
     * When `null`, the existing plain comrak-based HTML renderer is used.
     */
    val htmlOutput: HtmlOutputConfig? = null,
    /**
     * Default per-file timeout in seconds for batch extraction.
     *
     * When set, each file in a batch will be canceled after this duration
     * unless overridden by `FileExtractionConfig.timeout_secs`.
     *
     * Defaults to `Some(60)` to prevent pathological files (e.g. deeply
     * nested archives, documents with millions of cells) from running
     * indefinitely and exhausting caller resources. Set to `null` to
     * disable the timeout for trusted input or long-running workloads.
     *
     * NOTE(review): this binding defaults to `null`, which per the text above
     * disables the timeout rather than applying the documented 60 seconds.
     * Confirm whether `null` is forwarded as-is or treated as "unset".
     */
    val extractionTimeoutSecs: Long? = null,
    /**
     * Maximum concurrent extractions in batch operations (None = (num_cpus × 1.5).ceil()).
     *
     * Limits parallelism to prevent resource exhaustion when processing
     * large batches. Defaults to (num_cpus × 1.5).ceil() when not set.
     */
    val maxConcurrentExtractions: Long? = null,
    /**
     * Result structure format
     *
     * Controls whether results are returned in unified format (default) with all
     * content in the `content` field, or element-based format with semantic
     * elements (for Unstructured-compatible output).
     */
    val resultFormat: ResultFormat = ResultFormat.UNIFIED,
    /**
     * Security limits for archive extraction.
     *
     * Controls maximum archive size, compression ratio, file count, and other
     * security thresholds to prevent decompression bomb attacks. Also caps
     * nesting depth, iteration count, entity / token length, total
     * content size, and table cell count for every extraction path that
     * ingests user-controlled bytes.
     * When `null`, default limits are used.
     */
    val securityLimits: SecurityLimits? = null,
    /**
     * Maximum uncompressed size in bytes for a single embedded file before
     * recursive extraction is attempted (default: 50 MiB).
     *
     * Applies to embedded objects inside OOXML containers (DOCX, PPTX) and
     * to email attachments processed via recursive extraction. Files that
     * exceed this limit are skipped with a `ProcessingWarning` rather than
     * passed to the extraction pipeline, preventing a single oversized
     * embedded object from consuming unbounded memory or time.
     *
     * Set to `null` to disable the per-embedded-file cap (falls back to
     * `security_limits.max_archive_size` as the only guard).
     *
     * NOTE(review): this binding defaults to `null`, which per the text above
     * disables the cap rather than applying the documented 50 MiB. Confirm
     * whether `null` is forwarded as-is or treated as "unset".
     */
    val maxEmbeddedFileBytes: Long? = null,
    /**
     * Content text format (default: Plain).
     *
     * Controls the format of the extracted content:
     *
     * - `Plain`: Raw extracted text (default)
     * - `Markdown`: Markdown formatted output
     * - `Djot`: Djot markup format (requires djot feature)
     * - `Html`: HTML formatted output
     *
     * When set to a structured format, extraction results will include
     * formatted output. The `formatted_content` field may be populated
     * when format conversion is applied.
     */
    val outputFormat: OutputFormat = OutputFormat.Plain,
    /**
     * Layout detection configuration (None = layout detection disabled).
     *
     * When set, PDF pages and images are analyzed for document structure
     * (headings, code, formulas, tables, figures, etc.) using RT-DETR models
     * via ONNX Runtime. For PDFs, layout hints override paragraph classification
     * in the markdown pipeline. For images, per-region OCR is performed with
     * markdown formatting based on detected layout classes.
     * Requires the `layout-detection` feature to run inference; the field is
     * present whenever the `layout-types` feature is active (which includes
     * `layout-detection` as well as the no-ORT target groups).
     */
    val layout: LayoutDetectionConfig? = null,
    /**
     * Run layout detection on the non-OCR PDF markdown path.
     *
     * When `true` and `layout` is `Some(_)`, layout regions inform heading,
     * table, list, and figure detection in the structure pipeline that would
     * otherwise rely on font-clustering heuristics alone. Significantly
     * improves SF1 (structural F1) at the cost of inference latency
     * (~150-300ms/page CPU, ~20-50ms/page GPU). Default: `false`.
     * Requires the `layout-detection` feature.
     */
    val useLayoutForMarkdown: Boolean = false,
    /**
     * Enable structured document tree output.
     *
     * When true, populates the `document` field on `ExtractionResult` with a
     * hierarchical `DocumentStructure` containing heading-driven section nesting,
     * table grids, content layer classification, and inline annotations.
     *
     * Independent of `result_format` — can be combined with Unified or ElementBased.
     */
    val includeDocumentStructure: Boolean = false,
    /**
     * Hardware acceleration configuration for ONNX Runtime models.
     *
     * Controls execution provider selection for layout detection and embedding
     * models. When `null`, uses platform defaults (CoreML on macOS, CUDA on
     * Linux, CPU on Windows).
     */
    val acceleration: AccelerationConfig? = null,
    /**
     * Cache namespace for tenant isolation.
     *
     * When set, cache entries are stored under `{cache_dir}/{namespace}/`.
     * Must be alphanumeric, hyphens, or underscores only (max 64 chars).
     * Different namespaces have isolated cache spaces on the same filesystem.
     */
    val cacheNamespace: String? = null,
    /**
     * Per-request cache TTL in seconds.
     *
     * Overrides the global `max_age_days` for this specific extraction.
     * When `0`, caching is completely skipped (no read or write).
     * When `null`, the global TTL applies.
     */
    val cacheTtlSecs: Long? = null,
    /**
     * Email extraction configuration (None = use defaults).
     *
     * Currently supports configuring the fallback codepage for MSG files
     * that do not specify one. See `EmailConfig` for details.
     */
    val email: EmailConfig? = null,
    /**
     * Concurrency limits for constrained environments (None = use defaults).
     *
     * Controls Rayon thread pool size, ONNX Runtime intra-op threads, and
     * (when `max_concurrent_extractions` is unset) the batch concurrency
     * semaphore. See `ConcurrencyConfig` for details.
     */
    val concurrency: String? = null,
    /**
     * Maximum recursion depth for archive extraction (default: 3).
     * Set to 0 to disable recursive extraction (legacy behavior).
     *
     * The default here is 3 to match the documented default; a default of 0
     * would silently disable recursive extraction for every caller that
     * does not set this property.
     */
    val maxArchiveDepth: Long = 3L,
    /**
     * Tree-sitter language pack configuration (None = tree-sitter disabled).
     *
     * When set, enables code file extraction using tree-sitter parsers.
     * Controls grammar download behavior and code analysis options.
     */
    val treeSitter: TreeSitterConfig? = null,
    /**
     * Structured extraction via LLM (None = disabled).
     *
     * When set, the extracted document content is sent to an LLM with the
     * provided JSON schema. The structured response is stored in
     * `ExtractionResult.structured_output`.
     */
    val structuredExtraction: StructuredExtractionConfig? = null,
    /**
     * Cancellation token for this extraction (None = no external cancellation).
     *
     * Pass a `CancellationToken` clone here and call `CancellationToken.cancel`
     * from another thread / task to abort the extraction in progress. The extractor
     * checks the token at safe checkpoints (before lock acquisition, between pages,
     * between batch items) and returns `KreuzbergError.Cancelled` when set.
     *
     * The field is excluded from serialization because `CancellationToken` is a
     * runtime handle, not a configuration value.
     *
     * NOTE(review): this binding types the property as `String?` and no
     * serialization-exclusion annotation is visible on it — confirm how the
     * token is actually passed and whether it is serialized.
     */
    val cancelToken: String? = null,
)

View File

@@ -0,0 +1,53 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
* The complete diff between two `ExtractionResult` values.
*
* The list properties default to empty lists; `metadataChanged` and
* `embeddedChanges` have no default and must always be supplied.
*/
data class ExtractionDiff(
/**
* Unified-diff hunks for the `content` field.
*
* Empty when the content is identical.
*/
val contentDiff: List<DiffHunk> = emptyList(),
/** Tables present in `b` but not in `a` (by index position, excess right-side tables). */
val tablesAdded: List<Table> = emptyList(),
/** Tables present in `a` but not in `b` (by index position, excess left-side tables). */
val tablesRemoved: List<Table> = emptyList(),
/** Cell-level changes for table pairs that share the same index and dimensions. */
val tablesChanged: List<TableDiff> = emptyList(),
/**
* Metadata difference, encoded as a JSON object with three top-level keys:
* `added` (keys present in `b` but not `a`), `removed` (keys present in `a`
* but not `b`), and `changed` (keys whose values differ — each entry is
* `{ "from": <value-in-a>, "to": <value-in-b> }`).
*
* This is NOT RFC 6902 JSON Patch — we deliberately chose a flatter shape
* to avoid pulling in a json-patch crate. If you need RFC 6902 semantics
* (with JSON Pointer paths) feed `a.metadata` and `b.metadata` to your
* preferred json-patch impl directly.
*
* NOTE(review): typed as `Any` in this binding; the runtime type presumably
* depends on how the JSON is deserialized — confirm before casting.
*/
val metadataChanged: Any,
/** Changes to embedded archive children. */
val embeddedChanges: EmbeddedChanges,
)

View File

@@ -0,0 +1,51 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * How the extracted text was produced.
 *
 * Each constant has a lowercase wire name, exposed through [toWire] and
 * resolved by [fromWire].
 */
enum class ExtractionMethod {
    @com.fasterxml.jackson.annotation.JsonProperty("native")
    NATIVE,

    @com.fasterxml.jackson.annotation.JsonProperty("ocr")
    OCR,

    @com.fasterxml.jackson.annotation.JsonProperty("mixed")
    MIXED;

    /** Returns the wire name of this method; this is the single mapping table for the enum. */
    @com.fasterxml.jackson.annotation.JsonValue
    fun toWire(): String {
        return when (this) {
            NATIVE -> "native"
            OCR -> "ocr"
            MIXED -> "mixed"
        }
    }

    companion object {
        /**
         * Resolves a wire name to its constant by matching against [toWire].
         *
         * @param value wire name, matched exactly (case-sensitive)
         * @throws IllegalArgumentException if [value] is not a known wire name
         */
        @com.fasterxml.jackson.annotation.JsonCreator
        @JvmStatic
        fun fromWire(value: String): ExtractionMethod {
            val match = values().firstOrNull { candidate -> candidate.toWire() == value }
            return match ?: throw IllegalArgumentException("Unknown ExtractionMethod value: $value")
        }
    }
}

View File

@@ -0,0 +1,229 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * The general-purpose result produced by the core extraction API.
 *
 * Every extraction function returns this type. Only [content], [mimeType], [metadata],
 * [tables] and [processingWarnings] have non-null defaults; every other property is
 * `null` unless the corresponding feature produced data.
 */
data class ExtractionResult(
    /** Extracted content text; defaults to an empty string. */
    val content: String = "",
    /** MIME type string; defaults to an empty string. */
    val mimeType: String = "",
    /** Metadata attached to this result; defaults to `Metadata()`. */
    val metadata: Metadata = Metadata(),
    /**
     * Strategy that produced the returned text.
     *
     * Set when the extractor can reliably tell apart native text extraction, OCR-only
     * extraction, and mixed native/OCR output.
     */
    val extractionMethod: ExtractionMethod? = null,
    /** Tables carried by this result; defaults to an empty list. */
    val tables: List<Table> = emptyList(),
    /** Detected languages, or `null` when none are reported. */
    val detectedLanguages: List<String>? = null,
    /**
     * Text chunks, present when chunking is enabled.
     *
     * With a chunking configuration the content is split into overlapping chunks. Each
     * chunk carries its text, optional embeddings (if enabled), and position metadata.
     */
    val chunks: List<Chunk>? = null,
    /**
     * Images extracted from the document.
     *
     * Populated when image extraction is enabled via `ImageExtractionConfig`. Holds every
     * image found, with raw data and metadata; an image may carry a nested `ocr_result`
     * when OCR was run on it.
     */
    val images: List<ExtractedImage>? = null,
    /**
     * Per-page content, present when page extraction is enabled.
     *
     * The document is split per page, with tables and images mapped to their pages.
     */
    val pages: List<PageContent>? = null,
    /**
     * Semantic elements, present when the element-based result format is enabled.
     *
     * When `result_format` is ElementBased this holds elements with type classification,
     * unique identifiers, and metadata for Unstructured-compatible processing.
     */
    val elements: List<Element>? = null,
    /**
     * Rich Djot content structure (only when extracting Djot documents).
     *
     * With structured extraction enabled, this holds the full semantic structure:
     *
     * - Block-level elements with nesting
     * - Inline formatting with attributes
     * - Links, images, footnotes
     * - Math expressions
     * - Complete attribute information
     *
     * `content` still holds plain text for backward compatibility.
     *
     * Always `null` for non-Djot documents.
     */
    val djotContent: DjotContent? = null,
    /**
     * OCR elements with full spatial and confidence metadata.
     *
     * When OCR runs with element extraction enabled, this holds the structured
     * representation of the detected text:
     *
     * - Bounding geometry (rectangles or quadrilaterals)
     * - Confidence scores (detection and recognition)
     * - Rotation information
     * - Hierarchical relationships (Tesseract only)
     *
     * It keeps metadata that would otherwise be lost when converting to plain text or
     * markdown output.
     *
     * Only populated when `OcrElementConfig.include_elements` is true.
     */
    val ocrElements: List<OcrElement>? = null,
    /**
     * Structured document tree, present when document structure extraction is enabled.
     *
     * When `include_document_structure` is true in `ExtractionConfig`, this holds the full
     * hierarchical representation of the document:
     *
     * - Heading-driven section nesting
     * - Table grids with cell-level metadata
     * - Content layer classification (body, header, footer, footnote)
     * - Inline text annotations (formatting, links)
     * - Bounding boxes and page numbers
     *
     * Independent of `result_format`; it can be combined with Unified or ElementBased.
     */
    val document: DocumentStructure? = null,
    /**
     * Extracted keywords, present when keyword extraction is enabled.
     *
     * With RAKE or YAKE configured, this holds the keywords with scores, algorithm info,
     * and position data. Previously stored in `metadata.additional["keywords"]`.
     */
    val extractedKeywords: List<Keyword>? = null,
    /**
     * Document quality score from quality analysis.
     *
     * A value between 0.0 and 1.0 describing overall text quality. Previously stored in
     * `metadata.additional["quality_score"]`.
     */
    val qualityScore: Double? = null,
    /**
     * Non-fatal warnings gathered during processing pipeline stages.
     *
     * Records errors from optional pipeline features (embedding, chunking, language
     * detection, output formatting) that do not stop extraction but may signal degraded
     * results. Previously stored as individual keys in `metadata.additional`.
     */
    val processingWarnings: List<ProcessingWarning> = emptyList(),
    /**
     * PDF annotations extracted from the document.
     *
     * Populated when annotation extraction is enabled via `PdfConfig.extract_annotations`.
     * Holds text notes, highlights, links, stamps, and other PDF annotations.
     */
    val annotations: List<PdfAnnotation>? = null,
    /**
     * Nested extraction results from archive contents.
     *
     * When extracting archives, each processable file inside yields its own full
     * extraction result. `null` for non-archive formats. Recursion depth is controlled by
     * `max_archive_depth` in the config.
     */
    val children: List<ArchiveEntry>? = null,
    /**
     * URIs/links discovered during extraction.
     *
     * Holds hyperlinks, image references, citations, email addresses, and other URI-like
     * references. Always extracted when present in the source document.
     */
    val uris: List<ExtractedUri>? = null,
    /**
     * Tracked changes embedded in the source document.
     *
     * Filled by per-format extractors that understand change-tracking metadata (DOCX
     * `w:ins`/`w:del`/`w:rPrChange`, ODT `text:change-*`, …). Each extractor leaves this
     * `null` until its format-specific implementation exists. Extractors that do fill it
     * follow the "accepted-changes" convention: inserted text appears in `content`,
     * deleted text does not, and the revision list is the separate audit trail.
     */
    val revisions: List<DocumentRevision>? = null,
    /**
     * Structured output from LLM-based JSON schema extraction.
     *
     * When `structured_extraction` is configured in `ExtractionConfig`, the extracted
     * content is sent to a VLM with the supplied JSON schema. The parsed response is
     * stored here as a JSON value matching that schema.
     */
    val structuredOutput: Any? = null,
    /**
     * Code intelligence results from tree-sitter analysis.
     *
     * Populated when extracting source code files with the `tree-sitter` feature. Holds
     * metrics, structural analysis, imports/exports, comments, docstrings, symbols,
     * diagnostics, and optionally chunked code segments.
     *
     * Kept as an opaque JSON value so every language binding (Go, Java, C#, …) can
     * deserialize it as a raw JSON object instead of a typed struct. The underlying type
     * is `tree_sitter_language_pack.ProcessResult`.
     */
    val codeIntelligence: Any? = null,
    /**
     * LLM token usage and cost data for all LLM calls made during this extraction.
     *
     * One entry per LLM call. Several entries appear when VLM OCR, structured extraction,
     * or LLM embeddings run within the same extraction.
     *
     * `null` when no LLM was used.
     */
    val llmUsage: List<LlmUsage>? = null,
    /**
     * Pre-rendered content in the requested output format.
     *
     * Filled during `derive_extraction_result` before tree derivation consumes element
     * data. `apply_output_format` swaps it into `content` at the end of the pipeline,
     * after post-processors have worked on plain text.
     */
    val formattedContent: String? = null,
    /**
     * Structured hOCR document for the OCR+layout pipeline.
     *
     * When tesseract emits hOCR, the parsed `InternalDocument` carries paragraph structure
     * with bounding boxes and confidence scores. The layout classification step enriches
     * these elements before final rendering.
     */
    val ocrInternalDocument: String? = null,
)

View File

@@ -0,0 +1,30 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Metadata for FictionBook (FB2) documents.
 *
 * Every property has a default, so `FictionBookMetadata()` is a valid empty value.
 */
data class FictionBookMetadata(
    /** Genre entries; empty by default. */
    val genres: List<String> = emptyList(),
    /** Sequence entries; empty by default. */
    val sequences: List<String> = emptyList(),
    /** Annotation text, or `null` when absent. */
    val annotation: String? = null,
)

View File

@@ -0,0 +1,100 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Per-file overrides of the extraction configuration for batch processing.
 *
 * Every property is nullable; `null` means "use the batch-level default". The type is
 * used with `batch_extract_files` and `batch_extract_bytes` so that one batch can mix
 * different extraction settings.
 *
 * # Excluded Fields
 *
 * These `ExtractionConfig` fields apply to the whole batch and cannot be overridden per
 * file:
 *
 * - `max_concurrent_extractions` — controls batch parallelism
 * - `use_cache` — global caching policy
 * - `acceleration` — shared ONNX execution provider
 * - `security_limits` — global archive security policy
 */
data class FileExtractionConfig(
    /** Per-file override for quality post-processing. */
    val enableQualityProcessing: Boolean? = null,
    /** Per-file override for the OCR configuration (`null` = use the batch default). */
    val ocr: OcrConfig? = null,
    /** Per-file override for force OCR. */
    val forceOcr: Boolean? = null,
    /** Per-file override for the pages to force OCR on (1-indexed page numbers). */
    val forceOcrPages: List<Int>? = null,
    /** Per-file override for disabling OCR. */
    val disableOcr: Boolean? = null,
    /** Per-file override for the chunking configuration. */
    val chunking: ChunkingConfig? = null,
    /** Per-file override for the content filtering configuration. */
    val contentFilter: ContentFilterConfig? = null,
    /** Per-file override for the image extraction configuration. */
    val images: ImageExtractionConfig? = null,
    /** Per-file override for PDF options. */
    val pdfOptions: PdfConfig? = null,
    /** Per-file override for token reduction. */
    val tokenReduction: TokenReductionOptions? = null,
    /** Per-file override for language detection. */
    val languageDetection: LanguageDetectionConfig? = null,
    /** Per-file override for page extraction. */
    val pages: PageConfig? = null,
    /** Per-file override for keyword extraction. */
    val keywords: KeywordConfig? = null,
    /** Per-file override for the post-processor. */
    val postprocessor: PostProcessorConfig? = null,
    /** Per-file override for HTML conversion options. */
    val htmlOptions: String? = null,
    /** Per-file override for the result format. */
    val resultFormat: ResultFormat? = null,
    /** Per-file override for the output content format. */
    val outputFormat: OutputFormat? = null,
    /** Per-file override for document structure output. */
    val includeDocumentStructure: Boolean? = null,
    /** Per-file override for layout detection. */
    val layout: LayoutDetectionConfig? = null,
    /**
     * Per-file extraction timeout in seconds.
     *
     * When set, extraction of this file is canceled after the given duration. A file that
     * times out yields an error result and does not affect the other files in the batch.
     */
    val timeoutSecs: Long? = null,
    /** Per-file override for the tree-sitter configuration. */
    val treeSitter: TreeSitterConfig? = null,
    /**
     * Per-file override for the structured extraction configuration.
     *
     * When set, LLM-based structured extraction with a JSON schema is enabled for this
     * file: the extracted content is sent to a VLM/LLM and the response is parsed against
     * the supplied schema.
     */
    val structuredExtraction: StructuredExtractionConfig? = null,
)

View File

@@ -0,0 +1,31 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/** A footnote in a Djot document. */
data class Footnote(
    /** Label of the footnote. */
    val label: String,
    /** Content blocks of the footnote; empty by default. */
    val content: List<FormattedBlock> = emptyList(),
)

View File

@@ -0,0 +1,227 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Format-specific metadata, modelled as a discriminated union.
 *
 * An extraction result carries at most one format type, which keeps the metadata
 * type-safe and free of nested optionals. JSON conversion is delegated to
 * `FormatMetadataDeserializer` and `FormatMetadataSerializer`.
 *
 * Every variant except [Code] wraps a typed metadata object in `metadata`; [Code] wraps a
 * plain `String` in `value`.
 */
@com.fasterxml.jackson.databind.annotation.JsonDeserialize(using = FormatMetadataDeserializer::class)
@com.fasterxml.jackson.databind.annotation.JsonSerialize(using = FormatMetadataSerializer::class)
sealed class FormatMetadata {
    /** Variant carrying [PdfMetadata]. */
    data class Pdf(val metadata: PdfMetadata) : FormatMetadata()

    /** Variant carrying [DocxMetadata]. */
    data class Docx(val metadata: DocxMetadata) : FormatMetadata()

    /** Variant carrying [ExcelMetadata]. */
    data class Excel(val metadata: ExcelMetadata) : FormatMetadata()

    /** Variant carrying [EmailMetadata]. */
    data class Email(val metadata: EmailMetadata) : FormatMetadata()

    /** Variant carrying [PptxMetadata]. */
    data class Pptx(val metadata: PptxMetadata) : FormatMetadata()

    /** Variant carrying [ArchiveMetadata]. */
    data class Archive(val metadata: ArchiveMetadata) : FormatMetadata()

    /** Variant carrying [ImageMetadata]. */
    data class Image(val metadata: ImageMetadata) : FormatMetadata()

    /** Variant carrying [XmlMetadata]. */
    data class Xml(val metadata: XmlMetadata) : FormatMetadata()

    /** Variant carrying [TextMetadata]. */
    data class Text(val metadata: TextMetadata) : FormatMetadata()

    /** Variant carrying [HtmlMetadata]. */
    data class Html(val metadata: HtmlMetadata) : FormatMetadata()

    /** Variant carrying [OcrMetadata]. */
    data class Ocr(val metadata: OcrMetadata) : FormatMetadata()

    /** Variant carrying [CsvMetadata]. */
    data class Csv(val metadata: CsvMetadata) : FormatMetadata()

    /** Variant carrying [BibtexMetadata]. */
    data class Bibtex(val metadata: BibtexMetadata) : FormatMetadata()

    /** Variant carrying [CitationMetadata]. */
    data class Citation(val metadata: CitationMetadata) : FormatMetadata()

    /** Variant carrying [FictionBookMetadata]. */
    data class FictionBook(val metadata: FictionBookMetadata) : FormatMetadata()

    /** Variant carrying [DbfMetadata]. */
    data class Dbf(val metadata: DbfMetadata) : FormatMetadata()

    /** Variant carrying [JatsMetadata]. */
    data class Jats(val metadata: JatsMetadata) : FormatMetadata()

    /** Variant carrying [EpubMetadata]. */
    data class Epub(val metadata: EpubMetadata) : FormatMetadata()

    /** Variant carrying [PstMetadata]. */
    data class Pst(val metadata: PstMetadata) : FormatMetadata()

    /** Variant carrying a plain string value instead of a typed metadata object. */
    data class Code(val value: String) : FormatMetadata()
}
/**
 * Jackson deserializer for the [FormatMetadata] union.
 *
 * The `format_type` field of the incoming JSON object selects the variant; the remaining
 * fields are the variant's payload.
 */
private class FormatMetadataDeserializer :
    com.fasterxml.jackson.databind.deser.std.StdDeserializer<FormatMetadata>(FormatMetadata::class.java) {
    /**
     * Reads one JSON object and converts it to the [FormatMetadata] variant named by its
     * `format_type` field.
     *
     * @throws com.fasterxml.jackson.databind.exc.MismatchedInputException if the input is
     *   not a JSON object.
     * @throws com.fasterxml.jackson.databind.exc.InvalidFormatException if `format_type`
     *   is missing or names no known variant.
     */
    @Suppress("LongMethod")
    override fun deserialize(
        parser: com.fasterxml.jackson.core.JsonParser,
        ctx: com.fasterxml.jackson.databind.DeserializationContext,
    ): FormatMetadata {
        // Read as a generic node first: a non-object input must surface as a Jackson
        // mapping error, not as a ClassCastException from a blind ObjectNode cast.
        val tree = parser.codec.readTree<com.fasterxml.jackson.databind.JsonNode>(parser)
        val node = tree as? com.fasterxml.jackson.databind.node.ObjectNode
            ?: throw com.fasterxml.jackson.databind.exc.MismatchedInputException.from(
                parser, FormatMetadata::class.java, "Expected a JSON object for FormatMetadata",
            )
        val tag = node.get("format_type")?.asText()
        // The payload is the object without its discriminator; copy so `node` stays intact.
        val payload = node.deepCopy().apply { remove("format_type") }
        return when (tag) {
            "pdf" -> FormatMetadata.Pdf(ctx.readPayload<PdfMetadata>(payload))
            "docx" -> FormatMetadata.Docx(ctx.readPayload<DocxMetadata>(payload))
            "excel" -> FormatMetadata.Excel(ctx.readPayload<ExcelMetadata>(payload))
            "email" -> FormatMetadata.Email(ctx.readPayload<EmailMetadata>(payload))
            "pptx" -> FormatMetadata.Pptx(ctx.readPayload<PptxMetadata>(payload))
            "archive" -> FormatMetadata.Archive(ctx.readPayload<ArchiveMetadata>(payload))
            "image" -> FormatMetadata.Image(ctx.readPayload<ImageMetadata>(payload))
            "xml" -> FormatMetadata.Xml(ctx.readPayload<XmlMetadata>(payload))
            "text" -> FormatMetadata.Text(ctx.readPayload<TextMetadata>(payload))
            "html" -> FormatMetadata.Html(ctx.readPayload<HtmlMetadata>(payload))
            "ocr" -> FormatMetadata.Ocr(ctx.readPayload<OcrMetadata>(payload))
            "csv" -> FormatMetadata.Csv(ctx.readPayload<CsvMetadata>(payload))
            "bibtex" -> FormatMetadata.Bibtex(ctx.readPayload<BibtexMetadata>(payload))
            "citation" -> FormatMetadata.Citation(ctx.readPayload<CitationMetadata>(payload))
            "fiction_book" -> FormatMetadata.FictionBook(ctx.readPayload<FictionBookMetadata>(payload))
            "dbf" -> FormatMetadata.Dbf(ctx.readPayload<DbfMetadata>(payload))
            "jats" -> FormatMetadata.Jats(ctx.readPayload<JatsMetadata>(payload))
            "epub" -> FormatMetadata.Epub(ctx.readPayload<EpubMetadata>(payload))
            "pst" -> FormatMetadata.Pst(ctx.readPayload<PstMetadata>(payload))
            // The payload is a JSON object, which Jackson cannot bind to String, so the
            // previous readTreeAsValue(payload, String::class.java) always failed. Keep
            // the payload as its JSON text instead.
            // NOTE(review): assumes Code.value is meant to hold the raw JSON of the
            // payload — confirm against the generator's intent.
            "code" -> FormatMetadata.Code(payload.toString())
            else -> throw com.fasterxml.jackson.databind.exc.InvalidFormatException(
                parser, "Unknown FormatMetadata tag", tag, FormatMetadata::class.java,
            )
        }
    }

    /** Binds [payload] to [T] through this context's `readTreeAsValue`. */
    private inline fun <reified T> com.fasterxml.jackson.databind.DeserializationContext.readPayload(
        payload: com.fasterxml.jackson.databind.node.ObjectNode,
    ): T = readTreeAsValue(payload, T::class.java)
}
/**
 * Jackson serializer for the [FormatMetadata] union.
 *
 * Each variant is written as the JSON object of its payload with an extra `format_type`
 * field naming the variant.
 */
private class FormatMetadataSerializer :
    com.fasterxml.jackson.databind.ser.std.StdSerializer<FormatMetadata>(FormatMetadata::class.java) {
    /**
     * Mapper used only when the generator's codec is not an `ObjectMapper`. Built once on
     * first use rather than on every `serialize` call.
     */
    private val fallbackMapper: com.fasterxml.jackson.databind.ObjectMapper by lazy {
        com.fasterxml.jackson.databind.ObjectMapper().findAndRegisterModules()
    }

    /**
     * Writes [value] as its payload object plus the `format_type` discriminator.
     *
     * @throws com.fasterxml.jackson.core.JsonProcessingException if a
     *   [FormatMetadata.Code] value is not valid JSON text, or is JSON but not an object.
     */
    @Suppress("LongMethod")
    override fun serialize(
        value: FormatMetadata,
        gen: com.fasterxml.jackson.core.JsonGenerator,
        provider: com.fasterxml.jackson.databind.SerializerProvider,
    ) {
        val mapper = (gen.codec as? com.fasterxml.jackson.databind.ObjectMapper) ?: fallbackMapper
        // Exhaustive over the sealed hierarchy: no `else`, so a new variant fails to compile here.
        val node: com.fasterxml.jackson.databind.node.ObjectNode = when (value) {
            is FormatMetadata.Pdf -> tagged(mapper, value.metadata, "pdf")
            is FormatMetadata.Docx -> tagged(mapper, value.metadata, "docx")
            is FormatMetadata.Excel -> tagged(mapper, value.metadata, "excel")
            is FormatMetadata.Email -> tagged(mapper, value.metadata, "email")
            is FormatMetadata.Pptx -> tagged(mapper, value.metadata, "pptx")
            is FormatMetadata.Archive -> tagged(mapper, value.metadata, "archive")
            is FormatMetadata.Image -> tagged(mapper, value.metadata, "image")
            is FormatMetadata.Xml -> tagged(mapper, value.metadata, "xml")
            is FormatMetadata.Text -> tagged(mapper, value.metadata, "text")
            is FormatMetadata.Html -> tagged(mapper, value.metadata, "html")
            is FormatMetadata.Ocr -> tagged(mapper, value.metadata, "ocr")
            is FormatMetadata.Csv -> tagged(mapper, value.metadata, "csv")
            is FormatMetadata.Bibtex -> tagged(mapper, value.metadata, "bibtex")
            is FormatMetadata.Citation -> tagged(mapper, value.metadata, "citation")
            is FormatMetadata.FictionBook -> tagged(mapper, value.metadata, "fiction_book")
            is FormatMetadata.Dbf -> tagged(mapper, value.metadata, "dbf")
            is FormatMetadata.Jats -> tagged(mapper, value.metadata, "jats")
            is FormatMetadata.Epub -> tagged(mapper, value.metadata, "epub")
            is FormatMetadata.Pst -> tagged(mapper, value.metadata, "pst")
            is FormatMetadata.Code -> taggedCode(mapper, value.value, gen)
        }
        mapper.writeTree(gen, node)
    }

    /** Converts [payload] to a JSON object and sets its `format_type` field to [tag]. */
    private fun tagged(
        mapper: com.fasterxml.jackson.databind.ObjectMapper,
        payload: Any,
        tag: String,
    ): com.fasterxml.jackson.databind.node.ObjectNode =
        mapper.valueToTree<com.fasterxml.jackson.databind.node.ObjectNode>(payload).put("format_type", tag)

    /**
     * Builds the tagged object for the `Code` variant from its string value.
     *
     * `valueToTree` on a String yields a text node, so the previous cast to `ObjectNode`
     * always threw `ClassCastException`. The string is parsed as JSON instead.
     * NOTE(review): assumes `Code.value` holds the JSON text of an object — confirm
     * against the generator's intent.
     */
    private fun taggedCode(
        mapper: com.fasterxml.jackson.databind.ObjectMapper,
        rawJson: String,
        gen: com.fasterxml.jackson.core.JsonGenerator,
    ): com.fasterxml.jackson.databind.node.ObjectNode {
        val parsed = mapper.readTree(rawJson) as? com.fasterxml.jackson.databind.node.ObjectNode
            ?: throw com.fasterxml.jackson.databind.JsonMappingException.from(
                gen, "FormatMetadata.Code value must be the JSON text of an object",
            )
        return parsed.put("format_type", "code")
    }
}

View File

@@ -0,0 +1,45 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * A block-level element of a Djot document.
 *
 * Models structural elements such as headings, paragraphs, lists, and code blocks.
 */
data class FormattedBlock(
    /** Kind of block element. */
    val blockType: BlockType,
    /** Heading level (1-6) for headings, or nesting level for lists. */
    val level: Long? = null,
    /** Inline content inside the block; empty by default. */
    val inlineContent: List<InlineElement> = emptyList(),
    /** Element attributes (classes, IDs, key-value pairs). */
    val attributes: String? = null,
    /** Language identifier, for code blocks. */
    val language: String? = null,
    /** Raw code content, for code blocks. */
    val code: String? = null,
    /** Nested blocks for containers (blockquotes, list items, divs); empty by default. */
    val children: List<FormattedBlock> = emptyList(),
)

View File

@@ -0,0 +1,54 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Enumeration whose constants map to the wire strings `"Bar"`, `"NoBar"`, `"Linear"` and
 * `"Skewed"`, used for JSON through [toWire] and [fromWire].
 */
enum class FracType {
    @com.fasterxml.jackson.annotation.JsonProperty("Bar")
    BAR,

    @com.fasterxml.jackson.annotation.JsonProperty("NoBar")
    NO_BAR,

    @com.fasterxml.jackson.annotation.JsonProperty("Linear")
    LINEAR,

    @com.fasterxml.jackson.annotation.JsonProperty("Skewed")
    SKEWED;

    /** Returns the wire string that represents this constant. */
    @com.fasterxml.jackson.annotation.JsonValue
    fun toWire(): String {
        return when (this) {
            BAR -> "Bar"
            NO_BAR -> "NoBar"
            LINEAR -> "Linear"
            SKEWED -> "Skewed"
        }
    }

    companion object {
        /**
         * Resolves a wire string to its constant (exact, case-sensitive match).
         *
         * @throws IllegalArgumentException if [value] is not a known wire string.
         */
        @com.fasterxml.jackson.annotation.JsonCreator
        @JvmStatic
        fun fromWire(value: String): FracType =
            FracType.values().firstOrNull { candidate -> candidate.toWire() == value }
                ?: throw IllegalArgumentException("Unknown FracType value: $value")
    }
}

View File

@@ -0,0 +1,41 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/** A single grid cell together with its position and span metadata. */
data class GridCell(
    /** Text content of the cell. */
    val content: String,
    /** Row position, zero-indexed. */
    val row: Int,
    /** Column position, zero-indexed. */
    val col: Int,
    /** How many rows the cell spans. */
    val rowSpan: Int,
    /** How many columns the cell spans. */
    val colSpan: Int,
    /** `true` when the cell is a header cell. */
    val isHeader: Boolean,
    /** Bounding box of the cell, when available. */
    val bbox: BoundingBox? = null,
)

View File

@@ -0,0 +1,37 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/** Metadata describing a header/heading element. */
data class HeaderMetadata(
    /** Header level, from 1 (h1) through 6 (h6). */
    val level: Byte,
    /** Normalized text content of the header. */
    val text: String,
    /** HTML `id` attribute, when present. */
    val id: String? = null,
    /** Depth of the header element in the document tree. */
    val depth: Int,
    /** Byte offset of the header in the original HTML document. */
    val htmlOffset: Int,
)

View File

@@ -0,0 +1,36 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Heading context of a chunk inside a Markdown document.
 *
 * Holds the chain of headings leading from the document root to the chunk's section.
 */
data class HeadingContext(
    /**
     * Heading hierarchy from the document root down to this chunk's section. Index 0 is
     * the outermost heading (h1); the last element is the most specific. Empty by default.
     */
    val headings: List<HeadingLevel> = emptyList(),
)

View File

@@ -0,0 +1,31 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
* A single heading in a heading hierarchy.
*
* @property level Heading depth (1 = h1, 2 = h2, etc.).
* @property text The text content of the heading.
*/
data class HeadingLevel(
/** Heading depth (1 = h1, 2 = h2, etc.), held in a [Byte]. */
val level: Byte,
/** The text content of the heading. */
val text: String,
)

View File

@@ -0,0 +1,56 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
* A text block with hierarchy level assignment.
*
* Represents a block of text together with the semantic heading information
* extracted from font size clustering and hierarchical analysis.
*
* @property text The text content of this block.
* @property fontSize The font size of the text in this block.
* @property level Hierarchy level label: `"h1"` through `"h6"`, or `"body"`.
* @property bbox Optional bounding box as (left, top, right, bottom) in PDF units;
* defaults to `null`.
*/
data class HierarchicalBlock(
/** The text content of this block. */
val text: String,
/** The font size of the text in this block. */
val fontSize: Float,
/**
* The hierarchy level of this block (H1-H6 or Body).
*
* Levels correspond to HTML heading tags:
*
* - "h1": Top-level heading
* - "h2": Secondary heading
* - "h3": Tertiary heading
* - "h4": Quaternary heading
* - "h5": Quinary heading
* - "h6": Senary heading
* - "body": Body text (no heading level)
*
* NOTE(review): the level is a plain [String]; nothing in this declaration
* restricts it to the values listed above.
*/
val level: String,
/**
* Bounding box information for the block.
*
* Contains coordinates as (left, top, right, bottom) in PDF units.
* Defaults to `null`.
*
* NOTE(review): presumably a four-element list when present; the [List]
* type does not enforce the length, so confirm with the producer.
*/
val bbox: List<Float>? = null,
)

View File

@@ -0,0 +1,52 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Hierarchy extraction configuration for PDF text structure analysis.
 *
 * Enables extraction of document hierarchy levels (H1-H6) based on font size
 * clustering and semantic analysis. When enabled, hierarchical blocks are
 * included in page content.
 *
 * @property enabled Whether hierarchy extraction is enabled; defaults to `true`.
 * @property kClusters Number of font size clusters (1-7); defaults to 6.
 * @property includeBbox Whether hierarchy blocks carry bounding boxes; defaults to `true`.
 * @property ocrCoverageThreshold Optional OCR coverage threshold (0.0-1.0); defaults to `null`.
 */
data class HierarchyConfig(
    /** Enable hierarchy extraction. */
    val enabled: Boolean = true,
    /**
     * Number of font size clusters to use for hierarchy levels (1-7).
     *
     * Default: 6, which provides H1-H6 heading levels with body text.
     * Larger values create more fine-grained hierarchy levels.
     *
     * The range is documented only; this declaration does not validate it.
     */
    val kClusters: Long = 6L,
    /** Include bounding box information in hierarchy blocks. */
    val includeBbox: Boolean = true,
    /**
     * OCR coverage threshold for smart OCR triggering (0.0-1.0).
     *
     * Determines when OCR should be triggered based on text block coverage.
     * OCR is triggered when text blocks cover less than this fraction of the page.
     *
     * The declared default is `null`. The documented effective threshold is
     * 0.5 (trigger OCR if less than 50% of the page has text).
     * NOTE(review): presumably the 0.5 fallback is applied by the consumer
     * when this is `null`; confirm against the extraction pipeline.
     */
    val ocrCoverageThreshold: Float? = null,
)

View File

@@ -0,0 +1,71 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
* HTML metadata extracted from HTML documents.
*
* Includes document-level metadata, Open Graph data, Twitter Card metadata,
* and extracted structural elements (headers, links, images, structured data).
*
* Every property has a default: optional scalar values default to `null`, and
* list and map properties default to empty collections.
*/
data class HtmlMetadata(
/** Document title from the `<title>` tag; defaults to `null`. */
val title: String? = null,
/** Document description from the `<meta name="description">` tag; defaults to `null`. */
val description: String? = null,
/** Document keywords from the `<meta name="keywords">` tag, split on commas; defaults to an empty list. */
val keywords: List<String> = emptyList(),
/** Document author from the `<meta name="author">` tag; defaults to `null`. */
val author: String? = null,
/** Canonical URL from the `<link rel="canonical">` tag; defaults to `null`. */
val canonicalUrl: String? = null,
/** Base URL from the `<base href="">` tag, used for resolving relative URLs; defaults to `null`. */
val baseHref: String? = null,
/** Document language from the `lang` attribute; defaults to `null`. */
val language: String? = null,
/** Document text direction from the `dir` attribute; defaults to `null`. */
val textDirection: TextDirection? = null,
/**
* Open Graph metadata (og:* properties) for social media.
* Keys are like "title", "description", "image", "url", etc.
* Defaults to an empty map.
*/
val openGraph: Map<String, String> = emptyMap(),
/**
* Twitter Card metadata (twitter:* properties).
* Keys are like "card", "site", "creator", "title", "description", "image", etc.
* Defaults to an empty map.
*/
val twitterCard: Map<String, String> = emptyMap(),
/**
* Additional meta tags not covered by the specific fields above.
* Keys are meta name/property attributes, values are their content.
* Defaults to an empty map.
*/
val metaTags: Map<String, String> = emptyMap(),
/** Extracted header elements with hierarchy; defaults to an empty list. */
val headers: List<HeaderMetadata> = emptyList(),
/** Extracted hyperlinks with type classification; defaults to an empty list. */
val links: List<LinkMetadata> = emptyList(),
/** Extracted images with source and dimensions; defaults to an empty list. */
val images: List<ImageMetadataType> = emptyList(),
/** Extracted structured data blocks; defaults to an empty list. */
val structuredData: List<StructuredData> = emptyList(),
)

View File

@@ -0,0 +1,63 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
import java.nio.file.Path
/**
 * Configuration for styled HTML output.
 *
 * When set on `ExtractionConfig.html_output` alongside
 * `output_format = OutputFormat.Html`, the pipeline builds a
 * `StyledHtmlRenderer` instead of the plain comrak-based renderer.
 *
 * @property css Optional inline CSS appended after the theme stylesheet; defaults to `null`.
 * @property cssFile Optional path to a CSS file; defaults to `null`.
 * @property theme Built-in theme; defaults to [HtmlTheme.UNSTYLED].
 * @property classPrefix Prefix for every emitted CSS class name; defaults to `"kb-"`.
 * @property embedCss Whether the resolved CSS is written into a `<style>` block; defaults to `true`.
 */
data class HtmlOutputConfig(
    /**
     * Inline CSS string injected into the output after the theme stylesheet.
     * Concatenated after the [cssFile] content when both are set.
     */
    val css: String? = null,
    /**
     * Path to a CSS file loaded once at renderer construction time.
     * Concatenated before [css] when both are set.
     */
    val cssFile: java.nio.file.Path? = null,
    /** Built-in colour/typography theme. Default: [HtmlTheme.UNSTYLED]. */
    val theme: HtmlTheme = HtmlTheme.UNSTYLED,
    /**
     * CSS class prefix applied to every emitted class name.
     *
     * Default: `"kb-"`. Change this if your host application already uses
     * classes that start with `kb-`.
     */
    val classPrefix: String = "kb-",
    /**
     * When `true` (default), write the resolved CSS into a `<style>` block
     * immediately after the opening `<div class="{prefix}doc">`.
     *
     * Set to `false` to emit only the structural markup and wire up your
     * own stylesheet targeting the prefixed class names (`kb-*` with the
     * default [classPrefix]).
     */
    val embedCss: Boolean = true,
)

View File

@@ -0,0 +1,71 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Built-in HTML theme selection.
 *
 * Each constant carries the lowercase wire name used when the value is
 * serialized; [toWire] and [fromWire] convert between the two forms.
 */
enum class HtmlTheme(private val wire: String) {
    /**
     * Sensible defaults: system font stack, neutral colours, readable line
     * measure. CSS custom properties (`--kb-*`) are all defined so user CSS
     * can override individual values.
     */
    @com.fasterxml.jackson.annotation.JsonProperty("default")
    DEFAULT("default"),

    /** GitHub Markdown-inspired palette and spacing. */
    @com.fasterxml.jackson.annotation.JsonProperty("github")
    GIT_HUB("github"),

    /** Dark background, light text. */
    @com.fasterxml.jackson.annotation.JsonProperty("dark")
    DARK("dark"),

    /** Minimal light theme with generous whitespace. */
    @com.fasterxml.jackson.annotation.JsonProperty("light")
    LIGHT("light"),

    /**
     * No built-in stylesheet emitted. CSS custom properties are still defined
     * on `:root` so user stylesheets can reference `var(--kb-*)` tokens.
     */
    @com.fasterxml.jackson.annotation.JsonProperty("unstyled")
    UNSTYLED("unstyled");

    /** Returns the wire name of this theme (for example `"github"` for [GIT_HUB]). */
    @com.fasterxml.jackson.annotation.JsonValue
    fun toWire(): String = wire

    companion object {
        /**
         * Resolves a wire name back to its theme. Matching is exact.
         *
         * @throws IllegalArgumentException if [value] is not a known wire name.
         */
        @com.fasterxml.jackson.annotation.JsonCreator
        @JvmStatic
        fun fromWire(value: String): HtmlTheme =
            values().firstOrNull { candidate -> candidate.wire == value }
                ?: throw IllegalArgumentException("Unknown HtmlTheme value: $value")
    }
}

View File

@@ -0,0 +1,143 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
import java.nio.file.Path
/**
 * Contract for document extractor plugins.
 *
 * Implement this interface to add support for new document formats or to
 * override built-in extraction behavior with custom logic.
 *
 * # Return Type
 *
 * The original documentation describes extractors as returning an
 * `InternalDocument`, a flat intermediate representation that the pipeline
 * converts into the public `ExtractionResult`. In this interface both
 * extraction methods are declared to return [ExtractionResult] directly.
 * NOTE(review): confirm which description matches the current pipeline.
 *
 * # Priority System
 *
 * When multiple extractors support the same MIME type, the registry selects
 * the extractor with the highest priority value. Use this to:
 *
 * - Override built-in extractors (priority > 50)
 * - Provide fallback extractors (priority < 50)
 * - Implement specialized extractors for specific use cases
 *
 * The documented default priority is 50; see [priority].
 *
 * # Thread Safety
 *
 * Extractors must be thread-safe (`Send + Sync` in Rust terms) to support
 * concurrent extraction.
 */
interface IDocumentExtractor {
    /** Name of this extractor plugin. */
    fun name(): String

    /** Version string of this extractor plugin. */
    fun version(): String

    /**
     * Initialization hook. The default body is empty.
     * NOTE(review): when the host invokes this is not visible here.
     */
    fun initialize() {}

    /**
     * Shutdown hook. The default body is empty.
     * NOTE(review): when the host invokes this is not visible here.
     */
    fun shutdown() {}

    /**
     * Extract content from a byte array.
     *
     * This is the core extraction method that processes in-memory document data.
     *
     * **Returns:**
     *
     * Declared here as an [ExtractionResult]. The original documentation
     * speaks of an `InternalDocument` containing the extracted elements,
     * metadata, and tables, which the pipeline converts into the public result.
     *
     * **Errors:**
     *
     * - `KreuzbergError.Parsing` - Document parsing failed
     * - `KreuzbergError.Validation` - Invalid document structure
     * - `KreuzbergError.Io` - I/O errors (these always bubble up)
     * - `KreuzbergError.MissingDependency` - Required dependency not available
     */
    suspend fun extractBytes(
        content: ByteArray,
        mimeType: String,
        config: ExtractionConfig,
    ): ExtractionResult

    /**
     * Extract content from a file.
     *
     * The original documentation describes a default implementation that
     * reads the file and calls [extractBytes]. This interface declares the
     * method without a body, so implementations must provide it (for custom
     * file handling, streaming, or memory optimizations, or simply by
     * delegating to [extractBytes]).
     *
     * **Returns:**
     *
     * Declared here as an [ExtractionResult]; see [extractBytes].
     *
     * **Errors:**
     *
     * Same as [extractBytes], plus file I/O errors.
     */
    suspend fun extractFile(
        path: java.nio.file.Path,
        mimeType: String,
        config: ExtractionConfig,
    ): ExtractionResult

    /**
     * Get the list of MIME types supported by this extractor.
     *
     * Can include exact MIME types and prefix patterns:
     *
     * - Exact: `"application/pdf"`, `"text/plain"`
     * - Prefix: "image/&#42;" (the prefix `image/` followed by an asterisk
     *   wildcard; matches any image type)
     *
     * The asterisk above is written as an HTML entity on purpose: Kotlin
     * block comments nest, so a slash immediately followed by an asterisk
     * inside this comment would open a nested comment and break compilation.
     *
     * **Returns:**
     *
     * A list of MIME type strings.
     */
    fun supportedMimeTypes(): List<String>

    /**
     * Get the priority of this extractor.
     *
     * Higher priority extractors are preferred when multiple extractors
     * support the same MIME type.
     *
     * # Priority Guidelines
     *
     * - **0-25**: Fallback/low-quality extractors
     * - **26-49**: Alternative extractors
     * - **50**: Default priority (built-in extractors)
     * - **51-75**: Premium/enhanced extractors
     * - **76-100**: Specialized/high-priority extractors
     *
     * **Returns:**
     *
     * Priority value. The documented default is 50, but this interface
     * declares no default body, so implementations must return a value
     * explicitly.
     */
    fun priority(): Int

    /**
     * Check if this extractor can handle a specific file.
     *
     * Allows for more sophisticated detection beyond MIME types. The original
     * documentation describes this as optional and defaulting to `true`
     * (rely on MIME type matching); this interface declares no default body,
     * so implementations must provide it.
     *
     * **Returns:**
     *
     * `true` if the extractor can handle this file, `false` otherwise.
     */
    fun canHandle(path: java.nio.file.Path, mimeType: String): Boolean
}

View File

@@ -0,0 +1,95 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
* Contract for in-process embedding backend plugins.
*
* [embed] is a `suspend` function; the original documentation describes the
* contract as async to match the convention used by `OcrBackend`,
* `DocumentExtractor`, and `PostProcessor`.
* Host-language bridges (PyO3, napi-rs, Rustler, extendr, magnus, ext-php-rs,
* C FFI, etc.) wrap their synchronous host callables in `spawn_blocking` or the
* equivalent to satisfy the async signature.
*
* NOTE(review): the sections below are phrased in terms of the Rust
* implementation (`Arc`, `Mutex`, tokio). How each point maps onto this Kotlin
* interface is not visible from the declaration itself.
*
* # Thread safety
*
* Backends must be `Send + Sync + 'static`. They are stored in
* `Arc<dyn EmbeddingBackend>` and called concurrently from kreuzberg's chunking
* pipeline. If the backend's underlying model isn't thread-safe, the backend
* itself must serialize access internally (e.g. via `Mutex<Inner>`).
*
* # Contract
*
* - `embed(texts)` MUST return exactly `texts.len()` vectors, each of length
* `self.dimensions()`. The dispatcher in `embed_texts`
* validates this before returning to downstream consumers; a non-conforming
* backend surfaces as a `KreuzbergError.Validation`, not a panic.
*
* - `embed` may be called from any thread. Its future must be `Send`
* (enforced by `async_trait` when `#[async_trait]` is used on non-WASM targets).
*
* - `dimensions()` is called exactly once at registration, immediately after
* `initialize()` succeeds. The returned value is cached by the registry and
* used for all subsequent shape validation. Lazy-loading implementations can
* defer model loading into `initialize()` and report the real dimension
* afterwards. Later mutations of the backend's reported dimension are not
* observed by kreuzberg — implementations that need to change dimension
* must unregister and re-register.
*
* - `shutdown()` (inherited from `Plugin`) may be invoked
* concurrently with an in-flight `embed()` call. Implementations must
* tolerate this — e.g. by letting in-flight calls finish using resources
* held via the `Arc<dyn EmbeddingBackend>` reference, and only releasing
* shared state that isn't needed by `embed`.
*
* # Runtime
*
* The synchronous `embed_texts` entry uses
* `tokio.task.block_in_place` to await the async `embed`, which
* requires a multi-thread tokio runtime. Callers running inside a
* `current_thread` runtime (e.g. `#[tokio.test]` without `flavor = "multi_thread"`,
* or `tokio.runtime.Builder.new_current_thread()`) must use
* `embed_texts_async` instead, which awaits directly without
* `block_in_place`.
*/
interface IEmbeddingBackend {
/** Name of this embedding backend plugin. */
fun name(): String
/** Version string of this embedding backend plugin. */
fun version(): String
/**
* Initialization hook; the default body is empty. Per the contract above,
* lazy-loading implementations can defer model loading into this method.
*/
fun initialize() {}
/**
* Shutdown hook; the default body is empty. Per the contract above, it may
* run concurrently with an in-flight [embed] call.
*/
fun shutdown() {}
/**
* Embedding vector dimension. Must be `> 0` and must match the length of
* every vector returned by [embed].
*/
fun dimensions(): Long
/**
* Embed a batch of texts, returning one vector per input in order.
*
* **Errors:**
*
* Implementations should return `Plugin` for
* backend-specific failures. The dispatcher layers its own validation
* (length, per-vector dimension) on top.
*
* NOTE(review): how a `Plugin` failure is expressed from Kotlin (which
* exception type to throw) is not visible here; confirm against the bridge.
*/
suspend fun embed(texts: List<String>): List<List<Float>>
}

View File

@@ -0,0 +1,116 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
import java.nio.file.Path
/**
* Contract for OCR backend plugins.
*
* Implement this interface to add custom OCR capabilities. OCR backends can be:
*
* - Native Rust implementations (like Tesseract)
* - FFI bridges to Python libraries (like EasyOCR, PaddleOCR)
* - Cloud-based OCR services (Google Vision, AWS Textract, etc.)
*
* Several members below are described as optional with a default value. Only
* [initialize] and [shutdown] have default bodies in this interface; every
* other member is abstract and must be implemented.
*
* # Thread Safety
*
* OCR backends must be thread-safe (`Send + Sync` in Rust terms) to support
* concurrent processing.
*/
interface IOcrBackend {
/** Name of this OCR backend plugin. */
fun name(): String
/** Version string of this OCR backend plugin. */
fun version(): String
/**
* Initialization hook; the default body is empty.
* NOTE(review): when the host invokes this is not visible here.
*/
fun initialize() {}
/**
* Shutdown hook; the default body is empty.
* NOTE(review): when the host invokes this is not visible here.
*/
fun shutdown() {}
/**
* Process an image and extract text via OCR.
*
* **Returns:**
*
* An `ExtractionResult` containing the extracted text and metadata.
*
* **Errors:**
*
* - `KreuzbergError.Ocr` - OCR processing failed
* - `KreuzbergError.Validation` - Invalid image format or configuration
* - `KreuzbergError.Io` - I/O errors (these always bubble up)
*
* # Reading `backend_options`
*
* Backends that support runtime tuning can read `config.backend_options` and
* deserialize only the keys they care about. Unknown keys are silently ignored,
* so multiple backends can coexist in a pipeline without key conflicts.
*/
suspend fun processImage(imageBytes: ByteArray, config: OcrConfig): ExtractionResult
/**
* Process a file and extract text via OCR.
*
* The original documentation describes a default implementation that reads
* the file and calls [processImage]. This interface declares the method
* without a body, so implementations must provide it.
*
* **Errors:**
*
* Same as [processImage], plus file I/O errors.
*/
suspend fun processImageFile(path: java.nio.file.Path, config: OcrConfig): ExtractionResult
/**
* Check if this backend supports a given language code.
*
* **Returns:**
*
* `true` if the language is supported, `false` otherwise.
*/
fun supportsLanguage(lang: String): Boolean
/**
* Get the backend type identifier.
*
* **Returns:**
*
* The backend type enum value.
*/
fun backendType(): OcrBackendType
/**
* Get a list of all supported languages.
*
* Documented as optional with an empty list as the default; no default body
* is declared here, so implementations must return a list explicitly.
*/
fun supportedLanguages(): List<String>
/**
* Check if the backend supports table detection.
*
* Documented as optional with `false` as the default; no default body is
* declared here. Return `true` if your backend can detect and extract tables.
*/
fun supportsTableDetection(): Boolean
/**
* Check if the backend supports direct document-level processing (e.g. for PDFs).
*
* Documented as defaulting to `false`; no default body is declared here.
* Return `true` if the backend has optimized document processing.
*/
fun supportsDocumentProcessing(): Boolean
/**
* Process a document file directly via OCR.
*
* Only called if [supportsDocumentProcessing] returns `true`.
*/
suspend fun processDocument(path: java.nio.file.Path, config: OcrConfig): ExtractionResult
}

Some files were not shown because too many files have changed in this diff Show More