Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,5 @@
<?xml version="1.0" encoding="utf-8"?>
<!-- Generated by alef. Do not edit by hand. -->
<manifest xmlns:android="http://schemas.android.com/apk/res/android"
package="dev.kreuzberg">
</manifest>

View File

@@ -0,0 +1,36 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Hardware acceleration settings for ONNX Runtime models.
 *
 * Selects which execution provider (CPU, CoreML, CUDA, TensorRT) is used
 * for inference in layout detection and embedding generation.
 *
 * @property provider Execution provider to use for ONNX inference.
 *   Defaults to [ExecutionProviderType.AUTO].
 * @property deviceId GPU device ID (for CUDA/TensorRT). Ignored for
 *   CPU/CoreML/Auto. Defaults to `0`.
 */
data class AccelerationConfig(
    val provider: ExecutionProviderType = ExecutionProviderType.AUTO,
    val deviceId: Int = 0,
)

View File

@@ -0,0 +1,172 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Types of inline text annotations.
 *
 * JSON handling is delegated to [AnnotationKindDeserializer] and
 * [AnnotationKindSerializer], which read/write the variant name under the
 * `annotation_type` field. Variants that carry data reset the class-level
 * (de)serializer to Jackson's `None` marker so they are processed with the
 * default bean handling when the custom (de)serializer converts them
 * to/from a tree.
 */
@com.fasterxml.jackson.databind.annotation.JsonDeserialize(using = AnnotationKindDeserializer::class)
@com.fasterxml.jackson.databind.annotation.JsonSerialize(using = AnnotationKindSerializer::class)
sealed class AnnotationKind {
    // --- Variants without payload -------------------------------------------------
    object Bold : AnnotationKind()

    object Italic : AnnotationKind()

    object Underline : AnnotationKind()

    object Strikethrough : AnnotationKind()

    object Code : AnnotationKind()

    object Subscript : AnnotationKind()

    object Superscript : AnnotationKind()

    /**
     * Link annotation.
     *
     * @property url Link target.
     * @property title Optional title; may be `null`.
     */
    @com.fasterxml.jackson.databind.annotation.JsonDeserialize(using = com.fasterxml.jackson.databind.JsonDeserializer.None::class)
    @com.fasterxml.jackson.databind.annotation.JsonSerialize(using = com.fasterxml.jackson.databind.JsonSerializer.None::class)
    data class Link(
        val url: String,
        val title: String?,
    ) : AnnotationKind()

    /** Highlighted text (PDF highlights, HTML `<mark>`). */
    object Highlight : AnnotationKind()

    /**
     * Text color.
     *
     * @property value CSS-compatible value, e.g. "#ff0000", "red".
     */
    @com.fasterxml.jackson.databind.annotation.JsonDeserialize(using = com.fasterxml.jackson.databind.JsonDeserializer.None::class)
    @com.fasterxml.jackson.databind.annotation.JsonSerialize(using = com.fasterxml.jackson.databind.JsonSerializer.None::class)
    data class Color(
        val value: String,
    ) : AnnotationKind()

    /**
     * Font size.
     *
     * @property value Size with units, e.g. "12pt", "1.2em", "16px".
     */
    @com.fasterxml.jackson.databind.annotation.JsonDeserialize(using = com.fasterxml.jackson.databind.JsonDeserializer.None::class)
    @com.fasterxml.jackson.databind.annotation.JsonSerialize(using = com.fasterxml.jackson.databind.JsonSerializer.None::class)
    data class FontSize(
        val value: String,
    ) : AnnotationKind()

    /**
     * Extensible annotation for format-specific styling.
     *
     * @property name Annotation name.
     * @property value Optional annotation value; may be `null`.
     */
    @com.fasterxml.jackson.databind.annotation.JsonDeserialize(using = com.fasterxml.jackson.databind.JsonDeserializer.None::class)
    @com.fasterxml.jackson.databind.annotation.JsonSerialize(using = com.fasterxml.jackson.databind.JsonSerializer.None::class)
    data class Custom(
        val name: String,
        val value: String?,
    ) : AnnotationKind()
}
/**
 * Jackson deserializer for [AnnotationKind].
 *
 * Expects a JSON object whose `annotation_type` field names the variant.
 * Payload-less variants map to their singleton objects; data-carrying
 * variants are built from the remaining fields of the same object.
 */
private class AnnotationKindDeserializer : com.fasterxml.jackson.databind.deser.std.StdDeserializer<AnnotationKind>(AnnotationKind::class.java) {
    /**
     * Reads one [AnnotationKind] from [parser].
     *
     * @param parser Parser positioned at the value to read.
     * @param ctx Context used to bind the payload of data-carrying variants.
     * @return The variant selected by the `annotation_type` field.
     * @throws com.fasterxml.jackson.databind.exc.MismatchedInputException if the
     *   input is not a JSON object.
     * @throws com.fasterxml.jackson.databind.exc.InvalidFormatException if the
     *   tag is missing or not a known variant name.
     */
    override fun deserialize(
        parser: com.fasterxml.jackson.core.JsonParser,
        ctx: com.fasterxml.jackson.databind.DeserializationContext,
    ): AnnotationKind {
        // Read as a generic node first: asking readTree for ObjectNode directly makes
        // non-object input (string, array, number) fail with a raw ClassCastException
        // instead of a Jackson mapping exception.
        val tree = parser.codec.readTree<com.fasterxml.jackson.databind.JsonNode>(parser)
        val node = tree as? com.fasterxml.jackson.databind.node.ObjectNode
            ?: throw com.fasterxml.jackson.databind.exc.MismatchedInputException.from(
                parser, AnnotationKind::class.java, "Expected a JSON object for AnnotationKind",
            )
        val tag = node.get("annotation_type")?.asText()

        // The variant fields without the discriminator. Built on demand so that
        // payload-less variants do not pay for a deep copy.
        fun payload(): com.fasterxml.jackson.databind.node.ObjectNode =
            node.deepCopy().apply { remove("annotation_type") }

        return when (tag) {
            "bold" -> AnnotationKind.Bold
            "italic" -> AnnotationKind.Italic
            "underline" -> AnnotationKind.Underline
            "strikethrough" -> AnnotationKind.Strikethrough
            "code" -> AnnotationKind.Code
            "subscript" -> AnnotationKind.Subscript
            "superscript" -> AnnotationKind.Superscript
            "link" -> ctx.readTreeAsValue(payload(), AnnotationKind.Link::class.java)
            "highlight" -> AnnotationKind.Highlight
            "color" -> ctx.readTreeAsValue(payload(), AnnotationKind.Color::class.java)
            "font_size" -> ctx.readTreeAsValue(payload(), AnnotationKind.FontSize::class.java)
            "custom" -> ctx.readTreeAsValue(payload(), AnnotationKind.Custom::class.java)
            else -> throw com.fasterxml.jackson.databind.exc.InvalidFormatException(
                parser, "Unknown AnnotationKind tag", tag, AnnotationKind::class.java,
            )
        }
    }
}
/**
 * Jackson serializer for [AnnotationKind].
 *
 * Every variant is written as a JSON object carrying its wire name under
 * `annotation_type`; data-carrying variants additionally contain their own
 * fields.
 */
private class AnnotationKindSerializer : com.fasterxml.jackson.databind.ser.std.StdSerializer<AnnotationKind>(AnnotationKind::class.java) {
    /**
     * Writes [value] to [gen] as a tagged JSON object.
     *
     * @param value Variant to serialize.
     * @param gen Generator to write to; its codec is reused when it is an `ObjectMapper`.
     * @param provider Unused.
     */
    override fun serialize(
        value: AnnotationKind,
        gen: com.fasterxml.jackson.core.JsonGenerator,
        provider: com.fasterxml.jackson.databind.SerializerProvider,
    ) {
        // Reuse the generator's mapper when there is one, otherwise fall back to a fresh mapper.
        val mapper = (gen.codec as? com.fasterxml.jackson.databind.ObjectMapper)
            ?: com.fasterxml.jackson.databind.ObjectMapper().findAndRegisterModules()

        // Payload-less variant: an object holding only the tag.
        fun tagOnly(wireName: String): com.fasterxml.jackson.databind.node.ObjectNode =
            mapper.createObjectNode().put("annotation_type", wireName)

        // Data-carrying variant: its own fields first, then the tag.
        fun tagged(wireName: String): com.fasterxml.jackson.databind.node.ObjectNode =
            mapper.valueToTree<com.fasterxml.jackson.databind.node.ObjectNode>(value).put("annotation_type", wireName)

        val tree: com.fasterxml.jackson.databind.node.ObjectNode = when (value) {
            is AnnotationKind.Bold -> tagOnly("bold")
            is AnnotationKind.Italic -> tagOnly("italic")
            is AnnotationKind.Underline -> tagOnly("underline")
            is AnnotationKind.Strikethrough -> tagOnly("strikethrough")
            is AnnotationKind.Code -> tagOnly("code")
            is AnnotationKind.Subscript -> tagOnly("subscript")
            is AnnotationKind.Superscript -> tagOnly("superscript")
            is AnnotationKind.Link -> tagged("link")
            is AnnotationKind.Highlight -> tagOnly("highlight")
            is AnnotationKind.Color -> tagged("color")
            is AnnotationKind.FontSize -> tagged("font_size")
            is AnnotationKind.Custom -> tagged("custom")
        }
        mapper.writeTree(gen, tree)
    }
}

View File

@@ -0,0 +1,38 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * A single file extracted from an archive.
 *
 * When archives (ZIP, TAR, 7Z, GZIP) are extracted with recursive extraction
 * enabled, each processable file produces its own full `ExtractionResult`.
 *
 * @property path Archive-relative file path (e.g. "folder/document.pdf").
 * @property mimeType Detected MIME type of the file.
 * @property result Full extraction result for this file.
 */
data class ArchiveEntry(
    val path: String,
    val mimeType: String,
    val result: ExtractionResult,
)

View File

@@ -0,0 +1,41 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Archive (ZIP/TAR/7Z) metadata.
 *
 * Extracted from compressed archive files containing file lists and size information.
 *
 * @property format Archive format ("ZIP", "TAR", "7Z", etc.). Defaults to an empty string.
 * @property fileCount Total number of files in the archive. Defaults to `0`.
 * @property fileList List of file paths within the archive. Defaults to an empty list.
 * @property totalSize Total uncompressed size in bytes. Defaults to `0`.
 * @property compressedSize Compressed size in bytes (if available), otherwise `null`.
 */
data class ArchiveMetadata(
    val format: String = "",
    val fileCount: Int = 0,
    val fileList: List<String> = emptyList(),
    val totalSize: Long = 0L,
    val compressedSize: Long? = null,
)

View File

@@ -0,0 +1,26 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Bounding box in original image coordinates.
 *
 * @property x1 X of the top-left corner.
 * @property y1 Y of the top-left corner.
 * @property x2 X of the bottom-right corner.
 * @property y2 Y of the bottom-right corner.
 */
data class BBox(
    val x1: Float,
    val y1: Float,
    val x2: Float,
    val y2: Float,
)

View File

@@ -0,0 +1,38 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Batch item for byte array extraction.
 *
 * Used with `batch_extract_bytes` and `batch_extract_bytes_sync`
 * to represent a single item in a batch extraction job.
 *
 * Equality and hashing compare [content] by its bytes rather than by array
 * identity, so two items built from equal byte sequences are equal.
 *
 * @property content The content bytes to extract from.
 * @property mimeType MIME type of the content (e.g., "application/pdf", "text/html").
 * @property config Per-item configuration overrides (`null` uses batch-level defaults).
 */
data class BatchBytesItem(
    val content: ByteArray,
    val mimeType: String,
    val config: FileExtractionConfig? = null,
) {
    // The generated data-class equals() would compare the ByteArray by reference;
    // compare by content instead.
    override fun equals(other: Any?): Boolean {
        if (this === other) return true
        if (other !is BatchBytesItem) return false
        return content.contentEquals(other.content) &&
            mimeType == other.mimeType &&
            config == other.config
    }

    // Kept consistent with equals(): hash the array contents, not its identity.
    override fun hashCode(): Int {
        var result = content.contentHashCode()
        result = 31 * result + mimeType.hashCode()
        result = 31 * result + (config?.hashCode() ?: 0)
        return result
    }
}

View File

@@ -0,0 +1,38 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
import java.nio.file.Path
/**
 * Batch item for file extraction.
 *
 * Used with `batch_extract_files` and `batch_extract_files_sync`
 * to represent a single file in a batch extraction job.
 *
 * @property path Path to the file to extract from.
 * @property config Per-file configuration overrides (`null` uses batch-level defaults).
 */
data class BatchFileItem(
    val path: Path,
    val config: FileExtractionConfig? = null,
)

View File

@@ -0,0 +1,33 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * BibTeX bibliography metadata.
 *
 * @property entryCount Number of entries in the bibliography. Defaults to `0`.
 * @property citationKeys Defaults to an empty list.
 *   NOTE(review): presumably the citation keys of the entries; not documented in the generator output.
 * @property authors Defaults to an empty list.
 * @property yearRange Defaults to `null`.
 * @property entryTypes Defaults to `null`.
 *   NOTE(review): looks like a count per entry type; confirm against the producer.
 */
data class BibtexMetadata(
    val entryCount: Long = 0L,
    val citationKeys: List<String> = emptyList(),
    val authors: List<String> = emptyList(),
    val yearRange: YearRange? = null,
    val entryTypes: Map<String, Long>? = null,
)

View File

@@ -0,0 +1,103 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Types of block-level elements in Djot.
 *
 * Each constant has a snake_case wire name, exposed through [toWire] and
 * resolved back through [fromWire].
 */
enum class BlockType {
    @com.fasterxml.jackson.annotation.JsonProperty("paragraph")
    PARAGRAPH,

    @com.fasterxml.jackson.annotation.JsonProperty("heading")
    HEADING,

    @com.fasterxml.jackson.annotation.JsonProperty("blockquote")
    BLOCKQUOTE,

    @com.fasterxml.jackson.annotation.JsonProperty("code_block")
    CODE_BLOCK,

    @com.fasterxml.jackson.annotation.JsonProperty("list_item")
    LIST_ITEM,

    @com.fasterxml.jackson.annotation.JsonProperty("ordered_list")
    ORDERED_LIST,

    @com.fasterxml.jackson.annotation.JsonProperty("bullet_list")
    BULLET_LIST,

    @com.fasterxml.jackson.annotation.JsonProperty("task_list")
    TASK_LIST,

    @com.fasterxml.jackson.annotation.JsonProperty("definition_list")
    DEFINITION_LIST,

    @com.fasterxml.jackson.annotation.JsonProperty("definition_term")
    DEFINITION_TERM,

    @com.fasterxml.jackson.annotation.JsonProperty("definition_description")
    DEFINITION_DESCRIPTION,

    @com.fasterxml.jackson.annotation.JsonProperty("div")
    DIV,

    @com.fasterxml.jackson.annotation.JsonProperty("section")
    SECTION,

    @com.fasterxml.jackson.annotation.JsonProperty("thematic_break")
    THEMATIC_BREAK,

    @com.fasterxml.jackson.annotation.JsonProperty("raw_block")
    RAW_BLOCK,

    @com.fasterxml.jackson.annotation.JsonProperty("math_display")
    MATH_DISPLAY;

    /** Returns the wire name of this constant. */
    @com.fasterxml.jackson.annotation.JsonValue
    fun toWire(): String = when (this) {
        PARAGRAPH -> "paragraph"
        HEADING -> "heading"
        BLOCKQUOTE -> "blockquote"
        CODE_BLOCK -> "code_block"
        LIST_ITEM -> "list_item"
        ORDERED_LIST -> "ordered_list"
        BULLET_LIST -> "bullet_list"
        TASK_LIST -> "task_list"
        DEFINITION_LIST -> "definition_list"
        DEFINITION_TERM -> "definition_term"
        DEFINITION_DESCRIPTION -> "definition_description"
        DIV -> "div"
        SECTION -> "section"
        THEMATIC_BREAK -> "thematic_break"
        RAW_BLOCK -> "raw_block"
        MATH_DISPLAY -> "math_display"
    }

    companion object {
        /**
         * Resolves a wire name to its constant.
         *
         * @param value Wire name as produced by [toWire].
         * @return The constant whose wire name equals [value].
         * @throws IllegalArgumentException if no constant has that wire name.
         */
        @com.fasterxml.jackson.annotation.JsonCreator
        @JvmStatic
        fun fromWire(value: String): BlockType {
            // Reverse lookup through toWire(), which assigns a distinct name to each constant.
            for (candidate in values()) {
                if (candidate.toWire() == value) return candidate
            }
            throw IllegalArgumentException("Unknown BlockType value: $value")
        }
    }
}

View File

@@ -0,0 +1,35 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Bounding box coordinates for element positioning.
 *
 * All coordinates default to `0.0`.
 *
 * @property x0 Left x-coordinate.
 * @property y0 Bottom y-coordinate.
 * @property x1 Right x-coordinate.
 * @property y1 Top y-coordinate.
 */
data class BoundingBox(
    val x0: Double = 0.0,
    val y0: Double = 0.0,
    val x1: Double = 0.0,
    val y1: Double = 0.0,
)

View File

@@ -0,0 +1,31 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Cache statistics.
 *
 * NOTE(review): the generator emitted no field documentation; the units below
 * are read off the property names (`Mb`, `Days`) and should be confirmed
 * against the producer.
 *
 * @property totalFiles File count.
 * @property totalSizeMb Total size, presumably in megabytes.
 * @property availableSpaceMb Available space, presumably in megabytes.
 * @property oldestFileAgeDays Age of the oldest file, presumably in days.
 * @property newestFileAgeDays Age of the newest file, presumably in days.
 */
data class CacheStats(
    val totalFiles: Long,
    val totalSizeMb: Double,
    val availableSpaceMb: Double,
    val oldestFileAgeDays: Double,
    val newestFileAgeDays: Double,
)

View File

@@ -0,0 +1,41 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * A single changed cell within a table.
 *
 * Defined here (rather than only in `crate.diff`) so `RevisionDelta` can
 * reference it unconditionally, without requiring the `diff` Cargo feature.
 * `crate.diff` re-exports this type verbatim.
 *
 * @property row Zero-based row index.
 * @property col Zero-based column index.
 * @property from Value before the change.
 * @property to Value after the change.
 */
data class CellChange(
    val row: Long,
    val col: Long,
    val from: String,
    val to: String,
)

View File

@@ -0,0 +1,51 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * A text chunk with optional embedding and metadata.
 *
 * Chunks are created when chunking is enabled in `ExtractionConfig`. Each chunk
 * contains the text content, optional embedding vector (if embedding generation
 * is configured), and metadata about its position in the document.
 *
 * @property content The text content of this chunk.
 * @property chunkType Semantic structural classification of this chunk.
 *   Assigned by the heuristic classifier based on content patterns and heading
 *   context. Defaults to `ChunkType.Unknown` when no rule matches.
 * @property embedding Optional embedding vector for this chunk. Only populated
 *   when `EmbeddingConfig` is provided in chunking configuration. The
 *   dimensionality depends on the chosen embedding model.
 * @property metadata Metadata about this chunk's position and properties.
 */
data class Chunk(
    val content: String,
    val chunkType: ChunkType,
    val embedding: List<Float>? = null,
    val metadata: ChunkMetadata,
)

View File

@@ -0,0 +1,68 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Metadata about a chunk's position in the original document.
 *
 * @property byteStart Byte offset where this chunk starts in the original text
 *   (UTF-8 valid boundary).
 * @property byteEnd Byte offset where this chunk ends in the original text
 *   (UTF-8 valid boundary).
 * @property tokenCount Number of tokens in this chunk (if available). This is
 *   calculated by the embedding model's tokenizer if embeddings are enabled.
 * @property chunkIndex Zero-based index of this chunk in the document.
 * @property totalChunks Total number of chunks in the document.
 * @property firstPage First page number this chunk spans (1-indexed). Only
 *   populated when page tracking is enabled in extraction configuration.
 * @property lastPage Last page number this chunk spans (1-indexed, equal to
 *   first_page for single-page chunks). Only populated when page tracking is
 *   enabled in extraction configuration.
 * @property headingContext Heading context when using Markdown chunker.
 *   Contains the heading hierarchy this chunk falls under. Only populated when
 *   `ChunkerType.Markdown` is used.
 * @property imageIndices Indices into `ExtractionResult.images` for images on
 *   pages covered by this chunk. Contains zero-based indices into the top-level
 *   `images` collection for every image whose `page_number` falls within
 *   `[first_page, last_page]`. Empty when image extraction is disabled or the
 *   chunk spans no pages with images.
 */
data class ChunkMetadata(
    val byteStart: Long,
    val byteEnd: Long,
    val tokenCount: Long? = null,
    val chunkIndex: Long,
    val totalChunks: Long,
    val firstPage: Int? = null,
    val lastPage: Int? = null,
    val headingContext: HeadingContext? = null,
    val imageIndices: List<Int> = emptyList(),
)

View File

@@ -0,0 +1,93 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * How chunk size is measured.
 *
 * Defaults to `Characters` (Unicode character count). When using token-based sizing,
 * chunks are sized by token count according to the specified tokenizer.
 *
 * Token-based sizing uses HuggingFace tokenizers loaded at runtime. Any tokenizer
 * available on HuggingFace Hub can be used, including OpenAI-compatible tokenizers
 * (e.g., `Xenova/gpt-4o`, `Xenova/cl100k_base`).
 *
 * JSON handling is delegated to [ChunkSizingDeserializer] and
 * [ChunkSizingSerializer], which read/write the variant name under the `type` field.
 */
@com.fasterxml.jackson.databind.annotation.JsonDeserialize(using = ChunkSizingDeserializer::class)
@com.fasterxml.jackson.databind.annotation.JsonSerialize(using = ChunkSizingSerializer::class)
sealed class ChunkSizing {
    /** Size measured in Unicode characters (default). */
    object Characters : ChunkSizing()

    /**
     * Size measured in tokens from a HuggingFace tokenizer.
     *
     * @property model Tokenizer model identifier.
     * @property cacheDir Optional cache directory; may be `null`.
     */
    @com.fasterxml.jackson.databind.annotation.JsonDeserialize(using = com.fasterxml.jackson.databind.JsonDeserializer.None::class)
    @com.fasterxml.jackson.databind.annotation.JsonSerialize(using = com.fasterxml.jackson.databind.JsonSerializer.None::class)
    data class Tokenizer(
        val model: String,
        val cacheDir: java.nio.file.Path?,
    ) : ChunkSizing()
}
/**
 * Jackson deserializer for [ChunkSizing].
 *
 * Expects a JSON object whose `type` field names the variant. `characters`
 * maps to the singleton; `tokenizer` is built from the remaining fields of
 * the same object.
 */
private class ChunkSizingDeserializer : com.fasterxml.jackson.databind.deser.std.StdDeserializer<ChunkSizing>(ChunkSizing::class.java) {
    /**
     * Reads one [ChunkSizing] from [parser].
     *
     * @param parser Parser positioned at the value to read.
     * @param ctx Context used to bind the payload of the `tokenizer` variant.
     * @return The variant selected by the `type` field.
     * @throws com.fasterxml.jackson.databind.exc.MismatchedInputException if the
     *   input is not a JSON object.
     * @throws com.fasterxml.jackson.databind.exc.InvalidFormatException if the
     *   tag is missing or not a known variant name.
     */
    override fun deserialize(
        parser: com.fasterxml.jackson.core.JsonParser,
        ctx: com.fasterxml.jackson.databind.DeserializationContext,
    ): ChunkSizing {
        // Read as a generic node first: asking readTree for ObjectNode directly makes
        // non-object input (string, array, number) fail with a raw ClassCastException
        // instead of a Jackson mapping exception.
        val tree = parser.codec.readTree<com.fasterxml.jackson.databind.JsonNode>(parser)
        val node = tree as? com.fasterxml.jackson.databind.node.ObjectNode
            ?: throw com.fasterxml.jackson.databind.exc.MismatchedInputException.from(
                parser, ChunkSizing::class.java, "Expected a JSON object for ChunkSizing",
            )
        val tag = node.get("type")?.asText()

        // The variant fields without the discriminator. Built on demand so that
        // the payload-less variant does not pay for a deep copy.
        fun payload(): com.fasterxml.jackson.databind.node.ObjectNode =
            node.deepCopy().apply { remove("type") }

        return when (tag) {
            "characters" -> ChunkSizing.Characters
            "tokenizer" -> ctx.readTreeAsValue(payload(), ChunkSizing.Tokenizer::class.java)
            else -> throw com.fasterxml.jackson.databind.exc.InvalidFormatException(
                parser, "Unknown ChunkSizing tag", tag, ChunkSizing::class.java,
            )
        }
    }
}
/**
 * Jackson serializer for [ChunkSizing].
 *
 * Every variant is written as a JSON object carrying its wire name under
 * `type`; the `tokenizer` variant additionally contains its own fields.
 */
private class ChunkSizingSerializer : com.fasterxml.jackson.databind.ser.std.StdSerializer<ChunkSizing>(ChunkSizing::class.java) {
    /**
     * Writes [value] to [gen] as a tagged JSON object.
     *
     * @param value Variant to serialize.
     * @param gen Generator to write to; its codec is reused when it is an `ObjectMapper`.
     * @param provider Unused.
     */
    override fun serialize(
        value: ChunkSizing,
        gen: com.fasterxml.jackson.core.JsonGenerator,
        provider: com.fasterxml.jackson.databind.SerializerProvider,
    ) {
        // Reuse the generator's mapper when there is one, otherwise fall back to a fresh mapper.
        val mapper = (gen.codec as? com.fasterxml.jackson.databind.ObjectMapper)
            ?: com.fasterxml.jackson.databind.ObjectMapper().findAndRegisterModules()

        val tree: com.fasterxml.jackson.databind.node.ObjectNode = when (value) {
            // Payload-less variant: an object holding only the tag.
            is ChunkSizing.Characters ->
                mapper.createObjectNode().put("type", "characters")
            // Data-carrying variant: its own fields first, then the tag.
            is ChunkSizing.Tokenizer ->
                mapper.valueToTree<com.fasterxml.jackson.databind.node.ObjectNode>(value).put("type", "tokenizer")
        }
        mapper.writeTree(gen, tree)
    }
}

View File

@@ -0,0 +1,110 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Semantic structural classification of a text chunk.
 *
 * Assigned by the heuristic classifier in `chunking.classifier`.
 * Defaults to `Unknown` when no rule matches.
 * Designed to be extended in future versions without breaking changes.
 *
 * Each constant has a snake_case wire name, exposed through [toWire] and
 * resolved back through [fromWire].
 */
enum class ChunkType {
    /** Section heading or document title. */
    @com.fasterxml.jackson.annotation.JsonProperty("heading")
    HEADING,

    /** Party list: names, addresses, and signatories. */
    @com.fasterxml.jackson.annotation.JsonProperty("party_list")
    PARTY_LIST,

    /** Definition clause ("X means…", "X shall mean…"). */
    @com.fasterxml.jackson.annotation.JsonProperty("definitions")
    DEFINITIONS,

    /** Operative clause containing legal/contractual action verbs. */
    @com.fasterxml.jackson.annotation.JsonProperty("operative_clause")
    OPERATIVE_CLAUSE,

    /** Signature block with signatures, names, and dates. */
    @com.fasterxml.jackson.annotation.JsonProperty("signature_block")
    SIGNATURE_BLOCK,

    /** Schedule, annex, appendix, or exhibit section. */
    @com.fasterxml.jackson.annotation.JsonProperty("schedule")
    SCHEDULE,

    /** Table-like content with aligned columns or repeated patterns. */
    @com.fasterxml.jackson.annotation.JsonProperty("table_like")
    TABLE_LIKE,

    /** Mathematical formula or equation. */
    @com.fasterxml.jackson.annotation.JsonProperty("formula")
    FORMULA,

    /** Code block or preformatted content. */
    @com.fasterxml.jackson.annotation.JsonProperty("code_block")
    CODE_BLOCK,

    /** Embedded or referenced image content. */
    @com.fasterxml.jackson.annotation.JsonProperty("image")
    IMAGE,

    /** Organizational chart or hierarchy diagram. */
    @com.fasterxml.jackson.annotation.JsonProperty("org_chart")
    ORG_CHART,

    /** Diagram, figure, or visual illustration. */
    @com.fasterxml.jackson.annotation.JsonProperty("diagram")
    DIAGRAM,

    /** Unclassified or mixed content. */
    @com.fasterxml.jackson.annotation.JsonProperty("unknown")
    UNKNOWN;

    /** Returns the wire name of this constant. */
    @com.fasterxml.jackson.annotation.JsonValue
    fun toWire(): String = when (this) {
        HEADING -> "heading"
        PARTY_LIST -> "party_list"
        DEFINITIONS -> "definitions"
        OPERATIVE_CLAUSE -> "operative_clause"
        SIGNATURE_BLOCK -> "signature_block"
        SCHEDULE -> "schedule"
        TABLE_LIKE -> "table_like"
        FORMULA -> "formula"
        CODE_BLOCK -> "code_block"
        IMAGE -> "image"
        ORG_CHART -> "org_chart"
        DIAGRAM -> "diagram"
        UNKNOWN -> "unknown"
    }

    companion object {
        /**
         * Resolves a wire name to its constant.
         *
         * @param value Wire name as produced by [toWire].
         * @return The constant whose wire name equals [value].
         * @throws IllegalArgumentException if no constant has that wire name.
         */
        @com.fasterxml.jackson.annotation.JsonCreator
        @JvmStatic
        fun fromWire(value: String): ChunkType {
            // Reverse lookup through toWire(), which assigns a distinct name to each constant.
            for (candidate in values()) {
                if (candidate.toWire() == value) return candidate
            }
            throw IllegalArgumentException("Unknown ChunkType value: $value")
        }
    }
}

View File

@@ -0,0 +1,70 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Selects which text chunker is used.
 *
 * Every constant has a lowercase wire string, produced by [toWire] and parsed
 * back by [fromWire].
 */
enum class ChunkerType {
    /** Generic text splitter; splits on whitespace and punctuation. */
    @com.fasterxml.jackson.annotation.JsonProperty("text")
    TEXT,

    /** Markdown-aware splitter; preserves formatting and structure. */
    @com.fasterxml.jackson.annotation.JsonProperty("markdown")
    MARKDOWN,

    /** YAML-aware splitter; creates one chunk per top-level key. */
    @com.fasterxml.jackson.annotation.JsonProperty("yaml")
    YAML,

    /**
     * Topic-aware chunker.
     *
     * With an `EmbeddingConfig`, splits at embedding-based topic shifts tuned by
     * `topic_threshold` (default 0.75, lower = more splits). Without an
     * embedding, falls back to a structural-boundary heuristic (ALL-CAPS
     * headers, numbered sections, blank-line paragraphs) and merges groups into
     * chunks capped at `max_characters` (default 1000). `topic_threshold` has no
     * effect in the fallback path. For best results, pair with an embedding
     * model.
     */
    @com.fasterxml.jackson.annotation.JsonProperty("semantic")
    SEMANTIC;

    /** Returns the lowercase string used for this constant on the wire. */
    @com.fasterxml.jackson.annotation.JsonValue
    fun toWire(): String = when (this) {
        TEXT -> "text"
        MARKDOWN -> "markdown"
        YAML -> "yaml"
        SEMANTIC -> "semantic"
    }

    companion object {
        /** Reverse index from wire string to constant, derived from [toWire]. */
        private val BY_WIRE: Map<String, ChunkerType> = ChunkerType.values().associateBy { it.toWire() }

        /**
         * Resolves a wire string to its constant.
         *
         * @param value the wire string to look up (matched exactly, case-sensitive).
         * @throws IllegalArgumentException if [value] is not a known wire string.
         */
        @com.fasterxml.jackson.annotation.JsonCreator
        @JvmStatic
        fun fromWire(value: String): ChunkerType =
            BY_WIRE[value] ?: throw IllegalArgumentException("Unknown ChunkerType value: $value")
    }
}

View File

@@ -0,0 +1,95 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Settings for splitting document content into chunks.
 *
 * Covers chunk size, overlap, trimming, the chunker implementation, and
 * optional embeddings.
 *
 * Prefer named arguments and rely on the declared defaults where possible, so
 * call sites keep compiling when fields are added.
 *
 * NOTE(review): [sizing] is declared without a default value, so it must be
 * passed on every construction even though its documentation names
 * `Characters` as the default — confirm whether a default was intended.
 */
data class ChunkingConfig(
    /**
     * Upper bound on the size of a single chunk, in the unit chosen by `sizing`.
     *
     * With `Characters` sizing (the default) this is a character count; with
     * token-based sizing it is a token count.
     *
     * Default: 1000
     */
    @com.fasterxml.jackson.annotation.JsonProperty("max_chars")
    val maxCharacters: Long = 1000L,

    /**
     * Overlap between consecutive chunks, in the unit chosen by `sizing`.
     *
     * Default: 200
     */
    @com.fasterxml.jackson.annotation.JsonProperty("max_overlap")
    val overlap: Long = 200L,

    /**
     * Whether whitespace is trimmed from chunk boundaries.
     *
     * Default: true
     */
    val trim: Boolean = true,

    /**
     * Which chunker to use; see [ChunkerType] for the available values.
     *
     * Default: Text
     */
    val chunkerType: ChunkerType = ChunkerType.TEXT,

    /** Optional embedding configuration used to embed chunks. */
    val embedding: EmbeddingConfig? = null,

    /** Name of a preset configuration; when provided it overrides individual settings. */
    val preset: String? = null,

    /**
     * Unit in which chunk size is measured.
     *
     * Documented default: `Characters` (Unicode character count).
     * Enable the `chunking-tiktoken` or `chunking-tokenizers` features for
     * token-based sizing.
     */
    @field:com.fasterxml.jackson.databind.annotation.JsonSerialize(`as` = ChunkSizing::class)
    val sizing: ChunkSizing,

    /**
     * When `true` and `chunker_type` is `Markdown`, the heading hierarchy path
     * (e.g. `"# Title > ## Section\n\n"`) is prepended to each chunk's content.
     *
     * Useful for RAG pipelines in which each chunk must carry self-contained
     * context about its position in the document structure.
     *
     * Default: `false`
     */
    val prependHeadingContext: Boolean = false,

    /**
     * Optional cosine-similarity threshold for semantic topic boundary detection.
     *
     * Only used when `chunker_type` is `Semantic` and an `EmbeddingConfig` is
     * provided. It rarely needs to be set: when omitted, `0.75` is used, which
     * works well for most documents. Lower values detect more topic boundaries
     * (more, smaller chunks); higher values detect fewer.
     * Range: `0.0..=1.0`.
     */
    val topicThreshold: Float? = null,
)

View File

@@ -0,0 +1,33 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Metadata for citation files (RIS, PubMed, EndNote).
 *
 * Every property has a default, so the class can be constructed with no
 * arguments: counts default to zero, lists to empty, and optional values to
 * `null`.
 */
data class CitationMetadata(
    /** Citation count; defaults to `0`. */
    val citationCount: Long = 0L,

    /** Format name, or `null` when not set. */
    val format: String? = null,

    /** Author names; empty by default. */
    val authors: List<String> = emptyList(),

    /** Year range, or `null` when not set. */
    val yearRange: YearRange? = null,

    /** DOI strings; empty by default. */
    val dois: List<String> = emptyList(),

    /** Keywords; empty by default. */
    val keywords: List<String> = emptyList(),
)

View File

@@ -0,0 +1,59 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * How extracted code is rendered into the `content` field of
 * `ExtractionResult`.
 *
 * Every constant has a lowercase wire string, produced by [toWire] and parsed
 * back by [fromWire].
 */
enum class CodeContentMode {
    /** Content is made of TSLP semantic chunks (default). */
    @com.fasterxml.jackson.annotation.JsonProperty("chunks")
    CHUNKS,

    /** Content is the raw source code. */
    @com.fasterxml.jackson.annotation.JsonProperty("raw")
    RAW,

    /** Content is function/class headings plus docstrings, without code bodies. */
    @com.fasterxml.jackson.annotation.JsonProperty("structure")
    STRUCTURE;

    /** Returns the lowercase string used for this constant on the wire. */
    @com.fasterxml.jackson.annotation.JsonValue
    fun toWire(): String = when (this) {
        CHUNKS -> "chunks"
        RAW -> "raw"
        STRUCTURE -> "structure"
    }

    companion object {
        /** Reverse index from wire string to constant, derived from [toWire]. */
        private val BY_WIRE: Map<String, CodeContentMode> = CodeContentMode.values().associateBy { it.toWire() }

        /**
         * Resolves a wire string to its constant.
         *
         * @param value the wire string to look up (matched exactly, case-sensitive).
         * @throws IllegalArgumentException if [value] is not a known wire string.
         */
        @com.fasterxml.jackson.annotation.JsonCreator
        @JvmStatic
        fun fromWire(value: String): CodeContentMode =
            BY_WIRE[value] ?: throw IllegalArgumentException("Unknown CodeContentMode value: $value")
    }
}

View File

@@ -0,0 +1,89 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Content filtering settings shared by all extractors.
 *
 * Decides whether "furniture" content (headers, footers, page numbers,
 * watermarks, repeating text) is kept in or stripped from extraction results.
 * It applies across extractors (PDF, DOCX, RTF, ODT, HTML, etc.), each with a
 * format-specific implementation.
 *
 * When this is `null` on `ExtractionConfig`, each extractor keeps its current
 * default behavior unchanged.
 */
data class ContentFilterConfig(
    /**
     * Keep running headers in the extraction output.
     *
     * - PDF: disables top-margin furniture stripping and stops the layout
     *   model from treating `PageHeader`-classified regions as furniture.
     * - DOCX: includes document headers in the text output.
     * - RTF/ODT: headers are already included; `true` is a no-op.
     * - HTML/EPUB: keeps `<header>` element content.
     *
     * Default: `false` (headers are stripped or excluded).
     */
    val includeHeaders: Boolean = false,

    /**
     * Keep running footers in the extraction output.
     *
     * - PDF: disables bottom-margin furniture stripping and stops the layout
     *   model from treating `PageFooter`-classified regions as furniture.
     * - DOCX: includes document footers in the text output.
     * - RTF/ODT: footers are already included; `true` is a no-op.
     * - HTML/EPUB: keeps `<footer>` element content.
     *
     * Default: `false` (footers are stripped or excluded).
     */
    val includeFooters: Boolean = false,

    /**
     * Turn on the heuristic detector for text repeated across pages.
     *
     * When `true` (default), text repeated verbatim across a supermajority of
     * pages is classified as furniture and stripped. Turn it off if brand names
     * or repeated headings are being removed incorrectly by the heuristic.
     *
     * Note: when a layout-detection model is active, the model may independently
     * classify page-header / page-footer regions as furniture on a per-page
     * basis. To preserve those regions, set `include_headers = true`,
     * `include_footers = true`, or both, in addition to disabling this flag.
     *
     * Primarily affects PDF extraction.
     *
     * Default: `true`.
     */
    val stripRepeatingText: Boolean = true,

    /**
     * Keep watermark text in the extraction output.
     *
     * - PDF: keeps watermark artifacts and arXiv identifiers.
     * - Other formats: currently no effect.
     *
     * Default: `false` (watermarks are stripped).
     */
    val includeWatermarks: Boolean = false,
)

View File

@@ -0,0 +1,63 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Layer a document node belongs to.
 *
 * Replaces separate body/furniture arrays with per-node granularity. Every
 * constant has a lowercase wire string, produced by [toWire] and parsed back
 * by [fromWire].
 */
enum class ContentLayer {
    /** Main body content of the document. */
    @com.fasterxml.jackson.annotation.JsonProperty("body")
    BODY,

    /** Running header of a page or section. */
    @com.fasterxml.jackson.annotation.JsonProperty("header")
    HEADER,

    /** Running footer of a page or section. */
    @com.fasterxml.jackson.annotation.JsonProperty("footer")
    FOOTER,

    /** Footnote content. */
    @com.fasterxml.jackson.annotation.JsonProperty("footnote")
    FOOTNOTE;

    /** Returns the lowercase string used for this constant on the wire. */
    @com.fasterxml.jackson.annotation.JsonValue
    fun toWire(): String = when (this) {
        BODY -> "body"
        HEADER -> "header"
        FOOTER -> "footer"
        FOOTNOTE -> "footnote"
    }

    companion object {
        /** Reverse index from wire string to constant, derived from [toWire]. */
        private val BY_WIRE: Map<String, ContentLayer> = ContentLayer.values().associateBy { it.toWire() }

        /**
         * Resolves a wire string to its constant.
         *
         * @param value the wire string to look up (matched exactly, case-sensitive).
         * @throws IllegalArgumentException if [value] is not a known wire string.
         */
        @com.fasterxml.jackson.annotation.JsonCreator
        @JvmStatic
        fun fromWire(value: String): ContentLayer =
            BY_WIRE[value] ?: throw IllegalArgumentException("Unknown ContentLayer value: $value")
    }
}

View File

@@ -0,0 +1,26 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * A JATS contributor together with an optional role.
 *
 * @property name contributor name (required).
 * @property role contributor role, or `null` when none is given.
 */
data class ContributorRole(
    val name: String,
    val role: String? = null,
)

View File

@@ -0,0 +1,62 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Dublin Core metadata read from `docProps/core.xml`.
 *
 * Holds the standard Dublin Core fields plus Office-specific extensions.
 * Every property is an optional string defaulting to `null`.
 */
data class CoreProperties(
    /** Title of the document. */
    val title: String? = null,

    /** Subject or topic of the document. */
    val subject: String? = null,

    /** Creator or author of the document. */
    val creator: String? = null,

    /** Keywords or tags. */
    val keywords: String? = null,

    /** Description or abstract of the document. */
    val description: String? = null,

    /** User who last modified the document. */
    val lastModifiedBy: String? = null,

    /** Revision number. */
    val revision: String? = null,

    /** Creation timestamp (ISO 8601). */
    val created: String? = null,

    /** Last modification timestamp (ISO 8601). */
    val modified: String? = null,

    /** Category of the document. */
    val category: String? = null,

    /** Content status (Draft, Final, etc.). */
    val contentStatus: String? = null,

    /** Language of the document. */
    val language: String? = null,

    /** Unique identifier. */
    val identifier: String? = null,

    /** Version of the document. */
    val version: String? = null,

    /** Last print timestamp (ISO 8601). */
    val lastPrinted: String? = null,
)

View File

@@ -0,0 +1,32 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Metadata for CSV/TSV files.
 *
 * Every property has a default, so the class can be constructed with no
 * arguments.
 */
data class CsvMetadata(
    /** Row count; defaults to `0`. */
    val rowCount: Int = 0,

    /** Column count; defaults to `0`. */
    val columnCount: Int = 0,

    /** Delimiter, or `null` when not set. */
    val delimiter: String? = null,

    /** Header flag; defaults to `false`. */
    val hasHeader: Boolean = false,

    /** Column type names, or `null` when not set. */
    val columnTypes: List<String>? = null,
)

View File

@@ -0,0 +1,26 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Information about a single dBASE field.
 *
 * @property name field name (required).
 * @property fieldType field type, as a string (required).
 */
data class DbfFieldInfo(
    val name: String,
    val fieldType: String,
)

View File

@@ -0,0 +1,30 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Metadata for dBASE (DBF) files.
 *
 * Every property has a default, so the class can be constructed with no
 * arguments.
 */
data class DbfMetadata(
    /** Record count; defaults to `0`. */
    val recordCount: Long = 0L,

    /** Field count; defaults to `0`. */
    val fieldCount: Long = 0L,

    /** Per-field information; empty by default. */
    val fields: List<DbfFieldInfo> = emptyList(),
)

View File

@@ -0,0 +1,33 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:max-line-length",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:annotation",
"MaxLineLength",
"TooManyFunctions",
"LongParameterList",
"LongMethod",
)
package dev.kreuzberg
import com.fasterxml.jackson.core.type.TypeReference
/**
 * Wrapper around a native document handle.
 *
 * Instances are [AutoCloseable]: [close] hands the handle to
 * `KreuzbergBridge.nativeFreeDocument`. Closing is idempotent, so combining an
 * explicit `close()` with `use { }` releases the handle only once.
 */
@Suppress("TooManyFunctions")
class Document internal constructor(internal val handle: Long) : AutoCloseable {
    /** Flips to `true` the first time [close] runs; prevents releasing the handle twice. */
    private val closed = java.util.concurrent.atomic.AtomicBoolean(false)

    /**
     * Returns the 1-based page number for each top-level table in the document.
     *
     * The bridge response is parsed as a JSON list of numbers.
     *
     * @throws IllegalStateException if this document has already been closed.
     */
    fun tablePageNumbers(): List<Long> {
        // Never pass a handle that was already released to native code.
        check(!closed.get()) { "Document is closed" }
        val responseJson = KreuzbergBridge.nativeDocumentTablePageNumbers(handle)
        return MAPPER.readValue(responseJson, object : TypeReference<List<Long>>() {})
    }

    /** Releases the native handle. Calls after the first one are no-ops. */
    override fun close() {
        if (closed.compareAndSet(false, true)) {
            KreuzbergBridge.nativeFreeDocument(handle)
        }
    }

    companion object {
        /** Shared mapper used to decode JSON returned by the native bridge. */
        private val MAPPER = com.fasterxml.jackson.databind.ObjectMapper()
            .registerModule(com.fasterxml.jackson.datatype.jdk8.Jdk8Module())
            .findAndRegisterModules()
            .setPropertyNamingStrategy(com.fasterxml.jackson.databind.PropertyNamingStrategies.SNAKE_CASE)
    }
}

View File

@@ -0,0 +1,31 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Response returned by MIME type detection.
 *
 * @property mimeType the detected MIME type (required).
 * @property filename the original filename if one was provided, otherwise `null`.
 */
data class DetectResponse(
    val mimeType: String,
    val filename: String? = null,
)

View File

@@ -0,0 +1,30 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Detection result for one page: all detections plus the page metadata.
 *
 * @property pageWidth page width (required).
 * @property pageHeight page height (required).
 * @property detections detections found on the page; empty by default.
 */
data class DetectionResult(
    val pageWidth: Int,
    val pageHeight: Int,
    val detections: List<LayoutDetection> = emptyList(),
)

View File

@@ -0,0 +1,38 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * One contiguous hunk of a unified diff.
 *
 * The four position/count properties are required; [lines] defaults to an
 * empty list.
 */
data class DiffHunk(
    /** Line number in the old content at which the hunk starts (0-indexed). */
    val fromLine: Long,

    /** How many lines of the old content the hunk covers. */
    val fromCount: Long,

    /** Line number in the new content at which the hunk starts (0-indexed). */
    val toLine: Long,

    /** How many lines of the new content the hunk covers. */
    val toCount: Long,

    /** The lines making up the hunk. */
    @field:com.fasterxml.jackson.databind.annotation.JsonSerialize(contentAs = DiffLine::class)
    val lines: List<DiffLine> = emptyList(),
)

View File

@@ -0,0 +1,95 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * One line of a unified-diff hunk.
 *
 * The type is defined here (rather than only in `crate.diff`) so that
 * `RevisionDelta` can reference it unconditionally, without requiring the
 * `diff` Cargo feature; `crate.diff` re-exports it verbatim.
 *
 * JSON conversion is delegated to the custom serializer and deserializer named
 * in the annotations below.
 */
@com.fasterxml.jackson.databind.annotation.JsonDeserialize(using = DiffLineDeserializer::class)
@com.fasterxml.jackson.databind.annotation.JsonSerialize(using = DiffLineSerializer::class)
sealed class DiffLine {
    /** Context line that is unchanged between the two versions. */
    data class Context(val value: String) : DiffLine()

    /** Line that was added in the "after" version. */
    data class Added(val value: String) : DiffLine()

    /** Line that was removed from the "before" version. */
    data class Removed(val value: String) : DiffLine()
}
/**
 * Jackson deserializer for [DiffLine].
 *
 * Reads the incoming value as a JSON object, takes the `kind` field as the
 * variant discriminator, and builds the matching [DiffLine] subclass from the
 * remaining object content.
 */
private class DiffLineDeserializer : com.fasterxml.jackson.databind.deser.std.StdDeserializer<DiffLine>(DiffLine::class.java) {
/**
 * Builds a [DiffLine] from the current parser position.
 *
 * Throws `InvalidFormatException` when `kind` is missing or is not one of
 * `context`, `added`, `removed`.
 */
@Suppress("LongMethod")
override fun deserialize(
parser: com.fasterxml.jackson.core.JsonParser,
ctx: com.fasterxml.jackson.databind.DeserializationContext,
): DiffLine {
val node = parser.codec.readTree<com.fasterxml.jackson.databind.node.ObjectNode>(parser)
// `tag` is null when the object has no `kind` field; that case falls through to the `else` branch.
val tag = node.get("kind")?.asText()
// Work on a copy with the discriminator removed so only the variant's own content remains.
@Suppress("UNCHECKED_CAST")
val payload = (node.deepCopy() as com.fasterxml.jackson.databind.node.ObjectNode).apply { remove("kind") }
// NOTE(review): every branch binds `payload` (still a JSON object) to `String`. That looks
// like it would be rejected by Jackson's String binding — confirm the intended wire shape
// for these single-value variants.
return when (tag) {
"context" -> DiffLine.Context(ctx.readTreeAsValue<String>(payload, String::class.java))
"added" -> DiffLine.Added(ctx.readTreeAsValue<String>(payload, String::class.java))
"removed" -> DiffLine.Removed(ctx.readTreeAsValue<String>(payload, String::class.java))
else -> throw com.fasterxml.jackson.databind.exc.InvalidFormatException(
parser, "Unknown DiffLine tag", tag, DiffLine::class.java,
)
}
}
}
/**
 * Jackson serializer for [DiffLine].
 *
 * Converts the variant's `value` to a JSON tree, adds a `kind` field naming the
 * variant (`context`, `added`, or `removed`), and writes the tree to the
 * generator.
 */
private class DiffLineSerializer : com.fasterxml.jackson.databind.ser.std.StdSerializer<DiffLine>(DiffLine::class.java) {
/** Writes [value] to [gen] as a tree carrying a `kind` discriminator. */
@Suppress("LongMethod")
override fun serialize(
value: DiffLine,
gen: com.fasterxml.jackson.core.JsonGenerator,
provider: com.fasterxml.jackson.databind.SerializerProvider,
) {
// Reuse the generator's codec when it is an ObjectMapper; otherwise a new mapper is built for this call.
@Suppress("UNCHECKED_CAST")
val mapper = (gen.codec as? com.fasterxml.jackson.databind.ObjectMapper) ?: com.fasterxml.jackson.databind.ObjectMapper().findAndRegisterModules()
// NOTE(review): in every branch `value.value` is a `String`, yet its tree is cast to
// `ObjectNode`. A string presumably converts to a text node, so the cast looks like it
// would fail at runtime — confirm the intended wire shape before relying on this.
val node: com.fasterxml.jackson.databind.node.ObjectNode = when (value) {
is DiffLine.Context -> {
@Suppress("UNCHECKED_CAST")
val n = mapper.valueToTree<com.fasterxml.jackson.databind.node.ObjectNode>(value.value) as com.fasterxml.jackson.databind.node.ObjectNode
n.put("kind", "context")
n
}
is DiffLine.Added -> {
@Suppress("UNCHECKED_CAST")
val n = mapper.valueToTree<com.fasterxml.jackson.databind.node.ObjectNode>(value.value) as com.fasterxml.jackson.databind.node.ObjectNode
n.put("kind", "added")
n
}
is DiffLine.Removed -> {
@Suppress("UNCHECKED_CAST")
val n = mapper.valueToTree<com.fasterxml.jackson.databind.node.ObjectNode>(value.value) as com.fasterxml.jackson.databind.node.ObjectNode
n.put("kind", "removed")
n
}
}
mapper.writeTree(gen, node)
}
}

View File

@@ -0,0 +1,38 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Options that control how two `ExtractionResult` values are compared.
 *
 * Every property has a default, so the class can be constructed with no
 * arguments.
 */
data class DiffOptions(
    /** Whether metadata changes are part of the diff. Default: `true`. */
    val includeMetadata: Boolean = true,

    /** Whether embedded-children changes are part of the diff. Default: `true`. */
    val includeEmbedded: Boolean = true,

    /**
     * Number of characters to which content is truncated before diffing.
     *
     * Helpful for very large documents where only the first N characters
     * matter. `null` means no truncation.
     */
    val maxContentChars: Long? = null,
)

View File

@@ -0,0 +1,56 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Full Djot document structure with semantic preservation.
 *
 * The type captures the richness of Djot markup, including:
 *
 * - Block-level structures (headings, lists, blockquotes, code blocks, etc.)
 * - Inline formatting (emphasis, strong, highlight, subscript, superscript, etc.)
 * - Attributes (classes, IDs, key-value pairs)
 * - Links, images, footnotes
 * - Math expressions (inline and display)
 * - Tables with full structure
 *
 * Available when the `djot` feature is enabled.
 *
 * [plainText] and [metadata] are required; every list property defaults to an
 * empty list.
 */
data class DjotContent(
    /** Plain-text rendering, kept for backwards compatibility. */
    val plainText: String,

    /** Block-level content in structured form. */
    val blocks: List<FormattedBlock> = emptyList(),

    /** Metadata taken from the YAML frontmatter. */
    val metadata: Metadata,

    /** Tables extracted as structured data. */
    val tables: List<Table> = emptyList(),

    /** Images extracted together with their metadata. */
    val images: List<DjotImage> = emptyList(),

    /** Links extracted together with their URLs. */
    val links: List<DjotLink> = emptyList(),

    /** Footnote definitions. */
    val footnotes: List<Footnote> = emptyList(),

    /** Attributes mapped by element identifier (if present). */
    val attributes: List<String> = emptyList(),
)

View File

@@ -0,0 +1,35 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
* Image element in Djot.
*
* [src] and [alt] are required; [title] and [attributes] default to `null`.
*/
data class DjotImage(
/** Image source URL or path. */
val src: String,
/** Alternative text. */
val alt: String,
/** Optional title; `null` by default. */
val title: String? = null,
/**
* Element attributes; `null` by default.
*
* NOTE(review): carried as a single string — its format is not visible here; confirm.
*/
val attributes: String? = null,
)

View File

@@ -0,0 +1,35 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
* Link element in Djot.
*
* [url] and [text] are required; [title] and [attributes] default to `null`.
*/
data class DjotLink(
/** Link URL. */
val url: String,
/** Link text content. */
val text: String,
/** Optional title; `null` by default. */
val title: String? = null,
/**
* Element attributes; `null` by default.
*
* NOTE(review): carried as a single string — its format is not visible here; confirm.
*/
val attributes: String? = null,
)

View File

@@ -0,0 +1,25 @@
// Generated by alef. Do not edit by hand.
package dev.kreuzberg
/**
 * Registry bridge for [IDocumentExtractor] implementations.
 *
 * Every mutation is forwarded to [KreuzbergBridge] and mirrored in a local
 * name-to-implementation map that backs [getAll]. The native call is made
 * first and the local map is only touched afterwards, so a native call that
 * throws (for example because the native library is unavailable) leaves the
 * local map unchanged.
 *
 * NOTE(review): the backing map is a plain mutable map with no synchronization;
 * concurrent callers presumably need external locking — confirm.
 */
object DocumentExtractorBridge {
    private val registered = mutableMapOf<String, IDocumentExtractor>()

    /**
     * Registers [impl] natively, then records it locally under `impl.name()`.
     * An existing local entry with the same name is replaced.
     */
    fun register(impl: IDocumentExtractor) {
        val name = impl.name()
        // Native first: if this throws, the local registry must not claim the extractor.
        KreuzbergBridge.nativeRegisterDocumentExtractor(impl)
        registered[name] = impl
    }

    /** Unregisters [name] natively, then drops the local entry (no-op locally if absent). */
    fun unregister(name: String) {
        // Native first: if this throws, the local entry is kept.
        KreuzbergBridge.nativeUnregisterDocumentExtractor(name)
        registered.remove(name)
    }

    /** Clears all native registrations, then empties the local map. */
    fun clearAll() {
        KreuzbergBridge.nativeClearDocumentExtractors()
        registered.clear()
    }

    /** Returns a snapshot copy of the locally tracked extractors, keyed by name. */
    fun getAll(): Map<String, IDocumentExtractor> = registered.toMap()
}

View File

@@ -0,0 +1,62 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
* A single node in the document tree.
*
* Each node has deterministic `id`, typed `content`, optional `parent`/`children`
* for tree structure, and metadata like page number, bounding box, and content layer.
*
* [id], [content] and [contentLayer] are required; every other property has a default.
* NOTE(review): [parent] and [children] are presumably indices into the owning
* structure's node list — confirm.
*/
data class DocumentNode(
/** Deterministic identifier (hash of content + position). */
val id: String,
/**
* Node content — tagged enum, type-specific data only.
*
* The `@JsonSerialize(as = NodeContent::class)` annotation is applied to the backing field.
*/
@field:com.fasterxml.jackson.databind.annotation.JsonSerialize(`as` = NodeContent::class)
val content: NodeContent,
/** Parent node index (`null` = root-level node); `null` by default. */
val parent: Int? = null,
/** Child node indices in reading order; empty by default. */
val children: List<Int> = emptyList(),
/** Content layer classification. Required: this parameter has no default. */
val contentLayer: ContentLayer,
/** Page number where this node starts (1-indexed); `null` by default. */
val page: Int? = null,
/** Page number where this node ends (for multi-page tables/sections); `null` by default. */
val pageEnd: Int? = null,
/** Bounding box in document coordinates; `null` by default. */
val bbox: BoundingBox? = null,
/**
* Inline annotations (formatting, links) on this node's text content.
*
* Only meaningful for text-carrying nodes; empty for containers.
*/
val annotations: List<TextAnnotation> = emptyList(),
/**
* Format-specific key-value attributes.
*
* Extensible bag for miscellaneous data without a dedicated typed field: CSS classes,
* LaTeX environment names, Excel cell formulas, slide layout names, etc.
* Unlike the list properties, this defaults to `null` rather than an empty map.
*/
val attributes: Map<String, String>? = null,
)

View File

@@ -0,0 +1,33 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
* A resolved relationship between two nodes in the document tree.
*
* All three properties are required; none has a default.
*/
data class DocumentRelationship(
/** Source node index (the referencing node). */
val source: Int,
/** Target node index (the referenced node). */
val target: Int,
/** Semantic kind of the relationship. */
val kind: RelationshipKind,
)

View File

@@ -0,0 +1,66 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
* A single tracked change embedded in a document.
*
* Populated by per-format extractors that understand change-tracking metadata
* (DOCX `w:ins`/`w:del`/`w:rPrChange`, ODT `text:change-*`, …). Every
* extractor leaves `ExtractionResult.revisions` unset until a
* format-specific implementation is added.
* NOTE(review): the upstream wording is `None`; presumably that surfaces as
* `null` in this binding — confirm against `ExtractionResult`.
*
* [revisionId], [kind] and [delta] are required; the remaining properties default to `null`.
*/
data class DocumentRevision(
/**
* Format-specific revision identifier.
*
* For DOCX this is the `w:id` attribute value on the change element
* (e.g. `"42"`). When the attribute is absent a synthetic fallback is
* generated (`"docx-ins-0"`, `"docx-del-3"`, …).
*/
val revisionId: String,
/** Display name of the author who made this change, when available; `null` by default. */
val author: String? = null,
/**
* ISO-8601 timestamp of the change, when available; `null` by default.
*
* Stored as a plain string rather than a date/time type.
* DOCX populates this from the `w:date` attribute (e.g.
* `"2024-03-15T10:30:00Z"`).
*/
val timestamp: String? = null,
/** Semantic kind of this revision. Required: this parameter has no default. */
val kind: RevisionKind,
/**
* Best-effort document location for this revision.
*
* Resolution is format-dependent and may be `null` when the location
* cannot be determined (e.g. changes inside table cells before
* table-cell anchor support is added).
*/
@field:com.fasterxml.jackson.databind.annotation.JsonSerialize(`as` = RevisionAnchor::class)
val anchor: RevisionAnchor? = null,
/** The content changes that make up this revision. Required: this parameter has no default. */
val delta: RevisionDelta,
)

View File

@@ -0,0 +1,65 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
* Top-level structured document representation.
*
* A flat array of nodes with index-based parent/child references forming a tree.
* Root-level nodes have a `null` parent.
*
* Every property has a default, so `DocumentStructure()` yields an empty structure.
*
* NOTE(review): the upstream documentation refers to `body_roots()`,
* `furniture_roots()`, `validate()` and `finalize_node_types`; none of these is
* declared on this Kotlin class, so they presumably exist only in the native
* core — confirm before relying on them from Kotlin.
*/
data class DocumentStructure(
/** All nodes in document/reading order; empty by default. */
val nodes: List<DocumentNode> = emptyList(),
/**
* Origin format identifier (e.g. "docx", "pptx", "html", "pdf"); `null` by default.
*
* Allows renderers to apply format-aware heuristics when converting
* the document tree to output formats.
*/
val sourceFormat: String? = null,
/**
* Resolved relationships between nodes (footnote refs, citations, anchor links, etc.).
*
* Populated during derivation from the internal document representation.
* Empty when no relationships are detected.
*/
val relationships: List<DocumentRelationship> = emptyList(),
/**
* Sorted, deduplicated list of node type names present in this document.
*
* Each value is the snake_case `node_type` tag of the corresponding
* `NodeContent` variant (e.g. `"paragraph"`, `"heading"`, `"table"`, …).
*
* This class does not compute the list itself; it stores whatever is passed
* in and defaults to an empty list.
*/
val nodeTypes: List<String> = emptyList(),
)

View File

@@ -0,0 +1,63 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
* Application properties from docProps/app.xml for DOCX.
*
* Contains Word-specific document statistics and metadata.
* Every property is nullable and defaults to `null`.
*/
data class DocxAppProperties(
/** Application name (e.g., "Microsoft Office Word"). */
val application: String? = null,
/** Application version. */
val appVersion: String? = null,
/** Template filename. */
val template: String? = null,
/** Total editing time in minutes. */
val totalTime: Int? = null,
/** Number of pages. */
val pages: Int? = null,
/** Number of words. */
val words: Int? = null,
/** Number of characters (excluding spaces). */
val characters: Int? = null,
/** Number of characters (including spaces). */
val charactersWithSpaces: Int? = null,
/** Number of lines. */
val lines: Int? = null,
/** Number of paragraphs. */
val paragraphs: Int? = null,
/** Company name. */
val company: String? = null,
/** Document security level. */
val docSecurity: Int? = null,
/** Scale crop flag. */
val scaleCrop: Boolean? = null,
/** Links up to date flag. */
val linksUpToDate: Boolean? = null,
/** Shared document flag. */
val sharedDoc: Boolean? = null,
/** Hyperlinks changed flag. */
val hyperlinksChanged: Boolean? = null,
)

View File

@@ -0,0 +1,53 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
* Word document metadata.
*
* Extracted from DOCX files using shared Office Open XML metadata extraction.
* Groups the core, application and custom property sets; each one is nullable
* and defaults to `null`.
*/
data class DocxMetadata(
/**
* Core properties from docProps/core.xml (Dublin Core metadata).
*
* Contains title, creator, subject, keywords, dates, etc.
* Shared format across DOCX/PPTX/XLSX documents.
*/
val coreProperties: CoreProperties? = null,
/**
* Application properties from docProps/app.xml (Word-specific statistics).
*
* Contains word count, page count, paragraph count, editing time, etc.
* DOCX-specific variant of Office application properties.
*/
val appProperties: DocxAppProperties? = null,
/**
* Custom properties from docProps/custom.xml (user-defined properties).
*
* Contains key-value pairs defined by users or applications.
* Values can be strings, numbers, booleans, or dates; they are typed as `Any`,
* so callers must inspect the runtime type of each value.
*/
val customProperties: Map<String, Any>? = null,
)

View File

@@ -0,0 +1,29 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
)
package dev.kreuzberg
/**
* Whether the drawing is inline or anchored.
*
* Closed hierarchy with exactly two variants: [Inline] and [Anchored].
*/
sealed class DrawingType {
/** Inline drawing; a singleton that carries no data. */
object Inline : DrawingType()
/**
* Anchored drawing wrapping a single string payload.
*
* NOTE(review): the meaning and format of [value] are not visible here — confirm
* against the generator's source type.
*/
data class Anchored(val value: String) : DrawingType()
}

View File

@@ -0,0 +1,40 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
* Semantic element extracted from document.
*
* Represents a logical unit of content with semantic classification,
* unique identifier, and metadata for tracking origin and position.
* All four properties are required; none has a default.
*/
data class Element(
/** Unique element identifier. */
val elementId: String,
/** Semantic type of this element. */
val elementType: ElementType,
/** Text content of the element. */
val text: String,
/** Metadata about the element. */
val metadata: ElementMetadata,
)

View File

@@ -0,0 +1,37 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
* Metadata for a semantic element.
*
* Every property has a default, so `ElementMetadata()` is a valid empty value.
*/
data class ElementMetadata(
/** Page number (1-indexed); `null` by default. */
val pageNumber: Int? = null,
/** Source filename or document name; `null` by default. */
val filename: String? = null,
/** Bounding box coordinates if available; `null` by default. */
val coordinates: BoundingBox? = null,
/** Position index in the element sequence; `null` by default. */
val elementIndex: Long? = null,
/** Additional custom metadata as string key-value pairs; empty by default. */
val additional: Map<String, String> = emptyMap(),
)

View File

@@ -0,0 +1,99 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Semantic element type classification.
 *
 * Categorizes text content into semantic units for downstream processing.
 * Each constant has a snake_case wire name, exposed through [toWire] and
 * parsed back by [fromWire].
 */
enum class ElementType {
    /** Document title */
    @com.fasterxml.jackson.annotation.JsonProperty("title")
    TITLE,

    /** Main narrative text body */
    @com.fasterxml.jackson.annotation.JsonProperty("narrative_text")
    NARRATIVE_TEXT,

    /** Section heading */
    @com.fasterxml.jackson.annotation.JsonProperty("heading")
    HEADING,

    /** List item (bullet, numbered, etc.) */
    @com.fasterxml.jackson.annotation.JsonProperty("list_item")
    LIST_ITEM,

    /** Table element */
    @com.fasterxml.jackson.annotation.JsonProperty("table")
    TABLE,

    /** Image element */
    @com.fasterxml.jackson.annotation.JsonProperty("image")
    IMAGE,

    /** Page break marker */
    @com.fasterxml.jackson.annotation.JsonProperty("page_break")
    PAGE_BREAK,

    /** Code block */
    @com.fasterxml.jackson.annotation.JsonProperty("code_block")
    CODE_BLOCK,

    /** Block quote */
    @com.fasterxml.jackson.annotation.JsonProperty("block_quote")
    BLOCK_QUOTE,

    /** Footer text */
    @com.fasterxml.jackson.annotation.JsonProperty("footer")
    FOOTER,

    /** Header text */
    @com.fasterxml.jackson.annotation.JsonProperty("header")
    HEADER;

    /** Returns the wire name of this constant (annotated as the Jackson `@JsonValue`). */
    @com.fasterxml.jackson.annotation.JsonValue
    fun toWire(): String {
        // Exhaustive on purpose (no `else`): a new constant must be given a wire name here.
        return when (this) {
            TITLE -> "title"
            NARRATIVE_TEXT -> "narrative_text"
            HEADING -> "heading"
            LIST_ITEM -> "list_item"
            TABLE -> "table"
            IMAGE -> "image"
            PAGE_BREAK -> "page_break"
            CODE_BLOCK -> "code_block"
            BLOCK_QUOTE -> "block_quote"
            FOOTER -> "footer"
            HEADER -> "header"
        }
    }

    companion object {
        /**
         * Parses a wire name back into its constant (annotated as the Jackson `@JsonCreator`).
         *
         * @throws IllegalArgumentException if [value] matches no constant's wire name.
         */
        @com.fasterxml.jackson.annotation.JsonCreator
        @JvmStatic
        fun fromWire(value: String): ElementType {
            // Reverse lookup through toWire() so both directions share one table.
            val match = ElementType.values().firstOrNull { candidate -> candidate.toWire() == value }
            return match ?: throw IllegalArgumentException("Unknown ElementType value: $value")
        }
    }
}

View File

@@ -0,0 +1,46 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Email attachment representation.
 *
 * Contains metadata and optionally the content of an email attachment.
 * Only [isImage] is required; every other property defaults to `null`.
 *
 * Equality and hashing compare [data] by content. The `equals`/`hashCode`
 * that a data class generates would compare the array by reference, so two
 * attachments holding identical bytes would never be equal.
 */
data class EmailAttachment(
    /** Attachment name (from Content-Disposition header) */
    val name: String? = null,
    /** Filename of the attachment */
    val filename: String? = null,
    /** MIME type of the attachment */
    val mimeType: String? = null,
    /** Size in bytes */
    val size: Long? = null,
    /** Whether this attachment is an image */
    val isImage: Boolean,
    /** Attachment data (if extracted); `null` by default. */
    val data: ByteArray? = null,
) {
    /** Structural equality over all properties, with [data] compared byte-for-byte. */
    override fun equals(other: Any?): Boolean {
        if (this === other) return true
        if (other !is EmailAttachment) return false
        return name == other.name &&
            filename == other.filename &&
            mimeType == other.mimeType &&
            size == other.size &&
            isImage == other.isImage &&
            // Arrays.equals treats two nulls as equal and null vs non-null as different.
            java.util.Arrays.equals(data, other.data)
    }

    /** Hash consistent with [equals]; [data] contributes its content hash (0 when `null`). */
    override fun hashCode(): Int {
        var result = name?.hashCode() ?: 0
        result = 31 * result + (filename?.hashCode() ?: 0)
        result = 31 * result + (mimeType?.hashCode() ?: 0)
        result = 31 * result + (size?.hashCode() ?: 0)
        result = 31 * result + isImage.hashCode()
        result = 31 * result + java.util.Arrays.hashCode(data)
        return result
    }
}

View File

@@ -0,0 +1,49 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
* Configuration for email extraction.
*
* Currently exposes a single option, [msgFallbackCodepage].
*/
data class EmailConfig(
/**
* Windows codepage number to use when an MSG file contains no codepage property.
* Defaults to `null`, which falls back to windows-1252.
*
* If an unrecognized or invalid codepage number is supplied (including 0),
* the behavior silently falls back to windows-1252 — the same as when the
* MSG file itself contains an unrecognized codepage. No error or warning is
* emitted. Users should verify output when supplying unusual values.
* This class itself performs no validation of the value.
*
* Common values:
*
* - 1250: Central European (Polish, Czech, Hungarian, etc.)
* - 1251: Cyrillic (Russian, Ukrainian, Bulgarian, etc.)
* - 1252: Western European (default)
* - 1253: Greek
* - 1254: Turkish
* - 1255: Hebrew
* - 1256: Arabic
* - 932: Japanese (Shift-JIS)
* - 936: Simplified Chinese (GBK)
*/
val msgFallbackCodepage: Int? = null,
)

View File

@@ -0,0 +1,56 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
* Email extraction result.
*
* Complete representation of an extracted email message (.eml or .msg)
* including headers, body content, and attachments.
*
* Only [content] is required. Because it is declared after defaulted
* parameters, pass it by name.
*/
data class EmailExtractionResult(
/** Email subject line; `null` by default. */
val subject: String? = null,
/** Sender email address; `null` by default. */
val fromEmail: String? = null,
/** Primary recipient email addresses; empty by default. */
val toEmails: List<String> = emptyList(),
/** CC recipient email addresses; empty by default. */
val ccEmails: List<String> = emptyList(),
/** BCC recipient email addresses; empty by default. */
val bccEmails: List<String> = emptyList(),
/** Email date/timestamp, carried as a string; `null` by default. */
val date: String? = null,
/** Message-ID header value; `null` by default. */
val messageId: String? = null,
/** Plain text version of the email body; `null` by default. */
val plainText: String? = null,
/** HTML version of the email body; `null` by default. */
val htmlContent: String? = null,
/**
* Cleaned/processed text content. Aliased as `cleaned_text` for back-compat.
*
* NOTE(review): no alias annotation is declared on this property, so the
* `cleaned_text` alias is presumably handled outside this class — confirm.
*/
val content: String,
/** List of email attachments; empty by default. */
val attachments: List<EmailAttachment> = emptyList(),
/** Additional email headers and metadata as string key-value pairs; empty by default. */
val metadata: Map<String, String> = emptyMap(),
)

View File

@@ -0,0 +1,45 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
* Email metadata extracted from .eml and .msg files.
*
* Includes sender/recipient information, message ID, and attachment list.
* Every property has a default: nullable strings default to `null` and
* lists default to empty.
*/
data class EmailMetadata(
/** Sender's email address. */
val fromEmail: String? = null,
/** Sender's display name. */
val fromName: String? = null,
/** Primary recipients. */
val toEmails: List<String> = emptyList(),
/** CC recipients. */
val ccEmails: List<String> = emptyList(),
/** BCC recipients. */
val bccEmails: List<String> = emptyList(),
/** Message-ID header value. */
val messageId: String? = null,
/** List of attachment filenames (names only, not attachment contents). */
val attachments: List<String> = emptyList(),
)

View File

@@ -0,0 +1,37 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
* Changes to embedded archive children between two results.
*
* The two results are referred to as `a` and `b` below; children are matched by `path`.
* All three lists default to empty.
*/
data class EmbeddedChanges(
/** Children present in `b` but not in `a` (matched by `path`). */
val added: List<ArchiveEntry> = emptyList(),
/** Children present in `a` but not in `b` (matched by `path`). */
val removed: List<ArchiveEntry> = emptyList(),
/**
* Children present in both but with differing content (matched by `path`).
*
* Each entry is an [EmbeddedDiff] holding the diff of the nested `ExtractionResult`.
*/
val changed: List<EmbeddedDiff> = emptyList(),
)

View File

@@ -0,0 +1,31 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
* Diff for a single embedded archive entry that appears in both results.
*
* Both properties are required; neither has a default.
*/
data class EmbeddedDiff(
/** Archive-relative path identifying this entry. */
val path: String,
/** The recursive diff of the entry's extraction result. */
val diff: ExtractionDiff,
)

View File

@@ -0,0 +1,40 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Embedded file descriptor extracted from the PDF name tree.
 *
 * Equality and hashing compare [data] by content. The `equals`/`hashCode`
 * that a data class generates would compare the array by reference, so two
 * descriptors holding identical bytes would never be equal.
 */
data class EmbeddedFile(
    /** The filename as stored in the PDF name tree. */
    val name: String,
    /** Raw file bytes from the embedded stream (already decompressed by lopdf). */
    val data: ByteArray,
    /**
     * Compressed byte count of the original stream (before decompression).
     *
     * Used by callers to compute the decompression ratio and detect zip-bomb-style
     * attacks that embed a tiny compressed stream expanding to gigabytes of data.
     */
    val compressedSize: Long,
    /** MIME type if specified in the filespec, otherwise `null`. */
    val mimeType: String? = null,
) {
    /** Structural equality over all properties, with [data] compared byte-for-byte. */
    override fun equals(other: Any?): Boolean {
        if (this === other) return true
        if (other !is EmbeddedFile) return false
        return name == other.name &&
            compressedSize == other.compressedSize &&
            mimeType == other.mimeType &&
            data.contentEquals(other.data)
    }

    /** Hash consistent with [equals]; [data] contributes its content hash. */
    override fun hashCode(): Int {
        var result = name.hashCode()
        result = 31 * result + data.contentHashCode()
        result = 31 * result + compressedSize.hashCode()
        result = 31 * result + (mimeType?.hashCode() ?: 0)
        return result
    }
}

View File

@@ -0,0 +1,25 @@
// Generated by alef. Do not edit by hand.
package dev.kreuzberg
/**
 * Registry bridge for [IEmbeddingBackend] implementations.
 *
 * Every mutation is forwarded to [KreuzbergBridge] and mirrored in a local
 * name-to-implementation map that backs [getAll]. The native call is made
 * first and the local map is only touched afterwards, so a native call that
 * throws (for example because the native library is unavailable) leaves the
 * local map unchanged.
 *
 * NOTE(review): the backing map is a plain mutable map with no synchronization;
 * concurrent callers presumably need external locking — confirm.
 */
object EmbeddingBackendBridge {
    private val registered = mutableMapOf<String, IEmbeddingBackend>()

    /**
     * Registers [impl] natively, then records it locally under `impl.name()`.
     * An existing local entry with the same name is replaced.
     */
    fun register(impl: IEmbeddingBackend) {
        val name = impl.name()
        // Native first: if this throws, the local registry must not claim the backend.
        KreuzbergBridge.nativeRegisterEmbeddingBackend(impl)
        registered[name] = impl
    }

    /** Unregisters [name] natively, then drops the local entry (no-op locally if absent). */
    fun unregister(name: String) {
        // Native first: if this throws, the local entry is kept.
        KreuzbergBridge.nativeUnregisterEmbeddingBackend(name)
        registered.remove(name)
    }

    /** Clears all native registrations, then empties the local map. */
    fun clearAll() {
        KreuzbergBridge.nativeClearEmbeddingBackends()
        registered.clear()
    }

    /** Returns a snapshot copy of the locally tracked backends, keyed by name. */
    fun getAll(): Map<String, IEmbeddingBackend> = registered.toMap()
}

View File

@@ -0,0 +1,71 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
import java.nio.file.Path
/**
* Embedding configuration for text chunks.
*
* Configures embedding generation using ONNX models via the vendored embedding engine.
* Requires the `embeddings` feature to be enabled.
*
* [model] is the only required property; all others have defaults.
*/
data class EmbeddingConfig(
/**
* The embedding model to use.
*
* NOTE(review): upstream documentation says this defaults to the "balanced"
* preset, but this parameter has no default here and must always be supplied — confirm.
*/
@field:com.fasterxml.jackson.databind.annotation.JsonSerialize(`as` = EmbeddingModelType::class)
val model: EmbeddingModelType,
/** Whether to normalize embedding vectors (recommended for cosine similarity); `true` by default. */
val normalize: Boolean = true,
/** Batch size for embedding generation; 32 by default. */
val batchSize: Long = 32L,
/** Show model download progress; `false` by default. */
val showDownloadProgress: Boolean = false,
/**
* Custom cache directory for model files; `null` by default.
*
* Defaults to `~/.cache/kreuzberg/embeddings/` if not specified.
* Allows full customization of model download location.
*/
val cacheDir: java.nio.file.Path? = null,
/**
* Hardware acceleration for the embedding ONNX model.
*
* When set, controls which execution provider (CPU, CUDA, CoreML, TensorRT)
* is used for inference. Defaults to `null` (auto-select per platform).
*/
val acceleration: AccelerationConfig? = null,
/**
* Maximum wall-clock duration (in seconds) for a single `embed()` call when
* using `EmbeddingModelType.Plugin`.
*
* Applies only to the in-process plugin path — protects against hung
* host-language backends (e.g. a Python callback deadlocked on the GIL,
* a model stuck on CUDA OOM retries, etc.). On timeout, the dispatcher
* returns an error instead of blocking forever.
* NOTE(review): upstream names the returned error only as `Plugin`; the exact
* error type surfaced to Kotlin is not visible here — confirm.
*
* `null` disables the timeout.
* NOTE(review): upstream documentation describes a 60-second default, but this
* parameter defaults to `null` here. Confirm whether omitting it yields a
* 60-second timeout applied by the native side or no timeout at all.
*/
val maxEmbedDurationSecs: Long? = null,
)

View File

@@ -0,0 +1,140 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * The kinds of embedding model Kreuzberg can be configured with.
 *
 * JSON conversion of the hierarchy is delegated to a custom serializer/deserializer
 * pair; each variant opts back out of them so that its own fields are bound by
 * Jackson's regular handling.
 */
@com.fasterxml.jackson.databind.annotation.JsonDeserialize(using = EmbeddingModelTypeDeserializer::class)
@com.fasterxml.jackson.databind.annotation.JsonSerialize(using = EmbeddingModelTypeSerializer::class)
sealed class EmbeddingModelType {
    /** A named, predefined model configuration (the recommended choice). */
    @com.fasterxml.jackson.databind.annotation.JsonDeserialize(using = com.fasterxml.jackson.databind.JsonDeserializer.None::class)
    @com.fasterxml.jackson.databind.annotation.JsonSerialize(using = com.fasterxml.jackson.databind.JsonSerializer.None::class)
    data class Preset(
        val name: String,
    ) : EmbeddingModelType()

    /** A custom ONNX model fetched from HuggingFace, identified by [modelId] and its vector [dimensions]. */
    @com.fasterxml.jackson.databind.annotation.JsonDeserialize(using = com.fasterxml.jackson.databind.JsonDeserializer.None::class)
    @com.fasterxml.jackson.databind.annotation.JsonSerialize(using = com.fasterxml.jackson.databind.JsonSerializer.None::class)
    data class Custom(
        val modelId: String,
        val dimensions: Long,
    ) : EmbeddingModelType()

    /**
     * An embedding model hosted by a provider and reached through liter-llm.
     *
     * The model is the one named in the nested [LlmConfig]
     * (for example `"openai/text-embedding-3-small"`).
     */
    @com.fasterxml.jackson.databind.annotation.JsonDeserialize(using = com.fasterxml.jackson.databind.JsonDeserializer.None::class)
    @com.fasterxml.jackson.databind.annotation.JsonSerialize(using = com.fasterxml.jackson.databind.JsonSerializer.None::class)
    data class Llm(
        val llm: LlmConfig,
    ) : EmbeddingModelType()

    /**
     * An in-process embedding backend that was registered through the plugin system.
     *
     * The caller registers a backend once — for instance a wrapper around an
     * already-loaded `llama-cpp-python`, `sentence-transformers` or tuned ONNX model —
     * and then refers to it by [name] in configuration. Kreuzberg calls back into that
     * backend during chunking and for standalone embed requests, so no HuggingFace
     * download, ONNX Runtime installation or HTTP sidecar is involved.
     *
     * With this variant only two `EmbeddingConfig` settings take effect: `normalize`
     * (L2 normalization applied after the call) and `maxEmbedDurationSecs` (the
     * dispatcher timeout). The model-loading settings `batchSize`, `cacheDir`,
     * `showDownloadProgress` and `acceleration` are ignored because the host owns the
     * model lifecycle.
     *
     * Semantic chunking falls back to the maximum-characters setting of `ChunkingConfig`
     * (`max_characters`), since there is no preset from which a chunk-size ceiling could
     * be looked up; size the context window through that setting directly.
     *
     * Backends are registered with `EmbeddingBackendBridge.register`.
     */
    @com.fasterxml.jackson.databind.annotation.JsonDeserialize(using = com.fasterxml.jackson.databind.JsonDeserializer.None::class)
    @com.fasterxml.jackson.databind.annotation.JsonSerialize(using = com.fasterxml.jackson.databind.JsonSerializer.None::class)
    data class Plugin(
        val name: String,
    ) : EmbeddingModelType()
}
/**
 * Jackson deserializer for the internally tagged [EmbeddingModelType] hierarchy.
 *
 * The JSON object carries a `"type"` discriminator (`"preset"`, `"custom"`, `"llm"` or
 * `"plugin"`); every other field is bound to the matching variant class.
 */
private class EmbeddingModelTypeDeserializer :
    com.fasterxml.jackson.databind.deser.std.StdDeserializer<EmbeddingModelType>(EmbeddingModelType::class.java) {
    /**
     * Reads one [EmbeddingModelType] from [parser].
     *
     * @throws com.fasterxml.jackson.databind.exc.MismatchedInputException if the current value is not a JSON object.
     * @throws com.fasterxml.jackson.databind.exc.InvalidFormatException if the `"type"` tag is missing or unknown.
     */
    override fun deserialize(
        parser: com.fasterxml.jackson.core.JsonParser,
        ctx: com.fasterxml.jackson.databind.DeserializationContext,
    ): EmbeddingModelType {
        // Read as a generic node first: asking for an ObjectNode directly would surface a
        // raw ClassCastException when the input is a string, array or number.
        val tree = parser.codec.readTree<com.fasterxml.jackson.databind.JsonNode>(parser)
        val payload = tree as? com.fasterxml.jackson.databind.node.ObjectNode
            ?: throw com.fasterxml.jackson.databind.exc.MismatchedInputException.from(
                parser,
                EmbeddingModelType::class.java,
                "Expected a JSON object for EmbeddingModelType",
            )
        // The tree was just created by this call, so the discriminator can be detached
        // in place; what remains is exactly the variant's own payload.
        val tag = payload.remove("type")?.asText()
        return when (tag) {
            "preset" -> ctx.readTreeAsValue<EmbeddingModelType.Preset>(payload, EmbeddingModelType.Preset::class.java)
            "custom" -> ctx.readTreeAsValue<EmbeddingModelType.Custom>(payload, EmbeddingModelType.Custom::class.java)
            "llm" -> ctx.readTreeAsValue<EmbeddingModelType.Llm>(payload, EmbeddingModelType.Llm::class.java)
            "plugin" -> ctx.readTreeAsValue<EmbeddingModelType.Plugin>(payload, EmbeddingModelType.Plugin::class.java)
            else -> throw com.fasterxml.jackson.databind.exc.InvalidFormatException(
                parser, "Unknown EmbeddingModelType tag", tag, EmbeddingModelType::class.java,
            )
        }
    }
}
/**
 * Jackson serializer for the internally tagged [EmbeddingModelType] hierarchy.
 *
 * Each variant is written as its regular object representation with an additional
 * `"type"` field holding the variant tag.
 */
private class EmbeddingModelTypeSerializer :
    com.fasterxml.jackson.databind.ser.std.StdSerializer<EmbeddingModelType>(EmbeddingModelType::class.java) {
    /** Writes [value] to [gen] as a JSON object that includes its `"type"` discriminator. */
    override fun serialize(
        value: EmbeddingModelType,
        gen: com.fasterxml.jackson.core.JsonGenerator,
        provider: com.fasterxml.jackson.databind.SerializerProvider,
    ) {
        // Prefer the mapper that drives this generator; otherwise fall back to a fresh
        // mapper with auto-discovered modules.
        val mapper = (gen.codec as? com.fasterxml.jackson.databind.ObjectMapper)
            ?: com.fasterxml.jackson.databind.ObjectMapper().findAndRegisterModules()
        // Exhaustive over the sealed hierarchy: a new variant fails compilation here.
        val tag = when (value) {
            is EmbeddingModelType.Preset -> "preset"
            is EmbeddingModelType.Custom -> "custom"
            is EmbeddingModelType.Llm -> "llm"
            is EmbeddingModelType.Plugin -> "plugin"
        }
        // The variant classes opt out of this serializer via their own annotations, so
        // converting the value to a tree does not re-enter this method.
        val node: com.fasterxml.jackson.databind.node.ObjectNode =
            mapper.valueToTree<com.fasterxml.jackson.databind.node.ObjectNode>(value)
        node.put("type", tag)
        mapper.writeTree(gen, node)
    }
}

View File

@@ -0,0 +1,46 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * A ready-made configuration for a common retrieval-augmented-generation scenario.
 *
 * A preset bundles a chunk size, an overlap and an embedding model so that they are
 * tuned together for one use case.
 *
 * Every textual field is an ordinary owned string, which keeps instances safe to copy
 * and to hand across language boundaries.
 */
data class EmbeddingPreset(
    val name: String,
    val chunkSize: Long,
    val overlap: Long,
    /** Name of the HuggingFace repository that hosts the model. */
    val modelRepo: String,
    /** Pooling strategy, either "cls" or "mean". */
    val pooling: String,
    /** Location of the ONNX model file inside the repository. */
    val modelFile: String,
    val dimensions: Long,
    val description: String,
)

View File

@@ -0,0 +1,33 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Metadata specific to EPUB documents (Dublin Core extensions).
 *
 * Every field is optional and defaults to `null`.
 */
data class EpubMetadata(
    val coverage: String? = null,
    val dcFormat: String? = null,
    val relation: String? = null,
    val source: String? = null,
    val dcType: String? = null,
    val coverImage: String? = null,
)

View File

@@ -0,0 +1,26 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Describes an error, as reported for batch operations.
 *
 * Holds the kind of error in [errorType] and its text in [message].
 */
data class ErrorMetadata(
    val errorType: String,
    val message: String,
)

View File

@@ -0,0 +1,36 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Format metadata for Excel and other spreadsheet documents.
 *
 * Marks a document as coming from a spreadsheet (the `FormatMetadata.Excel`
 * discriminant) and carries the sheet count and sheet names.
 */
data class ExcelMetadata(
    /** How many sheets the workbook contains. */
    val sheetCount: Int? = null,
    /** The name of every sheet in the workbook. */
    val sheetNames: List<String>? = null,
)

View File

@@ -0,0 +1,48 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * One worksheet of an Excel workbook.
 *
 * Carries the sheet's content rendered as Markdown together with size statistics.
 */
data class ExcelSheet(
    /** The sheet's name, exactly as shown in Excel. */
    val name: String,
    /** The sheet's content rendered as Markdown tables. */
    val markdown: String,
    /** Row count. */
    val rowCount: Long,
    /** Column count. */
    val colCount: Long,
    /** Count of cells that are not empty. */
    val cellCount: Long,
    /**
     * Cell values as a two-dimensional list, captured while the Markdown was generated
     * so that consumers need not parse the Markdown again. `null` for empty sheets.
     */
    val tableCells: List<List<String>>? = null,
)

View File

@@ -0,0 +1,47 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * An Excel workbook.
 *
 * Holds every sheet of an Excel file (.xlsx, .xls, etc.) along with the extracted
 * content and metadata.
 */
data class ExcelWorkbook(
    /** Every sheet of the workbook. */
    val sheets: List<ExcelSheet> = emptyList(),
    /** Metadata that applies to the workbook as a whole (author, creation date, etc.). */
    val metadata: Map<String, String> = emptyMap(),
    /**
     * Revision headers of collaborative edits, read from `xl/revisions/revisionHeaders.xml`.
     *
     * Filled in for legacy shared-workbook `.xlsx` files that contain the `xl/revisions/`
     * directory. Each `<header>` element becomes one `DocumentRevision` of kind
     * `FormatChange`, taking the header's `guid` as `revision_id`, `userName` as `author`
     * and `dateTime` as `timestamp`. In v1 `anchor` and `delta` are `null`/empty, because
     * parsing the per-cell log is left as a follow-up.
     *
     * `null` when `xl/revisions/revisionHeaders.xml` does not exist.
     */
    val revisions: List<DocumentRevision>? = null,
)

View File

@@ -0,0 +1,69 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * The execution provider ONNX Runtime uses for model inference.
 *
 * The provider decides which hardware backend runs the model. [AUTO] picks the best
 * provider available on the current platform.
 */
enum class ExecutionProviderType {
    /** Automatic choice: CoreML on macOS, CUDA on Linux, CPU elsewhere. */
    @com.fasterxml.jackson.annotation.JsonProperty("auto")
    AUTO,

    /** The CPU provider, which is always available. */
    @com.fasterxml.jackson.annotation.JsonProperty("cpu")
    CPU,

    /** Apple CoreML (Neural Engine and GPU on macOS/iOS). */
    @com.fasterxml.jackson.annotation.JsonProperty("coreml")
    CORE_ML,

    /** GPU acceleration through NVIDIA CUDA. */
    @com.fasterxml.jackson.annotation.JsonProperty("cuda")
    CUDA,

    /** NVIDIA TensorRT, an optimized form of CUDA inference. */
    @com.fasterxml.jackson.annotation.JsonProperty("tensorrt")
    TENSOR_RT;

    /** Returns the lowercase name under which this constant is written to JSON. */
    @com.fasterxml.jackson.annotation.JsonValue
    fun toWire(): String {
        return when (this) {
            AUTO -> "auto"
            CPU -> "cpu"
            CORE_ML -> "coreml"
            CUDA -> "cuda"
            TENSOR_RT -> "tensorrt"
        }
    }

    companion object {
        /**
         * Returns the constant whose [toWire] name equals [value].
         *
         * @throws IllegalArgumentException if no constant uses that wire name.
         */
        @com.fasterxml.jackson.annotation.JsonCreator
        @JvmStatic
        fun fromWire(value: String): ExecutionProviderType =
            values().firstOrNull { candidate -> candidate.toWire() == value }
                ?: throw IllegalArgumentException("Unknown ExecutionProviderType value: $value")
    }
}

View File

@@ -0,0 +1,88 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * An image extracted from a document.
 *
 * Carries the raw image bytes, descriptive metadata and, optionally, the OCR result
 * obtained from the image. Keeping the bytes raw makes the value usable from any
 * language: callers convert them to whatever image type they need.
 *
 * Equality and hashing compare [data] by content. The `equals`/`hashCode` a data class
 * would otherwise generate compare an array property by reference, so two images with
 * identical bytes would never be equal.
 */
data class ExtractedImage(
    /** Raw encoded image bytes (PNG, JPEG, WebP, etc.). */
    val data: ByteArray,
    /** Image format, e.g. "jpeg", "png", "webp". */
    val format: String,
    /** Zero-based position of the image within the document or page. */
    val imageIndex: Int,
    /** One-based page or slide number on which the image was found. */
    val pageNumber: Int? = null,
    /** Width in pixels. */
    val width: Int? = null,
    /** Height in pixels. */
    val height: Int? = null,
    /** Colorspace, e.g. "RGB", "CMYK", "Gray". */
    val colorspace: String? = null,
    /** Bits per color component, e.g. 8 or 16. */
    val bitsPerComponent: Int? = null,
    /** Whether the image is a mask image. */
    val isMask: Boolean,
    /** Optional description of the image. */
    val description: String? = null,
    /**
     * Result of running OCR on this image, if OCR was performed.
     *
     * The result is nested here rather than kept in a separate collection so that the
     * link between image and text is explicit.
     */
    val ocrResult: ExtractionResult? = null,
    /**
     * Position of the image on the page in PDF coordinates (x0=left, y0=bottom,
     * x1=right, y1=top). Only set for images extracted from PDFs, and only when the
     * PDF extractor provides position data.
     */
    val boundingBox: BoundingBox? = null,
    /**
     * Path of the image inside the document archive, e.g. "media/image1.png" in a DOCX.
     * Used to render image references when the binary data itself is not extracted.
     */
    val sourcePath: String? = null,
    /**
     * Heuristic guess at what the image depicts. `null` when classification was
     * disabled or gave no conclusive answer.
     */
    val imageKind: ImageKind? = null,
    /** Confidence of [imageKind], between 0.0 and 1.0. */
    val kindConfidence: Float? = null,
    /**
     * Identifier shared by all images that together make up one logical figure (for
     * example the raster tiles of a single technical drawing). `null` for standalone images.
     */
    val clusterId: Int? = null,
) {
    /** Structural equality over all properties; [data] is compared byte by byte. */
    override fun equals(other: Any?): Boolean {
        if (this === other) return true
        if (other !is ExtractedImage) return false
        return data.contentEquals(other.data) &&
            format == other.format &&
            imageIndex == other.imageIndex &&
            pageNumber == other.pageNumber &&
            width == other.width &&
            height == other.height &&
            colorspace == other.colorspace &&
            bitsPerComponent == other.bitsPerComponent &&
            isMask == other.isMask &&
            description == other.description &&
            ocrResult == other.ocrResult &&
            boundingBox == other.boundingBox &&
            sourcePath == other.sourcePath &&
            imageKind == other.imageKind &&
            // Compare bit patterns so equality agrees with Float.hashCode() used below.
            kindConfidence?.toBits() == other.kindConfidence?.toBits() &&
            clusterId == other.clusterId
    }

    /** Hash consistent with [equals]; [data] contributes its content hash. */
    override fun hashCode(): Int {
        var result = data.contentHashCode()
        result = 31 * result + format.hashCode()
        result = 31 * result + imageIndex
        result = 31 * result + (pageNumber ?: 0)
        result = 31 * result + (width ?: 0)
        result = 31 * result + (height ?: 0)
        result = 31 * result + (colorspace?.hashCode() ?: 0)
        result = 31 * result + (bitsPerComponent ?: 0)
        result = 31 * result + isMask.hashCode()
        result = 31 * result + (description?.hashCode() ?: 0)
        result = 31 * result + (ocrResult?.hashCode() ?: 0)
        result = 31 * result + (boundingBox?.hashCode() ?: 0)
        result = 31 * result + (sourcePath?.hashCode() ?: 0)
        result = 31 * result + (imageKind?.hashCode() ?: 0)
        result = 31 * result + (kindConfidence?.hashCode() ?: 0)
        result = 31 * result + (clusterId ?: 0)
        return result
    }
}

View File

@@ -0,0 +1,35 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Metadata read from an image file.
 *
 * Contains the pixel dimensions, the format name and any EXIF entries that were found.
 */
data class ExtractedImageMetadata(
    /** Width of the image, in pixels. */
    val width: Int,
    /** Height of the image, in pixels. */
    val height: Int,
    /** Format of the image, e.g. "PNG" or "JPEG". */
    val format: String,
    /** EXIF entries, when the file provides them; empty otherwise. */
    val exifData: Map<String, String> = emptyMap(),
)

View File

@@ -0,0 +1,41 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * A URI found in a document.
 *
 * Covers every link, reference or resource pointer encountered during extraction.
 * [kind] gives the semantic category of the URI and [label] holds the optional
 * human-readable text displayed for it.
 */
data class ExtractedUri(
    /** URL or path, as a string. */
    val url: String,
    /** Display text or label of the link, if any. */
    val label: String? = null,
    /** One-based number of the page on which the URI was found, if known. */
    val page: Int? = null,
    /** Semantic category of the URI. */
    val kind: UriKind,
)

View File

@@ -0,0 +1,275 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Top-level configuration of an extraction.
 *
 * Gathers every option that influences the extraction process. A configuration can be
 * loaded from a TOML, YAML or JSON file, or built in code.
 */
data class ExtractionConfig(
    /** Cache extraction results. */
    val useCache: Boolean = true,
    /** Run quality post-processing. */
    val enableQualityProcessing: Boolean = true,
    /** OCR settings; `null` turns OCR off. */
    val ocr: OcrConfig? = null,
    /** Run OCR even on PDFs that already contain searchable text. */
    val forceOcr: Boolean = false,
    /**
     * Pages on which OCR is forced, as 1-indexed page numbers (each must be >= 1).
     *
     * Listed pages are OCR'd no matter how good their text layer is; all other pages
     * use native text extraction. The list is ignored when [forceOcr] is `true`, applies
     * to PDF documents only, and is deduplicated automatically. Supplying [ocr] is
     * recommended to choose backend and language; defaults are used without it.
     */
    val forceOcrPages: List<Int>? = null,
    /**
     * Turn OCR off completely, images included.
     *
     * When `true`, no document type is OCR'd. Images yield metadata only (dimensions,
     * format, EXIF) and no text; PDFs rely on native text extraction with no OCR fallback.
     *
     * Must not be `true` together with [forceOcr].
     *
     * *Added in v4.7.0.*
     */
    val disableOcr: Boolean = false,
    /** Text chunking settings; `null` turns chunking off. */
    val chunking: ChunkingConfig? = null,
    /**
     * Content filtering settings; `null` keeps the extractor defaults.
     *
     * Decides whether document "furniture" — headers, footers, watermarks, repeating
     * text — is kept in or stripped from the results. `ContentFilterConfig` documents
     * the individual fields.
     */
    val contentFilter: ContentFilterConfig? = null,
    /** Image extraction settings; `null` means images are not extracted. */
    val images: ImageExtractionConfig? = null,
    /** Options specific to PDFs; `null` keeps the defaults. */
    val pdfOptions: PdfConfig? = null,
    /** Token reduction settings; `null` means no token reduction. */
    val tokenReduction: TokenReductionOptions? = null,
    /** Language detection settings; `null` means no language detection. */
    val languageDetection: LanguageDetectionConfig? = null,
    /** Page extraction settings; `null` means pages are not tracked. */
    val pages: PageConfig? = null,
    /** Keyword extraction settings; `null` means no keyword extraction. */
    val keywords: KeywordConfig? = null,
    /** Post-processor settings; `null` keeps the defaults. */
    val postprocessor: PostProcessorConfig? = null,
    /**
     * Options for converting HTML to Markdown; `null` keeps the defaults.
     *
     * Governs how HTML documents are turned into Markdown: heading styles, list
     * formatting, code block styles and preprocessing.
     *
     * NOTE(review): declared as a plain string here; the expected encoding of the
     * options is not visible in this file — confirm before relying on it.
     */
    val htmlOptions: String? = null,
    /**
     * Settings for styled HTML output.
     *
     * When set and [outputFormat] is `OutputFormat.Html`, the pipeline renders with
     * `StyledHtmlRenderer`, which puts stable `kb-*` CSS class hooks on every structural
     * element and can embed theme CSS or user-supplied CSS in a `<style>` block.
     *
     * When `null`, the existing plain comrak-based HTML renderer is used.
     */
    val htmlOutput: HtmlOutputConfig? = null,
    /**
     * Default timeout, in seconds, applied to each file of a batch extraction.
     *
     * When set, a file is canceled once it has run this long, unless
     * `FileExtractionConfig.timeout_secs` overrides the value. The timeout keeps
     * pathological files (e.g. deeply nested archives, documents with millions of cells)
     * from running indefinitely and exhausting the caller's resources. Use `null` to
     * disable it for trusted input or long-running workloads.
     *
     * NOTE(review): the upstream description gives 60 seconds as the default, yet the
     * default declared here is `null` — confirm which value the native side applies.
     */
    val extractionTimeoutSecs: Long? = null,
    /**
     * Upper bound on concurrent extractions in batch operations.
     *
     * Caps parallelism so that large batches do not exhaust resources. When `null`,
     * (num_cpus × 1.5).ceil() is used.
     */
    val maxConcurrentExtractions: Long? = null,
    /**
     * Shape of the returned result.
     *
     * Chooses between the unified format (the default), where all content is in the
     * `content` field, and the element-based format with semantic elements (for
     * Unstructured-compatible output).
     */
    val resultFormat: ResultFormat = ResultFormat.UNIFIED,
    /**
     * Security limits applied during archive extraction.
     *
     * Sets the maximum archive size, compression ratio, file count and further
     * thresholds that protect against decompression bombs. It also caps nesting depth,
     * iteration count, entity / token length, total content size and table cell count
     * on every extraction path that reads user-controlled bytes.
     * `null` selects the default limits.
     */
    val securityLimits: SecurityLimits? = null,
    /**
     * Largest uncompressed size, in bytes, a single embedded file may have for recursive
     * extraction to be attempted.
     *
     * Covers objects embedded in OOXML containers (DOCX, PPTX) and email attachments
     * handled through recursive extraction. A file above the limit is skipped with a
     * `ProcessingWarning` instead of entering the extraction pipeline, so one oversized
     * embedded object cannot consume unbounded memory or time.
     *
     * `null` removes the per-embedded-file cap, leaving `max_archive_size` of
     * [securityLimits] as the only guard.
     *
     * NOTE(review): the upstream description gives 50 MiB as the default, yet the
     * default declared here is `null` — confirm which value the native side applies.
     */
    val maxEmbeddedFileBytes: Long? = null,
    /**
     * Text format of the extracted content (default: Plain).
     *
     * Available formats:
     *
     * - `Plain`: raw extracted text (default)
     * - `Markdown`: Markdown formatted output
     * - `Djot`: Djot markup format (requires djot feature)
     * - `Html`: HTML formatted output
     *
     * With a structured format, results include formatted output, and the
     * `formatted_content` field may be filled in when a format conversion is applied.
     */
    val outputFormat: OutputFormat = OutputFormat.Plain,
    /**
     * Layout detection settings; `null` turns layout detection off.
     *
     * When set, PDF pages and images are analyzed for document structure (headings,
     * code, formulas, tables, figures, etc.) with RT-DETR models running on ONNX
     * Runtime. For PDFs, layout hints take precedence over paragraph classification in
     * the markdown pipeline. For images, OCR runs per region and the markdown formatting
     * follows the detected layout classes.
     * Inference requires the `layout-detection` feature; the field itself exists whenever
     * the `layout-types` feature is active (which covers `layout-detection` as well as
     * the no-ORT target groups).
     */
    val layout: LayoutDetectionConfig? = null,
    /**
     * Apply layout detection on the non-OCR PDF markdown path.
     *
     * When `true` and [layout] is set, layout regions guide heading, table, list and
     * figure detection in the structure pipeline, which otherwise depends on
     * font-clustering heuristics alone. This improves SF1 (structural F1) significantly
     * but adds inference latency (~150-300ms/page CPU, ~20-50ms/page GPU).
     * Default: `false`. Requires the `layout-detection` feature.
     */
    val useLayoutForMarkdown: Boolean = false,
    /**
     * Produce a structured document tree.
     *
     * When true, the `document` field of `ExtractionResult` receives a hierarchical
     * `DocumentStructure` with heading-driven section nesting, table grids, content
     * layer classification and inline annotations.
     *
     * Independent of [resultFormat] — usable with both Unified and ElementBased.
     */
    val includeDocumentStructure: Boolean = false,
    /**
     * Hardware acceleration settings for ONNX Runtime models.
     *
     * Selects the execution provider for layout detection and embedding models. `null`
     * uses the platform defaults (CoreML on macOS, CUDA on Linux, CPU on Windows).
     */
    val acceleration: AccelerationConfig? = null,
    /**
     * Cache namespace used to isolate tenants.
     *
     * When set, cache entries live under `{cache_dir}/{namespace}/`. Only alphanumeric
     * characters, hyphens and underscores are allowed (max 64 chars). Distinct
     * namespaces get isolated cache spaces on the same filesystem.
     */
    val cacheNamespace: String? = null,
    /**
     * Cache time-to-live for this request, in seconds.
     *
     * Takes precedence over the global `max_age_days` for this one extraction.
     * `0` bypasses the cache entirely (no read, no write); `null` applies the global TTL.
     */
    val cacheTtlSecs: Long? = null,
    /**
     * Email extraction settings; `null` keeps the defaults.
     *
     * At present this configures the fallback codepage for MSG files that do not specify
     * one. `EmailConfig` has the details.
     */
    val email: EmailConfig? = null,
    /**
     * Concurrency limits for constrained environments; `null` keeps the defaults.
     *
     * Governs the Rayon thread pool size, the ONNX Runtime intra-op threads and — when
     * [maxConcurrentExtractions] is unset — the batch concurrency semaphore.
     * `ConcurrencyConfig` has the details.
     *
     * NOTE(review): declared as a plain string here rather than as `ConcurrencyConfig`;
     * the expected encoding is not visible in this file — confirm before relying on it.
     */
    val concurrency: String? = null,
    /**
     * Maximum recursion depth for archive extraction.
     * `0` disables recursive extraction (legacy behavior).
     *
     * NOTE(review): the upstream description gives 3 as the default, yet the default
     * declared here is `0L`, i.e. recursion disabled — confirm which one is intended.
     */
    val maxArchiveDepth: Long = 0L,
    /**
     * Tree-sitter language pack settings; `null` turns tree-sitter off.
     *
     * When set, code files are extracted with tree-sitter parsers. The settings control
     * grammar downloads and code analysis options.
     */
    val treeSitter: TreeSitterConfig? = null,
    /**
     * Structured extraction through an LLM; `null` turns it off.
     *
     * When set, the extracted document content is sent to an LLM together with the
     * supplied JSON schema, and the structured answer is stored in
     * `ExtractionResult.structured_output`.
     */
    val structuredExtraction: StructuredExtractionConfig? = null,
    /**
     * Cancellation token for this extraction; `null` means no external cancellation.
     *
     * Upstream, a `CancellationToken` clone is passed here and `CancellationToken.cancel`
     * is called from another thread / task to abort a running extraction. The extractor
     * checks the token at safe checkpoints (before lock acquisition, between pages,
     * between batch items) and returns `KreuzbergError.Cancelled` once it is set.
     *
     * The field is left out of serialization because a `CancellationToken` is a runtime
     * handle, not a configuration value.
     *
     * NOTE(review): declared as a plain string here; how a token is represented in this
     * binding is not visible in this file — confirm before relying on it.
     */
    val cancelToken: String? = null,
)

View File

@@ -0,0 +1,53 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
* The complete diff between two `ExtractionResult` values.
*
* Throughout the field documentation `a` denotes the left-hand result and `b`
* the right-hand result of the comparison. The list-valued fields default to
* empty lists; [metadataChanged] and [embeddedChanges] have no default and must
* always be supplied.
*/
data class ExtractionDiff(
/**
* Unified-diff hunks for the `content` field.
*
* Empty when the content is identical.
*/
val contentDiff: List<DiffHunk> = emptyList(),
/** Tables present in `b` but not in `a` (by index position, excess right-side tables). */
val tablesAdded: List<Table> = emptyList(),
/** Tables present in `a` but not in `b` (by index position, excess left-side tables). */
val tablesRemoved: List<Table> = emptyList(),
/** Cell-level changes for table pairs that share the same index and dimensions. */
val tablesChanged: List<TableDiff> = emptyList(),
/**
* Metadata difference, encoded as a JSON object with three top-level keys:
* `added` (keys present in `b` but not `a`), `removed` (keys present in `a`
* but not `b`), and `changed` (keys whose values differ — each entry is
* `{ "from": <value-in-a>, "to": <value-in-b> }`).
*
* This is NOT RFC 6902 JSON Patch — a flatter shape was deliberately chosen
* to avoid pulling in a json-patch crate. If you need RFC 6902 semantics
* (with JSON Pointer paths), feed `a.metadata` and `b.metadata` to your
* preferred json-patch implementation directly.
*
* Typed as `Any` on the Kotlin side; NOTE(review): the concrete runtime type
* depends on how the JSON is bound (presumably a `Map` under Jackson) — confirm.
*/
val metadataChanged: Any,
/** Changes to embedded archive children. Required; there is no default value. */
val embeddedChanges: EmbeddedChanges,
)

View File

@@ -0,0 +1,51 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Identifies how the extracted text was obtained.
 *
 * Each constant has a lowercase wire name (`native`, `ocr`, `mixed`) used for
 * JSON serialization and deserialization.
 */
enum class ExtractionMethod {
    @com.fasterxml.jackson.annotation.JsonProperty("native")
    NATIVE,

    @com.fasterxml.jackson.annotation.JsonProperty("ocr")
    OCR,

    @com.fasterxml.jackson.annotation.JsonProperty("mixed")
    MIXED;

    /** Returns the wire name of this constant. */
    @com.fasterxml.jackson.annotation.JsonValue
    fun toWire(): String {
        return when (this) {
            NATIVE -> "native"
            OCR -> "ocr"
            MIXED -> "mixed"
        }
    }

    companion object {
        /**
         * Resolves a wire name back to its constant.
         *
         * @throws IllegalArgumentException if [value] is not a known wire name.
         */
        @com.fasterxml.jackson.annotation.JsonCreator
        @JvmStatic
        fun fromWire(value: String): ExtractionMethod {
            // toWire() is one-to-one, so a reverse lookup matches the explicit mapping.
            val match = ExtractionMethod.values().firstOrNull { candidate -> candidate.toWire() == value }
            return match ?: throw IllegalArgumentException("Unknown ExtractionMethod value: $value")
        }
    }
}

View File

@@ -0,0 +1,229 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
* General extraction result used by the core extraction API.
*
* This is the main result type returned by all extraction functions.
*
* Every constructor parameter has a default, so `ExtractionResult()` yields an
* empty result. Optional sections are `null` unless the documentation of the
* individual field says otherwise.
*/
data class ExtractionResult(
/** Extracted text content. Defaults to the empty string. */
val content: String = "",
/** MIME type string. Defaults to the empty string. NOTE(review): presumably the detected type of the source document — confirm. */
val mimeType: String = "",
/** Document metadata. Defaults to a `Metadata` instance built with its own defaults. */
val metadata: Metadata = Metadata(),
/**
* Extraction strategy used to produce the returned text.
*
* Populated when the extractor can reliably distinguish native text extraction,
* OCR-only extraction, or mixed native/OCR output.
*/
val extractionMethod: ExtractionMethod? = null,
/** Tables associated with this result. Defaults to an empty list (never `null`). */
val tables: List<Table> = emptyList(),
/** Detected languages, or `null` when none are reported. NOTE(review): the code format (e.g. ISO 639) is not visible here — confirm. */
val detectedLanguages: List<String>? = null,
/**
* Text chunks when chunking is enabled.
*
* When chunking configuration is provided, the content is split into
* overlapping chunks for efficient processing. Each chunk contains the text,
* optional embeddings (if enabled), and metadata about its position.
*/
val chunks: List<Chunk>? = null,
/**
* Extracted images from the document.
*
* When image extraction is enabled via `ImageExtractionConfig`, this field
* contains all images found in the document with their raw data and metadata.
* Each image may optionally contain a nested `ocr_result` if OCR was performed.
*/
val images: List<ExtractedImage>? = null,
/**
* Per-page content when page extraction is enabled.
*
* When page extraction is configured, the document is split into per-page content
* with tables and images mapped to their respective pages.
*/
val pages: List<PageContent>? = null,
/**
* Semantic elements when element-based result format is enabled.
*
* When result_format is set to ElementBased, this field contains semantic
* elements with type classification, unique identifiers, and metadata for
* Unstructured-compatible element-based processing.
*/
val elements: List<Element>? = null,
/**
* Rich Djot content structure (when extracting Djot documents).
*
* When extracting Djot documents with structured extraction enabled,
* this field contains the full semantic structure including:
*
* - Block-level elements with nesting
* - Inline formatting with attributes
* - Links, images, footnotes
* - Math expressions
* - Complete attribute information
*
* The `content` field still contains plain text for backward compatibility.
*
* Always `null` for non-Djot documents.
*/
val djotContent: DjotContent? = null,
/**
* OCR elements with full spatial and confidence metadata.
*
* When OCR is performed with element extraction enabled, this field contains
* the structured representation of detected text including:
*
* - Bounding geometry (rectangles or quadrilaterals)
* - Confidence scores (detection and recognition)
* - Rotation information
* - Hierarchical relationships (Tesseract only)
*
* This field preserves all metadata that would otherwise be lost when
* converting to plain text or markdown output formats.
*
* Only populated when `OcrElementConfig.include_elements` is true.
*/
val ocrElements: List<OcrElement>? = null,
/**
* Structured document tree (when document structure extraction is enabled).
*
* When `include_document_structure` is true in `ExtractionConfig`, this field
* contains the full hierarchical representation of the document including:
*
* - Heading-driven section nesting
* - Table grids with cell-level metadata
* - Content layer classification (body, header, footer, footnote)
* - Inline text annotations (formatting, links)
* - Bounding boxes and page numbers
*
* Independent of `result_format` — can be combined with Unified or ElementBased.
*/
val document: DocumentStructure? = null,
/**
* Extracted keywords when keyword extraction is enabled.
*
* When keyword extraction (RAKE or YAKE) is configured, this field contains
* the extracted keywords with scores, algorithm info, and position data.
* Previously stored in `metadata.additional["keywords"]`.
*/
val extractedKeywords: List<Keyword>? = null,
/**
* Document quality score from quality analysis.
*
* A value between 0.0 and 1.0 indicating the overall text quality.
* Previously stored in `metadata.additional["quality_score"]`.
*/
val qualityScore: Double? = null,
/**
* Non-fatal warnings collected during processing pipeline stages.
*
* Captures errors from optional pipeline features (embedding, chunking,
* language detection, output formatting) that don't prevent extraction
* but may indicate degraded results.
* Previously stored as individual keys in `metadata.additional`.
*
* Defaults to an empty list (never `null`).
*/
val processingWarnings: List<ProcessingWarning> = emptyList(),
/**
* PDF annotations extracted from the document.
*
* When annotation extraction is enabled via `PdfConfig.extract_annotations`,
* this field contains text notes, highlights, links, stamps, and other
* annotations found in PDF documents.
*/
val annotations: List<PdfAnnotation>? = null,
/**
* Nested extraction results from archive contents.
*
* When extracting archives, each processable file inside produces its own
* full extraction result. Set to `null` for non-archive formats.
* Use `max_archive_depth` in config to control recursion depth.
*/
val children: List<ArchiveEntry>? = null,
/**
* URIs/links discovered during document extraction.
*
* Contains hyperlinks, image references, citations, email addresses, and
* other URI-like references found in the document. Always extracted when
* present in the source document.
*/
val uris: List<ExtractedUri>? = null,
/**
* Tracked changes embedded in the source document.
*
* Populated by per-format extractors that understand change-tracking
* metadata (DOCX `w:ins`/`w:del`/`w:rPrChange`, ODT `text:change-*`,
* …). Every extractor defaults to `null` until its format-specific
* implementation is added. Extractors that do populate this field follow
* the "accepted-changes" convention: inserted text is present in
* `content`, deleted text is absent — the revision list is the separate
* audit trail.
*/
val revisions: List<DocumentRevision>? = null,
/**
* Structured extraction output from LLM-based JSON schema extraction.
*
* When `structured_extraction` is configured in `ExtractionConfig`, the
* extracted document content is sent to a VLM with the provided JSON schema.
* The response is parsed and stored here as a JSON value matching the schema.
*
* Typed as `Any?`; the concrete runtime type depends on how the JSON is bound.
*/
val structuredOutput: Any? = null,
/**
* Code intelligence results from tree-sitter analysis.
*
* Populated when extracting source code files with the `tree-sitter` feature.
* Contains metrics, structural analysis, imports/exports, comments,
* docstrings, symbols, diagnostics, and optionally chunked code segments.
*
* Stored as an opaque JSON value so that all language bindings (Go, Java,
* C#, …) can deserialize it as a raw JSON object rather than a typed struct.
* The underlying type is `tree_sitter_language_pack.ProcessResult`.
*/
val codeIntelligence: Any? = null,
/**
* LLM token usage and cost data for all LLM calls made during this extraction.
*
* Contains one entry per LLM call. Multiple entries are produced when
* VLM OCR, structured extraction, or LLM embeddings run during
* the same extraction.
*
* `null` when no LLM was used.
*/
val llmUsage: List<LlmUsage>? = null,
/**
* Pre-rendered content in the requested output format.
*
* Populated during `derive_extraction_result` before tree derivation consumes
* element data. `apply_output_format` swaps this into `content` at the end
* of the pipeline, after post-processors have operated on plain text.
*/
val formattedContent: String? = null,
/**
* Structured hOCR document for the OCR+layout pipeline.
*
* When tesseract produces hOCR output, the parsed `InternalDocument` carries
* paragraph structure with bounding boxes and confidence scores. The layout
* classification step enriches these elements before final rendering.
*
* NOTE(review): exposed here as a plain `String?`; the encoding of the
* `InternalDocument` inside that string is not visible in this file — confirm.
*/
val ocrInternalDocument: String? = null,
)

View File

@@ -0,0 +1,30 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
* FictionBook (FB2) metadata.
*
* All fields are optional at construction time: the lists default to empty and
* the annotation defaults to `null`.
*/
data class FictionBookMetadata(
/** Genre strings. Defaults to an empty list. */
val genres: List<String> = emptyList(),
/** Sequence strings. Defaults to an empty list. NOTE(review): presumably FB2 book-series entries — confirm. */
val sequences: List<String> = emptyList(),
/** Annotation text, or `null` when absent. */
val annotation: String? = null,
)

View File

@@ -0,0 +1,100 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
* Per-file extraction configuration overrides for batch processing.
*
* All fields are nullable and default to `null` — `null` means "use the
* batch-level default." This type is used with `batch_extract_files` and
* `batch_extract_bytes` to allow heterogeneous
* extraction settings within a single batch.
*
* # Excluded Fields
*
* The following `ExtractionConfig` fields are batch-level only and
* cannot be overridden per file:
*
* - `max_concurrent_extractions` — controls batch parallelism
* - `use_cache` — global caching policy
* - `acceleration` — shared ONNX execution provider
* - `security_limits` — global archive security policy
*/
data class FileExtractionConfig(
/** Override quality post-processing for this file. */
val enableQualityProcessing: Boolean? = null,
/** Override OCR configuration for this file (`null` = use batch default). */
val ocr: OcrConfig? = null,
/** Override force OCR for this file. */
val forceOcr: Boolean? = null,
/** Override force OCR pages for this file (1-indexed page numbers). */
val forceOcrPages: List<Int>? = null,
/** Override disable OCR for this file. */
val disableOcr: Boolean? = null,
/** Override chunking configuration for this file. */
val chunking: ChunkingConfig? = null,
/** Override content filtering configuration for this file. */
val contentFilter: ContentFilterConfig? = null,
/** Override image extraction configuration for this file. */
val images: ImageExtractionConfig? = null,
/** Override PDF options for this file. */
val pdfOptions: PdfConfig? = null,
/** Override token reduction for this file. */
val tokenReduction: TokenReductionOptions? = null,
/** Override language detection for this file. */
val languageDetection: LanguageDetectionConfig? = null,
/** Override page extraction for this file. */
val pages: PageConfig? = null,
/** Override keyword extraction for this file. */
val keywords: KeywordConfig? = null,
/** Override post-processor for this file. */
val postprocessor: PostProcessorConfig? = null,
/**
* Override HTML conversion options for this file.
*
* NOTE(review): typed as a plain `String?` rather than a config class; the
* expected string format is not visible in this file — confirm.
*/
val htmlOptions: String? = null,
/** Override result format for this file. */
val resultFormat: ResultFormat? = null,
/** Override output content format for this file. */
val outputFormat: OutputFormat? = null,
/** Override document structure output for this file. */
val includeDocumentStructure: Boolean? = null,
/** Override layout detection for this file. */
val layout: LayoutDetectionConfig? = null,
/**
* Override per-file extraction timeout in seconds.
*
* When set, the extraction for this file will be canceled after the
* specified duration. A timed-out file produces an error result without
* affecting other files in the batch.
*/
val timeoutSecs: Long? = null,
/** Override tree-sitter configuration for this file. */
val treeSitter: TreeSitterConfig? = null,
/**
* Override structured extraction configuration for this file.
*
* When set, enables LLM-based structured extraction with a JSON schema
* for this specific file. The extracted content is sent to a VLM/LLM
* and the response is parsed according to the provided schema.
*/
val structuredExtraction: StructuredExtractionConfig? = null,
)

View File

@@ -0,0 +1,31 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
* Footnote in Djot.
*
* Pairs a required label with the block-level content of the footnote.
*/
data class Footnote(
/** Footnote label. Required; there is no default value. */
val label: String,
/** Footnote content blocks. Defaults to an empty list. */
val content: List<FormattedBlock> = emptyList(),
)

View File

@@ -0,0 +1,227 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
* Format-specific metadata (discriminated union).
*
* Only one format type can exist per extraction result. This provides
* type-safe, clean metadata without nested optionals.
*
* JSON (de)serialization is delegated to `FormatMetadataDeserializer` and
* `FormatMetadataSerializer`, which use a `format_type` field inside the JSON
* object as the discriminator (e.g. `"pdf"`, `"docx"`, `"fiction_book"`).
* Every variant except [Code] wraps a typed metadata object in `metadata`.
*/
@com.fasterxml.jackson.databind.annotation.JsonDeserialize(using = FormatMetadataDeserializer::class)
@com.fasterxml.jackson.databind.annotation.JsonSerialize(using = FormatMetadataSerializer::class)
sealed class FormatMetadata {
/** Variant tagged `"pdf"`. */
data class Pdf(val metadata: PdfMetadata) : FormatMetadata()
/** Variant tagged `"docx"`. */
data class Docx(val metadata: DocxMetadata) : FormatMetadata()
/** Variant tagged `"excel"`. */
data class Excel(val metadata: ExcelMetadata) : FormatMetadata()
/** Variant tagged `"email"`. */
data class Email(val metadata: EmailMetadata) : FormatMetadata()
/** Variant tagged `"pptx"`. */
data class Pptx(val metadata: PptxMetadata) : FormatMetadata()
/** Variant tagged `"archive"`. */
data class Archive(val metadata: ArchiveMetadata) : FormatMetadata()
/** Variant tagged `"image"`. */
data class Image(val metadata: ImageMetadata) : FormatMetadata()
/** Variant tagged `"xml"`. */
data class Xml(val metadata: XmlMetadata) : FormatMetadata()
/** Variant tagged `"text"`. */
data class Text(val metadata: TextMetadata) : FormatMetadata()
/** Variant tagged `"html"`. */
data class Html(val metadata: HtmlMetadata) : FormatMetadata()
/** Variant tagged `"ocr"`. */
data class Ocr(val metadata: OcrMetadata) : FormatMetadata()
/** Variant tagged `"csv"`. */
data class Csv(val metadata: CsvMetadata) : FormatMetadata()
/** Variant tagged `"bibtex"`. */
data class Bibtex(val metadata: BibtexMetadata) : FormatMetadata()
/** Variant tagged `"citation"`. */
data class Citation(val metadata: CitationMetadata) : FormatMetadata()
/** Variant tagged `"fiction_book"`. */
data class FictionBook(val metadata: FictionBookMetadata) : FormatMetadata()
/** Variant tagged `"dbf"`. */
data class Dbf(val metadata: DbfMetadata) : FormatMetadata()
/** Variant tagged `"jats"`. */
data class Jats(val metadata: JatsMetadata) : FormatMetadata()
/** Variant tagged `"epub"`. */
data class Epub(val metadata: EpubMetadata) : FormatMetadata()
/** Variant tagged `"pst"`. */
data class Pst(val metadata: PstMetadata) : FormatMetadata()
/**
* Variant tagged `"code"`.
*
* Unlike the other variants this one carries a plain `String` in `value`
* instead of a typed metadata object. NOTE(review): the expected content of
* that string (presumably a JSON document) is not established here — confirm.
*/
data class Code(val value: String) : FormatMetadata()
}
/**
 * Jackson deserializer for the internally tagged [FormatMetadata] union.
 *
 * The incoming JSON object carries a `format_type` discriminator next to the
 * variant's own fields. The discriminator is read and removed, and the
 * remaining object is bound to the metadata type of the matching variant.
 */
private class FormatMetadataDeserializer : com.fasterxml.jackson.databind.deser.std.StdDeserializer<FormatMetadata>(FormatMetadata::class.java) {
    /**
     * Reads one [FormatMetadata] value from [parser].
     *
     * @throws com.fasterxml.jackson.databind.exc.MismatchedInputException if the
     *   current JSON value is not an object.
     * @throws com.fasterxml.jackson.databind.exc.InvalidFormatException if the
     *   `format_type` tag is missing or not one of the known variant names.
     */
    @Suppress("LongMethod")
    override fun deserialize(
        parser: com.fasterxml.jackson.core.JsonParser,
        ctx: com.fasterxml.jackson.databind.DeserializationContext,
    ): FormatMetadata {
        // Read as a generic node first so that a non-object input surfaces as a
        // Jackson mapping error instead of a raw ClassCastException.
        val tree = parser.codec.readTree<com.fasterxml.jackson.databind.JsonNode>(parser)
        val payload = tree as? com.fasterxml.jackson.databind.node.ObjectNode
            ?: throw com.fasterxml.jackson.databind.exc.MismatchedInputException.from(
                parser, FormatMetadata::class.java, "Expected a JSON object for FormatMetadata",
            )
        val tag = payload.get("format_type")?.asText()
        // The tree was built from the parser just above and is not shared, so the
        // discriminator can be dropped in place without a defensive copy.
        payload.remove("format_type")
        return when (tag) {
            "pdf" -> FormatMetadata.Pdf(ctx.readTreeAsValue<PdfMetadata>(payload, PdfMetadata::class.java))
            "docx" -> FormatMetadata.Docx(ctx.readTreeAsValue<DocxMetadata>(payload, DocxMetadata::class.java))
            "excel" -> FormatMetadata.Excel(ctx.readTreeAsValue<ExcelMetadata>(payload, ExcelMetadata::class.java))
            "email" -> FormatMetadata.Email(ctx.readTreeAsValue<EmailMetadata>(payload, EmailMetadata::class.java))
            "pptx" -> FormatMetadata.Pptx(ctx.readTreeAsValue<PptxMetadata>(payload, PptxMetadata::class.java))
            "archive" -> FormatMetadata.Archive(ctx.readTreeAsValue<ArchiveMetadata>(payload, ArchiveMetadata::class.java))
            "image" -> FormatMetadata.Image(ctx.readTreeAsValue<ImageMetadata>(payload, ImageMetadata::class.java))
            "xml" -> FormatMetadata.Xml(ctx.readTreeAsValue<XmlMetadata>(payload, XmlMetadata::class.java))
            "text" -> FormatMetadata.Text(ctx.readTreeAsValue<TextMetadata>(payload, TextMetadata::class.java))
            "html" -> FormatMetadata.Html(ctx.readTreeAsValue<HtmlMetadata>(payload, HtmlMetadata::class.java))
            "ocr" -> FormatMetadata.Ocr(ctx.readTreeAsValue<OcrMetadata>(payload, OcrMetadata::class.java))
            "csv" -> FormatMetadata.Csv(ctx.readTreeAsValue<CsvMetadata>(payload, CsvMetadata::class.java))
            "bibtex" -> FormatMetadata.Bibtex(ctx.readTreeAsValue<BibtexMetadata>(payload, BibtexMetadata::class.java))
            "citation" -> FormatMetadata.Citation(ctx.readTreeAsValue<CitationMetadata>(payload, CitationMetadata::class.java))
            "fiction_book" -> FormatMetadata.FictionBook(ctx.readTreeAsValue<FictionBookMetadata>(payload, FictionBookMetadata::class.java))
            "dbf" -> FormatMetadata.Dbf(ctx.readTreeAsValue<DbfMetadata>(payload, DbfMetadata::class.java))
            "jats" -> FormatMetadata.Jats(ctx.readTreeAsValue<JatsMetadata>(payload, JatsMetadata::class.java))
            "epub" -> FormatMetadata.Epub(ctx.readTreeAsValue<EpubMetadata>(payload, EpubMetadata::class.java))
            "pst" -> FormatMetadata.Pst(ctx.readTreeAsValue<PstMetadata>(payload, PstMetadata::class.java))
            // The payload is always a JSON object here, and Jackson cannot bind an
            // object to String, so the previous readTreeAsValue(payload, String)
            // call failed for every input. Keep the remaining fields as JSON text.
            // NOTE(review): assumes Code.value is meant to hold the raw JSON of the
            // code metadata object — confirm against the generator's intent.
            "code" -> FormatMetadata.Code(payload.toString())
            else -> throw com.fasterxml.jackson.databind.exc.InvalidFormatException(
                parser, "Unknown FormatMetadata tag", tag, FormatMetadata::class.java,
            )
        }
    }
}
/**
 * Jackson serializer for the internally tagged [FormatMetadata] union.
 *
 * Each variant is written as the JSON object of its payload with an extra
 * `format_type` field naming the variant.
 */
private class FormatMetadataSerializer : com.fasterxml.jackson.databind.ser.std.StdSerializer<FormatMetadata>(FormatMetadata::class.java) {
    /**
     * Writes [value] to [gen] as a single JSON object.
     *
     * Uses the generator's own `ObjectMapper` when one is attached; otherwise
     * falls back to a fresh mapper with auto-discovered modules.
     *
     * @throws com.fasterxml.jackson.databind.JsonMappingException if a
     *   [FormatMetadata.Code] value does not contain a JSON object.
     */
    override fun serialize(
        value: FormatMetadata,
        gen: com.fasterxml.jackson.core.JsonGenerator,
        provider: com.fasterxml.jackson.databind.SerializerProvider,
    ) {
        val mapper = (gen.codec as? com.fasterxml.jackson.databind.ObjectMapper)
            ?: com.fasterxml.jackson.databind.ObjectMapper().findAndRegisterModules()
        // Exhaustive over the sealed hierarchy: no `else`, so a newly added
        // variant becomes a compile error here.
        val node: com.fasterxml.jackson.databind.node.ObjectNode = when (value) {
            is FormatMetadata.Pdf -> tagged(mapper, value.metadata, "pdf")
            is FormatMetadata.Docx -> tagged(mapper, value.metadata, "docx")
            is FormatMetadata.Excel -> tagged(mapper, value.metadata, "excel")
            is FormatMetadata.Email -> tagged(mapper, value.metadata, "email")
            is FormatMetadata.Pptx -> tagged(mapper, value.metadata, "pptx")
            is FormatMetadata.Archive -> tagged(mapper, value.metadata, "archive")
            is FormatMetadata.Image -> tagged(mapper, value.metadata, "image")
            is FormatMetadata.Xml -> tagged(mapper, value.metadata, "xml")
            is FormatMetadata.Text -> tagged(mapper, value.metadata, "text")
            is FormatMetadata.Html -> tagged(mapper, value.metadata, "html")
            is FormatMetadata.Ocr -> tagged(mapper, value.metadata, "ocr")
            is FormatMetadata.Csv -> tagged(mapper, value.metadata, "csv")
            is FormatMetadata.Bibtex -> tagged(mapper, value.metadata, "bibtex")
            is FormatMetadata.Citation -> tagged(mapper, value.metadata, "citation")
            is FormatMetadata.FictionBook -> tagged(mapper, value.metadata, "fiction_book")
            is FormatMetadata.Dbf -> tagged(mapper, value.metadata, "dbf")
            is FormatMetadata.Jats -> tagged(mapper, value.metadata, "jats")
            is FormatMetadata.Epub -> tagged(mapper, value.metadata, "epub")
            is FormatMetadata.Pst -> tagged(mapper, value.metadata, "pst")
            is FormatMetadata.Code -> codeNode(mapper, value.value, gen)
        }
        mapper.writeTree(gen, node)
    }

    /** Converts [payload] to a JSON object and stamps it with the `format_type` discriminator [tag]. */
    private fun tagged(
        mapper: com.fasterxml.jackson.databind.ObjectMapper,
        payload: Any,
        tag: String,
    ): com.fasterxml.jackson.databind.node.ObjectNode {
        val fields: com.fasterxml.jackson.databind.node.ObjectNode = mapper.valueToTree(payload)
        fields.put("format_type", tag)
        return fields
    }

    /**
     * Builds the tagged node for [FormatMetadata.Code].
     *
     * `valueToTree` on a `String` yields a text node, so the previous cast to
     * `ObjectNode` threw `ClassCastException` for every value. The string is
     * parsed as JSON instead and must contain an object so that the
     * discriminator can be added alongside its fields.
     * NOTE(review): assumes `Code.value` holds the raw JSON of the code metadata
     * object — confirm against the generator's intent.
     */
    private fun codeNode(
        mapper: com.fasterxml.jackson.databind.ObjectMapper,
        json: String,
        gen: com.fasterxml.jackson.core.JsonGenerator,
    ): com.fasterxml.jackson.databind.node.ObjectNode {
        val fields = mapper.readTree(json) as? com.fasterxml.jackson.databind.node.ObjectNode
            ?: throw com.fasterxml.jackson.databind.JsonMappingException.from(
                gen, "FormatMetadata.Code value must contain a JSON object",
            )
        fields.put("format_type", "code")
        return fields
    }
}

View File

@@ -0,0 +1,45 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
* Block-level element in a Djot document.
*
* Represents structural elements like headings, paragraphs, lists, code blocks, etc.
*
* Only [blockType] is required. The list-valued fields default to empty lists
* and the remaining fields default to `null`. The type is recursive: container
* blocks hold further `FormattedBlock` values in [children].
*/
data class FormattedBlock(
/** Type of block element. Required; there is no default value. */
val blockType: BlockType,
/** Heading level (1-6) for headings, or nesting level for lists */
val level: Long? = null,
/** Inline content within the block. Defaults to an empty list. */
val inlineContent: List<InlineElement> = emptyList(),
/**
* Element attributes (classes, IDs, key-value pairs).
*
* NOTE(review): exposed as a single `String?`; how the attributes are encoded
* inside that string is not visible in this file — confirm.
*/
val attributes: String? = null,
/** Language identifier for code blocks */
val language: String? = null,
/** Raw code content for code blocks */
val code: String? = null,
/** Nested blocks for containers (blockquotes, list items, divs). Defaults to an empty list. */
val children: List<FormattedBlock> = emptyList(),
)

View File

@@ -0,0 +1,54 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Fraction type with four constants.
 *
 * Each constant has a PascalCase wire name (`Bar`, `NoBar`, `Linear`,
 * `Skewed`) used for JSON serialization and deserialization.
 */
enum class FracType {
    @com.fasterxml.jackson.annotation.JsonProperty("Bar")
    BAR,

    @com.fasterxml.jackson.annotation.JsonProperty("NoBar")
    NO_BAR,

    @com.fasterxml.jackson.annotation.JsonProperty("Linear")
    LINEAR,

    @com.fasterxml.jackson.annotation.JsonProperty("Skewed")
    SKEWED;

    /** Returns the wire name of this constant. */
    @com.fasterxml.jackson.annotation.JsonValue
    fun toWire(): String {
        return when (this) {
            BAR -> "Bar"
            NO_BAR -> "NoBar"
            LINEAR -> "Linear"
            SKEWED -> "Skewed"
        }
    }

    companion object {
        /**
         * Resolves a wire name back to its constant.
         *
         * @throws IllegalArgumentException if [value] is not a known wire name.
         */
        @com.fasterxml.jackson.annotation.JsonCreator
        @JvmStatic
        fun fromWire(value: String): FracType {
            // toWire() is one-to-one, so a reverse lookup matches the explicit mapping.
            val match = FracType.values().firstOrNull { candidate -> candidate.toWire() == value }
            return match ?: throw IllegalArgumentException("Unknown FracType value: $value")
        }
    }
}

View File

@@ -0,0 +1,41 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * A single grid cell together with its position and span metadata.
 *
 * @property content Text content of the cell.
 * @property row Row position, counted from zero.
 * @property col Column position, counted from zero.
 * @property rowSpan How many rows the cell spans.
 * @property colSpan How many columns the cell spans.
 * @property isHeader `true` when the cell is a header cell.
 * @property bbox Bounding box of the cell, or `null` when none is available.
 */
data class GridCell(
    val content: String,
    val row: Int,
    val col: Int,
    val rowSpan: Int,
    val colSpan: Int,
    val isHeader: Boolean,
    val bbox: BoundingBox? = null,
)

View File

@@ -0,0 +1,37 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Metadata describing one header/heading element.
 *
 * @property level Header level, from 1 (h1) through 6 (h6).
 * @property text Normalized text content of the header.
 * @property id HTML `id` attribute, or `null` when the element has none.
 * @property depth Depth of the header element in the document tree.
 * @property htmlOffset Byte offset of the header in the original HTML document.
 */
data class HeaderMetadata(
    val level: Byte,
    val text: String,
    val id: String? = null,
    val depth: Int,
    val htmlOffset: Int,
)

View File

@@ -0,0 +1,36 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Heading context of a chunk inside a Markdown document.
 *
 * @property headings Heading hierarchy leading from the document root to the
 * chunk's section. Index 0 is the outermost heading (h1); the last element is
 * the most specific one. Defaults to an empty list.
 */
data class HeadingContext(
    val headings: List<HeadingLevel> = emptyList(),
)

View File

@@ -0,0 +1,31 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * One heading within a heading hierarchy.
 *
 * @property level Depth of the heading (1 = h1, 2 = h2, and so on).
 * @property text Text content of the heading.
 */
data class HeadingLevel(
    val level: Byte,
    val text: String,
)

View File

@@ -0,0 +1,56 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * A block of text that has been assigned a hierarchy level.
 *
 * The heading information is derived from font size clustering and
 * hierarchical analysis.
 *
 * @property text Text content of the block.
 * @property fontSize Font size of the text in the block.
 * @property level Hierarchy level of the block, mirroring HTML heading tags:
 * `"h1"` (top-level) through `"h6"` (senary), or `"body"` for body text that
 * carries no heading level.
 * @property bbox Bounding box of the block as (left, top, right, bottom) in
 * PDF units, or `null` when absent.
 */
data class HierarchicalBlock(
    val text: String,
    val fontSize: Float,
    val level: String,
    val bbox: List<Float>? = null,
)

View File

@@ -0,0 +1,52 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Hierarchy extraction configuration for PDF text structure analysis.
 *
 * Enables extraction of document hierarchy levels (H1-H6) based on font size
 * clustering and semantic analysis. When enabled, hierarchical blocks are
 * included in page content.
 */
data class HierarchyConfig(
    /** Enable hierarchy extraction. Default: `true`. */
    val enabled: Boolean = true,
    /**
     * Number of font size clusters to use for hierarchy levels (1-7).
     *
     * Default: 6, which provides H1-H6 heading levels with body text.
     * Larger values create more fine-grained hierarchy levels.
     */
    val kClusters: Long = 6L,
    /** Include bounding box information in hierarchy blocks. Default: `true`. */
    val includeBbox: Boolean = true,
    /**
     * OCR coverage threshold for smart OCR triggering (0.0-1.0).
     *
     * Determines when OCR should be triggered based on text block coverage:
     * OCR is triggered when text blocks cover less than this fraction of the
     * page.
     *
     * The default here is `null` (threshold not set). The upstream notes
     * mention 0.5 (trigger OCR if less than 50% of the page has text) as the
     * default; presumably that value is applied by the consumer when this is
     * `null` - TODO confirm.
     */
    val ocrCoverageThreshold: Float? = null,
)

View File

@@ -0,0 +1,71 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Metadata extracted from an HTML document.
 *
 * Covers document-level metadata, Open Graph data, Twitter Card metadata and
 * the extracted structural elements (headers, links, images, structured data).
 * Every field has a default: `null` for the optional scalars, empty
 * collections otherwise.
 *
 * @property title Document title taken from the `<title>` tag.
 * @property description Description from the `<meta name="description">` tag.
 * @property keywords Keywords from the `<meta name="keywords">` tag, split on commas.
 * @property author Author from the `<meta name="author">` tag.
 * @property canonicalUrl Canonical URL from the `<link rel="canonical">` tag.
 * @property baseHref Base URL from the `<base href="">` tag, used to resolve relative URLs.
 * @property language Document language from the `lang` attribute.
 * @property textDirection Document text direction from the `dir` attribute.
 * @property openGraph Open Graph metadata (og: properties) for social media,
 * keyed by names such as "title", "description", "image", "url".
 * @property twitterCard Twitter Card metadata (twitter: properties), keyed by
 * names such as "card", "site", "creator", "title", "description", "image".
 * @property metaTags Remaining meta tags not covered by dedicated fields; keys
 * are the meta name/property attributes and values are their content.
 * @property headers Extracted header elements with hierarchy.
 * @property links Extracted hyperlinks with type classification.
 * @property images Extracted images with source and dimensions.
 * @property structuredData Extracted structured data blocks.
 */
data class HtmlMetadata(
    val title: String? = null,
    val description: String? = null,
    val keywords: List<String> = emptyList(),
    val author: String? = null,
    val canonicalUrl: String? = null,
    val baseHref: String? = null,
    val language: String? = null,
    val textDirection: TextDirection? = null,
    val openGraph: Map<String, String> = emptyMap(),
    val twitterCard: Map<String, String> = emptyMap(),
    val metaTags: Map<String, String> = emptyMap(),
    val headers: List<HeaderMetadata> = emptyList(),
    val links: List<LinkMetadata> = emptyList(),
    val images: List<ImageMetadataType> = emptyList(),
    val structuredData: List<StructuredData> = emptyList(),
)

View File

@@ -0,0 +1,63 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
import java.nio.file.Path
/**
 * Configuration for styled HTML output.
 *
 * When set on `ExtractionConfig.html_output` alongside
 * `output_format = OutputFormat.Html`, the pipeline builds a
 * `StyledHtmlRenderer` instead of the plain comrak-based renderer.
 */
data class HtmlOutputConfig(
    /**
     * Inline CSS string injected into the output after the theme stylesheet.
     * Concatenated after `css_file` content when both are set.
     */
    val css: String? = null,
    /**
     * Path to a CSS file loaded once at renderer construction time.
     * Concatenated before `css` when both are set.
     */
    val cssFile: java.nio.file.Path? = null,
    /** Built-in colour/typography theme. Default: [HtmlTheme.UNSTYLED]. */
    val theme: HtmlTheme = HtmlTheme.UNSTYLED,
    /**
     * CSS class prefix applied to every emitted class name.
     *
     * Default: `"kb-"`. Change this if your host application already uses
     * classes that start with `kb-`.
     */
    // The default must match the documented "kb-" prefix; an empty default
    // would emit unprefixed class names, contrary to the contract above.
    val classPrefix: String = "kb-",
    /**
     * When `true` (default), write the resolved CSS into a `<style>` block
     * immediately after the opening `<div class="{prefix}doc">`.
     *
     * Set to `false` to emit only the structural markup and wire up your
     * own stylesheet targeting the `kb-*` class names.
     */
    val embedCss: Boolean = true,
)

View File

@@ -0,0 +1,71 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Selection of built-in HTML themes.
 *
 * Each constant maps to a lowercase wire string (`"default"`, `"github"`,
 * `"dark"`, `"light"`, `"unstyled"`).
 */
enum class HtmlTheme {
    /**
     * Sensible defaults: system font stack, neutral colours and a readable
     * line measure. All CSS custom properties (`--kb-*`) are defined, so user
     * CSS can override individual values.
     */
    @com.fasterxml.jackson.annotation.JsonProperty("default")
    DEFAULT,

    /** Palette and spacing inspired by GitHub Markdown. */
    @com.fasterxml.jackson.annotation.JsonProperty("github")
    GIT_HUB,

    /** Light text on a dark background. */
    @com.fasterxml.jackson.annotation.JsonProperty("dark")
    DARK,

    /** Minimal light theme with generous whitespace. */
    @com.fasterxml.jackson.annotation.JsonProperty("light")
    LIGHT,

    /**
     * Emits no built-in stylesheet. CSS custom properties are still defined
     * on `:root`, so user stylesheets can reference `var(--kb-*)` tokens.
     */
    @com.fasterxml.jackson.annotation.JsonProperty("unstyled")
    UNSTYLED;

    /** Returns the wire string that represents this theme. */
    @com.fasterxml.jackson.annotation.JsonValue
    fun toWire(): String =
        when (this) {
            DEFAULT -> "default"
            GIT_HUB -> "github"
            DARK -> "dark"
            LIGHT -> "light"
            UNSTYLED -> "unstyled"
        }

    companion object {
        /**
         * Resolves a wire string back to its theme.
         *
         * @param value wire string to look up (case-sensitive).
         * @throws IllegalArgumentException when [value] matches no theme.
         */
        @com.fasterxml.jackson.annotation.JsonCreator
        @JvmStatic
        fun fromWire(value: String): HtmlTheme =
            values().firstOrNull { candidate -> candidate.toWire() == value }
                ?: throw IllegalArgumentException("Unknown HtmlTheme value: $value")
    }
}

View File

@@ -0,0 +1,143 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
import java.nio.file.Path
/**
 * Plugin contract for document extractors.
 *
 * Implement this interface to add support for new document formats or to
 * override built-in extraction behavior with custom logic.
 *
 * # Return Type
 *
 * The extraction methods declared here return [ExtractionResult]. (The
 * upstream notes describe a flat intermediate `InternalDocument` that the
 * pipeline converts into the public result; that conversion is not visible
 * in this interface.)
 *
 * # Priority System
 *
 * When multiple extractors support the same MIME type, the registry selects
 * the extractor with the highest [priority] value. Use this to:
 *
 * - Override built-in extractors (priority > 50)
 * - Provide fallback extractors (priority < 50)
 * - Implement specialized extractors for specific use cases
 *
 * Default priority is 50.
 *
 * # Thread Safety
 *
 * Extractors must be thread-safe to support concurrent extraction.
 */
interface IDocumentExtractor {
    /** Plugin name. */
    fun name(): String

    /** Plugin version. */
    fun version(): String

    /** Lifecycle hook; does nothing unless overridden. */
    fun initialize() {}

    /** Lifecycle hook; does nothing unless overridden. */
    fun shutdown() {}

    /**
     * Extract content from a byte array.
     *
     * This is the core extraction method that processes in-memory document data.
     *
     * **Errors** (as documented for the underlying plugin contract):
     *
     * - `KreuzbergError.Parsing` - Document parsing failed
     * - `KreuzbergError.Validation` - Invalid document structure
     * - `KreuzbergError.Io` - I/O errors (these always bubble up)
     * - `KreuzbergError.MissingDependency` - Required dependency not available
     *
     * @param content raw bytes of the document.
     * @param mimeType MIME type of [content].
     * @param config extraction configuration to apply.
     * @return the extraction result for the document.
     */
    suspend fun extractBytes(
        content: ByteArray,
        mimeType: String,
        config: ExtractionConfig,
    ): ExtractionResult

    /**
     * Extract content from a file.
     *
     * The default implementation reads the whole file into memory and
     * delegates to [extractBytes]. The read is a blocking call; override for
     * custom file handling, streaming, or memory optimizations.
     *
     * **Errors:** same as [extractBytes], plus file I/O errors.
     *
     * @param path location of the file to extract.
     * @param mimeType MIME type of the file.
     * @param config extraction configuration to apply.
     * @return the extraction result for the file.
     */
    suspend fun extractFile(
        path: java.nio.file.Path,
        mimeType: String,
        config: ExtractionConfig,
    ): ExtractionResult = extractBytes(java.nio.file.Files.readAllBytes(path), mimeType, config)

    /**
     * Get the list of MIME types supported by this extractor.
     *
     * Can include exact MIME types and prefix patterns:
     *
     * - Exact: `"application/pdf"`, `"text/plain"`
     * - Prefix: `"image/"` followed by a `*` wildcard (matches any image type)
     *
     * @return the supported MIME type strings.
     */
    // NOTE: the wildcard pattern is deliberately not spelled out verbatim in
    // the KDoc above. Kotlin block comments nest, so a slash immediately
    // followed by an asterisk inside a comment opens a nested comment and
    // hides the declarations that follow.
    fun supportedMimeTypes(): List<String>

    /**
     * Get the priority of this extractor.
     *
     * Higher priority extractors are preferred when multiple extractors
     * support the same MIME type.
     *
     * # Priority Guidelines
     *
     * - **0-25**: Fallback/low-quality extractors
     * - **26-49**: Alternative extractors
     * - **50**: Default priority (built-in extractors)
     * - **51-75**: Premium/enhanced extractors
     * - **76-100**: Specialized/high-priority extractors
     *
     * @return priority value (default: 50).
     */
    fun priority(): Int = 50

    /**
     * Optional: check if this extractor can handle a specific file.
     *
     * Allows for more sophisticated detection beyond MIME types.
     * Defaults to `true` (rely on MIME type matching).
     *
     * @param path location of the candidate file.
     * @param mimeType MIME type of the candidate file.
     * @return `true` if the extractor can handle this file, `false` otherwise.
     */
    fun canHandle(path: java.nio.file.Path, mimeType: String): Boolean = true
}

View File

@@ -0,0 +1,95 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Plugin contract for in-process embedding backends.
 *
 * [embed] is asynchronous, matching the convention used by `OcrBackend`,
 * `DocumentExtractor`, and `PostProcessor`. Host-language bridges (PyO3,
 * napi-rs, Rustler, extendr, magnus, ext-php-rs, C FFI, etc.) wrap their
 * synchronous host callables in `spawn_blocking` or an equivalent to satisfy
 * the async signature.
 *
 * # Thread safety
 *
 * Backends must be `Send + Sync + 'static`. They are held in an
 * `Arc<dyn EmbeddingBackend>` and invoked concurrently from kreuzberg's
 * chunking pipeline. A backend whose underlying model is not thread-safe has
 * to serialize access itself (e.g. via `Mutex<Inner>`).
 *
 * # Contract
 *
 * - `embed(texts)` MUST return exactly `texts.len()` vectors, each of length
 *   `self.dimensions()`. The dispatcher in `embed_texts` checks this before
 *   handing results to downstream consumers; a non-conforming backend
 *   surfaces as a `KreuzbergError.Validation`, not a panic.
 *
 * - `embed` may be called from any thread. Its future must be `Send`
 *   (enforced by `async_trait` when `#[async_trait]` is used on non-WASM
 *   targets).
 *
 * - `dimensions()` is called exactly once at registration, right after
 *   `initialize()` succeeds. The registry caches the value and uses it for
 *   all later shape validation. Lazy-loading implementations can postpone
 *   model loading to `initialize()` and report the real dimension afterwards.
 *   Later changes to the reported dimension are not observed by kreuzberg;
 *   implementations that need a different dimension must unregister and
 *   re-register.
 *
 * - `shutdown()` (inherited from `Plugin`) may run concurrently with an
 *   in-flight `embed()` call. Implementations must tolerate this, e.g. by
 *   letting in-flight calls finish with resources held through the
 *   `Arc<dyn EmbeddingBackend>` reference and releasing only shared state
 *   that `embed` does not need.
 *
 * # Runtime
 *
 * The synchronous `embed_texts` entry uses `tokio.task.block_in_place` to
 * await the async `embed`, which requires a multi-thread tokio runtime.
 * Callers inside a `current_thread` runtime (e.g. `#[tokio.test]` without
 * `flavor = "multi_thread"`, or
 * `tokio.runtime.Builder.new_current_thread()`) must use
 * `embed_texts_async` instead, which awaits directly without
 * `block_in_place`.
 */
interface IEmbeddingBackend {
    /** Plugin name. */
    fun name(): String

    /** Plugin version. */
    fun version(): String

    /** Lifecycle hook; a no-op unless overridden. */
    fun initialize() {}

    /** Lifecycle hook; a no-op unless overridden. */
    fun shutdown() {}

    /**
     * Dimension of the embedding vectors. Must be `> 0` and equal to the
     * length of every vector produced by [embed].
     */
    fun dimensions(): Long

    /**
     * Embeds a batch of texts, producing one vector per input, in input order.
     *
     * **Errors:** implementations should report backend-specific failures as
     * `Plugin` errors. The dispatcher adds its own validation (result length,
     * per-vector dimension) on top.
     *
     * @param texts inputs to embed.
     * @return one embedding vector for each element of [texts].
     */
    suspend fun embed(texts: List<String>): List<List<Float>>
}

View File

@@ -0,0 +1,116 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
import java.nio.file.Path
/**
 * Plugin contract for OCR backends.
 *
 * Implement this interface to add custom OCR capabilities. OCR backends can be:
 *
 * - Native Rust implementations (like Tesseract)
 * - FFI bridges to Python libraries (like EasyOCR, PaddleOCR)
 * - Cloud-based OCR services (Google Vision, AWS Textract, etc.)
 *
 * # Thread Safety
 *
 * OCR backends must be thread-safe to support concurrent processing.
 */
interface IOcrBackend {
    /** Plugin name. */
    fun name(): String

    /** Plugin version. */
    fun version(): String

    /** Lifecycle hook; does nothing unless overridden. */
    fun initialize() {}

    /** Lifecycle hook; does nothing unless overridden. */
    fun shutdown() {}

    /**
     * Process an image and extract text via OCR.
     *
     * **Errors** (as documented for the underlying plugin contract):
     *
     * - `KreuzbergError.Ocr` - OCR processing failed
     * - `KreuzbergError.Validation` - Invalid image format or configuration
     * - `KreuzbergError.Io` - I/O errors (these always bubble up)
     *
     * # Reading `backend_options`
     *
     * Backends that support runtime tuning can read `config.backend_options` and
     * deserialize only the keys they care about. Unknown keys are silently ignored,
     * so multiple backends can coexist in a pipeline without key conflicts.
     *
     * @param imageBytes raw bytes of the image.
     * @param config OCR configuration to apply.
     * @return an [ExtractionResult] containing the extracted text and metadata.
     */
    suspend fun processImage(imageBytes: ByteArray, config: OcrConfig): ExtractionResult

    /**
     * Process a file and extract text via OCR.
     *
     * The default implementation reads the whole file into memory and
     * delegates to [processImage]. The read is a blocking call; override for
     * custom file handling or optimizations.
     *
     * **Errors:** same as [processImage], plus file I/O errors.
     *
     * @param path location of the image file.
     * @param config OCR configuration to apply.
     * @return the OCR result for the file.
     */
    suspend fun processImageFile(path: java.nio.file.Path, config: OcrConfig): ExtractionResult =
        processImage(java.nio.file.Files.readAllBytes(path), config)

    /**
     * Check if this backend supports a given language code.
     *
     * @param lang language code to check.
     * @return `true` if the language is supported, `false` otherwise.
     */
    fun supportsLanguage(lang: String): Boolean

    /**
     * Get the backend type identifier.
     *
     * @return the backend type enum value.
     */
    fun backendType(): OcrBackendType

    /**
     * Optional: get a list of all supported languages.
     *
     * Defaults to an empty list. Override to provide comprehensive language
     * support info.
     */
    fun supportedLanguages(): List<String> = emptyList()

    /**
     * Optional: check if the backend supports table detection.
     *
     * Defaults to `false`. Override if your backend can detect and extract tables.
     */
    fun supportsTableDetection(): Boolean = false

    /**
     * Check if the backend supports direct document-level processing (e.g. for PDFs).
     *
     * Defaults to `false`. Override if the backend has optimized document processing.
     */
    fun supportsDocumentProcessing(): Boolean = false

    /**
     * Process a document file directly via OCR.
     *
     * Only called if [supportsDocumentProcessing] returns `true`.
     *
     * @param path location of the document file.
     * @param config OCR configuration to apply.
     * @return the OCR result for the document.
     */
    suspend fun processDocument(path: java.nio.file.Path, config: OcrConfig): ExtractionResult
}

View File

@@ -0,0 +1,144 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Plugin contract for post-processors.
 *
 * Post-processors transform or enrich extraction results after the initial
 * extraction is complete. They can:
 *
 * - Clean and normalize text
 * - Add metadata (language, keywords, entities)
 * - Split content into chunks
 * - Score quality
 * - Apply custom transformations
 *
 * # Processing Order
 *
 * Post-processors are executed in stage order:
 *
 * 1. **Early** - Language detection, entity extraction
 * 2. **Middle** - Keyword extraction, token reduction
 * 3. **Late** - Custom hooks, final validation
 *
 * The class-level upstream notes say processors within a stage run in
 * registration order, while [priority] says higher values run first; how the
 * two combine is not visible here - TODO confirm.
 *
 * # Error Handling
 *
 * Post-processor errors are non-fatal by default - they're captured in metadata
 * and execution continues. To make errors fatal, return an error from `process()`.
 *
 * # Thread Safety
 *
 * Post-processors must be thread-safe.
 */
interface IPostProcessor {
    /** Plugin name. */
    fun name(): String

    /** Plugin version. */
    fun version(): String

    /** Lifecycle hook; does nothing unless overridden. */
    fun initialize() {}

    /** Lifecycle hook; does nothing unless overridden. */
    fun shutdown() {}

    /**
     * Process an extraction result.
     *
     * Transform or enrich the extraction result. Can modify:
     *
     * - `content` - The extracted text
     * - `metadata` - Add or update metadata fields
     * - `tables` - Modify or enhance table data
     *
     * **Errors:**
     *
     * Signal fatal processing failures as errors. Non-fatal errors should be
     * captured in metadata directly on the result.
     *
     * # Performance
     *
     * The underlying (Rust) signature takes a mutable reference so that
     * processors modify the result in place instead of cloning large
     * extraction results. This method returns nothing; how modifications to
     * [result] are propagated from this binding is not visible here - TODO
     * confirm.
     *
     * # Example - Text Cleaning (underlying Rust contract)
     *
     * ```rust
     * async fn process(&self, result: &mut ExtractionResult, config: &ExtractionConfig)
     * -> Result<()> {
     * // Remove excessive whitespace
     * result.content = result
     * .content
     * .split_whitespace()
     * .collect::<Vec<_>>()
     * .join(" ");
     *
     * Ok(())
     * }
     * ```
     *
     * @param result extraction result to process.
     * @param config extraction configuration in effect.
     */
    suspend fun process(result: ExtractionResult, config: ExtractionConfig)

    /**
     * Get the processing stage for this post-processor.
     *
     * Determines when this processor runs in the pipeline.
     *
     * @return the [ProcessingStage] (Early, Middle, or Late).
     */
    fun processingStage(): ProcessingStage

    /**
     * Optional: check if this processor should run for a given result.
     *
     * Allows conditional processing based on MIME type, metadata, or content.
     * Defaults to `true` (always run).
     *
     * @param result extraction result under consideration.
     * @param config extraction configuration in effect.
     * @return `true` if the processor should run, `false` to skip.
     */
    fun shouldProcess(result: ExtractionResult, config: ExtractionConfig): Boolean = true

    /**
     * Optional: estimate processing time in milliseconds.
     *
     * Used for logging and debugging. Defaults to 0 (unknown).
     *
     * @param result extraction result that would be processed.
     * @return estimated processing time in milliseconds.
     */
    fun estimatedDurationMs(result: ExtractionResult): Long = 0L

    /**
     * Execution priority within the processing stage.
     *
     * Higher values run first within the same [ProcessingStage]. Defaults to 50.
     * Use 0-49 for fallback processors, 50 for normal processors, and 51-255
     * for high-priority processors that should run early in their stage.
     */
    fun priority(): Int = 50
}

View File

@@ -0,0 +1,59 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Plugin contract for document renderers that turn a document into an output
 * string.
 *
 * Renderers are typically stateless converters that transform the document
 * representation into a specific output format (Markdown, HTML, Djot, plain
 * text, etc.). They take part in the standard `Plugin` lifecycle so that
 * custom renderers can be registered from any supported binding language.
 *
 * The format name is exposed through [name]. In this interface [initialize]
 * and [shutdown] have empty default bodies and need not be overridden by
 * stateless renderers; [name] and [version] must be implemented.
 *
 * # Thread Safety
 *
 * Renderers must be `Send + Sync` (inherited from `Plugin`).
 */
interface IRenderer {
    /** Plugin name; for renderers this is the format name. */
    fun name(): String

    /** Plugin version. */
    fun version(): String

    /** Lifecycle hook; a no-op unless overridden. */
    fun initialize() {}

    /** Lifecycle hook; a no-op unless overridden. */
    fun shutdown() {}

    /**
     * Renders a document to the output format.
     *
     * **Errors:** rendering failures are reported as errors.
     *
     * @param doc document to render.
     * @return the rendered output as a string.
     */
    fun render(doc: ExtractionResult): String
}

View File

@@ -0,0 +1,164 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Contract for validator plugins.
 *
 * Validators check extraction results for quality, completeness, or correctness.
 * Unlike post-processors, validator errors **fail fast** - if a validator reports
 * an error, the extraction fails immediately.
 *
 * Only [name], [version] and [validate] must be implemented. [initialize] and
 * [shutdown] default to no-ops, [shouldValidate] defaults to `true` and
 * [priority] defaults to `50`.
 *
 * # Use Cases
 *
 * - **Quality Gates**: Ensure extracted content meets minimum quality standards
 * - **Compliance**: Verify content meets regulatory requirements
 * - **Content Filtering**: Reject documents containing unwanted content
 * - **Format Validation**: Verify extracted content structure
 * - **Security Checks**: Scan for malicious content
 *
 * # Error Handling
 *
 * Validator errors are **fatal** - they cause the extraction to fail and bubble up
 * to the caller. Use validators for hard requirements that must be met.
 *
 * For non-fatal checks, use post-processors instead.
 *
 * # Thread Safety
 *
 * Validators must be thread-safe (`Send + Sync` in the upstream Rust contract).
 * NOTE(review): nothing in this interface enforces that; implementers
 * presumably have to guarantee it themselves - confirm.
 */
interface IValidator {
    /** Name of this validator plugin. Abstract: every implementation must supply it. */
    fun name(): String

    /** Version string of this validator plugin. Abstract: no default is provided. */
    fun version(): String

    /** Plugin lifecycle hook. The default body is empty, so overriding is optional. */
    fun initialize() {}

    /** Plugin lifecycle hook. The default body is empty, so overriding is optional. */
    fun shutdown() {}

    /**
     * Validate an extraction result.
     *
     * Returns normally when [result] is valid. The function has no return
     * value, so a validation failure is signalled by throwing, which makes the
     * extraction fail.
     *
     * @param result the extraction result to check.
     * @param config the extraction configuration in effect.
     *
     * **Errors:**
     *
     * - `KreuzbergError.Validation` - Validation failed
     * - Any other error type appropriate for the failure
     *
     * The examples below are taken from the upstream Rust implementation and
     * are kept as a reference for the intended semantics (`Ok(())` corresponds
     * to returning normally, `Err(...)` to throwing).
     *
     * # Example - Content Length Validation
     *
     * ```rust
     * async fn validate(&self, result: &ExtractionResult, config: &ExtractionConfig)
     * -> Result<()> {
     * let length = result.content.len();
     *
     * if length < self.min {
     * return Err(KreuzbergError::validation(format!(
     * "Content too short: {} < {} characters",
     * length, self.min
     * )));
     * }
     *
     * if length > self.max {
     * return Err(KreuzbergError::validation(format!(
     * "Content too long: {} > {} characters",
     * length, self.max
     * )));
     * }
     *
     * Ok(())
     * }
     * ```
     *
     * # Example - Quality Score Validation
     *
     * ```rust
     * async fn validate(&self, result: &ExtractionResult, config: &ExtractionConfig)
     * -> Result<()> {
     * // Check if quality_score exists in metadata
     * let score = result.metadata
     * .additional
     * .get("quality_score")
     * .and_then(|v| v.as_f64())
     * .unwrap_or(0.0);
     *
     * if score < self.min_score {
     * return Err(KreuzbergError::validation(format!(
     * "Quality score too low: {} < {}",
     * score, self.min_score
     * )));
     * }
     *
     * Ok(())
     * }
     * ```
     *
     * # Example - Security Validation
     *
     * ```rust
     * async fn validate(&self, result: &ExtractionResult, config: &ExtractionConfig)
     * -> Result<()> {
     * // Check for blocked patterns
     * for pattern in &self.blocked_patterns {
     * if result.content.contains(pattern) {
     * return Err(KreuzbergError::validation(format!(
     * "Content contains blocked pattern: {}",
     * pattern
     * )));
     * }
     * }
     *
     * Ok(())
     * }
     * ```
     */
    suspend fun validate(result: ExtractionResult, config: ExtractionConfig)

    /**
     * Optional: Check if this validator should run for a given result.
     *
     * Allows conditional validation based on MIME type, metadata, or content.
     * The default implementation returns `true` (always run); override it to
     * skip validation for selected results.
     *
     * @param result the extraction result that would be validated.
     * @param config the extraction configuration in effect.
     * @return `true` if the validator should run, `false` to skip.
     */
    fun shouldValidate(result: ExtractionResult, config: ExtractionConfig): Boolean = true

    /**
     * Optional: Get the validation priority.
     *
     * Higher priority validators run first. Useful for ordering validation checks
     * (e.g., run cheap validations before expensive ones).
     *
     * The default implementation returns `50`.
     *
     * @return priority value (higher = runs earlier).
     */
    fun priority(): Int = 50
}

View File

@@ -0,0 +1,95 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
* Image extraction configuration.
*
* Every property has a default value, so `ImageExtractionConfig()` yields a
* usable configuration and callers only need to name the settings they change.
*/
data class ImageExtractionConfig(
/** Extract images from documents. Defaults to `true`. */
val extractImages: Boolean = true,
/** Target DPI for image normalization. Defaults to `300`. */
val targetDpi: Int = 300,
/** Maximum dimension for images (width or height). Defaults to `4096`. */
val maxImageDimension: Int = 4096,
/**
* Whether to inject image reference placeholders into markdown output.
* When `true` (default), image references like `![Image 1](embedded:p1_i0)`
* are appended to the markdown. Set to `false` to extract images as data
* without polluting the markdown output.
*/
val injectPlaceholders: Boolean = true,
/** Automatically adjust DPI based on image content. Defaults to `true`. */
val autoAdjustDpi: Boolean = true,
/** Minimum DPI threshold. Defaults to `72`. */
val minDpi: Int = 72,
/** Maximum DPI threshold. Defaults to `600`. */
val maxDpi: Int = 600,
/**
* Maximum number of image objects to extract per PDF page.
*
* Some PDFs (e.g. technical diagrams stored as thousands of raster fragments)
* can trigger extremely long or indefinite extraction times when every image
* object on a dense page is decoded individually via the PDF extractor. Setting this
* limit causes kreuzberg to stop collecting individual images once the count
* per page reaches the cap and emit a warning instead.
*
* `null` (default) means no limit — all images are extracted.
*/
val maxImagesPerPage: Int? = null,
/**
* When `true` (default), extracted images are classified by kind and grouped
* into clusters where they appear to belong to one figure.
*/
val classify: Boolean = true,
/**
* When `true`, full-page renders produced during OCR preprocessing are captured
* and returned as [ImageKind.PAGE_RASTER] entries in `ExtractionResult.images`.
*
* **PDF + OCR only.** No rasters are captured for non-PDF inputs or when the
* document-level OCR bypass is active (whole-document backend). When OCR is
* enabled and this flag is set but the active backend skips per-page rendering,
* a `ProcessingWarning` is emitted in `ExtractionResult.processing_warnings`
* (upstream field name; the Kotlin property name is not visible from this file).
*
* Defaults to `false`. Enable when downstream consumers need page thumbnails
* (e.g. citation previews, visual grounding).
*/
val includePageRasters: Boolean = false,
/**
* Run OCR on extracted images and include the recognized text in the document content.
*
* When `true` (default) and `ExtractionConfig.ocr` is configured, extracted images
* are processed with the configured OCR backend. Set to `false` to extract images
* without OCR processing, even when OCR is enabled.
*/
val runOcrOnImages: Boolean = true,
/**
* When `true`, image OCR results are rendered as plain text without the
* `![...](...)` markdown placeholder. Only takes effect when [runOcrOnImages]
* is also `true`. Defaults to `false`.
*/
val ocrTextOnly: Boolean = false,
/**
* When `true` and [ocrTextOnly] is `false`, append the OCR text after
* the image placeholder in the rendered output. Defaults to `false`.
*/
val appendOcrText: Boolean = false,
)

View File

@@ -0,0 +1,99 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
 * Heuristic classification of what an image likely depicts.
 *
 * Every constant carries the string used for it on the wire. [toWire] returns
 * that string and [fromWire] maps it back to the constant.
 */
enum class ImageKind(private val wire: String) {
    /** Photographic image (natural scene, photograph) */
    @com.fasterxml.jackson.annotation.JsonProperty("photograph")
    PHOTOGRAPH("photograph"),

    /** Technical or schematic diagram */
    @com.fasterxml.jackson.annotation.JsonProperty("diagram")
    DIAGRAM("diagram"),

    /** Chart, graph, or plot */
    @com.fasterxml.jackson.annotation.JsonProperty("chart")
    CHART("chart"),

    /** Freehand or technical drawing */
    @com.fasterxml.jackson.annotation.JsonProperty("drawing")
    DRAWING("drawing"),

    /** Text-heavy image (scanned text, document) */
    @com.fasterxml.jackson.annotation.JsonProperty("text_block")
    TEXT_BLOCK("text_block"),

    /** Decorative element or border */
    @com.fasterxml.jackson.annotation.JsonProperty("decoration")
    DECORATION("decoration"),

    /** Logo or brand mark */
    @com.fasterxml.jackson.annotation.JsonProperty("logo")
    LOGO("logo"),

    /** Small icon */
    @com.fasterxml.jackson.annotation.JsonProperty("icon")
    ICON("icon"),

    /** Fragment of a larger tiled image (tile of a technical drawing) */
    @com.fasterxml.jackson.annotation.JsonProperty("tile_fragment")
    TILE_FRAGMENT("tile_fragment"),

    /** Mask or transparency map */
    @com.fasterxml.jackson.annotation.JsonProperty("mask")
    MASK("mask"),

    /** Full-page render produced during OCR preprocessing; used as a citation thumbnail. */
    @com.fasterxml.jackson.annotation.JsonProperty("page_raster")
    PAGE_RASTER("page_raster"),

    /** Could not classify with reasonable confidence */
    @com.fasterxml.jackson.annotation.JsonProperty("unknown")
    UNKNOWN("unknown");

    /** Returns the wire string of this constant; annotated as the Jackson JSON value. */
    @com.fasterxml.jackson.annotation.JsonValue
    fun toWire(): String = wire

    companion object {
        /** Wire string to constant lookup, built once from all constants. */
        private val byWire: Map<String, ImageKind> = values().associateBy { it.wire }

        /**
         * Resolves a wire string to its constant; annotated as the Jackson creator.
         *
         * The match is exact (case-sensitive).
         *
         * @param value the wire string to resolve.
         * @return the constant whose wire string equals [value].
         * @throws IllegalArgumentException if [value] matches no constant.
         */
        @com.fasterxml.jackson.annotation.JsonCreator
        @JvmStatic
        fun fromWire(value: String): ImageKind =
            byWire[value] ?: throw IllegalArgumentException("Unknown ImageKind value: $value")
    }
}

View File

@@ -0,0 +1,39 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
* Image metadata extracted from image files.
*
* Includes dimensions, format, and EXIF data. Every property has a default
* (zero, empty string, or empty map), so `ImageMetadata()` can be constructed
* without arguments.
*/
data class ImageMetadata(
/** Image width in pixels. Defaults to `0`. */
val width: Int = 0,
/** Image height in pixels. Defaults to `0`. */
val height: Int = 0,
/** Image format (e.g., "PNG", "JPEG", "TIFF"). Defaults to the empty string. */
val format: String = "",
/** EXIF metadata tags as a name-to-value map. Defaults to an empty map. */
val exif: Map<String, String> = emptyMap(),
)

View File

@@ -0,0 +1,39 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
* Image element metadata.
*
* [src] and [imageType] have no defaults and must always be supplied; the
* remaining properties are optional.
*/
data class ImageMetadataType(
/** Image source (URL, data URI, or SVG content). Required. */
val src: String,
/** Alternative text from alt attribute, or `null` (default) when absent. */
val alt: String? = null,
/** Title attribute, or `null` (default) when absent. */
val title: String? = null,
/**
* Image dimensions as (width, height) if available, otherwise `null` (default).
* NOTE(review): presumably a two-element list `[width, height]` - confirm.
*/
val dimensions: List<Int>? = null,
/** Image type classification. Required. */
val imageType: ImageType,
/**
* Additional attributes as key-value pairs. Defaults to an empty list.
* NOTE(review): presumably each inner list is a `[key, value]` pair - confirm.
*/
val attributes: List<List<String>> = emptyList(),
)

View File

@@ -0,0 +1,47 @@
// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
/**
* Image preprocessing configuration for OCR.
*
* These settings control how images are preprocessed before OCR to improve
* text recognition quality. Different preprocessing strategies work better
* for different document types.
*
* Every property has a default, so `ImagePreprocessingConfig()` can be
* constructed without arguments.
*/
data class ImagePreprocessingConfig(
/** Target DPI for the image (300 is standard, 600 for small text). Defaults to `300`. */
val targetDpi: Int = 300,
/** Auto-detect and correct image rotation. Defaults to `true`. */
val autoRotate: Boolean = true,
/** Correct skew (tilted images). Defaults to `true`. */
val deskew: Boolean = true,
/** Remove noise from the image. Defaults to `false`. */
val denoise: Boolean = false,
/** Enhance contrast for better text visibility. Defaults to `false`. */
val contrastEnhance: Boolean = false,
/**
* Binarization method: "otsu", "sauvola", "adaptive". Defaults to `"otsu"`.
* The value is a plain string and is not validated by this class.
*/
val binarizationMethod: String = "otsu",
/** Invert colors (white text on black → black on white). Defaults to `false`. */
val invertColors: Boolean = false,
)

Some files were not shown because too many files have changed in this diff Show More