This commit is contained in:
5
packages/kotlin-android/src/main/AndroidManifest.xml
generated
Normal file
5
packages/kotlin-android/src/main/AndroidManifest.xml
generated
Normal file
@@ -0,0 +1,5 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<!-- Generated by alef. Do not edit by hand. -->
|
||||
<manifest xmlns:android="http://schemas.android.com/apk/res/android"
|
||||
package="dev.kreuzberg">
|
||||
</manifest>
|
||||
0
packages/kotlin-android/src/main/jniLibs/arm64-v8a/.gitkeep
generated
Normal file
0
packages/kotlin-android/src/main/jniLibs/arm64-v8a/.gitkeep
generated
Normal file
0
packages/kotlin-android/src/main/jniLibs/x86_64/.gitkeep
generated
Normal file
0
packages/kotlin-android/src/main/jniLibs/x86_64/.gitkeep
generated
Normal file
36
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/AccelerationConfig.kt
generated
Normal file
36
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/AccelerationConfig.kt
generated
Normal file
@@ -0,0 +1,36 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * Selects the hardware backend used to run ONNX Runtime models.
 *
 * The chosen execution provider (CPU, CoreML, CUDA, TensorRT) applies to
 * inference in layout detection and embedding generation.
 *
 * @property provider Execution provider to use for ONNX inference.
 *   Defaults to [ExecutionProviderType.AUTO].
 * @property deviceId GPU device ID (for CUDA/TensorRT); ignored for
 *   CPU/CoreML/Auto. Defaults to `0`.
 */
data class AccelerationConfig(
    val provider: ExecutionProviderType = ExecutionProviderType.AUTO,
    val deviceId: Int = 0,
)
|
||||
172
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/AnnotationKind.kt
generated
Normal file
172
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/AnnotationKind.kt
generated
Normal file
@@ -0,0 +1,172 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * Kinds of inline text annotation.
 *
 * Kinds without a payload are singleton objects; kinds that carry data are
 * data classes. JSON conversion for the hierarchy is routed through
 * `AnnotationKindDeserializer` and `AnnotationKindSerializer` via the class
 * annotations below.
 *
 * Each data-carrying subclass resets its (de)serializer to Jackson's `None`
 * marker — presumably so the subclass is handled by Jackson's regular bean
 * handling rather than by the custom (de)serializer declared on this parent
 * class (NOTE(review): confirm against the generator's intent).
 */
@com.fasterxml.jackson.databind.annotation.JsonDeserialize(using = AnnotationKindDeserializer::class)
@com.fasterxml.jackson.databind.annotation.JsonSerialize(using = AnnotationKindSerializer::class)
sealed class AnnotationKind {
    // --- Payload-free kinds -------------------------------------------------

    object Bold : AnnotationKind()

    object Italic : AnnotationKind()

    object Underline : AnnotationKind()

    object Strikethrough : AnnotationKind()

    object Code : AnnotationKind()

    object Subscript : AnnotationKind()

    object Superscript : AnnotationKind()

    /**
     * Link annotation.
     *
     * @property url Required link URL.
     * @property title Optional title; may be `null`.
     */
    @com.fasterxml.jackson.databind.annotation.JsonDeserialize(using = com.fasterxml.jackson.databind.JsonDeserializer.None::class)
    @com.fasterxml.jackson.databind.annotation.JsonSerialize(using = com.fasterxml.jackson.databind.JsonSerializer.None::class)
    data class Link(
        val url: String,
        val title: String?,
    ) : AnnotationKind()

    /** Highlighted text (PDF highlights, HTML `<mark>`). */
    object Highlight : AnnotationKind()

    /**
     * Text color.
     *
     * @property value CSS-compatible color value, e.g. "#ff0000" or "red".
     */
    @com.fasterxml.jackson.databind.annotation.JsonDeserialize(using = com.fasterxml.jackson.databind.JsonDeserializer.None::class)
    @com.fasterxml.jackson.databind.annotation.JsonSerialize(using = com.fasterxml.jackson.databind.JsonSerializer.None::class)
    data class Color(
        val value: String,
    ) : AnnotationKind()

    /**
     * Font size.
     *
     * @property value Size including its unit, e.g. "12pt", "1.2em", "16px".
     */
    @com.fasterxml.jackson.databind.annotation.JsonDeserialize(using = com.fasterxml.jackson.databind.JsonDeserializer.None::class)
    @com.fasterxml.jackson.databind.annotation.JsonSerialize(using = com.fasterxml.jackson.databind.JsonSerializer.None::class)
    data class FontSize(
        val value: String,
    ) : AnnotationKind()

    /**
     * Extensible annotation for format-specific styling.
     *
     * @property name Name of the custom annotation.
     * @property value Optional value; may be `null`.
     */
    @com.fasterxml.jackson.databind.annotation.JsonDeserialize(using = com.fasterxml.jackson.databind.JsonDeserializer.None::class)
    @com.fasterxml.jackson.databind.annotation.JsonSerialize(using = com.fasterxml.jackson.databind.JsonSerializer.None::class)
    data class Custom(
        val name: String,
        val value: String?,
    ) : AnnotationKind()
}
|
||||
|
||||
/**
 * Jackson deserializer for [AnnotationKind].
 *
 * The expected input is a JSON object whose `annotation_type` field names the
 * variant. Payload-free variants map to their singleton object; the remaining
 * variants are bound from the same object with the tag field removed.
 */
private class AnnotationKindDeserializer : com.fasterxml.jackson.databind.deser.std.StdDeserializer<AnnotationKind>(AnnotationKind::class.java) {
    /**
     * Reads one [AnnotationKind] from [parser].
     *
     * @param parser Parser positioned at the value to read.
     * @param ctx Context used to bind payload-carrying variants.
     * @return The decoded variant.
     * @throws com.fasterxml.jackson.databind.exc.MismatchedInputException
     *   if the value is not a JSON object.
     * @throws com.fasterxml.jackson.databind.exc.InvalidFormatException
     *   if `annotation_type` is absent or names no known variant.
     */
    override fun deserialize(
        parser: com.fasterxml.jackson.core.JsonParser,
        ctx: com.fasterxml.jackson.databind.DeserializationContext,
    ): AnnotationKind {
        // Read as a generic node first: asking for ObjectNode directly would fail
        // with a raw ClassCastException on arrays/scalars instead of a Jackson error.
        val tree: com.fasterxml.jackson.databind.JsonNode? =
            parser.codec.readTree<com.fasterxml.jackson.databind.JsonNode>(parser)
        val node = tree as? com.fasterxml.jackson.databind.node.ObjectNode
            ?: throw com.fasterxml.jackson.databind.exc.MismatchedInputException.from(
                parser,
                AnnotationKind::class.java,
                "Expected a JSON object for AnnotationKind",
            )
        val tag = node.get("annotation_type")?.asText()

        // Binds a payload-carrying variant from a copy of the object without the tag
        // field. The copy is made only for variants that need it.
        fun <T : AnnotationKind> bind(type: Class<T>): T {
            val payload = node.deepCopy().apply { remove("annotation_type") }
            return ctx.readTreeAsValue(payload, type)
        }

        return when (tag) {
            "bold" -> AnnotationKind.Bold
            "italic" -> AnnotationKind.Italic
            "underline" -> AnnotationKind.Underline
            "strikethrough" -> AnnotationKind.Strikethrough
            "code" -> AnnotationKind.Code
            "subscript" -> AnnotationKind.Subscript
            "superscript" -> AnnotationKind.Superscript
            "link" -> bind(AnnotationKind.Link::class.java)
            "highlight" -> AnnotationKind.Highlight
            "color" -> bind(AnnotationKind.Color::class.java)
            "font_size" -> bind(AnnotationKind.FontSize::class.java)
            "custom" -> bind(AnnotationKind.Custom::class.java)
            // Also reached when the tag field is missing (tag == null).
            else -> throw com.fasterxml.jackson.databind.exc.InvalidFormatException(
                parser, "Unknown AnnotationKind tag", tag, AnnotationKind::class.java,
            )
        }
    }
}
|
||||
|
||||
/**
 * Jackson serializer for [AnnotationKind].
 *
 * Every variant is written as a JSON object containing an `annotation_type`
 * tag. Payload-free variants produce an object with only the tag; the other
 * variants are first converted to a tree and the tag is then added to it.
 */
private class AnnotationKindSerializer : com.fasterxml.jackson.databind.ser.std.StdSerializer<AnnotationKind>(AnnotationKind::class.java) {
    /**
     * Writes [value] to [gen] as a tagged JSON object.
     *
     * @param value Variant to serialize.
     * @param gen Generator to write to; its codec is used as the mapper when it
     *   is an `ObjectMapper`.
     * @param provider Unused by this implementation.
     */
    override fun serialize(
        value: AnnotationKind,
        gen: com.fasterxml.jackson.core.JsonGenerator,
        provider: com.fasterxml.jackson.databind.SerializerProvider,
    ) {
        val mapper = (gen.codec as? com.fasterxml.jackson.databind.ObjectMapper) ?: FALLBACK_MAPPER
        // Exhaustive over the sealed hierarchy on purpose (no `else`), so that a new
        // variant is a compile error here rather than a silently wrong payload.
        val node: com.fasterxml.jackson.databind.node.ObjectNode = when (value) {
            is AnnotationKind.Bold -> tagOnly(mapper, "bold")
            is AnnotationKind.Italic -> tagOnly(mapper, "italic")
            is AnnotationKind.Underline -> tagOnly(mapper, "underline")
            is AnnotationKind.Strikethrough -> tagOnly(mapper, "strikethrough")
            is AnnotationKind.Code -> tagOnly(mapper, "code")
            is AnnotationKind.Subscript -> tagOnly(mapper, "subscript")
            is AnnotationKind.Superscript -> tagOnly(mapper, "superscript")
            is AnnotationKind.Link -> tagged(mapper, value, "link")
            is AnnotationKind.Highlight -> tagOnly(mapper, "highlight")
            is AnnotationKind.Color -> tagged(mapper, value, "color")
            is AnnotationKind.FontSize -> tagged(mapper, value, "font_size")
            is AnnotationKind.Custom -> tagged(mapper, value, "custom")
        }
        mapper.writeTree(gen, node)
    }

    /** Builds `{"annotation_type": tag}` for a payload-free variant. */
    private fun tagOnly(
        mapper: com.fasterxml.jackson.databind.ObjectMapper,
        tag: String,
    ): com.fasterxml.jackson.databind.node.ObjectNode =
        mapper.createObjectNode().put("annotation_type", tag)

    /** Converts a payload-carrying variant to a tree, then adds the tag field to it. */
    private fun tagged(
        mapper: com.fasterxml.jackson.databind.ObjectMapper,
        value: AnnotationKind,
        tag: String,
    ): com.fasterxml.jackson.databind.node.ObjectNode =
        mapper.valueToTree<com.fasterxml.jackson.databind.node.ObjectNode>(value).put("annotation_type", tag)

    private companion object {
        /**
         * Mapper used when the generator's codec is not an `ObjectMapper`.
         *
         * Created once on first use instead of on every call: constructing a mapper
         * and scanning for modules is costly, and the instance is never reconfigured
         * after creation.
         */
        private val FALLBACK_MAPPER: com.fasterxml.jackson.databind.ObjectMapper by lazy {
            com.fasterxml.jackson.databind.ObjectMapper().findAndRegisterModules()
        }
    }
}
|
||||
38
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/ArchiveEntry.kt
generated
Normal file
38
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/ArchiveEntry.kt
generated
Normal file
@@ -0,0 +1,38 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * One file extracted from an archive.
 *
 * With recursive extraction enabled, each processable file inside an archive
 * (ZIP, TAR, 7Z, GZIP) produces its own full `ExtractionResult`.
 *
 * @property path Archive-relative file path (e.g. "folder/document.pdf").
 * @property mimeType Detected MIME type of the file.
 * @property result Full extraction result for this file.
 */
data class ArchiveEntry(
    val path: String,
    val mimeType: String,
    val result: ExtractionResult,
)
|
||||
41
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/ArchiveMetadata.kt
generated
Normal file
41
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/ArchiveMetadata.kt
generated
Normal file
@@ -0,0 +1,41 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * Metadata for an archive (ZIP/TAR/7Z).
 *
 * Extracted from compressed archive files; holds the file list and size information.
 *
 * @property format Archive format ("ZIP", "TAR", "7Z", etc.). Defaults to an empty string.
 * @property fileCount Total number of files in the archive. Defaults to `0`.
 * @property fileList File paths within the archive. Defaults to an empty list.
 * @property totalSize Total uncompressed size in bytes. Defaults to `0`.
 * @property compressedSize Compressed size in bytes, or `null` if not available.
 */
data class ArchiveMetadata(
    val format: String = "",
    val fileCount: Int = 0,
    val fileList: List<String> = emptyList(),
    val totalSize: Long = 0L,
    val compressedSize: Long? = null,
)
|
||||
26
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/BBox.kt
generated
Normal file
26
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/BBox.kt
generated
Normal file
@@ -0,0 +1,26 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * Bounding box in original image coordinates.
 *
 * @property x1 X of the top-left corner.
 * @property y1 Y of the top-left corner.
 * @property x2 X of the bottom-right corner.
 * @property y2 Y of the bottom-right corner.
 */
data class BBox(
    val x1: Float,
    val y1: Float,
    val x2: Float,
    val y2: Float,
)
|
||||
38
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/BatchBytesItem.kt
generated
Normal file
38
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/BatchBytesItem.kt
generated
Normal file
@@ -0,0 +1,38 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * Batch item for byte array extraction.
 *
 * Used with `batch_extract_bytes` and `batch_extract_bytes_sync`
 * to represent a single item in a batch extraction job.
 *
 * Equality and hashing compare [content] by its bytes. The `equals`/`hashCode`
 * that a data class generates would compare the array by reference, so two
 * items holding identical bytes would never be equal.
 *
 * @property content The content bytes to extract from.
 * @property mimeType MIME type of the content (e.g., "application/pdf", "text/html").
 * @property config Per-item configuration overrides (`null` uses batch-level defaults).
 */
data class BatchBytesItem(
    val content: ByteArray,
    val mimeType: String,
    val config: FileExtractionConfig? = null,
) {
    /** Structural equality: [content] is compared byte-by-byte. */
    override fun equals(other: Any?): Boolean {
        if (this === other) return true
        if (other !is BatchBytesItem) return false
        return content.contentEquals(other.content) &&
            mimeType == other.mimeType &&
            config == other.config
    }

    /** Hash consistent with [equals]: derived from the bytes of [content]. */
    override fun hashCode(): Int {
        var result = content.contentHashCode()
        result = 31 * result + mimeType.hashCode()
        result = 31 * result + (config?.hashCode() ?: 0)
        return result
    }
}
|
||||
38
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/BatchFileItem.kt
generated
Normal file
38
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/BatchFileItem.kt
generated
Normal file
@@ -0,0 +1,38 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
import java.nio.file.Path
|
||||
|
||||
/**
 * One file in a batch extraction job.
 *
 * Used with `batch_extract_files` and `batch_extract_files_sync`.
 *
 * @property path Path to the file to extract from.
 * @property config Per-file configuration overrides (`null` uses batch-level defaults).
 */
data class BatchFileItem(
    val path: Path,
    val config: FileExtractionConfig? = null,
)
|
||||
33
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/BibtexMetadata.kt
generated
Normal file
33
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/BibtexMetadata.kt
generated
Normal file
@@ -0,0 +1,33 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * Metadata for a BibTeX bibliography.
 *
 * Only [entryCount] is documented upstream; the notes on the other properties
 * are read off their names and types and should be confirmed against the producer.
 *
 * @property entryCount Number of entries in the bibliography. Defaults to `0`.
 * @property citationKeys Defaults to an empty list. NOTE(review): presumably the
 *   citation keys of the entries — confirm.
 * @property authors Defaults to an empty list. NOTE(review): presumably author
 *   names found in the entries — confirm.
 * @property yearRange Optional; `null` by default.
 * @property entryTypes Optional map from a string key to a count; `null` by default.
 *   NOTE(review): presumably entry type name to number of entries — confirm.
 */
data class BibtexMetadata(
    val entryCount: Long = 0L,
    val citationKeys: List<String> = emptyList(),
    val authors: List<String> = emptyList(),
    val yearRange: YearRange? = null,
    val entryTypes: Map<String, Long>? = null,
)
|
||||
103
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/BlockType.kt
generated
Normal file
103
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/BlockType.kt
generated
Normal file
@@ -0,0 +1,103 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * Types of block-level elements in Djot.
 *
 * Each constant carries the string used for it in serialized form; [toWire]
 * and [fromWire] convert between the constant and that string.
 */
enum class BlockType(private val wireName: String) {
    @com.fasterxml.jackson.annotation.JsonProperty("paragraph")
    PARAGRAPH("paragraph"),

    @com.fasterxml.jackson.annotation.JsonProperty("heading")
    HEADING("heading"),

    @com.fasterxml.jackson.annotation.JsonProperty("blockquote")
    BLOCKQUOTE("blockquote"),

    @com.fasterxml.jackson.annotation.JsonProperty("code_block")
    CODE_BLOCK("code_block"),

    @com.fasterxml.jackson.annotation.JsonProperty("list_item")
    LIST_ITEM("list_item"),

    @com.fasterxml.jackson.annotation.JsonProperty("ordered_list")
    ORDERED_LIST("ordered_list"),

    @com.fasterxml.jackson.annotation.JsonProperty("bullet_list")
    BULLET_LIST("bullet_list"),

    @com.fasterxml.jackson.annotation.JsonProperty("task_list")
    TASK_LIST("task_list"),

    @com.fasterxml.jackson.annotation.JsonProperty("definition_list")
    DEFINITION_LIST("definition_list"),

    @com.fasterxml.jackson.annotation.JsonProperty("definition_term")
    DEFINITION_TERM("definition_term"),

    @com.fasterxml.jackson.annotation.JsonProperty("definition_description")
    DEFINITION_DESCRIPTION("definition_description"),

    @com.fasterxml.jackson.annotation.JsonProperty("div")
    DIV("div"),

    @com.fasterxml.jackson.annotation.JsonProperty("section")
    SECTION("section"),

    @com.fasterxml.jackson.annotation.JsonProperty("thematic_break")
    THEMATIC_BREAK("thematic_break"),

    @com.fasterxml.jackson.annotation.JsonProperty("raw_block")
    RAW_BLOCK("raw_block"),

    @com.fasterxml.jackson.annotation.JsonProperty("math_display")
    MATH_DISPLAY("math_display");

    /** Returns the serialized string for this constant. */
    @com.fasterxml.jackson.annotation.JsonValue
    fun toWire(): String = wireName

    companion object {
        /**
         * Returns the constant whose serialized string equals [value].
         *
         * @throws IllegalArgumentException if no constant matches.
         */
        @com.fasterxml.jackson.annotation.JsonCreator
        @JvmStatic
        fun fromWire(value: String): BlockType =
            values().firstOrNull { candidate -> candidate.wireName == value }
                ?: throw IllegalArgumentException("Unknown BlockType value: $value")
    }
}
|
||||
35
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/BoundingBox.kt
generated
Normal file
35
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/BoundingBox.kt
generated
Normal file
@@ -0,0 +1,35 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * Bounding box coordinates for element positioning.
 *
 * Every coordinate defaults to `0.0`.
 *
 * @property x0 Left x-coordinate.
 * @property y0 Bottom y-coordinate.
 * @property x1 Right x-coordinate.
 * @property y1 Top y-coordinate.
 */
data class BoundingBox(
    val x0: Double = 0.0,
    val y0: Double = 0.0,
    val x1: Double = 0.0,
    val y1: Double = 0.0,
)
|
||||
31
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/CacheStats.kt
generated
Normal file
31
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/CacheStats.kt
generated
Normal file
@@ -0,0 +1,31 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * Cache statistics.
 *
 * NOTE(review): the upstream definition carries no documentation. The property
 * names suggest a file count, sizes in megabytes and file ages in days —
 * confirm units and semantics against the producer.
 */
data class CacheStats(
    val totalFiles: Long,
    val totalSizeMb: Double,
    val availableSpaceMb: Double,
    val oldestFileAgeDays: Double,
    val newestFileAgeDays: Double,
)
|
||||
41
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/CellChange.kt
generated
Normal file
41
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/CellChange.kt
generated
Normal file
@@ -0,0 +1,41 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * One changed cell within a table.
 *
 * Defined here (rather than only in `crate.diff`) so that `RevisionDelta` can
 * reference it unconditionally, without requiring the `diff` Cargo feature.
 * `crate.diff` re-exports this type verbatim.
 *
 * @property row Zero-based row index.
 * @property col Zero-based column index.
 * @property from Value before the change.
 * @property to Value after the change.
 */
data class CellChange(
    val row: Long,
    val col: Long,
    val from: String,
    val to: String,
)
|
||||
51
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/Chunk.kt
generated
Normal file
51
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/Chunk.kt
generated
Normal file
@@ -0,0 +1,51 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * A chunk of text with its classification, position metadata and an optional
 * embedding vector.
 *
 * Chunks are created when chunking is enabled in `ExtractionConfig`.
 *
 * @property content The text content of this chunk.
 * @property chunkType Semantic structural classification of this chunk, assigned
 *   by the heuristic classifier based on content patterns and heading context.
 *   The classifier uses `ChunkType.Unknown` when no rule matches; this constructor
 *   itself declares no default.
 * @property embedding Optional embedding vector. Only populated when
 *   `EmbeddingConfig` is provided in chunking configuration; its dimensionality
 *   depends on the chosen embedding model. `null` by default.
 * @property metadata Metadata about this chunk's position and properties.
 */
data class Chunk(
    val content: String,
    val chunkType: ChunkType,
    val embedding: List<Float>? = null,
    val metadata: ChunkMetadata,
)
|
||||
68
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/ChunkMetadata.kt
generated
Normal file
68
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/ChunkMetadata.kt
generated
Normal file
@@ -0,0 +1,68 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * Describes where a chunk sits inside the original document.
 *
 * @property byteStart Byte offset at which the chunk begins in the original text
 *   (always on a valid UTF-8 boundary).
 * @property byteEnd Byte offset at which the chunk ends in the original text
 *   (always on a valid UTF-8 boundary).
 * @property tokenCount Number of tokens in the chunk, when available. It is computed by
 *   the embedding model's tokenizer if embeddings are enabled; `null` otherwise.
 * @property chunkIndex Zero-based position of this chunk in the document.
 * @property totalChunks Total number of chunks the document was split into.
 * @property firstPage First page (1-indexed) covered by the chunk. Only populated when
 *   page tracking is enabled in the extraction configuration.
 * @property lastPage Last page (1-indexed) covered by the chunk; equal to [firstPage]
 *   for single-page chunks. Only populated when page tracking is enabled in the
 *   extraction configuration.
 * @property headingContext Heading hierarchy the chunk falls under. Only populated when
 *   the Markdown chunker (`ChunkerType.Markdown`) is used.
 * @property imageIndices Zero-based indices into the top-level `ExtractionResult.images`
 *   collection, one for every image whose page number lies within the chunk's
 *   first-page/last-page range. Empty when image extraction is disabled or the chunk
 *   spans no pages with images.
 */
data class ChunkMetadata(
    val byteStart: Long,
    val byteEnd: Long,
    val tokenCount: Long? = null,
    val chunkIndex: Long,
    val totalChunks: Long,
    val firstPage: Int? = null,
    val lastPage: Int? = null,
    val headingContext: HeadingContext? = null,
    val imageIndices: List<Int> = emptyList(),
)
|
||||
93
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/ChunkSizing.kt
generated
Normal file
93
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/ChunkSizing.kt
generated
Normal file
@@ -0,0 +1,93 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * How chunk size is measured.
 *
 * Defaults to [Characters] (Unicode character count). With token-based sizing,
 * chunks are sized by token count according to the specified tokenizer.
 *
 * Token-based sizing uses HuggingFace tokenizers loaded at runtime. Any tokenizer
 * available on HuggingFace Hub can be used, including OpenAI-compatible tokenizers
 * (e.g., `Xenova/gpt-4o`, `Xenova/cl100k_base`).
 *
 * On the wire each variant is a JSON object discriminated by a `type` field
 * (`"characters"` or `"tokenizer"`); see the custom serializer/deserializer
 * registered through the annotations below.
 */
@com.fasterxml.jackson.databind.annotation.JsonDeserialize(using = ChunkSizingDeserializer::class)
@com.fasterxml.jackson.databind.annotation.JsonSerialize(using = ChunkSizingSerializer::class)
sealed class ChunkSizing {
    /** Size measured in Unicode characters (default). */
    object Characters : ChunkSizing()

    /**
     * Size measured in tokens from a HuggingFace tokenizer.
     *
     * The `None` (de)serializer annotations reset the custom handlers declared on
     * [ChunkSizing] for this variant, so it is processed as a regular bean when the
     * parent handlers delegate to it.
     *
     * @property model Tokenizer model identifier (for example `Xenova/gpt-4o`).
     * @property cacheDir Optional cache directory, `null` when unset.
     *   NOTE(review): presumably where downloaded tokenizer files are stored; confirm
     *   against the native core.
     */
    @com.fasterxml.jackson.databind.annotation.JsonDeserialize(using = com.fasterxml.jackson.databind.JsonDeserializer.None::class)
    @com.fasterxml.jackson.databind.annotation.JsonSerialize(using = com.fasterxml.jackson.databind.JsonSerializer.None::class)
    data class Tokenizer(
        val model: String,
        // Defaulted to null for consistency with every other optional field in this
        // package; existing two-argument call sites are unaffected.
        val cacheDir: java.nio.file.Path? = null,
    ) : ChunkSizing()
}
|
||||
|
||||
/**
 * Jackson deserializer for the internally tagged [ChunkSizing] representation.
 *
 * Expects a JSON object whose `type` field selects the variant (`"characters"` or
 * `"tokenizer"`). The remaining fields are handed to Jackson as the payload of the
 * selected variant.
 */
private class ChunkSizingDeserializer : com.fasterxml.jackson.databind.deser.std.StdDeserializer<ChunkSizing>(ChunkSizing::class.java) {
    /**
     * Reads one [ChunkSizing] value from [parser].
     *
     * @throws com.fasterxml.jackson.databind.exc.MismatchedInputException if the input is not a JSON object.
     * @throws com.fasterxml.jackson.databind.exc.InvalidFormatException if the `type` tag is missing or unknown.
     */
    override fun deserialize(
        parser: com.fasterxml.jackson.core.JsonParser,
        ctx: com.fasterxml.jackson.databind.DeserializationContext,
    ): ChunkSizing {
        // Read as a generic node first: requesting ObjectNode directly would surface a
        // raw ClassCastException for scalar/array input instead of a mapping error.
        val tree = parser.codec.readTree<com.fasterxml.jackson.databind.JsonNode>(parser)
        val node = tree as? com.fasterxml.jackson.databind.node.ObjectNode
            ?: throw com.fasterxml.jackson.databind.exc.MismatchedInputException.from(
                parser, ChunkSizing::class.java, "Expected a JSON object for ChunkSizing",
            )
        val tag = node.get("type")?.asText()
        // The discriminator is not a property of any variant, so strip it from a copy
        // before delegating to the variant's own deserializer.
        val payload = node.deepCopy().apply { remove("type") }
        return when (tag) {
            "characters" -> ChunkSizing.Characters
            "tokenizer" -> ctx.readTreeAsValue<ChunkSizing.Tokenizer>(payload, ChunkSizing.Tokenizer::class.java)
            else -> throw com.fasterxml.jackson.databind.exc.InvalidFormatException(
                parser, "Unknown ChunkSizing tag", tag, ChunkSizing::class.java,
            )
        }
    }
}
|
||||
|
||||
/**
 * Jackson serializer for the internally tagged [ChunkSizing] representation.
 *
 * Each variant is written as a JSON object carrying a `type` discriminator
 * (`"characters"` or `"tokenizer"`) next to the variant's own fields.
 */
private class ChunkSizingSerializer : com.fasterxml.jackson.databind.ser.std.StdSerializer<ChunkSizing>(ChunkSizing::class.java) {
    /** Writes [value] to [gen] as a tagged JSON object. */
    override fun serialize(
        value: ChunkSizing,
        gen: com.fasterxml.jackson.core.JsonGenerator,
        provider: com.fasterxml.jackson.databind.SerializerProvider,
    ) {
        // Prefer the mapper that drives this generator so its configuration applies;
        // otherwise fall back to a shared mapper instead of building one per call.
        val mapper = gen.codec as? com.fasterxml.jackson.databind.ObjectMapper ?: FALLBACK_MAPPER
        val node: com.fasterxml.jackson.databind.node.ObjectNode = when (value) {
            is ChunkSizing.Characters ->
                mapper.createObjectNode().put("type", "characters")
            is ChunkSizing.Tokenizer ->
                // `value` is smart-cast here; the tag is appended after the variant's fields.
                mapper.valueToTree<com.fasterxml.jackson.databind.node.ObjectNode>(value).put("type", "tokenizer")
        }
        mapper.writeTree(gen, node)
    }

    private companion object {
        /** Mapper used only when the generator's codec is not an `ObjectMapper`; created on first use. */
        val FALLBACK_MAPPER: com.fasterxml.jackson.databind.ObjectMapper by lazy {
            com.fasterxml.jackson.databind.ObjectMapper().findAndRegisterModules()
        }
    }
}
|
||||
110
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/ChunkType.kt
generated
Normal file
110
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/ChunkType.kt
generated
Normal file
@@ -0,0 +1,110 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * Semantic structural category of a text chunk.
 *
 * Values are assigned by the heuristic classifier in `chunking.classifier`;
 * [UNKNOWN] is used when no rule matches. The set of categories is designed to
 * be extended in future versions without breaking changes.
 */
enum class ChunkType {
    /** Section heading or document title. */
    @com.fasterxml.jackson.annotation.JsonProperty("heading")
    HEADING,

    /** Party list: names, addresses, and signatories. */
    @com.fasterxml.jackson.annotation.JsonProperty("party_list")
    PARTY_LIST,

    /** Definition clause ("X means…", "X shall mean…"). */
    @com.fasterxml.jackson.annotation.JsonProperty("definitions")
    DEFINITIONS,

    /** Operative clause containing legal/contractual action verbs. */
    @com.fasterxml.jackson.annotation.JsonProperty("operative_clause")
    OPERATIVE_CLAUSE,

    /** Signature block with signatures, names, and dates. */
    @com.fasterxml.jackson.annotation.JsonProperty("signature_block")
    SIGNATURE_BLOCK,

    /** Schedule, annex, appendix, or exhibit section. */
    @com.fasterxml.jackson.annotation.JsonProperty("schedule")
    SCHEDULE,

    /** Table-like content with aligned columns or repeated patterns. */
    @com.fasterxml.jackson.annotation.JsonProperty("table_like")
    TABLE_LIKE,

    /** Mathematical formula or equation. */
    @com.fasterxml.jackson.annotation.JsonProperty("formula")
    FORMULA,

    /** Code block or preformatted content. */
    @com.fasterxml.jackson.annotation.JsonProperty("code_block")
    CODE_BLOCK,

    /** Embedded or referenced image content. */
    @com.fasterxml.jackson.annotation.JsonProperty("image")
    IMAGE,

    /** Organizational chart or hierarchy diagram. */
    @com.fasterxml.jackson.annotation.JsonProperty("org_chart")
    ORG_CHART,

    /** Diagram, figure, or visual illustration. */
    @com.fasterxml.jackson.annotation.JsonProperty("diagram")
    DIAGRAM,

    /** Unclassified or mixed content. */
    @com.fasterxml.jackson.annotation.JsonProperty("unknown")
    UNKNOWN;

    /** Returns the string that represents this category in serialized form. */
    @com.fasterxml.jackson.annotation.JsonValue
    fun toWire(): String {
        return when (this) {
            HEADING -> "heading"
            PARTY_LIST -> "party_list"
            DEFINITIONS -> "definitions"
            OPERATIVE_CLAUSE -> "operative_clause"
            SIGNATURE_BLOCK -> "signature_block"
            SCHEDULE -> "schedule"
            TABLE_LIKE -> "table_like"
            FORMULA -> "formula"
            CODE_BLOCK -> "code_block"
            IMAGE -> "image"
            ORG_CHART -> "org_chart"
            DIAGRAM -> "diagram"
            UNKNOWN -> "unknown"
        }
    }

    companion object {
        // Reverse index from serialized string to constant, derived from toWire().
        private val byWireName: Map<String, ChunkType> = values().associateBy { it.toWire() }

        /**
         * Resolves a serialized string back to its [ChunkType].
         *
         * @throws IllegalArgumentException if [value] is not a known category string.
         */
        @com.fasterxml.jackson.annotation.JsonCreator
        @JvmStatic
        fun fromWire(value: String): ChunkType =
            byWireName[value] ?: throw IllegalArgumentException("Unknown ChunkType value: $value")
    }
}
|
||||
70
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/ChunkerType.kt
generated
Normal file
70
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/ChunkerType.kt
generated
Normal file
@@ -0,0 +1,70 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * Selects which text chunker is used.
 *
 * See the individual constants for the behavior of each chunker.
 */
enum class ChunkerType {
    /** Generic text splitter; splits on whitespace and punctuation. */
    @com.fasterxml.jackson.annotation.JsonProperty("text")
    TEXT,

    /** Markdown-aware splitter; preserves formatting and structure. */
    @com.fasterxml.jackson.annotation.JsonProperty("markdown")
    MARKDOWN,

    /** YAML-aware splitter; creates one chunk per top-level key. */
    @com.fasterxml.jackson.annotation.JsonProperty("yaml")
    YAML,

    /**
     * Topic-aware chunker.
     *
     * With an `EmbeddingConfig`, splits at embedding-based topic shifts tuned by
     * `topic_threshold` (default 0.75, lower = more splits). Without an embedding,
     * falls back to a structural-boundary heuristic (ALL-CAPS headers, numbered
     * sections, blank-line paragraphs) and merges groups into chunks capped at
     * `max_characters` (default 1000). `topic_threshold` has no effect in the
     * fallback path. For best results, pair with an embedding model.
     */
    @com.fasterxml.jackson.annotation.JsonProperty("semantic")
    SEMANTIC;

    /** Returns the string that represents this chunker in serialized form. */
    @com.fasterxml.jackson.annotation.JsonValue
    fun toWire(): String {
        return when (this) {
            TEXT -> "text"
            MARKDOWN -> "markdown"
            YAML -> "yaml"
            SEMANTIC -> "semantic"
        }
    }

    companion object {
        // Reverse index from serialized string to constant, derived from toWire().
        private val byWireName: Map<String, ChunkerType> = values().associateBy { it.toWire() }

        /**
         * Resolves a serialized string back to its [ChunkerType].
         *
         * @throws IllegalArgumentException if [value] is not a known chunker string.
         */
        @com.fasterxml.jackson.annotation.JsonCreator
        @JvmStatic
        fun fromWire(value: String): ChunkerType =
            byWireName[value] ?: throw IllegalArgumentException("Unknown ChunkerType value: $value")
    }
}
|
||||
95
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/ChunkingConfig.kt
generated
Normal file
95
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/ChunkingConfig.kt
generated
Normal file
@@ -0,0 +1,95 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * Chunking configuration.
 *
 * Configures text chunking for document content, including chunk size,
 * overlap, trimming behavior, and optional embeddings.
 *
 * Every field has a default, so `ChunkingConfig()` yields the default
 * configuration. Override individual fields with named arguments so that call
 * sites keep compiling when fields are added in the future.
 */
data class ChunkingConfig(
    /**
     * Maximum size per chunk (in units determined by `sizing`).
     *
     * When `sizing` is `Characters` (default), this is the max character count.
     * When using token-based sizing, this is the max token count.
     * Serialized under the JSON name `max_chars`.
     *
     * Default: 1000
     */
    @com.fasterxml.jackson.annotation.JsonProperty("max_chars")
    val maxCharacters: Long = 1000L,
    /**
     * Overlap between chunks (in units determined by `sizing`).
     * Serialized under the JSON name `max_overlap`.
     *
     * Default: 200
     */
    @com.fasterxml.jackson.annotation.JsonProperty("max_overlap")
    val overlap: Long = 200L,
    /**
     * Whether to trim whitespace from chunk boundaries.
     *
     * Default: true
     */
    val trim: Boolean = true,
    /**
     * Type of chunker to use (see [ChunkerType]: text, Markdown, YAML, or semantic).
     *
     * Default: [ChunkerType.TEXT]
     */
    val chunkerType: ChunkerType = ChunkerType.TEXT,
    /** Optional embedding configuration for chunk embeddings. */
    val embedding: EmbeddingConfig? = null,
    /** Use a preset configuration (overrides individual settings if provided). */
    val preset: String? = null,
    /**
     * How to measure chunk size.
     *
     * Default: [ChunkSizing.Characters] (Unicode character count).
     * Enable `chunking-tiktoken` or `chunking-tokenizers` features for token-based sizing.
     */
    // Serialize through the declared sealed type so the tagged ChunkSizing
    // serializer is used regardless of the runtime variant.
    @field:com.fasterxml.jackson.databind.annotation.JsonSerialize(`as` = ChunkSizing::class)
    val sizing: ChunkSizing = ChunkSizing.Characters,
    /**
     * When `true` and `chunker_type` is `Markdown`, prepend the heading hierarchy
     * path (e.g. `"# Title > ## Section\n\n"`) to each chunk's content string.
     *
     * This is useful for RAG pipelines where each chunk needs self-contained
     * context about its position in the document structure.
     *
     * Default: `false`
     */
    val prependHeadingContext: Boolean = false,
    /**
     * Optional cosine similarity threshold for semantic topic boundary detection.
     *
     * Only used when `chunker_type` is `Semantic` and an `EmbeddingConfig` is
     * provided. You almost never need to set this. When omitted, defaults to
     * `0.75` which works well for most documents. Lower values detect more
     * topic boundaries (more, smaller chunks); higher values detect fewer.
     * Range: `0.0..=1.0`.
     */
    val topicThreshold: Float? = null,
)
|
||||
33
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/CitationMetadata.kt
generated
Normal file
33
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/CitationMetadata.kt
generated
Normal file
@@ -0,0 +1,33 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * Metadata extracted from a citation file (RIS, PubMed, EndNote).
 *
 * All fields are optional on construction: counts default to zero, collections to
 * empty lists, and the remaining fields to `null`.
 */
data class CitationMetadata(
    val citationCount: Long = 0L,
    val format: String? = null,
    val authors: List<String> = emptyList(),
    val yearRange: YearRange? = null,
    val dois: List<String> = emptyList(),
    val keywords: List<String> = emptyList(),
)
|
||||
59
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/CodeContentMode.kt
generated
Normal file
59
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/CodeContentMode.kt
generated
Normal file
@@ -0,0 +1,59 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * Rendering mode for extracted code content.
 *
 * Determines how code is represented in the `content` field of
 * `ExtractionResult`.
 */
enum class CodeContentMode {
    /** Use TSLP semantic chunks as content (default). */
    @com.fasterxml.jackson.annotation.JsonProperty("chunks")
    CHUNKS,

    /** Use raw source code as content. */
    @com.fasterxml.jackson.annotation.JsonProperty("raw")
    RAW,

    /** Emit function/class headings + docstrings (no code bodies). */
    @com.fasterxml.jackson.annotation.JsonProperty("structure")
    STRUCTURE;

    /** Returns the string that represents this mode in serialized form. */
    @com.fasterxml.jackson.annotation.JsonValue
    fun toWire(): String {
        return when (this) {
            CHUNKS -> "chunks"
            RAW -> "raw"
            STRUCTURE -> "structure"
        }
    }

    companion object {
        // Reverse index from serialized string to constant, derived from toWire().
        private val byWireName: Map<String, CodeContentMode> = values().associateBy { it.toWire() }

        /**
         * Resolves a serialized string back to its [CodeContentMode].
         *
         * @throws IllegalArgumentException if [value] is not a known mode string.
         */
        @com.fasterxml.jackson.annotation.JsonCreator
        @JvmStatic
        fun fromWire(value: String): CodeContentMode =
            byWireName[value] ?: throw IllegalArgumentException("Unknown CodeContentMode value: $value")
    }
}
|
||||
89
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/ContentFilterConfig.kt
generated
Normal file
89
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/ContentFilterConfig.kt
generated
Normal file
@@ -0,0 +1,89 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * Content filtering settings shared by all extractors.
 *
 * Decides whether "furniture" content (headers, footers, page numbers,
 * watermarks, repeating text) is kept in or removed from extraction results.
 * The settings apply across extractors (PDF, DOCX, RTF, ODT, HTML, etc.), each
 * with a format-specific implementation.
 *
 * When this configuration is `null` on `ExtractionConfig`, every extractor keeps
 * its current default behavior unchanged.
 *
 * @property includeHeaders Keep running headers in the output.
 *   - PDF: disables top-margin furniture stripping and stops the layout model from
 *     treating `PageHeader`-classified regions as furniture.
 *   - DOCX: includes document headers in text output.
 *   - RTF/ODT: headers are already included; a no-op when `true`.
 *   - HTML/EPUB: keeps `<header>` element content.
 *
 *   Default: `false` (headers are stripped or excluded).
 * @property includeFooters Keep running footers in the output.
 *   - PDF: disables bottom-margin furniture stripping and stops the layout model from
 *     treating `PageFooter`-classified regions as furniture.
 *   - DOCX: includes document footers in text output.
 *   - RTF/ODT: footers are already included; a no-op when `true`.
 *   - HTML/EPUB: keeps `<footer>` element content.
 *
 *   Default: `false` (footers are stripped or excluded).
 * @property stripRepeatingText Enable the heuristic cross-page repeating text detector.
 *   When `true` (default), text repeated verbatim across a supermajority of pages is
 *   classified as furniture and stripped. Disable it if brand names or repeated
 *   headings are being removed incorrectly.
 *
 *   Note: when a layout-detection model is active, the model may independently
 *   classify page-header / page-footer regions as furniture on a per-page basis.
 *   To preserve those regions, also set `include_headers = true`,
 *   `include_footers = true`, or both. Primarily affects PDF extraction.
 * @property includeWatermarks Keep watermark text in the output.
 *   - PDF: keeps watermark artifacts and arXiv identifiers.
 *   - Other formats: no effect currently.
 *
 *   Default: `false` (watermarks are stripped).
 */
data class ContentFilterConfig(
    val includeHeaders: Boolean = false,
    val includeFooters: Boolean = false,
    val stripRepeatingText: Boolean = true,
    val includeWatermarks: Boolean = false,
)
|
||||
63
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/ContentLayer.kt
generated
Normal file
63
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/ContentLayer.kt
generated
Normal file
@@ -0,0 +1,63 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * Layer a document node belongs to.
 *
 * Gives per-node granularity in place of separate body/furniture arrays.
 */
enum class ContentLayer {
    /** Main document body content. */
    @com.fasterxml.jackson.annotation.JsonProperty("body")
    BODY,

    /** Page/section header (running header). */
    @com.fasterxml.jackson.annotation.JsonProperty("header")
    HEADER,

    /** Page/section footer (running footer). */
    @com.fasterxml.jackson.annotation.JsonProperty("footer")
    FOOTER,

    /** Footnote content. */
    @com.fasterxml.jackson.annotation.JsonProperty("footnote")
    FOOTNOTE;

    /** Returns the string that represents this layer in serialized form. */
    @com.fasterxml.jackson.annotation.JsonValue
    fun toWire(): String {
        return when (this) {
            BODY -> "body"
            HEADER -> "header"
            FOOTER -> "footer"
            FOOTNOTE -> "footnote"
        }
    }

    companion object {
        // Reverse index from serialized string to constant, derived from toWire().
        private val byWireName: Map<String, ContentLayer> = values().associateBy { it.toWire() }

        /**
         * Resolves a serialized string back to its [ContentLayer].
         *
         * @throws IllegalArgumentException if [value] is not a known layer string.
         */
        @com.fasterxml.jackson.annotation.JsonCreator
        @JvmStatic
        fun fromWire(value: String): ContentLayer =
            byWireName[value] ?: throw IllegalArgumentException("Unknown ContentLayer value: $value")
    }
}
|
||||
26
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/ContributorRole.kt
generated
Normal file
26
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/ContributorRole.kt
generated
Normal file
@@ -0,0 +1,26 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * A JATS contributor together with an optional role.
 *
 * @property name Contributor name.
 * @property role Contributor role, or `null` when none is given.
 */
data class ContributorRole(
    val name: String,
    val role: String? = null,
)
|
||||
62
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/CoreProperties.kt
generated
Normal file
62
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/CoreProperties.kt
generated
Normal file
@@ -0,0 +1,62 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * Dublin Core metadata from docProps/core.xml.
 *
 * Holds the standard metadata fields defined by the Dublin Core standard
 * together with Office-specific extensions. Every field is optional and
 * defaults to `null`.
 *
 * @property title Document title.
 * @property subject Document subject/topic.
 * @property creator Document creator/author.
 * @property keywords Keywords or tags.
 * @property description Document description/abstract.
 * @property lastModifiedBy User who last modified the document.
 * @property revision Revision number.
 * @property created Creation timestamp (ISO 8601).
 * @property modified Last modification timestamp (ISO 8601).
 * @property category Document category.
 * @property contentStatus Content status (Draft, Final, etc.).
 * @property language Document language.
 * @property identifier Unique identifier.
 * @property version Document version.
 * @property lastPrinted Last print timestamp (ISO 8601).
 */
data class CoreProperties(
    val title: String? = null,
    val subject: String? = null,
    val creator: String? = null,
    val keywords: String? = null,
    val description: String? = null,
    val lastModifiedBy: String? = null,
    val revision: String? = null,
    val created: String? = null,
    val modified: String? = null,
    val category: String? = null,
    val contentStatus: String? = null,
    val language: String? = null,
    val identifier: String? = null,
    val version: String? = null,
    val lastPrinted: String? = null,
)
|
||||
32
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/CsvMetadata.kt
generated
Normal file
32
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/CsvMetadata.kt
generated
Normal file
@@ -0,0 +1,32 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * CSV/TSV file metadata.
 *
 * @property rowCount Row count; defaults to `0`.
 * @property columnCount Column count; defaults to `0`.
 * @property delimiter Delimiter, or `null` (the default) when not set.
 * @property hasHeader Header flag; defaults to `false`.
 * @property columnTypes Column type names, or `null` (the default) when not set.
 */
data class CsvMetadata(
    val rowCount: Int = 0,
    val columnCount: Int = 0,
    val delimiter: String? = null,
    val hasHeader: Boolean = false,
    val columnTypes: List<String>? = null,
)
|
||||
26
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/DbfFieldInfo.kt
generated
Normal file
26
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/DbfFieldInfo.kt
generated
Normal file
@@ -0,0 +1,26 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * dBASE field information.
 *
 * @property name Field name.
 * @property fieldType Field type, carried as a string.
 */
data class DbfFieldInfo(
    val name: String,
    val fieldType: String,
)
|
||||
30
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/DbfMetadata.kt
generated
Normal file
30
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/DbfMetadata.kt
generated
Normal file
@@ -0,0 +1,30 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * dBASE (DBF) file metadata.
 *
 * @property recordCount Record count; defaults to `0`.
 * @property fieldCount Field count; defaults to `0`.
 * @property fields Per-field information; defaults to an empty list.
 */
data class DbfMetadata(
    val recordCount: Long = 0L,
    val fieldCount: Long = 0L,
    val fields: List<DbfFieldInfo> = emptyList(),
)
|
||||
33
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/DefaultClient.kt
generated
Normal file
33
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/DefaultClient.kt
generated
Normal file
@@ -0,0 +1,33 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:annotation",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"LongParameterList",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
import com.fasterxml.jackson.core.type.TypeReference
|
||||
|
||||
/**
 * Wrapper around a native document identified by [handle].
 *
 * The handle is handed to `KreuzbergBridge.nativeFreeDocument` when [close] is called.
 * [close] is idempotent: only the first call reaches the native layer, so calling it
 * again (for example from both a `use { }` block and explicit cleanup) cannot free the
 * same handle twice.
 */
@Suppress("TooManyFunctions")
class Document internal constructor(internal val handle: Long) : AutoCloseable {
    companion object {
        private val MAPPER = com.fasterxml.jackson.databind.ObjectMapper()
            .registerModule(com.fasterxml.jackson.datatype.jdk8.Jdk8Module())
            .findAndRegisterModules()
            .setPropertyNamingStrategy(com.fasterxml.jackson.databind.PropertyNamingStrategies.SNAKE_CASE)
    }

    // Flipped exactly once by close(); guards against releasing the handle more than once.
    private val closed = java.util.concurrent.atomic.AtomicBoolean(false)

    /**
     * Return the 1-based page number for each top-level table in the document.
     *
     * The native bridge answers with a JSON string, which is decoded into a list of longs.
     */
    fun tablePageNumbers(): List<Long> {
        val responseJson = KreuzbergBridge.nativeDocumentTablePageNumbers(handle)
        return MAPPER.readValue(responseJson, object : TypeReference<List<Long>>() {})
    }

    /** Release the native handle. Calls after the first one are no-ops. */
    override fun close() {
        if (closed.compareAndSet(false, true)) {
            KreuzbergBridge.nativeFreeDocument(handle)
        }
    }
}
|
||||
31
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/DetectResponse.kt
generated
Normal file
31
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/DetectResponse.kt
generated
Normal file
@@ -0,0 +1,31 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * MIME type detection response.
 *
 * @property mimeType Detected MIME type.
 * @property filename Original filename (if provided); defaults to `null`.
 */
data class DetectResponse(
    val mimeType: String,
    val filename: String? = null,
)
|
||||
30
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/DetectionResult.kt
generated
Normal file
30
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/DetectionResult.kt
generated
Normal file
@@ -0,0 +1,30 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * Page-level detection result containing all detections and page metadata.
 *
 * @property pageWidth Page width.
 * @property pageHeight Page height.
 * @property detections Detections for the page; defaults to an empty list.
 */
data class DetectionResult(
    val pageWidth: Int,
    val pageHeight: Int,
    val detections: List<LayoutDetection> = emptyList(),
)
|
||||
38
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/DiffHunk.kt
generated
Normal file
38
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/DiffHunk.kt
generated
Normal file
@@ -0,0 +1,38 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * A single contiguous hunk in a unified diff.
 *
 * @property fromLine Starting line number in the old content (0-indexed).
 * @property fromCount Number of lines from the old content in this hunk.
 * @property toLine Starting line number in the new content (0-indexed).
 * @property toCount Number of lines from the new content in this hunk.
 * @property lines Lines that make up this hunk; defaults to an empty list.
 */
data class DiffHunk(
    val fromLine: Long,
    val fromCount: Long,
    val toLine: Long,
    val toCount: Long,
    @field:com.fasterxml.jackson.databind.annotation.JsonSerialize(contentAs = DiffLine::class)
    val lines: List<DiffLine> = emptyList(),
)
|
||||
95
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/DiffLine.kt
generated
Normal file
95
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/DiffLine.kt
generated
Normal file
@@ -0,0 +1,95 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * One line of a unified-diff hunk.
 *
 * The type lives here, not solely in `crate.diff`, so that `RevisionDelta` can refer
 * to it unconditionally without the `diff` Cargo feature being enabled; `crate.diff`
 * re-exports it verbatim.
 *
 * JSON conversion is delegated to the custom serializer/deserializer named in the
 * annotations below.
 */
@com.fasterxml.jackson.databind.annotation.JsonDeserialize(using = DiffLineDeserializer::class)
@com.fasterxml.jackson.databind.annotation.JsonSerialize(using = DiffLineSerializer::class)
sealed class DiffLine {
    /** Context line that is unchanged between the two versions. */
    data class Context(val value: String) : DiffLine()

    /** Line that was added in the "after" version. */
    data class Added(val value: String) : DiffLine()

    /** Line that was removed from the "before" version. */
    data class Removed(val value: String) : DiffLine()
}
|
||||
|
||||
/**
 * Jackson deserializer for [DiffLine].
 *
 * Reads the incoming value as an object node, uses its `"kind"` field to pick the
 * variant (`"context"`, `"added"` or `"removed"`), and converts the remaining fields
 * to the variant's `String` payload. Any other tag (including a missing one) is
 * rejected with an `InvalidFormatException`.
 */
private class DiffLineDeserializer : com.fasterxml.jackson.databind.deser.std.StdDeserializer<DiffLine>(DiffLine::class.java) {
    @Suppress("LongMethod")
    override fun deserialize(
        parser: com.fasterxml.jackson.core.JsonParser,
        ctx: com.fasterxml.jackson.databind.DeserializationContext,
    ): DiffLine {
        val tree = parser.codec.readTree<com.fasterxml.jackson.databind.node.ObjectNode>(parser)
        val kind = tree.get("kind")?.asText()

        // Work on a copy so the discriminator can be stripped without touching the parsed tree.
        @Suppress("UNCHECKED_CAST")
        val body = (tree.deepCopy() as com.fasterxml.jackson.databind.node.ObjectNode)
        body.remove("kind")

        // Select the variant constructor first; unknown tags fail before the payload is read.
        val build: (String) -> DiffLine = when (kind) {
            "context" -> DiffLine::Context
            "added" -> DiffLine::Added
            "removed" -> DiffLine::Removed
            else -> throw com.fasterxml.jackson.databind.exc.InvalidFormatException(
                parser, "Unknown DiffLine tag", kind, DiffLine::class.java,
            )
        }

        // NOTE(review): `body` is still an object node here while the target type is String;
        // this conversion presumably fails for object input — confirm the intended wire
        // format against the alef generator / Rust serde attributes.
        return build(ctx.readTreeAsValue<String>(body, String::class.java))
    }
}
|
||||
|
||||
/**
 * Jackson serializer for [DiffLine].
 *
 * Converts the variant's `String` payload to a tree, adds a `"kind"` discriminator
 * (`"context"`, `"added"` or `"removed"`) and writes the tree to the generator. The
 * generator's own `ObjectMapper` codec is used when it has one; otherwise a fresh mapper
 * with auto-discovered modules is created.
 */
private class DiffLineSerializer : com.fasterxml.jackson.databind.ser.std.StdSerializer<DiffLine>(DiffLine::class.java) {
    @Suppress("LongMethod")
    override fun serialize(
        value: DiffLine,
        gen: com.fasterxml.jackson.core.JsonGenerator,
        provider: com.fasterxml.jackson.databind.SerializerProvider,
    ) {
        val mapper = (gen.codec as? com.fasterxml.jackson.databind.ObjectMapper)
            ?: com.fasterxml.jackson.databind.ObjectMapper().findAndRegisterModules()

        // Exhaustive over the sealed hierarchy: each variant contributes its tag and payload.
        val (kind, text) = when (value) {
            is DiffLine.Context -> "context" to value.value
            is DiffLine.Added -> "added" to value.value
            is DiffLine.Removed -> "removed" to value.value
        }

        // NOTE(review): the payload is a String, so valueToTree presumably yields a text node
        // rather than an ObjectNode and this cast would then throw — confirm the intended wire
        // format against the alef generator / Rust serde attributes.
        @Suppress("UNCHECKED_CAST")
        val tagged = mapper.valueToTree<com.fasterxml.jackson.databind.node.ObjectNode>(text) as com.fasterxml.jackson.databind.node.ObjectNode
        tagged.put("kind", kind)
        mapper.writeTree(gen, tagged)
    }
}
|
||||
38
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/DiffOptions.kt
generated
Normal file
38
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/DiffOptions.kt
generated
Normal file
@@ -0,0 +1,38 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * Options controlling how two `ExtractionResult` values are compared.
 *
 * @property includeMetadata Include metadata changes in the diff. Default: `true`.
 * @property includeEmbedded Include embedded-children changes in the diff. Default: `true`.
 * @property maxContentChars Truncate content to this many characters before diffing.
 *   Useful for very large documents where only the first N characters matter.
 *   `null` (the default) means no truncation.
 */
data class DiffOptions(
    val includeMetadata: Boolean = true,
    val includeEmbedded: Boolean = true,
    val maxContentChars: Long? = null,
)
|
||||
56
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/DjotContent.kt
generated
Normal file
56
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/DjotContent.kt
generated
Normal file
@@ -0,0 +1,56 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * Comprehensive Djot document structure with semantic preservation.
 *
 * Captures the full richness of Djot markup:
 *
 * - block-level structures (headings, lists, blockquotes, code blocks, etc.)
 * - inline formatting (emphasis, strong, highlight, subscript, superscript, etc.)
 * - attributes (classes, IDs, key-value pairs)
 * - links, images, footnotes
 * - math expressions (inline and display)
 * - tables with full structure
 *
 * Available when the `djot` feature is enabled.
 *
 * @property plainText Plain text representation for backwards compatibility.
 * @property blocks Structured block-level content; defaults to an empty list.
 * @property metadata Metadata from YAML frontmatter.
 * @property tables Extracted tables as structured data; defaults to an empty list.
 * @property images Extracted images with metadata; defaults to an empty list.
 * @property links Extracted links with URLs; defaults to an empty list.
 * @property footnotes Footnote definitions; defaults to an empty list.
 * @property attributes Attributes mapped by element identifier (if present);
 *   defaults to an empty list.
 */
data class DjotContent(
    val plainText: String,
    val blocks: List<FormattedBlock> = emptyList(),
    val metadata: Metadata,
    val tables: List<Table> = emptyList(),
    val images: List<DjotImage> = emptyList(),
    val links: List<DjotLink> = emptyList(),
    val footnotes: List<Footnote> = emptyList(),
    val attributes: List<String> = emptyList(),
)
|
||||
35
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/DjotImage.kt
generated
Normal file
35
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/DjotImage.kt
generated
Normal file
@@ -0,0 +1,35 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * Image element in Djot.
 *
 * @property src Image source URL or path.
 * @property alt Alternative text.
 * @property title Optional title; defaults to `null`.
 * @property attributes Element attributes; defaults to `null`.
 */
data class DjotImage(
    val src: String,
    val alt: String,
    val title: String? = null,
    val attributes: String? = null,
)
|
||||
35
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/DjotLink.kt
generated
Normal file
35
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/DjotLink.kt
generated
Normal file
@@ -0,0 +1,35 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * Link element in Djot.
 *
 * @property url Link URL.
 * @property text Link text content.
 * @property title Optional title; defaults to `null`.
 * @property attributes Element attributes; defaults to `null`.
 */
data class DjotLink(
    val url: String,
    val text: String,
    val title: String? = null,
    val attributes: String? = null,
)
|
||||
25
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/DocumentExtractorBridge.kt
generated
Normal file
25
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/DocumentExtractorBridge.kt
generated
Normal file
@@ -0,0 +1,25 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * Registry of [IDocumentExtractor] implementations, mirrored to the native layer.
 *
 * Each mutation updates the local name-to-extractor map first and then forwards the
 * same operation to `KreuzbergBridge`.
 */
object DocumentExtractorBridge {
    // Local mirror of registrations, keyed by the extractor's reported name.
    private val extractorsByName = mutableMapOf<String, IDocumentExtractor>()

    /** Record [impl] under `impl.name()` (replacing any previous entry) and register it natively. */
    fun register(impl: IDocumentExtractor) {
        extractorsByName[impl.name()] = impl
        KreuzbergBridge.nativeRegisterDocumentExtractor(impl)
    }

    /** Drop the entry for [name], if any, and unregister it natively. */
    fun unregister(name: String) {
        extractorsByName.remove(name)
        KreuzbergBridge.nativeUnregisterDocumentExtractor(name)
    }

    /** Remove every local entry and clear the native registrations. */
    fun clearAll() {
        extractorsByName.clear()
        KreuzbergBridge.nativeClearDocumentExtractors()
    }

    /** Return a copy of the current registrations; later changes do not affect the returned map. */
    fun getAll(): Map<String, IDocumentExtractor> {
        return extractorsByName.toMap()
    }
}
|
||||
62
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/DocumentNode.kt
generated
Normal file
62
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/DocumentNode.kt
generated
Normal file
@@ -0,0 +1,62 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * A single node in the document tree.
 *
 * Every node carries a deterministic `id` and typed `content`, optional
 * `parent`/`children` links for tree structure, and metadata such as page number,
 * bounding box, and content layer.
 *
 * @property id Deterministic identifier (hash of content + position).
 * @property content Node content — tagged enum, type-specific data only.
 * @property parent Parent node index (`null` = root-level node).
 * @property children Child node indices in reading order; defaults to an empty list.
 * @property contentLayer Content layer classification.
 * @property page Page number where this node starts (1-indexed).
 * @property pageEnd Page number where this node ends (for multi-page tables/sections).
 * @property bbox Bounding box in document coordinates.
 * @property annotations Inline annotations (formatting, links) on this node's text
 *   content. Only meaningful for text-carrying nodes; empty for containers.
 * @property attributes Format-specific key-value attributes. Extensible bag for
 *   miscellaneous data without a dedicated typed field: CSS classes, LaTeX environment
 *   names, Excel cell formulas, slide layout names, etc.
 */
data class DocumentNode(
    val id: String,
    @field:com.fasterxml.jackson.databind.annotation.JsonSerialize(`as` = NodeContent::class)
    val content: NodeContent,
    val parent: Int? = null,
    val children: List<Int> = emptyList(),
    val contentLayer: ContentLayer,
    val page: Int? = null,
    val pageEnd: Int? = null,
    val bbox: BoundingBox? = null,
    val annotations: List<TextAnnotation> = emptyList(),
    val attributes: Map<String, String>? = null,
)
|
||||
33
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/DocumentRelationship.kt
generated
Normal file
33
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/DocumentRelationship.kt
generated
Normal file
@@ -0,0 +1,33 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * A resolved relationship between two nodes in the document tree.
 *
 * @property source Source node index (the referencing node).
 * @property target Target node index (the referenced node).
 * @property kind Semantic kind of the relationship.
 */
data class DocumentRelationship(
    val source: Int,
    val target: Int,
    val kind: RelationshipKind,
)
|
||||
66
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/DocumentRevision.kt
generated
Normal file
66
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/DocumentRevision.kt
generated
Normal file
@@ -0,0 +1,66 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * A single tracked change embedded in a document.
 *
 * Populated by per-format extractors that understand change-tracking metadata
 * (DOCX `w:ins`/`w:del`/`w:rPrChange`, ODT `text:change-*`, …). Every extractor
 * defaults to `ExtractionResult.revisions = None` until a format-specific
 * implementation is added.
 *
 * @property revisionId Format-specific revision identifier. For DOCX this is the
 *   `w:id` attribute value on the change element (e.g. `"42"`). When the attribute is
 *   absent a synthetic fallback is generated (`"docx-ins-0"`, `"docx-del-3"`, …).
 * @property author Display name of the author who made this change, when available.
 * @property timestamp ISO-8601 timestamp of the change, when available. Stored as a
 *   plain string so this type remains FFI-friendly and unconditionally available
 *   without the `chrono` optional dep. DOCX populates this from the `w:date` attribute
 *   (e.g. `"2024-03-15T10:30:00Z"`).
 * @property kind Semantic kind of this revision.
 * @property anchor Best-effort document location for this revision. Resolution is
 *   format-dependent and may be `null` when the location cannot be determined (e.g.
 *   changes inside table cells before table-cell anchor support is added).
 * @property delta The content changes that make up this revision.
 */
data class DocumentRevision(
    val revisionId: String,
    val author: String? = null,
    val timestamp: String? = null,
    val kind: RevisionKind,
    @field:com.fasterxml.jackson.databind.annotation.JsonSerialize(`as` = RevisionAnchor::class)
    val anchor: RevisionAnchor? = null,
    val delta: RevisionDelta,
)
|
||||
65
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/DocumentStructure.kt
generated
Normal file
65
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/DocumentStructure.kt
generated
Normal file
@@ -0,0 +1,65 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * Structured representation of a whole document.
 *
 * The document tree is stored as a flat list of nodes; parent/child links are expressed
 * as indices into that list. Root-level nodes have no parent.
 *
 * NOTE(review): the upstream documentation refers to `body_roots()` / `furniture_roots()`
 * (iterate top-level content by layer) and `validate()` (check that node indices are in
 * bounds and parent/child links are bidirectionally consistent). None of these is declared
 * on this Kotlin class — confirm where, if anywhere, they are exposed to Kotlin callers.
 */
data class DocumentStructure(
    /** Every node of the document, in document/reading order. */
    val nodes: List<DocumentNode> = emptyList(),
    /**
     * Identifier of the format the document came from (e.g. "docx", "pptx", "html", "pdf").
     *
     * Lets renderers apply format-aware heuristics when converting the tree to an
     * output format.
     */
    val sourceFormat: String? = null,
    /**
     * Resolved relationships between nodes (footnote refs, citations, anchor links, etc.).
     *
     * Filled in while the structure is derived from the internal document representation;
     * empty when no relationships were detected.
     */
    val relationships: List<DocumentRelationship> = emptyList(),
    /**
     * Node type names that occur in this document, sorted and de-duplicated.
     *
     * Each entry is the snake_case `node_type` tag of a `NodeContent` variant, such as
     * `"paragraph"`, `"heading"` or `"table"`.
     *
     * Derived from [nodes] by `DocumentStructure.finalize_node_types`; the list stays
     * empty until that step has run (internal construction paths run it at the end of
     * derivation).
     */
    val nodeTypes: List<String> = emptyList(),
)
|
||||
63
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/DocxAppProperties.kt
generated
Normal file
63
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/DocxAppProperties.kt
generated
Normal file
@@ -0,0 +1,63 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * DOCX application properties, as read from `docProps/app.xml`.
 *
 * Holds Word-specific document statistics and metadata. Every property is optional and
 * defaults to `null`.
 */
data class DocxAppProperties(
    /** Name of the producing application (e.g., "Microsoft Office Word"). */
    val application: String? = null,
    /** Version of the producing application. */
    val appVersion: String? = null,
    /** Filename of the template. */
    val template: String? = null,
    /** Total editing time, in minutes. */
    val totalTime: Int? = null,
    /** Page count. */
    val pages: Int? = null,
    /** Word count. */
    val words: Int? = null,
    /** Character count, not counting spaces. */
    val characters: Int? = null,
    /** Character count, counting spaces. */
    val charactersWithSpaces: Int? = null,
    /** Line count. */
    val lines: Int? = null,
    /** Paragraph count. */
    val paragraphs: Int? = null,
    /** Name of the company. */
    val company: String? = null,
    /** Security level of the document. */
    val docSecurity: Int? = null,
    /** The "scale crop" flag. */
    val scaleCrop: Boolean? = null,
    /** The "links up to date" flag. */
    val linksUpToDate: Boolean? = null,
    /** The "shared document" flag. */
    val sharedDoc: Boolean? = null,
    /** The "hyperlinks changed" flag. */
    val hyperlinksChanged: Boolean? = null,
)
|
||||
53
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/DocxMetadata.kt
generated
Normal file
53
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/DocxMetadata.kt
generated
Normal file
@@ -0,0 +1,53 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * Metadata of a Word document.
 *
 * Obtained from DOCX files through the shared Office Open XML metadata extraction, which
 * ties into the `office_metadata` module for core, application and custom properties.
 */
data class DocxMetadata(
    /**
     * Dublin Core metadata from `docProps/core.xml`.
     *
     * Covers title, creator, subject, keywords, dates and similar fields. The format is
     * shared between DOCX, PPTX and XLSX documents.
     */
    val coreProperties: CoreProperties? = null,
    /**
     * Word-specific statistics from `docProps/app.xml`.
     *
     * Covers word count, page count, paragraph count, editing time and similar fields.
     * This is the DOCX-specific variant of the Office application properties.
     */
    val appProperties: DocxAppProperties? = null,
    /**
     * User-defined properties from `docProps/custom.xml`.
     *
     * Key-value pairs set by users or applications; a value may be a string, a number,
     * a boolean or a date.
     */
    val customProperties: Map<String, Any>? = null,
)
|
||||
29
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/DrawingType.kt
generated
Normal file
29
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/DrawingType.kt
generated
Normal file
@@ -0,0 +1,29 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/** Placement of a drawing: either inline or anchored. */
sealed class DrawingType {
    /** The drawing is inline. */
    object Inline : DrawingType()

    /**
     * The drawing is anchored.
     *
     * NOTE(review): what [value] encodes is not visible in this file — confirm against
     * the producer of this type.
     */
    data class Anchored(val value: String) : DrawingType()
}
|
||||
40
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/Element.kt
generated
Normal file
40
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/Element.kt
generated
Normal file
@@ -0,0 +1,40 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * A semantic element extracted from a document.
 *
 * Models one logical unit of content: it carries a semantic classification, a unique
 * identifier, and metadata used to track where the content came from and where it sits.
 */
data class Element(
    /** Identifier that is unique to this element. */
    val elementId: String,
    /** Semantic classification of the element. */
    val elementType: ElementType,
    /** The element's text content. */
    val text: String,
    /** Metadata describing the element. */
    val metadata: ElementMetadata,
)
|
||||
37
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/ElementMetadata.kt
generated
Normal file
37
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/ElementMetadata.kt
generated
Normal file
@@ -0,0 +1,37 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/** Metadata attached to a semantic element. */
data class ElementMetadata(
    /** 1-indexed page number. */
    val pageNumber: Int? = null,
    /** Name of the source file or document. */
    val filename: String? = null,
    /** Bounding box coordinates, when available. */
    val coordinates: BoundingBox? = null,
    /** Index of the element's position within the element sequence. */
    val elementIndex: Long? = null,
    /** Extra, custom metadata entries. */
    val additional: Map<String, String> = emptyMap(),
)
|
||||
99
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/ElementType.kt
generated
Normal file
99
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/ElementType.kt
generated
Normal file
@@ -0,0 +1,99 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * Classification of a semantic element.
 *
 * Sorts text content into semantic units for downstream processing, covering the element
 * types commonly found in Unstructured documents.
 *
 * Each constant has a snake_case wire name; [toWire] and [fromWire] convert between the
 * constant and that name.
 */
enum class ElementType {
    /** The title of the document. */
    @com.fasterxml.jackson.annotation.JsonProperty("title")
    TITLE,

    /** Body of main narrative text. */
    @com.fasterxml.jackson.annotation.JsonProperty("narrative_text")
    NARRATIVE_TEXT,

    /** Heading of a section. */
    @com.fasterxml.jackson.annotation.JsonProperty("heading")
    HEADING,

    /** Item of a list (bullet, numbered, etc.). */
    @com.fasterxml.jackson.annotation.JsonProperty("list_item")
    LIST_ITEM,

    /** A table. */
    @com.fasterxml.jackson.annotation.JsonProperty("table")
    TABLE,

    /** An image. */
    @com.fasterxml.jackson.annotation.JsonProperty("image")
    IMAGE,

    /** Marker for a page break. */
    @com.fasterxml.jackson.annotation.JsonProperty("page_break")
    PAGE_BREAK,

    /** A block of code. */
    @com.fasterxml.jackson.annotation.JsonProperty("code_block")
    CODE_BLOCK,

    /** A block quote. */
    @com.fasterxml.jackson.annotation.JsonProperty("block_quote")
    BLOCK_QUOTE,

    /** Text of a footer. */
    @com.fasterxml.jackson.annotation.JsonProperty("footer")
    FOOTER,

    /** Text of a header. */
    @com.fasterxml.jackson.annotation.JsonProperty("header")
    HEADER;

    /** Returns the wire name of this constant. */
    @com.fasterxml.jackson.annotation.JsonValue
    fun toWire(): String = when (this) {
        TITLE -> "title"
        NARRATIVE_TEXT -> "narrative_text"
        HEADING -> "heading"
        LIST_ITEM -> "list_item"
        TABLE -> "table"
        IMAGE -> "image"
        PAGE_BREAK -> "page_break"
        CODE_BLOCK -> "code_block"
        BLOCK_QUOTE -> "block_quote"
        FOOTER -> "footer"
        HEADER -> "header"
    }

    companion object {
        // Reverse lookup built from toWire() so the wire names are spelled out only once.
        private val constantsByWireName: Map<String, ElementType> =
            values().associateBy { it.toWire() }

        /**
         * Returns the constant whose wire name is [value].
         *
         * @throws IllegalArgumentException if [value] is not a known wire name.
         */
        @com.fasterxml.jackson.annotation.JsonCreator
        @JvmStatic
        fun fromWire(value: String): ElementType =
            constantsByWireName[value]
                ?: throw IllegalArgumentException("Unknown ElementType value: $value")
    }
}
|
||||
46
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/EmailAttachment.kt
generated
Normal file
46
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/EmailAttachment.kt
generated
Normal file
@@ -0,0 +1,46 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * An attachment of an email.
 *
 * Carries the attachment's metadata and, optionally, its content.
 *
 * Equality and hashing compare [data] by content: two attachments holding equal bytes in
 * distinct arrays are equal. The generated data-class implementation would compare the
 * array by identity instead, which is why [equals] and [hashCode] are overridden.
 */
data class EmailAttachment(
    /** Name of the attachment (from the Content-Disposition header). */
    val name: String? = null,
    /** Filename of the attachment. */
    val filename: String? = null,
    /** MIME type of the attachment. */
    val mimeType: String? = null,
    /** Size in bytes. */
    val size: Long? = null,
    /** Whether the attachment is an image. */
    val isImage: Boolean,
    /**
     * Content of the attachment, if it was extracted; `null` otherwise.
     *
     * The array is held by reference and is not copied by this class.
     */
    val data: ByteArray? = null,
) {
    /** Structural equality over all properties, comparing [data] byte-by-byte. */
    override fun equals(other: Any?): Boolean {
        if (this === other) return true
        if (other !is EmailAttachment) return false
        return name == other.name &&
            filename == other.filename &&
            mimeType == other.mimeType &&
            size == other.size &&
            isImage == other.isImage &&
            // Arrays.equals is null-safe: two nulls are equal, null vs non-null is not.
            java.util.Arrays.equals(data, other.data)
    }

    /** Hash consistent with [equals]: [data] contributes its content hash (0 for `null`). */
    override fun hashCode(): Int =
        31 * java.util.Objects.hash(name, filename, mimeType, size, isImage) +
            java.util.Arrays.hashCode(data)
}
|
||||
49
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/EmailConfig.kt
generated
Normal file
49
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/EmailConfig.kt
generated
Normal file
@@ -0,0 +1,49 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/** Settings that control email extraction. */
data class EmailConfig(
    /**
     * Windows codepage number applied when an MSG file carries no codepage property.
     *
     * The default, `null`, means windows-1252 is used as the fallback.
     *
     * Supplying an unrecognized or invalid codepage number (0 included) does not raise an
     * error or a warning: extraction silently falls back to windows-1252, exactly as it
     * does when the MSG file itself names an unrecognized codepage. Verify the output
     * when passing unusual values.
     *
     * Common values:
     *
     * - 1250: Central European (Polish, Czech, Hungarian, etc.)
     * - 1251: Cyrillic (Russian, Ukrainian, Bulgarian, etc.)
     * - 1252: Western European (default)
     * - 1253: Greek
     * - 1254: Turkish
     * - 1255: Hebrew
     * - 1256: Arabic
     * - 932: Japanese (Shift-JIS)
     * - 936: Simplified Chinese (GBK)
     */
    val msgFallbackCodepage: Int? = null,
)
|
||||
56
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/EmailExtractionResult.kt
generated
Normal file
56
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/EmailExtractionResult.kt
generated
Normal file
@@ -0,0 +1,56 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * Result of extracting an email.
 *
 * Full representation of an extracted email message (.eml or .msg): headers, body
 * content and attachments.
 */
data class EmailExtractionResult(
    /** Subject line of the email. */
    val subject: String? = null,
    /** Email address of the sender. */
    val fromEmail: String? = null,
    /** Email addresses of the primary recipients. */
    val toEmails: List<String> = emptyList(),
    /** Email addresses of the CC recipients. */
    val ccEmails: List<String> = emptyList(),
    /** Email addresses of the BCC recipients. */
    val bccEmails: List<String> = emptyList(),
    /** Date/timestamp of the email. */
    val date: String? = null,
    /** Value of the Message-ID header. */
    val messageId: String? = null,
    /** Email body in its plain-text version. */
    val plainText: String? = null,
    /** Email body in its HTML version. */
    val htmlContent: String? = null,
    /** Cleaned/processed text content. Aliased as `cleaned_text` for back-compat. */
    val content: String,
    /** Attachments of the email. */
    val attachments: List<EmailAttachment> = emptyList(),
    /** Further email headers and metadata. */
    val metadata: Map<String, String> = emptyMap(),
)
|
||||
45
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/EmailMetadata.kt
generated
Normal file
45
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/EmailMetadata.kt
generated
Normal file
@@ -0,0 +1,45 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * Metadata extracted from .eml and .msg email files.
 *
 * Covers sender and recipient information, the message ID, and the list of attachments.
 */
data class EmailMetadata(
    /** Email address of the sender. */
    val fromEmail: String? = null,
    /** Display name of the sender. */
    val fromName: String? = null,
    /** The primary recipients. */
    val toEmails: List<String> = emptyList(),
    /** The CC recipients. */
    val ccEmails: List<String> = emptyList(),
    /** The BCC recipients. */
    val bccEmails: List<String> = emptyList(),
    /** Value of the Message-ID header. */
    val messageId: String? = null,
    /** Filenames of the attachments. */
    val attachments: List<String> = emptyList(),
)
|
||||
37
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/EmbeddedChanges.kt
generated
Normal file
37
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/EmbeddedChanges.kt
generated
Normal file
@@ -0,0 +1,37 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/** Differences in embedded archive children between two results, `a` and `b`. */
data class EmbeddedChanges(
    /** Children that exist in `b` but not in `a`; children are matched by `path`. */
    val added: List<ArchiveEntry> = emptyList(),
    /** Children that exist in `a` but not in `b`; children are matched by `path`. */
    val removed: List<ArchiveEntry> = emptyList(),
    /**
     * Children that exist in both results but whose content differs; children are
     * matched by `path`.
     *
     * Every entry carries the diff of the nested `ExtractionResult`.
     */
    val changed: List<EmbeddedDiff> = emptyList(),
)
|
||||
31
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/EmbeddedDiff.kt
generated
Normal file
31
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/EmbeddedDiff.kt
generated
Normal file
@@ -0,0 +1,31 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/** Diff of one embedded archive entry that is present in both results. */
data class EmbeddedDiff(
    /** Path of the entry, relative to the archive; identifies the entry. */
    val path: String,
    /** Recursive diff of the extraction result of the entry. */
    val diff: ExtractionDiff,
)
|
||||
40
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/EmbeddedFile.kt
generated
Normal file
40
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/EmbeddedFile.kt
generated
Normal file
@@ -0,0 +1,40 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * Descriptor of an embedded file extracted from the PDF name tree.
 *
 * Equality and hashing compare [data] by content: two descriptors holding equal bytes in
 * distinct arrays are equal. The generated data-class implementation would compare the
 * array by identity instead, which is why [equals] and [hashCode] are overridden.
 */
data class EmbeddedFile(
    /** Filename, as stored in the PDF name tree. */
    val name: String,
    /** Raw bytes of the file from the embedded stream (already decompressed by lopdf). */
    val data: ByteArray,
    /**
     * Byte count of the original stream while still compressed (before decompression).
     *
     * Callers use it to compute the decompression ratio and to detect zip-bomb-style
     * attacks, where a tiny compressed stream expands to gigabytes of data.
     */
    val compressedSize: Long,
    /** MIME type when the filespec specifies one, otherwise `null`. */
    val mimeType: String? = null,
) {
    /** Structural equality over all properties, comparing [data] byte-by-byte. */
    override fun equals(other: Any?): Boolean {
        if (this === other) return true
        if (other !is EmbeddedFile) return false
        return name == other.name &&
            compressedSize == other.compressedSize &&
            mimeType == other.mimeType &&
            java.util.Arrays.equals(data, other.data)
    }

    /** Hash consistent with [equals]: [data] contributes its content hash. */
    override fun hashCode(): Int =
        31 * java.util.Objects.hash(name, compressedSize, mimeType) +
            java.util.Arrays.hashCode(data)
}
|
||||
25
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/EmbeddingBackendBridge.kt
generated
Normal file
25
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/EmbeddingBackendBridge.kt
generated
Normal file
@@ -0,0 +1,25 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * Kotlin-side registry of embedding backends, kept alongside the registrations made in
 * the native layer through [KreuzbergBridge].
 *
 * Every mutation calls the native layer first and updates the Kotlin-side map only after
 * that call has returned. If the native call throws, the map is left untouched, so
 * [getAll] does not report a registration state the native layer rejected.
 *
 * NOTE(review): the backing map is a plain mutable map with no synchronization; confirm
 * that callers do not register or unregister backends concurrently.
 */
object EmbeddingBackendBridge {
    private val registered = mutableMapOf<String, IEmbeddingBackend>()

    /**
     * Registers [impl] with the native layer and records it under the name it reports.
     *
     * An existing Kotlin-side entry with the same name is replaced.
     */
    fun register(impl: IEmbeddingBackend): Unit {
        // Resolve the name up front so a failing name() prevents any registration.
        val name = impl.name()
        KreuzbergBridge.nativeRegisterEmbeddingBackend(impl)
        registered[name] = impl
    }

    /** Unregisters the backend called [name] from the native layer, then forgets it here. */
    fun unregister(name: String): Unit {
        KreuzbergBridge.nativeUnregisterEmbeddingBackend(name)
        registered.remove(name)
    }

    /** Clears all embedding backends in the native layer, then empties the Kotlin-side map. */
    fun clearAll(): Unit {
        KreuzbergBridge.nativeClearEmbeddingBackends()
        registered.clear()
    }

    /** Returns a snapshot copy of the backends recorded here, keyed by name. */
    fun getAll(): Map<String, IEmbeddingBackend> = registered.toMap()
}
|
||||
71
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/EmbeddingConfig.kt
generated
Normal file
71
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/EmbeddingConfig.kt
generated
Normal file
@@ -0,0 +1,71 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
import java.nio.file.Path
|
||||
|
||||
/**
 * Settings for generating embeddings of text chunks.
 *
 * Embeddings are produced with ONNX models through the vendored embedding engine; the
 * `embeddings` feature has to be enabled.
 */
data class EmbeddingConfig(
    /**
     * Embedding model to use.
     *
     * NOTE(review): the upstream documentation says this defaults to the "balanced"
     * preset when not specified, but the parameter has no default in this class and must
     * always be supplied — confirm which is intended.
     */
    @field:com.fasterxml.jackson.databind.annotation.JsonSerialize(`as` = EmbeddingModelType::class)
    val model: EmbeddingModelType,
    /** Whether embedding vectors are normalized (recommended for cosine similarity). */
    val normalize: Boolean = true,
    /** Batch size used when generating embeddings. */
    val batchSize: Long = 32L,
    /** Whether progress of the model download is shown. */
    val showDownloadProgress: Boolean = false,
    /**
     * Custom directory in which model files are cached.
     *
     * When not specified, `~/.cache/kreuzberg/embeddings/` is used. Setting it gives full
     * control over where models are downloaded to.
     */
    val cacheDir: Path? = null,
    /**
     * Hardware acceleration for the embedding ONNX model.
     *
     * When set, it determines the execution provider (CPU, CUDA, CoreML, TensorRT) used
     * for inference. The default, `null`, auto-selects per platform.
     */
    val acceleration: AccelerationConfig? = null,
    /**
     * Upper bound, in seconds of wall-clock time, for a single `embed()` call when
     * `EmbeddingModelType.Plugin` is used.
     *
     * Only the in-process plugin path is affected. The limit guards against hung
     * host-language backends (e.g. a Python callback deadlocked on the GIL, a model stuck
     * on CUDA OOM retries, etc.): on timeout the dispatcher returns `Plugin` instead of
     * blocking forever.
     *
     * `null` disables the timeout. Increase the value for large batches on slow hardware.
     *
     * NOTE(review): the upstream documentation describes a conservative 60-second
     * default, but the default declared here is `null` — confirm which value is
     * effectively applied when this property is left unset.
     */
    val maxEmbedDurationSecs: Long? = null,
)
|
||||
140
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/EmbeddingModelType.kt
generated
Normal file
140
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/EmbeddingModelType.kt
generated
Normal file
@@ -0,0 +1,140 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * The kinds of embedding model Kreuzberg can be configured with.
 *
 * JSON (de)serialization of the hierarchy goes through
 * `EmbeddingModelTypeDeserializer` / `EmbeddingModelTypeSerializer`; each variant
 * resets both annotations to Jackson's `None` so the variant itself is handled by
 * Jackson's default machinery.
 */
@com.fasterxml.jackson.databind.annotation.JsonDeserialize(using = EmbeddingModelTypeDeserializer::class)
@com.fasterxml.jackson.databind.annotation.JsonSerialize(using = EmbeddingModelTypeSerializer::class)
sealed class EmbeddingModelType {
    /**
     * A preset model configuration (the recommended choice).
     *
     * @property name name of the preset.
     */
    @com.fasterxml.jackson.databind.annotation.JsonDeserialize(using = com.fasterxml.jackson.databind.JsonDeserializer.None::class)
    @com.fasterxml.jackson.databind.annotation.JsonSerialize(using = com.fasterxml.jackson.databind.JsonSerializer.None::class)
    data class Preset(val name: String) : EmbeddingModelType()

    /**
     * A custom ONNX model hosted on HuggingFace.
     *
     * @property modelId identifier of the model.
     * @property dimensions dimensionality of the embedding vectors.
     */
    @com.fasterxml.jackson.databind.annotation.JsonDeserialize(using = com.fasterxml.jackson.databind.JsonDeserializer.None::class)
    @com.fasterxml.jackson.databind.annotation.JsonSerialize(using = com.fasterxml.jackson.databind.JsonSerializer.None::class)
    data class Custom(val modelId: String, val dimensions: Long) : EmbeddingModelType()

    /**
     * A provider-hosted embedding model reached via liter-llm.
     *
     * The model is the one named in the nested [LlmConfig]
     * (e.g. `"openai/text-embedding-3-small"`).
     *
     * @property llm configuration of the hosted model.
     */
    @com.fasterxml.jackson.databind.annotation.JsonDeserialize(using = com.fasterxml.jackson.databind.JsonDeserializer.None::class)
    @com.fasterxml.jackson.databind.annotation.JsonSerialize(using = com.fasterxml.jackson.databind.JsonSerializer.None::class)
    data class Llm(val llm: LlmConfig) : EmbeddingModelType()

    /**
     * An in-process embedding backend registered through the plugin system.
     *
     * The caller registers an `EmbeddingBackend` once (e.g. a wrapper around an
     * already-loaded `llama-cpp-python`, `sentence-transformers`, or tuned ONNX model)
     * and then refers to it by name in the configuration. Kreuzberg calls back into the
     * registered backend during chunking and standalone embed requests — no HuggingFace
     * download, no ONNX Runtime requirement, no HTTP sidecar.
     *
     * With this variant only these `EmbeddingConfig` fields apply: `normalize`
     * (post-call L2 normalization) and `max_embed_duration_secs` (dispatcher timeout).
     * The model-loading fields (`batch_size`, `cache_dir`, `show_download_progress`,
     * `acceleration`) are ignored because the host owns the model lifecycle.
     *
     * Semantic chunking falls back to `ChunkingConfig.max_characters` for this variant,
     * since there is no preset from which to look up a chunk-size ceiling — size the
     * context window through `max_characters` directly.
     *
     * See `register_embedding_backend`.
     *
     * @property name name under which the backend was registered.
     */
    @com.fasterxml.jackson.databind.annotation.JsonDeserialize(using = com.fasterxml.jackson.databind.JsonDeserializer.None::class)
    @com.fasterxml.jackson.databind.annotation.JsonSerialize(using = com.fasterxml.jackson.databind.JsonSerializer.None::class)
    data class Plugin(val name: String) : EmbeddingModelType()
}
|
||||
|
||||
/**
 * Jackson deserializer for the internally tagged [EmbeddingModelType] hierarchy.
 *
 * The incoming JSON must be an object whose `"type"` field names the variant
 * (`"preset"`, `"custom"`, `"llm"` or `"plugin"`). The remaining fields are bound to
 * the matching variant class.
 */
private class EmbeddingModelTypeDeserializer :
    com.fasterxml.jackson.databind.deser.std.StdDeserializer<EmbeddingModelType>(EmbeddingModelType::class.java) {
    /**
     * Reads one [EmbeddingModelType] value from [parser].
     *
     * @param parser source of the JSON content.
     * @param ctx context used to bind the payload to the selected variant class.
     * @return the decoded variant.
     * @throws com.fasterxml.jackson.databind.exc.MismatchedInputException if the input
     *   is missing or is not a JSON object.
     * @throws com.fasterxml.jackson.databind.exc.InvalidFormatException if the `"type"`
     *   tag is absent or not one of the known variants.
     */
    override fun deserialize(
        parser: com.fasterxml.jackson.core.JsonParser,
        ctx: com.fasterxml.jackson.databind.DeserializationContext,
    ): EmbeddingModelType {
        // Read as a generic node first: forcing ObjectNode at the call site would fail
        // with ClassCastException / NullPointerException on non-object or empty input.
        val tree: com.fasterxml.jackson.databind.JsonNode? =
            parser.codec.readTree<com.fasterxml.jackson.databind.JsonNode>(parser)
        val node = tree as? com.fasterxml.jackson.databind.node.ObjectNode
            ?: throw com.fasterxml.jackson.databind.exc.MismatchedInputException.from(
                parser,
                EmbeddingModelType::class.java,
                "Expected a JSON object for EmbeddingModelType",
            )

        val tag = node.get("type")?.asText()
        // Bind from a copy without the discriminator so the variant classes never see
        // an unexpected "type" property and the caller's tree is left untouched.
        val payload = node.deepCopy().apply { remove("type") }

        return when (tag) {
            "preset" -> ctx.readTreeAsValue(payload, EmbeddingModelType.Preset::class.java)
            "custom" -> ctx.readTreeAsValue(payload, EmbeddingModelType.Custom::class.java)
            "llm" -> ctx.readTreeAsValue(payload, EmbeddingModelType.Llm::class.java)
            "plugin" -> ctx.readTreeAsValue(payload, EmbeddingModelType.Plugin::class.java)
            else -> throw com.fasterxml.jackson.databind.exc.InvalidFormatException(
                parser,
                "Unknown EmbeddingModelType tag",
                tag,
                EmbeddingModelType::class.java,
            )
        }
    }
}
|
||||
|
||||
/**
 * Jackson serializer for the internally tagged [EmbeddingModelType] hierarchy.
 *
 * Each variant is written as its own fields plus a `"type"` discriminator
 * (`"preset"`, `"custom"`, `"llm"` or `"plugin"`).
 */
private class EmbeddingModelTypeSerializer :
    com.fasterxml.jackson.databind.ser.std.StdSerializer<EmbeddingModelType>(EmbeddingModelType::class.java) {
    /**
     * Mapper used only when the generator's codec is not an `ObjectMapper`.
     * Built lazily and reused, so module discovery does not run on every call.
     */
    private val fallbackMapper: com.fasterxml.jackson.databind.ObjectMapper by lazy {
        com.fasterxml.jackson.databind.ObjectMapper().findAndRegisterModules()
    }

    /**
     * Writes [value] to [gen] as a JSON object carrying the variant's fields and the
     * `"type"` tag.
     *
     * @param value the variant to serialize.
     * @param gen generator receiving the output; its codec is reused when it is an
     *   `ObjectMapper`.
     * @param provider unused.
     */
    override fun serialize(
        value: EmbeddingModelType,
        gen: com.fasterxml.jackson.core.JsonGenerator,
        provider: com.fasterxml.jackson.databind.SerializerProvider,
    ) {
        val mapper = (gen.codec as? com.fasterxml.jackson.databind.ObjectMapper) ?: fallbackMapper

        // Exhaustive over the sealed hierarchy: a new variant fails to compile here.
        val tag = when (value) {
            is EmbeddingModelType.Preset -> "preset"
            is EmbeddingModelType.Custom -> "custom"
            is EmbeddingModelType.Llm -> "llm"
            is EmbeddingModelType.Plugin -> "plugin"
        }

        // The variants opt out of this serializer, so valueToTree yields their plain
        // field representation; the discriminator is appended afterwards.
        val node = mapper.valueToTree<com.fasterxml.jackson.databind.node.ObjectNode>(value)
        node.put("type", tag)
        mapper.writeTree(gen, node)
    }
}
|
||||
46
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/EmbeddingPreset.kt
generated
Normal file
46
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/EmbeddingPreset.kt
generated
Normal file
@@ -0,0 +1,46 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * A preset configuration for a common RAG use case.
 *
 * A preset bundles chunk size, overlap and embedding model into one configuration
 * tuned for a specific scenario. All textual fields are plain owned strings, so
 * instances are safe to copy and to pass across language boundaries.
 *
 * @property name name of the preset.
 * @property chunkSize chunk size used by the preset.
 * @property overlap overlap between chunks.
 * @property modelRepo HuggingFace repository name for the model.
 * @property pooling pooling strategy: "cls" or "mean".
 * @property modelFile path to the ONNX model file within the repo.
 * @property dimensions dimensionality of the embedding vectors.
 * @property description free-text description of the preset.
 */
data class EmbeddingPreset(
    val name: String,
    val chunkSize: Long,
    val overlap: Long,
    val modelRepo: String,
    val pooling: String,
    val modelFile: String,
    val dimensions: Long,
    val description: String,
)
|
||||
33
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/EpubMetadata.kt
generated
Normal file
33
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/EpubMetadata.kt
generated
Normal file
@@ -0,0 +1,33 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * EPUB metadata (Dublin Core extensions).
 *
 * Every field is optional and defaults to `null`.
 * NOTE(review): the field names appear to mirror Dublin Core elements
 * (coverage, format, relation, source, type) — confirm against the extractor.
 */
data class EpubMetadata(
    val coverage: String? = null,
    val dcFormat: String? = null,
    val relation: String? = null,
    val source: String? = null,
    val dcType: String? = null,
    val coverImage: String? = null,
)
|
||||
26
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/ErrorMetadata.kt
generated
Normal file
26
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/ErrorMetadata.kt
generated
Normal file
@@ -0,0 +1,26 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * Error metadata (for batch operations).
 *
 * @property errorType the type of the error.
 * @property message the error message.
 */
data class ErrorMetadata(
    val errorType: String,
    val message: String,
)
|
||||
36
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/ExcelMetadata.kt
generated
Normal file
36
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/ExcelMetadata.kt
generated
Normal file
@@ -0,0 +1,36 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * Format metadata for Excel/spreadsheet documents.
 *
 * Marks the document as a spreadsheet source through the `FormatMetadata.Excel`
 * discriminant; the sheet count and sheet names live in this class.
 *
 * @property sheetCount number of sheets in the workbook.
 * @property sheetNames names of all sheets in the workbook.
 */
data class ExcelMetadata(
    val sheetCount: Int? = null,
    val sheetNames: List<String>? = null,
)
|
||||
48
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/ExcelSheet.kt
generated
Normal file
48
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/ExcelSheet.kt
generated
Normal file
@@ -0,0 +1,48 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * A single worksheet of an Excel workbook.
 *
 * Holds the sheet's content converted to Markdown together with its dimensional
 * statistics.
 *
 * @property name sheet name as it appears in Excel.
 * @property markdown sheet content converted to Markdown tables.
 * @property rowCount number of rows.
 * @property colCount number of columns.
 * @property cellCount total number of non-empty cells.
 * @property tableCells pre-extracted table cells as a two-dimensional list of cell
 *   values, populated during Markdown generation so the Markdown does not have to be
 *   parsed again; `null` for empty sheets.
 */
data class ExcelSheet(
    val name: String,
    val markdown: String,
    val rowCount: Long,
    val colCount: Long,
    val cellCount: Long,
    val tableCells: List<List<String>>? = null,
)
|
||||
47
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/ExcelWorkbook.kt
generated
Normal file
47
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/ExcelWorkbook.kt
generated
Normal file
@@ -0,0 +1,47 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * An Excel workbook.
 *
 * Groups every sheet of an Excel file (.xlsx, .xls, etc.) with its extracted content
 * and metadata.
 *
 * @property sheets all sheets in the workbook.
 * @property metadata workbook-level metadata (author, creation date, etc.).
 * @property revisions collaborative-edit revision headers taken from
 *   `xl/revisions/revisionHeaders.xml`. They are populated for legacy shared-workbook
 *   `.xlsx` files that contain the `xl/revisions/` directory. Each `<header>` element
 *   becomes one `DocumentRevision { kind: FormatChange }` carrying the header's `guid`
 *   (→ `revision_id`), `userName` (→ `author`) and `dateTime` (→ `timestamp`);
 *   `anchor` and `delta` are `null`/empty for v1 (per-cell log parsing is a follow-up).
 *   The value is `null` when `xl/revisions/revisionHeaders.xml` is absent.
 */
data class ExcelWorkbook(
    val sheets: List<ExcelSheet> = emptyList(),
    val metadata: Map<String, String> = emptyMap(),
    val revisions: List<DocumentRevision>? = null,
)
|
||||
69
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/ExecutionProviderType.kt
generated
Normal file
69
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/ExecutionProviderType.kt
generated
Normal file
@@ -0,0 +1,69 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * ONNX Runtime execution provider type.
 *
 * Selects the hardware backend used for model inference. [AUTO] picks the best
 * available provider for the platform.
 */
enum class ExecutionProviderType {
    /** Auto-select: CoreML on macOS, CUDA on Linux, CPU elsewhere. */
    @com.fasterxml.jackson.annotation.JsonProperty("auto")
    AUTO,

    /** CPU execution provider (always available). */
    @com.fasterxml.jackson.annotation.JsonProperty("cpu")
    CPU,

    /** Apple CoreML (macOS/iOS Neural Engine + GPU). */
    @com.fasterxml.jackson.annotation.JsonProperty("coreml")
    CORE_ML,

    /** NVIDIA CUDA GPU acceleration. */
    @com.fasterxml.jackson.annotation.JsonProperty("cuda")
    CUDA,

    /** NVIDIA TensorRT (optimized CUDA inference). */
    @com.fasterxml.jackson.annotation.JsonProperty("tensorrt")
    TENSOR_RT;

    /** Returns the lowercase wire name used for this constant in JSON. */
    @com.fasterxml.jackson.annotation.JsonValue
    fun toWire(): String {
        return when (this) {
            AUTO -> "auto"
            CPU -> "cpu"
            CORE_ML -> "coreml"
            CUDA -> "cuda"
            TENSOR_RT -> "tensorrt"
        }
    }

    companion object {
        /**
         * Resolves a wire name back to its constant.
         *
         * @param value wire name as produced by [toWire].
         * @return the constant whose wire name equals [value].
         * @throws IllegalArgumentException if no constant has that wire name.
         */
        @com.fasterxml.jackson.annotation.JsonCreator
        @JvmStatic
        fun fromWire(value: String): ExecutionProviderType {
            // toWire is the single source of the name mapping; search it in reverse.
            val match = values().firstOrNull { candidate -> candidate.toWire() == value }
            return match ?: throw IllegalArgumentException("Unknown ExecutionProviderType value: $value")
        }
    }
}
|
||||
88
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/ExtractedImage.kt
generated
Normal file
88
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/ExtractedImage.kt
generated
Normal file
@@ -0,0 +1,88 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * An image extracted from a document.
 *
 * Carries the raw image bytes, descriptive metadata and, when the image was OCRed,
 * the nested OCR result. Keeping the raw bytes makes the value usable from any
 * language — callers can convert to PIL.Image (Python), Sharp (Node.js) or other
 * formats as needed.
 *
 * Because [data] is a [ByteArray], [equals] and [hashCode] are overridden to compare
 * the bytes by content; the compiler-generated data-class versions would compare the
 * array by reference, making two images with identical bytes unequal.
 */
data class ExtractedImage(
    /** Raw image data (PNG, JPEG, WebP, etc. bytes). */
    val data: ByteArray,
    /** Image format (e.g., "jpeg", "png", "webp"). */
    val format: String,
    /** Zero-indexed position of this image in the document/page. */
    val imageIndex: Int,
    /** Page/slide number where the image was found (1-indexed). */
    val pageNumber: Int? = null,
    /** Image width in pixels. */
    val width: Int? = null,
    /** Image height in pixels. */
    val height: Int? = null,
    /** Colorspace information (e.g., "RGB", "CMYK", "Gray"). */
    val colorspace: String? = null,
    /** Bits per color component (e.g., 8, 16). */
    val bitsPerComponent: Int? = null,
    /** Whether this image is a mask image. */
    val isMask: Boolean,
    /** Optional description of the image. */
    val description: String? = null,
    /**
     * Nested OCR extraction result (if the image was OCRed).
     *
     * When OCR is performed on this image the result is embedded here rather than in
     * a separate collection, making the relationship explicit.
     */
    val ocrResult: ExtractionResult? = null,
    /**
     * Bounding box of the image on the page (PDF coordinates: x0=left, y0=bottom,
     * x1=right, y1=top). Only populated for PDF-extracted images when position data
     * is available from the PDF extractor.
     */
    val boundingBox: BoundingBox? = null,
    /**
     * Original source path of the image within the document archive
     * (e.g., "media/image1.png" in DOCX). Used for rendering image references when
     * the binary data is not extracted.
     */
    val sourcePath: String? = null,
    /**
     * Heuristic classification of what this image likely depicts.
     * `null` if classification was disabled or inconclusive.
     */
    val imageKind: ImageKind? = null,
    /** Confidence score for [imageKind], in the range 0.0 to 1.0. */
    val kindConfidence: Float? = null,
    /**
     * Identifier shared across images that form a single logical figure
     * (e.g. all raster tiles of one technical drawing). `null` for singletons.
     */
    val clusterId: Int? = null,
) {
    /**
     * Structural equality over every property, with [data] compared by content.
     *
     * @param other the value to compare with.
     * @return `true` when [other] is an [ExtractedImage] whose properties all match.
     */
    override fun equals(other: Any?): Boolean {
        if (this === other) return true
        if (other !is ExtractedImage) return false
        return data.contentEquals(other.data) &&
            format == other.format &&
            imageIndex == other.imageIndex &&
            pageNumber == other.pageNumber &&
            width == other.width &&
            height == other.height &&
            colorspace == other.colorspace &&
            bitsPerComponent == other.bitsPerComponent &&
            isMask == other.isMask &&
            description == other.description &&
            ocrResult == other.ocrResult &&
            boundingBox == other.boundingBox &&
            sourcePath == other.sourcePath &&
            imageKind == other.imageKind &&
            // Use equals() rather than == so a NaN confidence equals itself, matching
            // what the generated data-class equality does for a boxed Float.
            (kindConfidence?.equals(other.kindConfidence) ?: (other.kindConfidence == null)) &&
            clusterId == other.clusterId
    }

    /** Hash code consistent with [equals]: [data] contributes its content hash. */
    override fun hashCode(): Int {
        var result = data.contentHashCode()
        result = 31 * result + format.hashCode()
        result = 31 * result + imageIndex.hashCode()
        result = 31 * result + (pageNumber?.hashCode() ?: 0)
        result = 31 * result + (width?.hashCode() ?: 0)
        result = 31 * result + (height?.hashCode() ?: 0)
        result = 31 * result + (colorspace?.hashCode() ?: 0)
        result = 31 * result + (bitsPerComponent?.hashCode() ?: 0)
        result = 31 * result + isMask.hashCode()
        result = 31 * result + (description?.hashCode() ?: 0)
        result = 31 * result + (ocrResult?.hashCode() ?: 0)
        result = 31 * result + (boundingBox?.hashCode() ?: 0)
        result = 31 * result + (sourcePath?.hashCode() ?: 0)
        result = 31 * result + (imageKind?.hashCode() ?: 0)
        result = 31 * result + (kindConfidence?.hashCode() ?: 0)
        result = 31 * result + (clusterId?.hashCode() ?: 0)
        return result
    }
}
|
||||
35
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/ExtractedImageMetadata.kt
generated
Normal file
35
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/ExtractedImageMetadata.kt
generated
Normal file
@@ -0,0 +1,35 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * Image metadata extracted from an image file.
 *
 * @property width image width in pixels.
 * @property height image height in pixels.
 * @property format image format (e.g., "PNG", "JPEG").
 * @property exifData EXIF data if available; empty by default.
 */
data class ExtractedImageMetadata(
    val width: Int,
    val height: Int,
    val format: String,
    val exifData: Map<String, String> = emptyMap(),
)
|
||||
41
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/ExtractedUri.kt
generated
Normal file
41
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/ExtractedUri.kt
generated
Normal file
@@ -0,0 +1,41 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * A URI extracted from a document.
 *
 * Stands for any link, reference or resource pointer found during extraction.
 *
 * @property url the URL or path string.
 * @property label optional human-readable display text for the link.
 * @property page optional page number where the URI was found (1-indexed).
 * @property kind semantic classification of the URI.
 */
data class ExtractedUri(
    val url: String,
    val label: String? = null,
    val page: Int? = null,
    val kind: UriKind,
)
|
||||
275
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/ExtractionConfig.kt
generated
Normal file
275
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/ExtractionConfig.kt
generated
Normal file
@@ -0,0 +1,275 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * Top-level configuration for an extraction run.
 *
 * Holds every option that influences the extraction process. According to the
 * upstream documentation it can be loaded from TOML, YAML, or JSON files, or
 * built programmatically.
 *
 * Unless stated otherwise, a `null` value means "feature disabled" or
 * "use the built-in default", as described on each property.
 */
data class ExtractionConfig(
    /** Enable caching of extraction results. */
    val useCache: Boolean = true,

    /** Enable quality post-processing. */
    val enableQualityProcessing: Boolean = true,

    /** OCR configuration (`null` = OCR disabled). */
    val ocr: OcrConfig? = null,

    /** Force OCR even for searchable PDFs. */
    val forceOcr: Boolean = false,

    /**
     * Force OCR on specific pages only (1-indexed page numbers, each must be >= 1).
     *
     * When set, only the listed pages are OCR'd regardless of text layer quality;
     * unlisted pages use native text extraction. Ignored when `force_ocr` is `true`.
     * Only applies to PDF documents. Duplicates are automatically deduplicated.
     * An `ocr` config is recommended for backend/language selection; defaults are
     * used if it is absent.
     */
    val forceOcrPages: List<Int>? = null,

    /**
     * Disable OCR entirely, even for images.
     *
     * When `true`, OCR is skipped for all document types. Images return metadata
     * only (dimensions, format, EXIF) without text extraction, and PDFs use only
     * native text extraction without OCR fallback.
     *
     * Cannot be `true` simultaneously with `force_ocr`.
     *
     * *Added in v4.7.0.*
     */
    val disableOcr: Boolean = false,

    /** Text chunking configuration (`null` = chunking disabled). */
    val chunking: ChunkingConfig? = null,

    /**
     * Content filtering configuration (`null` = use extractor defaults).
     *
     * Controls whether document "furniture" (headers, footers, watermarks,
     * repeating text) is included in or stripped from extraction results.
     * See `ContentFilterConfig` for per-field documentation.
     */
    val contentFilter: ContentFilterConfig? = null,

    /** Image extraction configuration (`null` = no image extraction). */
    val images: ImageExtractionConfig? = null,

    /** PDF-specific options (`null` = use defaults). */
    val pdfOptions: PdfConfig? = null,

    /** Token reduction configuration (`null` = no token reduction). */
    val tokenReduction: TokenReductionOptions? = null,

    /** Language detection configuration (`null` = no language detection). */
    val languageDetection: LanguageDetectionConfig? = null,

    /** Page extraction configuration (`null` = no page tracking). */
    val pages: PageConfig? = null,

    /** Keyword extraction configuration (`null` = no keyword extraction). */
    val keywords: KeywordConfig? = null,

    /** Post-processor configuration (`null` = use defaults). */
    val postprocessor: PostProcessorConfig? = null,

    /**
     * HTML to Markdown conversion options (`null` = use defaults).
     *
     * Configures how HTML documents are converted to Markdown, including heading
     * styles, list formatting, code block styles, and preprocessing options.
     */
    // NOTE(review): exposed as a plain String in this binding; the expected
    // encoding of the options is not visible here — confirm before relying on it.
    val htmlOptions: String? = null,

    /**
     * Styled HTML output configuration.
     *
     * When set alongside `output_format = OutputFormat.Html`, the extraction
     * pipeline uses `StyledHtmlRenderer`, which emits stable `kb-*` CSS class
     * hooks on every structural element and optionally embeds theme CSS or
     * user-supplied CSS in a `<style>` block.
     *
     * When `null`, the existing plain comrak-based HTML renderer is used.
     */
    val htmlOutput: HtmlOutputConfig? = null,

    /**
     * Default per-file timeout in seconds for batch extraction.
     *
     * When set, each file in a batch is canceled after this duration unless
     * overridden by `FileExtractionConfig.timeout_secs`.
     *
     * Defaults to 60 seconds to prevent pathological files (e.g. deeply nested
     * archives, documents with millions of cells) from running indefinitely and
     * exhausting caller resources. Set to `null` to disable the timeout for
     * trusted input or long-running workloads.
     */
    // The default is 60 (not null) so that it matches the documented default;
    // null explicitly means "no timeout".
    val extractionTimeoutSecs: Long? = 60L,

    /**
     * Maximum concurrent extractions in batch operations.
     *
     * Limits parallelism to prevent resource exhaustion when processing large
     * batches. When `null`, defaults to `(num_cpus × 1.5).ceil()`.
     */
    val maxConcurrentExtractions: Long? = null,

    /**
     * Result structure format.
     *
     * Controls whether results are returned in unified format (default) with all
     * content in the `content` field, or in element-based format with semantic
     * elements (for Unstructured-compatible output).
     */
    val resultFormat: ResultFormat = ResultFormat.UNIFIED,

    /**
     * Security limits for archive extraction.
     *
     * Controls maximum archive size, compression ratio, file count, and other
     * security thresholds to prevent decompression bomb attacks. Also caps
     * nesting depth, iteration count, entity / token length, total content size,
     * and table cell count for every extraction path that ingests
     * user-controlled bytes.
     *
     * When `null`, default limits are used.
     */
    val securityLimits: SecurityLimits? = null,

    /**
     * Maximum uncompressed size in bytes for a single embedded file before
     * recursive extraction is attempted (default: 50 MiB).
     *
     * Applies to embedded objects inside OOXML containers (DOCX, PPTX) and to
     * email attachments processed via recursive extraction. Files that exceed
     * this limit are skipped with a `ProcessingWarning` rather than passed to the
     * extraction pipeline, preventing a single oversized embedded object from
     * consuming unbounded memory or time.
     *
     * Set to `null` to disable the per-embedded-file cap (falls back to
     * `security_limits.max_archive_size` as the only guard).
     */
    // The default is 50 MiB (not null) so that it matches the documented default;
    // null explicitly disables the cap.
    val maxEmbeddedFileBytes: Long? = 50L * 1024L * 1024L,

    /**
     * Content text format (default: Plain).
     *
     * Controls the format of the extracted content:
     *
     * - `Plain`: Raw extracted text (default)
     * - `Markdown`: Markdown formatted output
     * - `Djot`: Djot markup format (requires djot feature)
     * - `Html`: HTML formatted output
     *
     * When set to a structured format, extraction results include formatted
     * output. The `formatted_content` field may be populated when format
     * conversion is applied.
     */
    // NOTE(review): constant casing differs from ResultFormat.UNIFIED above;
    // OutputFormat is declared elsewhere — confirm `Plain` is the intended member name.
    val outputFormat: OutputFormat = OutputFormat.Plain,

    /**
     * Layout detection configuration (`null` = layout detection disabled).
     *
     * When set, PDF pages and images are analyzed for document structure
     * (headings, code, formulas, tables, figures, etc.) using RT-DETR models via
     * ONNX Runtime. For PDFs, layout hints override paragraph classification in
     * the markdown pipeline. For images, per-region OCR is performed with
     * markdown formatting based on detected layout classes.
     *
     * Requires the `layout-detection` feature to run inference; the field is
     * present whenever the `layout-types` feature is active (which includes
     * `layout-detection` as well as the no-ORT target groups).
     */
    val layout: LayoutDetectionConfig? = null,

    /**
     * Run layout detection on the non-OCR PDF markdown path.
     *
     * When `true` and [layout] is non-null, layout regions inform heading, table,
     * list, and figure detection in the structure pipeline that would otherwise
     * rely on font-clustering heuristics alone. Significantly improves SF1
     * (structural F1) at the cost of inference latency (~150-300ms/page CPU,
     * ~20-50ms/page GPU). Default: `false`.
     *
     * Requires the `layout-detection` feature.
     */
    val useLayoutForMarkdown: Boolean = false,

    /**
     * Enable structured document tree output.
     *
     * When `true`, populates the `document` field on `ExtractionResult` with a
     * hierarchical `DocumentStructure` containing heading-driven section nesting,
     * table grids, content layer classification, and inline annotations.
     *
     * Independent of `result_format` — can be combined with Unified or ElementBased.
     */
    val includeDocumentStructure: Boolean = false,

    /**
     * Hardware acceleration configuration for ONNX Runtime models.
     *
     * Controls execution provider selection for layout detection and embedding
     * models. When `null`, uses platform defaults (CoreML on macOS, CUDA on
     * Linux, CPU on Windows).
     */
    val acceleration: AccelerationConfig? = null,

    /**
     * Cache namespace for tenant isolation.
     *
     * When set, cache entries are stored under `{cache_dir}/{namespace}/`.
     * Must be alphanumeric, hyphens, or underscores only (max 64 chars).
     * Different namespaces have isolated cache spaces on the same filesystem.
     */
    val cacheNamespace: String? = null,

    /**
     * Per-request cache TTL in seconds.
     *
     * Overrides the global `max_age_days` for this specific extraction.
     * When `0`, caching is completely skipped (no read or write).
     * When `null`, the global TTL applies.
     */
    val cacheTtlSecs: Long? = null,

    /**
     * Email extraction configuration (`null` = use defaults).
     *
     * Currently supports configuring the fallback codepage for MSG files that
     * do not specify one. See `EmailConfig` for details.
     */
    val email: EmailConfig? = null,

    /**
     * Concurrency limits for constrained environments (`null` = use defaults).
     *
     * Controls Rayon thread pool size, ONNX Runtime intra-op threads, and (when
     * `max_concurrent_extractions` is unset) the batch concurrency semaphore.
     * See `ConcurrencyConfig` for details.
     */
    // NOTE(review): typed as String here although the documentation refers to
    // `ConcurrencyConfig`; the expected string encoding is not visible — confirm.
    val concurrency: String? = null,

    /**
     * Maximum recursion depth for archive extraction (default: 3).
     *
     * Set to 0 to disable recursive extraction (legacy behavior).
     */
    // The default is 3 (not 0) so that it matches the documented default;
    // 0 would silently disable recursive archive extraction.
    val maxArchiveDepth: Long = 3L,

    /**
     * Tree-sitter language pack configuration (`null` = tree-sitter disabled).
     *
     * When set, enables code file extraction using tree-sitter parsers.
     * Controls grammar download behavior and code analysis options.
     */
    val treeSitter: TreeSitterConfig? = null,

    /**
     * Structured extraction via LLM (`null` = disabled).
     *
     * When set, the extracted document content is sent to an LLM with the
     * provided JSON schema. The structured response is stored in
     * `ExtractionResult.structured_output`.
     */
    val structuredExtraction: StructuredExtractionConfig? = null,

    /**
     * Cancellation token for this extraction (`null` = no external cancellation).
     *
     * Pass a `CancellationToken` clone here and call `CancellationToken.cancel`
     * from another thread / task to abort the extraction in progress. The
     * extractor checks the token at safe checkpoints (before lock acquisition,
     * between pages, between batch items) and returns `KreuzbergError.Cancelled`
     * when set.
     *
     * The field is excluded from serialization because `CancellationToken` is a
     * runtime handle, not a configuration value.
     */
    // NOTE(review): typed as String here although the documentation describes a
    // `CancellationToken` handle; how a token is represented as a String is not
    // visible in this file — confirm.
    val cancelToken: String? = null,
)
|
||||
53
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/ExtractionDiff.kt
generated
Normal file
53
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/ExtractionDiff.kt
generated
Normal file
@@ -0,0 +1,53 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * Full set of differences between two `ExtractionResult` values, referred to
 * below as `a` and `b`.
 *
 * @property contentDiff Unified-diff hunks for the `content` field. Empty when
 *   the content is identical.
 * @property tablesAdded Tables present in `b` but not in `a` (by index position,
 *   excess right-side tables).
 * @property tablesRemoved Tables present in `a` but not in `b` (by index position,
 *   excess left-side tables).
 * @property tablesChanged Cell-level changes for table pairs that share the same
 *   index and dimensions.
 * @property metadataChanged Metadata difference, encoded as a JSON object with
 *   three top-level keys: `added` (keys present in `b` but not `a`), `removed`
 *   (keys present in `a` but not `b`), and `changed` (keys whose values differ —
 *   each entry is `{ "from": <value-in-a>, "to": <value-in-b> }`).
 *
 *   This is NOT RFC 6902 JSON Patch: a flatter shape was chosen deliberately to
 *   avoid pulling in a json-patch crate. If RFC 6902 semantics (with JSON Pointer
 *   paths) are needed, feed `a.metadata` and `b.metadata` to a json-patch
 *   implementation directly.
 * @property embeddedChanges Changes to embedded archive children.
 */
data class ExtractionDiff(
    val contentDiff: List<DiffHunk> = emptyList(),
    val tablesAdded: List<Table> = emptyList(),
    val tablesRemoved: List<Table> = emptyList(),
    val tablesChanged: List<TableDiff> = emptyList(),
    val metadataChanged: Any,
    val embeddedChanges: EmbeddedChanges,
)
|
||||
51
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/ExtractionMethod.kt
generated
Normal file
51
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/ExtractionMethod.kt
generated
Normal file
@@ -0,0 +1,51 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * Describes how the extracted text was produced.
 *
 * Each constant is paired with the lowercase string used for it on the wire;
 * [toWire] and [fromWire] convert between the two representations.
 */
enum class ExtractionMethod(private val wireName: String) {
    @com.fasterxml.jackson.annotation.JsonProperty("native")
    NATIVE("native"),

    @com.fasterxml.jackson.annotation.JsonProperty("ocr")
    OCR("ocr"),

    @com.fasterxml.jackson.annotation.JsonProperty("mixed")
    MIXED("mixed");

    /** Returns the wire string for this constant (`"native"`, `"ocr"`, or `"mixed"`). */
    @com.fasterxml.jackson.annotation.JsonValue
    fun toWire(): String = wireName

    companion object {
        /**
         * Resolves a wire string to its constant.
         *
         * Matching is exact (case-sensitive).
         *
         * @param value The wire string to look up.
         * @return The constant whose wire string equals [value].
         * @throws IllegalArgumentException if [value] matches no constant.
         */
        @com.fasterxml.jackson.annotation.JsonCreator
        @JvmStatic
        fun fromWire(value: String): ExtractionMethod {
            for (candidate in values()) {
                if (candidate.wireName == value) {
                    return candidate
                }
            }
            throw IllegalArgumentException("Unknown ExtractionMethod value: $value")
        }
    }
}
|
||||
229
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/ExtractionResult.kt
generated
Normal file
229
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/ExtractionResult.kt
generated
Normal file
@@ -0,0 +1,229 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * Result type returned by the core extraction API.
 *
 * All extraction functions return this type. Optional sections are `null`
 * unless the corresponding feature produced output, as described per property.
 */
data class ExtractionResult(
    /** Extracted content. Defaults to an empty string. */
    val content: String = "",

    /** MIME type string. Defaults to an empty string. */
    val mimeType: String = "",

    /** Metadata for the result. Defaults to a `Metadata` built with its own defaults. */
    val metadata: Metadata = Metadata(),

    /**
     * Extraction strategy used to produce the returned text.
     *
     * Populated when the extractor can reliably distinguish native text
     * extraction, OCR-only extraction, or mixed native/OCR output.
     */
    val extractionMethod: ExtractionMethod? = null,

    /** Tables attached to the result. Defaults to an empty list. */
    val tables: List<Table> = emptyList(),

    /** Detected languages, when available. Defaults to `null`. */
    val detectedLanguages: List<String>? = null,

    /**
     * Text chunks when chunking is enabled.
     *
     * When a chunking configuration is provided, the content is split into
     * overlapping chunks for efficient processing. Each chunk contains the text,
     * optional embeddings (if enabled), and metadata about its position.
     */
    val chunks: List<Chunk>? = null,

    /**
     * Images extracted from the document.
     *
     * When image extraction is enabled via `ImageExtractionConfig`, this holds
     * all images found in the document with their raw data and metadata. Each
     * image may optionally contain a nested `ocr_result` if OCR was performed.
     */
    val images: List<ExtractedImage>? = null,

    /**
     * Per-page content when page extraction is enabled.
     *
     * When page extraction is configured, the document is split into per-page
     * content with tables and images mapped to their respective pages.
     */
    val pages: List<PageContent>? = null,

    /**
     * Semantic elements when the element-based result format is enabled.
     *
     * When `result_format` is set to ElementBased, this holds semantic elements
     * with type classification, unique identifiers, and metadata for
     * Unstructured-compatible element-based processing.
     */
    val elements: List<Element>? = null,

    /**
     * Rich Djot content structure (when extracting Djot documents).
     *
     * When extracting Djot documents with structured extraction enabled, this
     * holds the full semantic structure including:
     *
     * - Block-level elements with nesting
     * - Inline formatting with attributes
     * - Links, images, footnotes
     * - Math expressions
     * - Complete attribute information
     *
     * The `content` field still contains plain text for backward compatibility.
     *
     * Always `null` for non-Djot documents.
     */
    val djotContent: DjotContent? = null,

    /**
     * OCR elements with full spatial and confidence metadata.
     *
     * When OCR is performed with element extraction enabled, this holds the
     * structured representation of detected text including:
     *
     * - Bounding geometry (rectangles or quadrilaterals)
     * - Confidence scores (detection and recognition)
     * - Rotation information
     * - Hierarchical relationships (Tesseract only)
     *
     * It preserves all metadata that would otherwise be lost when converting to
     * plain text or markdown output formats.
     *
     * Only populated when `OcrElementConfig.include_elements` is true.
     */
    val ocrElements: List<OcrElement>? = null,

    /**
     * Structured document tree (when document structure extraction is enabled).
     *
     * When `include_document_structure` is true in `ExtractionConfig`, this holds
     * the full hierarchical representation of the document including:
     *
     * - Heading-driven section nesting
     * - Table grids with cell-level metadata
     * - Content layer classification (body, header, footer, footnote)
     * - Inline text annotations (formatting, links)
     * - Bounding boxes and page numbers
     *
     * Independent of `result_format` — can be combined with Unified or ElementBased.
     */
    val document: DocumentStructure? = null,

    /**
     * Extracted keywords when keyword extraction is enabled.
     *
     * When keyword extraction (RAKE or YAKE) is configured, this holds the
     * extracted keywords with scores, algorithm info, and position data.
     * Previously stored in `metadata.additional["keywords"]`.
     */
    val extractedKeywords: List<Keyword>? = null,

    /**
     * Document quality score from quality analysis.
     *
     * A value between 0.0 and 1.0 indicating the overall text quality.
     * Previously stored in `metadata.additional["quality_score"]`.
     */
    val qualityScore: Double? = null,

    /**
     * Non-fatal warnings collected during processing pipeline stages.
     *
     * Captures errors from optional pipeline features (embedding, chunking,
     * language detection, output formatting) that do not prevent extraction but
     * may indicate degraded results.
     * Previously stored as individual keys in `metadata.additional`.
     */
    val processingWarnings: List<ProcessingWarning> = emptyList(),

    /**
     * PDF annotations extracted from the document.
     *
     * When annotation extraction is enabled via `PdfConfig.extract_annotations`,
     * this holds text notes, highlights, links, stamps, and other annotations
     * found in PDF documents.
     */
    val annotations: List<PdfAnnotation>? = null,

    /**
     * Nested extraction results from archive contents.
     *
     * When extracting archives, each processable file inside produces its own
     * full extraction result. Set to `null` for non-archive formats.
     * Use `max_archive_depth` in the config to control recursion depth.
     */
    val children: List<ArchiveEntry>? = null,

    /**
     * URIs/links discovered during document extraction.
     *
     * Contains hyperlinks, image references, citations, email addresses, and
     * other URI-like references found in the document. Always extracted when
     * present in the source document.
     */
    val uris: List<ExtractedUri>? = null,

    /**
     * Tracked changes embedded in the source document.
     *
     * Populated by per-format extractors that understand change-tracking
     * metadata (DOCX `w:ins`/`w:del`/`w:rPrChange`, ODT `text:change-*`, …).
     * Every extractor defaults to `null` until its format-specific
     * implementation is added. Extractors that do populate this field follow the
     * "accepted-changes" convention: inserted text is present in `content`,
     * deleted text is absent — the revision list is the separate audit trail.
     */
    val revisions: List<DocumentRevision>? = null,

    /**
     * Structured extraction output from LLM-based JSON schema extraction.
     *
     * When `structured_extraction` is configured in `ExtractionConfig`, the
     * extracted document content is sent to a VLM with the provided JSON schema.
     * The response is parsed and stored here as a JSON value matching the schema.
     */
    val structuredOutput: Any? = null,

    /**
     * Code intelligence results from tree-sitter analysis.
     *
     * Populated when extracting source code files with the `tree-sitter` feature.
     * Contains metrics, structural analysis, imports/exports, comments,
     * docstrings, symbols, diagnostics, and optionally chunked code segments.
     *
     * Stored as an opaque JSON value so that all language bindings (Go, Java,
     * C#, …) can deserialize it as a raw JSON object rather than a typed struct.
     * The underlying type is `tree_sitter_language_pack.ProcessResult`.
     */
    val codeIntelligence: Any? = null,

    /**
     * LLM token usage and cost data for all LLM calls made during this extraction.
     *
     * Contains one entry per LLM call. Multiple entries are produced when VLM
     * OCR, structured extraction, or LLM embeddings run during the same
     * extraction.
     *
     * `null` when no LLM was used.
     */
    val llmUsage: List<LlmUsage>? = null,

    /**
     * Pre-rendered content in the requested output format.
     *
     * Populated during `derive_extraction_result` before tree derivation consumes
     * element data. `apply_output_format` swaps this into `content` at the end of
     * the pipeline, after post-processors have operated on plain text.
     */
    val formattedContent: String? = null,

    /**
     * Structured hOCR document for the OCR+layout pipeline.
     *
     * When tesseract produces hOCR output, the parsed `InternalDocument` carries
     * paragraph structure with bounding boxes and confidence scores. The layout
     * classification step enriches these elements before final rendering.
     */
    val ocrInternalDocument: String? = null,
)
|
||||
30
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/FictionBookMetadata.kt
generated
Normal file
30
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/FictionBookMetadata.kt
generated
Normal file
@@ -0,0 +1,30 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * Metadata for FictionBook (FB2) documents.
 *
 * @property genres Genre strings. Defaults to an empty list.
 * @property sequences Sequence strings. Defaults to an empty list.
 * @property annotation Optional annotation text. Defaults to `null`.
 */
data class FictionBookMetadata(
    val genres: List<String> = emptyList(),
    val sequences: List<String> = emptyList(),
    val annotation: String? = null,
)
|
||||
100
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/FileExtractionConfig.kt
generated
Normal file
100
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/FileExtractionConfig.kt
generated
Normal file
@@ -0,0 +1,100 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * Per-file overrides of the extraction configuration for batch processing.
 *
 * Every property is nullable — `null` means "use the batch-level default."
 * This type is used with `batch_extract_files` and `batch_extract_bytes` to
 * allow heterogeneous extraction settings within a single batch.
 *
 * # Excluded Fields
 *
 * The following `ExtractionConfig` fields are batch-level only and cannot be
 * overridden per file:
 *
 * - `max_concurrent_extractions` — controls batch parallelism
 * - `use_cache` — global caching policy
 * - `acceleration` — shared ONNX execution provider
 * - `security_limits` — global archive security policy
 */
data class FileExtractionConfig(
    /** Per-file override for quality post-processing. */
    val enableQualityProcessing: Boolean? = null,

    /** Per-file override for the OCR configuration (`null` = use batch default). */
    val ocr: OcrConfig? = null,

    /** Per-file override for forcing OCR. */
    val forceOcr: Boolean? = null,

    /** Per-file override for the pages on which OCR is forced (1-indexed page numbers). */
    val forceOcrPages: List<Int>? = null,

    /** Per-file override for disabling OCR. */
    val disableOcr: Boolean? = null,

    /** Per-file override for the chunking configuration. */
    val chunking: ChunkingConfig? = null,

    /** Per-file override for the content filtering configuration. */
    val contentFilter: ContentFilterConfig? = null,

    /** Per-file override for the image extraction configuration. */
    val images: ImageExtractionConfig? = null,

    /** Per-file override for PDF options. */
    val pdfOptions: PdfConfig? = null,

    /** Per-file override for token reduction. */
    val tokenReduction: TokenReductionOptions? = null,

    /** Per-file override for language detection. */
    val languageDetection: LanguageDetectionConfig? = null,

    /** Per-file override for page extraction. */
    val pages: PageConfig? = null,

    /** Per-file override for keyword extraction. */
    val keywords: KeywordConfig? = null,

    /** Per-file override for the post-processor. */
    val postprocessor: PostProcessorConfig? = null,

    /** Per-file override for HTML conversion options. */
    val htmlOptions: String? = null,

    /** Per-file override for the result format. */
    val resultFormat: ResultFormat? = null,

    /** Per-file override for the output content format. */
    val outputFormat: OutputFormat? = null,

    /** Per-file override for document structure output. */
    val includeDocumentStructure: Boolean? = null,

    /** Per-file override for layout detection. */
    val layout: LayoutDetectionConfig? = null,

    /**
     * Per-file override for the extraction timeout, in seconds.
     *
     * When set, the extraction for this file is canceled after the specified
     * duration. A timed-out file produces an error result without affecting
     * other files in the batch.
     */
    val timeoutSecs: Long? = null,

    /** Per-file override for the tree-sitter configuration. */
    val treeSitter: TreeSitterConfig? = null,

    /**
     * Per-file override for the structured extraction configuration.
     *
     * When set, enables LLM-based structured extraction with a JSON schema for
     * this specific file. The extracted content is sent to a VLM/LLM and the
     * response is parsed according to the provided schema.
     */
    val structuredExtraction: StructuredExtractionConfig? = null,
)
|
||||
31
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/Footnote.kt
generated
Normal file
31
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/Footnote.kt
generated
Normal file
@@ -0,0 +1,31 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * A footnote in a Djot document.
 *
 * @property label Footnote label.
 * @property content Footnote content blocks; an empty list when not supplied.
 */
data class Footnote(
    val label: String,
    val content: List<FormattedBlock> = emptyList(),
)
|
||||
227
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/FormatMetadata.kt
generated
Normal file
227
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/FormatMetadata.kt
generated
Normal file
@@ -0,0 +1,227 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * Format-specific metadata, modelled as a discriminated union.
 *
 * An extraction result carries a single format variant, so callers get a
 * type-safe payload instead of a bag of nested optionals.
 *
 * JSON conversion is delegated to [FormatMetadataDeserializer] and
 * [FormatMetadataSerializer] through the Jackson annotations below.
 */
@com.fasterxml.jackson.databind.annotation.JsonDeserialize(using = FormatMetadataDeserializer::class)
@com.fasterxml.jackson.databind.annotation.JsonSerialize(using = FormatMetadataSerializer::class)
sealed class FormatMetadata {
    /** Variant wrapping [PdfMetadata]. */
    data class Pdf(val metadata: PdfMetadata) : FormatMetadata()

    /** Variant wrapping [DocxMetadata]. */
    data class Docx(val metadata: DocxMetadata) : FormatMetadata()

    /** Variant wrapping [ExcelMetadata]. */
    data class Excel(val metadata: ExcelMetadata) : FormatMetadata()

    /** Variant wrapping [EmailMetadata]. */
    data class Email(val metadata: EmailMetadata) : FormatMetadata()

    /** Variant wrapping [PptxMetadata]. */
    data class Pptx(val metadata: PptxMetadata) : FormatMetadata()

    /** Variant wrapping [ArchiveMetadata]. */
    data class Archive(val metadata: ArchiveMetadata) : FormatMetadata()

    /** Variant wrapping [ImageMetadata]. */
    data class Image(val metadata: ImageMetadata) : FormatMetadata()

    /** Variant wrapping [XmlMetadata]. */
    data class Xml(val metadata: XmlMetadata) : FormatMetadata()

    /** Variant wrapping [TextMetadata]. */
    data class Text(val metadata: TextMetadata) : FormatMetadata()

    /** Variant wrapping [HtmlMetadata]. */
    data class Html(val metadata: HtmlMetadata) : FormatMetadata()

    /** Variant wrapping [OcrMetadata]. */
    data class Ocr(val metadata: OcrMetadata) : FormatMetadata()

    /** Variant wrapping [CsvMetadata]. */
    data class Csv(val metadata: CsvMetadata) : FormatMetadata()

    /** Variant wrapping [BibtexMetadata]. */
    data class Bibtex(val metadata: BibtexMetadata) : FormatMetadata()

    /** Variant wrapping [CitationMetadata]. */
    data class Citation(val metadata: CitationMetadata) : FormatMetadata()

    /** Variant wrapping [FictionBookMetadata]. */
    data class FictionBook(val metadata: FictionBookMetadata) : FormatMetadata()

    /** Variant wrapping [DbfMetadata]. */
    data class Dbf(val metadata: DbfMetadata) : FormatMetadata()

    /** Variant wrapping [JatsMetadata]. */
    data class Jats(val metadata: JatsMetadata) : FormatMetadata()

    /** Variant wrapping [EpubMetadata]. */
    data class Epub(val metadata: EpubMetadata) : FormatMetadata()

    /** Variant wrapping [PstMetadata]. */
    data class Pst(val metadata: PstMetadata) : FormatMetadata()

    /** The only variant whose payload is a plain [String] rather than a typed metadata object. */
    data class Code(val value: String) : FormatMetadata()
}
|
||||
|
||||
/**
 * Jackson deserializer for [FormatMetadata].
 *
 * The value is read as a JSON object whose `format_type` field selects the
 * variant; the remaining fields are decoded into that variant's payload.
 */
private class FormatMetadataDeserializer : com.fasterxml.jackson.databind.deser.std.StdDeserializer<FormatMetadata>(FormatMetadata::class.java) {
    /**
     * Decodes one [FormatMetadata] value from [parser].
     *
     * @param parser source positioned at the value to read.
     * @param ctx context used to decode the variant payload.
     * @return the variant named by the `format_type` field.
     * @throws com.fasterxml.jackson.databind.exc.MismatchedInputException if the value is not a JSON object.
     * @throws com.fasterxml.jackson.databind.exc.InvalidFormatException if `format_type` is absent or not a known tag.
     */
    override fun deserialize(
        parser: com.fasterxml.jackson.core.JsonParser,
        ctx: com.fasterxml.jackson.databind.DeserializationContext,
    ): FormatMetadata {
        // Read as a generic node first so a non-object input yields a Jackson
        // mapping error instead of a ClassCastException.
        val root: com.fasterxml.jackson.databind.JsonNode? =
            parser.codec.readTree<com.fasterxml.jackson.databind.JsonNode>(parser)
        val node = root as? com.fasterxml.jackson.databind.node.ObjectNode
            ?: throw com.fasterxml.jackson.databind.exc.MismatchedInputException.from(
                parser, FormatMetadata::class.java, "Expected a JSON object for FormatMetadata",
            )
        val tag = node.get("format_type")?.asText()
        // The tree was built by the readTree call above, so the tag can be stripped in place.
        val payload = node.apply { remove("format_type") }
        return when (tag) {
            "pdf" -> FormatMetadata.Pdf(ctx.decode<PdfMetadata>(payload))
            "docx" -> FormatMetadata.Docx(ctx.decode<DocxMetadata>(payload))
            "excel" -> FormatMetadata.Excel(ctx.decode<ExcelMetadata>(payload))
            "email" -> FormatMetadata.Email(ctx.decode<EmailMetadata>(payload))
            "pptx" -> FormatMetadata.Pptx(ctx.decode<PptxMetadata>(payload))
            "archive" -> FormatMetadata.Archive(ctx.decode<ArchiveMetadata>(payload))
            "image" -> FormatMetadata.Image(ctx.decode<ImageMetadata>(payload))
            "xml" -> FormatMetadata.Xml(ctx.decode<XmlMetadata>(payload))
            "text" -> FormatMetadata.Text(ctx.decode<TextMetadata>(payload))
            "html" -> FormatMetadata.Html(ctx.decode<HtmlMetadata>(payload))
            "ocr" -> FormatMetadata.Ocr(ctx.decode<OcrMetadata>(payload))
            "csv" -> FormatMetadata.Csv(ctx.decode<CsvMetadata>(payload))
            "bibtex" -> FormatMetadata.Bibtex(ctx.decode<BibtexMetadata>(payload))
            "citation" -> FormatMetadata.Citation(ctx.decode<CitationMetadata>(payload))
            "fiction_book" -> FormatMetadata.FictionBook(ctx.decode<FictionBookMetadata>(payload))
            "dbf" -> FormatMetadata.Dbf(ctx.decode<DbfMetadata>(payload))
            "jats" -> FormatMetadata.Jats(ctx.decode<JatsMetadata>(payload))
            "epub" -> FormatMetadata.Epub(ctx.decode<EpubMetadata>(payload))
            "pst" -> FormatMetadata.Pst(ctx.decode<PstMetadata>(payload))
            // The payload is always a JSON object here, and Jackson cannot bind an
            // object to String, so the previous readTreeAsValue(String) call could
            // never succeed. Keep the object as its JSON text instead.
            // NOTE(review): assumes Code.value is meant to hold the variant's fields
            // as a JSON string - confirm against the generator / wire contract.
            "code" -> FormatMetadata.Code(payload.toString())
            else -> throw com.fasterxml.jackson.databind.exc.InvalidFormatException(
                parser, "Unknown FormatMetadata tag", tag, FormatMetadata::class.java,
            )
        }
    }

    /** Decodes [payload] into [T] via [com.fasterxml.jackson.databind.DeserializationContext.readTreeAsValue]. */
    private inline fun <reified T> com.fasterxml.jackson.databind.DeserializationContext.decode(
        payload: com.fasterxml.jackson.databind.node.ObjectNode,
    ): T = readTreeAsValue(payload, T::class.java)
}
|
||||
|
||||
/**
 * Jackson serializer for [FormatMetadata].
 *
 * Each variant is written as its payload's JSON object with an extra
 * `format_type` field naming the variant.
 */
private class FormatMetadataSerializer : com.fasterxml.jackson.databind.ser.std.StdSerializer<FormatMetadata>(FormatMetadata::class.java) {
    /**
     * Writes [value] to [gen] as a tagged JSON object.
     *
     * @param value variant to serialize.
     * @param gen output generator; its codec is reused when it is an ObjectMapper.
     * @param provider unused; required by the Jackson contract.
     * @throws com.fasterxml.jackson.databind.JsonMappingException if a payload does not map to a JSON object.
     */
    override fun serialize(
        value: FormatMetadata,
        gen: com.fasterxml.jackson.core.JsonGenerator,
        provider: com.fasterxml.jackson.databind.SerializerProvider,
    ) {
        // Prefer the generator's own mapper so its configured modules apply;
        // otherwise fall back to a fresh mapper with auto-discovered modules.
        val mapper = (gen.codec as? com.fasterxml.jackson.databind.ObjectMapper)
            ?: com.fasterxml.jackson.databind.ObjectMapper().findAndRegisterModules()
        val (tag, node) = when (value) {
            is FormatMetadata.Pdf -> "pdf" to mapper.objectTree(gen, value.metadata)
            is FormatMetadata.Docx -> "docx" to mapper.objectTree(gen, value.metadata)
            is FormatMetadata.Excel -> "excel" to mapper.objectTree(gen, value.metadata)
            is FormatMetadata.Email -> "email" to mapper.objectTree(gen, value.metadata)
            is FormatMetadata.Pptx -> "pptx" to mapper.objectTree(gen, value.metadata)
            is FormatMetadata.Archive -> "archive" to mapper.objectTree(gen, value.metadata)
            is FormatMetadata.Image -> "image" to mapper.objectTree(gen, value.metadata)
            is FormatMetadata.Xml -> "xml" to mapper.objectTree(gen, value.metadata)
            is FormatMetadata.Text -> "text" to mapper.objectTree(gen, value.metadata)
            is FormatMetadata.Html -> "html" to mapper.objectTree(gen, value.metadata)
            is FormatMetadata.Ocr -> "ocr" to mapper.objectTree(gen, value.metadata)
            is FormatMetadata.Csv -> "csv" to mapper.objectTree(gen, value.metadata)
            is FormatMetadata.Bibtex -> "bibtex" to mapper.objectTree(gen, value.metadata)
            is FormatMetadata.Citation -> "citation" to mapper.objectTree(gen, value.metadata)
            is FormatMetadata.FictionBook -> "fiction_book" to mapper.objectTree(gen, value.metadata)
            is FormatMetadata.Dbf -> "dbf" to mapper.objectTree(gen, value.metadata)
            is FormatMetadata.Jats -> "jats" to mapper.objectTree(gen, value.metadata)
            is FormatMetadata.Epub -> "epub" to mapper.objectTree(gen, value.metadata)
            is FormatMetadata.Pst -> "pst" to mapper.objectTree(gen, value.metadata)
            is FormatMetadata.Code -> "code" to mapper.codeTree(gen, value.value)
        }
        // Added after the payload fields, replacing any payload field of the same name.
        node.put("format_type", tag)
        mapper.writeTree(gen, node)
    }

    /** Converts [payload] to a JSON object tree, failing with a mapping error if it is not an object. */
    private fun com.fasterxml.jackson.databind.ObjectMapper.objectTree(
        gen: com.fasterxml.jackson.core.JsonGenerator,
        payload: Any,
    ): com.fasterxml.jackson.databind.node.ObjectNode =
        valueToTree<com.fasterxml.jackson.databind.JsonNode>(payload) as? com.fasterxml.jackson.databind.node.ObjectNode
            ?: throw com.fasterxml.jackson.databind.JsonMappingException.from(
                gen, "FormatMetadata payload did not serialize to a JSON object",
            )

    /**
     * Parses the string held by [FormatMetadata.Code] into a JSON object tree.
     *
     * valueToTree on a String yields a text node, so the previous cast to
     * ObjectNode always threw ClassCastException for this variant.
     * NOTE(review): assumes Code.value holds the variant's fields as a JSON
     * object string - confirm against the generator / wire contract.
     */
    private fun com.fasterxml.jackson.databind.ObjectMapper.codeTree(
        gen: com.fasterxml.jackson.core.JsonGenerator,
        json: String,
    ): com.fasterxml.jackson.databind.node.ObjectNode =
        readTree(json) as? com.fasterxml.jackson.databind.node.ObjectNode
            ?: throw com.fasterxml.jackson.databind.JsonMappingException.from(
                gen, "FormatMetadata.Code value must be a JSON object string",
            )
}
|
||||
45
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/FormattedBlock.kt
generated
Normal file
45
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/FormattedBlock.kt
generated
Normal file
@@ -0,0 +1,45 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * A block-level element of a Djot document.
 *
 * Covers structural elements such as headings, paragraphs, lists and code blocks.
 *
 * @property blockType Type of block element.
 * @property level Heading level (1-6) for headings, or nesting level for lists.
 * @property inlineContent Inline content within the block.
 * @property attributes Element attributes (classes, IDs, key-value pairs).
 * @property language Language identifier for code blocks.
 * @property code Raw code content for code blocks.
 * @property children Nested blocks for containers (blockquotes, list items, divs).
 */
data class FormattedBlock(
    val blockType: BlockType,
    val level: Long? = null,
    val inlineContent: List<InlineElement> = emptyList(),
    val attributes: String? = null,
    val language: String? = null,
    val code: String? = null,
    val children: List<FormattedBlock> = emptyList(),
)
|
||||
54
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/FracType.kt
generated
Normal file
54
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/FracType.kt
generated
Normal file
@@ -0,0 +1,54 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * Enum whose constants map to the wire strings `Bar`, `NoBar`, `Linear` and `Skewed`.
 *
 * [toWire] and [fromWire] convert between a constant and its wire string.
 */
enum class FracType {
    @com.fasterxml.jackson.annotation.JsonProperty("Bar")
    BAR,

    @com.fasterxml.jackson.annotation.JsonProperty("NoBar")
    NO_BAR,

    @com.fasterxml.jackson.annotation.JsonProperty("Linear")
    LINEAR,

    @com.fasterxml.jackson.annotation.JsonProperty("Skewed")
    SKEWED;

    /** Returns the wire string for this constant. */
    @com.fasterxml.jackson.annotation.JsonValue
    fun toWire(): String {
        return when (this) {
            BAR -> "Bar"
            NO_BAR -> "NoBar"
            LINEAR -> "Linear"
            SKEWED -> "Skewed"
        }
    }

    companion object {
        /**
         * Resolves a wire string to its constant.
         *
         * @throws IllegalArgumentException if [value] matches no constant's wire string.
         */
        @com.fasterxml.jackson.annotation.JsonCreator
        @JvmStatic
        fun fromWire(value: String): FracType {
            val match = FracType.values().firstOrNull { candidate -> candidate.toWire() == value }
            return match ?: throw IllegalArgumentException("Unknown FracType value: $value")
        }
    }
}
|
||||
41
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/GridCell.kt
generated
Normal file
41
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/GridCell.kt
generated
Normal file
@@ -0,0 +1,41 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * A single grid cell together with its position and span metadata.
 *
 * @property content Cell text content.
 * @property row Zero-indexed row position.
 * @property col Zero-indexed column position.
 * @property rowSpan Number of rows this cell spans.
 * @property colSpan Number of columns this cell spans.
 * @property isHeader Whether this is a header cell.
 * @property bbox Bounding box for this cell, if available.
 */
data class GridCell(
    val content: String,
    val row: Int,
    val col: Int,
    val rowSpan: Int,
    val colSpan: Int,
    val isHeader: Boolean,
    val bbox: BoundingBox? = null,
)
|
||||
37
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/HeaderMetadata.kt
generated
Normal file
37
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/HeaderMetadata.kt
generated
Normal file
@@ -0,0 +1,37 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * Metadata describing one header/heading element.
 *
 * @property level Header level: 1 (h1) through 6 (h6).
 * @property text Normalized text content of the header.
 * @property id HTML id attribute, if present.
 * @property depth Document tree depth at the header element.
 * @property htmlOffset Byte offset in the original HTML document.
 */
data class HeaderMetadata(
    val level: Byte,
    val text: String,
    val id: String? = null,
    val depth: Int,
    val htmlOffset: Int,
)
|
||||
36
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/HeadingContext.kt
generated
Normal file
36
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/HeadingContext.kt
generated
Normal file
@@ -0,0 +1,36 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * Heading context of a chunk inside a Markdown document.
 *
 * @property headings Heading hierarchy from the document root down to this
 *   chunk's section: index 0 is the outermost (h1) and the last element is the
 *   most specific. Empty when not supplied.
 */
data class HeadingContext(
    val headings: List<HeadingLevel> = emptyList(),
)
|
||||
31
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/HeadingLevel.kt
generated
Normal file
31
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/HeadingLevel.kt
generated
Normal file
@@ -0,0 +1,31 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * One heading within a heading hierarchy.
 *
 * @property level Heading depth (1 = h1, 2 = h2, etc.).
 * @property text The text content of the heading.
 */
data class HeadingLevel(
    val level: Byte,
    val text: String,
)
|
||||
56
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/HierarchicalBlock.kt
generated
Normal file
56
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/HierarchicalBlock.kt
generated
Normal file
@@ -0,0 +1,56 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * A block of text with an assigned hierarchy level.
 *
 * The semantic heading information comes from font size clustering and
 * hierarchical analysis.
 *
 * @property text The text content of this block.
 * @property fontSize The font size of the text in this block.
 * @property level The hierarchy level of this block, matching HTML heading tags:
 *   `"h1"` (top-level), `"h2"` (secondary), `"h3"` (tertiary), `"h4"` (quaternary),
 *   `"h5"` (quinary), `"h6"` (senary), or `"body"` for body text with no heading level.
 * @property bbox Bounding box of the block as (left, top, right, bottom) in PDF units.
 */
data class HierarchicalBlock(
    val text: String,
    val fontSize: Float,
    val level: String,
    val bbox: List<Float>? = null,
)
|
||||
52
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/HierarchyConfig.kt
generated
Normal file
52
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/HierarchyConfig.kt
generated
Normal file
@@ -0,0 +1,52 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * Hierarchy extraction configuration for PDF text structure analysis.
 *
 * Enables extraction of document hierarchy levels (H1-H6) based on font size
 * clustering and semantic analysis. When enabled, hierarchical blocks are
 * included in page content.
 *
 * @property enabled Enable hierarchy extraction.
 * @property kClusters Number of font size clusters to use for hierarchy levels (1-7).
 *   Defaults to 6, which provides H1-H6 heading levels with body text; larger
 *   values create more fine-grained hierarchy levels. The default was previously
 *   3, contradicting this documented value.
 * @property includeBbox Include bounding box information in hierarchy blocks.
 * @property ocrCoverageThreshold OCR coverage threshold for smart OCR triggering
 *   (0.0-1.0): OCR is triggered when text blocks cover less than this fraction of
 *   the page. Defaults to `null` here.
 *   NOTE(review): the upstream description cites a default of 0.5 - presumably
 *   applied by the consumer when this is null; confirm.
 */
data class HierarchyConfig(
    val enabled: Boolean = true,
    val kClusters: Long = 6L,
    val includeBbox: Boolean = true,
    val ocrCoverageThreshold: Float? = null,
)
|
||||
71
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/HtmlMetadata.kt
generated
Normal file
71
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/HtmlMetadata.kt
generated
Normal file
@@ -0,0 +1,71 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * Metadata extracted from an HTML document.
 *
 * Combines document-level metadata, Open Graph data, Twitter Card metadata and
 * extracted structural elements (headers, links, images, structured data).
 *
 * @property title Document title from the `<title>` tag.
 * @property description Document description from the `<meta name="description">` tag.
 * @property keywords Document keywords from the `<meta name="keywords">` tag, split on commas.
 * @property author Document author from the `<meta name="author">` tag.
 * @property canonicalUrl Canonical URL from the `<link rel="canonical">` tag.
 * @property baseHref Base URL from the `<base href="">` tag, for resolving relative URLs.
 * @property language Document language from the `lang` attribute.
 * @property textDirection Document text direction from the `dir` attribute.
 * @property openGraph Open Graph metadata (og:* properties) for social media;
 *   keys like "title", "description", "image", "url", etc.
 * @property twitterCard Twitter Card metadata (twitter:* properties);
 *   keys like "card", "site", "creator", "title", "description", "image", etc.
 * @property metaTags Additional meta tags not covered by specific fields;
 *   keys are meta name/property attributes, values are content.
 * @property headers Extracted header elements with hierarchy.
 * @property links Extracted hyperlinks with type classification.
 * @property images Extracted images with source and dimensions.
 * @property structuredData Extracted structured data blocks.
 */
data class HtmlMetadata(
    val title: String? = null,
    val description: String? = null,
    val keywords: List<String> = emptyList(),
    val author: String? = null,
    val canonicalUrl: String? = null,
    val baseHref: String? = null,
    val language: String? = null,
    val textDirection: TextDirection? = null,
    val openGraph: Map<String, String> = emptyMap(),
    val twitterCard: Map<String, String> = emptyMap(),
    val metaTags: Map<String, String> = emptyMap(),
    val headers: List<HeaderMetadata> = emptyList(),
    val links: List<LinkMetadata> = emptyList(),
    val images: List<ImageMetadataType> = emptyList(),
    val structuredData: List<StructuredData> = emptyList(),
)
|
||||
63
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/HtmlOutputConfig.kt
generated
Normal file
63
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/HtmlOutputConfig.kt
generated
Normal file
@@ -0,0 +1,63 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
import java.nio.file.Path
|
||||
|
||||
/**
|
||||
* Configuration for styled HTML output.
|
||||
*
|
||||
* When set on `ExtractionConfig.html_output` alongside
|
||||
* `output_format = OutputFormat.Html`, the pipeline builds a
|
||||
* `StyledHtmlRenderer` instead of
|
||||
* the plain comrak-based renderer.
|
||||
*/
|
||||
data class HtmlOutputConfig(
    /**
     * Inline CSS string injected into the output after the theme stylesheet.
     * Concatenated after the [cssFile] content when both are set.
     */
    val css: String? = null,
    /**
     * Path to a CSS file loaded once at renderer construction time.
     * Its content is concatenated before [css] when both are set.
     */
    val cssFile: java.nio.file.Path? = null,
    /** Built-in colour/typography theme. Default: [HtmlTheme.UNSTYLED]. */
    val theme: HtmlTheme = HtmlTheme.UNSTYLED,
    /**
     * CSS class prefix applied to every emitted class name.
     *
     * Default: `"kb-"`. Change this if your host application already uses
     * classes that start with `kb-`.
     */
    // The default must match the documented "kb-" prefix; an empty string here
    // would be sent as an explicit override and strip the prefix entirely.
    val classPrefix: String = "kb-",
    /**
     * When `true` (default), write the resolved CSS into a `<style>` block
     * immediately after the opening `<div class="{prefix}doc">`.
     *
     * Set to `false` to emit only the structural markup and wire up your
     * own stylesheet targeting the `kb-*` class names.
     */
    val embedCss: Boolean = true,
)
|
||||
71
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/HtmlTheme.kt
generated
Normal file
71
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/HtmlTheme.kt
generated
Normal file
@@ -0,0 +1,71 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/** Built-in HTML theme selection. */
|
||||
enum class HtmlTheme {
    /**
     * Sensible defaults: system font stack, neutral colours, readable line
     * measure. CSS custom properties (`--kb-*`) are all defined so user CSS
     * can override individual values.
     */
    @com.fasterxml.jackson.annotation.JsonProperty("default")
    DEFAULT,

    /** GitHub Markdown-inspired palette and spacing. */
    @com.fasterxml.jackson.annotation.JsonProperty("github")
    GIT_HUB,

    /** Dark background, light text. */
    @com.fasterxml.jackson.annotation.JsonProperty("dark")
    DARK,

    /** Minimal light theme with generous whitespace. */
    @com.fasterxml.jackson.annotation.JsonProperty("light")
    LIGHT,

    /**
     * No built-in stylesheet emitted. CSS custom properties are still defined
     * on `:root` so user stylesheets can reference `var(--kb-*)` tokens.
     */
    @com.fasterxml.jackson.annotation.JsonProperty("unstyled")
    UNSTYLED;

    /** Returns the lowercase wire name of this theme (e.g. `GIT_HUB` -> `"github"`). */
    @com.fasterxml.jackson.annotation.JsonValue
    fun toWire(): String {
        // Exhaustive on purpose: a new constant must be given a wire name to compile.
        return when (this) {
            DEFAULT -> "default"
            GIT_HUB -> "github"
            DARK -> "dark"
            LIGHT -> "light"
            UNSTYLED -> "unstyled"
        }
    }

    companion object {
        /**
         * Resolves a wire name produced by [toWire] back to its constant.
         *
         * @throws IllegalArgumentException if [value] is not a known wire name.
         */
        @com.fasterxml.jackson.annotation.JsonCreator
        @JvmStatic
        fun fromWire(value: String): HtmlTheme {
            val match = values().firstOrNull { theme -> theme.toWire() == value }
            return match ?: throw IllegalArgumentException("Unknown HtmlTheme value: $value")
        }
    }
}
|
||||
143
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/IDocumentExtractor.kt
generated
Normal file
143
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/IDocumentExtractor.kt
generated
Normal file
@@ -0,0 +1,143 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
import java.nio.file.Path
|
||||
|
||||
/**
|
||||
* Trait for document extractor plugins.
|
||||
*
|
||||
* Implement this trait to add support for new document formats or to override
|
||||
* built-in extraction behavior with custom logic.
|
||||
*
|
||||
* # Return Type
|
||||
*
|
||||
* Extractors return `InternalDocument`, a flat intermediate representation.
|
||||
* The pipeline converts this into the public `ExtractionResult` via the
|
||||
* derivation step.
|
||||
*
|
||||
* # Priority System
|
||||
*
|
||||
* When multiple extractors support the same MIME type, the registry selects
|
||||
* the extractor with the highest priority value. Use this to:
|
||||
*
|
||||
* - Override built-in extractors (priority > 50)
|
||||
* - Provide fallback extractors (priority < 50)
|
||||
* - Implement specialized extractors for specific use cases
|
||||
*
|
||||
* Default priority is 50.
|
||||
*
|
||||
* # Thread Safety
|
||||
*
|
||||
* Extractors must be thread-safe (`Send + Sync`) to support concurrent extraction.
|
||||
*/
|
||||
interface IDocumentExtractor {
    /** Returns the name of this extractor plugin. */
    fun name(): String

    /** Returns the version string of this extractor plugin. */
    fun version(): String

    /** Lifecycle hook; the default implementation does nothing. */
    fun initialize() {}

    /** Lifecycle hook; the default implementation does nothing. */
    fun shutdown() {}

    /**
     * Extract content from a byte array.
     *
     * This is the core extraction method that processes in-memory document data.
     *
     * **Returns:**
     *
     * An [ExtractionResult] for the document. The upstream trait documentation
     * describes an `InternalDocument` that the pipeline later converts into the
     * public `ExtractionResult`; this binding's signature uses the public type.
     *
     * **Errors:**
     *
     * - `KreuzbergError.Parsing` - Document parsing failed
     * - `KreuzbergError.Validation` - Invalid document structure
     * - `KreuzbergError.Io` - I/O errors (these always bubble up)
     * - `KreuzbergError.MissingDependency` - Required dependency not available
     *
     * @param content in-memory document data
     * @param mimeType MIME type of [content]
     * @param config extraction configuration to apply
     */
    suspend fun extractBytes(
        content: ByteArray,
        mimeType: String,
        config: ExtractionConfig,
    ): ExtractionResult

    /**
     * Extract content from a file.
     *
     * The upstream trait documentation describes a default that reads the file
     * and calls `extract_bytes`. No such default is declared on this interface,
     * so implementers must provide this method themselves (for example by
     * reading [path] and delegating to [extractBytes]), or implement custom
     * file handling, streaming, or memory optimizations.
     *
     * **Returns:**
     *
     * An [ExtractionResult] for the document at [path].
     *
     * **Errors:**
     *
     * Same as [extractBytes], plus file I/O errors.
     *
     * @param path location of the document to extract
     * @param mimeType MIME type of the file
     * @param config extraction configuration to apply
     */
    suspend fun extractFile(
        path: java.nio.file.Path,
        mimeType: String,
        config: ExtractionConfig,
    ): ExtractionResult

    /**
     * Get the list of MIME types supported by this extractor.
     *
     * Can include exact MIME types and prefix patterns:
     *
     * - Exact: `"application/pdf"`, `"text/plain"`
     * - Prefix: `"image/*"` (matches any image type)
     *
     * **Returns:**
     *
     * A list of MIME type strings.
     */
    fun supportedMimeTypes(): List<String>

    /**
     * Get the priority of this extractor.
     *
     * Higher priority extractors are preferred when multiple extractors
     * support the same MIME type.
     *
     * # Priority Guidelines
     *
     * - **0-25**: Fallback/low-quality extractors
     * - **26-49**: Alternative extractors
     * - **50**: Default priority (built-in extractors)
     * - **51-75**: Premium/enhanced extractors
     * - **76-100**: Specialized/high-priority extractors
     *
     * **Returns:**
     *
     * Priority value. The default implementation returns 50.
     */
    fun priority(): Int = 50

    /**
     * Optional: Check if this extractor can handle a specific file.
     *
     * Allows for more sophisticated detection beyond MIME types.
     * The default implementation returns `true` (rely on MIME type matching).
     *
     * **Returns:**
     *
     * `true` if the extractor can handle this file, `false` otherwise.
     *
     * @param path location of the candidate file
     * @param mimeType MIME type reported for the file
     */
    fun canHandle(path: java.nio.file.Path, mimeType: String): Boolean = true
}
|
||||
95
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/IEmbeddingBackend.kt
generated
Normal file
95
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/IEmbeddingBackend.kt
generated
Normal file
@@ -0,0 +1,95 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
|
||||
* Trait for in-process embedding backend plugins.
|
||||
*
|
||||
* Async to match the convention used by `OcrBackend`,
|
||||
* `DocumentExtractor`, and `PostProcessor`.
|
||||
* Host-language bridges (PyO3, napi-rs, Rustler, extendr, magnus, ext-php-rs,
|
||||
* C FFI, etc.) wrap their synchronous host callables in `spawn_blocking` or the
|
||||
* equivalent to satisfy the async signature.
|
||||
*
|
||||
* # Thread safety
|
||||
*
|
||||
* Backends must be `Send + Sync + 'static`. They are stored in
|
||||
* `Arc<dyn EmbeddingBackend>` and called concurrently from kreuzberg's chunking
|
||||
* pipeline. If the backend's underlying model isn't thread-safe, the backend
|
||||
* itself must serialize access internally (e.g. via `Mutex<Inner>`).
|
||||
*
|
||||
* # Contract
|
||||
*
|
||||
* - `embed(texts)` MUST return exactly `texts.len()` vectors, each of length
|
||||
* `self.dimensions()`. The dispatcher in `embed_texts`
|
||||
* validates this before returning to downstream consumers; a non-conforming
|
||||
* backend surfaces as a `KreuzbergError.Validation`, not a panic.
|
||||
*
|
||||
* - `embed` may be called from any thread. Its future must be `Send`
|
||||
* (enforced by `async_trait` when `#[async_trait]` is used on non-WASM targets).
|
||||
*
|
||||
* - `dimensions()` is called exactly once at registration, immediately after
|
||||
* `initialize()` succeeds. The returned value is cached by the registry and
|
||||
* used for all subsequent shape validation. Lazy-loading implementations can
|
||||
* defer model loading into `initialize()` and report the real dimension
|
||||
* afterwards. Later mutations of the backend's reported dimension are not
|
||||
* observed by kreuzberg — implementations that need to change dimension
|
||||
* must unregister and re-register.
|
||||
*
|
||||
* - `shutdown()` (inherited from `Plugin`) may be invoked
|
||||
* concurrently with an in-flight `embed()` call. Implementations must
|
||||
* tolerate this — e.g. by letting in-flight calls finish using resources
|
||||
* held via the `Arc<dyn EmbeddingBackend>` reference, and only releasing
|
||||
* shared state that isn't needed by `embed`.
|
||||
*
|
||||
* # Runtime
|
||||
*
|
||||
* The synchronous `embed_texts` entry uses
|
||||
* `tokio.task.block_in_place` to await the trait's async `embed`, which
|
||||
* requires a multi-thread tokio runtime. Callers running inside a
|
||||
* `current_thread` runtime (e.g. `#[tokio.test]` without `flavor = "multi_thread"`,
|
||||
* or `tokio.runtime.Builder.new_current_thread()`) must use
|
||||
* `embed_texts_async` instead, which awaits directly without
|
||||
* `block_in_place`.
|
||||
*/
|
||||
interface IEmbeddingBackend {
    /** Returns the name of this embedding backend plugin. */
    fun name(): String

    /** Returns the version string of this embedding backend plugin. */
    fun version(): String

    /** Lifecycle hook; the default implementation does nothing. */
    fun initialize() = Unit

    /** Lifecycle hook; the default implementation does nothing. */
    fun shutdown() = Unit

    /**
     * Length of each embedding vector.
     *
     * The value must be greater than zero and equal to the length of every
     * vector produced by [embed].
     */
    fun dimensions(): Long

    /**
     * Embeds a batch of texts and returns one vector per input, in input order.
     *
     * **Errors:**
     *
     * Backend-specific failures should be reported as `Plugin` errors. The
     * dispatcher applies its own validation (result length, per-vector
     * dimension) on top of whatever the backend returns.
     *
     * @param texts the batch of texts to embed
     * @return one embedding vector per element of [texts]
     */
    suspend fun embed(texts: List<String>): List<List<Float>>
}
|
||||
116
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/IOcrBackend.kt
generated
Normal file
116
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/IOcrBackend.kt
generated
Normal file
@@ -0,0 +1,116 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
import java.nio.file.Path
|
||||
|
||||
/**
|
||||
* Trait for OCR backend plugins.
|
||||
*
|
||||
* Implement this trait to add custom OCR capabilities. OCR backends can be:
|
||||
*
|
||||
* - Native Rust implementations (like Tesseract)
|
||||
* - FFI bridges to Python libraries (like EasyOCR, PaddleOCR)
|
||||
* - Cloud-based OCR services (Google Vision, AWS Textract, etc.)
|
||||
*
|
||||
* # Thread Safety
|
||||
*
|
||||
* OCR backends must be thread-safe (`Send + Sync`) to support concurrent processing.
|
||||
*/
|
||||
interface IOcrBackend {
    /** Returns the name of this OCR backend plugin. */
    fun name(): String

    /** Returns the version string of this OCR backend plugin. */
    fun version(): String

    /** Lifecycle hook; the default implementation does nothing. */
    fun initialize() {}

    /** Lifecycle hook; the default implementation does nothing. */
    fun shutdown() {}

    /**
     * Process an image and extract text via OCR.
     *
     * **Returns:**
     *
     * An [ExtractionResult] containing the extracted text and metadata.
     *
     * **Errors:**
     *
     * - `KreuzbergError.Ocr` - OCR processing failed
     * - `KreuzbergError.Validation` - Invalid image format or configuration
     * - `KreuzbergError.Io` - I/O errors (these always bubble up)
     *
     * # Reading `backend_options`
     *
     * Backends that support runtime tuning can read `config.backend_options` and
     * deserialize only the keys they care about. Unknown keys are silently ignored,
     * so multiple backends can coexist in a pipeline without key conflicts.
     *
     * @param imageBytes encoded image data to recognise
     * @param config OCR configuration to apply
     */
    suspend fun processImage(imageBytes: ByteArray, config: OcrConfig): ExtractionResult

    /**
     * Process a file and extract text via OCR.
     *
     * The upstream trait documentation describes a default that reads the file
     * and calls `process_image`. No such default is declared on this interface,
     * so implementers must provide this method themselves (for example by
     * reading [path] and delegating to [processImage]).
     *
     * **Errors:**
     *
     * Same as [processImage], plus file I/O errors.
     *
     * @param path location of the image file
     * @param config OCR configuration to apply
     */
    suspend fun processImageFile(path: java.nio.file.Path, config: OcrConfig): ExtractionResult

    /**
     * Check if this backend supports a given language code.
     *
     * **Returns:**
     *
     * `true` if the language is supported, `false` otherwise.
     *
     * @param lang language code to look up
     */
    fun supportsLanguage(lang: String): Boolean

    /**
     * Get the backend type identifier.
     *
     * **Returns:**
     *
     * The backend type enum value.
     */
    fun backendType(): OcrBackendType

    /**
     * Optional: Get a list of all supported languages.
     *
     * The default implementation returns an empty list. Override to provide
     * comprehensive language support info.
     */
    fun supportedLanguages(): List<String> = emptyList()

    /**
     * Optional: Check if the backend supports table detection.
     *
     * The default implementation returns `false`. Override if your backend can
     * detect and extract tables.
     */
    fun supportsTableDetection(): Boolean = false

    /**
     * Check if the backend supports direct document-level processing (e.g. for PDFs).
     *
     * The default implementation returns `false`. Override if the backend has
     * optimized document processing.
     */
    fun supportsDocumentProcessing(): Boolean = false

    /**
     * Process a document file directly via OCR.
     *
     * Only called if [supportsDocumentProcessing] returns `true`.
     *
     * @param path location of the document file
     * @param config OCR configuration to apply
     */
    suspend fun processDocument(path: java.nio.file.Path, config: OcrConfig): ExtractionResult
}
|
||||
144
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/IPostProcessor.kt
generated
Normal file
144
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/IPostProcessor.kt
generated
Normal file
@@ -0,0 +1,144 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
|
||||
* Trait for post-processor plugins.
|
||||
*
|
||||
* Post-processors transform or enrich extraction results after the initial
|
||||
* extraction is complete. They can:
|
||||
*
|
||||
* - Clean and normalize text
|
||||
* - Add metadata (language, keywords, entities)
|
||||
* - Split content into chunks
|
||||
* - Score quality
|
||||
* - Apply custom transformations
|
||||
*
|
||||
* # Processing Order
|
||||
*
|
||||
* Post-processors are executed in stage order:
|
||||
*
|
||||
* 1. **Early** - Language detection, entity extraction
|
||||
* 2. **Middle** - Keyword extraction, token reduction
|
||||
* 3. **Late** - Custom hooks, final validation
|
||||
*
|
||||
* Within each stage, processors are executed in registration order.
|
||||
*
|
||||
* # Error Handling
|
||||
*
|
||||
* Post-processor errors are non-fatal by default - they're captured in metadata
|
||||
* and execution continues. To make errors fatal, return an error from `process()`.
|
||||
*
|
||||
* # Thread Safety
|
||||
*
|
||||
* Post-processors must be thread-safe (`Send + Sync`).
|
||||
*/
|
||||
interface IPostProcessor {
    /** Returns the name of this post-processor plugin. */
    fun name(): String

    /** Returns the version string of this post-processor plugin. */
    fun version(): String

    /** Lifecycle hook; the default implementation does nothing. */
    fun initialize() {}

    /** Lifecycle hook; the default implementation does nothing. */
    fun shutdown() {}

    /**
     * Process an extraction result.
     *
     * Transform or enrich the extraction result. Can modify:
     *
     * - `content` - The extracted text
     * - `metadata` - Add or update metadata fields
     * - `tables` - Modify or enhance table data
     *
     * **Returns:**
     *
     * Nothing. The upstream trait returns `Ok(())` on success and `Err(...)`
     * for fatal failures.
     * NOTE(review): in this binding a fatal failure is presumably signalled by
     * throwing - confirm against the bridge.
     *
     * **Errors:**
     *
     * Report errors only for fatal processing failures. Non-fatal errors should be
     * captured in metadata directly on the result.
     *
     * # Performance
     *
     * The upstream signature avoids unnecessary cloning of large extraction
     * results by taking a mutable reference instead of ownership, so processors
     * modify the result in place.
     * NOTE(review): whether [ExtractionResult] is mutable from Kotlin is not
     * visible here - confirm how changes are propagated back.
     *
     * # Example - Text Cleaning
     *
     * ```rust
     * async fn process(&self, result: &mut ExtractionResult, config: &ExtractionConfig)
     *     -> Result<()> {
     *     // Remove excessive whitespace
     *     result.content = result
     *         .content
     *         .split_whitespace()
     *         .collect::<Vec<_>>()
     *         .join(" ");
     *
     *     Ok(())
     * }
     * ```
     *
     * @param result the extraction result to transform or enrich
     * @param config extraction configuration in effect
     */
    suspend fun process(result: ExtractionResult, config: ExtractionConfig)

    /**
     * Get the processing stage for this post-processor.
     *
     * Determines when this processor runs in the pipeline.
     *
     * **Returns:**
     *
     * The [ProcessingStage] (Early, Middle, or Late).
     */
    fun processingStage(): ProcessingStage

    /**
     * Optional: Check if this processor should run for a given result.
     *
     * Allows conditional processing based on MIME type, metadata, or content.
     * The default implementation returns `true` (always run).
     *
     * **Returns:**
     *
     * `true` if the processor should run, `false` to skip.
     *
     * @param result the extraction result about to be processed
     * @param config extraction configuration in effect
     */
    fun shouldProcess(result: ExtractionResult, config: ExtractionConfig): Boolean = true

    /**
     * Optional: Estimate processing time in milliseconds.
     *
     * Used for logging and debugging. The default implementation returns 0
     * (unknown).
     *
     * **Returns:**
     *
     * Estimated processing time in milliseconds.
     *
     * @param result the extraction result the estimate is for
     */
    fun estimatedDurationMs(result: ExtractionResult): Long = 0L

    /**
     * Execution priority within the processing stage.
     *
     * Higher values run first within the same [ProcessingStage]. The default
     * implementation returns 50. Use 0-49 for fallback processors, 50 for normal
     * processors, and 51-255 for high-priority processors that should run early
     * in their stage.
     */
    fun priority(): Int = 50
}
|
||||
59
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/IRenderer.kt
generated
Normal file
59
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/IRenderer.kt
generated
Normal file
@@ -0,0 +1,59 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
|
||||
* Trait for document renderers that convert `InternalDocument` to output strings.
|
||||
*
|
||||
* Renderers are typically stateless converters that transform the internal
|
||||
* document representation into a specific output format (Markdown, HTML,
|
||||
* Djot, plain text, etc.). They participate in the standard `Plugin`
|
||||
* lifecycle so custom renderers can be registered from any supported binding
|
||||
* language.
|
||||
*
|
||||
* The format name is exposed via `name()`. Of the `Plugin` lifecycle methods,
|
||||
* `initialize` and `shutdown` have no-op defaults and need not be overridden;
|
||||
* `version` has no default on this interface and must be implemented.
|
||||
*
|
||||
* # Thread Safety
|
||||
*
|
||||
* Renderers must be `Send + Sync` (inherited from `Plugin`).
|
||||
*/
|
||||
interface IRenderer {
    /** Returns the name of this renderer plugin. */
    fun name(): String

    /** Returns the version string of this renderer plugin. */
    fun version(): String

    /** Lifecycle hook; the default implementation does nothing. */
    fun initialize() = Unit

    /** Lifecycle hook; the default implementation does nothing. */
    fun shutdown() = Unit

    /**
     * Renders a document to this renderer's output format.
     *
     * The upstream documentation names the input an `InternalDocument`; in this
     * binding the parameter type is [ExtractionResult].
     *
     * **Errors:**
     *
     * An error is reported if rendering fails.
     *
     * @param doc the document to render
     * @return the rendered output as a string
     */
    fun render(doc: ExtractionResult): String
}
|
||||
164
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/IValidator.kt
generated
Normal file
164
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/IValidator.kt
generated
Normal file
@@ -0,0 +1,164 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
|
||||
* Trait for validator plugins.
|
||||
*
|
||||
* Validators check extraction results for quality, completeness, or correctness.
|
||||
* Unlike post-processors, validator errors **fail fast** - if a validator returns
|
||||
* an error, the extraction fails immediately.
|
||||
*
|
||||
* # Use Cases
|
||||
*
|
||||
* - **Quality Gates**: Ensure extracted content meets minimum quality standards
|
||||
* - **Compliance**: Verify content meets regulatory requirements
|
||||
* - **Content Filtering**: Reject documents containing unwanted content
|
||||
* - **Format Validation**: Verify extracted content structure
|
||||
* - **Security Checks**: Scan for malicious content
|
||||
*
|
||||
* # Error Handling
|
||||
*
|
||||
* Validator errors are **fatal** - they cause the extraction to fail and bubble up
|
||||
* to the caller. Use validators for hard requirements that must be met.
|
||||
*
|
||||
* For non-fatal checks, use post-processors instead.
|
||||
*
|
||||
* # Thread Safety
|
||||
*
|
||||
* Validators must be thread-safe (`Send + Sync`).
|
||||
*/
|
||||
interface IValidator {
    /** Returns the name of this validator plugin. */
    fun name(): String

    /** Returns the version string of this validator plugin. */
    fun version(): String

    /** Lifecycle hook; the default implementation does nothing. */
    fun initialize() {}

    /** Lifecycle hook; the default implementation does nothing. */
    fun shutdown() {}

    /**
     * Validate an extraction result.
     *
     * Check the extraction result and complete normally if it is valid, or
     * report an error if validation fails.
     *
     * **Returns:**
     *
     * Nothing. In the upstream trait:
     *
     * - `Ok(())` if validation passes
     * - `Err(...)` if validation fails (extraction will fail)
     *
     * NOTE(review): in this binding a validation failure is presumably signalled
     * by throwing - confirm against the bridge.
     *
     * **Errors:**
     *
     * - `KreuzbergError.Validation` - Validation failed
     * - Any other error type appropriate for the failure
     *
     * # Example - Content Length Validation
     *
     * ```rust
     * async fn validate(&self, result: &ExtractionResult, config: &ExtractionConfig)
     *     -> Result<()> {
     *     let length = result.content.len();
     *
     *     if length < self.min {
     *         return Err(KreuzbergError::validation(format!(
     *             "Content too short: {} < {} characters",
     *             length, self.min
     *         )));
     *     }
     *
     *     if length > self.max {
     *         return Err(KreuzbergError::validation(format!(
     *             "Content too long: {} > {} characters",
     *             length, self.max
     *         )));
     *     }
     *
     *     Ok(())
     * }
     * ```
     *
     * # Example - Quality Score Validation
     *
     * ```rust
     * async fn validate(&self, result: &ExtractionResult, config: &ExtractionConfig)
     *     -> Result<()> {
     *     // Check if quality_score exists in metadata
     *     let score = result.metadata
     *         .additional
     *         .get("quality_score")
     *         .and_then(|v| v.as_f64())
     *         .unwrap_or(0.0);
     *
     *     if score < self.min_score {
     *         return Err(KreuzbergError::validation(format!(
     *             "Quality score too low: {} < {}",
     *             score, self.min_score
     *         )));
     *     }
     *
     *     Ok(())
     * }
     * ```
     *
     * # Example - Security Validation
     *
     * ```rust
     * async fn validate(&self, result: &ExtractionResult, config: &ExtractionConfig)
     *     -> Result<()> {
     *     // Check for blocked patterns
     *     for pattern in &self.blocked_patterns {
     *         if result.content.contains(pattern) {
     *             return Err(KreuzbergError::validation(format!(
     *                 "Content contains blocked pattern: {}",
     *                 pattern
     *             )));
     *         }
     *     }
     *
     *     Ok(())
     * }
     * ```
     *
     * @param result the extraction result to check
     * @param config extraction configuration in effect
     */
    suspend fun validate(result: ExtractionResult, config: ExtractionConfig)

    /**
     * Optional: Check if this validator should run for a given result.
     *
     * Allows conditional validation based on MIME type, metadata, or content.
     * The default implementation returns `true` (always run).
     *
     * **Returns:**
     *
     * `true` if the validator should run, `false` to skip.
     *
     * @param result the extraction result about to be validated
     * @param config extraction configuration in effect
     */
    fun shouldValidate(result: ExtractionResult, config: ExtractionConfig): Boolean = true

    /**
     * Optional: Get the validation priority.
     *
     * Higher priority validators run first. Useful for ordering validation checks
     * (e.g., run cheap validations before expensive ones).
     *
     * The default implementation returns 50.
     *
     * **Returns:**
     *
     * Priority value (higher = runs earlier).
     */
    fun priority(): Int = 50
}
|
||||
95
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/ImageExtractionConfig.kt
generated
Normal file
95
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/ImageExtractionConfig.kt
generated
Normal file
@@ -0,0 +1,95 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * Settings that control how images are extracted from documents.
 *
 * @property extractImages Whether images are extracted from documents.
 * @property targetDpi Target DPI used for image normalization.
 * @property maxImageDimension Maximum dimension for images (applies to width or height).
 * @property injectPlaceholders Whether image reference placeholders are injected into the
 *   markdown output. When `true` (default), image references are appended to the markdown.
 *   Set to `false` to extract images as data without polluting the markdown output.
 * @property autoAdjustDpi Whether the DPI is adjusted automatically based on image content.
 * @property minDpi Minimum DPI threshold.
 * @property maxDpi Maximum DPI threshold.
 * @property maxImagesPerPage Maximum number of image objects to extract per PDF page.
 *   Some PDFs (e.g. technical diagrams stored as thousands of raster fragments) can trigger
 *   extremely long or indefinite extraction times when every image object on a dense page is
 *   decoded individually via the PDF extractor. Setting this limit causes kreuzberg to stop
 *   collecting individual images once the count per page reaches the cap and emit a warning
 *   instead. `null` (default) means no limit — all images are extracted.
 * @property classify When `true` (default), extracted images are classified by kind and
 *   grouped into clusters where they appear to belong to one figure.
 * @property includePageRasters When `true`, full-page renders produced during OCR
 *   preprocessing are captured and returned as [ImageKind.PAGE_RASTER] entries in
 *   `ExtractionResult.images`. **PDF + OCR only.** No rasters are captured for non-PDF inputs
 *   or when the document-level OCR bypass is active (whole-document backend). When OCR is
 *   enabled and this flag is set but the active backend skips per-page rendering, a
 *   `ProcessingWarning` is emitted in `ExtractionResult.processing_warnings`. Defaults to
 *   `false`. Enable when downstream consumers need page thumbnails (e.g. citation previews,
 *   visual grounding).
 * @property runOcrOnImages Whether OCR is run on extracted images and the recognized text is
 *   included in the document content. When `true` (default) and `ExtractionConfig.ocr` is
 *   configured, extracted images are processed with the configured OCR backend. Set to
 *   `false` to extract images without OCR processing, even when OCR is enabled.
 * @property ocrTextOnly When `true`, image OCR results are rendered as plain text without the
 *   markdown image placeholder. Only takes effect when [runOcrOnImages] is also `true`.
 * @property appendOcrText When `true` and [ocrTextOnly] is `false`, the OCR text is appended
 *   after the image placeholder in the rendered output.
 */
data class ImageExtractionConfig(
    val extractImages: Boolean = true,
    val targetDpi: Int = 300,
    val maxImageDimension: Int = 4096,
    val injectPlaceholders: Boolean = true,
    val autoAdjustDpi: Boolean = true,
    val minDpi: Int = 72,
    val maxDpi: Int = 600,
    val maxImagesPerPage: Int? = null,
    val classify: Boolean = true,
    val includePageRasters: Boolean = false,
    val runOcrOnImages: Boolean = true,
    val ocrTextOnly: Boolean = false,
    val appendOcrText: Boolean = false,
)
|
||||
99
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/ImageKind.kt
generated
Normal file
99
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/ImageKind.kt
generated
Normal file
@@ -0,0 +1,99 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * Heuristic classification of what an image likely depicts.
 *
 * Each constant has a lowercase snake_case wire name, exposed through [toWire]
 * and parsed back by [fromWire].
 */
enum class ImageKind {
    /** Photographic image (natural scene, photograph). */
    @com.fasterxml.jackson.annotation.JsonProperty("photograph")
    PHOTOGRAPH,

    /** Technical or schematic diagram. */
    @com.fasterxml.jackson.annotation.JsonProperty("diagram")
    DIAGRAM,

    /** Chart, graph, or plot. */
    @com.fasterxml.jackson.annotation.JsonProperty("chart")
    CHART,

    /** Freehand or technical drawing. */
    @com.fasterxml.jackson.annotation.JsonProperty("drawing")
    DRAWING,

    /** Text-heavy image (scanned text, document). */
    @com.fasterxml.jackson.annotation.JsonProperty("text_block")
    TEXT_BLOCK,

    /** Decorative element or border. */
    @com.fasterxml.jackson.annotation.JsonProperty("decoration")
    DECORATION,

    /** Logo or brand mark. */
    @com.fasterxml.jackson.annotation.JsonProperty("logo")
    LOGO,

    /** Small icon. */
    @com.fasterxml.jackson.annotation.JsonProperty("icon")
    ICON,

    /** Fragment of a larger tiled image (tile of a technical drawing). */
    @com.fasterxml.jackson.annotation.JsonProperty("tile_fragment")
    TILE_FRAGMENT,

    /** Mask or transparency map. */
    @com.fasterxml.jackson.annotation.JsonProperty("mask")
    MASK,

    /** Full-page render produced during OCR preprocessing; used as a citation thumbnail. */
    @com.fasterxml.jackson.annotation.JsonProperty("page_raster")
    PAGE_RASTER,

    /** Could not classify with reasonable confidence. */
    @com.fasterxml.jackson.annotation.JsonProperty("unknown")
    UNKNOWN;

    /**
     * Returns the wire name of this constant.
     *
     * The `when` is deliberately exhaustive without an `else` branch so that a
     * newly added constant fails compilation until it is given a wire name.
     */
    @com.fasterxml.jackson.annotation.JsonValue
    fun toWire(): String {
        return when (this) {
            PHOTOGRAPH -> "photograph"
            DIAGRAM -> "diagram"
            CHART -> "chart"
            DRAWING -> "drawing"
            TEXT_BLOCK -> "text_block"
            DECORATION -> "decoration"
            LOGO -> "logo"
            ICON -> "icon"
            TILE_FRAGMENT -> "tile_fragment"
            MASK -> "mask"
            PAGE_RASTER -> "page_raster"
            UNKNOWN -> "unknown"
        }
    }

    companion object {
        /**
         * Resolves a wire name back to its constant.
         *
         * @param value wire name as produced by [toWire]; matching is exact and case-sensitive.
         * @return the constant whose wire name equals [value].
         * @throws IllegalArgumentException if no constant has that wire name.
         */
        @com.fasterxml.jackson.annotation.JsonCreator
        @JvmStatic
        fun fromWire(value: String): ImageKind {
            val match = values().firstOrNull { kind -> kind.toWire() == value }
            return match ?: throw IllegalArgumentException("Unknown ImageKind value: $value")
        }
    }
}
|
||||
39
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/ImageMetadata.kt
generated
Normal file
39
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/ImageMetadata.kt
generated
Normal file
@@ -0,0 +1,39 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * Metadata extracted from an image file: dimensions, format, and EXIF data.
 *
 * @property width Image width in pixels.
 * @property height Image height in pixels.
 * @property format Image format (e.g., "PNG", "JPEG", "TIFF").
 * @property exif EXIF metadata tags.
 */
data class ImageMetadata(
    val width: Int = 0,
    val height: Int = 0,
    val format: String = "",
    val exif: Map<String, String> = emptyMap(),
)
|
||||
39
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/ImageMetadataType.kt
generated
Normal file
39
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/ImageMetadataType.kt
generated
Normal file
@@ -0,0 +1,39 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * Metadata describing an image element.
 *
 * @property src Image source (URL, data URI, or SVG content).
 * @property alt Alternative text from the alt attribute, or `null` when not provided.
 * @property title Title attribute, or `null` when not provided.
 * @property dimensions Image dimensions as (width, height) if available, otherwise `null`.
 * @property imageType Image type classification.
 * @property attributes Additional attributes as key-value pairs.
 */
data class ImageMetadataType(
    val src: String,
    val alt: String? = null,
    val title: String? = null,
    val dimensions: List<Int>? = null,
    val imageType: ImageType,
    val attributes: List<List<String>> = emptyList(),
)
|
||||
47
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/ImagePreprocessingConfig.kt
generated
Normal file
47
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/ImagePreprocessingConfig.kt
generated
Normal file
@@ -0,0 +1,47 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
/**
 * Configuration for preprocessing images ahead of OCR.
 *
 * These settings control how images are preprocessed before OCR to improve
 * text recognition quality. Different preprocessing strategies work better
 * for different document types.
 *
 * @property targetDpi Target DPI for the image (300 is standard, 600 for small text).
 * @property autoRotate Auto-detect and correct image rotation.
 * @property deskew Correct skew (tilted images).
 * @property denoise Remove noise from the image.
 * @property contrastEnhance Enhance contrast for better text visibility.
 * @property binarizationMethod Binarization method: "otsu", "sauvola", "adaptive".
 * @property invertColors Invert colors (white text on black → black on white).
 */
data class ImagePreprocessingConfig(
    val targetDpi: Int = 300,
    val autoRotate: Boolean = true,
    val deskew: Boolean = true,
    val denoise: Boolean = false,
    val contrastEnhance: Boolean = false,
    val binarizationMethod: String = "otsu",
    val invertColors: Boolean = false,
)
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user