Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,67 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
import org.jspecify.annotations.Nullable;
/**
 * Hardware acceleration configuration for ONNX Runtime models.
 *
 * <p>Controls which execution provider (CPU, CoreML, CUDA, TensorRT) is used
 * for inference in layout detection and embedding generation.
 *
 * @param provider execution provider to use for ONNX inference
 * @param deviceId GPU device ID (for CUDA/TensorRT); ignored for CPU/CoreML/Auto
 */
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = AccelerationConfig.Builder.class)
public record AccelerationConfig(
        @JsonProperty("provider") @Nullable ExecutionProviderType provider,
        @JsonProperty("device_id") @Nullable Integer deviceId) {

    /**
     * Returns a new {@link Builder} holding the default field values.
     *
     * @return a fresh builder instance
     */
    public static Builder builder() {
        return new Builder();
    }

    // CPD-OFF
    /**
     * Fluent builder, also used by Jackson to deserialize {@link AccelerationConfig}.
     *
     * <p>Until overridden, {@code provider} is {@code ExecutionProviderType.Auto}
     * and {@code deviceId} is {@code null}.
     */
    @JsonPOJOBuilder(buildMethodName = "build", withPrefix = "with")
    public static final class Builder {
        private @Nullable ExecutionProviderType provider = ExecutionProviderType.Auto;

        @JsonProperty("device_id")
        private Integer deviceId;

        /**
         * Sets the provider field.
         *
         * @param value new provider, may be {@code null}
         * @return this builder
         */
        @JsonProperty("provider")
        public Builder withProvider(final @Nullable ExecutionProviderType value) {
            provider = value;
            return this;
        }

        /**
         * Sets the deviceId field.
         *
         * @param value new device ID, may be {@code null}
         * @return this builder
         */
        @JsonProperty("device_id")
        public Builder withDeviceId(final @Nullable Integer value) {
            deviceId = value;
            return this;
        }

        /**
         * Builds the AccelerationConfig instance from the current builder state.
         *
         * @return a new record populated with the values set so far
         */
        public AccelerationConfig build() {
            return new AccelerationConfig(provider, deviceId);
        }
    }
    // CPD-ON
}

View File

@@ -0,0 +1,77 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonSubTypes;
import com.fasterxml.jackson.annotation.JsonTypeInfo;
import java.util.Optional;
/**
 * Types of inline text annotations.
 *
 * <p>Polymorphic JSON type: the {@code annotation_type} property selects the
 * variant by the names registered in {@link JsonSubTypes}. Unknown JSON
 * properties are ignored during deserialization.
 */
@JsonTypeInfo(use = JsonTypeInfo.Id.NAME, property = "annotation_type", visible = false)
@JsonSubTypes({
    @JsonSubTypes.Type(name = "bold", value = AnnotationKind.Bold.class),
    @JsonSubTypes.Type(name = "italic", value = AnnotationKind.Italic.class),
    @JsonSubTypes.Type(name = "underline", value = AnnotationKind.Underline.class),
    @JsonSubTypes.Type(name = "strikethrough", value = AnnotationKind.Strikethrough.class),
    @JsonSubTypes.Type(name = "code", value = AnnotationKind.Code.class),
    @JsonSubTypes.Type(name = "subscript", value = AnnotationKind.Subscript.class),
    @JsonSubTypes.Type(name = "superscript", value = AnnotationKind.Superscript.class),
    @JsonSubTypes.Type(name = "link", value = AnnotationKind.Link.class),
    @JsonSubTypes.Type(name = "highlight", value = AnnotationKind.Highlight.class),
    @JsonSubTypes.Type(name = "color", value = AnnotationKind.Color.class),
    @JsonSubTypes.Type(name = "font_size", value = AnnotationKind.FontSize.class),
    @JsonSubTypes.Type(name = "custom", value = AnnotationKind.Custom.class)
})
@com.fasterxml.jackson.annotation.JsonIgnoreProperties(ignoreUnknown = true)
public sealed interface AnnotationKind {

    /** Field-less variant registered under the type name {@code "bold"}. */
    record Bold() implements AnnotationKind {}

    /** Field-less variant registered under the type name {@code "italic"}. */
    record Italic() implements AnnotationKind {}

    /** Field-less variant registered under the type name {@code "underline"}. */
    record Underline() implements AnnotationKind {}

    /** Field-less variant registered under the type name {@code "strikethrough"}. */
    record Strikethrough() implements AnnotationKind {}

    /** Field-less variant registered under the type name {@code "code"}. */
    record Code() implements AnnotationKind {}

    /** Field-less variant registered under the type name {@code "subscript"}. */
    record Subscript() implements AnnotationKind {}

    /** Field-less variant registered under the type name {@code "superscript"}. */
    record Superscript() implements AnnotationKind {}

    /**
     * Variant registered under the type name {@code "link"}.
     *
     * @param url value of the {@code url} JSON property
     * @param title value of the {@code title} JSON property, wrapped in an {@link Optional}
     */
    record Link(
            @JsonProperty("url") String url,
            @JsonProperty("title") Optional<String> title) implements AnnotationKind {}

    /** Highlighted text (PDF highlights, HTML {@code &lt;mark&gt;}). */
    record Highlight() implements AnnotationKind {}

    /**
     * Text color (CSS-compatible value, e.g. "#ff0000", "red").
     *
     * @param value the color value
     */
    record Color(@JsonProperty("value") String value) implements AnnotationKind {}

    /**
     * Font size with units (e.g. "12pt", "1.2em", "16px").
     *
     * @param value the font size value
     */
    record FontSize(@JsonProperty("value") String value) implements AnnotationKind {}

    /**
     * Extensible annotation for format-specific styling.
     *
     * @param name value of the {@code name} JSON property
     * @param value value of the {@code value} JSON property, wrapped in an {@link Optional}
     */
    record Custom(
            @JsonProperty("name") String name,
            @JsonProperty("value") Optional<String> value) implements AnnotationKind {}
}

View File

@@ -0,0 +1,79 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
/**
 * A single file extracted from an archive.
 *
 * <p>When archives (ZIP, TAR, 7Z, GZIP) are extracted with recursive extraction
 * enabled, each processable file produces its own full {@code ExtractionResult}.
 *
 * @param path archive-relative file path (e.g. "folder/document.pdf")
 * @param mimeType detected MIME type of the file
 * @param result full extraction result for this file
 */
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = ArchiveEntry.Builder.class)
public record ArchiveEntry(
        @JsonProperty("path") String path,
        @JsonProperty("mime_type") String mimeType,
        @JsonProperty("result") ExtractionResult result) {

    /**
     * Returns a new {@link Builder} holding the default field values.
     *
     * @return a fresh builder instance
     */
    public static Builder builder() {
        return new Builder();
    }

    // CPD-OFF
    /**
     * Fluent builder, also used by Jackson to deserialize {@link ArchiveEntry}.
     *
     * <p>Defaults: {@code path} and {@code mimeType} are empty strings,
     * {@code result} is {@code null}.
     */
    @JsonPOJOBuilder(buildMethodName = "build", withPrefix = "with")
    public static final class Builder {
        private String path = "";

        @JsonProperty("mime_type")
        private String mimeType = "";

        private ExtractionResult result;

        /**
         * Sets the path field.
         *
         * @param value new path
         * @return this builder
         */
        @JsonProperty("path")
        public Builder withPath(final String value) {
            path = value;
            return this;
        }

        /**
         * Sets the mimeType field.
         *
         * @param value new MIME type
         * @return this builder
         */
        @JsonProperty("mime_type")
        public Builder withMimeType(final String value) {
            mimeType = value;
            return this;
        }

        /**
         * Sets the result field.
         *
         * @param value new extraction result
         * @return this builder
         */
        @JsonProperty("result")
        public Builder withResult(final ExtractionResult value) {
            result = value;
            return this;
        }

        /**
         * Builds the ArchiveEntry instance from the current builder state.
         *
         * @return a new record populated with the values set so far
         */
        public ArchiveEntry build() {
            return new ArchiveEntry(path, mimeType, result);
        }
    }
    // CPD-ON
}

View File

@@ -0,0 +1,109 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import java.util.List;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
import org.jspecify.annotations.Nullable;
/**
* Archive (ZIP/TAR/7Z) metadata.
*
* Extracted from compressed archive files containing file lists and size information.
*/
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = ArchiveMetadata.Builder.class)
public record ArchiveMetadata(
/**
* Archive format ("ZIP", "TAR", "7Z", etc.)
*/
@JsonProperty("format") String format,
/**
* Total number of files in the archive
*/
@JsonProperty("file_count") int fileCount,
/**
* List of file paths within the archive
*/
@JsonProperty("file_list") List<String> fileList,
/**
* Total uncompressed size in bytes
*/
@JsonProperty("total_size") long totalSize,
/**
* Compressed size in bytes (if available)
*/
@Nullable @JsonProperty("compressed_size") Long compressedSize
) {
public static Builder builder() {
return new Builder();
}
// CPD-OFF
@JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
public static final class Builder {
private String format = "";
@JsonProperty("file_count")
private int fileCount = 0;
@JsonProperty("file_list")
private List<String> fileList = List.of();
@JsonProperty("total_size")
private long totalSize = 0;
@JsonProperty("compressed_size")
private Long compressedSize = null;
/** Sets the format field. */
@JsonProperty("format")
public Builder withFormat(final String value) {
this.format = value;
return this;
}
/** Sets the fileCount field. */
@JsonProperty("file_count")
public Builder withFileCount(final int value) {
this.fileCount = value;
return this;
}
/** Sets the fileList field. */
@JsonProperty("file_list")
public Builder withFileList(final List<String> value) {
this.fileList = value;
return this;
}
/** Sets the totalSize field. */
@JsonProperty("total_size")
public Builder withTotalSize(final long value) {
this.totalSize = value;
return this;
}
/** Sets the compressedSize field. */
@JsonProperty("compressed_size")
public Builder withCompressedSize(final @Nullable long value) {
this.compressedSize = value;
return this;
}
/** Builds the ArchiveMetadata instance. */
public ArchiveMetadata build() {
return new ArchiveMetadata(
format,
fileCount,
fileList,
totalSize,
compressedSize
);
}
}
// CPD-ON
}

76
packages/java/dev/kreuzberg/BBox.java generated Normal file
View File

@@ -0,0 +1,76 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
/**
 * Bounding box in original image coordinates (x1, y1) top-left, (x2, y2) bottom-right.
 *
 * @param x1 x-coordinate of the top-left corner
 * @param y1 y-coordinate of the top-left corner
 * @param x2 x-coordinate of the bottom-right corner
 * @param y2 y-coordinate of the bottom-right corner
 */
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = BBox.Builder.class)
public record BBox(
        @JsonProperty("x1") float x1,
        @JsonProperty("y1") float y1,
        @JsonProperty("x2") float x2,
        @JsonProperty("y2") float y2) {

    /**
     * Returns a new {@link Builder} with every coordinate at {@code 0.0f}.
     *
     * @return a fresh builder instance
     */
    public static Builder builder() {
        return new Builder();
    }

    // CPD-OFF
    /** Fluent builder, also used by Jackson to deserialize {@link BBox}. */
    @JsonPOJOBuilder(buildMethodName = "build", withPrefix = "with")
    public static final class Builder {
        private float x1 = 0.0f;
        private float y1 = 0.0f;
        private float x2 = 0.0f;
        private float y2 = 0.0f;

        /**
         * Sets the x1 field.
         *
         * @param value new x1 coordinate
         * @return this builder
         */
        @JsonProperty("x1")
        public Builder withX1(final float value) {
            x1 = value;
            return this;
        }

        /**
         * Sets the y1 field.
         *
         * @param value new y1 coordinate
         * @return this builder
         */
        @JsonProperty("y1")
        public Builder withY1(final float value) {
            y1 = value;
            return this;
        }

        /**
         * Sets the x2 field.
         *
         * @param value new x2 coordinate
         * @return this builder
         */
        @JsonProperty("x2")
        public Builder withX2(final float value) {
            x2 = value;
            return this;
        }

        /**
         * Sets the y2 field.
         *
         * @param value new y2 coordinate
         * @return this builder
         */
        @JsonProperty("y2")
        public Builder withY2(final float value) {
            y2 = value;
            return this;
        }

        /**
         * Builds the BBox instance from the current builder state.
         *
         * @return a new record populated with the values set so far
         */
        public BBox build() {
            return new BBox(x1, y1, x2, y2);
        }
    }
    // CPD-ON
}

View File

@@ -0,0 +1,81 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonSerialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
import org.jspecify.annotations.Nullable;
/**
 * Batch item for byte array extraction.
 *
 * <p>Used with {@code batch_extract_bytes} and {@code batch_extract_bytes_sync}
 * to represent a single item in a batch extraction job.
 *
 * <p>The {@code content} array is stored and exposed without copying, and it is
 * serialized through {@code ByteArrayToIntArraySerializer}.
 *
 * @param content the content bytes to extract from
 * @param mimeType MIME type of the content (e.g., "application/pdf", "text/html")
 * @param config per-item configuration overrides (null uses batch-level defaults)
 */
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = BatchBytesItem.Builder.class)
public record BatchBytesItem(
        @JsonProperty("content") @JsonSerialize(using = ByteArrayToIntArraySerializer.class) byte[] content,
        @JsonProperty("mime_type") String mimeType,
        @JsonProperty("config") @Nullable FileExtractionConfig config) {

    /**
     * Returns a new {@link Builder} holding the default field values.
     *
     * @return a fresh builder instance
     */
    public static Builder builder() {
        return new Builder();
    }

    // CPD-OFF
    /**
     * Fluent builder, also used by Jackson to deserialize {@link BatchBytesItem}.
     *
     * <p>Defaults: {@code content} is an empty array, {@code mimeType} is the
     * empty string and {@code config} is {@code null}.
     */
    @JsonPOJOBuilder(buildMethodName = "build", withPrefix = "with")
    public static final class Builder {
        private byte[] content = new byte[0];

        @JsonProperty("mime_type")
        private String mimeType = "";

        private FileExtractionConfig config;

        /**
         * Sets the content field.
         *
         * @param value new content bytes (the array reference is kept as-is)
         * @return this builder
         */
        @JsonProperty("content")
        public Builder withContent(final byte[] value) {
            content = value;
            return this;
        }

        /**
         * Sets the mimeType field.
         *
         * @param value new MIME type
         * @return this builder
         */
        @JsonProperty("mime_type")
        public Builder withMimeType(final String value) {
            mimeType = value;
            return this;
        }

        /**
         * Sets the config field.
         *
         * @param value new per-item configuration, may be {@code null}
         * @return this builder
         */
        @JsonProperty("config")
        public Builder withConfig(final @Nullable FileExtractionConfig value) {
            config = value;
            return this;
        }

        /**
         * Builds the BatchBytesItem instance from the current builder state.
         *
         * @return a new record populated with the values set so far
         */
        public BatchBytesItem build() {
            return new BatchBytesItem(content, mimeType, config);
        }
    }
    // CPD-ON
}

View File

@@ -0,0 +1,66 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
import org.jspecify.annotations.Nullable;
/**
 * Batch item for file extraction.
 *
 * <p>Used with {@code batch_extract_files} and {@code batch_extract_files_sync}
 * to represent a single file in a batch extraction job.
 *
 * @param path path to the file to extract from
 * @param config per-file configuration overrides (null uses batch-level defaults)
 */
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = BatchFileItem.Builder.class)
public record BatchFileItem(
        @JsonProperty("path") java.nio.file.Path path,
        @JsonProperty("config") @Nullable FileExtractionConfig config) {

    /**
     * Returns a new {@link Builder}; both fields start out as {@code null}.
     *
     * @return a fresh builder instance
     */
    public static Builder builder() {
        return new Builder();
    }

    // CPD-OFF
    /** Fluent builder, also used by Jackson to deserialize {@link BatchFileItem}. */
    @JsonPOJOBuilder(buildMethodName = "build", withPrefix = "with")
    public static final class Builder {
        private java.nio.file.Path path;
        private FileExtractionConfig config;

        /**
         * Sets the path field.
         *
         * @param value new file path
         * @return this builder
         */
        @JsonProperty("path")
        public Builder withPath(final java.nio.file.Path value) {
            path = value;
            return this;
        }

        /**
         * Sets the config field.
         *
         * @param value new per-file configuration, may be {@code null}
         * @return this builder
         */
        @JsonProperty("config")
        public Builder withConfig(final @Nullable FileExtractionConfig value) {
            config = value;
            return this;
        }

        /**
         * Builds the BatchFileItem instance from the current builder state.
         *
         * @return a new record populated with the values set so far
         */
        public BatchFileItem build() {
            return new BatchFileItem(path, config);
        }
    }
    // CPD-ON
}

View File

@@ -0,0 +1,96 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import java.util.List;
import java.util.Map;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
import org.jspecify.annotations.Nullable;
/**
 * BibTeX bibliography metadata.
 *
 * @param entryCount number of entries in the bibliography
 * @param citationKeys value of the {@code citation_keys} JSON property, may be {@code null}
 * @param authors value of the {@code authors} JSON property, may be {@code null}
 * @param yearRange value of the {@code year_range} JSON property, may be {@code null}
 * @param entryTypes value of the {@code entry_types} JSON property, may be {@code null}
 */
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = BibtexMetadata.Builder.class)
public record BibtexMetadata(
        @JsonProperty("entry_count") long entryCount,
        @JsonProperty("citation_keys") @Nullable List<String> citationKeys,
        @JsonProperty("authors") @Nullable List<String> authors,
        @JsonProperty("year_range") @Nullable YearRange yearRange,
        @JsonProperty("entry_types") @Nullable Map<String, Long> entryTypes) {

    /**
     * Returns a new {@link Builder} holding the default field values.
     *
     * @return a fresh builder instance
     */
    public static Builder builder() {
        return new Builder();
    }

    // CPD-OFF
    /**
     * Fluent builder, also used by Jackson to deserialize {@link BibtexMetadata}.
     *
     * <p>Defaults: {@code entryCount} is {@code 0}; every other field is {@code null}.
     */
    @JsonPOJOBuilder(buildMethodName = "build", withPrefix = "with")
    public static final class Builder {
        @JsonProperty("entry_count")
        private long entryCount = 0L;

        @JsonProperty("citation_keys")
        private List<String> citationKeys;

        private List<String> authors;

        @JsonProperty("year_range")
        private YearRange yearRange;

        @JsonProperty("entry_types")
        private Map<String, Long> entryTypes;

        /**
         * Sets the entryCount field.
         *
         * @param value new entry count
         * @return this builder
         */
        @JsonProperty("entry_count")
        public Builder withEntryCount(final long value) {
            entryCount = value;
            return this;
        }

        /**
         * Sets the citationKeys field.
         *
         * @param value new citation keys, may be {@code null}
         * @return this builder
         */
        @JsonProperty("citation_keys")
        public Builder withCitationKeys(final @Nullable List<String> value) {
            citationKeys = value;
            return this;
        }

        /**
         * Sets the authors field.
         *
         * @param value new authors, may be {@code null}
         * @return this builder
         */
        @JsonProperty("authors")
        public Builder withAuthors(final @Nullable List<String> value) {
            authors = value;
            return this;
        }

        /**
         * Sets the yearRange field.
         *
         * @param value new year range, may be {@code null}
         * @return this builder
         */
        @JsonProperty("year_range")
        public Builder withYearRange(final @Nullable YearRange value) {
            yearRange = value;
            return this;
        }

        /**
         * Sets the entryTypes field.
         *
         * @param value new entry-type map, may be {@code null}
         * @return this builder
         */
        @JsonProperty("entry_types")
        public Builder withEntryTypes(final @Nullable Map<String, Long> value) {
            entryTypes = value;
            return this;
        }

        /**
         * Builds the BibtexMetadata instance from the current builder state.
         *
         * @return a new record populated with the values set so far
         */
        public BibtexMetadata build() {
            return new BibtexMetadata(entryCount, citationKeys, authors, yearRange, entryTypes);
        }
    }
    // CPD-ON
}

View File

@@ -0,0 +1,61 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonValue;
/**
 * Types of block-level elements in Djot.
 *
 * <p>Every constant carries the string used as its JSON representation.
 */
public enum BlockType {
    Paragraph("paragraph"),
    Heading("heading"),
    Blockquote("blockquote"),
    CodeBlock("code_block"),
    ListItem("list_item"),
    OrderedList("ordered_list"),
    BulletList("bullet_list"),
    TaskList("task_list"),
    DefinitionList("definition_list"),
    DefinitionTerm("definition_term"),
    DefinitionDescription("definition_description"),
    Div("div"),
    Section("section"),
    ThematicBreak("thematic_break"),
    RawBlock("raw_block"),
    MathDisplay("math_display");

    /** The string value. */
    private final String value;

    BlockType(final String wireValue) {
        value = wireValue;
    }

    /**
     * Returns the string value; Jackson writes the constant as this string.
     *
     * @return the string value of this constant
     */
    @JsonValue
    public String getValue() {
        return this.value;
    }

    /**
     * Creates an instance from a string value, comparing case-insensitively.
     *
     * @param value string to resolve
     * @return the first constant, in declaration order, whose string value matches
     * @throws IllegalArgumentException if no constant matches (a {@code null} input never matches)
     */
    @JsonCreator
    public static BlockType fromValue(final String value) {
        return java.util.Arrays.stream(values())
                .filter(candidate -> candidate.value.equalsIgnoreCase(value))
                .findFirst()
                .orElseThrow(() -> new IllegalArgumentException("Unknown value: " + value));
    }

    /** Returns the wire-format string value (matches JSON serialization). */
    @Override
    public String toString() {
        return this.value;
    }
}

View File

@@ -0,0 +1,88 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
/**
 * Bounding box coordinates for element positioning.
 *
 * @param x0 left x-coordinate
 * @param y0 bottom y-coordinate
 * @param x1 right x-coordinate
 * @param y1 top y-coordinate
 */
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = BoundingBox.Builder.class)
public record BoundingBox(
        @JsonProperty("x0") double x0,
        @JsonProperty("y0") double y0,
        @JsonProperty("x1") double x1,
        @JsonProperty("y1") double y1) {

    /**
     * Returns a new {@link Builder} with every coordinate at {@code 0.0}.
     *
     * @return a fresh builder instance
     */
    public static Builder builder() {
        return new Builder();
    }

    // CPD-OFF
    /** Fluent builder, also used by Jackson to deserialize {@link BoundingBox}. */
    @JsonPOJOBuilder(buildMethodName = "build", withPrefix = "with")
    public static final class Builder {
        private double x0 = 0.0d;
        private double y0 = 0.0d;
        private double x1 = 0.0d;
        private double y1 = 0.0d;

        /**
         * Sets the x0 field.
         *
         * @param value new left x-coordinate
         * @return this builder
         */
        @JsonProperty("x0")
        public Builder withX0(final double value) {
            x0 = value;
            return this;
        }

        /**
         * Sets the y0 field.
         *
         * @param value new bottom y-coordinate
         * @return this builder
         */
        @JsonProperty("y0")
        public Builder withY0(final double value) {
            y0 = value;
            return this;
        }

        /**
         * Sets the x1 field.
         *
         * @param value new right x-coordinate
         * @return this builder
         */
        @JsonProperty("x1")
        public Builder withX1(final double value) {
            x1 = value;
            return this;
        }

        /**
         * Sets the y1 field.
         *
         * @param value new top y-coordinate
         * @return this builder
         */
        @JsonProperty("y1")
        public Builder withY1(final double value) {
            y1 = value;
            return this;
        }

        /**
         * Builds the BoundingBox instance from the current builder state.
         *
         * @return a new record populated with the values set so far
         */
        public BoundingBox build() {
            return new BoundingBox(x0, y0, x1, y1);
        }
    }
    // CPD-ON
}

View File

@@ -0,0 +1,35 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import com.fasterxml.jackson.core.JsonGenerator;
import com.fasterxml.jackson.databind.SerializerProvider;
import com.fasterxml.jackson.databind.ser.std.StdSerializer;
/**
 * Serialises {@code byte[]} as a JSON array of integers.
 *
 * <p>Jackson's default serialiser encodes {@code byte[]} as a base64 string, but
 * Rust's {@code serde} for {@code Vec<u8>} expects {@code [72, 101, 108, ...]}.
 * Annotate any {@code byte[]} field sent to the FFI layer with
 * {@code @JsonSerialize(using = ByteArrayToIntArraySerializer.class)}.
 */
public class ByteArrayToIntArraySerializer extends StdSerializer<byte[]> {

    /** Default constructor required by Jackson. */
    public ByteArrayToIntArraySerializer() {
        super(byte[].class);
    }

    /**
     * Writes {@code value} as a JSON array with one number per byte.
     *
     * <p>Each byte is emitted as its unsigned value (0-255), so negative Java
     * bytes never appear as negative numbers in the output.
     *
     * @param value bytes to write
     * @param gen generator receiving the array
     * @param provider serializer provider (unused)
     * @throws java.io.IOException if the generator fails to write
     */
    @Override
    public void serialize(final byte[] value, final JsonGenerator gen,
            final SerializerProvider provider) throws java.io.IOException {
        gen.writeStartArray();
        for (int i = 0; i < value.length; i++) {
            final int unsigned = Byte.toUnsignedInt(value[i]);
            gen.writeNumber(unsigned);
        }
        gen.writeEndArray();
    }
}

View File

@@ -0,0 +1,15 @@
// DO NOT EDIT - auto-generated by alef
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
package dev.kreuzberg;
/**
 * Exception in the {@code KreuzbergErrorException} hierarchy.
 *
 * <p>NOTE(review): presumably raised for cache-related failures, judging by the
 * name only — confirm against the throwing code.
 */
public class CacheException extends KreuzbergErrorException {

    /**
     * Creates a new CacheException with the given message.
     *
     * @param message detail message passed to the superclass
     */
    public CacheException(final String message) {
        super(message);
    }

    /**
     * Creates a new CacheException with the given message and cause.
     *
     * @param message detail message passed to the superclass
     * @param cause underlying cause passed to the superclass
     */
    public CacheException(final String message, final Throwable cause) {
        super(message, cause);
    }
}

View File

@@ -0,0 +1,88 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
/**
 * Value record for the {@code CacheStats} JSON object.
 *
 * <p>NOTE(review): units are implied only by the property names ({@code _mb},
 * {@code _days}) — confirm against the producer of this data.
 *
 * @param totalFiles value of the {@code total_files} JSON property
 * @param totalSizeMb value of the {@code total_size_mb} JSON property
 * @param availableSpaceMb value of the {@code available_space_mb} JSON property
 * @param oldestFileAgeDays value of the {@code oldest_file_age_days} JSON property
 * @param newestFileAgeDays value of the {@code newest_file_age_days} JSON property
 */
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = CacheStats.Builder.class)
public record CacheStats(
        @JsonProperty("total_files") long totalFiles,
        @JsonProperty("total_size_mb") double totalSizeMb,
        @JsonProperty("available_space_mb") double availableSpaceMb,
        @JsonProperty("oldest_file_age_days") double oldestFileAgeDays,
        @JsonProperty("newest_file_age_days") double newestFileAgeDays) {

    /**
     * Returns a new {@link Builder} with every field at zero.
     *
     * @return a fresh builder instance
     */
    public static Builder builder() {
        return new Builder();
    }

    // CPD-OFF
    /** Fluent builder, also used by Jackson to deserialize {@link CacheStats}. */
    @JsonPOJOBuilder(buildMethodName = "build", withPrefix = "with")
    public static final class Builder {
        @JsonProperty("total_files")
        private long totalFiles = 0L;

        @JsonProperty("total_size_mb")
        private double totalSizeMb = 0.0d;

        @JsonProperty("available_space_mb")
        private double availableSpaceMb = 0.0d;

        @JsonProperty("oldest_file_age_days")
        private double oldestFileAgeDays = 0.0d;

        @JsonProperty("newest_file_age_days")
        private double newestFileAgeDays = 0.0d;

        /**
         * Sets the totalFiles field.
         *
         * @param value new total file count
         * @return this builder
         */
        @JsonProperty("total_files")
        public Builder withTotalFiles(final long value) {
            totalFiles = value;
            return this;
        }

        /**
         * Sets the totalSizeMb field.
         *
         * @param value new total size
         * @return this builder
         */
        @JsonProperty("total_size_mb")
        public Builder withTotalSizeMb(final double value) {
            totalSizeMb = value;
            return this;
        }

        /**
         * Sets the availableSpaceMb field.
         *
         * @param value new available space
         * @return this builder
         */
        @JsonProperty("available_space_mb")
        public Builder withAvailableSpaceMb(final double value) {
            availableSpaceMb = value;
            return this;
        }

        /**
         * Sets the oldestFileAgeDays field.
         *
         * @param value new oldest-file age
         * @return this builder
         */
        @JsonProperty("oldest_file_age_days")
        public Builder withOldestFileAgeDays(final double value) {
            oldestFileAgeDays = value;
            return this;
        }

        /**
         * Sets the newestFileAgeDays field.
         *
         * @param value new newest-file age
         * @return this builder
         */
        @JsonProperty("newest_file_age_days")
        public Builder withNewestFileAgeDays(final double value) {
            newestFileAgeDays = value;
            return this;
        }

        /**
         * Builds the CacheStats instance from the current builder state.
         *
         * @return a new record populated with the values set so far
         */
        public CacheStats build() {
            return new CacheStats(
                    totalFiles, totalSizeMb, availableSpaceMb, oldestFileAgeDays, newestFileAgeDays);
        }
    }
    // CPD-ON
}

View File

@@ -0,0 +1,15 @@
// DO NOT EDIT - auto-generated by alef
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
package dev.kreuzberg;
/**
 * Exception in the {@code KreuzbergErrorException} hierarchy.
 *
 * <p>NOTE(review): presumably signals a cancelled operation, judging by the
 * name only — confirm against the throwing code.
 */
public class CancelledException extends KreuzbergErrorException {

    /**
     * Creates a new CancelledException with the given message.
     *
     * @param message detail message passed to the superclass
     */
    public CancelledException(final String message) {
        super(message);
    }

    /**
     * Creates a new CancelledException with the given message and cause.
     *
     * @param message detail message passed to the superclass
     * @param cause underlying cause passed to the superclass
     */
    public CancelledException(final String message, final Throwable cause) {
        super(message, cause);
    }
}

View File

@@ -0,0 +1,92 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
/**
 * A single changed cell within a table.
 *
 * <p>Defined here (rather than only in {@code crate.diff}) so {@code RevisionDelta} can
 * reference it unconditionally, without requiring the {@code diff} Cargo feature.
 * {@code crate.diff} re-exports this type verbatim.
 *
 * @param row zero-based row index
 * @param col zero-based column index
 * @param from value before the change
 * @param to value after the change
 */
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = CellChange.Builder.class)
public record CellChange(
        @JsonProperty("row") long row,
        @JsonProperty("col") long col,
        @JsonProperty("from") String from,
        @JsonProperty("to") String to) {

    /**
     * Returns a new {@link Builder} holding the default field values.
     *
     * @return a fresh builder instance
     */
    public static Builder builder() {
        return new Builder();
    }

    // CPD-OFF
    /**
     * Fluent builder, also used by Jackson to deserialize {@link CellChange}.
     *
     * <p>Defaults: indices are {@code 0}; {@code from} and {@code to} are empty strings.
     */
    @JsonPOJOBuilder(buildMethodName = "build", withPrefix = "with")
    public static final class Builder {
        private long row = 0L;
        private long col = 0L;
        private String from = "";
        private String to = "";

        /**
         * Sets the row field.
         *
         * @param value new row index
         * @return this builder
         */
        @JsonProperty("row")
        public Builder withRow(final long value) {
            row = value;
            return this;
        }

        /**
         * Sets the col field.
         *
         * @param value new column index
         * @return this builder
         */
        @JsonProperty("col")
        public Builder withCol(final long value) {
            col = value;
            return this;
        }

        /**
         * Sets the from field.
         *
         * @param value new "before" value
         * @return this builder
         */
        @JsonProperty("from")
        public Builder withFrom(final String value) {
            from = value;
            return this;
        }

        /**
         * Sets the to field.
         *
         * @param value new "after" value
         * @return this builder
         */
        @JsonProperty("to")
        public Builder withTo(final String value) {
            to = value;
            return this;
        }

        /**
         * Builds the CellChange instance from the current builder state.
         *
         * @return a new record populated with the values set so far
         */
        public CellChange build() {
            return new CellChange(row, col, from, to);
        }
    }
    // CPD-ON
}

101
packages/java/dev/kreuzberg/Chunk.java generated Normal file
View File

@@ -0,0 +1,101 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import java.util.List;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
import org.jspecify.annotations.Nullable;
/**
 * A text chunk with optional embedding and metadata.
 *
 * <p>Chunks are created when chunking is enabled in {@code ExtractionConfig}. Each chunk
 * contains the text content, optional embedding vector (if embedding generation
 * is configured), and metadata about its position in the document.
 *
 * @param content the text content of this chunk
 * @param chunkType semantic structural classification of this chunk; assigned by the
 *     heuristic classifier based on content patterns and heading context, and defaults
 *     to {@code ChunkType.Unknown} when no rule matches
 * @param embedding optional embedding vector for this chunk; only populated when
 *     {@code EmbeddingConfig} is provided in chunking configuration, with a
 *     dimensionality that depends on the chosen embedding model
 * @param metadata metadata about this chunk's position and properties
 */
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = Chunk.Builder.class)
public record Chunk(
        @JsonProperty("content") String content,
        @JsonProperty("chunk_type") @Nullable ChunkType chunkType,
        @JsonProperty("embedding") @Nullable List<Float> embedding,
        @JsonProperty("metadata") ChunkMetadata metadata) {

    /**
     * Returns a new {@link Builder} holding the default field values.
     *
     * @return a fresh builder instance
     */
    public static Builder builder() {
        return new Builder();
    }

    // CPD-OFF
    /**
     * Fluent builder, also used by Jackson to deserialize {@link Chunk}.
     *
     * <p>Defaults: {@code content} is the empty string, {@code chunkType} is
     * {@code ChunkType.Unknown}, {@code embedding} and {@code metadata} are {@code null}.
     */
    @JsonPOJOBuilder(buildMethodName = "build", withPrefix = "with")
    public static final class Builder {
        private String content = "";

        @JsonProperty("chunk_type")
        private @Nullable ChunkType chunkType = ChunkType.Unknown;

        private List<Float> embedding;
        private ChunkMetadata metadata;

        /**
         * Sets the content field.
         *
         * @param value new text content
         * @return this builder
         */
        @JsonProperty("content")
        public Builder withContent(final String value) {
            content = value;
            return this;
        }

        /**
         * Sets the chunkType field.
         *
         * @param value new chunk type, may be {@code null}
         * @return this builder
         */
        @JsonProperty("chunk_type")
        public Builder withChunkType(final @Nullable ChunkType value) {
            chunkType = value;
            return this;
        }

        /**
         * Sets the embedding field.
         *
         * @param value new embedding vector, may be {@code null}
         * @return this builder
         */
        @JsonProperty("embedding")
        public Builder withEmbedding(final @Nullable List<Float> value) {
            embedding = value;
            return this;
        }

        /**
         * Sets the metadata field.
         *
         * @param value new chunk metadata
         * @return this builder
         */
        @JsonProperty("metadata")
        public Builder withMetadata(final ChunkMetadata value) {
            metadata = value;
            return this;
        }

        /**
         * Builds the Chunk instance from the current builder state.
         *
         * @return a new record populated with the values set so far
         */
        public Chunk build() {
            return new Chunk(content, chunkType, embedding, metadata);
        }
    }
    // CPD-ON
}

View File

@@ -0,0 +1,177 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import java.util.List;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
import org.jspecify.annotations.Nullable;
/**
 * Metadata about a chunk's position in the original document.
 *
 * <p>Jackson deserializes this record through {@link Builder}. The optional components
 * ({@code tokenCount}, {@code firstPage}, {@code lastPage}, {@code headingContext} and
 * {@code imageIndices}) are {@code null} when they were never set.
 *
 * @param byteStart byte offset where this chunk starts in the original text (UTF-8 valid
 *     boundary)
 * @param byteEnd byte offset where this chunk ends in the original text (UTF-8 valid boundary)
 * @param tokenCount number of tokens in this chunk (if available); calculated by the embedding
 *     model's tokenizer if embeddings are enabled
 * @param chunkIndex zero-based index of this chunk in the document
 * @param totalChunks total number of chunks in the document
 * @param firstPage first page number this chunk spans (1-indexed); only populated when page
 *     tracking is enabled in extraction configuration
 * @param lastPage last page number this chunk spans (1-indexed, equal to {@code first_page} for
 *     single-page chunks); only populated when page tracking is enabled in extraction
 *     configuration
 * @param headingContext heading hierarchy this chunk falls under; only populated when
 *     {@code ChunkerType.Markdown} is used
 * @param imageIndices zero-based indices into the top-level {@code ExtractionResult.images}
 *     collection for every image whose {@code page_number} falls within
 *     {@code [first_page, last_page]}; empty when image extraction is disabled or the chunk
 *     spans no pages with images
 */
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = ChunkMetadata.Builder.class)
public record ChunkMetadata(
    @JsonProperty("byte_start") long byteStart,
    @JsonProperty("byte_end") long byteEnd,
    @Nullable @JsonProperty("token_count") Long tokenCount,
    @JsonProperty("chunk_index") long chunkIndex,
    @JsonProperty("total_chunks") long totalChunks,
    @Nullable @JsonProperty("first_page") Integer firstPage,
    @Nullable @JsonProperty("last_page") Integer lastPage,
    @Nullable @JsonProperty("heading_context") HeadingContext headingContext,
    @Nullable @JsonProperty("image_indices") List<Integer> imageIndices
) {
    /**
     * Creates a new {@link Builder} with all counters at {@code 0} and all optional values
     * {@code null}.
     *
     * @return a fresh builder
     */
    public static Builder builder() {
        return new Builder();
    }

    // CPD-OFF
    /**
     * Fluent builder for {@link ChunkMetadata}; also the Jackson deserialization entry point
     * (setter prefix {@code with}, terminal method {@link #build()}).
     *
     * <p>The optional numeric fields ({@code token_count}, {@code first_page},
     * {@code last_page}) each have two overloads. The boxed overload carries the
     * {@code @JsonProperty} annotation so that an explicit JSON {@code null} is kept as
     * {@code null}; a primitive parameter would make Jackson (with its default settings)
     * coerce {@code null} to {@code 0}. The primitive overload is deliberately left
     * unannotated and exists only so existing callers passing primitives keep compiling and
     * linking.
     */
    @JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
    public static final class Builder {
        @JsonProperty("byte_start")
        private long byteStart = 0;
        @JsonProperty("byte_end")
        private long byteEnd = 0;
        @JsonProperty("token_count")
        private Long tokenCount = null;
        @JsonProperty("chunk_index")
        private long chunkIndex = 0;
        @JsonProperty("total_chunks")
        private long totalChunks = 0;
        @JsonProperty("first_page")
        private Integer firstPage = null;
        @JsonProperty("last_page")
        private Integer lastPage = null;
        @JsonProperty("heading_context")
        @Nullable private HeadingContext headingContext = null;
        @JsonProperty("image_indices")
        private List<Integer> imageIndices = null;

        /**
         * Sets the byteStart field.
         *
         * @param value byte offset where the chunk starts
         * @return this builder, for chaining
         */
        @JsonProperty("byte_start")
        public Builder withByteStart(final long value) {
            this.byteStart = value;
            return this;
        }

        /**
         * Sets the byteEnd field.
         *
         * @param value byte offset where the chunk ends
         * @return this builder, for chaining
         */
        @JsonProperty("byte_end")
        public Builder withByteEnd(final long value) {
            this.byteEnd = value;
            return this;
        }

        /**
         * Sets the tokenCount field; {@code null} marks the count as unavailable.
         *
         * @param value token count, or {@code null}
         * @return this builder, for chaining
         */
        @JsonProperty("token_count")
        public Builder withTokenCount(final @Nullable Long value) {
            this.tokenCount = value;
            return this;
        }

        /**
         * Sets the tokenCount field from a primitive value.
         *
         * <p>Not annotated with {@code @JsonProperty} on purpose: the boxed overload is the one
         * Jackson should bind.
         *
         * @param value token count
         * @return this builder, for chaining
         */
        public Builder withTokenCount(final long value) {
            this.tokenCount = value;
            return this;
        }

        /**
         * Sets the chunkIndex field.
         *
         * @param value zero-based chunk index
         * @return this builder, for chaining
         */
        @JsonProperty("chunk_index")
        public Builder withChunkIndex(final long value) {
            this.chunkIndex = value;
            return this;
        }

        /**
         * Sets the totalChunks field.
         *
         * @param value total number of chunks
         * @return this builder, for chaining
         */
        @JsonProperty("total_chunks")
        public Builder withTotalChunks(final long value) {
            this.totalChunks = value;
            return this;
        }

        /**
         * Sets the firstPage field; {@code null} means no page information.
         *
         * @param value 1-indexed first page, or {@code null}
         * @return this builder, for chaining
         */
        @JsonProperty("first_page")
        public Builder withFirstPage(final @Nullable Integer value) {
            this.firstPage = value;
            return this;
        }

        /**
         * Sets the firstPage field from a primitive value.
         *
         * <p>Not annotated with {@code @JsonProperty} on purpose: the boxed overload is the one
         * Jackson should bind.
         *
         * @param value 1-indexed first page
         * @return this builder, for chaining
         */
        public Builder withFirstPage(final int value) {
            this.firstPage = value;
            return this;
        }

        /**
         * Sets the lastPage field; {@code null} means no page information.
         *
         * @param value 1-indexed last page, or {@code null}
         * @return this builder, for chaining
         */
        @JsonProperty("last_page")
        public Builder withLastPage(final @Nullable Integer value) {
            this.lastPage = value;
            return this;
        }

        /**
         * Sets the lastPage field from a primitive value.
         *
         * <p>Not annotated with {@code @JsonProperty} on purpose: the boxed overload is the one
         * Jackson should bind.
         *
         * @param value 1-indexed last page
         * @return this builder, for chaining
         */
        public Builder withLastPage(final int value) {
            this.lastPage = value;
            return this;
        }

        /**
         * Sets the headingContext field.
         *
         * @param value heading hierarchy, or {@code null}
         * @return this builder, for chaining
         */
        @JsonProperty("heading_context")
        public Builder withHeadingContext(final @Nullable HeadingContext value) {
            this.headingContext = value;
            return this;
        }

        /**
         * Sets the imageIndices field. The list is stored as given (no copy is made).
         *
         * @param value image indices, or {@code null}
         * @return this builder, for chaining
         */
        @JsonProperty("image_indices")
        public Builder withImageIndices(final @Nullable List<Integer> value) {
            this.imageIndices = value;
            return this;
        }

        /**
         * Builds the ChunkMetadata instance from the values collected so far.
         *
         * @return a new {@link ChunkMetadata}
         */
        public ChunkMetadata build() {
            return new ChunkMetadata(
                byteStart,
                byteEnd,
                tokenCount,
                chunkIndex,
                totalChunks,
                firstPage,
                lastPage,
                headingContext,
                imageIndices
            );
        }
    }
    // CPD-ON
}

View File

@@ -0,0 +1,41 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonSubTypes;
import com.fasterxml.jackson.annotation.JsonTypeInfo;
import java.util.Optional;
/**
 * Unit in which chunk size is measured.
 *
 * <p>The default is {@code Characters} (Unicode character count). With token-based sizing,
 * chunks are sized by token count according to the specified tokenizer.
 *
 * <p>Token-based sizing uses HuggingFace tokenizers loaded at runtime. Any tokenizer available
 * on HuggingFace Hub can be used, including OpenAI-compatible tokenizers (e.g.,
 * {@code Xenova/gpt-4o}, {@code Xenova/cl100k_base}).
 *
 * <p>In JSON the variant is selected by the {@code type} property ({@code "characters"} or
 * {@code "tokenizer"}); unknown properties are ignored.
 */
@JsonTypeInfo(use = JsonTypeInfo.Id.NAME, property = "type", visible = false)
@JsonSubTypes({
    @JsonSubTypes.Type(value = ChunkSizing.Characters.class, name = "characters"),
    @JsonSubTypes.Type(value = ChunkSizing.Tokenizer.class, name = "tokenizer")
})
@com.fasterxml.jackson.annotation.JsonIgnoreProperties(ignoreUnknown = true)
public sealed interface ChunkSizing {

    /** Size measured in Unicode characters (default). */
    record Characters() implements ChunkSizing {
    }

    /**
     * Size measured in tokens from a HuggingFace tokenizer.
     *
     * @param model value of the {@code model} JSON property; presumably the tokenizer's
     *     HuggingFace Hub identifier such as {@code Xenova/gpt-4o} (confirm against the
     *     native side)
     * @param cacheDir value of the {@code cache_dir} JSON property; presumably the directory
     *     used to cache downloaded tokenizer files (confirm against the native side)
     */
    record Tokenizer(
        @JsonProperty("model") String model,
        @JsonProperty("cache_dir") Optional<java.nio.file.Path> cacheDir
    ) implements ChunkSizing {
    }
}

View File

@@ -0,0 +1,75 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonValue;
/**
 * Semantic structural classification of a text chunk.
 *
 * <p>Assigned by the heuristic classifier in {@code chunking.classifier}; {@code Unknown} is
 * used when no rule matches. Designed to be extended in future versions without breaking
 * changes.
 *
 * <p>Each constant carries the lowercase wire-format string used for JSON, which is also what
 * {@link #toString()} returns.
 */
public enum ChunkType {
    /** Section heading or document title. */
    Heading("heading"),

    /** Party list: names, addresses, and signatories. */
    PartyList("party_list"),

    /** Definition clause ("X means…", "X shall mean…"). */
    Definitions("definitions"),

    /** Operative clause containing legal/contractual action verbs. */
    OperativeClause("operative_clause"),

    /** Signature block with signatures, names, and dates. */
    SignatureBlock("signature_block"),

    /** Schedule, annex, appendix, or exhibit section. */
    Schedule("schedule"),

    /** Table-like content with aligned columns or repeated patterns. */
    TableLike("table_like"),

    /** Mathematical formula or equation. */
    Formula("formula"),

    /** Code block or preformatted content. */
    CodeBlock("code_block"),

    /** Embedded or referenced image content. */
    Image("image"),

    /** Organizational chart or hierarchy diagram. */
    OrgChart("org_chart"),

    /** Diagram, figure, or visual illustration. */
    Diagram("diagram"),

    /** Unclassified or mixed content. */
    Unknown("unknown");

    /** Wire-format string for this constant. */
    private final String value;

    ChunkType(final String value) {
        this.value = value;
    }

    /**
     * Returns the wire-format string; Jackson serializes the constant as this value.
     *
     * @return the wire-format string
     */
    @JsonValue
    public String getValue() {
        return value;
    }

    /**
     * Looks up the constant whose wire-format string equals {@code value}, ignoring case.
     *
     * @param value the string to resolve
     * @return the matching constant
     * @throws IllegalArgumentException if no constant matches (this includes {@code null})
     */
    @JsonCreator
    public static ChunkType fromValue(final String value) {
        ChunkType match = null;
        for (final ChunkType candidate : values()) {
            if (candidate.value.equalsIgnoreCase(value)) {
                match = candidate;
                break;
            }
        }
        if (match == null) {
            throw new IllegalArgumentException("Unknown value: " + value);
        }
        return match;
    }

    /**
     * Returns the wire-format string value (matches JSON serialization).
     *
     * @return the same string as {@link #getValue()}
     */
    @Override
    public String toString() {
        return value;
    }
}

View File

@@ -0,0 +1,62 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonValue;
/**
 * Type of text chunker to use.
 *
 * <p>Each constant carries the lowercase wire-format string used for JSON, which is also what
 * {@link #toString()} returns.
 */
public enum ChunkerType {
    /** Generic text splitter; splits on whitespace and punctuation. */
    Text("text"),

    /** Markdown-aware splitter; preserves formatting and structure. */
    Markdown("markdown"),

    /** YAML-aware splitter; creates one chunk per top-level key. */
    Yaml("yaml"),

    /**
     * Topic-aware chunker.
     *
     * <p>With an {@code EmbeddingConfig}, splits at embedding-based topic shifts tuned by
     * {@code topic_threshold} (default 0.75, lower = more splits). Without an embedding, falls
     * back to a structural-boundary heuristic (ALL-CAPS headers, numbered sections, blank-line
     * paragraphs) and merges groups into chunks capped at {@code max_characters} (default
     * 1000). {@code topic_threshold} has no effect in the fallback path. For best results,
     * pair with an embedding model.
     */
    Semantic("semantic");

    /** Wire-format string for this constant. */
    private final String value;

    ChunkerType(final String value) {
        this.value = value;
    }

    /**
     * Returns the wire-format string; Jackson serializes the constant as this value.
     *
     * @return the wire-format string
     */
    @JsonValue
    public String getValue() {
        return value;
    }

    /**
     * Looks up the constant whose wire-format string equals {@code value}, ignoring case.
     *
     * @param value the string to resolve
     * @return the matching constant
     * @throws IllegalArgumentException if no constant matches (this includes {@code null})
     */
    @JsonCreator
    public static ChunkerType fromValue(final String value) {
        ChunkerType match = null;
        for (final ChunkerType candidate : values()) {
            if (candidate.value.equalsIgnoreCase(value)) {
                match = candidate;
                break;
            }
        }
        if (match == null) {
            throw new IllegalArgumentException("Unknown value: " + value);
        }
        return match;
    }

    /**
     * Returns the wire-format string value (matches JSON serialization).
     *
     * @return the same string as {@link #getValue()}
     */
    @Override
    public String toString() {
        return value;
    }
}

View File

@@ -0,0 +1,203 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
import org.jspecify.annotations.Nullable;
/**
 * Chunking configuration.
 *
 * <p>Configures text chunking for document content, including chunk size, overlap, trimming
 * behavior, and optional embeddings.
 *
 * <p>Prefer the builder over the canonical constructor so that call sites keep compiling when
 * fields are added in the future:
 *
 * <pre>{@code
 * ChunkingConfig config = ChunkingConfig.builder()
 *     .withMaxCharacters(500L)
 *     .build();
 * }</pre>
 *
 * <p>Defaults applied on the Java side: the canonical constructor replaces a {@code null}
 * {@code maxCharacters} with {@code 1000} and a {@code null} {@code overlap} with {@code 200};
 * the builder additionally starts with {@code ChunkerType.Text} and
 * {@code ChunkSizing.Characters}. All other components are passed through as given, so the
 * defaults quoted below for them are not applied by this class.
 *
 * @param maxCharacters maximum size per chunk, in units determined by {@code sizing} (JSON
 *     property {@code max_chars}). When {@code sizing} is {@code Characters} (default) this is
 *     the max character count; with token-based sizing it is the max token count. Default: 1000
 * @param overlap overlap between chunks, in units determined by {@code sizing} (JSON property
 *     {@code max_overlap}). Default: 200
 * @param trim whether to trim whitespace from chunk boundaries. Default: true
 * @param chunkerType type of chunker to use (see {@code ChunkerType}: Text, Markdown, Yaml or
 *     Semantic). Default: Text
 * @param embedding optional embedding configuration for chunk embeddings
 * @param preset use a preset configuration (overrides individual settings if provided)
 * @param sizing how to measure chunk size. Default: {@code Characters} (Unicode character
 *     count). Enable {@code chunking-tiktoken} or {@code chunking-tokenizers} features for
 *     token-based sizing
 * @param prependHeadingContext when {@code true} and {@code chunker_type} is {@code Markdown},
 *     prepend the heading hierarchy path (e.g. {@code "# Title > ## Section\n\n"}) to each
 *     chunk's content string. Useful for RAG pipelines where each chunk needs self-contained
 *     context about its position in the document structure. Default: {@code false}
 * @param topicThreshold optional cosine similarity threshold for semantic topic boundary
 *     detection. Only used when {@code chunker_type} is {@code Semantic} and an
 *     {@code EmbeddingConfig} is provided. You almost never need to set this; when omitted it
 *     defaults to {@code 0.75}, which works well for most documents. Lower values detect more
 *     topic boundaries (more, smaller chunks); higher values detect fewer. Range:
 *     {@code 0.0..=1.0}
 */
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = ChunkingConfig.Builder.class)
public record ChunkingConfig(
    @Nullable @JsonProperty("max_chars") Long maxCharacters,
    @Nullable @JsonProperty("max_overlap") Long overlap,
    @Nullable @JsonProperty("trim") Boolean trim,
    @Nullable @JsonProperty("chunker_type") ChunkerType chunkerType,
    @Nullable @JsonProperty("embedding") EmbeddingConfig embedding,
    @Nullable @JsonProperty("preset") String preset,
    @Nullable @JsonProperty("sizing") ChunkSizing sizing,
    @Nullable @JsonProperty("prepend_heading_context") Boolean prependHeadingContext,
    @Nullable @JsonProperty("topic_threshold") Float topicThreshold
) {
    /**
     * Creates a new {@link Builder}.
     *
     * @return a fresh builder
     */
    public static Builder builder() {
        return new Builder();
    }

    /** Substitutes the size defaults (1000 / 200) when the corresponding argument is null. */
    public ChunkingConfig {
        if (maxCharacters == null) {
            maxCharacters = 1000L;
        }
        if (overlap == null) {
            overlap = 200L;
        }
    }

    // CPD-OFF
    /**
     * Fluent builder for {@link ChunkingConfig}; also the Jackson deserialization entry point
     * (setter prefix {@code with}, terminal method {@link #build()}).
     *
     * <p>Starts with {@code ChunkerType.Text} and {@code ChunkSizing.Characters}; every other
     * field starts as {@code null}.
     */
    @JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
    public static final class Builder {
        @JsonProperty("max_chars")
        private Long maxCharacters;
        @JsonProperty("max_overlap")
        private Long overlap;
        private Boolean trim;
        @JsonProperty("chunker_type")
        @Nullable private ChunkerType chunkerType = ChunkerType.Text;
        private EmbeddingConfig embedding;
        private String preset;
        @Nullable private ChunkSizing sizing = new ChunkSizing.Characters();
        @JsonProperty("prepend_heading_context")
        private Boolean prependHeadingContext;
        @JsonProperty("topic_threshold")
        private Float topicThreshold;

        /**
         * Sets the maxCharacters field.
         *
         * @param value maximum chunk size, or {@code null} to fall back to the default
         * @return this builder, for chaining
         */
        @JsonProperty("max_chars")
        public Builder withMaxCharacters(final @Nullable Long value) {
            maxCharacters = value;
            return this;
        }

        /**
         * Sets the overlap field.
         *
         * @param value overlap between chunks, or {@code null} to fall back to the default
         * @return this builder, for chaining
         */
        @JsonProperty("max_overlap")
        public Builder withOverlap(final @Nullable Long value) {
            overlap = value;
            return this;
        }

        /**
         * Sets the trim field.
         *
         * @param value whether to trim chunk boundaries, or {@code null}
         * @return this builder, for chaining
         */
        @JsonProperty("trim")
        public Builder withTrim(final @Nullable Boolean value) {
            trim = value;
            return this;
        }

        /**
         * Sets the chunkerType field.
         *
         * @param value chunker type, or {@code null}
         * @return this builder, for chaining
         */
        @JsonProperty("chunker_type")
        public Builder withChunkerType(final @Nullable ChunkerType value) {
            chunkerType = value;
            return this;
        }

        /**
         * Sets the embedding field.
         *
         * @param value embedding configuration, or {@code null}
         * @return this builder, for chaining
         */
        @JsonProperty("embedding")
        public Builder withEmbedding(final @Nullable EmbeddingConfig value) {
            embedding = value;
            return this;
        }

        /**
         * Sets the preset field.
         *
         * @param value preset name, or {@code null}
         * @return this builder, for chaining
         */
        @JsonProperty("preset")
        public Builder withPreset(final @Nullable String value) {
            preset = value;
            return this;
        }

        /**
         * Sets the sizing field.
         *
         * @param value size measurement, or {@code null}
         * @return this builder, for chaining
         */
        @JsonProperty("sizing")
        public Builder withSizing(final @Nullable ChunkSizing value) {
            sizing = value;
            return this;
        }

        /**
         * Sets the prependHeadingContext field.
         *
         * @param value whether to prepend the heading path, or {@code null}
         * @return this builder, for chaining
         */
        @JsonProperty("prepend_heading_context")
        public Builder withPrependHeadingContext(final @Nullable Boolean value) {
            prependHeadingContext = value;
            return this;
        }

        /**
         * Sets the topicThreshold field.
         *
         * @param value cosine similarity threshold, or {@code null}
         * @return this builder, for chaining
         */
        @JsonProperty("topic_threshold")
        public Builder withTopicThreshold(final @Nullable Float value) {
            topicThreshold = value;
            return this;
        }

        /**
         * Builds the ChunkingConfig instance from the values collected so far.
         *
         * @return a new {@link ChunkingConfig}
         */
        public ChunkingConfig build() {
            return new ChunkingConfig(
                maxCharacters,
                overlap,
                trim,
                chunkerType,
                embedding,
                preset,
                sizing,
                prependHeadingContext,
                topicThreshold
            );
        }
    }
    // CPD-ON

    /**
     * Placeholder for a native-backed default instance; currently always throws.
     *
     * @return never returns normally
     * @throws UnsupportedOperationException always; use {@link #builder()} instead
     */
    public static ChunkingConfig defaultInstance() {
        throw new UnsupportedOperationException("defaultInstance is not yet bridged via JNI; use the Builder instead.");
    }
}

View File

@@ -0,0 +1,100 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import java.util.List;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
import org.jspecify.annotations.Nullable;
/**
 * Citation file metadata (RIS, PubMed, EndNote).
 *
 * <p>Jackson deserializes this record through {@link Builder}. Every component except
 * {@code citationCount} is nullable.
 *
 * @param citationCount value of the {@code citation_count} JSON property
 * @param format value of the {@code format} JSON property
 * @param authors value of the {@code authors} JSON property
 * @param yearRange value of the {@code year_range} JSON property
 * @param dois value of the {@code dois} JSON property
 * @param keywords value of the {@code keywords} JSON property
 */
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = CitationMetadata.Builder.class)
public record CitationMetadata(
    @JsonProperty("citation_count") long citationCount,
    @Nullable @JsonProperty("format") String format,
    @Nullable @JsonProperty("authors") List<String> authors,
    @Nullable @JsonProperty("year_range") YearRange yearRange,
    @Nullable @JsonProperty("dois") List<String> dois,
    @Nullable @JsonProperty("keywords") List<String> keywords
) {
    /**
     * Creates a new {@link Builder}.
     *
     * @return a fresh builder
     */
    public static Builder builder() {
        return new Builder();
    }

    // CPD-OFF
    /**
     * Fluent builder for {@link CitationMetadata}; also the Jackson deserialization entry
     * point (setter prefix {@code with}, terminal method {@link #build()}).
     *
     * <p>Starts with a citation count of {@code 0} and {@code null} for everything else.
     * Lists are stored as given (no copies are made).
     */
    @JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
    public static final class Builder {
        @JsonProperty("citation_count")
        private long citationCount;
        private String format;
        private List<String> authors;
        @JsonProperty("year_range")
        private YearRange yearRange;
        private List<String> dois;
        private List<String> keywords;

        /**
         * Sets the citationCount field.
         *
         * @param value the citation count
         * @return this builder, for chaining
         */
        @JsonProperty("citation_count")
        public Builder withCitationCount(final long value) {
            citationCount = value;
            return this;
        }

        /**
         * Sets the format field.
         *
         * @param value the format, or {@code null}
         * @return this builder, for chaining
         */
        @JsonProperty("format")
        public Builder withFormat(final @Nullable String value) {
            format = value;
            return this;
        }

        /**
         * Sets the authors field.
         *
         * @param value the authors, or {@code null}
         * @return this builder, for chaining
         */
        @JsonProperty("authors")
        public Builder withAuthors(final @Nullable List<String> value) {
            authors = value;
            return this;
        }

        /**
         * Sets the yearRange field.
         *
         * @param value the year range, or {@code null}
         * @return this builder, for chaining
         */
        @JsonProperty("year_range")
        public Builder withYearRange(final @Nullable YearRange value) {
            yearRange = value;
            return this;
        }

        /**
         * Sets the dois field.
         *
         * @param value the DOIs, or {@code null}
         * @return this builder, for chaining
         */
        @JsonProperty("dois")
        public Builder withDois(final @Nullable List<String> value) {
            dois = value;
            return this;
        }

        /**
         * Sets the keywords field.
         *
         * @param value the keywords, or {@code null}
         * @return this builder, for chaining
         */
        @JsonProperty("keywords")
        public Builder withKeywords(final @Nullable List<String> value) {
            keywords = value;
            return this;
        }

        /**
         * Builds the CitationMetadata instance from the values collected so far.
         *
         * @return a new {@link CitationMetadata}
         */
        public CitationMetadata build() {
            return new CitationMetadata(citationCount, format, authors, yearRange, dois, keywords);
        }
    }
    // CPD-ON
}

View File

@@ -0,0 +1,54 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonValue;
/**
 * Content rendering mode for code extraction.
 *
 * <p>Controls how extracted code content is represented in the {@code content} field of
 * {@code ExtractionResult}.
 *
 * <p>Each constant carries the lowercase wire-format string used for JSON, which is also what
 * {@link #toString()} returns.
 */
public enum CodeContentMode {
    /** Use TSLP semantic chunks as content (default). */
    Chunks("chunks"),

    /** Use raw source code as content. */
    Raw("raw"),

    /** Emit function/class headings + docstrings (no code bodies). */
    Structure("structure");

    /** Wire-format string for this constant. */
    private final String value;

    CodeContentMode(final String value) {
        this.value = value;
    }

    /**
     * Returns the wire-format string; Jackson serializes the constant as this value.
     *
     * @return the wire-format string
     */
    @JsonValue
    public String getValue() {
        return value;
    }

    /**
     * Looks up the constant whose wire-format string equals {@code value}, ignoring case.
     *
     * @param value the string to resolve
     * @return the matching constant
     * @throws IllegalArgumentException if no constant matches (this includes {@code null})
     */
    @JsonCreator
    public static CodeContentMode fromValue(final String value) {
        CodeContentMode match = null;
        for (final CodeContentMode candidate : values()) {
            if (candidate.value.equalsIgnoreCase(value)) {
                match = candidate;
                break;
            }
        }
        if (match == null) {
            throw new IllegalArgumentException("Unknown value: " + value);
        }
        return match;
    }

    /**
     * Returns the wire-format string value (matches JSON serialization).
     *
     * @return the same string as {@link #getValue()}
     */
    @Override
    public String toString() {
        return value;
    }
}

View File

@@ -0,0 +1,138 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
import org.jspecify.annotations.Nullable;
/**
 * Cross-extractor content filtering configuration.
 *
 * <p>Controls whether "furniture" content (headers, footers, page numbers, watermarks,
 * repeating text) is included in or stripped from extraction results. Applies across all
 * extractors (PDF, DOCX, RTF, ODT, HTML, etc.) with format-specific implementation.
 *
 * <p>When no filter configuration is set on {@code ExtractionConfig}, each extractor uses its
 * current default behavior unchanged.
 *
 * <p>Every component is nullable. The defaults quoted below are not applied by this record:
 * it has no compact constructor and the builder starts every field as {@code null}.
 *
 * @param includeHeaders include running headers in extraction output. Default: {@code false}
 *     (headers are stripped or excluded).
 *     <ul>
 *       <li>PDF: disables top-margin furniture stripping and prevents the layout model from
 *           treating {@code PageHeader}-classified regions as furniture.</li>
 *       <li>DOCX: includes document headers in text output.</li>
 *       <li>RTF/ODT: headers already included; this is a no-op when true.</li>
 *       <li>HTML/EPUB: keeps {@code <header>} element content.</li>
 *     </ul>
 * @param includeFooters include running footers in extraction output. Default: {@code false}
 *     (footers are stripped or excluded).
 *     <ul>
 *       <li>PDF: disables bottom-margin furniture stripping and prevents the layout model from
 *           treating {@code PageFooter}-classified regions as furniture.</li>
 *       <li>DOCX: includes document footers in text output.</li>
 *       <li>RTF/ODT: footers already included; this is a no-op when true.</li>
 *       <li>HTML/EPUB: keeps {@code <footer>} element content.</li>
 *     </ul>
 * @param stripRepeatingText enable the heuristic cross-page repeating text detector. When
 *     {@code true} (default), text that repeats verbatim across a supermajority of pages is
 *     classified as furniture and stripped. Disable this if brand names or repeated headings
 *     are being incorrectly removed by the heuristic. Note: when a layout-detection model is
 *     active, the model may independently classify page-header / page-footer regions as
 *     furniture on a per-page basis; to preserve those regions, set
 *     {@code include_headers = true}, {@code include_footers = true}, or both, in addition to
 *     disabling this flag. Primarily affects PDF extraction. Default: {@code true}
 * @param includeWatermarks include watermark text in extraction output. Default: {@code false}
 *     (watermarks are stripped).
 *     <ul>
 *       <li>PDF: keeps watermark artifacts and arXiv identifiers.</li>
 *       <li>Other formats: no effect currently.</li>
 *     </ul>
 */
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = ContentFilterConfig.Builder.class)
public record ContentFilterConfig(
    @Nullable @JsonProperty("include_headers") Boolean includeHeaders,
    @Nullable @JsonProperty("include_footers") Boolean includeFooters,
    @Nullable @JsonProperty("strip_repeating_text") Boolean stripRepeatingText,
    @Nullable @JsonProperty("include_watermarks") Boolean includeWatermarks
) {
    /**
     * Creates a new {@link Builder}.
     *
     * @return a fresh builder
     */
    public static Builder builder() {
        return new Builder();
    }

    // CPD-OFF
    /**
     * Fluent builder for {@link ContentFilterConfig}; also the Jackson deserialization entry
     * point (setter prefix {@code with}, terminal method {@link #build()}). All fields start
     * as {@code null}.
     */
    @JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
    public static final class Builder {
        @JsonProperty("include_headers")
        private Boolean includeHeaders;
        @JsonProperty("include_footers")
        private Boolean includeFooters;
        @JsonProperty("strip_repeating_text")
        private Boolean stripRepeatingText;
        @JsonProperty("include_watermarks")
        private Boolean includeWatermarks;

        /**
         * Sets the includeHeaders field.
         *
         * @param value whether to include running headers, or {@code null}
         * @return this builder, for chaining
         */
        @JsonProperty("include_headers")
        public Builder withIncludeHeaders(final @Nullable Boolean value) {
            includeHeaders = value;
            return this;
        }

        /**
         * Sets the includeFooters field.
         *
         * @param value whether to include running footers, or {@code null}
         * @return this builder, for chaining
         */
        @JsonProperty("include_footers")
        public Builder withIncludeFooters(final @Nullable Boolean value) {
            includeFooters = value;
            return this;
        }

        /**
         * Sets the stripRepeatingText field.
         *
         * @param value whether to strip cross-page repeating text, or {@code null}
         * @return this builder, for chaining
         */
        @JsonProperty("strip_repeating_text")
        public Builder withStripRepeatingText(final @Nullable Boolean value) {
            stripRepeatingText = value;
            return this;
        }

        /**
         * Sets the includeWatermarks field.
         *
         * @param value whether to include watermark text, or {@code null}
         * @return this builder, for chaining
         */
        @JsonProperty("include_watermarks")
        public Builder withIncludeWatermarks(final @Nullable Boolean value) {
            includeWatermarks = value;
            return this;
        }

        /**
         * Builds the ContentFilterConfig instance from the values collected so far.
         *
         * @return a new {@link ContentFilterConfig}
         */
        public ContentFilterConfig build() {
            return new ContentFilterConfig(
                includeHeaders, includeFooters, stripRepeatingText, includeWatermarks);
        }
    }
    // CPD-ON

    /**
     * Placeholder for a native-backed default instance; currently always throws.
     *
     * @return never returns normally
     * @throws UnsupportedOperationException always; use {@link #builder()} instead
     */
    public static ContentFilterConfig defaultInstance() {
        throw new UnsupportedOperationException("defaultInstance is not yet bridged via JNI; use the Builder instead.");
    }
}

View File

@@ -0,0 +1,55 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonValue;
/**
 * Content layer classification for document nodes.
 *
 * <p>Replaces separate body/furniture arrays with per-node granularity.
 *
 * <p>Each constant carries the lowercase wire-format string used for JSON, which is also what
 * {@link #toString()} returns.
 */
public enum ContentLayer {
    /** Main document body content. */
    Body("body"),

    /** Page/section header (running header). */
    Header("header"),

    /** Page/section footer (running footer). */
    Footer("footer"),

    /** Footnote content. */
    Footnote("footnote");

    /** Wire-format string for this constant. */
    private final String value;

    ContentLayer(final String value) {
        this.value = value;
    }

    /**
     * Returns the wire-format string; Jackson serializes the constant as this value.
     *
     * @return the wire-format string
     */
    @JsonValue
    public String getValue() {
        return value;
    }

    /**
     * Looks up the constant whose wire-format string equals {@code value}, ignoring case.
     *
     * @param value the string to resolve
     * @return the matching constant
     * @throws IllegalArgumentException if no constant matches (this includes {@code null})
     */
    @JsonCreator
    public static ContentLayer fromValue(final String value) {
        ContentLayer match = null;
        for (final ContentLayer candidate : values()) {
            if (candidate.value.equalsIgnoreCase(value)) {
                match = candidate;
                break;
            }
        }
        if (match == null) {
            throw new IllegalArgumentException("Unknown value: " + value);
        }
        return match;
    }

    /**
     * Returns the wire-format string value (matches JSON serialization).
     *
     * @return the same string as {@link #getValue()}
     */
    @Override
    public String toString() {
        return value;
    }
}

View File

@@ -0,0 +1,57 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
import org.jspecify.annotations.Nullable;
/**
 * JATS contributor with role.
 *
 * <p>Jackson deserializes this record through {@link Builder}.
 *
 * @param name value of the {@code name} JSON property
 * @param role value of the {@code role} JSON property; may be {@code null}
 */
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = ContributorRole.Builder.class)
public record ContributorRole(
    @JsonProperty("name") String name,
    @Nullable @JsonProperty("role") String role
) {
    /**
     * Creates a new {@link Builder}.
     *
     * @return a fresh builder
     */
    public static Builder builder() {
        return new Builder();
    }

    // CPD-OFF
    /**
     * Fluent builder for {@link ContributorRole}; also the Jackson deserialization entry point
     * (setter prefix {@code with}, terminal method {@link #build()}).
     *
     * <p>Starts with an empty {@code name} and a {@code null} {@code role}.
     */
    @JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
    public static final class Builder {
        private String name = "";
        private String role;

        /**
         * Sets the name field.
         *
         * @param value the contributor name
         * @return this builder, for chaining
         */
        @JsonProperty("name")
        public Builder withName(final String value) {
            name = value;
            return this;
        }

        /**
         * Sets the role field.
         *
         * @param value the contributor role, or {@code null}
         * @return this builder, for chaining
         */
        @JsonProperty("role")
        public Builder withRole(final @Nullable String value) {
            role = value;
            return this;
        }

        /**
         * Builds the ContributorRole instance from the values collected so far.
         *
         * @return a new {@link ContributorRole}
         */
        public ContributorRole build() {
            return new ContributorRole(name, role);
        }
    }
    // CPD-ON
}

View File

@@ -0,0 +1,20 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
/**
 * Exception thrown when type conversion fails.
 *
 * <p>Both constructors delegate directly to {@code KreuzbergRsException}.
 */
@SuppressWarnings("checkstyle:LineLength")
public class ConversionErrorException extends KreuzbergRsException {

    /**
     * Creates a new ConversionErrorException, passing the numeric code {@code 2} to the
     * superclass together with the message.
     *
     * @param message the detail message
     */
    public ConversionErrorException(final String message) {
        super(2, message);
    }

    /**
     * Creates a new ConversionErrorException with a cause.
     *
     * <p>NOTE(review): unlike the message-only constructor, this one does not pass the code
     * {@code 2}; confirm that {@code KreuzbergRsException(String, Throwable)} assigns the
     * intended code.
     *
     * @param message the detail message
     * @param cause the underlying cause
     */
    public ConversionErrorException(final String message, final Throwable cause) {
        super(message, cause);
    }
}

View File

@@ -0,0 +1,238 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
import org.jspecify.annotations.Nullable;
/**
* Dublin Core metadata from docProps/core.xml
*
* Contains standard metadata fields defined by the Dublin Core standard
* and Office-specific extensions.
*/
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = CoreProperties.Builder.class)
public record CoreProperties(
/**
* Document title
*/
@Nullable @JsonProperty("title") String title,
/**
* Document subject/topic
*/
@Nullable @JsonProperty("subject") String subject,
/**
* Document creator/author
*/
@Nullable @JsonProperty("creator") String creator,
/**
* Keywords or tags
*/
@Nullable @JsonProperty("keywords") String keywords,
/**
* Document description/abstract
*/
@Nullable @JsonProperty("description") String description,
/**
* User who last modified the document
*/
@Nullable @JsonProperty("last_modified_by") String lastModifiedBy,
/**
* Revision number
*/
@Nullable @JsonProperty("revision") String revision,
/**
* Creation timestamp (ISO 8601)
*/
@Nullable @JsonProperty("created") String created,
/**
* Last modification timestamp (ISO 8601)
*/
@Nullable @JsonProperty("modified") String modified,
/**
* Document category
*/
@Nullable @JsonProperty("category") String category,
/**
* Content status (Draft, Final, etc.)
*/
@Nullable @JsonProperty("content_status") String contentStatus,
/**
* Document language
*/
@Nullable @JsonProperty("language") String language,
/**
* Unique identifier
*/
@Nullable @JsonProperty("identifier") String identifier,
/**
* Document version
*/
@Nullable @JsonProperty("version") String version,
/**
* Last print timestamp (ISO 8601)
*/
@Nullable @JsonProperty("last_printed") String lastPrinted
) {
public static Builder builder() {
return new Builder();
}
// CPD-OFF
/**
 * Fluent builder for {@code CoreProperties}, annotated as a Jackson POJO builder
 * (setter prefix {@code with}, build method {@code build}).
 *
 * <p>Every field starts out as {@code null}; {@link #build()} hands the current
 * values to the record constructor unchanged.
 */
@JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
public static final class Builder {
    private String title;
    private String subject;
    private String creator;
    private String keywords;
    private String description;
    @JsonProperty("last_modified_by")
    private String lastModifiedBy;
    private String revision;
    private String created;
    private String modified;
    private String category;
    @JsonProperty("content_status")
    private String contentStatus;
    private String language;
    private String identifier;
    private String version;
    @JsonProperty("last_printed")
    private String lastPrinted;

    /** Assigns {@code title} (JSON {@code title}); {@code null} is allowed. */
    @JsonProperty("title")
    public Builder withTitle(final @Nullable String title) {
        this.title = title;
        return this;
    }

    /** Assigns {@code subject} (JSON {@code subject}); {@code null} is allowed. */
    @JsonProperty("subject")
    public Builder withSubject(final @Nullable String subject) {
        this.subject = subject;
        return this;
    }

    /** Assigns {@code creator} (JSON {@code creator}); {@code null} is allowed. */
    @JsonProperty("creator")
    public Builder withCreator(final @Nullable String creator) {
        this.creator = creator;
        return this;
    }

    /** Assigns {@code keywords} (JSON {@code keywords}); {@code null} is allowed. */
    @JsonProperty("keywords")
    public Builder withKeywords(final @Nullable String keywords) {
        this.keywords = keywords;
        return this;
    }

    /** Assigns {@code description} (JSON {@code description}); {@code null} is allowed. */
    @JsonProperty("description")
    public Builder withDescription(final @Nullable String description) {
        this.description = description;
        return this;
    }

    /** Assigns {@code lastModifiedBy} (JSON {@code last_modified_by}); {@code null} is allowed. */
    @JsonProperty("last_modified_by")
    public Builder withLastModifiedBy(final @Nullable String lastModifiedBy) {
        this.lastModifiedBy = lastModifiedBy;
        return this;
    }

    /** Assigns {@code revision} (JSON {@code revision}); {@code null} is allowed. */
    @JsonProperty("revision")
    public Builder withRevision(final @Nullable String revision) {
        this.revision = revision;
        return this;
    }

    /** Assigns {@code created} (JSON {@code created}); {@code null} is allowed. */
    @JsonProperty("created")
    public Builder withCreated(final @Nullable String created) {
        this.created = created;
        return this;
    }

    /** Assigns {@code modified} (JSON {@code modified}); {@code null} is allowed. */
    @JsonProperty("modified")
    public Builder withModified(final @Nullable String modified) {
        this.modified = modified;
        return this;
    }

    /** Assigns {@code category} (JSON {@code category}); {@code null} is allowed. */
    @JsonProperty("category")
    public Builder withCategory(final @Nullable String category) {
        this.category = category;
        return this;
    }

    /** Assigns {@code contentStatus} (JSON {@code content_status}); {@code null} is allowed. */
    @JsonProperty("content_status")
    public Builder withContentStatus(final @Nullable String contentStatus) {
        this.contentStatus = contentStatus;
        return this;
    }

    /** Assigns {@code language} (JSON {@code language}); {@code null} is allowed. */
    @JsonProperty("language")
    public Builder withLanguage(final @Nullable String language) {
        this.language = language;
        return this;
    }

    /** Assigns {@code identifier} (JSON {@code identifier}); {@code null} is allowed. */
    @JsonProperty("identifier")
    public Builder withIdentifier(final @Nullable String identifier) {
        this.identifier = identifier;
        return this;
    }

    /** Assigns {@code version} (JSON {@code version}); {@code null} is allowed. */
    @JsonProperty("version")
    public Builder withVersion(final @Nullable String version) {
        this.version = version;
        return this;
    }

    /** Assigns {@code lastPrinted} (JSON {@code last_printed}); {@code null} is allowed. */
    @JsonProperty("last_printed")
    public Builder withLastPrinted(final @Nullable String lastPrinted) {
        this.lastPrinted = lastPrinted;
        return this;
    }

    /**
     * Creates the record from the values collected so far.
     *
     * @return a new {@code CoreProperties}; fields never set are {@code null}
     */
    public CoreProperties build() {
        return new CoreProperties(
            title, subject, creator, keywords, description,
            lastModifiedBy, revision, created, modified, category,
            contentStatus, language, identifier, version, lastPrinted);
    }
}
// CPD-ON
}

View File

@@ -0,0 +1,92 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import java.util.List;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
import org.jspecify.annotations.Nullable;
/**
 * CSV/TSV file metadata.
 *
 * @param rowCount    value of the {@code row_count} JSON property
 * @param columnCount value of the {@code column_count} JSON property
 * @param delimiter   value of the {@code delimiter} JSON property, may be {@code null}
 * @param hasHeader   value of the {@code has_header} JSON property
 * @param columnTypes value of the {@code column_types} JSON property, may be {@code null}
 */
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = CsvMetadata.Builder.class)
public record CsvMetadata(
    @JsonProperty("row_count") int rowCount,
    @JsonProperty("column_count") int columnCount,
    @Nullable @JsonProperty("delimiter") String delimiter,
    @JsonProperty("has_header") boolean hasHeader,
    @Nullable @JsonProperty("column_types") List<String> columnTypes
) {
    /**
     * Creates a fresh {@link Builder}.
     *
     * @return a builder with zero counts, {@code hasHeader == false} and {@code null}
     *         delimiter and column types
     */
    public static Builder builder() {
        return new Builder();
    }

    // CPD-OFF
    /**
     * Fluent builder for {@link CsvMetadata}; also the Jackson deserialization entry point
     * declared on the record.
     */
    @JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
    public static final class Builder {
        @JsonProperty("row_count")
        private int rowCount;
        @JsonProperty("column_count")
        private int columnCount;
        private String delimiter;
        @JsonProperty("has_header")
        private boolean hasHeader;
        @JsonProperty("column_types")
        private List<String> columnTypes;

        /** Assigns {@code rowCount} (JSON {@code row_count}). */
        @JsonProperty("row_count")
        public Builder withRowCount(final int rowCount) {
            this.rowCount = rowCount;
            return this;
        }

        /** Assigns {@code columnCount} (JSON {@code column_count}). */
        @JsonProperty("column_count")
        public Builder withColumnCount(final int columnCount) {
            this.columnCount = columnCount;
            return this;
        }

        /** Assigns {@code delimiter} (JSON {@code delimiter}); {@code null} is allowed. */
        @JsonProperty("delimiter")
        public Builder withDelimiter(final @Nullable String delimiter) {
            this.delimiter = delimiter;
            return this;
        }

        /** Assigns {@code hasHeader} (JSON {@code has_header}). */
        @JsonProperty("has_header")
        public Builder withHasHeader(final boolean hasHeader) {
            this.hasHeader = hasHeader;
            return this;
        }

        /**
         * Assigns {@code columnTypes} (JSON {@code column_types}); {@code null} is allowed.
         * The list reference is stored as given, not copied.
         */
        @JsonProperty("column_types")
        public Builder withColumnTypes(final @Nullable List<String> columnTypes) {
            this.columnTypes = columnTypes;
            return this;
        }

        /**
         * Creates the record from the values collected so far.
         *
         * @return a new {@link CsvMetadata}
         */
        public CsvMetadata build() {
            return new CsvMetadata(rowCount, columnCount, delimiter, hasHeader, columnTypes);
        }
    }
    // CPD-ON
}

View File

@@ -0,0 +1,57 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
/**
 * dBASE field information.
 *
 * @param name      value of the {@code name} JSON property
 * @param fieldType value of the {@code field_type} JSON property
 */
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = DbfFieldInfo.Builder.class)
public record DbfFieldInfo(
    @JsonProperty("name") String name,
    @JsonProperty("field_type") String fieldType
) {
    /**
     * Creates a fresh {@link Builder}.
     *
     * @return a builder whose two fields start as empty strings
     */
    public static Builder builder() {
        return new Builder();
    }

    // CPD-OFF
    /**
     * Fluent builder for {@link DbfFieldInfo}; also the Jackson deserialization entry point
     * declared on the record.
     */
    @JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
    public static final class Builder {
        private String name = "";
        @JsonProperty("field_type")
        private String fieldType = "";

        /** Assigns {@code name} (JSON {@code name}); the argument is stored without validation. */
        @JsonProperty("name")
        public Builder withName(final String name) {
            this.name = name;
            return this;
        }

        /** Assigns {@code fieldType} (JSON {@code field_type}); stored without validation. */
        @JsonProperty("field_type")
        public Builder withFieldType(final String fieldType) {
            this.fieldType = fieldType;
            return this;
        }

        /**
         * Creates the record from the values collected so far.
         *
         * @return a new {@link DbfFieldInfo}
         */
        public DbfFieldInfo build() {
            return new DbfFieldInfo(name, fieldType);
        }
    }
    // CPD-ON
}

View File

@@ -0,0 +1,70 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import java.util.List;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
import org.jspecify.annotations.Nullable;
/**
 * dBASE (DBF) file metadata.
 *
 * @param recordCount value of the {@code record_count} JSON property
 * @param fieldCount  value of the {@code field_count} JSON property
 * @param fields      value of the {@code fields} JSON property, may be {@code null}
 */
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = DbfMetadata.Builder.class)
public record DbfMetadata(
    @JsonProperty("record_count") long recordCount,
    @JsonProperty("field_count") long fieldCount,
    @Nullable @JsonProperty("fields") List<DbfFieldInfo> fields
) {
    /**
     * Creates a fresh {@link Builder}.
     *
     * @return a builder with both counts at zero and {@code fields == null}
     */
    public static Builder builder() {
        return new Builder();
    }

    // CPD-OFF
    /**
     * Fluent builder for {@link DbfMetadata}; also the Jackson deserialization entry point
     * declared on the record.
     */
    @JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
    public static final class Builder {
        @JsonProperty("record_count")
        private long recordCount;
        @JsonProperty("field_count")
        private long fieldCount;
        private List<DbfFieldInfo> fields;

        /** Assigns {@code recordCount} (JSON {@code record_count}). */
        @JsonProperty("record_count")
        public Builder withRecordCount(final long recordCount) {
            this.recordCount = recordCount;
            return this;
        }

        /** Assigns {@code fieldCount} (JSON {@code field_count}). */
        @JsonProperty("field_count")
        public Builder withFieldCount(final long fieldCount) {
            this.fieldCount = fieldCount;
            return this;
        }

        /**
         * Assigns {@code fields} (JSON {@code fields}); {@code null} is allowed.
         * The list reference is stored as given, not copied.
         */
        @JsonProperty("fields")
        public Builder withFields(final @Nullable List<DbfFieldInfo> fields) {
            this.fields = fields;
            return this;
        }

        /**
         * Creates the record from the values collected so far.
         *
         * @return a new {@link DbfMetadata}
         */
        public DbfMetadata build() {
            return new DbfMetadata(recordCount, fieldCount, fields);
        }
    }
    // CPD-ON
}

View File

@@ -0,0 +1,64 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
import org.jspecify.annotations.Nullable;
/**
 * MIME type detection response.
 *
 * @param mimeType detected MIME type (JSON {@code mime_type})
 * @param filename original filename, if provided (JSON {@code filename}); may be {@code null}
 */
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = DetectResponse.Builder.class)
public record DetectResponse(
    @JsonProperty("mime_type") String mimeType,
    @Nullable @JsonProperty("filename") String filename
) {
    /**
     * Creates a fresh {@link Builder}.
     *
     * @return a builder with an empty MIME type and no filename
     */
    public static Builder builder() {
        return new Builder();
    }

    // CPD-OFF
    /**
     * Fluent builder for {@link DetectResponse}; also the Jackson deserialization entry point
     * declared on the record.
     */
    @JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
    public static final class Builder {
        @JsonProperty("mime_type")
        private String mimeType = "";
        private String filename;

        /** Assigns {@code mimeType} (JSON {@code mime_type}); stored without validation. */
        @JsonProperty("mime_type")
        public Builder withMimeType(final String mimeType) {
            this.mimeType = mimeType;
            return this;
        }

        /** Assigns {@code filename} (JSON {@code filename}); {@code null} is allowed. */
        @JsonProperty("filename")
        public Builder withFilename(final @Nullable String filename) {
            this.filename = filename;
            return this;
        }

        /**
         * Creates the record from the values collected so far.
         *
         * @return a new {@link DetectResponse}
         */
        public DetectResponse build() {
            return new DetectResponse(mimeType, filename);
        }
    }
    // CPD-ON
}

View File

@@ -0,0 +1,69 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import java.util.List;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
/**
 * Page-level detection result containing all detections and page metadata.
 *
 * @param pageWidth  value of the {@code page_width} JSON property
 * @param pageHeight value of the {@code page_height} JSON property
 * @param detections value of the {@code detections} JSON property
 */
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = DetectionResult.Builder.class)
public record DetectionResult(
    @JsonProperty("page_width") int pageWidth,
    @JsonProperty("page_height") int pageHeight,
    @JsonProperty("detections") List<LayoutDetection> detections
) {
    /**
     * Creates a fresh {@link Builder}.
     *
     * @return a builder with zero page dimensions and an empty, immutable detection list
     */
    public static Builder builder() {
        return new Builder();
    }

    // CPD-OFF
    /**
     * Fluent builder for {@link DetectionResult}; also the Jackson deserialization entry
     * point declared on the record.
     */
    @JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
    public static final class Builder {
        @JsonProperty("page_width")
        private int pageWidth;
        @JsonProperty("page_height")
        private int pageHeight;
        private List<LayoutDetection> detections = List.of();

        /** Assigns {@code pageWidth} (JSON {@code page_width}). */
        @JsonProperty("page_width")
        public Builder withPageWidth(final int pageWidth) {
            this.pageWidth = pageWidth;
            return this;
        }

        /** Assigns {@code pageHeight} (JSON {@code page_height}). */
        @JsonProperty("page_height")
        public Builder withPageHeight(final int pageHeight) {
            this.pageHeight = pageHeight;
            return this;
        }

        /**
         * Assigns {@code detections} (JSON {@code detections}).
         * The list reference is stored as given: no copy and no null check.
         */
        @JsonProperty("detections")
        public Builder withDetections(final List<LayoutDetection> detections) {
            this.detections = detections;
            return this;
        }

        /**
         * Creates the record from the values collected so far.
         *
         * @return a new {@link DetectionResult}
         */
        public DetectionResult build() {
            return new DetectionResult(pageWidth, pageHeight, detections);
        }
    }
    // CPD-ON
}

106
packages/java/dev/kreuzberg/DiffHunk.java generated Normal file
View File

@@ -0,0 +1,106 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import java.util.List;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
/**
 * A single contiguous hunk in a unified diff.
 *
 * @param fromLine  starting line number in the old content (0-indexed)
 * @param fromCount number of lines from the old content in this hunk
 * @param toLine    starting line number in the new content (0-indexed)
 * @param toCount   number of lines from the new content in this hunk
 * @param lines     lines that make up this hunk
 */
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = DiffHunk.Builder.class)
public record DiffHunk(
    @JsonProperty("from_line") long fromLine,
    @JsonProperty("from_count") long fromCount,
    @JsonProperty("to_line") long toLine,
    @JsonProperty("to_count") long toCount,
    @JsonProperty("lines") List<DiffLine> lines
) {
    /**
     * Creates a fresh {@link Builder}.
     *
     * @return a builder with all four numbers at zero and an empty, immutable line list
     */
    public static Builder builder() {
        return new Builder();
    }

    // CPD-OFF
    /**
     * Fluent builder for {@link DiffHunk}; also the Jackson deserialization entry point
     * declared on the record.
     */
    @JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
    public static final class Builder {
        @JsonProperty("from_line")
        private long fromLine;
        @JsonProperty("from_count")
        private long fromCount;
        @JsonProperty("to_line")
        private long toLine;
        @JsonProperty("to_count")
        private long toCount;
        private List<DiffLine> lines = List.of();

        /** Assigns {@code fromLine} (JSON {@code from_line}). */
        @JsonProperty("from_line")
        public Builder withFromLine(final long fromLine) {
            this.fromLine = fromLine;
            return this;
        }

        /** Assigns {@code fromCount} (JSON {@code from_count}). */
        @JsonProperty("from_count")
        public Builder withFromCount(final long fromCount) {
            this.fromCount = fromCount;
            return this;
        }

        /** Assigns {@code toLine} (JSON {@code to_line}). */
        @JsonProperty("to_line")
        public Builder withToLine(final long toLine) {
            this.toLine = toLine;
            return this;
        }

        /** Assigns {@code toCount} (JSON {@code to_count}). */
        @JsonProperty("to_count")
        public Builder withToCount(final long toCount) {
            this.toCount = toCount;
            return this;
        }

        /**
         * Assigns {@code lines} (JSON {@code lines}).
         * The list reference is stored as given: no copy and no null check.
         */
        @JsonProperty("lines")
        public Builder withLines(final List<DiffLine> lines) {
            this.lines = lines;
            return this;
        }

        /**
         * Creates the record from the values collected so far.
         *
         * @return a new {@link DiffHunk}
         */
        public DiffHunk build() {
            return new DiffHunk(fromLine, fromCount, toLine, toCount, lines);
        }
    }
    // CPD-ON
}

126
packages/java/dev/kreuzberg/DiffLine.java generated Normal file
View File

@@ -0,0 +1,126 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import com.fasterxml.jackson.databind.deser.std.StdDeserializer;
import com.fasterxml.jackson.databind.ser.std.StdSerializer;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.JsonGenerator;
import com.fasterxml.jackson.databind.DeserializationContext;
import com.fasterxml.jackson.databind.SerializerProvider;
import com.fasterxml.jackson.databind.node.ObjectNode;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonSerialize;
import org.jspecify.annotations.Nullable;
/**
 * A single line in a unified-diff hunk.
 *
 * <p>Defined here (rather than only in {@code crate.diff}) so {@code RevisionDelta} can
 * reference it unconditionally, without requiring the {@code diff} Cargo feature.
 * {@code crate.diff} re-exports this type verbatim.
 *
 * <p>The three variants are nested records, each wrapping one {@code String}. The
 * {@link #context()}, {@link #added()} and {@link #removed()} accessors return that
 * string for the matching variant and {@code null} for the other two.
 */
@com.fasterxml.jackson.annotation.JsonIgnoreProperties(ignoreUnknown = true)
@JsonDeserialize(using = DiffLineDeserializer.class)
@JsonSerialize(using = DiffLineSerializer.class)
public sealed interface DiffLine {

    /** Unchanged context line. */
    record Context(String value) implements DiffLine { }

    /** Line added in the "after" version. */
    record Added(String value) implements DiffLine { }

    /** Line removed from the "before" version. */
    record Removed(String value) implements DiffLine { }

    /**
     * Variant accessor for {@link Context}.
     *
     * @return the wrapped value when this is a {@code Context}, otherwise {@code null}
     */
    default @Nullable String context() {
        if (this instanceof Context line) {
            return line.value();
        }
        return null;
    }

    /**
     * Variant accessor for {@link Added}.
     *
     * @return the wrapped value when this is an {@code Added}, otherwise {@code null}
     */
    default @Nullable String added() {
        if (this instanceof Added line) {
            return line.value();
        }
        return null;
    }

    /**
     * Variant accessor for {@link Removed}.
     *
     * @return the wrapped value when this is a {@code Removed}, otherwise {@code null}
     */
    default @Nullable String removed() {
        if (this instanceof Removed line) {
            return line.value();
        }
        return null;
    }
}
// Custom deserializer for the DiffLine sealed interface.
//
// Expects a JSON object carrying a "kind" discriminator ("context", "added" or
// "removed"). Whatever remains after removing "kind" becomes the variant's String
// payload: when exactly one textual field is left, its text is used directly;
// for any other shape the remaining object's JSON text is used.
class DiffLineDeserializer extends StdDeserializer<DiffLine> {
    DiffLineDeserializer() {
        super(DiffLine.class);
    }

    /**
     * Reads one {@link DiffLine} from the parser.
     *
     * @param parser parser positioned at the value to read
     * @param ctx    deserialization context (unused)
     * @return the variant selected by the {@code kind} field
     * @throws java.io.IOException if the value is not a JSON object, the {@code kind}
     *         field is missing or null, or its value names no known variant
     */
    @Override
    public DiffLine deserialize(JsonParser parser, DeserializationContext ctx)
            throws java.io.IOException {
        com.fasterxml.jackson.databind.JsonNode root = parser.getCodec().readTree(parser);
        // Reject non-object input with a mapping error instead of letting a blind
        // narrowing to ObjectNode fail with ClassCastException.
        if (!(root instanceof ObjectNode node)) {
            throw new com.fasterxml.jackson.databind.JsonMappingException(
                parser, "Expected a JSON object for DiffLine");
        }
        com.fasterxml.jackson.databind.JsonNode tagNode = node.get("kind");
        if (tagNode == null || tagNode.isNull()) {
            throw new com.fasterxml.jackson.databind.JsonMappingException(
                parser, "Missing discriminator field: kind");
        }
        String tagValue = tagNode.asText();
        node.remove("kind");
        String payload = payloadText(node);
        return switch (tagValue) {
            case "context" -> new DiffLine.Context(payload);
            case "added" -> new DiffLine.Added(payload);
            case "removed" -> new DiffLine.Removed(payload);
            default -> throw new com.fasterxml.jackson.databind.JsonMappingException(
                parser, "Unknown DiffLine discriminator: " + tagValue);
        };
    }

    // Extracts the line text from what is left of the object once "kind" is removed.
    // NOTE(review): the name of the content field is not visible in this file, so the
    // single remaining textual field is unwrapped whatever its key is — confirm against
    // the producer's wire format. Other shapes keep the previous behaviour (JSON text).
    private static String payloadText(ObjectNode remaining) {
        if (remaining.size() == 1) {
            com.fasterxml.jackson.databind.JsonNode only = remaining.elements().next();
            if (only.isTextual()) {
                return only.asText();
            }
        }
        return remaining.toString();
    }
}
// Custom serializer for the DiffLine sealed interface. Writes a flat JSON object that
// starts with the "kind" discriminator, followed by the fields of the variant's payload
// when that payload maps to a JSON object.
//
// NOTE(review): every variant's payload is a String, which MAPPER.valueToTree turns into
// a text node rather than an object, so the isObject() branch is skipped and only "kind"
// is emitted — confirm whether the line text is meant to be written as well.
class DiffLineSerializer extends StdSerializer<DiffLine> {
    private static final com.fasterxml.jackson.databind.ObjectMapper MAPPER =
        new com.fasterxml.jackson.databind.ObjectMapper()
            .registerModule(new com.fasterxml.jackson.datatype.jdk8.Jdk8Module())
            .setPropertyNamingStrategy(
                com.fasterxml.jackson.databind.PropertyNamingStrategies.SNAKE_CASE)
            .setSerializationInclusion(
                com.fasterxml.jackson.annotation.JsonInclude.Include.NON_NULL);

    DiffLineSerializer() {
        super(DiffLine.class);
    }

    /**
     * Writes {@code value} as a JSON object tagged with its variant name.
     *
     * @param value    the diff line to write
     * @param gen      generator receiving the output
     * @param provider serializer provider (unused)
     * @throws java.io.IOException on generator failure, or if {@code value} is not one of
     *         the three known variants
     */
    @Override
    public void serialize(DiffLine value, JsonGenerator gen, SerializerProvider provider)
            throws java.io.IOException {
        final String kind;
        final Object payload;
        if (value instanceof DiffLine.Context line) {
            kind = "context";
            payload = line.value();
        } else if (value instanceof DiffLine.Added line) {
            kind = "added";
            payload = line.value();
        } else if (value instanceof DiffLine.Removed line) {
            kind = "removed";
            payload = line.value();
        } else {
            throw new com.fasterxml.jackson.databind.JsonMappingException(gen,
                "Unknown DiffLine variant: " + value.getClass().getName());
        }

        gen.writeStartObject();
        gen.writeStringField("kind", kind);
        if (payload != null) {
            final com.fasterxml.jackson.databind.JsonNode tree = MAPPER.valueToTree(payload);
            if (tree.isObject()) {
                // Flatten the payload's own fields next to the discriminator.
                final java.util.Iterator<String> names = tree.fieldNames();
                while (names.hasNext()) {
                    final String name = names.next();
                    gen.writeFieldName(name);
                    gen.writeTree(tree.get(name));
                }
            }
        }
        gen.writeEndObject();
    }
}

View File

@@ -0,0 +1,85 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
import org.jspecify.annotations.Nullable;
/**
 * Options controlling how two {@code ExtractionResult} values are compared.
 */
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = DiffOptions.Builder.class)
public record DiffOptions(
    /**
     * Include metadata changes in the diff. Default: {@code true}.
     */
    @JsonProperty("include_metadata") boolean includeMetadata,
    /**
     * Include embedded-children changes in the diff. Default: {@code true}.
     */
    @JsonProperty("include_embedded") boolean includeEmbedded,
    /**
     * Truncate content to this many characters before diffing.
     *
     * Useful for very large documents where only the first N characters matter.
     * {@code null} means no truncation.
     */
    @Nullable @JsonProperty("max_content_chars") Long maxContentChars
) {
    /**
     * Creates a fresh {@link Builder}.
     *
     * @return a builder preset to {@code includeMetadata = true},
     *         {@code includeEmbedded = true} and no content limit
     */
    public static Builder builder() {
        return new Builder();
    }

    // CPD-OFF
    /**
     * Fluent builder for {@link DiffOptions}; also the Jackson deserialization entry point
     * declared on the record.
     */
    @JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
    public static final class Builder {
        @JsonProperty("include_metadata")
        private boolean includeMetadata = true;
        @JsonProperty("include_embedded")
        private boolean includeEmbedded = true;
        @JsonProperty("max_content_chars")
        private Long maxContentChars = null;

        /** Sets the includeMetadata field. */
        @JsonProperty("include_metadata")
        public Builder withIncludeMetadata(final boolean value) {
            this.includeMetadata = value;
            return this;
        }

        /** Sets the includeEmbedded field. */
        @JsonProperty("include_embedded")
        public Builder withIncludeEmbedded(final boolean value) {
            this.includeEmbedded = value;
            return this;
        }

        /**
         * Sets the maxContentChars field.
         *
         * <p>This boxed overload is the one bound to the {@code max_content_chars} JSON
         * property, so an explicit JSON {@code null} stays {@code null} ("no truncation")
         * instead of being coerced to {@code 0} as it would be for a primitive parameter.
         *
         * @param value character limit, or {@code null} for no truncation
         * @return this builder
         */
        @JsonProperty("max_content_chars")
        public Builder withMaxContentChars(final @Nullable Long value) {
            this.maxContentChars = value;
            return this;
        }

        /**
         * Sets the maxContentChars field from a primitive value.
         *
         * <p>Kept so existing call sites that pass a {@code long} or {@code int} keep
         * compiling unchanged. It carries no Jackson annotation, so the annotated
         * {@code Long} overload is the setter used for deserialization.
         *
         * @param value character limit
         * @return this builder
         */
        public Builder withMaxContentChars(final long value) {
            this.maxContentChars = value;
            return this;
        }

        /** Builds the DiffOptions instance. */
        public DiffOptions build() {
            return new DiffOptions(
                includeMetadata,
                includeEmbedded,
                maxContentChars
            );
        }
    }
    // CPD-ON

    /**
     * Returns options carrying the Builder's initial values: metadata and embedded
     * children included, no content truncation.
     *
     * <p>Previously this method always threw {@link UnsupportedOperationException} and
     * directed callers to the Builder; it now delegates to the Builder itself.
     *
     * @return a new instance equal to {@code builder().build()}
     */
    public static DiffOptions defaultInstance() {
        return builder().build();
    }
}

View File

@@ -0,0 +1,153 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import java.util.List;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
import org.jspecify.annotations.Nullable;
/**
 * Comprehensive Djot document structure with semantic preservation.
 *
 * This type captures the full richness of Djot markup, including:
 * - Block-level structures (headings, lists, blockquotes, code blocks, etc.)
 * - Inline formatting (emphasis, strong, highlight, subscript, superscript, etc.)
 * - Attributes (classes, IDs, key-value pairs)
 * - Links, images, footnotes
 * - Math expressions (inline and display)
 * - Tables with full structure
 *
 * Available when the {@code djot} feature is enabled.
 *
 * @param plainText  plain text representation for backwards compatibility
 * @param blocks     structured block-level content
 * @param metadata   metadata from YAML frontmatter
 * @param tables     extracted tables as structured data
 * @param images     extracted images with metadata
 * @param links      extracted links with URLs
 * @param footnotes  footnote definitions
 * @param attributes attributes mapped by element identifier (if present); may be {@code null}
 */
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = DjotContent.Builder.class)
public record DjotContent(
    @JsonProperty("plain_text") String plainText,
    @JsonProperty("blocks") List<FormattedBlock> blocks,
    @JsonProperty("metadata") Metadata metadata,
    @JsonProperty("tables") List<Table> tables,
    @JsonProperty("images") List<DjotImage> images,
    @JsonProperty("links") List<DjotLink> links,
    @JsonProperty("footnotes") List<Footnote> footnotes,
    @Nullable @JsonProperty("attributes") List<String> attributes
) {
    /**
     * Creates a fresh {@link Builder}.
     *
     * @return a builder with empty text, empty immutable lists, and {@code null}
     *         metadata and attributes
     */
    public static Builder builder() {
        return new Builder();
    }

    // CPD-OFF
    /**
     * Fluent builder for {@link DjotContent}; also the Jackson deserialization entry point
     * declared on the record.
     *
     * <p>NOTE(review): the {@code metadata} component is not marked {@code @Nullable},
     * yet it stays {@code null} here unless {@link #withMetadata} is called — confirm
     * whether callers are expected to always supply it.
     */
    @JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
    public static final class Builder {
        @JsonProperty("plain_text")
        private String plainText = "";
        private List<FormattedBlock> blocks = List.of();
        private Metadata metadata;
        private List<Table> tables = List.of();
        private List<DjotImage> images = List.of();
        private List<DjotLink> links = List.of();
        private List<Footnote> footnotes = List.of();
        private List<String> attributes;

        /** Assigns {@code plainText} (JSON {@code plain_text}); stored without validation. */
        @JsonProperty("plain_text")
        public Builder withPlainText(final String plainText) {
            this.plainText = plainText;
            return this;
        }

        /** Assigns {@code blocks} (JSON {@code blocks}); the list reference is stored as given. */
        @JsonProperty("blocks")
        public Builder withBlocks(final List<FormattedBlock> blocks) {
            this.blocks = blocks;
            return this;
        }

        /** Assigns {@code metadata} (JSON {@code metadata}); stored without validation. */
        @JsonProperty("metadata")
        public Builder withMetadata(final Metadata metadata) {
            this.metadata = metadata;
            return this;
        }

        /** Assigns {@code tables} (JSON {@code tables}); the list reference is stored as given. */
        @JsonProperty("tables")
        public Builder withTables(final List<Table> tables) {
            this.tables = tables;
            return this;
        }

        /** Assigns {@code images} (JSON {@code images}); the list reference is stored as given. */
        @JsonProperty("images")
        public Builder withImages(final List<DjotImage> images) {
            this.images = images;
            return this;
        }

        /** Assigns {@code links} (JSON {@code links}); the list reference is stored as given. */
        @JsonProperty("links")
        public Builder withLinks(final List<DjotLink> links) {
            this.links = links;
            return this;
        }

        /** Assigns {@code footnotes} (JSON {@code footnotes}); the list reference is stored as given. */
        @JsonProperty("footnotes")
        public Builder withFootnotes(final List<Footnote> footnotes) {
            this.footnotes = footnotes;
            return this;
        }

        /** Assigns {@code attributes} (JSON {@code attributes}); {@code null} is allowed. */
        @JsonProperty("attributes")
        public Builder withAttributes(final @Nullable List<String> attributes) {
            this.attributes = attributes;
            return this;
        }

        /**
         * Creates the record from the values collected so far.
         *
         * @return a new {@link DjotContent}
         */
        public DjotContent build() {
            return new DjotContent(
                plainText, blocks, metadata, tables, images, links, footnotes, attributes);
        }
    }
    // CPD-ON
}

View File

@@ -0,0 +1,89 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
import org.jspecify.annotations.Nullable;
/**
 * Image element in Djot.
 *
 * @param src        image source URL or path
 * @param alt        alternative text
 * @param title      optional title; may be {@code null}
 * @param attributes element attributes; may be {@code null}
 */
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = DjotImage.Builder.class)
public record DjotImage(
    @JsonProperty("src") String src,
    @JsonProperty("alt") String alt,
    @Nullable @JsonProperty("title") String title,
    @Nullable @JsonProperty("attributes") String attributes
) {
    /**
     * Creates a fresh {@link Builder}.
     *
     * @return a builder with empty {@code src}/{@code alt} and {@code null}
     *         title and attributes
     */
    public static Builder builder() {
        return new Builder();
    }

    // CPD-OFF
    /**
     * Fluent builder for {@link DjotImage}; also the Jackson deserialization entry point
     * declared on the record.
     */
    @JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
    public static final class Builder {
        private String src = "";
        private String alt = "";
        private String title;
        private String attributes;

        /** Assigns {@code src} (JSON {@code src}); stored without validation. */
        @JsonProperty("src")
        public Builder withSrc(final String src) {
            this.src = src;
            return this;
        }

        /** Assigns {@code alt} (JSON {@code alt}); stored without validation. */
        @JsonProperty("alt")
        public Builder withAlt(final String alt) {
            this.alt = alt;
            return this;
        }

        /** Assigns {@code title} (JSON {@code title}); {@code null} is allowed. */
        @JsonProperty("title")
        public Builder withTitle(final @Nullable String title) {
            this.title = title;
            return this;
        }

        /** Assigns {@code attributes} (JSON {@code attributes}); {@code null} is allowed. */
        @JsonProperty("attributes")
        public Builder withAttributes(final @Nullable String attributes) {
            this.attributes = attributes;
            return this;
        }

        /**
         * Creates the record from the values collected so far.
         *
         * @return a new {@link DjotImage}
         */
        public DjotImage build() {
            return new DjotImage(src, alt, title, attributes);
        }
    }
    // CPD-ON
}

View File

@@ -0,0 +1,89 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
import org.jspecify.annotations.Nullable;
/**
 * Link element in Djot.
 *
 * @param url        link URL
 * @param text       link text content
 * @param title      optional title; may be {@code null}
 * @param attributes element attributes; may be {@code null}
 */
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = DjotLink.Builder.class)
public record DjotLink(
    @JsonProperty("url") String url,
    @JsonProperty("text") String text,
    @Nullable @JsonProperty("title") String title,
    @Nullable @JsonProperty("attributes") String attributes
) {
    /**
     * Creates a fresh {@link Builder}.
     *
     * @return a builder with empty {@code url}/{@code text} and {@code null}
     *         title and attributes
     */
    public static Builder builder() {
        return new Builder();
    }

    // CPD-OFF
    /**
     * Fluent builder for {@link DjotLink}; also the Jackson deserialization entry point
     * declared on the record.
     */
    @JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
    public static final class Builder {
        private String url = "";
        private String text = "";
        private String title;
        private String attributes;

        /** Assigns {@code url} (JSON {@code url}); stored without validation. */
        @JsonProperty("url")
        public Builder withUrl(final String url) {
            this.url = url;
            return this;
        }

        /** Assigns {@code text} (JSON {@code text}); stored without validation. */
        @JsonProperty("text")
        public Builder withText(final String text) {
            this.text = text;
            return this;
        }

        /** Assigns {@code title} (JSON {@code title}); {@code null} is allowed. */
        @JsonProperty("title")
        public Builder withTitle(final @Nullable String title) {
            this.title = title;
            return this;
        }

        /** Assigns {@code attributes} (JSON {@code attributes}); {@code null} is allowed. */
        @JsonProperty("attributes")
        public Builder withAttributes(final @Nullable String attributes) {
            this.attributes = attributes;
            return this;
        }

        /**
         * Creates the record from the values collected so far.
         *
         * @return a new {@link DjotLink}
         */
        public DjotLink build() {
            return new DjotLink(url, text, title, attributes);
        }
    }
    // CPD-ON
}

View File

@@ -0,0 +1,402 @@
package dev.kreuzberg;
import java.lang.foreign.Arena;
import java.lang.foreign.FunctionDescriptor;
import java.lang.foreign.Linker;
import java.lang.foreign.MemoryLayout;
import java.lang.foreign.MemorySegment;
import java.lang.foreign.ValueLayout;
import java.lang.invoke.MethodHandles;
import java.lang.invoke.MethodType;
import java.util.List;
import java.util.concurrent.ConcurrentHashMap;
import com.fasterxml.jackson.databind.ObjectMapper;
/**
* Allocates Panama FFM upcall stubs for an IDocumentExtractor implementation,
* assembles the C vtable in native memory, and provides static
* registerDocumentExtractor/unregisterDocumentExtractor helpers.
*/
public final class DocumentExtractorBridge implements AutoCloseable {
private static final Linker LINKER = Linker.nativeLinker();
private static final MethodHandles.Lookup LOOKUP = MethodHandles.lookup();
private static final ObjectMapper JSON = new ObjectMapper();
/** Live registry — keeps Arenas and upcall stubs alive past the register call. */
private static final ConcurrentHashMap<String, DocumentExtractorBridge>
DOCUMENT_EXTRACTOR_BRIDGES = new ConcurrentHashMap<>();
// C vtable: 11 fields (4 plugin methods + 5 trait methods + free_string + free_user_data)
private static final MemoryLayout VTABLE_LAYOUT = MemoryLayout.structLayout(ValueLayout.ADDRESS, ValueLayout.ADDRESS, ValueLayout.ADDRESS, ValueLayout.ADDRESS, ValueLayout.ADDRESS, ValueLayout.ADDRESS, ValueLayout.ADDRESS, ValueLayout.ADDRESS, ValueLayout.ADDRESS, ValueLayout.ADDRESS, ValueLayout.ADDRESS);
private static final long VTABLE_SIZE = VTABLE_LAYOUT.byteSize();
private final Arena arena;
private final MemorySegment vtable;
private final IDocumentExtractor impl;
DocumentExtractorBridge(final IDocumentExtractor impl) {
this.impl = impl;
this.arena = Arena.ofShared();
this.vtable = arena.allocate(VTABLE_SIZE);
try {
long offset = 0L;
var stubName = LINKER.upcallStub(LOOKUP.bind(this, "handleName",
MethodType.methodType(int.class, MemorySegment.class, MemorySegment.class, MemorySegment.class)),
FunctionDescriptor.of(ValueLayout.JAVA_INT, ValueLayout.ADDRESS, ValueLayout.ADDRESS, ValueLayout.ADDRESS),
arena);
vtable.set(ValueLayout.ADDRESS, offset, stubName);
offset += ValueLayout.ADDRESS.byteSize();
var stubVersion = LINKER.upcallStub(LOOKUP.bind(this, "handleVersion",
MethodType.methodType(int.class, MemorySegment.class, MemorySegment.class, MemorySegment.class)),
FunctionDescriptor.of(ValueLayout.JAVA_INT, ValueLayout.ADDRESS, ValueLayout.ADDRESS, ValueLayout.ADDRESS),
arena);
vtable.set(ValueLayout.ADDRESS, offset, stubVersion);
offset += ValueLayout.ADDRESS.byteSize();
var stubInitialize = LINKER.upcallStub(LOOKUP.bind(this, "handleInitialize",
MethodType.methodType(int.class, MemorySegment.class, MemorySegment.class)),
FunctionDescriptor.of(ValueLayout.JAVA_INT, ValueLayout.ADDRESS, ValueLayout.ADDRESS),
arena);
vtable.set(ValueLayout.ADDRESS, offset, stubInitialize);
offset += ValueLayout.ADDRESS.byteSize();
var stubShutdown = LINKER.upcallStub(LOOKUP.bind(this, "handleShutdown",
MethodType.methodType(int.class, MemorySegment.class, MemorySegment.class)),
FunctionDescriptor.of(ValueLayout.JAVA_INT, ValueLayout.ADDRESS, ValueLayout.ADDRESS),
arena);
vtable.set(ValueLayout.ADDRESS, offset, stubShutdown);
offset += ValueLayout.ADDRESS.byteSize();
var stubExtractBytes = LINKER.upcallStub(LOOKUP.bind(this, "handleExtractBytes",
MethodType.methodType(
int.class,
MemorySegment.class,
MemorySegment.class,
long.class,
MemorySegment.class,
MemorySegment.class,
MemorySegment.class,
MemorySegment.class
)),
FunctionDescriptor.of(
ValueLayout.JAVA_INT,
ValueLayout.ADDRESS,
ValueLayout.ADDRESS,
ValueLayout.JAVA_LONG,
ValueLayout.ADDRESS,
ValueLayout.ADDRESS,
ValueLayout.ADDRESS,
ValueLayout.ADDRESS
),
arena);
vtable.set(ValueLayout.ADDRESS, offset, stubExtractBytes);
offset += ValueLayout.ADDRESS.byteSize();
var stubExtractFile = LINKER.upcallStub(LOOKUP.bind(this, "handleExtractFile",
MethodType.methodType(
int.class,
MemorySegment.class,
MemorySegment.class,
MemorySegment.class,
MemorySegment.class,
MemorySegment.class,
MemorySegment.class
)),
FunctionDescriptor.of(
ValueLayout.JAVA_INT,
ValueLayout.ADDRESS,
ValueLayout.ADDRESS,
ValueLayout.ADDRESS,
ValueLayout.ADDRESS,
ValueLayout.ADDRESS,
ValueLayout.ADDRESS
),
arena);
vtable.set(ValueLayout.ADDRESS, offset, stubExtractFile);
offset += ValueLayout.ADDRESS.byteSize();
var stubSupportedMimeTypes = LINKER.upcallStub(LOOKUP.bind(this, "handleSupportedMimeTypes",
MethodType.methodType(int.class, MemorySegment.class, MemorySegment.class, MemorySegment.class)),
FunctionDescriptor.of(ValueLayout.JAVA_INT, ValueLayout.ADDRESS, ValueLayout.ADDRESS, ValueLayout.ADDRESS),
arena);
vtable.set(ValueLayout.ADDRESS, offset, stubSupportedMimeTypes);
offset += ValueLayout.ADDRESS.byteSize();
var stubPriority = LINKER.upcallStub(LOOKUP.bind(this, "handlePriority",
MethodType.methodType(int.class, MemorySegment.class, MemorySegment.class, MemorySegment.class)),
FunctionDescriptor.of(ValueLayout.JAVA_INT, ValueLayout.ADDRESS, ValueLayout.ADDRESS, ValueLayout.ADDRESS),
arena);
vtable.set(ValueLayout.ADDRESS, offset, stubPriority);
offset += ValueLayout.ADDRESS.byteSize();
var stubCanHandle = LINKER.upcallStub(LOOKUP.bind(this, "handleCanHandle",
MethodType.methodType(
int.class,
MemorySegment.class,
MemorySegment.class,
MemorySegment.class,
MemorySegment.class,
MemorySegment.class
)),
FunctionDescriptor.of(
ValueLayout.JAVA_INT,
ValueLayout.ADDRESS,
ValueLayout.ADDRESS,
ValueLayout.ADDRESS,
ValueLayout.ADDRESS,
ValueLayout.ADDRESS
),
arena);
vtable.set(ValueLayout.ADDRESS, offset, stubCanHandle);
offset += ValueLayout.ADDRESS.byteSize();
var stubFreeString = LINKER.upcallStub(LOOKUP.bind(this, "freeString",
MethodType.methodType(void.class, MemorySegment.class)),
FunctionDescriptor.ofVoid(ValueLayout.ADDRESS),
arena);
vtable.set(ValueLayout.ADDRESS, offset, stubFreeString);
offset += ValueLayout.ADDRESS.byteSize();
var stubFreeUserData = LINKER.upcallStub(LOOKUP.bind(this, "freeUserData",
MethodType.methodType(void.class, MemorySegment.class)),
FunctionDescriptor.ofVoid(ValueLayout.ADDRESS),
arena);
vtable.set(ValueLayout.ADDRESS, offset, stubFreeUserData);
offset += ValueLayout.ADDRESS.byteSize();
} catch (ReflectiveOperationException e) {
arena.close();
throw new RuntimeException("Failed to create trait bridge stubs", e);
}
}
MemorySegment vtableSegment() { return vtable; }
private int handleName(MemorySegment userData, MemorySegment outName, MemorySegment outError) {
try {
outName.set(ValueLayout.ADDRESS, 0, arena.allocateFrom(impl.name()));
return 0;
} catch (Throwable e) { return 1; }
}
private int handleVersion(MemorySegment userData, MemorySegment outVersion, MemorySegment outError) {
try {
outVersion.set(ValueLayout.ADDRESS, 0, arena.allocateFrom(impl.version()));
return 0;
} catch (Throwable e) { return 1; }
}
private int handleInitialize(MemorySegment userData, MemorySegment outError) {
try {
impl.initialize();
return 0;
} catch (Throwable e) { return 1; }
}
private int handleShutdown(MemorySegment userData, MemorySegment outError) {
try {
impl.shutdown();
return 0;
} catch (Throwable e) { return 1; }
}
private int handleExtractBytes(
MemorySegment userData,
MemorySegment content_in,
long contentLen,
MemorySegment mime_type_in,
MemorySegment config_in,
MemorySegment outResult,
MemorySegment outError
) {
try {
byte[] content = content_in.reinterpret(contentLen).toArray(ValueLayout.JAVA_BYTE);
String mime_type = mime_type_in.reinterpret(Long.MAX_VALUE).getString(0);
String config_json = config_in.reinterpret(Long.MAX_VALUE).getString(0);
ExtractionConfig config = JSON.readValue(config_json, ExtractionConfig.class);
String result = impl.extract_bytes(content, mime_type, config);
MemorySegment jsonCs = arena.allocateFrom(result);
outResult.set(ValueLayout.ADDRESS, 0, jsonCs);
return 0;
} catch (Throwable e) {
writeError(outError, e);
return 1;
}
}
private int handleExtractFile(
MemorySegment userData,
MemorySegment path_in,
MemorySegment mime_type_in,
MemorySegment config_in,
MemorySegment outResult,
MemorySegment outError
) {
try {
java.nio.file.Path path = java.nio.file.Paths.get(path_in.reinterpret(Long.MAX_VALUE).getString(0));
String mime_type = mime_type_in.reinterpret(Long.MAX_VALUE).getString(0);
String config_json = config_in.reinterpret(Long.MAX_VALUE).getString(0);
ExtractionConfig config = JSON.readValue(config_json, ExtractionConfig.class);
String result = impl.extract_file(path, mime_type, config);
MemorySegment jsonCs = arena.allocateFrom(result);
outResult.set(ValueLayout.ADDRESS, 0, jsonCs);
return 0;
} catch (Throwable e) {
writeError(outError, e);
return 1;
}
}
private int handleSupportedMimeTypes(MemorySegment userData, MemorySegment outResult, MemorySegment outError) {
try {
List<String> result = impl.supported_mime_types();
String json = JSON.writeValueAsString(result);
MemorySegment jsonCs = arena.allocateFrom(json);
outResult.set(ValueLayout.ADDRESS, 0, jsonCs);
return 0;
} catch (Throwable e) {
writeError(outError, e);
return 1;
}
}
private int handlePriority(MemorySegment userData, MemorySegment outResult, MemorySegment outError) {
try {
int result = impl.priority();
String json = JSON.writeValueAsString(result);
MemorySegment jsonCs = arena.allocateFrom(json);
outResult.set(ValueLayout.ADDRESS, 0, jsonCs);
return 0;
} catch (Throwable e) {
writeError(outError, e);
return 1;
}
}
private int handleCanHandle(
MemorySegment userData,
MemorySegment _path_in,
MemorySegment _mime_type_in,
MemorySegment outResult,
MemorySegment outError
) {
try {
java.nio.file.Path _path = java.nio.file.Paths.get(_path_in.reinterpret(Long.MAX_VALUE).getString(0));
String _mime_type = _mime_type_in.reinterpret(Long.MAX_VALUE).getString(0);
boolean result = impl.can_handle(_path, _mime_type);
String json = JSON.writeValueAsString(result);
MemorySegment jsonCs = arena.allocateFrom(json);
outResult.set(ValueLayout.ADDRESS, 0, jsonCs);
return 0;
} catch (Throwable e) {
writeError(outError, e);
return 1;
}
}
private void writeError(MemorySegment outError, Throwable e) {
try { outError.set(ValueLayout.ADDRESS, 0, arena.allocateFrom(e.getClass().getSimpleName() + ": " + e.getMessage())); }
catch (Throwable ignored) { /* swallow */ }
}
private void freeString(MemorySegment ptr) {
// Strings returned by Java callbacks are arena-owned and released when this bridge closes.
}
private void freeUserData(MemorySegment userData) {
// User data is Java-side state (the impl object), not freed by Rust on drop.
}
/** Read a NUL-terminated native C string safely without unbounded reinterpret. */
private static String readNativeString(MemorySegment ptr) {
return ptr.reinterpret(4096).getString(0);
}
@Override
public void close() { arena.close(); }
/** Register a DocumentExtractor implementation via Panama FFM upcall stubs. */
public static void registerDocumentExtractor(final IDocumentExtractor impl) throws Exception {
var bridge = new DocumentExtractorBridge(impl);
try {
try (var nameArena = Arena.ofShared()) {
var nameCs = nameArena.allocateFrom(impl.name());
MemorySegment outErr = nameArena.allocate(ValueLayout.ADDRESS);
int rc = (int) NativeLib.KREUZBERG_REGISTER_DOCUMENT_EXTRACTOR.invoke(
nameCs,
bridge.vtableSegment(),
MemorySegment.NULL,
outErr
);
if (rc != 0) {
MemorySegment errPtr = outErr.get(ValueLayout.ADDRESS, 0);
String msg = errPtr.equals(MemorySegment.NULL) ? "registration failed (rc=" + rc + ")" : readNativeString(errPtr);
throw new RuntimeException("registerDocumentExtractor: " + msg);
}
}
} catch (Throwable t) {
bridge.close();
if (t instanceof Exception e) {
throw e;
} else {
throw new RuntimeException("Unexpected error during registration", t);
}
}
DOCUMENT_EXTRACTOR_BRIDGES.put(impl.name(), bridge);
}
/** Unregister a DocumentExtractor implementation by name. */
public static void unregisterDocumentExtractor(String name) throws Exception {
try {
try (var nameArena = Arena.ofShared()) {
var nameCs = nameArena.allocateFrom(name);
MemorySegment outErr = nameArena.allocate(ValueLayout.ADDRESS);
int rc = (int) NativeLib.KREUZBERG_UNREGISTER_DOCUMENT_EXTRACTOR.invoke(nameCs, outErr);
if (rc != 0) {
MemorySegment errPtr = outErr.get(ValueLayout.ADDRESS, 0);
String msg = errPtr.equals(MemorySegment.NULL)
? "unregistration failed (rc=" + rc + ")"
: errPtr.reinterpret(Long.MAX_VALUE).getString(0);
throw new RuntimeException("unregisterDocumentExtractor: " + msg);
}
}
} catch (Throwable t) {
if (t instanceof Exception e) {
throw e;
} else {
throw new RuntimeException("Unexpected error during unregistration", t);
}
}
DocumentExtractorBridge old = DOCUMENT_EXTRACTOR_BRIDGES.remove(name);
if (old != null) { old.close(); }
}
/** Clear all registered DocumentExtractor implementations. */
public static void clearDocumentExtractors() throws Exception {
try {
try (var arena = Arena.ofShared()) {
MemorySegment outErr = arena.allocate(ValueLayout.ADDRESS);
int rc = (int) NativeLib.KREUZBERG_CLEAR_DOCUMENT_EXTRACTOR.invoke(outErr);
if (rc != 0) {
MemorySegment errPtr = outErr.get(ValueLayout.ADDRESS, 0);
String msg = errPtr.equals(MemorySegment.NULL)
? "clear failed (rc=" + rc + ")"
: errPtr.reinterpret(Long.MAX_VALUE).getString(0);
throw new RuntimeException("clearDocumentExtractors: " + msg);
}
}
} catch (Throwable t) {
if (t instanceof Exception e) {
throw e;
} else {
throw new RuntimeException("Unexpected error during clear", t);
}
}
DOCUMENT_EXTRACTOR_BRIDGES.values().forEach(DocumentExtractorBridge::close);
DOCUMENT_EXTRACTOR_BRIDGES.clear();
}
}

View File

@@ -0,0 +1,179 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import java.util.List;
import java.util.Map;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
import org.jspecify.annotations.Nullable;
/**
 * A single node in the document tree.
 *
 * Each node has deterministic {@code id}, typed {@code content}, optional {@code parent}/{@code children}
 * for tree structure, and metadata like page number, bounding box, and content layer.
 */
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = DocumentNode.Builder.class)
public record DocumentNode(
    /**
     * Deterministic identifier (hash of content + position).
     */
    @JsonProperty("id") String id,
    /**
     * Node content — tagged enum, type-specific data only.
     */
    @JsonProperty("content") NodeContent content,
    /**
     * Parent node index ({@code None} = root-level node).
     */
    @Nullable @JsonProperty("parent") Integer parent,
    /**
     * Child node indices in reading order.
     */
    @Nullable @JsonProperty("children") List<Integer> children,
    /**
     * Content layer classification.
     */
    @Nullable @JsonProperty("content_layer") ContentLayer contentLayer,
    /**
     * Page number where this node starts (1-indexed).
     */
    @Nullable @JsonProperty("page") Integer page,
    /**
     * Page number where this node ends (for multi-page tables/sections).
     */
    @Nullable @JsonProperty("page_end") Integer pageEnd,
    /**
     * Bounding box in document coordinates.
     */
    @Nullable @JsonProperty("bbox") BoundingBox bbox,
    /**
     * Inline annotations (formatting, links) on this node's text content.
     *
     * Only meaningful for text-carrying nodes; empty for containers.
     */
    @Nullable @JsonProperty("annotations") List<TextAnnotation> annotations,
    /**
     * Format-specific key-value attributes.
     *
     * Extensible bag for miscellaneous data without a dedicated typed field: CSS classes,
     * LaTeX environment names, Excel cell formulas, slide layout names, etc.
     */
    @Nullable @JsonProperty("attributes") Map<String, String> attributes
) {
    /** Returns a new builder with every field at its default value. */
    public static Builder builder() {
        return new Builder();
    }

    // CPD-OFF
    /**
     * Mutable builder used both by callers and by Jackson when deserializing.
     *
     * <p>Defaults: {@code id} is the empty string, {@code contentLayer} is
     * {@code ContentLayer.Body}, every other field is {@code null}.
     */
    @JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
    public static final class Builder {
        private String id = "";
        private NodeContent content = null;
        private Integer parent = null;
        private List<Integer> children = null;
        @JsonProperty("content_layer")
        @Nullable private ContentLayer contentLayer = ContentLayer.Body;
        private Integer page = null;
        @JsonProperty("page_end")
        private Integer pageEnd = null;
        private BoundingBox bbox = null;
        private List<TextAnnotation> annotations = null;
        private Map<String, String> attributes = null;

        /** Sets the id field. */
        @JsonProperty("id")
        public Builder withId(final String value) {
            this.id = value;
            return this;
        }

        /** Sets the content field. */
        @JsonProperty("content")
        public Builder withContent(final NodeContent value) {
            this.content = value;
            return this;
        }

        /**
         * Sets the parent field.
         *
         * <p>Takes the boxed type so that {@code null} (a root-level node) can be passed and
         * so that an explicit JSON {@code null} is not coerced to index {@code 0}.
         */
        @JsonProperty("parent")
        public Builder withParent(final @Nullable Integer value) {
            this.parent = value;
            return this;
        }

        /** Sets the children field. */
        @JsonProperty("children")
        public Builder withChildren(final @Nullable List<Integer> value) {
            this.children = value;
            return this;
        }

        /** Sets the contentLayer field. */
        @JsonProperty("content_layer")
        public Builder withContentLayer(final @Nullable ContentLayer value) {
            this.contentLayer = value;
            return this;
        }

        /** Sets the page field; {@code null} leaves the start page unset. */
        @JsonProperty("page")
        public Builder withPage(final @Nullable Integer value) {
            this.page = value;
            return this;
        }

        /** Sets the pageEnd field; {@code null} leaves the end page unset. */
        @JsonProperty("page_end")
        public Builder withPageEnd(final @Nullable Integer value) {
            this.pageEnd = value;
            return this;
        }

        /** Sets the bbox field. */
        @JsonProperty("bbox")
        public Builder withBbox(final @Nullable BoundingBox value) {
            this.bbox = value;
            return this;
        }

        /** Sets the annotations field. */
        @JsonProperty("annotations")
        public Builder withAnnotations(final @Nullable List<TextAnnotation> value) {
            this.annotations = value;
            return this;
        }

        /** Sets the attributes field. */
        @JsonProperty("attributes")
        public Builder withAttributes(final @Nullable Map<String, String> value) {
            this.attributes = value;
            return this;
        }

        /** Builds the DocumentNode instance. */
        public DocumentNode build() {
            return new DocumentNode(
                id,
                content,
                parent,
                children,
                contentLayer,
                page,
                pageEnd,
                bbox,
                annotations,
                attributes
            );
        }
    }
    // CPD-ON
}

View File

@@ -0,0 +1,75 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
/**
 * A resolved relationship between two nodes in the document tree.
 */
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = DocumentRelationship.Builder.class)
public record DocumentRelationship(
    /**
     * Source node index (the referencing node).
     */
    @JsonProperty("source") int source,
    /**
     * Target node index (the referenced node).
     */
    @JsonProperty("target") int target,
    /**
     * Semantic kind of the relationship.
     */
    @JsonProperty("kind") RelationshipKind kind
) {
    // CPD-OFF
    /**
     * Fluent builder, also used by Jackson for deserialization.
     *
     * <p>Starts with both indices at {@code 0} and {@code kind} unset ({@code null}).
     */
    @JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
    public static final class Builder {
        private int source = 0;
        private int target = 0;
        private RelationshipKind kind = null;

        /** Sets the source field. */
        @JsonProperty("source")
        public Builder withSource(final int value) {
            source = value;
            return this;
        }

        /** Sets the target field. */
        @JsonProperty("target")
        public Builder withTarget(final int value) {
            target = value;
            return this;
        }

        /** Sets the kind field. */
        @JsonProperty("kind")
        public Builder withKind(final RelationshipKind value) {
            kind = value;
            return this;
        }

        /** Builds the DocumentRelationship instance. */
        public DocumentRelationship build() {
            return new DocumentRelationship(source, target, kind);
        }
    }
    // CPD-ON

    /** Creates an empty {@link Builder}. */
    public static Builder builder() {
        return new Builder();
    }
}

View File

@@ -0,0 +1,134 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
import org.jspecify.annotations.Nullable;
/**
 * A single tracked change embedded in a document.
 *
 * Populated by per-format extractors that understand change-tracking metadata
 * (DOCX {@code w:ins}/{@code w:del}/{@code w:rPrChange}, ODT {@code text:change-*}, …). Every
 * extractor defaults to {@code ExtractionResult.revisions = None} until a
 * format-specific implementation is added.
 */
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = DocumentRevision.Builder.class)
public record DocumentRevision(
    /**
     * Format-specific revision identifier.
     *
     * For DOCX this is the {@code w:id} attribute value on the change element
     * (e.g. {@code "42"}). When the attribute is absent a synthetic fallback is
     * generated ({@code "docx-ins-0"}, {@code "docx-del-3"}, …).
     */
    @JsonProperty("revision_id") String revisionId,
    /**
     * Display name of the author who made this change, when available.
     */
    @Nullable @JsonProperty("author") String author,
    /**
     * ISO-8601 timestamp of the change, when available.
     *
     * Stored as a plain string so this type remains FFI-friendly and
     * unconditionally available without the {@code chrono} optional dep.
     * DOCX populates this from the {@code w:date} attribute (e.g.
     * {@code "2024-03-15T10:30:00Z"}).
     */
    @Nullable @JsonProperty("timestamp") String timestamp,
    /**
     * Semantic kind of this revision.
     */
    @JsonProperty("kind") RevisionKind kind,
    /**
     * Best-effort document location for this revision.
     *
     * Resolution is format-dependent and may be {@code None} when the location
     * cannot be determined (e.g. changes inside table cells before
     * table-cell anchor support is added).
     */
    @Nullable @JsonProperty("anchor") RevisionAnchor anchor,
    /**
     * The content changes that make up this revision.
     */
    @JsonProperty("delta") RevisionDelta delta
) {
    // CPD-OFF
    /**
     * Fluent builder, also used by Jackson for deserialization.
     *
     * <p>Starts with {@code revisionId} as the empty string and every other field {@code null}.
     */
    @JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
    public static final class Builder {
        @JsonProperty("revision_id")
        private String revisionId = "";
        private String author = null;
        private String timestamp = null;
        private RevisionKind kind = null;
        private RevisionAnchor anchor = null;
        private RevisionDelta delta = null;

        /** Sets the revisionId field. */
        @JsonProperty("revision_id")
        public Builder withRevisionId(final String value) {
            revisionId = value;
            return this;
        }

        /** Sets the author field. */
        @JsonProperty("author")
        public Builder withAuthor(final @Nullable String value) {
            author = value;
            return this;
        }

        /** Sets the timestamp field. */
        @JsonProperty("timestamp")
        public Builder withTimestamp(final @Nullable String value) {
            timestamp = value;
            return this;
        }

        /** Sets the kind field. */
        @JsonProperty("kind")
        public Builder withKind(final RevisionKind value) {
            kind = value;
            return this;
        }

        /** Sets the anchor field. */
        @JsonProperty("anchor")
        public Builder withAnchor(final @Nullable RevisionAnchor value) {
            anchor = value;
            return this;
        }

        /** Sets the delta field. */
        @JsonProperty("delta")
        public Builder withDelta(final RevisionDelta value) {
            delta = value;
            return this;
        }

        /** Builds the DocumentRevision instance. */
        public DocumentRevision build() {
            return new DocumentRevision(revisionId, author, timestamp, kind, anchor, delta);
        }
    }
    // CPD-ON

    /** Creates an empty {@link Builder}. */
    public static Builder builder() {
        return new Builder();
    }
}

View File

@@ -0,0 +1,117 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import java.util.List;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
import org.jspecify.annotations.Nullable;
/**
 * Top-level structured document representation.
 *
 * A flat array of nodes with index-based parent/child references forming a tree.
 * Root-level nodes have {@code parent: None}. Use {@code body_roots()} and {@code furniture_roots()}
 * to iterate over top-level content by layer.
 *
 * # Validation
 *
 * Call {@code validate()} after construction to verify all node indices are in bounds
 * and parent-child relationships are bidirectionally consistent.
 */
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = DocumentStructure.Builder.class)
public record DocumentStructure(
    /**
     * All nodes in document/reading order.
     */
    @JsonProperty("nodes") List<DocumentNode> nodes,
    /**
     * Origin format identifier (e.g. "docx", "pptx", "html", "pdf").
     *
     * Allows renderers to apply format-aware heuristics when converting
     * the document tree to output formats.
     */
    @Nullable @JsonProperty("source_format") String sourceFormat,
    /**
     * Resolved relationships between nodes (footnote refs, citations, anchor links, etc.).
     *
     * Populated during derivation from the internal document representation.
     * Empty when no relationships are detected.
     */
    @Nullable @JsonProperty("relationships") List<DocumentRelationship> relationships,
    /**
     * Sorted, deduplicated list of node type names present in this document.
     *
     * Each value is the snake_case {@code node_type} tag of the corresponding
     * NodeContent variant (e.g. {@code "paragraph"}, {@code "heading"}, {@code "table"}, …).
     *
     * Computed from nodes via DocumentStructure.finalize_node_types.
     * Empty until that method is called (internal construction paths call it
     * at the end of derivation).
     */
    @Nullable @JsonProperty("node_types") List<String> nodeTypes
) {
    // CPD-OFF
    /**
     * Fluent builder, also used by Jackson for deserialization.
     *
     * <p>Starts with {@code nodes} as an empty immutable list and every other field {@code null}.
     */
    @JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
    public static final class Builder {
        private List<DocumentNode> nodes = List.of();
        @JsonProperty("source_format")
        private String sourceFormat = null;
        private List<DocumentRelationship> relationships = null;
        @JsonProperty("node_types")
        private List<String> nodeTypes = null;

        /** Sets the nodes field. */
        @JsonProperty("nodes")
        public Builder withNodes(final List<DocumentNode> value) {
            nodes = value;
            return this;
        }

        /** Sets the sourceFormat field. */
        @JsonProperty("source_format")
        public Builder withSourceFormat(final @Nullable String value) {
            sourceFormat = value;
            return this;
        }

        /** Sets the relationships field. */
        @JsonProperty("relationships")
        public Builder withRelationships(final @Nullable List<DocumentRelationship> value) {
            relationships = value;
            return this;
        }

        /** Sets the nodeTypes field. */
        @JsonProperty("node_types")
        public Builder withNodeTypes(final @Nullable List<String> value) {
            nodeTypes = value;
            return this;
        }

        /** Builds the DocumentStructure instance. */
        public DocumentStructure build() {
            return new DocumentStructure(nodes, sourceFormat, relationships, nodeTypes);
        }
    }
    // CPD-ON

    /** Creates an empty {@link Builder}. */
    public static Builder builder() {
        return new Builder();
    }

    /**
     * Placeholder for the native default value; not available from Java yet.
     *
     * @throws UnsupportedOperationException always
     */
    public static DocumentStructure defaultInstance() {
        throw new UnsupportedOperationException("defaultInstance is not yet bridged via JNI; use the Builder instead.");
    }
}

View File

@@ -0,0 +1,255 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
import org.jspecify.annotations.Nullable;
/**
 * Application properties from docProps/app.xml for DOCX
 *
 * Contains Word-specific document statistics and metadata.
 */
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = DocxAppProperties.Builder.class)
public record DocxAppProperties(
    /**
     * Application name (e.g., "Microsoft Office Word")
     */
    @Nullable @JsonProperty("application") String application,
    /**
     * Application version
     */
    @Nullable @JsonProperty("app_version") String appVersion,
    /**
     * Template filename
     */
    @Nullable @JsonProperty("template") String template,
    /**
     * Total editing time in minutes
     */
    @Nullable @JsonProperty("total_time") Integer totalTime,
    /**
     * Number of pages
     */
    @Nullable @JsonProperty("pages") Integer pages,
    /**
     * Number of words
     */
    @Nullable @JsonProperty("words") Integer words,
    /**
     * Number of characters (excluding spaces)
     */
    @Nullable @JsonProperty("characters") Integer characters,
    /**
     * Number of characters (including spaces)
     */
    @Nullable @JsonProperty("characters_with_spaces") Integer charactersWithSpaces,
    /**
     * Number of lines
     */
    @Nullable @JsonProperty("lines") Integer lines,
    /**
     * Number of paragraphs
     */
    @Nullable @JsonProperty("paragraphs") Integer paragraphs,
    /**
     * Company name
     */
    @Nullable @JsonProperty("company") String company,
    /**
     * Document security level
     */
    @Nullable @JsonProperty("doc_security") Integer docSecurity,
    /**
     * Scale crop flag
     */
    @Nullable @JsonProperty("scale_crop") Boolean scaleCrop,
    /**
     * Links up to date flag
     */
    @Nullable @JsonProperty("links_up_to_date") Boolean linksUpToDate,
    /**
     * Shared document flag
     */
    @Nullable @JsonProperty("shared_doc") Boolean sharedDoc,
    /**
     * Hyperlinks changed flag
     */
    @Nullable @JsonProperty("hyperlinks_changed") Boolean hyperlinksChanged
) {
    /** Returns a new builder with every field unset ({@code null}). */
    public static Builder builder() {
        return new Builder();
    }

    // CPD-OFF
    /**
     * Mutable builder used both by callers and by Jackson when deserializing.
     *
     * <p>Numeric and boolean setters take the boxed type so that {@code null} ("not
     * present") can be passed, and so that an explicit JSON {@code null} is kept as
     * {@code null} instead of being coerced to {@code 0} / {@code false}.
     */
    @JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
    public static final class Builder {
        private String application = null;
        @JsonProperty("app_version")
        private String appVersion = null;
        private String template = null;
        @JsonProperty("total_time")
        private Integer totalTime = null;
        private Integer pages = null;
        private Integer words = null;
        private Integer characters = null;
        @JsonProperty("characters_with_spaces")
        private Integer charactersWithSpaces = null;
        private Integer lines = null;
        private Integer paragraphs = null;
        private String company = null;
        @JsonProperty("doc_security")
        private Integer docSecurity = null;
        @JsonProperty("scale_crop")
        private Boolean scaleCrop = null;
        @JsonProperty("links_up_to_date")
        private Boolean linksUpToDate = null;
        @JsonProperty("shared_doc")
        private Boolean sharedDoc = null;
        @JsonProperty("hyperlinks_changed")
        private Boolean hyperlinksChanged = null;

        /** Sets the application field. */
        @JsonProperty("application")
        public Builder withApplication(final @Nullable String value) {
            this.application = value;
            return this;
        }

        /** Sets the appVersion field. */
        @JsonProperty("app_version")
        public Builder withAppVersion(final @Nullable String value) {
            this.appVersion = value;
            return this;
        }

        /** Sets the template field. */
        @JsonProperty("template")
        public Builder withTemplate(final @Nullable String value) {
            this.template = value;
            return this;
        }

        /** Sets the totalTime field. */
        @JsonProperty("total_time")
        public Builder withTotalTime(final @Nullable Integer value) {
            this.totalTime = value;
            return this;
        }

        /** Sets the pages field. */
        @JsonProperty("pages")
        public Builder withPages(final @Nullable Integer value) {
            this.pages = value;
            return this;
        }

        /** Sets the words field. */
        @JsonProperty("words")
        public Builder withWords(final @Nullable Integer value) {
            this.words = value;
            return this;
        }

        /** Sets the characters field. */
        @JsonProperty("characters")
        public Builder withCharacters(final @Nullable Integer value) {
            this.characters = value;
            return this;
        }

        /** Sets the charactersWithSpaces field. */
        @JsonProperty("characters_with_spaces")
        public Builder withCharactersWithSpaces(final @Nullable Integer value) {
            this.charactersWithSpaces = value;
            return this;
        }

        /** Sets the lines field. */
        @JsonProperty("lines")
        public Builder withLines(final @Nullable Integer value) {
            this.lines = value;
            return this;
        }

        /** Sets the paragraphs field. */
        @JsonProperty("paragraphs")
        public Builder withParagraphs(final @Nullable Integer value) {
            this.paragraphs = value;
            return this;
        }

        /** Sets the company field. */
        @JsonProperty("company")
        public Builder withCompany(final @Nullable String value) {
            this.company = value;
            return this;
        }

        /** Sets the docSecurity field. */
        @JsonProperty("doc_security")
        public Builder withDocSecurity(final @Nullable Integer value) {
            this.docSecurity = value;
            return this;
        }

        /** Sets the scaleCrop field. */
        @JsonProperty("scale_crop")
        public Builder withScaleCrop(final @Nullable Boolean value) {
            this.scaleCrop = value;
            return this;
        }

        /** Sets the linksUpToDate field. */
        @JsonProperty("links_up_to_date")
        public Builder withLinksUpToDate(final @Nullable Boolean value) {
            this.linksUpToDate = value;
            return this;
        }

        /** Sets the sharedDoc field. */
        @JsonProperty("shared_doc")
        public Builder withSharedDoc(final @Nullable Boolean value) {
            this.sharedDoc = value;
            return this;
        }

        /** Sets the hyperlinksChanged field. */
        @JsonProperty("hyperlinks_changed")
        public Builder withHyperlinksChanged(final @Nullable Boolean value) {
            this.hyperlinksChanged = value;
            return this;
        }

        /** Builds the DocxAppProperties instance. */
        public DocxAppProperties build() {
            return new DocxAppProperties(
                application,
                appVersion,
                template,
                totalTime,
                pages,
                words,
                characters,
                charactersWithSpaces,
                lines,
                paragraphs,
                company,
                docSecurity,
                scaleCrop,
                linksUpToDate,
                sharedDoc,
                hyperlinksChanged
            );
        }
    }
    // CPD-ON
}

View File

@@ -0,0 +1,93 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import java.util.Map;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
import org.jspecify.annotations.Nullable;
/**
* Word document metadata.
*
* Extracted from DOCX files using shared Office Open XML metadata extraction.
* Integrates with {@code office_metadata} module for core/app/custom properties.
*
* <p>Null components are left out of the JSON output ({@code NON_ABSENT} inclusion);
* JSON input is bound through {@link Builder}.
*
* @param coreProperties core properties from docProps/core.xml, or {@code null}
* @param appProperties application properties from docProps/app.xml, or {@code null}
* @param customProperties custom properties from docProps/custom.xml, or {@code null}
*/
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = DocxMetadata.Builder.class)
public record DocxMetadata(
/**
* Core properties from docProps/core.xml (Dublin Core metadata)
*
* Contains title, creator, subject, keywords, dates, etc.
* Shared format across DOCX/PPTX/XLSX documents.
*/
@Nullable @JsonProperty("core_properties") CoreProperties coreProperties,
/**
* Application properties from docProps/app.xml (Word-specific statistics)
*
* Contains word count, page count, paragraph count, editing time, etc.
* DOCX-specific variant of Office application properties.
*/
@Nullable @JsonProperty("app_properties") DocxAppProperties appProperties,
/**
* Custom properties from docProps/custom.xml (user-defined properties)
*
* Contains key-value pairs defined by users or applications.
* Values can be strings, numbers, booleans, or dates.
*/
@Nullable @JsonProperty("custom_properties") Map<String, JsonNode> customProperties
) {
/**
* Creates a builder whose fields all start as {@code null}.
*
* @return a new, empty builder
*/
public static Builder builder() {
return new Builder();
}
// CPD-OFF
/**
* Fluent builder for {@link DocxMetadata}; also the Jackson deserialization entry
* point ({@code with}-prefixed setters, {@code build()} as the build method).
*/
@JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
public static final class Builder {
@JsonProperty("core_properties")
private CoreProperties coreProperties = null;
@JsonProperty("app_properties")
private DocxAppProperties appProperties = null;
@JsonProperty("custom_properties")
private Map<String, JsonNode> customProperties = null;
/** Sets the coreProperties field. */
@JsonProperty("core_properties")
public Builder withCoreProperties(final @Nullable CoreProperties value) {
this.coreProperties = value;
return this;
}
/** Sets the appProperties field. */
@JsonProperty("app_properties")
public Builder withAppProperties(final @Nullable DocxAppProperties value) {
this.appProperties = value;
return this;
}
/**
* Sets the customProperties field.
* The map reference is stored as-is; no defensive copy is made.
*/
@JsonProperty("custom_properties")
public Builder withCustomProperties(final @Nullable Map<String, JsonNode> value) {
this.customProperties = value;
return this;
}
/** Builds the DocxMetadata instance from the builder's current field values. */
public DocxMetadata build() {
return new DocxMetadata(
coreProperties,
appProperties,
customProperties
);
}
}
// CPD-ON
}

93
packages/java/dev/kreuzberg/Element.java generated Normal file
View File

@@ -0,0 +1,93 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
/**
* Semantic element extracted from document.
*
* Represents a logical unit of content with semantic classification,
* unique identifier, and metadata for tracking origin and position.
*
* <p>JSON input is bound through {@link Builder}.
*
* @param elementId unique element identifier
* @param elementType semantic type of this element
* @param text text content of the element
* @param metadata metadata about the element
*/
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = Element.Builder.class)
public record Element(
/**
* Unique element identifier
*/
@JsonProperty("element_id") String elementId,
/**
* Semantic type of this element
*/
@JsonProperty("element_type") ElementType elementType,
/**
* Text content of the element
*/
@JsonProperty("text") String text,
/**
* Metadata about the element
*/
@JsonProperty("metadata") ElementMetadata metadata
) {
/**
* Creates a builder pre-populated with the defaults declared on {@link Builder}.
*
* @return a new builder
*/
public static Builder builder() {
return new Builder();
}
// CPD-OFF
/**
* Fluent builder for {@link Element}; also the Jackson deserialization entry
* point ({@code with}-prefixed setters, {@code build()} as the build method).
*/
@JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
public static final class Builder {
// String fields default to "", while elementType and metadata default to null.
// NOTE(review): the record components are not marked nullable and build() does
// no validation, so an Element with null elementType/metadata can be produced
// when those setters are never called. Confirm this is intended.
@JsonProperty("element_id")
private String elementId = "";
@JsonProperty("element_type")
private ElementType elementType = null;
private String text = "";
private ElementMetadata metadata = null;
/** Sets the elementId field. */
@JsonProperty("element_id")
public Builder withElementId(final String value) {
this.elementId = value;
return this;
}
/** Sets the elementType field. */
@JsonProperty("element_type")
public Builder withElementType(final ElementType value) {
this.elementType = value;
return this;
}
/** Sets the text field. */
@JsonProperty("text")
public Builder withText(final String value) {
this.text = value;
return this;
}
/** Sets the metadata field. */
@JsonProperty("metadata")
public Builder withMetadata(final ElementMetadata value) {
this.metadata = value;
return this;
}
/** Builds the Element instance from the builder's current field values. */
public Element build() {
return new Element(
elementId,
elementType,
text,
metadata
);
}
}
// CPD-ON
}

View File

@@ -0,0 +1,105 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import java.util.Map;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
import org.jspecify.annotations.Nullable;
/**
 * Metadata for a semantic element.
 *
 * <p>Null components are left out of the JSON output ({@code NON_ABSENT} inclusion);
 * JSON input is bound through {@link Builder}.
 *
 * @param pageNumber page number (1-indexed), or {@code null}
 * @param filename source filename or document name, or {@code null}
 * @param coordinates bounding box coordinates if available, or {@code null}
 * @param elementIndex position index in the element sequence, or {@code null}
 * @param additional additional custom metadata
 */
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = ElementMetadata.Builder.class)
public record ElementMetadata(
    /** Page number (1-indexed) */
    @Nullable @JsonProperty("page_number") Integer pageNumber,
    /** Source filename or document name */
    @Nullable @JsonProperty("filename") String filename,
    /** Bounding box coordinates if available */
    @Nullable @JsonProperty("coordinates") BoundingBox coordinates,
    /** Position index in the element sequence */
    @Nullable @JsonProperty("element_index") Long elementIndex,
    /** Additional custom metadata */
    @JsonProperty("additional") Map<String, String> additional
) {
    /**
     * Creates a builder with every optional field {@code null} and an empty
     * {@code additional} map.
     *
     * @return a new builder
     */
    public static Builder builder() {
        return new Builder();
    }

    // CPD-OFF
    /**
     * Fluent builder for {@link ElementMetadata}; also the Jackson deserialization
     * entry point ({@code with}-prefixed setters, {@code build()} as the build method).
     */
    @JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
    public static final class Builder {
        @JsonProperty("page_number")
        private Integer pageNumber = null;
        private String filename = null;
        private BoundingBox coordinates = null;
        @JsonProperty("element_index")
        private Long elementIndex = null;
        private Map<String, String> additional = Map.of();

        /**
         * Sets the pageNumber field.
         *
         * <p>Takes the boxed type so {@code null} round-trips: with a primitive
         * {@code int} parameter the {@code @Nullable} annotation had no effect and
         * Jackson's default null handling mapped an explicit JSON {@code null} to
         * {@code 0} instead of leaving the page number absent.
         *
         * @param value the page number, or {@code null} when unknown
         * @return this builder
         */
        @JsonProperty("page_number")
        public Builder withPageNumber(final @Nullable Integer value) {
            this.pageNumber = value;
            return this;
        }

        /** Sets the filename field. */
        @JsonProperty("filename")
        public Builder withFilename(final @Nullable String value) {
            this.filename = value;
            return this;
        }

        /** Sets the coordinates field. */
        @JsonProperty("coordinates")
        public Builder withCoordinates(final @Nullable BoundingBox value) {
            this.coordinates = value;
            return this;
        }

        /**
         * Sets the elementIndex field.
         *
         * <p>Takes the boxed type for the same reason as
         * {@link #withPageNumber(Integer)}: a primitive {@code long} cannot represent
         * "absent" and would turn a JSON {@code null} into {@code 0}.
         *
         * @param value the element index, or {@code null} when unknown
         * @return this builder
         */
        @JsonProperty("element_index")
        public Builder withElementIndex(final @Nullable Long value) {
            this.elementIndex = value;
            return this;
        }

        /** Sets the additional field. The map reference is stored as-is. */
        @JsonProperty("additional")
        public Builder withAdditional(final Map<String, String> value) {
            this.additional = value;
            return this;
        }

        /** Builds the ElementMetadata instance from the builder's current field values. */
        public ElementMetadata build() {
            return new ElementMetadata(
                pageNumber,
                filename,
                coordinates,
                elementIndex,
                additional
            );
        }
    }
    // CPD-ON
}

View File

@@ -0,0 +1,70 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonValue;
/**
 * Semantic element type classification.
 *
 * <p>Each constant carries the snake_case string used on the wire; that string is
 * what Jackson writes ({@link #getValue()}) and what {@link #fromValue(String)}
 * accepts, ignoring case.
 */
public enum ElementType {
    /** Document title */
    Title("title"),
    /** Main narrative text body */
    NarrativeText("narrative_text"),
    /** Section heading */
    Heading("heading"),
    /** List item (bullet, numbered, etc.) */
    ListItem("list_item"),
    /** Table element */
    Table("table"),
    /** Image element */
    Image("image"),
    /** Page break marker */
    PageBreak("page_break"),
    /** Code block */
    CodeBlock("code_block"),
    /** Block quote */
    BlockQuote("block_quote"),
    /** Footer text */
    Footer("footer"),
    /** Header text */
    Header("header");

    /** Wire-format string for this constant. */
    private final String value;

    ElementType(final String value) {
        this.value = value;
    }

    /**
     * Returns the wire-format string; used by Jackson as the serialized form.
     *
     * @return the string value of this constant
     */
    @JsonValue
    public String getValue() {
        return this.value;
    }

    /**
     * Resolves a constant from its wire-format string, ignoring case.
     *
     * @param value the string to look up
     * @return the first constant, in declaration order, whose string matches
     * @throws IllegalArgumentException if no constant matches (including {@code null})
     */
    @JsonCreator
    public static ElementType fromValue(final String value) {
        return java.util.Arrays.stream(ElementType.values())
            .filter(candidate -> candidate.value.equalsIgnoreCase(value))
            .findFirst()
            .orElseThrow(() -> new IllegalArgumentException("Unknown value: " + value));
    }

    /** Returns the wire-format string value (matches JSON serialization). */
    @Override
    public String toString() {
        return this.value;
    }
}

View File

@@ -0,0 +1,120 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
import org.jspecify.annotations.Nullable;
/**
 * Email attachment representation.
 *
 * Contains metadata and optionally the content of an email attachment.
 *
 * <p>Null components are left out of the JSON output ({@code NON_ABSENT} inclusion);
 * JSON input is bound through {@link Builder}. Because {@code data} is an array, the
 * record's generated {@code equals}/{@code hashCode} compare it by reference, not by
 * content.
 *
 * @param name attachment name (from Content-Disposition header), or {@code null}
 * @param filename filename of the attachment, or {@code null}
 * @param mimeType MIME type of the attachment, or {@code null}
 * @param size size in bytes, or {@code null}
 * @param isImage whether this attachment is an image
 * @param data attachment data if extracted, or {@code null}
 */
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = EmailAttachment.Builder.class)
public record EmailAttachment(
    /** Attachment name (from Content-Disposition header) */
    @Nullable @JsonProperty("name") String name,
    /** Filename of the attachment */
    @Nullable @JsonProperty("filename") String filename,
    /** MIME type of the attachment */
    @Nullable @JsonProperty("mime_type") String mimeType,
    /** Size in bytes */
    @Nullable @JsonProperty("size") Long size,
    /** Whether this attachment is an image */
    @JsonProperty("is_image") boolean isImage,
    /**
     * Attachment data (if extracted).
     * Uses {@code bytes.Bytes} for cheap cloning of large buffers.
     */
    // The type-use annotation sits on the array itself; "@Nullable byte[]" would
    // annotate the primitive element type instead.
    @JsonProperty("data") byte @Nullable [] data
) {
    /**
     * Creates a builder with every optional field {@code null} and
     * {@code isImage} set to {@code false}.
     *
     * @return a new builder
     */
    public static Builder builder() {
        return new Builder();
    }

    // CPD-OFF
    /**
     * Fluent builder for {@link EmailAttachment}; also the Jackson deserialization
     * entry point ({@code with}-prefixed setters, {@code build()} as the build method).
     */
    @JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
    public static final class Builder {
        private String name = null;
        private String filename = null;
        @JsonProperty("mime_type")
        private String mimeType = null;
        private Long size = null;
        @JsonProperty("is_image")
        private boolean isImage = false;
        private byte[] data = null;

        /** Sets the name field. */
        @JsonProperty("name")
        public Builder withName(final @Nullable String value) {
            this.name = value;
            return this;
        }

        /** Sets the filename field. */
        @JsonProperty("filename")
        public Builder withFilename(final @Nullable String value) {
            this.filename = value;
            return this;
        }

        /** Sets the mimeType field. */
        @JsonProperty("mime_type")
        public Builder withMimeType(final @Nullable String value) {
            this.mimeType = value;
            return this;
        }

        /**
         * Sets the size field.
         *
         * <p>Takes the boxed type so {@code null} round-trips: with a primitive
         * {@code long} parameter the {@code @Nullable} annotation had no effect and
         * Jackson's default null handling mapped an explicit JSON {@code null} to
         * {@code 0} instead of leaving the size absent.
         *
         * @param value the size in bytes, or {@code null} when unknown
         * @return this builder
         */
        @JsonProperty("size")
        public Builder withSize(final @Nullable Long value) {
            this.size = value;
            return this;
        }

        /** Sets the isImage field. */
        @JsonProperty("is_image")
        public Builder withIsImage(final boolean value) {
            this.isImage = value;
            return this;
        }

        /** Sets the data field. The array reference is stored as-is (no copy). */
        @JsonProperty("data")
        public Builder withData(final byte @Nullable [] value) {
            this.data = value;
            return this;
        }

        /** Builds the EmailAttachment instance from the builder's current field values. */
        public EmailAttachment build() {
            return new EmailAttachment(
                name,
                filename,
                mimeType,
                size,
                isImage,
                data
            );
        }
    }
    // CPD-ON
}

View File

@@ -0,0 +1,46 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
import org.jspecify.annotations.Nullable;
/**
 * Configuration for email extraction.
 *
 * <p>A {@code null} component is left out of the JSON output ({@code NON_ABSENT}
 * inclusion); JSON input is bound through {@link Builder}.
 *
 * @param msgFallbackCodepage value of the {@code msg_fallback_codepage} setting, or
 *     {@code null} when not configured
 */
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = EmailConfig.Builder.class)
public record EmailConfig(@Nullable @JsonProperty("msg_fallback_codepage") Integer msgFallbackCodepage) {
    /**
     * Creates a builder whose codepage starts as {@code null}.
     *
     * @return a new builder
     */
    public static Builder builder() {
        return new Builder();
    }

    // CPD-OFF
    /**
     * Fluent builder for {@link EmailConfig}; also the Jackson deserialization entry
     * point ({@code with}-prefixed setters, {@code build()} as the build method).
     */
    @JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
    public static final class Builder {
        @JsonProperty("msg_fallback_codepage")
        private Integer msgFallbackCodepage = null;

        /**
         * Sets the msgFallbackCodepage field.
         *
         * <p>Takes the boxed type so {@code null} round-trips: with a primitive
         * {@code int} parameter the {@code @Nullable} annotation had no effect and
         * Jackson's default null handling mapped an explicit JSON {@code null} to
         * {@code 0} instead of leaving the setting absent.
         *
         * @param value the codepage, or {@code null} to leave it unset
         * @return this builder
         */
        @JsonProperty("msg_fallback_codepage")
        public Builder withMsgFallbackCodepage(final @Nullable Integer value) {
            this.msgFallbackCodepage = value;
            return this;
        }

        /** Builds the EmailConfig instance from the builder's current field values. */
        public EmailConfig build() {
            return new EmailConfig(
                msgFallbackCodepage
            );
        }
    }
    // CPD-ON
}

View File

@@ -0,0 +1,205 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import java.util.List;
import java.util.Map;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
import org.jspecify.annotations.Nullable;
/**
* Email extraction result.
*
* Complete representation of an extracted email message (.eml or .msg)
* including headers, body content, and attachments.
*
* <p>Null components are left out of the JSON output ({@code NON_ABSENT} inclusion);
* JSON input is bound through {@link Builder}.
*
* @param subject email subject line, or {@code null}
* @param fromEmail sender email address, or {@code null}
* @param toEmails primary recipient email addresses
* @param ccEmails CC recipient email addresses
* @param bccEmails BCC recipient email addresses
* @param date email date/timestamp, or {@code null}
* @param messageId Message-ID header value, or {@code null}
* @param plainText plain text version of the email body, or {@code null}
* @param htmlContent HTML version of the email body, or {@code null}
* @param content cleaned/processed text content
* @param attachments list of email attachments
* @param metadata additional email headers and metadata
*/
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = EmailExtractionResult.Builder.class)
public record EmailExtractionResult(
/**
* Email subject line
*/
@Nullable @JsonProperty("subject") String subject,
/**
* Sender email address
*/
@Nullable @JsonProperty("from_email") String fromEmail,
/**
* Primary recipient email addresses
*/
@JsonProperty("to_emails") List<String> toEmails,
/**
* CC recipient email addresses
*/
@JsonProperty("cc_emails") List<String> ccEmails,
/**
* BCC recipient email addresses
*/
@JsonProperty("bcc_emails") List<String> bccEmails,
/**
* Email date/timestamp
*/
@Nullable @JsonProperty("date") String date,
/**
* Message-ID header value
*/
@Nullable @JsonProperty("message_id") String messageId,
/**
* Plain text version of the email body
*/
@Nullable @JsonProperty("plain_text") String plainText,
/**
* HTML version of the email body
*/
@Nullable @JsonProperty("html_content") String htmlContent,
/**
* Cleaned/processed text content. Aliased as {@code cleaned_text} for back-compat.
*
* NOTE(review): no {@code @JsonAlias("cleaned_text")} is declared in this class, so
* as written only the {@code content} key is bound here; confirm whether the alias
* is meant to apply to the Java binding.
*/
@JsonProperty("content") String content,
/**
* List of email attachments
*/
@JsonProperty("attachments") List<EmailAttachment> attachments,
/**
* Additional email headers and metadata
*/
@JsonProperty("metadata") Map<String, String> metadata
) {
/**
* Creates a builder with nullable fields set to {@code null}, {@code content} set
* to the empty string, and all collections empty.
*
* @return a new builder
*/
public static Builder builder() {
return new Builder();
}
// CPD-OFF
/**
* Fluent builder for {@link EmailExtractionResult}; also the Jackson deserialization
* entry point ({@code with}-prefixed setters, {@code build()} as the build method).
* Setters store the given references as-is; no defensive copies are made.
*/
@JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
public static final class Builder {
private String subject = null;
@JsonProperty("from_email")
private String fromEmail = null;
@JsonProperty("to_emails")
private List<String> toEmails = List.of();
@JsonProperty("cc_emails")
private List<String> ccEmails = List.of();
@JsonProperty("bcc_emails")
private List<String> bccEmails = List.of();
private String date = null;
@JsonProperty("message_id")
private String messageId = null;
@JsonProperty("plain_text")
private String plainText = null;
@JsonProperty("html_content")
private String htmlContent = null;
private String content = "";
private List<EmailAttachment> attachments = List.of();
private Map<String, String> metadata = Map.of();
/** Sets the subject field. */
@JsonProperty("subject")
public Builder withSubject(final @Nullable String value) {
this.subject = value;
return this;
}
/** Sets the fromEmail field. */
@JsonProperty("from_email")
public Builder withFromEmail(final @Nullable String value) {
this.fromEmail = value;
return this;
}
/** Sets the toEmails field. */
@JsonProperty("to_emails")
public Builder withToEmails(final List<String> value) {
this.toEmails = value;
return this;
}
/** Sets the ccEmails field. */
@JsonProperty("cc_emails")
public Builder withCcEmails(final List<String> value) {
this.ccEmails = value;
return this;
}
/** Sets the bccEmails field. */
@JsonProperty("bcc_emails")
public Builder withBccEmails(final List<String> value) {
this.bccEmails = value;
return this;
}
/** Sets the date field. */
@JsonProperty("date")
public Builder withDate(final @Nullable String value) {
this.date = value;
return this;
}
/** Sets the messageId field. */
@JsonProperty("message_id")
public Builder withMessageId(final @Nullable String value) {
this.messageId = value;
return this;
}
/** Sets the plainText field. */
@JsonProperty("plain_text")
public Builder withPlainText(final @Nullable String value) {
this.plainText = value;
return this;
}
/** Sets the htmlContent field. */
@JsonProperty("html_content")
public Builder withHtmlContent(final @Nullable String value) {
this.htmlContent = value;
return this;
}
/** Sets the content field. */
@JsonProperty("content")
public Builder withContent(final String value) {
this.content = value;
return this;
}
/** Sets the attachments field. */
@JsonProperty("attachments")
public Builder withAttachments(final List<EmailAttachment> value) {
this.attachments = value;
return this;
}
/** Sets the metadata field. */
@JsonProperty("metadata")
public Builder withMetadata(final Map<String, String> value) {
this.metadata = value;
return this;
}
/** Builds the EmailExtractionResult instance from the builder's current field values. */
public EmailExtractionResult build() {
return new EmailExtractionResult(
subject,
fromEmail,
toEmails,
ccEmails,
bccEmails,
date,
messageId,
plainText,
htmlContent,
content,
attachments,
metadata
);
}
}
// CPD-ON
}

View File

@@ -0,0 +1,137 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import java.util.List;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
import org.jspecify.annotations.Nullable;
/**
* Email metadata extracted from .eml and .msg files.
*
* Includes sender/recipient information, message ID, and attachment list.
*
* <p>Null components are left out of the JSON output ({@code NON_ABSENT} inclusion);
* JSON input is bound through {@link Builder}.
*
* @param fromEmail sender's email address, or {@code null}
* @param fromName sender's display name, or {@code null}
* @param toEmails primary recipients
* @param ccEmails CC recipients
* @param bccEmails BCC recipients
* @param messageId Message-ID header value, or {@code null}
* @param attachments list of attachment filenames
*/
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = EmailMetadata.Builder.class)
public record EmailMetadata(
/**
* Sender's email address
*/
@Nullable @JsonProperty("from_email") String fromEmail,
/**
* Sender's display name
*/
@Nullable @JsonProperty("from_name") String fromName,
/**
* Primary recipients
*/
@JsonProperty("to_emails") List<String> toEmails,
/**
* CC recipients
*/
@JsonProperty("cc_emails") List<String> ccEmails,
/**
* BCC recipients
*/
@JsonProperty("bcc_emails") List<String> bccEmails,
/**
* Message-ID header value
*/
@Nullable @JsonProperty("message_id") String messageId,
/**
* List of attachment filenames
*/
@JsonProperty("attachments") List<String> attachments
) {
/**
* Creates a builder with nullable fields set to {@code null} and all lists empty.
*
* @return a new builder
*/
public static Builder builder() {
return new Builder();
}
// CPD-OFF
/**
* Fluent builder for {@link EmailMetadata}; also the Jackson deserialization entry
* point ({@code with}-prefixed setters, {@code build()} as the build method).
* Setters store the given list references as-is; no defensive copies are made.
*/
@JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
public static final class Builder {
@JsonProperty("from_email")
private String fromEmail = null;
@JsonProperty("from_name")
private String fromName = null;
@JsonProperty("to_emails")
private List<String> toEmails = List.of();
@JsonProperty("cc_emails")
private List<String> ccEmails = List.of();
@JsonProperty("bcc_emails")
private List<String> bccEmails = List.of();
@JsonProperty("message_id")
private String messageId = null;
private List<String> attachments = List.of();
/** Sets the fromEmail field. */
@JsonProperty("from_email")
public Builder withFromEmail(final @Nullable String value) {
this.fromEmail = value;
return this;
}
/** Sets the fromName field. */
@JsonProperty("from_name")
public Builder withFromName(final @Nullable String value) {
this.fromName = value;
return this;
}
/** Sets the toEmails field. */
@JsonProperty("to_emails")
public Builder withToEmails(final List<String> value) {
this.toEmails = value;
return this;
}
/** Sets the ccEmails field. */
@JsonProperty("cc_emails")
public Builder withCcEmails(final List<String> value) {
this.ccEmails = value;
return this;
}
/** Sets the bccEmails field. */
@JsonProperty("bcc_emails")
public Builder withBccEmails(final List<String> value) {
this.bccEmails = value;
return this;
}
/** Sets the messageId field. */
@JsonProperty("message_id")
public Builder withMessageId(final @Nullable String value) {
this.messageId = value;
return this;
}
/** Sets the attachments field. */
@JsonProperty("attachments")
public Builder withAttachments(final List<String> value) {
this.attachments = value;
return this;
}
/** Builds the EmailMetadata instance from the builder's current field values. */
public EmailMetadata build() {
return new EmailMetadata(
fromEmail,
fromName,
toEmails,
ccEmails,
bccEmails,
messageId,
attachments
);
}
}
// CPD-ON
}

View File

@@ -0,0 +1,78 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import java.util.List;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
/**
* Changes to embedded archive children between two results.
*
* <p>JSON input is bound through {@link Builder}.
*
* @param added children present in {@code b} but not in {@code a}
* @param removed children present in {@code a} but not in {@code b}
* @param changed children present in both but with differing content
*/
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = EmbeddedChanges.Builder.class)
public record EmbeddedChanges(
/**
* Children present in {@code b} but not in {@code a} (matched by {@code path}).
*/
@JsonProperty("added") List<ArchiveEntry> added,
/**
* Children present in {@code a} but not in {@code b} (matched by {@code path}).
*/
@JsonProperty("removed") List<ArchiveEntry> removed,
/**
* Children present in both but with differing content (matched by {@code path}).
*
* Each entry holds the diff of the nested {@code ExtractionResult}.
*/
@JsonProperty("changed") List<EmbeddedDiff> changed
) {
/**
* Creates a builder whose three lists start empty.
*
* @return a new builder
*/
public static Builder builder() {
return new Builder();
}
// CPD-OFF
/**
* Fluent builder for {@link EmbeddedChanges}; also the Jackson deserialization
* entry point ({@code with}-prefixed setters, {@code build()} as the build method).
* Setters store the given list references as-is; no defensive copies are made.
*/
@JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
public static final class Builder {
private List<ArchiveEntry> added = List.of();
private List<ArchiveEntry> removed = List.of();
private List<EmbeddedDiff> changed = List.of();
/** Sets the added field. */
@JsonProperty("added")
public Builder withAdded(final List<ArchiveEntry> value) {
this.added = value;
return this;
}
/** Sets the removed field. */
@JsonProperty("removed")
public Builder withRemoved(final List<ArchiveEntry> value) {
this.removed = value;
return this;
}
/** Sets the changed field. */
@JsonProperty("changed")
public Builder withChanged(final List<EmbeddedDiff> value) {
this.changed = value;
return this;
}
/** Builds the EmbeddedChanges instance from the builder's current field values. */
public EmbeddedChanges build() {
return new EmbeddedChanges(
added,
removed,
changed
);
}
}
// CPD-ON
}

View File

@@ -0,0 +1,62 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
/**
* Diff for a single embedded archive entry that appears in both results.
*
* <p>JSON input is bound through {@link Builder}.
*
* @param path archive-relative path identifying this entry
* @param diff the recursive diff of the entry's extraction result
*/
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = EmbeddedDiff.Builder.class)
public record EmbeddedDiff(
/**
* Archive-relative path identifying this entry.
*/
@JsonProperty("path") String path,
/**
* The recursive diff of the entry's extraction result.
*/
@JsonProperty("diff") ExtractionDiff diff
) {
/**
* Creates a builder with an empty {@code path} and a {@code null} {@code diff}.
*
* @return a new builder
*/
public static Builder builder() {
return new Builder();
}
// CPD-OFF
/**
* Fluent builder for {@link EmbeddedDiff}; also the Jackson deserialization entry
* point ({@code with}-prefixed setters, {@code build()} as the build method).
*/
@JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
public static final class Builder {
private String path = "";
// NOTE(review): diff defaults to null although the record component is not marked
// nullable and build() does no validation; confirm this is intended.
private ExtractionDiff diff = null;
/** Sets the path field. */
@JsonProperty("path")
public Builder withPath(final String value) {
this.path = value;
return this;
}
/** Sets the diff field. */
@JsonProperty("diff")
public Builder withDiff(final ExtractionDiff value) {
this.diff = value;
return this;
}
/** Builds the EmbeddedDiff instance from the builder's current field values. */
public EmbeddedDiff build() {
return new EmbeddedDiff(
path,
diff
);
}
}
// CPD-ON
}

View File

@@ -0,0 +1,95 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonSerialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
import org.jspecify.annotations.Nullable;
/**
* Embedded file descriptor extracted from the PDF name tree.
*
* <p>{@code data} is written to JSON through {@code ByteArrayToIntArraySerializer}
* rather than Jackson's default byte-array encoding. Because {@code data} is an
* array, the record's generated {@code equals}/{@code hashCode} compare it by
* reference, not by content.
*
* @param name the filename as stored in the PDF name tree
* @param data raw file bytes from the embedded stream
* @param compressedSize compressed byte count of the original stream
* @param mimeType MIME type if specified in the filespec, otherwise {@code null}
*/
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = EmbeddedFile.Builder.class)
public record EmbeddedFile(
/**
* The filename as stored in the PDF name tree.
*/
@JsonProperty("name") String name,
/**
* Raw file bytes from the embedded stream (already decompressed by lopdf).
*/
@JsonSerialize(using = ByteArrayToIntArraySerializer.class) @JsonProperty("data") byte[] data,
/**
* Compressed byte count of the original stream (before decompression).
*
* Used by callers to compute the decompression ratio and detect zip-bomb-style
* attacks that embed a tiny compressed stream expanding to gigabytes of data.
*/
@JsonProperty("compressed_size") long compressedSize,
/**
* MIME type if specified in the filespec, otherwise {@code null}.
*/
@Nullable @JsonProperty("mime_type") String mimeType
) {
/**
* Creates a builder with an empty name, an empty byte array, a compressed size of
* zero and a {@code null} MIME type.
*
* @return a new builder
*/
public static Builder builder() {
return new Builder();
}
// CPD-OFF
/**
* Fluent builder for {@link EmbeddedFile}; also the Jackson deserialization entry
* point ({@code with}-prefixed setters, {@code build()} as the build method).
*/
@JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
public static final class Builder {
private String name = "";
private byte[] data = new byte[0];
@JsonProperty("compressed_size")
private long compressedSize = 0;
@JsonProperty("mime_type")
private String mimeType = null;
/** Sets the name field. */
@JsonProperty("name")
public Builder withName(final String value) {
this.name = value;
return this;
}
/** Sets the data field. The array reference is stored as-is (no copy). */
@JsonProperty("data")
public Builder withData(final byte[] value) {
this.data = value;
return this;
}
/** Sets the compressedSize field. */
@JsonProperty("compressed_size")
public Builder withCompressedSize(final long value) {
this.compressedSize = value;
return this;
}
/** Sets the mimeType field. */
@JsonProperty("mime_type")
public Builder withMimeType(final @Nullable String value) {
this.mimeType = value;
return this;
}
/** Builds the EmbeddedFile instance from the builder's current field values. */
public EmbeddedFile build() {
return new EmbeddedFile(
name,
data,
compressedSize,
mimeType
);
}
}
// CPD-ON
}

View File

@@ -0,0 +1,273 @@
package dev.kreuzberg;
import java.lang.foreign.Arena;
import java.lang.foreign.FunctionDescriptor;
import java.lang.foreign.Linker;
import java.lang.foreign.MemoryLayout;
import java.lang.foreign.MemorySegment;
import java.lang.foreign.ValueLayout;
import java.lang.invoke.MethodHandles;
import java.lang.invoke.MethodType;
import java.util.List;
import java.util.concurrent.ConcurrentHashMap;
import com.fasterxml.jackson.databind.ObjectMapper;
/**
* Allocates Panama FFM upcall stubs for an IEmbeddingBackend implementation,
* assembles the C vtable in native memory, and provides static
* registerEmbeddingBackend/unregisterEmbeddingBackend helpers.
*/
public final class EmbeddingBackendBridge implements AutoCloseable {
/** Native linker used by the constructor to create the upcall stubs. */
private static final Linker LINKER = Linker.nativeLinker();
/** Lookup used by the constructor to bind this class's handler methods by name. */
private static final MethodHandles.Lookup LOOKUP = MethodHandles.lookup();
/** Shared JSON mapper; its call sites are outside this view. */
private static final ObjectMapper JSON = new ObjectMapper();
/** Live registry — keeps Arenas and upcall stubs alive past the register call. */
private static final ConcurrentHashMap<String, EmbeddingBackendBridge>
EMBEDDING_BACKEND_BRIDGES = new ConcurrentHashMap<>();
// C vtable: 8 fields (4 plugin methods + 2 trait methods + free_string + free_user_data)
// Each field is one ADDRESS slot; the constructor fills them in this order.
private static final MemoryLayout VTABLE_LAYOUT = MemoryLayout.structLayout(ValueLayout.ADDRESS, ValueLayout.ADDRESS, ValueLayout.ADDRESS, ValueLayout.ADDRESS, ValueLayout.ADDRESS, ValueLayout.ADDRESS, ValueLayout.ADDRESS, ValueLayout.ADDRESS);
/** Size in bytes of {@link #VTABLE_LAYOUT}. */
private static final long VTABLE_SIZE = VTABLE_LAYOUT.byteSize();
/** Shared arena that the vtable segment and every upcall stub are allocated in. */
private final Arena arena;
/** Native memory holding the eight function-pointer slots. */
private final MemorySegment vtable;
/** The Java backend wrapped by this bridge; presumably the target of the handle* upcalls. */
private final IEmbeddingBackend impl;
/**
 * Creates the bridge for {@code impl}: allocates the native vtable in a new shared
 * arena and fills its eight slots with upcall stubs bound to this instance.
 *
 * <p>Slots are written in this fixed order: name, version, initialize, shutdown,
 * dimensions, embed, free_string, free_user_data. If any stub cannot be created
 * the arena is closed before the exception propagates, so no native memory is left
 * behind by a failed construction.
 *
 * @param impl the backend implementation this bridge wraps
 * @throws RuntimeException if a handler method cannot be resolved reflectively, or
 *     if the linker rejects a stub (the linker's own unchecked exception is rethrown
 *     unchanged)
 */
EmbeddingBackendBridge(final IEmbeddingBackend impl) {
    this.impl = impl;
    this.arena = Arena.ofShared();
    // Allocate from the layout rather than a raw byte count so the segment is
    // guaranteed to satisfy the alignment of the ADDRESS writes below.
    this.vtable = arena.allocate(VTABLE_LAYOUT);
    try {
        // Handler shapes shared by several slots: int-returning with three or two
        // address arguments, and void-returning with a single address argument.
        final MethodType intOfThree = MethodType.methodType(
            int.class, MemorySegment.class, MemorySegment.class, MemorySegment.class);
        final FunctionDescriptor intOfThreeDesc = FunctionDescriptor.of(
            ValueLayout.JAVA_INT, ValueLayout.ADDRESS, ValueLayout.ADDRESS, ValueLayout.ADDRESS);
        final MethodType intOfTwo = MethodType.methodType(
            int.class, MemorySegment.class, MemorySegment.class);
        final FunctionDescriptor intOfTwoDesc = FunctionDescriptor.of(
            ValueLayout.JAVA_INT, ValueLayout.ADDRESS, ValueLayout.ADDRESS);
        final MethodType intOfFour = MethodType.methodType(
            int.class, MemorySegment.class, MemorySegment.class, MemorySegment.class, MemorySegment.class);
        final FunctionDescriptor intOfFourDesc = FunctionDescriptor.of(
            ValueLayout.JAVA_INT,
            ValueLayout.ADDRESS,
            ValueLayout.ADDRESS,
            ValueLayout.ADDRESS,
            ValueLayout.ADDRESS);
        final MethodType voidOfOne = MethodType.methodType(void.class, MemorySegment.class);
        final FunctionDescriptor voidOfOneDesc = FunctionDescriptor.ofVoid(ValueLayout.ADDRESS);

        long offset = 0L;
        offset = installStub(offset, "handleName", intOfThree, intOfThreeDesc);
        offset = installStub(offset, "handleVersion", intOfThree, intOfThreeDesc);
        offset = installStub(offset, "handleInitialize", intOfTwo, intOfTwoDesc);
        offset = installStub(offset, "handleShutdown", intOfTwo, intOfTwoDesc);
        offset = installStub(offset, "handleDimensions", intOfThree, intOfThreeDesc);
        offset = installStub(offset, "handleEmbed", intOfFour, intOfFourDesc);
        offset = installStub(offset, "freeString", voidOfOne, voidOfOneDesc);
        installStub(offset, "freeUserData", voidOfOne, voidOfOneDesc);
    } catch (ReflectiveOperationException e) {
        arena.close();
        throw new RuntimeException("Failed to create trait bridge stubs", e);
    } catch (RuntimeException e) {
        // The linker reports bad handles/descriptors with unchecked exceptions;
        // release the arena for those too, then rethrow the original unchanged.
        arena.close();
        throw e;
    }
}

/**
 * Binds the named handler of this instance, wraps it in an upcall stub owned by
 * {@link #arena}, and stores the stub's address in the vtable at {@code offset}.
 *
 * @param offset byte offset of the vtable slot to fill
 * @param handler name of the instance method to bind
 * @param type Java signature of {@code handler}
 * @param descriptor native signature the stub exposes
 * @return the byte offset of the next vtable slot
 * @throws ReflectiveOperationException if {@code handler} cannot be found or accessed
 */
private long installStub(
        final long offset,
        final String handler,
        final MethodType type,
        final FunctionDescriptor descriptor) throws ReflectiveOperationException {
    final MemorySegment stub = LINKER.upcallStub(LOOKUP.bind(this, handler, type), descriptor, arena);
    vtable.set(ValueLayout.ADDRESS, offset, stub);
    return offset + ValueLayout.ADDRESS.byteSize();
}
/** Returns the native vtable segment that holds this bridge's upcall function pointers. */
MemorySegment vtableSegment() {
    return this.vtable;
}
/**
 * Upcall for the vtable's name slot: stores a pointer to {@code impl.name()} (as a
 * NUL-terminated string allocated in this bridge's arena) into {@code outName}.
 *
 * @param userData opaque pointer supplied by the caller; not used
 * @param outName pointer to the slot that receives the string pointer
 * @param outError pointer to the slot that receives an error message on failure
 * @return {@code 0} on success, {@code 1} on any failure
 */
private int handleName(MemorySegment userData, MemorySegment outName, MemorySegment outError) {
    try {
        if (outName.equals(MemorySegment.NULL)) {
            throw new IllegalArgumentException("name callback received a null output pointer");
        }
        // Pointers passed into an upcall arrive as zero-length segments, so a direct set() would
        // fail its bounds check. Widen to exactly one pointer slot before writing.
        outName.reinterpret(ValueLayout.ADDRESS.byteSize())
                .set(ValueLayout.ADDRESS, 0, arena.allocateFrom(impl.name()));
        return 0;
    } catch (Throwable e) {
        // Report the cause the same way handleDimensions/handleEmbed do instead of dropping it.
        writeError(outError, e);
        return 1;
    }
}
/**
 * Upcall for the vtable's version slot: stores a pointer to {@code impl.version()} (as a
 * NUL-terminated string allocated in this bridge's arena) into {@code outVersion}.
 *
 * @param userData opaque pointer supplied by the caller; not used
 * @param outVersion pointer to the slot that receives the string pointer
 * @param outError pointer to the slot that receives an error message on failure
 * @return {@code 0} on success, {@code 1} on any failure
 */
private int handleVersion(MemorySegment userData, MemorySegment outVersion, MemorySegment outError) {
    try {
        if (outVersion.equals(MemorySegment.NULL)) {
            throw new IllegalArgumentException("version callback received a null output pointer");
        }
        // Pointers passed into an upcall arrive as zero-length segments, so a direct set() would
        // fail its bounds check. Widen to exactly one pointer slot before writing.
        outVersion.reinterpret(ValueLayout.ADDRESS.byteSize())
                .set(ValueLayout.ADDRESS, 0, arena.allocateFrom(impl.version()));
        return 0;
    } catch (Throwable e) {
        // Report the cause the same way handleDimensions/handleEmbed do instead of dropping it.
        writeError(outError, e);
        return 1;
    }
}
/**
 * Upcall for the vtable's initialize slot: runs {@code impl.initialize()}.
 *
 * @param userData opaque pointer supplied by the caller; not used
 * @param outError pointer to the slot that receives an error message on failure
 * @return {@code 0} on success, {@code 1} if the backend threw
 */
private int handleInitialize(MemorySegment userData, MemorySegment outError) {
    try {
        impl.initialize();
        return 0;
    } catch (Throwable e) {
        // Nothing may propagate out of an upcall; hand the cause to the caller through outError.
        writeError(outError, e);
        return 1;
    }
}
/**
 * Upcall for the vtable's shutdown slot: runs {@code impl.shutdown()}.
 *
 * @param userData opaque pointer supplied by the caller; not used
 * @param outError pointer to the slot that receives an error message on failure
 * @return {@code 0} on success, {@code 1} if the backend threw
 */
private int handleShutdown(MemorySegment userData, MemorySegment outError) {
    try {
        impl.shutdown();
        return 0;
    } catch (Throwable e) {
        // Nothing may propagate out of an upcall; hand the cause to the caller through outError.
        writeError(outError, e);
        return 1;
    }
}
/**
 * Upcall for the vtable's dimensions slot: serializes {@code impl.dimensions()} to JSON and
 * stores a pointer to that NUL-terminated text (allocated in this bridge's arena) into
 * {@code outResult}.
 *
 * @param userData opaque pointer supplied by the caller; not used
 * @param outResult pointer to the slot that receives the JSON string pointer
 * @param outError pointer to the slot that receives an error message on failure
 * @return {@code 0} on success, {@code 1} on any failure
 */
private int handleDimensions(MemorySegment userData, MemorySegment outResult, MemorySegment outError) {
    try {
        if (outResult.equals(MemorySegment.NULL)) {
            throw new IllegalArgumentException("dimensions callback received a null output pointer");
        }
        long result = impl.dimensions();
        String json = JSON.writeValueAsString(result);
        MemorySegment jsonCs = arena.allocateFrom(json);
        // Pointers passed into an upcall arrive as zero-length segments, so a direct set() would
        // fail its bounds check. Widen to exactly one pointer slot before writing.
        outResult.reinterpret(ValueLayout.ADDRESS.byteSize()).set(ValueLayout.ADDRESS, 0, jsonCs);
        return 0;
    } catch (Throwable e) {
        writeError(outError, e);
        return 1;
    }
}
/**
 * Upcall for the vtable's embed slot.
 *
 * <p>Reads {@code texts_in} as a NUL-terminated JSON array of strings, passes the decoded list
 * to {@code impl.embed}, serializes the returned vectors to JSON, and stores a pointer to that
 * text (allocated in this bridge's arena) into {@code outResult}.
 *
 * @param userData opaque pointer supplied by the caller; not used
 * @param texts_in pointer to the NUL-terminated JSON input
 * @param outResult pointer to the slot that receives the JSON string pointer
 * @param outError pointer to the slot that receives an error message on failure
 * @return {@code 0} on success, {@code 1} on any failure
 */
private int handleEmbed(MemorySegment userData, MemorySegment texts_in, MemorySegment outResult, MemorySegment outError) {
    try {
        // Reinterpreting a null pointer and reading or writing through it would crash the JVM,
        // so reject it up front and report it like any other failure.
        if (texts_in.equals(MemorySegment.NULL) || outResult.equals(MemorySegment.NULL)) {
            throw new IllegalArgumentException("embed callback received a null pointer");
        }
        String textsJson = texts_in.reinterpret(Long.MAX_VALUE).getString(0);
        List<String> texts = JSON.readValue(textsJson, new com.fasterxml.jackson.core.type.TypeReference<List<String>>() { });
        List<List<Float>> result = impl.embed(texts);
        String json = JSON.writeValueAsString(result);
        MemorySegment jsonCs = arena.allocateFrom(json);
        // Pointers passed into an upcall arrive as zero-length segments, so a direct set() would
        // fail its bounds check. Widen to exactly one pointer slot before writing.
        outResult.reinterpret(ValueLayout.ADDRESS.byteSize()).set(ValueLayout.ADDRESS, 0, jsonCs);
        return 0;
    } catch (Throwable e) {
        writeError(outError, e);
        return 1;
    }
}
/**
 * Best-effort error reporting for upcall handlers: stores a pointer to
 * {@code "<SimpleClassName>: <message>"} (allocated in this bridge's arena) into {@code outError}.
 *
 * <p>Never throws. A null {@code outError} is ignored, and any failure while writing is
 * swallowed, because the calling handler is already returning a failure code.
 *
 * @param outError pointer to the slot that receives the message pointer; may be null
 * @param e the failure to describe
 */
private void writeError(MemorySegment outError, Throwable e) {
    try {
        if (outError.equals(MemorySegment.NULL)) {
            return;
        }
        final String message = e.getClass().getSimpleName() + ": " + e.getMessage();
        // Pointers passed into an upcall arrive as zero-length segments, so a direct set() would
        // fail its bounds check. Widen to exactly one pointer slot before writing.
        outError.reinterpret(ValueLayout.ADDRESS.byteSize())
                .set(ValueLayout.ADDRESS, 0, arena.allocateFrom(message));
    } catch (Throwable ignored) {
        /* swallow: reporting the error must not itself escape the upcall */
    }
}
// Upcall bound to the vtable's free_string slot. Intentionally does nothing with ptr.
private void freeString(MemorySegment ptr) {
// Strings returned by Java callbacks are arena-owned and released when this bridge closes.
}
// Upcall bound to the vtable's free_user_data slot. Intentionally does nothing with userData.
private void freeUserData(MemorySegment userData) {
// User data is Java-side state (the impl object), not freed by Rust on drop.
}
/**
 * Decodes the NUL-terminated C string at {@code ptr}.
 *
 * <p>The pointer is viewed as a 4096-byte window rather than an unbounded one, so the scan for
 * the terminator cannot run past that window.
 *
 * @param ptr address of the first byte of the string
 * @return the decoded text up to, but not including, the first NUL byte
 */
private static String readNativeString(MemorySegment ptr) {
    final MemorySegment window = ptr.reinterpret(4096);
    return window.getString(0);
}
/**
 * Releases the arena that owns the vtable, the upcall stubs, and every string handed to native
 * code by the handlers. Calling it again after the arena is closed does nothing.
 */
@Override
public void close() {
    // Arena.close() throws IllegalStateException on an already-closed arena. A bridge can be
    // reached from more than one cleanup path, so make a repeated close a no-op.
    if (arena.scope().isAlive()) {
        arena.close();
    }
}
/**
 * Register a EmbeddingBackend implementation via Panama FFM upcall stubs.
 *
 * <p>Builds a bridge for {@code impl}, passes its name and vtable to the native registration
 * function, and on success stores the bridge in the live registry under that name so its arena
 * and stubs stay alive. On any failure the bridge is closed before the exception propagates.
 *
 * @param impl the backend to expose to native code
 * @throws RuntimeException if the native call returns a non-zero code (the message carries the
 *     native error text when one was provided) or if a non-{@code Exception} throwable occurred
 * @throws Exception any other exception raised while registering
 */
public static void registerEmbeddingBackend(final IEmbeddingBackend impl) throws Exception {
    var bridge = new EmbeddingBackendBridge(impl);
    final String name;
    try {
        // Read the name once so the registry key is exactly the name given to the native side,
        // even if impl.name() does not return the same value on every call.
        name = impl.name();
        try (var nameArena = Arena.ofShared()) {
            var nameCs = nameArena.allocateFrom(name);
            MemorySegment outErr = nameArena.allocate(ValueLayout.ADDRESS);
            int rc = (int) NativeLib.KREUZBERG_REGISTER_EMBEDDING_BACKEND.invoke(
                    nameCs,
                    bridge.vtableSegment(),
                    MemorySegment.NULL,
                    outErr
            );
            if (rc != 0) {
                MemorySegment errPtr = outErr.get(ValueLayout.ADDRESS, 0);
                String msg = errPtr.equals(MemorySegment.NULL) ? "registration failed (rc=" + rc + ")" : readNativeString(errPtr);
                throw new RuntimeException("registerEmbeddingBackend: " + msg);
            }
        }
    } catch (Throwable t) {
        bridge.close();
        if (t instanceof Exception e) {
            throw e;
        } else {
            throw new RuntimeException("Unexpected error during registration", t);
        }
    }
    // NOTE(review): a bridge already stored under this name is replaced here without being
    // closed. Whether closing it would be safe depends on whether native code can still call
    // its stubs — confirm the native side's duplicate-name behavior.
    EMBEDDING_BACKEND_BRIDGES.put(name, bridge);
}
/**
 * Unregister a EmbeddingBackend implementation by name.
 *
 * <p>Calls the native unregistration function first. Only when it succeeds is the matching
 * bridge removed from the live registry and closed.
 *
 * @param name the name the backend was registered under
 * @throws RuntimeException if the native call returns a non-zero code, or if a
 *     non-{@code Exception} throwable occurred
 * @throws Exception any other exception raised while unregistering
 */
public static void unregisterEmbeddingBackend(String name) throws Exception {
    try (var scratch = Arena.ofShared()) {
        final MemorySegment nativeName = scratch.allocateFrom(name);
        final MemorySegment errSlot = scratch.allocate(ValueLayout.ADDRESS);
        final int status =
                (int) NativeLib.KREUZBERG_UNREGISTER_EMBEDDING_BACKEND.invoke(nativeName, errSlot);
        if (status != 0) {
            final MemorySegment errPtr = errSlot.get(ValueLayout.ADDRESS, 0);
            final String detail;
            if (errPtr.equals(MemorySegment.NULL)) {
                detail = "unregistration failed (rc=" + status + ")";
            } else {
                detail = errPtr.reinterpret(Long.MAX_VALUE).getString(0);
            }
            throw new RuntimeException("unregisterEmbeddingBackend: " + detail);
        }
    } catch (Throwable t) {
        // Exceptions pass through untouched; anything else (an Error) is wrapped.
        if (!(t instanceof Exception e)) {
            throw new RuntimeException("Unexpected error during unregistration", t);
        }
        throw e;
    }
    final EmbeddingBackendBridge removed = EMBEDDING_BACKEND_BRIDGES.remove(name);
    if (removed != null) {
        removed.close();
    }
}
/**
 * Clear all registered EmbeddingBackend implementations.
 *
 * <p>Calls the native clear function first. Only when it succeeds is every bridge in the live
 * registry closed and the registry emptied.
 *
 * @throws RuntimeException if the native call returns a non-zero code, or if a
 *     non-{@code Exception} throwable occurred
 * @throws Exception any other exception raised while clearing
 */
public static void clearEmbeddingBackends() throws Exception {
    try (var scratch = Arena.ofShared()) {
        final MemorySegment errSlot = scratch.allocate(ValueLayout.ADDRESS);
        final int status = (int) NativeLib.KREUZBERG_CLEAR_EMBEDDING_BACKEND.invoke(errSlot);
        if (status != 0) {
            final MemorySegment errPtr = errSlot.get(ValueLayout.ADDRESS, 0);
            final String detail;
            if (errPtr.equals(MemorySegment.NULL)) {
                detail = "clear failed (rc=" + status + ")";
            } else {
                detail = errPtr.reinterpret(Long.MAX_VALUE).getString(0);
            }
            throw new RuntimeException("clearEmbeddingBackends: " + detail);
        }
    } catch (Throwable t) {
        // Exceptions pass through untouched; anything else (an Error) is wrapped.
        if (!(t instanceof Exception e)) {
            throw new RuntimeException("Unexpected error during clear", t);
        }
        throw e;
    }
    for (EmbeddingBackendBridge bridge : EMBEDDING_BACKEND_BRIDGES.values()) {
        bridge.close();
    }
    EMBEDDING_BACKEND_BRIDGES.clear();
}
}

View File

@@ -0,0 +1,157 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
import org.jspecify.annotations.Nullable;
/**
 * Settings for generating embeddings of text chunks.
 *
 * <p>Embeddings are produced with ONNX models via the vendored embedding engine, and the
 * {@code embeddings} feature must be enabled. A {@code null} {@code batchSize} is replaced by
 * {@code 32} when the record is constructed.
 *
 * @param model the embedding model to use (defaults to the "balanced" preset if not specified)
 * @param normalize whether to normalize embedding vectors (recommended for cosine similarity)
 * @param batchSize batch size for embedding generation
 * @param showDownloadProgress whether to show model download progress
 * @param cacheDir custom cache directory for model files; defaults to
 *     {@code ~/.cache/kreuzberg/embeddings/} if not specified, and allows full customization of
 *     the model download location
 * @param acceleration hardware acceleration for the embedding ONNX model; when set, controls
 *     which execution provider (CPU, CUDA, CoreML, TensorRT) is used for inference. Defaults to
 *     {@code None} (auto-select per platform)
 * @param maxEmbedDurationSecs maximum wall-clock duration (in seconds) for a single
 *     {@code embed()} call when using EmbeddingModelType.Plugin. Applies only to the in-process
 *     plugin path — protects against hung host-language backends (e.g. a Python callback
 *     deadlocked on the GIL, a model stuck on CUDA OOM retries, etc.). On timeout, the
 *     dispatcher returns {@code Plugin} instead of blocking forever. {@code None} disables the
 *     timeout. The default (60 seconds) is conservative for common in-process inference;
 *     increase for large batches on slow hardware
 */
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = EmbeddingConfig.Builder.class)
public record EmbeddingConfig(
    @Nullable @JsonProperty("model") EmbeddingModelType model,
    @Nullable @JsonProperty("normalize") Boolean normalize,
    @Nullable @JsonProperty("batch_size") Long batchSize,
    @Nullable @JsonProperty("show_download_progress") Boolean showDownloadProgress,
    @JsonProperty("cache_dir") java.nio.file.@Nullable Path cacheDir,
    @Nullable @JsonProperty("acceleration") AccelerationConfig acceleration,
    @Nullable @JsonProperty("max_embed_duration_secs") Long maxEmbedDurationSecs
) {
    /** Applies the default batch size of 32 when none was supplied. */
    public EmbeddingConfig {
        if (batchSize == null) {
            batchSize = 32L;
        }
    }

    /** Returns a new builder with every field unset. */
    public static Builder builder() {
        return new Builder();
    }

    // CPD-OFF
    /** Fluent builder; also the Jackson deserialization entry point for this record. */
    @JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
    public static final class Builder {
        @Nullable private EmbeddingModelType model;
        private Boolean normalize;
        @JsonProperty("batch_size")
        private Long batchSize;
        @JsonProperty("show_download_progress")
        private Boolean showDownloadProgress;
        @JsonProperty("cache_dir")
        private java.nio.file.Path cacheDir;
        @Nullable private AccelerationConfig acceleration;
        @JsonProperty("max_embed_duration_secs")
        private Long maxEmbedDurationSecs;

        /** Stores the model and returns this builder. */
        @JsonProperty("model")
        public Builder withModel(final @Nullable EmbeddingModelType value) {
            this.model = value;
            return this;
        }

        /** Stores the normalize flag and returns this builder. */
        @JsonProperty("normalize")
        public Builder withNormalize(final @Nullable Boolean value) {
            this.normalize = value;
            return this;
        }

        /** Stores the batch size and returns this builder. */
        @JsonProperty("batch_size")
        public Builder withBatchSize(final @Nullable Long value) {
            this.batchSize = value;
            return this;
        }

        /** Stores the download-progress flag and returns this builder. */
        @JsonProperty("show_download_progress")
        public Builder withShowDownloadProgress(final @Nullable Boolean value) {
            this.showDownloadProgress = value;
            return this;
        }

        /** Stores the cache directory and returns this builder. */
        @JsonProperty("cache_dir")
        public Builder withCacheDir(final java.nio.file.@Nullable Path value) {
            this.cacheDir = value;
            return this;
        }

        /** Stores the acceleration settings and returns this builder. */
        @JsonProperty("acceleration")
        public Builder withAcceleration(final @Nullable AccelerationConfig value) {
            this.acceleration = value;
            return this;
        }

        /** Stores the embed timeout (seconds) and returns this builder. */
        @JsonProperty("max_embed_duration_secs")
        public Builder withMaxEmbedDurationSecs(final @Nullable Long value) {
            this.maxEmbedDurationSecs = value;
            return this;
        }

        /** Creates the EmbeddingConfig from the values collected so far. */
        public EmbeddingConfig build() {
            return new EmbeddingConfig(
                    model,
                    normalize,
                    batchSize,
                    showDownloadProgress,
                    cacheDir,
                    acceleration,
                    maxEmbedDurationSecs);
        }
    }
    // CPD-ON

    /**
     * Not implemented: always throws.
     *
     * @throws UnsupportedOperationException always; use {@link #builder()} instead
     */
    public static EmbeddingConfig defaultInstance() {
        throw new UnsupportedOperationException("defaultInstance is not yet bridged via JNI; use the Builder instead.");
    }
}

View File

@@ -0,0 +1,15 @@
// DO NOT EDIT - auto-generated by alef
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
package dev.kreuzberg;
/**
 * A {@link KreuzbergErrorException} subtype that adds no state or behavior of its own.
 * NOTE(review): presumably raised for embedding-related failures — confirm against the code that throws it.
 */
public class EmbeddingException extends KreuzbergErrorException {
/** Creates a new EmbeddingException with the given message. */
public EmbeddingException(final String message) {
super(message);
}
/** Creates a new EmbeddingException with the given message and cause. */
public EmbeddingException(final String message, final Throwable cause) {
super(message, cause);
}
}

View File

@@ -0,0 +1,40 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonSubTypes;
import com.fasterxml.jackson.annotation.JsonTypeInfo;
/**
 * The kinds of embedding model Kreuzberg supports.
 *
 * <p>In JSON the variant is selected by a {@code type} property whose value is one of
 * {@code preset}, {@code custom}, {@code llm} or {@code plugin}; unknown properties are ignored.
 */
@JsonTypeInfo(use = JsonTypeInfo.Id.NAME, property = "type", visible = false)
@JsonSubTypes({
    @JsonSubTypes.Type(value = EmbeddingModelType.Preset.class, name = "preset"),
    @JsonSubTypes.Type(value = EmbeddingModelType.Custom.class, name = "custom"),
    @JsonSubTypes.Type(value = EmbeddingModelType.Llm.class, name = "llm"),
    @JsonSubTypes.Type(value = EmbeddingModelType.Plugin.class, name = "plugin")
})
@com.fasterxml.jackson.annotation.JsonIgnoreProperties(ignoreUnknown = true)
public sealed interface EmbeddingModelType {

    /** A preset model configuration, identified by name (the recommended variant). */
    record Preset(
        @JsonProperty("name") String name
    ) implements EmbeddingModelType {
    }

    /** A custom ONNX model from HuggingFace, given by model id and vector dimensions. */
    record Custom(
        @JsonProperty("model_id") String modelId,
        @JsonProperty("dimensions") long dimensions
    ) implements EmbeddingModelType {
    }

    /** A provider-hosted embedding model accessed via liter-llm. */
    record Llm(
        @JsonProperty("llm") LlmConfig llm
    ) implements EmbeddingModelType {
    }

    /** An in-process embedding backend registered via the plugin system, identified by name. */
    record Plugin(
        @JsonProperty("name") String name
    ) implements EmbeddingModelType {
    }
}

View File

@@ -0,0 +1,134 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
/**
 * A preset configuration for a common RAG use case.
 *
 * <p>Each preset combines chunk size, overlap, and embedding model into an optimized
 * configuration for a specific scenario. All string fields are owned {@code String} for FFI
 * compatibility — instances are safe to clone and pass across language boundaries.
 *
 * @param name value serialized as {@code name}
 * @param chunkSize value serialized as {@code chunk_size}
 * @param overlap value serialized as {@code overlap}
 * @param modelRepo HuggingFace repository name for the model
 * @param pooling pooling strategy: "cls" or "mean"
 * @param modelFile path to the ONNX model file within the repo
 * @param dimensions value serialized as {@code dimensions}
 * @param description value serialized as {@code description}
 */
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = EmbeddingPreset.Builder.class)
public record EmbeddingPreset(
    @JsonProperty("name") String name,
    @JsonProperty("chunk_size") long chunkSize,
    @JsonProperty("overlap") long overlap,
    @JsonProperty("model_repo") String modelRepo,
    @JsonProperty("pooling") String pooling,
    @JsonProperty("model_file") String modelFile,
    @JsonProperty("dimensions") long dimensions,
    @JsonProperty("description") String description
) {
    /** Returns a new builder whose strings start empty and whose numbers start at zero. */
    public static Builder builder() {
        return new Builder();
    }

    // CPD-OFF
    /** Fluent builder; also the Jackson deserialization entry point for this record. */
    @JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
    public static final class Builder {
        private String name = "";
        @JsonProperty("chunk_size")
        private long chunkSize;
        private long overlap;
        @JsonProperty("model_repo")
        private String modelRepo = "";
        private String pooling = "";
        @JsonProperty("model_file")
        private String modelFile = "";
        private long dimensions;
        private String description = "";

        /** Stores the name and returns this builder. */
        @JsonProperty("name")
        public Builder withName(final String value) {
            this.name = value;
            return this;
        }

        /** Stores the chunk size and returns this builder. */
        @JsonProperty("chunk_size")
        public Builder withChunkSize(final long value) {
            this.chunkSize = value;
            return this;
        }

        /** Stores the overlap and returns this builder. */
        @JsonProperty("overlap")
        public Builder withOverlap(final long value) {
            this.overlap = value;
            return this;
        }

        /** Stores the model repository and returns this builder. */
        @JsonProperty("model_repo")
        public Builder withModelRepo(final String value) {
            this.modelRepo = value;
            return this;
        }

        /** Stores the pooling strategy and returns this builder. */
        @JsonProperty("pooling")
        public Builder withPooling(final String value) {
            this.pooling = value;
            return this;
        }

        /** Stores the model file path and returns this builder. */
        @JsonProperty("model_file")
        public Builder withModelFile(final String value) {
            this.modelFile = value;
            return this;
        }

        /** Stores the dimensions and returns this builder. */
        @JsonProperty("dimensions")
        public Builder withDimensions(final long value) {
            this.dimensions = value;
            return this;
        }

        /** Stores the description and returns this builder. */
        @JsonProperty("description")
        public Builder withDescription(final String value) {
            this.description = value;
            return this;
        }

        /** Creates the EmbeddingPreset from the values collected so far. */
        public EmbeddingPreset build() {
            return new EmbeddingPreset(
                    name,
                    chunkSize,
                    overlap,
                    modelRepo,
                    pooling,
                    modelFile,
                    dimensions,
                    description);
        }
    }
    // CPD-ON
}

View File

@@ -0,0 +1,100 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
import org.jspecify.annotations.Nullable;
/**
 * Metadata for EPUB documents (Dublin Core extensions). Every component is optional.
 *
 * @param coverage value serialized as {@code coverage}
 * @param dcFormat value serialized as {@code dc_format}
 * @param relation value serialized as {@code relation}
 * @param source value serialized as {@code source}
 * @param dcType value serialized as {@code dc_type}
 * @param coverImage value serialized as {@code cover_image}
 */
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = EpubMetadata.Builder.class)
public record EpubMetadata(
    @Nullable @JsonProperty("coverage") String coverage,
    @Nullable @JsonProperty("dc_format") String dcFormat,
    @Nullable @JsonProperty("relation") String relation,
    @Nullable @JsonProperty("source") String source,
    @Nullable @JsonProperty("dc_type") String dcType,
    @Nullable @JsonProperty("cover_image") String coverImage
) {
    /** Returns a new builder with every field unset. */
    public static Builder builder() {
        return new Builder();
    }

    // CPD-OFF
    /** Fluent builder; also the Jackson deserialization entry point for this record. */
    @JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
    public static final class Builder {
        private String coverage;
        @JsonProperty("dc_format")
        private String dcFormat;
        private String relation;
        private String source;
        @JsonProperty("dc_type")
        private String dcType;
        @JsonProperty("cover_image")
        private String coverImage;

        /** Stores the coverage and returns this builder. */
        @JsonProperty("coverage")
        public Builder withCoverage(final @Nullable String value) {
            this.coverage = value;
            return this;
        }

        /** Stores the dcFormat value and returns this builder. */
        @JsonProperty("dc_format")
        public Builder withDcFormat(final @Nullable String value) {
            this.dcFormat = value;
            return this;
        }

        /** Stores the relation and returns this builder. */
        @JsonProperty("relation")
        public Builder withRelation(final @Nullable String value) {
            this.relation = value;
            return this;
        }

        /** Stores the source and returns this builder. */
        @JsonProperty("source")
        public Builder withSource(final @Nullable String value) {
            this.source = value;
            return this;
        }

        /** Stores the dcType value and returns this builder. */
        @JsonProperty("dc_type")
        public Builder withDcType(final @Nullable String value) {
            this.dcType = value;
            return this;
        }

        /** Stores the cover image and returns this builder. */
        @JsonProperty("cover_image")
        public Builder withCoverImage(final @Nullable String value) {
            this.coverImage = value;
            return this;
        }

        /** Creates the EpubMetadata from the values collected so far. */
        public EpubMetadata build() {
            return new EpubMetadata(coverage, dcFormat, relation, source, dcType, coverImage);
        }
    }
    // CPD-ON
}

View File

@@ -0,0 +1,57 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
/**
 * Metadata describing an error (for batch operations).
 *
 * @param errorType value serialized as {@code error_type}
 * @param message value serialized as {@code message}
 */
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = ErrorMetadata.Builder.class)
public record ErrorMetadata(
    @JsonProperty("error_type") String errorType,
    @JsonProperty("message") String message
) {
    /** Returns a new builder whose fields start as empty strings. */
    public static Builder builder() {
        return new Builder();
    }

    // CPD-OFF
    /** Fluent builder; also the Jackson deserialization entry point for this record. */
    @JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
    public static final class Builder {
        @JsonProperty("error_type")
        private String errorType = "";
        private String message = "";

        /** Stores the error type and returns this builder. */
        @JsonProperty("error_type")
        public Builder withErrorType(final String value) {
            this.errorType = value;
            return this;
        }

        /** Stores the message and returns this builder. */
        @JsonProperty("message")
        public Builder withMessage(final String value) {
            this.message = value;
            return this;
        }

        /** Creates the ErrorMetadata from the values collected so far. */
        public ErrorMetadata build() {
            return new ErrorMetadata(errorType, message);
        }
    }
    // CPD-ON
}

View File

@@ -0,0 +1,69 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import java.util.List;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
import org.jspecify.annotations.Nullable;
/**
 * Excel/spreadsheet format metadata.
 *
 * <p>Identifies the document as a spreadsheet source via the {@code FormatMetadata.Excel}
 * discriminant. Sheet count and sheet names are stored inside this struct.
 *
 * @param sheetCount number of sheets in the workbook, or {@code null} when unset
 * @param sheetNames names of all sheets in the workbook, or {@code null} when unset
 */
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = ExcelMetadata.Builder.class)
public record ExcelMetadata(
    @Nullable @JsonProperty("sheet_count") Integer sheetCount,
    @Nullable @JsonProperty("sheet_names") List<String> sheetNames
) {
    /** Returns a new builder with both fields unset. */
    public static Builder builder() {
        return new Builder();
    }

    // CPD-OFF
    /** Fluent builder; also the Jackson deserialization entry point for this record. */
    @JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
    public static final class Builder {
        @JsonProperty("sheet_count")
        private Integer sheetCount = null;
        @JsonProperty("sheet_names")
        private List<String> sheetNames = null;

        /**
         * Sets the sheetCount field.
         *
         * <p>Takes the boxed type so the value can be {@code null}, matching the nullable record
         * component. A primitive parameter could not carry {@code null}, and a JSON
         * {@code null} would be turned into {@code 0} during deserialization.
         */
        @JsonProperty("sheet_count")
        public Builder withSheetCount(final @Nullable Integer value) {
            this.sheetCount = value;
            return this;
        }

        /** Sets the sheetNames field. */
        @JsonProperty("sheet_names")
        public Builder withSheetNames(final @Nullable List<String> value) {
            this.sheetNames = value;
            return this;
        }

        /** Builds the ExcelMetadata instance. */
        public ExcelMetadata build() {
            return new ExcelMetadata(
                    sheetCount,
                    sheetNames);
        }
    }
    // CPD-ON
}

View File

@@ -0,0 +1,125 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import java.util.List;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
import org.jspecify.annotations.Nullable;
/**
 * One worksheet of an Excel workbook.
 *
 * <p>Holds the sheet's content converted to Markdown format together with dimensional
 * statistics.
 *
 * @param name sheet name as it appears in Excel
 * @param markdown sheet content converted to Markdown tables
 * @param rowCount number of rows
 * @param colCount number of columns
 * @param cellCount total number of non-empty cells
 * @param tableCells pre-extracted table cells (2D vector of cell values), populated during
 *     markdown generation to avoid re-parsing markdown; null for empty sheets
 */
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = ExcelSheet.Builder.class)
public record ExcelSheet(
    @JsonProperty("name") String name,
    @JsonProperty("markdown") String markdown,
    @JsonProperty("row_count") long rowCount,
    @JsonProperty("col_count") long colCount,
    @JsonProperty("cell_count") long cellCount,
    @Nullable @JsonProperty("table_cells") List<List<String>> tableCells
) {
    /** Returns a new builder: empty strings, zero counts, and no table cells. */
    public static Builder builder() {
        return new Builder();
    }

    // CPD-OFF
    /** Fluent builder; also the Jackson deserialization entry point for this record. */
    @JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
    public static final class Builder {
        private String name = "";
        private String markdown = "";
        @JsonProperty("row_count")
        private long rowCount;
        @JsonProperty("col_count")
        private long colCount;
        @JsonProperty("cell_count")
        private long cellCount;
        @JsonProperty("table_cells")
        private List<List<String>> tableCells;

        /** Stores the sheet name and returns this builder. */
        @JsonProperty("name")
        public Builder withName(final String value) {
            this.name = value;
            return this;
        }

        /** Stores the Markdown content and returns this builder. */
        @JsonProperty("markdown")
        public Builder withMarkdown(final String value) {
            this.markdown = value;
            return this;
        }

        /** Stores the row count and returns this builder. */
        @JsonProperty("row_count")
        public Builder withRowCount(final long value) {
            this.rowCount = value;
            return this;
        }

        /** Stores the column count and returns this builder. */
        @JsonProperty("col_count")
        public Builder withColCount(final long value) {
            this.colCount = value;
            return this;
        }

        /** Stores the cell count and returns this builder. */
        @JsonProperty("cell_count")
        public Builder withCellCount(final long value) {
            this.cellCount = value;
            return this;
        }

        /** Stores the table cells and returns this builder. */
        @JsonProperty("table_cells")
        public Builder withTableCells(final @Nullable List<List<String>> value) {
            this.tableCells = value;
            return this;
        }

        /** Creates the ExcelSheet from the values collected so far. */
        public ExcelSheet build() {
            return new ExcelSheet(name, markdown, rowCount, colCount, cellCount, tableCells);
        }
    }
    // CPD-ON
}

View File

@@ -0,0 +1,88 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import java.util.List;
import java.util.Map;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
import org.jspecify.annotations.Nullable;
/**
 * An Excel workbook.
 *
 * <p>Contains all sheets from an Excel file (.xlsx, .xls, etc.) with extracted content and
 * metadata.
 *
 * @param sheets all sheets in the workbook
 * @param metadata workbook-level metadata (author, creation date, etc.)
 * @param revisions collaborative-edit revision headers from
 *     {@code xl/revisions/revisionHeaders.xml}. Populated for legacy shared-workbook
 *     {@code .xlsx} files that contain the {@code xl/revisions/} directory. Each
 *     {@code &lt;header&gt;} element maps to one {@code DocumentRevision { kind: FormatChange }}
 *     carrying the header's {@code guid} (→ {@code revision_id}), {@code userName}
 *     (→ {@code author}), and {@code dateTime} (→ {@code timestamp}). {@code anchor} and
 *     {@code delta} are {@code None}/empty for v1 (per-cell log parsing is a follow-up).
 *     {@code None} when {@code xl/revisions/revisionHeaders.xml} is absent
 */
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = ExcelWorkbook.Builder.class)
public record ExcelWorkbook(
    @JsonProperty("sheets") List<ExcelSheet> sheets,
    @JsonProperty("metadata") Map<String, String> metadata,
    @Nullable @JsonProperty("revisions") List<DocumentRevision> revisions
) {
    /** Returns a new builder: no sheets, no metadata entries, and no revisions. */
    public static Builder builder() {
        return new Builder();
    }

    // CPD-OFF
    /** Fluent builder; also the Jackson deserialization entry point for this record. */
    @JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
    public static final class Builder {
        private List<ExcelSheet> sheets = List.of();
        private Map<String, String> metadata = Map.of();
        private List<DocumentRevision> revisions;

        /** Stores the sheets and returns this builder. */
        @JsonProperty("sheets")
        public Builder withSheets(final List<ExcelSheet> value) {
            this.sheets = value;
            return this;
        }

        /** Stores the metadata map and returns this builder. */
        @JsonProperty("metadata")
        public Builder withMetadata(final Map<String, String> value) {
            this.metadata = value;
            return this;
        }

        /** Stores the revisions and returns this builder. */
        @JsonProperty("revisions")
        public Builder withRevisions(final @Nullable List<DocumentRevision> value) {
            this.revisions = value;
            return this;
        }

        /** Creates the ExcelWorkbook from the values collected so far. */
        public ExcelWorkbook build() {
            return new ExcelWorkbook(sheets, metadata, revisions);
        }
    }
    // CPD-ON
}

View File

@@ -0,0 +1,58 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonValue;
/**
 * The execution provider ONNX Runtime uses for model inference.
 *
 * <p>Each constant carries the lowercase string used in JSON. {@code Auto} (default) selects
 * the best available provider per platform.
 */
public enum ExecutionProviderType {
    /** Auto-select: CoreML on macOS, CUDA on Linux, CPU elsewhere. */
    Auto("auto"),
    /** CPU execution provider (always available). */
    Cpu("cpu"),
    /** Apple CoreML (macOS/iOS Neural Engine + GPU). */
    CoreMl("coreml"),
    /** NVIDIA CUDA GPU acceleration. */
    Cuda("cuda"),
    /** NVIDIA TensorRT (optimized CUDA inference). */
    TensorRt("tensorrt");

    /** Wire-format string for this constant. */
    private final String value;

    ExecutionProviderType(final String value) {
        this.value = value;
    }

    /** Returns the wire-format string; Jackson serializes the constant as this value. */
    @JsonValue
    public String getValue() {
        return this.value;
    }

    /**
     * Looks up the constant whose wire-format string matches {@code value}, ignoring case.
     *
     * @param value the string to resolve
     * @return the first matching constant in declaration order
     * @throws IllegalArgumentException if no constant matches
     */
    @JsonCreator
    public static ExecutionProviderType fromValue(final String value) {
        return java.util.Arrays.stream(values())
                .filter(candidate -> candidate.value.equalsIgnoreCase(value))
                .findFirst()
                .orElseThrow(() -> new IllegalArgumentException("Unknown value: " + value));
    }

    /** Returns the wire-format string value (matches JSON serialization). */
    @Override
    public String toString() {
        return this.value;
    }
}

View File

@@ -0,0 +1,269 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonSerialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
import org.jspecify.annotations.Nullable;
/**
* Extracted image from a document.
*
* Contains raw image data, metadata, and optional nested OCR results.
* Raw bytes allow cross-language compatibility - users can convert to
* PIL.Image (Python), Sharp (Node.js), or other formats as needed.
*/
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = ExtractedImage.Builder.class)
public record ExtractedImage(
    /**
     * Raw image data (PNG, JPEG, WebP, etc. bytes).
     * The array reference is stored as given (no defensive copy is made).
     * The originating type is documented as {@code bytes.Bytes}, chosen for cheap cloning of large buffers.
     */
    @JsonSerialize(using = ByteArrayToIntArraySerializer.class) @JsonProperty("data") byte[] data,
    /**
     * Image format (e.g., "jpeg", "png", "webp").
     * The originating type is documented as a {@code Cow} string, to avoid allocation for static literals.
     */
    @JsonProperty("format") String format,
    /**
     * Zero-indexed position of this image in the document/page.
     */
    @JsonProperty("image_index") int imageIndex,
    /**
     * Page/slide number where the image was found (1-indexed); {@code null} when not set.
     */
    @Nullable @JsonProperty("page_number") Integer pageNumber,
    /**
     * Image width in pixels; {@code null} when not set.
     */
    @Nullable @JsonProperty("width") Integer width,
    /**
     * Image height in pixels; {@code null} when not set.
     */
    @Nullable @JsonProperty("height") Integer height,
    /**
     * Colorspace information (e.g., "RGB", "CMYK", "Gray").
     */
    @Nullable @JsonProperty("colorspace") String colorspace,
    /**
     * Bits per color component (e.g., 8, 16).
     */
    @Nullable @JsonProperty("bits_per_component") Integer bitsPerComponent,
    /**
     * Whether this image is a mask image.
     */
    @Nullable @JsonProperty("is_mask") Boolean isMask,
    /**
     * Optional description of the image.
     */
    @Nullable @JsonProperty("description") String description,
    /**
     * Nested OCR extraction result (if the image was OCRed).
     *
     * When OCR is performed on this image, the result is embedded here
     * rather than in a separate collection, making the relationship explicit.
     */
    @Nullable @JsonProperty("ocr_result") ExtractionResult ocrResult,
    /**
     * Bounding box of the image on the page (PDF coordinates: x0=left, y0=bottom, x1=right, y1=top).
     * Only populated for PDF-extracted images when position data is available from the PDF extractor.
     */
    @Nullable @JsonProperty("bounding_box") BoundingBox boundingBox,
    /**
     * Original source path of the image within the document archive (e.g., "media/image1.png" in DOCX).
     * Used for rendering image references when the binary data is not extracted.
     */
    @Nullable @JsonProperty("source_path") String sourcePath,
    /**
     * Heuristic classification of what this image likely depicts.
     * {@code null} if classification was disabled or inconclusive.
     */
    @Nullable @JsonProperty("image_kind") ImageKind imageKind,
    /**
     * Confidence score for {@code image_kind}, in the range 0.0 to 1.0.
     */
    @Nullable @JsonProperty("kind_confidence") Float kindConfidence,
    /**
     * Identifier shared across images that form a single logical figure
     * (e.g. all raster tiles of one technical drawing). {@code null} for singletons.
     */
    @Nullable @JsonProperty("cluster_id") Integer clusterId
) {
    /**
     * Creates a new builder with its default field values.
     *
     * @return a fresh {@link Builder}
     */
    public static Builder builder() {
        return new Builder();
    }

    // CPD-OFF
    /**
     * Mutable builder for {@link ExtractedImage}; also used by Jackson for deserialization
     * (setter prefix {@code with}, build method {@code build}).
     *
     * Defaults: empty {@code data}, empty {@code format}, {@code imageIndex} 0, everything else {@code null}.
     */
    @JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
    public static final class Builder {
        private byte[] data = new byte[0];
        private String format = "";
        @JsonProperty("image_index")
        private int imageIndex = 0;
        @JsonProperty("page_number")
        private Integer pageNumber = null;
        private Integer width = null;
        private Integer height = null;
        private String colorspace = null;
        @JsonProperty("bits_per_component")
        private Integer bitsPerComponent = null;
        @JsonProperty("is_mask")
        private Boolean isMask = null;
        private String description = null;
        @JsonProperty("ocr_result")
        private ExtractionResult ocrResult = null;
        @JsonProperty("bounding_box")
        @Nullable private BoundingBox boundingBox = null;
        @JsonProperty("source_path")
        private String sourcePath = null;
        @JsonProperty("image_kind")
        @Nullable private ImageKind imageKind = null;
        @JsonProperty("kind_confidence")
        private Float kindConfidence = null;
        @JsonProperty("cluster_id")
        private Integer clusterId = null;

        /** Sets the data field (the array is stored without copying). */
        @JsonProperty("data")
        public Builder withData(final byte[] value) {
            this.data = value;
            return this;
        }

        /** Sets the format field. */
        @JsonProperty("format")
        public Builder withFormat(final String value) {
            this.format = value;
            return this;
        }

        /** Sets the imageIndex field. */
        @JsonProperty("image_index")
        public Builder withImageIndex(final int value) {
            this.imageIndex = value;
            return this;
        }

        /**
         * Sets the pageNumber field.
         *
         * Takes a boxed {@code Integer} so {@code null} can be passed and a JSON {@code null}
         * stays {@code null} instead of hitting a primitive parameter; {@code int} arguments autobox.
         */
        @JsonProperty("page_number")
        public Builder withPageNumber(final @Nullable Integer value) {
            this.pageNumber = value;
            return this;
        }

        /**
         * Sets the width field.
         *
         * Takes a boxed {@code Integer} so {@code null} can be passed and a JSON {@code null}
         * stays {@code null} instead of hitting a primitive parameter; {@code int} arguments autobox.
         */
        @JsonProperty("width")
        public Builder withWidth(final @Nullable Integer value) {
            this.width = value;
            return this;
        }

        /**
         * Sets the height field.
         *
         * Takes a boxed {@code Integer} so {@code null} can be passed and a JSON {@code null}
         * stays {@code null} instead of hitting a primitive parameter; {@code int} arguments autobox.
         */
        @JsonProperty("height")
        public Builder withHeight(final @Nullable Integer value) {
            this.height = value;
            return this;
        }

        /** Sets the colorspace field. */
        @JsonProperty("colorspace")
        public Builder withColorspace(final @Nullable String value) {
            this.colorspace = value;
            return this;
        }

        /**
         * Sets the bitsPerComponent field.
         *
         * Takes a boxed {@code Integer} so {@code null} can be passed and a JSON {@code null}
         * stays {@code null} instead of hitting a primitive parameter; {@code int} arguments autobox.
         */
        @JsonProperty("bits_per_component")
        public Builder withBitsPerComponent(final @Nullable Integer value) {
            this.bitsPerComponent = value;
            return this;
        }

        /** Sets the isMask field. */
        @JsonProperty("is_mask")
        public Builder withIsMask(final @Nullable Boolean value) {
            this.isMask = value;
            return this;
        }

        /** Sets the description field. */
        @JsonProperty("description")
        public Builder withDescription(final @Nullable String value) {
            this.description = value;
            return this;
        }

        /** Sets the ocrResult field. */
        @JsonProperty("ocr_result")
        public Builder withOcrResult(final @Nullable ExtractionResult value) {
            this.ocrResult = value;
            return this;
        }

        /** Sets the boundingBox field. */
        @JsonProperty("bounding_box")
        public Builder withBoundingBox(final @Nullable BoundingBox value) {
            this.boundingBox = value;
            return this;
        }

        /** Sets the sourcePath field. */
        @JsonProperty("source_path")
        public Builder withSourcePath(final @Nullable String value) {
            this.sourcePath = value;
            return this;
        }

        /** Sets the imageKind field. */
        @JsonProperty("image_kind")
        public Builder withImageKind(final @Nullable ImageKind value) {
            this.imageKind = value;
            return this;
        }

        /** Sets the kindConfidence field. */
        @JsonProperty("kind_confidence")
        public Builder withKindConfidence(final @Nullable Float value) {
            this.kindConfidence = value;
            return this;
        }

        /** Sets the clusterId field. */
        @JsonProperty("cluster_id")
        public Builder withClusterId(final @Nullable Integer value) {
            this.clusterId = value;
            return this;
        }

        /**
         * Builds the ExtractedImage instance from the current builder state.
         *
         * @return a new record; arguments are passed in record-component order
         */
        public ExtractedImage build() {
            return new ExtractedImage(
                data,
                format,
                imageIndex,
                pageNumber,
                width,
                height,
                colorspace,
                bitsPerComponent,
                isMask,
                description,
                ocrResult,
                boundingBox,
                sourcePath,
                imageKind,
                kindConfidence,
                clusterId
            );
        }
    }
    // CPD-ON
}

View File

@@ -0,0 +1,93 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
import org.jspecify.annotations.Nullable;
/**
* A URI extracted from a document.
*
* Represents any link, reference, or resource pointer found during extraction.
* The {@code kind} field classifies the URI semantically, while {@code label} carries
* optional human-readable display text.
*/
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = ExtractedUri.Builder.class)
public record ExtractedUri(
    /**
     * The URL or path string.
     */
    @JsonProperty("url") String url,
    /**
     * Optional display text / label for the link.
     */
    @Nullable @JsonProperty("label") String label,
    /**
     * Optional page number where the URI was found (1-indexed); {@code null} when not set.
     */
    @Nullable @JsonProperty("page") Integer page,
    /**
     * Semantic classification of the URI.
     */
    @JsonProperty("kind") UriKind kind
) {
    /**
     * Creates a new builder with its default field values.
     *
     * @return a fresh {@link Builder}
     */
    public static Builder builder() {
        return new Builder();
    }

    // CPD-OFF
    /**
     * Mutable builder for {@link ExtractedUri}; also used by Jackson for deserialization
     * (setter prefix {@code with}, build method {@code build}).
     *
     * Defaults: empty {@code url}; {@code label}, {@code page} and {@code kind} are {@code null}.
     */
    @JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
    public static final class Builder {
        private String url = "";
        private String label = null;
        private Integer page = null;
        // NOTE(review): kind has no non-null default, so build() passes null unless withKind was called.
        private UriKind kind = null;

        /** Sets the url field. */
        @JsonProperty("url")
        public Builder withUrl(final String value) {
            this.url = value;
            return this;
        }

        /** Sets the label field. */
        @JsonProperty("label")
        public Builder withLabel(final @Nullable String value) {
            this.label = value;
            return this;
        }

        /**
         * Sets the page field.
         *
         * Takes a boxed {@code Integer} so {@code null} can be passed and a JSON {@code null}
         * stays {@code null} instead of hitting a primitive parameter; {@code int} arguments autobox.
         */
        @JsonProperty("page")
        public Builder withPage(final @Nullable Integer value) {
            this.page = value;
            return this;
        }

        /** Sets the kind field. */
        @JsonProperty("kind")
        public Builder withKind(final UriKind value) {
            this.kind = value;
            return this;
        }

        /**
         * Builds the ExtractedUri instance from the current builder state.
         *
         * @return a new record; arguments are passed in record-component order
         */
        public ExtractedUri build() {
            return new ExtractedUri(
                url,
                label,
                page,
                kind
            );
        }
    }
    // CPD-ON
}

View File

@@ -0,0 +1,650 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import java.util.List;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
import org.jspecify.annotations.Nullable;
/**
* Main extraction configuration.
*
* This struct contains all configuration options for the extraction process.
* It can be loaded from TOML, YAML, or JSON files, or created programmatically.
*/
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = ExtractionConfig.Builder.class)
public record ExtractionConfig(
    /**
     * Whether extraction results are cached.
     */
    @Nullable @JsonProperty("use_cache") Boolean useCache,
    /**
     * Whether quality post-processing is enabled.
     */
    @Nullable @JsonProperty("enable_quality_processing") Boolean enableQualityProcessing,
    /**
     * OCR configuration ({@code null} = OCR disabled).
     */
    @Nullable @JsonProperty("ocr") OcrConfig ocr,
    /**
     * Forces OCR even for searchable PDFs.
     */
    @Nullable @JsonProperty("force_ocr") Boolean forceOcr,
    /**
     * Pages on which OCR is forced (1-indexed page numbers, each must be &gt;= 1).
     *
     * When set, only the listed pages are OCR'd regardless of text layer quality;
     * unlisted pages use native text extraction. Ignored when {@code force_ocr} is {@code true}.
     * Applies to PDF documents only. Duplicates are automatically deduplicated.
     * An {@code ocr} config is recommended for backend/language selection; defaults are used if absent.
     */
    @Nullable @JsonProperty("force_ocr_pages") List<Integer> forceOcrPages,
    /**
     * Disables OCR entirely, even for images.
     *
     * When {@code true}, OCR is skipped for all document types. Images return metadata
     * only (dimensions, format, EXIF) without text extraction, and PDFs use only
     * native text extraction without OCR fallback.
     *
     * Cannot be {@code true} simultaneously with {@code force_ocr}.
     *
     * Added in v4.7.0.
     */
    @Nullable @JsonProperty("disable_ocr") Boolean disableOcr,
    /**
     * Text chunking configuration ({@code null} = chunking disabled).
     */
    @Nullable @JsonProperty("chunking") ChunkingConfig chunking,
    /**
     * Content filtering configuration ({@code null} = use extractor defaults).
     *
     * Controls whether document "furniture" (headers, footers, watermarks,
     * repeating text) is included in or stripped from extraction results.
     * See ContentFilterConfig for per-field documentation.
     */
    @Nullable @JsonProperty("content_filter") ContentFilterConfig contentFilter,
    /**
     * Image extraction configuration ({@code null} = no image extraction).
     */
    @Nullable @JsonProperty("images") ImageExtractionConfig images,
    /**
     * PDF-specific options ({@code null} = use defaults).
     */
    @Nullable @JsonProperty("pdf_options") PdfConfig pdfOptions,
    /**
     * Token reduction configuration ({@code null} = no token reduction).
     */
    @Nullable @JsonProperty("token_reduction") TokenReductionOptions tokenReduction,
    /**
     * Language detection configuration ({@code null} = no language detection).
     */
    @Nullable @JsonProperty("language_detection") LanguageDetectionConfig languageDetection,
    /**
     * Page extraction configuration ({@code null} = no page tracking).
     */
    @Nullable @JsonProperty("pages") PageConfig pages,
    /**
     * Keyword extraction configuration ({@code null} = no keyword extraction).
     */
    @Nullable @JsonProperty("keywords") KeywordConfig keywords,
    /**
     * Post-processor configuration ({@code null} = use defaults).
     */
    @Nullable @JsonProperty("postprocessor") PostProcessorConfig postprocessor,
    /**
     * HTML to Markdown conversion options ({@code null} = use defaults).
     *
     * Configures how HTML documents are converted to Markdown, including heading styles,
     * list formatting, code block styles, and preprocessing options.
     */
    @Nullable @JsonProperty("html_options") String htmlOptions,
    /**
     * Styled HTML output configuration.
     *
     * When set alongside {@code output_format = OutputFormat.Html}, the extraction
     * pipeline uses StyledHtmlRenderer (crate.rendering.StyledHtmlRenderer),
     * which emits stable {@code kb-*} CSS class hooks on every structural element
     * and optionally embeds theme CSS or user-supplied CSS in a {@code <style>} block.
     *
     * When {@code null}, the existing plain comrak-based HTML renderer is used.
     */
    @Nullable @JsonProperty("html_output") HtmlOutputConfig htmlOutput,
    /**
     * Default per-file timeout in seconds for batch extraction.
     *
     * When set, each file in a batch will be canceled after this duration
     * unless overridden by FileExtractionConfig.timeout_secs.
     *
     * Documented as defaulting to {@code Some(60)} to prevent pathological files (e.g. deeply
     * nested archives, documents with millions of cells) from running
     * indefinitely and exhausting caller resources. Set to {@code None} to
     * disable the timeout for trusted input or long-running workloads.
     */
    @Nullable @JsonProperty("extraction_timeout_secs") Long extractionTimeoutSecs,
    /**
     * Maximum concurrent extractions in batch operations ({@code null} = (num_cpus × 1.5).ceil()).
     *
     * Limits parallelism to prevent resource exhaustion when processing
     * large batches. Defaults to (num_cpus × 1.5).ceil() when not set.
     */
    @Nullable @JsonProperty("max_concurrent_extractions") Long maxConcurrentExtractions,
    /**
     * Result structure format.
     *
     * Controls whether results are returned in unified format (default) with all
     * content in the {@code content} field, or element-based format with semantic
     * elements (for Unstructured-compatible output).
     */
    @Nullable @JsonProperty("result_format") ResultFormat resultFormat,
    /**
     * Security limits for archive extraction.
     *
     * Controls maximum archive size, compression ratio, file count, and other
     * security thresholds to prevent decompression bomb attacks. Also caps
     * nesting depth, iteration count, entity / token length, total
     * content size, and table cell count for every extraction path that
     * ingests user-controlled bytes.
     * When {@code null}, default limits are used.
     */
    @Nullable @JsonProperty("security_limits") SecurityLimits securityLimits,
    /**
     * Maximum uncompressed size in bytes for a single embedded file before
     * recursive extraction is attempted (default: 50 MiB).
     *
     * Applies to embedded objects inside OOXML containers (DOCX, PPTX) and
     * to email attachments processed via recursive extraction. Files that
     * exceed this limit are skipped with a {@code ProcessingWarning} rather than
     * passed to the extraction pipeline, preventing a single oversized
     * embedded object from consuming unbounded memory or time.
     *
     * Set to {@code None} to disable the per-embedded-file cap (falls back to
     * {@code security_limits.max_archive_size} as the only guard).
     */
    @Nullable @JsonProperty("max_embedded_file_bytes") Long maxEmbeddedFileBytes,
    /**
     * Content text format (default: Plain).
     *
     * Controls the format of the extracted content:
     * - {@code Plain}: Raw extracted text (default)
     * - {@code Markdown}: Markdown formatted output
     * - {@code Djot}: Djot markup format (requires djot feature)
     * - {@code Html}: HTML formatted output
     *
     * When set to a structured format, extraction results will include
     * formatted output. The {@code formatted_content} field may be populated
     * when format conversion is applied.
     */
    @Nullable @JsonProperty("output_format") OutputFormat outputFormat,
    /**
     * Layout detection configuration ({@code null} = layout detection disabled).
     *
     * When set, PDF pages and images are analyzed for document structure
     * (headings, code, formulas, tables, figures, etc.) using RT-DETR models
     * via ONNX Runtime. For PDFs, layout hints override paragraph classification
     * in the markdown pipeline. For images, per-region OCR is performed with
     * markdown formatting based on detected layout classes.
     * Requires the {@code layout-detection} feature to run inference; the field is
     * present whenever the {@code layout-types} feature is active (which includes
     * {@code layout-detection} as well as the no-ORT target groups).
     */
    @Nullable @JsonProperty("layout") LayoutDetectionConfig layout,
    /**
     * Runs layout detection on the non-OCR PDF markdown path.
     *
     * When {@code true} and {@code layout} is set, layout regions inform heading,
     * table, list, and figure detection in the structure pipeline that would
     * otherwise rely on font-clustering heuristics alone. Significantly
     * improves SF1 (structural F1) at the cost of inference latency
     * (~150-300ms/page CPU, ~20-50ms/page GPU). Default: {@code false}.
     * Requires the {@code layout-detection} feature.
     */
    @Nullable @JsonProperty("use_layout_for_markdown") Boolean useLayoutForMarkdown,
    /**
     * Enables structured document tree output.
     *
     * When true, populates the {@code document} field on {@code ExtractionResult} with a
     * hierarchical {@code DocumentStructure} containing heading-driven section nesting,
     * table grids, content layer classification, and inline annotations.
     *
     * Independent of {@code result_format} — can be combined with Unified or ElementBased.
     */
    @Nullable @JsonProperty("include_document_structure") Boolean includeDocumentStructure,
    /**
     * Hardware acceleration configuration for ONNX Runtime models.
     *
     * Controls execution provider selection for layout detection and embedding
     * models. When {@code null}, uses platform defaults (CoreML on macOS, CUDA on
     * Linux, CPU on Windows).
     */
    @Nullable @JsonProperty("acceleration") AccelerationConfig acceleration,
    /**
     * Cache namespace for tenant isolation.
     *
     * When set, cache entries are stored under {@code {cache_dir}/{namespace}/}.
     * Must be alphanumeric, hyphens, or underscores only (max 64 chars).
     * Different namespaces have isolated cache spaces on the same filesystem.
     */
    @Nullable @JsonProperty("cache_namespace") String cacheNamespace,
    /**
     * Per-request cache TTL in seconds.
     *
     * Overrides the global {@code max_age_days} for this specific extraction.
     * When {@code 0}, caching is completely skipped (no read or write).
     * When {@code null}, the global TTL applies.
     */
    @Nullable @JsonProperty("cache_ttl_secs") Long cacheTtlSecs,
    /**
     * Email extraction configuration ({@code null} = use defaults).
     *
     * Currently supports configuring the fallback codepage for MSG files
     * that do not specify one. See {@code EmailConfig} for details.
     */
    @Nullable @JsonProperty("email") EmailConfig email,
    /**
     * Concurrency limits for constrained environments ({@code null} = use defaults).
     *
     * Controls Rayon thread pool size, ONNX Runtime intra-op threads, and
     * (when {@code max_concurrent_extractions} is unset) the batch concurrency
     * semaphore. See {@code ConcurrencyConfig} for details.
     */
    @Nullable @JsonProperty("concurrency") String concurrency,
    /**
     * Maximum recursion depth for archive extraction (default: 3).
     * Set to 0 to disable recursive extraction (legacy behavior).
     */
    @Nullable @JsonProperty("max_archive_depth") Long maxArchiveDepth,
    /**
     * Tree-sitter language pack configuration ({@code null} = tree-sitter disabled).
     *
     * When set, enables code file extraction using tree-sitter parsers.
     * Controls grammar download behavior and code analysis options.
     */
    @Nullable @JsonProperty("tree_sitter") TreeSitterConfig treeSitter,
    /**
     * Structured extraction via LLM ({@code null} = disabled).
     *
     * When set, the extracted document content is sent to an LLM with the
     * provided JSON schema. The structured response is stored in
     * {@code ExtractionResult.structured_output}.
     */
    @Nullable @JsonProperty("structured_extraction") StructuredExtractionConfig structuredExtraction,
    /**
     * Cancellation token for this extraction ({@code null} = no external cancellation).
     *
     * Pass a CancellationToken clone here and call CancellationToken.cancel
     * from another thread / task to abort the extraction in progress. The extractor
     * checks the token at safe checkpoints (before lock acquisition, between pages,
     * between batch items) and returns KreuzbergError.Cancelled when set.
     *
     * The upstream documentation states the field is excluded from serialization because
     * {@code CancellationToken} is a runtime handle, not a configuration value.
     * NOTE(review): this component nevertheless carries {@code @JsonProperty("cancel_token")} — confirm intent.
     */
    @Nullable @JsonProperty("cancel_token") String cancelToken
) {
    /**
     * Entry point for fluent construction.
     *
     * @return a new {@link Builder} holding its default field values
     */
    public static Builder builder() {
        return new Builder();
    }

    // CPD-OFF
    /**
     * Fluent, mutable builder for {@link ExtractionConfig}; Jackson deserializes through it
     * (setter prefix {@code with}, build method {@code build}).
     *
     * Every field starts as {@code null} except {@code resultFormat}
     * ({@code ResultFormat.Unified}) and {@code outputFormat} ({@code OutputFormat.Plain}).
     */
    @JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
    public static final class Builder {
        // Reference-typed fields without an initializer start out as null.
        @JsonProperty("use_cache") private Boolean useCache;
        @JsonProperty("enable_quality_processing") private Boolean enableQualityProcessing;
        @Nullable private OcrConfig ocr;
        @JsonProperty("force_ocr") private Boolean forceOcr;
        @JsonProperty("force_ocr_pages") private List<Integer> forceOcrPages;
        @JsonProperty("disable_ocr") private Boolean disableOcr;
        @Nullable private ChunkingConfig chunking;
        @JsonProperty("content_filter") @Nullable private ContentFilterConfig contentFilter;
        @Nullable private ImageExtractionConfig images;
        @JsonProperty("pdf_options") @Nullable private PdfConfig pdfOptions;
        @JsonProperty("token_reduction") @Nullable private TokenReductionOptions tokenReduction;
        @JsonProperty("language_detection") @Nullable private LanguageDetectionConfig languageDetection;
        @Nullable private PageConfig pages;
        @Nullable private KeywordConfig keywords;
        @Nullable private PostProcessorConfig postprocessor;
        @JsonProperty("html_options") private String htmlOptions;
        @JsonProperty("html_output") @Nullable private HtmlOutputConfig htmlOutput;
        @JsonProperty("extraction_timeout_secs") private Long extractionTimeoutSecs;
        @JsonProperty("max_concurrent_extractions") private Long maxConcurrentExtractions;
        @JsonProperty("result_format") @Nullable private ResultFormat resultFormat = ResultFormat.Unified;
        @JsonProperty("security_limits") @Nullable private SecurityLimits securityLimits;
        @JsonProperty("max_embedded_file_bytes") private Long maxEmbeddedFileBytes;
        @JsonProperty("output_format") @Nullable private OutputFormat outputFormat = OutputFormat.Plain;
        @Nullable private LayoutDetectionConfig layout;
        @JsonProperty("use_layout_for_markdown") private Boolean useLayoutForMarkdown;
        @JsonProperty("include_document_structure") private Boolean includeDocumentStructure;
        @Nullable private AccelerationConfig acceleration;
        @JsonProperty("cache_namespace") private String cacheNamespace;
        @JsonProperty("cache_ttl_secs") private Long cacheTtlSecs;
        @Nullable private EmailConfig email;
        private String concurrency;
        @JsonProperty("max_archive_depth") private Long maxArchiveDepth;
        @JsonProperty("tree_sitter") @Nullable private TreeSitterConfig treeSitter;
        @JsonProperty("structured_extraction") @Nullable private StructuredExtractionConfig structuredExtraction;
        @JsonProperty("cancel_token") private String cancelToken;

        /** Records {@code value} as useCache and returns this builder. */
        @JsonProperty("use_cache")
        public Builder withUseCache(final @Nullable Boolean value) {
            useCache = value;
            return this;
        }

        /** Records {@code value} as enableQualityProcessing and returns this builder. */
        @JsonProperty("enable_quality_processing")
        public Builder withEnableQualityProcessing(final @Nullable Boolean value) {
            enableQualityProcessing = value;
            return this;
        }

        /** Records {@code value} as ocr and returns this builder. */
        @JsonProperty("ocr")
        public Builder withOcr(final @Nullable OcrConfig value) {
            ocr = value;
            return this;
        }

        /** Records {@code value} as forceOcr and returns this builder. */
        @JsonProperty("force_ocr")
        public Builder withForceOcr(final @Nullable Boolean value) {
            forceOcr = value;
            return this;
        }

        /** Records {@code value} as forceOcrPages (the list is stored without copying) and returns this builder. */
        @JsonProperty("force_ocr_pages")
        public Builder withForceOcrPages(final @Nullable List<Integer> value) {
            forceOcrPages = value;
            return this;
        }

        /** Records {@code value} as disableOcr and returns this builder. */
        @JsonProperty("disable_ocr")
        public Builder withDisableOcr(final @Nullable Boolean value) {
            disableOcr = value;
            return this;
        }

        /** Records {@code value} as chunking and returns this builder. */
        @JsonProperty("chunking")
        public Builder withChunking(final @Nullable ChunkingConfig value) {
            chunking = value;
            return this;
        }

        /** Records {@code value} as contentFilter and returns this builder. */
        @JsonProperty("content_filter")
        public Builder withContentFilter(final @Nullable ContentFilterConfig value) {
            contentFilter = value;
            return this;
        }

        /** Records {@code value} as images and returns this builder. */
        @JsonProperty("images")
        public Builder withImages(final @Nullable ImageExtractionConfig value) {
            images = value;
            return this;
        }

        /** Records {@code value} as pdfOptions and returns this builder. */
        @JsonProperty("pdf_options")
        public Builder withPdfOptions(final @Nullable PdfConfig value) {
            pdfOptions = value;
            return this;
        }

        /** Records {@code value} as tokenReduction and returns this builder. */
        @JsonProperty("token_reduction")
        public Builder withTokenReduction(final @Nullable TokenReductionOptions value) {
            tokenReduction = value;
            return this;
        }

        /** Records {@code value} as languageDetection and returns this builder. */
        @JsonProperty("language_detection")
        public Builder withLanguageDetection(final @Nullable LanguageDetectionConfig value) {
            languageDetection = value;
            return this;
        }

        /** Records {@code value} as pages and returns this builder. */
        @JsonProperty("pages")
        public Builder withPages(final @Nullable PageConfig value) {
            pages = value;
            return this;
        }

        /** Records {@code value} as keywords and returns this builder. */
        @JsonProperty("keywords")
        public Builder withKeywords(final @Nullable KeywordConfig value) {
            keywords = value;
            return this;
        }

        /** Records {@code value} as postprocessor and returns this builder. */
        @JsonProperty("postprocessor")
        public Builder withPostprocessor(final @Nullable PostProcessorConfig value) {
            postprocessor = value;
            return this;
        }

        /** Records {@code value} as htmlOptions and returns this builder. */
        @JsonProperty("html_options")
        public Builder withHtmlOptions(final @Nullable String value) {
            htmlOptions = value;
            return this;
        }

        /** Records {@code value} as htmlOutput and returns this builder. */
        @JsonProperty("html_output")
        public Builder withHtmlOutput(final @Nullable HtmlOutputConfig value) {
            htmlOutput = value;
            return this;
        }

        /** Records {@code value} as extractionTimeoutSecs and returns this builder. */
        @JsonProperty("extraction_timeout_secs")
        public Builder withExtractionTimeoutSecs(final @Nullable Long value) {
            extractionTimeoutSecs = value;
            return this;
        }

        /** Records {@code value} as maxConcurrentExtractions and returns this builder. */
        @JsonProperty("max_concurrent_extractions")
        public Builder withMaxConcurrentExtractions(final @Nullable Long value) {
            maxConcurrentExtractions = value;
            return this;
        }

        /** Records {@code value} as resultFormat (replacing the {@code Unified} default) and returns this builder. */
        @JsonProperty("result_format")
        public Builder withResultFormat(final @Nullable ResultFormat value) {
            resultFormat = value;
            return this;
        }

        /** Records {@code value} as securityLimits and returns this builder. */
        @JsonProperty("security_limits")
        public Builder withSecurityLimits(final @Nullable SecurityLimits value) {
            securityLimits = value;
            return this;
        }

        /** Records {@code value} as maxEmbeddedFileBytes and returns this builder. */
        @JsonProperty("max_embedded_file_bytes")
        public Builder withMaxEmbeddedFileBytes(final @Nullable Long value) {
            maxEmbeddedFileBytes = value;
            return this;
        }

        /** Records {@code value} as outputFormat (replacing the {@code Plain} default) and returns this builder. */
        @JsonProperty("output_format")
        public Builder withOutputFormat(final @Nullable OutputFormat value) {
            outputFormat = value;
            return this;
        }

        /** Records {@code value} as layout and returns this builder. */
        @JsonProperty("layout")
        public Builder withLayout(final @Nullable LayoutDetectionConfig value) {
            layout = value;
            return this;
        }

        /** Records {@code value} as useLayoutForMarkdown and returns this builder. */
        @JsonProperty("use_layout_for_markdown")
        public Builder withUseLayoutForMarkdown(final @Nullable Boolean value) {
            useLayoutForMarkdown = value;
            return this;
        }

        /** Records {@code value} as includeDocumentStructure and returns this builder. */
        @JsonProperty("include_document_structure")
        public Builder withIncludeDocumentStructure(final @Nullable Boolean value) {
            includeDocumentStructure = value;
            return this;
        }

        /** Records {@code value} as acceleration and returns this builder. */
        @JsonProperty("acceleration")
        public Builder withAcceleration(final @Nullable AccelerationConfig value) {
            acceleration = value;
            return this;
        }

        /** Records {@code value} as cacheNamespace and returns this builder. */
        @JsonProperty("cache_namespace")
        public Builder withCacheNamespace(final @Nullable String value) {
            cacheNamespace = value;
            return this;
        }

        /** Records {@code value} as cacheTtlSecs and returns this builder. */
        @JsonProperty("cache_ttl_secs")
        public Builder withCacheTtlSecs(final @Nullable Long value) {
            cacheTtlSecs = value;
            return this;
        }

        /** Records {@code value} as email and returns this builder. */
        @JsonProperty("email")
        public Builder withEmail(final @Nullable EmailConfig value) {
            email = value;
            return this;
        }

        /** Records {@code value} as concurrency and returns this builder. */
        @JsonProperty("concurrency")
        public Builder withConcurrency(final @Nullable String value) {
            concurrency = value;
            return this;
        }

        /** Records {@code value} as maxArchiveDepth and returns this builder. */
        @JsonProperty("max_archive_depth")
        public Builder withMaxArchiveDepth(final @Nullable Long value) {
            maxArchiveDepth = value;
            return this;
        }

        /** Records {@code value} as treeSitter and returns this builder. */
        @JsonProperty("tree_sitter")
        public Builder withTreeSitter(final @Nullable TreeSitterConfig value) {
            treeSitter = value;
            return this;
        }

        /** Records {@code value} as structuredExtraction and returns this builder. */
        @JsonProperty("structured_extraction")
        public Builder withStructuredExtraction(final @Nullable StructuredExtractionConfig value) {
            structuredExtraction = value;
            return this;
        }

        /** Records {@code value} as cancelToken and returns this builder. */
        @JsonProperty("cancel_token")
        public Builder withCancelToken(final @Nullable String value) {
            cancelToken = value;
            return this;
        }

        /**
         * Assembles an {@link ExtractionConfig} from the builder's current state.
         *
         * @return a new record; arguments are supplied in record-component order
         */
        public ExtractionConfig build() {
            return new ExtractionConfig(
                useCache, enableQualityProcessing, ocr, forceOcr, forceOcrPages,
                disableOcr, chunking, contentFilter, images, pdfOptions,
                tokenReduction, languageDetection, pages, keywords, postprocessor,
                htmlOptions, htmlOutput, extractionTimeoutSecs, maxConcurrentExtractions, resultFormat,
                securityLimits, maxEmbeddedFileBytes, outputFormat, layout, useLayoutForMarkdown,
                includeDocumentStructure, acceleration, cacheNamespace, cacheTtlSecs, email,
                concurrency, maxArchiveDepth, treeSitter, structuredExtraction, cancelToken
            );
        }
    }
    // CPD-ON

    /**
     * Placeholder for a default-configuration factory.
     *
     * @return never returns normally
     * @throws UnsupportedOperationException always; the message directs callers to the {@link Builder}
     */
    public static ExtractionConfig defaultInstance() {
        throw new UnsupportedOperationException("defaultInstance is not yet bridged via JNI; use the Builder instead.");
    }
}

View File

@@ -0,0 +1,132 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import java.util.List;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
/**
 * Full set of differences between two {@code ExtractionResult} values, referred to
 * in the component docs as {@code a} (left side) and {@code b} (right side).
 *
 * <p>Instances can be assembled with {@link Builder}, which is also the type Jackson
 * is configured to use when deserializing this record.
 */
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = ExtractionDiff.Builder.class)
public record ExtractionDiff(
    /**
     * Hunks, in unified-diff form, for the {@code content} field.
     *
     * An empty list means the content is identical.
     */
    @JsonProperty("content_diff") List<DiffHunk> contentDiff,
    /**
     * Excess right-side tables: present in {@code b} but not in {@code a}, matched by index position.
     */
    @JsonProperty("tables_added") List<Table> tablesAdded,
    /**
     * Excess left-side tables: present in {@code a} but not in {@code b}, matched by index position.
     */
    @JsonProperty("tables_removed") List<Table> tablesRemoved,
    /**
     * Cell-level changes for pairs of tables that share both index and dimensions.
     */
    @JsonProperty("tables_changed") List<TableDiff> tablesChanged,
    /**
     * Metadata difference as a JSON object with three top-level keys:
     * {@code added} (keys present in {@code b} but not {@code a}), {@code removed} (keys present
     * in {@code a} but not {@code b}), and {@code changed} (keys whose values differ — each entry is
     * {@code { "from": &lt;value-in-a&gt;, "to": &lt;value-in-b&gt; }}).
     *
     * This shape is deliberately flatter than RFC 6902 JSON Patch and is NOT compatible
     * with it; the producer chose it to avoid depending on a json-patch library. When
     * RFC 6902 semantics (with JSON Pointer paths) are required, pass {@code a.metadata}
     * and {@code b.metadata} to a json-patch implementation directly.
     */
    @JsonProperty("metadata_changed") JsonNode metadataChanged,
    /**
     * Changes to embedded archive children.
     */
    @JsonProperty("embedded_changes") EmbeddedChanges embeddedChanges
) {
    /**
     * Creates a builder whose list fields start empty and whose
     * {@code metadataChanged} / {@code embeddedChanges} fields start as {@code null}.
     *
     * @return a new, independent builder
     */
    public static Builder builder() {
        return new Builder();
    }

    // CPD-OFF
    /**
     * Fluent, mutable builder for {@link ExtractionDiff}.
     *
     * <p>Every setter stores the reference it is given (no copying, no validation)
     * and returns the same builder so calls can be chained.
     */
    @JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
    public static final class Builder {
        @JsonProperty("content_diff")
        private List<DiffHunk> contentDiff = List.of();
        @JsonProperty("tables_added")
        private List<Table> tablesAdded = List.of();
        @JsonProperty("tables_removed")
        private List<Table> tablesRemoved = List.of();
        @JsonProperty("tables_changed")
        private List<TableDiff> tablesChanged = List.of();
        // The two object-valued fields have no empty default; they stay null until set.
        @JsonProperty("metadata_changed")
        private JsonNode metadataChanged;
        @JsonProperty("embedded_changes")
        private EmbeddedChanges embeddedChanges;

        /**
         * Replaces the content diff hunks.
         *
         * @param hunks value to store for {@code contentDiff}
         * @return this builder
         */
        @JsonProperty("content_diff")
        public Builder withContentDiff(final List<DiffHunk> hunks) {
            contentDiff = hunks;
            return this;
        }

        /**
         * Replaces the list of added tables.
         *
         * @param added value to store for {@code tablesAdded}
         * @return this builder
         */
        @JsonProperty("tables_added")
        public Builder withTablesAdded(final List<Table> added) {
            tablesAdded = added;
            return this;
        }

        /**
         * Replaces the list of removed tables.
         *
         * @param removed value to store for {@code tablesRemoved}
         * @return this builder
         */
        @JsonProperty("tables_removed")
        public Builder withTablesRemoved(final List<Table> removed) {
            tablesRemoved = removed;
            return this;
        }

        /**
         * Replaces the list of per-table cell changes.
         *
         * @param changed value to store for {@code tablesChanged}
         * @return this builder
         */
        @JsonProperty("tables_changed")
        public Builder withTablesChanged(final List<TableDiff> changed) {
            tablesChanged = changed;
            return this;
        }

        /**
         * Replaces the metadata difference document.
         *
         * @param node value to store for {@code metadataChanged}
         * @return this builder
         */
        @JsonProperty("metadata_changed")
        public Builder withMetadataChanged(final JsonNode node) {
            metadataChanged = node;
            return this;
        }

        /**
         * Replaces the embedded-children change set.
         *
         * @param changes value to store for {@code embeddedChanges}
         * @return this builder
         */
        @JsonProperty("embedded_changes")
        public Builder withEmbeddedChanges(final EmbeddedChanges changes) {
            embeddedChanges = changes;
            return this;
        }

        /**
         * Creates an {@link ExtractionDiff} from the values currently held by this builder.
         *
         * @return a new record; the builder can keep being used afterwards
         */
        public ExtractionDiff build() {
            return new ExtractionDiff(
                contentDiff,
                tablesAdded,
                tablesRemoved,
                tablesChanged,
                metadataChanged,
                embeddedChanges);
        }
    }
    // CPD-ON
}

View File

@@ -0,0 +1,48 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonValue;
/**
 * Identifies how the extracted text was produced.
 *
 * <p>Each constant carries the lowercase string that represents it in JSON; that
 * string is also what {@link #toString()} returns.
 */
public enum ExtractionMethod {
    Native("native"),
    Ocr("ocr"),
    Mixed("mixed");

    /** Wire-format string associated with this constant. */
    private final String value;

    ExtractionMethod(final String value) {
        this.value = value;
    }

    /**
     * Returns the string Jackson writes when serializing this constant.
     *
     * @return the wire-format string
     */
    @JsonValue
    public String getValue() {
        return this.value;
    }

    /**
     * Looks up the constant whose wire-format string equals {@code value}, ignoring case.
     *
     * @param value string to resolve; {@code null} matches nothing
     * @return the first constant, in declaration order, that matches
     * @throws IllegalArgumentException if no constant matches
     */
    @JsonCreator
    public static ExtractionMethod fromValue(final String value) {
        return java.util.Arrays.stream(ExtractionMethod.values())
            .filter(candidate -> candidate.value.equalsIgnoreCase(value))
            .findFirst()
            .orElseThrow(() -> new IllegalArgumentException("Unknown value: " + value));
    }

    /**
     * Returns the wire-format string, so the textual form matches JSON serialization.
     *
     * @return same value as {@link #getValue()}
     */
    @Override
    public String toString() {
        return getValue();
    }
}

View File

@@ -0,0 +1,480 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import java.util.List;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
import org.jspecify.annotations.Nullable;
/**
 * General extraction result used by the core extraction API.
 *
 * <p>This is the main result type returned by all extraction functions. Components
 * marked {@code @Nullable} are {@code null} when the corresponding feature did not
 * run or produced nothing.
 */
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = ExtractionResult.Builder.class)
public record ExtractionResult(
    /**
     * Extracted text content.
     */
    @JsonProperty("content") String content,
    /**
     * MIME type reported for this result.
     */
    @JsonProperty("mime_type") String mimeType,
    /**
     * Document metadata.
     */
    @JsonProperty("metadata") Metadata metadata,
    /**
     * Extraction strategy used to produce the returned text.
     *
     * Populated when the extractor can reliably distinguish native text extraction,
     * OCR-only extraction, or mixed native/OCR output.
     */
    @Nullable @JsonProperty("extraction_method") ExtractionMethod extractionMethod,
    /**
     * Tables extracted from the document.
     */
    @JsonProperty("tables") List<Table> tables,
    /**
     * Detected languages, or {@code null} when none were reported.
     */
    @Nullable @JsonProperty("detected_languages") List<String> detectedLanguages,
    /**
     * Text chunks when chunking is enabled.
     *
     * When chunking configuration is provided, the content is split into
     * overlapping chunks for efficient processing. Each chunk contains the text,
     * optional embeddings (if enabled), and metadata about its position.
     */
    @Nullable @JsonProperty("chunks") List<Chunk> chunks,
    /**
     * Extracted images from the document.
     *
     * When image extraction is enabled via {@code ImageExtractionConfig}, this field
     * contains all images found in the document with their raw data and metadata.
     * Each image may optionally contain a nested {@code ocr_result} if OCR was performed.
     */
    @Nullable @JsonProperty("images") List<ExtractedImage> images,
    /**
     * Per-page content when page extraction is enabled.
     *
     * When page extraction is configured, the document is split into per-page content
     * with tables and images mapped to their respective pages.
     */
    @Nullable @JsonProperty("pages") List<PageContent> pages,
    /**
     * Semantic elements when element-based result format is enabled.
     *
     * When result_format is set to ElementBased, this field contains semantic
     * elements with type classification, unique identifiers, and metadata for
     * Unstructured-compatible element-based processing.
     */
    @Nullable @JsonProperty("elements") List<Element> elements,
    /**
     * Rich Djot content structure (when extracting Djot documents).
     *
     * When extracting Djot documents with structured extraction enabled,
     * this field contains the full semantic structure including:
     * - Block-level elements with nesting
     * - Inline formatting with attributes
     * - Links, images, footnotes
     * - Math expressions
     * - Complete attribute information
     *
     * The {@code content} field still contains plain text for backward compatibility.
     *
     * Always {@code null} for non-Djot documents.
     */
    @Nullable @JsonProperty("djot_content") DjotContent djotContent,
    /**
     * OCR elements with full spatial and confidence metadata.
     *
     * When OCR is performed with element extraction enabled, this field contains
     * the structured representation of detected text including:
     * - Bounding geometry (rectangles or quadrilaterals)
     * - Confidence scores (detection and recognition)
     * - Rotation information
     * - Hierarchical relationships (Tesseract only)
     *
     * This field preserves all metadata that would otherwise be lost when
     * converting to plain text or markdown output formats.
     *
     * Only populated when {@code OcrElementConfig.include_elements} is true.
     */
    @Nullable @JsonProperty("ocr_elements") List<OcrElement> ocrElements,
    /**
     * Structured document tree (when document structure extraction is enabled).
     *
     * When {@code include_document_structure} is true in {@code ExtractionConfig}, this field
     * contains the full hierarchical representation of the document including:
     * - Heading-driven section nesting
     * - Table grids with cell-level metadata
     * - Content layer classification (body, header, footer, footnote)
     * - Inline text annotations (formatting, links)
     * - Bounding boxes and page numbers
     *
     * Independent of {@code result_format} — can be combined with Unified or ElementBased.
     */
    @Nullable @JsonProperty("document") DocumentStructure document,
    /**
     * Extracted keywords when keyword extraction is enabled.
     *
     * When keyword extraction (RAKE or YAKE) is configured, this field contains
     * the extracted keywords with scores, algorithm info, and position data.
     * Previously stored in {@code metadata.additional["keywords"]}.
     */
    @Nullable @JsonProperty("extracted_keywords") List<Keyword> extractedKeywords,
    /**
     * Document quality score from quality analysis.
     *
     * A value between 0.0 and 1.0 indicating the overall text quality.
     * Previously stored in {@code metadata.additional["quality_score"]}.
     */
    @Nullable @JsonProperty("quality_score") Double qualityScore,
    /**
     * Non-fatal warnings collected during processing pipeline stages.
     *
     * Captures errors from optional pipeline features (embedding, chunking,
     * language detection, output formatting) that don't prevent extraction
     * but may indicate degraded results.
     * Previously stored as individual keys in {@code metadata.additional}.
     */
    @Nullable @JsonProperty("processing_warnings") List<ProcessingWarning> processingWarnings,
    /**
     * PDF annotations extracted from the document.
     *
     * When annotation extraction is enabled via {@code PdfConfig.extract_annotations},
     * this field contains text notes, highlights, links, stamps, and other
     * annotations found in PDF documents.
     */
    @Nullable @JsonProperty("annotations") List<PdfAnnotation> annotations,
    /**
     * Nested extraction results from archive contents.
     *
     * When extracting archives, each processable file inside produces its own
     * full extraction result. Set to {@code null} for non-archive formats.
     * Use {@code max_archive_depth} in config to control recursion depth.
     */
    @Nullable @JsonProperty("children") List<ArchiveEntry> children,
    /**
     * URIs/links discovered during document extraction.
     *
     * Contains hyperlinks, image references, citations, email addresses, and
     * other URI-like references found in the document. Always extracted when
     * present in the source document.
     */
    @Nullable @JsonProperty("uris") List<ExtractedUri> uris,
    /**
     * Tracked changes embedded in the source document.
     *
     * Populated by per-format extractors that understand change-tracking
     * metadata (DOCX {@code w:ins}/{@code w:del}/{@code w:rPrChange}, ODT {@code text:change-*},
     * …). Every extractor defaults to {@code null} until its format-specific
     * implementation is added. Extractors that do populate this field follow
     * the "accepted-changes" convention: inserted text is present in
     * {@code content}, deleted text is absent — the revision list is the separate
     * audit trail.
     */
    @Nullable @JsonProperty("revisions") List<DocumentRevision> revisions,
    /**
     * Structured extraction output from LLM-based JSON schema extraction.
     *
     * When {@code structured_extraction} is configured in {@code ExtractionConfig}, the
     * extracted document content is sent to a VLM with the provided JSON schema.
     * The response is parsed and stored here as a JSON value matching the schema.
     */
    @Nullable @JsonProperty("structured_output") JsonNode structuredOutput,
    /**
     * Code intelligence results from tree-sitter analysis.
     *
     * Populated when extracting source code files with the {@code tree-sitter} feature.
     * Contains metrics, structural analysis, imports/exports, comments,
     * docstrings, symbols, diagnostics, and optionally chunked code segments.
     *
     * Stored as an opaque JSON value so that all language bindings (Go, Java,
     * C#, …) can deserialize it as a raw JSON object rather than a typed struct.
     * The underlying type is {@code tree_sitter_language_pack.ProcessResult}.
     */
    @Nullable @JsonProperty("code_intelligence") JsonNode codeIntelligence,
    /**
     * LLM token usage and cost data for all LLM calls made during this extraction.
     *
     * Contains one entry per LLM call. Multiple entries are produced when
     * VLM OCR, structured extraction, or LLM embeddings run during
     * the same extraction.
     *
     * {@code null} when no LLM was used.
     */
    @Nullable @JsonProperty("llm_usage") List<LlmUsage> llmUsage,
    /**
     * Pre-rendered content in the requested output format.
     *
     * Populated during {@code derive_extraction_result} before tree derivation consumes
     * element data. {@code apply_output_format} swaps this into {@code content} at the end
     * of the pipeline, after post-processors have operated on plain text.
     */
    @Nullable @JsonProperty("formatted_content") String formattedContent,
    /**
     * Structured hOCR document for the OCR+layout pipeline.
     *
     * When tesseract produces hOCR output, the parsed {@code InternalDocument} carries
     * paragraph structure with bounding boxes and confidence scores. The layout
     * classification step enriches these elements before final rendering.
     */
    @Nullable @JsonProperty("ocr_internal_document") String ocrInternalDocument
) {
    /**
     * Creates a builder pre-populated with this type's defaults: empty strings for
     * {@code content} and {@code mimeType}, an empty {@code tables} list, and
     * {@code null} for everything else.
     *
     * @return a new, independent builder
     */
    public static Builder builder() {
        return new Builder();
    }

    // CPD-OFF
    /**
     * Fluent, mutable builder for {@link ExtractionResult}; also the type Jackson is
     * configured to use when deserializing the record.
     *
     * <p>Setters store the reference they are given (no copying, no validation) and
     * return the same builder. Fields backing {@code @Nullable} components are
     * annotated {@code @Nullable} because they legitimately hold {@code null} until set.
     */
    @JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
    public static final class Builder {
        private String content = "";
        @JsonProperty("mime_type")
        private String mimeType = "";
        // NOTE(review): the record component is not @Nullable, yet no default exists
        // here; this stays null unless withMetadata(...) is called or JSON supplies it.
        private Metadata metadata = null;
        @JsonProperty("extraction_method")
        @Nullable private ExtractionMethod extractionMethod = null;
        private List<Table> tables = List.of();
        @JsonProperty("detected_languages")
        @Nullable private List<String> detectedLanguages = null;
        @Nullable private List<Chunk> chunks = null;
        @Nullable private List<ExtractedImage> images = null;
        @Nullable private List<PageContent> pages = null;
        @Nullable private List<Element> elements = null;
        @JsonProperty("djot_content")
        @Nullable private DjotContent djotContent = null;
        @JsonProperty("ocr_elements")
        @Nullable private List<OcrElement> ocrElements = null;
        @Nullable private DocumentStructure document = null;
        @JsonProperty("extracted_keywords")
        @Nullable private List<Keyword> extractedKeywords = null;
        @JsonProperty("quality_score")
        @Nullable private Double qualityScore = null;
        @JsonProperty("processing_warnings")
        @Nullable private List<ProcessingWarning> processingWarnings = null;
        @Nullable private List<PdfAnnotation> annotations = null;
        @Nullable private List<ArchiveEntry> children = null;
        @Nullable private List<ExtractedUri> uris = null;
        @Nullable private List<DocumentRevision> revisions = null;
        @JsonProperty("structured_output")
        @Nullable private JsonNode structuredOutput = null;
        @JsonProperty("code_intelligence")
        @Nullable private JsonNode codeIntelligence = null;
        @JsonProperty("llm_usage")
        @Nullable private List<LlmUsage> llmUsage = null;
        @JsonProperty("formatted_content")
        @Nullable private String formattedContent = null;
        @JsonProperty("ocr_internal_document")
        @Nullable private String ocrInternalDocument = null;

        /** Sets the content field. */
        @JsonProperty("content")
        public Builder withContent(final String value) {
            this.content = value;
            return this;
        }

        /** Sets the mimeType field. */
        @JsonProperty("mime_type")
        public Builder withMimeType(final String value) {
            this.mimeType = value;
            return this;
        }

        /** Sets the metadata field. */
        @JsonProperty("metadata")
        public Builder withMetadata(final Metadata value) {
            this.metadata = value;
            return this;
        }

        /** Sets the extractionMethod field; {@code null} clears it. */
        @JsonProperty("extraction_method")
        public Builder withExtractionMethod(final @Nullable ExtractionMethod value) {
            this.extractionMethod = value;
            return this;
        }

        /** Sets the tables field. */
        @JsonProperty("tables")
        public Builder withTables(final List<Table> value) {
            this.tables = value;
            return this;
        }

        /** Sets the detectedLanguages field; {@code null} clears it. */
        @JsonProperty("detected_languages")
        public Builder withDetectedLanguages(final @Nullable List<String> value) {
            this.detectedLanguages = value;
            return this;
        }

        /** Sets the chunks field; {@code null} clears it. */
        @JsonProperty("chunks")
        public Builder withChunks(final @Nullable List<Chunk> value) {
            this.chunks = value;
            return this;
        }

        /** Sets the images field; {@code null} clears it. */
        @JsonProperty("images")
        public Builder withImages(final @Nullable List<ExtractedImage> value) {
            this.images = value;
            return this;
        }

        /** Sets the pages field; {@code null} clears it. */
        @JsonProperty("pages")
        public Builder withPages(final @Nullable List<PageContent> value) {
            this.pages = value;
            return this;
        }

        /** Sets the elements field; {@code null} clears it. */
        @JsonProperty("elements")
        public Builder withElements(final @Nullable List<Element> value) {
            this.elements = value;
            return this;
        }

        /** Sets the djotContent field; {@code null} clears it. */
        @JsonProperty("djot_content")
        public Builder withDjotContent(final @Nullable DjotContent value) {
            this.djotContent = value;
            return this;
        }

        /** Sets the ocrElements field; {@code null} clears it. */
        @JsonProperty("ocr_elements")
        public Builder withOcrElements(final @Nullable List<OcrElement> value) {
            this.ocrElements = value;
            return this;
        }

        /** Sets the document field; {@code null} clears it. */
        @JsonProperty("document")
        public Builder withDocument(final @Nullable DocumentStructure value) {
            this.document = value;
            return this;
        }

        /** Sets the extractedKeywords field; {@code null} clears it. */
        @JsonProperty("extracted_keywords")
        public Builder withExtractedKeywords(final @Nullable List<Keyword> value) {
            this.extractedKeywords = value;
            return this;
        }

        /** Sets the qualityScore field; {@code null} clears it. */
        @JsonProperty("quality_score")
        public Builder withQualityScore(final @Nullable Double value) {
            this.qualityScore = value;
            return this;
        }

        /** Sets the processingWarnings field; {@code null} clears it. */
        @JsonProperty("processing_warnings")
        public Builder withProcessingWarnings(final @Nullable List<ProcessingWarning> value) {
            this.processingWarnings = value;
            return this;
        }

        /** Sets the annotations field; {@code null} clears it. */
        @JsonProperty("annotations")
        public Builder withAnnotations(final @Nullable List<PdfAnnotation> value) {
            this.annotations = value;
            return this;
        }

        /** Sets the children field; {@code null} clears it. */
        @JsonProperty("children")
        public Builder withChildren(final @Nullable List<ArchiveEntry> value) {
            this.children = value;
            return this;
        }

        /** Sets the uris field; {@code null} clears it. */
        @JsonProperty("uris")
        public Builder withUris(final @Nullable List<ExtractedUri> value) {
            this.uris = value;
            return this;
        }

        /** Sets the revisions field; {@code null} clears it. */
        @JsonProperty("revisions")
        public Builder withRevisions(final @Nullable List<DocumentRevision> value) {
            this.revisions = value;
            return this;
        }

        /** Sets the structuredOutput field; {@code null} clears it. */
        @JsonProperty("structured_output")
        public Builder withStructuredOutput(final @Nullable JsonNode value) {
            this.structuredOutput = value;
            return this;
        }

        /** Sets the codeIntelligence field; {@code null} clears it. */
        @JsonProperty("code_intelligence")
        public Builder withCodeIntelligence(final @Nullable JsonNode value) {
            this.codeIntelligence = value;
            return this;
        }

        /** Sets the llmUsage field; {@code null} clears it. */
        @JsonProperty("llm_usage")
        public Builder withLlmUsage(final @Nullable List<LlmUsage> value) {
            this.llmUsage = value;
            return this;
        }

        /** Sets the formattedContent field; {@code null} clears it. */
        @JsonProperty("formatted_content")
        public Builder withFormattedContent(final @Nullable String value) {
            this.formattedContent = value;
            return this;
        }

        /** Sets the ocrInternalDocument field; {@code null} clears it. */
        @JsonProperty("ocr_internal_document")
        public Builder withOcrInternalDocument(final @Nullable String value) {
            this.ocrInternalDocument = value;
            return this;
        }

        /**
         * Builds the ExtractionResult instance from the values currently held.
         *
         * <p>Arguments are passed positionally and must stay in the same order as
         * the record components declared above.
         *
         * @return a new record; the builder can keep being used afterwards
         */
        public ExtractionResult build() {
            return new ExtractionResult(
                content,
                mimeType,
                metadata,
                extractionMethod,
                tables,
                detectedLanguages,
                chunks,
                images,
                pages,
                elements,
                djotContent,
                ocrElements,
                document,
                extractedKeywords,
                qualityScore,
                processingWarnings,
                annotations,
                children,
                uris,
                revisions,
                structuredOutput,
                codeIntelligence,
                llmUsage,
                formattedContent,
                ocrInternalDocument
            );
        }
    }
    // CPD-ON

    /**
     * Convert from an OCR result.
     *
     * <p>Not implemented in this binding: the call fails unconditionally and the
     * message directs callers to the Builder.
     *
     * @param ocr the OCR result to convert (currently unused)
     * @return never returns normally
     * @throws UnsupportedOperationException always
     */
    public static ExtractionResult fromOcr(OcrExtractionResult ocr) {
        throw new UnsupportedOperationException("fromOcr is not yet bridged via JNI; use the Builder instead.");
    }
}

View File

@@ -0,0 +1,68 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import java.util.List;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
import org.jspecify.annotations.Nullable;
/**
 * FictionBook (FB2) metadata.
 *
 * <p>All three components are optional and are {@code null} when not provided.
 */
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = FictionBookMetadata.Builder.class)
public record FictionBookMetadata(
    /**
     * Value of the {@code genres} JSON property; may be {@code null}.
     */
    @Nullable @JsonProperty("genres") List<String> genres,
    /**
     * Value of the {@code sequences} JSON property; may be {@code null}.
     */
    @Nullable @JsonProperty("sequences") List<String> sequences,
    /**
     * Value of the {@code annotation} JSON property; may be {@code null}.
     */
    @Nullable @JsonProperty("annotation") String annotation
) {
    /**
     * Creates a builder in which every field starts as {@code null}.
     *
     * @return a new, independent builder
     */
    public static Builder builder() {
        return new Builder();
    }

    // CPD-OFF
    /**
     * Fluent, mutable builder for {@link FictionBookMetadata}; also the type Jackson
     * is configured to use when deserializing the record.
     *
     * <p>The fields are annotated {@code @Nullable} to match the record components
     * and setter parameters, since they hold {@code null} until a setter is called.
     */
    @JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
    public static final class Builder {
        @Nullable private List<String> genres = null;
        @Nullable private List<String> sequences = null;
        @Nullable private String annotation = null;

        /**
         * Sets the genres field.
         *
         * @param value list to store as given, or {@code null} to clear it
         * @return this builder
         */
        @JsonProperty("genres")
        public Builder withGenres(final @Nullable List<String> value) {
            this.genres = value;
            return this;
        }

        /**
         * Sets the sequences field.
         *
         * @param value list to store as given, or {@code null} to clear it
         * @return this builder
         */
        @JsonProperty("sequences")
        public Builder withSequences(final @Nullable List<String> value) {
            this.sequences = value;
            return this;
        }

        /**
         * Sets the annotation field.
         *
         * @param value text to store, or {@code null} to clear it
         * @return this builder
         */
        @JsonProperty("annotation")
        public Builder withAnnotation(final @Nullable String value) {
            this.annotation = value;
            return this;
        }

        /**
         * Builds the FictionBookMetadata instance from the values currently held.
         *
         * @return a new record; the builder can keep being used afterwards
         */
        public FictionBookMetadata build() {
            return new FictionBookMetadata(
                genres,
                sequences,
                annotation
            );
        }
    }
    // CPD-ON
}

View File

@@ -0,0 +1,361 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import java.util.List;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
import org.jspecify.annotations.Nullable;
/**
* Per-file extraction configuration overrides for batch processing.
*
* All fields are {@code Option&lt;T&gt;} — {@code None} means "use the batch-level default."
* This type is used with {@code batch_extract_files} and
* {@code batch_extract_bytes} to allow heterogeneous
* extraction settings within a single batch.
*
* # Excluded Fields
*
* The following {@code ExtractionConfig} fields are batch-level only and
* cannot be overridden per file:
* - {@code max_concurrent_extractions} — controls batch parallelism
* - {@code use_cache} — global caching policy
* - {@code acceleration} — shared ONNX execution provider
* - {@code security_limits} — global archive security policy
*/
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = FileExtractionConfig.Builder.class)
public record FileExtractionConfig(
/**
* Override quality post-processing for this file.
*/
@Nullable @JsonProperty("enable_quality_processing") Boolean enableQualityProcessing,
/**
* Override OCR configuration for this file (null in the Option = use batch default).
*/
@Nullable @JsonProperty("ocr") OcrConfig ocr,
/**
* Override force OCR for this file.
*/
@Nullable @JsonProperty("force_ocr") Boolean forceOcr,
/**
* Override force OCR pages for this file (1-indexed page numbers).
*/
@Nullable @JsonProperty("force_ocr_pages") List<Integer> forceOcrPages,
/**
* Override disable OCR for this file.
*/
@Nullable @JsonProperty("disable_ocr") Boolean disableOcr,
/**
* Override chunking configuration for this file.
*/
@Nullable @JsonProperty("chunking") ChunkingConfig chunking,
/**
* Override content filtering configuration for this file.
*/
@Nullable @JsonProperty("content_filter") ContentFilterConfig contentFilter,
/**
* Override image extraction configuration for this file.
*/
@Nullable @JsonProperty("images") ImageExtractionConfig images,
/**
* Override PDF options for this file.
*/
@Nullable @JsonProperty("pdf_options") PdfConfig pdfOptions,
/**
* Override token reduction for this file.
*/
@Nullable @JsonProperty("token_reduction") TokenReductionOptions tokenReduction,
/**
* Override language detection for this file.
*/
@Nullable @JsonProperty("language_detection") LanguageDetectionConfig languageDetection,
/**
* Override page extraction for this file.
*/
@Nullable @JsonProperty("pages") PageConfig pages,
/**
* Override keyword extraction for this file.
*/
@Nullable @JsonProperty("keywords") KeywordConfig keywords,
/**
* Override post-processor for this file.
*/
@Nullable @JsonProperty("postprocessor") PostProcessorConfig postprocessor,
/**
* Override HTML conversion options for this file.
*/
@Nullable @JsonProperty("html_options") String htmlOptions,
/**
* Override result format for this file.
*/
@Nullable @JsonProperty("result_format") ResultFormat resultFormat,
/**
* Override output content format for this file.
*/
@Nullable @JsonProperty("output_format") OutputFormat outputFormat,
/**
* Override document structure output for this file.
*/
@Nullable @JsonProperty("include_document_structure") Boolean includeDocumentStructure,
/**
* Override layout detection for this file.
*/
@Nullable @JsonProperty("layout") LayoutDetectionConfig layout,
/**
* Override per-file extraction timeout in seconds.
*
* When set, the extraction for this file will be canceled after the
* specified duration. A timed-out file produces an error result without
* affecting other files in the batch.
*/
@Nullable @JsonProperty("timeout_secs") Long timeoutSecs,
/**
* Override tree-sitter configuration for this file.
*/
@Nullable @JsonProperty("tree_sitter") TreeSitterConfig treeSitter,
/**
* Override structured extraction configuration for this file.
*
* When set, enables LLM-based structured extraction with a JSON schema
* for this specific file. The extracted content is sent to a VLM/LLM
* and the response is parsed according to the provided schema.
*/
@Nullable @JsonProperty("structured_extraction") StructuredExtractionConfig structuredExtraction
) {
public static Builder builder() {
return new Builder();
}
// CPD-OFF
@JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
public static final class Builder {
@JsonProperty("enable_quality_processing")
private Boolean enableQualityProcessing = null;
private OcrConfig ocr = null;
@JsonProperty("force_ocr")
private Boolean forceOcr = null;
@JsonProperty("force_ocr_pages")
private List<Integer> forceOcrPages = null;
@JsonProperty("disable_ocr")
private Boolean disableOcr = null;
private ChunkingConfig chunking = null;
@JsonProperty("content_filter")
private ContentFilterConfig contentFilter = null;
private ImageExtractionConfig images = null;
@JsonProperty("pdf_options")
private PdfConfig pdfOptions = null;
@JsonProperty("token_reduction")
private TokenReductionOptions tokenReduction = null;
@JsonProperty("language_detection")
private LanguageDetectionConfig languageDetection = null;
private PageConfig pages = null;
private KeywordConfig keywords = null;
private PostProcessorConfig postprocessor = null;
@JsonProperty("html_options")
private String htmlOptions = null;
@JsonProperty("result_format")
private ResultFormat resultFormat = null;
@JsonProperty("output_format")
private OutputFormat outputFormat = null;
@JsonProperty("include_document_structure")
private Boolean includeDocumentStructure = null;
private LayoutDetectionConfig layout = null;
@JsonProperty("timeout_secs")
private Long timeoutSecs = null;
@JsonProperty("tree_sitter")
private TreeSitterConfig treeSitter = null;
@JsonProperty("structured_extraction")
private StructuredExtractionConfig structuredExtraction = null;
/** Sets the enableQualityProcessing field. */
@JsonProperty("enable_quality_processing")
public Builder withEnableQualityProcessing(final @Nullable boolean value) {
this.enableQualityProcessing = value;
return this;
}
/** Sets the ocr field. */
@JsonProperty("ocr")
public Builder withOcr(final @Nullable OcrConfig value) {
this.ocr = value;
return this;
}
/** Sets the forceOcr field. */
@JsonProperty("force_ocr")
public Builder withForceOcr(final @Nullable boolean value) {
this.forceOcr = value;
return this;
}
/** Sets the forceOcrPages field. */
@JsonProperty("force_ocr_pages")
public Builder withForceOcrPages(final @Nullable List<Integer> value) {
this.forceOcrPages = value;
return this;
}
/** Sets the disableOcr field. */
@JsonProperty("disable_ocr")
public Builder withDisableOcr(final @Nullable boolean value) {
this.disableOcr = value;
return this;
}
/** Sets the chunking field. */
@JsonProperty("chunking")
public Builder withChunking(final @Nullable ChunkingConfig value) {
this.chunking = value;
return this;
}
/** Sets the contentFilter field. */
@JsonProperty("content_filter")
public Builder withContentFilter(final @Nullable ContentFilterConfig value) {
this.contentFilter = value;
return this;
}
/** Sets the images field. */
@JsonProperty("images")
public Builder withImages(final @Nullable ImageExtractionConfig value) {
this.images = value;
return this;
}
/** Sets the pdfOptions field. */
@JsonProperty("pdf_options")
public Builder withPdfOptions(final @Nullable PdfConfig value) {
this.pdfOptions = value;
return this;
}
/** Sets the tokenReduction field. */
@JsonProperty("token_reduction")
public Builder withTokenReduction(final @Nullable TokenReductionOptions value) {
this.tokenReduction = value;
return this;
}
/** Sets the languageDetection field. */
@JsonProperty("language_detection")
public Builder withLanguageDetection(final @Nullable LanguageDetectionConfig value) {
this.languageDetection = value;
return this;
}
/** Sets the pages field. */
@JsonProperty("pages")
public Builder withPages(final @Nullable PageConfig value) {
this.pages = value;
return this;
}
/** Sets the keywords field. */
@JsonProperty("keywords")
public Builder withKeywords(final @Nullable KeywordConfig value) {
this.keywords = value;
return this;
}
/** Sets the postprocessor field. */
@JsonProperty("postprocessor")
public Builder withPostprocessor(final @Nullable PostProcessorConfig value) {
this.postprocessor = value;
return this;
}
/** Sets the htmlOptions field. */
@JsonProperty("html_options")
public Builder withHtmlOptions(final @Nullable String value) {
this.htmlOptions = value;
return this;
}
/** Sets the resultFormat field. */
@JsonProperty("result_format")
public Builder withResultFormat(final @Nullable ResultFormat value) {
this.resultFormat = value;
return this;
}
/** Sets the outputFormat field. */
@JsonProperty("output_format")
public Builder withOutputFormat(final @Nullable OutputFormat value) {
this.outputFormat = value;
return this;
}
/** Sets the includeDocumentStructure field. */
@JsonProperty("include_document_structure")
public Builder withIncludeDocumentStructure(final @Nullable boolean value) {
this.includeDocumentStructure = value;
return this;
}
/** Sets the layout field. */
@JsonProperty("layout")
public Builder withLayout(final @Nullable LayoutDetectionConfig value) {
this.layout = value;
return this;
}
/** Sets the timeoutSecs field. */
@JsonProperty("timeout_secs")
public Builder withTimeoutSecs(final @Nullable long value) {
this.timeoutSecs = value;
return this;
}
/** Sets the treeSitter field. */
@JsonProperty("tree_sitter")
public Builder withTreeSitter(final @Nullable TreeSitterConfig value) {
this.treeSitter = value;
return this;
}
/**
 * Sets the structuredExtraction field.
 *
 * @param value the structured extraction configuration to store; may be {@code null}
 * @return this builder, so calls can be chained
 */
@JsonProperty("structured_extraction")
public Builder withStructuredExtraction(final @Nullable StructuredExtractionConfig value) {
    structuredExtraction = value;
    return this;
}
/**
 * Builds the FileExtractionConfig instance.
 *
 * <p>Every builder field is handed to the record constructor in component order.
 *
 * @return a new {@code FileExtractionConfig} holding the builder's current values
 */
public FileExtractionConfig build() {
    final FileExtractionConfig built = new FileExtractionConfig(
            enableQualityProcessing, ocr, forceOcr, forceOcrPages, disableOcr,
            chunking, contentFilter, images, pdfOptions, tokenReduction,
            languageDetection, pages, keywords, postprocessor, htmlOptions,
            resultFormat, outputFormat, includeDocumentStructure, layout, timeoutSecs,
            treeSitter, structuredExtraction);
    return built;
}
}
// CPD-ON
}

View File

@@ -0,0 +1,63 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import java.util.List;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
/**
 * Footnote in Djot.
 *
 * @param label   Footnote label
 * @param content Footnote content blocks
 */
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = Footnote.Builder.class)
public record Footnote(
        @JsonProperty("label") String label,
        @JsonProperty("content") List<FormattedBlock> content) {

    /**
     * Creates a builder whose label starts as the empty string and whose content starts as an
     * empty list.
     *
     * @return a fresh {@link Builder}
     */
    public static Builder builder() {
        return new Builder();
    }

    // CPD-OFF
    /** Mutable builder for {@link Footnote}; also the Jackson deserialization entry point. */
    @JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
    public static final class Builder {
        private String label = "";
        private List<FormattedBlock> content = List.of();

        /**
         * Sets the label field.
         *
         * @param value the footnote label
         * @return this builder, so calls can be chained
         */
        @JsonProperty("label")
        public Builder withLabel(final String value) {
            label = value;
            return this;
        }

        /**
         * Sets the content field.
         *
         * @param value the footnote content blocks
         * @return this builder, so calls can be chained
         */
        @JsonProperty("content")
        public Builder withContent(final List<FormattedBlock> value) {
            content = value;
            return this;
        }

        /**
         * Builds the Footnote instance.
         *
         * @return a new {@link Footnote} holding the builder's current values
         */
        public Footnote build() {
            return new Footnote(label, content);
        }
    }
    // CPD-ON
}

View File

@@ -0,0 +1,275 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import com.fasterxml.jackson.databind.deser.std.StdDeserializer;
import com.fasterxml.jackson.databind.ser.std.StdSerializer;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.JsonGenerator;
import com.fasterxml.jackson.databind.DeserializationContext;
import com.fasterxml.jackson.databind.SerializerProvider;
import com.fasterxml.jackson.databind.node.ObjectNode;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonSerialize;
import org.jspecify.annotations.Nullable;
/**
 * Format-specific metadata (discriminated union).
 *
 * <p>Only one format type can exist per extraction result. This provides
 * type-safe, clean metadata without nested optionals.
 *
 * <p>Each variant is a single-component record wrapping its payload. The lower-case accessor
 * methods unwrap one specific variant and yield {@code null} for every other variant.
 */
@com.fasterxml.jackson.annotation.JsonIgnoreProperties(ignoreUnknown = true)
@JsonDeserialize(using = FormatMetadataDeserializer.class)
@JsonSerialize(using = FormatMetadataSerializer.class)
public sealed interface FormatMetadata {

    record Pdf(PdfMetadata value) implements FormatMetadata { }

    record Docx(DocxMetadata value) implements FormatMetadata { }

    record Excel(ExcelMetadata value) implements FormatMetadata { }

    record Email(EmailMetadata value) implements FormatMetadata { }

    record Pptx(PptxMetadata value) implements FormatMetadata { }

    record Archive(ArchiveMetadata value) implements FormatMetadata { }

    record Image(ImageMetadata value) implements FormatMetadata { }

    record Xml(XmlMetadata value) implements FormatMetadata { }

    record Text(TextMetadata value) implements FormatMetadata { }

    record Html(HtmlMetadata value) implements FormatMetadata { }

    record Ocr(OcrMetadata value) implements FormatMetadata { }

    record Csv(CsvMetadata value) implements FormatMetadata { }

    record Bibtex(BibtexMetadata value) implements FormatMetadata { }

    record Citation(CitationMetadata value) implements FormatMetadata { }

    record FictionBook(FictionBookMetadata value) implements FormatMetadata { }

    record Dbf(DbfMetadata value) implements FormatMetadata { }

    record Jats(JatsMetadata value) implements FormatMetadata { }

    record Epub(EpubMetadata value) implements FormatMetadata { }

    record Pst(PstMetadata value) implements FormatMetadata { }

    record Code(String value) implements FormatMetadata { }

    /** Unwraps the {@link Pdf} payload; yields {@code null} for any other variant. */
    default @Nullable PdfMetadata pdf() {
        if (this instanceof Pdf variant) {
            return variant.value();
        }
        return null;
    }

    /** Unwraps the {@link Docx} payload; yields {@code null} for any other variant. */
    default @Nullable DocxMetadata docx() {
        if (this instanceof Docx variant) {
            return variant.value();
        }
        return null;
    }

    /** Unwraps the {@link Excel} payload; yields {@code null} for any other variant. */
    default @Nullable ExcelMetadata excel() {
        if (this instanceof Excel variant) {
            return variant.value();
        }
        return null;
    }

    /** Unwraps the {@link Email} payload; yields {@code null} for any other variant. */
    default @Nullable EmailMetadata email() {
        if (this instanceof Email variant) {
            return variant.value();
        }
        return null;
    }

    /** Unwraps the {@link Pptx} payload; yields {@code null} for any other variant. */
    default @Nullable PptxMetadata pptx() {
        if (this instanceof Pptx variant) {
            return variant.value();
        }
        return null;
    }

    /** Unwraps the {@link Archive} payload; yields {@code null} for any other variant. */
    default @Nullable ArchiveMetadata archive() {
        if (this instanceof Archive variant) {
            return variant.value();
        }
        return null;
    }

    /** Unwraps the {@link Image} payload; yields {@code null} for any other variant. */
    default @Nullable ImageMetadata image() {
        if (this instanceof Image variant) {
            return variant.value();
        }
        return null;
    }

    /** Unwraps the {@link Xml} payload; yields {@code null} for any other variant. */
    default @Nullable XmlMetadata xml() {
        if (this instanceof Xml variant) {
            return variant.value();
        }
        return null;
    }

    /** Unwraps the {@link Text} payload; yields {@code null} for any other variant. */
    default @Nullable TextMetadata text() {
        if (this instanceof Text variant) {
            return variant.value();
        }
        return null;
    }

    /** Unwraps the {@link Html} payload; yields {@code null} for any other variant. */
    default @Nullable HtmlMetadata html() {
        if (this instanceof Html variant) {
            return variant.value();
        }
        return null;
    }

    /** Unwraps the {@link Ocr} payload; yields {@code null} for any other variant. */
    default @Nullable OcrMetadata ocr() {
        if (this instanceof Ocr variant) {
            return variant.value();
        }
        return null;
    }

    /** Unwraps the {@link Csv} payload; yields {@code null} for any other variant. */
    default @Nullable CsvMetadata csv() {
        if (this instanceof Csv variant) {
            return variant.value();
        }
        return null;
    }

    /** Unwraps the {@link Bibtex} payload; yields {@code null} for any other variant. */
    default @Nullable BibtexMetadata bibtex() {
        if (this instanceof Bibtex variant) {
            return variant.value();
        }
        return null;
    }

    /** Unwraps the {@link Citation} payload; yields {@code null} for any other variant. */
    default @Nullable CitationMetadata citation() {
        if (this instanceof Citation variant) {
            return variant.value();
        }
        return null;
    }

    /** Unwraps the {@link FictionBook} payload; yields {@code null} for any other variant. */
    default @Nullable FictionBookMetadata fictionBook() {
        if (this instanceof FictionBook variant) {
            return variant.value();
        }
        return null;
    }

    /** Unwraps the {@link Dbf} payload; yields {@code null} for any other variant. */
    default @Nullable DbfMetadata dbf() {
        if (this instanceof Dbf variant) {
            return variant.value();
        }
        return null;
    }

    /** Unwraps the {@link Jats} payload; yields {@code null} for any other variant. */
    default @Nullable JatsMetadata jats() {
        if (this instanceof Jats variant) {
            return variant.value();
        }
        return null;
    }

    /** Unwraps the {@link Epub} payload; yields {@code null} for any other variant. */
    default @Nullable EpubMetadata epub() {
        if (this instanceof Epub variant) {
            return variant.value();
        }
        return null;
    }

    /** Unwraps the {@link Pst} payload; yields {@code null} for any other variant. */
    default @Nullable PstMetadata pst() {
        if (this instanceof Pst variant) {
            return variant.value();
        }
        return null;
    }

    /** Unwraps the {@link Code} payload; yields {@code null} for any other variant. */
    default @Nullable String code() {
        if (this instanceof Code variant) {
            return variant.value();
        }
        return null;
    }
}
/**
 * Custom deserializer for the sealed {@link FormatMetadata} interface with unwrapped variants.
 *
 * <p>The input is a flat JSON object: the {@code format_type} property selects the variant, is
 * removed from the object, and the remaining properties are bound to that variant's payload type.
 * The {@code code} variant is the exception: it stores the remaining object as its JSON text.
 */
class FormatMetadataDeserializer extends StdDeserializer<FormatMetadata> {
    FormatMetadataDeserializer() {
        super(FormatMetadata.class);
    }

    /**
     * Reads one {@link FormatMetadata} value from the parser.
     *
     * @param parser the parser positioned at the value to read
     * @param ctx    the context used to bind the payload object to its metadata type
     * @return the variant named by {@code format_type}, wrapping the bound payload
     * @throws com.fasterxml.jackson.databind.JsonMappingException if the value is not a JSON
     *     object, if {@code format_type} is missing or null, or if it names an unknown variant
     * @throws java.io.IOException if reading or binding the underlying JSON fails
     */
    @Override
    public FormatMetadata deserialize(JsonParser parser, DeserializationContext ctx)
            throws java.io.IOException {
        // Read as a generic node first: assigning the tree straight to ObjectNode would fail
        // with a ClassCastException when the input is an array or a scalar.
        com.fasterxml.jackson.databind.JsonNode root = parser.getCodec().readTree(parser);
        if (!(root instanceof ObjectNode node)) {
            throw new com.fasterxml.jackson.databind.JsonMappingException(
                    parser, "Expected a JSON object for FormatMetadata but found: "
                            + (root == null ? "no content" : root.getNodeType()));
        }
        com.fasterxml.jackson.databind.JsonNode tagNode = node.get("format_type");
        if (tagNode == null || tagNode.isNull()) {
            throw new com.fasterxml.jackson.databind.JsonMappingException(
                    parser, "Missing discriminator field: format_type");
        }
        String tagValue = tagNode.asText();
        // The discriminator is not part of any payload type, so drop it before binding.
        node.remove("format_type");
        return switch (tagValue) {
            case "pdf" -> new FormatMetadata.Pdf(ctx.readTreeAsValue(node, PdfMetadata.class));
            case "docx" -> new FormatMetadata.Docx(ctx.readTreeAsValue(node, DocxMetadata.class));
            case "excel" -> new FormatMetadata.Excel(ctx.readTreeAsValue(node, ExcelMetadata.class));
            case "email" -> new FormatMetadata.Email(ctx.readTreeAsValue(node, EmailMetadata.class));
            case "pptx" -> new FormatMetadata.Pptx(ctx.readTreeAsValue(node, PptxMetadata.class));
            case "archive" -> new FormatMetadata.Archive(ctx.readTreeAsValue(node, ArchiveMetadata.class));
            case "image" -> new FormatMetadata.Image(ctx.readTreeAsValue(node, ImageMetadata.class));
            case "xml" -> new FormatMetadata.Xml(ctx.readTreeAsValue(node, XmlMetadata.class));
            case "text" -> new FormatMetadata.Text(ctx.readTreeAsValue(node, TextMetadata.class));
            case "html" -> new FormatMetadata.Html(ctx.readTreeAsValue(node, HtmlMetadata.class));
            case "ocr" -> new FormatMetadata.Ocr(ctx.readTreeAsValue(node, OcrMetadata.class));
            case "csv" -> new FormatMetadata.Csv(ctx.readTreeAsValue(node, CsvMetadata.class));
            case "bibtex" -> new FormatMetadata.Bibtex(ctx.readTreeAsValue(node, BibtexMetadata.class));
            case "citation" -> new FormatMetadata.Citation(ctx.readTreeAsValue(node, CitationMetadata.class));
            case "fiction_book" -> new FormatMetadata.FictionBook(ctx.readTreeAsValue(node, FictionBookMetadata.class));
            case "dbf" -> new FormatMetadata.Dbf(ctx.readTreeAsValue(node, DbfMetadata.class));
            case "jats" -> new FormatMetadata.Jats(ctx.readTreeAsValue(node, JatsMetadata.class));
            case "epub" -> new FormatMetadata.Epub(ctx.readTreeAsValue(node, EpubMetadata.class));
            case "pst" -> new FormatMetadata.Pst(ctx.readTreeAsValue(node, PstMetadata.class));
            // The code payload has no typed model here; keep the remaining object as JSON text.
            case "code" -> new FormatMetadata.Code(node.toString());
            default -> throw new com.fasterxml.jackson.databind.JsonMappingException(
                    parser, "Unknown FormatMetadata discriminator: " + tagValue);
        };
    }
}
/**
 * Custom serializer for the sealed {@link FormatMetadata} interface with unwrapped variants.
 *
 * <p>Emits a flat JSON object: the {@code format_type} discriminator followed by the properties
 * of the variant's payload. Payload records are converted with a private mapper. The {@code code}
 * variant carries its payload as JSON text, so that text is parsed back into an object before
 * flattening; otherwise its properties would be lost on a round trip.
 */
class FormatMetadataSerializer extends StdSerializer<FormatMetadata> {
    private static final com.fasterxml.jackson.databind.ObjectMapper MAPPER =
            new com.fasterxml.jackson.databind.ObjectMapper()
                    .registerModule(new com.fasterxml.jackson.datatype.jdk8.Jdk8Module())
                    .setPropertyNamingStrategy(com.fasterxml.jackson.databind.PropertyNamingStrategies.SNAKE_CASE)
                    .setSerializationInclusion(com.fasterxml.jackson.annotation.JsonInclude.Include.NON_NULL);

    FormatMetadataSerializer() {
        super(FormatMetadata.class);
    }

    /**
     * Writes {@code value} as one JSON object containing the tag and the payload's properties.
     *
     * @param value    the variant to write
     * @param gen      the generator receiving the output
     * @param provider unused; payloads are converted with the private mapper
     * @throws com.fasterxml.jackson.databind.JsonMappingException if {@code value} is not one of
     *     the known variants
     * @throws java.io.IOException if writing to the generator fails
     */
    @Override
    public void serialize(FormatMetadata value, JsonGenerator gen, SerializerProvider provider)
            throws java.io.IOException {
        String tag;
        Object inner;
        if (value instanceof FormatMetadata.Pdf v) {
            tag = "pdf";
            inner = v.value();
        } else if (value instanceof FormatMetadata.Docx v) {
            tag = "docx";
            inner = v.value();
        } else if (value instanceof FormatMetadata.Excel v) {
            tag = "excel";
            inner = v.value();
        } else if (value instanceof FormatMetadata.Email v) {
            tag = "email";
            inner = v.value();
        } else if (value instanceof FormatMetadata.Pptx v) {
            tag = "pptx";
            inner = v.value();
        } else if (value instanceof FormatMetadata.Archive v) {
            tag = "archive";
            inner = v.value();
        } else if (value instanceof FormatMetadata.Image v) {
            tag = "image";
            inner = v.value();
        } else if (value instanceof FormatMetadata.Xml v) {
            tag = "xml";
            inner = v.value();
        } else if (value instanceof FormatMetadata.Text v) {
            tag = "text";
            inner = v.value();
        } else if (value instanceof FormatMetadata.Html v) {
            tag = "html";
            inner = v.value();
        } else if (value instanceof FormatMetadata.Ocr v) {
            tag = "ocr";
            inner = v.value();
        } else if (value instanceof FormatMetadata.Csv v) {
            tag = "csv";
            inner = v.value();
        } else if (value instanceof FormatMetadata.Bibtex v) {
            tag = "bibtex";
            inner = v.value();
        } else if (value instanceof FormatMetadata.Citation v) {
            tag = "citation";
            inner = v.value();
        } else if (value instanceof FormatMetadata.FictionBook v) {
            tag = "fiction_book";
            inner = v.value();
        } else if (value instanceof FormatMetadata.Dbf v) {
            tag = "dbf";
            inner = v.value();
        } else if (value instanceof FormatMetadata.Jats v) {
            tag = "jats";
            inner = v.value();
        } else if (value instanceof FormatMetadata.Epub v) {
            tag = "epub";
            inner = v.value();
        } else if (value instanceof FormatMetadata.Pst v) {
            tag = "pst";
            inner = v.value();
        } else if (value instanceof FormatMetadata.Code v) {
            tag = "code";
            inner = v.value();
        } else {
            throw new com.fasterxml.jackson.databind.JsonMappingException(gen,
                    "Unknown FormatMetadata variant: " + value.getClass().getName());
        }

        gen.writeStartObject();
        gen.writeStringField("format_type", tag);
        com.fasterxml.jackson.databind.JsonNode tree = toTree(inner);
        // Only object payloads can be flattened next to the tag; anything else adds no fields.
        if (tree != null && tree.isObject()) {
            java.util.Iterator<java.util.Map.Entry<String, com.fasterxml.jackson.databind.JsonNode>> it =
                    tree.fields();
            while (it.hasNext()) {
                java.util.Map.Entry<String, com.fasterxml.jackson.databind.JsonNode> e = it.next();
                gen.writeFieldName(e.getKey());
                gen.writeTree(e.getValue());
            }
        }
        gen.writeEndObject();
    }

    /**
     * Converts a variant payload into a JSON tree.
     *
     * <p>A {@link String} payload (the {@code code} variant) is treated as JSON text and parsed;
     * every other payload is converted with the private mapper.
     *
     * @param inner the payload, possibly {@code null}
     * @return the payload as a tree, or {@code null} when there is no payload or the string
     *     payload is not valid JSON
     * @throws java.io.IOException if the mapper fails for a reason other than malformed JSON text
     */
    private static com.fasterxml.jackson.databind.JsonNode toTree(Object inner)
            throws java.io.IOException {
        if (inner == null) {
            return null;
        }
        if (inner instanceof String json) {
            try {
                return MAPPER.readTree(json);
            } catch (com.fasterxml.jackson.core.JsonProcessingException ignored) {
                // Hand-built Code values may hold arbitrary text; fall back to writing the tag
                // alone instead of failing the whole serialization.
                return null;
            }
        }
        return MAPPER.valueToTree(inner);
    }
}

View File

@@ -0,0 +1,133 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import java.util.List;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
import org.jspecify.annotations.Nullable;
/**
* Block-level element in a Djot document.
*
* Represents structural elements like headings, paragraphs, lists, code blocks, etc.
*/
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = FormattedBlock.Builder.class)
public record FormattedBlock(
/**
* Type of block element
*/
@JsonProperty("block_type") BlockType blockType,
/**
* Heading level (1-6) for headings, or nesting level for lists
*/
@Nullable @JsonProperty("level") Long level,
/**
* Inline content within the block
*/
@JsonProperty("inline_content") List<InlineElement> inlineContent,
/**
* Element attributes (classes, IDs, key-value pairs)
*/
@Nullable @JsonProperty("attributes") String attributes,
/**
* Language identifier for code blocks
*/
@Nullable @JsonProperty("language") String language,
/**
* Raw code content for code blocks
*/
@Nullable @JsonProperty("code") String code,
/**
* Nested blocks for containers (blockquotes, list items, divs)
*/
@Nullable @JsonProperty("children") List<FormattedBlock> children
) {
public static Builder builder() {
return new Builder();
}
// CPD-OFF
@JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
public static final class Builder {
@JsonProperty("block_type")
private BlockType blockType = null;
private Long level = null;
@JsonProperty("inline_content")
private List<InlineElement> inlineContent = List.of();
private String attributes = null;
private String language = null;
private String code = null;
private List<FormattedBlock> children = null;
/** Sets the blockType field. */
@JsonProperty("block_type")
public Builder withBlockType(final BlockType value) {
this.blockType = value;
return this;
}
/** Sets the level field. */
@JsonProperty("level")
public Builder withLevel(final @Nullable long value) {
this.level = value;
return this;
}
/** Sets the inlineContent field. */
@JsonProperty("inline_content")
public Builder withInlineContent(final List<InlineElement> value) {
this.inlineContent = value;
return this;
}
/** Sets the attributes field. */
@JsonProperty("attributes")
public Builder withAttributes(final @Nullable String value) {
this.attributes = value;
return this;
}
/** Sets the language field. */
@JsonProperty("language")
public Builder withLanguage(final @Nullable String value) {
this.language = value;
return this;
}
/** Sets the code field. */
@JsonProperty("code")
public Builder withCode(final @Nullable String value) {
this.code = value;
return this;
}
/** Sets the children field. */
@JsonProperty("children")
public Builder withChildren(final @Nullable List<FormattedBlock> value) {
this.children = value;
return this;
}
/** Builds the FormattedBlock instance. */
public FormattedBlock build() {
return new FormattedBlock(
blockType,
level,
inlineContent,
attributes,
language,
code,
children
);
}
}
// CPD-ON
}

131
packages/java/dev/kreuzberg/GridCell.java generated Normal file
View File

@@ -0,0 +1,131 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
import org.jspecify.annotations.Nullable;
/**
 * Individual grid cell with position and span metadata.
 *
 * @param content  Cell text content.
 * @param row      Zero-indexed row position.
 * @param col      Zero-indexed column position.
 * @param rowSpan  Number of rows this cell spans.
 * @param colSpan  Number of columns this cell spans.
 * @param isHeader Whether this is a header cell.
 * @param bbox     Bounding box for this cell (if available).
 */
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = GridCell.Builder.class)
public record GridCell(
        @JsonProperty("content") String content,
        @JsonProperty("row") int row,
        @JsonProperty("col") int col,
        @Nullable @JsonProperty("row_span") Integer rowSpan,
        @Nullable @JsonProperty("col_span") Integer colSpan,
        @Nullable @JsonProperty("is_header") Boolean isHeader,
        @Nullable @JsonProperty("bbox") BoundingBox bbox) {

    /**
     * Creates a builder with empty content, row and column 0, and every optional field unset.
     *
     * @return a fresh {@link Builder}
     */
    public static Builder builder() {
        return new Builder();
    }

    // CPD-OFF
    /** Mutable builder for {@link GridCell}; also the Jackson deserialization entry point. */
    @JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
    public static final class Builder {
        private String content = "";
        private int row = 0;
        private int col = 0;
        @JsonProperty("row_span")
        private Integer rowSpan = null;
        @JsonProperty("col_span")
        private Integer colSpan = null;
        @JsonProperty("is_header")
        private Boolean isHeader = null;
        private BoundingBox bbox = null;

        /**
         * Sets the content field.
         *
         * @param value the cell text content
         * @return this builder, so calls can be chained
         */
        @JsonProperty("content")
        public Builder withContent(final String value) {
            content = value;
            return this;
        }

        /**
         * Sets the row field.
         *
         * @param value the zero-indexed row position
         * @return this builder, so calls can be chained
         */
        @JsonProperty("row")
        public Builder withRow(final int value) {
            row = value;
            return this;
        }

        /**
         * Sets the col field.
         *
         * @param value the zero-indexed column position
         * @return this builder, so calls can be chained
         */
        @JsonProperty("col")
        public Builder withCol(final int value) {
            col = value;
            return this;
        }

        /**
         * Sets the rowSpan field.
         *
         * @param value the number of rows spanned; may be {@code null}
         * @return this builder, so calls can be chained
         */
        @JsonProperty("row_span")
        public Builder withRowSpan(final @Nullable Integer value) {
            rowSpan = value;
            return this;
        }

        /**
         * Sets the colSpan field.
         *
         * @param value the number of columns spanned; may be {@code null}
         * @return this builder, so calls can be chained
         */
        @JsonProperty("col_span")
        public Builder withColSpan(final @Nullable Integer value) {
            colSpan = value;
            return this;
        }

        /**
         * Sets the isHeader field.
         *
         * @param value whether this is a header cell; may be {@code null}
         * @return this builder, so calls can be chained
         */
        @JsonProperty("is_header")
        public Builder withIsHeader(final @Nullable Boolean value) {
            isHeader = value;
            return this;
        }

        /**
         * Sets the bbox field.
         *
         * @param value the bounding box for this cell; may be {@code null}
         * @return this builder, so calls can be chained
         */
        @JsonProperty("bbox")
        public Builder withBbox(final @Nullable BoundingBox value) {
            bbox = value;
            return this;
        }

        /**
         * Builds the GridCell instance.
         *
         * @return a new {@link GridCell} holding the builder's current values
         */
        public GridCell build() {
            return new GridCell(content, row, col, rowSpan, colSpan, isHeader, bbox);
        }
    }
    // CPD-ON
}

View File

@@ -0,0 +1,103 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
import org.jspecify.annotations.Nullable;
/**
 * Header/heading element metadata.
 *
 * @param level      Header level: 1 (h1) through 6 (h6)
 * @param text       Normalized text content of the header
 * @param id         HTML id attribute if present
 * @param depth      Document tree depth at the header element
 * @param htmlOffset Byte offset in original HTML document
 */
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = HeaderMetadata.Builder.class)
public record HeaderMetadata(
        @JsonProperty("level") byte level,
        @JsonProperty("text") String text,
        @Nullable @JsonProperty("id") String id,
        @JsonProperty("depth") int depth,
        @JsonProperty("html_offset") int htmlOffset) {

    /**
     * Creates a builder with zeroed numeric fields, empty text, and no id.
     *
     * @return a fresh {@link Builder}
     */
    public static Builder builder() {
        return new Builder();
    }

    // CPD-OFF
    /** Mutable builder for {@link HeaderMetadata}; also the Jackson deserialization entry point. */
    @JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
    public static final class Builder {
        private byte level = 0;
        private String text = "";
        private String id = null;
        private int depth = 0;
        @JsonProperty("html_offset")
        private int htmlOffset = 0;

        /**
         * Sets the level field.
         *
         * @param value the header level
         * @return this builder, so calls can be chained
         */
        @JsonProperty("level")
        public Builder withLevel(final byte value) {
            level = value;
            return this;
        }

        /**
         * Sets the text field.
         *
         * @param value the normalized text content of the header
         * @return this builder, so calls can be chained
         */
        @JsonProperty("text")
        public Builder withText(final String value) {
            text = value;
            return this;
        }

        /**
         * Sets the id field.
         *
         * @param value the HTML id attribute; may be {@code null}
         * @return this builder, so calls can be chained
         */
        @JsonProperty("id")
        public Builder withId(final @Nullable String value) {
            id = value;
            return this;
        }

        /**
         * Sets the depth field.
         *
         * @param value the document tree depth at the header element
         * @return this builder, so calls can be chained
         */
        @JsonProperty("depth")
        public Builder withDepth(final int value) {
            depth = value;
            return this;
        }

        /**
         * Sets the htmlOffset field.
         *
         * @param value the byte offset in the original HTML document
         * @return this builder, so calls can be chained
         */
        @JsonProperty("html_offset")
        public Builder withHtmlOffset(final int value) {
            htmlOffset = value;
            return this;
        }

        /**
         * Builds the HeaderMetadata instance.
         *
         * @return a new {@link HeaderMetadata} holding the builder's current values
         */
        public HeaderMetadata build() {
            return new HeaderMetadata(level, text, id, depth, htmlOffset);
        }
    }
    // CPD-ON
}

View File

@@ -0,0 +1,47 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import java.util.List;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
/**
 * Heading context for a chunk within a Markdown document.
 *
 * <p>Contains the heading hierarchy from document root to this chunk's section.
 *
 * @param headings the heading entries making up that hierarchy
 */
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = HeadingContext.Builder.class)
public record HeadingContext(@JsonProperty("headings") List<HeadingLevel> headings) {

    /**
     * Creates a builder whose heading list starts empty.
     *
     * @return a fresh {@link Builder}
     */
    public static Builder builder() {
        return new Builder();
    }

    // CPD-OFF
    /** Mutable builder for {@link HeadingContext}; also the Jackson deserialization entry point. */
    @JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
    public static final class Builder {
        private List<HeadingLevel> headings = List.of();

        /**
         * Sets the headings field.
         *
         * @param value the heading entries to store
         * @return this builder, so calls can be chained
         */
        @JsonProperty("headings")
        public Builder withHeadings(final List<HeadingLevel> value) {
            headings = value;
            return this;
        }

        /**
         * Builds the HeadingContext instance.
         *
         * @return a new {@link HeadingContext} holding the builder's current value
         */
        public HeadingContext build() {
            return new HeadingContext(headings);
        }
    }
    // CPD-ON
}

View File

@@ -0,0 +1,53 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
/**
 * A single heading in the hierarchy.
 *
 * @param level the heading level
 * @param text  the heading text
 */
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = HeadingLevel.Builder.class)
public record HeadingLevel(@JsonProperty("level") byte level, @JsonProperty("text") String text) {

    /**
     * Creates a builder with level 0 and empty text.
     *
     * @return a fresh {@link Builder}
     */
    public static Builder builder() {
        return new Builder();
    }

    // CPD-OFF
    /** Mutable builder for {@link HeadingLevel}; also the Jackson deserialization entry point. */
    @JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
    public static final class Builder {
        private byte level = 0;
        private String text = "";

        /**
         * Sets the level field.
         *
         * @param value the heading level
         * @return this builder, so calls can be chained
         */
        @JsonProperty("level")
        public Builder withLevel(final byte value) {
            level = value;
            return this;
        }

        /**
         * Sets the text field.
         *
         * @param value the heading text
         * @return this builder, so calls can be chained
         */
        @JsonProperty("text")
        public Builder withText(final String value) {
            text = value;
            return this;
        }

        /**
         * Builds the HeadingLevel instance.
         *
         * @return a new {@link HeadingLevel} holding the builder's current values
         */
        public HeadingLevel build() {
            return new HeadingLevel(level, text);
        }
    }
    // CPD-ON
}

View File

@@ -0,0 +1,105 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import java.util.List;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
import org.jspecify.annotations.Nullable;
/**
 * A text block with hierarchy level assignment.
 *
 * <p>Represents a block of text with semantic heading information extracted from
 * font size clustering and hierarchical analysis.
 *
 * @param text     The text content of this block
 * @param fontSize The font size of the text in this block
 * @param level    The hierarchy level of this block (H1-H6 or Body). Levels correspond to HTML
 *                 heading tags:
 *                 <ul>
 *                   <li>"h1": Top-level heading</li>
 *                   <li>"h2": Secondary heading</li>
 *                   <li>"h3": Tertiary heading</li>
 *                   <li>"h4": Quaternary heading</li>
 *                   <li>"h5": Quinary heading</li>
 *                   <li>"h6": Senary heading</li>
 *                   <li>"body": Body text (no heading level)</li>
 *                 </ul>
 * @param bbox     Bounding box information for the block. Contains coordinates as
 *                 (left, top, right, bottom) in PDF units.
 */
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = HierarchicalBlock.Builder.class)
public record HierarchicalBlock(
        @JsonProperty("text") String text,
        @JsonProperty("font_size") float fontSize,
        @JsonProperty("level") String level,
        @Nullable @JsonProperty("bbox") List<Float> bbox) {

    /**
     * Creates a builder with empty text and level, font size 0, and no bounding box.
     *
     * @return a fresh {@link Builder}
     */
    public static Builder builder() {
        return new Builder();
    }

    // CPD-OFF
    /** Mutable builder for {@link HierarchicalBlock}; also the Jackson deserialization entry point. */
    @JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
    public static final class Builder {
        private String text = "";
        @JsonProperty("font_size")
        private float fontSize = 0.0f;
        private String level = "";
        private List<Float> bbox = null;

        /**
         * Sets the text field.
         *
         * @param value the text content of the block
         * @return this builder, so calls can be chained
         */
        @JsonProperty("text")
        public Builder withText(final String value) {
            text = value;
            return this;
        }

        /**
         * Sets the fontSize field.
         *
         * @param value the font size of the text in the block
         * @return this builder, so calls can be chained
         */
        @JsonProperty("font_size")
        public Builder withFontSize(final float value) {
            fontSize = value;
            return this;
        }

        /**
         * Sets the level field.
         *
         * @param value the hierarchy level of the block
         * @return this builder, so calls can be chained
         */
        @JsonProperty("level")
        public Builder withLevel(final String value) {
            level = value;
            return this;
        }

        /**
         * Sets the bbox field.
         *
         * @param value the bounding box coordinates; may be {@code null}
         * @return this builder, so calls can be chained
         */
        @JsonProperty("bbox")
        public Builder withBbox(final @Nullable List<Float> value) {
            bbox = value;
            return this;
        }

        /**
         * Builds the HierarchicalBlock instance.
         *
         * @return a new {@link HierarchicalBlock} holding the builder's current values
         */
        public HierarchicalBlock build() {
            return new HierarchicalBlock(text, fontSize, level, bbox);
        }
    }
    // CPD-ON
}

View File

@@ -0,0 +1,109 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
import org.jspecify.annotations.Nullable;
/**
 * Hierarchy extraction configuration for PDF text structure analysis.
 *
 * <p>Enables extraction of document hierarchy levels (H1-H6) based on font size
 * clustering and semantic analysis. When enabled, hierarchical blocks are
 * included in page content.
 *
 * @param enabled              Enable hierarchy extraction
 * @param kClusters            Number of font size clusters to use for hierarchy levels (1-7).
 *                             Default: 6, which provides H1-H6 heading levels with body text.
 *                             Larger values create more fine-grained hierarchy levels. A
 *                             {@code null} argument is replaced by this default.
 * @param includeBbox          Include bounding box information in hierarchy blocks
 * @param ocrCoverageThreshold OCR coverage threshold for smart OCR triggering (0.0-1.0).
 *                             Determines when OCR should be triggered based on text block
 *                             coverage. OCR is triggered when text blocks cover less than this
 *                             fraction of the page. Default: 0.5 (trigger OCR if less than 50% of
 *                             page has text)
 */
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = HierarchyConfig.Builder.class)
public record HierarchyConfig(
        @Nullable @JsonProperty("enabled") Boolean enabled,
        @Nullable @JsonProperty("k_clusters") Long kClusters,
        @Nullable @JsonProperty("include_bbox") Boolean includeBbox,
        @Nullable @JsonProperty("ocr_coverage_threshold") Float ocrCoverageThreshold) {

    /**
     * Creates a builder with every field unset.
     *
     * @return a fresh {@link Builder}
     */
    public static Builder builder() {
        return new Builder();
    }

    /**
     * Substitutes the documented default of 6 clusters when no cluster count is supplied.
     */
    public HierarchyConfig {
        // NOTE(review): this fallback was 3, contradicting the documented default of 6 above;
        // confirm against the native library's default before regenerating.
        if (kClusters == null) {
            kClusters = 6L;
        }
    }

    // CPD-OFF
    /** Mutable builder for {@link HierarchyConfig}; also the Jackson deserialization entry point. */
    @JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
    public static final class Builder {
        private Boolean enabled = null;
        @JsonProperty("k_clusters")
        private Long kClusters = null;
        @JsonProperty("include_bbox")
        private Boolean includeBbox = null;
        @JsonProperty("ocr_coverage_threshold")
        private Float ocrCoverageThreshold = null;

        /**
         * Sets the enabled field.
         *
         * @param value whether hierarchy extraction is enabled; may be {@code null}
         * @return this builder, so calls can be chained
         */
        @JsonProperty("enabled")
        public Builder withEnabled(final @Nullable Boolean value) {
            this.enabled = value;
            return this;
        }

        /**
         * Sets the kClusters field.
         *
         * @param value the number of font size clusters; {@code null} selects the default when
         *     the record is built
         * @return this builder, so calls can be chained
         */
        @JsonProperty("k_clusters")
        public Builder withKClusters(final @Nullable Long value) {
            this.kClusters = value;
            return this;
        }

        /**
         * Sets the includeBbox field.
         *
         * @param value whether to include bounding boxes in hierarchy blocks; may be {@code null}
         * @return this builder, so calls can be chained
         */
        @JsonProperty("include_bbox")
        public Builder withIncludeBbox(final @Nullable Boolean value) {
            this.includeBbox = value;
            return this;
        }

        /**
         * Sets the ocrCoverageThreshold field.
         *
         * @param value the OCR coverage threshold; may be {@code null}
         * @return this builder, so calls can be chained
         */
        @JsonProperty("ocr_coverage_threshold")
        public Builder withOcrCoverageThreshold(final @Nullable Float value) {
            this.ocrCoverageThreshold = value;
            return this;
        }

        /**
         * Builds the HierarchyConfig instance.
         *
         * @return a new {@link HierarchyConfig} holding the builder's current values
         */
        public HierarchyConfig build() {
            return new HierarchyConfig(
                    enabled,
                    kClusters,
                    includeBbox,
                    ocrCoverageThreshold);
        }
    }
    // CPD-ON

    /**
     * Placeholder for obtaining the default configuration.
     *
     * @return never returns normally
     * @throws UnsupportedOperationException always; use {@link #builder()} instead
     */
    public static HierarchyConfig defaultInstance() {
        throw new UnsupportedOperationException("defaultInstance is not yet bridged via JNI; use the Builder instead.");
    }
}

View File

@@ -0,0 +1,247 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import java.util.List;
import java.util.Map;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
import org.jspecify.annotations.Nullable;
/**
* HTML metadata extracted from HTML documents.
*
* Includes document-level metadata, Open Graph data, Twitter Card metadata,
* and extracted structural elements (headers, links, images, structured data).
*/
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = HtmlMetadata.Builder.class)
public record HtmlMetadata(
/**
* Document title from {@code &lt;title&gt;} tag
*/
@Nullable @JsonProperty("title") String title,
/**
* Document description from {@code &lt;meta name="description"&gt;} tag
*/
@Nullable @JsonProperty("description") String description,
/**
* Document keywords from {@code &lt;meta name="keywords"&gt;} tag, split on commas
*/
@Nullable @JsonProperty("keywords") List<String> keywords,
/**
* Document author from {@code &lt;meta name="author"&gt;} tag
*/
@Nullable @JsonProperty("author") String author,
/**
* Canonical URL from {@code &lt;link rel="canonical"&gt;} tag
*/
@Nullable @JsonProperty("canonical_url") String canonicalUrl,
/**
* Base URL from {@code &lt;base href=""&gt;} tag for resolving relative URLs
*/
@Nullable @JsonProperty("base_href") String baseHref,
/**
* Document language from {@code lang} attribute
*/
@Nullable @JsonProperty("language") String language,
/**
* Document text direction from {@code dir} attribute
*/
@Nullable @JsonProperty("text_direction") TextDirection textDirection,
/**
* Open Graph metadata (og:* properties) for social media
* Keys like "title", "description", "image", "url", etc.
*/
@Nullable @JsonProperty("open_graph") Map<String, String> openGraph,
/**
* Twitter Card metadata (twitter:* properties)
* Keys like "card", "site", "creator", "title", "description", "image", etc.
*/
@Nullable @JsonProperty("twitter_card") Map<String, String> twitterCard,
/**
* Additional meta tags not covered by specific fields
* Keys are meta name/property attributes, values are content
*/
@Nullable @JsonProperty("meta_tags") Map<String, String> metaTags,
/**
* Extracted header elements with hierarchy
*/
@Nullable @JsonProperty("headers") List<HeaderMetadata> headers,
/**
* Extracted hyperlinks with type classification
*/
@Nullable @JsonProperty("links") List<LinkMetadata> links,
/**
* Extracted images with source and dimensions
*/
@Nullable @JsonProperty("images") List<ImageMetadataType> images,
/**
* Extracted structured data blocks
*/
@Nullable @JsonProperty("structured_data") List<StructuredData> structuredData
) {
public static Builder builder() {
return new Builder();
}
// CPD-OFF
@JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
public static final class Builder {
private String title = null;
private String description = null;
private List<String> keywords = null;
private String author = null;
@JsonProperty("canonical_url")
private String canonicalUrl = null;
@JsonProperty("base_href")
private String baseHref = null;
private String language = null;
@JsonProperty("text_direction")
private TextDirection textDirection = null;
@JsonProperty("open_graph")
private Map<String, String> openGraph = null;
@JsonProperty("twitter_card")
private Map<String, String> twitterCard = null;
@JsonProperty("meta_tags")
private Map<String, String> metaTags = null;
private List<HeaderMetadata> headers = null;
private List<LinkMetadata> links = null;
private List<ImageMetadataType> images = null;
@JsonProperty("structured_data")
private List<StructuredData> structuredData = null;
/** Sets the title field. */
@JsonProperty("title")
public Builder withTitle(final @Nullable String value) {
this.title = value;
return this;
}
/** Sets the description field. */
@JsonProperty("description")
public Builder withDescription(final @Nullable String value) {
this.description = value;
return this;
}
/** Sets the keywords field. */
@JsonProperty("keywords")
public Builder withKeywords(final @Nullable List<String> value) {
this.keywords = value;
return this;
}
/** Sets the author field. */
@JsonProperty("author")
public Builder withAuthor(final @Nullable String value) {
this.author = value;
return this;
}
/** Sets the canonicalUrl field. */
@JsonProperty("canonical_url")
public Builder withCanonicalUrl(final @Nullable String value) {
this.canonicalUrl = value;
return this;
}
/** Sets the baseHref field. */
@JsonProperty("base_href")
public Builder withBaseHref(final @Nullable String value) {
this.baseHref = value;
return this;
}
/** Sets the language field. */
@JsonProperty("language")
public Builder withLanguage(final @Nullable String value) {
this.language = value;
return this;
}
/** Sets the textDirection field. */
@JsonProperty("text_direction")
public Builder withTextDirection(final @Nullable TextDirection value) {
this.textDirection = value;
return this;
}
/** Sets the openGraph field. */
@JsonProperty("open_graph")
public Builder withOpenGraph(final @Nullable Map<String, String> value) {
this.openGraph = value;
return this;
}
/** Sets the twitterCard field. */
@JsonProperty("twitter_card")
public Builder withTwitterCard(final @Nullable Map<String, String> value) {
this.twitterCard = value;
return this;
}
/** Sets the metaTags field. */
@JsonProperty("meta_tags")
public Builder withMetaTags(final @Nullable Map<String, String> value) {
this.metaTags = value;
return this;
}
/** Sets the headers field. */
@JsonProperty("headers")
public Builder withHeaders(final @Nullable List<HeaderMetadata> value) {
this.headers = value;
return this;
}
/** Sets the links field. */
@JsonProperty("links")
public Builder withLinks(final @Nullable List<LinkMetadata> value) {
this.links = value;
return this;
}
/** Sets the images field. */
@JsonProperty("images")
public Builder withImages(final @Nullable List<ImageMetadataType> value) {
this.images = value;
return this;
}
/** Sets the structuredData field. */
@JsonProperty("structured_data")
public Builder withStructuredData(final @Nullable List<StructuredData> value) {
this.structuredData = value;
return this;
}
/** Builds the HtmlMetadata instance. */
public HtmlMetadata build() {
return new HtmlMetadata(
title,
description,
keywords,
author,
canonicalUrl,
baseHref,
language,
textDirection,
openGraph,
twitterCard,
metaTags,
headers,
links,
images,
structuredData
);
}
}
// CPD-ON
}

View File

@@ -0,0 +1,122 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
import org.jspecify.annotations.Nullable;
/**
 * Configuration for styled HTML output.
 *
 * <p>When set on ExtractionConfig.html_output alongside
 * {@code output_format = OutputFormat.Html}, the pipeline builds a
 * {@code StyledHtmlRenderer} ({@code crate.rendering.StyledHtmlRenderer}) instead of
 * the plain comrak-based renderer.
 */
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = HtmlOutputConfig.Builder.class)
public record HtmlOutputConfig(
    /**
     * Inline CSS string injected into the output after the theme stylesheet.
     * Concatenated after the {@code css_file} content when both are set.
     */
    @Nullable @JsonProperty("css") String css,

    /**
     * Path to a CSS file loaded once at renderer construction time.
     * Concatenated before {@code css} when both are set.
     */
    @JsonProperty("css_file") java.nio.file.@Nullable Path cssFile,

    /** Built-in colour/typography theme. Default: HtmlTheme.Unstyled. */
    @Nullable @JsonProperty("theme") HtmlTheme theme,

    /**
     * CSS class prefix applied to every emitted class name.
     *
     * <p>Default: {@code "kb-"}. Change this if your host application already uses
     * classes that start with {@code kb-}.
     */
    @Nullable @JsonProperty("class_prefix") String classPrefix,

    /**
     * When {@code true} (default), write the resolved CSS into a {@code <style>} block
     * immediately after the opening {@code <div class="{prefix}doc">}.
     *
     * <p>Set to {@code false} to emit only the structural markup and wire up your
     * own stylesheet targeting the {@code kb-*} class names.
     */
    @Nullable @JsonProperty("embed_css") Boolean embedCss
) {
    /** Returns a fresh {@link Builder}; only {@code theme} is pre-populated. */
    public static Builder builder() {
        return new Builder();
    }

    // CPD-OFF
    /**
     * Fluent builder for {@link HtmlOutputConfig}; Jackson populates it through the
     * {@code with}-prefixed setters and then calls {@link #build()}.
     *
     * <p>{@code theme} starts as {@code HtmlTheme.Unstyled}; every other field starts
     * as {@code null}. The other defaults mentioned in the component docs are not
     * applied by this class.
     */
    @JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
    public static final class Builder {
        private String css;

        @JsonProperty("css_file")
        private java.nio.file.Path cssFile;

        @Nullable private HtmlTheme theme = HtmlTheme.Unstyled;

        @JsonProperty("class_prefix")
        private String classPrefix;

        @JsonProperty("embed_css")
        private Boolean embedCss;

        /** Assigns {@code css} ({@code null} allowed) and returns this builder. */
        @JsonProperty("css")
        public Builder withCss(final @Nullable String value) {
            css = value;
            return this;
        }

        /** Assigns {@code cssFile} ({@code null} allowed) and returns this builder. */
        @JsonProperty("css_file")
        public Builder withCssFile(final java.nio.file.@Nullable Path value) {
            cssFile = value;
            return this;
        }

        /** Assigns {@code theme} ({@code null} allowed) and returns this builder. */
        @JsonProperty("theme")
        public Builder withTheme(final @Nullable HtmlTheme value) {
            theme = value;
            return this;
        }

        /** Assigns {@code classPrefix} ({@code null} allowed) and returns this builder. */
        @JsonProperty("class_prefix")
        public Builder withClassPrefix(final @Nullable String value) {
            classPrefix = value;
            return this;
        }

        /** Assigns {@code embedCss} ({@code null} allowed) and returns this builder. */
        @JsonProperty("embed_css")
        public Builder withEmbedCss(final @Nullable Boolean value) {
            embedCss = value;
            return this;
        }

        /** Creates the {@link HtmlOutputConfig} from the values collected so far. */
        public HtmlOutputConfig build() {
            return new HtmlOutputConfig(css, cssFile, theme, classPrefix, embedCss);
        }
    }
    // CPD-ON

    /**
     * Placeholder for a default configuration factory; currently unimplemented.
     *
     * @return never returns normally
     * @throws UnsupportedOperationException always; callers are directed to the Builder
     */
    public static HtmlOutputConfig defaultInstance() {
        final String reason = "defaultInstance is not yet bridged via JNI; use the Builder instead.";
        throw new UnsupportedOperationException(reason);
    }
}

View File

@@ -0,0 +1,57 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonValue;
/**
 * Built-in HTML theme selection.
 *
 * <p>Each constant carries the lowercase string used on the wire; JSON serialization
 * uses {@link #getValue()} and deserialization uses {@link #fromValue(String)}.
 */
public enum HtmlTheme {
    /** Sensible defaults: system font stack, neutral colours, readable line */
    Default("default"),

    /** GitHub Markdown-inspired palette and spacing. */
    GitHub("github"),

    /** Dark background, light text. */
    Dark("dark"),

    /** Minimal light theme with generous whitespace. */
    Light("light"),

    /**
     * No built-in stylesheet emitted. CSS custom properties are still defined
     */
    Unstyled("unstyled");

    /** Wire-format string for this constant. */
    private final String wireValue;

    HtmlTheme(final String wireValue) {
        this.wireValue = wireValue;
    }

    /** Returns the wire-format string for this constant. */
    @JsonValue
    public String getValue() {
        return wireValue;
    }

    /**
     * Looks up the constant whose wire string matches {@code value}, ignoring case.
     *
     * @param value wire string to resolve
     * @return the first matching constant in declaration order
     * @throws IllegalArgumentException if nothing matches (including a {@code null} input)
     */
    @JsonCreator
    public static HtmlTheme fromValue(final String value) {
        final HtmlTheme[] candidates = values();
        for (int i = 0; i < candidates.length; i++) {
            final HtmlTheme candidate = candidates[i];
            if (candidate.wireValue.equalsIgnoreCase(value)) {
                return candidate;
            }
        }
        throw new IllegalArgumentException("Unknown value: " + value);
    }

    /** Returns the wire-format string value (matches JSON serialization). */
    @Override
    public String toString() {
        return getValue();
    }
}

View File

@@ -0,0 +1,34 @@
package dev.kreuzberg;
import java.util.List;
/**
 * Bridge interface for the DocumentExtractor plugin system.
 *
 * <p>Implementations are wrapped by DocumentExtractorBridge and exposed to the native
 * runtime through Panama FFM upcall stubs.
 */
public interface IDocumentExtractor {

    /** Plugin name (used for registry keying). */
    String name();

    /** Plugin version. */
    String version();

    /**
     * Initialize the plugin. Does nothing unless overridden.
     *
     * @throws Exception if initialization fails
     */
    default void initialize() throws Exception {
        // No-op by default.
    }

    /**
     * Shut down the plugin. Does nothing unless overridden.
     *
     * @throws Exception if shutdown fails
     */
    default void shutdown() throws Exception {
        // No-op by default.
    }

    /**
     * Extraction hook for in-memory input ({@code extract_bytes}).
     *
     * @param content document bytes
     * @param mime_type MIME type supplied with the content
     * @param config extraction configuration
     * @return the result as a string; its format is defined by the bridge, not visible here
     * @throws Exception if the implementation fails
     */
    String extract_bytes(byte[] content, String mime_type, ExtractionConfig config)
        throws Exception;

    /**
     * Extraction hook for file input ({@code extract_file}).
     *
     * @param path file to extract from
     * @param mime_type MIME type supplied with the file
     * @param config extraction configuration
     * @return the result as a string; its format is defined by the bridge, not visible here
     * @throws Exception if the implementation fails
     */
    String extract_file(java.nio.file.Path path, String mime_type, ExtractionConfig config)
        throws Exception;

    /**
     * MIME types reported by this extractor ({@code supported_mime_types}).
     *
     * @return list of MIME type strings
     * @throws Exception if the implementation fails
     */
    List<String> supported_mime_types() throws Exception;

    /**
     * Priority reported by this extractor ({@code priority}); how the runtime
     * orders by it is not visible here.
     *
     * @return priority value
     * @throws Exception if the implementation fails
     */
    int priority() throws Exception;

    /**
     * Predicate hook {@code can_handle} for a path / MIME type pair.
     *
     * @param _path candidate file path
     * @param _mime_type candidate MIME type
     * @return the implementation's yes/no answer
     * @throws Exception if the implementation fails
     */
    boolean can_handle(java.nio.file.Path _path, String _mime_type) throws Exception;
}

View File

@@ -0,0 +1,28 @@
package dev.kreuzberg;
import java.util.List;
/**
 * Bridge interface for the EmbeddingBackend plugin system.
 *
 * <p>Implementations are wrapped by EmbeddingBackendBridge and exposed to the native
 * runtime through Panama FFM upcall stubs.
 */
public interface IEmbeddingBackend {

    /** Plugin name (used for registry keying). */
    String name();

    /** Plugin version. */
    String version();

    /**
     * Initialize the plugin. Does nothing unless overridden.
     *
     * @throws Exception if initialization fails
     */
    default void initialize() throws Exception {
        // No-op by default.
    }

    /**
     * Shut down the plugin. Does nothing unless overridden.
     *
     * @throws Exception if shutdown fails
     */
    default void shutdown() throws Exception {
        // No-op by default.
    }

    /**
     * Dimension count reported by this backend ({@code dimensions}).
     *
     * @return the dimension value
     * @throws Exception if the implementation fails
     */
    long dimensions() throws Exception;

    /**
     * Embedding hook ({@code embed}).
     *
     * @param texts input strings
     * @return a list of float vectors — presumably one per input text; confirm against the bridge
     * @throws Exception if the implementation fails
     */
    List<List<Float>> embed(List<String> texts) throws Exception;
}

View File

@@ -0,0 +1,40 @@
package dev.kreuzberg;
import java.util.List;
/**
 * Bridge interface for the OcrBackend plugin system.
 *
 * <p>Implementations are wrapped by OcrBackendBridge and exposed to the native
 * runtime through Panama FFM upcall stubs.
 */
public interface IOcrBackend {

    /** Plugin name (used for registry keying). */
    String name();

    /** Plugin version. */
    String version();

    /**
     * Initialize the plugin. Does nothing unless overridden.
     *
     * @throws Exception if initialization fails
     */
    default void initialize() throws Exception {
        // No-op by default.
    }

    /**
     * Shut down the plugin. Does nothing unless overridden.
     *
     * @throws Exception if shutdown fails
     */
    default void shutdown() throws Exception {
        // No-op by default.
    }

    /**
     * OCR hook for an in-memory image ({@code process_image}).
     *
     * @param image_bytes image bytes
     * @param config OCR configuration
     * @return the extraction result
     * @throws Exception if the implementation fails
     */
    ExtractionResult process_image(byte[] image_bytes, OcrConfig config) throws Exception;

    /**
     * OCR hook for an image file ({@code process_image_file}).
     *
     * @param path image file path
     * @param config OCR configuration
     * @return the extraction result
     * @throws Exception if the implementation fails
     */
    ExtractionResult process_image_file(java.nio.file.Path path, OcrConfig config)
        throws Exception;

    /**
     * Predicate hook {@code supports_language}.
     *
     * @param lang language identifier; the expected code format is not visible here
     * @return the implementation's yes/no answer
     * @throws Exception if the implementation fails
     */
    boolean supports_language(String lang) throws Exception;

    /**
     * Backend type reported by this plugin ({@code backend_type}).
     *
     * @return backend type string
     * @throws Exception if the implementation fails
     */
    String backend_type() throws Exception;

    /**
     * Languages reported by this backend ({@code supported_languages}).
     *
     * @return list of language identifiers
     * @throws Exception if the implementation fails
     */
    List<String> supported_languages() throws Exception;

    /**
     * Capability flag {@code supports_table_detection}.
     *
     * @return the implementation's yes/no answer
     * @throws Exception if the implementation fails
     */
    boolean supports_table_detection() throws Exception;

    /**
     * Capability flag {@code supports_document_processing}.
     *
     * @return the implementation's yes/no answer
     * @throws Exception if the implementation fails
     */
    boolean supports_document_processing() throws Exception;

    /**
     * Whole-document hook ({@code process_document}).
     *
     * @param _path document file path
     * @param _config OCR configuration
     * @return the extraction result
     * @throws Exception if the implementation fails
     */
    ExtractionResult process_document(java.nio.file.Path _path, OcrConfig _config)
        throws Exception;
}

View File

@@ -0,0 +1,32 @@
package dev.kreuzberg;
/**
 * Bridge interface for the PostProcessor plugin system.
 *
 * <p>Implementations are wrapped by PostProcessorBridge and exposed to the native
 * runtime through Panama FFM upcall stubs.
 */
public interface IPostProcessor {

    /** Plugin name (used for registry keying). */
    String name();

    /** Plugin version. */
    String version();

    /**
     * Initialize the plugin. Does nothing unless overridden.
     *
     * @throws Exception if initialization fails
     */
    default void initialize() throws Exception {
        // No-op by default.
    }

    /**
     * Shut down the plugin. Does nothing unless overridden.
     *
     * @throws Exception if shutdown fails
     */
    default void shutdown() throws Exception {
        // No-op by default.
    }

    /**
     * Post-processing hook ({@code process}).
     *
     * @param result extraction result handed to the processor
     * @param config extraction configuration
     * @throws Exception if the implementation fails
     */
    void process(ExtractionResult result, ExtractionConfig config) throws Exception;

    /**
     * Processing stage reported by this plugin ({@code processing_stage}).
     *
     * @return stage name; the set of valid names is not visible here
     * @throws Exception if the implementation fails
     */
    String processing_stage() throws Exception;

    /**
     * Predicate hook {@code should_process}.
     *
     * @param _result extraction result under consideration
     * @param _config extraction configuration
     * @return the implementation's yes/no answer
     * @throws Exception if the implementation fails
     */
    boolean should_process(ExtractionResult _result, ExtractionConfig _config)
        throws Exception;

    /**
     * Duration estimate hook ({@code estimated_duration_ms}).
     *
     * @param _result extraction result under consideration
     * @return estimate, in milliseconds per the method name
     * @throws Exception if the implementation fails
     */
    long estimated_duration_ms(ExtractionResult _result) throws Exception;

    /**
     * Priority reported by this plugin ({@code priority}); how the runtime
     * orders by it is not visible here.
     *
     * @return priority value
     * @throws Exception if the implementation fails
     */
    int priority() throws Exception;
}

View File

@@ -0,0 +1,24 @@
package dev.kreuzberg;
/**
 * Bridge interface for the Renderer plugin system.
 *
 * <p>Implementations are wrapped by RendererBridge and exposed to the native
 * runtime through Panama FFM upcall stubs.
 */
public interface IRenderer {

    /** Plugin name (used for registry keying). */
    String name();

    /** Plugin version. */
    String version();

    /**
     * Initialize the plugin. Does nothing unless overridden.
     *
     * @throws Exception if initialization fails
     */
    default void initialize() throws Exception {
        // No-op by default.
    }

    /**
     * Shut down the plugin. Does nothing unless overridden.
     *
     * @throws Exception if shutdown fails
     */
    default void shutdown() throws Exception {
        // No-op by default.
    }

    /**
     * Rendering hook ({@code render}).
     *
     * @param doc document passed as a string; its serialization format is not visible here
     * @return rendered output
     * @throws Exception if the implementation fails
     */
    String render(String doc) throws Exception;
}

View File

@@ -0,0 +1,28 @@
package dev.kreuzberg;
/**
 * Bridge interface for the Validator plugin system.
 *
 * <p>Implementations are wrapped by ValidatorBridge and exposed to the native
 * runtime through Panama FFM upcall stubs.
 */
public interface IValidator {

    /** Plugin name (used for registry keying). */
    String name();

    /** Plugin version. */
    String version();

    /**
     * Initialize the plugin. Does nothing unless overridden.
     *
     * @throws Exception if initialization fails
     */
    default void initialize() throws Exception {
        // No-op by default.
    }

    /**
     * Shut down the plugin. Does nothing unless overridden.
     *
     * @throws Exception if shutdown fails
     */
    default void shutdown() throws Exception {
        // No-op by default.
    }

    /**
     * Validation hook ({@code validate}). It returns nothing, so a failure can only
     * be signalled by throwing.
     *
     * @param result extraction result to validate
     * @param config extraction configuration
     * @throws Exception if the implementation fails or rejects the result
     */
    void validate(ExtractionResult result, ExtractionConfig config) throws Exception;

    /**
     * Predicate hook {@code should_validate}.
     *
     * @param _result extraction result under consideration
     * @param _config extraction configuration
     * @return the implementation's yes/no answer
     * @throws Exception if the implementation fails
     */
    boolean should_validate(ExtractionResult _result, ExtractionConfig _config)
        throws Exception;

    /**
     * Priority reported by this plugin ({@code priority}); how the runtime
     * orders by it is not visible here.
     *
     * @return priority value
     * @throws Exception if the implementation fails
     */
    int priority() throws Exception;
}

View File

@@ -0,0 +1,255 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
import org.jspecify.annotations.Nullable;
/**
 * Image extraction configuration.
 *
 * <p>Every component is nullable on input. The canonical constructor substitutes
 * Java-side defaults for four of them when they are {@code null}:
 * {@code targetDpi} (300), {@code maxImageDimension} (4096), {@code minDpi} (72)
 * and {@code maxDpi} (600). All other components are stored as given.
 */
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = ImageExtractionConfig.Builder.class)
public record ImageExtractionConfig(
    /** Extract images from documents. */
    @Nullable @JsonProperty("extract_images") Boolean extractImages,

    /** Target DPI for image normalization. */
    @Nullable @JsonProperty("target_dpi") Integer targetDpi,

    /** Maximum dimension for images (width or height). */
    @Nullable @JsonProperty("max_image_dimension") Integer maxImageDimension,

    /**
     * Whether to inject image reference placeholders into markdown output.
     * When {@code true} (default), image references like {@code ![Image 1](embedded:p1_i0)}
     * are appended to the markdown. Set to {@code false} to extract images as data
     * without polluting the markdown output.
     */
    @Nullable @JsonProperty("inject_placeholders") Boolean injectPlaceholders,

    /** Automatically adjust DPI based on image content. */
    @Nullable @JsonProperty("auto_adjust_dpi") Boolean autoAdjustDpi,

    /** Minimum DPI threshold. */
    @Nullable @JsonProperty("min_dpi") Integer minDpi,

    /** Maximum DPI threshold. */
    @Nullable @JsonProperty("max_dpi") Integer maxDpi,

    /**
     * Maximum number of image objects to extract per PDF page.
     *
     * <p>Some PDFs (e.g. technical diagrams stored as thousands of raster fragments)
     * can trigger extremely long or indefinite extraction times when every image
     * object on a dense page is decoded individually via the PDF extractor. Setting this
     * limit causes kreuzberg to stop collecting individual images once the count
     * per page reaches the cap and emit a warning instead.
     *
     * <p>{@code null} (default) means no limit — all images are extracted.
     */
    @Nullable @JsonProperty("max_images_per_page") Integer maxImagesPerPage,

    /**
     * When {@code true} (default), extracted images are classified by kind and grouped
     * into clusters where they appear to belong to one figure.
     */
    @Nullable @JsonProperty("classify") Boolean classify,

    /**
     * When {@code true}, full-page renders produced during OCR preprocessing are captured
     * and returned as {@code ImageKind.PageRaster} entries in {@code ExtractionResult.images}.
     *
     * <p><b>PDF + OCR only.</b> No rasters are captured for non-PDF inputs or when the
     * document-level OCR bypass is active (whole-document backend). When OCR is
     * enabled and this flag is set but the active backend skips per-page rendering,
     * a {@code ProcessingWarning} is emitted in {@code ExtractionResult.processing_warnings}.
     *
     * <p>Defaults to {@code false}. Enable when downstream consumers need page thumbnails
     * (e.g. citation previews, visual grounding).
     */
    @Nullable @JsonProperty("include_page_rasters") Boolean includePageRasters,

    /**
     * Run OCR on extracted images and include the recognized text in the document content.
     *
     * <p>When {@code true} (default) and {@code ExtractionConfig.ocr} is configured, extracted
     * images are processed with the configured OCR backend. Set to {@code false} to extract
     * images without OCR processing, even when OCR is enabled.
     */
    @Nullable @JsonProperty("run_ocr_on_images") Boolean runOcrOnImages,

    /**
     * When {@code true}, image OCR results are rendered as plain text without the
     * {@code ![...](...)} markdown placeholder. Only takes effect when
     * {@code run_ocr_on_images} is also {@code true}.
     */
    @Nullable @JsonProperty("ocr_text_only") Boolean ocrTextOnly,

    /**
     * When {@code true} and {@code ocr_text_only} is {@code false}, append the OCR text after
     * the image placeholder in the rendered output.
     */
    @Nullable @JsonProperty("append_ocr_text") Boolean appendOcrText
) {
    /** Fills in the Java-side defaults for the DPI and dimension settings left {@code null}. */
    public ImageExtractionConfig {
        targetDpi = (targetDpi != null) ? targetDpi : Integer.valueOf(300);
        maxImageDimension = (maxImageDimension != null) ? maxImageDimension : Integer.valueOf(4096);
        minDpi = (minDpi != null) ? minDpi : Integer.valueOf(72);
        maxDpi = (maxDpi != null) ? maxDpi : Integer.valueOf(600);
    }

    /** Returns a fresh {@link Builder} with every field unset ({@code null}). */
    public static Builder builder() {
        return new Builder();
    }

    // CPD-OFF
    /**
     * Fluent builder for {@link ImageExtractionConfig}; Jackson populates it through the
     * {@code with}-prefixed setters and then calls {@link #build()}.
     *
     * <p>All fields start as {@code null}. The only defaults applied on the Java side
     * are the four set by the record constructor.
     */
    @JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
    public static final class Builder {
        @JsonProperty("extract_images")
        private Boolean extractImages;

        @JsonProperty("target_dpi")
        private Integer targetDpi;

        @JsonProperty("max_image_dimension")
        private Integer maxImageDimension;

        @JsonProperty("inject_placeholders")
        private Boolean injectPlaceholders;

        @JsonProperty("auto_adjust_dpi")
        private Boolean autoAdjustDpi;

        @JsonProperty("min_dpi")
        private Integer minDpi;

        @JsonProperty("max_dpi")
        private Integer maxDpi;

        @JsonProperty("max_images_per_page")
        private Integer maxImagesPerPage;

        private Boolean classify;

        @JsonProperty("include_page_rasters")
        private Boolean includePageRasters;

        @JsonProperty("run_ocr_on_images")
        private Boolean runOcrOnImages;

        @JsonProperty("ocr_text_only")
        private Boolean ocrTextOnly;

        @JsonProperty("append_ocr_text")
        private Boolean appendOcrText;

        /** Assigns {@code extractImages} ({@code null} allowed) and returns this builder. */
        @JsonProperty("extract_images")
        public Builder withExtractImages(final @Nullable Boolean value) {
            extractImages = value;
            return this;
        }

        /** Assigns {@code targetDpi} ({@code null} allowed) and returns this builder. */
        @JsonProperty("target_dpi")
        public Builder withTargetDpi(final @Nullable Integer value) {
            targetDpi = value;
            return this;
        }

        /** Assigns {@code maxImageDimension} ({@code null} allowed) and returns this builder. */
        @JsonProperty("max_image_dimension")
        public Builder withMaxImageDimension(final @Nullable Integer value) {
            maxImageDimension = value;
            return this;
        }

        /** Assigns {@code injectPlaceholders} ({@code null} allowed) and returns this builder. */
        @JsonProperty("inject_placeholders")
        public Builder withInjectPlaceholders(final @Nullable Boolean value) {
            injectPlaceholders = value;
            return this;
        }

        /** Assigns {@code autoAdjustDpi} ({@code null} allowed) and returns this builder. */
        @JsonProperty("auto_adjust_dpi")
        public Builder withAutoAdjustDpi(final @Nullable Boolean value) {
            autoAdjustDpi = value;
            return this;
        }

        /** Assigns {@code minDpi} ({@code null} allowed) and returns this builder. */
        @JsonProperty("min_dpi")
        public Builder withMinDpi(final @Nullable Integer value) {
            minDpi = value;
            return this;
        }

        /** Assigns {@code maxDpi} ({@code null} allowed) and returns this builder. */
        @JsonProperty("max_dpi")
        public Builder withMaxDpi(final @Nullable Integer value) {
            maxDpi = value;
            return this;
        }

        /** Assigns {@code maxImagesPerPage} ({@code null} allowed) and returns this builder. */
        @JsonProperty("max_images_per_page")
        public Builder withMaxImagesPerPage(final @Nullable Integer value) {
            maxImagesPerPage = value;
            return this;
        }

        /** Assigns {@code classify} ({@code null} allowed) and returns this builder. */
        @JsonProperty("classify")
        public Builder withClassify(final @Nullable Boolean value) {
            classify = value;
            return this;
        }

        /** Assigns {@code includePageRasters} ({@code null} allowed) and returns this builder. */
        @JsonProperty("include_page_rasters")
        public Builder withIncludePageRasters(final @Nullable Boolean value) {
            includePageRasters = value;
            return this;
        }

        /** Assigns {@code runOcrOnImages} ({@code null} allowed) and returns this builder. */
        @JsonProperty("run_ocr_on_images")
        public Builder withRunOcrOnImages(final @Nullable Boolean value) {
            runOcrOnImages = value;
            return this;
        }

        /** Assigns {@code ocrTextOnly} ({@code null} allowed) and returns this builder. */
        @JsonProperty("ocr_text_only")
        public Builder withOcrTextOnly(final @Nullable Boolean value) {
            ocrTextOnly = value;
            return this;
        }

        /** Assigns {@code appendOcrText} ({@code null} allowed) and returns this builder. */
        @JsonProperty("append_ocr_text")
        public Builder withAppendOcrText(final @Nullable Boolean value) {
            appendOcrText = value;
            return this;
        }

        /** Creates the {@link ImageExtractionConfig} from the values collected so far. */
        public ImageExtractionConfig build() {
            return new ImageExtractionConfig(
                extractImages, targetDpi, maxImageDimension, injectPlaceholders,
                autoAdjustDpi, minDpi, maxDpi, maxImagesPerPage,
                classify, includePageRasters, runOcrOnImages, ocrTextOnly,
                appendOcrText);
        }
    }
    // CPD-ON

    /**
     * Placeholder for a default configuration factory; currently unimplemented.
     *
     * @return never returns normally
     * @throws UnsupportedOperationException always; callers are directed to the Builder
     */
    public static ImageExtractionConfig defaultInstance() {
        final String reason = "defaultInstance is not yet bridged via JNI; use the Builder instead.";
        throw new UnsupportedOperationException(reason);
    }
}

View File

@@ -0,0 +1,71 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonValue;
/**
 * Heuristic classification of what an image likely depicts.
 *
 * <p>Each constant carries the snake_case string used on the wire; JSON serialization
 * uses {@link #getValue()} and deserialization uses {@link #fromValue(String)}.
 */
public enum ImageKind {
    /** Photographic image (natural scene, photograph). */
    Photograph("photograph"),

    /** Technical or schematic diagram. */
    Diagram("diagram"),

    /** Chart, graph, or plot. */
    Chart("chart"),

    /** Freehand or technical drawing. */
    Drawing("drawing"),

    /** Text-heavy image (scanned text, document). */
    TextBlock("text_block"),

    /** Decorative element or border. */
    Decoration("decoration"),

    /** Logo or brand mark. */
    Logo("logo"),

    /** Small icon. */
    Icon("icon"),

    /** Fragment of a larger tiled image (tile of a technical drawing). */
    TileFragment("tile_fragment"),

    /** Mask or transparency map. */
    Mask("mask"),

    /** Full-page render produced during OCR preprocessing; used as a citation thumbnail. */
    PageRaster("page_raster"),

    /** Could not classify with reasonable confidence. */
    Unknown("unknown");

    /** Wire-format string for this constant. */
    private final String wireValue;

    ImageKind(final String wireValue) {
        this.wireValue = wireValue;
    }

    /** Returns the wire-format string for this constant. */
    @JsonValue
    public String getValue() {
        return wireValue;
    }

    /**
     * Looks up the constant whose wire string matches {@code value}, ignoring case.
     *
     * @param value wire string to resolve
     * @return the first matching constant in declaration order
     * @throws IllegalArgumentException if nothing matches (including a {@code null} input)
     */
    @JsonCreator
    public static ImageKind fromValue(final String value) {
        final ImageKind[] candidates = values();
        for (int i = 0; i < candidates.length; i++) {
            final ImageKind candidate = candidates[i];
            if (candidate.wireValue.equalsIgnoreCase(value)) {
                return candidate;
            }
        }
        throw new IllegalArgumentException("Unknown value: " + value);
    }

    /** Returns the wire-format string value (matches JSON serialization). */
    @Override
    public String toString() {
        return getValue();
    }
}

View File

@@ -0,0 +1,91 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import java.util.Map;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
/**
 * Image metadata extracted from image files.
 *
 * <p>Includes dimensions, format, and EXIF data. JSON deserialization goes
 * through {@link Builder}.
 */
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = ImageMetadata.Builder.class)
public record ImageMetadata(
    /** Image width in pixels. */
    @JsonProperty("width") int width,

    /** Image height in pixels. */
    @JsonProperty("height") int height,

    /** Image format (e.g., "PNG", "JPEG", "TIFF"). */
    @JsonProperty("format") String format,

    /** EXIF metadata tags. */
    @JsonProperty("exif") Map<String, String> exif
) {
    /** Returns a fresh {@link Builder} holding the builder defaults. */
    public static Builder builder() {
        return new Builder();
    }

    // CPD-OFF
    /**
     * Fluent builder for {@link ImageMetadata}; Jackson populates it through the
     * {@code with}-prefixed setters and then calls {@link #build()}.
     *
     * <p>Defaults: width and height 0, an empty format string, and an empty
     * immutable EXIF map. Setters store their argument as given (no copying and
     * no null check).
     */
    @JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
    public static final class Builder {
        private int width = 0;
        private int height = 0;
        private String format = "";
        private Map<String, String> exif = Map.of();

        /** Assigns {@code width} and returns this builder. */
        @JsonProperty("width")
        public Builder withWidth(final int value) {
            width = value;
            return this;
        }

        /** Assigns {@code height} and returns this builder. */
        @JsonProperty("height")
        public Builder withHeight(final int value) {
            height = value;
            return this;
        }

        /** Assigns {@code format} and returns this builder. */
        @JsonProperty("format")
        public Builder withFormat(final String value) {
            format = value;
            return this;
        }

        /** Assigns {@code exif} and returns this builder. */
        @JsonProperty("exif")
        public Builder withExif(final Map<String, String> value) {
            exif = value;
            return this;
        }

        /** Creates the {@link ImageMetadata} from the values collected so far. */
        public ImageMetadata build() {
            return new ImageMetadata(width, height, format, exif);
        }
    }
    // CPD-ON
}

View File

@@ -0,0 +1,117 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import java.util.List;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
import org.jspecify.annotations.Nullable;
/**
 * Image element metadata.
 *
 * <p>JSON deserialization goes through {@link Builder}.
 */
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = ImageMetadataType.Builder.class)
public record ImageMetadataType(
    /** Image source (URL, data URI, or SVG content). */
    @JsonProperty("src") String src,

    /** Alternative text from the alt attribute. */
    @Nullable @JsonProperty("alt") String alt,

    /** Title attribute. */
    @Nullable @JsonProperty("title") String title,

    /** Image dimensions as (width, height) if available. */
    @Nullable @JsonProperty("dimensions") List<Integer> dimensions,

    /** Image type classification. */
    @JsonProperty("image_type") ImageType imageType,

    /** Additional attributes as key-value pairs. */
    @JsonProperty("attributes") List<List<String>> attributes
) {
    /** Returns a fresh {@link Builder} holding the builder defaults. */
    public static Builder builder() {
        return new Builder();
    }

    // CPD-OFF
    /**
     * Fluent builder for {@link ImageMetadataType}; Jackson populates it through the
     * {@code with}-prefixed setters and then calls {@link #build()}.
     *
     * <p>Defaults: {@code src} is the empty string, {@code attributes} is an empty
     * immutable list, everything else is {@code null}.
     *
     * <p>NOTE(review): {@code imageType} has no non-null default although the record
     * component is not marked {@code @Nullable}; {@link #build()} passes {@code null}
     * unless {@link #withImageType} was called.
     */
    @JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
    public static final class Builder {
        private String src = "";
        private String alt;
        private String title;
        private List<Integer> dimensions;

        @JsonProperty("image_type")
        private ImageType imageType;

        private List<List<String>> attributes = List.of();

        /** Assigns {@code src} and returns this builder. */
        @JsonProperty("src")
        public Builder withSrc(final String value) {
            src = value;
            return this;
        }

        /** Assigns {@code alt} ({@code null} allowed) and returns this builder. */
        @JsonProperty("alt")
        public Builder withAlt(final @Nullable String value) {
            alt = value;
            return this;
        }

        /** Assigns {@code title} ({@code null} allowed) and returns this builder. */
        @JsonProperty("title")
        public Builder withTitle(final @Nullable String value) {
            title = value;
            return this;
        }

        /** Assigns {@code dimensions} ({@code null} allowed) and returns this builder. */
        @JsonProperty("dimensions")
        public Builder withDimensions(final @Nullable List<Integer> value) {
            dimensions = value;
            return this;
        }

        /** Assigns {@code imageType} and returns this builder. */
        @JsonProperty("image_type")
        public Builder withImageType(final ImageType value) {
            imageType = value;
            return this;
        }

        /** Assigns {@code attributes} and returns this builder. */
        @JsonProperty("attributes")
        public Builder withAttributes(final List<List<String>> value) {
            attributes = value;
            return this;
        }

        /** Creates the {@link ImageMetadataType} from the values collected so far. */
        public ImageMetadataType build() {
            return new ImageMetadataType(src, alt, title, dimensions, imageType, attributes);
        }
    }
    // CPD-ON
}

View File

@@ -0,0 +1,142 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
/**
 * Image preprocessing configuration for OCR.
 *
 * <p>These settings control how an image is prepared before OCR in order to improve text
 * recognition quality. Different preprocessing strategies suit different document types.
 *
 * @param targetDpi target DPI for the image (300 is standard, 600 for small text); a value of
 *     {@code 0} is replaced by {@code 300} when the record is constructed
 * @param autoRotate auto-detect and correct image rotation
 * @param deskew correct skew (tilted images)
 * @param denoise remove noise from the image
 * @param contrastEnhance enhance contrast for better text visibility
 * @param binarizationMethod binarization method: "otsu", "sauvola", "adaptive"
 * @param invertColors invert colors (white text on black → black on white)
 */
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = ImagePreprocessingConfig.Builder.class)
public record ImagePreprocessingConfig(
    @JsonProperty("target_dpi") int targetDpi,
    @JsonProperty("auto_rotate") boolean autoRotate,
    @JsonProperty("deskew") boolean deskew,
    @JsonProperty("denoise") boolean denoise,
    @JsonProperty("contrast_enhance") boolean contrastEnhance,
    @JsonProperty("binarization_method") String binarizationMethod,
    @JsonProperty("invert_colors") boolean invertColors
) {
    /** Treats a target DPI of {@code 0} as "unset" and substitutes {@code 300}. */
    public ImagePreprocessingConfig {
        if (targetDpi == 0) {
            targetDpi = 300;
        }
    }

    /**
     * Creates a new {@link Builder} holding the builder's initial values.
     *
     * @return a fresh builder
     */
    public static Builder builder() {
        return new Builder();
    }

    /**
     * Placeholder that is not implemented on the Java side.
     *
     * @return never returns normally
     * @throws UnsupportedOperationException always; use {@link #builder()} instead
     */
    public static ImagePreprocessingConfig defaultInstance() {
        throw new UnsupportedOperationException("defaultInstance is not yet bridged via JNI; use the Builder instead.");
    }

    // CPD-OFF
    /**
     * Fluent builder for {@link ImagePreprocessingConfig}, also used by Jackson for
     * deserialization (setter prefix {@code with}, terminal method {@code build}).
     */
    @JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
    public static final class Builder {
        // 0 means "unset"; the record's compact constructor turns it into 300.
        @JsonProperty("target_dpi")
        private int targetDpi = 0;
        @JsonProperty("auto_rotate")
        private boolean autoRotate = true;
        private boolean deskew = true;
        private boolean denoise = false;
        @JsonProperty("contrast_enhance")
        private boolean contrastEnhance = false;
        @JsonProperty("binarization_method")
        private String binarizationMethod = "otsu";
        @JsonProperty("invert_colors")
        private boolean invertColors = false;

        /**
         * Sets the target DPI.
         *
         * @param targetDpi value to store
         * @return this builder
         */
        @JsonProperty("target_dpi")
        public Builder withTargetDpi(final int targetDpi) {
            this.targetDpi = targetDpi;
            return this;
        }

        /**
         * Sets the auto-rotate flag.
         *
         * @param autoRotate value to store
         * @return this builder
         */
        @JsonProperty("auto_rotate")
        public Builder withAutoRotate(final boolean autoRotate) {
            this.autoRotate = autoRotate;
            return this;
        }

        /**
         * Sets the deskew flag.
         *
         * @param deskew value to store
         * @return this builder
         */
        @JsonProperty("deskew")
        public Builder withDeskew(final boolean deskew) {
            this.deskew = deskew;
            return this;
        }

        /**
         * Sets the denoise flag.
         *
         * @param denoise value to store
         * @return this builder
         */
        @JsonProperty("denoise")
        public Builder withDenoise(final boolean denoise) {
            this.denoise = denoise;
            return this;
        }

        /**
         * Sets the contrast-enhance flag.
         *
         * @param contrastEnhance value to store
         * @return this builder
         */
        @JsonProperty("contrast_enhance")
        public Builder withContrastEnhance(final boolean contrastEnhance) {
            this.contrastEnhance = contrastEnhance;
            return this;
        }

        /**
         * Sets the binarization method.
         *
         * @param binarizationMethod value to store
         * @return this builder
         */
        @JsonProperty("binarization_method")
        public Builder withBinarizationMethod(final String binarizationMethod) {
            this.binarizationMethod = binarizationMethod;
            return this;
        }

        /**
         * Sets the invert-colors flag.
         *
         * @param invertColors value to store
         * @return this builder
         */
        @JsonProperty("invert_colors")
        public Builder withInvertColors(final boolean invertColors) {
            this.invertColors = invertColors;
            return this;
        }

        /**
         * Creates the record from the values collected so far.
         *
         * @return a new {@link ImagePreprocessingConfig}
         */
        public ImagePreprocessingConfig build() {
            return new ImagePreprocessingConfig(
                targetDpi, autoRotate, deskew, denoise,
                contrastEnhance, binarizationMethod, invertColors);
        }
    }
    // CPD-ON
}

View File

@@ -0,0 +1,209 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import java.util.List;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
import org.jspecify.annotations.Nullable;
/**
 * Image preprocessing metadata.
 *
 * <p>Tracks the transformations applied to an image during OCR preprocessing,
 * including DPI normalization, resizing, and resampling.
 */
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = ImagePreprocessingMetadata.Builder.class)
public record ImagePreprocessingMetadata(
    /**
     * Original image dimensions (width, height) in pixels
     */
    @JsonProperty("original_dimensions") List<Long> originalDimensions,
    /**
     * Original image DPI (horizontal, vertical)
     */
    @JsonProperty("original_dpi") List<Double> originalDpi,
    /**
     * Target DPI from configuration
     */
    @JsonProperty("target_dpi") int targetDpi,
    /**
     * Scaling factor applied to the image
     */
    @JsonProperty("scale_factor") double scaleFactor,
    /**
     * Whether DPI was auto-adjusted based on content
     */
    @JsonProperty("auto_adjusted") boolean autoAdjusted,
    /**
     * Final DPI after processing
     */
    @JsonProperty("final_dpi") int finalDpi,
    /**
     * New dimensions after resizing (if resized)
     */
    @Nullable @JsonProperty("new_dimensions") List<Long> newDimensions,
    /**
     * Resampling algorithm used ("LANCZOS3", "CATMULLROM", etc.)
     */
    @JsonProperty("resample_method") String resampleMethod,
    /**
     * Whether dimensions were clamped to max_image_dimension
     */
    @JsonProperty("dimension_clamped") boolean dimensionClamped,
    /**
     * Calculated optimal DPI (if auto_adjust_dpi enabled)
     */
    @Nullable @JsonProperty("calculated_dpi") Integer calculatedDpi,
    /**
     * Whether resize was skipped (dimensions already optimal)
     */
    @JsonProperty("skipped_resize") boolean skippedResize,
    /**
     * Error message if resize failed
     */
    @Nullable @JsonProperty("resize_error") String resizeError
) {
    /**
     * Creates a new {@link Builder} holding the builder's initial values.
     *
     * @return a fresh builder
     */
    public static Builder builder() {
        return new Builder();
    }

    // CPD-OFF
    /**
     * Fluent builder for {@link ImagePreprocessingMetadata}, also used by Jackson for
     * deserialization (setter prefix {@code with}, terminal method {@code build}).
     *
     * <p>Fields keep the initial values below until the matching {@code with*} method is called.
     */
    @JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
    public static final class Builder {
        @JsonProperty("original_dimensions")
        private List<Long> originalDimensions = List.of();
        @JsonProperty("original_dpi")
        private List<Double> originalDpi = List.of();
        @JsonProperty("target_dpi")
        private int targetDpi = 0;
        @JsonProperty("scale_factor")
        private double scaleFactor = 0.0;
        @JsonProperty("auto_adjusted")
        private boolean autoAdjusted = false;
        @JsonProperty("final_dpi")
        private int finalDpi = 0;
        @JsonProperty("new_dimensions")
        private @Nullable List<Long> newDimensions = null;
        @JsonProperty("resample_method")
        private String resampleMethod = "";
        @JsonProperty("dimension_clamped")
        private boolean dimensionClamped = false;
        @JsonProperty("calculated_dpi")
        private @Nullable Integer calculatedDpi = null;
        @JsonProperty("skipped_resize")
        private boolean skippedResize = false;
        @JsonProperty("resize_error")
        private @Nullable String resizeError = null;

        /**
         * Sets the originalDimensions field.
         *
         * @param value original image dimensions; the list reference is kept as-is, not copied
         * @return this builder
         */
        @JsonProperty("original_dimensions")
        public Builder withOriginalDimensions(final List<Long> value) {
            this.originalDimensions = value;
            return this;
        }

        /**
         * Sets the originalDpi field.
         *
         * @param value original image DPI; the list reference is kept as-is, not copied
         * @return this builder
         */
        @JsonProperty("original_dpi")
        public Builder withOriginalDpi(final List<Double> value) {
            this.originalDpi = value;
            return this;
        }

        /**
         * Sets the targetDpi field.
         *
         * @param value target DPI
         * @return this builder
         */
        @JsonProperty("target_dpi")
        public Builder withTargetDpi(final int value) {
            this.targetDpi = value;
            return this;
        }

        /**
         * Sets the scaleFactor field.
         *
         * @param value scaling factor
         * @return this builder
         */
        @JsonProperty("scale_factor")
        public Builder withScaleFactor(final double value) {
            this.scaleFactor = value;
            return this;
        }

        /**
         * Sets the autoAdjusted field.
         *
         * @param value whether DPI was auto-adjusted
         * @return this builder
         */
        @JsonProperty("auto_adjusted")
        public Builder withAutoAdjusted(final boolean value) {
            this.autoAdjusted = value;
            return this;
        }

        /**
         * Sets the finalDpi field.
         *
         * @param value final DPI
         * @return this builder
         */
        @JsonProperty("final_dpi")
        public Builder withFinalDpi(final int value) {
            this.finalDpi = value;
            return this;
        }

        /**
         * Sets the newDimensions field.
         *
         * @param value new dimensions, or {@code null} when absent
         * @return this builder
         */
        @JsonProperty("new_dimensions")
        public Builder withNewDimensions(final @Nullable List<Long> value) {
            this.newDimensions = value;
            return this;
        }

        /**
         * Sets the resampleMethod field.
         *
         * @param value resampling algorithm name
         * @return this builder
         */
        @JsonProperty("resample_method")
        public Builder withResampleMethod(final String value) {
            this.resampleMethod = value;
            return this;
        }

        /**
         * Sets the dimensionClamped field.
         *
         * @param value whether dimensions were clamped
         * @return this builder
         */
        @JsonProperty("dimension_clamped")
        public Builder withDimensionClamped(final boolean value) {
            this.dimensionClamped = value;
            return this;
        }

        /**
         * Sets the calculatedDpi field.
         *
         * <p>The parameter is the boxed {@code Integer}, matching the nullable record component:
         * a primitive {@code int} cannot carry {@code null}, so an explicit JSON {@code null}
         * could not be preserved and callers had no way to clear the value. Existing callers
         * that pass an {@code int} keep compiling through boxing.
         *
         * @param value calculated optimal DPI, or {@code null} when absent
         * @return this builder
         */
        @JsonProperty("calculated_dpi")
        public Builder withCalculatedDpi(final @Nullable Integer value) {
            this.calculatedDpi = value;
            return this;
        }

        /**
         * Sets the skippedResize field.
         *
         * @param value whether resize was skipped
         * @return this builder
         */
        @JsonProperty("skipped_resize")
        public Builder withSkippedResize(final boolean value) {
            this.skippedResize = value;
            return this;
        }

        /**
         * Sets the resizeError field.
         *
         * @param value resize error message, or {@code null} when absent
         * @return this builder
         */
        @JsonProperty("resize_error")
        public Builder withResizeError(final @Nullable String value) {
            this.resizeError = value;
            return this;
        }

        /**
         * Builds the ImagePreprocessingMetadata instance from the values collected so far.
         *
         * @return a new {@link ImagePreprocessingMetadata}
         */
        public ImagePreprocessingMetadata build() {
            return new ImagePreprocessingMetadata(
                originalDimensions,
                originalDpi,
                targetDpi,
                scaleFactor,
                autoAdjusted,
                finalDpi,
                newDimensions,
                resampleMethod,
                dimensionClamped,
                calculatedDpi,
                skippedResize,
                resizeError
            );
        }
    }
    // CPD-ON
}

View File

@@ -0,0 +1,15 @@
// DO NOT EDIT - auto-generated by alef
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
package dev.kreuzberg;
/**
 * Exception subtype of {@link KreuzbergErrorException}.
 *
 * <p>NOTE(review): judging by the name, this signals image-processing failures; the throw
 * sites are not part of this file, so confirm against the callers.
 */
public class ImageProcessingException extends KreuzbergErrorException {
/**
 * Creates a new ImageProcessingException with the given message.
 *
 * @param message detail message, passed unchanged to the superclass constructor
 */
public ImageProcessingException(final String message) {
super(message);
}
/**
 * Creates a new ImageProcessingException with the given message and cause.
 *
 * @param message detail message, passed unchanged to the superclass constructor
 * @param cause underlying throwable, passed unchanged to the superclass constructor
 */
public ImageProcessingException(final String message, final Throwable cause) {
super(message, cause);
}
}

Some files were not shown because too many files have changed in this diff Show More