357 lines
12 KiB
Java
Generated
357 lines
12 KiB
Java
Generated
// This file is auto-generated by alef — DO NOT EDIT.
|
|
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
|
// To regenerate: alef generate
|
|
// To verify freshness: alef verify --exit-code
|
|
// Issues & docs: https://github.com/kreuzberg-dev/alef
|
|
package dev.kreuzberg;
|
|
|
|
import java.util.List;
|
|
import java.util.Map;
|
|
import com.fasterxml.jackson.databind.JsonNode;
|
|
import com.fasterxml.jackson.annotation.JsonProperty;
|
|
import com.fasterxml.jackson.annotation.JsonInclude;
|
|
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
|
|
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
|
|
import org.jspecify.annotations.Nullable;
|
|
|
|
/**
|
|
* Extraction result metadata.
|
|
*
|
|
* Contains common fields applicable to all formats, format-specific metadata
|
|
* via a discriminated union, and additional custom fields from postprocessors.
|
|
*/
|
|
@JsonInclude(JsonInclude.Include.NON_ABSENT)
|
|
@JsonDeserialize(builder = Metadata.Builder.class)
|
|
public record Metadata(
|
|
/**
|
|
* Document title
|
|
*/
|
|
@Nullable @JsonProperty("title") String title,
|
|
/**
|
|
* Document subject or description
|
|
*/
|
|
@Nullable @JsonProperty("subject") String subject,
|
|
/**
|
|
* Primary author(s) - always Vec for consistency
|
|
*/
|
|
@Nullable @JsonProperty("authors") List<String> authors,
|
|
/**
|
|
* Keywords/tags - always Vec for consistency
|
|
*/
|
|
@Nullable @JsonProperty("keywords") List<String> keywords,
|
|
/**
|
|
* Primary language (ISO 639 code)
|
|
*/
|
|
@Nullable @JsonProperty("language") String language,
|
|
/**
|
|
* Creation timestamp (ISO 8601 format)
|
|
*/
|
|
@Nullable @JsonProperty("created_at") String createdAt,
|
|
/**
|
|
* Last modification timestamp (ISO 8601 format)
|
|
*/
|
|
@Nullable @JsonProperty("modified_at") String modifiedAt,
|
|
/**
|
|
* User who created the document
|
|
*/
|
|
@Nullable @JsonProperty("created_by") String createdBy,
|
|
/**
|
|
* User who last modified the document
|
|
*/
|
|
@Nullable @JsonProperty("modified_by") String modifiedBy,
|
|
/**
|
|
* Page/slide/sheet structure with boundaries
|
|
*/
|
|
@Nullable @JsonProperty("pages") PageStructure pages,
|
|
/**
|
|
* Format-specific metadata (discriminated union)
|
|
*
|
|
* Contains detailed metadata specific to the document format.
|
|
* Serialized as a nested {@code "format"} object with a {@code format_type} discriminator field.
|
|
*/
|
|
@JsonDeserialize(using = FormatMetadataDeserializer.class) @Nullable @JsonProperty("format") FormatMetadata format,
|
|
/**
|
|
* Image preprocessing metadata (when OCR preprocessing was applied)
|
|
*/
|
|
@Nullable @JsonProperty("image_preprocessing") ImagePreprocessingMetadata imagePreprocessing,
|
|
/**
|
|
* JSON schema (for structured data extraction)
|
|
*/
|
|
@Nullable @JsonProperty("json_schema") JsonNode jsonSchema,
|
|
/**
|
|
* Error metadata (for batch operations)
|
|
*/
|
|
@Nullable @JsonProperty("error") ErrorMetadata error,
|
|
/**
|
|
* Extraction duration in milliseconds (for benchmarking).
|
|
*
|
|
* This field is populated by batch extraction to provide per-file timing
|
|
* information. It's {@code None} for single-file extraction (which uses external timing).
|
|
*/
|
|
@Nullable @JsonProperty("extraction_duration_ms") Long extractionDurationMs,
|
|
/**
|
|
* Document category (from frontmatter or classification).
|
|
*/
|
|
@Nullable @JsonProperty("category") String category,
|
|
/**
|
|
* Document tags (from frontmatter).
|
|
*/
|
|
@Nullable @JsonProperty("tags") List<String> tags,
|
|
/**
|
|
* Document version string (from frontmatter).
|
|
*/
|
|
@Nullable @JsonProperty("document_version") String documentVersion,
|
|
/**
|
|
* Abstract or summary text (from frontmatter).
|
|
*/
|
|
@Nullable @JsonProperty("abstract_text") String abstractText,
|
|
/**
|
|
* Output format identifier (e.g., "markdown", "html", "text").
|
|
*
|
|
* Set by the output format pipeline stage when format conversion is applied.
|
|
* Previously stored in {@code metadata.additional["output_format"]}.
|
|
*/
|
|
@Nullable @JsonProperty("output_format") String outputFormat,
|
|
/**
|
|
* Whether OCR was used during extraction.
|
|
*
|
|
* Set to {@code true} whenever the extraction pipeline ran an OCR backend
|
|
* (Tesseract, PaddleOCR, VLM, etc.) and used that output as the primary
|
|
* or fallback text. {@code false} means native text extraction was used exclusively.
|
|
*/
|
|
@Nullable @JsonProperty("ocr_used") Boolean ocrUsed,
|
|
/**
|
|
* Additional custom fields from postprocessors.
|
|
*
|
|
* Serialized as a nested {@code "additional"} object (not flattened at root level).
|
|
* Uses {@code Cow<'static, str>} keys so static string keys avoid allocation.
|
|
*/
|
|
@Nullable @JsonProperty("additional") Map<String, JsonNode> additional
|
|
) {
|
|
public static Builder builder() {
|
|
return new Builder();
|
|
}
|
|
|
|
// CPD-OFF
|
|
@JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
|
|
public static final class Builder {
|
|
|
|
private String title = null;
|
|
private String subject = null;
|
|
private List<String> authors = null;
|
|
private List<String> keywords = null;
|
|
private String language = null;
|
|
@JsonProperty("created_at")
|
|
private String createdAt = null;
|
|
@JsonProperty("modified_at")
|
|
private String modifiedAt = null;
|
|
@JsonProperty("created_by")
|
|
private String createdBy = null;
|
|
@JsonProperty("modified_by")
|
|
private String modifiedBy = null;
|
|
private PageStructure pages = null;
|
|
private FormatMetadata format = null;
|
|
@JsonProperty("image_preprocessing")
|
|
private ImagePreprocessingMetadata imagePreprocessing = null;
|
|
@JsonProperty("json_schema")
|
|
private JsonNode jsonSchema = null;
|
|
private ErrorMetadata error = null;
|
|
@JsonProperty("extraction_duration_ms")
|
|
private Long extractionDurationMs = null;
|
|
private String category = null;
|
|
private List<String> tags = null;
|
|
@JsonProperty("document_version")
|
|
private String documentVersion = null;
|
|
@JsonProperty("abstract_text")
|
|
private String abstractText = null;
|
|
@JsonProperty("output_format")
|
|
private String outputFormat = null;
|
|
@JsonProperty("ocr_used")
|
|
private Boolean ocrUsed = null;
|
|
private Map<String, JsonNode> additional = null;
|
|
|
|
/** Sets the title field. */
|
|
@JsonProperty("title")
|
|
public Builder withTitle(final @Nullable String value) {
|
|
this.title = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the subject field. */
|
|
@JsonProperty("subject")
|
|
public Builder withSubject(final @Nullable String value) {
|
|
this.subject = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the authors field. */
|
|
@JsonProperty("authors")
|
|
public Builder withAuthors(final @Nullable List<String> value) {
|
|
this.authors = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the keywords field. */
|
|
@JsonProperty("keywords")
|
|
public Builder withKeywords(final @Nullable List<String> value) {
|
|
this.keywords = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the language field. */
|
|
@JsonProperty("language")
|
|
public Builder withLanguage(final @Nullable String value) {
|
|
this.language = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the createdAt field. */
|
|
@JsonProperty("created_at")
|
|
public Builder withCreatedAt(final @Nullable String value) {
|
|
this.createdAt = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the modifiedAt field. */
|
|
@JsonProperty("modified_at")
|
|
public Builder withModifiedAt(final @Nullable String value) {
|
|
this.modifiedAt = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the createdBy field. */
|
|
@JsonProperty("created_by")
|
|
public Builder withCreatedBy(final @Nullable String value) {
|
|
this.createdBy = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the modifiedBy field. */
|
|
@JsonProperty("modified_by")
|
|
public Builder withModifiedBy(final @Nullable String value) {
|
|
this.modifiedBy = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the pages field. */
|
|
@JsonProperty("pages")
|
|
public Builder withPages(final @Nullable PageStructure value) {
|
|
this.pages = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the format field. */
|
|
@JsonProperty("format")
|
|
public Builder withFormat(final @Nullable FormatMetadata value) {
|
|
this.format = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the imagePreprocessing field. */
|
|
@JsonProperty("image_preprocessing")
|
|
public Builder withImagePreprocessing(final @Nullable ImagePreprocessingMetadata value) {
|
|
this.imagePreprocessing = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the jsonSchema field. */
|
|
@JsonProperty("json_schema")
|
|
public Builder withJsonSchema(final @Nullable JsonNode value) {
|
|
this.jsonSchema = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the error field. */
|
|
@JsonProperty("error")
|
|
public Builder withError(final @Nullable ErrorMetadata value) {
|
|
this.error = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the extractionDurationMs field. */
|
|
@JsonProperty("extraction_duration_ms")
|
|
public Builder withExtractionDurationMs(final @Nullable long value) {
|
|
this.extractionDurationMs = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the category field. */
|
|
@JsonProperty("category")
|
|
public Builder withCategory(final @Nullable String value) {
|
|
this.category = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the tags field. */
|
|
@JsonProperty("tags")
|
|
public Builder withTags(final @Nullable List<String> value) {
|
|
this.tags = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the documentVersion field. */
|
|
@JsonProperty("document_version")
|
|
public Builder withDocumentVersion(final @Nullable String value) {
|
|
this.documentVersion = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the abstractText field. */
|
|
@JsonProperty("abstract_text")
|
|
public Builder withAbstractText(final @Nullable String value) {
|
|
this.abstractText = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the outputFormat field. */
|
|
@JsonProperty("output_format")
|
|
public Builder withOutputFormat(final @Nullable String value) {
|
|
this.outputFormat = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the ocrUsed field. */
|
|
@JsonProperty("ocr_used")
|
|
public Builder withOcrUsed(final @Nullable Boolean value) {
|
|
this.ocrUsed = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the additional field. */
|
|
@JsonProperty("additional")
|
|
public Builder withAdditional(final @Nullable Map<String, JsonNode> value) {
|
|
this.additional = value;
|
|
return this;
|
|
}
|
|
|
|
/** Builds the Metadata instance. */
|
|
public Metadata build() {
|
|
return new Metadata(
|
|
title,
|
|
subject,
|
|
authors,
|
|
keywords,
|
|
language,
|
|
createdAt,
|
|
modifiedAt,
|
|
createdBy,
|
|
modifiedBy,
|
|
pages,
|
|
format,
|
|
imagePreprocessing,
|
|
jsonSchema,
|
|
error,
|
|
extractionDurationMs,
|
|
category,
|
|
tags,
|
|
documentVersion,
|
|
abstractText,
|
|
outputFormat,
|
|
ocrUsed,
|
|
additional
|
|
);
|
|
}
|
|
}
|
|
// CPD-ON
|
|
}
|