// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
|
|
package dev.kreuzberg;
|
|
|
|
import java.util.List;
|
|
import com.fasterxml.jackson.databind.JsonNode;
|
|
import com.fasterxml.jackson.annotation.JsonProperty;
|
|
import com.fasterxml.jackson.annotation.JsonInclude;
|
|
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
|
|
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
|
|
import org.jspecify.annotations.Nullable;
|
|
|
|
/**
|
|
* General extraction result used by the core extraction API.
|
|
*
|
|
* This is the main result type returned by all extraction functions.
|
|
*/
|
|
@JsonInclude(JsonInclude.Include.NON_ABSENT)
|
|
@JsonDeserialize(builder = ExtractionResult.Builder.class)
|
|
public record ExtractionResult(
|
|
@JsonProperty("content") String content,
|
|
@JsonProperty("mime_type") String mimeType,
|
|
@JsonProperty("metadata") Metadata metadata,
|
|
/**
|
|
* Extraction strategy used to produce the returned text.
|
|
*
|
|
* Populated when the extractor can reliably distinguish native text extraction,
|
|
* OCR-only extraction, or mixed native/OCR output.
|
|
*/
|
|
@Nullable @JsonProperty("extraction_method") ExtractionMethod extractionMethod,
|
|
@JsonProperty("tables") List<Table> tables,
|
|
@Nullable @JsonProperty("detected_languages") List<String> detectedLanguages,
|
|
/**
|
|
* Text chunks when chunking is enabled.
|
|
*
|
|
* When chunking configuration is provided, the content is split into
|
|
* overlapping chunks for efficient processing. Each chunk contains the text,
|
|
* optional embeddings (if enabled), and metadata about its position.
|
|
*/
|
|
@Nullable @JsonProperty("chunks") List<Chunk> chunks,
|
|
/**
|
|
* Extracted images from the document.
|
|
*
|
|
* When image extraction is enabled via {@code ImageExtractionConfig}, this field
|
|
* contains all images found in the document with their raw data and metadata.
|
|
* Each image may optionally contain a nested {@code ocr_result} if OCR was performed.
|
|
*/
|
|
@Nullable @JsonProperty("images") List<ExtractedImage> images,
|
|
/**
|
|
* Per-page content when page extraction is enabled.
|
|
*
|
|
* When page extraction is configured, the document is split into per-page content
|
|
* with tables and images mapped to their respective pages.
|
|
*/
|
|
@Nullable @JsonProperty("pages") List<PageContent> pages,
|
|
/**
|
|
* Semantic elements when element-based result format is enabled.
|
|
*
|
|
* When result_format is set to ElementBased, this field contains semantic
|
|
* elements with type classification, unique identifiers, and metadata for
|
|
* Unstructured-compatible element-based processing.
|
|
*/
|
|
@Nullable @JsonProperty("elements") List<Element> elements,
|
|
/**
|
|
* Rich Djot content structure (when extracting Djot documents).
|
|
*
|
|
* When extracting Djot documents with structured extraction enabled,
|
|
* this field contains the full semantic structure including:
|
|
* - Block-level elements with nesting
|
|
* - Inline formatting with attributes
|
|
* - Links, images, footnotes
|
|
* - Math expressions
|
|
* - Complete attribute information
|
|
*
|
|
* The {@code content} field still contains plain text for backward compatibility.
|
|
*
|
|
* Always {@code None} for non-Djot documents.
|
|
*/
|
|
@Nullable @JsonProperty("djot_content") DjotContent djotContent,
|
|
/**
|
|
* OCR elements with full spatial and confidence metadata.
|
|
*
|
|
* When OCR is performed with element extraction enabled, this field contains
|
|
* the structured representation of detected text including:
|
|
* - Bounding geometry (rectangles or quadrilaterals)
|
|
* - Confidence scores (detection and recognition)
|
|
* - Rotation information
|
|
* - Hierarchical relationships (Tesseract only)
|
|
*
|
|
* This field preserves all metadata that would otherwise be lost when
|
|
* converting to plain text or markdown output formats.
|
|
*
|
|
* Only populated when {@code OcrElementConfig.include_elements} is true.
|
|
*/
|
|
@Nullable @JsonProperty("ocr_elements") List<OcrElement> ocrElements,
|
|
/**
|
|
* Structured document tree (when document structure extraction is enabled).
|
|
*
|
|
* When {@code include_document_structure} is true in {@code ExtractionConfig}, this field
|
|
* contains the full hierarchical representation of the document including:
|
|
* - Heading-driven section nesting
|
|
* - Table grids with cell-level metadata
|
|
* - Content layer classification (body, header, footer, footnote)
|
|
* - Inline text annotations (formatting, links)
|
|
* - Bounding boxes and page numbers
|
|
*
|
|
* Independent of {@code result_format} — can be combined with Unified or ElementBased.
|
|
*/
|
|
@Nullable @JsonProperty("document") DocumentStructure document,
|
|
/**
|
|
* Extracted keywords when keyword extraction is enabled.
|
|
*
|
|
* When keyword extraction (RAKE or YAKE) is configured, this field contains
|
|
* the extracted keywords with scores, algorithm info, and position data.
|
|
* Previously stored in {@code metadata.additional["keywords"]}.
|
|
*/
|
|
@Nullable @JsonProperty("extracted_keywords") List<Keyword> extractedKeywords,
|
|
/**
|
|
* Document quality score from quality analysis.
|
|
*
|
|
* A value between 0.0 and 1.0 indicating the overall text quality.
|
|
* Previously stored in {@code metadata.additional["quality_score"]}.
|
|
*/
|
|
@Nullable @JsonProperty("quality_score") Double qualityScore,
|
|
/**
|
|
* Non-fatal warnings collected during processing pipeline stages.
|
|
*
|
|
* Captures errors from optional pipeline features (embedding, chunking,
|
|
* language detection, output formatting) that don't prevent extraction
|
|
* but may indicate degraded results.
|
|
* Previously stored as individual keys in {@code metadata.additional}.
|
|
*/
|
|
@Nullable @JsonProperty("processing_warnings") List<ProcessingWarning> processingWarnings,
|
|
/**
|
|
* PDF annotations extracted from the document.
|
|
*
|
|
* When annotation extraction is enabled via {@code PdfConfig.extract_annotations},
|
|
* this field contains text notes, highlights, links, stamps, and other
|
|
* annotations found in PDF documents.
|
|
*/
|
|
@Nullable @JsonProperty("annotations") List<PdfAnnotation> annotations,
|
|
/**
|
|
* Nested extraction results from archive contents.
|
|
*
|
|
* When extracting archives, each processable file inside produces its own
|
|
* full extraction result. Set to {@code None} for non-archive formats.
|
|
* Use {@code max_archive_depth} in config to control recursion depth.
|
|
*/
|
|
@Nullable @JsonProperty("children") List<ArchiveEntry> children,
|
|
/**
|
|
* URIs/links discovered during document extraction.
|
|
*
|
|
* Contains hyperlinks, image references, citations, email addresses, and
|
|
* other URI-like references found in the document. Always extracted when
|
|
* present in the source document.
|
|
*/
|
|
@Nullable @JsonProperty("uris") List<ExtractedUri> uris,
|
|
/**
|
|
* Tracked changes embedded in the source document.
|
|
*
|
|
* Populated by per-format extractors that understand change-tracking
|
|
* metadata (DOCX {@code w:ins}/{@code w:del}/{@code w:rPrChange}, ODT {@code text:change-*},
|
|
* …). Every extractor defaults to {@code None} until its format-specific
|
|
* implementation is added. Extractors that do populate this field follow
|
|
* the "accepted-changes" convention: inserted text is present in
|
|
* {@code content}, deleted text is absent — the revision list is the separate
|
|
* audit trail.
|
|
*/
|
|
@Nullable @JsonProperty("revisions") List<DocumentRevision> revisions,
|
|
/**
|
|
* Structured extraction output from LLM-based JSON schema extraction.
|
|
*
|
|
* When {@code structured_extraction} is configured in {@code ExtractionConfig}, the
|
|
* extracted document content is sent to a VLM with the provided JSON schema.
|
|
* The response is parsed and stored here as a JSON value matching the schema.
|
|
*/
|
|
@Nullable @JsonProperty("structured_output") JsonNode structuredOutput,
|
|
/**
|
|
* Code intelligence results from tree-sitter analysis.
|
|
*
|
|
* Populated when extracting source code files with the {@code tree-sitter} feature.
|
|
* Contains metrics, structural analysis, imports/exports, comments,
|
|
* docstrings, symbols, diagnostics, and optionally chunked code segments.
|
|
*
|
|
* Stored as an opaque JSON value so that all language bindings (Go, Java,
|
|
* C#, …) can deserialize it as a raw JSON object rather than a typed struct.
|
|
* The underlying type is {@code tree_sitter_language_pack.ProcessResult}.
|
|
*/
|
|
@Nullable @JsonProperty("code_intelligence") JsonNode codeIntelligence,
|
|
/**
|
|
* LLM token usage and cost data for all LLM calls made during this extraction.
|
|
*
|
|
* Contains one entry per LLM call. Multiple entries are produced when
|
|
* VLM OCR, structured extraction, or LLM embeddings run during
|
|
* the same extraction.
|
|
*
|
|
* {@code None} when no LLM was used.
|
|
*/
|
|
@Nullable @JsonProperty("llm_usage") List<LlmUsage> llmUsage,
|
|
/**
|
|
* Pre-rendered content in the requested output format.
|
|
*
|
|
* Populated during {@code derive_extraction_result} before tree derivation consumes
|
|
* element data. {@code apply_output_format} swaps this into {@code content} at the end
|
|
* of the pipeline, after post-processors have operated on plain text.
|
|
*/
|
|
@Nullable @JsonProperty("formatted_content") String formattedContent,
|
|
/**
|
|
* Structured hOCR document for the OCR+layout pipeline.
|
|
*
|
|
* When tesseract produces hOCR output, the parsed {@code InternalDocument} carries
|
|
* paragraph structure with bounding boxes and confidence scores. The layout
|
|
* classification step enriches these elements before final rendering.
|
|
*/
|
|
@Nullable @JsonProperty("ocr_internal_document") String ocrInternalDocument
|
|
) {
|
|
public static Builder builder() {
|
|
return new Builder();
|
|
}
|
|
|
|
// CPD-OFF
|
|
@JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
|
|
public static final class Builder {
|
|
|
|
private String content = "";
|
|
@JsonProperty("mime_type")
|
|
private String mimeType = "";
|
|
private Metadata metadata = null;
|
|
@JsonProperty("extraction_method")
|
|
@Nullable private ExtractionMethod extractionMethod = null;
|
|
private List<Table> tables = List.of();
|
|
@JsonProperty("detected_languages")
|
|
private List<String> detectedLanguages = null;
|
|
private List<Chunk> chunks = null;
|
|
private List<ExtractedImage> images = null;
|
|
private List<PageContent> pages = null;
|
|
private List<Element> elements = null;
|
|
@JsonProperty("djot_content")
|
|
@Nullable private DjotContent djotContent = null;
|
|
@JsonProperty("ocr_elements")
|
|
private List<OcrElement> ocrElements = null;
|
|
@Nullable private DocumentStructure document = null;
|
|
@JsonProperty("extracted_keywords")
|
|
private List<Keyword> extractedKeywords = null;
|
|
@JsonProperty("quality_score")
|
|
private Double qualityScore = null;
|
|
@JsonProperty("processing_warnings")
|
|
private List<ProcessingWarning> processingWarnings = null;
|
|
private List<PdfAnnotation> annotations = null;
|
|
private List<ArchiveEntry> children = null;
|
|
private List<ExtractedUri> uris = null;
|
|
private List<DocumentRevision> revisions = null;
|
|
@JsonProperty("structured_output")
|
|
private JsonNode structuredOutput = null;
|
|
@JsonProperty("code_intelligence")
|
|
private JsonNode codeIntelligence = null;
|
|
@JsonProperty("llm_usage")
|
|
private List<LlmUsage> llmUsage = null;
|
|
@JsonProperty("formatted_content")
|
|
private String formattedContent = null;
|
|
@JsonProperty("ocr_internal_document")
|
|
private String ocrInternalDocument = null;
|
|
|
|
/** Sets the content field. */
|
|
@JsonProperty("content")
|
|
public Builder withContent(final String value) {
|
|
this.content = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the mimeType field. */
|
|
@JsonProperty("mime_type")
|
|
public Builder withMimeType(final String value) {
|
|
this.mimeType = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the metadata field. */
|
|
@JsonProperty("metadata")
|
|
public Builder withMetadata(final Metadata value) {
|
|
this.metadata = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the extractionMethod field. */
|
|
@JsonProperty("extraction_method")
|
|
public Builder withExtractionMethod(final @Nullable ExtractionMethod value) {
|
|
this.extractionMethod = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the tables field. */
|
|
@JsonProperty("tables")
|
|
public Builder withTables(final List<Table> value) {
|
|
this.tables = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the detectedLanguages field. */
|
|
@JsonProperty("detected_languages")
|
|
public Builder withDetectedLanguages(final @Nullable List<String> value) {
|
|
this.detectedLanguages = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the chunks field. */
|
|
@JsonProperty("chunks")
|
|
public Builder withChunks(final @Nullable List<Chunk> value) {
|
|
this.chunks = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the images field. */
|
|
@JsonProperty("images")
|
|
public Builder withImages(final @Nullable List<ExtractedImage> value) {
|
|
this.images = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the pages field. */
|
|
@JsonProperty("pages")
|
|
public Builder withPages(final @Nullable List<PageContent> value) {
|
|
this.pages = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the elements field. */
|
|
@JsonProperty("elements")
|
|
public Builder withElements(final @Nullable List<Element> value) {
|
|
this.elements = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the djotContent field. */
|
|
@JsonProperty("djot_content")
|
|
public Builder withDjotContent(final @Nullable DjotContent value) {
|
|
this.djotContent = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the ocrElements field. */
|
|
@JsonProperty("ocr_elements")
|
|
public Builder withOcrElements(final @Nullable List<OcrElement> value) {
|
|
this.ocrElements = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the document field. */
|
|
@JsonProperty("document")
|
|
public Builder withDocument(final @Nullable DocumentStructure value) {
|
|
this.document = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the extractedKeywords field. */
|
|
@JsonProperty("extracted_keywords")
|
|
public Builder withExtractedKeywords(final @Nullable List<Keyword> value) {
|
|
this.extractedKeywords = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the qualityScore field. */
|
|
@JsonProperty("quality_score")
|
|
public Builder withQualityScore(final @Nullable Double value) {
|
|
this.qualityScore = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the processingWarnings field. */
|
|
@JsonProperty("processing_warnings")
|
|
public Builder withProcessingWarnings(final @Nullable List<ProcessingWarning> value) {
|
|
this.processingWarnings = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the annotations field. */
|
|
@JsonProperty("annotations")
|
|
public Builder withAnnotations(final @Nullable List<PdfAnnotation> value) {
|
|
this.annotations = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the children field. */
|
|
@JsonProperty("children")
|
|
public Builder withChildren(final @Nullable List<ArchiveEntry> value) {
|
|
this.children = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the uris field. */
|
|
@JsonProperty("uris")
|
|
public Builder withUris(final @Nullable List<ExtractedUri> value) {
|
|
this.uris = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the revisions field. */
|
|
@JsonProperty("revisions")
|
|
public Builder withRevisions(final @Nullable List<DocumentRevision> value) {
|
|
this.revisions = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the structuredOutput field. */
|
|
@JsonProperty("structured_output")
|
|
public Builder withStructuredOutput(final @Nullable JsonNode value) {
|
|
this.structuredOutput = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the codeIntelligence field. */
|
|
@JsonProperty("code_intelligence")
|
|
public Builder withCodeIntelligence(final @Nullable JsonNode value) {
|
|
this.codeIntelligence = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the llmUsage field. */
|
|
@JsonProperty("llm_usage")
|
|
public Builder withLlmUsage(final @Nullable List<LlmUsage> value) {
|
|
this.llmUsage = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the formattedContent field. */
|
|
@JsonProperty("formatted_content")
|
|
public Builder withFormattedContent(final @Nullable String value) {
|
|
this.formattedContent = value;
|
|
return this;
|
|
}
|
|
|
|
/** Sets the ocrInternalDocument field. */
|
|
@JsonProperty("ocr_internal_document")
|
|
public Builder withOcrInternalDocument(final @Nullable String value) {
|
|
this.ocrInternalDocument = value;
|
|
return this;
|
|
}
|
|
|
|
/** Builds the ExtractionResult instance. */
|
|
public ExtractionResult build() {
|
|
return new ExtractionResult(
|
|
content,
|
|
mimeType,
|
|
metadata,
|
|
extractionMethod,
|
|
tables,
|
|
detectedLanguages,
|
|
chunks,
|
|
images,
|
|
pages,
|
|
elements,
|
|
djotContent,
|
|
ocrElements,
|
|
document,
|
|
extractedKeywords,
|
|
qualityScore,
|
|
processingWarnings,
|
|
annotations,
|
|
children,
|
|
uris,
|
|
revisions,
|
|
structuredOutput,
|
|
codeIntelligence,
|
|
llmUsage,
|
|
formattedContent,
|
|
ocrInternalDocument
|
|
);
|
|
}
|
|
}
|
|
// CPD-ON
|
|
/**
|
|
* Convert from an OCR result.
|
|
*/
|
|
public static ExtractionResult fromOcr(OcrExtractionResult ocr) {
|
|
throw new UnsupportedOperationException("fromOcr is not yet bridged via JNI; use the Builder instead.");
|
|
}
|
|
}
|