Files
fil/packages/java/dev/kreuzberg/ExtractionResult.java
Henrik Jess Nielsen b4c07d3693
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s
Nomad changes
2026-06-01 23:40:55 +02:00

481 lines
18 KiB
Java
Generated

// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import java.util.List;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
import org.jspecify.annotations.Nullable;
/**
 * General extraction result used by the core extraction API.
 *
 * <p>This is the main result type returned by all extraction functions.
 *
 * <p>Instances are assembled through {@link #builder()}; Jackson deserializes through the same
 * {@link Builder} (see the {@code @JsonDeserialize} annotation below). Components whose value is
 * {@code null} are left out of the serialized JSON ({@code JsonInclude.Include.NON_ABSENT}).
 */
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = ExtractionResult.Builder.class)
public record ExtractionResult(
    /** Main text content of the result. */
    @JsonProperty("content") String content,
    /** MIME type string, serialized as {@code mime_type}. */
    @JsonProperty("mime_type") String mimeType,
    /** Document metadata. */
    @JsonProperty("metadata") Metadata metadata,
    /**
     * Extraction strategy used to produce the returned text.
     *
     * Populated when the extractor can reliably distinguish native text extraction,
     * OCR-only extraction, or mixed native/OCR output.
     */
    @Nullable @JsonProperty("extraction_method") ExtractionMethod extractionMethod,
    /** Tables attached to the result; the builder defaults this to an empty list. */
    @JsonProperty("tables") List<Table> tables,
    /** Detected languages, serialized as {@code detected_languages}; may be {@code null}. */
    @Nullable @JsonProperty("detected_languages") List<String> detectedLanguages,
    /**
     * Text chunks when chunking is enabled.
     *
     * When chunking configuration is provided, the content is split into
     * overlapping chunks for efficient processing. Each chunk contains the text,
     * optional embeddings (if enabled), and metadata about its position.
     */
    @Nullable @JsonProperty("chunks") List<Chunk> chunks,
    /**
     * Extracted images from the document.
     *
     * When image extraction is enabled via {@code ImageExtractionConfig}, this field
     * contains all images found in the document with their raw data and metadata.
     * Each image may optionally contain a nested {@code ocr_result} if OCR was performed.
     */
    @Nullable @JsonProperty("images") List<ExtractedImage> images,
    /**
     * Per-page content when page extraction is enabled.
     *
     * When page extraction is configured, the document is split into per-page content
     * with tables and images mapped to their respective pages.
     */
    @Nullable @JsonProperty("pages") List<PageContent> pages,
    /**
     * Semantic elements when element-based result format is enabled.
     *
     * When result_format is set to ElementBased, this field contains semantic
     * elements with type classification, unique identifiers, and metadata for
     * Unstructured-compatible element-based processing.
     */
    @Nullable @JsonProperty("elements") List<Element> elements,
    /**
     * Rich Djot content structure (when extracting Djot documents).
     *
     * When extracting Djot documents with structured extraction enabled,
     * this field contains the full semantic structure including:
     * - Block-level elements with nesting
     * - Inline formatting with attributes
     * - Links, images, footnotes
     * - Math expressions
     * - Complete attribute information
     *
     * The {@code content} field still contains plain text for backward compatibility.
     *
     * Always {@code null} for non-Djot documents.
     */
    @Nullable @JsonProperty("djot_content") DjotContent djotContent,
    /**
     * OCR elements with full spatial and confidence metadata.
     *
     * When OCR is performed with element extraction enabled, this field contains
     * the structured representation of detected text including:
     * - Bounding geometry (rectangles or quadrilaterals)
     * - Confidence scores (detection and recognition)
     * - Rotation information
     * - Hierarchical relationships (Tesseract only)
     *
     * This field preserves all metadata that would otherwise be lost when
     * converting to plain text or markdown output formats.
     *
     * Only populated when {@code OcrElementConfig.include_elements} is true.
     */
    @Nullable @JsonProperty("ocr_elements") List<OcrElement> ocrElements,
    /**
     * Structured document tree (when document structure extraction is enabled).
     *
     * When {@code include_document_structure} is true in {@code ExtractionConfig}, this field
     * contains the full hierarchical representation of the document including:
     * - Heading-driven section nesting
     * - Table grids with cell-level metadata
     * - Content layer classification (body, header, footer, footnote)
     * - Inline text annotations (formatting, links)
     * - Bounding boxes and page numbers
     *
     * Independent of {@code result_format} — can be combined with Unified or ElementBased.
     */
    @Nullable @JsonProperty("document") DocumentStructure document,
    /**
     * Extracted keywords when keyword extraction is enabled.
     *
     * When keyword extraction (RAKE or YAKE) is configured, this field contains
     * the extracted keywords with scores, algorithm info, and position data.
     * Previously stored in {@code metadata.additional["keywords"]}.
     */
    @Nullable @JsonProperty("extracted_keywords") List<Keyword> extractedKeywords,
    /**
     * Document quality score from quality analysis.
     *
     * A value between 0.0 and 1.0 indicating the overall text quality.
     * Previously stored in {@code metadata.additional["quality_score"]}.
     */
    @Nullable @JsonProperty("quality_score") Double qualityScore,
    /**
     * Non-fatal warnings collected during processing pipeline stages.
     *
     * Captures errors from optional pipeline features (embedding, chunking,
     * language detection, output formatting) that don't prevent extraction
     * but may indicate degraded results.
     * Previously stored as individual keys in {@code metadata.additional}.
     */
    @Nullable @JsonProperty("processing_warnings") List<ProcessingWarning> processingWarnings,
    /**
     * PDF annotations extracted from the document.
     *
     * When annotation extraction is enabled via {@code PdfConfig.extract_annotations},
     * this field contains text notes, highlights, links, stamps, and other
     * annotations found in PDF documents.
     */
    @Nullable @JsonProperty("annotations") List<PdfAnnotation> annotations,
    /**
     * Nested extraction results from archive contents.
     *
     * When extracting archives, each processable file inside produces its own
     * full extraction result. Set to {@code null} for non-archive formats.
     * Use {@code max_archive_depth} in config to control recursion depth.
     */
    @Nullable @JsonProperty("children") List<ArchiveEntry> children,
    /**
     * URIs/links discovered during document extraction.
     *
     * Contains hyperlinks, image references, citations, email addresses, and
     * other URI-like references found in the document. Always extracted when
     * present in the source document.
     */
    @Nullable @JsonProperty("uris") List<ExtractedUri> uris,
    /**
     * Tracked changes embedded in the source document.
     *
     * Populated by per-format extractors that understand change-tracking
     * metadata (DOCX {@code w:ins}/{@code w:del}/{@code w:rPrChange}, ODT {@code text:change-*},
     * …). Every extractor defaults to {@code null} until its format-specific
     * implementation is added. Extractors that do populate this field follow
     * the "accepted-changes" convention: inserted text is present in
     * {@code content}, deleted text is absent — the revision list is the separate
     * audit trail.
     */
    @Nullable @JsonProperty("revisions") List<DocumentRevision> revisions,
    /**
     * Structured extraction output from LLM-based JSON schema extraction.
     *
     * When {@code structured_extraction} is configured in {@code ExtractionConfig}, the
     * extracted document content is sent to a VLM with the provided JSON schema.
     * The response is parsed and stored here as a JSON value matching the schema.
     */
    @Nullable @JsonProperty("structured_output") JsonNode structuredOutput,
    /**
     * Code intelligence results from tree-sitter analysis.
     *
     * Populated when extracting source code files with the {@code tree-sitter} feature.
     * Contains metrics, structural analysis, imports/exports, comments,
     * docstrings, symbols, diagnostics, and optionally chunked code segments.
     *
     * Stored as an opaque JSON value so that all language bindings (Go, Java,
     * C#, …) can deserialize it as a raw JSON object rather than a typed struct.
     * The underlying type is {@code tree_sitter_language_pack.ProcessResult}.
     */
    @Nullable @JsonProperty("code_intelligence") JsonNode codeIntelligence,
    /**
     * LLM token usage and cost data for all LLM calls made during this extraction.
     *
     * Contains one entry per LLM call. Multiple entries are produced when
     * VLM OCR, structured extraction, or LLM embeddings run during
     * the same extraction.
     *
     * {@code null} when no LLM was used.
     */
    @Nullable @JsonProperty("llm_usage") List<LlmUsage> llmUsage,
    /**
     * Pre-rendered content in the requested output format.
     *
     * Populated during {@code derive_extraction_result} before tree derivation consumes
     * element data. {@code apply_output_format} swaps this into {@code content} at the end
     * of the pipeline, after post-processors have operated on plain text.
     */
    @Nullable @JsonProperty("formatted_content") String formattedContent,
    /**
     * Structured hOCR document for the OCR+layout pipeline.
     *
     * When tesseract produces hOCR output, the parsed {@code InternalDocument} carries
     * paragraph structure with bounding boxes and confidence scores. The layout
     * classification step enriches these elements before final rendering.
     */
    @Nullable @JsonProperty("ocr_internal_document") String ocrInternalDocument
) {
    /**
     * Creates a fresh {@link Builder} holding the builder's default values.
     *
     * @return a new, independent builder instance
     */
    public static Builder builder() {
        return new Builder();
    }

    // CPD-OFF
    /**
     * Mutable builder for {@link ExtractionResult}; also the entry point Jackson uses when
     * deserializing (setter methods are discovered through the {@code with} prefix).
     *
     * <p>Defaults: {@code content} and {@code mimeType} are empty strings, {@code tables} is an
     * empty immutable list, and every other field is {@code null}.
     */
    @JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
    public static final class Builder {
        private String content = "";

        @JsonProperty("mime_type")
        private String mimeType = "";

        // NOTE(review): defaults to null although the record component is not @Nullable and
        // build() performs no validation — callers are expected to set it via withMetadata.
        private Metadata metadata = null;

        @JsonProperty("extraction_method")
        private @Nullable ExtractionMethod extractionMethod = null;

        private List<Table> tables = List.of();

        // Every field below is optional: it starts as null and maps onto a @Nullable
        // record component, so each one is annotated @Nullable consistently.
        @JsonProperty("detected_languages")
        private @Nullable List<String> detectedLanguages = null;

        private @Nullable List<Chunk> chunks = null;

        private @Nullable List<ExtractedImage> images = null;

        private @Nullable List<PageContent> pages = null;

        private @Nullable List<Element> elements = null;

        @JsonProperty("djot_content")
        private @Nullable DjotContent djotContent = null;

        @JsonProperty("ocr_elements")
        private @Nullable List<OcrElement> ocrElements = null;

        private @Nullable DocumentStructure document = null;

        @JsonProperty("extracted_keywords")
        private @Nullable List<Keyword> extractedKeywords = null;

        @JsonProperty("quality_score")
        private @Nullable Double qualityScore = null;

        @JsonProperty("processing_warnings")
        private @Nullable List<ProcessingWarning> processingWarnings = null;

        private @Nullable List<PdfAnnotation> annotations = null;

        private @Nullable List<ArchiveEntry> children = null;

        private @Nullable List<ExtractedUri> uris = null;

        private @Nullable List<DocumentRevision> revisions = null;

        @JsonProperty("structured_output")
        private @Nullable JsonNode structuredOutput = null;

        @JsonProperty("code_intelligence")
        private @Nullable JsonNode codeIntelligence = null;

        @JsonProperty("llm_usage")
        private @Nullable List<LlmUsage> llmUsage = null;

        @JsonProperty("formatted_content")
        private @Nullable String formattedContent = null;

        @JsonProperty("ocr_internal_document")
        private @Nullable String ocrInternalDocument = null;

        /** Sets the {@code content} field and returns this builder for chaining. */
        @JsonProperty("content")
        public Builder withContent(final String value) {
            this.content = value;
            return this;
        }

        /** Sets the {@code mimeType} field and returns this builder for chaining. */
        @JsonProperty("mime_type")
        public Builder withMimeType(final String value) {
            this.mimeType = value;
            return this;
        }

        /** Sets the {@code metadata} field and returns this builder for chaining. */
        @JsonProperty("metadata")
        public Builder withMetadata(final Metadata value) {
            this.metadata = value;
            return this;
        }

        /** Sets the {@code extractionMethod} field and returns this builder for chaining. */
        @JsonProperty("extraction_method")
        public Builder withExtractionMethod(final @Nullable ExtractionMethod value) {
            this.extractionMethod = value;
            return this;
        }

        /** Sets the {@code tables} field and returns this builder for chaining. */
        @JsonProperty("tables")
        public Builder withTables(final List<Table> value) {
            this.tables = value;
            return this;
        }

        /** Sets the {@code detectedLanguages} field and returns this builder for chaining. */
        @JsonProperty("detected_languages")
        public Builder withDetectedLanguages(final @Nullable List<String> value) {
            this.detectedLanguages = value;
            return this;
        }

        /** Sets the {@code chunks} field and returns this builder for chaining. */
        @JsonProperty("chunks")
        public Builder withChunks(final @Nullable List<Chunk> value) {
            this.chunks = value;
            return this;
        }

        /** Sets the {@code images} field and returns this builder for chaining. */
        @JsonProperty("images")
        public Builder withImages(final @Nullable List<ExtractedImage> value) {
            this.images = value;
            return this;
        }

        /** Sets the {@code pages} field and returns this builder for chaining. */
        @JsonProperty("pages")
        public Builder withPages(final @Nullable List<PageContent> value) {
            this.pages = value;
            return this;
        }

        /** Sets the {@code elements} field and returns this builder for chaining. */
        @JsonProperty("elements")
        public Builder withElements(final @Nullable List<Element> value) {
            this.elements = value;
            return this;
        }

        /** Sets the {@code djotContent} field and returns this builder for chaining. */
        @JsonProperty("djot_content")
        public Builder withDjotContent(final @Nullable DjotContent value) {
            this.djotContent = value;
            return this;
        }

        /** Sets the {@code ocrElements} field and returns this builder for chaining. */
        @JsonProperty("ocr_elements")
        public Builder withOcrElements(final @Nullable List<OcrElement> value) {
            this.ocrElements = value;
            return this;
        }

        /** Sets the {@code document} field and returns this builder for chaining. */
        @JsonProperty("document")
        public Builder withDocument(final @Nullable DocumentStructure value) {
            this.document = value;
            return this;
        }

        /** Sets the {@code extractedKeywords} field and returns this builder for chaining. */
        @JsonProperty("extracted_keywords")
        public Builder withExtractedKeywords(final @Nullable List<Keyword> value) {
            this.extractedKeywords = value;
            return this;
        }

        /** Sets the {@code qualityScore} field and returns this builder for chaining. */
        @JsonProperty("quality_score")
        public Builder withQualityScore(final @Nullable Double value) {
            this.qualityScore = value;
            return this;
        }

        /** Sets the {@code processingWarnings} field and returns this builder for chaining. */
        @JsonProperty("processing_warnings")
        public Builder withProcessingWarnings(final @Nullable List<ProcessingWarning> value) {
            this.processingWarnings = value;
            return this;
        }

        /** Sets the {@code annotations} field and returns this builder for chaining. */
        @JsonProperty("annotations")
        public Builder withAnnotations(final @Nullable List<PdfAnnotation> value) {
            this.annotations = value;
            return this;
        }

        /** Sets the {@code children} field and returns this builder for chaining. */
        @JsonProperty("children")
        public Builder withChildren(final @Nullable List<ArchiveEntry> value) {
            this.children = value;
            return this;
        }

        /** Sets the {@code uris} field and returns this builder for chaining. */
        @JsonProperty("uris")
        public Builder withUris(final @Nullable List<ExtractedUri> value) {
            this.uris = value;
            return this;
        }

        /** Sets the {@code revisions} field and returns this builder for chaining. */
        @JsonProperty("revisions")
        public Builder withRevisions(final @Nullable List<DocumentRevision> value) {
            this.revisions = value;
            return this;
        }

        /** Sets the {@code structuredOutput} field and returns this builder for chaining. */
        @JsonProperty("structured_output")
        public Builder withStructuredOutput(final @Nullable JsonNode value) {
            this.structuredOutput = value;
            return this;
        }

        /** Sets the {@code codeIntelligence} field and returns this builder for chaining. */
        @JsonProperty("code_intelligence")
        public Builder withCodeIntelligence(final @Nullable JsonNode value) {
            this.codeIntelligence = value;
            return this;
        }

        /** Sets the {@code llmUsage} field and returns this builder for chaining. */
        @JsonProperty("llm_usage")
        public Builder withLlmUsage(final @Nullable List<LlmUsage> value) {
            this.llmUsage = value;
            return this;
        }

        /** Sets the {@code formattedContent} field and returns this builder for chaining. */
        @JsonProperty("formatted_content")
        public Builder withFormattedContent(final @Nullable String value) {
            this.formattedContent = value;
            return this;
        }

        /** Sets the {@code ocrInternalDocument} field and returns this builder for chaining. */
        @JsonProperty("ocr_internal_document")
        public Builder withOcrInternalDocument(final @Nullable String value) {
            this.ocrInternalDocument = value;
            return this;
        }

        /**
         * Builds the {@link ExtractionResult} from the values currently held by this builder.
         *
         * <p>Values are passed through unchanged, in record-component order; no validation or
         * copying is performed.
         *
         * @return a new {@code ExtractionResult}
         */
        public ExtractionResult build() {
            return new ExtractionResult(
                content,
                mimeType,
                metadata,
                extractionMethod,
                tables,
                detectedLanguages,
                chunks,
                images,
                pages,
                elements,
                djotContent,
                ocrElements,
                document,
                extractedKeywords,
                qualityScore,
                processingWarnings,
                annotations,
                children,
                uris,
                revisions,
                structuredOutput,
                codeIntelligence,
                llmUsage,
                formattedContent,
                ocrInternalDocument
            );
        }
    }
    // CPD-ON

    /**
     * Placeholder for converting an OCR result into an {@code ExtractionResult}.
     *
     * <p>The conversion is not implemented in this binding: the method unconditionally throws.
     * Assemble a result with {@link #builder()} instead.
     *
     * @param ocr the OCR result (currently ignored)
     * @return never returns normally
     * @throws UnsupportedOperationException always
     */
    public static ExtractionResult fromOcr(OcrExtractionResult ocr) {
        throw new UnsupportedOperationException("fromOcr is not yet bridged via JNI; use the Builder instead.");
    }
}