// fil/packages/java/dev/kreuzberg/ExtractionResult.java

// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;

import java.util.List;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
import org.jspecify.annotations.Nullable;

/**
 * General extraction result used by the core extraction API.
 *
 * <p>This is the main result type returned by all extraction functions.
 *
 * <p>Instances are serialized with Jackson using snake_case property names; {@code null} and
 * absent values are omitted from the output ({@link JsonInclude.Include#NON_ABSENT}).
 * Deserialization goes through {@link Builder}.
 *
 * <p>Record components are documented here with {@code @param} tags because the javadoc tool
 * does not read doc comments placed on the individual components.
 *
 * @param content the extracted text (JSON {@code content}). The builder default is the empty
 *     string.
 * @param mimeType the MIME type string (JSON {@code mime_type}). The builder default is the
 *     empty string.
 * @param metadata the result metadata (JSON {@code metadata}). Declared non-null, but the
 *     builder has no default for it: if it is never set, {@code null} is passed through.
 * @param extractionMethod extraction strategy used to produce the returned text.
 *     <p>Populated when the extractor can reliably distinguish native text extraction,
 *     OCR-only extraction, or mixed native/OCR output; otherwise {@code null}.
 * @param tables the extracted tables (JSON {@code tables}). The builder default is an empty
 *     list.
 * @param detectedLanguages the detected languages (JSON {@code detected_languages}), or
 *     {@code null} when not set.
 * @param chunks text chunks when chunking is enabled.
 *     <p>When chunking configuration is provided, the content is split into overlapping chunks
 *     for efficient processing. Each chunk contains the text, optional embeddings (if enabled),
 *     and metadata about its position.
 * @param images extracted images from the document.
 *     <p>When image extraction is enabled via {@code ImageExtractionConfig}, this field contains
 *     all images found in the document with their raw data and metadata. Each image may
 *     optionally contain a nested {@code ocr_result} if OCR was performed.
 * @param pages per-page content when page extraction is enabled.
 *     <p>When page extraction is configured, the document is split into per-page content with
 *     tables and images mapped to their respective pages.
 * @param elements semantic elements when element-based result format is enabled.
 *     <p>When {@code result_format} is set to ElementBased, this field contains semantic
 *     elements with type classification, unique identifiers, and metadata for
 *     Unstructured-compatible element-based processing.
 * @param djotContent rich Djot content structure (when extracting Djot documents).
 *     <p>When extracting Djot documents with structured extraction enabled, this field contains
 *     the full semantic structure including:
 *     <ul>
 *       <li>Block-level elements with nesting
 *       <li>Inline formatting with attributes
 *       <li>Links, images, footnotes
 *       <li>Math expressions
 *       <li>Complete attribute information
 *     </ul>
 *     <p>The {@code content} field still contains plain text for backward compatibility.
 *     <p>Always {@code null} for non-Djot documents.
 * @param ocrElements OCR elements with full spatial and confidence metadata.
 *     <p>When OCR is performed with element extraction enabled, this field contains the
 *     structured representation of detected text including:
 *     <ul>
 *       <li>Bounding geometry (rectangles or quadrilaterals)
 *       <li>Confidence scores (detection and recognition)
 *       <li>Rotation information
 *       <li>Hierarchical relationships (Tesseract only)
 *     </ul>
 *     <p>This field preserves all metadata that would otherwise be lost when converting to plain
 *     text or markdown output formats.
 *     <p>Only populated when {@code OcrElementConfig.include_elements} is true.
 * @param document structured document tree (when document structure extraction is enabled).
 *     <p>When {@code include_document_structure} is true in {@code ExtractionConfig}, this field
 *     contains the full hierarchical representation of the document including:
 *     <ul>
 *       <li>Heading-driven section nesting
 *       <li>Table grids with cell-level metadata
 *       <li>Content layer classification (body, header, footer, footnote)
 *       <li>Inline text annotations (formatting, links)
 *       <li>Bounding boxes and page numbers
 *     </ul>
 *     <p>Independent of {@code result_format} — can be combined with Unified or ElementBased.
 * @param extractedKeywords extracted keywords when keyword extraction is enabled.
 *     <p>When keyword extraction (RAKE or YAKE) is configured, this field contains the extracted
 *     keywords with scores, algorithm info, and position data. Previously stored in
 *     {@code metadata.additional["keywords"]}.
 * @param qualityScore document quality score from quality analysis.
 *     <p>A value between 0.0 and 1.0 indicating the overall text quality. Previously stored in
 *     {@code metadata.additional["quality_score"]}.
 * @param processingWarnings non-fatal warnings collected during processing pipeline stages.
 *     <p>Captures errors from optional pipeline features (embedding, chunking, language
 *     detection, output formatting) that don't prevent extraction but may indicate degraded
 *     results. Previously stored as individual keys in {@code metadata.additional}.
 * @param annotations PDF annotations extracted from the document.
 *     <p>When annotation extraction is enabled via {@code PdfConfig.extract_annotations}, this
 *     field contains text notes, highlights, links, stamps, and other annotations found in PDF
 *     documents.
 * @param children nested extraction results from archive contents.
 *     <p>When extracting archives, each processable file inside produces its own full extraction
 *     result. Set to {@code null} for non-archive formats. Use {@code max_archive_depth} in
 *     config to control recursion depth.
 * @param uris URIs/links discovered during document extraction.
 *     <p>Contains hyperlinks, image references, citations, email addresses, and other URI-like
 *     references found in the document. Always extracted when present in the source document.
 * @param revisions tracked changes embedded in the source document.
 *     <p>Populated by per-format extractors that understand change-tracking metadata (DOCX
 *     {@code w:ins}/{@code w:del}/{@code w:rPrChange}, ODT {@code text:change-*}, …). Every
 *     extractor defaults to {@code null} until its format-specific implementation is added.
 *     Extractors that do populate this field follow the "accepted-changes" convention: inserted
 *     text is present in {@code content}, deleted text is absent — the revision list is the
 *     separate audit trail.
 * @param structuredOutput structured extraction output from LLM-based JSON schema extraction.
 *     <p>When {@code structured_extraction} is configured in {@code ExtractionConfig}, the
 *     extracted document content is sent to a VLM with the provided JSON schema. The response is
 *     parsed and stored here as a JSON value matching the schema.
 * @param codeIntelligence code intelligence results from tree-sitter analysis.
 *     <p>Populated when extracting source code files with the {@code tree-sitter} feature.
 *     Contains metrics, structural analysis, imports/exports, comments, docstrings, symbols,
 *     diagnostics, and optionally chunked code segments.
 *     <p>Stored as an opaque JSON value so that all language bindings (Go, Java, C#, …) can
 *     deserialize it as a raw JSON object rather than a typed struct. The underlying type is
 *     {@code tree_sitter_language_pack.ProcessResult}.
 * @param llmUsage LLM token usage and cost data for all LLM calls made during this extraction.
 *     <p>Contains one entry per LLM call. Multiple entries are produced when VLM OCR, structured
 *     extraction, or LLM embeddings run during the same extraction.
 *     <p>{@code null} when no LLM was used.
 * @param formattedContent pre-rendered content in the requested output format.
 *     <p>Populated during {@code derive_extraction_result} before tree derivation consumes
 *     element data. {@code apply_output_format} swaps this into {@code content} at the end of
 *     the pipeline, after post-processors have operated on plain text.
 * @param ocrInternalDocument structured hOCR document for the OCR+layout pipeline.
 *     <p>When tesseract produces hOCR output, the parsed {@code InternalDocument} carries
 *     paragraph structure with bounding boxes and confidence scores. The layout classification
 *     step enriches these elements before final rendering.
 */
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = ExtractionResult.Builder.class)
public record ExtractionResult(
    @JsonProperty("content") String content,
    @JsonProperty("mime_type") String mimeType,
    @JsonProperty("metadata") Metadata metadata,
    @Nullable @JsonProperty("extraction_method") ExtractionMethod extractionMethod,
    @JsonProperty("tables") List<Table> tables,
    @Nullable @JsonProperty("detected_languages") List<String> detectedLanguages,
    @Nullable @JsonProperty("chunks") List<Chunk> chunks,
    @Nullable @JsonProperty("images") List<ExtractedImage> images,
    @Nullable @JsonProperty("pages") List<PageContent> pages,
    @Nullable @JsonProperty("elements") List<Element> elements,
    @Nullable @JsonProperty("djot_content") DjotContent djotContent,
    @Nullable @JsonProperty("ocr_elements") List<OcrElement> ocrElements,
    @Nullable @JsonProperty("document") DocumentStructure document,
    @Nullable @JsonProperty("extracted_keywords") List<Keyword> extractedKeywords,
    @Nullable @JsonProperty("quality_score") Double qualityScore,
    @Nullable @JsonProperty("processing_warnings") List<ProcessingWarning> processingWarnings,
    @Nullable @JsonProperty("annotations") List<PdfAnnotation> annotations,
    @Nullable @JsonProperty("children") List<ArchiveEntry> children,
    @Nullable @JsonProperty("uris") List<ExtractedUri> uris,
    @Nullable @JsonProperty("revisions") List<DocumentRevision> revisions,
    @Nullable @JsonProperty("structured_output") JsonNode structuredOutput,
    @Nullable @JsonProperty("code_intelligence") JsonNode codeIntelligence,
    @Nullable @JsonProperty("llm_usage") List<LlmUsage> llmUsage,
    @Nullable @JsonProperty("formatted_content") String formattedContent,
    @Nullable @JsonProperty("ocr_internal_document") String ocrInternalDocument
) {
    /**
     * Creates an empty builder pre-populated with the builder defaults.
     *
     * @return a new {@link Builder}
     */
    public static Builder builder() {
        return new Builder();
    }

    // CPD-OFF
    /**
     * Mutable builder for {@link ExtractionResult}.
     *
     * <p>Also used by Jackson for deserialization: properties are applied through the
     * {@code with}-prefixed setters and the instance is produced by {@link #build()}.
     *
     * <p>Defaults: {@code content} and {@code mimeType} are empty strings, {@code tables} is an
     * empty list, and every other field is {@code null}. {@code metadata} has no default even
     * though the record declares it non-null, so callers are expected to set it.
     */
    @JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
    public static final class Builder {

        private String content = "";
        @JsonProperty("mime_type")
        private String mimeType = "";
        @Nullable private Metadata metadata = null;
        @JsonProperty("extraction_method")
        @Nullable private ExtractionMethod extractionMethod = null;
        private List<Table> tables = List.of();
        @JsonProperty("detected_languages")
        @Nullable private List<String> detectedLanguages = null;
        @Nullable private List<Chunk> chunks = null;
        @Nullable private List<ExtractedImage> images = null;
        @Nullable private List<PageContent> pages = null;
        @Nullable private List<Element> elements = null;
        @JsonProperty("djot_content")
        @Nullable private DjotContent djotContent = null;
        @JsonProperty("ocr_elements")
        @Nullable private List<OcrElement> ocrElements = null;
        @Nullable private DocumentStructure document = null;
        @JsonProperty("extracted_keywords")
        @Nullable private List<Keyword> extractedKeywords = null;
        @JsonProperty("quality_score")
        @Nullable private Double qualityScore = null;
        @JsonProperty("processing_warnings")
        @Nullable private List<ProcessingWarning> processingWarnings = null;
        @Nullable private List<PdfAnnotation> annotations = null;
        @Nullable private List<ArchiveEntry> children = null;
        @Nullable private List<ExtractedUri> uris = null;
        @Nullable private List<DocumentRevision> revisions = null;
        @JsonProperty("structured_output")
        @Nullable private JsonNode structuredOutput = null;
        @JsonProperty("code_intelligence")
        @Nullable private JsonNode codeIntelligence = null;
        @JsonProperty("llm_usage")
        @Nullable private List<LlmUsage> llmUsage = null;
        @JsonProperty("formatted_content")
        @Nullable private String formattedContent = null;
        @JsonProperty("ocr_internal_document")
        @Nullable private String ocrInternalDocument = null;

        /**
         * Sets the {@code content} component.
         *
         * @param value the extracted text; an explicit {@code null} falls back to the
         *     empty-string default so the non-null component is never violated
         * @return this builder
         */
        @JsonProperty("content")
        public Builder withContent(final String value) {
            this.content = value == null ? "" : value;
            return this;
        }

        /**
         * Sets the {@code mimeType} component.
         *
         * @param value the MIME type string; an explicit {@code null} falls back to the
         *     empty-string default so the non-null component is never violated
         * @return this builder
         */
        @JsonProperty("mime_type")
        public Builder withMimeType(final String value) {
            this.mimeType = value == null ? "" : value;
            return this;
        }

        /**
         * Sets the {@code metadata} component.
         *
         * @param value the metadata; stored as given (no default is substituted)
         * @return this builder
         */
        @JsonProperty("metadata")
        public Builder withMetadata(final Metadata value) {
            this.metadata = value;
            return this;
        }

        /**
         * Sets the {@code extractionMethod} component.
         *
         * @param value the extraction method, or {@code null} to leave it unset
         * @return this builder
         */
        @JsonProperty("extraction_method")
        public Builder withExtractionMethod(final @Nullable ExtractionMethod value) {
            this.extractionMethod = value;
            return this;
        }

        /**
         * Sets the {@code tables} component.
         *
         * @param value the extracted tables; an explicit {@code null} falls back to the
         *     empty-list default so the non-null component is never violated
         * @return this builder
         */
        @JsonProperty("tables")
        public Builder withTables(final List<Table> value) {
            this.tables = value == null ? List.of() : value;
            return this;
        }

        /**
         * Sets the {@code detectedLanguages} component.
         *
         * @param value the detected languages, or {@code null} to leave it unset
         * @return this builder
         */
        @JsonProperty("detected_languages")
        public Builder withDetectedLanguages(final @Nullable List<String> value) {
            this.detectedLanguages = value;
            return this;
        }

        /**
         * Sets the {@code chunks} component.
         *
         * @param value the text chunks, or {@code null} to leave it unset
         * @return this builder
         */
        @JsonProperty("chunks")
        public Builder withChunks(final @Nullable List<Chunk> value) {
            this.chunks = value;
            return this;
        }

        /**
         * Sets the {@code images} component.
         *
         * @param value the extracted images, or {@code null} to leave it unset
         * @return this builder
         */
        @JsonProperty("images")
        public Builder withImages(final @Nullable List<ExtractedImage> value) {
            this.images = value;
            return this;
        }

        /**
         * Sets the {@code pages} component.
         *
         * @param value the per-page content, or {@code null} to leave it unset
         * @return this builder
         */
        @JsonProperty("pages")
        public Builder withPages(final @Nullable List<PageContent> value) {
            this.pages = value;
            return this;
        }

        /**
         * Sets the {@code elements} component.
         *
         * @param value the semantic elements, or {@code null} to leave it unset
         * @return this builder
         */
        @JsonProperty("elements")
        public Builder withElements(final @Nullable List<Element> value) {
            this.elements = value;
            return this;
        }

        /**
         * Sets the {@code djotContent} component.
         *
         * @param value the Djot content structure, or {@code null} to leave it unset
         * @return this builder
         */
        @JsonProperty("djot_content")
        public Builder withDjotContent(final @Nullable DjotContent value) {
            this.djotContent = value;
            return this;
        }

        /**
         * Sets the {@code ocrElements} component.
         *
         * @param value the OCR elements, or {@code null} to leave it unset
         * @return this builder
         */
        @JsonProperty("ocr_elements")
        public Builder withOcrElements(final @Nullable List<OcrElement> value) {
            this.ocrElements = value;
            return this;
        }

        /**
         * Sets the {@code document} component.
         *
         * @param value the structured document tree, or {@code null} to leave it unset
         * @return this builder
         */
        @JsonProperty("document")
        public Builder withDocument(final @Nullable DocumentStructure value) {
            this.document = value;
            return this;
        }

        /**
         * Sets the {@code extractedKeywords} component.
         *
         * @param value the extracted keywords, or {@code null} to leave it unset
         * @return this builder
         */
        @JsonProperty("extracted_keywords")
        public Builder withExtractedKeywords(final @Nullable List<Keyword> value) {
            this.extractedKeywords = value;
            return this;
        }

        /**
         * Sets the {@code qualityScore} component.
         *
         * @param value the quality score, or {@code null} to leave it unset
         * @return this builder
         */
        @JsonProperty("quality_score")
        public Builder withQualityScore(final @Nullable Double value) {
            this.qualityScore = value;
            return this;
        }

        /**
         * Sets the {@code processingWarnings} component.
         *
         * @param value the processing warnings, or {@code null} to leave it unset
         * @return this builder
         */
        @JsonProperty("processing_warnings")
        public Builder withProcessingWarnings(final @Nullable List<ProcessingWarning> value) {
            this.processingWarnings = value;
            return this;
        }

        /**
         * Sets the {@code annotations} component.
         *
         * @param value the PDF annotations, or {@code null} to leave it unset
         * @return this builder
         */
        @JsonProperty("annotations")
        public Builder withAnnotations(final @Nullable List<PdfAnnotation> value) {
            this.annotations = value;
            return this;
        }

        /**
         * Sets the {@code children} component.
         *
         * @param value the nested archive entries, or {@code null} to leave it unset
         * @return this builder
         */
        @JsonProperty("children")
        public Builder withChildren(final @Nullable List<ArchiveEntry> value) {
            this.children = value;
            return this;
        }

        /**
         * Sets the {@code uris} component.
         *
         * @param value the discovered URIs, or {@code null} to leave it unset
         * @return this builder
         */
        @JsonProperty("uris")
        public Builder withUris(final @Nullable List<ExtractedUri> value) {
            this.uris = value;
            return this;
        }

        /**
         * Sets the {@code revisions} component.
         *
         * @param value the tracked revisions, or {@code null} to leave it unset
         * @return this builder
         */
        @JsonProperty("revisions")
        public Builder withRevisions(final @Nullable List<DocumentRevision> value) {
            this.revisions = value;
            return this;
        }

        /**
         * Sets the {@code structuredOutput} component.
         *
         * @param value the structured output JSON, or {@code null} to leave it unset
         * @return this builder
         */
        @JsonProperty("structured_output")
        public Builder withStructuredOutput(final @Nullable JsonNode value) {
            this.structuredOutput = value;
            return this;
        }

        /**
         * Sets the {@code codeIntelligence} component.
         *
         * @param value the code intelligence JSON, or {@code null} to leave it unset
         * @return this builder
         */
        @JsonProperty("code_intelligence")
        public Builder withCodeIntelligence(final @Nullable JsonNode value) {
            this.codeIntelligence = value;
            return this;
        }

        /**
         * Sets the {@code llmUsage} component.
         *
         * @param value the LLM usage entries, or {@code null} to leave it unset
         * @return this builder
         */
        @JsonProperty("llm_usage")
        public Builder withLlmUsage(final @Nullable List<LlmUsage> value) {
            this.llmUsage = value;
            return this;
        }

        /**
         * Sets the {@code formattedContent} component.
         *
         * @param value the pre-rendered content, or {@code null} to leave it unset
         * @return this builder
         */
        @JsonProperty("formatted_content")
        public Builder withFormattedContent(final @Nullable String value) {
            this.formattedContent = value;
            return this;
        }

        /**
         * Sets the {@code ocrInternalDocument} component.
         *
         * @param value the hOCR internal document, or {@code null} to leave it unset
         * @return this builder
         */
        @JsonProperty("ocr_internal_document")
        public Builder withOcrInternalDocument(final @Nullable String value) {
            this.ocrInternalDocument = value;
            return this;
        }

        /**
         * Builds the {@link ExtractionResult} from the builder's current values.
         *
         * <p>Values are handed to the record as-is; lists are not copied, so later mutation of
         * a caller-supplied list is visible through the built record.
         *
         * @return a new {@link ExtractionResult}
         */
        public ExtractionResult build() {
            return new ExtractionResult(
                content,
                mimeType,
                metadata,
                extractionMethod,
                tables,
                detectedLanguages,
                chunks,
                images,
                pages,
                elements,
                djotContent,
                ocrElements,
                document,
                extractedKeywords,
                qualityScore,
                processingWarnings,
                annotations,
                children,
                uris,
                revisions,
                structuredOutput,
                codeIntelligence,
                llmUsage,
                formattedContent,
                ocrInternalDocument
            );
        }
    }
    // CPD-ON

    /**
     * Convert from an OCR result.
     *
     * <p>Not implemented in this binding: the method unconditionally throws. Use
     * {@link #builder()} to assemble an {@link ExtractionResult} instead.
     *
     * @param ocr the OCR result to convert (currently unused)
     * @return never returns normally
     * @throws UnsupportedOperationException always
     */
    public static ExtractionResult fromOcr(OcrExtractionResult ocr) {
        throw new UnsupportedOperationException("fromOcr is not yet bridged via JNI; use the Builder instead.");
    }
}