// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;

import java.util.List;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
import org.jspecify.annotations.Nullable;

/**
 * General extraction result used by the core extraction API.
 *
 * <p>This is the main result type returned by all extraction functions.
 *
 * <p>JSON deserialization goes through {@link Builder}; each component is bound to the
 * snake_case property named in its {@code @JsonProperty} annotation.
 *
 * @param content extracted text (JSON property {@code content})
 * @param mimeType value of the {@code mime_type} JSON property
 * @param metadata value of the {@code metadata} JSON property
 * @param extractionMethod extraction strategy used to produce the returned text, or {@code null}
 * @param tables value of the {@code tables} JSON property
 * @param detectedLanguages value of the {@code detected_languages} JSON property, or {@code null}
 * @param chunks text chunks when chunking is enabled, or {@code null}
 * @param images images extracted from the document, or {@code null}
 * @param pages per-page content when page extraction is enabled, or {@code null}
 * @param elements semantic elements when the element-based result format is enabled,
 *     or {@code null}
 * @param djotContent rich Djot content structure, or {@code null}
 * @param ocrElements OCR elements with spatial and confidence metadata, or {@code null}
 * @param document structured document tree, or {@code null}
 * @param extractedKeywords extracted keywords when keyword extraction is enabled, or {@code null}
 * @param qualityScore document quality score between 0.0 and 1.0, or {@code null}
 * @param processingWarnings non-fatal warnings collected during processing, or {@code null}
 * @param annotations PDF annotations extracted from the document, or {@code null}
 * @param children nested extraction results from archive contents, or {@code null}
 * @param uris URIs/links discovered during extraction, or {@code null}
 * @param revisions tracked changes embedded in the source document, or {@code null}
 * @param structuredOutput structured output from LLM-based JSON schema extraction,
 *     or {@code null}
 * @param codeIntelligence code intelligence results as an opaque JSON value, or {@code null}
 * @param llmUsage LLM token usage and cost data, one entry per LLM call, or {@code null}
 * @param formattedContent pre-rendered content in the requested output format, or {@code null}
 * @param ocrInternalDocument structured hOCR document for the OCR+layout pipeline,
 *     or {@code null}
 */
// NOTE(review): the List components below are raw types in this source; confirm the intended
// element types in the generator and parameterize them.
@JsonInclude(JsonInclude.Include.NON_ABSENT)
@JsonDeserialize(builder = ExtractionResult.Builder.class)
public record ExtractionResult(
    @JsonProperty("content") String content,
    @JsonProperty("mime_type") String mimeType,
    @JsonProperty("metadata") Metadata metadata,
    /**
     * Extraction strategy used to produce the returned text.
     *
     * <p>Populated when the extractor can reliably distinguish native text extraction,
     * OCR-only extraction, or mixed native/OCR output.
     */
    @Nullable @JsonProperty("extraction_method") ExtractionMethod extractionMethod,
    @JsonProperty("tables") List tables,
    @Nullable @JsonProperty("detected_languages") List detectedLanguages,
    /**
     * Text chunks when chunking is enabled.
     *
     * <p>When chunking configuration is provided, the content is split into
     * overlapping chunks for efficient processing. Each chunk contains the text,
     * optional embeddings (if enabled), and metadata about its position.
     */
    @Nullable @JsonProperty("chunks") List chunks,
    /**
     * Extracted images from the document.
     *
     * <p>When image extraction is enabled via {@code ImageExtractionConfig}, this field
     * contains all images found in the document with their raw data and metadata.
     * Each image may optionally contain a nested {@code ocr_result} if OCR was performed.
     */
    @Nullable @JsonProperty("images") List images,
    /**
     * Per-page content when page extraction is enabled.
     *
     * <p>When page extraction is configured, the document is split into per-page content
     * with tables and images mapped to their respective pages.
     */
    @Nullable @JsonProperty("pages") List pages,
    /**
     * Semantic elements when element-based result format is enabled.
     *
     * <p>When result_format is set to ElementBased, this field contains semantic
     * elements with type classification, unique identifiers, and metadata for
     * Unstructured-compatible element-based processing.
     */
    @Nullable @JsonProperty("elements") List elements,
    /**
     * Rich Djot content structure (when extracting Djot documents).
     *
     * <p>When extracting Djot documents with structured extraction enabled,
     * this field contains the full semantic structure including:
     * <ul>
     *   <li>Block-level elements with nesting</li>
     *   <li>Inline formatting with attributes</li>
     *   <li>Links, images, footnotes</li>
     *   <li>Math expressions</li>
     *   <li>Complete attribute information</li>
     * </ul>
     *
     * <p>The {@code content} field still contains plain text for backward compatibility.
     *
     * <p>Always {@code null} for non-Djot documents.
     */
    @Nullable @JsonProperty("djot_content") DjotContent djotContent,
    /**
     * OCR elements with full spatial and confidence metadata.
     *
     * <p>When OCR is performed with element extraction enabled, this field contains
     * the structured representation of detected text including:
     * <ul>
     *   <li>Bounding geometry (rectangles or quadrilaterals)</li>
     *   <li>Confidence scores (detection and recognition)</li>
     *   <li>Rotation information</li>
     *   <li>Hierarchical relationships (Tesseract only)</li>
     * </ul>
     *
     * <p>This field preserves all metadata that would otherwise be lost when
     * converting to plain text or markdown output formats.
     *
     * <p>Only populated when {@code OcrElementConfig.include_elements} is true.
     */
    @Nullable @JsonProperty("ocr_elements") List ocrElements,
    /**
     * Structured document tree (when document structure extraction is enabled).
     *
     * <p>When {@code include_document_structure} is true in {@code ExtractionConfig}, this field
     * contains the full hierarchical representation of the document including:
     * <ul>
     *   <li>Heading-driven section nesting</li>
     *   <li>Table grids with cell-level metadata</li>
     *   <li>Content layer classification (body, header, footer, footnote)</li>
     *   <li>Inline text annotations (formatting, links)</li>
     *   <li>Bounding boxes and page numbers</li>
     * </ul>
     *
     * <p>Independent of {@code result_format} — can be combined with Unified or ElementBased.
     */
    @Nullable @JsonProperty("document") DocumentStructure document,
    /**
     * Extracted keywords when keyword extraction is enabled.
     *
     * <p>When keyword extraction (RAKE or YAKE) is configured, this field contains
     * the extracted keywords with scores, algorithm info, and position data.
     * Previously stored in {@code metadata.additional["keywords"]}.
     */
    @Nullable @JsonProperty("extracted_keywords") List extractedKeywords,
    /**
     * Document quality score from quality analysis.
     *
     * <p>A value between 0.0 and 1.0 indicating the overall text quality.
     * Previously stored in {@code metadata.additional["quality_score"]}.
     */
    @Nullable @JsonProperty("quality_score") Double qualityScore,
    /**
     * Non-fatal warnings collected during processing pipeline stages.
     *
     * <p>Captures errors from optional pipeline features (embedding, chunking,
     * language detection, output formatting) that don't prevent extraction
     * but may indicate degraded results.
     * Previously stored as individual keys in {@code metadata.additional}.
     */
    @Nullable @JsonProperty("processing_warnings") List processingWarnings,
    /**
     * PDF annotations extracted from the document.
     *
     * <p>When annotation extraction is enabled via {@code PdfConfig.extract_annotations},
     * this field contains text notes, highlights, links, stamps, and other
     * annotations found in PDF documents.
     */
    @Nullable @JsonProperty("annotations") List annotations,
    /**
     * Nested extraction results from archive contents.
     *
     * <p>When extracting archives, each processable file inside produces its own
     * full extraction result. Set to {@code null} for non-archive formats.
     * Use {@code max_archive_depth} in config to control recursion depth.
     */
    @Nullable @JsonProperty("children") List children,
    /**
     * URIs/links discovered during document extraction.
     *
     * <p>Contains hyperlinks, image references, citations, email addresses, and
     * other URI-like references found in the document. Always extracted when
     * present in the source document.
     */
    @Nullable @JsonProperty("uris") List uris,
    /**
     * Tracked changes embedded in the source document.
     *
     * <p>Populated by per-format extractors that understand change-tracking
     * metadata (DOCX {@code w:ins}/{@code w:del}/{@code w:rPrChange}, ODT {@code text:change-*},
     * …). Every extractor defaults to {@code null} until its format-specific
     * implementation is added. Extractors that do populate this field follow
     * the "accepted-changes" convention: inserted text is present in
     * {@code content}, deleted text is absent — the revision list is the separate
     * audit trail.
     */
    @Nullable @JsonProperty("revisions") List revisions,
    /**
     * Structured extraction output from LLM-based JSON schema extraction.
     *
     * <p>When {@code structured_extraction} is configured in {@code ExtractionConfig}, the
     * extracted document content is sent to a VLM with the provided JSON schema.
     * The response is parsed and stored here as a JSON value matching the schema.
     */
    @Nullable @JsonProperty("structured_output") JsonNode structuredOutput,
    /**
     * Code intelligence results from tree-sitter analysis.
     *
     * <p>Populated when extracting source code files with the {@code tree-sitter} feature.
     * Contains metrics, structural analysis, imports/exports, comments,
     * docstrings, symbols, diagnostics, and optionally chunked code segments.
     *
     * <p>Stored as an opaque JSON value so that all language bindings (Go, Java,
     * C#, …) can deserialize it as a raw JSON object rather than a typed struct.
     * The underlying type is {@code tree_sitter_language_pack.ProcessResult}.
     */
    @Nullable @JsonProperty("code_intelligence") JsonNode codeIntelligence,
    /**
     * LLM token usage and cost data for all LLM calls made during this extraction.
     *
     * <p>Contains one entry per LLM call. Multiple entries are produced when
     * VLM OCR, structured extraction, or LLM embeddings run during
     * the same extraction.
     *
     * <p>{@code null} when no LLM was used.
     */
    @Nullable @JsonProperty("llm_usage") List llmUsage,
    /**
     * Pre-rendered content in the requested output format.
     *
     * <p>Populated during {@code derive_extraction_result} before tree derivation consumes
     * element data. {@code apply_output_format} swaps this into {@code content} at the end
     * of the pipeline, after post-processors have operated on plain text.
     */
    @Nullable @JsonProperty("formatted_content") String formattedContent,
    /**
     * Structured hOCR document for the OCR+layout pipeline.
     *
     * <p>When tesseract produces hOCR output, the parsed {@code InternalDocument} carries
     * paragraph structure with bounding boxes and confidence scores. The layout
     * classification step enriches these elements before final rendering.
     */
    @Nullable @JsonProperty("ocr_internal_document") String ocrInternalDocument
) {

    /**
     * Creates a new, empty {@link Builder}.
     *
     * @return a fresh builder holding the default field values
     */
    public static Builder builder() {
        return new Builder();
    }

    // CPD-OFF
    /**
     * Mutable builder for {@link ExtractionResult}; also the Jackson deserialization entry
     * point (setter prefix {@code with}, build method {@code build}).
     *
     * <p>Defaults: {@code content} and {@code mimeType} are empty strings, {@code tables} is an
     * empty list, and every other field is {@code null}.
     */
    @JsonPOJOBuilder(withPrefix = "with", buildMethodName = "build")
    public static final class Builder {
        private String content = "";
        @JsonProperty("mime_type")
        private String mimeType = "";
        // NOTE(review): the record component is not marked @Nullable, yet build() forwards
        // null when withMetadata was never called — confirm whether metadata is required.
        @Nullable
        private Metadata metadata = null;
        @JsonProperty("extraction_method")
        @Nullable
        private ExtractionMethod extractionMethod = null;
        private List tables = List.of();
        @JsonProperty("detected_languages")
        @Nullable
        private List detectedLanguages = null;
        @Nullable
        private List chunks = null;
        @Nullable
        private List images = null;
        @Nullable
        private List pages = null;
        @Nullable
        private List elements = null;
        @JsonProperty("djot_content")
        @Nullable
        private DjotContent djotContent = null;
        @JsonProperty("ocr_elements")
        @Nullable
        private List ocrElements = null;
        @Nullable
        private DocumentStructure document = null;
        @JsonProperty("extracted_keywords")
        @Nullable
        private List extractedKeywords = null;
        @JsonProperty("quality_score")
        @Nullable
        private Double qualityScore = null;
        @JsonProperty("processing_warnings")
        @Nullable
        private List processingWarnings = null;
        @Nullable
        private List annotations = null;
        @Nullable
        private List children = null;
        @Nullable
        private List uris = null;
        @Nullable
        private List revisions = null;
        @JsonProperty("structured_output")
        @Nullable
        private JsonNode structuredOutput = null;
        @JsonProperty("code_intelligence")
        @Nullable
        private JsonNode codeIntelligence = null;
        @JsonProperty("llm_usage")
        @Nullable
        private List llmUsage = null;
        @JsonProperty("formatted_content")
        @Nullable
        private String formattedContent = null;
        @JsonProperty("ocr_internal_document")
        @Nullable
        private String ocrInternalDocument = null;

        /**
         * Sets the content field.
         *
         * @param value new value
         * @return this builder
         */
        @JsonProperty("content")
        public Builder withContent(final String value) {
            this.content = value;
            return this;
        }

        /**
         * Sets the mimeType field.
         *
         * @param value new value
         * @return this builder
         */
        @JsonProperty("mime_type")
        public Builder withMimeType(final String value) {
            this.mimeType = value;
            return this;
        }

        /**
         * Sets the metadata field.
         *
         * @param value new value
         * @return this builder
         */
        @JsonProperty("metadata")
        public Builder withMetadata(final Metadata value) {
            this.metadata = value;
            return this;
        }

        /**
         * Sets the extractionMethod field.
         *
         * @param value new value, may be {@code null}
         * @return this builder
         */
        @JsonProperty("extraction_method")
        public Builder withExtractionMethod(final @Nullable ExtractionMethod value) {
            this.extractionMethod = value;
            return this;
        }

        /**
         * Sets the tables field.
         *
         * @param value new value
         * @return this builder
         */
        @JsonProperty("tables")
        public Builder withTables(final List value) {
            this.tables = value;
            return this;
        }

        /**
         * Sets the detectedLanguages field.
         *
         * @param value new value, may be {@code null}
         * @return this builder
         */
        @JsonProperty("detected_languages")
        public Builder withDetectedLanguages(final @Nullable List value) {
            this.detectedLanguages = value;
            return this;
        }

        /**
         * Sets the chunks field.
         *
         * @param value new value, may be {@code null}
         * @return this builder
         */
        @JsonProperty("chunks")
        public Builder withChunks(final @Nullable List value) {
            this.chunks = value;
            return this;
        }

        /**
         * Sets the images field.
         *
         * @param value new value, may be {@code null}
         * @return this builder
         */
        @JsonProperty("images")
        public Builder withImages(final @Nullable List value) {
            this.images = value;
            return this;
        }

        /**
         * Sets the pages field.
         *
         * @param value new value, may be {@code null}
         * @return this builder
         */
        @JsonProperty("pages")
        public Builder withPages(final @Nullable List value) {
            this.pages = value;
            return this;
        }

        /**
         * Sets the elements field.
         *
         * @param value new value, may be {@code null}
         * @return this builder
         */
        @JsonProperty("elements")
        public Builder withElements(final @Nullable List value) {
            this.elements = value;
            return this;
        }

        /**
         * Sets the djotContent field.
         *
         * @param value new value, may be {@code null}
         * @return this builder
         */
        @JsonProperty("djot_content")
        public Builder withDjotContent(final @Nullable DjotContent value) {
            this.djotContent = value;
            return this;
        }

        /**
         * Sets the ocrElements field.
         *
         * @param value new value, may be {@code null}
         * @return this builder
         */
        @JsonProperty("ocr_elements")
        public Builder withOcrElements(final @Nullable List value) {
            this.ocrElements = value;
            return this;
        }

        /**
         * Sets the document field.
         *
         * @param value new value, may be {@code null}
         * @return this builder
         */
        @JsonProperty("document")
        public Builder withDocument(final @Nullable DocumentStructure value) {
            this.document = value;
            return this;
        }

        /**
         * Sets the extractedKeywords field.
         *
         * @param value new value, may be {@code null}
         * @return this builder
         */
        @JsonProperty("extracted_keywords")
        public Builder withExtractedKeywords(final @Nullable List value) {
            this.extractedKeywords = value;
            return this;
        }

        /**
         * Sets the qualityScore field.
         *
         * @param value new value, may be {@code null}
         * @return this builder
         */
        @JsonProperty("quality_score")
        public Builder withQualityScore(final @Nullable Double value) {
            this.qualityScore = value;
            return this;
        }

        /**
         * Sets the processingWarnings field.
         *
         * @param value new value, may be {@code null}
         * @return this builder
         */
        @JsonProperty("processing_warnings")
        public Builder withProcessingWarnings(final @Nullable List value) {
            this.processingWarnings = value;
            return this;
        }

        /**
         * Sets the annotations field.
         *
         * @param value new value, may be {@code null}
         * @return this builder
         */
        @JsonProperty("annotations")
        public Builder withAnnotations(final @Nullable List value) {
            this.annotations = value;
            return this;
        }

        /**
         * Sets the children field.
         *
         * @param value new value, may be {@code null}
         * @return this builder
         */
        @JsonProperty("children")
        public Builder withChildren(final @Nullable List value) {
            this.children = value;
            return this;
        }

        /**
         * Sets the uris field.
         *
         * @param value new value, may be {@code null}
         * @return this builder
         */
        @JsonProperty("uris")
        public Builder withUris(final @Nullable List value) {
            this.uris = value;
            return this;
        }

        /**
         * Sets the revisions field.
         *
         * @param value new value, may be {@code null}
         * @return this builder
         */
        @JsonProperty("revisions")
        public Builder withRevisions(final @Nullable List value) {
            this.revisions = value;
            return this;
        }

        /**
         * Sets the structuredOutput field.
         *
         * @param value new value, may be {@code null}
         * @return this builder
         */
        @JsonProperty("structured_output")
        public Builder withStructuredOutput(final @Nullable JsonNode value) {
            this.structuredOutput = value;
            return this;
        }

        /**
         * Sets the codeIntelligence field.
         *
         * @param value new value, may be {@code null}
         * @return this builder
         */
        @JsonProperty("code_intelligence")
        public Builder withCodeIntelligence(final @Nullable JsonNode value) {
            this.codeIntelligence = value;
            return this;
        }

        /**
         * Sets the llmUsage field.
         *
         * @param value new value, may be {@code null}
         * @return this builder
         */
        @JsonProperty("llm_usage")
        public Builder withLlmUsage(final @Nullable List value) {
            this.llmUsage = value;
            return this;
        }

        /**
         * Sets the formattedContent field.
         *
         * @param value new value, may be {@code null}
         * @return this builder
         */
        @JsonProperty("formatted_content")
        public Builder withFormattedContent(final @Nullable String value) {
            this.formattedContent = value;
            return this;
        }

        /**
         * Sets the ocrInternalDocument field.
         *
         * @param value new value, may be {@code null}
         * @return this builder
         */
        @JsonProperty("ocr_internal_document")
        public Builder withOcrInternalDocument(final @Nullable String value) {
            this.ocrInternalDocument = value;
            return this;
        }

        /**
         * Builds the ExtractionResult instance.
         *
         * <p>Forwards the current field values to the record constructor in component
         * order; this method performs no validation of its own.
         *
         * @return a new {@link ExtractionResult} holding the builder's current values
         */
        public ExtractionResult build() {
            return new ExtractionResult(
                content,
                mimeType,
                metadata,
                extractionMethod,
                tables,
                detectedLanguages,
                chunks,
                images,
                pages,
                elements,
                djotContent,
                ocrElements,
                document,
                extractedKeywords,
                qualityScore,
                processingWarnings,
                annotations,
                children,
                uris,
                revisions,
                structuredOutput,
                codeIntelligence,
                llmUsage,
                formattedContent,
                ocrInternalDocument
            );
        }
    }
    // CPD-ON

    /**
     * Convert from an OCR result.
     *
     * <p>Not implemented in this binding: the method throws unconditionally and never
     * reads its argument.
     *
     * @param ocr the OCR result to convert (currently unused)
     * @return never returns normally
     * @throws UnsupportedOperationException always; use {@link Builder} instead
     */
    public static ExtractionResult fromOcr(OcrExtractionResult ocr) {
        throw new UnsupportedOperationException("fromOcr is not yet bridged via JNI; use the Builder instead.");
    }
}