Files
fil/packages/go/v5/binding.go
Henrik Jess Nielsen b4c07d3693
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s
Nomad changes
2026-06-01 23:40:55 +02:00

7848 lines
289 KiB
Go
Generated
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
// Package kreuzberg provides Go bindings for the kreuzberg library.
package kreuzberg
/*
#cgo CFLAGS: -I${SRCDIR}/include
#cgo darwin,arm64 LDFLAGS: -L${SRCDIR}/.lib/macos-arm64 -Wl,-rpath,${SRCDIR}/.lib/macos-arm64 -lkreuzberg_ffi
#cgo darwin,amd64 LDFLAGS: -L${SRCDIR}/.lib/macos-amd64 -Wl,-rpath,${SRCDIR}/.lib/macos-amd64 -lkreuzberg_ffi
#cgo linux,amd64 LDFLAGS: -L${SRCDIR}/.lib/linux-amd64 -Wl,-rpath,${SRCDIR}/.lib/linux-amd64 -lkreuzberg_ffi
#cgo linux,arm64 LDFLAGS: -L${SRCDIR}/.lib/linux-arm64 -Wl,-rpath,${SRCDIR}/.lib/linux-arm64 -lkreuzberg_ffi
#cgo windows,amd64 LDFLAGS: -L${SRCDIR}/.lib/windows-amd64 -lkreuzberg_ffi
#include "kreuzberg.h"
*/
import "C"
import (
"encoding/json"
"errors"
"fmt"
"runtime"
"unsafe"
)
// lastError reports the most recent error recorded by the FFI layer.
//
// It returns nil when kreuzberg_last_error_code reports 0. Otherwise the
// error text is "[<code>] <context>", where <context> is the C string
// returned by kreuzberg_last_error_context, or "[<code>] native error" when
// that pointer is nil.
//
// NOTE(review): the context pointer is read but never freed here, which
// presumably means the native library keeps ownership — confirm against
// kreuzberg.h.
func lastError() error {
	code := int32(C.kreuzberg_last_error_code())
	if code == 0 {
		return nil
	}
	if ctx := C.kreuzberg_last_error_context(); ctx != nil {
		return fmt.Errorf("[%d] %s", code, C.GoString(ctx))
	}
	return fmt.Errorf("[%d] native error", code)
}
// unmarshalBytes returns a Go-owned copy of the C buffer at ptr, or nil when
// ptr is nil.
//
// The buffer is read as a NUL-terminated C string, so copying stops at the
// first zero byte. Binary payloads that may contain interior NULs should be
// exposed by the FFI with an explicit length out-parameter instead.
func unmarshalBytes(ptr *C.uint8_t) []byte {
	if ptr == nil {
		return nil
	}
	cstr := (*C.char)(unsafe.Pointer(ptr))
	text := C.GoString(cstr)
	return []byte(text)
}
// Ptr returns a pointer to a fresh copy of v.
//
// It lets data DTOs populate optional (pointer) fields inline, without the
// functional-options pattern boilerplate. For example:
//
//	&MyStruct{Field: Ptr("value"), OtherField: Ptr(42)}
func Ptr[T any](v T) *T {
	p := new(T)
	*p = v
	return p
}
// Sentinel error values exported by this package.
//
// The messages of ErrPlugin and ErrTimeout were previously left-overs of
// format templates whose placeholders had been stripped ("plugin error in",
// "extraction timed out after ms (limit: ms)"); they now read as complete
// messages. The variables themselves are unchanged, so identity comparisons
// keep working.
var (
	// ErrIo indicates an I/O error.
	ErrIo = errors.New("IO error")
	// ErrParsing indicates a parsing error.
	ErrParsing = errors.New("parsing error")
	// ErrOcr indicates an OCR error.
	ErrOcr = errors.New("OCR error")
	// ErrValidation indicates a validation error.
	ErrValidation = errors.New("validation error")
	// ErrCache indicates a cache error.
	ErrCache = errors.New("cache error")
	// ErrImageProcessing indicates an image processing error.
	ErrImageProcessing = errors.New("image processing error")
	// ErrSerialization indicates a serialization error.
	ErrSerialization = errors.New("serialization error")
	// ErrMissingDependency indicates a missing dependency.
	ErrMissingDependency = errors.New("missing dependency")
	// ErrPlugin indicates a plugin error.
	ErrPlugin = errors.New("plugin error")
	// ErrLockPoisoned indicates a poisoned lock.
	ErrLockPoisoned = errors.New("lock poisoned")
	// ErrUnsupportedFormat indicates an unsupported format.
	ErrUnsupportedFormat = errors.New("unsupported format")
	// ErrEmbedding indicates an embedding error.
	ErrEmbedding = errors.New("embedding error")
	// ErrTimeout indicates that extraction timed out.
	ErrTimeout = errors.New("extraction timed out")
	// ErrCancelled indicates that extraction was cancelled.
	ErrCancelled = errors.New("extraction cancelled")
	// ErrSecurity indicates a security violation.
	ErrSecurity = errors.New("security violation")
	// ErrOther indicates an error that fits none of the other sentinels.
	ErrOther = errors.New("other")
)
// Error is a structured error that pairs a code with a message.
type Error struct {
	// Code is the error's code string; Error() does not include it.
	Code string
	// Message is the error text and is exactly what Error() returns.
	Message string
}

// Error implements the error interface by returning Message unchanged.
func (e Error) Error() string {
	return e.Message
}
// ExecutionProviderType is a string enumeration naming an execution provider.
type ExecutionProviderType string
const (
// ExecutionProviderTypeAuto auto-selects the provider: CoreML on macOS, CUDA on Linux, CPU elsewhere.
ExecutionProviderTypeAuto ExecutionProviderType = "auto"
// ExecutionProviderTypeCPU is the CPU execution provider (always available).
ExecutionProviderTypeCPU ExecutionProviderType = "cpu"
// ExecutionProviderTypeCoreMl is Apple CoreML (macOS/iOS Neural Engine + GPU).
ExecutionProviderTypeCoreMl ExecutionProviderType = "core_ml"
// ExecutionProviderTypeCuda is NVIDIA CUDA GPU acceleration.
ExecutionProviderTypeCuda ExecutionProviderType = "cuda"
// ExecutionProviderTypeTensorRt is NVIDIA TensorRT (optimized CUDA inference).
ExecutionProviderTypeTensorRt ExecutionProviderType = "tensor_rt"
)
// OutputFormat selects the output format for extraction results.
//
// It controls the format of the `content` field in `ExtractionResult`.
// When set to `Markdown`, `Djot`, or `Html`, the output uses that format.
// `Plain` returns the raw extracted text.
// `Structured` returns JSON with full OCR element data including bounding
// boxes and confidence scores.
type OutputFormat string
const (
// OutputFormatPlain is plain text content only (default).
OutputFormatPlain OutputFormat = "plain"
// OutputFormatMarkdown is Markdown format.
OutputFormatMarkdown OutputFormat = "markdown"
// OutputFormatDjot is Djot markup format.
OutputFormatDjot OutputFormat = "djot"
// OutputFormatHTML is HTML format.
OutputFormatHTML OutputFormat = "html"
// OutputFormatJSON is a JSON tree format with heading-driven sections.
OutputFormatJSON OutputFormat = "json"
// OutputFormatStructured is a structured JSON format with full OCR element metadata.
OutputFormatStructured OutputFormat = "structured"
)
// HTMLTheme is a string enumeration naming an HTML theme.
type HTMLTheme string
const (
// HTMLThemeDefault provides sensible defaults: system font stack, neutral colours, readable line
// measure. CSS custom properties (`--kb-*`) are all defined so user CSS
// can override individual values.
HTMLThemeDefault HTMLTheme = "default"
// HTMLThemeGitHub is a GitHub Markdown-inspired palette and spacing.
HTMLThemeGitHub HTMLTheme = "git_hub"
// HTMLThemeDark is a dark background with light text.
HTMLThemeDark HTMLTheme = "dark"
// HTMLThemeLight is a minimal light theme with generous whitespace.
HTMLThemeLight HTMLTheme = "light"
// HTMLThemeUnstyled emits no built-in stylesheet. CSS custom properties are still defined
// on `:root` so user stylesheets can reference `var(--kb-*)` tokens.
HTMLThemeUnstyled HTMLTheme = "unstyled"
)
// TableModel is a string enumeration naming a table structure model.
type TableModel string
const (
// TableModelTatr is TATR (Table Transformer) -- default, 30MB, DETR-based row/column detection.
TableModelTatr TableModel = "tatr"
// TableModelSlanetWired is the SLANeXT wired variant -- 365MB, optimized for bordered tables.
TableModelSlanetWired TableModel = "slanet_wired"
// TableModelSlanetWireless is the SLANeXT wireless variant -- 365MB, optimized for borderless tables.
TableModelSlanetWireless TableModel = "slanet_wireless"
// TableModelSlanetPlus is SLANet-plus -- 7.78MB, lightweight general-purpose.
TableModelSlanetPlus TableModel = "slanet_plus"
// TableModelSlanetAuto is classifier-routed SLANeXT: it auto-selects wired/wireless per table.
// Uses PP-LCNet classifier (6.78MB) + both SLANeXT variants (730MB total).
TableModelSlanetAuto TableModel = "slanet_auto"
// TableModelDisabled disables table structure model inference entirely; only the heuristic path is used.
TableModelDisabled TableModel = "disabled"
)
// ChunkerType is a string enumeration naming a chunker.
type ChunkerType string
const (
// ChunkerTypeText is the Text variant of ChunkerType.
ChunkerTypeText ChunkerType = "text"
// ChunkerTypeMarkdown is the Markdown variant of ChunkerType.
ChunkerTypeMarkdown ChunkerType = "markdown"
// ChunkerTypeYaml is the Yaml variant of ChunkerType.
ChunkerTypeYaml ChunkerType = "yaml"
// ChunkerTypeSemantic is the Semantic variant of ChunkerType.
ChunkerTypeSemantic ChunkerType = "semantic"
)
// ChunkSizing describes how chunk size is measured.
//
// Defaults to `Characters` (Unicode character count). When using token-based sizing,
// chunks are sized by token count according to the specified tokenizer.
//
// Token-based sizing uses HuggingFace tokenizers loaded at runtime. Any tokenizer
// available on HuggingFace Hub can be used, including OpenAI-compatible tokenizers
// (e.g., `Xenova/gpt-4o`, `Xenova/cl100k_base`).
//
// Variants: Characters, Tokenizer.
//
// The interface is sealed by its unexported marker method, so only types in
// this package can implement it — use one of ChunkSizingCharacters,
// ChunkSizingTokenizer.
type ChunkSizing interface {
// isChunkSizing is the unexported marker that seals the interface.
isChunkSizing()
// Type returns the variant's discriminator string.
Type() string
}
// ChunkSizingCharacters is the ChunkSizing variant that measures size in
// Unicode characters (the default).
type ChunkSizingCharacters struct{}

// isChunkSizing marks ChunkSizingCharacters as a ChunkSizing variant.
func (ChunkSizingCharacters) isChunkSizing() {}

// Type returns the variant tag "characters".
func (ChunkSizingCharacters) Type() string { return "characters" }

// MarshalJSON encodes the variant as a JSON object holding only the "type" tag.
func (v ChunkSizingCharacters) MarshalJSON() ([]byte, error) {
	tagged := struct {
		Type string `json:"type"`
	}{Type: v.Type()}
	return json.Marshal(tagged)
}
// ChunkSizingTokenizer is the ChunkSizing variant that measures size in
// tokens from a HuggingFace tokenizer.
type ChunkSizingTokenizer struct {
	// Model is the HuggingFace model ID or path, e.g. "Xenova/gpt-4o", "bert-base-uncased".
	Model string `json:"model"`
	// CacheDir is an optional cache directory override for tokenizer files.
	// Defaults to hf-hub's standard cache (`~/.cache/huggingface/`).
	// Can also be set via the `KREUZBERG_TOKENIZER_CACHE_DIR` environment variable.
	CacheDir *string `json:"cache_dir,omitempty"`
}

// isChunkSizing marks ChunkSizingTokenizer as a ChunkSizing variant.
func (ChunkSizingTokenizer) isChunkSizing() {}

// Type returns the variant tag "tokenizer".
func (ChunkSizingTokenizer) Type() string { return "tokenizer" }

// MarshalJSON encodes the variant as a JSON object whose "type" key carries
// the tag, followed by the struct's own fields. A nil CacheDir is omitted.
func (v ChunkSizingTokenizer) MarshalJSON() ([]byte, error) {
	// fields shares the variant's layout and JSON tags but has no methods,
	// so embedding it cannot re-enter this MarshalJSON.
	type fields ChunkSizingTokenizer
	return json.Marshal(struct {
		Type string `json:"type"`
		fields
	}{Type: v.Type(), fields: fields(v)})
}
// UnmarshalChunkSizing decodes JSON data into the concrete ChunkSizing
// variant selected by the object's "type" key.
//
// The data is decoded twice: once to read the tag, then again into the
// matching variant struct. It returns a nil ChunkSizing together with the
// decoding error when either pass fails, or with an "unknown ChunkSizing
// type" error when the tag matches no variant.
func UnmarshalChunkSizing(data []byte) (ChunkSizing, error) {
	var envelope struct {
		Type string `json:"type"`
	}
	if err := json.Unmarshal(data, &envelope); err != nil {
		return nil, err
	}

	var (
		decoded ChunkSizing
		err     error
	)
	switch envelope.Type {
	case "characters":
		var variant ChunkSizingCharacters
		err = json.Unmarshal(data, &variant)
		decoded = variant
	case "tokenizer":
		var variant ChunkSizingTokenizer
		err = json.Unmarshal(data, &variant)
		decoded = variant
	default:
		return nil, fmt.Errorf("unknown ChunkSizing type: %q", envelope.Type)
	}
	if err != nil {
		// Return a literal nil so callers never see a half-filled variant.
		return nil, err
	}
	return decoded, nil
}
// EmbeddingModelType describes the embedding model types supported by Kreuzberg.
//
// Variants: Preset, Custom, Llm, Plugin.
//
// The interface is sealed by its unexported marker method, so only types in
// this package can implement it — use one of EmbeddingModelTypePreset,
// EmbeddingModelTypeCustom, EmbeddingModelTypeLlm, EmbeddingModelTypePlugin.
type EmbeddingModelType interface {
// isEmbeddingModelType is the unexported marker that seals the interface.
isEmbeddingModelType()
// Type returns the variant's discriminator string.
Type() string
}
// EmbeddingModelTypePreset is the EmbeddingModelType variant that uses a
// preset model configuration (recommended).
type EmbeddingModelTypePreset struct {
	// Name is serialized under the "name" JSON key.
	Name string `json:"name"`
}

// isEmbeddingModelType marks EmbeddingModelTypePreset as an EmbeddingModelType variant.
func (EmbeddingModelTypePreset) isEmbeddingModelType() {}

// Type returns the variant tag "preset".
func (EmbeddingModelTypePreset) Type() string { return "preset" }

// MarshalJSON encodes the variant as a JSON object whose "type" key carries
// the tag, followed by the struct's own fields.
func (v EmbeddingModelTypePreset) MarshalJSON() ([]byte, error) {
	// fields shares the variant's layout and JSON tags but has no methods,
	// so embedding it cannot re-enter this MarshalJSON.
	type fields EmbeddingModelTypePreset
	return json.Marshal(struct {
		Type string `json:"type"`
		fields
	}{Type: v.Type(), fields: fields(v)})
}
// EmbeddingModelTypeCustom is the EmbeddingModelType variant that uses a
// custom ONNX model from HuggingFace.
type EmbeddingModelTypeCustom struct {
	// ModelID is serialized under the "model_id" JSON key.
	ModelID string `json:"model_id"`
	// Dimensions is serialized under the "dimensions" JSON key.
	Dimensions uint `json:"dimensions"`
}

// isEmbeddingModelType marks EmbeddingModelTypeCustom as an EmbeddingModelType variant.
func (EmbeddingModelTypeCustom) isEmbeddingModelType() {}

// Type returns the variant tag "custom".
func (EmbeddingModelTypeCustom) Type() string { return "custom" }

// MarshalJSON encodes the variant as a JSON object whose "type" key carries
// the tag, followed by the struct's own fields.
func (v EmbeddingModelTypeCustom) MarshalJSON() ([]byte, error) {
	// fields shares the variant's layout and JSON tags but has no methods,
	// so embedding it cannot re-enter this MarshalJSON.
	type fields EmbeddingModelTypeCustom
	return json.Marshal(struct {
		Type string `json:"type"`
		fields
	}{Type: v.Type(), fields: fields(v)})
}
// EmbeddingModelTypeLlm is the EmbeddingModelType variant for a
// provider-hosted embedding model via liter-llm.
//
// It uses the model specified in the nested `LlmConfig` (e.g.,
// `"openai/text-embedding-3-small"`).
type EmbeddingModelTypeLlm struct {
	// Llm is serialized under the "llm" JSON key.
	Llm LlmConfig `json:"llm"`
}

// isEmbeddingModelType marks EmbeddingModelTypeLlm as an EmbeddingModelType variant.
func (EmbeddingModelTypeLlm) isEmbeddingModelType() {}

// Type returns the variant tag "llm".
func (EmbeddingModelTypeLlm) Type() string { return "llm" }

// MarshalJSON encodes the variant as a JSON object whose "type" key carries
// the tag, followed by the struct's own fields.
func (v EmbeddingModelTypeLlm) MarshalJSON() ([]byte, error) {
	// fields shares the variant's layout and JSON tags but has no methods,
	// so embedding it cannot re-enter this MarshalJSON.
	type fields EmbeddingModelTypeLlm
	return json.Marshal(struct {
		Type string `json:"type"`
		fields
	}{Type: v.Type(), fields: fields(v)})
}
// EmbeddingModelTypePlugin is the EmbeddingModelType variant for an
// in-process embedding backend registered via the plugin system.
//
// The caller registers an EmbeddingBackend once (e.g. a wrapper around an
// already-loaded `llama-cpp-python`, `sentence-transformers`, or tuned ONNX
// model), then references it by name in config. Kreuzberg calls back into the
// registered backend during chunking and standalone embed requests — no
// HuggingFace download, no ONNX Runtime requirement, no HTTP sidecar.
//
// When this variant is selected, only the following EmbeddingConfig fields
// apply: `normalize` (post-call L2 normalization) and
// `max_embed_duration_secs` (dispatcher timeout). Model-loading fields
// (`batch_size`, `cache_dir`, `show_download_progress`, `acceleration`) are
// ignored — the host owns the model lifecycle.
//
// Semantic chunking falls back to ChunkingConfig's `max_characters` when this
// variant is used, since there is no preset to look a chunk-size ceiling up
// against — size your context window via `max_characters` directly.
//
// See `register_embedding_backend`.
type EmbeddingModelTypePlugin struct {
	// Name is serialized under the "name" JSON key.
	Name string `json:"name"`
}

// isEmbeddingModelType marks EmbeddingModelTypePlugin as an EmbeddingModelType variant.
func (EmbeddingModelTypePlugin) isEmbeddingModelType() {}

// Type returns the variant tag "plugin".
func (EmbeddingModelTypePlugin) Type() string { return "plugin" }

// MarshalJSON encodes the variant as a JSON object whose "type" key carries
// the tag, followed by the struct's own fields.
func (v EmbeddingModelTypePlugin) MarshalJSON() ([]byte, error) {
	// fields shares the variant's layout and JSON tags but has no methods,
	// so embedding it cannot re-enter this MarshalJSON.
	type fields EmbeddingModelTypePlugin
	return json.Marshal(struct {
		Type string `json:"type"`
		fields
	}{Type: v.Type(), fields: fields(v)})
}
// UnmarshalEmbeddingModelType decodes JSON data into the concrete
// EmbeddingModelType variant selected by the object's "type" key.
//
// The data is decoded twice: once to read the tag, then again into the
// matching variant struct. It returns a nil EmbeddingModelType together with
// the decoding error when either pass fails, or with an "unknown
// EmbeddingModelType type" error when the tag matches no variant.
func UnmarshalEmbeddingModelType(data []byte) (EmbeddingModelType, error) {
	var envelope struct {
		Type string `json:"type"`
	}
	if err := json.Unmarshal(data, &envelope); err != nil {
		return nil, err
	}

	var (
		decoded EmbeddingModelType
		err     error
	)
	switch envelope.Type {
	case "preset":
		var variant EmbeddingModelTypePreset
		err = json.Unmarshal(data, &variant)
		decoded = variant
	case "custom":
		var variant EmbeddingModelTypeCustom
		err = json.Unmarshal(data, &variant)
		decoded = variant
	case "llm":
		var variant EmbeddingModelTypeLlm
		err = json.Unmarshal(data, &variant)
		decoded = variant
	case "plugin":
		var variant EmbeddingModelTypePlugin
		err = json.Unmarshal(data, &variant)
		decoded = variant
	default:
		return nil, fmt.Errorf("unknown EmbeddingModelType type: %q", envelope.Type)
	}
	if err != nil {
		// Return a literal nil so callers never see a half-filled variant.
		return nil, err
	}
	return decoded, nil
}
// CodeContentMode is a string enumeration naming what is used as content for code.
type CodeContentMode string
const (
// CodeContentModeChunks uses TSLP semantic chunks as content (default).
CodeContentModeChunks CodeContentMode = "chunks"
// CodeContentModeRaw uses raw source code as content.
CodeContentModeRaw CodeContentMode = "raw"
// CodeContentModeStructure emits function/class headings + docstrings (no code bodies).
CodeContentModeStructure CodeContentMode = "structure"
)
// ListType is a string enumeration naming a kind of list.
type ListType string
const (
// ListTypeBullet is for bullet points (-, *, •, etc.).
ListTypeBullet ListType = "bullet"
// ListTypeNumbered is for numbered lists (1., 2., etc.).
ListTypeNumbered ListType = "numbered"
// ListTypeLettered is for lettered lists (a., b., A., B., etc.).
ListTypeLettered ListType = "lettered"
// ListTypeIndented is for indented items.
ListTypeIndented ListType = "indented"
)
// OcrBackendType is a string enumeration naming an OCR backend.
type OcrBackendType string
const (
// OcrBackendTypeTesseract is Tesseract OCR (native Rust binding).
OcrBackendTypeTesseract OcrBackendType = "tesseract"
// OcrBackendTypeEasyOcr is EasyOCR (Python-based, via FFI).
OcrBackendTypeEasyOcr OcrBackendType = "easy_ocr"
// OcrBackendTypePaddleOcr is PaddleOCR (Python-based, via FFI).
OcrBackendTypePaddleOcr OcrBackendType = "paddle_ocr"
// OcrBackendTypeCustom is a custom/third-party OCR backend.
OcrBackendTypeCustom OcrBackendType = "custom"
)
// ProcessingStage is a string enumeration naming a processing stage
// (early, middle, or late).
type ProcessingStage string
const (
// ProcessingStageEarly is the early stage - foundational processing.
//
// Use for:
// - Language detection
// - Character encoding normalization
// - Entity extraction (NER)
// - Text quality scoring
ProcessingStageEarly ProcessingStage = "early"
// ProcessingStageMiddle is the middle stage - content transformation.
//
// Use for:
// - Keyword extraction
// - Token reduction
// - Text summarization
// - Semantic analysis
ProcessingStageMiddle ProcessingStage = "middle"
// ProcessingStageLate is the late stage - final enrichment.
//
// Use for:
// - Custom user hooks
// - Analytics/logging
// - Final validation
// - Output formatting
ProcessingStageLate ProcessingStage = "late"
)
// ReductionLevel is a string enumeration naming a reduction level.
type ReductionLevel string
const (
// ReductionLevelOff is the Off variant of ReductionLevel.
ReductionLevelOff ReductionLevel = "off"
// ReductionLevelLight is the Light variant of ReductionLevel.
ReductionLevelLight ReductionLevel = "light"
// ReductionLevelModerate is the Moderate variant of ReductionLevel.
ReductionLevelModerate ReductionLevel = "moderate"
// ReductionLevelAggressive is the Aggressive variant of ReductionLevel.
ReductionLevelAggressive ReductionLevel = "aggressive"
// ReductionLevelMaximum is the Maximum variant of ReductionLevel.
ReductionLevelMaximum ReductionLevel = "maximum"
)
// PdfAnnotationType is a string enumeration naming a kind of PDF annotation.
type PdfAnnotationType string
const (
// PdfAnnotationTypeText is a sticky note / text annotation.
PdfAnnotationTypeText PdfAnnotationType = "text"
// PdfAnnotationTypeHighlight is a highlighted text region.
PdfAnnotationTypeHighlight PdfAnnotationType = "highlight"
// PdfAnnotationTypeLink is a hyperlink annotation.
PdfAnnotationTypeLink PdfAnnotationType = "link"
// PdfAnnotationTypeStamp is a rubber stamp annotation.
PdfAnnotationTypeStamp PdfAnnotationType = "stamp"
// PdfAnnotationTypeUnderline is underline text markup.
PdfAnnotationTypeUnderline PdfAnnotationType = "underline"
// PdfAnnotationTypeStrikeOut is strikeout text markup.
PdfAnnotationTypeStrikeOut PdfAnnotationType = "strike_out"
// PdfAnnotationTypeOther is any other annotation type.
PdfAnnotationTypeOther PdfAnnotationType = "other"
)
// BlockType is a string enumeration naming a kind of block.
type BlockType string
const (
// BlockTypeParagraph is the Paragraph variant of BlockType.
BlockTypeParagraph BlockType = "paragraph"
// BlockTypeHeading is the Heading variant of BlockType.
BlockTypeHeading BlockType = "heading"
// BlockTypeBlockquote is the Blockquote variant of BlockType.
BlockTypeBlockquote BlockType = "blockquote"
// BlockTypeCodeBlock is the CodeBlock variant of BlockType.
BlockTypeCodeBlock BlockType = "code_block"
// BlockTypeListItem is the ListItem variant of BlockType.
BlockTypeListItem BlockType = "list_item"
// BlockTypeOrderedList is the OrderedList variant of BlockType.
BlockTypeOrderedList BlockType = "ordered_list"
// BlockTypeBulletList is the BulletList variant of BlockType.
BlockTypeBulletList BlockType = "bullet_list"
// BlockTypeTaskList is the TaskList variant of BlockType.
BlockTypeTaskList BlockType = "task_list"
// BlockTypeDefinitionList is the DefinitionList variant of BlockType.
BlockTypeDefinitionList BlockType = "definition_list"
// BlockTypeDefinitionTerm is the DefinitionTerm variant of BlockType.
BlockTypeDefinitionTerm BlockType = "definition_term"
// BlockTypeDefinitionDescription is the DefinitionDescription variant of BlockType.
BlockTypeDefinitionDescription BlockType = "definition_description"
// BlockTypeDiv is the Div variant of BlockType.
BlockTypeDiv BlockType = "div"
// BlockTypeSection is the Section variant of BlockType.
BlockTypeSection BlockType = "section"
// BlockTypeThematicBreak is the ThematicBreak variant of BlockType.
BlockTypeThematicBreak BlockType = "thematic_break"
// BlockTypeRawBlock is the RawBlock variant of BlockType.
BlockTypeRawBlock BlockType = "raw_block"
// BlockTypeMathDisplay is the MathDisplay variant of BlockType.
BlockTypeMathDisplay BlockType = "math_display"
)
// InlineType is a string enumeration naming a kind of inline element.
type InlineType string
const (
// InlineTypeText is the Text variant of InlineType.
InlineTypeText InlineType = "text"
// InlineTypeStrong is the Strong variant of InlineType.
InlineTypeStrong InlineType = "strong"
// InlineTypeEmphasis is the Emphasis variant of InlineType.
InlineTypeEmphasis InlineType = "emphasis"
// InlineTypeHighlight is the Highlight variant of InlineType.
InlineTypeHighlight InlineType = "highlight"
// InlineTypeSubscript is the Subscript variant of InlineType.
InlineTypeSubscript InlineType = "subscript"
// InlineTypeSuperscript is the Superscript variant of InlineType.
InlineTypeSuperscript InlineType = "superscript"
// InlineTypeInsert is the Insert variant of InlineType.
InlineTypeInsert InlineType = "insert"
// InlineTypeDelete is the Delete variant of InlineType.
InlineTypeDelete InlineType = "delete"
// InlineTypeCode is the Code variant of InlineType.
InlineTypeCode InlineType = "code"
// InlineTypeLink is the Link variant of InlineType.
InlineTypeLink InlineType = "link"
// InlineTypeImage is the Image variant of InlineType.
InlineTypeImage InlineType = "image"
// InlineTypeSpan is the Span variant of InlineType.
InlineTypeSpan InlineType = "span"
// InlineTypeMath is the Math variant of InlineType.
InlineTypeMath InlineType = "math"
// InlineTypeRawInline is the RawInline variant of InlineType.
InlineTypeRawInline InlineType = "raw_inline"
// InlineTypeFootnoteRef is the FootnoteRef variant of InlineType.
InlineTypeFootnoteRef InlineType = "footnote_ref"
// InlineTypeSymbol is the Symbol variant of InlineType.
InlineTypeSymbol InlineType = "symbol"
)
// RelationshipKind is a string enumeration naming a kind of source -> target relationship.
type RelationshipKind string
const (
// RelationshipKindFootnoteReference links a footnote marker -> footnote definition.
RelationshipKindFootnoteReference RelationshipKind = "footnote_reference"
// RelationshipKindCitationReference links a citation marker -> bibliography entry.
RelationshipKindCitationReference RelationshipKind = "citation_reference"
// RelationshipKindInternalLink links an internal anchor link (`#id`) -> target heading/element.
RelationshipKindInternalLink RelationshipKind = "internal_link"
// RelationshipKindCaption links a caption paragraph -> figure/table it describes.
RelationshipKindCaption RelationshipKind = "caption"
// RelationshipKindLabel links a label -> labeled element (HTML `<label for>`, LaTeX `\label{}`).
RelationshipKindLabel RelationshipKind = "label"
// RelationshipKindTocEntry links a TOC entry -> target section.
RelationshipKindTocEntry RelationshipKind = "toc_entry"
// RelationshipKindCrossReference is a cross-reference (LaTeX `\ref{}`, DOCX cross-reference field).
RelationshipKindCrossReference RelationshipKind = "cross_reference"
)
// ContentLayer is a string enumeration naming the layer a piece of content belongs to.
type ContentLayer string
const (
// ContentLayerBody is main document body content.
ContentLayerBody ContentLayer = "body"
// ContentLayerHeader is a page/section header (running header).
ContentLayerHeader ContentLayer = "header"
// ContentLayerFooter is a page/section footer (running footer).
ContentLayerFooter ContentLayer = "footer"
// ContentLayerFootnote is footnote content.
ContentLayerFootnote ContentLayer = "footnote"
)
// NodeContent is a tagged enum for node content. Each variant carries only type-specific data.
//
// Uses `#[serde(tag = "node_type")]` to avoid "type" keyword collision in
// Go/Java/TypeScript bindings.
//
// Variants: Title, Heading, Paragraph, List, ListItem, Table, Image, Code, Quote, Formula, Footnote, Group, PageBreak, Slide, DefinitionList, DefinitionItem, Citation, Admonition, RawBlock, MetadataBlock
//
// The interface is sealed by its unexported marker method, so only types in
// this package can implement it — use one of the NodeContent-prefixed variant
// structs, e.g. NodeContentTitle or NodeContentHeading.
type NodeContent interface {
// isNodeContent is the unexported marker that seals the interface.
isNodeContent()
// Type returns the variant's discriminator string.
Type() string
}
// NodeContentTitle is the NodeContent variant for a document title.
type NodeContentTitle struct {
	// Text is serialized under the "text" JSON key.
	Text string `json:"text"`
}

// isNodeContent marks NodeContentTitle as a NodeContent variant.
func (NodeContentTitle) isNodeContent() {}

// Type returns the variant tag "title".
func (NodeContentTitle) Type() string { return "title" }

// MarshalJSON encodes the variant as a JSON object whose "node_type" key
// carries the tag, followed by the struct's own fields.
func (v NodeContentTitle) MarshalJSON() ([]byte, error) {
	// fields shares the variant's layout and JSON tags but has no methods,
	// so embedding it cannot re-enter this MarshalJSON.
	type fields NodeContentTitle
	return json.Marshal(struct {
		NodeType string `json:"node_type"`
		fields
	}{NodeType: v.Type(), fields: fields(v)})
}
// NodeContentHeading is the NodeContent variant for a section heading with
// level (1-6).
type NodeContentHeading struct {
	// Level is serialized under the "level" JSON key.
	Level uint8 `json:"level"`
	// Text is serialized under the "text" JSON key.
	Text string `json:"text"`
}

// isNodeContent marks NodeContentHeading as a NodeContent variant.
func (NodeContentHeading) isNodeContent() {}

// Type returns the variant tag "heading".
func (NodeContentHeading) Type() string { return "heading" }

// MarshalJSON encodes the variant as a JSON object whose "node_type" key
// carries the tag, followed by the struct's own fields.
func (v NodeContentHeading) MarshalJSON() ([]byte, error) {
	// fields shares the variant's layout and JSON tags but has no methods,
	// so embedding it cannot re-enter this MarshalJSON.
	type fields NodeContentHeading
	return json.Marshal(struct {
		NodeType string `json:"node_type"`
		fields
	}{NodeType: v.Type(), fields: fields(v)})
}
// NodeContentParagraph is the NodeContent variant for a body text paragraph.
type NodeContentParagraph struct {
	// Text is serialized under the "text" JSON key.
	Text string `json:"text"`
}

// isNodeContent marks NodeContentParagraph as a NodeContent variant.
func (NodeContentParagraph) isNodeContent() {}

// Type returns the variant tag "paragraph".
func (NodeContentParagraph) Type() string { return "paragraph" }

// MarshalJSON encodes the variant as a JSON object whose "node_type" key
// carries the tag, followed by the struct's own fields.
func (v NodeContentParagraph) MarshalJSON() ([]byte, error) {
	// fields shares the variant's layout and JSON tags but has no methods,
	// so embedding it cannot re-enter this MarshalJSON.
	type fields NodeContentParagraph
	return json.Marshal(struct {
		NodeType string `json:"node_type"`
		fields
	}{NodeType: v.Type(), fields: fields(v)})
}
// NodeContentList is the NodeContent variant for a list container —
// children are `ListItem` nodes.
type NodeContentList struct {
	// Ordered is serialized under the "ordered" JSON key.
	Ordered bool `json:"ordered"`
}

// isNodeContent marks NodeContentList as a NodeContent variant.
func (NodeContentList) isNodeContent() {}

// Type returns the variant tag "list".
func (NodeContentList) Type() string { return "list" }

// MarshalJSON encodes the variant as a JSON object whose "node_type" key
// carries the tag, followed by the struct's own fields.
func (v NodeContentList) MarshalJSON() ([]byte, error) {
	// fields shares the variant's layout and JSON tags but has no methods,
	// so embedding it cannot re-enter this MarshalJSON.
	type fields NodeContentList
	return json.Marshal(struct {
		NodeType string `json:"node_type"`
		fields
	}{NodeType: v.Type(), fields: fields(v)})
}
// NodeContentListItem is the NodeContent variant for an individual list item.
type NodeContentListItem struct {
	// Text is serialized under the "text" JSON key.
	Text string `json:"text"`
}

// isNodeContent marks NodeContentListItem as a NodeContent variant.
func (NodeContentListItem) isNodeContent() {}

// Type returns the variant tag "list_item".
func (NodeContentListItem) Type() string { return "list_item" }

// MarshalJSON encodes the variant as a JSON object whose "node_type" key
// carries the tag, followed by the struct's own fields.
func (v NodeContentListItem) MarshalJSON() ([]byte, error) {
	// fields shares the variant's layout and JSON tags but has no methods,
	// so embedding it cannot re-enter this MarshalJSON.
	type fields NodeContentListItem
	return json.Marshal(struct {
		NodeType string `json:"node_type"`
		fields
	}{NodeType: v.Type(), fields: fields(v)})
}
// NodeContentTable is the NodeContent variant for a table with a structured
// cell grid.
type NodeContentTable struct {
	// Grid is serialized under the "grid" JSON key.
	Grid TableGrid `json:"grid"`
}

// isNodeContent marks NodeContentTable as a NodeContent variant.
func (NodeContentTable) isNodeContent() {}

// Type returns the variant tag "table".
func (NodeContentTable) Type() string { return "table" }

// MarshalJSON encodes the variant as a JSON object whose "node_type" key
// carries the tag, followed by the struct's own fields.
func (v NodeContentTable) MarshalJSON() ([]byte, error) {
	// fields shares the variant's layout and JSON tags but has no methods,
	// so embedding it cannot re-enter this MarshalJSON.
	type fields NodeContentTable
	return json.Marshal(struct {
		NodeType string `json:"node_type"`
		fields
	}{NodeType: v.Type(), fields: fields(v)})
}
// NodeContentImage is the NodeContent variant for an image reference.
type NodeContentImage struct {
	// Description is optional; it is omitted from the JSON when nil.
	Description *string `json:"description,omitempty"`
	// ImageIndex is optional; it is omitted from the JSON when nil.
	ImageIndex *uint32 `json:"image_index,omitempty"`
	// Src is the source URL or path of the image (from `<img src="...">` or `![](src)`).
	Src *string `json:"src,omitempty"`
}

// isNodeContent marks NodeContentImage as a NodeContent variant.
func (NodeContentImage) isNodeContent() {}

// Type returns the variant tag "image".
func (NodeContentImage) Type() string { return "image" }

// MarshalJSON encodes the variant as a JSON object whose "node_type" key
// carries the tag, followed by the struct's non-nil fields.
func (v NodeContentImage) MarshalJSON() ([]byte, error) {
	// fields shares the variant's layout and JSON tags but has no methods,
	// so embedding it cannot re-enter this MarshalJSON.
	type fields NodeContentImage
	return json.Marshal(struct {
		NodeType string `json:"node_type"`
		fields
	}{NodeType: v.Type(), fields: fields(v)})
}
// NodeContentCode is the NodeContent variant for a code block.
type NodeContentCode struct {
	// Text is serialized under the "text" JSON key.
	Text string `json:"text"`
	// Language is optional; it is omitted from the JSON when nil.
	Language *string `json:"language,omitempty"`
}

// isNodeContent marks NodeContentCode as a NodeContent variant.
func (NodeContentCode) isNodeContent() {}

// Type returns the variant tag "code".
func (NodeContentCode) Type() string { return "code" }

// MarshalJSON encodes the variant as a JSON object whose "node_type" key
// carries the tag, followed by the struct's own fields.
func (v NodeContentCode) MarshalJSON() ([]byte, error) {
	// fields shares the variant's layout and JSON tags but has no methods,
	// so embedding it cannot re-enter this MarshalJSON.
	type fields NodeContentCode
	return json.Marshal(struct {
		NodeType string `json:"node_type"`
		fields
	}{NodeType: v.Type(), fields: fields(v)})
}
// NodeContentQuote is the NodeContent variant for a block quote — a
// container whose children carry the quoted content.
type NodeContentQuote struct{}

// isNodeContent marks NodeContentQuote as a NodeContent variant.
func (NodeContentQuote) isNodeContent() {}

// Type returns the variant tag "quote".
func (NodeContentQuote) Type() string { return "quote" }

// MarshalJSON encodes the variant as a JSON object holding only the
// "node_type" tag.
func (v NodeContentQuote) MarshalJSON() ([]byte, error) {
	tagged := struct {
		NodeType string `json:"node_type"`
	}{NodeType: v.Type()}
	return json.Marshal(tagged)
}
// NodeContentFormula is the NodeContent variant for a mathematical formula /
// equation.
type NodeContentFormula struct {
	// Text is serialized under the "text" JSON key.
	Text string `json:"text"`
}

// isNodeContent marks NodeContentFormula as a NodeContent variant.
func (NodeContentFormula) isNodeContent() {}

// Type returns the variant tag "formula".
func (NodeContentFormula) Type() string { return "formula" }

// MarshalJSON encodes the variant as a JSON object whose "node_type" key
// carries the tag, followed by the struct's own fields.
func (v NodeContentFormula) MarshalJSON() ([]byte, error) {
	// fields shares the variant's layout and JSON tags but has no methods,
	// so embedding it cannot re-enter this MarshalJSON.
	type fields NodeContentFormula
	return json.Marshal(struct {
		NodeType string `json:"node_type"`
		fields
	}{NodeType: v.Type(), fields: fields(v)})
}
// NodeContentFootnote is the NodeContent variant for footnote reference
// content.
type NodeContentFootnote struct {
	// Text is serialized under the "text" JSON key.
	Text string `json:"text"`
}

// isNodeContent marks NodeContentFootnote as a NodeContent variant.
func (NodeContentFootnote) isNodeContent() {}

// Type returns the variant tag "footnote".
func (NodeContentFootnote) Type() string { return "footnote" }

// MarshalJSON encodes the variant as a JSON object whose "node_type" key
// carries the tag, followed by the struct's own fields.
func (v NodeContentFootnote) MarshalJSON() ([]byte, error) {
	// fields shares the variant's layout and JSON tags but has no methods,
	// so embedding it cannot re-enter this MarshalJSON.
	type fields NodeContentFootnote
	return json.Marshal(struct {
		NodeType string `json:"node_type"`
		fields
	}{NodeType: v.Type(), fields: fields(v)})
}
// NodeContentGroup is the NodeContent variant for a logical grouping
// container (section, key-value area).
//
// `heading_level` + `heading_text` capture the section heading directly
// rather than relying on a first-child positional convention.
type NodeContentGroup struct {
	// Label is optional; it is omitted from the JSON when nil.
	Label *string `json:"label,omitempty"`
	// HeadingLevel is optional; it is omitted from the JSON when nil.
	HeadingLevel *uint8 `json:"heading_level,omitempty"`
	// HeadingText is optional; it is omitted from the JSON when nil.
	HeadingText *string `json:"heading_text,omitempty"`
}

// isNodeContent marks NodeContentGroup as a NodeContent variant.
func (NodeContentGroup) isNodeContent() {}

// Type returns the variant tag "group".
func (NodeContentGroup) Type() string { return "group" }

// MarshalJSON encodes the variant as a JSON object whose "node_type" key
// carries the tag, followed by the struct's non-nil fields.
func (v NodeContentGroup) MarshalJSON() ([]byte, error) {
	// fields shares the variant's layout and JSON tags but has no methods,
	// so embedding it cannot re-enter this MarshalJSON.
	type fields NodeContentGroup
	return json.Marshal(struct {
		NodeType string `json:"node_type"`
		fields
	}{NodeType: v.Type(), fields: fields(v)})
}
// NodeContentPageBreak is the NodeContent variant for a page break marker.
type NodeContentPageBreak struct{}

// isNodeContent marks NodeContentPageBreak as a NodeContent variant.
func (NodeContentPageBreak) isNodeContent() {}

// Type returns the variant tag "page_break".
func (NodeContentPageBreak) Type() string { return "page_break" }

// MarshalJSON encodes the variant as a JSON object holding only the
// "node_type" tag.
func (v NodeContentPageBreak) MarshalJSON() ([]byte, error) {
	tagged := struct {
		NodeType string `json:"node_type"`
	}{NodeType: v.Type()}
	return json.Marshal(tagged)
}
// NodeContentSlide is the NodeContent variant for a presentation slide
// container; the slide's content nodes are its children.
type NodeContentSlide struct {
	// Number is the 1-indexed slide number.
	Number uint32 `json:"number"`
	// Title is optional; a nil pointer omits the "title" key.
	Title *string `json:"title,omitempty"`
}

// isNodeContent marks NodeContentSlide as a NodeContent variant.
func (NodeContentSlide) isNodeContent() {}

// Type returns the discriminator value "slide".
func (NodeContentSlide) Type() string { return "slide" }

// MarshalJSON encodes the variant as a JSON object that begins with the
// "node_type" discriminator, followed by the struct's own fields.
func (s NodeContentSlide) MarshalJSON() ([]byte, error) {
	// A method-free copy of the type keeps json.Marshal from calling
	// MarshalJSON again.
	type fields NodeContentSlide
	return json.Marshal(struct {
		NodeType string `json:"node_type"`
		fields
	}{NodeType: s.Type(), fields: fields(s)})
}
// NodeContentDefinitionList is the NodeContent variant for a definition list
// container; its children are `DefinitionItem` nodes. It carries no fields.
type NodeContentDefinitionList struct{}

// isNodeContent marks NodeContentDefinitionList as a NodeContent variant.
func (NodeContentDefinitionList) isNodeContent() {}

// Type returns the discriminator value "definition_list".
func (NodeContentDefinitionList) Type() string { return "definition_list" }

// MarshalJSON encodes the variant as a JSON object containing only the
// "node_type" discriminator.
func (l NodeContentDefinitionList) MarshalJSON() ([]byte, error) {
	return json.Marshal(struct {
		NodeType string `json:"node_type"`
	}{NodeType: l.Type()})
}
// NodeContentDefinitionItem is the NodeContent variant for a single
// definition list entry made of a term and its definition.
type NodeContentDefinitionItem struct {
	// Term is encoded under the "term" JSON key.
	Term string `json:"term"`
	// Definition is encoded under the "definition" JSON key.
	Definition string `json:"definition"`
}

// isNodeContent marks NodeContentDefinitionItem as a NodeContent variant.
func (NodeContentDefinitionItem) isNodeContent() {}

// Type returns the discriminator value "definition_item".
func (NodeContentDefinitionItem) Type() string { return "definition_item" }

// MarshalJSON encodes the variant as a JSON object that begins with the
// "node_type" discriminator, followed by the struct's own fields.
func (d NodeContentDefinitionItem) MarshalJSON() ([]byte, error) {
	// A method-free copy of the type keeps json.Marshal from calling
	// MarshalJSON again.
	type fields NodeContentDefinitionItem
	return json.Marshal(struct {
		NodeType string `json:"node_type"`
		fields
	}{NodeType: d.Type(), fields: fields(d)})
}
// NodeContentCitation is the NodeContent variant for a citation or
// bibliographic reference.
type NodeContentCitation struct {
	// Key is encoded under the "key" JSON key.
	Key string `json:"key"`
	// Text is encoded under the "text" JSON key.
	Text string `json:"text"`
}

// isNodeContent marks NodeContentCitation as a NodeContent variant.
func (NodeContentCitation) isNodeContent() {}

// Type returns the discriminator value "citation".
func (NodeContentCitation) Type() string { return "citation" }

// MarshalJSON encodes the variant as a JSON object that begins with the
// "node_type" discriminator, followed by the struct's own fields.
func (c NodeContentCitation) MarshalJSON() ([]byte, error) {
	// A method-free copy of the type keeps json.Marshal from calling
	// MarshalJSON again.
	type fields NodeContentCitation
	return json.Marshal(struct {
		NodeType string `json:"node_type"`
		fields
	}{NodeType: c.Type(), fields: fields(c)})
}
// NodeContentAdmonition is the NodeContent variant for an admonition or
// callout container (note, warning, tip, etc.).
//
// The admonition body is carried by the node's children.
type NodeContentAdmonition struct {
	// Kind is the admonition kind (e.g. "note", "warning", "tip", "danger").
	Kind string `json:"kind"`
	// Title is optional; a nil pointer omits the "title" key.
	Title *string `json:"title,omitempty"`
}

// isNodeContent marks NodeContentAdmonition as a NodeContent variant.
func (NodeContentAdmonition) isNodeContent() {}

// Type returns the discriminator value "admonition".
func (NodeContentAdmonition) Type() string { return "admonition" }

// MarshalJSON encodes the variant as a JSON object that begins with the
// "node_type" discriminator, followed by the struct's own fields.
func (a NodeContentAdmonition) MarshalJSON() ([]byte, error) {
	// A method-free copy of the type keeps json.Marshal from calling
	// MarshalJSON again.
	type fields NodeContentAdmonition
	return json.Marshal(struct {
		NodeType string `json:"node_type"`
		fields
	}{NodeType: a.Type(), fields: fields(a)})
}
// NodeContentRawBlock is the NodeContent variant for a raw block preserved
// verbatim from the source format.
//
// It is used for content that cannot be mapped to a semantic node type
// (e.g. JSX in MDX, raw LaTeX in markdown, embedded HTML).
type NodeContentRawBlock struct {
	// Format is the source format identifier (e.g. "html", "latex", "jsx").
	Format string `json:"format"`
	// Content is encoded under the "content" JSON key.
	Content string `json:"content"`
}

// isNodeContent marks NodeContentRawBlock as a NodeContent variant.
func (NodeContentRawBlock) isNodeContent() {}

// Type returns the discriminator value "raw_block".
func (NodeContentRawBlock) Type() string { return "raw_block" }

// MarshalJSON encodes the variant as a JSON object that begins with the
// "node_type" discriminator, followed by the struct's own fields.
func (r NodeContentRawBlock) MarshalJSON() ([]byte, error) {
	// A method-free copy of the type keeps json.Marshal from calling
	// MarshalJSON again.
	type fields NodeContentRawBlock
	return json.Marshal(struct {
		NodeType string `json:"node_type"`
		fields
	}{NodeType: r.Type(), fields: fields(r)})
}
// NodeContentMetadataBlock is the NodeContent variant for a structured
// metadata block (email headers, YAML frontmatter, etc.).
type NodeContentMetadataBlock struct {
	// Entries is encoded under the "entries" JSON key as an array of string
	// arrays.
	Entries [][]string `json:"entries"`
}

// isNodeContent marks NodeContentMetadataBlock as a NodeContent variant.
func (NodeContentMetadataBlock) isNodeContent() {}

// Type returns the discriminator value "metadata_block".
func (NodeContentMetadataBlock) Type() string { return "metadata_block" }

// MarshalJSON encodes the variant as a JSON object that begins with the
// "node_type" discriminator, followed by "entries".
//
// A nil Entries slice is written as an empty array instead of null, so the
// zero value still produces a list under "entries".
// NOTE(review): assumes consumers of this JSON expect an array here and do
// not rely on null — confirm against the native side.
func (v NodeContentMetadataBlock) MarshalJSON() ([]byte, error) {
	type aux struct {
		NodeType string     `json:"node_type"`
		Entries  [][]string `json:"entries"`
	}
	entries := v.Entries
	if entries == nil {
		// encoding/json renders a nil slice as null; substitute an empty,
		// non-nil slice to get [].
		entries = [][]string{}
	}
	return json.Marshal(aux{
		NodeType: v.Type(),
		Entries:  entries,
	})
}
// UnmarshalNodeContent decodes data into the concrete NodeContent variant
// selected by its "node_type" field.
//
// The input is decoded twice: once to read the discriminator and once into
// the matching variant struct. It returns an error when either decode fails
// or when the discriminator does not name a known variant.
func UnmarshalNodeContent(data []byte) (NodeContent, error) {
	var tag struct {
		NodeType string `json:"node_type"`
	}
	if err := json.Unmarshal(data, &tag); err != nil {
		return nil, err
	}

	var (
		content NodeContent
		err     error
	)
	switch tag.NodeType {
	case "title":
		var v NodeContentTitle
		err = json.Unmarshal(data, &v)
		content = v
	case "heading":
		var v NodeContentHeading
		err = json.Unmarshal(data, &v)
		content = v
	case "paragraph":
		var v NodeContentParagraph
		err = json.Unmarshal(data, &v)
		content = v
	case "list":
		var v NodeContentList
		err = json.Unmarshal(data, &v)
		content = v
	case "list_item":
		var v NodeContentListItem
		err = json.Unmarshal(data, &v)
		content = v
	case "table":
		var v NodeContentTable
		err = json.Unmarshal(data, &v)
		content = v
	case "image":
		var v NodeContentImage
		err = json.Unmarshal(data, &v)
		content = v
	case "code":
		var v NodeContentCode
		err = json.Unmarshal(data, &v)
		content = v
	case "quote":
		var v NodeContentQuote
		err = json.Unmarshal(data, &v)
		content = v
	case "formula":
		var v NodeContentFormula
		err = json.Unmarshal(data, &v)
		content = v
	case "footnote":
		var v NodeContentFootnote
		err = json.Unmarshal(data, &v)
		content = v
	case "group":
		var v NodeContentGroup
		err = json.Unmarshal(data, &v)
		content = v
	case "page_break":
		var v NodeContentPageBreak
		err = json.Unmarshal(data, &v)
		content = v
	case "slide":
		var v NodeContentSlide
		err = json.Unmarshal(data, &v)
		content = v
	case "definition_list":
		var v NodeContentDefinitionList
		err = json.Unmarshal(data, &v)
		content = v
	case "definition_item":
		var v NodeContentDefinitionItem
		err = json.Unmarshal(data, &v)
		content = v
	case "citation":
		var v NodeContentCitation
		err = json.Unmarshal(data, &v)
		content = v
	case "admonition":
		var v NodeContentAdmonition
		err = json.Unmarshal(data, &v)
		content = v
	case "raw_block":
		var v NodeContentRawBlock
		err = json.Unmarshal(data, &v)
		content = v
	case "metadata_block":
		var v NodeContentMetadataBlock
		err = json.Unmarshal(data, &v)
		content = v
	default:
		return nil, fmt.Errorf("unknown NodeContent type: %q", tag.NodeType)
	}
	// A failed variant decode yields no value, only the error.
	if err != nil {
		return nil, err
	}
	return content, nil
}
// AnnotationKind describes the kind of an inline text annotation.
//
// Variants: Bold, Italic, Underline, Strikethrough, Code, Subscript,
// Superscript, Link, Highlight, Color, FontSize, Custom.
//
// Sealed interface — use one of the AnnotationKind* structs declared in this
// file: AnnotationKindBold, AnnotationKindItalic, AnnotationKindUnderline,
// AnnotationKindStrikethrough, AnnotationKindCode, AnnotationKindSubscript,
// AnnotationKindSuperscript, AnnotationKindLink, AnnotationKindHighlight,
// AnnotationKindColor, AnnotationKindFontSize or AnnotationKindCustom.
type AnnotationKind interface {
	// isAnnotationKind is the unexported marker method implemented by every
	// variant.
	isAnnotationKind()
	// Type returns the variant's discriminator string.
	Type() string
}
// AnnotationKindBold is the Bold variant of AnnotationKind. It carries no
// fields.
type AnnotationKindBold struct{}

// isAnnotationKind marks AnnotationKindBold as an AnnotationKind variant.
func (AnnotationKindBold) isAnnotationKind() {}

// Type returns the discriminator value "bold".
func (AnnotationKindBold) Type() string { return "bold" }

// MarshalJSON encodes the variant as a JSON object containing only the
// "annotation_type" discriminator.
func (k AnnotationKindBold) MarshalJSON() ([]byte, error) {
	return json.Marshal(struct {
		AnnotationType string `json:"annotation_type"`
	}{AnnotationType: k.Type()})
}
// AnnotationKindItalic is the Italic variant of AnnotationKind. It carries no
// fields.
type AnnotationKindItalic struct{}

// isAnnotationKind marks AnnotationKindItalic as an AnnotationKind variant.
func (AnnotationKindItalic) isAnnotationKind() {}

// Type returns the discriminator value "italic".
func (AnnotationKindItalic) Type() string { return "italic" }

// MarshalJSON encodes the variant as a JSON object containing only the
// "annotation_type" discriminator.
func (k AnnotationKindItalic) MarshalJSON() ([]byte, error) {
	return json.Marshal(struct {
		AnnotationType string `json:"annotation_type"`
	}{AnnotationType: k.Type()})
}
// AnnotationKindUnderline is the Underline variant of AnnotationKind. It
// carries no fields.
type AnnotationKindUnderline struct{}

// isAnnotationKind marks AnnotationKindUnderline as an AnnotationKind variant.
func (AnnotationKindUnderline) isAnnotationKind() {}

// Type returns the discriminator value "underline".
func (AnnotationKindUnderline) Type() string { return "underline" }

// MarshalJSON encodes the variant as a JSON object containing only the
// "annotation_type" discriminator.
func (k AnnotationKindUnderline) MarshalJSON() ([]byte, error) {
	return json.Marshal(struct {
		AnnotationType string `json:"annotation_type"`
	}{AnnotationType: k.Type()})
}
// AnnotationKindStrikethrough is the Strikethrough variant of AnnotationKind.
// It carries no fields.
type AnnotationKindStrikethrough struct{}

// isAnnotationKind marks AnnotationKindStrikethrough as an AnnotationKind
// variant.
func (AnnotationKindStrikethrough) isAnnotationKind() {}

// Type returns the discriminator value "strikethrough".
func (AnnotationKindStrikethrough) Type() string { return "strikethrough" }

// MarshalJSON encodes the variant as a JSON object containing only the
// "annotation_type" discriminator.
func (k AnnotationKindStrikethrough) MarshalJSON() ([]byte, error) {
	return json.Marshal(struct {
		AnnotationType string `json:"annotation_type"`
	}{AnnotationType: k.Type()})
}
// AnnotationKindCode is the Code variant of AnnotationKind. It carries no
// fields.
type AnnotationKindCode struct{}

// isAnnotationKind marks AnnotationKindCode as an AnnotationKind variant.
func (AnnotationKindCode) isAnnotationKind() {}

// Type returns the discriminator value "code".
func (AnnotationKindCode) Type() string { return "code" }

// MarshalJSON encodes the variant as a JSON object containing only the
// "annotation_type" discriminator.
func (k AnnotationKindCode) MarshalJSON() ([]byte, error) {
	return json.Marshal(struct {
		AnnotationType string `json:"annotation_type"`
	}{AnnotationType: k.Type()})
}
// AnnotationKindSubscript is the Subscript variant of AnnotationKind. It
// carries no fields.
type AnnotationKindSubscript struct{}

// isAnnotationKind marks AnnotationKindSubscript as an AnnotationKind variant.
func (AnnotationKindSubscript) isAnnotationKind() {}

// Type returns the discriminator value "subscript".
func (AnnotationKindSubscript) Type() string { return "subscript" }

// MarshalJSON encodes the variant as a JSON object containing only the
// "annotation_type" discriminator.
func (k AnnotationKindSubscript) MarshalJSON() ([]byte, error) {
	return json.Marshal(struct {
		AnnotationType string `json:"annotation_type"`
	}{AnnotationType: k.Type()})
}
// AnnotationKindSuperscript is the Superscript variant of AnnotationKind. It
// carries no fields.
type AnnotationKindSuperscript struct{}

// isAnnotationKind marks AnnotationKindSuperscript as an AnnotationKind
// variant.
func (AnnotationKindSuperscript) isAnnotationKind() {}

// Type returns the discriminator value "superscript".
func (AnnotationKindSuperscript) Type() string { return "superscript" }

// MarshalJSON encodes the variant as a JSON object containing only the
// "annotation_type" discriminator.
func (k AnnotationKindSuperscript) MarshalJSON() ([]byte, error) {
	return json.Marshal(struct {
		AnnotationType string `json:"annotation_type"`
	}{AnnotationType: k.Type()})
}
// AnnotationKindLink is the Link variant of AnnotationKind.
type AnnotationKindLink struct {
	// URL is encoded under the "url" JSON key.
	URL string `json:"url"`
	// Title is optional; a nil pointer omits the "title" key.
	Title *string `json:"title,omitempty"`
}

// isAnnotationKind marks AnnotationKindLink as an AnnotationKind variant.
func (AnnotationKindLink) isAnnotationKind() {}

// Type returns the discriminator value "link".
func (AnnotationKindLink) Type() string { return "link" }

// MarshalJSON encodes the variant as a JSON object that begins with the
// "annotation_type" discriminator, followed by the struct's own fields.
func (l AnnotationKindLink) MarshalJSON() ([]byte, error) {
	// A method-free copy of the type keeps json.Marshal from calling
	// MarshalJSON again.
	type fields AnnotationKindLink
	return json.Marshal(struct {
		AnnotationType string `json:"annotation_type"`
		fields
	}{AnnotationType: l.Type(), fields: fields(l)})
}
// AnnotationKindHighlight is the AnnotationKind variant for highlighted text
// (PDF highlights, HTML `<mark>`). It carries no fields.
type AnnotationKindHighlight struct{}

// isAnnotationKind marks AnnotationKindHighlight as an AnnotationKind variant.
func (AnnotationKindHighlight) isAnnotationKind() {}

// Type returns the discriminator value "highlight".
func (AnnotationKindHighlight) Type() string { return "highlight" }

// MarshalJSON encodes the variant as a JSON object containing only the
// "annotation_type" discriminator.
func (k AnnotationKindHighlight) MarshalJSON() ([]byte, error) {
	return json.Marshal(struct {
		AnnotationType string `json:"annotation_type"`
	}{AnnotationType: k.Type()})
}
// AnnotationKindColor is the AnnotationKind variant for text color.
type AnnotationKindColor struct {
	// Value is a CSS-compatible color value, e.g. "#ff0000" or "red".
	Value string `json:"value"`
}

// isAnnotationKind marks AnnotationKindColor as an AnnotationKind variant.
func (AnnotationKindColor) isAnnotationKind() {}

// Type returns the discriminator value "color".
func (AnnotationKindColor) Type() string { return "color" }

// MarshalJSON encodes the variant as a JSON object that begins with the
// "annotation_type" discriminator, followed by "value".
func (c AnnotationKindColor) MarshalJSON() ([]byte, error) {
	// A method-free copy of the type keeps json.Marshal from calling
	// MarshalJSON again.
	type fields AnnotationKindColor
	return json.Marshal(struct {
		AnnotationType string `json:"annotation_type"`
		fields
	}{AnnotationType: c.Type(), fields: fields(c)})
}
// AnnotationKindFontSize is the AnnotationKind variant for a font size.
type AnnotationKindFontSize struct {
	// Value is the size including its unit, e.g. "12pt", "1.2em" or "16px".
	Value string `json:"value"`
}

// isAnnotationKind marks AnnotationKindFontSize as an AnnotationKind variant.
func (AnnotationKindFontSize) isAnnotationKind() {}

// Type returns the discriminator value "font_size".
func (AnnotationKindFontSize) Type() string { return "font_size" }

// MarshalJSON encodes the variant as a JSON object that begins with the
// "annotation_type" discriminator, followed by "value".
func (s AnnotationKindFontSize) MarshalJSON() ([]byte, error) {
	// A method-free copy of the type keeps json.Marshal from calling
	// MarshalJSON again.
	type fields AnnotationKindFontSize
	return json.Marshal(struct {
		AnnotationType string `json:"annotation_type"`
		fields
	}{AnnotationType: s.Type(), fields: fields(s)})
}
// AnnotationKindCustom is the extensible AnnotationKind variant for
// format-specific styling.
type AnnotationKindCustom struct {
	// Name is encoded under the "name" JSON key.
	Name string `json:"name"`
	// Value is optional; a nil pointer omits the "value" key.
	Value *string `json:"value,omitempty"`
}

// isAnnotationKind marks AnnotationKindCustom as an AnnotationKind variant.
func (AnnotationKindCustom) isAnnotationKind() {}

// Type returns the discriminator value "custom".
func (AnnotationKindCustom) Type() string { return "custom" }

// MarshalJSON encodes the variant as a JSON object that begins with the
// "annotation_type" discriminator, followed by the struct's own fields.
func (c AnnotationKindCustom) MarshalJSON() ([]byte, error) {
	// A method-free copy of the type keeps json.Marshal from calling
	// MarshalJSON again.
	type fields AnnotationKindCustom
	return json.Marshal(struct {
		AnnotationType string `json:"annotation_type"`
		fields
	}{AnnotationType: c.Type(), fields: fields(c)})
}
// UnmarshalAnnotationKind decodes data into the concrete AnnotationKind
// variant selected by its "annotation_type" field.
//
// The input is decoded twice: once to read the discriminator and once into
// the matching variant struct. It returns an error when either decode fails
// or when the discriminator does not name a known variant.
func UnmarshalAnnotationKind(data []byte) (AnnotationKind, error) {
	var tag struct {
		AnnotationType string `json:"annotation_type"`
	}
	if err := json.Unmarshal(data, &tag); err != nil {
		return nil, err
	}

	var (
		kind AnnotationKind
		err  error
	)
	switch tag.AnnotationType {
	case "bold":
		var v AnnotationKindBold
		err = json.Unmarshal(data, &v)
		kind = v
	case "italic":
		var v AnnotationKindItalic
		err = json.Unmarshal(data, &v)
		kind = v
	case "underline":
		var v AnnotationKindUnderline
		err = json.Unmarshal(data, &v)
		kind = v
	case "strikethrough":
		var v AnnotationKindStrikethrough
		err = json.Unmarshal(data, &v)
		kind = v
	case "code":
		var v AnnotationKindCode
		err = json.Unmarshal(data, &v)
		kind = v
	case "subscript":
		var v AnnotationKindSubscript
		err = json.Unmarshal(data, &v)
		kind = v
	case "superscript":
		var v AnnotationKindSuperscript
		err = json.Unmarshal(data, &v)
		kind = v
	case "link":
		var v AnnotationKindLink
		err = json.Unmarshal(data, &v)
		kind = v
	case "highlight":
		var v AnnotationKindHighlight
		err = json.Unmarshal(data, &v)
		kind = v
	case "color":
		var v AnnotationKindColor
		err = json.Unmarshal(data, &v)
		kind = v
	case "font_size":
		var v AnnotationKindFontSize
		err = json.Unmarshal(data, &v)
		kind = v
	case "custom":
		var v AnnotationKindCustom
		err = json.Unmarshal(data, &v)
		kind = v
	default:
		return nil, fmt.Errorf("unknown AnnotationKind type: %q", tag.AnnotationType)
	}
	// A failed variant decode yields no value, only the error.
	if err != nil {
		return nil, err
	}
	return kind, nil
}
// ExtractionMethod is a string-backed enumeration with the variants Native,
// Ocr and Mixed.
type ExtractionMethod string

const (
	// ExtractionMethodNative is the Native variant, encoded as "native".
	ExtractionMethodNative ExtractionMethod = "native"
	// ExtractionMethodOcr is the Ocr variant, encoded as "ocr".
	ExtractionMethodOcr ExtractionMethod = "ocr"
	// ExtractionMethodMixed is the Mixed variant, encoded as "mixed".
	ExtractionMethodMixed ExtractionMethod = "mixed"
)
// ChunkType is a string-backed enumeration of chunk classifications.
type ChunkType string

const (
	// ChunkTypeHeading is a section heading or document title.
	ChunkTypeHeading ChunkType = "heading"
	// ChunkTypePartyList is a party list: names, addresses, and signatories.
	ChunkTypePartyList ChunkType = "party_list"
	// ChunkTypeDefinitions is a definition clause ("X means…", "X shall mean…").
	ChunkTypeDefinitions ChunkType = "definitions"
	// ChunkTypeOperativeClause is an operative clause containing
	// legal/contractual action verbs.
	ChunkTypeOperativeClause ChunkType = "operative_clause"
	// ChunkTypeSignatureBlock is a signature block with signatures, names,
	// and dates.
	ChunkTypeSignatureBlock ChunkType = "signature_block"
	// ChunkTypeSchedule is a schedule, annex, appendix, or exhibit section.
	ChunkTypeSchedule ChunkType = "schedule"
	// ChunkTypeTableLike is table-like content with aligned columns or
	// repeated patterns.
	ChunkTypeTableLike ChunkType = "table_like"
	// ChunkTypeFormula is a mathematical formula or equation.
	ChunkTypeFormula ChunkType = "formula"
	// ChunkTypeCodeBlock is a code block or preformatted content.
	ChunkTypeCodeBlock ChunkType = "code_block"
	// ChunkTypeImage is embedded or referenced image content.
	ChunkTypeImage ChunkType = "image"
	// ChunkTypeOrgChart is an organizational chart or hierarchy diagram.
	ChunkTypeOrgChart ChunkType = "org_chart"
	// ChunkTypeDiagram is a diagram, figure, or visual illustration.
	ChunkTypeDiagram ChunkType = "diagram"
	// ChunkTypeUnknown is unclassified or mixed content.
	ChunkTypeUnknown ChunkType = "unknown"
)
// ImageKind is a string-backed enumeration of image classifications.
type ImageKind string

const (
	// ImageKindPhotograph is a photographic image (natural scene, photograph).
	ImageKindPhotograph ImageKind = "photograph"
	// ImageKindDiagram is a technical or schematic diagram.
	ImageKindDiagram ImageKind = "diagram"
	// ImageKindChart is a chart, graph, or plot.
	ImageKindChart ImageKind = "chart"
	// ImageKindDrawing is a freehand or technical drawing.
	ImageKindDrawing ImageKind = "drawing"
	// ImageKindTextBlock is a text-heavy image (scanned text, document).
	ImageKindTextBlock ImageKind = "text_block"
	// ImageKindDecoration is a decorative element or border.
	ImageKindDecoration ImageKind = "decoration"
	// ImageKindLogo is a logo or brand mark.
	ImageKindLogo ImageKind = "logo"
	// ImageKindIcon is a small icon.
	ImageKindIcon ImageKind = "icon"
	// ImageKindTileFragment is a fragment of a larger tiled image (tile of a
	// technical drawing).
	ImageKindTileFragment ImageKind = "tile_fragment"
	// ImageKindMask is a mask or transparency map.
	ImageKindMask ImageKind = "mask"
	// ImageKindPageRaster is a full-page render produced during OCR
	// preprocessing; used as a citation thumbnail.
	ImageKindPageRaster ImageKind = "page_raster"
	// ImageKindUnknown means the image could not be classified with
	// reasonable confidence.
	ImageKindUnknown ImageKind = "unknown"
)
// ResultFormat is a string-backed enumeration of result layouts.
type ResultFormat string

const (
	// ResultFormatUnified is the unified format with all content in the
	// `content` field.
	ResultFormatUnified ResultFormat = "unified"
	// ResultFormatElementBased is the element-based format with semantic
	// element extraction.
	ResultFormatElementBased ResultFormat = "element_based"
)
// ElementType is a string-backed enumeration of semantic element types.
type ElementType string

const (
	// ElementTypeTitle is the document title.
	ElementTypeTitle ElementType = "title"
	// ElementTypeNarrativeText is the main narrative text body.
	ElementTypeNarrativeText ElementType = "narrative_text"
	// ElementTypeHeading is a section heading.
	ElementTypeHeading ElementType = "heading"
	// ElementTypeListItem is a list item (bullet, numbered, etc.).
	ElementTypeListItem ElementType = "list_item"
	// ElementTypeTable is a table element.
	ElementTypeTable ElementType = "table"
	// ElementTypeImage is an image element.
	ElementTypeImage ElementType = "image"
	// ElementTypePageBreak is a page break marker.
	ElementTypePageBreak ElementType = "page_break"
	// ElementTypeCodeBlock is a code block.
	ElementTypeCodeBlock ElementType = "code_block"
	// ElementTypeBlockQuote is a block quote.
	ElementTypeBlockQuote ElementType = "block_quote"
	// ElementTypeFooter is footer text.
	ElementTypeFooter ElementType = "footer"
	// ElementTypeHeader is header text.
	ElementTypeHeader ElementType = "header"
)
// FormatMetadata holds format-specific metadata as a discriminated union.
//
// Only one format type can exist per extraction result. This provides
// type-safe, clean metadata without nested optionals.
//
// FormatType is the discriminator; the pointer field matching it carries the
// payload and the remaining pointers are expected to be nil.
//
// Variants: Pdf, Docx, Excel, Email, Pptx, Archive, Image, Xml, Text, Html,
// Ocr, Csv, Bibtex, Citation, FictionBook, Dbf, Jats, Epub, Pst, Code
//
// NOTE(review): the variant list names Code, but this struct has no matching
// payload field — confirm whether that variant is meant to carry data.
type FormatMetadata struct {
	FormatType  string               `json:"format_type"`
	Pdf         *PdfMetadata         `json:"pdf,omitempty"`
	Docx        *DocxMetadata        `json:"docx,omitempty"`
	Excel       *ExcelMetadata       `json:"excel,omitempty"`
	Email       *EmailMetadata       `json:"email,omitempty"`
	Pptx        *PptxMetadata        `json:"pptx,omitempty"`
	Archive     *ArchiveMetadata     `json:"archive,omitempty"`
	Image       *ImageMetadata       `json:"image,omitempty"`
	XML         *XMLMetadata         `json:"xml,omitempty"`
	Text        *TextMetadata        `json:"text,omitempty"`
	HTML        *HTMLMetadata        `json:"html,omitempty"`
	Ocr         *OcrMetadata         `json:"ocr,omitempty"`
	Csv         *CsvMetadata         `json:"csv,omitempty"`
	Bibtex      *BibtexMetadata      `json:"bibtex,omitempty"`
	Citation    *CitationMetadata    `json:"citation,omitempty"`
	FictionBook *FictionBookMetadata `json:"fiction_book,omitempty"`
	Dbf         *DbfMetadata         `json:"dbf,omitempty"`
	Jats        *JatsMetadata        `json:"jats,omitempty"`
	Epub        *EpubMetadata        `json:"epub,omitempty"`
	Pst         *PstMetadata         `json:"pst,omitempty"`
}
// MarshalJSON encodes the tagged union as a single flat JSON object.
//
// When FormatType names a known variant and the matching payload pointer is
// non-nil, the payload is encoded, re-read as a key/value map, and the
// "format_type" key is set to the tag (replacing any key of that name in the
// payload). In every other case only {"format_type": <tag>} is emitted.
func (t FormatMetadata) MarshalJSON() ([]byte, error) {
	// payload stays an untyped nil unless the selected variant is populated;
	// assigning a nil pointer directly would make the interface non-nil.
	var payload interface{}
	switch t.FormatType {
	case "pdf":
		if t.Pdf != nil {
			payload = t.Pdf
		}
	case "docx":
		if t.Docx != nil {
			payload = t.Docx
		}
	case "excel":
		if t.Excel != nil {
			payload = t.Excel
		}
	case "email":
		if t.Email != nil {
			payload = t.Email
		}
	case "pptx":
		if t.Pptx != nil {
			payload = t.Pptx
		}
	case "archive":
		if t.Archive != nil {
			payload = t.Archive
		}
	case "image":
		if t.Image != nil {
			payload = t.Image
		}
	case "xml":
		if t.XML != nil {
			payload = t.XML
		}
	case "text":
		if t.Text != nil {
			payload = t.Text
		}
	case "html":
		if t.HTML != nil {
			payload = t.HTML
		}
	case "ocr":
		if t.Ocr != nil {
			payload = t.Ocr
		}
	case "csv":
		if t.Csv != nil {
			payload = t.Csv
		}
	case "bibtex":
		if t.Bibtex != nil {
			payload = t.Bibtex
		}
	case "citation":
		if t.Citation != nil {
			payload = t.Citation
		}
	case "fiction_book":
		if t.FictionBook != nil {
			payload = t.FictionBook
		}
	case "dbf":
		if t.Dbf != nil {
			payload = t.Dbf
		}
	case "jats":
		if t.Jats != nil {
			payload = t.Jats
		}
	case "epub":
		if t.Epub != nil {
			payload = t.Epub
		}
	case "pst":
		if t.Pst != nil {
			payload = t.Pst
		}
	}

	// Fallback: return just the tag
	if payload == nil {
		return json.Marshal(map[string]string{"format_type": t.FormatType})
	}

	encoded, err := json.Marshal(payload)
	if err != nil {
		return nil, err
	}
	var object map[string]json.RawMessage
	if err := json.Unmarshal(encoded, &object); err != nil {
		return nil, err
	}
	// At this point FormatType is one of the fixed tags matched above, so it
	// can be quoted directly without escaping.
	object["format_type"] = json.RawMessage(`"` + t.FormatType + `"`)
	return json.Marshal(object)
}
// UnmarshalJSON decodes a tagged union by reading the "format_type" tag first
// and then decoding the same object into the payload struct for that tag.
//
// A JSON null leaves the receiver untouched, following the encoding/json
// convention for Unmarshaler implementations. For any other input the
// receiver is reset before decoding, so a reused FormatMetadata never keeps a
// payload pointer left over from an earlier decode. An unrecognized tag is
// accepted: only FormatType is set and no error is returned.
func (t *FormatMetadata) UnmarshalJSON(data []byte) error {
	if string(data) == "null" {
		return nil
	}
	// Probe for the tag first
	var probe struct {
		FormatType string `json:"format_type"`
	}
	if err := json.Unmarshal(data, &probe); err != nil {
		return err
	}
	// Drop every previously held payload so that at most one variant is set.
	*t = FormatMetadata{FormatType: probe.FormatType}
	switch probe.FormatType {
	case "pdf":
		t.Pdf = &PdfMetadata{}
		return json.Unmarshal(data, t.Pdf)
	case "docx":
		t.Docx = &DocxMetadata{}
		return json.Unmarshal(data, t.Docx)
	case "excel":
		t.Excel = &ExcelMetadata{}
		return json.Unmarshal(data, t.Excel)
	case "email":
		t.Email = &EmailMetadata{}
		return json.Unmarshal(data, t.Email)
	case "pptx":
		t.Pptx = &PptxMetadata{}
		return json.Unmarshal(data, t.Pptx)
	case "archive":
		t.Archive = &ArchiveMetadata{}
		return json.Unmarshal(data, t.Archive)
	case "image":
		t.Image = &ImageMetadata{}
		return json.Unmarshal(data, t.Image)
	case "xml":
		t.XML = &XMLMetadata{}
		return json.Unmarshal(data, t.XML)
	case "text":
		t.Text = &TextMetadata{}
		return json.Unmarshal(data, t.Text)
	case "html":
		t.HTML = &HTMLMetadata{}
		return json.Unmarshal(data, t.HTML)
	case "ocr":
		t.Ocr = &OcrMetadata{}
		return json.Unmarshal(data, t.Ocr)
	case "csv":
		t.Csv = &CsvMetadata{}
		return json.Unmarshal(data, t.Csv)
	case "bibtex":
		t.Bibtex = &BibtexMetadata{}
		return json.Unmarshal(data, t.Bibtex)
	case "citation":
		t.Citation = &CitationMetadata{}
		return json.Unmarshal(data, t.Citation)
	case "fiction_book":
		t.FictionBook = &FictionBookMetadata{}
		return json.Unmarshal(data, t.FictionBook)
	case "dbf":
		t.Dbf = &DbfMetadata{}
		return json.Unmarshal(data, t.Dbf)
	case "jats":
		t.Jats = &JatsMetadata{}
		return json.Unmarshal(data, t.Jats)
	case "epub":
		t.Epub = &EpubMetadata{}
		return json.Unmarshal(data, t.Epub)
	case "pst":
		t.Pst = &PstMetadata{}
		return json.Unmarshal(data, t.Pst)
	}
	return nil
}
// TextDirection is a string-backed enumeration of text directions.
type TextDirection string

const (
	// TextDirectionLeftToRight is left-to-right text direction.
	TextDirectionLeftToRight TextDirection = "ltr"
	// TextDirectionRightToLeft is right-to-left text direction.
	TextDirectionRightToLeft TextDirection = "rtl"
	// TextDirectionAuto is automatic text direction detection.
	TextDirectionAuto TextDirection = "auto"
)
// LinkType is a string-backed enumeration of link categories.
type LinkType string

const (
	// LinkTypeAnchor is an anchor link (#section).
	LinkTypeAnchor LinkType = "anchor"
	// LinkTypeInternal is an internal link (same domain).
	LinkTypeInternal LinkType = "internal"
	// LinkTypeExternal is an external link (different domain).
	LinkTypeExternal LinkType = "external"
	// LinkTypeEmail is an email link (mailto:).
	LinkTypeEmail LinkType = "email"
	// LinkTypePhone is a phone link (tel:).
	LinkTypePhone LinkType = "phone"
	// LinkTypeOther is any other link type.
	LinkTypeOther LinkType = "other"
)
// ImageType is a string-backed enumeration of image reference kinds.
type ImageType string

const (
	// ImageTypeDataURI is a data URI image.
	ImageTypeDataURI ImageType = "data-uri"
	// ImageTypeInlineSvg is an inline SVG.
	ImageTypeInlineSvg ImageType = "inline-svg"
	// ImageTypeExternal is an external image URL.
	ImageTypeExternal ImageType = "external"
	// ImageTypeRelative is a relative path image.
	ImageTypeRelative ImageType = "relative"
)
// StructuredDataType is a string-backed enumeration of structured data
// formats.
type StructuredDataType string

const (
	// StructuredDataTypeJSONLd is JSON-LD structured data.
	StructuredDataTypeJSONLd StructuredDataType = "json-ld"
	// StructuredDataTypeMicrodata is microdata.
	StructuredDataTypeMicrodata StructuredDataType = "microdata"
	// StructuredDataTypeRdFa is RDFa.
	StructuredDataTypeRdFa StructuredDataType = "rdfa"
)
// OcrBoundingGeometry is the bounding geometry of an OCR element.
//
// It supports both axis-aligned rectangles (from Tesseract) and 4-point
// quadrilaterals (from PaddleOCR and rotated text detection).
//
// Variants: Rectangle, Quadrilateral
//
// Sealed interface — use one of OcrBoundingGeometryRectangle,
// OcrBoundingGeometryQuadrilateral.
type OcrBoundingGeometry interface {
	// isOcrBoundingGeometry is the unexported marker method implemented by
	// every variant.
	isOcrBoundingGeometry()
	// Type returns the variant's discriminator string.
	Type() string
}
// OcrBoundingGeometryRectangle is the OcrBoundingGeometry variant for an
// axis-aligned bounding box (typical for Tesseract output).
type OcrBoundingGeometryRectangle struct {
	// Left is the left x-coordinate in pixels.
	Left uint32 `json:"left"`
	// Top is the top y-coordinate in pixels.
	Top uint32 `json:"top"`
	// Width is the width in pixels.
	Width uint32 `json:"width"`
	// Height is the height in pixels.
	Height uint32 `json:"height"`
}

// isOcrBoundingGeometry marks OcrBoundingGeometryRectangle as an
// OcrBoundingGeometry variant.
func (OcrBoundingGeometryRectangle) isOcrBoundingGeometry() {}

// Type returns the discriminator value "rectangle".
func (OcrBoundingGeometryRectangle) Type() string { return "rectangle" }

// MarshalJSON encodes the variant as a JSON object that begins with the
// "type" discriminator, followed by the struct's own fields.
func (r OcrBoundingGeometryRectangle) MarshalJSON() ([]byte, error) {
	// A method-free copy of the type keeps json.Marshal from calling
	// MarshalJSON again.
	type fields OcrBoundingGeometryRectangle
	return json.Marshal(struct {
		Type string `json:"type"`
		fields
	}{Type: r.Type(), fields: fields(r)})
}
// OcrBoundingGeometryQuadrilateral is the OcrBoundingGeometry variant for a
// 4-point quadrilateral around rotated/skewed text (PaddleOCR).
//
// Points are in clockwise order starting from top-left:
// `[top_left, top_right, bottom_right, bottom_left]`
type OcrBoundingGeometryQuadrilateral struct {
	// Points holds the four corner points as `[(x, y), ...]` in clockwise
	// order.
	// NOTE(review): the description is a list of coordinate pairs but the Go
	// type is string — confirm the wire format of "points".
	Points string `json:"points"`
}

// isOcrBoundingGeometry marks OcrBoundingGeometryQuadrilateral as an
// OcrBoundingGeometry variant.
func (OcrBoundingGeometryQuadrilateral) isOcrBoundingGeometry() {}

// Type returns the discriminator value "quadrilateral".
func (OcrBoundingGeometryQuadrilateral) Type() string { return "quadrilateral" }

// MarshalJSON encodes the variant as a JSON object that begins with the
// "type" discriminator, followed by "points".
func (q OcrBoundingGeometryQuadrilateral) MarshalJSON() ([]byte, error) {
	// A method-free copy of the type keeps json.Marshal from calling
	// MarshalJSON again.
	type fields OcrBoundingGeometryQuadrilateral
	return json.Marshal(struct {
		Type string `json:"type"`
		fields
	}{Type: q.Type(), fields: fields(q)})
}
// UnmarshalOcrBoundingGeometry decodes JSON data into the concrete
// OcrBoundingGeometry variant named by the object's "type" field.
//
// The data is decoded twice: once to read the discriminator and once into the
// selected variant. Any JSON decoding error is returned as-is; a "type" value
// other than "rectangle" or "quadrilateral" yields an "unknown
// OcrBoundingGeometry type" error. On error the returned geometry is nil.
func UnmarshalOcrBoundingGeometry(data []byte) (OcrBoundingGeometry, error) {
	var envelope struct {
		Type string `json:"type"`
	}
	err := json.Unmarshal(data, &envelope)
	if err != nil {
		return nil, err
	}

	switch kind := envelope.Type; kind {
	case "rectangle":
		var rect OcrBoundingGeometryRectangle
		if err = json.Unmarshal(data, &rect); err != nil {
			return nil, err
		}
		return rect, nil
	case "quadrilateral":
		var quad OcrBoundingGeometryQuadrilateral
		if err = json.Unmarshal(data, &quad); err != nil {
			return nil, err
		}
		return quad, nil
	default:
		return nil, fmt.Errorf("unknown OcrBoundingGeometry type: %q", kind)
	}
}
// OcrElementLevel enumerates the levels of an OCR element: word, line, block,
// or page. Values are the lowercase strings used in JSON.
type OcrElementLevel string
const (
// OcrElementLevelWord is an individual word.
OcrElementLevelWord OcrElementLevel = "word"
// OcrElementLevelLine is a line of text (default for PaddleOCR).
OcrElementLevelLine OcrElementLevel = "line"
// OcrElementLevelBlock is a paragraph or text block.
OcrElementLevelBlock OcrElementLevel = "block"
// OcrElementLevelPage is a page-level element.
OcrElementLevelPage OcrElementLevel = "page"
)
// PageUnitType enumerates the kinds of page-like unit a document can be
// divided into: page, slide, or sheet.
type PageUnitType string
const (
// PageUnitTypePage is a standard document page (PDF, DOCX, images).
PageUnitTypePage PageUnitType = "page"
// PageUnitTypeSlide is a presentation slide (PPTX, ODP).
PageUnitTypeSlide PageUnitType = "slide"
// PageUnitTypeSheet is a spreadsheet sheet (XLSX, ODS).
PageUnitTypeSheet PageUnitType = "sheet"
)
// DiffLine is a single line in a unified-diff hunk.
//
// Upstream (Rust) note: the type is defined outside `crate::diff` so that
// `RevisionDelta` can reference it unconditionally, without requiring the
// `diff` Cargo feature. `crate::diff` re-exports this type verbatim.
//
// NOTE(review): this binding represents the value as a plain Go string; the
// upstream type's wire format is not visible here — confirm before relying on it.
type DiffLine string
// RevisionKind enumerates the kinds of revision: insertion, deletion, format
// change, or comment.
type RevisionKind string
const (
// RevisionKindInsertion means text or content was inserted.
RevisionKindInsertion RevisionKind = "insertion"
// RevisionKindDeletion means text or content was deleted.
RevisionKindDeletion RevisionKind = "deletion"
// RevisionKindFormatChange means run-level formatting (font, size, colour, …) was changed.
RevisionKindFormatChange RevisionKind = "format_change"
// RevisionKindComment is a reviewer comment or annotation.
RevisionKindComment RevisionKind = "comment"
)
// RevisionAnchor is a best-effort document location for a revision.
//
// Variants: Paragraph, TableCell, Page, Slide, Sheet.
//
// This is a sealed interface: it contains an unexported method, so only types
// declared in this package can implement it. Use one of
// RevisionAnchorParagraph, RevisionAnchorTableCell, RevisionAnchorPage,
// RevisionAnchorSlide, or RevisionAnchorSheet.
type RevisionAnchor interface {
// isRevisionAnchor is the unexported marker method that seals the interface.
isRevisionAnchor()
// Type returns the variant's discriminator string; the variants' MarshalJSON
// methods write it to the JSON "type" field.
Type() string
}
// RevisionAnchorParagraph is a body paragraph, identified by its zero-based
// index in the document flow. It is the "paragraph" variant of RevisionAnchor.
type RevisionAnchorParagraph struct {
	// Index is the zero-based index of the paragraph in document order.
	Index uint `json:"index"`
}

// isRevisionAnchor marks the type as a RevisionAnchor variant.
func (RevisionAnchorParagraph) isRevisionAnchor() {}

// Type returns the discriminator string "paragraph".
func (RevisionAnchorParagraph) Type() string {
	return "paragraph"
}

// MarshalJSON encodes the anchor as a JSON object that starts with the "type"
// discriminator, followed by the index field.
func (v RevisionAnchorParagraph) MarshalJSON() ([]byte, error) {
	tagged := struct {
		Type  string `json:"type"`
		Index uint   `json:"index"`
	}{
		Type:  v.Type(),
		Index: v.Index,
	}
	return json.Marshal(tagged)
}
// RevisionAnchorTableCell is a cell inside a table. It is the "table_cell"
// variant of RevisionAnchor.
type RevisionAnchorTableCell struct {
	// Row is the zero-based row index within the table.
	Row uint `json:"row"`
	// Col is the zero-based column index within the table.
	Col uint `json:"col"`
	// TableIndex is the zero-based index of the table in document order.
	TableIndex uint `json:"table_index"`
}

// isRevisionAnchor marks the type as a RevisionAnchor variant.
func (RevisionAnchorTableCell) isRevisionAnchor() {}

// Type returns the discriminator string "table_cell".
func (RevisionAnchorTableCell) Type() string {
	return "table_cell"
}

// MarshalJSON encodes the anchor as a JSON object that starts with the "type"
// discriminator, followed by row, col and table_index.
func (v RevisionAnchorTableCell) MarshalJSON() ([]byte, error) {
	tagged := struct {
		Type       string `json:"type"`
		Row        uint   `json:"row"`
		Col        uint   `json:"col"`
		TableIndex uint   `json:"table_index"`
	}{
		Type:       v.Type(),
		Row:        v.Row,
		Col:        v.Col,
		TableIndex: v.TableIndex,
	}
	return json.Marshal(tagged)
}
// RevisionAnchorPage is a page, identified by its zero-based index. It is the
// "page" variant of RevisionAnchor.
type RevisionAnchorPage struct {
	// Index is the zero-based page index.
	Index uint `json:"index"`
}

// isRevisionAnchor marks the type as a RevisionAnchor variant.
func (RevisionAnchorPage) isRevisionAnchor() {}

// Type returns the discriminator string "page".
func (RevisionAnchorPage) Type() string {
	return "page"
}

// MarshalJSON encodes the anchor as a JSON object that starts with the "type"
// discriminator, followed by the index field.
func (v RevisionAnchorPage) MarshalJSON() ([]byte, error) {
	tagged := struct {
		Type  string `json:"type"`
		Index uint   `json:"index"`
	}{
		Type:  v.Type(),
		Index: v.Index,
	}
	return json.Marshal(tagged)
}
// RevisionAnchorSlide is a presentation slide, identified by its zero-based
// index. It is the "slide" variant of RevisionAnchor.
type RevisionAnchorSlide struct {
	// Index is the zero-based slide index.
	Index uint `json:"index"`
}

// isRevisionAnchor marks the type as a RevisionAnchor variant.
func (RevisionAnchorSlide) isRevisionAnchor() {}

// Type returns the discriminator string "slide".
func (RevisionAnchorSlide) Type() string {
	return "slide"
}

// MarshalJSON encodes the anchor as a JSON object that starts with the "type"
// discriminator, followed by the index field.
func (v RevisionAnchorSlide) MarshalJSON() ([]byte, error) {
	tagged := struct {
		Type  string `json:"type"`
		Index uint   `json:"index"`
	}{
		Type:  v.Type(),
		Index: v.Index,
	}
	return json.Marshal(tagged)
}
// RevisionAnchorSheet is a spreadsheet cell or range, identified by sheet
// index and optional name. It is the "sheet" variant of RevisionAnchor.
type RevisionAnchorSheet struct {
	// Index is the zero-based sheet index.
	Index uint `json:"index"`
	// Name is the sheet display name when available; nil is omitted from JSON.
	Name *string `json:"name,omitempty"`
}

// isRevisionAnchor marks the type as a RevisionAnchor variant.
func (RevisionAnchorSheet) isRevisionAnchor() {}

// Type returns the discriminator string "sheet".
func (RevisionAnchorSheet) Type() string {
	return "sheet"
}

// MarshalJSON encodes the anchor as a JSON object that starts with the "type"
// discriminator, followed by index and, when Name is non-nil, name.
func (v RevisionAnchorSheet) MarshalJSON() ([]byte, error) {
	tagged := struct {
		Type  string  `json:"type"`
		Index uint    `json:"index"`
		Name  *string `json:"name,omitempty"`
	}{
		Type:  v.Type(),
		Index: v.Index,
		Name:  v.Name,
	}
	return json.Marshal(tagged)
}
// UnmarshalRevisionAnchor decodes JSON data into the concrete RevisionAnchor
// variant named by the object's "type" field.
//
// The data is decoded twice: once to read the discriminator and once into the
// selected variant. Any JSON decoding error is returned as-is; a "type" value
// other than "paragraph", "table_cell", "page", "slide" or "sheet" yields an
// "unknown RevisionAnchor type" error. On error the returned anchor is nil.
func UnmarshalRevisionAnchor(data []byte) (RevisionAnchor, error) {
	var envelope struct {
		Type string `json:"type"`
	}
	if err := json.Unmarshal(data, &envelope); err != nil {
		return nil, err
	}

	var (
		anchor RevisionAnchor
		err    error
	)
	switch envelope.Type {
	case "paragraph":
		var paragraph RevisionAnchorParagraph
		err = json.Unmarshal(data, &paragraph)
		anchor = paragraph
	case "table_cell":
		var cell RevisionAnchorTableCell
		err = json.Unmarshal(data, &cell)
		anchor = cell
	case "page":
		var page RevisionAnchorPage
		err = json.Unmarshal(data, &page)
		anchor = page
	case "slide":
		var slide RevisionAnchorSlide
		err = json.Unmarshal(data, &slide)
		anchor = slide
	case "sheet":
		var sheet RevisionAnchorSheet
		err = json.Unmarshal(data, &sheet)
		anchor = sheet
	default:
		return nil, fmt.Errorf("unknown RevisionAnchor type: %q", envelope.Type)
	}

	// Return a literal nil interface on failure rather than a partially decoded variant.
	if err != nil {
		return nil, err
	}
	return anchor, nil
}
// URIKind enumerates the kinds of URI: hyperlink, image, anchor, citation,
// reference, or email.
type URIKind string
const (
// URIKindHyperlink is a clickable hyperlink (web URL, file link).
URIKindHyperlink URIKind = "hyperlink"
// URIKindImage is an image or media resource reference.
URIKindImage URIKind = "image"
// URIKindAnchor is an internal anchor or cross-reference target.
URIKindAnchor URIKind = "anchor"
// URIKindCitation is a citation or bibliographic reference (DOI, academic ref).
URIKindCitation URIKind = "citation"
// URIKindReference is a general reference (e.g. `\ref{}` in LaTeX, `:ref:` in RST).
URIKindReference URIKind = "reference"
// URIKindEmail is an email address (`mailto:` link or bare email).
URIKindEmail URIKind = "email"
)
// KeywordAlgorithm enumerates the available keyword extraction algorithms.
type KeywordAlgorithm string
const (
// KeywordAlgorithmYake is YAKE (Yet Another Keyword Extractor), a statistical approach.
KeywordAlgorithmYake KeywordAlgorithm = "yake"
// KeywordAlgorithmRake is RAKE (Rapid Automatic Keyword Extraction), which is co-occurrence based.
KeywordAlgorithmRake KeywordAlgorithm = "rake"
)
// PSMMode is a string enumeration; each constant below is one variant.
//
// NOTE(review): presumably these are OCR page segmentation modes — the
// meaning of each variant is not documented here; confirm against the native
// library before relying on it.
type PSMMode string
const (
// PSMModeOsdOnly is the OsdOnly variant of PSMMode.
PSMModeOsdOnly PSMMode = "osd_only"
// PSMModeAutoOsd is the AutoOsd variant of PSMMode.
PSMModeAutoOsd PSMMode = "auto_osd"
// PSMModeAutoOnly is the AutoOnly variant of PSMMode.
PSMModeAutoOnly PSMMode = "auto_only"
// PSMModeAuto is the Auto variant of PSMMode.
PSMModeAuto PSMMode = "auto"
// PSMModeSingleColumn is the SingleColumn variant of PSMMode.
PSMModeSingleColumn PSMMode = "single_column"
// PSMModeSingleBlockVertical is the SingleBlockVertical variant of PSMMode.
PSMModeSingleBlockVertical PSMMode = "single_block_vertical"
// PSMModeSingleBlock is the SingleBlock variant of PSMMode.
PSMModeSingleBlock PSMMode = "single_block"
// PSMModeSingleLine is the SingleLine variant of PSMMode.
PSMModeSingleLine PSMMode = "single_line"
// PSMModeSingleWord is the SingleWord variant of PSMMode.
PSMModeSingleWord PSMMode = "single_word"
// PSMModeCircleWord is the CircleWord variant of PSMMode.
PSMModeCircleWord PSMMode = "circle_word"
// PSMModeSingleChar is the SingleChar variant of PSMMode.
PSMModeSingleChar PSMMode = "single_char"
)
// PaddleLanguage enumerates the languages and script families listed below.
// Values are the lowercase snake_case strings used in JSON.
type PaddleLanguage string
const (
// PaddleLanguageEnglish is English.
PaddleLanguageEnglish PaddleLanguage = "english"
// PaddleLanguageChinese is simplified Chinese.
PaddleLanguageChinese PaddleLanguage = "chinese"
// PaddleLanguageJapanese is Japanese.
PaddleLanguageJapanese PaddleLanguage = "japanese"
// PaddleLanguageKorean is Korean.
PaddleLanguageKorean PaddleLanguage = "korean"
// PaddleLanguageGerman is German.
PaddleLanguageGerman PaddleLanguage = "german"
// PaddleLanguageFrench is French.
PaddleLanguageFrench PaddleLanguage = "french"
// PaddleLanguageLatin is Latin script (covers most European languages).
PaddleLanguageLatin PaddleLanguage = "latin"
// PaddleLanguageCyrillic is Cyrillic (Russian and related).
PaddleLanguageCyrillic PaddleLanguage = "cyrillic"
// PaddleLanguageTraditionalChinese is traditional Chinese.
PaddleLanguageTraditionalChinese PaddleLanguage = "traditional_chinese"
// PaddleLanguageThai is Thai.
PaddleLanguageThai PaddleLanguage = "thai"
// PaddleLanguageGreek is Greek.
PaddleLanguageGreek PaddleLanguage = "greek"
// PaddleLanguageEastSlavic is East Slavic (Russian, Ukrainian, Belarusian).
PaddleLanguageEastSlavic PaddleLanguage = "east_slavic"
// PaddleLanguageArabic is Arabic script (Arabic, Persian, Urdu).
PaddleLanguageArabic PaddleLanguage = "arabic"
// PaddleLanguageDevanagari is Devanagari (Hindi, Marathi, Sanskrit, Nepali).
PaddleLanguageDevanagari PaddleLanguage = "devanagari"
// PaddleLanguageTamil is Tamil.
PaddleLanguageTamil PaddleLanguage = "tamil"
// PaddleLanguageTelugu is Telugu.
PaddleLanguageTelugu PaddleLanguage = "telugu"
)
// LayoutClass is a string enumeration of layout classes; each constant below
// is one variant. Values are the lowercase snake_case strings used in JSON.
type LayoutClass string
const (
// LayoutClassCaption is the Caption variant of LayoutClass.
LayoutClassCaption LayoutClass = "caption"
// LayoutClassFootnote is the Footnote variant of LayoutClass.
LayoutClassFootnote LayoutClass = "footnote"
// LayoutClassFormula is the Formula variant of LayoutClass.
LayoutClassFormula LayoutClass = "formula"
// LayoutClassListItem is the ListItem variant of LayoutClass.
LayoutClassListItem LayoutClass = "list_item"
// LayoutClassPageFooter is the PageFooter variant of LayoutClass.
LayoutClassPageFooter LayoutClass = "page_footer"
// LayoutClassPageHeader is the PageHeader variant of LayoutClass.
LayoutClassPageHeader LayoutClass = "page_header"
// LayoutClassPicture is the Picture variant of LayoutClass.
LayoutClassPicture LayoutClass = "picture"
// LayoutClassSectionHeader is the SectionHeader variant of LayoutClass.
LayoutClassSectionHeader LayoutClass = "section_header"
// LayoutClassTable is the Table variant of LayoutClass.
LayoutClassTable LayoutClass = "table"
// LayoutClassText is the Text variant of LayoutClass.
LayoutClassText LayoutClass = "text"
// LayoutClassTitle is the Title variant of LayoutClass.
LayoutClassTitle LayoutClass = "title"
// LayoutClassDocumentIndex is the DocumentIndex variant of LayoutClass.
LayoutClassDocumentIndex LayoutClass = "document_index"
// LayoutClassCode is the Code variant of LayoutClass.
LayoutClassCode LayoutClass = "code"
// LayoutClassCheckboxSelected is the CheckboxSelected variant of LayoutClass.
LayoutClassCheckboxSelected LayoutClass = "checkbox_selected"
// LayoutClassCheckboxUnselected is the CheckboxUnselected variant of LayoutClass.
LayoutClassCheckboxUnselected LayoutClass = "checkbox_unselected"
// LayoutClassForm is the Form variant of LayoutClass.
LayoutClassForm LayoutClass = "form"
// LayoutClassKeyValueRegion is the KeyValueRegion variant of LayoutClass.
LayoutClassKeyValueRegion LayoutClass = "key_value_region"
)
// CacheStats holds summary statistics about a cache.
//
// NOTE(review): the field descriptions below are read off the field names and
// JSON tags (units: "Mb" and "Days" suffixes); confirm exact semantics against
// the native library.
type CacheStats struct {
// TotalFiles is presumably the number of files in the cache.
TotalFiles uint `json:"total_files"`
// TotalSizeMb is presumably the total size of the cache, in megabytes.
TotalSizeMb float64 `json:"total_size_mb"`
// AvailableSpaceMb is presumably the available space, in megabytes.
AvailableSpaceMb float64 `json:"available_space_mb"`
// OldestFileAgeDays is presumably the age of the oldest file, in days.
OldestFileAgeDays float64 `json:"oldest_file_age_days"`
// NewestFileAgeDays is presumably the age of the newest file, in days.
NewestFileAgeDays float64 `json:"newest_file_age_days"`
}
// AccelerationConfig is the hardware acceleration configuration for ONNX Runtime models.
//
// Controls which execution provider (CPU, CoreML, CUDA, TensorRT) is used
// for inference in layout detection and embedding generation.
//
// Example (Rust syntax, carried over from the upstream documentation):
//
// // Auto-select: CoreML on macOS, CUDA on Linux, CPU elsewhere
// let config = AccelerationConfig::default();
//
// // Force CPU only
// let config = AccelerationConfig {
// provider: kreuzberg::ExecutionProviderType::Cpu,
// ..Default::default()
// };
type AccelerationConfig struct {
// Provider is the execution provider to use for ONNX inference.
// An empty value is omitted from the JSON encoding.
Provider ExecutionProviderType `json:"provider,omitempty"`
// DeviceID is the GPU device ID (for CUDA/TensorRT). Ignored for CPU/CoreML/Auto.
// The field is always serialized, including when it is 0.
DeviceID uint32 `json:"device_id"`
}
// ContentFilterConfig is the cross-extractor content filtering configuration.
//
// Controls whether "furniture" content (headers, footers, page numbers,
// watermarks, repeating text) is included in or stripped from extraction
// results. Applies across all extractors (PDF, DOCX, RTF, ODT, HTML, etc.)
// with format-specific implementation.
//
// When nil (`None`) on `ExtractionConfig`, each extractor uses its current
// default behavior unchanged.
type ContentFilterConfig struct {
// IncludeHeaders includes running headers in extraction output.
//
// - PDF: Disables top-margin furniture stripping and prevents the layout
// model from treating `PageHeader`-classified regions as furniture.
// - DOCX: Includes document headers in text output.
// - RTF/ODT: Headers already included; this is a no-op when true.
// - HTML/EPUB: Keeps `<header>` element content.
//
// Default: `false` (headers are stripped or excluded).
IncludeHeaders bool `json:"include_headers"`
// IncludeFooters includes running footers in extraction output.
//
// - PDF: Disables bottom-margin furniture stripping and prevents the layout
// model from treating `PageFooter`-classified regions as furniture.
// - DOCX: Includes document footers in text output.
// - RTF/ODT: Footers already included; this is a no-op when true.
// - HTML/EPUB: Keeps `<footer>` element content.
//
// Default: `false` (footers are stripped or excluded).
IncludeFooters bool `json:"include_footers"`
// StripRepeatingText enables the heuristic cross-page repeating text detector.
//
// When `true` (default), text that repeats verbatim across a supermajority
// of pages is classified as furniture and stripped. Disable this if brand
// names or repeated headings are being incorrectly removed by the heuristic.
//
// Note: when a layout-detection model is active, the model may independently
// classify page-header / page-footer regions as furniture on a per-page basis.
// To preserve those regions, set `include_headers = true`, `include_footers = true`,
// or both, in addition to disabling this flag.
//
// Primarily affects PDF extraction.
//
// Default: `true`. A nil pointer is omitted from the JSON encoding
// (presumably letting the native default apply — confirm); to disable the
// detector, point this at an explicit false.
StripRepeatingText *bool `json:"strip_repeating_text,omitempty"`
// IncludeWatermarks includes watermark text in extraction output.
//
// - PDF: Keeps watermark artifacts and arXiv identifiers.
// - Other formats: No effect currently.
//
// Default: `false` (watermarks are stripped).
IncludeWatermarks bool `json:"include_watermarks"`
}
// EmailConfig is the configuration for email extraction.
type EmailConfig struct {
// MsgFallbackCodepage is the Windows codepage number to use when an MSG file
// contains no codepage property.
// Defaults to nil (`None`), which falls back to windows-1252. A nil pointer is
// omitted from the JSON encoding.
//
// If an unrecognized or invalid codepage number is supplied (including 0),
// the behavior silently falls back to windows-1252 — the same as when the
// MSG file itself contains an unrecognized codepage. No error or warning is
// emitted. Users should verify output when supplying unusual values.
//
// Common values:
// - 1250: Central European (Polish, Czech, Hungarian, etc.)
// - 1251: Cyrillic (Russian, Ukrainian, Bulgarian, etc.)
// - 1252: Western European (default)
// - 1253: Greek
// - 1254: Turkish
// - 1255: Hebrew
// - 1256: Arabic
// - 932: Japanese (Shift-JIS)
// - 936: Simplified Chinese (GBK)
MsgFallbackCodepage *uint32 `json:"msg_fallback_codepage,omitempty"`
}
// ExtractionConfig is the main extraction configuration.
//
// This struct contains all configuration options for the extraction process.
// It can be loaded from TOML, YAML, or JSON files, or created programmatically.
//
// Optional settings are pointer or slice fields tagged `omitempty`: a nil value
// is left out of the JSON encoding. Field comments that mention `None` or
// `Some(_)` use the upstream Rust vocabulary; in Go, read `None` as nil.
//
// Example (Rust syntax, carried over from the upstream documentation):
//
// // Create with defaults
// let config = ExtractionConfig::default();
//
// // Load from TOML file
// // let config = ExtractionConfig::from_toml_file("kreuzberg.toml")?;
type ExtractionConfig struct {
// UseCache enables caching of extraction results.
UseCache *bool `json:"use_cache,omitempty"`
// EnableQualityProcessing enables quality post-processing.
EnableQualityProcessing *bool `json:"enable_quality_processing,omitempty"`
// Ocr is the OCR configuration (nil = OCR disabled).
Ocr *OcrConfig `json:"ocr,omitempty"`
// ForceOcr forces OCR even for searchable PDFs.
ForceOcr bool `json:"force_ocr"`
// ForceOcrPages forces OCR on specific pages only (1-indexed page numbers, must be >= 1).
//
// When set, only the listed pages are OCR'd regardless of text layer quality.
// Unlisted pages use native text extraction. Ignored when `force_ocr` is `true`.
// Only applies to PDF documents. Duplicates are automatically deduplicated.
// An `ocr` config is recommended for backend/language selection; defaults are used if absent.
ForceOcrPages []uint32 `json:"force_ocr_pages,omitempty"`
// DisableOcr disables OCR entirely, even for images.
//
// When `true`, OCR is skipped for all document types. Images return metadata
// only (dimensions, format, EXIF) without text extraction. PDFs use only
// native text extraction without OCR fallback.
//
// Cannot be `true` simultaneously with `force_ocr`.
//
// *Added in v4.7.0.*
DisableOcr bool `json:"disable_ocr"`
// Chunking is the text chunking configuration (nil = chunking disabled).
Chunking *ChunkingConfig `json:"chunking,omitempty"`
// ContentFilter is the content filtering configuration (nil = use extractor defaults).
//
// Controls whether document "furniture" (headers, footers, watermarks,
// repeating text) is included in or stripped from extraction results.
// See [`ContentFilterConfig`] for per-field documentation.
ContentFilter *ContentFilterConfig `json:"content_filter,omitempty"`
// Images is the image extraction configuration (nil = no image extraction).
Images *ImageExtractionConfig `json:"images,omitempty"`
// PdfOptions holds PDF-specific options (nil = use defaults).
PdfOptions *PdfConfig `json:"pdf_options,omitempty"`
// TokenReduction is the token reduction configuration (nil = no token reduction).
TokenReduction *TokenReductionOptions `json:"token_reduction,omitempty"`
// LanguageDetection is the language detection configuration (nil = no language detection).
LanguageDetection *LanguageDetectionConfig `json:"language_detection,omitempty"`
// Pages is the page extraction configuration (nil = no page tracking).
Pages *PageConfig `json:"pages,omitempty"`
// Keywords is the keyword extraction configuration (nil = no keyword extraction).
Keywords *KeywordConfig `json:"keywords,omitempty"`
// Postprocessor is the post-processor configuration (nil = use defaults).
Postprocessor *PostProcessorConfig `json:"postprocessor,omitempty"`
// HTMLOptions holds the HTML to Markdown conversion options (nil = use defaults).
//
// Configure how HTML documents are converted to Markdown, including heading styles,
// list formatting, code block styles, and preprocessing options.
//
// NOTE(review): the description reads like a structured options object, but
// this binding types the field as *string — confirm the expected encoding.
HTMLOptions *string `json:"html_options,omitempty"`
// HTMLOutput is the styled HTML output configuration.
//
// When set alongside `output_format = OutputFormat::Html`, the extraction
// pipeline uses [`StyledHtmlRenderer`](crate::rendering::StyledHtmlRenderer)
// which emits stable `kb-*` CSS class hooks on every structural element
// and optionally embeds theme CSS or user-supplied CSS in a `<style>` block.
//
// When nil, the existing plain comrak-based HTML renderer is used.
HTMLOutput *HTMLOutputConfig `json:"html_output,omitempty"`
// ExtractionTimeoutSecs is the default per-file timeout in seconds for batch extraction.
//
// When set, each file in a batch will be canceled after this duration
// unless overridden by [`FileExtractionConfig::timeout_secs`].
//
// Defaults to `Some(60)` to prevent pathological files (e.g. deeply
// nested archives, documents with millions of cells) from running
// indefinitely and exhausting caller resources. Set to `None` to
// disable the timeout for trusted input or long-running workloads.
//
// NOTE(review): a nil pointer is omitted from the JSON encoding; whether the
// native side then applies the 60-second default or disables the timeout is
// not visible here — confirm.
ExtractionTimeoutSecs *uint64 `json:"extraction_timeout_secs,omitempty"`
// MaxConcurrentExtractions is the maximum number of concurrent extractions in
// batch operations (nil = (num_cpus × 1.5).ceil()).
//
// Limits parallelism to prevent resource exhaustion when processing
// large batches. Defaults to (num_cpus × 1.5).ceil() when not set.
MaxConcurrentExtractions *uint `json:"max_concurrent_extractions,omitempty"`
// ResultFormat is the result structure format.
//
// Controls whether results are returned in unified format (default) with all
// content in the `content` field, or element-based format with semantic
// elements (for Unstructured-compatible output).
ResultFormat ResultFormat `json:"result_format,omitempty"`
// SecurityLimits holds the security limits for archive extraction.
//
// Controls maximum archive size, compression ratio, file count, and other
// security thresholds to prevent decompression bomb attacks. Also caps
// nesting depth, iteration count, entity / token length, total
// content size, and table cell count for every extraction path that
// ingests user-controlled bytes.
// When nil, default limits are used.
SecurityLimits *SecurityLimits `json:"security_limits,omitempty"`
// MaxEmbeddedFileBytes is the maximum uncompressed size in bytes for a single
// embedded file before recursive extraction is attempted (default: 50 MiB).
//
// Applies to embedded objects inside OOXML containers (DOCX, PPTX) and
// to email attachments processed via recursive extraction. Files that
// exceed this limit are skipped with a `ProcessingWarning` rather than
// passed to the extraction pipeline, preventing a single oversized
// embedded object from consuming unbounded memory or time.
//
// Set to `None` to disable the per-embedded-file cap (falls back to
// `security_limits.max_archive_size` as the only guard).
//
// NOTE(review): a nil pointer is omitted from the JSON encoding; whether the
// native side then applies the 50 MiB default or disables the cap is not
// visible here — confirm.
MaxEmbeddedFileBytes *uint64 `json:"max_embedded_file_bytes,omitempty"`
// OutputFormat is the content text format (default: Plain).
//
// Controls the format of the extracted content:
// - `Plain`: Raw extracted text (default)
// - `Markdown`: Markdown formatted output
// - `Djot`: Djot markup format (requires djot feature)
// - `Html`: HTML formatted output
//
// When set to a structured format, extraction results will include
// formatted output. The `formatted_content` field may be populated
// when format conversion is applied.
OutputFormat *OutputFormat `json:"output_format,omitempty"`
// Layout is the layout detection configuration (nil = layout detection disabled).
//
// When set, PDF pages and images are analyzed for document structure
// (headings, code, formulas, tables, figures, etc.) using RT-DETR models
// via ONNX Runtime. For PDFs, layout hints override paragraph classification
// in the markdown pipeline. For images, per-region OCR is performed with
// markdown formatting based on detected layout classes.
// Requires the `layout-detection` feature to run inference; the field is
// present whenever the `layout-types` feature is active (which includes
// `layout-detection` as well as the no-ORT target groups).
Layout *LayoutDetectionConfig `json:"layout,omitempty"`
// UseLayoutForMarkdown runs layout detection on the non-OCR PDF markdown path.
//
// When `true` and `layout` is non-nil, layout regions inform heading,
// table, list, and figure detection in the structure pipeline that would
// otherwise rely on font-clustering heuristics alone. Significantly
// improves SF1 (structural F1) at the cost of inference latency
// (~150-300ms/page CPU, ~20-50ms/page GPU). Default: `false`.
// Requires the `layout-detection` feature.
UseLayoutForMarkdown bool `json:"use_layout_for_markdown"`
// IncludeDocumentStructure enables structured document tree output.
//
// When true, populates the `document` field on `ExtractionResult` with a
// hierarchical `DocumentStructure` containing heading-driven section nesting,
// table grids, content layer classification, and inline annotations.
//
// Independent of `result_format` — can be combined with Unified or ElementBased.
IncludeDocumentStructure bool `json:"include_document_structure"`
// Acceleration is the hardware acceleration configuration for ONNX Runtime models.
//
// Controls execution provider selection for layout detection and embedding
// models. When nil, uses platform defaults (CoreML on macOS, CUDA on
// Linux, CPU on Windows).
Acceleration *AccelerationConfig `json:"acceleration,omitempty"`
// CacheNamespace is the cache namespace for tenant isolation.
//
// When set, cache entries are stored under `{cache_dir}/{namespace}/`.
// Must be alphanumeric, hyphens, or underscores only (max 64 chars).
// Different namespaces have isolated cache spaces on the same filesystem.
CacheNamespace *string `json:"cache_namespace,omitempty"`
// CacheTTLSecs is the per-request cache TTL in seconds.
//
// Overrides the global `max_age_days` for this specific extraction.
// When `0`, caching is completely skipped (no read or write).
// When nil, the global TTL applies.
CacheTTLSecs *uint64 `json:"cache_ttl_secs,omitempty"`
// Email is the email extraction configuration (nil = use defaults).
//
// Currently supports configuring the fallback codepage for MSG files
// that do not specify one. See `EmailConfig` for details.
Email *EmailConfig `json:"email,omitempty"`
// Concurrency holds concurrency limits for constrained environments (nil = use defaults).
//
// Controls Rayon thread pool size, ONNX Runtime intra-op threads, and
// (when `max_concurrent_extractions` is unset) the batch concurrency
// semaphore. See `ConcurrencyConfig` for details.
//
// NOTE(review): the description refers to a `ConcurrencyConfig` object, but
// this binding types the field as *string — confirm the expected encoding.
Concurrency *string `json:"concurrency,omitempty"`
// MaxArchiveDepth is the maximum recursion depth for archive extraction (default: 3).
// Set to 0 to disable recursive extraction (legacy behavior).
//
// NOTE(review): the tag has no `omitempty`, so an ExtractionConfig that leaves
// this field at its Go zero value serializes `max_archive_depth` as 0 rather
// than omitting it; set it explicitly if the documented default of 3 is wanted.
MaxArchiveDepth uint `json:"max_archive_depth"`
// TreeSitter is the tree-sitter language pack configuration (nil = tree-sitter disabled).
//
// When set, enables code file extraction using tree-sitter parsers.
// Controls grammar download behavior and code analysis options.
TreeSitter *TreeSitterConfig `json:"tree_sitter,omitempty"`
// StructuredExtraction configures structured extraction via LLM (nil = disabled).
//
// When set, the extracted document content is sent to an LLM with the
// provided JSON schema. The structured response is stored in
// `ExtractionResult::structured_output`.
StructuredExtraction *StructuredExtractionConfig `json:"structured_extraction,omitempty"`
// CancelToken is the cancellation token for this extraction (nil = no external cancellation).
//
// Pass a [`CancellationToken`] clone here and call [`CancellationToken::cancel`]
// from another thread / task to abort the extraction in progress. The extractor
// checks the token at safe checkpoints (before lock acquisition, between pages,
// between batch items) and returns [`KreuzbergError::Cancelled`] when set.
//
// The upstream documentation states that the field is excluded from
// serialization because `CancellationToken` is a runtime handle, not a
// configuration value.
//
// NOTE(review): in this binding the field is a *string tagged
// `json:"cancel_token,omitempty"`, so a non-nil value IS written to the JSON
// encoding — confirm how the native side treats it.
CancelToken *string `json:"cancel_token,omitempty"`
}
// FileExtractionConfig holds per-file extraction configuration overrides for batch processing.
//
// All fields are optional (`Option<T>` upstream): a nil pointer or slice means
// "use the batch-level default" and is omitted from the JSON encoding.
// This type is used with `batch_extract_files` and
// `batch_extract_bytes` to allow heterogeneous
// extraction settings within a single batch.
//
// # Excluded Fields
//
// The following `ExtractionConfig` fields are batch-level only and
// cannot be overridden per file:
// - `max_concurrent_extractions` — controls batch parallelism
// - `use_cache` — global caching policy
// - `acceleration` — shared ONNX execution provider
// - `security_limits` — global archive security policy
//
// Example (Rust syntax, carried over from the upstream documentation):
//
// // Override just OCR forcing for a specific file
// let config = FileExtractionConfig {
// force_ocr: Some(true),
// ..Default::default()
// };
type FileExtractionConfig struct {
// EnableQualityProcessing overrides quality post-processing for this file.
EnableQualityProcessing *bool `json:"enable_quality_processing,omitempty"`
// Ocr overrides the OCR configuration for this file (nil = use batch default).
Ocr *OcrConfig `json:"ocr,omitempty"`
// ForceOcr overrides force OCR for this file.
ForceOcr *bool `json:"force_ocr,omitempty"`
// ForceOcrPages overrides force OCR pages for this file (1-indexed page numbers).
ForceOcrPages []uint32 `json:"force_ocr_pages,omitempty"`
// DisableOcr overrides disable OCR for this file.
DisableOcr *bool `json:"disable_ocr,omitempty"`
// Chunking overrides the chunking configuration for this file.
Chunking *ChunkingConfig `json:"chunking,omitempty"`
// ContentFilter overrides the content filtering configuration for this file.
ContentFilter *ContentFilterConfig `json:"content_filter,omitempty"`
// Images overrides the image extraction configuration for this file.
Images *ImageExtractionConfig `json:"images,omitempty"`
// PdfOptions overrides the PDF options for this file.
PdfOptions *PdfConfig `json:"pdf_options,omitempty"`
// TokenReduction overrides token reduction for this file.
TokenReduction *TokenReductionOptions `json:"token_reduction,omitempty"`
// LanguageDetection overrides language detection for this file.
LanguageDetection *LanguageDetectionConfig `json:"language_detection,omitempty"`
// Pages overrides page extraction for this file.
Pages *PageConfig `json:"pages,omitempty"`
// Keywords overrides keyword extraction for this file.
Keywords *KeywordConfig `json:"keywords,omitempty"`
// Postprocessor overrides the post-processor for this file.
Postprocessor *PostProcessorConfig `json:"postprocessor,omitempty"`
// HTMLOptions overrides the HTML conversion options for this file.
//
// NOTE(review): typed as *string in this binding — confirm the expected encoding.
HTMLOptions *string `json:"html_options,omitempty"`
// ResultFormat overrides the result format for this file.
ResultFormat *ResultFormat `json:"result_format,omitempty"`
// OutputFormat overrides the output content format for this file.
OutputFormat *OutputFormat `json:"output_format,omitempty"`
// IncludeDocumentStructure overrides document structure output for this file.
IncludeDocumentStructure *bool `json:"include_document_structure,omitempty"`
// Layout overrides layout detection for this file.
Layout *LayoutDetectionConfig `json:"layout,omitempty"`
// TimeoutSecs overrides the per-file extraction timeout in seconds.
//
// When set, the extraction for this file will be canceled after the
// specified duration. A timed-out file produces an error result without
// affecting other files in the batch.
TimeoutSecs *uint64 `json:"timeout_secs,omitempty"`
// TreeSitter overrides the tree-sitter configuration for this file.
TreeSitter *TreeSitterConfig `json:"tree_sitter,omitempty"`
// StructuredExtraction overrides the structured extraction configuration for this file.
//
// When set, enables LLM-based structured extraction with a JSON schema
// for this specific file. The extracted content is sent to a VLM/LLM
// and the response is parsed according to the provided schema.
StructuredExtraction *StructuredExtractionConfig `json:"structured_extraction,omitempty"`
}
// BatchBytesItem is a single in-memory item in a byte-array batch extraction job.
//
// Used with `batch_extract_bytes` and `batch_extract_bytes_sync`
// to represent a single item in a batch extraction job.
//
// The type defines its own MarshalJSON, which writes Content as a JSON array
// of integers instead of encoding/json's default base64 string for `[]byte`.
type BatchBytesItem struct {
// The content bytes to extract from.
Content []byte `json:"content"`
// MIME type of the content (e.g., "application/pdf", "text/html").
MimeType string `json:"mime_type"`
// Per-item configuration overrides. A nil Config is omitted from the JSON
// payload; per the upstream docs, batch-level defaults are used in that case.
Config *FileExtractionConfig `json:"config,omitempty"`
}
// MarshalJSON encodes the item with Content written as a JSON array of
// integers — the shape Rust's serde `Vec<u8>` deserializer expects — rather
// than the base64 string encoding/json produces for `[]byte` by default.
// MimeType and Config are encoded under their usual keys.
func (v BatchBytesItem) MarshalJSON() ([]byte, error) {
	// Widen each byte to int so encoding/json renders a numeric array. The
	// slice is always allocated, so empty content encodes as `[]`, not `null`.
	ints := make([]int, 0, len(v.Content))
	for idx := range v.Content {
		ints = append(ints, int(v.Content[idx]))
	}

	// A standalone shadow type that lists every field explicitly. Embedding
	// BatchBytesItem instead would yield both a base64-string and an int-array
	// entry for the same "content" key.
	type wire struct {
		Content  []int                 `json:"content"`
		MimeType string                `json:"mime_type"`
		Config   *FileExtractionConfig `json:"config,omitempty"`
	}
	return json.Marshal(wire{
		Content:  ints,
		MimeType: v.MimeType,
		Config:   v.Config,
	})
}
// BatchFileItem is a single file entry in a batch file extraction job.
//
// Used with `batch_extract_files` and `batch_extract_files_sync`
// to represent a single file in a batch extraction job.
type BatchFileItem struct {
// Path to the file to extract from.
Path string `json:"path"`
// Per-file configuration overrides. A nil Config is omitted from the JSON
// payload; per the upstream docs, batch-level defaults are used in that case.
Config *FileExtractionConfig `json:"config,omitempty"`
}
// ImageExtractionConfig holds image extraction configuration.
//
// Pointer fields are optional: a nil pointer is omitted from the JSON payload
// (`omitempty`). The plain bool fields (IncludePageRasters, OcrTextOnly,
// AppendOcrText) carry no `omitempty` and are therefore always serialized,
// including their zero value `false`.
type ImageExtractionConfig struct {
// Extract images from documents.
ExtractImages *bool `json:"extract_images,omitempty"`
// Target DPI for image normalization.
TargetDpi *int32 `json:"target_dpi,omitempty"`
// Maximum dimension for images (width or height).
MaxImageDimension *int32 `json:"max_image_dimension,omitempty"`
// Whether to inject image reference placeholders into markdown output.
// When `true` (default), image references like `![Image 1](embedded:p1_i0)`
// are appended to the markdown. Set to `false` to extract images as data
// without polluting the markdown output.
InjectPlaceholders *bool `json:"inject_placeholders,omitempty"`
// Automatically adjust DPI based on image content.
AutoAdjustDpi *bool `json:"auto_adjust_dpi,omitempty"`
// Minimum DPI threshold.
MinDpi *int32 `json:"min_dpi,omitempty"`
// Maximum DPI threshold.
MaxDpi *int32 `json:"max_dpi,omitempty"`
// Maximum number of image objects to extract per PDF page.
//
// Some PDFs (e.g. technical diagrams stored as thousands of raster fragments)
// can trigger extremely long or indefinite extraction times when every image
// object on a dense page is decoded individually via the PDF extractor. Setting this
// limit causes kreuzberg to stop collecting individual images once the count
// per page reaches the cap and emit a warning instead.
//
// nil (`None` upstream, the default) means no limit — all images are extracted.
MaxImagesPerPage *uint32 `json:"max_images_per_page,omitempty"`
// When `true` (default), extracted images are classified by kind and grouped
// into clusters where they appear to belong to one figure.
Classify *bool `json:"classify,omitempty"`
// When `true`, full-page renders produced during OCR preprocessing are captured
// and returned as `ImageKind::PageRaster` entries in `ExtractionResult.images`.
//
// **PDF + OCR only.** No rasters are captured for non-PDF inputs or when the
// document-level OCR bypass is active (whole-document backend). When OCR is
// enabled and this flag is set but the active backend skips per-page rendering,
// a `ProcessingWarning` is emitted in `ExtractionResult.processing_warnings`.
//
// Defaults to `false`. Enable when downstream consumers need page thumbnails
// (e.g. citation previews, visual grounding).
IncludePageRasters bool `json:"include_page_rasters"`
// Run OCR on extracted images and include the recognized text in the document content.
//
// When `true` (default) and `ExtractionConfig.ocr` is configured, extracted images
// are processed with the configured OCR backend. Set to `false` to extract images
// without OCR processing, even when OCR is enabled.
RunOcrOnImages *bool `json:"run_ocr_on_images,omitempty"`
// When `true`, image OCR results are rendered as plain text without the
// `![...](...)` markdown placeholder. Only takes effect when `run_ocr_on_images`
// is also `true`.
OcrTextOnly bool `json:"ocr_text_only"`
// When `true` and `ocr_text_only` is `false`, append the OCR text after
// the image placeholder in the rendered output.
AppendOcrText bool `json:"append_ocr_text"`
}
// TokenReductionOptions holds token reduction configuration.
type TokenReductionOptions struct {
// Reduction mode: "off", "light", "moderate", "aggressive", "maximum".
// Always serialized (no `omitempty`), so an unset Mode is sent as "".
Mode string `json:"mode"`
// Preserve important words (capitalized, technical terms).
// Omitted from the JSON payload when nil.
PreserveImportantWords *bool `json:"preserve_important_words,omitempty"`
}
// LanguageDetectionConfig holds language detection configuration.
type LanguageDetectionConfig struct {
// Enable language detection. Omitted from the JSON payload when nil.
Enabled *bool `json:"enabled,omitempty"`
// Minimum confidence threshold (0.0-1.0). Omitted from the JSON payload when nil.
MinConfidence *float64 `json:"min_confidence,omitempty"`
// Detect multiple languages in the document.
// Always serialized; the Go zero value sends `false`.
DetectMultiple bool `json:"detect_multiple"`
}
// HTMLOutputConfig is the configuration for styled HTML output.
//
// When set on [`ExtractionConfig::html_output`] alongside
// `output_format = OutputFormat::Html`, the pipeline builds a
// [`StyledHtmlRenderer`](crate::rendering::StyledHtmlRenderer) instead of
// the plain comrak-based renderer.
//
// Example (upstream Rust API, not Go):
//
// let config = HtmlOutputConfig {
// theme: HtmlTheme::GitHub,
// css: Some(".kb-p { font-size: 1.1rem; }".to_string()),
// ..Default::default()
// };
type HTMLOutputConfig struct {
// Inline CSS string injected into the output after the theme stylesheet.
// Concatenated after `css_file` content when both are set.
CSS *string `json:"css,omitempty"`
// Path to a CSS file loaded once at renderer construction time.
// Concatenated before `css` when both are set.
CSSFile *string `json:"css_file,omitempty"`
// Built-in colour/typography theme. Default: [`HtmlTheme::Unstyled`].
Theme *HTMLTheme `json:"theme,omitempty"`
// CSS class prefix applied to every emitted class name.
//
// Default: `"kb-"`. Change this if your host application already uses
// classes that start with `kb-`.
//
// NOTE(review): this field has no `omitempty`, so a zero-value Go struct
// serializes `"class_prefix": ""` rather than leaving the key out — confirm
// whether the native side treats an empty prefix as "use the default".
ClassPrefix string `json:"class_prefix"`
// When `true` (default), write the resolved CSS into a `<style>` block
// immediately after the opening `<div class="{prefix}doc">`.
//
// Set to `false` to emit only the structural markup and wire up your
// own stylesheet targeting the `kb-*` class names.
EmbedCSS *bool `json:"embed_css,omitempty"`
}
// LayoutDetectionConfig holds layout detection configuration.
//
// Controls layout detection behavior in the extraction pipeline.
// When set on [`ExtractionConfig`](super::ExtractionConfig), layout detection
// is enabled for PDF extraction.
type LayoutDetectionConfig struct {
// Confidence threshold override (nil = use model default).
ConfidenceThreshold *float32 `json:"confidence_threshold,omitempty"`
// Whether to apply postprocessing heuristics (default: true).
ApplyHeuristics *bool `json:"apply_heuristics,omitempty"`
// Table structure recognition model.
//
// Controls which model is used for table cell detection within layout-detected
// table regions. Defaults to [`TableModel::Tatr`].
//
// Unlike the other optional fields this is a value, not a pointer; with
// `omitempty` it is left out of the JSON payload when it holds its zero value.
TableModel TableModel `json:"table_model,omitempty"`
// Hardware acceleration for ONNX models (layout detection + table structure).
//
// When set, controls which execution provider (CPU, CUDA, CoreML, TensorRT)
// is used for inference. Defaults to nil (`None` upstream: auto-select per platform).
Acceleration *AccelerationConfig `json:"acceleration,omitempty"`
}
// LlmConfig is the configuration for an LLM provider/model via liter-llm.
//
// Each feature (VLM OCR, VLM embeddings, structured extraction) carries
// its own `LlmConfig`, allowing different providers per feature.
//
// Example (TOML config file):
//
// [structured_extraction.llm]
// model = "openai/gpt-4o"
// api_key = "sk-..." # or use KREUZBERG_LLM_API_KEY env var
type LlmConfig struct {
// Provider/model string using liter-llm routing format.
//
// Examples: `"openai/gpt-4o"`, `"anthropic/claude-sonnet-4-20250514"`,
// `"groq/llama-3.1-70b-versatile"`.
//
// Always serialized (no `omitempty`), even when empty.
Model string `json:"model"`
// API key for the provider. When nil (`None` upstream), liter-llm falls back to
// the provider's standard environment variable (e.g., `OPENAI_API_KEY`).
APIKey *string `json:"api_key,omitempty"`
// Custom base URL override for the provider endpoint.
BaseURL *string `json:"base_url,omitempty"`
// Request timeout in seconds (default: 60).
TimeoutSecs *uint64 `json:"timeout_secs,omitempty"`
// Maximum retry attempts (default: 3).
MaxRetries *uint32 `json:"max_retries,omitempty"`
// Sampling temperature for generation tasks.
Temperature *float64 `json:"temperature,omitempty"`
// Maximum tokens to generate.
MaxTokens *uint64 `json:"max_tokens,omitempty"`
}
// StructuredExtractionConfig is the configuration for LLM-based structured data extraction.
//
// Sends extracted document content to a VLM with a JSON schema,
// returning structured data that conforms to the schema.
//
// Example (TOML config file):
//
// [structured_extraction]
// schema_name = "invoice_data"
// strict = true
//
// [structured_extraction.schema]
// type = "object"
// properties.vendor = { type = "string" }
// properties.total = { type = "number" }
// required = ["vendor", "total"]
//
// [structured_extraction.llm]
// model = "openai/gpt-4o"
type StructuredExtractionConfig struct {
// JSON Schema defining the desired output structure, held as raw JSON and
// passed through verbatim. The field is always serialized; a nil Schema is
// encoded as JSON `null`.
Schema json.RawMessage `json:"schema"`
// Schema name passed to the LLM's structured output mode.
SchemaName string `json:"schema_name"`
// Optional schema description for the LLM.
SchemaDescription *string `json:"schema_description,omitempty"`
// Enable strict mode — output must exactly match the schema.
Strict bool `json:"strict"`
// Custom Jinja2 extraction prompt template. When nil (`None` upstream), a default template is used.
//
// Available template variables:
// - `{{ content }}` — The extracted document text.
// - `{{ schema }}` — The JSON schema as a formatted string.
// - `{{ schema_name }}` — The schema name.
// - `{{ schema_description }}` — The schema description (may be empty).
Prompt *string `json:"prompt,omitempty"`
// LLM configuration for the extraction. A value, not a pointer, so it is
// always serialized.
Llm LlmConfig `json:"llm"`
}
// OcrQualityThresholds holds quality thresholds for OCR fallback decisions and
// pipeline quality gating.
//
// All fields default to the values that match the previous hardcoded behavior,
// so `OcrQualityThresholds::default()` (upstream Rust) preserves existing semantics exactly.
//
// Every field is an optional pointer and is omitted from the JSON payload when nil.
type OcrQualityThresholds struct {
// Minimum total non-whitespace characters to consider text substantive.
MinTotalNonWhitespace *uint `json:"min_total_non_whitespace,omitempty"`
// Minimum non-whitespace characters per page on average.
MinNonWhitespacePerPage *float64 `json:"min_non_whitespace_per_page,omitempty"`
// Minimum character count for a word to be "meaningful".
MinMeaningfulWordLen *uint `json:"min_meaningful_word_len,omitempty"`
// Minimum count of meaningful words before text is accepted.
MinMeaningfulWords *uint `json:"min_meaningful_words,omitempty"`
// Minimum alphanumeric ratio (non-whitespace chars that are alphanumeric).
MinAlnumRatio *float64 `json:"min_alnum_ratio,omitempty"`
// Minimum Unicode replacement characters (U+FFFD) to trigger OCR fallback.
MinGarbageChars *uint `json:"min_garbage_chars,omitempty"`
// Maximum fraction of short (1-2 char) words before text is considered fragmented.
MaxFragmentedWordRatio *float64 `json:"max_fragmented_word_ratio,omitempty"`
// Critical fragmentation threshold — triggers OCR regardless of meaningful words.
// Normal English text has ~20-30% short words. 80%+ is definitive garbage.
CriticalFragmentedWordRatio *float64 `json:"critical_fragmented_word_ratio,omitempty"`
// Minimum average word length. Below this with enough words indicates garbled extraction.
MinAvgWordLength *float64 `json:"min_avg_word_length,omitempty"`
// Minimum word count before average word length check applies.
MinWordsForAvgLengthCheck *uint `json:"min_words_for_avg_length_check,omitempty"`
// Minimum consecutive word repetition ratio to detect column scrambling.
MinConsecutiveRepeatRatio *float64 `json:"min_consecutive_repeat_ratio,omitempty"`
// Minimum word count before consecutive repetition check is applied.
MinWordsForRepeatCheck *uint `json:"min_words_for_repeat_check,omitempty"`
// Minimum character count for "substantive markdown" OCR skip gate.
SubstantiveMinChars *uint `json:"substantive_min_chars,omitempty"`
// Minimum character count for "non-text content" OCR skip gate.
NonTextMinChars *uint `json:"non_text_min_chars,omitempty"`
// Alphanumeric+whitespace ratio threshold for skip decisions.
AlnumWsRatioThreshold *float64 `json:"alnum_ws_ratio_threshold,omitempty"`
// Minimum quality score (0.0-1.0) for a pipeline stage result to be accepted.
// If the result from a backend scores below this, try the next backend.
PipelineMinQuality *float64 `json:"pipeline_min_quality,omitempty"`
}
// OcrPipelineStage is a single backend stage in the OCR pipeline.
type OcrPipelineStage struct {
// Backend name: "tesseract", "paddleocr", "easyocr", or a custom registered name.
Backend string `json:"backend"`
// Priority weight (higher = tried first). Stages are sorted by priority descending.
Priority uint32 `json:"priority"`
// Language override for this stage (nil = use parent OcrConfig.language).
Language *string `json:"language,omitempty"`
// Tesseract-specific config override for this stage.
TesseractConfig *TesseractConfig `json:"tesseract_config,omitempty"`
// PaddleOCR-specific config for this stage, held as raw JSON.
PaddleOcrConfig *json.RawMessage `json:"paddle_ocr_config,omitempty"`
// VLM config override for this pipeline stage.
VlmConfig *LlmConfig `json:"vlm_config,omitempty"`
// Arbitrary per-call options passed through to the backend unchanged.
//
// Backends that support runtime tuning (mode switching, preprocessing
// flags, inference parameters, etc.) read this value and deserialize
// the keys they care about. Keys unknown to the backend are silently
// ignored, so options from different backends can coexist in the same
// config without conflict.
//
// Example (custom backend):
// ```json
// { "mode": "fast", "enable_layout": true }
// ```
BackendOptions *json.RawMessage `json:"backend_options,omitempty"`
}
// OcrPipelineConfig describes a multi-backend OCR pipeline with quality-based fallback.
//
// Backends are tried in priority order (highest first). After each backend
// produces output, quality is evaluated. If it meets `quality_thresholds.pipeline_min_quality`,
// the result is accepted. Otherwise the next backend is tried.
type OcrPipelineConfig struct {
// Ordered list of backends to try. Sorted by priority (descending) at runtime.
// Omitted from the JSON payload when the slice is empty.
Stages []OcrPipelineStage `json:"stages,omitempty"`
// Quality thresholds for deciding whether to accept a result or try the next backend.
// A value, not a pointer, so the key is always serialized (as `{}` when no
// threshold is set).
QualityThresholds OcrQualityThresholds `json:"quality_thresholds"`
}
// OcrConfig holds OCR configuration.
type OcrConfig struct {
// Whether OCR is enabled.
//
// Setting `enabled: false` is a shorthand for `disable_ocr: true` on the parent
// [`ExtractionConfig`](crate::core::config::ExtractionConfig). Images return
// metadata only; PDFs use native text extraction without OCR fallback.
//
// Defaults to `true`. When `false`, all other OCR settings are ignored.
Enabled *bool `json:"enabled,omitempty"`
// OCR backend: tesseract, easyocr, paddleocr.
// Always serialized (no `omitempty`), even when empty.
Backend string `json:"backend"`
// Language code (e.g., "eng", "deu").
// Always serialized (no `omitempty`), even when empty.
Language string `json:"language"`
// Tesseract-specific configuration (optional).
TesseractConfig *TesseractConfig `json:"tesseract_config,omitempty"`
// Output format for OCR results (optional, for format conversion).
OutputFormat *OutputFormat `json:"output_format,omitempty"`
// PaddleOCR-specific configuration (optional, JSON passthrough).
PaddleOcrConfig *json.RawMessage `json:"paddle_ocr_config,omitempty"`
// Arbitrary per-call options passed through to the backend unchanged.
//
// Custom OCR backends and built-in backends that support runtime tuning
// can read this value and deserialize the keys they care about. Keys
// unknown to the backend are silently ignored.
//
// This is the recommended extension point for per-call parameters that
// are not covered by the typed fields above (e.g. mode switching,
// preprocessing flags, inference batch size).
//
// **Scope:** when `pipeline` is nil (`None` upstream), this value is propagated to the
// primary stage of the auto-constructed pipeline. When `pipeline` is
// explicitly set, this field has **no effect** — the caller must set
// `OcrPipelineStage.backend_options` directly on the relevant stage(s)
// instead.
//
// Example:
// ```json
// { "mode": "fast", "enable_layout": true, "timeout_ms": 5000 }
// ```
BackendOptions *json.RawMessage `json:"backend_options,omitempty"`
// OCR element extraction configuration.
ElementConfig *OcrElementConfig `json:"element_config,omitempty"`
// Quality thresholds for the native-text-to-OCR fallback decision.
// When nil, uses compiled defaults (matching previous hardcoded behavior).
QualityThresholds *OcrQualityThresholds `json:"quality_thresholds,omitempty"`
// Multi-backend OCR pipeline configuration. When set, enables weighted
// fallback across multiple OCR backends based on output quality.
// When nil, uses the single `backend` field (same as today).
Pipeline *OcrPipelineConfig `json:"pipeline,omitempty"`
// Enable automatic page rotation based on orientation detection.
//
// When enabled, uses Tesseract's `DetectOrientationScript()` to detect
// page orientation (0/90/180/270 degrees) before OCR. If the page is
// rotated with high confidence, the image is corrected before recognition.
// This is critical for handling rotated scanned documents.
AutoRotate bool `json:"auto_rotate"`
// VLM (Vision Language Model) OCR configuration.
//
// Required when `backend` is `"vlm"`. Uses liter-llm to send page
// images to a vision model for text extraction.
VlmConfig *LlmConfig `json:"vlm_config,omitempty"`
// Custom Jinja2 prompt template for VLM OCR.
//
// When nil (`None` upstream), uses the default template. Available variables:
// - `{{ language }}` — The document language code (e.g., "eng", "deu").
VlmPrompt *string `json:"vlm_prompt,omitempty"`
// Hardware acceleration for ONNX Runtime models (e.g. PaddleOCR, layout detection).
//
// Not user-configurable via config files — injected at runtime from
// `ExtractionConfig::acceleration` before each `process_image` call.
Acceleration *AccelerationConfig `json:"acceleration,omitempty"`
// Caller-supplied Tesseract `traineddata` bytes per language code.
//
// Primary use case is the WASM build, which has no filesystem and cannot
// download tessdata at runtime. Native builds typically rely on
// `TessdataManager` and ignore this field. When present, the WASM
// Tesseract backend prefers these bytes over its compile-time-bundled
// English data.
//
// Skipped by serde to keep config files small — supply via the typed API
// at runtime.
//
// NOTE(review): on the Go side the tag is `tessdata_bytes,omitempty`, so a
// non-empty map IS written into the JSON payload (each value as a base64
// string, encoding/json's default for `[]byte`). If the Rust field is
// skipped by serde as stated above, that data is presumably ignored by the
// native side — confirm whether this field can take effect through this
// binding at all.
TessdataBytes map[string][]byte `json:"tessdata_bytes,omitempty"`
}
// PageConfig holds page extraction and tracking configuration.
//
// Controls how pages are extracted, tracked, and represented in the extraction results.
// When the config is absent (`None` upstream; a nil `*PageConfig` where it is
// referenced from Go), page tracking is disabled.
//
// Page range tracking in chunk metadata (first_page/last_page) is automatically enabled
// when page boundaries are available and chunking is configured.
type PageConfig struct {
// Extract pages as separate array (ExtractionResult.pages).
// Always serialized; the Go zero value sends `false`.
ExtractPages bool `json:"extract_pages"`
// Insert page markers in main content string.
// Always serialized; the Go zero value sends `false`.
InsertPageMarkers bool `json:"insert_page_markers"`
// Page marker format (use {page_num} placeholder).
// Default: "\n\n<!-- PAGE {page_num} -->\n\n"
MarkerFormat *string `json:"marker_format,omitempty"`
}
// PdfConfig holds PDF-specific configuration.
//
// Pointer and slice fields are omitted from the JSON payload when nil/empty;
// the plain bool fields are always serialized, including their zero value `false`.
type PdfConfig struct {
// Extract images from PDF.
ExtractImages bool `json:"extract_images"`
// Extract tables from PDF.
//
// When `true` (default), runs pdf_oxide's native grid detector and, if it
// finds nothing, falls back to the heuristic text-layer reconstruction in
// `pdf::oxide::table::extract_tables_heuristic`. Set to `false` to skip
// both passes — `tables` will then be empty in the result.
ExtractTables *bool `json:"extract_tables,omitempty"`
// List of passwords to try when opening encrypted PDFs.
Passwords []string `json:"passwords,omitempty"`
// Extract PDF metadata.
ExtractMetadata *bool `json:"extract_metadata,omitempty"`
// Hierarchy extraction configuration (nil = hierarchy extraction disabled).
Hierarchy *HierarchyConfig `json:"hierarchy,omitempty"`
// Extract PDF annotations (text notes, highlights, links, stamps).
// Default: false
ExtractAnnotations bool `json:"extract_annotations"`
// Top margin fraction (0.0-1.0) of page height to exclude headers/running heads.
// Default: 0.06 (6%)
TopMarginFraction *float32 `json:"top_margin_fraction,omitempty"`
// Bottom margin fraction (0.0-1.0) of page height to exclude footers/page numbers.
// Default: 0.05 (5%)
BottomMarginFraction *float32 `json:"bottom_margin_fraction,omitempty"`
// Allow single-column pseudo tables in extraction results.
//
// By default, tables with fewer than 2 columns (layout-guided) or 3 columns
// (heuristic) are rejected. When `true`, the minimum column count is relaxed
// to 1, allowing single-column structured data (glossaries, itemized lists)
// to be emitted as tables. Other quality filters (density, sparsity, prose
// detection) still apply.
AllowSingleColumnTables bool `json:"allow_single_column_tables"`
// Perform OCR on inline images extracted from PDF pages and attach the
// recognized text to each `ExtractedImage.ocr_result`. Requires Tesseract
// to be available; if `ExtractionConfig.ocr` is `None` the extractor
// falls back to `TesseractConfig::default()`. Per-image failures degrade
// gracefully (the image is returned without OCR text rather than failing
// the whole extraction). Default: `false`.
OcrInlineImages bool `json:"ocr_inline_images"`
}
// HierarchyConfig holds hierarchy extraction configuration for PDF text structure analysis.
//
// Enables extraction of document hierarchy levels (H1-H6) based on font size
// clustering and semantic analysis. When enabled, hierarchical blocks are
// included in page content.
//
// Every field is an optional pointer and is omitted from the JSON payload when nil.
type HierarchyConfig struct {
// Enable hierarchy extraction.
Enabled *bool `json:"enabled,omitempty"`
// Number of font size clusters to use for hierarchy levels (1-7).
//
// Default: 6, which provides H1-H6 heading levels with body text.
// Larger values create more fine-grained hierarchy levels.
KClusters *uint `json:"k_clusters,omitempty"`
// Include bounding box information in hierarchy blocks.
IncludeBbox *bool `json:"include_bbox,omitempty"`
// OCR coverage threshold for smart OCR triggering (0.0-1.0).
//
// Determines when OCR should be triggered based on text block coverage.
// OCR is triggered when text blocks cover less than this fraction of the page.
// Default: 0.5 (trigger OCR if less than 50% of page has text)
OcrCoverageThreshold *float32 `json:"ocr_coverage_threshold,omitempty"`
}
// PostProcessorConfig holds post-processor configuration.
//
// All fields are optional and omitted from the JSON payload when nil/empty.
type PostProcessorConfig struct {
// Enable post-processors.
Enabled *bool `json:"enabled,omitempty"`
// Whitelist of processor names to run (nil = all enabled).
EnabledProcessors []string `json:"enabled_processors,omitempty"`
// Blacklist of processor names to skip (nil = none disabled).
DisabledProcessors []string `json:"disabled_processors,omitempty"`
// Pre-computed AHashSet for O(1) enabled processor lookup (upstream Rust
// type; exposed here as a plain string slice).
// NOTE(review): presumably derived by the native side from
// EnabledProcessors — confirm whether Go callers should ever set this.
EnabledSet []string `json:"enabled_set,omitempty"`
// Pre-computed AHashSet for O(1) disabled processor lookup (upstream Rust
// type; exposed here as a plain string slice).
// NOTE(review): presumably derived by the native side from
// DisabledProcessors — confirm whether Go callers should ever set this.
DisabledSet []string `json:"disabled_set,omitempty"`
}
// ChunkingConfig holds chunking configuration.
//
// Configures text chunking for document content, including chunk size,
// overlap, trimming behavior, and optional embeddings.
//
// Upstream Rust usage — use `..Default::default()` when constructing to allow
// for future field additions:
// ```rust
// let config = ChunkingConfig {
// max_characters: 500,
// ..Default::default()
// };
// ```
//
// Note that the JSON keys used by this binding differ from the Rust field
// names shown above: MaxCharacters is serialized as `max_chars` and Overlap
// as `max_overlap`.
type ChunkingConfig struct {
// Maximum size per chunk (in units determined by `sizing`).
//
// When `sizing` is `Characters` (default), this is the max character count.
// When using token-based sizing, this is the max token count.
//
// Default: 1000
MaxCharacters *uint `json:"max_chars,omitempty"`
// Overlap between chunks (in units determined by `sizing`).
//
// Default: 200
Overlap *uint `json:"max_overlap,omitempty"`
// Whether to trim whitespace from chunk boundaries.
//
// Default: true
Trim *bool `json:"trim,omitempty"`
// Type of chunker to use (Text or Markdown).
//
// Default: Text
ChunkerType *ChunkerType `json:"chunker_type,omitempty"`
// Optional embedding configuration for chunk embeddings.
Embedding *EmbeddingConfig `json:"embedding,omitempty"`
// Use a preset configuration (overrides individual settings if provided).
Preset *string `json:"preset,omitempty"`
// How to measure chunk size.
//
// Default: `Characters` (Unicode character count).
// Enable `chunking-tiktoken` or `chunking-tokenizers` features for token-based sizing.
//
// Always serialized (no `omitempty`).
Sizing ChunkSizing `json:"sizing"`
// When `true` and `chunker_type` is `Markdown`, prepend the heading hierarchy
// path (e.g. `"# Title > ## Section\n\n"`) to each chunk's content string.
//
// This is useful for RAG pipelines where each chunk needs self-contained
// context about its position in the document structure.
//
// Default: `false`
PrependHeadingContext bool `json:"prepend_heading_context"`
// Optional cosine similarity threshold for semantic topic boundary detection.
//
// Only used when `chunker_type` is `Semantic` and an `EmbeddingConfig` is
// provided. You almost never need to set this. When omitted, defaults to
// `0.75` which works well for most documents. Lower values detect more
// topic boundaries (more, smaller chunks); higher values detect fewer.
// Range: `0.0..=1.0`.
TopicThreshold *float32 `json:"topic_threshold,omitempty"`
}
// UnmarshalJSON decodes a ChunkingConfig from JSON.
//
// Every field except `sizing` follows encoding/json's ordinary rules. The
// `sizing` value is captured as raw JSON and handed to UnmarshalChunkSizing;
// when the key is absent or `null`, the receiver's current Sizing is left
// untouched. Any decoding error is returned unchanged.
func (s *ChunkingConfig) UnmarshalJSON(data []byte) error {
	// Mirror of ChunkingConfig with Sizing deferred as raw JSON.
	type wire struct {
		MaxCharacters         *uint            `json:"max_chars,omitempty"`
		Overlap               *uint            `json:"max_overlap,omitempty"`
		Trim                  *bool            `json:"trim,omitempty"`
		ChunkerType           *ChunkerType     `json:"chunker_type,omitempty"`
		Embedding             *EmbeddingConfig `json:"embedding,omitempty"`
		Preset                *string          `json:"preset,omitempty"`
		Sizing                json.RawMessage  `json:"sizing,omitempty"`
		PrependHeadingContext bool             `json:"prepend_heading_context"`
		TopicThreshold        *float32         `json:"topic_threshold,omitempty"`
	}

	var aux wire
	if err := json.Unmarshal(data, &aux); err != nil {
		return err
	}

	// The plain fields are applied before sizing is parsed, so they are
	// already set on the receiver if the sizing payload proves invalid.
	s.MaxCharacters, s.Overlap, s.Trim = aux.MaxCharacters, aux.Overlap, aux.Trim
	s.ChunkerType, s.Embedding, s.Preset = aux.ChunkerType, aux.Embedding, aux.Preset
	s.PrependHeadingContext, s.TopicThreshold = aux.PrependHeadingContext, aux.TopicThreshold

	// Nothing to parse when sizing is missing or explicitly null.
	if len(aux.Sizing) == 0 || string(aux.Sizing) == "null" {
		return nil
	}

	sizing, err := UnmarshalChunkSizing(aux.Sizing)
	if err != nil {
		return err
	}
	s.Sizing = sizing
	return nil
}
// EmbeddingConfig holds embedding configuration for text chunks.
//
// Configures embedding generation using ONNX models via the vendored embedding engine.
// Requires the `embeddings` feature to be enabled.
type EmbeddingConfig struct {
// The embedding model to use (defaults to "balanced" preset if not specified).
// Always serialized (no `omitempty`).
Model EmbeddingModelType `json:"model"`
// Whether to normalize embedding vectors (recommended for cosine similarity).
Normalize *bool `json:"normalize,omitempty"`
// Batch size for embedding generation.
BatchSize *uint `json:"batch_size,omitempty"`
// Show model download progress.
// Always serialized; the Go zero value sends `false`.
ShowDownloadProgress bool `json:"show_download_progress"`
// Custom cache directory for model files.
//
// Defaults to `~/.cache/kreuzberg/embeddings/` if not specified.
// Allows full customization of model download location.
CacheDir *string `json:"cache_dir,omitempty"`
// Hardware acceleration for the embedding ONNX model.
//
// When set, controls which execution provider (CPU, CUDA, CoreML, TensorRT)
// is used for inference. Defaults to nil (`None` upstream: auto-select per platform).
Acceleration *AccelerationConfig `json:"acceleration,omitempty"`
// Maximum wall-clock duration (in seconds) for a single `embed()` call when
// using [`EmbeddingModelType::Plugin`].
//
// Applies only to the in-process plugin path — protects against hung
// host-language backends (e.g. a Python callback deadlocked on the GIL,
// a model stuck on CUDA OOM retries, etc.). On timeout, the dispatcher
// returns `Plugin` (presumably the plugin error variant — confirm upstream)
// instead of blocking forever.
//
// nil (`None` upstream) disables the timeout. The default (60 seconds) is conservative
// for common in-process inference; increase for large batches on slow
// hardware.
MaxEmbedDurationSecs *uint64 `json:"max_embed_duration_secs,omitempty"`
}
// UnmarshalJSON decodes an EmbeddingConfig from JSON.
//
// Every field except `model` follows encoding/json's ordinary rules. The
// `model` value is captured as raw JSON and handed to
// UnmarshalEmbeddingModelType; when the key is absent or `null`, the
// receiver's current Model is left untouched. Any decoding error is returned
// unchanged.
func (s *EmbeddingConfig) UnmarshalJSON(data []byte) error {
	// Mirror of EmbeddingConfig with Model deferred as raw JSON.
	type wire struct {
		Model                json.RawMessage     `json:"model,omitempty"`
		Normalize            *bool               `json:"normalize,omitempty"`
		BatchSize            *uint               `json:"batch_size,omitempty"`
		ShowDownloadProgress bool                `json:"show_download_progress"`
		CacheDir             *string             `json:"cache_dir,omitempty"`
		Acceleration         *AccelerationConfig `json:"acceleration,omitempty"`
		MaxEmbedDurationSecs *uint64             `json:"max_embed_duration_secs,omitempty"`
	}

	var aux wire
	if err := json.Unmarshal(data, &aux); err != nil {
		return err
	}

	// The plain fields are applied before the model is parsed, so they are
	// already set on the receiver if the model payload proves invalid.
	s.Normalize, s.BatchSize = aux.Normalize, aux.BatchSize
	s.ShowDownloadProgress, s.CacheDir = aux.ShowDownloadProgress, aux.CacheDir
	s.Acceleration, s.MaxEmbedDurationSecs = aux.Acceleration, aux.MaxEmbedDurationSecs

	// Nothing to parse when model is missing or explicitly null.
	if len(aux.Model) == 0 || string(aux.Model) == "null" {
		return nil
	}

	model, err := UnmarshalEmbeddingModelType(aux.Model)
	if err != nil {
		return err
	}
	s.Model = model
	return nil
}
// TreeSitterConfig is the configuration for tree-sitter language pack integration.
//
// Controls grammar download behavior and code analysis options.
//
// # Example (TOML)
//
// ```toml
// [tree_sitter]
// languages = ["python", "rust"]
// groups = ["web"]
//
// [tree_sitter.process]
// structure = true
// comments = true
// docstrings = true
// ```
type TreeSitterConfig struct {
// Enable code intelligence processing (default: true).
//
// When `false`, tree-sitter analysis is completely skipped even if
// the config section is present.
Enabled *bool `json:"enabled,omitempty"`
// Custom cache directory for downloaded grammars.
//
// When nil (`None` upstream), uses the default: `~/.cache/tree-sitter-language-pack/v{version}/libs/`.
CacheDir *string `json:"cache_dir,omitempty"`
// Languages to pre-download on init (e.g., `["python", "rust"]`).
Languages []string `json:"languages,omitempty"`
// Language groups to pre-download (e.g., `["web", "systems", "scripting"]`).
Groups []string `json:"groups,omitempty"`
// Processing options for code analysis. A value, not a pointer, so the
// `process` key is always serialized.
Process TreeSitterProcessConfig `json:"process"`
}
// TreeSitterProcessConfig holds processing options for tree-sitter code analysis.
//
// Controls which analysis features are enabled when extracting code files.
//
// The options documented as defaulting to true are optional pointers (omitted
// from the JSON payload when nil); those defaulting to false are plain bools
// that are always serialized.
type TreeSitterProcessConfig struct {
// Extract structural items (functions, classes, structs, etc.). Default: true.
Structure *bool `json:"structure,omitempty"`
// Extract import statements. Default: true.
Imports *bool `json:"imports,omitempty"`
// Extract export statements. Default: true.
Exports *bool `json:"exports,omitempty"`
// Extract comments. Default: false.
Comments bool `json:"comments"`
// Extract docstrings. Default: false.
Docstrings bool `json:"docstrings"`
// Extract symbol definitions. Default: false.
Symbols bool `json:"symbols"`
// Include parse diagnostics. Default: false.
Diagnostics bool `json:"diagnostics"`
// Maximum chunk size in bytes. nil (`None` upstream) disables chunking.
ChunkMaxSize *uint `json:"chunk_max_size,omitempty"`
// Content rendering mode for code extraction. A value with `omitempty`, so
// it is left out of the JSON payload when it holds its zero value.
ContentMode CodeContentMode `json:"content_mode,omitempty"`
}
// SupportedFormat is one entry in the list of supported document formats.
//
// It pairs a file extension with the MIME type that Kreuzberg can process
// for it.
type SupportedFormat struct {
	// Extension is the file extension without a leading dot,
	// e.g., "pdf", "docx".
	Extension string `json:"extension"`
	// MimeType is the MIME type string, e.g., "application/pdf".
	MimeType string `json:"mime_type"`
}
// ServerConfig is the configuration of the Kreuzberg API server.
//
// It covers the host/port settings, the CORS configuration, and the upload
// limits.
//
// # Defaults
//
//   - `host`: "127.0.0.1" (localhost only)
//   - `port`: 8000
//   - `cors_origins`: empty list (allows all origins)
//   - `max_request_body_bytes`: 104_857_600 (100 MB)
//   - `max_multipart_field_bytes`: 104_857_600 (100 MB)
type ServerConfig struct {
	// Host is the server host address (e.g., "127.0.0.1", "0.0.0.0").
	Host string `json:"host"`
	// Port is the server port number.
	Port uint16 `json:"port"`
	// CorsOrigins lists the allowed CORS origins.
	//
	// An empty list means the server accepts requests from any origin. When
	// it is populated with specific origins (e.g., `"https://example.com"`),
	// only those origins are allowed.
	CorsOrigins []string `json:"cors_origins,omitempty"`
	// MaxRequestBodyBytes is the maximum request body size in bytes
	// (default: 100 MB).
	MaxRequestBodyBytes uint `json:"max_request_body_bytes"`
	// MaxMultipartFieldBytes is the maximum multipart field size in bytes
	// (default: 100 MB).
	MaxMultipartFieldBytes uint `json:"max_multipart_field_bytes"`
}
// StructuredDataResult groups the fields of a structured-data result and
// defines their JSON encoding.
//
// Content and Format are always serialized (keys "content" and "format").
// Metadata and TextFields are dropped from the JSON output when empty
// (keys "metadata" and "text_fields").
type StructuredDataResult struct {
	Content    string            `json:"content"`
	Format     string            `json:"format"`
	Metadata   map[string]string `json:"metadata,omitempty"`
	TextFields []string          `json:"text_fields,omitempty"`
}
// DocxAppProperties holds the application properties read from
// docProps/app.xml of a DOCX file.
//
// It carries Word-specific document statistics and metadata. Every field is
// optional and is omitted from the JSON output when nil.
type DocxAppProperties struct {
	// Application is the application name (e.g., "Microsoft Office Word").
	Application *string `json:"application,omitempty"`
	// AppVersion is the application version.
	AppVersion *string `json:"app_version,omitempty"`
	// Template is the template filename.
	Template *string `json:"template,omitempty"`
	// TotalTime is the total editing time in minutes.
	TotalTime *int32 `json:"total_time,omitempty"`
	// Pages is the number of pages.
	Pages *int32 `json:"pages,omitempty"`
	// Words is the number of words.
	Words *int32 `json:"words,omitempty"`
	// Characters is the number of characters, excluding spaces.
	Characters *int32 `json:"characters,omitempty"`
	// CharactersWithSpaces is the number of characters, including spaces.
	CharactersWithSpaces *int32 `json:"characters_with_spaces,omitempty"`
	// Lines is the number of lines.
	Lines *int32 `json:"lines,omitempty"`
	// Paragraphs is the number of paragraphs.
	Paragraphs *int32 `json:"paragraphs,omitempty"`
	// Company is the company name.
	Company *string `json:"company,omitempty"`
	// DocSecurity is the document security level.
	DocSecurity *int32 `json:"doc_security,omitempty"`
	// ScaleCrop is the scale crop flag.
	ScaleCrop *bool `json:"scale_crop,omitempty"`
	// LinksUpToDate is the "links up to date" flag.
	LinksUpToDate *bool `json:"links_up_to_date,omitempty"`
	// SharedDoc is the shared document flag.
	SharedDoc *bool `json:"shared_doc,omitempty"`
	// HyperlinksChanged is the "hyperlinks changed" flag.
	HyperlinksChanged *bool `json:"hyperlinks_changed,omitempty"`
}
// XlsxAppProperties holds the application properties read from
// docProps/app.xml of an XLSX file.
//
// It carries Excel-specific document metadata. Every field is optional and
// is omitted from the JSON output when nil or empty.
type XlsxAppProperties struct {
	// Application is the application name (e.g., "Microsoft Excel").
	Application *string `json:"application,omitempty"`
	// AppVersion is the application version.
	AppVersion *string `json:"app_version,omitempty"`
	// DocSecurity is the document security level.
	DocSecurity *int32 `json:"doc_security,omitempty"`
	// ScaleCrop is the scale crop flag.
	ScaleCrop *bool `json:"scale_crop,omitempty"`
	// LinksUpToDate is the "links up to date" flag.
	LinksUpToDate *bool `json:"links_up_to_date,omitempty"`
	// SharedDoc is the shared document flag.
	SharedDoc *bool `json:"shared_doc,omitempty"`
	// HyperlinksChanged is the "hyperlinks changed" flag.
	HyperlinksChanged *bool `json:"hyperlinks_changed,omitempty"`
	// Company is the company name.
	Company *string `json:"company,omitempty"`
	// WorksheetNames lists the worksheet names.
	WorksheetNames []string `json:"worksheet_names,omitempty"`
}
// PptxAppProperties holds the application properties read from
// docProps/app.xml of a PPTX file.
//
// It carries PowerPoint-specific document metadata. Every field is optional
// and is omitted from the JSON output when nil or empty.
type PptxAppProperties struct {
	// Application is the application name
	// (e.g., "Microsoft Office PowerPoint").
	Application *string `json:"application,omitempty"`
	// AppVersion is the application version.
	AppVersion *string `json:"app_version,omitempty"`
	// TotalTime is the total editing time in minutes.
	TotalTime *int32 `json:"total_time,omitempty"`
	// Company is the company name.
	Company *string `json:"company,omitempty"`
	// DocSecurity is the document security level.
	DocSecurity *int32 `json:"doc_security,omitempty"`
	// ScaleCrop is the scale crop flag.
	ScaleCrop *bool `json:"scale_crop,omitempty"`
	// LinksUpToDate is the "links up to date" flag.
	LinksUpToDate *bool `json:"links_up_to_date,omitempty"`
	// SharedDoc is the shared document flag.
	SharedDoc *bool `json:"shared_doc,omitempty"`
	// HyperlinksChanged is the "hyperlinks changed" flag.
	HyperlinksChanged *bool `json:"hyperlinks_changed,omitempty"`
	// Slides is the number of slides.
	Slides *int32 `json:"slides,omitempty"`
	// Notes is the number of notes.
	Notes *int32 `json:"notes,omitempty"`
	// HiddenSlides is the number of hidden slides.
	HiddenSlides *int32 `json:"hidden_slides,omitempty"`
	// MultimediaClips is the number of multimedia clips.
	MultimediaClips *int32 `json:"multimedia_clips,omitempty"`
	// PresentationFormat is the presentation format
	// (e.g., "Widescreen", "Standard").
	PresentationFormat *string `json:"presentation_format,omitempty"`
	// SlideTitles lists the slide titles.
	SlideTitles []string `json:"slide_titles,omitempty"`
}
// CoreProperties holds the Dublin Core metadata read from docProps/core.xml.
//
// It contains the standard metadata fields defined by the Dublin Core
// standard together with Office-specific extensions. Every field is optional
// and is omitted from the JSON output when nil.
type CoreProperties struct {
	// Title is the document title.
	Title *string `json:"title,omitempty"`
	// Subject is the document subject/topic.
	Subject *string `json:"subject,omitempty"`
	// Creator is the document creator/author.
	Creator *string `json:"creator,omitempty"`
	// Keywords holds keywords or tags.
	Keywords *string `json:"keywords,omitempty"`
	// Description is the document description/abstract.
	Description *string `json:"description,omitempty"`
	// LastModifiedBy is the user who last modified the document.
	LastModifiedBy *string `json:"last_modified_by,omitempty"`
	// Revision is the revision number.
	Revision *string `json:"revision,omitempty"`
	// Created is the creation timestamp (ISO 8601).
	Created *string `json:"created,omitempty"`
	// Modified is the last modification timestamp (ISO 8601).
	Modified *string `json:"modified,omitempty"`
	// Category is the document category.
	Category *string `json:"category,omitempty"`
	// ContentStatus is the content status (Draft, Final, etc.).
	ContentStatus *string `json:"content_status,omitempty"`
	// Language is the document language.
	Language *string `json:"language,omitempty"`
	// Identifier is the unique identifier.
	Identifier *string `json:"identifier,omitempty"`
	// Version is the document version.
	Version *string `json:"version,omitempty"`
	// LastPrinted is the last print timestamp (ISO 8601).
	LastPrinted *string `json:"last_printed,omitempty"`
}
// SecurityLimits configures the security limits applied across extractors.
//
// All limits are intentionally conservative: they are meant to prevent DoS
// attacks while still supporting legitimate documents. The values in
// parentheses below are the limits quoted by the original documentation.
type SecurityLimits struct {
	// MaxArchiveSize is the maximum uncompressed size for archives (500 MB).
	MaxArchiveSize *uint `json:"max_archive_size,omitempty"`
	// MaxCompressionRatio is the maximum compression ratio before an archive
	// is flagged as a potential bomb (100:1).
	MaxCompressionRatio *uint `json:"max_compression_ratio,omitempty"`
	// MaxFilesInArchive is the maximum number of files in an archive (10,000).
	MaxFilesInArchive *uint `json:"max_files_in_archive,omitempty"`
	// MaxNestingDepth is the maximum nesting depth for structures (100).
	MaxNestingDepth *uint `json:"max_nesting_depth,omitempty"`
	// MaxEntityLength is the maximum length of any single XML entity,
	// attribute, or token (1 MiB).
	//
	// This is a per-token cap, NOT a total cap. Billion-laughs class attacks,
	// where a single entity expands to hundreds of MB, are caught here;
	// normal long text content (a paragraph, a CDATA block) is caught by
	// `max_content_size` instead.
	MaxEntityLength *uint `json:"max_entity_length,omitempty"`
	// MaxContentSize is the maximum string growth per document (100 MB).
	MaxContentSize *uint `json:"max_content_size,omitempty"`
	// MaxIterations is the maximum number of iterations per operation.
	MaxIterations *uint `json:"max_iterations,omitempty"`
	// MaxXMLDepth is the maximum XML depth (100 levels).
	MaxXMLDepth *uint `json:"max_xml_depth,omitempty"`
	// MaxTableCells is the maximum number of cells per table (100,000).
	MaxTableCells *uint `json:"max_table_cells,omitempty"`
}
// TokenReductionConfig groups the token-reduction options and defines their
// JSON encoding.
//
// PreserveMarkdown and EnableSemanticClustering are plain booleans and are
// always serialized. Every other field is optional and is omitted from the
// JSON output when nil or empty.
type TokenReductionConfig struct {
	Level                    *ReductionLevel     `json:"level,omitempty"`
	LanguageHint             *string             `json:"language_hint,omitempty"`
	PreserveMarkdown         bool                `json:"preserve_markdown"`
	PreserveCode             *bool               `json:"preserve_code,omitempty"`
	SemanticThreshold        *float32            `json:"semantic_threshold,omitempty"`
	EnableParallel           *bool               `json:"enable_parallel,omitempty"`
	UseSimd                  *bool               `json:"use_simd,omitempty"`
	CustomStopwords          map[string][]string `json:"custom_stopwords,omitempty"`
	PreservePatterns         []string            `json:"preserve_patterns,omitempty"`
	TargetReduction          *float32            `json:"target_reduction,omitempty"`
	EnableSemanticClustering bool                `json:"enable_semantic_clustering"`
}
// PdfAnnotation is a PDF annotation extracted from a document page.
type PdfAnnotation struct {
	// AnnotationType is the type of annotation.
	AnnotationType PdfAnnotationType `json:"annotation_type"`
	// Content is the text content of the annotation
	// (e.g., comment text, link URL).
	Content *string `json:"content,omitempty"`
	// PageNumber is the page on which the annotation appears (1-indexed).
	PageNumber uint32 `json:"page_number"`
	// BoundingBox is the bounding box of the annotation on the page.
	BoundingBox *BoundingBox `json:"bounding_box,omitempty"`
}
// DjotContent is a comprehensive Djot document structure with semantic
// preservation.
//
// It captures the full richness of Djot markup, including:
//   - Block-level structures (headings, lists, blockquotes, code blocks, etc.)
//   - Inline formatting (emphasis, strong, highlight, subscript, superscript, etc.)
//   - Attributes (classes, IDs, key-value pairs)
//   - Links, images, footnotes
//   - Math expressions (inline and display)
//   - Tables with full structure
//
// Available when the `djot` feature is enabled.
type DjotContent struct {
	// PlainText is the plain text representation, kept for backwards
	// compatibility.
	PlainText string `json:"plain_text"`
	// Blocks holds the structured block-level content.
	Blocks []FormattedBlock `json:"blocks,omitempty"`
	// Metadata holds the metadata from the YAML frontmatter.
	Metadata Metadata `json:"metadata"`
	// Tables holds the extracted tables as structured data.
	Tables []Table `json:"tables,omitempty"`
	// Images holds the extracted images with their metadata.
	Images []DjotImage `json:"images,omitempty"`
	// Links holds the extracted links with their URLs.
	Links []DjotLink `json:"links,omitempty"`
	// Footnotes holds the footnote definitions.
	Footnotes []Footnote `json:"footnotes,omitempty"`
	// Attributes holds attributes mapped by element identifier (if present).
	Attributes []string `json:"attributes,omitempty"`
}
// FormattedBlock is a block-level element in a Djot document.
//
// It represents structural elements such as headings, paragraphs, lists, and
// code blocks.
type FormattedBlock struct {
	// BlockType is the type of block element.
	BlockType BlockType `json:"block_type"`
	// Level is the heading level (1-6) for headings, or the nesting level
	// for lists.
	Level *uint `json:"level,omitempty"`
	// InlineContent is the inline content within the block.
	InlineContent []InlineElement `json:"inline_content,omitempty"`
	// Attributes holds the element attributes
	// (classes, IDs, key-value pairs).
	Attributes *string `json:"attributes,omitempty"`
	// Language is the language identifier for code blocks.
	Language *string `json:"language,omitempty"`
	// Code is the raw code content for code blocks.
	Code *string `json:"code,omitempty"`
	// Children holds the nested blocks of containers
	// (blockquotes, list items, divs).
	Children []FormattedBlock `json:"children,omitempty"`
}
// InlineElement is an inline element within a block.
//
// It represents text with formatting, links, images, etc.
type InlineElement struct {
	// ElementType is the type of inline element.
	ElementType InlineType `json:"element_type"`
	// Content is the text content.
	Content string `json:"content"`
	// Attributes holds the element attributes.
	Attributes *string `json:"attributes,omitempty"`
	// Metadata holds additional metadata
	// (e.g., href for links, src/alt for images).
	Metadata map[string]string `json:"metadata,omitempty"`
}
// DjotImage is an image element in a Djot document.
type DjotImage struct {
	// Src is the image source URL or path.
	Src string `json:"src"`
	// Alt is the alternative text.
	Alt string `json:"alt"`
	// Title is the optional title.
	Title *string `json:"title,omitempty"`
	// Attributes holds the element attributes.
	Attributes *string `json:"attributes,omitempty"`
}
// DjotLink is a link element in a Djot document.
type DjotLink struct {
	// URL is the link URL.
	URL string `json:"url"`
	// Text is the link text content.
	Text string `json:"text"`
	// Title is the optional title.
	Title *string `json:"title,omitempty"`
	// Attributes holds the element attributes.
	Attributes *string `json:"attributes,omitempty"`
}
// Footnote is a footnote definition in a Djot document.
type Footnote struct {
	// Label is the footnote label.
	Label string `json:"label"`
	// Content holds the footnote content blocks.
	Content []FormattedBlock `json:"content,omitempty"`
}
// DocumentStructure is the top-level structured document representation.
//
// It is a flat array of nodes whose index-based parent/child references form
// a tree. Root-level nodes have no parent. The original documentation points
// to `body_roots()` and `furniture_roots()` for iterating over top-level
// content by layer.
//
// # Validation
//
// The original documentation recommends calling `validate()` after
// construction to verify that all node indices are in bounds and that
// parent-child relationships are bidirectionally consistent.
//
// NOTE(review): `body_roots()`, `furniture_roots()` and `validate()` are
// names taken from the original documentation; confirm whether this binding
// exposes equivalents.
type DocumentStructure struct {
	// Nodes holds all nodes in document/reading order.
	Nodes []DocumentNode `json:"nodes,omitempty"`
	// SourceFormat is the origin format identifier
	// (e.g. "docx", "pptx", "html", "pdf").
	//
	// It allows renderers to apply format-aware heuristics when converting
	// the document tree to output formats.
	SourceFormat *string `json:"source_format,omitempty"`
	// Relationships holds the resolved relationships between nodes
	// (footnote refs, citations, anchor links, etc.).
	//
	// It is populated during derivation from the internal document
	// representation and is empty when no relationships are detected.
	Relationships []DocumentRelationship `json:"relationships,omitempty"`
	// NodeTypes is the sorted, deduplicated list of node type names present
	// in this document.
	//
	// Each value is the snake_case `node_type` tag of the corresponding
	// [NodeContent] variant (e.g. `"paragraph"`, `"heading"`, `"table"`, …).
	//
	// It is computed from Nodes by `DocumentStructure::finalize_node_types`
	// and stays empty until that step has run (internal construction paths
	// call it at the end of derivation).
	NodeTypes []string `json:"node_types,omitempty"`
}
// DocumentRelationship is a resolved relationship between two nodes in the
// document tree.
type DocumentRelationship struct {
	// Source is the index of the source node (the referencing node).
	Source uint32 `json:"source"`
	// Target is the index of the target node (the referenced node).
	Target uint32 `json:"target"`
	// Kind is the semantic kind of the relationship.
	Kind RelationshipKind `json:"kind"`
}
// DocumentNode is a single node in the document tree.
//
// Each node has a deterministic `id`, typed `content`, an optional
// `parent`/`children` pair for the tree structure, and metadata such as page
// number, bounding box, and content layer.
type DocumentNode struct {
	// ID is the deterministic identifier (hash of content + position).
	ID string `json:"id"`
	// Content is the node content: a tagged enum carrying type-specific
	// data only.
	Content NodeContent `json:"content"`
	// Parent is the index of the parent node (nil = root-level node).
	Parent *uint32 `json:"parent,omitempty"`
	// Children holds the child node indices in reading order.
	Children []uint32 `json:"children,omitempty"`
	// ContentLayer is the content layer classification.
	ContentLayer ContentLayer `json:"content_layer"`
	// Page is the page number where this node starts (1-indexed).
	Page *uint32 `json:"page,omitempty"`
	// PageEnd is the page number where this node ends
	// (for multi-page tables/sections).
	PageEnd *uint32 `json:"page_end,omitempty"`
	// Bbox is the bounding box in document coordinates.
	Bbox *BoundingBox `json:"bbox,omitempty"`
	// Annotations holds the inline annotations (formatting, links) on this
	// node's text content.
	//
	// Only meaningful for text-carrying nodes; empty for containers.
	Annotations []TextAnnotation `json:"annotations,omitempty"`
	// Attributes holds format-specific key-value attributes.
	//
	// It is an extensible bag for miscellaneous data without a dedicated
	// typed field: CSS classes, LaTeX environment names, Excel cell
	// formulas, slide layout names, etc.
	Attributes map[string]string `json:"attributes,omitempty"`
}
// UnmarshalJSON implements json.Unmarshaler for DocumentNode.
//
// All fields except "content" are decoded directly through encoding/json.
// The "content" value is captured as raw JSON and handed to
// UnmarshalNodeContent, which produces the NodeContent value.
//
// Every field of s is overwritten: a key that is absent from data, or a
// "content" that is absent or null, leaves the corresponding field at its
// zero value. Content is reset as well, so a reused receiver never keeps a
// stale Content from an earlier decode. s is modified only after the whole
// payload, including the content, has decoded successfully.
//
// It returns the error from json.Unmarshal or UnmarshalNodeContent unchanged.
func (s *DocumentNode) UnmarshalJSON(data []byte) error {
	var raw struct {
		ID           string            `json:"id"`
		Content      json.RawMessage   `json:"content,omitempty"`
		Parent       *uint32           `json:"parent,omitempty"`
		Children     []uint32          `json:"children,omitempty"`
		ContentLayer ContentLayer      `json:"content_layer"`
		Page         *uint32           `json:"page,omitempty"`
		PageEnd      *uint32           `json:"page_end,omitempty"`
		Bbox         *BoundingBox      `json:"bbox,omitempty"`
		Annotations  []TextAnnotation  `json:"annotations,omitempty"`
		Attributes   map[string]string `json:"attributes,omitempty"`
	}
	if err := json.Unmarshal(data, &raw); err != nil {
		return err
	}

	// Stays at the zero value unless a non-null "content" payload is present.
	var content NodeContent
	if len(raw.Content) > 0 && string(raw.Content) != "null" {
		v, err := UnmarshalNodeContent(raw.Content)
		if err != nil {
			return err
		}
		content = v
	}

	// Commit only once nothing can fail any more, so an error never leaves
	// the receiver half-populated.
	s.ID = raw.ID
	s.Content = content
	s.Parent = raw.Parent
	s.Children = raw.Children
	s.ContentLayer = raw.ContentLayer
	s.Page = raw.Page
	s.PageEnd = raw.PageEnd
	s.Bbox = raw.Bbox
	s.Annotations = raw.Annotations
	s.Attributes = raw.Attributes
	return nil
}
// TableGrid is a structured table grid with cell-level metadata.
//
// It stores the row/column dimensions and a flat list of cells with their
// position info.
type TableGrid struct {
	// Rows is the number of rows in the table.
	Rows uint32 `json:"rows"`
	// Cols is the number of columns in the table.
	Cols uint32 `json:"cols"`
	// Cells holds all cells in row-major order.
	Cells []GridCell `json:"cells,omitempty"`
}
// GridCell is an individual grid cell with position and span metadata.
type GridCell struct {
	// Content is the cell text content.
	Content string `json:"content"`
	// Row is the zero-indexed row position.
	Row uint32 `json:"row"`
	// Col is the zero-indexed column position.
	Col uint32 `json:"col"`
	// RowSpan is the number of rows this cell spans.
	RowSpan uint32 `json:"row_span"`
	// ColSpan is the number of columns this cell spans.
	ColSpan uint32 `json:"col_span"`
	// IsHeader reports whether this is a header cell.
	IsHeader bool `json:"is_header"`
	// Bbox is the bounding box for this cell (if available).
	Bbox *BoundingBox `json:"bbox,omitempty"`
}
// TextAnnotation is an inline text annotation: byte-range based formatting
// and links.
//
// Annotations reference byte offsets into the node's text content, which
// enables precise identification of formatted regions.
type TextAnnotation struct {
	// Start is the start byte offset in the node's text content (inclusive).
	Start uint32 `json:"start"`
	// End is the end byte offset in the node's text content (exclusive).
	End uint32 `json:"end"`
	// Kind is the annotation type.
	Kind AnnotationKind `json:"kind"`
}
// UnmarshalJSON implements json.Unmarshaler for TextAnnotation.
//
// "start" and "end" are decoded directly through encoding/json. The "kind"
// value is captured as raw JSON and handed to UnmarshalAnnotationKind, which
// produces the AnnotationKind value.
//
// Every field of s is overwritten: a key that is absent from data, or a
// "kind" that is absent or null, leaves the corresponding field at its zero
// value. Kind is reset as well, so a reused receiver never keeps a stale
// Kind from an earlier decode. s is modified only after the whole payload,
// including the kind, has decoded successfully.
//
// It returns the error from json.Unmarshal or UnmarshalAnnotationKind
// unchanged.
func (s *TextAnnotation) UnmarshalJSON(data []byte) error {
	var raw struct {
		Start uint32          `json:"start"`
		End   uint32          `json:"end"`
		Kind  json.RawMessage `json:"kind,omitempty"`
	}
	if err := json.Unmarshal(data, &raw); err != nil {
		return err
	}

	// Stays at the zero value unless a non-null "kind" payload is present.
	var kind AnnotationKind
	if len(raw.Kind) > 0 && string(raw.Kind) != "null" {
		v, err := UnmarshalAnnotationKind(raw.Kind)
		if err != nil {
			return err
		}
		kind = v
	}

	// Commit only once nothing can fail any more, so an error never leaves
	// the receiver half-populated.
	s.Start = raw.Start
	s.End = raw.End
	s.Kind = kind
	return nil
}
// ExtractionResult is the general extraction result used by the core
// extraction API.
//
// It is the main result type returned by all extraction functions.
type ExtractionResult struct {
	Content  string   `json:"content"`
	MimeType string   `json:"mime_type"`
	Metadata Metadata `json:"metadata"`
	// ExtractionMethod is the extraction strategy used to produce the
	// returned text.
	//
	// It is populated when the extractor can reliably distinguish native
	// text extraction, OCR-only extraction, or mixed native/OCR output.
	ExtractionMethod  *ExtractionMethod `json:"extraction_method,omitempty"`
	Tables            []Table           `json:"tables,omitempty"`
	DetectedLanguages []string          `json:"detected_languages,omitempty"`
	// Chunks holds the text chunks when chunking is enabled.
	//
	// When a chunking configuration is provided, the content is split into
	// overlapping chunks for efficient processing. Each chunk contains the
	// text, optional embeddings (if enabled), and metadata about its
	// position.
	Chunks []Chunk `json:"chunks,omitempty"`
	// Images holds the images extracted from the document.
	//
	// When image extraction is enabled via `ImageExtractionConfig`, this
	// field contains all images found in the document with their raw data
	// and metadata. Each image may optionally contain a nested `ocr_result`
	// if OCR was performed.
	Images []ExtractedImage `json:"images,omitempty"`
	// Pages holds the per-page content when page extraction is enabled.
	//
	// When page extraction is configured, the document is split into
	// per-page content with tables and images mapped to their respective
	// pages.
	Pages []PageContent `json:"pages,omitempty"`
	// Elements holds the semantic elements when the element-based result
	// format is enabled.
	//
	// When result_format is set to ElementBased, this field contains
	// semantic elements with type classification, unique identifiers, and
	// metadata for Unstructured-compatible element-based processing.
	Elements []Element `json:"elements,omitempty"`
	// DjotContent is the rich Djot content structure (when extracting Djot
	// documents).
	//
	// When extracting Djot documents with structured extraction enabled,
	// this field contains the full semantic structure including:
	//   - Block-level elements with nesting
	//   - Inline formatting with attributes
	//   - Links, images, footnotes
	//   - Math expressions
	//   - Complete attribute information
	//
	// The `content` field still contains plain text for backward
	// compatibility.
	//
	// Always nil for non-Djot documents.
	DjotContent *DjotContent `json:"djot_content,omitempty"`
	// OcrElements holds the OCR elements with full spatial and confidence
	// metadata.
	//
	// When OCR is performed with element extraction enabled, this field
	// contains the structured representation of detected text including:
	//   - Bounding geometry (rectangles or quadrilaterals)
	//   - Confidence scores (detection and recognition)
	//   - Rotation information
	//   - Hierarchical relationships (Tesseract only)
	//
	// This field preserves all metadata that would otherwise be lost when
	// converting to plain text or markdown output formats.
	//
	// Only populated when `OcrElementConfig.include_elements` is true.
	OcrElements []OcrElement `json:"ocr_elements,omitempty"`
	// Document is the structured document tree (when document structure
	// extraction is enabled).
	//
	// When `include_document_structure` is true in `ExtractionConfig`, this
	// field contains the full hierarchical representation of the document
	// including:
	//   - Heading-driven section nesting
	//   - Table grids with cell-level metadata
	//   - Content layer classification (body, header, footer, footnote)
	//   - Inline text annotations (formatting, links)
	//   - Bounding boxes and page numbers
	//
	// Independent of `result_format`: it can be combined with Unified or
	// ElementBased.
	Document *DocumentStructure `json:"document,omitempty"`
	// ExtractedKeywords holds the extracted keywords when keyword extraction
	// is enabled.
	//
	// When keyword extraction (RAKE or YAKE) is configured, this field
	// contains the extracted keywords with scores, algorithm info, and
	// position data. Previously stored in `metadata.additional["keywords"]`.
	ExtractedKeywords []Keyword `json:"extracted_keywords,omitempty"`
	// QualityScore is the document quality score from quality analysis.
	//
	// It is a value between 0.0 and 1.0 indicating the overall text quality.
	// Previously stored in `metadata.additional["quality_score"]`.
	QualityScore *float64 `json:"quality_score,omitempty"`
	// ProcessingWarnings holds the non-fatal warnings collected during
	// processing pipeline stages.
	//
	// It captures errors from optional pipeline features (embedding,
	// chunking, language detection, output formatting) that don't prevent
	// extraction but may indicate degraded results. Previously stored as
	// individual keys in `metadata.additional`.
	ProcessingWarnings []ProcessingWarning `json:"processing_warnings,omitempty"`
	// Annotations holds the PDF annotations extracted from the document.
	//
	// When annotation extraction is enabled via
	// `PdfConfig::extract_annotations`, this field contains text notes,
	// highlights, links, stamps, and other annotations found in PDF
	// documents.
	Annotations []PdfAnnotation `json:"annotations,omitempty"`
	// Children holds the nested extraction results from archive contents.
	//
	// When extracting archives, each processable file inside produces its
	// own full extraction result. Left unset for non-archive formats. Use
	// `max_archive_depth` in the config to control recursion depth.
	Children []ArchiveEntry `json:"children,omitempty"`
	// Uris holds the URIs/links discovered during document extraction.
	//
	// It contains hyperlinks, image references, citations, email addresses,
	// and other URI-like references found in the document. Always extracted
	// when present in the source document.
	Uris []ExtractedURI `json:"uris,omitempty"`
	// Revisions holds the tracked changes embedded in the source document.
	//
	// It is populated by per-format extractors that understand
	// change-tracking metadata (DOCX `w:ins`/`w:del`/`w:rPrChange`, ODT
	// `text:change-*`, …). Every extractor leaves it unset until its
	// format-specific implementation is added. Extractors that do populate
	// this field follow the "accepted-changes" convention: inserted text is
	// present in `content`, deleted text is absent, and the revision list is
	// the separate audit trail.
	Revisions []DocumentRevision `json:"revisions,omitempty"`
	// StructuredOutput is the structured extraction output from LLM-based
	// JSON schema extraction.
	//
	// When `structured_extraction` is configured in `ExtractionConfig`, the
	// extracted document content is sent to a VLM with the provided JSON
	// schema. The response is parsed and stored here as a JSON value
	// matching the schema.
	StructuredOutput *json.RawMessage `json:"structured_output,omitempty"`
	// CodeIntelligence holds the code intelligence results from tree-sitter
	// analysis.
	//
	// It is populated when extracting source code files with the
	// `tree-sitter` feature and contains metrics, structural analysis,
	// imports/exports, comments, docstrings, symbols, diagnostics, and
	// optionally chunked code segments.
	//
	// It is stored as an opaque JSON value so that all language bindings
	// (Go, Java, C#, …) can deserialize it as a raw JSON object rather than
	// a typed struct. The underlying type is
	// `tree_sitter_language_pack::ProcessResult`.
	CodeIntelligence *json.RawMessage `json:"code_intelligence,omitempty"`
	// LlmUsage holds the LLM token usage and cost data for all LLM calls
	// made during this extraction.
	//
	// It contains one entry per LLM call. Multiple entries are produced when
	// VLM OCR, structured extraction, or LLM embeddings run during the same
	// extraction.
	//
	// Unset when no LLM was used.
	LlmUsage []LlmUsage `json:"llm_usage,omitempty"`
	// FormattedContent is the pre-rendered content in the requested output
	// format.
	//
	// It is populated during `derive_extraction_result` before tree
	// derivation consumes element data. `apply_output_format` swaps this
	// into `content` at the end of the pipeline, after post-processors have
	// operated on plain text.
	FormattedContent *string `json:"formatted_content,omitempty"`
	// OcrInternalDocument is the structured hOCR document for the
	// OCR+layout pipeline.
	//
	// When tesseract produces hOCR output, the parsed `InternalDocument`
	// carries paragraph structure with bounding boxes and confidence scores.
	// The layout classification step enriches these elements before final
	// rendering.
	OcrInternalDocument *string `json:"ocr_internal_document,omitempty"`
}
// ArchiveEntry is a single file extracted from an archive.
//
// When archives (ZIP, TAR, 7Z, GZIP) are extracted with recursive extraction
// enabled, each processable file produces its own full `ExtractionResult`.
type ArchiveEntry struct {
	// Path is the archive-relative file path (e.g. "folder/document.pdf").
	Path string `json:"path"`
	// MimeType is the detected MIME type of the file.
	MimeType string `json:"mime_type"`
	// Result is the full extraction result for this file.
	Result ExtractionResult `json:"result"`
}
// ProcessingWarning is a non-fatal warning from a processing pipeline stage.
//
// It captures errors from optional features that don't prevent extraction
// but may indicate degraded results.
type ProcessingWarning struct {
	// Source is the pipeline stage or feature that produced this warning
	// (e.g., "embedding", "chunking", "language_detection", "output_format").
	Source string `json:"source"`
	// Message is a human-readable description of what went wrong.
	Message string `json:"message"`
}
// LlmUsage is the token usage and cost data for a single LLM call made
// during extraction.
//
// It is populated when VLM OCR, structured extraction, or LLM-based
// embeddings are used. Multiple entries may be present when multiple LLM
// calls occur within one extraction (e.g. VLM OCR + structured extraction).
type LlmUsage struct {
	// Model is the LLM model identifier
	// (e.g. "openai/gpt-4o", "anthropic/claude-sonnet-4-20250514").
	Model string `json:"model"`
	// Source is the pipeline stage that triggered this LLM call
	// (e.g. "vlm_ocr", "structured_extraction", "embeddings").
	Source string `json:"source"`
	// InputTokens is the number of input/prompt tokens consumed.
	InputTokens *uint64 `json:"input_tokens,omitempty"`
	// OutputTokens is the number of output/completion tokens generated.
	OutputTokens *uint64 `json:"output_tokens,omitempty"`
	// TotalTokens is the total number of tokens (input + output).
	TotalTokens *uint64 `json:"total_tokens,omitempty"`
	// EstimatedCost is the estimated cost in USD based on the provider's
	// published pricing.
	EstimatedCost *float64 `json:"estimated_cost,omitempty"`
	// FinishReason is why the model stopped generating
	// (e.g. "stop", "length", "content_filter").
	FinishReason *string `json:"finish_reason,omitempty"`
}
// Chunk is a text chunk with an optional embedding and metadata.
//
// Chunks are created when chunking is enabled in `ExtractionConfig`. Each
// chunk contains the text content, an optional embedding vector (if
// embedding generation is configured), and metadata about its position in
// the document.
type Chunk struct {
	// Content is the text content of this chunk.
	Content string `json:"content"`
	// ChunkType is the semantic structural classification of this chunk.
	//
	// It is assigned by the heuristic classifier based on content patterns
	// and heading context, and defaults to `ChunkType::Unknown` when no rule
	// matches.
	ChunkType ChunkType `json:"chunk_type"`
	// Embedding is the optional embedding vector for this chunk.
	//
	// It is only populated when `EmbeddingConfig` is provided in the
	// chunking configuration. The dimensionality depends on the chosen
	// embedding model.
	Embedding []float32 `json:"embedding,omitempty"`
	// Metadata describes this chunk's position and properties.
	Metadata ChunkMetadata `json:"metadata"`
}
// HeadingContext is the heading context for a chunk within a Markdown
// document.
//
// It contains the heading hierarchy from the document root to this chunk's
// section.
type HeadingContext struct {
	// Headings is the heading hierarchy from the document root to this
	// chunk's section. Index 0 is the outermost (h1); the last element is
	// the most specific.
	Headings []HeadingLevel `json:"headings,omitempty"`
}
// HeadingLevel is a single heading in the hierarchy.
type HeadingLevel struct {
	// Level is the heading depth (1 = h1, 2 = h2, etc.).
	Level uint8 `json:"level"`
	// Text is the text content of the heading.
	Text string `json:"text"`
}
// ChunkMetadata describes a chunk's position in the original document.
type ChunkMetadata struct {
	// ByteStart is the byte offset where this chunk starts in the original
	// text (UTF-8 valid boundary).
	ByteStart uint `json:"byte_start"`
	// ByteEnd is the byte offset where this chunk ends in the original text
	// (UTF-8 valid boundary).
	ByteEnd uint `json:"byte_end"`
	// TokenCount is the number of tokens in this chunk (if available).
	//
	// It is calculated by the embedding model's tokenizer if embeddings are
	// enabled.
	TokenCount *uint `json:"token_count,omitempty"`
	// ChunkIndex is the zero-based index of this chunk in the document.
	ChunkIndex uint `json:"chunk_index"`
	// TotalChunks is the total number of chunks in the document.
	TotalChunks uint `json:"total_chunks"`
	// FirstPage is the first page number this chunk spans (1-indexed).
	//
	// Only populated when page tracking is enabled in the extraction
	// configuration.
	FirstPage *uint32 `json:"first_page,omitempty"`
	// LastPage is the last page number this chunk spans (1-indexed, equal
	// to first_page for single-page chunks).
	//
	// Only populated when page tracking is enabled in the extraction
	// configuration.
	LastPage *uint32 `json:"last_page,omitempty"`
	// HeadingContext is the heading context when using the Markdown chunker.
	//
	// It contains the heading hierarchy this chunk falls under and is only
	// populated when `ChunkerType::Markdown` is used.
	HeadingContext *HeadingContext `json:"heading_context,omitempty"`
	// ImageIndices holds indices into `ExtractionResult.images` for images
	// on pages covered by this chunk.
	//
	// It contains zero-based indices into the top-level `images` collection
	// for every image whose `page_number` falls within
	// `[first_page, last_page]`. Empty when image extraction is disabled or
	// the chunk spans no pages with images.
	ImageIndices []uint32 `json:"image_indices,omitempty"`
}
// ExtractedImage is an image extracted from a document.
//
// It contains the raw image data, metadata, and optional nested OCR results.
// Raw bytes keep the type usable across language bindings: callers can decode
// them with whatever image library they prefer.
//
// The type has a custom MarshalJSON method that encodes Data as a JSON array
// of integers rather than Go's default base64 string.
type ExtractedImage struct {
// Data is the raw image data (PNG, JPEG, WebP, etc. bytes).
// Upstream note: the Rust definition uses `bytes::Bytes` for cheap cloning
// of large buffers.
Data []byte `json:"data"`
// Format is the image format (e.g., "jpeg", "png", "webp").
// Upstream note: the Rust definition uses Cow<'static, str> to avoid
// allocation for static literals.
Format string `json:"format"`
// ImageIndex is the zero-indexed position of this image in the document/page.
ImageIndex uint32 `json:"image_index"`
// PageNumber is the page/slide number where the image was found (1-indexed).
PageNumber *uint32 `json:"page_number,omitempty"`
// Width is the image width in pixels.
Width *uint32 `json:"width,omitempty"`
// Height is the image height in pixels.
Height *uint32 `json:"height,omitempty"`
// Colorspace is the colorspace information (e.g., "RGB", "CMYK", "Gray").
Colorspace *string `json:"colorspace,omitempty"`
// BitsPerComponent is the number of bits per color component (e.g., 8, 16).
BitsPerComponent *uint32 `json:"bits_per_component,omitempty"`
// IsMask reports whether this image is a mask image.
IsMask bool `json:"is_mask"`
// Description is an optional description of the image.
Description *string `json:"description,omitempty"`
// OcrResult is the nested OCR extraction result (nil if the image was not OCRed).
//
// When OCR is performed on this image, the result is embedded here
// rather than in a separate collection, making the relationship explicit.
OcrResult *ExtractionResult `json:"ocr_result,omitempty"`
// BoundingBox is the bounding box of the image on the page
// (PDF coordinates: x0=left, y0=bottom, x1=right, y1=top).
// Only populated for PDF-extracted images when position data is available
// from the PDF extractor.
BoundingBox *BoundingBox `json:"bounding_box,omitempty"`
// SourcePath is the original source path of the image within the document
// archive (e.g., "media/image1.png" in DOCX).
// Used for rendering image references when the binary data is not extracted.
SourcePath *string `json:"source_path,omitempty"`
// ImageKind is a heuristic classification of what this image likely depicts.
// Nil if classification was disabled or inconclusive.
ImageKind *ImageKind `json:"image_kind,omitempty"`
// KindConfidence is the confidence score for `image_kind`, in the range 0.0 to 1.0.
KindConfidence *float32 `json:"kind_confidence,omitempty"`
// ClusterID is an identifier shared across images that form a single logical
// figure (e.g. all raster tiles of one technical drawing). Nil for singletons.
ClusterID *uint32 `json:"cluster_id,omitempty"`
}
// MarshalJSON implements json.Marshaler for ExtractedImage.
//
// encoding/json would normally emit the `[]byte` Data field as a base64
// string. This method emits it as a JSON array of integers instead, which is
// the shape Rust's serde `Vec<u8>` deserializer expects. Every other field is
// encoded exactly as its struct tag describes.
func (v ExtractedImage) MarshalJSON() ([]byte, error) {
	// The wire type spells out every field explicitly. Embedding
	// ExtractedImage instead would produce both a base64-string and an
	// int-array entry for the same JSON key. Field order and tags mirror
	// ExtractedImage so the encoded key order is unchanged.
	type wireImage struct {
		Data             []int             `json:"data"`
		Format           string            `json:"format"`
		ImageIndex       uint32            `json:"image_index"`
		PageNumber       *uint32           `json:"page_number,omitempty"`
		Width            *uint32           `json:"width,omitempty"`
		Height           *uint32           `json:"height,omitempty"`
		Colorspace       *string           `json:"colorspace,omitempty"`
		BitsPerComponent *uint32           `json:"bits_per_component,omitempty"`
		IsMask           bool              `json:"is_mask"`
		Description      *string           `json:"description,omitempty"`
		OcrResult        *ExtractionResult `json:"ocr_result,omitempty"`
		BoundingBox      *BoundingBox      `json:"bounding_box,omitempty"`
		SourcePath       *string           `json:"source_path,omitempty"`
		ImageKind        *ImageKind        `json:"image_kind,omitempty"`
		KindConfidence   *float32          `json:"kind_confidence,omitempty"`
		ClusterID        *uint32           `json:"cluster_id,omitempty"`
	}

	// Widen each byte to an int. The slice is always non-nil, so an image
	// without data encodes as `"data":[]` rather than `"data":null`.
	octets := make([]int, 0, len(v.Data))
	for _, octet := range v.Data {
		octets = append(octets, int(octet))
	}

	return json.Marshal(wireImage{
		Data:             octets,
		Format:           v.Format,
		ImageIndex:       v.ImageIndex,
		PageNumber:       v.PageNumber,
		Width:            v.Width,
		Height:           v.Height,
		Colorspace:       v.Colorspace,
		BitsPerComponent: v.BitsPerComponent,
		IsMask:           v.IsMask,
		Description:      v.Description,
		OcrResult:        v.OcrResult,
		BoundingBox:      v.BoundingBox,
		SourcePath:       v.SourcePath,
		ImageKind:        v.ImageKind,
		KindConfidence:   v.KindConfidence,
		ClusterID:        v.ClusterID,
	})
}
// BoundingBox holds bounding box coordinates for element positioning.
type BoundingBox struct {
// X0 is the left x-coordinate.
X0 float64 `json:"x0"`
// Y0 is the bottom y-coordinate.
Y0 float64 `json:"y0"`
// X1 is the right x-coordinate.
X1 float64 `json:"x1"`
// Y1 is the top y-coordinate.
Y1 float64 `json:"y1"`
}
// ElementMetadata holds metadata for a semantic element.
//
// All fields are optional and are omitted from the JSON encoding when unset.
type ElementMetadata struct {
// PageNumber is the page number (1-indexed).
PageNumber *uint32 `json:"page_number,omitempty"`
// Filename is the source filename or document name.
Filename *string `json:"filename,omitempty"`
// Coordinates is the bounding box of the element, if available.
Coordinates *BoundingBox `json:"coordinates,omitempty"`
// ElementIndex is the position index in the element sequence.
ElementIndex *uint `json:"element_index,omitempty"`
// Additional holds additional custom metadata as string key/value pairs.
Additional map[string]string `json:"additional,omitempty"`
}
// Element is a semantic element extracted from a document.
//
// It represents a logical unit of content with a semantic classification,
// a unique identifier, and metadata for tracking origin and position.
type Element struct {
// ElementID is the unique element identifier.
ElementID string `json:"element_id"`
// ElementType is the semantic type of this element.
ElementType ElementType `json:"element_type"`
// Text is the text content of the element.
Text string `json:"text"`
// Metadata is the metadata about the element.
Metadata ElementMetadata `json:"metadata"`
}
// ExcelWorkbook is the representation of an Excel workbook.
//
// It contains all sheets from an Excel file (.xlsx, .xls, etc.) with
// extracted content and metadata.
type ExcelWorkbook struct {
// Sheets holds all sheets in the workbook.
Sheets []ExcelSheet `json:"sheets,omitempty"`
// Metadata is the workbook-level metadata (author, creation date, etc.).
Metadata map[string]string `json:"metadata,omitempty"`
// Revisions holds collaborative-edit revision headers from
// `xl/revisions/revisionHeaders.xml`.
//
// Populated for legacy shared-workbook `.xlsx` files that contain the
// `xl/revisions/` directory. Each `<header>` element maps to one
// `DocumentRevision { kind: FormatChange }` carrying the header's `guid`
// (→ `revision_id`), `userName` (→ `author`), and `dateTime` (→ `timestamp`).
// `anchor` and `delta` are `None`/empty for v1 (per-cell log parsing is a
// follow-up). Absent (`None` upstream) when
// `xl/revisions/revisionHeaders.xml` is not present.
Revisions []DocumentRevision `json:"revisions,omitempty"`
}
// ExcelSheet is a single Excel worksheet.
//
// It represents one sheet from an Excel workbook with its content
// converted to Markdown format, plus dimensional statistics.
type ExcelSheet struct {
// Name is the sheet name as it appears in Excel.
Name string `json:"name"`
// Markdown is the sheet content converted to Markdown tables.
Markdown string `json:"markdown"`
// RowCount is the number of rows.
RowCount uint `json:"row_count"`
// ColCount is the number of columns.
ColCount uint `json:"col_count"`
// CellCount is the total number of non-empty cells.
CellCount uint `json:"cell_count"`
// TableCells holds the pre-extracted table cells (a 2D slice of cell values).
// Populated during Markdown generation to avoid re-parsing the Markdown.
// Absent (`None` upstream) for empty sheets.
TableCells [][]string `json:"table_cells,omitempty"`
}
// XMLExtractionResult is the result of extracting an XML document.
//
// It contains the extracted text content from an XML file along with
// structural statistics about the XML document.
type XMLExtractionResult struct {
// Content is the extracted text content (XML structure filtered out).
Content string `json:"content"`
// ElementCount is the total number of XML elements processed.
ElementCount uint `json:"element_count"`
// UniqueElements is the list of unique element names found (sorted).
UniqueElements []string `json:"unique_elements,omitempty"`
}
// TextExtractionResult is the result of extracting a plain text or Markdown file.
//
// It contains the extracted text along with statistics and,
// for Markdown files, structural elements like headers and links.
type TextExtractionResult struct {
// Content is the extracted text content.
Content string `json:"content"`
// LineCount is the number of lines.
LineCount uint `json:"line_count"`
// WordCount is the number of words.
WordCount uint `json:"word_count"`
// CharacterCount is the number of characters.
CharacterCount uint `json:"character_count"`
// Headers holds the Markdown headers (text only; Markdown files only).
Headers []string `json:"headers,omitempty"`
// Links holds the Markdown links as (text, URL) tuples, each tuple being one
// inner slice (Markdown files only).
Links [][]string `json:"links,omitempty"`
// CodeBlocks holds the code blocks as (language, code) tuples, each tuple
// being one inner slice (Markdown files only).
CodeBlocks [][]string `json:"code_blocks,omitempty"`
}
// PptxExtractionResult is the result of extracting a PowerPoint (PPTX) file.
//
// It contains the extracted slide content, metadata, and embedded images/tables.
type PptxExtractionResult struct {
// Content is the extracted text content from all slides.
Content string `json:"content"`
// Metadata is the presentation metadata.
Metadata PptxMetadata `json:"metadata"`
// SlideCount is the total number of slides.
SlideCount uint `json:"slide_count"`
// ImageCount is the total number of embedded images.
ImageCount uint `json:"image_count"`
// TableCount is the total number of tables.
TableCount uint `json:"table_count"`
// Images holds the images extracted from the presentation.
Images []ExtractedImage `json:"images,omitempty"`
// PageStructure is the slide structure with boundaries (when page tracking is enabled).
PageStructure *PageStructure `json:"page_structure,omitempty"`
// PageContents holds the per-slide content (when page tracking is enabled).
PageContents []PageContent `json:"page_contents,omitempty"`
// Document is the structured document representation.
Document *DocumentStructure `json:"document,omitempty"`
// Hyperlinks holds the hyperlinks discovered in slides as (url, optional_label) pairs.
// NOTE(review): the Go type is a flat []string while the description speaks of
// pairs — confirm the wire format emitted by the native library.
Hyperlinks []string `json:"hyperlinks,omitempty"`
// OfficeMetadata is the Office metadata extracted from docProps/core.xml and
// docProps/app.xml.
//
// Contains keys like "title", "author", "created_by", "subject", "keywords",
// "modified_by", "created_at", "modified_at", etc.
OfficeMetadata map[string]string `json:"office_metadata,omitempty"`
// Revisions holds slide comments represented as revisions.
//
// Each `<p:cm>` element in `ppt/comments/comment{N}.xml` becomes a
// `DocumentRevision { kind: Comment }` with author (resolved from
// `ppt/commentAuthors.xml`), ISO-8601 timestamp, and
// `RevisionAnchor::Slide { index }`. Absent (`None` upstream) when no comment
// XML parts exist.
Revisions []DocumentRevision `json:"revisions,omitempty"`
}
// EmailExtractionResult is the result of extracting an email message.
//
// It is the complete representation of an extracted email message (.eml or .msg),
// including headers, body content, and attachments.
type EmailExtractionResult struct {
// Subject is the email subject line.
Subject *string `json:"subject,omitempty"`
// FromEmail is the sender email address.
FromEmail *string `json:"from_email,omitempty"`
// ToEmails holds the primary recipient email addresses.
ToEmails []string `json:"to_emails,omitempty"`
// CcEmails holds the CC recipient email addresses.
CcEmails []string `json:"cc_emails,omitempty"`
// BccEmails holds the BCC recipient email addresses.
BccEmails []string `json:"bcc_emails,omitempty"`
// Date is the email date/timestamp.
Date *string `json:"date,omitempty"`
// MessageID is the Message-ID header value.
MessageID *string `json:"message_id,omitempty"`
// PlainText is the plain text version of the email body.
PlainText *string `json:"plain_text,omitempty"`
// HTMLContent is the HTML version of the email body.
HTMLContent *string `json:"html_content,omitempty"`
// Content is the cleaned/processed text content. Upstream note: aliased as
// `cleaned_text` for back-compat.
Content string `json:"content"`
// Attachments is the list of email attachments.
Attachments []EmailAttachment `json:"attachments,omitempty"`
// Metadata holds additional email headers and metadata.
Metadata map[string]string `json:"metadata,omitempty"`
}
// EmailAttachment represents a single email attachment.
//
// It carries the attachment's metadata and, when extracted, its content.
type EmailAttachment struct {
	// Name is the attachment name (from the Content-Disposition header).
	Name *string `json:"name,omitempty"`
	// Filename is the filename of the attachment.
	Filename *string `json:"filename,omitempty"`
	// MimeType is the MIME type of the attachment.
	MimeType *string `json:"mime_type,omitempty"`
	// Size is the size in bytes.
	Size *uint `json:"size,omitempty"`
	// IsImage reports whether this attachment is an image.
	IsImage bool `json:"is_image"`
	// Data is the attachment data (if extracted).
	// Upstream note: the Rust definition uses `bytes::Bytes` for cheap
	// cloning of large buffers.
	Data []byte `json:"data,omitempty"`
}

// MarshalJSON implements json.Marshaler for EmailAttachment.
//
// encoding/json would normally emit the `[]byte` Data field as a base64
// string. This method emits it as a JSON array of integers instead, which is
// the shape Rust's serde `Vec<u8>` deserializer expects. A nil or empty Data
// is dropped from the output by `omitempty`.
func (v EmailAttachment) MarshalJSON() ([]byte, error) {
	// The wire type spells out every field explicitly. Embedding
	// EmailAttachment instead would produce both a base64-string and an
	// int-array entry for the same JSON key. Field order and tags mirror
	// EmailAttachment so the encoded key order is unchanged.
	type wireAttachment struct {
		Name     *string `json:"name,omitempty"`
		Filename *string `json:"filename,omitempty"`
		MimeType *string `json:"mime_type,omitempty"`
		Size     *uint   `json:"size,omitempty"`
		IsImage  bool    `json:"is_image"`
		Data     []int   `json:"data,omitempty"`
	}

	// Widen each byte to an int; a nil payload stays nil.
	var octets []int
	if v.Data != nil {
		octets = make([]int, 0, len(v.Data))
		for _, octet := range v.Data {
			octets = append(octets, int(octet))
		}
	}

	return json.Marshal(wireAttachment{
		Name:     v.Name,
		Filename: v.Filename,
		MimeType: v.MimeType,
		Size:     v.Size,
		IsImage:  v.IsImage,
		Data:     octets,
	})
}
// OcrExtractionResult is the result of an OCR extraction.
//
// It is the result of performing OCR on an image or scanned document,
// including the recognized text and detected tables.
type OcrExtractionResult struct {
// Content is the recognized text content.
Content string `json:"content"`
// MimeType is the original MIME type of the processed image.
MimeType string `json:"mime_type"`
// Metadata is the OCR processing metadata (confidence scores, language, etc.).
// Values are kept as raw JSON so callers decode them as needed.
Metadata map[string]json.RawMessage `json:"metadata,omitempty"`
// Tables holds the tables detected and extracted via OCR.
Tables []OcrTable `json:"tables,omitempty"`
// OcrElements holds the structured OCR elements with bounding boxes and
// confidence scores.
// Available when TSV output is requested or table detection is enabled.
OcrElements []OcrElement `json:"ocr_elements,omitempty"`
// InternalDocument is the structured document produced from hOCR parsing.
// It carries paragraph structure, bounding boxes, and confidence scores
// that the flattened `content` string discards.
// NOTE(review): exposed here as an optional string — confirm its encoding.
InternalDocument *string `json:"internal_document,omitempty"`
}
// OcrTable is a table detected via OCR.
//
// It represents a table structure recognized during OCR processing.
type OcrTable struct {
// Cells holds the table cells as a 2D slice (rows × columns).
Cells [][]string `json:"cells,omitempty"`
// Markdown is the Markdown representation of the table.
Markdown string `json:"markdown"`
// PageNumber is the page number where the table was found (1-indexed).
PageNumber uint32 `json:"page_number"`
// BoundingBox is the bounding box of the table in pixel coordinates
// (from OCR word positions).
BoundingBox *OcrTableBoundingBox `json:"bounding_box,omitempty"`
}
// OcrTableBoundingBox is the bounding box for an OCR-detected table, in pixel coordinates.
type OcrTableBoundingBox struct {
// Left is the left x-coordinate (pixels).
Left uint32 `json:"left"`
// Top is the top y-coordinate (pixels).
Top uint32 `json:"top"`
// Right is the right x-coordinate (pixels).
Right uint32 `json:"right"`
// Bottom is the bottom y-coordinate (pixels).
Bottom uint32 `json:"bottom"`
}
// ImagePreprocessingConfig is the image preprocessing configuration for OCR.
//
// These settings control how images are preprocessed before OCR to improve
// text recognition quality. Different preprocessing strategies work better
// for different document types.
//
// Pointer fields are omitted from the JSON encoding when nil; the plain bool
// fields are always encoded, including when false.
type ImagePreprocessingConfig struct {
// TargetDpi is the target DPI for the image (300 is standard, 600 for small text).
TargetDpi *int32 `json:"target_dpi,omitempty"`
// AutoRotate enables auto-detection and correction of image rotation.
AutoRotate *bool `json:"auto_rotate,omitempty"`
// Deskew enables skew correction (tilted images).
Deskew *bool `json:"deskew,omitempty"`
// Denoise enables noise removal from the image.
Denoise bool `json:"denoise"`
// ContrastEnhance enables contrast enhancement for better text visibility.
ContrastEnhance bool `json:"contrast_enhance"`
// BinarizationMethod is the binarization method: "otsu", "sauvola", "adaptive".
BinarizationMethod *string `json:"binarization_method,omitempty"`
// InvertColors inverts colors (white text on black → black on white).
InvertColors bool `json:"invert_colors"`
}
// TesseractConfig is the Tesseract OCR configuration.
//
// It provides fine-grained control over Tesseract OCR engine parameters.
// Most users can use the defaults, but these settings allow optimization
// for specific document types (invoices, handwriting, etc.).
//
// Pointer fields are omitted from the JSON encoding when nil; non-pointer
// fields are always encoded, including when they hold their zero value.
type TesseractConfig struct {
// Language is the language code (e.g., "eng", "deu", "fra").
Language *string `json:"language,omitempty"`
// Psm is the Page Segmentation Mode (0-13).
//
// Common values:
// - 3: Fully automatic page segmentation (native default)
// - 6: Assume a single uniform block of text (WASM default — avoids layout-analysis hang)
// - 11: Sparse text with no particular order
Psm *int32 `json:"psm,omitempty"`
// OutputFormat is the output format ("text" or "markdown").
OutputFormat *string `json:"output_format,omitempty"`
// Oem is the OCR Engine Mode (0-3).
//
// - 0: Legacy engine only
// - 1: Neural nets (LSTM) only (usually best)
// - 2: Legacy + LSTM
// - 3: Default (based on what's available)
Oem *int32 `json:"oem,omitempty"`
// MinConfidence is the minimum confidence threshold (0.0-100.0).
//
// Words with confidence below this threshold may be rejected or flagged.
MinConfidence float64 `json:"min_confidence"`
// Preprocessing is the image preprocessing configuration.
//
// Controls how images are preprocessed before OCR. Can significantly
// improve quality for scanned documents or low-quality images.
Preprocessing *ImagePreprocessingConfig `json:"preprocessing,omitempty"`
// EnableTableDetection enables automatic table detection and reconstruction.
EnableTableDetection *bool `json:"enable_table_detection,omitempty"`
// TableMinConfidence is the minimum confidence threshold for table detection (0.0-1.0).
TableMinConfidence float64 `json:"table_min_confidence"`
// TableColumnThreshold is the column threshold for table detection (pixels).
TableColumnThreshold *int32 `json:"table_column_threshold,omitempty"`
// TableRowThresholdRatio is the row threshold ratio for table detection (0.0-1.0).
TableRowThresholdRatio *float64 `json:"table_row_threshold_ratio,omitempty"`
// UseCache enables OCR result caching.
UseCache *bool `json:"use_cache,omitempty"`
// ClassifyUsePreAdaptedTemplates enables the use of pre-adapted templates
// for character classification.
ClassifyUsePreAdaptedTemplates *bool `json:"classify_use_pre_adapted_templates,omitempty"`
// LanguageModelNgramOn enables the N-gram language model.
LanguageModelNgramOn bool `json:"language_model_ngram_on"`
// TesseditDontBlkrejGoodWds disables rejection of good words during
// block-level processing.
TesseditDontBlkrejGoodWds *bool `json:"tessedit_dont_blkrej_good_wds,omitempty"`
// TesseditDontRowrejGoodWds disables rejection of good words during
// row-level processing.
TesseditDontRowrejGoodWds *bool `json:"tessedit_dont_rowrej_good_wds,omitempty"`
// TesseditEnableDictCorrection enables dictionary correction.
TesseditEnableDictCorrection *bool `json:"tessedit_enable_dict_correction,omitempty"`
// TesseditCharWhitelist is the whitelist of allowed characters (empty = all allowed).
TesseditCharWhitelist string `json:"tessedit_char_whitelist"`
// TesseditCharBlacklist is the blacklist of forbidden characters (empty = none forbidden).
TesseditCharBlacklist string `json:"tessedit_char_blacklist"`
// TesseditUsePrimaryParamsModel enables the primary language params model.
TesseditUsePrimaryParamsModel *bool `json:"tessedit_use_primary_params_model,omitempty"`
// TextordSpaceSizeIsVariable enables variable-width space detection.
TextordSpaceSizeIsVariable *bool `json:"textord_space_size_is_variable,omitempty"`
// ThresholdingMethod selects the adaptive thresholding method when true.
ThresholdingMethod bool `json:"thresholding_method"`
}
// ImagePreprocessingMetadata holds image preprocessing metadata.
//
// It tracks the transformations applied to an image during OCR preprocessing,
// including DPI normalization, resizing, and resampling.
type ImagePreprocessingMetadata struct {
// OriginalDimensions is the original image dimensions (width, height) in pixels.
OriginalDimensions []uint `json:"original_dimensions,omitempty"`
// OriginalDpi is the original image DPI (horizontal, vertical).
OriginalDpi []float64 `json:"original_dpi,omitempty"`
// TargetDpi is the target DPI from the configuration.
TargetDpi int32 `json:"target_dpi"`
// ScaleFactor is the scaling factor applied to the image.
ScaleFactor float64 `json:"scale_factor"`
// AutoAdjusted reports whether the DPI was auto-adjusted based on content.
AutoAdjusted bool `json:"auto_adjusted"`
// FinalDpi is the final DPI after processing.
FinalDpi int32 `json:"final_dpi"`
// NewDimensions is the new dimensions after resizing (if resized).
NewDimensions []uint `json:"new_dimensions,omitempty"`
// ResampleMethod is the resampling algorithm used ("LANCZOS3", "CATMULLROM", etc.).
ResampleMethod string `json:"resample_method"`
// DimensionClamped reports whether dimensions were clamped to max_image_dimension.
DimensionClamped bool `json:"dimension_clamped"`
// CalculatedDpi is the calculated optimal DPI (if auto_adjust_dpi is enabled).
CalculatedDpi *int32 `json:"calculated_dpi,omitempty"`
// SkippedResize reports whether the resize was skipped (dimensions already optimal).
SkippedResize bool `json:"skipped_resize"`
// ResizeError is the error message if the resize failed.
ResizeError *string `json:"resize_error,omitempty"`
}
// Metadata is the metadata attached to an extraction result.
//
// It contains common fields applicable to all formats, format-specific metadata
// via a discriminated union, and additional custom fields from postprocessors.
type Metadata struct {
// Title is the document title.
Title *string `json:"title,omitempty"`
// Subject is the document subject or description.
Subject *string `json:"subject,omitempty"`
// Authors holds the primary author(s); always a list for consistency.
Authors []string `json:"authors,omitempty"`
// Keywords holds the keywords/tags; always a list for consistency.
Keywords []string `json:"keywords,omitempty"`
// Language is the primary language (ISO 639 code).
Language *string `json:"language,omitempty"`
// CreatedAt is the creation timestamp (ISO 8601 format).
CreatedAt *string `json:"created_at,omitempty"`
// ModifiedAt is the last modification timestamp (ISO 8601 format).
ModifiedAt *string `json:"modified_at,omitempty"`
// CreatedBy is the user who created the document.
CreatedBy *string `json:"created_by,omitempty"`
// ModifiedBy is the user who last modified the document.
ModifiedBy *string `json:"modified_by,omitempty"`
// Pages is the page/slide/sheet structure with boundaries.
Pages *PageStructure `json:"pages,omitempty"`
// Format is the format-specific metadata (discriminated union).
//
// Contains detailed metadata specific to the document format.
// Serialized as a nested `"format"` object with a `format_type` discriminator field.
Format *FormatMetadata `json:"format,omitempty"`
// ImagePreprocessing is the image preprocessing metadata (when OCR
// preprocessing was applied).
ImagePreprocessing *ImagePreprocessingMetadata `json:"image_preprocessing,omitempty"`
// JSONSchema is the JSON schema (for structured data extraction), kept as raw JSON.
JSONSchema *json.RawMessage `json:"json_schema,omitempty"`
// Error is the error metadata (for batch operations).
Error *ErrorMetadata `json:"error,omitempty"`
// ExtractionDurationMs is the extraction duration in milliseconds (for benchmarking).
//
// This field is populated by batch extraction to provide per-file timing
// information. It is nil for single-file extraction (which uses external timing).
ExtractionDurationMs *uint64 `json:"extraction_duration_ms,omitempty"`
// Category is the document category (from frontmatter or classification).
Category *string `json:"category,omitempty"`
// Tags holds the document tags (from frontmatter).
Tags []string `json:"tags,omitempty"`
// DocumentVersion is the document version string (from frontmatter).
DocumentVersion *string `json:"document_version,omitempty"`
// AbstractText is the abstract or summary text (from frontmatter).
AbstractText *string `json:"abstract_text,omitempty"`
// OutputFormat is the output format identifier (e.g., "markdown", "html", "text").
//
// Set by the output format pipeline stage when format conversion is applied.
// Previously stored in `metadata.additional["output_format"]`.
OutputFormat *string `json:"output_format,omitempty"`
// OcrUsed reports whether OCR was used during extraction.
//
// Set to `true` whenever the extraction pipeline ran an OCR backend
// (Tesseract, PaddleOCR, VLM, etc.) and used that output as the primary
// or fallback text. `false` means native text extraction was used exclusively.
OcrUsed bool `json:"ocr_used"`
// Additional holds additional custom fields from postprocessors, kept as raw JSON.
//
// Serialized as a nested `"additional"` object (not flattened at root level).
// Upstream note: the Rust definition uses `Cow<'static, str>` keys so static
// string keys avoid allocation.
Additional map[string]json.RawMessage `json:"additional,omitempty"`
}
// ExcelMetadata is the Excel/spreadsheet format metadata.
//
// It identifies the document as a spreadsheet source via the `FormatMetadata::Excel`
// discriminant. Sheet count and sheet names are stored inside this struct.
type ExcelMetadata struct {
// SheetCount is the number of sheets in the workbook.
SheetCount *uint32 `json:"sheet_count,omitempty"`
// SheetNames holds the names of all sheets in the workbook.
SheetNames []string `json:"sheet_names,omitempty"`
}
// EmailMetadata is the email metadata extracted from .eml and .msg files.
//
// It includes sender/recipient information, the message ID, and the attachment list.
type EmailMetadata struct {
// FromEmail is the sender's email address.
FromEmail *string `json:"from_email,omitempty"`
// FromName is the sender's display name.
FromName *string `json:"from_name,omitempty"`
// ToEmails holds the primary recipients.
ToEmails []string `json:"to_emails,omitempty"`
// CcEmails holds the CC recipients.
CcEmails []string `json:"cc_emails,omitempty"`
// BccEmails holds the BCC recipients.
BccEmails []string `json:"bcc_emails,omitempty"`
// MessageID is the Message-ID header value.
MessageID *string `json:"message_id,omitempty"`
// Attachments is the list of attachment filenames.
Attachments []string `json:"attachments,omitempty"`
}
// ArchiveMetadata is the archive (ZIP/TAR/7Z) metadata.
//
// It is extracted from compressed archive files and contains the file list
// and size information.
type ArchiveMetadata struct {
// Format is the archive format ("ZIP", "TAR", "7Z", etc.).
Format string `json:"format"`
// FileCount is the total number of files in the archive.
FileCount uint32 `json:"file_count"`
// FileList is the list of file paths within the archive.
FileList []string `json:"file_list,omitempty"`
// TotalSize is the total uncompressed size in bytes.
TotalSize uint64 `json:"total_size"`
// CompressedSize is the compressed size in bytes (nil if not available).
CompressedSize *uint64 `json:"compressed_size,omitempty"`
}
// ImageMetadata is the image metadata extracted from image files.
//
// It includes dimensions, format, and EXIF data.
type ImageMetadata struct {
// Width is the image width in pixels.
Width uint32 `json:"width"`
// Height is the image height in pixels.
Height uint32 `json:"height"`
// Format is the image format (e.g., "PNG", "JPEG", "TIFF").
Format string `json:"format"`
// Exif holds the EXIF metadata tags as string key/value pairs.
Exif map[string]string `json:"exif,omitempty"`
}
// XMLMetadata is the XML metadata extracted during XML parsing.
//
// It provides statistics about the XML document structure.
type XMLMetadata struct {
// ElementCount is the total number of XML elements processed.
ElementCount uint32 `json:"element_count"`
// UniqueElements is the list of unique element tag names (sorted).
UniqueElements []string `json:"unique_elements,omitempty"`
}
// TextMetadata is the text/Markdown metadata.
//
// It is extracted from plain text and Markdown files. It includes word counts and,
// for Markdown, structural elements like headers and links.
type TextMetadata struct {
// LineCount is the number of lines in the document.
LineCount uint32 `json:"line_count"`
// WordCount is the number of words.
WordCount uint32 `json:"word_count"`
// CharacterCount is the number of characters.
CharacterCount uint32 `json:"character_count"`
// Headers holds the Markdown headers (heading text only, for Markdown files).
Headers []string `json:"headers,omitempty"`
// Links holds the Markdown links as (text, url) tuples, each tuple being one
// inner slice (for Markdown files).
Links [][]string `json:"links,omitempty"`
// CodeBlocks holds the code blocks as (language, code) tuples, each tuple
// being one inner slice (for Markdown files).
CodeBlocks [][]string `json:"code_blocks,omitempty"`
}
// HeaderMetadata is the metadata for a header/heading element.
type HeaderMetadata struct {
// Level is the header level: 1 (h1) through 6 (h6).
Level uint8 `json:"level"`
// Text is the normalized text content of the header.
Text string `json:"text"`
// ID is the HTML id attribute, if present.
ID *string `json:"id,omitempty"`
// Depth is the document tree depth at the header element.
Depth uint32 `json:"depth"`
// HTMLOffset is the byte offset in the original HTML document.
HTMLOffset uint32 `json:"html_offset"`
}
// LinkMetadata is the metadata for a link element.
type LinkMetadata struct {
// Href is the href URL value.
Href string `json:"href"`
// Text is the link text content (normalized).
Text string `json:"text"`
// Title is the optional title attribute.
Title *string `json:"title,omitempty"`
// LinkType is the link type classification.
LinkType LinkType `json:"link_type"`
// Rel holds the rel attribute values.
Rel []string `json:"rel,omitempty"`
// Attributes holds additional attributes as key-value pairs, each pair being
// one inner slice.
Attributes [][]string `json:"attributes,omitempty"`
}
// ImageMetadataType is the metadata for an image element.
type ImageMetadataType struct {
// Src is the image source (URL, data URI, or SVG content).
Src string `json:"src"`
// Alt is the alternative text from the alt attribute.
Alt *string `json:"alt,omitempty"`
// Title is the title attribute.
Title *string `json:"title,omitempty"`
// Dimensions is the image dimensions as (width, height), if available.
Dimensions []uint32 `json:"dimensions,omitempty"`
// ImageType is the image type classification.
ImageType ImageType `json:"image_type"`
// Attributes holds additional attributes as key-value pairs, each pair being
// one inner slice.
Attributes [][]string `json:"attributes,omitempty"`
}
// StructuredData is a structured data (Schema.org, microdata, RDFa) block.
type StructuredData struct {
// DataType is the type of structured data.
DataType StructuredDataType `json:"data_type"`
// RawJSON is the raw JSON string representation.
RawJSON string `json:"raw_json"`
// SchemaType is the schema type, if detectable (e.g., "Article", "Event", "Product").
SchemaType *string `json:"schema_type,omitempty"`
}
// HTMLMetadata is the HTML metadata extracted from HTML documents.
//
// It includes document-level metadata, Open Graph data, Twitter Card metadata,
// and extracted structural elements (headers, links, images, structured data).
type HTMLMetadata struct {
// Title is the document title from the `<title>` tag.
Title *string `json:"title,omitempty"`
// Description is the document description from the `<meta name="description">` tag.
Description *string `json:"description,omitempty"`
// Keywords holds the document keywords from the `<meta name="keywords">` tag,
// split on commas.
Keywords []string `json:"keywords,omitempty"`
// Author is the document author from the `<meta name="author">` tag.
Author *string `json:"author,omitempty"`
// CanonicalURL is the canonical URL from the `<link rel="canonical">` tag.
CanonicalURL *string `json:"canonical_url,omitempty"`
// BaseHref is the base URL from the `<base href="">` tag, for resolving relative URLs.
BaseHref *string `json:"base_href,omitempty"`
// Language is the document language from the `lang` attribute.
Language *string `json:"language,omitempty"`
// TextDirection is the document text direction from the `dir` attribute.
TextDirection *TextDirection `json:"text_direction,omitempty"`
// OpenGraph holds the Open Graph metadata (og:* properties) for social media.
// Keys are like "title", "description", "image", "url", etc.
OpenGraph map[string]string `json:"open_graph,omitempty"`
// TwitterCard holds the Twitter Card metadata (twitter:* properties).
// Keys are like "card", "site", "creator", "title", "description", "image", etc.
TwitterCard map[string]string `json:"twitter_card,omitempty"`
// MetaTags holds additional meta tags not covered by specific fields.
// Keys are meta name/property attributes; values are their content.
MetaTags map[string]string `json:"meta_tags,omitempty"`
// Headers holds the extracted header elements with hierarchy.
Headers []HeaderMetadata `json:"headers,omitempty"`
// Links holds the extracted hyperlinks with type classification.
Links []LinkMetadata `json:"links,omitempty"`
// Images holds the extracted images with source and dimensions.
Images []ImageMetadataType `json:"images,omitempty"`
// StructuredData holds the extracted structured data blocks.
StructuredData []StructuredData `json:"structured_data,omitempty"`
}
// OcrMetadata is the OCR processing metadata.
//
// It captures information about the OCR processing configuration and results.
type OcrMetadata struct {
// Language is the OCR language code(s) used.
Language string `json:"language"`
// Psm is the Tesseract Page Segmentation Mode (PSM).
Psm int32 `json:"psm"`
// OutputFormat is the output format (e.g., "text", "hocr").
OutputFormat string `json:"output_format"`
// TableCount is the number of tables detected.
TableCount uint32 `json:"table_count"`
// TableRows is optional and undocumented upstream.
// NOTE(review): presumably a row count for detected tables — confirm.
TableRows *uint32 `json:"table_rows,omitempty"`
// TableCols is optional and undocumented upstream.
// NOTE(review): presumably a column count for detected tables — confirm.
TableCols *uint32 `json:"table_cols,omitempty"`
}
// ErrorMetadata is the error metadata (for batch operations).
type ErrorMetadata struct {
// ErrorType is undocumented upstream.
// NOTE(review): presumably the kind/category of the error — confirm.
ErrorType string `json:"error_type"`
// Message is undocumented upstream.
// NOTE(review): presumably the human-readable error text — confirm.
Message string `json:"message"`
}
// PptxMetadata is the PowerPoint presentation metadata.
//
// It is extracted from PPTX files and contains slide counts and presentation details.
type PptxMetadata struct {
// SlideCount is the total number of slides in the presentation.
SlideCount uint32 `json:"slide_count"`
// SlideNames holds the names of slides (if available).
SlideNames []string `json:"slide_names,omitempty"`
// ImageCount is the number of embedded images.
ImageCount *uint32 `json:"image_count,omitempty"`
// TableCount is the number of tables.
TableCount *uint32 `json:"table_count,omitempty"`
}
// DocxMetadata is the Word document metadata.
//
// It is extracted from DOCX files using shared Office Open XML metadata extraction.
// Upstream note: integrates with the `office_metadata` module for
// core/app/custom properties.
type DocxMetadata struct {
// CoreProperties holds the core properties from docProps/core.xml
// (Dublin Core metadata).
//
// Contains title, creator, subject, keywords, dates, etc.
// Shared format across DOCX/PPTX/XLSX documents.
CoreProperties *CoreProperties `json:"core_properties,omitempty"`
// AppProperties holds the application properties from docProps/app.xml
// (Word-specific statistics).
//
// Contains word count, page count, paragraph count, editing time, etc.
// DOCX-specific variant of Office application properties.
AppProperties *DocxAppProperties `json:"app_properties,omitempty"`
// CustomProperties holds the custom properties from docProps/custom.xml
// (user-defined properties), kept as raw JSON.
//
// Contains key-value pairs defined by users or applications.
// Values can be strings, numbers, booleans, or dates.
CustomProperties map[string]json.RawMessage `json:"custom_properties,omitempty"`
}
// CsvMetadata holds CSV/TSV file metadata.
type CsvMetadata struct {
// Row count, serialized under the "row_count" key.
// NOTE(review): whether a header row is included in the count is not visible here.
RowCount uint32 `json:"row_count"`
// Column count, serialized under the "column_count" key.
ColumnCount uint32 `json:"column_count"`
// Field delimiter (optional; omitted from JSON when nil).
Delimiter *string `json:"delimiter,omitempty"`
// Header flag, always serialized (no omitempty).
// Presumably true when the first row is a header — TODO confirm upstream.
HasHeader bool `json:"has_header"`
// Per-column type names (omitted from JSON when empty).
// NOTE(review): the vocabulary of type names is defined by the producer.
ColumnTypes []string `json:"column_types,omitempty"`
}
// BibtexMetadata holds BibTeX bibliography metadata.
type BibtexMetadata struct {
// Number of entries in the bibliography.
EntryCount uint `json:"entry_count"`
// Citation keys (omitted from JSON when empty).
CitationKeys []string `json:"citation_keys,omitempty"`
// Author names (omitted from JSON when empty).
Authors []string `json:"authors,omitempty"`
// Year range (optional; omitted from JSON when nil).
YearRange *YearRange `json:"year_range,omitempty"`
// Map from string key to an unsigned count (omitted from JSON when empty).
// Presumably entry type name -> number of entries of that type — TODO confirm.
EntryTypes map[string]uint `json:"entry_types,omitempty"`
}
// CitationMetadata holds citation file metadata (RIS, PubMed, EndNote).
type CitationMetadata struct {
// Citation count, serialized under the "citation_count" key.
CitationCount uint `json:"citation_count"`
// Citation file format (optional; omitted from JSON when nil).
// NOTE(review): the exact format identifiers are defined by the producer.
Format *string `json:"format,omitempty"`
// Author names (omitted from JSON when empty).
Authors []string `json:"authors,omitempty"`
// Year range (optional; omitted from JSON when nil).
YearRange *YearRange `json:"year_range,omitempty"`
// DOIs (omitted from JSON when empty).
Dois []string `json:"dois,omitempty"`
// Keywords (omitted from JSON when empty).
Keywords []string `json:"keywords,omitempty"`
}
// YearRange is a year range for bibliographic metadata.
type YearRange struct {
// Lower bound of the range (optional; omitted from JSON when nil).
Min *uint32 `json:"min,omitempty"`
// Upper bound of the range (optional; omitted from JSON when nil).
Max *uint32 `json:"max,omitempty"`
// Individual years (omitted from JSON when empty).
// NOTE(review): ordering and uniqueness are not established by this binding.
Years []uint32 `json:"years,omitempty"`
}
// FictionBookMetadata holds FictionBook (FB2) metadata.
type FictionBookMetadata struct {
// Genres (omitted from JSON when empty).
Genres []string `json:"genres,omitempty"`
// Sequences (omitted from JSON when empty).
// Presumably the book series the work belongs to — TODO confirm upstream.
Sequences []string `json:"sequences,omitempty"`
// Annotation text (optional; omitted from JSON when nil).
Annotation *string `json:"annotation,omitempty"`
}
// DbfMetadata holds dBASE (DBF) file metadata.
type DbfMetadata struct {
// Record count, serialized under the "record_count" key.
RecordCount uint `json:"record_count"`
// Field count, serialized under the "field_count" key.
FieldCount uint `json:"field_count"`
// Per-field descriptors (omitted from JSON when empty).
Fields []DbfFieldInfo `json:"fields,omitempty"`
}
// DbfFieldInfo holds dBASE field information.
type DbfFieldInfo struct {
// Field name.
Name string `json:"name"`
// Field type, as a string.
// NOTE(review): the encoding of the type (letter code vs. name) is not visible here.
FieldType string `json:"field_type"`
}
// JatsMetadata holds JATS (Journal Article Tag Suite) metadata.
type JatsMetadata struct {
// Copyright statement (optional; omitted from JSON when nil).
Copyright *string `json:"copyright,omitempty"`
// License (optional; omitted from JSON when nil).
License *string `json:"license,omitempty"`
// String-to-string map of history dates (omitted from JSON when empty).
// Presumably keyed by date type (e.g. received/accepted) — TODO confirm upstream.
HistoryDates map[string]string `json:"history_dates,omitempty"`
// Contributors with their roles (omitted from JSON when empty).
ContributorRoles []ContributorRole `json:"contributor_roles,omitempty"`
}
// ContributorRole is a JATS contributor with an optional role.
type ContributorRole struct {
// Contributor name.
Name string `json:"name"`
// Contributor role (optional; omitted from JSON when nil).
Role *string `json:"role,omitempty"`
}
// EpubMetadata holds EPUB metadata (Dublin Core extensions).
//
// Every field is optional: a nil pointer is omitted from the JSON output.
// NOTE(review): the field names suggest the Dublin Core elements of the same
// name (coverage, format, relation, source, type) — confirm against upstream.
type EpubMetadata struct {
Coverage *string `json:"coverage,omitempty"`
DcFormat *string `json:"dc_format,omitempty"`
Relation *string `json:"relation,omitempty"`
Source *string `json:"source,omitempty"`
DcType *string `json:"dc_type,omitempty"`
// NOTE(review): whether this is a path, an ID, or a URL is not visible here.
CoverImage *string `json:"cover_image,omitempty"`
}
// PstMetadata holds Outlook PST archive metadata.
type PstMetadata struct {
// Message count, serialized under the "message_count" key.
MessageCount uint `json:"message_count"`
}
// OcrConfidence holds confidence scores for an OCR element.
//
// Separates detection confidence (how confident that text exists at this location)
// from recognition confidence (how confident about the actual text content).
type OcrConfidence struct {
// Detection confidence: how confident the OCR engine is that text exists here.
//
// PaddleOCR provides this as `box_score`, Tesseract doesn't have a direct equivalent.
// Range: 0.0 to 1.0 (nil if not available; omitted from JSON when nil).
Detection *float64 `json:"detection,omitempty"`
// Recognition confidence: how confident about the text content.
//
// Range: 0.0 to 1.0.
Recognition float64 `json:"recognition"`
}
// OcrRotation holds rotation information for an OCR element.
type OcrRotation struct {
// Rotation angle in degrees (0, 90, 180, 270 for PaddleOCR).
AngleDegrees float64 `json:"angle_degrees"`
// Confidence score for the rotation detection (optional; omitted from JSON when nil).
Confidence *float64 `json:"confidence,omitempty"`
}
// OcrElement is the unified OCR element representing detected text with full metadata.
//
// This is the primary type for structured OCR output, preserving all information
// from both Tesseract and PaddleOCR backends.
type OcrElement struct {
// The recognized text content.
Text string `json:"text"`
// Bounding geometry (rectangle or quadrilateral).
Geometry OcrBoundingGeometry `json:"geometry"`
// Confidence scores for detection and recognition.
Confidence OcrConfidence `json:"confidence"`
// Hierarchical level (word, line, block, page).
Level OcrElementLevel `json:"level,omitempty"`
// Rotation information (if detected; nil otherwise).
Rotation *OcrRotation `json:"rotation,omitempty"`
// Page number (1-indexed).
PageNumber uint32 `json:"page_number"`
// Parent element ID for hierarchical relationships.
//
// Only used for Tesseract output which has word -> line -> block hierarchy.
ParentID *string `json:"parent_id,omitempty"`
// Backend-specific metadata that doesn't fit the unified schema.
// Values are kept as undecoded JSON (json.RawMessage).
BackendMetadata map[string]json.RawMessage `json:"backend_metadata,omitempty"`
}
// UnmarshalJSON implements json.Unmarshaler for OcrElement.
//
// All fields except Geometry are decoded by encoding/json through a shadow
// struct. Geometry is captured as raw JSON and handed to
// UnmarshalOcrBoundingGeometry; when the "geometry" key is missing or is the
// JSON literal null, s.Geometry is left untouched.
//
// The plain fields are copied onto s before Geometry is decoded, so they are
// already populated if the geometry decoder returns an error.
func (s *OcrElement) UnmarshalJSON(data []byte) error {
	// Shadow struct: mirrors OcrElement field-for-field, but defers the
	// decoding of Geometry by holding it as json.RawMessage.
	var wire struct {
		Text            string                     `json:"text"`
		Geometry        json.RawMessage            `json:"geometry,omitempty"`
		Confidence      OcrConfidence              `json:"confidence"`
		Level           OcrElementLevel            `json:"level,omitempty"`
		Rotation        *OcrRotation               `json:"rotation,omitempty"`
		PageNumber      uint32                     `json:"page_number"`
		ParentID        *string                    `json:"parent_id,omitempty"`
		BackendMetadata map[string]json.RawMessage `json:"backend_metadata,omitempty"`
	}
	if err := json.Unmarshal(data, &wire); err != nil {
		return err
	}

	s.Text = wire.Text
	s.Confidence = wire.Confidence
	s.Level = wire.Level
	s.Rotation = wire.Rotation
	s.PageNumber = wire.PageNumber
	s.ParentID = wire.ParentID
	s.BackendMetadata = wire.BackendMetadata

	// Nothing to decode when geometry is absent or explicitly null.
	if len(wire.Geometry) == 0 || string(wire.Geometry) == "null" {
		return nil
	}

	geometry, err := UnmarshalOcrBoundingGeometry(wire.Geometry)
	if err != nil {
		return err
	}
	s.Geometry = geometry
	return nil
}
// OcrElementConfig is the configuration for OCR element extraction.
//
// Controls how OCR elements are extracted and filtered.
type OcrElementConfig struct {
// Whether to include OCR elements in the extraction result.
//
// When true, the `ocr_elements` field in `ExtractionResult` will be populated.
IncludeElements bool `json:"include_elements"`
// Minimum hierarchical level to include.
//
// Elements below this level (e.g., words when min_level is Line) will be excluded.
MinLevel OcrElementLevel `json:"min_level,omitempty"`
// Minimum recognition confidence threshold (0.0-1.0).
//
// Elements with confidence below this threshold will be filtered out.
MinConfidence float64 `json:"min_confidence"`
// Whether to build hierarchical relationships between elements.
//
// When true, `parent_id` fields will be populated based on spatial containment.
// Only meaningful for Tesseract output.
BuildHierarchy bool `json:"build_hierarchy"`
}
// PageStructure is the unified page structure for documents.
//
// Supports different page types (PDF pages, PPTX slides, Excel sheets)
// with offset boundaries for chunk-to-page mapping.
type PageStructure struct {
// Total number of pages/slides/sheets
TotalCount uint32 `json:"total_count"`
// Type of paginated unit
UnitType PageUnitType `json:"unit_type"`
// Offset boundaries for each page
//
// Maps ranges in the extracted content to page numbers.
// Used for chunk page range calculation.
// NOTE(review): the generated text called these "character offsets", but
// PageBoundary exposes ByteStart/ByteEnd, i.e. byte offsets — confirm upstream.
Boundaries []PageBoundary `json:"boundaries,omitempty"`
// Detailed per-page metadata (optional, only when needed)
Pages []PageInfo `json:"pages,omitempty"`
}
// PageBoundary is the byte offset boundary for a page.
//
// Tracks where a specific page's content starts and ends in the main content string,
// enabling mapping from byte positions to page numbers. Per the upstream (Rust)
// documentation, offsets are guaranteed to be at valid UTF-8 character boundaries
// when the content is built with standard String methods (push_str, push, etc.).
type PageBoundary struct {
// Byte offset where this page starts in the content string (UTF-8 valid boundary, inclusive)
ByteStart uint `json:"byte_start"`
// Byte offset where this page ends in the content string (UTF-8 valid boundary, exclusive)
ByteEnd uint `json:"byte_end"`
// Page number (1-indexed)
PageNumber uint32 `json:"page_number"`
}
// PageInfo holds metadata for an individual page/slide/sheet.
//
// Captures per-page information including dimensions, content counts,
// and visibility state (for presentations).
type PageInfo struct {
// Page number (1-indexed)
Number uint32 `json:"number"`
// Page title (usually for presentations; nil when absent)
Title *string `json:"title,omitempty"`
// Dimensions in points (PDF) or pixels (images): (width, height)
// NOTE(review): expected to hold exactly two elements when set — the slice
// type itself does not enforce this.
Dimensions []float64 `json:"dimensions,omitempty"`
// Number of images on this page (nil when absent)
ImageCount *uint32 `json:"image_count,omitempty"`
// Number of tables on this page (nil when absent)
TableCount *uint32 `json:"table_count,omitempty"`
// Whether this page is hidden (e.g., in presentations; nil when absent)
Hidden *bool `json:"hidden,omitempty"`
// Whether this page is blank (no meaningful text, no images, no tables)
//
// A page is considered blank if it has fewer than 3 non-whitespace characters
// and contains no tables or images. This is useful for filtering out empty pages
// in scanned documents or PDFs with blank separator pages.
IsBlank *bool `json:"is_blank,omitempty"`
// Whether this page contains non-trivial vector graphics (paths, shapes, curves)
//
// Indicates the presence of vector-drawn content such as charts, diagrams,
// or geometric shapes (e.g., from Adobe InDesign, LaTeX TikZ). These are
// invisible to `ExtractionResult.images` since they are not embedded as raster
// XObjects. Set to `true` when path count exceeds a heuristic threshold,
// signaling that downstream consumers may want to rasterize the page to
// capture this content.
//
// Only populated for PDFs.
// NOTE(review): the upstream documentation describes the value as `None` for
// other document types, but this binding exposes a non-pointer bool, so
// "not populated" cannot be told apart from false on the Go side.
HasVectorGraphics bool `json:"has_vector_graphics"`
}
// PageContent is the content for a single page/slide.
//
// When page extraction is enabled, documents are split into per-page content
// with associated tables and images mapped to each page.
//
// # Performance
//
// The notes below describe the upstream Rust implementation, not this Go type
// (which holds plain slices):
//   - `Vec<Arc<Table>>` enables zero-copy sharing of table data
//   - `Vec<Arc<ExtractedImage>>` enables zero-copy sharing of image data
//   - Maintains exact JSON compatibility via custom Serialize/Deserialize
//
// This reduces memory overhead for documents with shared tables/images
// by avoiding redundant copies during serialization.
type PageContent struct {
// Page number (1-indexed)
PageNumber uint32 `json:"page_number"`
// Text content for this page
Content string `json:"content"`
// Tables found on this page
//
// Upstream serializes this as a plain list of tables for JSON compatibility.
Tables []Table `json:"tables,omitempty"`
// Indices into `ExtractionResult.images` for images found on this page.
//
// Each value is a zero-based index into the top-level `images` collection.
// Only populated when `extract_images = true` in the extraction config.
ImageIndices []uint32 `json:"image_indices,omitempty"`
// Hierarchy information for the page (when hierarchy extraction is enabled)
//
// Contains text hierarchy levels (H1-H6) extracted from the page content.
Hierarchy *PageHierarchy `json:"hierarchy,omitempty"`
// Whether this page is blank (no meaningful text content)
//
// Determined during extraction based on text content analysis.
// A page is blank if it has fewer than 3 non-whitespace characters
// and contains no tables or images.
IsBlank *bool `json:"is_blank,omitempty"`
// Layout detection regions for this page (when layout detection is enabled).
//
// Contains detected layout regions with class, confidence, bounding box,
// and area fraction. Only populated when layout detection is configured.
LayoutRegions []LayoutRegion `json:"layout_regions,omitempty"`
// Speaker notes for this slide (PPTX only).
//
// Contains the text from the slide's notes pane (`ppt/notesSlides/notesSlide{N}.xml`).
// Only populated when the source is a PPTX file and notes are present.
SpeakerNotes *string `json:"speaker_notes,omitempty"`
// Section name this slide belongs to (PPTX only).
//
// PowerPoint sections group slides into logical chapters (`<p:sectionLst>` in
// `ppt/presentation.xml`). Only populated when the source is a PPTX file and
// the slide belongs to a named section.
SectionName *string `json:"section_name,omitempty"`
// Sheet name for this page (XLSX/ODS only).
//
// Each spreadsheet sheet maps to one `PageContent` entry. This field carries the
// sheet's display name as it appears in the workbook. nil for all non-spreadsheet
// formats and for sheets with an empty name.
SheetName *string `json:"sheet_name,omitempty"`
}
// LayoutRegion is a detected layout region on a page.
//
// When layout detection is enabled, each page may have layout regions
// identifying different content types (text, pictures, tables, etc.)
// with confidence scores and spatial positions.
type LayoutRegion struct {
// Layout class name (e.g. "picture", "table", "text", "section_header").
ClassName string `json:"class_name"`
// Confidence score from the layout detection model (0.0 to 1.0).
Confidence float64 `json:"confidence"`
// Bounding box in document coordinate space.
BoundingBox BoundingBox `json:"bounding_box"`
// Fraction of the page area covered by this region (0.0 to 1.0).
AreaFraction float64 `json:"area_fraction"`
}
// PageHierarchy is the page hierarchy structure containing heading levels and block information.
//
// Used when PDF text hierarchy extraction is enabled. Contains hierarchical
// blocks with heading levels (H1-H6) for semantic document structure.
type PageHierarchy struct {
// Number of hierarchy blocks on this page
BlockCount uint32 `json:"block_count"`
// Hierarchical blocks with heading levels (omitted from JSON when empty)
Blocks []HierarchicalBlock `json:"blocks,omitempty"`
}
// HierarchicalBlock is a text block with a hierarchy level assignment.
//
// Represents a block of text with semantic heading information extracted from
// font size clustering and hierarchical analysis.
type HierarchicalBlock struct {
// The text content of this block
Text string `json:"text"`
// The font size of the text in this block
FontSize float32 `json:"font_size"`
// The hierarchy level of this block (H1-H6 or Body)
//
// Levels correspond to HTML heading tags:
//   - "h1": Top-level heading
//   - "h2": Secondary heading
//   - "h3": Tertiary heading
//   - "h4": Quaternary heading
//   - "h5": Quinary heading
//   - "h6": Senary heading
//   - "body": Body text (no heading level)
Level string `json:"level"`
// Bounding box information for the block
//
// Contains coordinates as (left, top, right, bottom) in PDF units.
// NOTE(review): expected to hold four elements when set — the slice type
// itself does not enforce this.
Bbox []float32 `json:"bbox,omitempty"`
}
// CellChange is a single changed cell within a table.
//
// Upstream (Rust) note: the type is defined outside `crate::diff` so that
// `RevisionDelta` can reference it unconditionally, without requiring the
// `diff` Cargo feature; `crate::diff` re-exports it verbatim.
type CellChange struct {
// Zero-based row index.
Row uint `json:"row"`
// Zero-based column index.
Col uint `json:"col"`
// Value before the change.
From string `json:"from"`
// Value after the change.
To string `json:"to"`
}
// DocumentRevision is a single tracked change embedded in a document.
//
// Populated by per-format extractors that understand change-tracking metadata
// (DOCX `w:ins`/`w:del`/`w:rPrChange`, ODT `text:change-*`, …). Per the
// upstream documentation, extractors leave `ExtractionResult.revisions` unset
// until a format-specific implementation is added.
type DocumentRevision struct {
// Format-specific revision identifier.
//
// For DOCX this is the `w:id` attribute value on the change element
// (e.g. `"42"`). When the attribute is absent a synthetic fallback is
// generated (`"docx-ins-0"`, `"docx-del-3"`, …).
RevisionID string `json:"revision_id"`
// Display name of the author who made this change, when available.
Author *string `json:"author,omitempty"`
// ISO-8601 timestamp of the change, when available.
//
// Stored as a plain string so this type remains FFI-friendly and
// unconditionally available without the `chrono` optional dep.
// DOCX populates this from the `w:date` attribute (e.g.
// `"2024-03-15T10:30:00Z"`).
Timestamp *string `json:"timestamp,omitempty"`
// Semantic kind of this revision.
Kind RevisionKind `json:"kind"`
// Best-effort document location for this revision.
//
// Resolution is format-dependent and may be unset when the location
// cannot be determined (e.g. changes inside table cells before
// table-cell anchor support is added).
Anchor RevisionAnchor `json:"anchor,omitempty"`
// The content changes that make up this revision.
Delta RevisionDelta `json:"delta"`
}
// UnmarshalJSON implements json.Unmarshaler for DocumentRevision.
//
// All fields except Anchor are decoded by encoding/json through a shadow
// struct. Anchor is captured as raw JSON and handed to
// UnmarshalRevisionAnchor; when the "anchor" key is missing or is the JSON
// literal null, s.Anchor is left untouched.
//
// The plain fields are copied onto s before Anchor is decoded, so they are
// already populated if the anchor decoder returns an error.
func (s *DocumentRevision) UnmarshalJSON(data []byte) error {
	// Shadow struct: mirrors DocumentRevision field-for-field, but defers the
	// decoding of Anchor by holding it as json.RawMessage.
	var wire struct {
		RevisionID string          `json:"revision_id"`
		Author     *string         `json:"author,omitempty"`
		Timestamp  *string         `json:"timestamp,omitempty"`
		Kind       RevisionKind    `json:"kind"`
		Anchor     json.RawMessage `json:"anchor,omitempty"`
		Delta      RevisionDelta   `json:"delta"`
	}
	if err := json.Unmarshal(data, &wire); err != nil {
		return err
	}

	s.RevisionID = wire.RevisionID
	s.Author = wire.Author
	s.Timestamp = wire.Timestamp
	s.Kind = wire.Kind
	s.Delta = wire.Delta

	// Nothing to decode when the anchor is absent or explicitly null.
	if len(wire.Anchor) == 0 || string(wire.Anchor) == "null" {
		return nil
	}

	anchor, err := UnmarshalRevisionAnchor(wire.Anchor)
	if err != nil {
		return err
	}
	s.Anchor = anchor
	return nil
}
// RevisionDelta holds the content changes that make up a single revision.
//
// For insertions and deletions the `content` field carries the added/removed
// lines as `DiffLine::Added` / `DiffLine::Removed` entries. For format
// changes, `content` is empty — the property diff is left as a TODO for a
// later enrichment pass.
type RevisionDelta struct {
// Line-level content changes for this revision (omitted from JSON when empty).
Content []DiffLine `json:"content,omitempty"`
// Cell-level table changes for this revision (omitted from JSON when empty).
TableChanges []CellChange `json:"table_changes,omitempty"`
}
// Table is an extracted table structure.
//
// Represents a table detected and extracted from a document (PDF, image, etc.).
// Tables are converted to both structured cell data and Markdown format.
type Table struct {
// Table cells as a 2D slice (rows × columns)
Cells [][]string `json:"cells,omitempty"`
// Markdown representation of the table
Markdown string `json:"markdown"`
// Page number where the table was found (1-indexed)
PageNumber uint32 `json:"page_number"`
// Bounding box of the table on the page (PDF coordinates: x0=left, y0=bottom, x1=right, y1=top).
// Only populated for PDF-extracted tables when position data is available; nil otherwise.
BoundingBox *BoundingBox `json:"bounding_box,omitempty"`
}
// TableCell is an individual table cell with content and optional styling.
//
// Future extension point for rich table support with cell-level metadata.
type TableCell struct {
// Cell content as text
Content string `json:"content"`
// Row span (number of rows this cell spans)
RowSpan uint32 `json:"row_span"`
// Column span (number of columns this cell spans)
ColSpan uint32 `json:"col_span"`
// Whether this is a header cell
IsHeader bool `json:"is_header"`
}
// ExtractedURI is a URI extracted from a document.
//
// Represents any link, reference, or resource pointer found during extraction.
// The `kind` field classifies the URI semantically, while `label` carries
// optional human-readable display text.
type ExtractedURI struct {
// The URL or path string.
URL string `json:"url"`
// Optional display text / label for the link (nil when absent).
Label *string `json:"label,omitempty"`
// Optional page number where the URI was found (1-indexed; nil when absent).
Page *uint32 `json:"page,omitempty"`
// Semantic classification of the URI.
Kind URIKind `json:"kind"`
}
// DetectResponse is the MIME type detection response.
type DetectResponse struct {
// Detected MIME type
MimeType string `json:"mime_type"`
// Original filename (if provided; nil otherwise)
Filename *string `json:"filename,omitempty"`
}
// DiffOptions holds options controlling how two `ExtractionResult` values are compared.
//
// Every field is a pointer with omitempty, so a nil field is left out of the
// serialized options entirely.
// NOTE(review): presumably the native side then applies the documented
// default — confirm against the FFI consumer.
type DiffOptions struct {
// Include metadata changes in the diff. Default: `true`.
IncludeMetadata *bool `json:"include_metadata,omitempty"`
// Include embedded-children changes in the diff. Default: `true`.
IncludeEmbedded *bool `json:"include_embedded,omitempty"`
// Truncate content to this many characters before diffing.
//
// Useful for very large documents where only the first N characters matter.
// nil means no truncation.
MaxContentChars *uint `json:"max_content_chars,omitempty"`
}
// ExtractionDiff is the complete diff between two `ExtractionResult` values.
type ExtractionDiff struct {
// Unified-diff hunks for the `content` field.
//
// Empty when the content is identical.
ContentDiff []DiffHunk `json:"content_diff,omitempty"`
// Tables present in `b` but not in `a` (by index position, excess right-side tables).
TablesAdded []Table `json:"tables_added,omitempty"`
// Tables present in `a` but not in `b` (by index position, excess left-side tables).
TablesRemoved []Table `json:"tables_removed,omitempty"`
// Cell-level changes for table pairs that share the same index and dimensions.
TablesChanged []TableDiff `json:"tables_changed,omitempty"`
// Metadata difference, encoded as a JSON object with three top-level keys:
// `added` (keys present in `b` but not `a`), `removed` (keys present in `a`
// but not `b`), and `changed` (keys whose values differ — each entry is
// `{ "from": <value-in-a>, "to": <value-in-b> }`).
//
// This is NOT RFC 6902 JSON Patch — upstream deliberately chose a flatter
// shape to avoid pulling in a json-patch crate. If you need RFC 6902 semantics
// (with JSON Pointer paths) feed `a.metadata` and `b.metadata` to your
// preferred json-patch implementation directly.
//
// Kept as undecoded JSON (json.RawMessage); callers decode it themselves.
MetadataChanged json.RawMessage `json:"metadata_changed"`
// Changes to embedded archive children.
EmbeddedChanges EmbeddedChanges `json:"embedded_changes"`
}
// DiffHunk is a single contiguous hunk in a unified diff.
type DiffHunk struct {
// Starting line number in the old content (0-indexed).
FromLine uint `json:"from_line"`
// Number of lines from the old content in this hunk.
FromCount uint `json:"from_count"`
// Starting line number in the new content (0-indexed).
ToLine uint `json:"to_line"`
// Number of lines from the new content in this hunk.
ToCount uint `json:"to_count"`
// Lines that make up this hunk (omitted from JSON when empty).
Lines []DiffLine `json:"lines,omitempty"`
}
// TableDiff holds cell-level changes for a pair of tables that share the same index.
type TableDiff struct {
// Zero-based index of the table in `a.tables`.
// NOTE(review): the generated text said "in both `a.tables` and `b.tables`";
// tables are paired by index, so the two indices are expected to be equal.
FromIndex uint `json:"from_index"`
// Zero-based index in `b.tables` (equal to `from_index` for same-dimension tables).
ToIndex uint `json:"to_index"`
// Cell-level changes within the table (omitted from JSON when empty).
CellChanges []CellChange `json:"cell_changes,omitempty"`
}
// EmbeddedChanges holds changes to embedded archive children between two results.
type EmbeddedChanges struct {
// Children present in `b` but not in `a` (matched by `path`).
Added []ArchiveEntry `json:"added,omitempty"`
// Children present in `a` but not in `b` (matched by `path`).
Removed []ArchiveEntry `json:"removed,omitempty"`
// Children present in both but with differing content (matched by `path`).
//
// Each entry holds the diff of the nested `ExtractionResult`.
Changed []EmbeddedDiff `json:"changed,omitempty"`
}
// EmbeddedDiff is the diff for a single embedded archive entry that appears in both results.
type EmbeddedDiff struct {
// Archive-relative path identifying this entry.
Path string `json:"path"`
// The recursive diff of the entry's extraction result.
// (ExtractionDiff -> EmbeddedChanges -> []EmbeddedDiff forms the recursion.)
Diff ExtractionDiff `json:"diff"`
}
// EmbeddingPreset is a preset configuration for common RAG use cases.
//
// Each preset combines chunk size, overlap, and embedding model
// to provide an optimized configuration for specific scenarios.
//
// Upstream (Rust) note: all string fields are owned `String` values for FFI
// compatibility, so instances are safe to clone and pass across language
// boundaries.
type EmbeddingPreset struct {
// Preset name.
Name string `json:"name"`
// Chunk size.
// NOTE(review): the unit (characters vs. tokens) is not visible here — confirm upstream.
ChunkSize uint `json:"chunk_size"`
// Overlap between consecutive chunks, presumably in the same unit as ChunkSize — TODO confirm.
Overlap uint `json:"overlap"`
// HuggingFace repository name for the model.
ModelRepo string `json:"model_repo"`
// Pooling strategy: "cls" or "mean".
Pooling string `json:"pooling"`
// Path to the ONNX model file within the repo.
ModelFile string `json:"model_file"`
// Dimensions, presumably the embedding vector length — TODO confirm upstream.
Dimensions uint `json:"dimensions"`
// Free-text description of the preset.
Description string `json:"description"`
}
// YakeParams holds YAKE-specific parameters.
type YakeParams struct {
// Window size for co-occurrence analysis (default: 2).
//
// Controls the context window for computing co-occurrence statistics.
// Optional; omitted from JSON when nil.
WindowSize *uint `json:"window_size,omitempty"`
}
// RakeParams holds RAKE-specific parameters.
type RakeParams struct {
// Minimum word length to consider (default: 1).
// Optional; omitted from JSON when nil.
MinWordLength *uint `json:"min_word_length,omitempty"`
// Maximum words in a keyword phrase (default: 3).
// Optional; omitted from JSON when nil.
MaxWordsPerPhrase *uint `json:"max_words_per_phrase,omitempty"`
}
// KeywordConfig is the keyword extraction configuration.
type KeywordConfig struct {
// Algorithm to use for extraction.
Algorithm KeywordAlgorithm `json:"algorithm,omitempty"`
// Maximum number of keywords to extract (default: 10).
// Optional; omitted from JSON when nil.
MaxKeywords *uint `json:"max_keywords,omitempty"`
// Minimum score threshold (0.0-1.0, default: 0.0).
//
// Keywords with scores below this threshold are filtered out.
// Note: Score ranges differ between algorithms.
MinScore float32 `json:"min_score"`
// N-gram range for keyword extraction (min, max).
//
// (1, 1) = unigrams only
// (1, 2) = unigrams and bigrams
// (1, 3) = unigrams, bigrams, and trigrams (default)
//
// NOTE(review): expected to hold exactly two elements when set — the slice
// type itself does not enforce this.
NgramRange []uint `json:"ngram_range,omitempty"`
// Language code for stopword filtering (e.g., "en", "de", "fr").
//
// If nil, no stopword filtering is applied.
Language *string `json:"language,omitempty"`
// YAKE-specific tuning parameters (optional; omitted from JSON when nil).
YakeParams *YakeParams `json:"yake_params,omitempty"`
// RAKE-specific tuning parameters (optional; omitted from JSON when nil).
RakeParams *RakeParams `json:"rake_params,omitempty"`
}
// Keyword is an extracted keyword with metadata.
type Keyword struct {
// The keyword text.
Text string `json:"text"`
// Relevance score (higher is better, algorithm-specific range).
Score float32 `json:"score"`
// Algorithm that extracted this keyword.
Algorithm KeywordAlgorithm `json:"algorithm"`
// Optional positions where keyword appears in text (character offsets).
// Omitted from JSON when empty.
Positions []uint `json:"positions,omitempty"`
}
// PaddleOcrConfig is the configuration for the PaddleOCR backend.
//
// Configures PaddleOCR text detection and recognition with multi-language support.
//
// The upstream Rust type offers a builder pattern; the example below is Rust,
// not Go. In Go, populate the struct fields directly.
//
// Example (upstream Rust API):
//
//	// Create with default English configuration
//	let config = PaddleOcrConfig::new("en");
//
//	// Create with custom cache directory
//	let config = PaddleOcrConfig::new("ch")
//	    .with_cache_dir("/path/to/cache".into());
//
//	// Enable table detection
//	let config = PaddleOcrConfig::new("en")
//	    .with_table_detection(true);
//
// NOTE(review): the "default" values quoted on the fields below are the
// upstream defaults. The non-pointer fields here carry no omitempty, so a
// zero-valued Go struct serializes zeros rather than those defaults.
type PaddleOcrConfig struct {
// Language code (e.g., "en", "ch", "jpn", "kor", "deu", "fra")
Language string `json:"language"`
// Optional custom cache directory for model files (omitted from JSON when nil)
CacheDir *string `json:"cache_dir,omitempty"`
// Enable angle classification for rotated text (default: false).
// Can misfire on short text regions, rotating crops incorrectly before recognition.
UseAngleCls bool `json:"use_angle_cls"`
// Enable table structure detection (default: false)
EnableTableDetection bool `json:"enable_table_detection"`
// Database threshold for text detection (default: 0.3)
// Range: 0.0-1.0, higher values require more confident detections
DetDbThresh float32 `json:"det_db_thresh"`
// Box threshold for text bounding box refinement (default: 0.5)
// Range: 0.0-1.0
DetDbBoxThresh float32 `json:"det_db_box_thresh"`
// Unclip ratio for expanding text bounding boxes (default: 1.6)
// Controls the expansion of detected text regions
DetDbUnclipRatio float32 `json:"det_db_unclip_ratio"`
// Maximum side length for detection image (default: 960)
// Larger images may be resized to this limit for faster inference
DetLimitSideLen uint32 `json:"det_limit_side_len"`
// Batch size for recognition inference (default: 6)
// Number of text regions to process simultaneously
RecBatchNum uint32 `json:"rec_batch_num"`
// Padding in pixels added around the image before detection (default: 10).
// Large values can include surrounding content like table gridlines.
Padding uint32 `json:"padding"`
// Minimum recognition confidence score for text lines (default: 0.5).
// Text regions with recognition confidence below this threshold are discarded.
// Matches PaddleOCR Python's `drop_score` parameter.
// Range: 0.0-1.0
DropScore float32 `json:"drop_score"`
// Model tier controlling detection/recognition model size and accuracy trade-off.
//   - `"mobile"` (default): Lightweight models (~4.5MB detection, ~16.5MB recognition), fast download and inference
//   - `"server"`: Large, high-accuracy models (~88MB detection, ~84MB recognition), best for GPU or complex documents
ModelTier string `json:"model_tier"`
}
// ModelPaths holds the combined paths to all models needed for OCR (backward compatibility).
type ModelPaths struct {
// Path to the detection model directory.
DetModel string `json:"det_model"`
// Path to the classification model directory.
ClsModel string `json:"cls_model"`
// Path to the recognition model directory.
RecModel string `json:"rec_model"`
// Path to the character dictionary file.
DictFile string `json:"dict_file"`
}
// OrientationResult is the document orientation detection result.
type OrientationResult struct {
// Detected orientation in degrees (0, 90, 180, or 270).
Degrees uint32 `json:"degrees"`
// Confidence score (0.0-1.0).
Confidence float32 `json:"confidence"`
}
// BBox is a bounding box in original image coordinates:
// (X1, Y1) is the top-left corner and (X2, Y2) is the bottom-right corner.
type BBox struct {
// X coordinate of the top-left corner.
X1 float32 `json:"x1"`
// Y coordinate of the top-left corner.
Y1 float32 `json:"y1"`
// X coordinate of the bottom-right corner.
X2 float32 `json:"x2"`
// Y coordinate of the bottom-right corner.
Y2 float32 `json:"y2"`
}
// LayoutDetection is a single layout detection result.
type LayoutDetection struct {
// Detected layout class.
ClassName LayoutClass `json:"class_name"`
// Confidence of the detection.
// NOTE(review): presumably in the range 0.0-1.0 — confirm upstream.
Confidence float32 `json:"confidence"`
// Bounding box of the detection (see BBox for the corner convention).
Bbox BBox `json:"bbox"`
}
// RecognizedTable is the pre-computed table markdown for a table detection region.
//
// Produced by the TATR-based table structure recognizer and surfaced as part of
// layout-aware OCR results.
//
// Upstream (Rust) note: the type lives under `layout-types` (pure Rust) so that
// consumers who do not enable `layout-detection` (ORT) can still reference it.
type RecognizedTable struct {
// Detection bbox that this table corresponds to (for matching).
DetectionBbox BBox `json:"detection_bbox"`
// Table cells as a 2D slice (rows × columns).
Cells [][]string `json:"cells,omitempty"`
// Rendered markdown table.
Markdown string `json:"markdown"`
}
// DetectionResult is the page-level detection result containing all detections and page metadata.
type DetectionResult struct {
// Page width.
// NOTE(review): presumably in pixels of the analysed image — confirm upstream.
PageWidth uint32 `json:"page_width"`
// Page height, presumably in the same unit as PageWidth — TODO confirm.
PageHeight uint32 `json:"page_height"`
// Layout detections found on the page (omitted from JSON when empty).
Detections []LayoutDetection `json:"detections,omitempty"`
}
// EmbeddedFile is an embedded file descriptor extracted from the PDF name tree.
type EmbeddedFile struct {
	// The filename as stored in the PDF name tree.
	Name string `json:"name"`
	// Raw file bytes from the embedded stream (already decompressed by lopdf).
	Data []byte `json:"data"`
	// Compressed byte count of the original stream (before decompression).
	//
	// Used by callers to compute the decompression ratio and detect zip-bomb-style
	// attacks that embed a tiny compressed stream expanding to gigabytes of data.
	CompressedSize uint `json:"compressed_size"`
	// MIME type if specified in the filespec, otherwise nil.
	MimeType *string `json:"mime_type,omitempty"`
}

// MarshalJSON implements json.Marshaler for EmbeddedFile.
//
// Data is emitted as a JSON array of integers (the shape Rust's serde
// `Vec<u8>` deserializer expects) rather than the base64 string that
// encoding/json produces for a []byte by default. All other fields keep
// their regular encoding.
func (v EmbeddedFile) MarshalJSON() ([]byte, error) {
	// Widen every byte to int so encoding/json renders a number array.
	// The slice is always non-nil, so an empty payload encodes as [] and
	// never as null.
	widened := make([]int, 0, len(v.Data))
	for _, octet := range v.Data {
		widened = append(widened, int(octet))
	}

	// An anonymous struct lists every field explicitly. Embedding
	// EmbeddedFile here would emit the "data" key twice (base64 string and
	// integer array).
	return json.Marshal(struct {
		Name           string  `json:"name"`
		Data           []int   `json:"data"`
		CompressedSize uint    `json:"compressed_size"`
		MimeType       *string `json:"mime_type,omitempty"`
	}{
		Name:           v.Name,
		Data:           widened,
		CompressedSize: v.CompressedSize,
		MimeType:       v.MimeType,
	})
}
// PdfMetadata holds PDF-specific metadata.
//
// Contains metadata fields specific to PDF documents that are not in the common
// `Metadata` structure. Common fields like title, authors, keywords, and dates
// are at the `Metadata` level.
//
// Every field is a pointer tagged `omitempty`, so a nil field is left out of
// the JSON encoding.
type PdfMetadata struct {
// PdfVersion is the PDF version (e.g., "1.7", "2.0").
PdfVersion *string `json:"pdf_version,omitempty"`
// Producer is the PDF producer (application that created the PDF).
Producer *string `json:"producer,omitempty"`
// IsEncrypted reports whether the PDF is encrypted/password-protected.
IsEncrypted *bool `json:"is_encrypted,omitempty"`
// Width is the first page width in points (1/72 inch).
Width *int64 `json:"width,omitempty"`
// Height is the first page height in points (1/72 inch).
Height *int64 `json:"height,omitempty"`
// PageCount is the total number of pages in the PDF document.
PageCount *uint32 `json:"page_count,omitempty"`
}
// ExtractBytes extracts content from an in-memory byte slice.
//
// This is the main entry point for in-memory extraction. Per the upstream
// library documentation, the native call performs the following steps:
//  1. Validate MIME type
//  2. Handle legacy format conversion if needed
//  3. Select appropriate extractor from registry
//  4. Extract content
//  5. Run post-processing pipeline
//
// Arguments:
//   - content: the bytes to extract. An empty slice is handed to the native
//     layer as a nil pointer with length 0.
//   - mimeType: MIME type of the content.
//   - config: extraction configuration. It is serialized to JSON and rebuilt
//     as a native handle that is released before this function returns.
//
// Returns an ExtractionResult decoded from the JSON emitted by the native layer.
//
// An error is returned if config cannot be marshaled or turned into a native
// handle, if the native layer reports an error (upstream documents
// `KreuzbergError::Validation` for an invalid MIME type and
// `KreuzbergError::UnsupportedFormat` for an unsupported one), if no result
// handle is produced, or if the result cannot be converted to or decoded from
// JSON.
//
// Example:
//
//	result, err := ExtractBytes([]byte("Hello, world!"), "text/plain", config)
func ExtractBytes(content []byte, mimeType string, config ExtractionConfig) (*ExtractionResult, error) {
	var cContent *C.uint8_t
	if len(content) > 0 {
		// Pin the first element so the address handed to C stays valid until
		// Unpin runs at function exit.
		var cContentPinner runtime.Pinner
		cContentPinner.Pin(&content[0])
		defer cContentPinner.Unpin()
		cContent = (*C.uint8_t)(unsafe.Pointer(&content[0]))
	}
	cContentLen := C.uintptr_t(len(content))
	cMimeType := C.CString(mimeType)
	defer C.free(unsafe.Pointer(cMimeType))

	configJSON, err := json.Marshal(config)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal: %w", err)
	}
	// The FFI's _from_json rejects a JSON "null"; send "{}" instead so a
	// default instance is constructed.
	if string(configJSON) == "null" {
		configJSON = []byte("{}")
	}
	cConfigJSON := C.CString(string(configJSON))
	cConfig := C.kreuzberg_extraction_config_from_json(cConfigJSON)
	C.free(unsafe.Pointer(cConfigJSON))
	if cConfig == nil {
		return nil, fmt.Errorf("failed to create extraction_config: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_extraction_config_free(cConfig)

	ptr := C.kreuzberg_extract_bytes(cContent, cContentLen, cMimeType, cConfig)
	if err := lastError(); err != nil {
		if ptr != nil {
			C.kreuzberg_extraction_result_free(ptr)
		}
		return nil, err
	}
	// Guard against a nil handle with no recorded error so nil is never
	// passed on to the free/to_json calls below.
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_extraction_result_free(ptr)

	jsonPtr := C.kreuzberg_extraction_result_to_json(ptr)
	if jsonPtr == nil {
		return nil, fmt.Errorf("failed to convert to JSON")
	}
	defer C.kreuzberg_free_string(jsonPtr)

	var result ExtractionResult
	if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return &result, nil
}
// ExtractFile extracts content from a file.
//
// This is the main entry point for file-based extraction. Per the upstream
// library documentation, the native call performs the following steps:
//  1. Check cache for existing result (if caching enabled)
//  2. Detect or validate MIME type
//  3. Select appropriate extractor from registry
//  4. Extract content
//  5. Run post-processing pipeline
//  6. Store result in cache (if caching enabled)
//
// Arguments:
//   - path: path to the file to extract.
//   - mimeType: optional MIME type override. A nil pointer is forwarded as a
//     nil C string, which upstream documents as "auto-detect".
//   - config: extraction configuration. It is serialized to JSON and rebuilt
//     as a native handle that is released before this function returns.
//
// Returns an ExtractionResult decoded from the JSON emitted by the native layer.
//
// An error is returned if config cannot be marshaled or turned into a native
// handle, if the native layer reports an error (upstream documents
// `KreuzbergError::Io` when the file doesn't exist or for other file I/O
// errors, and `KreuzbergError::UnsupportedFormat` for an unsupported MIME
// type), if no result handle is produced, or if the result cannot be
// converted to or decoded from JSON.
//
// Example:
//
//	result, err := ExtractFile("document.pdf", nil, config)
func ExtractFile(path string, mimeType *string, config ExtractionConfig) (*ExtractionResult, error) {
	cPath := C.CString(path)
	defer C.free(unsafe.Pointer(cPath))
	var cMimeType *C.char
	if mimeType != nil {
		cMimeType = C.CString(*mimeType)
		defer C.free(unsafe.Pointer(cMimeType))
	}

	configJSON, err := json.Marshal(config)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal: %w", err)
	}
	// The FFI's _from_json rejects a JSON "null"; send "{}" instead so a
	// default instance is constructed.
	if string(configJSON) == "null" {
		configJSON = []byte("{}")
	}
	cConfigJSON := C.CString(string(configJSON))
	cConfig := C.kreuzberg_extraction_config_from_json(cConfigJSON)
	C.free(unsafe.Pointer(cConfigJSON))
	if cConfig == nil {
		return nil, fmt.Errorf("failed to create extraction_config: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_extraction_config_free(cConfig)

	ptr := C.kreuzberg_extract_file(cPath, cMimeType, cConfig)
	if err := lastError(); err != nil {
		if ptr != nil {
			C.kreuzberg_extraction_result_free(ptr)
		}
		return nil, err
	}
	// Guard against a nil handle with no recorded error so nil is never
	// passed on to the free/to_json calls below.
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_extraction_result_free(ptr)

	jsonPtr := C.kreuzberg_extraction_result_to_json(ptr)
	if jsonPtr == nil {
		return nil, fmt.Errorf("failed to convert to JSON")
	}
	defer C.kreuzberg_free_string(jsonPtr)

	var result ExtractionResult
	if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return &result, nil
}
// ExtractFileSync is the synchronous wrapper for `extract_file`.
//
// Per the upstream library documentation, the native function blocks the
// calling thread until extraction completes and always uses the global Tokio
// runtime (avoiding nested-runtime issues and the cost of creating a runtime
// per call). Upstream only provides it with the `tokio-runtime` feature.
//
// Arguments:
//   - path: path to the file to extract.
//   - mimeType: optional MIME type override. A nil pointer is forwarded as a
//     nil C string.
//   - config: extraction configuration. It is serialized to JSON and rebuilt
//     as a native handle that is released before this function returns.
//
// Returns an ExtractionResult decoded from the JSON emitted by the native layer.
//
// An error is returned if config cannot be marshaled or turned into a native
// handle, if the native layer reports an error, if no result handle is
// produced, or if the result cannot be converted to or decoded from JSON.
//
// Example:
//
//	result, err := ExtractFileSync("document.pdf", nil, config)
func ExtractFileSync(path string, mimeType *string, config ExtractionConfig) (*ExtractionResult, error) {
	cPath := C.CString(path)
	defer C.free(unsafe.Pointer(cPath))
	var cMimeType *C.char
	if mimeType != nil {
		cMimeType = C.CString(*mimeType)
		defer C.free(unsafe.Pointer(cMimeType))
	}

	configJSON, err := json.Marshal(config)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal: %w", err)
	}
	// The FFI's _from_json rejects a JSON "null"; send "{}" instead so a
	// default instance is constructed.
	if string(configJSON) == "null" {
		configJSON = []byte("{}")
	}
	cConfigJSON := C.CString(string(configJSON))
	cConfig := C.kreuzberg_extraction_config_from_json(cConfigJSON)
	C.free(unsafe.Pointer(cConfigJSON))
	if cConfig == nil {
		return nil, fmt.Errorf("failed to create extraction_config: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_extraction_config_free(cConfig)

	ptr := C.kreuzberg_extract_file_sync(cPath, cMimeType, cConfig)
	if err := lastError(); err != nil {
		if ptr != nil {
			C.kreuzberg_extraction_result_free(ptr)
		}
		return nil, err
	}
	// Guard against a nil handle with no recorded error so nil is never
	// passed on to the free/to_json calls below.
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_extraction_result_free(ptr)

	jsonPtr := C.kreuzberg_extraction_result_to_json(ptr)
	if jsonPtr == nil {
		return nil, fmt.Errorf("failed to convert to JSON")
	}
	defer C.kreuzberg_free_string(jsonPtr)

	var result ExtractionResult
	if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return &result, nil
}
// ExtractBytesSync is the synchronous wrapper for `extract_bytes`.
//
// Per the upstream library documentation, with the `tokio-runtime` feature
// the native function blocks the calling thread using the global Tokio
// runtime (instead of creating a runtime per call); without it (WASM) it
// calls a truly synchronous implementation.
//
// Arguments:
//   - content: the bytes to extract. An empty slice is handed to the native
//     layer as a nil pointer with length 0.
//   - mimeType: MIME type of the content.
//   - config: extraction configuration. It is serialized to JSON and rebuilt
//     as a native handle that is released before this function returns.
//
// Returns an ExtractionResult decoded from the JSON emitted by the native layer.
//
// An error is returned if config cannot be marshaled or turned into a native
// handle, if the native layer reports an error, if no result handle is
// produced, or if the result cannot be converted to or decoded from JSON.
//
// Example:
//
//	result, err := ExtractBytesSync([]byte("Hello, world!"), "text/plain", config)
func ExtractBytesSync(content []byte, mimeType string, config ExtractionConfig) (*ExtractionResult, error) {
	var cContent *C.uint8_t
	if len(content) > 0 {
		// Pin the first element so the address handed to C stays valid until
		// Unpin runs at function exit.
		var cContentPinner runtime.Pinner
		cContentPinner.Pin(&content[0])
		defer cContentPinner.Unpin()
		cContent = (*C.uint8_t)(unsafe.Pointer(&content[0]))
	}
	cContentLen := C.uintptr_t(len(content))
	cMimeType := C.CString(mimeType)
	defer C.free(unsafe.Pointer(cMimeType))

	configJSON, err := json.Marshal(config)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal: %w", err)
	}
	// The FFI's _from_json rejects a JSON "null"; send "{}" instead so a
	// default instance is constructed.
	if string(configJSON) == "null" {
		configJSON = []byte("{}")
	}
	cConfigJSON := C.CString(string(configJSON))
	cConfig := C.kreuzberg_extraction_config_from_json(cConfigJSON)
	C.free(unsafe.Pointer(cConfigJSON))
	if cConfig == nil {
		return nil, fmt.Errorf("failed to create extraction_config: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_extraction_config_free(cConfig)

	ptr := C.kreuzberg_extract_bytes_sync(cContent, cContentLen, cMimeType, cConfig)
	if err := lastError(); err != nil {
		if ptr != nil {
			C.kreuzberg_extraction_result_free(ptr)
		}
		return nil, err
	}
	// Guard against a nil handle with no recorded error so nil is never
	// passed on to the free/to_json calls below.
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_extraction_result_free(ptr)

	jsonPtr := C.kreuzberg_extraction_result_to_json(ptr)
	if jsonPtr == nil {
		return nil, fmt.Errorf("failed to convert to JSON")
	}
	defer C.kreuzberg_free_string(jsonPtr)

	var result ExtractionResult
	if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return &result, nil
}
// BatchExtractFilesSync is the synchronous wrapper for `batch_extract_files`.
//
// Per the upstream library documentation, the native function uses the global
// Tokio runtime and is only available with `tokio-runtime` (WASM has no
// filesystem).
//
// Arguments:
//   - items: the files to extract, each optionally carrying per-file
//     configuration overrides. The slice is sent to the native layer as a
//     JSON array; a nil slice is sent as an empty array.
//   - config: batch-level extraction configuration. It is serialized to JSON
//     and rebuilt as a native handle that is released before this function
//     returns.
//
// Returns the extraction results decoded from the JSON emitted by the native
// layer.
//
// An error is returned if items or config cannot be marshaled, if config
// cannot be turned into a native handle, if the native layer reports an error
// or returns no result, or if the result JSON cannot be decoded.
//
// Example:
//
//	results, err := BatchExtractFilesSync(items, config)
func BatchExtractFilesSync(items []BatchFileItem, config ExtractionConfig) ([]ExtractionResult, error) {
	// A nil slice marshals to "null"; send an empty array instead so nil and
	// empty inputs behave the same. NOTE(review): the native parser presumably
	// rejects "null" here as it does for config — confirm.
	if items == nil {
		items = []BatchFileItem{}
	}
	itemsJSON, err := json.Marshal(items)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal: %w", err)
	}
	cItems := C.CString(string(itemsJSON))
	defer C.free(unsafe.Pointer(cItems))

	configJSON, err := json.Marshal(config)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal: %w", err)
	}
	// The FFI's _from_json rejects a JSON "null"; send "{}" instead so a
	// default instance is constructed.
	if string(configJSON) == "null" {
		configJSON = []byte("{}")
	}
	cConfigJSON := C.CString(string(configJSON))
	cConfig := C.kreuzberg_extraction_config_from_json(cConfigJSON)
	C.free(unsafe.Pointer(cConfigJSON))
	if cConfig == nil {
		return nil, fmt.Errorf("failed to create extraction_config: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_extraction_config_free(cConfig)

	ptr := C.kreuzberg_batch_extract_files_sync(cItems, cConfig)
	if err := lastError(); err != nil {
		// Release any string handed back alongside the error so it is not leaked.
		if ptr != nil {
			C.kreuzberg_free_string(ptr)
		}
		return nil, err
	}
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_free_string(ptr)

	var result []ExtractionResult
	if err := json.Unmarshal([]byte(C.GoString(ptr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return result, nil
}
// BatchExtractBytesSync is the synchronous wrapper for `batch_extract_bytes`.
//
// Per the upstream library documentation, with the `tokio-runtime` feature
// the native function blocks the calling thread using the global Tokio
// runtime; without it (WASM) it calls a truly synchronous implementation that
// iterates through items and calls `extract_bytes_sync()`.
//
// Arguments:
//   - items: the byte payloads to extract, each with its MIME type and
//     optional per-item configuration overrides. The slice is sent to the
//     native layer as a JSON array; a nil slice is sent as an empty array.
//   - config: batch-level extraction configuration. It is serialized to JSON
//     and rebuilt as a native handle that is released before this function
//     returns.
//
// Returns the extraction results decoded from the JSON emitted by the native
// layer.
//
// An error is returned if items or config cannot be marshaled, if config
// cannot be turned into a native handle, if the native layer reports an error
// or returns no result, or if the result JSON cannot be decoded.
//
// Example:
//
//	results, err := BatchExtractBytesSync(items, config)
func BatchExtractBytesSync(items []BatchBytesItem, config ExtractionConfig) ([]ExtractionResult, error) {
	// A nil slice marshals to "null"; send an empty array instead so nil and
	// empty inputs behave the same. NOTE(review): the native parser presumably
	// rejects "null" here as it does for config — confirm.
	if items == nil {
		items = []BatchBytesItem{}
	}
	itemsJSON, err := json.Marshal(items)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal: %w", err)
	}
	cItems := C.CString(string(itemsJSON))
	defer C.free(unsafe.Pointer(cItems))

	configJSON, err := json.Marshal(config)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal: %w", err)
	}
	// The FFI's _from_json rejects a JSON "null"; send "{}" instead so a
	// default instance is constructed.
	if string(configJSON) == "null" {
		configJSON = []byte("{}")
	}
	cConfigJSON := C.CString(string(configJSON))
	cConfig := C.kreuzberg_extraction_config_from_json(cConfigJSON)
	C.free(unsafe.Pointer(cConfigJSON))
	if cConfig == nil {
		return nil, fmt.Errorf("failed to create extraction_config: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_extraction_config_free(cConfig)

	ptr := C.kreuzberg_batch_extract_bytes_sync(cItems, cConfig)
	if err := lastError(); err != nil {
		// Release any string handed back alongside the error so it is not leaked.
		if ptr != nil {
			C.kreuzberg_free_string(ptr)
		}
		return nil, err
	}
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_free_string(ptr)

	var result []ExtractionResult
	if err := json.Unmarshal([]byte(C.GoString(ptr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return result, nil
}
// BatchExtractFiles extracts content from multiple files concurrently.
//
// Per the upstream library documentation, the native function processes
// multiple files in parallel, automatically managing concurrency to prevent
// resource exhaustion. The concurrency limit can be configured via
// `ExtractionConfig::max_concurrent_extractions` or defaults to
// `(num_cpus * 1.5).ceil()`.
//
// Each file can optionally specify a FileExtractionConfig that overrides
// specific fields from the batch-level config; leave it unset for a file to
// use the batch defaults. Batch-level settings like
// `max_concurrent_extractions` and `use_cache` are always taken from the
// batch-level config.
//
// Arguments:
//   - items: BatchFileItem values, each containing a path and optional
//     per-file configuration overrides. The slice is sent to the native layer
//     as a JSON array; a nil slice is sent as an empty array.
//   - config: batch-level extraction configuration (provides defaults and
//     batch settings). It is serialized to JSON and rebuilt as a native
//     handle that is released before this function returns.
//
// Returns the extraction results, which upstream documents as being in the
// same order as the input items.
//
// Upstream documents that individual file errors are captured in the result
// metadata, while system errors (IO, RuntimeError equivalents) fail the
// entire batch. In addition, an error is returned if items or config cannot
// be marshaled, if config cannot be turned into a native handle, if the
// native layer returns no result, or if the result JSON cannot be decoded.
//
// Example:
//
//	results, err := BatchExtractFiles(items, config)
//	if err != nil {
//		return err
//	}
//	fmt.Printf("Processed %d files\n", len(results))
func BatchExtractFiles(items []BatchFileItem, config ExtractionConfig) ([]ExtractionResult, error) {
	// A nil slice marshals to "null"; send an empty array instead so nil and
	// empty inputs behave the same. NOTE(review): the native parser presumably
	// rejects "null" here as it does for config — confirm.
	if items == nil {
		items = []BatchFileItem{}
	}
	itemsJSON, err := json.Marshal(items)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal: %w", err)
	}
	cItems := C.CString(string(itemsJSON))
	defer C.free(unsafe.Pointer(cItems))

	configJSON, err := json.Marshal(config)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal: %w", err)
	}
	// The FFI's _from_json rejects a JSON "null"; send "{}" instead so a
	// default instance is constructed.
	if string(configJSON) == "null" {
		configJSON = []byte("{}")
	}
	cConfigJSON := C.CString(string(configJSON))
	cConfig := C.kreuzberg_extraction_config_from_json(cConfigJSON)
	C.free(unsafe.Pointer(cConfigJSON))
	if cConfig == nil {
		return nil, fmt.Errorf("failed to create extraction_config: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_extraction_config_free(cConfig)

	ptr := C.kreuzberg_batch_extract_files(cItems, cConfig)
	if err := lastError(); err != nil {
		// Release any string handed back alongside the error so it is not leaked.
		if ptr != nil {
			C.kreuzberg_free_string(ptr)
		}
		return nil, err
	}
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_free_string(ptr)

	var result []ExtractionResult
	if err := json.Unmarshal([]byte(C.GoString(ptr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return result, nil
}
// BatchExtractBytes extracts content from multiple byte payloads concurrently.
//
// Per the upstream library documentation, the native function processes
// multiple byte arrays in parallel, automatically managing concurrency to
// prevent resource exhaustion. The concurrency limit can be configured via
// `ExtractionConfig::max_concurrent_extractions` or defaults to
// `(num_cpus * 1.5).ceil()`.
//
// Each item can optionally specify a FileExtractionConfig that overrides
// specific fields from the batch-level config; leave it unset to use the
// batch-level defaults for that item.
//
// Arguments:
//   - items: BatchBytesItem values, each containing content bytes, MIME type,
//     and optional per-item configuration overrides. The slice is sent to the
//     native layer as a JSON array; a nil slice is sent as an empty array.
//   - config: batch-level extraction configuration. It is serialized to JSON
//     and rebuilt as a native handle that is released before this function
//     returns.
//
// Returns the extraction results, which upstream documents as being in the
// same order as the input items.
//
// An error is returned if items or config cannot be marshaled, if config
// cannot be turned into a native handle, if the native layer reports an error
// or returns no result, or if the result JSON cannot be decoded.
//
// Example:
//
//	results, err := BatchExtractBytes(items, config)
//	if err != nil {
//		return err
//	}
//	fmt.Printf("Processed %d items\n", len(results))
func BatchExtractBytes(items []BatchBytesItem, config ExtractionConfig) ([]ExtractionResult, error) {
	// A nil slice marshals to "null"; send an empty array instead so nil and
	// empty inputs behave the same. NOTE(review): the native parser presumably
	// rejects "null" here as it does for config — confirm.
	if items == nil {
		items = []BatchBytesItem{}
	}
	itemsJSON, err := json.Marshal(items)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal: %w", err)
	}
	cItems := C.CString(string(itemsJSON))
	defer C.free(unsafe.Pointer(cItems))

	configJSON, err := json.Marshal(config)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal: %w", err)
	}
	// The FFI's _from_json rejects a JSON "null"; send "{}" instead so a
	// default instance is constructed.
	if string(configJSON) == "null" {
		configJSON = []byte("{}")
	}
	cConfigJSON := C.CString(string(configJSON))
	cConfig := C.kreuzberg_extraction_config_from_json(cConfigJSON)
	C.free(unsafe.Pointer(cConfigJSON))
	if cConfig == nil {
		return nil, fmt.Errorf("failed to create extraction_config: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_extraction_config_free(cConfig)

	ptr := C.kreuzberg_batch_extract_bytes(cItems, cConfig)
	if err := lastError(); err != nil {
		// Release any string handed back alongside the error so it is not leaked.
		if ptr != nil {
			C.kreuzberg_free_string(ptr)
		}
		return nil, err
	}
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_free_string(ptr)

	var result []ExtractionResult
	if err := json.Unmarshal([]byte(C.GoString(ptr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return result, nil
}
// DetectMimeTypeFromBytes detects the MIME type from raw file bytes.
//
// Per the upstream library documentation, the native function uses magic byte
// signatures to detect the file type from content and falls back to the
// `infer` crate for comprehensive detection. For ZIP-based files it inspects
// the contents to distinguish Office Open XML formats (DOCX, XLSX, PPTX) from
// plain ZIP archives.
//
// Arguments:
//   - content: raw file bytes. An empty slice is handed to the native layer
//     as a nil pointer with length 0.
//
// Returns the detected MIME type string.
//
// An error is returned if the native layer reports one (upstream documents
// `KreuzbergError::UnsupportedFormat` when the MIME type cannot be
// determined) or if it returns no result.
func DetectMimeTypeFromBytes(content []byte) (string, error) {
	var cContent *C.uint8_t
	if len(content) > 0 {
		// Pin the first element so the address handed to C stays valid until
		// Unpin runs at function exit.
		var cContentPinner runtime.Pinner
		cContentPinner.Pin(&content[0])
		defer cContentPinner.Unpin()
		cContent = (*C.uint8_t)(unsafe.Pointer(&content[0]))
	}
	cContentLen := C.uintptr_t(len(content))

	ptr := C.kreuzberg_detect_mime_type_from_bytes(cContent, cContentLen)
	if err := lastError(); err != nil {
		if ptr != nil {
			C.kreuzberg_free_string(ptr)
		}
		return "", err
	}
	// A nil string with no recorded error would otherwise be reported as a
	// successful, empty MIME type.
	if ptr == nil {
		return "", fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_free_string(ptr)
	return C.GoString(ptr), nil
}
// GetExtensionsForMime returns the file extensions for a given MIME type.
//
// Per the upstream library documentation, the result contains all known file
// extensions (without leading dot) that map to the specified MIME type; for
// example "application/pdf" maps to ["pdf"].
//
// Arguments:
//   - mimeType: the MIME type to look up.
//
// The native layer returns the extensions as a JSON array, which is decoded
// into a []string.
//
// An error is returned if the native layer reports one or returns no result,
// or if the result JSON cannot be decoded.
//
// Example:
//
//	extensions, err := GetExtensionsForMime("application/pdf")
func GetExtensionsForMime(mimeType string) ([]string, error) {
	cMimeType := C.CString(mimeType)
	defer C.free(unsafe.Pointer(cMimeType))

	ptr := C.kreuzberg_get_extensions_for_mime(cMimeType)
	if err := lastError(); err != nil {
		// Release any string handed back alongside the error so it is not leaked.
		if ptr != nil {
			C.kreuzberg_free_string(ptr)
		}
		return nil, err
	}
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_free_string(ptr)

	var result []string
	if err := json.Unmarshal([]byte(C.GoString(ptr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return result, nil
}
// ListEmbeddingBackends lists the names of all registered embedding backends.
//
// Upstream notes that this is used by `kreuzberg-cli`, the api/mcp endpoints,
// and generated language bindings.
//
// The native layer returns the names as a JSON array, which is decoded into a
// []string. An error is returned if the native layer reports one or returns
// no result, or if the result JSON cannot be decoded.
func ListEmbeddingBackends() ([]string, error) {
	ptr := C.kreuzberg_list_embedding_backends()
	if err := lastError(); err != nil {
		// Release any string handed back alongside the error so it is not leaked.
		if ptr != nil {
			C.kreuzberg_free_string(ptr)
		}
		return nil, err
	}
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_free_string(ptr)

	var result []string
	if err := json.Unmarshal([]byte(C.GoString(ptr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return result, nil
}
// ListDocumentExtractors lists the names of all registered document extractors.
//
// The native layer returns the names as a JSON array, which is decoded into a
// []string. An error is returned if the native layer reports one or returns
// no result, or if the result JSON cannot be decoded.
func ListDocumentExtractors() ([]string, error) {
	ptr := C.kreuzberg_list_document_extractors()
	if err := lastError(); err != nil {
		// Release any string handed back alongside the error so it is not leaked.
		if ptr != nil {
			C.kreuzberg_free_string(ptr)
		}
		return nil, err
	}
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_free_string(ptr)

	var result []string
	if err := json.Unmarshal([]byte(C.GoString(ptr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return result, nil
}
// ListOcrBackends lists all registered OCR backends.
//
// Per the upstream library documentation, the result holds the names of all
// OCR backends currently registered in the global registry.
//
// The native layer returns the names as a JSON array, which is decoded into a
// []string. An error is returned if the native layer reports one or returns
// no result, or if the result JSON cannot be decoded.
//
// Example:
//
//	backends, err := ListOcrBackends()
//	if err != nil {
//		return err
//	}
//	for _, name := range backends {
//		fmt.Println("Registered OCR backend:", name)
//	}
func ListOcrBackends() ([]string, error) {
	ptr := C.kreuzberg_list_ocr_backends()
	if err := lastError(); err != nil {
		// Release any string handed back alongside the error so it is not leaked.
		if ptr != nil {
			C.kreuzberg_free_string(ptr)
		}
		return nil, err
	}
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_free_string(ptr)

	var result []string
	if err := json.Unmarshal([]byte(C.GoString(ptr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return result, nil
}
// ListPostProcessors lists all registered post-processor names.
//
// Per the upstream library documentation, the result holds the names of all
// post-processors currently registered in the global registry, and the native
// call fails if the registry lock is poisoned.
//
// The native layer returns the names as a JSON array, which is decoded into a
// []string. An error is returned if the native layer reports one or returns
// no result, or if the result JSON cannot be decoded.
//
// Example:
//
//	processors, err := ListPostProcessors()
//	if err != nil {
//		return err
//	}
//	for _, name := range processors {
//		fmt.Println("Registered post-processor:", name)
//	}
func ListPostProcessors() ([]string, error) {
	ptr := C.kreuzberg_list_post_processors()
	if err := lastError(); err != nil {
		// Release any string handed back alongside the error so it is not leaked.
		if ptr != nil {
			C.kreuzberg_free_string(ptr)
		}
		return nil, err
	}
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_free_string(ptr)

	var result []string
	if err := json.Unmarshal([]byte(C.GoString(ptr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return result, nil
}
// ListRenderers lists the names of all registered renderers.
//
// The native layer returns the names as a JSON array, which is decoded into a
// []string. An error is returned if the native layer reports one (upstream
// documents a poisoned registry lock as a cause) or returns no result, or if
// the result JSON cannot be decoded.
func ListRenderers() ([]string, error) {
	ptr := C.kreuzberg_list_renderers()
	if err := lastError(); err != nil {
		// Release any string handed back alongside the error so it is not leaked.
		if ptr != nil {
			C.kreuzberg_free_string(ptr)
		}
		return nil, err
	}
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_free_string(ptr)

	var result []string
	if err := json.Unmarshal([]byte(C.GoString(ptr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return result, nil
}
// ListValidators lists the names of all registered validators.
//
// The native layer returns the names as a JSON array, which is decoded into a
// []string. An error is returned if the native layer reports one or returns
// no result, or if the result JSON cannot be decoded.
func ListValidators() ([]string, error) {
	ptr := C.kreuzberg_list_validators()
	if err := lastError(); err != nil {
		// Release any string handed back alongside the error so it is not leaked.
		if ptr != nil {
			C.kreuzberg_free_string(ptr)
		}
		return nil, err
	}
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_free_string(ptr)

	var result []string
	if err := json.Unmarshal([]byte(C.GoString(ptr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return result, nil
}
// Compare compares two extraction results and returns a structured diff.
//
// Per the upstream library documentation, the comparison is purely structural
// (no I/O, no side effects) and all fields of ExtractionDiff are populated
// according to the provided DiffOptions.
//
// Arguments:
//   - a: the "before" extraction result.
//   - b: the "after" extraction result.
//   - opts: controls which sections are compared and optional truncation.
//
// Each argument is serialized to JSON and rebuilt as a native handle that is
// released before this function returns. The diff is decoded from the JSON
// emitted by the native layer.
//
// An error is returned if any argument cannot be marshaled or turned into a
// native handle, if the native layer returns no diff, or if the diff cannot
// be converted to or decoded from JSON.
//
// Example:
//
//	diff, err := Compare(before, after, opts)
func Compare(a ExtractionResult, b ExtractionResult, opts DiffOptions) (*ExtractionDiff, error) {
	aJSON, err := json.Marshal(a)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal: %w", err)
	}
	// The FFI's _from_json rejects a JSON "null"; send "{}" instead so a
	// default instance is constructed. The same applies to b and opts below.
	if string(aJSON) == "null" {
		aJSON = []byte("{}")
	}
	cAJSON := C.CString(string(aJSON))
	ca := C.kreuzberg_extraction_result_from_json(cAJSON)
	C.free(unsafe.Pointer(cAJSON))
	if ca == nil {
		return nil, fmt.Errorf("failed to create extraction_result: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_extraction_result_free(ca)

	bJSON, err := json.Marshal(b)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal: %w", err)
	}
	if string(bJSON) == "null" {
		bJSON = []byte("{}")
	}
	cBJSON := C.CString(string(bJSON))
	cb := C.kreuzberg_extraction_result_from_json(cBJSON)
	C.free(unsafe.Pointer(cBJSON))
	if cb == nil {
		return nil, fmt.Errorf("failed to create extraction_result: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_extraction_result_free(cb)

	optsJSON, err := json.Marshal(opts)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal: %w", err)
	}
	if string(optsJSON) == "null" {
		optsJSON = []byte("{}")
	}
	cOptsJSON := C.CString(string(optsJSON))
	cOpts := C.kreuzberg_diff_options_from_json(cOptsJSON)
	C.free(unsafe.Pointer(cOptsJSON))
	if cOpts == nil {
		return nil, fmt.Errorf("failed to create diff_options: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_diff_options_free(cOpts)

	ptr := C.kreuzberg_compare(ca, cb, cOpts)
	if ptr == nil {
		// A nil handle means the comparison produced nothing. Surface the
		// FFI error if one is recorded rather than passing nil on to the
		// free/to_json calls.
		if err := lastError(); err != nil {
			return nil, err
		}
		return nil, fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_extraction_diff_free(ptr)

	jsonPtr := C.kreuzberg_extraction_diff_to_json(ptr)
	if jsonPtr == nil {
		return nil, fmt.Errorf("failed to convert to JSON")
	}
	defer C.kreuzberg_free_string(jsonPtr)

	var result ExtractionDiff
	if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return &result, nil
}
// EmbedTextsAsync generates embeddings for a list of text strings using the
// asynchronous native implementation.
//
// It is the counterpart to EmbedTexts. Upstream, the blocking ONNX inference
// work is offloaded to a dedicated blocking thread pool via Tokio's
// `spawn_blocking`, keeping the async executor free. The Go call itself
// returns only once the native function has produced its result.
//
// Returns one embedding vector per input text in the same order.
//
// Arguments:
//   - texts: strings to embed
//   - config: embedding configuration specifying model, batch size, and normalization
//
// Errors are returned when:
//   - `KreuzbergError::MissingDependency`: ONNX Runtime is not installed
//   - `KreuzbergError::Embedding`: the preset name is unknown, model download
//     fails, or the blocking inference task panics
//
// Example (upstream Rust API):
//
//	let embeddings = embed_texts_async(
//	    vec!["Hello!".to_string()],
//	    &EmbeddingConfig::default(),
//	).await?;
func EmbedTextsAsync(texts []string, config EmbeddingConfig) ([][]float32, error) {
	jsonBytescTexts, err := json.Marshal(texts)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal: %w", err)
	}
	// A nil slice marshals to "null"; send an empty JSON array instead so the
	// payload is always a sequence (presumably what the native side expects —
	// TODO confirm).
	if string(jsonBytescTexts) == "null" {
		jsonBytescTexts = []byte("[]")
	}
	cTexts := C.CString(string(jsonBytescTexts))
	defer C.free(unsafe.Pointer(cTexts))

	jsonBytescConfig, err := json.Marshal(config)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal: %w", err)
	}
	// json.Marshal emits "null" for nil values, which the FFI's _from_json
	// rejects. Substitute "{}" so a default instance is constructed instead.
	if string(jsonBytescConfig) == "null" {
		jsonBytescConfig = []byte("{}")
	}
	tmpStrcConfig := C.CString(string(jsonBytescConfig))
	cConfig := C.kreuzberg_embedding_config_from_json(tmpStrcConfig)
	C.free(unsafe.Pointer(tmpStrcConfig))
	if cConfig == nil {
		return nil, fmt.Errorf("failed to create embedding_config: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_embedding_config_free(cConfig)

	ptr := C.kreuzberg_embed_texts_async(cTexts, cConfig)
	if err := lastError(); err != nil {
		// Release a result that was handed back alongside an error so it
		// does not leak.
		if ptr != nil {
			C.kreuzberg_free_string(ptr)
		}
		return nil, err
	}
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_free_string(ptr)

	var result [][]float32
	if err := json.Unmarshal([]byte(C.GoString(ptr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return result, nil
}
// RenderPdfPageToPng renders a single PDF page to PNG bytes.
//
// Returns raw PNG-encoded bytes for the specified page at the given DPI.
// Uses pdf_oxide with tiny-skia for pure-Rust rendering.
//
// Arguments:
//   - pdfBytes: raw PDF file bytes
//   - pageIndex: zero-based page index
//   - dpi: resolution in dots per inch; nil requests the default (documented
//     upstream as 150)
//   - password: optional password for encrypted PDFs; nil means none
//
// An error (upstream `KreuzbergError::Parsing`) is returned if the PDF cannot
// be opened, authenticated, or rendered, or if pageIndex is out of range.
func RenderPdfPageToPng(pdfBytes []byte, pageIndex uint, dpi *int32, password *string) ([]byte, error) {
	var cPdfBytes *C.uint8_t
	if len(pdfBytes) > 0 {
		// Pin the backing array so its address can be handed to C for the
		// duration of the call without copying the input.
		var cPdfBytesPinner runtime.Pinner
		cPdfBytesPinner.Pin(&pdfBytes[0])
		defer cPdfBytesPinner.Unpin()
		cPdfBytes = (*C.uint8_t)(unsafe.Pointer(&pdfBytes[0]))
	}
	cPdfBytesLen := C.uintptr_t(len(pdfBytes))
	cPageIndex := C.size_t(uint(pageIndex))

	// MaxInt32 is sent when no DPI is supplied; presumably the native side
	// treats it as "use the default" — TODO confirm.
	var cDpi C.int32_t = C.int32_t(int32(2147483647))
	if dpi != nil {
		cDpi = C.int32_t(int32(*dpi))
	}
	var cPassword *C.char
	if password != nil {
		cPassword = C.CString(*password)
		defer C.free(unsafe.Pointer(cPassword))
	}

	var outPtr *C.uint8_t
	var outLen, outCap C.uintptr_t
	rc := C.kreuzberg_render_pdf_page_to_png(cPdfBytes, cPdfBytesLen, cPageIndex, cDpi, cPassword, &outPtr, &outLen, &outCap)
	if rc != 0 || outPtr == nil {
		if err := lastError(); err != nil {
			return nil, err
		}
		// Never report success with a nil payload: fall back to a generic
		// error when the native layer recorded none.
		return nil, fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_free_bytes(outPtr, outLen, outCap)

	// Copy through unsafe.Slice rather than C.GoBytes: GoBytes takes a C.int
	// length, which would truncate buffers larger than MaxInt32 bytes.
	result := make([]byte, int(outLen))
	copy(result, unsafe.Slice((*byte)(unsafe.Pointer(outPtr)), int(outLen)))
	return result, nil
}
// DetectMimeType detects the MIME type of the file at the given path.
//
// Detection uses the file extension and optionally the file content.
// Pass checkExists as true to verify that the file exists before detection.
func DetectMimeType(path string, checkExists bool) (string, error) {
	nativePath := C.CString(path)
	defer C.free(unsafe.Pointer(nativePath))

	// Booleans cross the FFI as int32: 1 for true, 0 for false.
	existsFlag := C.int32_t(0)
	if checkExists {
		existsFlag = 1
	}

	mime := C.kreuzberg_detect_mime_type(nativePath, existsFlag)
	err := lastError()
	if err == nil {
		defer C.kreuzberg_free_string(mime)
		return C.GoString(mime), nil
	}
	// The call failed; release any string that was still handed back.
	if mime != nil {
		C.kreuzberg_free_string(mime)
	}
	return "", err
}
// EmbedTexts embeds a list of texts using the configured embedding model.
//
// Returns a two-dimensional slice where each inner slice is the embedding for
// the corresponding text.
func EmbedTexts(texts []string, config EmbeddingConfig) ([][]float32, error) {
	jsonBytescTexts, err := json.Marshal(texts)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal: %w", err)
	}
	// A nil slice marshals to "null"; send an empty JSON array instead so the
	// payload is always a sequence (presumably what the native side expects —
	// TODO confirm).
	if string(jsonBytescTexts) == "null" {
		jsonBytescTexts = []byte("[]")
	}
	cTexts := C.CString(string(jsonBytescTexts))
	defer C.free(unsafe.Pointer(cTexts))

	jsonBytescConfig, err := json.Marshal(config)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal: %w", err)
	}
	// json.Marshal emits "null" for nil values, which the FFI's _from_json
	// rejects. Substitute "{}" so a default instance is constructed instead.
	if string(jsonBytescConfig) == "null" {
		jsonBytescConfig = []byte("{}")
	}
	tmpStrcConfig := C.CString(string(jsonBytescConfig))
	cConfig := C.kreuzberg_embedding_config_from_json(tmpStrcConfig)
	C.free(unsafe.Pointer(tmpStrcConfig))
	if cConfig == nil {
		return nil, fmt.Errorf("failed to create embedding_config: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_embedding_config_free(cConfig)

	ptr := C.kreuzberg_embed_texts(cTexts, cConfig)
	if err := lastError(); err != nil {
		// Release a result that was handed back alongside an error so it
		// does not leak.
		if ptr != nil {
			C.kreuzberg_free_string(ptr)
		}
		return nil, err
	}
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_free_string(ptr)

	var result [][]float32
	if err := json.Unmarshal([]byte(C.GoString(ptr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return result, nil
}
// GetEmbeddingPreset looks up an embedding preset by name.
//
// Returns nil if no preset with the given name exists, or if the preset
// cannot be serialized or decoded. The returned value is an independent Go
// copy decoded from the native preset's JSON form.
func GetEmbeddingPreset(name string) *EmbeddingPreset {
	cName := C.CString(name)
	defer C.free(unsafe.Pointer(cName))

	ptr := C.kreuzberg_get_embedding_preset(cName)
	// NULL means "no such preset"; return early instead of passing NULL to
	// the serializer.
	if ptr == nil {
		return nil
	}
	// NOTE(review): this handle is never released, whereas other
	// handle-returning calls in this file defer a matching *_free — confirm
	// whether the preset handle should be freed here.
	jsonPtr := C.kreuzberg_embedding_preset_to_json(ptr)
	if jsonPtr == nil {
		return nil
	}
	defer C.kreuzberg_free_string(jsonPtr)

	var result EmbeddingPreset
	if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
		return nil
	}
	return &result
}
// ListEmbeddingPresets lists the names of all available embedding presets.
//
// The names are decoded from a JSON array produced by the native library.
// Returns nil if the native call yields no string or the JSON cannot be
// decoded.
func ListEmbeddingPresets() []string {
	raw := C.kreuzberg_list_embedding_presets()
	if raw == nil {
		return nil
	}
	defer C.kreuzberg_free_string(raw)

	var names []string
	if decodeErr := json.Unmarshal([]byte(C.GoString(raw)), &names); decodeErr != nil {
		return nil
	}
	return names
}
// NeedsImageProcessing reports whether image processing is needed, based on
// the OCR and image extraction settings.
//
// Returns true if either OCR is enabled or image extraction is configured,
// indicating that image decompression and processing should occur.
// Returns false if both are disabled, allowing text-only extraction workflows
// to skip unnecessary image decompression.
//
// # Optimization Impact
//
// For text-only extractions (no OCR, no image extraction), skipping image
// decompression can improve CPU utilization by 5-10% by avoiding wasteful
// image I/O and processing when results won't be used.
func (r *ExtractionConfig) NeedsImageProcessing() (bool, error) {
	payload, err := json.Marshal(r)
	if err != nil {
		return false, fmt.Errorf("failed to marshal receiver: %w", err)
	}

	// Rebuild the receiver as a native handle from its JSON form; the
	// temporary C string is only needed for the constructor call.
	rawJSON := C.CString(string(payload))
	handle := C.kreuzberg_extraction_config_from_json(rawJSON)
	C.free(unsafe.Pointer(rawJSON))
	if handle == nil {
		return false, fmt.Errorf("failed to create receiver: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_extraction_config_free(handle)

	if C.kreuzberg_extraction_config_needs_image_processing(handle) == 0 {
		return false, nil
	}
	return true, nil
}
// ListenAddr returns the server listen address (host:port).
//
// An error is returned if the receiver cannot be marshalled, the native
// handle cannot be created, or the native call yields no string.
//
// Example (upstream Rust API):
//
//	let config = ServerConfig::default();
//	assert_eq!(config.listen_addr(), "127.0.0.1:8000");
func (r *ServerConfig) ListenAddr() (string, error) {
	jsonBytesRecv, err := json.Marshal(r)
	if err != nil {
		return "", fmt.Errorf("failed to marshal receiver: %w", err)
	}
	// Rebuild the receiver as a native handle from its JSON form.
	tmpStrRecv := C.CString(string(jsonBytesRecv))
	cRecv := C.kreuzberg_server_config_from_json(tmpStrRecv)
	C.free(unsafe.Pointer(tmpStrRecv))
	if cRecv == nil {
		return "", fmt.Errorf("failed to create receiver: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_server_config_free(cRecv)

	ptr := C.kreuzberg_server_config_listen_addr(cRecv)
	// A NULL string is a failure; do not report it as an empty address.
	if ptr == nil {
		return "", fmt.Errorf("failed to get result: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_free_string(ptr)
	return C.GoString(ptr), nil
}
// CorsAllowsAll reports whether CORS allows all origins.
//
// Returns true if the `cors_origins` list is empty, meaning all origins are
// allowed. Returns false if specific origins are configured.
//
// Example (upstream Rust API):
//
//	let mut config = ServerConfig::default();
//	assert!(config.cors_allows_all());
//
//	config.cors_origins.push("https://example.com".to_string());
//	assert!(!config.cors_allows_all());
func (r *ServerConfig) CorsAllowsAll() (bool, error) {
	payload, err := json.Marshal(r)
	if err != nil {
		return false, fmt.Errorf("failed to marshal receiver: %w", err)
	}

	// Rebuild the receiver as a native handle from its JSON form.
	rawJSON := C.CString(string(payload))
	handle := C.kreuzberg_server_config_from_json(rawJSON)
	C.free(unsafe.Pointer(rawJSON))
	if handle == nil {
		return false, fmt.Errorf("failed to create receiver: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_server_config_free(handle)

	// The native call answers with an integer; any non-zero value is true.
	allowsAll := C.kreuzberg_server_config_cors_allows_all(handle) != 0
	return allowsAll, nil
}
// IsOriginAllowed reports whether the given origin is allowed by the CORS
// configuration.
//
// Returns true if:
//   - CORS allows all origins (empty origins list), or
//   - the given origin is in the allowed origins list
//
// Arguments:
//   - origin: the origin to check (e.g., "https://example.com")
//
// Example (upstream Rust API):
//
//	let mut config = ServerConfig::default();
//	assert!(config.is_origin_allowed("https://example.com"));
//
//	config.cors_origins.push("https://allowed.com".to_string());
//	assert!(config.is_origin_allowed("https://allowed.com"));
//	assert!(!config.is_origin_allowed("https://denied.com"));
func (r *ServerConfig) IsOriginAllowed(origin string) (bool, error) {
	nativeOrigin := C.CString(origin)
	defer C.free(unsafe.Pointer(nativeOrigin))

	payload, err := json.Marshal(r)
	if err != nil {
		return false, fmt.Errorf("failed to marshal receiver: %w", err)
	}

	// Rebuild the receiver as a native handle from its JSON form.
	rawJSON := C.CString(string(payload))
	handle := C.kreuzberg_server_config_from_json(rawJSON)
	C.free(unsafe.Pointer(rawJSON))
	if handle == nil {
		return false, fmt.Errorf("failed to create receiver: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_server_config_free(handle)

	// The native call answers with an integer; any non-zero value is true.
	if C.kreuzberg_server_config_is_origin_allowed(handle, nativeOrigin) == 0 {
		return false, nil
	}
	return true, nil
}
// MaxRequestBodyMb returns the maximum request body size in megabytes
// (rounded up).
//
// Example (upstream Rust API):
//
//	let mut config = ServerConfig::default();
//	assert_eq!(config.max_request_body_mb(), 100);
func (r *ServerConfig) MaxRequestBodyMb() (uint, error) {
	payload, err := json.Marshal(r)
	if err != nil {
		return 0, fmt.Errorf("failed to marshal receiver: %w", err)
	}

	// Rebuild the receiver as a native handle from its JSON form.
	rawJSON := C.CString(string(payload))
	handle := C.kreuzberg_server_config_from_json(rawJSON)
	C.free(unsafe.Pointer(rawJSON))
	if handle == nil {
		return 0, fmt.Errorf("failed to create receiver: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_server_config_free(handle)

	megabytes := C.kreuzberg_server_config_max_request_body_mb(handle)
	return uint(megabytes), nil
}
// MaxMultipartFieldMb returns the maximum multipart field size in megabytes
// (rounded up).
//
// Example (upstream Rust API):
//
//	let mut config = ServerConfig::default();
//	assert_eq!(config.max_multipart_field_mb(), 100);
func (r *ServerConfig) MaxMultipartFieldMb() (uint, error) {
	payload, err := json.Marshal(r)
	if err != nil {
		return 0, fmt.Errorf("failed to marshal receiver: %w", err)
	}

	// Rebuild the receiver as a native handle from its JSON form.
	rawJSON := C.CString(string(payload))
	handle := C.kreuzberg_server_config_from_json(rawJSON)
	C.free(unsafe.Pointer(rawJSON))
	if handle == nil {
		return 0, fmt.Errorf("failed to create receiver: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_server_config_free(handle)

	megabytes := C.kreuzberg_server_config_max_multipart_field_mb(handle)
	return uint(megabytes), nil
}
// FinalizeNodeTypes computes and populates the `node_types` field from the
// current `nodes`.
//
// Call this after all nodes have been added to the structure. Internal
// construction paths (builder, derivation) call this automatically.
//
// The receiver is sent to the native library as JSON, finalized there, and
// the updated JSON is decoded back into the receiver. An error is returned if
// any step of that round trip fails.
//
// Example (upstream Rust API):
//
//	let mut structure = DocumentStructure {
//	    nodes: vec![DocumentNode {
//	        id: NodeId::from("n1"),
//	        content: NodeContent::Paragraph { text: "Hello".into() },
//	        parent: None,
//	        children: vec![],
//	        content_layer: Default::default(),
//	        page: None,
//	        page_end: None,
//	        bbox: None,
//	        annotations: vec![],
//	        attributes: None,
//	    }],
//	    source_format: None,
//	    relationships: vec![],
//	    node_types: vec![],
//	};
//	structure.finalize_node_types();
//	assert!(structure.node_types.contains(&"paragraph".to_string()));
func (r *DocumentStructure) FinalizeNodeTypes() error {
	jsonBytesRecv, err := json.Marshal(r)
	if err != nil {
		return fmt.Errorf("failed to marshal receiver: %w", err)
	}
	// Rebuild the receiver as a native handle from its JSON form.
	tmpStrRecv := C.CString(string(jsonBytesRecv))
	cRecv := C.kreuzberg_document_structure_from_json(tmpStrRecv)
	C.free(unsafe.Pointer(tmpStrRecv))
	if cRecv == nil {
		return fmt.Errorf("failed to create receiver: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_document_structure_free(cRecv)

	C.kreuzberg_document_structure_finalize_node_types(cRecv)

	// Copy the mutated native state back into the Go receiver. A failure here
	// means the receiver was not updated, so it is reported to the caller
	// rather than being dropped.
	jsonPtrUpdated := C.kreuzberg_document_structure_to_json(cRecv)
	if jsonPtrUpdated == nil {
		return fmt.Errorf("failed to convert to JSON: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_free_string(jsonPtrUpdated)
	if err := json.Unmarshal([]byte(C.GoString(jsonPtrUpdated)), r); err != nil {
		return fmt.Errorf("failed to unmarshal: %w", err)
	}
	return nil
}
// IsEmpty reports whether the document structure is empty.
func (r *DocumentStructure) IsEmpty() (bool, error) {
	payload, err := json.Marshal(r)
	if err != nil {
		return false, fmt.Errorf("failed to marshal receiver: %w", err)
	}

	// Rebuild the receiver as a native handle from its JSON form.
	rawJSON := C.CString(string(payload))
	handle := C.kreuzberg_document_structure_from_json(rawJSON)
	C.free(unsafe.Pointer(rawJSON))
	if handle == nil {
		return false, fmt.Errorf("failed to create receiver: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_document_structure_free(handle)

	// The native call answers with an integer; any non-zero value is true.
	empty := C.kreuzberg_document_structure_is_empty(handle) != 0
	return empty, nil
}
// ExtractionResultFromOcr converts an OCR result into an ExtractionResult.
//
// On success the returned pointer is non-nil. An error is returned if the
// argument cannot be marshalled, the native conversion yields no result, or
// the result cannot be serialized and decoded back into Go.
func ExtractionResultFromOcr(ocr OcrExtractionResult) (*ExtractionResult, error) {
	jsonBytescOcr, err := json.Marshal(ocr)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal: %w", err)
	}
	// json.Marshal emits "null" for nil values, which the FFI's _from_json
	// rejects. Substitute "{}" so a default instance is constructed instead.
	if string(jsonBytescOcr) == "null" {
		jsonBytescOcr = []byte("{}")
	}
	tmpStrcOcr := C.CString(string(jsonBytescOcr))
	cOcr := C.kreuzberg_ocr_extraction_result_from_json(tmpStrcOcr)
	C.free(unsafe.Pointer(tmpStrcOcr))
	if cOcr == nil {
		return nil, fmt.Errorf("failed to create ocr_extraction_result: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_ocr_extraction_result_free(cOcr)

	ptr := C.kreuzberg_extraction_result_from_ocr(cOcr)
	// A NULL handle is a failure; report it instead of returning (nil, nil).
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_extraction_result_free(ptr)

	jsonPtr := C.kreuzberg_extraction_result_to_json(ptr)
	if jsonPtr == nil {
		return nil, fmt.Errorf("failed to convert to JSON: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_free_string(jsonPtr)

	var result ExtractionResult
	if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return &result, nil
}
// IsEmpty reports whether the metadata is empty: it returns true when no
// metadata fields, format-specific metadata, or additional postprocessor
// fields are populated.
func (r *Metadata) IsEmpty() (bool, error) {
	payload, err := json.Marshal(r)
	if err != nil {
		return false, fmt.Errorf("failed to marshal receiver: %w", err)
	}

	// Rebuild the receiver as a native handle from its JSON form.
	rawJSON := C.CString(string(payload))
	handle := C.kreuzberg_metadata_from_json(rawJSON)
	C.free(unsafe.Pointer(rawJSON))
	if handle == nil {
		return false, fmt.Errorf("failed to create receiver: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_metadata_free(handle)

	// The native call answers with an integer; any non-zero value is true.
	empty := C.kreuzberg_metadata_is_empty(handle) != 0
	return empty, nil
}
// WithCacheDir returns a copy of the configuration with a custom cache
// directory for model files. The receiver is not modified.
//
// Arguments:
//   - path: path to the cache directory
//
// On success the returned pointer is non-nil; every failure is reported
// through the error.
//
// Example (upstream Rust API):
//
//	let config = PaddleOcrConfig::new("en")
//	    .with_cache_dir(PathBuf::from("/tmp/paddle-cache"));
func (r *PaddleOcrConfig) WithCacheDir(path string) (*PaddleOcrConfig, error) {
	cPath := C.CString(path)
	defer C.free(unsafe.Pointer(cPath))

	jsonBytesRecv, err := json.Marshal(r)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal receiver: %w", err)
	}
	// Rebuild the receiver as a native handle from its JSON form.
	tmpStrRecv := C.CString(string(jsonBytesRecv))
	cRecv := C.kreuzberg_paddle_ocr_config_from_json(tmpStrRecv)
	C.free(unsafe.Pointer(tmpStrRecv))
	if cRecv == nil {
		return nil, fmt.Errorf("failed to create receiver: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_paddle_ocr_config_free(cRecv)

	ptr := C.kreuzberg_paddle_ocr_config_with_cache_dir(cRecv, cPath)
	// A NULL handle is a failure; report it instead of returning (nil, nil).
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_paddle_ocr_config_free(ptr)

	jsonPtr := C.kreuzberg_paddle_ocr_config_to_json(ptr)
	if jsonPtr == nil {
		return nil, fmt.Errorf("failed to convert to JSON: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_free_string(jsonPtr)

	var result PaddleOcrConfig
	if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return &result, nil
}
// WithTableDetection returns a copy of the configuration with table structure
// detection enabled or disabled. The receiver is not modified.
//
// Arguments:
//   - enable: whether to enable table detection
//
// On success the returned pointer is non-nil; every failure is reported
// through the error.
//
// Example (upstream Rust API):
//
//	let config = PaddleOcrConfig::new("en")
//	    .with_table_detection(true);
func (r *PaddleOcrConfig) WithTableDetection(enable bool) (*PaddleOcrConfig, error) {
	// Booleans cross the FFI as int32: 1 for true, 0 for false.
	var cEnable C.int32_t
	if enable {
		cEnable = 1
	}

	jsonBytesRecv, err := json.Marshal(r)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal receiver: %w", err)
	}
	// Rebuild the receiver as a native handle from its JSON form.
	tmpStrRecv := C.CString(string(jsonBytesRecv))
	cRecv := C.kreuzberg_paddle_ocr_config_from_json(tmpStrRecv)
	C.free(unsafe.Pointer(tmpStrRecv))
	if cRecv == nil {
		return nil, fmt.Errorf("failed to create receiver: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_paddle_ocr_config_free(cRecv)

	ptr := C.kreuzberg_paddle_ocr_config_with_table_detection(cRecv, cEnable)
	// A NULL handle is a failure; report it instead of returning (nil, nil).
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_paddle_ocr_config_free(ptr)

	jsonPtr := C.kreuzberg_paddle_ocr_config_to_json(ptr)
	if jsonPtr == nil {
		return nil, fmt.Errorf("failed to convert to JSON: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_free_string(jsonPtr)

	var result PaddleOcrConfig
	if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return &result, nil
}
// WithAngleCls returns a copy of the configuration with angle classification
// for rotated text enabled or disabled. The receiver is not modified.
//
// Arguments:
//   - enable: whether to enable angle classification
//
// On success the returned pointer is non-nil; every failure is reported
// through the error.
func (r *PaddleOcrConfig) WithAngleCls(enable bool) (*PaddleOcrConfig, error) {
	// Booleans cross the FFI as int32: 1 for true, 0 for false.
	var cEnable C.int32_t
	if enable {
		cEnable = 1
	}

	jsonBytesRecv, err := json.Marshal(r)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal receiver: %w", err)
	}
	// Rebuild the receiver as a native handle from its JSON form.
	tmpStrRecv := C.CString(string(jsonBytesRecv))
	cRecv := C.kreuzberg_paddle_ocr_config_from_json(tmpStrRecv)
	C.free(unsafe.Pointer(tmpStrRecv))
	if cRecv == nil {
		return nil, fmt.Errorf("failed to create receiver: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_paddle_ocr_config_free(cRecv)

	ptr := C.kreuzberg_paddle_ocr_config_with_angle_cls(cRecv, cEnable)
	// A NULL handle is a failure; report it instead of returning (nil, nil).
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_paddle_ocr_config_free(ptr)

	jsonPtr := C.kreuzberg_paddle_ocr_config_to_json(ptr)
	if jsonPtr == nil {
		return nil, fmt.Errorf("failed to convert to JSON: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_free_string(jsonPtr)

	var result PaddleOcrConfig
	if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return &result, nil
}
// WithDetDbThresh returns a copy of the configuration with the database
// threshold for text detection set. The receiver is not modified.
//
// Arguments:
//   - threshold: detection threshold (0.0-1.0)
//
// On success the returned pointer is non-nil; every failure is reported
// through the error.
func (r *PaddleOcrConfig) WithDetDbThresh(threshold float32) (*PaddleOcrConfig, error) {
	cThreshold := C.float(float32(threshold))

	jsonBytesRecv, err := json.Marshal(r)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal receiver: %w", err)
	}
	// Rebuild the receiver as a native handle from its JSON form.
	tmpStrRecv := C.CString(string(jsonBytesRecv))
	cRecv := C.kreuzberg_paddle_ocr_config_from_json(tmpStrRecv)
	C.free(unsafe.Pointer(tmpStrRecv))
	if cRecv == nil {
		return nil, fmt.Errorf("failed to create receiver: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_paddle_ocr_config_free(cRecv)

	ptr := C.kreuzberg_paddle_ocr_config_with_det_db_thresh(cRecv, cThreshold)
	// A NULL handle is a failure; report it instead of returning (nil, nil).
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_paddle_ocr_config_free(ptr)

	jsonPtr := C.kreuzberg_paddle_ocr_config_to_json(ptr)
	if jsonPtr == nil {
		return nil, fmt.Errorf("failed to convert to JSON: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_free_string(jsonPtr)

	var result PaddleOcrConfig
	if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return &result, nil
}
// WithDetDbBoxThresh returns a copy of the configuration with the box
// threshold for text bounding box refinement set. The receiver is not
// modified.
//
// Arguments:
//   - threshold: box threshold (0.0-1.0)
//
// On success the returned pointer is non-nil; every failure is reported
// through the error.
func (r *PaddleOcrConfig) WithDetDbBoxThresh(threshold float32) (*PaddleOcrConfig, error) {
	cThreshold := C.float(float32(threshold))

	jsonBytesRecv, err := json.Marshal(r)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal receiver: %w", err)
	}
	// Rebuild the receiver as a native handle from its JSON form.
	tmpStrRecv := C.CString(string(jsonBytesRecv))
	cRecv := C.kreuzberg_paddle_ocr_config_from_json(tmpStrRecv)
	C.free(unsafe.Pointer(tmpStrRecv))
	if cRecv == nil {
		return nil, fmt.Errorf("failed to create receiver: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_paddle_ocr_config_free(cRecv)

	ptr := C.kreuzberg_paddle_ocr_config_with_det_db_box_thresh(cRecv, cThreshold)
	// A NULL handle is a failure; report it instead of returning (nil, nil).
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_paddle_ocr_config_free(ptr)

	jsonPtr := C.kreuzberg_paddle_ocr_config_to_json(ptr)
	if jsonPtr == nil {
		return nil, fmt.Errorf("failed to convert to JSON: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_free_string(jsonPtr)

	var result PaddleOcrConfig
	if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return &result, nil
}
// WithDetDbUnclipRatio returns a copy of the configuration with the unclip
// ratio for expanding text bounding boxes set. The receiver is not modified.
//
// Arguments:
//   - ratio: unclip ratio (typically 1.5-2.0)
//
// On success the returned pointer is non-nil; every failure is reported
// through the error.
func (r *PaddleOcrConfig) WithDetDbUnclipRatio(ratio float32) (*PaddleOcrConfig, error) {
	cRatio := C.float(float32(ratio))

	jsonBytesRecv, err := json.Marshal(r)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal receiver: %w", err)
	}
	// Rebuild the receiver as a native handle from its JSON form.
	tmpStrRecv := C.CString(string(jsonBytesRecv))
	cRecv := C.kreuzberg_paddle_ocr_config_from_json(tmpStrRecv)
	C.free(unsafe.Pointer(tmpStrRecv))
	if cRecv == nil {
		return nil, fmt.Errorf("failed to create receiver: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_paddle_ocr_config_free(cRecv)

	ptr := C.kreuzberg_paddle_ocr_config_with_det_db_unclip_ratio(cRecv, cRatio)
	// A NULL handle is a failure; report it instead of returning (nil, nil).
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_paddle_ocr_config_free(ptr)

	jsonPtr := C.kreuzberg_paddle_ocr_config_to_json(ptr)
	if jsonPtr == nil {
		return nil, fmt.Errorf("failed to convert to JSON: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_free_string(jsonPtr)

	var result PaddleOcrConfig
	if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return &result, nil
}
// WithDetLimitSideLen returns a copy of the configuration with the maximum
// side length for detection images set. The receiver is not modified.
//
// Arguments:
//   - length: maximum side length in pixels
//
// On success the returned pointer is non-nil; every failure is reported
// through the error.
func (r *PaddleOcrConfig) WithDetLimitSideLen(length uint32) (*PaddleOcrConfig, error) {
	cLength := C.uint32_t(uint32(length))

	jsonBytesRecv, err := json.Marshal(r)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal receiver: %w", err)
	}
	// Rebuild the receiver as a native handle from its JSON form.
	tmpStrRecv := C.CString(string(jsonBytesRecv))
	cRecv := C.kreuzberg_paddle_ocr_config_from_json(tmpStrRecv)
	C.free(unsafe.Pointer(tmpStrRecv))
	if cRecv == nil {
		return nil, fmt.Errorf("failed to create receiver: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_paddle_ocr_config_free(cRecv)

	ptr := C.kreuzberg_paddle_ocr_config_with_det_limit_side_len(cRecv, cLength)
	// A NULL handle is a failure; report it instead of returning (nil, nil).
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_paddle_ocr_config_free(ptr)

	jsonPtr := C.kreuzberg_paddle_ocr_config_to_json(ptr)
	if jsonPtr == nil {
		return nil, fmt.Errorf("failed to convert to JSON: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_free_string(jsonPtr)

	var result PaddleOcrConfig
	if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return &result, nil
}
// WithRecBatchNum returns a copy of the configuration with the batch size for
// recognition inference set. The receiver is not modified.
//
// Arguments:
//   - batchSize: number of text regions to process simultaneously
//
// On success the returned pointer is non-nil; every failure is reported
// through the error.
func (r *PaddleOcrConfig) WithRecBatchNum(batchSize uint32) (*PaddleOcrConfig, error) {
	cBatchSize := C.uint32_t(uint32(batchSize))

	jsonBytesRecv, err := json.Marshal(r)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal receiver: %w", err)
	}
	// Rebuild the receiver as a native handle from its JSON form.
	tmpStrRecv := C.CString(string(jsonBytesRecv))
	cRecv := C.kreuzberg_paddle_ocr_config_from_json(tmpStrRecv)
	C.free(unsafe.Pointer(tmpStrRecv))
	if cRecv == nil {
		return nil, fmt.Errorf("failed to create receiver: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_paddle_ocr_config_free(cRecv)

	ptr := C.kreuzberg_paddle_ocr_config_with_rec_batch_num(cRecv, cBatchSize)
	// A NULL handle is a failure; report it instead of returning (nil, nil).
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_paddle_ocr_config_free(ptr)

	jsonPtr := C.kreuzberg_paddle_ocr_config_to_json(ptr)
	if jsonPtr == nil {
		return nil, fmt.Errorf("failed to convert to JSON: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_free_string(jsonPtr)

	var result PaddleOcrConfig
	if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return &result, nil
}
// WithDropScore returns a copy of the configuration with the minimum
// recognition confidence threshold set. The receiver is not modified.
//
// Arguments:
//   - score: minimum confidence (0.0-1.0); text below this is dropped
//
// On success the returned pointer is non-nil; every failure is reported
// through the error.
func (r *PaddleOcrConfig) WithDropScore(score float32) (*PaddleOcrConfig, error) {
	cScore := C.float(float32(score))

	jsonBytesRecv, err := json.Marshal(r)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal receiver: %w", err)
	}
	// Rebuild the receiver as a native handle from its JSON form.
	tmpStrRecv := C.CString(string(jsonBytesRecv))
	cRecv := C.kreuzberg_paddle_ocr_config_from_json(tmpStrRecv)
	C.free(unsafe.Pointer(tmpStrRecv))
	if cRecv == nil {
		return nil, fmt.Errorf("failed to create receiver: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_paddle_ocr_config_free(cRecv)

	ptr := C.kreuzberg_paddle_ocr_config_with_drop_score(cRecv, cScore)
	// A NULL handle is a failure; report it instead of returning (nil, nil).
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_paddle_ocr_config_free(ptr)

	jsonPtr := C.kreuzberg_paddle_ocr_config_to_json(ptr)
	if jsonPtr == nil {
		return nil, fmt.Errorf("failed to convert to JSON: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_free_string(jsonPtr)

	var result PaddleOcrConfig
	if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return &result, nil
}
// WithPadding returns a copy of the configuration with the padding, in
// pixels, that is added around images before detection. The receiver is not
// modified.
//
// Arguments:
//   - padding: padding in pixels (0-100)
//
// On success the returned pointer is non-nil; every failure is reported
// through the error.
func (r *PaddleOcrConfig) WithPadding(padding uint32) (*PaddleOcrConfig, error) {
	cPadding := C.uint32_t(uint32(padding))

	jsonBytesRecv, err := json.Marshal(r)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal receiver: %w", err)
	}
	// Rebuild the receiver as a native handle from its JSON form.
	tmpStrRecv := C.CString(string(jsonBytesRecv))
	cRecv := C.kreuzberg_paddle_ocr_config_from_json(tmpStrRecv)
	C.free(unsafe.Pointer(tmpStrRecv))
	if cRecv == nil {
		return nil, fmt.Errorf("failed to create receiver: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_paddle_ocr_config_free(cRecv)

	ptr := C.kreuzberg_paddle_ocr_config_with_padding(cRecv, cPadding)
	// A NULL handle is a failure; report it instead of returning (nil, nil).
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_paddle_ocr_config_free(ptr)

	jsonPtr := C.kreuzberg_paddle_ocr_config_to_json(ptr)
	if jsonPtr == nil {
		return nil, fmt.Errorf("failed to convert to JSON: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_free_string(jsonPtr)

	var result PaddleOcrConfig
	if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return &result, nil
}
// WithModelTier returns a copy of the configuration with the model tier,
// which controls detection/recognition model size, set. The receiver is not
// modified.
//
// Arguments:
//   - tier: `"mobile"` (default, lightweight, faster) or `"server"` (high
//     accuracy, GPU/complex documents)
//
// On success the returned pointer is non-nil; every failure is reported
// through the error.
func (r *PaddleOcrConfig) WithModelTier(tier string) (*PaddleOcrConfig, error) {
	cTier := C.CString(tier)
	defer C.free(unsafe.Pointer(cTier))

	jsonBytesRecv, err := json.Marshal(r)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal receiver: %w", err)
	}
	// Rebuild the receiver as a native handle from its JSON form.
	tmpStrRecv := C.CString(string(jsonBytesRecv))
	cRecv := C.kreuzberg_paddle_ocr_config_from_json(tmpStrRecv)
	C.free(unsafe.Pointer(tmpStrRecv))
	if cRecv == nil {
		return nil, fmt.Errorf("failed to create receiver: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_paddle_ocr_config_free(cRecv)

	ptr := C.kreuzberg_paddle_ocr_config_with_model_tier(cRecv, cTier)
	// A NULL handle is a failure; report it instead of returning (nil, nil).
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_paddle_ocr_config_free(ptr)

	jsonPtr := C.kreuzberg_paddle_ocr_config_to_json(ptr)
	if jsonPtr == nil {
		return nil, fmt.Errorf("failed to convert to JSON: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_free_string(jsonPtr)

	var result PaddleOcrConfig
	if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return &result, nil
}