Files
fil/packages/go/v5/binding.go

7848 lines
289 KiB
Go
Raw Permalink Normal View History

2026-06-01 23:40:55 +02:00
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
// Package kreuzberg provides Go bindings for the kreuzberg library.
package kreuzberg
/*
#cgo CFLAGS: -I${SRCDIR}/include
#cgo darwin,arm64 LDFLAGS: -L${SRCDIR}/.lib/macos-arm64 -Wl,-rpath,${SRCDIR}/.lib/macos-arm64 -lkreuzberg_ffi
#cgo darwin,amd64 LDFLAGS: -L${SRCDIR}/.lib/macos-amd64 -Wl,-rpath,${SRCDIR}/.lib/macos-amd64 -lkreuzberg_ffi
#cgo linux,amd64 LDFLAGS: -L${SRCDIR}/.lib/linux-amd64 -Wl,-rpath,${SRCDIR}/.lib/linux-amd64 -lkreuzberg_ffi
#cgo linux,arm64 LDFLAGS: -L${SRCDIR}/.lib/linux-arm64 -Wl,-rpath,${SRCDIR}/.lib/linux-arm64 -lkreuzberg_ffi
#cgo windows,amd64 LDFLAGS: -L${SRCDIR}/.lib/windows-amd64 -lkreuzberg_ffi
#include "kreuzberg.h"
*/
import "C"
import (
"encoding/json"
"errors"
"fmt"
"runtime"
"unsafe"
)
// lastError reads the most recent error recorded by the FFI layer.
//
// It returns nil when the native error code is 0. Otherwise it returns an
// error formatted as "[code] message", where message is the native error
// context string, or the literal "native error" when no context is available.
func lastError() error {
	code := int32(C.kreuzberg_last_error_code())
	if code == 0 {
		return nil
	}
	// Prefer the native context string when the FFI layer supplies one.
	if ctx := C.kreuzberg_last_error_context(); ctx != nil {
		return fmt.Errorf("[%d] %s", code, C.GoString(ctx))
	}
	return fmt.Errorf("[%d] native error", code)
}
// unmarshalBytes copies a C byte buffer into a freshly allocated Go []byte.
//
// The buffer is read as a NUL-terminated C string, so the copy stops at the
// first NUL byte. Binary payloads that may contain interior NULs should be
// exposed by the FFI with an explicit length out-parameter instead.
// A nil pointer yields a nil slice.
func unmarshalBytes(ptr *C.uint8_t) []byte {
	if ptr == nil {
		return nil
	}
	cstr := (*C.char)(unsafe.Pointer(ptr))
	return []byte(C.GoString(cstr))
}
// Ptr returns a pointer to a copy of v.
//
// It lets data DTOs populate optional (pointer) fields inline, without the
// functional-options pattern boilerplate. For example:
//
//	&MyStruct{Field: Ptr("value"), OtherField: Ptr(42)}
func Ptr[T any](v T) *T {
	p := new(T)
	*p = v
	return p
}
// Sentinel error values, one per error category. Each is a distinct value
// created with errors.New, so callers can compare against them with errors.Is.
var (
// ErrIo is the sentinel error for an IO error.
ErrIo = errors.New("IO error")
// ErrParsing is the sentinel error for a parsing error.
ErrParsing = errors.New("parsing error")
// ErrOcr is the sentinel error for an OCR error.
ErrOcr = errors.New("OCR error")
// ErrValidation is the sentinel error for a validation error.
ErrValidation = errors.New("validation error")
// ErrCache is the sentinel error for a cache error.
ErrCache = errors.New("cache error")
// ErrImageProcessing is the sentinel error for an image processing error.
ErrImageProcessing = errors.New("image processing error")
// ErrSerialization is the sentinel error for a serialization error.
ErrSerialization = errors.New("serialization error")
// ErrMissingDependency is the sentinel error for a missing dependency.
ErrMissingDependency = errors.New("missing dependency")
// ErrPlugin is the sentinel error for a plugin error.
// NOTE(review): the message ends in a dangling "in"; it looks like a
// placeholder (presumably the plugin name) was dropped during generation — confirm.
ErrPlugin = errors.New("plugin error in")
// ErrLockPoisoned is the sentinel error for a poisoned lock.
ErrLockPoisoned = errors.New("lock poisoned")
// ErrUnsupportedFormat is the sentinel error for an unsupported format.
ErrUnsupportedFormat = errors.New("unsupported format")
// ErrEmbedding is the sentinel error for an embedding error.
ErrEmbedding = errors.New("embedding error")
// ErrTimeout is the sentinel error for an extraction that timed out.
// NOTE(review): the message reads as if the elapsed and limit values were
// stripped from a format template ("after ms (limit: ms)") — confirm.
ErrTimeout = errors.New("extraction timed out after ms (limit: ms)")
// ErrCancelled is the sentinel error for a cancelled extraction.
ErrCancelled = errors.New("extraction cancelled")
// ErrSecurity is the sentinel error for a security violation.
ErrSecurity = errors.New("security violation")
// ErrOther is the sentinel error for any other error category.
ErrOther = errors.New("other")
)
// Error is a structured error type that pairs a code with a message.
type Error struct {
	// Code is the error code, kept as a string.
	Code string
	// Message is the text returned by the Error method.
	Message string
}

// Error implements the error interface. Only Message is returned; Code is
// not included in the resulting string.
func (e Error) Error() string {
	msg := e.Message
	return msg
}
// ExecutionProviderType enumerates the execution providers below as string
// constants.
type ExecutionProviderType string
const (
// ExecutionProviderTypeAuto auto-selects a provider: CoreML on macOS, CUDA on Linux, CPU elsewhere.
ExecutionProviderTypeAuto ExecutionProviderType = "auto"
// ExecutionProviderTypeCPU is the CPU execution provider (always available).
ExecutionProviderTypeCPU ExecutionProviderType = "cpu"
// ExecutionProviderTypeCoreMl is Apple CoreML (macOS/iOS Neural Engine + GPU).
ExecutionProviderTypeCoreMl ExecutionProviderType = "core_ml"
// ExecutionProviderTypeCuda is NVIDIA CUDA GPU acceleration.
ExecutionProviderTypeCuda ExecutionProviderType = "cuda"
// ExecutionProviderTypeTensorRt is NVIDIA TensorRT (optimized CUDA inference).
ExecutionProviderTypeTensorRt ExecutionProviderType = "tensor_rt"
)
// OutputFormat is the output format for extraction results.
//
// Controls the format of the `content` field in `ExtractionResult`.
// When set to `Markdown`, `Djot`, or `Html`, the output uses that format.
// `Plain` returns the raw extracted text.
// `Structured` returns JSON with full OCR element data including bounding
// boxes and confidence scores.
type OutputFormat string
const (
// OutputFormatPlain is plain text content only (default).
OutputFormatPlain OutputFormat = "plain"
// OutputFormatMarkdown is Markdown format.
OutputFormatMarkdown OutputFormat = "markdown"
// OutputFormatDjot is Djot markup format.
OutputFormatDjot OutputFormat = "djot"
// OutputFormatHTML is HTML format.
OutputFormatHTML OutputFormat = "html"
// OutputFormatJSON is JSON tree format with heading-driven sections.
OutputFormatJSON OutputFormat = "json"
// OutputFormatStructured is structured JSON format with full OCR element metadata.
OutputFormatStructured OutputFormat = "structured"
)
// HTMLTheme enumerates the HTML themes below as string constants.
type HTMLTheme string
const (
// HTMLThemeDefault uses sensible defaults: system font stack, neutral colours,
// readable line measure. CSS custom properties (`--kb-*`) are all defined so
// user CSS can override individual values.
HTMLThemeDefault HTMLTheme = "default"
// HTMLThemeGitHub is a GitHub Markdown-inspired palette and spacing.
HTMLThemeGitHub HTMLTheme = "git_hub"
// HTMLThemeDark is a dark background with light text.
HTMLThemeDark HTMLTheme = "dark"
// HTMLThemeLight is a minimal light theme with generous whitespace.
HTMLThemeLight HTMLTheme = "light"
// HTMLThemeUnstyled emits no built-in stylesheet. CSS custom properties are
// still defined on `:root` so user stylesheets can reference `var(--kb-*)` tokens.
HTMLThemeUnstyled HTMLTheme = "unstyled"
)
// TableModel enumerates the table structure models below as string constants.
type TableModel string
const (
// TableModelTatr is TATR (Table Transformer) -- default, 30MB, DETR-based row/column detection.
TableModelTatr TableModel = "tatr"
// TableModelSlanetWired is the SLANeXT wired variant -- 365MB, optimized for bordered tables.
TableModelSlanetWired TableModel = "slanet_wired"
// TableModelSlanetWireless is the SLANeXT wireless variant -- 365MB, optimized for borderless tables.
TableModelSlanetWireless TableModel = "slanet_wireless"
// TableModelSlanetPlus is SLANet-plus -- 7.78MB, lightweight general-purpose.
TableModelSlanetPlus TableModel = "slanet_plus"
// TableModelSlanetAuto is classifier-routed SLANeXT: auto-select wired/wireless per table.
// Uses PP-LCNet classifier (6.78MB) + both SLANeXT variants (730MB total).
TableModelSlanetAuto TableModel = "slanet_auto"
// TableModelDisabled disables table structure model inference entirely; the heuristic path only is used.
TableModelDisabled TableModel = "disabled"
)
// ChunkerType enumerates the chunker kinds below as string constants.
type ChunkerType string
const (
// ChunkerTypeText is the Text variant of ChunkerType.
ChunkerTypeText ChunkerType = "text"
// ChunkerTypeMarkdown is the Markdown variant of ChunkerType.
ChunkerTypeMarkdown ChunkerType = "markdown"
// ChunkerTypeYaml is the Yaml variant of ChunkerType.
ChunkerTypeYaml ChunkerType = "yaml"
// ChunkerTypeSemantic is the Semantic variant of ChunkerType.
ChunkerTypeSemantic ChunkerType = "semantic"
)
// ChunkSizing describes how chunk size is measured.
//
// Defaults to `Characters` (Unicode character count). When using token-based sizing,
// chunks are sized by token count according to the specified tokenizer.
//
// Token-based sizing uses HuggingFace tokenizers loaded at runtime. Any tokenizer
// available on HuggingFace Hub can be used, including OpenAI-compatible tokenizers
// (e.g., `Xenova/gpt-4o`, `Xenova/cl100k_base`).
//
// Variants: Characters, Tokenizer.
//
// Sealed interface — use one of ChunkSizingCharacters, ChunkSizingTokenizer.
type ChunkSizing interface {
// isChunkSizing is unexported, so only types declared in this package can
// implement ChunkSizing.
isChunkSizing()
// Type returns the variant's discriminator string, which the variants
// write to the "type" key of their JSON encoding.
Type() string
}
// ChunkSizingCharacters is the ChunkSizing variant that measures size in
// Unicode characters (default). It carries no fields.
type ChunkSizingCharacters struct{}

// isChunkSizing marks ChunkSizingCharacters as a ChunkSizing variant.
func (ChunkSizingCharacters) isChunkSizing() {}

// Type returns the discriminator string "characters".
func (ChunkSizingCharacters) Type() string {
	return "characters"
}

// MarshalJSON encodes the variant as a JSON object containing only the
// "type" discriminator.
func (c ChunkSizingCharacters) MarshalJSON() ([]byte, error) {
	payload := struct {
		Type string `json:"type"`
	}{
		Type: c.Type(),
	}
	return json.Marshal(payload)
}
// ChunkSizingTokenizer is the ChunkSizing variant that measures size in
// tokens from a HuggingFace tokenizer.
type ChunkSizingTokenizer struct {
	// HuggingFace model ID or path, e.g. "Xenova/gpt-4o", "bert-base-uncased".
	Model string `json:"model"`
	// Optional cache directory override for tokenizer files.
	// Defaults to hf-hub's standard cache (`~/.cache/huggingface/`).
	// Can also be set via `KREUZBERG_TOKENIZER_CACHE_DIR` environment variable.
	CacheDir *string `json:"cache_dir,omitempty"`
}

// isChunkSizing marks ChunkSizingTokenizer as a ChunkSizing variant.
func (ChunkSizingTokenizer) isChunkSizing() {}

// Type returns the discriminator string "tokenizer".
func (ChunkSizingTokenizer) Type() string {
	return "tokenizer"
}

// MarshalJSON encodes the variant as a JSON object with the "type"
// discriminator first, followed by "model" and, when CacheDir is non-nil,
// "cache_dir".
func (t ChunkSizingTokenizer) MarshalJSON() ([]byte, error) {
	payload := struct {
		Type     string  `json:"type"`
		Model    string  `json:"model"`
		CacheDir *string `json:"cache_dir,omitempty"`
	}{
		Type:     t.Type(),
		Model:    t.Model,
		CacheDir: t.CacheDir,
	}
	return json.Marshal(payload)
}
// UnmarshalChunkSizing decodes JSON data into the concrete ChunkSizing
// variant named by the object's "type" field.
//
// The data is decoded twice: once to read the discriminator, then again into
// the matching variant struct. It returns the JSON decoding error if either
// pass fails, and an "unknown ChunkSizing type" error when the discriminator
// matches no variant (including when it is absent).
func UnmarshalChunkSizing(data []byte) (ChunkSizing, error) {
	var tag struct {
		Type string `json:"type"`
	}
	err := json.Unmarshal(data, &tag)
	if err != nil {
		return nil, err
	}

	switch kind := tag.Type; kind {
	case "characters":
		var variant ChunkSizingCharacters
		if err = json.Unmarshal(data, &variant); err != nil {
			return nil, err
		}
		return variant, nil
	case "tokenizer":
		var variant ChunkSizingTokenizer
		if err = json.Unmarshal(data, &variant); err != nil {
			return nil, err
		}
		return variant, nil
	default:
		return nil, fmt.Errorf("unknown ChunkSizing type: %q", kind)
	}
}
// EmbeddingModelType represents the embedding model types supported by Kreuzberg.
//
// Variants: Preset, Custom, Llm, Plugin.
//
// Sealed interface — use one of EmbeddingModelTypePreset,
// EmbeddingModelTypeCustom, EmbeddingModelTypeLlm, EmbeddingModelTypePlugin.
type EmbeddingModelType interface {
// isEmbeddingModelType is unexported, so only types declared in this package
// can implement EmbeddingModelType.
isEmbeddingModelType()
// Type returns the variant's discriminator string, which the variants
// write to the "type" key of their JSON encoding.
Type() string
}
// EmbeddingModelTypePreset is the EmbeddingModelType variant that uses a
// preset model configuration (recommended).
type EmbeddingModelTypePreset struct {
	// Name is encoded under the "name" key.
	Name string `json:"name"`
}

// isEmbeddingModelType marks EmbeddingModelTypePreset as an EmbeddingModelType variant.
func (EmbeddingModelTypePreset) isEmbeddingModelType() {}

// Type returns the discriminator string "preset".
func (EmbeddingModelTypePreset) Type() string {
	return "preset"
}

// MarshalJSON encodes the variant as a JSON object with the "type"
// discriminator followed by "name".
func (p EmbeddingModelTypePreset) MarshalJSON() ([]byte, error) {
	payload := struct {
		Type string `json:"type"`
		Name string `json:"name"`
	}{
		Type: p.Type(),
		Name: p.Name,
	}
	return json.Marshal(payload)
}
// EmbeddingModelTypeCustom is the EmbeddingModelType variant that uses a
// custom ONNX model from HuggingFace.
type EmbeddingModelTypeCustom struct {
	// ModelID is encoded under the "model_id" key.
	ModelID string `json:"model_id"`
	// Dimensions is encoded under the "dimensions" key.
	Dimensions uint `json:"dimensions"`
}

// isEmbeddingModelType marks EmbeddingModelTypeCustom as an EmbeddingModelType variant.
func (EmbeddingModelTypeCustom) isEmbeddingModelType() {}

// Type returns the discriminator string "custom".
func (EmbeddingModelTypeCustom) Type() string {
	return "custom"
}

// MarshalJSON encodes the variant as a JSON object with the "type"
// discriminator followed by "model_id" and "dimensions".
func (c EmbeddingModelTypeCustom) MarshalJSON() ([]byte, error) {
	payload := struct {
		Type       string `json:"type"`
		ModelID    string `json:"model_id"`
		Dimensions uint   `json:"dimensions"`
	}{
		Type:       c.Type(),
		ModelID:    c.ModelID,
		Dimensions: c.Dimensions,
	}
	return json.Marshal(payload)
}
// EmbeddingModelTypeLlm is the EmbeddingModelType variant for a
// provider-hosted embedding model via liter-llm.
//
// The model is the one specified in the nested `LlmConfig` (e.g.,
// `"openai/text-embedding-3-small"`).
type EmbeddingModelTypeLlm struct {
	// Llm is encoded under the "llm" key.
	Llm LlmConfig `json:"llm"`
}

// isEmbeddingModelType marks EmbeddingModelTypeLlm as an EmbeddingModelType variant.
func (EmbeddingModelTypeLlm) isEmbeddingModelType() {}

// Type returns the discriminator string "llm".
func (EmbeddingModelTypeLlm) Type() string {
	return "llm"
}

// MarshalJSON encodes the variant as a JSON object with the "type"
// discriminator followed by the nested "llm" configuration.
func (l EmbeddingModelTypeLlm) MarshalJSON() ([]byte, error) {
	payload := struct {
		Type string    `json:"type"`
		Llm  LlmConfig `json:"llm"`
	}{
		Type: l.Type(),
		Llm:  l.Llm,
	}
	return json.Marshal(payload)
}
// EmbeddingModelTypePlugin is the EmbeddingModelType variant for an
// in-process embedding backend registered via the plugin system.
//
// The caller registers an [`EmbeddingBackend`](crate::plugins::EmbeddingBackend) once
// (e.g. a wrapper around an already-loaded `llama-cpp-python`, `sentence-transformers`,
// or tuned ONNX model), then references it by name in config. Kreuzberg calls back
// into the registered backend during chunking and standalone embed requests —
// no HuggingFace download, no ONNX Runtime requirement, no HTTP sidecar.
//
// When this variant is selected, only the following [`EmbeddingConfig`] fields
// apply: `normalize` (post-call L2 normalization) and `max_embed_duration_secs`
// (dispatcher timeout). Model-loading fields (`batch_size`, `cache_dir`,
// `show_download_progress`, `acceleration`) are ignored — the host owns the
// model lifecycle.
//
// Semantic chunking falls back to [`ChunkingConfig::max_characters`] when this variant
// is used, since there is no preset to look a chunk-size ceiling up against — size your
// context window via `max_characters` directly.
//
// See `register_embedding_backend`.
type EmbeddingModelTypePlugin struct {
	// Name is encoded under the "name" key.
	Name string `json:"name"`
}

// isEmbeddingModelType marks EmbeddingModelTypePlugin as an EmbeddingModelType variant.
func (EmbeddingModelTypePlugin) isEmbeddingModelType() {}

// Type returns the discriminator string "plugin".
func (EmbeddingModelTypePlugin) Type() string {
	return "plugin"
}

// MarshalJSON encodes the variant as a JSON object with the "type"
// discriminator followed by "name".
func (p EmbeddingModelTypePlugin) MarshalJSON() ([]byte, error) {
	payload := struct {
		Type string `json:"type"`
		Name string `json:"name"`
	}{
		Type: p.Type(),
		Name: p.Name,
	}
	return json.Marshal(payload)
}
// UnmarshalEmbeddingModelType decodes JSON data into the concrete
// EmbeddingModelType variant named by the object's "type" field.
//
// The data is decoded twice: once to read the discriminator, then again into
// the matching variant struct. It returns the JSON decoding error if either
// pass fails, and an "unknown EmbeddingModelType type" error when the
// discriminator matches no variant (including when it is absent).
func UnmarshalEmbeddingModelType(data []byte) (EmbeddingModelType, error) {
	var tag struct {
		Type string `json:"type"`
	}
	err := json.Unmarshal(data, &tag)
	if err != nil {
		return nil, err
	}

	switch kind := tag.Type; kind {
	case "preset":
		var variant EmbeddingModelTypePreset
		if err = json.Unmarshal(data, &variant); err != nil {
			return nil, err
		}
		return variant, nil
	case "custom":
		var variant EmbeddingModelTypeCustom
		if err = json.Unmarshal(data, &variant); err != nil {
			return nil, err
		}
		return variant, nil
	case "llm":
		var variant EmbeddingModelTypeLlm
		if err = json.Unmarshal(data, &variant); err != nil {
			return nil, err
		}
		return variant, nil
	case "plugin":
		var variant EmbeddingModelTypePlugin
		if err = json.Unmarshal(data, &variant); err != nil {
			return nil, err
		}
		return variant, nil
	default:
		return nil, fmt.Errorf("unknown EmbeddingModelType type: %q", kind)
	}
}
// CodeContentMode enumerates the code content modes below as string constants.
type CodeContentMode string
const (
// CodeContentModeChunks uses TSLP semantic chunks as content (default).
CodeContentModeChunks CodeContentMode = "chunks"
// CodeContentModeRaw uses raw source code as content.
CodeContentModeRaw CodeContentMode = "raw"
// CodeContentModeStructure emits function/class headings + docstrings (no code bodies).
CodeContentModeStructure CodeContentMode = "structure"
)
// ListType enumerates the list kinds below as string constants.
type ListType string
const (
// ListTypeBullet is bullet points (-, *, •, etc.).
ListTypeBullet ListType = "bullet"
// ListTypeNumbered is numbered lists (1., 2., etc.).
ListTypeNumbered ListType = "numbered"
// ListTypeLettered is lettered lists (a., b., A., B., etc.).
ListTypeLettered ListType = "lettered"
// ListTypeIndented is indented items.
ListTypeIndented ListType = "indented"
)
// OcrBackendType enumerates the OCR backends below as string constants.
type OcrBackendType string
const (
// OcrBackendTypeTesseract is Tesseract OCR (native Rust binding).
OcrBackendTypeTesseract OcrBackendType = "tesseract"
// OcrBackendTypeEasyOcr is EasyOCR (Python-based, via FFI).
OcrBackendTypeEasyOcr OcrBackendType = "easy_ocr"
// OcrBackendTypePaddleOcr is PaddleOCR (Python-based, via FFI).
OcrBackendTypePaddleOcr OcrBackendType = "paddle_ocr"
// OcrBackendTypeCustom is a custom/third-party OCR backend.
OcrBackendTypeCustom OcrBackendType = "custom"
)
// ProcessingStage enumerates the processing stages below as string constants.
type ProcessingStage string
const (
// ProcessingStageEarly is the early stage - foundational processing.
//
// Use for:
// - Language detection
// - Character encoding normalization
// - Entity extraction (NER)
// - Text quality scoring
ProcessingStageEarly ProcessingStage = "early"
// ProcessingStageMiddle is the middle stage - content transformation.
//
// Use for:
// - Keyword extraction
// - Token reduction
// - Text summarization
// - Semantic analysis
ProcessingStageMiddle ProcessingStage = "middle"
// ProcessingStageLate is the late stage - final enrichment.
//
// Use for:
// - Custom user hooks
// - Analytics/logging
// - Final validation
// - Output formatting
ProcessingStageLate ProcessingStage = "late"
)
// ReductionLevel enumerates the reduction levels below as string constants.
type ReductionLevel string
const (
// ReductionLevelOff is the Off variant of ReductionLevel.
ReductionLevelOff ReductionLevel = "off"
// ReductionLevelLight is the Light variant of ReductionLevel.
ReductionLevelLight ReductionLevel = "light"
// ReductionLevelModerate is the Moderate variant of ReductionLevel.
ReductionLevelModerate ReductionLevel = "moderate"
// ReductionLevelAggressive is the Aggressive variant of ReductionLevel.
ReductionLevelAggressive ReductionLevel = "aggressive"
// ReductionLevelMaximum is the Maximum variant of ReductionLevel.
ReductionLevelMaximum ReductionLevel = "maximum"
)
// PdfAnnotationType enumerates the PDF annotation kinds below as string constants.
type PdfAnnotationType string
const (
// PdfAnnotationTypeText is a sticky note / text annotation.
PdfAnnotationTypeText PdfAnnotationType = "text"
// PdfAnnotationTypeHighlight is a highlighted text region.
PdfAnnotationTypeHighlight PdfAnnotationType = "highlight"
// PdfAnnotationTypeLink is a hyperlink annotation.
PdfAnnotationTypeLink PdfAnnotationType = "link"
// PdfAnnotationTypeStamp is a rubber stamp annotation.
PdfAnnotationTypeStamp PdfAnnotationType = "stamp"
// PdfAnnotationTypeUnderline is underline text markup.
PdfAnnotationTypeUnderline PdfAnnotationType = "underline"
// PdfAnnotationTypeStrikeOut is strikeout text markup.
PdfAnnotationTypeStrikeOut PdfAnnotationType = "strike_out"
// PdfAnnotationTypeOther is any other annotation type.
PdfAnnotationTypeOther PdfAnnotationType = "other"
)
// BlockType enumerates the block kinds below as string constants.
type BlockType string
const (
// BlockTypeParagraph is the Paragraph variant of BlockType.
BlockTypeParagraph BlockType = "paragraph"
// BlockTypeHeading is the Heading variant of BlockType.
BlockTypeHeading BlockType = "heading"
// BlockTypeBlockquote is the Blockquote variant of BlockType.
BlockTypeBlockquote BlockType = "blockquote"
// BlockTypeCodeBlock is the CodeBlock variant of BlockType.
BlockTypeCodeBlock BlockType = "code_block"
// BlockTypeListItem is the ListItem variant of BlockType.
BlockTypeListItem BlockType = "list_item"
// BlockTypeOrderedList is the OrderedList variant of BlockType.
BlockTypeOrderedList BlockType = "ordered_list"
// BlockTypeBulletList is the BulletList variant of BlockType.
BlockTypeBulletList BlockType = "bullet_list"
// BlockTypeTaskList is the TaskList variant of BlockType.
BlockTypeTaskList BlockType = "task_list"
// BlockTypeDefinitionList is the DefinitionList variant of BlockType.
BlockTypeDefinitionList BlockType = "definition_list"
// BlockTypeDefinitionTerm is the DefinitionTerm variant of BlockType.
BlockTypeDefinitionTerm BlockType = "definition_term"
// BlockTypeDefinitionDescription is the DefinitionDescription variant of BlockType.
BlockTypeDefinitionDescription BlockType = "definition_description"
// BlockTypeDiv is the Div variant of BlockType.
BlockTypeDiv BlockType = "div"
// BlockTypeSection is the Section variant of BlockType.
BlockTypeSection BlockType = "section"
// BlockTypeThematicBreak is the ThematicBreak variant of BlockType.
BlockTypeThematicBreak BlockType = "thematic_break"
// BlockTypeRawBlock is the RawBlock variant of BlockType.
BlockTypeRawBlock BlockType = "raw_block"
// BlockTypeMathDisplay is the MathDisplay variant of BlockType.
BlockTypeMathDisplay BlockType = "math_display"
)
// InlineType enumerates the inline element kinds below as string constants.
type InlineType string
const (
// InlineTypeText is the Text variant of InlineType.
InlineTypeText InlineType = "text"
// InlineTypeStrong is the Strong variant of InlineType.
InlineTypeStrong InlineType = "strong"
// InlineTypeEmphasis is the Emphasis variant of InlineType.
InlineTypeEmphasis InlineType = "emphasis"
// InlineTypeHighlight is the Highlight variant of InlineType.
InlineTypeHighlight InlineType = "highlight"
// InlineTypeSubscript is the Subscript variant of InlineType.
InlineTypeSubscript InlineType = "subscript"
// InlineTypeSuperscript is the Superscript variant of InlineType.
InlineTypeSuperscript InlineType = "superscript"
// InlineTypeInsert is the Insert variant of InlineType.
InlineTypeInsert InlineType = "insert"
// InlineTypeDelete is the Delete variant of InlineType.
InlineTypeDelete InlineType = "delete"
// InlineTypeCode is the Code variant of InlineType.
InlineTypeCode InlineType = "code"
// InlineTypeLink is the Link variant of InlineType.
InlineTypeLink InlineType = "link"
// InlineTypeImage is the Image variant of InlineType.
InlineTypeImage InlineType = "image"
// InlineTypeSpan is the Span variant of InlineType.
InlineTypeSpan InlineType = "span"
// InlineTypeMath is the Math variant of InlineType.
InlineTypeMath InlineType = "math"
// InlineTypeRawInline is the RawInline variant of InlineType.
InlineTypeRawInline InlineType = "raw_inline"
// InlineTypeFootnoteRef is the FootnoteRef variant of InlineType.
InlineTypeFootnoteRef InlineType = "footnote_ref"
// InlineTypeSymbol is the Symbol variant of InlineType.
InlineTypeSymbol InlineType = "symbol"
)
// RelationshipKind enumerates the relationship kinds below as string
// constants. Each comment reads "source -> target".
type RelationshipKind string
const (
// RelationshipKindFootnoteReference is footnote marker -> footnote definition.
RelationshipKindFootnoteReference RelationshipKind = "footnote_reference"
// RelationshipKindCitationReference is citation marker -> bibliography entry.
RelationshipKindCitationReference RelationshipKind = "citation_reference"
// RelationshipKindInternalLink is internal anchor link (`#id`) -> target heading/element.
RelationshipKindInternalLink RelationshipKind = "internal_link"
// RelationshipKindCaption is caption paragraph -> figure/table it describes.
RelationshipKindCaption RelationshipKind = "caption"
// RelationshipKindLabel is label -> labeled element (HTML `<label for>`, LaTeX `\label{}`).
RelationshipKindLabel RelationshipKind = "label"
// RelationshipKindTocEntry is TOC entry -> target section.
RelationshipKindTocEntry RelationshipKind = "toc_entry"
// RelationshipKindCrossReference is a cross-reference (LaTeX `\ref{}`, DOCX cross-reference field).
RelationshipKindCrossReference RelationshipKind = "cross_reference"
)
// ContentLayer enumerates the content layers below as string constants.
type ContentLayer string
const (
// ContentLayerBody is main document body content.
ContentLayerBody ContentLayer = "body"
// ContentLayerHeader is a page/section header (running header).
ContentLayerHeader ContentLayer = "header"
// ContentLayerFooter is a page/section footer (running footer).
ContentLayerFooter ContentLayer = "footer"
// ContentLayerFootnote is footnote content.
ContentLayerFootnote ContentLayer = "footnote"
)
// NodeContent is a tagged enum for node content. Each variant carries only type-specific data.
//
// Uses `#[serde(tag = "node_type")]` to avoid "type" keyword collision in
// Go/Java/TypeScript bindings.
//
// Variants: Title, Heading, Paragraph, List, ListItem, Table, Image, Code, Quote, Formula, Footnote, Group, PageBreak, Slide, DefinitionList, DefinitionItem, Citation, Admonition, RawBlock, MetadataBlock.
//
// Sealed interface — use one of the NodeContent-prefixed variant types
// (for example NodeContentTitle or NodeContentHeading).
type NodeContent interface {
// isNodeContent is unexported, so only types declared in this package can
// implement NodeContent.
isNodeContent()
// Type returns the variant's discriminator string, which the variants
// write to the "node_type" key of their JSON encoding.
Type() string
}
// NodeContentTitle is the NodeContent variant for a document title.
type NodeContentTitle struct {
	// Text is encoded under the "text" key.
	Text string `json:"text"`
}

// isNodeContent marks NodeContentTitle as a NodeContent variant.
func (NodeContentTitle) isNodeContent() {}

// Type returns the discriminator string "title".
func (NodeContentTitle) Type() string {
	return "title"
}

// MarshalJSON encodes the variant as a JSON object with the "node_type"
// discriminator followed by "text".
func (n NodeContentTitle) MarshalJSON() ([]byte, error) {
	payload := struct {
		NodeType string `json:"node_type"`
		Text     string `json:"text"`
	}{
		NodeType: n.Type(),
		Text:     n.Text,
	}
	return json.Marshal(payload)
}
// NodeContentHeading is the NodeContent variant for a section heading with
// level (1-6).
type NodeContentHeading struct {
	// Level is encoded under the "level" key.
	Level uint8 `json:"level"`
	// Text is encoded under the "text" key.
	Text string `json:"text"`
}

// isNodeContent marks NodeContentHeading as a NodeContent variant.
func (NodeContentHeading) isNodeContent() {}

// Type returns the discriminator string "heading".
func (NodeContentHeading) Type() string {
	return "heading"
}

// MarshalJSON encodes the variant as a JSON object with the "node_type"
// discriminator followed by "level" and "text".
func (n NodeContentHeading) MarshalJSON() ([]byte, error) {
	payload := struct {
		NodeType string `json:"node_type"`
		Level    uint8  `json:"level"`
		Text     string `json:"text"`
	}{
		NodeType: n.Type(),
		Level:    n.Level,
		Text:     n.Text,
	}
	return json.Marshal(payload)
}
// NodeContentParagraph is the NodeContent variant for a body text paragraph.
type NodeContentParagraph struct {
	// Text is encoded under the "text" key.
	Text string `json:"text"`
}

// isNodeContent marks NodeContentParagraph as a NodeContent variant.
func (NodeContentParagraph) isNodeContent() {}

// Type returns the discriminator string "paragraph".
func (NodeContentParagraph) Type() string {
	return "paragraph"
}

// MarshalJSON encodes the variant as a JSON object with the "node_type"
// discriminator followed by "text".
func (n NodeContentParagraph) MarshalJSON() ([]byte, error) {
	payload := struct {
		NodeType string `json:"node_type"`
		Text     string `json:"text"`
	}{
		NodeType: n.Type(),
		Text:     n.Text,
	}
	return json.Marshal(payload)
}
// NodeContentList is the NodeContent variant for a list container —
// children are `ListItem` nodes.
type NodeContentList struct {
	// Ordered is encoded under the "ordered" key.
	Ordered bool `json:"ordered"`
}

// isNodeContent marks NodeContentList as a NodeContent variant.
func (NodeContentList) isNodeContent() {}

// Type returns the discriminator string "list".
func (NodeContentList) Type() string {
	return "list"
}

// MarshalJSON encodes the variant as a JSON object with the "node_type"
// discriminator followed by "ordered".
func (n NodeContentList) MarshalJSON() ([]byte, error) {
	payload := struct {
		NodeType string `json:"node_type"`
		Ordered  bool   `json:"ordered"`
	}{
		NodeType: n.Type(),
		Ordered:  n.Ordered,
	}
	return json.Marshal(payload)
}
// NodeContentListItem is the NodeContent variant for an individual list item.
type NodeContentListItem struct {
	// Text is encoded under the "text" key.
	Text string `json:"text"`
}

// isNodeContent marks NodeContentListItem as a NodeContent variant.
func (NodeContentListItem) isNodeContent() {}

// Type returns the discriminator string "list_item".
func (NodeContentListItem) Type() string {
	return "list_item"
}

// MarshalJSON encodes the variant as a JSON object with the "node_type"
// discriminator followed by "text".
func (n NodeContentListItem) MarshalJSON() ([]byte, error) {
	payload := struct {
		NodeType string `json:"node_type"`
		Text     string `json:"text"`
	}{
		NodeType: n.Type(),
		Text:     n.Text,
	}
	return json.Marshal(payload)
}
// NodeContentTable is the NodeContent variant for a table with a structured
// cell grid.
type NodeContentTable struct {
	// Grid is encoded under the "grid" key.
	Grid TableGrid `json:"grid"`
}

// isNodeContent marks NodeContentTable as a NodeContent variant.
func (NodeContentTable) isNodeContent() {}

// Type returns the discriminator string "table".
func (NodeContentTable) Type() string {
	return "table"
}

// MarshalJSON encodes the variant as a JSON object with the "node_type"
// discriminator followed by the nested "grid".
func (n NodeContentTable) MarshalJSON() ([]byte, error) {
	payload := struct {
		NodeType string    `json:"node_type"`
		Grid     TableGrid `json:"grid"`
	}{
		NodeType: n.Type(),
		Grid:     n.Grid,
	}
	return json.Marshal(payload)
}
// NodeContentImage is the NodeContent variant for an image reference.
// All fields are optional; nil fields are omitted from the JSON encoding.
type NodeContentImage struct {
	// Description is encoded under the "description" key when non-nil.
	Description *string `json:"description,omitempty"`
	// ImageIndex is encoded under the "image_index" key when non-nil.
	ImageIndex *uint32 `json:"image_index,omitempty"`
	// Source URL or path of the image (from `<img src="...">` or `![](src)`).
	Src *string `json:"src,omitempty"`
}

// isNodeContent marks NodeContentImage as a NodeContent variant.
func (NodeContentImage) isNodeContent() {}

// Type returns the discriminator string "image".
func (NodeContentImage) Type() string {
	return "image"
}

// MarshalJSON encodes the variant as a JSON object with the "node_type"
// discriminator followed by whichever optional fields are set.
func (n NodeContentImage) MarshalJSON() ([]byte, error) {
	payload := struct {
		NodeType    string  `json:"node_type"`
		Description *string `json:"description,omitempty"`
		ImageIndex  *uint32 `json:"image_index,omitempty"`
		Src         *string `json:"src,omitempty"`
	}{
		NodeType:    n.Type(),
		Description: n.Description,
		ImageIndex:  n.ImageIndex,
		Src:         n.Src,
	}
	return json.Marshal(payload)
}
// NodeContentCode is the NodeContent variant for a code block.
type NodeContentCode struct {
	// Text is encoded under the "text" key.
	Text string `json:"text"`
	// Language is encoded under the "language" key when non-nil.
	Language *string `json:"language,omitempty"`
}

// isNodeContent marks NodeContentCode as a NodeContent variant.
func (NodeContentCode) isNodeContent() {}

// Type returns the discriminator string "code".
func (NodeContentCode) Type() string {
	return "code"
}

// MarshalJSON encodes the variant as a JSON object with the "node_type"
// discriminator followed by "text" and, when set, "language".
func (n NodeContentCode) MarshalJSON() ([]byte, error) {
	payload := struct {
		NodeType string  `json:"node_type"`
		Text     string  `json:"text"`
		Language *string `json:"language,omitempty"`
	}{
		NodeType: n.Type(),
		Text:     n.Text,
		Language: n.Language,
	}
	return json.Marshal(payload)
}
// NodeContentQuote is the NodeContent variant for a block quote — a
// container whose children carry the quoted content. It has no fields.
type NodeContentQuote struct{}

// isNodeContent marks NodeContentQuote as a NodeContent variant.
func (NodeContentQuote) isNodeContent() {}

// Type returns the discriminator string "quote".
func (NodeContentQuote) Type() string {
	return "quote"
}

// MarshalJSON encodes the variant as a JSON object containing only the
// "node_type" discriminator.
func (n NodeContentQuote) MarshalJSON() ([]byte, error) {
	payload := struct {
		NodeType string `json:"node_type"`
	}{
		NodeType: n.Type(),
	}
	return json.Marshal(payload)
}
// NodeContentFormula is the NodeContent variant for a mathematical
// formula / equation.
type NodeContentFormula struct {
	// Text is encoded under the "text" key.
	Text string `json:"text"`
}

// isNodeContent marks NodeContentFormula as a NodeContent variant.
func (NodeContentFormula) isNodeContent() {}

// Type returns the discriminator string "formula".
func (NodeContentFormula) Type() string {
	return "formula"
}

// MarshalJSON encodes the variant as a JSON object with the "node_type"
// discriminator followed by "text".
func (n NodeContentFormula) MarshalJSON() ([]byte, error) {
	payload := struct {
		NodeType string `json:"node_type"`
		Text     string `json:"text"`
	}{
		NodeType: n.Type(),
		Text:     n.Text,
	}
	return json.Marshal(payload)
}
// NodeContentFootnote is the NodeContent variant for footnote reference content.
type NodeContentFootnote struct {
	// Text is encoded under the "text" key.
	Text string `json:"text"`
}

// isNodeContent marks NodeContentFootnote as a NodeContent variant.
func (NodeContentFootnote) isNodeContent() {}

// Type returns the discriminator string "footnote".
func (NodeContentFootnote) Type() string {
	return "footnote"
}

// MarshalJSON encodes the variant as a JSON object with the "node_type"
// discriminator followed by "text".
func (n NodeContentFootnote) MarshalJSON() ([]byte, error) {
	payload := struct {
		NodeType string `json:"node_type"`
		Text     string `json:"text"`
	}{
		NodeType: n.Type(),
		Text:     n.Text,
	}
	return json.Marshal(payload)
}
// NodeContentGroup is the NodeContent variant for a logical grouping
// container (section, key-value area).
//
// `heading_level` + `heading_text` capture the section heading directly
// rather than relying on a first-child positional convention.
type NodeContentGroup struct {
	// Label is encoded under the "label" key when non-nil.
	Label *string `json:"label,omitempty"`
	// HeadingLevel is encoded under the "heading_level" key when non-nil.
	HeadingLevel *uint8 `json:"heading_level,omitempty"`
	// HeadingText is encoded under the "heading_text" key when non-nil.
	HeadingText *string `json:"heading_text,omitempty"`
}

// isNodeContent marks NodeContentGroup as a NodeContent variant.
func (NodeContentGroup) isNodeContent() {}

// Type returns the discriminator string "group".
func (NodeContentGroup) Type() string {
	return "group"
}

// MarshalJSON encodes the variant as a JSON object with the "node_type"
// discriminator followed by whichever optional fields are set.
func (n NodeContentGroup) MarshalJSON() ([]byte, error) {
	payload := struct {
		NodeType     string  `json:"node_type"`
		Label        *string `json:"label,omitempty"`
		HeadingLevel *uint8  `json:"heading_level,omitempty"`
		HeadingText  *string `json:"heading_text,omitempty"`
	}{
		NodeType:     n.Type(),
		Label:        n.Label,
		HeadingLevel: n.HeadingLevel,
		HeadingText:  n.HeadingText,
	}
	return json.Marshal(payload)
}
// NodeContentPageBreak is the NodeContent variant for a page break marker.
// It has no fields.
type NodeContentPageBreak struct{}

// isNodeContent marks NodeContentPageBreak as a NodeContent variant.
func (NodeContentPageBreak) isNodeContent() {}

// Type returns the discriminator string "page_break".
func (NodeContentPageBreak) Type() string {
	return "page_break"
}

// MarshalJSON encodes the variant as a JSON object containing only the
// "node_type" discriminator.
func (n NodeContentPageBreak) MarshalJSON() ([]byte, error) {
	payload := struct {
		NodeType string `json:"node_type"`
	}{
		NodeType: n.Type(),
	}
	return json.Marshal(payload)
}
// NodeContentSlide is the NodeContent variant for a presentation slide
// container — children are the slide's content nodes.
type NodeContentSlide struct {
	// 1-indexed slide number.
	Number uint32 `json:"number"`
	// Title is encoded under the "title" key when non-nil.
	Title *string `json:"title,omitempty"`
}

// isNodeContent marks NodeContentSlide as a NodeContent variant.
func (NodeContentSlide) isNodeContent() {}

// Type returns the discriminator string "slide".
func (NodeContentSlide) Type() string {
	return "slide"
}

// MarshalJSON encodes the variant as a JSON object with the "node_type"
// discriminator followed by "number" and, when set, "title".
func (n NodeContentSlide) MarshalJSON() ([]byte, error) {
	payload := struct {
		NodeType string  `json:"node_type"`
		Number   uint32  `json:"number"`
		Title    *string `json:"title,omitempty"`
	}{
		NodeType: n.Type(),
		Number:   n.Number,
		Title:    n.Title,
	}
	return json.Marshal(payload)
}
// NodeContentDefinitionList is the definition-list container variant of
// NodeContent; its children are `DefinitionItem` nodes. It carries no data
// of its own.
type NodeContentDefinitionList struct{}

// isNodeContent marks NodeContentDefinitionList as a NodeContent variant.
func (NodeContentDefinitionList) isNodeContent() {}

// Type returns the discriminator value for this variant, "definition_list".
func (NodeContentDefinitionList) Type() string { return "definition_list" }

// MarshalJSON writes a JSON object holding only the "node_type" discriminator.
func (v NodeContentDefinitionList) MarshalJSON() ([]byte, error) {
	return json.Marshal(struct {
		NodeType string `json:"node_type"`
	}{v.Type()})
}
// NodeContentDefinitionItem is the definition-list entry variant of
// NodeContent, pairing a term with its definition.
type NodeContentDefinitionItem struct {
	// Term is carried under the "term" JSON key.
	Term string `json:"term"`
	// Definition is carried under the "definition" JSON key.
	Definition string `json:"definition"`
}

// isNodeContent marks NodeContentDefinitionItem as a NodeContent variant.
func (NodeContentDefinitionItem) isNodeContent() {}

// Type returns the discriminator value for this variant, "definition_item".
func (NodeContentDefinitionItem) Type() string { return "definition_item" }

// MarshalJSON writes the variant as a JSON object that starts with the
// "node_type" discriminator and continues with the variant's own fields.
func (v NodeContentDefinitionItem) MarshalJSON() ([]byte, error) {
	// fields shares the layout and tags of the variant but has no methods,
	// so embedding it cannot re-enter this MarshalJSON.
	type fields NodeContentDefinitionItem
	tagged := struct {
		NodeType string `json:"node_type"`
		fields
	}{v.Type(), fields(v)}
	return json.Marshal(tagged)
}
// NodeContentCitation is the citation / bibliographic reference variant of
// NodeContent.
type NodeContentCitation struct {
	// Key is carried under the "key" JSON key.
	Key string `json:"key"`
	// Text is carried under the "text" JSON key.
	Text string `json:"text"`
}

// isNodeContent marks NodeContentCitation as a NodeContent variant.
func (NodeContentCitation) isNodeContent() {}

// Type returns the discriminator value for this variant, "citation".
func (NodeContentCitation) Type() string { return "citation" }

// MarshalJSON writes the variant as a JSON object that starts with the
// "node_type" discriminator and continues with the variant's own fields.
func (v NodeContentCitation) MarshalJSON() ([]byte, error) {
	// fields shares the layout and tags of the variant but has no methods,
	// so embedding it cannot re-enter this MarshalJSON.
	type fields NodeContentCitation
	tagged := struct {
		NodeType string `json:"node_type"`
		fields
	}{v.Type(), fields(v)}
	return json.Marshal(tagged)
}
// NodeContentAdmonition is the admonition / callout container variant of
// NodeContent (note, warning, tip, etc.).
//
// The body of the admonition is carried by the node's children.
type NodeContentAdmonition struct {
	// Kind of admonition (e.g. "note", "warning", "tip", "danger").
	Kind string `json:"kind"`
	// Title is optional and omitted from JSON when nil.
	Title *string `json:"title,omitempty"`
}

// isNodeContent marks NodeContentAdmonition as a NodeContent variant.
func (NodeContentAdmonition) isNodeContent() {}

// Type returns the discriminator value for this variant, "admonition".
func (NodeContentAdmonition) Type() string { return "admonition" }

// MarshalJSON writes the variant as a JSON object that starts with the
// "node_type" discriminator and continues with the variant's own fields.
func (v NodeContentAdmonition) MarshalJSON() ([]byte, error) {
	// fields shares the layout and tags of the variant but has no methods,
	// so embedding it cannot re-enter this MarshalJSON.
	type fields NodeContentAdmonition
	tagged := struct {
		NodeType string `json:"node_type"`
		fields
	}{v.Type(), fields(v)}
	return json.Marshal(tagged)
}
// NodeContentRawBlock is the raw-block variant of NodeContent: content
// preserved verbatim from the source format.
//
// It is used for content that cannot be mapped to a semantic node type
// (e.g. JSX in MDX, raw LaTeX in markdown, embedded HTML).
type NodeContentRawBlock struct {
	// Format is the source format identifier (e.g. "html", "latex", "jsx").
	Format string `json:"format"`
	// Content is carried under the "content" JSON key.
	Content string `json:"content"`
}

// isNodeContent marks NodeContentRawBlock as a NodeContent variant.
func (NodeContentRawBlock) isNodeContent() {}

// Type returns the discriminator value for this variant, "raw_block".
func (NodeContentRawBlock) Type() string { return "raw_block" }

// MarshalJSON writes the variant as a JSON object that starts with the
// "node_type" discriminator and continues with the variant's own fields.
func (v NodeContentRawBlock) MarshalJSON() ([]byte, error) {
	// fields shares the layout and tags of the variant but has no methods,
	// so embedding it cannot re-enter this MarshalJSON.
	type fields NodeContentRawBlock
	tagged := struct {
		NodeType string `json:"node_type"`
		fields
	}{v.Type(), fields(v)}
	return json.Marshal(tagged)
}
// NodeContentMetadataBlock is the structured metadata block variant of
// NodeContent (email headers, YAML frontmatter, etc.).
type NodeContentMetadataBlock struct {
	// Entries is carried under the "entries" JSON key; a nil slice encodes
	// as JSON null.
	Entries [][]string `json:"entries"`
}

// isNodeContent marks NodeContentMetadataBlock as a NodeContent variant.
func (NodeContentMetadataBlock) isNodeContent() {}

// Type returns the discriminator value for this variant, "metadata_block".
func (NodeContentMetadataBlock) Type() string { return "metadata_block" }

// MarshalJSON writes the variant as a JSON object that starts with the
// "node_type" discriminator and continues with the variant's own fields.
func (v NodeContentMetadataBlock) MarshalJSON() ([]byte, error) {
	// fields shares the layout and tags of the variant but has no methods,
	// so embedding it cannot re-enter this MarshalJSON.
	type fields NodeContentMetadataBlock
	tagged := struct {
		NodeType string `json:"node_type"`
		fields
	}{v.Type(), fields(v)}
	return json.Marshal(tagged)
}
// UnmarshalNodeContent decodes a JSON object into the concrete NodeContent
// variant selected by its "node_type" discriminator.
//
// The payload is parsed twice: once to read the discriminator and once into
// the matching variant struct. A discriminator that matches no variant
// (including a missing one) yields an "unknown NodeContent type" error. Any
// decoding error is returned as-is together with a nil NodeContent.
func UnmarshalNodeContent(data []byte) (NodeContent, error) {
	var tag struct {
		NodeType string `json:"node_type"`
	}
	if err := json.Unmarshal(data, &tag); err != nil {
		return nil, err
	}

	var (
		content NodeContent
		err     error
	)
	switch tag.NodeType {
	case "title":
		var c NodeContentTitle
		err = json.Unmarshal(data, &c)
		content = c
	case "heading":
		var c NodeContentHeading
		err = json.Unmarshal(data, &c)
		content = c
	case "paragraph":
		var c NodeContentParagraph
		err = json.Unmarshal(data, &c)
		content = c
	case "list":
		var c NodeContentList
		err = json.Unmarshal(data, &c)
		content = c
	case "list_item":
		var c NodeContentListItem
		err = json.Unmarshal(data, &c)
		content = c
	case "table":
		var c NodeContentTable
		err = json.Unmarshal(data, &c)
		content = c
	case "image":
		var c NodeContentImage
		err = json.Unmarshal(data, &c)
		content = c
	case "code":
		var c NodeContentCode
		err = json.Unmarshal(data, &c)
		content = c
	case "quote":
		var c NodeContentQuote
		err = json.Unmarshal(data, &c)
		content = c
	case "formula":
		var c NodeContentFormula
		err = json.Unmarshal(data, &c)
		content = c
	case "footnote":
		var c NodeContentFootnote
		err = json.Unmarshal(data, &c)
		content = c
	case "group":
		var c NodeContentGroup
		err = json.Unmarshal(data, &c)
		content = c
	case "page_break":
		var c NodeContentPageBreak
		err = json.Unmarshal(data, &c)
		content = c
	case "slide":
		var c NodeContentSlide
		err = json.Unmarshal(data, &c)
		content = c
	case "definition_list":
		var c NodeContentDefinitionList
		err = json.Unmarshal(data, &c)
		content = c
	case "definition_item":
		var c NodeContentDefinitionItem
		err = json.Unmarshal(data, &c)
		content = c
	case "citation":
		var c NodeContentCitation
		err = json.Unmarshal(data, &c)
		content = c
	case "admonition":
		var c NodeContentAdmonition
		err = json.Unmarshal(data, &c)
		content = c
	case "raw_block":
		var c NodeContentRawBlock
		err = json.Unmarshal(data, &c)
		content = c
	case "metadata_block":
		var c NodeContentMetadataBlock
		err = json.Unmarshal(data, &c)
		content = c
	default:
		return nil, fmt.Errorf("unknown NodeContent type: %q", tag.NodeType)
	}
	// A failed decode must not leak the partially filled variant.
	if err != nil {
		return nil, err
	}
	return content, nil
}
// AnnotationKind describes the types of inline text annotations.
// Variants: Bold, Italic, Underline, Strikethrough, Code, Subscript, Superscript, Link, Highlight, Color, FontSize, Custom
//
// Sealed interface — the unexported marker method restricts implementations
// to this package. Use one of AnnotationKindBold, AnnotationKindItalic,
// AnnotationKindUnderline, AnnotationKindStrikethrough, AnnotationKindCode,
// AnnotationKindSubscript, AnnotationKindSuperscript, AnnotationKindLink,
// AnnotationKindHighlight, AnnotationKindColor, AnnotationKindFontSize or
// AnnotationKindCustom.
type AnnotationKind interface {
// isAnnotationKind is the unexported marker that seals the interface.
isAnnotationKind()
// Type returns the variant's discriminator string (the value written
// under the "annotation_type" JSON key by each variant's MarshalJSON).
Type() string
}
// AnnotationKindBold is the Bold variant of AnnotationKind. It carries no
// data of its own.
type AnnotationKindBold struct{}

// isAnnotationKind marks AnnotationKindBold as an AnnotationKind variant.
func (AnnotationKindBold) isAnnotationKind() {}

// Type returns the discriminator value for this variant, "bold".
func (AnnotationKindBold) Type() string { return "bold" }

// MarshalJSON writes a JSON object holding only the "annotation_type"
// discriminator.
func (v AnnotationKindBold) MarshalJSON() ([]byte, error) {
	return json.Marshal(struct {
		AnnotationType string `json:"annotation_type"`
	}{v.Type()})
}
// AnnotationKindItalic is the Italic variant of AnnotationKind. It carries
// no data of its own.
type AnnotationKindItalic struct{}

// isAnnotationKind marks AnnotationKindItalic as an AnnotationKind variant.
func (AnnotationKindItalic) isAnnotationKind() {}

// Type returns the discriminator value for this variant, "italic".
func (AnnotationKindItalic) Type() string { return "italic" }

// MarshalJSON writes a JSON object holding only the "annotation_type"
// discriminator.
func (v AnnotationKindItalic) MarshalJSON() ([]byte, error) {
	return json.Marshal(struct {
		AnnotationType string `json:"annotation_type"`
	}{v.Type()})
}
// AnnotationKindUnderline is the Underline variant of AnnotationKind. It
// carries no data of its own.
type AnnotationKindUnderline struct{}

// isAnnotationKind marks AnnotationKindUnderline as an AnnotationKind variant.
func (AnnotationKindUnderline) isAnnotationKind() {}

// Type returns the discriminator value for this variant, "underline".
func (AnnotationKindUnderline) Type() string { return "underline" }

// MarshalJSON writes a JSON object holding only the "annotation_type"
// discriminator.
func (v AnnotationKindUnderline) MarshalJSON() ([]byte, error) {
	return json.Marshal(struct {
		AnnotationType string `json:"annotation_type"`
	}{v.Type()})
}
// AnnotationKindStrikethrough is the Strikethrough variant of
// AnnotationKind. It carries no data of its own.
type AnnotationKindStrikethrough struct{}

// isAnnotationKind marks AnnotationKindStrikethrough as an AnnotationKind
// variant.
func (AnnotationKindStrikethrough) isAnnotationKind() {}

// Type returns the discriminator value for this variant, "strikethrough".
func (AnnotationKindStrikethrough) Type() string { return "strikethrough" }

// MarshalJSON writes a JSON object holding only the "annotation_type"
// discriminator.
func (v AnnotationKindStrikethrough) MarshalJSON() ([]byte, error) {
	return json.Marshal(struct {
		AnnotationType string `json:"annotation_type"`
	}{v.Type()})
}
// AnnotationKindCode is the Code variant of AnnotationKind. It carries no
// data of its own.
type AnnotationKindCode struct{}

// isAnnotationKind marks AnnotationKindCode as an AnnotationKind variant.
func (AnnotationKindCode) isAnnotationKind() {}

// Type returns the discriminator value for this variant, "code".
func (AnnotationKindCode) Type() string { return "code" }

// MarshalJSON writes a JSON object holding only the "annotation_type"
// discriminator.
func (v AnnotationKindCode) MarshalJSON() ([]byte, error) {
	return json.Marshal(struct {
		AnnotationType string `json:"annotation_type"`
	}{v.Type()})
}
// AnnotationKindSubscript is the Subscript variant of AnnotationKind. It
// carries no data of its own.
type AnnotationKindSubscript struct{}

// isAnnotationKind marks AnnotationKindSubscript as an AnnotationKind variant.
func (AnnotationKindSubscript) isAnnotationKind() {}

// Type returns the discriminator value for this variant, "subscript".
func (AnnotationKindSubscript) Type() string { return "subscript" }

// MarshalJSON writes a JSON object holding only the "annotation_type"
// discriminator.
func (v AnnotationKindSubscript) MarshalJSON() ([]byte, error) {
	return json.Marshal(struct {
		AnnotationType string `json:"annotation_type"`
	}{v.Type()})
}
// AnnotationKindSuperscript is the Superscript variant of AnnotationKind.
// It carries no data of its own.
type AnnotationKindSuperscript struct{}

// isAnnotationKind marks AnnotationKindSuperscript as an AnnotationKind
// variant.
func (AnnotationKindSuperscript) isAnnotationKind() {}

// Type returns the discriminator value for this variant, "superscript".
func (AnnotationKindSuperscript) Type() string { return "superscript" }

// MarshalJSON writes a JSON object holding only the "annotation_type"
// discriminator.
func (v AnnotationKindSuperscript) MarshalJSON() ([]byte, error) {
	return json.Marshal(struct {
		AnnotationType string `json:"annotation_type"`
	}{v.Type()})
}
// AnnotationKindLink is the Link variant of AnnotationKind.
type AnnotationKindLink struct {
	// URL is carried under the "url" JSON key.
	URL string `json:"url"`
	// Title is optional and omitted from JSON when nil.
	Title *string `json:"title,omitempty"`
}

// isAnnotationKind marks AnnotationKindLink as an AnnotationKind variant.
func (AnnotationKindLink) isAnnotationKind() {}

// Type returns the discriminator value for this variant, "link".
func (AnnotationKindLink) Type() string { return "link" }

// MarshalJSON writes the variant as a JSON object that starts with the
// "annotation_type" discriminator and continues with the variant's own
// fields.
func (v AnnotationKindLink) MarshalJSON() ([]byte, error) {
	// fields shares the layout and tags of the variant but has no methods,
	// so embedding it cannot re-enter this MarshalJSON.
	type fields AnnotationKindLink
	tagged := struct {
		AnnotationType string `json:"annotation_type"`
		fields
	}{v.Type(), fields(v)}
	return json.Marshal(tagged)
}
// AnnotationKindHighlight is the highlighted-text variant of AnnotationKind
// (PDF highlights, HTML `<mark>`). It carries no data of its own.
type AnnotationKindHighlight struct{}

// isAnnotationKind marks AnnotationKindHighlight as an AnnotationKind variant.
func (AnnotationKindHighlight) isAnnotationKind() {}

// Type returns the discriminator value for this variant, "highlight".
func (AnnotationKindHighlight) Type() string { return "highlight" }

// MarshalJSON writes a JSON object holding only the "annotation_type"
// discriminator.
func (v AnnotationKindHighlight) MarshalJSON() ([]byte, error) {
	return json.Marshal(struct {
		AnnotationType string `json:"annotation_type"`
	}{v.Type()})
}
// AnnotationKindColor is the text-color variant of AnnotationKind.
type AnnotationKindColor struct {
	// Value is a CSS-compatible color value, e.g. "#ff0000", "red".
	Value string `json:"value"`
}

// isAnnotationKind marks AnnotationKindColor as an AnnotationKind variant.
func (AnnotationKindColor) isAnnotationKind() {}

// Type returns the discriminator value for this variant, "color".
func (AnnotationKindColor) Type() string { return "color" }

// MarshalJSON writes the variant as a JSON object that starts with the
// "annotation_type" discriminator and continues with the variant's own
// fields.
func (v AnnotationKindColor) MarshalJSON() ([]byte, error) {
	// fields shares the layout and tags of the variant but has no methods,
	// so embedding it cannot re-enter this MarshalJSON.
	type fields AnnotationKindColor
	tagged := struct {
		AnnotationType string `json:"annotation_type"`
		fields
	}{v.Type(), fields(v)}
	return json.Marshal(tagged)
}
// AnnotationKindFontSize is the font-size variant of AnnotationKind.
type AnnotationKindFontSize struct {
	// Value is a font size with units, e.g. "12pt", "1.2em", "16px".
	Value string `json:"value"`
}

// isAnnotationKind marks AnnotationKindFontSize as an AnnotationKind variant.
func (AnnotationKindFontSize) isAnnotationKind() {}

// Type returns the discriminator value for this variant, "font_size".
func (AnnotationKindFontSize) Type() string { return "font_size" }

// MarshalJSON writes the variant as a JSON object that starts with the
// "annotation_type" discriminator and continues with the variant's own
// fields.
func (v AnnotationKindFontSize) MarshalJSON() ([]byte, error) {
	// fields shares the layout and tags of the variant but has no methods,
	// so embedding it cannot re-enter this MarshalJSON.
	type fields AnnotationKindFontSize
	tagged := struct {
		AnnotationType string `json:"annotation_type"`
		fields
	}{v.Type(), fields(v)}
	return json.Marshal(tagged)
}
// AnnotationKindCustom is the extensible variant of AnnotationKind, used for
// format-specific styling.
type AnnotationKindCustom struct {
	// Name is carried under the "name" JSON key.
	Name string `json:"name"`
	// Value is optional and omitted from JSON when nil.
	Value *string `json:"value,omitempty"`
}

// isAnnotationKind marks AnnotationKindCustom as an AnnotationKind variant.
func (AnnotationKindCustom) isAnnotationKind() {}

// Type returns the discriminator value for this variant, "custom".
func (AnnotationKindCustom) Type() string { return "custom" }

// MarshalJSON writes the variant as a JSON object that starts with the
// "annotation_type" discriminator and continues with the variant's own
// fields.
func (v AnnotationKindCustom) MarshalJSON() ([]byte, error) {
	// fields shares the layout and tags of the variant but has no methods,
	// so embedding it cannot re-enter this MarshalJSON.
	type fields AnnotationKindCustom
	tagged := struct {
		AnnotationType string `json:"annotation_type"`
		fields
	}{v.Type(), fields(v)}
	return json.Marshal(tagged)
}
// UnmarshalAnnotationKind decodes a JSON object into the concrete
// AnnotationKind variant selected by its "annotation_type" discriminator.
//
// The payload is parsed twice: once to read the discriminator and once into
// the matching variant struct. A discriminator that matches no variant
// (including a missing one) yields an "unknown AnnotationKind type" error.
// Any decoding error is returned as-is together with a nil AnnotationKind.
func UnmarshalAnnotationKind(data []byte) (AnnotationKind, error) {
	var tag struct {
		AnnotationType string `json:"annotation_type"`
	}
	if err := json.Unmarshal(data, &tag); err != nil {
		return nil, err
	}

	var (
		kind AnnotationKind
		err  error
	)
	switch tag.AnnotationType {
	case "bold":
		var k AnnotationKindBold
		err = json.Unmarshal(data, &k)
		kind = k
	case "italic":
		var k AnnotationKindItalic
		err = json.Unmarshal(data, &k)
		kind = k
	case "underline":
		var k AnnotationKindUnderline
		err = json.Unmarshal(data, &k)
		kind = k
	case "strikethrough":
		var k AnnotationKindStrikethrough
		err = json.Unmarshal(data, &k)
		kind = k
	case "code":
		var k AnnotationKindCode
		err = json.Unmarshal(data, &k)
		kind = k
	case "subscript":
		var k AnnotationKindSubscript
		err = json.Unmarshal(data, &k)
		kind = k
	case "superscript":
		var k AnnotationKindSuperscript
		err = json.Unmarshal(data, &k)
		kind = k
	case "link":
		var k AnnotationKindLink
		err = json.Unmarshal(data, &k)
		kind = k
	case "highlight":
		var k AnnotationKindHighlight
		err = json.Unmarshal(data, &k)
		kind = k
	case "color":
		var k AnnotationKindColor
		err = json.Unmarshal(data, &k)
		kind = k
	case "font_size":
		var k AnnotationKindFontSize
		err = json.Unmarshal(data, &k)
		kind = k
	case "custom":
		var k AnnotationKindCustom
		err = json.Unmarshal(data, &k)
		kind = k
	default:
		return nil, fmt.Errorf("unknown AnnotationKind type: %q", tag.AnnotationType)
	}
	// A failed decode must not leak the partially filled variant.
	if err != nil {
		return nil, err
	}
	return kind, nil
}
// ExtractionMethod is a string-backed enumeration type; its values are the
// ExtractionMethod* constants declared below.
type ExtractionMethod string
const (
// ExtractionMethodNative is the Native variant of ExtractionMethod.
ExtractionMethodNative ExtractionMethod = "native"
// ExtractionMethodOcr is the Ocr variant of ExtractionMethod.
ExtractionMethodOcr ExtractionMethod = "ocr"
// ExtractionMethodMixed is the Mixed variant of ExtractionMethod.
ExtractionMethodMixed ExtractionMethod = "mixed"
)
// ChunkType is a string-backed enumeration type; its values are the
// ChunkType* constants declared below.
type ChunkType string
const (
// ChunkTypeHeading marks a section heading or document title.
ChunkTypeHeading ChunkType = "heading"
// ChunkTypePartyList marks a party list: names, addresses, and signatories.
ChunkTypePartyList ChunkType = "party_list"
// ChunkTypeDefinitions marks a definition clause ("X means…", "X shall mean…").
ChunkTypeDefinitions ChunkType = "definitions"
// ChunkTypeOperativeClause marks an operative clause containing legal/contractual action verbs.
ChunkTypeOperativeClause ChunkType = "operative_clause"
// ChunkTypeSignatureBlock marks a signature block with signatures, names, and dates.
ChunkTypeSignatureBlock ChunkType = "signature_block"
// ChunkTypeSchedule marks a schedule, annex, appendix, or exhibit section.
ChunkTypeSchedule ChunkType = "schedule"
// ChunkTypeTableLike marks table-like content with aligned columns or repeated patterns.
ChunkTypeTableLike ChunkType = "table_like"
// ChunkTypeFormula marks a mathematical formula or equation.
ChunkTypeFormula ChunkType = "formula"
// ChunkTypeCodeBlock marks a code block or preformatted content.
ChunkTypeCodeBlock ChunkType = "code_block"
// ChunkTypeImage marks embedded or referenced image content.
ChunkTypeImage ChunkType = "image"
// ChunkTypeOrgChart marks an organizational chart or hierarchy diagram.
ChunkTypeOrgChart ChunkType = "org_chart"
// ChunkTypeDiagram marks a diagram, figure, or visual illustration.
ChunkTypeDiagram ChunkType = "diagram"
// ChunkTypeUnknown marks unclassified or mixed content.
ChunkTypeUnknown ChunkType = "unknown"
)
// ImageKind is a string-backed enumeration type; its values are the
// ImageKind* constants declared below.
type ImageKind string
const (
// ImageKindPhotograph is a photographic image (natural scene, photograph).
ImageKindPhotograph ImageKind = "photograph"
// ImageKindDiagram is a technical or schematic diagram.
ImageKindDiagram ImageKind = "diagram"
// ImageKindChart is a chart, graph, or plot.
ImageKindChart ImageKind = "chart"
// ImageKindDrawing is a freehand or technical drawing.
ImageKindDrawing ImageKind = "drawing"
// ImageKindTextBlock is a text-heavy image (scanned text, document).
ImageKindTextBlock ImageKind = "text_block"
// ImageKindDecoration is a decorative element or border.
ImageKindDecoration ImageKind = "decoration"
// ImageKindLogo is a logo or brand mark.
ImageKindLogo ImageKind = "logo"
// ImageKindIcon is a small icon.
ImageKindIcon ImageKind = "icon"
// ImageKindTileFragment is a fragment of a larger tiled image (tile of a technical drawing).
ImageKindTileFragment ImageKind = "tile_fragment"
// ImageKindMask is a mask or transparency map.
ImageKindMask ImageKind = "mask"
// ImageKindPageRaster is a full-page render produced during OCR preprocessing; used as a citation thumbnail.
ImageKindPageRaster ImageKind = "page_raster"
// ImageKindUnknown means the image could not be classified with reasonable confidence.
ImageKindUnknown ImageKind = "unknown"
)
// ResultFormat is a string-backed enumeration type; its values are the
// ResultFormat* constants declared below.
type ResultFormat string
const (
// ResultFormatUnified is the unified format with all content in the `content` field.
ResultFormatUnified ResultFormat = "unified"
// ResultFormatElementBased is the element-based format with semantic element extraction.
ResultFormatElementBased ResultFormat = "element_based"
)
// ElementType is a string-backed enumeration type; its values are the
// ElementType* constants declared below.
type ElementType string
const (
// ElementTypeTitle is a document title.
ElementTypeTitle ElementType = "title"
// ElementTypeNarrativeText is the main narrative text body.
ElementTypeNarrativeText ElementType = "narrative_text"
// ElementTypeHeading is a section heading.
ElementTypeHeading ElementType = "heading"
// ElementTypeListItem is a list item (bullet, numbered, etc.).
ElementTypeListItem ElementType = "list_item"
// ElementTypeTable is a table element.
ElementTypeTable ElementType = "table"
// ElementTypeImage is an image element.
ElementTypeImage ElementType = "image"
// ElementTypePageBreak is a page break marker.
ElementTypePageBreak ElementType = "page_break"
// ElementTypeCodeBlock is a code block.
ElementTypeCodeBlock ElementType = "code_block"
// ElementTypeBlockQuote is a block quote.
ElementTypeBlockQuote ElementType = "block_quote"
// ElementTypeFooter is footer text.
ElementTypeFooter ElementType = "footer"
// ElementTypeHeader is header text.
ElementTypeHeader ElementType = "header"
)
// FormatMetadata holds format-specific metadata (discriminated union).
//
// Only one format type can exist per extraction result. This provides
// type-safe, clean metadata without nested optionals.
// Variants: Pdf, Docx, Excel, Email, Pptx, Archive, Image, Xml, Text, Html, Ocr, Csv, Bibtex, Citation, FictionBook, Dbf, Jats, Epub, Pst, Code
//
// FormatType is the discriminator; the pointer field matching it carries the
// variant payload and the remaining pointers are expected to be nil.
//
// NOTE(review): the variant list above names "Code", but this struct has no
// corresponding field — confirm with the generator whether that variant is
// intentionally unsupported in this binding.
type FormatMetadata struct {
// FormatType is the discriminator string (JSON key "format_type").
FormatType string `json:"format_type"`
Pdf *PdfMetadata `json:"pdf,omitempty"`
Docx *DocxMetadata `json:"docx,omitempty"`
Excel *ExcelMetadata `json:"excel,omitempty"`
Email *EmailMetadata `json:"email,omitempty"`
Pptx *PptxMetadata `json:"pptx,omitempty"`
Archive *ArchiveMetadata `json:"archive,omitempty"`
Image *ImageMetadata `json:"image,omitempty"`
XML *XMLMetadata `json:"xml,omitempty"`
Text *TextMetadata `json:"text,omitempty"`
HTML *HTMLMetadata `json:"html,omitempty"`
Ocr *OcrMetadata `json:"ocr,omitempty"`
Csv *CsvMetadata `json:"csv,omitempty"`
Bibtex *BibtexMetadata `json:"bibtex,omitempty"`
Citation *CitationMetadata `json:"citation,omitempty"`
FictionBook *FictionBookMetadata `json:"fiction_book,omitempty"`
Dbf *DbfMetadata `json:"dbf,omitempty"`
Jats *JatsMetadata `json:"jats,omitempty"`
Epub *EpubMetadata `json:"epub,omitempty"`
Pst *PstMetadata `json:"pst,omitempty"`
}
// MarshalJSON encodes the tagged union as one flat JSON object: the keys of
// the populated variant plus the "format_type" discriminator.
//
// The variant is chosen by FormatType. When FormatType names no handled
// variant, or the matching variant pointer is nil, the output is just
// {"format_type": <FormatType>}.
func (t FormatMetadata) MarshalJSON() ([]byte, error) {
	// Pick the payload named by the discriminator. The nil checks stay inside
	// each case: a typed nil pointer stored in an interface compares non-nil
	// and would defeat the fallback below.
	var payload interface{}
	switch t.FormatType {
	case "pdf":
		if t.Pdf != nil {
			payload = t.Pdf
		}
	case "docx":
		if t.Docx != nil {
			payload = t.Docx
		}
	case "excel":
		if t.Excel != nil {
			payload = t.Excel
		}
	case "email":
		if t.Email != nil {
			payload = t.Email
		}
	case "pptx":
		if t.Pptx != nil {
			payload = t.Pptx
		}
	case "archive":
		if t.Archive != nil {
			payload = t.Archive
		}
	case "image":
		if t.Image != nil {
			payload = t.Image
		}
	case "xml":
		if t.XML != nil {
			payload = t.XML
		}
	case "text":
		if t.Text != nil {
			payload = t.Text
		}
	case "html":
		if t.HTML != nil {
			payload = t.HTML
		}
	case "ocr":
		if t.Ocr != nil {
			payload = t.Ocr
		}
	case "csv":
		if t.Csv != nil {
			payload = t.Csv
		}
	case "bibtex":
		if t.Bibtex != nil {
			payload = t.Bibtex
		}
	case "citation":
		if t.Citation != nil {
			payload = t.Citation
		}
	case "fiction_book":
		if t.FictionBook != nil {
			payload = t.FictionBook
		}
	case "dbf":
		if t.Dbf != nil {
			payload = t.Dbf
		}
	case "jats":
		if t.Jats != nil {
			payload = t.Jats
		}
	case "epub":
		if t.Epub != nil {
			payload = t.Epub
		}
	case "pst":
		if t.Pst != nil {
			payload = t.Pst
		}
	}
	if payload == nil {
		// Fallback: return just the tag
		return json.Marshal(map[string]string{"format_type": t.FormatType})
	}

	encoded, err := json.Marshal(payload)
	if err != nil {
		return nil, err
	}
	// Round-trip through a key/value map so the discriminator can be added
	// next to (or over) the variant's own keys.
	var fields map[string]json.RawMessage
	if err := json.Unmarshal(encoded, &fields); err != nil {
		return nil, err
	}
	// Reaching this point means FormatType is one of the literal tags above,
	// so quoting it directly produces the same bytes as a hard-coded literal.
	fields["format_type"] = json.RawMessage(`"` + t.FormatType + `"`)
	return json.Marshal(fields)
}
// UnmarshalJSON decodes a tagged union by reading the "format_type" tag
// first and then decoding the same payload into the matching variant.
//
// The receiver is reset before it is populated, so decoding into a reused
// FormatMetadata never leaves a variant pointer from an earlier decode set
// next to the new one. A tag that matches no handled variant is not an
// error: only FormatType is recorded and every variant pointer stays nil.
//
// It returns any error reported by encoding/json while probing the tag or
// decoding the variant.
func (t *FormatMetadata) UnmarshalJSON(data []byte) error {
	// Probe for the tag first
	var probe struct {
		FormatType string `json:"format_type"`
	}
	if err := json.Unmarshal(data, &probe); err != nil {
		return err
	}
	// Start from a clean value: exactly one variant may be set at a time.
	*t = FormatMetadata{FormatType: probe.FormatType}
	switch probe.FormatType {
	case "pdf":
		t.Pdf = &PdfMetadata{}
		return json.Unmarshal(data, t.Pdf)
	case "docx":
		t.Docx = &DocxMetadata{}
		return json.Unmarshal(data, t.Docx)
	case "excel":
		t.Excel = &ExcelMetadata{}
		return json.Unmarshal(data, t.Excel)
	case "email":
		t.Email = &EmailMetadata{}
		return json.Unmarshal(data, t.Email)
	case "pptx":
		t.Pptx = &PptxMetadata{}
		return json.Unmarshal(data, t.Pptx)
	case "archive":
		t.Archive = &ArchiveMetadata{}
		return json.Unmarshal(data, t.Archive)
	case "image":
		t.Image = &ImageMetadata{}
		return json.Unmarshal(data, t.Image)
	case "xml":
		t.XML = &XMLMetadata{}
		return json.Unmarshal(data, t.XML)
	case "text":
		t.Text = &TextMetadata{}
		return json.Unmarshal(data, t.Text)
	case "html":
		t.HTML = &HTMLMetadata{}
		return json.Unmarshal(data, t.HTML)
	case "ocr":
		t.Ocr = &OcrMetadata{}
		return json.Unmarshal(data, t.Ocr)
	case "csv":
		t.Csv = &CsvMetadata{}
		return json.Unmarshal(data, t.Csv)
	case "bibtex":
		t.Bibtex = &BibtexMetadata{}
		return json.Unmarshal(data, t.Bibtex)
	case "citation":
		t.Citation = &CitationMetadata{}
		return json.Unmarshal(data, t.Citation)
	case "fiction_book":
		t.FictionBook = &FictionBookMetadata{}
		return json.Unmarshal(data, t.FictionBook)
	case "dbf":
		t.Dbf = &DbfMetadata{}
		return json.Unmarshal(data, t.Dbf)
	case "jats":
		t.Jats = &JatsMetadata{}
		return json.Unmarshal(data, t.Jats)
	case "epub":
		t.Epub = &EpubMetadata{}
		return json.Unmarshal(data, t.Epub)
	case "pst":
		t.Pst = &PstMetadata{}
		return json.Unmarshal(data, t.Pst)
	}
	return nil
}
// TextDirection is a string-backed enumeration type; its values are the
// TextDirection* constants declared below.
type TextDirection string
const (
// TextDirectionLeftToRight is the left-to-right text direction.
TextDirectionLeftToRight TextDirection = "ltr"
// TextDirectionRightToLeft is the right-to-left text direction.
TextDirectionRightToLeft TextDirection = "rtl"
// TextDirectionAuto is automatic text direction detection.
TextDirectionAuto TextDirection = "auto"
)
// LinkType is a string-backed enumeration type; its values are the
// LinkType* constants declared below.
type LinkType string
const (
// LinkTypeAnchor is an anchor link (#section).
LinkTypeAnchor LinkType = "anchor"
// LinkTypeInternal is an internal link (same domain).
LinkTypeInternal LinkType = "internal"
// LinkTypeExternal is an external link (different domain).
LinkTypeExternal LinkType = "external"
// LinkTypeEmail is an email link (mailto:).
LinkTypeEmail LinkType = "email"
// LinkTypePhone is a phone link (tel:).
LinkTypePhone LinkType = "phone"
// LinkTypeOther is any other link type.
LinkTypeOther LinkType = "other"
)
// ImageType is a string-backed enumeration type; its values are the
// ImageType* constants declared below.
type ImageType string
const (
// ImageTypeDataURI is a data URI image.
ImageTypeDataURI ImageType = "data-uri"
// ImageTypeInlineSvg is an inline SVG.
ImageTypeInlineSvg ImageType = "inline-svg"
// ImageTypeExternal is an external image URL.
ImageTypeExternal ImageType = "external"
// ImageTypeRelative is a relative path image.
ImageTypeRelative ImageType = "relative"
)
// StructuredDataType is a string-backed enumeration type; its values are the
// StructuredDataType* constants declared below.
type StructuredDataType string
const (
// StructuredDataTypeJSONLd is JSON-LD structured data.
StructuredDataTypeJSONLd StructuredDataType = "json-ld"
// StructuredDataTypeMicrodata is microdata.
StructuredDataTypeMicrodata StructuredDataType = "microdata"
// StructuredDataTypeRdFa is RDFa.
StructuredDataTypeRdFa StructuredDataType = "rdfa"
)
// OcrBoundingGeometry is the bounding geometry for an OCR element.
//
// Supports both axis-aligned rectangles (from Tesseract) and 4-point quadrilaterals
// (from PaddleOCR and rotated text detection).
// Variants: Rectangle, Quadrilateral
// Sealed interface — the unexported marker method restricts implementations
// to this package. Use one of OcrBoundingGeometryRectangle,
// OcrBoundingGeometryQuadrilateral.
type OcrBoundingGeometry interface {
// isOcrBoundingGeometry is the unexported marker that seals the interface.
isOcrBoundingGeometry()
// Type returns the variant's discriminator string (the value written under
// the "type" JSON key by each variant's MarshalJSON).
Type() string
}
// OcrBoundingGeometryRectangle is the axis-aligned bounding box variant of
// OcrBoundingGeometry (typical for Tesseract output).
type OcrBoundingGeometryRectangle struct {
	// Left x-coordinate in pixels
	Left uint32 `json:"left"`
	// Top y-coordinate in pixels
	Top uint32 `json:"top"`
	// Width in pixels
	Width uint32 `json:"width"`
	// Height in pixels
	Height uint32 `json:"height"`
}

// isOcrBoundingGeometry marks OcrBoundingGeometryRectangle as an
// OcrBoundingGeometry variant.
func (OcrBoundingGeometryRectangle) isOcrBoundingGeometry() {}

// Type returns the discriminator value for this variant, "rectangle".
func (OcrBoundingGeometryRectangle) Type() string { return "rectangle" }

// MarshalJSON writes the variant as a JSON object that starts with the
// "type" discriminator and continues with the rectangle's own fields.
func (v OcrBoundingGeometryRectangle) MarshalJSON() ([]byte, error) {
	// fields shares the layout and tags of the variant but has no methods,
	// so embedding it cannot re-enter this MarshalJSON.
	type fields OcrBoundingGeometryRectangle
	tagged := struct {
		Type string `json:"type"`
		fields
	}{v.Type(), fields(v)}
	return json.Marshal(tagged)
}
// OcrBoundingGeometryQuadrilateral is the 4-point quadrilateral variant of
// OcrBoundingGeometry, for rotated/skewed text (PaddleOCR).
//
// Points are in clockwise order starting from top-left:
// `[top_left, top_right, bottom_right, bottom_left]`
type OcrBoundingGeometryQuadrilateral struct {
	// Four corner points as `[(x, y), ...]` in clockwise order
	//
	// NOTE(review): the field is a plain string although it is described as
	// a list of points — confirm the wire representation with the generator.
	Points string `json:"points"`
}

// isOcrBoundingGeometry marks OcrBoundingGeometryQuadrilateral as an
// OcrBoundingGeometry variant.
func (OcrBoundingGeometryQuadrilateral) isOcrBoundingGeometry() {}

// Type returns the discriminator value for this variant, "quadrilateral".
func (OcrBoundingGeometryQuadrilateral) Type() string { return "quadrilateral" }

// MarshalJSON writes the variant as a JSON object that starts with the
// "type" discriminator and continues with the "points" field.
func (v OcrBoundingGeometryQuadrilateral) MarshalJSON() ([]byte, error) {
	// fields shares the layout and tags of the variant but has no methods,
	// so embedding it cannot re-enter this MarshalJSON.
	type fields OcrBoundingGeometryQuadrilateral
	tagged := struct {
		Type string `json:"type"`
		fields
	}{v.Type(), fields(v)}
	return json.Marshal(tagged)
}
// UnmarshalOcrBoundingGeometry decodes a JSON object into the concrete
// OcrBoundingGeometry variant selected by its "type" discriminator.
//
// The payload is parsed twice: once to read the discriminator and once into
// the matching variant struct. A discriminator that matches no variant
// (including a missing one) yields an "unknown OcrBoundingGeometry type"
// error. Any decoding error is returned as-is together with a nil
// OcrBoundingGeometry.
func UnmarshalOcrBoundingGeometry(data []byte) (OcrBoundingGeometry, error) {
	var tag struct {
		Type string `json:"type"`
	}
	if err := json.Unmarshal(data, &tag); err != nil {
		return nil, err
	}

	var (
		geometry OcrBoundingGeometry
		err      error
	)
	switch tag.Type {
	case "rectangle":
		var g OcrBoundingGeometryRectangle
		err = json.Unmarshal(data, &g)
		geometry = g
	case "quadrilateral":
		var g OcrBoundingGeometryQuadrilateral
		err = json.Unmarshal(data, &g)
		geometry = g
	default:
		return nil, fmt.Errorf("unknown OcrBoundingGeometry type: %q", tag.Type)
	}
	// A failed decode must not leak the partially filled variant.
	if err != nil {
		return nil, err
	}
	return geometry, nil
}
// OcrElementLevel is a string-valued enumeration of the granularity of an OCR
// element: word, line, block, or page.
type OcrElementLevel string
const (
// OcrElementLevelWord is an individual word.
OcrElementLevelWord OcrElementLevel = "word"
// OcrElementLevelLine is a line of text (default for PaddleOCR).
OcrElementLevelLine OcrElementLevel = "line"
// OcrElementLevelBlock is a paragraph or text block.
OcrElementLevelBlock OcrElementLevel = "block"
// OcrElementLevelPage is a page-level element.
OcrElementLevelPage OcrElementLevel = "page"
)
// PageUnitType is a string-valued enumeration of the kind of unit a document is
// divided into: page, slide, or sheet.
type PageUnitType string
const (
// PageUnitTypePage is a standard document page (PDF, DOCX, images).
PageUnitTypePage PageUnitType = "page"
// PageUnitTypeSlide is a presentation slide (PPTX, ODP).
PageUnitTypeSlide PageUnitType = "slide"
// PageUnitTypeSheet is a spreadsheet sheet (XLSX, ODS).
PageUnitTypeSheet PageUnitType = "sheet"
)
// DiffLine is a single line in a unified-diff hunk, represented in Go as a string.
//
// The remainder of this comment is carried over from the underlying Rust
// library and describes its crate layout, not this Go package:
//
// Defined here (rather than only in `crate::diff`) so `RevisionDelta` can
// reference it unconditionally, without requiring the `diff` Cargo feature.
// `crate::diff` re-exports this type verbatim.
type DiffLine string
// RevisionKind is a string-valued enumeration of the kind of change a revision
// records: insertion, deletion, format change, or comment.
type RevisionKind string
const (
// RevisionKindInsertion means text or content was inserted.
RevisionKindInsertion RevisionKind = "insertion"
// RevisionKindDeletion means text or content was deleted.
RevisionKindDeletion RevisionKind = "deletion"
// RevisionKindFormatChange means run-level formatting (font, size, colour, …) was changed.
RevisionKindFormatChange RevisionKind = "format_change"
// RevisionKindComment is a reviewer comment or annotation.
RevisionKindComment RevisionKind = "comment"
)
// RevisionAnchor is a best-effort document location for a revision.
// Variants: Paragraph, TableCell, Page, Slide, Sheet
// Sealed interface — use one of RevisionAnchorParagraph, RevisionAnchorTableCell,
// RevisionAnchorPage, RevisionAnchorSlide, RevisionAnchorSheet.
type RevisionAnchor interface {
// isRevisionAnchor is an unexported marker method: because it cannot be
// implemented outside this package, the set of variants stays closed.
isRevisionAnchor()
// Type returns the variant's discriminator string. The variants' MarshalJSON
// methods write this value to the JSON "type" field.
Type() string
}
// RevisionAnchorParagraph anchors a revision to a body paragraph, identified
// by its zero-based index in the document flow.
//
// It encodes to JSON as a flat object whose "type" field is "paragraph".
type RevisionAnchorParagraph struct {
	// Index is the zero-based index of the paragraph in document order.
	Index uint `json:"index"`
}

// isRevisionAnchor marks the type as a revision-anchor variant.
func (RevisionAnchorParagraph) isRevisionAnchor() {}

// Type returns the JSON discriminator of this variant, "paragraph".
func (RevisionAnchorParagraph) Type() string { return "paragraph" }

// MarshalJSON encodes the anchor as a JSON object that starts with the "type"
// discriminator and continues with the struct's own fields.
func (v RevisionAnchorParagraph) MarshalJSON() ([]byte, error) {
	// fields shares the layout and tags of the variant but none of its
	// methods, so encoding it cannot recurse back into MarshalJSON.
	type fields RevisionAnchorParagraph
	tagged := struct {
		Type string `json:"type"`
		fields
	}{
		Type:   v.Type(),
		fields: fields(v),
	}
	return json.Marshal(tagged)
}
// RevisionAnchorTableCell anchors a revision to a cell inside a table.
//
// It encodes to JSON as a flat object whose "type" field is "table_cell".
type RevisionAnchorTableCell struct {
	// Row is the zero-based row index within the table.
	Row uint `json:"row"`
	// Col is the zero-based column index within the table.
	Col uint `json:"col"`
	// TableIndex is the zero-based index of the table in document order.
	TableIndex uint `json:"table_index"`
}

// isRevisionAnchor marks the type as a revision-anchor variant.
func (RevisionAnchorTableCell) isRevisionAnchor() {}

// Type returns the JSON discriminator of this variant, "table_cell".
func (RevisionAnchorTableCell) Type() string { return "table_cell" }

// MarshalJSON encodes the anchor as a JSON object that starts with the "type"
// discriminator and continues with the struct's own fields.
func (v RevisionAnchorTableCell) MarshalJSON() ([]byte, error) {
	// fields shares the layout and tags of the variant but none of its
	// methods, so encoding it cannot recurse back into MarshalJSON.
	type fields RevisionAnchorTableCell
	tagged := struct {
		Type string `json:"type"`
		fields
	}{
		Type:   v.Type(),
		fields: fields(v),
	}
	return json.Marshal(tagged)
}
// RevisionAnchorPage anchors a revision to a page, identified by its
// zero-based index.
//
// It encodes to JSON as a flat object whose "type" field is "page".
type RevisionAnchorPage struct {
	// Index is the zero-based page index.
	Index uint `json:"index"`
}

// isRevisionAnchor marks the type as a revision-anchor variant.
func (RevisionAnchorPage) isRevisionAnchor() {}

// Type returns the JSON discriminator of this variant, "page".
func (RevisionAnchorPage) Type() string { return "page" }

// MarshalJSON encodes the anchor as a JSON object that starts with the "type"
// discriminator and continues with the struct's own fields.
func (v RevisionAnchorPage) MarshalJSON() ([]byte, error) {
	// fields shares the layout and tags of the variant but none of its
	// methods, so encoding it cannot recurse back into MarshalJSON.
	type fields RevisionAnchorPage
	tagged := struct {
		Type string `json:"type"`
		fields
	}{
		Type:   v.Type(),
		fields: fields(v),
	}
	return json.Marshal(tagged)
}
// RevisionAnchorSlide anchors a revision to a presentation slide, identified
// by its zero-based index.
//
// It encodes to JSON as a flat object whose "type" field is "slide".
type RevisionAnchorSlide struct {
	// Index is the zero-based slide index.
	Index uint `json:"index"`
}

// isRevisionAnchor marks the type as a revision-anchor variant.
func (RevisionAnchorSlide) isRevisionAnchor() {}

// Type returns the JSON discriminator of this variant, "slide".
func (RevisionAnchorSlide) Type() string { return "slide" }

// MarshalJSON encodes the anchor as a JSON object that starts with the "type"
// discriminator and continues with the struct's own fields.
func (v RevisionAnchorSlide) MarshalJSON() ([]byte, error) {
	// fields shares the layout and tags of the variant but none of its
	// methods, so encoding it cannot recurse back into MarshalJSON.
	type fields RevisionAnchorSlide
	tagged := struct {
		Type string `json:"type"`
		fields
	}{
		Type:   v.Type(),
		fields: fields(v),
	}
	return json.Marshal(tagged)
}
// RevisionAnchorSheet anchors a revision to a spreadsheet cell or range,
// identified by sheet index and optional name.
//
// It encodes to JSON as a flat object whose "type" field is "sheet"; the
// "name" key is left out when Name is nil.
type RevisionAnchorSheet struct {
	// Index is the zero-based sheet index.
	Index uint `json:"index"`
	// Name is the sheet display name when available.
	Name *string `json:"name,omitempty"`
}

// isRevisionAnchor marks the type as a revision-anchor variant.
func (RevisionAnchorSheet) isRevisionAnchor() {}

// Type returns the JSON discriminator of this variant, "sheet".
func (RevisionAnchorSheet) Type() string { return "sheet" }

// MarshalJSON encodes the anchor as a JSON object that starts with the "type"
// discriminator and continues with the struct's own fields.
func (v RevisionAnchorSheet) MarshalJSON() ([]byte, error) {
	// fields shares the layout and tags of the variant but none of its
	// methods, so encoding it cannot recurse back into MarshalJSON.
	type fields RevisionAnchorSheet
	tagged := struct {
		Type string `json:"type"`
		fields
	}{
		Type:   v.Type(),
		fields: fields(v),
	}
	return json.Marshal(tagged)
}
// UnmarshalRevisionAnchor decodes JSON data into the concrete RevisionAnchor
// variant named by its "type" field.
//
// The input is decoded twice: once to read the discriminator and once into the
// selected variant. A decoding failure is returned unchanged; a discriminator
// that matches no variant (including a missing one) yields an
// "unknown RevisionAnchor type" error. The result is nil whenever an error is
// returned.
func UnmarshalRevisionAnchor(data []byte) (RevisionAnchor, error) {
	var tag struct {
		Type string `json:"type"`
	}
	if err := json.Unmarshal(data, &tag); err != nil {
		return nil, err
	}

	var (
		anchor RevisionAnchor
		err    error
	)
	switch tag.Type {
	case "paragraph":
		var paragraph RevisionAnchorParagraph
		err = json.Unmarshal(data, &paragraph)
		anchor = paragraph
	case "table_cell":
		var cell RevisionAnchorTableCell
		err = json.Unmarshal(data, &cell)
		anchor = cell
	case "page":
		var page RevisionAnchorPage
		err = json.Unmarshal(data, &page)
		anchor = page
	case "slide":
		var slide RevisionAnchorSlide
		err = json.Unmarshal(data, &slide)
		anchor = slide
	case "sheet":
		var sheet RevisionAnchorSheet
		err = json.Unmarshal(data, &sheet)
		anchor = sheet
	default:
		return nil, fmt.Errorf("unknown RevisionAnchor type: %q", tag.Type)
	}
	if err != nil {
		// Return a literal nil so callers never see a partially decoded variant.
		return nil, err
	}
	return anchor, nil
}
// URIKind is a string-valued enumeration that classifies what a URI refers to
// (hyperlink, image, anchor, citation, reference, or email).
type URIKind string
const (
// URIKindHyperlink is a clickable hyperlink (web URL, file link).
URIKindHyperlink URIKind = "hyperlink"
// URIKindImage is an image or media resource reference.
URIKindImage URIKind = "image"
// URIKindAnchor is an internal anchor or cross-reference target.
URIKindAnchor URIKind = "anchor"
// URIKindCitation is a citation or bibliographic reference (DOI, academic ref).
URIKindCitation URIKind = "citation"
// URIKindReference is a general reference (e.g. `\ref{}` in LaTeX, `:ref:` in RST).
URIKindReference URIKind = "reference"
// URIKindEmail is an email address (`mailto:` link or bare email).
URIKindEmail URIKind = "email"
)
// KeywordAlgorithm is a string-valued enumeration of keyword extraction algorithms.
type KeywordAlgorithm string
const (
// KeywordAlgorithmYake is YAKE (Yet Another Keyword Extractor), a statistical approach.
KeywordAlgorithmYake KeywordAlgorithm = "yake"
// KeywordAlgorithmRake is RAKE (Rapid Automatic Keyword Extraction), which is co-occurrence based.
KeywordAlgorithmRake KeywordAlgorithm = "rake"
)
// PSMMode is a string-valued enumeration; each constant's value is the
// snake_case form of its variant name.
//
// NOTE(review): the variant names look like Tesseract page segmentation modes
// (PSM) — confirm against the OCR configuration that consumes this type.
type PSMMode string
const (
// PSMModeOsdOnly is the OsdOnly variant of PSMMode.
PSMModeOsdOnly PSMMode = "osd_only"
// PSMModeAutoOsd is the AutoOsd variant of PSMMode.
PSMModeAutoOsd PSMMode = "auto_osd"
// PSMModeAutoOnly is the AutoOnly variant of PSMMode.
PSMModeAutoOnly PSMMode = "auto_only"
// PSMModeAuto is the Auto variant of PSMMode.
PSMModeAuto PSMMode = "auto"
// PSMModeSingleColumn is the SingleColumn variant of PSMMode.
PSMModeSingleColumn PSMMode = "single_column"
// PSMModeSingleBlockVertical is the SingleBlockVertical variant of PSMMode.
PSMModeSingleBlockVertical PSMMode = "single_block_vertical"
// PSMModeSingleBlock is the SingleBlock variant of PSMMode.
PSMModeSingleBlock PSMMode = "single_block"
// PSMModeSingleLine is the SingleLine variant of PSMMode.
PSMModeSingleLine PSMMode = "single_line"
// PSMModeSingleWord is the SingleWord variant of PSMMode.
PSMModeSingleWord PSMMode = "single_word"
// PSMModeCircleWord is the CircleWord variant of PSMMode.
PSMModeCircleWord PSMMode = "circle_word"
// PSMModeSingleChar is the SingleChar variant of PSMMode.
PSMModeSingleChar PSMMode = "single_char"
)
// PaddleLanguage is a string-valued enumeration of languages and script families.
//
// NOTE(review): presumably the language selector for the PaddleOCR backend —
// confirm against the configuration type that uses it.
type PaddleLanguage string
const (
// PaddleLanguageEnglish is English.
PaddleLanguageEnglish PaddleLanguage = "english"
// PaddleLanguageChinese is simplified Chinese.
PaddleLanguageChinese PaddleLanguage = "chinese"
// PaddleLanguageJapanese is Japanese.
PaddleLanguageJapanese PaddleLanguage = "japanese"
// PaddleLanguageKorean is Korean.
PaddleLanguageKorean PaddleLanguage = "korean"
// PaddleLanguageGerman is German.
PaddleLanguageGerman PaddleLanguage = "german"
// PaddleLanguageFrench is French.
PaddleLanguageFrench PaddleLanguage = "french"
// PaddleLanguageLatin is Latin script (covers most European languages).
PaddleLanguageLatin PaddleLanguage = "latin"
// PaddleLanguageCyrillic is Cyrillic (Russian and related).
PaddleLanguageCyrillic PaddleLanguage = "cyrillic"
// PaddleLanguageTraditionalChinese is traditional Chinese.
PaddleLanguageTraditionalChinese PaddleLanguage = "traditional_chinese"
// PaddleLanguageThai is Thai.
PaddleLanguageThai PaddleLanguage = "thai"
// PaddleLanguageGreek is Greek.
PaddleLanguageGreek PaddleLanguage = "greek"
// PaddleLanguageEastSlavic is East Slavic (Russian, Ukrainian, Belarusian).
PaddleLanguageEastSlavic PaddleLanguage = "east_slavic"
// PaddleLanguageArabic is Arabic script (Arabic, Persian, Urdu).
PaddleLanguageArabic PaddleLanguage = "arabic"
// PaddleLanguageDevanagari is Devanagari (Hindi, Marathi, Sanskrit, Nepali).
PaddleLanguageDevanagari PaddleLanguage = "devanagari"
// PaddleLanguageTamil is Tamil.
PaddleLanguageTamil PaddleLanguage = "tamil"
// PaddleLanguageTelugu is Telugu.
PaddleLanguageTelugu PaddleLanguage = "telugu"
)
// LayoutClass is a string-valued enumeration; each constant's value is the
// snake_case form of its variant name.
//
// NOTE(review): presumably the region classes reported by layout detection —
// confirm against the type that carries a LayoutClass.
type LayoutClass string
const (
// LayoutClassCaption is the Caption variant of LayoutClass.
LayoutClassCaption LayoutClass = "caption"
// LayoutClassFootnote is the Footnote variant of LayoutClass.
LayoutClassFootnote LayoutClass = "footnote"
// LayoutClassFormula is the Formula variant of LayoutClass.
LayoutClassFormula LayoutClass = "formula"
// LayoutClassListItem is the ListItem variant of LayoutClass.
LayoutClassListItem LayoutClass = "list_item"
// LayoutClassPageFooter is the PageFooter variant of LayoutClass.
LayoutClassPageFooter LayoutClass = "page_footer"
// LayoutClassPageHeader is the PageHeader variant of LayoutClass.
LayoutClassPageHeader LayoutClass = "page_header"
// LayoutClassPicture is the Picture variant of LayoutClass.
LayoutClassPicture LayoutClass = "picture"
// LayoutClassSectionHeader is the SectionHeader variant of LayoutClass.
LayoutClassSectionHeader LayoutClass = "section_header"
// LayoutClassTable is the Table variant of LayoutClass.
LayoutClassTable LayoutClass = "table"
// LayoutClassText is the Text variant of LayoutClass.
LayoutClassText LayoutClass = "text"
// LayoutClassTitle is the Title variant of LayoutClass.
LayoutClassTitle LayoutClass = "title"
// LayoutClassDocumentIndex is the DocumentIndex variant of LayoutClass.
LayoutClassDocumentIndex LayoutClass = "document_index"
// LayoutClassCode is the Code variant of LayoutClass.
LayoutClassCode LayoutClass = "code"
// LayoutClassCheckboxSelected is the CheckboxSelected variant of LayoutClass.
LayoutClassCheckboxSelected LayoutClass = "checkbox_selected"
// LayoutClassCheckboxUnselected is the CheckboxUnselected variant of LayoutClass.
LayoutClassCheckboxUnselected LayoutClass = "checkbox_unselected"
// LayoutClassForm is the Form variant of LayoutClass.
LayoutClassForm LayoutClass = "form"
// LayoutClassKeyValueRegion is the KeyValueRegion variant of LayoutClass.
LayoutClassKeyValueRegion LayoutClass = "key_value_region"
)
// CacheStats holds summary figures about a cache: a file count, two sizes, and
// two file ages. Every field is always present in the JSON encoding (no omitempty).
//
// NOTE(review): the units below are read off the field names (Mb, Days); confirm
// them against the native library, including whether "Mb" means megabytes.
type CacheStats struct {
// TotalFiles is the number of files, per the field name.
TotalFiles uint `json:"total_files"`
// TotalSizeMb is the total size; the name suggests megabytes.
TotalSizeMb float64 `json:"total_size_mb"`
// AvailableSpaceMb is the available space; the name suggests megabytes.
AvailableSpaceMb float64 `json:"available_space_mb"`
// OldestFileAgeDays is the age of the oldest file; the name suggests days.
OldestFileAgeDays float64 `json:"oldest_file_age_days"`
// NewestFileAgeDays is the age of the newest file; the name suggests days.
NewestFileAgeDays float64 `json:"newest_file_age_days"`
}
// AccelerationConfig is the hardware acceleration configuration for ONNX Runtime models.
//
// Controls which execution provider (CPU, CoreML, CUDA, TensorRT) is used
// for inference in layout detection and embedding generation.
//
// Example (Rust, carried over from the underlying library — not Go syntax):
//
// // Auto-select: CoreML on macOS, CUDA on Linux, CPU elsewhere
// let config = AccelerationConfig::default();
//
// // Force CPU only
// let config = AccelerationConfig {
// provider: kreuzberg::ExecutionProviderType::Cpu,
// ..Default::default()
// };
type AccelerationConfig struct {
// Execution provider to use for ONNX inference.
// Omitted from the JSON encoding when it is the zero value (omitempty).
Provider ExecutionProviderType `json:"provider,omitempty"`
// GPU device ID (for CUDA/TensorRT). Ignored for CPU/CoreML/Auto.
// Always present in the JSON encoding, so the Go zero value is sent as 0.
DeviceID uint32 `json:"device_id"`
}
// ContentFilterConfig is the cross-extractor content filtering configuration.
//
// Controls whether "furniture" content (headers, footers, page numbers,
// watermarks, repeating text) is included in or stripped from extraction
// results. Applies across all extractors (PDF, DOCX, RTF, ODT, HTML, etc.)
// with format-specific implementation.
//
// When `None` on `ExtractionConfig` (a nil pointer in Go), each extractor uses
// its current default behavior unchanged.
type ContentFilterConfig struct {
// Include running headers in extraction output.
//
// - PDF: Disables top-margin furniture stripping and prevents the layout
// model from treating `PageHeader`-classified regions as furniture.
// - DOCX: Includes document headers in text output.
// - RTF/ODT: Headers already included; this is a no-op when true.
// - HTML/EPUB: Keeps `<header>` element content.
//
// Default: `false` (headers are stripped or excluded).
IncludeHeaders bool `json:"include_headers"`
// Include running footers in extraction output.
//
// - PDF: Disables bottom-margin furniture stripping and prevents the layout
// model from treating `PageFooter`-classified regions as furniture.
// - DOCX: Includes document footers in text output.
// - RTF/ODT: Footers already included; this is a no-op when true.
// - HTML/EPUB: Keeps `<footer>` element content.
//
// Default: `false` (footers are stripped or excluded).
IncludeFooters bool `json:"include_footers"`
// Enable the heuristic cross-page repeating text detector.
//
// When `true` (default), text that repeats verbatim across a supermajority
// of pages is classified as furniture and stripped. Disable this if brand
// names or repeated headings are being incorrectly removed by the heuristic.
//
// Note: when a layout-detection model is active, the model may independently
// classify page-header / page-footer regions as furniture on a per-page basis.
// To preserve those regions, set `include_headers = true`, `include_footers = true`,
// or both, in addition to disabling this flag.
//
// Primarily affects PDF extraction.
//
// Default: `true`.
//
// This is a pointer so that "unset" can be told apart from false: a nil value
// is left out of the JSON encoding (omitempty), while a pointer to false is
// sent explicitly.
StripRepeatingText *bool `json:"strip_repeating_text,omitempty"`
// Include watermark text in extraction output.
//
// - PDF: Keeps watermark artifacts and arXiv identifiers.
// - Other formats: No effect currently.
//
// Default: `false` (watermarks are stripped).
IncludeWatermarks bool `json:"include_watermarks"`
}
// EmailConfig is the configuration for email extraction.
type EmailConfig struct {
// Windows codepage number to use when an MSG file contains no codepage property.
// Defaults to `None` (a nil pointer in Go, which is left out of the JSON
// encoding), which falls back to windows-1252.
//
// If an unrecognized or invalid codepage number is supplied (including 0),
// the behavior silently falls back to windows-1252 — the same as when the
// MSG file itself contains an unrecognized codepage. No error or warning is
// emitted. Users should verify output when supplying unusual values.
//
// Common values:
// - 1250: Central European (Polish, Czech, Hungarian, etc.)
// - 1251: Cyrillic (Russian, Ukrainian, Bulgarian, etc.)
// - 1252: Western European (default)
// - 1253: Greek
// - 1254: Turkish
// - 1255: Hebrew
// - 1256: Arabic
// - 932: Japanese (Shift-JIS)
// - 936: Simplified Chinese (GBK)
MsgFallbackCodepage *uint32 `json:"msg_fallback_codepage,omitempty"`
}
// ExtractionConfig is the main extraction configuration.
//
// This struct contains all configuration options for the extraction process.
// It can be loaded from TOML, YAML, or JSON files, or created programmatically.
//
// The field comments are carried over from the underlying Rust library, so they
// speak of `None` and `Some(x)`. In this Go struct those optional fields are
// pointers (or slices) tagged omitempty: nil plays the role of `None` and is left
// out of the JSON encoding. Fields without omitempty are always encoded, so
// their Go zero value is sent as-is.
//
// Example (Rust, carried over from the underlying library — not Go syntax):
//
// // Create with defaults
// let config = ExtractionConfig::default();
//
// // Load from TOML file
// // let config = ExtractionConfig::from_toml_file("kreuzberg.toml")?;
type ExtractionConfig struct {
// Enable caching of extraction results
UseCache *bool `json:"use_cache,omitempty"`
// Enable quality post-processing
EnableQualityProcessing *bool `json:"enable_quality_processing,omitempty"`
// OCR configuration (None = OCR disabled)
Ocr *OcrConfig `json:"ocr,omitempty"`
// Force OCR even for searchable PDFs
ForceOcr bool `json:"force_ocr"`
// Force OCR on specific pages only (1-indexed page numbers, must be >= 1).
//
// When set, only the listed pages are OCR'd regardless of text layer quality.
// Unlisted pages use native text extraction. Ignored when `force_ocr` is `true`.
// Only applies to PDF documents. Duplicates are automatically deduplicated.
// An `ocr` config is recommended for backend/language selection; defaults are used if absent.
ForceOcrPages []uint32 `json:"force_ocr_pages,omitempty"`
// Disable OCR entirely, even for images.
//
// When `true`, OCR is skipped for all document types. Images return metadata
// only (dimensions, format, EXIF) without text extraction. PDFs use only
// native text extraction without OCR fallback.
//
// Cannot be `true` simultaneously with `force_ocr`.
//
// *Added in v4.7.0.*
DisableOcr bool `json:"disable_ocr"`
// Text chunking configuration (None = chunking disabled)
Chunking *ChunkingConfig `json:"chunking,omitempty"`
// Content filtering configuration (None = use extractor defaults).
//
// Controls whether document "furniture" (headers, footers, watermarks,
// repeating text) is included in or stripped from extraction results.
// See [`ContentFilterConfig`] for per-field documentation.
ContentFilter *ContentFilterConfig `json:"content_filter,omitempty"`
// Image extraction configuration (None = no image extraction)
Images *ImageExtractionConfig `json:"images,omitempty"`
// PDF-specific options (None = use defaults)
PdfOptions *PdfConfig `json:"pdf_options,omitempty"`
// Token reduction configuration (None = no token reduction)
TokenReduction *TokenReductionOptions `json:"token_reduction,omitempty"`
// Language detection configuration (None = no language detection)
LanguageDetection *LanguageDetectionConfig `json:"language_detection,omitempty"`
// Page extraction configuration (None = no page tracking)
Pages *PageConfig `json:"pages,omitempty"`
// Keyword extraction configuration (None = no keyword extraction)
Keywords *KeywordConfig `json:"keywords,omitempty"`
// Post-processor configuration (None = use defaults)
Postprocessor *PostProcessorConfig `json:"postprocessor,omitempty"`
// HTML to Markdown conversion options (None = use defaults)
//
// Configure how HTML documents are converted to Markdown, including heading styles,
// list formatting, code block styles, and preprocessing options.
//
// NOTE(review): typed as *string here although the description is of a
// structured options value — confirm the encoding the native side expects.
HTMLOptions *string `json:"html_options,omitempty"`
// Styled HTML output configuration.
//
// When set alongside `output_format = OutputFormat::Html`, the extraction
// pipeline uses [`StyledHtmlRenderer`](crate::rendering::StyledHtmlRenderer)
// which emits stable `kb-*` CSS class hooks on every structural element
// and optionally embeds theme CSS or user-supplied CSS in a `<style>` block.
//
// When `None`, the existing plain comrak-based HTML renderer is used.
HTMLOutput *HTMLOutputConfig `json:"html_output,omitempty"`
// Default per-file timeout in seconds for batch extraction.
//
// When set, each file in a batch will be canceled after this duration
// unless overridden by [`FileExtractionConfig::timeout_secs`].
//
// Defaults to `Some(60)` to prevent pathological files (e.g. deeply
// nested archives, documents with millions of cells) from running
// indefinitely and exhausting caller resources. Set to `None` to
// disable the timeout for trusted input or long-running workloads.
//
// NOTE(review): a nil pointer is omitted from the JSON encoding; whether the
// native side then applies the default of 60 or disables the timeout is not
// visible here — confirm.
ExtractionTimeoutSecs *uint64 `json:"extraction_timeout_secs,omitempty"`
// Maximum concurrent extractions in batch operations (None = (num_cpus × 1.5).ceil()).
//
// Limits parallelism to prevent resource exhaustion when processing
// large batches. Defaults to (num_cpus × 1.5).ceil() when not set.
MaxConcurrentExtractions *uint `json:"max_concurrent_extractions,omitempty"`
// Result structure format
//
// Controls whether results are returned in unified format (default) with all
// content in the `content` field, or element-based format with semantic
// elements (for Unstructured-compatible output).
ResultFormat ResultFormat `json:"result_format,omitempty"`
// Security limits for archive extraction.
//
// Controls maximum archive size, compression ratio, file count, and other
// security thresholds to prevent decompression bomb attacks. Also caps
// nesting depth, iteration count, entity / token length, total
// content size, and table cell count for every extraction path that
// ingests user-controlled bytes.
// When `None`, default limits are used.
SecurityLimits *SecurityLimits `json:"security_limits,omitempty"`
// Maximum uncompressed size in bytes for a single embedded file before
// recursive extraction is attempted (default: 50 MiB).
//
// Applies to embedded objects inside OOXML containers (DOCX, PPTX) and
// to email attachments processed via recursive extraction. Files that
// exceed this limit are skipped with a `ProcessingWarning` rather than
// passed to the extraction pipeline, preventing a single oversized
// embedded object from consuming unbounded memory or time.
//
// Set to `None` to disable the per-embedded-file cap (falls back to
// `security_limits.max_archive_size` as the only guard).
MaxEmbeddedFileBytes *uint64 `json:"max_embedded_file_bytes,omitempty"`
// Content text format (default: Plain).
//
// Controls the format of the extracted content:
// - `Plain`: Raw extracted text (default)
// - `Markdown`: Markdown formatted output
// - `Djot`: Djot markup format (requires djot feature)
// - `Html`: HTML formatted output
//
// When set to a structured format, extraction results will include
// formatted output. The `formatted_content` field may be populated
// when format conversion is applied.
OutputFormat *OutputFormat `json:"output_format,omitempty"`
// Layout detection configuration (None = layout detection disabled).
//
// When set, PDF pages and images are analyzed for document structure
// (headings, code, formulas, tables, figures, etc.) using RT-DETR models
// via ONNX Runtime. For PDFs, layout hints override paragraph classification
// in the markdown pipeline. For images, per-region OCR is performed with
// markdown formatting based on detected layout classes.
// Requires the `layout-detection` feature to run inference; the field is
// present whenever the `layout-types` feature is active (which includes
// `layout-detection` as well as the no-ORT target groups).
Layout *LayoutDetectionConfig `json:"layout,omitempty"`
// Run layout detection on the non-OCR PDF markdown path.
//
// When `true` and `layout` is `Some(_)`, layout regions inform heading,
// table, list, and figure detection in the structure pipeline that would
// otherwise rely on font-clustering heuristics alone. Significantly
// improves SF1 (structural F1) at the cost of inference latency
// (~150-300ms/page CPU, ~20-50ms/page GPU). Default: `false`.
// Requires the `layout-detection` feature.
UseLayoutForMarkdown bool `json:"use_layout_for_markdown"`
// Enable structured document tree output.
//
// When true, populates the `document` field on `ExtractionResult` with a
// hierarchical `DocumentStructure` containing heading-driven section nesting,
// table grids, content layer classification, and inline annotations.
//
// Independent of `result_format` — can be combined with Unified or ElementBased.
IncludeDocumentStructure bool `json:"include_document_structure"`
// Hardware acceleration configuration for ONNX Runtime models.
//
// Controls execution provider selection for layout detection and embedding
// models. When `None`, uses platform defaults (CoreML on macOS, CUDA on
// Linux, CPU on Windows).
Acceleration *AccelerationConfig `json:"acceleration,omitempty"`
// Cache namespace for tenant isolation.
//
// When set, cache entries are stored under `{cache_dir}/{namespace}/`.
// Must be alphanumeric, hyphens, or underscores only (max 64 chars).
// Different namespaces have isolated cache spaces on the same filesystem.
CacheNamespace *string `json:"cache_namespace,omitempty"`
// Per-request cache TTL in seconds.
//
// Overrides the global `max_age_days` for this specific extraction.
// When `0`, caching is completely skipped (no read or write).
// When `None`, the global TTL applies.
CacheTTLSecs *uint64 `json:"cache_ttl_secs,omitempty"`
// Email extraction configuration (None = use defaults).
//
// Currently supports configuring the fallback codepage for MSG files
// that do not specify one. See `EmailConfig` for details.
Email *EmailConfig `json:"email,omitempty"`
// Concurrency limits for constrained environments (None = use defaults).
//
// Controls Rayon thread pool size, ONNX Runtime intra-op threads, and
// (when `max_concurrent_extractions` is unset) the batch concurrency
// semaphore. See `ConcurrencyConfig` for details.
//
// NOTE(review): typed as *string here although the description refers to a
// structured `ConcurrencyConfig` — confirm the encoding the native side expects.
Concurrency *string `json:"concurrency,omitempty"`
// Maximum recursion depth for archive extraction (default: 3).
// Set to 0 to disable recursive extraction (legacy behavior).
//
// NOTE(review): this field has no omitempty, so the Go zero value is encoded
// as 0 — which the line above documents as "disable" — rather than the
// documented default of 3. Confirm callers are expected to set it explicitly.
MaxArchiveDepth uint `json:"max_archive_depth"`
// Tree-sitter language pack configuration (None = tree-sitter disabled).
//
// When set, enables code file extraction using tree-sitter parsers.
// Controls grammar download behavior and code analysis options.
TreeSitter *TreeSitterConfig `json:"tree_sitter,omitempty"`
// Structured extraction via LLM (None = disabled).
//
// When set, the extracted document content is sent to an LLM with the
// provided JSON schema. The structured response is stored in
// `ExtractionResult::structured_output`.
StructuredExtraction *StructuredExtractionConfig `json:"structured_extraction,omitempty"`
// Cancellation token for this extraction (None = no external cancellation).
//
// Pass a [`CancellationToken`] clone here and call [`CancellationToken::cancel`]
// from another thread / task to abort the extraction in progress. The extractor
// checks the token at safe checkpoints (before lock acquisition, between pages,
// between batch items) and returns [`KreuzbergError::Cancelled`] when set.
//
// The field is excluded from serialization because `CancellationToken` is a
// runtime handle, not a configuration value.
//
// NOTE(review): the paragraph above describes the Rust field. In this Go struct
// the field is a *string with a `cancel_token` JSON tag, so a non-nil value IS
// encoded — confirm whether it should be tagged `json:"-"` or dropped.
CancelToken *string `json:"cancel_token,omitempty"`
}
// FileExtractionConfig holds per-file extraction configuration overrides for batch processing.
//
// All fields are `Option<T>` — `None` means "use the batch-level default."
// In this Go struct every field is a pointer or slice tagged omitempty, so a nil
// field is left out of the JSON encoding.
// This type is used with `batch_extract_files` and
// `batch_extract_bytes` to allow heterogeneous
// extraction settings within a single batch.
//
// # Excluded Fields
//
// The following `ExtractionConfig` fields are batch-level only and
// cannot be overridden per file:
// - `max_concurrent_extractions` — controls batch parallelism
// - `use_cache` — global caching policy
// - `acceleration` — shared ONNX execution provider
// - `security_limits` — global archive security policy
//
// Example (Rust, carried over from the underlying library — not Go syntax):
//
// // Override just OCR forcing for a specific file
// let config = FileExtractionConfig {
// force_ocr: Some(true),
// ..Default::default()
// };
type FileExtractionConfig struct {
// Override quality post-processing for this file.
EnableQualityProcessing *bool `json:"enable_quality_processing,omitempty"`
// Override OCR configuration for this file (None in the Option = use batch default).
Ocr *OcrConfig `json:"ocr,omitempty"`
// Override force OCR for this file.
ForceOcr *bool `json:"force_ocr,omitempty"`
// Override force OCR pages for this file (1-indexed page numbers).
// A nil or empty slice is left out of the JSON encoding (omitempty).
ForceOcrPages []uint32 `json:"force_ocr_pages,omitempty"`
// Override disable OCR for this file.
DisableOcr *bool `json:"disable_ocr,omitempty"`
// Override chunking configuration for this file.
Chunking *ChunkingConfig `json:"chunking,omitempty"`
// Override content filtering configuration for this file.
ContentFilter *ContentFilterConfig `json:"content_filter,omitempty"`
// Override image extraction configuration for this file.
Images *ImageExtractionConfig `json:"images,omitempty"`
// Override PDF options for this file.
PdfOptions *PdfConfig `json:"pdf_options,omitempty"`
// Override token reduction for this file.
TokenReduction *TokenReductionOptions `json:"token_reduction,omitempty"`
// Override language detection for this file.
LanguageDetection *LanguageDetectionConfig `json:"language_detection,omitempty"`
// Override page extraction for this file.
Pages *PageConfig `json:"pages,omitempty"`
// Override keyword extraction for this file.
Keywords *KeywordConfig `json:"keywords,omitempty"`
// Override post-processor for this file.
Postprocessor *PostProcessorConfig `json:"postprocessor,omitempty"`
// Override HTML conversion options for this file.
//
// NOTE(review): typed as *string although it describes conversion options —
// confirm the encoding the native side expects.
HTMLOptions *string `json:"html_options,omitempty"`
// Override result format for this file.
ResultFormat *ResultFormat `json:"result_format,omitempty"`
// Override output content format for this file.
OutputFormat *OutputFormat `json:"output_format,omitempty"`
// Override document structure output for this file.
IncludeDocumentStructure *bool `json:"include_document_structure,omitempty"`
// Override layout detection for this file.
Layout *LayoutDetectionConfig `json:"layout,omitempty"`
// Override per-file extraction timeout in seconds.
//
// When set, the extraction for this file will be canceled after the
// specified duration. A timed-out file produces an error result without
// affecting other files in the batch.
TimeoutSecs *uint64 `json:"timeout_secs,omitempty"`
// Override tree-sitter configuration for this file.
TreeSitter *TreeSitterConfig `json:"tree_sitter,omitempty"`
// Override structured extraction configuration for this file.
//
// When set, enables LLM-based structured extraction with a JSON schema
// for this specific file. The extracted content is sent to a VLM/LLM
// and the response is parsed according to the provided schema.
StructuredExtraction *StructuredExtractionConfig `json:"structured_extraction,omitempty"`
}
// BatchBytesItem is a single in-memory input of a batch extraction job.
//
// Used with `batch_extract_bytes` and `batch_extract_bytes_sync`
// to represent a single item in a batch extraction job.
//
// The type has its own MarshalJSON method, which encodes Content as a JSON
// array of integers rather than Go's default base64 string.
type BatchBytesItem struct {
// The content bytes to extract from.
Content []byte `json:"content"`
// MIME type of the content (e.g., "application/pdf", "text/html").
MimeType string `json:"mime_type"`
// Per-item configuration overrides. A nil pointer is omitted from the JSON
// (None on the native side), in which case the batch-level defaults are used.
Config *FileExtractionConfig `json:"config,omitempty"`
}
// MarshalJSON implements json.Marshaler for BatchBytesItem.
//
// Content is written as a JSON array of integers (one element per byte), the
// shape Rust's serde `Vec<u8>` deserializer expects, instead of the base64
// string encoding/json would produce for a `[]byte`. A nil or empty Content
// is written as `[]`. MimeType and Config are passed through unchanged.
func (v BatchBytesItem) MarshalJSON() ([]byte, error) {
	// The wire type repeats every field explicitly. Embedding BatchBytesItem
	// would emit the same JSON key twice (base64 string and integer array).
	type wireItem struct {
		Content  []int                 `json:"content"`
		MimeType string                `json:"mime_type"`
		Config   *FileExtractionConfig `json:"config,omitempty"`
	}

	// Widen each byte to int so encoding/json renders a numeric array.
	octets := make([]int, 0, len(v.Content))
	for _, octet := range v.Content {
		octets = append(octets, int(octet))
	}

	return json.Marshal(wireItem{
		Content:  octets,
		MimeType: v.MimeType,
		Config:   v.Config,
	})
}
// BatchFileItem is a single on-disk input of a batch extraction job.
//
// Used with `batch_extract_files` and `batch_extract_files_sync`
// to represent a single file in a batch extraction job.
type BatchFileItem struct {
// Path to the file to extract from.
Path string `json:"path"`
// Per-file configuration overrides. A nil pointer is omitted from the JSON
// (None on the native side), in which case the batch-level defaults are used.
Config *FileExtractionConfig `json:"config,omitempty"`
}
// ImageExtractionConfig holds the image extraction configuration.
//
// Pointer fields are omitted from the JSON when nil, which leaves the choice of
// value to the native side. The plain bool fields (IncludePageRasters,
// OcrTextOnly, AppendOcrText) carry no omitempty and are therefore always
// serialized, with Go's zero value `false` sent explicitly.
type ImageExtractionConfig struct {
// Extract images from documents.
ExtractImages *bool `json:"extract_images,omitempty"`
// Target DPI for image normalization.
TargetDpi *int32 `json:"target_dpi,omitempty"`
// Maximum dimension for images (width or height).
MaxImageDimension *int32 `json:"max_image_dimension,omitempty"`
// Whether to inject image reference placeholders into markdown output.
// When `true` (default), image references like `![Image 1](embedded:p1_i0)`
// are appended to the markdown. Set to `false` to extract images as data
// without polluting the markdown output.
InjectPlaceholders *bool `json:"inject_placeholders,omitempty"`
// Automatically adjust DPI based on image content.
AutoAdjustDpi *bool `json:"auto_adjust_dpi,omitempty"`
// Minimum DPI threshold.
MinDpi *int32 `json:"min_dpi,omitempty"`
// Maximum DPI threshold.
MaxDpi *int32 `json:"max_dpi,omitempty"`
// Maximum number of image objects to extract per PDF page.
//
// Some PDFs (e.g. technical diagrams stored as thousands of raster fragments)
// can trigger extremely long or indefinite extraction times when every image
// object on a dense page is decoded individually via the PDF extractor. Setting this
// limit causes kreuzberg to stop collecting individual images once the count
// per page reaches the cap and emit a warning instead.
//
// `None` (a nil pointer here; the default) means no limit — all images are extracted.
MaxImagesPerPage *uint32 `json:"max_images_per_page,omitempty"`
// When `true` (default), extracted images are classified by kind and grouped
// into clusters where they appear to belong to one figure.
Classify *bool `json:"classify,omitempty"`
// When `true`, full-page renders produced during OCR preprocessing are captured
// and returned as `ImageKind::PageRaster` entries in `ExtractionResult.images`.
//
// **PDF + OCR only.** No rasters are captured for non-PDF inputs or when the
// document-level OCR bypass is active (whole-document backend). When OCR is
// enabled and this flag is set but the active backend skips per-page rendering,
// a `ProcessingWarning` is emitted in `ExtractionResult.processing_warnings`.
//
// Defaults to `false`. Enable when downstream consumers need page thumbnails
// (e.g. citation previews, visual grounding).
IncludePageRasters bool `json:"include_page_rasters"`
// Run OCR on extracted images and include the recognized text in the document content.
//
// When `true` (default) and `ExtractionConfig.ocr` is configured, extracted images
// are processed with the configured OCR backend. Set to `false` to extract images
// without OCR processing, even when OCR is enabled.
RunOcrOnImages *bool `json:"run_ocr_on_images,omitempty"`
// When `true`, image OCR results are rendered as plain text without the
// `![...](...)` markdown placeholder. Only takes effect when `run_ocr_on_images`
// is also `true`.
OcrTextOnly bool `json:"ocr_text_only"`
// When `true` and `ocr_text_only` is `false`, append the OCR text after
// the image placeholder in the rendered output.
AppendOcrText bool `json:"append_ocr_text"`
}
// TokenReductionOptions holds the token reduction configuration.
type TokenReductionOptions struct {
// Reduction mode: "off", "light", "moderate", "aggressive", "maximum".
//
// NOTE(review): the field has no omitempty, so an unset Mode is serialized as
// the empty string, which is not one of the listed modes — confirm how the
// native side treats it.
Mode string `json:"mode"`
// Preserve important words (capitalized, technical terms). Omitted from the
// JSON when nil.
PreserveImportantWords *bool `json:"preserve_important_words,omitempty"`
}
// LanguageDetectionConfig holds the language detection configuration.
type LanguageDetectionConfig struct {
// Enable language detection. Omitted from the JSON when nil.
Enabled *bool `json:"enabled,omitempty"`
// Minimum confidence threshold (0.0-1.0). Omitted from the JSON when nil.
MinConfidence *float64 `json:"min_confidence,omitempty"`
// Detect multiple languages in the document. Always serialized; the Go zero
// value sends `false`.
DetectMultiple bool `json:"detect_multiple"`
}
// HTMLOutputConfig is the configuration for styled HTML output.
//
// When set on [`ExtractionConfig::html_output`] alongside
// `output_format = OutputFormat::Html`, the pipeline builds a
// [`StyledHtmlRenderer`](crate::rendering::StyledHtmlRenderer) instead of
// the plain comrak-based renderer.
//
// Example (Rust, shown for the underlying library type):
//
// let config = HtmlOutputConfig {
// theme: HtmlTheme::GitHub,
// css: Some(".kb-p { font-size: 1.1rem; }".to_string()),
// ..Default::default()
// };
type HTMLOutputConfig struct {
// Inline CSS string injected into the output after the theme stylesheet.
// Concatenated after `css_file` content when both are set.
CSS *string `json:"css,omitempty"`
// Path to a CSS file loaded once at renderer construction time.
// Concatenated before `css` when both are set.
CSSFile *string `json:"css_file,omitempty"`
// Built-in colour/typography theme. Default: [`HtmlTheme::Unstyled`].
Theme *HTMLTheme `json:"theme,omitempty"`
// CSS class prefix applied to every emitted class name.
//
// Default: `"kb-"`. Change this if your host application already uses
// classes that start with `kb-`.
//
// NOTE(review): this is a plain string without omitempty, so a zero-value
// HTMLOutputConfig serializes `class_prefix` as "" rather than omitting it.
// Confirm whether callers must set "kb-" explicitly to get the default.
ClassPrefix string `json:"class_prefix"`
// When `true` (default), write the resolved CSS into a `<style>` block
// immediately after the opening `<div class="{prefix}doc">`.
//
// Set to `false` to emit only the structural markup and wire up your
// own stylesheet targeting the `kb-*` class names.
EmbedCSS *bool `json:"embed_css,omitempty"`
}
// LayoutDetectionConfig holds the layout detection configuration.
//
// Controls layout detection behavior in the extraction pipeline.
// When set on [`ExtractionConfig`](super::ExtractionConfig), layout detection
// is enabled for PDF extraction.
type LayoutDetectionConfig struct {
// Confidence threshold override (nil/None = use model default).
ConfidenceThreshold *float32 `json:"confidence_threshold,omitempty"`
// Whether to apply postprocessing heuristics (default: true).
ApplyHeuristics *bool `json:"apply_heuristics,omitempty"`
// Table structure recognition model.
//
// Controls which model is used for table cell detection within layout-detected
// table regions. Defaults to [`TableModel::Tatr`]. The zero value is omitted
// from the JSON (omitempty).
TableModel TableModel `json:"table_model,omitempty"`
// Hardware acceleration for ONNX models (layout detection + table structure).
//
// When set, controls which execution provider (CPU, CUDA, CoreML, TensorRT)
// is used for inference. Defaults to `None` (auto-select per platform).
Acceleration *AccelerationConfig `json:"acceleration,omitempty"`
}
// LlmConfig is the configuration for an LLM provider/model via liter-llm.
//
// Each feature (VLM OCR, VLM embeddings, structured extraction) carries
// its own `LlmConfig`, allowing different providers per feature.
//
// Example (TOML config file):
//
// [structured_extraction.llm]
// model = "openai/gpt-4o"
// api_key = "sk-..." # or use KREUZBERG_LLM_API_KEY env var
type LlmConfig struct {
// Provider/model string using liter-llm routing format.
//
// Examples: `"openai/gpt-4o"`, `"anthropic/claude-sonnet-4-20250514"`,
// `"groq/llama-3.1-70b-versatile"`.
//
// Always serialized (no omitempty), including when empty.
Model string `json:"model"`
// API key for the provider. When `None` (nil), liter-llm falls back to
// the provider's standard environment variable (e.g., `OPENAI_API_KEY`).
APIKey *string `json:"api_key,omitempty"`
// Custom base URL override for the provider endpoint.
BaseURL *string `json:"base_url,omitempty"`
// Request timeout in seconds (default: 60).
TimeoutSecs *uint64 `json:"timeout_secs,omitempty"`
// Maximum retry attempts (default: 3).
MaxRetries *uint32 `json:"max_retries,omitempty"`
// Sampling temperature for generation tasks.
Temperature *float64 `json:"temperature,omitempty"`
// Maximum tokens to generate.
MaxTokens *uint64 `json:"max_tokens,omitempty"`
}
// StructuredExtractionConfig is the configuration for LLM-based structured data extraction.
//
// Sends extracted document content to a VLM with a JSON schema,
// returning structured data that conforms to the schema.
//
// Example (TOML config file):
//
// [structured_extraction]
// schema_name = "invoice_data"
// strict = true
//
// [structured_extraction.schema]
// type = "object"
// properties.vendor = { type = "string" }
// properties.total = { type = "number" }
// required = ["vendor", "total"]
//
// [structured_extraction.llm]
// model = "openai/gpt-4o"
type StructuredExtractionConfig struct {
// JSON Schema defining the desired output structure, carried as raw JSON.
// The field has no omitempty; encoding/json writes a nil RawMessage as `null`.
Schema json.RawMessage `json:"schema"`
// Schema name passed to the LLM's structured output mode.
SchemaName string `json:"schema_name"`
// Optional schema description for the LLM.
SchemaDescription *string `json:"schema_description,omitempty"`
// Enable strict mode — output must exactly match the schema.
Strict bool `json:"strict"`
// Custom Jinja2 extraction prompt template. When `None` (nil), a default template is used.
//
// Available template variables:
// - `{{ content }}` — The extracted document text.
// - `{{ schema }}` — The JSON schema as a formatted string.
// - `{{ schema_name }}` — The schema name.
// - `{{ schema_description }}` — The schema description (may be empty).
Prompt *string `json:"prompt,omitempty"`
// LLM configuration for the extraction. Embedded by value, so it is always
// serialized.
Llm LlmConfig `json:"llm"`
}
// OcrQualityThresholds holds the quality thresholds for OCR fallback decisions
// and pipeline quality gating.
//
// All fields default to the values that match the previous hardcoded behavior,
// so `OcrQualityThresholds::default()` preserves existing semantics exactly.
//
// Every field is a pointer tagged omitempty: a nil field is left out of the
// JSON instead of being sent as zero.
type OcrQualityThresholds struct {
// Minimum total non-whitespace characters to consider text substantive.
MinTotalNonWhitespace *uint `json:"min_total_non_whitespace,omitempty"`
// Minimum non-whitespace characters per page on average.
MinNonWhitespacePerPage *float64 `json:"min_non_whitespace_per_page,omitempty"`
// Minimum character count for a word to be "meaningful".
MinMeaningfulWordLen *uint `json:"min_meaningful_word_len,omitempty"`
// Minimum count of meaningful words before text is accepted.
MinMeaningfulWords *uint `json:"min_meaningful_words,omitempty"`
// Minimum alphanumeric ratio (non-whitespace chars that are alphanumeric).
MinAlnumRatio *float64 `json:"min_alnum_ratio,omitempty"`
// Minimum Unicode replacement characters (U+FFFD) to trigger OCR fallback.
MinGarbageChars *uint `json:"min_garbage_chars,omitempty"`
// Maximum fraction of short (1-2 char) words before text is considered fragmented.
MaxFragmentedWordRatio *float64 `json:"max_fragmented_word_ratio,omitempty"`
// Critical fragmentation threshold — triggers OCR regardless of meaningful words.
// Normal English text has ~20-30% short words. 80%+ is definitive garbage.
CriticalFragmentedWordRatio *float64 `json:"critical_fragmented_word_ratio,omitempty"`
// Minimum average word length. Below this with enough words indicates garbled extraction.
MinAvgWordLength *float64 `json:"min_avg_word_length,omitempty"`
// Minimum word count before average word length check applies.
MinWordsForAvgLengthCheck *uint `json:"min_words_for_avg_length_check,omitempty"`
// Minimum consecutive word repetition ratio to detect column scrambling.
MinConsecutiveRepeatRatio *float64 `json:"min_consecutive_repeat_ratio,omitempty"`
// Minimum word count before consecutive repetition check is applied.
MinWordsForRepeatCheck *uint `json:"min_words_for_repeat_check,omitempty"`
// Minimum character count for "substantive markdown" OCR skip gate.
SubstantiveMinChars *uint `json:"substantive_min_chars,omitempty"`
// Minimum character count for "non-text content" OCR skip gate.
NonTextMinChars *uint `json:"non_text_min_chars,omitempty"`
// Alphanumeric+whitespace ratio threshold for skip decisions.
AlnumWsRatioThreshold *float64 `json:"alnum_ws_ratio_threshold,omitempty"`
// Minimum quality score (0.0-1.0) for a pipeline stage result to be accepted.
// If the result from a backend scores below this, try the next backend.
PipelineMinQuality *float64 `json:"pipeline_min_quality,omitempty"`
}
// OcrPipelineStage is a single backend stage in the OCR pipeline.
type OcrPipelineStage struct {
// Backend name: "tesseract", "paddleocr", "easyocr", or a custom registered name.
Backend string `json:"backend"`
// Priority weight (higher = tried first). Stages are sorted by priority descending.
Priority uint32 `json:"priority"`
// Language override for this stage (nil/None = use parent OcrConfig.language).
Language *string `json:"language,omitempty"`
// Tesseract-specific config override for this stage.
TesseractConfig *TesseractConfig `json:"tesseract_config,omitempty"`
// PaddleOCR-specific config for this stage, carried as raw JSON.
PaddleOcrConfig *json.RawMessage `json:"paddle_ocr_config,omitempty"`
// VLM config override for this pipeline stage.
VlmConfig *LlmConfig `json:"vlm_config,omitempty"`
// Arbitrary per-call options passed through to the backend unchanged,
// carried as raw JSON.
//
// Backends that support runtime tuning (mode switching, preprocessing
// flags, inference parameters, etc.) read this value and deserialize
// the keys they care about. Keys unknown to the backend are silently
// ignored, so options from different backends can coexist in the same
// config without conflict.
//
// Example (custom backend):
// ```json
// { "mode": "fast", "enable_layout": true }
// ```
BackendOptions *json.RawMessage `json:"backend_options,omitempty"`
}
// OcrPipelineConfig describes a multi-backend OCR pipeline with quality-based fallback.
//
// Backends are tried in priority order (highest first). After each backend
// produces output, quality is evaluated. If it meets `quality_thresholds.pipeline_min_quality`,
// the result is accepted. Otherwise the next backend is tried.
type OcrPipelineConfig struct {
// List of backends to try. Sorted by priority (descending) at runtime, so
// the order given here is not significant. Omitted from the JSON when empty.
Stages []OcrPipelineStage `json:"stages,omitempty"`
// Quality thresholds for deciding whether to accept a result or try the next backend.
// Embedded by value, so the object is always serialized; its nil fields
// are omitted individually.
QualityThresholds OcrQualityThresholds `json:"quality_thresholds"`
}
// OcrConfig holds the OCR configuration.
type OcrConfig struct {
// Whether OCR is enabled.
//
// Setting `enabled: false` is a shorthand for `disable_ocr: true` on the parent
// [`ExtractionConfig`](crate::core::config::ExtractionConfig). Images return
// metadata only; PDFs use native text extraction without OCR fallback.
//
// Defaults to `true`. When `false`, all other OCR settings are ignored.
Enabled *bool `json:"enabled,omitempty"`
// OCR backend: tesseract, easyocr, paddleocr.
Backend string `json:"backend"`
// Language code (e.g., "eng", "deu").
Language string `json:"language"`
// Tesseract-specific configuration (optional).
TesseractConfig *TesseractConfig `json:"tesseract_config,omitempty"`
// Output format for OCR results (optional, for format conversion).
OutputFormat *OutputFormat `json:"output_format,omitempty"`
// PaddleOCR-specific configuration (optional, JSON passthrough).
PaddleOcrConfig *json.RawMessage `json:"paddle_ocr_config,omitempty"`
// Arbitrary per-call options passed through to the backend unchanged.
//
// Custom OCR backends and built-in backends that support runtime tuning
// can read this value and deserialize the keys they care about. Keys
// unknown to the backend are silently ignored.
//
// This is the recommended extension point for per-call parameters that
// are not covered by the typed fields above (e.g. mode switching,
// preprocessing flags, inference batch size).
//
// **Scope:** when `pipeline` is `None`, this value is propagated to the
// primary stage of the auto-constructed pipeline. When `pipeline` is
// explicitly set, this field has **no effect** — the caller must set
// `OcrPipelineStage.backend_options` directly on the relevant stage(s)
// instead.
//
// Example:
// ```json
// { "mode": "fast", "enable_layout": true, "timeout_ms": 5000 }
// ```
BackendOptions *json.RawMessage `json:"backend_options,omitempty"`
// OCR element extraction configuration.
ElementConfig *OcrElementConfig `json:"element_config,omitempty"`
// Quality thresholds for the native-text-to-OCR fallback decision.
// When None (nil), uses compiled defaults (matching previous hardcoded behavior).
QualityThresholds *OcrQualityThresholds `json:"quality_thresholds,omitempty"`
// Multi-backend OCR pipeline configuration. When set, enables weighted
// fallback across multiple OCR backends based on output quality.
// When None (nil), uses the single `backend` field (same as today).
Pipeline *OcrPipelineConfig `json:"pipeline,omitempty"`
// Enable automatic page rotation based on orientation detection.
//
// When enabled, uses Tesseract's `DetectOrientationScript()` to detect
// page orientation (0/90/180/270 degrees) before OCR. If the page is
// rotated with high confidence, the image is corrected before recognition.
// This is critical for handling rotated scanned documents.
//
// Always serialized (no omitempty); the Go zero value sends `false`.
AutoRotate bool `json:"auto_rotate"`
// VLM (Vision Language Model) OCR configuration.
//
// Required when `backend` is `"vlm"`. Uses liter-llm to send page
// images to a vision model for text extraction.
VlmConfig *LlmConfig `json:"vlm_config,omitempty"`
// Custom Jinja2 prompt template for VLM OCR.
//
// When `None` (nil), uses the default template. Available variables:
// - `{{ language }}` — The document language code (e.g., "eng", "deu").
VlmPrompt *string `json:"vlm_prompt,omitempty"`
// Hardware acceleration for ONNX Runtime models (e.g. PaddleOCR, layout detection).
//
// Not user-configurable via config files — injected at runtime from
// `ExtractionConfig::acceleration` before each `process_image` call.
Acceleration *AccelerationConfig `json:"acceleration,omitempty"`
// Caller-supplied Tesseract `traineddata` bytes per language code.
//
// Primary use case is the WASM build, which has no filesystem and cannot
// download tessdata at runtime. Native builds typically rely on
// `TessdataManager` and ignore this field. When present, the WASM
// Tesseract backend prefers these bytes over its compile-time-bundled
// English data.
//
// Skipped by serde to keep config files small — supply via the typed API
// at runtime.
//
// NOTE(review): on the Go side the tag still emits `tessdata_bytes` whenever
// the map is non-empty, and encoding/json renders each []byte value as a
// base64 string. Confirm that the native deserializer really ignores the key,
// as the paragraph above implies.
TessdataBytes map[string][]byte `json:"tessdata_bytes,omitempty"`
}
// PageConfig holds the page extraction and tracking configuration.
//
// Controls how pages are extracted, tracked, and represented in the extraction results.
// When `None` (a nil *PageConfig where this type is used as an optional field),
// page tracking is disabled.
//
// Page range tracking in chunk metadata (first_page/last_page) is automatically enabled
// when page boundaries are available and chunking is configured.
type PageConfig struct {
// Extract pages as separate array (ExtractionResult.pages).
// Always serialized; the Go zero value sends `false`.
ExtractPages bool `json:"extract_pages"`
// Insert page markers in main content string.
// Always serialized; the Go zero value sends `false`.
InsertPageMarkers bool `json:"insert_page_markers"`
// Page marker format (use {page_num} placeholder).
// Default: "\n\n<!-- PAGE {page_num} -->\n\n"
MarkerFormat *string `json:"marker_format,omitempty"`
}
// PdfConfig holds the PDF-specific configuration.
//
// The plain bool fields carry no omitempty and are always serialized, with
// Go's zero value `false` sent explicitly; pointer and slice fields are
// omitted from the JSON when nil/empty.
type PdfConfig struct {
// Extract images from PDF.
ExtractImages bool `json:"extract_images"`
// Extract tables from PDF.
//
// When `true` (default), runs pdf_oxide's native grid detector and, if it
// finds nothing, falls back to the heuristic text-layer reconstruction in
// `pdf::oxide::table::extract_tables_heuristic`. Set to `false` to skip
// both passes — `tables` will then be empty in the result.
ExtractTables *bool `json:"extract_tables,omitempty"`
// List of passwords to try when opening encrypted PDFs.
Passwords []string `json:"passwords,omitempty"`
// Extract PDF metadata.
ExtractMetadata *bool `json:"extract_metadata,omitempty"`
// Hierarchy extraction configuration (nil/None = hierarchy extraction disabled).
Hierarchy *HierarchyConfig `json:"hierarchy,omitempty"`
// Extract PDF annotations (text notes, highlights, links, stamps).
// Default: false
ExtractAnnotations bool `json:"extract_annotations"`
// Top margin fraction (0.0-1.0) of page height to exclude headers/running heads.
// Default: 0.06 (6%)
TopMarginFraction *float32 `json:"top_margin_fraction,omitempty"`
// Bottom margin fraction (0.0-1.0) of page height to exclude footers/page numbers.
// Default: 0.05 (5%)
BottomMarginFraction *float32 `json:"bottom_margin_fraction,omitempty"`
// Allow single-column pseudo tables in extraction results.
//
// By default, tables with fewer than 2 columns (layout-guided) or 3 columns
// (heuristic) are rejected. When `true`, the minimum column count is relaxed
// to 1, allowing single-column structured data (glossaries, itemized lists)
// to be emitted as tables. Other quality filters (density, sparsity, prose
// detection) still apply.
AllowSingleColumnTables bool `json:"allow_single_column_tables"`
// Perform OCR on inline images extracted from PDF pages and attach the
// recognized text to each `ExtractedImage.ocr_result`. Requires Tesseract
// to be available; if `ExtractionConfig.ocr` is `None` the extractor
// falls back to `TesseractConfig::default()`. Per-image failures degrade
// gracefully (the image is returned without OCR text rather than failing
// the whole extraction). Default: `false`.
OcrInlineImages bool `json:"ocr_inline_images"`
}
// HierarchyConfig holds the hierarchy extraction configuration for PDF text
// structure analysis.
//
// Enables extraction of document hierarchy levels (H1-H6) based on font size
// clustering and semantic analysis. When enabled, hierarchical blocks are
// included in page content.
//
// Every field is a pointer tagged omitempty: a nil field is left out of the
// JSON instead of being sent as zero.
type HierarchyConfig struct {
// Enable hierarchy extraction.
Enabled *bool `json:"enabled,omitempty"`
// Number of font size clusters to use for hierarchy levels (1-7).
//
// Default: 6, which provides H1-H6 heading levels with body text.
// Larger values create more fine-grained hierarchy levels.
KClusters *uint `json:"k_clusters,omitempty"`
// Include bounding box information in hierarchy blocks.
IncludeBbox *bool `json:"include_bbox,omitempty"`
// OCR coverage threshold for smart OCR triggering (0.0-1.0).
//
// Determines when OCR should be triggered based on text block coverage.
// OCR is triggered when text blocks cover less than this fraction of the page.
// Default: 0.5 (trigger OCR if less than 50% of page has text)
OcrCoverageThreshold *float32 `json:"ocr_coverage_threshold,omitempty"`
}
// PostProcessorConfig holds the post-processor configuration.
type PostProcessorConfig struct {
// Enable post-processors.
Enabled *bool `json:"enabled,omitempty"`
// Whitelist of processor names to run (nil/None = all enabled).
EnabledProcessors []string `json:"enabled_processors,omitempty"`
// Blacklist of processor names to skip (nil/None = none disabled).
DisabledProcessors []string `json:"disabled_processors,omitempty"`
// Pre-computed AHashSet for O(1) enabled processor lookup; exposed here as a
// plain string slice.
//
// NOTE(review): "pre-computed" suggests the native side derives this from
// EnabledProcessors — confirm whether callers are expected to set it.
EnabledSet []string `json:"enabled_set,omitempty"`
// Pre-computed AHashSet for O(1) disabled processor lookup; exposed here as a
// plain string slice.
//
// NOTE(review): as with EnabledSet, confirm whether callers should set this.
DisabledSet []string `json:"disabled_set,omitempty"`
}
// ChunkingConfig holds the chunking configuration.
//
// Configures text chunking for document content, including chunk size,
// overlap, trimming behavior, and optional embeddings.
//
// The type has a custom UnmarshalJSON method that decodes Sizing through
// UnmarshalChunkSizing.
//
// Rust-side note: use `..Default::default()` when constructing to allow for
// future field additions:
// ```rust
// let config = ChunkingConfig {
// max_characters: 500,
// ..Default::default()
// };
// ```
type ChunkingConfig struct {
// Maximum size per chunk (in units determined by `sizing`).
//
// When `sizing` is `Characters` (default), this is the max character count.
// When using token-based sizing, this is the max token count.
//
// Serialized under the JSON key `max_chars`, not `max_characters`.
//
// Default: 1000
MaxCharacters *uint `json:"max_chars,omitempty"`
// Overlap between chunks (in units determined by `sizing`).
//
// Serialized under the JSON key `max_overlap`.
//
// Default: 200
Overlap *uint `json:"max_overlap,omitempty"`
// Whether to trim whitespace from chunk boundaries.
//
// Default: true
Trim *bool `json:"trim,omitempty"`
// Type of chunker to use (Text or Markdown).
//
// Default: Text
ChunkerType *ChunkerType `json:"chunker_type,omitempty"`
// Optional embedding configuration for chunk embeddings.
Embedding *EmbeddingConfig `json:"embedding,omitempty"`
// Use a preset configuration (overrides individual settings if provided).
Preset *string `json:"preset,omitempty"`
// How to measure chunk size.
//
// Default: `Characters` (Unicode character count).
// Enable `chunking-tiktoken` or `chunking-tokenizers` features for token-based sizing.
//
// The field has no omitempty, so it is always present in the marshaled JSON.
Sizing ChunkSizing `json:"sizing"`
// When `true` and `chunker_type` is `Markdown`, prepend the heading hierarchy
// path (e.g. `"# Title > ## Section\n\n"`) to each chunk's content string.
//
// This is useful for RAG pipelines where each chunk needs self-contained
// context about its position in the document structure.
//
// Default: `false`
PrependHeadingContext bool `json:"prepend_heading_context"`
// Optional cosine similarity threshold for semantic topic boundary detection.
//
// Only used when `chunker_type` is `Semantic` and an `EmbeddingConfig` is
// provided. You almost never need to set this. When omitted, defaults to
// `0.75` which works well for most documents. Lower values detect more
// topic boundaries (more, smaller chunks); higher values detect fewer.
// Range: `0.0..=1.0`.
TopicThreshold *float32 `json:"topic_threshold,omitempty"`
}
// UnmarshalJSON implements json.Unmarshaler for ChunkingConfig.
//
// Every field except Sizing is decoded by encoding/json through a mirror
// struct and then copied onto s unconditionally, so a key missing from data
// resets the matching field to its zero value. Sizing is captured as raw JSON
// and handed to UnmarshalChunkSizing; when the key is missing or `null` the
// current s.Sizing is left untouched.
//
// Errors from either decoding step are returned unchanged. If the sizing
// step fails, the other fields have already been overwritten.
func (s *ChunkingConfig) UnmarshalJSON(data []byte) error {
	// Mirror of ChunkingConfig in which Sizing stays undecoded, so that it can
	// be routed through UnmarshalChunkSizing below.
	type wireChunking struct {
		MaxCharacters         *uint            `json:"max_chars,omitempty"`
		Overlap               *uint            `json:"max_overlap,omitempty"`
		Trim                  *bool            `json:"trim,omitempty"`
		ChunkerType           *ChunkerType     `json:"chunker_type,omitempty"`
		Embedding             *EmbeddingConfig `json:"embedding,omitempty"`
		Preset                *string          `json:"preset,omitempty"`
		Sizing                json.RawMessage  `json:"sizing,omitempty"`
		PrependHeadingContext bool             `json:"prepend_heading_context"`
		TopicThreshold        *float32         `json:"topic_threshold,omitempty"`
	}

	var wire wireChunking
	if err := json.Unmarshal(data, &wire); err != nil {
		return err
	}

	// Copy the plainly-decoded fields before touching Sizing.
	s.MaxCharacters, s.Overlap, s.Trim = wire.MaxCharacters, wire.Overlap, wire.Trim
	s.ChunkerType, s.Embedding, s.Preset = wire.ChunkerType, wire.Embedding, wire.Preset
	s.PrependHeadingContext, s.TopicThreshold = wire.PrependHeadingContext, wire.TopicThreshold

	// Absent or explicit null: keep whatever Sizing the receiver already holds.
	if len(wire.Sizing) == 0 || string(wire.Sizing) == "null" {
		return nil
	}

	sizing, err := UnmarshalChunkSizing(wire.Sizing)
	if err != nil {
		return err
	}
	s.Sizing = sizing
	return nil
}
// EmbeddingConfig holds the embedding configuration for text chunks.
//
// Configures embedding generation using ONNX models via the vendored embedding engine.
// Requires the `embeddings` feature to be enabled.
//
// The type has a custom UnmarshalJSON method that decodes Model through
// UnmarshalEmbeddingModelType.
type EmbeddingConfig struct {
// The embedding model to use (defaults to "balanced" preset if not specified).
// The field has no omitempty, so it is always present in the marshaled JSON.
Model EmbeddingModelType `json:"model"`
// Whether to normalize embedding vectors (recommended for cosine similarity).
Normalize *bool `json:"normalize,omitempty"`
// Batch size for embedding generation.
BatchSize *uint `json:"batch_size,omitempty"`
// Show model download progress. Always serialized; the Go zero value sends
// `false`.
ShowDownloadProgress bool `json:"show_download_progress"`
// Custom cache directory for model files.
//
// Defaults to `~/.cache/kreuzberg/embeddings/` if not specified.
// Allows full customization of model download location.
CacheDir *string `json:"cache_dir,omitempty"`
// Hardware acceleration for the embedding ONNX model.
//
// When set, controls which execution provider (CPU, CUDA, CoreML, TensorRT)
// is used for inference. Defaults to `None` (auto-select per platform).
Acceleration *AccelerationConfig `json:"acceleration,omitempty"`
// Maximum wall-clock duration (in seconds) for a single `embed()` call when
// using [`EmbeddingModelType::Plugin`].
//
// Applies only to the in-process plugin path — protects against hung
// host-language backends (e.g. a Python callback deadlocked on the GIL,
// a model stuck on CUDA OOM retries, etc.). On timeout, the dispatcher
// returns `Plugin` instead of blocking forever.
//
// `None` disables the timeout. The default (60 seconds) is conservative
// for common in-process inference; increase for large batches on slow
// hardware.
//
// NOTE(review): a nil pointer here is omitted from the JSON; confirm whether
// the native side reads an omitted key as "no timeout" or as the 60 second
// default.
MaxEmbedDurationSecs *uint64 `json:"max_embed_duration_secs,omitempty"`
}
// UnmarshalJSON implements json.Unmarshaler for EmbeddingConfig.
//
// Every field except Model is decoded by encoding/json through a mirror
// struct and then copied onto s unconditionally, so a key missing from data
// resets the matching field to its zero value. Model is captured as raw JSON
// and handed to UnmarshalEmbeddingModelType; when the key is missing or
// `null` the current s.Model is left untouched.
//
// Errors from either decoding step are returned unchanged. If the model step
// fails, the other fields have already been overwritten.
func (s *EmbeddingConfig) UnmarshalJSON(data []byte) error {
	// Mirror of EmbeddingConfig in which Model stays undecoded, so that it can
	// be routed through UnmarshalEmbeddingModelType below.
	type wireEmbedding struct {
		Model                json.RawMessage     `json:"model,omitempty"`
		Normalize            *bool               `json:"normalize,omitempty"`
		BatchSize            *uint               `json:"batch_size,omitempty"`
		ShowDownloadProgress bool                `json:"show_download_progress"`
		CacheDir             *string             `json:"cache_dir,omitempty"`
		Acceleration         *AccelerationConfig `json:"acceleration,omitempty"`
		MaxEmbedDurationSecs *uint64             `json:"max_embed_duration_secs,omitempty"`
	}

	var wire wireEmbedding
	if err := json.Unmarshal(data, &wire); err != nil {
		return err
	}

	// Copy the plainly-decoded fields before touching Model.
	s.Normalize, s.BatchSize = wire.Normalize, wire.BatchSize
	s.ShowDownloadProgress, s.CacheDir = wire.ShowDownloadProgress, wire.CacheDir
	s.Acceleration, s.MaxEmbedDurationSecs = wire.Acceleration, wire.MaxEmbedDurationSecs

	// Absent or explicit null: keep whatever Model the receiver already holds.
	if len(wire.Model) == 0 || string(wire.Model) == "null" {
		return nil
	}

	model, err := UnmarshalEmbeddingModelType(wire.Model)
	if err != nil {
		return err
	}
	s.Model = model
	return nil
}
// TreeSitterConfig is the configuration for tree-sitter language pack integration.
//
// Controls grammar download behavior and code analysis options.
//
// # Example (TOML)
//
// ```toml
// [tree_sitter]
// languages = ["python", "rust"]
// groups = ["web"]
//
// [tree_sitter.process]
// structure = true
// comments = true
// docstrings = true
// ```
type TreeSitterConfig struct {
// Enable code intelligence processing (default: true).
//
// When `false`, tree-sitter analysis is completely skipped even if
// the config section is present.
Enabled *bool `json:"enabled,omitempty"`
// Custom cache directory for downloaded grammars.
//
// When `None` (nil), uses the default: `~/.cache/tree-sitter-language-pack/v{version}/libs/`.
CacheDir *string `json:"cache_dir,omitempty"`
// Languages to pre-download on init (e.g., `["python", "rust"]`).
Languages []string `json:"languages,omitempty"`
// Language groups to pre-download (e.g., `["web", "systems", "scripting"]`).
Groups []string `json:"groups,omitempty"`
// Processing options for code analysis. Embedded by value, so the `process`
// object is always serialized.
Process TreeSitterProcessConfig `json:"process"`
}
// TreeSitterProcessConfig holds the processing options for tree-sitter code analysis.
//
// Controls which analysis features are enabled when extracting code files.
//
// Options documented as defaulting to true are nil-able pointers and are
// omitted from the JSON when nil; options documented as defaulting to false
// are plain bools and are always serialized.
type TreeSitterProcessConfig struct {
// Extract structural items (functions, classes, structs, etc.). Default: true.
Structure *bool `json:"structure,omitempty"`
// Extract import statements. Default: true.
Imports *bool `json:"imports,omitempty"`
// Extract export statements. Default: true.
Exports *bool `json:"exports,omitempty"`
// Extract comments. Default: false.
Comments bool `json:"comments"`
// Extract docstrings. Default: false.
Docstrings bool `json:"docstrings"`
// Extract symbol definitions. Default: false.
Symbols bool `json:"symbols"`
// Include parse diagnostics. Default: false.
Diagnostics bool `json:"diagnostics"`
// Maximum chunk size in bytes. `None` (nil) disables chunking.
ChunkMaxSize *uint `json:"chunk_max_size,omitempty"`
// Content rendering mode for code extraction. The zero value is omitted from
// the JSON (omitempty).
ContentMode CodeContentMode `json:"content_mode,omitempty"`
}
// SupportedFormat is a supported document format entry.
//
// It pairs a file extension with the corresponding MIME type that Kreuzberg can process.
type SupportedFormat struct {
// Extension is the file extension (without leading dot), e.g., "pdf", "docx".
Extension string `json:"extension"`
// MimeType is the MIME type string, e.g., "application/pdf".
MimeType string `json:"mime_type"`
}
// ServerConfig is the API server configuration.
//
// This struct holds all configuration options for the Kreuzberg API server,
// including host/port settings, CORS configuration, and upload limits.
//
// # Defaults
//
//   - `host`: "127.0.0.1" (localhost only)
//   - `port`: 8000
//   - `cors_origins`: empty list (allows all origins)
//   - `max_request_body_bytes`: 104_857_600 (100 MB)
//   - `max_multipart_field_bytes`: 104_857_600 (100 MB)
//
// NOTE(review): these defaults are the ones documented for the underlying
// library. Every field except CorsOrigins is encoded unconditionally (no
// omitempty), so a zero-valued ServerConfig serializes host "" and port 0;
// whether the native side substitutes the defaults for those is not visible
// here — confirm before relying on it.
type ServerConfig struct {
// Host is the server host address (e.g., "127.0.0.1", "0.0.0.0").
Host string `json:"host"`
// Port is the server port number.
Port uint16 `json:"port"`
// CorsOrigins lists the CORS allowed origins. An empty slice means allow all origins.
//
// If this is empty, the server will accept requests from any origin.
// If populated with specific origins (e.g., `"https://example.com"`), only
// those origins will be allowed.
CorsOrigins []string `json:"cors_origins,omitempty"`
// MaxRequestBodyBytes is the maximum size of a request body in bytes (default: 100 MB).
MaxRequestBodyBytes uint `json:"max_request_body_bytes"`
// MaxMultipartFieldBytes is the maximum size of multipart fields in bytes (default: 100 MB).
MaxMultipartFieldBytes uint `json:"max_multipart_field_bytes"`
}
// StructuredDataResult carries a content string, a format label, string-keyed
// metadata, and a list of text fields.
//
// NOTE(review): the generator emitted no upstream description for this type.
// The field notes below are inferred from the field names only — confirm
// against the underlying library's documentation.
type StructuredDataResult struct {
// Content is presumably the extracted text of the structured document.
Content string `json:"content"`
// Format is presumably the name of the source data format — TODO confirm the allowed values.
Format string `json:"format"`
// Metadata is a string-to-string map; omitted from the encoded JSON when empty.
Metadata map[string]string `json:"metadata,omitempty"`
// TextFields is a list of strings; omitted from the encoded JSON when empty.
// Presumably the names of fields that contributed text — verify upstream.
TextFields []string `json:"text_fields,omitempty"`
}
// DocxAppProperties holds the application properties from docProps/app.xml for DOCX.
//
// It contains Word-specific document statistics and metadata. Every field is
// a pointer tagged omitempty, so nil fields are left out of the encoded JSON.
type DocxAppProperties struct {
// Application is the application name (e.g., "Microsoft Office Word").
Application *string `json:"application,omitempty"`
// AppVersion is the application version.
AppVersion *string `json:"app_version,omitempty"`
// Template is the template filename.
Template *string `json:"template,omitempty"`
// TotalTime is the total editing time in minutes.
TotalTime *int32 `json:"total_time,omitempty"`
// Pages is the number of pages.
Pages *int32 `json:"pages,omitempty"`
// Words is the number of words.
Words *int32 `json:"words,omitempty"`
// Characters is the number of characters (excluding spaces).
Characters *int32 `json:"characters,omitempty"`
// CharactersWithSpaces is the number of characters (including spaces).
CharactersWithSpaces *int32 `json:"characters_with_spaces,omitempty"`
// Lines is the number of lines.
Lines *int32 `json:"lines,omitempty"`
// Paragraphs is the number of paragraphs.
Paragraphs *int32 `json:"paragraphs,omitempty"`
// Company is the company name.
Company *string `json:"company,omitempty"`
// DocSecurity is the document security level.
DocSecurity *int32 `json:"doc_security,omitempty"`
// ScaleCrop is the scale crop flag.
ScaleCrop *bool `json:"scale_crop,omitempty"`
// LinksUpToDate is the links-up-to-date flag.
LinksUpToDate *bool `json:"links_up_to_date,omitempty"`
// SharedDoc is the shared document flag.
SharedDoc *bool `json:"shared_doc,omitempty"`
// HyperlinksChanged is the hyperlinks-changed flag.
HyperlinksChanged *bool `json:"hyperlinks_changed,omitempty"`
}
// XlsxAppProperties holds the application properties from docProps/app.xml for XLSX.
//
// It contains Excel-specific document metadata. Every field is tagged
// omitempty, so nil pointers and empty slices are left out of the encoded JSON.
type XlsxAppProperties struct {
// Application is the application name (e.g., "Microsoft Excel").
Application *string `json:"application,omitempty"`
// AppVersion is the application version.
AppVersion *string `json:"app_version,omitempty"`
// DocSecurity is the document security level.
DocSecurity *int32 `json:"doc_security,omitempty"`
// ScaleCrop is the scale crop flag.
ScaleCrop *bool `json:"scale_crop,omitempty"`
// LinksUpToDate is the links-up-to-date flag.
LinksUpToDate *bool `json:"links_up_to_date,omitempty"`
// SharedDoc is the shared document flag.
SharedDoc *bool `json:"shared_doc,omitempty"`
// HyperlinksChanged is the hyperlinks-changed flag.
HyperlinksChanged *bool `json:"hyperlinks_changed,omitempty"`
// Company is the company name.
Company *string `json:"company,omitempty"`
// WorksheetNames lists the worksheet names.
WorksheetNames []string `json:"worksheet_names,omitempty"`
}
// PptxAppProperties holds the application properties from docProps/app.xml for PPTX.
//
// It contains PowerPoint-specific document metadata. Every field is tagged
// omitempty, so nil pointers and empty slices are left out of the encoded JSON.
type PptxAppProperties struct {
// Application is the application name (e.g., "Microsoft Office PowerPoint").
Application *string `json:"application,omitempty"`
// AppVersion is the application version.
AppVersion *string `json:"app_version,omitempty"`
// TotalTime is the total editing time in minutes.
TotalTime *int32 `json:"total_time,omitempty"`
// Company is the company name.
Company *string `json:"company,omitempty"`
// DocSecurity is the document security level.
DocSecurity *int32 `json:"doc_security,omitempty"`
// ScaleCrop is the scale crop flag.
ScaleCrop *bool `json:"scale_crop,omitempty"`
// LinksUpToDate is the links-up-to-date flag.
LinksUpToDate *bool `json:"links_up_to_date,omitempty"`
// SharedDoc is the shared document flag.
SharedDoc *bool `json:"shared_doc,omitempty"`
// HyperlinksChanged is the hyperlinks-changed flag.
HyperlinksChanged *bool `json:"hyperlinks_changed,omitempty"`
// Slides is the number of slides.
Slides *int32 `json:"slides,omitempty"`
// Notes is the number of notes.
Notes *int32 `json:"notes,omitempty"`
// HiddenSlides is the number of hidden slides.
HiddenSlides *int32 `json:"hidden_slides,omitempty"`
// MultimediaClips is the number of multimedia clips.
MultimediaClips *int32 `json:"multimedia_clips,omitempty"`
// PresentationFormat is the presentation format (e.g., "Widescreen", "Standard").
PresentationFormat *string `json:"presentation_format,omitempty"`
// SlideTitles lists the slide titles.
SlideTitles []string `json:"slide_titles,omitempty"`
}
// CoreProperties holds the Dublin Core metadata from docProps/core.xml.
//
// It contains standard metadata fields defined by the Dublin Core standard
// and Office-specific extensions. Every field is an optional string: nil
// fields are left out of the encoded JSON. Timestamps are carried as strings
// and are not parsed by this type.
type CoreProperties struct {
// Title is the document title.
Title *string `json:"title,omitempty"`
// Subject is the document subject/topic.
Subject *string `json:"subject,omitempty"`
// Creator is the document creator/author.
Creator *string `json:"creator,omitempty"`
// Keywords holds keywords or tags, as a single string.
Keywords *string `json:"keywords,omitempty"`
// Description is the document description/abstract.
Description *string `json:"description,omitempty"`
// LastModifiedBy is the user who last modified the document.
LastModifiedBy *string `json:"last_modified_by,omitempty"`
// Revision is the revision number.
Revision *string `json:"revision,omitempty"`
// Created is the creation timestamp (ISO 8601).
Created *string `json:"created,omitempty"`
// Modified is the last modification timestamp (ISO 8601).
Modified *string `json:"modified,omitempty"`
// Category is the document category.
Category *string `json:"category,omitempty"`
// ContentStatus is the content status (Draft, Final, etc.).
ContentStatus *string `json:"content_status,omitempty"`
// Language is the document language.
Language *string `json:"language,omitempty"`
// Identifier is the unique identifier.
Identifier *string `json:"identifier,omitempty"`
// Version is the document version.
Version *string `json:"version,omitempty"`
// LastPrinted is the last print timestamp (ISO 8601).
LastPrinted *string `json:"last_printed,omitempty"`
}
// SecurityLimits holds the configuration for security limits across extractors.
//
// All limits are intentionally conservative to prevent DoS attacks
// while still supporting legitimate documents.
//
// Every field is a pointer tagged omitempty, so a nil limit is left out of
// the encoded JSON. The values in parentheses below are the ones documented
// upstream. NOTE(review): presumably they are the defaults applied by the
// native library when a limit is omitted — confirm.
type SecurityLimits struct {
// MaxArchiveSize is the maximum uncompressed size for archives (500 MB).
MaxArchiveSize *uint `json:"max_archive_size,omitempty"`
// MaxCompressionRatio is the maximum compression ratio before flagging as a potential bomb (100:1).
MaxCompressionRatio *uint `json:"max_compression_ratio,omitempty"`
// MaxFilesInArchive is the maximum number of files in an archive (10,000).
MaxFilesInArchive *uint `json:"max_files_in_archive,omitempty"`
// MaxNestingDepth is the maximum nesting depth for structures (100).
MaxNestingDepth *uint `json:"max_nesting_depth,omitempty"`
// MaxEntityLength is the maximum length of any single XML entity / attribute / token (1 MiB).
// This is a per-token cap, NOT a total cap — billion-laughs class
// attacks where a single entity expands to hundreds of MB are caught
// here, while normal long text content (a paragraph, a CDATA block) is
// caught by `max_content_size` instead.
MaxEntityLength *uint `json:"max_entity_length,omitempty"`
// MaxContentSize is the maximum string growth per document (100 MB).
MaxContentSize *uint `json:"max_content_size,omitempty"`
// MaxIterations is the maximum number of iterations per operation.
MaxIterations *uint `json:"max_iterations,omitempty"`
// MaxXMLDepth is the maximum XML depth (100 levels).
MaxXMLDepth *uint `json:"max_xml_depth,omitempty"`
// MaxTableCells is the maximum number of cells per table (100,000).
MaxTableCells *uint `json:"max_table_cells,omitempty"`
}
// TokenReductionConfig groups the options of the token reduction feature.
//
// NOTE(review): the generator emitted no upstream description for this type
// or its fields. The notes below are inferred from field names and types
// only — confirm semantics, units and defaults against the underlying
// library before relying on them.
//
// PreserveMarkdown and EnableSemanticClustering are plain bools without
// omitempty and are therefore always encoded (false by zero value); all other
// fields are left out of the encoded JSON when nil or empty.
type TokenReductionConfig struct {
// Level selects the reduction level; nil omits the key.
Level *ReductionLevel `json:"level,omitempty"`
// LanguageHint is an optional language hint string — format not visible here, TODO confirm.
LanguageHint *string `json:"language_hint,omitempty"`
// PreserveMarkdown presumably keeps Markdown markup intact during reduction.
PreserveMarkdown bool `json:"preserve_markdown"`
// PreserveCode presumably keeps code content intact during reduction.
PreserveCode *bool `json:"preserve_code,omitempty"`
// SemanticThreshold is an optional float threshold — valid range not visible here.
SemanticThreshold *float32 `json:"semantic_threshold,omitempty"`
// EnableParallel presumably toggles parallel processing.
EnableParallel *bool `json:"enable_parallel,omitempty"`
// UseSimd presumably toggles SIMD-accelerated processing.
UseSimd *bool `json:"use_simd,omitempty"`
// CustomStopwords maps a string key to a list of stopwords.
// Presumably the key is a language code — verify upstream.
CustomStopwords map[string][]string `json:"custom_stopwords,omitempty"`
// PreservePatterns is a list of pattern strings to preserve — pattern syntax not visible here.
PreservePatterns []string `json:"preserve_patterns,omitempty"`
// TargetReduction is an optional float reduction target — valid range not visible here.
TargetReduction *float32 `json:"target_reduction,omitempty"`
// EnableSemanticClustering presumably toggles semantic clustering.
EnableSemanticClustering bool `json:"enable_semantic_clustering"`
}
// PdfAnnotation is a PDF annotation extracted from a document page.
type PdfAnnotation struct {
// AnnotationType is the type of annotation.
AnnotationType PdfAnnotationType `json:"annotation_type"`
// Content is the text content of the annotation (e.g., comment text, link URL); nil when absent.
Content *string `json:"content,omitempty"`
// PageNumber is the page number where the annotation appears (1-indexed).
PageNumber uint32 `json:"page_number"`
// BoundingBox is the bounding box of the annotation on the page; nil when absent.
BoundingBox *BoundingBox `json:"bounding_box,omitempty"`
}
// DjotContent is a comprehensive Djot document structure with semantic preservation.
//
// This type captures the full richness of Djot markup, including:
//   - Block-level structures (headings, lists, blockquotes, code blocks, etc.)
//   - Inline formatting (emphasis, strong, highlight, subscript, superscript, etc.)
//   - Attributes (classes, IDs, key-value pairs)
//   - Links, images, footnotes
//   - Math expressions (inline and display)
//   - Tables with full structure
//
// Available when the `djot` feature of the underlying library is enabled.
type DjotContent struct {
// PlainText is the plain text representation, kept for backwards compatibility.
PlainText string `json:"plain_text"`
// Blocks is the structured block-level content.
Blocks []FormattedBlock `json:"blocks,omitempty"`
// Metadata is the metadata from YAML frontmatter.
Metadata Metadata `json:"metadata"`
// Tables holds the extracted tables as structured data.
Tables []Table `json:"tables,omitempty"`
// Images holds the extracted images with metadata.
Images []DjotImage `json:"images,omitempty"`
// Links holds the extracted links with URLs.
Links []DjotLink `json:"links,omitempty"`
// Footnotes holds the footnote definitions.
Footnotes []Footnote `json:"footnotes,omitempty"`
// Attributes holds attributes mapped by element identifier (if present).
//
// NOTE(review): the upstream description speaks of a mapping, but the Go
// type is a flat []string; how identifier and attributes are encoded in
// each entry is not visible here — confirm before parsing.
Attributes []string `json:"attributes,omitempty"`
}
// FormattedBlock is a block-level element in a Djot document.
//
// It represents structural elements like headings, paragraphs, lists, code
// blocks, etc. The type is recursive: Children holds nested FormattedBlock values.
type FormattedBlock struct {
// BlockType is the type of block element.
BlockType BlockType `json:"block_type"`
// Level is the heading level (1-6) for headings, or the nesting level for lists.
Level *uint `json:"level,omitempty"`
// InlineContent is the inline content within the block.
InlineContent []InlineElement `json:"inline_content,omitempty"`
// Attributes holds the element attributes (classes, IDs, key-value pairs).
// They arrive as a single optional string; its internal format is not
// visible here — TODO confirm.
Attributes *string `json:"attributes,omitempty"`
// Language is the language identifier for code blocks.
Language *string `json:"language,omitempty"`
// Code is the raw code content for code blocks.
Code *string `json:"code,omitempty"`
// Children holds the nested blocks for containers (blockquotes, list items, divs).
Children []FormattedBlock `json:"children,omitempty"`
}
// InlineElement is an inline element within a block.
//
// It represents text with formatting, links, images, etc.
type InlineElement struct {
// ElementType is the type of inline element.
ElementType InlineType `json:"element_type"`
// Content is the text content.
Content string `json:"content"`
// Attributes holds the element attributes as a single optional string;
// its internal format is not visible here — TODO confirm.
Attributes *string `json:"attributes,omitempty"`
// Metadata holds additional metadata (e.g., href for links, src/alt for images).
Metadata map[string]string `json:"metadata,omitempty"`
}
// DjotImage is an image element in Djot.
type DjotImage struct {
// Src is the image source URL or path.
Src string `json:"src"`
// Alt is the alternative text.
Alt string `json:"alt"`
// Title is the optional title; nil when absent.
Title *string `json:"title,omitempty"`
// Attributes holds the element attributes as a single optional string.
Attributes *string `json:"attributes,omitempty"`
}
// DjotLink is a link element in Djot.
type DjotLink struct {
// URL is the link URL.
URL string `json:"url"`
// Text is the link text content.
Text string `json:"text"`
// Title is the optional title; nil when absent.
Title *string `json:"title,omitempty"`
// Attributes holds the element attributes as a single optional string.
Attributes *string `json:"attributes,omitempty"`
}
// Footnote is a footnote definition in a Djot document.
type Footnote struct {
// Label is the footnote label.
Label string `json:"label"`
// Content holds the footnote content blocks.
Content []FormattedBlock `json:"content,omitempty"`
}
// DocumentStructure is the top-level structured document representation.
//
// It is a flat array of nodes with index-based parent/child references
// forming a tree. Root-level nodes have a nil Parent.
//
// NOTE(review): the upstream documentation refers to `body_roots()`,
// `furniture_roots()` and `validate()` for iterating top-level content by
// layer and for checking that node indices are in bounds and parent-child
// links are bidirectionally consistent. Those names come from the underlying
// library; whether this Go package exposes equivalents is not visible here.
type DocumentStructure struct {
// Nodes holds all nodes in document/reading order.
Nodes []DocumentNode `json:"nodes,omitempty"`
// SourceFormat is the origin format identifier (e.g. "docx", "pptx", "html", "pdf").
//
// It allows renderers to apply format-aware heuristics when converting
// the document tree to output formats.
SourceFormat *string `json:"source_format,omitempty"`
// Relationships holds the resolved relationships between nodes (footnote refs, citations, anchor links, etc.).
//
// Populated during derivation from the internal document representation.
// Empty when no relationships are detected.
Relationships []DocumentRelationship `json:"relationships,omitempty"`
// NodeTypes is the sorted, deduplicated list of node type names present in this document.
//
// Each value is the snake_case `node_type` tag of the corresponding
// [NodeContent] variant (e.g. `"paragraph"`, `"heading"`, `"table"`, …).
//
// Upstream this is computed from the nodes by
// `DocumentStructure::finalize_node_types` and is empty until that method
// has run (internal construction paths call it at the end of derivation).
NodeTypes []string `json:"node_types,omitempty"`
}
// DocumentRelationship is a resolved relationship between two nodes in the document tree.
type DocumentRelationship struct {
// Source is the source node index (the referencing node).
Source uint32 `json:"source"`
// Target is the target node index (the referenced node).
Target uint32 `json:"target"`
// Kind is the semantic kind of the relationship.
Kind RelationshipKind `json:"kind"`
}
// DocumentNode is a single node in the document tree.
//
// Each node has a deterministic `id`, typed `content`, optional `parent`/`children`
// for tree structure, and metadata like page number, bounding box, and content layer.
//
// Parent and Children are indices, not pointers.
// NOTE(review): presumably they index into DocumentStructure.Nodes — confirm.
type DocumentNode struct {
// ID is the deterministic identifier (hash of content + position).
ID string `json:"id"`
// Content is the node content — a tagged enum carrying type-specific data only.
Content NodeContent `json:"content"`
// Parent is the parent node index (nil = root-level node).
Parent *uint32 `json:"parent,omitempty"`
// Children holds the child node indices in reading order.
Children []uint32 `json:"children,omitempty"`
// ContentLayer is the content layer classification.
ContentLayer ContentLayer `json:"content_layer"`
// Page is the page number where this node starts (1-indexed).
Page *uint32 `json:"page,omitempty"`
// PageEnd is the page number where this node ends (for multi-page tables/sections).
PageEnd *uint32 `json:"page_end,omitempty"`
// Bbox is the bounding box in document coordinates.
Bbox *BoundingBox `json:"bbox,omitempty"`
// Annotations holds the inline annotations (formatting, links) on this node's text content.
//
// Only meaningful for text-carrying nodes; empty for containers.
Annotations []TextAnnotation `json:"annotations,omitempty"`
// Attributes holds format-specific key-value attributes.
//
// It is an extensible bag for miscellaneous data without a dedicated typed field: CSS classes,
// LaTeX environment names, Excel cell formulas, slide layout names, etc.
Attributes map[string]string `json:"attributes,omitempty"`
}
// UnmarshalJSON implements json.Unmarshaler for DocumentNode.
//
// All fields except Content are decoded by encoding/json through a shadow
// struct. Content is a tagged enum, so its JSON is captured verbatim and
// handed to UnmarshalNodeContent.
//
// The receiver is written only after every decoding step has succeeded, so a
// failed call leaves *s untouched. When the "content" key is absent or JSON
// null, Content is reset to its zero value like every other field instead of
// keeping whatever the receiver held before the call.
func (s *DocumentNode) UnmarshalJSON(data []byte) error {
	var raw struct {
		ID           string            `json:"id"`
		Content      json.RawMessage   `json:"content,omitempty"`
		Parent       *uint32           `json:"parent,omitempty"`
		Children     []uint32          `json:"children,omitempty"`
		ContentLayer ContentLayer      `json:"content_layer"`
		Page         *uint32           `json:"page,omitempty"`
		PageEnd      *uint32           `json:"page_end,omitempty"`
		Bbox         *BoundingBox      `json:"bbox,omitempty"`
		Annotations  []TextAnnotation  `json:"annotations,omitempty"`
		Attributes   map[string]string `json:"attributes,omitempty"`
	}
	if err := json.Unmarshal(data, &raw); err != nil {
		return err
	}

	// Resolve the tagged enum before touching the receiver; content stays at
	// its zero value when the key is missing or explicitly null.
	var content NodeContent
	if len(raw.Content) > 0 && string(raw.Content) != "null" {
		v, err := UnmarshalNodeContent(raw.Content)
		if err != nil {
			return err
		}
		content = v
	}

	s.ID = raw.ID
	s.Content = content
	s.Parent = raw.Parent
	s.Children = raw.Children
	s.ContentLayer = raw.ContentLayer
	s.Page = raw.Page
	s.PageEnd = raw.PageEnd
	s.Bbox = raw.Bbox
	s.Annotations = raw.Annotations
	s.Attributes = raw.Attributes
	return nil
}
// TableGrid is a structured table grid with cell-level metadata.
//
// It stores row/column dimensions and a flat list of cells with position info.
type TableGrid struct {
// Rows is the number of rows in the table.
Rows uint32 `json:"rows"`
// Cols is the number of columns in the table.
Cols uint32 `json:"cols"`
// Cells holds all cells in row-major order.
Cells []GridCell `json:"cells,omitempty"`
}
// GridCell is an individual grid cell with position and span metadata.
type GridCell struct {
// Content is the cell text content.
Content string `json:"content"`
// Row is the zero-indexed row position.
Row uint32 `json:"row"`
// Col is the zero-indexed column position.
Col uint32 `json:"col"`
// RowSpan is the number of rows this cell spans.
RowSpan uint32 `json:"row_span"`
// ColSpan is the number of columns this cell spans.
ColSpan uint32 `json:"col_span"`
// IsHeader reports whether this is a header cell.
IsHeader bool `json:"is_header"`
// Bbox is the bounding box for this cell (nil if not available).
Bbox *BoundingBox `json:"bbox,omitempty"`
}
// TextAnnotation is an inline text annotation — byte-range based formatting and links.
//
// Annotations reference byte offsets into the node's text content,
// enabling precise identification of formatted regions. The range is
// half-open: Start is inclusive and End is exclusive.
type TextAnnotation struct {
// Start is the start byte offset in the node's text content (inclusive).
Start uint32 `json:"start"`
// End is the end byte offset in the node's text content (exclusive).
End uint32 `json:"end"`
// Kind is the annotation type.
Kind AnnotationKind `json:"kind"`
}
// UnmarshalJSON implements json.Unmarshaler for TextAnnotation.
//
// Start and End are decoded by encoding/json through a shadow struct. The
// JSON of "kind" is captured verbatim and decoded by UnmarshalAnnotationKind.
//
// The receiver is written only after every decoding step has succeeded, so a
// failed call leaves *s untouched. When the "kind" key is absent or JSON
// null, Kind is reset to its zero value like the other fields instead of
// keeping whatever the receiver held before the call.
func (s *TextAnnotation) UnmarshalJSON(data []byte) error {
	var raw struct {
		Start uint32          `json:"start"`
		End   uint32          `json:"end"`
		Kind  json.RawMessage `json:"kind,omitempty"`
	}
	if err := json.Unmarshal(data, &raw); err != nil {
		return err
	}

	// Resolve the kind before touching the receiver; it stays at its zero
	// value when the key is missing or explicitly null.
	var kind AnnotationKind
	if len(raw.Kind) > 0 && string(raw.Kind) != "null" {
		v, err := UnmarshalAnnotationKind(raw.Kind)
		if err != nil {
			return err
		}
		kind = v
	}

	s.Start = raw.Start
	s.End = raw.End
	s.Kind = kind
	return nil
}
// ExtractionResult is the general extraction result used by the core extraction API.
//
// This is the main result type returned by all extraction functions.
// Content, MimeType and Metadata are always encoded; every other field is
// tagged omitempty and is left out of the JSON when nil or empty.
type ExtractionResult struct {
// Content is the extracted content of the document. For Djot documents it
// holds plain text; the structured form is in DjotContent.
Content string `json:"content"`
// MimeType is the MIME type string associated with this result.
// NOTE(review): presumably the detected type of the source document — confirm.
MimeType string `json:"mime_type"`
// Metadata is the metadata attached to this result.
Metadata Metadata `json:"metadata"`
// ExtractionMethod is the extraction strategy used to produce the returned text.
//
// Populated when the extractor can reliably distinguish native text extraction,
// OCR-only extraction, or mixed native/OCR output.
ExtractionMethod *ExtractionMethod `json:"extraction_method,omitempty"`
// Tables holds the tables associated with this result.
Tables []Table `json:"tables,omitempty"`
// DetectedLanguages is a list of language strings.
// NOTE(review): presumably language codes from language detection — format not visible here.
DetectedLanguages []string `json:"detected_languages,omitempty"`
// Chunks holds the text chunks when chunking is enabled.
//
// When chunking configuration is provided, the content is split into
// overlapping chunks for efficient processing. Each chunk contains the text,
// optional embeddings (if enabled), and metadata about its position.
Chunks []Chunk `json:"chunks,omitempty"`
// Images holds the extracted images from the document.
//
// When image extraction is enabled via `ImageExtractionConfig`, this field
// contains all images found in the document with their raw data and metadata.
// Each image may optionally contain a nested `ocr_result` if OCR was performed.
Images []ExtractedImage `json:"images,omitempty"`
// Pages holds the per-page content when page extraction is enabled.
//
// When page extraction is configured, the document is split into per-page content
// with tables and images mapped to their respective pages.
Pages []PageContent `json:"pages,omitempty"`
// Elements holds the semantic elements when element-based result format is enabled.
//
// When result_format is set to ElementBased, this field contains semantic
// elements with type classification, unique identifiers, and metadata for
// Unstructured-compatible element-based processing.
Elements []Element `json:"elements,omitempty"`
// DjotContent is the rich Djot content structure (when extracting Djot documents).
//
// When extracting Djot documents with structured extraction enabled,
// this field contains the full semantic structure including:
//   - Block-level elements with nesting
//   - Inline formatting with attributes
//   - Links, images, footnotes
//   - Math expressions
//   - Complete attribute information
//
// The `content` field still contains plain text for backward compatibility.
//
// Always nil for non-Djot documents.
DjotContent *DjotContent `json:"djot_content,omitempty"`
// OcrElements holds the OCR elements with full spatial and confidence metadata.
//
// When OCR is performed with element extraction enabled, this field contains
// the structured representation of detected text including:
//   - Bounding geometry (rectangles or quadrilaterals)
//   - Confidence scores (detection and recognition)
//   - Rotation information
//   - Hierarchical relationships (Tesseract only)
//
// This field preserves all metadata that would otherwise be lost when
// converting to plain text or markdown output formats.
//
// Only populated when `OcrElementConfig.include_elements` is true.
OcrElements []OcrElement `json:"ocr_elements,omitempty"`
// Document is the structured document tree (when document structure extraction is enabled).
//
// When `include_document_structure` is true in `ExtractionConfig`, this field
// contains the full hierarchical representation of the document including:
//   - Heading-driven section nesting
//   - Table grids with cell-level metadata
//   - Content layer classification (body, header, footer, footnote)
//   - Inline text annotations (formatting, links)
//   - Bounding boxes and page numbers
//
// Independent of `result_format` — can be combined with Unified or ElementBased.
Document *DocumentStructure `json:"document,omitempty"`
// ExtractedKeywords holds the extracted keywords when keyword extraction is enabled.
//
// When keyword extraction (RAKE or YAKE) is configured, this field contains
// the extracted keywords with scores, algorithm info, and position data.
// Previously stored in `metadata.additional["keywords"]`.
ExtractedKeywords []Keyword `json:"extracted_keywords,omitempty"`
// QualityScore is the document quality score from quality analysis.
//
// A value between 0.0 and 1.0 indicating the overall text quality.
// Previously stored in `metadata.additional["quality_score"]`.
QualityScore *float64 `json:"quality_score,omitempty"`
// ProcessingWarnings holds the non-fatal warnings collected during processing pipeline stages.
//
// Captures errors from optional pipeline features (embedding, chunking,
// language detection, output formatting) that don't prevent extraction
// but may indicate degraded results.
// Previously stored as individual keys in `metadata.additional`.
ProcessingWarnings []ProcessingWarning `json:"processing_warnings,omitempty"`
// Annotations holds the PDF annotations extracted from the document.
//
// When annotation extraction is enabled via `PdfConfig::extract_annotations`,
// this field contains text notes, highlights, links, stamps, and other
// annotations found in PDF documents.
Annotations []PdfAnnotation `json:"annotations,omitempty"`
// Children holds the nested extraction results from archive contents.
//
// When extracting archives, each processable file inside produces its own
// full extraction result. nil for non-archive formats.
// Use `max_archive_depth` in config to control recursion depth.
Children []ArchiveEntry `json:"children,omitempty"`
// Uris holds the URIs/links discovered during document extraction.
//
// Contains hyperlinks, image references, citations, email addresses, and
// other URI-like references found in the document. Always extracted when
// present in the source document.
Uris []ExtractedURI `json:"uris,omitempty"`
// Revisions holds the tracked changes embedded in the source document.
//
// Populated by per-format extractors that understand change-tracking
// metadata (DOCX `w:ins`/`w:del`/`w:rPrChange`, ODT `text:change-*`,
// …). Every extractor leaves this nil until its format-specific
// implementation is added. Extractors that do populate this field follow
// the "accepted-changes" convention: inserted text is present in
// `content`, deleted text is absent — the revision list is the separate
// audit trail.
Revisions []DocumentRevision `json:"revisions,omitempty"`
// StructuredOutput is the structured extraction output from LLM-based JSON schema extraction.
//
// When `structured_extraction` is configured in `ExtractionConfig`, the
// extracted document content is sent to a VLM with the provided JSON schema.
// The response is parsed and stored here as a JSON value matching the schema.
// It is kept as raw JSON; callers decode it into their own schema type.
StructuredOutput *json.RawMessage `json:"structured_output,omitempty"`
// CodeIntelligence holds the code intelligence results from tree-sitter analysis.
//
// Populated when extracting source code files with the `tree-sitter` feature.
// Contains metrics, structural analysis, imports/exports, comments,
// docstrings, symbols, diagnostics, and optionally chunked code segments.
//
// Stored as an opaque JSON value so that all language bindings (Go, Java,
// C#, …) can deserialize it as a raw JSON object rather than a typed struct.
// The underlying type is `tree_sitter_language_pack::ProcessResult`.
CodeIntelligence *json.RawMessage `json:"code_intelligence,omitempty"`
// LlmUsage holds the LLM token usage and cost data for all LLM calls made during this extraction.
//
// Contains one entry per LLM call. Multiple entries are produced when
// VLM OCR, structured extraction, or LLM embeddings run during
// the same extraction.
//
// nil when no LLM was used.
LlmUsage []LlmUsage `json:"llm_usage,omitempty"`
// FormattedContent is the pre-rendered content in the requested output format.
//
// Populated during `derive_extraction_result` before tree derivation consumes
// element data. `apply_output_format` swaps this into `content` at the end
// of the pipeline, after post-processors have operated on plain text.
FormattedContent *string `json:"formatted_content,omitempty"`
// OcrInternalDocument is the structured hOCR document for the OCR+layout pipeline.
//
// When tesseract produces hOCR output, the parsed `InternalDocument` carries
// paragraph structure with bounding boxes and confidence scores. The layout
// classification step enriches these elements before final rendering.
//
// NOTE(review): this binding exposes the value as an optional string; its
// encoding is not visible here — confirm before parsing.
OcrInternalDocument *string `json:"ocr_internal_document,omitempty"`
}
// ArchiveEntry is a single file extracted from an archive.
//
// When archives (ZIP, TAR, 7Z, GZIP) are extracted with recursive extraction
// enabled, each processable file produces its own full `ExtractionResult`.
type ArchiveEntry struct {
// Path is the archive-relative file path (e.g. "folder/document.pdf").
Path string `json:"path"`
// MimeType is the detected MIME type of the file.
MimeType string `json:"mime_type"`
// Result is the full extraction result for this file. Result.Children can
// in turn hold further ArchiveEntry values, so the types nest recursively.
Result ExtractionResult `json:"result"`
}
// ProcessingWarning is a non-fatal warning from a processing pipeline stage.
//
// It captures errors from optional features that don't prevent extraction
// but may indicate degraded results.
type ProcessingWarning struct {
// Source is the pipeline stage or feature that produced this warning
// (e.g., "embedding", "chunking", "language_detection", "output_format").
Source string `json:"source"`
// Message is a human-readable description of what went wrong.
Message string `json:"message"`
}
// LlmUsage holds the token usage and cost data for a single LLM call made during extraction.
//
// Populated when VLM OCR, structured extraction, or LLM-based embeddings
// are used. Multiple entries may be present when multiple LLM calls occur
// within one extraction (e.g. VLM OCR + structured extraction).
//
// The counters, cost and finish reason are optional pointers: nil fields are
// left out of the encoded JSON.
type LlmUsage struct {
// Model is the LLM model identifier (e.g. "openai/gpt-4o", "anthropic/claude-sonnet-4-20250514").
Model string `json:"model"`
// Source is the pipeline stage that triggered this LLM call
// (e.g. "vlm_ocr", "structured_extraction", "embeddings").
Source string `json:"source"`
// InputTokens is the number of input/prompt tokens consumed.
InputTokens *uint64 `json:"input_tokens,omitempty"`
// OutputTokens is the number of output/completion tokens generated.
OutputTokens *uint64 `json:"output_tokens,omitempty"`
// TotalTokens is the total number of tokens (input + output).
TotalTokens *uint64 `json:"total_tokens,omitempty"`
// EstimatedCost is the estimated cost in USD based on the provider's published pricing.
EstimatedCost *float64 `json:"estimated_cost,omitempty"`
// FinishReason says why the model stopped generating (e.g. "stop", "length", "content_filter").
FinishReason *string `json:"finish_reason,omitempty"`
}
// Chunk is a text chunk with optional embedding and metadata.
//
// Chunks are created when chunking is enabled in `ExtractionConfig`. Each chunk
// contains the text content, optional embedding vector (if embedding generation
// is configured), and metadata about its position in the document.
type Chunk struct {
// Content is the text content of this chunk.
Content string `json:"content"`
// ChunkType is the semantic structural classification of this chunk.
//
// Assigned by the heuristic classifier based on content patterns and
// heading context. Defaults to `ChunkType::Unknown` when no rule matches.
ChunkType ChunkType `json:"chunk_type"`
// Embedding is the optional embedding vector for this chunk.
//
// Only populated when `EmbeddingConfig` is provided in chunking configuration.
// The dimensionality depends on the chosen embedding model.
Embedding []float32 `json:"embedding,omitempty"`
// Metadata describes this chunk's position and properties.
Metadata ChunkMetadata `json:"metadata"`
}
// HeadingContext is the heading context for a chunk within a Markdown document.
//
// It contains the heading hierarchy from document root to this chunk's section.
type HeadingContext struct {
// Headings is the heading hierarchy from document root to this chunk's section.
// Index 0 is the outermost (h1), the last element is the most specific.
Headings []HeadingLevel `json:"headings,omitempty"`
}
// HeadingLevel is a single heading in the hierarchy.
type HeadingLevel struct {
// Level is the heading depth (1 = h1, 2 = h2, etc.).
Level uint8 `json:"level"`
// Text is the text content of the heading.
Text string `json:"text"`
}
// ChunkMetadata holds metadata about a chunk's position in the original document.
type ChunkMetadata struct {
// ByteStart is the byte offset where this chunk starts in the original text (UTF-8 valid boundary).
ByteStart uint `json:"byte_start"`
// ByteEnd is the byte offset where this chunk ends in the original text (UTF-8 valid boundary).
ByteEnd uint `json:"byte_end"`
// TokenCount is the number of tokens in this chunk (nil if not available).
//
// This is calculated by the embedding model's tokenizer if embeddings are enabled.
TokenCount *uint `json:"token_count,omitempty"`
// ChunkIndex is the zero-based index of this chunk in the document.
ChunkIndex uint `json:"chunk_index"`
// TotalChunks is the total number of chunks in the document.
TotalChunks uint `json:"total_chunks"`
// FirstPage is the first page number this chunk spans (1-indexed).
//
// Only populated when page tracking is enabled in extraction configuration.
FirstPage *uint32 `json:"first_page,omitempty"`
// LastPage is the last page number this chunk spans (1-indexed, equal to first_page for single-page chunks).
//
// Only populated when page tracking is enabled in extraction configuration.
LastPage *uint32 `json:"last_page,omitempty"`
// HeadingContext is the heading context when using the Markdown chunker.
//
// Contains the heading hierarchy this chunk falls under.
// Only populated when `ChunkerType::Markdown` is used.
HeadingContext *HeadingContext `json:"heading_context,omitempty"`
// ImageIndices holds indices into `ExtractionResult.images` for images on pages covered by this chunk.
//
// Contains zero-based indices into the top-level `images` collection for every
// image whose `page_number` falls within `[first_page, last_page]`.
// Empty when image extraction is disabled or the chunk spans no pages with images.
ImageIndices []uint32 `json:"image_indices,omitempty"`
}
// ExtractedImage is an image extracted from a document.
//
// Contains raw image data, metadata, and optional nested OCR results.
// Raw bytes allow cross-language compatibility - users can convert to
// PIL.Image (Python), Sharp (Node.js), or other formats as needed.
//
// The type has a custom MarshalJSON that writes Data as a JSON array of
// integers instead of Go's default base64 string.
type ExtractedImage struct {
// Raw image data (PNG, JPEG, WebP, etc. bytes).
// The Rust core holds this as `bytes::Bytes` for cheap cloning of large
// buffers; in Go it is a plain []byte.
Data []byte `json:"data"`
// Image format (e.g., "jpeg", "png", "webp").
// The Rust core uses Cow<'static, str> to avoid allocation for static
// literals; in Go it is a plain string.
Format string `json:"format"`
// Zero-indexed position of this image in the document/page.
ImageIndex uint32 `json:"image_index"`
// Page/slide number where image was found (1-indexed).
PageNumber *uint32 `json:"page_number,omitempty"`
// Image width in pixels.
Width *uint32 `json:"width,omitempty"`
// Image height in pixels.
Height *uint32 `json:"height,omitempty"`
// Colorspace information (e.g., "RGB", "CMYK", "Gray").
Colorspace *string `json:"colorspace,omitempty"`
// Bits per color component (e.g., 8, 16).
BitsPerComponent *uint32 `json:"bits_per_component,omitempty"`
// Whether this image is a mask image.
IsMask bool `json:"is_mask"`
// Optional description of the image.
Description *string `json:"description,omitempty"`
// Nested OCR extraction result (if image was OCRed).
//
// When OCR is performed on this image, the result is embedded here
// rather than in a separate collection, making the relationship explicit.
OcrResult *ExtractionResult `json:"ocr_result,omitempty"`
// Bounding box of the image on the page (PDF coordinates: x0=left, y0=bottom, x1=right, y1=top).
// Only populated for PDF-extracted images when position data is available from the PDF extractor.
BoundingBox *BoundingBox `json:"bounding_box,omitempty"`
// Original source path of the image within the document archive (e.g., "media/image1.png" in DOCX).
// Used for rendering image references when the binary data is not extracted.
SourcePath *string `json:"source_path,omitempty"`
// Heuristic classification of what this image likely depicts.
// nil if classification was disabled or inconclusive.
ImageKind *ImageKind `json:"image_kind,omitempty"`
// Confidence score for `image_kind`, in the range 0.0 to 1.0.
KindConfidence *float32 `json:"kind_confidence,omitempty"`
// Identifier shared across images that form a single logical figure
// (e.g. all raster tiles of one technical drawing). nil for singletons.
ClusterID *uint32 `json:"cluster_id,omitempty"`
}
// MarshalJSON implements json.Marshaler for ExtractedImage.
//
// encoding/json would render the `[]byte` Data field as a base64 string, but
// Rust's serde `Vec<u8>` deserializer expects a JSON array of integers, so
// Data is widened to `[]int` before encoding. Every other field is passed
// through unchanged, in the same order and under the same JSON keys.
func (v ExtractedImage) MarshalJSON() ([]byte, error) {
	// wire lists every field explicitly instead of embedding ExtractedImage:
	// embedding would yield both a base64-string and an int-array entry for
	// the same "data" key. Being a distinct method-less type, it is encoded
	// with the default struct rules and does not re-enter this method.
	type wire struct {
		Data             []int             `json:"data"`
		Format           string            `json:"format"`
		ImageIndex       uint32            `json:"image_index"`
		PageNumber       *uint32           `json:"page_number,omitempty"`
		Width            *uint32           `json:"width,omitempty"`
		Height           *uint32           `json:"height,omitempty"`
		Colorspace       *string           `json:"colorspace,omitempty"`
		BitsPerComponent *uint32           `json:"bits_per_component,omitempty"`
		IsMask           bool              `json:"is_mask"`
		Description      *string           `json:"description,omitempty"`
		OcrResult        *ExtractionResult `json:"ocr_result,omitempty"`
		BoundingBox      *BoundingBox      `json:"bounding_box,omitempty"`
		SourcePath       *string           `json:"source_path,omitempty"`
		ImageKind        *ImageKind        `json:"image_kind,omitempty"`
		KindConfidence   *float32          `json:"kind_confidence,omitempty"`
		ClusterID        *uint32           `json:"cluster_id,omitempty"`
	}

	// Allocated unconditionally, so a nil Data still encodes as `[]` and
	// never as `null`.
	octets := make([]int, 0, len(v.Data))
	for _, b := range v.Data {
		octets = append(octets, int(b))
	}

	return json.Marshal(wire{
		Data:             octets,
		Format:           v.Format,
		ImageIndex:       v.ImageIndex,
		PageNumber:       v.PageNumber,
		Width:            v.Width,
		Height:           v.Height,
		Colorspace:       v.Colorspace,
		BitsPerComponent: v.BitsPerComponent,
		IsMask:           v.IsMask,
		Description:      v.Description,
		OcrResult:        v.OcrResult,
		BoundingBox:      v.BoundingBox,
		SourcePath:       v.SourcePath,
		ImageKind:        v.ImageKind,
		KindConfidence:   v.KindConfidence,
		ClusterID:        v.ClusterID,
	})
}
// BoundingBox holds bounding box coordinates for element positioning.
//
// The box is given by two corners: (X0, Y0) is left/bottom and (X1, Y1) is
// right/top.
type BoundingBox struct {
// Left x-coordinate.
X0 float64 `json:"x0"`
// Bottom y-coordinate.
Y0 float64 `json:"y0"`
// Right x-coordinate.
X1 float64 `json:"x1"`
// Top y-coordinate.
Y1 float64 `json:"y1"`
}
// ElementMetadata is the metadata attached to a semantic element.
//
// Every field is optional and is left out of the JSON when unset.
type ElementMetadata struct {
// Page number (1-indexed).
PageNumber *uint32 `json:"page_number,omitempty"`
// Source filename or document name.
Filename *string `json:"filename,omitempty"`
// Bounding box coordinates if available.
Coordinates *BoundingBox `json:"coordinates,omitempty"`
// Position index in the element sequence.
ElementIndex *uint `json:"element_index,omitempty"`
// Additional custom metadata as string key/value pairs.
Additional map[string]string `json:"additional,omitempty"`
}
// Element is a semantic element extracted from a document.
//
// Represents a logical unit of content with semantic classification,
// unique identifier, and metadata for tracking origin and position.
type Element struct {
// Unique element identifier.
ElementID string `json:"element_id"`
// Semantic type of this element.
ElementType ElementType `json:"element_type"`
// Text content of the element.
Text string `json:"text"`
// Metadata about the element (origin and position).
Metadata ElementMetadata `json:"metadata"`
}
// ExcelWorkbook is the representation of an Excel workbook.
//
// Contains all sheets from an Excel file (.xlsx, .xls, etc.) with
// extracted content and metadata.
type ExcelWorkbook struct {
// All sheets in the workbook.
Sheets []ExcelSheet `json:"sheets,omitempty"`
// Workbook-level metadata (author, creation date, etc.).
Metadata map[string]string `json:"metadata,omitempty"`
// Collaborative-edit revision headers from `xl/revisions/revisionHeaders.xml`.
//
// Populated for legacy shared-workbook `.xlsx` files that contain the
// `xl/revisions/` directory. Each `<header>` element maps to one
// `DocumentRevision { kind: FormatChange }` carrying the header's `guid`
// (→ `revision_id`), `userName` (→ `author`), and `dateTime` (→ `timestamp`).
// `anchor` and `delta` are unset/empty for v1 (per-cell log parsing is a
// follow-up). Nil when `xl/revisions/revisionHeaders.xml` is absent.
Revisions []DocumentRevision `json:"revisions,omitempty"`
}
// ExcelSheet is a single Excel worksheet.
//
// Represents one sheet from an Excel workbook with its content
// converted to Markdown format and dimensional statistics.
type ExcelSheet struct {
// Sheet name as it appears in Excel.
Name string `json:"name"`
// Sheet content converted to Markdown tables.
Markdown string `json:"markdown"`
// Number of rows.
RowCount uint `json:"row_count"`
// Number of columns.
ColCount uint `json:"col_count"`
// Total number of non-empty cells.
CellCount uint `json:"cell_count"`
// Pre-extracted table cells (rows of cell values).
// Populated during markdown generation to avoid re-parsing markdown.
// Nil for empty sheets.
TableCells [][]string `json:"table_cells,omitempty"`
}
// XMLExtractionResult is the result of extracting an XML document.
//
// Contains extracted text content from XML files along with
// structural statistics about the XML document.
type XMLExtractionResult struct {
// Extracted text content (XML structure filtered out).
Content string `json:"content"`
// Total number of XML elements processed.
ElementCount uint `json:"element_count"`
// List of unique element names found (sorted).
UniqueElements []string `json:"unique_elements,omitempty"`
}
// TextExtractionResult is the result of extracting plain text or Markdown.
//
// Contains the extracted text along with statistics and,
// for Markdown files, structural elements like headers and links.
type TextExtractionResult struct {
// Extracted text content.
Content string `json:"content"`
// Number of lines.
LineCount uint `json:"line_count"`
// Number of words.
WordCount uint `json:"word_count"`
// Number of characters.
CharacterCount uint `json:"character_count"`
// Markdown headers (text only, Markdown files only).
Headers []string `json:"headers,omitempty"`
// Markdown links as (text, URL) tuples (Markdown files only).
// Each tuple is carried as an inner []string — presumably two elements
// in (text, URL) order; confirm against the Rust source.
Links [][]string `json:"links,omitempty"`
// Code blocks as (language, code) tuples (Markdown files only).
// Each tuple is carried as an inner []string — presumably two elements
// in (language, code) order; confirm against the Rust source.
CodeBlocks [][]string `json:"code_blocks,omitempty"`
}
// PptxExtractionResult is the result of extracting a PowerPoint (PPTX) file.
//
// Contains extracted slide content, metadata, and embedded images/tables.
type PptxExtractionResult struct {
// Extracted text content from all slides.
Content string `json:"content"`
// Presentation metadata.
Metadata PptxMetadata `json:"metadata"`
// Total number of slides.
SlideCount uint `json:"slide_count"`
// Total number of embedded images.
ImageCount uint `json:"image_count"`
// Total number of tables.
TableCount uint `json:"table_count"`
// Extracted images from the presentation.
Images []ExtractedImage `json:"images,omitempty"`
// Slide structure with boundaries (when page tracking is enabled).
PageStructure *PageStructure `json:"page_structure,omitempty"`
// Per-slide content (when page tracking is enabled).
PageContents []PageContent `json:"page_contents,omitempty"`
// Structured document representation.
Document *DocumentStructure `json:"document,omitempty"`
// Hyperlinks discovered in slides as (url, optional_label) pairs.
//
// NOTE(review): the Go type is a flat []string, which cannot represent
// (url, optional_label) pairs as described — confirm the actual wire shape
// against the Rust source.
Hyperlinks []string `json:"hyperlinks,omitempty"`
// Office metadata extracted from docProps/core.xml and docProps/app.xml.
//
// Contains keys like "title", "author", "created_by", "subject", "keywords",
// "modified_by", "created_at", "modified_at", etc.
OfficeMetadata map[string]string `json:"office_metadata,omitempty"`
// Slide comments as revisions.
//
// Each `<p:cm>` element in `ppt/comments/comment{N}.xml` becomes a
// `DocumentRevision { kind: Comment }` with author (resolved from
// `ppt/commentAuthors.xml`), ISO-8601 timestamp, and
// `RevisionAnchor::Slide { index }`. Nil when no comment XML parts exist.
Revisions []DocumentRevision `json:"revisions,omitempty"`
}
// EmailExtractionResult is the result of extracting an email message.
//
// Complete representation of an extracted email message (.eml or .msg)
// including headers, body content, and attachments.
type EmailExtractionResult struct {
// Email subject line.
Subject *string `json:"subject,omitempty"`
// Sender email address.
FromEmail *string `json:"from_email,omitempty"`
// Primary recipient email addresses.
ToEmails []string `json:"to_emails,omitempty"`
// CC recipient email addresses.
CcEmails []string `json:"cc_emails,omitempty"`
// BCC recipient email addresses.
BccEmails []string `json:"bcc_emails,omitempty"`
// Email date/timestamp.
Date *string `json:"date,omitempty"`
// Message-ID header value.
MessageID *string `json:"message_id,omitempty"`
// Plain text version of the email body.
PlainText *string `json:"plain_text,omitempty"`
// HTML version of the email body.
HTMLContent *string `json:"html_content,omitempty"`
// Cleaned/processed text content. Aliased as `cleaned_text` for back-compat.
//
// NOTE(review): this Go tag only names `content`; the `cleaned_text` alias
// is presumably handled on the Rust side — confirm.
Content string `json:"content"`
// List of email attachments.
Attachments []EmailAttachment `json:"attachments,omitempty"`
// Additional email headers and metadata.
Metadata map[string]string `json:"metadata,omitempty"`
}
// EmailAttachment is a single email attachment.
//
// Carries the attachment's metadata and, when it was extracted, its content.
// A custom MarshalJSON writes Data as a JSON array of integers instead of
// Go's default base64 string.
type EmailAttachment struct {
	// Attachment name (from Content-Disposition header).
	Name *string `json:"name,omitempty"`
	// Filename of the attachment.
	Filename *string `json:"filename,omitempty"`
	// MIME type of the attachment.
	MimeType *string `json:"mime_type,omitempty"`
	// Size in bytes.
	Size *uint `json:"size,omitempty"`
	// Whether this attachment is an image.
	IsImage bool `json:"is_image"`
	// Attachment data (if extracted).
	// The Rust core holds this as `bytes::Bytes` for cheap cloning of large
	// buffers; in Go it is a plain []byte.
	Data []byte `json:"data,omitempty"`
}

// MarshalJSON implements json.Marshaler for EmailAttachment.
//
// encoding/json would render the `[]byte` Data field as a base64 string, but
// Rust's serde `Vec<u8>` deserializer expects a JSON array of integers, so
// Data is widened to `[]int` before encoding. Every other field is passed
// through unchanged. Nil or empty Data is omitted from the output.
func (v EmailAttachment) MarshalJSON() ([]byte, error) {
	// wire lists every field explicitly instead of embedding EmailAttachment:
	// embedding would yield both a base64-string and an int-array entry for
	// the same "data" key. Being a distinct method-less type, it is encoded
	// with the default struct rules and does not re-enter this method.
	type wire struct {
		Name     *string `json:"name,omitempty"`
		Filename *string `json:"filename,omitempty"`
		MimeType *string `json:"mime_type,omitempty"`
		Size     *uint   `json:"size,omitempty"`
		IsImage  bool    `json:"is_image"`
		Data     []int   `json:"data,omitempty"`
	}

	out := wire{
		Name:     v.Name,
		Filename: v.Filename,
		MimeType: v.MimeType,
		Size:     v.Size,
		IsImage:  v.IsImage,
	}
	if v.Data == nil {
		return json.Marshal(out)
	}

	out.Data = make([]int, 0, len(v.Data))
	for _, b := range v.Data {
		out.Data = append(out.Data, int(b))
	}
	return json.Marshal(out)
}
// OcrExtractionResult is the result of an OCR extraction.
//
// Result of performing OCR on an image or scanned document,
// including recognized text and detected tables.
type OcrExtractionResult struct {
// Recognized text content.
Content string `json:"content"`
// Original MIME type of the processed image.
MimeType string `json:"mime_type"`
// OCR processing metadata (confidence scores, language, etc.).
// Values are kept as raw JSON and are not decoded by this type.
Metadata map[string]json.RawMessage `json:"metadata,omitempty"`
// Tables detected and extracted via OCR.
Tables []OcrTable `json:"tables,omitempty"`
// Structured OCR elements with bounding boxes and confidence scores.
// Available when TSV output is requested or table detection is enabled.
OcrElements []OcrElement `json:"ocr_elements,omitempty"`
// Structured document produced from hOCR parsing.
// Carries paragraph structure, bounding boxes, and confidence scores
// that the flattened `content` string discards.
//
// NOTE(review): typed as *string here although the description is of a
// structured document — presumably a serialized form; confirm against the
// Rust source.
InternalDocument *string `json:"internal_document,omitempty"`
}
// OcrTable is a table detected via OCR.
//
// Represents a table structure recognized during OCR processing.
type OcrTable struct {
// Table cells as rows of column values (rows × columns).
Cells [][]string `json:"cells,omitempty"`
// Markdown representation of the table.
Markdown string `json:"markdown"`
// Page number where the table was found (1-indexed).
PageNumber uint32 `json:"page_number"`
// Bounding box of the table in pixel coordinates (from OCR word positions).
BoundingBox *OcrTableBoundingBox `json:"bounding_box,omitempty"`
}
// OcrTableBoundingBox is the bounding box of an OCR-detected table, in pixel
// coordinates.
type OcrTableBoundingBox struct {
// Left x-coordinate (pixels).
Left uint32 `json:"left"`
// Top y-coordinate (pixels).
Top uint32 `json:"top"`
// Right x-coordinate (pixels).
Right uint32 `json:"right"`
// Bottom y-coordinate (pixels).
Bottom uint32 `json:"bottom"`
}
// ImagePreprocessingConfig configures image preprocessing for OCR.
//
// These settings control how images are preprocessed before OCR to improve
// text recognition quality. Different preprocessing strategies work better
// for different document types.
//
// Pointer fields are omitted from the JSON when nil; the plain bool fields
// are always written, so their Go zero value (false) is sent explicitly.
type ImagePreprocessingConfig struct {
// Target DPI for the image (300 is standard, 600 for small text).
TargetDpi *int32 `json:"target_dpi,omitempty"`
// Auto-detect and correct image rotation.
AutoRotate *bool `json:"auto_rotate,omitempty"`
// Correct skew (tilted images).
Deskew *bool `json:"deskew,omitempty"`
// Remove noise from the image.
Denoise bool `json:"denoise"`
// Enhance contrast for better text visibility.
ContrastEnhance bool `json:"contrast_enhance"`
// Binarization method: "otsu", "sauvola", "adaptive".
BinarizationMethod *string `json:"binarization_method,omitempty"`
// Invert colors (white text on black → black on white).
InvertColors bool `json:"invert_colors"`
}
// TesseractConfig configures the Tesseract OCR engine.
//
// Provides fine-grained control over Tesseract OCR engine parameters.
// Most users can use the defaults, but these settings allow optimization
// for specific document types (invoices, handwriting, etc.).
//
// Pointer fields are omitted from the JSON when nil. Non-pointer fields
// (MinConfidence, TableMinConfidence, LanguageModelNgramOn, the character
// white/blacklists, ThresholdingMethod) are always written, so their Go zero
// values are sent explicitly.
type TesseractConfig struct {
// Language code (e.g., "eng", "deu", "fra").
Language *string `json:"language,omitempty"`
// Page Segmentation Mode (0-13).
//
// Common values:
// - 3: Fully automatic page segmentation (native default)
// - 6: Assume a single uniform block of text (WASM default — avoids layout-analysis hang)
// - 11: Sparse text with no particular order
Psm *int32 `json:"psm,omitempty"`
// Output format ("text" or "markdown").
OutputFormat *string `json:"output_format,omitempty"`
// OCR Engine Mode (0-3).
//
// - 0: Legacy engine only
// - 1: Neural nets (LSTM) only (usually best)
// - 2: Legacy + LSTM
// - 3: Default (based on what's available)
Oem *int32 `json:"oem,omitempty"`
// Minimum confidence threshold (0.0-100.0).
//
// Words with confidence below this threshold may be rejected or flagged.
MinConfidence float64 `json:"min_confidence"`
// Image preprocessing configuration.
//
// Controls how images are preprocessed before OCR. Can significantly
// improve quality for scanned documents or low-quality images.
Preprocessing *ImagePreprocessingConfig `json:"preprocessing,omitempty"`
// Enable automatic table detection and reconstruction.
EnableTableDetection *bool `json:"enable_table_detection,omitempty"`
// Minimum confidence threshold for table detection (0.0-1.0).
TableMinConfidence float64 `json:"table_min_confidence"`
// Column threshold for table detection (pixels).
TableColumnThreshold *int32 `json:"table_column_threshold,omitempty"`
// Row threshold ratio for table detection (0.0-1.0).
TableRowThresholdRatio *float64 `json:"table_row_threshold_ratio,omitempty"`
// Enable OCR result caching.
UseCache *bool `json:"use_cache,omitempty"`
// Use pre-adapted templates for character classification.
ClassifyUsePreAdaptedTemplates *bool `json:"classify_use_pre_adapted_templates,omitempty"`
// Enable N-gram language model.
LanguageModelNgramOn bool `json:"language_model_ngram_on"`
// Don't reject good words during block-level processing.
TesseditDontBlkrejGoodWds *bool `json:"tessedit_dont_blkrej_good_wds,omitempty"`
// Don't reject good words during row-level processing.
TesseditDontRowrejGoodWds *bool `json:"tessedit_dont_rowrej_good_wds,omitempty"`
// Enable dictionary correction.
TesseditEnableDictCorrection *bool `json:"tessedit_enable_dict_correction,omitempty"`
// Whitelist of allowed characters (empty = all allowed).
TesseditCharWhitelist string `json:"tessedit_char_whitelist"`
// Blacklist of forbidden characters (empty = none forbidden).
TesseditCharBlacklist string `json:"tessedit_char_blacklist"`
// Use primary language params model.
TesseditUsePrimaryParamsModel *bool `json:"tessedit_use_primary_params_model,omitempty"`
// Variable-width space detection.
TextordSpaceSizeIsVariable *bool `json:"textord_space_size_is_variable,omitempty"`
// Use adaptive thresholding method.
ThresholdingMethod bool `json:"thresholding_method"`
}
// ImagePreprocessingMetadata records how an image was preprocessed for OCR.
//
// Tracks the transformations applied to an image during OCR preprocessing,
// including DPI normalization, resizing, and resampling.
type ImagePreprocessingMetadata struct {
// Original image dimensions (width, height) in pixels.
// Carried as a slice — presumably two elements in (width, height) order;
// confirm against the Rust source.
OriginalDimensions []uint `json:"original_dimensions,omitempty"`
// Original image DPI (horizontal, vertical).
// Carried as a slice — presumably two elements in (horizontal, vertical)
// order; confirm against the Rust source.
OriginalDpi []float64 `json:"original_dpi,omitempty"`
// Target DPI from configuration.
TargetDpi int32 `json:"target_dpi"`
// Scaling factor applied to the image.
ScaleFactor float64 `json:"scale_factor"`
// Whether DPI was auto-adjusted based on content.
AutoAdjusted bool `json:"auto_adjusted"`
// Final DPI after processing.
FinalDpi int32 `json:"final_dpi"`
// New dimensions after resizing (if resized).
NewDimensions []uint `json:"new_dimensions,omitempty"`
// Resampling algorithm used ("LANCZOS3", "CATMULLROM", etc.).
ResampleMethod string `json:"resample_method"`
// Whether dimensions were clamped to max_image_dimension.
DimensionClamped bool `json:"dimension_clamped"`
// Calculated optimal DPI (if auto_adjust_dpi enabled).
CalculatedDpi *int32 `json:"calculated_dpi,omitempty"`
// Whether resize was skipped (dimensions already optimal).
SkippedResize bool `json:"skipped_resize"`
// Error message if resize failed.
ResizeError *string `json:"resize_error,omitempty"`
}
// Metadata is the metadata attached to an extraction result.
//
// Contains common fields applicable to all formats, format-specific metadata
// via a discriminated union, and additional custom fields from postprocessors.
type Metadata struct {
// Document title.
Title *string `json:"title,omitempty"`
// Document subject or description.
Subject *string `json:"subject,omitempty"`
// Primary author(s) - always a list for consistency.
Authors []string `json:"authors,omitempty"`
// Keywords/tags - always a list for consistency.
Keywords []string `json:"keywords,omitempty"`
// Primary language (ISO 639 code).
Language *string `json:"language,omitempty"`
// Creation timestamp (ISO 8601 format).
CreatedAt *string `json:"created_at,omitempty"`
// Last modification timestamp (ISO 8601 format).
ModifiedAt *string `json:"modified_at,omitempty"`
// User who created the document.
CreatedBy *string `json:"created_by,omitempty"`
// User who last modified the document.
ModifiedBy *string `json:"modified_by,omitempty"`
// Page/slide/sheet structure with boundaries.
Pages *PageStructure `json:"pages,omitempty"`
// Format-specific metadata (discriminated union).
//
// Contains detailed metadata specific to the document format.
// Serialized as a nested `"format"` object with a `format_type` discriminator field.
Format *FormatMetadata `json:"format,omitempty"`
// Image preprocessing metadata (when OCR preprocessing was applied).
ImagePreprocessing *ImagePreprocessingMetadata `json:"image_preprocessing,omitempty"`
// JSON schema (for structured data extraction), kept as raw JSON.
JSONSchema *json.RawMessage `json:"json_schema,omitempty"`
// Error metadata (for batch operations).
Error *ErrorMetadata `json:"error,omitempty"`
// Extraction duration in milliseconds (for benchmarking).
//
// This field is populated by batch extraction to provide per-file timing
// information. It's nil for single-file extraction (which uses external timing).
ExtractionDurationMs *uint64 `json:"extraction_duration_ms,omitempty"`
// Document category (from frontmatter or classification).
Category *string `json:"category,omitempty"`
// Document tags (from frontmatter).
Tags []string `json:"tags,omitempty"`
// Document version string (from frontmatter).
DocumentVersion *string `json:"document_version,omitempty"`
// Abstract or summary text (from frontmatter).
AbstractText *string `json:"abstract_text,omitempty"`
// Output format identifier (e.g., "markdown", "html", "text").
//
// Set by the output format pipeline stage when format conversion is applied.
// Previously stored in `metadata.additional["output_format"]`.
OutputFormat *string `json:"output_format,omitempty"`
// Whether OCR was used during extraction.
//
// Set to `true` whenever the extraction pipeline ran an OCR backend
// (Tesseract, PaddleOCR, VLM, etc.) and used that output as the primary
// or fallback text. `false` means native text extraction was used exclusively.
OcrUsed bool `json:"ocr_used"`
// Additional custom fields from postprocessors.
//
// Serialized as a nested `"additional"` object (not flattened at root level).
// Values are kept as raw JSON. (The Rust core uses `Cow<'static, str>` keys
// so static string keys avoid allocation; in Go the keys are plain strings.)
Additional map[string]json.RawMessage `json:"additional,omitempty"`
}
// ExcelMetadata is the Excel/spreadsheet format metadata.
//
// Identifies the document as a spreadsheet source via the `FormatMetadata::Excel`
// discriminant. Sheet count and sheet names are stored inside this struct.
type ExcelMetadata struct {
// Number of sheets in the workbook.
SheetCount *uint32 `json:"sheet_count,omitempty"`
// Names of all sheets in the workbook.
SheetNames []string `json:"sheet_names,omitempty"`
}
// EmailMetadata is the metadata extracted from .eml and .msg files.
//
// Includes sender/recipient information, message ID, and attachment list.
type EmailMetadata struct {
// Sender's email address.
FromEmail *string `json:"from_email,omitempty"`
// Sender's display name.
FromName *string `json:"from_name,omitempty"`
// Primary recipients.
ToEmails []string `json:"to_emails,omitempty"`
// CC recipients.
CcEmails []string `json:"cc_emails,omitempty"`
// BCC recipients.
BccEmails []string `json:"bcc_emails,omitempty"`
// Message-ID header value.
MessageID *string `json:"message_id,omitempty"`
// List of attachment filenames.
Attachments []string `json:"attachments,omitempty"`
}
// ArchiveMetadata is the metadata of an archive (ZIP/TAR/7Z).
//
// Extracted from compressed archive files containing file lists and size information.
type ArchiveMetadata struct {
// Archive format ("ZIP", "TAR", "7Z", etc.).
Format string `json:"format"`
// Total number of files in the archive.
FileCount uint32 `json:"file_count"`
// List of file paths within the archive.
FileList []string `json:"file_list,omitempty"`
// Total uncompressed size in bytes.
TotalSize uint64 `json:"total_size"`
// Compressed size in bytes (nil if not available).
CompressedSize *uint64 `json:"compressed_size,omitempty"`
}
// ImageMetadata is the metadata extracted from image files.
//
// Includes dimensions, format, and EXIF data.
type ImageMetadata struct {
// Image width in pixels.
Width uint32 `json:"width"`
// Image height in pixels.
Height uint32 `json:"height"`
// Image format (e.g., "PNG", "JPEG", "TIFF").
Format string `json:"format"`
// EXIF metadata tags as string key/value pairs.
Exif map[string]string `json:"exif,omitempty"`
}
// XMLMetadata is the metadata extracted during XML parsing.
//
// Provides statistics about XML document structure.
type XMLMetadata struct {
// Total number of XML elements processed.
ElementCount uint32 `json:"element_count"`
// List of unique element tag names (sorted).
UniqueElements []string `json:"unique_elements,omitempty"`
}
// TextMetadata is the metadata for plain text and Markdown files.
//
// Extracted from plain text and Markdown files. Includes word counts and,
// for Markdown, structural elements like headers and links.
type TextMetadata struct {
// Number of lines in the document.
LineCount uint32 `json:"line_count"`
// Number of words.
WordCount uint32 `json:"word_count"`
// Number of characters.
CharacterCount uint32 `json:"character_count"`
// Markdown headers (headings text only, for Markdown files).
Headers []string `json:"headers,omitempty"`
// Markdown links as (text, url) tuples (for Markdown files).
// Each tuple is carried as an inner []string — presumably two elements
// in (text, url) order; confirm against the Rust source.
Links [][]string `json:"links,omitempty"`
// Code blocks as (language, code) tuples (for Markdown files).
// Each tuple is carried as an inner []string — presumably two elements
// in (language, code) order; confirm against the Rust source.
CodeBlocks [][]string `json:"code_blocks,omitempty"`
}
// HeaderMetadata is the metadata of a header/heading element.
type HeaderMetadata struct {
// Header level: 1 (h1) through 6 (h6).
Level uint8 `json:"level"`
// Normalized text content of the header.
Text string `json:"text"`
// HTML id attribute if present.
ID *string `json:"id,omitempty"`
// Document tree depth at the header element.
Depth uint32 `json:"depth"`
// Byte offset in original HTML document.
HTMLOffset uint32 `json:"html_offset"`
}
// LinkMetadata is the metadata of a link element.
type LinkMetadata struct {
// The href URL value.
Href string `json:"href"`
// Link text content (normalized).
Text string `json:"text"`
// Optional title attribute.
Title *string `json:"title,omitempty"`
// Link type classification.
LinkType LinkType `json:"link_type"`
// Rel attribute values.
Rel []string `json:"rel,omitempty"`
// Additional attributes as key-value pairs.
// Each pair is carried as an inner []string — presumably [key, value];
// confirm against the Rust source.
Attributes [][]string `json:"attributes,omitempty"`
}
// ImageMetadataType is the metadata of an image element.
type ImageMetadataType struct {
// Image source (URL, data URI, or SVG content).
Src string `json:"src"`
// Alternative text from alt attribute.
Alt *string `json:"alt,omitempty"`
// Title attribute.
Title *string `json:"title,omitempty"`
// Image dimensions as (width, height) if available.
// Carried as a slice — presumably two elements in (width, height) order;
// confirm against the Rust source.
Dimensions []uint32 `json:"dimensions,omitempty"`
// Image type classification.
ImageType ImageType `json:"image_type"`
// Additional attributes as key-value pairs.
// Each pair is carried as an inner []string — presumably [key, value];
// confirm against the Rust source.
Attributes [][]string `json:"attributes,omitempty"`
}
// StructuredData is a structured data block (Schema.org, microdata, RDFa).
type StructuredData struct {
// Type of structured data.
DataType StructuredDataType `json:"data_type"`
// Raw JSON string representation (held as a string, not decoded).
RawJSON string `json:"raw_json"`
// Schema type if detectable (e.g., "Article", "Event", "Product").
SchemaType *string `json:"schema_type,omitempty"`
}
// HTMLMetadata is the metadata extracted from HTML documents.
//
// Includes document-level metadata, Open Graph data, Twitter Card metadata,
// and extracted structural elements (headers, links, images, structured data).
type HTMLMetadata struct {
// Document title from `<title>` tag.
Title *string `json:"title,omitempty"`
// Document description from `<meta name="description">` tag.
Description *string `json:"description,omitempty"`
// Document keywords from `<meta name="keywords">` tag, split on commas.
Keywords []string `json:"keywords,omitempty"`
// Document author from `<meta name="author">` tag.
Author *string `json:"author,omitempty"`
// Canonical URL from `<link rel="canonical">` tag.
CanonicalURL *string `json:"canonical_url,omitempty"`
// Base URL from `<base href="">` tag for resolving relative URLs.
BaseHref *string `json:"base_href,omitempty"`
// Document language from `lang` attribute.
Language *string `json:"language,omitempty"`
// Document text direction from `dir` attribute.
TextDirection *TextDirection `json:"text_direction,omitempty"`
// Open Graph metadata (og:* properties) for social media.
// Keys like "title", "description", "image", "url", etc.
OpenGraph map[string]string `json:"open_graph,omitempty"`
// Twitter Card metadata (twitter:* properties).
// Keys like "card", "site", "creator", "title", "description", "image", etc.
TwitterCard map[string]string `json:"twitter_card,omitempty"`
// Additional meta tags not covered by specific fields.
// Keys are meta name/property attributes, values are content.
MetaTags map[string]string `json:"meta_tags,omitempty"`
// Extracted header elements with hierarchy.
Headers []HeaderMetadata `json:"headers,omitempty"`
// Extracted hyperlinks with type classification.
Links []LinkMetadata `json:"links,omitempty"`
// Extracted images with source and dimensions.
Images []ImageMetadataType `json:"images,omitempty"`
// Extracted structured data blocks.
StructuredData []StructuredData `json:"structured_data,omitempty"`
}
// OcrMetadata is the OCR processing metadata.
//
// Captures information about OCR processing configuration and results.
type OcrMetadata struct {
// OCR language code(s) used.
Language string `json:"language"`
// Tesseract Page Segmentation Mode (PSM).
Psm int32 `json:"psm"`
// Output format (e.g., "text", "hocr").
OutputFormat string `json:"output_format"`
// Number of tables detected.
TableCount uint32 `json:"table_count"`
// Optional table row count — presumably for the detected table(s);
// undocumented upstream, confirm against the Rust source.
TableRows *uint32 `json:"table_rows,omitempty"`
// Optional table column count — presumably for the detected table(s);
// undocumented upstream, confirm against the Rust source.
TableCols *uint32 `json:"table_cols,omitempty"`
}
// ErrorMetadata is the error metadata reported for batch operations.
//
// NOTE(review): the fields are undocumented upstream; the descriptions below
// are inferred from the field names — confirm against the Rust source.
type ErrorMetadata struct {
// Presumably an identifier for the kind of error that occurred.
ErrorType string `json:"error_type"`
// Presumably the human-readable error message.
Message string `json:"message"`
}
// PptxMetadata is the metadata of a PowerPoint presentation.
//
// Extracted from PPTX files containing slide counts and presentation details.
type PptxMetadata struct {
// Total number of slides in the presentation.
SlideCount uint32 `json:"slide_count"`
// Names of slides (if available).
SlideNames []string `json:"slide_names,omitempty"`
// Number of embedded images (nil when not reported).
ImageCount *uint32 `json:"image_count,omitempty"`
// Number of tables (nil when not reported).
TableCount *uint32 `json:"table_count,omitempty"`
}
// DocxMetadata is the metadata of a Word document.
//
// Extracted from DOCX files using shared Office Open XML metadata extraction.
// Integrates with `office_metadata` module for core/app/custom properties.
type DocxMetadata struct {
// Core properties from docProps/core.xml (Dublin Core metadata).
//
// Contains title, creator, subject, keywords, dates, etc.
// Shared format across DOCX/PPTX/XLSX documents.
CoreProperties *CoreProperties `json:"core_properties,omitempty"`
// Application properties from docProps/app.xml (Word-specific statistics).
//
// Contains word count, page count, paragraph count, editing time, etc.
// DOCX-specific variant of Office application properties.
AppProperties *DocxAppProperties `json:"app_properties,omitempty"`
// Custom properties from docProps/custom.xml (user-defined properties).
//
// Contains key-value pairs defined by users or applications.
// Values can be strings, numbers, booleans, or dates, and are kept as raw
// JSON so the caller decides how to decode each one.
CustomProperties map[string]json.RawMessage `json:"custom_properties,omitempty"`
}
// CsvMetadata holds CSV/TSV file metadata.
//
// NOTE(review): the per-field descriptions below are inferred from the field
// names and JSON keys; confirm exact semantics against the upstream library.
type CsvMetadata struct {
// Presumably the number of rows in the file.
RowCount uint32 `json:"row_count"`
// Presumably the number of columns in the file.
ColumnCount uint32 `json:"column_count"`
// Optional delimiter; omitted from JSON when nil.
Delimiter *string `json:"delimiter,omitempty"`
// Presumably whether the first row is a header row. Always serialized (no omitempty).
HasHeader bool `json:"has_header"`
// Presumably one type name per column; omitted from JSON when empty.
ColumnTypes []string `json:"column_types,omitempty"`
}
// BibtexMetadata holds BibTeX bibliography metadata.
type BibtexMetadata struct {
// Number of entries in the bibliography.
EntryCount uint `json:"entry_count"`
// Presumably the citation keys of the entries; omitted from JSON when empty.
CitationKeys []string `json:"citation_keys,omitempty"`
// Presumably the author names found in the bibliography; omitted from JSON when empty.
Authors []string `json:"authors,omitempty"`
// Optional year range summary; omitted from JSON when nil.
YearRange *YearRange `json:"year_range,omitempty"`
// Map keyed by string with unsigned counts as values.
// NOTE(review): presumably entry type name -> number of entries — confirm upstream.
EntryTypes map[string]uint `json:"entry_types,omitempty"`
}
// CitationMetadata holds citation file metadata (RIS, PubMed, EndNote).
//
// NOTE(review): the per-field descriptions below are inferred from the field
// names and JSON keys; confirm exact semantics against the upstream library.
type CitationMetadata struct {
// Presumably the number of citations in the file.
CitationCount uint `json:"citation_count"`
// Optional format identifier; omitted from JSON when nil.
Format *string `json:"format,omitempty"`
// Presumably the author names found in the file; omitted from JSON when empty.
Authors []string `json:"authors,omitempty"`
// Optional year range summary; omitted from JSON when nil.
YearRange *YearRange `json:"year_range,omitempty"`
// Presumably the DOIs found in the file; omitted from JSON when empty.
Dois []string `json:"dois,omitempty"`
// Presumably the keywords found in the file; omitted from JSON when empty.
Keywords []string `json:"keywords,omitempty"`
}
// YearRange describes a year range for bibliographic metadata.
type YearRange struct {
// Optional lower bound; omitted from JSON when nil.
Min *uint32 `json:"min,omitempty"`
// Optional upper bound; omitted from JSON when nil.
Max *uint32 `json:"max,omitempty"`
// Individual years; omitted from JSON when empty.
// NOTE(review): ordering and de-duplication are not visible here — confirm upstream.
Years []uint32 `json:"years,omitempty"`
}
// FictionBookMetadata holds FictionBook (FB2) metadata.
//
// All fields are optional and are omitted from the JSON output when empty/nil.
type FictionBookMetadata struct {
// Encoded under the JSON key "genres".
Genres []string `json:"genres,omitempty"`
// Encoded under the JSON key "sequences".
// NOTE(review): presumably the book series ("sequence") names — confirm upstream.
Sequences []string `json:"sequences,omitempty"`
// Encoded under the JSON key "annotation".
Annotation *string `json:"annotation,omitempty"`
}
// DbfMetadata holds dBASE (DBF) file metadata.
type DbfMetadata struct {
// Presumably the number of records in the file.
RecordCount uint `json:"record_count"`
// Presumably the number of fields (columns) per record.
FieldCount uint `json:"field_count"`
// Per-field descriptors (see DbfFieldInfo); omitted from JSON when empty.
Fields []DbfFieldInfo `json:"fields,omitempty"`
}
// DbfFieldInfo holds dBASE field information.
type DbfFieldInfo struct {
// Encoded under the JSON key "name".
Name string `json:"name"`
// Encoded under the JSON key "field_type".
// NOTE(review): the set of possible type strings is not visible here — confirm upstream.
FieldType string `json:"field_type"`
}
// JatsMetadata holds JATS (Journal Article Tag Suite) metadata.
//
// All fields are optional and are omitted from the JSON output when empty/nil.
type JatsMetadata struct {
// Encoded under the JSON key "copyright".
Copyright *string `json:"copyright,omitempty"`
// Encoded under the JSON key "license".
License *string `json:"license,omitempty"`
// String-to-string map encoded under the JSON key "history_dates".
// NOTE(review): presumably date type -> date value — confirm upstream.
HistoryDates map[string]string `json:"history_dates,omitempty"`
// Contributors with their roles (see ContributorRole).
ContributorRoles []ContributorRole `json:"contributor_roles,omitempty"`
}
// ContributorRole is a JATS contributor with role.
type ContributorRole struct {
// Encoded under the JSON key "name"; always serialized.
Name string `json:"name"`
// Optional role; omitted from JSON when nil.
Role *string `json:"role,omitempty"`
}
// EpubMetadata holds EPUB metadata (Dublin Core extensions).
//
// Every field is an optional string that is omitted from the JSON output
// when nil. Each field is encoded under the snake_case JSON key in its tag.
type EpubMetadata struct {
Coverage *string `json:"coverage,omitempty"`
DcFormat *string `json:"dc_format,omitempty"`
Relation *string `json:"relation,omitempty"`
Source *string `json:"source,omitempty"`
DcType *string `json:"dc_type,omitempty"`
// NOTE(review): whether this is a path, identifier, or encoded image is not visible here — confirm upstream.
CoverImage *string `json:"cover_image,omitempty"`
}
// PstMetadata holds Outlook PST archive metadata.
type PstMetadata struct {
// Encoded under the JSON key "message_count"; presumably the number of messages in the archive.
MessageCount uint `json:"message_count"`
}
// OcrConfidence holds confidence scores for an OCR element.
//
// Separates detection confidence (how confident that text exists at this location)
// from recognition confidence (how confident about the actual text content).
type OcrConfidence struct {
// Detection confidence: how confident the OCR engine is that text exists here.
//
// PaddleOCR provides this as `box_score`, Tesseract doesn't have a direct equivalent.
// Range: 0.0 to 1.0 (nil if not available; nil is omitted from JSON).
Detection *float64 `json:"detection,omitempty"`
// Recognition confidence: how confident about the text content.
//
// Range: 0.0 to 1.0.
Recognition float64 `json:"recognition"`
}
// OcrRotation holds rotation information for an OCR element.
type OcrRotation struct {
// Rotation angle in degrees (0, 90, 180, 270 for PaddleOCR).
AngleDegrees float64 `json:"angle_degrees"`
// Confidence score for the rotation detection (optional; omitted from JSON when nil).
Confidence *float64 `json:"confidence,omitempty"`
}
// OcrElement is a unified OCR element representing detected text with full metadata.
//
// This is the primary type for structured OCR output, preserving all information
// from both Tesseract and PaddleOCR backends.
//
// Decoding goes through the type's custom UnmarshalJSON, which resolves the
// Geometry field via UnmarshalOcrBoundingGeometry.
type OcrElement struct {
// The recognized text content.
Text string `json:"text"`
// Bounding geometry (rectangle or quadrilateral).
Geometry OcrBoundingGeometry `json:"geometry"`
// Confidence scores for detection and recognition.
Confidence OcrConfidence `json:"confidence"`
// Hierarchical level (word, line, block, page).
Level OcrElementLevel `json:"level,omitempty"`
// Rotation information (if detected); omitted from JSON when nil.
Rotation *OcrRotation `json:"rotation,omitempty"`
// Page number (1-indexed).
PageNumber uint32 `json:"page_number"`
// Parent element ID for hierarchical relationships.
//
// Only used for Tesseract output which has word -> line -> block hierarchy.
ParentID *string `json:"parent_id,omitempty"`
// Backend-specific metadata that doesn't fit the unified schema.
// Values are kept as undecoded JSON (json.RawMessage).
BackendMetadata map[string]json.RawMessage `json:"backend_metadata,omitempty"`
}
// UnmarshalJSON decodes an OcrElement from its JSON representation.
//
// Every field except Geometry is decoded directly. Geometry is first captured
// as raw JSON and then handed to UnmarshalOcrBoundingGeometry. When the
// "geometry" key is absent or null, s.Geometry is left as it was.
//
// Returns the first decoding error encountered; note that the plain fields
// have already been copied into s if the geometry step fails.
func (s *OcrElement) UnmarshalJSON(data []byte) error {
	// Mirror of OcrElement with Geometry deferred as raw bytes.
	var wire struct {
		Text            string                     `json:"text"`
		Geometry        json.RawMessage            `json:"geometry,omitempty"`
		Confidence      OcrConfidence              `json:"confidence"`
		Level           OcrElementLevel            `json:"level,omitempty"`
		Rotation        *OcrRotation               `json:"rotation,omitempty"`
		PageNumber      uint32                     `json:"page_number"`
		ParentID        *string                    `json:"parent_id,omitempty"`
		BackendMetadata map[string]json.RawMessage `json:"backend_metadata,omitempty"`
	}
	if err := json.Unmarshal(data, &wire); err != nil {
		return err
	}

	s.Text, s.Confidence, s.Level = wire.Text, wire.Confidence, wire.Level
	s.Rotation, s.PageNumber, s.ParentID = wire.Rotation, wire.PageNumber, wire.ParentID
	s.BackendMetadata = wire.BackendMetadata

	// Nothing to resolve when the geometry is missing or an explicit null.
	if len(wire.Geometry) == 0 || string(wire.Geometry) == "null" {
		return nil
	}
	geometry, err := UnmarshalOcrBoundingGeometry(wire.Geometry)
	if err != nil {
		return err
	}
	s.Geometry = geometry
	return nil
}
// OcrElementConfig is the configuration for OCR element extraction.
//
// Controls how OCR elements are extracted and filtered.
type OcrElementConfig struct {
// Whether to include OCR elements in the extraction result.
//
// When true, the `ocr_elements` field in `ExtractionResult` will be populated.
IncludeElements bool `json:"include_elements"`
// Minimum hierarchical level to include.
//
// Elements below this level (e.g., words when min_level is Line) will be excluded.
MinLevel OcrElementLevel `json:"min_level,omitempty"`
// Minimum recognition confidence threshold (0.0-1.0).
//
// Elements with confidence below this threshold will be filtered out.
// Always serialized (no omitempty), so the Go zero value is sent as 0.
MinConfidence float64 `json:"min_confidence"`
// Whether to build hierarchical relationships between elements.
//
// When true, `parent_id` fields will be populated based on spatial containment.
// Only meaningful for Tesseract output.
BuildHierarchy bool `json:"build_hierarchy"`
}
// PageStructure is the unified page structure for documents.
//
// Supports different page types (PDF pages, PPTX slides, Excel sheets)
// with offset boundaries for chunk-to-page mapping.
type PageStructure struct {
// Total number of pages/slides/sheets
TotalCount uint32 `json:"total_count"`
// Type of paginated unit
UnitType PageUnitType `json:"unit_type"`
// Offset boundaries for each page
//
// Maps ranges in the extracted content to page numbers.
// Used for chunk page range calculation.
// Note: each PageBoundary carries byte offsets (ByteStart/ByteEnd),
// not character counts.
Boundaries []PageBoundary `json:"boundaries,omitempty"`
// Detailed per-page metadata (optional, only when needed)
Pages []PageInfo `json:"pages,omitempty"`
}
// PageBoundary is the byte offset boundary for a page.
//
// Tracks where a specific page's content starts and ends in the main content string,
// enabling mapping from byte positions to page numbers. Per the upstream (Rust)
// documentation, offsets are guaranteed to be at valid UTF-8 character boundaries
// when using standard String methods (push_str, push, etc.).
type PageBoundary struct {
// Byte offset where this page starts in the content string (UTF-8 valid boundary, inclusive)
ByteStart uint `json:"byte_start"`
// Byte offset where this page ends in the content string (UTF-8 valid boundary, exclusive)
ByteEnd uint `json:"byte_end"`
// Page number (1-indexed)
PageNumber uint32 `json:"page_number"`
}
// PageInfo holds metadata for an individual page/slide/sheet.
//
// Captures per-page information including dimensions, content counts,
// and visibility state (for presentations).
type PageInfo struct {
// Page number (1-indexed)
Number uint32 `json:"number"`
// Page title (usually for presentations); omitted from JSON when nil
Title *string `json:"title,omitempty"`
// Dimensions in points (PDF) or pixels (images): (width, height).
// Represented in Go as a slice; omitted from JSON when empty.
Dimensions []float64 `json:"dimensions,omitempty"`
// Number of images on this page (optional; omitted from JSON when nil)
ImageCount *uint32 `json:"image_count,omitempty"`
// Number of tables on this page (optional; omitted from JSON when nil)
TableCount *uint32 `json:"table_count,omitempty"`
// Whether this page is hidden (e.g., in presentations)
Hidden *bool `json:"hidden,omitempty"`
// Whether this page is blank (no meaningful text, no images, no tables)
//
// A page is considered blank if it has fewer than 3 non-whitespace characters
// and contains no tables or images. This is useful for filtering out empty pages
// in scanned documents or PDFs with blank separator pages.
IsBlank *bool `json:"is_blank,omitempty"`
// Whether this page contains non-trivial vector graphics (paths, shapes, curves)
//
// Indicates the presence of vector-drawn content such as charts, diagrams,
// or geometric shapes (e.g., from Adobe InDesign, LaTeX TikZ). These are
// invisible to `ExtractionResult.images` since they are not embedded as raster
// XObjects. Set to `true` when path count exceeds a heuristic threshold,
// signaling that downstream consumers may want to rasterize the page to
// capture this content.
//
// Only populated for PDFs; upstream documents `None` for other document types.
// NOTE(review): unlike the other optional flags here, this binding uses a
// plain bool without omitempty, so a missing or null value decodes as false
// and cannot be told apart from an explicit false — confirm this is intended.
HasVectorGraphics bool `json:"has_vector_graphics"`
}
// PageContent holds the content for a single page/slide.
//
// When page extraction is enabled, documents are split into per-page content
// with associated tables and images mapped to each page.
//
// # Performance
//
// The notes below describe the upstream Rust implementation, not this Go type
// (here Tables is a plain []Table and images are referenced by index):
//   - `Vec<Arc<Table>>` enables zero-copy sharing of table data
//   - `Vec<Arc<ExtractedImage>>` enables zero-copy sharing of image data
//   - Maintains exact JSON compatibility via custom Serialize/Deserialize
//
// This reduces memory overhead for documents with shared tables/images
// by avoiding redundant copies during serialization.
type PageContent struct {
// Page number (1-indexed)
PageNumber uint32 `json:"page_number"`
// Text content for this page
Content string `json:"content"`
// Tables found on this page.
//
// Upstream stores these behind Arc and serializes them as Vec<Table> for
// JSON compatibility; in Go they are decoded into ordinary Table values.
Tables []Table `json:"tables,omitempty"`
// Indices into `ExtractionResult.images` for images found on this page.
//
// Each value is a zero-based index into the top-level `images` collection.
// Only populated when `extract_images = true` in the extraction config.
ImageIndices []uint32 `json:"image_indices,omitempty"`
// Hierarchy information for the page (when hierarchy extraction is enabled)
//
// Contains text hierarchy levels (H1-H6) extracted from the page content.
Hierarchy *PageHierarchy `json:"hierarchy,omitempty"`
// Whether this page is blank (no meaningful text content)
//
// Determined during extraction based on text content analysis.
// A page is blank if it has fewer than 3 non-whitespace characters
// and contains no tables or images.
IsBlank *bool `json:"is_blank,omitempty"`
// Layout detection regions for this page (when layout detection is enabled).
//
// Contains detected layout regions with class, confidence, bounding box,
// and area fraction. Only populated when layout detection is configured.
LayoutRegions []LayoutRegion `json:"layout_regions,omitempty"`
// Speaker notes for this slide (PPTX only).
//
// Contains the text from the slide's notes pane (`ppt/notesSlides/notesSlide{N}.xml`).
// Only populated when the source is a PPTX file and notes are present.
SpeakerNotes *string `json:"speaker_notes,omitempty"`
// Section name this slide belongs to (PPTX only).
//
// PowerPoint sections group slides into logical chapters (`<p:sectionLst>` in
// `ppt/presentation.xml`). Only populated when the source is a PPTX file and
// the slide belongs to a named section.
SectionName *string `json:"section_name,omitempty"`
// Sheet name for this page (XLSX/ODS only).
//
// Each spreadsheet sheet maps to one `PageContent` entry. This field carries the
// sheet's display name as it appears in the workbook. It is nil for all
// non-spreadsheet formats and for sheets with an empty name.
SheetName *string `json:"sheet_name,omitempty"`
}
// LayoutRegion is a detected layout region on a page.
//
// When layout detection is enabled, each page may have layout regions
// identifying different content types (text, pictures, tables, etc.)
// with confidence scores and spatial positions.
type LayoutRegion struct {
// Layout class name (e.g. "picture", "table", "text", "section_header").
ClassName string `json:"class_name"`
// Confidence score from the layout detection model (0.0 to 1.0).
Confidence float64 `json:"confidence"`
// Bounding box in document coordinate space.
BoundingBox BoundingBox `json:"bounding_box"`
// Fraction of the page area covered by this region (0.0 to 1.0).
AreaFraction float64 `json:"area_fraction"`
}
// PageHierarchy is the page hierarchy structure containing heading levels and block information.
//
// Used when PDF text hierarchy extraction is enabled. Contains hierarchical
// blocks with heading levels (H1-H6) for semantic document structure.
type PageHierarchy struct {
// Number of hierarchy blocks on this page
BlockCount uint32 `json:"block_count"`
// Hierarchical blocks with heading levels; omitted from JSON when empty
Blocks []HierarchicalBlock `json:"blocks,omitempty"`
}
// HierarchicalBlock is a text block with hierarchy level assignment.
//
// Represents a block of text with semantic heading information extracted from
// font size clustering and hierarchical analysis.
type HierarchicalBlock struct {
// The text content of this block
Text string `json:"text"`
// The font size of the text in this block
FontSize float32 `json:"font_size"`
// The hierarchy level of this block (H1-H6 or Body)
//
// Levels correspond to HTML heading tags:
// - "h1": Top-level heading
// - "h2": Secondary heading
// - "h3": Tertiary heading
// - "h4": Quaternary heading
// - "h5": Quinary heading
// - "h6": Senary heading
// - "body": Body text (no heading level)
Level string `json:"level"`
// Bounding box information for the block
//
// Contains coordinates as (left, top, right, bottom) in PDF units,
// represented in Go as a slice; omitted from JSON when empty.
Bbox []float32 `json:"bbox,omitempty"`
}
// CellChange is a single changed cell within a table.
//
// Upstream note (Rust): defined outside `crate::diff` so `RevisionDelta` can
// reference it unconditionally, without requiring the `diff` Cargo feature.
// `crate::diff` re-exports this type verbatim.
type CellChange struct {
// Zero-based row index.
Row uint `json:"row"`
// Zero-based column index.
Col uint `json:"col"`
// Value before the change.
From string `json:"from"`
// Value after the change.
To string `json:"to"`
}
// DocumentRevision is a single tracked change embedded in a document.
//
// Populated by per-format extractors that understand change-tracking metadata
// (DOCX `w:ins`/`w:del`/`w:rPrChange`, ODT `text:change-*`, …). Every
// extractor defaults to `ExtractionResult.revisions = None` until a
// format-specific implementation is added.
//
// Decoding goes through the type's custom UnmarshalJSON, which resolves the
// Anchor field via UnmarshalRevisionAnchor.
type DocumentRevision struct {
// Format-specific revision identifier.
//
// For DOCX this is the `w:id` attribute value on the change element
// (e.g. `"42"`). When the attribute is absent a synthetic fallback is
// generated (`"docx-ins-0"`, `"docx-del-3"`, …).
RevisionID string `json:"revision_id"`
// Display name of the author who made this change, when available.
Author *string `json:"author,omitempty"`
// ISO-8601 timestamp of the change, when available.
//
// Stored as a plain string so this type remains FFI-friendly and
// unconditionally available without the `chrono` optional dep.
// DOCX populates this from the `w:date` attribute (e.g.
// `"2024-03-15T10:30:00Z"`).
Timestamp *string `json:"timestamp,omitempty"`
// Semantic kind of this revision.
Kind RevisionKind `json:"kind"`
// Best-effort document location for this revision.
//
// Resolution is format-dependent and may be unset (`None` upstream) when
// the location cannot be determined (e.g. changes inside table cells before
// table-cell anchor support is added).
Anchor RevisionAnchor `json:"anchor,omitempty"`
// The content changes that make up this revision.
Delta RevisionDelta `json:"delta"`
}
// UnmarshalJSON decodes a DocumentRevision from its JSON representation.
//
// Every field except Anchor is decoded directly. Anchor is first captured as
// raw JSON and then handed to UnmarshalRevisionAnchor. When the "anchor" key
// is absent or null, s.Anchor is left as it was.
//
// Returns the first decoding error encountered; note that the plain fields
// have already been copied into s if the anchor step fails.
func (s *DocumentRevision) UnmarshalJSON(data []byte) error {
	// Mirror of DocumentRevision with Anchor deferred as raw bytes.
	var wire struct {
		RevisionID string          `json:"revision_id"`
		Author     *string         `json:"author,omitempty"`
		Timestamp  *string         `json:"timestamp,omitempty"`
		Kind       RevisionKind    `json:"kind"`
		Anchor     json.RawMessage `json:"anchor,omitempty"`
		Delta      RevisionDelta   `json:"delta"`
	}
	if err := json.Unmarshal(data, &wire); err != nil {
		return err
	}

	s.RevisionID, s.Author, s.Timestamp = wire.RevisionID, wire.Author, wire.Timestamp
	s.Kind, s.Delta = wire.Kind, wire.Delta

	// Nothing to resolve when the anchor is missing or an explicit null.
	if len(wire.Anchor) == 0 || string(wire.Anchor) == "null" {
		return nil
	}
	anchor, err := UnmarshalRevisionAnchor(wire.Anchor)
	if err != nil {
		return err
	}
	s.Anchor = anchor
	return nil
}
// RevisionDelta holds the content changes that make up a single revision.
//
// For insertions and deletions the `content` field carries the added/removed
// lines as `DiffLine::Added` / `DiffLine::Removed` entries. For format
// changes, `content` is empty — the property diff is left as a TODO for a
// later enrichment pass.
type RevisionDelta struct {
// Line-level content changes for this revision; omitted from JSON when empty.
Content []DiffLine `json:"content,omitempty"`
// Cell-level table changes for this revision; omitted from JSON when empty.
TableChanges []CellChange `json:"table_changes,omitempty"`
}
// Table is an extracted table structure.
//
// Represents a table detected and extracted from a document (PDF, image, etc.).
// Tables are converted to both structured cell data and Markdown format.
type Table struct {
// Table cells as a 2D slice (rows × columns)
Cells [][]string `json:"cells,omitempty"`
// Markdown representation of the table
Markdown string `json:"markdown"`
// Page number where the table was found (1-indexed)
PageNumber uint32 `json:"page_number"`
// Bounding box of the table on the page (PDF coordinates: x0=left, y0=bottom, x1=right, y1=top).
// Only populated for PDF-extracted tables when position data is available;
// nil otherwise (and then omitted from JSON).
BoundingBox *BoundingBox `json:"bounding_box,omitempty"`
}
// TableCell is an individual table cell with content and optional styling.
//
// Future extension point for rich table support with cell-level metadata.
// Note that Table.Cells currently holds plain strings rather than TableCell values.
type TableCell struct {
// Cell content as text
Content string `json:"content"`
// Row span (number of rows this cell spans)
RowSpan uint32 `json:"row_span"`
// Column span (number of columns this cell spans)
ColSpan uint32 `json:"col_span"`
// Whether this is a header cell
IsHeader bool `json:"is_header"`
}
// ExtractedURI is a URI extracted from a document.
//
// Represents any link, reference, or resource pointer found during extraction.
// The `kind` field classifies the URI semantically, while `label` carries
// optional human-readable display text.
type ExtractedURI struct {
// The URL or path string.
URL string `json:"url"`
// Optional display text / label for the link; omitted from JSON when nil.
Label *string `json:"label,omitempty"`
// Optional page number where the URI was found (1-indexed); omitted from JSON when nil.
Page *uint32 `json:"page,omitempty"`
// Semantic classification of the URI.
Kind URIKind `json:"kind"`
}
// DetectResponse is the MIME type detection response.
type DetectResponse struct {
// Detected MIME type
MimeType string `json:"mime_type"`
// Original filename (if provided); omitted from JSON when nil
Filename *string `json:"filename,omitempty"`
}
// DiffOptions holds options controlling how two `ExtractionResult` values are compared.
//
// Every field is a pointer with omitempty, so a nil field is left out of the
// JSON output entirely; the defaults quoted below are those documented upstream.
type DiffOptions struct {
// Include metadata changes in the diff. Default: `true`.
IncludeMetadata *bool `json:"include_metadata,omitempty"`
// Include embedded-children changes in the diff. Default: `true`.
IncludeEmbedded *bool `json:"include_embedded,omitempty"`
// Truncate content to this many characters before diffing.
//
// Useful for very large documents where only the first N characters matter.
// nil means no truncation.
MaxContentChars *uint `json:"max_content_chars,omitempty"`
}
// ExtractionDiff is the complete diff between two `ExtractionResult` values.
type ExtractionDiff struct {
// Unified-diff hunks for the `content` field.
//
// Empty when the content is identical.
ContentDiff []DiffHunk `json:"content_diff,omitempty"`
// Tables present in `b` but not in `a` (by index position, excess right-side tables).
TablesAdded []Table `json:"tables_added,omitempty"`
// Tables present in `a` but not in `b` (by index position, excess left-side tables).
TablesRemoved []Table `json:"tables_removed,omitempty"`
// Cell-level changes for table pairs that share the same index and dimensions.
TablesChanged []TableDiff `json:"tables_changed,omitempty"`
// Metadata difference, encoded as a JSON object with three top-level keys:
// `added` (keys present in `b` but not `a`), `removed` (keys present in `a`
// but not `b`), and `changed` (keys whose values differ — each entry is
// `{ "from": <value-in-a>, "to": <value-in-b> }`).
//
// This is NOT RFC 6902 JSON Patch — upstream deliberately chose a flatter
// shape to avoid pulling in a json-patch crate. If you need RFC 6902 semantics
// (with JSON Pointer paths) feed `a.metadata` and `b.metadata` to your
// preferred json-patch impl directly.
//
// Kept as undecoded JSON (json.RawMessage); callers decode it themselves.
MetadataChanged json.RawMessage `json:"metadata_changed"`
// Changes to embedded archive children.
EmbeddedChanges EmbeddedChanges `json:"embedded_changes"`
}
// DiffHunk is a single contiguous hunk in a unified diff.
type DiffHunk struct {
// Starting line number in the old content (0-indexed).
FromLine uint `json:"from_line"`
// Number of lines from the old content in this hunk.
FromCount uint `json:"from_count"`
// Starting line number in the new content (0-indexed).
ToLine uint `json:"to_line"`
// Number of lines from the new content in this hunk.
ToCount uint `json:"to_count"`
// Lines that make up this hunk; omitted from JSON when empty.
Lines []DiffLine `json:"lines,omitempty"`
}
// TableDiff holds cell-level changes for a pair of tables that share the same index.
type TableDiff struct {
// Zero-based index of the table in both `a.tables` and `b.tables`.
FromIndex uint `json:"from_index"`
// Zero-based index in `b.tables` (equal to `from_index` for same-dimension tables).
ToIndex uint `json:"to_index"`
// Cell-level changes within the table; omitted from JSON when empty.
CellChanges []CellChange `json:"cell_changes,omitempty"`
}
// EmbeddedChanges holds changes to embedded archive children between two results.
//
// Each list is omitted from the JSON output when empty.
type EmbeddedChanges struct {
// Children present in `b` but not in `a` (matched by `path`).
Added []ArchiveEntry `json:"added,omitempty"`
// Children present in `a` but not in `b` (matched by `path`).
Removed []ArchiveEntry `json:"removed,omitempty"`
// Children present in both but with differing content (matched by `path`).
//
// Each entry holds the diff of the nested `ExtractionResult`.
Changed []EmbeddedDiff `json:"changed,omitempty"`
}
// EmbeddedDiff is the diff for a single embedded archive entry that appears in both results.
type EmbeddedDiff struct {
// Archive-relative path identifying this entry.
Path string `json:"path"`
// The recursive diff of the entry's extraction result.
// (ExtractionDiff itself contains EmbeddedChanges, hence the recursion.)
Diff ExtractionDiff `json:"diff"`
}
// EmbeddingPreset is a preset configuration for common RAG use cases.
//
// Each preset combines chunk size, overlap, and embedding model
// to provide an optimized configuration for specific scenarios.
//
// Upstream note: all string fields are owned `String` for FFI compatibility —
// instances are safe to clone and pass across language boundaries.
type EmbeddingPreset struct {
// Encoded under the JSON key "name"; presumably the preset's identifier.
Name string `json:"name"`
// Chunk size for this preset.
// NOTE(review): unit (characters vs. tokens) is not visible here — confirm upstream.
ChunkSize uint `json:"chunk_size"`
// Overlap between chunks for this preset, presumably in the same unit as ChunkSize.
Overlap uint `json:"overlap"`
// HuggingFace repository name for the model.
ModelRepo string `json:"model_repo"`
// Pooling strategy: "cls" or "mean".
Pooling string `json:"pooling"`
// Path to the ONNX model file within the repo.
ModelFile string `json:"model_file"`
// Encoded under the JSON key "dimensions"; presumably the embedding vector size.
Dimensions uint `json:"dimensions"`
// Encoded under the JSON key "description"; presumably human-readable text about the preset.
Description string `json:"description"`
}
// YakeParams holds YAKE-specific parameters.
type YakeParams struct {
// Window size for co-occurrence analysis (default: 2).
//
// Controls the context window for computing co-occurrence statistics.
// Omitted from JSON when nil.
WindowSize *uint `json:"window_size,omitempty"`
}
// RakeParams holds RAKE-specific parameters.
//
// Both fields are optional and are omitted from the JSON output when nil.
type RakeParams struct {
// Minimum word length to consider (default: 1).
MinWordLength *uint `json:"min_word_length,omitempty"`
// Maximum words in a keyword phrase (default: 3).
MaxWordsPerPhrase *uint `json:"max_words_per_phrase,omitempty"`
}
// KeywordConfig is the keyword extraction configuration.
//
// Defaults quoted below are those documented upstream; they are not applied
// by this struct's Go zero value.
type KeywordConfig struct {
// Algorithm to use for extraction.
Algorithm KeywordAlgorithm `json:"algorithm,omitempty"`
// Maximum number of keywords to extract (default: 10).
MaxKeywords *uint `json:"max_keywords,omitempty"`
// Minimum score threshold (0.0-1.0, default: 0.0).
//
// Keywords with scores below this threshold are filtered out.
// Note: Score ranges differ between algorithms.
MinScore float32 `json:"min_score"`
// N-gram range for keyword extraction (min, max), represented in Go as a slice.
//
// (1, 1) = unigrams only
// (1, 2) = unigrams and bigrams
// (1, 3) = unigrams, bigrams, and trigrams (default)
NgramRange []uint `json:"ngram_range,omitempty"`
// Language code for stopword filtering (e.g., "en", "de", "fr").
//
// If nil, no stopword filtering is applied.
Language *string `json:"language,omitempty"`
// YAKE-specific tuning parameters.
YakeParams *YakeParams `json:"yake_params,omitempty"`
// RAKE-specific tuning parameters.
RakeParams *RakeParams `json:"rake_params,omitempty"`
}
// Keyword is an extracted keyword with metadata.
type Keyword struct {
// The keyword text.
Text string `json:"text"`
// Relevance score (higher is better, algorithm-specific range).
Score float32 `json:"score"`
// Algorithm that extracted this keyword.
Algorithm KeywordAlgorithm `json:"algorithm"`
// Optional positions where keyword appears in text (character offsets);
// omitted from JSON when empty.
Positions []uint `json:"positions,omitempty"`
}
// PaddleOcrConfig is the configuration for the PaddleOCR backend.
//
// Configures PaddleOCR text detection and recognition with multi-language support.
//
// The builder-style example below is written against the upstream Rust API.
// The defaults quoted on each field are likewise those documented upstream:
// every field except CacheDir is serialized without omitempty, so a
// zero-valued Go struct sends zero values rather than those defaults.
//
// Example (Rust):
//
// // Create with default English configuration
// let config = PaddleOcrConfig::new("en");
//
// // Create with custom cache directory
// let config = PaddleOcrConfig::new("ch")
// .with_cache_dir("/path/to/cache".into());
//
// // Enable table detection
// let config = PaddleOcrConfig::new("en")
// .with_table_detection(true);
type PaddleOcrConfig struct {
// Language code (e.g., "en", "ch", "jpn", "kor", "deu", "fra")
Language string `json:"language"`
// Optional custom cache directory for model files; omitted from JSON when nil
CacheDir *string `json:"cache_dir,omitempty"`
// Enable angle classification for rotated text (default: false).
// Can misfire on short text regions, rotating crops incorrectly before recognition.
UseAngleCls bool `json:"use_angle_cls"`
// Enable table structure detection (default: false)
EnableTableDetection bool `json:"enable_table_detection"`
// Database threshold for text detection (default: 0.3)
// Range: 0.0-1.0, higher values require more confident detections
DetDbThresh float32 `json:"det_db_thresh"`
// Box threshold for text bounding box refinement (default: 0.5)
// Range: 0.0-1.0
DetDbBoxThresh float32 `json:"det_db_box_thresh"`
// Unclip ratio for expanding text bounding boxes (default: 1.6)
// Controls the expansion of detected text regions
DetDbUnclipRatio float32 `json:"det_db_unclip_ratio"`
// Maximum side length for detection image (default: 960)
// Larger images may be resized to this limit for faster inference
DetLimitSideLen uint32 `json:"det_limit_side_len"`
// Batch size for recognition inference (default: 6)
// Number of text regions to process simultaneously
RecBatchNum uint32 `json:"rec_batch_num"`
// Padding in pixels added around the image before detection (default: 10).
// Large values can include surrounding content like table gridlines.
Padding uint32 `json:"padding"`
// Minimum recognition confidence score for text lines (default: 0.5).
// Text regions with recognition confidence below this threshold are discarded.
// Matches PaddleOCR Python's `drop_score` parameter.
// Range: 0.0-1.0
DropScore float32 `json:"drop_score"`
// Model tier controlling detection/recognition model size and accuracy trade-off.
// - `"mobile"` (default): Lightweight models (~4.5MB detection, ~16.5MB recognition), fast download and inference
// - `"server"`: Large, high-accuracy models (~88MB detection, ~84MB recognition), best for GPU or complex documents
ModelTier string `json:"model_tier"`
}
// ModelPaths holds the combined paths to all models needed for OCR (backward compatibility).
type ModelPaths struct {
// Path to the detection model directory.
DetModel string `json:"det_model"`
// Path to the classification model directory.
ClsModel string `json:"cls_model"`
// Path to the recognition model directory.
RecModel string `json:"rec_model"`
// Path to the character dictionary file.
DictFile string `json:"dict_file"`
}
// OrientationResult is the document orientation detection result.
type OrientationResult struct {
// Detected orientation in degrees (0, 90, 180, or 270).
Degrees uint32 `json:"degrees"`
// Confidence score (0.0-1.0).
Confidence float32 `json:"confidence"`
}
// BBox is a bounding box in original image coordinates: (X1, Y1) is the
// top-left corner and (X2, Y2) is the bottom-right corner.
type BBox struct {
// X coordinate of the top-left corner.
X1 float32 `json:"x1"`
// Y coordinate of the top-left corner.
Y1 float32 `json:"y1"`
// X coordinate of the bottom-right corner.
X2 float32 `json:"x2"`
// Y coordinate of the bottom-right corner.
Y2 float32 `json:"y2"`
}
// LayoutDetection is a single layout detection result.
type LayoutDetection struct {
// Detected layout class, encoded under the JSON key "class_name".
ClassName LayoutClass `json:"class_name"`
// Confidence of this detection.
// NOTE(review): presumably in the range 0.0-1.0 — confirm upstream.
Confidence float32 `json:"confidence"`
// Bounding box of the detection (see BBox for the coordinate convention).
Bbox BBox `json:"bbox"`
}
// RecognizedTable is the pre-computed table markdown for a table detection region.
//
// Produced by the TATR-based table structure recognizer and surfaced as part of
// layout-aware OCR results. Upstream note (Rust): the struct lives under
// `layout-types` (pure-Rust) so that consumers who do not enable
// `layout-detection` (ORT) can still reference the type in their own code.
type RecognizedTable struct {
// Detection bbox that this table corresponds to (for matching).
DetectionBbox BBox `json:"detection_bbox"`
// Table cells as a 2D slice (rows × columns); omitted from JSON when empty.
Cells [][]string `json:"cells,omitempty"`
// Rendered markdown table.
Markdown string `json:"markdown"`
}
// DetectionResult is the page-level detection result containing all detections and page metadata.
type DetectionResult struct {
// Page width.
// NOTE(review): unit (presumably pixels of the analysed image) is not visible here — confirm upstream.
PageWidth uint32 `json:"page_width"`
// Page height, presumably in the same unit as PageWidth.
PageHeight uint32 `json:"page_height"`
// All layout detections for the page; omitted from JSON when empty.
Detections []LayoutDetection `json:"detections,omitempty"`
}
// EmbeddedFile is an embedded file descriptor extracted from the PDF name tree.
type EmbeddedFile struct {
	// The filename as stored in the PDF name tree.
	Name string `json:"name"`
	// Raw file bytes from the embedded stream (already decompressed by lopdf).
	Data []byte `json:"data"`
	// Compressed byte count of the original stream (before decompression).
	//
	// Used by callers to compute the decompression ratio and detect zip-bomb-style
	// attacks that embed a tiny compressed stream expanding to gigabytes of data.
	CompressedSize uint `json:"compressed_size"`
	// MIME type if specified in the filespec, otherwise nil (omitted from JSON).
	MimeType *string `json:"mime_type,omitempty"`
}

// MarshalJSON encodes the file with Data written as a JSON array of integers
// (the shape Rust's serde `Vec<u8>` deserializer expects) rather than the
// base64 string encoding/json would produce for a []byte.
func (v EmbeddedFile) MarshalJSON() ([]byte, error) {
	// Widen every byte to an int so the encoder emits plain numbers. The slice
	// is always allocated, so an empty payload is written as [] and not null.
	octets := make([]int, 0, len(v.Data))
	for _, b := range v.Data {
		octets = append(octets, int(b))
	}

	// A standalone wire struct is used instead of embedding EmbeddedFile:
	// embedding would yield two candidates for the "data" key and would also
	// re-enter this method.
	type wireEmbeddedFile struct {
		Name           string  `json:"name"`
		Data           []int   `json:"data"`
		CompressedSize uint    `json:"compressed_size"`
		MimeType       *string `json:"mime_type,omitempty"`
	}
	return json.Marshal(wireEmbeddedFile{
		Name:           v.Name,
		Data:           octets,
		CompressedSize: v.CompressedSize,
		MimeType:       v.MimeType,
	})
}
// PdfMetadata holds PDF-specific metadata.
//
// It contains metadata fields specific to PDF documents that are not in the
// common `Metadata` structure. Common fields like title, authors, keywords, and
// dates are at the `Metadata` level. Every field is optional and is omitted
// from JSON when nil.
type PdfMetadata struct {
// PdfVersion is the PDF version (e.g., "1.7", "2.0").
PdfVersion *string `json:"pdf_version,omitempty"`
// Producer is the PDF producer (application that created the PDF).
Producer *string `json:"producer,omitempty"`
// IsEncrypted reports whether the PDF is encrypted/password-protected.
IsEncrypted *bool `json:"is_encrypted,omitempty"`
// Width is the first page width in points (1/72 inch).
Width *int64 `json:"width,omitempty"`
// Height is the first page height in points (1/72 inch).
Height *int64 `json:"height,omitempty"`
// PageCount is the total number of pages in the PDF document.
PageCount *uint32 `json:"page_count,omitempty"`
}
// ExtractBytes extracts content from an in-memory document.
//
// This is the main entry point for in-memory extraction. The native library
// performs the following steps:
//  1. Validate the MIME type
//  2. Handle legacy format conversion if needed
//  3. Select the appropriate extractor from the registry
//  4. Extract content
//  5. Run the post-processing pipeline
//
// Parameters:
//   - content: the document bytes; a nil or empty slice is passed to the
//     native layer as a NULL pointer with length 0
//   - mimeType: MIME type of the content
//   - config: extraction configuration, sent to the native layer as JSON
//
// It returns the decoded ExtractionResult containing the extracted content and
// metadata.
//
// An error is returned when config cannot be marshaled or is rejected by the
// native layer, when the native call reports an error (the Rust API documents
// `KreuzbergError::Validation` for an invalid MIME type and
// `KreuzbergError::UnsupportedFormat` for an unsupported one), when the call
// yields no result handle, or when the result cannot be converted to or decoded
// from JSON.
//
// Example:
//
//	result, err := ExtractBytes(data, "text/plain", config)
func ExtractBytes(content []byte, mimeType string, config ExtractionConfig) (*ExtractionResult, error) {
	// Hand the native side a pointer into the Go slice, pinned until this
	// function returns. Empty input stays NULL.
	var cContent *C.uint8_t
	if len(content) > 0 {
		var cContentPinner runtime.Pinner
		cContentPinner.Pin(&content[0])
		defer cContentPinner.Unpin()
		cContent = (*C.uint8_t)(unsafe.Pointer(&content[0]))
	}
	cContentLen := C.uintptr_t(len(content))

	cMimeType := C.CString(mimeType)
	defer C.free(unsafe.Pointer(cMimeType))

	// The configuration crosses the FFI boundary as JSON.
	configJSON, err := json.Marshal(config)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal: %w", err)
	}
	// The FFI's _from_json constructor rejects "null"; send "{}" so that a
	// default configuration is built instead.
	if string(configJSON) == "null" {
		configJSON = []byte("{}")
	}
	cConfigJSON := C.CString(string(configJSON))
	cConfig := C.kreuzberg_extraction_config_from_json(cConfigJSON)
	C.free(unsafe.Pointer(cConfigJSON))
	if cConfig == nil {
		return nil, fmt.Errorf("failed to create extraction_config: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_extraction_config_free(cConfig)

	ptr := C.kreuzberg_extract_bytes(cContent, cContentLen, cMimeType, cConfig)
	// Read the FFI error state immediately after the call.
	if err := lastError(); err != nil {
		if ptr != nil {
			C.kreuzberg_extraction_result_free(ptr)
		}
		return nil, err
	}
	// Never hand a nil handle to the free/to_json functions, even when no
	// error code was recorded.
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_extraction_result_free(ptr)

	jsonPtr := C.kreuzberg_extraction_result_to_json(ptr)
	if jsonPtr == nil {
		return nil, fmt.Errorf("failed to convert to JSON")
	}
	defer C.kreuzberg_free_string(jsonPtr)

	var result ExtractionResult
	if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return &result, nil
}
// ExtractFile extracts content from a file on disk.
//
// This is the main entry point for file-based extraction. The native library
// performs the following steps:
//  1. Check the cache for an existing result (if caching is enabled)
//  2. Detect or validate the MIME type
//  3. Select the appropriate extractor from the registry
//  4. Extract content
//  5. Run the post-processing pipeline
//  6. Store the result in the cache (if caching is enabled)
//
// Parameters:
//   - path: path to the file to extract
//   - mimeType: optional MIME type override; when nil, NULL is passed to the
//     native layer and the type is auto-detected
//   - config: extraction configuration, sent to the native layer as JSON
//
// It returns the decoded ExtractionResult containing the extracted content and
// metadata.
//
// An error is returned when config cannot be marshaled or is rejected by the
// native layer, when the native call reports an error (the Rust API documents
// `KreuzbergError::Io` if the file doesn't exist or for other file I/O errors,
// and `KreuzbergError::UnsupportedFormat` for an unsupported MIME type), when
// the call yields no result handle, or when the result cannot be converted to
// or decoded from JSON.
//
// Example:
//
//	result, err := ExtractFile("document.pdf", nil, config)
func ExtractFile(path string, mimeType *string, config ExtractionConfig) (*ExtractionResult, error) {
	cPath := C.CString(path)
	defer C.free(unsafe.Pointer(cPath))

	// A nil override stays NULL on the C side.
	var cMimeType *C.char
	if mimeType != nil {
		cMimeType = C.CString(*mimeType)
		defer C.free(unsafe.Pointer(cMimeType))
	}

	// The configuration crosses the FFI boundary as JSON.
	configJSON, err := json.Marshal(config)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal: %w", err)
	}
	// The FFI's _from_json constructor rejects "null"; send "{}" so that a
	// default configuration is built instead.
	if string(configJSON) == "null" {
		configJSON = []byte("{}")
	}
	cConfigJSON := C.CString(string(configJSON))
	cConfig := C.kreuzberg_extraction_config_from_json(cConfigJSON)
	C.free(unsafe.Pointer(cConfigJSON))
	if cConfig == nil {
		return nil, fmt.Errorf("failed to create extraction_config: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_extraction_config_free(cConfig)

	ptr := C.kreuzberg_extract_file(cPath, cMimeType, cConfig)
	// Read the FFI error state immediately after the call.
	if err := lastError(); err != nil {
		if ptr != nil {
			C.kreuzberg_extraction_result_free(ptr)
		}
		return nil, err
	}
	// Never hand a nil handle to the free/to_json functions, even when no
	// error code was recorded.
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_extraction_result_free(ptr)

	jsonPtr := C.kreuzberg_extraction_result_to_json(ptr)
	if jsonPtr == nil {
		return nil, fmt.Errorf("failed to convert to JSON")
	}
	defer C.kreuzberg_free_string(jsonPtr)

	var result ExtractionResult
	if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return &result, nil
}
// ExtractFileSync is the binding for the native synchronous wrapper around
// `extract_file`.
//
// Per the Rust API documentation, the native function blocks the current
// thread until extraction completes and always uses the global Tokio runtime
// (avoiding the cost of creating a runtime per call and nested-runtime
// issues). It is only available when the native library is built with the
// `tokio-runtime` feature.
//
// Parameters:
//   - path: path to the file to extract
//   - mimeType: optional MIME type override; when nil, NULL is passed to the
//     native layer
//   - config: extraction configuration, sent to the native layer as JSON
//
// It returns the decoded ExtractionResult. An error is returned when config
// cannot be marshaled or is rejected by the native layer, when the native call
// reports an error or yields no result handle, or when the result cannot be
// converted to or decoded from JSON.
//
// Example:
//
//	result, err := ExtractFileSync("document.pdf", nil, config)
func ExtractFileSync(path string, mimeType *string, config ExtractionConfig) (*ExtractionResult, error) {
	cPath := C.CString(path)
	defer C.free(unsafe.Pointer(cPath))

	// A nil override stays NULL on the C side.
	var cMimeType *C.char
	if mimeType != nil {
		cMimeType = C.CString(*mimeType)
		defer C.free(unsafe.Pointer(cMimeType))
	}

	// The configuration crosses the FFI boundary as JSON.
	configJSON, err := json.Marshal(config)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal: %w", err)
	}
	// The FFI's _from_json constructor rejects "null"; send "{}" so that a
	// default configuration is built instead.
	if string(configJSON) == "null" {
		configJSON = []byte("{}")
	}
	cConfigJSON := C.CString(string(configJSON))
	cConfig := C.kreuzberg_extraction_config_from_json(cConfigJSON)
	C.free(unsafe.Pointer(cConfigJSON))
	if cConfig == nil {
		return nil, fmt.Errorf("failed to create extraction_config: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_extraction_config_free(cConfig)

	ptr := C.kreuzberg_extract_file_sync(cPath, cMimeType, cConfig)
	// Read the FFI error state immediately after the call.
	if err := lastError(); err != nil {
		if ptr != nil {
			C.kreuzberg_extraction_result_free(ptr)
		}
		return nil, err
	}
	// Never hand a nil handle to the free/to_json functions, even when no
	// error code was recorded.
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_extraction_result_free(ptr)

	jsonPtr := C.kreuzberg_extraction_result_to_json(ptr)
	if jsonPtr == nil {
		return nil, fmt.Errorf("failed to convert to JSON")
	}
	defer C.kreuzberg_free_string(jsonPtr)

	var result ExtractionResult
	if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return &result, nil
}
// ExtractBytesSync is the binding for the native synchronous wrapper around
// `extract_bytes`.
//
// Per the Rust API documentation, with the `tokio-runtime` feature the native
// function blocks the current thread using the global Tokio runtime (avoiding
// the cost of creating a runtime per call); without it (WASM) a truly
// synchronous implementation is used.
//
// Parameters:
//   - content: the document bytes; a nil or empty slice is passed to the
//     native layer as a NULL pointer with length 0
//   - mimeType: MIME type of the content
//   - config: extraction configuration, sent to the native layer as JSON
//
// It returns the decoded ExtractionResult. An error is returned when config
// cannot be marshaled or is rejected by the native layer, when the native call
// reports an error or yields no result handle, or when the result cannot be
// converted to or decoded from JSON.
//
// Example:
//
//	result, err := ExtractBytesSync(data, "text/plain", config)
func ExtractBytesSync(content []byte, mimeType string, config ExtractionConfig) (*ExtractionResult, error) {
	// Hand the native side a pointer into the Go slice, pinned until this
	// function returns. Empty input stays NULL.
	var cContent *C.uint8_t
	if len(content) > 0 {
		var cContentPinner runtime.Pinner
		cContentPinner.Pin(&content[0])
		defer cContentPinner.Unpin()
		cContent = (*C.uint8_t)(unsafe.Pointer(&content[0]))
	}
	cContentLen := C.uintptr_t(len(content))

	cMimeType := C.CString(mimeType)
	defer C.free(unsafe.Pointer(cMimeType))

	// The configuration crosses the FFI boundary as JSON.
	configJSON, err := json.Marshal(config)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal: %w", err)
	}
	// The FFI's _from_json constructor rejects "null"; send "{}" so that a
	// default configuration is built instead.
	if string(configJSON) == "null" {
		configJSON = []byte("{}")
	}
	cConfigJSON := C.CString(string(configJSON))
	cConfig := C.kreuzberg_extraction_config_from_json(cConfigJSON)
	C.free(unsafe.Pointer(cConfigJSON))
	if cConfig == nil {
		return nil, fmt.Errorf("failed to create extraction_config: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_extraction_config_free(cConfig)

	ptr := C.kreuzberg_extract_bytes_sync(cContent, cContentLen, cMimeType, cConfig)
	// Read the FFI error state immediately after the call.
	if err := lastError(); err != nil {
		if ptr != nil {
			C.kreuzberg_extraction_result_free(ptr)
		}
		return nil, err
	}
	// Never hand a nil handle to the free/to_json functions, even when no
	// error code was recorded.
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_extraction_result_free(ptr)

	jsonPtr := C.kreuzberg_extraction_result_to_json(ptr)
	if jsonPtr == nil {
		return nil, fmt.Errorf("failed to convert to JSON")
	}
	defer C.kreuzberg_free_string(jsonPtr)

	var result ExtractionResult
	if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return &result, nil
}
// BatchExtractFilesSync is the binding for the native synchronous wrapper
// around `batch_extract_files`.
//
// Per the Rust API documentation, the native function uses the global Tokio
// runtime and is only available with the `tokio-runtime` feature (WASM has no
// filesystem).
//
// Parameters:
//   - items: the files to extract, each with a path and optional per-file
//     configuration; sent to the native layer as a JSON array (a nil slice is
//     sent as an empty array)
//   - config: batch-level extraction configuration, sent as JSON
//
// It returns the extraction results decoded from the JSON array produced by
// the native layer. An error is returned when items or config cannot be
// marshaled, when config is rejected by the native layer, when the native call
// reports an error or yields no result, or when the result is not valid JSON.
//
// Example:
//
//	results, err := BatchExtractFilesSync(items, config)
func BatchExtractFilesSync(items []BatchFileItem, config ExtractionConfig) ([]ExtractionResult, error) {
	itemsJSON, err := json.Marshal(items)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal: %w", err)
	}
	// A nil slice marshals to "null"; always send a JSON array so that an
	// empty batch is still a list on the native side.
	if string(itemsJSON) == "null" {
		itemsJSON = []byte("[]")
	}
	cItems := C.CString(string(itemsJSON))
	defer C.free(unsafe.Pointer(cItems))

	// The configuration crosses the FFI boundary as JSON.
	configJSON, err := json.Marshal(config)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal: %w", err)
	}
	// The FFI's _from_json constructor rejects "null"; send "{}" so that a
	// default configuration is built instead.
	if string(configJSON) == "null" {
		configJSON = []byte("{}")
	}
	cConfigJSON := C.CString(string(configJSON))
	cConfig := C.kreuzberg_extraction_config_from_json(cConfigJSON)
	C.free(unsafe.Pointer(cConfigJSON))
	if cConfig == nil {
		return nil, fmt.Errorf("failed to create extraction_config: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_extraction_config_free(cConfig)

	ptr := C.kreuzberg_batch_extract_files_sync(cItems, cConfig)
	// Read the FFI error state immediately after the call.
	if err := lastError(); err != nil {
		// Release any string that came back alongside the error so it is
		// not leaked.
		if ptr != nil {
			C.kreuzberg_free_string(ptr)
		}
		return nil, err
	}
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_free_string(ptr)

	var result []ExtractionResult
	if err := json.Unmarshal([]byte(C.GoString(ptr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return result, nil
}
// BatchExtractBytesSync is the binding for the native synchronous wrapper
// around `batch_extract_bytes`.
//
// Per the Rust API documentation, with the `tokio-runtime` feature the native
// function blocks the current thread using the global Tokio runtime; without
// it (WASM) a truly synchronous implementation iterates through the items and
// calls `extract_bytes_sync()` for each.
//
// Parameters:
//   - items: the in-memory documents to extract, each with content bytes, a
//     MIME type and optional per-item configuration; sent to the native layer
//     as a JSON array (a nil slice is sent as an empty array)
//   - config: batch-level extraction configuration, sent as JSON
//
// It returns the extraction results decoded from the JSON array produced by
// the native layer. An error is returned when items or config cannot be
// marshaled, when config is rejected by the native layer, when the native call
// reports an error or yields no result, or when the result is not valid JSON.
//
// Example:
//
//	results, err := BatchExtractBytesSync(items, config)
func BatchExtractBytesSync(items []BatchBytesItem, config ExtractionConfig) ([]ExtractionResult, error) {
	itemsJSON, err := json.Marshal(items)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal: %w", err)
	}
	// A nil slice marshals to "null"; always send a JSON array so that an
	// empty batch is still a list on the native side.
	if string(itemsJSON) == "null" {
		itemsJSON = []byte("[]")
	}
	cItems := C.CString(string(itemsJSON))
	defer C.free(unsafe.Pointer(cItems))

	// The configuration crosses the FFI boundary as JSON.
	configJSON, err := json.Marshal(config)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal: %w", err)
	}
	// The FFI's _from_json constructor rejects "null"; send "{}" so that a
	// default configuration is built instead.
	if string(configJSON) == "null" {
		configJSON = []byte("{}")
	}
	cConfigJSON := C.CString(string(configJSON))
	cConfig := C.kreuzberg_extraction_config_from_json(cConfigJSON)
	C.free(unsafe.Pointer(cConfigJSON))
	if cConfig == nil {
		return nil, fmt.Errorf("failed to create extraction_config: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_extraction_config_free(cConfig)

	ptr := C.kreuzberg_batch_extract_bytes_sync(cItems, cConfig)
	// Read the FFI error state immediately after the call.
	if err := lastError(); err != nil {
		// Release any string that came back alongside the error so it is
		// not leaked.
		if ptr != nil {
			C.kreuzberg_free_string(ptr)
		}
		return nil, err
	}
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_free_string(ptr)

	var result []ExtractionResult
	if err := json.Unmarshal([]byte(C.GoString(ptr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return result, nil
}
// BatchExtractFiles extracts content from multiple files concurrently.
//
// Per the Rust API documentation, the native function processes the files in
// parallel and manages concurrency to prevent resource exhaustion. The
// concurrency limit can be configured via
// `ExtractionConfig::max_concurrent_extractions` and otherwise defaults to
// `(num_cpus * 1.5).ceil()`.
//
// Each item may carry a per-file configuration that overrides specific fields
// of the batch-level config; leave it unset to use the batch defaults.
// Batch-level settings such as `max_concurrent_extractions` and `use_cache`
// are always taken from the batch-level config.
//
// Parameters:
//   - items: the files to extract, each with a path and optional per-file
//     configuration overrides; sent to the native layer as a JSON array (a nil
//     slice is sent as an empty array)
//   - config: batch-level extraction configuration (provides defaults and
//     batch settings), sent as JSON
//
// It returns the extraction results in the same order as the input items.
//
// Individual file errors are captured in the result metadata; system errors
// (IO, runtime errors) fail the entire batch. An error is also returned when
// items or config cannot be marshaled, when config is rejected by the native
// layer, when the native call yields no result, or when the result is not
// valid JSON.
//
// Example:
//
//	results, err := BatchExtractFiles(items, config)
func BatchExtractFiles(items []BatchFileItem, config ExtractionConfig) ([]ExtractionResult, error) {
	itemsJSON, err := json.Marshal(items)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal: %w", err)
	}
	// A nil slice marshals to "null"; always send a JSON array so that an
	// empty batch is still a list on the native side.
	if string(itemsJSON) == "null" {
		itemsJSON = []byte("[]")
	}
	cItems := C.CString(string(itemsJSON))
	defer C.free(unsafe.Pointer(cItems))

	// The configuration crosses the FFI boundary as JSON.
	configJSON, err := json.Marshal(config)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal: %w", err)
	}
	// The FFI's _from_json constructor rejects "null"; send "{}" so that a
	// default configuration is built instead.
	if string(configJSON) == "null" {
		configJSON = []byte("{}")
	}
	cConfigJSON := C.CString(string(configJSON))
	cConfig := C.kreuzberg_extraction_config_from_json(cConfigJSON)
	C.free(unsafe.Pointer(cConfigJSON))
	if cConfig == nil {
		return nil, fmt.Errorf("failed to create extraction_config: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_extraction_config_free(cConfig)

	ptr := C.kreuzberg_batch_extract_files(cItems, cConfig)
	// Read the FFI error state immediately after the call.
	if err := lastError(); err != nil {
		// Release any string that came back alongside the error so it is
		// not leaked.
		if ptr != nil {
			C.kreuzberg_free_string(ptr)
		}
		return nil, err
	}
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_free_string(ptr)

	var result []ExtractionResult
	if err := json.Unmarshal([]byte(C.GoString(ptr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return result, nil
}
// BatchExtractBytes extracts content from multiple byte arrays concurrently.
//
// Per the Rust API documentation, the native function processes the byte
// arrays in parallel and manages concurrency to prevent resource exhaustion.
// The concurrency limit can be configured via
// `ExtractionConfig::max_concurrent_extractions` and otherwise defaults to
// `(num_cpus * 1.5).ceil()`.
//
// Each item may carry a per-item configuration that overrides specific fields
// of the batch-level config; leave it unset to use the batch-level defaults
// for that item.
//
// Parameters:
//   - items: the in-memory documents to extract, each with content bytes, a
//     MIME type and optional per-item configuration overrides; sent to the
//     native layer as a JSON array (a nil slice is sent as an empty array)
//   - config: batch-level extraction configuration, sent as JSON
//
// It returns the extraction results in the same order as the input items.
//
// An error is returned when items or config cannot be marshaled, when config
// is rejected by the native layer, when the native call reports an error or
// yields no result, or when the result is not valid JSON.
//
// Example:
//
//	results, err := BatchExtractBytes(items, config)
func BatchExtractBytes(items []BatchBytesItem, config ExtractionConfig) ([]ExtractionResult, error) {
	itemsJSON, err := json.Marshal(items)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal: %w", err)
	}
	// A nil slice marshals to "null"; always send a JSON array so that an
	// empty batch is still a list on the native side.
	if string(itemsJSON) == "null" {
		itemsJSON = []byte("[]")
	}
	cItems := C.CString(string(itemsJSON))
	defer C.free(unsafe.Pointer(cItems))

	// The configuration crosses the FFI boundary as JSON.
	configJSON, err := json.Marshal(config)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal: %w", err)
	}
	// The FFI's _from_json constructor rejects "null"; send "{}" so that a
	// default configuration is built instead.
	if string(configJSON) == "null" {
		configJSON = []byte("{}")
	}
	cConfigJSON := C.CString(string(configJSON))
	cConfig := C.kreuzberg_extraction_config_from_json(cConfigJSON)
	C.free(unsafe.Pointer(cConfigJSON))
	if cConfig == nil {
		return nil, fmt.Errorf("failed to create extraction_config: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_extraction_config_free(cConfig)

	ptr := C.kreuzberg_batch_extract_bytes(cItems, cConfig)
	// Read the FFI error state immediately after the call.
	if err := lastError(); err != nil {
		// Release any string that came back alongside the error so it is
		// not leaked.
		if ptr != nil {
			C.kreuzberg_free_string(ptr)
		}
		return nil, err
	}
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_free_string(ptr)

	var result []ExtractionResult
	if err := json.Unmarshal([]byte(C.GoString(ptr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return result, nil
}
// DetectMimeTypeFromBytes detects the MIME type of raw file bytes.
//
// Per the Rust API documentation, the native function uses magic byte
// signatures to detect the file type from content and falls back to the
// `infer` crate for comprehensive detection. For ZIP-based files it inspects
// the contents to distinguish Office Open XML formats (DOCX, XLSX, PPTX) from
// plain ZIP archives.
//
// Parameters:
//   - content: raw file bytes; a nil or empty slice is passed to the native
//     layer as a NULL pointer with length 0
//
// It returns the detected MIME type string.
//
// An error is returned when the native call reports one (the Rust API
// documents `KreuzbergError::UnsupportedFormat` if the MIME type cannot be
// determined) or when it yields no result string.
func DetectMimeTypeFromBytes(content []byte) (string, error) {
	// Hand the native side a pointer into the Go slice, pinned until this
	// function returns. Empty input stays NULL.
	var cContent *C.uint8_t
	if len(content) > 0 {
		var cContentPinner runtime.Pinner
		cContentPinner.Pin(&content[0])
		defer cContentPinner.Unpin()
		cContent = (*C.uint8_t)(unsafe.Pointer(&content[0]))
	}
	cContentLen := C.uintptr_t(len(content))

	ptr := C.kreuzberg_detect_mime_type_from_bytes(cContent, cContentLen)
	// Read the FFI error state immediately after the call.
	if err := lastError(); err != nil {
		if ptr != nil {
			C.kreuzberg_free_string(ptr)
		}
		return "", err
	}
	// A nil string with no recorded error must not be reported as a
	// successful detection of "" (and must not be passed to free_string).
	if ptr == nil {
		return "", fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_free_string(ptr)
	return C.GoString(ptr), nil
}
// GetExtensionsForMime returns the file extensions known for a MIME type.
//
// Parameters:
//   - mimeType: the MIME type to look up
//
// It returns all known file extensions (without a leading dot) that map to the
// given MIME type, decoded from the JSON array produced by the native layer.
// Per the Rust API documentation, "application/pdf" yields ["pdf"].
//
// An error is returned when the native call reports one, when it yields no
// result, or when the result is not valid JSON.
//
// Example:
//
//	extensions, err := GetExtensionsForMime("application/pdf")
func GetExtensionsForMime(mimeType string) ([]string, error) {
	cMimeType := C.CString(mimeType)
	defer C.free(unsafe.Pointer(cMimeType))

	ptr := C.kreuzberg_get_extensions_for_mime(cMimeType)
	// Read the FFI error state immediately after the call.
	if err := lastError(); err != nil {
		// Release any string that came back alongside the error so it is
		// not leaked.
		if ptr != nil {
			C.kreuzberg_free_string(ptr)
		}
		return nil, err
	}
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_free_string(ptr)

	var result []string
	if err := json.Unmarshal([]byte(C.GoString(ptr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return result, nil
}
// ListEmbeddingBackends lists the names of all registered embedding backends.
//
// The names are decoded from the JSON array produced by the native layer. The
// same native function is used by `kreuzberg-cli`, the api/mcp endpoints, and
// the generated language bindings.
//
// An error is returned when the native call reports one, when it yields no
// result, or when the result is not valid JSON.
func ListEmbeddingBackends() ([]string, error) {
	ptr := C.kreuzberg_list_embedding_backends()
	// Read the FFI error state immediately after the call.
	if err := lastError(); err != nil {
		// Release any string that came back alongside the error so it is
		// not leaked.
		if ptr != nil {
			C.kreuzberg_free_string(ptr)
		}
		return nil, err
	}
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_free_string(ptr)

	var result []string
	if err := json.Unmarshal([]byte(C.GoString(ptr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return result, nil
}
// ListDocumentExtractors lists the names of all registered document
// extractors.
//
// The names are decoded from the JSON array produced by the native layer.
//
// An error is returned when the native call reports one, when it yields no
// result, or when the result is not valid JSON.
func ListDocumentExtractors() ([]string, error) {
	ptr := C.kreuzberg_list_document_extractors()
	// Read the FFI error state immediately after the call.
	if err := lastError(); err != nil {
		// Release any string that came back alongside the error so it is
		// not leaked.
		if ptr != nil {
			C.kreuzberg_free_string(ptr)
		}
		return nil, err
	}
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_free_string(ptr)

	var result []string
	if err := json.Unmarshal([]byte(C.GoString(ptr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return result, nil
}
// ListOcrBackends lists all registered OCR backends.
//
// It returns the names of all OCR backends currently registered in the global
// registry, decoded from the JSON array produced by the native layer.
//
// An error is returned when the native call reports one, when it yields no
// result, or when the result is not valid JSON.
//
// Example:
//
//	backends, err := ListOcrBackends()
//	if err != nil {
//		return err
//	}
//	for _, name := range backends {
//		fmt.Println("Registered OCR backend:", name)
//	}
func ListOcrBackends() ([]string, error) {
	ptr := C.kreuzberg_list_ocr_backends()
	// Read the FFI error state immediately after the call.
	if err := lastError(); err != nil {
		// Release any string that came back alongside the error so it is
		// not leaked.
		if ptr != nil {
			C.kreuzberg_free_string(ptr)
		}
		return nil, err
	}
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_free_string(ptr)

	var result []string
	if err := json.Unmarshal([]byte(C.GoString(ptr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return result, nil
}
// ListPostProcessors lists all registered post-processor names.
//
// It returns the names of all post-processors currently registered in the
// global registry, decoded from the JSON array produced by the native layer.
//
// An error is returned when the native call reports one (the Rust API
// documents a failure if the registry lock is poisoned), when it yields no
// result, or when the result is not valid JSON.
//
// Example:
//
//	processors, err := ListPostProcessors()
//	if err != nil {
//		return err
//	}
//	for _, name := range processors {
//		fmt.Println("Registered post-processor:", name)
//	}
func ListPostProcessors() ([]string, error) {
	ptr := C.kreuzberg_list_post_processors()
	// Read the FFI error state immediately after the call.
	if err := lastError(); err != nil {
		// Release any string that came back alongside the error so it is
		// not leaked.
		if ptr != nil {
			C.kreuzberg_free_string(ptr)
		}
		return nil, err
	}
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_free_string(ptr)

	var result []string
	if err := json.Unmarshal([]byte(C.GoString(ptr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return result, nil
}
// ListRenderers lists the names of all registered renderers.
//
// The names are decoded from the JSON array produced by the native layer.
//
// An error is returned when the native call reports one (the Rust API
// documents a failure if the registry lock is poisoned), when it yields no
// result, or when the result is not valid JSON.
func ListRenderers() ([]string, error) {
	ptr := C.kreuzberg_list_renderers()
	// Read the FFI error state immediately after the call.
	if err := lastError(); err != nil {
		// Release any string that came back alongside the error so it is
		// not leaked.
		if ptr != nil {
			C.kreuzberg_free_string(ptr)
		}
		return nil, err
	}
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_free_string(ptr)

	var result []string
	if err := json.Unmarshal([]byte(C.GoString(ptr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return result, nil
}
// ListValidators lists the names of all registered validators.
//
// The names are decoded from the JSON array produced by the native layer.
//
// An error is returned when the native call reports one, when it yields no
// result, or when the result is not valid JSON.
func ListValidators() ([]string, error) {
	ptr := C.kreuzberg_list_validators()
	// Read the FFI error state immediately after the call.
	if err := lastError(); err != nil {
		// Release any string that came back alongside the error so it is
		// not leaked.
		if ptr != nil {
			C.kreuzberg_free_string(ptr)
		}
		return nil, err
	}
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_free_string(ptr)

	var result []string
	if err := json.Unmarshal([]byte(C.GoString(ptr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return result, nil
}
// Compare compares two extraction results and returns a structured diff.
//
// Per the Rust API documentation, the comparison is purely structural (no I/O,
// no side effects) and all fields of the returned ExtractionDiff are populated
// according to the provided DiffOptions.
//
// Parameters:
//   - a: the "before" extraction result
//   - b: the "after" extraction result
//   - opts: controls which sections are compared and optional truncation
//
// All three arguments are sent to the native layer as JSON and rebuilt there
// as native objects before the comparison runs.
//
// An error is returned when an argument cannot be marshaled or is rejected by
// the native layer, when the native comparison yields no diff handle, or when
// the diff cannot be converted to or decoded from JSON.
//
// Example:
//
//	diff, err := Compare(before, after, opts)
func Compare(a ExtractionResult, b ExtractionResult, opts DiffOptions) (*ExtractionDiff, error) {
	// Rebuild the "before" result on the native side from its JSON form.
	aJSON, err := json.Marshal(a)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal: %w", err)
	}
	// The FFI's _from_json constructor rejects "null"; send "{}" so that a
	// default instance is built instead.
	if string(aJSON) == "null" {
		aJSON = []byte("{}")
	}
	cAJSON := C.CString(string(aJSON))
	ca := C.kreuzberg_extraction_result_from_json(cAJSON)
	C.free(unsafe.Pointer(cAJSON))
	if ca == nil {
		return nil, fmt.Errorf("failed to create extraction_result: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_extraction_result_free(ca)

	// Same for the "after" result.
	bJSON, err := json.Marshal(b)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal: %w", err)
	}
	if string(bJSON) == "null" {
		bJSON = []byte("{}")
	}
	cBJSON := C.CString(string(bJSON))
	cb := C.kreuzberg_extraction_result_from_json(cBJSON)
	C.free(unsafe.Pointer(cBJSON))
	if cb == nil {
		return nil, fmt.Errorf("failed to create extraction_result: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_extraction_result_free(cb)

	// And for the diff options.
	optsJSON, err := json.Marshal(opts)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal: %w", err)
	}
	if string(optsJSON) == "null" {
		optsJSON = []byte("{}")
	}
	cOptsJSON := C.CString(string(optsJSON))
	cOpts := C.kreuzberg_diff_options_from_json(cOptsJSON)
	C.free(unsafe.Pointer(cOptsJSON))
	if cOpts == nil {
		return nil, fmt.Errorf("failed to create diff_options: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_diff_options_free(cOpts)

	ptr := C.kreuzberg_compare(ca, cb, cOpts)
	if ptr == nil {
		// No diff handle came back: surface the native error if one was
		// recorded, otherwise report a generic failure. This also keeps a
		// nil handle away from the free/to_json functions.
		if err := lastError(); err != nil {
			return nil, err
		}
		return nil, fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_extraction_diff_free(ptr)

	jsonPtr := C.kreuzberg_extraction_diff_to_json(ptr)
	if jsonPtr == nil {
		return nil, fmt.Errorf("failed to convert to JSON")
	}
	defer C.kreuzberg_free_string(jsonPtr)

	var result ExtractionDiff
	if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return &result, nil
}
// EmbedTextsAsync generates embeddings for a list of text strings through the
// native library's asynchronous embedding entry point.
//
// According to the upstream documentation this is the async counterpart to
// embed_texts: the blocking ONNX inference work is offloaded to a dedicated
// blocking thread pool via Tokio's spawn_blocking. From Go the call is still
// synchronous: it returns only once the FFI call has produced its result.
//
// Returns one embedding vector per input text in the same order.
//
// Arguments:
//   - texts: strings to embed; handed to the FFI as a JSON array
//   - config: embedding configuration specifying model, batch size, and normalization
//
// Errors are returned when:
//   - an argument cannot be marshalled to JSON, or the native configuration
//     handle cannot be created from it
//   - the native call records an error (upstream documents
//     KreuzbergError::MissingDependency if ONNX Runtime is not installed, and
//     KreuzbergError::Embedding if the preset name is unknown, the model
//     download fails, or the blocking inference task panics)
//   - the native call yields no result, or a result that is not valid JSON
//
// Example (Rust core API):
//
//	let embeddings = embed_texts_async(
//	    vec!["Hello!".to_string()],
//	    &EmbeddingConfig::default(),
//	).await?;
func EmbedTextsAsync(texts []string, config EmbeddingConfig) ([][]float32, error) {
	jsonBytescTexts, err := json.Marshal(texts)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal: %w", err)
	}
	cTexts := C.CString(string(jsonBytescTexts))
	defer C.free(unsafe.Pointer(cTexts))
	jsonBytescConfig, err := json.Marshal(config)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal: %w", err)
	}
	// json.Marshal emits "null" for a nil value, which the FFI's _from_json
	// rejects. Substitute "{}" so a default instance is constructed instead.
	if string(jsonBytescConfig) == "null" {
		jsonBytescConfig = []byte("{}")
	}
	tmpStrcConfig := C.CString(string(jsonBytescConfig))
	cConfig := C.kreuzberg_embedding_config_from_json(tmpStrcConfig)
	C.free(unsafe.Pointer(tmpStrcConfig))
	if cConfig == nil {
		return nil, fmt.Errorf("failed to create embedding_config: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_embedding_config_free(cConfig)
	ptr := C.kreuzberg_embed_texts_async(cTexts, cConfig)
	if err := lastError(); err != nil {
		// Release any string the FFI returned alongside the error so it is
		// not leaked on this early return.
		if ptr != nil {
			C.kreuzberg_free_string(ptr)
		}
		return nil, err
	}
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_free_string(ptr)
	var result [][]float32
	if err := json.Unmarshal([]byte(C.GoString(ptr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return result, nil
}
// RenderPdfPageToPng renders a single PDF page to PNG bytes.
//
// Returns raw PNG-encoded bytes for the specified page at the given DPI.
// Upstream documents the renderer as pdf_oxide with tiny-skia (pure Rust).
//
// Arguments:
//   - pdfBytes: Raw PDF file bytes
//   - pageIndex: Zero-based page index
//   - dpi: Resolution in dots per inch; nil selects the library default
//     (documented upstream as 150)
//   - password: Optional password for encrypted PDFs; nil passes a NULL
//     pointer to the FFI
//
// Errors: upstream documents KreuzbergError::Parsing if the PDF cannot be
// opened, authenticated, or rendered, or if the page index is out of range.
// A non-zero status from the native call always produces a non-nil error.
func RenderPdfPageToPng(pdfBytes []byte, pageIndex uint, dpi *int32, password *string) ([]byte, error) {
	var cPdfBytes *C.uint8_t
	if len(pdfBytes) > 0 {
		// Pin the backing array so its address stays valid for the native call.
		var cPdfBytesPinner runtime.Pinner
		cPdfBytesPinner.Pin(&pdfBytes[0])
		defer cPdfBytesPinner.Unpin()
		cPdfBytes = (*C.uint8_t)(unsafe.Pointer(&pdfBytes[0]))
	}
	cPdfBytesLen := C.uintptr_t(len(pdfBytes))
	cPageIndex := C.size_t(pageIndex)
	// math.MaxInt32 is sent when no DPI is supplied.
	// NOTE(review): presumably the FFI treats this sentinel as "use default" — confirm.
	cDpi := C.int32_t(2147483647)
	if dpi != nil {
		cDpi = C.int32_t(*dpi)
	}
	var cPassword *C.char
	if password != nil {
		cPassword = C.CString(*password)
		defer C.free(unsafe.Pointer(cPassword))
	}
	var outPtr *C.uint8_t
	var outLen, outCap C.uintptr_t
	rc := C.kreuzberg_render_pdf_page_to_png(cPdfBytes, cPdfBytesLen, cPageIndex, cDpi, cPassword, &outPtr, &outLen, &outCap)
	if rc != 0 {
		if err := lastError(); err != nil {
			return nil, err
		}
		// The call failed without recording an error; never report a
		// failure as (nil, nil).
		return nil, fmt.Errorf("failed to render PDF page: native call returned status %d", int(rc))
	}
	if outPtr == nil {
		return nil, lastError()
	}
	defer C.kreuzberg_free_bytes(outPtr, outLen, outCap)
	// Copy through unsafe.Slice rather than C.GoBytes: GoBytes takes a C.int
	// length, which would truncate buffers larger than 2 GiB.
	result := make([]byte, int(outLen))
	copy(result, unsafe.Slice((*byte)(unsafe.Pointer(outPtr)), int(outLen)))
	return result, nil
}
// DetectMimeType detects the MIME type of a file at the given path.
//
// Upstream documents that the file extension, and optionally the file content,
// are used to determine the MIME type.
//
// Arguments:
//   - path: path of the file to inspect
//   - checkExists: set to true to verify the file exists before detection
//
// An error is returned when the native call records an error or yields no
// result string.
func DetectMimeType(path string, checkExists bool) (string, error) {
	cPath := C.CString(path)
	defer C.free(unsafe.Pointer(cPath))
	// The FFI takes the flag as an int32: 1 for true, 0 for false.
	var cCheckExists C.int32_t
	if checkExists {
		cCheckExists = 1
	}
	ptr := C.kreuzberg_detect_mime_type(cPath, cCheckExists)
	if err := lastError(); err != nil {
		if ptr != nil {
			C.kreuzberg_free_string(ptr)
		}
		return "", err
	}
	// A nil result without a recorded error is still a failure; report it
	// instead of freeing a nil pointer and returning an empty MIME type.
	if ptr == nil {
		return "", fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_free_string(ptr)
	return C.GoString(ptr), nil
}
// EmbedTexts embeds a list of texts using the configured embedding model.
//
// Returns a two-dimensional slice where each inner slice is the embedding for
// the corresponding input text.
//
// Arguments:
//   - texts: strings to embed; handed to the FFI as a JSON array
//   - config: embedding configuration, converted to a native handle via JSON
//
// Errors are returned when an argument cannot be marshalled to JSON, the
// native configuration handle cannot be created, the native call records an
// error, or the native call yields no result or a result that is not valid JSON.
func EmbedTexts(texts []string, config EmbeddingConfig) ([][]float32, error) {
	jsonBytescTexts, err := json.Marshal(texts)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal: %w", err)
	}
	cTexts := C.CString(string(jsonBytescTexts))
	defer C.free(unsafe.Pointer(cTexts))
	jsonBytescConfig, err := json.Marshal(config)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal: %w", err)
	}
	// json.Marshal emits "null" for a nil value, which the FFI's _from_json
	// rejects. Substitute "{}" so a default instance is constructed instead.
	if string(jsonBytescConfig) == "null" {
		jsonBytescConfig = []byte("{}")
	}
	tmpStrcConfig := C.CString(string(jsonBytescConfig))
	cConfig := C.kreuzberg_embedding_config_from_json(tmpStrcConfig)
	C.free(unsafe.Pointer(tmpStrcConfig))
	if cConfig == nil {
		return nil, fmt.Errorf("failed to create embedding_config: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_embedding_config_free(cConfig)
	ptr := C.kreuzberg_embed_texts(cTexts, cConfig)
	if err := lastError(); err != nil {
		// Release any string the FFI returned alongside the error so it is
		// not leaked on this early return.
		if ptr != nil {
			C.kreuzberg_free_string(ptr)
		}
		return nil, err
	}
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_free_string(ptr)
	var result [][]float32
	if err := json.Unmarshal([]byte(C.GoString(ptr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return result, nil
}
// GetEmbeddingPreset gets an embedding preset by name.
//
// Returns nil if no preset with the given name exists, or if the preset
// cannot be converted to its Go representation. Upstream documents the native
// value as an owned clone, so the handle is released here once it has been
// serialised.
func GetEmbeddingPreset(name string) *EmbeddingPreset {
	cName := C.CString(name)
	defer C.free(unsafe.Pointer(cName))
	ptr := C.kreuzberg_get_embedding_preset(cName)
	if ptr == nil {
		return nil
	}
	// The handle is owned by this call; free it so it does not leak.
	defer C.kreuzberg_embedding_preset_free(ptr)
	jsonPtr := C.kreuzberg_embedding_preset_to_json(ptr)
	if jsonPtr == nil {
		return nil
	}
	defer C.kreuzberg_free_string(jsonPtr)
	var result EmbeddingPreset
	if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
		return nil
	}
	return &result
}
// ListEmbeddingPresets lists the names of all available embedding presets.
//
// The native call hands back a JSON array that is decoded into a Go slice.
// nil is returned when the native call yields no string or when that string
// cannot be decoded.
func ListEmbeddingPresets() []string {
	raw := C.kreuzberg_list_embedding_presets()
	if raw == nil {
		return nil
	}
	defer C.kreuzberg_free_string(raw)
	var names []string
	if json.Unmarshal([]byte(C.GoString(raw)), &names) != nil {
		return nil
	}
	return names
}
// NeedsImageProcessing reports whether image processing is needed, based on
// the OCR and image extraction settings.
//
// Upstream documents the result as true if either OCR is enabled or image
// extraction is configured, and false if both are disabled, which lets
// text-only extraction workflows skip image decompression.
//
// # Optimization Impact
//
// Upstream states that for text-only extractions (no OCR, no image
// extraction), skipping image decompression can improve CPU utilization by
// 5-10% by avoiding image I/O and processing whose results won't be used.
//
// An error is returned when the receiver cannot be marshalled to JSON or the
// native handle cannot be built from that JSON.
func (r *ExtractionConfig) NeedsImageProcessing() (bool, error) {
	payload, err := json.Marshal(r)
	if err != nil {
		return false, fmt.Errorf("failed to marshal receiver: %w", err)
	}
	// The C copy of the JSON is only needed while the native handle is built.
	cJSON := C.CString(string(payload))
	handle := C.kreuzberg_extraction_config_from_json(cJSON)
	C.free(unsafe.Pointer(cJSON))
	if handle == nil {
		reason := C.GoString(C.kreuzberg_last_error_context())
		return false, fmt.Errorf("failed to create receiver: %s", reason)
	}
	defer C.kreuzberg_extraction_config_free(handle)
	return C.kreuzberg_extraction_config_needs_image_processing(handle) != 0, nil
}
// ListenAddr gets the server listen address (host:port).
//
// An error is returned when the receiver cannot be marshalled to JSON, the
// native handle cannot be built from that JSON, or the native call yields no
// address string.
//
// Example (Rust core API):
//
//	let config = ServerConfig::default();
//	assert_eq!(config.listen_addr(), "127.0.0.1:8000");
func (r *ServerConfig) ListenAddr() (string, error) {
	jsonBytesRecv, err := json.Marshal(r)
	if err != nil {
		return "", fmt.Errorf("failed to marshal receiver: %w", err)
	}
	tmpStrRecv := C.CString(string(jsonBytesRecv))
	cRecv := C.kreuzberg_server_config_from_json(tmpStrRecv)
	C.free(unsafe.Pointer(tmpStrRecv))
	if cRecv == nil {
		return "", fmt.Errorf("failed to create receiver: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_server_config_free(cRecv)
	ptr := C.kreuzberg_server_config_listen_addr(cRecv)
	// Report a missing result instead of freeing a nil pointer and returning
	// an empty address with a nil error.
	if ptr == nil {
		return "", fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_free_string(ptr)
	return C.GoString(ptr), nil
}
// CorsAllowsAll reports whether CORS allows all origins.
//
// Upstream documents the result as true if the cors_origins vector is empty,
// meaning all origins are allowed, and false if specific origins are
// configured.
//
// An error is returned when the receiver cannot be marshalled to JSON or the
// native handle cannot be built from that JSON.
//
// Example (Rust core API):
//
//	let mut config = ServerConfig::default();
//	assert!(config.cors_allows_all());
//
//	config.cors_origins.push("https://example.com".to_string());
//	assert!(!config.cors_allows_all());
func (r *ServerConfig) CorsAllowsAll() (bool, error) {
	payload, err := json.Marshal(r)
	if err != nil {
		return false, fmt.Errorf("failed to marshal receiver: %w", err)
	}
	// The C copy of the JSON is only needed while the native handle is built.
	cJSON := C.CString(string(payload))
	handle := C.kreuzberg_server_config_from_json(cJSON)
	C.free(unsafe.Pointer(cJSON))
	if handle == nil {
		reason := C.GoString(C.kreuzberg_last_error_context())
		return false, fmt.Errorf("failed to create receiver: %s", reason)
	}
	defer C.kreuzberg_server_config_free(handle)
	return C.kreuzberg_server_config_cors_allows_all(handle) != 0, nil
}
// IsOriginAllowed reports whether a given origin is allowed by the CORS
// configuration.
//
// Upstream documents the result as true if:
//   - CORS allows all origins (empty origins list), or
//   - the given origin is in the allowed origins list
//
// Arguments:
//   - origin: The origin to check (e.g., "https://example.com")
//
// An error is returned when the receiver cannot be marshalled to JSON or the
// native handle cannot be built from that JSON.
//
// Example (Rust core API):
//
//	let mut config = ServerConfig::default();
//	assert!(config.is_origin_allowed("https://example.com"));
//
//	config.cors_origins.push("https://allowed.com".to_string());
//	assert!(config.is_origin_allowed("https://allowed.com"));
//	assert!(!config.is_origin_allowed("https://denied.com"));
func (r *ServerConfig) IsOriginAllowed(origin string) (bool, error) {
	payload, err := json.Marshal(r)
	if err != nil {
		return false, fmt.Errorf("failed to marshal receiver: %w", err)
	}
	// The C copy of the JSON is only needed while the native handle is built.
	cJSON := C.CString(string(payload))
	handle := C.kreuzberg_server_config_from_json(cJSON)
	C.free(unsafe.Pointer(cJSON))
	if handle == nil {
		reason := C.GoString(C.kreuzberg_last_error_context())
		return false, fmt.Errorf("failed to create receiver: %s", reason)
	}
	defer C.kreuzberg_server_config_free(handle)
	cOrigin := C.CString(origin)
	defer C.free(unsafe.Pointer(cOrigin))
	return C.kreuzberg_server_config_is_origin_allowed(handle, cOrigin) != 0, nil
}
// MaxRequestBodyMb gets the maximum request body size in megabytes; upstream
// documents the value as rounded up.
//
// An error is returned when the receiver cannot be marshalled to JSON or the
// native handle cannot be built from that JSON.
//
// Example (Rust core API):
//
//	let mut config = ServerConfig::default();
//	assert_eq!(config.max_request_body_mb(), 100);
func (r *ServerConfig) MaxRequestBodyMb() (uint, error) {
	payload, err := json.Marshal(r)
	if err != nil {
		return 0, fmt.Errorf("failed to marshal receiver: %w", err)
	}
	// The C copy of the JSON is only needed while the native handle is built.
	cJSON := C.CString(string(payload))
	handle := C.kreuzberg_server_config_from_json(cJSON)
	C.free(unsafe.Pointer(cJSON))
	if handle == nil {
		reason := C.GoString(C.kreuzberg_last_error_context())
		return 0, fmt.Errorf("failed to create receiver: %s", reason)
	}
	defer C.kreuzberg_server_config_free(handle)
	return uint(C.kreuzberg_server_config_max_request_body_mb(handle)), nil
}
// MaxMultipartFieldMb gets the maximum multipart field size in megabytes;
// upstream documents the value as rounded up.
//
// An error is returned when the receiver cannot be marshalled to JSON or the
// native handle cannot be built from that JSON.
//
// Example (Rust core API):
//
//	let mut config = ServerConfig::default();
//	assert_eq!(config.max_multipart_field_mb(), 100);
func (r *ServerConfig) MaxMultipartFieldMb() (uint, error) {
	payload, err := json.Marshal(r)
	if err != nil {
		return 0, fmt.Errorf("failed to marshal receiver: %w", err)
	}
	// The C copy of the JSON is only needed while the native handle is built.
	cJSON := C.CString(string(payload))
	handle := C.kreuzberg_server_config_from_json(cJSON)
	C.free(unsafe.Pointer(cJSON))
	if handle == nil {
		reason := C.GoString(C.kreuzberg_last_error_context())
		return 0, fmt.Errorf("failed to create receiver: %s", reason)
	}
	defer C.kreuzberg_server_config_free(handle)
	return uint(C.kreuzberg_server_config_max_multipart_field_mb(handle)), nil
}
// FinalizeNodeTypes computes and populates the node_types field from the
// current nodes.
//
// Call this after all nodes have been added to the structure. Upstream notes
// that internal construction paths (builder, derivation) call this
// automatically.
//
// The receiver is serialised to JSON, rebuilt as a native handle, finalised
// there, and the updated structure is decoded back into the receiver.
//
// An error is returned when the receiver cannot be marshalled, the native
// handle cannot be created, or the updated structure cannot be converted back
// from JSON.
//
// Example (Rust core API):
//
//	let mut structure = DocumentStructure {
//	    nodes: vec![DocumentNode {
//	        id: NodeId::from("n1"),
//	        content: NodeContent::Paragraph { text: "Hello".into() },
//	        parent: None,
//	        children: vec![],
//	        content_layer: Default::default(),
//	        page: None,
//	        page_end: None,
//	        bbox: None,
//	        annotations: vec![],
//	        attributes: None,
//	    }],
//	    source_format: None,
//	    relationships: vec![],
//	    node_types: vec![],
//	};
//	structure.finalize_node_types();
//	assert!(structure.node_types.contains(&"paragraph".to_string()));
func (r *DocumentStructure) FinalizeNodeTypes() error {
	jsonBytesRecv, err := json.Marshal(r)
	if err != nil {
		return fmt.Errorf("failed to marshal receiver: %w", err)
	}
	tmpStrRecv := C.CString(string(jsonBytesRecv))
	cRecv := C.kreuzberg_document_structure_from_json(tmpStrRecv)
	C.free(unsafe.Pointer(tmpStrRecv))
	if cRecv == nil {
		return fmt.Errorf("failed to create receiver: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_document_structure_free(cRecv)
	C.kreuzberg_document_structure_finalize_node_types(cRecv)
	jsonPtrUpdated := C.kreuzberg_document_structure_to_json(cRecv)
	// Without the serialised result the receiver cannot be updated, so report
	// the failure rather than returning nil with node_types left untouched.
	if jsonPtrUpdated == nil {
		return fmt.Errorf("failed to convert to JSON")
	}
	defer C.kreuzberg_free_string(jsonPtrUpdated)
	if err := json.Unmarshal([]byte(C.GoString(jsonPtrUpdated)), r); err != nil {
		return fmt.Errorf("failed to unmarshal: %w", err)
	}
	return nil
}
// IsEmpty reports whether the document structure is empty.
//
// An error is returned when the receiver cannot be marshalled to JSON or the
// native handle cannot be built from that JSON.
func (r *DocumentStructure) IsEmpty() (bool, error) {
	payload, err := json.Marshal(r)
	if err != nil {
		return false, fmt.Errorf("failed to marshal receiver: %w", err)
	}
	// The C copy of the JSON is only needed while the native handle is built.
	cJSON := C.CString(string(payload))
	handle := C.kreuzberg_document_structure_from_json(cJSON)
	C.free(unsafe.Pointer(cJSON))
	if handle == nil {
		reason := C.GoString(C.kreuzberg_last_error_context())
		return false, fmt.Errorf("failed to create receiver: %s", reason)
	}
	defer C.kreuzberg_document_structure_free(handle)
	return C.kreuzberg_document_structure_is_empty(handle) != 0, nil
}
// ExtractionResultFromOcr converts an OCR result into an ExtractionResult.
//
// The OCR result is serialised to JSON, rebuilt as a native handle, converted
// by the FFI, and the converted value is decoded back into a new Go value.
//
// An error is returned when the argument cannot be marshalled, the native OCR
// handle cannot be created, the conversion yields no result, or the converted
// result cannot be decoded from JSON. A nil error always comes with a non-nil
// result.
func ExtractionResultFromOcr(ocr OcrExtractionResult) (*ExtractionResult, error) {
	jsonBytescOcr, err := json.Marshal(ocr)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal: %w", err)
	}
	// json.Marshal emits "null" for a nil value, which the FFI's _from_json
	// rejects. Substitute "{}" so a default instance is constructed instead.
	if string(jsonBytescOcr) == "null" {
		jsonBytescOcr = []byte("{}")
	}
	tmpStrcOcr := C.CString(string(jsonBytescOcr))
	cOcr := C.kreuzberg_ocr_extraction_result_from_json(tmpStrcOcr)
	C.free(unsafe.Pointer(tmpStrcOcr))
	if cOcr == nil {
		return nil, fmt.Errorf("failed to create ocr_extraction_result: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_ocr_extraction_result_free(cOcr)
	ptr := C.kreuzberg_extraction_result_from_ocr(cOcr)
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_extraction_result_free(ptr)
	jsonPtr := C.kreuzberg_extraction_result_to_json(ptr)
	if jsonPtr == nil {
		return nil, fmt.Errorf("failed to convert to JSON")
	}
	defer C.kreuzberg_free_string(jsonPtr)
	var result ExtractionResult
	if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return &result, nil
}
// IsEmpty reports whether the metadata is empty. Upstream documents this as
// true when no metadata fields, format-specific metadata, or additional
// postprocessor fields are populated.
//
// An error is returned when the receiver cannot be marshalled to JSON or the
// native handle cannot be built from that JSON.
func (r *Metadata) IsEmpty() (bool, error) {
	payload, err := json.Marshal(r)
	if err != nil {
		return false, fmt.Errorf("failed to marshal receiver: %w", err)
	}
	// The C copy of the JSON is only needed while the native handle is built.
	cJSON := C.CString(string(payload))
	handle := C.kreuzberg_metadata_from_json(cJSON)
	C.free(unsafe.Pointer(cJSON))
	if handle == nil {
		reason := C.GoString(C.kreuzberg_last_error_context())
		return false, fmt.Errorf("failed to create receiver: %s", reason)
	}
	defer C.kreuzberg_metadata_free(handle)
	return C.kreuzberg_metadata_is_empty(handle) != 0, nil
}
// WithCacheDir returns a copy of the configuration with a custom cache
// directory for model files. The receiver itself is not modified.
//
// Arguments:
//   - path: Path to cache directory
//
// An error is returned when the receiver cannot be marshalled, the native
// handle cannot be created, the native call yields no result, or the updated
// configuration cannot be decoded from JSON. A nil error always comes with a
// non-nil result.
//
// Example (Rust core API):
//
//	let config = PaddleOcrConfig::new("en")
//	    .with_cache_dir(PathBuf::from("/tmp/paddle-cache"));
func (r *PaddleOcrConfig) WithCacheDir(path string) (*PaddleOcrConfig, error) {
	cPath := C.CString(path)
	defer C.free(unsafe.Pointer(cPath))
	jsonBytesRecv, err := json.Marshal(r)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal receiver: %w", err)
	}
	tmpStrRecv := C.CString(string(jsonBytesRecv))
	cRecv := C.kreuzberg_paddle_ocr_config_from_json(tmpStrRecv)
	C.free(unsafe.Pointer(tmpStrRecv))
	if cRecv == nil {
		return nil, fmt.Errorf("failed to create receiver: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_paddle_ocr_config_free(cRecv)
	ptr := C.kreuzberg_paddle_ocr_config_with_cache_dir(cRecv, cPath)
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_paddle_ocr_config_free(ptr)
	// Surface conversion failures as errors instead of a nil result with a
	// nil error.
	jsonPtr := C.kreuzberg_paddle_ocr_config_to_json(ptr)
	if jsonPtr == nil {
		return nil, fmt.Errorf("failed to convert to JSON")
	}
	defer C.kreuzberg_free_string(jsonPtr)
	var result PaddleOcrConfig
	if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return &result, nil
}
// WithTableDetection returns a copy of the configuration with table structure
// detection enabled or disabled. The receiver itself is not modified.
//
// Arguments:
//   - enable: Whether to enable table detection
//
// An error is returned when the receiver cannot be marshalled, the native
// handle cannot be created, the native call yields no result, or the updated
// configuration cannot be decoded from JSON. A nil error always comes with a
// non-nil result.
//
// Example (Rust core API):
//
//	let config = PaddleOcrConfig::new("en")
//	    .with_table_detection(true);
func (r *PaddleOcrConfig) WithTableDetection(enable bool) (*PaddleOcrConfig, error) {
	// The FFI takes the flag as an int32: 1 for true, 0 for false.
	var cEnable C.int32_t
	if enable {
		cEnable = 1
	}
	jsonBytesRecv, err := json.Marshal(r)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal receiver: %w", err)
	}
	tmpStrRecv := C.CString(string(jsonBytesRecv))
	cRecv := C.kreuzberg_paddle_ocr_config_from_json(tmpStrRecv)
	C.free(unsafe.Pointer(tmpStrRecv))
	if cRecv == nil {
		return nil, fmt.Errorf("failed to create receiver: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_paddle_ocr_config_free(cRecv)
	ptr := C.kreuzberg_paddle_ocr_config_with_table_detection(cRecv, cEnable)
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_paddle_ocr_config_free(ptr)
	// Surface conversion failures as errors instead of a nil result with a
	// nil error.
	jsonPtr := C.kreuzberg_paddle_ocr_config_to_json(ptr)
	if jsonPtr == nil {
		return nil, fmt.Errorf("failed to convert to JSON")
	}
	defer C.kreuzberg_free_string(jsonPtr)
	var result PaddleOcrConfig
	if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return &result, nil
}
// WithAngleCls returns a copy of the configuration with angle classification
// for rotated text enabled or disabled. The receiver itself is not modified.
//
// Arguments:
//   - enable: Whether to enable angle classification
//
// An error is returned when the receiver cannot be marshalled, the native
// handle cannot be created, the native call yields no result, or the updated
// configuration cannot be decoded from JSON. A nil error always comes with a
// non-nil result.
func (r *PaddleOcrConfig) WithAngleCls(enable bool) (*PaddleOcrConfig, error) {
	// The FFI takes the flag as an int32: 1 for true, 0 for false.
	var cEnable C.int32_t
	if enable {
		cEnable = 1
	}
	jsonBytesRecv, err := json.Marshal(r)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal receiver: %w", err)
	}
	tmpStrRecv := C.CString(string(jsonBytesRecv))
	cRecv := C.kreuzberg_paddle_ocr_config_from_json(tmpStrRecv)
	C.free(unsafe.Pointer(tmpStrRecv))
	if cRecv == nil {
		return nil, fmt.Errorf("failed to create receiver: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_paddle_ocr_config_free(cRecv)
	ptr := C.kreuzberg_paddle_ocr_config_with_angle_cls(cRecv, cEnable)
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_paddle_ocr_config_free(ptr)
	// Surface conversion failures as errors instead of a nil result with a
	// nil error.
	jsonPtr := C.kreuzberg_paddle_ocr_config_to_json(ptr)
	if jsonPtr == nil {
		return nil, fmt.Errorf("failed to convert to JSON")
	}
	defer C.kreuzberg_free_string(jsonPtr)
	var result PaddleOcrConfig
	if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return &result, nil
}
// WithDetDbThresh returns a copy of the configuration with the database
// threshold for text detection set. The receiver itself is not modified.
//
// Arguments:
//   - threshold: Detection threshold (0.0-1.0)
//
// An error is returned when the receiver cannot be marshalled, the native
// handle cannot be created, the native call yields no result, or the updated
// configuration cannot be decoded from JSON. A nil error always comes with a
// non-nil result.
func (r *PaddleOcrConfig) WithDetDbThresh(threshold float32) (*PaddleOcrConfig, error) {
	cThreshold := C.float(threshold)
	jsonBytesRecv, err := json.Marshal(r)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal receiver: %w", err)
	}
	tmpStrRecv := C.CString(string(jsonBytesRecv))
	cRecv := C.kreuzberg_paddle_ocr_config_from_json(tmpStrRecv)
	C.free(unsafe.Pointer(tmpStrRecv))
	if cRecv == nil {
		return nil, fmt.Errorf("failed to create receiver: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_paddle_ocr_config_free(cRecv)
	ptr := C.kreuzberg_paddle_ocr_config_with_det_db_thresh(cRecv, cThreshold)
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_paddle_ocr_config_free(ptr)
	// Surface conversion failures as errors instead of a nil result with a
	// nil error.
	jsonPtr := C.kreuzberg_paddle_ocr_config_to_json(ptr)
	if jsonPtr == nil {
		return nil, fmt.Errorf("failed to convert to JSON")
	}
	defer C.kreuzberg_free_string(jsonPtr)
	var result PaddleOcrConfig
	if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return &result, nil
}
// WithDetDbBoxThresh returns a copy of the configuration with the box
// threshold for text bounding box refinement set. The receiver itself is not
// modified.
//
// Arguments:
//   - threshold: Box threshold (0.0-1.0)
//
// An error is returned when the receiver cannot be marshalled, the native
// handle cannot be created, the native call yields no result, or the updated
// configuration cannot be decoded from JSON. A nil error always comes with a
// non-nil result.
func (r *PaddleOcrConfig) WithDetDbBoxThresh(threshold float32) (*PaddleOcrConfig, error) {
	cThreshold := C.float(threshold)
	jsonBytesRecv, err := json.Marshal(r)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal receiver: %w", err)
	}
	tmpStrRecv := C.CString(string(jsonBytesRecv))
	cRecv := C.kreuzberg_paddle_ocr_config_from_json(tmpStrRecv)
	C.free(unsafe.Pointer(tmpStrRecv))
	if cRecv == nil {
		return nil, fmt.Errorf("failed to create receiver: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_paddle_ocr_config_free(cRecv)
	ptr := C.kreuzberg_paddle_ocr_config_with_det_db_box_thresh(cRecv, cThreshold)
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_paddle_ocr_config_free(ptr)
	// Surface conversion failures as errors instead of a nil result with a
	// nil error.
	jsonPtr := C.kreuzberg_paddle_ocr_config_to_json(ptr)
	if jsonPtr == nil {
		return nil, fmt.Errorf("failed to convert to JSON")
	}
	defer C.kreuzberg_free_string(jsonPtr)
	var result PaddleOcrConfig
	if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return &result, nil
}
// WithDetDbUnclipRatio returns a copy of the configuration with the unclip
// ratio for expanding text bounding boxes set. The receiver itself is not
// modified.
//
// Arguments:
//   - ratio: Unclip ratio (typically 1.5-2.0)
//
// An error is returned when the receiver cannot be marshalled, the native
// handle cannot be created, the native call yields no result, or the updated
// configuration cannot be decoded from JSON. A nil error always comes with a
// non-nil result.
func (r *PaddleOcrConfig) WithDetDbUnclipRatio(ratio float32) (*PaddleOcrConfig, error) {
	cRatio := C.float(ratio)
	jsonBytesRecv, err := json.Marshal(r)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal receiver: %w", err)
	}
	tmpStrRecv := C.CString(string(jsonBytesRecv))
	cRecv := C.kreuzberg_paddle_ocr_config_from_json(tmpStrRecv)
	C.free(unsafe.Pointer(tmpStrRecv))
	if cRecv == nil {
		return nil, fmt.Errorf("failed to create receiver: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_paddle_ocr_config_free(cRecv)
	ptr := C.kreuzberg_paddle_ocr_config_with_det_db_unclip_ratio(cRecv, cRatio)
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_paddle_ocr_config_free(ptr)
	// Surface conversion failures as errors instead of a nil result with a
	// nil error.
	jsonPtr := C.kreuzberg_paddle_ocr_config_to_json(ptr)
	if jsonPtr == nil {
		return nil, fmt.Errorf("failed to convert to JSON")
	}
	defer C.kreuzberg_free_string(jsonPtr)
	var result PaddleOcrConfig
	if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return &result, nil
}
// WithDetLimitSideLen returns a copy of the configuration with the maximum
// side length for detection images set. The receiver itself is not modified.
//
// Arguments:
//   - length: Maximum side length in pixels
//
// An error is returned when the receiver cannot be marshalled, the native
// handle cannot be created, the native call yields no result, or the updated
// configuration cannot be decoded from JSON. A nil error always comes with a
// non-nil result.
func (r *PaddleOcrConfig) WithDetLimitSideLen(length uint32) (*PaddleOcrConfig, error) {
	cLength := C.uint32_t(length)
	jsonBytesRecv, err := json.Marshal(r)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal receiver: %w", err)
	}
	tmpStrRecv := C.CString(string(jsonBytesRecv))
	cRecv := C.kreuzberg_paddle_ocr_config_from_json(tmpStrRecv)
	C.free(unsafe.Pointer(tmpStrRecv))
	if cRecv == nil {
		return nil, fmt.Errorf("failed to create receiver: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_paddle_ocr_config_free(cRecv)
	ptr := C.kreuzberg_paddle_ocr_config_with_det_limit_side_len(cRecv, cLength)
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_paddle_ocr_config_free(ptr)
	// Surface conversion failures as errors instead of a nil result with a
	// nil error.
	jsonPtr := C.kreuzberg_paddle_ocr_config_to_json(ptr)
	if jsonPtr == nil {
		return nil, fmt.Errorf("failed to convert to JSON")
	}
	defer C.kreuzberg_free_string(jsonPtr)
	var result PaddleOcrConfig
	if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return &result, nil
}
// WithRecBatchNum returns a copy of the configuration with the batch size for
// recognition inference set. The receiver itself is not modified.
//
// Arguments:
//   - batchSize: Number of text regions to process simultaneously
//
// An error is returned when the receiver cannot be marshalled, the native
// handle cannot be created, the native call yields no result, or the updated
// configuration cannot be decoded from JSON. A nil error always comes with a
// non-nil result.
func (r *PaddleOcrConfig) WithRecBatchNum(batchSize uint32) (*PaddleOcrConfig, error) {
	cBatchSize := C.uint32_t(batchSize)
	jsonBytesRecv, err := json.Marshal(r)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal receiver: %w", err)
	}
	tmpStrRecv := C.CString(string(jsonBytesRecv))
	cRecv := C.kreuzberg_paddle_ocr_config_from_json(tmpStrRecv)
	C.free(unsafe.Pointer(tmpStrRecv))
	if cRecv == nil {
		return nil, fmt.Errorf("failed to create receiver: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_paddle_ocr_config_free(cRecv)
	ptr := C.kreuzberg_paddle_ocr_config_with_rec_batch_num(cRecv, cBatchSize)
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_paddle_ocr_config_free(ptr)
	// Surface conversion failures as errors instead of a nil result with a
	// nil error.
	jsonPtr := C.kreuzberg_paddle_ocr_config_to_json(ptr)
	if jsonPtr == nil {
		return nil, fmt.Errorf("failed to convert to JSON")
	}
	defer C.kreuzberg_free_string(jsonPtr)
	var result PaddleOcrConfig
	if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return &result, nil
}
// WithDropScore returns a copy of the configuration with the minimum
// recognition confidence threshold set. The receiver itself is not modified.
//
// Arguments:
//   - score: Minimum confidence (0.0-1.0), text below this is dropped
//
// An error is returned when the receiver cannot be marshalled, the native
// handle cannot be created, the native call yields no result, or the updated
// configuration cannot be decoded from JSON. A nil error always comes with a
// non-nil result.
func (r *PaddleOcrConfig) WithDropScore(score float32) (*PaddleOcrConfig, error) {
	cScore := C.float(score)
	jsonBytesRecv, err := json.Marshal(r)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal receiver: %w", err)
	}
	tmpStrRecv := C.CString(string(jsonBytesRecv))
	cRecv := C.kreuzberg_paddle_ocr_config_from_json(tmpStrRecv)
	C.free(unsafe.Pointer(tmpStrRecv))
	if cRecv == nil {
		return nil, fmt.Errorf("failed to create receiver: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_paddle_ocr_config_free(cRecv)
	ptr := C.kreuzberg_paddle_ocr_config_with_drop_score(cRecv, cScore)
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_paddle_ocr_config_free(ptr)
	// Surface conversion failures as errors instead of a nil result with a
	// nil error.
	jsonPtr := C.kreuzberg_paddle_ocr_config_to_json(ptr)
	if jsonPtr == nil {
		return nil, fmt.Errorf("failed to convert to JSON")
	}
	defer C.kreuzberg_free_string(jsonPtr)
	var result PaddleOcrConfig
	if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return &result, nil
}
// WithPadding returns a copy of the configuration with the padding, in
// pixels, that is added around images before detection. The receiver itself
// is not modified.
//
// Arguments:
//   - padding: Padding in pixels (0-100)
//
// An error is returned when the receiver cannot be marshalled, the native
// handle cannot be created, the native call yields no result, or the updated
// configuration cannot be decoded from JSON. A nil error always comes with a
// non-nil result.
func (r *PaddleOcrConfig) WithPadding(padding uint32) (*PaddleOcrConfig, error) {
	cPadding := C.uint32_t(padding)
	jsonBytesRecv, err := json.Marshal(r)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal receiver: %w", err)
	}
	tmpStrRecv := C.CString(string(jsonBytesRecv))
	cRecv := C.kreuzberg_paddle_ocr_config_from_json(tmpStrRecv)
	C.free(unsafe.Pointer(tmpStrRecv))
	if cRecv == nil {
		return nil, fmt.Errorf("failed to create receiver: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_paddle_ocr_config_free(cRecv)
	ptr := C.kreuzberg_paddle_ocr_config_with_padding(cRecv, cPadding)
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_paddle_ocr_config_free(ptr)
	// Surface conversion failures as errors instead of a nil result with a
	// nil error.
	jsonPtr := C.kreuzberg_paddle_ocr_config_to_json(ptr)
	if jsonPtr == nil {
		return nil, fmt.Errorf("failed to convert to JSON")
	}
	defer C.kreuzberg_free_string(jsonPtr)
	var result PaddleOcrConfig
	if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return &result, nil
}
// WithModelTier returns a copy of the configuration with the model tier,
// which controls detection/recognition model size, set. The receiver itself
// is not modified.
//
// Arguments:
//   - tier: "mobile" (default, lightweight, faster) or "server" (high
//     accuracy, GPU/complex documents)
//
// An error is returned when the receiver cannot be marshalled, the native
// handle cannot be created, the native call yields no result, or the updated
// configuration cannot be decoded from JSON. A nil error always comes with a
// non-nil result.
func (r *PaddleOcrConfig) WithModelTier(tier string) (*PaddleOcrConfig, error) {
	cTier := C.CString(tier)
	defer C.free(unsafe.Pointer(cTier))
	jsonBytesRecv, err := json.Marshal(r)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal receiver: %w", err)
	}
	tmpStrRecv := C.CString(string(jsonBytesRecv))
	cRecv := C.kreuzberg_paddle_ocr_config_from_json(tmpStrRecv)
	C.free(unsafe.Pointer(tmpStrRecv))
	if cRecv == nil {
		return nil, fmt.Errorf("failed to create receiver: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_paddle_ocr_config_free(cRecv)
	ptr := C.kreuzberg_paddle_ocr_config_with_model_tier(cRecv, cTier)
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_paddle_ocr_config_free(ptr)
	// Surface conversion failures as errors instead of a nil result with a
	// nil error.
	jsonPtr := C.kreuzberg_paddle_ocr_config_to_json(ptr)
	if jsonPtr == nil {
		return nil, fmt.Errorf("failed to convert to JSON")
	}
	defer C.kreuzberg_free_string(jsonPtr)
	var result PaddleOcrConfig
	if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return &result, nil
}