fil/packages/go/v5/binding.go

// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef

// Package kreuzberg provides Go bindings for the kreuzberg library.
package kreuzberg

/*
#cgo CFLAGS: -I${SRCDIR}/include
#cgo darwin,arm64 LDFLAGS: -L${SRCDIR}/.lib/macos-arm64 -Wl,-rpath,${SRCDIR}/.lib/macos-arm64 -lkreuzberg_ffi
#cgo darwin,amd64 LDFLAGS: -L${SRCDIR}/.lib/macos-amd64 -Wl,-rpath,${SRCDIR}/.lib/macos-amd64 -lkreuzberg_ffi
#cgo linux,amd64 LDFLAGS: -L${SRCDIR}/.lib/linux-amd64 -Wl,-rpath,${SRCDIR}/.lib/linux-amd64 -lkreuzberg_ffi
#cgo linux,arm64 LDFLAGS: -L${SRCDIR}/.lib/linux-arm64 -Wl,-rpath,${SRCDIR}/.lib/linux-arm64 -lkreuzberg_ffi
#cgo windows,amd64 LDFLAGS: -L${SRCDIR}/.lib/windows-amd64 -lkreuzberg_ffi
#include "kreuzberg.h"
*/
import "C"

import (
	"encoding/json"
	"errors"
	"fmt"
	"runtime"
	"unsafe"
)

// lastError reads the most recent error recorded by the FFI layer.
//
// It returns nil when the native error code is 0. Otherwise it returns an
// error formatted as "[code] message", where message is the native error
// context string, or the literal "native error" when no context is available.
func lastError() error {
	code := int32(C.kreuzberg_last_error_code())
	if code == 0 {
		return nil
	}

	// The context pointer is optional; fall back to a generic message without it.
	switch ctx := C.kreuzberg_last_error_context(); {
	case ctx == nil:
		return fmt.Errorf("[%d] native error", code)
	default:
		return fmt.Errorf("[%d] %s", code, C.GoString(ctx))
	}
}

// unmarshalBytes copies a C byte buffer into a freshly allocated Go []byte.
//
// The buffer is read as a NUL-terminated C string, so the copy stops at the
// first NUL byte. Binary payloads that may contain interior NULs should be
// exposed by the FFI with an explicit length out-parameter instead.
//
// A nil pointer yields a nil slice.
func unmarshalBytes(ptr *C.uint8_t) []byte {
	if ptr != nil {
		cstr := (*C.char)(unsafe.Pointer(ptr))
		return []byte(C.GoString(cstr))
	}
	return nil
}

// Ptr returns a pointer to a copy of v.
//
// It lets data DTOs populate optional (pointer-typed) fields inline, without
// the functional-options pattern boilerplate. For example:
//
//	&MyStruct{Field: Ptr("value"), OtherField: Ptr(42)}
//
// Each call yields a distinct pointer.
func Ptr[T any](v T) *T {
	p := new(T)
	*p = v
	return p
}

// Sentinel errors, one per error category. Compare with errors.Is.
var (
	// ErrIo is the sentinel error for IO failures.
	ErrIo = errors.New("IO error")
	// ErrParsing is the sentinel error for parsing failures.
	ErrParsing = errors.New("parsing error")
	// ErrOcr is the sentinel error for OCR failures.
	ErrOcr = errors.New("OCR error")
	// ErrValidation is the sentinel error for validation failures.
	ErrValidation = errors.New("validation error")
	// ErrCache is the sentinel error for cache failures.
	ErrCache = errors.New("cache error")
	// ErrImageProcessing is the sentinel error for image processing failures.
	ErrImageProcessing = errors.New("image processing error")
	// ErrSerialization is the sentinel error for serialization failures.
	ErrSerialization = errors.New("serialization error")
	// ErrMissingDependency is the sentinel error for a missing dependency.
	ErrMissingDependency = errors.New("missing dependency")
	// ErrPlugin is the sentinel error for plugin failures.
	//
	// NOTE(review): the message previously ended in a dangling "in", which
	// looked like a template whose placeholder was stripped — confirm against
	// the generator.
	ErrPlugin = errors.New("plugin error")
	// ErrLockPoisoned is the sentinel error for a poisoned lock.
	ErrLockPoisoned = errors.New("lock poisoned")
	// ErrUnsupportedFormat is the sentinel error for an unsupported format.
	ErrUnsupportedFormat = errors.New("unsupported format")
	// ErrEmbedding is the sentinel error for embedding failures.
	ErrEmbedding = errors.New("embedding error")
	// ErrTimeout is the sentinel error for an extraction that timed out.
	//
	// NOTE(review): the message previously read "after ms (limit: ms)" with
	// no numbers, which looked like stripped template placeholders — confirm
	// against the generator. A static sentinel cannot carry those values.
	ErrTimeout = errors.New("extraction timed out")
	// ErrCancelled is the sentinel error for a cancelled extraction.
	ErrCancelled = errors.New("extraction cancelled")
	// ErrSecurity is the sentinel error for a security violation.
	ErrSecurity = errors.New("security violation")
	// ErrOther is the sentinel error for any other failure.
	ErrOther = errors.New("other")
)

// Error is a structured error type that pairs an error code with a
// human-readable message.
type Error struct {
	Code    string
	Message string
}

// Error implements the error interface. It returns Message only; Code is not
// part of the returned string.
func (e Error) Error() string {
	return e.Message
}

// ExecutionProviderType is a string-backed enumeration of execution providers.
type ExecutionProviderType string

const (
	// ExecutionProviderTypeAuto auto-selects a provider: CoreML on macOS, CUDA on Linux, CPU elsewhere.
	ExecutionProviderTypeAuto ExecutionProviderType = "auto"
	// ExecutionProviderTypeCPU is the CPU execution provider (always available).
	ExecutionProviderTypeCPU ExecutionProviderType = "cpu"
	// ExecutionProviderTypeCoreMl is Apple CoreML (macOS/iOS Neural Engine + GPU).
	ExecutionProviderTypeCoreMl ExecutionProviderType = "core_ml"
	// ExecutionProviderTypeCuda is NVIDIA CUDA GPU acceleration.
	ExecutionProviderTypeCuda ExecutionProviderType = "cuda"
	// ExecutionProviderTypeTensorRt is NVIDIA TensorRT (optimized CUDA inference).
	ExecutionProviderTypeTensorRt ExecutionProviderType = "tensor_rt"
)

// OutputFormat is the output format for extraction results.
//
// Controls the format of the `content` field in `ExtractionResult`.
// When set to `Markdown`, `Djot`, or `Html`, the output uses that format.
// `Plain` returns the raw extracted text.
// `Structured` returns JSON with full OCR element data including bounding
// boxes and confidence scores.
type OutputFormat string

const (
	// OutputFormatPlain is plain text content only (default).
	OutputFormatPlain OutputFormat = "plain"
	// OutputFormatMarkdown is Markdown format.
	OutputFormatMarkdown OutputFormat = "markdown"
	// OutputFormatDjot is Djot markup format.
	OutputFormatDjot OutputFormat = "djot"
	// OutputFormatHTML is HTML format.
	OutputFormatHTML OutputFormat = "html"
	// OutputFormatJSON is JSON tree format with heading-driven sections.
	OutputFormatJSON OutputFormat = "json"
	// OutputFormatStructured is structured JSON format with full OCR element metadata.
	OutputFormatStructured OutputFormat = "structured"
)

// HTMLTheme is a string-backed enumeration of HTML themes.
type HTMLTheme string

const (
	// HTMLThemeDefault provides sensible defaults: system font stack, neutral colours, readable line
	// measure. CSS custom properties (`--kb-*`) are all defined so user CSS
	// can override individual values.
	HTMLThemeDefault HTMLTheme = "default"
	// HTMLThemeGitHub is a GitHub Markdown-inspired palette and spacing.
	HTMLThemeGitHub HTMLTheme = "git_hub"
	// HTMLThemeDark is a dark background with light text.
	HTMLThemeDark HTMLTheme = "dark"
	// HTMLThemeLight is a minimal light theme with generous whitespace.
	HTMLThemeLight HTMLTheme = "light"
	// HTMLThemeUnstyled emits no built-in stylesheet. CSS custom properties are still defined
	// on `:root` so user stylesheets can reference `var(--kb-*)` tokens.
	HTMLThemeUnstyled HTMLTheme = "unstyled"
)

// TableModel is a string-backed enumeration of table structure models.
type TableModel string

const (
	// TableModelTatr is TATR (Table Transformer) -- default, 30MB, DETR-based row/column detection.
	TableModelTatr TableModel = "tatr"
	// TableModelSlanetWired is the SLANeXT wired variant -- 365MB, optimized for bordered tables.
	TableModelSlanetWired TableModel = "slanet_wired"
	// TableModelSlanetWireless is the SLANeXT wireless variant -- 365MB, optimized for borderless tables.
	TableModelSlanetWireless TableModel = "slanet_wireless"
	// TableModelSlanetPlus is SLANet-plus -- 7.78MB, lightweight general-purpose.
	TableModelSlanetPlus TableModel = "slanet_plus"
	// TableModelSlanetAuto is classifier-routed SLANeXT: auto-select wired/wireless per table.
	// Uses PP-LCNet classifier (6.78MB) + both SLANeXT variants (730MB total).
	TableModelSlanetAuto TableModel = "slanet_auto"
	// TableModelDisabled disables table structure model inference entirely; use heuristic path only.
	TableModelDisabled TableModel = "disabled"
)

// ChunkerType is a string-backed enumeration of chunker types.
type ChunkerType string

const (
	// ChunkerTypeText is the Text variant of ChunkerType.
	ChunkerTypeText ChunkerType = "text"
	// ChunkerTypeMarkdown is the Markdown variant of ChunkerType.
	ChunkerTypeMarkdown ChunkerType = "markdown"
	// ChunkerTypeYaml is the Yaml variant of ChunkerType.
	ChunkerTypeYaml ChunkerType = "yaml"
	// ChunkerTypeSemantic is the Semantic variant of ChunkerType.
	ChunkerTypeSemantic ChunkerType = "semantic"
)

// ChunkSizing describes how chunk size is measured.
//
// Defaults to `Characters` (Unicode character count). When using token-based sizing,
// chunks are sized by token count according to the specified tokenizer.
//
// Token-based sizing uses HuggingFace tokenizers loaded at runtime. Any tokenizer
// available on HuggingFace Hub can be used, including OpenAI-compatible tokenizers
// (e.g., `Xenova/gpt-4o`, `Xenova/cl100k_base`).
//
// Variants: Characters, Tokenizer.
//
// ChunkSizing is a sealed interface — use one of ChunkSizingCharacters,
// ChunkSizingTokenizer. Decode JSON with UnmarshalChunkSizing.
type ChunkSizing interface {
	// isChunkSizing is an unexported marker method identifying variant types.
	isChunkSizing()
	// Type returns the variant's discriminator string, which the variants'
	// MarshalJSON methods write under the JSON "type" key.
	Type() string
}

// ChunkSizingCharacters measures chunk size in Unicode characters (default).
type ChunkSizingCharacters struct{}

// isChunkSizing marks ChunkSizingCharacters as a ChunkSizing variant.
func (ChunkSizingCharacters) isChunkSizing() {}

// Type returns the discriminator string for this variant, "characters".
func (ChunkSizingCharacters) Type() string {
	return "characters"
}

// MarshalJSON encodes the variant as a JSON object holding only the "type"
// discriminator.
func (v ChunkSizingCharacters) MarshalJSON() ([]byte, error) {
	wire := struct {
		Type string `json:"type"`
	}{Type: v.Type()}
	return json.Marshal(wire)
}

// ChunkSizingTokenizer measures chunk size in tokens from a HuggingFace tokenizer.
type ChunkSizingTokenizer struct {
	// HuggingFace model ID or path, e.g. "Xenova/gpt-4o", "bert-base-uncased".
	Model string `json:"model"`
	// Optional cache directory override for tokenizer files.
	// Defaults to hf-hub's standard cache (`~/.cache/huggingface/`).
	// Can also be set via `KREUZBERG_TOKENIZER_CACHE_DIR` environment variable.
	CacheDir *string `json:"cache_dir,omitempty"`
}

// isChunkSizing marks ChunkSizingTokenizer as a ChunkSizing variant.
func (ChunkSizingTokenizer) isChunkSizing() {}

// Type returns the discriminator string for this variant, "tokenizer".
func (ChunkSizingTokenizer) Type() string {
	return "tokenizer"
}

// MarshalJSON encodes the variant as a JSON object: the "type" discriminator
// first, then "model", then "cache_dir" (omitted when CacheDir is nil).
func (v ChunkSizingTokenizer) MarshalJSON() ([]byte, error) {
	wire := struct {
		Type     string  `json:"type"`
		Model    string  `json:"model"`
		CacheDir *string `json:"cache_dir,omitempty"`
	}{
		Type:     v.Type(),
		Model:    v.Model,
		CacheDir: v.CacheDir,
	}
	return json.Marshal(wire)
}

// UnmarshalChunkSizing decodes JSON data into the concrete ChunkSizing variant
// named by the object's "type" key.
//
// It returns the json.Unmarshal error if data cannot be decoded, and an
// "unknown ChunkSizing type" error when the discriminator matches no variant.
func UnmarshalChunkSizing(data []byte) (ChunkSizing, error) {
	// First pass: read only the discriminator.
	var tag struct {
		Type string `json:"type"`
	}
	if err := json.Unmarshal(data, &tag); err != nil {
		return nil, err
	}

	// Second pass: decode the full payload into the matching variant.
	var (
		out ChunkSizing
		err error
	)
	switch tag.Type {
	case "characters":
		var c ChunkSizingCharacters
		err = json.Unmarshal(data, &c)
		out = c
	case "tokenizer":
		var tk ChunkSizingTokenizer
		err = json.Unmarshal(data, &tk)
		out = tk
	default:
		return nil, fmt.Errorf("unknown ChunkSizing type: %q", tag.Type)
	}
	if err != nil {
		return nil, err
	}
	return out, nil
}

// EmbeddingModelType describes the embedding model types supported by Kreuzberg.
//
// Variants: Preset, Custom, Llm, Plugin.
//
// EmbeddingModelType is a sealed interface — use one of
// EmbeddingModelTypePreset, EmbeddingModelTypeCustom, EmbeddingModelTypeLlm,
// EmbeddingModelTypePlugin. Decode JSON with UnmarshalEmbeddingModelType.
type EmbeddingModelType interface {
	// isEmbeddingModelType is an unexported marker method identifying variant types.
	isEmbeddingModelType()
	// Type returns the variant's discriminator string, which the variants'
	// MarshalJSON methods write under the JSON "type" key.
	Type() string
}

// EmbeddingModelTypePreset uses a preset model configuration (recommended).
type EmbeddingModelTypePreset struct {
	Name string `json:"name"`
}

// isEmbeddingModelType marks EmbeddingModelTypePreset as an EmbeddingModelType variant.
func (EmbeddingModelTypePreset) isEmbeddingModelType() {}

// Type returns the discriminator string for this variant, "preset".
func (EmbeddingModelTypePreset) Type() string {
	return "preset"
}

// MarshalJSON encodes the variant as a JSON object with the "type"
// discriminator followed by "name".
func (v EmbeddingModelTypePreset) MarshalJSON() ([]byte, error) {
	wire := struct {
		Type string `json:"type"`
		Name string `json:"name"`
	}{
		Type: v.Type(),
		Name: v.Name,
	}
	return json.Marshal(wire)
}

// EmbeddingModelTypeCustom uses a custom ONNX model from HuggingFace.
type EmbeddingModelTypeCustom struct {
	ModelID    string `json:"model_id"`
	Dimensions uint   `json:"dimensions"`
}

// isEmbeddingModelType marks EmbeddingModelTypeCustom as an EmbeddingModelType variant.
func (EmbeddingModelTypeCustom) isEmbeddingModelType() {}

// Type returns the discriminator string for this variant, "custom".
func (EmbeddingModelTypeCustom) Type() string {
	return "custom"
}

// MarshalJSON encodes the variant as a JSON object with the "type"
// discriminator followed by "model_id" and "dimensions".
func (v EmbeddingModelTypeCustom) MarshalJSON() ([]byte, error) {
	wire := struct {
		Type       string `json:"type"`
		ModelID    string `json:"model_id"`
		Dimensions uint   `json:"dimensions"`
	}{
		Type:       v.Type(),
		ModelID:    v.ModelID,
		Dimensions: v.Dimensions,
	}
	return json.Marshal(wire)
}

// EmbeddingModelTypeLlm selects a provider-hosted embedding model via liter-llm.
//
// Uses the model specified in the nested `LlmConfig` (e.g.,
// `"openai/text-embedding-3-small"`).
type EmbeddingModelTypeLlm struct {
	Llm LlmConfig `json:"llm"`
}

// isEmbeddingModelType marks EmbeddingModelTypeLlm as an EmbeddingModelType variant.
func (EmbeddingModelTypeLlm) isEmbeddingModelType() {}

// Type returns the discriminator string for this variant, "llm".
func (EmbeddingModelTypeLlm) Type() string {
	return "llm"
}

// MarshalJSON encodes the variant as a JSON object with the "type"
// discriminator followed by the nested "llm" configuration.
func (v EmbeddingModelTypeLlm) MarshalJSON() ([]byte, error) {
	wire := struct {
		Type string    `json:"type"`
		Llm  LlmConfig `json:"llm"`
	}{
		Type: v.Type(),
		Llm:  v.Llm,
	}
	return json.Marshal(wire)
}

// EmbeddingModelTypePlugin selects an in-process embedding backend registered
// via the plugin system.
//
// The caller registers an [`EmbeddingBackend`](crate::plugins::EmbeddingBackend) once
// (e.g. a wrapper around an already-loaded `llama-cpp-python`, `sentence-transformers`,
// or tuned ONNX model), then references it by name in config. Kreuzberg calls back
// into the registered backend during chunking and standalone embed requests —
// no HuggingFace download, no ONNX Runtime requirement, no HTTP sidecar.
//
// When this variant is selected, only the following [`EmbeddingConfig`] fields
// apply: `normalize` (post-call L2 normalization) and `max_embed_duration_secs`
// (dispatcher timeout). Model-loading fields (`batch_size`, `cache_dir`,
// `show_download_progress`, `acceleration`) are ignored — the host owns the
// model lifecycle.
//
// Semantic chunking falls back to [`ChunkingConfig::max_characters`] when this variant
// is used, since there is no preset to look a chunk-size ceiling up against — size your
// context window via `max_characters` directly.
//
// See `register_embedding_backend`.
type EmbeddingModelTypePlugin struct {
	Name string `json:"name"`
}

// isEmbeddingModelType marks EmbeddingModelTypePlugin as an EmbeddingModelType variant.
func (EmbeddingModelTypePlugin) isEmbeddingModelType() {}

// Type returns the discriminator string for this variant, "plugin".
func (EmbeddingModelTypePlugin) Type() string {
	return "plugin"
}

// MarshalJSON encodes the variant as a JSON object with the "type"
// discriminator followed by "name".
func (v EmbeddingModelTypePlugin) MarshalJSON() ([]byte, error) {
	wire := struct {
		Type string `json:"type"`
		Name string `json:"name"`
	}{
		Type: v.Type(),
		Name: v.Name,
	}
	return json.Marshal(wire)
}

// UnmarshalEmbeddingModelType decodes JSON data into the concrete
// EmbeddingModelType variant named by the object's "type" key.
//
// It returns the json.Unmarshal error if data cannot be decoded, and an
// "unknown EmbeddingModelType type" error when the discriminator matches no
// variant.
func UnmarshalEmbeddingModelType(data []byte) (EmbeddingModelType, error) {
	// First pass: read only the discriminator.
	var tag struct {
		Type string `json:"type"`
	}
	if err := json.Unmarshal(data, &tag); err != nil {
		return nil, err
	}

	// Second pass: decode the full payload into the matching variant.
	var (
		out EmbeddingModelType
		err error
	)
	switch tag.Type {
	case "preset":
		var preset EmbeddingModelTypePreset
		err = json.Unmarshal(data, &preset)
		out = preset
	case "custom":
		var custom EmbeddingModelTypeCustom
		err = json.Unmarshal(data, &custom)
		out = custom
	case "llm":
		var llm EmbeddingModelTypeLlm
		err = json.Unmarshal(data, &llm)
		out = llm
	case "plugin":
		var plugin EmbeddingModelTypePlugin
		err = json.Unmarshal(data, &plugin)
		out = plugin
	default:
		return nil, fmt.Errorf("unknown EmbeddingModelType type: %q", tag.Type)
	}
	if err != nil {
		return nil, err
	}
	return out, nil
}

// CodeContentMode is a string-backed enumeration of code content modes.
type CodeContentMode string

const (
	// CodeContentModeChunks uses TSLP semantic chunks as content (default).
	CodeContentModeChunks CodeContentMode = "chunks"
	// CodeContentModeRaw uses raw source code as content.
	CodeContentModeRaw CodeContentMode = "raw"
	// CodeContentModeStructure emits function/class headings + docstrings (no code bodies).
	CodeContentModeStructure CodeContentMode = "structure"
)

// ListType is a string-backed enumeration of list types.
type ListType string

const (
	// ListTypeBullet is for bullet points (-, *, •, etc.).
	ListTypeBullet ListType = "bullet"
	// ListTypeNumbered is for numbered lists (1., 2., etc.).
	ListTypeNumbered ListType = "numbered"
	// ListTypeLettered is for lettered lists (a., b., A., B., etc.).
	ListTypeLettered ListType = "lettered"
	// ListTypeIndented is for indented items.
	ListTypeIndented ListType = "indented"
)

// OcrBackendType is a string-backed enumeration of OCR backends.
type OcrBackendType string

const (
	// OcrBackendTypeTesseract is Tesseract OCR (native Rust binding).
	OcrBackendTypeTesseract OcrBackendType = "tesseract"
	// OcrBackendTypeEasyOcr is EasyOCR (Python-based, via FFI).
	OcrBackendTypeEasyOcr OcrBackendType = "easy_ocr"
	// OcrBackendTypePaddleOcr is PaddleOCR (Python-based, via FFI).
	OcrBackendTypePaddleOcr OcrBackendType = "paddle_ocr"
	// OcrBackendTypeCustom is a custom/third-party OCR backend.
	OcrBackendTypeCustom OcrBackendType = "custom"
)

// ProcessingStage is a string-backed enumeration of processing stages
// (early, middle, late).
type ProcessingStage string

const (
	// ProcessingStageEarly is the early stage: foundational processing.
	//
	// Use for:
	// - Language detection
	// - Character encoding normalization
	// - Entity extraction (NER)
	// - Text quality scoring
	ProcessingStageEarly ProcessingStage = "early"
	// ProcessingStageMiddle is the middle stage: content transformation.
	//
	// Use for:
	// - Keyword extraction
	// - Token reduction
	// - Text summarization
	// - Semantic analysis
	ProcessingStageMiddle ProcessingStage = "middle"
	// ProcessingStageLate is the late stage: final enrichment.
	//
	// Use for:
	// - Custom user hooks
	// - Analytics/logging
	// - Final validation
	// - Output formatting
	ProcessingStageLate ProcessingStage = "late"
)

// ReductionLevel is a string-backed enumeration of reduction levels.
type ReductionLevel string

const (
	// ReductionLevelOff is the Off variant of ReductionLevel.
	ReductionLevelOff ReductionLevel = "off"
	// ReductionLevelLight is the Light variant of ReductionLevel.
	ReductionLevelLight ReductionLevel = "light"
	// ReductionLevelModerate is the Moderate variant of ReductionLevel.
	ReductionLevelModerate ReductionLevel = "moderate"
	// ReductionLevelAggressive is the Aggressive variant of ReductionLevel.
	ReductionLevelAggressive ReductionLevel = "aggressive"
	// ReductionLevelMaximum is the Maximum variant of ReductionLevel.
	ReductionLevelMaximum ReductionLevel = "maximum"
)

// PdfAnnotationType is a string-backed enumeration of PDF annotation types.
type PdfAnnotationType string

const (
	// PdfAnnotationTypeText is a sticky note / text annotation.
	PdfAnnotationTypeText PdfAnnotationType = "text"
	// PdfAnnotationTypeHighlight is a highlighted text region.
	PdfAnnotationTypeHighlight PdfAnnotationType = "highlight"
	// PdfAnnotationTypeLink is a hyperlink annotation.
	PdfAnnotationTypeLink PdfAnnotationType = "link"
	// PdfAnnotationTypeStamp is a rubber stamp annotation.
	PdfAnnotationTypeStamp PdfAnnotationType = "stamp"
	// PdfAnnotationTypeUnderline is underline text markup.
	PdfAnnotationTypeUnderline PdfAnnotationType = "underline"
	// PdfAnnotationTypeStrikeOut is strikeout text markup.
	PdfAnnotationTypeStrikeOut PdfAnnotationType = "strike_out"
	// PdfAnnotationTypeOther is any other annotation type.
	PdfAnnotationTypeOther PdfAnnotationType = "other"
)

// BlockType is a string-backed enumeration of block-level element types.
type BlockType string

const (
	// BlockTypeParagraph is the Paragraph variant of BlockType.
	BlockTypeParagraph BlockType = "paragraph"
	// BlockTypeHeading is the Heading variant of BlockType.
	BlockTypeHeading BlockType = "heading"
	// BlockTypeBlockquote is the Blockquote variant of BlockType.
	BlockTypeBlockquote BlockType = "blockquote"
	// BlockTypeCodeBlock is the CodeBlock variant of BlockType.
	BlockTypeCodeBlock BlockType = "code_block"
	// BlockTypeListItem is the ListItem variant of BlockType.
	BlockTypeListItem BlockType = "list_item"
	// BlockTypeOrderedList is the OrderedList variant of BlockType.
	BlockTypeOrderedList BlockType = "ordered_list"
	// BlockTypeBulletList is the BulletList variant of BlockType.
	BlockTypeBulletList BlockType = "bullet_list"
	// BlockTypeTaskList is the TaskList variant of BlockType.
	BlockTypeTaskList BlockType = "task_list"
	// BlockTypeDefinitionList is the DefinitionList variant of BlockType.
	BlockTypeDefinitionList BlockType = "definition_list"
	// BlockTypeDefinitionTerm is the DefinitionTerm variant of BlockType.
	BlockTypeDefinitionTerm BlockType = "definition_term"
	// BlockTypeDefinitionDescription is the DefinitionDescription variant of BlockType.
	BlockTypeDefinitionDescription BlockType = "definition_description"
	// BlockTypeDiv is the Div variant of BlockType.
	BlockTypeDiv BlockType = "div"
	// BlockTypeSection is the Section variant of BlockType.
	BlockTypeSection BlockType = "section"
	// BlockTypeThematicBreak is the ThematicBreak variant of BlockType.
	BlockTypeThematicBreak BlockType = "thematic_break"
	// BlockTypeRawBlock is the RawBlock variant of BlockType.
	BlockTypeRawBlock BlockType = "raw_block"
	// BlockTypeMathDisplay is the MathDisplay variant of BlockType.
	BlockTypeMathDisplay BlockType = "math_display"
)

// InlineType is a string-backed enumeration of inline element types.
type InlineType string

const (
	// InlineTypeText is the Text variant of InlineType.
	InlineTypeText InlineType = "text"
	// InlineTypeStrong is the Strong variant of InlineType.
	InlineTypeStrong InlineType = "strong"
	// InlineTypeEmphasis is the Emphasis variant of InlineType.
	InlineTypeEmphasis InlineType = "emphasis"
	// InlineTypeHighlight is the Highlight variant of InlineType.
	InlineTypeHighlight InlineType = "highlight"
	// InlineTypeSubscript is the Subscript variant of InlineType.
	InlineTypeSubscript InlineType = "subscript"
	// InlineTypeSuperscript is the Superscript variant of InlineType.
	InlineTypeSuperscript InlineType = "superscript"
	// InlineTypeInsert is the Insert variant of InlineType.
	InlineTypeInsert InlineType = "insert"
	// InlineTypeDelete is the Delete variant of InlineType.
	InlineTypeDelete InlineType = "delete"
	// InlineTypeCode is the Code variant of InlineType.
	InlineTypeCode InlineType = "code"
	// InlineTypeLink is the Link variant of InlineType.
	InlineTypeLink InlineType = "link"
	// InlineTypeImage is the Image variant of InlineType.
	InlineTypeImage InlineType = "image"
	// InlineTypeSpan is the Span variant of InlineType.
	InlineTypeSpan InlineType = "span"
	// InlineTypeMath is the Math variant of InlineType.
	InlineTypeMath InlineType = "math"
	// InlineTypeRawInline is the RawInline variant of InlineType.
	InlineTypeRawInline InlineType = "raw_inline"
	// InlineTypeFootnoteRef is the FootnoteRef variant of InlineType.
	InlineTypeFootnoteRef InlineType = "footnote_ref"
	// InlineTypeSymbol is the Symbol variant of InlineType.
	InlineTypeSymbol InlineType = "symbol"
)

// RelationshipKind is a string-backed enumeration of relationship kinds; each
// constant describes a source -> target link.
type RelationshipKind string

const (
	// RelationshipKindFootnoteReference links a footnote marker -> footnote definition.
	RelationshipKindFootnoteReference RelationshipKind = "footnote_reference"
	// RelationshipKindCitationReference links a citation marker -> bibliography entry.
	RelationshipKindCitationReference RelationshipKind = "citation_reference"
	// RelationshipKindInternalLink links an internal anchor link (`#id`) -> target heading/element.
	RelationshipKindInternalLink RelationshipKind = "internal_link"
	// RelationshipKindCaption links a caption paragraph -> figure/table it describes.
	RelationshipKindCaption RelationshipKind = "caption"
	// RelationshipKindLabel links a label -> labeled element (HTML `<label for>`, LaTeX `\label{}`).
	RelationshipKindLabel RelationshipKind = "label"
	// RelationshipKindTocEntry links a TOC entry -> target section.
	RelationshipKindTocEntry RelationshipKind = "toc_entry"
	// RelationshipKindCrossReference is a cross-reference (LaTeX `\ref{}`, DOCX cross-reference field).
	RelationshipKindCrossReference RelationshipKind = "cross_reference"
)

// ContentLayer is a string-backed enumeration of content layers.
type ContentLayer string

const (
	// ContentLayerBody is the main document body content.
	ContentLayerBody ContentLayer = "body"
	// ContentLayerHeader is the page/section header (running header).
	ContentLayerHeader ContentLayer = "header"
	// ContentLayerFooter is the page/section footer (running footer).
	ContentLayerFooter ContentLayer = "footer"
	// ContentLayerFootnote is footnote content.
	ContentLayerFootnote ContentLayer = "footnote"
)

// NodeContent is a tagged enum for node content. Each variant carries only type-specific data.
//
// Uses `#[serde(tag = "node_type")]` to avoid "type" keyword collision in
// Go/Java/TypeScript bindings.
//
// Variants: Title, Heading, Paragraph, List, ListItem, Table, Image, Code, Quote, Formula, Footnote, Group, PageBreak, Slide, DefinitionList, DefinitionItem, Citation, Admonition, RawBlock, MetadataBlock
//
// NodeContent is a sealed interface — use one of the NodeContent-prefixed
// variant structs (NodeContentTitle, NodeContentHeading, and so on, one per
// variant listed above).
type NodeContent interface {
	// isNodeContent is an unexported marker method identifying variant types.
	isNodeContent()
	// Type returns the variant's discriminator string, which the variants'
	// MarshalJSON methods write under the JSON "node_type" key.
	Type() string
}

// NodeContentTitle is the document title.
type NodeContentTitle struct {
	Text string `json:"text"`
}

// isNodeContent marks NodeContentTitle as a NodeContent variant.
func (NodeContentTitle) isNodeContent() {}

// Type returns the discriminator string for this variant, "title".
func (NodeContentTitle) Type() string {
	return "title"
}

// MarshalJSON encodes the variant as a JSON object with the "node_type"
// discriminator followed by "text".
func (v NodeContentTitle) MarshalJSON() ([]byte, error) {
	wire := struct {
		NodeType string `json:"node_type"`
		Text     string `json:"text"`
	}{
		NodeType: v.Type(),
		Text:     v.Text,
	}
	return json.Marshal(wire)
}

// NodeContentHeading is a section heading with level (1-6).
type NodeContentHeading struct {
	Level uint8  `json:"level"`
	Text  string `json:"text"`
}

// isNodeContent marks NodeContentHeading as a NodeContent variant.
func (NodeContentHeading) isNodeContent() {}

// Type returns the discriminator string for this variant, "heading".
func (NodeContentHeading) Type() string {
	return "heading"
}

// MarshalJSON encodes the variant as a JSON object with the "node_type"
// discriminator followed by "level" and "text".
func (v NodeContentHeading) MarshalJSON() ([]byte, error) {
	wire := struct {
		NodeType string `json:"node_type"`
		Level    uint8  `json:"level"`
		Text     string `json:"text"`
	}{
		NodeType: v.Type(),
		Level:    v.Level,
		Text:     v.Text,
	}
	return json.Marshal(wire)
}

// NodeContentParagraph is a body text paragraph.
type NodeContentParagraph struct {
	Text string `json:"text"`
}

// isNodeContent marks NodeContentParagraph as a NodeContent variant.
func (NodeContentParagraph) isNodeContent() {}

// Type returns the discriminator string for this variant, "paragraph".
func (NodeContentParagraph) Type() string {
	return "paragraph"
}

// MarshalJSON encodes the variant as a JSON object with the "node_type"
// discriminator followed by "text".
func (v NodeContentParagraph) MarshalJSON() ([]byte, error) {
	wire := struct {
		NodeType string `json:"node_type"`
		Text     string `json:"text"`
	}{
		NodeType: v.Type(),
		Text:     v.Text,
	}
	return json.Marshal(wire)
}

// NodeContentList is a list container — children are `ListItem` nodes.
type NodeContentList struct {
	Ordered bool `json:"ordered"`
}

// isNodeContent marks NodeContentList as a NodeContent variant.
func (NodeContentList) isNodeContent() {}

// Type returns the discriminator string for this variant, "list".
func (NodeContentList) Type() string {
	return "list"
}

// MarshalJSON encodes the variant as a JSON object with the "node_type"
// discriminator followed by "ordered".
func (v NodeContentList) MarshalJSON() ([]byte, error) {
	wire := struct {
		NodeType string `json:"node_type"`
		Ordered  bool   `json:"ordered"`
	}{
		NodeType: v.Type(),
		Ordered:  v.Ordered,
	}
	return json.Marshal(wire)
}

// NodeContentListItem is an individual list item.
type NodeContentListItem struct {
	Text string `json:"text"`
}

// isNodeContent marks NodeContentListItem as a NodeContent variant.
func (NodeContentListItem) isNodeContent() {}

// Type returns the discriminator string for this variant, "list_item".
func (NodeContentListItem) Type() string {
	return "list_item"
}

// MarshalJSON encodes the variant as a JSON object with the "node_type"
// discriminator followed by "text".
func (v NodeContentListItem) MarshalJSON() ([]byte, error) {
	wire := struct {
		NodeType string `json:"node_type"`
		Text     string `json:"text"`
	}{
		NodeType: v.Type(),
		Text:     v.Text,
	}
	return json.Marshal(wire)
}

// NodeContentTable is a table with a structured cell grid.
type NodeContentTable struct {
	Grid TableGrid `json:"grid"`
}

// isNodeContent marks NodeContentTable as a NodeContent variant.
func (NodeContentTable) isNodeContent() {}

// Type returns the discriminator string for this variant, "table".
func (NodeContentTable) Type() string {
	return "table"
}

// MarshalJSON encodes the variant as a JSON object with the "node_type"
// discriminator followed by the nested "grid".
func (v NodeContentTable) MarshalJSON() ([]byte, error) {
	wire := struct {
		NodeType string    `json:"node_type"`
		Grid     TableGrid `json:"grid"`
	}{
		NodeType: v.Type(),
		Grid:     v.Grid,
	}
	return json.Marshal(wire)
}

// NodeContentImage is an image reference.
type NodeContentImage struct {
	Description *string `json:"description,omitempty"`
	ImageIndex  *uint32 `json:"image_index,omitempty"`
	// Source URL or path of the image (from `<img src="...">` or `![](src)`).
	Src *string `json:"src,omitempty"`
}

// isNodeContent marks NodeContentImage as a NodeContent variant.
func (NodeContentImage) isNodeContent() {}

// Type returns the discriminator string for this variant, "image".
func (NodeContentImage) Type() string {
	return "image"
}

// MarshalJSON encodes the variant as a JSON object with the "node_type"
// discriminator followed by "description", "image_index" and "src"; each of
// those is omitted when its pointer is nil.
func (v NodeContentImage) MarshalJSON() ([]byte, error) {
	wire := struct {
		NodeType    string  `json:"node_type"`
		Description *string `json:"description,omitempty"`
		ImageIndex  *uint32 `json:"image_index,omitempty"`
		Src         *string `json:"src,omitempty"`
	}{
		NodeType:    v.Type(),
		Description: v.Description,
		ImageIndex:  v.ImageIndex,
		Src:         v.Src,
	}
	return json.Marshal(wire)
}

// NodeContentCode is a code block.
type NodeContentCode struct {
	Text     string  `json:"text"`
	Language *string `json:"language,omitempty"`
}

// isNodeContent marks NodeContentCode as a NodeContent variant.
func (NodeContentCode) isNodeContent() {}

// Type returns the discriminator string for this variant, "code".
func (NodeContentCode) Type() string {
	return "code"
}

// MarshalJSON encodes the variant as a JSON object with the "node_type"
// discriminator followed by "text" and "language" (omitted when Language is nil).
func (v NodeContentCode) MarshalJSON() ([]byte, error) {
	wire := struct {
		NodeType string  `json:"node_type"`
		Text     string  `json:"text"`
		Language *string `json:"language,omitempty"`
	}{
		NodeType: v.Type(),
		Text:     v.Text,
		Language: v.Language,
	}
	return json.Marshal(wire)
}

// NodeContentQuote is a block quote — a container whose children carry the quoted content.
type NodeContentQuote struct{}

// isNodeContent marks NodeContentQuote as a NodeContent variant.
func (NodeContentQuote) isNodeContent() {}

// Type returns the discriminator string for this variant, "quote".
func (NodeContentQuote) Type() string {
	return "quote"
}

// MarshalJSON encodes the variant as a JSON object holding only the
// "node_type" discriminator.
func (v NodeContentQuote) MarshalJSON() ([]byte, error) {
	wire := struct {
		NodeType string `json:"node_type"`
	}{NodeType: v.Type()}
	return json.Marshal(wire)
}

// NodeContentFormula is a mathematical formula / equation.
type NodeContentFormula struct {
	Text string `json:"text"`
}

// isNodeContent marks NodeContentFormula as a NodeContent variant.
func (NodeContentFormula) isNodeContent() {}

// Type returns the discriminator string for this variant, "formula".
func (NodeContentFormula) Type() string {
	return "formula"
}

// MarshalJSON encodes the variant as a JSON object with the "node_type"
// discriminator followed by "text".
func (v NodeContentFormula) MarshalJSON() ([]byte, error) {
	wire := struct {
		NodeType string `json:"node_type"`
		Text     string `json:"text"`
	}{
		NodeType: v.Type(),
		Text:     v.Text,
	}
	return json.Marshal(wire)
}

// NodeContentFootnote is footnote reference content.
type NodeContentFootnote struct {
	Text string `json:"text"`
}

// isNodeContent marks NodeContentFootnote as a NodeContent variant.
func (NodeContentFootnote) isNodeContent() {}

// Type returns the discriminator string for this variant, "footnote".
func (NodeContentFootnote) Type() string {
	return "footnote"
}

// MarshalJSON encodes the variant as a JSON object with the "node_type"
// discriminator followed by "text".
func (v NodeContentFootnote) MarshalJSON() ([]byte, error) {
	wire := struct {
		NodeType string `json:"node_type"`
		Text     string `json:"text"`
	}{
		NodeType: v.Type(),
		Text:     v.Text,
	}
	return json.Marshal(wire)
}

// NodeContentGroup is a logical grouping container (section, key-value area).
//
// `heading_level` + `heading_text` capture the section heading directly
// rather than relying on a first-child positional convention.
type NodeContentGroup struct {
	Label        *string `json:"label,omitempty"`
	HeadingLevel *uint8  `json:"heading_level,omitempty"`
	HeadingText  *string `json:"heading_text,omitempty"`
}

// isNodeContent marks NodeContentGroup as a NodeContent variant.
func (NodeContentGroup) isNodeContent() {}

// Type returns the discriminator string for this variant, "group".
func (NodeContentGroup) Type() string {
	return "group"
}

// MarshalJSON encodes the variant as a JSON object with the "node_type"
// discriminator followed by "label", "heading_level" and "heading_text"; each
// of those is omitted when its pointer is nil.
func (v NodeContentGroup) MarshalJSON() ([]byte, error) {
	wire := struct {
		NodeType     string  `json:"node_type"`
		Label        *string `json:"label,omitempty"`
		HeadingLevel *uint8  `json:"heading_level,omitempty"`
		HeadingText  *string `json:"heading_text,omitempty"`
	}{
		NodeType:     v.Type(),
		Label:        v.Label,
		HeadingLevel: v.HeadingLevel,
		HeadingText:  v.HeadingText,
	}
	return json.Marshal(wire)
}

// NodeContentPageBreak is the page-break marker variant of NodeContent.
// It has no fields.
type NodeContentPageBreak struct{}

// isNodeContent tags NodeContentPageBreak as a NodeContent variant.
func (NodeContentPageBreak) isNodeContent() {}

// Type returns the "node_type" discriminator of this variant.
func (NodeContentPageBreak) Type() string {
	return "page_break"
}

// MarshalJSON encodes the variant as a JSON object that holds only the
// "node_type" discriminator.
func (p NodeContentPageBreak) MarshalJSON() ([]byte, error) {
	payload := struct {
		NodeType string `json:"node_type"`
	}{
		NodeType: p.Type(),
	}
	return json.Marshal(payload)
}

// NodeContentSlide is the presentation-slide container variant of
// NodeContent; the node's children are the slide's content nodes.
type NodeContentSlide struct {
	// Number is the 1-indexed slide number.
	Number uint32  `json:"number"`
	Title  *string `json:"title,omitempty"`
}

// isNodeContent tags NodeContentSlide as a NodeContent variant.
func (NodeContentSlide) isNodeContent() {}

// Type returns the "node_type" discriminator of this variant.
func (NodeContentSlide) Type() string {
	return "slide"
}

// MarshalJSON encodes the variant as a JSON object with the "node_type"
// discriminator first, then "number", then "title" when it is set.
func (s NodeContentSlide) MarshalJSON() ([]byte, error) {
	payload := struct {
		NodeType string  `json:"node_type"`
		Number   uint32  `json:"number"`
		Title    *string `json:"title,omitempty"`
	}{
		NodeType: s.Type(),
		Number:   s.Number,
		Title:    s.Title,
	}
	return json.Marshal(payload)
}

// NodeContentDefinitionList is the definition-list container variant of
// NodeContent; the node's children are `DefinitionItem` nodes.
type NodeContentDefinitionList struct{}

// isNodeContent tags NodeContentDefinitionList as a NodeContent variant.
func (NodeContentDefinitionList) isNodeContent() {}

// Type returns the "node_type" discriminator of this variant.
func (NodeContentDefinitionList) Type() string {
	return "definition_list"
}

// MarshalJSON encodes the variant as a JSON object that holds only the
// "node_type" discriminator.
func (l NodeContentDefinitionList) MarshalJSON() ([]byte, error) {
	payload := struct {
		NodeType string `json:"node_type"`
	}{
		NodeType: l.Type(),
	}
	return json.Marshal(payload)
}

// NodeContentDefinitionItem is the variant of NodeContent for a single
// definition-list entry, made of a term and its definition.
type NodeContentDefinitionItem struct {
	Term       string `json:"term"`
	Definition string `json:"definition"`
}

// isNodeContent tags NodeContentDefinitionItem as a NodeContent variant.
func (NodeContentDefinitionItem) isNodeContent() {}

// Type returns the "node_type" discriminator of this variant.
func (NodeContentDefinitionItem) Type() string {
	return "definition_item"
}

// MarshalJSON encodes the variant as a JSON object with the "node_type"
// discriminator first, followed by "term" and "definition".
func (d NodeContentDefinitionItem) MarshalJSON() ([]byte, error) {
	payload := struct {
		NodeType   string `json:"node_type"`
		Term       string `json:"term"`
		Definition string `json:"definition"`
	}{
		NodeType:   d.Type(),
		Term:       d.Term,
		Definition: d.Definition,
	}
	return json.Marshal(payload)
}

// NodeContentCitation is the citation / bibliographic reference variant of
// NodeContent.
type NodeContentCitation struct {
	Key  string `json:"key"`
	Text string `json:"text"`
}

// isNodeContent tags NodeContentCitation as a NodeContent variant.
func (NodeContentCitation) isNodeContent() {}

// Type returns the "node_type" discriminator of this variant.
func (NodeContentCitation) Type() string {
	return "citation"
}

// MarshalJSON encodes the variant as a JSON object with the "node_type"
// discriminator first, followed by "key" and "text".
func (c NodeContentCitation) MarshalJSON() ([]byte, error) {
	payload := struct {
		NodeType string `json:"node_type"`
		Key      string `json:"key"`
		Text     string `json:"text"`
	}{
		NodeType: c.Type(),
		Key:      c.Key,
		Text:     c.Text,
	}
	return json.Marshal(payload)
}

// NodeContentAdmonition is the admonition / callout container variant of
// NodeContent (note, warning, tip, etc.).
//
// The node's children carry the admonition body content.
type NodeContentAdmonition struct {
	// Kind is the kind of admonition (e.g. "note", "warning", "tip", "danger").
	Kind  string  `json:"kind"`
	Title *string `json:"title,omitempty"`
}

// isNodeContent tags NodeContentAdmonition as a NodeContent variant.
func (NodeContentAdmonition) isNodeContent() {}

// Type returns the "node_type" discriminator of this variant.
func (NodeContentAdmonition) Type() string {
	return "admonition"
}

// MarshalJSON encodes the variant as a JSON object with the "node_type"
// discriminator first, then "kind", then "title" when it is set.
func (a NodeContentAdmonition) MarshalJSON() ([]byte, error) {
	payload := struct {
		NodeType string  `json:"node_type"`
		Kind     string  `json:"kind"`
		Title    *string `json:"title,omitempty"`
	}{
		NodeType: a.Type(),
		Kind:     a.Kind,
		Title:    a.Title,
	}
	return json.Marshal(payload)
}

// NodeContentRawBlock is the variant of NodeContent for a raw block preserved
// verbatim from the source format.
//
// It is used for content that cannot be mapped to a semantic node type
// (e.g. JSX in MDX, raw LaTeX in markdown, embedded HTML).
type NodeContentRawBlock struct {
	// Format is the source format identifier (e.g. "html", "latex", "jsx").
	Format  string `json:"format"`
	Content string `json:"content"`
}

// isNodeContent tags NodeContentRawBlock as a NodeContent variant.
func (NodeContentRawBlock) isNodeContent() {}

// Type returns the "node_type" discriminator of this variant.
func (NodeContentRawBlock) Type() string {
	return "raw_block"
}

// MarshalJSON encodes the variant as a JSON object with the "node_type"
// discriminator first, followed by "format" and "content".
func (r NodeContentRawBlock) MarshalJSON() ([]byte, error) {
	payload := struct {
		NodeType string `json:"node_type"`
		Format   string `json:"format"`
		Content  string `json:"content"`
	}{
		NodeType: r.Type(),
		Format:   r.Format,
		Content:  r.Content,
	}
	return json.Marshal(payload)
}

// NodeContentMetadataBlock is the structured metadata block variant of
// NodeContent (email headers, YAML frontmatter, etc.).
//
// NOTE(review): each inner slice of Entries is presumably a [key, value]
// pair — confirm against the native type.
type NodeContentMetadataBlock struct {
	Entries [][]string `json:"entries"`
}

// isNodeContent tags NodeContentMetadataBlock as a NodeContent variant.
func (NodeContentMetadataBlock) isNodeContent() {}

// Type returns the "node_type" discriminator of this variant.
func (NodeContentMetadataBlock) Type() string { return "metadata_block" }

// MarshalJSON encodes the variant as a JSON object with the "node_type"
// discriminator first, followed by "entries".
//
// A nil Entries slice is written as [] rather than null, so the required
// "entries" field is always a JSON array. Only the outer slice is normalised;
// inner slices are encoded as they are.
// NOTE(review): presumably the native decoder rejects null for this field —
// confirm.
func (v NodeContentMetadataBlock) MarshalJSON() ([]byte, error) {
	type aux struct {
		NodeType string     `json:"node_type"`
		Entries  [][]string `json:"entries"`
	}
	entries := v.Entries
	if entries == nil {
		// encoding/json renders a nil slice as null; substitute an empty one.
		entries = [][]string{}
	}
	return json.Marshal(aux{
		NodeType: v.Type(),
		Entries:  entries,
	})
}

// UnmarshalNodeContent decodes a JSON object into the concrete NodeContent
// variant selected by its "node_type" discriminator.
//
// The result is the variant struct itself, not a pointer to it. An error is
// returned when data cannot be decoded, or when "node_type" is absent or does
// not name a known variant; in both cases the returned NodeContent is nil.
func UnmarshalNodeContent(data []byte) (NodeContent, error) {
	// First pass: read only the discriminator.
	var tag struct {
		NodeType string `json:"node_type"`
	}
	if err := json.Unmarshal(data, &tag); err != nil {
		return nil, err
	}

	// Second pass: decode the same bytes into the selected variant.
	switch tag.NodeType {
	case "title":
		return decodeNodeContentVariant[NodeContentTitle](data)
	case "heading":
		return decodeNodeContentVariant[NodeContentHeading](data)
	case "paragraph":
		return decodeNodeContentVariant[NodeContentParagraph](data)
	case "list":
		return decodeNodeContentVariant[NodeContentList](data)
	case "list_item":
		return decodeNodeContentVariant[NodeContentListItem](data)
	case "table":
		return decodeNodeContentVariant[NodeContentTable](data)
	case "image":
		return decodeNodeContentVariant[NodeContentImage](data)
	case "code":
		return decodeNodeContentVariant[NodeContentCode](data)
	case "quote":
		return decodeNodeContentVariant[NodeContentQuote](data)
	case "formula":
		return decodeNodeContentVariant[NodeContentFormula](data)
	case "footnote":
		return decodeNodeContentVariant[NodeContentFootnote](data)
	case "group":
		return decodeNodeContentVariant[NodeContentGroup](data)
	case "page_break":
		return decodeNodeContentVariant[NodeContentPageBreak](data)
	case "slide":
		return decodeNodeContentVariant[NodeContentSlide](data)
	case "definition_list":
		return decodeNodeContentVariant[NodeContentDefinitionList](data)
	case "definition_item":
		return decodeNodeContentVariant[NodeContentDefinitionItem](data)
	case "citation":
		return decodeNodeContentVariant[NodeContentCitation](data)
	case "admonition":
		return decodeNodeContentVariant[NodeContentAdmonition](data)
	case "raw_block":
		return decodeNodeContentVariant[NodeContentRawBlock](data)
	case "metadata_block":
		return decodeNodeContentVariant[NodeContentMetadataBlock](data)
	default:
		return nil, fmt.Errorf("unknown NodeContent type: %q", tag.NodeType)
	}
}

// decodeNodeContentVariant unmarshals data into a fresh value of the variant
// type T and returns it as a NodeContent, or nil together with the decode error.
func decodeNodeContentVariant[T NodeContent](data []byte) (NodeContent, error) {
	var variant T
	if err := json.Unmarshal(data, &variant); err != nil {
		return nil, err
	}
	return variant, nil
}

// AnnotationKind is the type of an inline text annotation.
//
// Variants: Bold, Italic, Underline, Strikethrough, Code, Subscript, Superscript, Link, Highlight, Color, FontSize, Custom
//
// Sealed interface — use one of AnnotationKindBold, AnnotationKindItalic,
// AnnotationKindUnderline, AnnotationKindStrikethrough, AnnotationKindCode,
// AnnotationKindSubscript, AnnotationKindSuperscript, AnnotationKindLink,
// AnnotationKindHighlight, AnnotationKindColor, AnnotationKindFontSize,
// AnnotationKindCustom. Use UnmarshalAnnotationKind to decode one from JSON.
type AnnotationKind interface {
	// isAnnotationKind is unexported, so only types in this package can
	// implement the interface.
	isAnnotationKind()
	// Type returns the variant's "annotation_type" discriminator (e.g. "bold").
	Type() string
}

// AnnotationKindBold is the Bold variant of AnnotationKind. It has no fields.
type AnnotationKindBold struct{}

// isAnnotationKind tags AnnotationKindBold as an AnnotationKind variant.
func (AnnotationKindBold) isAnnotationKind() {}

// Type returns the "annotation_type" discriminator of this variant.
func (AnnotationKindBold) Type() string {
	return "bold"
}

// MarshalJSON encodes the variant as a JSON object that holds only the
// "annotation_type" discriminator.
func (k AnnotationKindBold) MarshalJSON() ([]byte, error) {
	payload := struct {
		AnnotationType string `json:"annotation_type"`
	}{
		AnnotationType: k.Type(),
	}
	return json.Marshal(payload)
}

// AnnotationKindItalic is the Italic variant of AnnotationKind. It has no
// fields.
type AnnotationKindItalic struct{}

// isAnnotationKind tags AnnotationKindItalic as an AnnotationKind variant.
func (AnnotationKindItalic) isAnnotationKind() {}

// Type returns the "annotation_type" discriminator of this variant.
func (AnnotationKindItalic) Type() string {
	return "italic"
}

// MarshalJSON encodes the variant as a JSON object that holds only the
// "annotation_type" discriminator.
func (k AnnotationKindItalic) MarshalJSON() ([]byte, error) {
	payload := struct {
		AnnotationType string `json:"annotation_type"`
	}{
		AnnotationType: k.Type(),
	}
	return json.Marshal(payload)
}

// AnnotationKindUnderline is the Underline variant of AnnotationKind. It has
// no fields.
type AnnotationKindUnderline struct{}

// isAnnotationKind tags AnnotationKindUnderline as an AnnotationKind variant.
func (AnnotationKindUnderline) isAnnotationKind() {}

// Type returns the "annotation_type" discriminator of this variant.
func (AnnotationKindUnderline) Type() string {
	return "underline"
}

// MarshalJSON encodes the variant as a JSON object that holds only the
// "annotation_type" discriminator.
func (k AnnotationKindUnderline) MarshalJSON() ([]byte, error) {
	payload := struct {
		AnnotationType string `json:"annotation_type"`
	}{
		AnnotationType: k.Type(),
	}
	return json.Marshal(payload)
}

// AnnotationKindStrikethrough is the Strikethrough variant of AnnotationKind.
// It has no fields.
type AnnotationKindStrikethrough struct{}

// isAnnotationKind tags AnnotationKindStrikethrough as an AnnotationKind variant.
func (AnnotationKindStrikethrough) isAnnotationKind() {}

// Type returns the "annotation_type" discriminator of this variant.
func (AnnotationKindStrikethrough) Type() string {
	return "strikethrough"
}

// MarshalJSON encodes the variant as a JSON object that holds only the
// "annotation_type" discriminator.
func (k AnnotationKindStrikethrough) MarshalJSON() ([]byte, error) {
	payload := struct {
		AnnotationType string `json:"annotation_type"`
	}{
		AnnotationType: k.Type(),
	}
	return json.Marshal(payload)
}

// AnnotationKindCode is the Code variant of AnnotationKind. It has no fields.
type AnnotationKindCode struct{}

// isAnnotationKind tags AnnotationKindCode as an AnnotationKind variant.
func (AnnotationKindCode) isAnnotationKind() {}

// Type returns the "annotation_type" discriminator of this variant.
func (AnnotationKindCode) Type() string {
	return "code"
}

// MarshalJSON encodes the variant as a JSON object that holds only the
// "annotation_type" discriminator.
func (k AnnotationKindCode) MarshalJSON() ([]byte, error) {
	payload := struct {
		AnnotationType string `json:"annotation_type"`
	}{
		AnnotationType: k.Type(),
	}
	return json.Marshal(payload)
}

// AnnotationKindSubscript is the Subscript variant of AnnotationKind. It has
// no fields.
type AnnotationKindSubscript struct{}

// isAnnotationKind tags AnnotationKindSubscript as an AnnotationKind variant.
func (AnnotationKindSubscript) isAnnotationKind() {}

// Type returns the "annotation_type" discriminator of this variant.
func (AnnotationKindSubscript) Type() string {
	return "subscript"
}

// MarshalJSON encodes the variant as a JSON object that holds only the
// "annotation_type" discriminator.
func (k AnnotationKindSubscript) MarshalJSON() ([]byte, error) {
	payload := struct {
		AnnotationType string `json:"annotation_type"`
	}{
		AnnotationType: k.Type(),
	}
	return json.Marshal(payload)
}

// AnnotationKindSuperscript is the Superscript variant of AnnotationKind. It
// has no fields.
type AnnotationKindSuperscript struct{}

// isAnnotationKind tags AnnotationKindSuperscript as an AnnotationKind variant.
func (AnnotationKindSuperscript) isAnnotationKind() {}

// Type returns the "annotation_type" discriminator of this variant.
func (AnnotationKindSuperscript) Type() string {
	return "superscript"
}

// MarshalJSON encodes the variant as a JSON object that holds only the
// "annotation_type" discriminator.
func (k AnnotationKindSuperscript) MarshalJSON() ([]byte, error) {
	payload := struct {
		AnnotationType string `json:"annotation_type"`
	}{
		AnnotationType: k.Type(),
	}
	return json.Marshal(payload)
}

// AnnotationKindLink is the Link variant of AnnotationKind. URL is always
// written; Title is optional and omitted from the JSON when nil.
type AnnotationKindLink struct {
	URL   string  `json:"url"`
	Title *string `json:"title,omitempty"`
}

// isAnnotationKind tags AnnotationKindLink as an AnnotationKind variant.
func (AnnotationKindLink) isAnnotationKind() {}

// Type returns the "annotation_type" discriminator of this variant.
func (AnnotationKindLink) Type() string {
	return "link"
}

// MarshalJSON encodes the variant as a JSON object with the "annotation_type"
// discriminator first, then "url", then "title" when it is set.
func (k AnnotationKindLink) MarshalJSON() ([]byte, error) {
	payload := struct {
		AnnotationType string  `json:"annotation_type"`
		URL            string  `json:"url"`
		Title          *string `json:"title,omitempty"`
	}{
		AnnotationType: k.Type(),
		URL:            k.URL,
		Title:          k.Title,
	}
	return json.Marshal(payload)
}

// AnnotationKindHighlight is the AnnotationKind variant for highlighted text
// (PDF highlights, HTML `<mark>`). It has no fields.
type AnnotationKindHighlight struct{}

// isAnnotationKind tags AnnotationKindHighlight as an AnnotationKind variant.
func (AnnotationKindHighlight) isAnnotationKind() {}

// Type returns the "annotation_type" discriminator of this variant.
func (AnnotationKindHighlight) Type() string {
	return "highlight"
}

// MarshalJSON encodes the variant as a JSON object that holds only the
// "annotation_type" discriminator.
func (k AnnotationKindHighlight) MarshalJSON() ([]byte, error) {
	payload := struct {
		AnnotationType string `json:"annotation_type"`
	}{
		AnnotationType: k.Type(),
	}
	return json.Marshal(payload)
}

// AnnotationKindColor is the AnnotationKind variant for text color. Value is
// a CSS-compatible value (e.g. "#ff0000", "red").
type AnnotationKindColor struct {
	Value string `json:"value"`
}

// isAnnotationKind tags AnnotationKindColor as an AnnotationKind variant.
func (AnnotationKindColor) isAnnotationKind() {}

// Type returns the "annotation_type" discriminator of this variant.
func (AnnotationKindColor) Type() string {
	return "color"
}

// MarshalJSON encodes the variant as a JSON object with the "annotation_type"
// discriminator first, followed by "value".
func (k AnnotationKindColor) MarshalJSON() ([]byte, error) {
	payload := struct {
		AnnotationType string `json:"annotation_type"`
		Value          string `json:"value"`
	}{
		AnnotationType: k.Type(),
		Value:          k.Value,
	}
	return json.Marshal(payload)
}

// AnnotationKindFontSize is the AnnotationKind variant for font size. Value
// is the size with units (e.g. "12pt", "1.2em", "16px").
type AnnotationKindFontSize struct {
	Value string `json:"value"`
}

// isAnnotationKind tags AnnotationKindFontSize as an AnnotationKind variant.
func (AnnotationKindFontSize) isAnnotationKind() {}

// Type returns the "annotation_type" discriminator of this variant.
func (AnnotationKindFontSize) Type() string {
	return "font_size"
}

// MarshalJSON encodes the variant as a JSON object with the "annotation_type"
// discriminator first, followed by "value".
func (k AnnotationKindFontSize) MarshalJSON() ([]byte, error) {
	payload := struct {
		AnnotationType string `json:"annotation_type"`
		Value          string `json:"value"`
	}{
		AnnotationType: k.Type(),
		Value:          k.Value,
	}
	return json.Marshal(payload)
}

// AnnotationKindCustom is the extensible AnnotationKind variant for
// format-specific styling. Name is always written; Value is optional and
// omitted from the JSON when nil.
type AnnotationKindCustom struct {
	Name  string  `json:"name"`
	Value *string `json:"value,omitempty"`
}

// isAnnotationKind tags AnnotationKindCustom as an AnnotationKind variant.
func (AnnotationKindCustom) isAnnotationKind() {}

// Type returns the "annotation_type" discriminator of this variant.
func (AnnotationKindCustom) Type() string {
	return "custom"
}

// MarshalJSON encodes the variant as a JSON object with the "annotation_type"
// discriminator first, then "name", then "value" when it is set.
func (k AnnotationKindCustom) MarshalJSON() ([]byte, error) {
	payload := struct {
		AnnotationType string  `json:"annotation_type"`
		Name           string  `json:"name"`
		Value          *string `json:"value,omitempty"`
	}{
		AnnotationType: k.Type(),
		Name:           k.Name,
		Value:          k.Value,
	}
	return json.Marshal(payload)
}

// UnmarshalAnnotationKind decodes a JSON object into the concrete
// AnnotationKind variant selected by its "annotation_type" discriminator.
//
// The result is the variant struct itself, not a pointer to it. An error is
// returned when data cannot be decoded, or when "annotation_type" is absent or
// does not name a known variant; in both cases the returned AnnotationKind is
// nil.
func UnmarshalAnnotationKind(data []byte) (AnnotationKind, error) {
	// First pass: read only the discriminator.
	var tag struct {
		AnnotationType string `json:"annotation_type"`
	}
	if err := json.Unmarshal(data, &tag); err != nil {
		return nil, err
	}

	// Second pass: decode the same bytes into the selected variant.
	switch tag.AnnotationType {
	case "bold":
		return decodeAnnotationKindVariant[AnnotationKindBold](data)
	case "italic":
		return decodeAnnotationKindVariant[AnnotationKindItalic](data)
	case "underline":
		return decodeAnnotationKindVariant[AnnotationKindUnderline](data)
	case "strikethrough":
		return decodeAnnotationKindVariant[AnnotationKindStrikethrough](data)
	case "code":
		return decodeAnnotationKindVariant[AnnotationKindCode](data)
	case "subscript":
		return decodeAnnotationKindVariant[AnnotationKindSubscript](data)
	case "superscript":
		return decodeAnnotationKindVariant[AnnotationKindSuperscript](data)
	case "link":
		return decodeAnnotationKindVariant[AnnotationKindLink](data)
	case "highlight":
		return decodeAnnotationKindVariant[AnnotationKindHighlight](data)
	case "color":
		return decodeAnnotationKindVariant[AnnotationKindColor](data)
	case "font_size":
		return decodeAnnotationKindVariant[AnnotationKindFontSize](data)
	case "custom":
		return decodeAnnotationKindVariant[AnnotationKindCustom](data)
	default:
		return nil, fmt.Errorf("unknown AnnotationKind type: %q", tag.AnnotationType)
	}
}

// decodeAnnotationKindVariant unmarshals data into a fresh value of the
// variant type T and returns it as an AnnotationKind, or nil together with the
// decode error.
func decodeAnnotationKindVariant[T AnnotationKind](data []byte) (AnnotationKind, error) {
	var variant T
	if err := json.Unmarshal(data, &variant); err != nil {
		return nil, err
	}
	return variant, nil
}

// ExtractionMethod is a string-backed enumeration. Its values are the
// ExtractionMethod* constants declared below; the string is the wire value.
type ExtractionMethod string

const (
	// ExtractionMethodNative is the Native variant of ExtractionMethod.
	ExtractionMethodNative ExtractionMethod = "native"
	// ExtractionMethodOcr is the Ocr variant of ExtractionMethod.
	ExtractionMethodOcr ExtractionMethod = "ocr"
	// ExtractionMethodMixed is the Mixed variant of ExtractionMethod.
	ExtractionMethodMixed ExtractionMethod = "mixed"
)

// ChunkType is a string-backed enumeration. Its values are the ChunkType*
// constants declared below; the string is the wire value.
type ChunkType string

const (
	// ChunkTypeHeading is a section heading or document title.
	ChunkTypeHeading ChunkType = "heading"
	// ChunkTypePartyList is a party list: names, addresses, and signatories.
	ChunkTypePartyList ChunkType = "party_list"
	// ChunkTypeDefinitions is a definition clause ("X means…", "X shall mean…").
	ChunkTypeDefinitions ChunkType = "definitions"
	// ChunkTypeOperativeClause is an operative clause containing legal/contractual action verbs.
	ChunkTypeOperativeClause ChunkType = "operative_clause"
	// ChunkTypeSignatureBlock is a signature block with signatures, names, and dates.
	ChunkTypeSignatureBlock ChunkType = "signature_block"
	// ChunkTypeSchedule is a schedule, annex, appendix, or exhibit section.
	ChunkTypeSchedule ChunkType = "schedule"
	// ChunkTypeTableLike is table-like content with aligned columns or repeated patterns.
	ChunkTypeTableLike ChunkType = "table_like"
	// ChunkTypeFormula is a mathematical formula or equation.
	ChunkTypeFormula ChunkType = "formula"
	// ChunkTypeCodeBlock is a code block or preformatted content.
	ChunkTypeCodeBlock ChunkType = "code_block"
	// ChunkTypeImage is embedded or referenced image content.
	ChunkTypeImage ChunkType = "image"
	// ChunkTypeOrgChart is an organizational chart or hierarchy diagram.
	ChunkTypeOrgChart ChunkType = "org_chart"
	// ChunkTypeDiagram is a diagram, figure, or visual illustration.
	ChunkTypeDiagram ChunkType = "diagram"
	// ChunkTypeUnknown is unclassified or mixed content.
	ChunkTypeUnknown ChunkType = "unknown"
)

// ImageKind is a string-backed enumeration. Its values are the ImageKind*
// constants declared below; the string is the wire value.
type ImageKind string

const (
	// ImageKindPhotograph is a photographic image (natural scene, photograph).
	ImageKindPhotograph ImageKind = "photograph"
	// ImageKindDiagram is a technical or schematic diagram.
	ImageKindDiagram ImageKind = "diagram"
	// ImageKindChart is a chart, graph, or plot.
	ImageKindChart ImageKind = "chart"
	// ImageKindDrawing is a freehand or technical drawing.
	ImageKindDrawing ImageKind = "drawing"
	// ImageKindTextBlock is a text-heavy image (scanned text, document).
	ImageKindTextBlock ImageKind = "text_block"
	// ImageKindDecoration is a decorative element or border.
	ImageKindDecoration ImageKind = "decoration"
	// ImageKindLogo is a logo or brand mark.
	ImageKindLogo ImageKind = "logo"
	// ImageKindIcon is a small icon.
	ImageKindIcon ImageKind = "icon"
	// ImageKindTileFragment is a fragment of a larger tiled image (tile of a technical drawing).
	ImageKindTileFragment ImageKind = "tile_fragment"
	// ImageKindMask is a mask or transparency map.
	ImageKindMask ImageKind = "mask"
	// ImageKindPageRaster is a full-page render produced during OCR preprocessing; used as a citation thumbnail.
	ImageKindPageRaster ImageKind = "page_raster"
	// ImageKindUnknown is used when the image could not be classified with reasonable confidence.
	ImageKindUnknown ImageKind = "unknown"
)

// ResultFormat is a string-backed enumeration. Its values are the
// ResultFormat* constants declared below; the string is the wire value.
type ResultFormat string

const (
	// ResultFormatUnified is the unified format with all content in the `content` field.
	ResultFormatUnified ResultFormat = "unified"
	// ResultFormatElementBased is the element-based format with semantic element extraction.
	ResultFormatElementBased ResultFormat = "element_based"
)

// ElementType is a string-backed enumeration. Its values are the
// ElementType* constants declared below; the string is the wire value.
type ElementType string

const (
	// ElementTypeTitle is a document title.
	ElementTypeTitle ElementType = "title"
	// ElementTypeNarrativeText is main narrative text body.
	ElementTypeNarrativeText ElementType = "narrative_text"
	// ElementTypeHeading is a section heading.
	ElementTypeHeading ElementType = "heading"
	// ElementTypeListItem is a list item (bullet, numbered, etc.).
	ElementTypeListItem ElementType = "list_item"
	// ElementTypeTable is a table element.
	ElementTypeTable ElementType = "table"
	// ElementTypeImage is an image element.
	ElementTypeImage ElementType = "image"
	// ElementTypePageBreak is a page break marker.
	ElementTypePageBreak ElementType = "page_break"
	// ElementTypeCodeBlock is a code block.
	ElementTypeCodeBlock ElementType = "code_block"
	// ElementTypeBlockQuote is a block quote.
	ElementTypeBlockQuote ElementType = "block_quote"
	// ElementTypeFooter is footer text.
	ElementTypeFooter ElementType = "footer"
	// ElementTypeHeader is header text.
	ElementTypeHeader ElementType = "header"
)

// FormatMetadata is the format-specific metadata (discriminated union).
//
// Only one format type can exist per extraction result. This provides
// type-safe, clean metadata without nested optionals.
//
// FormatType holds the "format_type" discriminator and selects which of the
// variant pointers carries the payload. The custom MarshalJSON/UnmarshalJSON
// methods use a flat wire format (variant fields next to "format_type"), so
// the per-variant json tags below are not what appears on the wire.
//
// Variants: Pdf, Docx, Excel, Email, Pptx, Archive, Image, Xml, Text, Html, Ocr, Csv, Bibtex, Citation, FictionBook, Dbf, Jats, Epub, Pst, Code
//
// NOTE(review): the variant list names Code, but this struct has no Code
// field — confirm whether that variant is meant to carry no payload here.
type FormatMetadata struct {
	// FormatType is the discriminator tag (e.g. "pdf", "fiction_book").
	FormatType  string               `json:"format_type"`
	Pdf         *PdfMetadata         `json:"pdf,omitempty"`
	Docx        *DocxMetadata        `json:"docx,omitempty"`
	Excel       *ExcelMetadata       `json:"excel,omitempty"`
	Email       *EmailMetadata       `json:"email,omitempty"`
	Pptx        *PptxMetadata        `json:"pptx,omitempty"`
	Archive     *ArchiveMetadata     `json:"archive,omitempty"`
	Image       *ImageMetadata       `json:"image,omitempty"`
	XML         *XMLMetadata         `json:"xml,omitempty"`
	Text        *TextMetadata        `json:"text,omitempty"`
	HTML        *HTMLMetadata        `json:"html,omitempty"`
	Ocr         *OcrMetadata         `json:"ocr,omitempty"`
	Csv         *CsvMetadata         `json:"csv,omitempty"`
	Bibtex      *BibtexMetadata      `json:"bibtex,omitempty"`
	Citation    *CitationMetadata    `json:"citation,omitempty"`
	FictionBook *FictionBookMetadata `json:"fiction_book,omitempty"`
	Dbf         *DbfMetadata         `json:"dbf,omitempty"`
	Jats        *JatsMetadata        `json:"jats,omitempty"`
	Epub        *EpubMetadata        `json:"epub,omitempty"`
	Pst         *PstMetadata         `json:"pst,omitempty"`
}

// MarshalJSON encodes the tagged union as one flat JSON object: the fields of
// the active variant plus the "format_type" discriminator.
//
// The active variant is the one named by FormatType. When FormatType names no
// known variant, or the matching variant pointer is nil, only the
// discriminator is written.
func (t FormatMetadata) MarshalJSON() ([]byte, error) {
	// flatten serialises the variant payload, reads it back as a generic
	// object, and adds the discriminator next to the payload's own fields.
	flatten := func(variant any) ([]byte, error) {
		encoded, err := json.Marshal(variant)
		if err != nil {
			return nil, err
		}
		var fields map[string]json.RawMessage
		if err := json.Unmarshal(encoded, &fields); err != nil {
			return nil, err
		}
		// FormatType equals the case label that selected the variant, so this
		// yields the same quoted tag as a per-case literal would.
		fields["format_type"] = json.RawMessage(`"` + t.FormatType + `"`)
		return json.Marshal(fields)
	}

	switch t.FormatType {
	case "pdf":
		if t.Pdf != nil {
			return flatten(t.Pdf)
		}
	case "docx":
		if t.Docx != nil {
			return flatten(t.Docx)
		}
	case "excel":
		if t.Excel != nil {
			return flatten(t.Excel)
		}
	case "email":
		if t.Email != nil {
			return flatten(t.Email)
		}
	case "pptx":
		if t.Pptx != nil {
			return flatten(t.Pptx)
		}
	case "archive":
		if t.Archive != nil {
			return flatten(t.Archive)
		}
	case "image":
		if t.Image != nil {
			return flatten(t.Image)
		}
	case "xml":
		if t.XML != nil {
			return flatten(t.XML)
		}
	case "text":
		if t.Text != nil {
			return flatten(t.Text)
		}
	case "html":
		if t.HTML != nil {
			return flatten(t.HTML)
		}
	case "ocr":
		if t.Ocr != nil {
			return flatten(t.Ocr)
		}
	case "csv":
		if t.Csv != nil {
			return flatten(t.Csv)
		}
	case "bibtex":
		if t.Bibtex != nil {
			return flatten(t.Bibtex)
		}
	case "citation":
		if t.Citation != nil {
			return flatten(t.Citation)
		}
	case "fiction_book":
		if t.FictionBook != nil {
			return flatten(t.FictionBook)
		}
	case "dbf":
		if t.Dbf != nil {
			return flatten(t.Dbf)
		}
	case "jats":
		if t.Jats != nil {
			return flatten(t.Jats)
		}
	case "epub":
		if t.Epub != nil {
			return flatten(t.Epub)
		}
	case "pst":
		if t.Pst != nil {
			return flatten(t.Pst)
		}
	}

	// No payload to flatten: emit the discriminator on its own.
	return json.Marshal(map[string]string{"format_type": t.FormatType})
}

// UnmarshalJSON decodes a tagged union by reading the "format_type" tag first
// and then decoding the same bytes into the variant that the tag selects.
//
// The receiver is reset before it is populated, so after a successful call
// exactly the selected variant pointer is non-nil, even when t previously held
// a different variant. A tag that names no known variant is not an error: only
// FormatType is set and every variant pointer stays nil.
//
// An error is returned when data cannot be decoded for the probe or for the
// selected variant. If the probe fails, t is left untouched.
func (t *FormatMetadata) UnmarshalJSON(data []byte) error {
	// Probe for the tag first
	var probe struct {
		FormatType string `json:"format_type"`
	}
	if err := json.Unmarshal(data, &probe); err != nil {
		return err
	}

	// Start from a clean value so that decoding into a reused FormatMetadata
	// cannot leave a variant from an earlier decode set next to the new one.
	*t = FormatMetadata{FormatType: probe.FormatType}

	switch probe.FormatType {
	case "pdf":
		t.Pdf = &PdfMetadata{}
		return json.Unmarshal(data, t.Pdf)
	case "docx":
		t.Docx = &DocxMetadata{}
		return json.Unmarshal(data, t.Docx)
	case "excel":
		t.Excel = &ExcelMetadata{}
		return json.Unmarshal(data, t.Excel)
	case "email":
		t.Email = &EmailMetadata{}
		return json.Unmarshal(data, t.Email)
	case "pptx":
		t.Pptx = &PptxMetadata{}
		return json.Unmarshal(data, t.Pptx)
	case "archive":
		t.Archive = &ArchiveMetadata{}
		return json.Unmarshal(data, t.Archive)
	case "image":
		t.Image = &ImageMetadata{}
		return json.Unmarshal(data, t.Image)
	case "xml":
		t.XML = &XMLMetadata{}
		return json.Unmarshal(data, t.XML)
	case "text":
		t.Text = &TextMetadata{}
		return json.Unmarshal(data, t.Text)
	case "html":
		t.HTML = &HTMLMetadata{}
		return json.Unmarshal(data, t.HTML)
	case "ocr":
		t.Ocr = &OcrMetadata{}
		return json.Unmarshal(data, t.Ocr)
	case "csv":
		t.Csv = &CsvMetadata{}
		return json.Unmarshal(data, t.Csv)
	case "bibtex":
		t.Bibtex = &BibtexMetadata{}
		return json.Unmarshal(data, t.Bibtex)
	case "citation":
		t.Citation = &CitationMetadata{}
		return json.Unmarshal(data, t.Citation)
	case "fiction_book":
		t.FictionBook = &FictionBookMetadata{}
		return json.Unmarshal(data, t.FictionBook)
	case "dbf":
		t.Dbf = &DbfMetadata{}
		return json.Unmarshal(data, t.Dbf)
	case "jats":
		t.Jats = &JatsMetadata{}
		return json.Unmarshal(data, t.Jats)
	case "epub":
		t.Epub = &EpubMetadata{}
		return json.Unmarshal(data, t.Epub)
	case "pst":
		t.Pst = &PstMetadata{}
		return json.Unmarshal(data, t.Pst)
	}
	// Unknown tag: keep it in FormatType and carry no payload.
	return nil
}

// TextDirection is a string-backed enumeration. Its values are the
// TextDirection* constants declared below; the string is the wire value.
type TextDirection string

const (
	// TextDirectionLeftToRight is the left-to-right text direction.
	TextDirectionLeftToRight TextDirection = "ltr"
	// TextDirectionRightToLeft is the right-to-left text direction.
	TextDirectionRightToLeft TextDirection = "rtl"
	// TextDirectionAuto is automatic text direction detection.
	TextDirectionAuto TextDirection = "auto"
)

// LinkType is a string-backed enumeration. Its values are the LinkType*
// constants declared below; the string is the wire value.
type LinkType string

const (
	// LinkTypeAnchor is an anchor link (#section).
	LinkTypeAnchor LinkType = "anchor"
	// LinkTypeInternal is an internal link (same domain).
	LinkTypeInternal LinkType = "internal"
	// LinkTypeExternal is an external link (different domain).
	LinkTypeExternal LinkType = "external"
	// LinkTypeEmail is an email link (mailto:).
	LinkTypeEmail LinkType = "email"
	// LinkTypePhone is a phone link (tel:).
	LinkTypePhone LinkType = "phone"
	// LinkTypeOther is any other link type.
	LinkTypeOther LinkType = "other"
)

// ImageType is a string-backed enumeration of image source kinds.
type ImageType string

// Known ImageType values.
const (
	// ImageTypeDataURI is a data URI image.
	ImageTypeDataURI ImageType = "data-uri"
	// ImageTypeInlineSvg is an inline SVG.
	ImageTypeInlineSvg ImageType = "inline-svg"
	// ImageTypeExternal is an external image URL.
	ImageTypeExternal ImageType = "external"
	// ImageTypeRelative is a relative path image.
	ImageTypeRelative ImageType = "relative"
)

// StructuredDataType is a string-backed enumeration of structured data formats.
type StructuredDataType string

// Known StructuredDataType values.
const (
	// StructuredDataTypeJSONLd is JSON-LD structured data.
	StructuredDataTypeJSONLd StructuredDataType = "json-ld"
	// StructuredDataTypeMicrodata is microdata.
	StructuredDataTypeMicrodata StructuredDataType = "microdata"
	// StructuredDataTypeRdFa is RDFa.
	StructuredDataTypeRdFa StructuredDataType = "rdfa"
)

// OcrBoundingGeometry is the bounding geometry for an OCR element.
//
// Supports both axis-aligned rectangles (from Tesseract) and 4-point quadrilaterals
// (from PaddleOCR and rotated text detection).
//
// Variants: Rectangle, Quadrilateral.
//
// Sealed interface — use one of OcrBoundingGeometryRectangle, OcrBoundingGeometryQuadrilateral.
type OcrBoundingGeometry interface {
	// isOcrBoundingGeometry is an unexported marker method; because it is
	// unexported, only types declared in this package can satisfy the interface.
	isOcrBoundingGeometry()
	// Type returns the string that identifies the concrete variant.
	Type() string
}

// OcrBoundingGeometryRectangle is the axis-aligned bounding box variant of
// OcrBoundingGeometry (typical for Tesseract output).
type OcrBoundingGeometryRectangle struct {
	// Left x-coordinate in pixels
	Left uint32 `json:"left"`
	// Top y-coordinate in pixels
	Top uint32 `json:"top"`
	// Width in pixels
	Width uint32 `json:"width"`
	// Height in pixels
	Height uint32 `json:"height"`
}

// isOcrBoundingGeometry marks the rectangle as an OcrBoundingGeometry variant.
func (OcrBoundingGeometryRectangle) isOcrBoundingGeometry() {}

// Type returns the discriminator string for this variant, "rectangle".
func (OcrBoundingGeometryRectangle) Type() string { return "rectangle" }

// MarshalJSON encodes the rectangle as a JSON object whose first member is the
// "type" discriminator, followed by left, top, width and height.
func (v OcrBoundingGeometryRectangle) MarshalJSON() ([]byte, error) {
	// fields shares the struct layout and tags but carries no methods, so
	// embedding it flattens the members without re-entering MarshalJSON.
	type fields OcrBoundingGeometryRectangle
	tagged := struct {
		Type string `json:"type"`
		fields
	}{
		Type:   v.Type(),
		fields: fields(v),
	}
	return json.Marshal(tagged)
}

// OcrBoundingGeometryQuadrilateral is the 4-point quadrilateral variant of
// OcrBoundingGeometry, for rotated/skewed text (PaddleOCR).
//
// Points are in clockwise order starting from top-left:
// `[top_left, top_right, bottom_right, bottom_left]`
//
// NOTE(review): Points is documented as four corner points but is typed as a
// string in this binding — confirm the expected wire format.
type OcrBoundingGeometryQuadrilateral struct {
	// Four corner points as `[(x, y), ...]` in clockwise order
	Points string `json:"points"`
}

// isOcrBoundingGeometry marks the quadrilateral as an OcrBoundingGeometry variant.
func (OcrBoundingGeometryQuadrilateral) isOcrBoundingGeometry() {}

// Type returns the discriminator string for this variant, "quadrilateral".
func (OcrBoundingGeometryQuadrilateral) Type() string { return "quadrilateral" }

// MarshalJSON encodes the quadrilateral as a JSON object whose first member is
// the "type" discriminator, followed by points.
func (v OcrBoundingGeometryQuadrilateral) MarshalJSON() ([]byte, error) {
	// fields shares the struct layout and tags but carries no methods, so
	// embedding it flattens the members without re-entering MarshalJSON.
	type fields OcrBoundingGeometryQuadrilateral
	tagged := struct {
		Type string `json:"type"`
		fields
	}{
		Type:   v.Type(),
		fields: fields(v),
	}
	return json.Marshal(tagged)
}

// UnmarshalOcrBoundingGeometry decodes JSON data into the concrete
// OcrBoundingGeometry variant named by its "type" member.
//
// The variant is returned by value. A json.Unmarshal failure from either
// decoding pass is returned unchanged; a discriminator that matches no known
// variant yields an "unknown OcrBoundingGeometry type" error.
func UnmarshalOcrBoundingGeometry(data []byte) (OcrBoundingGeometry, error) {
	// First pass: read only the discriminator.
	var envelope struct {
		Type string `json:"type"`
	}
	if err := json.Unmarshal(data, &envelope); err != nil {
		return nil, err
	}

	// Second pass: decode the same bytes into the variant the tag names.
	switch tag := envelope.Type; tag {
	case "rectangle":
		rect := new(OcrBoundingGeometryRectangle)
		if err := json.Unmarshal(data, rect); err != nil {
			return nil, err
		}
		return *rect, nil
	case "quadrilateral":
		quad := new(OcrBoundingGeometryQuadrilateral)
		if err := json.Unmarshal(data, quad); err != nil {
			return nil, err
		}
		return *quad, nil
	default:
		return nil, fmt.Errorf("unknown OcrBoundingGeometry type: %q", tag)
	}
}

// OcrElementLevel is a string-backed enumeration of OCR element granularities.
type OcrElementLevel string

// Known OcrElementLevel values.
const (
	// OcrElementLevelWord is an individual word.
	OcrElementLevelWord OcrElementLevel = "word"
	// OcrElementLevelLine is a line of text (default for PaddleOCR).
	OcrElementLevelLine OcrElementLevel = "line"
	// OcrElementLevelBlock is a paragraph or text block.
	OcrElementLevelBlock OcrElementLevel = "block"
	// OcrElementLevelPage is a page-level element.
	OcrElementLevelPage OcrElementLevel = "page"
)

// PageUnitType is a string-backed enumeration of page-like document units.
type PageUnitType string

// Known PageUnitType values.
const (
	// PageUnitTypePage is a standard document page (PDF, DOCX, images).
	PageUnitTypePage PageUnitType = "page"
	// PageUnitTypeSlide is a presentation slide (PPTX, ODP).
	PageUnitTypeSlide PageUnitType = "slide"
	// PageUnitTypeSheet is a spreadsheet sheet (XLSX, ODS).
	PageUnitTypeSheet PageUnitType = "sheet"
)

// DiffLine is a single line in a unified-diff hunk, represented in this
// binding as a string.
//
// Upstream (Rust) note: defined here (rather than only in `crate::diff`) so
// `RevisionDelta` can reference it unconditionally, without requiring the
// `diff` Cargo feature. `crate::diff` re-exports this type verbatim.
type DiffLine string

// RevisionKind is a string-backed enumeration of revision categories.
type RevisionKind string

// Known RevisionKind values.
const (
	// RevisionKindInsertion means text or content was inserted.
	RevisionKindInsertion RevisionKind = "insertion"
	// RevisionKindDeletion means text or content was deleted.
	RevisionKindDeletion RevisionKind = "deletion"
	// RevisionKindFormatChange means run-level formatting (font, size, colour, …) was changed.
	RevisionKindFormatChange RevisionKind = "format_change"
	// RevisionKindComment is a reviewer comment or annotation.
	RevisionKindComment RevisionKind = "comment"
)

// RevisionAnchor is a best-effort document location for a revision.
//
// Variants: Paragraph, TableCell, Page, Slide, Sheet.
//
// Sealed interface — use one of RevisionAnchorParagraph, RevisionAnchorTableCell,
// RevisionAnchorPage, RevisionAnchorSlide, RevisionAnchorSheet.
type RevisionAnchor interface {
	// isRevisionAnchor is an unexported marker method; because it is
	// unexported, only types declared in this package can satisfy the interface.
	isRevisionAnchor()
	// Type returns the string that identifies the concrete variant.
	Type() string
}

// RevisionAnchorParagraph is the RevisionAnchor variant for a body paragraph,
// identified by its zero-based index in the document flow.
type RevisionAnchorParagraph struct {
	// Zero-based index of the paragraph in document order.
	Index uint `json:"index"`
}

// isRevisionAnchor marks the paragraph anchor as a RevisionAnchor variant.
func (RevisionAnchorParagraph) isRevisionAnchor() {}

// Type returns the discriminator string for this variant, "paragraph".
func (RevisionAnchorParagraph) Type() string { return "paragraph" }

// MarshalJSON encodes the anchor as a JSON object whose first member is the
// "type" discriminator, followed by index.
func (v RevisionAnchorParagraph) MarshalJSON() ([]byte, error) {
	// fields shares the struct layout and tags but carries no methods, so
	// embedding it flattens the members without re-entering MarshalJSON.
	type fields RevisionAnchorParagraph
	tagged := struct {
		Type string `json:"type"`
		fields
	}{
		Type:   v.Type(),
		fields: fields(v),
	}
	return json.Marshal(tagged)
}

// RevisionAnchorTableCell is the RevisionAnchor variant for a cell inside a table.
type RevisionAnchorTableCell struct {
	// Zero-based row index within the table.
	Row uint `json:"row"`
	// Zero-based column index within the table.
	Col uint `json:"col"`
	// Zero-based index of the table in document order.
	TableIndex uint `json:"table_index"`
}

// isRevisionAnchor marks the table-cell anchor as a RevisionAnchor variant.
func (RevisionAnchorTableCell) isRevisionAnchor() {}

// Type returns the discriminator string for this variant, "table_cell".
func (RevisionAnchorTableCell) Type() string { return "table_cell" }

// MarshalJSON encodes the anchor as a JSON object whose first member is the
// "type" discriminator, followed by row, col and table_index.
func (v RevisionAnchorTableCell) MarshalJSON() ([]byte, error) {
	// fields shares the struct layout and tags but carries no methods, so
	// embedding it flattens the members without re-entering MarshalJSON.
	type fields RevisionAnchorTableCell
	tagged := struct {
		Type string `json:"type"`
		fields
	}{
		Type:   v.Type(),
		fields: fields(v),
	}
	return json.Marshal(tagged)
}

// RevisionAnchorPage is the RevisionAnchor variant for a page, identified by
// its zero-based index.
type RevisionAnchorPage struct {
	// Zero-based page index.
	Index uint `json:"index"`
}

// isRevisionAnchor marks the page anchor as a RevisionAnchor variant.
func (RevisionAnchorPage) isRevisionAnchor() {}

// Type returns the discriminator string for this variant, "page".
func (RevisionAnchorPage) Type() string { return "page" }

// MarshalJSON encodes the anchor as a JSON object whose first member is the
// "type" discriminator, followed by index.
func (v RevisionAnchorPage) MarshalJSON() ([]byte, error) {
	// fields shares the struct layout and tags but carries no methods, so
	// embedding it flattens the members without re-entering MarshalJSON.
	type fields RevisionAnchorPage
	tagged := struct {
		Type string `json:"type"`
		fields
	}{
		Type:   v.Type(),
		fields: fields(v),
	}
	return json.Marshal(tagged)
}

// RevisionAnchorSlide is the RevisionAnchor variant for a presentation slide,
// identified by its zero-based index.
type RevisionAnchorSlide struct {
	// Zero-based slide index.
	Index uint `json:"index"`
}

// isRevisionAnchor marks the slide anchor as a RevisionAnchor variant.
func (RevisionAnchorSlide) isRevisionAnchor() {}

// Type returns the discriminator string for this variant, "slide".
func (RevisionAnchorSlide) Type() string { return "slide" }

// MarshalJSON encodes the anchor as a JSON object whose first member is the
// "type" discriminator, followed by index.
func (v RevisionAnchorSlide) MarshalJSON() ([]byte, error) {
	// fields shares the struct layout and tags but carries no methods, so
	// embedding it flattens the members without re-entering MarshalJSON.
	type fields RevisionAnchorSlide
	tagged := struct {
		Type string `json:"type"`
		fields
	}{
		Type:   v.Type(),
		fields: fields(v),
	}
	return json.Marshal(tagged)
}

// RevisionAnchorSheet is the RevisionAnchor variant for a spreadsheet cell or
// range, identified by sheet index and optional name.
type RevisionAnchorSheet struct {
	// Zero-based sheet index.
	Index uint `json:"index"`
	// Sheet display name when available.
	Name *string `json:"name,omitempty"`
}

// isRevisionAnchor marks the sheet anchor as a RevisionAnchor variant.
func (RevisionAnchorSheet) isRevisionAnchor() {}

// Type returns the discriminator string for this variant, "sheet".
func (RevisionAnchorSheet) Type() string { return "sheet" }

// MarshalJSON encodes the anchor as a JSON object whose first member is the
// "type" discriminator, followed by index and, when Name is non-nil, name.
func (v RevisionAnchorSheet) MarshalJSON() ([]byte, error) {
	// fields shares the struct layout and tags but carries no methods, so
	// embedding it flattens the members without re-entering MarshalJSON.
	type fields RevisionAnchorSheet
	tagged := struct {
		Type string `json:"type"`
		fields
	}{
		Type:   v.Type(),
		fields: fields(v),
	}
	return json.Marshal(tagged)
}

// UnmarshalRevisionAnchor decodes JSON data into the concrete RevisionAnchor
// variant named by its "type" member.
//
// The variant is returned by value. A json.Unmarshal failure from either
// decoding pass is returned unchanged; a discriminator that matches no known
// variant yields an "unknown RevisionAnchor type" error.
func UnmarshalRevisionAnchor(data []byte) (RevisionAnchor, error) {
	// First pass: read only the discriminator.
	var envelope struct {
		Type string `json:"type"`
	}
	if err := json.Unmarshal(data, &envelope); err != nil {
		return nil, err
	}

	// Second pass: decode the same bytes into the variant the tag names.
	switch tag := envelope.Type; tag {
	case "paragraph":
		anchor := new(RevisionAnchorParagraph)
		if err := json.Unmarshal(data, anchor); err != nil {
			return nil, err
		}
		return *anchor, nil
	case "table_cell":
		anchor := new(RevisionAnchorTableCell)
		if err := json.Unmarshal(data, anchor); err != nil {
			return nil, err
		}
		return *anchor, nil
	case "page":
		anchor := new(RevisionAnchorPage)
		if err := json.Unmarshal(data, anchor); err != nil {
			return nil, err
		}
		return *anchor, nil
	case "slide":
		anchor := new(RevisionAnchorSlide)
		if err := json.Unmarshal(data, anchor); err != nil {
			return nil, err
		}
		return *anchor, nil
	case "sheet":
		anchor := new(RevisionAnchorSheet)
		if err := json.Unmarshal(data, anchor); err != nil {
			return nil, err
		}
		return *anchor, nil
	default:
		return nil, fmt.Errorf("unknown RevisionAnchor type: %q", tag)
	}
}

// URIKind is a string-backed enumeration of URI categories.
type URIKind string

// Known URIKind values.
const (
	// URIKindHyperlink is a clickable hyperlink (web URL, file link).
	URIKindHyperlink URIKind = "hyperlink"
	// URIKindImage is an image or media resource reference.
	URIKindImage URIKind = "image"
	// URIKindAnchor is an internal anchor or cross-reference target.
	URIKindAnchor URIKind = "anchor"
	// URIKindCitation is a citation or bibliographic reference (DOI, academic ref).
	URIKindCitation URIKind = "citation"
	// URIKindReference is a general reference (e.g. `\ref{}` in LaTeX, `:ref:` in RST).
	URIKindReference URIKind = "reference"
	// URIKindEmail is an email address (`mailto:` link or bare email).
	URIKindEmail URIKind = "email"
)

// KeywordAlgorithm is a string-backed enumeration of keyword extraction algorithms.
type KeywordAlgorithm string

// Known KeywordAlgorithm values.
const (
	// KeywordAlgorithmYake is YAKE (Yet Another Keyword Extractor) - statistical approach.
	KeywordAlgorithmYake KeywordAlgorithm = "yake"
	// KeywordAlgorithmRake is RAKE (Rapid Automatic Keyword Extraction) - co-occurrence based.
	KeywordAlgorithmRake KeywordAlgorithm = "rake"
)

// PSMMode is a string-backed enumeration type.
//
// NOTE(review): presumably the OCR page segmentation mode — confirm against
// the upstream definition; the generated source carries no description.
type PSMMode string

// Known PSMMode values.
const (
	// PSMModeOsdOnly is the OsdOnly variant of PSMMode.
	PSMModeOsdOnly PSMMode = "osd_only"
	// PSMModeAutoOsd is the AutoOsd variant of PSMMode.
	PSMModeAutoOsd PSMMode = "auto_osd"
	// PSMModeAutoOnly is the AutoOnly variant of PSMMode.
	PSMModeAutoOnly PSMMode = "auto_only"
	// PSMModeAuto is the Auto variant of PSMMode.
	PSMModeAuto PSMMode = "auto"
	// PSMModeSingleColumn is the SingleColumn variant of PSMMode.
	PSMModeSingleColumn PSMMode = "single_column"
	// PSMModeSingleBlockVertical is the SingleBlockVertical variant of PSMMode.
	PSMModeSingleBlockVertical PSMMode = "single_block_vertical"
	// PSMModeSingleBlock is the SingleBlock variant of PSMMode.
	PSMModeSingleBlock PSMMode = "single_block"
	// PSMModeSingleLine is the SingleLine variant of PSMMode.
	PSMModeSingleLine PSMMode = "single_line"
	// PSMModeSingleWord is the SingleWord variant of PSMMode.
	PSMModeSingleWord PSMMode = "single_word"
	// PSMModeCircleWord is the CircleWord variant of PSMMode.
	PSMModeCircleWord PSMMode = "circle_word"
	// PSMModeSingleChar is the SingleChar variant of PSMMode.
	PSMModeSingleChar PSMMode = "single_char"
)

// PaddleLanguage is a string-backed enumeration of PaddleOCR language/script choices.
type PaddleLanguage string

// Known PaddleLanguage values.
const (
	// PaddleLanguageEnglish is English.
	PaddleLanguageEnglish PaddleLanguage = "english"
	// PaddleLanguageChinese is simplified Chinese.
	PaddleLanguageChinese PaddleLanguage = "chinese"
	// PaddleLanguageJapanese is Japanese.
	PaddleLanguageJapanese PaddleLanguage = "japanese"
	// PaddleLanguageKorean is Korean.
	PaddleLanguageKorean PaddleLanguage = "korean"
	// PaddleLanguageGerman is German.
	PaddleLanguageGerman PaddleLanguage = "german"
	// PaddleLanguageFrench is French.
	PaddleLanguageFrench PaddleLanguage = "french"
	// PaddleLanguageLatin is Latin script (covers most European languages).
	PaddleLanguageLatin PaddleLanguage = "latin"
	// PaddleLanguageCyrillic is Cyrillic (Russian and related).
	PaddleLanguageCyrillic PaddleLanguage = "cyrillic"
	// PaddleLanguageTraditionalChinese is traditional Chinese.
	PaddleLanguageTraditionalChinese PaddleLanguage = "traditional_chinese"
	// PaddleLanguageThai is Thai.
	PaddleLanguageThai PaddleLanguage = "thai"
	// PaddleLanguageGreek is Greek.
	PaddleLanguageGreek PaddleLanguage = "greek"
	// PaddleLanguageEastSlavic is East Slavic (Russian, Ukrainian, Belarusian).
	PaddleLanguageEastSlavic PaddleLanguage = "east_slavic"
	// PaddleLanguageArabic is Arabic (Arabic, Persian, Urdu).
	PaddleLanguageArabic PaddleLanguage = "arabic"
	// PaddleLanguageDevanagari is Devanagari (Hindi, Marathi, Sanskrit, Nepali).
	PaddleLanguageDevanagari PaddleLanguage = "devanagari"
	// PaddleLanguageTamil is Tamil.
	PaddleLanguageTamil PaddleLanguage = "tamil"
	// PaddleLanguageTelugu is Telugu.
	PaddleLanguageTelugu PaddleLanguage = "telugu"
)

// LayoutClass is a string-backed enumeration of layout region classes.
type LayoutClass string

// Known LayoutClass values.
const (
	// LayoutClassCaption is the Caption variant of LayoutClass.
	LayoutClassCaption LayoutClass = "caption"
	// LayoutClassFootnote is the Footnote variant of LayoutClass.
	LayoutClassFootnote LayoutClass = "footnote"
	// LayoutClassFormula is the Formula variant of LayoutClass.
	LayoutClassFormula LayoutClass = "formula"
	// LayoutClassListItem is the ListItem variant of LayoutClass.
	LayoutClassListItem LayoutClass = "list_item"
	// LayoutClassPageFooter is the PageFooter variant of LayoutClass.
	LayoutClassPageFooter LayoutClass = "page_footer"
	// LayoutClassPageHeader is the PageHeader variant of LayoutClass.
	LayoutClassPageHeader LayoutClass = "page_header"
	// LayoutClassPicture is the Picture variant of LayoutClass.
	LayoutClassPicture LayoutClass = "picture"
	// LayoutClassSectionHeader is the SectionHeader variant of LayoutClass.
	LayoutClassSectionHeader LayoutClass = "section_header"
	// LayoutClassTable is the Table variant of LayoutClass.
	LayoutClassTable LayoutClass = "table"
	// LayoutClassText is the Text variant of LayoutClass.
	LayoutClassText LayoutClass = "text"
	// LayoutClassTitle is the Title variant of LayoutClass.
	LayoutClassTitle LayoutClass = "title"
	// LayoutClassDocumentIndex is the DocumentIndex variant of LayoutClass.
	LayoutClassDocumentIndex LayoutClass = "document_index"
	// LayoutClassCode is the Code variant of LayoutClass.
	LayoutClassCode LayoutClass = "code"
	// LayoutClassCheckboxSelected is the CheckboxSelected variant of LayoutClass.
	LayoutClassCheckboxSelected LayoutClass = "checkbox_selected"
	// LayoutClassCheckboxUnselected is the CheckboxUnselected variant of LayoutClass.
	LayoutClassCheckboxUnselected LayoutClass = "checkbox_unselected"
	// LayoutClassForm is the Form variant of LayoutClass.
	LayoutClassForm LayoutClass = "form"
	// LayoutClassKeyValueRegion is the KeyValueRegion variant of LayoutClass.
	LayoutClassKeyValueRegion LayoutClass = "key_value_region"
)

// CacheStats is a set of cache statistics.
//
// Every field is always emitted when marshaling (no omitempty). The generated
// source carries no per-field description.
//
// NOTE(review): going by the field names, sizes are presumably in megabytes
// and ages in days — confirm against the upstream definition.
type CacheStats struct {
	TotalFiles        uint    `json:"total_files"`
	TotalSizeMb       float64 `json:"total_size_mb"`
	AvailableSpaceMb  float64 `json:"available_space_mb"`
	OldestFileAgeDays float64 `json:"oldest_file_age_days"`
	NewestFileAgeDays float64 `json:"newest_file_age_days"`
}

// AccelerationConfig is the hardware acceleration configuration for ONNX Runtime models.
//
// Controls which execution provider (CPU, CoreML, CUDA, TensorRT) is used
// for inference in layout detection and embedding generation.
//
// Example (upstream Rust API, shown for reference):
//
//	// Auto-select: CoreML on macOS, CUDA on Linux, CPU elsewhere
//	let config = AccelerationConfig::default();
//
//	// Force CPU only
//	let config = AccelerationConfig {
//	    provider: kreuzberg::ExecutionProviderType::Cpu,
//	    ..Default::default()
//	};
type AccelerationConfig struct {
	// Execution provider to use for ONNX inference.
	// Omitted from the JSON when it is the zero value (omitempty).
	Provider ExecutionProviderType `json:"provider,omitempty"`
	// GPU device ID (for CUDA/TensorRT). Ignored for CPU/CoreML/Auto.
	// Always emitted, including when 0.
	DeviceID uint32 `json:"device_id"`
}

// ContentFilterConfig is the cross-extractor content filtering configuration.
//
// Controls whether "furniture" content (headers, footers, page numbers,
// watermarks, repeating text) is included in or stripped from extraction
// results. Applies across all extractors (PDF, DOCX, RTF, ODT, HTML, etc.)
// with format-specific implementation.
//
// When nil on `ExtractionConfig` (`None` upstream), each extractor uses its
// current default behavior unchanged.
type ContentFilterConfig struct {
	// Include running headers in extraction output.
	//
	// - PDF: Disables top-margin furniture stripping and prevents the layout
	// model from treating `PageHeader`-classified regions as furniture.
	// - DOCX: Includes document headers in text output.
	// - RTF/ODT: Headers already included; this is a no-op when true.
	// - HTML/EPUB: Keeps `<header>` element content.
	//
	// Default: `false` (headers are stripped or excluded).
	IncludeHeaders bool `json:"include_headers"`
	// Include running footers in extraction output.
	//
	// - PDF: Disables bottom-margin furniture stripping and prevents the layout
	// model from treating `PageFooter`-classified regions as furniture.
	// - DOCX: Includes document footers in text output.
	// - RTF/ODT: Footers already included; this is a no-op when true.
	// - HTML/EPUB: Keeps `<footer>` element content.
	//
	// Default: `false` (footers are stripped or excluded).
	IncludeFooters bool `json:"include_footers"`
	// Enable the heuristic cross-page repeating text detector.
	//
	// When `true` (default), text that repeats verbatim across a supermajority
	// of pages is classified as furniture and stripped.  Disable this if brand
	// names or repeated headings are being incorrectly removed by the heuristic.
	//
	// Note: when a layout-detection model is active, the model may independently
	// classify page-header / page-footer regions as furniture on a per-page basis.
	// To preserve those regions, set `include_headers = true`, `include_footers = true`,
	// or both, in addition to disabling this flag.
	//
	// Primarily affects PDF extraction.
	//
	// Default: `true`.
	//
	// A nil pointer is omitted from the JSON (omitempty); presumably the
	// documented default then applies — confirm against the native side.
	StripRepeatingText *bool `json:"strip_repeating_text,omitempty"`
	// Include watermark text in extraction output.
	//
	// - PDF: Keeps watermark artifacts and arXiv identifiers.
	// - Other formats: No effect currently.
	//
	// Default: `false` (watermarks are stripped).
	IncludeWatermarks bool `json:"include_watermarks"`
}

// EmailConfig is the configuration for email extraction.
type EmailConfig struct {
	// Windows codepage number to use when an MSG file contains no codepage property.
	// Defaults to nil (`None` upstream), which falls back to windows-1252.
	// A nil pointer is omitted from the JSON (omitempty).
	//
	// If an unrecognized or invalid codepage number is supplied (including 0),
	// the behavior silently falls back to windows-1252 — the same as when the
	// MSG file itself contains an unrecognized codepage. No error or warning is
	// emitted. Users should verify output when supplying unusual values.
	//
	// Common values:
	// - 1250: Central European (Polish, Czech, Hungarian, etc.)
	// - 1251: Cyrillic (Russian, Ukrainian, Bulgarian, etc.)
	// - 1252: Western European (default)
	// - 1253: Greek
	// - 1254: Turkish
	// - 1255: Hebrew
	// - 1256: Arabic
	// - 932:  Japanese (Shift-JIS)
	// - 936:  Simplified Chinese (GBK)
	MsgFallbackCodepage *uint32 `json:"msg_fallback_codepage,omitempty"`
}

// ExtractionConfig is the main extraction configuration.
//
// This struct contains all configuration options for the extraction process.
// It can be loaded from TOML, YAML, or JSON files, or created programmatically.
//
// The field comments below are carried over from the upstream Rust
// documentation: `None` corresponds to a nil pointer (or nil slice) in this
// binding and `Some(x)` to a non-nil value. Fields tagged omitempty are left
// out of the JSON when nil/zero.
//
// Example (upstream Rust API, shown for reference):
//
//	// Create with defaults
//	let config = ExtractionConfig::default();
//
//	// Load from TOML file
//	// let config = ExtractionConfig::from_toml_file("kreuzberg.toml")?;
type ExtractionConfig struct {
	// Enable caching of extraction results
	UseCache *bool `json:"use_cache,omitempty"`
	// Enable quality post-processing
	EnableQualityProcessing *bool `json:"enable_quality_processing,omitempty"`
	// OCR configuration (None = OCR disabled)
	Ocr *OcrConfig `json:"ocr,omitempty"`
	// Force OCR even for searchable PDFs
	ForceOcr bool `json:"force_ocr"`
	// Force OCR on specific pages only (1-indexed page numbers, must be >= 1).
	//
	// When set, only the listed pages are OCR'd regardless of text layer quality.
	// Unlisted pages use native text extraction. Ignored when `force_ocr` is `true`.
	// Only applies to PDF documents. Duplicates are automatically deduplicated.
	// An `ocr` config is recommended for backend/language selection; defaults are used if absent.
	ForceOcrPages []uint32 `json:"force_ocr_pages,omitempty"`
	// Disable OCR entirely, even for images.
	//
	// When `true`, OCR is skipped for all document types. Images return metadata
	// only (dimensions, format, EXIF) without text extraction. PDFs use only
	// native text extraction without OCR fallback.
	//
	// Cannot be `true` simultaneously with `force_ocr`.
	//
	// *Added in v4.7.0.*
	DisableOcr bool `json:"disable_ocr"`
	// Text chunking configuration (None = chunking disabled)
	Chunking *ChunkingConfig `json:"chunking,omitempty"`
	// Content filtering configuration (None = use extractor defaults).
	//
	// Controls whether document "furniture" (headers, footers, watermarks,
	// repeating text) is included in or stripped from extraction results.
	// See [`ContentFilterConfig`] for per-field documentation.
	ContentFilter *ContentFilterConfig `json:"content_filter,omitempty"`
	// Image extraction configuration (None = no image extraction)
	Images *ImageExtractionConfig `json:"images,omitempty"`
	// PDF-specific options (None = use defaults)
	PdfOptions *PdfConfig `json:"pdf_options,omitempty"`
	// Token reduction configuration (None = no token reduction)
	TokenReduction *TokenReductionOptions `json:"token_reduction,omitempty"`
	// Language detection configuration (None = no language detection)
	LanguageDetection *LanguageDetectionConfig `json:"language_detection,omitempty"`
	// Page extraction configuration (None = no page tracking)
	Pages *PageConfig `json:"pages,omitempty"`
	// Keyword extraction configuration (None = no keyword extraction)
	Keywords *KeywordConfig `json:"keywords,omitempty"`
	// Post-processor configuration (None = use defaults)
	Postprocessor *PostProcessorConfig `json:"postprocessor,omitempty"`
	// HTML to Markdown conversion options (None = use defaults)
	//
	// Configure how HTML documents are converted to Markdown, including heading styles,
	// list formatting, code block styles, and preprocessing options.
	//
	// NOTE(review): typed as *string in this binding although the description
	// is of a structured options object — confirm the expected string contents.
	HTMLOptions *string `json:"html_options,omitempty"`
	// Styled HTML output configuration.
	//
	// When set alongside `output_format = OutputFormat::Html`, the extraction
	// pipeline uses [`StyledHtmlRenderer`](crate::rendering::StyledHtmlRenderer)
	// which emits stable `kb-*` CSS class hooks on every structural element
	// and optionally embeds theme CSS or user-supplied CSS in a `<style>` block.
	//
	// When `None`, the existing plain comrak-based HTML renderer is used.
	HTMLOutput *HTMLOutputConfig `json:"html_output,omitempty"`
	// Default per-file timeout in seconds for batch extraction.
	//
	// When set, each file in a batch will be canceled after this duration
	// unless overridden by [`FileExtractionConfig::timeout_secs`].
	//
	// Defaults to `Some(60)` to prevent pathological files (e.g. deeply
	// nested archives, documents with millions of cells) from running
	// indefinitely and exhausting caller resources. Set to `None` to
	// disable the timeout for trusted input or long-running workloads.
	//
	// NOTE(review): a nil pointer is omitted from the JSON (omitempty), so it
	// is not evident from this file whether nil yields the 60s default or
	// disables the timeout — confirm against the native side.
	ExtractionTimeoutSecs *uint64 `json:"extraction_timeout_secs,omitempty"`
	// Maximum concurrent extractions in batch operations (None = (num_cpus × 1.5).ceil()).
	//
	// Limits parallelism to prevent resource exhaustion when processing
	// large batches. Defaults to (num_cpus × 1.5).ceil() when not set.
	MaxConcurrentExtractions *uint `json:"max_concurrent_extractions,omitempty"`
	// Result structure format
	//
	// Controls whether results are returned in unified format (default) with all
	// content in the `content` field, or element-based format with semantic
	// elements (for Unstructured-compatible output).
	ResultFormat ResultFormat `json:"result_format,omitempty"`
	// Security limits for archive extraction.
	//
	// Controls maximum archive size, compression ratio, file count, and other
	// security thresholds to prevent decompression bomb attacks. Also caps
	// nesting depth, iteration count, entity / token length, total
	// content size, and table cell count for every extraction path that
	// ingests user-controlled bytes.
	// When `None`, default limits are used.
	SecurityLimits *SecurityLimits `json:"security_limits,omitempty"`
	// Maximum uncompressed size in bytes for a single embedded file before
	// recursive extraction is attempted (default: 50 MiB).
	//
	// Applies to embedded objects inside OOXML containers (DOCX, PPTX) and
	// to email attachments processed via recursive extraction. Files that
	// exceed this limit are skipped with a `ProcessingWarning` rather than
	// passed to the extraction pipeline, preventing a single oversized
	// embedded object from consuming unbounded memory or time.
	//
	// Set to `None` to disable the per-embedded-file cap (falls back to
	// `security_limits.max_archive_size` as the only guard).
	MaxEmbeddedFileBytes *uint64 `json:"max_embedded_file_bytes,omitempty"`
	// Content text format (default: Plain).
	//
	// Controls the format of the extracted content:
	// - `Plain`: Raw extracted text (default)
	// - `Markdown`: Markdown formatted output
	// - `Djot`: Djot markup format (requires djot feature)
	// - `Html`: HTML formatted output
	//
	// When set to a structured format, extraction results will include
	// formatted output. The `formatted_content` field may be populated
	// when format conversion is applied.
	OutputFormat *OutputFormat `json:"output_format,omitempty"`
	// Layout detection configuration (None = layout detection disabled).
	//
	// When set, PDF pages and images are analyzed for document structure
	// (headings, code, formulas, tables, figures, etc.) using RT-DETR models
	// via ONNX Runtime. For PDFs, layout hints override paragraph classification
	// in the markdown pipeline. For images, per-region OCR is performed with
	// markdown formatting based on detected layout classes.
	// Requires the `layout-detection` feature to run inference; the field is
	// present whenever the `layout-types` feature is active (which includes
	// `layout-detection` as well as the no-ORT target groups).
	Layout *LayoutDetectionConfig `json:"layout,omitempty"`
	// Run layout detection on the non-OCR PDF markdown path.
	//
	// When `true` and `layout` is `Some(_)`, layout regions inform heading,
	// table, list, and figure detection in the structure pipeline that would
	// otherwise rely on font-clustering heuristics alone. Significantly
	// improves SF1 (structural F1) at the cost of inference latency
	// (~150-300ms/page CPU, ~20-50ms/page GPU). Default: `false`.
	// Requires the `layout-detection` feature.
	UseLayoutForMarkdown bool `json:"use_layout_for_markdown"`
	// Enable structured document tree output.
	//
	// When true, populates the `document` field on `ExtractionResult` with a
	// hierarchical `DocumentStructure` containing heading-driven section nesting,
	// table grids, content layer classification, and inline annotations.
	//
	// Independent of `result_format` — can be combined with Unified or ElementBased.
	IncludeDocumentStructure bool `json:"include_document_structure"`
	// Hardware acceleration configuration for ONNX Runtime models.
	//
	// Controls execution provider selection for layout detection and embedding
	// models. When `None`, uses platform defaults (CoreML on macOS, CUDA on
	// Linux, CPU on Windows).
	Acceleration *AccelerationConfig `json:"acceleration,omitempty"`
	// Cache namespace for tenant isolation.
	//
	// When set, cache entries are stored under `{cache_dir}/{namespace}/`.
	// Must be alphanumeric, hyphens, or underscores only (max 64 chars).
	// Different namespaces have isolated cache spaces on the same filesystem.
	CacheNamespace *string `json:"cache_namespace,omitempty"`
	// Per-request cache TTL in seconds.
	//
	// Overrides the global `max_age_days` for this specific extraction.
	// When `0`, caching is completely skipped (no read or write).
	// When `None`, the global TTL applies.
	CacheTTLSecs *uint64 `json:"cache_ttl_secs,omitempty"`
	// Email extraction configuration (None = use defaults).
	//
	// Currently supports configuring the fallback codepage for MSG files
	// that do not specify one. See `EmailConfig` for details.
	Email *EmailConfig `json:"email,omitempty"`
	// Concurrency limits for constrained environments (None = use defaults).
	//
	// Controls Rayon thread pool size, ONNX Runtime intra-op threads, and
	// (when `max_concurrent_extractions` is unset) the batch concurrency
	// semaphore. See `ConcurrencyConfig` for details.
	//
	// NOTE(review): typed as *string in this binding rather than a
	// ConcurrencyConfig struct — confirm the expected string contents.
	Concurrency *string `json:"concurrency,omitempty"`
	// Maximum recursion depth for archive extraction (default: 3).
	// Set to 0 to disable recursive extraction (legacy behavior).
	//
	// This field has no omitempty, so a zero-valued ExtractionConfig marshals
	// `max_archive_depth` as 0 rather than leaving it out.
	MaxArchiveDepth uint `json:"max_archive_depth"`
	// Tree-sitter language pack configuration (None = tree-sitter disabled).
	//
	// When set, enables code file extraction using tree-sitter parsers.
	// Controls grammar download behavior and code analysis options.
	TreeSitter *TreeSitterConfig `json:"tree_sitter,omitempty"`
	// Structured extraction via LLM (None = disabled).
	//
	// When set, the extracted document content is sent to an LLM with the
	// provided JSON schema. The structured response is stored in
	// `ExtractionResult::structured_output`.
	StructuredExtraction *StructuredExtractionConfig `json:"structured_extraction,omitempty"`
	// Cancellation token for this extraction (None = no external cancellation).
	//
	// Pass a [`CancellationToken`] clone here and call [`CancellationToken::cancel`]
	// from another thread / task to abort the extraction in progress. The extractor
	// checks the token at safe checkpoints (before lock acquisition, between pages,
	// between batch items) and returns [`KreuzbergError::Cancelled`] when set.
	//
	// The upstream field is excluded from serialization because
	// `CancellationToken` is a runtime handle, not a configuration value.
	//
	// NOTE(review): in this binding the field is a *string and its tag emits
	// it as `cancel_token` whenever it is non-nil, which does not match the
	// upstream description — confirm the intended behavior.
	CancelToken *string `json:"cancel_token,omitempty"`
}

// FileExtractionConfig holds per-file extraction configuration overrides for batch processing.
//
// Every field is optional: each one is a pointer or a slice tagged
// `omitempty`, so a nil pointer or empty slice is left out of the JSON
// encoding and means "use the batch-level default."
// This type is used with `batch_extract_files` and
// `batch_extract_bytes` to allow heterogeneous
// extraction settings within a single batch.
//
// # Excluded Fields
//
// The following `ExtractionConfig` fields are batch-level only and
// cannot be overridden per file:
//   - `max_concurrent_extractions` — controls batch parallelism
//   - `use_cache` — global caching policy
//   - `acceleration` — shared ONNX execution provider
//   - `security_limits` — global archive security policy
//
// Example:
//
//	// Override just OCR forcing for a specific file.
//	config := FileExtractionConfig{
//	    ForceOcr: Ptr(true),
//	}
type FileExtractionConfig struct {
	// Override quality post-processing for this file.
	EnableQualityProcessing *bool `json:"enable_quality_processing,omitempty"`
	// Override OCR configuration for this file (nil = use batch default).
	Ocr *OcrConfig `json:"ocr,omitempty"`
	// Override force OCR for this file.
	ForceOcr *bool `json:"force_ocr,omitempty"`
	// Override force OCR pages for this file (1-indexed page numbers).
	ForceOcrPages []uint32 `json:"force_ocr_pages,omitempty"`
	// Override disable OCR for this file.
	DisableOcr *bool `json:"disable_ocr,omitempty"`
	// Override chunking configuration for this file.
	Chunking *ChunkingConfig `json:"chunking,omitempty"`
	// Override content filtering configuration for this file.
	ContentFilter *ContentFilterConfig `json:"content_filter,omitempty"`
	// Override image extraction configuration for this file.
	Images *ImageExtractionConfig `json:"images,omitempty"`
	// Override PDF options for this file.
	PdfOptions *PdfConfig `json:"pdf_options,omitempty"`
	// Override token reduction for this file.
	TokenReduction *TokenReductionOptions `json:"token_reduction,omitempty"`
	// Override language detection for this file.
	LanguageDetection *LanguageDetectionConfig `json:"language_detection,omitempty"`
	// Override page extraction for this file.
	Pages *PageConfig `json:"pages,omitempty"`
	// Override keyword extraction for this file.
	Keywords *KeywordConfig `json:"keywords,omitempty"`
	// Override post-processor for this file.
	Postprocessor *PostProcessorConfig `json:"postprocessor,omitempty"`
	// Override HTML conversion options for this file.
	//
	// NOTE(review): exposed as a plain *string rather than a typed options
	// struct; the expected string format is not visible here — confirm
	// against the FFI layer before relying on it.
	HTMLOptions *string `json:"html_options,omitempty"`
	// Override result format for this file.
	ResultFormat *ResultFormat `json:"result_format,omitempty"`
	// Override output content format for this file.
	OutputFormat *OutputFormat `json:"output_format,omitempty"`
	// Override document structure output for this file.
	IncludeDocumentStructure *bool `json:"include_document_structure,omitempty"`
	// Override layout detection for this file.
	Layout *LayoutDetectionConfig `json:"layout,omitempty"`
	// Override per-file extraction timeout in seconds.
	//
	// When set, the extraction for this file will be canceled after the
	// specified duration. A timed-out file produces an error result without
	// affecting other files in the batch.
	TimeoutSecs *uint64 `json:"timeout_secs,omitempty"`
	// Override tree-sitter configuration for this file.
	TreeSitter *TreeSitterConfig `json:"tree_sitter,omitempty"`
	// Override structured extraction configuration for this file.
	//
	// When set, enables LLM-based structured extraction with a JSON schema
	// for this specific file. The extracted content is sent to a VLM/LLM
	// and the response is parsed according to the provided schema.
	StructuredExtraction *StructuredExtractionConfig `json:"structured_extraction,omitempty"`
}

// BatchBytesItem is a batch item for byte array extraction.
//
// Used with `batch_extract_bytes` and `batch_extract_bytes_sync`
// to represent a single item in a batch extraction job.
//
// The type has a custom MarshalJSON that emits Content as a JSON array of
// integers instead of Go's default base64 string.
type BatchBytesItem struct {
	// The content bytes to extract from.
	Content []byte `json:"content"`
	// MIME type of the content (e.g., "application/pdf", "text/html").
	MimeType string `json:"mime_type"`
	// Per-item configuration overrides (nil uses batch-level defaults).
	Config *FileExtractionConfig `json:"config,omitempty"`
}

// MarshalJSON encodes the item with its `[]byte` payload written as a JSON
// array of integers — the shape Rust's serde `Vec<u8>` deserializer expects —
// rather than the base64 string Go would produce by default.
func (v BatchBytesItem) MarshalJSON() ([]byte, error) {
	// Widen each byte to an int so encoding/json emits a numeric array.
	// The slice is always allocated, so empty content encodes as `[]`.
	widened := make([]int, len(v.Content))
	for idx := range v.Content {
		widened[idx] = int(v.Content[idx])
	}

	// A stand-alone wire struct that names every field explicitly. Embedding
	// BatchBytesItem instead would yield two competing entries (base64 string
	// and int array) for the same JSON key. Non-byte fields pass through as-is.
	return json.Marshal(struct {
		Content  []int                 `json:"content"`
		MimeType string                `json:"mime_type"`
		Config   *FileExtractionConfig `json:"config,omitempty"`
	}{
		Content:  widened,
		MimeType: v.MimeType,
		Config:   v.Config,
	})
}

// BatchFileItem is a batch item for file extraction.
//
// Used with `batch_extract_files` and `batch_extract_files_sync`
// to represent a single file in a batch extraction job.
type BatchFileItem struct {
	// Path to the file to extract from.
	Path string `json:"path"`
	// Per-file configuration overrides (nil uses batch-level defaults).
	// Omitted from the JSON encoding when nil.
	Config *FileExtractionConfig `json:"config,omitempty"`
}

// ImageExtractionConfig holds image extraction configuration.
//
// Pointer fields are omitted from the JSON encoding when nil; the plain bool
// fields (IncludePageRasters, OcrTextOnly, AppendOcrText) carry no
// `omitempty` and are therefore always written.
type ImageExtractionConfig struct {
	// Extract images from documents.
	ExtractImages *bool `json:"extract_images,omitempty"`
	// Target DPI for image normalization.
	TargetDpi *int32 `json:"target_dpi,omitempty"`
	// Maximum dimension for images (width or height).
	MaxImageDimension *int32 `json:"max_image_dimension,omitempty"`
	// Whether to inject image reference placeholders into markdown output.
	// When `true` (default), image references like `![Image 1](embedded:p1_i0)`
	// are appended to the markdown. Set to `false` to extract images as data
	// without polluting the markdown output.
	InjectPlaceholders *bool `json:"inject_placeholders,omitempty"`
	// Automatically adjust DPI based on image content.
	AutoAdjustDpi *bool `json:"auto_adjust_dpi,omitempty"`
	// Minimum DPI threshold.
	MinDpi *int32 `json:"min_dpi,omitempty"`
	// Maximum DPI threshold.
	MaxDpi *int32 `json:"max_dpi,omitempty"`
	// Maximum number of image objects to extract per PDF page.
	//
	// Some PDFs (e.g. technical diagrams stored as thousands of raster fragments)
	// can trigger extremely long or indefinite extraction times when every image
	// object on a dense page is decoded individually via the PDF extractor. Setting this
	// limit causes kreuzberg to stop collecting individual images once the count
	// per page reaches the cap and emit a warning instead.
	//
	// nil (default) means no limit — all images are extracted.
	MaxImagesPerPage *uint32 `json:"max_images_per_page,omitempty"`
	// When `true` (default), extracted images are classified by kind and grouped
	// into clusters where they appear to belong to one figure.
	Classify *bool `json:"classify,omitempty"`
	// When `true`, full-page renders produced during OCR preprocessing are captured
	// and returned as `ImageKind::PageRaster` entries in `ExtractionResult.images`.
	//
	// **PDF + OCR only.** No rasters are captured for non-PDF inputs or when the
	// document-level OCR bypass is active (whole-document backend). When OCR is
	// enabled and this flag is set but the active backend skips per-page rendering,
	// a `ProcessingWarning` is emitted in `ExtractionResult.processing_warnings`.
	//
	// Defaults to `false`. Enable when downstream consumers need page thumbnails
	// (e.g. citation previews, visual grounding).
	IncludePageRasters bool `json:"include_page_rasters"`
	// Run OCR on extracted images and include the recognized text in the document content.
	//
	// When `true` (default) and `ExtractionConfig.ocr` is configured, extracted images
	// are processed with the configured OCR backend. Set to `false` to extract images
	// without OCR processing, even when OCR is enabled.
	RunOcrOnImages *bool `json:"run_ocr_on_images,omitempty"`
	// When `true`, image OCR results are rendered as plain text without the
	// `![...](...)` markdown placeholder. Only takes effect when `run_ocr_on_images`
	// is also `true`.
	OcrTextOnly bool `json:"ocr_text_only"`
	// When `true` and `ocr_text_only` is `false`, append the OCR text after
	// the image placeholder in the rendered output.
	AppendOcrText bool `json:"append_ocr_text"`
}

// TokenReductionOptions holds token reduction configuration.
type TokenReductionOptions struct {
	// Reduction mode: "off", "light", "moderate", "aggressive", "maximum".
	// Always written to JSON (no `omitempty`), including when empty.
	Mode string `json:"mode"`
	// Preserve important words (capitalized, technical terms).
	// Omitted from the JSON encoding when nil.
	PreserveImportantWords *bool `json:"preserve_important_words,omitempty"`
}

// LanguageDetectionConfig holds language detection configuration.
type LanguageDetectionConfig struct {
	// Enable language detection. Omitted from the JSON encoding when nil.
	Enabled *bool `json:"enabled,omitempty"`
	// Minimum confidence threshold (0.0-1.0). Omitted from the JSON encoding
	// when nil.
	MinConfidence *float64 `json:"min_confidence,omitempty"`
	// Detect multiple languages in the document. Always written to JSON
	// (no `omitempty`), so the Go zero value is sent as `false`.
	DetectMultiple bool `json:"detect_multiple"`
}

// HTMLOutputConfig is the configuration for styled HTML output.
//
// When set on [`ExtractionConfig::html_output`] alongside
// `output_format = OutputFormat::Html`, the pipeline builds a
// [`StyledHtmlRenderer`](crate::rendering::StyledHtmlRenderer) instead of
// the plain comrak-based renderer.
//
// Example (Rust-side construction):
//
//	let config = HtmlOutputConfig {
//	    theme: HtmlTheme::GitHub,
//	    css: Some(".kb-p { font-size: 1.1rem; }".to_string()),
//	    ..Default::default()
//	};
type HTMLOutputConfig struct {
	// Inline CSS string injected into the output after the theme stylesheet.
	// Concatenated after `css_file` content when both are set.
	CSS *string `json:"css,omitempty"`
	// Path to a CSS file loaded once at renderer construction time.
	// Concatenated before `css` when both are set.
	CSSFile *string `json:"css_file,omitempty"`
	// Built-in colour/typography theme. Default: [`HtmlTheme::Unstyled`].
	Theme *HTMLTheme `json:"theme,omitempty"`
	// CSS class prefix applied to every emitted class name.
	//
	// Default: `"kb-"`. Change this if your host application already uses
	// classes that start with `kb-`.
	//
	// NOTE(review): this field has no `omitempty`, so a zero-value Go struct
	// sends `"class_prefix": ""`; whether the native side then falls back to
	// `"kb-"` is not visible here — confirm.
	ClassPrefix string `json:"class_prefix"`
	// When `true` (default), write the resolved CSS into a `<style>` block
	// immediately after the opening `<div class="{prefix}doc">`.
	//
	// Set to `false` to emit only the structural markup and wire up your
	// own stylesheet targeting the `kb-*` class names.
	EmbedCSS *bool `json:"embed_css,omitempty"`
}

// LayoutDetectionConfig holds layout detection configuration.
//
// Controls layout detection behavior in the extraction pipeline.
// When set on [`ExtractionConfig`](super::ExtractionConfig), layout detection
// is enabled for PDF extraction.
type LayoutDetectionConfig struct {
	// Confidence threshold override (nil = use model default).
	ConfidenceThreshold *float32 `json:"confidence_threshold,omitempty"`
	// Whether to apply postprocessing heuristics (default: true).
	ApplyHeuristics *bool `json:"apply_heuristics,omitempty"`
	// Table structure recognition model.
	//
	// Controls which model is used for table cell detection within layout-detected
	// table regions. Defaults to [`TableModel::Tatr`].
	//
	// NOTE(review): non-pointer field tagged `omitempty`; whether the zero
	// value is dropped from the JSON depends on TableModel's underlying
	// type, which is declared elsewhere — confirm.
	TableModel TableModel `json:"table_model,omitempty"`
	// Hardware acceleration for ONNX models (layout detection + table structure).
	//
	// When set, controls which execution provider (CPU, CUDA, CoreML, TensorRT)
	// is used for inference. Defaults to nil (auto-select per platform).
	Acceleration *AccelerationConfig `json:"acceleration,omitempty"`
}

// LlmConfig is the configuration for an LLM provider/model via liter-llm.
//
// Each feature (VLM OCR, VLM embeddings, structured extraction) carries
// its own `LlmConfig`, allowing different providers per feature.
//
// Example (TOML config file):
//
//	[structured_extraction.llm]
//	model = "openai/gpt-4o"
//	api_key = "sk-..."  # or use KREUZBERG_LLM_API_KEY env var
type LlmConfig struct {
	// Provider/model string using liter-llm routing format.
	//
	// Examples: `"openai/gpt-4o"`, `"anthropic/claude-sonnet-4-20250514"`,
	// `"groq/llama-3.1-70b-versatile"`.
	//
	// The only field here without `omitempty`: it is always written to JSON.
	Model string `json:"model"`
	// API key for the provider. When nil, liter-llm falls back to
	// the provider's standard environment variable (e.g., `OPENAI_API_KEY`).
	APIKey *string `json:"api_key,omitempty"`
	// Custom base URL override for the provider endpoint.
	BaseURL *string `json:"base_url,omitempty"`
	// Request timeout in seconds (default: 60).
	TimeoutSecs *uint64 `json:"timeout_secs,omitempty"`
	// Maximum retry attempts (default: 3).
	MaxRetries *uint32 `json:"max_retries,omitempty"`
	// Sampling temperature for generation tasks.
	Temperature *float64 `json:"temperature,omitempty"`
	// Maximum tokens to generate.
	MaxTokens *uint64 `json:"max_tokens,omitempty"`
}

// StructuredExtractionConfig is the configuration for LLM-based structured data extraction.
//
// Sends extracted document content to a VLM with a JSON schema,
// returning structured data that conforms to the schema.
//
// Example (TOML config file):
//
//	[structured_extraction]
//	schema_name = "invoice_data"
//	strict = true
//
//	[structured_extraction.schema]
//	type = "object"
//	properties.vendor = { type = "string" }
//	properties.total = { type = "number" }
//	required = ["vendor", "total"]
//
//	[structured_extraction.llm]
//	model = "openai/gpt-4o"
type StructuredExtractionConfig struct {
	// JSON Schema defining the desired output structure, carried as raw
	// JSON bytes (json.RawMessage) rather than a typed value.
	Schema json.RawMessage `json:"schema"`
	// Schema name passed to the LLM's structured output mode.
	SchemaName string `json:"schema_name"`
	// Optional schema description for the LLM.
	SchemaDescription *string `json:"schema_description,omitempty"`
	// Enable strict mode — output must exactly match the schema.
	Strict bool `json:"strict"`
	// Custom Jinja2 extraction prompt template. When nil, a default template is used.
	//
	// Available template variables:
	//   - `{{ content }}` — The extracted document text.
	//   - `{{ schema }}` — The JSON schema as a formatted string.
	//   - `{{ schema_name }}` — The schema name.
	//   - `{{ schema_description }}` — The schema description (may be empty).
	Prompt *string `json:"prompt,omitempty"`
	// LLM configuration for the extraction. Embedded by value, so it is
	// always present in the JSON encoding.
	Llm LlmConfig `json:"llm"`
}

// OcrQualityThresholds holds quality thresholds for OCR fallback decisions and pipeline quality gating.
//
// All fields default to the values that match the previous hardcoded behavior,
// so `OcrQualityThresholds::default()` preserves existing semantics exactly.
//
// In this binding every field is a pointer tagged `omitempty`: a nil field
// is left out of the JSON encoding.
type OcrQualityThresholds struct {
	// Minimum total non-whitespace characters to consider text substantive.
	MinTotalNonWhitespace *uint `json:"min_total_non_whitespace,omitempty"`
	// Minimum non-whitespace characters per page on average.
	MinNonWhitespacePerPage *float64 `json:"min_non_whitespace_per_page,omitempty"`
	// Minimum character count for a word to be "meaningful".
	MinMeaningfulWordLen *uint `json:"min_meaningful_word_len,omitempty"`
	// Minimum count of meaningful words before text is accepted.
	MinMeaningfulWords *uint `json:"min_meaningful_words,omitempty"`
	// Minimum alphanumeric ratio (non-whitespace chars that are alphanumeric).
	MinAlnumRatio *float64 `json:"min_alnum_ratio,omitempty"`
	// Minimum Unicode replacement characters (U+FFFD) to trigger OCR fallback.
	MinGarbageChars *uint `json:"min_garbage_chars,omitempty"`
	// Maximum fraction of short (1-2 char) words before text is considered fragmented.
	MaxFragmentedWordRatio *float64 `json:"max_fragmented_word_ratio,omitempty"`
	// Critical fragmentation threshold — triggers OCR regardless of meaningful words.
	// Normal English text has ~20-30% short words. 80%+ is definitive garbage.
	CriticalFragmentedWordRatio *float64 `json:"critical_fragmented_word_ratio,omitempty"`
	// Minimum average word length. Below this with enough words indicates garbled extraction.
	MinAvgWordLength *float64 `json:"min_avg_word_length,omitempty"`
	// Minimum word count before average word length check applies.
	MinWordsForAvgLengthCheck *uint `json:"min_words_for_avg_length_check,omitempty"`
	// Minimum consecutive word repetition ratio to detect column scrambling.
	MinConsecutiveRepeatRatio *float64 `json:"min_consecutive_repeat_ratio,omitempty"`
	// Minimum word count before consecutive repetition check is applied.
	MinWordsForRepeatCheck *uint `json:"min_words_for_repeat_check,omitempty"`
	// Minimum character count for "substantive markdown" OCR skip gate.
	SubstantiveMinChars *uint `json:"substantive_min_chars,omitempty"`
	// Minimum character count for "non-text content" OCR skip gate.
	NonTextMinChars *uint `json:"non_text_min_chars,omitempty"`
	// Alphanumeric+whitespace ratio threshold for skip decisions.
	AlnumWsRatioThreshold *float64 `json:"alnum_ws_ratio_threshold,omitempty"`
	// Minimum quality score (0.0-1.0) for a pipeline stage result to be accepted.
	// If the result from a backend scores below this, try the next backend.
	PipelineMinQuality *float64 `json:"pipeline_min_quality,omitempty"`
}

// OcrPipelineStage describes a single backend stage in the OCR pipeline.
type OcrPipelineStage struct {
	// Backend name: "tesseract", "paddleocr", "easyocr", or a custom registered name.
	Backend string `json:"backend"`
	// Priority weight (higher = tried first). Stages are sorted by priority descending.
	Priority uint32 `json:"priority"`
	// Language override for this stage (nil = use parent OcrConfig.language).
	Language *string `json:"language,omitempty"`
	// Tesseract-specific config override for this stage.
	TesseractConfig *TesseractConfig `json:"tesseract_config,omitempty"`
	// PaddleOCR-specific config for this stage, carried as raw JSON.
	PaddleOcrConfig *json.RawMessage `json:"paddle_ocr_config,omitempty"`
	// VLM config override for this pipeline stage.
	VlmConfig *LlmConfig `json:"vlm_config,omitempty"`
	// Arbitrary per-call options passed through to the backend unchanged.
	//
	// Backends that support runtime tuning (mode switching, preprocessing
	// flags, inference parameters, etc.) read this value and deserialize
	// the keys they care about. Keys unknown to the backend are silently
	// ignored, so options from different backends can coexist in the same
	// config without conflict.
	//
	// Example (custom backend):
	//
	//	{ "mode": "fast", "enable_layout": true }
	BackendOptions *json.RawMessage `json:"backend_options,omitempty"`
}

// OcrPipelineConfig configures a multi-backend OCR pipeline with quality-based fallback.
//
// Backends are tried in priority order (highest first). After each backend
// produces output, quality is evaluated. If it meets `quality_thresholds.pipeline_min_quality`,
// the result is accepted. Otherwise the next backend is tried.
type OcrPipelineConfig struct {
	// Ordered list of backends to try. Sorted by priority (descending) at runtime.
	// Omitted from the JSON encoding when empty.
	Stages []OcrPipelineStage `json:"stages,omitempty"`
	// Quality thresholds for deciding whether to accept a result or try the next backend.
	// Held by value, so the key is always present in the JSON encoding.
	QualityThresholds OcrQualityThresholds `json:"quality_thresholds"`
}

// OcrConfig holds OCR configuration.
type OcrConfig struct {
	// Whether OCR is enabled.
	//
	// Setting `enabled: false` is a shorthand for `disable_ocr: true` on the parent
	// [`ExtractionConfig`](crate::core::config::ExtractionConfig). Images return
	// metadata only; PDFs use native text extraction without OCR fallback.
	//
	// Defaults to `true`. When `false`, all other OCR settings are ignored.
	Enabled *bool `json:"enabled,omitempty"`
	// OCR backend: tesseract, easyocr, paddleocr.
	Backend string `json:"backend"`
	// Language code (e.g., "eng", "deu").
	Language string `json:"language"`
	// Tesseract-specific configuration (optional).
	TesseractConfig *TesseractConfig `json:"tesseract_config,omitempty"`
	// Output format for OCR results (optional, for format conversion).
	OutputFormat *OutputFormat `json:"output_format,omitempty"`
	// PaddleOCR-specific configuration (optional, JSON passthrough).
	PaddleOcrConfig *json.RawMessage `json:"paddle_ocr_config,omitempty"`
	// Arbitrary per-call options passed through to the backend unchanged.
	//
	// Custom OCR backends and built-in backends that support runtime tuning
	// can read this value and deserialize the keys they care about. Keys
	// unknown to the backend are silently ignored.
	//
	// This is the recommended extension point for per-call parameters that
	// are not covered by the typed fields above (e.g. mode switching,
	// preprocessing flags, inference batch size).
	//
	// **Scope:** when `pipeline` is nil, this value is propagated to the
	// primary stage of the auto-constructed pipeline. When `pipeline` is
	// explicitly set, this field has **no effect** — the caller must set
	// `OcrPipelineStage.backend_options` directly on the relevant stage(s)
	// instead.
	//
	// Example:
	//
	//	{ "mode": "fast", "enable_layout": true, "timeout_ms": 5000 }
	BackendOptions *json.RawMessage `json:"backend_options,omitempty"`
	// OCR element extraction configuration.
	ElementConfig *OcrElementConfig `json:"element_config,omitempty"`
	// Quality thresholds for the native-text-to-OCR fallback decision.
	// When nil, uses compiled defaults (matching previous hardcoded behavior).
	QualityThresholds *OcrQualityThresholds `json:"quality_thresholds,omitempty"`
	// Multi-backend OCR pipeline configuration. When set, enables weighted
	// fallback across multiple OCR backends based on output quality.
	// When nil, uses the single `backend` field (same as today).
	Pipeline *OcrPipelineConfig `json:"pipeline,omitempty"`
	// Enable automatic page rotation based on orientation detection.
	//
	// When enabled, uses Tesseract's `DetectOrientationScript()` to detect
	// page orientation (0/90/180/270 degrees) before OCR. If the page is
	// rotated with high confidence, the image is corrected before recognition.
	// This is critical for handling rotated scanned documents.
	AutoRotate bool `json:"auto_rotate"`
	// VLM (Vision Language Model) OCR configuration.
	//
	// Required when `backend` is `"vlm"`. Uses liter-llm to send page
	// images to a vision model for text extraction.
	VlmConfig *LlmConfig `json:"vlm_config,omitempty"`
	// Custom Jinja2 prompt template for VLM OCR.
	//
	// When nil, uses the default template. Available variables:
	//   - `{{ language }}` — The document language code (e.g., "eng", "deu").
	VlmPrompt *string `json:"vlm_prompt,omitempty"`
	// Hardware acceleration for ONNX Runtime models (e.g. PaddleOCR, layout detection).
	//
	// Not user-configurable via config files — injected at runtime from
	// `ExtractionConfig::acceleration` before each `process_image` call.
	Acceleration *AccelerationConfig `json:"acceleration,omitempty"`
	// Caller-supplied Tesseract `traineddata` bytes per language code.
	//
	// Primary use case is the WASM build, which has no filesystem and cannot
	// download tessdata at runtime. Native builds typically rely on
	// `TessdataManager` and ignore this field. When present, the WASM
	// Tesseract backend prefers these bytes over its compile-time-bundled
	// English data.
	//
	// Skipped by serde to keep config files small — supply via the typed API
	// at runtime.
	//
	// NOTE(review): the comment above says the Rust side skips this field
	// during (de)serialization, yet the Go field carries a JSON tag, so a
	// non-empty map would be written by encoding/json (with each []byte as
	// a base64 string by default). Confirm whether values set here ever
	// reach the native layer.
	TessdataBytes map[string][]byte `json:"tessdata_bytes,omitempty"`
}

// PageConfig holds page extraction and tracking configuration.
//
// Controls how pages are extracted, tracked, and represented in the extraction results.
// When the config is absent (nil on the parent), page tracking is disabled.
//
// Page range tracking in chunk metadata (first_page/last_page) is automatically enabled
// when page boundaries are available and chunking is configured.
type PageConfig struct {
	// Extract pages as separate array (ExtractionResult.pages).
	ExtractPages bool `json:"extract_pages"`
	// Insert page markers in main content string.
	InsertPageMarkers bool `json:"insert_page_markers"`
	// Page marker format (use {page_num} placeholder).
	// Default: "\n\n<!-- PAGE {page_num} -->\n\n"
	MarkerFormat *string `json:"marker_format,omitempty"`
}

// PdfConfig holds PDF-specific configuration.
type PdfConfig struct {
	// Extract images from PDF.
	ExtractImages bool `json:"extract_images"`
	// Extract tables from PDF.
	//
	// When `true` (default), runs pdf_oxide's native grid detector and, if it
	// finds nothing, falls back to the heuristic text-layer reconstruction in
	// `pdf::oxide::table::extract_tables_heuristic`. Set to `false` to skip
	// both passes — `tables` will then be empty in the result.
	ExtractTables *bool `json:"extract_tables,omitempty"`
	// List of passwords to try when opening encrypted PDFs.
	Passwords []string `json:"passwords,omitempty"`
	// Extract PDF metadata.
	ExtractMetadata *bool `json:"extract_metadata,omitempty"`
	// Hierarchy extraction configuration (nil = hierarchy extraction disabled).
	Hierarchy *HierarchyConfig `json:"hierarchy,omitempty"`
	// Extract PDF annotations (text notes, highlights, links, stamps).
	// Default: false
	ExtractAnnotations bool `json:"extract_annotations"`
	// Top margin fraction (0.0–1.0) of page height to exclude headers/running heads.
	// Default: 0.06 (6%)
	TopMarginFraction *float32 `json:"top_margin_fraction,omitempty"`
	// Bottom margin fraction (0.0–1.0) of page height to exclude footers/page numbers.
	// Default: 0.05 (5%)
	BottomMarginFraction *float32 `json:"bottom_margin_fraction,omitempty"`
	// Allow single-column pseudo tables in extraction results.
	//
	// By default, tables with fewer than 2 columns (layout-guided) or 3 columns
	// (heuristic) are rejected. When `true`, the minimum column count is relaxed
	// to 1, allowing single-column structured data (glossaries, itemized lists)
	// to be emitted as tables. Other quality filters (density, sparsity, prose
	// detection) still apply.
	AllowSingleColumnTables bool `json:"allow_single_column_tables"`
	// Perform OCR on inline images extracted from PDF pages and attach the
	// recognized text to each `ExtractedImage.ocr_result`. Requires Tesseract
	// to be available; if `ExtractionConfig.ocr` is unset the extractor
	// falls back to `TesseractConfig::default()`. Per-image failures degrade
	// gracefully (the image is returned without OCR text rather than failing
	// the whole extraction). Default: `false`.
	OcrInlineImages bool `json:"ocr_inline_images"`
}

// HierarchyConfig holds hierarchy extraction configuration for PDF text structure analysis.
//
// Enables extraction of document hierarchy levels (H1-H6) based on font size
// clustering and semantic analysis. When enabled, hierarchical blocks are
// included in page content.
//
// Every field is a pointer tagged `omitempty`: nil fields are left out of
// the JSON encoding.
type HierarchyConfig struct {
	// Enable hierarchy extraction.
	Enabled *bool `json:"enabled,omitempty"`
	// Number of font size clusters to use for hierarchy levels (1-7).
	//
	// Default: 6, which provides H1-H6 heading levels with body text.
	// Larger values create more fine-grained hierarchy levels.
	KClusters *uint `json:"k_clusters,omitempty"`
	// Include bounding box information in hierarchy blocks.
	IncludeBbox *bool `json:"include_bbox,omitempty"`
	// OCR coverage threshold for smart OCR triggering (0.0-1.0).
	//
	// Determines when OCR should be triggered based on text block coverage.
	// OCR is triggered when text blocks cover less than this fraction of the page.
	// Default: 0.5 (trigger OCR if less than 50% of page has text)
	OcrCoverageThreshold *float32 `json:"ocr_coverage_threshold,omitempty"`
}

// PostProcessorConfig holds post-processor configuration.
type PostProcessorConfig struct {
	// Enable post-processors.
	Enabled *bool `json:"enabled,omitempty"`
	// Whitelist of processor names to run (nil = all enabled).
	EnabledProcessors []string `json:"enabled_processors,omitempty"`
	// Blacklist of processor names to skip (nil = none disabled).
	DisabledProcessors []string `json:"disabled_processors,omitempty"`
	// Pre-computed AHashSet for O(1) enabled processor lookup.
	//
	// NOTE(review): this looks like a derived, Rust-side lookup cache that
	// the generator mirrored into the binding; confirm whether callers are
	// expected to populate it or should only set EnabledProcessors.
	EnabledSet []string `json:"enabled_set,omitempty"`
	// Pre-computed AHashSet for O(1) disabled processor lookup.
	//
	// NOTE(review): same question as EnabledSet — presumably derived from
	// DisabledProcessors on the native side; confirm.
	DisabledSet []string `json:"disabled_set,omitempty"`
}

// ChunkingConfig holds chunking configuration.
//
// Configures text chunking for document content, including chunk size,
// overlap, trimming behavior, and optional embeddings.
//
// Set only the fields you need; nil pointer fields are omitted from the JSON
// encoding:
//
//	config := ChunkingConfig{
//	    MaxCharacters: Ptr(uint(500)),
//	}
//
// Decoding goes through the type's custom UnmarshalJSON, which parses the
// `sizing` value with UnmarshalChunkSizing.
type ChunkingConfig struct {
	// Maximum size per chunk (in units determined by `sizing`).
	//
	// When `sizing` is `Characters` (default), this is the max character count.
	// When using token-based sizing, this is the max token count.
	//
	// Note that the JSON key is `max_chars`, not `max_characters`.
	//
	// Default: 1000
	MaxCharacters *uint `json:"max_chars,omitempty"`
	// Overlap between chunks (in units determined by `sizing`).
	//
	// Note that the JSON key is `max_overlap`, not `overlap`.
	//
	// Default: 200
	Overlap *uint `json:"max_overlap,omitempty"`
	// Whether to trim whitespace from chunk boundaries.
	//
	// Default: true
	Trim *bool `json:"trim,omitempty"`
	// Type of chunker to use (Text or Markdown).
	//
	// Default: Text
	ChunkerType *ChunkerType `json:"chunker_type,omitempty"`
	// Optional embedding configuration for chunk embeddings.
	Embedding *EmbeddingConfig `json:"embedding,omitempty"`
	// Use a preset configuration (overrides individual settings if provided).
	Preset *string `json:"preset,omitempty"`
	// How to measure chunk size.
	//
	// Default: `Characters` (Unicode character count).
	// Enable `chunking-tiktoken` or `chunking-tokenizers` features for token-based sizing.
	Sizing ChunkSizing `json:"sizing"`
	// When `true` and `chunker_type` is `Markdown`, prepend the heading hierarchy
	// path (e.g. `"# Title > ## Section\n\n"`) to each chunk's content string.
	//
	// This is useful for RAG pipelines where each chunk needs self-contained
	// context about its position in the document structure.
	//
	// Default: `false`
	PrependHeadingContext bool `json:"prepend_heading_context"`
	// Optional cosine similarity threshold for semantic topic boundary detection.
	//
	// Only used when `chunker_type` is `Semantic` and an `EmbeddingConfig` is
	// provided. You almost never need to set this. When omitted, defaults to
	// `0.75` which works well for most documents. Lower values detect more
	// topic boundaries (more, smaller chunks); higher values detect fewer.
	// Range: `0.0..=1.0`.
	TopicThreshold *float32 `json:"topic_threshold,omitempty"`
}

// UnmarshalJSON decodes a ChunkingConfig from its JSON form.
//
// All fields except `sizing` are decoded directly and copied onto s,
// overwriting whatever s held before. `sizing` is captured as raw JSON and
// parsed with UnmarshalChunkSizing; when the key is absent or JSON null,
// s.Sizing keeps its current value.
//
// It returns the error from json.Unmarshal or UnmarshalChunkSizing. If the
// latter fails, the other fields have already been assigned.
func (s *ChunkingConfig) UnmarshalJSON(data []byte) error {
	// Wire-level mirror of ChunkingConfig with Sizing left undecoded.
	var wire struct {
		MaxCharacters         *uint            `json:"max_chars,omitempty"`
		Overlap               *uint            `json:"max_overlap,omitempty"`
		Trim                  *bool            `json:"trim,omitempty"`
		ChunkerType           *ChunkerType     `json:"chunker_type,omitempty"`
		Embedding             *EmbeddingConfig `json:"embedding,omitempty"`
		Preset                *string          `json:"preset,omitempty"`
		Sizing                json.RawMessage  `json:"sizing,omitempty"`
		PrependHeadingContext bool             `json:"prepend_heading_context"`
		TopicThreshold        *float32         `json:"topic_threshold,omitempty"`
	}
	if decodeErr := json.Unmarshal(data, &wire); decodeErr != nil {
		return decodeErr
	}

	// Straight field-for-field copies.
	s.MaxCharacters, s.Overlap = wire.MaxCharacters, wire.Overlap
	s.Trim = wire.Trim
	s.ChunkerType = wire.ChunkerType
	s.Embedding = wire.Embedding
	s.Preset = wire.Preset
	s.PrependHeadingContext = wire.PrependHeadingContext
	s.TopicThreshold = wire.TopicThreshold

	// Nothing to parse when "sizing" is missing or explicitly null.
	if len(wire.Sizing) == 0 || string(wire.Sizing) == "null" {
		return nil
	}
	sizing, sizingErr := UnmarshalChunkSizing(wire.Sizing)
	if sizingErr != nil {
		return sizingErr
	}
	s.Sizing = sizing
	return nil
}

// EmbeddingConfig holds embedding configuration for text chunks.
//
// Configures embedding generation using ONNX models via the vendored embedding engine.
// Requires the `embeddings` feature to be enabled.
type EmbeddingConfig struct {
	// The embedding model to use (defaults to "balanced" preset if not specified).
	Model EmbeddingModelType `json:"model"`
	// Whether to normalize embedding vectors (recommended for cosine similarity).
	Normalize *bool `json:"normalize,omitempty"`
	// Batch size for embedding generation.
	BatchSize *uint `json:"batch_size,omitempty"`
	// Show model download progress.
	ShowDownloadProgress bool `json:"show_download_progress"`
	// Custom cache directory for model files.
	//
	// Defaults to `~/.cache/kreuzberg/embeddings/` if not specified.
	// Allows full customization of model download location.
	CacheDir *string `json:"cache_dir,omitempty"`
	// Hardware acceleration for the embedding ONNX model.
	//
	// When set, controls which execution provider (CPU, CUDA, CoreML, TensorRT)
	// is used for inference. Defaults to nil (auto-select per platform).
	Acceleration *AccelerationConfig `json:"acceleration,omitempty"`
	// Maximum wall-clock duration (in seconds) for a single `embed()` call when
	// using [`EmbeddingModelType::Plugin`].
	//
	// Applies only to the in-process plugin path — protects against hung
	// host-language backends (e.g. a Python callback deadlocked on the GIL,
	// a model stuck on CUDA OOM retries, etc.). On timeout, the dispatcher
	// returns `Plugin` instead of blocking forever.
	//
	// The default (60 seconds) is conservative for common in-process
	// inference; increase for large batches on slow hardware.
	//
	// NOTE(review): the upstream doc says `None` disables the timeout, but
	// with `omitempty` a nil pointer is simply left out of the JSON, which
	// presumably selects the 60-second default rather than disabling it —
	// confirm how "no timeout" is expressed through this binding.
	MaxEmbedDurationSecs *uint64 `json:"max_embed_duration_secs,omitempty"`
}

// UnmarshalJSON implements json.Unmarshaler for EmbeddingConfig.
//
// All fields except `model` are decoded through an intermediate struct and
// copied onto s. The `model` payload is captured as raw JSON and passed to
// UnmarshalEmbeddingModelType; when it is absent or `null`, s.Model is left
// as it was.
func (s *EmbeddingConfig) UnmarshalJSON(data []byte) error {
	var wire struct {
		Model                json.RawMessage     `json:"model,omitempty"`
		Normalize            *bool               `json:"normalize,omitempty"`
		BatchSize            *uint               `json:"batch_size,omitempty"`
		ShowDownloadProgress bool                `json:"show_download_progress"`
		CacheDir             *string             `json:"cache_dir,omitempty"`
		Acceleration         *AccelerationConfig `json:"acceleration,omitempty"`
		MaxEmbedDurationSecs *uint64             `json:"max_embed_duration_secs,omitempty"`
	}
	if err := json.Unmarshal(data, &wire); err != nil {
		return err
	}

	// Copy the directly decodable fields before touching the model payload.
	s.Normalize = wire.Normalize
	s.BatchSize = wire.BatchSize
	s.ShowDownloadProgress = wire.ShowDownloadProgress
	s.CacheDir = wire.CacheDir
	s.Acceleration = wire.Acceleration
	s.MaxEmbedDurationSecs = wire.MaxEmbedDurationSecs

	// Nothing further to decode when model is missing or explicitly null.
	if len(wire.Model) == 0 || string(wire.Model) == "null" {
		return nil
	}
	model, err := UnmarshalEmbeddingModelType(wire.Model)
	if err != nil {
		return err
	}
	s.Model = model
	return nil
}

// TreeSitterConfig configures the tree-sitter language pack integration.
//
// Controls grammar download behavior and code analysis options.
//
// # Example (TOML)
//
//	[tree_sitter]
//	languages = ["python", "rust"]
//	groups = ["web"]
//
//	[tree_sitter.process]
//	structure = true
//	comments = true
//	docstrings = true
type TreeSitterConfig struct {
	// Enable code intelligence processing (default: true).
	//
	// When `false`, tree-sitter analysis is completely skipped even if
	// the config section is present.
	Enabled *bool `json:"enabled,omitempty"`
	// Custom cache directory for downloaded grammars.
	//
	// When `None`, uses the default: `~/.cache/tree-sitter-language-pack/v{version}/libs/`.
	CacheDir *string `json:"cache_dir,omitempty"`
	// Languages to pre-download on init (e.g., `["python", "rust"]`).
	Languages []string `json:"languages,omitempty"`
	// Language groups to pre-download (e.g., `["web", "systems", "scripting"]`).
	Groups []string `json:"groups,omitempty"`
	// Processing options for code analysis. Always present in the encoded
	// JSON (the tag has no omitempty).
	Process TreeSitterProcessConfig `json:"process"`
}

// TreeSitterProcessConfig holds the processing options for tree-sitter code analysis.
//
// Controls which analysis features are enabled when extracting code files.
// The options documented as defaulting to true are pointers that are omitted
// from the encoded JSON when nil; the ones documented as defaulting to false
// are plain bools that are always written.
type TreeSitterProcessConfig struct {
	// Extract structural items (functions, classes, structs, etc.). Default: true.
	Structure *bool `json:"structure,omitempty"`
	// Extract import statements. Default: true.
	Imports *bool `json:"imports,omitempty"`
	// Extract export statements. Default: true.
	Exports *bool `json:"exports,omitempty"`
	// Extract comments. Default: false.
	Comments bool `json:"comments"`
	// Extract docstrings. Default: false.
	Docstrings bool `json:"docstrings"`
	// Extract symbol definitions. Default: false.
	Symbols bool `json:"symbols"`
	// Include parse diagnostics. Default: false.
	Diagnostics bool `json:"diagnostics"`
	// Maximum chunk size in bytes. `None` disables chunking.
	ChunkMaxSize *uint `json:"chunk_max_size,omitempty"`
	// Content rendering mode for code extraction. Omitted from the encoded
	// JSON when it holds the zero value.
	ContentMode CodeContentMode `json:"content_mode,omitempty"`
}

// SupportedFormat is a supported document format entry.
//
// It pairs a file extension with the corresponding MIME type that Kreuzberg
// can process.
type SupportedFormat struct {
	// File extension (without leading dot), e.g., "pdf", "docx".
	Extension string `json:"extension"`
	// MIME type string, e.g., "application/pdf".
	MimeType string `json:"mime_type"`
}

// ServerConfig is the API server configuration.
//
// This struct holds all configuration options for the Kreuzberg API server,
// including host/port settings, CORS configuration, and upload limits.
//
// # Defaults
//
// - `host`: "127.0.0.1" (localhost only)
// - `port`: 8000
// - `cors_origins`: empty list (allows all origins)
// - `max_request_body_bytes`: 104_857_600 (100 MB)
// - `max_multipart_field_bytes`: 104_857_600 (100 MB)
//
// NOTE(review): these defaults describe the native configuration; nothing in
// this struct definition applies them to a zero-valued ServerConfig — confirm
// where they are set.
type ServerConfig struct {
	// Server host address (e.g., "127.0.0.1", "0.0.0.0").
	Host string `json:"host"`
	// Server port number.
	Port uint16 `json:"port"`
	// CORS allowed origins. An empty list means all origins are allowed.
	//
	// If this list is empty, the server will accept requests from any origin.
	// If populated with specific origins (e.g., `"https://example.com"`), only
	// those origins will be allowed.
	CorsOrigins []string `json:"cors_origins,omitempty"`
	// Maximum size of request body in bytes (default: 100 MB).
	MaxRequestBodyBytes uint `json:"max_request_body_bytes"`
	// Maximum size of multipart fields in bytes (default: 100 MB).
	MaxMultipartFieldBytes uint `json:"max_multipart_field_bytes"`
}

// StructuredDataResult carries a content string and a format string together
// with optional string key/value metadata and a list of text fields.
//
// Metadata and TextFields are omitted from the encoded JSON when empty.
// NOTE(review): the generated source carries no per-field documentation;
// confirm field semantics against the native definition.
type StructuredDataResult struct {
	Content    string            `json:"content"`
	Format     string            `json:"format"`
	Metadata   map[string]string `json:"metadata,omitempty"`
	TextFields []string          `json:"text_fields,omitempty"`
}

// DocxAppProperties holds the application properties from docProps/app.xml
// for DOCX files.
//
// Contains Word-specific document statistics and metadata. Every field is
// optional and is omitted from the encoded JSON when nil.
type DocxAppProperties struct {
	// Application name (e.g., "Microsoft Office Word").
	Application *string `json:"application,omitempty"`
	// Application version.
	AppVersion *string `json:"app_version,omitempty"`
	// Template filename.
	Template *string `json:"template,omitempty"`
	// Total editing time in minutes.
	TotalTime *int32 `json:"total_time,omitempty"`
	// Number of pages.
	Pages *int32 `json:"pages,omitempty"`
	// Number of words.
	Words *int32 `json:"words,omitempty"`
	// Number of characters (excluding spaces).
	Characters *int32 `json:"characters,omitempty"`
	// Number of characters (including spaces).
	CharactersWithSpaces *int32 `json:"characters_with_spaces,omitempty"`
	// Number of lines.
	Lines *int32 `json:"lines,omitempty"`
	// Number of paragraphs.
	Paragraphs *int32 `json:"paragraphs,omitempty"`
	// Company name.
	Company *string `json:"company,omitempty"`
	// Document security level.
	DocSecurity *int32 `json:"doc_security,omitempty"`
	// Scale crop flag.
	ScaleCrop *bool `json:"scale_crop,omitempty"`
	// Links up to date flag.
	LinksUpToDate *bool `json:"links_up_to_date,omitempty"`
	// Shared document flag.
	SharedDoc *bool `json:"shared_doc,omitempty"`
	// Hyperlinks changed flag.
	HyperlinksChanged *bool `json:"hyperlinks_changed,omitempty"`
}

// XlsxAppProperties holds the application properties from docProps/app.xml
// for XLSX files.
//
// Contains Excel-specific document metadata. Every field is optional and is
// omitted from the encoded JSON when nil or empty.
type XlsxAppProperties struct {
	// Application name (e.g., "Microsoft Excel").
	Application *string `json:"application,omitempty"`
	// Application version.
	AppVersion *string `json:"app_version,omitempty"`
	// Document security level.
	DocSecurity *int32 `json:"doc_security,omitempty"`
	// Scale crop flag.
	ScaleCrop *bool `json:"scale_crop,omitempty"`
	// Links up to date flag.
	LinksUpToDate *bool `json:"links_up_to_date,omitempty"`
	// Shared document flag.
	SharedDoc *bool `json:"shared_doc,omitempty"`
	// Hyperlinks changed flag.
	HyperlinksChanged *bool `json:"hyperlinks_changed,omitempty"`
	// Company name.
	Company *string `json:"company,omitempty"`
	// Worksheet names.
	WorksheetNames []string `json:"worksheet_names,omitempty"`
}

// PptxAppProperties holds the application properties from docProps/app.xml
// for PPTX files.
//
// Contains PowerPoint-specific document metadata. Every field is optional and
// is omitted from the encoded JSON when nil or empty.
type PptxAppProperties struct {
	// Application name (e.g., "Microsoft Office PowerPoint").
	Application *string `json:"application,omitempty"`
	// Application version.
	AppVersion *string `json:"app_version,omitempty"`
	// Total editing time in minutes.
	TotalTime *int32 `json:"total_time,omitempty"`
	// Company name.
	Company *string `json:"company,omitempty"`
	// Document security level.
	DocSecurity *int32 `json:"doc_security,omitempty"`
	// Scale crop flag.
	ScaleCrop *bool `json:"scale_crop,omitempty"`
	// Links up to date flag.
	LinksUpToDate *bool `json:"links_up_to_date,omitempty"`
	// Shared document flag.
	SharedDoc *bool `json:"shared_doc,omitempty"`
	// Hyperlinks changed flag.
	HyperlinksChanged *bool `json:"hyperlinks_changed,omitempty"`
	// Number of slides.
	Slides *int32 `json:"slides,omitempty"`
	// Number of notes.
	Notes *int32 `json:"notes,omitempty"`
	// Number of hidden slides.
	HiddenSlides *int32 `json:"hidden_slides,omitempty"`
	// Number of multimedia clips.
	MultimediaClips *int32 `json:"multimedia_clips,omitempty"`
	// Presentation format (e.g., "Widescreen", "Standard").
	PresentationFormat *string `json:"presentation_format,omitempty"`
	// Slide titles.
	SlideTitles []string `json:"slide_titles,omitempty"`
}

// CoreProperties holds the Dublin Core metadata from docProps/core.xml.
//
// Contains standard metadata fields defined by the Dublin Core standard
// and Office-specific extensions. Every field is an optional string that is
// omitted from the encoded JSON when nil.
type CoreProperties struct {
	// Document title.
	Title *string `json:"title,omitempty"`
	// Document subject/topic.
	Subject *string `json:"subject,omitempty"`
	// Document creator/author.
	Creator *string `json:"creator,omitempty"`
	// Keywords or tags.
	Keywords *string `json:"keywords,omitempty"`
	// Document description/abstract.
	Description *string `json:"description,omitempty"`
	// User who last modified the document.
	LastModifiedBy *string `json:"last_modified_by,omitempty"`
	// Revision number.
	Revision *string `json:"revision,omitempty"`
	// Creation timestamp (ISO 8601).
	Created *string `json:"created,omitempty"`
	// Last modification timestamp (ISO 8601).
	Modified *string `json:"modified,omitempty"`
	// Document category.
	Category *string `json:"category,omitempty"`
	// Content status (Draft, Final, etc.).
	ContentStatus *string `json:"content_status,omitempty"`
	// Document language.
	Language *string `json:"language,omitempty"`
	// Unique identifier.
	Identifier *string `json:"identifier,omitempty"`
	// Document version.
	Version *string `json:"version,omitempty"`
	// Last print timestamp (ISO 8601).
	LastPrinted *string `json:"last_printed,omitempty"`
}

// SecurityLimits configures the security limits applied across extractors.
//
// All limits are intentionally conservative to prevent DoS attacks
// while still supporting legitimate documents. The values in parentheses
// below are the documented defaults; a nil field is omitted from the encoded
// JSON.
type SecurityLimits struct {
	// Maximum uncompressed size for archives (500 MB).
	MaxArchiveSize *uint `json:"max_archive_size,omitempty"`
	// Maximum compression ratio before flagging as potential bomb (100:1).
	MaxCompressionRatio *uint `json:"max_compression_ratio,omitempty"`
	// Maximum number of files in archive (10,000).
	MaxFilesInArchive *uint `json:"max_files_in_archive,omitempty"`
	// Maximum nesting depth for structures (100).
	MaxNestingDepth *uint `json:"max_nesting_depth,omitempty"`
	// Maximum length of any single XML entity / attribute / token (1 MiB).
	// This is a per-token cap, NOT a total cap — billion-laughs class
	// attacks where a single entity expands to hundreds of MB are caught
	// here, while normal long text content (a paragraph, a CDATA block) is
	// caught by `max_content_size` instead.
	MaxEntityLength *uint `json:"max_entity_length,omitempty"`
	// Maximum string growth per document (100 MB).
	MaxContentSize *uint `json:"max_content_size,omitempty"`
	// Maximum iterations per operation.
	MaxIterations *uint `json:"max_iterations,omitempty"`
	// Maximum XML depth (100 levels).
	MaxXMLDepth *uint `json:"max_xml_depth,omitempty"`
	// Maximum cells per table (100,000).
	MaxTableCells *uint `json:"max_table_cells,omitempty"`
}

// TokenReductionConfig holds the settings for token reduction.
//
// Pointer fields and empty collections are omitted from the encoded JSON
// (omitempty); PreserveMarkdown and EnableSemanticClustering are always
// written.
// NOTE(review): the generated source carries no per-field documentation or
// defaults; confirm field semantics against the native definition.
type TokenReductionConfig struct {
	Level                    *ReductionLevel     `json:"level,omitempty"`
	LanguageHint             *string             `json:"language_hint,omitempty"`
	PreserveMarkdown         bool                `json:"preserve_markdown"`
	PreserveCode             *bool               `json:"preserve_code,omitempty"`
	SemanticThreshold        *float32            `json:"semantic_threshold,omitempty"`
	EnableParallel           *bool               `json:"enable_parallel,omitempty"`
	UseSimd                  *bool               `json:"use_simd,omitempty"`
	CustomStopwords          map[string][]string `json:"custom_stopwords,omitempty"`
	PreservePatterns         []string            `json:"preserve_patterns,omitempty"`
	TargetReduction          *float32            `json:"target_reduction,omitempty"`
	EnableSemanticClustering bool                `json:"enable_semantic_clustering"`
}

// PdfAnnotation is a PDF annotation extracted from a document page.
type PdfAnnotation struct {
	// The type of annotation.
	AnnotationType PdfAnnotationType `json:"annotation_type"`
	// Text content of the annotation (e.g., comment text, link URL).
	Content *string `json:"content,omitempty"`
	// Page number where the annotation appears (1-indexed).
	PageNumber uint32 `json:"page_number"`
	// Bounding box of the annotation on the page.
	BoundingBox *BoundingBox `json:"bounding_box,omitempty"`
}

// DjotContent is a comprehensive Djot document structure with semantic preservation.
//
// This type captures the full richness of Djot markup, including:
//   - Block-level structures (headings, lists, blockquotes, code blocks, etc.)
//   - Inline formatting (emphasis, strong, highlight, subscript, superscript, etc.)
//   - Attributes (classes, IDs, key-value pairs)
//   - Links, images, footnotes
//   - Math expressions (inline and display)
//   - Tables with full structure
//
// Available when the `djot` feature is enabled.
type DjotContent struct {
	// Plain text representation for backwards compatibility.
	PlainText string `json:"plain_text"`
	// Structured block-level content.
	Blocks []FormattedBlock `json:"blocks,omitempty"`
	// Metadata from YAML frontmatter.
	Metadata Metadata `json:"metadata"`
	// Extracted tables as structured data.
	Tables []Table `json:"tables,omitempty"`
	// Extracted images with metadata.
	Images []DjotImage `json:"images,omitempty"`
	// Extracted links with URLs.
	Links []DjotLink `json:"links,omitempty"`
	// Footnote definitions.
	Footnotes []Footnote `json:"footnotes,omitempty"`
	// Attributes mapped by element identifier (if present).
	// NOTE(review): declared here as a list of strings although the
	// description suggests a mapping — confirm against the native type.
	Attributes []string `json:"attributes,omitempty"`
}

// FormattedBlock is a block-level element in a Djot document.
//
// Represents structural elements like headings, paragraphs, lists, code blocks, etc.
// Blocks nest through Children, so a value of this type can be the root of a
// tree of blocks.
type FormattedBlock struct {
	// Type of block element.
	BlockType BlockType `json:"block_type"`
	// Heading level (1-6) for headings, or nesting level for lists.
	Level *uint `json:"level,omitempty"`
	// Inline content within the block.
	InlineContent []InlineElement `json:"inline_content,omitempty"`
	// Element attributes (classes, IDs, key-value pairs), carried as a single
	// optional string.
	Attributes *string `json:"attributes,omitempty"`
	// Language identifier for code blocks.
	Language *string `json:"language,omitempty"`
	// Raw code content for code blocks.
	Code *string `json:"code,omitempty"`
	// Nested blocks for containers (blockquotes, list items, divs).
	Children []FormattedBlock `json:"children,omitempty"`
}

// InlineElement is an inline element within a block.
//
// Represents text with formatting, links, images, etc.
type InlineElement struct {
	// Type of inline element.
	ElementType InlineType `json:"element_type"`
	// Text content.
	Content string `json:"content"`
	// Element attributes, carried as a single optional string.
	Attributes *string `json:"attributes,omitempty"`
	// Additional metadata (e.g., href for links, src/alt for images).
	Metadata map[string]string `json:"metadata,omitempty"`
}

// DjotImage is an image element in a Djot document.
type DjotImage struct {
	// Image source URL or path.
	Src string `json:"src"`
	// Alternative text.
	Alt string `json:"alt"`
	// Optional title.
	Title *string `json:"title,omitempty"`
	// Element attributes, carried as a single optional string.
	Attributes *string `json:"attributes,omitempty"`
}

// DjotLink is a link element in a Djot document.
type DjotLink struct {
	// Link URL.
	URL string `json:"url"`
	// Link text content.
	Text string `json:"text"`
	// Optional title.
	Title *string `json:"title,omitempty"`
	// Element attributes, carried as a single optional string.
	Attributes *string `json:"attributes,omitempty"`
}

// Footnote is a footnote definition in a Djot document.
type Footnote struct {
	// Footnote label.
	Label string `json:"label"`
	// Footnote content blocks.
	Content []FormattedBlock `json:"content,omitempty"`
}

// DocumentStructure is the top-level structured document representation.
//
// A flat array of nodes with index-based parent/child references forming a tree.
// Root-level nodes have `parent: None`. Use `body_roots()` and `furniture_roots()`
// to iterate over top-level content by layer.
//
// # Validation
//
// Call `validate()` after construction to verify all node indices are in bounds
// and parent-child relationships are bidirectionally consistent.
//
// NOTE(review): `body_roots()`, `furniture_roots()`, `validate()` and
// `finalize_node_types` are the names used by the native type; confirm whether
// this binding exposes equivalents.
type DocumentStructure struct {
	// All nodes in document/reading order.
	Nodes []DocumentNode `json:"nodes,omitempty"`
	// Origin format identifier (e.g. "docx", "pptx", "html", "pdf").
	//
	// Allows renderers to apply format-aware heuristics when converting
	// the document tree to output formats.
	SourceFormat *string `json:"source_format,omitempty"`
	// Resolved relationships between nodes (footnote refs, citations, anchor links, etc.).
	//
	// Populated during derivation from the internal document representation.
	// Empty when no relationships are detected.
	Relationships []DocumentRelationship `json:"relationships,omitempty"`
	// Sorted, deduplicated list of node type names present in this document.
	//
	// Each value is the snake_case `node_type` tag of the corresponding
	// `NodeContent` variant (e.g. `"paragraph"`, `"heading"`, `"table"`, …).
	//
	// Computed from `nodes` via `DocumentStructure::finalize_node_types`.
	// Empty until that method is called (internal construction paths call it
	// at the end of derivation).
	NodeTypes []string `json:"node_types,omitempty"`
}

// DocumentRelationship is a resolved relationship between two nodes in the
// document tree.
type DocumentRelationship struct {
	// Source node index (the referencing node).
	Source uint32 `json:"source"`
	// Target node index (the referenced node).
	Target uint32 `json:"target"`
	// Semantic kind of the relationship.
	Kind RelationshipKind `json:"kind"`
}

// DocumentNode is a single node in the document tree.
//
// Each node has deterministic `id`, typed `content`, optional `parent`/`children`
// for tree structure, and metadata like page number, bounding box, and content layer.
type DocumentNode struct {
	// Deterministic identifier (hash of content + position).
	ID string `json:"id"`
	// Node content — tagged enum, type-specific data only.
	Content NodeContent `json:"content"`
	// Parent node index (`None` = root-level node).
	Parent *uint32 `json:"parent,omitempty"`
	// Child node indices in reading order.
	Children []uint32 `json:"children,omitempty"`
	// Content layer classification.
	ContentLayer ContentLayer `json:"content_layer"`
	// Page number where this node starts (1-indexed).
	Page *uint32 `json:"page,omitempty"`
	// Page number where this node ends (for multi-page tables/sections).
	PageEnd *uint32 `json:"page_end,omitempty"`
	// Bounding box in document coordinates.
	Bbox *BoundingBox `json:"bbox,omitempty"`
	// Inline annotations (formatting, links) on this node's text content.
	//
	// Only meaningful for text-carrying nodes; empty for containers.
	Annotations []TextAnnotation `json:"annotations,omitempty"`
	// Format-specific key-value attributes.
	//
	// Extensible bag for miscellaneous data without a dedicated typed field: CSS classes,
	// LaTeX environment names, Excel cell formulas, slide layout names, etc.
	Attributes map[string]string `json:"attributes,omitempty"`
}

// UnmarshalJSON implements json.Unmarshaler for DocumentNode.
//
// All fields except `content` are decoded through an intermediate struct and
// copied onto s. The `content` payload is captured as raw JSON and passed to
// UnmarshalNodeContent; when it is absent or `null`, s.Content is left as it
// was.
func (s *DocumentNode) UnmarshalJSON(data []byte) error {
	var wire struct {
		ID           string            `json:"id"`
		Content      json.RawMessage   `json:"content,omitempty"`
		Parent       *uint32           `json:"parent,omitempty"`
		Children     []uint32          `json:"children,omitempty"`
		ContentLayer ContentLayer      `json:"content_layer"`
		Page         *uint32           `json:"page,omitempty"`
		PageEnd      *uint32           `json:"page_end,omitempty"`
		Bbox         *BoundingBox      `json:"bbox,omitempty"`
		Annotations  []TextAnnotation  `json:"annotations,omitempty"`
		Attributes   map[string]string `json:"attributes,omitempty"`
	}
	if err := json.Unmarshal(data, &wire); err != nil {
		return err
	}

	// Copy the directly decodable fields before touching the content payload.
	s.ID = wire.ID
	s.Parent = wire.Parent
	s.Children = wire.Children
	s.ContentLayer = wire.ContentLayer
	s.Page = wire.Page
	s.PageEnd = wire.PageEnd
	s.Bbox = wire.Bbox
	s.Annotations = wire.Annotations
	s.Attributes = wire.Attributes

	// Nothing further to decode when content is missing or explicitly null.
	if len(wire.Content) == 0 || string(wire.Content) == "null" {
		return nil
	}
	content, err := UnmarshalNodeContent(wire.Content)
	if err != nil {
		return err
	}
	s.Content = content
	return nil
}

// TableGrid is a structured table grid with cell-level metadata.
//
// Stores row/column dimensions and a flat list of cells with position info.
type TableGrid struct {
	// Number of rows in the table.
	Rows uint32 `json:"rows"`
	// Number of columns in the table.
	Cols uint32 `json:"cols"`
	// All cells in row-major order.
	Cells []GridCell `json:"cells,omitempty"`
}

// GridCell is an individual grid cell with position and span metadata.
type GridCell struct {
	// Cell text content.
	Content string `json:"content"`
	// Zero-indexed row position.
	Row uint32 `json:"row"`
	// Zero-indexed column position.
	Col uint32 `json:"col"`
	// Number of rows this cell spans.
	RowSpan uint32 `json:"row_span"`
	// Number of columns this cell spans.
	ColSpan uint32 `json:"col_span"`
	// Whether this is a header cell.
	IsHeader bool `json:"is_header"`
	// Bounding box for this cell (if available).
	Bbox *BoundingBox `json:"bbox,omitempty"`
}

// TextAnnotation is an inline text annotation — byte-range based formatting
// and links.
//
// Annotations reference byte offsets into the node's text content,
// enabling precise identification of formatted regions. The range is
// half-open: Start is inclusive and End is exclusive.
type TextAnnotation struct {
	// Start byte offset in the node's text content (inclusive).
	Start uint32 `json:"start"`
	// End byte offset in the node's text content (exclusive).
	End uint32 `json:"end"`
	// Annotation type.
	Kind AnnotationKind `json:"kind"`
}

// UnmarshalJSON implements json.Unmarshaler for TextAnnotation.
//
// The byte offsets are decoded through an intermediate struct and copied onto
// s. The `kind` payload is captured as raw JSON and passed to
// UnmarshalAnnotationKind; when it is absent or `null`, s.Kind is left as it
// was.
func (s *TextAnnotation) UnmarshalJSON(data []byte) error {
	var wire struct {
		Start uint32          `json:"start"`
		End   uint32          `json:"end"`
		Kind  json.RawMessage `json:"kind,omitempty"`
	}
	if err := json.Unmarshal(data, &wire); err != nil {
		return err
	}

	// Copy the offsets before touching the kind payload.
	s.Start = wire.Start
	s.End = wire.End

	// Nothing further to decode when kind is missing or explicitly null.
	if len(wire.Kind) == 0 || string(wire.Kind) == "null" {
		return nil
	}
	kind, err := UnmarshalAnnotationKind(wire.Kind)
	if err != nil {
		return err
	}
	s.Kind = kind
	return nil
}

// ExtractionResult is the general extraction result used by the core
// extraction API.
//
// This is the main result type returned by all extraction functions.
// Content, MimeType and Metadata are always present in the encoded JSON; every
// other field is optional and omitted when nil or empty.
type ExtractionResult struct {
	Content  string   `json:"content"`
	MimeType string   `json:"mime_type"`
	Metadata Metadata `json:"metadata"`
	// Extraction strategy used to produce the returned text.
	//
	// Populated when the extractor can reliably distinguish native text extraction,
	// OCR-only extraction, or mixed native/OCR output.
	ExtractionMethod  *ExtractionMethod `json:"extraction_method,omitempty"`
	Tables            []Table           `json:"tables,omitempty"`
	DetectedLanguages []string          `json:"detected_languages,omitempty"`
	// Text chunks when chunking is enabled.
	//
	// When chunking configuration is provided, the content is split into
	// overlapping chunks for efficient processing. Each chunk contains the text,
	// optional embeddings (if enabled), and metadata about its position.
	Chunks []Chunk `json:"chunks,omitempty"`
	// Extracted images from the document.
	//
	// When image extraction is enabled via `ImageExtractionConfig`, this field
	// contains all images found in the document with their raw data and metadata.
	// Each image may optionally contain a nested `ocr_result` if OCR was performed.
	Images []ExtractedImage `json:"images,omitempty"`
	// Per-page content when page extraction is enabled.
	//
	// When page extraction is configured, the document is split into per-page content
	// with tables and images mapped to their respective pages.
	Pages []PageContent `json:"pages,omitempty"`
	// Semantic elements when element-based result format is enabled.
	//
	// When result_format is set to ElementBased, this field contains semantic
	// elements with type classification, unique identifiers, and metadata for
	// Unstructured-compatible element-based processing.
	Elements []Element `json:"elements,omitempty"`
	// Rich Djot content structure (when extracting Djot documents).
	//
	// When extracting Djot documents with structured extraction enabled,
	// this field contains the full semantic structure including:
	// - Block-level elements with nesting
	// - Inline formatting with attributes
	// - Links, images, footnotes
	// - Math expressions
	// - Complete attribute information
	//
	// The `content` field still contains plain text for backward compatibility.
	//
	// Always `None` for non-Djot documents.
	DjotContent *DjotContent `json:"djot_content,omitempty"`
	// OCR elements with full spatial and confidence metadata.
	//
	// When OCR is performed with element extraction enabled, this field contains
	// the structured representation of detected text including:
	// - Bounding geometry (rectangles or quadrilaterals)
	// - Confidence scores (detection and recognition)
	// - Rotation information
	// - Hierarchical relationships (Tesseract only)
	//
	// This field preserves all metadata that would otherwise be lost when
	// converting to plain text or markdown output formats.
	//
	// Only populated when `OcrElementConfig.include_elements` is true.
	OcrElements []OcrElement `json:"ocr_elements,omitempty"`
	// Structured document tree (when document structure extraction is enabled).
	//
	// When `include_document_structure` is true in `ExtractionConfig`, this field
	// contains the full hierarchical representation of the document including:
	// - Heading-driven section nesting
	// - Table grids with cell-level metadata
	// - Content layer classification (body, header, footer, footnote)
	// - Inline text annotations (formatting, links)
	// - Bounding boxes and page numbers
	//
	// Independent of `result_format` — can be combined with Unified or ElementBased.
	Document *DocumentStructure `json:"document,omitempty"`
	// Extracted keywords when keyword extraction is enabled.
	//
	// When keyword extraction (RAKE or YAKE) is configured, this field contains
	// the extracted keywords with scores, algorithm info, and position data.
	// Previously stored in `metadata.additional["keywords"]`.
	ExtractedKeywords []Keyword `json:"extracted_keywords,omitempty"`
	// Document quality score from quality analysis.
	//
	// A value between 0.0 and 1.0 indicating the overall text quality.
	// Previously stored in `metadata.additional["quality_score"]`.
	QualityScore *float64 `json:"quality_score,omitempty"`
	// Non-fatal warnings collected during processing pipeline stages.
	//
	// Captures errors from optional pipeline features (embedding, chunking,
	// language detection, output formatting) that don't prevent extraction
	// but may indicate degraded results.
	// Previously stored as individual keys in `metadata.additional`.
	ProcessingWarnings []ProcessingWarning `json:"processing_warnings,omitempty"`
	// PDF annotations extracted from the document.
	//
	// When annotation extraction is enabled via `PdfConfig::extract_annotations`,
	// this field contains text notes, highlights, links, stamps, and other
	// annotations found in PDF documents.
	Annotations []PdfAnnotation `json:"annotations,omitempty"`
	// Nested extraction results from archive contents.
	//
	// When extracting archives, each processable file inside produces its own
	// full extraction result. Set to `None` for non-archive formats.
	// Use `max_archive_depth` in config to control recursion depth.
	Children []ArchiveEntry `json:"children,omitempty"`
	// URIs/links discovered during document extraction.
	//
	// Contains hyperlinks, image references, citations, email addresses, and
	// other URI-like references found in the document. Always extracted when
	// present in the source document.
	Uris []ExtractedURI `json:"uris,omitempty"`
	// Tracked changes embedded in the source document.
	//
	// Populated by per-format extractors that understand change-tracking
	// metadata (DOCX `w:ins`/`w:del`/`w:rPrChange`, ODT `text:change-*`,
	// …). Every extractor defaults to `None` until its format-specific
	// implementation is added. Extractors that do populate this field follow
	// the "accepted-changes" convention: inserted text is present in
	// `content`, deleted text is absent — the revision list is the separate
	// audit trail.
	Revisions []DocumentRevision `json:"revisions,omitempty"`
	// Structured extraction output from LLM-based JSON schema extraction.
	//
	// When `structured_extraction` is configured in `ExtractionConfig`, the
	// extracted document content is sent to a VLM with the provided JSON schema.
	// The response is parsed and stored here as a JSON value matching the schema.
	// It is exposed as raw JSON so callers can decode it into their own type.
	StructuredOutput *json.RawMessage `json:"structured_output,omitempty"`
	// Code intelligence results from tree-sitter analysis.
	//
	// Populated when extracting source code files with the `tree-sitter` feature.
	// Contains metrics, structural analysis, imports/exports, comments,
	// docstrings, symbols, diagnostics, and optionally chunked code segments.
	//
	// Stored as an opaque JSON value so that all language bindings (Go, Java,
	// C#, …) can deserialize it as a raw JSON object rather than a typed struct.
	// The underlying type is `tree_sitter_language_pack::ProcessResult`.
	CodeIntelligence *json.RawMessage `json:"code_intelligence,omitempty"`
	// LLM token usage and cost data for all LLM calls made during this extraction.
	//
	// Contains one entry per LLM call. Multiple entries are produced when
	// VLM OCR, structured extraction, or LLM embeddings run during
	// the same extraction.
	//
	// `None` when no LLM was used.
	LlmUsage []LlmUsage `json:"llm_usage,omitempty"`
	// Pre-rendered content in the requested output format.
	//
	// Populated during `derive_extraction_result` before tree derivation consumes
	// element data. `apply_output_format` swaps this into `content` at the end
	// of the pipeline, after post-processors have operated on plain text.
	FormattedContent *string `json:"formatted_content,omitempty"`
	// Structured hOCR document for the OCR+layout pipeline.
	//
	// When tesseract produces hOCR output, the parsed `InternalDocument` carries
	// paragraph structure with bounding boxes and confidence scores. The layout
	// classification step enriches these elements before final rendering.
	// NOTE(review): exposed here as an optional string; the encoding of the
	// `InternalDocument` inside that string is not visible from this file —
	// confirm before parsing it.
	OcrInternalDocument *string `json:"ocr_internal_document,omitempty"`
}

// ArchiveEntry is a single file extracted from an archive.
//
// When archives (ZIP, TAR, 7Z, GZIP) are extracted with recursive extraction
// enabled, each processable file produces its own full `ExtractionResult`.
type ArchiveEntry struct {
	// Path is the archive-relative file path (e.g. "folder/document.pdf").
	Path string `json:"path"`
	// MimeType is the detected MIME type of the file.
	MimeType string `json:"mime_type"`
	// Result is the full extraction result for this file.
	Result ExtractionResult `json:"result"`
}

// ProcessingWarning is a non-fatal warning from a processing pipeline stage.
//
// Captures errors from optional features that don't prevent extraction
// but may indicate degraded results.
type ProcessingWarning struct {
	// Source is the pipeline stage or feature that produced this warning
	// (e.g., "embedding", "chunking", "language_detection", "output_format").
	Source string `json:"source"`
	// Message is a human-readable description of what went wrong.
	Message string `json:"message"`
}

// LlmUsage holds token usage and cost data for a single LLM call made during extraction.
//
// Populated when VLM OCR, structured extraction, or LLM-based embeddings
// are used. Multiple entries may be present when multiple LLM calls occur
// within one extraction (e.g. VLM OCR + structured extraction).
//
// All numeric fields are optional pointers; a nil pointer means the value was
// absent from the JSON payload.
type LlmUsage struct {
	// Model is the LLM model identifier (e.g. "openai/gpt-4o", "anthropic/claude-sonnet-4-20250514").
	Model string `json:"model"`
	// Source is the pipeline stage that triggered this LLM call
	// (e.g. "vlm_ocr", "structured_extraction", "embeddings").
	Source string `json:"source"`
	// InputTokens is the number of input/prompt tokens consumed.
	InputTokens *uint64 `json:"input_tokens,omitempty"`
	// OutputTokens is the number of output/completion tokens generated.
	OutputTokens *uint64 `json:"output_tokens,omitempty"`
	// TotalTokens is the total token count (input + output).
	TotalTokens *uint64 `json:"total_tokens,omitempty"`
	// EstimatedCost is the estimated cost in USD based on the provider's published pricing.
	EstimatedCost *float64 `json:"estimated_cost,omitempty"`
	// FinishReason records why the model stopped generating
	// (e.g. "stop", "length", "content_filter").
	FinishReason *string `json:"finish_reason,omitempty"`
}

// Chunk is a text chunk with an optional embedding and metadata.
//
// Chunks are created when chunking is enabled in `ExtractionConfig`. Each chunk
// contains the text content, optional embedding vector (if embedding generation
// is configured), and metadata about its position in the document.
type Chunk struct {
	// Content is the text content of this chunk.
	Content string `json:"content"`
	// ChunkType is the semantic structural classification of this chunk.
	//
	// Assigned by the heuristic classifier based on content patterns and
	// heading context. Defaults to `ChunkType::Unknown` when no rule matches.
	ChunkType ChunkType `json:"chunk_type"`
	// Embedding is the optional embedding vector for this chunk.
	//
	// Only populated when `EmbeddingConfig` is provided in chunking configuration.
	// The dimensionality depends on the chosen embedding model.
	Embedding []float32 `json:"embedding,omitempty"`
	// Metadata describes this chunk's position and properties.
	Metadata ChunkMetadata `json:"metadata"`
}

// HeadingContext is the heading context for a chunk within a Markdown document.
//
// Contains the heading hierarchy from document root to this chunk's section.
type HeadingContext struct {
	// Headings is the heading hierarchy from document root to this chunk's section.
	// Index 0 is the outermost (h1), last element is the most specific.
	Headings []HeadingLevel `json:"headings,omitempty"`
}

// HeadingLevel is a single heading in the hierarchy.
type HeadingLevel struct {
	// Level is the heading depth (1 = h1, 2 = h2, etc.).
	Level uint8 `json:"level"`
	// Text is the text content of the heading.
	Text string `json:"text"`
}

// ChunkMetadata describes a chunk's position in the original document.
type ChunkMetadata struct {
	// ByteStart is the byte offset where this chunk starts in the original text
	// (UTF-8 valid boundary).
	ByteStart uint `json:"byte_start"`
	// ByteEnd is the byte offset where this chunk ends in the original text
	// (UTF-8 valid boundary).
	ByteEnd uint `json:"byte_end"`
	// TokenCount is the number of tokens in this chunk (if available).
	//
	// This is calculated by the embedding model's tokenizer if embeddings are enabled.
	TokenCount *uint `json:"token_count,omitempty"`
	// ChunkIndex is the zero-based index of this chunk in the document.
	ChunkIndex uint `json:"chunk_index"`
	// TotalChunks is the total number of chunks in the document.
	TotalChunks uint `json:"total_chunks"`
	// FirstPage is the first page number this chunk spans (1-indexed).
	//
	// Only populated when page tracking is enabled in extraction configuration.
	FirstPage *uint32 `json:"first_page,omitempty"`
	// LastPage is the last page number this chunk spans (1-indexed, equal to
	// first_page for single-page chunks).
	//
	// Only populated when page tracking is enabled in extraction configuration.
	LastPage *uint32 `json:"last_page,omitempty"`
	// HeadingContext is the heading context when using the Markdown chunker.
	//
	// Contains the heading hierarchy this chunk falls under.
	// Only populated when `ChunkerType::Markdown` is used.
	HeadingContext *HeadingContext `json:"heading_context,omitempty"`
	// ImageIndices holds indices into `ExtractionResult.images` for images on
	// pages covered by this chunk.
	//
	// Contains zero-based indices into the top-level `images` collection for every
	// image whose `page_number` falls within `[first_page, last_page]`.
	// Empty when image extraction is disabled or the chunk spans no pages with images.
	ImageIndices []uint32 `json:"image_indices,omitempty"`
}

// ExtractedImage is an image extracted from a document.
//
// Contains raw image data, metadata, and optional nested OCR results.
// Raw bytes allow cross-language compatibility - users can convert to
// PIL.Image (Python), Sharp (Node.js), or other formats as needed.
//
// ExtractedImage has a custom MarshalJSON that writes Data as a JSON array of
// integers instead of Go's default base64 string.
type ExtractedImage struct {
	// Data is the raw image data (PNG, JPEG, WebP, etc. bytes).
	// The original description notes the use of `bytes::Bytes` for cheap
	// cloning of large buffers; in Go this is a plain byte slice.
	Data []byte `json:"data"`
	// Format is the image format (e.g., "jpeg", "png", "webp").
	Format string `json:"format"`
	// ImageIndex is the zero-indexed position of this image in the document/page.
	ImageIndex uint32 `json:"image_index"`
	// PageNumber is the page/slide number where the image was found (1-indexed).
	PageNumber *uint32 `json:"page_number,omitempty"`
	// Width is the image width in pixels.
	Width *uint32 `json:"width,omitempty"`
	// Height is the image height in pixels.
	Height *uint32 `json:"height,omitempty"`
	// Colorspace holds colorspace information (e.g., "RGB", "CMYK", "Gray").
	Colorspace *string `json:"colorspace,omitempty"`
	// BitsPerComponent is the number of bits per color component (e.g., 8, 16).
	BitsPerComponent *uint32 `json:"bits_per_component,omitempty"`
	// IsMask reports whether this image is a mask image.
	IsMask bool `json:"is_mask"`
	// Description is an optional description of the image.
	Description *string `json:"description,omitempty"`
	// OcrResult is the nested OCR extraction result (if the image was OCRed).
	//
	// When OCR is performed on this image, the result is embedded here
	// rather than in a separate collection, making the relationship explicit.
	OcrResult *ExtractionResult `json:"ocr_result,omitempty"`
	// BoundingBox is the bounding box of the image on the page
	// (PDF coordinates: x0=left, y0=bottom, x1=right, y1=top).
	// Only populated for PDF-extracted images when position data is available from the PDF extractor.
	BoundingBox *BoundingBox `json:"bounding_box,omitempty"`
	// SourcePath is the original source path of the image within the document
	// archive (e.g., "media/image1.png" in DOCX).
	// Used for rendering image references when the binary data is not extracted.
	SourcePath *string `json:"source_path,omitempty"`
	// ImageKind is a heuristic classification of what this image likely depicts.
	// nil if classification was disabled or inconclusive.
	ImageKind *ImageKind `json:"image_kind,omitempty"`
	// KindConfidence is the confidence score for `image_kind`, in the range 0.0 to 1.0.
	KindConfidence *float32 `json:"kind_confidence,omitempty"`
	// ClusterID is an identifier shared across images that form a single logical
	// figure (e.g. all raster tiles of one technical drawing). nil for singletons.
	ClusterID *uint32 `json:"cluster_id,omitempty"`
}

// MarshalJSON implements json.Marshaler for ExtractedImage.
//
// The `[]byte` field Data is emitted as a JSON array of integers (the format
// Rust's serde `Vec<u8>` deserializer expects) instead of Go's default base64
// string. All other fields are encoded exactly as their struct tags describe.
func (v ExtractedImage) MarshalJSON() ([]byte, error) {
	// Widen every byte to int so encoding/json writes a number array. The
	// slice is always allocated, so a nil Data still encodes as `[]`.
	pixels := make([]int, 0, len(v.Data))
	for _, octet := range v.Data {
		pixels = append(pixels, int(octet))
	}

	// The wire struct spells out every field in declaration order. Embedding
	// ExtractedImage instead would yield both a base64 string and an int
	// array under the same "data" key.
	return json.Marshal(struct {
		Data             []int             `json:"data"`
		Format           string            `json:"format"`
		ImageIndex       uint32            `json:"image_index"`
		PageNumber       *uint32           `json:"page_number,omitempty"`
		Width            *uint32           `json:"width,omitempty"`
		Height           *uint32           `json:"height,omitempty"`
		Colorspace       *string           `json:"colorspace,omitempty"`
		BitsPerComponent *uint32           `json:"bits_per_component,omitempty"`
		IsMask           bool              `json:"is_mask"`
		Description      *string           `json:"description,omitempty"`
		OcrResult        *ExtractionResult `json:"ocr_result,omitempty"`
		BoundingBox      *BoundingBox      `json:"bounding_box,omitempty"`
		SourcePath       *string           `json:"source_path,omitempty"`
		ImageKind        *ImageKind        `json:"image_kind,omitempty"`
		KindConfidence   *float32          `json:"kind_confidence,omitempty"`
		ClusterID        *uint32           `json:"cluster_id,omitempty"`
	}{
		Data:             pixels,
		Format:           v.Format,
		ImageIndex:       v.ImageIndex,
		PageNumber:       v.PageNumber,
		Width:            v.Width,
		Height:           v.Height,
		Colorspace:       v.Colorspace,
		BitsPerComponent: v.BitsPerComponent,
		IsMask:           v.IsMask,
		Description:      v.Description,
		OcrResult:        v.OcrResult,
		BoundingBox:      v.BoundingBox,
		SourcePath:       v.SourcePath,
		ImageKind:        v.ImageKind,
		KindConfidence:   v.KindConfidence,
		ClusterID:        v.ClusterID,
	})
}

// BoundingBox holds bounding box coordinates for element positioning.
type BoundingBox struct {
	// X0 is the left x-coordinate.
	X0 float64 `json:"x0"`
	// Y0 is the bottom y-coordinate.
	Y0 float64 `json:"y0"`
	// X1 is the right x-coordinate.
	X1 float64 `json:"x1"`
	// Y1 is the top y-coordinate.
	Y1 float64 `json:"y1"`
}

// ElementMetadata is the metadata for a semantic element.
type ElementMetadata struct {
	// PageNumber is the page number (1-indexed).
	PageNumber *uint32 `json:"page_number,omitempty"`
	// Filename is the source filename or document name.
	Filename *string `json:"filename,omitempty"`
	// Coordinates holds the bounding box coordinates, if available.
	Coordinates *BoundingBox `json:"coordinates,omitempty"`
	// ElementIndex is the position index in the element sequence.
	ElementIndex *uint `json:"element_index,omitempty"`
	// Additional holds additional custom metadata.
	Additional map[string]string `json:"additional,omitempty"`
}

// Element is a semantic element extracted from a document.
//
// Represents a logical unit of content with semantic classification,
// unique identifier, and metadata for tracking origin and position.
type Element struct {
	// ElementID is the unique element identifier.
	ElementID string `json:"element_id"`
	// ElementType is the semantic type of this element.
	ElementType ElementType `json:"element_type"`
	// Text is the text content of the element.
	Text string `json:"text"`
	// Metadata holds metadata about the element.
	Metadata ElementMetadata `json:"metadata"`
}

// ExcelWorkbook is an Excel workbook representation.
//
// Contains all sheets from an Excel file (.xlsx, .xls, etc.) with
// extracted content and metadata.
type ExcelWorkbook struct {
	// Sheets holds all sheets in the workbook.
	Sheets []ExcelSheet `json:"sheets,omitempty"`
	// Metadata holds workbook-level metadata (author, creation date, etc.).
	Metadata map[string]string `json:"metadata,omitempty"`
	// Revisions holds collaborative-edit revision headers from
	// `xl/revisions/revisionHeaders.xml`.
	//
	// Populated for legacy shared-workbook `.xlsx` files that contain the
	// `xl/revisions/` directory. Each `<header>` element maps to one
	// `DocumentRevision { kind: FormatChange }` carrying the header's `guid`
	// (→ `revision_id`), `userName` (→ `author`), and `dateTime` (→ `timestamp`).
	// `anchor` and `delta` are `None`/empty for v1 (per-cell log parsing is a
	// follow-up). Absent (nil) when `xl/revisions/revisionHeaders.xml` is absent.
	Revisions []DocumentRevision `json:"revisions,omitempty"`
}

// ExcelSheet is a single Excel worksheet.
//
// Represents one sheet from an Excel workbook with its content
// converted to Markdown format and dimensional statistics.
type ExcelSheet struct {
	// Name is the sheet name as it appears in Excel.
	Name string `json:"name"`
	// Markdown is the sheet content converted to Markdown tables.
	Markdown string `json:"markdown"`
	// RowCount is the number of rows.
	RowCount uint `json:"row_count"`
	// ColCount is the number of columns.
	ColCount uint `json:"col_count"`
	// CellCount is the total number of non-empty cells.
	CellCount uint `json:"cell_count"`
	// TableCells holds pre-extracted table cells (rows of cell values).
	// Populated during markdown generation to avoid re-parsing markdown.
	// Absent (nil) for empty sheets.
	TableCells [][]string `json:"table_cells,omitempty"`
}

// XMLExtractionResult is the XML extraction result.
//
// Contains extracted text content from XML files along with
// structural statistics about the XML document.
type XMLExtractionResult struct {
	// Content is the extracted text content (XML structure filtered out).
	Content string `json:"content"`
	// ElementCount is the total number of XML elements processed.
	ElementCount uint `json:"element_count"`
	// UniqueElements lists the unique element names found (sorted).
	UniqueElements []string `json:"unique_elements,omitempty"`
}

// TextExtractionResult is the plain text and Markdown extraction result.
//
// Contains the extracted text along with statistics and,
// for Markdown files, structural elements like headers and links.
type TextExtractionResult struct {
	// Content is the extracted text content.
	Content string `json:"content"`
	// LineCount is the number of lines.
	LineCount uint `json:"line_count"`
	// WordCount is the number of words.
	WordCount uint `json:"word_count"`
	// CharacterCount is the number of characters.
	CharacterCount uint `json:"character_count"`
	// Headers holds Markdown headers (text only, Markdown files only).
	Headers []string `json:"headers,omitempty"`
	// Links holds Markdown links as (text, URL) tuples (Markdown files only).
	// Each tuple is represented as an inner string slice.
	Links [][]string `json:"links,omitempty"`
	// CodeBlocks holds code blocks as (language, code) tuples (Markdown files only).
	// Each tuple is represented as an inner string slice.
	CodeBlocks [][]string `json:"code_blocks,omitempty"`
}

// PptxExtractionResult is the PowerPoint (PPTX) extraction result.
//
// Contains extracted slide content, metadata, and embedded images/tables.
type PptxExtractionResult struct {
	// Content is the extracted text content from all slides.
	Content string `json:"content"`
	// Metadata is the presentation metadata.
	Metadata PptxMetadata `json:"metadata"`
	// SlideCount is the total number of slides.
	SlideCount uint `json:"slide_count"`
	// ImageCount is the total number of embedded images.
	ImageCount uint `json:"image_count"`
	// TableCount is the total number of tables.
	TableCount uint `json:"table_count"`
	// Images holds the extracted images from the presentation.
	Images []ExtractedImage `json:"images,omitempty"`
	// PageStructure is the slide structure with boundaries (when page tracking is enabled).
	PageStructure *PageStructure `json:"page_structure,omitempty"`
	// PageContents holds per-slide content (when page tracking is enabled).
	PageContents []PageContent `json:"page_contents,omitempty"`
	// Document is the structured document representation.
	Document *DocumentStructure `json:"document,omitempty"`
	// Hyperlinks holds hyperlinks discovered in slides as (url, optional_label) pairs.
	//
	// NOTE(review): the description says pairs with an optional label, but the
	// Go type is a flat []string, unlike the [][]string used for tuple lists
	// elsewhere in this file — confirm the wire format decodes into this type.
	Hyperlinks []string `json:"hyperlinks,omitempty"`
	// OfficeMetadata holds Office metadata extracted from docProps/core.xml and
	// docProps/app.xml.
	//
	// Contains keys like "title", "author", "created_by", "subject", "keywords",
	// "modified_by", "created_at", "modified_at", etc.
	OfficeMetadata map[string]string `json:"office_metadata,omitempty"`
	// Revisions holds slide comments as revisions.
	//
	// Each `<p:cm>` element in `ppt/comments/comment{N}.xml` becomes a
	// `DocumentRevision { kind: Comment }` with author (resolved from
	// `ppt/commentAuthors.xml`), ISO-8601 timestamp, and
	// `RevisionAnchor::Slide { index }`. Absent (nil) when no comment XML parts exist.
	Revisions []DocumentRevision `json:"revisions,omitempty"`
}

// EmailExtractionResult is the email extraction result.
//
// Complete representation of an extracted email message (.eml or .msg)
// including headers, body content, and attachments.
type EmailExtractionResult struct {
	// Subject is the email subject line.
	Subject *string `json:"subject,omitempty"`
	// FromEmail is the sender email address.
	FromEmail *string `json:"from_email,omitempty"`
	// ToEmails holds the primary recipient email addresses.
	ToEmails []string `json:"to_emails,omitempty"`
	// CcEmails holds the CC recipient email addresses.
	CcEmails []string `json:"cc_emails,omitempty"`
	// BccEmails holds the BCC recipient email addresses.
	BccEmails []string `json:"bcc_emails,omitempty"`
	// Date is the email date/timestamp.
	Date *string `json:"date,omitempty"`
	// MessageID is the Message-ID header value.
	MessageID *string `json:"message_id,omitempty"`
	// PlainText is the plain text version of the email body.
	PlainText *string `json:"plain_text,omitempty"`
	// HTMLContent is the HTML version of the email body.
	HTMLContent *string `json:"html_content,omitempty"`
	// Content is the cleaned/processed text content. Described upstream as
	// aliased to `cleaned_text` for back-compat; this struct's JSON tag only
	// names `content`.
	Content string `json:"content"`
	// Attachments lists the email attachments.
	Attachments []EmailAttachment `json:"attachments,omitempty"`
	// Metadata holds additional email headers and metadata.
	Metadata map[string]string `json:"metadata,omitempty"`
}

// EmailAttachment is an email attachment representation.
//
// Contains metadata and optionally the content of an email attachment.
type EmailAttachment struct {
	// Name is the attachment name (from the Content-Disposition header).
	Name *string `json:"name,omitempty"`
	// Filename is the filename of the attachment.
	Filename *string `json:"filename,omitempty"`
	// MimeType is the MIME type of the attachment.
	MimeType *string `json:"mime_type,omitempty"`
	// Size is the size in bytes.
	Size *uint `json:"size,omitempty"`
	// IsImage reports whether this attachment is an image.
	IsImage bool `json:"is_image"`
	// Data is the attachment data (if extracted). Encoded by MarshalJSON as a
	// JSON array of integers rather than a base64 string.
	Data []byte `json:"data,omitempty"`
}

// MarshalJSON implements json.Marshaler for EmailAttachment.
//
// The `[]byte` field Data is emitted as a JSON array of integers (the format
// Rust's serde `Vec<u8>` deserializer expects) instead of Go's default base64
// string. A nil or empty Data is omitted from the output entirely.
func (v EmailAttachment) MarshalJSON() ([]byte, error) {
	// Leave the slice nil unless there is a payload; the `omitempty` tag on
	// the wire struct then drops the "data" key.
	var payload []int
	if v.Data != nil {
		payload = make([]int, 0, len(v.Data))
		for _, octet := range v.Data {
			payload = append(payload, int(octet))
		}
	}

	// The wire struct lists every field explicitly. Embedding EmailAttachment
	// instead would yield both a base64 string and an int array under the
	// same "data" key.
	return json.Marshal(struct {
		Name     *string `json:"name,omitempty"`
		Filename *string `json:"filename,omitempty"`
		MimeType *string `json:"mime_type,omitempty"`
		Size     *uint   `json:"size,omitempty"`
		IsImage  bool    `json:"is_image"`
		Data     []int   `json:"data,omitempty"`
	}{
		Name:     v.Name,
		Filename: v.Filename,
		MimeType: v.MimeType,
		Size:     v.Size,
		IsImage:  v.IsImage,
		Data:     payload,
	})
}

// OcrExtractionResult is the OCR extraction result.
//
// Result of performing OCR on an image or scanned document,
// including recognized text and detected tables.
type OcrExtractionResult struct {
	// Content is the recognized text content.
	Content string `json:"content"`
	// MimeType is the original MIME type of the processed image.
	MimeType string `json:"mime_type"`
	// Metadata holds OCR processing metadata (confidence scores, language, etc.).
	// Values are kept as raw JSON and are not decoded here.
	Metadata map[string]json.RawMessage `json:"metadata,omitempty"`
	// Tables holds the tables detected and extracted via OCR.
	Tables []OcrTable `json:"tables,omitempty"`
	// OcrElements holds structured OCR elements with bounding boxes and confidence scores.
	// Available when TSV output is requested or table detection is enabled.
	OcrElements []OcrElement `json:"ocr_elements,omitempty"`
	// InternalDocument is the structured document produced from hOCR parsing.
	// Carries paragraph structure, bounding boxes, and confidence scores
	// that the flattened `content` string discards.
	//
	// NOTE(review): described as a structured document but typed as *string —
	// confirm the wire format is a JSON string rather than an object.
	InternalDocument *string `json:"internal_document,omitempty"`
}

// OcrTable is a table detected via OCR.
//
// Represents a table structure recognized during OCR processing.
type OcrTable struct {
	// Cells holds the table cells as a 2D slice (rows × columns).
	Cells [][]string `json:"cells,omitempty"`
	// Markdown is the Markdown representation of the table.
	Markdown string `json:"markdown"`
	// PageNumber is the page number where the table was found (1-indexed).
	PageNumber uint32 `json:"page_number"`
	// BoundingBox is the bounding box of the table in pixel coordinates
	// (from OCR word positions).
	BoundingBox *OcrTableBoundingBox `json:"bounding_box,omitempty"`
}

// OcrTableBoundingBox is the bounding box for an OCR-detected table in pixel coordinates.
type OcrTableBoundingBox struct {
	// Left is the left x-coordinate (pixels).
	Left uint32 `json:"left"`
	// Top is the top y-coordinate (pixels).
	Top uint32 `json:"top"`
	// Right is the right x-coordinate (pixels).
	Right uint32 `json:"right"`
	// Bottom is the bottom y-coordinate (pixels).
	Bottom uint32 `json:"bottom"`
}

// ImagePreprocessingConfig is the image preprocessing configuration for OCR.
//
// These settings control how images are preprocessed before OCR to improve
// text recognition quality. Different preprocessing strategies work better
// for different document types.
//
// Pointer fields are omitted from the JSON when nil. The plain bool fields
// carry no `omitempty`, so they are always serialized, including as `false`.
type ImagePreprocessingConfig struct {
	// TargetDpi is the target DPI for the image (300 is standard, 600 for small text).
	TargetDpi *int32 `json:"target_dpi,omitempty"`
	// AutoRotate enables auto-detection and correction of image rotation.
	AutoRotate *bool `json:"auto_rotate,omitempty"`
	// Deskew enables skew correction (tilted images).
	Deskew *bool `json:"deskew,omitempty"`
	// Denoise removes noise from the image.
	Denoise bool `json:"denoise"`
	// ContrastEnhance enhances contrast for better text visibility.
	ContrastEnhance bool `json:"contrast_enhance"`
	// BinarizationMethod is the binarization method: "otsu", "sauvola", "adaptive".
	BinarizationMethod *string `json:"binarization_method,omitempty"`
	// InvertColors inverts colors (white text on black → black on white).
	InvertColors bool `json:"invert_colors"`
}

// TesseractConfig is the Tesseract OCR configuration.
//
// Provides fine-grained control over Tesseract OCR engine parameters.
// Most users can use the defaults, but these settings allow optimization
// for specific document types (invoices, handwriting, etc.).
//
// Pointer fields are omitted from the JSON when nil. Non-pointer fields carry
// no `omitempty`, so their Go zero values are always serialized.
type TesseractConfig struct {
	// Language is the language code (e.g., "eng", "deu", "fra").
	Language *string `json:"language,omitempty"`
	// Psm is the Page Segmentation Mode (0-13).
	//
	// Common values:
	// - 3: Fully automatic page segmentation (native default)
	// - 6: Assume a single uniform block of text (WASM default — avoids layout-analysis hang)
	// - 11: Sparse text with no particular order
	Psm *int32 `json:"psm,omitempty"`
	// OutputFormat is the output format ("text" or "markdown").
	OutputFormat *string `json:"output_format,omitempty"`
	// Oem is the OCR Engine Mode (0-3).
	//
	// - 0: Legacy engine only
	// - 1: Neural nets (LSTM) only (usually best)
	// - 2: Legacy + LSTM
	// - 3: Default (based on what's available)
	Oem *int32 `json:"oem,omitempty"`
	// MinConfidence is the minimum confidence threshold (0.0-100.0).
	//
	// Words with confidence below this threshold may be rejected or flagged.
	MinConfidence float64 `json:"min_confidence"`
	// Preprocessing is the image preprocessing configuration.
	//
	// Controls how images are preprocessed before OCR. Can significantly
	// improve quality for scanned documents or low-quality images.
	Preprocessing *ImagePreprocessingConfig `json:"preprocessing,omitempty"`
	// EnableTableDetection enables automatic table detection and reconstruction.
	EnableTableDetection *bool `json:"enable_table_detection,omitempty"`
	// TableMinConfidence is the minimum confidence threshold for table detection (0.0-1.0).
	TableMinConfidence float64 `json:"table_min_confidence"`
	// TableColumnThreshold is the column threshold for table detection (pixels).
	TableColumnThreshold *int32 `json:"table_column_threshold,omitempty"`
	// TableRowThresholdRatio is the row threshold ratio for table detection (0.0-1.0).
	TableRowThresholdRatio *float64 `json:"table_row_threshold_ratio,omitempty"`
	// UseCache enables OCR result caching.
	UseCache *bool `json:"use_cache,omitempty"`
	// ClassifyUsePreAdaptedTemplates uses pre-adapted templates for character classification.
	ClassifyUsePreAdaptedTemplates *bool `json:"classify_use_pre_adapted_templates,omitempty"`
	// LanguageModelNgramOn enables the N-gram language model.
	LanguageModelNgramOn bool `json:"language_model_ngram_on"`
	// TesseditDontBlkrejGoodWds avoids rejecting good words during block-level processing.
	TesseditDontBlkrejGoodWds *bool `json:"tessedit_dont_blkrej_good_wds,omitempty"`
	// TesseditDontRowrejGoodWds avoids rejecting good words during row-level processing.
	TesseditDontRowrejGoodWds *bool `json:"tessedit_dont_rowrej_good_wds,omitempty"`
	// TesseditEnableDictCorrection enables dictionary correction.
	TesseditEnableDictCorrection *bool `json:"tessedit_enable_dict_correction,omitempty"`
	// TesseditCharWhitelist is the whitelist of allowed characters (empty = all allowed).
	TesseditCharWhitelist string `json:"tessedit_char_whitelist"`
	// TesseditCharBlacklist is the blacklist of forbidden characters (empty = none forbidden).
	TesseditCharBlacklist string `json:"tessedit_char_blacklist"`
	// TesseditUsePrimaryParamsModel uses the primary language params model.
	TesseditUsePrimaryParamsModel *bool `json:"tessedit_use_primary_params_model,omitempty"`
	// TextordSpaceSizeIsVariable enables variable-width space detection.
	TextordSpaceSizeIsVariable *bool `json:"textord_space_size_is_variable,omitempty"`
	// ThresholdingMethod selects the adaptive thresholding method when true.
	ThresholdingMethod bool `json:"thresholding_method"`
}

// ImagePreprocessingMetadata is the image preprocessing metadata.
//
// Tracks the transformations applied to an image during OCR preprocessing,
// including DPI normalization, resizing, and resampling.
type ImagePreprocessingMetadata struct {
	// OriginalDimensions holds the original image dimensions (width, height) in pixels.
	OriginalDimensions []uint `json:"original_dimensions,omitempty"`
	// OriginalDpi holds the original image DPI (horizontal, vertical).
	OriginalDpi []float64 `json:"original_dpi,omitempty"`
	// TargetDpi is the target DPI from configuration.
	TargetDpi int32 `json:"target_dpi"`
	// ScaleFactor is the scaling factor applied to the image.
	ScaleFactor float64 `json:"scale_factor"`
	// AutoAdjusted reports whether DPI was auto-adjusted based on content.
	AutoAdjusted bool `json:"auto_adjusted"`
	// FinalDpi is the final DPI after processing.
	FinalDpi int32 `json:"final_dpi"`
	// NewDimensions holds the new dimensions after resizing (if resized).
	NewDimensions []uint `json:"new_dimensions,omitempty"`
	// ResampleMethod is the resampling algorithm used ("LANCZOS3", "CATMULLROM", etc.).
	ResampleMethod string `json:"resample_method"`
	// DimensionClamped reports whether dimensions were clamped to max_image_dimension.
	DimensionClamped bool `json:"dimension_clamped"`
	// CalculatedDpi is the calculated optimal DPI (if auto_adjust_dpi enabled).
	CalculatedDpi *int32 `json:"calculated_dpi,omitempty"`
	// SkippedResize reports whether resize was skipped (dimensions already optimal).
	SkippedResize bool `json:"skipped_resize"`
	// ResizeError is the error message if resize failed.
	ResizeError *string `json:"resize_error,omitempty"`
}

// Metadata is the extraction result metadata.
//
// Contains common fields applicable to all formats, format-specific metadata
// via a discriminated union, and additional custom fields from postprocessors.
type Metadata struct {
	// Title is the document title.
	Title *string `json:"title,omitempty"`
	// Subject is the document subject or description.
	Subject *string `json:"subject,omitempty"`
	// Authors lists the primary author(s); always a list for consistency.
	Authors []string `json:"authors,omitempty"`
	// Keywords lists the keywords/tags; always a list for consistency.
	Keywords []string `json:"keywords,omitempty"`
	// Language is the primary language (ISO 639 code).
	Language *string `json:"language,omitempty"`
	// CreatedAt is the creation timestamp (ISO 8601 format).
	CreatedAt *string `json:"created_at,omitempty"`
	// ModifiedAt is the last modification timestamp (ISO 8601 format).
	ModifiedAt *string `json:"modified_at,omitempty"`
	// CreatedBy is the user who created the document.
	CreatedBy *string `json:"created_by,omitempty"`
	// ModifiedBy is the user who last modified the document.
	ModifiedBy *string `json:"modified_by,omitempty"`
	// Pages is the page/slide/sheet structure with boundaries.
	Pages *PageStructure `json:"pages,omitempty"`
	// Format is the format-specific metadata (discriminated union).
	//
	// Contains detailed metadata specific to the document format.
	// Serialized as a nested `"format"` object with a `format_type` discriminator field.
	Format *FormatMetadata `json:"format,omitempty"`
	// ImagePreprocessing is the image preprocessing metadata (when OCR preprocessing was applied).
	ImagePreprocessing *ImagePreprocessingMetadata `json:"image_preprocessing,omitempty"`
	// JSONSchema is the JSON schema (for structured data extraction), kept as raw JSON.
	JSONSchema *json.RawMessage `json:"json_schema,omitempty"`
	// Error is the error metadata (for batch operations).
	Error *ErrorMetadata `json:"error,omitempty"`
	// ExtractionDurationMs is the extraction duration in milliseconds (for benchmarking).
	//
	// This field is populated by batch extraction to provide per-file timing
	// information. It is absent (nil) for single-file extraction (which uses external timing).
	ExtractionDurationMs *uint64 `json:"extraction_duration_ms,omitempty"`
	// Category is the document category (from frontmatter or classification).
	Category *string `json:"category,omitempty"`
	// Tags lists the document tags (from frontmatter).
	Tags []string `json:"tags,omitempty"`
	// DocumentVersion is the document version string (from frontmatter).
	DocumentVersion *string `json:"document_version,omitempty"`
	// AbstractText is the abstract or summary text (from frontmatter).
	AbstractText *string `json:"abstract_text,omitempty"`
	// OutputFormat is the output format identifier (e.g., "markdown", "html", "text").
	//
	// Set by the output format pipeline stage when format conversion is applied.
	// Previously stored in `metadata.additional["output_format"]`.
	OutputFormat *string `json:"output_format,omitempty"`
	// OcrUsed reports whether OCR was used during extraction.
	//
	// Set to `true` whenever the extraction pipeline ran an OCR backend
	// (Tesseract, PaddleOCR, VLM, etc.) and used that output as the primary
	// or fallback text. `false` means native text extraction was used exclusively.
	OcrUsed bool `json:"ocr_used"`
	// Additional holds additional custom fields from postprocessors.
	//
	// Serialized as a nested `"additional"` object (not flattened at root level).
	// Values are kept as raw JSON and are not decoded here.
	Additional map[string]json.RawMessage `json:"additional,omitempty"`
}

// ExcelMetadata is the Excel/spreadsheet format metadata.
//
// Identifies the document as a spreadsheet source via the `FormatMetadata::Excel`
// discriminant. Sheet count and sheet names are stored inside this struct.
type ExcelMetadata struct {
	// SheetCount is the number of sheets in the workbook.
	SheetCount *uint32 `json:"sheet_count,omitempty"`
	// SheetNames lists the names of all sheets in the workbook.
	SheetNames []string `json:"sheet_names,omitempty"`
}

// EmailMetadata is the email metadata extracted from .eml and .msg files.
//
// Includes sender/recipient information, message ID, and attachment list.
type EmailMetadata struct {
	// FromEmail is the sender's email address.
	FromEmail *string `json:"from_email,omitempty"`
	// FromName is the sender's display name.
	FromName *string `json:"from_name,omitempty"`
	// ToEmails lists the primary recipients.
	ToEmails []string `json:"to_emails,omitempty"`
	// CcEmails lists the CC recipients.
	CcEmails []string `json:"cc_emails,omitempty"`
	// BccEmails lists the BCC recipients.
	BccEmails []string `json:"bcc_emails,omitempty"`
	// MessageID is the Message-ID header value.
	MessageID *string `json:"message_id,omitempty"`
	// Attachments lists the attachment filenames.
	Attachments []string `json:"attachments,omitempty"`
}

// ArchiveMetadata is the archive (ZIP/TAR/7Z) metadata.
//
// Extracted from compressed archive files containing file lists and size information.
type ArchiveMetadata struct {
	// Format is the archive format ("ZIP", "TAR", "7Z", etc.).
	Format string `json:"format"`
	// FileCount is the total number of files in the archive.
	FileCount uint32 `json:"file_count"`
	// FileList lists the file paths within the archive.
	FileList []string `json:"file_list,omitempty"`
	// TotalSize is the total uncompressed size in bytes.
	TotalSize uint64 `json:"total_size"`
	// CompressedSize is the compressed size in bytes (if available).
	CompressedSize *uint64 `json:"compressed_size,omitempty"`
}

// ImageMetadata is the image metadata extracted from image files.
//
// Includes dimensions, format, and EXIF data.
type ImageMetadata struct {
	// Width is the image width in pixels.
	Width uint32 `json:"width"`
	// Height is the image height in pixels.
	Height uint32 `json:"height"`
	// Format is the image format (e.g., "PNG", "JPEG", "TIFF").
	Format string `json:"format"`
	// Exif holds the EXIF metadata tags.
	Exif map[string]string `json:"exif,omitempty"`
}

// XMLMetadata is the XML metadata extracted during XML parsing.
//
// Provides statistics about XML document structure.
type XMLMetadata struct {
	// ElementCount is the total number of XML elements processed.
	ElementCount uint32 `json:"element_count"`
	// UniqueElements lists the unique element tag names (sorted).
	UniqueElements []string `json:"unique_elements,omitempty"`
}

// TextMetadata holds plain-text and Markdown metadata.
//
// It is extracted from plain text and Markdown files. It includes line, word
// and character counts and, for Markdown, structural elements such as headers,
// links and code blocks.
type TextMetadata struct {
	// Number of lines in the document.
	LineCount uint32 `json:"line_count"`
	// Number of words.
	WordCount uint32 `json:"word_count"`
	// Number of characters.
	CharacterCount uint32 `json:"character_count"`
	// Markdown headers (heading text only; Markdown files only).
	Headers []string `json:"headers,omitempty"`
	// Markdown links; each inner slice is a (text, url) pair (Markdown files only).
	Links [][]string `json:"links,omitempty"`
	// Code blocks; each inner slice is a (language, code) pair (Markdown files only).
	CodeBlocks [][]string `json:"code_blocks,omitempty"`
}

// HeaderMetadata describes a single header/heading element.
type HeaderMetadata struct {
	// Header level: 1 (h1) through 6 (h6).
	Level uint8 `json:"level"`
	// Normalized text content of the header.
	Text string `json:"text"`
	// HTML id attribute; nil when not present.
	ID *string `json:"id,omitempty"`
	// Document tree depth at the header element.
	Depth uint32 `json:"depth"`
	// Byte offset in the original HTML document.
	HTMLOffset uint32 `json:"html_offset"`
}

// LinkMetadata describes a single link element.
type LinkMetadata struct {
	// The href URL value.
	Href string `json:"href"`
	// Link text content (normalized).
	Text string `json:"text"`
	// Title attribute; nil when not present.
	Title *string `json:"title,omitempty"`
	// Link type classification.
	LinkType LinkType `json:"link_type"`
	// Rel attribute values.
	Rel []string `json:"rel,omitempty"`
	// Additional attributes; each inner slice is a key-value pair.
	Attributes [][]string `json:"attributes,omitempty"`
}

// ImageMetadataType describes a single image element.
//
// It is distinct from ImageMetadata, which describes an image file.
type ImageMetadataType struct {
	// Image source (URL, data URI, or SVG content).
	Src string `json:"src"`
	// Alternative text from the alt attribute; nil when not present.
	Alt *string `json:"alt,omitempty"`
	// Title attribute; nil when not present.
	Title *string `json:"title,omitempty"`
	// Image dimensions as a (width, height) pair, if available.
	Dimensions []uint32 `json:"dimensions,omitempty"`
	// Image type classification.
	ImageType ImageType `json:"image_type"`
	// Additional attributes; each inner slice is a key-value pair.
	Attributes [][]string `json:"attributes,omitempty"`
}

// StructuredData is a structured data (Schema.org, microdata, RDFa) block.
type StructuredData struct {
	// Type of structured data.
	DataType StructuredDataType `json:"data_type"`
	// Raw JSON string representation. It is kept as a string and is not
	// parsed by this type.
	RawJSON string `json:"raw_json"`
	// Schema type if detectable (e.g., "Article", "Event", "Product").
	SchemaType *string `json:"schema_type,omitempty"`
}

// HTMLMetadata holds metadata extracted from HTML documents.
//
// It includes document-level metadata, Open Graph data, Twitter Card metadata,
// and extracted structural elements (headers, links, images, structured data).
// Every field is optional and is omitted from the JSON encoding when empty.
type HTMLMetadata struct {
	// Document title from the `<title>` tag.
	Title *string `json:"title,omitempty"`
	// Document description from the `<meta name="description">` tag.
	Description *string `json:"description,omitempty"`
	// Document keywords from the `<meta name="keywords">` tag, split on commas.
	Keywords []string `json:"keywords,omitempty"`
	// Document author from the `<meta name="author">` tag.
	Author *string `json:"author,omitempty"`
	// Canonical URL from the `<link rel="canonical">` tag.
	CanonicalURL *string `json:"canonical_url,omitempty"`
	// Base URL from the `<base href="">` tag, used for resolving relative URLs.
	BaseHref *string `json:"base_href,omitempty"`
	// Document language from the `lang` attribute.
	Language *string `json:"language,omitempty"`
	// Document text direction from the `dir` attribute.
	TextDirection *TextDirection `json:"text_direction,omitempty"`
	// Open Graph metadata (og:* properties) for social media.
	// Keys are like "title", "description", "image", "url", etc.
	OpenGraph map[string]string `json:"open_graph,omitempty"`
	// Twitter Card metadata (twitter:* properties).
	// Keys are like "card", "site", "creator", "title", "description", "image", etc.
	TwitterCard map[string]string `json:"twitter_card,omitempty"`
	// Additional meta tags not covered by the specific fields above.
	// Keys are meta name/property attributes, values are their content.
	MetaTags map[string]string `json:"meta_tags,omitempty"`
	// Extracted header elements with hierarchy.
	Headers []HeaderMetadata `json:"headers,omitempty"`
	// Extracted hyperlinks with type classification.
	Links []LinkMetadata `json:"links,omitempty"`
	// Extracted images with source and dimensions.
	Images []ImageMetadataType `json:"images,omitempty"`
	// Extracted structured data blocks.
	StructuredData []StructuredData `json:"structured_data,omitempty"`
}

// OcrMetadata holds OCR processing metadata.
//
// It captures information about the OCR processing configuration and results.
//
// TableRows and TableCols are undocumented upstream; judging by their names
// they presumably carry table row/column counts and are nil when unknown —
// confirm against the core library.
type OcrMetadata struct {
	// OCR language code(s) used.
	Language string `json:"language"`
	// Tesseract Page Segmentation Mode (PSM).
	Psm int32 `json:"psm"`
	// Output format (e.g., "text", "hocr").
	OutputFormat string `json:"output_format"`
	// Number of tables detected.
	TableCount uint32  `json:"table_count"`
	TableRows  *uint32 `json:"table_rows,omitempty"`
	TableCols  *uint32 `json:"table_cols,omitempty"`
}

// ErrorMetadata holds error metadata (for batch operations).
//
// ErrorType and Message are decoded from the "error_type" and "message" JSON
// keys; both are always present in the encoded form (no omitempty).
type ErrorMetadata struct {
	ErrorType string `json:"error_type"`
	Message   string `json:"message"`
}

// PptxMetadata holds PowerPoint presentation metadata.
//
// It is extracted from PPTX files and carries slide counts and presentation
// details.
type PptxMetadata struct {
	// Total number of slides in the presentation.
	SlideCount uint32 `json:"slide_count"`
	// Names of slides (if available).
	SlideNames []string `json:"slide_names,omitempty"`
	// Number of embedded images; nil when not reported.
	ImageCount *uint32 `json:"image_count,omitempty"`
	// Number of tables; nil when not reported.
	TableCount *uint32 `json:"table_count,omitempty"`
}

// DocxMetadata holds Word document metadata.
//
// It is extracted from DOCX files using shared Office Open XML metadata
// extraction, and integrates with the core library's `office_metadata` module
// for core/app/custom properties.
type DocxMetadata struct {
	// Core properties from docProps/core.xml (Dublin Core metadata).
	//
	// Contains title, creator, subject, keywords, dates, etc.
	// The format is shared across DOCX/PPTX/XLSX documents.
	CoreProperties *CoreProperties `json:"core_properties,omitempty"`
	// Application properties from docProps/app.xml (Word-specific statistics).
	//
	// Contains word count, page count, paragraph count, editing time, etc.
	// This is the DOCX-specific variant of Office application properties.
	AppProperties *DocxAppProperties `json:"app_properties,omitempty"`
	// Custom properties from docProps/custom.xml (user-defined properties).
	//
	// Contains key-value pairs defined by users or applications.
	// Values can be strings, numbers, booleans, or dates, so each one is kept
	// as raw JSON for the caller to decode.
	CustomProperties map[string]json.RawMessage `json:"custom_properties,omitempty"`
}

// CsvMetadata holds CSV/TSV file metadata.
//
// The fields are undocumented upstream. Judging by their names and JSON keys
// they describe the table shape (row_count, column_count), the delimiter,
// whether a header row is present (has_header) and the per-column types
// (column_types) — confirm against the core library before relying on this.
type CsvMetadata struct {
	RowCount    uint32   `json:"row_count"`
	ColumnCount uint32   `json:"column_count"`
	Delimiter   *string  `json:"delimiter,omitempty"`
	HasHeader   bool     `json:"has_header"`
	ColumnTypes []string `json:"column_types,omitempty"`
}

// BibtexMetadata holds BibTeX bibliography metadata.
//
// Only EntryCount is documented upstream. The remaining fields are
// presumably the citation keys, author names, publication year range and a
// per-entry-type count keyed by type name — confirm against the core library.
type BibtexMetadata struct {
	// Number of entries in the bibliography.
	EntryCount   uint            `json:"entry_count"`
	CitationKeys []string        `json:"citation_keys,omitempty"`
	Authors      []string        `json:"authors,omitempty"`
	YearRange    *YearRange      `json:"year_range,omitempty"`
	EntryTypes   map[string]uint `json:"entry_types,omitempty"`
}

// CitationMetadata holds citation file metadata (RIS, PubMed, EndNote).
//
// The fields are undocumented upstream. Judging by their names they carry the
// number of citations, the source format, author names, the publication year
// range, DOIs and keywords — confirm against the core library.
type CitationMetadata struct {
	CitationCount uint       `json:"citation_count"`
	Format        *string    `json:"format,omitempty"`
	Authors       []string   `json:"authors,omitempty"`
	YearRange     *YearRange `json:"year_range,omitempty"`
	Dois          []string   `json:"dois,omitempty"`
	Keywords      []string   `json:"keywords,omitempty"`
}

// YearRange is a year range for bibliographic metadata.
//
// Min and Max are optional (nil when absent from the JSON); Years is a list
// of individual years. NOTE(review): presumably Min/Max bound the values in
// Years — confirm against the core library.
type YearRange struct {
	Min   *uint32  `json:"min,omitempty"`
	Max   *uint32  `json:"max,omitempty"`
	Years []uint32 `json:"years,omitempty"`
}

// FictionBookMetadata holds FictionBook (FB2) metadata.
//
// All fields are optional and omitted from JSON when empty. They are
// undocumented upstream; by name they are the book's genres, sequences and
// annotation text — confirm against the core library.
type FictionBookMetadata struct {
	Genres     []string `json:"genres,omitempty"`
	Sequences  []string `json:"sequences,omitempty"`
	Annotation *string  `json:"annotation,omitempty"`
}

// DbfMetadata holds dBASE (DBF) file metadata.
//
// The fields are undocumented upstream; by name they are the record count,
// the field count and the per-field descriptions (see DbfFieldInfo) —
// confirm against the core library.
type DbfMetadata struct {
	RecordCount uint           `json:"record_count"`
	FieldCount  uint           `json:"field_count"`
	Fields      []DbfFieldInfo `json:"fields,omitempty"`
}

// DbfFieldInfo holds information about a single dBASE field.
//
// Name and FieldType are decoded from the "name" and "field_type" JSON keys.
// NOTE(review): the set of possible FieldType values is not visible here.
type DbfFieldInfo struct {
	Name      string `json:"name"`
	FieldType string `json:"field_type"`
}

// JatsMetadata holds JATS (Journal Article Tag Suite) metadata.
//
// All fields are optional and omitted from JSON when empty. They are
// undocumented upstream; by name they are the copyright and license
// statements, a map of history dates and the list of contributors with their
// roles — confirm against the core library.
type JatsMetadata struct {
	Copyright        *string           `json:"copyright,omitempty"`
	License          *string           `json:"license,omitempty"`
	HistoryDates     map[string]string `json:"history_dates,omitempty"`
	ContributorRoles []ContributorRole `json:"contributor_roles,omitempty"`
}

// ContributorRole is a JATS contributor together with an optional role.
//
// Role is nil when the JSON carries no "role" value.
type ContributorRole struct {
	Name string  `json:"name"`
	Role *string `json:"role,omitempty"`
}

// EpubMetadata holds EPUB metadata (Dublin Core extensions).
//
// Every field is an optional string that is omitted from JSON when nil.
// NOTE(review): the fields presumably mirror the Dublin Core elements of the
// same name (coverage, format, relation, source, type) plus a cover image
// reference — confirm against the core library.
type EpubMetadata struct {
	Coverage   *string `json:"coverage,omitempty"`
	DcFormat   *string `json:"dc_format,omitempty"`
	Relation   *string `json:"relation,omitempty"`
	Source     *string `json:"source,omitempty"`
	DcType     *string `json:"dc_type,omitempty"`
	CoverImage *string `json:"cover_image,omitempty"`
}

// PstMetadata holds Outlook PST archive metadata.
//
// MessageCount is decoded from the "message_count" JSON key; presumably the
// number of messages in the archive — confirm against the core library.
type PstMetadata struct {
	MessageCount uint `json:"message_count"`
}

// OcrConfidence holds the confidence scores for an OCR element.
//
// It separates detection confidence (how confident the engine is that text
// exists at this location) from recognition confidence (how confident it is
// about the actual text content).
type OcrConfidence struct {
	// Detection confidence: how confident the OCR engine is that text exists here.
	//
	// PaddleOCR provides this as `box_score`; Tesseract doesn't have a direct
	// equivalent. Range: 0.0 to 1.0, or nil when not available.
	Detection *float64 `json:"detection,omitempty"`
	// Recognition confidence: how confident the engine is about the text content.
	//
	// Range: 0.0 to 1.0.
	Recognition float64 `json:"recognition"`
}

// OcrRotation holds rotation information for an OCR element.
type OcrRotation struct {
	// Rotation angle in degrees (0, 90, 180, 270 for PaddleOCR).
	AngleDegrees float64 `json:"angle_degrees"`
	// Confidence score for the rotation detection; nil when not reported.
	Confidence *float64 `json:"confidence,omitempty"`
}

// OcrElement is the unified OCR element: detected text with full metadata.
//
// This is the primary type for structured OCR output, preserving all
// information from both the Tesseract and PaddleOCR backends. JSON decoding
// goes through the type's UnmarshalJSON method, which decodes the geometry
// with UnmarshalOcrBoundingGeometry.
type OcrElement struct {
	// The recognized text content.
	Text string `json:"text"`
	// Bounding geometry (rectangle or quadrilateral).
	Geometry OcrBoundingGeometry `json:"geometry"`
	// Confidence scores for detection and recognition.
	Confidence OcrConfidence `json:"confidence"`
	// Hierarchical level (word, line, block, page).
	Level OcrElementLevel `json:"level,omitempty"`
	// Rotation information; nil when no rotation was detected.
	Rotation *OcrRotation `json:"rotation,omitempty"`
	// Page number (1-indexed).
	PageNumber uint32 `json:"page_number"`
	// Parent element ID for hierarchical relationships.
	//
	// Only used for Tesseract output, which has a word -> line -> block hierarchy.
	ParentID *string `json:"parent_id,omitempty"`
	// Backend-specific metadata that doesn't fit the unified schema. Values
	// are kept as raw JSON for the caller to decode.
	BackendMetadata map[string]json.RawMessage `json:"backend_metadata,omitempty"`
}

// UnmarshalJSON decodes an OcrElement from its JSON object form.
//
// All fields except "geometry" are decoded directly. The "geometry" value is
// first captured as raw JSON and then decoded by UnmarshalOcrBoundingGeometry.
// A missing or null "geometry" leaves Geometry at its zero value.
//
// The receiver is written only after the whole payload, geometry included,
// has decoded successfully. On error s is left untouched. On success every
// field reflects data alone, so a reused receiver never keeps a geometry from
// an earlier decode.
func (s *OcrElement) UnmarshalJSON(data []byte) error {
	var raw struct {
		Text            string                     `json:"text"`
		Geometry        json.RawMessage            `json:"geometry,omitempty"`
		Confidence      OcrConfidence              `json:"confidence"`
		Level           OcrElementLevel            `json:"level,omitempty"`
		Rotation        *OcrRotation               `json:"rotation,omitempty"`
		PageNumber      uint32                     `json:"page_number"`
		ParentID        *string                    `json:"parent_id,omitempty"`
		BackendMetadata map[string]json.RawMessage `json:"backend_metadata,omitempty"`
	}
	if err := json.Unmarshal(data, &raw); err != nil {
		return err
	}

	// Build the result in a local value so a geometry failure cannot leave the
	// receiver half-populated.
	decoded := OcrElement{
		Text:            raw.Text,
		Confidence:      raw.Confidence,
		Level:           raw.Level,
		Rotation:        raw.Rotation,
		PageNumber:      raw.PageNumber,
		ParentID:        raw.ParentID,
		BackendMetadata: raw.BackendMetadata,
	}
	if len(raw.Geometry) > 0 && string(raw.Geometry) != "null" {
		geometry, err := UnmarshalOcrBoundingGeometry(raw.Geometry)
		if err != nil {
			return err
		}
		decoded.Geometry = geometry
	}

	*s = decoded
	return nil
}

// OcrElementConfig is the configuration for OCR element extraction.
//
// It controls how OCR elements are extracted and filtered.
type OcrElementConfig struct {
	// Whether to include OCR elements in the extraction result.
	//
	// When true, the `ocr_elements` field in `ExtractionResult` will be populated.
	IncludeElements bool `json:"include_elements"`
	// Minimum hierarchical level to include.
	//
	// Elements below this level (e.g., words when min_level is Line) will be excluded.
	MinLevel OcrElementLevel `json:"min_level,omitempty"`
	// Minimum recognition confidence threshold (0.0-1.0).
	//
	// Elements with confidence below this threshold will be filtered out.
	MinConfidence float64 `json:"min_confidence"`
	// Whether to build hierarchical relationships between elements.
	//
	// When true, `parent_id` fields will be populated based on spatial containment.
	// Only meaningful for Tesseract output.
	BuildHierarchy bool `json:"build_hierarchy"`
}

// PageStructure is the unified page structure for documents.
//
// It supports different page types (PDF pages, PPTX slides, Excel sheets)
// and carries offset boundaries for chunk-to-page mapping.
type PageStructure struct {
	// Total number of pages/slides/sheets.
	TotalCount uint32 `json:"total_count"`
	// Type of paginated unit.
	UnitType PageUnitType `json:"unit_type"`
	// Offset boundaries for each page.
	//
	// Maps ranges in the extracted content to page numbers; used for chunk
	// page range calculation. Note that PageBoundary expresses these ranges
	// as byte offsets (ByteStart/ByteEnd), not character counts.
	Boundaries []PageBoundary `json:"boundaries,omitempty"`
	// Detailed per-page metadata (optional, only when needed).
	Pages []PageInfo `json:"pages,omitempty"`
}

// PageBoundary is the byte offset boundary for a page.
//
// It tracks where a specific page's content starts and ends in the main
// content string, enabling mapping from byte positions to page numbers.
// According to the upstream documentation, offsets fall on valid UTF-8
// character boundaries. The range is half-open: [ByteStart, ByteEnd).
type PageBoundary struct {
	// Byte offset where this page starts in the content string (UTF-8 valid boundary, inclusive).
	ByteStart uint `json:"byte_start"`
	// Byte offset where this page ends in the content string (UTF-8 valid boundary, exclusive).
	ByteEnd uint `json:"byte_end"`
	// Page number (1-indexed).
	PageNumber uint32 `json:"page_number"`
}

// PageInfo holds metadata for an individual page/slide/sheet.
//
// It captures per-page information including dimensions, content counts,
// and visibility state (for presentations).
type PageInfo struct {
	// Page number (1-indexed).
	Number uint32 `json:"number"`
	// Page title (usually for presentations); nil when absent.
	Title *string `json:"title,omitempty"`
	// Dimensions in points (PDF) or pixels (images), as a (width, height) pair.
	Dimensions []float64 `json:"dimensions,omitempty"`
	// Number of images on this page; nil when not reported.
	ImageCount *uint32 `json:"image_count,omitempty"`
	// Number of tables on this page; nil when not reported.
	TableCount *uint32 `json:"table_count,omitempty"`
	// Whether this page is hidden (e.g., in presentations); nil when not reported.
	Hidden *bool `json:"hidden,omitempty"`
	// Whether this page is blank (no meaningful text, no images, no tables).
	//
	// A page is considered blank if it has fewer than 3 non-whitespace characters
	// and contains no tables or images. This is useful for filtering out empty pages
	// in scanned documents or PDFs with blank separator pages.
	IsBlank *bool `json:"is_blank,omitempty"`
	// Whether this page contains non-trivial vector graphics (paths, shapes, curves).
	//
	// Indicates the presence of vector-drawn content such as charts, diagrams,
	// or geometric shapes (e.g., from Adobe InDesign, LaTeX TikZ). These are
	// invisible to `ExtractionResult.images` since they are not embedded as raster
	// XObjects. Set to `true` when path count exceeds a heuristic threshold,
	// signaling that downstream consumers may want to rasterize the page to
	// capture this content.
	//
	// Upstream documents this as only populated for PDFs and unset for other
	// document types.
	//
	// NOTE(review): unlike Hidden and IsBlank this field is a plain bool, so in
	// Go an absent or null "has_vector_graphics" cannot be told apart from
	// false, and the key is always emitted when encoding. Confirm whether the
	// generator should emit *bool here.
	HasVectorGraphics bool `json:"has_vector_graphics"`
}

// PageContent holds the content for a single page/slide.
//
// When page extraction is enabled, documents are split into per-page content
// with associated tables and images mapped to each page.
//
// # Performance
//
// The following notes come from the upstream documentation and describe the
// native representation rather than this Go struct, where Tables is a plain
// []Table:
// - `Vec<Arc<Table>>` enables zero-copy sharing of table data
// - `Vec<Arc<ExtractedImage>>` enables zero-copy sharing of image data
// - Maintains exact JSON compatibility via custom Serialize/Deserialize
//
// This reduces memory overhead for documents with shared tables/images
// by avoiding redundant copies during serialization.
type PageContent struct {
	// Page number (1-indexed).
	PageNumber uint32 `json:"page_number"`
	// Text content for this page.
	Content string `json:"content"`
	// Tables found on this page.
	//
	// Upstream shares these via Arc in memory and serializes them as a plain
	// list of tables, which is the form decoded here.
	Tables []Table `json:"tables,omitempty"`
	// Indices into `ExtractionResult.images` for images found on this page.
	//
	// Each value is a zero-based index into the top-level `images` collection.
	// Only populated when `extract_images = true` in the extraction config.
	ImageIndices []uint32 `json:"image_indices,omitempty"`
	// Hierarchy information for the page (when hierarchy extraction is enabled).
	//
	// Contains text hierarchy levels (H1-H6) extracted from the page content.
	Hierarchy *PageHierarchy `json:"hierarchy,omitempty"`
	// Whether this page is blank (no meaningful text content).
	//
	// Determined during extraction based on text content analysis.
	// A page is blank if it has fewer than 3 non-whitespace characters
	// and contains no tables or images.
	IsBlank *bool `json:"is_blank,omitempty"`
	// Layout detection regions for this page (when layout detection is enabled).
	//
	// Contains detected layout regions with class, confidence, bounding box,
	// and area fraction. Only populated when layout detection is configured.
	LayoutRegions []LayoutRegion `json:"layout_regions,omitempty"`
	// Speaker notes for this slide (PPTX only).
	//
	// Contains the text from the slide's notes pane (`ppt/notesSlides/notesSlide{N}.xml`).
	// Only populated when the source is a PPTX file and notes are present.
	SpeakerNotes *string `json:"speaker_notes,omitempty"`
	// Section name this slide belongs to (PPTX only).
	//
	// PowerPoint sections group slides into logical chapters (`<p:sectionLst>` in
	// `ppt/presentation.xml`). Only populated when the source is a PPTX file and
	// the slide belongs to a named section.
	SectionName *string `json:"section_name,omitempty"`
	// Sheet name for this page (XLSX/ODS only).
	//
	// Each spreadsheet sheet maps to one `PageContent` entry. This field carries the
	// sheet's display name as it appears in the workbook. It is nil for all
	// non-spreadsheet formats and for sheets with an empty name.
	SheetName *string `json:"sheet_name,omitempty"`
}

// LayoutRegion is a detected layout region on a page.
//
// When layout detection is enabled, each page may have layout regions
// identifying different content types (text, pictures, tables, etc.)
// with confidence scores and spatial positions.
type LayoutRegion struct {
	// Layout class name (e.g. "picture", "table", "text", "section_header").
	ClassName string `json:"class_name"`
	// Confidence score from the layout detection model (0.0 to 1.0).
	Confidence float64 `json:"confidence"`
	// Bounding box in document coordinate space.
	BoundingBox BoundingBox `json:"bounding_box"`
	// Fraction of the page area covered by this region (0.0 to 1.0).
	AreaFraction float64 `json:"area_fraction"`
}

// PageHierarchy is the page hierarchy structure: heading levels and block
// information.
//
// It is used when PDF text hierarchy extraction is enabled and contains
// hierarchical blocks with heading levels (H1-H6) for semantic document
// structure.
type PageHierarchy struct {
	// Number of hierarchy blocks on this page.
	BlockCount uint32 `json:"block_count"`
	// Hierarchical blocks with heading levels.
	Blocks []HierarchicalBlock `json:"blocks,omitempty"`
}

// HierarchicalBlock is a text block with a hierarchy level assignment.
//
// It represents a block of text with semantic heading information extracted
// from font size clustering and hierarchical analysis.
type HierarchicalBlock struct {
	// The text content of this block.
	Text string `json:"text"`
	// The font size of the text in this block.
	FontSize float32 `json:"font_size"`
	// The hierarchy level of this block (H1-H6 or Body).
	//
	// Levels correspond to HTML heading tags:
	// - "h1": Top-level heading
	// - "h2": Secondary heading
	// - "h3": Tertiary heading
	// - "h4": Quaternary heading
	// - "h5": Quinary heading
	// - "h6": Senary heading
	// - "body": Body text (no heading level)
	Level string `json:"level"`
	// Bounding box information for the block.
	//
	// Contains coordinates as (left, top, right, bottom) in PDF units.
	Bbox []float32 `json:"bbox,omitempty"`
}

// CellChange is a single changed cell within a table.
//
// Upstream note: the type is defined alongside the revision types (rather
// than only in `crate::diff`) so `RevisionDelta` can reference it
// unconditionally, without requiring the `diff` Cargo feature; `crate::diff`
// re-exports it verbatim. In this package it is used by both RevisionDelta
// and TableDiff.
type CellChange struct {
	// Zero-based row index.
	Row uint `json:"row"`
	// Zero-based column index.
	Col uint `json:"col"`
	// Value before the change.
	From string `json:"from"`
	// Value after the change.
	To string `json:"to"`
}

// DocumentRevision is a single tracked change embedded in a document.
//
// It is populated by per-format extractors that understand change-tracking
// metadata (DOCX `w:ins`/`w:del`/`w:rPrChange`, ODT `text:change-*`, …).
// According to the upstream documentation, extractors leave
// `ExtractionResult.revisions` unset until a format-specific implementation
// is added.
//
// JSON decoding goes through the type's UnmarshalJSON method, which decodes
// the anchor with UnmarshalRevisionAnchor.
type DocumentRevision struct {
	// Format-specific revision identifier.
	//
	// For DOCX this is the `w:id` attribute value on the change element
	// (e.g. `"42"`). When the attribute is absent a synthetic fallback is
	// generated (`"docx-ins-0"`, `"docx-del-3"`, …).
	RevisionID string `json:"revision_id"`
	// Display name of the author who made this change, when available.
	Author *string `json:"author,omitempty"`
	// ISO-8601 timestamp of the change, when available.
	//
	// Stored as a plain string; upstream keeps it that way so the type stays
	// FFI-friendly and does not depend on the optional `chrono` crate.
	// DOCX populates this from the `w:date` attribute (e.g.
	// `"2024-03-15T10:30:00Z"`).
	Timestamp *string `json:"timestamp,omitempty"`
	// Semantic kind of this revision.
	Kind RevisionKind `json:"kind"`
	// Best-effort document location for this revision.
	//
	// Resolution is format-dependent and the anchor may be absent when the
	// location cannot be determined (e.g. changes inside table cells before
	// table-cell anchor support is added).
	Anchor RevisionAnchor `json:"anchor,omitempty"`
	// The content changes that make up this revision.
	Delta RevisionDelta `json:"delta"`
}

// UnmarshalJSON decodes a DocumentRevision from its JSON object form.
//
// All fields except "anchor" are decoded directly. The "anchor" value is
// first captured as raw JSON and then decoded by UnmarshalRevisionAnchor.
// A missing or null "anchor" leaves Anchor at its zero value.
//
// The receiver is written only after the whole payload, anchor included, has
// decoded successfully. On error s is left untouched. On success every field
// reflects data alone, so a reused receiver never keeps an anchor from an
// earlier decode.
func (s *DocumentRevision) UnmarshalJSON(data []byte) error {
	var raw struct {
		RevisionID string          `json:"revision_id"`
		Author     *string         `json:"author,omitempty"`
		Timestamp  *string         `json:"timestamp,omitempty"`
		Kind       RevisionKind    `json:"kind"`
		Anchor     json.RawMessage `json:"anchor,omitempty"`
		Delta      RevisionDelta   `json:"delta"`
	}
	if err := json.Unmarshal(data, &raw); err != nil {
		return err
	}

	// Build the result in a local value so an anchor failure cannot leave the
	// receiver half-populated.
	decoded := DocumentRevision{
		RevisionID: raw.RevisionID,
		Author:     raw.Author,
		Timestamp:  raw.Timestamp,
		Kind:       raw.Kind,
		Delta:      raw.Delta,
	}
	if len(raw.Anchor) > 0 && string(raw.Anchor) != "null" {
		anchor, err := UnmarshalRevisionAnchor(raw.Anchor)
		if err != nil {
			return err
		}
		decoded.Anchor = anchor
	}

	*s = decoded
	return nil
}

// RevisionDelta holds the content changes that make up a single revision.
//
// For insertions and deletions the `content` field carries the added/removed
// lines as `DiffLine::Added` / `DiffLine::Removed` entries. For format
// changes, `content` is empty — upstream leaves the property diff as a TODO
// for a later enrichment pass.
type RevisionDelta struct {
	// Line-level content changes for this revision.
	Content []DiffLine `json:"content,omitempty"`
	// Cell-level table changes for this revision.
	TableChanges []CellChange `json:"table_changes,omitempty"`
}

// Table is an extracted table structure.
//
// It represents a table detected and extracted from a document (PDF, image,
// etc.). Tables are provided both as structured cell data and in Markdown
// form.
type Table struct {
	// Table cells as a 2D slice (rows × columns).
	Cells [][]string `json:"cells,omitempty"`
	// Markdown representation of the table.
	Markdown string `json:"markdown"`
	// Page number where the table was found (1-indexed).
	PageNumber uint32 `json:"page_number"`
	// Bounding box of the table on the page (PDF coordinates: x0=left, y0=bottom, x1=right, y1=top).
	// Only populated for PDF-extracted tables when position data is available; nil otherwise.
	BoundingBox *BoundingBox `json:"bounding_box,omitempty"`
}

// TableCell is an individual table cell with content and optional styling.
//
// Upstream describes it as a future extension point for rich table support
// with cell-level metadata. Note that Table.Cells holds plain strings, not
// TableCell values.
type TableCell struct {
	// Cell content as text.
	Content string `json:"content"`
	// Row span (number of rows this cell spans).
	RowSpan uint32 `json:"row_span"`
	// Column span (number of columns this cell spans).
	ColSpan uint32 `json:"col_span"`
	// Whether this is a header cell.
	IsHeader bool `json:"is_header"`
}

// ExtractedURI is a URI extracted from a document.
//
// It represents any link, reference, or resource pointer found during
// extraction. The Kind field classifies the URI semantically, while Label
// carries optional human-readable display text.
type ExtractedURI struct {
	// The URL or path string.
	URL string `json:"url"`
	// Optional display text / label for the link.
	Label *string `json:"label,omitempty"`
	// Optional page number where the URI was found (1-indexed).
	Page *uint32 `json:"page,omitempty"`
	// Semantic classification of the URI.
	Kind URIKind `json:"kind"`
}

// DetectResponse is the MIME type detection response.
type DetectResponse struct {
	// Detected MIME type.
	MimeType string `json:"mime_type"`
	// Original filename; nil when none was provided.
	Filename *string `json:"filename,omitempty"`
}

// DiffOptions controls how two `ExtractionResult` values are compared.
//
// Every field is a pointer and is omitted from JSON when nil. The defaults
// quoted below are the ones stated by the upstream documentation.
type DiffOptions struct {
	// Include metadata changes in the diff. Default: `true`.
	IncludeMetadata *bool `json:"include_metadata,omitempty"`
	// Include embedded-children changes in the diff. Default: `true`.
	IncludeEmbedded *bool `json:"include_embedded,omitempty"`
	// Truncate content to this many characters before diffing.
	//
	// Useful for very large documents where only the first N characters matter.
	// nil means no truncation.
	MaxContentChars *uint `json:"max_content_chars,omitempty"`
}

// ExtractionDiff is the complete diff between two `ExtractionResult` values,
// referred to below as `a` (old/left) and `b` (new/right).
type ExtractionDiff struct {
	// Unified-diff hunks for the `content` field.
	//
	// Empty when the content is identical.
	ContentDiff []DiffHunk `json:"content_diff,omitempty"`
	// Tables present in `b` but not in `a` (by index position, excess right-side tables).
	TablesAdded []Table `json:"tables_added,omitempty"`
	// Tables present in `a` but not in `b` (by index position, excess left-side tables).
	TablesRemoved []Table `json:"tables_removed,omitempty"`
	// Cell-level changes for table pairs that share the same index and dimensions.
	TablesChanged []TableDiff `json:"tables_changed,omitempty"`
	// Metadata difference, encoded as a JSON object with three top-level keys:
	// `added` (keys present in `b` but not `a`), `removed` (keys present in `a`
	// but not `b`), and `changed` (keys whose values differ — each entry is
	// `{ "from": <value-in-a>, "to": <value-in-b> }`).
	//
	// This is NOT RFC 6902 JSON Patch — upstream deliberately chose a flatter
	// shape to avoid pulling in a json-patch dependency. If you need RFC 6902
	// semantics (with JSON Pointer paths), feed `a.metadata` and `b.metadata`
	// to your preferred JSON Patch implementation directly.
	//
	// The value is kept as raw JSON; decode it on the caller's side.
	MetadataChanged json.RawMessage `json:"metadata_changed"`
	// Changes to embedded archive children.
	EmbeddedChanges EmbeddedChanges `json:"embedded_changes"`
}

// DiffHunk is a single contiguous hunk in a unified diff.
type DiffHunk struct {
	// Starting line number in the old content (0-indexed).
	FromLine uint `json:"from_line"`
	// Number of lines from the old content in this hunk.
	FromCount uint `json:"from_count"`
	// Starting line number in the new content (0-indexed).
	ToLine uint `json:"to_line"`
	// Number of lines from the new content in this hunk.
	ToCount uint `json:"to_count"`
	// Lines that make up this hunk.
	Lines []DiffLine `json:"lines,omitempty"`
}

// TableDiff holds the cell-level changes for a pair of tables that share the
// same index.
type TableDiff struct {
	// Zero-based index of the table in both `a.tables` and `b.tables`.
	FromIndex uint `json:"from_index"`
	// Zero-based index in `b.tables` (equal to `from_index` for same-dimension tables).
	ToIndex uint `json:"to_index"`
	// Cell-level changes within the table.
	CellChanges []CellChange `json:"cell_changes,omitempty"`
}

// EmbeddedChanges holds the changes to embedded archive children between two
// results, `a` and `b`. Children are matched by `path`.
type EmbeddedChanges struct {
	// Children present in `b` but not in `a` (matched by `path`).
	Added []ArchiveEntry `json:"added,omitempty"`
	// Children present in `a` but not in `b` (matched by `path`).
	Removed []ArchiveEntry `json:"removed,omitempty"`
	// Children present in both but with differing content (matched by `path`).
	//
	// Each entry holds the diff of the nested `ExtractionResult`.
	Changed []EmbeddedDiff `json:"changed,omitempty"`
}

// EmbeddedDiff is the diff for a single embedded archive entry that appears
// in both results.
type EmbeddedDiff struct {
	// Archive-relative path identifying this entry.
	Path string `json:"path"`
	// The recursive diff of the entry's extraction result. It may itself
	// contain further EmbeddedDiff values via ExtractionDiff.EmbeddedChanges.
	Diff ExtractionDiff `json:"diff"`
}

// EmbeddingPreset is a preset configuration for common RAG use cases.
//
// Each preset combines chunk size, overlap, and embedding model
// to provide an optimized configuration for specific scenarios.
//
// Upstream note: all string fields are owned strings for FFI compatibility,
// so instances are safe to clone and pass across language boundaries.
//
// Name, ChunkSize, Overlap, Dimensions and Description are undocumented
// upstream. NOTE(review): the units of ChunkSize/Overlap and the meaning of
// Dimensions (presumably the embedding vector length) are not visible here —
// confirm against the core library.
type EmbeddingPreset struct {
	Name      string `json:"name"`
	ChunkSize uint   `json:"chunk_size"`
	Overlap   uint   `json:"overlap"`
	// HuggingFace repository name for the model.
	ModelRepo string `json:"model_repo"`
	// Pooling strategy: "cls" or "mean".
	Pooling string `json:"pooling"`
	// Path to the ONNX model file within the repo.
	ModelFile   string `json:"model_file"`
	Dimensions  uint   `json:"dimensions"`
	Description string `json:"description"`
}

// YakeParams holds the YAKE-specific parameters.
type YakeParams struct {
	// Window size for co-occurrence analysis (default: 2).
	//
	// Controls the context window for computing co-occurrence statistics.
	// Omitted from JSON when nil.
	WindowSize *uint `json:"window_size,omitempty"`
}

// RakeParams holds RAKE-specific parameters for keyword extraction.
//
// Both fields are optional; a nil pointer omits the field from the JSON
// encoding.
type RakeParams struct {
	// MinWordLength is the minimum word length to consider (default: 1).
	MinWordLength *uint `json:"min_word_length,omitempty"`
	// MaxWordsPerPhrase is the maximum number of words in a keyword phrase
	// (default: 3).
	MaxWordsPerPhrase *uint `json:"max_words_per_phrase,omitempty"`
}

// KeywordConfig is the keyword extraction configuration.
//
// All fields except MinScore are tagged omitempty; MinScore is always
// serialized, including its zero value.
type KeywordConfig struct {
	// Algorithm is the algorithm to use for extraction.
	Algorithm KeywordAlgorithm `json:"algorithm,omitempty"`
	// MaxKeywords is the maximum number of keywords to extract (default: 10).
	MaxKeywords *uint `json:"max_keywords,omitempty"`
	// MinScore is the minimum score threshold (0.0-1.0, default: 0.0).
	//
	// Keywords with scores below this threshold are filtered out.
	// Note: Score ranges differ between algorithms.
	MinScore float32 `json:"min_score"`
	// NgramRange is the n-gram range for keyword extraction, given as
	// (min, max).
	//
	// (1, 1) = unigrams only
	// (1, 2) = unigrams and bigrams
	// (1, 3) = unigrams, bigrams, and trigrams (default)
	NgramRange []uint `json:"ngram_range,omitempty"`
	// Language is the language code for stopword filtering
	// (e.g., "en", "de", "fr").
	//
	// If nil, no stopword filtering is applied.
	Language *string `json:"language,omitempty"`
	// YakeParams holds YAKE-specific tuning parameters.
	YakeParams *YakeParams `json:"yake_params,omitempty"`
	// RakeParams holds RAKE-specific tuning parameters.
	RakeParams *RakeParams `json:"rake_params,omitempty"`
}

// Keyword is an extracted keyword together with its metadata.
type Keyword struct {
	// Text is the keyword text.
	Text string `json:"text"`
	// Score is the relevance score (higher is better, algorithm-specific range).
	Score float32 `json:"score"`
	// Algorithm is the algorithm that extracted this keyword.
	Algorithm KeywordAlgorithm `json:"algorithm"`
	// Positions optionally lists where the keyword appears in the text
	// (character offsets). An empty slice is omitted from the JSON encoding.
	Positions []uint `json:"positions,omitempty"`
}

// PaddleOcrConfig is the configuration for the PaddleOCR backend.
//
// Configures PaddleOCR text detection and recognition with multi-language support.
//
// Only CacheDir is tagged omitempty; every other field is always serialized,
// so a zero-valued Go struct sends 0/false/"" rather than the defaults listed
// on the fields below.
// NOTE(review): confirm how the native side treats those zero values.
//
// The example below is written against the underlying Rust API (which uses a
// builder pattern); it is not Go code:
//
//	// Create with default English configuration
//	let config = PaddleOcrConfig::new("en");
//
//	// Create with custom cache directory
//	let config = PaddleOcrConfig::new("ch")
//	    .with_cache_dir("/path/to/cache".into());
//
//	// Enable table detection
//	let config = PaddleOcrConfig::new("en")
//	    .with_table_detection(true);
type PaddleOcrConfig struct {
	// Language is the language code (e.g., "en", "ch", "jpn", "kor", "deu", "fra").
	Language string `json:"language"`
	// CacheDir is an optional custom cache directory for model files.
	CacheDir *string `json:"cache_dir,omitempty"`
	// UseAngleCls enables angle classification for rotated text (default: false).
	// Can misfire on short text regions, rotating crops incorrectly before recognition.
	UseAngleCls bool `json:"use_angle_cls"`
	// EnableTableDetection enables table structure detection (default: false).
	EnableTableDetection bool `json:"enable_table_detection"`
	// DetDbThresh is the database threshold for text detection (default: 0.3).
	// Range: 0.0-1.0, higher values require more confident detections.
	DetDbThresh float32 `json:"det_db_thresh"`
	// DetDbBoxThresh is the box threshold for text bounding box refinement
	// (default: 0.5).
	// Range: 0.0-1.0
	DetDbBoxThresh float32 `json:"det_db_box_thresh"`
	// DetDbUnclipRatio is the unclip ratio for expanding text bounding boxes
	// (default: 1.6).
	// Controls the expansion of detected text regions.
	DetDbUnclipRatio float32 `json:"det_db_unclip_ratio"`
	// DetLimitSideLen is the maximum side length for the detection image
	// (default: 960).
	// Larger images may be resized to this limit for faster inference.
	DetLimitSideLen uint32 `json:"det_limit_side_len"`
	// RecBatchNum is the batch size for recognition inference (default: 6).
	// Number of text regions to process simultaneously.
	RecBatchNum uint32 `json:"rec_batch_num"`
	// Padding is the padding in pixels added around the image before
	// detection (default: 10).
	// Large values can include surrounding content like table gridlines.
	Padding uint32 `json:"padding"`
	// DropScore is the minimum recognition confidence score for text lines
	// (default: 0.5).
	// Text regions with recognition confidence below this threshold are discarded.
	// Matches PaddleOCR Python's `drop_score` parameter.
	// Range: 0.0-1.0
	DropScore float32 `json:"drop_score"`
	// ModelTier controls the detection/recognition model size and accuracy trade-off.
	// - `"mobile"` (default): Lightweight models (~4.5MB detection, ~16.5MB recognition), fast download and inference
	// - `"server"`: Large, high-accuracy models (~88MB detection, ~84MB recognition), best for GPU or complex documents
	ModelTier string `json:"model_tier"`
}

// ModelPaths bundles the paths to all models needed for OCR. The upstream
// documentation notes that it is kept for backward compatibility.
type ModelPaths struct {
	// DetModel is the path to the detection model directory.
	DetModel string `json:"det_model"`
	// ClsModel is the path to the classification model directory.
	ClsModel string `json:"cls_model"`
	// RecModel is the path to the recognition model directory.
	RecModel string `json:"rec_model"`
	// DictFile is the path to the character dictionary file.
	DictFile string `json:"dict_file"`
}

// OrientationResult is the result of document orientation detection.
type OrientationResult struct {
	// Degrees is the detected orientation in degrees (0, 90, 180, or 270).
	Degrees uint32 `json:"degrees"`
	// Confidence is the confidence score (0.0-1.0).
	Confidence float32 `json:"confidence"`
}

// BBox is a bounding box in original image coordinates: (X1, Y1) is the
// top-left corner and (X2, Y2) is the bottom-right corner.
type BBox struct {
	X1 float32 `json:"x1"`
	Y1 float32 `json:"y1"`
	X2 float32 `json:"x2"`
	Y2 float32 `json:"y2"`
}

// LayoutDetection is a single layout detection result: the detected class,
// its confidence, and the bounding box of the detected region.
type LayoutDetection struct {
	ClassName  LayoutClass `json:"class_name"`
	Confidence float32     `json:"confidence"`
	Bbox       BBox        `json:"bbox"`
}

// RecognizedTable holds the pre-computed table markdown for a table detection
// region.
//
// Upstream note: it is produced by the TATR-based table structure recognizer
// and surfaced as part of layout-aware OCR results. In the Rust library the
// struct lives under `layout-types` (pure Rust) so that consumers who do not
// enable `layout-detection` (ORT) can still reference the type in their own
// code.
type RecognizedTable struct {
	// DetectionBbox is the detection bbox that this table corresponds to
	// (for matching).
	DetectionBbox BBox `json:"detection_bbox"`
	// Cells holds the table cells as a 2D slice (rows × columns). An empty
	// slice is omitted from the JSON encoding.
	Cells [][]string `json:"cells,omitempty"`
	// Markdown is the rendered markdown table.
	Markdown string `json:"markdown"`
}

// DetectionResult is a page-level detection result containing all detections
// and the page metadata (width and height; units are not stated here).
//
// An empty Detections slice is omitted from the JSON encoding.
type DetectionResult struct {
	PageWidth  uint32            `json:"page_width"`
	PageHeight uint32            `json:"page_height"`
	Detections []LayoutDetection `json:"detections,omitempty"`
}

// EmbeddedFile describes a file embedded in a PDF, as extracted from the PDF
// name tree.
type EmbeddedFile struct {
	// Name is the filename as stored in the PDF name tree.
	Name string `json:"name"`
	// Data holds the raw file bytes from the embedded stream (already
	// decompressed by lopdf).
	Data []byte `json:"data"`
	// CompressedSize is the compressed byte count of the original stream
	// (before decompression).
	//
	// Callers use it to compute the decompression ratio and detect
	// zip-bomb-style attacks that embed a tiny compressed stream expanding to
	// gigabytes of data.
	CompressedSize uint `json:"compressed_size"`
	// MimeType is the MIME type if specified in the filespec, otherwise nil.
	MimeType *string `json:"mime_type,omitempty"`
}

// MarshalJSON encodes the value with Data written as a JSON array of integers
// (the shape Rust's serde `Vec<u8>` deserializer expects) rather than the
// base64 string encoding/json produces for `[]byte` by default.
func (v EmbeddedFile) MarshalJSON() ([]byte, error) {
	// One integer per byte. The slice is non-nil even for empty input, so
	// empty data is encoded as [] rather than null.
	octets := make([]int, 0, len(v.Data))
	for _, b := range v.Data {
		octets = append(octets, int(b))
	}

	// The wire type spells out every field instead of embedding EmbeddedFile:
	// embedding would yield both the base64 string and the integer array for
	// the same JSON key. All non-byte fields are copied unchanged.
	type wire struct {
		Name           string  `json:"name"`
		Data           []int   `json:"data"`
		CompressedSize uint    `json:"compressed_size"`
		MimeType       *string `json:"mime_type,omitempty"`
	}
	return json.Marshal(wire{
		Name:           v.Name,
		Data:           octets,
		CompressedSize: v.CompressedSize,
		MimeType:       v.MimeType,
	})
}

// PdfMetadata holds PDF-specific metadata.
//
// Contains metadata fields specific to PDF documents that are not in the common
// `Metadata` structure. Common fields like title, authors, keywords, and dates
// are at the `Metadata` level.
//
// Every field is an optional pointer tagged omitempty; a nil pointer omits
// the field from the JSON encoding.
type PdfMetadata struct {
	// PdfVersion is the PDF version (e.g., "1.7", "2.0").
	PdfVersion *string `json:"pdf_version,omitempty"`
	// Producer is the PDF producer (application that created the PDF).
	Producer *string `json:"producer,omitempty"`
	// IsEncrypted reports whether the PDF is encrypted/password-protected.
	IsEncrypted *bool `json:"is_encrypted,omitempty"`
	// Width is the first page width in points (1/72 inch).
	Width *int64 `json:"width,omitempty"`
	// Height is the first page height in points (1/72 inch).
	Height *int64 `json:"height,omitempty"`
	// PageCount is the total number of pages in the PDF document.
	PageCount *uint32 `json:"page_count,omitempty"`
}

// ExtractBytes extracts content from an in-memory document.
//
// According to the upstream documentation, the native extraction validates the
// MIME type, converts legacy formats when needed, selects an extractor from
// the registry, extracts the content and runs the post-processing pipeline.
//
// Parameters:
//   - content: the document bytes. An empty or nil slice is passed to the FFI
//     as a nil pointer with length 0.
//   - mimeType: MIME type of the content.
//   - config: extraction configuration; it is marshaled to JSON and turned
//     into a native config handle for the duration of the call.
//
// Returns the decoded ExtractionResult. An error is returned when the config
// cannot be marshaled or is rejected by the FFI, when the native call reports
// an error (upstream documents a validation error for an invalid MIME type and
// an unsupported-format error for an unsupported one), when the native call
// yields no result, or when the result JSON cannot be produced or decoded.
//
// NOTE(review): lastError is read in a separate cgo call. If the FFI stores
// its last error per OS thread, confirm whether runtime.LockOSThread is needed
// around the call and the error check.
func ExtractBytes(content []byte, mimeType string, config ExtractionConfig) (*ExtractionResult, error) {
	var cContent *C.uint8_t
	if len(content) > 0 {
		// Pin the first element so the pointer handed to C stays valid until
		// this function returns.
		var cContentPinner runtime.Pinner
		cContentPinner.Pin(&content[0])
		defer cContentPinner.Unpin()
		cContent = (*C.uint8_t)(unsafe.Pointer(&content[0]))
	}
	cContentLen := C.uintptr_t(len(content))

	cMimeType := C.CString(mimeType)
	defer C.free(unsafe.Pointer(cMimeType))

	jsonBytescConfig, err := json.Marshal(config)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal: %w", err)
	}
	// The FFI's _from_json constructor rejects a bare "null" document, so
	// substitute "{}" to have a default instance constructed instead.
	if string(jsonBytescConfig) == "null" {
		jsonBytescConfig = []byte("{}")
	}
	tmpStrcConfig := C.CString(string(jsonBytescConfig))
	cConfig := C.kreuzberg_extraction_config_from_json(tmpStrcConfig)
	C.free(unsafe.Pointer(tmpStrcConfig))
	if cConfig == nil {
		return nil, fmt.Errorf("failed to create extraction_config: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_extraction_config_free(cConfig)

	ptr := C.kreuzberg_extract_bytes(cContent, cContentLen, cMimeType, cConfig)
	if err := lastError(); err != nil {
		// Release any result handle so it does not leak on the error path.
		if ptr != nil {
			C.kreuzberg_extraction_result_free(ptr)
		}
		return nil, err
	}
	// A nil handle without a reported error must not be handed to the
	// to_json and free calls below.
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_extraction_result_free(ptr)
	jsonPtr := C.kreuzberg_extraction_result_to_json(ptr)
	if jsonPtr == nil {
		return nil, fmt.Errorf("failed to convert to JSON")
	}
	defer C.kreuzberg_free_string(jsonPtr)
	var result ExtractionResult
	if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return &result, nil
}

// ExtractFile extracts content from a file on disk.
//
// According to the upstream documentation, the native extraction checks the
// cache for an existing result (if caching is enabled), detects or validates
// the MIME type, selects an extractor from the registry, extracts the content,
// runs the post-processing pipeline and stores the result in the cache (if
// caching is enabled).
//
// Parameters:
//   - path: path to the file to extract.
//   - mimeType: optional MIME type override. When nil, a nil pointer is passed
//     to the FFI; upstream documents that the type is then auto-detected.
//   - config: extraction configuration; it is marshaled to JSON and turned
//     into a native config handle for the duration of the call.
//
// Returns the decoded ExtractionResult. An error is returned when the config
// cannot be marshaled or is rejected by the FFI, when the native call reports
// an error (upstream documents an I/O error when the file does not exist or
// cannot be read, and an unsupported-format error for an unsupported MIME
// type), when the native call yields no result, or when the result JSON cannot
// be produced or decoded.
//
// NOTE(review): lastError is read in a separate cgo call. If the FFI stores
// its last error per OS thread, confirm whether runtime.LockOSThread is needed
// around the call and the error check.
func ExtractFile(path string, mimeType *string, config ExtractionConfig) (*ExtractionResult, error) {
	cPath := C.CString(path)
	defer C.free(unsafe.Pointer(cPath))

	// A nil mimeType stays a nil C pointer.
	var cMimeType *C.char
	if mimeType != nil {
		cMimeType = C.CString(*mimeType)
		defer C.free(unsafe.Pointer(cMimeType))
	}

	jsonBytescConfig, err := json.Marshal(config)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal: %w", err)
	}
	// The FFI's _from_json constructor rejects a bare "null" document, so
	// substitute "{}" to have a default instance constructed instead.
	if string(jsonBytescConfig) == "null" {
		jsonBytescConfig = []byte("{}")
	}
	tmpStrcConfig := C.CString(string(jsonBytescConfig))
	cConfig := C.kreuzberg_extraction_config_from_json(tmpStrcConfig)
	C.free(unsafe.Pointer(tmpStrcConfig))
	if cConfig == nil {
		return nil, fmt.Errorf("failed to create extraction_config: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_extraction_config_free(cConfig)

	ptr := C.kreuzberg_extract_file(cPath, cMimeType, cConfig)
	if err := lastError(); err != nil {
		// Release any result handle so it does not leak on the error path.
		if ptr != nil {
			C.kreuzberg_extraction_result_free(ptr)
		}
		return nil, err
	}
	// A nil handle without a reported error must not be handed to the
	// to_json and free calls below.
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_extraction_result_free(ptr)
	jsonPtr := C.kreuzberg_extraction_result_to_json(ptr)
	if jsonPtr == nil {
		return nil, fmt.Errorf("failed to convert to JSON")
	}
	defer C.kreuzberg_free_string(jsonPtr)
	var result ExtractionResult
	if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return &result, nil
}

// ExtractFileSync extracts content from a file on disk through the native
// synchronous wrapper for `extract_file`.
//
// From Go, this call blocks until the native function returns. Upstream
// documents that the native side runs on a global Tokio runtime rather than
// creating a runtime per call, and that the function is only available when
// the library is built with its `tokio-runtime` feature.
//
// Parameters:
//   - path: path to the file to extract.
//   - mimeType: optional MIME type override; when nil, a nil pointer is passed
//     to the FFI.
//   - config: extraction configuration; it is marshaled to JSON and turned
//     into a native config handle for the duration of the call.
//
// Returns the decoded ExtractionResult. An error is returned when the config
// cannot be marshaled or is rejected by the FFI, when the native call reports
// an error or yields no result, or when the result JSON cannot be produced or
// decoded.
//
// NOTE(review): lastError is read in a separate cgo call. If the FFI stores
// its last error per OS thread, confirm whether runtime.LockOSThread is needed
// around the call and the error check.
func ExtractFileSync(path string, mimeType *string, config ExtractionConfig) (*ExtractionResult, error) {
	cPath := C.CString(path)
	defer C.free(unsafe.Pointer(cPath))

	// A nil mimeType stays a nil C pointer.
	var cMimeType *C.char
	if mimeType != nil {
		cMimeType = C.CString(*mimeType)
		defer C.free(unsafe.Pointer(cMimeType))
	}

	jsonBytescConfig, err := json.Marshal(config)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal: %w", err)
	}
	// The FFI's _from_json constructor rejects a bare "null" document, so
	// substitute "{}" to have a default instance constructed instead.
	if string(jsonBytescConfig) == "null" {
		jsonBytescConfig = []byte("{}")
	}
	tmpStrcConfig := C.CString(string(jsonBytescConfig))
	cConfig := C.kreuzberg_extraction_config_from_json(tmpStrcConfig)
	C.free(unsafe.Pointer(tmpStrcConfig))
	if cConfig == nil {
		return nil, fmt.Errorf("failed to create extraction_config: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_extraction_config_free(cConfig)

	ptr := C.kreuzberg_extract_file_sync(cPath, cMimeType, cConfig)
	if err := lastError(); err != nil {
		// Release any result handle so it does not leak on the error path.
		if ptr != nil {
			C.kreuzberg_extraction_result_free(ptr)
		}
		return nil, err
	}
	// A nil handle without a reported error must not be handed to the
	// to_json and free calls below.
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_extraction_result_free(ptr)
	jsonPtr := C.kreuzberg_extraction_result_to_json(ptr)
	if jsonPtr == nil {
		return nil, fmt.Errorf("failed to convert to JSON")
	}
	defer C.kreuzberg_free_string(jsonPtr)
	var result ExtractionResult
	if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return &result, nil
}

// ExtractBytesSync extracts content from an in-memory document through the
// native synchronous wrapper for `extract_bytes`.
//
// From Go, this call blocks until the native function returns. Upstream
// documents that, with the `tokio-runtime` feature, the native side blocks on
// a global Tokio runtime rather than creating a runtime per call, and that
// without it (WASM) a truly synchronous implementation is used.
//
// Parameters:
//   - content: the document bytes. An empty or nil slice is passed to the FFI
//     as a nil pointer with length 0.
//   - mimeType: MIME type of the content.
//   - config: extraction configuration; it is marshaled to JSON and turned
//     into a native config handle for the duration of the call.
//
// Returns the decoded ExtractionResult. An error is returned when the config
// cannot be marshaled or is rejected by the FFI, when the native call reports
// an error or yields no result, or when the result JSON cannot be produced or
// decoded.
//
// NOTE(review): lastError is read in a separate cgo call. If the FFI stores
// its last error per OS thread, confirm whether runtime.LockOSThread is needed
// around the call and the error check.
func ExtractBytesSync(content []byte, mimeType string, config ExtractionConfig) (*ExtractionResult, error) {
	var cContent *C.uint8_t
	if len(content) > 0 {
		// Pin the first element so the pointer handed to C stays valid until
		// this function returns.
		var cContentPinner runtime.Pinner
		cContentPinner.Pin(&content[0])
		defer cContentPinner.Unpin()
		cContent = (*C.uint8_t)(unsafe.Pointer(&content[0]))
	}
	cContentLen := C.uintptr_t(len(content))

	cMimeType := C.CString(mimeType)
	defer C.free(unsafe.Pointer(cMimeType))

	jsonBytescConfig, err := json.Marshal(config)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal: %w", err)
	}
	// The FFI's _from_json constructor rejects a bare "null" document, so
	// substitute "{}" to have a default instance constructed instead.
	if string(jsonBytescConfig) == "null" {
		jsonBytescConfig = []byte("{}")
	}
	tmpStrcConfig := C.CString(string(jsonBytescConfig))
	cConfig := C.kreuzberg_extraction_config_from_json(tmpStrcConfig)
	C.free(unsafe.Pointer(tmpStrcConfig))
	if cConfig == nil {
		return nil, fmt.Errorf("failed to create extraction_config: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_extraction_config_free(cConfig)

	ptr := C.kreuzberg_extract_bytes_sync(cContent, cContentLen, cMimeType, cConfig)
	if err := lastError(); err != nil {
		// Release any result handle so it does not leak on the error path.
		if ptr != nil {
			C.kreuzberg_extraction_result_free(ptr)
		}
		return nil, err
	}
	// A nil handle without a reported error must not be handed to the
	// to_json and free calls below.
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_extraction_result_free(ptr)
	jsonPtr := C.kreuzberg_extraction_result_to_json(ptr)
	if jsonPtr == nil {
		return nil, fmt.Errorf("failed to convert to JSON")
	}
	defer C.kreuzberg_free_string(jsonPtr)
	var result ExtractionResult
	if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return &result, nil
}

// BatchExtractFilesSync extracts content from multiple files through the
// native synchronous wrapper for `batch_extract_files`.
//
// From Go, this call blocks until the native function returns. Upstream
// documents that the native side uses a global Tokio runtime and is only
// available with the `tokio-runtime` feature (WASM has no filesystem).
//
// Parameters:
//   - items: the files to extract; the slice is marshaled to JSON and passed
//     to the FFI as a string. Each item may carry per-file overrides.
//   - config: batch-level extraction configuration; it is marshaled to JSON
//     and turned into a native config handle for the duration of the call.
//
// Returns the results decoded from the JSON array produced by the FFI. An
// error is returned when items or config cannot be marshaled, when the config
// is rejected by the FFI, when the native call reports an error or yields no
// result, or when the result JSON cannot be decoded.
//
// NOTE(review): lastError is read in a separate cgo call. If the FFI stores
// its last error per OS thread, confirm whether runtime.LockOSThread is needed
// around the call and the error check.
func BatchExtractFilesSync(items []BatchFileItem, config ExtractionConfig) ([]ExtractionResult, error) {
	jsonBytescItems, err := json.Marshal(items)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal: %w", err)
	}
	cItems := C.CString(string(jsonBytescItems))
	defer C.free(unsafe.Pointer(cItems))

	jsonBytescConfig, err := json.Marshal(config)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal: %w", err)
	}
	// The FFI's _from_json constructor rejects a bare "null" document, so
	// substitute "{}" to have a default instance constructed instead.
	if string(jsonBytescConfig) == "null" {
		jsonBytescConfig = []byte("{}")
	}
	tmpStrcConfig := C.CString(string(jsonBytescConfig))
	cConfig := C.kreuzberg_extraction_config_from_json(tmpStrcConfig)
	C.free(unsafe.Pointer(tmpStrcConfig))
	if cConfig == nil {
		return nil, fmt.Errorf("failed to create extraction_config: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_extraction_config_free(cConfig)

	ptr := C.kreuzberg_batch_extract_files_sync(cItems, cConfig)
	if err := lastError(); err != nil {
		// Release the returned string, if any, so it does not leak when an
		// error is reported alongside a non-nil pointer.
		if ptr != nil {
			C.kreuzberg_free_string(ptr)
		}
		return nil, err
	}
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_free_string(ptr)
	var result []ExtractionResult
	if err := json.Unmarshal([]byte(C.GoString(ptr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return result, nil
}

// BatchExtractBytesSync extracts content from multiple in-memory documents
// through the native synchronous wrapper for `batch_extract_bytes`.
//
// From Go, this call blocks until the native function returns. Upstream
// documents that, with the `tokio-runtime` feature, the native side blocks on
// a global Tokio runtime, and that without it (WASM) a synchronous
// implementation iterates through the items and calls `extract_bytes_sync()`.
//
// Parameters:
//   - items: the documents to extract; the slice is marshaled to JSON and
//     passed to the FFI as a string. Each item may carry per-item overrides.
//   - config: batch-level extraction configuration; it is marshaled to JSON
//     and turned into a native config handle for the duration of the call.
//
// Returns the results decoded from the JSON array produced by the FFI. An
// error is returned when items or config cannot be marshaled, when the config
// is rejected by the FFI, when the native call reports an error or yields no
// result, or when the result JSON cannot be decoded.
//
// NOTE(review): lastError is read in a separate cgo call. If the FFI stores
// its last error per OS thread, confirm whether runtime.LockOSThread is needed
// around the call and the error check.
func BatchExtractBytesSync(items []BatchBytesItem, config ExtractionConfig) ([]ExtractionResult, error) {
	jsonBytescItems, err := json.Marshal(items)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal: %w", err)
	}
	cItems := C.CString(string(jsonBytescItems))
	defer C.free(unsafe.Pointer(cItems))

	jsonBytescConfig, err := json.Marshal(config)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal: %w", err)
	}
	// The FFI's _from_json constructor rejects a bare "null" document, so
	// substitute "{}" to have a default instance constructed instead.
	if string(jsonBytescConfig) == "null" {
		jsonBytescConfig = []byte("{}")
	}
	tmpStrcConfig := C.CString(string(jsonBytescConfig))
	cConfig := C.kreuzberg_extraction_config_from_json(tmpStrcConfig)
	C.free(unsafe.Pointer(tmpStrcConfig))
	if cConfig == nil {
		return nil, fmt.Errorf("failed to create extraction_config: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_extraction_config_free(cConfig)

	ptr := C.kreuzberg_batch_extract_bytes_sync(cItems, cConfig)
	if err := lastError(); err != nil {
		// Release the returned string, if any, so it does not leak when an
		// error is reported alongside a non-nil pointer.
		if ptr != nil {
			C.kreuzberg_free_string(ptr)
		}
		return nil, err
	}
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_free_string(ptr)
	var result []ExtractionResult
	if err := json.Unmarshal([]byte(C.GoString(ptr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return result, nil
}

// BatchExtractFiles extracts content from multiple files.
//
// Upstream documents that the native function processes the files
// concurrently and manages concurrency to prevent resource exhaustion; the
// limit is configured via the config's `max_concurrent_extractions` setting
// and otherwise defaults to `(num_cpus * 1.5).ceil()`.
//
// Each item can optionally carry a per-file configuration that overrides
// specific fields of the batch-level config; leave it unset to use the batch
// defaults. Upstream documents that batch-level settings such as
// `max_concurrent_extractions` and `use_cache` are always taken from the
// batch-level config.
//
// Parameters:
//   - items: the files to extract, each with a path and optional per-file
//     overrides; the slice is marshaled to JSON and passed to the FFI as a
//     string.
//   - config: batch-level extraction configuration (defaults and batch
//     settings); it is marshaled to JSON and turned into a native config
//     handle for the duration of the call.
//
// Returns the results decoded from the JSON array produced by the FFI;
// upstream documents that they are in the same order as the input items.
// Upstream also documents that individual file errors are captured in the
// result metadata, while system errors fail the entire batch. From this
// wrapper, an error is returned when items or config cannot be marshaled,
// when the config is rejected by the FFI, when the native call reports an
// error or yields no result, or when the result JSON cannot be decoded.
//
// NOTE(review): lastError is read in a separate cgo call. If the FFI stores
// its last error per OS thread, confirm whether runtime.LockOSThread is needed
// around the call and the error check.
func BatchExtractFiles(items []BatchFileItem, config ExtractionConfig) ([]ExtractionResult, error) {
	jsonBytescItems, err := json.Marshal(items)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal: %w", err)
	}
	cItems := C.CString(string(jsonBytescItems))
	defer C.free(unsafe.Pointer(cItems))

	jsonBytescConfig, err := json.Marshal(config)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal: %w", err)
	}
	// The FFI's _from_json constructor rejects a bare "null" document, so
	// substitute "{}" to have a default instance constructed instead.
	if string(jsonBytescConfig) == "null" {
		jsonBytescConfig = []byte("{}")
	}
	tmpStrcConfig := C.CString(string(jsonBytescConfig))
	cConfig := C.kreuzberg_extraction_config_from_json(tmpStrcConfig)
	C.free(unsafe.Pointer(tmpStrcConfig))
	if cConfig == nil {
		return nil, fmt.Errorf("failed to create extraction_config: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_extraction_config_free(cConfig)

	ptr := C.kreuzberg_batch_extract_files(cItems, cConfig)
	if err := lastError(); err != nil {
		// Release the returned string, if any, so it does not leak when an
		// error is reported alongside a non-nil pointer.
		if ptr != nil {
			C.kreuzberg_free_string(ptr)
		}
		return nil, err
	}
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_free_string(ptr)
	var result []ExtractionResult
	if err := json.Unmarshal([]byte(C.GoString(ptr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return result, nil
}

// BatchExtractBytes extracts content from multiple in-memory documents.
//
// Upstream documents that the native function processes the byte arrays
// concurrently and manages concurrency to prevent resource exhaustion; the
// limit is configured via the config's `max_concurrent_extractions` setting
// and otherwise defaults to `(num_cpus * 1.5).ceil()`.
//
// Each item can optionally carry a per-item configuration that overrides
// specific fields of the batch-level config; leave it unset to use the
// batch-level defaults for that item.
//
// Parameters:
//   - items: the documents to extract, each with content bytes, a MIME type
//     and optional per-item overrides; the slice is marshaled to JSON and
//     passed to the FFI as a string.
//   - config: batch-level extraction configuration; it is marshaled to JSON
//     and turned into a native config handle for the duration of the call.
//
// Returns the results decoded from the JSON array produced by the FFI;
// upstream documents that they are in the same order as the input items. An
// error is returned when items or config cannot be marshaled, when the config
// is rejected by the FFI, when the native call reports an error or yields no
// result, or when the result JSON cannot be decoded.
//
// NOTE(review): lastError is read in a separate cgo call. If the FFI stores
// its last error per OS thread, confirm whether runtime.LockOSThread is needed
// around the call and the error check.
func BatchExtractBytes(items []BatchBytesItem, config ExtractionConfig) ([]ExtractionResult, error) {
	jsonBytescItems, err := json.Marshal(items)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal: %w", err)
	}
	cItems := C.CString(string(jsonBytescItems))
	defer C.free(unsafe.Pointer(cItems))

	jsonBytescConfig, err := json.Marshal(config)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal: %w", err)
	}
	// The FFI's _from_json constructor rejects a bare "null" document, so
	// substitute "{}" to have a default instance constructed instead.
	if string(jsonBytescConfig) == "null" {
		jsonBytescConfig = []byte("{}")
	}
	tmpStrcConfig := C.CString(string(jsonBytescConfig))
	cConfig := C.kreuzberg_extraction_config_from_json(tmpStrcConfig)
	C.free(unsafe.Pointer(tmpStrcConfig))
	if cConfig == nil {
		return nil, fmt.Errorf("failed to create extraction_config: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_extraction_config_free(cConfig)

	ptr := C.kreuzberg_batch_extract_bytes(cItems, cConfig)
	if err := lastError(); err != nil {
		// Release the returned string, if any, so it does not leak when an
		// error is reported alongside a non-nil pointer.
		if ptr != nil {
			C.kreuzberg_free_string(ptr)
		}
		return nil, err
	}
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_free_string(ptr)
	var result []ExtractionResult
	if err := json.Unmarshal([]byte(C.GoString(ptr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return result, nil
}

// DetectMimeTypeFromBytes detects the MIME type of raw file bytes.
//
// Upstream documents that detection uses magic byte signatures, falls back to
// the `infer` crate for comprehensive detection, and inspects the contents of
// ZIP-based files to distinguish Office Open XML formats (DOCX, XLSX, PPTX)
// from plain ZIP archives.
//
// Parameters:
//   - content: raw file bytes. An empty or nil slice is passed to the FFI as
//     a nil pointer with length 0.
//
// Returns the detected MIME type string. An error is returned when the native
// call reports one (upstream documents an unsupported-format error when the
// MIME type cannot be determined) or when it yields no result.
//
// NOTE(review): lastError is read in a separate cgo call. If the FFI stores
// its last error per OS thread, confirm whether runtime.LockOSThread is needed
// around the call and the error check.
func DetectMimeTypeFromBytes(content []byte) (string, error) {
	var cContent *C.uint8_t
	if len(content) > 0 {
		// Pin the first element so the pointer handed to C stays valid until
		// this function returns.
		var cContentPinner runtime.Pinner
		cContentPinner.Pin(&content[0])
		defer cContentPinner.Unpin()
		cContent = (*C.uint8_t)(unsafe.Pointer(&content[0]))
	}
	cContentLen := C.uintptr_t(len(content))

	ptr := C.kreuzberg_detect_mime_type_from_bytes(cContent, cContentLen)
	if err := lastError(); err != nil {
		// Release the returned string, if any, so it does not leak on the
		// error path.
		if ptr != nil {
			C.kreuzberg_free_string(ptr)
		}
		return "", err
	}
	// Without this guard a nil result would be reported as an empty MIME type
	// with a nil error.
	if ptr == nil {
		return "", fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_free_string(ptr)
	return C.GoString(ptr), nil
}

// GetExtensionsForMime returns the file extensions for a given MIME type.
//
// Upstream documents that all known extensions mapping to the MIME type are
// returned, without a leading dot; for example "application/pdf" yields
// ["pdf"].
//
// Parameters:
//   - mimeType: the MIME type to look up.
//
// Returns the extensions decoded from the JSON array produced by the FFI. An
// error is returned when the native call reports an error or yields no
// result, or when the result JSON cannot be decoded.
//
// NOTE(review): lastError is read in a separate cgo call. If the FFI stores
// its last error per OS thread, confirm whether runtime.LockOSThread is needed
// around the call and the error check.
func GetExtensionsForMime(mimeType string) ([]string, error) {
	cMimeType := C.CString(mimeType)
	defer C.free(unsafe.Pointer(cMimeType))

	ptr := C.kreuzberg_get_extensions_for_mime(cMimeType)
	if err := lastError(); err != nil {
		// Release the returned string, if any, so it does not leak when an
		// error is reported alongside a non-nil pointer.
		if ptr != nil {
			C.kreuzberg_free_string(ptr)
		}
		return nil, err
	}
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_free_string(ptr)
	var result []string
	if err := json.Unmarshal([]byte(C.GoString(ptr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return result, nil
}

// ListEmbeddingBackends lists the names of all registered embedding backends.
//
// Used by `kreuzberg-cli`, the api/mcp endpoints, and generated language
// bindings.
//
// The FFI hands the names back as a JSON array, which is decoded into a
// []string. An error is returned when the FFI reports a failure, yields no
// result, or the JSON cannot be decoded.
func ListEmbeddingBackends() ([]string, error) {
	ptr := C.kreuzberg_list_embedding_backends()
	if ptr != nil {
		// Release the returned string on every path, including when an error
		// is reported alongside a non-nil pointer.
		defer C.kreuzberg_free_string(ptr)
	}
	if err := lastError(); err != nil {
		return nil, err
	}
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result")
	}
	var result []string
	if err := json.Unmarshal([]byte(C.GoString(ptr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return result, nil
}

// ListDocumentExtractors lists the names of all registered document extractors.
//
// The FFI hands the names back as a JSON array, which is decoded into a
// []string. An error is returned when the FFI reports a failure, yields no
// result, or the JSON cannot be decoded.
func ListDocumentExtractors() ([]string, error) {
	ptr := C.kreuzberg_list_document_extractors()
	if ptr != nil {
		// Release the returned string on every path, including when an error
		// is reported alongside a non-nil pointer.
		defer C.kreuzberg_free_string(ptr)
	}
	if err := lastError(); err != nil {
		return nil, err
	}
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result")
	}
	var result []string
	if err := json.Unmarshal([]byte(C.GoString(ptr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return result, nil
}

// ListOcrBackends lists all registered OCR backends.
//
// Returns the names of all OCR backends currently registered in the global
// registry.
//
// The FFI hands the names back as a JSON array, which is decoded into a
// []string. An error is returned when the FFI reports a failure, yields no
// result, or the JSON cannot be decoded.
//
// Example (upstream Rust API):
//
//	let backends = list_ocr_backends()?;
//	for name in backends {
//	    println!("Registered OCR backend: {}", name);
//	}
func ListOcrBackends() ([]string, error) {
	ptr := C.kreuzberg_list_ocr_backends()
	if ptr != nil {
		// Release the returned string on every path, including when an error
		// is reported alongside a non-nil pointer.
		defer C.kreuzberg_free_string(ptr)
	}
	if err := lastError(); err != nil {
		return nil, err
	}
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result")
	}
	var result []string
	if err := json.Unmarshal([]byte(C.GoString(ptr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return result, nil
}

// ListPostProcessors lists all registered post-processor names.
//
// Returns the names of all post-processors currently registered in the
// global registry.
//
// The FFI hands the names back as a JSON array, which is decoded into a
// []string. An error is returned when the FFI reports a failure (upstream:
// the registry lock is poisoned), yields no result, or the JSON cannot be
// decoded.
//
// Example (upstream Rust API):
//
//	let processors = list_post_processors()?;
//	for name in processors {
//	    println!("Registered post-processor: {}", name);
//	}
func ListPostProcessors() ([]string, error) {
	ptr := C.kreuzberg_list_post_processors()
	if ptr != nil {
		// Release the returned string on every path, including when an error
		// is reported alongside a non-nil pointer.
		defer C.kreuzberg_free_string(ptr)
	}
	if err := lastError(); err != nil {
		return nil, err
	}
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result")
	}
	var result []string
	if err := json.Unmarshal([]byte(C.GoString(ptr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return result, nil
}

// ListRenderers lists the names of all registered renderers.
//
// The FFI hands the names back as a JSON array, which is decoded into a
// []string. An error is returned when the FFI reports a failure (upstream:
// the registry lock is poisoned), yields no result, or the JSON cannot be
// decoded.
func ListRenderers() ([]string, error) {
	ptr := C.kreuzberg_list_renderers()
	if ptr != nil {
		// Release the returned string on every path, including when an error
		// is reported alongside a non-nil pointer.
		defer C.kreuzberg_free_string(ptr)
	}
	if err := lastError(); err != nil {
		return nil, err
	}
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result")
	}
	var result []string
	if err := json.Unmarshal([]byte(C.GoString(ptr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return result, nil
}

// ListValidators lists the names of all registered validators.
//
// The FFI hands the names back as a JSON array, which is decoded into a
// []string. An error is returned when the FFI reports a failure, yields no
// result, or the JSON cannot be decoded.
func ListValidators() ([]string, error) {
	ptr := C.kreuzberg_list_validators()
	if ptr != nil {
		// Release the returned string on every path, including when an error
		// is reported alongside a non-nil pointer.
		defer C.kreuzberg_free_string(ptr)
	}
	if err := lastError(); err != nil {
		return nil, err
	}
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result")
	}
	var result []string
	if err := json.Unmarshal([]byte(C.GoString(ptr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return result, nil
}

// Compare computes a structured diff between two extraction results.
//
// The comparison is purely structural — no I/O, no side effects. All fields
// of [`ExtractionDiff`] are populated according to the provided [`DiffOptions`].
//
// Arguments:
//   - a: the "before" extraction result
//   - b: the "after" extraction result
//   - opts: controls which sections are compared and optional truncation
//
// Each argument is serialized to JSON and rebuilt as a native handle for the
// call; the resulting diff is serialized back to JSON and decoded. An error is
// returned when any of those steps fails or the native call yields no diff.
//
// Example (upstream Rust API):
//
//	let mut a = ExtractionResult::default();
//	let mut b = ExtractionResult::default();
//	a.content = "Hello world".to_string();
//	b.content = "Hello Rust".to_string();
//
//	let diff = compare(&a, &b, &DiffOptions::default());
//	assert_eq!(diff.content_diff.len(), 1);
func Compare(a ExtractionResult, b ExtractionResult, opts DiffOptions) (*ExtractionDiff, error) {
	jsonBytesca, err := json.Marshal(a)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal: %w", err)
	}
	// json.Marshal emits "null" for a nil value, which the FFI's _from_json
	// rejects. Substitute "{}" so a default instance is constructed instead.
	if string(jsonBytesca) == "null" {
		jsonBytesca = []byte("{}")
	}
	tmpStrca := C.CString(string(jsonBytesca))
	ca := C.kreuzberg_extraction_result_from_json(tmpStrca)
	C.free(unsafe.Pointer(tmpStrca))
	if ca == nil {
		return nil, fmt.Errorf("failed to create extraction_result: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_extraction_result_free(ca)

	jsonBytescb, err := json.Marshal(b)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal: %w", err)
	}
	// Same "null" -> "{}" substitution as for a.
	if string(jsonBytescb) == "null" {
		jsonBytescb = []byte("{}")
	}
	tmpStrcb := C.CString(string(jsonBytescb))
	cb := C.kreuzberg_extraction_result_from_json(tmpStrcb)
	C.free(unsafe.Pointer(tmpStrcb))
	if cb == nil {
		return nil, fmt.Errorf("failed to create extraction_result: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_extraction_result_free(cb)

	jsonBytescOpts, err := json.Marshal(opts)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal: %w", err)
	}
	// Same "null" -> "{}" substitution as for a.
	if string(jsonBytescOpts) == "null" {
		jsonBytescOpts = []byte("{}")
	}
	tmpStrcOpts := C.CString(string(jsonBytescOpts))
	cOpts := C.kreuzberg_diff_options_from_json(tmpStrcOpts)
	C.free(unsafe.Pointer(tmpStrcOpts))
	if cOpts == nil {
		return nil, fmt.Errorf("failed to create diff_options: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_diff_options_free(cOpts)

	ptr := C.kreuzberg_compare(ca, cb, cOpts)
	if ptr == nil {
		// Do not hand a nil handle to _to_json/_free; surface the FFI error
		// if one was recorded, otherwise a generic failure.
		if err := lastError(); err != nil {
			return nil, err
		}
		return nil, fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_extraction_diff_free(ptr)

	jsonPtr := C.kreuzberg_extraction_diff_to_json(ptr)
	if jsonPtr == nil {
		return nil, fmt.Errorf("failed to convert to JSON")
	}
	defer C.kreuzberg_free_string(jsonPtr)
	var result ExtractionDiff
	if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return &result, nil
}

// EmbedTextsAsync generates embeddings for a list of text strings via the
// asynchronous upstream entry point.
//
// This is the async counterpart to [`embed_texts`]. Upstream, it offloads the
// blocking ONNX inference work to a dedicated blocking thread pool via Tokio's
// `spawn_blocking`, keeping the async executor free. From Go the call is made
// synchronously and its result is decoded before returning.
//
// Returns one embedding vector per input text in the same order.
//
// Arguments:
//   - texts: Strings to embed; a nil slice is sent as an empty list
//   - config: Embedding configuration specifying model, batch size, and normalization
//
// Errors (upstream):
//   - `KreuzbergError::MissingDependency` if ONNX Runtime is not installed
//   - `KreuzbergError::Embedding` if the preset name is unknown, model download
//     fails, or the blocking inference task panics
//
// An error is also returned when marshalling the arguments or decoding the
// JSON result fails.
//
// Example (upstream Rust API):
//
//	let embeddings = embed_texts_async(
//	    vec!["Hello!".to_string()],
//	    &EmbeddingConfig::default(),
//	).await?;
func EmbedTextsAsync(texts []string, config EmbeddingConfig) ([][]float32, error) {
	// json.Marshal encodes a nil slice as "null"; send "[]" so a nil and an
	// empty slice behave the same on the native side.
	if texts == nil {
		texts = []string{}
	}
	jsonBytescTexts, err := json.Marshal(texts)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal: %w", err)
	}
	cTexts := C.CString(string(jsonBytescTexts))
	defer C.free(unsafe.Pointer(cTexts))

	jsonBytescConfig, err := json.Marshal(config)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal: %w", err)
	}
	// json.Marshal emits "null" for a nil value, which the FFI's _from_json
	// rejects. Substitute "{}" so a default instance is constructed instead.
	if string(jsonBytescConfig) == "null" {
		jsonBytescConfig = []byte("{}")
	}
	tmpStrcConfig := C.CString(string(jsonBytescConfig))
	cConfig := C.kreuzberg_embedding_config_from_json(tmpStrcConfig)
	C.free(unsafe.Pointer(tmpStrcConfig))
	if cConfig == nil {
		return nil, fmt.Errorf("failed to create embedding_config: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_embedding_config_free(cConfig)

	ptr := C.kreuzberg_embed_texts_async(cTexts, cConfig)
	if ptr != nil {
		// Release the returned string on every path, including when an error
		// is reported alongside a non-nil pointer.
		defer C.kreuzberg_free_string(ptr)
	}
	if err := lastError(); err != nil {
		return nil, err
	}
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result")
	}
	var result [][]float32
	if err := json.Unmarshal([]byte(C.GoString(ptr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return result, nil
}

// RenderPdfPageToPng renders a single PDF page to PNG bytes.
//
// Returns raw PNG-encoded bytes for the specified page at the given DPI.
// Uses pdf_oxide with tiny-skia for pure-Rust rendering.
//
// Arguments:
//   - pdfBytes: Raw PDF file bytes
//   - pageIndex: Zero-based page index
//   - dpi: Resolution in dots per inch (default: 150); nil is passed to the
//     FFI as the sentinel value 2147483647
//   - password: Optional password for encrypted PDFs; nil is passed as a NULL
//     C string
//
// An error is returned when the FFI call fails (upstream:
// `KreuzbergError::Parsing` if the PDF cannot be opened, authenticated, or
// rendered, or if `page_index` is out of range), when it produces no output
// buffer, or when the output is too large to copy into a Go slice.
func RenderPdfPageToPng(pdfBytes []byte, pageIndex uint, dpi *int32, password *string) ([]byte, error) {
	var cPdfBytes *C.uint8_t
	if len(pdfBytes) > 0 {
		// Pin the first element for the duration of the call so the pointer
		// handed to C stays valid.
		var cPdfBytesPinner runtime.Pinner
		cPdfBytesPinner.Pin(&pdfBytes[0])
		defer cPdfBytesPinner.Unpin()
		cPdfBytes = (*C.uint8_t)(unsafe.Pointer(&pdfBytes[0]))
	}
	cPdfBytesLen := C.uintptr_t(len(pdfBytes))

	cPageIndex := C.size_t(pageIndex)

	// NOTE(review): 2147483647 appears to be the FFI's "not set" marker for
	// the optional dpi — confirm against the native signature.
	cDpi := C.int32_t(2147483647)
	if dpi != nil {
		cDpi = C.int32_t(*dpi)
	}

	var cPassword *C.char
	if password != nil {
		cPassword = C.CString(*password)
		defer C.free(unsafe.Pointer(cPassword))
	}

	var outPtr *C.uint8_t
	var outLen, outCap C.uintptr_t
	rc := C.kreuzberg_render_pdf_page_to_png(cPdfBytes, cPdfBytesLen, cPageIndex, cDpi, cPassword, &outPtr, &outLen, &outCap)
	if rc != 0 || outPtr == nil {
		// Never report a failed call as (nil, nil): fall back to a generic
		// error when the FFI did not record one.
		if err := lastError(); err != nil {
			return nil, err
		}
		return nil, fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_free_bytes(outPtr, outLen, outCap)

	// C.GoBytes takes a C.int length; refuse sizes that would be truncated.
	if uint64(outLen) > 1<<31-1 {
		return nil, fmt.Errorf("rendered PNG too large: %d bytes", uint64(outLen))
	}
	return C.GoBytes(unsafe.Pointer(outPtr), C.int(outLen)), nil
}

// DetectMimeType detects the MIME type of a file at the given path.
//
// Uses the file extension and optionally the file content to determine the
// MIME type. Set checkExists to true to verify the file exists before
// detection.
//
// An error is returned when the FFI layer reports a failure, or when the call
// yields no string without setting an error.
func DetectMimeType(path string, checkExists bool) (string, error) {
	cPath := C.CString(path)
	defer C.free(unsafe.Pointer(cPath))

	// The FFI takes the flag as an int32: 1 for true, 0 for false.
	var cCheckExists C.int32_t
	if checkExists {
		cCheckExists = 1
	}

	ptr := C.kreuzberg_detect_mime_type(cPath, cCheckExists)
	if ptr != nil {
		// Release the returned string on every path, including the error path.
		defer C.kreuzberg_free_string(ptr)
	}
	if err := lastError(); err != nil {
		return "", err
	}
	if ptr == nil {
		// No error code was set but nothing came back either; report it
		// instead of returning an empty MIME type as a success.
		return "", fmt.Errorf("failed to get result")
	}
	return C.GoString(ptr), nil
}

// EmbedTexts embeds a list of texts using the configured embedding model.
//
// Returns a 2D slice where each inner slice is the embedding for the
// corresponding text.
//
// Arguments:
//   - texts: Strings to embed; a nil slice is sent as an empty list
//   - config: Embedding configuration, serialized to JSON for the FFI
//
// An error is returned when marshalling the arguments fails, the FFI reports a
// failure or yields no result, or the JSON result cannot be decoded.
func EmbedTexts(texts []string, config EmbeddingConfig) ([][]float32, error) {
	// json.Marshal encodes a nil slice as "null"; send "[]" so a nil and an
	// empty slice behave the same on the native side.
	if texts == nil {
		texts = []string{}
	}
	jsonBytescTexts, err := json.Marshal(texts)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal: %w", err)
	}
	cTexts := C.CString(string(jsonBytescTexts))
	defer C.free(unsafe.Pointer(cTexts))

	jsonBytescConfig, err := json.Marshal(config)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal: %w", err)
	}
	// json.Marshal emits "null" for a nil value, which the FFI's _from_json
	// rejects. Substitute "{}" so a default instance is constructed instead.
	if string(jsonBytescConfig) == "null" {
		jsonBytescConfig = []byte("{}")
	}
	tmpStrcConfig := C.CString(string(jsonBytescConfig))
	cConfig := C.kreuzberg_embedding_config_from_json(tmpStrcConfig)
	C.free(unsafe.Pointer(tmpStrcConfig))
	if cConfig == nil {
		return nil, fmt.Errorf("failed to create embedding_config: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_embedding_config_free(cConfig)

	ptr := C.kreuzberg_embed_texts(cTexts, cConfig)
	if ptr != nil {
		// Release the returned string on every path, including when an error
		// is reported alongside a non-nil pointer.
		defer C.kreuzberg_free_string(ptr)
	}
	if err := lastError(); err != nil {
		return nil, err
	}
	if ptr == nil {
		return nil, fmt.Errorf("failed to get result")
	}
	var result [][]float32
	if err := json.Unmarshal([]byte(C.GoString(ptr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return result, nil
}

// GetEmbeddingPreset looks up an embedding preset by name.
//
// Returns nil if no preset with the given name exists (upstream `None`), or
// if the preset cannot be converted to JSON or decoded. Upstream returns an
// owned clone so the value is safe to pass across FFI boundaries; that owned
// handle is released here once it has been decoded.
func GetEmbeddingPreset(name string) *EmbeddingPreset {
	cName := C.CString(name)
	defer C.free(unsafe.Pointer(cName))

	ptr := C.kreuzberg_get_embedding_preset(cName)
	if ptr == nil {
		// Unknown preset: nothing to convert and nothing to free.
		return nil
	}
	// NOTE(review): assumes kreuzberg.h exports kreuzberg_embedding_preset_free,
	// following the <type>_free convention used for every other handle in this
	// file — confirm. Without it the owned preset handle is leaked on each call.
	defer C.kreuzberg_embedding_preset_free(ptr)

	jsonPtr := C.kreuzberg_embedding_preset_to_json(ptr)
	if jsonPtr == nil {
		return nil
	}
	defer C.kreuzberg_free_string(jsonPtr)
	var result EmbeddingPreset
	if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
		// The signature has no error return; a decode failure is reported as nil.
		return nil
	}
	return &result
}

// ListEmbeddingPresets lists the names of all available embedding presets.
//
// Upstream returns owned `String`s so the values are safe to pass across FFI
// boundaries. The names arrive as a JSON array; nil is returned when the FFI
// yields nothing or the JSON cannot be decoded.
func ListEmbeddingPresets() []string {
	raw := C.kreuzberg_list_embedding_presets()
	if raw == nil {
		return nil
	}
	defer C.kreuzberg_free_string(raw)

	var names []string
	decodeErr := json.Unmarshal([]byte(C.GoString(raw)), &names)
	if decodeErr != nil {
		// No error return in the signature: a decode failure yields nil.
		return nil
	}
	return names
}

// NeedsImageProcessing reports whether image processing is needed, based on
// the OCR and image extraction settings.
//
// Returns true if either OCR is enabled or image extraction is configured,
// indicating that image decompression and processing should occur. Returns
// false if both are disabled, allowing text-only extraction workflows to skip
// unnecessary image decompression.
//
// # Optimization Impact
// For text-only extractions (no OCR, no image extraction), skipping image
// decompression can improve CPU utilization by 5-10% by avoiding wasteful
// image I/O and processing when results won't be used.
//
// The receiver is serialized to JSON and rebuilt as a native handle for the
// call; an error is returned when either step fails.
func (r *ExtractionConfig) NeedsImageProcessing() (bool, error) {
	payload, err := json.Marshal(r)
	if err != nil {
		return false, fmt.Errorf("failed to marshal receiver: %w", err)
	}

	// The temporary C string is only needed while the handle is built.
	cJSON := C.CString(string(payload))
	handle := C.kreuzberg_extraction_config_from_json(cJSON)
	C.free(unsafe.Pointer(cJSON))
	if handle == nil {
		reason := C.GoString(C.kreuzberg_last_error_context())
		return false, fmt.Errorf("failed to create receiver: %s", reason)
	}
	defer C.kreuzberg_extraction_config_free(handle)

	// The FFI reports the boolean as an integer; non-zero means true.
	needed := C.kreuzberg_extraction_config_needs_image_processing(handle) != 0
	return needed, nil
}

// ListenAddr returns the server listen address (host:port).
//
// The receiver is serialized to JSON and rebuilt as a native handle for the
// call. An error is returned when either step fails or the native call yields
// no string.
//
// Example (upstream Rust API):
//
//	let config = ServerConfig::default();
//	assert_eq!(config.listen_addr(), "127.0.0.1:8000");
func (r *ServerConfig) ListenAddr() (string, error) {
	jsonBytesRecv, err := json.Marshal(r)
	if err != nil {
		return "", fmt.Errorf("failed to marshal receiver: %w", err)
	}
	tmpStrRecv := C.CString(string(jsonBytesRecv))
	cRecv := C.kreuzberg_server_config_from_json(tmpStrRecv)
	C.free(unsafe.Pointer(tmpStrRecv))
	if cRecv == nil {
		return "", fmt.Errorf("failed to create receiver: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_server_config_free(cRecv)

	ptr := C.kreuzberg_server_config_listen_addr(cRecv)
	if ptr == nil {
		// Report a missing result instead of returning "" as a valid address
		// and passing nil to kreuzberg_free_string.
		return "", fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_free_string(ptr)
	return C.GoString(ptr), nil
}

// CorsAllowsAll reports whether CORS allows all origins.
//
// Returns true if the `cors_origins` vector is empty, meaning all origins
// are allowed. Returns false if specific origins are configured.
//
// The receiver is serialized to JSON and rebuilt as a native handle for the
// call; an error is returned when either step fails.
//
// Example (upstream Rust API):
//
//	let mut config = ServerConfig::default();
//	assert!(config.cors_allows_all());
//
//	config.cors_origins.push("https://example.com".to_string());
//	assert!(!config.cors_allows_all());
func (r *ServerConfig) CorsAllowsAll() (bool, error) {
	payload, err := json.Marshal(r)
	if err != nil {
		return false, fmt.Errorf("failed to marshal receiver: %w", err)
	}

	// The temporary C string is only needed while the handle is built.
	cJSON := C.CString(string(payload))
	handle := C.kreuzberg_server_config_from_json(cJSON)
	C.free(unsafe.Pointer(cJSON))
	if handle == nil {
		reason := C.GoString(C.kreuzberg_last_error_context())
		return false, fmt.Errorf("failed to create receiver: %s", reason)
	}
	defer C.kreuzberg_server_config_free(handle)

	// The FFI reports the boolean as an integer; non-zero means true.
	allowsAll := C.kreuzberg_server_config_cors_allows_all(handle) != 0
	return allowsAll, nil
}

// IsOriginAllowed reports whether a given origin is allowed by the CORS
// configuration.
//
// Returns true if:
// - CORS allows all origins (empty origins list), or
// - The given origin is in the allowed origins list
//
// Arguments:
//   - origin: The origin to check (e.g., "https://example.com")
//
// The receiver is serialized to JSON and rebuilt as a native handle for the
// call; an error is returned when either step fails.
//
// Example (upstream Rust API):
//
//	let mut config = ServerConfig::default();
//	assert!(config.is_origin_allowed("https://example.com"));
//
//	config.cors_origins.push("https://allowed.com".to_string());
//	assert!(config.is_origin_allowed("https://allowed.com"));
//	assert!(!config.is_origin_allowed("https://denied.com"));
func (r *ServerConfig) IsOriginAllowed(origin string) (bool, error) {
	originC := C.CString(origin)
	defer C.free(unsafe.Pointer(originC))

	payload, err := json.Marshal(r)
	if err != nil {
		return false, fmt.Errorf("failed to marshal receiver: %w", err)
	}

	// The temporary C string is only needed while the handle is built.
	cJSON := C.CString(string(payload))
	handle := C.kreuzberg_server_config_from_json(cJSON)
	C.free(unsafe.Pointer(cJSON))
	if handle == nil {
		reason := C.GoString(C.kreuzberg_last_error_context())
		return false, fmt.Errorf("failed to create receiver: %s", reason)
	}
	defer C.kreuzberg_server_config_free(handle)

	// The FFI reports the boolean as an integer; non-zero means true.
	allowed := C.kreuzberg_server_config_is_origin_allowed(handle, originC) != 0
	return allowed, nil
}

// MaxRequestBodyMb returns the maximum request body size in megabytes
// (rounded up).
//
// The receiver is serialized to JSON and rebuilt as a native handle for the
// call; an error is returned when either step fails.
//
// Example (upstream Rust API):
//
//	let mut config = ServerConfig::default();
//	assert_eq!(config.max_request_body_mb(), 100);
func (r *ServerConfig) MaxRequestBodyMb() (uint, error) {
	payload, err := json.Marshal(r)
	if err != nil {
		return 0, fmt.Errorf("failed to marshal receiver: %w", err)
	}

	// The temporary C string is only needed while the handle is built.
	cJSON := C.CString(string(payload))
	handle := C.kreuzberg_server_config_from_json(cJSON)
	C.free(unsafe.Pointer(cJSON))
	if handle == nil {
		reason := C.GoString(C.kreuzberg_last_error_context())
		return 0, fmt.Errorf("failed to create receiver: %s", reason)
	}
	defer C.kreuzberg_server_config_free(handle)

	megabytes := uint(C.kreuzberg_server_config_max_request_body_mb(handle))
	return megabytes, nil
}

// MaxMultipartFieldMb returns the maximum multipart field size in megabytes
// (rounded up).
//
// The receiver is serialized to JSON and rebuilt as a native handle for the
// call; an error is returned when either step fails.
//
// Example (upstream Rust API):
//
//	let mut config = ServerConfig::default();
//	assert_eq!(config.max_multipart_field_mb(), 100);
func (r *ServerConfig) MaxMultipartFieldMb() (uint, error) {
	payload, err := json.Marshal(r)
	if err != nil {
		return 0, fmt.Errorf("failed to marshal receiver: %w", err)
	}

	// The temporary C string is only needed while the handle is built.
	cJSON := C.CString(string(payload))
	handle := C.kreuzberg_server_config_from_json(cJSON)
	C.free(unsafe.Pointer(cJSON))
	if handle == nil {
		reason := C.GoString(C.kreuzberg_last_error_context())
		return 0, fmt.Errorf("failed to create receiver: %s", reason)
	}
	defer C.kreuzberg_server_config_free(handle)

	megabytes := uint(C.kreuzberg_server_config_max_multipart_field_mb(handle))
	return megabytes, nil
}

// FinalizeNodeTypes computes and populates the `node_types` field from the
// current `nodes`.
//
// Call this after all nodes have been added to the structure. Upstream,
// internal construction paths (builder, derivation) call this automatically.
//
// The receiver is serialized to JSON, rebuilt as a native handle, finalized
// natively, and the updated JSON is decoded back into the receiver. An error
// is returned when any of those steps fails, so a structure that was not
// updated is never reported as finalized.
//
// Example (upstream Rust API):
//
//	let mut structure = DocumentStructure {
//	    nodes: vec![DocumentNode {
//	        id: NodeId::from("n1"),
//	        content: NodeContent::Paragraph { text: "Hello".into() },
//	        parent: None,
//	        children: vec![],
//	        content_layer: Default::default(),
//	        page: None,
//	        page_end: None,
//	        bbox: None,
//	        annotations: vec![],
//	        attributes: None,
//	    }],
//	    source_format: None,
//	    relationships: vec![],
//	    node_types: vec![],
//	};
//	structure.finalize_node_types();
//	assert!(structure.node_types.contains(&"paragraph".to_string()));
func (r *DocumentStructure) FinalizeNodeTypes() error {
	jsonBytesRecv, err := json.Marshal(r)
	if err != nil {
		return fmt.Errorf("failed to marshal receiver: %w", err)
	}
	tmpStrRecv := C.CString(string(jsonBytesRecv))
	cRecv := C.kreuzberg_document_structure_from_json(tmpStrRecv)
	C.free(unsafe.Pointer(tmpStrRecv))
	if cRecv == nil {
		return fmt.Errorf("failed to create receiver: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_document_structure_free(cRecv)

	C.kreuzberg_document_structure_finalize_node_types(cRecv)

	// The mutation happened on the native copy; read it back into r.
	jsonPtrUpdated := C.kreuzberg_document_structure_to_json(cRecv)
	if jsonPtrUpdated == nil {
		return fmt.Errorf("failed to convert to JSON")
	}
	defer C.kreuzberg_free_string(jsonPtrUpdated)
	if err := json.Unmarshal([]byte(C.GoString(jsonPtrUpdated)), r); err != nil {
		return fmt.Errorf("failed to unmarshal: %w", err)
	}
	return nil
}

// IsEmpty reports whether the document structure is empty.
//
// The receiver is serialized to JSON and rebuilt as a native handle for the
// call; an error is returned when either step fails.
func (r *DocumentStructure) IsEmpty() (bool, error) {
	payload, err := json.Marshal(r)
	if err != nil {
		return false, fmt.Errorf("failed to marshal receiver: %w", err)
	}

	// The temporary C string is only needed while the handle is built.
	cJSON := C.CString(string(payload))
	handle := C.kreuzberg_document_structure_from_json(cJSON)
	C.free(unsafe.Pointer(cJSON))
	if handle == nil {
		reason := C.GoString(C.kreuzberg_last_error_context())
		return false, fmt.Errorf("failed to create receiver: %s", reason)
	}
	defer C.kreuzberg_document_structure_free(handle)

	// The FFI reports the boolean as an integer; non-zero means true.
	empty := C.kreuzberg_document_structure_is_empty(handle) != 0
	return empty, nil
}

// ExtractionResultFromOcr converts an OCR result into an ExtractionResult.
//
// The OCR result is serialized to JSON and rebuilt as a native handle; the
// native conversion result is serialized back to JSON and decoded.
//
// An error is returned when any of those steps fails. A nil result is never
// returned together with a nil error.
func ExtractionResultFromOcr(ocr OcrExtractionResult) (*ExtractionResult, error) {
	jsonBytescOcr, err := json.Marshal(ocr)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal: %w", err)
	}
	// json.Marshal emits "null" for a nil value, which the FFI's _from_json
	// rejects. Substitute "{}" so a default instance is constructed instead.
	if string(jsonBytescOcr) == "null" {
		jsonBytescOcr = []byte("{}")
	}
	tmpStrcOcr := C.CString(string(jsonBytescOcr))
	cOcr := C.kreuzberg_ocr_extraction_result_from_json(tmpStrcOcr)
	C.free(unsafe.Pointer(tmpStrcOcr))
	if cOcr == nil {
		return nil, fmt.Errorf("failed to create ocr_extraction_result: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_ocr_extraction_result_free(cOcr)

	ptr := C.kreuzberg_extraction_result_from_ocr(cOcr)
	if ptr == nil {
		// Do not hand a nil handle to _to_json/_free.
		return nil, fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_extraction_result_free(ptr)

	jsonPtr := C.kreuzberg_extraction_result_to_json(ptr)
	if jsonPtr == nil {
		return nil, fmt.Errorf("failed to convert to JSON")
	}
	defer C.kreuzberg_free_string(jsonPtr)
	var result ExtractionResult
	if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return &result, nil
}

// IsEmpty reports whether the metadata is empty: true when no metadata
// fields, format-specific metadata, or additional postprocessor fields are
// populated.
//
// The receiver is serialized to JSON and rebuilt as a native handle for the
// call; an error is returned when either step fails.
func (r *Metadata) IsEmpty() (bool, error) {
	payload, err := json.Marshal(r)
	if err != nil {
		return false, fmt.Errorf("failed to marshal receiver: %w", err)
	}

	// The temporary C string is only needed while the handle is built.
	cJSON := C.CString(string(payload))
	handle := C.kreuzberg_metadata_from_json(cJSON)
	C.free(unsafe.Pointer(cJSON))
	if handle == nil {
		reason := C.GoString(C.kreuzberg_last_error_context())
		return false, fmt.Errorf("failed to create receiver: %s", reason)
	}
	defer C.kreuzberg_metadata_free(handle)

	// The FFI reports the boolean as an integer; non-zero means true.
	empty := C.kreuzberg_metadata_is_empty(handle) != 0
	return empty, nil
}

// WithCacheDir returns a copy of the config with a custom cache directory for
// model files. The receiver itself is not modified.
//
// Arguments:
//   - path: Path to cache directory
//
// The receiver is serialized to JSON and rebuilt as a native handle; the
// updated native config is serialized back to JSON and decoded. An error is
// returned when any of those steps fails; a nil config is never returned
// together with a nil error.
//
// Example (upstream Rust API):
//
//	let config = PaddleOcrConfig::new("en")
//	    .with_cache_dir(PathBuf::from("/tmp/paddle-cache"));
func (r *PaddleOcrConfig) WithCacheDir(path string) (*PaddleOcrConfig, error) {
	cPath := C.CString(path)
	defer C.free(unsafe.Pointer(cPath))

	jsonBytesRecv, err := json.Marshal(r)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal receiver: %w", err)
	}
	tmpStrRecv := C.CString(string(jsonBytesRecv))
	cRecv := C.kreuzberg_paddle_ocr_config_from_json(tmpStrRecv)
	C.free(unsafe.Pointer(tmpStrRecv))
	if cRecv == nil {
		return nil, fmt.Errorf("failed to create receiver: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_paddle_ocr_config_free(cRecv)

	ptr := C.kreuzberg_paddle_ocr_config_with_cache_dir(cRecv, cPath)
	if ptr == nil {
		// Do not hand a nil handle to _to_json/_free.
		return nil, fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_paddle_ocr_config_free(ptr)

	jsonPtr := C.kreuzberg_paddle_ocr_config_to_json(ptr)
	if jsonPtr == nil {
		return nil, fmt.Errorf("failed to convert to JSON")
	}
	defer C.kreuzberg_free_string(jsonPtr)
	var result PaddleOcrConfig
	if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return &result, nil
}

// WithTableDetection returns a copy of the config with table structure
// detection enabled or disabled. The receiver itself is not modified.
//
// Arguments:
//   - enable: Whether to enable table detection
//
// The receiver is serialized to JSON and rebuilt as a native handle; the
// updated native config is serialized back to JSON and decoded. An error is
// returned when any of those steps fails; a nil config is never returned
// together with a nil error.
//
// Example (upstream Rust API):
//
//	let config = PaddleOcrConfig::new("en")
//	    .with_table_detection(true);
func (r *PaddleOcrConfig) WithTableDetection(enable bool) (*PaddleOcrConfig, error) {
	// The FFI takes the flag as an int32: 1 for true, 0 for false.
	var cEnable C.int32_t
	if enable {
		cEnable = 1
	}

	jsonBytesRecv, err := json.Marshal(r)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal receiver: %w", err)
	}
	tmpStrRecv := C.CString(string(jsonBytesRecv))
	cRecv := C.kreuzberg_paddle_ocr_config_from_json(tmpStrRecv)
	C.free(unsafe.Pointer(tmpStrRecv))
	if cRecv == nil {
		return nil, fmt.Errorf("failed to create receiver: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_paddle_ocr_config_free(cRecv)

	ptr := C.kreuzberg_paddle_ocr_config_with_table_detection(cRecv, cEnable)
	if ptr == nil {
		// Do not hand a nil handle to _to_json/_free.
		return nil, fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_paddle_ocr_config_free(ptr)

	jsonPtr := C.kreuzberg_paddle_ocr_config_to_json(ptr)
	if jsonPtr == nil {
		return nil, fmt.Errorf("failed to convert to JSON")
	}
	defer C.kreuzberg_free_string(jsonPtr)
	var result PaddleOcrConfig
	if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return &result, nil
}

// WithAngleCls returns a copy of the config with angle classification for
// rotated text enabled or disabled. The receiver itself is not modified.
//
// Arguments:
//   - enable: Whether to enable angle classification
//
// The receiver is serialized to JSON and rebuilt as a native handle; the
// updated native config is serialized back to JSON and decoded. An error is
// returned when any of those steps fails; a nil config is never returned
// together with a nil error.
func (r *PaddleOcrConfig) WithAngleCls(enable bool) (*PaddleOcrConfig, error) {
	// The FFI takes the flag as an int32: 1 for true, 0 for false.
	var cEnable C.int32_t
	if enable {
		cEnable = 1
	}

	jsonBytesRecv, err := json.Marshal(r)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal receiver: %w", err)
	}
	tmpStrRecv := C.CString(string(jsonBytesRecv))
	cRecv := C.kreuzberg_paddle_ocr_config_from_json(tmpStrRecv)
	C.free(unsafe.Pointer(tmpStrRecv))
	if cRecv == nil {
		return nil, fmt.Errorf("failed to create receiver: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_paddle_ocr_config_free(cRecv)

	ptr := C.kreuzberg_paddle_ocr_config_with_angle_cls(cRecv, cEnable)
	if ptr == nil {
		// Do not hand a nil handle to _to_json/_free.
		return nil, fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_paddle_ocr_config_free(ptr)

	jsonPtr := C.kreuzberg_paddle_ocr_config_to_json(ptr)
	if jsonPtr == nil {
		return nil, fmt.Errorf("failed to convert to JSON")
	}
	defer C.kreuzberg_free_string(jsonPtr)
	var result PaddleOcrConfig
	if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return &result, nil
}

// WithDetDbThresh returns a copy of the config with the database threshold
// for text detection set. The receiver itself is not modified.
//
// Arguments:
//   - threshold: Detection threshold (0.0-1.0)
//
// The receiver is serialized to JSON and rebuilt as a native handle; the
// updated native config is serialized back to JSON and decoded. An error is
// returned when any of those steps fails; a nil config is never returned
// together with a nil error.
func (r *PaddleOcrConfig) WithDetDbThresh(threshold float32) (*PaddleOcrConfig, error) {
	cThreshold := C.float(threshold)

	jsonBytesRecv, err := json.Marshal(r)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal receiver: %w", err)
	}
	tmpStrRecv := C.CString(string(jsonBytesRecv))
	cRecv := C.kreuzberg_paddle_ocr_config_from_json(tmpStrRecv)
	C.free(unsafe.Pointer(tmpStrRecv))
	if cRecv == nil {
		return nil, fmt.Errorf("failed to create receiver: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_paddle_ocr_config_free(cRecv)

	ptr := C.kreuzberg_paddle_ocr_config_with_det_db_thresh(cRecv, cThreshold)
	if ptr == nil {
		// Do not hand a nil handle to _to_json/_free.
		return nil, fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_paddle_ocr_config_free(ptr)

	jsonPtr := C.kreuzberg_paddle_ocr_config_to_json(ptr)
	if jsonPtr == nil {
		return nil, fmt.Errorf("failed to convert to JSON")
	}
	defer C.kreuzberg_free_string(jsonPtr)
	var result PaddleOcrConfig
	if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return &result, nil
}

// WithDetDbBoxThresh returns a copy of the config with the box threshold for
// text bounding box refinement set. The receiver itself is not modified.
//
// Arguments:
//   - threshold: Box threshold (0.0-1.0)
//
// The receiver is serialized to JSON and rebuilt as a native handle; the
// updated native config is serialized back to JSON and decoded. An error is
// returned when any of those steps fails; a nil config is never returned
// together with a nil error.
func (r *PaddleOcrConfig) WithDetDbBoxThresh(threshold float32) (*PaddleOcrConfig, error) {
	cThreshold := C.float(threshold)

	jsonBytesRecv, err := json.Marshal(r)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal receiver: %w", err)
	}
	tmpStrRecv := C.CString(string(jsonBytesRecv))
	cRecv := C.kreuzberg_paddle_ocr_config_from_json(tmpStrRecv)
	C.free(unsafe.Pointer(tmpStrRecv))
	if cRecv == nil {
		return nil, fmt.Errorf("failed to create receiver: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_paddle_ocr_config_free(cRecv)

	ptr := C.kreuzberg_paddle_ocr_config_with_det_db_box_thresh(cRecv, cThreshold)
	if ptr == nil {
		// Do not hand a nil handle to _to_json/_free.
		return nil, fmt.Errorf("failed to get result")
	}
	defer C.kreuzberg_paddle_ocr_config_free(ptr)

	jsonPtr := C.kreuzberg_paddle_ocr_config_to_json(ptr)
	if jsonPtr == nil {
		return nil, fmt.Errorf("failed to convert to JSON")
	}
	defer C.kreuzberg_free_string(jsonPtr)
	var result PaddleOcrConfig
	if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal: %w", err)
	}
	return &result, nil
}

// WithDetDbUnclipRatio sets the unclip ratio for expanding text bounding boxes.
//
// The receiver is serialized to JSON, rebuilt as a native config, handed to
// the native setter, and the resulting native config is decoded into a newly
// allocated PaddleOcrConfig that is returned to the caller.
//
// Arguments:
//   - ratio: Unclip ratio (typically 1.5-2.0)
//
// Returns the decoded config, or a nil config and a non-nil error when any
// step of the round trip through the native layer fails.
func (r *PaddleOcrConfig) WithDetDbUnclipRatio(ratio float32) (*PaddleOcrConfig, error) {
	cRatio := C.float(ratio)

	jsonBytesRecv, err := json.Marshal(r)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal receiver: %w", err)
	}
	// The C copy of the JSON is only needed for the from_json call itself.
	tmpStrRecv := C.CString(string(jsonBytesRecv))
	cRecv := C.kreuzberg_paddle_ocr_config_from_json(tmpStrRecv)
	C.free(unsafe.Pointer(tmpStrRecv))
	if cRecv == nil {
		return nil, fmt.Errorf("failed to create receiver: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_paddle_ocr_config_free(cRecv)

	ptr := C.kreuzberg_paddle_ocr_config_with_det_db_unclip_ratio(cRecv, cRatio)
	if ptr == nil {
		// Report the failure instead of returning (nil, nil).
		return nil, fmt.Errorf("failed to apply det_db_unclip_ratio: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_paddle_ocr_config_free(ptr)

	jsonPtr := C.kreuzberg_paddle_ocr_config_to_json(ptr)
	if jsonPtr == nil {
		return nil, fmt.Errorf("failed to serialize result: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_free_string(jsonPtr)

	var result PaddleOcrConfig
	if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal result: %w", err)
	}
	return &result, nil
}

// WithDetLimitSideLen sets the maximum side length for detection images.
//
// The receiver is serialized to JSON, rebuilt as a native config, handed to
// the native setter, and the resulting native config is decoded into a newly
// allocated PaddleOcrConfig that is returned to the caller.
//
// Arguments:
//   - length: Maximum side length in pixels
//
// Returns the decoded config, or a nil config and a non-nil error when any
// step of the round trip through the native layer fails.
func (r *PaddleOcrConfig) WithDetLimitSideLen(length uint32) (*PaddleOcrConfig, error) {
	cLength := C.uint32_t(length)

	jsonBytesRecv, err := json.Marshal(r)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal receiver: %w", err)
	}
	// The C copy of the JSON is only needed for the from_json call itself.
	tmpStrRecv := C.CString(string(jsonBytesRecv))
	cRecv := C.kreuzberg_paddle_ocr_config_from_json(tmpStrRecv)
	C.free(unsafe.Pointer(tmpStrRecv))
	if cRecv == nil {
		return nil, fmt.Errorf("failed to create receiver: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_paddle_ocr_config_free(cRecv)

	ptr := C.kreuzberg_paddle_ocr_config_with_det_limit_side_len(cRecv, cLength)
	if ptr == nil {
		// Report the failure instead of returning (nil, nil).
		return nil, fmt.Errorf("failed to apply det_limit_side_len: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_paddle_ocr_config_free(ptr)

	jsonPtr := C.kreuzberg_paddle_ocr_config_to_json(ptr)
	if jsonPtr == nil {
		return nil, fmt.Errorf("failed to serialize result: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_free_string(jsonPtr)

	var result PaddleOcrConfig
	if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal result: %w", err)
	}
	return &result, nil
}

// WithRecBatchNum sets the batch size for recognition inference.
//
// The receiver is serialized to JSON, rebuilt as a native config, handed to
// the native setter, and the resulting native config is decoded into a newly
// allocated PaddleOcrConfig that is returned to the caller.
//
// Arguments:
//   - batchSize: Number of text regions to process simultaneously
//
// Returns the decoded config, or a nil config and a non-nil error when any
// step of the round trip through the native layer fails.
func (r *PaddleOcrConfig) WithRecBatchNum(batchSize uint32) (*PaddleOcrConfig, error) {
	cBatchSize := C.uint32_t(batchSize)

	jsonBytesRecv, err := json.Marshal(r)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal receiver: %w", err)
	}
	// The C copy of the JSON is only needed for the from_json call itself.
	tmpStrRecv := C.CString(string(jsonBytesRecv))
	cRecv := C.kreuzberg_paddle_ocr_config_from_json(tmpStrRecv)
	C.free(unsafe.Pointer(tmpStrRecv))
	if cRecv == nil {
		return nil, fmt.Errorf("failed to create receiver: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_paddle_ocr_config_free(cRecv)

	ptr := C.kreuzberg_paddle_ocr_config_with_rec_batch_num(cRecv, cBatchSize)
	if ptr == nil {
		// Report the failure instead of returning (nil, nil).
		return nil, fmt.Errorf("failed to apply rec_batch_num: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_paddle_ocr_config_free(ptr)

	jsonPtr := C.kreuzberg_paddle_ocr_config_to_json(ptr)
	if jsonPtr == nil {
		return nil, fmt.Errorf("failed to serialize result: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_free_string(jsonPtr)

	var result PaddleOcrConfig
	if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal result: %w", err)
	}
	return &result, nil
}

// WithDropScore sets the minimum recognition confidence threshold.
//
// The receiver is serialized to JSON, rebuilt as a native config, handed to
// the native setter, and the resulting native config is decoded into a newly
// allocated PaddleOcrConfig that is returned to the caller.
//
// Arguments:
//   - score: Minimum confidence (0.0-1.0), text below this is dropped
//
// Returns the decoded config, or a nil config and a non-nil error when any
// step of the round trip through the native layer fails.
func (r *PaddleOcrConfig) WithDropScore(score float32) (*PaddleOcrConfig, error) {
	cScore := C.float(score)

	jsonBytesRecv, err := json.Marshal(r)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal receiver: %w", err)
	}
	// The C copy of the JSON is only needed for the from_json call itself.
	tmpStrRecv := C.CString(string(jsonBytesRecv))
	cRecv := C.kreuzberg_paddle_ocr_config_from_json(tmpStrRecv)
	C.free(unsafe.Pointer(tmpStrRecv))
	if cRecv == nil {
		return nil, fmt.Errorf("failed to create receiver: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_paddle_ocr_config_free(cRecv)

	ptr := C.kreuzberg_paddle_ocr_config_with_drop_score(cRecv, cScore)
	if ptr == nil {
		// Report the failure instead of returning (nil, nil).
		return nil, fmt.Errorf("failed to apply drop_score: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_paddle_ocr_config_free(ptr)

	jsonPtr := C.kreuzberg_paddle_ocr_config_to_json(ptr)
	if jsonPtr == nil {
		return nil, fmt.Errorf("failed to serialize result: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_free_string(jsonPtr)

	var result PaddleOcrConfig
	if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal result: %w", err)
	}
	return &result, nil
}

// WithPadding sets padding in pixels added around images before detection.
//
// The receiver is serialized to JSON, rebuilt as a native config, handed to
// the native setter, and the resulting native config is decoded into a newly
// allocated PaddleOcrConfig that is returned to the caller.
//
// Arguments:
//   - padding: Padding in pixels (0-100)
//
// Returns the decoded config, or a nil config and a non-nil error when any
// step of the round trip through the native layer fails.
func (r *PaddleOcrConfig) WithPadding(padding uint32) (*PaddleOcrConfig, error) {
	cPadding := C.uint32_t(padding)

	jsonBytesRecv, err := json.Marshal(r)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal receiver: %w", err)
	}
	// The C copy of the JSON is only needed for the from_json call itself.
	tmpStrRecv := C.CString(string(jsonBytesRecv))
	cRecv := C.kreuzberg_paddle_ocr_config_from_json(tmpStrRecv)
	C.free(unsafe.Pointer(tmpStrRecv))
	if cRecv == nil {
		return nil, fmt.Errorf("failed to create receiver: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_paddle_ocr_config_free(cRecv)

	ptr := C.kreuzberg_paddle_ocr_config_with_padding(cRecv, cPadding)
	if ptr == nil {
		// Report the failure instead of returning (nil, nil).
		return nil, fmt.Errorf("failed to apply padding: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_paddle_ocr_config_free(ptr)

	jsonPtr := C.kreuzberg_paddle_ocr_config_to_json(ptr)
	if jsonPtr == nil {
		return nil, fmt.Errorf("failed to serialize result: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_free_string(jsonPtr)

	var result PaddleOcrConfig
	if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal result: %w", err)
	}
	return &result, nil
}

// WithModelTier sets the model tier controlling detection/recognition model size.
//
// The receiver is serialized to JSON, rebuilt as a native config, handed to
// the native setter, and the resulting native config is decoded into a newly
// allocated PaddleOcrConfig that is returned to the caller.
//
// Arguments:
//   - tier: `"mobile"` (default, lightweight, faster) or `"server"` (high accuracy, GPU/complex documents)
//
// Returns the decoded config, or a nil config and a non-nil error when any
// step of the round trip through the native layer fails.
func (r *PaddleOcrConfig) WithModelTier(tier string) (*PaddleOcrConfig, error) {
	// The C copy of tier must stay alive until the native setter has run.
	cTier := C.CString(tier)
	defer C.free(unsafe.Pointer(cTier))

	jsonBytesRecv, err := json.Marshal(r)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal receiver: %w", err)
	}
	// The C copy of the JSON is only needed for the from_json call itself.
	tmpStrRecv := C.CString(string(jsonBytesRecv))
	cRecv := C.kreuzberg_paddle_ocr_config_from_json(tmpStrRecv)
	C.free(unsafe.Pointer(tmpStrRecv))
	if cRecv == nil {
		return nil, fmt.Errorf("failed to create receiver: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_paddle_ocr_config_free(cRecv)

	ptr := C.kreuzberg_paddle_ocr_config_with_model_tier(cRecv, cTier)
	if ptr == nil {
		// Report the failure instead of returning (nil, nil).
		return nil, fmt.Errorf("failed to apply model_tier: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_paddle_ocr_config_free(ptr)

	jsonPtr := C.kreuzberg_paddle_ocr_config_to_json(ptr)
	if jsonPtr == nil {
		return nil, fmt.Errorf("failed to serialize result: %s", C.GoString(C.kreuzberg_last_error_context()))
	}
	defer C.kreuzberg_free_string(jsonPtr)

	var result PaddleOcrConfig
	if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
		return nil, fmt.Errorf("failed to unmarshal result: %w", err)
	}
	return &result, nil
}