7848 lines
289 KiB
Go
7848 lines
289 KiB
Go
|
|
// This file is auto-generated by alef — DO NOT EDIT.
|
|||
|
|
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
|||
|
|
// To regenerate: alef generate
|
|||
|
|
// To verify freshness: alef verify --exit-code
|
|||
|
|
// Issues & docs: https://github.com/kreuzberg-dev/alef
|
|||
|
|
|
|||
|
|
// Package kreuzberg provides Go bindings for the kreuzberg library.
|
|||
|
|
package kreuzberg
|
|||
|
|
|
|||
|
|
/*
|
|||
|
|
#cgo CFLAGS: -I${SRCDIR}/include
|
|||
|
|
#cgo darwin,arm64 LDFLAGS: -L${SRCDIR}/.lib/macos-arm64 -Wl,-rpath,${SRCDIR}/.lib/macos-arm64 -lkreuzberg_ffi
|
|||
|
|
#cgo darwin,amd64 LDFLAGS: -L${SRCDIR}/.lib/macos-amd64 -Wl,-rpath,${SRCDIR}/.lib/macos-amd64 -lkreuzberg_ffi
|
|||
|
|
#cgo linux,amd64 LDFLAGS: -L${SRCDIR}/.lib/linux-amd64 -Wl,-rpath,${SRCDIR}/.lib/linux-amd64 -lkreuzberg_ffi
|
|||
|
|
#cgo linux,arm64 LDFLAGS: -L${SRCDIR}/.lib/linux-arm64 -Wl,-rpath,${SRCDIR}/.lib/linux-arm64 -lkreuzberg_ffi
|
|||
|
|
#cgo windows,amd64 LDFLAGS: -L${SRCDIR}/.lib/windows-amd64 -lkreuzberg_ffi
|
|||
|
|
#include "kreuzberg.h"
|
|||
|
|
*/
|
|||
|
|
import "C"
|
|||
|
|
|
|||
|
|
import (
|
|||
|
|
"encoding/json"
|
|||
|
|
"errors"
|
|||
|
|
"fmt"
|
|||
|
|
"runtime"
|
|||
|
|
"unsafe"
|
|||
|
|
)
|
|||
|
|
|
|||
|
|
// lastError retrieves the last error from the FFI layer.
|
|||
|
|
func lastError() error {
|
|||
|
|
code := int32(C.kreuzberg_last_error_code())
|
|||
|
|
if code == 0 {
|
|||
|
|
return nil
|
|||
|
|
}
|
|||
|
|
ctx := C.kreuzberg_last_error_context()
|
|||
|
|
if ctx == nil {
|
|||
|
|
return fmt.Errorf("[%d] native error", code)
|
|||
|
|
}
|
|||
|
|
message := C.GoString(ctx)
|
|||
|
|
return fmt.Errorf("[%d] %s", code, message)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// unmarshalBytes copies a C byte buffer into a Go []byte.
|
|||
|
|
//
|
|||
|
|
// The pointer is treated as a NUL-terminated C string; binary payloads
|
|||
|
|
// that may contain interior NULs should be exposed by the FFI with an
|
|||
|
|
// explicit length out-parameter instead.
|
|||
|
|
func unmarshalBytes(ptr *C.uint8_t) []byte {
|
|||
|
|
if ptr == nil {
|
|||
|
|
return nil
|
|||
|
|
}
|
|||
|
|
return []byte(C.GoString((*C.char)(unsafe.Pointer(ptr))))
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Ptr returns a pointer to a copy of the given value.
//
// Used by data DTOs to construct pointers for optional fields without the
// functional-options pattern boilerplate. For example:
//
//	&MyStruct{Field: Ptr("value"), OtherField: Ptr(42)}
func Ptr[T any](v T) *T {
	return &v
}

// Sentinel errors exported by the package. They carry no per-call detail, so
// compare against them with errors.Is rather than by message text.
var (
	// ErrIo is the sentinel for IO errors.
	ErrIo = errors.New("IO error")
	// ErrParsing is the sentinel for parsing errors.
	ErrParsing = errors.New("parsing error")
	// ErrOcr is the sentinel for OCR errors.
	ErrOcr = errors.New("OCR error")
	// ErrValidation is the sentinel for validation errors.
	ErrValidation = errors.New("validation error")
	// ErrCache is the sentinel for cache errors.
	ErrCache = errors.New("cache error")
	// ErrImageProcessing is the sentinel for image processing errors.
	ErrImageProcessing = errors.New("image processing error")
	// ErrSerialization is the sentinel for serialization errors.
	ErrSerialization = errors.New("serialization error")
	// ErrMissingDependency is the sentinel for a missing dependency.
	ErrMissingDependency = errors.New("missing dependency")
	// ErrPlugin is the sentinel for plugin errors. The message names no
	// plugin: a sentinel has no per-call values to interpolate.
	ErrPlugin = errors.New("plugin error")
	// ErrLockPoisoned is the sentinel for a poisoned lock.
	ErrLockPoisoned = errors.New("lock poisoned")
	// ErrUnsupportedFormat is the sentinel for an unsupported format.
	ErrUnsupportedFormat = errors.New("unsupported format")
	// ErrEmbedding is the sentinel for embedding errors.
	ErrEmbedding = errors.New("embedding error")
	// ErrTimeout is the sentinel for an extraction that timed out. The
	// message omits the elapsed/limit durations: a sentinel has no per-call
	// values to interpolate.
	ErrTimeout = errors.New("extraction timed out")
	// ErrCancelled is the sentinel for a cancelled extraction.
	ErrCancelled = errors.New("extraction cancelled")
	// ErrSecurity is the sentinel for a security violation.
	ErrSecurity = errors.New("security violation")
	// ErrOther is the sentinel for errors in no other category.
	ErrOther = errors.New("other")
)

// Error is a structured error type pairing a code with a message.
type Error struct {
	// Code identifies the error.
	Code string
	// Message is the text returned by the Error method.
	Message string
}

// Error implements the error interface. It returns Message only; Code is not
// part of the text.
func (e Error) Error() string { return e.Message }

// ExecutionProviderType enumerates the selectable execution providers.
type ExecutionProviderType string

const (
	// ExecutionProviderTypeAuto auto-selects a provider: CoreML on macOS, CUDA on Linux, CPU elsewhere.
	ExecutionProviderTypeAuto ExecutionProviderType = "auto"
	// ExecutionProviderTypeCPU is the CPU execution provider (always available).
	ExecutionProviderTypeCPU ExecutionProviderType = "cpu"
	// ExecutionProviderTypeCoreMl is Apple CoreML (macOS/iOS Neural Engine + GPU).
	ExecutionProviderTypeCoreMl ExecutionProviderType = "core_ml"
	// ExecutionProviderTypeCuda is NVIDIA CUDA GPU acceleration.
	ExecutionProviderTypeCuda ExecutionProviderType = "cuda"
	// ExecutionProviderTypeTensorRt is NVIDIA TensorRT (optimized CUDA inference).
	ExecutionProviderTypeTensorRt ExecutionProviderType = "tensor_rt"
)

// OutputFormat selects the output format for extraction results.
//
// Controls the format of the `content` field in `ExtractionResult`.
// When set to `Markdown`, `Djot`, or `Html`, the output uses that format.
// `Plain` returns the raw extracted text.
// `Structured` returns JSON with full OCR element data including bounding
// boxes and confidence scores.
type OutputFormat string

const (
	// OutputFormatPlain is plain text content only (default).
	OutputFormatPlain OutputFormat = "plain"
	// OutputFormatMarkdown is Markdown format.
	OutputFormatMarkdown OutputFormat = "markdown"
	// OutputFormatDjot is Djot markup format.
	OutputFormatDjot OutputFormat = "djot"
	// OutputFormatHTML is HTML format.
	OutputFormatHTML OutputFormat = "html"
	// OutputFormatJSON is a JSON tree format with heading-driven sections.
	OutputFormatJSON OutputFormat = "json"
	// OutputFormatStructured is a structured JSON format with full OCR element metadata.
	OutputFormatStructured OutputFormat = "structured"
)

// HTMLTheme enumerates the available HTML themes.
type HTMLTheme string

const (
	// HTMLThemeDefault uses sensible defaults: system font stack, neutral colours, readable line
	// measure. CSS custom properties (`--kb-*`) are all defined so user CSS
	// can override individual values.
	HTMLThemeDefault HTMLTheme = "default"
	// HTMLThemeGitHub is a GitHub Markdown-inspired palette and spacing.
	HTMLThemeGitHub HTMLTheme = "git_hub"
	// HTMLThemeDark is a dark background with light text.
	HTMLThemeDark HTMLTheme = "dark"
	// HTMLThemeLight is a minimal light theme with generous whitespace.
	HTMLThemeLight HTMLTheme = "light"
	// HTMLThemeUnstyled emits no built-in stylesheet. CSS custom properties are still defined
	// on `:root` so user stylesheets can reference `var(--kb-*)` tokens.
	HTMLThemeUnstyled HTMLTheme = "unstyled"
)

// TableModel enumerates the table structure models that can be selected.
type TableModel string

const (
	// TableModelTatr is TATR (Table Transformer) -- default, 30MB, DETR-based row/column detection.
	TableModelTatr TableModel = "tatr"
	// TableModelSlanetWired is the SLANeXT wired variant -- 365MB, optimized for bordered tables.
	TableModelSlanetWired TableModel = "slanet_wired"
	// TableModelSlanetWireless is the SLANeXT wireless variant -- 365MB, optimized for borderless tables.
	TableModelSlanetWireless TableModel = "slanet_wireless"
	// TableModelSlanetPlus is SLANet-plus -- 7.78MB, lightweight general-purpose.
	TableModelSlanetPlus TableModel = "slanet_plus"
	// TableModelSlanetAuto is classifier-routed SLANeXT: auto-select wired/wireless per table.
	// Uses PP-LCNet classifier (6.78MB) + both SLANeXT variants (730MB total).
	TableModelSlanetAuto TableModel = "slanet_auto"
	// TableModelDisabled disables table structure model inference entirely; use heuristic path only.
	TableModelDisabled TableModel = "disabled"
)

// ChunkerType enumerates the available chunker kinds.
type ChunkerType string

const (
	// ChunkerTypeText is the Text variant of ChunkerType.
	ChunkerTypeText ChunkerType = "text"
	// ChunkerTypeMarkdown is the Markdown variant of ChunkerType.
	ChunkerTypeMarkdown ChunkerType = "markdown"
	// ChunkerTypeYaml is the Yaml variant of ChunkerType.
	ChunkerTypeYaml ChunkerType = "yaml"
	// ChunkerTypeSemantic is the Semantic variant of ChunkerType.
	ChunkerTypeSemantic ChunkerType = "semantic"
)

// ChunkSizing describes how chunk size is measured.
//
// Defaults to `Characters` (Unicode character count). When using token-based sizing,
// chunks are sized by token count according to the specified tokenizer.
//
// Token-based sizing uses HuggingFace tokenizers loaded at runtime. Any tokenizer
// available on HuggingFace Hub can be used, including OpenAI-compatible tokenizers
// (e.g., `Xenova/gpt-4o`, `Xenova/cl100k_base`).
//
// Variants: Characters, Tokenizer.
//
// Sealed interface — use one of ChunkSizingCharacters, ChunkSizingTokenizer.
type ChunkSizing interface {
	// isChunkSizing is unexported so that only types in this package can
	// implement the interface.
	isChunkSizing()
	// Type returns the variant's discriminator string.
	Type() string
}

// ChunkSizingCharacters measures chunk size in Unicode characters (default).
type ChunkSizingCharacters struct {
}

// isChunkSizing marks ChunkSizingCharacters as a ChunkSizing variant.
func (ChunkSizingCharacters) isChunkSizing() {}

// Type returns the discriminator "characters".
func (ChunkSizingCharacters) Type() string { return "characters" }

// MarshalJSON encodes the variant as an object whose only key is "type".
func (v ChunkSizingCharacters) MarshalJSON() ([]byte, error) {
	type aux struct {
		Type string `json:"type"`
	}
	return json.Marshal(aux{
		Type: v.Type(),
	})
}

// ChunkSizingTokenizer measures chunk size in tokens from a HuggingFace tokenizer.
type ChunkSizingTokenizer struct {
	// Model is the HuggingFace model ID or path, e.g. "Xenova/gpt-4o", "bert-base-uncased".
	Model string `json:"model"`
	// CacheDir is an optional cache directory override for tokenizer files.
	// Defaults to hf-hub's standard cache (`~/.cache/huggingface/`).
	// Can also be set via `KREUZBERG_TOKENIZER_CACHE_DIR` environment variable.
	CacheDir *string `json:"cache_dir,omitempty"`
}

// isChunkSizing marks ChunkSizingTokenizer as a ChunkSizing variant.
func (ChunkSizingTokenizer) isChunkSizing() {}

// Type returns the discriminator "tokenizer".
func (ChunkSizingTokenizer) Type() string { return "tokenizer" }

// MarshalJSON encodes the variant with a leading "type" key followed by its
// fields; "cache_dir" is omitted when CacheDir is nil.
func (v ChunkSizingTokenizer) MarshalJSON() ([]byte, error) {
	type aux struct {
		Type     string  `json:"type"`
		Model    string  `json:"model"`
		CacheDir *string `json:"cache_dir,omitempty"`
	}
	return json.Marshal(aux{
		Type:     v.Type(),
		Model:    v.Model,
		CacheDir: v.CacheDir,
	})
}

// UnmarshalChunkSizing decodes JSON data into the appropriate concrete ChunkSizing variant.
|
|||
|
|
func UnmarshalChunkSizing(data []byte) (ChunkSizing, error) {
|
|||
|
|
var wire struct {
|
|||
|
|
Type string `json:"type"`
|
|||
|
|
}
|
|||
|
|
if err := json.Unmarshal(data, &wire); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
switch wire.Type {
|
|||
|
|
case "characters":
|
|||
|
|
var v ChunkSizingCharacters
|
|||
|
|
if err := json.Unmarshal(data, &v); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
return v, nil
|
|||
|
|
case "tokenizer":
|
|||
|
|
var v ChunkSizingTokenizer
|
|||
|
|
if err := json.Unmarshal(data, &v); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
return v, nil
|
|||
|
|
}
|
|||
|
|
return nil, fmt.Errorf("unknown ChunkSizing type: %q", wire.Type)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// EmbeddingModelType describes the embedding model types supported by Kreuzberg.
//
// Variants: Preset, Custom, Llm, Plugin.
//
// Sealed interface — use one of EmbeddingModelTypePreset,
// EmbeddingModelTypeCustom, EmbeddingModelTypeLlm, EmbeddingModelTypePlugin.
type EmbeddingModelType interface {
	// isEmbeddingModelType is unexported so that only types in this package
	// can implement the interface.
	isEmbeddingModelType()
	// Type returns the variant's discriminator string.
	Type() string
}

// EmbeddingModelTypePreset uses a preset model configuration (recommended).
type EmbeddingModelTypePreset struct {
	// Name is the preset's name.
	Name string `json:"name"`
}

// isEmbeddingModelType marks EmbeddingModelTypePreset as an EmbeddingModelType variant.
func (EmbeddingModelTypePreset) isEmbeddingModelType() {}

// Type returns the discriminator "preset".
func (EmbeddingModelTypePreset) Type() string { return "preset" }

// MarshalJSON encodes the variant with a leading "type" key followed by "name".
func (v EmbeddingModelTypePreset) MarshalJSON() ([]byte, error) {
	type aux struct {
		Type string `json:"type"`
		Name string `json:"name"`
	}
	return json.Marshal(aux{
		Type: v.Type(),
		Name: v.Name,
	})
}

// EmbeddingModelTypeCustom uses a custom ONNX model from HuggingFace.
type EmbeddingModelTypeCustom struct {
	// ModelID identifies the model.
	ModelID string `json:"model_id"`
	// Dimensions is the model's dimension count.
	Dimensions uint `json:"dimensions"`
}

// isEmbeddingModelType marks EmbeddingModelTypeCustom as an EmbeddingModelType variant.
func (EmbeddingModelTypeCustom) isEmbeddingModelType() {}

// Type returns the discriminator "custom".
func (EmbeddingModelTypeCustom) Type() string { return "custom" }

// MarshalJSON encodes the variant with a leading "type" key followed by
// "model_id" and "dimensions".
func (v EmbeddingModelTypeCustom) MarshalJSON() ([]byte, error) {
	type aux struct {
		Type       string `json:"type"`
		ModelID    string `json:"model_id"`
		Dimensions uint   `json:"dimensions"`
	}
	return json.Marshal(aux{
		Type:       v.Type(),
		ModelID:    v.ModelID,
		Dimensions: v.Dimensions,
	})
}

// EmbeddingModelTypeLlm provider-hosted embedding model via liter-llm.
|
|||
|
|
//
|
|||
|
|
// Uses the model specified in the nested `LlmConfig` (e.g.,
|
|||
|
|
// `"openai/text-embedding-3-small"`).
|
|||
|
|
type EmbeddingModelTypeLlm struct {
|
|||
|
|
Llm LlmConfig `json:"llm"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
func (EmbeddingModelTypeLlm) isEmbeddingModelType() {}
|
|||
|
|
|
|||
|
|
func (EmbeddingModelTypeLlm) Type() string { return "llm" }
|
|||
|
|
|
|||
|
|
func (v EmbeddingModelTypeLlm) MarshalJSON() ([]byte, error) {
|
|||
|
|
type aux struct {
|
|||
|
|
Type string `json:"type"`
|
|||
|
|
Llm LlmConfig `json:"llm"`
|
|||
|
|
}
|
|||
|
|
return json.Marshal(aux{
|
|||
|
|
Type: v.Type(),
|
|||
|
|
Llm: v.Llm,
|
|||
|
|
})
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// EmbeddingModelTypePlugin is an in-process embedding backend registered via the plugin system.
//
// The caller registers an EmbeddingBackend once (e.g. a wrapper around an
// already-loaded `llama-cpp-python`, `sentence-transformers`, or tuned ONNX
// model), then references it by name in config. Kreuzberg calls back into the
// registered backend during chunking and standalone embed requests — no
// HuggingFace download, no ONNX Runtime requirement, no HTTP sidecar.
//
// When this variant is selected, only the following EmbeddingConfig fields
// apply: `normalize` (post-call L2 normalization) and `max_embed_duration_secs`
// (dispatcher timeout). Model-loading fields (`batch_size`, `cache_dir`,
// `show_download_progress`, `acceleration`) are ignored — the host owns the
// model lifecycle.
//
// Semantic chunking falls back to the chunking config's `max_characters` when
// this variant is used, since there is no preset to look a chunk-size ceiling
// up against — size your context window via `max_characters` directly.
//
// See `register_embedding_backend`.
type EmbeddingModelTypePlugin struct {
	// Name is the name under which the backend was registered.
	Name string `json:"name"`
}

// isEmbeddingModelType marks EmbeddingModelTypePlugin as an EmbeddingModelType variant.
func (EmbeddingModelTypePlugin) isEmbeddingModelType() {}

// Type returns the discriminator "plugin".
func (EmbeddingModelTypePlugin) Type() string { return "plugin" }

// MarshalJSON encodes the variant with a leading "type" key followed by "name".
func (v EmbeddingModelTypePlugin) MarshalJSON() ([]byte, error) {
	type aux struct {
		Type string `json:"type"`
		Name string `json:"name"`
	}
	return json.Marshal(aux{
		Type: v.Type(),
		Name: v.Name,
	})
}

// UnmarshalEmbeddingModelType decodes JSON data into the appropriate concrete EmbeddingModelType variant.
|
|||
|
|
func UnmarshalEmbeddingModelType(data []byte) (EmbeddingModelType, error) {
|
|||
|
|
var wire struct {
|
|||
|
|
Type string `json:"type"`
|
|||
|
|
}
|
|||
|
|
if err := json.Unmarshal(data, &wire); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
switch wire.Type {
|
|||
|
|
case "preset":
|
|||
|
|
var v EmbeddingModelTypePreset
|
|||
|
|
if err := json.Unmarshal(data, &v); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
return v, nil
|
|||
|
|
case "custom":
|
|||
|
|
var v EmbeddingModelTypeCustom
|
|||
|
|
if err := json.Unmarshal(data, &v); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
return v, nil
|
|||
|
|
case "llm":
|
|||
|
|
var v EmbeddingModelTypeLlm
|
|||
|
|
if err := json.Unmarshal(data, &v); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
return v, nil
|
|||
|
|
case "plugin":
|
|||
|
|
var v EmbeddingModelTypePlugin
|
|||
|
|
if err := json.Unmarshal(data, &v); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
return v, nil
|
|||
|
|
}
|
|||
|
|
return nil, fmt.Errorf("unknown EmbeddingModelType type: %q", wire.Type)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// CodeContentMode enumerates what is used as content for code.
type CodeContentMode string

const (
	// CodeContentModeChunks uses TSLP semantic chunks as content (default).
	CodeContentModeChunks CodeContentMode = "chunks"
	// CodeContentModeRaw uses raw source code as content.
	CodeContentModeRaw CodeContentMode = "raw"
	// CodeContentModeStructure emits function/class headings + docstrings (no code bodies).
	CodeContentModeStructure CodeContentMode = "structure"
)

// ListType enumerates the kinds of list.
type ListType string

const (
	// ListTypeBullet is bullet points (-, *, •, etc.).
	ListTypeBullet ListType = "bullet"
	// ListTypeNumbered is numbered lists (1., 2., etc.).
	ListTypeNumbered ListType = "numbered"
	// ListTypeLettered is lettered lists (a., b., A., B., etc.).
	ListTypeLettered ListType = "lettered"
	// ListTypeIndented is indented items.
	ListTypeIndented ListType = "indented"
)

// OcrBackendType enumerates the OCR backends.
type OcrBackendType string

const (
	// OcrBackendTypeTesseract is Tesseract OCR (native Rust binding).
	OcrBackendTypeTesseract OcrBackendType = "tesseract"
	// OcrBackendTypeEasyOcr is EasyOCR (Python-based, via FFI).
	OcrBackendTypeEasyOcr OcrBackendType = "easy_ocr"
	// OcrBackendTypePaddleOcr is PaddleOCR (Python-based, via FFI).
	OcrBackendTypePaddleOcr OcrBackendType = "paddle_ocr"
	// OcrBackendTypeCustom is a custom/third-party OCR backend.
	OcrBackendTypeCustom OcrBackendType = "custom"
)

// ProcessingStage enumerates the processing stages.
type ProcessingStage string

const (
	// ProcessingStageEarly is the early stage - foundational processing.
	//
	// Use for:
	//   - Language detection
	//   - Character encoding normalization
	//   - Entity extraction (NER)
	//   - Text quality scoring
	ProcessingStageEarly ProcessingStage = "early"
	// ProcessingStageMiddle is the middle stage - content transformation.
	//
	// Use for:
	//   - Keyword extraction
	//   - Token reduction
	//   - Text summarization
	//   - Semantic analysis
	ProcessingStageMiddle ProcessingStage = "middle"
	// ProcessingStageLate is the late stage - final enrichment.
	//
	// Use for:
	//   - Custom user hooks
	//   - Analytics/logging
	//   - Final validation
	//   - Output formatting
	ProcessingStageLate ProcessingStage = "late"
)

// ReductionLevel enumerates the reduction levels.
type ReductionLevel string

const (
	// ReductionLevelOff is the Off variant of ReductionLevel.
	ReductionLevelOff ReductionLevel = "off"
	// ReductionLevelLight is the Light variant of ReductionLevel.
	ReductionLevelLight ReductionLevel = "light"
	// ReductionLevelModerate is the Moderate variant of ReductionLevel.
	ReductionLevelModerate ReductionLevel = "moderate"
	// ReductionLevelAggressive is the Aggressive variant of ReductionLevel.
	ReductionLevelAggressive ReductionLevel = "aggressive"
	// ReductionLevelMaximum is the Maximum variant of ReductionLevel.
	ReductionLevelMaximum ReductionLevel = "maximum"
)

// PdfAnnotationType enumerates the kinds of PDF annotation.
type PdfAnnotationType string

const (
	// PdfAnnotationTypeText is a sticky note / text annotation.
	PdfAnnotationTypeText PdfAnnotationType = "text"
	// PdfAnnotationTypeHighlight is a highlighted text region.
	PdfAnnotationTypeHighlight PdfAnnotationType = "highlight"
	// PdfAnnotationTypeLink is a hyperlink annotation.
	PdfAnnotationTypeLink PdfAnnotationType = "link"
	// PdfAnnotationTypeStamp is a rubber stamp annotation.
	PdfAnnotationTypeStamp PdfAnnotationType = "stamp"
	// PdfAnnotationTypeUnderline is underline text markup.
	PdfAnnotationTypeUnderline PdfAnnotationType = "underline"
	// PdfAnnotationTypeStrikeOut is strikeout text markup.
	PdfAnnotationTypeStrikeOut PdfAnnotationType = "strike_out"
	// PdfAnnotationTypeOther is any other annotation type.
	PdfAnnotationTypeOther PdfAnnotationType = "other"
)

// BlockType enumerates the kinds of block-level element.
type BlockType string

const (
	// BlockTypeParagraph is the Paragraph variant of BlockType.
	BlockTypeParagraph BlockType = "paragraph"
	// BlockTypeHeading is the Heading variant of BlockType.
	BlockTypeHeading BlockType = "heading"
	// BlockTypeBlockquote is the Blockquote variant of BlockType.
	BlockTypeBlockquote BlockType = "blockquote"
	// BlockTypeCodeBlock is the CodeBlock variant of BlockType.
	BlockTypeCodeBlock BlockType = "code_block"
	// BlockTypeListItem is the ListItem variant of BlockType.
	BlockTypeListItem BlockType = "list_item"
	// BlockTypeOrderedList is the OrderedList variant of BlockType.
	BlockTypeOrderedList BlockType = "ordered_list"
	// BlockTypeBulletList is the BulletList variant of BlockType.
	BlockTypeBulletList BlockType = "bullet_list"
	// BlockTypeTaskList is the TaskList variant of BlockType.
	BlockTypeTaskList BlockType = "task_list"
	// BlockTypeDefinitionList is the DefinitionList variant of BlockType.
	BlockTypeDefinitionList BlockType = "definition_list"
	// BlockTypeDefinitionTerm is the DefinitionTerm variant of BlockType.
	BlockTypeDefinitionTerm BlockType = "definition_term"
	// BlockTypeDefinitionDescription is the DefinitionDescription variant of BlockType.
	BlockTypeDefinitionDescription BlockType = "definition_description"
	// BlockTypeDiv is the Div variant of BlockType.
	BlockTypeDiv BlockType = "div"
	// BlockTypeSection is the Section variant of BlockType.
	BlockTypeSection BlockType = "section"
	// BlockTypeThematicBreak is the ThematicBreak variant of BlockType.
	BlockTypeThematicBreak BlockType = "thematic_break"
	// BlockTypeRawBlock is the RawBlock variant of BlockType.
	BlockTypeRawBlock BlockType = "raw_block"
	// BlockTypeMathDisplay is the MathDisplay variant of BlockType.
	BlockTypeMathDisplay BlockType = "math_display"
)

// InlineType enumerates the kinds of inline element.
type InlineType string

const (
	// InlineTypeText is the Text variant of InlineType.
	InlineTypeText InlineType = "text"
	// InlineTypeStrong is the Strong variant of InlineType.
	InlineTypeStrong InlineType = "strong"
	// InlineTypeEmphasis is the Emphasis variant of InlineType.
	InlineTypeEmphasis InlineType = "emphasis"
	// InlineTypeHighlight is the Highlight variant of InlineType.
	InlineTypeHighlight InlineType = "highlight"
	// InlineTypeSubscript is the Subscript variant of InlineType.
	InlineTypeSubscript InlineType = "subscript"
	// InlineTypeSuperscript is the Superscript variant of InlineType.
	InlineTypeSuperscript InlineType = "superscript"
	// InlineTypeInsert is the Insert variant of InlineType.
	InlineTypeInsert InlineType = "insert"
	// InlineTypeDelete is the Delete variant of InlineType.
	InlineTypeDelete InlineType = "delete"
	// InlineTypeCode is the Code variant of InlineType.
	InlineTypeCode InlineType = "code"
	// InlineTypeLink is the Link variant of InlineType.
	InlineTypeLink InlineType = "link"
	// InlineTypeImage is the Image variant of InlineType.
	InlineTypeImage InlineType = "image"
	// InlineTypeSpan is the Span variant of InlineType.
	InlineTypeSpan InlineType = "span"
	// InlineTypeMath is the Math variant of InlineType.
	InlineTypeMath InlineType = "math"
	// InlineTypeRawInline is the RawInline variant of InlineType.
	InlineTypeRawInline InlineType = "raw_inline"
	// InlineTypeFootnoteRef is the FootnoteRef variant of InlineType.
	InlineTypeFootnoteRef InlineType = "footnote_ref"
	// InlineTypeSymbol is the Symbol variant of InlineType.
	InlineTypeSymbol InlineType = "symbol"
)

// RelationshipKind enumerates the kinds of relationship between elements.
type RelationshipKind string

const (
	// RelationshipKindFootnoteReference is footnote marker -> footnote definition.
	RelationshipKindFootnoteReference RelationshipKind = "footnote_reference"
	// RelationshipKindCitationReference is citation marker -> bibliography entry.
	RelationshipKindCitationReference RelationshipKind = "citation_reference"
	// RelationshipKindInternalLink is internal anchor link (`#id`) -> target heading/element.
	RelationshipKindInternalLink RelationshipKind = "internal_link"
	// RelationshipKindCaption is caption paragraph -> figure/table it describes.
	RelationshipKindCaption RelationshipKind = "caption"
	// RelationshipKindLabel is label -> labeled element (HTML `<label for>`, LaTeX `\label{}`).
	RelationshipKindLabel RelationshipKind = "label"
	// RelationshipKindTocEntry is TOC entry -> target section.
	RelationshipKindTocEntry RelationshipKind = "toc_entry"
	// RelationshipKindCrossReference is cross-reference (LaTeX `\ref{}`, DOCX cross-reference field).
	RelationshipKindCrossReference RelationshipKind = "cross_reference"
)

// ContentLayer enumerates the layers of a document that content can belong to.
type ContentLayer string

const (
	// ContentLayerBody is main document body content.
	ContentLayerBody ContentLayer = "body"
	// ContentLayerHeader is the page/section header (running header).
	ContentLayerHeader ContentLayer = "header"
	// ContentLayerFooter is the page/section footer (running footer).
	ContentLayerFooter ContentLayer = "footer"
	// ContentLayerFootnote is footnote content.
	ContentLayerFootnote ContentLayer = "footnote"
)

// NodeContent is a tagged enum for node content. Each variant carries only type-specific data.
//
// Uses `#[serde(tag = "node_type")]` to avoid "type" keyword collision in
// Go/Java/TypeScript bindings.
//
// Variants: Title, Heading, Paragraph, List, ListItem, Table, Image, Code, Quote, Formula, Footnote, Group, PageBreak, Slide, DefinitionList, DefinitionItem, Citation, Admonition, RawBlock, MetadataBlock.
//
// Sealed interface — use one of the NodeContent* variant structs
// (NodeContentTitle, NodeContentHeading, NodeContentParagraph, ...).
type NodeContent interface {
	// isNodeContent is unexported so that only types in this package can
	// implement the interface.
	isNodeContent()
	// Type returns the variant's discriminator string.
	Type() string
}

// NodeContentTitle is the document title variant of NodeContent.
type NodeContentTitle struct {
	Text string `json:"text"`
}

// isNodeContent marks NodeContentTitle as a NodeContent variant.
func (NodeContentTitle) isNodeContent() {}

// Type returns the "node_type" discriminator for this variant, "title".
func (NodeContentTitle) Type() string { return "title" }

// MarshalJSON encodes v as a JSON object that starts with the "node_type"
// discriminator (the value of Type) followed by the variant's fields.
func (v NodeContentTitle) MarshalJSON() ([]byte, error) {
	type aux struct {
		NodeType string `json:"node_type"`
		Text     string `json:"text"`
	}
	return json.Marshal(aux{
		NodeType: v.Type(),
		Text:     v.Text,
	})
}

// NodeContentHeading is a section heading with a level (1-6).
type NodeContentHeading struct {
	Level uint8  `json:"level"`
	Text  string `json:"text"`
}

// isNodeContent marks NodeContentHeading as a NodeContent variant.
func (NodeContentHeading) isNodeContent() {}

// Type returns the "node_type" discriminator for this variant, "heading".
func (NodeContentHeading) Type() string { return "heading" }

// MarshalJSON encodes v as a JSON object that starts with the "node_type"
// discriminator (the value of Type) followed by the variant's fields.
func (v NodeContentHeading) MarshalJSON() ([]byte, error) {
	type aux struct {
		NodeType string `json:"node_type"`
		Level    uint8  `json:"level"`
		Text     string `json:"text"`
	}
	return json.Marshal(aux{
		NodeType: v.Type(),
		Level:    v.Level,
		Text:     v.Text,
	})
}

// NodeContentParagraph is a body text paragraph.
type NodeContentParagraph struct {
	Text string `json:"text"`
}

// isNodeContent marks NodeContentParagraph as a NodeContent variant.
func (NodeContentParagraph) isNodeContent() {}

// Type returns the "node_type" discriminator for this variant, "paragraph".
func (NodeContentParagraph) Type() string { return "paragraph" }

// MarshalJSON encodes v as a JSON object that starts with the "node_type"
// discriminator (the value of Type) followed by the variant's fields.
func (v NodeContentParagraph) MarshalJSON() ([]byte, error) {
	type aux struct {
		NodeType string `json:"node_type"`
		Text     string `json:"text"`
	}
	return json.Marshal(aux{
		NodeType: v.Type(),
		Text:     v.Text,
	})
}

// NodeContentList is a list container — children are `ListItem` nodes.
type NodeContentList struct {
	Ordered bool `json:"ordered"`
}

// isNodeContent marks NodeContentList as a NodeContent variant.
func (NodeContentList) isNodeContent() {}

// Type returns the "node_type" discriminator for this variant, "list".
func (NodeContentList) Type() string { return "list" }

// MarshalJSON encodes v as a JSON object that starts with the "node_type"
// discriminator (the value of Type) followed by the variant's fields.
func (v NodeContentList) MarshalJSON() ([]byte, error) {
	type aux struct {
		NodeType string `json:"node_type"`
		Ordered  bool   `json:"ordered"`
	}
	return json.Marshal(aux{
		NodeType: v.Type(),
		Ordered:  v.Ordered,
	})
}

// NodeContentListItem is an individual list item.
type NodeContentListItem struct {
	Text string `json:"text"`
}

// isNodeContent marks NodeContentListItem as a NodeContent variant.
func (NodeContentListItem) isNodeContent() {}

// Type returns the "node_type" discriminator for this variant, "list_item".
func (NodeContentListItem) Type() string { return "list_item" }

// MarshalJSON encodes v as a JSON object that starts with the "node_type"
// discriminator (the value of Type) followed by the variant's fields.
func (v NodeContentListItem) MarshalJSON() ([]byte, error) {
	type aux struct {
		NodeType string `json:"node_type"`
		Text     string `json:"text"`
	}
	return json.Marshal(aux{
		NodeType: v.Type(),
		Text:     v.Text,
	})
}

// NodeContentTable table with structured cell grid.
|
|||
|
|
type NodeContentTable struct {
|
|||
|
|
Grid TableGrid `json:"grid"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
func (NodeContentTable) isNodeContent() {}
|
|||
|
|
|
|||
|
|
func (NodeContentTable) Type() string { return "table" }
|
|||
|
|
|
|||
|
|
func (v NodeContentTable) MarshalJSON() ([]byte, error) {
|
|||
|
|
type aux struct {
|
|||
|
|
NodeType string `json:"node_type"`
|
|||
|
|
Grid TableGrid `json:"grid"`
|
|||
|
|
}
|
|||
|
|
return json.Marshal(aux{
|
|||
|
|
NodeType: v.Type(),
|
|||
|
|
Grid: v.Grid,
|
|||
|
|
})
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// NodeContentImage is an image reference.
type NodeContentImage struct {
	Description *string `json:"description,omitempty"`
	ImageIndex  *uint32 `json:"image_index,omitempty"`
	// Src is the source URL or path of the image (for example the value of
	// an HTML `<img src="...">` attribute).
	Src *string `json:"src,omitempty"`
}

// isNodeContent marks NodeContentImage as a NodeContent variant.
func (NodeContentImage) isNodeContent() {}

// Type returns the "node_type" discriminator for this variant, "image".
func (NodeContentImage) Type() string { return "image" }

// MarshalJSON encodes v as a JSON object that starts with the "node_type"
// discriminator (the value of Type) followed by the variant's fields.
// Optional fields that are nil are omitted from the output.
func (v NodeContentImage) MarshalJSON() ([]byte, error) {
	type aux struct {
		NodeType    string  `json:"node_type"`
		Description *string `json:"description,omitempty"`
		ImageIndex  *uint32 `json:"image_index,omitempty"`
		Src         *string `json:"src,omitempty"`
	}
	return json.Marshal(aux{
		NodeType:    v.Type(),
		Description: v.Description,
		ImageIndex:  v.ImageIndex,
		Src:         v.Src,
	})
}

// NodeContentCode is a code block.
type NodeContentCode struct {
	Text     string  `json:"text"`
	Language *string `json:"language,omitempty"`
}

// isNodeContent marks NodeContentCode as a NodeContent variant.
func (NodeContentCode) isNodeContent() {}

// Type returns the "node_type" discriminator for this variant, "code".
func (NodeContentCode) Type() string { return "code" }

// MarshalJSON encodes v as a JSON object that starts with the "node_type"
// discriminator (the value of Type) followed by the variant's fields.
// A nil Language is omitted from the output.
func (v NodeContentCode) MarshalJSON() ([]byte, error) {
	type aux struct {
		NodeType string  `json:"node_type"`
		Text     string  `json:"text"`
		Language *string `json:"language,omitempty"`
	}
	return json.Marshal(aux{
		NodeType: v.Type(),
		Text:     v.Text,
		Language: v.Language,
	})
}

// NodeContentQuote is a block quote — a container whose children carry the
// quoted content.
type NodeContentQuote struct{}

// isNodeContent marks NodeContentQuote as a NodeContent variant.
func (NodeContentQuote) isNodeContent() {}

// Type returns the "node_type" discriminator for this variant, "quote".
func (NodeContentQuote) Type() string { return "quote" }

// MarshalJSON encodes v as a JSON object containing only the "node_type"
// discriminator (the value of Type).
func (v NodeContentQuote) MarshalJSON() ([]byte, error) {
	type aux struct {
		NodeType string `json:"node_type"`
	}
	return json.Marshal(aux{NodeType: v.Type()})
}

// NodeContentFormula is a mathematical formula / equation.
type NodeContentFormula struct {
	Text string `json:"text"`
}

// isNodeContent marks NodeContentFormula as a NodeContent variant.
func (NodeContentFormula) isNodeContent() {}

// Type returns the "node_type" discriminator for this variant, "formula".
func (NodeContentFormula) Type() string { return "formula" }

// MarshalJSON encodes v as a JSON object that starts with the "node_type"
// discriminator (the value of Type) followed by the variant's fields.
func (v NodeContentFormula) MarshalJSON() ([]byte, error) {
	type aux struct {
		NodeType string `json:"node_type"`
		Text     string `json:"text"`
	}
	return json.Marshal(aux{
		NodeType: v.Type(),
		Text:     v.Text,
	})
}

// NodeContentFootnote is footnote reference content.
type NodeContentFootnote struct {
	Text string `json:"text"`
}

// isNodeContent marks NodeContentFootnote as a NodeContent variant.
func (NodeContentFootnote) isNodeContent() {}

// Type returns the "node_type" discriminator for this variant, "footnote".
func (NodeContentFootnote) Type() string { return "footnote" }

// MarshalJSON encodes v as a JSON object that starts with the "node_type"
// discriminator (the value of Type) followed by the variant's fields.
func (v NodeContentFootnote) MarshalJSON() ([]byte, error) {
	type aux struct {
		NodeType string `json:"node_type"`
		Text     string `json:"text"`
	}
	return json.Marshal(aux{
		NodeType: v.Type(),
		Text:     v.Text,
	})
}

// NodeContentGroup is a logical grouping container (section, key-value area).
//
// `heading_level` + `heading_text` capture the section heading directly
// rather than relying on a first-child positional convention.
type NodeContentGroup struct {
	Label        *string `json:"label,omitempty"`
	HeadingLevel *uint8  `json:"heading_level,omitempty"`
	HeadingText  *string `json:"heading_text,omitempty"`
}

// isNodeContent marks NodeContentGroup as a NodeContent variant.
func (NodeContentGroup) isNodeContent() {}

// Type returns the "node_type" discriminator for this variant, "group".
func (NodeContentGroup) Type() string { return "group" }

// MarshalJSON encodes v as a JSON object that starts with the "node_type"
// discriminator (the value of Type) followed by the variant's fields.
// Optional fields that are nil are omitted from the output.
func (v NodeContentGroup) MarshalJSON() ([]byte, error) {
	type aux struct {
		NodeType     string  `json:"node_type"`
		Label        *string `json:"label,omitempty"`
		HeadingLevel *uint8  `json:"heading_level,omitempty"`
		HeadingText  *string `json:"heading_text,omitempty"`
	}
	return json.Marshal(aux{
		NodeType:     v.Type(),
		Label:        v.Label,
		HeadingLevel: v.HeadingLevel,
		HeadingText:  v.HeadingText,
	})
}

// NodeContentPageBreak is a page break marker.
type NodeContentPageBreak struct{}

// isNodeContent marks NodeContentPageBreak as a NodeContent variant.
func (NodeContentPageBreak) isNodeContent() {}

// Type returns the "node_type" discriminator for this variant, "page_break".
func (NodeContentPageBreak) Type() string { return "page_break" }

// MarshalJSON encodes v as a JSON object containing only the "node_type"
// discriminator (the value of Type).
func (v NodeContentPageBreak) MarshalJSON() ([]byte, error) {
	type aux struct {
		NodeType string `json:"node_type"`
	}
	return json.Marshal(aux{NodeType: v.Type()})
}

// NodeContentSlide is a presentation slide container — children are the
// slide's content nodes.
type NodeContentSlide struct {
	// Number is the 1-indexed slide number.
	Number uint32  `json:"number"`
	Title  *string `json:"title,omitempty"`
}

// isNodeContent marks NodeContentSlide as a NodeContent variant.
func (NodeContentSlide) isNodeContent() {}

// Type returns the "node_type" discriminator for this variant, "slide".
func (NodeContentSlide) Type() string { return "slide" }

// MarshalJSON encodes v as a JSON object that starts with the "node_type"
// discriminator (the value of Type) followed by the variant's fields.
// A nil Title is omitted from the output.
func (v NodeContentSlide) MarshalJSON() ([]byte, error) {
	type aux struct {
		NodeType string  `json:"node_type"`
		Number   uint32  `json:"number"`
		Title    *string `json:"title,omitempty"`
	}
	return json.Marshal(aux{
		NodeType: v.Type(),
		Number:   v.Number,
		Title:    v.Title,
	})
}

// NodeContentDefinitionList is a definition list container — children are
// `DefinitionItem` nodes.
type NodeContentDefinitionList struct{}

// isNodeContent marks NodeContentDefinitionList as a NodeContent variant.
func (NodeContentDefinitionList) isNodeContent() {}

// Type returns the "node_type" discriminator for this variant,
// "definition_list".
func (NodeContentDefinitionList) Type() string { return "definition_list" }

// MarshalJSON encodes v as a JSON object containing only the "node_type"
// discriminator (the value of Type).
func (v NodeContentDefinitionList) MarshalJSON() ([]byte, error) {
	type aux struct {
		NodeType string `json:"node_type"`
	}
	return json.Marshal(aux{NodeType: v.Type()})
}

// NodeContentDefinitionItem is an individual definition list entry with a
// term and its definition.
type NodeContentDefinitionItem struct {
	Term       string `json:"term"`
	Definition string `json:"definition"`
}

// isNodeContent marks NodeContentDefinitionItem as a NodeContent variant.
func (NodeContentDefinitionItem) isNodeContent() {}

// Type returns the "node_type" discriminator for this variant,
// "definition_item".
func (NodeContentDefinitionItem) Type() string { return "definition_item" }

// MarshalJSON encodes v as a JSON object that starts with the "node_type"
// discriminator (the value of Type) followed by the variant's fields.
func (v NodeContentDefinitionItem) MarshalJSON() ([]byte, error) {
	type aux struct {
		NodeType   string `json:"node_type"`
		Term       string `json:"term"`
		Definition string `json:"definition"`
	}
	return json.Marshal(aux{
		NodeType:   v.Type(),
		Term:       v.Term,
		Definition: v.Definition,
	})
}

// NodeContentCitation is a citation or bibliographic reference.
type NodeContentCitation struct {
	Key  string `json:"key"`
	Text string `json:"text"`
}

// isNodeContent marks NodeContentCitation as a NodeContent variant.
func (NodeContentCitation) isNodeContent() {}

// Type returns the "node_type" discriminator for this variant, "citation".
func (NodeContentCitation) Type() string { return "citation" }

// MarshalJSON encodes v as a JSON object that starts with the "node_type"
// discriminator (the value of Type) followed by the variant's fields.
func (v NodeContentCitation) MarshalJSON() ([]byte, error) {
	type aux struct {
		NodeType string `json:"node_type"`
		Key      string `json:"key"`
		Text     string `json:"text"`
	}
	return json.Marshal(aux{
		NodeType: v.Type(),
		Key:      v.Key,
		Text:     v.Text,
	})
}

// NodeContentAdmonition is an admonition / callout container (note, warning,
// tip, etc.).
//
// Children carry the admonition body content.
type NodeContentAdmonition struct {
	// Kind is the kind of admonition (e.g. "note", "warning", "tip", "danger").
	Kind  string  `json:"kind"`
	Title *string `json:"title,omitempty"`
}

// isNodeContent marks NodeContentAdmonition as a NodeContent variant.
func (NodeContentAdmonition) isNodeContent() {}

// Type returns the "node_type" discriminator for this variant, "admonition".
func (NodeContentAdmonition) Type() string { return "admonition" }

// MarshalJSON encodes v as a JSON object that starts with the "node_type"
// discriminator (the value of Type) followed by the variant's fields.
// A nil Title is omitted from the output.
func (v NodeContentAdmonition) MarshalJSON() ([]byte, error) {
	type aux struct {
		NodeType string  `json:"node_type"`
		Kind     string  `json:"kind"`
		Title    *string `json:"title,omitempty"`
	}
	return json.Marshal(aux{
		NodeType: v.Type(),
		Kind:     v.Kind,
		Title:    v.Title,
	})
}

// NodeContentRawBlock is a raw block preserved verbatim from the source format.
//
// Used for content that cannot be mapped to a semantic node type
// (e.g. JSX in MDX, raw LaTeX in markdown, embedded HTML).
type NodeContentRawBlock struct {
	// Format is the source format identifier (e.g. "html", "latex", "jsx").
	Format  string `json:"format"`
	Content string `json:"content"`
}

// isNodeContent marks NodeContentRawBlock as a NodeContent variant.
func (NodeContentRawBlock) isNodeContent() {}

// Type returns the "node_type" discriminator for this variant, "raw_block".
func (NodeContentRawBlock) Type() string { return "raw_block" }

// MarshalJSON encodes v as a JSON object that starts with the "node_type"
// discriminator (the value of Type) followed by the variant's fields.
func (v NodeContentRawBlock) MarshalJSON() ([]byte, error) {
	type aux struct {
		NodeType string `json:"node_type"`
		Format   string `json:"format"`
		Content  string `json:"content"`
	}
	return json.Marshal(aux{
		NodeType: v.Type(),
		Format:   v.Format,
		Content:  v.Content,
	})
}

// NodeContentMetadataBlock is a structured metadata block (email headers,
// YAML frontmatter, etc.).
type NodeContentMetadataBlock struct {
	Entries [][]string `json:"entries"`
}

// isNodeContent marks NodeContentMetadataBlock as a NodeContent variant.
func (NodeContentMetadataBlock) isNodeContent() {}

// Type returns the "node_type" discriminator for this variant,
// "metadata_block".
func (NodeContentMetadataBlock) Type() string { return "metadata_block" }

// MarshalJSON encodes v as a JSON object that starts with the "node_type"
// discriminator (the value of Type) followed by the "entries" array.
//
// A nil Entries slice is encoded as an empty array rather than null: the
// field is required on the wire (it is neither a pointer nor omitempty, which
// is how optional fields are declared in this file).
// NOTE(review): presumably the native decoder rejects null for this field —
// confirm against the kreuzberg schema.
func (v NodeContentMetadataBlock) MarshalJSON() ([]byte, error) {
	type aux struct {
		NodeType string     `json:"node_type"`
		Entries  [][]string `json:"entries"`
	}
	entries := v.Entries
	if entries == nil {
		// encoding/json writes a nil slice as null; normalize to [].
		entries = [][]string{}
	}
	return json.Marshal(aux{
		NodeType: v.Type(),
		Entries:  entries,
	})
}

// UnmarshalNodeContent decodes JSON data into the appropriate concrete NodeContent variant.
|
|||
|
|
func UnmarshalNodeContent(data []byte) (NodeContent, error) {
|
|||
|
|
var wire struct {
|
|||
|
|
NodeType string `json:"node_type"`
|
|||
|
|
}
|
|||
|
|
if err := json.Unmarshal(data, &wire); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
switch wire.NodeType {
|
|||
|
|
case "title":
|
|||
|
|
var v NodeContentTitle
|
|||
|
|
if err := json.Unmarshal(data, &v); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
return v, nil
|
|||
|
|
case "heading":
|
|||
|
|
var v NodeContentHeading
|
|||
|
|
if err := json.Unmarshal(data, &v); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
return v, nil
|
|||
|
|
case "paragraph":
|
|||
|
|
var v NodeContentParagraph
|
|||
|
|
if err := json.Unmarshal(data, &v); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
return v, nil
|
|||
|
|
case "list":
|
|||
|
|
var v NodeContentList
|
|||
|
|
if err := json.Unmarshal(data, &v); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
return v, nil
|
|||
|
|
case "list_item":
|
|||
|
|
var v NodeContentListItem
|
|||
|
|
if err := json.Unmarshal(data, &v); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
return v, nil
|
|||
|
|
case "table":
|
|||
|
|
var v NodeContentTable
|
|||
|
|
if err := json.Unmarshal(data, &v); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
return v, nil
|
|||
|
|
case "image":
|
|||
|
|
var v NodeContentImage
|
|||
|
|
if err := json.Unmarshal(data, &v); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
return v, nil
|
|||
|
|
case "code":
|
|||
|
|
var v NodeContentCode
|
|||
|
|
if err := json.Unmarshal(data, &v); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
return v, nil
|
|||
|
|
case "quote":
|
|||
|
|
var v NodeContentQuote
|
|||
|
|
if err := json.Unmarshal(data, &v); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
return v, nil
|
|||
|
|
case "formula":
|
|||
|
|
var v NodeContentFormula
|
|||
|
|
if err := json.Unmarshal(data, &v); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
return v, nil
|
|||
|
|
case "footnote":
|
|||
|
|
var v NodeContentFootnote
|
|||
|
|
if err := json.Unmarshal(data, &v); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
return v, nil
|
|||
|
|
case "group":
|
|||
|
|
var v NodeContentGroup
|
|||
|
|
if err := json.Unmarshal(data, &v); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
return v, nil
|
|||
|
|
case "page_break":
|
|||
|
|
var v NodeContentPageBreak
|
|||
|
|
if err := json.Unmarshal(data, &v); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
return v, nil
|
|||
|
|
case "slide":
|
|||
|
|
var v NodeContentSlide
|
|||
|
|
if err := json.Unmarshal(data, &v); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
return v, nil
|
|||
|
|
case "definition_list":
|
|||
|
|
var v NodeContentDefinitionList
|
|||
|
|
if err := json.Unmarshal(data, &v); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
return v, nil
|
|||
|
|
case "definition_item":
|
|||
|
|
var v NodeContentDefinitionItem
|
|||
|
|
if err := json.Unmarshal(data, &v); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
return v, nil
|
|||
|
|
case "citation":
|
|||
|
|
var v NodeContentCitation
|
|||
|
|
if err := json.Unmarshal(data, &v); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
return v, nil
|
|||
|
|
case "admonition":
|
|||
|
|
var v NodeContentAdmonition
|
|||
|
|
if err := json.Unmarshal(data, &v); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
return v, nil
|
|||
|
|
case "raw_block":
|
|||
|
|
var v NodeContentRawBlock
|
|||
|
|
if err := json.Unmarshal(data, &v); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
return v, nil
|
|||
|
|
case "metadata_block":
|
|||
|
|
var v NodeContentMetadataBlock
|
|||
|
|
if err := json.Unmarshal(data, &v); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
return v, nil
|
|||
|
|
}
|
|||
|
|
return nil, fmt.Errorf("unknown NodeContent type: %q", wire.NodeType)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// AnnotationKind enumerates the types of inline text annotations.
//
// Variants: Bold, Italic, Underline, Strikethrough, Code, Subscript,
// Superscript, Link, Highlight, Color, FontSize, Custom.
//
// AnnotationKind is a sealed interface — use one of AnnotationKindBold,
// AnnotationKindItalic, AnnotationKindUnderline, AnnotationKindStrikethrough,
// AnnotationKindCode, AnnotationKindSubscript, AnnotationKindSuperscript,
// AnnotationKindLink, AnnotationKindHighlight, AnnotationKindColor,
// AnnotationKindFontSize, or AnnotationKindCustom.
type AnnotationKind interface {
	// isAnnotationKind is unexported, so only types declared in this
	// package can implement the interface.
	isAnnotationKind()
	// Type returns the variant's "annotation_type" discriminator string.
	Type() string
}

// AnnotationKindBold is the Bold variant of AnnotationKind.
type AnnotationKindBold struct{}

// isAnnotationKind marks AnnotationKindBold as an AnnotationKind variant.
func (AnnotationKindBold) isAnnotationKind() {}

// Type returns the "annotation_type" discriminator for this variant, "bold".
func (AnnotationKindBold) Type() string { return "bold" }

// MarshalJSON encodes v as a JSON object containing only the
// "annotation_type" discriminator (the value of Type).
func (v AnnotationKindBold) MarshalJSON() ([]byte, error) {
	type aux struct {
		AnnotationType string `json:"annotation_type"`
	}
	return json.Marshal(aux{AnnotationType: v.Type()})
}

// AnnotationKindItalic is the Italic variant of AnnotationKind.
type AnnotationKindItalic struct{}

// isAnnotationKind marks AnnotationKindItalic as an AnnotationKind variant.
func (AnnotationKindItalic) isAnnotationKind() {}

// Type returns the "annotation_type" discriminator for this variant, "italic".
func (AnnotationKindItalic) Type() string { return "italic" }

// MarshalJSON encodes v as a JSON object containing only the
// "annotation_type" discriminator (the value of Type).
func (v AnnotationKindItalic) MarshalJSON() ([]byte, error) {
	type aux struct {
		AnnotationType string `json:"annotation_type"`
	}
	return json.Marshal(aux{AnnotationType: v.Type()})
}

// AnnotationKindUnderline is the Underline variant of AnnotationKind.
type AnnotationKindUnderline struct{}

// isAnnotationKind marks AnnotationKindUnderline as an AnnotationKind variant.
func (AnnotationKindUnderline) isAnnotationKind() {}

// Type returns the "annotation_type" discriminator for this variant,
// "underline".
func (AnnotationKindUnderline) Type() string { return "underline" }

// MarshalJSON encodes v as a JSON object containing only the
// "annotation_type" discriminator (the value of Type).
func (v AnnotationKindUnderline) MarshalJSON() ([]byte, error) {
	type aux struct {
		AnnotationType string `json:"annotation_type"`
	}
	return json.Marshal(aux{AnnotationType: v.Type()})
}

// AnnotationKindStrikethrough is the Strikethrough variant of AnnotationKind.
type AnnotationKindStrikethrough struct{}

// isAnnotationKind marks AnnotationKindStrikethrough as an AnnotationKind
// variant.
func (AnnotationKindStrikethrough) isAnnotationKind() {}

// Type returns the "annotation_type" discriminator for this variant,
// "strikethrough".
func (AnnotationKindStrikethrough) Type() string { return "strikethrough" }

// MarshalJSON encodes v as a JSON object containing only the
// "annotation_type" discriminator (the value of Type).
func (v AnnotationKindStrikethrough) MarshalJSON() ([]byte, error) {
	type aux struct {
		AnnotationType string `json:"annotation_type"`
	}
	return json.Marshal(aux{AnnotationType: v.Type()})
}

// AnnotationKindCode is the Code variant of AnnotationKind.
type AnnotationKindCode struct{}

// isAnnotationKind marks AnnotationKindCode as an AnnotationKind variant.
func (AnnotationKindCode) isAnnotationKind() {}

// Type returns the "annotation_type" discriminator for this variant, "code".
func (AnnotationKindCode) Type() string { return "code" }

// MarshalJSON encodes v as a JSON object containing only the
// "annotation_type" discriminator (the value of Type).
func (v AnnotationKindCode) MarshalJSON() ([]byte, error) {
	type aux struct {
		AnnotationType string `json:"annotation_type"`
	}
	return json.Marshal(aux{AnnotationType: v.Type()})
}

// AnnotationKindSubscript is the Subscript variant of AnnotationKind.
type AnnotationKindSubscript struct{}

// isAnnotationKind marks AnnotationKindSubscript as an AnnotationKind variant.
func (AnnotationKindSubscript) isAnnotationKind() {}

// Type returns the "annotation_type" discriminator for this variant,
// "subscript".
func (AnnotationKindSubscript) Type() string { return "subscript" }

// MarshalJSON encodes v as a JSON object containing only the
// "annotation_type" discriminator (the value of Type).
func (v AnnotationKindSubscript) MarshalJSON() ([]byte, error) {
	type aux struct {
		AnnotationType string `json:"annotation_type"`
	}
	return json.Marshal(aux{AnnotationType: v.Type()})
}

// AnnotationKindSuperscript is the Superscript variant of AnnotationKind.
type AnnotationKindSuperscript struct{}

// isAnnotationKind marks AnnotationKindSuperscript as an AnnotationKind
// variant.
func (AnnotationKindSuperscript) isAnnotationKind() {}

// Type returns the "annotation_type" discriminator for this variant,
// "superscript".
func (AnnotationKindSuperscript) Type() string { return "superscript" }

// MarshalJSON encodes v as a JSON object containing only the
// "annotation_type" discriminator (the value of Type).
func (v AnnotationKindSuperscript) MarshalJSON() ([]byte, error) {
	type aux struct {
		AnnotationType string `json:"annotation_type"`
	}
	return json.Marshal(aux{AnnotationType: v.Type()})
}

// AnnotationKindLink is the Link variant of AnnotationKind.
type AnnotationKindLink struct {
	URL   string  `json:"url"`
	Title *string `json:"title,omitempty"`
}

// isAnnotationKind marks AnnotationKindLink as an AnnotationKind variant.
func (AnnotationKindLink) isAnnotationKind() {}

// Type returns the "annotation_type" discriminator for this variant, "link".
func (AnnotationKindLink) Type() string { return "link" }

// MarshalJSON encodes v as a JSON object that starts with the
// "annotation_type" discriminator (the value of Type) followed by the
// variant's fields. A nil Title is omitted from the output.
func (v AnnotationKindLink) MarshalJSON() ([]byte, error) {
	type aux struct {
		AnnotationType string  `json:"annotation_type"`
		URL            string  `json:"url"`
		Title          *string `json:"title,omitempty"`
	}
	return json.Marshal(aux{
		AnnotationType: v.Type(),
		URL:            v.URL,
		Title:          v.Title,
	})
}

// AnnotationKindHighlight is highlighted text (PDF highlights, HTML `<mark>`).
type AnnotationKindHighlight struct{}

// isAnnotationKind marks AnnotationKindHighlight as an AnnotationKind variant.
func (AnnotationKindHighlight) isAnnotationKind() {}

// Type returns the "annotation_type" discriminator for this variant,
// "highlight".
func (AnnotationKindHighlight) Type() string { return "highlight" }

// MarshalJSON encodes v as a JSON object containing only the
// "annotation_type" discriminator (the value of Type).
func (v AnnotationKindHighlight) MarshalJSON() ([]byte, error) {
	type aux struct {
		AnnotationType string `json:"annotation_type"`
	}
	return json.Marshal(aux{AnnotationType: v.Type()})
}

// AnnotationKindColor is a text color (CSS-compatible value, e.g. "#ff0000",
// "red").
type AnnotationKindColor struct {
	Value string `json:"value"`
}

// isAnnotationKind marks AnnotationKindColor as an AnnotationKind variant.
func (AnnotationKindColor) isAnnotationKind() {}

// Type returns the "annotation_type" discriminator for this variant, "color".
func (AnnotationKindColor) Type() string { return "color" }

// MarshalJSON encodes v as a JSON object that starts with the
// "annotation_type" discriminator (the value of Type) followed by "value".
func (v AnnotationKindColor) MarshalJSON() ([]byte, error) {
	type aux struct {
		AnnotationType string `json:"annotation_type"`
		Value          string `json:"value"`
	}
	return json.Marshal(aux{
		AnnotationType: v.Type(),
		Value:          v.Value,
	})
}

// AnnotationKindFontSize is a font size with units (e.g. "12pt", "1.2em",
// "16px").
type AnnotationKindFontSize struct {
	Value string `json:"value"`
}

// isAnnotationKind marks AnnotationKindFontSize as an AnnotationKind variant.
func (AnnotationKindFontSize) isAnnotationKind() {}

// Type returns the "annotation_type" discriminator for this variant,
// "font_size".
func (AnnotationKindFontSize) Type() string { return "font_size" }

// MarshalJSON encodes v as a JSON object that starts with the
// "annotation_type" discriminator (the value of Type) followed by "value".
func (v AnnotationKindFontSize) MarshalJSON() ([]byte, error) {
	type aux struct {
		AnnotationType string `json:"annotation_type"`
		Value          string `json:"value"`
	}
	return json.Marshal(aux{
		AnnotationType: v.Type(),
		Value:          v.Value,
	})
}

// AnnotationKindCustom is an extensible annotation for format-specific
// styling.
type AnnotationKindCustom struct {
	Name  string  `json:"name"`
	Value *string `json:"value,omitempty"`
}

// isAnnotationKind marks AnnotationKindCustom as an AnnotationKind variant.
func (AnnotationKindCustom) isAnnotationKind() {}

// Type returns the "annotation_type" discriminator for this variant, "custom".
func (AnnotationKindCustom) Type() string { return "custom" }

// MarshalJSON encodes v as a JSON object that starts with the
// "annotation_type" discriminator (the value of Type) followed by the
// variant's fields. A nil Value is omitted from the output.
func (v AnnotationKindCustom) MarshalJSON() ([]byte, error) {
	type aux struct {
		AnnotationType string  `json:"annotation_type"`
		Name           string  `json:"name"`
		Value          *string `json:"value,omitempty"`
	}
	return json.Marshal(aux{
		AnnotationType: v.Type(),
		Name:           v.Name,
		Value:          v.Value,
	})
}

// UnmarshalAnnotationKind decodes JSON data into the appropriate concrete AnnotationKind variant.
|
|||
|
|
func UnmarshalAnnotationKind(data []byte) (AnnotationKind, error) {
|
|||
|
|
var wire struct {
|
|||
|
|
AnnotationType string `json:"annotation_type"`
|
|||
|
|
}
|
|||
|
|
if err := json.Unmarshal(data, &wire); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
switch wire.AnnotationType {
|
|||
|
|
case "bold":
|
|||
|
|
var v AnnotationKindBold
|
|||
|
|
if err := json.Unmarshal(data, &v); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
return v, nil
|
|||
|
|
case "italic":
|
|||
|
|
var v AnnotationKindItalic
|
|||
|
|
if err := json.Unmarshal(data, &v); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
return v, nil
|
|||
|
|
case "underline":
|
|||
|
|
var v AnnotationKindUnderline
|
|||
|
|
if err := json.Unmarshal(data, &v); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
return v, nil
|
|||
|
|
case "strikethrough":
|
|||
|
|
var v AnnotationKindStrikethrough
|
|||
|
|
if err := json.Unmarshal(data, &v); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
return v, nil
|
|||
|
|
case "code":
|
|||
|
|
var v AnnotationKindCode
|
|||
|
|
if err := json.Unmarshal(data, &v); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
return v, nil
|
|||
|
|
case "subscript":
|
|||
|
|
var v AnnotationKindSubscript
|
|||
|
|
if err := json.Unmarshal(data, &v); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
return v, nil
|
|||
|
|
case "superscript":
|
|||
|
|
var v AnnotationKindSuperscript
|
|||
|
|
if err := json.Unmarshal(data, &v); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
return v, nil
|
|||
|
|
case "link":
|
|||
|
|
var v AnnotationKindLink
|
|||
|
|
if err := json.Unmarshal(data, &v); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
return v, nil
|
|||
|
|
case "highlight":
|
|||
|
|
var v AnnotationKindHighlight
|
|||
|
|
if err := json.Unmarshal(data, &v); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
return v, nil
|
|||
|
|
case "color":
|
|||
|
|
var v AnnotationKindColor
|
|||
|
|
if err := json.Unmarshal(data, &v); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
return v, nil
|
|||
|
|
case "font_size":
|
|||
|
|
var v AnnotationKindFontSize
|
|||
|
|
if err := json.Unmarshal(data, &v); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
return v, nil
|
|||
|
|
case "custom":
|
|||
|
|
var v AnnotationKindCustom
|
|||
|
|
if err := json.Unmarshal(data, &v); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
return v, nil
|
|||
|
|
}
|
|||
|
|
return nil, fmt.Errorf("unknown AnnotationKind type: %q", wire.AnnotationType)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// ExtractionMethod is a string enumeration with the values "native", "ocr",
// and "mixed".
type ExtractionMethod string

// Supported ExtractionMethod values.
const (
	// ExtractionMethodNative is the Native variant of ExtractionMethod.
	ExtractionMethodNative ExtractionMethod = "native"
	// ExtractionMethodOcr is the Ocr variant of ExtractionMethod.
	ExtractionMethodOcr ExtractionMethod = "ocr"
	// ExtractionMethodMixed is the Mixed variant of ExtractionMethod.
	ExtractionMethodMixed ExtractionMethod = "mixed"
)

// ChunkType is a string enumeration classifying the content of a chunk.
type ChunkType string

// Supported ChunkType values.
const (
	// ChunkTypeHeading is a section heading or document title.
	ChunkTypeHeading ChunkType = "heading"
	// ChunkTypePartyList is a party list: names, addresses, and signatories.
	ChunkTypePartyList ChunkType = "party_list"
	// ChunkTypeDefinitions is a definition clause ("X means…", "X shall mean…").
	ChunkTypeDefinitions ChunkType = "definitions"
	// ChunkTypeOperativeClause is an operative clause containing
	// legal/contractual action verbs.
	ChunkTypeOperativeClause ChunkType = "operative_clause"
	// ChunkTypeSignatureBlock is a signature block with signatures, names,
	// and dates.
	ChunkTypeSignatureBlock ChunkType = "signature_block"
	// ChunkTypeSchedule is a schedule, annex, appendix, or exhibit section.
	ChunkTypeSchedule ChunkType = "schedule"
	// ChunkTypeTableLike is table-like content with aligned columns or
	// repeated patterns.
	ChunkTypeTableLike ChunkType = "table_like"
	// ChunkTypeFormula is a mathematical formula or equation.
	ChunkTypeFormula ChunkType = "formula"
	// ChunkTypeCodeBlock is a code block or preformatted content.
	ChunkTypeCodeBlock ChunkType = "code_block"
	// ChunkTypeImage is embedded or referenced image content.
	ChunkTypeImage ChunkType = "image"
	// ChunkTypeOrgChart is an organizational chart or hierarchy diagram.
	ChunkTypeOrgChart ChunkType = "org_chart"
	// ChunkTypeDiagram is a diagram, figure, or visual illustration.
	ChunkTypeDiagram ChunkType = "diagram"
	// ChunkTypeUnknown is unclassified or mixed content.
	ChunkTypeUnknown ChunkType = "unknown"
)

// ImageKind is a string-backed enumeration of image classifications.
// The values defined by this package are the ImageKind* constants below.
type ImageKind string

const (
	// ImageKindPhotograph is a photographic image (natural scene, photograph).
	ImageKindPhotograph ImageKind = "photograph"
	// ImageKindDiagram is a technical or schematic diagram.
	ImageKindDiagram ImageKind = "diagram"
	// ImageKindChart is a chart, graph, or plot.
	ImageKindChart ImageKind = "chart"
	// ImageKindDrawing is a freehand or technical drawing.
	ImageKindDrawing ImageKind = "drawing"
	// ImageKindTextBlock is a text-heavy image (scanned text, document).
	ImageKindTextBlock ImageKind = "text_block"
	// ImageKindDecoration is a decorative element or border.
	ImageKindDecoration ImageKind = "decoration"
	// ImageKindLogo is a logo or brand mark.
	ImageKindLogo ImageKind = "logo"
	// ImageKindIcon is a small icon.
	ImageKindIcon ImageKind = "icon"
	// ImageKindTileFragment is a fragment of a larger tiled image (tile of a technical drawing).
	ImageKindTileFragment ImageKind = "tile_fragment"
	// ImageKindMask is a mask or transparency map.
	ImageKindMask ImageKind = "mask"
	// ImageKindPageRaster is a full-page render produced during OCR preprocessing; used as a citation thumbnail.
	ImageKindPageRaster ImageKind = "page_raster"
	// ImageKindUnknown means the image could not be classified with reasonable confidence.
	ImageKindUnknown ImageKind = "unknown"
)
|
|||
|
|
|
|||
|
|
// ResultFormat is a string-backed enumeration of result layouts.
// The values defined by this package are the ResultFormat* constants below.
type ResultFormat string

const (
	// ResultFormatUnified is the unified format, with all content in the `content` field.
	ResultFormatUnified ResultFormat = "unified"
	// ResultFormatElementBased is the element-based format, with semantic element extraction.
	ResultFormatElementBased ResultFormat = "element_based"
)
|
|||
|
|
|
|||
|
|
// ElementType is a string-backed enumeration of semantic element categories.
// The values defined by this package are the ElementType* constants below.
type ElementType string

const (
	// ElementTypeTitle is the document title.
	ElementTypeTitle ElementType = "title"
	// ElementTypeNarrativeText is the main narrative text body.
	ElementTypeNarrativeText ElementType = "narrative_text"
	// ElementTypeHeading is a section heading.
	ElementTypeHeading ElementType = "heading"
	// ElementTypeListItem is a list item (bullet, numbered, etc.).
	ElementTypeListItem ElementType = "list_item"
	// ElementTypeTable is a table element.
	ElementTypeTable ElementType = "table"
	// ElementTypeImage is an image element.
	ElementTypeImage ElementType = "image"
	// ElementTypePageBreak is a page break marker.
	ElementTypePageBreak ElementType = "page_break"
	// ElementTypeCodeBlock is a code block.
	ElementTypeCodeBlock ElementType = "code_block"
	// ElementTypeBlockQuote is a block quote.
	ElementTypeBlockQuote ElementType = "block_quote"
	// ElementTypeFooter is footer text.
	ElementTypeFooter ElementType = "footer"
	// ElementTypeHeader is header text.
	ElementTypeHeader ElementType = "header"
)
|
|||
|
|
|
|||
|
|
// FormatMetadata holds format-specific metadata as a discriminated union.
//
// Only one format type can exist per extraction result. This provides
// type-safe, clean metadata without nested optionals.
//
// FormatType is the discriminator (JSON key "format_type"). Each variant has
// its own optional pointer field; the field whose JSON name matches
// FormatType carries the payload.
//
// Variants: Pdf, Docx, Excel, Email, Pptx, Archive, Image, Xml, Text, Html, Ocr, Csv, Bibtex, Citation, FictionBook, Dbf, Jats, Epub, Pst, Code
//
// NOTE(review): the variant list above names Code, but this struct declares
// no Code field — confirm whether the generator omits it intentionally.
type FormatMetadata struct {
	// FormatType is the discriminator tag, e.g. "pdf" or "docx".
	FormatType string `json:"format_type"`

	// Variant payloads; nil when the variant is not the active one.
	Pdf         *PdfMetadata         `json:"pdf,omitempty"`
	Docx        *DocxMetadata        `json:"docx,omitempty"`
	Excel       *ExcelMetadata       `json:"excel,omitempty"`
	Email       *EmailMetadata       `json:"email,omitempty"`
	Pptx        *PptxMetadata        `json:"pptx,omitempty"`
	Archive     *ArchiveMetadata     `json:"archive,omitempty"`
	Image       *ImageMetadata       `json:"image,omitempty"`
	XML         *XMLMetadata         `json:"xml,omitempty"`
	Text        *TextMetadata        `json:"text,omitempty"`
	HTML        *HTMLMetadata        `json:"html,omitempty"`
	Ocr         *OcrMetadata         `json:"ocr,omitempty"`
	Csv         *CsvMetadata         `json:"csv,omitempty"`
	Bibtex      *BibtexMetadata      `json:"bibtex,omitempty"`
	Citation    *CitationMetadata    `json:"citation,omitempty"`
	FictionBook *FictionBookMetadata `json:"fiction_book,omitempty"`
	Dbf         *DbfMetadata         `json:"dbf,omitempty"`
	Jats        *JatsMetadata        `json:"jats,omitempty"`
	Epub        *EpubMetadata        `json:"epub,omitempty"`
	Pst         *PstMetadata         `json:"pst,omitempty"`
}
|
|||
|
|
|
|||
|
|
// MarshalJSON encodes the tagged union with the discriminator tag.
|
|||
|
|
func (t FormatMetadata) MarshalJSON() ([]byte, error) {
|
|||
|
|
switch t.FormatType {
|
|||
|
|
case "pdf":
|
|||
|
|
if t.Pdf != nil {
|
|||
|
|
data, err := json.Marshal(t.Pdf)
|
|||
|
|
if err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
var m map[string]json.RawMessage
|
|||
|
|
if err := json.Unmarshal(data, &m); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
m["format_type"] = []byte(`"pdf"`)
|
|||
|
|
return json.Marshal(m)
|
|||
|
|
}
|
|||
|
|
case "docx":
|
|||
|
|
if t.Docx != nil {
|
|||
|
|
data, err := json.Marshal(t.Docx)
|
|||
|
|
if err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
var m map[string]json.RawMessage
|
|||
|
|
if err := json.Unmarshal(data, &m); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
m["format_type"] = []byte(`"docx"`)
|
|||
|
|
return json.Marshal(m)
|
|||
|
|
}
|
|||
|
|
case "excel":
|
|||
|
|
if t.Excel != nil {
|
|||
|
|
data, err := json.Marshal(t.Excel)
|
|||
|
|
if err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
var m map[string]json.RawMessage
|
|||
|
|
if err := json.Unmarshal(data, &m); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
m["format_type"] = []byte(`"excel"`)
|
|||
|
|
return json.Marshal(m)
|
|||
|
|
}
|
|||
|
|
case "email":
|
|||
|
|
if t.Email != nil {
|
|||
|
|
data, err := json.Marshal(t.Email)
|
|||
|
|
if err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
var m map[string]json.RawMessage
|
|||
|
|
if err := json.Unmarshal(data, &m); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
m["format_type"] = []byte(`"email"`)
|
|||
|
|
return json.Marshal(m)
|
|||
|
|
}
|
|||
|
|
case "pptx":
|
|||
|
|
if t.Pptx != nil {
|
|||
|
|
data, err := json.Marshal(t.Pptx)
|
|||
|
|
if err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
var m map[string]json.RawMessage
|
|||
|
|
if err := json.Unmarshal(data, &m); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
m["format_type"] = []byte(`"pptx"`)
|
|||
|
|
return json.Marshal(m)
|
|||
|
|
}
|
|||
|
|
case "archive":
|
|||
|
|
if t.Archive != nil {
|
|||
|
|
data, err := json.Marshal(t.Archive)
|
|||
|
|
if err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
var m map[string]json.RawMessage
|
|||
|
|
if err := json.Unmarshal(data, &m); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
m["format_type"] = []byte(`"archive"`)
|
|||
|
|
return json.Marshal(m)
|
|||
|
|
}
|
|||
|
|
case "image":
|
|||
|
|
if t.Image != nil {
|
|||
|
|
data, err := json.Marshal(t.Image)
|
|||
|
|
if err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
var m map[string]json.RawMessage
|
|||
|
|
if err := json.Unmarshal(data, &m); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
m["format_type"] = []byte(`"image"`)
|
|||
|
|
return json.Marshal(m)
|
|||
|
|
}
|
|||
|
|
case "xml":
|
|||
|
|
if t.XML != nil {
|
|||
|
|
data, err := json.Marshal(t.XML)
|
|||
|
|
if err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
var m map[string]json.RawMessage
|
|||
|
|
if err := json.Unmarshal(data, &m); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
m["format_type"] = []byte(`"xml"`)
|
|||
|
|
return json.Marshal(m)
|
|||
|
|
}
|
|||
|
|
case "text":
|
|||
|
|
if t.Text != nil {
|
|||
|
|
data, err := json.Marshal(t.Text)
|
|||
|
|
if err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
var m map[string]json.RawMessage
|
|||
|
|
if err := json.Unmarshal(data, &m); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
m["format_type"] = []byte(`"text"`)
|
|||
|
|
return json.Marshal(m)
|
|||
|
|
}
|
|||
|
|
case "html":
|
|||
|
|
if t.HTML != nil {
|
|||
|
|
data, err := json.Marshal(t.HTML)
|
|||
|
|
if err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
var m map[string]json.RawMessage
|
|||
|
|
if err := json.Unmarshal(data, &m); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
m["format_type"] = []byte(`"html"`)
|
|||
|
|
return json.Marshal(m)
|
|||
|
|
}
|
|||
|
|
case "ocr":
|
|||
|
|
if t.Ocr != nil {
|
|||
|
|
data, err := json.Marshal(t.Ocr)
|
|||
|
|
if err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
var m map[string]json.RawMessage
|
|||
|
|
if err := json.Unmarshal(data, &m); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
m["format_type"] = []byte(`"ocr"`)
|
|||
|
|
return json.Marshal(m)
|
|||
|
|
}
|
|||
|
|
case "csv":
|
|||
|
|
if t.Csv != nil {
|
|||
|
|
data, err := json.Marshal(t.Csv)
|
|||
|
|
if err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
var m map[string]json.RawMessage
|
|||
|
|
if err := json.Unmarshal(data, &m); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
m["format_type"] = []byte(`"csv"`)
|
|||
|
|
return json.Marshal(m)
|
|||
|
|
}
|
|||
|
|
case "bibtex":
|
|||
|
|
if t.Bibtex != nil {
|
|||
|
|
data, err := json.Marshal(t.Bibtex)
|
|||
|
|
if err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
var m map[string]json.RawMessage
|
|||
|
|
if err := json.Unmarshal(data, &m); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
m["format_type"] = []byte(`"bibtex"`)
|
|||
|
|
return json.Marshal(m)
|
|||
|
|
}
|
|||
|
|
case "citation":
|
|||
|
|
if t.Citation != nil {
|
|||
|
|
data, err := json.Marshal(t.Citation)
|
|||
|
|
if err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
var m map[string]json.RawMessage
|
|||
|
|
if err := json.Unmarshal(data, &m); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
m["format_type"] = []byte(`"citation"`)
|
|||
|
|
return json.Marshal(m)
|
|||
|
|
}
|
|||
|
|
case "fiction_book":
|
|||
|
|
if t.FictionBook != nil {
|
|||
|
|
data, err := json.Marshal(t.FictionBook)
|
|||
|
|
if err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
var m map[string]json.RawMessage
|
|||
|
|
if err := json.Unmarshal(data, &m); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
m["format_type"] = []byte(`"fiction_book"`)
|
|||
|
|
return json.Marshal(m)
|
|||
|
|
}
|
|||
|
|
case "dbf":
|
|||
|
|
if t.Dbf != nil {
|
|||
|
|
data, err := json.Marshal(t.Dbf)
|
|||
|
|
if err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
var m map[string]json.RawMessage
|
|||
|
|
if err := json.Unmarshal(data, &m); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
m["format_type"] = []byte(`"dbf"`)
|
|||
|
|
return json.Marshal(m)
|
|||
|
|
}
|
|||
|
|
case "jats":
|
|||
|
|
if t.Jats != nil {
|
|||
|
|
data, err := json.Marshal(t.Jats)
|
|||
|
|
if err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
var m map[string]json.RawMessage
|
|||
|
|
if err := json.Unmarshal(data, &m); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
m["format_type"] = []byte(`"jats"`)
|
|||
|
|
return json.Marshal(m)
|
|||
|
|
}
|
|||
|
|
case "epub":
|
|||
|
|
if t.Epub != nil {
|
|||
|
|
data, err := json.Marshal(t.Epub)
|
|||
|
|
if err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
var m map[string]json.RawMessage
|
|||
|
|
if err := json.Unmarshal(data, &m); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
m["format_type"] = []byte(`"epub"`)
|
|||
|
|
return json.Marshal(m)
|
|||
|
|
}
|
|||
|
|
case "pst":
|
|||
|
|
if t.Pst != nil {
|
|||
|
|
data, err := json.Marshal(t.Pst)
|
|||
|
|
if err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
var m map[string]json.RawMessage
|
|||
|
|
if err := json.Unmarshal(data, &m); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
m["format_type"] = []byte(`"pst"`)
|
|||
|
|
return json.Marshal(m)
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
// Fallback: return just the tag
|
|||
|
|
return json.Marshal(map[string]string{"format_type": t.FormatType})
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// UnmarshalJSON decodes a tagged union by reading the tag first.
|
|||
|
|
func (t *FormatMetadata) UnmarshalJSON(data []byte) error {
|
|||
|
|
// Probe for the tag first
|
|||
|
|
var probe struct {
|
|||
|
|
FormatType string `json:"format_type"`
|
|||
|
|
}
|
|||
|
|
if err := json.Unmarshal(data, &probe); err != nil {
|
|||
|
|
return err
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
t.FormatType = probe.FormatType
|
|||
|
|
|
|||
|
|
switch probe.FormatType {
|
|||
|
|
case "pdf":
|
|||
|
|
t.Pdf = &PdfMetadata{}
|
|||
|
|
return json.Unmarshal(data, t.Pdf)
|
|||
|
|
case "docx":
|
|||
|
|
t.Docx = &DocxMetadata{}
|
|||
|
|
return json.Unmarshal(data, t.Docx)
|
|||
|
|
case "excel":
|
|||
|
|
t.Excel = &ExcelMetadata{}
|
|||
|
|
return json.Unmarshal(data, t.Excel)
|
|||
|
|
case "email":
|
|||
|
|
t.Email = &EmailMetadata{}
|
|||
|
|
return json.Unmarshal(data, t.Email)
|
|||
|
|
case "pptx":
|
|||
|
|
t.Pptx = &PptxMetadata{}
|
|||
|
|
return json.Unmarshal(data, t.Pptx)
|
|||
|
|
case "archive":
|
|||
|
|
t.Archive = &ArchiveMetadata{}
|
|||
|
|
return json.Unmarshal(data, t.Archive)
|
|||
|
|
case "image":
|
|||
|
|
t.Image = &ImageMetadata{}
|
|||
|
|
return json.Unmarshal(data, t.Image)
|
|||
|
|
case "xml":
|
|||
|
|
t.XML = &XMLMetadata{}
|
|||
|
|
return json.Unmarshal(data, t.XML)
|
|||
|
|
case "text":
|
|||
|
|
t.Text = &TextMetadata{}
|
|||
|
|
return json.Unmarshal(data, t.Text)
|
|||
|
|
case "html":
|
|||
|
|
t.HTML = &HTMLMetadata{}
|
|||
|
|
return json.Unmarshal(data, t.HTML)
|
|||
|
|
case "ocr":
|
|||
|
|
t.Ocr = &OcrMetadata{}
|
|||
|
|
return json.Unmarshal(data, t.Ocr)
|
|||
|
|
case "csv":
|
|||
|
|
t.Csv = &CsvMetadata{}
|
|||
|
|
return json.Unmarshal(data, t.Csv)
|
|||
|
|
case "bibtex":
|
|||
|
|
t.Bibtex = &BibtexMetadata{}
|
|||
|
|
return json.Unmarshal(data, t.Bibtex)
|
|||
|
|
case "citation":
|
|||
|
|
t.Citation = &CitationMetadata{}
|
|||
|
|
return json.Unmarshal(data, t.Citation)
|
|||
|
|
case "fiction_book":
|
|||
|
|
t.FictionBook = &FictionBookMetadata{}
|
|||
|
|
return json.Unmarshal(data, t.FictionBook)
|
|||
|
|
case "dbf":
|
|||
|
|
t.Dbf = &DbfMetadata{}
|
|||
|
|
return json.Unmarshal(data, t.Dbf)
|
|||
|
|
case "jats":
|
|||
|
|
t.Jats = &JatsMetadata{}
|
|||
|
|
return json.Unmarshal(data, t.Jats)
|
|||
|
|
case "epub":
|
|||
|
|
t.Epub = &EpubMetadata{}
|
|||
|
|
return json.Unmarshal(data, t.Epub)
|
|||
|
|
case "pst":
|
|||
|
|
t.Pst = &PstMetadata{}
|
|||
|
|
return json.Unmarshal(data, t.Pst)
|
|||
|
|
}
|
|||
|
|
return nil
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// TextDirection is a string-backed enumeration of text directions.
// The values defined by this package are the TextDirection* constants below.
type TextDirection string

const (
	// TextDirectionLeftToRight is left-to-right text direction.
	TextDirectionLeftToRight TextDirection = "ltr"
	// TextDirectionRightToLeft is right-to-left text direction.
	TextDirectionRightToLeft TextDirection = "rtl"
	// TextDirectionAuto is automatic text direction detection.
	TextDirectionAuto TextDirection = "auto"
)
|
|||
|
|
|
|||
|
|
// LinkType is a string-backed enumeration of link categories.
// The values defined by this package are the LinkType* constants below.
type LinkType string

const (
	// LinkTypeAnchor is an anchor link (#section).
	LinkTypeAnchor LinkType = "anchor"
	// LinkTypeInternal is an internal link (same domain).
	LinkTypeInternal LinkType = "internal"
	// LinkTypeExternal is an external link (different domain).
	LinkTypeExternal LinkType = "external"
	// LinkTypeEmail is an email link (mailto:).
	LinkTypeEmail LinkType = "email"
	// LinkTypePhone is a phone link (tel:).
	LinkTypePhone LinkType = "phone"
	// LinkTypeOther is any other link type.
	LinkTypeOther LinkType = "other"
)
|
|||
|
|
|
|||
|
|
// ImageType is a string-backed enumeration of image source categories.
// The values defined by this package are the ImageType* constants below.
type ImageType string

const (
	// ImageTypeDataURI is a data URI image.
	ImageTypeDataURI ImageType = "data-uri"
	// ImageTypeInlineSvg is an inline SVG.
	ImageTypeInlineSvg ImageType = "inline-svg"
	// ImageTypeExternal is an external image URL.
	ImageTypeExternal ImageType = "external"
	// ImageTypeRelative is a relative path image.
	ImageTypeRelative ImageType = "relative"
)
|
|||
|
|
|
|||
|
|
// StructuredDataType is a string-backed enumeration of structured-data formats.
// The values defined by this package are the StructuredDataType* constants below.
type StructuredDataType string

const (
	// StructuredDataTypeJSONLd is JSON-LD structured data.
	StructuredDataTypeJSONLd StructuredDataType = "json-ld"
	// StructuredDataTypeMicrodata is microdata.
	StructuredDataTypeMicrodata StructuredDataType = "microdata"
	// StructuredDataTypeRdFa is RDFa.
	StructuredDataTypeRdFa StructuredDataType = "rdfa"
)
|
|||
|
|
|
|||
|
|
// OcrBoundingGeometry is the bounding geometry for an OCR element.
//
// Supports both axis-aligned rectangles (from Tesseract) and 4-point quadrilaterals
// (from PaddleOCR and rotated text detection).
//
// Variants: Rectangle, Quadrilateral
//
// This is a sealed interface: it has an unexported method, so only types in
// this package can implement it. Use one of OcrBoundingGeometryRectangle or
// OcrBoundingGeometryQuadrilateral.
type OcrBoundingGeometry interface {
	// isOcrBoundingGeometry is the unexported marker method that seals the interface.
	isOcrBoundingGeometry()
	// Type returns the variant's discriminator string ("rectangle" or
	// "quadrilateral" for the variants in this package).
	Type() string
}
|
|||
|
|
|
|||
|
|
// OcrBoundingGeometryRectangle is the axis-aligned bounding-box variant of
// OcrBoundingGeometry (typical for Tesseract output).
type OcrBoundingGeometryRectangle struct {
	// Left is the left x-coordinate in pixels.
	Left uint32 `json:"left"`
	// Top is the top y-coordinate in pixels.
	Top uint32 `json:"top"`
	// Width is the box width in pixels.
	Width uint32 `json:"width"`
	// Height is the box height in pixels.
	Height uint32 `json:"height"`
}

// isOcrBoundingGeometry marks the type as an OcrBoundingGeometry variant.
func (OcrBoundingGeometryRectangle) isOcrBoundingGeometry() {}

// Type returns the discriminator for this variant, "rectangle".
func (OcrBoundingGeometryRectangle) Type() string { return "rectangle" }

// MarshalJSON encodes the rectangle as a JSON object whose first key is the
// "type" discriminator, followed by left, top, width and height.
func (r OcrBoundingGeometryRectangle) MarshalJSON() ([]byte, error) {
	// fields has the same layout but no methods, so encoding it does not
	// recurse back into this MarshalJSON.
	type fields OcrBoundingGeometryRectangle
	tagged := struct {
		Type string `json:"type"`
		fields
	}{
		Type:   r.Type(),
		fields: fields(r),
	}
	return json.Marshal(tagged)
}
|
|||
|
|
|
|||
|
|
// OcrBoundingGeometryQuadrilateral is the 4-point quadrilateral variant of
// OcrBoundingGeometry, for rotated/skewed text (PaddleOCR).
//
// Points are in clockwise order starting from top-left:
// `[top_left, top_right, bottom_right, bottom_left]`
type OcrBoundingGeometryQuadrilateral struct {
	// Points holds the four corner points as `[(x, y), ...]` in clockwise order.
	// NOTE(review): the Go field is a plain string; how the points are encoded
	// inside it is not visible here — confirm against the FFI layer.
	Points string `json:"points"`
}

// isOcrBoundingGeometry marks the type as an OcrBoundingGeometry variant.
func (OcrBoundingGeometryQuadrilateral) isOcrBoundingGeometry() {}

// Type returns the discriminator for this variant, "quadrilateral".
func (OcrBoundingGeometryQuadrilateral) Type() string { return "quadrilateral" }

// MarshalJSON encodes the quadrilateral as a JSON object whose first key is
// the "type" discriminator, followed by "points".
func (q OcrBoundingGeometryQuadrilateral) MarshalJSON() ([]byte, error) {
	// fields has the same layout but no methods, so encoding it does not
	// recurse back into this MarshalJSON.
	type fields OcrBoundingGeometryQuadrilateral
	tagged := struct {
		Type string `json:"type"`
		fields
	}{
		Type:   q.Type(),
		fields: fields(q),
	}
	return json.Marshal(tagged)
}
|
|||
|
|
|
|||
|
|
// UnmarshalOcrBoundingGeometry decodes JSON data into the appropriate concrete OcrBoundingGeometry variant.
|
|||
|
|
func UnmarshalOcrBoundingGeometry(data []byte) (OcrBoundingGeometry, error) {
|
|||
|
|
var wire struct {
|
|||
|
|
Type string `json:"type"`
|
|||
|
|
}
|
|||
|
|
if err := json.Unmarshal(data, &wire); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
switch wire.Type {
|
|||
|
|
case "rectangle":
|
|||
|
|
var v OcrBoundingGeometryRectangle
|
|||
|
|
if err := json.Unmarshal(data, &v); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
return v, nil
|
|||
|
|
case "quadrilateral":
|
|||
|
|
var v OcrBoundingGeometryQuadrilateral
|
|||
|
|
if err := json.Unmarshal(data, &v); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
return v, nil
|
|||
|
|
}
|
|||
|
|
return nil, fmt.Errorf("unknown OcrBoundingGeometry type: %q", wire.Type)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// OcrElementLevel is a string-backed enumeration of OCR element granularities.
// The values defined by this package are the OcrElementLevel* constants below.
type OcrElementLevel string

const (
	// OcrElementLevelWord is an individual word.
	OcrElementLevelWord OcrElementLevel = "word"
	// OcrElementLevelLine is a line of text (default for PaddleOCR).
	OcrElementLevelLine OcrElementLevel = "line"
	// OcrElementLevelBlock is a paragraph or text block.
	OcrElementLevelBlock OcrElementLevel = "block"
	// OcrElementLevelPage is a page-level element.
	OcrElementLevelPage OcrElementLevel = "page"
)
|
|||
|
|
|
|||
|
|
// PageUnitType is a string-backed enumeration of page-like units.
// The values defined by this package are the PageUnitType* constants below.
type PageUnitType string

const (
	// PageUnitTypePage is a standard document page (PDF, DOCX, images).
	PageUnitTypePage PageUnitType = "page"
	// PageUnitTypeSlide is a presentation slide (PPTX, ODP).
	PageUnitTypeSlide PageUnitType = "slide"
	// PageUnitTypeSheet is a spreadsheet sheet (XLSX, ODS).
	PageUnitTypeSheet PageUnitType = "sheet"
)
|
|||
|
|
|
|||
|
|
// DiffLine is a single line in a unified-diff hunk, represented in Go as a
// string.
//
// The remainder of this comment is carried over from the upstream Rust source:
//
// Defined here (rather than only in `crate::diff`) so `RevisionDelta` can
// reference it unconditionally, without requiring the `diff` Cargo feature.
// `crate::diff` re-exports this type verbatim.
type DiffLine string
|
|||
|
|
|
|||
|
|
// RevisionKind is a string-backed enumeration of revision categories.
// The values defined by this package are the RevisionKind* constants below.
type RevisionKind string

const (
	// RevisionKindInsertion means text or content was inserted.
	RevisionKindInsertion RevisionKind = "insertion"
	// RevisionKindDeletion means text or content was deleted.
	RevisionKindDeletion RevisionKind = "deletion"
	// RevisionKindFormatChange means run-level formatting (font, size, colour, …) was changed.
	RevisionKindFormatChange RevisionKind = "format_change"
	// RevisionKindComment is a reviewer comment or annotation.
	RevisionKindComment RevisionKind = "comment"
)
|
|||
|
|
|
|||
|
|
// RevisionAnchor is a best-effort document location for a revision.
//
// Variants: Paragraph, TableCell, Page, Slide, Sheet
//
// This is a sealed interface: it has an unexported method, so only types in
// this package can implement it. Use one of RevisionAnchorParagraph,
// RevisionAnchorTableCell, RevisionAnchorPage, RevisionAnchorSlide or
// RevisionAnchorSheet.
type RevisionAnchor interface {
	// isRevisionAnchor is the unexported marker method that seals the interface.
	isRevisionAnchor()
	// Type returns the variant's discriminator string (for example
	// "paragraph" or "table_cell").
	Type() string
}
|
|||
|
|
|
|||
|
|
// RevisionAnchorParagraph is the RevisionAnchor variant for a body paragraph,
// identified by its zero-based index in the document flow.
type RevisionAnchorParagraph struct {
	// Index is the zero-based index of the paragraph in document order.
	Index uint `json:"index"`
}

// isRevisionAnchor marks the type as a RevisionAnchor variant.
func (RevisionAnchorParagraph) isRevisionAnchor() {}

// Type returns the discriminator for this variant, "paragraph".
func (RevisionAnchorParagraph) Type() string { return "paragraph" }

// MarshalJSON encodes the anchor as a JSON object whose first key is the
// "type" discriminator, followed by "index".
func (a RevisionAnchorParagraph) MarshalJSON() ([]byte, error) {
	// fields has the same layout but no methods, so encoding it does not
	// recurse back into this MarshalJSON.
	type fields RevisionAnchorParagraph
	tagged := struct {
		Type string `json:"type"`
		fields
	}{
		Type:   a.Type(),
		fields: fields(a),
	}
	return json.Marshal(tagged)
}
|
|||
|
|
|
|||
|
|
// RevisionAnchorTableCell is the RevisionAnchor variant for a cell inside a
// table.
type RevisionAnchorTableCell struct {
	// Row is the zero-based row index within the table.
	Row uint `json:"row"`
	// Col is the zero-based column index within the table.
	Col uint `json:"col"`
	// TableIndex is the zero-based index of the table in document order.
	TableIndex uint `json:"table_index"`
}

// isRevisionAnchor marks the type as a RevisionAnchor variant.
func (RevisionAnchorTableCell) isRevisionAnchor() {}

// Type returns the discriminator for this variant, "table_cell".
func (RevisionAnchorTableCell) Type() string { return "table_cell" }

// MarshalJSON encodes the anchor as a JSON object whose first key is the
// "type" discriminator, followed by "row", "col" and "table_index".
func (a RevisionAnchorTableCell) MarshalJSON() ([]byte, error) {
	// fields has the same layout but no methods, so encoding it does not
	// recurse back into this MarshalJSON.
	type fields RevisionAnchorTableCell
	tagged := struct {
		Type string `json:"type"`
		fields
	}{
		Type:   a.Type(),
		fields: fields(a),
	}
	return json.Marshal(tagged)
}
|
|||
|
|
|
|||
|
|
// RevisionAnchorPage is the RevisionAnchor variant for a page, identified by
// its zero-based index.
type RevisionAnchorPage struct {
	// Index is the zero-based page index.
	Index uint `json:"index"`
}

// isRevisionAnchor marks the type as a RevisionAnchor variant.
func (RevisionAnchorPage) isRevisionAnchor() {}

// Type returns the discriminator for this variant, "page".
func (RevisionAnchorPage) Type() string { return "page" }

// MarshalJSON encodes the anchor as a JSON object whose first key is the
// "type" discriminator, followed by "index".
func (a RevisionAnchorPage) MarshalJSON() ([]byte, error) {
	// fields has the same layout but no methods, so encoding it does not
	// recurse back into this MarshalJSON.
	type fields RevisionAnchorPage
	tagged := struct {
		Type string `json:"type"`
		fields
	}{
		Type:   a.Type(),
		fields: fields(a),
	}
	return json.Marshal(tagged)
}
|
|||
|
|
|
|||
|
|
// RevisionAnchorSlide is the RevisionAnchor variant for a presentation slide,
// identified by its zero-based index.
type RevisionAnchorSlide struct {
	// Index is the zero-based slide index.
	Index uint `json:"index"`
}

// isRevisionAnchor marks the type as a RevisionAnchor variant.
func (RevisionAnchorSlide) isRevisionAnchor() {}

// Type returns the discriminator for this variant, "slide".
func (RevisionAnchorSlide) Type() string { return "slide" }

// MarshalJSON encodes the anchor as a JSON object whose first key is the
// "type" discriminator, followed by "index".
func (a RevisionAnchorSlide) MarshalJSON() ([]byte, error) {
	// fields has the same layout but no methods, so encoding it does not
	// recurse back into this MarshalJSON.
	type fields RevisionAnchorSlide
	tagged := struct {
		Type string `json:"type"`
		fields
	}{
		Type:   a.Type(),
		fields: fields(a),
	}
	return json.Marshal(tagged)
}
|
|||
|
|
|
|||
|
|
// RevisionAnchorSheet is the RevisionAnchor variant for a spreadsheet cell or
// range, identified by sheet index and optional name.
type RevisionAnchorSheet struct {
	// Index is the zero-based sheet index.
	Index uint `json:"index"`
	// Name is the sheet display name when available; nil omits the key.
	Name *string `json:"name,omitempty"`
}

// isRevisionAnchor marks the type as a RevisionAnchor variant.
func (RevisionAnchorSheet) isRevisionAnchor() {}

// Type returns the discriminator for this variant, "sheet".
func (RevisionAnchorSheet) Type() string { return "sheet" }

// MarshalJSON encodes the anchor as a JSON object whose first key is the
// "type" discriminator, followed by "index" and, when Name is non-nil, "name".
func (a RevisionAnchorSheet) MarshalJSON() ([]byte, error) {
	// fields has the same layout but no methods, so encoding it does not
	// recurse back into this MarshalJSON.
	type fields RevisionAnchorSheet
	tagged := struct {
		Type string `json:"type"`
		fields
	}{
		Type:   a.Type(),
		fields: fields(a),
	}
	return json.Marshal(tagged)
}
|
|||
|
|
|
|||
|
|
// UnmarshalRevisionAnchor decodes JSON data into the appropriate concrete RevisionAnchor variant.
|
|||
|
|
func UnmarshalRevisionAnchor(data []byte) (RevisionAnchor, error) {
|
|||
|
|
var wire struct {
|
|||
|
|
Type string `json:"type"`
|
|||
|
|
}
|
|||
|
|
if err := json.Unmarshal(data, &wire); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
switch wire.Type {
|
|||
|
|
case "paragraph":
|
|||
|
|
var v RevisionAnchorParagraph
|
|||
|
|
if err := json.Unmarshal(data, &v); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
return v, nil
|
|||
|
|
case "table_cell":
|
|||
|
|
var v RevisionAnchorTableCell
|
|||
|
|
if err := json.Unmarshal(data, &v); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
return v, nil
|
|||
|
|
case "page":
|
|||
|
|
var v RevisionAnchorPage
|
|||
|
|
if err := json.Unmarshal(data, &v); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
return v, nil
|
|||
|
|
case "slide":
|
|||
|
|
var v RevisionAnchorSlide
|
|||
|
|
if err := json.Unmarshal(data, &v); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
return v, nil
|
|||
|
|
case "sheet":
|
|||
|
|
var v RevisionAnchorSheet
|
|||
|
|
if err := json.Unmarshal(data, &v); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
return v, nil
|
|||
|
|
}
|
|||
|
|
return nil, fmt.Errorf("unknown RevisionAnchor type: %q", wire.Type)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// URIKind is a string-backed enumeration of URI categories.
// The values defined by this package are the URIKind* constants below.
type URIKind string

const (
	// URIKindHyperlink is a clickable hyperlink (web URL, file link).
	URIKindHyperlink URIKind = "hyperlink"
	// URIKindImage is an image or media resource reference.
	URIKindImage URIKind = "image"
	// URIKindAnchor is an internal anchor or cross-reference target.
	URIKindAnchor URIKind = "anchor"
	// URIKindCitation is a citation or bibliographic reference (DOI, academic ref).
	URIKindCitation URIKind = "citation"
	// URIKindReference is a general reference (e.g. `\ref{}` in LaTeX, `:ref:` in RST).
	URIKindReference URIKind = "reference"
	// URIKindEmail is an email address (`mailto:` link or bare email).
	URIKindEmail URIKind = "email"
)
|
|||
|
|
|
|||
|
|
// KeywordAlgorithm is a string-backed enumeration of keyword extraction algorithms.
// The values defined by this package are the KeywordAlgorithm* constants below.
type KeywordAlgorithm string

const (
	// KeywordAlgorithmYake is YAKE (Yet Another Keyword Extractor) - statistical approach.
	KeywordAlgorithmYake KeywordAlgorithm = "yake"
	// KeywordAlgorithmRake is RAKE (Rapid Automatic Keyword Extraction) - co-occurrence based.
	KeywordAlgorithmRake KeywordAlgorithm = "rake"
)
|
|||
|
|
|
|||
|
|
// PSMMode is a string-backed enumeration.
// The values defined by this package are the PSMMode* constants below.
type PSMMode string

const (
	// PSMModeOsdOnly is the OsdOnly variant of PSMMode.
	PSMModeOsdOnly PSMMode = "osd_only"
	// PSMModeAutoOsd is the AutoOsd variant of PSMMode.
	PSMModeAutoOsd PSMMode = "auto_osd"
	// PSMModeAutoOnly is the AutoOnly variant of PSMMode.
	PSMModeAutoOnly PSMMode = "auto_only"
	// PSMModeAuto is the Auto variant of PSMMode.
	PSMModeAuto PSMMode = "auto"
	// PSMModeSingleColumn is the SingleColumn variant of PSMMode.
	PSMModeSingleColumn PSMMode = "single_column"
	// PSMModeSingleBlockVertical is the SingleBlockVertical variant of PSMMode.
	PSMModeSingleBlockVertical PSMMode = "single_block_vertical"
	// PSMModeSingleBlock is the SingleBlock variant of PSMMode.
	PSMModeSingleBlock PSMMode = "single_block"
	// PSMModeSingleLine is the SingleLine variant of PSMMode.
	PSMModeSingleLine PSMMode = "single_line"
	// PSMModeSingleWord is the SingleWord variant of PSMMode.
	PSMModeSingleWord PSMMode = "single_word"
	// PSMModeCircleWord is the CircleWord variant of PSMMode.
	PSMModeCircleWord PSMMode = "circle_word"
	// PSMModeSingleChar is the SingleChar variant of PSMMode.
	PSMModeSingleChar PSMMode = "single_char"
)
|
|||
|
|
|
|||
|
|
// PaddleLanguage is a string-backed enumeration of languages and scripts.
// The constant values are the snake_case names listed below.
type PaddleLanguage string

const (
	// PaddleLanguageEnglish is English.
	PaddleLanguageEnglish PaddleLanguage = "english"
	// PaddleLanguageChinese is simplified Chinese.
	PaddleLanguageChinese PaddleLanguage = "chinese"
	// PaddleLanguageJapanese is Japanese.
	PaddleLanguageJapanese PaddleLanguage = "japanese"
	// PaddleLanguageKorean is Korean.
	PaddleLanguageKorean PaddleLanguage = "korean"
	// PaddleLanguageGerman is German.
	PaddleLanguageGerman PaddleLanguage = "german"
	// PaddleLanguageFrench is French.
	PaddleLanguageFrench PaddleLanguage = "french"
	// PaddleLanguageLatin is Latin script (covers most European languages).
	PaddleLanguageLatin PaddleLanguage = "latin"
	// PaddleLanguageCyrillic is Cyrillic (Russian and related).
	PaddleLanguageCyrillic PaddleLanguage = "cyrillic"
	// PaddleLanguageTraditionalChinese is traditional Chinese.
	PaddleLanguageTraditionalChinese PaddleLanguage = "traditional_chinese"
	// PaddleLanguageThai is Thai.
	PaddleLanguageThai PaddleLanguage = "thai"
	// PaddleLanguageGreek is Greek.
	PaddleLanguageGreek PaddleLanguage = "greek"
	// PaddleLanguageEastSlavic is East Slavic (Russian, Ukrainian, Belarusian).
	PaddleLanguageEastSlavic PaddleLanguage = "east_slavic"
	// PaddleLanguageArabic is Arabic (Arabic, Persian, Urdu).
	PaddleLanguageArabic PaddleLanguage = "arabic"
	// PaddleLanguageDevanagari is Devanagari (Hindi, Marathi, Sanskrit, Nepali).
	PaddleLanguageDevanagari PaddleLanguage = "devanagari"
	// PaddleLanguageTamil is Tamil.
	PaddleLanguageTamil PaddleLanguage = "tamil"
	// PaddleLanguageTelugu is Telugu.
	PaddleLanguageTelugu PaddleLanguage = "telugu"
)
|
|||
|
|
|
|||
|
|
// LayoutClass is a string-backed enumeration of layout region classes.
// The constant values are the snake_case names listed below.
type LayoutClass string

const (
	// LayoutClassCaption is the Caption variant of LayoutClass.
	LayoutClassCaption LayoutClass = "caption"
	// LayoutClassFootnote is the Footnote variant of LayoutClass.
	LayoutClassFootnote LayoutClass = "footnote"
	// LayoutClassFormula is the Formula variant of LayoutClass.
	LayoutClassFormula LayoutClass = "formula"
	// LayoutClassListItem is the ListItem variant of LayoutClass.
	LayoutClassListItem LayoutClass = "list_item"
	// LayoutClassPageFooter is the PageFooter variant of LayoutClass.
	LayoutClassPageFooter LayoutClass = "page_footer"
	// LayoutClassPageHeader is the PageHeader variant of LayoutClass.
	LayoutClassPageHeader LayoutClass = "page_header"
	// LayoutClassPicture is the Picture variant of LayoutClass.
	LayoutClassPicture LayoutClass = "picture"
	// LayoutClassSectionHeader is the SectionHeader variant of LayoutClass.
	LayoutClassSectionHeader LayoutClass = "section_header"
	// LayoutClassTable is the Table variant of LayoutClass.
	LayoutClassTable LayoutClass = "table"
	// LayoutClassText is the Text variant of LayoutClass.
	LayoutClassText LayoutClass = "text"
	// LayoutClassTitle is the Title variant of LayoutClass.
	LayoutClassTitle LayoutClass = "title"
	// LayoutClassDocumentIndex is the DocumentIndex variant of LayoutClass.
	LayoutClassDocumentIndex LayoutClass = "document_index"
	// LayoutClassCode is the Code variant of LayoutClass.
	LayoutClassCode LayoutClass = "code"
	// LayoutClassCheckboxSelected is the CheckboxSelected variant of LayoutClass.
	LayoutClassCheckboxSelected LayoutClass = "checkbox_selected"
	// LayoutClassCheckboxUnselected is the CheckboxUnselected variant of LayoutClass.
	LayoutClassCheckboxUnselected LayoutClass = "checkbox_unselected"
	// LayoutClassForm is the Form variant of LayoutClass.
	LayoutClassForm LayoutClass = "form"
	// LayoutClassKeyValueRegion is the KeyValueRegion variant of LayoutClass.
	LayoutClassKeyValueRegion LayoutClass = "key_value_region"
)
|
|||
|
|
|
|||
|
|
// CacheStats holds cache statistics: the number of files, their total size,
// the available space, and the ages of the oldest and newest files.
//
// Units are those suggested by the field names (megabytes, days); the
// struct itself does not enforce them.
type CacheStats struct {
	TotalFiles        uint    `json:"total_files"`
	TotalSizeMb       float64 `json:"total_size_mb"`
	AvailableSpaceMb  float64 `json:"available_space_mb"`
	OldestFileAgeDays float64 `json:"oldest_file_age_days"`
	NewestFileAgeDays float64 `json:"newest_file_age_days"`
}
|
|||
|
|
|
|||
|
|
// AccelerationConfig hardware acceleration configuration for ONNX Runtime models.
|
|||
|
|
//
|
|||
|
|
// Controls which execution provider (CPU, CoreML, CUDA, TensorRT) is used
|
|||
|
|
// for inference in layout detection and embedding generation.
|
|||
|
|
//
|
|||
|
|
// Example:
|
|||
|
|
//
|
|||
|
|
// // Auto-select: CoreML on macOS, CUDA on Linux, CPU elsewhere
|
|||
|
|
// let config = AccelerationConfig::default();
|
|||
|
|
//
|
|||
|
|
// // Force CPU only
|
|||
|
|
// let config = AccelerationConfig {
|
|||
|
|
// provider: kreuzberg::ExecutionProviderType::Cpu,
|
|||
|
|
// ..Default::default()
|
|||
|
|
// };
|
|||
|
|
type AccelerationConfig struct {
|
|||
|
|
// Execution provider to use for ONNX inference.
|
|||
|
|
Provider ExecutionProviderType `json:"provider,omitempty"`
|
|||
|
|
// GPU device ID (for CUDA/TensorRT). Ignored for CPU/CoreML/Auto.
|
|||
|
|
DeviceID uint32 `json:"device_id"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// ContentFilterConfig is the cross-extractor content filtering configuration.
//
// Controls whether "furniture" content (headers, footers, page numbers,
// watermarks, repeating text) is included in or stripped from extraction
// results. Applies across all extractors (PDF, DOCX, RTF, ODT, HTML, etc.)
// with format-specific implementation.
//
// When `None` on `ExtractionConfig`, each extractor uses its current
// default behavior unchanged.
type ContentFilterConfig struct {
	// Include running headers in extraction output.
	//
	//   - PDF: Disables top-margin furniture stripping and prevents the layout
	//     model from treating `PageHeader`-classified regions as furniture.
	//   - DOCX: Includes document headers in text output.
	//   - RTF/ODT: Headers already included; this is a no-op when true.
	//   - HTML/EPUB: Keeps `<header>` element content.
	//
	// Default: `false` (headers are stripped or excluded).
	IncludeHeaders bool `json:"include_headers"`

	// Include running footers in extraction output.
	//
	//   - PDF: Disables bottom-margin furniture stripping and prevents the layout
	//     model from treating `PageFooter`-classified regions as furniture.
	//   - DOCX: Includes document footers in text output.
	//   - RTF/ODT: Footers already included; this is a no-op when true.
	//   - HTML/EPUB: Keeps `<footer>` element content.
	//
	// Default: `false` (footers are stripped or excluded).
	IncludeFooters bool `json:"include_footers"`

	// Enable the heuristic cross-page repeating text detector.
	//
	// When `true` (default), text that repeats verbatim across a supermajority
	// of pages is classified as furniture and stripped. Disable this if brand
	// names or repeated headings are being incorrectly removed by the heuristic.
	//
	// Note: when a layout-detection model is active, the model may independently
	// classify page-header / page-footer regions as furniture on a per-page basis.
	// To preserve those regions, set `include_headers = true`, `include_footers = true`,
	// or both, in addition to disabling this flag.
	//
	// Primarily affects PDF extraction.
	//
	// Default: `true`. The field is a pointer tagged omitempty, so a nil value
	// is left out of the JSON rather than being sent as false.
	StripRepeatingText *bool `json:"strip_repeating_text,omitempty"`

	// Include watermark text in extraction output.
	//
	//   - PDF: Keeps watermark artifacts and arXiv identifiers.
	//   - Other formats: No effect currently.
	//
	// Default: `false` (watermarks are stripped).
	IncludeWatermarks bool `json:"include_watermarks"`
}
|
|||
|
|
|
|||
|
|
// EmailConfig is the configuration for email extraction.
type EmailConfig struct {
	// Windows codepage number to use when an MSG file contains no codepage property.
	// Defaults to `None`, which falls back to windows-1252.
	//
	// If an unrecognized or invalid codepage number is supplied (including 0),
	// the behavior silently falls back to windows-1252 — the same as when the
	// MSG file itself contains an unrecognized codepage. No error or warning is
	// emitted. Users should verify output when supplying unusual values.
	//
	// Common values:
	//   - 1250: Central European (Polish, Czech, Hungarian, etc.)
	//   - 1251: Cyrillic (Russian, Ukrainian, Bulgarian, etc.)
	//   - 1252: Western European (default)
	//   - 1253: Greek
	//   - 1254: Turkish
	//   - 1255: Hebrew
	//   - 1256: Arabic
	//   - 932: Japanese (Shift-JIS)
	//   - 936: Simplified Chinese (GBK)
	MsgFallbackCodepage *uint32 `json:"msg_fallback_codepage,omitempty"`
}
|
|||
|
|
|
|||
|
|
// ExtractionConfig main extraction configuration.
|
|||
|
|
//
|
|||
|
|
// This struct contains all configuration options for the extraction process.
|
|||
|
|
// It can be loaded from TOML, YAML, or JSON files, or created programmatically.
|
|||
|
|
//
|
|||
|
|
// Example:
|
|||
|
|
//
|
|||
|
|
// // Create with defaults
|
|||
|
|
// let config = ExtractionConfig::default();
|
|||
|
|
//
|
|||
|
|
// // Load from TOML file
|
|||
|
|
// // let config = ExtractionConfig::from_toml_file("kreuzberg.toml")?;
|
|||
|
|
type ExtractionConfig struct {
|
|||
|
|
// Enable caching of extraction results
|
|||
|
|
UseCache *bool `json:"use_cache,omitempty"`
|
|||
|
|
// Enable quality post-processing
|
|||
|
|
EnableQualityProcessing *bool `json:"enable_quality_processing,omitempty"`
|
|||
|
|
// OCR configuration (None = OCR disabled)
|
|||
|
|
Ocr *OcrConfig `json:"ocr,omitempty"`
|
|||
|
|
// Force OCR even for searchable PDFs
|
|||
|
|
ForceOcr bool `json:"force_ocr"`
|
|||
|
|
// Force OCR on specific pages only (1-indexed page numbers, must be >= 1).
|
|||
|
|
//
|
|||
|
|
// When set, only the listed pages are OCR'd regardless of text layer quality.
|
|||
|
|
// Unlisted pages use native text extraction. Ignored when `force_ocr` is `true`.
|
|||
|
|
// Only applies to PDF documents. Duplicates are automatically deduplicated.
|
|||
|
|
// An `ocr` config is recommended for backend/language selection; defaults are used if absent.
|
|||
|
|
ForceOcrPages []uint32 `json:"force_ocr_pages,omitempty"`
|
|||
|
|
// Disable OCR entirely, even for images.
|
|||
|
|
//
|
|||
|
|
// When `true`, OCR is skipped for all document types. Images return metadata
|
|||
|
|
// only (dimensions, format, EXIF) without text extraction. PDFs use only
|
|||
|
|
// native text extraction without OCR fallback.
|
|||
|
|
//
|
|||
|
|
// Cannot be `true` simultaneously with `force_ocr`.
|
|||
|
|
//
|
|||
|
|
// *Added in v4.7.0.*
|
|||
|
|
DisableOcr bool `json:"disable_ocr"`
|
|||
|
|
// Text chunking configuration (None = chunking disabled)
|
|||
|
|
Chunking *ChunkingConfig `json:"chunking,omitempty"`
|
|||
|
|
// Content filtering configuration (None = use extractor defaults).
|
|||
|
|
//
|
|||
|
|
// Controls whether document "furniture" (headers, footers, watermarks,
|
|||
|
|
// repeating text) is included in or stripped from extraction results.
|
|||
|
|
// See [`ContentFilterConfig`] for per-field documentation.
|
|||
|
|
ContentFilter *ContentFilterConfig `json:"content_filter,omitempty"`
|
|||
|
|
// Image extraction configuration (None = no image extraction)
|
|||
|
|
Images *ImageExtractionConfig `json:"images,omitempty"`
|
|||
|
|
// PDF-specific options (None = use defaults)
|
|||
|
|
PdfOptions *PdfConfig `json:"pdf_options,omitempty"`
|
|||
|
|
// Token reduction configuration (None = no token reduction)
|
|||
|
|
TokenReduction *TokenReductionOptions `json:"token_reduction,omitempty"`
|
|||
|
|
// Language detection configuration (None = no language detection)
|
|||
|
|
LanguageDetection *LanguageDetectionConfig `json:"language_detection,omitempty"`
|
|||
|
|
// Page extraction configuration (None = no page tracking)
|
|||
|
|
Pages *PageConfig `json:"pages,omitempty"`
|
|||
|
|
// Keyword extraction configuration (None = no keyword extraction)
|
|||
|
|
Keywords *KeywordConfig `json:"keywords,omitempty"`
|
|||
|
|
// Post-processor configuration (None = use defaults)
|
|||
|
|
Postprocessor *PostProcessorConfig `json:"postprocessor,omitempty"`
|
|||
|
|
// HTML to Markdown conversion options (None = use defaults)
|
|||
|
|
//
|
|||
|
|
// Configure how HTML documents are converted to Markdown, including heading styles,
|
|||
|
|
// list formatting, code block styles, and preprocessing options.
|
|||
|
|
HTMLOptions *string `json:"html_options,omitempty"`
|
|||
|
|
// Styled HTML output configuration.
|
|||
|
|
//
|
|||
|
|
// When set alongside `output_format = OutputFormat::Html`, the extraction
|
|||
|
|
// pipeline uses [`StyledHtmlRenderer`](crate::rendering::StyledHtmlRenderer)
|
|||
|
|
// which emits stable `kb-*` CSS class hooks on every structural element
|
|||
|
|
// and optionally embeds theme CSS or user-supplied CSS in a `<style>` block.
|
|||
|
|
//
|
|||
|
|
// When `None`, the existing plain comrak-based HTML renderer is used.
|
|||
|
|
HTMLOutput *HTMLOutputConfig `json:"html_output,omitempty"`
|
|||
|
|
// Default per-file timeout in seconds for batch extraction.
|
|||
|
|
//
|
|||
|
|
// When set, each file in a batch will be canceled after this duration
|
|||
|
|
// unless overridden by [`FileExtractionConfig::timeout_secs`].
|
|||
|
|
//
|
|||
|
|
// Defaults to `Some(60)` to prevent pathological files (e.g. deeply
|
|||
|
|
// nested archives, documents with millions of cells) from running
|
|||
|
|
// indefinitely and exhausting caller resources. Set to `None` to
|
|||
|
|
// disable the timeout for trusted input or long-running workloads.
|
|||
|
|
ExtractionTimeoutSecs *uint64 `json:"extraction_timeout_secs,omitempty"`
|
|||
|
|
// Maximum concurrent extractions in batch operations (None = (num_cpus × 1.5).ceil()).
|
|||
|
|
//
|
|||
|
|
// Limits parallelism to prevent resource exhaustion when processing
|
|||
|
|
// large batches. Defaults to (num_cpus × 1.5).ceil() when not set.
|
|||
|
|
MaxConcurrentExtractions *uint `json:"max_concurrent_extractions,omitempty"`
|
|||
|
|
// Result structure format
|
|||
|
|
//
|
|||
|
|
// Controls whether results are returned in unified format (default) with all
|
|||
|
|
// content in the `content` field, or element-based format with semantic
|
|||
|
|
// elements (for Unstructured-compatible output).
|
|||
|
|
ResultFormat ResultFormat `json:"result_format,omitempty"`
|
|||
|
|
// Security limits for archive extraction.
|
|||
|
|
//
|
|||
|
|
// Controls maximum archive size, compression ratio, file count, and other
|
|||
|
|
// security thresholds to prevent decompression bomb attacks. Also caps
|
|||
|
|
// nesting depth, iteration count, entity / token length, total
|
|||
|
|
// content size, and table cell count for every extraction path that
|
|||
|
|
// ingests user-controlled bytes.
|
|||
|
|
// When `None`, default limits are used.
|
|||
|
|
SecurityLimits *SecurityLimits `json:"security_limits,omitempty"`
|
|||
|
|
// Maximum uncompressed size in bytes for a single embedded file before
|
|||
|
|
// recursive extraction is attempted (default: 50 MiB).
|
|||
|
|
//
|
|||
|
|
// Applies to embedded objects inside OOXML containers (DOCX, PPTX) and
|
|||
|
|
// to email attachments processed via recursive extraction. Files that
|
|||
|
|
// exceed this limit are skipped with a `ProcessingWarning` rather than
|
|||
|
|
// passed to the extraction pipeline, preventing a single oversized
|
|||
|
|
// embedded object from consuming unbounded memory or time.
|
|||
|
|
//
|
|||
|
|
// Set to `None` to disable the per-embedded-file cap (falls back to
|
|||
|
|
// `security_limits.max_archive_size` as the only guard).
|
|||
|
|
MaxEmbeddedFileBytes *uint64 `json:"max_embedded_file_bytes,omitempty"`
|
|||
|
|
// Content text format (default: Plain).
|
|||
|
|
//
|
|||
|
|
// Controls the format of the extracted content:
|
|||
|
|
// - `Plain`: Raw extracted text (default)
|
|||
|
|
// - `Markdown`: Markdown formatted output
|
|||
|
|
// - `Djot`: Djot markup format (requires djot feature)
|
|||
|
|
// - `Html`: HTML formatted output
|
|||
|
|
//
|
|||
|
|
// When set to a structured format, extraction results will include
|
|||
|
|
// formatted output. The `formatted_content` field may be populated
|
|||
|
|
// when format conversion is applied.
|
|||
|
|
OutputFormat *OutputFormat `json:"output_format,omitempty"`
|
|||
|
|
// Layout detection configuration (None = layout detection disabled).
|
|||
|
|
//
|
|||
|
|
// When set, PDF pages and images are analyzed for document structure
|
|||
|
|
// (headings, code, formulas, tables, figures, etc.) using RT-DETR models
|
|||
|
|
// via ONNX Runtime. For PDFs, layout hints override paragraph classification
|
|||
|
|
// in the markdown pipeline. For images, per-region OCR is performed with
|
|||
|
|
// markdown formatting based on detected layout classes.
|
|||
|
|
// Requires the `layout-detection` feature to run inference; the field is
|
|||
|
|
// present whenever the `layout-types` feature is active (which includes
|
|||
|
|
// `layout-detection` as well as the no-ORT target groups).
|
|||
|
|
Layout *LayoutDetectionConfig `json:"layout,omitempty"`
|
|||
|
|
// Run layout detection on the non-OCR PDF markdown path.
|
|||
|
|
//
|
|||
|
|
// When `true` and `layout` is `Some(_)`, layout regions inform heading,
|
|||
|
|
// table, list, and figure detection in the structure pipeline that would
|
|||
|
|
// otherwise rely on font-clustering heuristics alone. Significantly
|
|||
|
|
// improves SF1 (structural F1) at the cost of inference latency
|
|||
|
|
// (~150-300ms/page CPU, ~20-50ms/page GPU). Default: `false`.
|
|||
|
|
// Requires the `layout-detection` feature.
|
|||
|
|
UseLayoutForMarkdown bool `json:"use_layout_for_markdown"`
|
|||
|
|
// Enable structured document tree output.
|
|||
|
|
//
|
|||
|
|
// When true, populates the `document` field on `ExtractionResult` with a
|
|||
|
|
// hierarchical `DocumentStructure` containing heading-driven section nesting,
|
|||
|
|
// table grids, content layer classification, and inline annotations.
|
|||
|
|
//
|
|||
|
|
// Independent of `result_format` — can be combined with Unified or ElementBased.
|
|||
|
|
IncludeDocumentStructure bool `json:"include_document_structure"`
|
|||
|
|
// Hardware acceleration configuration for ONNX Runtime models.
|
|||
|
|
//
|
|||
|
|
// Controls execution provider selection for layout detection and embedding
|
|||
|
|
// models. When `None`, uses platform defaults (CoreML on macOS, CUDA on
|
|||
|
|
// Linux, CPU on Windows).
|
|||
|
|
Acceleration *AccelerationConfig `json:"acceleration,omitempty"`
|
|||
|
|
// Cache namespace for tenant isolation.
|
|||
|
|
//
|
|||
|
|
// When set, cache entries are stored under `{cache_dir}/{namespace}/`.
|
|||
|
|
// Must be alphanumeric, hyphens, or underscores only (max 64 chars).
|
|||
|
|
// Different namespaces have isolated cache spaces on the same filesystem.
|
|||
|
|
CacheNamespace *string `json:"cache_namespace,omitempty"`
|
|||
|
|
// Per-request cache TTL in seconds.
|
|||
|
|
//
|
|||
|
|
// Overrides the global `max_age_days` for this specific extraction.
|
|||
|
|
// When `0`, caching is completely skipped (no read or write).
|
|||
|
|
// When `None`, the global TTL applies.
|
|||
|
|
CacheTTLSecs *uint64 `json:"cache_ttl_secs,omitempty"`
|
|||
|
|
// Email extraction configuration (None = use defaults).
|
|||
|
|
//
|
|||
|
|
// Currently supports configuring the fallback codepage for MSG files
|
|||
|
|
// that do not specify one. See `EmailConfig` for details.
|
|||
|
|
Email *EmailConfig `json:"email,omitempty"`
|
|||
|
|
// Concurrency limits for constrained environments (None = use defaults).
|
|||
|
|
//
|
|||
|
|
// Controls Rayon thread pool size, ONNX Runtime intra-op threads, and
|
|||
|
|
// (when `max_concurrent_extractions` is unset) the batch concurrency
|
|||
|
|
// semaphore. See `ConcurrencyConfig` for details.
|
|||
|
|
Concurrency *string `json:"concurrency,omitempty"`
|
|||
|
|
// Maximum recursion depth for archive extraction (default: 3).
|
|||
|
|
// Set to 0 to disable recursive extraction (legacy behavior).
|
|||
|
|
MaxArchiveDepth uint `json:"max_archive_depth"`
|
|||
|
|
// Tree-sitter language pack configuration (None = tree-sitter disabled).
|
|||
|
|
//
|
|||
|
|
// When set, enables code file extraction using tree-sitter parsers.
|
|||
|
|
// Controls grammar download behavior and code analysis options.
|
|||
|
|
TreeSitter *TreeSitterConfig `json:"tree_sitter,omitempty"`
|
|||
|
|
// Structured extraction via LLM (None = disabled).
|
|||
|
|
//
|
|||
|
|
// When set, the extracted document content is sent to an LLM with the
|
|||
|
|
// provided JSON schema. The structured response is stored in
|
|||
|
|
// `ExtractionResult::structured_output`.
|
|||
|
|
StructuredExtraction *StructuredExtractionConfig `json:"structured_extraction,omitempty"`
|
|||
|
|
// Cancellation token for this extraction (None = no external cancellation).
|
|||
|
|
//
|
|||
|
|
// Pass a [`CancellationToken`] clone here and call [`CancellationToken::cancel`]
|
|||
|
|
// from another thread / task to abort the extraction in progress. The extractor
|
|||
|
|
// checks the token at safe checkpoints (before lock acquisition, between pages,
|
|||
|
|
// between batch items) and returns [`KreuzbergError::Cancelled`] when set.
|
|||
|
|
//
|
|||
|
|
// The field is excluded from serialization because `CancellationToken` is a
|
|||
|
|
// runtime handle, not a configuration value.
|
|||
|
|
CancelToken *string `json:"cancel_token,omitempty"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// FileExtractionConfig per-file extraction configuration overrides for batch processing.
|
|||
|
|
//
|
|||
|
|
// All fields are `Option<T>` — `None` means "use the batch-level default."
|
|||
|
|
// This type is used with `batch_extract_files` and
|
|||
|
|
// `batch_extract_bytes` to allow heterogeneous
|
|||
|
|
// extraction settings within a single batch.
|
|||
|
|
//
|
|||
|
|
// # Excluded Fields
|
|||
|
|
//
|
|||
|
|
// The following `ExtractionConfig` fields are batch-level only and
|
|||
|
|
// cannot be overridden per file:
|
|||
|
|
// - `max_concurrent_extractions` — controls batch parallelism
|
|||
|
|
// - `use_cache` — global caching policy
|
|||
|
|
// - `acceleration` — shared ONNX execution provider
|
|||
|
|
// - `security_limits` — global archive security policy
|
|||
|
|
//
|
|||
|
|
// Example:
|
|||
|
|
//
|
|||
|
|
// // Override just OCR forcing for a specific file
|
|||
|
|
// let config = FileExtractionConfig {
|
|||
|
|
// force_ocr: Some(true),
|
|||
|
|
// ..Default::default()
|
|||
|
|
// };
|
|||
|
|
type FileExtractionConfig struct {
|
|||
|
|
// Override quality post-processing for this file.
|
|||
|
|
EnableQualityProcessing *bool `json:"enable_quality_processing,omitempty"`
|
|||
|
|
// Override OCR configuration for this file (None in the Option = use batch default).
|
|||
|
|
Ocr *OcrConfig `json:"ocr,omitempty"`
|
|||
|
|
// Override force OCR for this file.
|
|||
|
|
ForceOcr *bool `json:"force_ocr,omitempty"`
|
|||
|
|
// Override force OCR pages for this file (1-indexed page numbers).
|
|||
|
|
ForceOcrPages []uint32 `json:"force_ocr_pages,omitempty"`
|
|||
|
|
// Override disable OCR for this file.
|
|||
|
|
DisableOcr *bool `json:"disable_ocr,omitempty"`
|
|||
|
|
// Override chunking configuration for this file.
|
|||
|
|
Chunking *ChunkingConfig `json:"chunking,omitempty"`
|
|||
|
|
// Override content filtering configuration for this file.
|
|||
|
|
ContentFilter *ContentFilterConfig `json:"content_filter,omitempty"`
|
|||
|
|
// Override image extraction configuration for this file.
|
|||
|
|
Images *ImageExtractionConfig `json:"images,omitempty"`
|
|||
|
|
// Override PDF options for this file.
|
|||
|
|
PdfOptions *PdfConfig `json:"pdf_options,omitempty"`
|
|||
|
|
// Override token reduction for this file.
|
|||
|
|
TokenReduction *TokenReductionOptions `json:"token_reduction,omitempty"`
|
|||
|
|
// Override language detection for this file.
|
|||
|
|
LanguageDetection *LanguageDetectionConfig `json:"language_detection,omitempty"`
|
|||
|
|
// Override page extraction for this file.
|
|||
|
|
Pages *PageConfig `json:"pages,omitempty"`
|
|||
|
|
// Override keyword extraction for this file.
|
|||
|
|
Keywords *KeywordConfig `json:"keywords,omitempty"`
|
|||
|
|
// Override post-processor for this file.
|
|||
|
|
Postprocessor *PostProcessorConfig `json:"postprocessor,omitempty"`
|
|||
|
|
// Override HTML conversion options for this file.
|
|||
|
|
HTMLOptions *string `json:"html_options,omitempty"`
|
|||
|
|
// Override result format for this file.
|
|||
|
|
ResultFormat *ResultFormat `json:"result_format,omitempty"`
|
|||
|
|
// Override output content format for this file.
|
|||
|
|
OutputFormat *OutputFormat `json:"output_format,omitempty"`
|
|||
|
|
// Override document structure output for this file.
|
|||
|
|
IncludeDocumentStructure *bool `json:"include_document_structure,omitempty"`
|
|||
|
|
// Override layout detection for this file.
|
|||
|
|
Layout *LayoutDetectionConfig `json:"layout,omitempty"`
|
|||
|
|
// Override per-file extraction timeout in seconds.
|
|||
|
|
//
|
|||
|
|
// When set, the extraction for this file will be canceled after the
|
|||
|
|
// specified duration. A timed-out file produces an error result without
|
|||
|
|
// affecting other files in the batch.
|
|||
|
|
TimeoutSecs *uint64 `json:"timeout_secs,omitempty"`
|
|||
|
|
// Override tree-sitter configuration for this file.
|
|||
|
|
TreeSitter *TreeSitterConfig `json:"tree_sitter,omitempty"`
|
|||
|
|
// Override structured extraction configuration for this file.
|
|||
|
|
//
|
|||
|
|
// When set, enables LLM-based structured extraction with a JSON schema
|
|||
|
|
// for this specific file. The extracted content is sent to a VLM/LLM
|
|||
|
|
// and the response is parsed according to the provided schema.
|
|||
|
|
StructuredExtraction *StructuredExtractionConfig `json:"structured_extraction,omitempty"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// BatchBytesItem batch item for byte array extraction.
|
|||
|
|
//
|
|||
|
|
// Used with `batch_extract_bytes` and `batch_extract_bytes_sync`
|
|||
|
|
// to represent a single item in a batch extraction job.
|
|||
|
|
type BatchBytesItem struct {
|
|||
|
|
// The content bytes to extract from
|
|||
|
|
Content []byte `json:"content"`
|
|||
|
|
// MIME type of the content (e.g., "application/pdf", "text/html")
|
|||
|
|
MimeType string `json:"mime_type"`
|
|||
|
|
// Per-item configuration overrides (None uses batch-level defaults)
|
|||
|
|
Config *FileExtractionConfig `json:"config,omitempty"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// MarshalJSON serializes `[]byte` fields as a JSON array of integers (the format
|
|||
|
|
// Rust's serde `Vec<u8>` deserializer expects) instead of Go's default base64 string.
|
|||
|
|
func (v BatchBytesItem) MarshalJSON() ([]byte, error) {
|
|||
|
|
// Explicit shadow struct listing every field — embedding the original
|
|||
|
|
// would cause both base64-string and int-array entries for the same JSON
|
|||
|
|
// key. Bytes fields rendered as `[]int`; everything else copied verbatim.
|
|||
|
|
aux := struct {
|
|||
|
|
Content []int `json:"content"`
|
|||
|
|
MimeType string `json:"mime_type"`
|
|||
|
|
Config *FileExtractionConfig `json:"config,omitempty"`
|
|||
|
|
}{}
|
|||
|
|
aux.Content = make([]int, len(v.Content))
|
|||
|
|
for i, b := range v.Content {
|
|||
|
|
aux.Content[i] = int(b)
|
|||
|
|
}
|
|||
|
|
aux.MimeType = v.MimeType
|
|||
|
|
aux.Config = v.Config
|
|||
|
|
return json.Marshal(aux)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// BatchFileItem batch item for file extraction.
|
|||
|
|
//
|
|||
|
|
// Used with `batch_extract_files` and `batch_extract_files_sync`
|
|||
|
|
// to represent a single file in a batch extraction job.
|
|||
|
|
type BatchFileItem struct {
|
|||
|
|
// Path to the file to extract from
|
|||
|
|
Path string `json:"path"`
|
|||
|
|
// Per-file configuration overrides (None uses batch-level defaults)
|
|||
|
|
Config *FileExtractionConfig `json:"config,omitempty"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// ImageExtractionConfig holds the image extraction configuration.
//
// Pointer fields are omitted from the JSON when nil; plain bool fields are
// always emitted.
type ImageExtractionConfig struct {
	// Extract images from documents.
	ExtractImages *bool `json:"extract_images,omitempty"`
	// Target DPI for image normalization.
	TargetDpi *int32 `json:"target_dpi,omitempty"`
	// Maximum dimension for images (width or height).
	MaxImageDimension *int32 `json:"max_image_dimension,omitempty"`
	// Whether to inject image reference placeholders into markdown output.
	// When `true` (default), image references are appended to the markdown.
	// Set to `false` to extract images as data without polluting the
	// markdown output.
	//
	// NOTE(review): the literal example placeholder was lost from this
	// comment (it rendered as empty backticks); restore it from the upstream
	// documentation.
	InjectPlaceholders *bool `json:"inject_placeholders,omitempty"`
	// Automatically adjust DPI based on image content.
	AutoAdjustDpi *bool `json:"auto_adjust_dpi,omitempty"`
	// Minimum DPI threshold.
	MinDpi *int32 `json:"min_dpi,omitempty"`
	// Maximum DPI threshold.
	MaxDpi *int32 `json:"max_dpi,omitempty"`
	// Maximum number of image objects to extract per PDF page.
	//
	// Some PDFs (e.g. technical diagrams stored as thousands of raster
	// fragments) can trigger extremely long or indefinite extraction times
	// when every image object on a dense page is decoded individually via the
	// PDF extractor. Setting this limit causes kreuzberg to stop collecting
	// individual images once the count per page reaches the cap and emit a
	// warning instead.
	//
	// `None` (default, nil here) means no limit — all images are extracted.
	MaxImagesPerPage *uint32 `json:"max_images_per_page,omitempty"`
	// When `true` (default), extracted images are classified by kind and
	// grouped into clusters where they appear to belong to one figure.
	Classify *bool `json:"classify,omitempty"`
	// When `true`, full-page renders produced during OCR preprocessing are
	// captured and returned as `ImageKind::PageRaster` entries in
	// `ExtractionResult.images`.
	//
	// **PDF + OCR only.** No rasters are captured for non-PDF inputs or when
	// the document-level OCR bypass is active (whole-document backend). When
	// OCR is enabled and this flag is set but the active backend skips
	// per-page rendering, a `ProcessingWarning` is emitted in
	// `ExtractionResult.processing_warnings`.
	//
	// Defaults to `false`. Enable when downstream consumers need page
	// thumbnails (e.g. citation previews, visual grounding).
	IncludePageRasters bool `json:"include_page_rasters"`
	// Run OCR on extracted images and include the recognized text in the
	// document content.
	//
	// When `true` (default) and `ExtractionConfig.ocr` is configured,
	// extracted images are processed with the configured OCR backend. Set to
	// `false` to extract images without OCR processing, even when OCR is
	// enabled.
	RunOcrOnImages *bool `json:"run_ocr_on_images,omitempty"`
	// When `true`, image OCR results are rendered as plain text without the
	// markdown image placeholder. Only takes effect when `run_ocr_on_images`
	// is also `true`.
	//
	// NOTE(review): the literal placeholder example was lost from this
	// comment (it rendered as empty backticks); restore it from the upstream
	// documentation.
	OcrTextOnly bool `json:"ocr_text_only"`
	// When `true` and `ocr_text_only` is `false`, append the OCR text after
	// the image placeholder in the rendered output.
	AppendOcrText bool `json:"append_ocr_text"`
}
|
|||
|
|
|
|||
|
|
// TokenReductionOptions holds the token reduction configuration.
type TokenReductionOptions struct {
	// Reduction mode: "off", "light", "moderate", "aggressive", "maximum".
	Mode string `json:"mode"`
	// Preserve important words (capitalized, technical terms).
	// Omitted from the JSON when nil.
	PreserveImportantWords *bool `json:"preserve_important_words,omitempty"`
}
|
|||
|
|
|
|||
|
|
// LanguageDetectionConfig holds the language detection configuration.
type LanguageDetectionConfig struct {
	// Enable language detection. Omitted from the JSON when nil.
	Enabled *bool `json:"enabled,omitempty"`
	// Minimum confidence threshold (0.0-1.0). Omitted from the JSON when nil.
	MinConfidence *float64 `json:"min_confidence,omitempty"`
	// Detect multiple languages in the document. Always emitted.
	DetectMultiple bool `json:"detect_multiple"`
}
|
|||
|
|
|
|||
|
|
// HTMLOutputConfig configuration for styled HTML output.
|
|||
|
|
//
|
|||
|
|
// When set on [`ExtractionConfig::html_output`] alongside
|
|||
|
|
// `output_format = OutputFormat::Html`, the pipeline builds a
|
|||
|
|
// [`StyledHtmlRenderer`](crate::rendering::StyledHtmlRenderer) instead of
|
|||
|
|
// the plain comrak-based renderer.
|
|||
|
|
//
|
|||
|
|
// Example:
|
|||
|
|
//
|
|||
|
|
// let config = HtmlOutputConfig {
|
|||
|
|
// theme: HtmlTheme::GitHub,
|
|||
|
|
// css: Some(".kb-p { font-size: 1.1rem; }".to_string()),
|
|||
|
|
// ..Default::default()
|
|||
|
|
// };
|
|||
|
|
type HTMLOutputConfig struct {
|
|||
|
|
// Inline CSS string injected into the output after the theme stylesheet.
|
|||
|
|
// Concatenated after `css_file` content when both are set.
|
|||
|
|
CSS *string `json:"css,omitempty"`
|
|||
|
|
// Path to a CSS file loaded once at renderer construction time.
|
|||
|
|
// Concatenated before `css` when both are set.
|
|||
|
|
CSSFile *string `json:"css_file,omitempty"`
|
|||
|
|
// Built-in colour/typography theme. Default: [`HtmlTheme::Unstyled`].
|
|||
|
|
Theme *HTMLTheme `json:"theme,omitempty"`
|
|||
|
|
// CSS class prefix applied to every emitted class name.
|
|||
|
|
//
|
|||
|
|
// Default: `"kb-"`. Change this if your host application already uses
|
|||
|
|
// classes that start with `kb-`.
|
|||
|
|
ClassPrefix string `json:"class_prefix"`
|
|||
|
|
// When `true` (default), write the resolved CSS into a `<style>` block
|
|||
|
|
// immediately after the opening `<div class="{prefix}doc">`.
|
|||
|
|
//
|
|||
|
|
// Set to `false` to emit only the structural markup and wire up your
|
|||
|
|
// own stylesheet targeting the `kb-*` class names.
|
|||
|
|
EmbedCSS *bool `json:"embed_css,omitempty"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// LayoutDetectionConfig layout detection configuration.
|
|||
|
|
//
|
|||
|
|
// Controls layout detection behavior in the extraction pipeline.
|
|||
|
|
// When set on [`ExtractionConfig`](super::ExtractionConfig), layout detection
|
|||
|
|
// is enabled for PDF extraction.
|
|||
|
|
type LayoutDetectionConfig struct {
|
|||
|
|
// Confidence threshold override (None = use model default).
|
|||
|
|
ConfidenceThreshold *float32 `json:"confidence_threshold,omitempty"`
|
|||
|
|
// Whether to apply postprocessing heuristics (default: true).
|
|||
|
|
ApplyHeuristics *bool `json:"apply_heuristics,omitempty"`
|
|||
|
|
// Table structure recognition model.
|
|||
|
|
//
|
|||
|
|
// Controls which model is used for table cell detection within layout-detected
|
|||
|
|
// table regions. Defaults to [`TableModel::Tatr`].
|
|||
|
|
TableModel TableModel `json:"table_model,omitempty"`
|
|||
|
|
// Hardware acceleration for ONNX models (layout detection + table structure).
|
|||
|
|
//
|
|||
|
|
// When set, controls which execution provider (CPU, CUDA, CoreML, TensorRT)
|
|||
|
|
// is used for inference. Defaults to `None` (auto-select per platform).
|
|||
|
|
Acceleration *AccelerationConfig `json:"acceleration,omitempty"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// LlmConfig is the configuration for an LLM provider/model via liter-llm.
//
// Each feature (VLM OCR, VLM embeddings, structured extraction) carries
// its own `LlmConfig`, allowing different providers per feature.
//
// Example (TOML configuration):
//
//	[structured_extraction.llm]
//	model = "openai/gpt-4o"
//	api_key = "sk-..."  # or use KREUZBERG_LLM_API_KEY env var
type LlmConfig struct {
	// Provider/model string using liter-llm routing format.
	//
	// Examples: `"openai/gpt-4o"`, `"anthropic/claude-sonnet-4-20250514"`,
	// `"groq/llama-3.1-70b-versatile"`.
	Model string `json:"model"`
	// API key for the provider. When `None`, liter-llm falls back to
	// the provider's standard environment variable (e.g., `OPENAI_API_KEY`).
	APIKey *string `json:"api_key,omitempty"`
	// Custom base URL override for the provider endpoint.
	BaseURL *string `json:"base_url,omitempty"`
	// Request timeout in seconds (default: 60).
	TimeoutSecs *uint64 `json:"timeout_secs,omitempty"`
	// Maximum retry attempts (default: 3).
	MaxRetries *uint32 `json:"max_retries,omitempty"`
	// Sampling temperature for generation tasks.
	Temperature *float64 `json:"temperature,omitempty"`
	// Maximum tokens to generate.
	MaxTokens *uint64 `json:"max_tokens,omitempty"`
}
|
|||
|
|
|
|||
|
|
// StructuredExtractionConfig configuration for LLM-based structured data extraction.
|
|||
|
|
//
|
|||
|
|
// Sends extracted document content to a VLM with a JSON schema,
|
|||
|
|
// returning structured data that conforms to the schema.
|
|||
|
|
//
|
|||
|
|
// Example:
|
|||
|
|
//
|
|||
|
|
// [structured_extraction]
|
|||
|
|
// schema_name = "invoice_data"
|
|||
|
|
// strict = true
|
|||
|
|
//
|
|||
|
|
// [structured_extraction.schema]
|
|||
|
|
// type = "object"
|
|||
|
|
// properties.vendor = { type = "string" }
|
|||
|
|
// properties.total = { type = "number" }
|
|||
|
|
// required = ["vendor", "total"]
|
|||
|
|
//
|
|||
|
|
// [structured_extraction.llm]
|
|||
|
|
// model = "openai/gpt-4o"
|
|||
|
|
type StructuredExtractionConfig struct {
|
|||
|
|
// JSON Schema defining the desired output structure.
|
|||
|
|
Schema json.RawMessage `json:"schema"`
|
|||
|
|
// Schema name passed to the LLM's structured output mode.
|
|||
|
|
SchemaName string `json:"schema_name"`
|
|||
|
|
// Optional schema description for the LLM.
|
|||
|
|
SchemaDescription *string `json:"schema_description,omitempty"`
|
|||
|
|
// Enable strict mode — output must exactly match the schema.
|
|||
|
|
Strict bool `json:"strict"`
|
|||
|
|
// Custom Jinja2 extraction prompt template. When `None`, a default template is used.
|
|||
|
|
//
|
|||
|
|
// Available template variables:
|
|||
|
|
// - `{{ content }}` — The extracted document text.
|
|||
|
|
// - `{{ schema }}` — The JSON schema as a formatted string.
|
|||
|
|
// - `{{ schema_name }}` — The schema name.
|
|||
|
|
// - `{{ schema_description }}` — The schema description (may be empty).
|
|||
|
|
Prompt *string `json:"prompt,omitempty"`
|
|||
|
|
// LLM configuration for the extraction.
|
|||
|
|
Llm LlmConfig `json:"llm"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// OcrQualityThresholds holds the quality thresholds for OCR fallback
// decisions and pipeline quality gating.
//
// All fields default to the values that match the previous hardcoded behavior,
// so `OcrQualityThresholds::default()` preserves existing semantics exactly.
// Every field is a pointer and is omitted from the JSON when nil.
type OcrQualityThresholds struct {
	// Minimum total non-whitespace characters to consider text substantive.
	MinTotalNonWhitespace *uint `json:"min_total_non_whitespace,omitempty"`
	// Minimum non-whitespace characters per page on average.
	MinNonWhitespacePerPage *float64 `json:"min_non_whitespace_per_page,omitempty"`
	// Minimum character count for a word to be "meaningful".
	MinMeaningfulWordLen *uint `json:"min_meaningful_word_len,omitempty"`
	// Minimum count of meaningful words before text is accepted.
	MinMeaningfulWords *uint `json:"min_meaningful_words,omitempty"`
	// Minimum alphanumeric ratio (non-whitespace chars that are alphanumeric).
	MinAlnumRatio *float64 `json:"min_alnum_ratio,omitempty"`
	// Minimum Unicode replacement characters (U+FFFD) to trigger OCR fallback.
	MinGarbageChars *uint `json:"min_garbage_chars,omitempty"`
	// Maximum fraction of short (1-2 char) words before text is considered
	// fragmented.
	MaxFragmentedWordRatio *float64 `json:"max_fragmented_word_ratio,omitempty"`
	// Critical fragmentation threshold — triggers OCR regardless of
	// meaningful words. Normal English text has ~20-30% short words. 80%+ is
	// definitive garbage.
	CriticalFragmentedWordRatio *float64 `json:"critical_fragmented_word_ratio,omitempty"`
	// Minimum average word length. Below this with enough words indicates
	// garbled extraction.
	MinAvgWordLength *float64 `json:"min_avg_word_length,omitempty"`
	// Minimum word count before average word length check applies.
	MinWordsForAvgLengthCheck *uint `json:"min_words_for_avg_length_check,omitempty"`
	// Minimum consecutive word repetition ratio to detect column scrambling.
	MinConsecutiveRepeatRatio *float64 `json:"min_consecutive_repeat_ratio,omitempty"`
	// Minimum word count before consecutive repetition check is applied.
	MinWordsForRepeatCheck *uint `json:"min_words_for_repeat_check,omitempty"`
	// Minimum character count for "substantive markdown" OCR skip gate.
	SubstantiveMinChars *uint `json:"substantive_min_chars,omitempty"`
	// Minimum character count for "non-text content" OCR skip gate.
	NonTextMinChars *uint `json:"non_text_min_chars,omitempty"`
	// Alphanumeric+whitespace ratio threshold for skip decisions.
	AlnumWsRatioThreshold *float64 `json:"alnum_ws_ratio_threshold,omitempty"`
	// Minimum quality score (0.0-1.0) for a pipeline stage result to be
	// accepted. If the result from a backend scores below this, try the next
	// backend.
	PipelineMinQuality *float64 `json:"pipeline_min_quality,omitempty"`
}
|
|||
|
|
|
|||
|
|
// OcrPipelineStage single backend stage in the OCR pipeline.
|
|||
|
|
type OcrPipelineStage struct {
|
|||
|
|
// Backend name: "tesseract", "paddleocr", "easyocr", or a custom registered name.
|
|||
|
|
Backend string `json:"backend"`
|
|||
|
|
// Priority weight (higher = tried first). Stages are sorted by priority descending.
|
|||
|
|
Priority uint32 `json:"priority"`
|
|||
|
|
// Language override for this stage (None = use parent OcrConfig.language).
|
|||
|
|
Language *string `json:"language,omitempty"`
|
|||
|
|
// Tesseract-specific config override for this stage.
|
|||
|
|
TesseractConfig *TesseractConfig `json:"tesseract_config,omitempty"`
|
|||
|
|
// PaddleOCR-specific config for this stage.
|
|||
|
|
PaddleOcrConfig *json.RawMessage `json:"paddle_ocr_config,omitempty"`
|
|||
|
|
// VLM config override for this pipeline stage.
|
|||
|
|
VlmConfig *LlmConfig `json:"vlm_config,omitempty"`
|
|||
|
|
// Arbitrary per-call options passed through to the backend unchanged.
|
|||
|
|
//
|
|||
|
|
// Backends that support runtime tuning (mode switching, preprocessing
|
|||
|
|
// flags, inference parameters, etc.) read this value and deserialize
|
|||
|
|
// the keys they care about. Keys unknown to the backend are silently
|
|||
|
|
// ignored, so options from different backends can coexist in the same
|
|||
|
|
// config without conflict.
|
|||
|
|
//
|
|||
|
|
// Example (custom backend):
|
|||
|
|
// ```json
|
|||
|
|
// { "mode": "fast", "enable_layout": true }
|
|||
|
|
// ```
|
|||
|
|
BackendOptions *json.RawMessage `json:"backend_options,omitempty"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// OcrPipelineConfig multi-backend OCR pipeline with quality-based fallback.
|
|||
|
|
//
|
|||
|
|
// Backends are tried in priority order (highest first). After each backend
|
|||
|
|
// produces output, quality is evaluated. If it meets `quality_thresholds.pipeline_min_quality`,
|
|||
|
|
// the result is accepted. Otherwise the next backend is tried.
|
|||
|
|
type OcrPipelineConfig struct {
|
|||
|
|
// Ordered list of backends to try. Sorted by priority (descending) at runtime.
|
|||
|
|
Stages []OcrPipelineStage `json:"stages,omitempty"`
|
|||
|
|
// Quality thresholds for deciding whether to accept a result or try the next backend.
|
|||
|
|
QualityThresholds OcrQualityThresholds `json:"quality_thresholds"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// OcrConfig oCR configuration.
|
|||
|
|
type OcrConfig struct {
|
|||
|
|
// Whether OCR is enabled.
|
|||
|
|
//
|
|||
|
|
// Setting `enabled: false` is a shorthand for `disable_ocr: true` on the parent
|
|||
|
|
// [`ExtractionConfig`](crate::core::config::ExtractionConfig). Images return
|
|||
|
|
// metadata only; PDFs use native text extraction without OCR fallback.
|
|||
|
|
//
|
|||
|
|
// Defaults to `true`. When `false`, all other OCR settings are ignored.
|
|||
|
|
Enabled *bool `json:"enabled,omitempty"`
|
|||
|
|
// OCR backend: tesseract, easyocr, paddleocr
|
|||
|
|
Backend string `json:"backend"`
|
|||
|
|
// Language code (e.g., "eng", "deu")
|
|||
|
|
Language string `json:"language"`
|
|||
|
|
// Tesseract-specific configuration (optional)
|
|||
|
|
TesseractConfig *TesseractConfig `json:"tesseract_config,omitempty"`
|
|||
|
|
// Output format for OCR results (optional, for format conversion)
|
|||
|
|
OutputFormat *OutputFormat `json:"output_format,omitempty"`
|
|||
|
|
// PaddleOCR-specific configuration (optional, JSON passthrough)
|
|||
|
|
PaddleOcrConfig *json.RawMessage `json:"paddle_ocr_config,omitempty"`
|
|||
|
|
// Arbitrary per-call options passed through to the backend unchanged.
|
|||
|
|
//
|
|||
|
|
// Custom OCR backends and built-in backends that support runtime tuning
|
|||
|
|
// can read this value and deserialize the keys they care about. Keys
|
|||
|
|
// unknown to the backend are silently ignored.
|
|||
|
|
//
|
|||
|
|
// This is the recommended extension point for per-call parameters that
|
|||
|
|
// are not covered by the typed fields above (e.g. mode switching,
|
|||
|
|
// preprocessing flags, inference batch size).
|
|||
|
|
//
|
|||
|
|
// **Scope:** when `pipeline` is `None`, this value is propagated to the
|
|||
|
|
// primary stage of the auto-constructed pipeline. When `pipeline` is
|
|||
|
|
// explicitly set, this field has **no effect** — the caller must set
|
|||
|
|
// `OcrPipelineStage.backend_options` directly on the relevant stage(s)
|
|||
|
|
// instead.
|
|||
|
|
//
|
|||
|
|
// Example:
|
|||
|
|
// ```json
|
|||
|
|
// { "mode": "fast", "enable_layout": true, "timeout_ms": 5000 }
|
|||
|
|
// ```
|
|||
|
|
BackendOptions *json.RawMessage `json:"backend_options,omitempty"`
|
|||
|
|
// OCR element extraction configuration
|
|||
|
|
ElementConfig *OcrElementConfig `json:"element_config,omitempty"`
|
|||
|
|
// Quality thresholds for the native-text-to-OCR fallback decision.
|
|||
|
|
// When None, uses compiled defaults (matching previous hardcoded behavior).
|
|||
|
|
QualityThresholds *OcrQualityThresholds `json:"quality_thresholds,omitempty"`
|
|||
|
|
// Multi-backend OCR pipeline configuration. When set, enables weighted
|
|||
|
|
// fallback across multiple OCR backends based on output quality.
|
|||
|
|
// When None, uses the single `backend` field (same as today).
|
|||
|
|
Pipeline *OcrPipelineConfig `json:"pipeline,omitempty"`
|
|||
|
|
// Enable automatic page rotation based on orientation detection.
|
|||
|
|
//
|
|||
|
|
// When enabled, uses Tesseract's `DetectOrientationScript()` to detect
|
|||
|
|
// page orientation (0/90/180/270 degrees) before OCR. If the page is
|
|||
|
|
// rotated with high confidence, the image is corrected before recognition.
|
|||
|
|
// This is critical for handling rotated scanned documents.
|
|||
|
|
AutoRotate bool `json:"auto_rotate"`
|
|||
|
|
// VLM (Vision Language Model) OCR configuration.
|
|||
|
|
//
|
|||
|
|
// Required when `backend` is `"vlm"`. Uses liter-llm to send page
|
|||
|
|
// images to a vision model for text extraction.
|
|||
|
|
VlmConfig *LlmConfig `json:"vlm_config,omitempty"`
|
|||
|
|
// Custom Jinja2 prompt template for VLM OCR.
|
|||
|
|
//
|
|||
|
|
// When `None`, uses the default template. Available variables:
|
|||
|
|
// - `{{ language }}` — The document language code (e.g., "eng", "deu").
|
|||
|
|
VlmPrompt *string `json:"vlm_prompt,omitempty"`
|
|||
|
|
// Hardware acceleration for ONNX Runtime models (e.g. PaddleOCR, layout detection).
|
|||
|
|
//
|
|||
|
|
// Not user-configurable via config files — injected at runtime from
|
|||
|
|
// `ExtractionConfig::acceleration` before each `process_image` call.
|
|||
|
|
Acceleration *AccelerationConfig `json:"acceleration,omitempty"`
|
|||
|
|
// Caller-supplied Tesseract `traineddata` bytes per language code.
|
|||
|
|
//
|
|||
|
|
// Primary use case is the WASM build, which has no filesystem and cannot
|
|||
|
|
// download tessdata at runtime. Native builds typically rely on
|
|||
|
|
// `TessdataManager` and ignore this field. When present, the WASM
|
|||
|
|
// Tesseract backend prefers these bytes over its compile-time-bundled
|
|||
|
|
// English data.
|
|||
|
|
//
|
|||
|
|
// Skipped by serde to keep config files small — supply via the typed API
|
|||
|
|
// at runtime.
|
|||
|
|
TessdataBytes map[string][]byte `json:"tessdata_bytes,omitempty"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// PageConfig holds the page extraction and tracking configuration.
//
// Controls how pages are extracted, tracked, and represented in the
// extraction results. When `None`, page tracking is disabled.
//
// Page range tracking in chunk metadata (first_page/last_page) is
// automatically enabled when page boundaries are available and chunking is
// configured.
type PageConfig struct {
	// Extract pages as separate array (ExtractionResult.pages).
	ExtractPages bool `json:"extract_pages"`
	// Insert page markers in main content string.
	InsertPageMarkers bool `json:"insert_page_markers"`
	// Page marker format (use {page_num} placeholder).
	// Default: "\n\n<!-- PAGE {page_num} -->\n\n"
	MarkerFormat *string `json:"marker_format,omitempty"`
}
|
|||
|
|
|
|||
|
|
// PdfConfig pDF-specific configuration.
|
|||
|
|
type PdfConfig struct {
|
|||
|
|
// Extract images from PDF
|
|||
|
|
ExtractImages bool `json:"extract_images"`
|
|||
|
|
// Extract tables from PDF.
|
|||
|
|
//
|
|||
|
|
// When `true` (default), runs pdf_oxide's native grid detector and, if it
|
|||
|
|
// finds nothing, falls back to the heuristic text-layer reconstruction in
|
|||
|
|
// `pdf::oxide::table::extract_tables_heuristic`. Set to `false` to skip
|
|||
|
|
// both passes — `tables` will then be empty in the result.
|
|||
|
|
ExtractTables *bool `json:"extract_tables,omitempty"`
|
|||
|
|
// List of passwords to try when opening encrypted PDFs
|
|||
|
|
Passwords []string `json:"passwords,omitempty"`
|
|||
|
|
// Extract PDF metadata
|
|||
|
|
ExtractMetadata *bool `json:"extract_metadata,omitempty"`
|
|||
|
|
// Hierarchy extraction configuration (None = hierarchy extraction disabled)
|
|||
|
|
Hierarchy *HierarchyConfig `json:"hierarchy,omitempty"`
|
|||
|
|
// Extract PDF annotations (text notes, highlights, links, stamps).
|
|||
|
|
// Default: false
|
|||
|
|
ExtractAnnotations bool `json:"extract_annotations"`
|
|||
|
|
// Top margin fraction (0.0–1.0) of page height to exclude headers/running heads.
|
|||
|
|
// Default: 0.06 (6%)
|
|||
|
|
TopMarginFraction *float32 `json:"top_margin_fraction,omitempty"`
|
|||
|
|
// Bottom margin fraction (0.0–1.0) of page height to exclude footers/page numbers.
|
|||
|
|
// Default: 0.05 (5%)
|
|||
|
|
BottomMarginFraction *float32 `json:"bottom_margin_fraction,omitempty"`
|
|||
|
|
// Allow single-column pseudo tables in extraction results.
|
|||
|
|
//
|
|||
|
|
// By default, tables with fewer than 2 columns (layout-guided) or 3 columns
|
|||
|
|
// (heuristic) are rejected. When `true`, the minimum column count is relaxed
|
|||
|
|
// to 1, allowing single-column structured data (glossaries, itemized lists)
|
|||
|
|
// to be emitted as tables. Other quality filters (density, sparsity, prose
|
|||
|
|
// detection) still apply.
|
|||
|
|
AllowSingleColumnTables bool `json:"allow_single_column_tables"`
|
|||
|
|
// Perform OCR on inline images extracted from PDF pages and attach the
|
|||
|
|
// recognized text to each `ExtractedImage.ocr_result`. Requires Tesseract
|
|||
|
|
// to be available; if `ExtractionConfig.ocr` is `None` the extractor
|
|||
|
|
// falls back to `TesseractConfig::default()`. Per-image failures degrade
|
|||
|
|
// gracefully (the image is returned without OCR text rather than failing
|
|||
|
|
// the whole extraction). Default: `false`.
|
|||
|
|
OcrInlineImages bool `json:"ocr_inline_images"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// HierarchyConfig holds the hierarchy extraction configuration for PDF text
// structure analysis.
//
// Enables extraction of document hierarchy levels (H1-H6) based on font size
// clustering and semantic analysis. When enabled, hierarchical blocks are
// included in page content.
type HierarchyConfig struct {
	// Enable hierarchy extraction.
	Enabled *bool `json:"enabled,omitempty"`
	// Number of font size clusters to use for hierarchy levels (1-7).
	//
	// Default: 6, which provides H1-H6 heading levels with body text.
	// Larger values create more fine-grained hierarchy levels.
	KClusters *uint `json:"k_clusters,omitempty"`
	// Include bounding box information in hierarchy blocks.
	IncludeBbox *bool `json:"include_bbox,omitempty"`
	// OCR coverage threshold for smart OCR triggering (0.0-1.0).
	//
	// Determines when OCR should be triggered based on text block coverage.
	// OCR is triggered when text blocks cover less than this fraction of the
	// page.
	// Default: 0.5 (trigger OCR if less than 50% of page has text)
	OcrCoverageThreshold *float32 `json:"ocr_coverage_threshold,omitempty"`
}
|
|||
|
|
|
|||
|
|
// PostProcessorConfig holds the post-processor configuration.
//
// Every field is omitted from the JSON when nil or empty.
type PostProcessorConfig struct {
	// Enable post-processors.
	Enabled *bool `json:"enabled,omitempty"`
	// Whitelist of processor names to run (None = all enabled).
	EnabledProcessors []string `json:"enabled_processors,omitempty"`
	// Blacklist of processor names to skip (None = none disabled).
	DisabledProcessors []string `json:"disabled_processors,omitempty"`
	// Pre-computed AHashSet for O(1) enabled processor lookup.
	// Represented here as a plain list of names.
	EnabledSet []string `json:"enabled_set,omitempty"`
	// Pre-computed AHashSet for O(1) disabled processor lookup.
	// Represented here as a plain list of names.
	DisabledSet []string `json:"disabled_set,omitempty"`
}
|
|||
|
|
|
|||
|
|
// ChunkingConfig chunking configuration.
|
|||
|
|
//
|
|||
|
|
// Configures text chunking for document content, including chunk size,
|
|||
|
|
// overlap, trimming behavior, and optional embeddings.
|
|||
|
|
//
|
|||
|
|
// Use `..Default::default()` when constructing to allow for future field additions:
|
|||
|
|
// ```rust
|
|||
|
|
// let config = ChunkingConfig {
|
|||
|
|
// max_characters: 500,
|
|||
|
|
// ..Default::default()
|
|||
|
|
// };
|
|||
|
|
// ```
|
|||
|
|
type ChunkingConfig struct {
|
|||
|
|
// Maximum size per chunk (in units determined by `sizing`).
|
|||
|
|
//
|
|||
|
|
// When `sizing` is `Characters` (default), this is the max character count.
|
|||
|
|
// When using token-based sizing, this is the max token count.
|
|||
|
|
//
|
|||
|
|
// Default: 1000
|
|||
|
|
MaxCharacters *uint `json:"max_chars,omitempty"`
|
|||
|
|
// Overlap between chunks (in units determined by `sizing`).
|
|||
|
|
//
|
|||
|
|
// Default: 200
|
|||
|
|
Overlap *uint `json:"max_overlap,omitempty"`
|
|||
|
|
// Whether to trim whitespace from chunk boundaries.
|
|||
|
|
//
|
|||
|
|
// Default: true
|
|||
|
|
Trim *bool `json:"trim,omitempty"`
|
|||
|
|
// Type of chunker to use (Text or Markdown).
|
|||
|
|
//
|
|||
|
|
// Default: Text
|
|||
|
|
ChunkerType *ChunkerType `json:"chunker_type,omitempty"`
|
|||
|
|
// Optional embedding configuration for chunk embeddings.
|
|||
|
|
Embedding *EmbeddingConfig `json:"embedding,omitempty"`
|
|||
|
|
// Use a preset configuration (overrides individual settings if provided).
|
|||
|
|
Preset *string `json:"preset,omitempty"`
|
|||
|
|
// How to measure chunk size.
|
|||
|
|
//
|
|||
|
|
// Default: `Characters` (Unicode character count).
|
|||
|
|
// Enable `chunking-tiktoken` or `chunking-tokenizers` features for token-based sizing.
|
|||
|
|
Sizing ChunkSizing `json:"sizing"`
|
|||
|
|
// When `true` and `chunker_type` is `Markdown`, prepend the heading hierarchy
|
|||
|
|
// path (e.g. `"# Title > ## Section\n\n"`) to each chunk's content string.
|
|||
|
|
//
|
|||
|
|
// This is useful for RAG pipelines where each chunk needs self-contained
|
|||
|
|
// context about its position in the document structure.
|
|||
|
|
//
|
|||
|
|
// Default: `false`
|
|||
|
|
PrependHeadingContext bool `json:"prepend_heading_context"`
|
|||
|
|
// Optional cosine similarity threshold for semantic topic boundary detection.
|
|||
|
|
//
|
|||
|
|
// Only used when `chunker_type` is `Semantic` and an `EmbeddingConfig` is
|
|||
|
|
// provided. You almost never need to set this. When omitted, defaults to
|
|||
|
|
// `0.75` which works well for most documents. Lower values detect more
|
|||
|
|
// topic boundaries (more, smaller chunks); higher values detect fewer.
|
|||
|
|
// Range: `0.0..=1.0`.
|
|||
|
|
TopicThreshold *float32 `json:"topic_threshold,omitempty"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
func (s *ChunkingConfig) UnmarshalJSON(data []byte) error {
|
|||
|
|
var raw struct {
|
|||
|
|
MaxCharacters *uint `json:"max_chars,omitempty"`
|
|||
|
|
Overlap *uint `json:"max_overlap,omitempty"`
|
|||
|
|
Trim *bool `json:"trim,omitempty"`
|
|||
|
|
ChunkerType *ChunkerType `json:"chunker_type,omitempty"`
|
|||
|
|
Embedding *EmbeddingConfig `json:"embedding,omitempty"`
|
|||
|
|
Preset *string `json:"preset,omitempty"`
|
|||
|
|
Sizing json.RawMessage `json:"sizing,omitempty"`
|
|||
|
|
PrependHeadingContext bool `json:"prepend_heading_context"`
|
|||
|
|
TopicThreshold *float32 `json:"topic_threshold,omitempty"`
|
|||
|
|
}
|
|||
|
|
if err := json.Unmarshal(data, &raw); err != nil {
|
|||
|
|
return err
|
|||
|
|
}
|
|||
|
|
s.MaxCharacters = raw.MaxCharacters
|
|||
|
|
s.Overlap = raw.Overlap
|
|||
|
|
s.Trim = raw.Trim
|
|||
|
|
s.ChunkerType = raw.ChunkerType
|
|||
|
|
s.Embedding = raw.Embedding
|
|||
|
|
s.Preset = raw.Preset
|
|||
|
|
s.PrependHeadingContext = raw.PrependHeadingContext
|
|||
|
|
s.TopicThreshold = raw.TopicThreshold
|
|||
|
|
if len(raw.Sizing) > 0 && string(raw.Sizing) != "null" {
|
|||
|
|
v, err := UnmarshalChunkSizing(raw.Sizing)
|
|||
|
|
if err != nil {
|
|||
|
|
return err
|
|||
|
|
}
|
|||
|
|
s.Sizing = v
|
|||
|
|
}
|
|||
|
|
return nil
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// EmbeddingConfig embedding configuration for text chunks.
|
|||
|
|
//
|
|||
|
|
// Configures embedding generation using ONNX models via the vendored embedding engine.
|
|||
|
|
// Requires the `embeddings` feature to be enabled.
|
|||
|
|
type EmbeddingConfig struct {
|
|||
|
|
// The embedding model to use (defaults to "balanced" preset if not specified)
|
|||
|
|
Model EmbeddingModelType `json:"model"`
|
|||
|
|
// Whether to normalize embedding vectors (recommended for cosine similarity)
|
|||
|
|
Normalize *bool `json:"normalize,omitempty"`
|
|||
|
|
// Batch size for embedding generation
|
|||
|
|
BatchSize *uint `json:"batch_size,omitempty"`
|
|||
|
|
// Show model download progress
|
|||
|
|
ShowDownloadProgress bool `json:"show_download_progress"`
|
|||
|
|
// Custom cache directory for model files
|
|||
|
|
//
|
|||
|
|
// Defaults to `~/.cache/kreuzberg/embeddings/` if not specified.
|
|||
|
|
// Allows full customization of model download location.
|
|||
|
|
CacheDir *string `json:"cache_dir,omitempty"`
|
|||
|
|
// Hardware acceleration for the embedding ONNX model.
|
|||
|
|
//
|
|||
|
|
// When set, controls which execution provider (CPU, CUDA, CoreML, TensorRT)
|
|||
|
|
// is used for inference. Defaults to `None` (auto-select per platform).
|
|||
|
|
Acceleration *AccelerationConfig `json:"acceleration,omitempty"`
|
|||
|
|
// Maximum wall-clock duration (in seconds) for a single `embed()` call when
|
|||
|
|
// using [`EmbeddingModelType::Plugin`].
|
|||
|
|
//
|
|||
|
|
// Applies only to the in-process plugin path — protects against hung
|
|||
|
|
// host-language backends (e.g. a Python callback deadlocked on the GIL,
|
|||
|
|
// a model stuck on CUDA OOM retries, etc.). On timeout, the dispatcher
|
|||
|
|
// returns `Plugin` instead of blocking forever.
|
|||
|
|
//
|
|||
|
|
// `None` disables the timeout. The default (60 seconds) is conservative
|
|||
|
|
// for common in-process inference; increase for large batches on slow
|
|||
|
|
// hardware.
|
|||
|
|
MaxEmbedDurationSecs *uint64 `json:"max_embed_duration_secs,omitempty"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
func (s *EmbeddingConfig) UnmarshalJSON(data []byte) error {
|
|||
|
|
var raw struct {
|
|||
|
|
Model json.RawMessage `json:"model,omitempty"`
|
|||
|
|
Normalize *bool `json:"normalize,omitempty"`
|
|||
|
|
BatchSize *uint `json:"batch_size,omitempty"`
|
|||
|
|
ShowDownloadProgress bool `json:"show_download_progress"`
|
|||
|
|
CacheDir *string `json:"cache_dir,omitempty"`
|
|||
|
|
Acceleration *AccelerationConfig `json:"acceleration,omitempty"`
|
|||
|
|
MaxEmbedDurationSecs *uint64 `json:"max_embed_duration_secs,omitempty"`
|
|||
|
|
}
|
|||
|
|
if err := json.Unmarshal(data, &raw); err != nil {
|
|||
|
|
return err
|
|||
|
|
}
|
|||
|
|
s.Normalize = raw.Normalize
|
|||
|
|
s.BatchSize = raw.BatchSize
|
|||
|
|
s.ShowDownloadProgress = raw.ShowDownloadProgress
|
|||
|
|
s.CacheDir = raw.CacheDir
|
|||
|
|
s.Acceleration = raw.Acceleration
|
|||
|
|
s.MaxEmbedDurationSecs = raw.MaxEmbedDurationSecs
|
|||
|
|
if len(raw.Model) > 0 && string(raw.Model) != "null" {
|
|||
|
|
v, err := UnmarshalEmbeddingModelType(raw.Model)
|
|||
|
|
if err != nil {
|
|||
|
|
return err
|
|||
|
|
}
|
|||
|
|
s.Model = v
|
|||
|
|
}
|
|||
|
|
return nil
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// TreeSitterConfig configuration for tree-sitter language pack integration.
|
|||
|
|
//
|
|||
|
|
// Controls grammar download behavior and code analysis options.
|
|||
|
|
//
|
|||
|
|
// # Example (TOML)
|
|||
|
|
//
|
|||
|
|
// ```toml
|
|||
|
|
// [tree_sitter]
|
|||
|
|
// languages = ["python", "rust"]
|
|||
|
|
// groups = ["web"]
|
|||
|
|
//
|
|||
|
|
// [tree_sitter.process]
|
|||
|
|
// structure = true
|
|||
|
|
// comments = true
|
|||
|
|
// docstrings = true
|
|||
|
|
// ```
|
|||
|
|
type TreeSitterConfig struct {
|
|||
|
|
// Enable code intelligence processing (default: true).
|
|||
|
|
//
|
|||
|
|
// When `false`, tree-sitter analysis is completely skipped even if
|
|||
|
|
// the config section is present.
|
|||
|
|
Enabled *bool `json:"enabled,omitempty"`
|
|||
|
|
// Custom cache directory for downloaded grammars.
|
|||
|
|
//
|
|||
|
|
// When `None`, uses the default: `~/.cache/tree-sitter-language-pack/v{version}/libs/`.
|
|||
|
|
CacheDir *string `json:"cache_dir,omitempty"`
|
|||
|
|
// Languages to pre-download on init (e.g., `["python", "rust"]`).
|
|||
|
|
Languages []string `json:"languages,omitempty"`
|
|||
|
|
// Language groups to pre-download (e.g., `["web", "systems", "scripting"]`).
|
|||
|
|
Groups []string `json:"groups,omitempty"`
|
|||
|
|
// Processing options for code analysis.
|
|||
|
|
Process TreeSitterProcessConfig `json:"process"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// TreeSitterProcessConfig processing options for tree-sitter code analysis.
|
|||
|
|
//
|
|||
|
|
// Controls which analysis features are enabled when extracting code files.
|
|||
|
|
type TreeSitterProcessConfig struct {
|
|||
|
|
// Extract structural items (functions, classes, structs, etc.). Default: true.
|
|||
|
|
Structure *bool `json:"structure,omitempty"`
|
|||
|
|
// Extract import statements. Default: true.
|
|||
|
|
Imports *bool `json:"imports,omitempty"`
|
|||
|
|
// Extract export statements. Default: true.
|
|||
|
|
Exports *bool `json:"exports,omitempty"`
|
|||
|
|
// Extract comments. Default: false.
|
|||
|
|
Comments bool `json:"comments"`
|
|||
|
|
// Extract docstrings. Default: false.
|
|||
|
|
Docstrings bool `json:"docstrings"`
|
|||
|
|
// Extract symbol definitions. Default: false.
|
|||
|
|
Symbols bool `json:"symbols"`
|
|||
|
|
// Include parse diagnostics. Default: false.
|
|||
|
|
Diagnostics bool `json:"diagnostics"`
|
|||
|
|
// Maximum chunk size in bytes. `None` disables chunking.
|
|||
|
|
ChunkMaxSize *uint `json:"chunk_max_size,omitempty"`
|
|||
|
|
// Content rendering mode for code extraction.
|
|||
|
|
ContentMode CodeContentMode `json:"content_mode,omitempty"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// SupportedFormat is a supported document format entry.
//
// It pairs a file extension with the corresponding MIME type that Kreuzberg
// can process.
type SupportedFormat struct {
	// Extension is the file extension without the leading dot,
	// e.g., "pdf", "docx".
	Extension string `json:"extension"`
	// MimeType is the MIME type string, e.g., "application/pdf".
	MimeType string `json:"mime_type"`
}
|
|||
|
|
|
|||
|
|
// ServerConfig is the API server configuration.
//
// It holds all configuration options for the Kreuzberg API server, including
// host/port settings, CORS configuration, and upload limits.
//
// # Defaults
//
//   - `host`: "127.0.0.1" (localhost only)
//   - `port`: 8000
//   - `cors_origins`: empty vector (allows all origins)
//   - `max_request_body_bytes`: 104_857_600 (100 MB)
//   - `max_multipart_field_bytes`: 104_857_600 (100 MB)
type ServerConfig struct {
	// Host is the server host address (e.g., "127.0.0.1", "0.0.0.0").
	Host string `json:"host"`
	// Port is the server port number.
	Port uint16 `json:"port"`
	// CorsOrigins lists the CORS allowed origins.
	//
	// If empty, the server accepts requests from any origin. If populated
	// with specific origins (e.g., `"https://example.com"`), only those
	// origins are allowed.
	CorsOrigins []string `json:"cors_origins,omitempty"`
	// MaxRequestBodyBytes is the maximum size of a request body in bytes
	// (default: 100 MB).
	MaxRequestBodyBytes uint `json:"max_request_body_bytes"`
	// MaxMultipartFieldBytes is the maximum size of multipart fields in bytes
	// (default: 100 MB).
	MaxMultipartFieldBytes uint `json:"max_multipart_field_bytes"`
}
|
|||
|
|
|
|||
|
|
// StructuredDataResult is a JSON-serializable result carrying a content
// string, a format string, string-to-string metadata and a list of text
// fields.
//
// NOTE(review): the generator emitted no description for this type; confirm
// the field semantics against the kreuzberg core documentation.
type StructuredDataResult struct {
	Content    string            `json:"content"`
	Format     string            `json:"format"`
	Metadata   map[string]string `json:"metadata,omitempty"`
	TextFields []string          `json:"text_fields,omitempty"`
}
|
|||
|
|
|
|||
|
|
// DocxAppProperties holds the application properties from docProps/app.xml
// for DOCX.
//
// It contains Word-specific document statistics and metadata.
type DocxAppProperties struct {
	// Application is the application name (e.g., "Microsoft Office Word").
	Application *string `json:"application,omitempty"`
	// AppVersion is the application version.
	AppVersion *string `json:"app_version,omitempty"`
	// Template is the template filename.
	Template *string `json:"template,omitempty"`
	// TotalTime is the total editing time in minutes.
	TotalTime *int32 `json:"total_time,omitempty"`
	// Pages is the number of pages.
	Pages *int32 `json:"pages,omitempty"`
	// Words is the number of words.
	Words *int32 `json:"words,omitempty"`
	// Characters is the number of characters (excluding spaces).
	Characters *int32 `json:"characters,omitempty"`
	// CharactersWithSpaces is the number of characters (including spaces).
	CharactersWithSpaces *int32 `json:"characters_with_spaces,omitempty"`
	// Lines is the number of lines.
	Lines *int32 `json:"lines,omitempty"`
	// Paragraphs is the number of paragraphs.
	Paragraphs *int32 `json:"paragraphs,omitempty"`
	// Company is the company name.
	Company *string `json:"company,omitempty"`
	// DocSecurity is the document security level.
	DocSecurity *int32 `json:"doc_security,omitempty"`
	// ScaleCrop is the scale crop flag.
	ScaleCrop *bool `json:"scale_crop,omitempty"`
	// LinksUpToDate is the links-up-to-date flag.
	LinksUpToDate *bool `json:"links_up_to_date,omitempty"`
	// SharedDoc is the shared document flag.
	SharedDoc *bool `json:"shared_doc,omitempty"`
	// HyperlinksChanged is the hyperlinks-changed flag.
	HyperlinksChanged *bool `json:"hyperlinks_changed,omitempty"`
}
|
|||
|
|
|
|||
|
|
// XlsxAppProperties holds the application properties from docProps/app.xml
// for XLSX.
//
// It contains Excel-specific document metadata.
type XlsxAppProperties struct {
	// Application is the application name (e.g., "Microsoft Excel").
	Application *string `json:"application,omitempty"`
	// AppVersion is the application version.
	AppVersion *string `json:"app_version,omitempty"`
	// DocSecurity is the document security level.
	DocSecurity *int32 `json:"doc_security,omitempty"`
	// ScaleCrop is the scale crop flag.
	ScaleCrop *bool `json:"scale_crop,omitempty"`
	// LinksUpToDate is the links-up-to-date flag.
	LinksUpToDate *bool `json:"links_up_to_date,omitempty"`
	// SharedDoc is the shared document flag.
	SharedDoc *bool `json:"shared_doc,omitempty"`
	// HyperlinksChanged is the hyperlinks-changed flag.
	HyperlinksChanged *bool `json:"hyperlinks_changed,omitempty"`
	// Company is the company name.
	Company *string `json:"company,omitempty"`
	// WorksheetNames lists the worksheet names.
	WorksheetNames []string `json:"worksheet_names,omitempty"`
}
|
|||
|
|
|
|||
|
|
// PptxAppProperties holds the application properties from docProps/app.xml
// for PPTX.
//
// It contains PowerPoint-specific document metadata.
type PptxAppProperties struct {
	// Application is the application name
	// (e.g., "Microsoft Office PowerPoint").
	Application *string `json:"application,omitempty"`
	// AppVersion is the application version.
	AppVersion *string `json:"app_version,omitempty"`
	// TotalTime is the total editing time in minutes.
	TotalTime *int32 `json:"total_time,omitempty"`
	// Company is the company name.
	Company *string `json:"company,omitempty"`
	// DocSecurity is the document security level.
	DocSecurity *int32 `json:"doc_security,omitempty"`
	// ScaleCrop is the scale crop flag.
	ScaleCrop *bool `json:"scale_crop,omitempty"`
	// LinksUpToDate is the links-up-to-date flag.
	LinksUpToDate *bool `json:"links_up_to_date,omitempty"`
	// SharedDoc is the shared document flag.
	SharedDoc *bool `json:"shared_doc,omitempty"`
	// HyperlinksChanged is the hyperlinks-changed flag.
	HyperlinksChanged *bool `json:"hyperlinks_changed,omitempty"`
	// Slides is the number of slides.
	Slides *int32 `json:"slides,omitempty"`
	// Notes is the number of notes.
	Notes *int32 `json:"notes,omitempty"`
	// HiddenSlides is the number of hidden slides.
	HiddenSlides *int32 `json:"hidden_slides,omitempty"`
	// MultimediaClips is the number of multimedia clips.
	MultimediaClips *int32 `json:"multimedia_clips,omitempty"`
	// PresentationFormat is the presentation format
	// (e.g., "Widescreen", "Standard").
	PresentationFormat *string `json:"presentation_format,omitempty"`
	// SlideTitles lists the slide titles.
	SlideTitles []string `json:"slide_titles,omitempty"`
}
|
|||
|
|
|
|||
|
|
// CoreProperties holds the Dublin Core metadata from docProps/core.xml.
//
// It contains the standard metadata fields defined by the Dublin Core
// standard together with Office-specific extensions.
type CoreProperties struct {
	// Title is the document title.
	Title *string `json:"title,omitempty"`
	// Subject is the document subject/topic.
	Subject *string `json:"subject,omitempty"`
	// Creator is the document creator/author.
	Creator *string `json:"creator,omitempty"`
	// Keywords holds keywords or tags.
	Keywords *string `json:"keywords,omitempty"`
	// Description is the document description/abstract.
	Description *string `json:"description,omitempty"`
	// LastModifiedBy is the user who last modified the document.
	LastModifiedBy *string `json:"last_modified_by,omitempty"`
	// Revision is the revision number.
	Revision *string `json:"revision,omitempty"`
	// Created is the creation timestamp (ISO 8601).
	Created *string `json:"created,omitempty"`
	// Modified is the last modification timestamp (ISO 8601).
	Modified *string `json:"modified,omitempty"`
	// Category is the document category.
	Category *string `json:"category,omitempty"`
	// ContentStatus is the content status (Draft, Final, etc.).
	ContentStatus *string `json:"content_status,omitempty"`
	// Language is the document language.
	Language *string `json:"language,omitempty"`
	// Identifier is the unique identifier.
	Identifier *string `json:"identifier,omitempty"`
	// Version is the document version.
	Version *string `json:"version,omitempty"`
	// LastPrinted is the last print timestamp (ISO 8601).
	LastPrinted *string `json:"last_printed,omitempty"`
}
|
|||
|
|
|
|||
|
|
// SecurityLimits holds the configuration for security limits across
// extractors.
//
// All limits are intentionally conservative to prevent DoS attacks while
// still supporting legitimate documents.
type SecurityLimits struct {
	// MaxArchiveSize is the maximum uncompressed size for archives (500 MB).
	MaxArchiveSize *uint `json:"max_archive_size,omitempty"`
	// MaxCompressionRatio is the maximum compression ratio before an archive
	// is flagged as a potential bomb (100:1).
	MaxCompressionRatio *uint `json:"max_compression_ratio,omitempty"`
	// MaxFilesInArchive is the maximum number of files in an archive (10,000).
	MaxFilesInArchive *uint `json:"max_files_in_archive,omitempty"`
	// MaxNestingDepth is the maximum nesting depth for structures (100).
	MaxNestingDepth *uint `json:"max_nesting_depth,omitempty"`
	// MaxEntityLength is the maximum length of any single XML entity /
	// attribute / token (1 MiB).
	//
	// This is a per-token cap, NOT a total cap: billion-laughs class attacks
	// where a single entity expands to hundreds of MB are caught here, while
	// normal long text content (a paragraph, a CDATA block) is caught by
	// `max_content_size` instead.
	MaxEntityLength *uint `json:"max_entity_length,omitempty"`
	// MaxContentSize is the maximum string growth per document (100 MB).
	MaxContentSize *uint `json:"max_content_size,omitempty"`
	// MaxIterations is the maximum number of iterations per operation.
	MaxIterations *uint `json:"max_iterations,omitempty"`
	// MaxXMLDepth is the maximum XML depth (100 levels).
	MaxXMLDepth *uint `json:"max_xml_depth,omitempty"`
	// MaxTableCells is the maximum number of cells per table (100,000).
	MaxTableCells *uint `json:"max_table_cells,omitempty"`
}
|
|||
|
|
|
|||
|
|
// TokenReductionConfig is a type.
|
|||
|
|
type TokenReductionConfig struct {
|
|||
|
|
Level *ReductionLevel `json:"level,omitempty"`
|
|||
|
|
LanguageHint *string `json:"language_hint,omitempty"`
|
|||
|
|
PreserveMarkdown bool `json:"preserve_markdown"`
|
|||
|
|
PreserveCode *bool `json:"preserve_code,omitempty"`
|
|||
|
|
SemanticThreshold *float32 `json:"semantic_threshold,omitempty"`
|
|||
|
|
EnableParallel *bool `json:"enable_parallel,omitempty"`
|
|||
|
|
UseSimd *bool `json:"use_simd,omitempty"`
|
|||
|
|
CustomStopwords map[string][]string `json:"custom_stopwords,omitempty"`
|
|||
|
|
PreservePatterns []string `json:"preserve_patterns,omitempty"`
|
|||
|
|
TargetReduction *float32 `json:"target_reduction,omitempty"`
|
|||
|
|
EnableSemanticClustering bool `json:"enable_semantic_clustering"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// PdfAnnotation pDF annotation extracted from a document page.
|
|||
|
|
type PdfAnnotation struct {
|
|||
|
|
// The type of annotation.
|
|||
|
|
AnnotationType PdfAnnotationType `json:"annotation_type"`
|
|||
|
|
// Text content of the annotation (e.g., comment text, link URL).
|
|||
|
|
Content *string `json:"content,omitempty"`
|
|||
|
|
// Page number where the annotation appears (1-indexed).
|
|||
|
|
PageNumber uint32 `json:"page_number"`
|
|||
|
|
// Bounding box of the annotation on the page.
|
|||
|
|
BoundingBox *BoundingBox `json:"bounding_box,omitempty"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// DjotContent comprehensive Djot document structure with semantic preservation.
|
|||
|
|
//
|
|||
|
|
// This type captures the full richness of Djot markup, including:
|
|||
|
|
// - Block-level structures (headings, lists, blockquotes, code blocks, etc.)
|
|||
|
|
// - Inline formatting (emphasis, strong, highlight, subscript, superscript, etc.)
|
|||
|
|
// - Attributes (classes, IDs, key-value pairs)
|
|||
|
|
// - Links, images, footnotes
|
|||
|
|
// - Math expressions (inline and display)
|
|||
|
|
// - Tables with full structure
|
|||
|
|
//
|
|||
|
|
// Available when the `djot` feature is enabled.
|
|||
|
|
type DjotContent struct {
|
|||
|
|
// Plain text representation for backwards compatibility
|
|||
|
|
PlainText string `json:"plain_text"`
|
|||
|
|
// Structured block-level content
|
|||
|
|
Blocks []FormattedBlock `json:"blocks,omitempty"`
|
|||
|
|
// Metadata from YAML frontmatter
|
|||
|
|
Metadata Metadata `json:"metadata"`
|
|||
|
|
// Extracted tables as structured data
|
|||
|
|
Tables []Table `json:"tables,omitempty"`
|
|||
|
|
// Extracted images with metadata
|
|||
|
|
Images []DjotImage `json:"images,omitempty"`
|
|||
|
|
// Extracted links with URLs
|
|||
|
|
Links []DjotLink `json:"links,omitempty"`
|
|||
|
|
// Footnote definitions
|
|||
|
|
Footnotes []Footnote `json:"footnotes,omitempty"`
|
|||
|
|
// Attributes mapped by element identifier (if present)
|
|||
|
|
Attributes []string `json:"attributes,omitempty"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// FormattedBlock block-level element in a Djot document.
|
|||
|
|
//
|
|||
|
|
// Represents structural elements like headings, paragraphs, lists, code blocks, etc.
|
|||
|
|
type FormattedBlock struct {
|
|||
|
|
// Type of block element
|
|||
|
|
BlockType BlockType `json:"block_type"`
|
|||
|
|
// Heading level (1-6) for headings, or nesting level for lists
|
|||
|
|
Level *uint `json:"level,omitempty"`
|
|||
|
|
// Inline content within the block
|
|||
|
|
InlineContent []InlineElement `json:"inline_content,omitempty"`
|
|||
|
|
// Element attributes (classes, IDs, key-value pairs)
|
|||
|
|
Attributes *string `json:"attributes,omitempty"`
|
|||
|
|
// Language identifier for code blocks
|
|||
|
|
Language *string `json:"language,omitempty"`
|
|||
|
|
// Raw code content for code blocks
|
|||
|
|
Code *string `json:"code,omitempty"`
|
|||
|
|
// Nested blocks for containers (blockquotes, list items, divs)
|
|||
|
|
Children []FormattedBlock `json:"children,omitempty"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// InlineElement inline element within a block.
|
|||
|
|
//
|
|||
|
|
// Represents text with formatting, links, images, etc.
|
|||
|
|
type InlineElement struct {
|
|||
|
|
// Type of inline element
|
|||
|
|
ElementType InlineType `json:"element_type"`
|
|||
|
|
// Text content
|
|||
|
|
Content string `json:"content"`
|
|||
|
|
// Element attributes
|
|||
|
|
Attributes *string `json:"attributes,omitempty"`
|
|||
|
|
// Additional metadata (e.g., href for links, src/alt for images)
|
|||
|
|
Metadata map[string]string `json:"metadata,omitempty"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// DjotImage is an image element in Djot.
type DjotImage struct {
	// Src is the image source URL or path.
	Src string `json:"src"`
	// Alt is the alternative text.
	Alt string `json:"alt"`
	// Title is the optional title.
	Title *string `json:"title,omitempty"`
	// Attributes holds the element attributes.
	Attributes *string `json:"attributes,omitempty"`
}
|
|||
|
|
|
|||
|
|
// DjotLink is a link element in Djot.
type DjotLink struct {
	// URL is the link URL.
	URL string `json:"url"`
	// Text is the link text content.
	Text string `json:"text"`
	// Title is the optional title.
	Title *string `json:"title,omitempty"`
	// Attributes holds the element attributes.
	Attributes *string `json:"attributes,omitempty"`
}
|
|||
|
|
|
|||
|
|
// Footnote in Djot.
|
|||
|
|
type Footnote struct {
|
|||
|
|
// Footnote label
|
|||
|
|
Label string `json:"label"`
|
|||
|
|
// Footnote content blocks
|
|||
|
|
Content []FormattedBlock `json:"content,omitempty"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// DocumentStructure top-level structured document representation.
|
|||
|
|
//
|
|||
|
|
// A flat array of nodes with index-based parent/child references forming a tree.
|
|||
|
|
// Root-level nodes have `parent: None`. Use `body_roots()` and `furniture_roots()`
|
|||
|
|
// to iterate over top-level content by layer.
|
|||
|
|
//
|
|||
|
|
// # Validation
|
|||
|
|
//
|
|||
|
|
// Call `validate()` after construction to verify all node indices are in bounds
|
|||
|
|
// and parent-child relationships are bidirectionally consistent.
|
|||
|
|
type DocumentStructure struct {
|
|||
|
|
// All nodes in document/reading order.
|
|||
|
|
Nodes []DocumentNode `json:"nodes,omitempty"`
|
|||
|
|
// Origin format identifier (e.g. "docx", "pptx", "html", "pdf").
|
|||
|
|
//
|
|||
|
|
// Allows renderers to apply format-aware heuristics when converting
|
|||
|
|
// the document tree to output formats.
|
|||
|
|
SourceFormat *string `json:"source_format,omitempty"`
|
|||
|
|
// Resolved relationships between nodes (footnote refs, citations, anchor links, etc.).
|
|||
|
|
//
|
|||
|
|
// Populated during derivation from the internal document representation.
|
|||
|
|
// Empty when no relationships are detected.
|
|||
|
|
Relationships []DocumentRelationship `json:"relationships,omitempty"`
|
|||
|
|
// Sorted, deduplicated list of node type names present in this document.
|
|||
|
|
//
|
|||
|
|
// Each value is the snake_case `node_type` tag of the corresponding
|
|||
|
|
// [`NodeContent`] variant (e.g. `"paragraph"`, `"heading"`, `"table"`, …).
|
|||
|
|
//
|
|||
|
|
// Computed from [`nodes`] via [`DocumentStructure::finalize_node_types`].
|
|||
|
|
// Empty until that method is called (internal construction paths call it
|
|||
|
|
// at the end of derivation).
|
|||
|
|
NodeTypes []string `json:"node_types,omitempty"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// DocumentRelationship resolved relationship between two nodes in the document tree.
|
|||
|
|
type DocumentRelationship struct {
|
|||
|
|
// Source node index (the referencing node).
|
|||
|
|
Source uint32 `json:"source"`
|
|||
|
|
// Target node index (the referenced node).
|
|||
|
|
Target uint32 `json:"target"`
|
|||
|
|
// Semantic kind of the relationship.
|
|||
|
|
Kind RelationshipKind `json:"kind"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// DocumentNode single node in the document tree.
|
|||
|
|
//
|
|||
|
|
// Each node has deterministic `id`, typed `content`, optional `parent`/`children`
|
|||
|
|
// for tree structure, and metadata like page number, bounding box, and content layer.
|
|||
|
|
type DocumentNode struct {
|
|||
|
|
// Deterministic identifier (hash of content + position).
|
|||
|
|
ID string `json:"id"`
|
|||
|
|
// Node content — tagged enum, type-specific data only.
|
|||
|
|
Content NodeContent `json:"content"`
|
|||
|
|
// Parent node index (`None` = root-level node).
|
|||
|
|
Parent *uint32 `json:"parent,omitempty"`
|
|||
|
|
// Child node indices in reading order.
|
|||
|
|
Children []uint32 `json:"children,omitempty"`
|
|||
|
|
// Content layer classification.
|
|||
|
|
ContentLayer ContentLayer `json:"content_layer"`
|
|||
|
|
// Page number where this node starts (1-indexed).
|
|||
|
|
Page *uint32 `json:"page,omitempty"`
|
|||
|
|
// Page number where this node ends (for multi-page tables/sections).
|
|||
|
|
PageEnd *uint32 `json:"page_end,omitempty"`
|
|||
|
|
// Bounding box in document coordinates.
|
|||
|
|
Bbox *BoundingBox `json:"bbox,omitempty"`
|
|||
|
|
// Inline annotations (formatting, links) on this node's text content.
|
|||
|
|
//
|
|||
|
|
// Only meaningful for text-carrying nodes; empty for containers.
|
|||
|
|
Annotations []TextAnnotation `json:"annotations,omitempty"`
|
|||
|
|
// Format-specific key-value attributes.
|
|||
|
|
//
|
|||
|
|
// Extensible bag for miscellaneous data without a dedicated typed field: CSS classes,
|
|||
|
|
// LaTeX environment names, Excel cell formulas, slide layout names, etc.
|
|||
|
|
Attributes map[string]string `json:"attributes,omitempty"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
func (s *DocumentNode) UnmarshalJSON(data []byte) error {
|
|||
|
|
var raw struct {
|
|||
|
|
ID string `json:"id"`
|
|||
|
|
Content json.RawMessage `json:"content,omitempty"`
|
|||
|
|
Parent *uint32 `json:"parent,omitempty"`
|
|||
|
|
Children []uint32 `json:"children,omitempty"`
|
|||
|
|
ContentLayer ContentLayer `json:"content_layer"`
|
|||
|
|
Page *uint32 `json:"page,omitempty"`
|
|||
|
|
PageEnd *uint32 `json:"page_end,omitempty"`
|
|||
|
|
Bbox *BoundingBox `json:"bbox,omitempty"`
|
|||
|
|
Annotations []TextAnnotation `json:"annotations,omitempty"`
|
|||
|
|
Attributes map[string]string `json:"attributes,omitempty"`
|
|||
|
|
}
|
|||
|
|
if err := json.Unmarshal(data, &raw); err != nil {
|
|||
|
|
return err
|
|||
|
|
}
|
|||
|
|
s.ID = raw.ID
|
|||
|
|
s.Parent = raw.Parent
|
|||
|
|
s.Children = raw.Children
|
|||
|
|
s.ContentLayer = raw.ContentLayer
|
|||
|
|
s.Page = raw.Page
|
|||
|
|
s.PageEnd = raw.PageEnd
|
|||
|
|
s.Bbox = raw.Bbox
|
|||
|
|
s.Annotations = raw.Annotations
|
|||
|
|
s.Attributes = raw.Attributes
|
|||
|
|
if len(raw.Content) > 0 && string(raw.Content) != "null" {
|
|||
|
|
v, err := UnmarshalNodeContent(raw.Content)
|
|||
|
|
if err != nil {
|
|||
|
|
return err
|
|||
|
|
}
|
|||
|
|
s.Content = v
|
|||
|
|
}
|
|||
|
|
return nil
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// TableGrid structured table grid with cell-level metadata.
|
|||
|
|
//
|
|||
|
|
// Stores row/column dimensions and a flat list of cells with position info.
|
|||
|
|
type TableGrid struct {
|
|||
|
|
// Number of rows in the table.
|
|||
|
|
Rows uint32 `json:"rows"`
|
|||
|
|
// Number of columns in the table.
|
|||
|
|
Cols uint32 `json:"cols"`
|
|||
|
|
// All cells in row-major order.
|
|||
|
|
Cells []GridCell `json:"cells,omitempty"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// GridCell individual grid cell with position and span metadata.
|
|||
|
|
type GridCell struct {
|
|||
|
|
// Cell text content.
|
|||
|
|
Content string `json:"content"`
|
|||
|
|
// Zero-indexed row position.
|
|||
|
|
Row uint32 `json:"row"`
|
|||
|
|
// Zero-indexed column position.
|
|||
|
|
Col uint32 `json:"col"`
|
|||
|
|
// Number of rows this cell spans.
|
|||
|
|
RowSpan uint32 `json:"row_span"`
|
|||
|
|
// Number of columns this cell spans.
|
|||
|
|
ColSpan uint32 `json:"col_span"`
|
|||
|
|
// Whether this is a header cell.
|
|||
|
|
IsHeader bool `json:"is_header"`
|
|||
|
|
// Bounding box for this cell (if available).
|
|||
|
|
Bbox *BoundingBox `json:"bbox,omitempty"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// TextAnnotation inline text annotation — byte-range based formatting and links.
|
|||
|
|
//
|
|||
|
|
// Annotations reference byte offsets into the node's text content,
|
|||
|
|
// enabling precise identification of formatted regions.
|
|||
|
|
type TextAnnotation struct {
|
|||
|
|
// Start byte offset in the node's text content (inclusive).
|
|||
|
|
Start uint32 `json:"start"`
|
|||
|
|
// End byte offset in the node's text content (exclusive).
|
|||
|
|
End uint32 `json:"end"`
|
|||
|
|
// Annotation type.
|
|||
|
|
Kind AnnotationKind `json:"kind"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
func (s *TextAnnotation) UnmarshalJSON(data []byte) error {
|
|||
|
|
var raw struct {
|
|||
|
|
Start uint32 `json:"start"`
|
|||
|
|
End uint32 `json:"end"`
|
|||
|
|
Kind json.RawMessage `json:"kind,omitempty"`
|
|||
|
|
}
|
|||
|
|
if err := json.Unmarshal(data, &raw); err != nil {
|
|||
|
|
return err
|
|||
|
|
}
|
|||
|
|
s.Start = raw.Start
|
|||
|
|
s.End = raw.End
|
|||
|
|
if len(raw.Kind) > 0 && string(raw.Kind) != "null" {
|
|||
|
|
v, err := UnmarshalAnnotationKind(raw.Kind)
|
|||
|
|
if err != nil {
|
|||
|
|
return err
|
|||
|
|
}
|
|||
|
|
s.Kind = v
|
|||
|
|
}
|
|||
|
|
return nil
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// ExtractionResult general extraction result used by the core extraction API.
|
|||
|
|
//
|
|||
|
|
// This is the main result type returned by all extraction functions.
|
|||
|
|
type ExtractionResult struct {
|
|||
|
|
Content string `json:"content"`
|
|||
|
|
MimeType string `json:"mime_type"`
|
|||
|
|
Metadata Metadata `json:"metadata"`
|
|||
|
|
// Extraction strategy used to produce the returned text.
|
|||
|
|
//
|
|||
|
|
// Populated when the extractor can reliably distinguish native text extraction,
|
|||
|
|
// OCR-only extraction, or mixed native/OCR output.
|
|||
|
|
ExtractionMethod *ExtractionMethod `json:"extraction_method,omitempty"`
|
|||
|
|
Tables []Table `json:"tables,omitempty"`
|
|||
|
|
DetectedLanguages []string `json:"detected_languages,omitempty"`
|
|||
|
|
// Text chunks when chunking is enabled.
|
|||
|
|
//
|
|||
|
|
// When chunking configuration is provided, the content is split into
|
|||
|
|
// overlapping chunks for efficient processing. Each chunk contains the text,
|
|||
|
|
// optional embeddings (if enabled), and metadata about its position.
|
|||
|
|
Chunks []Chunk `json:"chunks,omitempty"`
|
|||
|
|
// Extracted images from the document.
|
|||
|
|
//
|
|||
|
|
// When image extraction is enabled via `ImageExtractionConfig`, this field
|
|||
|
|
// contains all images found in the document with their raw data and metadata.
|
|||
|
|
// Each image may optionally contain a nested `ocr_result` if OCR was performed.
|
|||
|
|
Images []ExtractedImage `json:"images,omitempty"`
|
|||
|
|
// Per-page content when page extraction is enabled.
|
|||
|
|
//
|
|||
|
|
// When page extraction is configured, the document is split into per-page content
|
|||
|
|
// with tables and images mapped to their respective pages.
|
|||
|
|
Pages []PageContent `json:"pages,omitempty"`
|
|||
|
|
// Semantic elements when element-based result format is enabled.
|
|||
|
|
//
|
|||
|
|
// When result_format is set to ElementBased, this field contains semantic
|
|||
|
|
// elements with type classification, unique identifiers, and metadata for
|
|||
|
|
// Unstructured-compatible element-based processing.
|
|||
|
|
Elements []Element `json:"elements,omitempty"`
|
|||
|
|
// Rich Djot content structure (when extracting Djot documents).
|
|||
|
|
//
|
|||
|
|
// When extracting Djot documents with structured extraction enabled,
|
|||
|
|
// this field contains the full semantic structure including:
|
|||
|
|
// - Block-level elements with nesting
|
|||
|
|
// - Inline formatting with attributes
|
|||
|
|
// - Links, images, footnotes
|
|||
|
|
// - Math expressions
|
|||
|
|
// - Complete attribute information
|
|||
|
|
//
|
|||
|
|
// The `content` field still contains plain text for backward compatibility.
|
|||
|
|
//
|
|||
|
|
// Always `None` for non-Djot documents.
|
|||
|
|
DjotContent *DjotContent `json:"djot_content,omitempty"`
|
|||
|
|
// OCR elements with full spatial and confidence metadata.
|
|||
|
|
//
|
|||
|
|
// When OCR is performed with element extraction enabled, this field contains
|
|||
|
|
// the structured representation of detected text including:
|
|||
|
|
// - Bounding geometry (rectangles or quadrilaterals)
|
|||
|
|
// - Confidence scores (detection and recognition)
|
|||
|
|
// - Rotation information
|
|||
|
|
// - Hierarchical relationships (Tesseract only)
|
|||
|
|
//
|
|||
|
|
// This field preserves all metadata that would otherwise be lost when
|
|||
|
|
// converting to plain text or markdown output formats.
|
|||
|
|
//
|
|||
|
|
// Only populated when `OcrElementConfig.include_elements` is true.
|
|||
|
|
OcrElements []OcrElement `json:"ocr_elements,omitempty"`
|
|||
|
|
// Structured document tree (when document structure extraction is enabled).
|
|||
|
|
//
|
|||
|
|
// When `include_document_structure` is true in `ExtractionConfig`, this field
|
|||
|
|
// contains the full hierarchical representation of the document including:
|
|||
|
|
// - Heading-driven section nesting
|
|||
|
|
// - Table grids with cell-level metadata
|
|||
|
|
// - Content layer classification (body, header, footer, footnote)
|
|||
|
|
// - Inline text annotations (formatting, links)
|
|||
|
|
// - Bounding boxes and page numbers
|
|||
|
|
//
|
|||
|
|
// Independent of `result_format` — can be combined with Unified or ElementBased.
|
|||
|
|
Document *DocumentStructure `json:"document,omitempty"`
|
|||
|
|
// Extracted keywords when keyword extraction is enabled.
|
|||
|
|
//
|
|||
|
|
// When keyword extraction (RAKE or YAKE) is configured, this field contains
|
|||
|
|
// the extracted keywords with scores, algorithm info, and position data.
|
|||
|
|
// Previously stored in `metadata.additional["keywords"]`.
|
|||
|
|
ExtractedKeywords []Keyword `json:"extracted_keywords,omitempty"`
|
|||
|
|
// Document quality score from quality analysis.
|
|||
|
|
//
|
|||
|
|
// A value between 0.0 and 1.0 indicating the overall text quality.
|
|||
|
|
// Previously stored in `metadata.additional["quality_score"]`.
|
|||
|
|
QualityScore *float64 `json:"quality_score,omitempty"`
|
|||
|
|
// Non-fatal warnings collected during processing pipeline stages.
|
|||
|
|
//
|
|||
|
|
// Captures errors from optional pipeline features (embedding, chunking,
|
|||
|
|
// language detection, output formatting) that don't prevent extraction
|
|||
|
|
// but may indicate degraded results.
|
|||
|
|
// Previously stored as individual keys in `metadata.additional`.
|
|||
|
|
ProcessingWarnings []ProcessingWarning `json:"processing_warnings,omitempty"`
|
|||
|
|
// PDF annotations extracted from the document.
|
|||
|
|
//
|
|||
|
|
// When annotation extraction is enabled via `PdfConfig::extract_annotations`,
|
|||
|
|
// this field contains text notes, highlights, links, stamps, and other
|
|||
|
|
// annotations found in PDF documents.
|
|||
|
|
Annotations []PdfAnnotation `json:"annotations,omitempty"`
|
|||
|
|
// Nested extraction results from archive contents.
|
|||
|
|
//
|
|||
|
|
// When extracting archives, each processable file inside produces its own
|
|||
|
|
// full extraction result. Set to `None` for non-archive formats.
|
|||
|
|
// Use `max_archive_depth` in config to control recursion depth.
|
|||
|
|
Children []ArchiveEntry `json:"children,omitempty"`
|
|||
|
|
// URIs/links discovered during document extraction.
|
|||
|
|
//
|
|||
|
|
// Contains hyperlinks, image references, citations, email addresses, and
|
|||
|
|
// other URI-like references found in the document. Always extracted when
|
|||
|
|
// present in the source document.
|
|||
|
|
Uris []ExtractedURI `json:"uris,omitempty"`
|
|||
|
|
// Tracked changes embedded in the source document.
|
|||
|
|
//
|
|||
|
|
// Populated by per-format extractors that understand change-tracking
|
|||
|
|
// metadata (DOCX `w:ins`/`w:del`/`w:rPrChange`, ODT `text:change-*`,
|
|||
|
|
// …). Every extractor defaults to `None` until its format-specific
|
|||
|
|
// implementation is added. Extractors that do populate this field follow
|
|||
|
|
// the "accepted-changes" convention: inserted text is present in
|
|||
|
|
// `content`, deleted text is absent — the revision list is the separate
|
|||
|
|
// audit trail.
|
|||
|
|
Revisions []DocumentRevision `json:"revisions,omitempty"`
|
|||
|
|
// Structured extraction output from LLM-based JSON schema extraction.
|
|||
|
|
//
|
|||
|
|
// When `structured_extraction` is configured in `ExtractionConfig`, the
|
|||
|
|
// extracted document content is sent to a VLM with the provided JSON schema.
|
|||
|
|
// The response is parsed and stored here as a JSON value matching the schema.
|
|||
|
|
StructuredOutput *json.RawMessage `json:"structured_output,omitempty"`
|
|||
|
|
// Code intelligence results from tree-sitter analysis.
|
|||
|
|
//
|
|||
|
|
// Populated when extracting source code files with the `tree-sitter` feature.
|
|||
|
|
// Contains metrics, structural analysis, imports/exports, comments,
|
|||
|
|
// docstrings, symbols, diagnostics, and optionally chunked code segments.
|
|||
|
|
//
|
|||
|
|
// Stored as an opaque JSON value so that all language bindings (Go, Java,
|
|||
|
|
// C#, …) can deserialize it as a raw JSON object rather than a typed struct.
|
|||
|
|
// The underlying type is `tree_sitter_language_pack::ProcessResult`.
|
|||
|
|
CodeIntelligence *json.RawMessage `json:"code_intelligence,omitempty"`
|
|||
|
|
// LLM token usage and cost data for all LLM calls made during this extraction.
|
|||
|
|
//
|
|||
|
|
// Contains one entry per LLM call. Multiple entries are produced when
|
|||
|
|
// VLM OCR, structured extraction, or LLM embeddings run during
|
|||
|
|
// the same extraction.
|
|||
|
|
//
|
|||
|
|
// `None` when no LLM was used.
|
|||
|
|
LlmUsage []LlmUsage `json:"llm_usage,omitempty"`
|
|||
|
|
// Pre-rendered content in the requested output format.
|
|||
|
|
//
|
|||
|
|
// Populated during `derive_extraction_result` before tree derivation consumes
|
|||
|
|
// element data. `apply_output_format` swaps this into `content` at the end
|
|||
|
|
// of the pipeline, after post-processors have operated on plain text.
|
|||
|
|
FormattedContent *string `json:"formatted_content,omitempty"`
|
|||
|
|
// Structured hOCR document for the OCR+layout pipeline.
|
|||
|
|
//
|
|||
|
|
// When tesseract produces hOCR output, the parsed `InternalDocument` carries
|
|||
|
|
// paragraph structure with bounding boxes and confidence scores. The layout
|
|||
|
|
// classification step enriches these elements before final rendering.
|
|||
|
|
OcrInternalDocument *string `json:"ocr_internal_document,omitempty"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// ArchiveEntry single file extracted from an archive.
|
|||
|
|
//
|
|||
|
|
// When archives (ZIP, TAR, 7Z, GZIP) are extracted with recursive extraction
|
|||
|
|
// enabled, each processable file produces its own full `ExtractionResult`.
|
|||
|
|
type ArchiveEntry struct {
|
|||
|
|
// Archive-relative file path (e.g. "folder/document.pdf").
|
|||
|
|
Path string `json:"path"`
|
|||
|
|
// Detected MIME type of the file.
|
|||
|
|
MimeType string `json:"mime_type"`
|
|||
|
|
// Full extraction result for this file.
|
|||
|
|
Result ExtractionResult `json:"result"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// ProcessingWarning is a non-fatal warning from a processing pipeline stage.
//
// It captures errors from optional features that don't prevent extraction
// but may indicate degraded results.
type ProcessingWarning struct {
	// Source is the pipeline stage or feature that produced this warning
	// (e.g., "embedding", "chunking", "language_detection", "output_format").
	Source string `json:"source"`
	// Message is a human-readable description of what went wrong.
	Message string `json:"message"`
}
|
|||
|
|
|
|||
|
|
// LlmUsage holds token usage and cost data for a single LLM call made during
// extraction.
//
// Populated when VLM OCR, structured extraction, or LLM-based embeddings
// are used. Multiple entries may be present when multiple LLM calls occur
// within one extraction (e.g. VLM OCR + structured extraction).
type LlmUsage struct {
	// Model is the LLM model identifier
	// (e.g. "openai/gpt-4o", "anthropic/claude-sonnet-4-20250514").
	Model string `json:"model"`
	// Source is the pipeline stage that triggered this LLM call
	// (e.g. "vlm_ocr", "structured_extraction", "embeddings").
	Source string `json:"source"`
	// InputTokens is the number of input/prompt tokens consumed.
	// Optional: nil is omitted from the JSON encoding.
	InputTokens *uint64 `json:"input_tokens,omitempty"`
	// OutputTokens is the number of output/completion tokens generated.
	// Optional: nil is omitted from the JSON encoding.
	OutputTokens *uint64 `json:"output_tokens,omitempty"`
	// TotalTokens is the total number of tokens (input + output).
	// Optional: nil is omitted from the JSON encoding.
	TotalTokens *uint64 `json:"total_tokens,omitempty"`
	// EstimatedCost is the estimated cost in USD based on the provider's
	// published pricing. Optional: nil is omitted from the JSON encoding.
	EstimatedCost *float64 `json:"estimated_cost,omitempty"`
	// FinishReason records why the model stopped generating
	// (e.g. "stop", "length", "content_filter").
	FinishReason *string `json:"finish_reason,omitempty"`
}
|
|||
|
|
|
|||
|
|
// Chunk text chunk with optional embedding and metadata.
|
|||
|
|
//
|
|||
|
|
// Chunks are created when chunking is enabled in `ExtractionConfig`. Each chunk
|
|||
|
|
// contains the text content, optional embedding vector (if embedding generation
|
|||
|
|
// is configured), and metadata about its position in the document.
|
|||
|
|
type Chunk struct {
|
|||
|
|
// The text content of this chunk.
|
|||
|
|
Content string `json:"content"`
|
|||
|
|
// Semantic structural classification of this chunk.
|
|||
|
|
//
|
|||
|
|
// Assigned by the heuristic classifier based on content patterns and
|
|||
|
|
// heading context. Defaults to `ChunkType::Unknown` when no rule matches.
|
|||
|
|
ChunkType ChunkType `json:"chunk_type"`
|
|||
|
|
// Optional embedding vector for this chunk.
|
|||
|
|
//
|
|||
|
|
// Only populated when `EmbeddingConfig` is provided in chunking configuration.
|
|||
|
|
// The dimensionality depends on the chosen embedding model.
|
|||
|
|
Embedding []float32 `json:"embedding,omitempty"`
|
|||
|
|
// Metadata about this chunk's position and properties.
|
|||
|
|
Metadata ChunkMetadata `json:"metadata"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// HeadingContext heading context for a chunk within a Markdown document.
|
|||
|
|
//
|
|||
|
|
// Contains the heading hierarchy from document root to this chunk's section.
|
|||
|
|
type HeadingContext struct {
|
|||
|
|
// The heading hierarchy from document root to this chunk's section.
|
|||
|
|
// Index 0 is the outermost (h1), last element is the most specific.
|
|||
|
|
Headings []HeadingLevel `json:"headings,omitempty"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// HeadingLevel is a single heading in the hierarchy.
type HeadingLevel struct {
	// Level is the heading depth (1 = h1, 2 = h2, etc.).
	Level uint8 `json:"level"`
	// Text is the text content of the heading.
	Text string `json:"text"`
}
|
|||
|
|
|
|||
|
|
// ChunkMetadata metadata about a chunk's position in the original document.
|
|||
|
|
type ChunkMetadata struct {
|
|||
|
|
// Byte offset where this chunk starts in the original text (UTF-8 valid boundary).
|
|||
|
|
ByteStart uint `json:"byte_start"`
|
|||
|
|
// Byte offset where this chunk ends in the original text (UTF-8 valid boundary).
|
|||
|
|
ByteEnd uint `json:"byte_end"`
|
|||
|
|
// Number of tokens in this chunk (if available).
|
|||
|
|
//
|
|||
|
|
// This is calculated by the embedding model's tokenizer if embeddings are enabled.
|
|||
|
|
TokenCount *uint `json:"token_count,omitempty"`
|
|||
|
|
// Zero-based index of this chunk in the document.
|
|||
|
|
ChunkIndex uint `json:"chunk_index"`
|
|||
|
|
// Total number of chunks in the document.
|
|||
|
|
TotalChunks uint `json:"total_chunks"`
|
|||
|
|
// First page number this chunk spans (1-indexed).
|
|||
|
|
//
|
|||
|
|
// Only populated when page tracking is enabled in extraction configuration.
|
|||
|
|
FirstPage *uint32 `json:"first_page,omitempty"`
|
|||
|
|
// Last page number this chunk spans (1-indexed, equal to first_page for single-page chunks).
|
|||
|
|
//
|
|||
|
|
// Only populated when page tracking is enabled in extraction configuration.
|
|||
|
|
LastPage *uint32 `json:"last_page,omitempty"`
|
|||
|
|
// Heading context when using Markdown chunker.
|
|||
|
|
//
|
|||
|
|
// Contains the heading hierarchy this chunk falls under.
|
|||
|
|
// Only populated when `ChunkerType::Markdown` is used.
|
|||
|
|
HeadingContext *HeadingContext `json:"heading_context,omitempty"`
|
|||
|
|
// Indices into `ExtractionResult.images` for images on pages covered by this chunk.
|
|||
|
|
//
|
|||
|
|
// Contains zero-based indices into the top-level `images` collection for every
|
|||
|
|
// image whose `page_number` falls within `[first_page, last_page]`.
|
|||
|
|
// Empty when image extraction is disabled or the chunk spans no pages with images.
|
|||
|
|
ImageIndices []uint32 `json:"image_indices,omitempty"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// ExtractedImage extracted image from a document.
|
|||
|
|
//
|
|||
|
|
// Contains raw image data, metadata, and optional nested OCR results.
|
|||
|
|
// Raw bytes allow cross-language compatibility - users can convert to
|
|||
|
|
// PIL.Image (Python), Sharp (Node.js), or other formats as needed.
|
|||
|
|
type ExtractedImage struct {
|
|||
|
|
// Raw image data (PNG, JPEG, WebP, etc. bytes).
|
|||
|
|
// Uses `bytes::Bytes` for cheap cloning of large buffers.
|
|||
|
|
Data []byte `json:"data"`
|
|||
|
|
// Image format (e.g., "jpeg", "png", "webp")
|
|||
|
|
// Uses Cow<'static, str> to avoid allocation for static literals.
|
|||
|
|
Format string `json:"format"`
|
|||
|
|
// Zero-indexed position of this image in the document/page
|
|||
|
|
ImageIndex uint32 `json:"image_index"`
|
|||
|
|
// Page/slide number where image was found (1-indexed)
|
|||
|
|
PageNumber *uint32 `json:"page_number,omitempty"`
|
|||
|
|
// Image width in pixels
|
|||
|
|
Width *uint32 `json:"width,omitempty"`
|
|||
|
|
// Image height in pixels
|
|||
|
|
Height *uint32 `json:"height,omitempty"`
|
|||
|
|
// Colorspace information (e.g., "RGB", "CMYK", "Gray")
|
|||
|
|
Colorspace *string `json:"colorspace,omitempty"`
|
|||
|
|
// Bits per color component (e.g., 8, 16)
|
|||
|
|
BitsPerComponent *uint32 `json:"bits_per_component,omitempty"`
|
|||
|
|
// Whether this image is a mask image
|
|||
|
|
IsMask bool `json:"is_mask"`
|
|||
|
|
// Optional description of the image
|
|||
|
|
Description *string `json:"description,omitempty"`
|
|||
|
|
// Nested OCR extraction result (if image was OCRed)
|
|||
|
|
//
|
|||
|
|
// When OCR is performed on this image, the result is embedded here
|
|||
|
|
// rather than in a separate collection, making the relationship explicit.
|
|||
|
|
OcrResult *ExtractionResult `json:"ocr_result,omitempty"`
|
|||
|
|
// Bounding box of the image on the page (PDF coordinates: x0=left, y0=bottom, x1=right, y1=top).
|
|||
|
|
// Only populated for PDF-extracted images when position data is available from the PDF extractor.
|
|||
|
|
BoundingBox *BoundingBox `json:"bounding_box,omitempty"`
|
|||
|
|
// Original source path of the image within the document archive (e.g., "media/image1.png" in DOCX).
|
|||
|
|
// Used for rendering image references when the binary data is not extracted.
|
|||
|
|
SourcePath *string `json:"source_path,omitempty"`
|
|||
|
|
// Heuristic classification of what this image likely depicts.
|
|||
|
|
// `None` if classification was disabled or inconclusive.
|
|||
|
|
ImageKind *ImageKind `json:"image_kind,omitempty"`
|
|||
|
|
// Confidence score for `image_kind`, in the range 0.0 to 1.0.
|
|||
|
|
KindConfidence *float32 `json:"kind_confidence,omitempty"`
|
|||
|
|
// Identifier shared across images that form a single logical figure
|
|||
|
|
// (e.g. all raster tiles of one technical drawing). `None` for singletons.
|
|||
|
|
ClusterID *uint32 `json:"cluster_id,omitempty"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// MarshalJSON serializes `[]byte` fields as a JSON array of integers (the format
|
|||
|
|
// Rust's serde `Vec<u8>` deserializer expects) instead of Go's default base64 string.
|
|||
|
|
func (v ExtractedImage) MarshalJSON() ([]byte, error) {
|
|||
|
|
// Explicit shadow struct listing every field — embedding the original
|
|||
|
|
// would cause both base64-string and int-array entries for the same JSON
|
|||
|
|
// key. Bytes fields rendered as `[]int`; everything else copied verbatim.
|
|||
|
|
aux := struct {
|
|||
|
|
Data []int `json:"data"`
|
|||
|
|
Format string `json:"format"`
|
|||
|
|
ImageIndex uint32 `json:"image_index"`
|
|||
|
|
PageNumber *uint32 `json:"page_number,omitempty"`
|
|||
|
|
Width *uint32 `json:"width,omitempty"`
|
|||
|
|
Height *uint32 `json:"height,omitempty"`
|
|||
|
|
Colorspace *string `json:"colorspace,omitempty"`
|
|||
|
|
BitsPerComponent *uint32 `json:"bits_per_component,omitempty"`
|
|||
|
|
IsMask bool `json:"is_mask"`
|
|||
|
|
Description *string `json:"description,omitempty"`
|
|||
|
|
OcrResult *ExtractionResult `json:"ocr_result,omitempty"`
|
|||
|
|
BoundingBox *BoundingBox `json:"bounding_box,omitempty"`
|
|||
|
|
SourcePath *string `json:"source_path,omitempty"`
|
|||
|
|
ImageKind *ImageKind `json:"image_kind,omitempty"`
|
|||
|
|
KindConfidence *float32 `json:"kind_confidence,omitempty"`
|
|||
|
|
ClusterID *uint32 `json:"cluster_id,omitempty"`
|
|||
|
|
}{}
|
|||
|
|
aux.Data = make([]int, len(v.Data))
|
|||
|
|
for i, b := range v.Data {
|
|||
|
|
aux.Data[i] = int(b)
|
|||
|
|
}
|
|||
|
|
aux.Format = v.Format
|
|||
|
|
aux.ImageIndex = v.ImageIndex
|
|||
|
|
aux.PageNumber = v.PageNumber
|
|||
|
|
aux.Width = v.Width
|
|||
|
|
aux.Height = v.Height
|
|||
|
|
aux.Colorspace = v.Colorspace
|
|||
|
|
aux.BitsPerComponent = v.BitsPerComponent
|
|||
|
|
aux.IsMask = v.IsMask
|
|||
|
|
aux.Description = v.Description
|
|||
|
|
aux.OcrResult = v.OcrResult
|
|||
|
|
aux.BoundingBox = v.BoundingBox
|
|||
|
|
aux.SourcePath = v.SourcePath
|
|||
|
|
aux.ImageKind = v.ImageKind
|
|||
|
|
aux.KindConfidence = v.KindConfidence
|
|||
|
|
aux.ClusterID = v.ClusterID
|
|||
|
|
return json.Marshal(aux)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// BoundingBox holds bounding box coordinates for element positioning.
type BoundingBox struct {
	// X0 is the left x-coordinate.
	X0 float64 `json:"x0"`
	// Y0 is the bottom y-coordinate.
	Y0 float64 `json:"y0"`
	// X1 is the right x-coordinate.
	X1 float64 `json:"x1"`
	// Y1 is the top y-coordinate.
	Y1 float64 `json:"y1"`
}
|
|||
|
|
|
|||
|
|
// ElementMetadata metadata for a semantic element.
|
|||
|
|
type ElementMetadata struct {
|
|||
|
|
// Page number (1-indexed)
|
|||
|
|
PageNumber *uint32 `json:"page_number,omitempty"`
|
|||
|
|
// Source filename or document name
|
|||
|
|
Filename *string `json:"filename,omitempty"`
|
|||
|
|
// Bounding box coordinates if available
|
|||
|
|
Coordinates *BoundingBox `json:"coordinates,omitempty"`
|
|||
|
|
// Position index in the element sequence
|
|||
|
|
ElementIndex *uint `json:"element_index,omitempty"`
|
|||
|
|
// Additional custom metadata
|
|||
|
|
Additional map[string]string `json:"additional,omitempty"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Element semantic element extracted from document.
|
|||
|
|
//
|
|||
|
|
// Represents a logical unit of content with semantic classification,
|
|||
|
|
// unique identifier, and metadata for tracking origin and position.
|
|||
|
|
type Element struct {
|
|||
|
|
// Unique element identifier
|
|||
|
|
ElementID string `json:"element_id"`
|
|||
|
|
// Semantic type of this element
|
|||
|
|
ElementType ElementType `json:"element_type"`
|
|||
|
|
// Text content of the element
|
|||
|
|
Text string `json:"text"`
|
|||
|
|
// Metadata about the element
|
|||
|
|
Metadata ElementMetadata `json:"metadata"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// ExcelWorkbook excel workbook representation.
|
|||
|
|
//
|
|||
|
|
// Contains all sheets from an Excel file (.xlsx, .xls, etc.) with
|
|||
|
|
// extracted content and metadata.
|
|||
|
|
type ExcelWorkbook struct {
|
|||
|
|
// All sheets in the workbook
|
|||
|
|
Sheets []ExcelSheet `json:"sheets,omitempty"`
|
|||
|
|
// Workbook-level metadata (author, creation date, etc.)
|
|||
|
|
Metadata map[string]string `json:"metadata,omitempty"`
|
|||
|
|
// Collaborative-edit revision headers from `xl/revisions/revisionHeaders.xml`.
|
|||
|
|
//
|
|||
|
|
// Populated for legacy shared-workbook `.xlsx` files that contain the
|
|||
|
|
// `xl/revisions/` directory. Each `<header>` element maps to one
|
|||
|
|
// `DocumentRevision { kind: FormatChange }` carrying the header's `guid`
|
|||
|
|
// (→ `revision_id`), `userName` (→ `author`), and `dateTime` (→ `timestamp`).
|
|||
|
|
// `anchor` and `delta` are `None`/empty for v1 (per-cell log parsing is a
|
|||
|
|
// follow-up). `None` when `xl/revisions/revisionHeaders.xml` is absent.
|
|||
|
|
Revisions []DocumentRevision `json:"revisions,omitempty"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// ExcelSheet is a single Excel worksheet.
//
// Represents one sheet from an Excel workbook with its content
// converted to Markdown format and dimensional statistics.
type ExcelSheet struct {
	// Name is the sheet name as it appears in Excel.
	Name string `json:"name"`
	// Markdown is the sheet content converted to Markdown tables.
	Markdown string `json:"markdown"`
	// RowCount is the number of rows.
	RowCount uint `json:"row_count"`
	// ColCount is the number of columns.
	ColCount uint `json:"col_count"`
	// CellCount is the total number of non-empty cells.
	CellCount uint `json:"cell_count"`
	// TableCells holds the pre-extracted table cells (2D vector of cell values).
	// Populated during markdown generation to avoid re-parsing markdown.
	// Unset (`None` on the Rust side) for empty sheets.
	TableCells [][]string `json:"table_cells,omitempty"`
}
|
|||
|
|
|
|||
|
|
// XMLExtractionResult is the XML extraction result.
//
// Contains extracted text content from XML files along with
// structural statistics about the XML document.
type XMLExtractionResult struct {
	// Content is the extracted text content (XML structure filtered out).
	Content string `json:"content"`
	// ElementCount is the total number of XML elements processed.
	ElementCount uint `json:"element_count"`
	// UniqueElements is the list of unique element names found (sorted).
	UniqueElements []string `json:"unique_elements,omitempty"`
}
|
|||
|
|
|
|||
|
|
// TextExtractionResult is the plain text and Markdown extraction result.
//
// Contains the extracted text along with statistics and,
// for Markdown files, structural elements like headers and links.
type TextExtractionResult struct {
	// Content is the extracted text content.
	Content string `json:"content"`
	// LineCount is the number of lines.
	LineCount uint `json:"line_count"`
	// WordCount is the number of words.
	WordCount uint `json:"word_count"`
	// CharacterCount is the number of characters.
	CharacterCount uint `json:"character_count"`
	// Headers holds the Markdown headers (text only, Markdown files only).
	Headers []string `json:"headers,omitempty"`
	// Links holds the Markdown links as (text, URL) tuples (Markdown files only).
	Links [][]string `json:"links,omitempty"`
	// CodeBlocks holds the code blocks as (language, code) tuples (Markdown files only).
	CodeBlocks [][]string `json:"code_blocks,omitempty"`
}
|
|||
|
|
|
|||
|
|
// PptxExtractionResult powerPoint (PPTX) extraction result.
|
|||
|
|
//
|
|||
|
|
// Contains extracted slide content, metadata, and embedded images/tables.
|
|||
|
|
type PptxExtractionResult struct {
|
|||
|
|
// Extracted text content from all slides
|
|||
|
|
Content string `json:"content"`
|
|||
|
|
// Presentation metadata
|
|||
|
|
Metadata PptxMetadata `json:"metadata"`
|
|||
|
|
// Total number of slides
|
|||
|
|
SlideCount uint `json:"slide_count"`
|
|||
|
|
// Total number of embedded images
|
|||
|
|
ImageCount uint `json:"image_count"`
|
|||
|
|
// Total number of tables
|
|||
|
|
TableCount uint `json:"table_count"`
|
|||
|
|
// Extracted images from the presentation
|
|||
|
|
Images []ExtractedImage `json:"images,omitempty"`
|
|||
|
|
// Slide structure with boundaries (when page tracking is enabled)
|
|||
|
|
PageStructure *PageStructure `json:"page_structure,omitempty"`
|
|||
|
|
// Per-slide content (when page tracking is enabled)
|
|||
|
|
PageContents []PageContent `json:"page_contents,omitempty"`
|
|||
|
|
// Structured document representation
|
|||
|
|
Document *DocumentStructure `json:"document,omitempty"`
|
|||
|
|
// Hyperlinks discovered in slides as (url, optional_label) pairs.
|
|||
|
|
Hyperlinks []string `json:"hyperlinks,omitempty"`
|
|||
|
|
// Office metadata extracted from docProps/core.xml and docProps/app.xml.
|
|||
|
|
//
|
|||
|
|
// Contains keys like "title", "author", "created_by", "subject", "keywords",
|
|||
|
|
// "modified_by", "created_at", "modified_at", etc.
|
|||
|
|
OfficeMetadata map[string]string `json:"office_metadata,omitempty"`
|
|||
|
|
// Slide comments as revisions.
|
|||
|
|
//
|
|||
|
|
// Each `<p:cm>` element in `ppt/comments/comment{N}.xml` becomes a
|
|||
|
|
// `DocumentRevision { kind: Comment }` with author (resolved from
|
|||
|
|
// `ppt/commentAuthors.xml`), ISO-8601 timestamp, and
|
|||
|
|
// `RevisionAnchor::Slide { index }`. `None` when no comment XML parts exist.
|
|||
|
|
Revisions []DocumentRevision `json:"revisions,omitempty"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// EmailExtractionResult email extraction result.
|
|||
|
|
//
|
|||
|
|
// Complete representation of an extracted email message (.eml or .msg)
|
|||
|
|
// including headers, body content, and attachments.
|
|||
|
|
type EmailExtractionResult struct {
|
|||
|
|
// Email subject line
|
|||
|
|
Subject *string `json:"subject,omitempty"`
|
|||
|
|
// Sender email address
|
|||
|
|
FromEmail *string `json:"from_email,omitempty"`
|
|||
|
|
// Primary recipient email addresses
|
|||
|
|
ToEmails []string `json:"to_emails,omitempty"`
|
|||
|
|
// CC recipient email addresses
|
|||
|
|
CcEmails []string `json:"cc_emails,omitempty"`
|
|||
|
|
// BCC recipient email addresses
|
|||
|
|
BccEmails []string `json:"bcc_emails,omitempty"`
|
|||
|
|
// Email date/timestamp
|
|||
|
|
Date *string `json:"date,omitempty"`
|
|||
|
|
// Message-ID header value
|
|||
|
|
MessageID *string `json:"message_id,omitempty"`
|
|||
|
|
// Plain text version of the email body
|
|||
|
|
PlainText *string `json:"plain_text,omitempty"`
|
|||
|
|
// HTML version of the email body
|
|||
|
|
HTMLContent *string `json:"html_content,omitempty"`
|
|||
|
|
// Cleaned/processed text content. Aliased as `cleaned_text` for back-compat.
|
|||
|
|
Content string `json:"content"`
|
|||
|
|
// List of email attachments
|
|||
|
|
Attachments []EmailAttachment `json:"attachments,omitempty"`
|
|||
|
|
// Additional email headers and metadata
|
|||
|
|
Metadata map[string]string `json:"metadata,omitempty"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// EmailAttachment is the representation of an email attachment.
//
// Contains metadata and optionally the content of an email attachment.
type EmailAttachment struct {
	// Name is the attachment name (from Content-Disposition header).
	Name *string `json:"name,omitempty"`
	// Filename is the filename of the attachment.
	Filename *string `json:"filename,omitempty"`
	// MimeType is the MIME type of the attachment.
	MimeType *string `json:"mime_type,omitempty"`
	// Size is the size in bytes.
	Size *uint `json:"size,omitempty"`
	// IsImage reports whether this attachment is an image.
	IsImage bool `json:"is_image"`
	// Data is the attachment data (if extracted).
	// The Rust side uses `bytes::Bytes` for cheap cloning of large buffers.
	Data []byte `json:"data,omitempty"`
}
|
|||
|
|
|
|||
|
|
// MarshalJSON serializes `[]byte` fields as a JSON array of integers (the format
|
|||
|
|
// Rust's serde `Vec<u8>` deserializer expects) instead of Go's default base64 string.
|
|||
|
|
func (v EmailAttachment) MarshalJSON() ([]byte, error) {
|
|||
|
|
// Explicit shadow struct listing every field — embedding the original
|
|||
|
|
// would cause both base64-string and int-array entries for the same JSON
|
|||
|
|
// key. Bytes fields rendered as `[]int`; everything else copied verbatim.
|
|||
|
|
aux := struct {
|
|||
|
|
Name *string `json:"name,omitempty"`
|
|||
|
|
Filename *string `json:"filename,omitempty"`
|
|||
|
|
MimeType *string `json:"mime_type,omitempty"`
|
|||
|
|
Size *uint `json:"size,omitempty"`
|
|||
|
|
IsImage bool `json:"is_image"`
|
|||
|
|
Data []int `json:"data,omitempty"`
|
|||
|
|
}{}
|
|||
|
|
aux.Name = v.Name
|
|||
|
|
aux.Filename = v.Filename
|
|||
|
|
aux.MimeType = v.MimeType
|
|||
|
|
aux.Size = v.Size
|
|||
|
|
aux.IsImage = v.IsImage
|
|||
|
|
if v.Data != nil {
|
|||
|
|
aux.Data = make([]int, len(v.Data))
|
|||
|
|
for i, b := range v.Data {
|
|||
|
|
aux.Data[i] = int(b)
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
return json.Marshal(aux)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// OcrExtractionResult oCR extraction result.
|
|||
|
|
//
|
|||
|
|
// Result of performing OCR on an image or scanned document,
|
|||
|
|
// including recognized text and detected tables.
|
|||
|
|
type OcrExtractionResult struct {
|
|||
|
|
// Recognized text content
|
|||
|
|
Content string `json:"content"`
|
|||
|
|
// Original MIME type of the processed image
|
|||
|
|
MimeType string `json:"mime_type"`
|
|||
|
|
// OCR processing metadata (confidence scores, language, etc.)
|
|||
|
|
Metadata map[string]json.RawMessage `json:"metadata,omitempty"`
|
|||
|
|
// Tables detected and extracted via OCR
|
|||
|
|
Tables []OcrTable `json:"tables,omitempty"`
|
|||
|
|
// Structured OCR elements with bounding boxes and confidence scores.
|
|||
|
|
// Available when TSV output is requested or table detection is enabled.
|
|||
|
|
OcrElements []OcrElement `json:"ocr_elements,omitempty"`
|
|||
|
|
// Structured document produced from hOCR parsing.
|
|||
|
|
// Carries paragraph structure, bounding boxes, and confidence scores
|
|||
|
|
// that the flattened `content` string discards.
|
|||
|
|
InternalDocument *string `json:"internal_document,omitempty"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// OcrTable table detected via OCR.
|
|||
|
|
//
|
|||
|
|
// Represents a table structure recognized during OCR processing.
|
|||
|
|
type OcrTable struct {
|
|||
|
|
// Table cells as a 2D vector (rows × columns)
|
|||
|
|
Cells [][]string `json:"cells,omitempty"`
|
|||
|
|
// Markdown representation of the table
|
|||
|
|
Markdown string `json:"markdown"`
|
|||
|
|
// Page number where the table was found (1-indexed)
|
|||
|
|
PageNumber uint32 `json:"page_number"`
|
|||
|
|
// Bounding box of the table in pixel coordinates (from OCR word positions).
|
|||
|
|
BoundingBox *OcrTableBoundingBox `json:"bounding_box,omitempty"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// OcrTableBoundingBox is the bounding box for an OCR-detected table in pixel
// coordinates.
type OcrTableBoundingBox struct {
	// Left is the left x-coordinate (pixels).
	Left uint32 `json:"left"`
	// Top is the top y-coordinate (pixels).
	Top uint32 `json:"top"`
	// Right is the right x-coordinate (pixels).
	Right uint32 `json:"right"`
	// Bottom is the bottom y-coordinate (pixels).
	Bottom uint32 `json:"bottom"`
}
|
|||
|
|
|
|||
|
|
// ImagePreprocessingConfig is the image preprocessing configuration for OCR.
//
// These settings control how images are preprocessed before OCR to improve
// text recognition quality. Different preprocessing strategies work better
// for different document types.
type ImagePreprocessingConfig struct {
	// TargetDpi is the target DPI for the image (300 is standard, 600 for
	// small text).
	TargetDpi *int32 `json:"target_dpi,omitempty"`
	// AutoRotate enables auto-detection and correction of image rotation.
	AutoRotate *bool `json:"auto_rotate,omitempty"`
	// Deskew enables skew correction (tilted images).
	Deskew *bool `json:"deskew,omitempty"`
	// Denoise removes noise from the image.
	Denoise bool `json:"denoise"`
	// ContrastEnhance enhances contrast for better text visibility.
	ContrastEnhance bool `json:"contrast_enhance"`
	// BinarizationMethod is the binarization method: "otsu", "sauvola", "adaptive".
	BinarizationMethod *string `json:"binarization_method,omitempty"`
	// InvertColors inverts colors (white text on black → black on white).
	InvertColors bool `json:"invert_colors"`
}
|
|||
|
|
|
|||
|
|
// TesseractConfig tesseract OCR configuration.
|
|||
|
|
//
|
|||
|
|
// Provides fine-grained control over Tesseract OCR engine parameters.
|
|||
|
|
// Most users can use the defaults, but these settings allow optimization
|
|||
|
|
// for specific document types (invoices, handwriting, etc.).
|
|||
|
|
type TesseractConfig struct {
|
|||
|
|
// Language code (e.g., "eng", "deu", "fra")
|
|||
|
|
Language *string `json:"language,omitempty"`
|
|||
|
|
// Page Segmentation Mode (0-13).
|
|||
|
|
//
|
|||
|
|
// Common values:
|
|||
|
|
// - 3: Fully automatic page segmentation (native default)
|
|||
|
|
// - 6: Assume a single uniform block of text (WASM default — avoids layout-analysis hang)
|
|||
|
|
// - 11: Sparse text with no particular order
|
|||
|
|
Psm *int32 `json:"psm,omitempty"`
|
|||
|
|
// Output format ("text" or "markdown")
|
|||
|
|
OutputFormat *string `json:"output_format,omitempty"`
|
|||
|
|
// OCR Engine Mode (0-3).
|
|||
|
|
//
|
|||
|
|
// - 0: Legacy engine only
|
|||
|
|
// - 1: Neural nets (LSTM) only (usually best)
|
|||
|
|
// - 2: Legacy + LSTM
|
|||
|
|
// - 3: Default (based on what's available)
|
|||
|
|
Oem *int32 `json:"oem,omitempty"`
|
|||
|
|
// Minimum confidence threshold (0.0-100.0).
|
|||
|
|
//
|
|||
|
|
// Words with confidence below this threshold may be rejected or flagged.
|
|||
|
|
MinConfidence float64 `json:"min_confidence"`
|
|||
|
|
// Image preprocessing configuration.
|
|||
|
|
//
|
|||
|
|
// Controls how images are preprocessed before OCR. Can significantly
|
|||
|
|
// improve quality for scanned documents or low-quality images.
|
|||
|
|
Preprocessing *ImagePreprocessingConfig `json:"preprocessing,omitempty"`
|
|||
|
|
// Enable automatic table detection and reconstruction
|
|||
|
|
EnableTableDetection *bool `json:"enable_table_detection,omitempty"`
|
|||
|
|
// Minimum confidence threshold for table detection (0.0-1.0)
|
|||
|
|
TableMinConfidence float64 `json:"table_min_confidence"`
|
|||
|
|
// Column threshold for table detection (pixels)
|
|||
|
|
TableColumnThreshold *int32 `json:"table_column_threshold,omitempty"`
|
|||
|
|
// Row threshold ratio for table detection (0.0-1.0)
|
|||
|
|
TableRowThresholdRatio *float64 `json:"table_row_threshold_ratio,omitempty"`
|
|||
|
|
// Enable OCR result caching
|
|||
|
|
UseCache *bool `json:"use_cache,omitempty"`
|
|||
|
|
// Use pre-adapted templates for character classification
|
|||
|
|
ClassifyUsePreAdaptedTemplates *bool `json:"classify_use_pre_adapted_templates,omitempty"`
|
|||
|
|
// Enable N-gram language model
|
|||
|
|
LanguageModelNgramOn bool `json:"language_model_ngram_on"`
|
|||
|
|
// Don't reject good words during block-level processing
|
|||
|
|
TesseditDontBlkrejGoodWds *bool `json:"tessedit_dont_blkrej_good_wds,omitempty"`
|
|||
|
|
// Don't reject good words during row-level processing
|
|||
|
|
TesseditDontRowrejGoodWds *bool `json:"tessedit_dont_rowrej_good_wds,omitempty"`
|
|||
|
|
// Enable dictionary correction
|
|||
|
|
TesseditEnableDictCorrection *bool `json:"tessedit_enable_dict_correction,omitempty"`
|
|||
|
|
// Whitelist of allowed characters (empty = all allowed)
|
|||
|
|
TesseditCharWhitelist string `json:"tessedit_char_whitelist"`
|
|||
|
|
// Blacklist of forbidden characters (empty = none forbidden)
|
|||
|
|
TesseditCharBlacklist string `json:"tessedit_char_blacklist"`
|
|||
|
|
// Use primary language params model
|
|||
|
|
TesseditUsePrimaryParamsModel *bool `json:"tessedit_use_primary_params_model,omitempty"`
|
|||
|
|
// Variable-width space detection
|
|||
|
|
TextordSpaceSizeIsVariable *bool `json:"textord_space_size_is_variable,omitempty"`
|
|||
|
|
// Use adaptive thresholding method
|
|||
|
|
ThresholdingMethod bool `json:"thresholding_method"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// ImagePreprocessingMetadata is the image preprocessing metadata.
//
// Tracks the transformations applied to an image during OCR preprocessing,
// including DPI normalization, resizing, and resampling.
type ImagePreprocessingMetadata struct {
	// Original image dimensions (width, height) in pixels.
	OriginalDimensions []uint `json:"original_dimensions,omitempty"`
	// Original image DPI (horizontal, vertical).
	OriginalDpi []float64 `json:"original_dpi,omitempty"`
	// Target DPI from configuration.
	TargetDpi int32 `json:"target_dpi"`
	// Scaling factor applied to the image.
	ScaleFactor float64 `json:"scale_factor"`
	// Whether DPI was auto-adjusted based on content.
	AutoAdjusted bool `json:"auto_adjusted"`
	// Final DPI after processing.
	FinalDpi int32 `json:"final_dpi"`
	// New dimensions after resizing (if resized).
	NewDimensions []uint `json:"new_dimensions,omitempty"`
	// Resampling algorithm used ("LANCZOS3", "CATMULLROM", etc.).
	ResampleMethod string `json:"resample_method"`
	// Whether dimensions were clamped to max_image_dimension.
	DimensionClamped bool `json:"dimension_clamped"`
	// Calculated optimal DPI (if auto_adjust_dpi enabled).
	CalculatedDpi *int32 `json:"calculated_dpi,omitempty"`
	// Whether resize was skipped (dimensions already optimal).
	SkippedResize bool `json:"skipped_resize"`
	// Error message if resize failed.
	ResizeError *string `json:"resize_error,omitempty"`
}
|
|||
|
|
|
|||
|
|
// Metadata extraction result metadata.
|
|||
|
|
//
|
|||
|
|
// Contains common fields applicable to all formats, format-specific metadata
|
|||
|
|
// via a discriminated union, and additional custom fields from postprocessors.
|
|||
|
|
type Metadata struct {
|
|||
|
|
// Document title
|
|||
|
|
Title *string `json:"title,omitempty"`
|
|||
|
|
// Document subject or description
|
|||
|
|
Subject *string `json:"subject,omitempty"`
|
|||
|
|
// Primary author(s) - always Vec for consistency
|
|||
|
|
Authors []string `json:"authors,omitempty"`
|
|||
|
|
// Keywords/tags - always Vec for consistency
|
|||
|
|
Keywords []string `json:"keywords,omitempty"`
|
|||
|
|
// Primary language (ISO 639 code)
|
|||
|
|
Language *string `json:"language,omitempty"`
|
|||
|
|
// Creation timestamp (ISO 8601 format)
|
|||
|
|
CreatedAt *string `json:"created_at,omitempty"`
|
|||
|
|
// Last modification timestamp (ISO 8601 format)
|
|||
|
|
ModifiedAt *string `json:"modified_at,omitempty"`
|
|||
|
|
// User who created the document
|
|||
|
|
CreatedBy *string `json:"created_by,omitempty"`
|
|||
|
|
// User who last modified the document
|
|||
|
|
ModifiedBy *string `json:"modified_by,omitempty"`
|
|||
|
|
// Page/slide/sheet structure with boundaries
|
|||
|
|
Pages *PageStructure `json:"pages,omitempty"`
|
|||
|
|
// Format-specific metadata (discriminated union)
|
|||
|
|
//
|
|||
|
|
// Contains detailed metadata specific to the document format.
|
|||
|
|
// Serialized as a nested `"format"` object with a `format_type` discriminator field.
|
|||
|
|
Format *FormatMetadata `json:"format,omitempty"`
|
|||
|
|
// Image preprocessing metadata (when OCR preprocessing was applied)
|
|||
|
|
ImagePreprocessing *ImagePreprocessingMetadata `json:"image_preprocessing,omitempty"`
|
|||
|
|
// JSON schema (for structured data extraction)
|
|||
|
|
JSONSchema *json.RawMessage `json:"json_schema,omitempty"`
|
|||
|
|
// Error metadata (for batch operations)
|
|||
|
|
Error *ErrorMetadata `json:"error,omitempty"`
|
|||
|
|
// Extraction duration in milliseconds (for benchmarking).
|
|||
|
|
//
|
|||
|
|
// This field is populated by batch extraction to provide per-file timing
|
|||
|
|
// information. It's `None` for single-file extraction (which uses external timing).
|
|||
|
|
ExtractionDurationMs *uint64 `json:"extraction_duration_ms,omitempty"`
|
|||
|
|
// Document category (from frontmatter or classification).
|
|||
|
|
Category *string `json:"category,omitempty"`
|
|||
|
|
// Document tags (from frontmatter).
|
|||
|
|
Tags []string `json:"tags,omitempty"`
|
|||
|
|
// Document version string (from frontmatter).
|
|||
|
|
DocumentVersion *string `json:"document_version,omitempty"`
|
|||
|
|
// Abstract or summary text (from frontmatter).
|
|||
|
|
AbstractText *string `json:"abstract_text,omitempty"`
|
|||
|
|
// Output format identifier (e.g., "markdown", "html", "text").
|
|||
|
|
//
|
|||
|
|
// Set by the output format pipeline stage when format conversion is applied.
|
|||
|
|
// Previously stored in `metadata.additional["output_format"]`.
|
|||
|
|
OutputFormat *string `json:"output_format,omitempty"`
|
|||
|
|
// Whether OCR was used during extraction.
|
|||
|
|
//
|
|||
|
|
// Set to `true` whenever the extraction pipeline ran an OCR backend
|
|||
|
|
// (Tesseract, PaddleOCR, VLM, etc.) and used that output as the primary
|
|||
|
|
// or fallback text. `false` means native text extraction was used exclusively.
|
|||
|
|
OcrUsed bool `json:"ocr_used"`
|
|||
|
|
// Additional custom fields from postprocessors.
|
|||
|
|
//
|
|||
|
|
// Serialized as a nested `"additional"` object (not flattened at root level).
|
|||
|
|
// Uses `Cow<'static, str>` keys so static string keys avoid allocation.
|
|||
|
|
Additional map[string]json.RawMessage `json:"additional,omitempty"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// ExcelMetadata is the Excel/spreadsheet format metadata.
//
// Identifies the document as a spreadsheet source via the `FormatMetadata::Excel`
// discriminant. Sheet count and sheet names are stored inside this struct.
type ExcelMetadata struct {
	// Number of sheets in the workbook.
	SheetCount *uint32 `json:"sheet_count,omitempty"`
	// Names of all sheets in the workbook.
	SheetNames []string `json:"sheet_names,omitempty"`
}
|
|||
|
|
|
|||
|
|
// EmailMetadata is the email metadata extracted from .eml and .msg files.
//
// Includes sender/recipient information, message ID, and attachment list.
type EmailMetadata struct {
	// Sender's email address.
	FromEmail *string `json:"from_email,omitempty"`
	// Sender's display name.
	FromName *string `json:"from_name,omitempty"`
	// Primary recipients.
	ToEmails []string `json:"to_emails,omitempty"`
	// CC recipients.
	CcEmails []string `json:"cc_emails,omitempty"`
	// BCC recipients.
	BccEmails []string `json:"bcc_emails,omitempty"`
	// Message-ID header value.
	MessageID *string `json:"message_id,omitempty"`
	// List of attachment filenames.
	Attachments []string `json:"attachments,omitempty"`
}
|
|||
|
|
|
|||
|
|
// ArchiveMetadata is the archive (ZIP/TAR/7Z) metadata.
//
// Extracted from compressed archive files containing file lists and size information.
type ArchiveMetadata struct {
	// Archive format ("ZIP", "TAR", "7Z", etc.).
	Format string `json:"format"`
	// Total number of files in the archive.
	FileCount uint32 `json:"file_count"`
	// List of file paths within the archive.
	FileList []string `json:"file_list,omitempty"`
	// Total uncompressed size in bytes.
	TotalSize uint64 `json:"total_size"`
	// Compressed size in bytes (if available).
	CompressedSize *uint64 `json:"compressed_size,omitempty"`
}
|
|||
|
|
|
|||
|
|
// ImageMetadata is the image metadata extracted from image files.
//
// Includes dimensions, format, and EXIF data.
type ImageMetadata struct {
	// Image width in pixels.
	Width uint32 `json:"width"`
	// Image height in pixels.
	Height uint32 `json:"height"`
	// Image format (e.g., "PNG", "JPEG", "TIFF").
	Format string `json:"format"`
	// EXIF metadata tags.
	Exif map[string]string `json:"exif,omitempty"`
}
|
|||
|
|
|
|||
|
|
// XMLMetadata is the XML metadata extracted during XML parsing.
//
// Provides statistics about XML document structure.
type XMLMetadata struct {
	// Total number of XML elements processed.
	ElementCount uint32 `json:"element_count"`
	// List of unique element tag names (sorted).
	UniqueElements []string `json:"unique_elements,omitempty"`
}
|
|||
|
|
|
|||
|
|
// TextMetadata is the text/Markdown metadata.
//
// Extracted from plain text and Markdown files. Includes word counts and,
// for Markdown, structural elements like headers and links.
type TextMetadata struct {
	// Number of lines in the document.
	LineCount uint32 `json:"line_count"`
	// Number of words.
	WordCount uint32 `json:"word_count"`
	// Number of characters.
	CharacterCount uint32 `json:"character_count"`
	// Markdown headers (headings text only, for Markdown files).
	Headers []string `json:"headers,omitempty"`
	// Markdown links as (text, url) tuples (for Markdown files); each tuple
	// is represented as an inner slice.
	Links [][]string `json:"links,omitempty"`
	// Code blocks as (language, code) tuples (for Markdown files); each tuple
	// is represented as an inner slice.
	CodeBlocks [][]string `json:"code_blocks,omitempty"`
}
|
|||
|
|
|
|||
|
|
// HeaderMetadata is the header/heading element metadata.
type HeaderMetadata struct {
	// Header level: 1 (h1) through 6 (h6).
	Level uint8 `json:"level"`
	// Normalized text content of the header.
	Text string `json:"text"`
	// HTML id attribute if present.
	ID *string `json:"id,omitempty"`
	// Document tree depth at the header element.
	Depth uint32 `json:"depth"`
	// Byte offset in original HTML document.
	HTMLOffset uint32 `json:"html_offset"`
}
|
|||
|
|
|
|||
|
|
// LinkMetadata link element metadata.
|
|||
|
|
type LinkMetadata struct {
|
|||
|
|
// The href URL value
|
|||
|
|
Href string `json:"href"`
|
|||
|
|
// Link text content (normalized)
|
|||
|
|
Text string `json:"text"`
|
|||
|
|
// Optional title attribute
|
|||
|
|
Title *string `json:"title,omitempty"`
|
|||
|
|
// Link type classification
|
|||
|
|
LinkType LinkType `json:"link_type"`
|
|||
|
|
// Rel attribute values
|
|||
|
|
Rel []string `json:"rel,omitempty"`
|
|||
|
|
// Additional attributes as key-value pairs
|
|||
|
|
Attributes [][]string `json:"attributes,omitempty"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// ImageMetadataType image element metadata.
|
|||
|
|
type ImageMetadataType struct {
|
|||
|
|
// Image source (URL, data URI, or SVG content)
|
|||
|
|
Src string `json:"src"`
|
|||
|
|
// Alternative text from alt attribute
|
|||
|
|
Alt *string `json:"alt,omitempty"`
|
|||
|
|
// Title attribute
|
|||
|
|
Title *string `json:"title,omitempty"`
|
|||
|
|
// Image dimensions as (width, height) if available
|
|||
|
|
Dimensions []uint32 `json:"dimensions,omitempty"`
|
|||
|
|
// Image type classification
|
|||
|
|
ImageType ImageType `json:"image_type"`
|
|||
|
|
// Additional attributes as key-value pairs
|
|||
|
|
Attributes [][]string `json:"attributes,omitempty"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// StructuredData structured data (Schema.org, microdata, RDFa) block.
|
|||
|
|
type StructuredData struct {
|
|||
|
|
// Type of structured data
|
|||
|
|
DataType StructuredDataType `json:"data_type"`
|
|||
|
|
// Raw JSON string representation
|
|||
|
|
RawJSON string `json:"raw_json"`
|
|||
|
|
// Schema type if detectable (e.g., "Article", "Event", "Product")
|
|||
|
|
SchemaType *string `json:"schema_type,omitempty"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// HTMLMetadata hTML metadata extracted from HTML documents.
|
|||
|
|
//
|
|||
|
|
// Includes document-level metadata, Open Graph data, Twitter Card metadata,
|
|||
|
|
// and extracted structural elements (headers, links, images, structured data).
|
|||
|
|
type HTMLMetadata struct {
|
|||
|
|
// Document title from `<title>` tag
|
|||
|
|
Title *string `json:"title,omitempty"`
|
|||
|
|
// Document description from `<meta name="description">` tag
|
|||
|
|
Description *string `json:"description,omitempty"`
|
|||
|
|
// Document keywords from `<meta name="keywords">` tag, split on commas
|
|||
|
|
Keywords []string `json:"keywords,omitempty"`
|
|||
|
|
// Document author from `<meta name="author">` tag
|
|||
|
|
Author *string `json:"author,omitempty"`
|
|||
|
|
// Canonical URL from `<link rel="canonical">` tag
|
|||
|
|
CanonicalURL *string `json:"canonical_url,omitempty"`
|
|||
|
|
// Base URL from `<base href="">` tag for resolving relative URLs
|
|||
|
|
BaseHref *string `json:"base_href,omitempty"`
|
|||
|
|
// Document language from `lang` attribute
|
|||
|
|
Language *string `json:"language,omitempty"`
|
|||
|
|
// Document text direction from `dir` attribute
|
|||
|
|
TextDirection *TextDirection `json:"text_direction,omitempty"`
|
|||
|
|
// Open Graph metadata (og:* properties) for social media
|
|||
|
|
// Keys like "title", "description", "image", "url", etc.
|
|||
|
|
OpenGraph map[string]string `json:"open_graph,omitempty"`
|
|||
|
|
// Twitter Card metadata (twitter:* properties)
|
|||
|
|
// Keys like "card", "site", "creator", "title", "description", "image", etc.
|
|||
|
|
TwitterCard map[string]string `json:"twitter_card,omitempty"`
|
|||
|
|
// Additional meta tags not covered by specific fields
|
|||
|
|
// Keys are meta name/property attributes, values are content
|
|||
|
|
MetaTags map[string]string `json:"meta_tags,omitempty"`
|
|||
|
|
// Extracted header elements with hierarchy
|
|||
|
|
Headers []HeaderMetadata `json:"headers,omitempty"`
|
|||
|
|
// Extracted hyperlinks with type classification
|
|||
|
|
Links []LinkMetadata `json:"links,omitempty"`
|
|||
|
|
// Extracted images with source and dimensions
|
|||
|
|
Images []ImageMetadataType `json:"images,omitempty"`
|
|||
|
|
// Extracted structured data blocks
|
|||
|
|
StructuredData []StructuredData `json:"structured_data,omitempty"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// OcrMetadata is the OCR processing metadata.
//
// Captures information about OCR processing configuration and results.
type OcrMetadata struct {
	// OCR language code(s) used.
	Language string `json:"language"`
	// Tesseract Page Segmentation Mode (PSM).
	Psm int32 `json:"psm"`
	// Output format (e.g., "text", "hocr").
	OutputFormat string `json:"output_format"`
	// Number of tables detected.
	TableCount uint32  `json:"table_count"`
	TableRows  *uint32 `json:"table_rows,omitempty"`
	TableCols  *uint32 `json:"table_cols,omitempty"`
}
|
|||
|
|
|
|||
|
|
// ErrorMetadata is the error metadata (for batch operations).
//
// Both fields are always serialized, as "error_type" and "message".
type ErrorMetadata struct {
	ErrorType string `json:"error_type"`
	Message   string `json:"message"`
}
|
|||
|
|
|
|||
|
|
// PptxMetadata is the PowerPoint presentation metadata.
//
// Extracted from PPTX files containing slide counts and presentation details.
type PptxMetadata struct {
	// Total number of slides in the presentation.
	SlideCount uint32 `json:"slide_count"`
	// Names of slides (if available).
	SlideNames []string `json:"slide_names,omitempty"`
	// Number of embedded images.
	ImageCount *uint32 `json:"image_count,omitempty"`
	// Number of tables.
	TableCount *uint32 `json:"table_count,omitempty"`
}
|
|||
|
|
|
|||
|
|
// DocxMetadata word document metadata.
|
|||
|
|
//
|
|||
|
|
// Extracted from DOCX files using shared Office Open XML metadata extraction.
|
|||
|
|
// Integrates with `office_metadata` module for core/app/custom properties.
|
|||
|
|
type DocxMetadata struct {
|
|||
|
|
// Core properties from docProps/core.xml (Dublin Core metadata)
|
|||
|
|
//
|
|||
|
|
// Contains title, creator, subject, keywords, dates, etc.
|
|||
|
|
// Shared format across DOCX/PPTX/XLSX documents.
|
|||
|
|
CoreProperties *CoreProperties `json:"core_properties,omitempty"`
|
|||
|
|
// Application properties from docProps/app.xml (Word-specific statistics)
|
|||
|
|
//
|
|||
|
|
// Contains word count, page count, paragraph count, editing time, etc.
|
|||
|
|
// DOCX-specific variant of Office application properties.
|
|||
|
|
AppProperties *DocxAppProperties `json:"app_properties,omitempty"`
|
|||
|
|
// Custom properties from docProps/custom.xml (user-defined properties)
|
|||
|
|
//
|
|||
|
|
// Contains key-value pairs defined by users or applications.
|
|||
|
|
// Values can be strings, numbers, booleans, or dates.
|
|||
|
|
CustomProperties map[string]json.RawMessage `json:"custom_properties,omitempty"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// CsvMetadata is the CSV/TSV file metadata.
type CsvMetadata struct {
	RowCount    uint32   `json:"row_count"`
	ColumnCount uint32   `json:"column_count"`
	Delimiter   *string  `json:"delimiter,omitempty"`
	HasHeader   bool     `json:"has_header"`
	ColumnTypes []string `json:"column_types,omitempty"`
}
|
|||
|
|
|
|||
|
|
// BibtexMetadata bibTeX bibliography metadata.
|
|||
|
|
type BibtexMetadata struct {
|
|||
|
|
// Number of entries in the bibliography.
|
|||
|
|
EntryCount uint `json:"entry_count"`
|
|||
|
|
CitationKeys []string `json:"citation_keys,omitempty"`
|
|||
|
|
Authors []string `json:"authors,omitempty"`
|
|||
|
|
YearRange *YearRange `json:"year_range,omitempty"`
|
|||
|
|
EntryTypes map[string]uint `json:"entry_types,omitempty"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// CitationMetadata citation file metadata (RIS, PubMed, EndNote).
|
|||
|
|
type CitationMetadata struct {
|
|||
|
|
CitationCount uint `json:"citation_count"`
|
|||
|
|
Format *string `json:"format,omitempty"`
|
|||
|
|
Authors []string `json:"authors,omitempty"`
|
|||
|
|
YearRange *YearRange `json:"year_range,omitempty"`
|
|||
|
|
Dois []string `json:"dois,omitempty"`
|
|||
|
|
Keywords []string `json:"keywords,omitempty"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// YearRange is a year range for bibliographic metadata.
type YearRange struct {
	Min   *uint32  `json:"min,omitempty"`
	Max   *uint32  `json:"max,omitempty"`
	Years []uint32 `json:"years,omitempty"`
}
|
|||
|
|
|
|||
|
|
// FictionBookMetadata is the FictionBook (FB2) metadata.
type FictionBookMetadata struct {
	Genres     []string `json:"genres,omitempty"`
	Sequences  []string `json:"sequences,omitempty"`
	Annotation *string  `json:"annotation,omitempty"`
}
|
|||
|
|
|
|||
|
|
// DbfMetadata dBASE (DBF) file metadata.
|
|||
|
|
type DbfMetadata struct {
|
|||
|
|
RecordCount uint `json:"record_count"`
|
|||
|
|
FieldCount uint `json:"field_count"`
|
|||
|
|
Fields []DbfFieldInfo `json:"fields,omitempty"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// DbfFieldInfo is the dBASE field information.
type DbfFieldInfo struct {
	Name      string `json:"name"`
	FieldType string `json:"field_type"`
}
|
|||
|
|
|
|||
|
|
// JatsMetadata jATS (Journal Article Tag Suite) metadata.
|
|||
|
|
type JatsMetadata struct {
|
|||
|
|
Copyright *string `json:"copyright,omitempty"`
|
|||
|
|
License *string `json:"license,omitempty"`
|
|||
|
|
HistoryDates map[string]string `json:"history_dates,omitempty"`
|
|||
|
|
ContributorRoles []ContributorRole `json:"contributor_roles,omitempty"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// ContributorRole is a JATS contributor with an optional role.
type ContributorRole struct {
	Name string  `json:"name"`
	Role *string `json:"role,omitempty"`
}
|
|||
|
|
|
|||
|
|
// EpubMetadata is the EPUB metadata (Dublin Core extensions).
//
// Every field is optional and omitted from JSON when nil.
type EpubMetadata struct {
	Coverage   *string `json:"coverage,omitempty"`
	DcFormat   *string `json:"dc_format,omitempty"`
	Relation   *string `json:"relation,omitempty"`
	Source     *string `json:"source,omitempty"`
	DcType     *string `json:"dc_type,omitempty"`
	CoverImage *string `json:"cover_image,omitempty"`
}
|
|||
|
|
|
|||
|
|
// PstMetadata is the Outlook PST archive metadata.
type PstMetadata struct {
	MessageCount uint `json:"message_count"`
}
|
|||
|
|
|
|||
|
|
// OcrConfidence holds the confidence scores for an OCR element.
//
// Separates detection confidence (how confident that text exists at this location)
// from recognition confidence (how confident about the actual text content).
type OcrConfidence struct {
	// Detection confidence: how confident the OCR engine is that text exists here.
	//
	// PaddleOCR provides this as `box_score`, Tesseract doesn't have a direct equivalent.
	// Range: 0.0 to 1.0 (nil if not available).
	Detection *float64 `json:"detection,omitempty"`
	// Recognition confidence: how confident about the text content.
	//
	// Range: 0.0 to 1.0.
	Recognition float64 `json:"recognition"`
}
|
|||
|
|
|
|||
|
|
// OcrRotation is the rotation information for an OCR element.
type OcrRotation struct {
	// Rotation angle in degrees (0, 90, 180, 270 for PaddleOCR).
	AngleDegrees float64 `json:"angle_degrees"`
	// Confidence score for the rotation detection.
	Confidence *float64 `json:"confidence,omitempty"`
}
|
|||
|
|
|
|||
|
|
// OcrElement unified OCR element representing detected text with full metadata.
|
|||
|
|
//
|
|||
|
|
// This is the primary type for structured OCR output, preserving all information
|
|||
|
|
// from both Tesseract and PaddleOCR backends.
|
|||
|
|
type OcrElement struct {
|
|||
|
|
// The recognized text content.
|
|||
|
|
Text string `json:"text"`
|
|||
|
|
// Bounding geometry (rectangle or quadrilateral).
|
|||
|
|
Geometry OcrBoundingGeometry `json:"geometry"`
|
|||
|
|
// Confidence scores for detection and recognition.
|
|||
|
|
Confidence OcrConfidence `json:"confidence"`
|
|||
|
|
// Hierarchical level (word, line, block, page).
|
|||
|
|
Level OcrElementLevel `json:"level,omitempty"`
|
|||
|
|
// Rotation information (if detected).
|
|||
|
|
Rotation *OcrRotation `json:"rotation,omitempty"`
|
|||
|
|
// Page number (1-indexed).
|
|||
|
|
PageNumber uint32 `json:"page_number"`
|
|||
|
|
// Parent element ID for hierarchical relationships.
|
|||
|
|
//
|
|||
|
|
// Only used for Tesseract output which has word -> line -> block hierarchy.
|
|||
|
|
ParentID *string `json:"parent_id,omitempty"`
|
|||
|
|
// Backend-specific metadata that doesn't fit the unified schema.
|
|||
|
|
BackendMetadata map[string]json.RawMessage `json:"backend_metadata,omitempty"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
func (s *OcrElement) UnmarshalJSON(data []byte) error {
|
|||
|
|
var raw struct {
|
|||
|
|
Text string `json:"text"`
|
|||
|
|
Geometry json.RawMessage `json:"geometry,omitempty"`
|
|||
|
|
Confidence OcrConfidence `json:"confidence"`
|
|||
|
|
Level OcrElementLevel `json:"level,omitempty"`
|
|||
|
|
Rotation *OcrRotation `json:"rotation,omitempty"`
|
|||
|
|
PageNumber uint32 `json:"page_number"`
|
|||
|
|
ParentID *string `json:"parent_id,omitempty"`
|
|||
|
|
BackendMetadata map[string]json.RawMessage `json:"backend_metadata,omitempty"`
|
|||
|
|
}
|
|||
|
|
if err := json.Unmarshal(data, &raw); err != nil {
|
|||
|
|
return err
|
|||
|
|
}
|
|||
|
|
s.Text = raw.Text
|
|||
|
|
s.Confidence = raw.Confidence
|
|||
|
|
s.Level = raw.Level
|
|||
|
|
s.Rotation = raw.Rotation
|
|||
|
|
s.PageNumber = raw.PageNumber
|
|||
|
|
s.ParentID = raw.ParentID
|
|||
|
|
s.BackendMetadata = raw.BackendMetadata
|
|||
|
|
if len(raw.Geometry) > 0 && string(raw.Geometry) != "null" {
|
|||
|
|
v, err := UnmarshalOcrBoundingGeometry(raw.Geometry)
|
|||
|
|
if err != nil {
|
|||
|
|
return err
|
|||
|
|
}
|
|||
|
|
s.Geometry = v
|
|||
|
|
}
|
|||
|
|
return nil
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// OcrElementConfig configuration for OCR element extraction.
|
|||
|
|
//
|
|||
|
|
// Controls how OCR elements are extracted and filtered.
|
|||
|
|
type OcrElementConfig struct {
|
|||
|
|
// Whether to include OCR elements in the extraction result.
|
|||
|
|
//
|
|||
|
|
// When true, the `ocr_elements` field in `ExtractionResult` will be populated.
|
|||
|
|
IncludeElements bool `json:"include_elements"`
|
|||
|
|
// Minimum hierarchical level to include.
|
|||
|
|
//
|
|||
|
|
// Elements below this level (e.g., words when min_level is Line) will be excluded.
|
|||
|
|
MinLevel OcrElementLevel `json:"min_level,omitempty"`
|
|||
|
|
// Minimum recognition confidence threshold (0.0-1.0).
|
|||
|
|
//
|
|||
|
|
// Elements with confidence below this threshold will be filtered out.
|
|||
|
|
MinConfidence float64 `json:"min_confidence"`
|
|||
|
|
// Whether to build hierarchical relationships between elements.
|
|||
|
|
//
|
|||
|
|
// When true, `parent_id` fields will be populated based on spatial containment.
|
|||
|
|
// Only meaningful for Tesseract output.
|
|||
|
|
BuildHierarchy bool `json:"build_hierarchy"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// PageStructure unified page structure for documents.
|
|||
|
|
//
|
|||
|
|
// Supports different page types (PDF pages, PPTX slides, Excel sheets)
|
|||
|
|
// with character offset boundaries for chunk-to-page mapping.
|
|||
|
|
type PageStructure struct {
|
|||
|
|
// Total number of pages/slides/sheets
|
|||
|
|
TotalCount uint32 `json:"total_count"`
|
|||
|
|
// Type of paginated unit
|
|||
|
|
UnitType PageUnitType `json:"unit_type"`
|
|||
|
|
// Character offset boundaries for each page
|
|||
|
|
//
|
|||
|
|
// Maps character ranges in the extracted content to page numbers.
|
|||
|
|
// Used for chunk page range calculation.
|
|||
|
|
Boundaries []PageBoundary `json:"boundaries,omitempty"`
|
|||
|
|
// Detailed per-page metadata (optional, only when needed)
|
|||
|
|
Pages []PageInfo `json:"pages,omitempty"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// PageBoundary is the byte offset boundary for a page.
//
// Tracks where a specific page's content starts and ends in the main content string,
// enabling mapping from byte positions to page numbers. According to the upstream
// (Rust) documentation, offsets fall on valid UTF-8 character boundaries when the
// content was built with standard string methods (push_str, push, etc.).
type PageBoundary struct {
	// Byte offset where this page starts in the content string (UTF-8 valid boundary, inclusive).
	ByteStart uint `json:"byte_start"`
	// Byte offset where this page ends in the content string (UTF-8 valid boundary, exclusive).
	ByteEnd uint `json:"byte_end"`
	// Page number (1-indexed).
	PageNumber uint32 `json:"page_number"`
}
|
|||
|
|
|
|||
|
|
// PageInfo is the metadata for an individual page/slide/sheet.
//
// Captures per-page information including dimensions, content counts,
// and visibility state (for presentations).
type PageInfo struct {
	// Page number (1-indexed).
	Number uint32 `json:"number"`
	// Page title (usually for presentations).
	Title *string `json:"title,omitempty"`
	// Dimensions in points (PDF) or pixels (images): (width, height).
	Dimensions []float64 `json:"dimensions,omitempty"`
	// Number of images on this page.
	ImageCount *uint32 `json:"image_count,omitempty"`
	// Number of tables on this page.
	TableCount *uint32 `json:"table_count,omitempty"`
	// Whether this page is hidden (e.g., in presentations).
	Hidden *bool `json:"hidden,omitempty"`
	// Whether this page is blank (no meaningful text, no images, no tables).
	//
	// A page is considered blank if it has fewer than 3 non-whitespace characters
	// and contains no tables or images. This is useful for filtering out empty pages
	// in scanned documents or PDFs with blank separator pages.
	IsBlank *bool `json:"is_blank,omitempty"`
	// Whether this page contains non-trivial vector graphics (paths, shapes, curves).
	//
	// Indicates the presence of vector-drawn content such as charts, diagrams,
	// or geometric shapes (e.g., from Adobe InDesign, LaTeX TikZ). These are
	// invisible to `ExtractionResult.images` since they are not embedded as raster
	// XObjects. Set to `true` when path count exceeds a heuristic threshold,
	// signaling that downstream consumers may want to rasterize the page to
	// capture this content.
	//
	// Upstream documents this as populated only for PDFs and `None` otherwise.
	// In this binding the field is a plain bool, so an absent value decodes
	// as false and cannot be told apart from an explicit false.
	HasVectorGraphics bool `json:"has_vector_graphics"`
}
|
|||
|
|
|
|||
|
|
// PageContent content for a single page/slide.
|
|||
|
|
//
|
|||
|
|
// When page extraction is enabled, documents are split into per-page content
|
|||
|
|
// with associated tables and images mapped to each page.
|
|||
|
|
//
|
|||
|
|
// # Performance
|
|||
|
|
//
|
|||
|
|
// Uses Arc-wrapped tables and images for memory efficiency:
|
|||
|
|
// - `Vec<Arc<Table>>` enables zero-copy sharing of table data
|
|||
|
|
// - `Vec<Arc<ExtractedImage>>` enables zero-copy sharing of image data
|
|||
|
|
// - Maintains exact JSON compatibility via custom Serialize/Deserialize
|
|||
|
|
//
|
|||
|
|
// This reduces memory overhead for documents with shared tables/images
|
|||
|
|
// by avoiding redundant copies during serialization.
|
|||
|
|
type PageContent struct {
|
|||
|
|
// Page number (1-indexed)
|
|||
|
|
PageNumber uint32 `json:"page_number"`
|
|||
|
|
// Text content for this page
|
|||
|
|
Content string `json:"content"`
|
|||
|
|
// Tables found on this page (uses Arc for memory efficiency)
|
|||
|
|
//
|
|||
|
|
// Serializes as Vec<Table> for JSON compatibility while maintaining
|
|||
|
|
// Arc semantics in-memory for zero-copy sharing.
|
|||
|
|
Tables []Table `json:"tables,omitempty"`
|
|||
|
|
// Indices into `ExtractionResult.images` for images found on this page.
|
|||
|
|
//
|
|||
|
|
// Each value is a zero-based index into the top-level `images` collection.
|
|||
|
|
// Only populated when `extract_images = true` in the extraction config.
|
|||
|
|
ImageIndices []uint32 `json:"image_indices,omitempty"`
|
|||
|
|
// Hierarchy information for the page (when hierarchy extraction is enabled)
|
|||
|
|
//
|
|||
|
|
// Contains text hierarchy levels (H1-H6) extracted from the page content.
|
|||
|
|
Hierarchy *PageHierarchy `json:"hierarchy,omitempty"`
|
|||
|
|
// Whether this page is blank (no meaningful text content)
|
|||
|
|
//
|
|||
|
|
// Determined during extraction based on text content analysis.
|
|||
|
|
// A page is blank if it has fewer than 3 non-whitespace characters
|
|||
|
|
// and contains no tables or images.
|
|||
|
|
IsBlank *bool `json:"is_blank,omitempty"`
|
|||
|
|
// Layout detection regions for this page (when layout detection is enabled).
|
|||
|
|
//
|
|||
|
|
// Contains detected layout regions with class, confidence, bounding box,
|
|||
|
|
// and area fraction. Only populated when layout detection is configured.
|
|||
|
|
LayoutRegions []LayoutRegion `json:"layout_regions,omitempty"`
|
|||
|
|
// Speaker notes for this slide (PPTX only).
|
|||
|
|
//
|
|||
|
|
// Contains the text from the slide's notes pane (`ppt/notesSlides/notesSlide{N}.xml`).
|
|||
|
|
// Only populated when the source is a PPTX file and notes are present.
|
|||
|
|
SpeakerNotes *string `json:"speaker_notes,omitempty"`
|
|||
|
|
// Section name this slide belongs to (PPTX only).
|
|||
|
|
//
|
|||
|
|
// PowerPoint sections group slides into logical chapters (`<p:sectionLst>` in
|
|||
|
|
// `ppt/presentation.xml`). Only populated when the source is a PPTX file and
|
|||
|
|
// the slide belongs to a named section.
|
|||
|
|
SectionName *string `json:"section_name,omitempty"`
|
|||
|
|
// Sheet name for this page (XLSX/ODS only).
|
|||
|
|
//
|
|||
|
|
// Each spreadsheet sheet maps to one `PageContent` entry. This field carries the
|
|||
|
|
// sheet's display name as it appears in the workbook. `None` for all non-spreadsheet
|
|||
|
|
// formats and for sheets with an empty name.
|
|||
|
|
SheetName *string `json:"sheet_name,omitempty"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// LayoutRegion detected layout region on a page.
|
|||
|
|
//
|
|||
|
|
// When layout detection is enabled, each page may have layout regions
|
|||
|
|
// identifying different content types (text, pictures, tables, etc.)
|
|||
|
|
// with confidence scores and spatial positions.
|
|||
|
|
type LayoutRegion struct {
|
|||
|
|
// Layout class name (e.g. "picture", "table", "text", "section_header").
|
|||
|
|
ClassName string `json:"class_name"`
|
|||
|
|
// Confidence score from the layout detection model (0.0 to 1.0).
|
|||
|
|
Confidence float64 `json:"confidence"`
|
|||
|
|
// Bounding box in document coordinate space.
|
|||
|
|
BoundingBox BoundingBox `json:"bounding_box"`
|
|||
|
|
// Fraction of the page area covered by this region (0.0 to 1.0).
|
|||
|
|
AreaFraction float64 `json:"area_fraction"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// PageHierarchy page hierarchy structure containing heading levels and block information.
|
|||
|
|
//
|
|||
|
|
// Used when PDF text hierarchy extraction is enabled. Contains hierarchical
|
|||
|
|
// blocks with heading levels (H1-H6) for semantic document structure.
|
|||
|
|
type PageHierarchy struct {
|
|||
|
|
// Number of hierarchy blocks on this page
|
|||
|
|
BlockCount uint32 `json:"block_count"`
|
|||
|
|
// Hierarchical blocks with heading levels
|
|||
|
|
Blocks []HierarchicalBlock `json:"blocks,omitempty"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// HierarchicalBlock is a text block with a hierarchy level assignment.
//
// Represents a block of text with semantic heading information extracted from
// font size clustering and hierarchical analysis.
type HierarchicalBlock struct {
	// The text content of this block
	Text string `json:"text"`
	// The font size of the text in this block
	FontSize float32 `json:"font_size"`
	// The hierarchy level of this block (H1-H6 or Body)
	//
	// Levels correspond to HTML heading tags:
	//   - "h1": Top-level heading
	//   - "h2": Secondary heading
	//   - "h3": Tertiary heading
	//   - "h4": Quaternary heading
	//   - "h5": Quinary heading
	//   - "h6": Senary heading
	//   - "body": Body text (no heading level)
	Level string `json:"level"`
	// Bounding box information for the block
	//
	// Contains coordinates as (left, top, right, bottom) in PDF units.
	// Omitted from JSON when empty.
	Bbox []float32 `json:"bbox,omitempty"`
}
|
|||
|
|
|
|||
|
|
// CellChange describes a single changed cell within a table.
//
// The type is shared by revision deltas (RevisionDelta.TableChanges) and by
// table diffs (TableDiff.CellChanges).
type CellChange struct {
	// Zero-based row index.
	Row uint `json:"row"`
	// Zero-based column index.
	Col uint `json:"col"`
	// Value before the change.
	From string `json:"from"`
	// Value after the change.
	To string `json:"to"`
}
|
|||
|
|
|
|||
|
|
// DocumentRevision single tracked change embedded in a document.
|
|||
|
|
//
|
|||
|
|
// Populated by per-format extractors that understand change-tracking metadata
|
|||
|
|
// (DOCX `w:ins`/`w:del`/`w:rPrChange`, ODT `text:change-*`, …). Every
|
|||
|
|
// extractor defaults to `ExtractionResult.revisions = None` until a
|
|||
|
|
// format-specific implementation is added.
|
|||
|
|
type DocumentRevision struct {
|
|||
|
|
// Format-specific revision identifier.
|
|||
|
|
//
|
|||
|
|
// For DOCX this is the `w:id` attribute value on the change element
|
|||
|
|
// (e.g. `"42"`). When the attribute is absent a synthetic fallback is
|
|||
|
|
// generated (`"docx-ins-0"`, `"docx-del-3"`, …).
|
|||
|
|
RevisionID string `json:"revision_id"`
|
|||
|
|
// Display name of the author who made this change, when available.
|
|||
|
|
Author *string `json:"author,omitempty"`
|
|||
|
|
// ISO-8601 timestamp of the change, when available.
|
|||
|
|
//
|
|||
|
|
// Stored as a plain string so this type remains FFI-friendly and
|
|||
|
|
// unconditionally available without the `chrono` optional dep.
|
|||
|
|
// DOCX populates this from the `w:date` attribute (e.g.
|
|||
|
|
// `"2024-03-15T10:30:00Z"`).
|
|||
|
|
Timestamp *string `json:"timestamp,omitempty"`
|
|||
|
|
// Semantic kind of this revision.
|
|||
|
|
Kind RevisionKind `json:"kind"`
|
|||
|
|
// Best-effort document location for this revision.
|
|||
|
|
//
|
|||
|
|
// Resolution is format-dependent and may be `None` when the location
|
|||
|
|
// cannot be determined (e.g. changes inside table cells before
|
|||
|
|
// table-cell anchor support is added).
|
|||
|
|
Anchor RevisionAnchor `json:"anchor,omitempty"`
|
|||
|
|
// The content changes that make up this revision.
|
|||
|
|
Delta RevisionDelta `json:"delta"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
func (s *DocumentRevision) UnmarshalJSON(data []byte) error {
|
|||
|
|
var raw struct {
|
|||
|
|
RevisionID string `json:"revision_id"`
|
|||
|
|
Author *string `json:"author,omitempty"`
|
|||
|
|
Timestamp *string `json:"timestamp,omitempty"`
|
|||
|
|
Kind RevisionKind `json:"kind"`
|
|||
|
|
Anchor json.RawMessage `json:"anchor,omitempty"`
|
|||
|
|
Delta RevisionDelta `json:"delta"`
|
|||
|
|
}
|
|||
|
|
if err := json.Unmarshal(data, &raw); err != nil {
|
|||
|
|
return err
|
|||
|
|
}
|
|||
|
|
s.RevisionID = raw.RevisionID
|
|||
|
|
s.Author = raw.Author
|
|||
|
|
s.Timestamp = raw.Timestamp
|
|||
|
|
s.Kind = raw.Kind
|
|||
|
|
s.Delta = raw.Delta
|
|||
|
|
if len(raw.Anchor) > 0 && string(raw.Anchor) != "null" {
|
|||
|
|
v, err := UnmarshalRevisionAnchor(raw.Anchor)
|
|||
|
|
if err != nil {
|
|||
|
|
return err
|
|||
|
|
}
|
|||
|
|
s.Anchor = v
|
|||
|
|
}
|
|||
|
|
return nil
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// RevisionDelta content changes that make up a single revision.
|
|||
|
|
//
|
|||
|
|
// For insertions and deletions the `content` field carries the added/removed
|
|||
|
|
// lines as `DiffLine::Added` / `DiffLine::Removed` entries. For format
|
|||
|
|
// changes, `content` is empty — the property diff is left as a TODO for a
|
|||
|
|
// later enrichment pass.
|
|||
|
|
type RevisionDelta struct {
|
|||
|
|
// Line-level content changes for this revision.
|
|||
|
|
Content []DiffLine `json:"content,omitempty"`
|
|||
|
|
// Cell-level table changes for this revision.
|
|||
|
|
TableChanges []CellChange `json:"table_changes,omitempty"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Table extracted table structure.
|
|||
|
|
//
|
|||
|
|
// Represents a table detected and extracted from a document (PDF, image, etc.).
|
|||
|
|
// Tables are converted to both structured cell data and Markdown format.
|
|||
|
|
type Table struct {
|
|||
|
|
// Table cells as a 2D vector (rows × columns)
|
|||
|
|
Cells [][]string `json:"cells,omitempty"`
|
|||
|
|
// Markdown representation of the table
|
|||
|
|
Markdown string `json:"markdown"`
|
|||
|
|
// Page number where the table was found (1-indexed)
|
|||
|
|
PageNumber uint32 `json:"page_number"`
|
|||
|
|
// Bounding box of the table on the page (PDF coordinates: x0=left, y0=bottom, x1=right, y1=top).
|
|||
|
|
// Only populated for PDF-extracted tables when position data is available.
|
|||
|
|
BoundingBox *BoundingBox `json:"bounding_box,omitempty"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// TableCell is an individual table cell with content and optional styling.
//
// Upstream describes it as a future extension point for rich table support
// with cell-level metadata.
type TableCell struct {
	// Cell content as text
	Content string `json:"content"`
	// Row span (number of rows this cell spans)
	RowSpan uint32 `json:"row_span"`
	// Column span (number of columns this cell spans)
	ColSpan uint32 `json:"col_span"`
	// Whether this is a header cell
	IsHeader bool `json:"is_header"`
}
|
|||
|
|
|
|||
|
|
// ExtractedURI uRI extracted from a document.
|
|||
|
|
//
|
|||
|
|
// Represents any link, reference, or resource pointer found during extraction.
|
|||
|
|
// The `kind` field classifies the URI semantically, while `label` carries
|
|||
|
|
// optional human-readable display text.
|
|||
|
|
type ExtractedURI struct {
|
|||
|
|
// The URL or path string.
|
|||
|
|
URL string `json:"url"`
|
|||
|
|
// Optional display text / label for the link.
|
|||
|
|
Label *string `json:"label,omitempty"`
|
|||
|
|
// Optional page number where the URI was found (1-indexed).
|
|||
|
|
Page *uint32 `json:"page,omitempty"`
|
|||
|
|
// Semantic classification of the URI.
|
|||
|
|
Kind URIKind `json:"kind"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// DetectResponse is the response of a MIME type detection.
type DetectResponse struct {
	// Detected MIME type
	MimeType string `json:"mime_type"`
	// Original filename (if provided)
	Filename *string `json:"filename,omitempty"`
}
|
|||
|
|
|
|||
|
|
// DiffOptions controls how two ExtractionResult values are compared.
//
// Every field is optional: a nil pointer is omitted from the JSON payload.
// The defaults quoted below are the documented upstream defaults.
type DiffOptions struct {
	// Include metadata changes in the diff. Default: `true`.
	IncludeMetadata *bool `json:"include_metadata,omitempty"`
	// Include embedded-children changes in the diff. Default: `true`.
	IncludeEmbedded *bool `json:"include_embedded,omitempty"`
	// Truncate content to this many characters before diffing.
	//
	// Useful for very large documents where only the first N characters matter.
	// nil means no truncation.
	MaxContentChars *uint `json:"max_content_chars,omitempty"`
}
|
|||
|
|
|
|||
|
|
// ExtractionDiff complete diff between two `ExtractionResult` values.
|
|||
|
|
type ExtractionDiff struct {
|
|||
|
|
// Unified-diff hunks for the `content` field.
|
|||
|
|
//
|
|||
|
|
// Empty when the content is identical.
|
|||
|
|
ContentDiff []DiffHunk `json:"content_diff,omitempty"`
|
|||
|
|
// Tables present in `b` but not in `a` (by index position, excess right-side tables).
|
|||
|
|
TablesAdded []Table `json:"tables_added,omitempty"`
|
|||
|
|
// Tables present in `a` but not in `b` (by index position, excess left-side tables).
|
|||
|
|
TablesRemoved []Table `json:"tables_removed,omitempty"`
|
|||
|
|
// Cell-level changes for table pairs that share the same index and dimensions.
|
|||
|
|
TablesChanged []TableDiff `json:"tables_changed,omitempty"`
|
|||
|
|
// Metadata difference, encoded as a JSON object with three top-level keys:
|
|||
|
|
// `added` (keys present in `b` but not `a`), `removed` (keys present in `a`
|
|||
|
|
// but not `b`), and `changed` (keys whose values differ — each entry is
|
|||
|
|
// `{ "from": <value-in-a>, "to": <value-in-b> }`).
|
|||
|
|
//
|
|||
|
|
// This is NOT RFC 6902 JSON Patch — we deliberately chose a flatter shape
|
|||
|
|
// to avoid pulling in a json-patch crate. If you need RFC 6902 semantics
|
|||
|
|
// (with JSON Pointer paths) feed `a.metadata` and `b.metadata` to your
|
|||
|
|
// preferred json-patch impl directly.
|
|||
|
|
MetadataChanged json.RawMessage `json:"metadata_changed"`
|
|||
|
|
// Changes to embedded archive children.
|
|||
|
|
EmbeddedChanges EmbeddedChanges `json:"embedded_changes"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// DiffHunk single contiguous hunk in a unified diff.
|
|||
|
|
type DiffHunk struct {
|
|||
|
|
// Starting line number in the old content (0-indexed).
|
|||
|
|
FromLine uint `json:"from_line"`
|
|||
|
|
// Number of lines from the old content in this hunk.
|
|||
|
|
FromCount uint `json:"from_count"`
|
|||
|
|
// Starting line number in the new content (0-indexed).
|
|||
|
|
ToLine uint `json:"to_line"`
|
|||
|
|
// Number of lines from the new content in this hunk.
|
|||
|
|
ToCount uint `json:"to_count"`
|
|||
|
|
// Lines that make up this hunk.
|
|||
|
|
Lines []DiffLine `json:"lines,omitempty"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// TableDiff cell-level changes for a pair of tables that share the same index.
|
|||
|
|
type TableDiff struct {
|
|||
|
|
// Zero-based index of the table in both `a.tables` and `b.tables`.
|
|||
|
|
FromIndex uint `json:"from_index"`
|
|||
|
|
// Zero-based index in `b.tables` (equal to `from_index` for same-dimension tables).
|
|||
|
|
ToIndex uint `json:"to_index"`
|
|||
|
|
// Cell-level changes within the table.
|
|||
|
|
CellChanges []CellChange `json:"cell_changes,omitempty"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// EmbeddedChanges changes to embedded archive children between two results.
|
|||
|
|
type EmbeddedChanges struct {
|
|||
|
|
// Children present in `b` but not in `a` (matched by `path`).
|
|||
|
|
Added []ArchiveEntry `json:"added,omitempty"`
|
|||
|
|
// Children present in `a` but not in `b` (matched by `path`).
|
|||
|
|
Removed []ArchiveEntry `json:"removed,omitempty"`
|
|||
|
|
// Children present in both but with differing content (matched by `path`).
|
|||
|
|
//
|
|||
|
|
// Each entry holds the diff of the nested `ExtractionResult`.
|
|||
|
|
Changed []EmbeddedDiff `json:"changed,omitempty"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// EmbeddedDiff diff for a single embedded archive entry that appears in both results.
|
|||
|
|
type EmbeddedDiff struct {
|
|||
|
|
// Archive-relative path identifying this entry.
|
|||
|
|
Path string `json:"path"`
|
|||
|
|
// The recursive diff of the entry's extraction result.
|
|||
|
|
Diff ExtractionDiff `json:"diff"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// EmbeddingPreset is a preset configuration for common RAG use cases.
//
// Each preset combines chunk size, overlap, and embedding model
// to provide an optimized configuration for specific scenarios.
type EmbeddingPreset struct {
	// Preset name.
	Name string `json:"name"`
	// Chunk size for this preset (units are not stated here — TODO confirm).
	ChunkSize uint `json:"chunk_size"`
	// Overlap between chunks (same units as ChunkSize, presumably — TODO confirm).
	Overlap uint `json:"overlap"`
	// HuggingFace repository name for the model.
	ModelRepo string `json:"model_repo"`
	// Pooling strategy: "cls" or "mean".
	Pooling string `json:"pooling"`
	// Path to the ONNX model file within the repo.
	ModelFile string `json:"model_file"`
	// Dimensions of the preset (presumably the embedding vector size — TODO confirm).
	Dimensions uint `json:"dimensions"`
	// Human-readable description of the preset.
	Description string `json:"description"`
}
|
|||
|
|
|
|||
|
|
// YakeParams holds YAKE-specific parameters.
type YakeParams struct {
	// Window size for co-occurrence analysis (default: 2).
	//
	// Controls the context window for computing co-occurrence statistics.
	// A nil pointer omits the field from the JSON payload.
	WindowSize *uint `json:"window_size,omitempty"`
}
|
|||
|
|
|
|||
|
|
// RakeParams holds RAKE-specific parameters.
//
// A nil pointer omits the corresponding field from the JSON payload.
type RakeParams struct {
	// Minimum word length to consider (default: 1).
	MinWordLength *uint `json:"min_word_length,omitempty"`
	// Maximum words in a keyword phrase (default: 3).
	MaxWordsPerPhrase *uint `json:"max_words_per_phrase,omitempty"`
}
|
|||
|
|
|
|||
|
|
// KeywordConfig keyword extraction configuration.
|
|||
|
|
type KeywordConfig struct {
|
|||
|
|
// Algorithm to use for extraction.
|
|||
|
|
Algorithm KeywordAlgorithm `json:"algorithm,omitempty"`
|
|||
|
|
// Maximum number of keywords to extract (default: 10).
|
|||
|
|
MaxKeywords *uint `json:"max_keywords,omitempty"`
|
|||
|
|
// Minimum score threshold (0.0-1.0, default: 0.0).
|
|||
|
|
//
|
|||
|
|
// Keywords with scores below this threshold are filtered out.
|
|||
|
|
// Note: Score ranges differ between algorithms.
|
|||
|
|
MinScore float32 `json:"min_score"`
|
|||
|
|
// N-gram range for keyword extraction (min, max).
|
|||
|
|
//
|
|||
|
|
// (1, 1) = unigrams only
|
|||
|
|
// (1, 2) = unigrams and bigrams
|
|||
|
|
// (1, 3) = unigrams, bigrams, and trigrams (default)
|
|||
|
|
NgramRange []uint `json:"ngram_range,omitempty"`
|
|||
|
|
// Language code for stopword filtering (e.g., "en", "de", "fr").
|
|||
|
|
//
|
|||
|
|
// If None, no stopword filtering is applied.
|
|||
|
|
Language *string `json:"language,omitempty"`
|
|||
|
|
// YAKE-specific tuning parameters.
|
|||
|
|
YakeParams *YakeParams `json:"yake_params,omitempty"`
|
|||
|
|
// RAKE-specific tuning parameters.
|
|||
|
|
RakeParams *RakeParams `json:"rake_params,omitempty"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Keyword extracted keyword with metadata.
|
|||
|
|
type Keyword struct {
|
|||
|
|
// The keyword text.
|
|||
|
|
Text string `json:"text"`
|
|||
|
|
// Relevance score (higher is better, algorithm-specific range).
|
|||
|
|
Score float32 `json:"score"`
|
|||
|
|
// Algorithm that extracted this keyword.
|
|||
|
|
Algorithm KeywordAlgorithm `json:"algorithm"`
|
|||
|
|
// Optional positions where keyword appears in text (character offsets).
|
|||
|
|
Positions []uint `json:"positions,omitempty"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// PaddleOcrConfig is the configuration for the PaddleOCR backend.
//
// Configures PaddleOCR text detection and recognition with multi-language
// support. The upstream Rust API offers a builder; in Go, populate the struct
// fields directly.
//
// NOTE(review): the "default" values quoted on the fields below are the
// upstream defaults. Apart from CacheDir, no field uses omitempty, so a
// partially filled Go value is serialized with zeros (0, false, "") for
// everything left unset. Set every field explicitly, and confirm how the
// native side treats zero values.
type PaddleOcrConfig struct {
	// Language code (e.g., "en", "ch", "jpn", "kor", "deu", "fra")
	Language string `json:"language"`
	// Optional custom cache directory for model files
	CacheDir *string `json:"cache_dir,omitempty"`
	// Enable angle classification for rotated text (default: false).
	// Can misfire on short text regions, rotating crops incorrectly before recognition.
	UseAngleCls bool `json:"use_angle_cls"`
	// Enable table structure detection (default: false)
	EnableTableDetection bool `json:"enable_table_detection"`
	// Database threshold for text detection (default: 0.3)
	// Range: 0.0-1.0, higher values require more confident detections
	DetDbThresh float32 `json:"det_db_thresh"`
	// Box threshold for text bounding box refinement (default: 0.5)
	// Range: 0.0-1.0
	DetDbBoxThresh float32 `json:"det_db_box_thresh"`
	// Unclip ratio for expanding text bounding boxes (default: 1.6)
	// Controls the expansion of detected text regions
	DetDbUnclipRatio float32 `json:"det_db_unclip_ratio"`
	// Maximum side length for detection image (default: 960)
	// Larger images may be resized to this limit for faster inference
	DetLimitSideLen uint32 `json:"det_limit_side_len"`
	// Batch size for recognition inference (default: 6)
	// Number of text regions to process simultaneously
	RecBatchNum uint32 `json:"rec_batch_num"`
	// Padding in pixels added around the image before detection (default: 10).
	// Large values can include surrounding content like table gridlines.
	Padding uint32 `json:"padding"`
	// Minimum recognition confidence score for text lines (default: 0.5).
	// Text regions with recognition confidence below this threshold are discarded.
	// Matches PaddleOCR Python's `drop_score` parameter.
	// Range: 0.0-1.0
	DropScore float32 `json:"drop_score"`
	// Model tier controlling detection/recognition model size and accuracy trade-off.
	//   - `"mobile"` (default): Lightweight models (~4.5MB detection, ~16.5MB recognition), fast download and inference
	//   - `"server"`: Large, high-accuracy models (~88MB detection, ~84MB recognition), best for GPU or complex documents
	ModelTier string `json:"model_tier"`
}
|
|||
|
|
|
|||
|
|
// ModelPaths groups the paths to all models needed for OCR (kept for backward
// compatibility).
type ModelPaths struct {
	// Path to the detection model directory.
	DetModel string `json:"det_model"`
	// Path to the classification model directory.
	ClsModel string `json:"cls_model"`
	// Path to the recognition model directory.
	RecModel string `json:"rec_model"`
	// Path to the character dictionary file.
	DictFile string `json:"dict_file"`
}
|
|||
|
|
|
|||
|
|
// OrientationResult is the result of document orientation detection.
type OrientationResult struct {
	// Detected orientation in degrees (0, 90, 180, or 270).
	Degrees uint32 `json:"degrees"`
	// Confidence score (0.0-1.0).
	Confidence float32 `json:"confidence"`
}
|
|||
|
|
|
|||
|
|
// BBox is a bounding box in original image coordinates: (X1, Y1) is the
// top-left corner and (X2, Y2) is the bottom-right corner.
type BBox struct {
	X1 float32 `json:"x1"`
	Y1 float32 `json:"y1"`
	X2 float32 `json:"x2"`
	Y2 float32 `json:"y2"`
}
|
|||
|
|
|
|||
|
|
// LayoutDetection single layout detection result.
|
|||
|
|
type LayoutDetection struct {
|
|||
|
|
ClassName LayoutClass `json:"class_name"`
|
|||
|
|
Confidence float32 `json:"confidence"`
|
|||
|
|
Bbox BBox `json:"bbox"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// RecognizedTable pre-computed table markdown for a table detection region.
|
|||
|
|
//
|
|||
|
|
// Produced by the TATR-based table structure recognizer and surfaced as part of
|
|||
|
|
// layout-aware OCR results. The struct lives here (under `layout-types`, pure-Rust)
|
|||
|
|
// so that consumers who do not enable `layout-detection` (ORT) can still reference
|
|||
|
|
// the type in their own code.
|
|||
|
|
type RecognizedTable struct {
|
|||
|
|
// Detection bbox that this table corresponds to (for matching).
|
|||
|
|
DetectionBbox BBox `json:"detection_bbox"`
|
|||
|
|
// Table cells as a 2D vector (rows × columns).
|
|||
|
|
Cells [][]string `json:"cells,omitempty"`
|
|||
|
|
// Rendered markdown table.
|
|||
|
|
Markdown string `json:"markdown"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// DetectionResult page-level detection result containing all detections and page metadata.
|
|||
|
|
type DetectionResult struct {
|
|||
|
|
PageWidth uint32 `json:"page_width"`
|
|||
|
|
PageHeight uint32 `json:"page_height"`
|
|||
|
|
Detections []LayoutDetection `json:"detections,omitempty"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// EmbeddedFile is an embedded file descriptor extracted from the PDF name tree.
type EmbeddedFile struct {
	// The filename as stored in the PDF name tree.
	Name string `json:"name"`
	// Raw file bytes from the embedded stream (already decompressed upstream).
	Data []byte `json:"data"`
	// Compressed byte count of the original stream (before decompression).
	//
	// Used by callers to compute the decompression ratio and detect zip-bomb-style
	// attacks that embed a tiny compressed stream expanding to gigabytes of data.
	CompressedSize uint `json:"compressed_size"`
	// MIME type if specified in the filespec, otherwise nil.
	MimeType *string `json:"mime_type,omitempty"`
}

// MarshalJSON encodes Data as a JSON array of integers (the format Rust's
// serde `Vec<u8>` deserializer expects) instead of Go's default base64 string.
// All other fields are encoded exactly as their struct tags describe. A nil or
// empty Data encodes as `[]`, never `null`.
func (v EmbeddedFile) MarshalJSON() ([]byte, error) {
	// Widen each byte to int: encoding/json treats []byte specially (base64),
	// whereas []int is emitted as a plain numeric array.
	data := make([]int, len(v.Data))
	for i, b := range v.Data {
		data[i] = int(b)
	}
	// An explicit shadow struct lists every field. Embedding the original type
	// would yield two entries for the "data" key (base64 string and int array).
	return json.Marshal(struct {
		Name           string  `json:"name"`
		Data           []int   `json:"data"`
		CompressedSize uint    `json:"compressed_size"`
		MimeType       *string `json:"mime_type,omitempty"`
	}{
		Name:           v.Name,
		Data:           data,
		CompressedSize: v.CompressedSize,
		MimeType:       v.MimeType,
	})
}
|
|||
|
|
|
|||
|
|
// PdfMetadata holds PDF-specific metadata.
//
// Contains metadata fields specific to PDF documents that are not in the common
// Metadata structure. Common fields like title, authors, keywords, and dates
// are at the Metadata level. Every field is optional and omitted from JSON
// when nil.
type PdfMetadata struct {
	// PDF version (e.g., "1.7", "2.0")
	PdfVersion *string `json:"pdf_version,omitempty"`
	// PDF producer (application that created the PDF)
	Producer *string `json:"producer,omitempty"`
	// Whether the PDF is encrypted/password-protected
	IsEncrypted *bool `json:"is_encrypted,omitempty"`
	// First page width in points (1/72 inch)
	Width *int64 `json:"width,omitempty"`
	// First page height in points (1/72 inch)
	Height *int64 `json:"height,omitempty"`
	// Total number of pages in the PDF document
	PageCount *uint32 `json:"page_count,omitempty"`
}
|
|||
|
|
|
|||
|
|
// ExtractBytes extract content from a byte array.
|
|||
|
|
//
|
|||
|
|
// This is the main entry point for in-memory extraction. It performs the following steps:
|
|||
|
|
// 1. Validate MIME type
|
|||
|
|
// 2. Handle legacy format conversion if needed
|
|||
|
|
// 3. Select appropriate extractor from registry
|
|||
|
|
// 4. Extract content
|
|||
|
|
// 5. Run post-processing pipeline
|
|||
|
|
//
|
|||
|
|
// Arguments:
|
|||
|
|
// - content: The byte array to extract
|
|||
|
|
// - mime_type: MIME type of the content
|
|||
|
|
// - config: Extraction configuration
|
|||
|
|
//
|
|||
|
|
// Returns an `ExtractionResult` containing the extracted content and metadata.
|
|||
|
|
//
|
|||
|
|
// Errors are returned when returns `KreuzbergError::Validation` if MIME type is invalid.
|
|||
|
|
// Returns `KreuzbergError::UnsupportedFormat` if MIME type is not supported.
|
|||
|
|
//
|
|||
|
|
// Example:
|
|||
|
|
//
|
|||
|
|
// let config = ExtractionConfig::default();
|
|||
|
|
// let bytes = b"Hello, world!";
|
|||
|
|
// let result = extract_bytes(bytes, "text/plain", &config).await?;
|
|||
|
|
// println!("Content: {}", result.content);
|
|||
|
|
func ExtractBytes(content []byte, mimeType string, config ExtractionConfig) (*ExtractionResult, error) {
|
|||
|
|
var cContent *C.uint8_t
|
|||
|
|
if len(content) > 0 {
|
|||
|
|
var cContentPinner runtime.Pinner
|
|||
|
|
cContentPinner.Pin(&content[0])
|
|||
|
|
defer cContentPinner.Unpin()
|
|||
|
|
cContent = (*C.uint8_t)(unsafe.Pointer(&content[0]))
|
|||
|
|
}
|
|||
|
|
cContentLen := C.uintptr_t(len(content))
|
|||
|
|
|
|||
|
|
cMimeType := C.CString(mimeType)
|
|||
|
|
defer C.free(unsafe.Pointer(cMimeType))
|
|||
|
|
|
|||
|
|
jsonBytescConfig, err := json.Marshal(config)
|
|||
|
|
if err != nil {
|
|||
|
|
return nil, fmt.Errorf("failed to marshal: %w", err)
|
|||
|
|
}
|
|||
|
|
// When the parameter is a nil pointer (Option<&T> on the Rust side), json.Marshal
|
|||
|
|
// emits "null" which the FFI's _from_json rejects. Substitute "{}" so a default
|
|||
|
|
// instance is constructed instead — semantically equivalent to None for query types
|
|||
|
|
// whose fields are all optional with serde(default).
|
|||
|
|
if string(jsonBytescConfig) == "null" {
|
|||
|
|
jsonBytescConfig = []byte("{}")
|
|||
|
|
}
|
|||
|
|
tmpStrcConfig := C.CString(string(jsonBytescConfig))
|
|||
|
|
cConfig := C.kreuzberg_extraction_config_from_json(tmpStrcConfig)
|
|||
|
|
C.free(unsafe.Pointer(tmpStrcConfig))
|
|||
|
|
if cConfig == nil {
|
|||
|
|
return nil, fmt.Errorf("failed to create extraction_config: %s", C.GoString(C.kreuzberg_last_error_context()))
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_extraction_config_free(cConfig)
|
|||
|
|
|
|||
|
|
ptr := C.kreuzberg_extract_bytes(cContent, cContentLen, cMimeType, cConfig)
|
|||
|
|
if err := lastError(); err != nil {
|
|||
|
|
if ptr != nil {
|
|||
|
|
C.kreuzberg_extraction_result_free(ptr)
|
|||
|
|
}
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_extraction_result_free(ptr)
|
|||
|
|
jsonPtr := C.kreuzberg_extraction_result_to_json(ptr)
|
|||
|
|
if jsonPtr == nil {
|
|||
|
|
return nil, fmt.Errorf("failed to convert to JSON")
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_free_string(jsonPtr)
|
|||
|
|
var result ExtractionResult
|
|||
|
|
if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
|
|||
|
|
return nil, fmt.Errorf("failed to unmarshal: %w", err)
|
|||
|
|
}
|
|||
|
|
return &result, nil
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// ExtractFile extract content from a file.
|
|||
|
|
//
|
|||
|
|
// This is the main entry point for file-based extraction. It performs the following steps:
|
|||
|
|
// 1. Check cache for existing result (if caching enabled)
|
|||
|
|
// 2. Detect or validate MIME type
|
|||
|
|
// 3. Select appropriate extractor from registry
|
|||
|
|
// 4. Extract content
|
|||
|
|
// 5. Run post-processing pipeline
|
|||
|
|
// 6. Store result in cache (if caching enabled)
|
|||
|
|
//
|
|||
|
|
// Arguments:
|
|||
|
|
// - path: Path to the file to extract
|
|||
|
|
// - mime_type: Optional MIME type override. If None, will be auto-detected
|
|||
|
|
// - config: Extraction configuration
|
|||
|
|
//
|
|||
|
|
// Returns an `ExtractionResult` containing the extracted content and metadata.
|
|||
|
|
//
|
|||
|
|
// Errors are returned when returns `KreuzbergError::Io` if the file doesn't exist (NotFound) or for other file I/O errors.
|
|||
|
|
// Returns `KreuzbergError::UnsupportedFormat` if MIME type is not supported.
|
|||
|
|
//
|
|||
|
|
// Example:
|
|||
|
|
//
|
|||
|
|
// let config = ExtractionConfig::default();
|
|||
|
|
// let result = extract_file("document.pdf", None, &config).await?;
|
|||
|
|
// println!("Content: {}", result.content);
|
|||
|
|
func ExtractFile(path string, mimeType *string, config ExtractionConfig) (*ExtractionResult, error) {
|
|||
|
|
cPath := C.CString(path)
|
|||
|
|
defer C.free(unsafe.Pointer(cPath))
|
|||
|
|
|
|||
|
|
var cMimeType *C.char
|
|||
|
|
if mimeType != nil {
|
|||
|
|
cMimeType = C.CString(*mimeType)
|
|||
|
|
defer C.free(unsafe.Pointer(cMimeType))
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
jsonBytescConfig, err := json.Marshal(config)
|
|||
|
|
if err != nil {
|
|||
|
|
return nil, fmt.Errorf("failed to marshal: %w", err)
|
|||
|
|
}
|
|||
|
|
// When the parameter is a nil pointer (Option<&T> on the Rust side), json.Marshal
|
|||
|
|
// emits "null" which the FFI's _from_json rejects. Substitute "{}" so a default
|
|||
|
|
// instance is constructed instead — semantically equivalent to None for query types
|
|||
|
|
// whose fields are all optional with serde(default).
|
|||
|
|
if string(jsonBytescConfig) == "null" {
|
|||
|
|
jsonBytescConfig = []byte("{}")
|
|||
|
|
}
|
|||
|
|
tmpStrcConfig := C.CString(string(jsonBytescConfig))
|
|||
|
|
cConfig := C.kreuzberg_extraction_config_from_json(tmpStrcConfig)
|
|||
|
|
C.free(unsafe.Pointer(tmpStrcConfig))
|
|||
|
|
if cConfig == nil {
|
|||
|
|
return nil, fmt.Errorf("failed to create extraction_config: %s", C.GoString(C.kreuzberg_last_error_context()))
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_extraction_config_free(cConfig)
|
|||
|
|
|
|||
|
|
ptr := C.kreuzberg_extract_file(cPath, cMimeType, cConfig)
|
|||
|
|
if err := lastError(); err != nil {
|
|||
|
|
if ptr != nil {
|
|||
|
|
C.kreuzberg_extraction_result_free(ptr)
|
|||
|
|
}
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_extraction_result_free(ptr)
|
|||
|
|
jsonPtr := C.kreuzberg_extraction_result_to_json(ptr)
|
|||
|
|
if jsonPtr == nil {
|
|||
|
|
return nil, fmt.Errorf("failed to convert to JSON")
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_free_string(jsonPtr)
|
|||
|
|
var result ExtractionResult
|
|||
|
|
if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
|
|||
|
|
return nil, fmt.Errorf("failed to unmarshal: %w", err)
|
|||
|
|
}
|
|||
|
|
return &result, nil
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// ExtractFileSync synchronous wrapper for `extract_file`.
|
|||
|
|
//
|
|||
|
|
// This is a convenience function that blocks the current thread until extraction completes.
|
|||
|
|
// For async code, use `extract_file` directly.
|
|||
|
|
//
|
|||
|
|
// Uses the global Tokio runtime for 100x+ performance improvement over creating
|
|||
|
|
// a new runtime per call. Always uses the global runtime to avoid nested runtime issues.
|
|||
|
|
//
|
|||
|
|
// This function is only available with the `tokio-runtime` feature. For WASM targets,
|
|||
|
|
// use a truly synchronous extraction approach instead.
|
|||
|
|
//
|
|||
|
|
// Example:
|
|||
|
|
//
|
|||
|
|
// let config = ExtractionConfig::default();
|
|||
|
|
// let result = extract_file_sync("document.pdf", None, &config)?;
|
|||
|
|
// println!("Content: {}", result.content);
|
|||
|
|
func ExtractFileSync(path string, mimeType *string, config ExtractionConfig) (*ExtractionResult, error) {
|
|||
|
|
cPath := C.CString(path)
|
|||
|
|
defer C.free(unsafe.Pointer(cPath))
|
|||
|
|
|
|||
|
|
var cMimeType *C.char
|
|||
|
|
if mimeType != nil {
|
|||
|
|
cMimeType = C.CString(*mimeType)
|
|||
|
|
defer C.free(unsafe.Pointer(cMimeType))
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
jsonBytescConfig, err := json.Marshal(config)
|
|||
|
|
if err != nil {
|
|||
|
|
return nil, fmt.Errorf("failed to marshal: %w", err)
|
|||
|
|
}
|
|||
|
|
// When the parameter is a nil pointer (Option<&T> on the Rust side), json.Marshal
|
|||
|
|
// emits "null" which the FFI's _from_json rejects. Substitute "{}" so a default
|
|||
|
|
// instance is constructed instead — semantically equivalent to None for query types
|
|||
|
|
// whose fields are all optional with serde(default).
|
|||
|
|
if string(jsonBytescConfig) == "null" {
|
|||
|
|
jsonBytescConfig = []byte("{}")
|
|||
|
|
}
|
|||
|
|
tmpStrcConfig := C.CString(string(jsonBytescConfig))
|
|||
|
|
cConfig := C.kreuzberg_extraction_config_from_json(tmpStrcConfig)
|
|||
|
|
C.free(unsafe.Pointer(tmpStrcConfig))
|
|||
|
|
if cConfig == nil {
|
|||
|
|
return nil, fmt.Errorf("failed to create extraction_config: %s", C.GoString(C.kreuzberg_last_error_context()))
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_extraction_config_free(cConfig)
|
|||
|
|
|
|||
|
|
ptr := C.kreuzberg_extract_file_sync(cPath, cMimeType, cConfig)
|
|||
|
|
if err := lastError(); err != nil {
|
|||
|
|
if ptr != nil {
|
|||
|
|
C.kreuzberg_extraction_result_free(ptr)
|
|||
|
|
}
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_extraction_result_free(ptr)
|
|||
|
|
jsonPtr := C.kreuzberg_extraction_result_to_json(ptr)
|
|||
|
|
if jsonPtr == nil {
|
|||
|
|
return nil, fmt.Errorf("failed to convert to JSON")
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_free_string(jsonPtr)
|
|||
|
|
var result ExtractionResult
|
|||
|
|
if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
|
|||
|
|
return nil, fmt.Errorf("failed to unmarshal: %w", err)
|
|||
|
|
}
|
|||
|
|
return &result, nil
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// ExtractBytesSync synchronous wrapper for `extract_bytes`.
|
|||
|
|
//
|
|||
|
|
// Uses the global Tokio runtime for 100x+ performance improvement over creating
|
|||
|
|
// a new runtime per call.
|
|||
|
|
//
|
|||
|
|
// With the `tokio-runtime` feature, this blocks the current thread using the global
|
|||
|
|
// Tokio runtime. Without it (WASM), this calls a truly synchronous implementation.
|
|||
|
|
//
|
|||
|
|
// Example:
|
|||
|
|
//
|
|||
|
|
// let config = ExtractionConfig::default();
|
|||
|
|
// let bytes = b"Hello, world!";
|
|||
|
|
// let result = extract_bytes_sync(bytes, "text/plain", &config)?;
|
|||
|
|
// println!("Content: {}", result.content);
|
|||
|
|
func ExtractBytesSync(content []byte, mimeType string, config ExtractionConfig) (*ExtractionResult, error) {
|
|||
|
|
var cContent *C.uint8_t
|
|||
|
|
if len(content) > 0 {
|
|||
|
|
var cContentPinner runtime.Pinner
|
|||
|
|
cContentPinner.Pin(&content[0])
|
|||
|
|
defer cContentPinner.Unpin()
|
|||
|
|
cContent = (*C.uint8_t)(unsafe.Pointer(&content[0]))
|
|||
|
|
}
|
|||
|
|
cContentLen := C.uintptr_t(len(content))
|
|||
|
|
|
|||
|
|
cMimeType := C.CString(mimeType)
|
|||
|
|
defer C.free(unsafe.Pointer(cMimeType))
|
|||
|
|
|
|||
|
|
jsonBytescConfig, err := json.Marshal(config)
|
|||
|
|
if err != nil {
|
|||
|
|
return nil, fmt.Errorf("failed to marshal: %w", err)
|
|||
|
|
}
|
|||
|
|
// When the parameter is a nil pointer (Option<&T> on the Rust side), json.Marshal
|
|||
|
|
// emits "null" which the FFI's _from_json rejects. Substitute "{}" so a default
|
|||
|
|
// instance is constructed instead — semantically equivalent to None for query types
|
|||
|
|
// whose fields are all optional with serde(default).
|
|||
|
|
if string(jsonBytescConfig) == "null" {
|
|||
|
|
jsonBytescConfig = []byte("{}")
|
|||
|
|
}
|
|||
|
|
tmpStrcConfig := C.CString(string(jsonBytescConfig))
|
|||
|
|
cConfig := C.kreuzberg_extraction_config_from_json(tmpStrcConfig)
|
|||
|
|
C.free(unsafe.Pointer(tmpStrcConfig))
|
|||
|
|
if cConfig == nil {
|
|||
|
|
return nil, fmt.Errorf("failed to create extraction_config: %s", C.GoString(C.kreuzberg_last_error_context()))
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_extraction_config_free(cConfig)
|
|||
|
|
|
|||
|
|
ptr := C.kreuzberg_extract_bytes_sync(cContent, cContentLen, cMimeType, cConfig)
|
|||
|
|
if err := lastError(); err != nil {
|
|||
|
|
if ptr != nil {
|
|||
|
|
C.kreuzberg_extraction_result_free(ptr)
|
|||
|
|
}
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_extraction_result_free(ptr)
|
|||
|
|
jsonPtr := C.kreuzberg_extraction_result_to_json(ptr)
|
|||
|
|
if jsonPtr == nil {
|
|||
|
|
return nil, fmt.Errorf("failed to convert to JSON")
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_free_string(jsonPtr)
|
|||
|
|
var result ExtractionResult
|
|||
|
|
if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
|
|||
|
|
return nil, fmt.Errorf("failed to unmarshal: %w", err)
|
|||
|
|
}
|
|||
|
|
return &result, nil
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// BatchExtractFilesSync synchronous wrapper for `batch_extract_files`.
|
|||
|
|
//
|
|||
|
|
// Uses the global Tokio runtime for optimal performance.
|
|||
|
|
// Only available with `tokio-runtime` (WASM has no filesystem).
|
|||
|
|
//
|
|||
|
|
// Example:
|
|||
|
|
//
|
|||
|
|
// let config = ExtractionConfig::default();
|
|||
|
|
// let items = vec![
|
|||
|
|
// BatchFileItem {
|
|||
|
|
// path: "doc1.pdf".into(),
|
|||
|
|
// config: Some(FileExtractionConfig { force_ocr: Some(true), ..Default::default() }),
|
|||
|
|
// },
|
|||
|
|
// BatchFileItem { path: "doc2.pdf".into(), config: None },
|
|||
|
|
// ];
|
|||
|
|
// let results = batch_extract_files_sync(items, &config)?;
|
|||
|
|
func BatchExtractFilesSync(items []BatchFileItem, config ExtractionConfig) ([]ExtractionResult, error) {
|
|||
|
|
jsonBytescItems, err := json.Marshal(items)
|
|||
|
|
if err != nil {
|
|||
|
|
return nil, fmt.Errorf("failed to marshal: %w", err)
|
|||
|
|
}
|
|||
|
|
cItems := C.CString(string(jsonBytescItems))
|
|||
|
|
defer C.free(unsafe.Pointer(cItems))
|
|||
|
|
|
|||
|
|
jsonBytescConfig, err := json.Marshal(config)
|
|||
|
|
if err != nil {
|
|||
|
|
return nil, fmt.Errorf("failed to marshal: %w", err)
|
|||
|
|
}
|
|||
|
|
// When the parameter is a nil pointer (Option<&T> on the Rust side), json.Marshal
|
|||
|
|
// emits "null" which the FFI's _from_json rejects. Substitute "{}" so a default
|
|||
|
|
// instance is constructed instead — semantically equivalent to None for query types
|
|||
|
|
// whose fields are all optional with serde(default).
|
|||
|
|
if string(jsonBytescConfig) == "null" {
|
|||
|
|
jsonBytescConfig = []byte("{}")
|
|||
|
|
}
|
|||
|
|
tmpStrcConfig := C.CString(string(jsonBytescConfig))
|
|||
|
|
cConfig := C.kreuzberg_extraction_config_from_json(tmpStrcConfig)
|
|||
|
|
C.free(unsafe.Pointer(tmpStrcConfig))
|
|||
|
|
if cConfig == nil {
|
|||
|
|
return nil, fmt.Errorf("failed to create extraction_config: %s", C.GoString(C.kreuzberg_last_error_context()))
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_extraction_config_free(cConfig)
|
|||
|
|
|
|||
|
|
ptr := C.kreuzberg_batch_extract_files_sync(cItems, cConfig)
|
|||
|
|
if err := lastError(); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
if ptr == nil {
|
|||
|
|
return nil, fmt.Errorf("failed to get result")
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_free_string(ptr)
|
|||
|
|
var result []ExtractionResult
|
|||
|
|
if err := json.Unmarshal([]byte(C.GoString(ptr)), &result); err != nil {
|
|||
|
|
return nil, fmt.Errorf("failed to unmarshal: %w", err)
|
|||
|
|
}
|
|||
|
|
return result, nil
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// BatchExtractBytesSync synchronous wrapper for `batch_extract_bytes`.
|
|||
|
|
//
|
|||
|
|
// Uses the global Tokio runtime for optimal performance.
|
|||
|
|
// With the `tokio-runtime` feature, this blocks the current thread using the global
|
|||
|
|
// Tokio runtime. Without it (WASM), this calls a truly synchronous implementation
|
|||
|
|
// that iterates through items and calls `extract_bytes_sync()`.
|
|||
|
|
//
|
|||
|
|
// Example:
|
|||
|
|
//
|
|||
|
|
// let config = ExtractionConfig::default();
|
|||
|
|
// let items = vec![
|
|||
|
|
// BatchBytesItem { content: b"content".to_vec(), mime_type: "text/plain".to_string(), config: None },
|
|||
|
|
// BatchBytesItem {
|
|||
|
|
// content: b"other".to_vec(),
|
|||
|
|
// mime_type: "text/plain".to_string(),
|
|||
|
|
// config: Some(FileExtractionConfig { force_ocr: Some(true), ..Default::default() }),
|
|||
|
|
// },
|
|||
|
|
// ];
|
|||
|
|
// let results = batch_extract_bytes_sync(items, &config)?;
|
|||
|
|
func BatchExtractBytesSync(items []BatchBytesItem, config ExtractionConfig) ([]ExtractionResult, error) {
|
|||
|
|
jsonBytescItems, err := json.Marshal(items)
|
|||
|
|
if err != nil {
|
|||
|
|
return nil, fmt.Errorf("failed to marshal: %w", err)
|
|||
|
|
}
|
|||
|
|
cItems := C.CString(string(jsonBytescItems))
|
|||
|
|
defer C.free(unsafe.Pointer(cItems))
|
|||
|
|
|
|||
|
|
jsonBytescConfig, err := json.Marshal(config)
|
|||
|
|
if err != nil {
|
|||
|
|
return nil, fmt.Errorf("failed to marshal: %w", err)
|
|||
|
|
}
|
|||
|
|
// When the parameter is a nil pointer (Option<&T> on the Rust side), json.Marshal
|
|||
|
|
// emits "null" which the FFI's _from_json rejects. Substitute "{}" so a default
|
|||
|
|
// instance is constructed instead — semantically equivalent to None for query types
|
|||
|
|
// whose fields are all optional with serde(default).
|
|||
|
|
if string(jsonBytescConfig) == "null" {
|
|||
|
|
jsonBytescConfig = []byte("{}")
|
|||
|
|
}
|
|||
|
|
tmpStrcConfig := C.CString(string(jsonBytescConfig))
|
|||
|
|
cConfig := C.kreuzberg_extraction_config_from_json(tmpStrcConfig)
|
|||
|
|
C.free(unsafe.Pointer(tmpStrcConfig))
|
|||
|
|
if cConfig == nil {
|
|||
|
|
return nil, fmt.Errorf("failed to create extraction_config: %s", C.GoString(C.kreuzberg_last_error_context()))
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_extraction_config_free(cConfig)
|
|||
|
|
|
|||
|
|
ptr := C.kreuzberg_batch_extract_bytes_sync(cItems, cConfig)
|
|||
|
|
if err := lastError(); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
if ptr == nil {
|
|||
|
|
return nil, fmt.Errorf("failed to get result")
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_free_string(ptr)
|
|||
|
|
var result []ExtractionResult
|
|||
|
|
if err := json.Unmarshal([]byte(C.GoString(ptr)), &result); err != nil {
|
|||
|
|
return nil, fmt.Errorf("failed to unmarshal: %w", err)
|
|||
|
|
}
|
|||
|
|
return result, nil
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// BatchExtractFiles extract content from multiple files concurrently.
|
|||
|
|
//
|
|||
|
|
// This function processes multiple files in parallel, automatically managing
|
|||
|
|
// concurrency to prevent resource exhaustion. The concurrency limit can be
|
|||
|
|
// configured via `ExtractionConfig::max_concurrent_extractions` or defaults
|
|||
|
|
// to `(num_cpus * 1.5).ceil()`.
|
|||
|
|
//
|
|||
|
|
// Each file can optionally specify a [`FileExtractionConfig`] that overrides specific
|
|||
|
|
// fields from the batch-level `config`. Pass `None` for a file to use the batch defaults.
|
|||
|
|
// Batch-level settings like `max_concurrent_extractions` and `use_cache` are always
|
|||
|
|
// taken from the batch-level `config`.
|
|||
|
|
//
|
|||
|
|
// Arguments:
|
|||
|
|
// - items: Vector of `BatchFileItem` structs, each containing a path and optional per-file configuration overrides.
|
|||
|
|
// - config: Batch-level extraction configuration (provides defaults and batch settings)
|
|||
|
|
//
|
|||
|
|
// Returns a vector of `ExtractionResult` in the same order as the input items.
|
|||
|
|
//
|
|||
|
|
// Errors are returned when individual file errors are captured in the result metadata. System errors
|
|||
|
|
// (IO, RuntimeError equivalents) will bubble up and fail the entire batch.
|
|||
|
|
//
|
|||
|
|
// Example:
|
|||
|
|
//
|
|||
|
|
// Simple usage with no per-file overrides:
|
|||
|
|
//
|
|||
|
|
//
|
|||
|
|
// let config = ExtractionConfig::default();
|
|||
|
|
// let items = vec![
|
|||
|
|
// BatchFileItem { path: "doc1.pdf".into(), config: None },
|
|||
|
|
// BatchFileItem { path: "doc2.pdf".into(), config: None },
|
|||
|
|
// ];
|
|||
|
|
// let results = batch_extract_files(items, &config).await?;
|
|||
|
|
// println!("Processed {} files", results.len());
|
|||
|
|
//
|
|||
|
|
// Per-file configuration overrides:
|
|||
|
|
//
|
|||
|
|
//
|
|||
|
|
// let config = ExtractionConfig::default();
|
|||
|
|
// let items = vec![
|
|||
|
|
// BatchFileItem {
|
|||
|
|
// path: "scan.pdf".into(),
|
|||
|
|
// config: Some(FileExtractionConfig { force_ocr: Some(true), ..Default::default() }),
|
|||
|
|
// },
|
|||
|
|
// BatchFileItem { path: "notes.txt".into(), config: None },
|
|||
|
|
// ];
|
|||
|
|
// let results = batch_extract_files(items, &config).await?;
|
|||
|
|
func BatchExtractFiles(items []BatchFileItem, config ExtractionConfig) ([]ExtractionResult, error) {
|
|||
|
|
jsonBytescItems, err := json.Marshal(items)
|
|||
|
|
if err != nil {
|
|||
|
|
return nil, fmt.Errorf("failed to marshal: %w", err)
|
|||
|
|
}
|
|||
|
|
cItems := C.CString(string(jsonBytescItems))
|
|||
|
|
defer C.free(unsafe.Pointer(cItems))
|
|||
|
|
|
|||
|
|
jsonBytescConfig, err := json.Marshal(config)
|
|||
|
|
if err != nil {
|
|||
|
|
return nil, fmt.Errorf("failed to marshal: %w", err)
|
|||
|
|
}
|
|||
|
|
// When the parameter is a nil pointer (Option<&T> on the Rust side), json.Marshal
|
|||
|
|
// emits "null" which the FFI's _from_json rejects. Substitute "{}" so a default
|
|||
|
|
// instance is constructed instead — semantically equivalent to None for query types
|
|||
|
|
// whose fields are all optional with serde(default).
|
|||
|
|
if string(jsonBytescConfig) == "null" {
|
|||
|
|
jsonBytescConfig = []byte("{}")
|
|||
|
|
}
|
|||
|
|
tmpStrcConfig := C.CString(string(jsonBytescConfig))
|
|||
|
|
cConfig := C.kreuzberg_extraction_config_from_json(tmpStrcConfig)
|
|||
|
|
C.free(unsafe.Pointer(tmpStrcConfig))
|
|||
|
|
if cConfig == nil {
|
|||
|
|
return nil, fmt.Errorf("failed to create extraction_config: %s", C.GoString(C.kreuzberg_last_error_context()))
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_extraction_config_free(cConfig)
|
|||
|
|
|
|||
|
|
ptr := C.kreuzberg_batch_extract_files(cItems, cConfig)
|
|||
|
|
if err := lastError(); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
if ptr == nil {
|
|||
|
|
return nil, fmt.Errorf("failed to get result")
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_free_string(ptr)
|
|||
|
|
var result []ExtractionResult
|
|||
|
|
if err := json.Unmarshal([]byte(C.GoString(ptr)), &result); err != nil {
|
|||
|
|
return nil, fmt.Errorf("failed to unmarshal: %w", err)
|
|||
|
|
}
|
|||
|
|
return result, nil
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// BatchExtractBytes extract content from multiple byte arrays concurrently.
|
|||
|
|
//
|
|||
|
|
// This function processes multiple byte arrays in parallel, automatically managing
|
|||
|
|
// concurrency to prevent resource exhaustion. The concurrency limit can be
|
|||
|
|
// configured via `ExtractionConfig::max_concurrent_extractions` or defaults
|
|||
|
|
// to `(num_cpus * 1.5).ceil()`.
|
|||
|
|
//
|
|||
|
|
// Each item can optionally specify a [`FileExtractionConfig`] that overrides specific
|
|||
|
|
// fields from the batch-level `config`. Pass `None` as the config to use
|
|||
|
|
// the batch-level defaults for that item.
|
|||
|
|
//
|
|||
|
|
// Arguments:
|
|||
|
|
// - items: Vector of `BatchBytesItem` structs, each containing content bytes, MIME type, and optional per-item configuration overrides.
|
|||
|
|
// - config: Batch-level extraction configuration
|
|||
|
|
//
|
|||
|
|
// Returns a vector of `ExtractionResult` in the same order as the input items.
|
|||
|
|
//
|
|||
|
|
// Example:
|
|||
|
|
//
|
|||
|
|
// Simple usage with no per-item overrides:
|
|||
|
|
//
|
|||
|
|
//
|
|||
|
|
// let config = ExtractionConfig::default();
|
|||
|
|
// let items = vec![
|
|||
|
|
// BatchBytesItem { content: b"content 1".to_vec(), mime_type: "text/plain".to_string(), config: None },
|
|||
|
|
// BatchBytesItem { content: b"content 2".to_vec(), mime_type: "text/plain".to_string(), config: None },
|
|||
|
|
// ];
|
|||
|
|
// let results = batch_extract_bytes(items, &config).await?;
|
|||
|
|
// println!("Processed {} items", results.len());
|
|||
|
|
//
|
|||
|
|
// Per-item configuration overrides:
|
|||
|
|
//
|
|||
|
|
//
|
|||
|
|
// let config = ExtractionConfig::default();
|
|||
|
|
// let items = vec![
|
|||
|
|
// BatchBytesItem { content: b"content".to_vec(), mime_type: "text/plain".to_string(), config: None },
|
|||
|
|
// BatchBytesItem {
|
|||
|
|
// content: b"<html>test</html>".to_vec(),
|
|||
|
|
// mime_type: "text/html".to_string(),
|
|||
|
|
// config: Some(FileExtractionConfig { force_ocr: Some(true), ..Default::default() }),
|
|||
|
|
// },
|
|||
|
|
// ];
|
|||
|
|
// let results = batch_extract_bytes(items, &config).await?;
|
|||
|
|
func BatchExtractBytes(items []BatchBytesItem, config ExtractionConfig) ([]ExtractionResult, error) {
|
|||
|
|
jsonBytescItems, err := json.Marshal(items)
|
|||
|
|
if err != nil {
|
|||
|
|
return nil, fmt.Errorf("failed to marshal: %w", err)
|
|||
|
|
}
|
|||
|
|
cItems := C.CString(string(jsonBytescItems))
|
|||
|
|
defer C.free(unsafe.Pointer(cItems))
|
|||
|
|
|
|||
|
|
jsonBytescConfig, err := json.Marshal(config)
|
|||
|
|
if err != nil {
|
|||
|
|
return nil, fmt.Errorf("failed to marshal: %w", err)
|
|||
|
|
}
|
|||
|
|
// When the parameter is a nil pointer (Option<&T> on the Rust side), json.Marshal
|
|||
|
|
// emits "null" which the FFI's _from_json rejects. Substitute "{}" so a default
|
|||
|
|
// instance is constructed instead — semantically equivalent to None for query types
|
|||
|
|
// whose fields are all optional with serde(default).
|
|||
|
|
if string(jsonBytescConfig) == "null" {
|
|||
|
|
jsonBytescConfig = []byte("{}")
|
|||
|
|
}
|
|||
|
|
tmpStrcConfig := C.CString(string(jsonBytescConfig))
|
|||
|
|
cConfig := C.kreuzberg_extraction_config_from_json(tmpStrcConfig)
|
|||
|
|
C.free(unsafe.Pointer(tmpStrcConfig))
|
|||
|
|
if cConfig == nil {
|
|||
|
|
return nil, fmt.Errorf("failed to create extraction_config: %s", C.GoString(C.kreuzberg_last_error_context()))
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_extraction_config_free(cConfig)
|
|||
|
|
|
|||
|
|
ptr := C.kreuzberg_batch_extract_bytes(cItems, cConfig)
|
|||
|
|
if err := lastError(); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
if ptr == nil {
|
|||
|
|
return nil, fmt.Errorf("failed to get result")
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_free_string(ptr)
|
|||
|
|
var result []ExtractionResult
|
|||
|
|
if err := json.Unmarshal([]byte(C.GoString(ptr)), &result); err != nil {
|
|||
|
|
return nil, fmt.Errorf("failed to unmarshal: %w", err)
|
|||
|
|
}
|
|||
|
|
return result, nil
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// DetectMimeTypeFromBytes detect MIME type from raw file bytes.
|
|||
|
|
//
|
|||
|
|
// Uses magic byte signatures to detect file type from content.
|
|||
|
|
// Falls back to `infer` crate for comprehensive detection.
|
|||
|
|
//
|
|||
|
|
// For ZIP-based files, inspects contents to distinguish Office Open XML
|
|||
|
|
// formats (DOCX, XLSX, PPTX) from plain ZIP archives.
|
|||
|
|
//
|
|||
|
|
// Arguments:
|
|||
|
|
// - content: Raw file bytes
|
|||
|
|
//
|
|||
|
|
// Returns the detected MIME type string.
|
|||
|
|
//
|
|||
|
|
// Errors are returned when returns `KreuzbergError::UnsupportedFormat` if MIME type cannot be determined.
|
|||
|
|
func DetectMimeTypeFromBytes(content []byte) (string, error) {
|
|||
|
|
var cContent *C.uint8_t
|
|||
|
|
if len(content) > 0 {
|
|||
|
|
var cContentPinner runtime.Pinner
|
|||
|
|
cContentPinner.Pin(&content[0])
|
|||
|
|
defer cContentPinner.Unpin()
|
|||
|
|
cContent = (*C.uint8_t)(unsafe.Pointer(&content[0]))
|
|||
|
|
}
|
|||
|
|
cContentLen := C.uintptr_t(len(content))
|
|||
|
|
|
|||
|
|
ptr := C.kreuzberg_detect_mime_type_from_bytes(cContent, cContentLen)
|
|||
|
|
if err := lastError(); err != nil {
|
|||
|
|
if ptr != nil {
|
|||
|
|
C.kreuzberg_free_string(ptr)
|
|||
|
|
}
|
|||
|
|
return "", err
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_free_string(ptr)
|
|||
|
|
return C.GoString(ptr), nil
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// GetExtensionsForMime get file extensions for a given MIME type.
|
|||
|
|
//
|
|||
|
|
// Returns all known file extensions that map to the specified MIME type.
|
|||
|
|
//
|
|||
|
|
// Arguments:
|
|||
|
|
// - mime_type: The MIME type to look up
|
|||
|
|
//
|
|||
|
|
// Returns a vector of file extensions (without leading dot) for the MIME type.
|
|||
|
|
//
|
|||
|
|
// Example:
|
|||
|
|
//
|
|||
|
|
// let extensions = get_extensions_for_mime("application/pdf");
|
|||
|
|
// assert_eq!(extensions, vec!["pdf"]);
|
|||
|
|
//
|
|||
|
|
// let doc_extensions = get_extensions_for_mime("application/vnd.openxmlformats-officedocument.wordprocessingml.document");
|
|||
|
|
// assert!(doc_extensions.contains(&"docx".to_string()));
|
|||
|
|
func GetExtensionsForMime(mimeType string) ([]string, error) {
|
|||
|
|
cMimeType := C.CString(mimeType)
|
|||
|
|
defer C.free(unsafe.Pointer(cMimeType))
|
|||
|
|
|
|||
|
|
ptr := C.kreuzberg_get_extensions_for_mime(cMimeType)
|
|||
|
|
if err := lastError(); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
if ptr == nil {
|
|||
|
|
return nil, fmt.Errorf("failed to get result")
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_free_string(ptr)
|
|||
|
|
var result []string
|
|||
|
|
if err := json.Unmarshal([]byte(C.GoString(ptr)), &result); err != nil {
|
|||
|
|
return nil, fmt.Errorf("failed to unmarshal: %w", err)
|
|||
|
|
}
|
|||
|
|
return result, nil
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// ListEmbeddingBackends list the names of all registered embedding backends.
|
|||
|
|
//
|
|||
|
|
// Used by `kreuzberg-cli`, the api/mcp endpoints, and generated language
|
|||
|
|
// bindings.
|
|||
|
|
func ListEmbeddingBackends() ([]string, error) {
|
|||
|
|
ptr := C.kreuzberg_list_embedding_backends()
|
|||
|
|
if err := lastError(); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
if ptr == nil {
|
|||
|
|
return nil, fmt.Errorf("failed to get result")
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_free_string(ptr)
|
|||
|
|
var result []string
|
|||
|
|
if err := json.Unmarshal([]byte(C.GoString(ptr)), &result); err != nil {
|
|||
|
|
return nil, fmt.Errorf("failed to unmarshal: %w", err)
|
|||
|
|
}
|
|||
|
|
return result, nil
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// ListDocumentExtractors list names of all registered document extractors.
|
|||
|
|
func ListDocumentExtractors() ([]string, error) {
|
|||
|
|
ptr := C.kreuzberg_list_document_extractors()
|
|||
|
|
if err := lastError(); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
if ptr == nil {
|
|||
|
|
return nil, fmt.Errorf("failed to get result")
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_free_string(ptr)
|
|||
|
|
var result []string
|
|||
|
|
if err := json.Unmarshal([]byte(C.GoString(ptr)), &result); err != nil {
|
|||
|
|
return nil, fmt.Errorf("failed to unmarshal: %w", err)
|
|||
|
|
}
|
|||
|
|
return result, nil
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// ListOcrBackends list all registered OCR backends.
|
|||
|
|
//
|
|||
|
|
// Returns the names of all OCR backends currently registered in the global registry.
|
|||
|
|
//
|
|||
|
|
// Returns a vector of OCR backend names.
|
|||
|
|
//
|
|||
|
|
// Example:
|
|||
|
|
//
|
|||
|
|
// let backends = list_ocr_backends()?;
|
|||
|
|
// for name in backends {
|
|||
|
|
// println!("Registered OCR backend: {}", name);
|
|||
|
|
// }
|
|||
|
|
func ListOcrBackends() ([]string, error) {
|
|||
|
|
ptr := C.kreuzberg_list_ocr_backends()
|
|||
|
|
if err := lastError(); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
if ptr == nil {
|
|||
|
|
return nil, fmt.Errorf("failed to get result")
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_free_string(ptr)
|
|||
|
|
var result []string
|
|||
|
|
if err := json.Unmarshal([]byte(C.GoString(ptr)), &result); err != nil {
|
|||
|
|
return nil, fmt.Errorf("failed to unmarshal: %w", err)
|
|||
|
|
}
|
|||
|
|
return result, nil
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// ListPostProcessors list all registered post-processor names.
|
|||
|
|
//
|
|||
|
|
// Returns a vector of all post-processor names currently registered in the
|
|||
|
|
// global registry.
|
|||
|
|
//
|
|||
|
|
// Returns - `Ok(Vec<String>)` - Vector of post-processor names
|
|||
|
|
// - `Err(...)` if the registry lock is poisoned
|
|||
|
|
//
|
|||
|
|
// Example:
|
|||
|
|
//
|
|||
|
|
// let processors = list_post_processors()?;
|
|||
|
|
// for name in processors {
|
|||
|
|
// println!("Registered post-processor: {}", name);
|
|||
|
|
// }
|
|||
|
|
func ListPostProcessors() ([]string, error) {
|
|||
|
|
ptr := C.kreuzberg_list_post_processors()
|
|||
|
|
if err := lastError(); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
if ptr == nil {
|
|||
|
|
return nil, fmt.Errorf("failed to get result")
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_free_string(ptr)
|
|||
|
|
var result []string
|
|||
|
|
if err := json.Unmarshal([]byte(C.GoString(ptr)), &result); err != nil {
|
|||
|
|
return nil, fmt.Errorf("failed to unmarshal: %w", err)
|
|||
|
|
}
|
|||
|
|
return result, nil
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// ListRenderers list names of all registered renderers.
|
|||
|
|
//
|
|||
|
|
// Errors are returned when returns an error if the registry lock is poisoned.
|
|||
|
|
func ListRenderers() ([]string, error) {
|
|||
|
|
ptr := C.kreuzberg_list_renderers()
|
|||
|
|
if err := lastError(); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
if ptr == nil {
|
|||
|
|
return nil, fmt.Errorf("failed to get result")
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_free_string(ptr)
|
|||
|
|
var result []string
|
|||
|
|
if err := json.Unmarshal([]byte(C.GoString(ptr)), &result); err != nil {
|
|||
|
|
return nil, fmt.Errorf("failed to unmarshal: %w", err)
|
|||
|
|
}
|
|||
|
|
return result, nil
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// ListValidators list names of all registered validators.
|
|||
|
|
func ListValidators() ([]string, error) {
|
|||
|
|
ptr := C.kreuzberg_list_validators()
|
|||
|
|
if err := lastError(); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
if ptr == nil {
|
|||
|
|
return nil, fmt.Errorf("failed to get result")
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_free_string(ptr)
|
|||
|
|
var result []string
|
|||
|
|
if err := json.Unmarshal([]byte(C.GoString(ptr)), &result); err != nil {
|
|||
|
|
return nil, fmt.Errorf("failed to unmarshal: %w", err)
|
|||
|
|
}
|
|||
|
|
return result, nil
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Compare two extraction results and return a structured diff.
|
|||
|
|
//
|
|||
|
|
// The comparison is purely structural — no I/O, no side effects. All fields
|
|||
|
|
// of [`ExtractionDiff`] are populated according to the provided [`DiffOptions`].
|
|||
|
|
//
|
|||
|
|
// Arguments:
|
|||
|
|
// - a: — the "before" extraction result
|
|||
|
|
// - b: — the "after" extraction result
|
|||
|
|
// - opts: — controls which sections are compared and optional truncation
|
|||
|
|
//
|
|||
|
|
// Example:
|
|||
|
|
//
|
|||
|
|
// let mut a = ExtractionResult::default();
|
|||
|
|
// let mut b = ExtractionResult::default();
|
|||
|
|
// a.content = "Hello world".to_string();
|
|||
|
|
// b.content = "Hello Rust".to_string();
|
|||
|
|
//
|
|||
|
|
// let diff = compare(&a, &b, &DiffOptions::default());
|
|||
|
|
// assert_eq!(diff.content_diff.len(), 1);
|
|||
|
|
func Compare(a ExtractionResult, b ExtractionResult, opts DiffOptions) (*ExtractionDiff, error) {
|
|||
|
|
jsonBytesca, err := json.Marshal(a)
|
|||
|
|
if err != nil {
|
|||
|
|
return nil, fmt.Errorf("failed to marshal: %w", err)
|
|||
|
|
}
|
|||
|
|
// When the parameter is a nil pointer (Option<&T> on the Rust side), json.Marshal
|
|||
|
|
// emits "null" which the FFI's _from_json rejects. Substitute "{}" so a default
|
|||
|
|
// instance is constructed instead — semantically equivalent to None for query types
|
|||
|
|
// whose fields are all optional with serde(default).
|
|||
|
|
if string(jsonBytesca) == "null" {
|
|||
|
|
jsonBytesca = []byte("{}")
|
|||
|
|
}
|
|||
|
|
tmpStrca := C.CString(string(jsonBytesca))
|
|||
|
|
ca := C.kreuzberg_extraction_result_from_json(tmpStrca)
|
|||
|
|
C.free(unsafe.Pointer(tmpStrca))
|
|||
|
|
if ca == nil {
|
|||
|
|
return nil, fmt.Errorf("failed to create extraction_result: %s", C.GoString(C.kreuzberg_last_error_context()))
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_extraction_result_free(ca)
|
|||
|
|
|
|||
|
|
jsonBytescb, err := json.Marshal(b)
|
|||
|
|
if err != nil {
|
|||
|
|
return nil, fmt.Errorf("failed to marshal: %w", err)
|
|||
|
|
}
|
|||
|
|
// When the parameter is a nil pointer (Option<&T> on the Rust side), json.Marshal
|
|||
|
|
// emits "null" which the FFI's _from_json rejects. Substitute "{}" so a default
|
|||
|
|
// instance is constructed instead — semantically equivalent to None for query types
|
|||
|
|
// whose fields are all optional with serde(default).
|
|||
|
|
if string(jsonBytescb) == "null" {
|
|||
|
|
jsonBytescb = []byte("{}")
|
|||
|
|
}
|
|||
|
|
tmpStrcb := C.CString(string(jsonBytescb))
|
|||
|
|
cb := C.kreuzberg_extraction_result_from_json(tmpStrcb)
|
|||
|
|
C.free(unsafe.Pointer(tmpStrcb))
|
|||
|
|
if cb == nil {
|
|||
|
|
return nil, fmt.Errorf("failed to create extraction_result: %s", C.GoString(C.kreuzberg_last_error_context()))
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_extraction_result_free(cb)
|
|||
|
|
|
|||
|
|
jsonBytescOpts, err := json.Marshal(opts)
|
|||
|
|
if err != nil {
|
|||
|
|
return nil, fmt.Errorf("failed to marshal: %w", err)
|
|||
|
|
}
|
|||
|
|
// When the parameter is a nil pointer (Option<&T> on the Rust side), json.Marshal
|
|||
|
|
// emits "null" which the FFI's _from_json rejects. Substitute "{}" so a default
|
|||
|
|
// instance is constructed instead — semantically equivalent to None for query types
|
|||
|
|
// whose fields are all optional with serde(default).
|
|||
|
|
if string(jsonBytescOpts) == "null" {
|
|||
|
|
jsonBytescOpts = []byte("{}")
|
|||
|
|
}
|
|||
|
|
tmpStrcOpts := C.CString(string(jsonBytescOpts))
|
|||
|
|
cOpts := C.kreuzberg_diff_options_from_json(tmpStrcOpts)
|
|||
|
|
C.free(unsafe.Pointer(tmpStrcOpts))
|
|||
|
|
if cOpts == nil {
|
|||
|
|
return nil, fmt.Errorf("failed to create diff_options: %s", C.GoString(C.kreuzberg_last_error_context()))
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_diff_options_free(cOpts)
|
|||
|
|
|
|||
|
|
ptr := C.kreuzberg_compare(ca, cb, cOpts)
|
|||
|
|
defer C.kreuzberg_extraction_diff_free(ptr)
|
|||
|
|
jsonPtr := C.kreuzberg_extraction_diff_to_json(ptr)
|
|||
|
|
if jsonPtr == nil {
|
|||
|
|
return nil, fmt.Errorf("failed to convert to JSON")
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_free_string(jsonPtr)
|
|||
|
|
var result ExtractionDiff
|
|||
|
|
if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
|
|||
|
|
return nil, fmt.Errorf("failed to unmarshal: %w", err)
|
|||
|
|
}
|
|||
|
|
return &result, nil
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// EmbedTextsAsync generate embeddings asynchronously for a list of text strings.
|
|||
|
|
//
|
|||
|
|
// This is the async counterpart to [`embed_texts`]. It offloads the blocking
|
|||
|
|
// ONNX inference work to a dedicated blocking thread pool via Tokio's
|
|||
|
|
// `spawn_blocking`, keeping the async executor free.
|
|||
|
|
//
|
|||
|
|
// Returns one embedding vector per input text in the same order.
|
|||
|
|
//
|
|||
|
|
// Arguments:
|
|||
|
|
// - texts: Vec of strings to embed (owned, sent to blocking thread)
|
|||
|
|
// - config: Embedding configuration specifying model, batch size, and normalization
|
|||
|
|
//
|
|||
|
|
// Errors are returned when - `KreuzbergError::MissingDependency` if ONNX Runtime is not installed
|
|||
|
|
// - `KreuzbergError::Embedding` if the preset name is unknown, model download fails,
|
|||
|
|
// or the blocking inference task panics
|
|||
|
|
//
|
|||
|
|
// Example:
|
|||
|
|
//
|
|||
|
|
// let embeddings = embed_texts_async(
|
|||
|
|
// vec!["Hello!".to_string()],
|
|||
|
|
// &EmbeddingConfig::default(),
|
|||
|
|
// ).await?;
|
|||
|
|
func EmbedTextsAsync(texts []string, config EmbeddingConfig) ([][]float32, error) {
|
|||
|
|
jsonBytescTexts, err := json.Marshal(texts)
|
|||
|
|
if err != nil {
|
|||
|
|
return nil, fmt.Errorf("failed to marshal: %w", err)
|
|||
|
|
}
|
|||
|
|
cTexts := C.CString(string(jsonBytescTexts))
|
|||
|
|
defer C.free(unsafe.Pointer(cTexts))
|
|||
|
|
|
|||
|
|
jsonBytescConfig, err := json.Marshal(config)
|
|||
|
|
if err != nil {
|
|||
|
|
return nil, fmt.Errorf("failed to marshal: %w", err)
|
|||
|
|
}
|
|||
|
|
// When the parameter is a nil pointer (Option<&T> on the Rust side), json.Marshal
|
|||
|
|
// emits "null" which the FFI's _from_json rejects. Substitute "{}" so a default
|
|||
|
|
// instance is constructed instead — semantically equivalent to None for query types
|
|||
|
|
// whose fields are all optional with serde(default).
|
|||
|
|
if string(jsonBytescConfig) == "null" {
|
|||
|
|
jsonBytescConfig = []byte("{}")
|
|||
|
|
}
|
|||
|
|
tmpStrcConfig := C.CString(string(jsonBytescConfig))
|
|||
|
|
cConfig := C.kreuzberg_embedding_config_from_json(tmpStrcConfig)
|
|||
|
|
C.free(unsafe.Pointer(tmpStrcConfig))
|
|||
|
|
if cConfig == nil {
|
|||
|
|
return nil, fmt.Errorf("failed to create embedding_config: %s", C.GoString(C.kreuzberg_last_error_context()))
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_embedding_config_free(cConfig)
|
|||
|
|
|
|||
|
|
ptr := C.kreuzberg_embed_texts_async(cTexts, cConfig)
|
|||
|
|
if err := lastError(); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
if ptr == nil {
|
|||
|
|
return nil, fmt.Errorf("failed to get result")
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_free_string(ptr)
|
|||
|
|
var result [][]float32
|
|||
|
|
if err := json.Unmarshal([]byte(C.GoString(ptr)), &result); err != nil {
|
|||
|
|
return nil, fmt.Errorf("failed to unmarshal: %w", err)
|
|||
|
|
}
|
|||
|
|
return result, nil
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// RenderPdfPageToPng render a single PDF page to PNG bytes.
|
|||
|
|
//
|
|||
|
|
// Returns raw PNG-encoded bytes for the specified page at the given DPI.
|
|||
|
|
// Uses pdf_oxide with tiny-skia for pure-Rust rendering.
|
|||
|
|
//
|
|||
|
|
// Arguments:
|
|||
|
|
// - pdf_bytes: Raw PDF file bytes
|
|||
|
|
// - page_index: Zero-based page index
|
|||
|
|
// - dpi: Resolution in dots per inch (default: 150)
|
|||
|
|
// - password: Optional password for encrypted PDFs
|
|||
|
|
//
|
|||
|
|
// Errors are returned when returns `KreuzbergError::Parsing` if the PDF cannot be opened, authenticated,
|
|||
|
|
// or rendered, or if `page_index` is out of range.
|
|||
|
|
func RenderPdfPageToPng(pdfBytes []byte, pageIndex uint, dpi *int32, password *string) ([]byte, error) {
|
|||
|
|
var cPdfBytes *C.uint8_t
|
|||
|
|
if len(pdfBytes) > 0 {
|
|||
|
|
var cPdfBytesPinner runtime.Pinner
|
|||
|
|
cPdfBytesPinner.Pin(&pdfBytes[0])
|
|||
|
|
defer cPdfBytesPinner.Unpin()
|
|||
|
|
cPdfBytes = (*C.uint8_t)(unsafe.Pointer(&pdfBytes[0]))
|
|||
|
|
}
|
|||
|
|
cPdfBytesLen := C.uintptr_t(len(pdfBytes))
|
|||
|
|
|
|||
|
|
cPageIndex := C.size_t(uint(pageIndex))
|
|||
|
|
|
|||
|
|
var cDpi C.int32_t = C.int32_t(int32(2147483647))
|
|||
|
|
if dpi != nil {
|
|||
|
|
cDpi = C.int32_t(int32(*dpi))
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
var cPassword *C.char
|
|||
|
|
if password != nil {
|
|||
|
|
cPassword = C.CString(*password)
|
|||
|
|
defer C.free(unsafe.Pointer(cPassword))
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
var outPtr *C.uint8_t
|
|||
|
|
var outLen, outCap C.uintptr_t
|
|||
|
|
rc := C.kreuzberg_render_pdf_page_to_png(cPdfBytes, cPdfBytesLen, cPageIndex, cDpi, cPassword, &outPtr, &outLen, &outCap)
|
|||
|
|
if rc != 0 {
|
|||
|
|
return nil, lastError()
|
|||
|
|
}
|
|||
|
|
if outPtr == nil {
|
|||
|
|
return nil, lastError()
|
|||
|
|
}
|
|||
|
|
result := C.GoBytes(unsafe.Pointer(outPtr), C.int(outLen))
|
|||
|
|
C.kreuzberg_free_bytes(outPtr, outLen, outCap)
|
|||
|
|
return result, nil
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// DetectMimeType detect the MIME type of a file at the given path.
|
|||
|
|
//
|
|||
|
|
// Uses the file extension and optionally the file content to determine the MIME type.
|
|||
|
|
// Set `check_exists` to `true` to verify the file exists before detection.
|
|||
|
|
func DetectMimeType(path string, checkExists bool) (string, error) {
|
|||
|
|
cPath := C.CString(path)
|
|||
|
|
defer C.free(unsafe.Pointer(cPath))
|
|||
|
|
|
|||
|
|
var cCheckExists C.int32_t
|
|||
|
|
if checkExists {
|
|||
|
|
cCheckExists = 1
|
|||
|
|
} else {
|
|||
|
|
cCheckExists = 0
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
ptr := C.kreuzberg_detect_mime_type(cPath, cCheckExists)
|
|||
|
|
if err := lastError(); err != nil {
|
|||
|
|
if ptr != nil {
|
|||
|
|
C.kreuzberg_free_string(ptr)
|
|||
|
|
}
|
|||
|
|
return "", err
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_free_string(ptr)
|
|||
|
|
return C.GoString(ptr), nil
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// EmbedTexts embed a list of texts using the configured embedding model.
|
|||
|
|
//
|
|||
|
|
// Returns a 2D vector where each inner vector is the embedding for the corresponding text.
|
|||
|
|
func EmbedTexts(texts []string, config EmbeddingConfig) ([][]float32, error) {
|
|||
|
|
jsonBytescTexts, err := json.Marshal(texts)
|
|||
|
|
if err != nil {
|
|||
|
|
return nil, fmt.Errorf("failed to marshal: %w", err)
|
|||
|
|
}
|
|||
|
|
cTexts := C.CString(string(jsonBytescTexts))
|
|||
|
|
defer C.free(unsafe.Pointer(cTexts))
|
|||
|
|
|
|||
|
|
jsonBytescConfig, err := json.Marshal(config)
|
|||
|
|
if err != nil {
|
|||
|
|
return nil, fmt.Errorf("failed to marshal: %w", err)
|
|||
|
|
}
|
|||
|
|
// When the parameter is a nil pointer (Option<&T> on the Rust side), json.Marshal
|
|||
|
|
// emits "null" which the FFI's _from_json rejects. Substitute "{}" so a default
|
|||
|
|
// instance is constructed instead — semantically equivalent to None for query types
|
|||
|
|
// whose fields are all optional with serde(default).
|
|||
|
|
if string(jsonBytescConfig) == "null" {
|
|||
|
|
jsonBytescConfig = []byte("{}")
|
|||
|
|
}
|
|||
|
|
tmpStrcConfig := C.CString(string(jsonBytescConfig))
|
|||
|
|
cConfig := C.kreuzberg_embedding_config_from_json(tmpStrcConfig)
|
|||
|
|
C.free(unsafe.Pointer(tmpStrcConfig))
|
|||
|
|
if cConfig == nil {
|
|||
|
|
return nil, fmt.Errorf("failed to create embedding_config: %s", C.GoString(C.kreuzberg_last_error_context()))
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_embedding_config_free(cConfig)
|
|||
|
|
|
|||
|
|
ptr := C.kreuzberg_embed_texts(cTexts, cConfig)
|
|||
|
|
if err := lastError(); err != nil {
|
|||
|
|
return nil, err
|
|||
|
|
}
|
|||
|
|
if ptr == nil {
|
|||
|
|
return nil, fmt.Errorf("failed to get result")
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_free_string(ptr)
|
|||
|
|
var result [][]float32
|
|||
|
|
if err := json.Unmarshal([]byte(C.GoString(ptr)), &result); err != nil {
|
|||
|
|
return nil, fmt.Errorf("failed to unmarshal: %w", err)
|
|||
|
|
}
|
|||
|
|
return result, nil
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// GetEmbeddingPreset get an embedding preset by name.
|
|||
|
|
//
|
|||
|
|
// Returns `None` if no preset with the given name exists. Returns an owned
|
|||
|
|
// clone so the value is safe to pass across FFI boundaries.
|
|||
|
|
func GetEmbeddingPreset(name string) *EmbeddingPreset {
|
|||
|
|
cName := C.CString(name)
|
|||
|
|
defer C.free(unsafe.Pointer(cName))
|
|||
|
|
|
|||
|
|
ptr := C.kreuzberg_get_embedding_preset(cName)
|
|||
|
|
return func() *EmbeddingPreset {
|
|||
|
|
jsonPtr := C.kreuzberg_embedding_preset_to_json(ptr)
|
|||
|
|
if jsonPtr == nil {
|
|||
|
|
return nil
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_free_string(jsonPtr)
|
|||
|
|
var result EmbeddingPreset
|
|||
|
|
if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
|
|||
|
|
return nil
|
|||
|
|
}
|
|||
|
|
return &result
|
|||
|
|
}()
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// ListEmbeddingPresets list the names of all available embedding presets.
|
|||
|
|
//
|
|||
|
|
// Returns owned `String`s so the values are safe to pass across FFI boundaries.
|
|||
|
|
func ListEmbeddingPresets() []string {
|
|||
|
|
ptr := C.kreuzberg_list_embedding_presets()
|
|||
|
|
return func() []string {
|
|||
|
|
if ptr == nil {
|
|||
|
|
return nil
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_free_string(ptr)
|
|||
|
|
var result []string
|
|||
|
|
if err := json.Unmarshal([]byte(C.GoString(ptr)), &result); err != nil {
|
|||
|
|
return nil
|
|||
|
|
}
|
|||
|
|
return result
|
|||
|
|
}()
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// NeedsImageProcessing check if image processing is needed by examining OCR and image extraction settings.
|
|||
|
|
//
|
|||
|
|
// Returns `true` if either OCR is enabled or image extraction is configured,
|
|||
|
|
// indicating that image decompression and processing should occur.
|
|||
|
|
// Returns `false` if both are disabled, allowing optimization to skip unnecessary
|
|||
|
|
// image decompression for text-only extraction workflows.
|
|||
|
|
//
|
|||
|
|
// # Optimization Impact
|
|||
|
|
// For text-only extractions (no OCR, no image extraction), skipping image
|
|||
|
|
// decompression can improve CPU utilization by 5-10% by avoiding wasteful
|
|||
|
|
// image I/O and processing when results won't be used.
|
|||
|
|
func (r *ExtractionConfig) NeedsImageProcessing() (bool, error) {
|
|||
|
|
jsonBytesRecv, err := json.Marshal(r)
|
|||
|
|
if err != nil {
|
|||
|
|
return false, fmt.Errorf("failed to marshal receiver: %w", err)
|
|||
|
|
}
|
|||
|
|
tmpStrRecv := C.CString(string(jsonBytesRecv))
|
|||
|
|
cRecv := C.kreuzberg_extraction_config_from_json(tmpStrRecv)
|
|||
|
|
C.free(unsafe.Pointer(tmpStrRecv))
|
|||
|
|
if cRecv == nil {
|
|||
|
|
return false, fmt.Errorf("failed to create receiver: %s", C.GoString(C.kreuzberg_last_error_context()))
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_extraction_config_free(cRecv)
|
|||
|
|
ptr := C.kreuzberg_extraction_config_needs_image_processing(cRecv)
|
|||
|
|
return ptr != 0, nil
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// ListenAddr get the server listen address (host:port).
|
|||
|
|
//
|
|||
|
|
// Example:
|
|||
|
|
//
|
|||
|
|
// let config = ServerConfig::default();
|
|||
|
|
// assert_eq!(config.listen_addr(), "127.0.0.1:8000");
|
|||
|
|
func (r *ServerConfig) ListenAddr() (string, error) {
|
|||
|
|
jsonBytesRecv, err := json.Marshal(r)
|
|||
|
|
if err != nil {
|
|||
|
|
return "", fmt.Errorf("failed to marshal receiver: %w", err)
|
|||
|
|
}
|
|||
|
|
tmpStrRecv := C.CString(string(jsonBytesRecv))
|
|||
|
|
cRecv := C.kreuzberg_server_config_from_json(tmpStrRecv)
|
|||
|
|
C.free(unsafe.Pointer(tmpStrRecv))
|
|||
|
|
if cRecv == nil {
|
|||
|
|
return "", fmt.Errorf("failed to create receiver: %s", C.GoString(C.kreuzberg_last_error_context()))
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_server_config_free(cRecv)
|
|||
|
|
ptr := C.kreuzberg_server_config_listen_addr(cRecv)
|
|||
|
|
defer C.kreuzberg_free_string(ptr)
|
|||
|
|
return C.GoString(ptr), nil
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// CorsAllowsAll check if CORS allows all origins.
|
|||
|
|
//
|
|||
|
|
// Returns `true` if the `cors_origins` vector is empty, meaning all origins
|
|||
|
|
// are allowed. Returns `false` if specific origins are configured.
|
|||
|
|
//
|
|||
|
|
// Example:
|
|||
|
|
//
|
|||
|
|
// let mut config = ServerConfig::default();
|
|||
|
|
// assert!(config.cors_allows_all());
|
|||
|
|
//
|
|||
|
|
// config.cors_origins.push("https://example.com".to_string());
|
|||
|
|
// assert!(!config.cors_allows_all());
|
|||
|
|
func (r *ServerConfig) CorsAllowsAll() (bool, error) {
|
|||
|
|
jsonBytesRecv, err := json.Marshal(r)
|
|||
|
|
if err != nil {
|
|||
|
|
return false, fmt.Errorf("failed to marshal receiver: %w", err)
|
|||
|
|
}
|
|||
|
|
tmpStrRecv := C.CString(string(jsonBytesRecv))
|
|||
|
|
cRecv := C.kreuzberg_server_config_from_json(tmpStrRecv)
|
|||
|
|
C.free(unsafe.Pointer(tmpStrRecv))
|
|||
|
|
if cRecv == nil {
|
|||
|
|
return false, fmt.Errorf("failed to create receiver: %s", C.GoString(C.kreuzberg_last_error_context()))
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_server_config_free(cRecv)
|
|||
|
|
ptr := C.kreuzberg_server_config_cors_allows_all(cRecv)
|
|||
|
|
return ptr != 0, nil
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// IsOriginAllowed check if a given origin is allowed by CORS configuration.
|
|||
|
|
//
|
|||
|
|
// Returns `true` if:
|
|||
|
|
// - CORS allows all origins (empty origins list), or
|
|||
|
|
// - The given origin is in the allowed origins list
|
|||
|
|
//
|
|||
|
|
// Arguments:
|
|||
|
|
// - origin: The origin to check (e.g., "https://example.com")
|
|||
|
|
//
|
|||
|
|
// Example:
|
|||
|
|
//
|
|||
|
|
// let mut config = ServerConfig::default();
|
|||
|
|
// assert!(config.is_origin_allowed("https://example.com"));
|
|||
|
|
//
|
|||
|
|
// config.cors_origins.push("https://allowed.com".to_string());
|
|||
|
|
// assert!(config.is_origin_allowed("https://allowed.com"));
|
|||
|
|
// assert!(!config.is_origin_allowed("https://denied.com"));
|
|||
|
|
func (r *ServerConfig) IsOriginAllowed(origin string) (bool, error) {
|
|||
|
|
cOrigin := C.CString(origin)
|
|||
|
|
defer C.free(unsafe.Pointer(cOrigin))
|
|||
|
|
|
|||
|
|
jsonBytesRecv, err := json.Marshal(r)
|
|||
|
|
if err != nil {
|
|||
|
|
return false, fmt.Errorf("failed to marshal receiver: %w", err)
|
|||
|
|
}
|
|||
|
|
tmpStrRecv := C.CString(string(jsonBytesRecv))
|
|||
|
|
cRecv := C.kreuzberg_server_config_from_json(tmpStrRecv)
|
|||
|
|
C.free(unsafe.Pointer(tmpStrRecv))
|
|||
|
|
if cRecv == nil {
|
|||
|
|
return false, fmt.Errorf("failed to create receiver: %s", C.GoString(C.kreuzberg_last_error_context()))
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_server_config_free(cRecv)
|
|||
|
|
ptr := C.kreuzberg_server_config_is_origin_allowed(cRecv, cOrigin)
|
|||
|
|
return ptr != 0, nil
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// MaxRequestBodyMb get maximum request body size in megabytes (rounded up).
|
|||
|
|
//
|
|||
|
|
// Example:
|
|||
|
|
//
|
|||
|
|
// let mut config = ServerConfig::default();
|
|||
|
|
// assert_eq!(config.max_request_body_mb(), 100);
|
|||
|
|
func (r *ServerConfig) MaxRequestBodyMb() (uint, error) {
|
|||
|
|
jsonBytesRecv, err := json.Marshal(r)
|
|||
|
|
if err != nil {
|
|||
|
|
return 0, fmt.Errorf("failed to marshal receiver: %w", err)
|
|||
|
|
}
|
|||
|
|
tmpStrRecv := C.CString(string(jsonBytesRecv))
|
|||
|
|
cRecv := C.kreuzberg_server_config_from_json(tmpStrRecv)
|
|||
|
|
C.free(unsafe.Pointer(tmpStrRecv))
|
|||
|
|
if cRecv == nil {
|
|||
|
|
return 0, fmt.Errorf("failed to create receiver: %s", C.GoString(C.kreuzberg_last_error_context()))
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_server_config_free(cRecv)
|
|||
|
|
ptr := C.kreuzberg_server_config_max_request_body_mb(cRecv)
|
|||
|
|
return uint(ptr), nil
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// MaxMultipartFieldMb get maximum multipart field size in megabytes (rounded up).
|
|||
|
|
//
|
|||
|
|
// Example:
|
|||
|
|
//
|
|||
|
|
// let mut config = ServerConfig::default();
|
|||
|
|
// assert_eq!(config.max_multipart_field_mb(), 100);
|
|||
|
|
func (r *ServerConfig) MaxMultipartFieldMb() (uint, error) {
|
|||
|
|
jsonBytesRecv, err := json.Marshal(r)
|
|||
|
|
if err != nil {
|
|||
|
|
return 0, fmt.Errorf("failed to marshal receiver: %w", err)
|
|||
|
|
}
|
|||
|
|
tmpStrRecv := C.CString(string(jsonBytesRecv))
|
|||
|
|
cRecv := C.kreuzberg_server_config_from_json(tmpStrRecv)
|
|||
|
|
C.free(unsafe.Pointer(tmpStrRecv))
|
|||
|
|
if cRecv == nil {
|
|||
|
|
return 0, fmt.Errorf("failed to create receiver: %s", C.GoString(C.kreuzberg_last_error_context()))
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_server_config_free(cRecv)
|
|||
|
|
ptr := C.kreuzberg_server_config_max_multipart_field_mb(cRecv)
|
|||
|
|
return uint(ptr), nil
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// FinalizeNodeTypes compute and populate the `node_types` field from the current `nodes`.
|
|||
|
|
//
|
|||
|
|
// Call this after all nodes have been added to the structure. Internal
|
|||
|
|
// construction paths (builder, derivation) call this automatically.
|
|||
|
|
//
|
|||
|
|
// Example:
|
|||
|
|
//
|
|||
|
|
// let mut structure = DocumentStructure {
|
|||
|
|
// nodes: vec![DocumentNode {
|
|||
|
|
// id: NodeId::from("n1"),
|
|||
|
|
// content: NodeContent::Paragraph { text: "Hello".into() },
|
|||
|
|
// parent: None,
|
|||
|
|
// children: vec![],
|
|||
|
|
// content_layer: Default::default(),
|
|||
|
|
// page: None,
|
|||
|
|
// page_end: None,
|
|||
|
|
// bbox: None,
|
|||
|
|
// annotations: vec![],
|
|||
|
|
// attributes: None,
|
|||
|
|
// }],
|
|||
|
|
// source_format: None,
|
|||
|
|
// relationships: vec![],
|
|||
|
|
// node_types: vec![],
|
|||
|
|
// };
|
|||
|
|
// structure.finalize_node_types();
|
|||
|
|
// assert!(structure.node_types.contains(&"paragraph".to_string()));
|
|||
|
|
func (r *DocumentStructure) FinalizeNodeTypes() error {
|
|||
|
|
jsonBytesRecv, err := json.Marshal(r)
|
|||
|
|
if err != nil {
|
|||
|
|
return fmt.Errorf("failed to marshal receiver: %w", err)
|
|||
|
|
}
|
|||
|
|
tmpStrRecv := C.CString(string(jsonBytesRecv))
|
|||
|
|
cRecv := C.kreuzberg_document_structure_from_json(tmpStrRecv)
|
|||
|
|
C.free(unsafe.Pointer(tmpStrRecv))
|
|||
|
|
if cRecv == nil {
|
|||
|
|
return fmt.Errorf("failed to create receiver: %s", C.GoString(C.kreuzberg_last_error_context()))
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_document_structure_free(cRecv)
|
|||
|
|
C.kreuzberg_document_structure_finalize_node_types(cRecv)
|
|||
|
|
jsonPtrUpdated := C.kreuzberg_document_structure_to_json(cRecv)
|
|||
|
|
if jsonPtrUpdated != nil {
|
|||
|
|
_ = json.Unmarshal([]byte(C.GoString(jsonPtrUpdated)), r)
|
|||
|
|
C.kreuzberg_free_string(jsonPtrUpdated)
|
|||
|
|
}
|
|||
|
|
return nil
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// IsEmpty check if the document structure is empty.
|
|||
|
|
func (r *DocumentStructure) IsEmpty() (bool, error) {
|
|||
|
|
jsonBytesRecv, err := json.Marshal(r)
|
|||
|
|
if err != nil {
|
|||
|
|
return false, fmt.Errorf("failed to marshal receiver: %w", err)
|
|||
|
|
}
|
|||
|
|
tmpStrRecv := C.CString(string(jsonBytesRecv))
|
|||
|
|
cRecv := C.kreuzberg_document_structure_from_json(tmpStrRecv)
|
|||
|
|
C.free(unsafe.Pointer(tmpStrRecv))
|
|||
|
|
if cRecv == nil {
|
|||
|
|
return false, fmt.Errorf("failed to create receiver: %s", C.GoString(C.kreuzberg_last_error_context()))
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_document_structure_free(cRecv)
|
|||
|
|
ptr := C.kreuzberg_document_structure_is_empty(cRecv)
|
|||
|
|
return ptr != 0, nil
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// FromOcr convert from an OCR result.
|
|||
|
|
func ExtractionResultFromOcr(ocr OcrExtractionResult) (*ExtractionResult, error) {
|
|||
|
|
jsonBytescOcr, err := json.Marshal(ocr)
|
|||
|
|
if err != nil {
|
|||
|
|
return nil, fmt.Errorf("failed to marshal: %w", err)
|
|||
|
|
}
|
|||
|
|
// When the parameter is a nil pointer (Option<&T> on the Rust side), json.Marshal
|
|||
|
|
// emits "null" which the FFI's _from_json rejects. Substitute "{}" so a default
|
|||
|
|
// instance is constructed instead — semantically equivalent to None for query types
|
|||
|
|
// whose fields are all optional with serde(default).
|
|||
|
|
if string(jsonBytescOcr) == "null" {
|
|||
|
|
jsonBytescOcr = []byte("{}")
|
|||
|
|
}
|
|||
|
|
tmpStrcOcr := C.CString(string(jsonBytescOcr))
|
|||
|
|
cOcr := C.kreuzberg_ocr_extraction_result_from_json(tmpStrcOcr)
|
|||
|
|
C.free(unsafe.Pointer(tmpStrcOcr))
|
|||
|
|
if cOcr == nil {
|
|||
|
|
return nil, fmt.Errorf("failed to create ocr_extraction_result: %s", C.GoString(C.kreuzberg_last_error_context()))
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_ocr_extraction_result_free(cOcr)
|
|||
|
|
|
|||
|
|
ptr := C.kreuzberg_extraction_result_from_ocr(cOcr)
|
|||
|
|
defer C.kreuzberg_extraction_result_free(ptr)
|
|||
|
|
return func() *ExtractionResult {
|
|||
|
|
jsonPtr := C.kreuzberg_extraction_result_to_json(ptr)
|
|||
|
|
if jsonPtr == nil {
|
|||
|
|
return nil
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_free_string(jsonPtr)
|
|||
|
|
var result ExtractionResult
|
|||
|
|
if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
|
|||
|
|
return nil
|
|||
|
|
}
|
|||
|
|
return &result
|
|||
|
|
}(), nil
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// IsEmpty returns `true` when no metadata fields, format-specific metadata, or
|
|||
|
|
// additional postprocessor fields are populated.
|
|||
|
|
func (r *Metadata) IsEmpty() (bool, error) {
|
|||
|
|
jsonBytesRecv, err := json.Marshal(r)
|
|||
|
|
if err != nil {
|
|||
|
|
return false, fmt.Errorf("failed to marshal receiver: %w", err)
|
|||
|
|
}
|
|||
|
|
tmpStrRecv := C.CString(string(jsonBytesRecv))
|
|||
|
|
cRecv := C.kreuzberg_metadata_from_json(tmpStrRecv)
|
|||
|
|
C.free(unsafe.Pointer(tmpStrRecv))
|
|||
|
|
if cRecv == nil {
|
|||
|
|
return false, fmt.Errorf("failed to create receiver: %s", C.GoString(C.kreuzberg_last_error_context()))
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_metadata_free(cRecv)
|
|||
|
|
ptr := C.kreuzberg_metadata_is_empty(cRecv)
|
|||
|
|
return ptr != 0, nil
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// WithCacheDir sets a custom cache directory for model files.
|
|||
|
|
//
|
|||
|
|
// Arguments:
|
|||
|
|
// - path: Path to cache directory
|
|||
|
|
//
|
|||
|
|
// Example:
|
|||
|
|
//
|
|||
|
|
// let config = PaddleOcrConfig::new("en")
|
|||
|
|
// .with_cache_dir(PathBuf::from("/tmp/paddle-cache"));
|
|||
|
|
func (r *PaddleOcrConfig) WithCacheDir(path string) (*PaddleOcrConfig, error) {
|
|||
|
|
cPath := C.CString(path)
|
|||
|
|
defer C.free(unsafe.Pointer(cPath))
|
|||
|
|
|
|||
|
|
jsonBytesRecv, err := json.Marshal(r)
|
|||
|
|
if err != nil {
|
|||
|
|
return nil, fmt.Errorf("failed to marshal receiver: %w", err)
|
|||
|
|
}
|
|||
|
|
tmpStrRecv := C.CString(string(jsonBytesRecv))
|
|||
|
|
cRecv := C.kreuzberg_paddle_ocr_config_from_json(tmpStrRecv)
|
|||
|
|
C.free(unsafe.Pointer(tmpStrRecv))
|
|||
|
|
if cRecv == nil {
|
|||
|
|
return nil, fmt.Errorf("failed to create receiver: %s", C.GoString(C.kreuzberg_last_error_context()))
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_paddle_ocr_config_free(cRecv)
|
|||
|
|
ptr := C.kreuzberg_paddle_ocr_config_with_cache_dir(cRecv, cPath)
|
|||
|
|
defer C.kreuzberg_paddle_ocr_config_free(ptr)
|
|||
|
|
return func() *PaddleOcrConfig {
|
|||
|
|
jsonPtr := C.kreuzberg_paddle_ocr_config_to_json(ptr)
|
|||
|
|
if jsonPtr == nil {
|
|||
|
|
return nil
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_free_string(jsonPtr)
|
|||
|
|
var result PaddleOcrConfig
|
|||
|
|
if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
|
|||
|
|
return nil
|
|||
|
|
}
|
|||
|
|
return &result
|
|||
|
|
}(), nil
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// WithTableDetection enables or disables table structure detection.
|
|||
|
|
//
|
|||
|
|
// Arguments:
|
|||
|
|
// - enable: Whether to enable table detection
|
|||
|
|
//
|
|||
|
|
// Example:
|
|||
|
|
//
|
|||
|
|
// let config = PaddleOcrConfig::new("en")
|
|||
|
|
// .with_table_detection(true);
|
|||
|
|
func (r *PaddleOcrConfig) WithTableDetection(enable bool) (*PaddleOcrConfig, error) {
|
|||
|
|
var cEnable C.int32_t
|
|||
|
|
if enable {
|
|||
|
|
cEnable = 1
|
|||
|
|
} else {
|
|||
|
|
cEnable = 0
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
jsonBytesRecv, err := json.Marshal(r)
|
|||
|
|
if err != nil {
|
|||
|
|
return nil, fmt.Errorf("failed to marshal receiver: %w", err)
|
|||
|
|
}
|
|||
|
|
tmpStrRecv := C.CString(string(jsonBytesRecv))
|
|||
|
|
cRecv := C.kreuzberg_paddle_ocr_config_from_json(tmpStrRecv)
|
|||
|
|
C.free(unsafe.Pointer(tmpStrRecv))
|
|||
|
|
if cRecv == nil {
|
|||
|
|
return nil, fmt.Errorf("failed to create receiver: %s", C.GoString(C.kreuzberg_last_error_context()))
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_paddle_ocr_config_free(cRecv)
|
|||
|
|
ptr := C.kreuzberg_paddle_ocr_config_with_table_detection(cRecv, cEnable)
|
|||
|
|
defer C.kreuzberg_paddle_ocr_config_free(ptr)
|
|||
|
|
return func() *PaddleOcrConfig {
|
|||
|
|
jsonPtr := C.kreuzberg_paddle_ocr_config_to_json(ptr)
|
|||
|
|
if jsonPtr == nil {
|
|||
|
|
return nil
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_free_string(jsonPtr)
|
|||
|
|
var result PaddleOcrConfig
|
|||
|
|
if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
|
|||
|
|
return nil
|
|||
|
|
}
|
|||
|
|
return &result
|
|||
|
|
}(), nil
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// WithAngleCls enables or disables angle classification for rotated text.
|
|||
|
|
//
|
|||
|
|
// Arguments:
|
|||
|
|
// - enable: Whether to enable angle classification
|
|||
|
|
func (r *PaddleOcrConfig) WithAngleCls(enable bool) (*PaddleOcrConfig, error) {
|
|||
|
|
var cEnable C.int32_t
|
|||
|
|
if enable {
|
|||
|
|
cEnable = 1
|
|||
|
|
} else {
|
|||
|
|
cEnable = 0
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
jsonBytesRecv, err := json.Marshal(r)
|
|||
|
|
if err != nil {
|
|||
|
|
return nil, fmt.Errorf("failed to marshal receiver: %w", err)
|
|||
|
|
}
|
|||
|
|
tmpStrRecv := C.CString(string(jsonBytesRecv))
|
|||
|
|
cRecv := C.kreuzberg_paddle_ocr_config_from_json(tmpStrRecv)
|
|||
|
|
C.free(unsafe.Pointer(tmpStrRecv))
|
|||
|
|
if cRecv == nil {
|
|||
|
|
return nil, fmt.Errorf("failed to create receiver: %s", C.GoString(C.kreuzberg_last_error_context()))
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_paddle_ocr_config_free(cRecv)
|
|||
|
|
ptr := C.kreuzberg_paddle_ocr_config_with_angle_cls(cRecv, cEnable)
|
|||
|
|
defer C.kreuzberg_paddle_ocr_config_free(ptr)
|
|||
|
|
return func() *PaddleOcrConfig {
|
|||
|
|
jsonPtr := C.kreuzberg_paddle_ocr_config_to_json(ptr)
|
|||
|
|
if jsonPtr == nil {
|
|||
|
|
return nil
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_free_string(jsonPtr)
|
|||
|
|
var result PaddleOcrConfig
|
|||
|
|
if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
|
|||
|
|
return nil
|
|||
|
|
}
|
|||
|
|
return &result
|
|||
|
|
}(), nil
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// WithDetDbThresh sets the database threshold for text detection.
|
|||
|
|
//
|
|||
|
|
// Arguments:
|
|||
|
|
// - threshold: Detection threshold (0.0-1.0)
|
|||
|
|
func (r *PaddleOcrConfig) WithDetDbThresh(threshold float32) (*PaddleOcrConfig, error) {
|
|||
|
|
cThreshold := C.float(float32(threshold))
|
|||
|
|
|
|||
|
|
jsonBytesRecv, err := json.Marshal(r)
|
|||
|
|
if err != nil {
|
|||
|
|
return nil, fmt.Errorf("failed to marshal receiver: %w", err)
|
|||
|
|
}
|
|||
|
|
tmpStrRecv := C.CString(string(jsonBytesRecv))
|
|||
|
|
cRecv := C.kreuzberg_paddle_ocr_config_from_json(tmpStrRecv)
|
|||
|
|
C.free(unsafe.Pointer(tmpStrRecv))
|
|||
|
|
if cRecv == nil {
|
|||
|
|
return nil, fmt.Errorf("failed to create receiver: %s", C.GoString(C.kreuzberg_last_error_context()))
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_paddle_ocr_config_free(cRecv)
|
|||
|
|
ptr := C.kreuzberg_paddle_ocr_config_with_det_db_thresh(cRecv, cThreshold)
|
|||
|
|
defer C.kreuzberg_paddle_ocr_config_free(ptr)
|
|||
|
|
return func() *PaddleOcrConfig {
|
|||
|
|
jsonPtr := C.kreuzberg_paddle_ocr_config_to_json(ptr)
|
|||
|
|
if jsonPtr == nil {
|
|||
|
|
return nil
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_free_string(jsonPtr)
|
|||
|
|
var result PaddleOcrConfig
|
|||
|
|
if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
|
|||
|
|
return nil
|
|||
|
|
}
|
|||
|
|
return &result
|
|||
|
|
}(), nil
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// WithDetDbBoxThresh sets the box threshold for text bounding box refinement.
|
|||
|
|
//
|
|||
|
|
// Arguments:
|
|||
|
|
// - threshold: Box threshold (0.0-1.0)
|
|||
|
|
func (r *PaddleOcrConfig) WithDetDbBoxThresh(threshold float32) (*PaddleOcrConfig, error) {
|
|||
|
|
cThreshold := C.float(float32(threshold))
|
|||
|
|
|
|||
|
|
jsonBytesRecv, err := json.Marshal(r)
|
|||
|
|
if err != nil {
|
|||
|
|
return nil, fmt.Errorf("failed to marshal receiver: %w", err)
|
|||
|
|
}
|
|||
|
|
tmpStrRecv := C.CString(string(jsonBytesRecv))
|
|||
|
|
cRecv := C.kreuzberg_paddle_ocr_config_from_json(tmpStrRecv)
|
|||
|
|
C.free(unsafe.Pointer(tmpStrRecv))
|
|||
|
|
if cRecv == nil {
|
|||
|
|
return nil, fmt.Errorf("failed to create receiver: %s", C.GoString(C.kreuzberg_last_error_context()))
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_paddle_ocr_config_free(cRecv)
|
|||
|
|
ptr := C.kreuzberg_paddle_ocr_config_with_det_db_box_thresh(cRecv, cThreshold)
|
|||
|
|
defer C.kreuzberg_paddle_ocr_config_free(ptr)
|
|||
|
|
return func() *PaddleOcrConfig {
|
|||
|
|
jsonPtr := C.kreuzberg_paddle_ocr_config_to_json(ptr)
|
|||
|
|
if jsonPtr == nil {
|
|||
|
|
return nil
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_free_string(jsonPtr)
|
|||
|
|
var result PaddleOcrConfig
|
|||
|
|
if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
|
|||
|
|
return nil
|
|||
|
|
}
|
|||
|
|
return &result
|
|||
|
|
}(), nil
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// WithDetDbUnclipRatio sets the unclip ratio for expanding text bounding boxes.
|
|||
|
|
//
|
|||
|
|
// Arguments:
|
|||
|
|
// - ratio: Unclip ratio (typically 1.5-2.0)
|
|||
|
|
func (r *PaddleOcrConfig) WithDetDbUnclipRatio(ratio float32) (*PaddleOcrConfig, error) {
|
|||
|
|
cRatio := C.float(float32(ratio))
|
|||
|
|
|
|||
|
|
jsonBytesRecv, err := json.Marshal(r)
|
|||
|
|
if err != nil {
|
|||
|
|
return nil, fmt.Errorf("failed to marshal receiver: %w", err)
|
|||
|
|
}
|
|||
|
|
tmpStrRecv := C.CString(string(jsonBytesRecv))
|
|||
|
|
cRecv := C.kreuzberg_paddle_ocr_config_from_json(tmpStrRecv)
|
|||
|
|
C.free(unsafe.Pointer(tmpStrRecv))
|
|||
|
|
if cRecv == nil {
|
|||
|
|
return nil, fmt.Errorf("failed to create receiver: %s", C.GoString(C.kreuzberg_last_error_context()))
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_paddle_ocr_config_free(cRecv)
|
|||
|
|
ptr := C.kreuzberg_paddle_ocr_config_with_det_db_unclip_ratio(cRecv, cRatio)
|
|||
|
|
defer C.kreuzberg_paddle_ocr_config_free(ptr)
|
|||
|
|
return func() *PaddleOcrConfig {
|
|||
|
|
jsonPtr := C.kreuzberg_paddle_ocr_config_to_json(ptr)
|
|||
|
|
if jsonPtr == nil {
|
|||
|
|
return nil
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_free_string(jsonPtr)
|
|||
|
|
var result PaddleOcrConfig
|
|||
|
|
if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
|
|||
|
|
return nil
|
|||
|
|
}
|
|||
|
|
return &result
|
|||
|
|
}(), nil
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// WithDetLimitSideLen sets the maximum side length for detection images.
|
|||
|
|
//
|
|||
|
|
// Arguments:
|
|||
|
|
// - length: Maximum side length in pixels
|
|||
|
|
func (r *PaddleOcrConfig) WithDetLimitSideLen(length uint32) (*PaddleOcrConfig, error) {
|
|||
|
|
cLength := C.uint32_t(uint32(length))
|
|||
|
|
|
|||
|
|
jsonBytesRecv, err := json.Marshal(r)
|
|||
|
|
if err != nil {
|
|||
|
|
return nil, fmt.Errorf("failed to marshal receiver: %w", err)
|
|||
|
|
}
|
|||
|
|
tmpStrRecv := C.CString(string(jsonBytesRecv))
|
|||
|
|
cRecv := C.kreuzberg_paddle_ocr_config_from_json(tmpStrRecv)
|
|||
|
|
C.free(unsafe.Pointer(tmpStrRecv))
|
|||
|
|
if cRecv == nil {
|
|||
|
|
return nil, fmt.Errorf("failed to create receiver: %s", C.GoString(C.kreuzberg_last_error_context()))
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_paddle_ocr_config_free(cRecv)
|
|||
|
|
ptr := C.kreuzberg_paddle_ocr_config_with_det_limit_side_len(cRecv, cLength)
|
|||
|
|
defer C.kreuzberg_paddle_ocr_config_free(ptr)
|
|||
|
|
return func() *PaddleOcrConfig {
|
|||
|
|
jsonPtr := C.kreuzberg_paddle_ocr_config_to_json(ptr)
|
|||
|
|
if jsonPtr == nil {
|
|||
|
|
return nil
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_free_string(jsonPtr)
|
|||
|
|
var result PaddleOcrConfig
|
|||
|
|
if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
|
|||
|
|
return nil
|
|||
|
|
}
|
|||
|
|
return &result
|
|||
|
|
}(), nil
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// WithRecBatchNum sets the batch size for recognition inference.
|
|||
|
|
//
|
|||
|
|
// Arguments:
|
|||
|
|
// - batch_size: Number of text regions to process simultaneously
|
|||
|
|
func (r *PaddleOcrConfig) WithRecBatchNum(batchSize uint32) (*PaddleOcrConfig, error) {
|
|||
|
|
cBatchSize := C.uint32_t(uint32(batchSize))
|
|||
|
|
|
|||
|
|
jsonBytesRecv, err := json.Marshal(r)
|
|||
|
|
if err != nil {
|
|||
|
|
return nil, fmt.Errorf("failed to marshal receiver: %w", err)
|
|||
|
|
}
|
|||
|
|
tmpStrRecv := C.CString(string(jsonBytesRecv))
|
|||
|
|
cRecv := C.kreuzberg_paddle_ocr_config_from_json(tmpStrRecv)
|
|||
|
|
C.free(unsafe.Pointer(tmpStrRecv))
|
|||
|
|
if cRecv == nil {
|
|||
|
|
return nil, fmt.Errorf("failed to create receiver: %s", C.GoString(C.kreuzberg_last_error_context()))
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_paddle_ocr_config_free(cRecv)
|
|||
|
|
ptr := C.kreuzberg_paddle_ocr_config_with_rec_batch_num(cRecv, cBatchSize)
|
|||
|
|
defer C.kreuzberg_paddle_ocr_config_free(ptr)
|
|||
|
|
return func() *PaddleOcrConfig {
|
|||
|
|
jsonPtr := C.kreuzberg_paddle_ocr_config_to_json(ptr)
|
|||
|
|
if jsonPtr == nil {
|
|||
|
|
return nil
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_free_string(jsonPtr)
|
|||
|
|
var result PaddleOcrConfig
|
|||
|
|
if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
|
|||
|
|
return nil
|
|||
|
|
}
|
|||
|
|
return &result
|
|||
|
|
}(), nil
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// WithDropScore sets the minimum recognition confidence threshold.
|
|||
|
|
//
|
|||
|
|
// Arguments:
|
|||
|
|
// - score: Minimum confidence (0.0-1.0), text below this is dropped
|
|||
|
|
func (r *PaddleOcrConfig) WithDropScore(score float32) (*PaddleOcrConfig, error) {
|
|||
|
|
cScore := C.float(float32(score))
|
|||
|
|
|
|||
|
|
jsonBytesRecv, err := json.Marshal(r)
|
|||
|
|
if err != nil {
|
|||
|
|
return nil, fmt.Errorf("failed to marshal receiver: %w", err)
|
|||
|
|
}
|
|||
|
|
tmpStrRecv := C.CString(string(jsonBytesRecv))
|
|||
|
|
cRecv := C.kreuzberg_paddle_ocr_config_from_json(tmpStrRecv)
|
|||
|
|
C.free(unsafe.Pointer(tmpStrRecv))
|
|||
|
|
if cRecv == nil {
|
|||
|
|
return nil, fmt.Errorf("failed to create receiver: %s", C.GoString(C.kreuzberg_last_error_context()))
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_paddle_ocr_config_free(cRecv)
|
|||
|
|
ptr := C.kreuzberg_paddle_ocr_config_with_drop_score(cRecv, cScore)
|
|||
|
|
defer C.kreuzberg_paddle_ocr_config_free(ptr)
|
|||
|
|
return func() *PaddleOcrConfig {
|
|||
|
|
jsonPtr := C.kreuzberg_paddle_ocr_config_to_json(ptr)
|
|||
|
|
if jsonPtr == nil {
|
|||
|
|
return nil
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_free_string(jsonPtr)
|
|||
|
|
var result PaddleOcrConfig
|
|||
|
|
if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
|
|||
|
|
return nil
|
|||
|
|
}
|
|||
|
|
return &result
|
|||
|
|
}(), nil
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// WithPadding sets padding in pixels added around images before detection.
|
|||
|
|
//
|
|||
|
|
// Arguments:
|
|||
|
|
// - padding: Padding in pixels (0-100)
|
|||
|
|
func (r *PaddleOcrConfig) WithPadding(padding uint32) (*PaddleOcrConfig, error) {
|
|||
|
|
cPadding := C.uint32_t(uint32(padding))
|
|||
|
|
|
|||
|
|
jsonBytesRecv, err := json.Marshal(r)
|
|||
|
|
if err != nil {
|
|||
|
|
return nil, fmt.Errorf("failed to marshal receiver: %w", err)
|
|||
|
|
}
|
|||
|
|
tmpStrRecv := C.CString(string(jsonBytesRecv))
|
|||
|
|
cRecv := C.kreuzberg_paddle_ocr_config_from_json(tmpStrRecv)
|
|||
|
|
C.free(unsafe.Pointer(tmpStrRecv))
|
|||
|
|
if cRecv == nil {
|
|||
|
|
return nil, fmt.Errorf("failed to create receiver: %s", C.GoString(C.kreuzberg_last_error_context()))
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_paddle_ocr_config_free(cRecv)
|
|||
|
|
ptr := C.kreuzberg_paddle_ocr_config_with_padding(cRecv, cPadding)
|
|||
|
|
defer C.kreuzberg_paddle_ocr_config_free(ptr)
|
|||
|
|
return func() *PaddleOcrConfig {
|
|||
|
|
jsonPtr := C.kreuzberg_paddle_ocr_config_to_json(ptr)
|
|||
|
|
if jsonPtr == nil {
|
|||
|
|
return nil
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_free_string(jsonPtr)
|
|||
|
|
var result PaddleOcrConfig
|
|||
|
|
if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
|
|||
|
|
return nil
|
|||
|
|
}
|
|||
|
|
return &result
|
|||
|
|
}(), nil
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// WithModelTier sets the model tier controlling detection/recognition model size.
|
|||
|
|
//
|
|||
|
|
// Arguments:
|
|||
|
|
// - tier: `"mobile"` (default, lightweight, faster) or `"server"` (high accuracy, GPU/complex documents)
|
|||
|
|
func (r *PaddleOcrConfig) WithModelTier(tier string) (*PaddleOcrConfig, error) {
|
|||
|
|
cTier := C.CString(tier)
|
|||
|
|
defer C.free(unsafe.Pointer(cTier))
|
|||
|
|
|
|||
|
|
jsonBytesRecv, err := json.Marshal(r)
|
|||
|
|
if err != nil {
|
|||
|
|
return nil, fmt.Errorf("failed to marshal receiver: %w", err)
|
|||
|
|
}
|
|||
|
|
tmpStrRecv := C.CString(string(jsonBytesRecv))
|
|||
|
|
cRecv := C.kreuzberg_paddle_ocr_config_from_json(tmpStrRecv)
|
|||
|
|
C.free(unsafe.Pointer(tmpStrRecv))
|
|||
|
|
if cRecv == nil {
|
|||
|
|
return nil, fmt.Errorf("failed to create receiver: %s", C.GoString(C.kreuzberg_last_error_context()))
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_paddle_ocr_config_free(cRecv)
|
|||
|
|
ptr := C.kreuzberg_paddle_ocr_config_with_model_tier(cRecv, cTier)
|
|||
|
|
defer C.kreuzberg_paddle_ocr_config_free(ptr)
|
|||
|
|
return func() *PaddleOcrConfig {
|
|||
|
|
jsonPtr := C.kreuzberg_paddle_ocr_config_to_json(ptr)
|
|||
|
|
if jsonPtr == nil {
|
|||
|
|
return nil
|
|||
|
|
}
|
|||
|
|
defer C.kreuzberg_free_string(jsonPtr)
|
|||
|
|
var result PaddleOcrConfig
|
|||
|
|
if err := json.Unmarshal([]byte(C.GoString(jsonPtr)), &result); err != nil {
|
|||
|
|
return nil
|
|||
|
|
}
|
|||
|
|
return &result
|
|||
|
|
}(), nil
|
|||
|
|
}
|