Files
fil/packages/r/R/extendr-wrappers.R

3519 lines
135 KiB
R
Raw Normal View History

2026-06-01 23:40:55 +02:00
# Generated by extendr: Do not edit by hand
#
# This file is regenerated by alef on every `alef generate` run.
# It mirrors the output of `rextendr::document()` and binds every
# wrap__<symbol> entry registered in extendr_module! to an R-callable
# function or class env.
#' @useDynLib kreuzberg, .registration = TRUE
NULL
#' Extract content from in-memory bytes
#'
#' Main entry point for extracting a document that is already held in memory.
#' The extraction proceeds through these stages:
#' 1. Validate the MIME type
#' 2. Convert legacy formats when required
#' 3. Pick a matching extractor from the registry
#' 4. Extract the content
#' 5. Run the post-processing pipeline
#' @param content The bytes to extract from.
#' @param mime_type MIME type describing `content`.
#' @param config Extraction configuration; defaults to
#'   `ExtractionConfig$default()`.
#' @return An `ExtractionResult` holding the extracted content and metadata.
#'
#' @section Errors:
#' A `KreuzbergError::Validation` error is reported when the MIME type is
#' invalid.
#' A `KreuzbergError::UnsupportedFormat` error is reported when the MIME type
#' is not supported.
#' @export
extract_bytes <- function(content,
                          mime_type,
                          config = ExtractionConfig$default()) {
  .Call(
    "wrap__extract_bytes",
    content, mime_type, config,
    PACKAGE = "kreuzberg"
  )
}
#' Extract content from a file on disk
#'
#' Main entry point for path-based extraction. The extraction proceeds through
#' these stages:
#' 1. Look for an existing cached result (when caching is enabled)
#' 2. Detect or validate the MIME type
#' 3. Pick a matching extractor from the registry
#' 4. Extract the content
#' 5. Run the post-processing pipeline
#' 6. Write the result to the cache (when caching is enabled)
#' @param path Path of the file to extract.
#' @param mime_type Optional MIME type override. When left as `NULL` the type
#'   is auto-detected.
#' @param config Extraction configuration; defaults to
#'   `ExtractionConfig$default()`.
#' @return An `ExtractionResult` holding the extracted content and metadata.
#'
#' @section Errors:
#' A `KreuzbergError::Io` error is reported when the file does not exist
#' (NotFound) or another file I/O error occurs.
#' A `KreuzbergError::UnsupportedFormat` error is reported when the MIME type
#' is not supported.
#' @export
extract_file <- function(path,
                         mime_type = NULL,
                         config = ExtractionConfig$default()) {
  .Call(
    "wrap__extract_file",
    path, mime_type, config,
    PACKAGE = "kreuzberg"
  )
}
#' Blocking variant of `extract_file`
#'
#' Convenience wrapper that blocks the current thread until the extraction has
#' finished. Asynchronous callers should use `extract_file` directly.
#'
#' The call always runs on the global Tokio runtime, which is 100x+ faster than
#' creating a runtime per call and avoids nested-runtime problems.
#'
#' Only available when the `tokio-runtime` feature is enabled. WASM targets
#' should use a truly synchronous extraction approach instead.
#' @param path File path as a character string.
#' @param mime_type Optional MIME type as a character string; defaults to
#'   `NULL`.
#' @param config ExtractionConfig object (list with class attribute).
#' @return ExtractionResult object (list with class attribute).
#' @export
extract_file_sync <- function(path,
                              mime_type = NULL,
                              config = ExtractionConfig$default()) {
  .Call(
    "wrap__extract_file_sync",
    path, mime_type, config,
    PACKAGE = "kreuzberg"
  )
}
#' Blocking variant of `extract_bytes`
#'
#' Runs on the global Tokio runtime, which is 100x+ faster than creating a new
#' runtime for every call.
#'
#' When the `tokio-runtime` feature is enabled the current thread is blocked on
#' the global Tokio runtime. Without that feature (WASM) a truly synchronous
#' implementation is called instead.
#' @param content Raw vector of bytes.
#' @param mime_type Character string.
#' @param config ExtractionConfig object (list with class attribute).
#' @return ExtractionResult object (list with class attribute).
#' @export
extract_bytes_sync <- function(content,
                               mime_type,
                               config = ExtractionConfig$default()) {
  .Call(
    "wrap__extract_bytes_sync",
    content, mime_type, config,
    PACKAGE = "kreuzberg"
  )
}
#' Blocking variant of `batch_extract_files`
#'
#' Runs on the global Tokio runtime for optimal performance.
#' Requires the `tokio-runtime` feature (WASM has no filesystem).
#' @param items List of BatchFileItem objects (lists with class attribute).
#' @param config ExtractionConfig object (list with class attribute).
#' @return List of ExtractionResult objects (lists with class attribute).
#' @export
batch_extract_files_sync <- function(items,
                                     config = ExtractionConfig$default()) {
  .Call(
    "wrap__batch_extract_files_sync",
    items, config,
    PACKAGE = "kreuzberg"
  )
}
#' Blocking variant of `batch_extract_bytes`
#'
#' Runs on the global Tokio runtime for optimal performance.
#' When the `tokio-runtime` feature is enabled the current thread is blocked on
#' the global Tokio runtime. Without that feature (WASM) a truly synchronous
#' implementation walks the items and calls `extract_bytes_sync()` for each.
#' @param items List of BatchBytesItem objects (lists with class attribute).
#' @param config ExtractionConfig object (list with class attribute).
#' @return List of ExtractionResult objects (lists with class attribute).
#' @export
batch_extract_bytes_sync <- function(items,
                                     config = ExtractionConfig$default()) {
  .Call(
    "wrap__batch_extract_bytes_sync",
    items, config,
    PACKAGE = "kreuzberg"
  )
}
#' Extract content from several files concurrently
#'
#' Files are processed in parallel while concurrency is managed automatically
#' to prevent resource exhaustion. The limit comes from
#' `ExtractionConfig::max_concurrent_extractions` and otherwise defaults to
#' `(num_cpus * 1.5).ceil()`.
#'
#' Every file may carry a `FileExtractionConfig` that overrides individual
#' fields of the batch-level `config`. Leaving the per-file configuration unset
#' (`None` on the Rust side) falls back to the batch defaults. Batch-level
#' settings such as `max_concurrent_extractions` and `use_cache` are always
#' read from the batch-level `config`.
#' @param items Vector of `BatchFileItem` structs, each holding a path and
#'   optional per-file configuration overrides.
#' @param config Batch-level extraction configuration (supplies defaults and
#'   batch settings).
#' @return A vector of `ExtractionResult` in the same order as the input items.
#'
#' @section Errors:
#' Errors for individual files are captured in the result metadata. System
#' errors (IO, RuntimeError equivalents) bubble up and fail the entire batch.
#' @export
batch_extract_files <- function(items, config = ExtractionConfig$default()) {
  .Call(
    "wrap__batch_extract_files",
    items, config,
    PACKAGE = "kreuzberg"
  )
}
#' Extract content from several byte arrays concurrently
#'
#' Byte arrays are processed in parallel while concurrency is managed
#' automatically to prevent resource exhaustion. The limit comes from
#' `ExtractionConfig::max_concurrent_extractions` and otherwise defaults to
#' `(num_cpus * 1.5).ceil()`.
#'
#' Every item may carry a `FileExtractionConfig` that overrides individual
#' fields of the batch-level `config`. Leaving the per-item configuration unset
#' (`None` on the Rust side) applies the batch-level defaults to that item.
#' @param items Vector of `BatchBytesItem` structs, each holding content bytes,
#'   a MIME type, and optional per-item configuration overrides.
#' @param config Batch-level extraction configuration.
#' @return A vector of `ExtractionResult` in the same order as the input items.
#' @export
batch_extract_bytes <- function(items, config = ExtractionConfig$default()) {
  .Call(
    "wrap__batch_extract_bytes",
    items, config,
    PACKAGE = "kreuzberg"
  )
}
#' Detect a MIME type from raw file bytes
#'
#' The file type is identified from magic byte signatures in the content, with
#' the `infer` crate as a fallback for comprehensive detection.
#'
#' ZIP-based files have their contents inspected so that Office Open XML
#' formats (DOCX, XLSX, PPTX) can be told apart from plain ZIP archives.
#' @param content Raw file bytes.
#' @return The detected MIME type string.
#'
#' @section Errors:
#' A `KreuzbergError::UnsupportedFormat` error is reported when the MIME type
#' cannot be determined.
#' @export
detect_mime_type_from_bytes <- function(content) {
  .Call(
    "wrap__detect_mime_type_from_bytes",
    content,
    PACKAGE = "kreuzberg"
  )
}
#' Look up the file extensions of a MIME type
#'
#' Yields every known file extension that maps to the given MIME type.
#' @param mime_type The MIME type to look up.
#' @return A vector of file extensions (without leading dot) for the MIME type.
#' @export
get_extensions_for_mime <- function(mime_type) {
  .Call(
    "wrap__get_extensions_for_mime",
    mime_type,
    PACKAGE = "kreuzberg"
  )
}
#' Names of all registered embedding backends
#'
#' Consumed by `kreuzberg-cli`, the api/mcp endpoints, and generated language
#' bindings.
#' @return List of character string.
#' @export
list_embedding_backends <- function() {
  .Call("wrap__list_embedding_backends", PACKAGE = "kreuzberg")
}
#' Names of all registered document extractors
#' @return List of character string.
#' @export
list_document_extractors <- function() {
  .Call("wrap__list_document_extractors", PACKAGE = "kreuzberg")
}
#' Names of all registered OCR backends
#'
#' Reports the name of every OCR backend currently registered in the global
#' registry.
#' @return A vector of OCR backend names.
#' @export
list_ocr_backends <- function() {
  .Call("wrap__list_ocr_backends", PACKAGE = "kreuzberg")
}
#' Names of all registered post-processors
#'
#' Reports the name of every post-processor currently registered in the global
#' registry.
#' @return A vector of post-processor names.
#'
#' @section Errors:
#' Fails if the registry lock is poisoned.
#' @export
list_post_processors <- function() {
  .Call("wrap__list_post_processors", PACKAGE = "kreuzberg")
}
#' Names of all registered renderers
#' @return List of character string.
#'
#' @section Errors:
#' Fails if the registry lock is poisoned.
#' @export
list_renderers <- function() {
  .Call("wrap__list_renderers", PACKAGE = "kreuzberg")
}
#' Names of all registered validators
#' @return List of character string.
#' @export
list_validators <- function() {
  .Call("wrap__list_validators", PACKAGE = "kreuzberg")
}
#' Build a structured diff of two extraction results
#'
#' The comparison is purely structural: no I/O and no side effects. Every field
#' of `ExtractionDiff` is populated according to the supplied `DiffOptions`.
#' @param a The "before" extraction result.
#' @param b The "after" extraction result.
#' @param opts Controls which sections are compared and optional truncation.
#' @return ExtractionDiff object (list with class attribute).
#' @export
compare <- function(a = ExtractionResult$default(),
                    b = ExtractionResult$default(),
                    opts = DiffOptions$default()) {
  .Call("wrap__compare", a, b, opts, PACKAGE = "kreuzberg")
}
#' Asynchronously generate embeddings for a list of texts
#'
#' Async counterpart of `embed_texts`. The blocking ONNX inference work is
#' handed to a dedicated blocking thread pool through Tokio's `spawn_blocking`,
#' which keeps the async executor free.
#'
#' One embedding vector is produced per input text, in input order.
#' @param texts Strings to embed (owned, sent to the blocking thread).
#' @param config Embedding configuration specifying model, batch size, and
#'   normalization.
#' @return List of list of numeric.
#'
#' @section Errors:
#' - `KreuzbergError::MissingDependency` if ONNX Runtime is not installed
#' - `KreuzbergError::Embedding` if the preset name is unknown, the model
#'   download fails, or the blocking inference task panics
#' @export
embed_texts_async <- function(texts, config = EmbeddingConfig$default()) {
  .Call(
    "wrap__embed_texts_async",
    texts, config,
    PACKAGE = "kreuzberg"
  )
}
#' Render one PDF page as PNG bytes
#'
#' Produces raw PNG-encoded bytes for the requested page at the given DPI.
#' Rendering is pure Rust, using pdf_oxide with tiny-skia.
#' @param pdf_bytes Raw PDF file bytes.
#' @param page_index Zero-based page index.
#' @param dpi Resolution in dots per inch (default: 150).
#' @param password Optional password for encrypted PDFs.
#' @return Raw vector of bytes.
#'
#' @section Errors:
#' A `KreuzbergError::Parsing` error is reported when the PDF cannot be opened,
#' authenticated, or rendered, or when `page_index` is out of range.
#' @export
render_pdf_page_to_png <- function(pdf_bytes,
                                   page_index,
                                   dpi = NULL,
                                   password = NULL) {
  .Call(
    "wrap__render_pdf_page_to_png",
    pdf_bytes, page_index, dpi, password,
    PACKAGE = "kreuzberg"
  )
}
#' Detect the MIME type of the file at a path
#'
#' The MIME type is determined from the file extension and, optionally, the
#' file content. Pass `check_exists = TRUE` to verify that the file exists
#' before detection.
#' @param path Character string.
#' @param check_exists Logical (TRUE/FALSE).
#' @return Character string.
#' @export
detect_mime_type <- function(path, check_exists) {
  .Call(
    "wrap__detect_mime_type",
    path, check_exists,
    PACKAGE = "kreuzberg"
  )
}
#' Embed texts with the configured embedding model
#'
#' The result is a 2D vector in which each inner vector is the embedding of
#' the corresponding text.
#' @param texts List of character string.
#' @param config EmbeddingConfig object (list with class attribute).
#' @return List of list of numeric.
#' @export
embed_texts <- function(texts, config = EmbeddingConfig$default()) {
  .Call(
    "wrap__embed_texts",
    texts, config,
    PACKAGE = "kreuzberg"
  )
}
#' Look up an embedding preset by name
#'
#' Yields `None` when no preset carries the given name. The value is an owned
#' clone, so it is safe to pass across FFI boundaries.
#' @param name Character string.
#' @return Optional EmbeddingPreset object (list with class attribute).
#'   Defaults to NULL.
#' @export
get_embedding_preset <- function(name) {
  .Call(
    "wrap__get_embedding_preset",
    name,
    PACKAGE = "kreuzberg"
  )
}
#' Names of all available embedding presets
#'
#' The names are owned `String`s, so they are safe to pass across FFI
#' boundaries.
#' @return List of character string.
#' @export
list_embedding_presets <- function() {
  .Call("wrap__list_embedding_presets", PACKAGE = "kreuzberg")
}
#' Register an OCR backend implemented in R
#'
#' Registers an R-side plugin implementation. Supply a named list whose entries
#' implement the trait's required methods
#' (e.g. `list(name = function() "my", ...)`).
#'
#' @param r_backend Named list of R closures implementing the trait surface.
#'
#' @return Invisible NULL on success; raises an R error on failure.
#' @export
register_ocr_backend <- function(r_backend) {
  .Call(
    "wrap__register_ocr_backend",
    r_backend,
    PACKAGE = "kreuzberg"
  )
}
#' Unregister an OCR backend
#'
#' Removes a previously registered plugin, identified by name.
#'
#' @param name Plugin name string as returned by the backend's `name()` method.
#'
#' @return Invisible NULL on success; raises an R error on failure.
#' @export
unregister_ocr_backend <- function(name) {
  .Call(
    "wrap__unregister_ocr_backend",
    name,
    PACKAGE = "kreuzberg"
  )
}
#' Clear all OCR backends
#'
#' Removes every registered plugin of this type. Typically used in test
#' teardown.
#'
#' @return Invisible NULL on success; raises an R error on failure.
#' @export
clear_ocr_backends <- function() {
  .Call("wrap__clear_ocr_backends", PACKAGE = "kreuzberg")
}
#' Register a post-processor implemented in R
#'
#' Registers an R-side plugin implementation. Supply a named list whose entries
#' implement the trait's required methods
#' (e.g. `list(name = function() "my", ...)`).
#'
#' @param r_backend Named list of R closures implementing the trait surface.
#'
#' @return Invisible NULL on success; raises an R error on failure.
#' @export
register_post_processor <- function(r_backend) {
  .Call(
    "wrap__register_post_processor",
    r_backend,
    PACKAGE = "kreuzberg"
  )
}
#' Unregister a post-processor
#'
#' Removes a previously registered plugin, identified by name.
#'
#' @param name Plugin name string as returned by the backend's `name()` method.
#'
#' @return Invisible NULL on success; raises an R error on failure.
#' @export
unregister_post_processor <- function(name) {
  .Call(
    "wrap__unregister_post_processor",
    name,
    PACKAGE = "kreuzberg"
  )
}
#' Clear all post-processors
#'
#' Removes every registered plugin of this type. Typically used in test
#' teardown.
#'
#' @return Invisible NULL on success; raises an R error on failure.
#' @export
clear_post_processors <- function() {
  .Call("wrap__clear_post_processors", PACKAGE = "kreuzberg")
}
#' Register a validator implemented in R
#'
#' Registers an R-side plugin implementation. Supply a named list whose entries
#' implement the trait's required methods
#' (e.g. `list(name = function() "my", ...)`).
#'
#' @param r_backend Named list of R closures implementing the trait surface.
#'
#' @return Invisible NULL on success; raises an R error on failure.
#' @export
register_validator <- function(r_backend) {
  .Call(
    "wrap__register_validator",
    r_backend,
    PACKAGE = "kreuzberg"
  )
}
#' Unregister a validator
#'
#' Removes a previously registered plugin, identified by name.
#'
#' @param name Plugin name string as returned by the backend's `name()` method.
#'
#' @return Invisible NULL on success; raises an R error on failure.
#' @export
unregister_validator <- function(name) {
  .Call(
    "wrap__unregister_validator",
    name,
    PACKAGE = "kreuzberg"
  )
}
#' Clear all validators
#'
#' Removes every registered plugin of this type. Typically used in test
#' teardown.
#'
#' @return Invisible NULL on success; raises an R error on failure.
#' @export
clear_validators <- function() {
  .Call("wrap__clear_validators", PACKAGE = "kreuzberg")
}
#' Register an embedding backend implemented in R
#'
#' Registers an R-side plugin implementation. Supply a named list whose entries
#' implement the trait's required methods
#' (e.g. `list(name = function() "my", ...)`).
#'
#' @param r_backend Named list of R closures implementing the trait surface.
#'
#' @return Invisible NULL on success; raises an R error on failure.
#' @export
register_embedding_backend <- function(r_backend) {
  .Call(
    "wrap__register_embedding_backend",
    r_backend,
    PACKAGE = "kreuzberg"
  )
}
#' Unregister an embedding backend
#'
#' Removes a previously registered plugin, identified by name.
#'
#' @param name Plugin name string as returned by the backend's `name()` method.
#'
#' @return Invisible NULL on success; raises an R error on failure.
#' @export
unregister_embedding_backend <- function(name) {
  .Call(
    "wrap__unregister_embedding_backend",
    name,
    PACKAGE = "kreuzberg"
  )
}
#' Clear all embedding backends
#'
#' Removes every registered plugin of this type. Typically used in test
#' teardown.
#'
#' @return Invisible NULL on success; raises an R error on failure.
#' @export
clear_embedding_backends <- function() {
  .Call("wrap__clear_embedding_backends", PACKAGE = "kreuzberg")
}
#' Register a document extractor implemented in R
#'
#' Registers an R-side plugin implementation. Supply a named list whose entries
#' implement the trait's required methods
#' (e.g. `list(name = function() "my", ...)`).
#'
#' @param r_backend Named list of R closures implementing the trait surface.
#'
#' @return Invisible NULL on success; raises an R error on failure.
#' @export
register_document_extractor <- function(r_backend) {
  .Call(
    "wrap__register_document_extractor",
    r_backend,
    PACKAGE = "kreuzberg"
  )
}
#' Unregister a document extractor
#'
#' Removes a previously registered plugin, identified by name.
#'
#' @param name Plugin name string as returned by the backend's `name()` method.
#'
#' @return Invisible NULL on success; raises an R error on failure.
#' @export
unregister_document_extractor <- function(name) {
  .Call(
    "wrap__unregister_document_extractor",
    name,
    PACKAGE = "kreuzberg"
  )
}
#' Clear all document extractors
#'
#' Removes every registered plugin of this type. Typically used in test
#' teardown.
#'
#' @return Invisible NULL on success; raises an R error on failure.
#' @export
clear_document_extractors <- function() {
  .Call("wrap__clear_document_extractors", PACKAGE = "kreuzberg")
}
#' Register a renderer implemented in R
#'
#' Registers an R-side plugin implementation. Supply a named list whose entries
#' implement the trait's required methods
#' (e.g. `list(name = function() "my", ...)`).
#'
#' @param r_backend Named list of R closures implementing the trait surface.
#'
#' @return Invisible NULL on success; raises an R error on failure.
#' @export
register_renderer <- function(r_backend) {
  .Call(
    "wrap__register_renderer",
    r_backend,
    PACKAGE = "kreuzberg"
  )
}
#' Unregister a renderer
#'
#' Removes a previously registered plugin, identified by name.
#'
#' @param name Plugin name string as returned by the backend's `name()` method.
#'
#' @return Invisible NULL on success; raises an R error on failure.
#' @export
unregister_renderer <- function(name) {
  .Call(
    "wrap__unregister_renderer",
    name,
    PACKAGE = "kreuzberg"
  )
}
#' Clear all renderers
#'
#' Removes every registered plugin of this type. Typically used in test
#' teardown.
#'
#' @return Invisible NULL on success; raises an R error on failure.
#' @export
clear_renderers <- function() {
  .Call("wrap__clear_renderers", PACKAGE = "kreuzberg")
}
#' CacheStats
#'
#' Environment in which the `$` and `[[` accessors for `CacheStats` objects
#' look up members by name.
#' @field total_files total_files
#' @field total_size_mb total_size_mb
#' @field available_space_mb available_space_mb
#' @field oldest_file_age_days oldest_file_age_days
#' @field newest_file_age_days newest_file_age_days
#' @export
CacheStats <- new.env(parent = emptyenv())
#' @export
`$.CacheStats` <- function(self, name) {
  # Resolve the member from the class environment.
  member <- CacheStats[[name]]
  first_arg <- names(formals(member))[1]
  if (!identical(first_arg, "self")) {
    # Not an instance method: hand the member back untouched.
    return(member)
  }
  # Instance method: bind the receiver as the leading argument.
  function(...) member(self, ...)
}
#' @export
`[[.CacheStats` <- `$.CacheStats`
#' Hardware acceleration settings for ONNX Runtime models
#'
#' Selects the execution provider (CPU, CoreML, CUDA, TensorRT) used for
#' inference in layout detection and embedding generation.
#' @field provider Execution provider to use for ONNX inference.
#' @field device_id GPU device ID (for CUDA/TensorRT). Ignored for CPU/CoreML/Auto.
#' @export
AccelerationConfig <- new.env(parent = emptyenv())
# Static constructor: forwards the `json` argument to the native binding.
AccelerationConfig$from_json <- function(json) {
  .Call(
    "wrap__AccelerationConfig__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.AccelerationConfig` <- function(self, name) {
  # Resolve the member from the class environment.
  member <- AccelerationConfig[[name]]
  first_arg <- names(formals(member))[1]
  if (!identical(first_arg, "self")) {
    # Not an instance method: hand the member back untouched.
    return(member)
  }
  # Instance method: bind the receiver as the leading argument.
  function(...) member(self, ...)
}
#' @export
`[[.AccelerationConfig` <- `$.AccelerationConfig`
#' Content filtering settings shared by all extractors
#'
#' Decides whether "furniture" content (headers, footers, page numbers,
#' watermarks, repeating text) is kept in or stripped from extraction results.
#' The setting applies across all extractors (PDF, DOCX, RTF, ODT, HTML, etc.),
#' each with a format-specific implementation.
#'
#' When this is `None` on `ExtractionConfig`, every extractor keeps its current
#' default behavior unchanged.
#' @field include_headers Include running headers in extraction output.
#' @field include_footers Include running footers in extraction output.
#' @field strip_repeating_text Enable the heuristic cross-page repeating text detector.
#' @field include_watermarks Include watermark text in extraction output.
#' @export
ContentFilterConfig <- new.env(parent = emptyenv())
# Static constructor: calls the native `default` binding.
ContentFilterConfig$default <- function() {
  .Call("wrap__ContentFilterConfig__default", PACKAGE = "kreuzberg")
}
# Static constructor: forwards the `json` argument to the native binding.
ContentFilterConfig$from_json <- function(json) {
  .Call(
    "wrap__ContentFilterConfig__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.ContentFilterConfig` <- function(self, name) {
  # Resolve the member from the class environment.
  member <- ContentFilterConfig[[name]]
  first_arg <- names(formals(member))[1]
  if (!identical(first_arg, "self")) {
    # Not an instance method: hand the member back untouched.
    return(member)
  }
  # Instance method: bind the receiver as the leading argument.
  function(...) member(self, ...)
}
#' @export
`[[.ContentFilterConfig` <- `$.ContentFilterConfig`
#' Email extraction settings
#' @field msg_fallback_codepage Windows codepage number to use when an MSG file contains no codepage property. Defaults
#' @export
EmailConfig <- new.env(parent = emptyenv())
# Static constructor: forwards the `json` argument to the native binding.
EmailConfig$from_json <- function(json) {
  .Call(
    "wrap__EmailConfig__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.EmailConfig` <- function(self, name) {
  # Resolve the member from the class environment.
  member <- EmailConfig[[name]]
  first_arg <- names(formals(member))[1]
  if (!identical(first_arg, "self")) {
    # Not an instance method: hand the member back untouched.
    return(member)
  }
  # Instance method: bind the receiver as the leading argument.
  function(...) member(self, ...)
}
#' @export
`[[.EmailConfig` <- `$.EmailConfig`
#' Main extraction configuration
#'
#' Holds every configuration option of the extraction process. A configuration
#' can be loaded from TOML, YAML, or JSON files, or created programmatically.
#' @field use_cache Enable caching of extraction results
#' @field enable_quality_processing Enable quality post-processing
#' @field ocr OCR configuration (None = OCR disabled)
#' @field force_ocr Force OCR even for searchable PDFs
#' @field force_ocr_pages Force OCR on specific pages only (1-indexed page numbers, must be >= 1).
#' @field disable_ocr Disable OCR entirely, even for images.
#' @field chunking Text chunking configuration (None = chunking disabled)
#' @field content_filter Content filtering configuration (None = use extractor defaults).
#' @field images Image extraction configuration (None = no image extraction)
#' @field pdf_options PDF-specific options (None = use defaults)
#' @field token_reduction Token reduction configuration (None = no token reduction)
#' @field language_detection Language detection configuration (None = no language detection)
#' @field pages Page extraction configuration (None = no page tracking)
#' @field keywords Keyword extraction configuration (None = no keyword extraction)
#' @field postprocessor Post-processor configuration (None = use defaults)
#' @field html_options HTML to Markdown conversion options (None = use defaults)
#' @field html_output Styled HTML output configuration.
#' @field extraction_timeout_secs Default per-file timeout in seconds for batch extraction.
#' @field max_concurrent_extractions Maximum concurrent extractions in batch operations (None = (num_cpus × 1.5), rounded up).
#' @field result_format Result structure format
#' @field security_limits Security limits for archive extraction.
#' @field max_embedded_file_bytes Maximum uncompressed size in bytes for a single embedded file before recursive
#' @field output_format Content text format (default: Plain).
#' @field layout Layout detection configuration (None = layout detection disabled).
#' @field use_layout_for_markdown Run layout detection on the non-OCR PDF markdown path.
#' @field include_document_structure Enable structured document tree output.
#' @field acceleration Hardware acceleration configuration for ONNX Runtime models.
#' @field cache_namespace Cache namespace for tenant isolation.
#' @field cache_ttl_secs Per-request cache TTL in seconds.
#' @field email Email extraction configuration (None = use defaults).
#' @field concurrency Concurrency limits for constrained environments (None = use defaults).
#' @field max_archive_depth Maximum recursion depth for archive extraction (default: 3). Set to 0 to disable recursive
#' @field tree_sitter Tree-sitter language pack configuration (None = tree-sitter disabled).
#' @field structured_extraction Structured extraction via LLM (None = disabled).
#' @field cancel_token Cancellation token for this extraction (None = no external cancellation).
#' @export
ExtractionConfig <- new.env(parent = emptyenv())
# Static constructor: calls the native `default` binding.
ExtractionConfig$default <- function() {
  .Call("wrap__ExtractionConfig__default", PACKAGE = "kreuzberg")
}
# Instance method: the leading `self` formal makes the `$` accessor bind the
# receiver automatically.
ExtractionConfig$needs_image_processing <- function(self) {
  .Call(
    "wrap__ExtractionConfig__needs_image_processing",
    self,
    PACKAGE = "kreuzberg"
  )
}
# Static constructor: forwards the `json` argument to the native binding.
ExtractionConfig$from_json <- function(json) {
  .Call(
    "wrap__ExtractionConfig__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.ExtractionConfig` <- function(self, name) {
  # Resolve the member from the class environment.
  member <- ExtractionConfig[[name]]
  first_arg <- names(formals(member))[1]
  if (!identical(first_arg, "self")) {
    # Not an instance method: hand the member back untouched.
    return(member)
  }
  # Instance method: bind the receiver as the leading argument.
  function(...) member(self, ...)
}
#' @export
`[[.ExtractionConfig` <- `$.ExtractionConfig`
#' @export
needs_image_processing.ExtractionConfig <- function(x, ...) {
  # Delegate to the instance method resolved through the `$` accessor.
  x$needs_image_processing(...)
}
#' Per-file overrides of the extraction configuration for batch processing
#'
#' Every field is an `Option<T>`; `None` means "use the batch-level default."
#' The type is used with `batch_extract_files` and `batch_extract_bytes` so
#' that a single batch can mix heterogeneous extraction settings.
#'
#' @section Excluded fields:
#' These `ExtractionConfig` fields are batch-level only and cannot be
#' overridden per file:
#' - `max_concurrent_extractions` — controls batch parallelism
#' - `use_cache` — global caching policy
#' - `acceleration` — shared ONNX execution provider
#' - `security_limits` — global archive security policy
#' @field enable_quality_processing Override quality post-processing for this file.
#' @field ocr Override OCR configuration for this file (None in the Option = use batch default).
#' @field force_ocr Override force OCR for this file.
#' @field force_ocr_pages Override force OCR pages for this file (1-indexed page numbers).
#' @field disable_ocr Override disable OCR for this file.
#' @field chunking Override chunking configuration for this file.
#' @field content_filter Override content filtering configuration for this file.
#' @field images Override image extraction configuration for this file.
#' @field pdf_options Override PDF options for this file.
#' @field token_reduction Override token reduction for this file.
#' @field language_detection Override language detection for this file.
#' @field pages Override page extraction for this file.
#' @field keywords Override keyword extraction for this file.
#' @field postprocessor Override post-processor for this file.
#' @field html_options Override HTML conversion options for this file.
#' @field result_format Override result format for this file.
#' @field output_format Override output content format for this file.
#' @field include_document_structure Override document structure output for this file.
#' @field layout Override layout detection for this file.
#' @field timeout_secs Override per-file extraction timeout in seconds.
#' @field tree_sitter Override tree-sitter configuration for this file.
#' @field structured_extraction Override structured extraction configuration for this file.
#' @export
FileExtractionConfig <- new.env(parent = emptyenv())
# Static constructor: forwards the `json` argument to the native binding.
FileExtractionConfig$from_json <- function(json) {
  .Call(
    "wrap__FileExtractionConfig__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.FileExtractionConfig` <- function(self, name) {
  # Resolve the member from the class environment.
  member <- FileExtractionConfig[[name]]
  first_arg <- names(formals(member))[1]
  if (!identical(first_arg, "self")) {
    # Not an instance method: hand the member back untouched.
    return(member)
  }
  # Instance method: bind the receiver as the leading argument.
  function(...) member(self, ...)
}
#' @export
`[[.FileExtractionConfig` <- `$.FileExtractionConfig`
#' One item of a byte-array batch extraction
#'
#' Passed to `batch_extract_bytes` and `batch_extract_bytes_sync` to describe
#' a single item of a batch extraction job.
#' @field content The content bytes to extract from
#' @field mime_type MIME type of the content (e.g., "application/pdf", "text/html")
#' @field config Per-item configuration overrides (None uses batch-level defaults)
#' @export
BatchBytesItem <- new.env(parent = emptyenv())
#' @export
`$.BatchBytesItem` <- function(self, name) {
  # Resolve the member from the class environment.
  member <- BatchBytesItem[[name]]
  first_arg <- names(formals(member))[1]
  if (!identical(first_arg, "self")) {
    # Not an instance method: hand the member back untouched.
    return(member)
  }
  # Instance method: bind the receiver as the leading argument.
  function(...) member(self, ...)
}
#' @export
`[[.BatchBytesItem` <- `$.BatchBytesItem`
#' One item of a file batch extraction
#'
#' Passed to `batch_extract_files` and `batch_extract_files_sync` to describe
#' a single file of a batch extraction job.
#' @field path Path to the file to extract from
#' @field config Per-file configuration overrides (None uses batch-level defaults)
#' @export
BatchFileItem <- new.env(parent = emptyenv())
#' @export
`$.BatchFileItem` <- function(self, name) {
  # Resolve the member from the class environment.
  member <- BatchFileItem[[name]]
  first_arg <- names(formals(member))[1]
  if (!identical(first_arg, "self")) {
    # Not an instance method: hand the member back untouched.
    return(member)
  }
  # Instance method: bind the receiver as the leading argument.
  function(...) member(self, ...)
}
#' @export
`[[.BatchFileItem` <- `$.BatchFileItem`
#' Image extraction settings
#' @field extract_images Extract images from documents
#' @field target_dpi Target DPI for image normalization
#' @field max_image_dimension Maximum dimension for images (width or height)
#' @field inject_placeholders Whether to inject image reference placeholders into markdown output. When `true`
#' @field auto_adjust_dpi Automatically adjust DPI based on image content
#' @field min_dpi Minimum DPI threshold
#' @field max_dpi Maximum DPI threshold
#' @field max_images_per_page Maximum number of image objects to extract per PDF page.
#' @field classify When `true` (default), extracted images are classified by kind and grouped into clusters where they
#' @field include_page_rasters When `true`, full-page renders produced during OCR preprocessing are captured and
#' @field run_ocr_on_images Run OCR on extracted images and include the recognized text in the document content.
#' @field ocr_text_only When `true`, image OCR results are rendered as plain text without the `![...](...)` markdown
#' @field append_ocr_text When `true` and `ocr_text_only` is `false`, append the OCR text after the image placeholder
#' @export
ImageExtractionConfig <- new.env(parent = emptyenv())
# Static constructor: calls the native `default` binding.
ImageExtractionConfig$default <- function() {
  .Call("wrap__ImageExtractionConfig__default", PACKAGE = "kreuzberg")
}
# Static constructor: forwards the `json` argument to the native binding.
ImageExtractionConfig$from_json <- function(json) {
  .Call(
    "wrap__ImageExtractionConfig__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.ImageExtractionConfig` <- function(self, name) {
  # Resolve the member from the class environment.
  member <- ImageExtractionConfig[[name]]
  first_arg <- names(formals(member))[1]
  if (!identical(first_arg, "self")) {
    # Not an instance method: hand the member back untouched.
    return(member)
  }
  # Instance method: bind the receiver as the leading argument.
  function(...) member(self, ...)
}
#' @export
`[[.ImageExtractionConfig` <- `$.ImageExtractionConfig`
#' Token reduction configuration
#' @field mode Reduction mode: "off", "light", "moderate", "aggressive", "maximum"
#' @field preserve_important_words Preserve important words (capitalized, technical terms)
#' @export
TokenReductionOptions <- new.env(parent = emptyenv())
# Static constructors; each forwards to a native routine registered under the
# kreuzberg package.
TokenReductionOptions$default <- function() {
  .Call("wrap__TokenReductionOptions__default", PACKAGE = "kreuzberg")
}
TokenReductionOptions$from_json <- function(json) {
  .Call(
    "wrap__TokenReductionOptions__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.TokenReductionOptions` <- function(self, name) {
  # Resolve the member in the class environment (NULL when it is absent).
  member <- TokenReductionOptions[[name]]
  binds_self <- identical(names(formals(member))[1], "self")
  if (!binds_self) {
    return(member)
  }
  # Instance method: pre-apply the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.TokenReductionOptions` <- `$.TokenReductionOptions`
#' Language detection configuration
#' @field enabled Enable language detection
#' @field min_confidence Minimum confidence threshold (0.0-1.0)
#' @field detect_multiple Detect multiple languages in the document
#' @export
LanguageDetectionConfig <- new.env(parent = emptyenv())
# Static constructors; each forwards to a native routine registered under the
# kreuzberg package.
LanguageDetectionConfig$default <- function() {
  .Call("wrap__LanguageDetectionConfig__default", PACKAGE = "kreuzberg")
}
LanguageDetectionConfig$from_json <- function(json) {
  .Call(
    "wrap__LanguageDetectionConfig__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.LanguageDetectionConfig` <- function(self, name) {
  # Resolve the member in the class environment (NULL when it is absent).
  member <- LanguageDetectionConfig[[name]]
  binds_self <- identical(names(formals(member))[1], "self")
  if (!binds_self) {
    return(member)
  }
  # Instance method: pre-apply the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.LanguageDetectionConfig` <- `$.LanguageDetectionConfig`
#' Configuration for styled HTML output
#'
#' When set on [`ExtractionConfig::html_output`] alongside
#' `output_format = OutputFormat::Html`, the pipeline builds a
#' [`StyledHtmlRenderer`](crate::rendering::StyledHtmlRenderer) instead of
#' the plain comrak-based renderer.
#' @field css Inline CSS string injected into the output after the theme stylesheet. Concatenated after `css_file`
#' @field css_file Path to a CSS file loaded once at renderer construction time. Concatenated before `css` when both
#' @field theme Built-in colour/typography theme. Default: [`HtmlTheme::Unstyled`].
#' @field class_prefix CSS class prefix applied to every emitted class name.
#' @field embed_css When `true` (default), write the resolved CSS into a `<style>` block immediately after the opening
#' @export
HtmlOutputConfig <- new.env(parent = emptyenv())
# Static constructors; each forwards to a native routine registered under the
# kreuzberg package.
HtmlOutputConfig$default <- function() {
  .Call("wrap__HtmlOutputConfig__default", PACKAGE = "kreuzberg")
}
HtmlOutputConfig$from_json <- function(json) {
  .Call(
    "wrap__HtmlOutputConfig__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.HtmlOutputConfig` <- function(self, name) {
  # Resolve the member in the class environment (NULL when it is absent).
  member <- HtmlOutputConfig[[name]]
  binds_self <- identical(names(formals(member))[1], "self")
  if (!binds_self) {
    return(member)
  }
  # Instance method: pre-apply the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.HtmlOutputConfig` <- `$.HtmlOutputConfig`
#' Layout detection configuration
#'
#' Controls layout detection behavior in the extraction pipeline.
#' When set on [`ExtractionConfig`](super::ExtractionConfig), layout detection
#' is enabled for PDF extraction.
#' @field confidence_threshold Confidence threshold override (None = use model default).
#' @field apply_heuristics Whether to apply postprocessing heuristics (default: true).
#' @field table_model Table structure recognition model.
#' @field acceleration Hardware acceleration for ONNX models (layout detection + table structure).
#' @export
LayoutDetectionConfig <- new.env(parent = emptyenv())
# Static constructors; each forwards to a native routine registered under the
# kreuzberg package.
LayoutDetectionConfig$default <- function() {
  .Call("wrap__LayoutDetectionConfig__default", PACKAGE = "kreuzberg")
}
LayoutDetectionConfig$from_json <- function(json) {
  .Call(
    "wrap__LayoutDetectionConfig__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.LayoutDetectionConfig` <- function(self, name) {
  # Resolve the member in the class environment (NULL when it is absent).
  member <- LayoutDetectionConfig[[name]]
  binds_self <- identical(names(formals(member))[1], "self")
  if (!binds_self) {
    return(member)
  }
  # Instance method: pre-apply the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.LayoutDetectionConfig` <- `$.LayoutDetectionConfig`
#' Configuration for an LLM provider/model via liter-llm
#'
#' Each feature (VLM OCR, VLM embeddings, structured extraction) carries
#' its own `LlmConfig`, allowing different providers per feature.
#' @field model Provider/model string using liter-llm routing format.
#' @field api_key API key for the provider. When `None`, liter-llm falls back to the provider's standard environment
#' @field base_url Custom base URL override for the provider endpoint.
#' @field timeout_secs Request timeout in seconds (default: 60).
#' @field max_retries Maximum retry attempts (default: 3).
#' @field temperature Sampling temperature for generation tasks.
#' @field max_tokens Maximum tokens to generate.
#' @export
LlmConfig <- new.env(parent = emptyenv())
# Static constructor; forwards to a native routine registered under the
# kreuzberg package.
LlmConfig$from_json <- function(json) {
  .Call(
    "wrap__LlmConfig__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.LlmConfig` <- function(self, name) {
  # Resolve the member in the class environment (NULL when it is absent).
  member <- LlmConfig[[name]]
  binds_self <- identical(names(formals(member))[1], "self")
  if (!binds_self) {
    return(member)
  }
  # Instance method: pre-apply the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.LlmConfig` <- `$.LlmConfig`
#' Configuration for LLM-based structured data extraction
#'
#' Sends extracted document content to a VLM with a JSON schema,
#' returning structured data that conforms to the schema.
#' @field schema JSON Schema defining the desired output structure.
#' @field schema_name Schema name passed to the LLM's structured output mode.
#' @field schema_description Optional schema description for the LLM.
#' @field strict Enable strict mode — output must exactly match the schema.
#' @field prompt Custom Jinja2 extraction prompt template. When `None`, a default template is used.
#' @field llm LLM configuration for the extraction.
#' @export
StructuredExtractionConfig <- new.env(parent = emptyenv())
# This block attaches no members to the class environment.
#' @export
`$.StructuredExtractionConfig` <- function(self, name) {
  # Resolve the member in the class environment (NULL when it is absent).
  member <- StructuredExtractionConfig[[name]]
  binds_self <- identical(names(formals(member))[1], "self")
  if (!binds_self) {
    return(member)
  }
  # Instance method: pre-apply the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.StructuredExtractionConfig` <- `$.StructuredExtractionConfig`
#' Quality thresholds for OCR fallback decisions and pipeline quality gating
#'
#' All fields default to the values that match the previous hardcoded behavior,
#' so `OcrQualityThresholds::default()` preserves existing semantics exactly.
#' @field min_total_non_whitespace Minimum total non-whitespace characters to consider text substantive.
#' @field min_non_whitespace_per_page Minimum non-whitespace characters per page on average.
#' @field min_meaningful_word_len Minimum character count for a word to be "meaningful".
#' @field min_meaningful_words Minimum count of meaningful words before text is accepted.
#' @field min_alnum_ratio Minimum alphanumeric ratio (non-whitespace chars that are alphanumeric).
#' @field min_garbage_chars Minimum Unicode replacement characters (U+FFFD) to trigger OCR fallback.
#' @field max_fragmented_word_ratio Maximum fraction of short (1-2 char) words before text is considered fragmented.
#' @field critical_fragmented_word_ratio Critical fragmentation threshold — triggers OCR regardless of meaningful
#' @field min_avg_word_length Minimum average word length. Below this with enough words indicates garbled extraction.
#' @field min_words_for_avg_length_check Minimum word count before average word length check applies.
#' @field min_consecutive_repeat_ratio Minimum consecutive word repetition ratio to detect column scrambling.
#' @field min_words_for_repeat_check Minimum word count before consecutive repetition check is applied.
#' @field substantive_min_chars Minimum character count for "substantive markdown" OCR skip gate.
#' @field non_text_min_chars Minimum character count for "non-text content" OCR skip gate.
#' @field alnum_ws_ratio_threshold Alphanumeric+whitespace ratio threshold for skip decisions.
#' @field pipeline_min_quality Minimum quality score (0.0-1.0) for a pipeline stage result to be accepted. If the
#' @export
OcrQualityThresholds <- new.env(parent = emptyenv())
# Static constructors; each forwards to a native routine registered under the
# kreuzberg package.
OcrQualityThresholds$default <- function() {
  .Call("wrap__OcrQualityThresholds__default", PACKAGE = "kreuzberg")
}
OcrQualityThresholds$from_json <- function(json) {
  .Call(
    "wrap__OcrQualityThresholds__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.OcrQualityThresholds` <- function(self, name) {
  # Resolve the member in the class environment (NULL when it is absent).
  member <- OcrQualityThresholds[[name]]
  binds_self <- identical(names(formals(member))[1], "self")
  if (!binds_self) {
    return(member)
  }
  # Instance method: pre-apply the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.OcrQualityThresholds` <- `$.OcrQualityThresholds`
#' A single backend stage in the OCR pipeline
#' @field backend Backend name: "tesseract", "paddleocr", "easyocr", or a custom registered name.
#' @field priority Priority weight (higher = tried first). Stages are sorted by priority descending.
#' @field language Language override for this stage (None = use parent OcrConfig.language).
#' @field tesseract_config Tesseract-specific config override for this stage.
#' @field paddle_ocr_config PaddleOCR-specific config for this stage.
#' @field vlm_config VLM config override for this pipeline stage.
#' @field backend_options Arbitrary per-call options passed through to the backend unchanged.
#' @export
OcrPipelineStage <- new.env(parent = emptyenv())
# This block attaches no members to the class environment.
#' @export
`$.OcrPipelineStage` <- function(self, name) {
  # Resolve the member in the class environment (NULL when it is absent).
  member <- OcrPipelineStage[[name]]
  binds_self <- identical(names(formals(member))[1], "self")
  if (!binds_self) {
    return(member)
  }
  # Instance method: pre-apply the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.OcrPipelineStage` <- `$.OcrPipelineStage`
#' OCR configuration
#' @field enabled Whether OCR is enabled.
#' @field backend OCR backend: tesseract, easyocr, paddleocr
#' @field language Language code (e.g., "eng", "deu")
#' @field tesseract_config Tesseract-specific configuration (optional)
#' @field output_format Output format for OCR results (optional, for format conversion)
#' @field paddle_ocr_config PaddleOCR-specific configuration (optional, JSON passthrough)
#' @field backend_options Arbitrary per-call options passed through to the backend unchanged.
#' @field element_config OCR element extraction configuration
#' @field quality_thresholds Quality thresholds for the native-text-to-OCR fallback decision. When None, uses compiled
#' @field pipeline Multi-backend OCR pipeline configuration. When set, enables weighted fallback across multiple OCR
#' @field auto_rotate Enable automatic page rotation based on orientation detection.
#' @field vlm_config VLM (Vision Language Model) OCR configuration.
#' @field vlm_prompt Custom Jinja2 prompt template for VLM OCR.
#' @field acceleration Hardware acceleration for ONNX Runtime models (e.g. PaddleOCR, layout detection).
#' @field tessdata_bytes Caller-supplied Tesseract `traineddata` bytes per language code.
#' @export
OcrConfig <- new.env(parent = emptyenv())
# Static constructors; each forwards to a native routine registered under the
# kreuzberg package.
OcrConfig$default <- function() {
  .Call("wrap__OcrConfig__default", PACKAGE = "kreuzberg")
}
OcrConfig$from_json <- function(json) {
  .Call(
    "wrap__OcrConfig__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.OcrConfig` <- function(self, name) {
  # Resolve the member in the class environment (NULL when it is absent).
  member <- OcrConfig[[name]]
  binds_self <- identical(names(formals(member))[1], "self")
  if (!binds_self) {
    return(member)
  }
  # Instance method: pre-apply the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.OcrConfig` <- `$.OcrConfig`
#' Page extraction and tracking configuration
#'
#' Controls how pages are extracted, tracked, and represented in the extraction results.
#' When `None`, page tracking is disabled.
#'
#' Page range tracking in chunk metadata (first_page/last_page) is automatically enabled
#' when page boundaries are available and chunking is configured.
#' @field extract_pages Extract pages as separate array (ExtractionResult.pages)
#' @field insert_page_markers Insert page markers in main content string
#' @field marker_format Page marker format (use {page_num} placeholder) Default: "\n\n<!-- PAGE {page_num} -->\n\n"
#' @export
PageConfig <- new.env(parent = emptyenv())
# Static constructors; each forwards to a native routine registered under the
# kreuzberg package.
PageConfig$default <- function() {
  .Call("wrap__PageConfig__default", PACKAGE = "kreuzberg")
}
PageConfig$from_json <- function(json) {
  .Call(
    "wrap__PageConfig__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.PageConfig` <- function(self, name) {
  # Resolve the member in the class environment (NULL when it is absent).
  member <- PageConfig[[name]]
  binds_self <- identical(names(formals(member))[1], "self")
  if (!binds_self) {
    return(member)
  }
  # Instance method: pre-apply the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.PageConfig` <- `$.PageConfig`
#' PDF-specific configuration
#' @field extract_images Extract images from PDF
#' @field extract_tables Extract tables from PDF.
#' @field passwords List of passwords to try when opening encrypted PDFs
#' @field extract_metadata Extract PDF metadata
#' @field hierarchy Hierarchy extraction configuration (None = hierarchy extraction disabled)
#' @field extract_annotations Extract PDF annotations (text notes, highlights, links, stamps). Default: false
#' @field top_margin_fraction Top margin fraction (0.0-1.0) of page height to exclude headers/running heads. Default:
#' @field bottom_margin_fraction Bottom margin fraction (0.0-1.0) of page height to exclude footers/page numbers.
#' @field allow_single_column_tables Allow single-column pseudo tables in extraction results.
#' @field ocr_inline_images Perform OCR on inline images extracted from PDF pages and attach the recognized text to
#' @export
PdfConfig <- new.env(parent = emptyenv())
# Static constructors; each forwards to a native routine registered under the
# kreuzberg package.
PdfConfig$default <- function() {
  .Call("wrap__PdfConfig__default", PACKAGE = "kreuzberg")
}
PdfConfig$from_json <- function(json) {
  .Call(
    "wrap__PdfConfig__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.PdfConfig` <- function(self, name) {
  # Resolve the member in the class environment (NULL when it is absent).
  member <- PdfConfig[[name]]
  binds_self <- identical(names(formals(member))[1], "self")
  if (!binds_self) {
    return(member)
  }
  # Instance method: pre-apply the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.PdfConfig` <- `$.PdfConfig`
#' Hierarchy extraction configuration for PDF text structure analysis
#'
#' Enables extraction of document hierarchy levels (H1-H6) based on font size
#' clustering and semantic analysis. When enabled, hierarchical blocks are
#' included in page content.
#' @field enabled Enable hierarchy extraction
#' @field k_clusters Number of font size clusters to use for hierarchy levels (1-7)
#' @field include_bbox Include bounding box information in hierarchy blocks
#' @field ocr_coverage_threshold OCR coverage threshold for smart OCR triggering (0.0-1.0)
#' @export
HierarchyConfig <- new.env(parent = emptyenv())
# Static constructors; each forwards to a native routine registered under the
# kreuzberg package.
HierarchyConfig$default <- function() {
  .Call("wrap__HierarchyConfig__default", PACKAGE = "kreuzberg")
}
HierarchyConfig$from_json <- function(json) {
  .Call(
    "wrap__HierarchyConfig__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.HierarchyConfig` <- function(self, name) {
  # Resolve the member in the class environment (NULL when it is absent).
  member <- HierarchyConfig[[name]]
  binds_self <- identical(names(formals(member))[1], "self")
  if (!binds_self) {
    return(member)
  }
  # Instance method: pre-apply the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.HierarchyConfig` <- `$.HierarchyConfig`
#' Post-processor configuration
#' @field enabled Enable post-processors
#' @field enabled_processors Whitelist of processor names to run (None = all enabled)
#' @field disabled_processors Blacklist of processor names to skip (None = none disabled)
#' @field enabled_set Pre-computed AHashSet for O(1) enabled processor lookup
#' @field disabled_set Pre-computed AHashSet for O(1) disabled processor lookup
#' @export
PostProcessorConfig <- new.env(parent = emptyenv())
# Static constructors; each forwards to a native routine registered under the
# kreuzberg package.
PostProcessorConfig$default <- function() {
  .Call("wrap__PostProcessorConfig__default", PACKAGE = "kreuzberg")
}
PostProcessorConfig$from_json <- function(json) {
  .Call(
    "wrap__PostProcessorConfig__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.PostProcessorConfig` <- function(self, name) {
  # Resolve the member in the class environment (NULL when it is absent).
  member <- PostProcessorConfig[[name]]
  binds_self <- identical(names(formals(member))[1], "self")
  if (!binds_self) {
    return(member)
  }
  # Instance method: pre-apply the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.PostProcessorConfig` <- `$.PostProcessorConfig`
#' Chunking configuration
#'
#' Configures text chunking for document content, including chunk size,
#' overlap, trimming behavior, and optional embeddings.
#'
#' Use `..Default::default()` when constructing to allow for future field additions:
#' ```rust
#' let config = ChunkingConfig {
#' max_characters: 500,
#' ..Default::default()
#' };
#' ```
#' @field max_characters Maximum size per chunk (in units determined by `sizing`).
#' @field overlap Overlap between chunks (in units determined by `sizing`).
#' @field trim Whether to trim whitespace from chunk boundaries.
#' @field chunker_type Type of chunker to use (Text or Markdown).
#' @field embedding Optional embedding configuration for chunk embeddings.
#' @field preset Use a preset configuration (overrides individual settings if provided).
#' @field sizing How to measure chunk size.
#' @field prepend_heading_context When `true` and `chunker_type` is `Markdown`, prepend the heading hierarchy path
#' @field topic_threshold Optional cosine similarity threshold for semantic topic boundary detection.
#' @export
ChunkingConfig <- new.env(parent = emptyenv())
# Static constructors; each forwards to a native routine registered under the
# kreuzberg package.
ChunkingConfig$default <- function() {
  .Call("wrap__ChunkingConfig__default", PACKAGE = "kreuzberg")
}
ChunkingConfig$from_json <- function(json) {
  .Call(
    "wrap__ChunkingConfig__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.ChunkingConfig` <- function(self, name) {
  # Resolve the member in the class environment (NULL when it is absent).
  member <- ChunkingConfig[[name]]
  binds_self <- identical(names(formals(member))[1], "self")
  if (!binds_self) {
    return(member)
  }
  # Instance method: pre-apply the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.ChunkingConfig` <- `$.ChunkingConfig`
#' Embedding configuration for text chunks
#'
#' Configures embedding generation using ONNX models via the vendored embedding engine.
#' Requires the `embeddings` feature to be enabled.
#' @field model The embedding model to use (defaults to "balanced" preset if not specified)
#' @field normalize Whether to normalize embedding vectors (recommended for cosine similarity)
#' @field batch_size Batch size for embedding generation
#' @field show_download_progress Show model download progress
#' @field cache_dir Custom cache directory for model files
#' @field acceleration Hardware acceleration for the embedding ONNX model.
#' @field max_embed_duration_secs Maximum wall-clock duration (in seconds) for a single `embed()` call when using
#' @export
EmbeddingConfig <- new.env(parent = emptyenv())
# Static constructors; each forwards to a native routine registered under the
# kreuzberg package.
EmbeddingConfig$default <- function() {
  .Call("wrap__EmbeddingConfig__default", PACKAGE = "kreuzberg")
}
EmbeddingConfig$from_json <- function(json) {
  .Call(
    "wrap__EmbeddingConfig__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.EmbeddingConfig` <- function(self, name) {
  # Resolve the member in the class environment (NULL when it is absent).
  member <- EmbeddingConfig[[name]]
  binds_self <- identical(names(formals(member))[1], "self")
  if (!binds_self) {
    return(member)
  }
  # Instance method: pre-apply the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.EmbeddingConfig` <- `$.EmbeddingConfig`
#' Configuration for tree-sitter language pack integration
#'
#' Controls grammar download behavior and code analysis options.
#'
#' # Example (TOML)
#'
#' ```toml
#' [tree_sitter]
#' languages = ["python", "rust"]
#' groups = ["web"]
#'
#' [tree_sitter.process]
#' structure = true
#' comments = true
#' docstrings = true
#' ```
#' @field enabled Enable code intelligence processing (default: true).
#' @field cache_dir Custom cache directory for downloaded grammars.
#' @field languages Languages to pre-download on init (e.g., `["python", "rust"]`).
#' @field groups Language groups to pre-download (e.g., `["web", "systems", "scripting"]`).
#' @field process Processing options for code analysis.
#' @export
TreeSitterConfig <- new.env(parent = emptyenv())
# Static constructors; each forwards to a native routine registered under the
# kreuzberg package.
TreeSitterConfig$default <- function() {
  .Call("wrap__TreeSitterConfig__default", PACKAGE = "kreuzberg")
}
TreeSitterConfig$from_json <- function(json) {
  .Call(
    "wrap__TreeSitterConfig__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.TreeSitterConfig` <- function(self, name) {
  # Resolve the member in the class environment (NULL when it is absent).
  member <- TreeSitterConfig[[name]]
  binds_self <- identical(names(formals(member))[1], "self")
  if (!binds_self) {
    return(member)
  }
  # Instance method: pre-apply the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.TreeSitterConfig` <- `$.TreeSitterConfig`
#' Processing options for tree-sitter code analysis
#'
#' Controls which analysis features are enabled when extracting code files.
#' @field structure Extract structural items (functions, classes, structs, etc.). Default: true.
#' @field imports Extract import statements. Default: true.
#' @field exports Extract export statements. Default: true.
#' @field comments Extract comments. Default: false.
#' @field docstrings Extract docstrings. Default: false.
#' @field symbols Extract symbol definitions. Default: false.
#' @field diagnostics Include parse diagnostics. Default: false.
#' @field chunk_max_size Maximum chunk size in bytes. `None` disables chunking.
#' @field content_mode Content rendering mode for code extraction.
#' @export
TreeSitterProcessConfig <- new.env(parent = emptyenv())
# Static constructors; each forwards to a native routine registered under the
# kreuzberg package.
TreeSitterProcessConfig$default <- function() {
  .Call("wrap__TreeSitterProcessConfig__default", PACKAGE = "kreuzberg")
}
TreeSitterProcessConfig$from_json <- function(json) {
  .Call(
    "wrap__TreeSitterProcessConfig__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.TreeSitterProcessConfig` <- function(self, name) {
  # Resolve the member in the class environment (NULL when it is absent).
  member <- TreeSitterProcessConfig[[name]]
  binds_self <- identical(names(formals(member))[1], "self")
  if (!binds_self) {
    return(member)
  }
  # Instance method: pre-apply the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.TreeSitterProcessConfig` <- `$.TreeSitterProcessConfig`
#' A supported document format entry
#'
#' Represents a file extension and its corresponding MIME type that Kreuzberg can process.
#' @field extension File extension (without leading dot), e.g., "pdf", "docx"
#' @field mime_type MIME type string, e.g., "application/pdf"
#' @export
SupportedFormat <- new.env(parent = emptyenv())
# This block attaches no members to the class environment.
#' @export
`$.SupportedFormat` <- function(self, name) {
  # Resolve the member in the class environment (NULL when it is absent).
  member <- SupportedFormat[[name]]
  binds_self <- identical(names(formals(member))[1], "self")
  if (!binds_self) {
    return(member)
  }
  # Instance method: pre-apply the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.SupportedFormat` <- `$.SupportedFormat`
#' API server configuration
#'
#' This struct holds all configuration options for the Kreuzberg API server,
#' including host/port settings, CORS configuration, and upload limits.
#'
#' # Defaults
#'
#' - `host`: "127.0.0.1" (localhost only)
#' - `port`: 8000
#' - `cors_origins`: empty vector (allows all origins)
#' - `max_request_body_bytes`: 104_857_600 (100 MB)
#' - `max_multipart_field_bytes`: 104_857_600 (100 MB)
#' @field host Server host address (e.g., "127.0.0.1", "0.0.0.0")
#' @field port Server port number
#' @field cors_origins CORS allowed origins. Empty vector means allow all origins.
#' @field max_request_body_bytes Maximum size of request body in bytes (default: 100 MB)
#' @field max_multipart_field_bytes Maximum size of multipart fields in bytes (default: 100 MB)
#' @export
ServerConfig <- new.env(parent = emptyenv())
# Static constructor; forwards to a native routine registered under the
# kreuzberg package.
ServerConfig$default <- function() {
  .Call("wrap__ServerConfig__default", PACKAGE = "kreuzberg")
}
# Instance methods: the receiver is passed as the first native argument.
ServerConfig$listen_addr <- function(self) {
  .Call("wrap__ServerConfig__listen_addr", self, PACKAGE = "kreuzberg")
}
ServerConfig$cors_allows_all <- function(self) {
  .Call("wrap__ServerConfig__cors_allows_all", self, PACKAGE = "kreuzberg")
}
ServerConfig$is_origin_allowed <- function(self, origin) {
  .Call(
    "wrap__ServerConfig__is_origin_allowed",
    self,
    origin,
    PACKAGE = "kreuzberg"
  )
}
ServerConfig$max_request_body_mb <- function(self) {
  .Call("wrap__ServerConfig__max_request_body_mb", self, PACKAGE = "kreuzberg")
}
ServerConfig$max_multipart_field_mb <- function(self) {
  .Call(
    "wrap__ServerConfig__max_multipart_field_mb",
    self,
    PACKAGE = "kreuzberg"
  )
}
ServerConfig$from_json <- function(json) {
  .Call(
    "wrap__ServerConfig__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.ServerConfig` <- function(self, name) {
  # Resolve the member in the class environment (NULL when it is absent).
  member <- ServerConfig[[name]]
  binds_self <- identical(names(formals(member))[1], "self")
  if (!binds_self) {
    return(member)
  }
  # Instance method: pre-apply the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.ServerConfig` <- `$.ServerConfig`
#' @export
listen_addr.ServerConfig <- function(x, ...) {
  # Delegates to the instance method of the same name via `$` dispatch.
  x$listen_addr(...)
}
#' @export
cors_allows_all.ServerConfig <- function(x, ...) {
  x$cors_allows_all(...)
}
#' @export
is_origin_allowed.ServerConfig <- function(x, ...) {
  x$is_origin_allowed(...)
}
#' @export
max_request_body_mb.ServerConfig <- function(x, ...) {
  x$max_request_body_mb(...)
}
#' @export
max_multipart_field_mb.ServerConfig <- function(x, ...) {
  x$max_multipart_field_mb(...)
}
#' StructuredDataResult
#' @field content content
#' @field format format
#' @field metadata metadata
#' @field text_fields text_fields
#' @export
StructuredDataResult <- new.env(parent = emptyenv())
# This block attaches no members to the class environment.
#' @export
`$.StructuredDataResult` <- function(self, name) {
  # Resolve the member in the class environment (NULL when it is absent).
  member <- StructuredDataResult[[name]]
  binds_self <- identical(names(formals(member))[1], "self")
  if (!binds_self) {
    return(member)
  }
  # Instance method: pre-apply the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.StructuredDataResult` <- `$.StructuredDataResult`
#' Application properties from docProps/app.xml for DOCX
#'
#' Contains Word-specific document statistics and metadata.
#' @field application Application name (e.g., "Microsoft Office Word")
#' @field app_version Application version
#' @field template Template filename
#' @field total_time Total editing time in minutes
#' @field pages Number of pages
#' @field words Number of words
#' @field characters Number of characters (excluding spaces)
#' @field characters_with_spaces Number of characters (including spaces)
#' @field lines Number of lines
#' @field paragraphs Number of paragraphs
#' @field company Company name
#' @field doc_security Document security level
#' @field scale_crop Scale crop flag
#' @field links_up_to_date Links up to date flag
#' @field shared_doc Shared document flag
#' @field hyperlinks_changed Hyperlinks changed flag
#' @export
DocxAppProperties <- new.env(parent = emptyenv())
# Static constructor; forwards to a native routine registered under the
# kreuzberg package.
DocxAppProperties$from_json <- function(json) {
  .Call(
    "wrap__DocxAppProperties__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.DocxAppProperties` <- function(self, name) {
  # Resolve the member in the class environment (NULL when it is absent).
  member <- DocxAppProperties[[name]]
  binds_self <- identical(names(formals(member))[1], "self")
  if (!binds_self) {
    return(member)
  }
  # Instance method: pre-apply the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.DocxAppProperties` <- `$.DocxAppProperties`
#' Application properties from docProps/app.xml for XLSX
#'
#' Contains Excel-specific document metadata.
#' @field application Application name (e.g., "Microsoft Excel")
#' @field app_version Application version
#' @field doc_security Document security level
#' @field scale_crop Scale crop flag
#' @field links_up_to_date Links up to date flag
#' @field shared_doc Shared document flag
#' @field hyperlinks_changed Hyperlinks changed flag
#' @field company Company name
#' @field worksheet_names Worksheet names
#' @export
XlsxAppProperties <- new.env(parent = emptyenv())
# This block attaches no members to the class environment.
#' @export
`$.XlsxAppProperties` <- function(self, name) {
  # Resolve the member in the class environment (NULL when it is absent).
  member <- XlsxAppProperties[[name]]
  binds_self <- identical(names(formals(member))[1], "self")
  if (!binds_self) {
    return(member)
  }
  # Instance method: pre-apply the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.XlsxAppProperties` <- `$.XlsxAppProperties`
#' Application properties from docProps/app.xml for PPTX
#'
#' Contains PowerPoint-specific document metadata.
#' @field application Application name (e.g., "Microsoft Office PowerPoint")
#' @field app_version Application version
#' @field total_time Total editing time in minutes
#' @field company Company name
#' @field doc_security Document security level
#' @field scale_crop Scale crop flag
#' @field links_up_to_date Links up to date flag
#' @field shared_doc Shared document flag
#' @field hyperlinks_changed Hyperlinks changed flag
#' @field slides Number of slides
#' @field notes Number of notes
#' @field hidden_slides Number of hidden slides
#' @field multimedia_clips Number of multimedia clips
#' @field presentation_format Presentation format (e.g., "Widescreen", "Standard")
#' @field slide_titles Slide titles
#' @export
PptxAppProperties <- new.env(parent = emptyenv())
# This block attaches no members to the class environment.
#' @export
`$.PptxAppProperties` <- function(self, name) {
  # Resolve the member in the class environment (NULL when it is absent).
  member <- PptxAppProperties[[name]]
  binds_self <- identical(names(formals(member))[1], "self")
  if (!binds_self) {
    return(member)
  }
  # Instance method: pre-apply the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.PptxAppProperties` <- `$.PptxAppProperties`
#' Dublin Core metadata from docProps/core.xml
#'
#' Contains standard metadata fields defined by the Dublin Core standard
#' and Office-specific extensions.
#' @field title Document title
#' @field subject Document subject/topic
#' @field creator Document creator/author
#' @field keywords Keywords or tags
#' @field description Document description/abstract
#' @field last_modified_by User who last modified the document
#' @field revision Revision number
#' @field created Creation timestamp (ISO 8601)
#' @field modified Last modification timestamp (ISO 8601)
#' @field category Document category
#' @field content_status Content status (Draft, Final, etc.)
#' @field language Document language
#' @field identifier Unique identifier
#' @field version Document version
#' @field last_printed Last print timestamp (ISO 8601)
#' @export
CoreProperties <- new.env(parent = emptyenv())
# Static constructor; forwards to a native routine registered under the
# kreuzberg package.
CoreProperties$from_json <- function(json) {
  .Call(
    "wrap__CoreProperties__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.CoreProperties` <- function(self, name) {
  # Resolve the member in the class environment (NULL when it is absent).
  member <- CoreProperties[[name]]
  binds_self <- identical(names(formals(member))[1], "self")
  if (!binds_self) {
    return(member)
  }
  # Instance method: pre-apply the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.CoreProperties` <- `$.CoreProperties`
#' Configuration for security limits across extractors
#'
#' All limits are intentionally conservative to prevent DoS attacks
#' while still supporting legitimate documents.
#' @field max_archive_size Maximum uncompressed size for archives (500 MB)
#' @field max_compression_ratio Maximum compression ratio before flagging as potential bomb (100:1)
#' @field max_files_in_archive Maximum number of files in archive (10,000)
#' @field max_nesting_depth Maximum nesting depth for structures (100)
#' @field max_entity_length Maximum length of any single XML entity / attribute / token (1 MiB). This is a per-token
#' @field max_content_size Maximum string growth per document (100 MB)
#' @field max_iterations Maximum iterations per operation
#' @field max_xml_depth Maximum XML depth (100 levels)
#' @field max_table_cells Maximum cells per table (100,000)
#' @export
SecurityLimits <- new.env(parent = emptyenv())
# Static constructors; each forwards to a native routine registered under the
# kreuzberg package.
SecurityLimits$default <- function() {
  .Call("wrap__SecurityLimits__default", PACKAGE = "kreuzberg")
}
SecurityLimits$from_json <- function(json) {
  .Call(
    "wrap__SecurityLimits__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.SecurityLimits` <- function(self, name) {
  # Resolve the member in the class environment (NULL when it is absent).
  member <- SecurityLimits[[name]]
  binds_self <- identical(names(formals(member))[1], "self")
  if (!binds_self) {
    return(member)
  }
  # Instance method: pre-apply the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.SecurityLimits` <- `$.SecurityLimits`
#' TokenReductionConfig
#' @field level level
#' @field language_hint language_hint
#' @field preserve_markdown preserve_markdown
#' @field preserve_code preserve_code
#' @field semantic_threshold semantic_threshold
#' @field enable_parallel enable_parallel
#' @field use_simd use_simd
#' @field custom_stopwords custom_stopwords
#' @field preserve_patterns preserve_patterns
#' @field target_reduction target_reduction
#' @field enable_semantic_clustering enable_semantic_clustering
#' @export
TokenReductionConfig <- new.env(parent = emptyenv())
# Static constructors; each forwards to a native routine registered under the
# kreuzberg package.
TokenReductionConfig$default <- function() {
  .Call("wrap__TokenReductionConfig__default", PACKAGE = "kreuzberg")
}
TokenReductionConfig$from_json <- function(json) {
  .Call(
    "wrap__TokenReductionConfig__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.TokenReductionConfig` <- function(self, name) {
  # Resolve the member in the class environment (NULL when it is absent).
  member <- TokenReductionConfig[[name]]
  binds_self <- identical(names(formals(member))[1], "self")
  if (!binds_self) {
    return(member)
  }
  # Instance method: pre-apply the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.TokenReductionConfig` <- `$.TokenReductionConfig`
#' A PDF annotation extracted from a document page
#' @field annotation_type The type of annotation.
#' @field content Text content of the annotation (e.g., comment text, link URL).
#' @field page_number Page number where the annotation appears (1-indexed).
#' @field bounding_box Bounding box of the annotation on the page.
#' @export
PdfAnnotation <- new.env(parent = emptyenv())
# This block attaches no members to the class environment.
#' @export
`$.PdfAnnotation` <- function(self, name) {
  # Resolve the member in the class environment (NULL when it is absent).
  member <- PdfAnnotation[[name]]
  binds_self <- identical(names(formals(member))[1], "self")
  if (!binds_self) {
    return(member)
  }
  # Instance method: pre-apply the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.PdfAnnotation` <- `$.PdfAnnotation`
#' Inline element within a block
#'
#' Represents text with formatting, links, images, etc.
#' @field element_type Type of inline element
#' @field content Text content
#' @field attributes Element attributes
#' @field metadata Additional metadata (e.g., href for links, src/alt for images)
#' @export
InlineElement <- new.env(parent = emptyenv())
#' @export
`$.InlineElement` <- function(self, name) {
  # Resolve the requested member on the class environment.
  member <- InlineElement[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: pre-bind the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.InlineElement` <- `$.InlineElement`
#' Image element in Djot
#' @field src Image source URL or path
#' @field alt Alternative text
#' @field title Optional title
#' @field attributes Element attributes
#' @export
DjotImage <- new.env(parent = emptyenv())
#' @export
`$.DjotImage` <- function(self, name) {
  # Resolve the requested member on the class environment.
  member <- DjotImage[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: pre-bind the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.DjotImage` <- `$.DjotImage`
#' Link element in Djot
#' @field url Link URL
#' @field text Link text content
#' @field title Optional title
#' @field attributes Element attributes
#' @export
DjotLink <- new.env(parent = emptyenv())
#' @export
`$.DjotLink` <- function(self, name) {
  # Resolve the requested member on the class environment.
  member <- DjotLink[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: pre-bind the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.DjotLink` <- `$.DjotLink`
#' A resolved relationship between two nodes in the document tree
#' @field source Source node index (the referencing node).
#' @field target Target node index (the referenced node).
#' @field kind Semantic kind of the relationship.
#' @export
DocumentRelationship <- new.env(parent = emptyenv())
#' @export
`$.DocumentRelationship` <- function(self, name) {
  # Resolve the requested member on the class environment.
  member <- DocumentRelationship[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: pre-bind the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.DocumentRelationship` <- `$.DocumentRelationship`
#' Individual grid cell with position and span metadata
#' @field content Cell text content.
#' @field row Zero-indexed row position.
#' @field col Zero-indexed column position.
#' @field row_span Number of rows this cell spans.
#' @field col_span Number of columns this cell spans.
#' @field is_header Whether this is a header cell.
#' @field bbox Bounding box for this cell (if available).
#' @export
GridCell <- new.env(parent = emptyenv())
#' @export
`$.GridCell` <- function(self, name) {
  # Resolve the requested member on the class environment.
  member <- GridCell[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: pre-bind the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.GridCell` <- `$.GridCell`
#' Inline text annotation — byte-range based formatting and links
#'
#' Annotations reference byte offsets into the node's text content,
#' enabling precise identification of formatted regions.
#' @field start Start byte offset in the node's text content (inclusive).
#' @field end End byte offset in the node's text content (exclusive).
#' @field kind Annotation type.
#' @export
TextAnnotation <- new.env(parent = emptyenv())
#' @export
`$.TextAnnotation` <- function(self, name) {
  # Resolve the requested member on the class environment.
  member <- TextAnnotation[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: pre-bind the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.TextAnnotation` <- `$.TextAnnotation`
#' A single file extracted from an archive
#'
#' When archives (ZIP, TAR, 7Z, GZIP) are extracted with recursive extraction
#' enabled, each processable file produces its own full `ExtractionResult`.
#' @field path Archive-relative file path (e.g. "folder/document.pdf").
#' @field mime_type Detected MIME type of the file.
#' @field result Full extraction result for this file.
#' @export
ArchiveEntry <- new.env(parent = emptyenv())
#' @export
`$.ArchiveEntry` <- function(self, name) {
  # Resolve the requested member on the class environment.
  member <- ArchiveEntry[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: pre-bind the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.ArchiveEntry` <- `$.ArchiveEntry`
#' A non-fatal warning from a processing pipeline stage
#'
#' Captures errors from optional features that don't prevent extraction
#' but may indicate degraded results.
#' @field source The pipeline stage or feature that produced this warning (e.g., "embedding", "chunking").
#' @field message Human-readable description of what went wrong.
#' @export
ProcessingWarning <- new.env(parent = emptyenv())
#' @export
`$.ProcessingWarning` <- function(self, name) {
  # Resolve the requested member on the class environment.
  member <- ProcessingWarning[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: pre-bind the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.ProcessingWarning` <- `$.ProcessingWarning`
#' Token usage and cost data for a single LLM call made during extraction
#'
#' Populated when VLM OCR, structured extraction, or LLM-based embeddings
#' are used. Multiple entries may be present when multiple LLM calls occur
#' within one extraction (e.g. VLM OCR + structured extraction).
#' @field model The LLM model identifier (e.g. "openai/gpt-4o", "anthropic/claude-sonnet-4-20250514").
#' @field source The pipeline stage that triggered this LLM call (e.g. "vlm_ocr", "structured_extraction").
#' @field input_tokens Number of input/prompt tokens consumed.
#' @field output_tokens Number of output/completion tokens generated.
#' @field total_tokens Total tokens (input + output).
#' @field estimated_cost Estimated cost in USD based on the provider's published pricing.
#' @field finish_reason Why the model stopped generating (e.g. "stop", "length", "content_filter").
#' @export
LlmUsage <- new.env(parent = emptyenv())
# Static member: forwards `json` to the native `wrap__LlmUsage__from_json`.
LlmUsage$from_json <- function(json) {
  .Call(
    "wrap__LlmUsage__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.LlmUsage` <- function(self, name) {
  # Resolve the requested member on the class environment.
  member <- LlmUsage[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: pre-bind the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.LlmUsage` <- `$.LlmUsage`
#' A text chunk with optional embedding and metadata
#'
#' Chunks are created when chunking is enabled in `ExtractionConfig`. Each chunk
#' contains the text content, optional embedding vector (if embedding generation
#' is configured), and metadata about its position in the document.
#' @field content The text content of this chunk.
#' @field chunk_type Semantic structural classification of this chunk.
#' @field embedding Optional embedding vector for this chunk.
#' @field metadata Metadata about this chunk's position and properties.
#' @export
Chunk <- new.env(parent = emptyenv())
#' @export
`$.Chunk` <- function(self, name) {
  # Resolve the requested member on the class environment.
  member <- Chunk[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: pre-bind the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.Chunk` <- `$.Chunk`
#' A single heading in the hierarchy
#' @field level Heading depth (1 = h1, 2 = h2, etc.)
#' @field text The text content of the heading.
#' @export
HeadingLevel <- new.env(parent = emptyenv())
#' @export
`$.HeadingLevel` <- function(self, name) {
  # Resolve the requested member on the class environment.
  member <- HeadingLevel[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: pre-bind the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.HeadingLevel` <- `$.HeadingLevel`
#' Metadata about a chunk's position in the original document
#' @field byte_start Byte offset where this chunk starts in the original text (UTF-8 valid boundary).
#' @field byte_end Byte offset where this chunk ends in the original text (UTF-8 valid boundary).
#' @field token_count Number of tokens in this chunk (if available).
#' @field chunk_index Zero-based index of this chunk in the document.
#' @field total_chunks Total number of chunks in the document.
#' @field first_page First page number this chunk spans (1-indexed).
#' @field last_page Last page number this chunk spans (1-indexed, equal to first_page for single-page chunks).
#' @field heading_context Heading context when using Markdown chunker.
#' @field image_indices Indices into `ExtractionResult.images` for images on pages covered by this chunk.
#' @export
ChunkMetadata <- new.env(parent = emptyenv())
#' @export
`$.ChunkMetadata` <- function(self, name) {
  # Resolve the requested member on the class environment.
  member <- ChunkMetadata[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: pre-bind the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.ChunkMetadata` <- `$.ChunkMetadata`
#' Extracted image from a document
#'
#' Contains raw image data, metadata, and optional nested OCR results.
#' Raw bytes allow cross-language compatibility - users can convert to
#' PIL.Image (Python), Sharp (Node.js), or other formats as needed.
#' @field data Raw image data (PNG, JPEG, WebP, etc. bytes). Uses `bytes::Bytes` for cheap cloning of large buffers.
#' @field format Image format (e.g., "jpeg", "png", "webp"). Uses Cow<'static, str> to avoid allocation for static strings.
#' @field image_index Zero-indexed position of this image in the document/page
#' @field page_number Page/slide number where image was found (1-indexed)
#' @field width Image width in pixels
#' @field height Image height in pixels
#' @field colorspace Colorspace information (e.g., "RGB", "CMYK", "Gray")
#' @field bits_per_component Bits per color component (e.g., 8, 16)
#' @field is_mask Whether this image is a mask image
#' @field description Optional description of the image
#' @field ocr_result Nested OCR extraction result (if image was OCRed)
#' @field bounding_box Bounding box of the image on the page (PDF coordinates: x0=left, y0=bottom, x1=right, y1=top).
#' @field source_path Original source path of the image within the document archive (e.g., "media/image1.png" in DOCX).
#' @field image_kind Heuristic classification of what this image likely depicts. `None` if classification was disabled.
#' @field kind_confidence Confidence score for `image_kind`, in the range 0.0 to 1.0.
#' @field cluster_id Identifier shared across images that form a single logical figure (e.g. all raster tiles of one figure).
#' @export
ExtractedImage <- new.env(parent = emptyenv())
#' @export
`$.ExtractedImage` <- function(self, name) {
  # Resolve the requested member on the class environment.
  member <- ExtractedImage[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: pre-bind the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.ExtractedImage` <- `$.ExtractedImage`
#' Bounding box coordinates for element positioning
#' @field x0 Left x-coordinate
#' @field y0 Bottom y-coordinate
#' @field x1 Right x-coordinate
#' @field y1 Top y-coordinate
#' @export
BoundingBox <- new.env(parent = emptyenv())
# Static member: forwards `json` to the native `wrap__BoundingBox__from_json`.
BoundingBox$from_json <- function(json) {
  .Call(
    "wrap__BoundingBox__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.BoundingBox` <- function(self, name) {
  # Resolve the requested member on the class environment.
  member <- BoundingBox[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: pre-bind the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.BoundingBox` <- `$.BoundingBox`
#' Metadata for a semantic element
#' @field page_number Page number (1-indexed)
#' @field filename Source filename or document name
#' @field coordinates Bounding box coordinates if available
#' @field element_index Position index in the element sequence
#' @field additional Additional custom metadata
#' @export
ElementMetadata <- new.env(parent = emptyenv())
#' @export
`$.ElementMetadata` <- function(self, name) {
  # Resolve the requested member on the class environment.
  member <- ElementMetadata[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: pre-bind the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.ElementMetadata` <- `$.ElementMetadata`
#' Semantic element extracted from document
#'
#' Represents a logical unit of content with semantic classification,
#' unique identifier, and metadata for tracking origin and position.
#' @field element_id Unique element identifier
#' @field element_type Semantic type of this element
#' @field text Text content of the element
#' @field metadata Metadata about the element
#' @export
Element <- new.env(parent = emptyenv())
#' @export
`$.Element` <- function(self, name) {
  # Resolve the requested member on the class environment.
  member <- Element[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: pre-bind the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.Element` <- `$.Element`
#' XML extraction result
#'
#' Contains extracted text content from XML files along with
#' structural statistics about the XML document.
#' @field content Extracted text content (XML structure filtered out)
#' @field element_count Total number of XML elements processed
#' @field unique_elements List of unique element names found (sorted)
#' @export
XmlExtractionResult <- new.env(parent = emptyenv())
#' @export
`$.XmlExtractionResult` <- function(self, name) {
  # Resolve the requested member on the class environment.
  member <- XmlExtractionResult[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: pre-bind the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.XmlExtractionResult` <- `$.XmlExtractionResult`
#' Email attachment representation
#'
#' Contains metadata and optionally the content of an email attachment.
#' @field name Attachment name (from Content-Disposition header)
#' @field filename Filename of the attachment
#' @field mime_type MIME type of the attachment
#' @field size Size in bytes
#' @field is_image Whether this attachment is an image
#' @field data Attachment data (if extracted). Uses `bytes::Bytes` for cheap cloning of large buffers.
#' @export
EmailAttachment <- new.env(parent = emptyenv())
#' @export
`$.EmailAttachment` <- function(self, name) {
  # Resolve the requested member on the class environment.
  member <- EmailAttachment[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: pre-bind the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.EmailAttachment` <- `$.EmailAttachment`
#' Bounding box for an OCR-detected table in pixel coordinates
#' @field left Left x-coordinate (pixels)
#' @field top Top y-coordinate (pixels)
#' @field right Right x-coordinate (pixels)
#' @field bottom Bottom y-coordinate (pixels)
#' @export
OcrTableBoundingBox <- new.env(parent = emptyenv())
#' @export
`$.OcrTableBoundingBox` <- function(self, name) {
  # Resolve the requested member on the class environment.
  member <- OcrTableBoundingBox[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: pre-bind the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.OcrTableBoundingBox` <- `$.OcrTableBoundingBox`
#' Image preprocessing configuration for OCR
#'
#' These settings control how images are preprocessed before OCR to improve
#' text recognition quality. Different preprocessing strategies work better
#' for different document types.
#' @field target_dpi Target DPI for the image (300 is standard, 600 for small text).
#' @field auto_rotate Auto-detect and correct image rotation.
#' @field deskew Correct skew (tilted images).
#' @field denoise Remove noise from the image.
#' @field contrast_enhance Enhance contrast for better text visibility.
#' @field binarization_method Binarization method: "otsu", "sauvola", "adaptive".
#' @field invert_colors Invert colors (white text on black → black on white).
#' @export
ImagePreprocessingConfig <- new.env(parent = emptyenv())
# Static member: forwards to `wrap__ImagePreprocessingConfig__default`.
ImagePreprocessingConfig$default <- function() {
  .Call("wrap__ImagePreprocessingConfig__default", PACKAGE = "kreuzberg")
}
# Static member: forwards `json` to `wrap__ImagePreprocessingConfig__from_json`.
ImagePreprocessingConfig$from_json <- function(json) {
  .Call(
    "wrap__ImagePreprocessingConfig__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.ImagePreprocessingConfig` <- function(self, name) {
  # Resolve the requested member on the class environment.
  member <- ImagePreprocessingConfig[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: pre-bind the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.ImagePreprocessingConfig` <- `$.ImagePreprocessingConfig`
#' Tesseract OCR configuration
#'
#' Provides fine-grained control over Tesseract OCR engine parameters.
#' Most users can use the defaults, but these settings allow optimization
#' for specific document types (invoices, handwriting, etc.).
#' @field language Language code (e.g., "eng", "deu", "fra")
#' @field psm Page Segmentation Mode (0-13).
#' @field output_format Output format ("text" or "markdown")
#' @field oem OCR Engine Mode (0-3).
#' @field min_confidence Minimum confidence threshold (0.0-100.0).
#' @field preprocessing Image preprocessing configuration.
#' @field enable_table_detection Enable automatic table detection and reconstruction
#' @field table_min_confidence Minimum confidence threshold for table detection (0.0-1.0)
#' @field table_column_threshold Column threshold for table detection (pixels)
#' @field table_row_threshold_ratio Row threshold ratio for table detection (0.0-1.0)
#' @field use_cache Enable OCR result caching
#' @field classify_use_pre_adapted_templates Use pre-adapted templates for character classification
#' @field language_model_ngram_on Enable N-gram language model
#' @field tessedit_dont_blkrej_good_wds Don't reject good words during block-level processing
#' @field tessedit_dont_rowrej_good_wds Don't reject good words during row-level processing
#' @field tessedit_enable_dict_correction Enable dictionary correction
#' @field tessedit_char_whitelist Whitelist of allowed characters (empty = all allowed)
#' @field tessedit_char_blacklist Blacklist of forbidden characters (empty = none forbidden)
#' @field tessedit_use_primary_params_model Use primary language params model
#' @field textord_space_size_is_variable Variable-width space detection
#' @field thresholding_method Use adaptive thresholding method
#' @export
TesseractConfig <- new.env(parent = emptyenv())
# Static member: forwards to the native `wrap__TesseractConfig__default`.
TesseractConfig$default <- function() {
  .Call("wrap__TesseractConfig__default", PACKAGE = "kreuzberg")
}
# Static member: forwards `json` to `wrap__TesseractConfig__from_json`.
TesseractConfig$from_json <- function(json) {
  .Call(
    "wrap__TesseractConfig__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.TesseractConfig` <- function(self, name) {
  # Resolve the requested member on the class environment.
  member <- TesseractConfig[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: pre-bind the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.TesseractConfig` <- `$.TesseractConfig`
#' Image preprocessing metadata
#'
#' Tracks the transformations applied to an image during OCR preprocessing,
#' including DPI normalization, resizing, and resampling.
#' @field original_dimensions Original image dimensions (width, height) in pixels
#' @field original_dpi Original image DPI (horizontal, vertical)
#' @field target_dpi Target DPI from configuration
#' @field scale_factor Scaling factor applied to the image
#' @field auto_adjusted Whether DPI was auto-adjusted based on content
#' @field final_dpi Final DPI after processing
#' @field new_dimensions New dimensions after resizing (if resized)
#' @field resample_method Resampling algorithm used ("LANCZOS3", "CATMULLROM", etc.)
#' @field dimension_clamped Whether dimensions were clamped to max_image_dimension
#' @field calculated_dpi Calculated optimal DPI (if auto_adjust_dpi enabled)
#' @field skipped_resize Whether resize was skipped (dimensions already optimal)
#' @field resize_error Error message if resize failed
#' @export
ImagePreprocessingMetadata <- new.env(parent = emptyenv())
#' @export
`$.ImagePreprocessingMetadata` <- function(self, name) {
  # Resolve the requested member on the class environment.
  member <- ImagePreprocessingMetadata[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: pre-bind the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.ImagePreprocessingMetadata` <- `$.ImagePreprocessingMetadata`
#' Extraction result metadata
#'
#' Contains common fields applicable to all formats, format-specific metadata
#' via a discriminated union, and additional custom fields from postprocessors.
#' @field title Document title
#' @field subject Document subject or description
#' @field authors Primary author(s) - always Vec for consistency
#' @field keywords Keywords/tags - always Vec for consistency
#' @field language Primary language (ISO 639 code)
#' @field created_at Creation timestamp (ISO 8601 format)
#' @field modified_at Last modification timestamp (ISO 8601 format)
#' @field created_by User who created the document
#' @field modified_by User who last modified the document
#' @field pages Page/slide/sheet structure with boundaries
#' @field format Format-specific metadata (discriminated union)
#' @field image_preprocessing Image preprocessing metadata (when OCR preprocessing was applied)
#' @field json_schema JSON schema (for structured data extraction)
#' @field error Error metadata (for batch operations)
#' @field extraction_duration_ms Extraction duration in milliseconds (for benchmarking).
#' @field category Document category (from frontmatter or classification).
#' @field tags Document tags (from frontmatter).
#' @field document_version Document version string (from frontmatter).
#' @field abstract_text Abstract or summary text (from frontmatter).
#' @field output_format Output format identifier (e.g., "markdown", "html", "text").
#' @field ocr_used Whether OCR was used during extraction.
#' @field additional Additional custom fields from postprocessors.
#' @export
Metadata <- new.env(parent = emptyenv())
# Instance member: passes the receiver to the native `wrap__Metadata__is_empty`.
Metadata$is_empty <- function(self) {
  .Call("wrap__Metadata__is_empty", self, PACKAGE = "kreuzberg")
}
# Static member: forwards `json` to the native `wrap__Metadata__from_json`.
Metadata$from_json <- function(json) {
  .Call(
    "wrap__Metadata__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.Metadata` <- function(self, name) {
  # Resolve the requested member on the class environment.
  member <- Metadata[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: pre-bind the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.Metadata` <- `$.Metadata`
#' @export
is_empty.Metadata <- function(x, ...) {
  # Fetch the receiver-bound method through `$`, then invoke it.
  bound_is_empty <- x$is_empty
  bound_is_empty(...)
}
#' Excel/spreadsheet format metadata
#'
#' Identifies the document as a spreadsheet source via the `FormatMetadata::Excel`
#' discriminant. Sheet count and sheet names are stored inside this struct.
#' @field sheet_count Number of sheets in the workbook.
#' @field sheet_names Names of all sheets in the workbook.
#' @export
ExcelMetadata <- new.env(parent = emptyenv())
# Static member: forwards `json` to the native `wrap__ExcelMetadata__from_json`.
ExcelMetadata$from_json <- function(json) {
  .Call(
    "wrap__ExcelMetadata__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.ExcelMetadata` <- function(self, name) {
  # Resolve the requested member on the class environment.
  member <- ExcelMetadata[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: pre-bind the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.ExcelMetadata` <- `$.ExcelMetadata`
#' Email metadata extracted from .eml and .msg files
#'
#' Includes sender/recipient information, message ID, and attachment list.
#' @field from_email Sender's email address
#' @field from_name Sender's display name
#' @field to_emails Primary recipients
#' @field cc_emails CC recipients
#' @field bcc_emails BCC recipients
#' @field message_id Message-ID header value
#' @field attachments List of attachment filenames
#' @export
EmailMetadata <- new.env(parent = emptyenv())
# Static member: forwards `json` to the native `wrap__EmailMetadata__from_json`.
EmailMetadata$from_json <- function(json) {
  .Call(
    "wrap__EmailMetadata__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.EmailMetadata` <- function(self, name) {
  # Resolve the requested member on the class environment.
  member <- EmailMetadata[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: pre-bind the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.EmailMetadata` <- `$.EmailMetadata`
#' Archive (ZIP/TAR/7Z) metadata
#'
#' Extracted from compressed archive files containing file lists and size information.
#' @field format Archive format ("ZIP", "TAR", "7Z", etc.)
#' @field file_count Total number of files in the archive
#' @field file_list List of file paths within the archive
#' @field total_size Total uncompressed size in bytes
#' @field compressed_size Compressed size in bytes (if available)
#' @export
ArchiveMetadata <- new.env(parent = emptyenv())
# Static member: forwards `json` to `wrap__ArchiveMetadata__from_json`.
ArchiveMetadata$from_json <- function(json) {
  .Call(
    "wrap__ArchiveMetadata__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.ArchiveMetadata` <- function(self, name) {
  # Resolve the requested member on the class environment.
  member <- ArchiveMetadata[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: pre-bind the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.ArchiveMetadata` <- `$.ArchiveMetadata`
#' Image metadata extracted from image files
#'
#' Includes dimensions, format, and EXIF data.
#' @field width Image width in pixels
#' @field height Image height in pixels
#' @field format Image format (e.g., "PNG", "JPEG", "TIFF")
#' @field exif EXIF metadata tags
#' @export
ImageMetadata <- new.env(parent = emptyenv())
# Static member: forwards `json` to the native `wrap__ImageMetadata__from_json`.
ImageMetadata$from_json <- function(json) {
  .Call(
    "wrap__ImageMetadata__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.ImageMetadata` <- function(self, name) {
  # Resolve the requested member on the class environment.
  member <- ImageMetadata[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: pre-bind the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.ImageMetadata` <- `$.ImageMetadata`
#' XML metadata extracted during XML parsing
#'
#' Provides statistics about XML document structure.
#' @field element_count Total number of XML elements processed
#' @field unique_elements List of unique element tag names (sorted)
#' @export
XmlMetadata <- new.env(parent = emptyenv())
# Static member: forwards `json` to the native `wrap__XmlMetadata__from_json`.
XmlMetadata$from_json <- function(json) {
  .Call(
    "wrap__XmlMetadata__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.XmlMetadata` <- function(self, name) {
  # Resolve the requested member on the class environment.
  member <- XmlMetadata[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: pre-bind the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.XmlMetadata` <- `$.XmlMetadata`
#' Header/heading element metadata
#' @field level Header level: 1 (h1) through 6 (h6)
#' @field text Normalized text content of the header
#' @field id HTML id attribute if present
#' @field depth Document tree depth at the header element
#' @field html_offset Byte offset in original HTML document
#' @export
HeaderMetadata <- new.env(parent = emptyenv())
#' @export
`$.HeaderMetadata` <- function(self, name) {
  # Resolve the requested member on the class environment.
  member <- HeaderMetadata[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: pre-bind the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.HeaderMetadata` <- `$.HeaderMetadata`
#' Structured data (Schema.org, microdata, RDFa) block
#' @field data_type Type of structured data
#' @field raw_json Raw JSON string representation
#' @field schema_type Schema type if detectable (e.g., "Article", "Event", "Product")
#' @export
StructuredData <- new.env(parent = emptyenv())
#' @export
`$.StructuredData` <- function(self, name) {
  # Resolve the requested member on the class environment.
  member <- StructuredData[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: pre-bind the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.StructuredData` <- `$.StructuredData`
#' OCR processing metadata
#'
#' Captures information about OCR processing configuration and results.
#' @field language OCR language code(s) used
#' @field psm Tesseract Page Segmentation Mode (PSM)
#' @field output_format Output format (e.g., "text", "hocr")
#' @field table_count Number of tables detected
#' @field table_rows table_rows
#' @field table_cols table_cols
#' @export
OcrMetadata <- new.env(parent = emptyenv())
# Static member: forwards `json` to the native `wrap__OcrMetadata__from_json`.
OcrMetadata$from_json <- function(json) {
  .Call(
    "wrap__OcrMetadata__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.OcrMetadata` <- function(self, name) {
  # Resolve the requested member on the class environment.
  member <- OcrMetadata[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: pre-bind the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.OcrMetadata` <- `$.OcrMetadata`
#' Error metadata (for batch operations)
#' @field error_type error_type
#' @field message message
#' @export
ErrorMetadata <- new.env(parent = emptyenv())
#' @export
`$.ErrorMetadata` <- function(self, name) {
  # Resolve the requested member on the class environment.
  member <- ErrorMetadata[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: pre-bind the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.ErrorMetadata` <- `$.ErrorMetadata`
#' PowerPoint presentation metadata
#'
#' Extracted from PPTX files containing slide counts and presentation details.
#' @field slide_count Total number of slides in the presentation
#' @field slide_names Names of slides (if available)
#' @field image_count Number of embedded images
#' @field table_count Number of tables
#' @export
PptxMetadata <- new.env(parent = emptyenv())
# Static member: forwards `json` to the native `wrap__PptxMetadata__from_json`.
PptxMetadata$from_json <- function(json) {
  .Call(
    "wrap__PptxMetadata__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.PptxMetadata` <- function(self, name) {
  # Resolve the requested member on the class environment.
  member <- PptxMetadata[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: pre-bind the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.PptxMetadata` <- `$.PptxMetadata`
#' Word document metadata
#'
#' Extracted from DOCX files using shared Office Open XML metadata extraction.
#' Integrates with `office_metadata` module for core/app/custom properties.
#' @field core_properties Core properties from docProps/core.xml (Dublin Core metadata)
#' @field app_properties Application properties from docProps/app.xml (Word-specific statistics)
#' @field custom_properties Custom properties from docProps/custom.xml (user-defined properties)
#' @export
DocxMetadata <- new.env(parent = emptyenv())
# Static member: forwards `json` to the native `wrap__DocxMetadata__from_json`.
DocxMetadata$from_json <- function(json) {
  .Call(
    "wrap__DocxMetadata__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.DocxMetadata` <- function(self, name) {
  # Resolve the requested member on the class environment.
  member <- DocxMetadata[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: pre-bind the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.DocxMetadata` <- `$.DocxMetadata`
#' CSV/TSV file metadata
#' @field row_count row_count
#' @field column_count column_count
#' @field delimiter delimiter
#' @field has_header has_header
#' @field column_types column_types
#' @export
CsvMetadata <- new.env(parent = emptyenv())
# Static member: forwards `json` to the native `wrap__CsvMetadata__from_json`.
CsvMetadata$from_json <- function(json) {
  .Call(
    "wrap__CsvMetadata__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.CsvMetadata` <- function(self, name) {
  # Resolve the requested member on the class environment.
  member <- CsvMetadata[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: pre-bind the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.CsvMetadata` <- `$.CsvMetadata`
#' BibTeX bibliography metadata
#' @field entry_count Number of entries in the bibliography.
#' @field citation_keys citation_keys
#' @field authors authors
#' @field year_range year_range
#' @field entry_types entry_types
#' @export
BibtexMetadata <- new.env(parent = emptyenv())
# Static member: forwards `json` to `wrap__BibtexMetadata__from_json`.
BibtexMetadata$from_json <- function(json) {
  .Call(
    "wrap__BibtexMetadata__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.BibtexMetadata` <- function(self, name) {
  # Resolve the requested member on the class environment.
  member <- BibtexMetadata[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: pre-bind the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.BibtexMetadata` <- `$.BibtexMetadata`
#' Citation file metadata (RIS, PubMed, EndNote)
#' @field citation_count citation_count
#' @field format format
#' @field authors authors
#' @field year_range year_range
#' @field dois dois
#' @field keywords keywords
#' @export
CitationMetadata <- new.env(parent = emptyenv())
# Static member: forwards `json` to `wrap__CitationMetadata__from_json`.
CitationMetadata$from_json <- function(json) {
  .Call(
    "wrap__CitationMetadata__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.CitationMetadata` <- function(self, name) {
  # Resolve the requested member on the class environment.
  member <- CitationMetadata[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: pre-bind the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.CitationMetadata` <- `$.CitationMetadata`
#' Year range for bibliographic metadata
#' @field min min
#' @field max max
#' @field years years
#' @export
YearRange <- new.env(parent = emptyenv())
#' @export
`$.YearRange` <- function(self, name) {
  # Resolve the requested member on the class environment.
  member <- YearRange[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: pre-bind the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.YearRange` <- `$.YearRange`
#' FictionBook (FB2) metadata
#' @field genres genres
#' @field sequences sequences
#' @field annotation annotation
#' @export
FictionBookMetadata <- new.env(parent = emptyenv())
# Static member: forwards `json` to `wrap__FictionBookMetadata__from_json`.
FictionBookMetadata$from_json <- function(json) {
  .Call(
    "wrap__FictionBookMetadata__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.FictionBookMetadata` <- function(self, name) {
  # Resolve the requested member on the class environment.
  member <- FictionBookMetadata[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: pre-bind the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.FictionBookMetadata` <- `$.FictionBookMetadata`
#' DBASE field information
#' @field name name
#' @field field_type field_type
#' @export
DbfFieldInfo <- new.env(parent = emptyenv())
#' @export
`$.DbfFieldInfo` <- function(self, name) {
  # Resolve the requested member on the class environment.
  member <- DbfFieldInfo[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: pre-bind the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.DbfFieldInfo` <- `$.DbfFieldInfo`
#' JATS contributor with role
#' @field name name
#' @field role role
#' @export
ContributorRole <- new.env(parent = emptyenv())
#' @export
`$.ContributorRole` <- function(self, name) {
  # Resolve the requested member on the class environment.
  member <- ContributorRole[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: pre-bind the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.ContributorRole` <- `$.ContributorRole`
#' EPUB metadata (Dublin Core extensions)
#' @field coverage coverage
#' @field dc_format dc_format
#' @field relation relation
#' @field source source
#' @field dc_type dc_type
#' @field cover_image cover_image
#' @export
EpubMetadata <- new.env(parent = emptyenv())
# JSON constructor; passes `json` through to the native
# wrap__EpubMetadata__from_json routine.
EpubMetadata$from_json <- function(json) {
  .Call("wrap__EpubMetadata__from_json", json, PACKAGE = "kreuzberg")
}
#' @export
`$.EpubMetadata` <- function(self, name) {
  # Resolve the requested member from the EpubMetadata class environment.
  member <- EpubMetadata[[name]]
  wants_self <- identical(names(formals(member))[1], "self")
  if (!wants_self) {
    # Not an instance method: hand the looked-up value back untouched.
    return(member)
  }
  # Instance method: supply the receiver as the leading argument.
  function(...) member(self, ...)
}
#' @export
`[[.EpubMetadata` <- `$.EpubMetadata`
#' Outlook PST archive metadata
#' @field message_count message_count
#' @export
PstMetadata <- new.env(parent = emptyenv())
# JSON constructor; passes `json` through to the native
# wrap__PstMetadata__from_json routine.
PstMetadata$from_json <- function(json) {
  .Call("wrap__PstMetadata__from_json", json, PACKAGE = "kreuzberg")
}
#' @export
`$.PstMetadata` <- function(self, name) {
  # Resolve the requested member from the PstMetadata class environment.
  member <- PstMetadata[[name]]
  wants_self <- identical(names(formals(member))[1], "self")
  if (!wants_self) {
    # Not an instance method: hand the looked-up value back untouched.
    return(member)
  }
  # Instance method: supply the receiver as the leading argument.
  function(...) member(self, ...)
}
#' @export
`[[.PstMetadata` <- `$.PstMetadata`
#' Confidence scores for an OCR element
#'
#' Separates detection confidence (how confident that text exists at this location)
#' from recognition confidence (how confident about the actual text content).
#' @field detection Detection confidence: how confident the OCR engine is that text exists here.
#' @field recognition Recognition confidence: how confident about the text content.
#' @export
OcrConfidence <- new.env(parent = emptyenv())
# JSON constructor; passes `json` through to the native
# wrap__OcrConfidence__from_json routine.
OcrConfidence$from_json <- function(json) {
  .Call("wrap__OcrConfidence__from_json", json, PACKAGE = "kreuzberg")
}
#' @export
`$.OcrConfidence` <- function(self, name) {
  # Resolve the requested member from the OcrConfidence class environment.
  member <- OcrConfidence[[name]]
  wants_self <- identical(names(formals(member))[1], "self")
  if (!wants_self) {
    # Not an instance method: hand the looked-up value back untouched.
    return(member)
  }
  # Instance method: supply the receiver as the leading argument.
  function(...) member(self, ...)
}
#' @export
`[[.OcrConfidence` <- `$.OcrConfidence`
#' Rotation information for an OCR element
#' @field angle_degrees Rotation angle in degrees (0, 90, 180, 270 for PaddleOCR).
#' @field confidence Confidence score for the rotation detection.
#' @export
OcrRotation <- new.env(parent = emptyenv())
#' @export
`$.OcrRotation` <- function(self, name) {
  # Resolve the requested member from the OcrRotation class environment.
  member <- OcrRotation[[name]]
  wants_self <- identical(names(formals(member))[1], "self")
  if (!wants_self) {
    # Not an instance method: hand the looked-up value back untouched.
    return(member)
  }
  # Instance method: supply the receiver as the leading argument.
  function(...) member(self, ...)
}
#' @export
`[[.OcrRotation` <- `$.OcrRotation`
#' A unified OCR element representing detected text with full metadata
#'
#' This is the primary type for structured OCR output, preserving all information
#' from both Tesseract and PaddleOCR backends.
#' @field text The recognized text content.
#' @field geometry Bounding geometry (rectangle or quadrilateral).
#' @field confidence Confidence scores for detection and recognition.
#' @field level Hierarchical level (word, line, block, page).
#' @field rotation Rotation information (if detected).
#' @field page_number Page number (1-indexed).
#' @field parent_id Parent element ID for hierarchical relationships.
#' @field backend_metadata Backend-specific metadata that doesn't fit the unified schema.
#' @export
OcrElement <- new.env(parent = emptyenv())
# JSON constructor; passes `json` through to the native
# wrap__OcrElement__from_json routine.
OcrElement$from_json <- function(json) {
  .Call("wrap__OcrElement__from_json", json, PACKAGE = "kreuzberg")
}
#' @export
`$.OcrElement` <- function(self, name) {
  # Resolve the requested member from the OcrElement class environment.
  member <- OcrElement[[name]]
  wants_self <- identical(names(formals(member))[1], "self")
  if (!wants_self) {
    # Not an instance method: hand the looked-up value back untouched.
    return(member)
  }
  # Instance method: supply the receiver as the leading argument.
  function(...) member(self, ...)
}
#' @export
`[[.OcrElement` <- `$.OcrElement`
#' Configuration for OCR element extraction
#'
#' Controls how OCR elements are extracted and filtered.
#' @field include_elements Whether to include OCR elements in the extraction result.
#' @field min_level Minimum hierarchical level to include.
#' @field min_confidence Minimum recognition confidence threshold (0.0-1.0).
#' @field build_hierarchy Whether to build hierarchical relationships between elements.
#' @export
OcrElementConfig <- new.env(parent = emptyenv())
# JSON constructor; passes `json` through to the native
# wrap__OcrElementConfig__from_json routine.
OcrElementConfig$from_json <- function(json) {
  .Call("wrap__OcrElementConfig__from_json", json, PACKAGE = "kreuzberg")
}
#' @export
`$.OcrElementConfig` <- function(self, name) {
  # Resolve the requested member from the OcrElementConfig class environment.
  member <- OcrElementConfig[[name]]
  wants_self <- identical(names(formals(member))[1], "self")
  if (!wants_self) {
    # Not an instance method: hand the looked-up value back untouched.
    return(member)
  }
  # Instance method: supply the receiver as the leading argument.
  function(...) member(self, ...)
}
#' @export
`[[.OcrElementConfig` <- `$.OcrElementConfig`
#' Byte offset boundary for a page
#'
#' Tracks where a specific page's content starts and ends in the main content string,
#' enabling mapping from byte positions to page numbers. Offsets are guaranteed to be
#' at valid UTF-8 character boundaries when using standard String methods (push_str, push, etc.).
#' @field byte_start Byte offset where this page starts in the content string (UTF-8 valid boundary, inclusive)
#' @field byte_end Byte offset where this page ends in the content string (UTF-8 valid boundary, exclusive)
#' @field page_number Page number (1-indexed)
#' @export
PageBoundary <- new.env(parent = emptyenv())
#' @export
`$.PageBoundary` <- function(self, name) {
  # Resolve the requested member from the PageBoundary class environment.
  member <- PageBoundary[[name]]
  wants_self <- identical(names(formals(member))[1], "self")
  if (!wants_self) {
    # Not an instance method: hand the looked-up value back untouched.
    return(member)
  }
  # Instance method: supply the receiver as the leading argument.
  function(...) member(self, ...)
}
#' @export
`[[.PageBoundary` <- `$.PageBoundary`
#' Metadata for individual page/slide/sheet
#'
#' Captures per-page information including dimensions, content counts,
#' and visibility state (for presentations).
#' @field number Page number (1-indexed)
#' @field title Page title (usually for presentations)
#' @field dimensions Dimensions in points (PDF) or pixels (images): (width, height)
#' @field image_count Number of images on this page
#' @field table_count Number of tables on this page
#' @field hidden Whether this page is hidden (e.g., in presentations)
#' @field is_blank Whether this page is blank (no meaningful text, no images, no tables)
#' @field has_vector_graphics Whether this page contains non-trivial vector graphics (paths, shapes, curves)
#' @export
PageInfo <- new.env(parent = emptyenv())
#' @export
`$.PageInfo` <- function(self, name) {
  # Resolve the requested member from the PageInfo class environment.
  member <- PageInfo[[name]]
  wants_self <- identical(names(formals(member))[1], "self")
  if (!wants_self) {
    # Not an instance method: hand the looked-up value back untouched.
    return(member)
  }
  # Instance method: supply the receiver as the leading argument.
  function(...) member(self, ...)
}
#' @export
`[[.PageInfo` <- `$.PageInfo`
#' A detected layout region on a page
#'
#' When layout detection is enabled, each page may have layout regions
#' identifying different content types (text, pictures, tables, etc.)
#' with confidence scores and spatial positions.
#' @field class_name Layout class name (e.g. "picture", "table", "text", "section_header").
#' @field confidence Confidence score from the layout detection model (0.0 to 1.0).
#' @field bounding_box Bounding box in document coordinate space.
#' @field area_fraction Fraction of the page area covered by this region (0.0 to 1.0).
#' @export
LayoutRegion <- new.env(parent = emptyenv())
# JSON constructor; passes `json` through to the native
# wrap__LayoutRegion__from_json routine.
LayoutRegion$from_json <- function(json) {
  .Call("wrap__LayoutRegion__from_json", json, PACKAGE = "kreuzberg")
}
#' @export
`$.LayoutRegion` <- function(self, name) {
  # Resolve the requested member from the LayoutRegion class environment.
  member <- LayoutRegion[[name]]
  wants_self <- identical(names(formals(member))[1], "self")
  if (!wants_self) {
    # Not an instance method: hand the looked-up value back untouched.
    return(member)
  }
  # Instance method: supply the receiver as the leading argument.
  function(...) member(self, ...)
}
#' @export
`[[.LayoutRegion` <- `$.LayoutRegion`
#' A text block with hierarchy level assignment
#'
#' Represents a block of text with semantic heading information extracted from
#' font size clustering and hierarchical analysis.
#' @field text The text content of this block
#' @field font_size The font size of the text in this block
#' @field level The hierarchy level of this block (H1-H6 or Body)
#' @field bbox Bounding box information for the block
#' @export
HierarchicalBlock <- new.env(parent = emptyenv())
#' @export
`$.HierarchicalBlock` <- function(self, name) {
  # Resolve the requested member from the HierarchicalBlock class environment.
  member <- HierarchicalBlock[[name]]
  wants_self <- identical(names(formals(member))[1], "self")
  if (!wants_self) {
    # Not an instance method: hand the looked-up value back untouched.
    return(member)
  }
  # Instance method: supply the receiver as the leading argument.
  function(...) member(self, ...)
}
#' @export
`[[.HierarchicalBlock` <- `$.HierarchicalBlock`
#' A single changed cell within a table
#'
#' Defined here (rather than only in `crate::diff`) so `RevisionDelta` can
#' reference it unconditionally, without requiring the `diff` Cargo feature.
#' `crate::diff` re-exports this type verbatim.
#' @field row Zero-based row index.
#' @field col Zero-based column index.
#' @field from Value before the change.
#' @field to Value after the change.
#' @export
CellChange <- new.env(parent = emptyenv())
#' @export
`$.CellChange` <- function(self, name) {
  # Resolve the requested member from the CellChange class environment.
  member <- CellChange[[name]]
  wants_self <- identical(names(formals(member))[1], "self")
  if (!wants_self) {
    # Not an instance method: hand the looked-up value back untouched.
    return(member)
  }
  # Instance method: supply the receiver as the leading argument.
  function(...) member(self, ...)
}
#' @export
`[[.CellChange` <- `$.CellChange`
#' A single tracked change embedded in a document
#'
#' Populated by per-format extractors that understand change-tracking metadata
#' (DOCX `w:ins`/`w:del`/`w:rPrChange`, ODT `text:change-*`, …). Every
#' extractor defaults to `ExtractionResult.revisions = None` until a
#' format-specific implementation is added.
#' @field revision_id Format-specific revision identifier.
#' @field author Display name of the author who made this change, when available.
#' @field timestamp ISO-8601 timestamp of the change, when available.
#' @field kind Semantic kind of this revision.
#' @field anchor Best-effort document location for this revision.
#' @field delta The content changes that make up this revision.
#' @export
DocumentRevision <- new.env(parent = emptyenv())
#' @export
`$.DocumentRevision` <- function(self, name) {
  # Resolve the requested member from the DocumentRevision class environment.
  member <- DocumentRevision[[name]]
  wants_self <- identical(names(formals(member))[1], "self")
  if (!wants_self) {
    # Not an instance method: hand the looked-up value back untouched.
    return(member)
  }
  # Instance method: supply the receiver as the leading argument.
  function(...) member(self, ...)
}
#' @export
`[[.DocumentRevision` <- `$.DocumentRevision`
#' Individual table cell with content and optional styling
#'
#' Future extension point for rich table support with cell-level metadata.
#' @field content Cell content as text
#' @field row_span Row span (number of rows this cell spans)
#' @field col_span Column span (number of columns this cell spans)
#' @field is_header Whether this is a header cell
#' @export
TableCell <- new.env(parent = emptyenv())
#' @export
`$.TableCell` <- function(self, name) {
  # Resolve the requested member from the TableCell class environment.
  member <- TableCell[[name]]
  wants_self <- identical(names(formals(member))[1], "self")
  if (!wants_self) {
    # Not an instance method: hand the looked-up value back untouched.
    return(member)
  }
  # Instance method: supply the receiver as the leading argument.
  function(...) member(self, ...)
}
#' @export
`[[.TableCell` <- `$.TableCell`
#' A URI extracted from a document
#'
#' Represents any link, reference, or resource pointer found during extraction.
#' The `kind` field classifies the URI semantically, while `label` carries
#' optional human-readable display text.
#' @field url The URL or path string.
#' @field label Optional display text / label for the link.
#' @field page Optional page number where the URI was found (1-indexed).
#' @field kind Semantic classification of the URI.
#' @export
ExtractedUri <- new.env(parent = emptyenv())
#' @export
`$.ExtractedUri` <- function(self, name) {
  # Resolve the requested member from the ExtractedUri class environment.
  member <- ExtractedUri[[name]]
  wants_self <- identical(names(formals(member))[1], "self")
  if (!wants_self) {
    # Not an instance method: hand the looked-up value back untouched.
    return(member)
  }
  # Instance method: supply the receiver as the leading argument.
  function(...) member(self, ...)
}
#' @export
`[[.ExtractedUri` <- `$.ExtractedUri`
#' MIME type detection response
#' @field mime_type Detected MIME type
#' @field filename Original filename (if provided)
#' @export
DetectResponse <- new.env(parent = emptyenv())
#' @export
`$.DetectResponse` <- function(self, name) {
  # Resolve the requested member from the DetectResponse class environment.
  member <- DetectResponse[[name]]
  wants_self <- identical(names(formals(member))[1], "self")
  if (!wants_self) {
    # Not an instance method: hand the looked-up value back untouched.
    return(member)
  }
  # Instance method: supply the receiver as the leading argument.
  function(...) member(self, ...)
}
#' @export
`[[.DetectResponse` <- `$.DetectResponse`
#' Options controlling how two `ExtractionResult` values are compared
#' @field include_metadata Include metadata changes in the diff. Default: `true`.
#' @field include_embedded Include embedded-children changes in the diff. Default: `true`.
#' @field max_content_chars Truncate content to this many characters before diffing.
#' @export
DiffOptions <- new.env(parent = emptyenv())
# Default constructor; takes no arguments and calls the native
# wrap__DiffOptions__default routine.
DiffOptions$default <- function() {
  .Call("wrap__DiffOptions__default", PACKAGE = "kreuzberg")
}
# JSON constructor; passes `json` through to the native
# wrap__DiffOptions__from_json routine.
DiffOptions$from_json <- function(json) {
  .Call("wrap__DiffOptions__from_json", json, PACKAGE = "kreuzberg")
}
#' @export
`$.DiffOptions` <- function(self, name) {
  # Resolve the requested member from the DiffOptions class environment.
  member <- DiffOptions[[name]]
  wants_self <- identical(names(formals(member))[1], "self")
  if (!wants_self) {
    # Not an instance method: hand the looked-up value back untouched.
    return(member)
  }
  # Instance method: supply the receiver as the leading argument.
  function(...) member(self, ...)
}
#' @export
`[[.DiffOptions` <- `$.DiffOptions`
#' A single contiguous hunk in a unified diff
#' @field from_line Starting line number in the old content (0-indexed).
#' @field from_count Number of lines from the old content in this hunk.
#' @field to_line Starting line number in the new content (0-indexed).
#' @field to_count Number of lines from the new content in this hunk.
#' @field lines Lines that make up this hunk.
#' @export
DiffHunk <- new.env(parent = emptyenv())
#' @export
`$.DiffHunk` <- function(self, name) {
  # Resolve the requested member from the DiffHunk class environment.
  member <- DiffHunk[[name]]
  wants_self <- identical(names(formals(member))[1], "self")
  if (!wants_self) {
    # Not an instance method: hand the looked-up value back untouched.
    return(member)
  }
  # Instance method: supply the receiver as the leading argument.
  function(...) member(self, ...)
}
#' @export
`[[.DiffHunk` <- `$.DiffHunk`
#' Diff for a single embedded archive entry that appears in both results
#' @field path Archive-relative path identifying this entry.
#' @field diff The recursive diff of the entry's extraction result.
#' @export
EmbeddedDiff <- new.env(parent = emptyenv())
#' @export
`$.EmbeddedDiff` <- function(self, name) {
  # Resolve the requested member from the EmbeddedDiff class environment.
  member <- EmbeddedDiff[[name]]
  wants_self <- identical(names(formals(member))[1], "self")
  if (!wants_self) {
    # Not an instance method: hand the looked-up value back untouched.
    return(member)
  }
  # Instance method: supply the receiver as the leading argument.
  function(...) member(self, ...)
}
#' @export
`[[.EmbeddedDiff` <- `$.EmbeddedDiff`
#' Preset configurations for common RAG use cases
#'
#' Each preset combines chunk size, overlap, and embedding model
#' to provide an optimized configuration for specific scenarios.
#'
#' All string fields are owned `String` for FFI compatibility — instances
#' are safe to clone and pass across language boundaries.
#' @field name name
#' @field chunk_size chunk_size
#' @field overlap overlap
#' @field model_repo HuggingFace repository name for the model.
#' @field pooling Pooling strategy: "cls" or "mean".
#' @field model_file Path to the ONNX model file within the repo.
#' @field dimensions dimensions
#' @field description description
#' @export
EmbeddingPreset <- new.env(parent = emptyenv())
#' @export
`$.EmbeddingPreset` <- function(self, name) {
  # Resolve the requested member from the EmbeddingPreset class environment.
  member <- EmbeddingPreset[[name]]
  wants_self <- identical(names(formals(member))[1], "self")
  if (!wants_self) {
    # Not an instance method: hand the looked-up value back untouched.
    return(member)
  }
  # Instance method: supply the receiver as the leading argument.
  function(...) member(self, ...)
}
#' @export
`[[.EmbeddingPreset` <- `$.EmbeddingPreset`
#' YAKE-specific parameters
#' @field window_size Window size for co-occurrence analysis (default: 2).
#' @export
YakeParams <- new.env(parent = emptyenv())
# Default constructor; takes no arguments and calls the native
# wrap__YakeParams__default routine.
YakeParams$default <- function() {
  .Call("wrap__YakeParams__default", PACKAGE = "kreuzberg")
}
# JSON constructor; passes `json` through to the native
# wrap__YakeParams__from_json routine.
YakeParams$from_json <- function(json) {
  .Call("wrap__YakeParams__from_json", json, PACKAGE = "kreuzberg")
}
#' @export
`$.YakeParams` <- function(self, name) {
  # Resolve the requested member from the YakeParams class environment.
  member <- YakeParams[[name]]
  wants_self <- identical(names(formals(member))[1], "self")
  if (!wants_self) {
    # Not an instance method: hand the looked-up value back untouched.
    return(member)
  }
  # Instance method: supply the receiver as the leading argument.
  function(...) member(self, ...)
}
#' @export
`[[.YakeParams` <- `$.YakeParams`
#' RAKE-specific parameters
#' @field min_word_length Minimum word length to consider (default: 1).
#' @field max_words_per_phrase Maximum words in a keyword phrase (default: 3).
#' @export
RakeParams <- new.env(parent = emptyenv())
# Default constructor; takes no arguments and calls the native
# wrap__RakeParams__default routine.
RakeParams$default <- function() {
  .Call("wrap__RakeParams__default", PACKAGE = "kreuzberg")
}
# JSON constructor; passes `json` through to the native
# wrap__RakeParams__from_json routine.
RakeParams$from_json <- function(json) {
  .Call("wrap__RakeParams__from_json", json, PACKAGE = "kreuzberg")
}
#' @export
`$.RakeParams` <- function(self, name) {
  # Resolve the requested member from the RakeParams class environment.
  member <- RakeParams[[name]]
  wants_self <- identical(names(formals(member))[1], "self")
  if (!wants_self) {
    # Not an instance method: hand the looked-up value back untouched.
    return(member)
  }
  # Instance method: supply the receiver as the leading argument.
  function(...) member(self, ...)
}
#' @export
`[[.RakeParams` <- `$.RakeParams`
#' Keyword extraction configuration
#' @field algorithm Algorithm to use for extraction.
#' @field max_keywords Maximum number of keywords to extract (default: 10).
#' @field min_score Minimum score threshold (0.0-1.0, default: 0.0).
#' @field ngram_range N-gram range for keyword extraction (min, max).
#' @field language Language code for stopword filtering (e.g., "en", "de", "fr").
#' @field yake_params YAKE-specific tuning parameters.
#' @field rake_params RAKE-specific tuning parameters.
#' @export
KeywordConfig <- new.env(parent = emptyenv())
# Default constructor; takes no arguments and calls the native
# wrap__KeywordConfig__default routine.
KeywordConfig$default <- function() {
  .Call("wrap__KeywordConfig__default", PACKAGE = "kreuzberg")
}
# JSON constructor; passes `json` through to the native
# wrap__KeywordConfig__from_json routine.
KeywordConfig$from_json <- function(json) {
  .Call("wrap__KeywordConfig__from_json", json, PACKAGE = "kreuzberg")
}
#' @export
`$.KeywordConfig` <- function(self, name) {
  # Resolve the requested member from the KeywordConfig class environment.
  member <- KeywordConfig[[name]]
  wants_self <- identical(names(formals(member))[1], "self")
  if (!wants_self) {
    # Not an instance method: hand the looked-up value back untouched.
    return(member)
  }
  # Instance method: supply the receiver as the leading argument.
  function(...) member(self, ...)
}
#' @export
`[[.KeywordConfig` <- `$.KeywordConfig`
#' Extracted keyword with metadata
#' @field text The keyword text.
#' @field score Relevance score (higher is better, algorithm-specific range).
#' @field algorithm Algorithm that extracted this keyword.
#' @field positions Optional positions where keyword appears in text (character offsets).
#' @export
Keyword <- new.env(parent = emptyenv())
#' @export
`$.Keyword` <- function(self, name) {
  # Resolve the requested member from the Keyword class environment.
  member <- Keyword[[name]]
  wants_self <- identical(names(formals(member))[1], "self")
  if (!wants_self) {
    # Not an instance method: hand the looked-up value back untouched.
    return(member)
  }
  # Instance method: supply the receiver as the leading argument.
  function(...) member(self, ...)
}
#' @export
`[[.Keyword` <- `$.Keyword`
#' Configuration for PaddleOCR backend
#'
#' Configures PaddleOCR text detection and recognition with multi-language support.
#' Uses a builder pattern for convenient configuration.
#' @field language Language code (e.g., "en", "ch", "jpn", "kor", "deu", "fra")
#' @field cache_dir Optional custom cache directory for model files
#' @field use_angle_cls Enable angle classification for rotated text (default: false). Can misfire on short text
#' @field enable_table_detection Enable table structure detection (default: false)
#' @field det_db_thresh Database threshold for text detection (default: 0.3) Range: 0.0-1.0, higher values require more
#' @field det_db_box_thresh Box threshold for text bounding box refinement (default: 0.5) Range: 0.0-1.0
#' @field det_db_unclip_ratio Unclip ratio for expanding text bounding boxes (default: 1.6) Controls the expansion of
#' @field det_limit_side_len Maximum side length for detection image (default: 960) Larger images may be resized to
#' @field rec_batch_num Batch size for recognition inference (default: 6) Number of text regions to process
#' @field padding Padding in pixels added around the image before detection (default: 10). Large values can include
#' @field drop_score Minimum recognition confidence score for text lines (default: 0.5). Text regions with recognition
#' @field model_tier Model tier controlling detection/recognition model size and accuracy trade-off. - `"mobile"`
#' @export
PaddleOcrConfig <- new.env(parent = emptyenv())
# Builder-style members: each takes the receiver as `self` plus one setting
# and forwards both to the native routine of the same name.
PaddleOcrConfig$with_cache_dir <- function(self, path) {
  .Call(
    "wrap__PaddleOcrConfig__with_cache_dir", self, path,
    PACKAGE = "kreuzberg"
  )
}
PaddleOcrConfig$with_table_detection <- function(self, enable) {
  .Call(
    "wrap__PaddleOcrConfig__with_table_detection", self, enable,
    PACKAGE = "kreuzberg"
  )
}
PaddleOcrConfig$with_angle_cls <- function(self, enable) {
  .Call(
    "wrap__PaddleOcrConfig__with_angle_cls", self, enable,
    PACKAGE = "kreuzberg"
  )
}
PaddleOcrConfig$with_det_db_thresh <- function(self, threshold) {
  .Call(
    "wrap__PaddleOcrConfig__with_det_db_thresh", self, threshold,
    PACKAGE = "kreuzberg"
  )
}
PaddleOcrConfig$with_det_db_box_thresh <- function(self, threshold) {
  .Call(
    "wrap__PaddleOcrConfig__with_det_db_box_thresh", self, threshold,
    PACKAGE = "kreuzberg"
  )
}
PaddleOcrConfig$with_det_db_unclip_ratio <- function(self, ratio) {
  .Call(
    "wrap__PaddleOcrConfig__with_det_db_unclip_ratio", self, ratio,
    PACKAGE = "kreuzberg"
  )
}
PaddleOcrConfig$with_det_limit_side_len <- function(self, length) {
  .Call(
    "wrap__PaddleOcrConfig__with_det_limit_side_len", self, length,
    PACKAGE = "kreuzberg"
  )
}
PaddleOcrConfig$with_rec_batch_num <- function(self, batch_size) {
  .Call(
    "wrap__PaddleOcrConfig__with_rec_batch_num", self, batch_size,
    PACKAGE = "kreuzberg"
  )
}
PaddleOcrConfig$with_drop_score <- function(self, score) {
  .Call(
    "wrap__PaddleOcrConfig__with_drop_score", self, score,
    PACKAGE = "kreuzberg"
  )
}
PaddleOcrConfig$with_padding <- function(self, padding) {
  .Call(
    "wrap__PaddleOcrConfig__with_padding", self, padding,
    PACKAGE = "kreuzberg"
  )
}
PaddleOcrConfig$with_model_tier <- function(self, tier) {
  .Call(
    "wrap__PaddleOcrConfig__with_model_tier", self, tier,
    PACKAGE = "kreuzberg"
  )
}
# Default constructor; takes no arguments and calls the native
# wrap__PaddleOcrConfig__default routine.
PaddleOcrConfig$default <- function() {
  .Call("wrap__PaddleOcrConfig__default", PACKAGE = "kreuzberg")
}
# JSON constructor; passes `json` through to the native
# wrap__PaddleOcrConfig__from_json routine.
PaddleOcrConfig$from_json <- function(json) {
  .Call("wrap__PaddleOcrConfig__from_json", json, PACKAGE = "kreuzberg")
}
#' @export
`$.PaddleOcrConfig` <- function(self, name) {
  # Resolve the requested member from the PaddleOcrConfig class environment.
  member <- PaddleOcrConfig[[name]]
  wants_self <- identical(names(formals(member))[1], "self")
  if (!wants_self) {
    # Not an instance method: hand the looked-up value back untouched.
    return(member)
  }
  # Instance method: supply the receiver as the leading argument.
  function(...) member(self, ...)
}
#' @export
`[[.PaddleOcrConfig` <- `$.PaddleOcrConfig`
#' @export
with_cache_dir.PaddleOcrConfig <- function(x, ...) {
  # S3 forwarder: route the call through the `$` dispatcher on `x`.
  x$with_cache_dir(...)
}
#' @export
with_table_detection.PaddleOcrConfig <- function(x, ...) {
  # S3 forwarder: route the call through the `$` dispatcher on `x`.
  x$with_table_detection(...)
}
#' @export
with_angle_cls.PaddleOcrConfig <- function(x, ...) {
  # S3 forwarder: route the call through the `$` dispatcher on `x`.
  x$with_angle_cls(...)
}
#' @export
with_det_db_thresh.PaddleOcrConfig <- function(x, ...) {
  # S3 forwarder: route the call through the `$` dispatcher on `x`.
  x$with_det_db_thresh(...)
}
#' @export
with_det_db_box_thresh.PaddleOcrConfig <- function(x, ...) {
  # S3 forwarder: route the call through the `$` dispatcher on `x`.
  x$with_det_db_box_thresh(...)
}
#' @export
with_det_db_unclip_ratio.PaddleOcrConfig <- function(x, ...) {
  # S3 forwarder: route the call through the `$` dispatcher on `x`.
  x$with_det_db_unclip_ratio(...)
}
#' @export
with_det_limit_side_len.PaddleOcrConfig <- function(x, ...) {
  # S3 forwarder: route the call through the `$` dispatcher on `x`.
  x$with_det_limit_side_len(...)
}
#' @export
with_rec_batch_num.PaddleOcrConfig <- function(x, ...) {
  # S3 forwarder: route the call through the `$` dispatcher on `x`.
  x$with_rec_batch_num(...)
}
#' @export
with_drop_score.PaddleOcrConfig <- function(x, ...) {
  # S3 forwarder: route the call through the `$` dispatcher on `x`.
  x$with_drop_score(...)
}
#' @export
with_padding.PaddleOcrConfig <- function(x, ...) {
  # S3 forwarder: route the call through the `$` dispatcher on `x`.
  x$with_padding(...)
}
#' @export
with_model_tier.PaddleOcrConfig <- function(x, ...) {
  # S3 forwarder: route the call through the `$` dispatcher on `x`.
  x$with_model_tier(...)
}
#' Combined paths to all models needed for OCR (backward compatibility)
#' @field det_model Path to the detection model directory.
#' @field cls_model Path to the classification model directory.
#' @field rec_model Path to the recognition model directory.
#' @field dict_file Path to the character dictionary file.
#' @export
ModelPaths <- new.env(parent = emptyenv())
#' @export
`$.ModelPaths` <- function(self, name) {
  # Resolve the requested member from the ModelPaths class environment.
  member <- ModelPaths[[name]]
  wants_self <- identical(names(formals(member))[1], "self")
  if (!wants_self) {
    # Not an instance method: hand the looked-up value back untouched.
    return(member)
  }
  # Instance method: supply the receiver as the leading argument.
  function(...) member(self, ...)
}
#' @export
`[[.ModelPaths` <- `$.ModelPaths`
#' Document orientation detection result
#' @field degrees Detected orientation in degrees (0, 90, 180, or 270).
#' @field confidence Confidence score (0.0-1.0).
#' @export
OrientationResult <- new.env(parent = emptyenv())
#' @export
`$.OrientationResult` <- function(self, name) {
  # Resolve the requested member from the OrientationResult class environment.
  member <- OrientationResult[[name]]
  wants_self <- identical(names(formals(member))[1], "self")
  if (!wants_self) {
    # Not an instance method: hand the looked-up value back untouched.
    return(member)
  }
  # Instance method: supply the receiver as the leading argument.
  function(...) member(self, ...)
}
#' @export
`[[.OrientationResult` <- `$.OrientationResult`
#' Bounding box in original image coordinates (x1, y1) top-left, (x2, y2) bottom-right
#' @field x1 x1
#' @field y1 y1
#' @field x2 x2
#' @field y2 y2
#' @export
BBox <- new.env(parent = emptyenv())
#' @export
`$.BBox` <- function(self, name) {
  # Resolve the requested member from the BBox class environment.
  member <- BBox[[name]]
  wants_self <- identical(names(formals(member))[1], "self")
  if (!wants_self) {
    # Not an instance method: hand the looked-up value back untouched.
    return(member)
  }
  # Instance method: supply the receiver as the leading argument.
  function(...) member(self, ...)
}
#' @export
`[[.BBox` <- `$.BBox`
#' A single layout detection result
#' @field class_name class_name
#' @field confidence confidence
#' @field bbox bbox
#' @export
LayoutDetection <- new.env(parent = emptyenv())
#' @export
`$.LayoutDetection` <- function(self, name) {
  # Resolve the requested member from the LayoutDetection class environment.
  member <- LayoutDetection[[name]]
  wants_self <- identical(names(formals(member))[1], "self")
  if (!wants_self) {
    # Not an instance method: hand the looked-up value back untouched.
    return(member)
  }
  # Instance method: supply the receiver as the leading argument.
  function(...) member(self, ...)
}
#' @export
`[[.LayoutDetection` <- `$.LayoutDetection`
#' Embedded file descriptor extracted from the PDF name tree
#' @field name The filename as stored in the PDF name tree.
#' @field data Raw file bytes from the embedded stream (already decompressed by lopdf).
#' @field compressed_size Compressed byte count of the original stream (before decompression).
#' @field mime_type MIME type if specified in the filespec, otherwise `None`.
#' @export
EmbeddedFile <- new.env(parent = emptyenv())
#' @export
`$.EmbeddedFile` <- function(self, name) {
  # Resolve the requested member from the EmbeddedFile class environment.
  member <- EmbeddedFile[[name]]
  wants_self <- identical(names(formals(member))[1], "self")
  if (!wants_self) {
    # Not an instance method: hand the looked-up value back untouched.
    return(member)
  }
  # Instance method: supply the receiver as the leading argument.
  function(...) member(self, ...)
}
#' @export
`[[.EmbeddedFile` <- `$.EmbeddedFile`
#' PDF-specific metadata
#'
#' Contains metadata fields specific to PDF documents that are not in the common
#' `Metadata` structure. Common fields like title, authors, keywords, and dates
#' are at the `Metadata` level.
#' @field pdf_version PDF version (e.g., "1.7", "2.0")
#' @field producer PDF producer (application that created the PDF)
#' @field is_encrypted Whether the PDF is encrypted/password-protected
#' @field width First page width in points (1/72 inch)
#' @field height First page height in points (1/72 inch)
#' @field page_count Total number of pages in the PDF document
#' @export
PdfMetadata <- new.env(parent = emptyenv())
# JSON constructor; passes `json` through to the native
# wrap__PdfMetadata__from_json routine.
PdfMetadata$from_json <- function(json) {
  .Call("wrap__PdfMetadata__from_json", json, PACKAGE = "kreuzberg")
}
#' @export
`$.PdfMetadata` <- function(self, name) {
  # Resolve the requested member from the PdfMetadata class environment.
  member <- PdfMetadata[[name]]
  wants_self <- identical(names(formals(member))[1], "self")
  if (!wants_self) {
    # Not an instance method: hand the looked-up value back untouched.
    return(member)
  }
  # Instance method: supply the receiver as the leading argument.
  function(...) member(self, ...)
}
#' @export
`[[.PdfMetadata` <- `$.PdfMetadata`
#' Output format for extraction results
#'
#' Controls the format of the `content` field in `ExtractionResult`.
#' When set to `Markdown`, `Djot`, or `Html`, the output uses that format.
#' `Plain` returns the raw extracted text.
#' `Structured` returns JSON with full OCR element data including bounding
#' boxes and confidence scores.
#' @field Plain Plain text content only (default)
#' @field Markdown Markdown format
#' @field Djot Djot markup format
#' @field Html HTML format
#' @field Json JSON tree format with heading-driven sections.
#' @field Structured Structured JSON format with full OCR element metadata.
#' @field Custom Custom renderer registered via the RendererRegistry. The string is the renderer name (e.g., "docx",
#' @export
OutputFormat <- new.env(parent = emptyenv())
#' @export
`$.OutputFormat` <- function(self, name) {
  # Resolve the requested member from the OutputFormat class environment.
  member <- OutputFormat[[name]]
  wants_self <- identical(names(formals(member))[1], "self")
  if (!wants_self) {
    # Not an instance method: hand the looked-up value back untouched.
    return(member)
  }
  # Instance method: supply the receiver as the leading argument.
  function(...) member(self, ...)
}
#' @export
`[[.OutputFormat` <- `$.OutputFormat`
#' Format-specific metadata (discriminated union)
#'
#' Only one format type can exist per extraction result. This provides
#' type-safe, clean metadata without nested optionals.
#' @field Pdf Pdf
#' @field Docx Docx
#' @field Excel Excel
#' @field Email Email
#' @field Pptx Pptx
#' @field Archive Archive
#' @field Image Image
#' @field Xml Xml
#' @field Text Text
#' @field Html Html
#' @field Ocr Ocr
#' @field Csv Csv
#' @field Bibtex Bibtex
#' @field Citation Citation
#' @field FictionBook FictionBook
#' @field Dbf Dbf
#' @field Jats Jats
#' @field Epub Epub
#' @field Pst Pst
#' @field Code Code
#' @export
FormatMetadata <- new.env(parent = emptyenv())
#' @export
`$.FormatMetadata` <- function(self, name) {
  # Resolve the requested member from the FormatMetadata class environment.
  member <- FormatMetadata[[name]]
  wants_self <- identical(names(formals(member))[1], "self")
  if (!wants_self) {
    # Not an instance method: hand the looked-up value back untouched.
    return(member)
  }
  # Instance method: supply the receiver as the leading argument.
  function(...) member(self, ...)
}
#' @export
`[[.FormatMetadata` <- `$.FormatMetadata`
#' A single line in a unified-diff hunk
#'
#' Defined here (rather than only in `crate::diff`) so `RevisionDelta` can
#' reference it unconditionally, without requiring the `diff` Cargo feature.
#' `crate::diff` re-exports this type verbatim.
#' @field Context Unchanged context line.
#' @field Added Line added in the "after" version.
#' @field Removed Line removed from the "before" version.
#' @export
DiffLine <- new.env(parent = emptyenv())
#' @export
`$.DiffLine` <- function(self, name) {
  # Resolve the requested member from the DiffLine class environment.
  member <- DiffLine[[name]]
  wants_self <- identical(names(formals(member))[1], "self")
  if (!wants_self) {
    # Not an instance method: hand the looked-up value back untouched.
    return(member)
  }
  # Instance method: supply the receiver as the leading argument.
  function(...) member(self, ...)
}
#' @export
`[[.DiffLine` <- `$.DiffLine`
#' Create an enum value of class `ExecutionProviderType`
#'
#' Returns an empty list tagged with the S3 class `ExecutionProviderType`;
#' the value itself stores no variant name.
#'
#' @return An empty list with class `"ExecutionProviderType"`.
#' @export
ExecutionProviderType <- function() {
  structure(list(), class = "ExecutionProviderType")
}
#' Create an enum value of class `HtmlTheme`
#'
#' Returns an empty list tagged with the S3 class `HtmlTheme`;
#' the value itself stores no variant name.
#'
#' @return An empty list with class `"HtmlTheme"`.
#' @export
HtmlTheme <- function() {
  structure(list(), class = "HtmlTheme")
}
#' Create an enum value of class `TableModel`
#'
#' Returns an empty list tagged with the S3 class `TableModel`;
#' the value itself stores no variant name.
#'
#' @return An empty list with class `"TableModel"`.
#' @export
TableModel <- function() {
  structure(list(), class = "TableModel")
}
#' Create an enum value of class `ChunkerType`
#'
#' Returns an empty list tagged with the S3 class `ChunkerType`;
#' the value itself stores no variant name.
#'
#' @return An empty list with class `"ChunkerType"`.
#' @export
ChunkerType <- function() {
  structure(list(), class = "ChunkerType")
}
#' Create an enum value of class `CodeContentMode`
#'
#' Returns an empty list tagged with the S3 class `CodeContentMode`;
#' the value itself stores no variant name.
#'
#' @return An empty list with class `"CodeContentMode"`.
#' @export
CodeContentMode <- function() {
  structure(list(), class = "CodeContentMode")
}
#' Create an enum value of class `ListType`
#'
#' Returns an empty list tagged with the S3 class `ListType`;
#' the value itself stores no variant name.
#'
#' @return An empty list with class `"ListType"`.
#' @export
ListType <- function() {
  structure(list(), class = "ListType")
}
#' Create an enum value of class `OcrBackendType`
#'
#' Returns an empty list tagged with the S3 class `OcrBackendType`;
#' the value itself stores no variant name.
#'
#' @return An empty list with class `"OcrBackendType"`.
#' @export
OcrBackendType <- function() {
  structure(list(), class = "OcrBackendType")
}
#' Create an enum value of class `ProcessingStage`
#'
#' Returns an empty list tagged with the S3 class `ProcessingStage`;
#' the value itself stores no variant name.
#'
#' @return An empty list with class `"ProcessingStage"`.
#' @export
ProcessingStage <- function() {
  structure(list(), class = "ProcessingStage")
}
#' Create an enum value of class `ReductionLevel`
#'
#' Returns an empty list tagged with the S3 class `ReductionLevel`;
#' the value itself stores no variant name.
#'
#' @return An empty list with class `"ReductionLevel"`.
#' @export
ReductionLevel <- function() {
  structure(list(), class = "ReductionLevel")
}
#' Create an enum value of class `PdfAnnotationType`
#'
#' Returns an empty list tagged with the S3 class `PdfAnnotationType`;
#' the value itself stores no variant name.
#'
#' @return An empty list with class `"PdfAnnotationType"`.
#' @export
PdfAnnotationType <- function() {
  structure(list(), class = "PdfAnnotationType")
}
#' Create an enum value of class `BlockType`
#'
#' Returns an empty list tagged with the S3 class `BlockType`;
#' the value itself stores no variant name.
#'
#' @return An empty list with class `"BlockType"`.
#' @export
BlockType <- function() {
  structure(list(), class = "BlockType")
}
#' Create an enum value of class `InlineType`
#'
#' Returns an empty list tagged with the S3 class `InlineType`;
#' the value itself stores no variant name.
#'
#' @return An empty list with class `"InlineType"`.
#' @export
InlineType <- function() {
  structure(list(), class = "InlineType")
}
#' Create an enum value of class `RelationshipKind`
#'
#' Returns an empty list tagged with the S3 class `RelationshipKind`;
#' the value itself stores no variant name.
#'
#' @return An empty list with class `"RelationshipKind"`.
#' @export
RelationshipKind <- function() {
  structure(list(), class = "RelationshipKind")
}
#' Create an enum value of class `ContentLayer`
#'
#' Returns an empty list tagged with the S3 class `ContentLayer`;
#' the value itself stores no variant name.
#'
#' @return An empty list with class `"ContentLayer"`.
#' @export
ContentLayer <- function() {
  structure(list(), class = "ContentLayer")
}
#' Create an enum value of class `ExtractionMethod`
#'
#' Returns an empty list tagged with the S3 class `ExtractionMethod`;
#' the value itself stores no variant name.
#'
#' @return An empty list with class `"ExtractionMethod"`.
#' @export
ExtractionMethod <- function() {
  structure(list(), class = "ExtractionMethod")
}
#' Create an enum value of class `ChunkType`
#'
#' Returns an empty list tagged with the S3 class `ChunkType`;
#' the value itself stores no variant name.
#'
#' @return An empty list with class `"ChunkType"`.
#' @export
ChunkType <- function() {
  structure(list(), class = "ChunkType")
}
#' Create an enum value of class `ImageKind`
#'
#' Returns an empty list tagged with the S3 class `ImageKind`;
#' the value itself stores no variant name.
#'
#' @return An empty list with class `"ImageKind"`.
#' @export
ImageKind <- function() {
  structure(list(), class = "ImageKind")
}
#' Create an enum value of class `ResultFormat`
#'
#' Returns an empty list tagged with the S3 class `ResultFormat`;
#' the value itself stores no variant name.
#'
#' @return An empty list with class `"ResultFormat"`.
#' @export
ResultFormat <- function() {
  structure(list(), class = "ResultFormat")
}
#' Create an enum value of class `ElementType`
#'
#' Returns an empty list tagged with the S3 class `ElementType`;
#' the value itself stores no variant name.
#'
#' @return An empty list with class `"ElementType"`.
#' @export
ElementType <- function() {
  structure(list(), class = "ElementType")
}
#' Create an enum value of class `TextDirection`
#'
#' Returns an empty list tagged with the S3 class `TextDirection`;
#' the value itself stores no variant name.
#'
#' @return An empty list with class `"TextDirection"`.
#' @export
TextDirection <- function() {
  structure(list(), class = "TextDirection")
}
#' Create an enum value of class `LinkType`
#'
#' Returns an empty list tagged with the S3 class `LinkType`;
#' the value itself stores no variant name.
#'
#' @return An empty list with class `"LinkType"`.
#' @export
LinkType <- function() {
  structure(list(), class = "LinkType")
}
#' Create an enum value of class `ImageType`
#'
#' Returns an empty list tagged with the S3 class `ImageType`;
#' the value itself stores no variant name.
#'
#' @return An empty list with class `"ImageType"`.
#' @export
ImageType <- function() {
  structure(list(), class = "ImageType")
}
#' Create an enum value of class `StructuredDataType`
#'
#' Returns an empty list tagged with the S3 class `StructuredDataType`;
#' the value itself stores no variant name.
#'
#' @return An empty list with class `"StructuredDataType"`.
#' @export
StructuredDataType <- function() {
  structure(list(), class = "StructuredDataType")
}
#' Create an enum value of class `OcrElementLevel`
#'
#' Returns an empty list tagged with the S3 class `OcrElementLevel`;
#' the value itself stores no variant name.
#'
#' @return An empty list with class `"OcrElementLevel"`.
#' @export
OcrElementLevel <- function() {
  structure(list(), class = "OcrElementLevel")
}
#' Create an enum value of class `PageUnitType`
#'
#' Returns an empty list tagged with the S3 class `PageUnitType`;
#' the value itself stores no variant name.
#'
#' @return An empty list with class `"PageUnitType"`.
#' @export
PageUnitType <- function() {
  structure(list(), class = "PageUnitType")
}
#' Create an enum value of class `RevisionKind`
#'
#' Returns an empty list tagged with the S3 class `RevisionKind`;
#' the value itself stores no variant name.
#'
#' @return An empty list with class `"RevisionKind"`.
#' @export
RevisionKind <- function() {
  structure(list(), class = "RevisionKind")
}
#' Create an enum value of class `UriKind`
#'
#' Returns an empty list tagged with the S3 class `UriKind`;
#' the value itself stores no variant name.
#'
#' @return An empty list with class `"UriKind"`.
#' @export
UriKind <- function() {
  structure(list(), class = "UriKind")
}
#' Create an enum value of class `KeywordAlgorithm`
#'
#' Returns an empty list tagged with the S3 class `KeywordAlgorithm`;
#' the value itself stores no variant name.
#'
#' @return An empty list with class `"KeywordAlgorithm"`.
#' @export
KeywordAlgorithm <- function() {
  structure(list(), class = "KeywordAlgorithm")
}
#' Create an enum value of class `PSMMode`
#'
#' Returns an empty list tagged with the S3 class `PSMMode`;
#' the value itself stores no variant name.
#'
#' @return An empty list with class `"PSMMode"`.
#' @export
PSMMode <- function() {
  structure(list(), class = "PSMMode")
}
#' Create an enum value of class `PaddleLanguage`
#'
#' Returns an empty list tagged with the S3 class `PaddleLanguage`;
#' the value itself stores no variant name.
#'
#' @return An empty list with class `"PaddleLanguage"`.
#' @export
PaddleLanguage <- function() {
  structure(list(), class = "PaddleLanguage")
}
#' Create an enum value of class `LayoutClass`
#'
#' Returns an empty list tagged with the S3 class `LayoutClass`;
#' the value itself stores no variant name.
#'
#' @return An empty list with class `"LayoutClass"`.
#' @export
LayoutClass <- function() {
  structure(list(), class = "LayoutClass")
}
#' How chunk size is measured
#'
#' Defaults to `Characters` (Unicode character count). When using token-based sizing,
#' chunks are sized by token count according to the specified tokenizer.
#'
#' Token-based sizing uses HuggingFace tokenizers loaded at runtime. Any tokenizer
#' available on HuggingFace Hub can be used, including OpenAI-compatible tokenizers
#' (e.g., `Xenova/gpt-4o`, `Xenova/cl100k_base`).
#' @export
ChunkSizing <- new.env(parent = emptyenv())
# Default constructor; takes no arguments and calls the native
# wrap__ChunkSizing__default routine.
ChunkSizing$default <- function() {
  .Call("wrap__ChunkSizing__default", PACKAGE = "kreuzberg")
}
# JSON constructor; passes `json` through to the native
# wrap__ChunkSizing__from_json routine.
ChunkSizing$from_json <- function(json) {
  .Call("wrap__ChunkSizing__from_json", json, PACKAGE = "kreuzberg")
}
#' @export
`$.ChunkSizing` <- function(self, name) {
  # Resolve the requested member from the ChunkSizing class environment.
  member <- ChunkSizing[[name]]
  wants_self <- identical(names(formals(member))[1], "self")
  if (!wants_self) {
    # Not an instance method: hand the looked-up value back untouched.
    return(member)
  }
  # Instance method: supply the receiver as the leading argument.
  function(...) member(self, ...)
}
#' @export
`[[.ChunkSizing` <- `$.ChunkSizing`
#' Embedding model kinds that Kreuzberg supports
#' @export
EmbeddingModelType <- new.env(parent = emptyenv())
EmbeddingModelType$default <- function() {
  .Call("wrap__EmbeddingModelType__default", PACKAGE = "kreuzberg")
}
EmbeddingModelType$from_json <- function(json) {
  .Call("wrap__EmbeddingModelType__from_json", json, PACKAGE = "kreuzberg")
}
#' @export
`$.EmbeddingModelType` <- function(self, name) {
  # Look the member up in the class environment; an unknown name yields NULL.
  member <- EmbeddingModelType[[name]]
  # Only members whose first formal is `self` get the receiver bound;
  # everything else is handed back untouched.
  if (!identical(names(formals(member))[1], "self")) {
    return(member)
  }
  function(...) member(self, ...)
}
#' @export
`[[.EmbeddingModelType` <- `$.EmbeddingModelType`
#' Tagged enum describing node content, where every variant holds only its
#' type-specific data
#'
#' Serialized with `#[serde(tag = "node_type")]` so that the tag does not
#' collide with the "type" keyword in Go/Java/TypeScript bindings.
#' @export
NodeContent <- new.env(parent = emptyenv())
NodeContent$default <- function() {
  .Call("wrap__NodeContent__default", PACKAGE = "kreuzberg")
}
NodeContent$from_json <- function(json) {
  .Call("wrap__NodeContent__from_json", json, PACKAGE = "kreuzberg")
}
#' @export
`$.NodeContent` <- function(self, name) {
  # Look the member up in the class environment; an unknown name yields NULL.
  member <- NodeContent[[name]]
  # Only members whose first formal is `self` get the receiver bound;
  # everything else is handed back untouched.
  if (!identical(names(formals(member))[1], "self")) {
    return(member)
  }
  function(...) member(self, ...)
}
#' @export
`[[.NodeContent` <- `$.NodeContent`
#' Kinds of inline text annotations
#' @export
AnnotationKind <- new.env(parent = emptyenv())
AnnotationKind$default <- function() {
  .Call("wrap__AnnotationKind__default", PACKAGE = "kreuzberg")
}
AnnotationKind$from_json <- function(json) {
  .Call("wrap__AnnotationKind__from_json", json, PACKAGE = "kreuzberg")
}
#' @export
`$.AnnotationKind` <- function(self, name) {
  # Look the member up in the class environment; an unknown name yields NULL.
  member <- AnnotationKind[[name]]
  # Only members whose first formal is `self` get the receiver bound;
  # everything else is handed back untouched.
  if (!identical(names(formals(member))[1], "self")) {
    return(member)
  }
  function(...) member(self, ...)
}
#' @export
`[[.AnnotationKind` <- `$.AnnotationKind`
#' Geometry bounding an OCR element
#'
#' Covers axis-aligned rectangles (from Tesseract) as well as 4-point
#' quadrilaterals (from PaddleOCR and rotated text detection).
#' @export
OcrBoundingGeometry <- new.env(parent = emptyenv())
OcrBoundingGeometry$default <- function() {
  .Call("wrap__OcrBoundingGeometry__default", PACKAGE = "kreuzberg")
}
OcrBoundingGeometry$from_json <- function(json) {
  .Call("wrap__OcrBoundingGeometry__from_json", json, PACKAGE = "kreuzberg")
}
#' @export
`$.OcrBoundingGeometry` <- function(self, name) {
  # Look the member up in the class environment; an unknown name yields NULL.
  member <- OcrBoundingGeometry[[name]]
  # Only members whose first formal is `self` get the receiver bound;
  # everything else is handed back untouched.
  if (!identical(names(formals(member))[1], "self")) {
    return(member)
  }
  function(...) member(self, ...)
}
#' @export
`[[.OcrBoundingGeometry` <- `$.OcrBoundingGeometry`
#' Best-effort location of a revision within the document
#' @export
RevisionAnchor <- new.env(parent = emptyenv())
RevisionAnchor$default <- function() {
  .Call("wrap__RevisionAnchor__default", PACKAGE = "kreuzberg")
}
RevisionAnchor$from_json <- function(json) {
  .Call("wrap__RevisionAnchor__from_json", json, PACKAGE = "kreuzberg")
}
#' @export
`$.RevisionAnchor` <- function(self, name) {
  # Look the member up in the class environment; an unknown name yields NULL.
  member <- RevisionAnchor[[name]]
  # Only members whose first formal is `self` get the receiver bound;
  # everything else is handed back untouched.
  if (!identical(names(formals(member))[1], "self")) {
    return(member)
  }
  function(...) member(self, ...)
}
#' @export
`[[.RevisionAnchor` <- `$.RevisionAnchor`
#' S3 generic `cors_allows_all`
#'
#' Dispatches to the `cors_allows_all` method defined for the class of `x`.
#'
#' @param x Object whose class selects the method.
#' @param ... Further arguments handed on to the selected method.
#' @return Whatever the selected method returns.
#' @export
cors_allows_all <- function(x, ...) {
  UseMethod("cors_allows_all")
}
#' S3 generic `is_empty`
#'
#' Dispatches to the `is_empty` method defined for the class of `x`.
#'
#' @param x Object whose class selects the method.
#' @param ... Further arguments handed on to the selected method.
#' @return Whatever the selected method returns.
#' @export
is_empty <- function(x, ...) {
  UseMethod("is_empty")
}
#' S3 generic `is_origin_allowed`
#'
#' Dispatches to the `is_origin_allowed` method defined for the class of `x`.
#'
#' @param x Object whose class selects the method.
#' @param ... Further arguments handed on to the selected method.
#' @return Whatever the selected method returns.
#' @export
is_origin_allowed <- function(x, ...) {
  UseMethod("is_origin_allowed")
}
#' S3 generic `listen_addr`
#'
#' Dispatches to the `listen_addr` method defined for the class of `x`.
#'
#' @param x Object whose class selects the method.
#' @param ... Further arguments handed on to the selected method.
#' @return Whatever the selected method returns.
#' @export
listen_addr <- function(x, ...) {
  UseMethod("listen_addr")
}
#' S3 generic `max_multipart_field_mb`
#'
#' Dispatches to the `max_multipart_field_mb` method defined for the class of
#' `x`.
#'
#' @param x Object whose class selects the method.
#' @param ... Further arguments handed on to the selected method.
#' @return Whatever the selected method returns.
#' @export
max_multipart_field_mb <- function(x, ...) {
  UseMethod("max_multipart_field_mb")
}
#' S3 generic `max_request_body_mb`
#'
#' Dispatches to the `max_request_body_mb` method defined for the class of `x`.
#'
#' @param x Object whose class selects the method.
#' @param ... Further arguments handed on to the selected method.
#' @return Whatever the selected method returns.
#' @export
max_request_body_mb <- function(x, ...) {
  UseMethod("max_request_body_mb")
}
#' S3 generic `needs_image_processing`
#'
#' Dispatches to the `needs_image_processing` method defined for the class of
#' `x`.
#'
#' @param x Object whose class selects the method.
#' @param ... Further arguments handed on to the selected method.
#' @return Whatever the selected method returns.
#' @export
needs_image_processing <- function(x, ...) {
  UseMethod("needs_image_processing")
}
#' S3 generic `with_angle_cls`
#'
#' Dispatches to the `with_angle_cls` method defined for the class of `x`.
#'
#' @param x Object whose class selects the method.
#' @param ... Further arguments handed on to the selected method.
#' @return Whatever the selected method returns.
#' @export
with_angle_cls <- function(x, ...) {
  UseMethod("with_angle_cls")
}
#' S3 generic `with_cache_dir`
#'
#' Dispatches to the `with_cache_dir` method defined for the class of `x`.
#'
#' @param x Object whose class selects the method.
#' @param ... Further arguments handed on to the selected method.
#' @return Whatever the selected method returns.
#' @export
with_cache_dir <- function(x, ...) {
  UseMethod("with_cache_dir")
}
#' S3 generic `with_det_db_box_thresh`
#'
#' Dispatches to the `with_det_db_box_thresh` method defined for the class of
#' `x`.
#'
#' @param x Object whose class selects the method.
#' @param ... Further arguments handed on to the selected method.
#' @return Whatever the selected method returns.
#' @export
with_det_db_box_thresh <- function(x, ...) {
  UseMethod("with_det_db_box_thresh")
}
#' S3 generic `with_det_db_thresh`
#'
#' Dispatches to the `with_det_db_thresh` method defined for the class of `x`.
#'
#' @param x Object whose class selects the method.
#' @param ... Further arguments handed on to the selected method.
#' @return Whatever the selected method returns.
#' @export
with_det_db_thresh <- function(x, ...) {
  UseMethod("with_det_db_thresh")
}
#' S3 generic `with_det_db_unclip_ratio`
#'
#' Dispatches to the `with_det_db_unclip_ratio` method defined for the class
#' of `x`.
#'
#' @param x Object whose class selects the method.
#' @param ... Further arguments handed on to the selected method.
#' @return Whatever the selected method returns.
#' @export
with_det_db_unclip_ratio <- function(x, ...) {
  UseMethod("with_det_db_unclip_ratio")
}
#' S3 generic `with_det_limit_side_len`
#'
#' Dispatches to the `with_det_limit_side_len` method defined for the class of
#' `x`.
#'
#' @param x Object whose class selects the method.
#' @param ... Further arguments handed on to the selected method.
#' @return Whatever the selected method returns.
#' @export
with_det_limit_side_len <- function(x, ...) {
  UseMethod("with_det_limit_side_len")
}
#' S3 generic `with_drop_score`
#'
#' Dispatches to the `with_drop_score` method defined for the class of `x`.
#'
#' @param x Object whose class selects the method.
#' @param ... Further arguments handed on to the selected method.
#' @return Whatever the selected method returns.
#' @export
with_drop_score <- function(x, ...) {
  UseMethod("with_drop_score")
}
#' S3 generic `with_model_tier`
#'
#' Dispatches to the `with_model_tier` method defined for the class of `x`.
#'
#' @param x Object whose class selects the method.
#' @param ... Further arguments handed on to the selected method.
#' @return Whatever the selected method returns.
#' @export
with_model_tier <- function(x, ...) {
  UseMethod("with_model_tier")
}
#' S3 generic `with_padding`
#'
#' Dispatches to the `with_padding` method defined for the class of `x`.
#'
#' @param x Object whose class selects the method.
#' @param ... Further arguments handed on to the selected method.
#' @return Whatever the selected method returns.
#' @export
with_padding <- function(x, ...) {
  UseMethod("with_padding")
}
#' S3 generic `with_rec_batch_num`
#'
#' Dispatches to the `with_rec_batch_num` method defined for the class of `x`.
#'
#' @param x Object whose class selects the method.
#' @param ... Further arguments handed on to the selected method.
#' @return Whatever the selected method returns.
#' @export
with_rec_batch_num <- function(x, ...) {
  UseMethod("with_rec_batch_num")
}
#' S3 generic `with_table_detection`
#'
#' Dispatches to the `with_table_detection` method defined for the class of
#' `x`.
#'
#' @param x Object whose class selects the method.
#' @param ... Further arguments handed on to the selected method.
#' @return Whatever the selected method returns.
#' @export
with_table_detection <- function(x, ...) {
  UseMethod("with_table_detection")
}