Files
fil/packages/r/R/extendr-wrappers.R
Henrik Jess Nielsen b4c07d3693
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s
Nomad changes
2026-06-01 23:40:55 +02:00

3519 lines
135 KiB
R
Generated
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# Generated by extendr: Do not edit by hand
#
# This file is regenerated by alef on every `alef generate` run.
# It mirrors the output of `rextendr::document()` and binds every
# wrap__<symbol> entry registered in extendr_module! to an R-callable
# function or class env.
#' @useDynLib kreuzberg, .registration = TRUE
NULL
#' Extract content from a byte array
#'
#' This is the main entry point for in-memory extraction. It performs the following steps:
#' 1. Validate MIME type
#' 2. Handle legacy format conversion if needed
#' 3. Select appropriate extractor from registry
#' 4. Extract content
#' 5. Run post-processing pipeline
#' @param content The byte array to extract.
#' @param mime_type MIME type of the content.
#' @param config Extraction configuration.
#' @return An `ExtractionResult` containing the extracted content and metadata.
#'
#' @section Errors:
#' Returns `KreuzbergError::Validation` if MIME type is invalid.
#' Returns `KreuzbergError::UnsupportedFormat` if MIME type is not supported.
#' @export
extract_bytes <- function(content, mime_type, config = ExtractionConfig$default()) {
  # Thin binding: pass the arguments, in order, to the native
  # `wrap__extract_bytes` routine of the kreuzberg shared library.
  .Call(
    "wrap__extract_bytes",
    content,
    mime_type,
    config,
    PACKAGE = "kreuzberg"
  )
}
#' Extract content from a file
#'
#' This is the main entry point for file-based extraction. It performs the following steps:
#' 1. Check cache for existing result (if caching enabled)
#' 2. Detect or validate MIME type
#' 3. Select appropriate extractor from registry
#' 4. Extract content
#' 5. Run post-processing pipeline
#' 6. Store result in cache (if caching enabled)
#' @param path Path to the file to extract.
#' @param mime_type Optional MIME type override. If `NULL` (the default), the MIME type is auto-detected.
#' @param config Extraction configuration.
#' @return An `ExtractionResult` containing the extracted content and metadata.
#'
#' @section Errors:
#' Returns `KreuzbergError::Io` if the file doesn't exist (NotFound) or for other file I/O errors.
#' Returns `KreuzbergError::UnsupportedFormat` if MIME type is not supported.
#' @export
extract_file <- function(path, mime_type = NULL, config = ExtractionConfig$default()) {
  # Thin binding: pass the arguments, in order, to the native
  # `wrap__extract_file` routine of the kreuzberg shared library.
  .Call(
    "wrap__extract_file",
    path,
    mime_type,
    config,
    PACKAGE = "kreuzberg"
  )
}
#' Synchronous wrapper for `extract_file`
#'
#' This is a convenience function that blocks the current thread until extraction completes.
#' For async code, use `extract_file` directly.
#'
#' Uses the global Tokio runtime for 100x+ performance improvement over creating
#' a new runtime per call. Always uses the global runtime to avoid nested runtime issues.
#'
#' This function is only available with the `tokio-runtime` feature. For WASM targets,
#' use a truly synchronous extraction approach instead.
#' @param path File path as character string.
#' @param mime_type Character string.
#' @param config ExtractionConfig object (list with class attribute).
#' @return ExtractionResult object (list with class attribute).
#' @export
extract_file_sync <- function(path, mime_type = NULL, config = ExtractionConfig$default()) {
  # Thin binding: pass the arguments, in order, to the native
  # `wrap__extract_file_sync` routine of the kreuzberg shared library.
  .Call(
    "wrap__extract_file_sync",
    path,
    mime_type,
    config,
    PACKAGE = "kreuzberg"
  )
}
#' Synchronous wrapper for `extract_bytes`
#'
#' Uses the global Tokio runtime for 100x+ performance improvement over creating
#' a new runtime per call.
#'
#' With the `tokio-runtime` feature, this blocks the current thread using the global
#' Tokio runtime. Without it (WASM), this calls a truly synchronous implementation.
#' @param content Raw vector of bytes.
#' @param mime_type Character string.
#' @param config ExtractionConfig object (list with class attribute).
#' @return ExtractionResult object (list with class attribute).
#' @export
extract_bytes_sync <- function(content, mime_type, config = ExtractionConfig$default()) {
  # Thin binding: pass the arguments, in order, to the native
  # `wrap__extract_bytes_sync` routine of the kreuzberg shared library.
  .Call(
    "wrap__extract_bytes_sync",
    content,
    mime_type,
    config,
    PACKAGE = "kreuzberg"
  )
}
#' Synchronous wrapper for `batch_extract_files`
#'
#' Uses the global Tokio runtime for optimal performance.
#' Only available with `tokio-runtime` (WASM has no filesystem).
#' @param items List of `BatchFileItem` objects (each a list with a class attribute).
#' @param config ExtractionConfig object (list with class attribute).
#' @return List of `ExtractionResult` objects (each a list with a class attribute).
#' @export
batch_extract_files_sync <- function(items, config = ExtractionConfig$default()) {
  # Thin binding to the native `wrap__batch_extract_files_sync` routine.
  .Call(
    "wrap__batch_extract_files_sync",
    items,
    config,
    PACKAGE = "kreuzberg"
  )
}
#' Synchronous wrapper for `batch_extract_bytes`
#'
#' Uses the global Tokio runtime for optimal performance.
#' With the `tokio-runtime` feature, this blocks the current thread using the global
#' Tokio runtime. Without it (WASM), this calls a truly synchronous implementation
#' that iterates through items and calls `extract_bytes_sync()`.
#' @param items List of `BatchBytesItem` objects (each a list with a class attribute).
#' @param config ExtractionConfig object (list with class attribute).
#' @return List of `ExtractionResult` objects (each a list with a class attribute).
#' @export
batch_extract_bytes_sync <- function(items, config = ExtractionConfig$default()) {
  # Thin binding to the native `wrap__batch_extract_bytes_sync` routine.
  .Call(
    "wrap__batch_extract_bytes_sync",
    items,
    config,
    PACKAGE = "kreuzberg"
  )
}
#' Extract content from multiple files concurrently
#'
#' This function processes multiple files in parallel, automatically managing
#' concurrency to prevent resource exhaustion. The concurrency limit can be
#' configured via `ExtractionConfig::max_concurrent_extractions` or defaults
#' to `(num_cpus * 1.5).ceil()`.
#'
#' Each file can optionally specify a [`FileExtractionConfig`] that overrides specific
#' fields from the batch-level `config`. Pass `None` for a file to use the batch defaults.
#' Batch-level settings like `max_concurrent_extractions` and `use_cache` are always
#' taken from the batch-level `config`.
#' @param items Vector of `BatchFileItem` structs, each containing a path and optional per-file configuration overrides.
#' @param config Batch-level extraction configuration (provides defaults and batch settings).
#' @return A vector of `ExtractionResult` in the same order as the input items.
#'
#' @section Errors:
#' Individual file errors are captured in the result metadata. System errors
#' (IO, RuntimeError equivalents) will bubble up and fail the entire batch.
#' @export
batch_extract_files <- function(items, config = ExtractionConfig$default()) {
  # Thin binding to the native `wrap__batch_extract_files` routine.
  .Call(
    "wrap__batch_extract_files",
    items,
    config,
    PACKAGE = "kreuzberg"
  )
}
#' Extract content from multiple byte arrays concurrently
#'
#' This function processes multiple byte arrays in parallel, automatically managing
#' concurrency to prevent resource exhaustion. The concurrency limit can be
#' configured via `ExtractionConfig::max_concurrent_extractions` or defaults
#' to `(num_cpus * 1.5).ceil()`.
#'
#' Each item can optionally specify a [`FileExtractionConfig`] that overrides specific
#' fields from the batch-level `config`. Pass `None` as the config to use
#' the batch-level defaults for that item.
#' @param items Vector of `BatchBytesItem` structs, each containing content bytes, MIME type, and optional per-item configuration overrides.
#' @param config Batch-level extraction configuration.
#' @return A vector of `ExtractionResult` in the same order as the input items.
#' @export
batch_extract_bytes <- function(items, config = ExtractionConfig$default()) {
  # Thin binding to the native `wrap__batch_extract_bytes` routine.
  .Call(
    "wrap__batch_extract_bytes",
    items,
    config,
    PACKAGE = "kreuzberg"
  )
}
#' Detect MIME type from raw file bytes
#'
#' Uses magic byte signatures to detect file type from content.
#' Falls back to `infer` crate for comprehensive detection.
#'
#' For ZIP-based files, inspects contents to distinguish Office Open XML
#' formats (DOCX, XLSX, PPTX) from plain ZIP archives.
#' @param content Raw file bytes.
#' @return The detected MIME type string.
#'
#' @section Errors:
#' Returns `KreuzbergError::UnsupportedFormat` if MIME type cannot be determined.
#' @export
detect_mime_type_from_bytes <- function(content) {
  # Thin binding to the native `wrap__detect_mime_type_from_bytes` routine.
  .Call(
    "wrap__detect_mime_type_from_bytes",
    content,
    PACKAGE = "kreuzberg"
  )
}
#' Get file extensions for a given MIME type
#'
#' Returns all known file extensions that map to the specified MIME type.
#' @param mime_type The MIME type to look up.
#' @return A vector of file extensions (without leading dot) for the MIME type.
#' @export
get_extensions_for_mime <- function(mime_type) {
  # Thin binding to the native `wrap__get_extensions_for_mime` routine.
  .Call(
    "wrap__get_extensions_for_mime",
    mime_type,
    PACKAGE = "kreuzberg"
  )
}
#' List the names of all registered embedding backends
#'
#' Used by `kreuzberg-cli`, the api/mcp endpoints, and generated language
#' bindings.
#' @return List of character string.
#' @export
list_embedding_backends <- function() {
  # Takes no R-side arguments; the result comes straight from the native
  # `wrap__list_embedding_backends` routine.
  .Call("wrap__list_embedding_backends", PACKAGE = "kreuzberg")
}
#' List names of all registered document extractors
#' @return List of character string.
#' @export
list_document_extractors <- function() {
  # Takes no R-side arguments; the result comes straight from the native
  # `wrap__list_document_extractors` routine.
  .Call("wrap__list_document_extractors", PACKAGE = "kreuzberg")
}
#' List all registered OCR backends
#'
#' Returns the names of all OCR backends currently registered in the global registry.
#' @return A vector of OCR backend names.
#' @export
list_ocr_backends <- function() {
  # Takes no R-side arguments; the result comes straight from the native
  # `wrap__list_ocr_backends` routine.
  .Call("wrap__list_ocr_backends", PACKAGE = "kreuzberg")
}
#' List all registered post-processor names
#'
#' Returns a vector of all post-processor names currently registered in the
#' global registry.
#' @return Vector of post-processor names. An error is reported if the
#' registry lock is poisoned.
#' @export
list_post_processors <- function() {
  # Takes no R-side arguments; the result comes straight from the native
  # `wrap__list_post_processors` routine.
  .Call("wrap__list_post_processors", PACKAGE = "kreuzberg")
}
#' List names of all registered renderers
#' @return List of character string.
#'
#' @section Errors:
#' Returns an error if the registry lock is poisoned.
#' @export
list_renderers <- function() {
  # Takes no R-side arguments; the result comes straight from the native
  # `wrap__list_renderers` routine.
  .Call("wrap__list_renderers", PACKAGE = "kreuzberg")
}
#' List names of all registered validators
#' @return List of character string.
#' @export
list_validators <- function() {
  # Takes no R-side arguments; the result comes straight from the native
  # `wrap__list_validators` routine.
  .Call("wrap__list_validators", PACKAGE = "kreuzberg")
}
#' Compare two extraction results and return a structured diff
#'
#' The comparison is purely structural — no I/O, no side effects. All fields
#' of [`ExtractionDiff`] are populated according to the provided [`DiffOptions`].
#' @param a The "before" extraction result.
#' @param b The "after" extraction result.
#' @param opts Controls which sections are compared and optional truncation.
#' @return ExtractionDiff object (list with class attribute).
#' @export
compare <- function(a = ExtractionResult$default(), b = ExtractionResult$default(), opts = DiffOptions$default()) {
  # Thin binding: both results and the options go, in order, to the native
  # `wrap__compare` routine.
  .Call(
    "wrap__compare",
    a,
    b,
    opts,
    PACKAGE = "kreuzberg"
  )
}
#' Generate embeddings asynchronously for a list of text strings
#'
#' This is the async counterpart to [`embed_texts`]. It offloads the blocking
#' ONNX inference work to a dedicated blocking thread pool via Tokio's
#' `spawn_blocking`, keeping the async executor free.
#'
#' Returns one embedding vector per input text in the same order.
#' @param texts Vec of strings to embed (owned, sent to blocking thread).
#' @param config Embedding configuration specifying model, batch size, and normalization.
#' @return List of list of numeric.
#'
#' @section Errors:
#' - `KreuzbergError::MissingDependency` if ONNX Runtime is not installed
#' - `KreuzbergError::Embedding` if the preset name is unknown, model download fails,
#' or the blocking inference task panics
#' @export
embed_texts_async <- function(texts, config = EmbeddingConfig$default()) {
  # Thin binding to the native `wrap__embed_texts_async` routine.
  .Call(
    "wrap__embed_texts_async",
    texts,
    config,
    PACKAGE = "kreuzberg"
  )
}
#' Render a single PDF page to PNG bytes
#'
#' Returns raw PNG-encoded bytes for the specified page at the given DPI.
#' Uses pdf_oxide with tiny-skia for pure-Rust rendering.
#' @param pdf_bytes Raw PDF file bytes.
#' @param page_index Zero-based page index.
#' @param dpi Resolution in dots per inch (default: 150).
#' @param password Optional password for encrypted PDFs.
#' @return Raw vector of bytes.
#'
#' @section Errors:
#' Returns `KreuzbergError::Parsing` if the PDF cannot be opened, authenticated,
#' or rendered, or if `page_index` is out of range.
#' @export
render_pdf_page_to_png <- function(pdf_bytes, page_index, dpi = NULL, password = NULL) {
  # Thin binding: `dpi` and `password` are forwarded as given (NULL when
  # omitted) to the native `wrap__render_pdf_page_to_png` routine.
  .Call(
    "wrap__render_pdf_page_to_png",
    pdf_bytes,
    page_index,
    dpi,
    password,
    PACKAGE = "kreuzberg"
  )
}
#' Detect the MIME type of a file at the given path
#'
#' Uses the file extension and optionally the file content to determine the MIME type.
#' Set `check_exists` to `true` to verify the file exists before detection.
#' @param path Character string.
#' @param check_exists Logical (TRUE/FALSE).
#' @return Character string.
#' @export
detect_mime_type <- function(path, check_exists) {
  # Thin binding to the native `wrap__detect_mime_type` routine.
  .Call(
    "wrap__detect_mime_type",
    path,
    check_exists,
    PACKAGE = "kreuzberg"
  )
}
#' Embed a list of texts using the configured embedding model
#'
#' Returns a 2D vector where each inner vector is the embedding for the corresponding text.
#' @param texts List of character string.
#' @param config EmbeddingConfig object (list with class attribute).
#' @return List of list of numeric.
#' @export
embed_texts <- function(texts, config = EmbeddingConfig$default()) {
  # Thin binding to the native `wrap__embed_texts` routine.
  .Call(
    "wrap__embed_texts",
    texts,
    config,
    PACKAGE = "kreuzberg"
  )
}
#' Get an embedding preset by name
#'
#' Returns `None` if no preset with the given name exists. Returns an owned
#' clone so the value is safe to pass across FFI boundaries.
#' @param name Character string.
#' @return An `EmbeddingPreset` object (list with class attribute), or `NULL` if no preset with the given name exists.
#' @export
get_embedding_preset <- function(name) {
  # Thin binding to the native `wrap__get_embedding_preset` routine.
  .Call(
    "wrap__get_embedding_preset",
    name,
    PACKAGE = "kreuzberg"
  )
}
#' List the names of all available embedding presets
#'
#' Returns owned `String`s so the values are safe to pass across FFI boundaries.
#' @return List of character string.
#' @export
list_embedding_presets <- function() {
  # Takes no R-side arguments; the result comes straight from the native
  # `wrap__list_embedding_presets` routine.
  .Call("wrap__list_embedding_presets", PACKAGE = "kreuzberg")
}
#' register_ocr_backend
#'
#' Register an R-side plugin implementation. Pass a named list whose entries
#' implement the trait's required methods (e.g. `list(name = function() "my", ...)`).
#'
#' @param r_backend Named list of R closures implementing the trait surface.
#'
#' @return Invisible NULL on success; raises an R error on failure.
#' @export
register_ocr_backend <- function(r_backend) {
  # The documented contract is an invisible NULL on success, so wrap the
  # native call in `invisible()` to stop the result auto-printing at the
  # console. The returned value itself is unchanged.
  invisible(.Call("wrap__register_ocr_backend", r_backend, PACKAGE = "kreuzberg"))
}
#' unregister_ocr_backend
#'
#' Unregister a previously registered plugin by name.
#'
#' @param name Plugin name string as returned by the backend's `name()` method.
#'
#' @return Invisible NULL on success; raises an R error on failure.
#' @export
unregister_ocr_backend <- function(name) {
  # The documented contract is an invisible NULL on success, so wrap the
  # native call in `invisible()` to stop the result auto-printing at the
  # console. The returned value itself is unchanged.
  invisible(.Call("wrap__unregister_ocr_backend", name, PACKAGE = "kreuzberg"))
}
#' clear_ocr_backends
#'
#' Remove every registered plugin of this type. Typically used in test teardown.
#'
#' @return Invisible NULL on success; raises an R error on failure.
#' @export
clear_ocr_backends <- function() {
  # The documented contract is an invisible NULL on success, so wrap the
  # native call in `invisible()` to stop the result auto-printing at the
  # console. The returned value itself is unchanged.
  invisible(.Call("wrap__clear_ocr_backends", PACKAGE = "kreuzberg"))
}
#' register_post_processor
#'
#' Register an R-side plugin implementation. Pass a named list whose entries
#' implement the trait's required methods (e.g. `list(name = function() "my", ...)`).
#'
#' @param r_backend Named list of R closures implementing the trait surface.
#'
#' @return Invisible NULL on success; raises an R error on failure.
#' @export
register_post_processor <- function(r_backend) {
  # The documented contract is an invisible NULL on success, so wrap the
  # native call in `invisible()` to stop the result auto-printing at the
  # console. The returned value itself is unchanged.
  invisible(.Call("wrap__register_post_processor", r_backend, PACKAGE = "kreuzberg"))
}
#' unregister_post_processor
#'
#' Unregister a previously registered plugin by name.
#'
#' @param name Plugin name string as returned by the backend's `name()` method.
#'
#' @return Invisible NULL on success; raises an R error on failure.
#' @export
unregister_post_processor <- function(name) {
  # The documented contract is an invisible NULL on success, so wrap the
  # native call in `invisible()` to stop the result auto-printing at the
  # console. The returned value itself is unchanged.
  invisible(.Call("wrap__unregister_post_processor", name, PACKAGE = "kreuzberg"))
}
#' clear_post_processors
#'
#' Remove every registered plugin of this type. Typically used in test teardown.
#'
#' @return Invisible NULL on success; raises an R error on failure.
#' @export
clear_post_processors <- function() {
  # The documented contract is an invisible NULL on success, so wrap the
  # native call in `invisible()` to stop the result auto-printing at the
  # console. The returned value itself is unchanged.
  invisible(.Call("wrap__clear_post_processors", PACKAGE = "kreuzberg"))
}
#' register_validator
#'
#' Register an R-side plugin implementation. Pass a named list whose entries
#' implement the trait's required methods (e.g. `list(name = function() "my", ...)`).
#'
#' @param r_backend Named list of R closures implementing the trait surface.
#'
#' @return Invisible NULL on success; raises an R error on failure.
#' @export
register_validator <- function(r_backend) {
  # The documented contract is an invisible NULL on success, so wrap the
  # native call in `invisible()` to stop the result auto-printing at the
  # console. The returned value itself is unchanged.
  invisible(.Call("wrap__register_validator", r_backend, PACKAGE = "kreuzberg"))
}
#' unregister_validator
#'
#' Unregister a previously registered plugin by name.
#'
#' @param name Plugin name string as returned by the backend's `name()` method.
#'
#' @return Invisible NULL on success; raises an R error on failure.
#' @export
unregister_validator <- function(name) {
  # The documented contract is an invisible NULL on success, so wrap the
  # native call in `invisible()` to stop the result auto-printing at the
  # console. The returned value itself is unchanged.
  invisible(.Call("wrap__unregister_validator", name, PACKAGE = "kreuzberg"))
}
#' clear_validators
#'
#' Remove every registered plugin of this type. Typically used in test teardown.
#'
#' @return Invisible NULL on success; raises an R error on failure.
#' @export
clear_validators <- function() {
  # The documented contract is an invisible NULL on success, so wrap the
  # native call in `invisible()` to stop the result auto-printing at the
  # console. The returned value itself is unchanged.
  invisible(.Call("wrap__clear_validators", PACKAGE = "kreuzberg"))
}
#' register_embedding_backend
#'
#' Register an R-side plugin implementation. Pass a named list whose entries
#' implement the trait's required methods (e.g. `list(name = function() "my", ...)`).
#'
#' @param r_backend Named list of R closures implementing the trait surface.
#'
#' @return Invisible NULL on success; raises an R error on failure.
#' @export
register_embedding_backend <- function(r_backend) {
  # The documented contract is an invisible NULL on success, so wrap the
  # native call in `invisible()` to stop the result auto-printing at the
  # console. The returned value itself is unchanged.
  invisible(.Call("wrap__register_embedding_backend", r_backend, PACKAGE = "kreuzberg"))
}
#' unregister_embedding_backend
#'
#' Unregister a previously registered plugin by name.
#'
#' @param name Plugin name string as returned by the backend's `name()` method.
#'
#' @return Invisible NULL on success; raises an R error on failure.
#' @export
unregister_embedding_backend <- function(name) {
  # The documented contract is an invisible NULL on success, so wrap the
  # native call in `invisible()` to stop the result auto-printing at the
  # console. The returned value itself is unchanged.
  invisible(.Call("wrap__unregister_embedding_backend", name, PACKAGE = "kreuzberg"))
}
#' clear_embedding_backends
#'
#' Remove every registered plugin of this type. Typically used in test teardown.
#'
#' @return Invisible NULL on success; raises an R error on failure.
#' @export
clear_embedding_backends <- function() {
  # The documented contract is an invisible NULL on success, so wrap the
  # native call in `invisible()` to stop the result auto-printing at the
  # console. The returned value itself is unchanged.
  invisible(.Call("wrap__clear_embedding_backends", PACKAGE = "kreuzberg"))
}
#' register_document_extractor
#'
#' Register an R-side plugin implementation. Pass a named list whose entries
#' implement the trait's required methods (e.g. `list(name = function() "my", ...)`).
#'
#' @param r_backend Named list of R closures implementing the trait surface.
#'
#' @return Invisible NULL on success; raises an R error on failure.
#' @export
register_document_extractor <- function(r_backend) {
  # The documented contract is an invisible NULL on success, so wrap the
  # native call in `invisible()` to stop the result auto-printing at the
  # console. The returned value itself is unchanged.
  invisible(.Call("wrap__register_document_extractor", r_backend, PACKAGE = "kreuzberg"))
}
#' unregister_document_extractor
#'
#' Unregister a previously registered plugin by name.
#'
#' @param name Plugin name string as returned by the backend's `name()` method.
#'
#' @return Invisible NULL on success; raises an R error on failure.
#' @export
unregister_document_extractor <- function(name) {
  # The documented contract is an invisible NULL on success, so wrap the
  # native call in `invisible()` to stop the result auto-printing at the
  # console. The returned value itself is unchanged.
  invisible(.Call("wrap__unregister_document_extractor", name, PACKAGE = "kreuzberg"))
}
#' clear_document_extractors
#'
#' Remove every registered plugin of this type. Typically used in test teardown.
#'
#' @return Invisible NULL on success; raises an R error on failure.
#' @export
clear_document_extractors <- function() {
  # The documented contract is an invisible NULL on success, so wrap the
  # native call in `invisible()` to stop the result auto-printing at the
  # console. The returned value itself is unchanged.
  invisible(.Call("wrap__clear_document_extractors", PACKAGE = "kreuzberg"))
}
#' register_renderer
#'
#' Register an R-side plugin implementation. Pass a named list whose entries
#' implement the trait's required methods (e.g. `list(name = function() "my", ...)`).
#'
#' @param r_backend Named list of R closures implementing the trait surface.
#'
#' @return Invisible NULL on success; raises an R error on failure.
#' @export
register_renderer <- function(r_backend) {
  # The documented contract is an invisible NULL on success, so wrap the
  # native call in `invisible()` to stop the result auto-printing at the
  # console. The returned value itself is unchanged.
  invisible(.Call("wrap__register_renderer", r_backend, PACKAGE = "kreuzberg"))
}
#' unregister_renderer
#'
#' Unregister a previously registered plugin by name.
#'
#' @param name Plugin name string as returned by the backend's `name()` method.
#'
#' @return Invisible NULL on success; raises an R error on failure.
#' @export
unregister_renderer <- function(name) {
  # The documented contract is an invisible NULL on success, so wrap the
  # native call in `invisible()` to stop the result auto-printing at the
  # console. The returned value itself is unchanged.
  invisible(.Call("wrap__unregister_renderer", name, PACKAGE = "kreuzberg"))
}
#' clear_renderers
#'
#' Remove every registered plugin of this type. Typically used in test teardown.
#'
#' @return Invisible NULL on success; raises an R error on failure.
#' @export
clear_renderers <- function() {
  # The documented contract is an invisible NULL on success, so wrap the
  # native call in `invisible()` to stop the result auto-printing at the
  # console. The returned value itself is unchanged.
  invisible(.Call("wrap__clear_renderers", PACKAGE = "kreuzberg"))
}
#' CacheStats
#' @field total_files total_files
#' @field total_size_mb total_size_mb
#' @field available_space_mb available_space_mb
#' @field oldest_file_age_days oldest_file_age_days
#' @field newest_file_age_days newest_file_age_days
#' @export
CacheStats <- new.env(parent = emptyenv())
# `CacheStats` is the method table for the class; no methods are bound in
# this section, so `$` and `[[` on an instance resolve to fields only.
#' @export
`$.CacheStats` <- function(self, name) {
  # Methods are stored in the `CacheStats` environment; consult it first.
  func <- CacheStats[[name]]
  if (is.null(func)) {
    # Not a method: fall back to the instance's own field when it is a list.
    # `.subset2()` skips S3 dispatch, so this cannot recurse into this method.
    # Non-list instances keep returning NULL, as before.
    return(if (is.list(self)) .subset2(self, name) else NULL)
  }
  if (identical(names(formals(func))[1], "self")) {
    # Instance method: pre-bind the receiver so `obj$method(...)` works.
    function(...) func(self, ...)
  } else {
    # Static function (no leading `self`): return it unchanged.
    func
  }
}
#' @export
`[[.CacheStats` <- `$.CacheStats`
#' Hardware acceleration configuration for ONNX Runtime models
#'
#' Controls which execution provider (CPU, CoreML, CUDA, TensorRT) is used
#' for inference in layout detection and embedding generation.
#' @field provider Execution provider to use for ONNX inference.
#' @field device_id GPU device ID (for CUDA/TensorRT). Ignored for CPU/CoreML/Auto.
#' @export
AccelerationConfig <- new.env(parent = emptyenv())
# Static constructor: forwards `json` to the native
# `wrap__AccelerationConfig__from_json` routine.
AccelerationConfig$from_json <- function(json) {
  .Call("wrap__AccelerationConfig__from_json", json, PACKAGE = "kreuzberg")
}
#' @export
`$.AccelerationConfig` <- function(self, name) {
  # Methods are stored in the `AccelerationConfig` environment; consult it first.
  func <- AccelerationConfig[[name]]
  if (is.null(func)) {
    # Not a method: fall back to the instance's own field when it is a list.
    # `.subset2()` skips S3 dispatch, so this cannot recurse into this method.
    # Non-list instances keep returning NULL, as before.
    return(if (is.list(self)) .subset2(self, name) else NULL)
  }
  if (identical(names(formals(func))[1], "self")) {
    # Instance method: pre-bind the receiver so `obj$method(...)` works.
    function(...) func(self, ...)
  } else {
    # Static function (no leading `self`): return it unchanged.
    func
  }
}
#' @export
`[[.AccelerationConfig` <- `$.AccelerationConfig`
#' Cross-extractor content filtering configuration
#'
#' Controls whether "furniture" content (headers, footers, page numbers,
#' watermarks, repeating text) is included in or stripped from extraction
#' results. Applies across all extractors (PDF, DOCX, RTF, ODT, HTML, etc.)
#' with format-specific implementation.
#'
#' When `None` on `ExtractionConfig`, each extractor uses its current
#' default behavior unchanged.
#' @field include_headers Include running headers in extraction output.
#' @field include_footers Include running footers in extraction output.
#' @field strip_repeating_text Enable the heuristic cross-page repeating text detector.
#' @field include_watermarks Include watermark text in extraction output.
#' @export
ContentFilterConfig <- new.env(parent = emptyenv())
# Static constructors: each forwards to the native routine of the same name.
ContentFilterConfig$default <- function() {
  .Call("wrap__ContentFilterConfig__default", PACKAGE = "kreuzberg")
}
ContentFilterConfig$from_json <- function(json) {
  .Call("wrap__ContentFilterConfig__from_json", json, PACKAGE = "kreuzberg")
}
#' @export
`$.ContentFilterConfig` <- function(self, name) {
  # Methods are stored in the `ContentFilterConfig` environment; consult it first.
  func <- ContentFilterConfig[[name]]
  if (is.null(func)) {
    # Not a method: fall back to the instance's own field when it is a list.
    # `.subset2()` skips S3 dispatch, so this cannot recurse into this method.
    # Non-list instances keep returning NULL, as before.
    return(if (is.list(self)) .subset2(self, name) else NULL)
  }
  if (identical(names(formals(func))[1], "self")) {
    # Instance method: pre-bind the receiver so `obj$method(...)` works.
    function(...) func(self, ...)
  } else {
    # Static function (no leading `self`): return it unchanged.
    func
  }
}
#' @export
`[[.ContentFilterConfig` <- `$.ContentFilterConfig`
#' Configuration for email extraction
#' @field msg_fallback_codepage Windows codepage number to use when an MSG file contains no codepage property.
#' @export
EmailConfig <- new.env(parent = emptyenv())
# Static constructor: forwards `json` to the native
# `wrap__EmailConfig__from_json` routine.
EmailConfig$from_json <- function(json) {
  .Call("wrap__EmailConfig__from_json", json, PACKAGE = "kreuzberg")
}
#' @export
`$.EmailConfig` <- function(self, name) {
  # Methods are stored in the `EmailConfig` environment; consult it first.
  func <- EmailConfig[[name]]
  if (is.null(func)) {
    # Not a method: fall back to the instance's own field when it is a list.
    # `.subset2()` skips S3 dispatch, so this cannot recurse into this method.
    # Non-list instances keep returning NULL, as before.
    return(if (is.list(self)) .subset2(self, name) else NULL)
  }
  if (identical(names(formals(func))[1], "self")) {
    # Instance method: pre-bind the receiver so `obj$method(...)` works.
    function(...) func(self, ...)
  } else {
    # Static function (no leading `self`): return it unchanged.
    func
  }
}
#' @export
`[[.EmailConfig` <- `$.EmailConfig`
#' Main extraction configuration
#'
#' This struct contains all configuration options for the extraction process.
#' It can be loaded from TOML, YAML, or JSON files, or created programmatically.
#' @field use_cache Enable caching of extraction results
#' @field enable_quality_processing Enable quality post-processing
#' @field ocr OCR configuration (None = OCR disabled)
#' @field force_ocr Force OCR even for searchable PDFs
#' @field force_ocr_pages Force OCR on specific pages only (1-indexed page numbers, must be >= 1).
#' @field disable_ocr Disable OCR entirely, even for images.
#' @field chunking Text chunking configuration (None = chunking disabled)
#' @field content_filter Content filtering configuration (None = use extractor defaults).
#' @field images Image extraction configuration (None = no image extraction)
#' @field pdf_options PDF-specific options (None = use defaults)
#' @field token_reduction Token reduction configuration (None = no token reduction)
#' @field language_detection Language detection configuration (None = no language detection)
#' @field pages Page extraction configuration (None = no page tracking)
#' @field keywords Keyword extraction configuration (None = no keyword extraction)
#' @field postprocessor Post-processor configuration (None = use defaults)
#' @field html_options HTML to Markdown conversion options (None = use defaults)
#' @field html_output Styled HTML output configuration.
#' @field extraction_timeout_secs Default per-file timeout in seconds for batch extraction.
#' @field max_concurrent_extractions Maximum concurrent extractions in batch operations (None = `(num_cpus * 1.5).ceil()`).
#' @field result_format Result structure format
#' @field security_limits Security limits for archive extraction.
#' @field max_embedded_file_bytes Maximum uncompressed size in bytes for a single embedded file before recursive
#' @field output_format Content text format (default: Plain).
#' @field layout Layout detection configuration (None = layout detection disabled).
#' @field use_layout_for_markdown Run layout detection on the non-OCR PDF markdown path.
#' @field include_document_structure Enable structured document tree output.
#' @field acceleration Hardware acceleration configuration for ONNX Runtime models.
#' @field cache_namespace Cache namespace for tenant isolation.
#' @field cache_ttl_secs Per-request cache TTL in seconds.
#' @field email Email extraction configuration (None = use defaults).
#' @field concurrency Concurrency limits for constrained environments (None = use defaults).
#' @field max_archive_depth Maximum recursion depth for archive extraction (default: 3). Set to 0 to disable recursive
#' @field tree_sitter Tree-sitter language pack configuration (None = tree-sitter disabled).
#' @field structured_extraction Structured extraction via LLM (None = disabled).
#' @field cancel_token Cancellation token for this extraction (None = no external cancellation).
#' @export
ExtractionConfig <- new.env(parent = emptyenv())
# Static constructors and instance methods: each forwards to the native
# routine of the same name in the kreuzberg shared library.
ExtractionConfig$default <- function() {
  .Call("wrap__ExtractionConfig__default", PACKAGE = "kreuzberg")
}
ExtractionConfig$needs_image_processing <- function(self) {
  .Call("wrap__ExtractionConfig__needs_image_processing", self, PACKAGE = "kreuzberg")
}
ExtractionConfig$from_json <- function(json) {
  .Call("wrap__ExtractionConfig__from_json", json, PACKAGE = "kreuzberg")
}
#' @export
`$.ExtractionConfig` <- function(self, name) {
  # Methods are stored in the `ExtractionConfig` environment; consult it first.
  func <- ExtractionConfig[[name]]
  if (is.null(func)) {
    # Not a method: fall back to the instance's own field when it is a list.
    # `.subset2()` skips S3 dispatch, so this cannot recurse into this method.
    # Non-list instances keep returning NULL, as before.
    return(if (is.list(self)) .subset2(self, name) else NULL)
  }
  if (identical(names(formals(func))[1], "self")) {
    # Instance method: pre-bind the receiver so `obj$method(...)` works.
    function(...) func(self, ...)
  } else {
    # Static function (no leading `self`): return it unchanged.
    func
  }
}
#' @export
`[[.ExtractionConfig` <- `$.ExtractionConfig`
#' @export
needs_image_processing.ExtractionConfig <- function(x, ...) {
  # S3-style entry point: delegate to the instance method bound through `$`.
  x$needs_image_processing(...)
}
#' Per-file extraction configuration overrides for batch processing
#'
#' All fields are `Option<T>` — `None` means "use the batch-level default."
#' This type is used with `batch_extract_files` and
#' `batch_extract_bytes` to allow heterogeneous
#' extraction settings within a single batch.
#'
#' # Excluded Fields
#'
#' The following `ExtractionConfig` fields are batch-level only and
#' cannot be overridden per file:
#' - `max_concurrent_extractions` — controls batch parallelism
#' - `use_cache` — global caching policy
#' - `acceleration` — shared ONNX execution provider
#' - `security_limits` — global archive security policy
#' @field enable_quality_processing Override quality post-processing for this file.
#' @field ocr Override OCR configuration for this file (None in the Option = use batch default).
#' @field force_ocr Override force OCR for this file.
#' @field force_ocr_pages Override force OCR pages for this file (1-indexed page numbers).
#' @field disable_ocr Override disable OCR for this file.
#' @field chunking Override chunking configuration for this file.
#' @field content_filter Override content filtering configuration for this file.
#' @field images Override image extraction configuration for this file.
#' @field pdf_options Override PDF options for this file.
#' @field token_reduction Override token reduction for this file.
#' @field language_detection Override language detection for this file.
#' @field pages Override page extraction for this file.
#' @field keywords Override keyword extraction for this file.
#' @field postprocessor Override post-processor for this file.
#' @field html_options Override HTML conversion options for this file.
#' @field result_format Override result format for this file.
#' @field output_format Override output content format for this file.
#' @field include_document_structure Override document structure output for this file.
#' @field layout Override layout detection for this file.
#' @field timeout_secs Override per-file extraction timeout in seconds.
#' @field tree_sitter Override tree-sitter configuration for this file.
#' @field structured_extraction Override structured extraction configuration for this file.
#' @export
FileExtractionConfig <- new.env(parent = emptyenv())
# Static constructor: forwards `json` to the native
# `wrap__FileExtractionConfig__from_json` routine.
FileExtractionConfig$from_json <- function(json) {
  .Call("wrap__FileExtractionConfig__from_json", json, PACKAGE = "kreuzberg")
}
#' @export
`$.FileExtractionConfig` <- function(self, name) {
  # Methods are stored in the `FileExtractionConfig` environment; consult it first.
  func <- FileExtractionConfig[[name]]
  if (is.null(func)) {
    # Not a method: fall back to the instance's own field when it is a list.
    # `.subset2()` skips S3 dispatch, so this cannot recurse into this method.
    # Non-list instances keep returning NULL, as before.
    return(if (is.list(self)) .subset2(self, name) else NULL)
  }
  if (identical(names(formals(func))[1], "self")) {
    # Instance method: pre-bind the receiver so `obj$method(...)` works.
    function(...) func(self, ...)
  } else {
    # Static function (no leading `self`): return it unchanged.
    func
  }
}
#' @export
`[[.FileExtractionConfig` <- `$.FileExtractionConfig`
#' Batch item for byte array extraction
#'
#' Used with `batch_extract_bytes` and `batch_extract_bytes_sync`
#' to represent a single item in a batch extraction job.
#' @field content The content bytes to extract from
#' @field mime_type MIME type of the content (e.g., "application/pdf", "text/html")
#' @field config Per-item configuration overrides (None uses batch-level defaults)
#' @export
BatchBytesItem <- new.env(parent = emptyenv())
# `BatchBytesItem` is the method table for the class; no methods are bound in
# this section, so `$` and `[[` on an instance resolve to fields only.
#' @export
`$.BatchBytesItem` <- function(self, name) {
  # Methods are stored in the `BatchBytesItem` environment; consult it first.
  func <- BatchBytesItem[[name]]
  if (is.null(func)) {
    # Not a method: fall back to the instance's own field when it is a list.
    # `.subset2()` skips S3 dispatch, so this cannot recurse into this method.
    # Non-list instances keep returning NULL, as before.
    return(if (is.list(self)) .subset2(self, name) else NULL)
  }
  if (identical(names(formals(func))[1], "self")) {
    # Instance method: pre-bind the receiver so `obj$method(...)` works.
    function(...) func(self, ...)
  } else {
    # Static function (no leading `self`): return it unchanged.
    func
  }
}
#' @export
`[[.BatchBytesItem` <- `$.BatchBytesItem`
#' Batch item for file extraction
#'
#' Used with `batch_extract_files` and `batch_extract_files_sync`
#' to represent a single file in a batch extraction job.
#' @field path Path to the file to extract from
#' @field config Per-file configuration overrides (None uses batch-level defaults)
#' @export
BatchFileItem <- new.env(parent = emptyenv())
# `BatchFileItem` is the method table for the class; no methods are bound in
# this section, so `$` and `[[` on an instance resolve to fields only.
#' @export
`$.BatchFileItem` <- function(self, name) {
  # Methods are stored in the `BatchFileItem` environment; consult it first.
  func <- BatchFileItem[[name]]
  if (is.null(func)) {
    # Not a method: fall back to the instance's own field when it is a list.
    # `.subset2()` skips S3 dispatch, so this cannot recurse into this method.
    # Non-list instances keep returning NULL, as before.
    return(if (is.list(self)) .subset2(self, name) else NULL)
  }
  if (identical(names(formals(func))[1], "self")) {
    # Instance method: pre-bind the receiver so `obj$method(...)` works.
    function(...) func(self, ...)
  } else {
    # Static function (no leading `self`): return it unchanged.
    func
  }
}
#' @export
`[[.BatchFileItem` <- `$.BatchFileItem`
#' Image extraction configuration
#' @field extract_images Extract images from documents
#' @field target_dpi Target DPI for image normalization
#' @field max_image_dimension Maximum dimension for images (width or height)
#' @field inject_placeholders Whether to inject image reference placeholders into markdown output. When `true`
#' @field auto_adjust_dpi Automatically adjust DPI based on image content
#' @field min_dpi Minimum DPI threshold
#' @field max_dpi Maximum DPI threshold
#' @field max_images_per_page Maximum number of image objects to extract per PDF page.
#' @field classify When `true` (default), extracted images are classified by kind and grouped into clusters where they
#' @field include_page_rasters When `true`, full-page renders produced during OCR preprocessing are captured and
#' @field run_ocr_on_images Run OCR on extracted images and include the recognized text in the document content.
#' @field ocr_text_only When `true`, image OCR results are rendered as plain text without the `![...](...)` markdown
#' @field append_ocr_text When `true` and `ocr_text_only` is `false`, append the OCR text after the image placeholder
#' @export
ImageExtractionConfig <- new.env(parent = emptyenv())
# Both members below take no `self`, so `$` returns them unbound.
ImageExtractionConfig$default <- function() {
  .Call("wrap__ImageExtractionConfig__default", PACKAGE = "kreuzberg")
}
ImageExtractionConfig$from_json <- function(json) .Call("wrap__ImageExtractionConfig__from_json", json, PACKAGE = "kreuzberg")
#' @export
`$.ImageExtractionConfig` <- function(self, name) {
  # Members are looked up in the class environment; an unknown name gives NULL.
  member <- ImageExtractionConfig[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # No leading `self` formal: hand the member back unbound.
    return(member)
  }
  # Leading `self` formal: bind the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.ImageExtractionConfig` <- `$.ImageExtractionConfig`
#' Token reduction configuration
#' @field mode Reduction mode: "off", "light", "moderate", "aggressive", "maximum"
#' @field preserve_important_words Preserve important words (capitalized, technical terms)
#' @export
TokenReductionOptions <- new.env(parent = emptyenv())
# Both members below take no `self`, so `$` returns them unbound.
TokenReductionOptions$default <- function() {
  .Call("wrap__TokenReductionOptions__default", PACKAGE = "kreuzberg")
}
TokenReductionOptions$from_json <- function(json) .Call("wrap__TokenReductionOptions__from_json", json, PACKAGE = "kreuzberg")
#' @export
`$.TokenReductionOptions` <- function(self, name) {
  # Members are looked up in the class environment; an unknown name gives NULL.
  member <- TokenReductionOptions[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # No leading `self` formal: hand the member back unbound.
    return(member)
  }
  # Leading `self` formal: bind the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.TokenReductionOptions` <- `$.TokenReductionOptions`
#' Language detection configuration
#' @field enabled Enable language detection
#' @field min_confidence Minimum confidence threshold (0.0-1.0)
#' @field detect_multiple Detect multiple languages in the document
#' @export
LanguageDetectionConfig <- new.env(parent = emptyenv())
# Both members below take no `self`, so `$` returns them unbound.
LanguageDetectionConfig$default <- function() {
  .Call("wrap__LanguageDetectionConfig__default", PACKAGE = "kreuzberg")
}
LanguageDetectionConfig$from_json <- function(json) .Call("wrap__LanguageDetectionConfig__from_json", json, PACKAGE = "kreuzberg")
#' @export
`$.LanguageDetectionConfig` <- function(self, name) {
  # Members are looked up in the class environment; an unknown name gives NULL.
  member <- LanguageDetectionConfig[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # No leading `self` formal: hand the member back unbound.
    return(member)
  }
  # Leading `self` formal: bind the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.LanguageDetectionConfig` <- `$.LanguageDetectionConfig`
#' Configuration for styled HTML output
#'
#' When set on [`ExtractionConfig::html_output`] alongside
#' `output_format = OutputFormat::Html`, the pipeline builds a
#' [`StyledHtmlRenderer`](crate::rendering::StyledHtmlRenderer) instead of
#' the plain comrak-based renderer.
#' @field css Inline CSS string injected into the output after the theme stylesheet. Concatenated after `css_file`
#' @field css_file Path to a CSS file loaded once at renderer construction time. Concatenated before `css` when both
#' @field theme Built-in colour/typography theme. Default: [`HtmlTheme::Unstyled`].
#' @field class_prefix CSS class prefix applied to every emitted class name.
#' @field embed_css When `true` (default), write the resolved CSS into a `<style>` block immediately after the opening
#' @export
HtmlOutputConfig <- new.env(parent = emptyenv())
# Both members below take no `self`, so `$` returns them unbound.
HtmlOutputConfig$default <- function() {
  .Call("wrap__HtmlOutputConfig__default", PACKAGE = "kreuzberg")
}
HtmlOutputConfig$from_json <- function(json) .Call("wrap__HtmlOutputConfig__from_json", json, PACKAGE = "kreuzberg")
#' @export
`$.HtmlOutputConfig` <- function(self, name) {
  # Members are looked up in the class environment; an unknown name gives NULL.
  member <- HtmlOutputConfig[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # No leading `self` formal: hand the member back unbound.
    return(member)
  }
  # Leading `self` formal: bind the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.HtmlOutputConfig` <- `$.HtmlOutputConfig`
#' Layout detection configuration
#'
#' Controls layout detection behavior in the extraction pipeline.
#' When set on [`ExtractionConfig`](super::ExtractionConfig), layout detection
#' is enabled for PDF extraction.
#' @field confidence_threshold Confidence threshold override (None = use model default).
#' @field apply_heuristics Whether to apply postprocessing heuristics (default: true).
#' @field table_model Table structure recognition model.
#' @field acceleration Hardware acceleration for ONNX models (layout detection + table structure).
#' @export
LayoutDetectionConfig <- new.env(parent = emptyenv())
# Both members below take no `self`, so `$` returns them unbound.
LayoutDetectionConfig$default <- function() {
  .Call("wrap__LayoutDetectionConfig__default", PACKAGE = "kreuzberg")
}
LayoutDetectionConfig$from_json <- function(json) .Call("wrap__LayoutDetectionConfig__from_json", json, PACKAGE = "kreuzberg")
#' @export
`$.LayoutDetectionConfig` <- function(self, name) {
  # Members are looked up in the class environment; an unknown name gives NULL.
  member <- LayoutDetectionConfig[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # No leading `self` formal: hand the member back unbound.
    return(member)
  }
  # Leading `self` formal: bind the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.LayoutDetectionConfig` <- `$.LayoutDetectionConfig`
#' Configuration for an LLM provider/model via liter-llm
#'
#' Each feature (VLM OCR, VLM embeddings, structured extraction) carries
#' its own `LlmConfig`, allowing different providers per feature.
#' @field model Provider/model string using liter-llm routing format.
#' @field api_key API key for the provider. When `None`, liter-llm falls back to the provider's standard environment
#' @field base_url Custom base URL override for the provider endpoint.
#' @field timeout_secs Request timeout in seconds (default: 60).
#' @field max_retries Maximum retry attempts (default: 3).
#' @field temperature Sampling temperature for generation tasks.
#' @field max_tokens Maximum tokens to generate.
#' @export
LlmConfig <- new.env(parent = emptyenv())
# Forwards `json` to the native routine wrap__LlmConfig__from_json.
LlmConfig$from_json <- function(json) .Call("wrap__LlmConfig__from_json", json, PACKAGE = "kreuzberg")
#' @export
`$.LlmConfig` <- function(self, name) {
  # Members are looked up in the class environment; an unknown name gives NULL.
  member <- LlmConfig[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # No leading `self` formal: hand the member back unbound.
    return(member)
  }
  # Leading `self` formal: bind the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.LlmConfig` <- `$.LlmConfig`
#' Configuration for LLM-based structured data extraction
#'
#' Sends extracted document content to a VLM with a JSON schema,
#' returning structured data that conforms to the schema.
#' @field schema JSON Schema defining the desired output structure.
#' @field schema_name Schema name passed to the LLM's structured output mode.
#' @field schema_description Optional schema description for the LLM.
#' @field strict Enable strict mode — output must exactly match the schema.
#' @field prompt Custom Jinja2 extraction prompt template. When `None`, a default template is used.
#' @field llm LLM configuration for the extraction.
#' @export
StructuredExtractionConfig <- new.env(parent = emptyenv())
#' @export
`$.StructuredExtractionConfig` <- function(self, name) {
  # Members are looked up in the class environment; an unknown name gives NULL.
  member <- StructuredExtractionConfig[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # No leading `self` formal: hand the member back unbound.
    return(member)
  }
  # Leading `self` formal: bind the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.StructuredExtractionConfig` <- `$.StructuredExtractionConfig`
#' Quality thresholds for OCR fallback decisions and pipeline quality gating
#'
#' All fields default to the values that match the previous hardcoded behavior,
#' so `OcrQualityThresholds::default()` preserves existing semantics exactly.
#' @field min_total_non_whitespace Minimum total non-whitespace characters to consider text substantive.
#' @field min_non_whitespace_per_page Minimum non-whitespace characters per page on average.
#' @field min_meaningful_word_len Minimum character count for a word to be "meaningful".
#' @field min_meaningful_words Minimum count of meaningful words before text is accepted.
#' @field min_alnum_ratio Minimum alphanumeric ratio (non-whitespace chars that are alphanumeric).
#' @field min_garbage_chars Minimum Unicode replacement characters (U+FFFD) to trigger OCR fallback.
#' @field max_fragmented_word_ratio Maximum fraction of short (1-2 char) words before text is considered fragmented.
#' @field critical_fragmented_word_ratio Critical fragmentation threshold — triggers OCR regardless of meaningful
#' @field min_avg_word_length Minimum average word length. Below this with enough words indicates garbled extraction.
#' @field min_words_for_avg_length_check Minimum word count before average word length check applies.
#' @field min_consecutive_repeat_ratio Minimum consecutive word repetition ratio to detect column scrambling.
#' @field min_words_for_repeat_check Minimum word count before consecutive repetition check is applied.
#' @field substantive_min_chars Minimum character count for "substantive markdown" OCR skip gate.
#' @field non_text_min_chars Minimum character count for "non-text content" OCR skip gate.
#' @field alnum_ws_ratio_threshold Alphanumeric+whitespace ratio threshold for skip decisions.
#' @field pipeline_min_quality Minimum quality score (0.0-1.0) for a pipeline stage result to be accepted. If the
#' @export
OcrQualityThresholds <- new.env(parent = emptyenv())
# Both members below take no `self`, so `$` returns them unbound.
OcrQualityThresholds$default <- function() {
  .Call("wrap__OcrQualityThresholds__default", PACKAGE = "kreuzberg")
}
OcrQualityThresholds$from_json <- function(json) .Call("wrap__OcrQualityThresholds__from_json", json, PACKAGE = "kreuzberg")
#' @export
`$.OcrQualityThresholds` <- function(self, name) {
  # Members are looked up in the class environment; an unknown name gives NULL.
  member <- OcrQualityThresholds[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # No leading `self` formal: hand the member back unbound.
    return(member)
  }
  # Leading `self` formal: bind the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.OcrQualityThresholds` <- `$.OcrQualityThresholds`
#' A single backend stage in the OCR pipeline
#' @field backend Backend name: "tesseract", "paddleocr", "easyocr", or a custom registered name.
#' @field priority Priority weight (higher = tried first). Stages are sorted by priority descending.
#' @field language Language override for this stage (None = use parent OcrConfig.language).
#' @field tesseract_config Tesseract-specific config override for this stage.
#' @field paddle_ocr_config PaddleOCR-specific config for this stage.
#' @field vlm_config VLM config override for this pipeline stage.
#' @field backend_options Arbitrary per-call options passed through to the backend unchanged.
#' @export
OcrPipelineStage <- new.env(parent = emptyenv())
#' @export
`$.OcrPipelineStage` <- function(self, name) {
  # Members are looked up in the class environment; an unknown name gives NULL.
  member <- OcrPipelineStage[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # No leading `self` formal: hand the member back unbound.
    return(member)
  }
  # Leading `self` formal: bind the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.OcrPipelineStage` <- `$.OcrPipelineStage`
#' OCR configuration
#' @field enabled Whether OCR is enabled.
#' @field backend OCR backend: tesseract, easyocr, paddleocr
#' @field language Language code (e.g., "eng", "deu")
#' @field tesseract_config Tesseract-specific configuration (optional)
#' @field output_format Output format for OCR results (optional, for format conversion)
#' @field paddle_ocr_config PaddleOCR-specific configuration (optional, JSON passthrough)
#' @field backend_options Arbitrary per-call options passed through to the backend unchanged.
#' @field element_config OCR element extraction configuration
#' @field quality_thresholds Quality thresholds for the native-text-to-OCR fallback decision. When None, uses compiled
#' @field pipeline Multi-backend OCR pipeline configuration. When set, enables weighted fallback across multiple OCR
#' @field auto_rotate Enable automatic page rotation based on orientation detection.
#' @field vlm_config VLM (Vision Language Model) OCR configuration.
#' @field vlm_prompt Custom Jinja2 prompt template for VLM OCR.
#' @field acceleration Hardware acceleration for ONNX Runtime models (e.g. PaddleOCR, layout detection).
#' @field tessdata_bytes Caller-supplied Tesseract `traineddata` bytes per language code.
#' @export
OcrConfig <- new.env(parent = emptyenv())
# Both members below take no `self`, so `$` returns them unbound.
OcrConfig$default <- function() {
  .Call("wrap__OcrConfig__default", PACKAGE = "kreuzberg")
}
OcrConfig$from_json <- function(json) .Call("wrap__OcrConfig__from_json", json, PACKAGE = "kreuzberg")
#' @export
`$.OcrConfig` <- function(self, name) {
  # Members are looked up in the class environment; an unknown name gives NULL.
  member <- OcrConfig[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # No leading `self` formal: hand the member back unbound.
    return(member)
  }
  # Leading `self` formal: bind the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.OcrConfig` <- `$.OcrConfig`
#' Page extraction and tracking configuration
#'
#' Controls how pages are extracted, tracked, and represented in the extraction results.
#' When `None`, page tracking is disabled.
#'
#' Page range tracking in chunk metadata (first_page/last_page) is automatically enabled
#' when page boundaries are available and chunking is configured.
#' @field extract_pages Extract pages as separate array (ExtractionResult.pages)
#' @field insert_page_markers Insert page markers in main content string
#' @field marker_format Page marker format (use {page_num} placeholder) Default: "\n\n<!-- PAGE {page_num} -->\n\n"
#' @export
PageConfig <- new.env(parent = emptyenv())
# Both members below take no `self`, so `$` returns them unbound.
PageConfig$default <- function() {
  .Call("wrap__PageConfig__default", PACKAGE = "kreuzberg")
}
PageConfig$from_json <- function(json) .Call("wrap__PageConfig__from_json", json, PACKAGE = "kreuzberg")
#' @export
`$.PageConfig` <- function(self, name) {
  # Members are looked up in the class environment; an unknown name gives NULL.
  member <- PageConfig[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # No leading `self` formal: hand the member back unbound.
    return(member)
  }
  # Leading `self` formal: bind the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.PageConfig` <- `$.PageConfig`
#' PDF-specific configuration
#' @field extract_images Extract images from PDF
#' @field extract_tables Extract tables from PDF.
#' @field passwords List of passwords to try when opening encrypted PDFs
#' @field extract_metadata Extract PDF metadata
#' @field hierarchy Hierarchy extraction configuration (None = hierarchy extraction disabled)
#' @field extract_annotations Extract PDF annotations (text notes, highlights, links, stamps). Default: false
#' @field top_margin_fraction Top margin fraction (0.0-1.0) of page height to exclude headers/running heads. Default:
#' @field bottom_margin_fraction Bottom margin fraction (0.0-1.0) of page height to exclude footers/page numbers.
#' @field allow_single_column_tables Allow single-column pseudo tables in extraction results.
#' @field ocr_inline_images Perform OCR on inline images extracted from PDF pages and attach the recognized text to
#' @export
PdfConfig <- new.env(parent = emptyenv())
# Both members below take no `self`, so `$` returns them unbound.
PdfConfig$default <- function() {
  .Call("wrap__PdfConfig__default", PACKAGE = "kreuzberg")
}
PdfConfig$from_json <- function(json) .Call("wrap__PdfConfig__from_json", json, PACKAGE = "kreuzberg")
#' @export
`$.PdfConfig` <- function(self, name) {
  # Members are looked up in the class environment; an unknown name gives NULL.
  member <- PdfConfig[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # No leading `self` formal: hand the member back unbound.
    return(member)
  }
  # Leading `self` formal: bind the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.PdfConfig` <- `$.PdfConfig`
#' Hierarchy extraction configuration for PDF text structure analysis
#'
#' Enables extraction of document hierarchy levels (H1-H6) based on font size
#' clustering and semantic analysis. When enabled, hierarchical blocks are
#' included in page content.
#' @field enabled Enable hierarchy extraction
#' @field k_clusters Number of font size clusters to use for hierarchy levels (1-7)
#' @field include_bbox Include bounding box information in hierarchy blocks
#' @field ocr_coverage_threshold OCR coverage threshold for smart OCR triggering (0.0-1.0)
#' @export
HierarchyConfig <- new.env(parent = emptyenv())
# Both members below take no `self`, so `$` returns them unbound.
HierarchyConfig$default <- function() {
  .Call("wrap__HierarchyConfig__default", PACKAGE = "kreuzberg")
}
HierarchyConfig$from_json <- function(json) .Call("wrap__HierarchyConfig__from_json", json, PACKAGE = "kreuzberg")
#' @export
`$.HierarchyConfig` <- function(self, name) {
  # Members are looked up in the class environment; an unknown name gives NULL.
  member <- HierarchyConfig[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # No leading `self` formal: hand the member back unbound.
    return(member)
  }
  # Leading `self` formal: bind the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.HierarchyConfig` <- `$.HierarchyConfig`
#' Post-processor configuration
#' @field enabled Enable post-processors
#' @field enabled_processors Whitelist of processor names to run (None = all enabled)
#' @field disabled_processors Blacklist of processor names to skip (None = none disabled)
#' @field enabled_set Pre-computed AHashSet for O(1) enabled processor lookup
#' @field disabled_set Pre-computed AHashSet for O(1) disabled processor lookup
#' @export
PostProcessorConfig <- new.env(parent = emptyenv())
# Both members below take no `self`, so `$` returns them unbound.
PostProcessorConfig$default <- function() {
  .Call("wrap__PostProcessorConfig__default", PACKAGE = "kreuzberg")
}
PostProcessorConfig$from_json <- function(json) .Call("wrap__PostProcessorConfig__from_json", json, PACKAGE = "kreuzberg")
#' @export
`$.PostProcessorConfig` <- function(self, name) {
  # Members are looked up in the class environment; an unknown name gives NULL.
  member <- PostProcessorConfig[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # No leading `self` formal: hand the member back unbound.
    return(member)
  }
  # Leading `self` formal: bind the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.PostProcessorConfig` <- `$.PostProcessorConfig`
#' Chunking configuration
#'
#' Configures text chunking for document content, including chunk size,
#' overlap, trimming behavior, and optional embeddings.
#'
#' Use `..Default::default()` when constructing to allow for future field additions:
#' ```rust
#' let config = ChunkingConfig {
#' max_characters: 500,
#' ..Default::default()
#' };
#' ```
#' @field max_characters Maximum size per chunk (in units determined by `sizing`).
#' @field overlap Overlap between chunks (in units determined by `sizing`).
#' @field trim Whether to trim whitespace from chunk boundaries.
#' @field chunker_type Type of chunker to use (Text or Markdown).
#' @field embedding Optional embedding configuration for chunk embeddings.
#' @field preset Use a preset configuration (overrides individual settings if provided).
#' @field sizing How to measure chunk size.
#' @field prepend_heading_context When `true` and `chunker_type` is `Markdown`, prepend the heading hierarchy path
#' @field topic_threshold Optional cosine similarity threshold for semantic topic boundary detection.
#' @export
ChunkingConfig <- new.env(parent = emptyenv())
# Both members below take no `self`, so `$` returns them unbound.
ChunkingConfig$default <- function() {
  .Call("wrap__ChunkingConfig__default", PACKAGE = "kreuzberg")
}
ChunkingConfig$from_json <- function(json) .Call("wrap__ChunkingConfig__from_json", json, PACKAGE = "kreuzberg")
#' @export
`$.ChunkingConfig` <- function(self, name) {
  # Members are looked up in the class environment; an unknown name gives NULL.
  member <- ChunkingConfig[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # No leading `self` formal: hand the member back unbound.
    return(member)
  }
  # Leading `self` formal: bind the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.ChunkingConfig` <- `$.ChunkingConfig`
#' Embedding configuration for text chunks
#'
#' Configures embedding generation using ONNX models via the vendored embedding engine.
#' Requires the `embeddings` feature to be enabled.
#' @field model The embedding model to use (defaults to "balanced" preset if not specified)
#' @field normalize Whether to normalize embedding vectors (recommended for cosine similarity)
#' @field batch_size Batch size for embedding generation
#' @field show_download_progress Show model download progress
#' @field cache_dir Custom cache directory for model files
#' @field acceleration Hardware acceleration for the embedding ONNX model.
#' @field max_embed_duration_secs Maximum wall-clock duration (in seconds) for a single `embed()` call when using
#' @export
EmbeddingConfig <- new.env(parent = emptyenv())
# Both members below take no `self`, so `$` returns them unbound.
EmbeddingConfig$default <- function() {
  .Call("wrap__EmbeddingConfig__default", PACKAGE = "kreuzberg")
}
EmbeddingConfig$from_json <- function(json) .Call("wrap__EmbeddingConfig__from_json", json, PACKAGE = "kreuzberg")
#' @export
`$.EmbeddingConfig` <- function(self, name) {
  # Members are looked up in the class environment; an unknown name gives NULL.
  member <- EmbeddingConfig[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # No leading `self` formal: hand the member back unbound.
    return(member)
  }
  # Leading `self` formal: bind the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.EmbeddingConfig` <- `$.EmbeddingConfig`
#' Configuration for tree-sitter language pack integration
#'
#' Controls grammar download behavior and code analysis options.
#'
#' # Example (TOML)
#'
#' ```toml
#' [tree_sitter]
#' languages = ["python", "rust"]
#' groups = ["web"]
#'
#' [tree_sitter.process]
#' structure = true
#' comments = true
#' docstrings = true
#' ```
#' @field enabled Enable code intelligence processing (default: true).
#' @field cache_dir Custom cache directory for downloaded grammars.
#' @field languages Languages to pre-download on init (e.g., `["python", "rust"]`).
#' @field groups Language groups to pre-download (e.g., `["web", "systems", "scripting"]`).
#' @field process Processing options for code analysis.
#' @export
TreeSitterConfig <- new.env(parent = emptyenv())
# Both members below take no `self`, so `$` returns them unbound.
TreeSitterConfig$default <- function() {
  .Call("wrap__TreeSitterConfig__default", PACKAGE = "kreuzberg")
}
TreeSitterConfig$from_json <- function(json) .Call("wrap__TreeSitterConfig__from_json", json, PACKAGE = "kreuzberg")
#' @export
`$.TreeSitterConfig` <- function(self, name) {
  # Members are looked up in the class environment; an unknown name gives NULL.
  member <- TreeSitterConfig[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # No leading `self` formal: hand the member back unbound.
    return(member)
  }
  # Leading `self` formal: bind the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.TreeSitterConfig` <- `$.TreeSitterConfig`
#' Processing options for tree-sitter code analysis
#'
#' Controls which analysis features are enabled when extracting code files.
#' @field structure Extract structural items (functions, classes, structs, etc.). Default: true.
#' @field imports Extract import statements. Default: true.
#' @field exports Extract export statements. Default: true.
#' @field comments Extract comments. Default: false.
#' @field docstrings Extract docstrings. Default: false.
#' @field symbols Extract symbol definitions. Default: false.
#' @field diagnostics Include parse diagnostics. Default: false.
#' @field chunk_max_size Maximum chunk size in bytes. `None` disables chunking.
#' @field content_mode Content rendering mode for code extraction.
#' @export
TreeSitterProcessConfig <- new.env(parent = emptyenv())
# Both members below take no `self`, so `$` returns them unbound.
TreeSitterProcessConfig$default <- function() {
  .Call("wrap__TreeSitterProcessConfig__default", PACKAGE = "kreuzberg")
}
TreeSitterProcessConfig$from_json <- function(json) .Call("wrap__TreeSitterProcessConfig__from_json", json, PACKAGE = "kreuzberg")
#' @export
`$.TreeSitterProcessConfig` <- function(self, name) {
  # Members are looked up in the class environment; an unknown name gives NULL.
  member <- TreeSitterProcessConfig[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # No leading `self` formal: hand the member back unbound.
    return(member)
  }
  # Leading `self` formal: bind the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.TreeSitterProcessConfig` <- `$.TreeSitterProcessConfig`
#' A supported document format entry
#'
#' Represents a file extension and its corresponding MIME type that Kreuzberg can process.
#' @field extension File extension (without leading dot), e.g., "pdf", "docx"
#' @field mime_type MIME type string, e.g., "application/pdf"
#' @export
SupportedFormat <- new.env(parent = emptyenv())
#' @export
`$.SupportedFormat` <- function(self, name) {
  # Members are looked up in the class environment; an unknown name gives NULL.
  member <- SupportedFormat[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # No leading `self` formal: hand the member back unbound.
    return(member)
  }
  # Leading `self` formal: bind the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.SupportedFormat` <- `$.SupportedFormat`
#' API server configuration
#'
#' This struct holds all configuration options for the Kreuzberg API server,
#' including host/port settings, CORS configuration, and upload limits.
#'
#' # Defaults
#'
#' - `host`: "127.0.0.1" (localhost only)
#' - `port`: 8000
#' - `cors_origins`: empty vector (allows all origins)
#' - `max_request_body_bytes`: 104_857_600 (100 MB)
#' - `max_multipart_field_bytes`: 104_857_600 (100 MB)
#' @field host Server host address (e.g., "127.0.0.1", "0.0.0.0")
#' @field port Server port number
#' @field cors_origins CORS allowed origins. Empty vector means allow all origins.
#' @field max_request_body_bytes Maximum size of request body in bytes (default: 100 MB)
#' @field max_multipart_field_bytes Maximum size of multipart fields in bytes (default: 100 MB)
#' @export
ServerConfig <- new.env(parent = emptyenv())
# Members without a `self` formal: `$` returns them unbound.
ServerConfig$default <- function() {
  .Call("wrap__ServerConfig__default", PACKAGE = "kreuzberg")
}
ServerConfig$from_json <- function(json) .Call("wrap__ServerConfig__from_json", json, PACKAGE = "kreuzberg")
# Members with a leading `self` formal: `$` binds the receiver before the call.
ServerConfig$listen_addr <- function(self) {
  .Call("wrap__ServerConfig__listen_addr", self, PACKAGE = "kreuzberg")
}
ServerConfig$cors_allows_all <- function(self) {
  .Call("wrap__ServerConfig__cors_allows_all", self, PACKAGE = "kreuzberg")
}
ServerConfig$is_origin_allowed <- function(self, origin) {
  .Call("wrap__ServerConfig__is_origin_allowed", self, origin, PACKAGE = "kreuzberg")
}
ServerConfig$max_request_body_mb <- function(self) {
  .Call("wrap__ServerConfig__max_request_body_mb", self, PACKAGE = "kreuzberg")
}
ServerConfig$max_multipart_field_mb <- function(self) {
  .Call("wrap__ServerConfig__max_multipart_field_mb", self, PACKAGE = "kreuzberg")
}
#' @export
`$.ServerConfig` <- function(self, name) {
  # Members are looked up in the class environment; an unknown name gives NULL.
  member <- ServerConfig[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # No leading `self` formal: hand the member back unbound.
    return(member)
  }
  # Leading `self` formal: bind the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.ServerConfig` <- `$.ServerConfig`
#' @export
listen_addr.ServerConfig <- function(x, ...) {
  # Delegate to the `listen_addr` member obtained from `x` via `$`.
  x$listen_addr(...)
}
#' @export
cors_allows_all.ServerConfig <- function(x, ...) {
  # Delegate to the `cors_allows_all` member obtained from `x` via `$`.
  x$cors_allows_all(...)
}
#' @export
is_origin_allowed.ServerConfig <- function(x, ...) {
  # Delegate to the `is_origin_allowed` member obtained from `x` via `$`;
  # extra arguments are passed through `...`.
  x$is_origin_allowed(...)
}
#' @export
max_request_body_mb.ServerConfig <- function(x, ...) {
  # Delegate to the `max_request_body_mb` member obtained from `x` via `$`.
  x$max_request_body_mb(...)
}
#' @export
max_multipart_field_mb.ServerConfig <- function(x, ...) {
  # Delegate to the `max_multipart_field_mb` member obtained from `x` via `$`.
  x$max_multipart_field_mb(...)
}
#' StructuredDataResult
#' @field content content
#' @field format format
#' @field metadata metadata
#' @field text_fields text_fields
#' @export
StructuredDataResult <- new.env(parent = emptyenv())
#' @export
`$.StructuredDataResult` <- function(self, name) {
  # Members are looked up in the class environment; an unknown name gives NULL.
  member <- StructuredDataResult[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # No leading `self` formal: hand the member back unbound.
    return(member)
  }
  # Leading `self` formal: bind the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.StructuredDataResult` <- `$.StructuredDataResult`
#' Application properties from docProps/app.xml for DOCX
#'
#' Contains Word-specific document statistics and metadata.
#' @field application Application name (e.g., "Microsoft Office Word")
#' @field app_version Application version
#' @field template Template filename
#' @field total_time Total editing time in minutes
#' @field pages Number of pages
#' @field words Number of words
#' @field characters Number of characters (excluding spaces)
#' @field characters_with_spaces Number of characters (including spaces)
#' @field lines Number of lines
#' @field paragraphs Number of paragraphs
#' @field company Company name
#' @field doc_security Document security level
#' @field scale_crop Scale crop flag
#' @field links_up_to_date Links up to date flag
#' @field shared_doc Shared document flag
#' @field hyperlinks_changed Hyperlinks changed flag
#' @export
DocxAppProperties <- new.env(parent = emptyenv())
# Forwards `json` to the native routine wrap__DocxAppProperties__from_json.
DocxAppProperties$from_json <- function(json) .Call("wrap__DocxAppProperties__from_json", json, PACKAGE = "kreuzberg")
#' @export
`$.DocxAppProperties` <- function(self, name) {
  # Members are looked up in the class environment; an unknown name gives NULL.
  member <- DocxAppProperties[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # No leading `self` formal: hand the member back unbound.
    return(member)
  }
  # Leading `self` formal: bind the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.DocxAppProperties` <- `$.DocxAppProperties`
#' Application properties from docProps/app.xml for XLSX
#'
#' Contains Excel-specific document metadata.
#' @field application Application name (e.g., "Microsoft Excel")
#' @field app_version Application version
#' @field doc_security Document security level
#' @field scale_crop Scale crop flag
#' @field links_up_to_date Links up to date flag
#' @field shared_doc Shared document flag
#' @field hyperlinks_changed Hyperlinks changed flag
#' @field company Company name
#' @field worksheet_names Worksheet names
#' @export
XlsxAppProperties <- new.env(parent = emptyenv())
#' @export
`$.XlsxAppProperties` <- function(self, name) {
  # Members are looked up in the class environment; an unknown name gives NULL.
  member <- XlsxAppProperties[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # No leading `self` formal: hand the member back unbound.
    return(member)
  }
  # Leading `self` formal: bind the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.XlsxAppProperties` <- `$.XlsxAppProperties`
#' Application properties from docProps/app.xml for PPTX
#'
#' Contains PowerPoint-specific document metadata.
#' @field application Application name (e.g., "Microsoft Office PowerPoint")
#' @field app_version Application version
#' @field total_time Total editing time in minutes
#' @field company Company name
#' @field doc_security Document security level
#' @field scale_crop Scale crop flag
#' @field links_up_to_date Links up to date flag
#' @field shared_doc Shared document flag
#' @field hyperlinks_changed Hyperlinks changed flag
#' @field slides Number of slides
#' @field notes Number of notes
#' @field hidden_slides Number of hidden slides
#' @field multimedia_clips Number of multimedia clips
#' @field presentation_format Presentation format (e.g., "Widescreen", "Standard")
#' @field slide_titles Slide titles
#' @export
PptxAppProperties <- new.env(parent = emptyenv())
#' @export
`$.PptxAppProperties` <- function(self, name) {
  # Members are looked up in the class environment; an unknown name gives NULL.
  member <- PptxAppProperties[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # No leading `self` formal: hand the member back unbound.
    return(member)
  }
  # Leading `self` formal: bind the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.PptxAppProperties` <- `$.PptxAppProperties`
#' Dublin Core metadata from docProps/core.xml
#'
#' Contains standard metadata fields defined by the Dublin Core standard
#' and Office-specific extensions.
#' @field title Document title
#' @field subject Document subject/topic
#' @field creator Document creator/author
#' @field keywords Keywords or tags
#' @field description Document description/abstract
#' @field last_modified_by User who last modified the document
#' @field revision Revision number
#' @field created Creation timestamp (ISO 8601)
#' @field modified Last modification timestamp (ISO 8601)
#' @field category Document category
#' @field content_status Content status (Draft, Final, etc.)
#' @field language Document language
#' @field identifier Unique identifier
#' @field version Document version
#' @field last_printed Last print timestamp (ISO 8601)
#' @export
CoreProperties <- new.env(parent = emptyenv())
# Forwards `json` to the native routine wrap__CoreProperties__from_json.
CoreProperties$from_json <- function(json) .Call("wrap__CoreProperties__from_json", json, PACKAGE = "kreuzberg")
#' @export
`$.CoreProperties` <- function(self, name) {
  # Members are looked up in the class environment; an unknown name gives NULL.
  member <- CoreProperties[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # No leading `self` formal: hand the member back unbound.
    return(member)
  }
  # Leading `self` formal: bind the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.CoreProperties` <- `$.CoreProperties`
#' Configuration for security limits across extractors
#'
#' All limits are intentionally conservative to prevent DoS attacks
#' while still supporting legitimate documents.
#' @field max_archive_size Maximum uncompressed size for archives (500 MB)
#' @field max_compression_ratio Maximum compression ratio before flagging as potential bomb (100:1)
#' @field max_files_in_archive Maximum number of files in archive (10,000)
#' @field max_nesting_depth Maximum nesting depth for structures (100)
#' @field max_entity_length Maximum length of any single XML entity / attribute / token (1 MiB). This is a per-token
#' @field max_content_size Maximum string growth per document (100 MB)
#' @field max_iterations Maximum iterations per operation
#' @field max_xml_depth Maximum XML depth (100 levels)
#' @field max_table_cells Maximum cells per table (100,000)
#' @export
SecurityLimits <- new.env(parent = emptyenv())
# Both members below take no `self`, so `$` returns them unbound.
SecurityLimits$default <- function() {
  .Call("wrap__SecurityLimits__default", PACKAGE = "kreuzberg")
}
SecurityLimits$from_json <- function(json) .Call("wrap__SecurityLimits__from_json", json, PACKAGE = "kreuzberg")
#' @export
`$.SecurityLimits` <- function(self, name) {
  # Members are looked up in the class environment; an unknown name gives NULL.
  member <- SecurityLimits[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # No leading `self` formal: hand the member back unbound.
    return(member)
  }
  # Leading `self` formal: bind the receiver as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.SecurityLimits` <- `$.SecurityLimits`
#' TokenReductionConfig
#' @field level level
#' @field language_hint language_hint
#' @field preserve_markdown preserve_markdown
#' @field preserve_code preserve_code
#' @field semantic_threshold semantic_threshold
#' @field enable_parallel enable_parallel
#' @field use_simd use_simd
#' @field custom_stopwords custom_stopwords
#' @field preserve_patterns preserve_patterns
#' @field target_reduction target_reduction
#' @field enable_semantic_clustering enable_semantic_clustering
#' @export
TokenReductionConfig <- new.env(parent = emptyenv())
# Zero-argument call into the native routine
# `wrap__TokenReductionConfig__default`.
TokenReductionConfig$default <- function() {
  .Call("wrap__TokenReductionConfig__default", PACKAGE = "kreuzberg")
}
# Forwards `json` to the native routine
# `wrap__TokenReductionConfig__from_json`.
TokenReductionConfig$from_json <- function(json) {
  .Call(
    "wrap__TokenReductionConfig__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.TokenReductionConfig` <- function(self, name) {
  # Resolve the requested member from the class environment (NULL if absent).
  member <- TokenReductionConfig[[name]]
  # Anything whose first formal is not `self` is handed back untouched.
  if (!identical(names(formals(member))[1], "self")) {
    return(member)
  }
  # Bind the receiver so the method can be called as `obj$method(...)`.
  function(...) member(self, ...)
}
#' @export
`[[.TokenReductionConfig` <- `$.TokenReductionConfig`
#' A PDF annotation extracted from a document page
#' @field annotation_type The type of annotation.
#' @field content Text content of the annotation (e.g., comment text, link URL).
#' @field page_number Page number where the annotation appears (1-indexed).
#' @field bounding_box Bounding box of the annotation on the page.
#' @export
PdfAnnotation <- new.env(parent = emptyenv())
#' @export
`$.PdfAnnotation` <- function(self, name) {
  # Resolve the requested member from the class environment (NULL if absent).
  member <- PdfAnnotation[[name]]
  # Anything whose first formal is not `self` is handed back untouched.
  if (!identical(names(formals(member))[1], "self")) {
    return(member)
  }
  # Bind the receiver so the method can be called as `obj$method(...)`.
  function(...) member(self, ...)
}
#' @export
`[[.PdfAnnotation` <- `$.PdfAnnotation`
#' Inline element within a block
#'
#' Represents text with formatting, links, images, etc.
#' @field element_type Type of inline element
#' @field content Text content
#' @field attributes Element attributes
#' @field metadata Additional metadata (e.g., href for links, src/alt for images)
#' @export
InlineElement <- new.env(parent = emptyenv())
#' @export
`$.InlineElement` <- function(self, name) {
  # Resolve the requested member from the class environment (NULL if absent).
  member <- InlineElement[[name]]
  # Anything whose first formal is not `self` is handed back untouched.
  if (!identical(names(formals(member))[1], "self")) {
    return(member)
  }
  # Bind the receiver so the method can be called as `obj$method(...)`.
  function(...) member(self, ...)
}
#' @export
`[[.InlineElement` <- `$.InlineElement`
#' Image element in Djot
#' @field src Image source URL or path
#' @field alt Alternative text
#' @field title Optional title
#' @field attributes Element attributes
#' @export
DjotImage <- new.env(parent = emptyenv())
#' @export
`$.DjotImage` <- function(self, name) {
  # Resolve the requested member from the class environment (NULL if absent).
  member <- DjotImage[[name]]
  # Anything whose first formal is not `self` is handed back untouched.
  if (!identical(names(formals(member))[1], "self")) {
    return(member)
  }
  # Bind the receiver so the method can be called as `obj$method(...)`.
  function(...) member(self, ...)
}
#' @export
`[[.DjotImage` <- `$.DjotImage`
#' Link element in Djot
#' @field url Link URL
#' @field text Link text content
#' @field title Optional title
#' @field attributes Element attributes
#' @export
DjotLink <- new.env(parent = emptyenv())
#' @export
`$.DjotLink` <- function(self, name) {
  # Resolve the requested member from the class environment (NULL if absent).
  member <- DjotLink[[name]]
  # Anything whose first formal is not `self` is handed back untouched.
  if (!identical(names(formals(member))[1], "self")) {
    return(member)
  }
  # Bind the receiver so the method can be called as `obj$method(...)`.
  function(...) member(self, ...)
}
#' @export
`[[.DjotLink` <- `$.DjotLink`
#' A resolved relationship between two nodes in the document tree
#' @field source Source node index (the referencing node).
#' @field target Target node index (the referenced node).
#' @field kind Semantic kind of the relationship.
#' @export
DocumentRelationship <- new.env(parent = emptyenv())
#' @export
`$.DocumentRelationship` <- function(self, name) {
  # Resolve the requested member from the class environment (NULL if absent).
  member <- DocumentRelationship[[name]]
  # Anything whose first formal is not `self` is handed back untouched.
  if (!identical(names(formals(member))[1], "self")) {
    return(member)
  }
  # Bind the receiver so the method can be called as `obj$method(...)`.
  function(...) member(self, ...)
}
#' @export
`[[.DocumentRelationship` <- `$.DocumentRelationship`
#' Individual grid cell with position and span metadata
#' @field content Cell text content.
#' @field row Zero-indexed row position.
#' @field col Zero-indexed column position.
#' @field row_span Number of rows this cell spans.
#' @field col_span Number of columns this cell spans.
#' @field is_header Whether this is a header cell.
#' @field bbox Bounding box for this cell (if available).
#' @export
GridCell <- new.env(parent = emptyenv())
#' @export
`$.GridCell` <- function(self, name) {
  # Resolve the requested member from the class environment (NULL if absent).
  member <- GridCell[[name]]
  # Anything whose first formal is not `self` is handed back untouched.
  if (!identical(names(formals(member))[1], "self")) {
    return(member)
  }
  # Bind the receiver so the method can be called as `obj$method(...)`.
  function(...) member(self, ...)
}
#' @export
`[[.GridCell` <- `$.GridCell`
#' Inline text annotation — byte-range based formatting and links
#'
#' Annotations reference byte offsets into the node's text content,
#' enabling precise identification of formatted regions.
#' @field start Start byte offset in the node's text content (inclusive).
#' @field end End byte offset in the node's text content (exclusive).
#' @field kind Annotation type.
#' @export
TextAnnotation <- new.env(parent = emptyenv())
#' @export
`$.TextAnnotation` <- function(self, name) {
  # Resolve the requested member from the class environment (NULL if absent).
  member <- TextAnnotation[[name]]
  # Anything whose first formal is not `self` is handed back untouched.
  if (!identical(names(formals(member))[1], "self")) {
    return(member)
  }
  # Bind the receiver so the method can be called as `obj$method(...)`.
  function(...) member(self, ...)
}
#' @export
`[[.TextAnnotation` <- `$.TextAnnotation`
#' A single file extracted from an archive
#'
#' When archives (ZIP, TAR, 7Z, GZIP) are extracted with recursive extraction
#' enabled, each processable file produces its own full `ExtractionResult`.
#' @field path Archive-relative file path (e.g. "folder/document.pdf").
#' @field mime_type Detected MIME type of the file.
#' @field result Full extraction result for this file.
#' @export
ArchiveEntry <- new.env(parent = emptyenv())
#' @export
`$.ArchiveEntry` <- function(self, name) {
  # Resolve the requested member from the class environment (NULL if absent).
  member <- ArchiveEntry[[name]]
  # Anything whose first formal is not `self` is handed back untouched.
  if (!identical(names(formals(member))[1], "self")) {
    return(member)
  }
  # Bind the receiver so the method can be called as `obj$method(...)`.
  function(...) member(self, ...)
}
#' @export
`[[.ArchiveEntry` <- `$.ArchiveEntry`
#' A non-fatal warning from a processing pipeline stage
#'
#' Captures errors from optional features that don't prevent extraction
#' but may indicate degraded results.
#' @field source The pipeline stage or feature that produced this warning (e.g., "embedding", "chunking").
#' @field message Human-readable description of what went wrong.
#' @export
ProcessingWarning <- new.env(parent = emptyenv())
#' @export
`$.ProcessingWarning` <- function(self, name) {
  # Resolve the requested member from the class environment (NULL if absent).
  member <- ProcessingWarning[[name]]
  # Anything whose first formal is not `self` is handed back untouched.
  if (!identical(names(formals(member))[1], "self")) {
    return(member)
  }
  # Bind the receiver so the method can be called as `obj$method(...)`.
  function(...) member(self, ...)
}
#' @export
`[[.ProcessingWarning` <- `$.ProcessingWarning`
#' Token usage and cost data for a single LLM call made during extraction
#'
#' Populated when VLM OCR, structured extraction, or LLM-based embeddings
#' are used. Multiple entries may be present when multiple LLM calls occur
#' within one extraction (e.g. VLM OCR + structured extraction).
#' @field model The LLM model identifier (e.g. "openai/gpt-4o", "anthropic/claude-sonnet-4-20250514").
#' @field source The pipeline stage that triggered this LLM call (e.g. "vlm_ocr", "structured_extraction").
#' @field input_tokens Number of input/prompt tokens consumed.
#' @field output_tokens Number of output/completion tokens generated.
#' @field total_tokens Total tokens (input + output).
#' @field estimated_cost Estimated cost in USD based on the provider's published pricing.
#' @field finish_reason Why the model stopped generating (e.g. "stop", "length", "content_filter").
#' @export
LlmUsage <- new.env(parent = emptyenv())
# Forwards `json` to the native routine `wrap__LlmUsage__from_json`.
LlmUsage$from_json <- function(json) {
  .Call(
    "wrap__LlmUsage__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.LlmUsage` <- function(self, name) {
  # Resolve the requested member from the class environment (NULL if absent).
  member <- LlmUsage[[name]]
  # Anything whose first formal is not `self` is handed back untouched.
  if (!identical(names(formals(member))[1], "self")) {
    return(member)
  }
  # Bind the receiver so the method can be called as `obj$method(...)`.
  function(...) member(self, ...)
}
#' @export
`[[.LlmUsage` <- `$.LlmUsage`
#' A text chunk with optional embedding and metadata
#'
#' Chunks are created when chunking is enabled in `ExtractionConfig`. Each chunk
#' contains the text content, optional embedding vector (if embedding generation
#' is configured), and metadata about its position in the document.
#' @field content The text content of this chunk.
#' @field chunk_type Semantic structural classification of this chunk.
#' @field embedding Optional embedding vector for this chunk.
#' @field metadata Metadata about this chunk's position and properties.
#' @export
Chunk <- new.env(parent = emptyenv())
#' @export
`$.Chunk` <- function(self, name) {
  # Resolve the requested member from the class environment (NULL if absent).
  member <- Chunk[[name]]
  # Anything whose first formal is not `self` is handed back untouched.
  if (!identical(names(formals(member))[1], "self")) {
    return(member)
  }
  # Bind the receiver so the method can be called as `obj$method(...)`.
  function(...) member(self, ...)
}
#' @export
`[[.Chunk` <- `$.Chunk`
#' A single heading in the hierarchy
#' @field level Heading depth (1 = h1, 2 = h2, etc.)
#' @field text The text content of the heading.
#' @export
HeadingLevel <- new.env(parent = emptyenv())
#' @export
`$.HeadingLevel` <- function(self, name) {
  # Resolve the requested member from the class environment (NULL if absent).
  member <- HeadingLevel[[name]]
  # Anything whose first formal is not `self` is handed back untouched.
  if (!identical(names(formals(member))[1], "self")) {
    return(member)
  }
  # Bind the receiver so the method can be called as `obj$method(...)`.
  function(...) member(self, ...)
}
#' @export
`[[.HeadingLevel` <- `$.HeadingLevel`
#' Metadata about a chunk's position in the original document
#' @field byte_start Byte offset where this chunk starts in the original text (UTF-8 valid boundary).
#' @field byte_end Byte offset where this chunk ends in the original text (UTF-8 valid boundary).
#' @field token_count Number of tokens in this chunk (if available).
#' @field chunk_index Zero-based index of this chunk in the document.
#' @field total_chunks Total number of chunks in the document.
#' @field first_page First page number this chunk spans (1-indexed).
#' @field last_page Last page number this chunk spans (1-indexed, equal to first_page for single-page chunks).
#' @field heading_context Heading context when using Markdown chunker.
#' @field image_indices Indices into `ExtractionResult.images` for images on pages covered by this chunk.
#' @export
ChunkMetadata <- new.env(parent = emptyenv())
#' @export
`$.ChunkMetadata` <- function(self, name) {
  # Resolve the requested member from the class environment (NULL if absent).
  member <- ChunkMetadata[[name]]
  # Anything whose first formal is not `self` is handed back untouched.
  if (!identical(names(formals(member))[1], "self")) {
    return(member)
  }
  # Bind the receiver so the method can be called as `obj$method(...)`.
  function(...) member(self, ...)
}
#' @export
`[[.ChunkMetadata` <- `$.ChunkMetadata`
#' Extracted image from a document
#'
#' Contains raw image data, metadata, and optional nested OCR results.
#' Raw bytes allow cross-language compatibility - users can convert to
#' PIL.Image (Python), Sharp (Node.js), or other formats as needed.
#' @field data Raw image data (PNG, JPEG, WebP, etc. bytes). Uses `bytes::Bytes` for cheap cloning of large buffers.
#' @field format Image format (e.g., "jpeg", "png", "webp"). Uses `Cow<'static, str>` to avoid allocation for static strings.
#' @field image_index Zero-indexed position of this image in the document/page
#' @field page_number Page/slide number where image was found (1-indexed)
#' @field width Image width in pixels
#' @field height Image height in pixels
#' @field colorspace Colorspace information (e.g., "RGB", "CMYK", "Gray")
#' @field bits_per_component Bits per color component (e.g., 8, 16)
#' @field is_mask Whether this image is a mask image
#' @field description Optional description of the image
#' @field ocr_result Nested OCR extraction result (if image was OCRed)
#' @field bounding_box Bounding box of the image on the page (PDF coordinates: x0=left, y0=bottom, x1=right, y1=top).
#' @field source_path Original source path of the image within the document archive (e.g., "media/image1.png" in DOCX).
#' @field image_kind Heuristic classification of what this image likely depicts. `None` if classification was disabled.
#' @field kind_confidence Confidence score for `image_kind`, in the range 0.0 to 1.0.
#' @field cluster_id Identifier shared across images that form a single logical figure (e.g. all raster tiles of one figure).
#' @export
ExtractedImage <- new.env(parent = emptyenv())
#' @export
`$.ExtractedImage` <- function(self, name) {
  # Resolve the requested member from the class environment (NULL if absent).
  member <- ExtractedImage[[name]]
  # Anything whose first formal is not `self` is handed back untouched.
  if (!identical(names(formals(member))[1], "self")) {
    return(member)
  }
  # Bind the receiver so the method can be called as `obj$method(...)`.
  function(...) member(self, ...)
}
#' @export
`[[.ExtractedImage` <- `$.ExtractedImage`
#' Bounding box coordinates for element positioning
#' @field x0 Left x-coordinate
#' @field y0 Bottom y-coordinate
#' @field x1 Right x-coordinate
#' @field y1 Top y-coordinate
#' @export
BoundingBox <- new.env(parent = emptyenv())
# Forwards `json` to the native routine `wrap__BoundingBox__from_json`.
BoundingBox$from_json <- function(json) {
  .Call(
    "wrap__BoundingBox__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.BoundingBox` <- function(self, name) {
  # Resolve the requested member from the class environment (NULL if absent).
  member <- BoundingBox[[name]]
  # Anything whose first formal is not `self` is handed back untouched.
  if (!identical(names(formals(member))[1], "self")) {
    return(member)
  }
  # Bind the receiver so the method can be called as `obj$method(...)`.
  function(...) member(self, ...)
}
#' @export
`[[.BoundingBox` <- `$.BoundingBox`
#' Metadata for a semantic element
#' @field page_number Page number (1-indexed)
#' @field filename Source filename or document name
#' @field coordinates Bounding box coordinates if available
#' @field element_index Position index in the element sequence
#' @field additional Additional custom metadata
#' @export
ElementMetadata <- new.env(parent = emptyenv())
#' @export
`$.ElementMetadata` <- function(self, name) {
  # Resolve the requested member from the class environment (NULL if absent).
  member <- ElementMetadata[[name]]
  # Anything whose first formal is not `self` is handed back untouched.
  if (!identical(names(formals(member))[1], "self")) {
    return(member)
  }
  # Bind the receiver so the method can be called as `obj$method(...)`.
  function(...) member(self, ...)
}
#' @export
`[[.ElementMetadata` <- `$.ElementMetadata`
#' Semantic element extracted from document
#'
#' Represents a logical unit of content with semantic classification,
#' unique identifier, and metadata for tracking origin and position.
#' @field element_id Unique element identifier
#' @field element_type Semantic type of this element
#' @field text Text content of the element
#' @field metadata Metadata about the element
#' @export
Element <- new.env(parent = emptyenv())
#' @export
`$.Element` <- function(self, name) {
  # Resolve the requested member from the class environment (NULL if absent).
  member <- Element[[name]]
  # Anything whose first formal is not `self` is handed back untouched.
  if (!identical(names(formals(member))[1], "self")) {
    return(member)
  }
  # Bind the receiver so the method can be called as `obj$method(...)`.
  function(...) member(self, ...)
}
#' @export
`[[.Element` <- `$.Element`
#' XML extraction result
#'
#' Contains extracted text content from XML files along with
#' structural statistics about the XML document.
#' @field content Extracted text content (XML structure filtered out)
#' @field element_count Total number of XML elements processed
#' @field unique_elements List of unique element names found (sorted)
#' @export
XmlExtractionResult <- new.env(parent = emptyenv())
#' @export
`$.XmlExtractionResult` <- function(self, name) {
  # Resolve the requested member from the class environment (NULL if absent).
  member <- XmlExtractionResult[[name]]
  # Anything whose first formal is not `self` is handed back untouched.
  if (!identical(names(formals(member))[1], "self")) {
    return(member)
  }
  # Bind the receiver so the method can be called as `obj$method(...)`.
  function(...) member(self, ...)
}
#' @export
`[[.XmlExtractionResult` <- `$.XmlExtractionResult`
#' Email attachment representation
#'
#' Contains metadata and optionally the content of an email attachment.
#' @field name Attachment name (from Content-Disposition header)
#' @field filename Filename of the attachment
#' @field mime_type MIME type of the attachment
#' @field size Size in bytes
#' @field is_image Whether this attachment is an image
#' @field data Attachment data (if extracted). Uses `bytes::Bytes` for cheap cloning of large buffers.
#' @export
EmailAttachment <- new.env(parent = emptyenv())
#' @export
`$.EmailAttachment` <- function(self, name) {
  # Resolve the requested member from the class environment (NULL if absent).
  member <- EmailAttachment[[name]]
  # Anything whose first formal is not `self` is handed back untouched.
  if (!identical(names(formals(member))[1], "self")) {
    return(member)
  }
  # Bind the receiver so the method can be called as `obj$method(...)`.
  function(...) member(self, ...)
}
#' @export
`[[.EmailAttachment` <- `$.EmailAttachment`
#' Bounding box for an OCR-detected table in pixel coordinates
#' @field left Left x-coordinate (pixels)
#' @field top Top y-coordinate (pixels)
#' @field right Right x-coordinate (pixels)
#' @field bottom Bottom y-coordinate (pixels)
#' @export
OcrTableBoundingBox <- new.env(parent = emptyenv())
#' @export
`$.OcrTableBoundingBox` <- function(self, name) {
  # Resolve the requested member from the class environment (NULL if absent).
  member <- OcrTableBoundingBox[[name]]
  # Anything whose first formal is not `self` is handed back untouched.
  if (!identical(names(formals(member))[1], "self")) {
    return(member)
  }
  # Bind the receiver so the method can be called as `obj$method(...)`.
  function(...) member(self, ...)
}
#' @export
`[[.OcrTableBoundingBox` <- `$.OcrTableBoundingBox`
#' Image preprocessing configuration for OCR
#'
#' These settings control how images are preprocessed before OCR to improve
#' text recognition quality. Different preprocessing strategies work better
#' for different document types.
#' @field target_dpi Target DPI for the image (300 is standard, 600 for small text).
#' @field auto_rotate Auto-detect and correct image rotation.
#' @field deskew Correct skew (tilted images).
#' @field denoise Remove noise from the image.
#' @field contrast_enhance Enhance contrast for better text visibility.
#' @field binarization_method Binarization method: "otsu", "sauvola", "adaptive".
#' @field invert_colors Invert colors (white text on black → black on white).
#' @export
ImagePreprocessingConfig <- new.env(parent = emptyenv())
# Zero-argument call into the native routine
# `wrap__ImagePreprocessingConfig__default`.
ImagePreprocessingConfig$default <- function() {
  .Call("wrap__ImagePreprocessingConfig__default", PACKAGE = "kreuzberg")
}
# Forwards `json` to the native routine
# `wrap__ImagePreprocessingConfig__from_json`.
ImagePreprocessingConfig$from_json <- function(json) {
  .Call(
    "wrap__ImagePreprocessingConfig__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.ImagePreprocessingConfig` <- function(self, name) {
  # Resolve the requested member from the class environment (NULL if absent).
  member <- ImagePreprocessingConfig[[name]]
  # Anything whose first formal is not `self` is handed back untouched.
  if (!identical(names(formals(member))[1], "self")) {
    return(member)
  }
  # Bind the receiver so the method can be called as `obj$method(...)`.
  function(...) member(self, ...)
}
#' @export
`[[.ImagePreprocessingConfig` <- `$.ImagePreprocessingConfig`
#' Tesseract OCR configuration
#'
#' Provides fine-grained control over Tesseract OCR engine parameters.
#' Most users can use the defaults, but these settings allow optimization
#' for specific document types (invoices, handwriting, etc.).
#' @field language Language code (e.g., "eng", "deu", "fra")
#' @field psm Page Segmentation Mode (0-13).
#' @field output_format Output format ("text" or "markdown")
#' @field oem OCR Engine Mode (0-3).
#' @field min_confidence Minimum confidence threshold (0.0-100.0).
#' @field preprocessing Image preprocessing configuration.
#' @field enable_table_detection Enable automatic table detection and reconstruction
#' @field table_min_confidence Minimum confidence threshold for table detection (0.0-1.0)
#' @field table_column_threshold Column threshold for table detection (pixels)
#' @field table_row_threshold_ratio Row threshold ratio for table detection (0.0-1.0)
#' @field use_cache Enable OCR result caching
#' @field classify_use_pre_adapted_templates Use pre-adapted templates for character classification
#' @field language_model_ngram_on Enable N-gram language model
#' @field tessedit_dont_blkrej_good_wds Don't reject good words during block-level processing
#' @field tessedit_dont_rowrej_good_wds Don't reject good words during row-level processing
#' @field tessedit_enable_dict_correction Enable dictionary correction
#' @field tessedit_char_whitelist Whitelist of allowed characters (empty = all allowed)
#' @field tessedit_char_blacklist Blacklist of forbidden characters (empty = none forbidden)
#' @field tessedit_use_primary_params_model Use primary language params model
#' @field textord_space_size_is_variable Variable-width space detection
#' @field thresholding_method Use adaptive thresholding method
#' @export
TesseractConfig <- new.env(parent = emptyenv())
# Zero-argument call into the native routine `wrap__TesseractConfig__default`.
TesseractConfig$default <- function() {
  .Call("wrap__TesseractConfig__default", PACKAGE = "kreuzberg")
}
# Forwards `json` to the native routine `wrap__TesseractConfig__from_json`.
TesseractConfig$from_json <- function(json) {
  .Call(
    "wrap__TesseractConfig__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.TesseractConfig` <- function(self, name) {
  # Resolve the requested member from the class environment (NULL if absent).
  member <- TesseractConfig[[name]]
  # Anything whose first formal is not `self` is handed back untouched.
  if (!identical(names(formals(member))[1], "self")) {
    return(member)
  }
  # Bind the receiver so the method can be called as `obj$method(...)`.
  function(...) member(self, ...)
}
#' @export
`[[.TesseractConfig` <- `$.TesseractConfig`
#' Image preprocessing metadata
#'
#' Tracks the transformations applied to an image during OCR preprocessing,
#' including DPI normalization, resizing, and resampling.
#' @field original_dimensions Original image dimensions (width, height) in pixels
#' @field original_dpi Original image DPI (horizontal, vertical)
#' @field target_dpi Target DPI from configuration
#' @field scale_factor Scaling factor applied to the image
#' @field auto_adjusted Whether DPI was auto-adjusted based on content
#' @field final_dpi Final DPI after processing
#' @field new_dimensions New dimensions after resizing (if resized)
#' @field resample_method Resampling algorithm used ("LANCZOS3", "CATMULLROM", etc.)
#' @field dimension_clamped Whether dimensions were clamped to max_image_dimension
#' @field calculated_dpi Calculated optimal DPI (if auto_adjust_dpi enabled)
#' @field skipped_resize Whether resize was skipped (dimensions already optimal)
#' @field resize_error Error message if resize failed
#' @export
ImagePreprocessingMetadata <- new.env(parent = emptyenv())
#' @export
`$.ImagePreprocessingMetadata` <- function(self, name) {
  # Resolve the requested member from the class environment (NULL if absent).
  member <- ImagePreprocessingMetadata[[name]]
  # Anything whose first formal is not `self` is handed back untouched.
  if (!identical(names(formals(member))[1], "self")) {
    return(member)
  }
  # Bind the receiver so the method can be called as `obj$method(...)`.
  function(...) member(self, ...)
}
#' @export
`[[.ImagePreprocessingMetadata` <- `$.ImagePreprocessingMetadata`
#' Extraction result metadata
#'
#' Contains common fields applicable to all formats, format-specific metadata
#' via a discriminated union, and additional custom fields from postprocessors.
#' @field title Document title
#' @field subject Document subject or description
#' @field authors Primary author(s) - always Vec for consistency
#' @field keywords Keywords/tags - always Vec for consistency
#' @field language Primary language (ISO 639 code)
#' @field created_at Creation timestamp (ISO 8601 format)
#' @field modified_at Last modification timestamp (ISO 8601 format)
#' @field created_by User who created the document
#' @field modified_by User who last modified the document
#' @field pages Page/slide/sheet structure with boundaries
#' @field format Format-specific metadata (discriminated union)
#' @field image_preprocessing Image preprocessing metadata (when OCR preprocessing was applied)
#' @field json_schema JSON schema (for structured data extraction)
#' @field error Error metadata (for batch operations)
#' @field extraction_duration_ms Extraction duration in milliseconds (for benchmarking).
#' @field category Document category (from frontmatter or classification).
#' @field tags Document tags (from frontmatter).
#' @field document_version Document version string (from frontmatter).
#' @field abstract_text Abstract or summary text (from frontmatter).
#' @field output_format Output format identifier (e.g., "markdown", "html", "text").
#' @field ocr_used Whether OCR was used during extraction.
#' @field additional Additional custom fields from postprocessors.
#' @export
Metadata <- new.env(parent = emptyenv())
# Instance member: passes the receiver to the native routine
# `wrap__Metadata__is_empty`.
Metadata$is_empty <- function(self) {
  .Call("wrap__Metadata__is_empty", self, PACKAGE = "kreuzberg")
}
# Forwards `json` to the native routine `wrap__Metadata__from_json`.
Metadata$from_json <- function(json) {
  .Call(
    "wrap__Metadata__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.Metadata` <- function(self, name) {
  # Resolve the requested member from the class environment (NULL if absent).
  member <- Metadata[[name]]
  # Anything whose first formal is not `self` is handed back untouched.
  if (!identical(names(formals(member))[1], "self")) {
    return(member)
  }
  # Bind the receiver so the method can be called as `obj$method(...)`.
  function(...) member(self, ...)
}
#' @export
`[[.Metadata` <- `$.Metadata`
#' @export
is_empty.Metadata <- function(x, ...) {
  # Delegate to the instance member resolved through the `$` dispatcher.
  x$is_empty(...)
}
#' Excel/spreadsheet format metadata
#'
#' Identifies the document as a spreadsheet source via the `FormatMetadata::Excel`
#' discriminant. Sheet count and sheet names are stored inside this struct.
#' @field sheet_count Number of sheets in the workbook.
#' @field sheet_names Names of all sheets in the workbook.
#' @export
ExcelMetadata <- new.env(parent = emptyenv())
# Forwards `json` to the native routine `wrap__ExcelMetadata__from_json`.
ExcelMetadata$from_json <- function(json) {
  .Call(
    "wrap__ExcelMetadata__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.ExcelMetadata` <- function(self, name) {
  # Resolve the requested member from the class environment (NULL if absent).
  member <- ExcelMetadata[[name]]
  # Anything whose first formal is not `self` is handed back untouched.
  if (!identical(names(formals(member))[1], "self")) {
    return(member)
  }
  # Bind the receiver so the method can be called as `obj$method(...)`.
  function(...) member(self, ...)
}
#' @export
`[[.ExcelMetadata` <- `$.ExcelMetadata`
#' Email metadata extracted from .eml and .msg files
#'
#' Includes sender/recipient information, message ID, and attachment list.
#' @field from_email Sender's email address
#' @field from_name Sender's display name
#' @field to_emails Primary recipients
#' @field cc_emails CC recipients
#' @field bcc_emails BCC recipients
#' @field message_id Message-ID header value
#' @field attachments List of attachment filenames
#' @export
EmailMetadata <- new.env(parent = emptyenv())
# Forwards `json` to the native routine `wrap__EmailMetadata__from_json`.
EmailMetadata$from_json <- function(json) {
  .Call(
    "wrap__EmailMetadata__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.EmailMetadata` <- function(self, name) {
  # Resolve the requested member from the class environment (NULL if absent).
  member <- EmailMetadata[[name]]
  # Anything whose first formal is not `self` is handed back untouched.
  if (!identical(names(formals(member))[1], "self")) {
    return(member)
  }
  # Bind the receiver so the method can be called as `obj$method(...)`.
  function(...) member(self, ...)
}
#' @export
`[[.EmailMetadata` <- `$.EmailMetadata`
#' Archive (ZIP/TAR/7Z) metadata
#'
#' Extracted from compressed archive files containing file lists and size information.
#' @field format Archive format ("ZIP", "TAR", "7Z", etc.)
#' @field file_count Total number of files in the archive
#' @field file_list List of file paths within the archive
#' @field total_size Total uncompressed size in bytes
#' @field compressed_size Compressed size in bytes (if available)
#' @export
ArchiveMetadata <- new.env(parent = emptyenv())
# Forwards `json` to the native routine `wrap__ArchiveMetadata__from_json`.
ArchiveMetadata$from_json <- function(json) {
  .Call(
    "wrap__ArchiveMetadata__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.ArchiveMetadata` <- function(self, name) {
  # Resolve the requested member from the class environment (NULL if absent).
  member <- ArchiveMetadata[[name]]
  # Anything whose first formal is not `self` is handed back untouched.
  if (!identical(names(formals(member))[1], "self")) {
    return(member)
  }
  # Bind the receiver so the method can be called as `obj$method(...)`.
  function(...) member(self, ...)
}
#' @export
`[[.ArchiveMetadata` <- `$.ArchiveMetadata`
#' Image metadata extracted from image files
#'
#' Includes dimensions, format, and EXIF data.
#' @field width Image width in pixels
#' @field height Image height in pixels
#' @field format Image format (e.g., "PNG", "JPEG", "TIFF")
#' @field exif EXIF metadata tags
#' @export
ImageMetadata <- new.env(parent = emptyenv())
# Forwards `json` to the native routine `wrap__ImageMetadata__from_json`.
ImageMetadata$from_json <- function(json) {
  .Call(
    "wrap__ImageMetadata__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.ImageMetadata` <- function(self, name) {
  # Resolve the requested member from the class environment (NULL if absent).
  member <- ImageMetadata[[name]]
  # Anything whose first formal is not `self` is handed back untouched.
  if (!identical(names(formals(member))[1], "self")) {
    return(member)
  }
  # Bind the receiver so the method can be called as `obj$method(...)`.
  function(...) member(self, ...)
}
#' @export
`[[.ImageMetadata` <- `$.ImageMetadata`
#' XML metadata extracted during XML parsing
#'
#' Provides statistics about XML document structure.
#' @field element_count Total number of XML elements processed
#' @field unique_elements List of unique element tag names (sorted)
#' @export
XmlMetadata <- new.env(parent = emptyenv())
# Forwards `json` to the native routine `wrap__XmlMetadata__from_json`.
XmlMetadata$from_json <- function(json) {
  .Call(
    "wrap__XmlMetadata__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.XmlMetadata` <- function(self, name) {
  # Resolve the requested member from the class environment (NULL if absent).
  member <- XmlMetadata[[name]]
  # Anything whose first formal is not `self` is handed back untouched.
  if (!identical(names(formals(member))[1], "self")) {
    return(member)
  }
  # Bind the receiver so the method can be called as `obj$method(...)`.
  function(...) member(self, ...)
}
#' @export
`[[.XmlMetadata` <- `$.XmlMetadata`
#' Header/heading element metadata
#' @field level Header level: 1 (h1) through 6 (h6)
#' @field text Normalized text content of the header
#' @field id HTML id attribute if present
#' @field depth Document tree depth at the header element
#' @field html_offset Byte offset in original HTML document
#' @export
HeaderMetadata <- new.env(parent = emptyenv())
#' @export
`$.HeaderMetadata` <- function(self, name) {
  # Resolve the requested member from the class environment (NULL if absent).
  member <- HeaderMetadata[[name]]
  # Anything whose first formal is not `self` is handed back untouched.
  if (!identical(names(formals(member))[1], "self")) {
    return(member)
  }
  # Bind the receiver so the method can be called as `obj$method(...)`.
  function(...) member(self, ...)
}
#' @export
`[[.HeaderMetadata` <- `$.HeaderMetadata`
#' Structured data (Schema.org, microdata, RDFa) block
#' @field data_type Type of structured data
#' @field raw_json Raw JSON string representation
#' @field schema_type Schema type if detectable (e.g., "Article", "Event", "Product")
#' @export
StructuredData <- new.env(parent = emptyenv())
#' @export
`$.StructuredData` <- function(self, name) {
  # Resolve the requested member from the class environment (NULL if absent).
  member <- StructuredData[[name]]
  # Anything whose first formal is not `self` is handed back untouched.
  if (!identical(names(formals(member))[1], "self")) {
    return(member)
  }
  # Bind the receiver so the method can be called as `obj$method(...)`.
  function(...) member(self, ...)
}
#' @export
`[[.StructuredData` <- `$.StructuredData`
#' OCR processing metadata
#'
#' Captures information about OCR processing configuration and results.
#' @field language OCR language code(s) used
#' @field psm Tesseract Page Segmentation Mode (PSM)
#' @field output_format Output format (e.g., "text", "hocr")
#' @field table_count Number of tables detected
#' @field table_rows table_rows
#' @field table_cols table_cols
#' @export
OcrMetadata <- new.env(parent = emptyenv())
# Forwards `json` to the native routine `wrap__OcrMetadata__from_json`.
OcrMetadata$from_json <- function(json) {
  .Call(
    "wrap__OcrMetadata__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.OcrMetadata` <- function(self, name) {
  # Resolve the requested member from the class environment (NULL if absent).
  member <- OcrMetadata[[name]]
  # Anything whose first formal is not `self` is handed back untouched.
  if (!identical(names(formals(member))[1], "self")) {
    return(member)
  }
  # Bind the receiver so the method can be called as `obj$method(...)`.
  function(...) member(self, ...)
}
#' @export
`[[.OcrMetadata` <- `$.OcrMetadata`
#' Error metadata (for batch operations)
#' @field error_type error_type
#' @field message message
#' @export
ErrorMetadata <- new.env(parent = emptyenv())
#' @export
`$.ErrorMetadata` <- function(self, name) {
  # Resolve the requested member from the class environment (NULL if absent).
  member <- ErrorMetadata[[name]]
  # Anything whose first formal is not `self` is handed back untouched.
  if (!identical(names(formals(member))[1], "self")) {
    return(member)
  }
  # Bind the receiver so the method can be called as `obj$method(...)`.
  function(...) member(self, ...)
}
#' @export
`[[.ErrorMetadata` <- `$.ErrorMetadata`
#' PowerPoint presentation metadata
#'
#' Extracted from PPTX files containing slide counts and presentation details.
#' @field slide_count Total number of slides in the presentation
#' @field slide_names Names of slides (if available)
#' @field image_count Number of embedded images
#' @field table_count Number of tables
#' @export
PptxMetadata <- new.env(parent = emptyenv())
# Forwards `json` to the native routine `wrap__PptxMetadata__from_json`.
PptxMetadata$from_json <- function(json) {
  .Call(
    "wrap__PptxMetadata__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.PptxMetadata` <- function(self, name) {
  # Resolve the requested member from the class environment (NULL if absent).
  member <- PptxMetadata[[name]]
  # Anything whose first formal is not `self` is handed back untouched.
  if (!identical(names(formals(member))[1], "self")) {
    return(member)
  }
  # Bind the receiver so the method can be called as `obj$method(...)`.
  function(...) member(self, ...)
}
#' @export
`[[.PptxMetadata` <- `$.PptxMetadata`
#' Word document metadata
#'
#' Extracted from DOCX files using shared Office Open XML metadata extraction.
#' Integrates with `office_metadata` module for core/app/custom properties.
#' @field core_properties Core properties from docProps/core.xml (Dublin Core metadata)
#' @field app_properties Application properties from docProps/app.xml (Word-specific statistics)
#' @field custom_properties Custom properties from docProps/custom.xml (user-defined properties)
#' @export
DocxMetadata <- new.env(parent = emptyenv())
# Forwards `json` to the native routine `wrap__DocxMetadata__from_json`.
DocxMetadata$from_json <- function(json) {
  .Call(
    "wrap__DocxMetadata__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.DocxMetadata` <- function(self, name) {
  # Resolve the requested member from the class environment (NULL if absent).
  member <- DocxMetadata[[name]]
  # Anything whose first formal is not `self` is handed back untouched.
  if (!identical(names(formals(member))[1], "self")) {
    return(member)
  }
  # Bind the receiver so the method can be called as `obj$method(...)`.
  function(...) member(self, ...)
}
#' @export
`[[.DocxMetadata` <- `$.DocxMetadata`
#' CSV/TSV file metadata
#' @field row_count row_count
#' @field column_count column_count
#' @field delimiter delimiter
#' @field has_header has_header
#' @field column_types column_types
#' @export
CsvMetadata <- new.env(parent = emptyenv())
# Forwards `json` to the native routine `wrap__CsvMetadata__from_json`.
CsvMetadata$from_json <- function(json) {
  .Call(
    "wrap__CsvMetadata__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.CsvMetadata` <- function(self, name) {
  # Resolve the requested member from the class environment (NULL if absent).
  member <- CsvMetadata[[name]]
  # Anything whose first formal is not `self` is handed back untouched.
  if (!identical(names(formals(member))[1], "self")) {
    return(member)
  }
  # Bind the receiver so the method can be called as `obj$method(...)`.
  function(...) member(self, ...)
}
#' @export
`[[.CsvMetadata` <- `$.CsvMetadata`
#' BibTeX bibliography metadata
#' @field entry_count Number of entries in the bibliography.
#' @field citation_keys citation_keys
#' @field authors authors
#' @field year_range year_range
#' @field entry_types entry_types
#' @export
BibtexMetadata <- new.env(parent = emptyenv())
# Forwards `json` to the native routine `wrap__BibtexMetadata__from_json`.
BibtexMetadata$from_json <- function(json) {
  .Call(
    "wrap__BibtexMetadata__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.BibtexMetadata` <- function(self, name) {
  # Resolve the requested member from the class environment (NULL if absent).
  member <- BibtexMetadata[[name]]
  # Anything whose first formal is not `self` is handed back untouched.
  if (!identical(names(formals(member))[1], "self")) {
    return(member)
  }
  # Bind the receiver so the method can be called as `obj$method(...)`.
  function(...) member(self, ...)
}
#' @export
`[[.BibtexMetadata` <- `$.BibtexMetadata`
#' Citation file metadata (RIS, PubMed, EndNote)
#' @field citation_count citation_count
#' @field format format
#' @field authors authors
#' @field year_range year_range
#' @field dois dois
#' @field keywords keywords
#' @export
CitationMetadata <- new.env(parent = emptyenv())
# Forwards `json` to the native routine `wrap__CitationMetadata__from_json`.
CitationMetadata$from_json <- function(json) {
  .Call(
    "wrap__CitationMetadata__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.CitationMetadata` <- function(self, name) {
  # Resolve the requested member from the class environment (NULL if absent).
  member <- CitationMetadata[[name]]
  # Anything whose first formal is not `self` is handed back untouched.
  if (!identical(names(formals(member))[1], "self")) {
    return(member)
  }
  # Bind the receiver so the method can be called as `obj$method(...)`.
  function(...) member(self, ...)
}
#' @export
`[[.CitationMetadata` <- `$.CitationMetadata`
#' Year range for bibliographic metadata
#' @field min min
#' @field max max
#' @field years years
#' @export
YearRange <- new.env(parent = emptyenv())
#' @export
`$.YearRange` <- function(self, name) {
  # Resolve the member on the class environment (NULL when absent).
  member <- YearRange[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # Not an instance method: return the member untouched.
    return(member)
  }
  # Instance method: pre-bind the receiver so callers omit `self`.
  function(...) member(self, ...)
}
#' @export
`[[.YearRange` <- `$.YearRange`
#' FictionBook (FB2) metadata
#' @field genres genres
#' @field sequences sequences
#' @field annotation annotation
#' @export
FictionBookMetadata <- new.env(parent = emptyenv())
FictionBookMetadata$from_json <- function(json) {
  # Hand `json` straight to the registered native routine.
  .Call(
    "wrap__FictionBookMetadata__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.FictionBookMetadata` <- function(self, name) {
  # Resolve the member on the class environment (NULL when absent).
  member <- FictionBookMetadata[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # Not an instance method: return the member untouched.
    return(member)
  }
  # Instance method: pre-bind the receiver so callers omit `self`.
  function(...) member(self, ...)
}
#' @export
`[[.FictionBookMetadata` <- `$.FictionBookMetadata`
#' DBASE field information
#' @field name name
#' @field field_type field_type
#' @export
DbfFieldInfo <- new.env(parent = emptyenv())
#' @export
`$.DbfFieldInfo` <- function(self, name) {
  # Resolve the member on the class environment (NULL when absent).
  member <- DbfFieldInfo[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # Not an instance method: return the member untouched.
    return(member)
  }
  # Instance method: pre-bind the receiver so callers omit `self`.
  function(...) member(self, ...)
}
#' @export
`[[.DbfFieldInfo` <- `$.DbfFieldInfo`
#' JATS contributor with role
#' @field name name
#' @field role role
#' @export
ContributorRole <- new.env(parent = emptyenv())
#' @export
`$.ContributorRole` <- function(self, name) {
  # Resolve the member on the class environment (NULL when absent).
  member <- ContributorRole[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # Not an instance method: return the member untouched.
    return(member)
  }
  # Instance method: pre-bind the receiver so callers omit `self`.
  function(...) member(self, ...)
}
#' @export
`[[.ContributorRole` <- `$.ContributorRole`
#' EPUB metadata (Dublin Core extensions)
#' @field coverage coverage
#' @field dc_format dc_format
#' @field relation relation
#' @field source source
#' @field dc_type dc_type
#' @field cover_image cover_image
#' @export
EpubMetadata <- new.env(parent = emptyenv())
EpubMetadata$from_json <- function(json) {
  # Hand `json` straight to the registered native routine.
  .Call(
    "wrap__EpubMetadata__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.EpubMetadata` <- function(self, name) {
  # Resolve the member on the class environment (NULL when absent).
  member <- EpubMetadata[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # Not an instance method: return the member untouched.
    return(member)
  }
  # Instance method: pre-bind the receiver so callers omit `self`.
  function(...) member(self, ...)
}
#' @export
`[[.EpubMetadata` <- `$.EpubMetadata`
#' Outlook PST archive metadata
#' @field message_count message_count
#' @export
PstMetadata <- new.env(parent = emptyenv())
PstMetadata$from_json <- function(json) {
  # Hand `json` straight to the registered native routine.
  .Call(
    "wrap__PstMetadata__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.PstMetadata` <- function(self, name) {
  # Resolve the member on the class environment (NULL when absent).
  member <- PstMetadata[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # Not an instance method: return the member untouched.
    return(member)
  }
  # Instance method: pre-bind the receiver so callers omit `self`.
  function(...) member(self, ...)
}
#' @export
`[[.PstMetadata` <- `$.PstMetadata`
#' Confidence scores for an OCR element
#'
#' Separates detection confidence (how confident that text exists at this location)
#' from recognition confidence (how confident about the actual text content).
#' @field detection Detection confidence: how confident the OCR engine is that text exists here.
#' @field recognition Recognition confidence: how confident about the text content.
#' @export
OcrConfidence <- new.env(parent = emptyenv())
OcrConfidence$from_json <- function(json) {
  # Hand `json` straight to the registered native routine.
  .Call(
    "wrap__OcrConfidence__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.OcrConfidence` <- function(self, name) {
  # Resolve the member on the class environment (NULL when absent).
  member <- OcrConfidence[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # Not an instance method: return the member untouched.
    return(member)
  }
  # Instance method: pre-bind the receiver so callers omit `self`.
  function(...) member(self, ...)
}
#' @export
`[[.OcrConfidence` <- `$.OcrConfidence`
#' Rotation information for an OCR element
#' @field angle_degrees Rotation angle in degrees (0, 90, 180, 270 for PaddleOCR).
#' @field confidence Confidence score for the rotation detection.
#' @export
OcrRotation <- new.env(parent = emptyenv())
#' @export
`$.OcrRotation` <- function(self, name) {
  # Resolve the member on the class environment (NULL when absent).
  member <- OcrRotation[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # Not an instance method: return the member untouched.
    return(member)
  }
  # Instance method: pre-bind the receiver so callers omit `self`.
  function(...) member(self, ...)
}
#' @export
`[[.OcrRotation` <- `$.OcrRotation`
#' A unified OCR element representing detected text with full metadata
#'
#' This is the primary type for structured OCR output, preserving all information
#' from both Tesseract and PaddleOCR backends.
#' @field text The recognized text content.
#' @field geometry Bounding geometry (rectangle or quadrilateral).
#' @field confidence Confidence scores for detection and recognition.
#' @field level Hierarchical level (word, line, block, page).
#' @field rotation Rotation information (if detected).
#' @field page_number Page number (1-indexed).
#' @field parent_id Parent element ID for hierarchical relationships.
#' @field backend_metadata Backend-specific metadata that doesn't fit the unified schema.
#' @export
OcrElement <- new.env(parent = emptyenv())
OcrElement$from_json <- function(json) {
  # Hand `json` straight to the registered native routine.
  .Call(
    "wrap__OcrElement__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.OcrElement` <- function(self, name) {
  # Resolve the member on the class environment (NULL when absent).
  member <- OcrElement[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # Not an instance method: return the member untouched.
    return(member)
  }
  # Instance method: pre-bind the receiver so callers omit `self`.
  function(...) member(self, ...)
}
#' @export
`[[.OcrElement` <- `$.OcrElement`
#' Configuration for OCR element extraction
#'
#' Controls how OCR elements are extracted and filtered.
#' @field include_elements Whether to include OCR elements in the extraction result.
#' @field min_level Minimum hierarchical level to include.
#' @field min_confidence Minimum recognition confidence threshold (0.0-1.0).
#' @field build_hierarchy Whether to build hierarchical relationships between elements.
#' @export
OcrElementConfig <- new.env(parent = emptyenv())
OcrElementConfig$from_json <- function(json) {
  # Hand `json` straight to the registered native routine.
  .Call(
    "wrap__OcrElementConfig__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.OcrElementConfig` <- function(self, name) {
  # Resolve the member on the class environment (NULL when absent).
  member <- OcrElementConfig[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # Not an instance method: return the member untouched.
    return(member)
  }
  # Instance method: pre-bind the receiver so callers omit `self`.
  function(...) member(self, ...)
}
#' @export
`[[.OcrElementConfig` <- `$.OcrElementConfig`
#' Byte offset boundary for a page
#'
#' Tracks where a specific page's content starts and ends in the main content string,
#' enabling mapping from byte positions to page numbers. Offsets are guaranteed to be
#' at valid UTF-8 character boundaries when using standard String methods (push_str, push, etc.).
#' @field byte_start Byte offset where this page starts in the content string (UTF-8 valid boundary, inclusive)
#' @field byte_end Byte offset where this page ends in the content string (UTF-8 valid boundary, exclusive)
#' @field page_number Page number (1-indexed)
#' @export
PageBoundary <- new.env(parent = emptyenv())
#' @export
`$.PageBoundary` <- function(self, name) {
  # Resolve the member on the class environment (NULL when absent).
  member <- PageBoundary[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # Not an instance method: return the member untouched.
    return(member)
  }
  # Instance method: pre-bind the receiver so callers omit `self`.
  function(...) member(self, ...)
}
#' @export
`[[.PageBoundary` <- `$.PageBoundary`
#' Metadata for individual page/slide/sheet
#'
#' Captures per-page information including dimensions, content counts,
#' and visibility state (for presentations).
#' @field number Page number (1-indexed)
#' @field title Page title (usually for presentations)
#' @field dimensions Dimensions in points (PDF) or pixels (images): (width, height)
#' @field image_count Number of images on this page
#' @field table_count Number of tables on this page
#' @field hidden Whether this page is hidden (e.g., in presentations)
#' @field is_blank Whether this page is blank (no meaningful text, no images, no tables)
#' @field has_vector_graphics Whether this page contains non-trivial vector graphics (paths, shapes, curves)
#' @export
PageInfo <- new.env(parent = emptyenv())
#' @export
`$.PageInfo` <- function(self, name) {
  # Resolve the member on the class environment (NULL when absent).
  member <- PageInfo[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # Not an instance method: return the member untouched.
    return(member)
  }
  # Instance method: pre-bind the receiver so callers omit `self`.
  function(...) member(self, ...)
}
#' @export
`[[.PageInfo` <- `$.PageInfo`
#' A detected layout region on a page
#'
#' When layout detection is enabled, each page may have layout regions
#' identifying different content types (text, pictures, tables, etc.)
#' with confidence scores and spatial positions.
#' @field class_name Layout class name (e.g. "picture", "table", "text", "section_header").
#' @field confidence Confidence score from the layout detection model (0.0 to 1.0).
#' @field bounding_box Bounding box in document coordinate space.
#' @field area_fraction Fraction of the page area covered by this region (0.0 to 1.0).
#' @export
LayoutRegion <- new.env(parent = emptyenv())
LayoutRegion$from_json <- function(json) {
  # Hand `json` straight to the registered native routine.
  .Call(
    "wrap__LayoutRegion__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.LayoutRegion` <- function(self, name) {
  # Resolve the member on the class environment (NULL when absent).
  member <- LayoutRegion[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # Not an instance method: return the member untouched.
    return(member)
  }
  # Instance method: pre-bind the receiver so callers omit `self`.
  function(...) member(self, ...)
}
#' @export
`[[.LayoutRegion` <- `$.LayoutRegion`
#' A text block with hierarchy level assignment
#'
#' Represents a block of text with semantic heading information extracted from
#' font size clustering and hierarchical analysis.
#' @field text The text content of this block
#' @field font_size The font size of the text in this block
#' @field level The hierarchy level of this block (H1-H6 or Body)
#' @field bbox Bounding box information for the block
#' @export
HierarchicalBlock <- new.env(parent = emptyenv())
#' @export
`$.HierarchicalBlock` <- function(self, name) {
  # Resolve the member on the class environment (NULL when absent).
  member <- HierarchicalBlock[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # Not an instance method: return the member untouched.
    return(member)
  }
  # Instance method: pre-bind the receiver so callers omit `self`.
  function(...) member(self, ...)
}
#' @export
`[[.HierarchicalBlock` <- `$.HierarchicalBlock`
#' A single changed cell within a table
#'
#' Defined here (rather than only in `crate::diff`) so `RevisionDelta` can
#' reference it unconditionally, without requiring the `diff` Cargo feature.
#' `crate::diff` re-exports this type verbatim.
#' @field row Zero-based row index.
#' @field col Zero-based column index.
#' @field from Value before the change.
#' @field to Value after the change.
#' @export
CellChange <- new.env(parent = emptyenv())
#' @export
`$.CellChange` <- function(self, name) {
  # Resolve the member on the class environment (NULL when absent).
  member <- CellChange[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # Not an instance method: return the member untouched.
    return(member)
  }
  # Instance method: pre-bind the receiver so callers omit `self`.
  function(...) member(self, ...)
}
#' @export
`[[.CellChange` <- `$.CellChange`
#' A single tracked change embedded in a document
#'
#' Populated by per-format extractors that understand change-tracking metadata
#' (DOCX `w:ins`/`w:del`/`w:rPrChange`, ODT `text:change-*`, …). Every
#' extractor defaults to `ExtractionResult.revisions = None` until a
#' format-specific implementation is added.
#' @field revision_id Format-specific revision identifier.
#' @field author Display name of the author who made this change, when available.
#' @field timestamp ISO-8601 timestamp of the change, when available.
#' @field kind Semantic kind of this revision.
#' @field anchor Best-effort document location for this revision.
#' @field delta The content changes that make up this revision.
#' @export
DocumentRevision <- new.env(parent = emptyenv())
#' @export
`$.DocumentRevision` <- function(self, name) {
  # Resolve the member on the class environment (NULL when absent).
  member <- DocumentRevision[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # Not an instance method: return the member untouched.
    return(member)
  }
  # Instance method: pre-bind the receiver so callers omit `self`.
  function(...) member(self, ...)
}
#' @export
`[[.DocumentRevision` <- `$.DocumentRevision`
#' Individual table cell with content and optional styling
#'
#' Future extension point for rich table support with cell-level metadata.
#' @field content Cell content as text
#' @field row_span Row span (number of rows this cell spans)
#' @field col_span Column span (number of columns this cell spans)
#' @field is_header Whether this is a header cell
#' @export
TableCell <- new.env(parent = emptyenv())
#' @export
`$.TableCell` <- function(self, name) {
  # Resolve the member on the class environment (NULL when absent).
  member <- TableCell[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # Not an instance method: return the member untouched.
    return(member)
  }
  # Instance method: pre-bind the receiver so callers omit `self`.
  function(...) member(self, ...)
}
#' @export
`[[.TableCell` <- `$.TableCell`
#' A URI extracted from a document
#'
#' Represents any link, reference, or resource pointer found during extraction.
#' The `kind` field classifies the URI semantically, while `label` carries
#' optional human-readable display text.
#' @field url The URL or path string.
#' @field label Optional display text / label for the link.
#' @field page Optional page number where the URI was found (1-indexed).
#' @field kind Semantic classification of the URI.
#' @export
ExtractedUri <- new.env(parent = emptyenv())
#' @export
`$.ExtractedUri` <- function(self, name) {
  # Resolve the member on the class environment (NULL when absent).
  member <- ExtractedUri[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # Not an instance method: return the member untouched.
    return(member)
  }
  # Instance method: pre-bind the receiver so callers omit `self`.
  function(...) member(self, ...)
}
#' @export
`[[.ExtractedUri` <- `$.ExtractedUri`
#' MIME type detection response
#' @field mime_type Detected MIME type
#' @field filename Original filename (if provided)
#' @export
DetectResponse <- new.env(parent = emptyenv())
#' @export
`$.DetectResponse` <- function(self, name) {
  # Resolve the member on the class environment (NULL when absent).
  member <- DetectResponse[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # Not an instance method: return the member untouched.
    return(member)
  }
  # Instance method: pre-bind the receiver so callers omit `self`.
  function(...) member(self, ...)
}
#' @export
`[[.DetectResponse` <- `$.DetectResponse`
#' Options controlling how two `ExtractionResult` values are compared
#' @field include_metadata Include metadata changes in the diff. Default: `true`.
#' @field include_embedded Include embedded-children changes in the diff. Default: `true`.
#' @field max_content_chars Truncate content to this many characters before diffing.
#' @export
DiffOptions <- new.env(parent = emptyenv())
DiffOptions$default <- function() {
  # No arguments: the native routine supplies the value.
  .Call("wrap__DiffOptions__default", PACKAGE = "kreuzberg")
}
DiffOptions$from_json <- function(json) {
  # Hand `json` straight to the registered native routine.
  .Call(
    "wrap__DiffOptions__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.DiffOptions` <- function(self, name) {
  # Resolve the member on the class environment (NULL when absent).
  member <- DiffOptions[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # Not an instance method: return the member untouched.
    return(member)
  }
  # Instance method: pre-bind the receiver so callers omit `self`.
  function(...) member(self, ...)
}
#' @export
`[[.DiffOptions` <- `$.DiffOptions`
#' A single contiguous hunk in a unified diff
#' @field from_line Starting line number in the old content (0-indexed).
#' @field from_count Number of lines from the old content in this hunk.
#' @field to_line Starting line number in the new content (0-indexed).
#' @field to_count Number of lines from the new content in this hunk.
#' @field lines Lines that make up this hunk.
#' @export
DiffHunk <- new.env(parent = emptyenv())
#' @export
`$.DiffHunk` <- function(self, name) {
  # Resolve the member on the class environment (NULL when absent).
  member <- DiffHunk[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # Not an instance method: return the member untouched.
    return(member)
  }
  # Instance method: pre-bind the receiver so callers omit `self`.
  function(...) member(self, ...)
}
#' @export
`[[.DiffHunk` <- `$.DiffHunk`
#' Diff for a single embedded archive entry that appears in both results
#' @field path Archive-relative path identifying this entry.
#' @field diff The recursive diff of the entry's extraction result.
#' @export
EmbeddedDiff <- new.env(parent = emptyenv())
#' @export
`$.EmbeddedDiff` <- function(self, name) {
  # Resolve the member on the class environment (NULL when absent).
  member <- EmbeddedDiff[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # Not an instance method: return the member untouched.
    return(member)
  }
  # Instance method: pre-bind the receiver so callers omit `self`.
  function(...) member(self, ...)
}
#' @export
`[[.EmbeddedDiff` <- `$.EmbeddedDiff`
#' Preset configurations for common RAG use cases
#'
#' Each preset combines chunk size, overlap, and embedding model
#' to provide an optimized configuration for specific scenarios.
#'
#' All string fields are owned `String` for FFI compatibility — instances
#' are safe to clone and pass across language boundaries.
#' @field name name
#' @field chunk_size chunk_size
#' @field overlap overlap
#' @field model_repo HuggingFace repository name for the model.
#' @field pooling Pooling strategy: "cls" or "mean".
#' @field model_file Path to the ONNX model file within the repo.
#' @field dimensions dimensions
#' @field description description
#' @export
EmbeddingPreset <- new.env(parent = emptyenv())
#' @export
`$.EmbeddingPreset` <- function(self, name) {
  # Resolve the member on the class environment (NULL when absent).
  member <- EmbeddingPreset[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # Not an instance method: return the member untouched.
    return(member)
  }
  # Instance method: pre-bind the receiver so callers omit `self`.
  function(...) member(self, ...)
}
#' @export
`[[.EmbeddingPreset` <- `$.EmbeddingPreset`
#' YAKE-specific parameters
#' @field window_size Window size for co-occurrence analysis (default: 2).
#' @export
YakeParams <- new.env(parent = emptyenv())
YakeParams$default <- function() {
  # No arguments: the native routine supplies the value.
  .Call("wrap__YakeParams__default", PACKAGE = "kreuzberg")
}
YakeParams$from_json <- function(json) {
  # Hand `json` straight to the registered native routine.
  .Call(
    "wrap__YakeParams__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.YakeParams` <- function(self, name) {
  # Resolve the member on the class environment (NULL when absent).
  member <- YakeParams[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # Not an instance method: return the member untouched.
    return(member)
  }
  # Instance method: pre-bind the receiver so callers omit `self`.
  function(...) member(self, ...)
}
#' @export
`[[.YakeParams` <- `$.YakeParams`
#' RAKE-specific parameters
#' @field min_word_length Minimum word length to consider (default: 1).
#' @field max_words_per_phrase Maximum words in a keyword phrase (default: 3).
#' @export
RakeParams <- new.env(parent = emptyenv())
RakeParams$default <- function() {
  # No arguments: the native routine supplies the value.
  .Call("wrap__RakeParams__default", PACKAGE = "kreuzberg")
}
RakeParams$from_json <- function(json) {
  # Hand `json` straight to the registered native routine.
  .Call(
    "wrap__RakeParams__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.RakeParams` <- function(self, name) {
  # Resolve the member on the class environment (NULL when absent).
  member <- RakeParams[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # Not an instance method: return the member untouched.
    return(member)
  }
  # Instance method: pre-bind the receiver so callers omit `self`.
  function(...) member(self, ...)
}
#' @export
`[[.RakeParams` <- `$.RakeParams`
#' Keyword extraction configuration
#' @field algorithm Algorithm to use for extraction.
#' @field max_keywords Maximum number of keywords to extract (default: 10).
#' @field min_score Minimum score threshold (0.0-1.0, default: 0.0).
#' @field ngram_range N-gram range for keyword extraction (min, max).
#' @field language Language code for stopword filtering (e.g., "en", "de", "fr").
#' @field yake_params YAKE-specific tuning parameters.
#' @field rake_params RAKE-specific tuning parameters.
#' @export
KeywordConfig <- new.env(parent = emptyenv())
KeywordConfig$default <- function() {
  # No arguments: the native routine supplies the value.
  .Call("wrap__KeywordConfig__default", PACKAGE = "kreuzberg")
}
KeywordConfig$from_json <- function(json) {
  # Hand `json` straight to the registered native routine.
  .Call(
    "wrap__KeywordConfig__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.KeywordConfig` <- function(self, name) {
  # Resolve the member on the class environment (NULL when absent).
  member <- KeywordConfig[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # Not an instance method: return the member untouched.
    return(member)
  }
  # Instance method: pre-bind the receiver so callers omit `self`.
  function(...) member(self, ...)
}
#' @export
`[[.KeywordConfig` <- `$.KeywordConfig`
#' Extracted keyword with metadata
#' @field text The keyword text.
#' @field score Relevance score (higher is better, algorithm-specific range).
#' @field algorithm Algorithm that extracted this keyword.
#' @field positions Optional positions where keyword appears in text (character offsets).
#' @export
Keyword <- new.env(parent = emptyenv())
#' @export
`$.Keyword` <- function(self, name) {
  # Resolve the member on the class environment (NULL when absent).
  member <- Keyword[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # Not an instance method: return the member untouched.
    return(member)
  }
  # Instance method: pre-bind the receiver so callers omit `self`.
  function(...) member(self, ...)
}
#' @export
`[[.Keyword` <- `$.Keyword`
#' Configuration for PaddleOCR backend
#'
#' Configures PaddleOCR text detection and recognition with multi-language support.
#' Uses a builder pattern for convenient configuration.
#' @field language Language code (e.g., "en", "ch", "jpn", "kor", "deu", "fra")
#' @field cache_dir Optional custom cache directory for model files
#' @field use_angle_cls Enable angle classification for rotated text (default: false). Can misfire on short text.
#' @field enable_table_detection Enable table structure detection (default: false)
#' @field det_db_thresh Database threshold for text detection (default: 0.3). Range: 0.0-1.0.
#' @field det_db_box_thresh Box threshold for text bounding box refinement (default: 0.5). Range: 0.0-1.0.
#' @field det_db_unclip_ratio Unclip ratio for expanding text bounding boxes (default: 1.6).
#' @field det_limit_side_len Maximum side length for detection image (default: 960).
#' @field rec_batch_num Batch size for recognition inference (default: 6): number of text regions to process.
#' @field padding Padding in pixels added around the image before detection (default: 10).
#' @field drop_score Minimum recognition confidence score for text lines (default: 0.5).
#' @field model_tier Model tier controlling detection/recognition model size and accuracy trade-off (e.g. `"mobile"`).
#' @export
PaddleOcrConfig <- new.env(parent = emptyenv())
# Builder methods: each forwards the receiver plus one argument to the
# native routine of the same name.
PaddleOcrConfig$with_cache_dir <- function(self, path) {
  .Call("wrap__PaddleOcrConfig__with_cache_dir", self, path, PACKAGE = "kreuzberg")
}
PaddleOcrConfig$with_table_detection <- function(self, enable) {
  .Call("wrap__PaddleOcrConfig__with_table_detection", self, enable, PACKAGE = "kreuzberg")
}
PaddleOcrConfig$with_angle_cls <- function(self, enable) {
  .Call("wrap__PaddleOcrConfig__with_angle_cls", self, enable, PACKAGE = "kreuzberg")
}
PaddleOcrConfig$with_det_db_thresh <- function(self, threshold) {
  .Call("wrap__PaddleOcrConfig__with_det_db_thresh", self, threshold, PACKAGE = "kreuzberg")
}
PaddleOcrConfig$with_det_db_box_thresh <- function(self, threshold) {
  .Call("wrap__PaddleOcrConfig__with_det_db_box_thresh", self, threshold, PACKAGE = "kreuzberg")
}
PaddleOcrConfig$with_det_db_unclip_ratio <- function(self, ratio) {
  .Call("wrap__PaddleOcrConfig__with_det_db_unclip_ratio", self, ratio, PACKAGE = "kreuzberg")
}
PaddleOcrConfig$with_det_limit_side_len <- function(self, length) {
  .Call("wrap__PaddleOcrConfig__with_det_limit_side_len", self, length, PACKAGE = "kreuzberg")
}
PaddleOcrConfig$with_rec_batch_num <- function(self, batch_size) {
  .Call("wrap__PaddleOcrConfig__with_rec_batch_num", self, batch_size, PACKAGE = "kreuzberg")
}
PaddleOcrConfig$with_drop_score <- function(self, score) {
  .Call("wrap__PaddleOcrConfig__with_drop_score", self, score, PACKAGE = "kreuzberg")
}
PaddleOcrConfig$with_padding <- function(self, padding) {
  .Call("wrap__PaddleOcrConfig__with_padding", self, padding, PACKAGE = "kreuzberg")
}
PaddleOcrConfig$with_model_tier <- function(self, tier) {
  .Call("wrap__PaddleOcrConfig__with_model_tier", self, tier, PACKAGE = "kreuzberg")
}
PaddleOcrConfig$default <- function() {
  # No arguments: the native routine supplies the value.
  .Call("wrap__PaddleOcrConfig__default", PACKAGE = "kreuzberg")
}
PaddleOcrConfig$from_json <- function(json) {
  # Hand `json` straight to the registered native routine.
  .Call(
    "wrap__PaddleOcrConfig__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.PaddleOcrConfig` <- function(self, name) {
  # Resolve the member on the class environment (NULL when absent).
  member <- PaddleOcrConfig[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # Not an instance method: return the member untouched.
    return(member)
  }
  # Instance method: pre-bind the receiver so callers omit `self`.
  function(...) member(self, ...)
}
#' @export
`[[.PaddleOcrConfig` <- `$.PaddleOcrConfig`
#' @export
with_cache_dir.PaddleOcrConfig <- function(x, ...) {
  x$with_cache_dir(...)
}
#' @export
with_table_detection.PaddleOcrConfig <- function(x, ...) {
  x$with_table_detection(...)
}
#' @export
with_angle_cls.PaddleOcrConfig <- function(x, ...) {
  x$with_angle_cls(...)
}
#' @export
with_det_db_thresh.PaddleOcrConfig <- function(x, ...) {
  x$with_det_db_thresh(...)
}
#' @export
with_det_db_box_thresh.PaddleOcrConfig <- function(x, ...) {
  x$with_det_db_box_thresh(...)
}
#' @export
with_det_db_unclip_ratio.PaddleOcrConfig <- function(x, ...) {
  x$with_det_db_unclip_ratio(...)
}
#' @export
with_det_limit_side_len.PaddleOcrConfig <- function(x, ...) {
  x$with_det_limit_side_len(...)
}
#' @export
with_rec_batch_num.PaddleOcrConfig <- function(x, ...) {
  x$with_rec_batch_num(...)
}
#' @export
with_drop_score.PaddleOcrConfig <- function(x, ...) {
  x$with_drop_score(...)
}
#' @export
with_padding.PaddleOcrConfig <- function(x, ...) {
  x$with_padding(...)
}
#' @export
with_model_tier.PaddleOcrConfig <- function(x, ...) {
  x$with_model_tier(...)
}
#' Combined paths to all models needed for OCR (backward compatibility)
#' @field det_model Path to the detection model directory.
#' @field cls_model Path to the classification model directory.
#' @field rec_model Path to the recognition model directory.
#' @field dict_file Path to the character dictionary file.
#' @export
ModelPaths <- new.env(parent = emptyenv())
#' @export
`$.ModelPaths` <- function(self, name) {
  # Resolve the member on the class environment (NULL when absent).
  member <- ModelPaths[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # Not an instance method: return the member untouched.
    return(member)
  }
  # Instance method: pre-bind the receiver so callers omit `self`.
  function(...) member(self, ...)
}
#' @export
`[[.ModelPaths` <- `$.ModelPaths`
#' Document orientation detection result
#' @field degrees Detected orientation in degrees (0, 90, 180, or 270).
#' @field confidence Confidence score (0.0-1.0).
#' @export
OrientationResult <- new.env(parent = emptyenv())
#' @export
`$.OrientationResult` <- function(self, name) {
  # Resolve the member on the class environment (NULL when absent).
  member <- OrientationResult[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # Not an instance method: return the member untouched.
    return(member)
  }
  # Instance method: pre-bind the receiver so callers omit `self`.
  function(...) member(self, ...)
}
#' @export
`[[.OrientationResult` <- `$.OrientationResult`
#' Bounding box in original image coordinates (x1, y1) top-left, (x2, y2) bottom-right
#' @field x1 x1
#' @field y1 y1
#' @field x2 x2
#' @field y2 y2
#' @export
BBox <- new.env(parent = emptyenv())
#' @export
`$.BBox` <- function(self, name) {
  # Resolve the member on the class environment (NULL when absent).
  member <- BBox[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # Not an instance method: return the member untouched.
    return(member)
  }
  # Instance method: pre-bind the receiver so callers omit `self`.
  function(...) member(self, ...)
}
#' @export
`[[.BBox` <- `$.BBox`
#' A single layout detection result
#' @field class_name class_name
#' @field confidence confidence
#' @field bbox bbox
#' @export
LayoutDetection <- new.env(parent = emptyenv())
#' @export
`$.LayoutDetection` <- function(self, name) {
  # Resolve the member on the class environment (NULL when absent).
  member <- LayoutDetection[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # Not an instance method: return the member untouched.
    return(member)
  }
  # Instance method: pre-bind the receiver so callers omit `self`.
  function(...) member(self, ...)
}
#' @export
`[[.LayoutDetection` <- `$.LayoutDetection`
#' Embedded file descriptor extracted from the PDF name tree
#' @field name The filename as stored in the PDF name tree.
#' @field data Raw file bytes from the embedded stream (already decompressed by lopdf).
#' @field compressed_size Compressed byte count of the original stream (before decompression).
#' @field mime_type MIME type if specified in the filespec, otherwise `None`.
#' @export
EmbeddedFile <- new.env(parent = emptyenv())
#' @export
`$.EmbeddedFile` <- function(self, name) {
  # Resolve the member on the class environment (NULL when absent).
  member <- EmbeddedFile[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # Not an instance method: return the member untouched.
    return(member)
  }
  # Instance method: pre-bind the receiver so callers omit `self`.
  function(...) member(self, ...)
}
#' @export
`[[.EmbeddedFile` <- `$.EmbeddedFile`
#' PDF-specific metadata
#'
#' Contains metadata fields specific to PDF documents that are not in the common
#' `Metadata` structure. Common fields like title, authors, keywords, and dates
#' are at the `Metadata` level.
#' @field pdf_version PDF version (e.g., "1.7", "2.0")
#' @field producer PDF producer (application that created the PDF)
#' @field is_encrypted Whether the PDF is encrypted/password-protected
#' @field width First page width in points (1/72 inch)
#' @field height First page height in points (1/72 inch)
#' @field page_count Total number of pages in the PDF document
#' @export
PdfMetadata <- new.env(parent = emptyenv())
PdfMetadata$from_json <- function(json) {
  # Hand `json` straight to the registered native routine.
  .Call(
    "wrap__PdfMetadata__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}
#' @export
`$.PdfMetadata` <- function(self, name) {
  # Resolve the member on the class environment (NULL when absent).
  member <- PdfMetadata[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # Not an instance method: return the member untouched.
    return(member)
  }
  # Instance method: pre-bind the receiver so callers omit `self`.
  function(...) member(self, ...)
}
#' @export
`[[.PdfMetadata` <- `$.PdfMetadata`
#' Output format for extraction results
#'
#' Controls the format of the `content` field in `ExtractionResult`.
#' When set to `Markdown`, `Djot`, or `Html`, the output uses that format.
#' `Plain` returns the raw extracted text.
#' `Structured` returns JSON with full OCR element data including bounding
#' boxes and confidence scores.
#' @field Plain Plain text content only (default)
#' @field Markdown Markdown format
#' @field Djot Djot markup format
#' @field Html HTML format
#' @field Json JSON tree format with heading-driven sections.
#' @field Structured Structured JSON format with full OCR element metadata.
#' @field Custom Custom renderer registered via the RendererRegistry. The string is the renderer name (e.g., "docx").
#' @export
OutputFormat <- new.env(parent = emptyenv())
#' @export
`$.OutputFormat` <- function(self, name) {
  # Resolve the member on the class environment (NULL when absent).
  member <- OutputFormat[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # Not an instance method: return the member untouched.
    return(member)
  }
  # Instance method: pre-bind the receiver so callers omit `self`.
  function(...) member(self, ...)
}
#' @export
`[[.OutputFormat` <- `$.OutputFormat`
#' Format-specific metadata (discriminated union)
#'
#' Only one format type can exist per extraction result. This provides
#' type-safe, clean metadata without nested optionals.
#' @field Pdf Pdf
#' @field Docx Docx
#' @field Excel Excel
#' @field Email Email
#' @field Pptx Pptx
#' @field Archive Archive
#' @field Image Image
#' @field Xml Xml
#' @field Text Text
#' @field Html Html
#' @field Ocr Ocr
#' @field Csv Csv
#' @field Bibtex Bibtex
#' @field Citation Citation
#' @field FictionBook FictionBook
#' @field Dbf Dbf
#' @field Jats Jats
#' @field Epub Epub
#' @field Pst Pst
#' @field Code Code
#' @export
FormatMetadata <- new.env(parent = emptyenv())
#' @export
`$.FormatMetadata` <- function(self, name) {
  # Resolve the member on the class environment (NULL when absent).
  member <- FormatMetadata[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # Not an instance method: return the member untouched.
    return(member)
  }
  # Instance method: pre-bind the receiver so callers omit `self`.
  function(...) member(self, ...)
}
#' @export
`[[.FormatMetadata` <- `$.FormatMetadata`
#' A single line in a unified-diff hunk
#'
#' Defined here (rather than only in `crate::diff`) so `RevisionDelta` can
#' reference it unconditionally, without requiring the `diff` Cargo feature.
#' `crate::diff` re-exports this type verbatim.
#' @field Context Unchanged context line.
#' @field Added Line added in the "after" version.
#' @field Removed Line removed from the "before" version.
#' @export
DiffLine <- new.env(parent = emptyenv())
#' @export
`$.DiffLine` <- function(self, name) {
  # Resolve the member on the class environment (NULL when absent).
  member <- DiffLine[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # Not an instance method: return the member untouched.
    return(member)
  }
  # Instance method: pre-bind the receiver so callers omit `self`.
  function(...) member(self, ...)
}
#' @export
`[[.DiffLine` <- `$.DiffLine`
#' Create an ExecutionProviderType enum value
#'
#' Returns the default ExecutionProviderType variant.
#'
#' @return An ExecutionProviderType enum value: an empty list carrying the
#'   S3 class `"ExecutionProviderType"`.
#' @export
ExecutionProviderType <- function() {
  structure(list(), class = "ExecutionProviderType")
}
#' Create a HtmlTheme enum value
#'
#' Returns the default HtmlTheme variant.
#'
#' @return A HtmlTheme enum value: an empty list carrying the S3 class
#'   `"HtmlTheme"`.
#' @export
HtmlTheme <- function() {
  structure(list(), class = "HtmlTheme")
}
#' Create a TableModel enum value
#'
#' Returns the default TableModel variant.
#'
#' @return A TableModel enum value: an empty list carrying the S3 class
#'   `"TableModel"`.
#' @export
TableModel <- function() {
  structure(list(), class = "TableModel")
}
#' Create a ChunkerType enum value
#'
#' Returns the default ChunkerType variant.
#'
#' @return A ChunkerType enum value: an empty list carrying the S3 class
#'   `"ChunkerType"`.
#' @export
ChunkerType <- function() {
  structure(list(), class = "ChunkerType")
}
#' Create a CodeContentMode enum value
#'
#' Returns the default CodeContentMode variant.
#'
#' @return A CodeContentMode enum value: an empty list carrying the S3 class
#'   `"CodeContentMode"`.
#' @export
CodeContentMode <- function() {
  structure(list(), class = "CodeContentMode")
}
#' Create a ListType enum value
#'
#' Returns the default ListType variant.
#'
#' @return A ListType enum value: an empty list carrying the S3 class
#'   `"ListType"`.
#' @export
ListType <- function() {
  structure(list(), class = "ListType")
}
#' Create an OcrBackendType enum value
#'
#' Returns the default OcrBackendType variant.
#'
#' @return An OcrBackendType enum value: an empty list carrying the S3 class
#'   `"OcrBackendType"`.
#' @export
OcrBackendType <- function() {
  structure(list(), class = "OcrBackendType")
}
#' Create a ProcessingStage enum value
#'
#' Returns the default ProcessingStage variant.
#'
#' @return A ProcessingStage enum value: an empty list carrying the S3 class
#'   `"ProcessingStage"`.
#' @export
ProcessingStage <- function() {
  structure(list(), class = "ProcessingStage")
}
#' Create a ReductionLevel enum value
#'
#' Returns the default ReductionLevel variant.
#'
#' @return A ReductionLevel enum value: an empty list carrying the S3 class
#'   `"ReductionLevel"`.
#' @export
ReductionLevel <- function() {
  structure(list(), class = "ReductionLevel")
}
#' Create a PdfAnnotationType enum value
#'
#' Returns the default PdfAnnotationType variant.
#'
#' @return A PdfAnnotationType enum value: an empty list carrying the S3
#'   class `"PdfAnnotationType"`.
#' @export
PdfAnnotationType <- function() {
  structure(list(), class = "PdfAnnotationType")
}
#' Create a BlockType enum value
#'
#' Returns the default BlockType variant.
#'
#' @return A BlockType enum value: an empty list carrying the S3 class
#'   `"BlockType"`.
#' @export
BlockType <- function() {
  structure(list(), class = "BlockType")
}
#' Create an InlineType enum value
#'
#' Returns the default InlineType variant.
#'
#' @return An InlineType enum value: an empty list carrying the S3 class
#'   `"InlineType"`.
#' @export
InlineType <- function() {
  structure(list(), class = "InlineType")
}
#' Create a RelationshipKind enum value
#'
#' Returns the default RelationshipKind variant.
#'
#' @return A RelationshipKind enum value: an empty list carrying the S3
#'   class `"RelationshipKind"`.
#' @export
RelationshipKind <- function() {
  structure(list(), class = "RelationshipKind")
}
#' Create a ContentLayer enum value
#'
#' Returns the default ContentLayer variant.
#'
#' @return A ContentLayer enum value: an empty list carrying the S3 class
#'   `"ContentLayer"`.
#' @export
ContentLayer <- function() {
  structure(list(), class = "ContentLayer")
}
#' Create an ExtractionMethod enum value
#'
#' Returns the default ExtractionMethod variant.
#'
#' @return An ExtractionMethod enum value: an empty list carrying the S3
#'   class `"ExtractionMethod"`.
#' @export
ExtractionMethod <- function() {
  structure(list(), class = "ExtractionMethod")
}
#' Create a ChunkType enum value
#'
#' Returns the default ChunkType variant.
#'
#' @return A ChunkType enum value: an empty list carrying the S3 class
#'   `"ChunkType"`.
#' @export
ChunkType <- function() {
  structure(list(), class = "ChunkType")
}
#' Create an ImageKind enum value
#'
#' Returns the default ImageKind variant.
#'
#' @return An ImageKind enum value: an empty list carrying the S3 class
#'   `"ImageKind"`.
#' @export
ImageKind <- function() {
  structure(list(), class = "ImageKind")
}
#' Create a ResultFormat enum value
#'
#' Returns the default ResultFormat variant.
#'
#' @return A ResultFormat enum value: an empty list carrying the S3 class
#'   `"ResultFormat"`.
#' @export
ResultFormat <- function() {
  structure(list(), class = "ResultFormat")
}
#' Create an ElementType enum value
#'
#' Returns the default ElementType variant.
#'
#' @return An ElementType enum value: an empty list carrying the S3 class
#'   `"ElementType"`.
#' @export
ElementType <- function() {
  structure(list(), class = "ElementType")
}
#' Create a TextDirection enum value
#'
#' Returns the default TextDirection variant.
#'
#' @return A TextDirection enum value: an empty list carrying the S3 class
#'   `"TextDirection"`.
#' @export
TextDirection <- function() {
  structure(list(), class = "TextDirection")
}
#' Create a LinkType enum value
#'
#' Returns the default LinkType variant.
#'
#' @return A LinkType enum value: an empty list carrying the S3 class
#'   `"LinkType"`.
#' @export
LinkType <- function() {
  structure(list(), class = "LinkType")
}
#' Create an ImageType enum value
#'
#' Returns the default ImageType variant.
#'
#' @return An ImageType enum value: an empty list carrying the S3 class
#'   `"ImageType"`.
#' @export
ImageType <- function() {
  structure(list(), class = "ImageType")
}
#' Create a StructuredDataType enum value
#'
#' Returns the default StructuredDataType variant.
#'
#' @return A StructuredDataType enum value: an empty list carrying the S3
#'   class `"StructuredDataType"`.
#' @export
StructuredDataType <- function() {
  structure(list(), class = "StructuredDataType")
}
#' Create an OcrElementLevel enum value
#'
#' Returns the default OcrElementLevel variant.
#'
#' @return An OcrElementLevel enum value: an empty list carrying the S3
#'   class `"OcrElementLevel"`.
#' @export
OcrElementLevel <- function() {
  structure(list(), class = "OcrElementLevel")
}
#' Create a PageUnitType enum value
#'
#' Returns the default PageUnitType variant.
#'
#' @return A PageUnitType enum value: an empty list carrying the S3 class
#'   `"PageUnitType"`.
#' @export
PageUnitType <- function() {
  structure(list(), class = "PageUnitType")
}
#' Create a RevisionKind enum value
#'
#' Returns the default RevisionKind variant.
#'
#' @return A RevisionKind enum value: an empty list carrying the S3 class
#'   `"RevisionKind"`.
#' @export
RevisionKind <- function() {
  structure(list(), class = "RevisionKind")
}
#' Create a UriKind enum value
#'
#' Returns the default UriKind variant.
#'
#' @return A UriKind enum value: an empty list carrying the S3 class
#'   `"UriKind"`.
#' @export
UriKind <- function() {
  structure(list(), class = "UriKind")
}
#' Create a KeywordAlgorithm enum value
#'
#' Returns the default KeywordAlgorithm variant.
#'
#' @return A KeywordAlgorithm enum value: an empty list carrying the S3
#'   class `"KeywordAlgorithm"`.
#' @export
KeywordAlgorithm <- function() {
  structure(list(), class = "KeywordAlgorithm")
}
#' Create a PSMMode enum value
#'
#' Returns the default PSMMode variant.
#'
#' @return A PSMMode enum value: an empty list carrying the S3 class
#'   `"PSMMode"`.
#' @export
PSMMode <- function() {
  structure(list(), class = "PSMMode")
}
#' Create a PaddleLanguage enum value
#'
#' Returns the default PaddleLanguage variant.
#'
#' @return A PaddleLanguage enum value: an empty list carrying the S3 class
#'   `"PaddleLanguage"`.
#' @export
PaddleLanguage <- function() {
  structure(list(), class = "PaddleLanguage")
}
#' Create a LayoutClass enum value
#'
#' Returns the default LayoutClass variant.
#'
#' @return A LayoutClass enum value: an empty list carrying the S3 class
#'   `"LayoutClass"`.
#' @export
LayoutClass <- function() {
  structure(list(), class = "LayoutClass")
}
#' How chunk size is measured
#'
#' Defaults to `Characters` (Unicode character count). When using token-based sizing,
#' chunks are sized by token count according to the specified tokenizer.
#'
#' Token-based sizing uses HuggingFace tokenizers loaded at runtime. Any tokenizer
#' available on HuggingFace Hub can be used, including OpenAI-compatible tokenizers
#' (e.g., `Xenova/gpt-4o`, `Xenova/cl100k_base`).
#' @export
ChunkSizing <- new.env(parent = emptyenv())
# Thin wrappers that forward to the native routines registered for kreuzberg.
ChunkSizing$default <- function() {
  .Call("wrap__ChunkSizing__default", PACKAGE = "kreuzberg")
}
ChunkSizing$from_json <- function(json) {
  .Call("wrap__ChunkSizing__from_json", json, PACKAGE = "kreuzberg")
}
#' @export
`$.ChunkSizing` <- function(self, name) {
  # Members live in the shared ChunkSizing environment; an unknown name
  # gives NULL, which falls through the check below and is returned as-is.
  member <- ChunkSizing[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Members whose first formal is `self` are returned bound to this object.
  function(...) member(self, ...)
}
#' @export
`[[.ChunkSizing` <- `$.ChunkSizing`
#' Embedding model types supported by Kreuzberg
#' @export
EmbeddingModelType <- new.env(parent = emptyenv())
# Thin wrappers that forward to the native routines registered for kreuzberg.
EmbeddingModelType$default <- function() {
  .Call("wrap__EmbeddingModelType__default", PACKAGE = "kreuzberg")
}
EmbeddingModelType$from_json <- function(json) {
  .Call("wrap__EmbeddingModelType__from_json", json, PACKAGE = "kreuzberg")
}
#' @export
`$.EmbeddingModelType` <- function(self, name) {
  # Members live in the shared EmbeddingModelType environment; an unknown
  # name gives NULL, which falls through the check below and is returned
  # as-is.
  member <- EmbeddingModelType[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Members whose first formal is `self` are returned bound to this object.
  function(...) member(self, ...)
}
#' @export
`[[.EmbeddingModelType` <- `$.EmbeddingModelType`
#' Tagged enum for node content. Each variant carries only type-specific data
#'
#' Uses `#[serde(tag = "node_type")]` to avoid "type" keyword collision in
#' Go/Java/TypeScript bindings.
#' @export
NodeContent <- new.env(parent = emptyenv())
# Thin wrappers that forward to the native routines registered for kreuzberg.
NodeContent$default <- function() {
  .Call("wrap__NodeContent__default", PACKAGE = "kreuzberg")
}
NodeContent$from_json <- function(json) {
  .Call("wrap__NodeContent__from_json", json, PACKAGE = "kreuzberg")
}
#' @export
`$.NodeContent` <- function(self, name) {
  # Members live in the shared NodeContent environment; an unknown name
  # gives NULL, which falls through the check below and is returned as-is.
  member <- NodeContent[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Members whose first formal is `self` are returned bound to this object.
  function(...) member(self, ...)
}
#' @export
`[[.NodeContent` <- `$.NodeContent`
#' Types of inline text annotations
#' @export
AnnotationKind <- new.env(parent = emptyenv())
# Thin wrappers that forward to the native routines registered for kreuzberg.
AnnotationKind$default <- function() {
  .Call("wrap__AnnotationKind__default", PACKAGE = "kreuzberg")
}
AnnotationKind$from_json <- function(json) {
  .Call("wrap__AnnotationKind__from_json", json, PACKAGE = "kreuzberg")
}
#' @export
`$.AnnotationKind` <- function(self, name) {
  # Members live in the shared AnnotationKind environment; an unknown name
  # gives NULL, which falls through the check below and is returned as-is.
  member <- AnnotationKind[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Members whose first formal is `self` are returned bound to this object.
  function(...) member(self, ...)
}
#' @export
`[[.AnnotationKind` <- `$.AnnotationKind`
#' Bounding geometry for an OCR element
#'
#' Supports both axis-aligned rectangles (from Tesseract) and 4-point quadrilaterals
#' (from PaddleOCR and rotated text detection).
#' @export
OcrBoundingGeometry <- new.env(parent = emptyenv())
# Thin wrappers that forward to the native routines registered for kreuzberg.
OcrBoundingGeometry$default <- function() {
  .Call("wrap__OcrBoundingGeometry__default", PACKAGE = "kreuzberg")
}
OcrBoundingGeometry$from_json <- function(json) {
  .Call("wrap__OcrBoundingGeometry__from_json", json, PACKAGE = "kreuzberg")
}
#' @export
`$.OcrBoundingGeometry` <- function(self, name) {
  # Members live in the shared OcrBoundingGeometry environment; an unknown
  # name gives NULL, which falls through the check below and is returned
  # as-is.
  member <- OcrBoundingGeometry[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Members whose first formal is `self` are returned bound to this object.
  function(...) member(self, ...)
}
#' @export
`[[.OcrBoundingGeometry` <- `$.OcrBoundingGeometry`
#' Best-effort document location for a revision
#' @export
RevisionAnchor <- new.env(parent = emptyenv())
# Thin wrappers that forward to the native routines registered for kreuzberg.
RevisionAnchor$default <- function() {
  .Call("wrap__RevisionAnchor__default", PACKAGE = "kreuzberg")
}
RevisionAnchor$from_json <- function(json) {
  .Call("wrap__RevisionAnchor__from_json", json, PACKAGE = "kreuzberg")
}
#' @export
`$.RevisionAnchor` <- function(self, name) {
  # Members live in the shared RevisionAnchor environment; an unknown name
  # gives NULL, which falls through the check below and is returned as-is.
  member <- RevisionAnchor[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Members whose first formal is `self` are returned bound to this object.
  function(...) member(self, ...)
}
#' @export
`[[.RevisionAnchor` <- `$.RevisionAnchor`
#' @export
cors_allows_all <- function(x, ...) {
  # S3 generic; the method is selected by the class of `x`.
  UseMethod("cors_allows_all")
}
#' @export
is_empty <- function(x, ...) {
  # S3 generic; the method is selected by the class of `x`.
  UseMethod("is_empty")
}
#' @export
is_origin_allowed <- function(x, ...) {
  # S3 generic; the method is selected by the class of `x`.
  UseMethod("is_origin_allowed")
}
#' @export
listen_addr <- function(x, ...) {
  # S3 generic; the method is selected by the class of `x`.
  UseMethod("listen_addr")
}
#' @export
max_multipart_field_mb <- function(x, ...) {
  # S3 generic; the method is selected by the class of `x`.
  UseMethod("max_multipart_field_mb")
}
#' @export
max_request_body_mb <- function(x, ...) {
  # S3 generic; the method is selected by the class of `x`.
  UseMethod("max_request_body_mb")
}
#' @export
needs_image_processing <- function(x, ...) {
  # S3 generic; the method is selected by the class of `x`.
  UseMethod("needs_image_processing")
}
#' @export
with_angle_cls <- function(x, ...) {
  # S3 generic; the method is selected by the class of `x`.
  UseMethod("with_angle_cls")
}
#' @export
with_cache_dir <- function(x, ...) {
  # S3 generic; the method is selected by the class of `x`.
  UseMethod("with_cache_dir")
}
#' @export
with_det_db_box_thresh <- function(x, ...) {
  # S3 generic; the method is selected by the class of `x`.
  UseMethod("with_det_db_box_thresh")
}
#' @export
with_det_db_thresh <- function(x, ...) {
  # S3 generic; the method is selected by the class of `x`.
  UseMethod("with_det_db_thresh")
}
#' @export
with_det_db_unclip_ratio <- function(x, ...) {
  # S3 generic; the method is selected by the class of `x`.
  UseMethod("with_det_db_unclip_ratio")
}
#' @export
with_det_limit_side_len <- function(x, ...) {
  # S3 generic; the method is selected by the class of `x`.
  UseMethod("with_det_limit_side_len")
}
#' @export
with_drop_score <- function(x, ...) {
  # S3 generic; the method is selected by the class of `x`.
  UseMethod("with_drop_score")
}
#' @export
with_model_tier <- function(x, ...) {
  # S3 generic; the method is selected by the class of `x`.
  UseMethod("with_model_tier")
}
#' @export
with_padding <- function(x, ...) {
  # S3 generic; the method is selected by the class of `x`.
  UseMethod("with_padding")
}
#' @export
with_rec_batch_num <- function(x, ...) {
  # S3 generic; the method is selected by the class of `x`.
  UseMethod("with_rec_batch_num")
}
#' @export
with_table_detection <- function(x, ...) {
  # S3 generic; the method is selected by the class of `x`.
  UseMethod("with_table_detection")
}