3519 lines
135 KiB
R
Generated
3519 lines
135 KiB
R
Generated
# Generated by extendr: Do not edit by hand
|
||
#
|
||
# This file is regenerated by alef on every `alef generate` run.
|
||
# It mirrors the output of `rextendr::document()` and binds every
|
||
# wrap__<symbol> entry registered in extendr_module! to an R-callable
|
||
# function or class env.
|
||
|
||
#' @useDynLib kreuzberg, .registration = TRUE
|
||
NULL
|
||
|
||
#' Extract content from a byte array
|
||
#'
|
||
#' This is the main entry point for in-memory extraction. It performs the following steps:
|
||
#' 1. Validate MIME type
|
||
#' 2. Handle legacy format conversion if needed
|
||
#' 3. Select appropriate extractor from registry
|
||
#' 4. Extract content
|
||
#' 5. Run post-processing pipeline
|
||
#' @param content The byte array to extract.
|
||
#' @param mime_type MIME type of the content.
|
||
#' @param config Extraction configuration.
|
||
#' @return An `ExtractionResult` containing the extracted content and metadata.
|
||
#'
|
||
#' @section Errors:
|
||
#' Returns `KreuzbergError::Validation` if MIME type is invalid.
|
||
#' Returns `KreuzbergError::UnsupportedFormat` if MIME type is not supported.
|
||
#' @export
|
||
extract_bytes <- function(content,
                          mime_type,
                          config = ExtractionConfig$default()) {
  # Forward the arguments unchanged to the native `wrap__extract_bytes`
  # routine registered by the kreuzberg shared library.
  .Call(
    "wrap__extract_bytes",
    content,
    mime_type,
    config,
    PACKAGE = "kreuzberg"
  )
}
|
||
#' Extract content from a file
|
||
#'
|
||
#' This is the main entry point for file-based extraction. It performs the following steps:
|
||
#' 1. Check cache for existing result (if caching enabled)
|
||
#' 2. Detect or validate MIME type
|
||
#' 3. Select appropriate extractor from registry
|
||
#' 4. Extract content
|
||
#' 5. Run post-processing pipeline
|
||
#' 6. Store result in cache (if caching enabled)
|
||
#' @param path Path to the file to extract.
|
||
#' @param mime_type Optional MIME type override. If `NULL`, the type is auto-detected.
|
||
#' @param config Extraction configuration.
|
||
#' @return An `ExtractionResult` containing the extracted content and metadata.
|
||
#'
|
||
#' @section Errors:
|
||
#' Returns `KreuzbergError::Io` if the file doesn't exist (NotFound) or for other file I/O errors.
|
||
#' Returns `KreuzbergError::UnsupportedFormat` if MIME type is not supported.
|
||
#' @export
|
||
extract_file <- function(path,
                         mime_type = NULL,
                         config = ExtractionConfig$default()) {
  # Forward the arguments unchanged to the native `wrap__extract_file`
  # routine registered by the kreuzberg shared library.
  .Call(
    "wrap__extract_file",
    path,
    mime_type,
    config,
    PACKAGE = "kreuzberg"
  )
}
|
||
#' Synchronous wrapper for `extract_file`
|
||
#'
|
||
#' This is a convenience function that blocks the current thread until extraction completes.
|
||
#' For async code, use `extract_file` directly.
|
||
#'
|
||
#' Uses the global Tokio runtime for 100x+ performance improvement over creating
|
||
#' a new runtime per call. Always uses the global runtime to avoid nested runtime issues.
|
||
#'
|
||
#' This function is only available with the `tokio-runtime` feature. For WASM targets,
|
||
#' use a truly synchronous extraction approach instead.
|
||
#' @param path File path as character string.
|
||
#' @param mime_type Character string.
|
||
#' @param config ExtractionConfig object (list with class attribute).
|
||
#' @return ExtractionResult object (list with class attribute).
|
||
#' @export
|
||
extract_file_sync <- function(path,
                              mime_type = NULL,
                              config = ExtractionConfig$default()) {
  # Forward the arguments unchanged to the native `wrap__extract_file_sync`
  # routine registered by the kreuzberg shared library.
  .Call(
    "wrap__extract_file_sync",
    path,
    mime_type,
    config,
    PACKAGE = "kreuzberg"
  )
}
|
||
#' Synchronous wrapper for `extract_bytes`
|
||
#'
|
||
#' Uses the global Tokio runtime for 100x+ performance improvement over creating
|
||
#' a new runtime per call.
|
||
#'
|
||
#' With the `tokio-runtime` feature, this blocks the current thread using the global
|
||
#' Tokio runtime. Without it (WASM), this calls a truly synchronous implementation.
|
||
#' @param content Raw vector of bytes.
|
||
#' @param mime_type Character string.
|
||
#' @param config ExtractionConfig object (list with class attribute).
|
||
#' @return ExtractionResult object (list with class attribute).
|
||
#' @export
|
||
extract_bytes_sync <- function(content,
                               mime_type,
                               config = ExtractionConfig$default()) {
  # Forward the arguments unchanged to the native `wrap__extract_bytes_sync`
  # routine registered by the kreuzberg shared library.
  .Call(
    "wrap__extract_bytes_sync",
    content,
    mime_type,
    config,
    PACKAGE = "kreuzberg"
  )
}
|
||
#' Synchronous wrapper for `batch_extract_files`
|
||
#'
|
||
#' Uses the global Tokio runtime for optimal performance.
|
||
#' Only available with `tokio-runtime` (WASM has no filesystem).
|
||
#' @param items List of `BatchFileItem` objects (lists with class attribute).
|
||
#' @param config ExtractionConfig object (list with class attribute).
|
||
#' @return List of `ExtractionResult` objects (lists with class attribute).
|
||
#' @export
|
||
batch_extract_files_sync <- function(items,
                                     config = ExtractionConfig$default()) {
  # Forward the arguments unchanged to the native
  # `wrap__batch_extract_files_sync` routine.
  .Call(
    "wrap__batch_extract_files_sync",
    items,
    config,
    PACKAGE = "kreuzberg"
  )
}
|
||
#' Synchronous wrapper for `batch_extract_bytes`
|
||
#'
|
||
#' Uses the global Tokio runtime for optimal performance.
|
||
#' With the `tokio-runtime` feature, this blocks the current thread using the global
|
||
#' Tokio runtime. Without it (WASM), this calls a truly synchronous implementation
|
||
#' that iterates through items and calls `extract_bytes_sync()`.
|
||
#' @param items List of `BatchBytesItem` objects (lists with class attribute).
|
||
#' @param config ExtractionConfig object (list with class attribute).
|
||
#' @return List of `ExtractionResult` objects (lists with class attribute).
|
||
#' @export
|
||
batch_extract_bytes_sync <- function(items,
                                     config = ExtractionConfig$default()) {
  # Forward the arguments unchanged to the native
  # `wrap__batch_extract_bytes_sync` routine.
  .Call(
    "wrap__batch_extract_bytes_sync",
    items,
    config,
    PACKAGE = "kreuzberg"
  )
}
|
||
#' Extract content from multiple files concurrently
|
||
#'
|
||
#' This function processes multiple files in parallel, automatically managing
|
||
#' concurrency to prevent resource exhaustion. The concurrency limit can be
|
||
#' configured via `ExtractionConfig::max_concurrent_extractions` or defaults
|
||
#' to `(num_cpus * 1.5).ceil()`.
|
||
#'
|
||
#' Each file can optionally specify a [`FileExtractionConfig`] that overrides specific
|
||
#' fields from the batch-level `config`. Pass `NULL` for a file to use the batch defaults.
|
||
#' Batch-level settings like `max_concurrent_extractions` and `use_cache` are always
|
||
#' taken from the batch-level `config`.
|
||
#' @param items Vector of `BatchFileItem` structs, each containing a path and optional per-file configuration overrides.
|
||
#' @param config Batch-level extraction configuration (provides defaults and batch settings).
|
||
#' @return A vector of `ExtractionResult` in the same order as the input items.
|
||
#'
|
||
#' @section Errors:
|
||
#' Individual file errors are captured in the result metadata. System errors
|
||
#' (IO, RuntimeError equivalents) will bubble up and fail the entire batch.
|
||
#' @export
|
||
batch_extract_files <- function(items,
                                config = ExtractionConfig$default()) {
  # Forward the arguments unchanged to the native
  # `wrap__batch_extract_files` routine.
  .Call(
    "wrap__batch_extract_files",
    items,
    config,
    PACKAGE = "kreuzberg"
  )
}
|
||
#' Extract content from multiple byte arrays concurrently
|
||
#'
|
||
#' This function processes multiple byte arrays in parallel, automatically managing
|
||
#' concurrency to prevent resource exhaustion. The concurrency limit can be
|
||
#' configured via `ExtractionConfig::max_concurrent_extractions` or defaults
|
||
#' to `(num_cpus * 1.5).ceil()`.
|
||
#'
|
||
#' Each item can optionally specify a [`FileExtractionConfig`] that overrides specific
|
||
#' fields from the batch-level `config`. Pass `NULL` as the config to use
|
||
#' the batch-level defaults for that item.
|
||
#' @param items Vector of `BatchBytesItem` structs, each containing content bytes, MIME type, and optional per-item configuration overrides.
|
||
#' @param config Batch-level extraction configuration.
|
||
#' @return A vector of `ExtractionResult` in the same order as the input items.
|
||
#' @export
|
||
batch_extract_bytes <- function(items,
                                config = ExtractionConfig$default()) {
  # Forward the arguments unchanged to the native
  # `wrap__batch_extract_bytes` routine.
  .Call(
    "wrap__batch_extract_bytes",
    items,
    config,
    PACKAGE = "kreuzberg"
  )
}
|
||
#' Detect MIME type from raw file bytes
|
||
#'
|
||
#' Uses magic byte signatures to detect file type from content.
|
||
#' Falls back to `infer` crate for comprehensive detection.
|
||
#'
|
||
#' For ZIP-based files, inspects contents to distinguish Office Open XML
|
||
#' formats (DOCX, XLSX, PPTX) from plain ZIP archives.
|
||
#' @param content Raw file bytes.
|
||
#' @return The detected MIME type string.
|
||
#'
|
||
#' @section Errors:
|
||
#' Returns `KreuzbergError::UnsupportedFormat` if MIME type cannot be determined.
|
||
#' @export
|
||
detect_mime_type_from_bytes <- function(content) {
  # Forward `content` unchanged to the native
  # `wrap__detect_mime_type_from_bytes` routine.
  .Call(
    "wrap__detect_mime_type_from_bytes",
    content,
    PACKAGE = "kreuzberg"
  )
}
|
||
#' Get file extensions for a given MIME type
|
||
#'
|
||
#' Returns all known file extensions that map to the specified MIME type.
|
||
#' @param mime_type The MIME type to look up.
|
||
#' @return A vector of file extensions (without leading dot) for the MIME type.
|
||
#' @export
|
||
get_extensions_for_mime <- function(mime_type) {
  # Forward `mime_type` unchanged to the native
  # `wrap__get_extensions_for_mime` routine.
  .Call(
    "wrap__get_extensions_for_mime",
    mime_type,
    PACKAGE = "kreuzberg"
  )
}
|
||
#' List the names of all registered embedding backends
|
||
#'
|
||
#' Used by `kreuzberg-cli`, the api/mcp endpoints, and generated language
|
||
#' bindings.
|
||
#' @return List of character string.
|
||
#' @export
|
||
list_embedding_backends <- function() {
  # Argument-free call into the native `wrap__list_embedding_backends` routine.
  .Call(
    "wrap__list_embedding_backends",
    PACKAGE = "kreuzberg"
  )
}
|
||
#' List names of all registered document extractors
|
||
#' @return List of character string.
|
||
#' @export
|
||
list_document_extractors <- function() {
  # Argument-free call into the native `wrap__list_document_extractors` routine.
  .Call(
    "wrap__list_document_extractors",
    PACKAGE = "kreuzberg"
  )
}
|
||
#' List all registered OCR backends
|
||
#'
|
||
#' Returns the names of all OCR backends currently registered in the global registry.
|
||
#' @return A vector of OCR backend names.
|
||
#' @export
|
||
list_ocr_backends <- function() {
  # Argument-free call into the native `wrap__list_ocr_backends` routine.
  .Call(
    "wrap__list_ocr_backends",
    PACKAGE = "kreuzberg"
  )
}
|
||
#' List all registered post-processor names
|
||
#'
|
||
#' Returns a vector of all post-processor names currently registered in the
|
||
#' global registry.
|
||
#' @return Vector of post-processor names. An error is raised if the
#' registry lock is poisoned.
|
||
#' @export
|
||
list_post_processors <- function() {
  # Argument-free call into the native `wrap__list_post_processors` routine.
  .Call(
    "wrap__list_post_processors",
    PACKAGE = "kreuzberg"
  )
}
|
||
#' List names of all registered renderers
|
||
#' @return List of character string.
|
||
#'
|
||
#' @section Errors:
|
||
#' Returns an error if the registry lock is poisoned.
|
||
#' @export
|
||
list_renderers <- function() {
  # Argument-free call into the native `wrap__list_renderers` routine.
  .Call(
    "wrap__list_renderers",
    PACKAGE = "kreuzberg"
  )
}
|
||
#' List names of all registered validators
|
||
#' @return List of character string.
|
||
#' @export
|
||
list_validators <- function() {
  # Argument-free call into the native `wrap__list_validators` routine.
  .Call(
    "wrap__list_validators",
    PACKAGE = "kreuzberg"
  )
}
|
||
#' Compare two extraction results and return a structured diff
|
||
#'
|
||
#' The comparison is purely structural — no I/O, no side effects. All fields
|
||
#' of [`ExtractionDiff`] are populated according to the provided [`DiffOptions`].
|
||
#' @param a The "before" extraction result.
#' @param b The "after" extraction result.
#' @param opts Controls which sections are compared and optional truncation.
|
||
#' @return ExtractionDiff object (list with class attribute).
|
||
#' @export
|
||
compare <- function(a = ExtractionResult$default(),
                    b = ExtractionResult$default(),
                    opts = DiffOptions$default()) {
  # Forward both results and the options unchanged to the native
  # `wrap__compare` routine.
  .Call(
    "wrap__compare",
    a,
    b,
    opts,
    PACKAGE = "kreuzberg"
  )
}
|
||
#' Generate embeddings asynchronously for a list of text strings
|
||
#'
|
||
#' This is the async counterpart to [`embed_texts`]. It offloads the blocking
|
||
#' ONNX inference work to a dedicated blocking thread pool via Tokio's
|
||
#' `spawn_blocking`, keeping the async executor free.
|
||
#'
|
||
#' Returns one embedding vector per input text in the same order.
|
||
#' @param texts Vec of strings to embed (owned, sent to blocking thread).
|
||
#' @param config Embedding configuration specifying model, batch size, and normalization.
|
||
#' @return List of list of numeric.
|
||
#'
|
||
#' @section Errors:
|
||
#' - `KreuzbergError::MissingDependency` if ONNX Runtime is not installed
|
||
#' - `KreuzbergError::Embedding` if the preset name is unknown, model download fails,
|
||
#' or the blocking inference task panics
|
||
#' @export
|
||
embed_texts_async <- function(texts,
                              config = EmbeddingConfig$default()) {
  # Forward the arguments unchanged to the native
  # `wrap__embed_texts_async` routine.
  .Call(
    "wrap__embed_texts_async",
    texts,
    config,
    PACKAGE = "kreuzberg"
  )
}
|
||
#' Render a single PDF page to PNG bytes
|
||
#'
|
||
#' Returns raw PNG-encoded bytes for the specified page at the given DPI.
|
||
#' Uses pdf_oxide with tiny-skia for pure-Rust rendering.
|
||
#' @param pdf_bytes Raw PDF file bytes.
|
||
#' @param page_index Zero-based page index.
|
||
#' @param dpi Resolution in dots per inch (default: 150).
|
||
#' @param password Optional password for encrypted PDFs.
|
||
#' @return Raw vector of bytes.
|
||
#'
|
||
#' @section Errors:
|
||
#' Returns `KreuzbergError::Parsing` if the PDF cannot be opened, authenticated,
|
||
#' or rendered, or if `page_index` is out of range.
|
||
#' @export
|
||
render_pdf_page_to_png <- function(pdf_bytes,
                                   page_index,
                                   dpi = NULL,
                                   password = NULL) {
  # `dpi` and `password` are passed through as given (including `NULL`);
  # this wrapper applies no defaults of its own.
  .Call(
    "wrap__render_pdf_page_to_png",
    pdf_bytes,
    page_index,
    dpi,
    password,
    PACKAGE = "kreuzberg"
  )
}
|
||
#' Detect the MIME type of a file at the given path
|
||
#'
|
||
#' Uses the file extension and optionally the file content to determine the MIME type.
|
||
#' Set `check_exists` to `TRUE` to verify the file exists before detection.
|
||
#' @param path Character string.
|
||
#' @param check_exists Logical (TRUE/FALSE).
|
||
#' @return Character string.
|
||
#' @export
|
||
detect_mime_type <- function(path, check_exists) {
  # Forward both arguments unchanged to the native
  # `wrap__detect_mime_type` routine.
  .Call(
    "wrap__detect_mime_type",
    path,
    check_exists,
    PACKAGE = "kreuzberg"
  )
}
|
||
#' Embed a list of texts using the configured embedding model
|
||
#'
|
||
#' Returns a 2D vector where each inner vector is the embedding for the corresponding text.
|
||
#' @param texts List of character string.
|
||
#' @param config EmbeddingConfig object (list with class attribute).
|
||
#' @return List of list of numeric.
|
||
#' @export
|
||
embed_texts <- function(texts,
                        config = EmbeddingConfig$default()) {
  # Forward the arguments unchanged to the native `wrap__embed_texts` routine.
  .Call(
    "wrap__embed_texts",
    texts,
    config,
    PACKAGE = "kreuzberg"
  )
}
|
||
#' Get an embedding preset by name
|
||
#'
|
||
#' Returns `NULL` if no preset with the given name exists. Returns an owned
|
||
#' clone so the value is safe to pass across FFI boundaries.
|
||
#' @param name Character string.
|
||
#' @return EmbeddingPreset object (list with class attribute), or `NULL` if no preset has that name.
|
||
#' @export
|
||
get_embedding_preset <- function(name) {
  # Forward `name` unchanged to the native `wrap__get_embedding_preset` routine.
  .Call(
    "wrap__get_embedding_preset",
    name,
    PACKAGE = "kreuzberg"
  )
}
|
||
#' List the names of all available embedding presets
|
||
#'
|
||
#' Returns owned `String`s so the values are safe to pass across FFI boundaries.
|
||
#' @return List of character string.
|
||
#' @export
|
||
list_embedding_presets <- function() {
  # Argument-free call into the native `wrap__list_embedding_presets` routine.
  .Call(
    "wrap__list_embedding_presets",
    PACKAGE = "kreuzberg"
  )
}
|
||
#' register_ocr_backend
|
||
#'
|
||
#' Register an R-side plugin implementation. Pass a named list whose entries
|
||
#' implement the trait's required methods (e.g. `list(name = function() "my", ...)`).
|
||
#'
|
||
#' @param r_backend Named list of R closures implementing the trait surface.
|
||
#'
|
||
#' @return Invisible NULL on success; raises an R error on failure.
|
||
#' @export
|
||
register_ocr_backend <- function(r_backend) {
  # Hand the R-side backend object to the native
  # `wrap__register_ocr_backend` routine as-is.
  .Call(
    "wrap__register_ocr_backend",
    r_backend,
    PACKAGE = "kreuzberg"
  )
}
|
||
#' unregister_ocr_backend
|
||
#'
|
||
#' Unregister a previously registered plugin by name.
|
||
#'
|
||
#' @param name Plugin name string as returned by the backend's `name()` method.
|
||
#'
|
||
#' @return Invisible NULL on success; raises an R error on failure.
|
||
#' @export
|
||
unregister_ocr_backend <- function(name) {
  # Pass the plugin name to the native `wrap__unregister_ocr_backend` routine.
  .Call(
    "wrap__unregister_ocr_backend",
    name,
    PACKAGE = "kreuzberg"
  )
}
|
||
#' clear_ocr_backends
|
||
#'
|
||
#' Remove every registered plugin of this type. Typically used in test teardown.
|
||
#'
|
||
#' @return Invisible NULL on success; raises an R error on failure.
|
||
#' @export
|
||
clear_ocr_backends <- function() {
  # Argument-free call into the native `wrap__clear_ocr_backends` routine.
  .Call(
    "wrap__clear_ocr_backends",
    PACKAGE = "kreuzberg"
  )
}
|
||
#' register_post_processor
|
||
#'
|
||
#' Register an R-side plugin implementation. Pass a named list whose entries
|
||
#' implement the trait's required methods (e.g. `list(name = function() "my", ...)`).
|
||
#'
|
||
#' @param r_backend Named list of R closures implementing the trait surface.
|
||
#'
|
||
#' @return Invisible NULL on success; raises an R error on failure.
|
||
#' @export
|
||
register_post_processor <- function(r_backend) {
  # Hand the R-side plugin object to the native
  # `wrap__register_post_processor` routine as-is.
  .Call(
    "wrap__register_post_processor",
    r_backend,
    PACKAGE = "kreuzberg"
  )
}
|
||
#' unregister_post_processor
|
||
#'
|
||
#' Unregister a previously registered plugin by name.
|
||
#'
|
||
#' @param name Plugin name string as returned by the backend's `name()` method.
|
||
#'
|
||
#' @return Invisible NULL on success; raises an R error on failure.
|
||
#' @export
|
||
unregister_post_processor <- function(name) {
  # Pass the plugin name to the native
  # `wrap__unregister_post_processor` routine.
  .Call(
    "wrap__unregister_post_processor",
    name,
    PACKAGE = "kreuzberg"
  )
}
|
||
#' clear_post_processors
|
||
#'
|
||
#' Remove every registered plugin of this type. Typically used in test teardown.
|
||
#'
|
||
#' @return Invisible NULL on success; raises an R error on failure.
|
||
#' @export
|
||
clear_post_processors <- function() {
  # Argument-free call into the native `wrap__clear_post_processors` routine.
  .Call(
    "wrap__clear_post_processors",
    PACKAGE = "kreuzberg"
  )
}
|
||
#' register_validator
|
||
#'
|
||
#' Register an R-side plugin implementation. Pass a named list whose entries
|
||
#' implement the trait's required methods (e.g. `list(name = function() "my", ...)`).
|
||
#'
|
||
#' @param r_backend Named list of R closures implementing the trait surface.
|
||
#'
|
||
#' @return Invisible NULL on success; raises an R error on failure.
|
||
#' @export
|
||
register_validator <- function(r_backend) {
  # Hand the R-side plugin object to the native
  # `wrap__register_validator` routine as-is.
  .Call(
    "wrap__register_validator",
    r_backend,
    PACKAGE = "kreuzberg"
  )
}
|
||
#' unregister_validator
|
||
#'
|
||
#' Unregister a previously registered plugin by name.
|
||
#'
|
||
#' @param name Plugin name string as returned by the backend's `name()` method.
|
||
#'
|
||
#' @return Invisible NULL on success; raises an R error on failure.
|
||
#' @export
|
||
unregister_validator <- function(name) {
  # Pass the plugin name to the native `wrap__unregister_validator` routine.
  .Call(
    "wrap__unregister_validator",
    name,
    PACKAGE = "kreuzberg"
  )
}
|
||
#' clear_validators
|
||
#'
|
||
#' Remove every registered plugin of this type. Typically used in test teardown.
|
||
#'
|
||
#' @return Invisible NULL on success; raises an R error on failure.
|
||
#' @export
|
||
clear_validators <- function() {
  # Argument-free call into the native `wrap__clear_validators` routine.
  .Call(
    "wrap__clear_validators",
    PACKAGE = "kreuzberg"
  )
}
|
||
#' register_embedding_backend
|
||
#'
|
||
#' Register an R-side plugin implementation. Pass a named list whose entries
|
||
#' implement the trait's required methods (e.g. `list(name = function() "my", ...)`).
|
||
#'
|
||
#' @param r_backend Named list of R closures implementing the trait surface.
|
||
#'
|
||
#' @return Invisible NULL on success; raises an R error on failure.
|
||
#' @export
|
||
register_embedding_backend <- function(r_backend) {
  # Hand the R-side backend object to the native
  # `wrap__register_embedding_backend` routine as-is.
  .Call(
    "wrap__register_embedding_backend",
    r_backend,
    PACKAGE = "kreuzberg"
  )
}
|
||
#' unregister_embedding_backend
|
||
#'
|
||
#' Unregister a previously registered plugin by name.
|
||
#'
|
||
#' @param name Plugin name string as returned by the backend's `name()` method.
|
||
#'
|
||
#' @return Invisible NULL on success; raises an R error on failure.
|
||
#' @export
|
||
unregister_embedding_backend <- function(name) {
  # Pass the plugin name to the native
  # `wrap__unregister_embedding_backend` routine.
  .Call(
    "wrap__unregister_embedding_backend",
    name,
    PACKAGE = "kreuzberg"
  )
}
|
||
#' clear_embedding_backends
|
||
#'
|
||
#' Remove every registered plugin of this type. Typically used in test teardown.
|
||
#'
|
||
#' @return Invisible NULL on success; raises an R error on failure.
|
||
#' @export
|
||
clear_embedding_backends <- function() {
  # Argument-free call into the native `wrap__clear_embedding_backends` routine.
  .Call(
    "wrap__clear_embedding_backends",
    PACKAGE = "kreuzberg"
  )
}
|
||
#' register_document_extractor
|
||
#'
|
||
#' Register an R-side plugin implementation. Pass a named list whose entries
|
||
#' implement the trait's required methods (e.g. `list(name = function() "my", ...)`).
|
||
#'
|
||
#' @param r_backend Named list of R closures implementing the trait surface.
|
||
#'
|
||
#' @return Invisible NULL on success; raises an R error on failure.
|
||
#' @export
|
||
register_document_extractor <- function(r_backend) {
  # Hand the R-side plugin object to the native
  # `wrap__register_document_extractor` routine as-is.
  .Call(
    "wrap__register_document_extractor",
    r_backend,
    PACKAGE = "kreuzberg"
  )
}
|
||
#' unregister_document_extractor
|
||
#'
|
||
#' Unregister a previously registered plugin by name.
|
||
#'
|
||
#' @param name Plugin name string as returned by the backend's `name()` method.
|
||
#'
|
||
#' @return Invisible NULL on success; raises an R error on failure.
|
||
#' @export
|
||
unregister_document_extractor <- function(name) {
  # Pass the plugin name to the native
  # `wrap__unregister_document_extractor` routine.
  .Call(
    "wrap__unregister_document_extractor",
    name,
    PACKAGE = "kreuzberg"
  )
}
|
||
#' clear_document_extractors
|
||
#'
|
||
#' Remove every registered plugin of this type. Typically used in test teardown.
|
||
#'
|
||
#' @return Invisible NULL on success; raises an R error on failure.
|
||
#' @export
|
||
clear_document_extractors <- function() {
  # Argument-free call into the native `wrap__clear_document_extractors`
  # routine.
  .Call(
    "wrap__clear_document_extractors",
    PACKAGE = "kreuzberg"
  )
}
|
||
#' register_renderer
|
||
#'
|
||
#' Register an R-side plugin implementation. Pass a named list whose entries
|
||
#' implement the trait's required methods (e.g. `list(name = function() "my", ...)`).
|
||
#'
|
||
#' @param r_backend Named list of R closures implementing the trait surface.
|
||
#'
|
||
#' @return Invisible NULL on success; raises an R error on failure.
|
||
#' @export
|
||
register_renderer <- function(r_backend) {
  # Hand the R-side plugin object to the native
  # `wrap__register_renderer` routine as-is.
  .Call(
    "wrap__register_renderer",
    r_backend,
    PACKAGE = "kreuzberg"
  )
}
|
||
#' unregister_renderer
|
||
#'
|
||
#' Unregister a previously registered plugin by name.
|
||
#'
|
||
#' @param name Plugin name string as returned by the backend's `name()` method.
|
||
#'
|
||
#' @return Invisible NULL on success; raises an R error on failure.
|
||
#' @export
|
||
unregister_renderer <- function(name) {
  # Pass the plugin name to the native `wrap__unregister_renderer` routine.
  .Call(
    "wrap__unregister_renderer",
    name,
    PACKAGE = "kreuzberg"
  )
}
|
||
#' clear_renderers
|
||
#'
|
||
#' Remove every registered plugin of this type. Typically used in test teardown.
|
||
#'
|
||
#' @return Invisible NULL on success; raises an R error on failure.
|
||
#' @export
|
||
clear_renderers <- function() {
  # Argument-free call into the native `wrap__clear_renderers` routine.
  .Call(
    "wrap__clear_renderers",
    PACKAGE = "kreuzberg"
  )
}
|
||
#' CacheStats
|
||
#' @field total_files total_files
|
||
#' @field total_size_mb total_size_mb
|
||
#' @field available_space_mb available_space_mb
|
||
#' @field oldest_file_age_days oldest_file_age_days
|
||
#' @field newest_file_age_days newest_file_age_days
|
||
#' @export
|
||
CacheStats <- new.env(parent = emptyenv())

#' @export
`$.CacheStats` <- function(self, name) {
  # Members stored in the `CacheStats` environment take precedence.
  # Environments only accept a single string key, so any other index goes
  # straight to the data fallback below.
  member <- NULL
  if (is.character(name) && length(name) == 1L) {
    member <- CacheStats[[name]]
  }
  if (is.null(member)) {
    # Not a class member: read the value from the object itself when it is
    # list-backed. `.subset2()` bypasses S3 dispatch, so this cannot recurse
    # into `[[.CacheStats` (an alias of this function). Unknown names still
    # yield NULL, as before.
    if (is.list(self)) {
      return(.subset2(self, name))
    }
    return(NULL)
  }
  if (identical(names(formals(member))[1], "self")) {
    # Instance method: bind the receiver so callers write `obj$method(...)`.
    function(...) member(self, ...)
  } else {
    member
  }
}

#' @export
`[[.CacheStats` <- `$.CacheStats`
|
||
#' Hardware acceleration configuration for ONNX Runtime models
|
||
#'
|
||
#' Controls which execution provider (CPU, CoreML, CUDA, TensorRT) is used
|
||
#' for inference in layout detection and embedding generation.
|
||
#' @field provider Execution provider to use for ONNX inference.
|
||
#' @field device_id GPU device ID (for CUDA/TensorRT). Ignored for CPU/CoreML/Auto.
|
||
#' @export
|
||
AccelerationConfig <- new.env(parent = emptyenv())

# Constructor: passes `json` to the native
# `wrap__AccelerationConfig__from_json` routine.
AccelerationConfig$from_json <- function(json) {
  .Call("wrap__AccelerationConfig__from_json", json, PACKAGE = "kreuzberg")
}

#' @export
`$.AccelerationConfig` <- function(self, name) {
  # Members stored in the `AccelerationConfig` environment take precedence.
  # Environments only accept a single string key, so any other index goes
  # straight to the data fallback below.
  member <- NULL
  if (is.character(name) && length(name) == 1L) {
    member <- AccelerationConfig[[name]]
  }
  if (is.null(member)) {
    # Not a class member: read the value from the object itself when it is
    # list-backed. `.subset2()` bypasses S3 dispatch, so this cannot recurse
    # into `[[.AccelerationConfig` (an alias of this function). Unknown names
    # still yield NULL, as before.
    if (is.list(self)) {
      return(.subset2(self, name))
    }
    return(NULL)
  }
  if (identical(names(formals(member))[1], "self")) {
    # Instance method: bind the receiver so callers write `obj$method(...)`.
    function(...) member(self, ...)
  } else {
    member
  }
}

#' @export
`[[.AccelerationConfig` <- `$.AccelerationConfig`
|
||
#' Cross-extractor content filtering configuration
|
||
#'
|
||
#' Controls whether "furniture" content (headers, footers, page numbers,
|
||
#' watermarks, repeating text) is included in or stripped from extraction
|
||
#' results. Applies across all extractors (PDF, DOCX, RTF, ODT, HTML, etc.)
|
||
#' with format-specific implementation.
|
||
#'
|
||
#' When `None` on `ExtractionConfig`, each extractor uses its current
|
||
#' default behavior unchanged.
|
||
#' @field include_headers Include running headers in extraction output.
|
||
#' @field include_footers Include running footers in extraction output.
|
||
#' @field strip_repeating_text Enable the heuristic cross-page repeating text detector.
|
||
#' @field include_watermarks Include watermark text in extraction output.
|
||
#' @export
|
||
ContentFilterConfig <- new.env(parent = emptyenv())

# Constructor: calls the native `wrap__ContentFilterConfig__default` routine.
ContentFilterConfig$default <- function() {
  .Call("wrap__ContentFilterConfig__default", PACKAGE = "kreuzberg")
}

# Constructor: passes `json` to the native
# `wrap__ContentFilterConfig__from_json` routine.
ContentFilterConfig$from_json <- function(json) {
  .Call("wrap__ContentFilterConfig__from_json", json, PACKAGE = "kreuzberg")
}

#' @export
`$.ContentFilterConfig` <- function(self, name) {
  # Members stored in the `ContentFilterConfig` environment take precedence.
  # Environments only accept a single string key, so any other index goes
  # straight to the data fallback below.
  member <- NULL
  if (is.character(name) && length(name) == 1L) {
    member <- ContentFilterConfig[[name]]
  }
  if (is.null(member)) {
    # Not a class member: read the value from the object itself when it is
    # list-backed. `.subset2()` bypasses S3 dispatch, so this cannot recurse
    # into `[[.ContentFilterConfig` (an alias of this function). Unknown names
    # still yield NULL, as before.
    if (is.list(self)) {
      return(.subset2(self, name))
    }
    return(NULL)
  }
  if (identical(names(formals(member))[1], "self")) {
    # Instance method: bind the receiver so callers write `obj$method(...)`.
    function(...) member(self, ...)
  } else {
    member
  }
}

#' @export
`[[.ContentFilterConfig` <- `$.ContentFilterConfig`
|
||
#' Configuration for email extraction
|
||
#' @field msg_fallback_codepage Windows codepage number to use when an MSG file contains no codepage property. Defaults
|
||
#' @export
|
||
EmailConfig <- new.env(parent = emptyenv())

# Constructor: passes `json` to the native `wrap__EmailConfig__from_json`
# routine.
EmailConfig$from_json <- function(json) {
  .Call("wrap__EmailConfig__from_json", json, PACKAGE = "kreuzberg")
}

#' @export
`$.EmailConfig` <- function(self, name) {
  # Members stored in the `EmailConfig` environment take precedence.
  # Environments only accept a single string key, so any other index goes
  # straight to the data fallback below.
  member <- NULL
  if (is.character(name) && length(name) == 1L) {
    member <- EmailConfig[[name]]
  }
  if (is.null(member)) {
    # Not a class member: read the value from the object itself when it is
    # list-backed. `.subset2()` bypasses S3 dispatch, so this cannot recurse
    # into `[[.EmailConfig` (an alias of this function). Unknown names still
    # yield NULL, as before.
    if (is.list(self)) {
      return(.subset2(self, name))
    }
    return(NULL)
  }
  if (identical(names(formals(member))[1], "self")) {
    # Instance method: bind the receiver so callers write `obj$method(...)`.
    function(...) member(self, ...)
  } else {
    member
  }
}

#' @export
`[[.EmailConfig` <- `$.EmailConfig`
|
||
#' Main extraction configuration
|
||
#'
|
||
#' This struct contains all configuration options for the extraction process.
|
||
#' It can be loaded from TOML, YAML, or JSON files, or created programmatically.
|
||
#' @field use_cache Enable caching of extraction results
|
||
#' @field enable_quality_processing Enable quality post-processing
|
||
#' @field ocr OCR configuration (None = OCR disabled)
|
||
#' @field force_ocr Force OCR even for searchable PDFs
|
||
#' @field force_ocr_pages Force OCR on specific pages only (1-indexed page numbers, must be >= 1).
|
||
#' @field disable_ocr Disable OCR entirely, even for images.
|
||
#' @field chunking Text chunking configuration (None = chunking disabled)
|
||
#' @field content_filter Content filtering configuration (None = use extractor defaults).
|
||
#' @field images Image extraction configuration (None = no image extraction)
|
||
#' @field pdf_options PDF-specific options (None = use defaults)
|
||
#' @field token_reduction Token reduction configuration (None = no token reduction)
|
||
#' @field language_detection Language detection configuration (None = no language detection)
|
||
#' @field pages Page extraction configuration (None = no page tracking)
|
||
#' @field keywords Keyword extraction configuration (None = no keyword extraction)
|
||
#' @field postprocessor Post-processor configuration (None = use defaults)
|
||
#' @field html_options HTML to Markdown conversion options (None = use defaults)
|
||
#' @field html_output Styled HTML output configuration.
|
||
#' @field extraction_timeout_secs Default per-file timeout in seconds for batch extraction.
|
||
#' @field max_concurrent_extractions Maximum concurrent extractions in batch operations (None = `(num_cpus * 1.5).ceil()`).
|
||
#' @field result_format Result structure format
|
||
#' @field security_limits Security limits for archive extraction.
|
||
#' @field max_embedded_file_bytes Maximum uncompressed size in bytes for a single embedded file before recursive
|
||
#' @field output_format Content text format (default: Plain).
|
||
#' @field layout Layout detection configuration (None = layout detection disabled).
|
||
#' @field use_layout_for_markdown Run layout detection on the non-OCR PDF markdown path.
|
||
#' @field include_document_structure Enable structured document tree output.
|
||
#' @field acceleration Hardware acceleration configuration for ONNX Runtime models.
|
||
#' @field cache_namespace Cache namespace for tenant isolation.
|
||
#' @field cache_ttl_secs Per-request cache TTL in seconds.
|
||
#' @field email Email extraction configuration (None = use defaults).
|
||
#' @field concurrency Concurrency limits for constrained environments (None = use defaults).
|
||
#' @field max_archive_depth Maximum recursion depth for archive extraction (default: 3). Set to 0 to disable recursive
|
||
#' @field tree_sitter Tree-sitter language pack configuration (None = tree-sitter disabled).
|
||
#' @field structured_extraction Structured extraction via LLM (None = disabled).
|
||
#' @field cancel_token Cancellation token for this extraction (None = no external cancellation).
|
||
#' @export
|
||
ExtractionConfig <- new.env(parent = emptyenv())

# Constructor: calls the native `wrap__ExtractionConfig__default` routine.
ExtractionConfig$default <- function() {
  .Call("wrap__ExtractionConfig__default", PACKAGE = "kreuzberg")
}

# Instance method: passes the receiver to the native
# `wrap__ExtractionConfig__needs_image_processing` routine.
ExtractionConfig$needs_image_processing <- function(self) {
  .Call(
    "wrap__ExtractionConfig__needs_image_processing",
    self,
    PACKAGE = "kreuzberg"
  )
}

# Constructor: passes `json` to the native
# `wrap__ExtractionConfig__from_json` routine.
ExtractionConfig$from_json <- function(json) {
  .Call("wrap__ExtractionConfig__from_json", json, PACKAGE = "kreuzberg")
}

#' @export
`$.ExtractionConfig` <- function(self, name) {
  # Members stored in the `ExtractionConfig` environment take precedence.
  # Environments only accept a single string key, so any other index goes
  # straight to the data fallback below.
  member <- NULL
  if (is.character(name) && length(name) == 1L) {
    member <- ExtractionConfig[[name]]
  }
  if (is.null(member)) {
    # Not a class member: read the value from the object itself when it is
    # list-backed. `.subset2()` bypasses S3 dispatch, so this cannot recurse
    # into `[[.ExtractionConfig` (an alias of this function). Unknown names
    # still yield NULL, as before.
    if (is.list(self)) {
      return(.subset2(self, name))
    }
    return(NULL)
  }
  if (identical(names(formals(member))[1], "self")) {
    # Instance method: bind the receiver so callers write `obj$method(...)`.
    function(...) member(self, ...)
  } else {
    member
  }
}

#' @export
`[[.ExtractionConfig` <- `$.ExtractionConfig`

#' @export
needs_image_processing.ExtractionConfig <- function(x, ...) {
  # S3-style entry point: delegates to the instance method via `$` dispatch.
  x$needs_image_processing(...)
}
|
||
#' Per-file extraction configuration overrides for batch processing
|
||
#'
|
||
#' All fields are `Option<T>` — `None` means "use the batch-level default."
|
||
#' This type is used with `batch_extract_files` and
|
||
#' `batch_extract_bytes` to allow heterogeneous
|
||
#' extraction settings within a single batch.
|
||
#'
|
||
#' # Excluded Fields
|
||
#'
|
||
#' The following `ExtractionConfig` fields are batch-level only and
|
||
#' cannot be overridden per file:
|
||
#' - `max_concurrent_extractions` — controls batch parallelism
|
||
#' - `use_cache` — global caching policy
|
||
#' - `acceleration` — shared ONNX execution provider
|
||
#' - `security_limits` — global archive security policy
|
||
#' @field enable_quality_processing Override quality post-processing for this file.
|
||
#' @field ocr Override OCR configuration for this file (None in the Option = use batch default).
|
||
#' @field force_ocr Override force OCR for this file.
|
||
#' @field force_ocr_pages Override force OCR pages for this file (1-indexed page numbers).
|
||
#' @field disable_ocr Override disable OCR for this file.
|
||
#' @field chunking Override chunking configuration for this file.
|
||
#' @field content_filter Override content filtering configuration for this file.
|
||
#' @field images Override image extraction configuration for this file.
|
||
#' @field pdf_options Override PDF options for this file.
|
||
#' @field token_reduction Override token reduction for this file.
|
||
#' @field language_detection Override language detection for this file.
|
||
#' @field pages Override page extraction for this file.
|
||
#' @field keywords Override keyword extraction for this file.
|
||
#' @field postprocessor Override post-processor for this file.
|
||
#' @field html_options Override HTML conversion options for this file.
|
||
#' @field result_format Override result format for this file.
|
||
#' @field output_format Override output content format for this file.
|
||
#' @field include_document_structure Override document structure output for this file.
|
||
#' @field layout Override layout detection for this file.
|
||
#' @field timeout_secs Override per-file extraction timeout in seconds.
|
||
#' @field tree_sitter Override tree-sitter configuration for this file.
|
||
#' @field structured_extraction Override structured extraction configuration for this file.
|
||
#' @export
|
||
FileExtractionConfig <- new.env(parent = emptyenv())

# Constructor: passes `json` to the native
# `wrap__FileExtractionConfig__from_json` routine.
FileExtractionConfig$from_json <- function(json) {
  .Call("wrap__FileExtractionConfig__from_json", json, PACKAGE = "kreuzberg")
}

#' @export
`$.FileExtractionConfig` <- function(self, name) {
  # Members stored in the `FileExtractionConfig` environment take precedence.
  # Environments only accept a single string key, so any other index goes
  # straight to the data fallback below.
  member <- NULL
  if (is.character(name) && length(name) == 1L) {
    member <- FileExtractionConfig[[name]]
  }
  if (is.null(member)) {
    # Not a class member: read the value from the object itself when it is
    # list-backed. `.subset2()` bypasses S3 dispatch, so this cannot recurse
    # into `[[.FileExtractionConfig` (an alias of this function). Unknown
    # names still yield NULL, as before.
    if (is.list(self)) {
      return(.subset2(self, name))
    }
    return(NULL)
  }
  if (identical(names(formals(member))[1], "self")) {
    # Instance method: bind the receiver so callers write `obj$method(...)`.
    function(...) member(self, ...)
  } else {
    member
  }
}

#' @export
`[[.FileExtractionConfig` <- `$.FileExtractionConfig`
|
||
#' Batch item for byte array extraction
#'
#' Used with `batch_extract_bytes` and `batch_extract_bytes_sync`
#' to represent a single item in a batch extraction job.
#' @field content The content bytes to extract from
#' @field mime_type MIME type of the content (e.g., "application/pdf", "text/html")
#' @field config Per-item configuration overrides (None uses batch-level defaults)
#' @export
BatchBytesItem <- new.env(parent = emptyenv())

#' @export
`$.BatchBytesItem` <- function(self, name) {
  # Members live on the class environment; an unknown `name` resolves to NULL.
  member <- BatchBytesItem[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # Not an instance method (or NULL): return it as is.
    return(member)
  }
  # Instance method: bind the receiver so `obj$method(...)` omits `self`.
  function(...) member(self, ...)
}

#' @export
`[[.BatchBytesItem` <- `$.BatchBytesItem`
#' Batch item for file extraction
#'
#' Used with `batch_extract_files` and `batch_extract_files_sync`
#' to represent a single file in a batch extraction job.
#' @field path Path to the file to extract from
#' @field config Per-file configuration overrides (None uses batch-level defaults)
#' @export
BatchFileItem <- new.env(parent = emptyenv())

#' @export
`$.BatchFileItem` <- function(self, name) {
  # Members live on the class environment; an unknown `name` resolves to NULL.
  member <- BatchFileItem[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # Not an instance method (or NULL): return it as is.
    return(member)
  }
  # Instance method: bind the receiver so `obj$method(...)` omits `self`.
  function(...) member(self, ...)
}

#' @export
`[[.BatchFileItem` <- `$.BatchFileItem`
#' Image extraction configuration
#' @field extract_images Extract images from documents
#' @field target_dpi Target DPI for image normalization
#' @field max_image_dimension Maximum dimension for images (width or height)
#' @field inject_placeholders Whether to inject image reference placeholders into markdown output. When `true`
#' @field auto_adjust_dpi Automatically adjust DPI based on image content
#' @field min_dpi Minimum DPI threshold
#' @field max_dpi Maximum DPI threshold
#' @field max_images_per_page Maximum number of image objects to extract per PDF page.
#' @field classify When `true` (default), extracted images are classified by kind and grouped into clusters where they
#' @field include_page_rasters When `true`, full-page renders produced during OCR preprocessing are captured and
#' @field run_ocr_on_images Run OCR on extracted images and include the recognized text in the document content.
#' @field ocr_text_only When `true`, image OCR results are rendered as plain text without the `` markdown
#' @field append_ocr_text When `true` and `ocr_text_only` is `false`, append the OCR text after the image placeholder
#' @export
ImageExtractionConfig <- new.env(parent = emptyenv())

# Constructors: each forwards to the native routine of the same name in the
# kreuzberg shared library and returns whatever that routine returns.
ImageExtractionConfig$default <- function() {
  .Call("wrap__ImageExtractionConfig__default", PACKAGE = "kreuzberg")
}

ImageExtractionConfig$from_json <- function(json) {
  .Call("wrap__ImageExtractionConfig__from_json", json, PACKAGE = "kreuzberg")
}

#' @export
`$.ImageExtractionConfig` <- function(self, name) {
  # Members live on the class environment; an unknown `name` resolves to NULL.
  member <- ImageExtractionConfig[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # Not an instance method (e.g. a constructor, or NULL): return it as is.
    return(member)
  }
  # Instance method: bind the receiver so `obj$method(...)` omits `self`.
  function(...) member(self, ...)
}

#' @export
`[[.ImageExtractionConfig` <- `$.ImageExtractionConfig`
#' Token reduction configuration
#' @field mode Reduction mode: "off", "light", "moderate", "aggressive", "maximum"
#' @field preserve_important_words Preserve important words (capitalized, technical terms)
#' @export
TokenReductionOptions <- new.env(parent = emptyenv())

# Constructors: each forwards to the native routine of the same name in the
# kreuzberg shared library and returns whatever that routine returns.
TokenReductionOptions$default <- function() {
  .Call("wrap__TokenReductionOptions__default", PACKAGE = "kreuzberg")
}

TokenReductionOptions$from_json <- function(json) {
  .Call("wrap__TokenReductionOptions__from_json", json, PACKAGE = "kreuzberg")
}

#' @export
`$.TokenReductionOptions` <- function(self, name) {
  # Members live on the class environment; an unknown `name` resolves to NULL.
  member <- TokenReductionOptions[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # Not an instance method (e.g. a constructor, or NULL): return it as is.
    return(member)
  }
  # Instance method: bind the receiver so `obj$method(...)` omits `self`.
  function(...) member(self, ...)
}

#' @export
`[[.TokenReductionOptions` <- `$.TokenReductionOptions`
#' Language detection configuration
#' @field enabled Enable language detection
#' @field min_confidence Minimum confidence threshold (0.0-1.0)
#' @field detect_multiple Detect multiple languages in the document
#' @export
LanguageDetectionConfig <- new.env(parent = emptyenv())

# Constructors: each forwards to the native routine of the same name in the
# kreuzberg shared library and returns whatever that routine returns.
LanguageDetectionConfig$default <- function() {
  .Call("wrap__LanguageDetectionConfig__default", PACKAGE = "kreuzberg")
}

LanguageDetectionConfig$from_json <- function(json) {
  .Call("wrap__LanguageDetectionConfig__from_json", json, PACKAGE = "kreuzberg")
}

#' @export
`$.LanguageDetectionConfig` <- function(self, name) {
  # Members live on the class environment; an unknown `name` resolves to NULL.
  member <- LanguageDetectionConfig[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # Not an instance method (e.g. a constructor, or NULL): return it as is.
    return(member)
  }
  # Instance method: bind the receiver so `obj$method(...)` omits `self`.
  function(...) member(self, ...)
}

#' @export
`[[.LanguageDetectionConfig` <- `$.LanguageDetectionConfig`
#' Configuration for styled HTML output
#'
#' When set on `ExtractionConfig::html_output` alongside
#' `output_format = OutputFormat::Html`, the pipeline builds a
#' `StyledHtmlRenderer` (Rust: `crate::rendering::StyledHtmlRenderer`) instead of
#' the plain comrak-based renderer.
#' @field css Inline CSS string injected into the output after the theme stylesheet. Concatenated after `css_file`
#' @field css_file Path to a CSS file loaded once at renderer construction time. Concatenated before `css` when both
#' @field theme Built-in colour/typography theme. Default: `HtmlTheme::Unstyled`.
#' @field class_prefix CSS class prefix applied to every emitted class name.
#' @field embed_css When `true` (default), write the resolved CSS into a `<style>` block immediately after the opening
#' @export
HtmlOutputConfig <- new.env(parent = emptyenv())

# Constructors: each forwards to the native routine of the same name in the
# kreuzberg shared library and returns whatever that routine returns.
HtmlOutputConfig$default <- function() {
  .Call("wrap__HtmlOutputConfig__default", PACKAGE = "kreuzberg")
}

HtmlOutputConfig$from_json <- function(json) {
  .Call("wrap__HtmlOutputConfig__from_json", json, PACKAGE = "kreuzberg")
}

#' @export
`$.HtmlOutputConfig` <- function(self, name) {
  # Members live on the class environment; an unknown `name` resolves to NULL.
  member <- HtmlOutputConfig[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # Not an instance method (e.g. a constructor, or NULL): return it as is.
    return(member)
  }
  # Instance method: bind the receiver so `obj$method(...)` omits `self`.
  function(...) member(self, ...)
}

#' @export
`[[.HtmlOutputConfig` <- `$.HtmlOutputConfig`
#' Layout detection configuration
#'
#' Controls layout detection behavior in the extraction pipeline.
#' When set on `ExtractionConfig`, layout detection
#' is enabled for PDF extraction.
#' @field confidence_threshold Confidence threshold override (None = use model default).
#' @field apply_heuristics Whether to apply postprocessing heuristics (default: true).
#' @field table_model Table structure recognition model.
#' @field acceleration Hardware acceleration for ONNX models (layout detection + table structure).
#' @export
LayoutDetectionConfig <- new.env(parent = emptyenv())

# Constructors: each forwards to the native routine of the same name in the
# kreuzberg shared library and returns whatever that routine returns.
LayoutDetectionConfig$default <- function() {
  .Call("wrap__LayoutDetectionConfig__default", PACKAGE = "kreuzberg")
}

LayoutDetectionConfig$from_json <- function(json) {
  .Call("wrap__LayoutDetectionConfig__from_json", json, PACKAGE = "kreuzberg")
}

#' @export
`$.LayoutDetectionConfig` <- function(self, name) {
  # Members live on the class environment; an unknown `name` resolves to NULL.
  member <- LayoutDetectionConfig[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # Not an instance method (e.g. a constructor, or NULL): return it as is.
    return(member)
  }
  # Instance method: bind the receiver so `obj$method(...)` omits `self`.
  function(...) member(self, ...)
}

#' @export
`[[.LayoutDetectionConfig` <- `$.LayoutDetectionConfig`
#' Configuration for an LLM provider/model via liter-llm
#'
#' Each feature (VLM OCR, VLM embeddings, structured extraction) carries
#' its own `LlmConfig`, allowing different providers per feature.
#' @field model Provider/model string using liter-llm routing format.
#' @field api_key API key for the provider. When `None`, liter-llm falls back to the provider's standard environment
#' @field base_url Custom base URL override for the provider endpoint.
#' @field timeout_secs Request timeout in seconds (default: 60).
#' @field max_retries Maximum retry attempts (default: 3).
#' @field temperature Sampling temperature for generation tasks.
#' @field max_tokens Maximum tokens to generate.
#' @export
LlmConfig <- new.env(parent = emptyenv())

# Forwards `json` unchanged to the native routine of the same name in the
# kreuzberg shared library and returns whatever that routine returns.
LlmConfig$from_json <- function(json) {
  .Call("wrap__LlmConfig__from_json", json, PACKAGE = "kreuzberg")
}

#' @export
`$.LlmConfig` <- function(self, name) {
  # Members live on the class environment; an unknown `name` resolves to NULL.
  member <- LlmConfig[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # Not an instance method (e.g. a constructor, or NULL): return it as is.
    return(member)
  }
  # Instance method: bind the receiver so `obj$method(...)` omits `self`.
  function(...) member(self, ...)
}

#' @export
`[[.LlmConfig` <- `$.LlmConfig`
#' Configuration for LLM-based structured data extraction
#'
#' Sends extracted document content to a VLM with a JSON schema,
#' returning structured data that conforms to the schema.
#' @field schema JSON Schema defining the desired output structure.
#' @field schema_name Schema name passed to the LLM's structured output mode.
#' @field schema_description Optional schema description for the LLM.
#' @field strict Enable strict mode — output must exactly match the schema.
#' @field prompt Custom Jinja2 extraction prompt template. When `None`, a default template is used.
#' @field llm LLM configuration for the extraction.
#' @export
StructuredExtractionConfig <- new.env(parent = emptyenv())

#' @export
`$.StructuredExtractionConfig` <- function(self, name) {
  # Members live on the class environment; an unknown `name` resolves to NULL.
  member <- StructuredExtractionConfig[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # Not an instance method (or NULL): return it as is.
    return(member)
  }
  # Instance method: bind the receiver so `obj$method(...)` omits `self`.
  function(...) member(self, ...)
}

#' @export
`[[.StructuredExtractionConfig` <- `$.StructuredExtractionConfig`
#' Quality thresholds for OCR fallback decisions and pipeline quality gating
#'
#' All fields default to the values that match the previous hardcoded behavior,
#' so `OcrQualityThresholds::default()` preserves existing semantics exactly.
#' @field min_total_non_whitespace Minimum total non-whitespace characters to consider text substantive.
#' @field min_non_whitespace_per_page Minimum non-whitespace characters per page on average.
#' @field min_meaningful_word_len Minimum character count for a word to be "meaningful".
#' @field min_meaningful_words Minimum count of meaningful words before text is accepted.
#' @field min_alnum_ratio Minimum alphanumeric ratio (non-whitespace chars that are alphanumeric).
#' @field min_garbage_chars Minimum Unicode replacement characters (U+FFFD) to trigger OCR fallback.
#' @field max_fragmented_word_ratio Maximum fraction of short (1-2 char) words before text is considered fragmented.
#' @field critical_fragmented_word_ratio Critical fragmentation threshold — triggers OCR regardless of meaningful
#' @field min_avg_word_length Minimum average word length. Below this with enough words indicates garbled extraction.
#' @field min_words_for_avg_length_check Minimum word count before average word length check applies.
#' @field min_consecutive_repeat_ratio Minimum consecutive word repetition ratio to detect column scrambling.
#' @field min_words_for_repeat_check Minimum word count before consecutive repetition check is applied.
#' @field substantive_min_chars Minimum character count for "substantive markdown" OCR skip gate.
#' @field non_text_min_chars Minimum character count for "non-text content" OCR skip gate.
#' @field alnum_ws_ratio_threshold Alphanumeric+whitespace ratio threshold for skip decisions.
#' @field pipeline_min_quality Minimum quality score (0.0-1.0) for a pipeline stage result to be accepted. If the
#' @export
OcrQualityThresholds <- new.env(parent = emptyenv())

# Constructors: each forwards to the native routine of the same name in the
# kreuzberg shared library and returns whatever that routine returns.
OcrQualityThresholds$default <- function() {
  .Call("wrap__OcrQualityThresholds__default", PACKAGE = "kreuzberg")
}

OcrQualityThresholds$from_json <- function(json) {
  .Call("wrap__OcrQualityThresholds__from_json", json, PACKAGE = "kreuzberg")
}

#' @export
`$.OcrQualityThresholds` <- function(self, name) {
  # Members live on the class environment; an unknown `name` resolves to NULL.
  member <- OcrQualityThresholds[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # Not an instance method (e.g. a constructor, or NULL): return it as is.
    return(member)
  }
  # Instance method: bind the receiver so `obj$method(...)` omits `self`.
  function(...) member(self, ...)
}

#' @export
`[[.OcrQualityThresholds` <- `$.OcrQualityThresholds`
#' A single backend stage in the OCR pipeline
#' @field backend Backend name: "tesseract", "paddleocr", "easyocr", or a custom registered name.
#' @field priority Priority weight (higher = tried first). Stages are sorted by priority descending.
#' @field language Language override for this stage (None = use parent OcrConfig.language).
#' @field tesseract_config Tesseract-specific config override for this stage.
#' @field paddle_ocr_config PaddleOCR-specific config for this stage.
#' @field vlm_config VLM config override for this pipeline stage.
#' @field backend_options Arbitrary per-call options passed through to the backend unchanged.
#' @export
OcrPipelineStage <- new.env(parent = emptyenv())

#' @export
`$.OcrPipelineStage` <- function(self, name) {
  # Members live on the class environment; an unknown `name` resolves to NULL.
  member <- OcrPipelineStage[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # Not an instance method (or NULL): return it as is.
    return(member)
  }
  # Instance method: bind the receiver so `obj$method(...)` omits `self`.
  function(...) member(self, ...)
}

#' @export
`[[.OcrPipelineStage` <- `$.OcrPipelineStage`
#' OCR configuration
#' @field enabled Whether OCR is enabled.
#' @field backend OCR backend: tesseract, easyocr, paddleocr
#' @field language Language code (e.g., "eng", "deu")
#' @field tesseract_config Tesseract-specific configuration (optional)
#' @field output_format Output format for OCR results (optional, for format conversion)
#' @field paddle_ocr_config PaddleOCR-specific configuration (optional, JSON passthrough)
#' @field backend_options Arbitrary per-call options passed through to the backend unchanged.
#' @field element_config OCR element extraction configuration
#' @field quality_thresholds Quality thresholds for the native-text-to-OCR fallback decision. When None, uses compiled
#' @field pipeline Multi-backend OCR pipeline configuration. When set, enables weighted fallback across multiple OCR
#' @field auto_rotate Enable automatic page rotation based on orientation detection.
#' @field vlm_config VLM (Vision Language Model) OCR configuration.
#' @field vlm_prompt Custom Jinja2 prompt template for VLM OCR.
#' @field acceleration Hardware acceleration for ONNX Runtime models (e.g. PaddleOCR, layout detection).
#' @field tessdata_bytes Caller-supplied Tesseract `traineddata` bytes per language code.
#' @export
OcrConfig <- new.env(parent = emptyenv())

# Constructors: each forwards to the native routine of the same name in the
# kreuzberg shared library and returns whatever that routine returns.
OcrConfig$default <- function() {
  .Call("wrap__OcrConfig__default", PACKAGE = "kreuzberg")
}

OcrConfig$from_json <- function(json) {
  .Call("wrap__OcrConfig__from_json", json, PACKAGE = "kreuzberg")
}

#' @export
`$.OcrConfig` <- function(self, name) {
  # Members live on the class environment; an unknown `name` resolves to NULL.
  member <- OcrConfig[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # Not an instance method (e.g. a constructor, or NULL): return it as is.
    return(member)
  }
  # Instance method: bind the receiver so `obj$method(...)` omits `self`.
  function(...) member(self, ...)
}

#' @export
`[[.OcrConfig` <- `$.OcrConfig`
#' Page extraction and tracking configuration
#'
#' Controls how pages are extracted, tracked, and represented in the extraction results.
#' When `None`, page tracking is disabled.
#'
#' Page range tracking in chunk metadata (first_page/last_page) is automatically enabled
#' when page boundaries are available and chunking is configured.
#' @field extract_pages Extract pages as separate array (ExtractionResult.pages)
#' @field insert_page_markers Insert page markers in main content string
#' @field marker_format Page marker format (use `{page_num}` placeholder) Default: `"\n\n<!-- PAGE {page_num} -->\n\n"`
#' @export
PageConfig <- new.env(parent = emptyenv())

# Constructors: each forwards to the native routine of the same name in the
# kreuzberg shared library and returns whatever that routine returns.
PageConfig$default <- function() {
  .Call("wrap__PageConfig__default", PACKAGE = "kreuzberg")
}

PageConfig$from_json <- function(json) {
  .Call("wrap__PageConfig__from_json", json, PACKAGE = "kreuzberg")
}

#' @export
`$.PageConfig` <- function(self, name) {
  # Members live on the class environment; an unknown `name` resolves to NULL.
  member <- PageConfig[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # Not an instance method (e.g. a constructor, or NULL): return it as is.
    return(member)
  }
  # Instance method: bind the receiver so `obj$method(...)` omits `self`.
  function(...) member(self, ...)
}

#' @export
`[[.PageConfig` <- `$.PageConfig`
#' PDF-specific configuration
#' @field extract_images Extract images from PDF
#' @field extract_tables Extract tables from PDF.
#' @field passwords List of passwords to try when opening encrypted PDFs
#' @field extract_metadata Extract PDF metadata
#' @field hierarchy Hierarchy extraction configuration (None = hierarchy extraction disabled)
#' @field extract_annotations Extract PDF annotations (text notes, highlights, links, stamps). Default: false
#' @field top_margin_fraction Top margin fraction (0.0–1.0) of page height to exclude headers/running heads. Default:
#' @field bottom_margin_fraction Bottom margin fraction (0.0–1.0) of page height to exclude footers/page numbers.
#' @field allow_single_column_tables Allow single-column pseudo tables in extraction results.
#' @field ocr_inline_images Perform OCR on inline images extracted from PDF pages and attach the recognized text to
#' @export
PdfConfig <- new.env(parent = emptyenv())

# Constructors: each forwards to the native routine of the same name in the
# kreuzberg shared library and returns whatever that routine returns.
PdfConfig$default <- function() {
  .Call("wrap__PdfConfig__default", PACKAGE = "kreuzberg")
}

PdfConfig$from_json <- function(json) {
  .Call("wrap__PdfConfig__from_json", json, PACKAGE = "kreuzberg")
}

#' @export
`$.PdfConfig` <- function(self, name) {
  # Members live on the class environment; an unknown `name` resolves to NULL.
  member <- PdfConfig[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # Not an instance method (e.g. a constructor, or NULL): return it as is.
    return(member)
  }
  # Instance method: bind the receiver so `obj$method(...)` omits `self`.
  function(...) member(self, ...)
}

#' @export
`[[.PdfConfig` <- `$.PdfConfig`
#' Hierarchy extraction configuration for PDF text structure analysis
#'
#' Enables extraction of document hierarchy levels (H1-H6) based on font size
#' clustering and semantic analysis. When enabled, hierarchical blocks are
#' included in page content.
#' @field enabled Enable hierarchy extraction
#' @field k_clusters Number of font size clusters to use for hierarchy levels (1-7)
#' @field include_bbox Include bounding box information in hierarchy blocks
#' @field ocr_coverage_threshold OCR coverage threshold for smart OCR triggering (0.0-1.0)
#' @export
HierarchyConfig <- new.env(parent = emptyenv())

# Constructors: each forwards to the native routine of the same name in the
# kreuzberg shared library and returns whatever that routine returns.
HierarchyConfig$default <- function() {
  .Call("wrap__HierarchyConfig__default", PACKAGE = "kreuzberg")
}

HierarchyConfig$from_json <- function(json) {
  .Call("wrap__HierarchyConfig__from_json", json, PACKAGE = "kreuzberg")
}

#' @export
`$.HierarchyConfig` <- function(self, name) {
  # Members live on the class environment; an unknown `name` resolves to NULL.
  member <- HierarchyConfig[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # Not an instance method (e.g. a constructor, or NULL): return it as is.
    return(member)
  }
  # Instance method: bind the receiver so `obj$method(...)` omits `self`.
  function(...) member(self, ...)
}

#' @export
`[[.HierarchyConfig` <- `$.HierarchyConfig`
#' Post-processor configuration
#' @field enabled Enable post-processors
#' @field enabled_processors Whitelist of processor names to run (None = all enabled)
#' @field disabled_processors Blacklist of processor names to skip (None = none disabled)
#' @field enabled_set Pre-computed AHashSet for O(1) enabled processor lookup
#' @field disabled_set Pre-computed AHashSet for O(1) disabled processor lookup
#' @export
PostProcessorConfig <- new.env(parent = emptyenv())

# Constructors: each forwards to the native routine of the same name in the
# kreuzberg shared library and returns whatever that routine returns.
PostProcessorConfig$default <- function() {
  .Call("wrap__PostProcessorConfig__default", PACKAGE = "kreuzberg")
}

PostProcessorConfig$from_json <- function(json) {
  .Call("wrap__PostProcessorConfig__from_json", json, PACKAGE = "kreuzberg")
}

#' @export
`$.PostProcessorConfig` <- function(self, name) {
  # Members live on the class environment; an unknown `name` resolves to NULL.
  member <- PostProcessorConfig[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # Not an instance method (e.g. a constructor, or NULL): return it as is.
    return(member)
  }
  # Instance method: bind the receiver so `obj$method(...)` omits `self`.
  function(...) member(self, ...)
}

#' @export
`[[.PostProcessorConfig` <- `$.PostProcessorConfig`
#' Chunking configuration
#'
#' Configures text chunking for document content, including chunk size,
#' overlap, trimming behavior, and optional embeddings.
#'
#' Use `..Default::default()` when constructing to allow for future field additions:
#' ```rust
#' let config = ChunkingConfig {
#'     max_characters: 500,
#'     ..Default::default()
#' };
#' ```
#' @field max_characters Maximum size per chunk (in units determined by `sizing`).
#' @field overlap Overlap between chunks (in units determined by `sizing`).
#' @field trim Whether to trim whitespace from chunk boundaries.
#' @field chunker_type Type of chunker to use (Text or Markdown).
#' @field embedding Optional embedding configuration for chunk embeddings.
#' @field preset Use a preset configuration (overrides individual settings if provided).
#' @field sizing How to measure chunk size.
#' @field prepend_heading_context When `true` and `chunker_type` is `Markdown`, prepend the heading hierarchy path
#' @field topic_threshold Optional cosine similarity threshold for semantic topic boundary detection.
#' @export
ChunkingConfig <- new.env(parent = emptyenv())

# Constructors: each forwards to the native routine of the same name in the
# kreuzberg shared library and returns whatever that routine returns.
ChunkingConfig$default <- function() {
  .Call("wrap__ChunkingConfig__default", PACKAGE = "kreuzberg")
}

ChunkingConfig$from_json <- function(json) {
  .Call("wrap__ChunkingConfig__from_json", json, PACKAGE = "kreuzberg")
}

#' @export
`$.ChunkingConfig` <- function(self, name) {
  # Members live on the class environment; an unknown `name` resolves to NULL.
  member <- ChunkingConfig[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # Not an instance method (e.g. a constructor, or NULL): return it as is.
    return(member)
  }
  # Instance method: bind the receiver so `obj$method(...)` omits `self`.
  function(...) member(self, ...)
}

#' @export
`[[.ChunkingConfig` <- `$.ChunkingConfig`
#' Embedding configuration for text chunks
#'
#' Configures embedding generation using ONNX models via the vendored embedding engine.
#' Requires the `embeddings` feature to be enabled.
#' @field model The embedding model to use (defaults to "balanced" preset if not specified)
#' @field normalize Whether to normalize embedding vectors (recommended for cosine similarity)
#' @field batch_size Batch size for embedding generation
#' @field show_download_progress Show model download progress
#' @field cache_dir Custom cache directory for model files
#' @field acceleration Hardware acceleration for the embedding ONNX model.
#' @field max_embed_duration_secs Maximum wall-clock duration (in seconds) for a single `embed()` call when using
#' @export
EmbeddingConfig <- new.env(parent = emptyenv())

# Constructors: each forwards to the native routine of the same name in the
# kreuzberg shared library and returns whatever that routine returns.
EmbeddingConfig$default <- function() {
  .Call("wrap__EmbeddingConfig__default", PACKAGE = "kreuzberg")
}

EmbeddingConfig$from_json <- function(json) {
  .Call("wrap__EmbeddingConfig__from_json", json, PACKAGE = "kreuzberg")
}

#' @export
`$.EmbeddingConfig` <- function(self, name) {
  # Members live on the class environment; an unknown `name` resolves to NULL.
  member <- EmbeddingConfig[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # Not an instance method (e.g. a constructor, or NULL): return it as is.
    return(member)
  }
  # Instance method: bind the receiver so `obj$method(...)` omits `self`.
  function(...) member(self, ...)
}

#' @export
`[[.EmbeddingConfig` <- `$.EmbeddingConfig`
#' Configuration for tree-sitter language pack integration
#'
#' Controls grammar download behavior and code analysis options.
#'
#' # Example (TOML)
#'
#' ```toml
#' [tree_sitter]
#' languages = ["python", "rust"]
#' groups = ["web"]
#'
#' [tree_sitter.process]
#' structure = true
#' comments = true
#' docstrings = true
#' ```
#' @field enabled Enable code intelligence processing (default: true).
#' @field cache_dir Custom cache directory for downloaded grammars.
#' @field languages Languages to pre-download on init (e.g., `["python", "rust"]`).
#' @field groups Language groups to pre-download (e.g., `["web", "systems", "scripting"]`).
#' @field process Processing options for code analysis.
#' @export
TreeSitterConfig <- new.env(parent = emptyenv())

# Constructors: each forwards to the native routine of the same name in the
# kreuzberg shared library and returns whatever that routine returns.
TreeSitterConfig$default <- function() {
  .Call("wrap__TreeSitterConfig__default", PACKAGE = "kreuzberg")
}

TreeSitterConfig$from_json <- function(json) {
  .Call("wrap__TreeSitterConfig__from_json", json, PACKAGE = "kreuzberg")
}

#' @export
`$.TreeSitterConfig` <- function(self, name) {
  # Members live on the class environment; an unknown `name` resolves to NULL.
  member <- TreeSitterConfig[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # Not an instance method (e.g. a constructor, or NULL): return it as is.
    return(member)
  }
  # Instance method: bind the receiver so `obj$method(...)` omits `self`.
  function(...) member(self, ...)
}

#' @export
`[[.TreeSitterConfig` <- `$.TreeSitterConfig`
#' Processing options for tree-sitter code analysis
#'
#' Controls which analysis features are enabled when extracting code files.
#' @field structure Extract structural items (functions, classes, structs, etc.). Default: true.
#' @field imports Extract import statements. Default: true.
#' @field exports Extract export statements. Default: true.
#' @field comments Extract comments. Default: false.
#' @field docstrings Extract docstrings. Default: false.
#' @field symbols Extract symbol definitions. Default: false.
#' @field diagnostics Include parse diagnostics. Default: false.
#' @field chunk_max_size Maximum chunk size in bytes. `None` disables chunking.
#' @field content_mode Content rendering mode for code extraction.
#' @export
TreeSitterProcessConfig <- new.env(parent = emptyenv())

# Constructors: each forwards to the native routine of the same name in the
# kreuzberg shared library and returns whatever that routine returns.
TreeSitterProcessConfig$default <- function() {
  .Call("wrap__TreeSitterProcessConfig__default", PACKAGE = "kreuzberg")
}

TreeSitterProcessConfig$from_json <- function(json) {
  .Call("wrap__TreeSitterProcessConfig__from_json", json, PACKAGE = "kreuzberg")
}

#' @export
`$.TreeSitterProcessConfig` <- function(self, name) {
  # Members live on the class environment; an unknown `name` resolves to NULL.
  member <- TreeSitterProcessConfig[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # Not an instance method (e.g. a constructor, or NULL): return it as is.
    return(member)
  }
  # Instance method: bind the receiver so `obj$method(...)` omits `self`.
  function(...) member(self, ...)
}

#' @export
`[[.TreeSitterProcessConfig` <- `$.TreeSitterProcessConfig`
#' A supported document format entry
#'
#' Represents a file extension and its corresponding MIME type that Kreuzberg can process.
#' @field extension File extension (without leading dot), e.g., "pdf", "docx"
#' @field mime_type MIME type string, e.g., "application/pdf"
#' @export
SupportedFormat <- new.env(parent = emptyenv())

#' @export
`$.SupportedFormat` <- function(self, name) {
  # Members live on the class environment; an unknown `name` resolves to NULL.
  member <- SupportedFormat[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # Not an instance method (or NULL): return it as is.
    return(member)
  }
  # Instance method: bind the receiver so `obj$method(...)` omits `self`.
  function(...) member(self, ...)
}

#' @export
`[[.SupportedFormat` <- `$.SupportedFormat`
#' API server configuration
#'
#' This struct holds all configuration options for the Kreuzberg API server,
#' including host/port settings, CORS configuration, and upload limits.
#'
#' # Defaults
#'
#' - `host`: "127.0.0.1" (localhost only)
#' - `port`: 8000
#' - `cors_origins`: empty vector (allows all origins)
#' - `max_request_body_bytes`: 104_857_600 (100 MB)
#' - `max_multipart_field_bytes`: 104_857_600 (100 MB)
#' @field host Server host address (e.g., "127.0.0.1", "0.0.0.0")
#' @field port Server port number
#' @field cors_origins CORS allowed origins. Empty vector means allow all origins.
#' @field max_request_body_bytes Maximum size of request body in bytes (default: 100 MB)
#' @field max_multipart_field_bytes Maximum size of multipart fields in bytes (default: 100 MB)
#' @export
ServerConfig <- new.env(parent = emptyenv())

# Constructors: each forwards to the native routine of the same name in the
# kreuzberg shared library and returns whatever that routine returns.
ServerConfig$default <- function() {
  .Call("wrap__ServerConfig__default", PACKAGE = "kreuzberg")
}

ServerConfig$from_json <- function(json) {
  .Call("wrap__ServerConfig__from_json", json, PACKAGE = "kreuzberg")
}

# Instance methods: `self` is passed straight through to the native routine,
# so the `$` accessor below can bind it for `obj$method(...)` calls.
ServerConfig$listen_addr <- function(self) {
  .Call("wrap__ServerConfig__listen_addr", self, PACKAGE = "kreuzberg")
}

ServerConfig$cors_allows_all <- function(self) {
  .Call("wrap__ServerConfig__cors_allows_all", self, PACKAGE = "kreuzberg")
}

ServerConfig$is_origin_allowed <- function(self, origin) {
  .Call(
    "wrap__ServerConfig__is_origin_allowed",
    self,
    origin,
    PACKAGE = "kreuzberg"
  )
}

ServerConfig$max_request_body_mb <- function(self) {
  .Call("wrap__ServerConfig__max_request_body_mb", self, PACKAGE = "kreuzberg")
}

ServerConfig$max_multipart_field_mb <- function(self) {
  .Call(
    "wrap__ServerConfig__max_multipart_field_mb",
    self,
    PACKAGE = "kreuzberg"
  )
}

#' @export
`$.ServerConfig` <- function(self, name) {
  # Members live on the class environment; an unknown `name` resolves to NULL.
  member <- ServerConfig[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # Not an instance method (e.g. a constructor, or NULL): return it as is.
    return(member)
  }
  # Instance method: bind the receiver so `obj$method(...)` omits `self`.
  function(...) member(self, ...)
}

#' @export
`[[.ServerConfig` <- `$.ServerConfig`

# S3-style entry points: each forwards to the `$` method of the same name on
# `x`, passing any extra arguments along. The matching generics are not
# defined in this part of the file.
#' @export
listen_addr.ServerConfig <- function(x, ...) {
  x$listen_addr(...)
}

#' @export
cors_allows_all.ServerConfig <- function(x, ...) {
  x$cors_allows_all(...)
}

#' @export
is_origin_allowed.ServerConfig <- function(x, ...) {
  x$is_origin_allowed(...)
}

#' @export
max_request_body_mb.ServerConfig <- function(x, ...) {
  x$max_request_body_mb(...)
}

#' @export
max_multipart_field_mb.ServerConfig <- function(x, ...) {
  x$max_multipart_field_mb(...)
}
#' StructuredDataResult
#' @field content content
#' @field format format
#' @field metadata metadata
#' @field text_fields text_fields
#' @export
StructuredDataResult <- new.env(parent = emptyenv())

#' @export
`$.StructuredDataResult` <- function(self, name) {
  # Members live on the class environment; an unknown `name` resolves to NULL.
  member <- StructuredDataResult[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # Not an instance method (or NULL): return it as is.
    return(member)
  }
  # Instance method: bind the receiver so `obj$method(...)` omits `self`.
  function(...) member(self, ...)
}

#' @export
`[[.StructuredDataResult` <- `$.StructuredDataResult`
#' Application properties from docProps/app.xml for DOCX
#'
#' Contains Word-specific document statistics and metadata.
#' @field application Application name (e.g., "Microsoft Office Word")
#' @field app_version Application version
#' @field template Template filename
#' @field total_time Total editing time in minutes
#' @field pages Number of pages
#' @field words Number of words
#' @field characters Number of characters (excluding spaces)
#' @field characters_with_spaces Number of characters (including spaces)
#' @field lines Number of lines
#' @field paragraphs Number of paragraphs
#' @field company Company name
#' @field doc_security Document security level
#' @field scale_crop Scale crop flag
#' @field links_up_to_date Links up to date flag
#' @field shared_doc Shared document flag
#' @field hyperlinks_changed Hyperlinks changed flag
#' @export
DocxAppProperties <- new.env(parent = emptyenv())

# Forwards `json` unchanged to the native routine of the same name in the
# kreuzberg shared library and returns whatever that routine returns.
DocxAppProperties$from_json <- function(json) {
  .Call("wrap__DocxAppProperties__from_json", json, PACKAGE = "kreuzberg")
}

#' @export
`$.DocxAppProperties` <- function(self, name) {
  # Members live on the class environment; an unknown `name` resolves to NULL.
  member <- DocxAppProperties[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    # Not an instance method (e.g. a constructor, or NULL): return it as is.
    return(member)
  }
  # Instance method: bind the receiver so `obj$method(...)` omits `self`.
  function(...) member(self, ...)
}

#' @export
`[[.DocxAppProperties` <- `$.DocxAppProperties`
#' Application properties from docProps/app.xml for XLSX
#'
#' Contains Excel-specific document metadata.
#' @field application Application name (e.g., "Microsoft Excel")
#' @field app_version Application version
#' @field doc_security Document security level
#' @field scale_crop Scale crop flag
#' @field links_up_to_date Links up to date flag
#' @field shared_doc Shared document flag
#' @field hyperlinks_changed Hyperlinks changed flag
#' @field company Company name
#' @field worksheet_names Worksheet names
#' @export
XlsxAppProperties <- new.env(parent = emptyenv())

#' @export
`$.XlsxAppProperties` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- XlsxAppProperties[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: bind `self` so callers pass only the remaining arguments.
  function(...) member(self, ...)
}

#' @export
`[[.XlsxAppProperties` <- `$.XlsxAppProperties`
|
||
#' Application properties from docProps/app.xml for PPTX
#'
#' Contains PowerPoint-specific document metadata.
#' @field application Application name (e.g., "Microsoft Office PowerPoint")
#' @field app_version Application version
#' @field total_time Total editing time in minutes
#' @field company Company name
#' @field doc_security Document security level
#' @field scale_crop Scale crop flag
#' @field links_up_to_date Links up to date flag
#' @field shared_doc Shared document flag
#' @field hyperlinks_changed Hyperlinks changed flag
#' @field slides Number of slides
#' @field notes Number of notes
#' @field hidden_slides Number of hidden slides
#' @field multimedia_clips Number of multimedia clips
#' @field presentation_format Presentation format (e.g., "Widescreen", "Standard")
#' @field slide_titles Slide titles
#' @export
PptxAppProperties <- new.env(parent = emptyenv())

#' @export
`$.PptxAppProperties` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- PptxAppProperties[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: bind `self` so callers pass only the remaining arguments.
  function(...) member(self, ...)
}

#' @export
`[[.PptxAppProperties` <- `$.PptxAppProperties`
|
||
#' Dublin Core metadata from docProps/core.xml
#'
#' Contains standard metadata fields defined by the Dublin Core standard
#' and Office-specific extensions.
#' @field title Document title
#' @field subject Document subject/topic
#' @field creator Document creator/author
#' @field keywords Keywords or tags
#' @field description Document description/abstract
#' @field last_modified_by User who last modified the document
#' @field revision Revision number
#' @field created Creation timestamp (ISO 8601)
#' @field modified Last modification timestamp (ISO 8601)
#' @field category Document category
#' @field content_status Content status (Draft, Final, etc.)
#' @field language Document language
#' @field identifier Unique identifier
#' @field version Document version
#' @field last_printed Last print timestamp (ISO 8601)
#' @export
CoreProperties <- new.env(parent = emptyenv())

# Hands `json` to the native `wrap__CoreProperties__from_json` routine.
CoreProperties$from_json <- function(json) {
  .Call("wrap__CoreProperties__from_json", json, PACKAGE = "kreuzberg")
}

#' @export
`$.CoreProperties` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- CoreProperties[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: bind `self` so callers pass only the remaining arguments.
  function(...) member(self, ...)
}

#' @export
`[[.CoreProperties` <- `$.CoreProperties`
|
||
#' Configuration for security limits across extractors
#'
#' All limits are intentionally conservative to prevent DoS attacks
#' while still supporting legitimate documents.
#' @field max_archive_size Maximum uncompressed size for archives (500 MB)
#' @field max_compression_ratio Maximum compression ratio before flagging as potential bomb (100:1)
#' @field max_files_in_archive Maximum number of files in archive (10,000)
#' @field max_nesting_depth Maximum nesting depth for structures (100)
#' @field max_entity_length Maximum length of any single XML entity /
#'   attribute / token (1 MiB). This is a per-token limit.
#' @field max_content_size Maximum string growth per document (100 MB)
#' @field max_iterations Maximum iterations per operation
#' @field max_xml_depth Maximum XML depth (100 levels)
#' @field max_table_cells Maximum cells per table (100,000)
#' @export
SecurityLimits <- new.env(parent = emptyenv())

# Calls the native `wrap__SecurityLimits__default` routine with no arguments.
SecurityLimits$default <- function() {
  .Call("wrap__SecurityLimits__default", PACKAGE = "kreuzberg")
}

# Hands `json` to the native `wrap__SecurityLimits__from_json` routine.
SecurityLimits$from_json <- function(json) {
  .Call("wrap__SecurityLimits__from_json", json, PACKAGE = "kreuzberg")
}

#' @export
`$.SecurityLimits` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- SecurityLimits[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: bind `self` so callers pass only the remaining arguments.
  function(...) member(self, ...)
}

#' @export
`[[.SecurityLimits` <- `$.SecurityLimits`
|
||
#' TokenReductionConfig
#' @field level level
#' @field language_hint language_hint
#' @field preserve_markdown preserve_markdown
#' @field preserve_code preserve_code
#' @field semantic_threshold semantic_threshold
#' @field enable_parallel enable_parallel
#' @field use_simd use_simd
#' @field custom_stopwords custom_stopwords
#' @field preserve_patterns preserve_patterns
#' @field target_reduction target_reduction
#' @field enable_semantic_clustering enable_semantic_clustering
#' @export
TokenReductionConfig <- new.env(parent = emptyenv())

# Calls the native `wrap__TokenReductionConfig__default` routine.
TokenReductionConfig$default <- function() {
  .Call("wrap__TokenReductionConfig__default", PACKAGE = "kreuzberg")
}

# Hands `json` to the native `wrap__TokenReductionConfig__from_json` routine.
TokenReductionConfig$from_json <- function(json) {
  .Call("wrap__TokenReductionConfig__from_json", json, PACKAGE = "kreuzberg")
}

#' @export
`$.TokenReductionConfig` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- TokenReductionConfig[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: bind `self` so callers pass only the remaining arguments.
  function(...) member(self, ...)
}

#' @export
`[[.TokenReductionConfig` <- `$.TokenReductionConfig`
|
||
#' A PDF annotation extracted from a document page
#' @field annotation_type The type of annotation.
#' @field content Text content of the annotation (e.g., comment text, link URL).
#' @field page_number Page number where the annotation appears (1-indexed).
#' @field bounding_box Bounding box of the annotation on the page.
#' @export
PdfAnnotation <- new.env(parent = emptyenv())

#' @export
`$.PdfAnnotation` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- PdfAnnotation[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: bind `self` so callers pass only the remaining arguments.
  function(...) member(self, ...)
}

#' @export
`[[.PdfAnnotation` <- `$.PdfAnnotation`
|
||
#' Inline element within a block
#'
#' Represents text with formatting, links, images, etc.
#' @field element_type Type of inline element
#' @field content Text content
#' @field attributes Element attributes
#' @field metadata Additional metadata (e.g., href for links, src/alt for images)
#' @export
InlineElement <- new.env(parent = emptyenv())

#' @export
`$.InlineElement` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- InlineElement[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: bind `self` so callers pass only the remaining arguments.
  function(...) member(self, ...)
}

#' @export
`[[.InlineElement` <- `$.InlineElement`
|
||
#' Image element in Djot
#' @field src Image source URL or path
#' @field alt Alternative text
#' @field title Optional title
#' @field attributes Element attributes
#' @export
DjotImage <- new.env(parent = emptyenv())

#' @export
`$.DjotImage` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- DjotImage[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: bind `self` so callers pass only the remaining arguments.
  function(...) member(self, ...)
}

#' @export
`[[.DjotImage` <- `$.DjotImage`
|
||
#' Link element in Djot
#' @field url Link URL
#' @field text Link text content
#' @field title Optional title
#' @field attributes Element attributes
#' @export
DjotLink <- new.env(parent = emptyenv())

#' @export
`$.DjotLink` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- DjotLink[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: bind `self` so callers pass only the remaining arguments.
  function(...) member(self, ...)
}

#' @export
`[[.DjotLink` <- `$.DjotLink`
|
||
#' A resolved relationship between two nodes in the document tree
#' @field source Source node index (the referencing node).
#' @field target Target node index (the referenced node).
#' @field kind Semantic kind of the relationship.
#' @export
DocumentRelationship <- new.env(parent = emptyenv())

#' @export
`$.DocumentRelationship` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- DocumentRelationship[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: bind `self` so callers pass only the remaining arguments.
  function(...) member(self, ...)
}

#' @export
`[[.DocumentRelationship` <- `$.DocumentRelationship`
|
||
#' Individual grid cell with position and span metadata
#' @field content Cell text content.
#' @field row Zero-indexed row position.
#' @field col Zero-indexed column position.
#' @field row_span Number of rows this cell spans.
#' @field col_span Number of columns this cell spans.
#' @field is_header Whether this is a header cell.
#' @field bbox Bounding box for this cell (if available).
#' @export
GridCell <- new.env(parent = emptyenv())

#' @export
`$.GridCell` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- GridCell[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: bind `self` so callers pass only the remaining arguments.
  function(...) member(self, ...)
}

#' @export
`[[.GridCell` <- `$.GridCell`
|
||
#' Inline text annotation — byte-range based formatting and links
#'
#' Annotations reference byte offsets into the node's text content,
#' enabling precise identification of formatted regions.
#' @field start Start byte offset in the node's text content (inclusive).
#' @field end End byte offset in the node's text content (exclusive).
#' @field kind Annotation type.
#' @export
TextAnnotation <- new.env(parent = emptyenv())

#' @export
`$.TextAnnotation` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- TextAnnotation[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: bind `self` so callers pass only the remaining arguments.
  function(...) member(self, ...)
}

#' @export
`[[.TextAnnotation` <- `$.TextAnnotation`
|
||
#' A single file extracted from an archive
#'
#' When archives (ZIP, TAR, 7Z, GZIP) are extracted with recursive extraction
#' enabled, each processable file produces its own full `ExtractionResult`.
#' @field path Archive-relative file path (e.g. "folder/document.pdf").
#' @field mime_type Detected MIME type of the file.
#' @field result Full extraction result for this file.
#' @export
ArchiveEntry <- new.env(parent = emptyenv())

#' @export
`$.ArchiveEntry` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- ArchiveEntry[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: bind `self` so callers pass only the remaining arguments.
  function(...) member(self, ...)
}

#' @export
`[[.ArchiveEntry` <- `$.ArchiveEntry`
|
||
#' A non-fatal warning from a processing pipeline stage
#'
#' Captures errors from optional features that don't prevent extraction
#' but may indicate degraded results.
#' @field source The pipeline stage or feature that produced this warning
#'   (e.g., "embedding", "chunking").
#' @field message Human-readable description of what went wrong.
#' @export
ProcessingWarning <- new.env(parent = emptyenv())

#' @export
`$.ProcessingWarning` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- ProcessingWarning[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: bind `self` so callers pass only the remaining arguments.
  function(...) member(self, ...)
}

#' @export
`[[.ProcessingWarning` <- `$.ProcessingWarning`
|
||
#' Token usage and cost data for a single LLM call made during extraction
#'
#' Populated when VLM OCR, structured extraction, or LLM-based embeddings
#' are used. Multiple entries may be present when multiple LLM calls occur
#' within one extraction (e.g. VLM OCR + structured extraction).
#' @field model The LLM model identifier (e.g. "openai/gpt-4o", "anthropic/claude-sonnet-4-20250514").
#' @field source The pipeline stage that triggered this LLM call
#'   (e.g. "vlm_ocr", "structured_extraction").
#' @field input_tokens Number of input/prompt tokens consumed.
#' @field output_tokens Number of output/completion tokens generated.
#' @field total_tokens Total tokens (input + output).
#' @field estimated_cost Estimated cost in USD based on the provider's published pricing.
#' @field finish_reason Why the model stopped generating (e.g. "stop", "length", "content_filter").
#' @export
LlmUsage <- new.env(parent = emptyenv())

# Hands `json` to the native `wrap__LlmUsage__from_json` routine.
LlmUsage$from_json <- function(json) {
  .Call("wrap__LlmUsage__from_json", json, PACKAGE = "kreuzberg")
}

#' @export
`$.LlmUsage` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- LlmUsage[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: bind `self` so callers pass only the remaining arguments.
  function(...) member(self, ...)
}

#' @export
`[[.LlmUsage` <- `$.LlmUsage`
|
||
#' A text chunk with optional embedding and metadata
#'
#' Chunks are created when chunking is enabled in `ExtractionConfig`. Each chunk
#' contains the text content, optional embedding vector (if embedding generation
#' is configured), and metadata about its position in the document.
#' @field content The text content of this chunk.
#' @field chunk_type Semantic structural classification of this chunk.
#' @field embedding Optional embedding vector for this chunk.
#' @field metadata Metadata about this chunk's position and properties.
#' @export
Chunk <- new.env(parent = emptyenv())

#' @export
`$.Chunk` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- Chunk[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: bind `self` so callers pass only the remaining arguments.
  function(...) member(self, ...)
}

#' @export
`[[.Chunk` <- `$.Chunk`
|
||
#' A single heading in the hierarchy
#' @field level Heading depth (1 = h1, 2 = h2, etc.)
#' @field text The text content of the heading.
#' @export
HeadingLevel <- new.env(parent = emptyenv())

#' @export
`$.HeadingLevel` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- HeadingLevel[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: bind `self` so callers pass only the remaining arguments.
  function(...) member(self, ...)
}

#' @export
`[[.HeadingLevel` <- `$.HeadingLevel`
|
||
#' Metadata about a chunk's position in the original document
#' @field byte_start Byte offset where this chunk starts in the original text (UTF-8 valid boundary).
#' @field byte_end Byte offset where this chunk ends in the original text (UTF-8 valid boundary).
#' @field token_count Number of tokens in this chunk (if available).
#' @field chunk_index Zero-based index of this chunk in the document.
#' @field total_chunks Total number of chunks in the document.
#' @field first_page First page number this chunk spans (1-indexed).
#' @field last_page Last page number this chunk spans (1-indexed, equal to first_page for single-page chunks).
#' @field heading_context Heading context when using Markdown chunker.
#' @field image_indices Indices into `ExtractionResult.images` for images on pages covered by this chunk.
#' @export
ChunkMetadata <- new.env(parent = emptyenv())

#' @export
`$.ChunkMetadata` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- ChunkMetadata[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: bind `self` so callers pass only the remaining arguments.
  function(...) member(self, ...)
}

#' @export
`[[.ChunkMetadata` <- `$.ChunkMetadata`
|
||
#' Extracted image from a document
#'
#' Contains raw image data, metadata, and optional nested OCR results.
#' Raw bytes allow cross-language compatibility - users can convert to
#' PIL.Image (Python), Sharp (Node.js), or other formats as needed.
#' @field data Raw image data (PNG, JPEG, WebP, etc. bytes). Uses `bytes::Bytes` for cheap cloning of large buffers.
#' @field format Image format (e.g., "jpeg", "png", "webp"). The Rust side
#'   uses `Cow<'static, str>` to avoid allocating for static values.
#' @field image_index Zero-indexed position of this image in the document/page
#' @field page_number Page/slide number where image was found (1-indexed)
#' @field width Image width in pixels
#' @field height Image height in pixels
#' @field colorspace Colorspace information (e.g., "RGB", "CMYK", "Gray")
#' @field bits_per_component Bits per color component (e.g., 8, 16)
#' @field is_mask Whether this image is a mask image
#' @field description Optional description of the image
#' @field ocr_result Nested OCR extraction result (if image was OCRed)
#' @field bounding_box Bounding box of the image on the page (PDF coordinates: x0=left, y0=bottom, x1=right, y1=top).
#' @field source_path Original source path of the image within the document archive (e.g., "media/image1.png" in DOCX).
#' @field image_kind Heuristic classification of what this image likely
#'   depicts. `None` if classification was disabled.
#' @field kind_confidence Confidence score for `image_kind`, in the range 0.0 to 1.0.
#' @field cluster_id Identifier shared across images that form a single
#'   logical figure (e.g. all raster tiles of one figure).
#' @export
ExtractedImage <- new.env(parent = emptyenv())

#' @export
`$.ExtractedImage` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- ExtractedImage[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: bind `self` so callers pass only the remaining arguments.
  function(...) member(self, ...)
}

#' @export
`[[.ExtractedImage` <- `$.ExtractedImage`
|
||
#' Bounding box coordinates for element positioning
#' @field x0 Left x-coordinate
#' @field y0 Bottom y-coordinate
#' @field x1 Right x-coordinate
#' @field y1 Top y-coordinate
#' @export
BoundingBox <- new.env(parent = emptyenv())

# Hands `json` to the native `wrap__BoundingBox__from_json` routine.
BoundingBox$from_json <- function(json) {
  .Call("wrap__BoundingBox__from_json", json, PACKAGE = "kreuzberg")
}

#' @export
`$.BoundingBox` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- BoundingBox[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: bind `self` so callers pass only the remaining arguments.
  function(...) member(self, ...)
}

#' @export
`[[.BoundingBox` <- `$.BoundingBox`
|
||
#' Metadata for a semantic element
#' @field page_number Page number (1-indexed)
#' @field filename Source filename or document name
#' @field coordinates Bounding box coordinates if available
#' @field element_index Position index in the element sequence
#' @field additional Additional custom metadata
#' @export
ElementMetadata <- new.env(parent = emptyenv())

#' @export
`$.ElementMetadata` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- ElementMetadata[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: bind `self` so callers pass only the remaining arguments.
  function(...) member(self, ...)
}

#' @export
`[[.ElementMetadata` <- `$.ElementMetadata`
|
||
#' Semantic element extracted from document
#'
#' Represents a logical unit of content with semantic classification,
#' unique identifier, and metadata for tracking origin and position.
#' @field element_id Unique element identifier
#' @field element_type Semantic type of this element
#' @field text Text content of the element
#' @field metadata Metadata about the element
#' @export
Element <- new.env(parent = emptyenv())

#' @export
`$.Element` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- Element[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: bind `self` so callers pass only the remaining arguments.
  function(...) member(self, ...)
}

#' @export
`[[.Element` <- `$.Element`
|
||
#' XML extraction result
#'
#' Contains extracted text content from XML files along with
#' structural statistics about the XML document.
#' @field content Extracted text content (XML structure filtered out)
#' @field element_count Total number of XML elements processed
#' @field unique_elements List of unique element names found (sorted)
#' @export
XmlExtractionResult <- new.env(parent = emptyenv())

#' @export
`$.XmlExtractionResult` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- XmlExtractionResult[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: bind `self` so callers pass only the remaining arguments.
  function(...) member(self, ...)
}

#' @export
`[[.XmlExtractionResult` <- `$.XmlExtractionResult`
|
||
#' Email attachment representation
#'
#' Contains metadata and optionally the content of an email attachment.
#' @field name Attachment name (from Content-Disposition header)
#' @field filename Filename of the attachment
#' @field mime_type MIME type of the attachment
#' @field size Size in bytes
#' @field is_image Whether this attachment is an image
#' @field data Attachment data (if extracted). Uses `bytes::Bytes` for cheap cloning of large buffers.
#' @export
EmailAttachment <- new.env(parent = emptyenv())

#' @export
`$.EmailAttachment` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- EmailAttachment[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: bind `self` so callers pass only the remaining arguments.
  function(...) member(self, ...)
}

#' @export
`[[.EmailAttachment` <- `$.EmailAttachment`
|
||
#' Bounding box for an OCR-detected table in pixel coordinates
#' @field left Left x-coordinate (pixels)
#' @field top Top y-coordinate (pixels)
#' @field right Right x-coordinate (pixels)
#' @field bottom Bottom y-coordinate (pixels)
#' @export
OcrTableBoundingBox <- new.env(parent = emptyenv())

#' @export
`$.OcrTableBoundingBox` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- OcrTableBoundingBox[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: bind `self` so callers pass only the remaining arguments.
  function(...) member(self, ...)
}

#' @export
`[[.OcrTableBoundingBox` <- `$.OcrTableBoundingBox`
|
||
#' Image preprocessing configuration for OCR
#'
#' These settings control how images are preprocessed before OCR to improve
#' text recognition quality. Different preprocessing strategies work better
#' for different document types.
#' @field target_dpi Target DPI for the image (300 is standard, 600 for small text).
#' @field auto_rotate Auto-detect and correct image rotation.
#' @field deskew Correct skew (tilted images).
#' @field denoise Remove noise from the image.
#' @field contrast_enhance Enhance contrast for better text visibility.
#' @field binarization_method Binarization method: "otsu", "sauvola", "adaptive".
#' @field invert_colors Invert colors (white text on black → black on white).
#' @export
ImagePreprocessingConfig <- new.env(parent = emptyenv())

# Calls the native `wrap__ImagePreprocessingConfig__default` routine.
ImagePreprocessingConfig$default <- function() {
  .Call("wrap__ImagePreprocessingConfig__default", PACKAGE = "kreuzberg")
}

# Hands `json` to the native `wrap__ImagePreprocessingConfig__from_json`
# routine.
ImagePreprocessingConfig$from_json <- function(json) {
  .Call("wrap__ImagePreprocessingConfig__from_json", json, PACKAGE = "kreuzberg")
}

#' @export
`$.ImagePreprocessingConfig` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- ImagePreprocessingConfig[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: bind `self` so callers pass only the remaining arguments.
  function(...) member(self, ...)
}

#' @export
`[[.ImagePreprocessingConfig` <- `$.ImagePreprocessingConfig`
|
||
#' Tesseract OCR configuration
#'
#' Provides fine-grained control over Tesseract OCR engine parameters.
#' Most users can use the defaults, but these settings allow optimization
#' for specific document types (invoices, handwriting, etc.).
#' @field language Language code (e.g., "eng", "deu", "fra")
#' @field psm Page Segmentation Mode (0-13).
#' @field output_format Output format ("text" or "markdown")
#' @field oem OCR Engine Mode (0-3).
#' @field min_confidence Minimum confidence threshold (0.0-100.0).
#' @field preprocessing Image preprocessing configuration.
#' @field enable_table_detection Enable automatic table detection and reconstruction
#' @field table_min_confidence Minimum confidence threshold for table detection (0.0-1.0)
#' @field table_column_threshold Column threshold for table detection (pixels)
#' @field table_row_threshold_ratio Row threshold ratio for table detection (0.0-1.0)
#' @field use_cache Enable OCR result caching
#' @field classify_use_pre_adapted_templates Use pre-adapted templates for character classification
#' @field language_model_ngram_on Enable N-gram language model
#' @field tessedit_dont_blkrej_good_wds Don't reject good words during block-level processing
#' @field tessedit_dont_rowrej_good_wds Don't reject good words during row-level processing
#' @field tessedit_enable_dict_correction Enable dictionary correction
#' @field tessedit_char_whitelist Whitelist of allowed characters (empty = all allowed)
#' @field tessedit_char_blacklist Blacklist of forbidden characters (empty = none forbidden)
#' @field tessedit_use_primary_params_model Use primary language params model
#' @field textord_space_size_is_variable Variable-width space detection
#' @field thresholding_method Use adaptive thresholding method
#' @export
TesseractConfig <- new.env(parent = emptyenv())

# Calls the native `wrap__TesseractConfig__default` routine with no arguments.
TesseractConfig$default <- function() {
  .Call("wrap__TesseractConfig__default", PACKAGE = "kreuzberg")
}

# Hands `json` to the native `wrap__TesseractConfig__from_json` routine.
TesseractConfig$from_json <- function(json) {
  .Call("wrap__TesseractConfig__from_json", json, PACKAGE = "kreuzberg")
}

#' @export
`$.TesseractConfig` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- TesseractConfig[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: bind `self` so callers pass only the remaining arguments.
  function(...) member(self, ...)
}

#' @export
`[[.TesseractConfig` <- `$.TesseractConfig`
|
||
#' Image preprocessing metadata
#'
#' Tracks the transformations applied to an image during OCR preprocessing,
#' including DPI normalization, resizing, and resampling.
#' @field original_dimensions Original image dimensions (width, height) in pixels
#' @field original_dpi Original image DPI (horizontal, vertical)
#' @field target_dpi Target DPI from configuration
#' @field scale_factor Scaling factor applied to the image
#' @field auto_adjusted Whether DPI was auto-adjusted based on content
#' @field final_dpi Final DPI after processing
#' @field new_dimensions New dimensions after resizing (if resized)
#' @field resample_method Resampling algorithm used ("LANCZOS3", "CATMULLROM", etc.)
#' @field dimension_clamped Whether dimensions were clamped to max_image_dimension
#' @field calculated_dpi Calculated optimal DPI (if auto_adjust_dpi enabled)
#' @field skipped_resize Whether resize was skipped (dimensions already optimal)
#' @field resize_error Error message if resize failed
#' @export
ImagePreprocessingMetadata <- new.env(parent = emptyenv())

#' @export
`$.ImagePreprocessingMetadata` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- ImagePreprocessingMetadata[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: bind `self` so callers pass only the remaining arguments.
  function(...) member(self, ...)
}

#' @export
`[[.ImagePreprocessingMetadata` <- `$.ImagePreprocessingMetadata`
|
||
#' Extraction result metadata
#'
#' Contains common fields applicable to all formats, format-specific metadata
#' via a discriminated union, and additional custom fields from postprocessors.
#' @field title Document title
#' @field subject Document subject or description
#' @field authors Primary author(s) - always Vec for consistency
#' @field keywords Keywords/tags - always Vec for consistency
#' @field language Primary language (ISO 639 code)
#' @field created_at Creation timestamp (ISO 8601 format)
#' @field modified_at Last modification timestamp (ISO 8601 format)
#' @field created_by User who created the document
#' @field modified_by User who last modified the document
#' @field pages Page/slide/sheet structure with boundaries
#' @field format Format-specific metadata (discriminated union)
#' @field image_preprocessing Image preprocessing metadata (when OCR preprocessing was applied)
#' @field json_schema JSON schema (for structured data extraction)
#' @field error Error metadata (for batch operations)
#' @field extraction_duration_ms Extraction duration in milliseconds (for benchmarking).
#' @field category Document category (from frontmatter or classification).
#' @field tags Document tags (from frontmatter).
#' @field document_version Document version string (from frontmatter).
#' @field abstract_text Abstract or summary text (from frontmatter).
#' @field output_format Output format identifier (e.g., "markdown", "html", "text").
#' @field ocr_used Whether OCR was used during extraction.
#' @field additional Additional custom fields from postprocessors.
#' @export
Metadata <- new.env(parent = emptyenv())

# Instance method: passes `self` to the native `wrap__Metadata__is_empty`
# routine.
Metadata$is_empty <- function(self) {
  .Call("wrap__Metadata__is_empty", self, PACKAGE = "kreuzberg")
}

# Hands `json` to the native `wrap__Metadata__from_json` routine.
Metadata$from_json <- function(json) {
  .Call("wrap__Metadata__from_json", json, PACKAGE = "kreuzberg")
}

#' @export
`$.Metadata` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- Metadata[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: bind `self` so callers pass only the remaining arguments.
  function(...) member(self, ...)
}

#' @export
`[[.Metadata` <- `$.Metadata`
|
||
#' @export
is_empty.Metadata <- function(x, ...) {
  # S3 method: fetch the `is_empty` member from `x` and call it,
  # passing every extra argument through unchanged.
  delegate <- x$is_empty
  delegate(...)
}
|
||
#' Excel/spreadsheet format metadata
#'
#' Identifies the document as a spreadsheet source via the `FormatMetadata::Excel`
#' discriminant. Sheet count and sheet names are stored inside this struct.
#' @field sheet_count Number of sheets in the workbook.
#' @field sheet_names Names of all sheets in the workbook.
#' @export
ExcelMetadata <- new.env(parent = emptyenv())

# Hands `json` to the native `wrap__ExcelMetadata__from_json` routine.
ExcelMetadata$from_json <- function(json) {
  .Call("wrap__ExcelMetadata__from_json", json, PACKAGE = "kreuzberg")
}

#' @export
`$.ExcelMetadata` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- ExcelMetadata[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  # Instance method: bind `self` so callers pass only the remaining arguments.
  function(...) member(self, ...)
}

#' @export
`[[.ExcelMetadata` <- `$.ExcelMetadata`
|
||
#' Email metadata extracted from .eml and .msg files
#'
#' Includes sender/recipient information, message ID, and attachment list.
#' @field from_email Sender's email address
#' @field from_name Sender's display name
#' @field to_emails Primary recipients
#' @field cc_emails CC recipients
#' @field bcc_emails BCC recipients
#' @field message_id Message-ID header value
#' @field attachments List of attachment filenames
#' @export
EmailMetadata <- new.env(parent = emptyenv())
# Forwards `json` to the native `wrap__EmailMetadata__from_json` routine and returns its result.
EmailMetadata$from_json <- function(json) .Call("wrap__EmailMetadata__from_json", json, PACKAGE = "kreuzberg")
#' @export
`$.EmailMetadata` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- EmailMetadata[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    # Members without a leading `self` argument are returned unchanged.
    return(member)
  }
  # Instance methods are wrapped so `self` is supplied as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.EmailMetadata` <- `$.EmailMetadata`
#' Archive (ZIP/TAR/7Z) metadata
#'
#' Extracted from compressed archive files containing file lists and size information.
#' @field format Archive format ("ZIP", "TAR", "7Z", etc.)
#' @field file_count Total number of files in the archive
#' @field file_list List of file paths within the archive
#' @field total_size Total uncompressed size in bytes
#' @field compressed_size Compressed size in bytes (if available)
#' @export
ArchiveMetadata <- new.env(parent = emptyenv())
# Forwards `json` to the native `wrap__ArchiveMetadata__from_json` routine and returns its result.
ArchiveMetadata$from_json <- function(json) .Call("wrap__ArchiveMetadata__from_json", json, PACKAGE = "kreuzberg")
#' @export
`$.ArchiveMetadata` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- ArchiveMetadata[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    # Members without a leading `self` argument are returned unchanged.
    return(member)
  }
  # Instance methods are wrapped so `self` is supplied as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.ArchiveMetadata` <- `$.ArchiveMetadata`
#' Image metadata extracted from image files
#'
#' Includes dimensions, format, and EXIF data.
#' @field width Image width in pixels
#' @field height Image height in pixels
#' @field format Image format (e.g., "PNG", "JPEG", "TIFF")
#' @field exif EXIF metadata tags
#' @export
ImageMetadata <- new.env(parent = emptyenv())
# Forwards `json` to the native `wrap__ImageMetadata__from_json` routine and returns its result.
ImageMetadata$from_json <- function(json) .Call("wrap__ImageMetadata__from_json", json, PACKAGE = "kreuzberg")
#' @export
`$.ImageMetadata` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- ImageMetadata[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    # Members without a leading `self` argument are returned unchanged.
    return(member)
  }
  # Instance methods are wrapped so `self` is supplied as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.ImageMetadata` <- `$.ImageMetadata`
#' XML metadata extracted during XML parsing
#'
#' Provides statistics about XML document structure.
#' @field element_count Total number of XML elements processed
#' @field unique_elements List of unique element tag names (sorted)
#' @export
XmlMetadata <- new.env(parent = emptyenv())
# Forwards `json` to the native `wrap__XmlMetadata__from_json` routine and returns its result.
XmlMetadata$from_json <- function(json) .Call("wrap__XmlMetadata__from_json", json, PACKAGE = "kreuzberg")
#' @export
`$.XmlMetadata` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- XmlMetadata[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    # Members without a leading `self` argument are returned unchanged.
    return(member)
  }
  # Instance methods are wrapped so `self` is supplied as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.XmlMetadata` <- `$.XmlMetadata`
#' Header/heading element metadata
#' @field level Header level: 1 (h1) through 6 (h6)
#' @field text Normalized text content of the header
#' @field id HTML id attribute if present
#' @field depth Document tree depth at the header element
#' @field html_offset Byte offset in original HTML document
#' @export
HeaderMetadata <- new.env(parent = emptyenv())
#' @export
`$.HeaderMetadata` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- HeaderMetadata[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    # Members without a leading `self` argument are returned unchanged.
    return(member)
  }
  # Instance methods are wrapped so `self` is supplied as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.HeaderMetadata` <- `$.HeaderMetadata`
#' Structured data (Schema.org, microdata, RDFa) block
#' @field data_type Type of structured data
#' @field raw_json Raw JSON string representation
#' @field schema_type Schema type if detectable (e.g., "Article", "Event", "Product")
#' @export
StructuredData <- new.env(parent = emptyenv())
#' @export
`$.StructuredData` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- StructuredData[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    # Members without a leading `self` argument are returned unchanged.
    return(member)
  }
  # Instance methods are wrapped so `self` is supplied as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.StructuredData` <- `$.StructuredData`
#' OCR processing metadata
#'
#' Captures information about OCR processing configuration and results.
#' @field language OCR language code(s) used
#' @field psm Tesseract Page Segmentation Mode (PSM)
#' @field output_format Output format (e.g., "text", "hocr")
#' @field table_count Number of tables detected
#' @field table_rows table_rows
#' @field table_cols table_cols
#' @export
OcrMetadata <- new.env(parent = emptyenv())
# Forwards `json` to the native `wrap__OcrMetadata__from_json` routine and returns its result.
OcrMetadata$from_json <- function(json) .Call("wrap__OcrMetadata__from_json", json, PACKAGE = "kreuzberg")
#' @export
`$.OcrMetadata` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- OcrMetadata[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    # Members without a leading `self` argument are returned unchanged.
    return(member)
  }
  # Instance methods are wrapped so `self` is supplied as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.OcrMetadata` <- `$.OcrMetadata`
#' Error metadata (for batch operations)
#' @field error_type error_type
#' @field message message
#' @export
ErrorMetadata <- new.env(parent = emptyenv())
#' @export
`$.ErrorMetadata` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- ErrorMetadata[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    # Members without a leading `self` argument are returned unchanged.
    return(member)
  }
  # Instance methods are wrapped so `self` is supplied as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.ErrorMetadata` <- `$.ErrorMetadata`
#' PowerPoint presentation metadata
#'
#' Extracted from PPTX files containing slide counts and presentation details.
#' @field slide_count Total number of slides in the presentation
#' @field slide_names Names of slides (if available)
#' @field image_count Number of embedded images
#' @field table_count Number of tables
#' @export
PptxMetadata <- new.env(parent = emptyenv())
# Forwards `json` to the native `wrap__PptxMetadata__from_json` routine and returns its result.
PptxMetadata$from_json <- function(json) .Call("wrap__PptxMetadata__from_json", json, PACKAGE = "kreuzberg")
#' @export
`$.PptxMetadata` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- PptxMetadata[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    # Members without a leading `self` argument are returned unchanged.
    return(member)
  }
  # Instance methods are wrapped so `self` is supplied as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.PptxMetadata` <- `$.PptxMetadata`
#' Word document metadata
#'
#' Extracted from DOCX files using shared Office Open XML metadata extraction.
#' Integrates with `office_metadata` module for core/app/custom properties.
#' @field core_properties Core properties from docProps/core.xml (Dublin Core metadata)
#' @field app_properties Application properties from docProps/app.xml (Word-specific statistics)
#' @field custom_properties Custom properties from docProps/custom.xml (user-defined properties)
#' @export
DocxMetadata <- new.env(parent = emptyenv())
# Forwards `json` to the native `wrap__DocxMetadata__from_json` routine and returns its result.
DocxMetadata$from_json <- function(json) .Call("wrap__DocxMetadata__from_json", json, PACKAGE = "kreuzberg")
#' @export
`$.DocxMetadata` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- DocxMetadata[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    # Members without a leading `self` argument are returned unchanged.
    return(member)
  }
  # Instance methods are wrapped so `self` is supplied as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.DocxMetadata` <- `$.DocxMetadata`
#' CSV/TSV file metadata
#' @field row_count row_count
#' @field column_count column_count
#' @field delimiter delimiter
#' @field has_header has_header
#' @field column_types column_types
#' @export
CsvMetadata <- new.env(parent = emptyenv())
# Forwards `json` to the native `wrap__CsvMetadata__from_json` routine and returns its result.
CsvMetadata$from_json <- function(json) .Call("wrap__CsvMetadata__from_json", json, PACKAGE = "kreuzberg")
#' @export
`$.CsvMetadata` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- CsvMetadata[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    # Members without a leading `self` argument are returned unchanged.
    return(member)
  }
  # Instance methods are wrapped so `self` is supplied as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.CsvMetadata` <- `$.CsvMetadata`
#' BibTeX bibliography metadata
#' @field entry_count Number of entries in the bibliography.
#' @field citation_keys citation_keys
#' @field authors authors
#' @field year_range year_range
#' @field entry_types entry_types
#' @export
BibtexMetadata <- new.env(parent = emptyenv())
# Forwards `json` to the native `wrap__BibtexMetadata__from_json` routine and returns its result.
BibtexMetadata$from_json <- function(json) .Call("wrap__BibtexMetadata__from_json", json, PACKAGE = "kreuzberg")
#' @export
`$.BibtexMetadata` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- BibtexMetadata[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    # Members without a leading `self` argument are returned unchanged.
    return(member)
  }
  # Instance methods are wrapped so `self` is supplied as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.BibtexMetadata` <- `$.BibtexMetadata`
#' Citation file metadata (RIS, PubMed, EndNote)
#' @field citation_count citation_count
#' @field format format
#' @field authors authors
#' @field year_range year_range
#' @field dois dois
#' @field keywords keywords
#' @export
CitationMetadata <- new.env(parent = emptyenv())
# Forwards `json` to the native `wrap__CitationMetadata__from_json` routine and returns its result.
CitationMetadata$from_json <- function(json) .Call("wrap__CitationMetadata__from_json", json, PACKAGE = "kreuzberg")
#' @export
`$.CitationMetadata` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- CitationMetadata[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    # Members without a leading `self` argument are returned unchanged.
    return(member)
  }
  # Instance methods are wrapped so `self` is supplied as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.CitationMetadata` <- `$.CitationMetadata`
#' Year range for bibliographic metadata
#' @field min min
#' @field max max
#' @field years years
#' @export
YearRange <- new.env(parent = emptyenv())
#' @export
`$.YearRange` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- YearRange[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    # Members without a leading `self` argument are returned unchanged.
    return(member)
  }
  # Instance methods are wrapped so `self` is supplied as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.YearRange` <- `$.YearRange`
#' FictionBook (FB2) metadata
#' @field genres genres
#' @field sequences sequences
#' @field annotation annotation
#' @export
FictionBookMetadata <- new.env(parent = emptyenv())
# Forwards `json` to the native `wrap__FictionBookMetadata__from_json` routine and returns its result.
FictionBookMetadata$from_json <- function(json) .Call("wrap__FictionBookMetadata__from_json", json, PACKAGE = "kreuzberg")
#' @export
`$.FictionBookMetadata` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- FictionBookMetadata[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    # Members without a leading `self` argument are returned unchanged.
    return(member)
  }
  # Instance methods are wrapped so `self` is supplied as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.FictionBookMetadata` <- `$.FictionBookMetadata`
#' DBASE field information
#' @field name name
#' @field field_type field_type
#' @export
DbfFieldInfo <- new.env(parent = emptyenv())
#' @export
`$.DbfFieldInfo` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- DbfFieldInfo[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    # Members without a leading `self` argument are returned unchanged.
    return(member)
  }
  # Instance methods are wrapped so `self` is supplied as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.DbfFieldInfo` <- `$.DbfFieldInfo`
#' JATS contributor with role
#' @field name name
#' @field role role
#' @export
ContributorRole <- new.env(parent = emptyenv())
#' @export
`$.ContributorRole` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- ContributorRole[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    # Members without a leading `self` argument are returned unchanged.
    return(member)
  }
  # Instance methods are wrapped so `self` is supplied as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.ContributorRole` <- `$.ContributorRole`
#' EPUB metadata (Dublin Core extensions)
#' @field coverage coverage
#' @field dc_format dc_format
#' @field relation relation
#' @field source source
#' @field dc_type dc_type
#' @field cover_image cover_image
#' @export
EpubMetadata <- new.env(parent = emptyenv())
# Forwards `json` to the native `wrap__EpubMetadata__from_json` routine and returns its result.
EpubMetadata$from_json <- function(json) .Call("wrap__EpubMetadata__from_json", json, PACKAGE = "kreuzberg")
#' @export
`$.EpubMetadata` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- EpubMetadata[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    # Members without a leading `self` argument are returned unchanged.
    return(member)
  }
  # Instance methods are wrapped so `self` is supplied as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.EpubMetadata` <- `$.EpubMetadata`
#' Outlook PST archive metadata
#' @field message_count message_count
#' @export
PstMetadata <- new.env(parent = emptyenv())
# Forwards `json` to the native `wrap__PstMetadata__from_json` routine and returns its result.
PstMetadata$from_json <- function(json) .Call("wrap__PstMetadata__from_json", json, PACKAGE = "kreuzberg")
#' @export
`$.PstMetadata` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- PstMetadata[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    # Members without a leading `self` argument are returned unchanged.
    return(member)
  }
  # Instance methods are wrapped so `self` is supplied as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.PstMetadata` <- `$.PstMetadata`
#' Confidence scores for an OCR element
#'
#' Separates detection confidence (how confident that text exists at this location)
#' from recognition confidence (how confident about the actual text content).
#' @field detection Detection confidence: how confident the OCR engine is that text exists here.
#' @field recognition Recognition confidence: how confident about the text content.
#' @export
OcrConfidence <- new.env(parent = emptyenv())
# Forwards `json` to the native `wrap__OcrConfidence__from_json` routine and returns its result.
OcrConfidence$from_json <- function(json) .Call("wrap__OcrConfidence__from_json", json, PACKAGE = "kreuzberg")
#' @export
`$.OcrConfidence` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- OcrConfidence[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    # Members without a leading `self` argument are returned unchanged.
    return(member)
  }
  # Instance methods are wrapped so `self` is supplied as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.OcrConfidence` <- `$.OcrConfidence`
#' Rotation information for an OCR element
#' @field angle_degrees Rotation angle in degrees (0, 90, 180, 270 for PaddleOCR).
#' @field confidence Confidence score for the rotation detection.
#' @export
OcrRotation <- new.env(parent = emptyenv())
#' @export
`$.OcrRotation` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- OcrRotation[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    # Members without a leading `self` argument are returned unchanged.
    return(member)
  }
  # Instance methods are wrapped so `self` is supplied as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.OcrRotation` <- `$.OcrRotation`
#' A unified OCR element representing detected text with full metadata
#'
#' This is the primary type for structured OCR output, preserving all information
#' from both Tesseract and PaddleOCR backends.
#' @field text The recognized text content.
#' @field geometry Bounding geometry (rectangle or quadrilateral).
#' @field confidence Confidence scores for detection and recognition.
#' @field level Hierarchical level (word, line, block, page).
#' @field rotation Rotation information (if detected).
#' @field page_number Page number (1-indexed).
#' @field parent_id Parent element ID for hierarchical relationships.
#' @field backend_metadata Backend-specific metadata that doesn't fit the unified schema.
#' @export
OcrElement <- new.env(parent = emptyenv())
# Forwards `json` to the native `wrap__OcrElement__from_json` routine and returns its result.
OcrElement$from_json <- function(json) .Call("wrap__OcrElement__from_json", json, PACKAGE = "kreuzberg")
#' @export
`$.OcrElement` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- OcrElement[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    # Members without a leading `self` argument are returned unchanged.
    return(member)
  }
  # Instance methods are wrapped so `self` is supplied as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.OcrElement` <- `$.OcrElement`
#' Configuration for OCR element extraction
#'
#' Controls how OCR elements are extracted and filtered.
#' @field include_elements Whether to include OCR elements in the extraction result.
#' @field min_level Minimum hierarchical level to include.
#' @field min_confidence Minimum recognition confidence threshold (0.0-1.0).
#' @field build_hierarchy Whether to build hierarchical relationships between elements.
#' @export
OcrElementConfig <- new.env(parent = emptyenv())
# Forwards `json` to the native `wrap__OcrElementConfig__from_json` routine and returns its result.
OcrElementConfig$from_json <- function(json) .Call("wrap__OcrElementConfig__from_json", json, PACKAGE = "kreuzberg")
#' @export
`$.OcrElementConfig` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- OcrElementConfig[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    # Members without a leading `self` argument are returned unchanged.
    return(member)
  }
  # Instance methods are wrapped so `self` is supplied as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.OcrElementConfig` <- `$.OcrElementConfig`
#' Byte offset boundary for a page
#'
#' Tracks where a specific page's content starts and ends in the main content string,
#' enabling mapping from byte positions to page numbers. Offsets are guaranteed to be
#' at valid UTF-8 character boundaries when using standard String methods (push_str, push, etc.).
#' @field byte_start Byte offset where this page starts in the content string (UTF-8 valid boundary, inclusive)
#' @field byte_end Byte offset where this page ends in the content string (UTF-8 valid boundary, exclusive)
#' @field page_number Page number (1-indexed)
#' @export
PageBoundary <- new.env(parent = emptyenv())
#' @export
`$.PageBoundary` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- PageBoundary[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    # Members without a leading `self` argument are returned unchanged.
    return(member)
  }
  # Instance methods are wrapped so `self` is supplied as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.PageBoundary` <- `$.PageBoundary`
#' Metadata for individual page/slide/sheet
#'
#' Captures per-page information including dimensions, content counts,
#' and visibility state (for presentations).
#' @field number Page number (1-indexed)
#' @field title Page title (usually for presentations)
#' @field dimensions Dimensions in points (PDF) or pixels (images): (width, height)
#' @field image_count Number of images on this page
#' @field table_count Number of tables on this page
#' @field hidden Whether this page is hidden (e.g., in presentations)
#' @field is_blank Whether this page is blank (no meaningful text, no images, no tables)
#' @field has_vector_graphics Whether this page contains non-trivial vector graphics (paths, shapes, curves)
#' @export
PageInfo <- new.env(parent = emptyenv())
#' @export
`$.PageInfo` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- PageInfo[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    # Members without a leading `self` argument are returned unchanged.
    return(member)
  }
  # Instance methods are wrapped so `self` is supplied as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.PageInfo` <- `$.PageInfo`
#' A detected layout region on a page
#'
#' When layout detection is enabled, each page may have layout regions
#' identifying different content types (text, pictures, tables, etc.)
#' with confidence scores and spatial positions.
#' @field class_name Layout class name (e.g. "picture", "table", "text", "section_header").
#' @field confidence Confidence score from the layout detection model (0.0 to 1.0).
#' @field bounding_box Bounding box in document coordinate space.
#' @field area_fraction Fraction of the page area covered by this region (0.0 to 1.0).
#' @export
LayoutRegion <- new.env(parent = emptyenv())
# Forwards `json` to the native `wrap__LayoutRegion__from_json` routine and returns its result.
LayoutRegion$from_json <- function(json) .Call("wrap__LayoutRegion__from_json", json, PACKAGE = "kreuzberg")
#' @export
`$.LayoutRegion` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- LayoutRegion[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    # Members without a leading `self` argument are returned unchanged.
    return(member)
  }
  # Instance methods are wrapped so `self` is supplied as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.LayoutRegion` <- `$.LayoutRegion`
#' A text block with hierarchy level assignment
#'
#' Represents a block of text with semantic heading information extracted from
#' font size clustering and hierarchical analysis.
#' @field text The text content of this block
#' @field font_size The font size of the text in this block
#' @field level The hierarchy level of this block (H1-H6 or Body)
#' @field bbox Bounding box information for the block
#' @export
HierarchicalBlock <- new.env(parent = emptyenv())
#' @export
`$.HierarchicalBlock` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- HierarchicalBlock[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    # Members without a leading `self` argument are returned unchanged.
    return(member)
  }
  # Instance methods are wrapped so `self` is supplied as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.HierarchicalBlock` <- `$.HierarchicalBlock`
#' A single changed cell within a table
#'
#' Defined here (rather than only in `crate::diff`) so `RevisionDelta` can
#' reference it unconditionally, without requiring the `diff` Cargo feature.
#' `crate::diff` re-exports this type verbatim.
#' @field row Zero-based row index.
#' @field col Zero-based column index.
#' @field from Value before the change.
#' @field to Value after the change.
#' @export
CellChange <- new.env(parent = emptyenv())
#' @export
`$.CellChange` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- CellChange[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    # Members without a leading `self` argument are returned unchanged.
    return(member)
  }
  # Instance methods are wrapped so `self` is supplied as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.CellChange` <- `$.CellChange`
#' A single tracked change embedded in a document
#'
#' Populated by per-format extractors that understand change-tracking metadata
#' (DOCX `w:ins`/`w:del`/`w:rPrChange`, ODT `text:change-*`, …). Every
#' extractor defaults to `ExtractionResult.revisions = None` until a
#' format-specific implementation is added.
#' @field revision_id Format-specific revision identifier.
#' @field author Display name of the author who made this change, when available.
#' @field timestamp ISO-8601 timestamp of the change, when available.
#' @field kind Semantic kind of this revision.
#' @field anchor Best-effort document location for this revision.
#' @field delta The content changes that make up this revision.
#' @export
DocumentRevision <- new.env(parent = emptyenv())
#' @export
`$.DocumentRevision` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- DocumentRevision[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    # Members without a leading `self` argument are returned unchanged.
    return(member)
  }
  # Instance methods are wrapped so `self` is supplied as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.DocumentRevision` <- `$.DocumentRevision`
#' Individual table cell with content and optional styling
#'
#' Future extension point for rich table support with cell-level metadata.
#' @field content Cell content as text
#' @field row_span Row span (number of rows this cell spans)
#' @field col_span Column span (number of columns this cell spans)
#' @field is_header Whether this is a header cell
#' @export
TableCell <- new.env(parent = emptyenv())
#' @export
`$.TableCell` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- TableCell[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    # Members without a leading `self` argument are returned unchanged.
    return(member)
  }
  # Instance methods are wrapped so `self` is supplied as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.TableCell` <- `$.TableCell`
#' A URI extracted from a document
#'
#' Represents any link, reference, or resource pointer found during extraction.
#' The `kind` field classifies the URI semantically, while `label` carries
#' optional human-readable display text.
#' @field url The URL or path string.
#' @field label Optional display text / label for the link.
#' @field page Optional page number where the URI was found (1-indexed).
#' @field kind Semantic classification of the URI.
#' @export
ExtractedUri <- new.env(parent = emptyenv())
#' @export
`$.ExtractedUri` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- ExtractedUri[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    # Members without a leading `self` argument are returned unchanged.
    return(member)
  }
  # Instance methods are wrapped so `self` is supplied as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.ExtractedUri` <- `$.ExtractedUri`
#' MIME type detection response
#' @field mime_type Detected MIME type
#' @field filename Original filename (if provided)
#' @export
DetectResponse <- new.env(parent = emptyenv())
#' @export
`$.DetectResponse` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- DetectResponse[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    # Members without a leading `self` argument are returned unchanged.
    return(member)
  }
  # Instance methods are wrapped so `self` is supplied as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.DetectResponse` <- `$.DetectResponse`
#' Options controlling how two `ExtractionResult` values are compared
#' @field include_metadata Include metadata changes in the diff. Default: `true`.
#' @field include_embedded Include embedded-children changes in the diff. Default: `true`.
#' @field max_content_chars Truncate content to this many characters before diffing.
#' @export
DiffOptions <- new.env(parent = emptyenv())
# Calls the native `wrap__DiffOptions__default` routine and returns its result.
DiffOptions$default <- function() {
  .Call("wrap__DiffOptions__default", PACKAGE = "kreuzberg")
}
# Forwards `json` to the native `wrap__DiffOptions__from_json` routine and returns its result.
DiffOptions$from_json <- function(json) .Call("wrap__DiffOptions__from_json", json, PACKAGE = "kreuzberg")
#' @export
`$.DiffOptions` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- DiffOptions[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    # Members without a leading `self` argument are returned unchanged.
    return(member)
  }
  # Instance methods are wrapped so `self` is supplied as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.DiffOptions` <- `$.DiffOptions`
#' A single contiguous hunk in a unified diff
#' @field from_line Starting line number in the old content (0-indexed).
#' @field from_count Number of lines from the old content in this hunk.
#' @field to_line Starting line number in the new content (0-indexed).
#' @field to_count Number of lines from the new content in this hunk.
#' @field lines Lines that make up this hunk.
#' @export
DiffHunk <- new.env(parent = emptyenv())
#' @export
`$.DiffHunk` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- DiffHunk[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    # Members without a leading `self` argument are returned unchanged.
    return(member)
  }
  # Instance methods are wrapped so `self` is supplied as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.DiffHunk` <- `$.DiffHunk`
#' Diff for a single embedded archive entry that appears in both results
#' @field path Archive-relative path identifying this entry.
#' @field diff The recursive diff of the entry's extraction result.
#' @export
EmbeddedDiff <- new.env(parent = emptyenv())
#' @export
`$.EmbeddedDiff` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- EmbeddedDiff[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    # Members without a leading `self` argument are returned unchanged.
    return(member)
  }
  # Instance methods are wrapped so `self` is supplied as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.EmbeddedDiff` <- `$.EmbeddedDiff`
#' Preset configurations for common RAG use cases
#'
#' Each preset combines chunk size, overlap, and embedding model
#' to provide an optimized configuration for specific scenarios.
#'
#' All string fields are owned `String` for FFI compatibility — instances
#' are safe to clone and pass across language boundaries.
#' @field name name
#' @field chunk_size chunk_size
#' @field overlap overlap
#' @field model_repo HuggingFace repository name for the model.
#' @field pooling Pooling strategy: "cls" or "mean".
#' @field model_file Path to the ONNX model file within the repo.
#' @field dimensions dimensions
#' @field description description
#' @export
EmbeddingPreset <- new.env(parent = emptyenv())
#' @export
`$.EmbeddingPreset` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- EmbeddingPreset[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    # Members without a leading `self` argument are returned unchanged.
    return(member)
  }
  # Instance methods are wrapped so `self` is supplied as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.EmbeddingPreset` <- `$.EmbeddingPreset`
#' YAKE-specific parameters
#' @field window_size Window size for co-occurrence analysis (default: 2).
#' @export
YakeParams <- new.env(parent = emptyenv())
# Calls the native `wrap__YakeParams__default` routine and returns its result.
YakeParams$default <- function() {
  .Call("wrap__YakeParams__default", PACKAGE = "kreuzberg")
}
# Forwards `json` to the native `wrap__YakeParams__from_json` routine and returns its result.
YakeParams$from_json <- function(json) .Call("wrap__YakeParams__from_json", json, PACKAGE = "kreuzberg")
#' @export
`$.YakeParams` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- YakeParams[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    # Members without a leading `self` argument are returned unchanged.
    return(member)
  }
  # Instance methods are wrapped so `self` is supplied as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.YakeParams` <- `$.YakeParams`
#' RAKE-specific parameters
#' @field min_word_length Minimum word length to consider (default: 1).
#' @field max_words_per_phrase Maximum words in a keyword phrase (default: 3).
#' @export
RakeParams <- new.env(parent = emptyenv())
# Calls the native `wrap__RakeParams__default` routine and returns its result.
RakeParams$default <- function() {
  .Call("wrap__RakeParams__default", PACKAGE = "kreuzberg")
}
# Forwards `json` to the native `wrap__RakeParams__from_json` routine and returns its result.
RakeParams$from_json <- function(json) .Call("wrap__RakeParams__from_json", json, PACKAGE = "kreuzberg")
#' @export
`$.RakeParams` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- RakeParams[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    # Members without a leading `self` argument are returned unchanged.
    return(member)
  }
  # Instance methods are wrapped so `self` is supplied as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.RakeParams` <- `$.RakeParams`
#' Keyword extraction configuration
#' @field algorithm Algorithm to use for extraction.
#' @field max_keywords Maximum number of keywords to extract (default: 10).
#' @field min_score Minimum score threshold (0.0-1.0, default: 0.0).
#' @field ngram_range N-gram range for keyword extraction (min, max).
#' @field language Language code for stopword filtering (e.g., "en", "de", "fr").
#' @field yake_params YAKE-specific tuning parameters.
#' @field rake_params RAKE-specific tuning parameters.
#' @export
KeywordConfig <- new.env(parent = emptyenv())
# Calls the native `wrap__KeywordConfig__default` routine and returns its result.
KeywordConfig$default <- function() {
  .Call("wrap__KeywordConfig__default", PACKAGE = "kreuzberg")
}
# Forwards `json` to the native `wrap__KeywordConfig__from_json` routine and returns its result.
KeywordConfig$from_json <- function(json) .Call("wrap__KeywordConfig__from_json", json, PACKAGE = "kreuzberg")
#' @export
`$.KeywordConfig` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- KeywordConfig[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    # Members without a leading `self` argument are returned unchanged.
    return(member)
  }
  # Instance methods are wrapped so `self` is supplied as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.KeywordConfig` <- `$.KeywordConfig`
#' Extracted keyword with metadata
#' @field text The keyword text.
#' @field score Relevance score (higher is better, algorithm-specific range).
#' @field algorithm Algorithm that extracted this keyword.
#' @field positions Optional positions where keyword appears in text (character offsets).
#' @export
Keyword <- new.env(parent = emptyenv())
#' @export
`$.Keyword` <- function(self, name) {
  # Look the member up in the class environment (NULL when it is absent).
  member <- Keyword[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    # Members without a leading `self` argument are returned unchanged.
    return(member)
  }
  # Instance methods are wrapped so `self` is supplied as the first argument.
  function(...) member(self, ...)
}
#' @export
`[[.Keyword` <- `$.Keyword`
#' Configuration for PaddleOCR backend
#'
#' Configures PaddleOCR text detection and recognition with multi-language support.
#' Uses a builder pattern for convenient configuration.
#' @field language Language code (e.g., "en", "ch", "jpn", "kor", "deu", "fra")
#' @field cache_dir Optional custom cache directory for model files
#' @field use_angle_cls Enable angle classification for rotated text (default: false). Can misfire on short text
#' @field enable_table_detection Enable table structure detection (default: false)
#' @field det_db_thresh Database threshold for text detection (default: 0.3) Range: 0.0-1.0, higher values require more
#' @field det_db_box_thresh Box threshold for text bounding box refinement (default: 0.5) Range: 0.0-1.0
#' @field det_db_unclip_ratio Unclip ratio for expanding text bounding boxes (default: 1.6) Controls the expansion of
#' @field det_limit_side_len Maximum side length for detection image (default: 960) Larger images may be resized to
#' @field rec_batch_num Batch size for recognition inference (default: 6) Number of text regions to process
#' @field padding Padding in pixels added around the image before detection (default: 10). Large values can include
#' @field drop_score Minimum recognition confidence score for text lines (default: 0.5). Text regions with recognition
#' @field model_tier Model tier controlling detection/recognition model size and accuracy trade-off. - `"mobile"`
#' @export
PaddleOcrConfig <- new.env(parent = emptyenv())

# Builder members. Each one hands `self` plus a single argument to the native
# routine of the same name in the kreuzberg library and returns its result.
PaddleOcrConfig$with_cache_dir <- function(self, path) {
  .Call("wrap__PaddleOcrConfig__with_cache_dir", self, path, PACKAGE = "kreuzberg")
}

PaddleOcrConfig$with_table_detection <- function(self, enable) {
  .Call("wrap__PaddleOcrConfig__with_table_detection", self, enable, PACKAGE = "kreuzberg")
}

PaddleOcrConfig$with_angle_cls <- function(self, enable) {
  .Call("wrap__PaddleOcrConfig__with_angle_cls", self, enable, PACKAGE = "kreuzberg")
}

PaddleOcrConfig$with_det_db_thresh <- function(self, threshold) {
  .Call("wrap__PaddleOcrConfig__with_det_db_thresh", self, threshold, PACKAGE = "kreuzberg")
}

PaddleOcrConfig$with_det_db_box_thresh <- function(self, threshold) {
  .Call("wrap__PaddleOcrConfig__with_det_db_box_thresh", self, threshold, PACKAGE = "kreuzberg")
}

PaddleOcrConfig$with_det_db_unclip_ratio <- function(self, ratio) {
  .Call("wrap__PaddleOcrConfig__with_det_db_unclip_ratio", self, ratio, PACKAGE = "kreuzberg")
}

PaddleOcrConfig$with_det_limit_side_len <- function(self, length) {
  .Call("wrap__PaddleOcrConfig__with_det_limit_side_len", self, length, PACKAGE = "kreuzberg")
}

PaddleOcrConfig$with_rec_batch_num <- function(self, batch_size) {
  .Call("wrap__PaddleOcrConfig__with_rec_batch_num", self, batch_size, PACKAGE = "kreuzberg")
}

PaddleOcrConfig$with_drop_score <- function(self, score) {
  .Call("wrap__PaddleOcrConfig__with_drop_score", self, score, PACKAGE = "kreuzberg")
}

PaddleOcrConfig$with_padding <- function(self, padding) {
  .Call("wrap__PaddleOcrConfig__with_padding", self, padding, PACKAGE = "kreuzberg")
}

PaddleOcrConfig$with_model_tier <- function(self, tier) {
  .Call("wrap__PaddleOcrConfig__with_model_tier", self, tier, PACKAGE = "kreuzberg")
}

# Zero-argument constructor backed by `wrap__PaddleOcrConfig__default`.
PaddleOcrConfig$default <- function() {
  .Call("wrap__PaddleOcrConfig__default", PACKAGE = "kreuzberg")
}

# Passes `json` unchanged to the native `wrap__PaddleOcrConfig__from_json`
# routine and returns its result.
PaddleOcrConfig$from_json <- function(json) .Call(
  "wrap__PaddleOcrConfig__from_json", json,
  PACKAGE = "kreuzberg"
)

# `$` accessor for objects of class "PaddleOcrConfig": members whose first
# formal is `self` come back with the object pre-bound, so `obj$with_x(v)`
# calls `PaddleOcrConfig$with_x(obj, v)`; other lookups are returned untouched.
#' @export
`$.PaddleOcrConfig` <- function(self, name) {
  member <- PaddleOcrConfig[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` reuses the `$` implementation.
#' @export
`[[.PaddleOcrConfig` <- `$.PaddleOcrConfig`

# S3 methods: each forwards `...` to the member of the same name reached
# through `$` on `x`.
#' @export
with_cache_dir.PaddleOcrConfig <- function(x, ...) {
  x$with_cache_dir(...)
}

#' @export
with_table_detection.PaddleOcrConfig <- function(x, ...) {
  x$with_table_detection(...)
}

#' @export
with_angle_cls.PaddleOcrConfig <- function(x, ...) {
  x$with_angle_cls(...)
}

#' @export
with_det_db_thresh.PaddleOcrConfig <- function(x, ...) {
  x$with_det_db_thresh(...)
}

#' @export
with_det_db_box_thresh.PaddleOcrConfig <- function(x, ...) {
  x$with_det_db_box_thresh(...)
}

#' @export
with_det_db_unclip_ratio.PaddleOcrConfig <- function(x, ...) {
  x$with_det_db_unclip_ratio(...)
}

#' @export
with_det_limit_side_len.PaddleOcrConfig <- function(x, ...) {
  x$with_det_limit_side_len(...)
}

#' @export
with_rec_batch_num.PaddleOcrConfig <- function(x, ...) {
  x$with_rec_batch_num(...)
}

#' @export
with_drop_score.PaddleOcrConfig <- function(x, ...) {
  x$with_drop_score(...)
}

#' @export
with_padding.PaddleOcrConfig <- function(x, ...) {
  x$with_padding(...)
}

#' @export
with_model_tier.PaddleOcrConfig <- function(x, ...) {
  x$with_model_tier(...)
}
|
||
#' Combined paths to all models needed for OCR (backward compatibility)
#'
#' @field det_model Path to the detection model directory.
#' @field cls_model Path to the classification model directory.
#' @field rec_model Path to the recognition model directory.
#' @field dict_file Path to the character dictionary file.
#' @export
ModelPaths <- new.env(parent = emptyenv())

# `$` accessor for objects of class "ModelPaths": `name` is resolved in the
# `ModelPaths` environment; members taking `self` first are returned with the
# object pre-bound, other lookup results are returned untouched.
#' @export
`$.ModelPaths` <- function(self, name) {
  member <- ModelPaths[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` reuses the `$` implementation.
#' @export
`[[.ModelPaths` <- `$.ModelPaths`
|
||
#' Document orientation detection result
#'
#' @field degrees Detected orientation in degrees (0, 90, 180, or 270).
#' @field confidence Confidence score (0.0-1.0).
#' @export
OrientationResult <- new.env(parent = emptyenv())

# `$` accessor for objects of class "OrientationResult": `name` is resolved in
# the `OrientationResult` environment; members taking `self` first are
# returned with the object pre-bound, other results are returned untouched.
#' @export
`$.OrientationResult` <- function(self, name) {
  member <- OrientationResult[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` reuses the `$` implementation.
#' @export
`[[.OrientationResult` <- `$.OrientationResult`
|
||
#' Bounding box in original image coordinates (x1, y1) top-left, (x2, y2) bottom-right
#'
#' @field x1 x1
#' @field y1 y1
#' @field x2 x2
#' @field y2 y2
#' @export
BBox <- new.env(parent = emptyenv())

# `$` accessor for objects of class "BBox": `name` is resolved in the `BBox`
# environment; members taking `self` first are returned with the object
# pre-bound, other lookup results are returned untouched.
#' @export
`$.BBox` <- function(self, name) {
  member <- BBox[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` reuses the `$` implementation.
#' @export
`[[.BBox` <- `$.BBox`
|
||
#' A single layout detection result
#'
#' @field class_name class_name
#' @field confidence confidence
#' @field bbox bbox
#' @export
LayoutDetection <- new.env(parent = emptyenv())

# `$` accessor for objects of class "LayoutDetection": `name` is resolved in
# the `LayoutDetection` environment; members taking `self` first are returned
# with the object pre-bound, other lookup results are returned untouched.
#' @export
`$.LayoutDetection` <- function(self, name) {
  member <- LayoutDetection[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` reuses the `$` implementation.
#' @export
`[[.LayoutDetection` <- `$.LayoutDetection`
|
||
#' Embedded file descriptor extracted from the PDF name tree
#'
#' @field name The filename as stored in the PDF name tree.
#' @field data Raw file bytes from the embedded stream (already decompressed by lopdf).
#' @field compressed_size Compressed byte count of the original stream (before decompression).
#' @field mime_type MIME type if specified in the filespec, otherwise `None`.
#' @export
EmbeddedFile <- new.env(parent = emptyenv())

# `$` accessor for objects of class "EmbeddedFile": `name` is resolved in the
# `EmbeddedFile` environment; members taking `self` first are returned with
# the object pre-bound, other lookup results are returned untouched.
#' @export
`$.EmbeddedFile` <- function(self, name) {
  member <- EmbeddedFile[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` reuses the `$` implementation.
#' @export
`[[.EmbeddedFile` <- `$.EmbeddedFile`
|
||
#' PDF-specific metadata
#'
#' Contains metadata fields specific to PDF documents that are not in the common
#' `Metadata` structure. Common fields like title, authors, keywords, and dates
#' are at the `Metadata` level.
#' @field pdf_version PDF version (e.g., "1.7", "2.0")
#' @field producer PDF producer (application that created the PDF)
#' @field is_encrypted Whether the PDF is encrypted/password-protected
#' @field width First page width in points (1/72 inch)
#' @field height First page height in points (1/72 inch)
#' @field page_count Total number of pages in the PDF document
#' @export
PdfMetadata <- new.env(parent = emptyenv())

# Passes `json` unchanged to the native `wrap__PdfMetadata__from_json`
# routine and returns its result.
PdfMetadata$from_json <- function(json) .Call(
  "wrap__PdfMetadata__from_json", json,
  PACKAGE = "kreuzberg"
)

# `$` accessor for objects of class "PdfMetadata": `name` is resolved in the
# `PdfMetadata` environment; members taking `self` first are returned with the
# object pre-bound, other lookup results are returned untouched.
#' @export
`$.PdfMetadata` <- function(self, name) {
  member <- PdfMetadata[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` reuses the `$` implementation.
#' @export
`[[.PdfMetadata` <- `$.PdfMetadata`
|
||
#' Output format for extraction results
#'
#' Controls the format of the `content` field in `ExtractionResult`.
#' When set to `Markdown`, `Djot`, or `Html`, the output uses that format.
#' `Plain` returns the raw extracted text.
#' `Structured` returns JSON with full OCR element data including bounding
#' boxes and confidence scores.
#' @field Plain Plain text content only (default)
#' @field Markdown Markdown format
#' @field Djot Djot markup format
#' @field Html HTML format
#' @field Json JSON tree format with heading-driven sections.
#' @field Structured Structured JSON format with full OCR element metadata.
#' @field Custom Custom renderer registered via the RendererRegistry. The string is the renderer name (e.g., "docx",
#' @export
OutputFormat <- new.env(parent = emptyenv())

# `$` accessor for objects of class "OutputFormat": `name` is resolved in the
# `OutputFormat` environment; members taking `self` first are returned with
# the object pre-bound, other lookup results are returned untouched.
#' @export
`$.OutputFormat` <- function(self, name) {
  member <- OutputFormat[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` reuses the `$` implementation.
#' @export
`[[.OutputFormat` <- `$.OutputFormat`
|
||
#' Format-specific metadata (discriminated union)
#'
#' Only one format type can exist per extraction result. This provides
#' type-safe, clean metadata without nested optionals.
#' @field Pdf Pdf
#' @field Docx Docx
#' @field Excel Excel
#' @field Email Email
#' @field Pptx Pptx
#' @field Archive Archive
#' @field Image Image
#' @field Xml Xml
#' @field Text Text
#' @field Html Html
#' @field Ocr Ocr
#' @field Csv Csv
#' @field Bibtex Bibtex
#' @field Citation Citation
#' @field FictionBook FictionBook
#' @field Dbf Dbf
#' @field Jats Jats
#' @field Epub Epub
#' @field Pst Pst
#' @field Code Code
#' @export
FormatMetadata <- new.env(parent = emptyenv())

# `$` accessor for objects of class "FormatMetadata": `name` is resolved in
# the `FormatMetadata` environment; members taking `self` first are returned
# with the object pre-bound, other lookup results are returned untouched.
#' @export
`$.FormatMetadata` <- function(self, name) {
  member <- FormatMetadata[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` reuses the `$` implementation.
#' @export
`[[.FormatMetadata` <- `$.FormatMetadata`
|
||
#' A single line in a unified-diff hunk
#'
#' Defined here (rather than only in `crate::diff`) so `RevisionDelta` can
#' reference it unconditionally, without requiring the `diff` Cargo feature.
#' `crate::diff` re-exports this type verbatim.
#' @field Context Unchanged context line.
#' @field Added Line added in the "after" version.
#' @field Removed Line removed from the "before" version.
#' @export
DiffLine <- new.env(parent = emptyenv())

# `$` accessor for objects of class "DiffLine": `name` is resolved in the
# `DiffLine` environment; members taking `self` first are returned with the
# object pre-bound, other lookup results are returned untouched.
#' @export
`$.DiffLine` <- function(self, name) {
  member <- DiffLine[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` reuses the `$` implementation.
#' @export
`[[.DiffLine` <- `$.DiffLine`
|
||
#' Construct the default `ExecutionProviderType` enum value
#'
#' Takes no arguments. The value is an empty list tagged with the S3 class
#' `"ExecutionProviderType"`; no variant name is stored in it.
#'
#' @return An empty list of class `"ExecutionProviderType"`.
#' @export
ExecutionProviderType <- function() {
  structure(list(), class = "ExecutionProviderType")
}
|
||
|
||
#' Construct the default `HtmlTheme` enum value
#'
#' Takes no arguments. The value is an empty list tagged with the S3 class
#' `"HtmlTheme"`; no variant name is stored in it.
#'
#' @return An empty list of class `"HtmlTheme"`.
#' @export
HtmlTheme <- function() {
  structure(list(), class = "HtmlTheme")
}
|
||
|
||
#' Construct the default `TableModel` enum value
#'
#' Takes no arguments. The value is an empty list tagged with the S3 class
#' `"TableModel"`; no variant name is stored in it.
#'
#' @return An empty list of class `"TableModel"`.
#' @export
TableModel <- function() {
  structure(list(), class = "TableModel")
}
|
||
|
||
#' Construct the default `ChunkerType` enum value
#'
#' Takes no arguments. The value is an empty list tagged with the S3 class
#' `"ChunkerType"`; no variant name is stored in it.
#'
#' @return An empty list of class `"ChunkerType"`.
#' @export
ChunkerType <- function() {
  structure(list(), class = "ChunkerType")
}
|
||
|
||
#' Construct the default `CodeContentMode` enum value
#'
#' Takes no arguments. The value is an empty list tagged with the S3 class
#' `"CodeContentMode"`; no variant name is stored in it.
#'
#' @return An empty list of class `"CodeContentMode"`.
#' @export
CodeContentMode <- function() {
  structure(list(), class = "CodeContentMode")
}
|
||
|
||
#' Construct the default `ListType` enum value
#'
#' Takes no arguments. The value is an empty list tagged with the S3 class
#' `"ListType"`; no variant name is stored in it.
#'
#' @return An empty list of class `"ListType"`.
#' @export
ListType <- function() {
  structure(list(), class = "ListType")
}
|
||
|
||
#' Construct the default `OcrBackendType` enum value
#'
#' Takes no arguments. The value is an empty list tagged with the S3 class
#' `"OcrBackendType"`; no variant name is stored in it.
#'
#' @return An empty list of class `"OcrBackendType"`.
#' @export
OcrBackendType <- function() {
  structure(list(), class = "OcrBackendType")
}
|
||
|
||
#' Construct the default `ProcessingStage` enum value
#'
#' Takes no arguments. The value is an empty list tagged with the S3 class
#' `"ProcessingStage"`; no variant name is stored in it.
#'
#' @return An empty list of class `"ProcessingStage"`.
#' @export
ProcessingStage <- function() {
  structure(list(), class = "ProcessingStage")
}
|
||
|
||
#' Construct the default `ReductionLevel` enum value
#'
#' Takes no arguments. The value is an empty list tagged with the S3 class
#' `"ReductionLevel"`; no variant name is stored in it.
#'
#' @return An empty list of class `"ReductionLevel"`.
#' @export
ReductionLevel <- function() {
  structure(list(), class = "ReductionLevel")
}
|
||
|
||
#' Construct the default `PdfAnnotationType` enum value
#'
#' Takes no arguments. The value is an empty list tagged with the S3 class
#' `"PdfAnnotationType"`; no variant name is stored in it.
#'
#' @return An empty list of class `"PdfAnnotationType"`.
#' @export
PdfAnnotationType <- function() {
  structure(list(), class = "PdfAnnotationType")
}
|
||
|
||
#' Construct the default `BlockType` enum value
#'
#' Takes no arguments. The value is an empty list tagged with the S3 class
#' `"BlockType"`; no variant name is stored in it.
#'
#' @return An empty list of class `"BlockType"`.
#' @export
BlockType <- function() {
  structure(list(), class = "BlockType")
}
|
||
|
||
#' Construct the default `InlineType` enum value
#'
#' Takes no arguments. The value is an empty list tagged with the S3 class
#' `"InlineType"`; no variant name is stored in it.
#'
#' @return An empty list of class `"InlineType"`.
#' @export
InlineType <- function() {
  structure(list(), class = "InlineType")
}
|
||
|
||
#' Construct the default `RelationshipKind` enum value
#'
#' Takes no arguments. The value is an empty list tagged with the S3 class
#' `"RelationshipKind"`; no variant name is stored in it.
#'
#' @return An empty list of class `"RelationshipKind"`.
#' @export
RelationshipKind <- function() {
  structure(list(), class = "RelationshipKind")
}
|
||
|
||
#' Construct the default `ContentLayer` enum value
#'
#' Takes no arguments. The value is an empty list tagged with the S3 class
#' `"ContentLayer"`; no variant name is stored in it.
#'
#' @return An empty list of class `"ContentLayer"`.
#' @export
ContentLayer <- function() {
  structure(list(), class = "ContentLayer")
}
|
||
|
||
#' Construct the default `ExtractionMethod` enum value
#'
#' Takes no arguments. The value is an empty list tagged with the S3 class
#' `"ExtractionMethod"`; no variant name is stored in it.
#'
#' @return An empty list of class `"ExtractionMethod"`.
#' @export
ExtractionMethod <- function() {
  structure(list(), class = "ExtractionMethod")
}
|
||
|
||
#' Construct the default `ChunkType` enum value
#'
#' Takes no arguments. The value is an empty list tagged with the S3 class
#' `"ChunkType"`; no variant name is stored in it.
#'
#' @return An empty list of class `"ChunkType"`.
#' @export
ChunkType <- function() {
  structure(list(), class = "ChunkType")
}
|
||
|
||
#' Construct the default `ImageKind` enum value
#'
#' Takes no arguments. The value is an empty list tagged with the S3 class
#' `"ImageKind"`; no variant name is stored in it.
#'
#' @return An empty list of class `"ImageKind"`.
#' @export
ImageKind <- function() {
  structure(list(), class = "ImageKind")
}
|
||
|
||
#' Construct the default `ResultFormat` enum value
#'
#' Takes no arguments. The value is an empty list tagged with the S3 class
#' `"ResultFormat"`; no variant name is stored in it.
#'
#' @return An empty list of class `"ResultFormat"`.
#' @export
ResultFormat <- function() {
  structure(list(), class = "ResultFormat")
}
|
||
|
||
#' Construct the default `ElementType` enum value
#'
#' Takes no arguments. The value is an empty list tagged with the S3 class
#' `"ElementType"`; no variant name is stored in it.
#'
#' @return An empty list of class `"ElementType"`.
#' @export
ElementType <- function() {
  structure(list(), class = "ElementType")
}
|
||
|
||
#' Construct the default `TextDirection` enum value
#'
#' Takes no arguments. The value is an empty list tagged with the S3 class
#' `"TextDirection"`; no variant name is stored in it.
#'
#' @return An empty list of class `"TextDirection"`.
#' @export
TextDirection <- function() {
  structure(list(), class = "TextDirection")
}
|
||
|
||
#' Construct the default `LinkType` enum value
#'
#' Takes no arguments. The value is an empty list tagged with the S3 class
#' `"LinkType"`; no variant name is stored in it.
#'
#' @return An empty list of class `"LinkType"`.
#' @export
LinkType <- function() {
  structure(list(), class = "LinkType")
}
|
||
|
||
#' Construct the default `ImageType` enum value
#'
#' Takes no arguments. The value is an empty list tagged with the S3 class
#' `"ImageType"`; no variant name is stored in it.
#'
#' @return An empty list of class `"ImageType"`.
#' @export
ImageType <- function() {
  structure(list(), class = "ImageType")
}
|
||
|
||
#' Construct the default `StructuredDataType` enum value
#'
#' Takes no arguments. The value is an empty list tagged with the S3 class
#' `"StructuredDataType"`; no variant name is stored in it.
#'
#' @return An empty list of class `"StructuredDataType"`.
#' @export
StructuredDataType <- function() {
  structure(list(), class = "StructuredDataType")
}
|
||
|
||
#' Construct the default `OcrElementLevel` enum value
#'
#' Takes no arguments. The value is an empty list tagged with the S3 class
#' `"OcrElementLevel"`; no variant name is stored in it.
#'
#' @return An empty list of class `"OcrElementLevel"`.
#' @export
OcrElementLevel <- function() {
  structure(list(), class = "OcrElementLevel")
}
|
||
|
||
#' Construct the default `PageUnitType` enum value
#'
#' Takes no arguments. The value is an empty list tagged with the S3 class
#' `"PageUnitType"`; no variant name is stored in it.
#'
#' @return An empty list of class `"PageUnitType"`.
#' @export
PageUnitType <- function() {
  structure(list(), class = "PageUnitType")
}
|
||
|
||
#' Construct the default `RevisionKind` enum value
#'
#' Takes no arguments. The value is an empty list tagged with the S3 class
#' `"RevisionKind"`; no variant name is stored in it.
#'
#' @return An empty list of class `"RevisionKind"`.
#' @export
RevisionKind <- function() {
  structure(list(), class = "RevisionKind")
}
|
||
|
||
#' Construct the default `UriKind` enum value
#'
#' Takes no arguments. The value is an empty list tagged with the S3 class
#' `"UriKind"`; no variant name is stored in it.
#'
#' @return An empty list of class `"UriKind"`.
#' @export
UriKind <- function() {
  structure(list(), class = "UriKind")
}
|
||
|
||
#' Construct the default `KeywordAlgorithm` enum value
#'
#' Takes no arguments. The value is an empty list tagged with the S3 class
#' `"KeywordAlgorithm"`; no variant name is stored in it.
#'
#' @return An empty list of class `"KeywordAlgorithm"`.
#' @export
KeywordAlgorithm <- function() {
  structure(list(), class = "KeywordAlgorithm")
}
|
||
|
||
#' Construct the default `PSMMode` enum value
#'
#' Takes no arguments. The value is an empty list tagged with the S3 class
#' `"PSMMode"`; no variant name is stored in it.
#'
#' @return An empty list of class `"PSMMode"`.
#' @export
PSMMode <- function() {
  structure(list(), class = "PSMMode")
}
|
||
|
||
#' Construct the default `PaddleLanguage` enum value
#'
#' Takes no arguments. The value is an empty list tagged with the S3 class
#' `"PaddleLanguage"`; no variant name is stored in it.
#'
#' @return An empty list of class `"PaddleLanguage"`.
#' @export
PaddleLanguage <- function() {
  structure(list(), class = "PaddleLanguage")
}
|
||
|
||
#' Construct the default `LayoutClass` enum value
#'
#' Takes no arguments. The value is an empty list tagged with the S3 class
#' `"LayoutClass"`; no variant name is stored in it.
#'
#' @return An empty list of class `"LayoutClass"`.
#' @export
LayoutClass <- function() {
  structure(list(), class = "LayoutClass")
}
|
||
|
||
#' How chunk size is measured
#'
#' Defaults to `Characters` (Unicode character count). When using token-based sizing,
#' chunks are sized by token count according to the specified tokenizer.
#'
#' Token-based sizing uses HuggingFace tokenizers loaded at runtime. Any tokenizer
#' available on HuggingFace Hub can be used, including OpenAI-compatible tokenizers
#' (e.g., `Xenova/gpt-4o`, `Xenova/cl100k_base`).
#' @export
ChunkSizing <- new.env(parent = emptyenv())

# Zero-argument constructor backed by `wrap__ChunkSizing__default`.
ChunkSizing$default <- function() {
  .Call("wrap__ChunkSizing__default", PACKAGE = "kreuzberg")
}

# Passes `json` unchanged to the native `wrap__ChunkSizing__from_json`
# routine and returns its result.
ChunkSizing$from_json <- function(json) {
  .Call("wrap__ChunkSizing__from_json", json, PACKAGE = "kreuzberg")
}

# `$` accessor for objects of class "ChunkSizing": `name` is resolved in the
# `ChunkSizing` environment; members taking `self` first are returned with the
# object pre-bound, other lookup results are returned untouched.
#' @export
`$.ChunkSizing` <- function(self, name) {
  member <- ChunkSizing[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` reuses the `$` implementation.
#' @export
`[[.ChunkSizing` <- `$.ChunkSizing`
|
||
#' Embedding model types supported by Kreuzberg
#'
#' @export
EmbeddingModelType <- new.env(parent = emptyenv())

# Zero-argument constructor backed by `wrap__EmbeddingModelType__default`.
EmbeddingModelType$default <- function() {
  .Call("wrap__EmbeddingModelType__default", PACKAGE = "kreuzberg")
}

# Passes `json` unchanged to the native `wrap__EmbeddingModelType__from_json`
# routine and returns its result.
EmbeddingModelType$from_json <- function(json) {
  .Call("wrap__EmbeddingModelType__from_json", json, PACKAGE = "kreuzberg")
}

# `$` accessor for objects of class "EmbeddingModelType": `name` is resolved
# in the `EmbeddingModelType` environment; members taking `self` first are
# returned with the object pre-bound, other results are returned untouched.
#' @export
`$.EmbeddingModelType` <- function(self, name) {
  member <- EmbeddingModelType[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` reuses the `$` implementation.
#' @export
`[[.EmbeddingModelType` <- `$.EmbeddingModelType`
|
||
#' Tagged enum for node content. Each variant carries only type-specific data
#'
#' Uses `#[serde(tag = "node_type")]` to avoid "type" keyword collision in
#' Go/Java/TypeScript bindings.
#' @export
NodeContent <- new.env(parent = emptyenv())

# Zero-argument constructor backed by `wrap__NodeContent__default`.
NodeContent$default <- function() {
  .Call("wrap__NodeContent__default", PACKAGE = "kreuzberg")
}

# Passes `json` unchanged to the native `wrap__NodeContent__from_json`
# routine and returns its result.
NodeContent$from_json <- function(json) {
  .Call("wrap__NodeContent__from_json", json, PACKAGE = "kreuzberg")
}

# `$` accessor for objects of class "NodeContent": `name` is resolved in the
# `NodeContent` environment; members taking `self` first are returned with the
# object pre-bound, other lookup results are returned untouched.
#' @export
`$.NodeContent` <- function(self, name) {
  member <- NodeContent[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` reuses the `$` implementation.
#' @export
`[[.NodeContent` <- `$.NodeContent`
|
||
#' Types of inline text annotations
#'
#' @export
AnnotationKind <- new.env(parent = emptyenv())

# Zero-argument constructor backed by `wrap__AnnotationKind__default`.
AnnotationKind$default <- function() {
  .Call("wrap__AnnotationKind__default", PACKAGE = "kreuzberg")
}

# Passes `json` unchanged to the native `wrap__AnnotationKind__from_json`
# routine and returns its result.
AnnotationKind$from_json <- function(json) {
  .Call("wrap__AnnotationKind__from_json", json, PACKAGE = "kreuzberg")
}

# `$` accessor for objects of class "AnnotationKind": `name` is resolved in
# the `AnnotationKind` environment; members taking `self` first are returned
# with the object pre-bound, other lookup results are returned untouched.
#' @export
`$.AnnotationKind` <- function(self, name) {
  member <- AnnotationKind[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` reuses the `$` implementation.
#' @export
`[[.AnnotationKind` <- `$.AnnotationKind`
|
||
#' Bounding geometry for an OCR element
#'
#' Supports both axis-aligned rectangles (from Tesseract) and 4-point quadrilaterals
#' (from PaddleOCR and rotated text detection).
#' @export
OcrBoundingGeometry <- new.env(parent = emptyenv())

# Zero-argument constructor backed by `wrap__OcrBoundingGeometry__default`.
OcrBoundingGeometry$default <- function() {
  .Call("wrap__OcrBoundingGeometry__default", PACKAGE = "kreuzberg")
}

# Passes `json` unchanged to the native `wrap__OcrBoundingGeometry__from_json`
# routine and returns its result.
OcrBoundingGeometry$from_json <- function(json) {
  .Call("wrap__OcrBoundingGeometry__from_json", json, PACKAGE = "kreuzberg")
}

# `$` accessor for objects of class "OcrBoundingGeometry": `name` is resolved
# in the `OcrBoundingGeometry` environment; members taking `self` first are
# returned with the object pre-bound, other results are returned untouched.
#' @export
`$.OcrBoundingGeometry` <- function(self, name) {
  member <- OcrBoundingGeometry[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` reuses the `$` implementation.
#' @export
`[[.OcrBoundingGeometry` <- `$.OcrBoundingGeometry`
|
||
#' Best-effort document location for a revision
#'
#' @export
RevisionAnchor <- new.env(parent = emptyenv())

# Zero-argument constructor backed by `wrap__RevisionAnchor__default`.
RevisionAnchor$default <- function() {
  .Call("wrap__RevisionAnchor__default", PACKAGE = "kreuzberg")
}

# Passes `json` unchanged to the native `wrap__RevisionAnchor__from_json`
# routine and returns its result.
RevisionAnchor$from_json <- function(json) {
  .Call("wrap__RevisionAnchor__from_json", json, PACKAGE = "kreuzberg")
}

# `$` accessor for objects of class "RevisionAnchor": `name` is resolved in
# the `RevisionAnchor` environment; members taking `self` first are returned
# with the object pre-bound, other lookup results are returned untouched.
#' @export
`$.RevisionAnchor` <- function(self, name) {
  member <- RevisionAnchor[[name]]
  if (!identical(names(formals(member))[1], "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` reuses the `$` implementation.
#' @export
`[[.RevisionAnchor` <- `$.RevisionAnchor`
|
||
#' S3 generic `cors_allows_all`
#'
#' @param x Object whose class selects the method.
#' @param ... Further arguments forwarded to the selected method.
#' @return Whatever the selected method returns.
#' @export
cors_allows_all <- function(x, ...) {
  UseMethod("cors_allows_all")
}
|
||
#' S3 generic `is_empty`
#'
#' @param x Object whose class selects the method.
#' @param ... Further arguments forwarded to the selected method.
#' @return Whatever the selected method returns.
#' @export
is_empty <- function(x, ...) {
  UseMethod("is_empty")
}
|
||
#' S3 generic `is_origin_allowed`
#'
#' @param x Object whose class selects the method.
#' @param ... Further arguments forwarded to the selected method.
#' @return Whatever the selected method returns.
#' @export
is_origin_allowed <- function(x, ...) {
  UseMethod("is_origin_allowed")
}
|
||
#' S3 generic `listen_addr`
#'
#' @param x Object whose class selects the method.
#' @param ... Further arguments forwarded to the selected method.
#' @return Whatever the selected method returns.
#' @export
listen_addr <- function(x, ...) {
  UseMethod("listen_addr")
}
|
||
#' S3 generic `max_multipart_field_mb`
#'
#' @param x Object whose class selects the method.
#' @param ... Further arguments forwarded to the selected method.
#' @return Whatever the selected method returns.
#' @export
max_multipart_field_mb <- function(x, ...) {
  UseMethod("max_multipart_field_mb")
}
|
||
#' S3 generic `max_request_body_mb`
#'
#' @param x Object whose class selects the method.
#' @param ... Further arguments forwarded to the selected method.
#' @return Whatever the selected method returns.
#' @export
max_request_body_mb <- function(x, ...) {
  UseMethod("max_request_body_mb")
}
|
||
#' S3 generic `needs_image_processing`
#'
#' @param x Object whose class selects the method.
#' @param ... Further arguments forwarded to the selected method.
#' @return Whatever the selected method returns.
#' @export
needs_image_processing <- function(x, ...) {
  UseMethod("needs_image_processing")
}
|
||
#' S3 generic `with_angle_cls`
#'
#' @param x Object whose class selects the method.
#' @param ... Further arguments forwarded to the selected method.
#' @return Whatever the selected method returns.
#' @export
with_angle_cls <- function(x, ...) {
  UseMethod("with_angle_cls")
}
|
||
#' S3 generic `with_cache_dir`
#'
#' @param x Object whose class selects the method.
#' @param ... Further arguments forwarded to the selected method.
#' @return Whatever the selected method returns.
#' @export
with_cache_dir <- function(x, ...) {
  UseMethod("with_cache_dir")
}
|
||
#' S3 generic `with_det_db_box_thresh`
#'
#' @param x Object whose class selects the method.
#' @param ... Further arguments forwarded to the selected method.
#' @return Whatever the selected method returns.
#' @export
with_det_db_box_thresh <- function(x, ...) {
  UseMethod("with_det_db_box_thresh")
}
|
||
#' S3 generic `with_det_db_thresh`
#'
#' @param x Object whose class selects the method.
#' @param ... Further arguments forwarded to the selected method.
#' @return Whatever the selected method returns.
#' @export
with_det_db_thresh <- function(x, ...) {
  UseMethod("with_det_db_thresh")
}
|
||
#' S3 generic `with_det_db_unclip_ratio`
#'
#' @param x Object whose class selects the method.
#' @param ... Further arguments forwarded to the selected method.
#' @return Whatever the selected method returns.
#' @export
with_det_db_unclip_ratio <- function(x, ...) {
  UseMethod("with_det_db_unclip_ratio")
}
|
||
#' S3 generic `with_det_limit_side_len`
#'
#' @param x Object whose class selects the method.
#' @param ... Further arguments forwarded to the selected method.
#' @return Whatever the selected method returns.
#' @export
with_det_limit_side_len <- function(x, ...) {
  UseMethod("with_det_limit_side_len")
}
|
||
#' S3 generic `with_drop_score`
#'
#' @param x Object whose class selects the method.
#' @param ... Further arguments forwarded to the selected method.
#' @return Whatever the selected method returns.
#' @export
with_drop_score <- function(x, ...) {
  UseMethod("with_drop_score")
}
|
||
#' S3 generic `with_model_tier`
#'
#' @param x Object whose class selects the method.
#' @param ... Further arguments forwarded to the selected method.
#' @return Whatever the selected method returns.
#' @export
with_model_tier <- function(x, ...) {
  UseMethod("with_model_tier")
}
|
||
#' S3 generic `with_padding`
#'
#' @param x Object whose class selects the method.
#' @param ... Further arguments forwarded to the selected method.
#' @return Whatever the selected method returns.
#' @export
with_padding <- function(x, ...) {
  UseMethod("with_padding")
}
|
||
#' S3 generic `with_rec_batch_num`
#'
#' @param x Object whose class selects the method.
#' @param ... Further arguments forwarded to the selected method.
#' @return Whatever the selected method returns.
#' @export
with_rec_batch_num <- function(x, ...) {
  UseMethod("with_rec_batch_num")
}
|
||
#' S3 generic `with_table_detection`
#'
#' @param x Object whose class selects the method.
#' @param ... Further arguments forwarded to the selected method.
#' @return Whatever the selected method returns.
#' @export
with_table_detection <- function(x, ...) {
  UseMethod("with_table_detection")
}
|