3519 lines
135 KiB
R
3519 lines
135 KiB
R
|
|
# Generated by extendr: Do not edit by hand
|
|||
|
|
#
|
|||
|
|
# This file is regenerated by alef on every `alef generate` run.
|
|||
|
|
# It mirrors the output of `rextendr::document()` and binds every
|
|||
|
|
# wrap__<symbol> entry registered in extendr_module! to an R-callable
|
|||
|
|
# function or class env.
|
|||
|
|
|
|||
|
|
#' @useDynLib kreuzberg, .registration = TRUE
|
|||
|
|
NULL
|
|||
|
|
|
|||
|
|
#' Extract content from a byte array
|
|||
|
|
#'
|
|||
|
|
#' This is the main entry point for in-memory extraction. It performs the following steps:
|
|||
|
|
#' 1. Validate MIME type
|
|||
|
|
#' 2. Handle legacy format conversion if needed
|
|||
|
|
#' 3. Select appropriate extractor from registry
|
|||
|
|
#' 4. Extract content
|
|||
|
|
#' 5. Run post-processing pipeline
|
|||
|
|
#' @param content The byte array to extract.
|
|||
|
|
#' @param mime_type MIME type of the content.
|
|||
|
|
#' @param config Extraction configuration.
|
|||
|
|
#' @return An `ExtractionResult` containing the extracted content and metadata.
|
|||
|
|
#'
|
|||
|
|
#' @section Errors:
|
|||
|
|
#' Returns `KreuzbergError::Validation` if MIME type is invalid.
|
|||
|
|
#' Returns `KreuzbergError::UnsupportedFormat` if MIME type is not supported.
|
|||
|
|
#' @export
|
|||
|
|
extract_bytes <- function(content,
                          mime_type,
                          config = ExtractionConfig$default()) {
  # Forward the arguments, in order, to the native `wrap__extract_bytes`
  # routine looked up in the kreuzberg shared library.
  .Call(
    "wrap__extract_bytes",
    content,
    mime_type,
    config,
    PACKAGE = "kreuzberg"
  )
}
|
|||
|
|
#' Extract content from a file
|
|||
|
|
#'
|
|||
|
|
#' This is the main entry point for file-based extraction. It performs the following steps:
|
|||
|
|
#' 1. Check cache for existing result (if caching enabled)
|
|||
|
|
#' 2. Detect or validate MIME type
|
|||
|
|
#' 3. Select appropriate extractor from registry
|
|||
|
|
#' 4. Extract content
|
|||
|
|
#' 5. Run post-processing pipeline
|
|||
|
|
#' 6. Store result in cache (if caching enabled)
|
|||
|
|
#' @param path Path to the file to extract.
|
|||
|
|
#' @param mime_type Optional MIME type override. If `NULL` (the default), the MIME type is auto-detected.
|
|||
|
|
#' @param config Extraction configuration.
|
|||
|
|
#' @return An `ExtractionResult` containing the extracted content and metadata.
|
|||
|
|
#'
|
|||
|
|
#' @section Errors:
|
|||
|
|
#' Returns `KreuzbergError::Io` if the file doesn't exist (NotFound) or for other file I/O errors.
|
|||
|
|
#' Returns `KreuzbergError::UnsupportedFormat` if MIME type is not supported.
|
|||
|
|
#' @export
|
|||
|
|
extract_file <- function(path,
                         mime_type = NULL,
                         config = ExtractionConfig$default()) {
  # Thin binding: the native `wrap__extract_file` routine receives the
  # arguments exactly as given.
  .Call(
    "wrap__extract_file",
    path,
    mime_type,
    config,
    PACKAGE = "kreuzberg"
  )
}
|
|||
|
|
#' Synchronous wrapper for `extract_file`
|
|||
|
|
#'
|
|||
|
|
#' This is a convenience function that blocks the current thread until extraction completes.
|
|||
|
|
#' For async code, use `extract_file` directly.
|
|||
|
|
#'
|
|||
|
|
#' Uses the global Tokio runtime for 100x+ performance improvement over creating
|
|||
|
|
#' a new runtime per call. Always uses the global runtime to avoid nested runtime issues.
|
|||
|
|
#'
|
|||
|
|
#' This function is only available with the `tokio-runtime` feature. For WASM targets,
|
|||
|
|
#' use a truly synchronous extraction approach instead.
|
|||
|
|
#' @param path File path as character string.
|
|||
|
|
#' @param mime_type Character string.
|
|||
|
|
#' @param config ExtractionConfig object (list with class attribute).
|
|||
|
|
#' @return ExtractionResult object (list with class attribute).
|
|||
|
|
#' @export
|
|||
|
|
extract_file_sync <- function(path,
                              mime_type = NULL,
                              config = ExtractionConfig$default()) {
  # Delegate to the native `wrap__extract_file_sync` entry point.
  .Call(
    "wrap__extract_file_sync",
    path,
    mime_type,
    config,
    PACKAGE = "kreuzberg"
  )
}
|
|||
|
|
#' Synchronous wrapper for `extract_bytes`
|
|||
|
|
#'
|
|||
|
|
#' Uses the global Tokio runtime for 100x+ performance improvement over creating
|
|||
|
|
#' a new runtime per call.
|
|||
|
|
#'
|
|||
|
|
#' With the `tokio-runtime` feature, this blocks the current thread using the global
|
|||
|
|
#' Tokio runtime. Without it (WASM), this calls a truly synchronous implementation.
|
|||
|
|
#' @param content Raw vector of bytes.
|
|||
|
|
#' @param mime_type Character string.
|
|||
|
|
#' @param config ExtractionConfig object (list with class attribute).
|
|||
|
|
#' @return ExtractionResult object (list with class attribute).
|
|||
|
|
#' @export
|
|||
|
|
extract_bytes_sync <- function(content,
                               mime_type,
                               config = ExtractionConfig$default()) {
  # Delegate to the native `wrap__extract_bytes_sync` entry point.
  .Call(
    "wrap__extract_bytes_sync",
    content,
    mime_type,
    config,
    PACKAGE = "kreuzberg"
  )
}
|
|||
|
|
#' Synchronous wrapper for `batch_extract_files`
|
|||
|
|
#'
|
|||
|
|
#' Uses the global Tokio runtime for optimal performance.
|
|||
|
|
#' Only available with `tokio-runtime` (WASM has no filesystem).
|
|||
|
|
#' @param items List of `BatchFileItem` objects (lists with a class attribute).
|
|||
|
|
#' @param config ExtractionConfig object (list with class attribute).
|
|||
|
|
#' @return List of `ExtractionResult` objects (lists with a class attribute).
|
|||
|
|
#' @export
|
|||
|
|
batch_extract_files_sync <- function(items,
                                     config = ExtractionConfig$default()) {
  # Pass the batch and its configuration to `wrap__batch_extract_files_sync`.
  .Call(
    "wrap__batch_extract_files_sync",
    items,
    config,
    PACKAGE = "kreuzberg"
  )
}
|
|||
|
|
#' Synchronous wrapper for `batch_extract_bytes`
|
|||
|
|
#'
|
|||
|
|
#' Uses the global Tokio runtime for optimal performance.
|
|||
|
|
#' With the `tokio-runtime` feature, this blocks the current thread using the global
|
|||
|
|
#' Tokio runtime. Without it (WASM), this calls a truly synchronous implementation
|
|||
|
|
#' that iterates through items and calls `extract_bytes_sync()`.
|
|||
|
|
#' @param items List of `BatchBytesItem` objects (lists with a class attribute).
|
|||
|
|
#' @param config ExtractionConfig object (list with class attribute).
|
|||
|
|
#' @return List of `ExtractionResult` objects (lists with a class attribute).
|
|||
|
|
#' @export
|
|||
|
|
batch_extract_bytes_sync <- function(items,
                                     config = ExtractionConfig$default()) {
  # Pass the batch and its configuration to `wrap__batch_extract_bytes_sync`.
  .Call(
    "wrap__batch_extract_bytes_sync",
    items,
    config,
    PACKAGE = "kreuzberg"
  )
}
|
|||
|
|
#' Extract content from multiple files concurrently
|
|||
|
|
#'
|
|||
|
|
#' This function processes multiple files in parallel, automatically managing
|
|||
|
|
#' concurrency to prevent resource exhaustion. The concurrency limit can be
|
|||
|
|
#' configured via `ExtractionConfig::max_concurrent_extractions` or defaults
|
|||
|
|
#' to `(num_cpus * 1.5).ceil()`.
|
|||
|
|
#'
|
|||
|
|
#' Each file can optionally specify a [`FileExtractionConfig`] that overrides specific
|
|||
|
|
#' fields from the batch-level `config`. Pass `None` for a file to use the batch defaults.
|
|||
|
|
#' Batch-level settings like `max_concurrent_extractions` and `use_cache` are always
|
|||
|
|
#' taken from the batch-level `config`.
|
|||
|
|
#' @param items Vector of `BatchFileItem` structs, each containing a path and optional per-file configuration overrides.
|
|||
|
|
#' @param config Batch-level extraction configuration (provides defaults and batch settings).
|
|||
|
|
#' @return A vector of `ExtractionResult` in the same order as the input items.
|
|||
|
|
#'
|
|||
|
|
#' @section Errors:
|
|||
|
|
#' Individual file errors are captured in the result metadata. System errors
|
|||
|
|
#' (IO, RuntimeError equivalents) will bubble up and fail the entire batch.
|
|||
|
|
#' @export
|
|||
|
|
batch_extract_files <- function(items,
                                config = ExtractionConfig$default()) {
  # Native binding: both arguments go unchanged to `wrap__batch_extract_files`.
  .Call(
    "wrap__batch_extract_files",
    items,
    config,
    PACKAGE = "kreuzberg"
  )
}
|
|||
|
|
#' Extract content from multiple byte arrays concurrently
|
|||
|
|
#'
|
|||
|
|
#' This function processes multiple byte arrays in parallel, automatically managing
|
|||
|
|
#' concurrency to prevent resource exhaustion. The concurrency limit can be
|
|||
|
|
#' configured via `ExtractionConfig::max_concurrent_extractions` or defaults
|
|||
|
|
#' to `(num_cpus * 1.5).ceil()`.
|
|||
|
|
#'
|
|||
|
|
#' Each item can optionally specify a [`FileExtractionConfig`] that overrides specific
|
|||
|
|
#' fields from the batch-level `config`. Pass `None` as the config to use
|
|||
|
|
#' the batch-level defaults for that item.
|
|||
|
|
#' @param items Vector of `BatchBytesItem` structs, each containing content bytes, MIME type, and optional per-item configuration overrides.
|
|||
|
|
#' @param config Batch-level extraction configuration.
|
|||
|
|
#' @return A vector of `ExtractionResult` in the same order as the input items.
|
|||
|
|
#' @export
|
|||
|
|
batch_extract_bytes <- function(items,
                                config = ExtractionConfig$default()) {
  # Native binding: both arguments go unchanged to `wrap__batch_extract_bytes`.
  .Call(
    "wrap__batch_extract_bytes",
    items,
    config,
    PACKAGE = "kreuzberg"
  )
}
|
|||
|
|
#' Detect MIME type from raw file bytes
|
|||
|
|
#'
|
|||
|
|
#' Uses magic byte signatures to detect file type from content.
|
|||
|
|
#' Falls back to `infer` crate for comprehensive detection.
|
|||
|
|
#'
|
|||
|
|
#' For ZIP-based files, inspects contents to distinguish Office Open XML
|
|||
|
|
#' formats (DOCX, XLSX, PPTX) from plain ZIP archives.
|
|||
|
|
#' @param content Raw file bytes.
|
|||
|
|
#' @return The detected MIME type string.
|
|||
|
|
#'
|
|||
|
|
#' @section Errors:
|
|||
|
|
#' Returns `KreuzbergError::UnsupportedFormat` if MIME type cannot be determined.
|
|||
|
|
#' @export
|
|||
|
|
detect_mime_type_from_bytes <- function(content) {
  # Hand `content` to the native `wrap__detect_mime_type_from_bytes` routine.
  .Call(
    "wrap__detect_mime_type_from_bytes",
    content,
    PACKAGE = "kreuzberg"
  )
}
|
|||
|
|
#' Get file extensions for a given MIME type
|
|||
|
|
#'
|
|||
|
|
#' Returns all known file extensions that map to the specified MIME type.
|
|||
|
|
#' @param mime_type The MIME type to look up.
|
|||
|
|
#' @return A vector of file extensions (without leading dot) for the MIME type.
|
|||
|
|
#' @export
|
|||
|
|
get_extensions_for_mime <- function(mime_type) {
  # Hand `mime_type` to the native `wrap__get_extensions_for_mime` routine.
  .Call(
    "wrap__get_extensions_for_mime",
    mime_type,
    PACKAGE = "kreuzberg"
  )
}
|
|||
|
|
#' List the names of all registered embedding backends
|
|||
|
|
#'
|
|||
|
|
#' Used by `kreuzberg-cli`, the api/mcp endpoints, and generated language
|
|||
|
|
#' bindings.
|
|||
|
|
#' @return List of character string.
|
|||
|
|
#' @export
|
|||
|
|
list_embedding_backends <- function() {
  # No arguments: simply invoke the native `wrap__list_embedding_backends`.
  .Call("wrap__list_embedding_backends", PACKAGE = "kreuzberg")
}
|
|||
|
|
#' List names of all registered document extractors
|
|||
|
|
#' @return List of character string.
|
|||
|
|
#' @export
|
|||
|
|
list_document_extractors <- function() {
  # No arguments: simply invoke the native `wrap__list_document_extractors`.
  .Call("wrap__list_document_extractors", PACKAGE = "kreuzberg")
}
|
|||
|
|
#' List all registered OCR backends
|
|||
|
|
#'
|
|||
|
|
#' Returns the names of all OCR backends currently registered in the global registry.
|
|||
|
|
#' @return A vector of OCR backend names.
|
|||
|
|
#' @export
|
|||
|
|
list_ocr_backends <- function() {
  # No arguments: simply invoke the native `wrap__list_ocr_backends`.
  .Call("wrap__list_ocr_backends", PACKAGE = "kreuzberg")
}
|
|||
|
|
#' List all registered post-processor names
|
|||
|
|
#'
|
|||
|
|
#' Returns a vector of all post-processor names currently registered in the
|
|||
|
|
#' global registry.
|
|||
|
|
#' @return Vector of post-processor names. An error is reported if the
#'   registry lock is poisoned.
|
|||
|
|
#' @export
|
|||
|
|
list_post_processors <- function() {
  # No arguments: simply invoke the native `wrap__list_post_processors`.
  .Call("wrap__list_post_processors", PACKAGE = "kreuzberg")
}
|
|||
|
|
#' List names of all registered renderers
|
|||
|
|
#' @return List of character string.
|
|||
|
|
#'
|
|||
|
|
#' @section Errors:
|
|||
|
|
#' Returns an error if the registry lock is poisoned.
|
|||
|
|
#' @export
|
|||
|
|
list_renderers <- function() {
  # No arguments: simply invoke the native `wrap__list_renderers`.
  .Call("wrap__list_renderers", PACKAGE = "kreuzberg")
}
|
|||
|
|
#' List names of all registered validators
|
|||
|
|
#' @return List of character string.
|
|||
|
|
#' @export
|
|||
|
|
list_validators <- function() {
  # No arguments: simply invoke the native `wrap__list_validators`.
  .Call("wrap__list_validators", PACKAGE = "kreuzberg")
}
|
|||
|
|
#' Compare two extraction results and return a structured diff
|
|||
|
|
#'
|
|||
|
|
#' The comparison is purely structural — no I/O, no side effects. All fields
|
|||
|
|
#' of [`ExtractionDiff`] are populated according to the provided [`DiffOptions`].
|
|||
|
|
#' @param a The "before" extraction result.
|
|||
|
|
#' @param b The "after" extraction result.
|
|||
|
|
#' @param opts Controls which sections are compared and optional truncation.
|
|||
|
|
#' @return ExtractionDiff object (list with class attribute).
|
|||
|
|
#' @export
|
|||
|
|
compare <- function(a = ExtractionResult$default(),
                    b = ExtractionResult$default(),
                    opts = DiffOptions$default()) {
  # Forward both results and the options to the native `wrap__compare` routine.
  .Call(
    "wrap__compare",
    a,
    b,
    opts,
    PACKAGE = "kreuzberg"
  )
}
|
|||
|
|
#' Generate embeddings asynchronously for a list of text strings
|
|||
|
|
#'
|
|||
|
|
#' This is the async counterpart to [`embed_texts`]. It offloads the blocking
|
|||
|
|
#' ONNX inference work to a dedicated blocking thread pool via Tokio's
|
|||
|
|
#' `spawn_blocking`, keeping the async executor free.
|
|||
|
|
#'
|
|||
|
|
#' Returns one embedding vector per input text in the same order.
|
|||
|
|
#' @param texts Vec of strings to embed (owned, sent to blocking thread).
|
|||
|
|
#' @param config Embedding configuration specifying model, batch size, and normalization.
|
|||
|
|
#' @return List of list of numeric.
|
|||
|
|
#'
|
|||
|
|
#' @section Errors:
|
|||
|
|
#' - `KreuzbergError::MissingDependency` if ONNX Runtime is not installed
|
|||
|
|
#' - `KreuzbergError::Embedding` if the preset name is unknown, model download fails,
|
|||
|
|
#' or the blocking inference task panics
|
|||
|
|
#' @export
|
|||
|
|
embed_texts_async <- function(texts,
                              config = EmbeddingConfig$default()) {
  # Delegate to the native `wrap__embed_texts_async` entry point.
  .Call(
    "wrap__embed_texts_async",
    texts,
    config,
    PACKAGE = "kreuzberg"
  )
}
|
|||
|
|
#' Render a single PDF page to PNG bytes
|
|||
|
|
#'
|
|||
|
|
#' Returns raw PNG-encoded bytes for the specified page at the given DPI.
|
|||
|
|
#' Uses pdf_oxide with tiny-skia for pure-Rust rendering.
|
|||
|
|
#' @param pdf_bytes Raw PDF file bytes.
|
|||
|
|
#' @param page_index Zero-based page index.
|
|||
|
|
#' @param dpi Resolution in dots per inch (default: 150).
|
|||
|
|
#' @param password Optional password for encrypted PDFs.
|
|||
|
|
#' @return Raw vector of bytes.
|
|||
|
|
#'
|
|||
|
|
#' @section Errors:
|
|||
|
|
#' Returns `KreuzbergError::Parsing` if the PDF cannot be opened, authenticated,
|
|||
|
|
#' or rendered, or if `page_index` is out of range.
|
|||
|
|
#' @export
|
|||
|
|
render_pdf_page_to_png <- function(pdf_bytes,
                                   page_index,
                                   dpi = NULL,
                                   password = NULL) {
  # All four arguments (including NULL defaults) go unchanged to the native
  # `wrap__render_pdf_page_to_png` routine.
  .Call(
    "wrap__render_pdf_page_to_png",
    pdf_bytes,
    page_index,
    dpi,
    password,
    PACKAGE = "kreuzberg"
  )
}
|
|||
|
|
#' Detect the MIME type of a file at the given path
|
|||
|
|
#'
|
|||
|
|
#' Uses the file extension and optionally the file content to determine the MIME type.
|
|||
|
|
#' Set `check_exists` to `TRUE` to verify the file exists before detection.
|
|||
|
|
#' @param path Character string.
|
|||
|
|
#' @param check_exists Logical (TRUE/FALSE).
|
|||
|
|
#' @return Character string.
|
|||
|
|
#' @export
|
|||
|
|
detect_mime_type <- function(path, check_exists) {
  # Both arguments are required; they are passed straight to the native
  # `wrap__detect_mime_type` routine.
  .Call(
    "wrap__detect_mime_type",
    path,
    check_exists,
    PACKAGE = "kreuzberg"
  )
}
|
|||
|
|
#' Embed a list of texts using the configured embedding model
|
|||
|
|
#'
|
|||
|
|
#' Returns a 2D vector where each inner vector is the embedding for the corresponding text.
|
|||
|
|
#' @param texts List of character string.
|
|||
|
|
#' @param config EmbeddingConfig object (list with class attribute).
|
|||
|
|
#' @return List of list of numeric.
|
|||
|
|
#' @export
|
|||
|
|
embed_texts <- function(texts,
                        config = EmbeddingConfig$default()) {
  # Delegate to the native `wrap__embed_texts` entry point.
  .Call(
    "wrap__embed_texts",
    texts,
    config,
    PACKAGE = "kreuzberg"
  )
}
|
|||
|
|
#' Get an embedding preset by name
|
|||
|
|
#'
|
|||
|
|
#' Returns `None` if no preset with the given name exists. Returns an owned
|
|||
|
|
#' clone so the value is safe to pass across FFI boundaries.
|
|||
|
|
#' @param name Character string.
|
|||
|
|
#' @return EmbeddingPreset object (list with class attribute), or `NULL` if no preset with the given name exists.
|
|||
|
|
#' @export
|
|||
|
|
get_embedding_preset <- function(name) {
  # Hand `name` to the native `wrap__get_embedding_preset` routine.
  .Call(
    "wrap__get_embedding_preset",
    name,
    PACKAGE = "kreuzberg"
  )
}
|
|||
|
|
#' List the names of all available embedding presets
|
|||
|
|
#'
|
|||
|
|
#' Returns owned `String`s so the values are safe to pass across FFI boundaries.
|
|||
|
|
#' @return List of character string.
|
|||
|
|
#' @export
|
|||
|
|
list_embedding_presets <- function() {
  # No arguments: simply invoke the native `wrap__list_embedding_presets`.
  .Call("wrap__list_embedding_presets", PACKAGE = "kreuzberg")
}
|
|||
|
|
#' register_ocr_backend
|
|||
|
|
#'
|
|||
|
|
#' Register an R-side plugin implementation. Pass a named list whose entries
|
|||
|
|
#' implement the trait's required methods (e.g. `list(name = function() "my", ...)`).
|
|||
|
|
#'
|
|||
|
|
#' @param r_backend Named list of R closures implementing the trait surface.
|
|||
|
|
#'
|
|||
|
|
#' @return Invisible NULL on success; raises an R error on failure.
|
|||
|
|
#' @export
|
|||
|
|
register_ocr_backend <- function(r_backend) {
  # Pass the backend description to the native `wrap__register_ocr_backend`.
  .Call(
    "wrap__register_ocr_backend",
    r_backend,
    PACKAGE = "kreuzberg"
  )
}
|
|||
|
|
#' unregister_ocr_backend
|
|||
|
|
#'
|
|||
|
|
#' Unregister a previously registered plugin by name.
|
|||
|
|
#'
|
|||
|
|
#' @param name Plugin name string as returned by the backend's `name()` method.
|
|||
|
|
#'
|
|||
|
|
#' @return Invisible NULL on success; raises an R error on failure.
|
|||
|
|
#' @export
|
|||
|
|
unregister_ocr_backend <- function(name) {
  # Pass the plugin name to the native `wrap__unregister_ocr_backend`.
  .Call(
    "wrap__unregister_ocr_backend",
    name,
    PACKAGE = "kreuzberg"
  )
}
|
|||
|
|
#' clear_ocr_backends
|
|||
|
|
#'
|
|||
|
|
#' Remove every registered plugin of this type. Typically used in test teardown.
|
|||
|
|
#'
|
|||
|
|
#' @return Invisible NULL on success; raises an R error on failure.
|
|||
|
|
#' @export
|
|||
|
|
clear_ocr_backends <- function() {
  # No arguments: simply invoke the native `wrap__clear_ocr_backends`.
  .Call("wrap__clear_ocr_backends", PACKAGE = "kreuzberg")
}
|
|||
|
|
#' register_post_processor
|
|||
|
|
#'
|
|||
|
|
#' Register an R-side plugin implementation. Pass a named list whose entries
|
|||
|
|
#' implement the trait's required methods (e.g. `list(name = function() "my", ...)`).
|
|||
|
|
#'
|
|||
|
|
#' @param r_backend Named list of R closures implementing the trait surface.
|
|||
|
|
#'
|
|||
|
|
#' @return Invisible NULL on success; raises an R error on failure.
|
|||
|
|
#' @export
|
|||
|
|
register_post_processor <- function(r_backend) {
  # Pass the plugin description to the native `wrap__register_post_processor`.
  .Call(
    "wrap__register_post_processor",
    r_backend,
    PACKAGE = "kreuzberg"
  )
}
|
|||
|
|
#' unregister_post_processor
|
|||
|
|
#'
|
|||
|
|
#' Unregister a previously registered plugin by name.
|
|||
|
|
#'
|
|||
|
|
#' @param name Plugin name string as returned by the backend's `name()` method.
|
|||
|
|
#'
|
|||
|
|
#' @return Invisible NULL on success; raises an R error on failure.
|
|||
|
|
#' @export
|
|||
|
|
unregister_post_processor <- function(name) {
  # Pass the plugin name to the native `wrap__unregister_post_processor`.
  .Call(
    "wrap__unregister_post_processor",
    name,
    PACKAGE = "kreuzberg"
  )
}
|
|||
|
|
#' clear_post_processors
|
|||
|
|
#'
|
|||
|
|
#' Remove every registered plugin of this type. Typically used in test teardown.
|
|||
|
|
#'
|
|||
|
|
#' @return Invisible NULL on success; raises an R error on failure.
|
|||
|
|
#' @export
|
|||
|
|
clear_post_processors <- function() {
  # No arguments: simply invoke the native `wrap__clear_post_processors`.
  .Call("wrap__clear_post_processors", PACKAGE = "kreuzberg")
}
|
|||
|
|
#' register_validator
|
|||
|
|
#'
|
|||
|
|
#' Register an R-side plugin implementation. Pass a named list whose entries
|
|||
|
|
#' implement the trait's required methods (e.g. `list(name = function() "my", ...)`).
|
|||
|
|
#'
|
|||
|
|
#' @param r_backend Named list of R closures implementing the trait surface.
|
|||
|
|
#'
|
|||
|
|
#' @return Invisible NULL on success; raises an R error on failure.
|
|||
|
|
#' @export
|
|||
|
|
register_validator <- function(r_backend) {
  # Pass the plugin description to the native `wrap__register_validator`.
  .Call(
    "wrap__register_validator",
    r_backend,
    PACKAGE = "kreuzberg"
  )
}
|
|||
|
|
#' unregister_validator
|
|||
|
|
#'
|
|||
|
|
#' Unregister a previously registered plugin by name.
|
|||
|
|
#'
|
|||
|
|
#' @param name Plugin name string as returned by the backend's `name()` method.
|
|||
|
|
#'
|
|||
|
|
#' @return Invisible NULL on success; raises an R error on failure.
|
|||
|
|
#' @export
|
|||
|
|
unregister_validator <- function(name) {
  # Pass the plugin name to the native `wrap__unregister_validator`.
  .Call(
    "wrap__unregister_validator",
    name,
    PACKAGE = "kreuzberg"
  )
}
|
|||
|
|
#' clear_validators
|
|||
|
|
#'
|
|||
|
|
#' Remove every registered plugin of this type. Typically used in test teardown.
|
|||
|
|
#'
|
|||
|
|
#' @return Invisible NULL on success; raises an R error on failure.
|
|||
|
|
#' @export
|
|||
|
|
clear_validators <- function() {
  # No arguments: simply invoke the native `wrap__clear_validators`.
  .Call("wrap__clear_validators", PACKAGE = "kreuzberg")
}
|
|||
|
|
#' register_embedding_backend
|
|||
|
|
#'
|
|||
|
|
#' Register an R-side plugin implementation. Pass a named list whose entries
|
|||
|
|
#' implement the trait's required methods (e.g. `list(name = function() "my", ...)`).
|
|||
|
|
#'
|
|||
|
|
#' @param r_backend Named list of R closures implementing the trait surface.
|
|||
|
|
#'
|
|||
|
|
#' @return Invisible NULL on success; raises an R error on failure.
|
|||
|
|
#' @export
|
|||
|
|
register_embedding_backend <- function(r_backend) {
  # Pass the backend description to `wrap__register_embedding_backend`.
  .Call(
    "wrap__register_embedding_backend",
    r_backend,
    PACKAGE = "kreuzberg"
  )
}
|
|||
|
|
#' unregister_embedding_backend
|
|||
|
|
#'
|
|||
|
|
#' Unregister a previously registered plugin by name.
|
|||
|
|
#'
|
|||
|
|
#' @param name Plugin name string as returned by the backend's `name()` method.
|
|||
|
|
#'
|
|||
|
|
#' @return Invisible NULL on success; raises an R error on failure.
|
|||
|
|
#' @export
|
|||
|
|
unregister_embedding_backend <- function(name) {
  # Pass the plugin name to `wrap__unregister_embedding_backend`.
  .Call(
    "wrap__unregister_embedding_backend",
    name,
    PACKAGE = "kreuzberg"
  )
}
|
|||
|
|
#' clear_embedding_backends
|
|||
|
|
#'
|
|||
|
|
#' Remove every registered plugin of this type. Typically used in test teardown.
|
|||
|
|
#'
|
|||
|
|
#' @return Invisible NULL on success; raises an R error on failure.
|
|||
|
|
#' @export
|
|||
|
|
clear_embedding_backends <- function() {
  # No arguments: simply invoke the native `wrap__clear_embedding_backends`.
  .Call("wrap__clear_embedding_backends", PACKAGE = "kreuzberg")
}
|
|||
|
|
#' register_document_extractor
|
|||
|
|
#'
|
|||
|
|
#' Register an R-side plugin implementation. Pass a named list whose entries
|
|||
|
|
#' implement the trait's required methods (e.g. `list(name = function() "my", ...)`).
|
|||
|
|
#'
|
|||
|
|
#' @param r_backend Named list of R closures implementing the trait surface.
|
|||
|
|
#'
|
|||
|
|
#' @return Invisible NULL on success; raises an R error on failure.
|
|||
|
|
#' @export
|
|||
|
|
register_document_extractor <- function(r_backend) {
  # Pass the plugin description to `wrap__register_document_extractor`.
  .Call(
    "wrap__register_document_extractor",
    r_backend,
    PACKAGE = "kreuzberg"
  )
}
|
|||
|
|
#' unregister_document_extractor
|
|||
|
|
#'
|
|||
|
|
#' Unregister a previously registered plugin by name.
|
|||
|
|
#'
|
|||
|
|
#' @param name Plugin name string as returned by the backend's `name()` method.
|
|||
|
|
#'
|
|||
|
|
#' @return Invisible NULL on success; raises an R error on failure.
|
|||
|
|
#' @export
|
|||
|
|
unregister_document_extractor <- function(name) {
  # Pass the plugin name to `wrap__unregister_document_extractor`.
  .Call(
    "wrap__unregister_document_extractor",
    name,
    PACKAGE = "kreuzberg"
  )
}
|
|||
|
|
#' clear_document_extractors
|
|||
|
|
#'
|
|||
|
|
#' Remove every registered plugin of this type. Typically used in test teardown.
|
|||
|
|
#'
|
|||
|
|
#' @return Invisible NULL on success; raises an R error on failure.
|
|||
|
|
#' @export
|
|||
|
|
clear_document_extractors <- function() {
  # No arguments: simply invoke the native `wrap__clear_document_extractors`.
  .Call("wrap__clear_document_extractors", PACKAGE = "kreuzberg")
}
|
|||
|
|
#' register_renderer
|
|||
|
|
#'
|
|||
|
|
#' Register an R-side plugin implementation. Pass a named list whose entries
|
|||
|
|
#' implement the trait's required methods (e.g. `list(name = function() "my", ...)`).
|
|||
|
|
#'
|
|||
|
|
#' @param r_backend Named list of R closures implementing the trait surface.
|
|||
|
|
#'
|
|||
|
|
#' @return Invisible NULL on success; raises an R error on failure.
|
|||
|
|
#' @export
|
|||
|
|
register_renderer <- function(r_backend) {
  # Pass the plugin description to the native `wrap__register_renderer`.
  .Call(
    "wrap__register_renderer",
    r_backend,
    PACKAGE = "kreuzberg"
  )
}
|
|||
|
|
#' unregister_renderer
|
|||
|
|
#'
|
|||
|
|
#' Unregister a previously registered plugin by name.
|
|||
|
|
#'
|
|||
|
|
#' @param name Plugin name string as returned by the backend's `name()` method.
|
|||
|
|
#'
|
|||
|
|
#' @return Invisible NULL on success; raises an R error on failure.
|
|||
|
|
#' @export
|
|||
|
|
unregister_renderer <- function(name) {
  # Pass the plugin name to the native `wrap__unregister_renderer`.
  .Call(
    "wrap__unregister_renderer",
    name,
    PACKAGE = "kreuzberg"
  )
}
|
|||
|
|
#' clear_renderers
|
|||
|
|
#'
|
|||
|
|
#' Remove every registered plugin of this type. Typically used in test teardown.
|
|||
|
|
#'
|
|||
|
|
#' @return Invisible NULL on success; raises an R error on failure.
|
|||
|
|
#' @export
|
|||
|
|
clear_renderers <- function() {
  # No arguments: simply invoke the native `wrap__clear_renderers`.
  .Call("wrap__clear_renderers", PACKAGE = "kreuzberg")
}
|
|||
|
|
#' CacheStats
|
|||
|
|
#' @field total_files total_files
|
|||
|
|
#' @field total_size_mb total_size_mb
|
|||
|
|
#' @field available_space_mb available_space_mb
|
|||
|
|
#' @field oldest_file_age_days oldest_file_age_days
|
|||
|
|
#' @field newest_file_age_days newest_file_age_days
|
|||
|
|
#' @export
|
|||
|
|
CacheStats <- new.env(parent = emptyenv())

#' @export
`$.CacheStats` <- function(self, name) {
  # Look the member up in the class environment; an unknown name yields NULL.
  func <- CacheStats[[name]]
  if (identical(names(formals(func))[1], "self")) {
    # A member whose first formal is `self` is treated as an instance method:
    # bind the receiver so callers can write `obj$method(...)`.
    function(...) func(self, ...)
  } else {
    # Anything else (including NULL for a missing member) is returned as is.
    func
  }
}

#' @export
`[[.CacheStats` <- `$.CacheStats`
|
|||
|
|
#' Hardware acceleration configuration for ONNX Runtime models
|
|||
|
|
#'
|
|||
|
|
#' Controls which execution provider (CPU, CoreML, CUDA, TensorRT) is used
|
|||
|
|
#' for inference in layout detection and embedding generation.
|
|||
|
|
#' @field provider Execution provider to use for ONNX inference.
|
|||
|
|
#' @field device_id GPU device ID (for CUDA/TensorRT). Ignored for CPU/CoreML/Auto.
|
|||
|
|
#' @export
|
|||
|
|
AccelerationConfig <- new.env(parent = emptyenv())

# Static constructor: forwards `json` to the native
# `wrap__AccelerationConfig__from_json` routine.
AccelerationConfig$from_json <- function(json) {
  .Call("wrap__AccelerationConfig__from_json", json, PACKAGE = "kreuzberg")
}

#' @export
`$.AccelerationConfig` <- function(self, name) {
  # Look the member up in the class environment; an unknown name yields NULL.
  func <- AccelerationConfig[[name]]
  if (identical(names(formals(func))[1], "self")) {
    # A member whose first formal is `self` is treated as an instance method:
    # bind the receiver so callers can write `obj$method(...)`.
    function(...) func(self, ...)
  } else {
    # Anything else (static functions, or NULL when missing) is returned as is.
    func
  }
}

#' @export
`[[.AccelerationConfig` <- `$.AccelerationConfig`
|
|||
|
|
#' Cross-extractor content filtering configuration
|
|||
|
|
#'
|
|||
|
|
#' Controls whether "furniture" content (headers, footers, page numbers,
|
|||
|
|
#' watermarks, repeating text) is included in or stripped from extraction
|
|||
|
|
#' results. Applies across all extractors (PDF, DOCX, RTF, ODT, HTML, etc.)
|
|||
|
|
#' with format-specific implementation.
|
|||
|
|
#'
|
|||
|
|
#' When `None` on `ExtractionConfig`, each extractor uses its current
|
|||
|
|
#' default behavior unchanged.
|
|||
|
|
#' @field include_headers Include running headers in extraction output.
|
|||
|
|
#' @field include_footers Include running footers in extraction output.
|
|||
|
|
#' @field strip_repeating_text Enable the heuristic cross-page repeating text detector.
|
|||
|
|
#' @field include_watermarks Include watermark text in extraction output.
|
|||
|
|
#' @export
|
|||
|
|
ContentFilterConfig <- new.env(parent = emptyenv())

# Static constructor backed by the native `wrap__ContentFilterConfig__default`.
ContentFilterConfig$default <- function() {
  .Call("wrap__ContentFilterConfig__default", PACKAGE = "kreuzberg")
}

# Static constructor: forwards `json` to the native
# `wrap__ContentFilterConfig__from_json` routine.
ContentFilterConfig$from_json <- function(json) {
  .Call("wrap__ContentFilterConfig__from_json", json, PACKAGE = "kreuzberg")
}

#' @export
`$.ContentFilterConfig` <- function(self, name) {
  # Look the member up in the class environment; an unknown name yields NULL.
  func <- ContentFilterConfig[[name]]
  if (identical(names(formals(func))[1], "self")) {
    # A member whose first formal is `self` is treated as an instance method:
    # bind the receiver so callers can write `obj$method(...)`.
    function(...) func(self, ...)
  } else {
    # Anything else (static functions, or NULL when missing) is returned as is.
    func
  }
}

#' @export
`[[.ContentFilterConfig` <- `$.ContentFilterConfig`
|
|||
|
|
#' Configuration for email extraction
|
|||
|
|
#' @field msg_fallback_codepage Windows codepage number to use when an MSG file contains no codepage property. Defaults
|
|||
|
|
#' @export
|
|||
|
|
EmailConfig <- new.env(parent = emptyenv())

# Static constructor: forwards `json` to the native
# `wrap__EmailConfig__from_json` routine.
EmailConfig$from_json <- function(json) {
  .Call("wrap__EmailConfig__from_json", json, PACKAGE = "kreuzberg")
}

#' @export
`$.EmailConfig` <- function(self, name) {
  # Look the member up in the class environment; an unknown name yields NULL.
  func <- EmailConfig[[name]]
  if (identical(names(formals(func))[1], "self")) {
    # A member whose first formal is `self` is treated as an instance method:
    # bind the receiver so callers can write `obj$method(...)`.
    function(...) func(self, ...)
  } else {
    # Anything else (static functions, or NULL when missing) is returned as is.
    func
  }
}

#' @export
`[[.EmailConfig` <- `$.EmailConfig`
|
|||
|
|
#' Main extraction configuration
|
|||
|
|
#'
|
|||
|
|
#' This struct contains all configuration options for the extraction process.
|
|||
|
|
#' It can be loaded from TOML, YAML, or JSON files, or created programmatically.
|
|||
|
|
#' @field use_cache Enable caching of extraction results
|
|||
|
|
#' @field enable_quality_processing Enable quality post-processing
|
|||
|
|
#' @field ocr OCR configuration (None = OCR disabled)
|
|||
|
|
#' @field force_ocr Force OCR even for searchable PDFs
|
|||
|
|
#' @field force_ocr_pages Force OCR on specific pages only (1-indexed page numbers, must be >= 1).
|
|||
|
|
#' @field disable_ocr Disable OCR entirely, even for images.
|
|||
|
|
#' @field chunking Text chunking configuration (None = chunking disabled)
|
|||
|
|
#' @field content_filter Content filtering configuration (None = use extractor defaults).
|
|||
|
|
#' @field images Image extraction configuration (None = no image extraction)
|
|||
|
|
#' @field pdf_options PDF-specific options (None = use defaults)
|
|||
|
|
#' @field token_reduction Token reduction configuration (None = no token reduction)
|
|||
|
|
#' @field language_detection Language detection configuration (None = no language detection)
|
|||
|
|
#' @field pages Page extraction configuration (None = no page tracking)
|
|||
|
|
#' @field keywords Keyword extraction configuration (None = no keyword extraction)
|
|||
|
|
#' @field postprocessor Post-processor configuration (None = use defaults)
|
|||
|
|
#' @field html_options HTML to Markdown conversion options (None = use defaults)
|
|||
|
|
#' @field html_output Styled HTML output configuration.
|
|||
|
|
#' @field extraction_timeout_secs Default per-file timeout in seconds for batch extraction.
|
|||
|
|
#' @field max_concurrent_extractions Maximum concurrent extractions in batch operations (None = `(num_cpus * 1.5).ceil()`).
|
|||
|
|
#' @field result_format Result structure format
|
|||
|
|
#' @field security_limits Security limits for archive extraction.
|
|||
|
|
#' @field max_embedded_file_bytes Maximum uncompressed size in bytes for a single embedded file before recursive
|
|||
|
|
#' @field output_format Content text format (default: Plain).
|
|||
|
|
#' @field layout Layout detection configuration (None = layout detection disabled).
|
|||
|
|
#' @field use_layout_for_markdown Run layout detection on the non-OCR PDF markdown path.
|
|||
|
|
#' @field include_document_structure Enable structured document tree output.
|
|||
|
|
#' @field acceleration Hardware acceleration configuration for ONNX Runtime models.
|
|||
|
|
#' @field cache_namespace Cache namespace for tenant isolation.
|
|||
|
|
#' @field cache_ttl_secs Per-request cache TTL in seconds.
|
|||
|
|
#' @field email Email extraction configuration (None = use defaults).
|
|||
|
|
#' @field concurrency Concurrency limits for constrained environments (None = use defaults).
|
|||
|
|
#' @field max_archive_depth Maximum recursion depth for archive extraction (default: 3). Set to 0 to disable recursive
|
|||
|
|
#' @field tree_sitter Tree-sitter language pack configuration (None = tree-sitter disabled).
|
|||
|
|
#' @field structured_extraction Structured extraction via LLM (None = disabled).
|
|||
|
|
#' @field cancel_token Cancellation token for this extraction (None = no external cancellation).
|
|||
|
|
#' @export
|
|||
|
|
# Method table for `ExtractionConfig`: a parentless environment holding the
# R-callable bindings that the `$` / `[[` dispatchers below look up by name.
ExtractionConfig <- new.env(parent = emptyenv())

# Invokes the native routine `wrap__ExtractionConfig__default` (no arguments).
ExtractionConfig$default <- function() {
  .Call("wrap__ExtractionConfig__default", PACKAGE = "kreuzberg")
}

# Instance member: forwards `self` to the native routine
# `wrap__ExtractionConfig__needs_image_processing`.
ExtractionConfig$needs_image_processing <- function(self) {
  .Call(
    "wrap__ExtractionConfig__needs_image_processing",
    self,
    PACKAGE = "kreuzberg"
  )
}

# Forwards `json` unchanged to the native routine
# `wrap__ExtractionConfig__from_json`.
ExtractionConfig$from_json <- function(json) {
  .Call(
    "wrap__ExtractionConfig__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}

#' @export
`$.ExtractionConfig` <- function(self, name) {
  # Look the member up in the method table; an unknown name yields NULL.
  member <- ExtractionConfig[[name]]
  first_formal <- names(formals(member))[1]
  if (!identical(first_formal, "self")) {
    # Member does not take `self` first (or is NULL): return it untouched.
    return(member)
  }
  # Member takes `self` first: bind the receiver as the leading argument.
  function(...) member(self, ...)
}

#' @export
`[[.ExtractionConfig` <- `$.ExtractionConfig`
|
|||
|
|
#' @export
|
|||
|
|
# S3-style method: resolves `needs_image_processing` on `x` via `$` and calls
# it with any extra arguments passed through `...`.
needs_image_processing.ExtractionConfig <- function(x, ...) {
  x$needs_image_processing(...)
}
|
|||
|
|
#' Per-file extraction configuration overrides for batch processing
|
|||
|
|
#'
|
|||
|
|
#' All fields are `Option<T>` — `None` means "use the batch-level default."
|
|||
|
|
#' This type is used with `batch_extract_files` and
|
|||
|
|
#' `batch_extract_bytes` to allow heterogeneous
|
|||
|
|
#' extraction settings within a single batch.
|
|||
|
|
#'
|
|||
|
|
#' # Excluded Fields
|
|||
|
|
#'
|
|||
|
|
#' The following `ExtractionConfig` fields are batch-level only and
|
|||
|
|
#' cannot be overridden per file:
|
|||
|
|
#' - `max_concurrent_extractions` — controls batch parallelism
|
|||
|
|
#' - `use_cache` — global caching policy
|
|||
|
|
#' - `acceleration` — shared ONNX execution provider
|
|||
|
|
#' - `security_limits` — global archive security policy
|
|||
|
|
#' @field enable_quality_processing Override quality post-processing for this file.
|
|||
|
|
#' @field ocr Override OCR configuration for this file (None in the Option = use batch default).
|
|||
|
|
#' @field force_ocr Override force OCR for this file.
|
|||
|
|
#' @field force_ocr_pages Override force OCR pages for this file (1-indexed page numbers).
|
|||
|
|
#' @field disable_ocr Override disable OCR for this file.
|
|||
|
|
#' @field chunking Override chunking configuration for this file.
|
|||
|
|
#' @field content_filter Override content filtering configuration for this file.
|
|||
|
|
#' @field images Override image extraction configuration for this file.
|
|||
|
|
#' @field pdf_options Override PDF options for this file.
|
|||
|
|
#' @field token_reduction Override token reduction for this file.
|
|||
|
|
#' @field language_detection Override language detection for this file.
|
|||
|
|
#' @field pages Override page extraction for this file.
|
|||
|
|
#' @field keywords Override keyword extraction for this file.
|
|||
|
|
#' @field postprocessor Override post-processor for this file.
|
|||
|
|
#' @field html_options Override HTML conversion options for this file.
|
|||
|
|
#' @field result_format Override result format for this file.
|
|||
|
|
#' @field output_format Override output content format for this file.
|
|||
|
|
#' @field include_document_structure Override document structure output for this file.
|
|||
|
|
#' @field layout Override layout detection for this file.
|
|||
|
|
#' @field timeout_secs Override per-file extraction timeout in seconds.
|
|||
|
|
#' @field tree_sitter Override tree-sitter configuration for this file.
|
|||
|
|
#' @field structured_extraction Override structured extraction configuration for this file.
|
|||
|
|
#' @export
|
|||
|
|
# Method table for `FileExtractionConfig`: a parentless environment holding
# the R-callable bindings that the `$` / `[[` dispatchers below look up.
FileExtractionConfig <- new.env(parent = emptyenv())

# Forwards `json` unchanged to the native routine
# `wrap__FileExtractionConfig__from_json`.
FileExtractionConfig$from_json <- function(json) {
  .Call(
    "wrap__FileExtractionConfig__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}

#' @export
`$.FileExtractionConfig` <- function(self, name) {
  # Look the member up in the method table; an unknown name yields NULL.
  member <- FileExtractionConfig[[name]]
  first_formal <- names(formals(member))[1]
  if (!identical(first_formal, "self")) {
    # Member does not take `self` first (or is NULL): return it untouched.
    return(member)
  }
  # Member takes `self` first: bind the receiver as the leading argument.
  function(...) member(self, ...)
}

#' @export
`[[.FileExtractionConfig` <- `$.FileExtractionConfig`
|
|||
|
|
#' Batch item for byte array extraction
|
|||
|
|
#'
|
|||
|
|
#' Used with `batch_extract_bytes` and `batch_extract_bytes_sync`
|
|||
|
|
#' to represent a single item in a batch extraction job.
|
|||
|
|
#' @field content The content bytes to extract from
|
|||
|
|
#' @field mime_type MIME type of the content (e.g., "application/pdf", "text/html")
|
|||
|
|
#' @field config Per-item configuration overrides (None uses batch-level defaults)
|
|||
|
|
#' @export
|
|||
|
|
# Method table for `BatchBytesItem`: a parentless environment; no members are
# registered in it here.
BatchBytesItem <- new.env(parent = emptyenv())

#' @export
`$.BatchBytesItem` <- function(self, name) {
  # Look the member up in the method table; an unknown name yields NULL.
  member <- BatchBytesItem[[name]]
  first_formal <- names(formals(member))[1]
  if (!identical(first_formal, "self")) {
    # Member does not take `self` first (or is NULL): return it untouched.
    return(member)
  }
  # Member takes `self` first: bind the receiver as the leading argument.
  function(...) member(self, ...)
}

#' @export
`[[.BatchBytesItem` <- `$.BatchBytesItem`
|
|||
|
|
#' Batch item for file extraction
|
|||
|
|
#'
|
|||
|
|
#' Used with `batch_extract_files` and `batch_extract_files_sync`
|
|||
|
|
#' to represent a single file in a batch extraction job.
|
|||
|
|
#' @field path Path to the file to extract from
|
|||
|
|
#' @field config Per-file configuration overrides (None uses batch-level defaults)
|
|||
|
|
#' @export
|
|||
|
|
# Method table for `BatchFileItem`: a parentless environment; no members are
# registered in it here.
BatchFileItem <- new.env(parent = emptyenv())

#' @export
`$.BatchFileItem` <- function(self, name) {
  # Look the member up in the method table; an unknown name yields NULL.
  member <- BatchFileItem[[name]]
  first_formal <- names(formals(member))[1]
  if (!identical(first_formal, "self")) {
    # Member does not take `self` first (or is NULL): return it untouched.
    return(member)
  }
  # Member takes `self` first: bind the receiver as the leading argument.
  function(...) member(self, ...)
}

#' @export
`[[.BatchFileItem` <- `$.BatchFileItem`
|
|||
|
|
#' Image extraction configuration
|
|||
|
|
#' @field extract_images Extract images from documents
|
|||
|
|
#' @field target_dpi Target DPI for image normalization
|
|||
|
|
#' @field max_image_dimension Maximum dimension for images (width or height)
|
|||
|
|
#' @field inject_placeholders Whether to inject image reference placeholders into markdown output. When `true`
|
|||
|
|
#' @field auto_adjust_dpi Automatically adjust DPI based on image content
|
|||
|
|
#' @field min_dpi Minimum DPI threshold
|
|||
|
|
#' @field max_dpi Maximum DPI threshold
|
|||
|
|
#' @field max_images_per_page Maximum number of image objects to extract per PDF page.
|
|||
|
|
#' @field classify When `true` (default), extracted images are classified by kind and grouped into clusters where they
|
|||
|
|
#' @field include_page_rasters When `true`, full-page renders produced during OCR preprocessing are captured and
|
|||
|
|
#' @field run_ocr_on_images Run OCR on extracted images and include the recognized text in the document content.
|
|||
|
|
#' @field ocr_text_only When `true`, image OCR results are rendered as plain text without the image markdown
|
|||
|
|
#' @field append_ocr_text When `true` and `ocr_text_only` is `false`, append the OCR text after the image placeholder
|
|||
|
|
#' @export
|
|||
|
|
# Method table for `ImageExtractionConfig`: a parentless environment holding
# the R-callable bindings that the `$` / `[[` dispatchers below look up.
ImageExtractionConfig <- new.env(parent = emptyenv())

# Invokes the native routine `wrap__ImageExtractionConfig__default`.
ImageExtractionConfig$default <- function() {
  .Call("wrap__ImageExtractionConfig__default", PACKAGE = "kreuzberg")
}

# Forwards `json` unchanged to the native routine
# `wrap__ImageExtractionConfig__from_json`.
ImageExtractionConfig$from_json <- function(json) {
  .Call(
    "wrap__ImageExtractionConfig__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}

#' @export
`$.ImageExtractionConfig` <- function(self, name) {
  # Look the member up in the method table; an unknown name yields NULL.
  member <- ImageExtractionConfig[[name]]
  first_formal <- names(formals(member))[1]
  if (!identical(first_formal, "self")) {
    # Member does not take `self` first (or is NULL): return it untouched.
    return(member)
  }
  # Member takes `self` first: bind the receiver as the leading argument.
  function(...) member(self, ...)
}

#' @export
`[[.ImageExtractionConfig` <- `$.ImageExtractionConfig`
|
|||
|
|
#' Token reduction configuration
|
|||
|
|
#' @field mode Reduction mode: "off", "light", "moderate", "aggressive", "maximum"
|
|||
|
|
#' @field preserve_important_words Preserve important words (capitalized, technical terms)
|
|||
|
|
#' @export
|
|||
|
|
# Method table for `TokenReductionOptions`: a parentless environment holding
# the R-callable bindings that the `$` / `[[` dispatchers below look up.
TokenReductionOptions <- new.env(parent = emptyenv())

# Invokes the native routine `wrap__TokenReductionOptions__default`.
TokenReductionOptions$default <- function() {
  .Call("wrap__TokenReductionOptions__default", PACKAGE = "kreuzberg")
}

# Forwards `json` unchanged to the native routine
# `wrap__TokenReductionOptions__from_json`.
TokenReductionOptions$from_json <- function(json) {
  .Call(
    "wrap__TokenReductionOptions__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}

#' @export
`$.TokenReductionOptions` <- function(self, name) {
  # Look the member up in the method table; an unknown name yields NULL.
  member <- TokenReductionOptions[[name]]
  first_formal <- names(formals(member))[1]
  if (!identical(first_formal, "self")) {
    # Member does not take `self` first (or is NULL): return it untouched.
    return(member)
  }
  # Member takes `self` first: bind the receiver as the leading argument.
  function(...) member(self, ...)
}

#' @export
`[[.TokenReductionOptions` <- `$.TokenReductionOptions`
|
|||
|
|
#' Language detection configuration
|
|||
|
|
#' @field enabled Enable language detection
|
|||
|
|
#' @field min_confidence Minimum confidence threshold (0.0-1.0)
|
|||
|
|
#' @field detect_multiple Detect multiple languages in the document
|
|||
|
|
#' @export
|
|||
|
|
# Method table for `LanguageDetectionConfig`: a parentless environment holding
# the R-callable bindings that the `$` / `[[` dispatchers below look up.
LanguageDetectionConfig <- new.env(parent = emptyenv())

# Invokes the native routine `wrap__LanguageDetectionConfig__default`.
LanguageDetectionConfig$default <- function() {
  .Call("wrap__LanguageDetectionConfig__default", PACKAGE = "kreuzberg")
}

# Forwards `json` unchanged to the native routine
# `wrap__LanguageDetectionConfig__from_json`.
LanguageDetectionConfig$from_json <- function(json) {
  .Call(
    "wrap__LanguageDetectionConfig__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}

#' @export
`$.LanguageDetectionConfig` <- function(self, name) {
  # Look the member up in the method table; an unknown name yields NULL.
  member <- LanguageDetectionConfig[[name]]
  first_formal <- names(formals(member))[1]
  if (!identical(first_formal, "self")) {
    # Member does not take `self` first (or is NULL): return it untouched.
    return(member)
  }
  # Member takes `self` first: bind the receiver as the leading argument.
  function(...) member(self, ...)
}

#' @export
`[[.LanguageDetectionConfig` <- `$.LanguageDetectionConfig`
|
|||
|
|
#' Configuration for styled HTML output
|
|||
|
|
#'
|
|||
|
|
#' When set on [`ExtractionConfig::html_output`] alongside
|
|||
|
|
#' `output_format = OutputFormat::Html`, the pipeline builds a
|
|||
|
|
#' [`StyledHtmlRenderer`](crate::rendering::StyledHtmlRenderer) instead of
|
|||
|
|
#' the plain comrak-based renderer.
|
|||
|
|
#' @field css Inline CSS string injected into the output after the theme stylesheet. Concatenated after `css_file`
|
|||
|
|
#' @field css_file Path to a CSS file loaded once at renderer construction time. Concatenated before `css` when both
|
|||
|
|
#' @field theme Built-in colour/typography theme. Default: [`HtmlTheme::Unstyled`].
|
|||
|
|
#' @field class_prefix CSS class prefix applied to every emitted class name.
|
|||
|
|
#' @field embed_css When `true` (default), write the resolved CSS into a `<style>` block immediately after the opening
|
|||
|
|
#' @export
|
|||
|
|
# Method table for `HtmlOutputConfig`: a parentless environment holding the
# R-callable bindings that the `$` / `[[` dispatchers below look up.
HtmlOutputConfig <- new.env(parent = emptyenv())

# Invokes the native routine `wrap__HtmlOutputConfig__default`.
HtmlOutputConfig$default <- function() {
  .Call("wrap__HtmlOutputConfig__default", PACKAGE = "kreuzberg")
}

# Forwards `json` unchanged to the native routine
# `wrap__HtmlOutputConfig__from_json`.
HtmlOutputConfig$from_json <- function(json) {
  .Call(
    "wrap__HtmlOutputConfig__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}

#' @export
`$.HtmlOutputConfig` <- function(self, name) {
  # Look the member up in the method table; an unknown name yields NULL.
  member <- HtmlOutputConfig[[name]]
  first_formal <- names(formals(member))[1]
  if (!identical(first_formal, "self")) {
    # Member does not take `self` first (or is NULL): return it untouched.
    return(member)
  }
  # Member takes `self` first: bind the receiver as the leading argument.
  function(...) member(self, ...)
}

#' @export
`[[.HtmlOutputConfig` <- `$.HtmlOutputConfig`
|
|||
|
|
#' Layout detection configuration
|
|||
|
|
#'
|
|||
|
|
#' Controls layout detection behavior in the extraction pipeline.
|
|||
|
|
#' When set on [`ExtractionConfig`](super::ExtractionConfig), layout detection
|
|||
|
|
#' is enabled for PDF extraction.
|
|||
|
|
#' @field confidence_threshold Confidence threshold override (None = use model default).
|
|||
|
|
#' @field apply_heuristics Whether to apply postprocessing heuristics (default: true).
|
|||
|
|
#' @field table_model Table structure recognition model.
|
|||
|
|
#' @field acceleration Hardware acceleration for ONNX models (layout detection + table structure).
|
|||
|
|
#' @export
|
|||
|
|
# Method table for `LayoutDetectionConfig`: a parentless environment holding
# the R-callable bindings that the `$` / `[[` dispatchers below look up.
LayoutDetectionConfig <- new.env(parent = emptyenv())

# Invokes the native routine `wrap__LayoutDetectionConfig__default`.
LayoutDetectionConfig$default <- function() {
  .Call("wrap__LayoutDetectionConfig__default", PACKAGE = "kreuzberg")
}

# Forwards `json` unchanged to the native routine
# `wrap__LayoutDetectionConfig__from_json`.
LayoutDetectionConfig$from_json <- function(json) {
  .Call(
    "wrap__LayoutDetectionConfig__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}

#' @export
`$.LayoutDetectionConfig` <- function(self, name) {
  # Look the member up in the method table; an unknown name yields NULL.
  member <- LayoutDetectionConfig[[name]]
  first_formal <- names(formals(member))[1]
  if (!identical(first_formal, "self")) {
    # Member does not take `self` first (or is NULL): return it untouched.
    return(member)
  }
  # Member takes `self` first: bind the receiver as the leading argument.
  function(...) member(self, ...)
}

#' @export
`[[.LayoutDetectionConfig` <- `$.LayoutDetectionConfig`
|
|||
|
|
#' Configuration for an LLM provider/model via liter-llm
|
|||
|
|
#'
|
|||
|
|
#' Each feature (VLM OCR, VLM embeddings, structured extraction) carries
|
|||
|
|
#' its own `LlmConfig`, allowing different providers per feature.
|
|||
|
|
#' @field model Provider/model string using liter-llm routing format.
|
|||
|
|
#' @field api_key API key for the provider. When `None`, liter-llm falls back to the provider's standard environment
|
|||
|
|
#' @field base_url Custom base URL override for the provider endpoint.
|
|||
|
|
#' @field timeout_secs Request timeout in seconds (default: 60).
|
|||
|
|
#' @field max_retries Maximum retry attempts (default: 3).
|
|||
|
|
#' @field temperature Sampling temperature for generation tasks.
|
|||
|
|
#' @field max_tokens Maximum tokens to generate.
|
|||
|
|
#' @export
|
|||
|
|
# Method table for `LlmConfig`: a parentless environment holding the
# R-callable bindings that the `$` / `[[` dispatchers below look up.
LlmConfig <- new.env(parent = emptyenv())

# Forwards `json` unchanged to the native routine `wrap__LlmConfig__from_json`.
LlmConfig$from_json <- function(json) {
  .Call(
    "wrap__LlmConfig__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}

#' @export
`$.LlmConfig` <- function(self, name) {
  # Look the member up in the method table; an unknown name yields NULL.
  member <- LlmConfig[[name]]
  first_formal <- names(formals(member))[1]
  if (!identical(first_formal, "self")) {
    # Member does not take `self` first (or is NULL): return it untouched.
    return(member)
  }
  # Member takes `self` first: bind the receiver as the leading argument.
  function(...) member(self, ...)
}

#' @export
`[[.LlmConfig` <- `$.LlmConfig`
|
|||
|
|
#' Configuration for LLM-based structured data extraction
|
|||
|
|
#'
|
|||
|
|
#' Sends extracted document content to a VLM with a JSON schema,
|
|||
|
|
#' returning structured data that conforms to the schema.
|
|||
|
|
#' @field schema JSON Schema defining the desired output structure.
|
|||
|
|
#' @field schema_name Schema name passed to the LLM's structured output mode.
|
|||
|
|
#' @field schema_description Optional schema description for the LLM.
|
|||
|
|
#' @field strict Enable strict mode — output must exactly match the schema.
|
|||
|
|
#' @field prompt Custom Jinja2 extraction prompt template. When `None`, a default template is used.
|
|||
|
|
#' @field llm LLM configuration for the extraction.
|
|||
|
|
#' @export
|
|||
|
|
# Method table for `StructuredExtractionConfig`: a parentless environment; no
# members are registered in it here.
StructuredExtractionConfig <- new.env(parent = emptyenv())

#' @export
`$.StructuredExtractionConfig` <- function(self, name) {
  # Look the member up in the method table; an unknown name yields NULL.
  member <- StructuredExtractionConfig[[name]]
  first_formal <- names(formals(member))[1]
  if (!identical(first_formal, "self")) {
    # Member does not take `self` first (or is NULL): return it untouched.
    return(member)
  }
  # Member takes `self` first: bind the receiver as the leading argument.
  function(...) member(self, ...)
}

#' @export
`[[.StructuredExtractionConfig` <- `$.StructuredExtractionConfig`
|
|||
|
|
#' Quality thresholds for OCR fallback decisions and pipeline quality gating
|
|||
|
|
#'
|
|||
|
|
#' All fields default to the values that match the previous hardcoded behavior,
|
|||
|
|
#' so `OcrQualityThresholds::default()` preserves existing semantics exactly.
|
|||
|
|
#' @field min_total_non_whitespace Minimum total non-whitespace characters to consider text substantive.
|
|||
|
|
#' @field min_non_whitespace_per_page Minimum non-whitespace characters per page on average.
|
|||
|
|
#' @field min_meaningful_word_len Minimum character count for a word to be "meaningful".
|
|||
|
|
#' @field min_meaningful_words Minimum count of meaningful words before text is accepted.
|
|||
|
|
#' @field min_alnum_ratio Minimum alphanumeric ratio (non-whitespace chars that are alphanumeric).
|
|||
|
|
#' @field min_garbage_chars Minimum Unicode replacement characters (U+FFFD) to trigger OCR fallback.
|
|||
|
|
#' @field max_fragmented_word_ratio Maximum fraction of short (1-2 char) words before text is considered fragmented.
|
|||
|
|
#' @field critical_fragmented_word_ratio Critical fragmentation threshold — triggers OCR regardless of meaningful
|
|||
|
|
#' @field min_avg_word_length Minimum average word length. Below this with enough words indicates garbled extraction.
|
|||
|
|
#' @field min_words_for_avg_length_check Minimum word count before average word length check applies.
|
|||
|
|
#' @field min_consecutive_repeat_ratio Minimum consecutive word repetition ratio to detect column scrambling.
|
|||
|
|
#' @field min_words_for_repeat_check Minimum word count before consecutive repetition check is applied.
|
|||
|
|
#' @field substantive_min_chars Minimum character count for "substantive markdown" OCR skip gate.
|
|||
|
|
#' @field non_text_min_chars Minimum character count for "non-text content" OCR skip gate.
|
|||
|
|
#' @field alnum_ws_ratio_threshold Alphanumeric+whitespace ratio threshold for skip decisions.
|
|||
|
|
#' @field pipeline_min_quality Minimum quality score (0.0-1.0) for a pipeline stage result to be accepted. If the
|
|||
|
|
#' @export
|
|||
|
|
# Method table for `OcrQualityThresholds`: a parentless environment holding
# the R-callable bindings that the `$` / `[[` dispatchers below look up.
OcrQualityThresholds <- new.env(parent = emptyenv())

# Invokes the native routine `wrap__OcrQualityThresholds__default`.
OcrQualityThresholds$default <- function() {
  .Call("wrap__OcrQualityThresholds__default", PACKAGE = "kreuzberg")
}

# Forwards `json` unchanged to the native routine
# `wrap__OcrQualityThresholds__from_json`.
OcrQualityThresholds$from_json <- function(json) {
  .Call(
    "wrap__OcrQualityThresholds__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}

#' @export
`$.OcrQualityThresholds` <- function(self, name) {
  # Look the member up in the method table; an unknown name yields NULL.
  member <- OcrQualityThresholds[[name]]
  first_formal <- names(formals(member))[1]
  if (!identical(first_formal, "self")) {
    # Member does not take `self` first (or is NULL): return it untouched.
    return(member)
  }
  # Member takes `self` first: bind the receiver as the leading argument.
  function(...) member(self, ...)
}

#' @export
`[[.OcrQualityThresholds` <- `$.OcrQualityThresholds`
|
|||
|
|
#' A single backend stage in the OCR pipeline
|
|||
|
|
#' @field backend Backend name: "tesseract", "paddleocr", "easyocr", or a custom registered name.
|
|||
|
|
#' @field priority Priority weight (higher = tried first). Stages are sorted by priority descending.
|
|||
|
|
#' @field language Language override for this stage (None = use parent OcrConfig.language).
|
|||
|
|
#' @field tesseract_config Tesseract-specific config override for this stage.
|
|||
|
|
#' @field paddle_ocr_config PaddleOCR-specific config for this stage.
|
|||
|
|
#' @field vlm_config VLM config override for this pipeline stage.
|
|||
|
|
#' @field backend_options Arbitrary per-call options passed through to the backend unchanged.
|
|||
|
|
#' @export
|
|||
|
|
# Method table for `OcrPipelineStage`: a parentless environment; no members
# are registered in it here.
OcrPipelineStage <- new.env(parent = emptyenv())

#' @export
`$.OcrPipelineStage` <- function(self, name) {
  # Look the member up in the method table; an unknown name yields NULL.
  member <- OcrPipelineStage[[name]]
  first_formal <- names(formals(member))[1]
  if (!identical(first_formal, "self")) {
    # Member does not take `self` first (or is NULL): return it untouched.
    return(member)
  }
  # Member takes `self` first: bind the receiver as the leading argument.
  function(...) member(self, ...)
}

#' @export
`[[.OcrPipelineStage` <- `$.OcrPipelineStage`
|
|||
|
|
#' OCR configuration
|
|||
|
|
#' @field enabled Whether OCR is enabled.
|
|||
|
|
#' @field backend OCR backend: tesseract, easyocr, paddleocr
|
|||
|
|
#' @field language Language code (e.g., "eng", "deu")
|
|||
|
|
#' @field tesseract_config Tesseract-specific configuration (optional)
|
|||
|
|
#' @field output_format Output format for OCR results (optional, for format conversion)
|
|||
|
|
#' @field paddle_ocr_config PaddleOCR-specific configuration (optional, JSON passthrough)
|
|||
|
|
#' @field backend_options Arbitrary per-call options passed through to the backend unchanged.
|
|||
|
|
#' @field element_config OCR element extraction configuration
|
|||
|
|
#' @field quality_thresholds Quality thresholds for the native-text-to-OCR fallback decision. When None, uses compiled
|
|||
|
|
#' @field pipeline Multi-backend OCR pipeline configuration. When set, enables weighted fallback across multiple OCR
|
|||
|
|
#' @field auto_rotate Enable automatic page rotation based on orientation detection.
|
|||
|
|
#' @field vlm_config VLM (Vision Language Model) OCR configuration.
|
|||
|
|
#' @field vlm_prompt Custom Jinja2 prompt template for VLM OCR.
|
|||
|
|
#' @field acceleration Hardware acceleration for ONNX Runtime models (e.g. PaddleOCR, layout detection).
|
|||
|
|
#' @field tessdata_bytes Caller-supplied Tesseract `traineddata` bytes per language code.
|
|||
|
|
#' @export
|
|||
|
|
# Method table for `OcrConfig`: a parentless environment holding the
# R-callable bindings that the `$` / `[[` dispatchers below look up.
OcrConfig <- new.env(parent = emptyenv())

# Invokes the native routine `wrap__OcrConfig__default`.
OcrConfig$default <- function() {
  .Call("wrap__OcrConfig__default", PACKAGE = "kreuzberg")
}

# Forwards `json` unchanged to the native routine `wrap__OcrConfig__from_json`.
OcrConfig$from_json <- function(json) {
  .Call(
    "wrap__OcrConfig__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}

#' @export
`$.OcrConfig` <- function(self, name) {
  # Look the member up in the method table; an unknown name yields NULL.
  member <- OcrConfig[[name]]
  first_formal <- names(formals(member))[1]
  if (!identical(first_formal, "self")) {
    # Member does not take `self` first (or is NULL): return it untouched.
    return(member)
  }
  # Member takes `self` first: bind the receiver as the leading argument.
  function(...) member(self, ...)
}

#' @export
`[[.OcrConfig` <- `$.OcrConfig`
|
|||
|
|
#' Page extraction and tracking configuration
|
|||
|
|
#'
|
|||
|
|
#' Controls how pages are extracted, tracked, and represented in the extraction results.
|
|||
|
|
#' When `None`, page tracking is disabled.
|
|||
|
|
#'
|
|||
|
|
#' Page range tracking in chunk metadata (first_page/last_page) is automatically enabled
|
|||
|
|
#' when page boundaries are available and chunking is configured.
|
|||
|
|
#' @field extract_pages Extract pages as separate array (ExtractionResult.pages)
|
|||
|
|
#' @field insert_page_markers Insert page markers in main content string
|
|||
|
|
#' @field marker_format Page marker format (use {page_num} placeholder). Default: "\n\n<!-- PAGE {page_num} -->\n\n"
|
|||
|
|
#' @export
|
|||
|
|
# Method table for `PageConfig`: a parentless environment holding the
# R-callable bindings that the `$` / `[[` dispatchers below look up.
PageConfig <- new.env(parent = emptyenv())

# Invokes the native routine `wrap__PageConfig__default`.
PageConfig$default <- function() {
  .Call("wrap__PageConfig__default", PACKAGE = "kreuzberg")
}

# Forwards `json` unchanged to the native routine
# `wrap__PageConfig__from_json`.
PageConfig$from_json <- function(json) {
  .Call(
    "wrap__PageConfig__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}

#' @export
`$.PageConfig` <- function(self, name) {
  # Look the member up in the method table; an unknown name yields NULL.
  member <- PageConfig[[name]]
  first_formal <- names(formals(member))[1]
  if (!identical(first_formal, "self")) {
    # Member does not take `self` first (or is NULL): return it untouched.
    return(member)
  }
  # Member takes `self` first: bind the receiver as the leading argument.
  function(...) member(self, ...)
}

#' @export
`[[.PageConfig` <- `$.PageConfig`
|
|||
|
|
#' PDF-specific configuration
|
|||
|
|
#' @field extract_images Extract images from PDF
|
|||
|
|
#' @field extract_tables Extract tables from PDF.
|
|||
|
|
#' @field passwords List of passwords to try when opening encrypted PDFs
|
|||
|
|
#' @field extract_metadata Extract PDF metadata
|
|||
|
|
#' @field hierarchy Hierarchy extraction configuration (None = hierarchy extraction disabled)
|
|||
|
|
#' @field extract_annotations Extract PDF annotations (text notes, highlights, links, stamps). Default: false
|
|||
|
|
#' @field top_margin_fraction Top margin fraction (0.0–1.0) of page height to exclude headers/running heads. Default:
|
|||
|
|
#' @field bottom_margin_fraction Bottom margin fraction (0.0–1.0) of page height to exclude footers/page numbers.
|
|||
|
|
#' @field allow_single_column_tables Allow single-column pseudo tables in extraction results.
|
|||
|
|
#' @field ocr_inline_images Perform OCR on inline images extracted from PDF pages and attach the recognized text to
|
|||
|
|
#' @export
|
|||
|
|
# Method table for `PdfConfig`: a parentless environment holding the
# R-callable bindings that the `$` / `[[` dispatchers below look up.
PdfConfig <- new.env(parent = emptyenv())

# Invokes the native routine `wrap__PdfConfig__default`.
PdfConfig$default <- function() {
  .Call("wrap__PdfConfig__default", PACKAGE = "kreuzberg")
}

# Forwards `json` unchanged to the native routine `wrap__PdfConfig__from_json`.
PdfConfig$from_json <- function(json) {
  .Call(
    "wrap__PdfConfig__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}

#' @export
`$.PdfConfig` <- function(self, name) {
  # Look the member up in the method table; an unknown name yields NULL.
  member <- PdfConfig[[name]]
  first_formal <- names(formals(member))[1]
  if (!identical(first_formal, "self")) {
    # Member does not take `self` first (or is NULL): return it untouched.
    return(member)
  }
  # Member takes `self` first: bind the receiver as the leading argument.
  function(...) member(self, ...)
}

#' @export
`[[.PdfConfig` <- `$.PdfConfig`
|
|||
|
|
#' Hierarchy extraction configuration for PDF text structure analysis
|
|||
|
|
#'
|
|||
|
|
#' Enables extraction of document hierarchy levels (H1-H6) based on font size
|
|||
|
|
#' clustering and semantic analysis. When enabled, hierarchical blocks are
|
|||
|
|
#' included in page content.
|
|||
|
|
#' @field enabled Enable hierarchy extraction
|
|||
|
|
#' @field k_clusters Number of font size clusters to use for hierarchy levels (1-7)
|
|||
|
|
#' @field include_bbox Include bounding box information in hierarchy blocks
|
|||
|
|
#' @field ocr_coverage_threshold OCR coverage threshold for smart OCR triggering (0.0-1.0)
|
|||
|
|
#' @export
|
|||
|
|
# Method table for `HierarchyConfig`: a parentless environment holding the
# R-callable bindings that the `$` / `[[` dispatchers below look up.
HierarchyConfig <- new.env(parent = emptyenv())

# Invokes the native routine `wrap__HierarchyConfig__default`.
HierarchyConfig$default <- function() {
  .Call("wrap__HierarchyConfig__default", PACKAGE = "kreuzberg")
}

# Forwards `json` unchanged to the native routine
# `wrap__HierarchyConfig__from_json`.
HierarchyConfig$from_json <- function(json) {
  .Call(
    "wrap__HierarchyConfig__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}

#' @export
`$.HierarchyConfig` <- function(self, name) {
  # Look the member up in the method table; an unknown name yields NULL.
  member <- HierarchyConfig[[name]]
  first_formal <- names(formals(member))[1]
  if (!identical(first_formal, "self")) {
    # Member does not take `self` first (or is NULL): return it untouched.
    return(member)
  }
  # Member takes `self` first: bind the receiver as the leading argument.
  function(...) member(self, ...)
}

#' @export
`[[.HierarchyConfig` <- `$.HierarchyConfig`
|
|||
|
|
#' Post-processor configuration
|
|||
|
|
#' @field enabled Enable post-processors
|
|||
|
|
#' @field enabled_processors Whitelist of processor names to run (None = all enabled)
|
|||
|
|
#' @field disabled_processors Blacklist of processor names to skip (None = none disabled)
|
|||
|
|
#' @field enabled_set Pre-computed AHashSet for O(1) enabled processor lookup
|
|||
|
|
#' @field disabled_set Pre-computed AHashSet for O(1) disabled processor lookup
|
|||
|
|
#' @export
|
|||
|
|
# Method table for `PostProcessorConfig`: a parentless environment holding the
# R-callable bindings that the `$` / `[[` dispatchers below look up.
PostProcessorConfig <- new.env(parent = emptyenv())

# Invokes the native routine `wrap__PostProcessorConfig__default`.
PostProcessorConfig$default <- function() {
  .Call("wrap__PostProcessorConfig__default", PACKAGE = "kreuzberg")
}

# Forwards `json` unchanged to the native routine
# `wrap__PostProcessorConfig__from_json`.
PostProcessorConfig$from_json <- function(json) {
  .Call(
    "wrap__PostProcessorConfig__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}

#' @export
`$.PostProcessorConfig` <- function(self, name) {
  # Look the member up in the method table; an unknown name yields NULL.
  member <- PostProcessorConfig[[name]]
  first_formal <- names(formals(member))[1]
  if (!identical(first_formal, "self")) {
    # Member does not take `self` first (or is NULL): return it untouched.
    return(member)
  }
  # Member takes `self` first: bind the receiver as the leading argument.
  function(...) member(self, ...)
}

#' @export
`[[.PostProcessorConfig` <- `$.PostProcessorConfig`
|
|||
|
|
#' Chunking configuration
|
|||
|
|
#'
|
|||
|
|
#' Configures text chunking for document content, including chunk size,
|
|||
|
|
#' overlap, trimming behavior, and optional embeddings.
|
|||
|
|
#'
|
|||
|
|
#' Use `..Default::default()` when constructing to allow for future field additions:
|
|||
|
|
#' ```rust
|
|||
|
|
#' let config = ChunkingConfig {
|
|||
|
|
#' max_characters: 500,
|
|||
|
|
#' ..Default::default()
|
|||
|
|
#' };
|
|||
|
|
#' ```
|
|||
|
|
#' @field max_characters Maximum size per chunk (in units determined by `sizing`).
|
|||
|
|
#' @field overlap Overlap between chunks (in units determined by `sizing`).
|
|||
|
|
#' @field trim Whether to trim whitespace from chunk boundaries.
|
|||
|
|
#' @field chunker_type Type of chunker to use (Text or Markdown).
|
|||
|
|
#' @field embedding Optional embedding configuration for chunk embeddings.
|
|||
|
|
#' @field preset Use a preset configuration (overrides individual settings if provided).
|
|||
|
|
#' @field sizing How to measure chunk size.
|
|||
|
|
#' @field prepend_heading_context When `true` and `chunker_type` is `Markdown`, prepend the heading hierarchy path
|
|||
|
|
#' @field topic_threshold Optional cosine similarity threshold for semantic topic boundary detection.
|
|||
|
|
#' @export
|
|||
|
|
# Method table for `ChunkingConfig`: a parentless environment holding the
# R-callable bindings that the `$` / `[[` dispatchers below look up.
ChunkingConfig <- new.env(parent = emptyenv())

# Invokes the native routine `wrap__ChunkingConfig__default`.
ChunkingConfig$default <- function() {
  .Call("wrap__ChunkingConfig__default", PACKAGE = "kreuzberg")
}

# Forwards `json` unchanged to the native routine
# `wrap__ChunkingConfig__from_json`.
ChunkingConfig$from_json <- function(json) {
  .Call(
    "wrap__ChunkingConfig__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}

#' @export
`$.ChunkingConfig` <- function(self, name) {
  # Look the member up in the method table; an unknown name yields NULL.
  member <- ChunkingConfig[[name]]
  first_formal <- names(formals(member))[1]
  if (!identical(first_formal, "self")) {
    # Member does not take `self` first (or is NULL): return it untouched.
    return(member)
  }
  # Member takes `self` first: bind the receiver as the leading argument.
  function(...) member(self, ...)
}

#' @export
`[[.ChunkingConfig` <- `$.ChunkingConfig`
|
|||
|
|
#' Embedding configuration for text chunks
|
|||
|
|
#'
|
|||
|
|
#' Configures embedding generation using ONNX models via the vendored embedding engine.
|
|||
|
|
#' Requires the `embeddings` feature to be enabled.
|
|||
|
|
#' @field model The embedding model to use (defaults to "balanced" preset if not specified)
|
|||
|
|
#' @field normalize Whether to normalize embedding vectors (recommended for cosine similarity)
|
|||
|
|
#' @field batch_size Batch size for embedding generation
|
|||
|
|
#' @field show_download_progress Show model download progress
|
|||
|
|
#' @field cache_dir Custom cache directory for model files
|
|||
|
|
#' @field acceleration Hardware acceleration for the embedding ONNX model.
|
|||
|
|
#' @field max_embed_duration_secs Maximum wall-clock duration (in seconds) for a single `embed()` call when using
|
|||
|
|
#' @export
|
|||
|
|
# Method table for `EmbeddingConfig`: a parentless environment holding the
# R-callable bindings that the `$` / `[[` dispatchers below look up.
EmbeddingConfig <- new.env(parent = emptyenv())

# Invokes the native routine `wrap__EmbeddingConfig__default`.
EmbeddingConfig$default <- function() {
  .Call("wrap__EmbeddingConfig__default", PACKAGE = "kreuzberg")
}

# Forwards `json` unchanged to the native routine
# `wrap__EmbeddingConfig__from_json`.
EmbeddingConfig$from_json <- function(json) {
  .Call(
    "wrap__EmbeddingConfig__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}

#' @export
`$.EmbeddingConfig` <- function(self, name) {
  # Look the member up in the method table; an unknown name yields NULL.
  member <- EmbeddingConfig[[name]]
  first_formal <- names(formals(member))[1]
  if (!identical(first_formal, "self")) {
    # Member does not take `self` first (or is NULL): return it untouched.
    return(member)
  }
  # Member takes `self` first: bind the receiver as the leading argument.
  function(...) member(self, ...)
}

#' @export
`[[.EmbeddingConfig` <- `$.EmbeddingConfig`
|
|||
|
|
#' Configuration for tree-sitter language pack integration
|
|||
|
|
#'
|
|||
|
|
#' Controls grammar download behavior and code analysis options.
|
|||
|
|
#'
|
|||
|
|
#' # Example (TOML)
|
|||
|
|
#'
|
|||
|
|
#' ```toml
|
|||
|
|
#' [tree_sitter]
|
|||
|
|
#' languages = ["python", "rust"]
|
|||
|
|
#' groups = ["web"]
|
|||
|
|
#'
|
|||
|
|
#' [tree_sitter.process]
|
|||
|
|
#' structure = true
|
|||
|
|
#' comments = true
|
|||
|
|
#' docstrings = true
|
|||
|
|
#' ```
|
|||
|
|
#' @field enabled Enable code intelligence processing (default: true).
|
|||
|
|
#' @field cache_dir Custom cache directory for downloaded grammars.
|
|||
|
|
#' @field languages Languages to pre-download on init (e.g., `["python", "rust"]`).
|
|||
|
|
#' @field groups Language groups to pre-download (e.g., `["web", "systems", "scripting"]`).
|
|||
|
|
#' @field process Processing options for code analysis.
|
|||
|
|
#' @export
|
|||
|
|
# Method table for `TreeSitterConfig`: a parentless environment holding the
# R-callable bindings that the `$` / `[[` dispatchers below look up.
TreeSitterConfig <- new.env(parent = emptyenv())

# Invokes the native routine `wrap__TreeSitterConfig__default`.
TreeSitterConfig$default <- function() {
  .Call("wrap__TreeSitterConfig__default", PACKAGE = "kreuzberg")
}

# Forwards `json` unchanged to the native routine
# `wrap__TreeSitterConfig__from_json`.
TreeSitterConfig$from_json <- function(json) {
  .Call(
    "wrap__TreeSitterConfig__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}

#' @export
`$.TreeSitterConfig` <- function(self, name) {
  # Look the member up in the method table; an unknown name yields NULL.
  member <- TreeSitterConfig[[name]]
  first_formal <- names(formals(member))[1]
  if (!identical(first_formal, "self")) {
    # Member does not take `self` first (or is NULL): return it untouched.
    return(member)
  }
  # Member takes `self` first: bind the receiver as the leading argument.
  function(...) member(self, ...)
}

#' @export
`[[.TreeSitterConfig` <- `$.TreeSitterConfig`
|
|||
|
|
#' Processing options for tree-sitter code analysis
|
|||
|
|
#'
|
|||
|
|
#' Controls which analysis features are enabled when extracting code files.
|
|||
|
|
#' @field structure Extract structural items (functions, classes, structs, etc.). Default: true.
|
|||
|
|
#' @field imports Extract import statements. Default: true.
|
|||
|
|
#' @field exports Extract export statements. Default: true.
|
|||
|
|
#' @field comments Extract comments. Default: false.
|
|||
|
|
#' @field docstrings Extract docstrings. Default: false.
|
|||
|
|
#' @field symbols Extract symbol definitions. Default: false.
|
|||
|
|
#' @field diagnostics Include parse diagnostics. Default: false.
|
|||
|
|
#' @field chunk_max_size Maximum chunk size in bytes. `None` disables chunking.
|
|||
|
|
#' @field content_mode Content rendering mode for code extraction.
|
|||
|
|
#' @export
|
|||
|
|
# Method table for `TreeSitterProcessConfig`: a parentless environment holding
# the R-callable bindings that the `$` / `[[` dispatchers below look up.
TreeSitterProcessConfig <- new.env(parent = emptyenv())

# Invokes the native routine `wrap__TreeSitterProcessConfig__default`.
TreeSitterProcessConfig$default <- function() {
  .Call("wrap__TreeSitterProcessConfig__default", PACKAGE = "kreuzberg")
}

# Forwards `json` unchanged to the native routine
# `wrap__TreeSitterProcessConfig__from_json`.
TreeSitterProcessConfig$from_json <- function(json) {
  .Call(
    "wrap__TreeSitterProcessConfig__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}

#' @export
`$.TreeSitterProcessConfig` <- function(self, name) {
  # Look the member up in the method table; an unknown name yields NULL.
  member <- TreeSitterProcessConfig[[name]]
  first_formal <- names(formals(member))[1]
  if (!identical(first_formal, "self")) {
    # Member does not take `self` first (or is NULL): return it untouched.
    return(member)
  }
  # Member takes `self` first: bind the receiver as the leading argument.
  function(...) member(self, ...)
}

#' @export
`[[.TreeSitterProcessConfig` <- `$.TreeSitterProcessConfig`
|
|||
|
|
#' A supported document format entry
#'
#' Represents a file extension and its corresponding MIME type that Kreuzberg can process.
#' @field extension File extension (without leading dot), e.g., "pdf", "docx"
#' @field mime_type MIME type string, e.g., "application/pdf"
#' @export
SupportedFormat <- new.env(parent = emptyenv())

# Member access for SupportedFormat objects: fetch `name` from the
# SupportedFormat environment. An entry whose first formal is `self` is
# returned with this object pre-bound; any other entry is returned unchanged.
#' @export
`$.SupportedFormat` <- function(self, name) {
  member <- SupportedFormat[[name]]
  first_formal <- names(formals(member))[1]
  if (!identical(first_formal, "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` reuses the `$` accessor, so both operators resolve members the same way.
#' @export
`[[.SupportedFormat` <- `$.SupportedFormat`
|
|||
|
|
#' API server configuration
#'
#' This struct holds all configuration options for the Kreuzberg API server,
#' including host/port settings, CORS configuration, and upload limits.
#'
#' # Defaults
#'
#' - `host`: "127.0.0.1" (localhost only)
#' - `port`: 8000
#' - `cors_origins`: empty vector (allows all origins)
#' - `max_request_body_bytes`: 104_857_600 (100 MB)
#' - `max_multipart_field_bytes`: 104_857_600 (100 MB)
#' @field host Server host address (e.g., "127.0.0.1", "0.0.0.0")
#' @field port Server port number
#' @field cors_origins CORS allowed origins. Empty vector means allow all origins.
#' @field max_request_body_bytes Maximum size of request body in bytes (default: 100 MB)
#' @field max_multipart_field_bytes Maximum size of multipart fields in bytes (default: 100 MB)
#' @export
ServerConfig <- new.env(parent = emptyenv())

# Static constructor: returns the result of the native
# `wrap__ServerConfig__default` routine.
ServerConfig$default <- function() {
  .Call("wrap__ServerConfig__default", PACKAGE = "kreuzberg")
}

# Instance members: each one passes `self` (plus any extra argument) to the
# native routine of the same name and returns its result.
ServerConfig$listen_addr <- function(self) {
  .Call(
    "wrap__ServerConfig__listen_addr",
    self,
    PACKAGE = "kreuzberg"
  )
}

ServerConfig$cors_allows_all <- function(self) {
  .Call(
    "wrap__ServerConfig__cors_allows_all",
    self,
    PACKAGE = "kreuzberg"
  )
}

ServerConfig$is_origin_allowed <- function(self, origin) {
  .Call(
    "wrap__ServerConfig__is_origin_allowed",
    self,
    origin,
    PACKAGE = "kreuzberg"
  )
}

ServerConfig$max_request_body_mb <- function(self) {
  .Call(
    "wrap__ServerConfig__max_request_body_mb",
    self,
    PACKAGE = "kreuzberg"
  )
}

ServerConfig$max_multipart_field_mb <- function(self) {
  .Call(
    "wrap__ServerConfig__max_multipart_field_mb",
    self,
    PACKAGE = "kreuzberg"
  )
}

# Static constructor: forwards `json` to the native
# `wrap__ServerConfig__from_json` routine and returns its result.
ServerConfig$from_json <- function(json) {
  .Call(
    "wrap__ServerConfig__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}

# Member access for ServerConfig objects: fetch `name` from the ServerConfig
# environment. An entry whose first formal is `self` is returned with this
# object pre-bound; any other entry is returned unchanged.
#' @export
`$.ServerConfig` <- function(self, name) {
  member <- ServerConfig[[name]]
  first_formal <- names(formals(member))[1]
  if (!identical(first_formal, "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` reuses the `$` accessor, so both operators resolve members the same way.
#' @export
`[[.ServerConfig` <- `$.ServerConfig`

# S3 methods that forward to the ServerConfig member of the same name.
# NOTE(review): the matching generics are not defined in this part of the
# file; presumably they are declared elsewhere - confirm.
#' @export
listen_addr.ServerConfig <- function(x, ...) {
  x$listen_addr(...)
}

#' @export
cors_allows_all.ServerConfig <- function(x, ...) {
  x$cors_allows_all(...)
}

#' @export
is_origin_allowed.ServerConfig <- function(x, ...) {
  x$is_origin_allowed(...)
}

#' @export
max_request_body_mb.ServerConfig <- function(x, ...) {
  x$max_request_body_mb(...)
}

#' @export
max_multipart_field_mb.ServerConfig <- function(x, ...) {
  x$max_multipart_field_mb(...)
}
|
|||
|
|
#' StructuredDataResult
#' @field content content
#' @field format format
#' @field metadata metadata
#' @field text_fields text_fields
#' @export
StructuredDataResult <- new.env(parent = emptyenv())

# Member access for StructuredDataResult objects: fetch `name` from the
# StructuredDataResult environment. An entry whose first formal is `self` is
# returned with this object pre-bound; any other entry is returned unchanged.
#' @export
`$.StructuredDataResult` <- function(self, name) {
  member <- StructuredDataResult[[name]]
  first_formal <- names(formals(member))[1]
  if (!identical(first_formal, "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` reuses the `$` accessor, so both operators resolve members the same way.
#' @export
`[[.StructuredDataResult` <- `$.StructuredDataResult`
|
|||
|
|
#' Application properties from docProps/app.xml for DOCX
#'
#' Contains Word-specific document statistics and metadata.
#' @field application Application name (e.g., "Microsoft Office Word")
#' @field app_version Application version
#' @field template Template filename
#' @field total_time Total editing time in minutes
#' @field pages Number of pages
#' @field words Number of words
#' @field characters Number of characters (excluding spaces)
#' @field characters_with_spaces Number of characters (including spaces)
#' @field lines Number of lines
#' @field paragraphs Number of paragraphs
#' @field company Company name
#' @field doc_security Document security level
#' @field scale_crop Scale crop flag
#' @field links_up_to_date Links up to date flag
#' @field shared_doc Shared document flag
#' @field hyperlinks_changed Hyperlinks changed flag
#' @export
DocxAppProperties <- new.env(parent = emptyenv())

# Static constructor: forwards `json` to the native
# `wrap__DocxAppProperties__from_json` routine and returns its result.
DocxAppProperties$from_json <- function(json) {
  .Call(
    "wrap__DocxAppProperties__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}

# Member access for DocxAppProperties objects: fetch `name` from the
# DocxAppProperties environment. An entry whose first formal is `self` is
# returned with this object pre-bound; any other entry is returned unchanged.
#' @export
`$.DocxAppProperties` <- function(self, name) {
  member <- DocxAppProperties[[name]]
  first_formal <- names(formals(member))[1]
  if (!identical(first_formal, "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` reuses the `$` accessor, so both operators resolve members the same way.
#' @export
`[[.DocxAppProperties` <- `$.DocxAppProperties`
|
|||
|
|
#' Application properties from docProps/app.xml for XLSX
#'
#' Contains Excel-specific document metadata.
#' @field application Application name (e.g., "Microsoft Excel")
#' @field app_version Application version
#' @field doc_security Document security level
#' @field scale_crop Scale crop flag
#' @field links_up_to_date Links up to date flag
#' @field shared_doc Shared document flag
#' @field hyperlinks_changed Hyperlinks changed flag
#' @field company Company name
#' @field worksheet_names Worksheet names
#' @export
XlsxAppProperties <- new.env(parent = emptyenv())

# Member access for XlsxAppProperties objects: fetch `name` from the
# XlsxAppProperties environment. An entry whose first formal is `self` is
# returned with this object pre-bound; any other entry is returned unchanged.
#' @export
`$.XlsxAppProperties` <- function(self, name) {
  member <- XlsxAppProperties[[name]]
  first_formal <- names(formals(member))[1]
  if (!identical(first_formal, "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` reuses the `$` accessor, so both operators resolve members the same way.
#' @export
`[[.XlsxAppProperties` <- `$.XlsxAppProperties`
|
|||
|
|
#' Application properties from docProps/app.xml for PPTX
#'
#' Contains PowerPoint-specific document metadata.
#' @field application Application name (e.g., "Microsoft Office PowerPoint")
#' @field app_version Application version
#' @field total_time Total editing time in minutes
#' @field company Company name
#' @field doc_security Document security level
#' @field scale_crop Scale crop flag
#' @field links_up_to_date Links up to date flag
#' @field shared_doc Shared document flag
#' @field hyperlinks_changed Hyperlinks changed flag
#' @field slides Number of slides
#' @field notes Number of notes
#' @field hidden_slides Number of hidden slides
#' @field multimedia_clips Number of multimedia clips
#' @field presentation_format Presentation format (e.g., "Widescreen", "Standard")
#' @field slide_titles Slide titles
#' @export
PptxAppProperties <- new.env(parent = emptyenv())

# Member access for PptxAppProperties objects: fetch `name` from the
# PptxAppProperties environment. An entry whose first formal is `self` is
# returned with this object pre-bound; any other entry is returned unchanged.
#' @export
`$.PptxAppProperties` <- function(self, name) {
  member <- PptxAppProperties[[name]]
  first_formal <- names(formals(member))[1]
  if (!identical(first_formal, "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` reuses the `$` accessor, so both operators resolve members the same way.
#' @export
`[[.PptxAppProperties` <- `$.PptxAppProperties`
|
|||
|
|
#' Dublin Core metadata from docProps/core.xml
#'
#' Contains standard metadata fields defined by the Dublin Core standard
#' and Office-specific extensions.
#' @field title Document title
#' @field subject Document subject/topic
#' @field creator Document creator/author
#' @field keywords Keywords or tags
#' @field description Document description/abstract
#' @field last_modified_by User who last modified the document
#' @field revision Revision number
#' @field created Creation timestamp (ISO 8601)
#' @field modified Last modification timestamp (ISO 8601)
#' @field category Document category
#' @field content_status Content status (Draft, Final, etc.)
#' @field language Document language
#' @field identifier Unique identifier
#' @field version Document version
#' @field last_printed Last print timestamp (ISO 8601)
#' @export
CoreProperties <- new.env(parent = emptyenv())

# Static constructor: forwards `json` to the native
# `wrap__CoreProperties__from_json` routine and returns its result.
CoreProperties$from_json <- function(json) {
  .Call(
    "wrap__CoreProperties__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}

# Member access for CoreProperties objects: fetch `name` from the
# CoreProperties environment. An entry whose first formal is `self` is
# returned with this object pre-bound; any other entry is returned unchanged.
#' @export
`$.CoreProperties` <- function(self, name) {
  member <- CoreProperties[[name]]
  first_formal <- names(formals(member))[1]
  if (!identical(first_formal, "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` reuses the `$` accessor, so both operators resolve members the same way.
#' @export
`[[.CoreProperties` <- `$.CoreProperties`
|
|||
|
|
#' Configuration for security limits across extractors
#'
#' All limits are intentionally conservative to prevent DoS attacks
#' while still supporting legitimate documents.
#' @field max_archive_size Maximum uncompressed size for archives (500 MB)
#' @field max_compression_ratio Maximum compression ratio before flagging as potential bomb (100:1)
#' @field max_files_in_archive Maximum number of files in archive (10,000)
#' @field max_nesting_depth Maximum nesting depth for structures (100)
#' @field max_entity_length Maximum length of any single XML entity / attribute / token (1 MiB). This is a per-token limit.
#' @field max_content_size Maximum string growth per document (100 MB)
#' @field max_iterations Maximum iterations per operation
#' @field max_xml_depth Maximum XML depth (100 levels)
#' @field max_table_cells Maximum cells per table (100,000)
#' @export
SecurityLimits <- new.env(parent = emptyenv())

# Static constructor: returns the result of the native
# `wrap__SecurityLimits__default` routine.
SecurityLimits$default <- function() {
  .Call("wrap__SecurityLimits__default", PACKAGE = "kreuzberg")
}

# Static constructor: forwards `json` to the native
# `wrap__SecurityLimits__from_json` routine and returns its result.
SecurityLimits$from_json <- function(json) {
  .Call(
    "wrap__SecurityLimits__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}

# Member access for SecurityLimits objects: fetch `name` from the
# SecurityLimits environment. An entry whose first formal is `self` is
# returned with this object pre-bound; any other entry is returned unchanged.
#' @export
`$.SecurityLimits` <- function(self, name) {
  member <- SecurityLimits[[name]]
  first_formal <- names(formals(member))[1]
  if (!identical(first_formal, "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` reuses the `$` accessor, so both operators resolve members the same way.
#' @export
`[[.SecurityLimits` <- `$.SecurityLimits`
|
|||
|
|
#' TokenReductionConfig
#' @field level level
#' @field language_hint language_hint
#' @field preserve_markdown preserve_markdown
#' @field preserve_code preserve_code
#' @field semantic_threshold semantic_threshold
#' @field enable_parallel enable_parallel
#' @field use_simd use_simd
#' @field custom_stopwords custom_stopwords
#' @field preserve_patterns preserve_patterns
#' @field target_reduction target_reduction
#' @field enable_semantic_clustering enable_semantic_clustering
#' @export
TokenReductionConfig <- new.env(parent = emptyenv())

# Static constructor: returns the result of the native
# `wrap__TokenReductionConfig__default` routine.
TokenReductionConfig$default <- function() {
  .Call("wrap__TokenReductionConfig__default", PACKAGE = "kreuzberg")
}

# Static constructor: forwards `json` to the native
# `wrap__TokenReductionConfig__from_json` routine and returns its result.
TokenReductionConfig$from_json <- function(json) {
  .Call(
    "wrap__TokenReductionConfig__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}

# Member access for TokenReductionConfig objects: fetch `name` from the
# TokenReductionConfig environment. An entry whose first formal is `self` is
# returned with this object pre-bound; any other entry is returned unchanged.
#' @export
`$.TokenReductionConfig` <- function(self, name) {
  member <- TokenReductionConfig[[name]]
  first_formal <- names(formals(member))[1]
  if (!identical(first_formal, "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` reuses the `$` accessor, so both operators resolve members the same way.
#' @export
`[[.TokenReductionConfig` <- `$.TokenReductionConfig`
|
|||
|
|
#' A PDF annotation extracted from a document page
#' @field annotation_type The type of annotation.
#' @field content Text content of the annotation (e.g., comment text, link URL).
#' @field page_number Page number where the annotation appears (1-indexed).
#' @field bounding_box Bounding box of the annotation on the page.
#' @export
PdfAnnotation <- new.env(parent = emptyenv())

# Member access for PdfAnnotation objects: fetch `name` from the
# PdfAnnotation environment. An entry whose first formal is `self` is
# returned with this object pre-bound; any other entry is returned unchanged.
#' @export
`$.PdfAnnotation` <- function(self, name) {
  member <- PdfAnnotation[[name]]
  first_formal <- names(formals(member))[1]
  if (!identical(first_formal, "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` reuses the `$` accessor, so both operators resolve members the same way.
#' @export
`[[.PdfAnnotation` <- `$.PdfAnnotation`
|
|||
|
|
#' Inline element within a block
#'
#' Represents text with formatting, links, images, etc.
#' @field element_type Type of inline element
#' @field content Text content
#' @field attributes Element attributes
#' @field metadata Additional metadata (e.g., href for links, src/alt for images)
#' @export
InlineElement <- new.env(parent = emptyenv())

# Member access for InlineElement objects: fetch `name` from the
# InlineElement environment. An entry whose first formal is `self` is
# returned with this object pre-bound; any other entry is returned unchanged.
#' @export
`$.InlineElement` <- function(self, name) {
  member <- InlineElement[[name]]
  first_formal <- names(formals(member))[1]
  if (!identical(first_formal, "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` reuses the `$` accessor, so both operators resolve members the same way.
#' @export
`[[.InlineElement` <- `$.InlineElement`
|
|||
|
|
#' Image element in Djot
#' @field src Image source URL or path
#' @field alt Alternative text
#' @field title Optional title
#' @field attributes Element attributes
#' @export
DjotImage <- new.env(parent = emptyenv())

# Member access for DjotImage objects: fetch `name` from the DjotImage
# environment. An entry whose first formal is `self` is returned with this
# object pre-bound; any other entry is returned unchanged.
#' @export
`$.DjotImage` <- function(self, name) {
  member <- DjotImage[[name]]
  first_formal <- names(formals(member))[1]
  if (!identical(first_formal, "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` reuses the `$` accessor, so both operators resolve members the same way.
#' @export
`[[.DjotImage` <- `$.DjotImage`
|
|||
|
|
#' Link element in Djot
#' @field url Link URL
#' @field text Link text content
#' @field title Optional title
#' @field attributes Element attributes
#' @export
DjotLink <- new.env(parent = emptyenv())

# Member access for DjotLink objects: fetch `name` from the DjotLink
# environment. An entry whose first formal is `self` is returned with this
# object pre-bound; any other entry is returned unchanged.
#' @export
`$.DjotLink` <- function(self, name) {
  member <- DjotLink[[name]]
  first_formal <- names(formals(member))[1]
  if (!identical(first_formal, "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` reuses the `$` accessor, so both operators resolve members the same way.
#' @export
`[[.DjotLink` <- `$.DjotLink`
|
|||
|
|
#' A resolved relationship between two nodes in the document tree
#' @field source Source node index (the referencing node).
#' @field target Target node index (the referenced node).
#' @field kind Semantic kind of the relationship.
#' @export
DocumentRelationship <- new.env(parent = emptyenv())

# Member access for DocumentRelationship objects: fetch `name` from the
# DocumentRelationship environment. An entry whose first formal is `self` is
# returned with this object pre-bound; any other entry is returned unchanged.
#' @export
`$.DocumentRelationship` <- function(self, name) {
  member <- DocumentRelationship[[name]]
  first_formal <- names(formals(member))[1]
  if (!identical(first_formal, "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` reuses the `$` accessor, so both operators resolve members the same way.
#' @export
`[[.DocumentRelationship` <- `$.DocumentRelationship`
|
|||
|
|
#' Individual grid cell with position and span metadata
#' @field content Cell text content.
#' @field row Zero-indexed row position.
#' @field col Zero-indexed column position.
#' @field row_span Number of rows this cell spans.
#' @field col_span Number of columns this cell spans.
#' @field is_header Whether this is a header cell.
#' @field bbox Bounding box for this cell (if available).
#' @export
GridCell <- new.env(parent = emptyenv())

# Member access for GridCell objects: fetch `name` from the GridCell
# environment. An entry whose first formal is `self` is returned with this
# object pre-bound; any other entry is returned unchanged.
#' @export
`$.GridCell` <- function(self, name) {
  member <- GridCell[[name]]
  first_formal <- names(formals(member))[1]
  if (!identical(first_formal, "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` reuses the `$` accessor, so both operators resolve members the same way.
#' @export
`[[.GridCell` <- `$.GridCell`
|
|||
|
|
#' Inline text annotation — byte-range based formatting and links
#'
#' Annotations reference byte offsets into the node's text content,
#' enabling precise identification of formatted regions.
#' @field start Start byte offset in the node's text content (inclusive).
#' @field end End byte offset in the node's text content (exclusive).
#' @field kind Annotation type.
#' @export
TextAnnotation <- new.env(parent = emptyenv())

# Member access for TextAnnotation objects: fetch `name` from the
# TextAnnotation environment. An entry whose first formal is `self` is
# returned with this object pre-bound; any other entry is returned unchanged.
#' @export
`$.TextAnnotation` <- function(self, name) {
  member <- TextAnnotation[[name]]
  first_formal <- names(formals(member))[1]
  if (!identical(first_formal, "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` reuses the `$` accessor, so both operators resolve members the same way.
#' @export
`[[.TextAnnotation` <- `$.TextAnnotation`
|
|||
|
|
#' A single file extracted from an archive
#'
#' When archives (ZIP, TAR, 7Z, GZIP) are extracted with recursive extraction
#' enabled, each processable file produces its own full `ExtractionResult`.
#' @field path Archive-relative file path (e.g. "folder/document.pdf").
#' @field mime_type Detected MIME type of the file.
#' @field result Full extraction result for this file.
#' @export
ArchiveEntry <- new.env(parent = emptyenv())

# Member access for ArchiveEntry objects: fetch `name` from the ArchiveEntry
# environment. An entry whose first formal is `self` is returned with this
# object pre-bound; any other entry is returned unchanged.
#' @export
`$.ArchiveEntry` <- function(self, name) {
  member <- ArchiveEntry[[name]]
  first_formal <- names(formals(member))[1]
  if (!identical(first_formal, "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` reuses the `$` accessor, so both operators resolve members the same way.
#' @export
`[[.ArchiveEntry` <- `$.ArchiveEntry`
|
|||
|
|
#' A non-fatal warning from a processing pipeline stage
#'
#' Captures errors from optional features that don't prevent extraction
#' but may indicate degraded results.
#' @field source The pipeline stage or feature that produced this warning (e.g., "embedding", "chunking").
#' @field message Human-readable description of what went wrong.
#' @export
ProcessingWarning <- new.env(parent = emptyenv())

# Member access for ProcessingWarning objects: fetch `name` from the
# ProcessingWarning environment. An entry whose first formal is `self` is
# returned with this object pre-bound; any other entry is returned unchanged.
#' @export
`$.ProcessingWarning` <- function(self, name) {
  member <- ProcessingWarning[[name]]
  first_formal <- names(formals(member))[1]
  if (!identical(first_formal, "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` reuses the `$` accessor, so both operators resolve members the same way.
#' @export
`[[.ProcessingWarning` <- `$.ProcessingWarning`
|
|||
|
|
#' Token usage and cost data for a single LLM call made during extraction
#'
#' Populated when VLM OCR, structured extraction, or LLM-based embeddings
#' are used. Multiple entries may be present when multiple LLM calls occur
#' within one extraction (e.g. VLM OCR + structured extraction).
#' @field model The LLM model identifier (e.g. "openai/gpt-4o", "anthropic/claude-sonnet-4-20250514").
#' @field source The pipeline stage that triggered this LLM call (e.g. "vlm_ocr", "structured_extraction").
#' @field input_tokens Number of input/prompt tokens consumed.
#' @field output_tokens Number of output/completion tokens generated.
#' @field total_tokens Total tokens (input + output).
#' @field estimated_cost Estimated cost in USD based on the provider's published pricing.
#' @field finish_reason Why the model stopped generating (e.g. "stop", "length", "content_filter").
#' @export
LlmUsage <- new.env(parent = emptyenv())

# Static constructor: forwards `json` to the native
# `wrap__LlmUsage__from_json` routine and returns its result.
LlmUsage$from_json <- function(json) {
  .Call(
    "wrap__LlmUsage__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}

# Member access for LlmUsage objects: fetch `name` from the LlmUsage
# environment. An entry whose first formal is `self` is returned with this
# object pre-bound; any other entry is returned unchanged.
#' @export
`$.LlmUsage` <- function(self, name) {
  member <- LlmUsage[[name]]
  first_formal <- names(formals(member))[1]
  if (!identical(first_formal, "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` reuses the `$` accessor, so both operators resolve members the same way.
#' @export
`[[.LlmUsage` <- `$.LlmUsage`
|
|||
|
|
#' A text chunk with optional embedding and metadata
#'
#' Chunks are created when chunking is enabled in `ExtractionConfig`. Each chunk
#' contains the text content, optional embedding vector (if embedding generation
#' is configured), and metadata about its position in the document.
#' @field content The text content of this chunk.
#' @field chunk_type Semantic structural classification of this chunk.
#' @field embedding Optional embedding vector for this chunk.
#' @field metadata Metadata about this chunk's position and properties.
#' @export
Chunk <- new.env(parent = emptyenv())

# Member access for Chunk objects: fetch `name` from the Chunk environment.
# An entry whose first formal is `self` is returned with this object
# pre-bound; any other entry is returned unchanged.
#' @export
`$.Chunk` <- function(self, name) {
  member <- Chunk[[name]]
  first_formal <- names(formals(member))[1]
  if (!identical(first_formal, "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` reuses the `$` accessor, so both operators resolve members the same way.
#' @export
`[[.Chunk` <- `$.Chunk`
|
|||
|
|
#' A single heading in the hierarchy
#' @field level Heading depth (1 = h1, 2 = h2, etc.)
#' @field text The text content of the heading.
#' @export
HeadingLevel <- new.env(parent = emptyenv())

# Member access for HeadingLevel objects: fetch `name` from the HeadingLevel
# environment. An entry whose first formal is `self` is returned with this
# object pre-bound; any other entry is returned unchanged.
#' @export
`$.HeadingLevel` <- function(self, name) {
  member <- HeadingLevel[[name]]
  first_formal <- names(formals(member))[1]
  if (!identical(first_formal, "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` reuses the `$` accessor, so both operators resolve members the same way.
#' @export
`[[.HeadingLevel` <- `$.HeadingLevel`
|
|||
|
|
#' Metadata about a chunk's position in the original document
#' @field byte_start Byte offset where this chunk starts in the original text (UTF-8 valid boundary).
#' @field byte_end Byte offset where this chunk ends in the original text (UTF-8 valid boundary).
#' @field token_count Number of tokens in this chunk (if available).
#' @field chunk_index Zero-based index of this chunk in the document.
#' @field total_chunks Total number of chunks in the document.
#' @field first_page First page number this chunk spans (1-indexed).
#' @field last_page Last page number this chunk spans (1-indexed, equal to first_page for single-page chunks).
#' @field heading_context Heading context when using Markdown chunker.
#' @field image_indices Indices into `ExtractionResult.images` for images on pages covered by this chunk.
#' @export
ChunkMetadata <- new.env(parent = emptyenv())

# Member access for ChunkMetadata objects: fetch `name` from the
# ChunkMetadata environment. An entry whose first formal is `self` is
# returned with this object pre-bound; any other entry is returned unchanged.
#' @export
`$.ChunkMetadata` <- function(self, name) {
  member <- ChunkMetadata[[name]]
  first_formal <- names(formals(member))[1]
  if (!identical(first_formal, "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` reuses the `$` accessor, so both operators resolve members the same way.
#' @export
`[[.ChunkMetadata` <- `$.ChunkMetadata`
|
|||
|
|
#' Extracted image from a document
#'
#' Contains raw image data, metadata, and optional nested OCR results.
#' Raw bytes allow cross-language compatibility - users can convert to
#' PIL.Image (Python), Sharp (Node.js), or other formats as needed.
#' @field data Raw image data (PNG, JPEG, WebP, etc. bytes). Uses `bytes::Bytes` for cheap cloning of large buffers.
#' @field format Image format (e.g., "jpeg", "png", "webp").
#' @field image_index Zero-indexed position of this image in the document/page
#' @field page_number Page/slide number where image was found (1-indexed)
#' @field width Image width in pixels
#' @field height Image height in pixels
#' @field colorspace Colorspace information (e.g., "RGB", "CMYK", "Gray")
#' @field bits_per_component Bits per color component (e.g., 8, 16)
#' @field is_mask Whether this image is a mask image
#' @field description Optional description of the image
#' @field ocr_result Nested OCR extraction result (if image was OCRed)
#' @field bounding_box Bounding box of the image on the page (PDF coordinates: x0=left, y0=bottom, x1=right, y1=top).
#' @field source_path Original source path of the image within the document archive (e.g., "media/image1.png" in DOCX).
#' @field image_kind Heuristic classification of what this image likely depicts. `None` if classification was disabled.
#' @field kind_confidence Confidence score for `image_kind`, in the range 0.0 to 1.0.
#' @field cluster_id Identifier shared across images that form a single logical figure (e.g. raster tiles belonging to the same figure).
#' @export
ExtractedImage <- new.env(parent = emptyenv())

# Member access for ExtractedImage objects: fetch `name` from the
# ExtractedImage environment. An entry whose first formal is `self` is
# returned with this object pre-bound; any other entry is returned unchanged.
#' @export
`$.ExtractedImage` <- function(self, name) {
  member <- ExtractedImage[[name]]
  first_formal <- names(formals(member))[1]
  if (!identical(first_formal, "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` reuses the `$` accessor, so both operators resolve members the same way.
#' @export
`[[.ExtractedImage` <- `$.ExtractedImage`
|
|||
|
|
#' Bounding box coordinates for element positioning
#' @field x0 Left x-coordinate
#' @field y0 Bottom y-coordinate
#' @field x1 Right x-coordinate
#' @field y1 Top y-coordinate
#' @export
BoundingBox <- new.env(parent = emptyenv())

# Static constructor: forwards `json` to the native
# `wrap__BoundingBox__from_json` routine and returns its result.
BoundingBox$from_json <- function(json) {
  .Call(
    "wrap__BoundingBox__from_json",
    json,
    PACKAGE = "kreuzberg"
  )
}

# Member access for BoundingBox objects: fetch `name` from the BoundingBox
# environment. An entry whose first formal is `self` is returned with this
# object pre-bound; any other entry is returned unchanged.
#' @export
`$.BoundingBox` <- function(self, name) {
  member <- BoundingBox[[name]]
  first_formal <- names(formals(member))[1]
  if (!identical(first_formal, "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` reuses the `$` accessor, so both operators resolve members the same way.
#' @export
`[[.BoundingBox` <- `$.BoundingBox`
|
|||
|
|
#' Metadata for a semantic element
#' @field page_number Page number (1-indexed)
#' @field filename Source filename or document name
#' @field coordinates Bounding box coordinates if available
#' @field element_index Position index in the element sequence
#' @field additional Additional custom metadata
#' @export
ElementMetadata <- new.env(parent = emptyenv())

# Member access for ElementMetadata objects: fetch `name` from the
# ElementMetadata environment. An entry whose first formal is `self` is
# returned with this object pre-bound; any other entry is returned unchanged.
#' @export
`$.ElementMetadata` <- function(self, name) {
  member <- ElementMetadata[[name]]
  first_formal <- names(formals(member))[1]
  if (!identical(first_formal, "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` reuses the `$` accessor, so both operators resolve members the same way.
#' @export
`[[.ElementMetadata` <- `$.ElementMetadata`
|
|||
|
|
#' Semantic element extracted from document
#'
#' Represents a logical unit of content with semantic classification,
#' unique identifier, and metadata for tracking origin and position.
#' @field element_id Unique element identifier
#' @field element_type Semantic type of this element
#' @field text Text content of the element
#' @field metadata Metadata about the element
#' @export
Element <- new.env(parent = emptyenv())

# Member access for Element objects: fetch `name` from the Element
# environment. An entry whose first formal is `self` is returned with this
# object pre-bound; any other entry is returned unchanged.
#' @export
`$.Element` <- function(self, name) {
  member <- Element[[name]]
  first_formal <- names(formals(member))[1]
  if (!identical(first_formal, "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` reuses the `$` accessor, so both operators resolve members the same way.
#' @export
`[[.Element` <- `$.Element`
|
|||
|
|
#' XML extraction result
#'
#' Contains extracted text content from XML files along with
#' structural statistics about the XML document.
#' @field content Extracted text content (XML structure filtered out)
#' @field element_count Total number of XML elements processed
#' @field unique_elements List of unique element names found (sorted)
#' @export
XmlExtractionResult <- new.env(parent = emptyenv())

# Member access for XmlExtractionResult objects: fetch `name` from the
# XmlExtractionResult environment. An entry whose first formal is `self` is
# returned with this object pre-bound; any other entry is returned unchanged.
#' @export
`$.XmlExtractionResult` <- function(self, name) {
  member <- XmlExtractionResult[[name]]
  first_formal <- names(formals(member))[1]
  if (!identical(first_formal, "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` reuses the `$` accessor, so both operators resolve members the same way.
#' @export
`[[.XmlExtractionResult` <- `$.XmlExtractionResult`
|
|||
|
|
#' Email attachment representation
#'
#' Contains metadata and optionally the content of an email attachment.
#' @field name Attachment name (from Content-Disposition header)
#' @field filename Filename of the attachment
#' @field mime_type MIME type of the attachment
#' @field size Size in bytes
#' @field is_image Whether this attachment is an image
#' @field data Attachment data (if extracted). Uses `bytes::Bytes` for cheap cloning of large buffers.
#' @export
EmailAttachment <- new.env(parent = emptyenv())

# Member access for EmailAttachment objects: fetch `name` from the
# EmailAttachment environment. An entry whose first formal is `self` is
# returned with this object pre-bound; any other entry is returned unchanged.
#' @export
`$.EmailAttachment` <- function(self, name) {
  member <- EmailAttachment[[name]]
  first_formal <- names(formals(member))[1]
  if (!identical(first_formal, "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` reuses the `$` accessor, so both operators resolve members the same way.
#' @export
`[[.EmailAttachment` <- `$.EmailAttachment`
|
|||
|
|
#' Bounding box for an OCR-detected table in pixel coordinates
#' @field left Left x-coordinate (pixels)
#' @field top Top y-coordinate (pixels)
#' @field right Right x-coordinate (pixels)
#' @field bottom Bottom y-coordinate (pixels)
#' @export
OcrTableBoundingBox <- new.env(parent = emptyenv())

# Member access for OcrTableBoundingBox objects: fetch `name` from the
# OcrTableBoundingBox environment. An entry whose first formal is `self` is
# returned with this object pre-bound; any other entry is returned unchanged.
#' @export
`$.OcrTableBoundingBox` <- function(self, name) {
  member <- OcrTableBoundingBox[[name]]
  first_formal <- names(formals(member))[1]
  if (!identical(first_formal, "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` reuses the `$` accessor, so both operators resolve members the same way.
#' @export
`[[.OcrTableBoundingBox` <- `$.OcrTableBoundingBox`
|
|||
|
|
#' Image preprocessing configuration for OCR
|
|||
|
|
#'
|
|||
|
|
#' These settings control how images are preprocessed before OCR to improve
|
|||
|
|
#' text recognition quality. Different preprocessing strategies work better
|
|||
|
|
#' for different document types.
|
|||
|
|
#' @field target_dpi Target DPI for the image (300 is standard, 600 for small text).
|
|||
|
|
#' @field auto_rotate Auto-detect and correct image rotation.
|
|||
|
|
#' @field deskew Correct skew (tilted images).
|
|||
|
|
#' @field denoise Remove noise from the image.
|
|||
|
|
#' @field contrast_enhance Enhance contrast for better text visibility.
|
|||
|
|
#' @field binarization_method Binarization method: "otsu", "sauvola", "adaptive".
|
|||
|
|
#' @field invert_colors Invert colors (white text on black → black on white).
|
|||
|
|
#' @export
|
|||
|
|
ImagePreprocessingConfig <- new.env(parent = emptyenv())

# Static member: forwards to the native `wrap__ImagePreprocessingConfig__default`
# routine.
ImagePreprocessingConfig$default <- function() {
  .Call("wrap__ImagePreprocessingConfig__default", PACKAGE = "kreuzberg")
}

# Static member: hands `json` to the native
# `wrap__ImagePreprocessingConfig__from_json` routine.
ImagePreprocessingConfig$from_json <- function(json) .Call(
  "wrap__ImagePreprocessingConfig__from_json",
  json,
  PACKAGE = "kreuzberg"
)

# S3 accessor: resolves `name` in the ImagePreprocessingConfig method table.
# Members whose first formal is `self` come back bound to the receiver;
# anything else (including a NULL lookup miss) is returned unchanged.
#' @export
`$.ImagePreprocessingConfig` <- function(self, name) {
  member <- ImagePreprocessingConfig[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` shares the `$` implementation.
#' @export
`[[.ImagePreprocessingConfig` <- `$.ImagePreprocessingConfig`
|
|||
|
|
#' Tesseract OCR configuration
|
|||
|
|
#'
|
|||
|
|
#' Provides fine-grained control over Tesseract OCR engine parameters.
|
|||
|
|
#' Most users can use the defaults, but these settings allow optimization
|
|||
|
|
#' for specific document types (invoices, handwriting, etc.).
|
|||
|
|
#' @field language Language code (e.g., "eng", "deu", "fra")
|
|||
|
|
#' @field psm Page Segmentation Mode (0-13).
|
|||
|
|
#' @field output_format Output format ("text" or "markdown")
|
|||
|
|
#' @field oem OCR Engine Mode (0-3).
|
|||
|
|
#' @field min_confidence Minimum confidence threshold (0.0-100.0).
|
|||
|
|
#' @field preprocessing Image preprocessing configuration.
|
|||
|
|
#' @field enable_table_detection Enable automatic table detection and reconstruction
|
|||
|
|
#' @field table_min_confidence Minimum confidence threshold for table detection (0.0-1.0)
|
|||
|
|
#' @field table_column_threshold Column threshold for table detection (pixels)
|
|||
|
|
#' @field table_row_threshold_ratio Row threshold ratio for table detection (0.0-1.0)
|
|||
|
|
#' @field use_cache Enable OCR result caching
|
|||
|
|
#' @field classify_use_pre_adapted_templates Use pre-adapted templates for character classification
|
|||
|
|
#' @field language_model_ngram_on Enable N-gram language model
|
|||
|
|
#' @field tessedit_dont_blkrej_good_wds Don't reject good words during block-level processing
|
|||
|
|
#' @field tessedit_dont_rowrej_good_wds Don't reject good words during row-level processing
|
|||
|
|
#' @field tessedit_enable_dict_correction Enable dictionary correction
|
|||
|
|
#' @field tessedit_char_whitelist Whitelist of allowed characters (empty = all allowed)
|
|||
|
|
#' @field tessedit_char_blacklist Blacklist of forbidden characters (empty = none forbidden)
|
|||
|
|
#' @field tessedit_use_primary_params_model Use primary language params model
|
|||
|
|
#' @field textord_space_size_is_variable Variable-width space detection
|
|||
|
|
#' @field thresholding_method Use adaptive thresholding method
|
|||
|
|
#' @export
|
|||
|
|
TesseractConfig <- new.env(parent = emptyenv())

# Static member: forwards to the native `wrap__TesseractConfig__default` routine.
TesseractConfig$default <- function() {
  .Call("wrap__TesseractConfig__default", PACKAGE = "kreuzberg")
}

# Static member: hands `json` to the native `wrap__TesseractConfig__from_json`
# routine.
TesseractConfig$from_json <- function(json) .Call(
  "wrap__TesseractConfig__from_json",
  json,
  PACKAGE = "kreuzberg"
)

# S3 accessor: resolves `name` in the TesseractConfig method table.
# Members whose first formal is `self` come back bound to the receiver;
# anything else (including a NULL lookup miss) is returned unchanged.
#' @export
`$.TesseractConfig` <- function(self, name) {
  member <- TesseractConfig[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` shares the `$` implementation.
#' @export
`[[.TesseractConfig` <- `$.TesseractConfig`
|
|||
|
|
#' Image preprocessing metadata
|
|||
|
|
#'
|
|||
|
|
#' Tracks the transformations applied to an image during OCR preprocessing,
|
|||
|
|
#' including DPI normalization, resizing, and resampling.
|
|||
|
|
#' @field original_dimensions Original image dimensions (width, height) in pixels
|
|||
|
|
#' @field original_dpi Original image DPI (horizontal, vertical)
|
|||
|
|
#' @field target_dpi Target DPI from configuration
|
|||
|
|
#' @field scale_factor Scaling factor applied to the image
|
|||
|
|
#' @field auto_adjusted Whether DPI was auto-adjusted based on content
|
|||
|
|
#' @field final_dpi Final DPI after processing
|
|||
|
|
#' @field new_dimensions New dimensions after resizing (if resized)
|
|||
|
|
#' @field resample_method Resampling algorithm used ("LANCZOS3", "CATMULLROM", etc.)
|
|||
|
|
#' @field dimension_clamped Whether dimensions were clamped to max_image_dimension
|
|||
|
|
#' @field calculated_dpi Calculated optimal DPI (if auto_adjust_dpi enabled)
|
|||
|
|
#' @field skipped_resize Whether resize was skipped (dimensions already optimal)
|
|||
|
|
#' @field resize_error Error message if resize failed
|
|||
|
|
#' @export
|
|||
|
|
ImagePreprocessingMetadata <- new.env(parent = emptyenv())

# S3 accessor: resolves `name` in the ImagePreprocessingMetadata method table.
# Members whose first formal is `self` come back bound to the receiver;
# anything else (including a NULL lookup miss) is returned unchanged.
#' @export
`$.ImagePreprocessingMetadata` <- function(self, name) {
  member <- ImagePreprocessingMetadata[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` shares the `$` implementation.
#' @export
`[[.ImagePreprocessingMetadata` <- `$.ImagePreprocessingMetadata`
|
|||
|
|
#' Extraction result metadata
|
|||
|
|
#'
|
|||
|
|
#' Contains common fields applicable to all formats, format-specific metadata
|
|||
|
|
#' via a discriminated union, and additional custom fields from postprocessors.
|
|||
|
|
#' @field title Document title
|
|||
|
|
#' @field subject Document subject or description
|
|||
|
|
#' @field authors Primary author(s) - always Vec for consistency
|
|||
|
|
#' @field keywords Keywords/tags - always Vec for consistency
|
|||
|
|
#' @field language Primary language (ISO 639 code)
|
|||
|
|
#' @field created_at Creation timestamp (ISO 8601 format)
|
|||
|
|
#' @field modified_at Last modification timestamp (ISO 8601 format)
|
|||
|
|
#' @field created_by User who created the document
|
|||
|
|
#' @field modified_by User who last modified the document
|
|||
|
|
#' @field pages Page/slide/sheet structure with boundaries
|
|||
|
|
#' @field format Format-specific metadata (discriminated union)
|
|||
|
|
#' @field image_preprocessing Image preprocessing metadata (when OCR preprocessing was applied)
|
|||
|
|
#' @field json_schema JSON schema (for structured data extraction)
|
|||
|
|
#' @field error Error metadata (for batch operations)
|
|||
|
|
#' @field extraction_duration_ms Extraction duration in milliseconds (for benchmarking).
|
|||
|
|
#' @field category Document category (from frontmatter or classification).
|
|||
|
|
#' @field tags Document tags (from frontmatter).
|
|||
|
|
#' @field document_version Document version string (from frontmatter).
|
|||
|
|
#' @field abstract_text Abstract or summary text (from frontmatter).
|
|||
|
|
#' @field output_format Output format identifier (e.g., "markdown", "html", "text").
|
|||
|
|
#' @field ocr_used Whether OCR was used during extraction.
|
|||
|
|
#' @field additional Additional custom fields from postprocessors.
|
|||
|
|
#' @export
|
|||
|
|
Metadata <- new.env(parent = emptyenv())

# Instance method: passes the receiver to the native `wrap__Metadata__is_empty`
# routine.
Metadata$is_empty <- function(self) {
  .Call("wrap__Metadata__is_empty", self, PACKAGE = "kreuzberg")
}

# Static member: hands `json` to the native `wrap__Metadata__from_json` routine.
Metadata$from_json <- function(json) .Call(
  "wrap__Metadata__from_json",
  json,
  PACKAGE = "kreuzberg"
)

# S3 accessor: resolves `name` in the Metadata method table.
# Members whose first formal is `self` come back bound to the receiver;
# anything else (including a NULL lookup miss) is returned unchanged.
#' @export
`$.Metadata` <- function(self, name) {
  member <- Metadata[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` shares the `$` implementation.
#' @export
`[[.Metadata` <- `$.Metadata`

# S3 method form of `is_empty`: delegates to the receiver's `$is_empty` member,
# forwarding any extra arguments.
#' @export
is_empty.Metadata <- function(x, ...) {
  x$is_empty(...)
}
|
|||
|
|
#' Excel/spreadsheet format metadata
|
|||
|
|
#'
|
|||
|
|
#' Identifies the document as a spreadsheet source via the `FormatMetadata::Excel`
|
|||
|
|
#' discriminant. Sheet count and sheet names are stored inside this struct.
|
|||
|
|
#' @field sheet_count Number of sheets in the workbook.
|
|||
|
|
#' @field sheet_names Names of all sheets in the workbook.
|
|||
|
|
#' @export
|
|||
|
|
ExcelMetadata <- new.env(parent = emptyenv())

# Static member: hands `json` to the native `wrap__ExcelMetadata__from_json`
# routine.
ExcelMetadata$from_json <- function(json) .Call(
  "wrap__ExcelMetadata__from_json",
  json,
  PACKAGE = "kreuzberg"
)

# S3 accessor: resolves `name` in the ExcelMetadata method table.
# Members whose first formal is `self` come back bound to the receiver;
# anything else (including a NULL lookup miss) is returned unchanged.
#' @export
`$.ExcelMetadata` <- function(self, name) {
  member <- ExcelMetadata[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` shares the `$` implementation.
#' @export
`[[.ExcelMetadata` <- `$.ExcelMetadata`
|
|||
|
|
#' Email metadata extracted from .eml and .msg files
|
|||
|
|
#'
|
|||
|
|
#' Includes sender/recipient information, message ID, and attachment list.
|
|||
|
|
#' @field from_email Sender's email address
|
|||
|
|
#' @field from_name Sender's display name
|
|||
|
|
#' @field to_emails Primary recipients
|
|||
|
|
#' @field cc_emails CC recipients
|
|||
|
|
#' @field bcc_emails BCC recipients
|
|||
|
|
#' @field message_id Message-ID header value
|
|||
|
|
#' @field attachments List of attachment filenames
|
|||
|
|
#' @export
|
|||
|
|
EmailMetadata <- new.env(parent = emptyenv())

# Static member: hands `json` to the native `wrap__EmailMetadata__from_json`
# routine.
EmailMetadata$from_json <- function(json) .Call(
  "wrap__EmailMetadata__from_json",
  json,
  PACKAGE = "kreuzberg"
)

# S3 accessor: resolves `name` in the EmailMetadata method table.
# Members whose first formal is `self` come back bound to the receiver;
# anything else (including a NULL lookup miss) is returned unchanged.
#' @export
`$.EmailMetadata` <- function(self, name) {
  member <- EmailMetadata[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` shares the `$` implementation.
#' @export
`[[.EmailMetadata` <- `$.EmailMetadata`
|
|||
|
|
#' Archive (ZIP/TAR/7Z) metadata
|
|||
|
|
#'
|
|||
|
|
#' Extracted from compressed archive files containing file lists and size information.
|
|||
|
|
#' @field format Archive format ("ZIP", "TAR", "7Z", etc.)
|
|||
|
|
#' @field file_count Total number of files in the archive
|
|||
|
|
#' @field file_list List of file paths within the archive
|
|||
|
|
#' @field total_size Total uncompressed size in bytes
|
|||
|
|
#' @field compressed_size Compressed size in bytes (if available)
|
|||
|
|
#' @export
|
|||
|
|
ArchiveMetadata <- new.env(parent = emptyenv())

# Static member: hands `json` to the native `wrap__ArchiveMetadata__from_json`
# routine.
ArchiveMetadata$from_json <- function(json) .Call(
  "wrap__ArchiveMetadata__from_json",
  json,
  PACKAGE = "kreuzberg"
)

# S3 accessor: resolves `name` in the ArchiveMetadata method table.
# Members whose first formal is `self` come back bound to the receiver;
# anything else (including a NULL lookup miss) is returned unchanged.
#' @export
`$.ArchiveMetadata` <- function(self, name) {
  member <- ArchiveMetadata[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` shares the `$` implementation.
#' @export
`[[.ArchiveMetadata` <- `$.ArchiveMetadata`
|
|||
|
|
#' Image metadata extracted from image files
|
|||
|
|
#'
|
|||
|
|
#' Includes dimensions, format, and EXIF data.
|
|||
|
|
#' @field width Image width in pixels
|
|||
|
|
#' @field height Image height in pixels
|
|||
|
|
#' @field format Image format (e.g., "PNG", "JPEG", "TIFF")
|
|||
|
|
#' @field exif EXIF metadata tags
|
|||
|
|
#' @export
|
|||
|
|
ImageMetadata <- new.env(parent = emptyenv())

# Static member: hands `json` to the native `wrap__ImageMetadata__from_json`
# routine.
ImageMetadata$from_json <- function(json) .Call(
  "wrap__ImageMetadata__from_json",
  json,
  PACKAGE = "kreuzberg"
)

# S3 accessor: resolves `name` in the ImageMetadata method table.
# Members whose first formal is `self` come back bound to the receiver;
# anything else (including a NULL lookup miss) is returned unchanged.
#' @export
`$.ImageMetadata` <- function(self, name) {
  member <- ImageMetadata[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` shares the `$` implementation.
#' @export
`[[.ImageMetadata` <- `$.ImageMetadata`
|
|||
|
|
#' XML metadata extracted during XML parsing
|
|||
|
|
#'
|
|||
|
|
#' Provides statistics about XML document structure.
|
|||
|
|
#' @field element_count Total number of XML elements processed
|
|||
|
|
#' @field unique_elements List of unique element tag names (sorted)
|
|||
|
|
#' @export
|
|||
|
|
XmlMetadata <- new.env(parent = emptyenv())

# Static member: hands `json` to the native `wrap__XmlMetadata__from_json`
# routine.
XmlMetadata$from_json <- function(json) .Call(
  "wrap__XmlMetadata__from_json",
  json,
  PACKAGE = "kreuzberg"
)

# S3 accessor: resolves `name` in the XmlMetadata method table.
# Members whose first formal is `self` come back bound to the receiver;
# anything else (including a NULL lookup miss) is returned unchanged.
#' @export
`$.XmlMetadata` <- function(self, name) {
  member <- XmlMetadata[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` shares the `$` implementation.
#' @export
`[[.XmlMetadata` <- `$.XmlMetadata`
|
|||
|
|
#' Header/heading element metadata
|
|||
|
|
#' @field level Header level: 1 (h1) through 6 (h6)
|
|||
|
|
#' @field text Normalized text content of the header
|
|||
|
|
#' @field id HTML id attribute if present
|
|||
|
|
#' @field depth Document tree depth at the header element
|
|||
|
|
#' @field html_offset Byte offset in original HTML document
|
|||
|
|
#' @export
|
|||
|
|
HeaderMetadata <- new.env(parent = emptyenv())

# S3 accessor: resolves `name` in the HeaderMetadata method table.
# Members whose first formal is `self` come back bound to the receiver;
# anything else (including a NULL lookup miss) is returned unchanged.
#' @export
`$.HeaderMetadata` <- function(self, name) {
  member <- HeaderMetadata[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` shares the `$` implementation.
#' @export
`[[.HeaderMetadata` <- `$.HeaderMetadata`
|
|||
|
|
#' Structured data (Schema.org, microdata, RDFa) block
|
|||
|
|
#' @field data_type Type of structured data
|
|||
|
|
#' @field raw_json Raw JSON string representation
|
|||
|
|
#' @field schema_type Schema type if detectable (e.g., "Article", "Event", "Product")
|
|||
|
|
#' @export
|
|||
|
|
StructuredData <- new.env(parent = emptyenv())

# S3 accessor: resolves `name` in the StructuredData method table.
# Members whose first formal is `self` come back bound to the receiver;
# anything else (including a NULL lookup miss) is returned unchanged.
#' @export
`$.StructuredData` <- function(self, name) {
  member <- StructuredData[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` shares the `$` implementation.
#' @export
`[[.StructuredData` <- `$.StructuredData`
|
|||
|
|
#' OCR processing metadata
|
|||
|
|
#'
|
|||
|
|
#' Captures information about OCR processing configuration and results.
|
|||
|
|
#' @field language OCR language code(s) used
|
|||
|
|
#' @field psm Tesseract Page Segmentation Mode (PSM)
|
|||
|
|
#' @field output_format Output format (e.g., "text", "hocr")
|
|||
|
|
#' @field table_count Number of tables detected
|
|||
|
|
#' @field table_rows table_rows
|
|||
|
|
#' @field table_cols table_cols
|
|||
|
|
#' @export
|
|||
|
|
OcrMetadata <- new.env(parent = emptyenv())

# Static member: hands `json` to the native `wrap__OcrMetadata__from_json`
# routine.
OcrMetadata$from_json <- function(json) .Call(
  "wrap__OcrMetadata__from_json",
  json,
  PACKAGE = "kreuzberg"
)

# S3 accessor: resolves `name` in the OcrMetadata method table.
# Members whose first formal is `self` come back bound to the receiver;
# anything else (including a NULL lookup miss) is returned unchanged.
#' @export
`$.OcrMetadata` <- function(self, name) {
  member <- OcrMetadata[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` shares the `$` implementation.
#' @export
`[[.OcrMetadata` <- `$.OcrMetadata`
|
|||
|
|
#' Error metadata (for batch operations)
|
|||
|
|
#' @field error_type error_type
|
|||
|
|
#' @field message message
|
|||
|
|
#' @export
|
|||
|
|
ErrorMetadata <- new.env(parent = emptyenv())

# S3 accessor: resolves `name` in the ErrorMetadata method table.
# Members whose first formal is `self` come back bound to the receiver;
# anything else (including a NULL lookup miss) is returned unchanged.
#' @export
`$.ErrorMetadata` <- function(self, name) {
  member <- ErrorMetadata[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` shares the `$` implementation.
#' @export
`[[.ErrorMetadata` <- `$.ErrorMetadata`
|
|||
|
|
#' PowerPoint presentation metadata
|
|||
|
|
#'
|
|||
|
|
#' Extracted from PPTX files containing slide counts and presentation details.
|
|||
|
|
#' @field slide_count Total number of slides in the presentation
|
|||
|
|
#' @field slide_names Names of slides (if available)
|
|||
|
|
#' @field image_count Number of embedded images
|
|||
|
|
#' @field table_count Number of tables
|
|||
|
|
#' @export
|
|||
|
|
PptxMetadata <- new.env(parent = emptyenv())

# Static member: hands `json` to the native `wrap__PptxMetadata__from_json`
# routine.
PptxMetadata$from_json <- function(json) .Call(
  "wrap__PptxMetadata__from_json",
  json,
  PACKAGE = "kreuzberg"
)

# S3 accessor: resolves `name` in the PptxMetadata method table.
# Members whose first formal is `self` come back bound to the receiver;
# anything else (including a NULL lookup miss) is returned unchanged.
#' @export
`$.PptxMetadata` <- function(self, name) {
  member <- PptxMetadata[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` shares the `$` implementation.
#' @export
`[[.PptxMetadata` <- `$.PptxMetadata`
|
|||
|
|
#' Word document metadata
|
|||
|
|
#'
|
|||
|
|
#' Extracted from DOCX files using shared Office Open XML metadata extraction.
|
|||
|
|
#' Integrates with `office_metadata` module for core/app/custom properties.
|
|||
|
|
#' @field core_properties Core properties from docProps/core.xml (Dublin Core metadata)
|
|||
|
|
#' @field app_properties Application properties from docProps/app.xml (Word-specific statistics)
|
|||
|
|
#' @field custom_properties Custom properties from docProps/custom.xml (user-defined properties)
|
|||
|
|
#' @export
|
|||
|
|
DocxMetadata <- new.env(parent = emptyenv())

# Static member: hands `json` to the native `wrap__DocxMetadata__from_json`
# routine.
DocxMetadata$from_json <- function(json) .Call(
  "wrap__DocxMetadata__from_json",
  json,
  PACKAGE = "kreuzberg"
)

# S3 accessor: resolves `name` in the DocxMetadata method table.
# Members whose first formal is `self` come back bound to the receiver;
# anything else (including a NULL lookup miss) is returned unchanged.
#' @export
`$.DocxMetadata` <- function(self, name) {
  member <- DocxMetadata[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` shares the `$` implementation.
#' @export
`[[.DocxMetadata` <- `$.DocxMetadata`
|
|||
|
|
#' CSV/TSV file metadata
|
|||
|
|
#' @field row_count row_count
|
|||
|
|
#' @field column_count column_count
|
|||
|
|
#' @field delimiter delimiter
|
|||
|
|
#' @field has_header has_header
|
|||
|
|
#' @field column_types column_types
|
|||
|
|
#' @export
|
|||
|
|
CsvMetadata <- new.env(parent = emptyenv())

# Static member: hands `json` to the native `wrap__CsvMetadata__from_json`
# routine.
CsvMetadata$from_json <- function(json) .Call(
  "wrap__CsvMetadata__from_json",
  json,
  PACKAGE = "kreuzberg"
)

# S3 accessor: resolves `name` in the CsvMetadata method table.
# Members whose first formal is `self` come back bound to the receiver;
# anything else (including a NULL lookup miss) is returned unchanged.
#' @export
`$.CsvMetadata` <- function(self, name) {
  member <- CsvMetadata[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` shares the `$` implementation.
#' @export
`[[.CsvMetadata` <- `$.CsvMetadata`
|
|||
|
|
#' BibTeX bibliography metadata
|
|||
|
|
#' @field entry_count Number of entries in the bibliography.
|
|||
|
|
#' @field citation_keys citation_keys
|
|||
|
|
#' @field authors authors
|
|||
|
|
#' @field year_range year_range
|
|||
|
|
#' @field entry_types entry_types
|
|||
|
|
#' @export
|
|||
|
|
BibtexMetadata <- new.env(parent = emptyenv())

# Static member: hands `json` to the native `wrap__BibtexMetadata__from_json`
# routine.
BibtexMetadata$from_json <- function(json) .Call(
  "wrap__BibtexMetadata__from_json",
  json,
  PACKAGE = "kreuzberg"
)

# S3 accessor: resolves `name` in the BibtexMetadata method table.
# Members whose first formal is `self` come back bound to the receiver;
# anything else (including a NULL lookup miss) is returned unchanged.
#' @export
`$.BibtexMetadata` <- function(self, name) {
  member <- BibtexMetadata[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` shares the `$` implementation.
#' @export
`[[.BibtexMetadata` <- `$.BibtexMetadata`
|
|||
|
|
#' Citation file metadata (RIS, PubMed, EndNote)
|
|||
|
|
#' @field citation_count citation_count
|
|||
|
|
#' @field format format
|
|||
|
|
#' @field authors authors
|
|||
|
|
#' @field year_range year_range
|
|||
|
|
#' @field dois dois
|
|||
|
|
#' @field keywords keywords
|
|||
|
|
#' @export
|
|||
|
|
CitationMetadata <- new.env(parent = emptyenv())

# Static member: hands `json` to the native `wrap__CitationMetadata__from_json`
# routine.
CitationMetadata$from_json <- function(json) .Call(
  "wrap__CitationMetadata__from_json",
  json,
  PACKAGE = "kreuzberg"
)

# S3 accessor: resolves `name` in the CitationMetadata method table.
# Members whose first formal is `self` come back bound to the receiver;
# anything else (including a NULL lookup miss) is returned unchanged.
#' @export
`$.CitationMetadata` <- function(self, name) {
  member <- CitationMetadata[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` shares the `$` implementation.
#' @export
`[[.CitationMetadata` <- `$.CitationMetadata`
|
|||
|
|
#' Year range for bibliographic metadata
|
|||
|
|
#' @field min min
|
|||
|
|
#' @field max max
|
|||
|
|
#' @field years years
|
|||
|
|
#' @export
|
|||
|
|
YearRange <- new.env(parent = emptyenv())

# S3 accessor: resolves `name` in the YearRange method table.
# Members whose first formal is `self` come back bound to the receiver;
# anything else (including a NULL lookup miss) is returned unchanged.
#' @export
`$.YearRange` <- function(self, name) {
  member <- YearRange[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` shares the `$` implementation.
#' @export
`[[.YearRange` <- `$.YearRange`
|
|||
|
|
#' FictionBook (FB2) metadata
|
|||
|
|
#' @field genres genres
|
|||
|
|
#' @field sequences sequences
|
|||
|
|
#' @field annotation annotation
|
|||
|
|
#' @export
|
|||
|
|
FictionBookMetadata <- new.env(parent = emptyenv())

# Static member: hands `json` to the native
# `wrap__FictionBookMetadata__from_json` routine.
FictionBookMetadata$from_json <- function(json) .Call(
  "wrap__FictionBookMetadata__from_json",
  json,
  PACKAGE = "kreuzberg"
)

# S3 accessor: resolves `name` in the FictionBookMetadata method table.
# Members whose first formal is `self` come back bound to the receiver;
# anything else (including a NULL lookup miss) is returned unchanged.
#' @export
`$.FictionBookMetadata` <- function(self, name) {
  member <- FictionBookMetadata[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` shares the `$` implementation.
#' @export
`[[.FictionBookMetadata` <- `$.FictionBookMetadata`
|
|||
|
|
#' DBASE field information
|
|||
|
|
#' @field name name
|
|||
|
|
#' @field field_type field_type
|
|||
|
|
#' @export
|
|||
|
|
DbfFieldInfo <- new.env(parent = emptyenv())

# S3 accessor: resolves `name` in the DbfFieldInfo method table.
# Members whose first formal is `self` come back bound to the receiver;
# anything else (including a NULL lookup miss) is returned unchanged.
#' @export
`$.DbfFieldInfo` <- function(self, name) {
  member <- DbfFieldInfo[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` shares the `$` implementation.
#' @export
`[[.DbfFieldInfo` <- `$.DbfFieldInfo`
|
|||
|
|
#' JATS contributor with role
|
|||
|
|
#' @field name name
|
|||
|
|
#' @field role role
|
|||
|
|
#' @export
|
|||
|
|
ContributorRole <- new.env(parent = emptyenv())

# S3 accessor: resolves `name` in the ContributorRole method table.
# Members whose first formal is `self` come back bound to the receiver;
# anything else (including a NULL lookup miss) is returned unchanged.
#' @export
`$.ContributorRole` <- function(self, name) {
  member <- ContributorRole[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` shares the `$` implementation.
#' @export
`[[.ContributorRole` <- `$.ContributorRole`
|
|||
|
|
#' EPUB metadata (Dublin Core extensions)
|
|||
|
|
#' @field coverage coverage
|
|||
|
|
#' @field dc_format dc_format
|
|||
|
|
#' @field relation relation
|
|||
|
|
#' @field source source
|
|||
|
|
#' @field dc_type dc_type
|
|||
|
|
#' @field cover_image cover_image
|
|||
|
|
#' @export
|
|||
|
|
EpubMetadata <- new.env(parent = emptyenv())

# Static member: hands `json` to the native `wrap__EpubMetadata__from_json`
# routine.
EpubMetadata$from_json <- function(json) .Call(
  "wrap__EpubMetadata__from_json",
  json,
  PACKAGE = "kreuzberg"
)

# S3 accessor: resolves `name` in the EpubMetadata method table.
# Members whose first formal is `self` come back bound to the receiver;
# anything else (including a NULL lookup miss) is returned unchanged.
#' @export
`$.EpubMetadata` <- function(self, name) {
  member <- EpubMetadata[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` shares the `$` implementation.
#' @export
`[[.EpubMetadata` <- `$.EpubMetadata`
|
|||
|
|
#' Outlook PST archive metadata
|
|||
|
|
#' @field message_count message_count
|
|||
|
|
#' @export
|
|||
|
|
PstMetadata <- new.env(parent = emptyenv())

# Static member: hands `json` to the native `wrap__PstMetadata__from_json`
# routine.
PstMetadata$from_json <- function(json) .Call(
  "wrap__PstMetadata__from_json",
  json,
  PACKAGE = "kreuzberg"
)

# S3 accessor: resolves `name` in the PstMetadata method table.
# Members whose first formal is `self` come back bound to the receiver;
# anything else (including a NULL lookup miss) is returned unchanged.
#' @export
`$.PstMetadata` <- function(self, name) {
  member <- PstMetadata[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` shares the `$` implementation.
#' @export
`[[.PstMetadata` <- `$.PstMetadata`
|
|||
|
|
#' Confidence scores for an OCR element
|
|||
|
|
#'
|
|||
|
|
#' Separates detection confidence (how confident that text exists at this location)
|
|||
|
|
#' from recognition confidence (how confident about the actual text content).
|
|||
|
|
#' @field detection Detection confidence: how confident the OCR engine is that text exists here.
|
|||
|
|
#' @field recognition Recognition confidence: how confident about the text content.
|
|||
|
|
#' @export
|
|||
|
|
OcrConfidence <- new.env(parent = emptyenv())

# Static member: hands `json` to the native `wrap__OcrConfidence__from_json`
# routine.
OcrConfidence$from_json <- function(json) .Call(
  "wrap__OcrConfidence__from_json",
  json,
  PACKAGE = "kreuzberg"
)

# S3 accessor: resolves `name` in the OcrConfidence method table.
# Members whose first formal is `self` come back bound to the receiver;
# anything else (including a NULL lookup miss) is returned unchanged.
#' @export
`$.OcrConfidence` <- function(self, name) {
  member <- OcrConfidence[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` shares the `$` implementation.
#' @export
`[[.OcrConfidence` <- `$.OcrConfidence`
|
|||
|
|
#' Rotation information for an OCR element
|
|||
|
|
#' @field angle_degrees Rotation angle in degrees (0, 90, 180, 270 for PaddleOCR).
|
|||
|
|
#' @field confidence Confidence score for the rotation detection.
|
|||
|
|
#' @export
|
|||
|
|
OcrRotation <- new.env(parent = emptyenv())

# S3 accessor: resolves `name` in the OcrRotation method table.
# Members whose first formal is `self` come back bound to the receiver;
# anything else (including a NULL lookup miss) is returned unchanged.
#' @export
`$.OcrRotation` <- function(self, name) {
  member <- OcrRotation[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` shares the `$` implementation.
#' @export
`[[.OcrRotation` <- `$.OcrRotation`
|
|||
|
|
#' A unified OCR element representing detected text with full metadata
|
|||
|
|
#'
|
|||
|
|
#' This is the primary type for structured OCR output, preserving all information
|
|||
|
|
#' from both Tesseract and PaddleOCR backends.
|
|||
|
|
#' @field text The recognized text content.
|
|||
|
|
#' @field geometry Bounding geometry (rectangle or quadrilateral).
|
|||
|
|
#' @field confidence Confidence scores for detection and recognition.
|
|||
|
|
#' @field level Hierarchical level (word, line, block, page).
|
|||
|
|
#' @field rotation Rotation information (if detected).
|
|||
|
|
#' @field page_number Page number (1-indexed).
|
|||
|
|
#' @field parent_id Parent element ID for hierarchical relationships.
|
|||
|
|
#' @field backend_metadata Backend-specific metadata that doesn't fit the unified schema.
|
|||
|
|
#' @export
|
|||
|
|
OcrElement <- new.env(parent = emptyenv())

# Static member: hands `json` to the native `wrap__OcrElement__from_json`
# routine.
OcrElement$from_json <- function(json) .Call(
  "wrap__OcrElement__from_json",
  json,
  PACKAGE = "kreuzberg"
)

# S3 accessor: resolves `name` in the OcrElement method table.
# Members whose first formal is `self` come back bound to the receiver;
# anything else (including a NULL lookup miss) is returned unchanged.
#' @export
`$.OcrElement` <- function(self, name) {
  member <- OcrElement[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` shares the `$` implementation.
#' @export
`[[.OcrElement` <- `$.OcrElement`
|
|||
|
|
#' Configuration for OCR element extraction
|
|||
|
|
#'
|
|||
|
|
#' Controls how OCR elements are extracted and filtered.
|
|||
|
|
#' @field include_elements Whether to include OCR elements in the extraction result.
|
|||
|
|
#' @field min_level Minimum hierarchical level to include.
|
|||
|
|
#' @field min_confidence Minimum recognition confidence threshold (0.0-1.0).
|
|||
|
|
#' @field build_hierarchy Whether to build hierarchical relationships between elements.
|
|||
|
|
#' @export
|
|||
|
|
OcrElementConfig <- new.env(parent = emptyenv())

# Static member: hands `json` to the native `wrap__OcrElementConfig__from_json`
# routine.
OcrElementConfig$from_json <- function(json) .Call(
  "wrap__OcrElementConfig__from_json",
  json,
  PACKAGE = "kreuzberg"
)

# S3 accessor: resolves `name` in the OcrElementConfig method table.
# Members whose first formal is `self` come back bound to the receiver;
# anything else (including a NULL lookup miss) is returned unchanged.
#' @export
`$.OcrElementConfig` <- function(self, name) {
  member <- OcrElementConfig[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` shares the `$` implementation.
#' @export
`[[.OcrElementConfig` <- `$.OcrElementConfig`
|
|||
|
|
#' Byte offset boundary for a page
|
|||
|
|
#'
|
|||
|
|
#' Tracks where a specific page's content starts and ends in the main content string,
|
|||
|
|
#' enabling mapping from byte positions to page numbers. Offsets are guaranteed to be
|
|||
|
|
#' at valid UTF-8 character boundaries when using standard String methods (push_str, push, etc.).
|
|||
|
|
#' @field byte_start Byte offset where this page starts in the content string (UTF-8 valid boundary, inclusive)
|
|||
|
|
#' @field byte_end Byte offset where this page ends in the content string (UTF-8 valid boundary, exclusive)
|
|||
|
|
#' @field page_number Page number (1-indexed)
|
|||
|
|
#' @export
|
|||
|
|
PageBoundary <- new.env(parent = emptyenv())

# S3 accessor: resolves `name` in the PageBoundary method table.
# Members whose first formal is `self` come back bound to the receiver;
# anything else (including a NULL lookup miss) is returned unchanged.
#' @export
`$.PageBoundary` <- function(self, name) {
  member <- PageBoundary[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` shares the `$` implementation.
#' @export
`[[.PageBoundary` <- `$.PageBoundary`
|
|||
|
|
#' Metadata for individual page/slide/sheet
|
|||
|
|
#'
|
|||
|
|
#' Captures per-page information including dimensions, content counts,
|
|||
|
|
#' and visibility state (for presentations).
|
|||
|
|
#' @field number Page number (1-indexed)
|
|||
|
|
#' @field title Page title (usually for presentations)
|
|||
|
|
#' @field dimensions Dimensions in points (PDF) or pixels (images): (width, height)
|
|||
|
|
#' @field image_count Number of images on this page
|
|||
|
|
#' @field table_count Number of tables on this page
|
|||
|
|
#' @field hidden Whether this page is hidden (e.g., in presentations)
|
|||
|
|
#' @field is_blank Whether this page is blank (no meaningful text, no images, no tables)
|
|||
|
|
#' @field has_vector_graphics Whether this page contains non-trivial vector graphics (paths, shapes, curves)
|
|||
|
|
#' @export
|
|||
|
|
PageInfo <- new.env(parent = emptyenv())

# S3 accessor: resolves `name` in the PageInfo method table.
# Members whose first formal is `self` come back bound to the receiver;
# anything else (including a NULL lookup miss) is returned unchanged.
#' @export
`$.PageInfo` <- function(self, name) {
  member <- PageInfo[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` shares the `$` implementation.
#' @export
`[[.PageInfo` <- `$.PageInfo`
|
|||
|
|
#' A detected layout region on a page
|
|||
|
|
#'
|
|||
|
|
#' When layout detection is enabled, each page may have layout regions
|
|||
|
|
#' identifying different content types (text, pictures, tables, etc.)
|
|||
|
|
#' with confidence scores and spatial positions.
|
|||
|
|
#' @field class_name Layout class name (e.g. "picture", "table", "text", "section_header").
|
|||
|
|
#' @field confidence Confidence score from the layout detection model (0.0 to 1.0).
|
|||
|
|
#' @field bounding_box Bounding box in document coordinate space.
|
|||
|
|
#' @field area_fraction Fraction of the page area covered by this region (0.0 to 1.0).
|
|||
|
|
#' @export
|
|||
|
|
LayoutRegion <- new.env(parent = emptyenv())

# Static member: hands `json` to the native `wrap__LayoutRegion__from_json`
# routine.
LayoutRegion$from_json <- function(json) .Call(
  "wrap__LayoutRegion__from_json",
  json,
  PACKAGE = "kreuzberg"
)

# S3 accessor: resolves `name` in the LayoutRegion method table.
# Members whose first formal is `self` come back bound to the receiver;
# anything else (including a NULL lookup miss) is returned unchanged.
#' @export
`$.LayoutRegion` <- function(self, name) {
  member <- LayoutRegion[[name]]
  takes_self <- identical(names(formals(member))[1], "self")
  if (!takes_self) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` shares the `$` implementation.
#' @export
`[[.LayoutRegion` <- `$.LayoutRegion`
|
|||
|
|
#' Text block with an assigned hierarchy level
#'
#' A block of text together with the semantic heading information derived from
#' font size clustering and hierarchical analysis.
#' @field text The text content of this block
#' @field font_size The font size of the text in this block
#' @field level The hierarchy level of this block (H1-H6 or Body)
#' @field bbox Bounding box information for the block
#' @export
HierarchicalBlock <- new.env(parent = emptyenv())

# Resolve `name` in the HierarchicalBlock method table. A member whose first
# formal is `self` comes back as a closure bound to this handle; any other
# member is handed back unchanged.
#' @export
`$.HierarchicalBlock` <- function(self, name) {
  member <- HierarchicalBlock[[name]]
  first_arg <- names(formals(member))[1]
  if (!identical(first_arg, "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` indexing shares the `$` dispatcher.
#' @export
`[[.HierarchicalBlock` <- `$.HierarchicalBlock`
|
|||
|
|
#' One changed cell inside a table
#'
#' Declared here (and not solely in `crate::diff`) so that `RevisionDelta` can
#' refer to it unconditionally, without needing the `diff` Cargo feature.
#' `crate::diff` re-exports this type verbatim.
#' @field row Zero-based row index.
#' @field col Zero-based column index.
#' @field from Value before the change.
#' @field to Value after the change.
#' @export
CellChange <- new.env(parent = emptyenv())

# Resolve `name` in the CellChange method table. A member whose first formal is
# `self` comes back as a closure bound to this handle; any other member is
# handed back unchanged.
#' @export
`$.CellChange` <- function(self, name) {
  member <- CellChange[[name]]
  first_arg <- names(formals(member))[1]
  if (!identical(first_arg, "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` indexing shares the `$` dispatcher.
#' @export
`[[.CellChange` <- `$.CellChange`
|
|||
|
|
#' One tracked change embedded in a document
#'
#' Filled in by per-format extractors that understand change-tracking metadata
#' (DOCX `w:ins`/`w:del`/`w:rPrChange`, ODT `text:change-*`, …). Every
#' extractor defaults to `ExtractionResult.revisions = None` until a
#' format-specific implementation is added.
#' @field revision_id Format-specific revision identifier.
#' @field author Display name of the author who made this change, when available.
#' @field timestamp ISO-8601 timestamp of the change, when available.
#' @field kind Semantic kind of this revision.
#' @field anchor Best-effort document location for this revision.
#' @field delta The content changes that make up this revision.
#' @export
DocumentRevision <- new.env(parent = emptyenv())

# Resolve `name` in the DocumentRevision method table. A member whose first
# formal is `self` comes back as a closure bound to this handle; any other
# member is handed back unchanged.
#' @export
`$.DocumentRevision` <- function(self, name) {
  member <- DocumentRevision[[name]]
  first_arg <- names(formals(member))[1]
  if (!identical(first_arg, "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` indexing shares the `$` dispatcher.
#' @export
`[[.DocumentRevision` <- `$.DocumentRevision`
|
|||
|
|
#' Single table cell with content and optional styling
#'
#' Extension point reserved for rich table support with cell-level metadata.
#' @field content Cell content as text
#' @field row_span Row span (number of rows this cell spans)
#' @field col_span Column span (number of columns this cell spans)
#' @field is_header Whether this is a header cell
#' @export
TableCell <- new.env(parent = emptyenv())

# Resolve `name` in the TableCell method table. A member whose first formal is
# `self` comes back as a closure bound to this handle; any other member is
# handed back unchanged.
#' @export
`$.TableCell` <- function(self, name) {
  member <- TableCell[[name]]
  first_arg <- names(formals(member))[1]
  if (!identical(first_arg, "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` indexing shares the `$` dispatcher.
#' @export
`[[.TableCell` <- `$.TableCell`
|
|||
|
|
#' URI pulled out of a document
#'
#' Stands for any link, reference, or resource pointer found during extraction.
#' The `kind` field classifies the URI semantically, and `label` holds the
#' optional human-readable display text.
#' @field url The URL or path string.
#' @field label Optional display text / label for the link.
#' @field page Optional page number where the URI was found (1-indexed).
#' @field kind Semantic classification of the URI.
#' @export
ExtractedUri <- new.env(parent = emptyenv())

# Resolve `name` in the ExtractedUri method table. A member whose first formal
# is `self` comes back as a closure bound to this handle; any other member is
# handed back unchanged.
#' @export
`$.ExtractedUri` <- function(self, name) {
  member <- ExtractedUri[[name]]
  first_arg <- names(formals(member))[1]
  if (!identical(first_arg, "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` indexing shares the `$` dispatcher.
#' @export
`[[.ExtractedUri` <- `$.ExtractedUri`
|
|||
|
|
#' Response of a MIME type detection
#' @field mime_type Detected MIME type
#' @field filename Original filename (if provided)
#' @export
DetectResponse <- new.env(parent = emptyenv())

# Resolve `name` in the DetectResponse method table. A member whose first
# formal is `self` comes back as a closure bound to this handle; any other
# member is handed back unchanged.
#' @export
`$.DetectResponse` <- function(self, name) {
  member <- DetectResponse[[name]]
  first_arg <- names(formals(member))[1]
  if (!identical(first_arg, "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` indexing shares the `$` dispatcher.
#' @export
`[[.DetectResponse` <- `$.DetectResponse`
|
|||
|
|
#' Options governing the comparison of two `ExtractionResult` values
#' @field include_metadata Include metadata changes in the diff. Default: `true`.
#' @field include_embedded Include embedded-children changes in the diff. Default: `true`.
#' @field max_content_chars Truncate content to this many characters before diffing.
#' @export
DiffOptions <- new.env(parent = emptyenv())

# Calls the native `wrap__DiffOptions__default` routine with no arguments.
DiffOptions$default <- function() {
  .Call("wrap__DiffOptions__default", PACKAGE = "kreuzberg")
}

# Hands `json` to the native `wrap__DiffOptions__from_json` routine.
DiffOptions$from_json <- function(json) .Call(
  "wrap__DiffOptions__from_json", json,
  PACKAGE = "kreuzberg"
)

# Resolve `name` in the DiffOptions method table. A member whose first formal
# is `self` comes back as a closure bound to this handle; any other member is
# handed back unchanged.
#' @export
`$.DiffOptions` <- function(self, name) {
  member <- DiffOptions[[name]]
  first_arg <- names(formals(member))[1]
  if (!identical(first_arg, "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` indexing shares the `$` dispatcher.
#' @export
`[[.DiffOptions` <- `$.DiffOptions`
|
|||
|
|
#' One contiguous hunk of a unified diff
#' @field from_line Starting line number in the old content (0-indexed).
#' @field from_count Number of lines from the old content in this hunk.
#' @field to_line Starting line number in the new content (0-indexed).
#' @field to_count Number of lines from the new content in this hunk.
#' @field lines Lines that make up this hunk.
#' @export
DiffHunk <- new.env(parent = emptyenv())

# Resolve `name` in the DiffHunk method table. A member whose first formal is
# `self` comes back as a closure bound to this handle; any other member is
# handed back unchanged.
#' @export
`$.DiffHunk` <- function(self, name) {
  member <- DiffHunk[[name]]
  first_arg <- names(formals(member))[1]
  if (!identical(first_arg, "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` indexing shares the `$` dispatcher.
#' @export
`[[.DiffHunk` <- `$.DiffHunk`
|
|||
|
|
#' Diff of one embedded archive entry present in both results
#' @field path Archive-relative path identifying this entry.
#' @field diff The recursive diff of the entry's extraction result.
#' @export
EmbeddedDiff <- new.env(parent = emptyenv())

# Resolve `name` in the EmbeddedDiff method table. A member whose first formal
# is `self` comes back as a closure bound to this handle; any other member is
# handed back unchanged.
#' @export
`$.EmbeddedDiff` <- function(self, name) {
  member <- EmbeddedDiff[[name]]
  first_arg <- names(formals(member))[1]
  if (!identical(first_arg, "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` indexing shares the `$` dispatcher.
#' @export
`[[.EmbeddedDiff` <- `$.EmbeddedDiff`
|
|||
|
|
#' Preset configuration for common RAG use cases
#'
#' Every preset bundles a chunk size, an overlap, and an embedding model into
#' a configuration optimized for a specific scenario.
#'
#' All string fields are owned `String` for FFI compatibility — instances
#' are safe to clone and pass across language boundaries.
#' @field name name
#' @field chunk_size chunk_size
#' @field overlap overlap
#' @field model_repo HuggingFace repository name for the model.
#' @field pooling Pooling strategy: "cls" or "mean".
#' @field model_file Path to the ONNX model file within the repo.
#' @field dimensions dimensions
#' @field description description
#' @export
EmbeddingPreset <- new.env(parent = emptyenv())

# Resolve `name` in the EmbeddingPreset method table. A member whose first
# formal is `self` comes back as a closure bound to this handle; any other
# member is handed back unchanged.
#' @export
`$.EmbeddingPreset` <- function(self, name) {
  member <- EmbeddingPreset[[name]]
  first_arg <- names(formals(member))[1]
  if (!identical(first_arg, "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` indexing shares the `$` dispatcher.
#' @export
`[[.EmbeddingPreset` <- `$.EmbeddingPreset`
|
|||
|
|
#' Parameters specific to YAKE
#' @field window_size Window size for co-occurrence analysis (default: 2).
#' @export
YakeParams <- new.env(parent = emptyenv())

# Calls the native `wrap__YakeParams__default` routine with no arguments.
YakeParams$default <- function() {
  .Call("wrap__YakeParams__default", PACKAGE = "kreuzberg")
}

# Hands `json` to the native `wrap__YakeParams__from_json` routine.
YakeParams$from_json <- function(json) .Call(
  "wrap__YakeParams__from_json", json,
  PACKAGE = "kreuzberg"
)

# Resolve `name` in the YakeParams method table. A member whose first formal is
# `self` comes back as a closure bound to this handle; any other member is
# handed back unchanged.
#' @export
`$.YakeParams` <- function(self, name) {
  member <- YakeParams[[name]]
  first_arg <- names(formals(member))[1]
  if (!identical(first_arg, "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` indexing shares the `$` dispatcher.
#' @export
`[[.YakeParams` <- `$.YakeParams`
|
|||
|
|
#' Parameters specific to RAKE
#' @field min_word_length Minimum word length to consider (default: 1).
#' @field max_words_per_phrase Maximum words in a keyword phrase (default: 3).
#' @export
RakeParams <- new.env(parent = emptyenv())

# Calls the native `wrap__RakeParams__default` routine with no arguments.
RakeParams$default <- function() {
  .Call("wrap__RakeParams__default", PACKAGE = "kreuzberg")
}

# Hands `json` to the native `wrap__RakeParams__from_json` routine.
RakeParams$from_json <- function(json) .Call(
  "wrap__RakeParams__from_json", json,
  PACKAGE = "kreuzberg"
)

# Resolve `name` in the RakeParams method table. A member whose first formal is
# `self` comes back as a closure bound to this handle; any other member is
# handed back unchanged.
#' @export
`$.RakeParams` <- function(self, name) {
  member <- RakeParams[[name]]
  first_arg <- names(formals(member))[1]
  if (!identical(first_arg, "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` indexing shares the `$` dispatcher.
#' @export
`[[.RakeParams` <- `$.RakeParams`
|
|||
|
|
#' Configuration of keyword extraction
#' @field algorithm Algorithm to use for extraction.
#' @field max_keywords Maximum number of keywords to extract (default: 10).
#' @field min_score Minimum score threshold (0.0-1.0, default: 0.0).
#' @field ngram_range N-gram range for keyword extraction (min, max).
#' @field language Language code for stopword filtering (e.g., "en", "de", "fr").
#' @field yake_params YAKE-specific tuning parameters.
#' @field rake_params RAKE-specific tuning parameters.
#' @export
KeywordConfig <- new.env(parent = emptyenv())

# Calls the native `wrap__KeywordConfig__default` routine with no arguments.
KeywordConfig$default <- function() {
  .Call("wrap__KeywordConfig__default", PACKAGE = "kreuzberg")
}

# Hands `json` to the native `wrap__KeywordConfig__from_json` routine.
KeywordConfig$from_json <- function(json) .Call(
  "wrap__KeywordConfig__from_json", json,
  PACKAGE = "kreuzberg"
)

# Resolve `name` in the KeywordConfig method table. A member whose first formal
# is `self` comes back as a closure bound to this handle; any other member is
# handed back unchanged.
#' @export
`$.KeywordConfig` <- function(self, name) {
  member <- KeywordConfig[[name]]
  first_arg <- names(formals(member))[1]
  if (!identical(first_arg, "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` indexing shares the `$` dispatcher.
#' @export
`[[.KeywordConfig` <- `$.KeywordConfig`
|
|||
|
|
#' Keyword produced by extraction, with its metadata
#' @field text The keyword text.
#' @field score Relevance score (higher is better, algorithm-specific range).
#' @field algorithm Algorithm that extracted this keyword.
#' @field positions Optional positions where keyword appears in text (character offsets).
#' @export
Keyword <- new.env(parent = emptyenv())

# Resolve `name` in the Keyword method table. A member whose first formal is
# `self` comes back as a closure bound to this handle; any other member is
# handed back unchanged.
#' @export
`$.Keyword` <- function(self, name) {
  member <- Keyword[[name]]
  first_arg <- names(formals(member))[1]
  if (!identical(first_arg, "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` indexing shares the `$` dispatcher.
#' @export
`[[.Keyword` <- `$.Keyword`
|
|||
|
|
#' PaddleOCR backend configuration
#'
#' Sets up PaddleOCR text detection and recognition with multi-language support.
#' Configuration follows a builder pattern (the `with_*` members below).
#' @field language Language code (e.g., "en", "ch", "jpn", "kor", "deu", "fra")
#' @field cache_dir Optional custom cache directory for model files
#' @field use_angle_cls Enable angle classification for rotated text (default: false). Can misfire on short text
#' @field enable_table_detection Enable table structure detection (default: false)
#' @field det_db_thresh Database threshold for text detection (default: 0.3) Range: 0.0-1.0, higher values require more
#' @field det_db_box_thresh Box threshold for text bounding box refinement (default: 0.5) Range: 0.0-1.0
#' @field det_db_unclip_ratio Unclip ratio for expanding text bounding boxes (default: 1.6) Controls the expansion of
#' @field det_limit_side_len Maximum side length for detection image (default: 960) Larger images may be resized to
#' @field rec_batch_num Batch size for recognition inference (default: 6) Number of text regions to process
#' @field padding Padding in pixels added around the image before detection (default: 10). Large values can include
#' @field drop_score Minimum recognition confidence score for text lines (default: 0.5). Text regions with recognition
#' @field model_tier Model tier controlling detection/recognition model size and accuracy trade-off. - `"mobile"`
#' @export
PaddleOcrConfig <- new.env(parent = emptyenv())

# Builder members: each one forwards `self` plus a single setting to the
# native routine of the same name.
PaddleOcrConfig$with_cache_dir <- function(self, path) {
  .Call(
    "wrap__PaddleOcrConfig__with_cache_dir", self, path,
    PACKAGE = "kreuzberg"
  )
}

PaddleOcrConfig$with_table_detection <- function(self, enable) {
  .Call(
    "wrap__PaddleOcrConfig__with_table_detection", self, enable,
    PACKAGE = "kreuzberg"
  )
}

PaddleOcrConfig$with_angle_cls <- function(self, enable) {
  .Call(
    "wrap__PaddleOcrConfig__with_angle_cls", self, enable,
    PACKAGE = "kreuzberg"
  )
}

PaddleOcrConfig$with_det_db_thresh <- function(self, threshold) {
  .Call(
    "wrap__PaddleOcrConfig__with_det_db_thresh", self, threshold,
    PACKAGE = "kreuzberg"
  )
}

PaddleOcrConfig$with_det_db_box_thresh <- function(self, threshold) {
  .Call(
    "wrap__PaddleOcrConfig__with_det_db_box_thresh", self, threshold,
    PACKAGE = "kreuzberg"
  )
}

PaddleOcrConfig$with_det_db_unclip_ratio <- function(self, ratio) {
  .Call(
    "wrap__PaddleOcrConfig__with_det_db_unclip_ratio", self, ratio,
    PACKAGE = "kreuzberg"
  )
}

PaddleOcrConfig$with_det_limit_side_len <- function(self, length) {
  .Call(
    "wrap__PaddleOcrConfig__with_det_limit_side_len", self, length,
    PACKAGE = "kreuzberg"
  )
}

PaddleOcrConfig$with_rec_batch_num <- function(self, batch_size) {
  .Call(
    "wrap__PaddleOcrConfig__with_rec_batch_num", self, batch_size,
    PACKAGE = "kreuzberg"
  )
}

PaddleOcrConfig$with_drop_score <- function(self, score) {
  .Call(
    "wrap__PaddleOcrConfig__with_drop_score", self, score,
    PACKAGE = "kreuzberg"
  )
}

PaddleOcrConfig$with_padding <- function(self, padding) {
  .Call(
    "wrap__PaddleOcrConfig__with_padding", self, padding,
    PACKAGE = "kreuzberg"
  )
}

PaddleOcrConfig$with_model_tier <- function(self, tier) {
  .Call(
    "wrap__PaddleOcrConfig__with_model_tier", self, tier,
    PACKAGE = "kreuzberg"
  )
}

# Calls the native `wrap__PaddleOcrConfig__default` routine with no arguments.
PaddleOcrConfig$default <- function() {
  .Call("wrap__PaddleOcrConfig__default", PACKAGE = "kreuzberg")
}

# Hands `json` to the native `wrap__PaddleOcrConfig__from_json` routine.
PaddleOcrConfig$from_json <- function(json) .Call(
  "wrap__PaddleOcrConfig__from_json", json,
  PACKAGE = "kreuzberg"
)

# Resolve `name` in the PaddleOcrConfig method table. A member whose first
# formal is `self` comes back as a closure bound to this handle; any other
# member is handed back unchanged.
#' @export
`$.PaddleOcrConfig` <- function(self, name) {
  member <- PaddleOcrConfig[[name]]
  first_arg <- names(formals(member))[1]
  if (!identical(first_arg, "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` indexing shares the `$` dispatcher.
#' @export
`[[.PaddleOcrConfig` <- `$.PaddleOcrConfig`

# S3 methods for PaddleOcrConfig: each forwards `...` to the builder member
# of the same name on `x`.
#' @export
with_cache_dir.PaddleOcrConfig <- function(x, ...) {
  x$with_cache_dir(...)
}

#' @export
with_table_detection.PaddleOcrConfig <- function(x, ...) {
  x$with_table_detection(...)
}

#' @export
with_angle_cls.PaddleOcrConfig <- function(x, ...) {
  x$with_angle_cls(...)
}

#' @export
with_det_db_thresh.PaddleOcrConfig <- function(x, ...) {
  x$with_det_db_thresh(...)
}

#' @export
with_det_db_box_thresh.PaddleOcrConfig <- function(x, ...) {
  x$with_det_db_box_thresh(...)
}

#' @export
with_det_db_unclip_ratio.PaddleOcrConfig <- function(x, ...) {
  x$with_det_db_unclip_ratio(...)
}

#' @export
with_det_limit_side_len.PaddleOcrConfig <- function(x, ...) {
  x$with_det_limit_side_len(...)
}

#' @export
with_rec_batch_num.PaddleOcrConfig <- function(x, ...) {
  x$with_rec_batch_num(...)
}

#' @export
with_drop_score.PaddleOcrConfig <- function(x, ...) {
  x$with_drop_score(...)
}

#' @export
with_padding.PaddleOcrConfig <- function(x, ...) {
  x$with_padding(...)
}

#' @export
with_model_tier.PaddleOcrConfig <- function(x, ...) {
  x$with_model_tier(...)
}
|
|||
|
|
#' Paths of every model required for OCR, combined (backward compatibility)
#' @field det_model Path to the detection model directory.
#' @field cls_model Path to the classification model directory.
#' @field rec_model Path to the recognition model directory.
#' @field dict_file Path to the character dictionary file.
#' @export
ModelPaths <- new.env(parent = emptyenv())

# Resolve `name` in the ModelPaths method table. A member whose first formal is
# `self` comes back as a closure bound to this handle; any other member is
# handed back unchanged.
#' @export
`$.ModelPaths` <- function(self, name) {
  member <- ModelPaths[[name]]
  first_arg <- names(formals(member))[1]
  if (!identical(first_arg, "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` indexing shares the `$` dispatcher.
#' @export
`[[.ModelPaths` <- `$.ModelPaths`
|
|||
|
|
#' Result of document orientation detection
#' @field degrees Detected orientation in degrees (0, 90, 180, or 270).
#' @field confidence Confidence score (0.0-1.0).
#' @export
OrientationResult <- new.env(parent = emptyenv())

# Resolve `name` in the OrientationResult method table. A member whose first
# formal is `self` comes back as a closure bound to this handle; any other
# member is handed back unchanged.
#' @export
`$.OrientationResult` <- function(self, name) {
  member <- OrientationResult[[name]]
  first_arg <- names(formals(member))[1]
  if (!identical(first_arg, "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` indexing shares the `$` dispatcher.
#' @export
`[[.OrientationResult` <- `$.OrientationResult`
|
|||
|
|
#' Bounding box in original image coordinates: (x1, y1) top-left, (x2, y2) bottom-right
#' @field x1 x1
#' @field y1 y1
#' @field x2 x2
#' @field y2 y2
#' @export
BBox <- new.env(parent = emptyenv())

# Resolve `name` in the BBox method table. A member whose first formal is
# `self` comes back as a closure bound to this handle; any other member is
# handed back unchanged.
#' @export
`$.BBox` <- function(self, name) {
  member <- BBox[[name]]
  first_arg <- names(formals(member))[1]
  if (!identical(first_arg, "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` indexing shares the `$` dispatcher.
#' @export
`[[.BBox` <- `$.BBox`
|
|||
|
|
#' One layout detection result
#' @field class_name class_name
#' @field confidence confidence
#' @field bbox bbox
#' @export
LayoutDetection <- new.env(parent = emptyenv())

# Resolve `name` in the LayoutDetection method table. A member whose first
# formal is `self` comes back as a closure bound to this handle; any other
# member is handed back unchanged.
#' @export
`$.LayoutDetection` <- function(self, name) {
  member <- LayoutDetection[[name]]
  first_arg <- names(formals(member))[1]
  if (!identical(first_arg, "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` indexing shares the `$` dispatcher.
#' @export
`[[.LayoutDetection` <- `$.LayoutDetection`
|
|||
|
|
#' Descriptor of an embedded file taken from the PDF name tree
#' @field name The filename as stored in the PDF name tree.
#' @field data Raw file bytes from the embedded stream (already decompressed by lopdf).
#' @field compressed_size Compressed byte count of the original stream (before decompression).
#' @field mime_type MIME type if specified in the filespec, otherwise `None`.
#' @export
EmbeddedFile <- new.env(parent = emptyenv())

# Resolve `name` in the EmbeddedFile method table. A member whose first formal
# is `self` comes back as a closure bound to this handle; any other member is
# handed back unchanged.
#' @export
`$.EmbeddedFile` <- function(self, name) {
  member <- EmbeddedFile[[name]]
  first_arg <- names(formals(member))[1]
  if (!identical(first_arg, "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` indexing shares the `$` dispatcher.
#' @export
`[[.EmbeddedFile` <- `$.EmbeddedFile`
|
|||
|
|
#' Metadata specific to PDF documents
#'
#' Holds the metadata fields that only apply to PDF documents and are absent
#' from the common `Metadata` structure. Common fields such as title, authors,
#' keywords, and dates live at the `Metadata` level.
#' @field pdf_version PDF version (e.g., "1.7", "2.0")
#' @field producer PDF producer (application that created the PDF)
#' @field is_encrypted Whether the PDF is encrypted/password-protected
#' @field width First page width in points (1/72 inch)
#' @field height First page height in points (1/72 inch)
#' @field page_count Total number of pages in the PDF document
#' @export
PdfMetadata <- new.env(parent = emptyenv())

# Hands `json` to the native `wrap__PdfMetadata__from_json` routine.
PdfMetadata$from_json <- function(json) .Call(
  "wrap__PdfMetadata__from_json", json,
  PACKAGE = "kreuzberg"
)

# Resolve `name` in the PdfMetadata method table. A member whose first formal
# is `self` comes back as a closure bound to this handle; any other member is
# handed back unchanged.
#' @export
`$.PdfMetadata` <- function(self, name) {
  member <- PdfMetadata[[name]]
  first_arg <- names(formals(member))[1]
  if (!identical(first_arg, "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` indexing shares the `$` dispatcher.
#' @export
`[[.PdfMetadata` <- `$.PdfMetadata`
|
|||
|
|
#' Output format of extraction results
#'
#' Governs the format of the `content` field in `ExtractionResult`.
#' With `Markdown`, `Djot`, or `Html` the output uses that format.
#' `Plain` returns the raw extracted text.
#' `Structured` returns JSON with full OCR element data including bounding
#' boxes and confidence scores.
#' @field Plain Plain text content only (default)
#' @field Markdown Markdown format
#' @field Djot Djot markup format
#' @field Html HTML format
#' @field Json JSON tree format with heading-driven sections.
#' @field Structured Structured JSON format with full OCR element metadata.
#' @field Custom Custom renderer registered via the RendererRegistry. The string is the renderer name (e.g., "docx",
#' @export
OutputFormat <- new.env(parent = emptyenv())

# Resolve `name` in the OutputFormat method table. A member whose first formal
# is `self` comes back as a closure bound to this handle; any other member is
# handed back unchanged.
#' @export
`$.OutputFormat` <- function(self, name) {
  member <- OutputFormat[[name]]
  first_arg <- names(formals(member))[1]
  if (!identical(first_arg, "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` indexing shares the `$` dispatcher.
#' @export
`[[.OutputFormat` <- `$.OutputFormat`
|
|||
|
|
#' Metadata specific to one format (discriminated union)
#'
#' An extraction result holds exactly one format type. That yields
#' type-safe, clean metadata without nested optionals.
#' @field Pdf Pdf
#' @field Docx Docx
#' @field Excel Excel
#' @field Email Email
#' @field Pptx Pptx
#' @field Archive Archive
#' @field Image Image
#' @field Xml Xml
#' @field Text Text
#' @field Html Html
#' @field Ocr Ocr
#' @field Csv Csv
#' @field Bibtex Bibtex
#' @field Citation Citation
#' @field FictionBook FictionBook
#' @field Dbf Dbf
#' @field Jats Jats
#' @field Epub Epub
#' @field Pst Pst
#' @field Code Code
#' @export
FormatMetadata <- new.env(parent = emptyenv())

# Resolve `name` in the FormatMetadata method table. A member whose first
# formal is `self` comes back as a closure bound to this handle; any other
# member is handed back unchanged.
#' @export
`$.FormatMetadata` <- function(self, name) {
  member <- FormatMetadata[[name]]
  first_arg <- names(formals(member))[1]
  if (!identical(first_arg, "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` indexing shares the `$` dispatcher.
#' @export
`[[.FormatMetadata` <- `$.FormatMetadata`
|
|||
|
|
#' One line of a unified-diff hunk
#'
#' Declared here (and not solely in `crate::diff`) so that `RevisionDelta` can
#' refer to it unconditionally, without needing the `diff` Cargo feature.
#' `crate::diff` re-exports this type verbatim.
#' @field Context Unchanged context line.
#' @field Added Line added in the "after" version.
#' @field Removed Line removed from the "before" version.
#' @export
DiffLine <- new.env(parent = emptyenv())

# Resolve `name` in the DiffLine method table. A member whose first formal is
# `self` comes back as a closure bound to this handle; any other member is
# handed back unchanged.
#' @export
`$.DiffLine` <- function(self, name) {
  member <- DiffLine[[name]]
  first_arg <- names(formals(member))[1]
  if (!identical(first_arg, "self")) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` indexing shares the `$` dispatcher.
#' @export
`[[.DiffLine` <- `$.DiffLine`
|
|||
|
|
#' ExecutionProviderType enum constructor
#'
#' Takes no arguments and returns an empty list tagged with the S3 class
#' `"ExecutionProviderType"`; the generator documents this as the default
#' ExecutionProviderType variant.
#'
#' @return Empty list with class `"ExecutionProviderType"`
#' @export
ExecutionProviderType <- function() {
  structure(list(), class = "ExecutionProviderType")
}
|
|||
|
|
|
|||
|
|
#' HtmlTheme enum constructor
#'
#' Takes no arguments and returns an empty list tagged with the S3 class
#' `"HtmlTheme"`; the generator documents this as the default HtmlTheme
#' variant.
#'
#' @return Empty list with class `"HtmlTheme"`
#' @export
HtmlTheme <- function() {
  structure(list(), class = "HtmlTheme")
}
|
|||
|
|
|
|||
|
|
#' TableModel enum constructor
#'
#' Takes no arguments and returns an empty list tagged with the S3 class
#' `"TableModel"`; the generator documents this as the default TableModel
#' variant.
#'
#' @return Empty list with class `"TableModel"`
#' @export
TableModel <- function() {
  structure(list(), class = "TableModel")
}
|
|||
|
|
|
|||
|
|
#' ChunkerType enum constructor
#'
#' Takes no arguments and returns an empty list tagged with the S3 class
#' `"ChunkerType"`; the generator documents this as the default ChunkerType
#' variant.
#'
#' @return Empty list with class `"ChunkerType"`
#' @export
ChunkerType <- function() {
  structure(list(), class = "ChunkerType")
}
|
|||
|
|
|
|||
|
|
#' CodeContentMode enum constructor
#'
#' Takes no arguments and returns an empty list tagged with the S3 class
#' `"CodeContentMode"`; the generator documents this as the default
#' CodeContentMode variant.
#'
#' @return Empty list with class `"CodeContentMode"`
#' @export
CodeContentMode <- function() {
  structure(list(), class = "CodeContentMode")
}
|
|||
|
|
|
|||
|
|
#' ListType enum constructor
#'
#' Takes no arguments and returns an empty list tagged with the S3 class
#' `"ListType"`; the generator documents this as the default ListType
#' variant.
#'
#' @return Empty list with class `"ListType"`
#' @export
ListType <- function() {
  structure(list(), class = "ListType")
}
|
|||
|
|
|
|||
|
|
#' OcrBackendType enum constructor
#'
#' Takes no arguments and returns an empty list tagged with the S3 class
#' `"OcrBackendType"`; the generator documents this as the default
#' OcrBackendType variant.
#'
#' @return Empty list with class `"OcrBackendType"`
#' @export
OcrBackendType <- function() {
  structure(list(), class = "OcrBackendType")
}
|
|||
|
|
|
|||
|
|
#' ProcessingStage enum constructor
#'
#' Takes no arguments and returns an empty list tagged with the S3 class
#' `"ProcessingStage"`; the generator documents this as the default
#' ProcessingStage variant.
#'
#' @return Empty list with class `"ProcessingStage"`
#' @export
ProcessingStage <- function() {
  structure(list(), class = "ProcessingStage")
}
|
|||
|
|
|
|||
|
|
#' ReductionLevel enum constructor
#'
#' Takes no arguments and returns an empty list tagged with the S3 class
#' `"ReductionLevel"`; the generator documents this as the default
#' ReductionLevel variant.
#'
#' @return Empty list with class `"ReductionLevel"`
#' @export
ReductionLevel <- function() {
  structure(list(), class = "ReductionLevel")
}
|
|||
|
|
|
|||
|
|
#' PdfAnnotationType enum constructor
#'
#' Takes no arguments and returns an empty list tagged with the S3 class
#' `"PdfAnnotationType"`; the generator documents this as the default
#' PdfAnnotationType variant.
#'
#' @return Empty list with class `"PdfAnnotationType"`
#' @export
PdfAnnotationType <- function() {
  structure(list(), class = "PdfAnnotationType")
}
|
|||
|
|
|
|||
|
|
#' BlockType enum constructor
#'
#' Takes no arguments and returns an empty list tagged with the S3 class
#' `"BlockType"`; the generator documents this as the default BlockType
#' variant.
#'
#' @return Empty list with class `"BlockType"`
#' @export
BlockType <- function() {
  structure(list(), class = "BlockType")
}
|
|||
|
|
|
|||
|
|
#' InlineType enum constructor
#'
#' Takes no arguments and returns an empty list tagged with the S3 class
#' `"InlineType"`; the generator documents this as the default InlineType
#' variant.
#'
#' @return Empty list with class `"InlineType"`
#' @export
InlineType <- function() {
  structure(list(), class = "InlineType")
}
|
|||
|
|
|
|||
|
|
#' RelationshipKind enum constructor
#'
#' Takes no arguments and returns an empty list tagged with the S3 class
#' `"RelationshipKind"`; the generator documents this as the default
#' RelationshipKind variant.
#'
#' @return Empty list with class `"RelationshipKind"`
#' @export
RelationshipKind <- function() {
  structure(list(), class = "RelationshipKind")
}
|
|||
|
|
|
|||
|
|
#' ContentLayer enum constructor
#'
#' Takes no arguments and returns an empty list tagged with the S3 class
#' `"ContentLayer"`; the generator documents this as the default ContentLayer
#' variant.
#'
#' @return Empty list with class `"ContentLayer"`
#' @export
ContentLayer <- function() {
  structure(list(), class = "ContentLayer")
}
|
|||
|
|
|
|||
|
|
#' ExtractionMethod enum constructor
#'
#' Takes no arguments and returns an empty list tagged with the S3 class
#' `"ExtractionMethod"`; the generator documents this as the default
#' ExtractionMethod variant.
#'
#' @return Empty list with class `"ExtractionMethod"`
#' @export
ExtractionMethod <- function() {
  structure(list(), class = "ExtractionMethod")
}
|
|||
|
|
|
|||
|
|
#' ChunkType enum constructor
#'
#' Takes no arguments and returns an empty list tagged with the S3 class
#' `"ChunkType"`; the generator documents this as the default ChunkType
#' variant.
#'
#' @return Empty list with class `"ChunkType"`
#' @export
ChunkType <- function() {
  structure(list(), class = "ChunkType")
}
|
|||
|
|
|
|||
|
|
#' ImageKind enum constructor
#'
#' Takes no arguments and returns an empty list tagged with the S3 class
#' `"ImageKind"`; the generator documents this as the default ImageKind
#' variant.
#'
#' @return Empty list with class `"ImageKind"`
#' @export
ImageKind <- function() {
  structure(list(), class = "ImageKind")
}
|
|||
|
|
|
|||
|
|
#' ResultFormat enum constructor
#'
#' Takes no arguments and returns an empty list tagged with the S3 class
#' `"ResultFormat"`; the generator documents this as the default ResultFormat
#' variant.
#'
#' @return Empty list with class `"ResultFormat"`
#' @export
ResultFormat <- function() {
  structure(list(), class = "ResultFormat")
}
|
|||
|
|
|
|||
|
|
#' ElementType enum constructor
#'
#' Takes no arguments and returns an empty list tagged with the S3 class
#' `"ElementType"`; the generator documents this as the default ElementType
#' variant.
#'
#' @return Empty list with class `"ElementType"`
#' @export
ElementType <- function() {
  structure(list(), class = "ElementType")
}
|
|||
|
|
|
|||
|
|
#' TextDirection enum constructor
#'
#' Takes no arguments and returns an empty list tagged with the S3 class
#' `"TextDirection"`; the generator documents this as the default
#' TextDirection variant.
#'
#' @return Empty list with class `"TextDirection"`
#' @export
TextDirection <- function() {
  structure(list(), class = "TextDirection")
}
|
|||
|
|
|
|||
|
|
#' Construct a `LinkType` enum value
#'
#' Takes no arguments and yields the default `LinkType` variant,
#' represented as an empty list carrying the S3 class `"LinkType"`.
#'
#' @return An object of S3 class `"LinkType"`.
#' @export
LinkType <- function() {
  structure(list(), class = "LinkType")
}
|
|||
|
|
|
|||
|
|
#' Construct an `ImageType` enum value
#'
#' Takes no arguments and yields the default `ImageType` variant,
#' represented as an empty list carrying the S3 class `"ImageType"`.
#'
#' @return An object of S3 class `"ImageType"`.
#' @export
ImageType <- function() {
  structure(list(), class = "ImageType")
}
|
|||
|
|
|
|||
|
|
#' Construct a `StructuredDataType` enum value
#'
#' Takes no arguments and yields the default `StructuredDataType` variant,
#' represented as an empty list carrying the S3 class `"StructuredDataType"`.
#'
#' @return An object of S3 class `"StructuredDataType"`.
#' @export
StructuredDataType <- function() {
  structure(list(), class = "StructuredDataType")
}
|
|||
|
|
|
|||
|
|
#' Construct an `OcrElementLevel` enum value
#'
#' Takes no arguments and yields the default `OcrElementLevel` variant,
#' represented as an empty list carrying the S3 class `"OcrElementLevel"`.
#'
#' @return An object of S3 class `"OcrElementLevel"`.
#' @export
OcrElementLevel <- function() {
  structure(list(), class = "OcrElementLevel")
}
|
|||
|
|
|
|||
|
|
#' Construct a `PageUnitType` enum value
#'
#' Takes no arguments and yields the default `PageUnitType` variant,
#' represented as an empty list carrying the S3 class `"PageUnitType"`.
#'
#' @return An object of S3 class `"PageUnitType"`.
#' @export
PageUnitType <- function() {
  structure(list(), class = "PageUnitType")
}
|
|||
|
|
|
|||
|
|
#' Construct a `RevisionKind` enum value
#'
#' Takes no arguments and yields the default `RevisionKind` variant,
#' represented as an empty list carrying the S3 class `"RevisionKind"`.
#'
#' @return An object of S3 class `"RevisionKind"`.
#' @export
RevisionKind <- function() {
  structure(list(), class = "RevisionKind")
}
|
|||
|
|
|
|||
|
|
#' Construct a `UriKind` enum value
#'
#' Takes no arguments and yields the default `UriKind` variant,
#' represented as an empty list carrying the S3 class `"UriKind"`.
#'
#' @return An object of S3 class `"UriKind"`.
#' @export
UriKind <- function() {
  structure(list(), class = "UriKind")
}
|
|||
|
|
|
|||
|
|
#' Construct a `KeywordAlgorithm` enum value
#'
#' Takes no arguments and yields the default `KeywordAlgorithm` variant,
#' represented as an empty list carrying the S3 class `"KeywordAlgorithm"`.
#'
#' @return An object of S3 class `"KeywordAlgorithm"`.
#' @export
KeywordAlgorithm <- function() {
  structure(list(), class = "KeywordAlgorithm")
}
|
|||
|
|
|
|||
|
|
#' Construct a `PSMMode` enum value
#'
#' Takes no arguments and yields the default `PSMMode` variant,
#' represented as an empty list carrying the S3 class `"PSMMode"`.
#'
#' @return An object of S3 class `"PSMMode"`.
#' @export
PSMMode <- function() {
  structure(list(), class = "PSMMode")
}
|
|||
|
|
|
|||
|
|
#' Construct a `PaddleLanguage` enum value
#'
#' Takes no arguments and yields the default `PaddleLanguage` variant,
#' represented as an empty list carrying the S3 class `"PaddleLanguage"`.
#'
#' @return An object of S3 class `"PaddleLanguage"`.
#' @export
PaddleLanguage <- function() {
  structure(list(), class = "PaddleLanguage")
}
|
|||
|
|
|
|||
|
|
#' Construct a `LayoutClass` enum value
#'
#' Takes no arguments and yields the default `LayoutClass` variant,
#' represented as an empty list carrying the S3 class `"LayoutClass"`.
#'
#' @return An object of S3 class `"LayoutClass"`.
#' @export
LayoutClass <- function() {
  structure(list(), class = "LayoutClass")
}
|
|||
|
|
|
|||
|
|
#' Unit used to measure chunk size
#'
#' The default is `Characters` (Unicode character count). With token-based
#' sizing, chunks are sized by token count according to the specified tokenizer.
#'
#' Token-based sizing uses HuggingFace tokenizers loaded at runtime. Any
#' tokenizer available on HuggingFace Hub can be used, including
#' OpenAI-compatible tokenizers (e.g., `Xenova/gpt-4o`, `Xenova/cl100k_base`).
#' @export
ChunkSizing <- new.env(parent = emptyenv())

# Calls the native symbol `wrap__ChunkSizing__default` with no arguments.
ChunkSizing$default <- function() {
  .Call("wrap__ChunkSizing__default", PACKAGE = "kreuzberg")
}

# Passes `json` to the native symbol `wrap__ChunkSizing__from_json`.
ChunkSizing$from_json <- function(json) {
  .Call("wrap__ChunkSizing__from_json", json, PACKAGE = "kreuzberg")
}

# `$` accessor for objects of class "ChunkSizing": looks `name` up in the
# ChunkSizing environment. A member whose first formal is `self` comes back as
# a closure with the object pre-bound; anything else is returned untouched.
#' @export
`$.ChunkSizing` <- function(self, name) {
  member <- ChunkSizing[[name]]
  binds_self <- identical(names(formals(member))[1], "self")
  if (!binds_self) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` reuses the `$` implementation.
#' @export
`[[.ChunkSizing` <- `$.ChunkSizing`
|
|||
|
|
#' Embedding model types that Kreuzberg supports
#' @export
EmbeddingModelType <- new.env(parent = emptyenv())

# Calls the native symbol `wrap__EmbeddingModelType__default` with no arguments.
EmbeddingModelType$default <- function() {
  .Call("wrap__EmbeddingModelType__default", PACKAGE = "kreuzberg")
}

# Passes `json` to the native symbol `wrap__EmbeddingModelType__from_json`.
EmbeddingModelType$from_json <- function(json) {
  .Call("wrap__EmbeddingModelType__from_json", json, PACKAGE = "kreuzberg")
}

# `$` accessor for objects of class "EmbeddingModelType": looks `name` up in
# the EmbeddingModelType environment. A member whose first formal is `self`
# comes back as a closure with the object pre-bound; anything else is
# returned untouched.
#' @export
`$.EmbeddingModelType` <- function(self, name) {
  member <- EmbeddingModelType[[name]]
  binds_self <- identical(names(formals(member))[1], "self")
  if (!binds_self) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` reuses the `$` implementation.
#' @export
`[[.EmbeddingModelType` <- `$.EmbeddingModelType`
|
|||
|
|
#' Tagged enum describing node content; each variant carries only type-specific data
#'
#' On the Rust side this uses `#[serde(tag = "node_type")]` so that the "type"
#' keyword does not collide in Go/Java/TypeScript bindings.
#' @export
NodeContent <- new.env(parent = emptyenv())

# Calls the native symbol `wrap__NodeContent__default` with no arguments.
NodeContent$default <- function() {
  .Call("wrap__NodeContent__default", PACKAGE = "kreuzberg")
}

# Passes `json` to the native symbol `wrap__NodeContent__from_json`.
NodeContent$from_json <- function(json) {
  .Call("wrap__NodeContent__from_json", json, PACKAGE = "kreuzberg")
}

# `$` accessor for objects of class "NodeContent": looks `name` up in the
# NodeContent environment. A member whose first formal is `self` comes back as
# a closure with the object pre-bound; anything else is returned untouched.
#' @export
`$.NodeContent` <- function(self, name) {
  member <- NodeContent[[name]]
  binds_self <- identical(names(formals(member))[1], "self")
  if (!binds_self) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` reuses the `$` implementation.
#' @export
`[[.NodeContent` <- `$.NodeContent`
|
|||
|
|
#' Kinds of inline text annotations
#' @export
AnnotationKind <- new.env(parent = emptyenv())

# Calls the native symbol `wrap__AnnotationKind__default` with no arguments.
AnnotationKind$default <- function() {
  .Call("wrap__AnnotationKind__default", PACKAGE = "kreuzberg")
}

# Passes `json` to the native symbol `wrap__AnnotationKind__from_json`.
AnnotationKind$from_json <- function(json) {
  .Call("wrap__AnnotationKind__from_json", json, PACKAGE = "kreuzberg")
}

# `$` accessor for objects of class "AnnotationKind": looks `name` up in the
# AnnotationKind environment. A member whose first formal is `self` comes back
# as a closure with the object pre-bound; anything else is returned untouched.
#' @export
`$.AnnotationKind` <- function(self, name) {
  member <- AnnotationKind[[name]]
  binds_self <- identical(names(formals(member))[1], "self")
  if (!binds_self) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` reuses the `$` implementation.
#' @export
`[[.AnnotationKind` <- `$.AnnotationKind`
|
|||
|
|
#' Bounding geometry of an OCR element
#'
#' Covers both axis-aligned rectangles (from Tesseract) and 4-point
#' quadrilaterals (from PaddleOCR and rotated text detection).
#' @export
OcrBoundingGeometry <- new.env(parent = emptyenv())

# Calls the native symbol `wrap__OcrBoundingGeometry__default` with no arguments.
OcrBoundingGeometry$default <- function() {
  .Call("wrap__OcrBoundingGeometry__default", PACKAGE = "kreuzberg")
}

# Passes `json` to the native symbol `wrap__OcrBoundingGeometry__from_json`.
OcrBoundingGeometry$from_json <- function(json) {
  .Call("wrap__OcrBoundingGeometry__from_json", json, PACKAGE = "kreuzberg")
}

# `$` accessor for objects of class "OcrBoundingGeometry": looks `name` up in
# the OcrBoundingGeometry environment. A member whose first formal is `self`
# comes back as a closure with the object pre-bound; anything else is
# returned untouched.
#' @export
`$.OcrBoundingGeometry` <- function(self, name) {
  member <- OcrBoundingGeometry[[name]]
  binds_self <- identical(names(formals(member))[1], "self")
  if (!binds_self) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` reuses the `$` implementation.
#' @export
`[[.OcrBoundingGeometry` <- `$.OcrBoundingGeometry`
|
|||
|
|
#' Best-effort location of a revision within the document
#' @export
RevisionAnchor <- new.env(parent = emptyenv())

# Calls the native symbol `wrap__RevisionAnchor__default` with no arguments.
RevisionAnchor$default <- function() {
  .Call("wrap__RevisionAnchor__default", PACKAGE = "kreuzberg")
}

# Passes `json` to the native symbol `wrap__RevisionAnchor__from_json`.
RevisionAnchor$from_json <- function(json) {
  .Call("wrap__RevisionAnchor__from_json", json, PACKAGE = "kreuzberg")
}

# `$` accessor for objects of class "RevisionAnchor": looks `name` up in the
# RevisionAnchor environment. A member whose first formal is `self` comes back
# as a closure with the object pre-bound; anything else is returned untouched.
#' @export
`$.RevisionAnchor` <- function(self, name) {
  member <- RevisionAnchor[[name]]
  binds_self <- identical(names(formals(member))[1], "self")
  if (!binds_self) {
    return(member)
  }
  function(...) member(self, ...)
}

# `[[` reuses the `$` implementation.
#' @export
`[[.RevisionAnchor` <- `$.RevisionAnchor`
|
|||
|
|
# S3 generic: dispatches on the class of `x`; arguments in `...` are passed
# through to the selected method.
#' @export
cors_allows_all <- function(x, ...) {
  UseMethod("cors_allows_all")
}
|
|||
|
|
# S3 generic: dispatches on the class of `x`; arguments in `...` are passed
# through to the selected method.
#' @export
is_empty <- function(x, ...) {
  UseMethod("is_empty")
}
|
|||
|
|
# S3 generic: dispatches on the class of `x`; arguments in `...` are passed
# through to the selected method.
#' @export
is_origin_allowed <- function(x, ...) {
  UseMethod("is_origin_allowed")
}
|
|||
|
|
# S3 generic: dispatches on the class of `x`; arguments in `...` are passed
# through to the selected method.
#' @export
listen_addr <- function(x, ...) {
  UseMethod("listen_addr")
}
|
|||
|
|
# S3 generic: dispatches on the class of `x`; arguments in `...` are passed
# through to the selected method.
#' @export
max_multipart_field_mb <- function(x, ...) {
  UseMethod("max_multipart_field_mb")
}
|
|||
|
|
# S3 generic: dispatches on the class of `x`; arguments in `...` are passed
# through to the selected method.
#' @export
max_request_body_mb <- function(x, ...) {
  UseMethod("max_request_body_mb")
}
|
|||
|
|
# S3 generic: dispatches on the class of `x`; arguments in `...` are passed
# through to the selected method.
#' @export
needs_image_processing <- function(x, ...) {
  UseMethod("needs_image_processing")
}
|
|||
|
|
# S3 generic: dispatches on the class of `x`; arguments in `...` are passed
# through to the selected method.
#' @export
with_angle_cls <- function(x, ...) {
  UseMethod("with_angle_cls")
}
|
|||
|
|
# S3 generic: dispatches on the class of `x`; arguments in `...` are passed
# through to the selected method.
#' @export
with_cache_dir <- function(x, ...) {
  UseMethod("with_cache_dir")
}
|
|||
|
|
# S3 generic: dispatches on the class of `x`; arguments in `...` are passed
# through to the selected method.
#' @export
with_det_db_box_thresh <- function(x, ...) {
  UseMethod("with_det_db_box_thresh")
}
|
|||
|
|
# S3 generic: dispatches on the class of `x`; arguments in `...` are passed
# through to the selected method.
#' @export
with_det_db_thresh <- function(x, ...) {
  UseMethod("with_det_db_thresh")
}
|
|||
|
|
# S3 generic: dispatches on the class of `x`; arguments in `...` are passed
# through to the selected method.
#' @export
with_det_db_unclip_ratio <- function(x, ...) {
  UseMethod("with_det_db_unclip_ratio")
}
|
|||
|
|
# S3 generic: dispatches on the class of `x`; arguments in `...` are passed
# through to the selected method.
#' @export
with_det_limit_side_len <- function(x, ...) {
  UseMethod("with_det_limit_side_len")
}
|
|||
|
|
# S3 generic: dispatches on the class of `x`; arguments in `...` are passed
# through to the selected method.
#' @export
with_drop_score <- function(x, ...) {
  UseMethod("with_drop_score")
}
|
|||
|
|
# S3 generic: dispatches on the class of `x`; arguments in `...` are passed
# through to the selected method.
#' @export
with_model_tier <- function(x, ...) {
  UseMethod("with_model_tier")
}
|
|||
|
|
# S3 generic: dispatches on the class of `x`; arguments in `...` are passed
# through to the selected method.
#' @export
with_padding <- function(x, ...) {
  UseMethod("with_padding")
}
|
|||
|
|
# S3 generic: dispatches on the class of `x`; arguments in `...` are passed
# through to the selected method.
#' @export
with_rec_batch_num <- function(x, ...) {
  UseMethod("with_rec_batch_num")
}
|
|||
|
|
# S3 generic: dispatches on the class of `x`; arguments in `...` are passed
# through to the selected method.
#' @export
with_table_detection <- function(x, ...) {
  UseMethod("with_table_detection")
}
|