# Generated by extendr: Do not edit by hand
#
# This file is regenerated by alef on every `alef generate` run.
# It mirrors the output of `rextendr::document()` and binds every
# wrap__ entry registered in extendr_module! to an R-callable
# function or class env.

#' @useDynLib kreuzberg, .registration = TRUE
NULL

#' Extract content from a byte array
#'
#' Main entry point for in-memory extraction. The following steps are performed:
#' 1. Validate MIME type
#' 2. Handle legacy format conversion if needed
#' 3. Select appropriate extractor from registry
#' 4. Extract content
#' 5. Run post-processing pipeline
#' @param content The byte array to extract.
#' @param mime_type MIME type of the content.
#' @param config Extraction configuration. Defaults to
#'   `ExtractionConfig$default()`.
#' @return An `ExtractionResult` containing the extracted content and metadata.
#'
#' @section Errors:
#' Returns `KreuzbergError::Validation` if MIME type is invalid.
#' Returns `KreuzbergError::UnsupportedFormat` if MIME type is not supported.
#' @export
extract_bytes <- function(content,
                          mime_type,
                          config = ExtractionConfig$default()) {
  .Call(
    "wrap__extract_bytes",
    content,
    mime_type,
    config,
    PACKAGE = "kreuzberg"
  )
}

#' Extract content from a file
#'
#' This is the main entry point for file-based extraction. It performs the following steps:
#' 1. Check cache for existing result (if caching enabled)
#' 2. Detect or validate MIME type
#' 3. Select appropriate extractor from registry
#' 4. Extract content
#' 5. Run post-processing pipeline
#' 6. Store result in cache (if caching enabled)
#' @param path Path to the file to extract.
#' @param mime_type Optional MIME type override. If None, will be auto-detected.
#' @param config Extraction configuration.
#' @return An `ExtractionResult` containing the extracted content and metadata.
#'
#' @section Errors:
#' Returns `KreuzbergError::Io` if the file doesn't exist (NotFound) or for other file I/O errors.
#' Returns `KreuzbergError::UnsupportedFormat` if MIME type is not supported.
#' @export
extract_file <- function(path,
                         mime_type = NULL,
                         config = ExtractionConfig$default()) {
  .Call(
    "wrap__extract_file",
    path,
    mime_type,
    config,
    PACKAGE = "kreuzberg"
  )
}

#' Synchronous wrapper for `extract_file`
#'
#' Convenience function that blocks the current thread until extraction
#' completes. For async code, use `extract_file` directly.
#'
#' Uses the global Tokio runtime for 100x+ performance improvement over creating
#' a new runtime per call. Always uses the global runtime to avoid nested runtime issues.
#'
#' This function is only available with the `tokio-runtime` feature. For WASM targets,
#' use a truly synchronous extraction approach instead.
#' @param path File path as character string.
#' @param mime_type Character string, or `NULL` (the default).
#' @param config ExtractionConfig object (list with class attribute).
#' @return ExtractionResult object (list with class attribute).
#' @export
extract_file_sync <- function(path,
                              mime_type = NULL,
                              config = ExtractionConfig$default()) {
  .Call(
    "wrap__extract_file_sync",
    path,
    mime_type,
    config,
    PACKAGE = "kreuzberg"
  )
}

#' Synchronous wrapper for `extract_bytes`
#'
#' Uses the global Tokio runtime for 100x+ performance improvement over creating
#' a new runtime per call.
#'
#' With the `tokio-runtime` feature, this blocks the current thread using the global
#' Tokio runtime. Without it (WASM), this calls a truly synchronous implementation.
#' @param content Raw vector of bytes.
#' @param mime_type Character string.
#' @param config ExtractionConfig object (list with class attribute).
#' @return ExtractionResult object (list with class attribute).
#' @export
extract_bytes_sync <- function(content,
                               mime_type,
                               config = ExtractionConfig$default()) {
  .Call(
    "wrap__extract_bytes_sync",
    content,
    mime_type,
    config,
    PACKAGE = "kreuzberg"
  )
}

#' Synchronous wrapper for `batch_extract_files`
#'
#' Uses the global Tokio runtime for optimal performance.
#' Only available with `tokio-runtime` (WASM has no filesystem).
#' @param items List of batchfileitem object (list with class attribute).
#' @param config ExtractionConfig object (list with class attribute).
#' @return List of extractionresult object (list with class attribute).
#' @export
batch_extract_files_sync <- function(items,
                                     config = ExtractionConfig$default()) {
  .Call(
    "wrap__batch_extract_files_sync",
    items,
    config,
    PACKAGE = "kreuzberg"
  )
}

#' Synchronous wrapper for `batch_extract_bytes`
#'
#' Uses the global Tokio runtime for optimal performance.
#' With the `tokio-runtime` feature, this blocks the current thread using the global
#' Tokio runtime. Without it (WASM), this calls a truly synchronous implementation
#' that iterates through items and calls `extract_bytes_sync()`.
#' @param items List of batchbytesitem object (list with class attribute).
#' @param config ExtractionConfig object (list with class attribute).
#' @return List of extractionresult object (list with class attribute).
#' @export
batch_extract_bytes_sync <- function(items,
                                     config = ExtractionConfig$default()) {
  .Call(
    "wrap__batch_extract_bytes_sync",
    items,
    config,
    PACKAGE = "kreuzberg"
  )
}

#' Extract content from multiple files concurrently
#'
#' This function processes multiple files in parallel, automatically managing
#' concurrency to prevent resource exhaustion. The concurrency limit can be
#' configured via `ExtractionConfig::max_concurrent_extractions` or defaults
#' to `(num_cpus * 1.5).ceil()`.
#'
#' Each file can optionally specify a [`FileExtractionConfig`] that overrides specific
#' fields from the batch-level `config`. Pass `None` for a file to use the batch defaults.
#' Batch-level settings like `max_concurrent_extractions` and `use_cache` are always
#' taken from the batch-level `config`.
#' @param items Vector of `BatchFileItem` structs, each containing a path and optional per-file configuration overrides.
#' @param config Batch-level extraction configuration (provides defaults and batch settings).
#' @return A vector of `ExtractionResult` in the same order as the input items.
#'
#' @section Errors:
#' Individual file errors are captured in the result metadata. System errors
#' (IO, RuntimeError equivalents) will bubble up and fail the entire batch.
#' @export
batch_extract_files <- function(items, config = ExtractionConfig$default()) {
  .Call(
    "wrap__batch_extract_files",
    items,
    config,
    PACKAGE = "kreuzberg"
  )
}

#' Extract content from multiple byte arrays concurrently
#'
#' Processes multiple byte arrays in parallel, automatically managing
#' concurrency to prevent resource exhaustion. The concurrency limit can be
#' configured via `ExtractionConfig::max_concurrent_extractions` or defaults
#' to `(num_cpus * 1.5).ceil()`.
#'
#' Each item can optionally specify a [`FileExtractionConfig`] that overrides specific
#' fields from the batch-level `config`. Pass `None` as the config to use
#' the batch-level defaults for that item.
#' @param items Vector of `BatchBytesItem` structs, each containing content bytes, MIME type, and optional per-item configuration overrides.
#' @param config Batch-level extraction configuration.
#' @return A vector of `ExtractionResult` in the same order as the input items.
#' @export
batch_extract_bytes <- function(items, config = ExtractionConfig$default()) {
  .Call(
    "wrap__batch_extract_bytes",
    items,
    config,
    PACKAGE = "kreuzberg"
  )
}

#' Detect MIME type from raw file bytes
#'
#' Uses magic byte signatures to detect file type from content.
#' Falls back to `infer` crate for comprehensive detection.
#'
#' For ZIP-based files, inspects contents to distinguish Office Open XML
#' formats (DOCX, XLSX, PPTX) from plain ZIP archives.
#' @param content Raw file bytes.
#' @return The detected MIME type string.
#'
#' @section Errors:
#' Returns `KreuzbergError::UnsupportedFormat` if MIME type cannot be determined.
#' @export
detect_mime_type_from_bytes <- function(content) {
  .Call("wrap__detect_mime_type_from_bytes", content, PACKAGE = "kreuzberg")
}

#' Get file extensions for a given MIME type
#'
#' Looks up every known file extension that maps to the specified MIME type.
#' @param mime_type The MIME type to look up.
#' @return A vector of file extensions (without leading dot) for the MIME type.
#' @export
get_extensions_for_mime <- function(mime_type) {
  .Call("wrap__get_extensions_for_mime", mime_type, PACKAGE = "kreuzberg")
}

#' List the names of all registered embedding backends
#'
#' Used by `kreuzberg-cli`, the api/mcp endpoints, and generated language
#' bindings.
#' @return List of character string.
#' @export
list_embedding_backends <- function() {
  .Call("wrap__list_embedding_backends", PACKAGE = "kreuzberg")
}

#' List names of all registered document extractors
#' @return List of character string.
#' @export
list_document_extractors <- function() {
  .Call("wrap__list_document_extractors", PACKAGE = "kreuzberg")
}

#' List all registered OCR backends
#'
#' Gives the names of all OCR backends currently registered in the global registry.
#' @return A vector of OCR backend names.
#' @export
list_ocr_backends <- function() {
  .Call("wrap__list_ocr_backends", PACKAGE = "kreuzberg")
}

#' List all registered post-processor names
#'
#' Returns a vector of all post-processor names currently registered in the
#' global registry.
#' @return Vector of post-processor names (`Ok(Vec)` on the Rust side);
#'   `Err(...)` if the registry lock is poisoned.
#' @export
list_post_processors <- function() {
  .Call("wrap__list_post_processors", PACKAGE = "kreuzberg")
}

#' List names of all registered renderers
#' @return List of character string.
#'
#' @section Errors:
#' Returns an error if the registry lock is poisoned.
#' @export
list_renderers <- function() {
  .Call("wrap__list_renderers", PACKAGE = "kreuzberg")
}

#' List names of all registered validators
#' @return List of character string.
#' @export
list_validators <- function() {
  .Call("wrap__list_validators", PACKAGE = "kreuzberg")
}

#' Compare two extraction results and return a structured diff
#'
#' The comparison is purely structural: no I/O, no side effects. All fields
#' of [`ExtractionDiff`] are populated according to the provided [`DiffOptions`].
#' @param a The "before" extraction result.
#' @param b The "after" extraction result.
#' @param opts Controls which sections are compared and optional truncation.
#' @return ExtractionDiff object (list with class attribute).
#' @export
compare <- function(a = ExtractionResult$default(),
                    b = ExtractionResult$default(),
                    opts = DiffOptions$default()) {
  .Call("wrap__compare", a, b, opts, PACKAGE = "kreuzberg")
}

#' Generate embeddings asynchronously for a list of text strings
#'
#' This is the async counterpart to [`embed_texts`]. It offloads the blocking
#' ONNX inference work to a dedicated blocking thread pool via Tokio's
#' `spawn_blocking`, keeping the async executor free.
#'
#' Returns one embedding vector per input text in the same order.
#' @param texts Vec of strings to embed (owned, sent to blocking thread).
#' @param config Embedding configuration specifying model, batch size, and normalization.
#' @return List of list of numeric.
#'
#' @section Errors:
#' - `KreuzbergError::MissingDependency` if ONNX Runtime is not installed
#' - `KreuzbergError::Embedding` if the preset name is unknown, model download fails,
#'   or the blocking inference task panics
#' @export
embed_texts_async <- function(texts, config = EmbeddingConfig$default()) {
  .Call("wrap__embed_texts_async", texts, config, PACKAGE = "kreuzberg")
}

#' Render a single PDF page to PNG bytes
#'
#' Returns raw PNG-encoded bytes for the specified page at the given DPI.
#' Uses pdf_oxide with tiny-skia for pure-Rust rendering.
#' @param pdf_bytes Raw PDF file bytes.
#' @param page_index Zero-based page index.
#' @param dpi Resolution in dots per inch (default: 150).
#' @param password Optional password for encrypted PDFs.
#' @return Raw vector of bytes.
#'
#' @section Errors:
#' Returns `KreuzbergError::Parsing` if the PDF cannot be opened, authenticated,
#' or rendered, or if `page_index` is out of range.
#' @export
render_pdf_page_to_png <- function(pdf_bytes,
                                   page_index,
                                   dpi = NULL,
                                   password = NULL) {
  .Call(
    "wrap__render_pdf_page_to_png",
    pdf_bytes,
    page_index,
    dpi,
    password,
    PACKAGE = "kreuzberg"
  )
}

#' Detect the MIME type of a file at the given path
#'
#' Uses the file extension and optionally the file content to determine the MIME type.
#' Set `check_exists` to `TRUE` to verify the file exists before detection.
#' @param path Character string.
#' @param check_exists Logical (TRUE/FALSE).
#' @return Character string.
#' @export
detect_mime_type <- function(path, check_exists) {
  .Call("wrap__detect_mime_type", path, check_exists, PACKAGE = "kreuzberg")
}

#' Embed a list of texts using the configured embedding model
#'
#' Returns a 2D vector where each inner vector is the embedding for the corresponding text.
#' @param texts List of character string.
#' @param config EmbeddingConfig object (list with class attribute).
#' @return List of list of numeric.
#' @export
embed_texts <- function(texts, config = EmbeddingConfig$default()) {
  .Call("wrap__embed_texts", texts, config, PACKAGE = "kreuzberg")
}

#' Get an embedding preset by name
#'
#' Returns `None` if no preset with the given name exists. Returns an owned
#' clone so the value is safe to pass across FFI boundaries.
#' @param name Character string.
#' @return Optional EmbeddingPreset object (list with class attribute). Defaults to NULL.
#' @export
get_embedding_preset <- function(name) {
  .Call("wrap__get_embedding_preset", name, PACKAGE = "kreuzberg")
}

#' List the names of all available embedding presets
#'
#' Returns owned `String`s so the values are safe to pass across FFI boundaries.
#' @return List of character string.
#' @export
list_embedding_presets <- function() {
  .Call("wrap__list_embedding_presets", PACKAGE = "kreuzberg")
}

#' register_ocr_backend
#'
#' Register an R-side plugin implementation. Pass a named list whose entries
#' implement the trait's required methods (e.g. `list(name = function() "my", ...)`).
#'
#' @param r_backend Named list of R closures implementing the trait surface.
#'
#' @return Invisible NULL on success; raises an R error on failure.
#' @export
register_ocr_backend <- function(r_backend) {
  # invisible(): the documented contract is an invisible result, so the
  # value must not auto-print at the console.
  invisible(.Call("wrap__register_ocr_backend", r_backend, PACKAGE = "kreuzberg"))
}

#' unregister_ocr_backend
#'
#' Unregister a previously registered plugin by name.
#'
#' @param name Plugin name string as returned by the backend's `name()` method.
#'
#' @return Invisible NULL on success; raises an R error on failure.
#' @export
unregister_ocr_backend <- function(name) {
  # invisible(): keeps the return value from auto-printing, as documented.
  invisible(.Call("wrap__unregister_ocr_backend", name, PACKAGE = "kreuzberg"))
}

#' clear_ocr_backends
#'
#' Remove every registered plugin of this type. Typically used in test teardown.
#'
#' @return Invisible NULL on success; raises an R error on failure.
#' @export
clear_ocr_backends <- function() {
  # invisible(): keeps the return value from auto-printing, as documented.
  invisible(.Call("wrap__clear_ocr_backends", PACKAGE = "kreuzberg"))
}

#' register_post_processor
#'
#' Register an R-side plugin implementation. Pass a named list whose entries
#' implement the trait's required methods (e.g. `list(name = function() "my", ...)`).
#'
#' @param r_backend Named list of R closures implementing the trait surface.
#'
#' @return Invisible NULL on success; raises an R error on failure.
#' @export
register_post_processor <- function(r_backend) {
  # invisible(): keeps the return value from auto-printing, as documented.
  invisible(.Call("wrap__register_post_processor", r_backend, PACKAGE = "kreuzberg"))
}

#' unregister_post_processor
#'
#' Unregister a previously registered plugin by name.
#'
#' @param name Plugin name string as returned by the backend's `name()` method.
#'
#' @return Invisible NULL on success; raises an R error on failure.
#' @export
unregister_post_processor <- function(name) {
  # invisible(): the documented contract is an invisible result, so the
  # value must not auto-print at the console.
  invisible(.Call("wrap__unregister_post_processor", name, PACKAGE = "kreuzberg"))
}

#' clear_post_processors
#'
#' Remove every registered plugin of this type. Typically used in test teardown.
#'
#' @return Invisible NULL on success; raises an R error on failure.
#' @export
clear_post_processors <- function() {
  # invisible(): keeps the return value from auto-printing, as documented.
  invisible(.Call("wrap__clear_post_processors", PACKAGE = "kreuzberg"))
}

#' register_validator
#'
#' Register an R-side plugin implementation. Pass a named list whose entries
#' implement the trait's required methods (e.g. `list(name = function() "my", ...)`).
#'
#' @param r_backend Named list of R closures implementing the trait surface.
#'
#' @return Invisible NULL on success; raises an R error on failure.
#' @export
register_validator <- function(r_backend) {
  # invisible(): keeps the return value from auto-printing, as documented.
  invisible(.Call("wrap__register_validator", r_backend, PACKAGE = "kreuzberg"))
}

#' unregister_validator
#'
#' Unregister a previously registered plugin by name.
#'
#' @param name Plugin name string as returned by the backend's `name()` method.
#'
#' @return Invisible NULL on success; raises an R error on failure.
#' @export
unregister_validator <- function(name) {
  # invisible(): keeps the return value from auto-printing, as documented.
  invisible(.Call("wrap__unregister_validator", name, PACKAGE = "kreuzberg"))
}

#' clear_validators
#'
#' Remove every registered plugin of this type. Typically used in test teardown.
#'
#' @return Invisible NULL on success; raises an R error on failure.
#' @export
clear_validators <- function() {
  # invisible(): keeps the return value from auto-printing, as documented.
  invisible(.Call("wrap__clear_validators", PACKAGE = "kreuzberg"))
}

#' register_embedding_backend
#'
#' Register an R-side plugin implementation. Pass a named list whose entries
#' implement the trait's required methods (e.g. `list(name = function() "my", ...)`).
#'
#' @param r_backend Named list of R closures implementing the trait surface.
#'
#' @return Invisible NULL on success; raises an R error on failure.
#' @export
register_embedding_backend <- function(r_backend) {
  # invisible(): the documented contract is an invisible result, so the
  # value must not auto-print at the console.
  invisible(.Call("wrap__register_embedding_backend", r_backend, PACKAGE = "kreuzberg"))
}

#' unregister_embedding_backend
#'
#' Unregister a previously registered plugin by name.
#'
#' @param name Plugin name string as returned by the backend's `name()` method.
#'
#' @return Invisible NULL on success; raises an R error on failure.
#' @export
unregister_embedding_backend <- function(name) {
  # invisible(): keeps the return value from auto-printing, as documented.
  invisible(.Call("wrap__unregister_embedding_backend", name, PACKAGE = "kreuzberg"))
}

#' clear_embedding_backends
#'
#' Remove every registered plugin of this type. Typically used in test teardown.
#'
#' @return Invisible NULL on success; raises an R error on failure.
#' @export
clear_embedding_backends <- function() {
  # invisible(): keeps the return value from auto-printing, as documented.
  invisible(.Call("wrap__clear_embedding_backends", PACKAGE = "kreuzberg"))
}

#' register_document_extractor
#'
#' Register an R-side plugin implementation. Pass a named list whose entries
#' implement the trait's required methods (e.g. `list(name = function() "my", ...)`).
#'
#' @param r_backend Named list of R closures implementing the trait surface.
#'
#' @return Invisible NULL on success; raises an R error on failure.
#' @export
register_document_extractor <- function(r_backend) {
  # invisible(): keeps the return value from auto-printing, as documented.
  invisible(.Call("wrap__register_document_extractor", r_backend, PACKAGE = "kreuzberg"))
}

#' unregister_document_extractor
#'
#' Unregister a previously registered plugin by name.
#'
#' @param name Plugin name string as returned by the backend's `name()` method.
#'
#' @return Invisible NULL on success; raises an R error on failure.
#' @export
unregister_document_extractor <- function(name) {
  # invisible(): keeps the return value from auto-printing, as documented.
  invisible(.Call("wrap__unregister_document_extractor", name, PACKAGE = "kreuzberg"))
}

#' clear_document_extractors
#'
#' Remove every registered plugin of this type. Typically used in test teardown.
#'
#' @return Invisible NULL on success; raises an R error on failure.
#' @export
clear_document_extractors <- function() {
  # invisible(): the documented contract is an invisible result, so the
  # value must not auto-print at the console.
  invisible(.Call("wrap__clear_document_extractors", PACKAGE = "kreuzberg"))
}

#' register_renderer
#'
#' Register an R-side plugin implementation. Pass a named list whose entries
#' implement the trait's required methods (e.g. `list(name = function() "my", ...)`).
#'
#' @param r_backend Named list of R closures implementing the trait surface.
#'
#' @return Invisible NULL on success; raises an R error on failure.
#' @export
register_renderer <- function(r_backend) {
  # invisible(): keeps the return value from auto-printing, as documented.
  invisible(.Call("wrap__register_renderer", r_backend, PACKAGE = "kreuzberg"))
}

#' unregister_renderer
#'
#' Unregister a previously registered plugin by name.
#'
#' @param name Plugin name string as returned by the backend's `name()` method.
#'
#' @return Invisible NULL on success; raises an R error on failure.
#' @export
unregister_renderer <- function(name) {
  # invisible(): keeps the return value from auto-printing, as documented.
  invisible(.Call("wrap__unregister_renderer", name, PACKAGE = "kreuzberg"))
}

#' clear_renderers
#'
#' Remove every registered plugin of this type. Typically used in test teardown.
#'
#' @return Invisible NULL on success; raises an R error on failure.
#' @export
clear_renderers <- function() {
  # invisible(): keeps the return value from auto-printing, as documented.
  invisible(.Call("wrap__clear_renderers", PACKAGE = "kreuzberg"))
}

#' CacheStats
#' @field total_files total_files
#' @field total_size_mb total_size_mb
#' @field available_space_mb available_space_mb
#' @field oldest_file_age_days oldest_file_age_days
#' @field newest_file_age_days newest_file_age_days
#' @export
CacheStats <- new.env(parent = emptyenv())

#' @export
`$.CacheStats` <- function(self, name) {
  member <- CacheStats[[name]]
  if (is.null(member)) {
    # `name` is not a method in the class env. Without this fallback every
    # data field of a list-backed instance would read as NULL through `$`.
    # NOTE(review): instances are described as "list with class attribute";
    # confirm against the Rust side. Non-list instances keep returning NULL.
    if (is.list(self)) {
      return(.subset2(self, name))
    }
    return(NULL)
  }
  if (identical(names(formals(member))[1], "self")) {
    # Bind the receiver so `obj$method(...)` forwards `obj` as `self`.
    function(...) member(self, ...)
  } else {
    member
  }
}

#' @export
`[[.CacheStats` <- `$.CacheStats`

#' Hardware acceleration configuration for ONNX Runtime models
#'
#' Controls which execution provider (CPU, CoreML, CUDA, TensorRT) is used
#' for inference in layout detection and embedding generation.
#' @field provider Execution provider to use for ONNX inference.
#' @field device_id GPU device ID (for CUDA/TensorRT). Ignored for CPU/CoreML/Auto.
#' @export
AccelerationConfig <- new.env(parent = emptyenv())

AccelerationConfig$from_json <- function(json) {
  .Call("wrap__AccelerationConfig__from_json", json, PACKAGE = "kreuzberg")
}

#' @export
`$.AccelerationConfig` <- function(self, name) {
  member <- AccelerationConfig[[name]]
  if (is.null(member)) {
    # `name` is not a method in the class env. Without this fallback every
    # data field of a list-backed instance would read as NULL through `$`.
    # NOTE(review): instances are described as "list with class attribute";
    # confirm against the Rust side. Non-list instances keep returning NULL.
    if (is.list(self)) {
      return(.subset2(self, name))
    }
    return(NULL)
  }
  if (identical(names(formals(member))[1], "self")) {
    # Bind the receiver so `obj$method(...)` forwards `obj` as `self`.
    function(...) member(self, ...)
  } else {
    member
  }
}

#' @export
`[[.AccelerationConfig` <- `$.AccelerationConfig`

#' Cross-extractor content filtering configuration
#'
#' Controls whether "furniture" content (headers, footers, page numbers,
#' watermarks, repeating text) is included in or stripped from extraction
#' results. Applies across all extractors (PDF, DOCX, RTF, ODT, HTML, etc.)
#' with format-specific implementation.
#'
#' When `None` on `ExtractionConfig`, each extractor uses its current
#' default behavior unchanged.
#' @field include_headers Include running headers in extraction output.
#' @field include_footers Include running footers in extraction output.
#' @field strip_repeating_text Enable the heuristic cross-page repeating text detector.
#' @field include_watermarks Include watermark text in extraction output.
#' @export
ContentFilterConfig <- new.env(parent = emptyenv())

ContentFilterConfig$default <- function() {
  .Call("wrap__ContentFilterConfig__default", PACKAGE = "kreuzberg")
}

ContentFilterConfig$from_json <- function(json) {
  .Call("wrap__ContentFilterConfig__from_json", json, PACKAGE = "kreuzberg")
}

#' @export
`$.ContentFilterConfig` <- function(self, name) {
  member <- ContentFilterConfig[[name]]
  if (is.null(member)) {
    # Not a method: serve the stored field of a list-backed instance.
    # NOTE(review): list-backed instances are assumed from the docs; confirm.
    if (is.list(self)) {
      return(.subset2(self, name))
    }
    return(NULL)
  }
  if (identical(names(formals(member))[1], "self")) {
    # Bind the receiver so `obj$method(...)` forwards `obj` as `self`.
    function(...) member(self, ...)
  } else {
    member
  }
}

#' @export
`[[.ContentFilterConfig` <- `$.ContentFilterConfig`

#' Configuration for email extraction
#' @field msg_fallback_codepage Windows codepage number to use when an MSG file contains no codepage property. Defaults
#' @export
EmailConfig <- new.env(parent = emptyenv())

EmailConfig$from_json <- function(json) {
  .Call("wrap__EmailConfig__from_json", json, PACKAGE = "kreuzberg")
}

#' @export
`$.EmailConfig` <- function(self, name) {
  member <- EmailConfig[[name]]
  if (is.null(member)) {
    # Not a method: serve the stored field of a list-backed instance.
    # NOTE(review): list-backed instances are assumed from the docs; confirm.
    if (is.list(self)) {
      return(.subset2(self, name))
    }
    return(NULL)
  }
  if (identical(names(formals(member))[1], "self")) {
    # Bind the receiver so `obj$method(...)` forwards `obj` as `self`.
    function(...) member(self, ...)
  } else {
    member
  }
}

#' @export
`[[.EmailConfig` <- `$.EmailConfig`

#' Main extraction configuration
#'
#' This struct contains all configuration options for the extraction process.
#' It can be loaded from TOML, YAML, or JSON files, or created programmatically.
#' @field use_cache Enable caching of extraction results
#' @field enable_quality_processing Enable quality post-processing
#' @field ocr OCR configuration (None = OCR disabled)
#' @field force_ocr Force OCR even for searchable PDFs
#' @field force_ocr_pages Force OCR on specific pages only (1-indexed page numbers, must be >= 1).
#' @field disable_ocr Disable OCR entirely, even for images.
#' @field chunking Text chunking configuration (None = chunking disabled)
#' @field content_filter Content filtering configuration (None = use extractor defaults).
#' @field images Image extraction configuration (None = no image extraction)
#' @field pdf_options PDF-specific options (None = use defaults)
#' @field token_reduction Token reduction configuration (None = no token reduction)
#' @field language_detection Language detection configuration (None = no language detection)
#' @field pages Page extraction configuration (None = no page tracking)
#' @field keywords Keyword extraction configuration (None = no keyword extraction)
#' @field postprocessor Post-processor configuration (None = use defaults)
#' @field html_options HTML to Markdown conversion options (None = use defaults)
#' @field html_output Styled HTML output configuration.
#' @field extraction_timeout_secs Default per-file timeout in seconds for batch extraction.
#' @field max_concurrent_extractions Maximum concurrent extractions in batch operations (None = (num_cpus × 1.5), rounded up).
#' @field result_format Result structure format
#' @field security_limits Security limits for archive extraction.
#' @field max_embedded_file_bytes Maximum uncompressed size in bytes for a single embedded file before recursive
#' @field output_format Content text format (default: Plain).
#' @field layout Layout detection configuration (None = layout detection disabled).
#' @field use_layout_for_markdown Run layout detection on the non-OCR PDF markdown path.
#' @field include_document_structure Enable structured document tree output.
#' @field acceleration Hardware acceleration configuration for ONNX Runtime models.
#' @field cache_namespace Cache namespace for tenant isolation.
#' @field cache_ttl_secs Per-request cache TTL in seconds.
#' @field email Email extraction configuration (None = use defaults).
#' @field concurrency Concurrency limits for constrained environments (None = use defaults).
#' @field max_archive_depth Maximum recursion depth for archive extraction (default: 3). Set to 0 to disable recursive
#' @field tree_sitter Tree-sitter language pack configuration (None = tree-sitter disabled).
#' @field structured_extraction Structured extraction via LLM (None = disabled).
#' @field cancel_token Cancellation token for this extraction (None = no external cancellation).
#' @export
ExtractionConfig <- new.env(parent = emptyenv())

ExtractionConfig$default <- function() {
  .Call("wrap__ExtractionConfig__default", PACKAGE = "kreuzberg")
}

ExtractionConfig$needs_image_processing <- function(self) {
  .Call(
    "wrap__ExtractionConfig__needs_image_processing",
    self,
    PACKAGE = "kreuzberg"
  )
}

ExtractionConfig$from_json <- function(json) {
  .Call("wrap__ExtractionConfig__from_json", json, PACKAGE = "kreuzberg")
}

#' @export
`$.ExtractionConfig` <- function(self, name) {
  member <- ExtractionConfig[[name]]
  if (is.null(member)) {
    # `name` is not a method in the class env. Without this fallback every
    # data field of a list-backed instance would read as NULL through `$`.
    # NOTE(review): instances are described as "list with class attribute";
    # confirm against the Rust side. Non-list instances keep returning NULL.
    if (is.list(self)) {
      return(.subset2(self, name))
    }
    return(NULL)
  }
  if (identical(names(formals(member))[1], "self")) {
    # Bind the receiver so `obj$method(...)` forwards `obj` as `self`.
    function(...) member(self, ...)
  } else {
    member
  }
}

#' @export
`[[.ExtractionConfig` <- `$.ExtractionConfig`

#' @export
needs_image_processing.ExtractionConfig <- function(x, ...) {
  # S3 entry point: forwards to the bound method resolved through `$`.
  x$needs_image_processing(...)
}

#' Per-file extraction configuration overrides for batch processing
#'
#' All fields are `Option` — `None` means "use the batch-level default."
#' This type is used with `batch_extract_files` and
#' `batch_extract_bytes` to allow heterogeneous
#' extraction settings within a single batch.
#'
#' @section Excluded Fields:
#' The following `ExtractionConfig` fields are batch-level only and
#' cannot be overridden per file:
#' - `max_concurrent_extractions` — controls batch parallelism
#' - `use_cache` — global caching policy
#' - `acceleration` — shared ONNX execution provider
#' - `security_limits` — global archive security policy
#' @field enable_quality_processing Override quality post-processing for this file.
#' @field ocr Override OCR configuration for this file (None in the Option = use batch default).
#' @field force_ocr Override force OCR for this file.
#' @field force_ocr_pages Override force OCR pages for this file (1-indexed page numbers).
#' @field disable_ocr Override disable OCR for this file.
#' @field chunking Override chunking configuration for this file.
#' @field content_filter Override content filtering configuration for this file.
#' @field images Override image extraction configuration for this file.
#' @field pdf_options Override PDF options for this file.
#' @field token_reduction Override token reduction for this file.
#' @field language_detection Override language detection for this file.
#' @field pages Override page extraction for this file.
#' @field keywords Override keyword extraction for this file.
#' @field postprocessor Override post-processor for this file.
#' @field html_options Override HTML conversion options for this file.
#' @field result_format Override result format for this file.
#' @field output_format Override output content format for this file.
#' @field include_document_structure Override document structure output for this file.
#' @field layout Override layout detection for this file.
#' @field timeout_secs Override per-file extraction timeout in seconds.
#' @field tree_sitter Override tree-sitter configuration for this file.
#' @field structured_extraction Override structured extraction configuration for this file.
#' @export
FileExtractionConfig <- new.env(parent = emptyenv())

FileExtractionConfig$from_json <- function(json) {
  .Call("wrap__FileExtractionConfig__from_json", json, PACKAGE = "kreuzberg")
}

#' @export
`$.FileExtractionConfig` <- function(self, name) {
  member <- FileExtractionConfig[[name]]
  if (is.null(member)) {
    # `name` is not a method in the class env. Without this fallback every
    # data field of a list-backed instance would read as NULL through `$`.
    # NOTE(review): instances are described as "list with class attribute";
    # confirm against the Rust side. Non-list instances keep returning NULL.
    if (is.list(self)) {
      return(.subset2(self, name))
    }
    return(NULL)
  }
  if (identical(names(formals(member))[1], "self")) {
    # Bind the receiver so `obj$method(...)` forwards `obj` as `self`.
    function(...) member(self, ...)
  } else {
    member
  }
}

#' @export
`[[.FileExtractionConfig` <- `$.FileExtractionConfig`

#' Batch item for byte array extraction
#'
#' Used with `batch_extract_bytes` and `batch_extract_bytes_sync`
#' to represent a single item in a batch extraction job.
#' @field content The content bytes to extract from
#' @field mime_type MIME type of the content (e.g., "application/pdf", "text/html")
#' @field config Per-item configuration overrides (None uses batch-level defaults)
#' @export
BatchBytesItem <- new.env(parent = emptyenv())

#' @export
`$.BatchBytesItem` <- function(self, name) {
  member <- BatchBytesItem[[name]]
  if (is.null(member)) {
    # The class env defines no methods here, so a list-backed item must
    # expose its own fields; otherwise `item$mime_type` would be NULL.
    # NOTE(review): list-backed instances are assumed from the docs; confirm.
    if (is.list(self)) {
      return(.subset2(self, name))
    }
    return(NULL)
  }
  if (identical(names(formals(member))[1], "self")) {
    # Bind the receiver so `obj$method(...)` forwards `obj` as `self`.
    function(...) member(self, ...)
  } else {
    member
  }
}

#' @export
`[[.BatchBytesItem` <- `$.BatchBytesItem`

#' Batch item for file extraction
#'
#' Used with `batch_extract_files` and `batch_extract_files_sync`
#' to represent a single file in a batch extraction job.
#' @field path Path to the file to extract from
#' @field config Per-file configuration overrides (None uses batch-level defaults)
#' @export
BatchFileItem <- new.env(parent = emptyenv())

#' @export
`$.BatchFileItem` <- function(self, name) {
  member <- BatchFileItem[[name]]
  if (is.null(member)) {
    # The class env defines no methods here, so a list-backed item must
    # expose its own fields; otherwise `item$path` would always be NULL.
    # NOTE(review): instances are described as "list with class attribute";
    # confirm against the Rust side. Non-list instances keep returning NULL.
    if (is.list(self)) {
      return(.subset2(self, name))
    }
    return(NULL)
  }
  if (identical(names(formals(member))[1], "self")) {
    # Bind the receiver so `obj$method(...)` forwards `obj` as `self`.
    function(...) member(self, ...)
  } else {
    member
  }
}

#' @export
`[[.BatchFileItem` <- `$.BatchFileItem`

#' Image extraction configuration
#' @field extract_images Extract images from documents
#' @field target_dpi Target DPI for image normalization
#' @field max_image_dimension Maximum dimension for images (width or height)
#' @field inject_placeholders Whether to inject image reference placeholders into markdown output. When `true`
#' @field auto_adjust_dpi Automatically adjust DPI based on image content
#' @field min_dpi Minimum DPI threshold
#' @field max_dpi Maximum DPI threshold
#' @field max_images_per_page Maximum number of image objects to extract per PDF page.
#' @field classify When `true` (default), extracted images are classified by kind and grouped into clusters where they
#' @field include_page_rasters When `true`, full-page renders produced during OCR preprocessing are captured and
#' @field run_ocr_on_images Run OCR on extracted images and include the recognized text in the document content.
#' @field ocr_text_only When `true`, image OCR results are rendered as plain text without the `![...](...)` markdown
#' @field append_ocr_text When `true` and `ocr_text_only` is `false`, append the OCR text after the image placeholder
#' @export
ImageExtractionConfig <- new.env(parent = emptyenv())

ImageExtractionConfig$default <- function() {
  .Call("wrap__ImageExtractionConfig__default", PACKAGE = "kreuzberg")
}

ImageExtractionConfig$from_json <- function(json) {
  .Call("wrap__ImageExtractionConfig__from_json", json, PACKAGE = "kreuzberg")
}

#' @export
`$.ImageExtractionConfig` <- function(self, name) {
  member <- ImageExtractionConfig[[name]]
  if (is.null(member)) {
    # `name` is not a method in the class env. Without this fallback every
    # data field of a list-backed instance would read as NULL through `$`.
    # NOTE(review): instances are described as "list with class attribute";
    # confirm against the Rust side. Non-list instances keep returning NULL.
    if (is.list(self)) {
      return(.subset2(self, name))
    }
    return(NULL)
  }
  if (identical(names(formals(member))[1], "self")) {
    # Bind the receiver so `obj$method(...)` forwards `obj` as `self`.
    function(...) member(self, ...)
  } else {
    member
  }
}

#' @export
`[[.ImageExtractionConfig` <- `$.ImageExtractionConfig`

#' Token reduction configuration
#' @field mode Reduction mode: "off", "light", "moderate", "aggressive", "maximum"
#' @field preserve_important_words Preserve important words (capitalized, technical terms)
#' @export
TokenReductionOptions <- new.env(parent = emptyenv())

TokenReductionOptions$default <- function() {
  .Call("wrap__TokenReductionOptions__default", PACKAGE = "kreuzberg")
}

TokenReductionOptions$from_json <- function(json) {
  .Call("wrap__TokenReductionOptions__from_json", json, PACKAGE = "kreuzberg")
}

#' @export
`$.TokenReductionOptions` <- function(self, name) {
  member <- TokenReductionOptions[[name]]
  if (is.null(member)) {
    # Not a method: serve the stored field of a list-backed instance.
    # NOTE(review): list-backed instances are assumed from the docs; confirm.
    if (is.list(self)) {
      return(.subset2(self, name))
    }
    return(NULL)
  }
  if (identical(names(formals(member))[1], "self")) {
    # Bind the receiver so `obj$method(...)` forwards `obj` as `self`.
    function(...) member(self, ...)
  } else {
    member
  }
}

#' @export
`[[.TokenReductionOptions` <- `$.TokenReductionOptions`

#' Language detection configuration
#' @field enabled Enable language detection
#' @field min_confidence Minimum confidence threshold (0.0-1.0)
#' @field detect_multiple Detect multiple languages in the document
#' @export
LanguageDetectionConfig <- new.env(parent = emptyenv())

LanguageDetectionConfig$default <- function() {
  .Call("wrap__LanguageDetectionConfig__default", PACKAGE = "kreuzberg")
}

LanguageDetectionConfig$from_json <- function(json) {
  .Call("wrap__LanguageDetectionConfig__from_json", json, PACKAGE = "kreuzberg")
}

#' @export
`$.LanguageDetectionConfig` <- function(self, name) {
  member <- LanguageDetectionConfig[[name]]
  if (is.null(member)) {
    # Not a method: serve the stored field of a list-backed instance.
    # NOTE(review): list-backed instances are assumed from the docs; confirm.
    if (is.list(self)) {
      return(.subset2(self, name))
    }
    return(NULL)
  }
  if (identical(names(formals(member))[1], "self")) {
    # Bind the receiver so `obj$method(...)` forwards `obj` as `self`.
    function(...) member(self, ...)
  } else {
    member
  }
}

#' @export
`[[.LanguageDetectionConfig` <- `$.LanguageDetectionConfig`

#' Configuration for styled HTML output
#'
#' When set on [`ExtractionConfig::html_output`] alongside
#' `output_format = OutputFormat::Html`, the pipeline builds a
#' [`StyledHtmlRenderer`](crate::rendering::StyledHtmlRenderer) instead of
#' the plain comrak-based renderer.
#' @field css Inline CSS string injected into the output after the theme stylesheet. Concatenated after `css_file`
#' @field css_file Path to a CSS file loaded once at renderer construction time. Concatenated before `css` when both
#' @field theme Built-in colour/typography theme. Default: [`HtmlTheme::Unstyled`].
#' @field class_prefix CSS class prefix applied to every emitted class name.
#' @field embed_css When `true` (default), write the resolved CSS into a `