// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;

import java.util.List;
import java.util.Objects;
import org.jspecify.annotations.Nullable;

/**
 * Static entry points of the Kreuzberg Java binding.
 *
 * <p>Every method validates its non-nullable reference arguments and then delegates to
 * {@code KreuzbergRs} or to one of the plugin bridge classes. The class cannot be instantiated.
 */
public final class Kreuzberg {

    private Kreuzberg() {
    }

    /**
     * Extract content from a byte array.
     *
     * <p>This is the main entry point for in-memory extraction. It performs the following steps:
     * <ol>
     *   <li>Validate MIME type</li>
     *   <li>Handle legacy format conversion if needed</li>
     *   <li>Select appropriate extractor from registry</li>
     *   <li>Extract content</li>
     *   <li>Run post-processing pipeline</li>
     * </ol>
     *
     * @param content the byte array to extract
     * @param mimeType MIME type of the content
     * @param config extraction configuration
     * @return an {@code ExtractionResult} containing the extracted content and metadata
     * @throws KreuzbergRsException {@code KreuzbergError.Validation} if the MIME type is invalid;
     *     {@code KreuzbergError.UnsupportedFormat} if the MIME type is not supported
     * @throws NullPointerException if any argument is {@code null}
     */
    public static ExtractionResult extractBytes(
            final byte[] content,
            final String mimeType,
            final ExtractionConfig config
    ) throws KreuzbergRsException {
        Objects.requireNonNull(content, "content must not be null");
        Objects.requireNonNull(mimeType, "mimeType must not be null");
        Objects.requireNonNull(config, "config must not be null");
        return KreuzbergRs.extractBytes(content, mimeType, config);
    }

    /**
     * Extract content from a file.
     *
     * <p>This is the main entry point for file-based extraction. It performs the following steps:
     * <ol>
     *   <li>Check cache for existing result (if caching enabled)</li>
     *   <li>Detect or validate MIME type</li>
     *   <li>Select appropriate extractor from registry</li>
     *   <li>Extract content</li>
     *   <li>Run post-processing pipeline</li>
     *   <li>Store result in cache (if caching enabled)</li>
     * </ol>
     *
     * @param path path to the file to extract
     * @param mimeType optional MIME type override; if {@code null}, it will be auto-detected
     * @param config extraction configuration
     * @return an {@code ExtractionResult} containing the extracted content and metadata
     * @throws KreuzbergRsException {@code KreuzbergError.Io} if the file doesn't exist (NotFound)
     *     or for other file I/O errors; {@code KreuzbergError.UnsupportedFormat} if the MIME type
     *     is not supported
     * @throws NullPointerException if {@code path} or {@code config} is {@code null}
     */
    public static ExtractionResult extractFile(
            final java.nio.file.Path path,
            final @Nullable String mimeType,
            final ExtractionConfig config
    ) throws KreuzbergRsException {
        Objects.requireNonNull(path, "path must not be null");
        Objects.requireNonNull(config, "config must not be null");
        return KreuzbergRs.extractFile(path, mimeType, config);
    }

    /**
     * Extract content from a file, letting the MIME type be auto-detected.
     *
     * <p>Equivalent to {@link #extractFile(java.nio.file.Path, String, ExtractionConfig)} with a
     * {@code null} MIME type.
     *
     * @param path path to the file to extract
     * @param config extraction configuration
     * @return an {@code ExtractionResult} containing the extracted content and metadata
     * @throws KreuzbergRsException if extraction fails
     * @throws NullPointerException if {@code path} or {@code config} is {@code null}
     */
    public static ExtractionResult extractFile(
            final java.nio.file.Path path,
            final ExtractionConfig config
    ) throws KreuzbergRsException {
        // Delegate to the full overload so both entry points share the same argument validation.
        return extractFile(path, null, config);
    }

    /**
     * Synchronous wrapper for {@code extract_file}.
     *
     * <p>This is a convenience function that blocks the current thread until extraction completes.
     * For async code, use {@code extract_file} directly.
     *
     * <p>Uses the global Tokio runtime for 100x+ performance improvement over creating
     * a new runtime per call. Always uses the global runtime to avoid nested runtime issues.
     *
     * <p>This function is only available with the {@code tokio-runtime} feature. For WASM targets,
     * use a truly synchronous extraction approach instead.
     *
     * @param path path to the file to extract
     * @param mimeType optional MIME type override; may be {@code null}
     * @param config extraction configuration
     * @return the extraction result
     * @throws KreuzbergRsException if extraction fails
     * @throws NullPointerException if {@code path} or {@code config} is {@code null}
     */
    public static ExtractionResult extractFileSync(
            final java.nio.file.Path path,
            final @Nullable String mimeType,
            final ExtractionConfig config
    ) throws KreuzbergRsException {
        Objects.requireNonNull(path, "path must not be null");
        Objects.requireNonNull(config, "config must not be null");
        return KreuzbergRs.extractFileSync(path, mimeType, config);
    }

    /**
     * Synchronous wrapper for {@code extract_file} without a MIME type override.
     *
     * <p>Equivalent to {@link #extractFileSync(java.nio.file.Path, String, ExtractionConfig)} with
     * a {@code null} MIME type.
     *
     * @param path path to the file to extract
     * @param config extraction configuration
     * @return the extraction result
     * @throws KreuzbergRsException if extraction fails
     * @throws NullPointerException if {@code path} or {@code config} is {@code null}
     */
    public static ExtractionResult extractFileSync(
            final java.nio.file.Path path,
            final ExtractionConfig config
    ) throws KreuzbergRsException {
        // Delegate to the full overload so both entry points share the same argument validation.
        return extractFileSync(path, null, config);
    }

    /**
     * Synchronous wrapper for {@code extract_bytes}.
     *
     * <p>Uses the global Tokio runtime for 100x+ performance improvement over creating
     * a new runtime per call.
     *
     * <p>With the {@code tokio-runtime} feature, this blocks the current thread using the global
     * Tokio runtime. Without it (WASM), this calls a truly synchronous implementation.
     *
     * @param content the byte array to extract
     * @param mimeType MIME type of the content
     * @param config extraction configuration
     * @return the extraction result
     * @throws KreuzbergRsException if extraction fails
     * @throws NullPointerException if any argument is {@code null}
     */
    public static ExtractionResult extractBytesSync(
            final byte[] content,
            final String mimeType,
            final ExtractionConfig config
    ) throws KreuzbergRsException {
        Objects.requireNonNull(content, "content must not be null");
        Objects.requireNonNull(mimeType, "mimeType must not be null");
        Objects.requireNonNull(config, "config must not be null");
        return KreuzbergRs.extractBytesSync(content, mimeType, config);
    }

    /**
     * Synchronous wrapper for {@code batch_extract_files}.
     *
     * <p>Uses the global Tokio runtime for optimal performance.
     * Only available with {@code tokio-runtime} (WASM has no filesystem).
     *
     * @param items the files to extract
     * @param config batch-level extraction configuration
     * @return the extraction results
     * @throws KreuzbergRsException if the batch fails
     * @throws NullPointerException if {@code items} or {@code config} is {@code null}
     */
    public static List<ExtractionResult> batchExtractFilesSync(
            final List<BatchFileItem> items,
            final ExtractionConfig config
    ) throws KreuzbergRsException {
        Objects.requireNonNull(items, "items must not be null");
        Objects.requireNonNull(config, "config must not be null");
        return KreuzbergRs.batchExtractFilesSync(items, config);
    }

    /**
     * Synchronous wrapper for {@code batch_extract_bytes}.
     *
     * <p>Uses the global Tokio runtime for optimal performance.
     * With the {@code tokio-runtime} feature, this blocks the current thread using the global
     * Tokio runtime. Without it (WASM), this calls a truly synchronous implementation
     * that iterates through items and calls {@code extract_bytes_sync()}.
     *
     * @param items the in-memory documents to extract
     * @param config batch-level extraction configuration
     * @return the extraction results
     * @throws KreuzbergRsException if the batch fails
     * @throws NullPointerException if {@code items} or {@code config} is {@code null}
     */
    public static List<ExtractionResult> batchExtractBytesSync(
            final List<BatchBytesItem> items,
            final ExtractionConfig config
    ) throws KreuzbergRsException {
        Objects.requireNonNull(items, "items must not be null");
        Objects.requireNonNull(config, "config must not be null");
        return KreuzbergRs.batchExtractBytesSync(items, config);
    }

    /**
     * Extract content from multiple files concurrently.
     *
     * <p>This function processes multiple files in parallel, automatically managing
     * concurrency to prevent resource exhaustion. The concurrency limit can be
     * configured via {@code ExtractionConfig.max_concurrent_extractions} or defaults
     * to {@code (num_cpus * 1.5).ceil()}.
     *
     * <p>Each file can optionally specify a FileExtractionConfig that overrides specific
     * fields from the batch-level {@code config}. Leave the per-file configuration unset to use
     * the batch defaults. Batch-level settings like {@code max_concurrent_extractions} and
     * {@code use_cache} are always taken from the batch-level {@code config}.
     *
     * @param items list of {@code BatchFileItem}, each containing a path and optional per-file
     *     configuration overrides
     * @param config batch-level extraction configuration (provides defaults and batch settings)
     * @return a list of {@code ExtractionResult} in the same order as the input items
     * @throws KreuzbergRsException individual file errors are captured in the result metadata;
     *     system errors (IO, RuntimeError equivalents) bubble up and fail the entire batch
     * @throws NullPointerException if {@code items} or {@code config} is {@code null}
     */
    public static List<ExtractionResult> batchExtractFiles(
            final List<BatchFileItem> items,
            final ExtractionConfig config
    ) throws KreuzbergRsException {
        Objects.requireNonNull(items, "items must not be null");
        Objects.requireNonNull(config, "config must not be null");
        return KreuzbergRs.batchExtractFiles(items, config);
    }

    /**
     * Extract content from multiple byte arrays concurrently.
     *
     * <p>This function processes multiple byte arrays in parallel, automatically managing
     * concurrency to prevent resource exhaustion. The concurrency limit can be
     * configured via {@code ExtractionConfig.max_concurrent_extractions} or defaults
     * to {@code (num_cpus * 1.5).ceil()}.
     *
     * <p>Each item can optionally specify a FileExtractionConfig that overrides specific
     * fields from the batch-level {@code config}. Leave the per-item configuration unset to use
     * the batch-level defaults for that item.
     *
     * @param items list of {@code BatchBytesItem}, each containing content bytes, MIME type, and
     *     optional per-item configuration overrides
     * @param config batch-level extraction configuration
     * @return a list of {@code ExtractionResult} in the same order as the input items
     * @throws KreuzbergRsException if the batch fails
     * @throws NullPointerException if {@code items} or {@code config} is {@code null}
     */
    public static List<ExtractionResult> batchExtractBytes(
            final List<BatchBytesItem> items,
            final ExtractionConfig config
    ) throws KreuzbergRsException {
        Objects.requireNonNull(items, "items must not be null");
        Objects.requireNonNull(config, "config must not be null");
        return KreuzbergRs.batchExtractBytes(items, config);
    }

    /**
     * Detect MIME type from raw file bytes.
     *
     * <p>Uses magic byte signatures to detect file type from content.
     * Falls back to {@code infer} crate for comprehensive detection.
     *
     * <p>For ZIP-based files, inspects contents to distinguish Office Open XML
     * formats (DOCX, XLSX, PPTX) from plain ZIP archives.
     *
     * @param content raw file bytes
     * @return the detected MIME type string
     * @throws KreuzbergRsException {@code KreuzbergError.UnsupportedFormat} if the MIME type
     *     cannot be determined
     * @throws NullPointerException if {@code content} is {@code null}
     */
    public static String detectMimeTypeFromBytes(final byte[] content) throws KreuzbergRsException {
        Objects.requireNonNull(content, "content must not be null");
        return KreuzbergRs.detectMimeTypeFromBytes(content);
    }

    /**
     * Get file extensions for a given MIME type.
     *
     * <p>Returns all known file extensions that map to the specified MIME type.
     *
     * @param mimeType the MIME type to look up
     * @return a list of file extensions (without leading dot) for the MIME type
     * @throws KreuzbergRsException if the lookup fails
     * @throws NullPointerException if {@code mimeType} is {@code null}
     */
    public static List<String> getExtensionsForMime(final String mimeType) throws KreuzbergRsException {
        Objects.requireNonNull(mimeType, "mimeType must not be null");
        return KreuzbergRs.getExtensionsForMime(mimeType);
    }

    /**
     * Clear all embedding backends from the global registry.
     *
     * <p>Calls {@code shutdown()} on every registered backend, then empties the registry.
     *
     * @throws KreuzbergRsException any error returned by a backend's {@code shutdown()} method;
     *     the first error encountered stops processing of remaining backends
     */
    public static void clearEmbeddingBackends() throws KreuzbergRsException {
        KreuzbergRs.clearEmbeddingBackends();
    }

    /**
     * List the names of all registered embedding backends.
     *
     * <p>Used by {@code kreuzberg-cli}, the api/mcp endpoints, and generated language
     * bindings.
     *
     * @return the registered embedding backend names
     * @throws KreuzbergRsException if the native call fails
     */
    public static List<String> listEmbeddingBackends() throws KreuzbergRsException {
        return KreuzbergRs.listEmbeddingBackends();
    }

    /**
     * List names of all registered document extractors.
     *
     * @return the registered document extractor names
     * @throws KreuzbergRsException if the native call fails
     */
    public static List<String> listDocumentExtractors() throws KreuzbergRsException {
        return KreuzbergRs.listDocumentExtractors();
    }

    /**
     * Clear all document extractors from the global registry.
     *
     * <p>Calls {@code shutdown()} on every registered extractor, then empties the registry.
     *
     * @throws KreuzbergRsException any error returned by an extractor's {@code shutdown()}
     *     method; the first error encountered stops processing of remaining extractors
     */
    public static void clearDocumentExtractors() throws KreuzbergRsException {
        KreuzbergRs.clearDocumentExtractors();
    }

    /**
     * List all registered OCR backends.
     *
     * <p>Returns the names of all OCR backends currently registered in the global registry.
     *
     * @return a list of OCR backend names
     * @throws KreuzbergRsException if the native call fails
     */
    public static List<String> listOcrBackends() throws KreuzbergRsException {
        return KreuzbergRs.listOcrBackends();
    }

    /**
     * Clear all OCR backends from the global registry.
     *
     * <p>Removes all OCR backends and calls their {@code shutdown()} methods.
     *
     * @throws KreuzbergRsException if any shutdown method failed
     */
    public static void clearOcrBackends() throws KreuzbergRsException {
        KreuzbergRs.clearOcrBackends();
    }

    /**
     * List all registered post-processor names.
     *
     * <p>Returns all post-processor names currently registered in the global registry.
     *
     * @return a list of post-processor names
     * @throws KreuzbergRsException if the registry lock is poisoned
     */
    public static List<String> listPostProcessors() throws KreuzbergRsException {
        return KreuzbergRs.listPostProcessors();
    }

    /**
     * Remove all registered post-processors.
     *
     * @throws KreuzbergRsException if the native call fails
     */
    public static void clearPostProcessors() throws KreuzbergRsException {
        KreuzbergRs.clearPostProcessors();
    }

    /**
     * List names of all registered renderers.
     *
     * @return the registered renderer names
     * @throws KreuzbergRsException if the registry lock is poisoned
     */
    public static List<String> listRenderers() throws KreuzbergRsException {
        return KreuzbergRs.listRenderers();
    }

    /**
     * Clear all renderers from the global registry.
     *
     * <p>Removes every renderer, including the built-in defaults (markdown, html,
     * djot, plain). After calling this no renderers are registered; re-register
     * as needed.
     *
     * @throws KreuzbergRsException if the registry lock is poisoned
     */
    public static void clearRenderers() throws KreuzbergRsException {
        KreuzbergRs.clearRenderers();
    }

    /**
     * List names of all registered validators.
     *
     * @return the registered validator names
     * @throws KreuzbergRsException if the native call fails
     */
    public static List<String> listValidators() throws KreuzbergRsException {
        return KreuzbergRs.listValidators();
    }

    /**
     * Remove all registered validators.
     *
     * @throws KreuzbergRsException if the native call fails
     */
    public static void clearValidators() throws KreuzbergRsException {
        KreuzbergRs.clearValidators();
    }

    /**
     * Compare two extraction results and return a structured diff.
     *
     * <p>The comparison is purely structural — no I/O, no side effects. All fields
     * of ExtractionDiff are populated according to the provided DiffOptions.
     *
     * @param a the "before" extraction result
     * @param b the "after" extraction result
     * @param opts controls which sections are compared and optional truncation
     * @return the structured diff between {@code a} and {@code b}
     * @throws KreuzbergRsException if the native call fails
     * @throws NullPointerException if any argument is {@code null}
     */
    public static ExtractionDiff compare(
            final ExtractionResult a,
            final ExtractionResult b,
            final DiffOptions opts
    ) throws KreuzbergRsException {
        Objects.requireNonNull(a, "a must not be null");
        Objects.requireNonNull(b, "b must not be null");
        Objects.requireNonNull(opts, "opts must not be null");
        return KreuzbergRs.compare(a, b, opts);
    }

    /**
     * Generate embeddings asynchronously for a list of text strings.
     *
     * <p>This is the async counterpart to embed_texts. It offloads the blocking
     * ONNX inference work to a dedicated blocking thread pool via Tokio's
     * {@code spawn_blocking}, keeping the async executor free.
     *
     * <p>Returns one embedding vector per input text in the same order.
     *
     * @param texts strings to embed
     * @param config embedding configuration specifying model, batch size, and normalization
     * @return one embedding vector per input text, in input order
     * @throws KreuzbergRsException {@code KreuzbergError.MissingDependency} if ONNX Runtime is not
     *     installed; {@code KreuzbergError.Embedding} if the preset name is unknown, model
     *     download fails, or the blocking inference task panics
     * @throws NullPointerException if {@code texts} or {@code config} is {@code null}
     */
    // NOTE(review): the Float element type is reconstructed; confirm against KreuzbergRs.
    public static List<List<Float>> embedTextsAsync(
            final List<String> texts,
            final EmbeddingConfig config
    ) throws KreuzbergRsException {
        Objects.requireNonNull(texts, "texts must not be null");
        Objects.requireNonNull(config, "config must not be null");
        return KreuzbergRs.embedTextsAsync(texts, config);
    }

    /**
     * Render a single PDF page to PNG bytes.
     *
     * <p>Returns raw PNG-encoded bytes for the specified page at the given DPI.
     * Uses pdf_oxide with tiny-skia for pure-Rust rendering.
     *
     * @param pdfBytes raw PDF file bytes
     * @param pageIndex zero-based page index
     * @param dpi resolution in dots per inch (default: 150); may be {@code null}
     * @param password optional password for encrypted PDFs; may be {@code null}
     * @return the PNG-encoded page image
     * @throws KreuzbergRsException {@code KreuzbergError.Parsing} if the PDF cannot be opened,
     *     authenticated, or rendered, or if {@code pageIndex} is out of range
     * @throws NullPointerException if {@code pdfBytes} is {@code null}
     */
    public static byte[] renderPdfPageToPng(
            final byte[] pdfBytes,
            final long pageIndex,
            final @Nullable Integer dpi,
            final @Nullable String password
    ) throws KreuzbergRsException {
        // pageIndex is a primitive and can never be null, so only pdfBytes needs checking.
        Objects.requireNonNull(pdfBytes, "pdfBytes must not be null");
        return KreuzbergRs.renderPdfPageToPng(pdfBytes, pageIndex, dpi, password);
    }

    /**
     * Render a single PDF page to PNG bytes with the default DPI and no password.
     *
     * <p>Equivalent to {@link #renderPdfPageToPng(byte[], long, Integer, String)} with a
     * {@code null} DPI and a {@code null} password.
     *
     * @param pdfBytes raw PDF file bytes
     * @param pageIndex zero-based page index
     * @return the PNG-encoded page image
     * @throws KreuzbergRsException {@code KreuzbergError.Parsing} if the PDF cannot be opened or
     *     rendered, or if {@code pageIndex} is out of range
     * @throws NullPointerException if {@code pdfBytes} is {@code null}
     */
    public static byte[] renderPdfPageToPng(
            final byte[] pdfBytes,
            final long pageIndex
    ) throws KreuzbergRsException {
        // Pass null (not 0) for dpi: the parameter is nullable and an absent value is how the
        // documented default is requested. An explicit 0 would ask for a 0-DPI render.
        return renderPdfPageToPng(pdfBytes, pageIndex, null, null);
    }

    /**
     * Detect the MIME type of a file at the given path.
     *
     * <p>Uses the file extension and optionally the file content to determine the MIME type.
     * Set {@code checkExists} to {@code true} to verify the file exists before detection.
     *
     * @param path path of the file to inspect
     * @param checkExists whether to verify that the file exists before detection
     * @return the detected MIME type string
     * @throws KreuzbergRsException if detection fails
     * @throws NullPointerException if {@code path} is {@code null}
     */
    public static String detectMimeType(
            final String path,
            final boolean checkExists
    ) throws KreuzbergRsException {
        // checkExists is a primitive and can never be null, so only path needs checking.
        Objects.requireNonNull(path, "path must not be null");
        return KreuzbergRs.detectMimeType(path, checkExists);
    }

    /**
     * Embed a list of texts using the configured embedding model.
     *
     * <p>Returns a 2D list where each inner list is the embedding for the corresponding text.
     *
     * @param texts strings to embed
     * @param config embedding configuration
     * @return one embedding vector per input text
     * @throws KreuzbergRsException if embedding fails
     * @throws NullPointerException if {@code texts} or {@code config} is {@code null}
     */
    // NOTE(review): the Float element type is reconstructed; confirm against KreuzbergRs.
    public static List<List<Float>> embedTexts(
            final List<String> texts,
            final EmbeddingConfig config
    ) throws KreuzbergRsException {
        Objects.requireNonNull(texts, "texts must not be null");
        Objects.requireNonNull(config, "config must not be null");
        return KreuzbergRs.embedTexts(texts, config);
    }

    /**
     * Get an embedding preset by name.
     *
     * <p>Returns an owned clone so the value is safe to pass across FFI boundaries.
     *
     * @param name the preset name to look up
     * @return the preset, or {@code null} if no preset with the given name exists
     * @throws KreuzbergRsException if the native call fails
     * @throws NullPointerException if {@code name} is {@code null}
     */
    public static @Nullable EmbeddingPreset getEmbeddingPreset(final String name) throws KreuzbergRsException {
        Objects.requireNonNull(name, "name must not be null");
        return KreuzbergRs.getEmbeddingPreset(name).orElse(null);
    }

    /**
     * List the names of all available embedding presets.
     *
     * <p>Returns owned {@code String}s so the values are safe to pass across FFI boundaries.
     *
     * @return the available embedding preset names
     * @throws KreuzbergRsException if the native call fails
     */
    public static List<String> listEmbeddingPresets() throws KreuzbergRsException {
        return KreuzbergRs.listEmbeddingPresets();
    }

    /**
     * Register an OCR backend via {@code OcrBackendBridge}.
     *
     * @param impl the OCR backend implementation to register
     * @throws KreuzbergRsException wrapping any exception raised by the bridge; the original
     *     message and cause are preserved
     */
    public static void registerOcrBackend(final IOcrBackend impl) throws KreuzbergRsException {
        try {
            OcrBackendBridge.registerOcrBackend(impl);
        } catch (Exception e) {
            throw new KreuzbergRsException(e.getMessage(), e);
        }
    }

    /**
     * Unregister an OCR backend via {@code OcrBackendBridge}.
     *
     * @param name name identifying the backend to remove (presumably the name it was registered
     *     under; verify against the bridge)
     * @throws KreuzbergRsException wrapping any exception raised by the bridge; the original
     *     message and cause are preserved
     */
    public static void unregisterOcrBackend(final String name) throws KreuzbergRsException {
        try {
            OcrBackendBridge.unregisterOcrBackend(name);
        } catch (Exception e) {
            throw new KreuzbergRsException(e.getMessage(), e);
        }
    }

    /**
     * Register a post-processor via {@code PostProcessorBridge}.
     *
     * @param impl the post-processor implementation to register
     * @throws KreuzbergRsException wrapping any exception raised by the bridge; the original
     *     message and cause are preserved
     */
    public static void registerPostProcessor(final IPostProcessor impl) throws KreuzbergRsException {
        try {
            PostProcessorBridge.registerPostProcessor(impl);
        } catch (Exception e) {
            throw new KreuzbergRsException(e.getMessage(), e);
        }
    }

    /**
     * Unregister a post-processor via {@code PostProcessorBridge}.
     *
     * @param name name identifying the post-processor to remove (presumably the name it was
     *     registered under; verify against the bridge)
     * @throws KreuzbergRsException wrapping any exception raised by the bridge; the original
     *     message and cause are preserved
     */
    public static void unregisterPostProcessor(final String name) throws KreuzbergRsException {
        try {
            PostProcessorBridge.unregisterPostProcessor(name);
        } catch (Exception e) {
            throw new KreuzbergRsException(e.getMessage(), e);
        }
    }

    /**
     * Register a validator via {@code ValidatorBridge}.
     *
     * @param impl the validator implementation to register
     * @throws KreuzbergRsException wrapping any exception raised by the bridge; the original
     *     message and cause are preserved
     */
    public static void registerValidator(final IValidator impl) throws KreuzbergRsException {
        try {
            ValidatorBridge.registerValidator(impl);
        } catch (Exception e) {
            throw new KreuzbergRsException(e.getMessage(), e);
        }
    }

    /**
     * Unregister a validator via {@code ValidatorBridge}.
     *
     * @param name name identifying the validator to remove (presumably the name it was
     *     registered under; verify against the bridge)
     * @throws KreuzbergRsException wrapping any exception raised by the bridge; the original
     *     message and cause are preserved
     */
    public static void unregisterValidator(final String name) throws KreuzbergRsException {
        try {
            ValidatorBridge.unregisterValidator(name);
        } catch (Exception e) {
            throw new KreuzbergRsException(e.getMessage(), e);
        }
    }

    /**
     * Register an embedding backend via {@code EmbeddingBackendBridge}.
     *
     * @param impl the embedding backend implementation to register
     * @throws KreuzbergRsException wrapping any exception raised by the bridge; the original
     *     message and cause are preserved
     */
    public static void registerEmbeddingBackend(final IEmbeddingBackend impl) throws KreuzbergRsException {
        try {
            EmbeddingBackendBridge.registerEmbeddingBackend(impl);
        } catch (Exception e) {
            throw new KreuzbergRsException(e.getMessage(), e);
        }
    }

    /**
     * Unregister an embedding backend via {@code EmbeddingBackendBridge}.
     *
     * @param name name identifying the backend to remove (presumably the name it was registered
     *     under; verify against the bridge)
     * @throws KreuzbergRsException wrapping any exception raised by the bridge; the original
     *     message and cause are preserved
     */
    public static void unregisterEmbeddingBackend(final String name) throws KreuzbergRsException {
        try {
            EmbeddingBackendBridge.unregisterEmbeddingBackend(name);
        } catch (Exception e) {
            throw new KreuzbergRsException(e.getMessage(), e);
        }
    }

    /**
     * Register a document extractor via {@code DocumentExtractorBridge}.
     *
     * @param impl the document extractor implementation to register
     * @throws KreuzbergRsException wrapping any exception raised by the bridge; the original
     *     message and cause are preserved
     */
    public static void registerDocumentExtractor(final IDocumentExtractor impl) throws KreuzbergRsException {
        try {
            DocumentExtractorBridge.registerDocumentExtractor(impl);
        } catch (Exception e) {
            throw new KreuzbergRsException(e.getMessage(), e);
        }
    }

    /**
     * Unregister a document extractor via {@code DocumentExtractorBridge}.
     *
     * @param name name identifying the extractor to remove (presumably the name it was
     *     registered under; verify against the bridge)
     * @throws KreuzbergRsException wrapping any exception raised by the bridge; the original
     *     message and cause are preserved
     */
    public static void unregisterDocumentExtractor(final String name) throws KreuzbergRsException {
        try {
            DocumentExtractorBridge.unregisterDocumentExtractor(name);
        } catch (Exception e) {
            throw new KreuzbergRsException(e.getMessage(), e);
        }
    }

    /**
     * Register a renderer via {@code RendererBridge}.
     *
     * @param impl the renderer implementation to register
     * @throws KreuzbergRsException wrapping any exception raised by the bridge; the original
     *     message and cause are preserved
     */
    public static void registerRenderer(final IRenderer impl) throws KreuzbergRsException {
        try {
            RendererBridge.registerRenderer(impl);
        } catch (Exception e) {
            throw new KreuzbergRsException(e.getMessage(), e);
        }
    }

    /**
     * Unregister a renderer via {@code RendererBridge}.
     *
     * @param name name identifying the renderer to remove (presumably the name it was
     *     registered under; verify against the bridge)
     * @throws KreuzbergRsException wrapping any exception raised by the bridge; the original
     *     message and cause are preserved
     */
    public static void unregisterRenderer(final String name) throws KreuzbergRsException {
        try {
            RendererBridge.unregisterRenderer(name);
        } catch (Exception e) {
            throw new KreuzbergRsException(e.getMessage(), e);
        }
    }
}