Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/packages/java/dev/kreuzberg/Kreuzberg.java
+++ b/packages/java/dev/kreuzberg/Kreuzberg.java
@@ -0,0 +1,578 @@
+// This file is auto-generated by alef — DO NOT EDIT.
+// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
+// To regenerate: alef generate
+// To verify freshness: alef verify --exit-code
+// Issues & docs: https://github.com/kreuzberg-dev/alef
+package dev.kreuzberg;
+
+import java.util.List;
+import org.jspecify.annotations.Nullable;
+public final class Kreuzberg {
+    /** Prevents instantiation; this class is only used through its static methods. */
+    private Kreuzberg() {
+    }
+
+    /**
+     * Extract content from a byte array.
+     *
+     * This is the main entry point for in-memory extraction. It performs the following steps:
+     * 1. Validate MIME type
+     * 2. Handle legacy format conversion if needed
+     * 3. Select appropriate extractor from registry
+     * 4. Extract content
+     * 5. Run post-processing pipeline
+     * {@literal @}param content The byte array to extract
+     *
+     * {@literal @}param mime_type MIME type of the content
+     *
+     * {@literal @}param config Extraction configuration
+     *
+     * {@literal @}return An {@code ExtractionResult} containing the extracted content and metadata.
+     *
+     * {@literal @}throws KreuzbergRsException Returns {@code KreuzbergError.Validation} if MIME type is invalid.
+     * Returns {@code KreuzbergError.UnsupportedFormat} if MIME type is not supported.
+     */
+    public static ExtractionResult extractBytes(
+        final byte[] content,
+        final String mimeType,
+        final ExtractionConfig config
+    ) throws KreuzbergRsException {
+        java.util.Objects.requireNonNull(content, "content must not be null");
+        java.util.Objects.requireNonNull(mimeType, "mimeType must not be null");
+        java.util.Objects.requireNonNull(config, "config must not be null");
+        return KreuzbergRs.extractBytes(content, mimeType, config);
+    }
+
+    /**
+     * Extract content from a file.
+     *
+     * This is the main entry point for file-based extraction. It performs the following steps:
+     * 1. Check cache for existing result (if caching enabled)
+     * 2. Detect or validate MIME type
+     * 3. Select appropriate extractor from registry
+     * 4. Extract content
+     * 5. Run post-processing pipeline
+     * 6. Store result in cache (if caching enabled)
+     * {@literal @}param path Path to the file to extract
+     *
+     * {@literal @}param mime_type Optional MIME type override. If null, will be auto-detected
+     *
+     * {@literal @}param config Extraction configuration
+     *
+     * {@literal @}return An {@code ExtractionResult} containing the extracted content and metadata.
+     *
+     * {@literal @}throws KreuzbergRsException Returns {@code KreuzbergError.Io} if the file doesn't exist (NotFound) or for other file I/O
+     * errors.
+     * Returns {@code KreuzbergError.UnsupportedFormat} if MIME type is not supported.
+     */
+    public static ExtractionResult extractFile(
+        final java.nio.file.Path path,
+        final @Nullable String mimeType,
+        final ExtractionConfig config
+    ) throws KreuzbergRsException {
+        java.util.Objects.requireNonNull(path, "path must not be null");
+        java.util.Objects.requireNonNull(config, "config must not be null");
+        return KreuzbergRs.extractFile(path, mimeType, config);
+    }
+
+    /**
+     * Extracts content from a file without a MIME type override.
+     *
+     * <p>Equivalent to calling the three-argument overload with a {@code null} MIME type.
+     *
+     * @param path path to the file to extract; must not be null
+     * @param config extraction configuration; must not be null
+     * @return the {@code ExtractionResult} produced by the three-argument overload
+     * @throws KreuzbergRsException if the underlying extraction fails
+     * @throws NullPointerException if {@code path} or {@code config} is null
+     */
+    public static ExtractionResult extractFile(final java.nio.file.Path path, final ExtractionConfig config) throws KreuzbergRsException {
+        // Delegate to the full overload so both entry points share the same argument validation.
+        return extractFile(path, null, config);
+    }
+
+    /**
+     * Synchronous wrapper for {@code extract_file}.
+     *
+     * This is a convenience function that blocks the current thread until extraction completes.
+     * For async code, use {@code extract_file} directly.
+     *
+     * Uses the global Tokio runtime for 100x+ performance improvement over creating
+     * a new runtime per call. Always uses the global runtime to avoid nested runtime issues.
+     *
+     * This function is only available with the {@code tokio-runtime} feature. For WASM targets,
+     * use a truly synchronous extraction approach instead.
+     */
+    public static ExtractionResult extractFileSync(
+        final java.nio.file.Path path,
+        final @Nullable String mimeType,
+        final ExtractionConfig config
+    ) throws KreuzbergRsException {
+        java.util.Objects.requireNonNull(path, "path must not be null");
+        java.util.Objects.requireNonNull(config, "config must not be null");
+        return KreuzbergRs.extractFileSync(path, mimeType, config);
+    }
+
+    /**
+     * Synchronous file extraction without a MIME type.
+     *
+     * <p>Equivalent to calling the three-argument overload with a {@code null} MIME type.
+     *
+     * @param path path to the file to extract; must not be null
+     * @param config extraction configuration; must not be null
+     * @return the {@code ExtractionResult} produced by the three-argument overload
+     * @throws KreuzbergRsException if the underlying extraction fails
+     * @throws NullPointerException if {@code path} or {@code config} is null
+     */
+    public static ExtractionResult extractFileSync(
+        final java.nio.file.Path path,
+        final ExtractionConfig config
+    ) throws KreuzbergRsException {
+        // Delegate to the full overload so both entry points share the same argument validation.
+        return extractFileSync(path, null, config);
+    }
+
+    /**
+     * Synchronous wrapper for {@code extract_bytes}.
+     *
+     * Uses the global Tokio runtime for 100x+ performance improvement over creating
+     * a new runtime per call.
+     *
+     * With the {@code tokio-runtime} feature, this blocks the current thread using the global
+     * Tokio runtime. Without it (WASM), this calls a truly synchronous implementation.
+     */
+    public static ExtractionResult extractBytesSync(
+        final byte[] content,
+        final String mimeType,
+        final ExtractionConfig config
+    ) throws KreuzbergRsException {
+        java.util.Objects.requireNonNull(content, "content must not be null");
+        java.util.Objects.requireNonNull(mimeType, "mimeType must not be null");
+        java.util.Objects.requireNonNull(config, "config must not be null");
+        return KreuzbergRs.extractBytesSync(content, mimeType, config);
+    }
+
+    /**
+     * Synchronous wrapper for {@code batch_extract_files}.
+     *
+     * Uses the global Tokio runtime for optimal performance.
+     * Only available with {@code tokio-runtime} (WASM has no filesystem).
+     */
+    public static List<ExtractionResult> batchExtractFilesSync(
+        final List<BatchFileItem> items,
+        final ExtractionConfig config
+    ) throws KreuzbergRsException {
+        java.util.Objects.requireNonNull(items, "items must not be null");
+        java.util.Objects.requireNonNull(config, "config must not be null");
+        return KreuzbergRs.batchExtractFilesSync(items, config);
+    }
+
+    /**
+     * Synchronous wrapper for {@code batch_extract_bytes}.
+     *
+     * Uses the global Tokio runtime for optimal performance.
+     * With the {@code tokio-runtime} feature, this blocks the current thread using the global
+     * Tokio runtime. Without it (WASM), this calls a truly synchronous implementation
+     * that iterates through items and calls {@code extract_bytes_sync()}.
+     */
+    public static List<ExtractionResult> batchExtractBytesSync(
+        final List<BatchBytesItem> items,
+        final ExtractionConfig config
+    ) throws KreuzbergRsException {
+        java.util.Objects.requireNonNull(items, "items must not be null");
+        java.util.Objects.requireNonNull(config, "config must not be null");
+        return KreuzbergRs.batchExtractBytesSync(items, config);
+    }
+
+    /**
+     * Extract content from multiple files concurrently.
+     *
+     * This function processes multiple files in parallel, automatically managing
+     * concurrency to prevent resource exhaustion. The concurrency limit can be
+     * configured via {@code ExtractionConfig.max_concurrent_extractions} or defaults
+     * to {@code (num_cpus * 1.5).ceil()}.
+     *
+     * Each file can optionally specify a FileExtractionConfig that overrides specific
+     * fields from the batch-level {@code config}. Pass {@code None} for a file to use the batch defaults.
+     * Batch-level settings like {@code max_concurrent_extractions} and {@code use_cache} are always
+     * taken from the batch-level {@code config}.
+     * {@literal @}param items Vector of {@code BatchFileItem} structs, each containing a path and optional per-file configuration
+     * overrides.
+     *
+     * {@literal @}param config Batch-level extraction configuration (provides defaults and batch settings)
+     *
+     * {@literal @}return A vector of {@code ExtractionResult} in the same order as the input items.
+     *
+     * {@literal @}throws KreuzbergRsException Individual file errors are captured in the result metadata. System errors
+     * (IO, RuntimeError equivalents) will bubble up and fail the entire batch.
+     */
+    public static List<ExtractionResult> batchExtractFiles(
+        final List<BatchFileItem> items,
+        final ExtractionConfig config
+    ) throws KreuzbergRsException {
+        java.util.Objects.requireNonNull(items, "items must not be null");
+        java.util.Objects.requireNonNull(config, "config must not be null");
+        return KreuzbergRs.batchExtractFiles(items, config);
+    }
+
+    /**
+     * Extract content from multiple byte arrays concurrently.
+     *
+     * This function processes multiple byte arrays in parallel, automatically managing
+     * concurrency to prevent resource exhaustion. The concurrency limit can be
+     * configured via {@code ExtractionConfig.max_concurrent_extractions} or defaults
+     * to {@code (num_cpus * 1.5).ceil()}.
+     *
+     * Each item can optionally specify a FileExtractionConfig that overrides specific
+     * fields from the batch-level {@code config}. Pass {@code None} as the config to use
+     * the batch-level defaults for that item.
+     * {@literal @}param items Vector of {@code BatchBytesItem} structs, each containing content bytes, MIME type, and optional per-item
+     * configuration overrides.
+     *
+     * {@literal @}param config Batch-level extraction configuration
+     *
+     * {@literal @}return A vector of {@code ExtractionResult} in the same order as the input items.
+     */
+    public static List<ExtractionResult> batchExtractBytes(
+        final List<BatchBytesItem> items,
+        final ExtractionConfig config
+    ) throws KreuzbergRsException {
+        java.util.Objects.requireNonNull(items, "items must not be null");
+        java.util.Objects.requireNonNull(config, "config must not be null");
+        return KreuzbergRs.batchExtractBytes(items, config);
+    }
+
+    /**
+     * Detects the MIME type from raw file bytes.
+     *
+     * <p>Uses magic byte signatures to detect file type from content.
+     * Falls back to {@code infer} crate for comprehensive detection.
+     *
+     * <p>For ZIP-based files, inspects contents to distinguish Office Open XML
+     * formats (DOCX, XLSX, PPTX) from plain ZIP archives.
+     *
+     * @param content raw file bytes; must not be null
+     * @return the detected MIME type string
+     * @throws KreuzbergRsException with {@code KreuzbergError.UnsupportedFormat} if the MIME type cannot be determined
+     * @throws NullPointerException if {@code content} is null
+     */
+    public static String detectMimeTypeFromBytes(final byte[] content) throws KreuzbergRsException {
+        return KreuzbergRs.detectMimeTypeFromBytes(
+            java.util.Objects.requireNonNull(content, "content must not be null"));
+    }
+
+    /**
+     * Gets the file extensions for a given MIME type.
+     *
+     * <p>Returns all known file extensions that map to the specified MIME type.
+     *
+     * @param mimeType the MIME type to look up; must not be null
+     * @return a list of file extensions (without leading dot) for the MIME type
+     * @throws KreuzbergRsException if the underlying lookup fails
+     * @throws NullPointerException if {@code mimeType} is null
+     */
+    public static List<String> getExtensionsForMime(final String mimeType) throws KreuzbergRsException {
+        return KreuzbergRs.getExtensionsForMime(
+            java.util.Objects.requireNonNull(mimeType, "mimeType must not be null"));
+    }
+
+    /**
+     * Clears all embedding backends from the global registry.
+     *
+     * <p>Calls {@code shutdown()} on every registered backend, then empties the registry.
+     *
+     * @throws KreuzbergRsException for any error returned by a backend's {@code shutdown()} method; the first error
+     *     encountered stops processing of the remaining backends
+     */
+    public static void clearEmbeddingBackends() throws KreuzbergRsException {
+        KreuzbergRs.clearEmbeddingBackends();
+    }
+
+    /**
+     * Lists the names of all registered embedding backends.
+     *
+     * <p>Used by {@code kreuzberg-cli}, the api/mcp endpoints, and generated language
+     * bindings.
+     *
+     * @return the list returned by {@code KreuzbergRs.listEmbeddingBackends()}
+     * @throws KreuzbergRsException if the underlying call fails
+     */
+    public static List<String> listEmbeddingBackends() throws KreuzbergRsException {
+        return KreuzbergRs.listEmbeddingBackends();
+    }
+
+    /**
+     * Lists the names of all registered document extractors.
+     *
+     * @return the list returned by {@code KreuzbergRs.listDocumentExtractors()}
+     * @throws KreuzbergRsException if the underlying call fails
+     */
+    public static List<String> listDocumentExtractors() throws KreuzbergRsException {
+        return KreuzbergRs.listDocumentExtractors();
+    }
+
+    /**
+     * Clears all document extractors from the global registry.
+     *
+     * <p>Calls {@code shutdown()} on every registered extractor, then empties the registry.
+     *
+     * @throws KreuzbergRsException for any error returned by an extractor's {@code shutdown()} method; the first
+     *     error encountered stops processing of the remaining extractors
+     */
+    public static void clearDocumentExtractors() throws KreuzbergRsException {
+        KreuzbergRs.clearDocumentExtractors();
+    }
+
+    /**
+     * Lists all registered OCR backends.
+     *
+     * <p>Returns the names of all OCR backends currently registered in the global registry.
+     *
+     * @return a list of OCR backend names
+     * @throws KreuzbergRsException if the underlying call fails
+     */
+    public static List<String> listOcrBackends() throws KreuzbergRsException {
+        return KreuzbergRs.listOcrBackends();
+    }
+
+    /**
+     * Clears all OCR backends from the global registry.
+     *
+     * <p>Removes all OCR backends and calls their {@code shutdown()} methods.
+     * Returns normally if all backends were cleared successfully.
+     *
+     * @throws KreuzbergRsException if any shutdown method failed
+     */
+    public static void clearOcrBackends() throws KreuzbergRsException {
+        KreuzbergRs.clearOcrBackends();
+    }
+
+    /**
+     * Lists all registered post-processor names.
+     *
+     * <p>Returns a list of all post-processor names currently registered in the
+     * global registry.
+     *
+     * @return list of post-processor names
+     * @throws KreuzbergRsException if the registry lock is poisoned
+     */
+    public static List<String> listPostProcessors() throws KreuzbergRsException {
+        return KreuzbergRs.listPostProcessors();
+    }
+
+    /**
+     * Removes all registered post-processors.
+     *
+     * @throws KreuzbergRsException if the underlying call fails
+     */
+    public static void clearPostProcessors() throws KreuzbergRsException {
+        KreuzbergRs.clearPostProcessors();
+    }
+
+    /**
+     * Lists the names of all registered renderers.
+     *
+     * @return the list returned by {@code KreuzbergRs.listRenderers()}
+     * @throws KreuzbergRsException if the registry lock is poisoned
+     */
+    public static List<String> listRenderers() throws KreuzbergRsException {
+        return KreuzbergRs.listRenderers();
+    }
+
+    /**
+     * Clears all renderers from the global registry.
+     *
+     * <p>Removes every renderer, including the built-in defaults (markdown, html,
+     * djot, plain). After calling this no renderers are registered; re-register
+     * as needed.
+     *
+     * @throws KreuzbergRsException if the registry lock is poisoned
+     */
+    public static void clearRenderers() throws KreuzbergRsException {
+        KreuzbergRs.clearRenderers();
+    }
+
+    /**
+     * Lists the names of all registered validators.
+     *
+     * @return the list returned by {@code KreuzbergRs.listValidators()}
+     * @throws KreuzbergRsException if the underlying call fails
+     */
+    public static List<String> listValidators() throws KreuzbergRsException {
+        return KreuzbergRs.listValidators();
+    }
+
+    /**
+     * Removes all registered validators.
+     *
+     * @throws KreuzbergRsException if the underlying call fails
+     */
+    public static void clearValidators() throws KreuzbergRsException {
+        KreuzbergRs.clearValidators();
+    }
+
+    /**
+     * Compare two extraction results and return a structured diff.
+     *
+     * The comparison is purely structural — no I/O, no side effects. All fields
+     * of ExtractionDiff are populated according to the provided DiffOptions.
+     * {@literal @}param a — the "before" extraction result
+     *
+     * {@literal @}param b — the "after" extraction result
+     *
+     * {@literal @}param opts — controls which sections are compared and optional truncation
+     */
+    public static ExtractionDiff compare(
+        final ExtractionResult a,
+        final ExtractionResult b,
+        final DiffOptions opts
+    ) throws KreuzbergRsException {
+        java.util.Objects.requireNonNull(a, "a must not be null");
+        java.util.Objects.requireNonNull(b, "b must not be null");
+        java.util.Objects.requireNonNull(opts, "opts must not be null");
+        return KreuzbergRs.compare(a, b, opts);
+    }
+
+    /**
+     * Generate embeddings asynchronously for a list of text strings.
+     *
+     * This is the async counterpart to embed_texts. It offloads the blocking
+     * ONNX inference work to a dedicated blocking thread pool via Tokio's
+     * {@code spawn_blocking}, keeping the async executor free.
+     *
+     * Returns one embedding vector per input text in the same order.
+     * {@literal @}param texts Vec of strings to embed (owned, sent to blocking thread)
+     *
+     * {@literal @}param config Embedding configuration specifying model, batch size, and normalization
+     *
+     * {@literal @}throws KreuzbergRsException - {@code KreuzbergError.MissingDependency} if ONNX Runtime is not installed
+     * - {@code KreuzbergError.Embedding} if the preset name is unknown, model download fails,
+     *   or the blocking inference task panics
+     */
+    public static List<List<Float>> embedTextsAsync(final List<String> texts, final EmbeddingConfig config) throws KreuzbergRsException {
+        java.util.Objects.requireNonNull(texts, "texts must not be null");
+        java.util.Objects.requireNonNull(config, "config must not be null");
+        return KreuzbergRs.embedTextsAsync(texts, config);
+    }
+
+    /**
+     * Renders a single PDF page to PNG bytes.
+     *
+     * <p>Returns raw PNG-encoded bytes for the specified page at the given DPI.
+     * Uses pdf_oxide with tiny-skia for pure-Rust rendering.
+     *
+     * @param pdfBytes raw PDF file bytes; must not be null
+     * @param pageIndex zero-based page index
+     * @param dpi resolution in dots per inch (default: 150); may be {@code null}
+     * @param password optional password for encrypted PDFs; may be {@code null}
+     * @return the PNG-encoded bytes of the rendered page
+     * @throws KreuzbergRsException with {@code KreuzbergError.Parsing} if the PDF cannot be opened, authenticated,
+     *     or rendered, or if {@code pageIndex} is out of range
+     * @throws NullPointerException if {@code pdfBytes} is null
+     */
+    public static byte[] renderPdfPageToPng(
+        final byte[] pdfBytes,
+        final long pageIndex,
+        final @Nullable Integer dpi,
+        final @Nullable String password
+    ) throws KreuzbergRsException {
+        java.util.Objects.requireNonNull(pdfBytes, "pdfBytes must not be null");
+        // pageIndex is a primitive long and can never be null, so it needs no null check (which would only box it).
+        return KreuzbergRs.renderPdfPageToPng(pdfBytes, pageIndex, dpi, password);
+    }
+
+    /**
+     * Renders a single PDF page to PNG bytes without an explicit DPI or password.
+     *
+     * <p>Equivalent to calling the four-argument overload with {@code null} for both {@code dpi} and
+     * {@code password}.
+     *
+     * @param pdfBytes raw PDF file bytes; must not be null
+     * @param pageIndex zero-based page index
+     * @return the PNG-encoded bytes of the rendered page
+     * @throws KreuzbergRsException if the page cannot be rendered
+     * @throws NullPointerException if {@code pdfBytes} is null
+     */
+    public static byte[] renderPdfPageToPng(final byte[] pdfBytes, final long pageIndex) throws KreuzbergRsException {
+        // Pass null ("not specified") for dpi instead of 0, which would request a zero-DPI render.
+        // NOTE(review): assumes the native layer treats a null dpi as "use the default" — confirm.
+        return renderPdfPageToPng(pdfBytes, pageIndex, null, null);
+    }
+
+    /**
+     * Detects the MIME type of a file at the given path.
+     *
+     * <p>Uses the file extension and optionally the file content to determine the MIME type.
+     * Set {@code checkExists} to {@code true} to verify the file exists before detection.
+     *
+     * @param path path of the file, as a string; must not be null
+     * @param checkExists whether to verify that the file exists before detection
+     * @return the MIME type string returned by {@code KreuzbergRs.detectMimeType}
+     * @throws KreuzbergRsException if the underlying detection fails
+     * @throws NullPointerException if {@code path} is null
+     */
+    public static String detectMimeType(final String path, final boolean checkExists) throws KreuzbergRsException {
+        java.util.Objects.requireNonNull(path, "path must not be null");
+        // checkExists is a primitive boolean and can never be null, so it needs no null check.
+        return KreuzbergRs.detectMimeType(path, checkExists);
+    }
+
+    /**
+     * Embed a list of texts using the configured embedding model.
+     *
+     * Returns a 2D vector where each inner vector is the embedding for the corresponding text.
+     */
+    public static List<List<Float>> embedTexts(final List<String> texts, final EmbeddingConfig config) throws KreuzbergRsException {
+        java.util.Objects.requireNonNull(texts, "texts must not be null");
+        java.util.Objects.requireNonNull(config, "config must not be null");
+        return KreuzbergRs.embedTexts(texts, config);
+    }
+
+    /**
+     * Gets an embedding preset by name.
+     *
+     * <p>The underlying lookup returns an owned clone so the value is safe to pass across FFI boundaries.
+     *
+     * @param name name of the preset to look up; must not be null
+     * @return the matching preset, or {@code null} if no preset with the given name exists
+     * @throws KreuzbergRsException if the underlying lookup fails
+     * @throws NullPointerException if {@code name} is null
+     */
+    public static @Nullable EmbeddingPreset getEmbeddingPreset(final String name) throws KreuzbergRsException {
+        // The lookup result is unwrapped to null when empty, matching the @Nullable return type.
+        return KreuzbergRs
+            .getEmbeddingPreset(java.util.Objects.requireNonNull(name, "name must not be null"))
+            .orElse(null);
+    }
+
+    /**
+     * Lists the names of all available embedding presets.
+     *
+     * <p>The underlying call returns owned {@code String}s so the values are safe to pass across FFI boundaries.
+     *
+     * @return the list returned by {@code KreuzbergRs.listEmbeddingPresets()}
+     * @throws KreuzbergRsException if the underlying call fails
+     */
+    public static List<String> listEmbeddingPresets() throws KreuzbergRsException {
+        return KreuzbergRs.listEmbeddingPresets();
+    }
+
+    /**
+     * Registers an OCR backend implementation by delegating to {@code OcrBackendBridge.registerOcrBackend}.
+     *
+     * @param impl the OCR backend implementation, forwarded as-is to the bridge
+     * @throws KreuzbergRsException wrapping any exception thrown by the bridge (same message, original as cause)
+     */
+    public static void registerOcrBackend(final IOcrBackend impl) throws KreuzbergRsException {
+        try {
+            OcrBackendBridge.registerOcrBackend(impl);
+        } catch (Exception cause) {
+            final String message = cause.getMessage();
+            throw new KreuzbergRsException(message, cause);
+        }
+    }
+
+    /**
+     * Unregisters an OCR backend by delegating to {@code OcrBackendBridge.unregisterOcrBackend}.
+     *
+     * @param name name of the OCR backend to unregister, forwarded as-is to the bridge
+     * @throws KreuzbergRsException wrapping any exception thrown by the bridge (same message, original as cause)
+     */
+    public static void unregisterOcrBackend(final String name) throws KreuzbergRsException {
+        try {
+            OcrBackendBridge.unregisterOcrBackend(name);
+        } catch (Exception cause) {
+            final String message = cause.getMessage();
+            throw new KreuzbergRsException(message, cause);
+        }
+    }
+
+    /**
+     * Registers a post-processor implementation by delegating to {@code PostProcessorBridge.registerPostProcessor}.
+     *
+     * @param impl the post-processor implementation, forwarded as-is to the bridge
+     * @throws KreuzbergRsException wrapping any exception thrown by the bridge (same message, original as cause)
+     */
+    public static void registerPostProcessor(final IPostProcessor impl) throws KreuzbergRsException {
+        try {
+            PostProcessorBridge.registerPostProcessor(impl);
+        } catch (Exception cause) {
+            final String message = cause.getMessage();
+            throw new KreuzbergRsException(message, cause);
+        }
+    }
+
+    /**
+     * Unregisters a post-processor by delegating to {@code PostProcessorBridge.unregisterPostProcessor}.
+     *
+     * @param name name of the post-processor to unregister, forwarded as-is to the bridge
+     * @throws KreuzbergRsException wrapping any exception thrown by the bridge (same message, original as cause)
+     */
+    public static void unregisterPostProcessor(final String name) throws KreuzbergRsException {
+        try {
+            PostProcessorBridge.unregisterPostProcessor(name);
+        } catch (Exception cause) {
+            final String message = cause.getMessage();
+            throw new KreuzbergRsException(message, cause);
+        }
+    }
+
+    /**
+     * Registers a validator implementation by delegating to {@code ValidatorBridge.registerValidator}.
+     *
+     * @param impl the validator implementation, forwarded as-is to the bridge
+     * @throws KreuzbergRsException wrapping any exception thrown by the bridge (same message, original as cause)
+     */
+    public static void registerValidator(final IValidator impl) throws KreuzbergRsException {
+        try {
+            ValidatorBridge.registerValidator(impl);
+        } catch (Exception cause) {
+            final String message = cause.getMessage();
+            throw new KreuzbergRsException(message, cause);
+        }
+    }
+
+    /**
+     * Unregisters a validator by delegating to {@code ValidatorBridge.unregisterValidator}.
+     *
+     * @param name name of the validator to unregister, forwarded as-is to the bridge
+     * @throws KreuzbergRsException wrapping any exception thrown by the bridge (same message, original as cause)
+     */
+    public static void unregisterValidator(final String name) throws KreuzbergRsException {
+        try {
+            ValidatorBridge.unregisterValidator(name);
+        } catch (Exception cause) {
+            final String message = cause.getMessage();
+            throw new KreuzbergRsException(message, cause);
+        }
+    }
+
+    /**
+     * Registers an embedding backend implementation by delegating to
+     * {@code EmbeddingBackendBridge.registerEmbeddingBackend}.
+     *
+     * @param impl the embedding backend implementation, forwarded as-is to the bridge
+     * @throws KreuzbergRsException wrapping any exception thrown by the bridge (same message, original as cause)
+     */
+    public static void registerEmbeddingBackend(final IEmbeddingBackend impl) throws KreuzbergRsException {
+        try {
+            EmbeddingBackendBridge.registerEmbeddingBackend(impl);
+        } catch (Exception cause) {
+            final String message = cause.getMessage();
+            throw new KreuzbergRsException(message, cause);
+        }
+    }
+
+    /**
+     * Unregisters an embedding backend by delegating to
+     * {@code EmbeddingBackendBridge.unregisterEmbeddingBackend}.
+     *
+     * @param name name of the embedding backend to unregister, forwarded as-is to the bridge
+     * @throws KreuzbergRsException wrapping any exception thrown by the bridge (same message, original as cause)
+     */
+    public static void unregisterEmbeddingBackend(final String name) throws KreuzbergRsException {
+        try {
+            EmbeddingBackendBridge.unregisterEmbeddingBackend(name);
+        } catch (Exception cause) {
+            final String message = cause.getMessage();
+            throw new KreuzbergRsException(message, cause);
+        }
+    }
+
+    /**
+     * Registers a document extractor implementation by delegating to
+     * {@code DocumentExtractorBridge.registerDocumentExtractor}.
+     *
+     * @param impl the document extractor implementation, forwarded as-is to the bridge
+     * @throws KreuzbergRsException wrapping any exception thrown by the bridge (same message, original as cause)
+     */
+    public static void registerDocumentExtractor(final IDocumentExtractor impl) throws KreuzbergRsException {
+        try {
+            DocumentExtractorBridge.registerDocumentExtractor(impl);
+        } catch (Exception cause) {
+            final String message = cause.getMessage();
+            throw new KreuzbergRsException(message, cause);
+        }
+    }
+
+    /**
+     * Unregisters a document extractor by delegating to
+     * {@code DocumentExtractorBridge.unregisterDocumentExtractor}.
+     *
+     * @param name name of the document extractor to unregister, forwarded as-is to the bridge
+     * @throws KreuzbergRsException wrapping any exception thrown by the bridge (same message, original as cause)
+     */
+    public static void unregisterDocumentExtractor(final String name) throws KreuzbergRsException {
+        try {
+            DocumentExtractorBridge.unregisterDocumentExtractor(name);
+        } catch (Exception cause) {
+            final String message = cause.getMessage();
+            throw new KreuzbergRsException(message, cause);
+        }
+    }
+
+    /**
+     * Registers a renderer implementation by delegating to {@code RendererBridge.registerRenderer}.
+     *
+     * @param impl the renderer implementation, forwarded as-is to the bridge
+     * @throws KreuzbergRsException wrapping any exception thrown by the bridge (same message, original as cause)
+     */
+    public static void registerRenderer(final IRenderer impl) throws KreuzbergRsException {
+        try {
+            RendererBridge.registerRenderer(impl);
+        } catch (Exception cause) {
+            final String message = cause.getMessage();
+            throw new KreuzbergRsException(message, cause);
+        }
+    }
+
+    /**
+     * Unregisters a renderer by delegating to {@code RendererBridge.unregisterRenderer}.
+     *
+     * @param name name of the renderer to unregister, forwarded as-is to the bridge
+     * @throws KreuzbergRsException wrapping any exception thrown by the bridge (same message, original as cause)
+     */
+    public static void unregisterRenderer(final String name) throws KreuzbergRsException {
+        try {
+            RendererBridge.unregisterRenderer(name);
+        } catch (Exception cause) {
+            final String message = cause.getMessage();
+            throw new KreuzbergRsException(message, cause);
+        }
+    }
+
+
+}