// This file is auto-generated by alef — DO NOT EDIT. // alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75 // To regenerate: alef generate // To verify freshness: alef verify --exit-code // Issues & docs: https://github.com/kreuzberg-dev/alef package dev.kreuzberg; import java.lang.foreign.Arena; import java.lang.foreign.MemorySegment; import java.lang.foreign.ValueLayout; import java.util.List; import java.util.Optional; import java.util.concurrent.CompletableFuture; import java.util.concurrent.CompletionException; import org.jspecify.annotations.Nullable; public final class KreuzbergRs { private KreuzbergRs() { } /** * Extract content from a byte array. * * This is the main entry point for in-memory extraction. It performs the following steps: * 1. Validate MIME type * 2. Handle legacy format conversion if needed * 3. Select appropriate extractor from registry * 4. Extract content * 5. Run post-processing pipeline * {@literal @}param content The byte array to extract * * {@literal @}param mime_type MIME type of the content * * {@literal @}param config Extraction configuration * * {@literal @}return An {@code ExtractionResult} containing the extracted content and metadata. * * {@literal @}throws KreuzbergRsException Returns {@code KreuzbergError.Validation} if MIME type is invalid. * Returns {@code KreuzbergError.UnsupportedFormat} if MIME type is not supported. */ public static ExtractionResult extractBytes( final byte[] content, final String mimeType, final ExtractionConfig config ) throws KreuzbergRsException { try (var arena = Arena.ofShared()) { var ccontent = arena.allocateFrom(ValueLayout.JAVA_BYTE, content); var ccontentLen = (long) content.length; var cmimeType = arena.allocateFrom(mimeType); var cconfigJson = config != null ? MAPPER.writeValueAsString(config) : null; var cconfigJsonSeg = cconfigJson != null ? arena.allocateFrom(cconfigJson) : MemorySegment.NULL; var cconfig = cconfigJson != null ? (MemorySegment) NativeLib.KREUZBERG_EXTRACTION_CONFIG_FROM_JSON.invoke(cconfigJsonSeg) : MemorySegment.NULL; if (cconfigJson != null && cconfig.equals(MemorySegment.NULL)) { checkLastError(); throw new IllegalStateException("failed to create config from JSON"); } var resultPtr = (MemorySegment) NativeLib.KREUZBERG_EXTRACT_BYTES.invoke(ccontent, ccontentLen, cmimeType, cconfig); if (!cconfig.equals(MemorySegment.NULL)) { NativeLib.KREUZBERG_EXTRACTION_CONFIG_FREE.invoke(cconfig); } if (resultPtr.equals(MemorySegment.NULL)) { checkLastError(); return null; } // CPD-OFF var jsonPtr = (MemorySegment) NativeLib.KREUZBERG_EXTRACTION_RESULT_TO_JSON.invoke(resultPtr); NativeLib.KREUZBERG_EXTRACTION_RESULT_FREE.invoke(resultPtr); if (jsonPtr.equals(MemorySegment.NULL)) { checkLastError(); throw new KreuzbergRsException("extractBytes: failed to serialize response", null); } String json = jsonPtr.reinterpret(Long.MAX_VALUE).getString(0); NativeLib.KREUZBERG_FREE_STRING.invoke(jsonPtr); return MAPPER.readValue(json, ExtractionResult.class); // CPD-ON } catch (Throwable e) { throw new KreuzbergRsException("FFI call failed", e); } } public static CompletableFuture extractBytesAsync( final byte[] content, final String mimeType, final ExtractionConfig config ) { return CompletableFuture.supplyAsync(() -> { try { return extractBytes(content, mimeType, config); } catch (Throwable e) { throw new CompletionException(e); } }); } /** * Extract content from a file. * * This is the main entry point for file-based extraction. It performs the following steps: * 1. Check cache for existing result (if caching enabled) * 2. Detect or validate MIME type * 3. Select appropriate extractor from registry * 4. Extract content * 5. Run post-processing pipeline * 6. Store result in cache (if caching enabled) * {@literal @}param path Path to the file to extract * * {@literal @}param mime_type Optional MIME type override. If null, will be auto-detected * * {@literal @}param config Extraction configuration * * {@literal @}return An {@code ExtractionResult} containing the extracted content and metadata. * * {@literal @}throws KreuzbergRsException Returns {@code KreuzbergError.Io} if the file doesn't exist (NotFound) or for other file I/O * errors. * Returns {@code KreuzbergError.UnsupportedFormat} if MIME type is not supported. */ public static ExtractionResult extractFile( final java.nio.file.Path path, final @Nullable String mimeType, final ExtractionConfig config ) throws KreuzbergRsException { try (var arena = Arena.ofShared()) { var cpath = arena.allocateFrom(path.toString()); var cmimeType = mimeType != null ? arena.allocateFrom(mimeType) : MemorySegment.NULL; var cconfigJson = config != null ? MAPPER.writeValueAsString(config) : null; var cconfigJsonSeg = cconfigJson != null ? arena.allocateFrom(cconfigJson) : MemorySegment.NULL; var cconfig = cconfigJson != null ? (MemorySegment) NativeLib.KREUZBERG_EXTRACTION_CONFIG_FROM_JSON.invoke(cconfigJsonSeg) : MemorySegment.NULL; if (cconfigJson != null && cconfig.equals(MemorySegment.NULL)) { checkLastError(); throw new IllegalStateException("failed to create config from JSON"); } var resultPtr = (MemorySegment) NativeLib.KREUZBERG_EXTRACT_FILE.invoke(cpath, cmimeType, cconfig); if (!cconfig.equals(MemorySegment.NULL)) { NativeLib.KREUZBERG_EXTRACTION_CONFIG_FREE.invoke(cconfig); } if (resultPtr.equals(MemorySegment.NULL)) { checkLastError(); return null; } // CPD-OFF var jsonPtr = (MemorySegment) NativeLib.KREUZBERG_EXTRACTION_RESULT_TO_JSON.invoke(resultPtr); NativeLib.KREUZBERG_EXTRACTION_RESULT_FREE.invoke(resultPtr); if (jsonPtr.equals(MemorySegment.NULL)) { checkLastError(); throw new KreuzbergRsException("extractFile: failed to serialize response", null); } String json = jsonPtr.reinterpret(Long.MAX_VALUE).getString(0); NativeLib.KREUZBERG_FREE_STRING.invoke(jsonPtr); return MAPPER.readValue(json, ExtractionResult.class); // CPD-ON } catch (Throwable e) { throw new KreuzbergRsException("FFI call failed", e); } } public static CompletableFuture extractFileAsync( final java.nio.file.Path path, final String mimeType, final ExtractionConfig config ) { return CompletableFuture.supplyAsync(() -> { try { return extractFile(path, mimeType, config); } catch (Throwable e) { throw new CompletionException(e); } }); } /** * Synchronous wrapper for {@code extract_file}. * * This is a convenience function that blocks the current thread until extraction completes. * For async code, use {@code extract_file} directly. * * Uses the global Tokio runtime for 100x+ performance improvement over creating * a new runtime per call. Always uses the global runtime to avoid nested runtime issues. * * This function is only available with the {@code tokio-runtime} feature. For WASM targets, * use a truly synchronous extraction approach instead. */ public static ExtractionResult extractFileSync( final java.nio.file.Path path, final @Nullable String mimeType, final ExtractionConfig config ) throws KreuzbergRsException { try (var arena = Arena.ofShared()) { var cpath = arena.allocateFrom(path.toString()); var cmimeType = mimeType != null ? arena.allocateFrom(mimeType) : MemorySegment.NULL; var cconfigJson = config != null ? MAPPER.writeValueAsString(config) : null; var cconfigJsonSeg = cconfigJson != null ? arena.allocateFrom(cconfigJson) : MemorySegment.NULL; var cconfig = cconfigJson != null ? (MemorySegment) NativeLib.KREUZBERG_EXTRACTION_CONFIG_FROM_JSON.invoke(cconfigJsonSeg) : MemorySegment.NULL; if (cconfigJson != null && cconfig.equals(MemorySegment.NULL)) { checkLastError(); throw new IllegalStateException("failed to create config from JSON"); } var resultPtr = (MemorySegment) NativeLib.KREUZBERG_EXTRACT_FILE_SYNC.invoke(cpath, cmimeType, cconfig); if (!cconfig.equals(MemorySegment.NULL)) { NativeLib.KREUZBERG_EXTRACTION_CONFIG_FREE.invoke(cconfig); } if (resultPtr.equals(MemorySegment.NULL)) { checkLastError(); return null; } // CPD-OFF var jsonPtr = (MemorySegment) NativeLib.KREUZBERG_EXTRACTION_RESULT_TO_JSON.invoke(resultPtr); NativeLib.KREUZBERG_EXTRACTION_RESULT_FREE.invoke(resultPtr); if (jsonPtr.equals(MemorySegment.NULL)) { checkLastError(); throw new KreuzbergRsException("extractFileSync: failed to serialize response", null); } String json = jsonPtr.reinterpret(Long.MAX_VALUE).getString(0); NativeLib.KREUZBERG_FREE_STRING.invoke(jsonPtr); return MAPPER.readValue(json, ExtractionResult.class); // CPD-ON } catch (Throwable e) { throw new KreuzbergRsException("FFI call failed", e); } } /** * Synchronous wrapper for {@code extract_bytes}. * * Uses the global Tokio runtime for 100x+ performance improvement over creating * a new runtime per call. * * With the {@code tokio-runtime} feature, this blocks the current thread using the global * Tokio runtime. Without it (WASM), this calls a truly synchronous implementation. */ public static ExtractionResult extractBytesSync( final byte[] content, final String mimeType, final ExtractionConfig config ) throws KreuzbergRsException { try (var arena = Arena.ofShared()) { var ccontent = arena.allocateFrom(ValueLayout.JAVA_BYTE, content); var ccontentLen = (long) content.length; var cmimeType = arena.allocateFrom(mimeType); var cconfigJson = config != null ? MAPPER.writeValueAsString(config) : null; var cconfigJsonSeg = cconfigJson != null ? arena.allocateFrom(cconfigJson) : MemorySegment.NULL; var cconfig = cconfigJson != null ? (MemorySegment) NativeLib.KREUZBERG_EXTRACTION_CONFIG_FROM_JSON.invoke(cconfigJsonSeg) : MemorySegment.NULL; if (cconfigJson != null && cconfig.equals(MemorySegment.NULL)) { checkLastError(); throw new IllegalStateException("failed to create config from JSON"); } var resultPtr = (MemorySegment) NativeLib.KREUZBERG_EXTRACT_BYTES_SYNC.invoke(ccontent, ccontentLen, cmimeType, cconfig); if (!cconfig.equals(MemorySegment.NULL)) { NativeLib.KREUZBERG_EXTRACTION_CONFIG_FREE.invoke(cconfig); } if (resultPtr.equals(MemorySegment.NULL)) { checkLastError(); return null; } // CPD-OFF var jsonPtr = (MemorySegment) NativeLib.KREUZBERG_EXTRACTION_RESULT_TO_JSON.invoke(resultPtr); NativeLib.KREUZBERG_EXTRACTION_RESULT_FREE.invoke(resultPtr); if (jsonPtr.equals(MemorySegment.NULL)) { checkLastError(); throw new KreuzbergRsException("extractBytesSync: failed to serialize response", null); } String json = jsonPtr.reinterpret(Long.MAX_VALUE).getString(0); NativeLib.KREUZBERG_FREE_STRING.invoke(jsonPtr); return MAPPER.readValue(json, ExtractionResult.class); // CPD-ON } catch (Throwable e) { throw new KreuzbergRsException("FFI call failed", e); } } /** * Synchronous wrapper for {@code batch_extract_files}. * * Uses the global Tokio runtime for optimal performance. * Only available with {@code tokio-runtime} (WASM has no filesystem). */ public static List batchExtractFilesSync( final List items, final ExtractionConfig config ) throws KreuzbergRsException { try (var arena = Arena.ofShared()) { var citemsJson = MAPPER.writerFor(MAPPER.getTypeFactory().constructCollectionType(java.util.List.class, BatchFileItem.class)).writeValueAsString(items); var citems = arena.allocateFrom(citemsJson); var cconfigJson = config != null ? MAPPER.writeValueAsString(config) : null; var cconfigJsonSeg = cconfigJson != null ? arena.allocateFrom(cconfigJson) : MemorySegment.NULL; var cconfig = cconfigJson != null ? (MemorySegment) NativeLib.KREUZBERG_EXTRACTION_CONFIG_FROM_JSON.invoke(cconfigJsonSeg) : MemorySegment.NULL; if (cconfigJson != null && cconfig.equals(MemorySegment.NULL)) { checkLastError(); throw new IllegalStateException("failed to create config from JSON"); } var resultPtr = (MemorySegment) NativeLib.KREUZBERG_BATCH_EXTRACT_FILES_SYNC.invoke(citems, cconfig); if (!cconfig.equals(MemorySegment.NULL)) { NativeLib.KREUZBERG_EXTRACTION_CONFIG_FREE.invoke(cconfig); } return readJsonList(resultPtr, new com.fasterxml.jackson.core.type.TypeReference>() { }); } catch (Throwable e) { throw new KreuzbergRsException("FFI call failed", e); } } /** * Synchronous wrapper for {@code batch_extract_bytes}. * * Uses the global Tokio runtime for optimal performance. * With the {@code tokio-runtime} feature, this blocks the current thread using the global * Tokio runtime. Without it (WASM), this calls a truly synchronous implementation * that iterates through items and calls {@code extract_bytes_sync()}. */ public static List batchExtractBytesSync( final List items, final ExtractionConfig config ) throws KreuzbergRsException { try (var arena = Arena.ofShared()) { var citemsJson = MAPPER.writerFor(MAPPER.getTypeFactory().constructCollectionType(java.util.List.class, BatchBytesItem.class)).writeValueAsString(items); var citems = arena.allocateFrom(citemsJson); var cconfigJson = config != null ? MAPPER.writeValueAsString(config) : null; var cconfigJsonSeg = cconfigJson != null ? arena.allocateFrom(cconfigJson) : MemorySegment.NULL; var cconfig = cconfigJson != null ? (MemorySegment) NativeLib.KREUZBERG_EXTRACTION_CONFIG_FROM_JSON.invoke(cconfigJsonSeg) : MemorySegment.NULL; if (cconfigJson != null && cconfig.equals(MemorySegment.NULL)) { checkLastError(); throw new IllegalStateException("failed to create config from JSON"); } var resultPtr = (MemorySegment) NativeLib.KREUZBERG_BATCH_EXTRACT_BYTES_SYNC.invoke(citems, cconfig); if (!cconfig.equals(MemorySegment.NULL)) { NativeLib.KREUZBERG_EXTRACTION_CONFIG_FREE.invoke(cconfig); } return readJsonList(resultPtr, new com.fasterxml.jackson.core.type.TypeReference>() { }); } catch (Throwable e) { throw new KreuzbergRsException("FFI call failed", e); } } /** * Extract content from multiple files concurrently. * * This function processes multiple files in parallel, automatically managing * concurrency to prevent resource exhaustion. The concurrency limit can be * configured via {@code ExtractionConfig.max_concurrent_extractions} or defaults * to {@code (num_cpus * 1.5).ceil()}. * * Each file can optionally specify a FileExtractionConfig that overrides specific * fields from the batch-level {@code config}. Pass {@code None} for a file to use the batch defaults. * Batch-level settings like {@code max_concurrent_extractions} and {@code use_cache} are always * taken from the batch-level {@code config}. * {@literal @}param items Vector of {@code BatchFileItem} structs, each containing a path and optional per-file configuration * overrides. * * {@literal @}param config Batch-level extraction configuration (provides defaults and batch settings) * * {@literal @}return A vector of {@code ExtractionResult} in the same order as the input items. * * {@literal @}throws KreuzbergRsException Individual file errors are captured in the result metadata. System errors * (IO, RuntimeError equivalents) will bubble up and fail the entire batch. */ public static List batchExtractFiles( final List items, final ExtractionConfig config ) throws KreuzbergRsException { try (var arena = Arena.ofShared()) { var citemsJson = MAPPER.writerFor(MAPPER.getTypeFactory().constructCollectionType(java.util.List.class, BatchFileItem.class)).writeValueAsString(items); var citems = arena.allocateFrom(citemsJson); var cconfigJson = config != null ? MAPPER.writeValueAsString(config) : null; var cconfigJsonSeg = cconfigJson != null ? arena.allocateFrom(cconfigJson) : MemorySegment.NULL; var cconfig = cconfigJson != null ? (MemorySegment) NativeLib.KREUZBERG_EXTRACTION_CONFIG_FROM_JSON.invoke(cconfigJsonSeg) : MemorySegment.NULL; if (cconfigJson != null && cconfig.equals(MemorySegment.NULL)) { checkLastError(); throw new IllegalStateException("failed to create config from JSON"); } var resultPtr = (MemorySegment) NativeLib.KREUZBERG_BATCH_EXTRACT_FILES.invoke(citems, cconfig); if (!cconfig.equals(MemorySegment.NULL)) { NativeLib.KREUZBERG_EXTRACTION_CONFIG_FREE.invoke(cconfig); } return readJsonList(resultPtr, new com.fasterxml.jackson.core.type.TypeReference>() { }); } catch (Throwable e) { throw new KreuzbergRsException("FFI call failed", e); } } public static CompletableFuture> batchExtractFilesAsync( final List items, final ExtractionConfig config ) { return CompletableFuture.supplyAsync(() -> { try { return batchExtractFiles(items, config); } catch (Throwable e) { throw new CompletionException(e); } }); } /** * Extract content from multiple byte arrays concurrently. * * This function processes multiple byte arrays in parallel, automatically managing * concurrency to prevent resource exhaustion. The concurrency limit can be * configured via {@code ExtractionConfig.max_concurrent_extractions} or defaults * to {@code (num_cpus * 1.5).ceil()}. * * Each item can optionally specify a FileExtractionConfig that overrides specific * fields from the batch-level {@code config}. Pass {@code None} as the config to use * the batch-level defaults for that item. * {@literal @}param items Vector of {@code BatchBytesItem} structs, each containing content bytes, MIME type, and optional per-item * configuration overrides. * * {@literal @}param config Batch-level extraction configuration * * {@literal @}return A vector of {@code ExtractionResult} in the same order as the input items. */ public static List batchExtractBytes( final List items, final ExtractionConfig config ) throws KreuzbergRsException { try (var arena = Arena.ofShared()) { var citemsJson = MAPPER.writerFor(MAPPER.getTypeFactory().constructCollectionType(java.util.List.class, BatchBytesItem.class)).writeValueAsString(items); var citems = arena.allocateFrom(citemsJson); var cconfigJson = config != null ? MAPPER.writeValueAsString(config) : null; var cconfigJsonSeg = cconfigJson != null ? arena.allocateFrom(cconfigJson) : MemorySegment.NULL; var cconfig = cconfigJson != null ? (MemorySegment) NativeLib.KREUZBERG_EXTRACTION_CONFIG_FROM_JSON.invoke(cconfigJsonSeg) : MemorySegment.NULL; if (cconfigJson != null && cconfig.equals(MemorySegment.NULL)) { checkLastError(); throw new IllegalStateException("failed to create config from JSON"); } var resultPtr = (MemorySegment) NativeLib.KREUZBERG_BATCH_EXTRACT_BYTES.invoke(citems, cconfig); if (!cconfig.equals(MemorySegment.NULL)) { NativeLib.KREUZBERG_EXTRACTION_CONFIG_FREE.invoke(cconfig); } return readJsonList(resultPtr, new com.fasterxml.jackson.core.type.TypeReference>() { }); } catch (Throwable e) { throw new KreuzbergRsException("FFI call failed", e); } } public static CompletableFuture> batchExtractBytesAsync( final List items, final ExtractionConfig config ) { return CompletableFuture.supplyAsync(() -> { try { return batchExtractBytes(items, config); } catch (Throwable e) { throw new CompletionException(e); } }); } /** * Detect MIME type from raw file bytes. * * Uses magic byte signatures to detect file type from content. * Falls back to {@code infer} crate for comprehensive detection. * * For ZIP-based files, inspects contents to distinguish Office Open XML * formats (DOCX, XLSX, PPTX) from plain ZIP archives. * {@literal @}param content Raw file bytes * * {@literal @}return The detected MIME type string. * * {@literal @}throws KreuzbergRsException Returns {@code KreuzbergError.UnsupportedFormat} if MIME type cannot be determined. */ public static String detectMimeTypeFromBytes(final byte[] content) throws KreuzbergRsException { try (var arena = Arena.ofShared()) { var ccontent = arena.allocateFrom(ValueLayout.JAVA_BYTE, content); var ccontentLen = (long) content.length; var resultPtr = (MemorySegment) NativeLib.KREUZBERG_DETECT_MIME_TYPE_FROM_BYTES.invoke(ccontent, ccontentLen); if (resultPtr.equals(MemorySegment.NULL)) { checkLastError(); return null; } long resultLen = (long) NativeLib.KREUZBERG_DETECT_MIME_TYPE_FROM_BYTES_LEN.invoke(ccontent, ccontentLen); String str = readCString(resultPtr, resultLen); NativeLib.KREUZBERG_FREE_STRING.invoke(resultPtr); return str; } catch (Throwable e) { throw new KreuzbergRsException("FFI call failed", e); } } /** * Get file extensions for a given MIME type. * * Returns all known file extensions that map to the specified MIME type. * {@literal @}param mime_type The MIME type to look up * * {@literal @}return A vector of file extensions (without leading dot) for the MIME type. */ public static List getExtensionsForMime(final String mimeType) throws KreuzbergRsException { try (var arena = Arena.ofShared()) { var cmimeType = arena.allocateFrom(mimeType); var resultPtr = (MemorySegment) NativeLib.KREUZBERG_GET_EXTENSIONS_FOR_MIME.invoke(cmimeType); return readJsonList(resultPtr, new com.fasterxml.jackson.core.type.TypeReference>() { }); } catch (Throwable e) { throw new KreuzbergRsException("FFI call failed", e); } } /** * Clear all embedding backends from the global registry. * * Calls {@code shutdown()} on every registered backend, then empties the registry. * {@literal @}throws KreuzbergRsException - Any error returned by a backend's {@code shutdown()} method. The first error * encountered stops processing of remaining backends. */ public static void clearEmbeddingBackends() throws KreuzbergRsException { try (var arena = Arena.ofShared()) { var outErr = arena.allocate(ValueLayout.ADDRESS); var primitiveResult = (int) NativeLib.KREUZBERG_CLEAR_EMBEDDING_BACKEND.invoke(outErr); if (primitiveResult != 0) { MemorySegment errPtr = outErr.get(ValueLayout.ADDRESS, 0); String msg = errPtr.equals(MemorySegment.NULL) ? "clear failed (rc=" + primitiveResult + ")" : errPtr.reinterpret(Long.MAX_VALUE).getString(0); throw new KreuzbergRsException(primitiveResult, msg); } } catch (Throwable e) { throw new KreuzbergRsException("FFI call failed", e); } } /** * List the names of all registered embedding backends. * * Used by {@code kreuzberg-cli}, the api/mcp endpoints, and generated language * bindings. */ public static List listEmbeddingBackends() throws KreuzbergRsException { try (var arena = Arena.ofShared()) { var resultPtr = (MemorySegment) NativeLib.KREUZBERG_LIST_EMBEDDING_BACKENDS.invoke(); return readJsonList(resultPtr, new com.fasterxml.jackson.core.type.TypeReference>() { }); } catch (Throwable e) { throw new KreuzbergRsException("FFI call failed", e); } } /** * List names of all registered document extractors. */ public static List listDocumentExtractors() throws KreuzbergRsException { try (var arena = Arena.ofShared()) { var resultPtr = (MemorySegment) NativeLib.KREUZBERG_LIST_DOCUMENT_EXTRACTORS.invoke(); return readJsonList(resultPtr, new com.fasterxml.jackson.core.type.TypeReference>() { }); } catch (Throwable e) { throw new KreuzbergRsException("FFI call failed", e); } } /** * Clear all document extractors from the global registry. * * Calls {@code shutdown()} on every registered extractor, then empties the registry. * {@literal @}throws KreuzbergRsException - Any error returned by an extractor's {@code shutdown()} method. The first error * encountered stops processing of remaining extractors. */ public static void clearDocumentExtractors() throws KreuzbergRsException { try (var arena = Arena.ofShared()) { var outErr = arena.allocate(ValueLayout.ADDRESS); var primitiveResult = (int) NativeLib.KREUZBERG_CLEAR_DOCUMENT_EXTRACTOR.invoke(outErr); if (primitiveResult != 0) { MemorySegment errPtr = outErr.get(ValueLayout.ADDRESS, 0); String msg = errPtr.equals(MemorySegment.NULL) ? "clear failed (rc=" + primitiveResult + ")" : errPtr.reinterpret(Long.MAX_VALUE).getString(0); throw new KreuzbergRsException(primitiveResult, msg); } } catch (Throwable e) { throw new KreuzbergRsException("FFI call failed", e); } } /** * List all registered OCR backends. * * Returns the names of all OCR backends currently registered in the global registry. * {@literal @}return A vector of OCR backend names. */ public static List listOcrBackends() throws KreuzbergRsException { try (var arena = Arena.ofShared()) { var resultPtr = (MemorySegment) NativeLib.KREUZBERG_LIST_OCR_BACKENDS.invoke(); return readJsonList(resultPtr, new com.fasterxml.jackson.core.type.TypeReference>() { }); } catch (Throwable e) { throw new KreuzbergRsException("FFI call failed", e); } } /** * Clear all OCR backends from the global registry. * * Removes all OCR backends and calls their {@code shutdown()} methods. * {@literal @}return - {@code Ok(())} if all backends were cleared successfully * - {@code Err(...)} if any shutdown method failed */ public static void clearOcrBackends() throws KreuzbergRsException { try (var arena = Arena.ofShared()) { var outErr = arena.allocate(ValueLayout.ADDRESS); var primitiveResult = (int) NativeLib.KREUZBERG_CLEAR_OCR_BACKEND.invoke(outErr); if (primitiveResult != 0) { MemorySegment errPtr = outErr.get(ValueLayout.ADDRESS, 0); String msg = errPtr.equals(MemorySegment.NULL) ? "clear failed (rc=" + primitiveResult + ")" : errPtr.reinterpret(Long.MAX_VALUE).getString(0); throw new KreuzbergRsException(primitiveResult, msg); } } catch (Throwable e) { throw new KreuzbergRsException("FFI call failed", e); } } /** * List all registered post-processor names. * * Returns a vector of all post-processor names currently registered in the * global registry. * {@literal @}return - {@code Ok(Vec<String>)} - Vector of post-processor names * - {@code Err(...)} if the registry lock is poisoned */ public static List listPostProcessors() throws KreuzbergRsException { try (var arena = Arena.ofShared()) { var resultPtr = (MemorySegment) NativeLib.KREUZBERG_LIST_POST_PROCESSORS.invoke(); return readJsonList(resultPtr, new com.fasterxml.jackson.core.type.TypeReference>() { }); } catch (Throwable e) { throw new KreuzbergRsException("FFI call failed", e); } } /** * Remove all registered post-processors. */ public static void clearPostProcessors() throws KreuzbergRsException { try (var arena = Arena.ofShared()) { var outErr = arena.allocate(ValueLayout.ADDRESS); var primitiveResult = (int) NativeLib.KREUZBERG_CLEAR_POST_PROCESSOR.invoke(outErr); if (primitiveResult != 0) { MemorySegment errPtr = outErr.get(ValueLayout.ADDRESS, 0); String msg = errPtr.equals(MemorySegment.NULL) ? "clear failed (rc=" + primitiveResult + ")" : errPtr.reinterpret(Long.MAX_VALUE).getString(0); throw new KreuzbergRsException(primitiveResult, msg); } } catch (Throwable e) { throw new KreuzbergRsException("FFI call failed", e); } } /** * List names of all registered renderers. * {@literal @}throws KreuzbergRsException Returns an error if the registry lock is poisoned. */ public static List listRenderers() throws KreuzbergRsException { try (var arena = Arena.ofShared()) { var resultPtr = (MemorySegment) NativeLib.KREUZBERG_LIST_RENDERERS.invoke(); return readJsonList(resultPtr, new com.fasterxml.jackson.core.type.TypeReference>() { }); } catch (Throwable e) { throw new KreuzbergRsException("FFI call failed", e); } } /** * Clear all renderers from the global registry. * * Removes every renderer, including the built-in defaults (markdown, html, * djot, plain). After calling this no renderers are registered; re-register * as needed. * {@literal @}throws KreuzbergRsException Returns an error if the registry lock is poisoned. */ public static void clearRenderers() throws KreuzbergRsException { try (var arena = Arena.ofShared()) { var outErr = arena.allocate(ValueLayout.ADDRESS); var primitiveResult = (int) NativeLib.KREUZBERG_CLEAR_RENDERER.invoke(outErr); if (primitiveResult != 0) { MemorySegment errPtr = outErr.get(ValueLayout.ADDRESS, 0); String msg = errPtr.equals(MemorySegment.NULL) ? "clear failed (rc=" + primitiveResult + ")" : errPtr.reinterpret(Long.MAX_VALUE).getString(0); throw new KreuzbergRsException(primitiveResult, msg); } } catch (Throwable e) { throw new KreuzbergRsException("FFI call failed", e); } } /** * List names of all registered validators. */ public static List listValidators() throws KreuzbergRsException { try (var arena = Arena.ofShared()) { var resultPtr = (MemorySegment) NativeLib.KREUZBERG_LIST_VALIDATORS.invoke(); return readJsonList(resultPtr, new com.fasterxml.jackson.core.type.TypeReference>() { }); } catch (Throwable e) { throw new KreuzbergRsException("FFI call failed", e); } } /** * Remove all registered validators. */ public static void clearValidators() throws KreuzbergRsException { try (var arena = Arena.ofShared()) { var outErr = arena.allocate(ValueLayout.ADDRESS); var primitiveResult = (int) NativeLib.KREUZBERG_CLEAR_VALIDATOR.invoke(outErr); if (primitiveResult != 0) { MemorySegment errPtr = outErr.get(ValueLayout.ADDRESS, 0); String msg = errPtr.equals(MemorySegment.NULL) ? "clear failed (rc=" + primitiveResult + ")" : errPtr.reinterpret(Long.MAX_VALUE).getString(0); throw new KreuzbergRsException(primitiveResult, msg); } } catch (Throwable e) { throw new KreuzbergRsException("FFI call failed", e); } } /** * Compare two extraction results and return a structured diff. * * The comparison is purely structural — no I/O, no side effects. All fields * of ExtractionDiff are populated according to the provided DiffOptions. * {@literal @}param a — the "before" extraction result * * {@literal @}param b — the "after" extraction result * * {@literal @}param opts — controls which sections are compared and optional truncation */ public static ExtractionDiff compare( final ExtractionResult a, final ExtractionResult b, final DiffOptions opts ) throws KreuzbergRsException { try (var arena = Arena.ofShared()) { var caJson = a != null ? MAPPER.writeValueAsString(a) : null; var caJsonSeg = caJson != null ? arena.allocateFrom(caJson) : MemorySegment.NULL; var ca = caJson != null ? (MemorySegment) NativeLib.KREUZBERG_EXTRACTION_RESULT_FROM_JSON.invoke(caJsonSeg) : MemorySegment.NULL; if (caJson != null && ca.equals(MemorySegment.NULL)) { checkLastError(); throw new IllegalStateException("failed to create a from JSON"); } var cbJson = b != null ? MAPPER.writeValueAsString(b) : null; var cbJsonSeg = cbJson != null ? arena.allocateFrom(cbJson) : MemorySegment.NULL; var cb = cbJson != null ? (MemorySegment) NativeLib.KREUZBERG_EXTRACTION_RESULT_FROM_JSON.invoke(cbJsonSeg) : MemorySegment.NULL; if (cbJson != null && cb.equals(MemorySegment.NULL)) { checkLastError(); throw new IllegalStateException("failed to create b from JSON"); } var coptsJson = opts != null ? MAPPER.writeValueAsString(opts) : null; var coptsJsonSeg = coptsJson != null ? arena.allocateFrom(coptsJson) : MemorySegment.NULL; var copts = coptsJson != null ? (MemorySegment) NativeLib.KREUZBERG_DIFF_OPTIONS_FROM_JSON.invoke(coptsJsonSeg) : MemorySegment.NULL; if (coptsJson != null && copts.equals(MemorySegment.NULL)) { checkLastError(); throw new IllegalStateException("failed to create opts from JSON"); } var resultPtr = (MemorySegment) NativeLib.KREUZBERG_COMPARE.invoke(ca, cb, copts); if (!ca.equals(MemorySegment.NULL)) { NativeLib.KREUZBERG_EXTRACTION_RESULT_FREE.invoke(ca); } if (!cb.equals(MemorySegment.NULL)) { NativeLib.KREUZBERG_EXTRACTION_RESULT_FREE.invoke(cb); } if (!copts.equals(MemorySegment.NULL)) { NativeLib.KREUZBERG_DIFF_OPTIONS_FREE.invoke(copts); } if (resultPtr.equals(MemorySegment.NULL)) { checkLastError(); return null; } // CPD-OFF var jsonPtr = (MemorySegment) NativeLib.KREUZBERG_EXTRACTION_DIFF_TO_JSON.invoke(resultPtr); NativeLib.KREUZBERG_EXTRACTION_DIFF_FREE.invoke(resultPtr); if (jsonPtr.equals(MemorySegment.NULL)) { checkLastError(); throw new KreuzbergRsException("compare: failed to serialize response", null); } String json = jsonPtr.reinterpret(Long.MAX_VALUE).getString(0); NativeLib.KREUZBERG_FREE_STRING.invoke(jsonPtr); return MAPPER.readValue(json, ExtractionDiff.class); // CPD-ON } catch (Throwable e) { throw new KreuzbergRsException("FFI call failed", e); } } /** * Generate embeddings asynchronously for a list of text strings. * * This is the async counterpart to embed_texts. It offloads the blocking * ONNX inference work to a dedicated blocking thread pool via Tokio's * {@code spawn_blocking}, keeping the async executor free. * * Returns one embedding vector per input text in the same order. * {@literal @}param texts Vec of strings to embed (owned, sent to blocking thread) * * {@literal @}param config Embedding configuration specifying model, batch size, and normalization * * {@literal @}throws KreuzbergRsException - {@code KreuzbergError.MissingDependency} if ONNX Runtime is not installed * - {@code KreuzbergError.Embedding} if the preset name is unknown, model download fails, * or the blocking inference task panics */ public static List> embedTextsAsync(final List texts, final EmbeddingConfig config) throws KreuzbergRsException { try (var arena = Arena.ofShared()) { var ctextsJson = MAPPER.writerFor(MAPPER.getTypeFactory().constructCollectionType(java.util.List.class, String.class)).writeValueAsString(texts); var ctexts = arena.allocateFrom(ctextsJson); var cconfigJson = config != null ? MAPPER.writeValueAsString(config) : null; var cconfigJsonSeg = cconfigJson != null ? arena.allocateFrom(cconfigJson) : MemorySegment.NULL; var cconfig = cconfigJson != null ? (MemorySegment) NativeLib.KREUZBERG_EMBEDDING_CONFIG_FROM_JSON.invoke(cconfigJsonSeg) : MemorySegment.NULL; if (cconfigJson != null && cconfig.equals(MemorySegment.NULL)) { checkLastError(); throw new IllegalStateException("failed to create config from JSON"); } var resultPtr = (MemorySegment) NativeLib.KREUZBERG_EMBED_TEXTS_ASYNC.invoke(ctexts, cconfig); if (!cconfig.equals(MemorySegment.NULL)) { NativeLib.KREUZBERG_EMBEDDING_CONFIG_FREE.invoke(cconfig); } return readJsonList(resultPtr, new com.fasterxml.jackson.core.type.TypeReference>>() { }); } catch (Throwable e) { throw new KreuzbergRsException("FFI call failed", e); } } public static CompletableFuture>> embedTextsAsyncAsync(final List texts, final EmbeddingConfig config) { return CompletableFuture.supplyAsync(() -> { try { return embedTextsAsync(texts, config); } catch (Throwable e) { throw new CompletionException(e); } }); } /** * Render a single PDF page to PNG bytes. * * Returns raw PNG-encoded bytes for the specified page at the given DPI. * Uses pdf_oxide with tiny-skia for pure-Rust rendering. * {@literal @}param pdf_bytes Raw PDF file bytes * * {@literal @}param page_index Zero-based page index * * {@literal @}param dpi Resolution in dots per inch (default: 150) * * {@literal @}param password Optional password for encrypted PDFs * * {@literal @}throws KreuzbergRsException Returns {@code KreuzbergError.Parsing} if the PDF cannot be opened, authenticated, * or rendered, or if {@code page_index} is out of range. */ public static byte[] renderPdfPageToPng( final byte[] pdfBytes, final long pageIndex, final @Nullable Integer dpi, final @Nullable String password ) throws KreuzbergRsException { try (var arena = Arena.ofShared()) { var cpdfBytes = arena.allocateFrom(ValueLayout.JAVA_BYTE, pdfBytes); var cpdfBytesLen = (long) pdfBytes.length; int cdpi = (dpi == null) ? Integer.MAX_VALUE : dpi; var cpassword = password != null ? arena.allocateFrom(password) : MemorySegment.NULL; var outPtrHolder = arena.allocate(ValueLayout.ADDRESS); var outLenHolder = arena.allocate(ValueLayout.JAVA_LONG); var outCapHolder = arena.allocate(ValueLayout.JAVA_LONG); int rc = (int) NativeLib.KREUZBERG_RENDER_PDF_PAGE_TO_PNG.invoke( cpdfBytes, cpdfBytesLen, pageIndex, cdpi, cpassword, outPtrHolder, outLenHolder, outCapHolder ); if (rc != 0) { checkLastError(); return null; } var outPtr = outPtrHolder.get(ValueLayout.ADDRESS, 0); long outLen = outLenHolder.get(ValueLayout.JAVA_LONG, 0); long outCap = outCapHolder.get(ValueLayout.JAVA_LONG, 0); if (outPtr.equals(MemorySegment.NULL)) { checkLastError(); return null; } byte[] result = outPtr.reinterpret(outLen).toArray(ValueLayout.JAVA_BYTE); NativeLib.KREUZBERG_FREE_BYTES.invoke(outPtr, outLen, outCap); return result; } catch (Throwable e) { throw new KreuzbergRsException("FFI call failed", e); } } /** * Detect the MIME type of a file at the given path. * * Uses the file extension and optionally the file content to determine the MIME type. * Set {@code check_exists} to {@code true} to verify the file exists before detection. */ public static String detectMimeType(final String path, final boolean checkExists) throws KreuzbergRsException { try (var arena = Arena.ofShared()) { var cpath = arena.allocateFrom(path); var resultPtr = (MemorySegment) NativeLib.KREUZBERG_DETECT_MIME_TYPE.invoke(cpath, (checkExists ? 1 : 0)); if (resultPtr.equals(MemorySegment.NULL)) { checkLastError(); return null; } long resultLen = (long) NativeLib.KREUZBERG_DETECT_MIME_TYPE_LEN.invoke(cpath, (checkExists ? 1 : 0)); String str = readCString(resultPtr, resultLen); NativeLib.KREUZBERG_FREE_STRING.invoke(resultPtr); return str; } catch (Throwable e) { throw new KreuzbergRsException("FFI call failed", e); } } /** * Embed a list of texts using the configured embedding model. * * Returns a 2D vector where each inner vector is the embedding for the corresponding text. */ public static List> embedTexts(final List texts, final EmbeddingConfig config) throws KreuzbergRsException { try (var arena = Arena.ofShared()) { var ctextsJson = MAPPER.writerFor(MAPPER.getTypeFactory().constructCollectionType(java.util.List.class, String.class)).writeValueAsString(texts); var ctexts = arena.allocateFrom(ctextsJson); var cconfigJson = config != null ? MAPPER.writeValueAsString(config) : null; var cconfigJsonSeg = cconfigJson != null ? arena.allocateFrom(cconfigJson) : MemorySegment.NULL; var cconfig = cconfigJson != null ? (MemorySegment) NativeLib.KREUZBERG_EMBEDDING_CONFIG_FROM_JSON.invoke(cconfigJsonSeg) : MemorySegment.NULL; if (cconfigJson != null && cconfig.equals(MemorySegment.NULL)) { checkLastError(); throw new IllegalStateException("failed to create config from JSON"); } var resultPtr = (MemorySegment) NativeLib.KREUZBERG_EMBED_TEXTS.invoke(ctexts, cconfig); if (!cconfig.equals(MemorySegment.NULL)) { NativeLib.KREUZBERG_EMBEDDING_CONFIG_FREE.invoke(cconfig); } return readJsonList(resultPtr, new com.fasterxml.jackson.core.type.TypeReference>>() { }); } catch (Throwable e) { throw new KreuzbergRsException("FFI call failed", e); } } /** * Get an embedding preset by name. * * Returns {@code None} if no preset with the given name exists. Returns an owned * clone so the value is safe to pass across FFI boundaries. */ public static Optional getEmbeddingPreset(final String name) throws KreuzbergRsException { try (var arena = Arena.ofShared()) { var cname = arena.allocateFrom(name); var resultPtr = (MemorySegment) NativeLib.KREUZBERG_GET_EMBEDDING_PRESET.invoke(cname); if (resultPtr.equals(MemorySegment.NULL)) { checkLastError(); return Optional.empty(); } // CPD-OFF var jsonPtr = (MemorySegment) NativeLib.KREUZBERG_EMBEDDING_PRESET_TO_JSON.invoke(resultPtr); NativeLib.KREUZBERG_EMBEDDING_PRESET_FREE.invoke(resultPtr); if (jsonPtr.equals(MemorySegment.NULL)) { checkLastError(); return Optional.empty(); } String json = jsonPtr.reinterpret(Long.MAX_VALUE).getString(0); NativeLib.KREUZBERG_FREE_STRING.invoke(jsonPtr); return Optional.of(MAPPER.readValue(json, EmbeddingPreset.class)); // CPD-ON } catch (Throwable e) { throw new KreuzbergRsException("FFI call failed", e); } } /** * List the names of all available embedding presets. * * Returns owned {@code String}s so the values are safe to pass across FFI boundaries. */ public static List listEmbeddingPresets() throws KreuzbergRsException { try (var arena = Arena.ofShared()) { var resultPtr = (MemorySegment) NativeLib.KREUZBERG_LIST_EMBEDDING_PRESETS.invoke(); return readJsonList(resultPtr, new com.fasterxml.jackson.core.type.TypeReference>() { }); } catch (Throwable e) { throw new KreuzbergRsException("FFI call failed", e); } } // Helper methods for FFI marshalling private static void checkLastError() throws Throwable { int errCode = (int) NativeLib.KREUZBERG_LAST_ERROR_CODE.invoke(); if (errCode != 0) { var ctxPtr = (MemorySegment) NativeLib.KREUZBERG_LAST_ERROR_CONTEXT.invoke(); String msg = ctxPtr.reinterpret(Long.MAX_VALUE).getString(0); switch (errCode) { case 1 -> throw new InvalidInputException(msg); case 2 -> throw new ConversionErrorException(msg); default -> throw new KreuzbergRsException(errCode, msg); } } } private static com.fasterxml.jackson.databind.ObjectMapper createObjectMapper() { return new com.fasterxml.jackson.databind.ObjectMapper() .registerModule(new com.fasterxml.jackson.datatype.jdk8.Jdk8Module()) .findAndRegisterModules() .setPropertyNamingStrategy(com.fasterxml.jackson.databind.PropertyNamingStrategies.SNAKE_CASE) .setSerializationInclusion(com.fasterxml.jackson.annotation.JsonInclude.Include.ALWAYS) .configure(com.fasterxml.jackson.databind.MapperFeature.ACCEPT_CASE_INSENSITIVE_ENUMS, true); } private static final com.fasterxml.jackson.databind.ObjectMapper MAPPER = createObjectMapper(); private static String readCString(MemorySegment ptr, long byteLen) { if (ptr == null || ptr.address() == 0) { return null; } return ptr.reinterpret(byteLen + 1).getString(0); } private static java.util.List readJsonList( MemorySegment resultPtr, com.fasterxml.jackson.core.type.TypeReference> typeRef ) throws KreuzbergRsException { try { if (resultPtr.equals(MemorySegment.NULL)) { checkLastError(); return java.util.List.of(); } String json = resultPtr.reinterpret(Long.MAX_VALUE).getString(0); NativeLib.KREUZBERG_FREE_STRING.invoke(resultPtr); return MAPPER.readValue(json, typeRef); } catch (Throwable e) { throw new KreuzbergRsException("FFI call failed", e); } } }