Files
fil/packages/java/dev/kreuzberg/KreuzbergRs.java

1028 lines
50 KiB
Java
Raw Permalink Normal View History

2026-06-01 23:40:55 +02:00
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import java.lang.foreign.Arena;
import java.lang.foreign.MemorySegment;
import java.lang.foreign.ValueLayout;
import java.util.List;
import java.util.Optional;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.CompletionException;
import org.jspecify.annotations.Nullable;
public final class KreuzbergRs {
/** Not instantiable: the methods of this class are static entry points. */
private KreuzbergRs() {
    // intentionally empty
}
/**
 * Extract content from a byte array.
 *
 * <p>This is the main entry point for in-memory extraction. It performs the following steps:
 * <ol>
 *   <li>Validate MIME type</li>
 *   <li>Handle legacy format conversion if needed</li>
 *   <li>Select appropriate extractor from registry</li>
 *   <li>Extract content</li>
 *   <li>Run post-processing pipeline</li>
 * </ol>
 *
 * <p>The configuration is serialized to JSON and turned into a native config handle; the
 * native result is converted to JSON and deserialized into an {@code ExtractionResult}.
 * Every native handle created here is released even when a later step throws.
 *
 * @param content The byte array to extract
 * @param mimeType MIME type of the content
 * @param config Extraction configuration; when {@code null} a NULL config pointer is passed
 * @return An {@code ExtractionResult} containing the extracted content and metadata, or
 *     {@code null} when the native call returns a NULL pointer and {@code checkLastError()}
 *     does not throw
 * @throws KreuzbergRsException Returns {@code KreuzbergError.Validation} if MIME type is invalid.
 *     Returns {@code KreuzbergError.UnsupportedFormat} if MIME type is not supported.
 *     Any other failure is wrapped with the message {@code "FFI call failed"}.
 */
public static ExtractionResult extractBytes(
    final byte[] content,
    final String mimeType,
    final ExtractionConfig config
) throws KreuzbergRsException {
    try (var arena = Arena.ofShared()) {
        var ccontent = arena.allocateFrom(ValueLayout.JAVA_BYTE, content);
        var ccontentLen = (long) content.length;
        var cmimeType = arena.allocateFrom(mimeType);
        var cconfigJson = config != null ? MAPPER.writeValueAsString(config) : null;
        var cconfigJsonSeg = cconfigJson != null ? arena.allocateFrom(cconfigJson) : MemorySegment.NULL;
        var cconfig = cconfigJson != null
            ? (MemorySegment) NativeLib.KREUZBERG_EXTRACTION_CONFIG_FROM_JSON.invoke(cconfigJsonSeg)
            : MemorySegment.NULL;
        if (cconfigJson != null && cconfig.equals(MemorySegment.NULL)) {
            checkLastError();
            throw new IllegalStateException("failed to create config from JSON");
        }
        MemorySegment resultPtr;
        try {
            resultPtr = (MemorySegment) NativeLib.KREUZBERG_EXTRACT_BYTES.invoke(ccontent, ccontentLen, cmimeType, cconfig);
        } finally {
            // Release the native config whether or not the extraction call succeeded.
            if (!cconfig.equals(MemorySegment.NULL)) {
                NativeLib.KREUZBERG_EXTRACTION_CONFIG_FREE.invoke(cconfig);
            }
        }
        if (resultPtr.equals(MemorySegment.NULL)) {
            checkLastError();
            return null;
        }
        // CPD-OFF
        MemorySegment jsonPtr;
        try {
            jsonPtr = (MemorySegment) NativeLib.KREUZBERG_EXTRACTION_RESULT_TO_JSON.invoke(resultPtr);
        } finally {
            NativeLib.KREUZBERG_EXTRACTION_RESULT_FREE.invoke(resultPtr);
        }
        if (jsonPtr.equals(MemorySegment.NULL)) {
            checkLastError();
            throw new KreuzbergRsException("extractBytes: failed to serialize response", null);
        }
        String json;
        try {
            // The native string is NUL-terminated; its length is unknown to Java up front.
            json = jsonPtr.reinterpret(Long.MAX_VALUE).getString(0);
        } finally {
            NativeLib.KREUZBERG_FREE_STRING.invoke(jsonPtr);
        }
        return MAPPER.readValue(json, ExtractionResult.class);
        // CPD-ON
    } catch (KreuzbergRsException e) {
        // Already the documented exception type; re-wrapping would hide its message and details.
        throw e;
    } catch (Throwable e) {
        throw new KreuzbergRsException("FFI call failed", e);
    }
}
/**
 * Asynchronous variant of {@link #extractBytes(byte[], String, ExtractionConfig)}.
 *
 * <p>The extraction is submitted through {@link CompletableFuture#supplyAsync(java.util.function.Supplier)};
 * anything thrown by the extraction is wrapped in a {@link CompletionException}.
 *
 * @param content The byte array to extract
 * @param mimeType MIME type of the content
 * @param config Extraction configuration
 * @return a future completing with the value returned by {@code extractBytes}
 */
public static CompletableFuture<ExtractionResult> extractBytesAsync(
    final byte[] content,
    final String mimeType,
    final ExtractionConfig config
) {
    final java.util.function.Supplier<ExtractionResult> job = () -> {
        try {
            return extractBytes(content, mimeType, config);
        } catch (Throwable failure) {
            throw new CompletionException(failure);
        }
    };
    return CompletableFuture.supplyAsync(job);
}
/**
 * Extract content from a file.
 *
 * <p>This is the main entry point for file-based extraction. It performs the following steps:
 * <ol>
 *   <li>Check cache for existing result (if caching enabled)</li>
 *   <li>Detect or validate MIME type</li>
 *   <li>Select appropriate extractor from registry</li>
 *   <li>Extract content</li>
 *   <li>Run post-processing pipeline</li>
 *   <li>Store result in cache (if caching enabled)</li>
 * </ol>
 *
 * <p>The path is passed to the native side as {@code path.toString()}. Every native handle
 * created here is released even when a later step throws.
 *
 * @param path Path to the file to extract
 * @param mimeType Optional MIME type override. If null, will be auto-detected
 * @param config Extraction configuration; when {@code null} a NULL config pointer is passed
 * @return An {@code ExtractionResult} containing the extracted content and metadata, or
 *     {@code null} when the native call returns a NULL pointer and {@code checkLastError()}
 *     does not throw
 * @throws KreuzbergRsException Returns {@code KreuzbergError.Io} if the file doesn't exist
 *     (NotFound) or for other file I/O errors.
 *     Returns {@code KreuzbergError.UnsupportedFormat} if MIME type is not supported.
 *     Any other failure is wrapped with the message {@code "FFI call failed"}.
 */
public static ExtractionResult extractFile(
    final java.nio.file.Path path,
    final @Nullable String mimeType,
    final ExtractionConfig config
) throws KreuzbergRsException {
    try (var arena = Arena.ofShared()) {
        var cpath = arena.allocateFrom(path.toString());
        var cmimeType = mimeType != null ? arena.allocateFrom(mimeType) : MemorySegment.NULL;
        var cconfigJson = config != null ? MAPPER.writeValueAsString(config) : null;
        var cconfigJsonSeg = cconfigJson != null ? arena.allocateFrom(cconfigJson) : MemorySegment.NULL;
        var cconfig = cconfigJson != null
            ? (MemorySegment) NativeLib.KREUZBERG_EXTRACTION_CONFIG_FROM_JSON.invoke(cconfigJsonSeg)
            : MemorySegment.NULL;
        if (cconfigJson != null && cconfig.equals(MemorySegment.NULL)) {
            checkLastError();
            throw new IllegalStateException("failed to create config from JSON");
        }
        MemorySegment resultPtr;
        try {
            resultPtr = (MemorySegment) NativeLib.KREUZBERG_EXTRACT_FILE.invoke(cpath, cmimeType, cconfig);
        } finally {
            // Release the native config whether or not the extraction call succeeded.
            if (!cconfig.equals(MemorySegment.NULL)) {
                NativeLib.KREUZBERG_EXTRACTION_CONFIG_FREE.invoke(cconfig);
            }
        }
        if (resultPtr.equals(MemorySegment.NULL)) {
            checkLastError();
            return null;
        }
        // CPD-OFF
        MemorySegment jsonPtr;
        try {
            jsonPtr = (MemorySegment) NativeLib.KREUZBERG_EXTRACTION_RESULT_TO_JSON.invoke(resultPtr);
        } finally {
            NativeLib.KREUZBERG_EXTRACTION_RESULT_FREE.invoke(resultPtr);
        }
        if (jsonPtr.equals(MemorySegment.NULL)) {
            checkLastError();
            throw new KreuzbergRsException("extractFile: failed to serialize response", null);
        }
        String json;
        try {
            // The native string is NUL-terminated; its length is unknown to Java up front.
            json = jsonPtr.reinterpret(Long.MAX_VALUE).getString(0);
        } finally {
            NativeLib.KREUZBERG_FREE_STRING.invoke(jsonPtr);
        }
        return MAPPER.readValue(json, ExtractionResult.class);
        // CPD-ON
    } catch (KreuzbergRsException e) {
        // Already the documented exception type; re-wrapping would hide its message and details.
        throw e;
    } catch (Throwable e) {
        throw new KreuzbergRsException("FFI call failed", e);
    }
}
/**
 * Asynchronous variant of {@code extractFile}.
 *
 * <p>The extraction is submitted through {@link CompletableFuture#supplyAsync(java.util.function.Supplier)};
 * anything thrown by the extraction is wrapped in a {@link CompletionException}.
 *
 * @param path Path to the file to extract
 * @param mimeType Optional MIME type override; may be {@code null}, exactly as for
 *     {@code extractFile}, to which the value is forwarded unchanged
 * @param config Extraction configuration
 * @return a future completing with the value returned by {@code extractFile}
 */
public static CompletableFuture<ExtractionResult> extractFileAsync(
    final java.nio.file.Path path,
    final @Nullable String mimeType,
    final ExtractionConfig config
) {
    return CompletableFuture.supplyAsync(() -> {
        try {
            return extractFile(path, mimeType, config);
        } catch (Throwable e) {
            throw new CompletionException(e);
        }
    });
}
/**
 * Synchronous wrapper for {@code extract_file}.
 *
 * <p>This is a convenience function that blocks the current thread until extraction completes.
 * For async code, use {@code extract_file} directly.
 *
 * <p>Uses the global Tokio runtime for 100x+ performance improvement over creating
 * a new runtime per call. Always uses the global runtime to avoid nested runtime issues.
 *
 * <p>This function is only available with the {@code tokio-runtime} feature. For WASM targets,
 * use a truly synchronous extraction approach instead.
 *
 * <p>Every native handle created here is released even when a later step throws.
 *
 * @param path Path to the file to extract (passed to the native side as {@code path.toString()})
 * @param mimeType Optional MIME type; when {@code null} a NULL pointer is passed
 * @param config Extraction configuration; when {@code null} a NULL config pointer is passed
 * @return the deserialized {@code ExtractionResult}, or {@code null} when the native call
 *     returns a NULL pointer and {@code checkLastError()} does not throw
 * @throws KreuzbergRsException if the native call reports an error; any other failure is
 *     wrapped with the message {@code "FFI call failed"}
 */
public static ExtractionResult extractFileSync(
    final java.nio.file.Path path,
    final @Nullable String mimeType,
    final ExtractionConfig config
) throws KreuzbergRsException {
    try (var arena = Arena.ofShared()) {
        var cpath = arena.allocateFrom(path.toString());
        var cmimeType = mimeType != null ? arena.allocateFrom(mimeType) : MemorySegment.NULL;
        var cconfigJson = config != null ? MAPPER.writeValueAsString(config) : null;
        var cconfigJsonSeg = cconfigJson != null ? arena.allocateFrom(cconfigJson) : MemorySegment.NULL;
        var cconfig = cconfigJson != null
            ? (MemorySegment) NativeLib.KREUZBERG_EXTRACTION_CONFIG_FROM_JSON.invoke(cconfigJsonSeg)
            : MemorySegment.NULL;
        if (cconfigJson != null && cconfig.equals(MemorySegment.NULL)) {
            checkLastError();
            throw new IllegalStateException("failed to create config from JSON");
        }
        MemorySegment resultPtr;
        try {
            resultPtr = (MemorySegment) NativeLib.KREUZBERG_EXTRACT_FILE_SYNC.invoke(cpath, cmimeType, cconfig);
        } finally {
            // Release the native config whether or not the extraction call succeeded.
            if (!cconfig.equals(MemorySegment.NULL)) {
                NativeLib.KREUZBERG_EXTRACTION_CONFIG_FREE.invoke(cconfig);
            }
        }
        if (resultPtr.equals(MemorySegment.NULL)) {
            checkLastError();
            return null;
        }
        // CPD-OFF
        MemorySegment jsonPtr;
        try {
            jsonPtr = (MemorySegment) NativeLib.KREUZBERG_EXTRACTION_RESULT_TO_JSON.invoke(resultPtr);
        } finally {
            NativeLib.KREUZBERG_EXTRACTION_RESULT_FREE.invoke(resultPtr);
        }
        if (jsonPtr.equals(MemorySegment.NULL)) {
            checkLastError();
            throw new KreuzbergRsException("extractFileSync: failed to serialize response", null);
        }
        String json;
        try {
            // The native string is NUL-terminated; its length is unknown to Java up front.
            json = jsonPtr.reinterpret(Long.MAX_VALUE).getString(0);
        } finally {
            NativeLib.KREUZBERG_FREE_STRING.invoke(jsonPtr);
        }
        return MAPPER.readValue(json, ExtractionResult.class);
        // CPD-ON
    } catch (KreuzbergRsException e) {
        // Already the documented exception type; re-wrapping would hide its message and details.
        throw e;
    } catch (Throwable e) {
        throw new KreuzbergRsException("FFI call failed", e);
    }
}
/**
 * Synchronous wrapper for {@code extract_bytes}.
 *
 * <p>Uses the global Tokio runtime for 100x+ performance improvement over creating
 * a new runtime per call.
 *
 * <p>With the {@code tokio-runtime} feature, this blocks the current thread using the global
 * Tokio runtime. Without it (WASM), this calls a truly synchronous implementation.
 *
 * <p>Every native handle created here is released even when a later step throws.
 *
 * @param content The byte array to extract
 * @param mimeType MIME type of the content
 * @param config Extraction configuration; when {@code null} a NULL config pointer is passed
 * @return the deserialized {@code ExtractionResult}, or {@code null} when the native call
 *     returns a NULL pointer and {@code checkLastError()} does not throw
 * @throws KreuzbergRsException if the native call reports an error; any other failure is
 *     wrapped with the message {@code "FFI call failed"}
 */
public static ExtractionResult extractBytesSync(
    final byte[] content,
    final String mimeType,
    final ExtractionConfig config
) throws KreuzbergRsException {
    try (var arena = Arena.ofShared()) {
        var ccontent = arena.allocateFrom(ValueLayout.JAVA_BYTE, content);
        var ccontentLen = (long) content.length;
        var cmimeType = arena.allocateFrom(mimeType);
        var cconfigJson = config != null ? MAPPER.writeValueAsString(config) : null;
        var cconfigJsonSeg = cconfigJson != null ? arena.allocateFrom(cconfigJson) : MemorySegment.NULL;
        var cconfig = cconfigJson != null
            ? (MemorySegment) NativeLib.KREUZBERG_EXTRACTION_CONFIG_FROM_JSON.invoke(cconfigJsonSeg)
            : MemorySegment.NULL;
        if (cconfigJson != null && cconfig.equals(MemorySegment.NULL)) {
            checkLastError();
            throw new IllegalStateException("failed to create config from JSON");
        }
        MemorySegment resultPtr;
        try {
            resultPtr = (MemorySegment) NativeLib.KREUZBERG_EXTRACT_BYTES_SYNC.invoke(ccontent, ccontentLen, cmimeType, cconfig);
        } finally {
            // Release the native config whether or not the extraction call succeeded.
            if (!cconfig.equals(MemorySegment.NULL)) {
                NativeLib.KREUZBERG_EXTRACTION_CONFIG_FREE.invoke(cconfig);
            }
        }
        if (resultPtr.equals(MemorySegment.NULL)) {
            checkLastError();
            return null;
        }
        // CPD-OFF
        MemorySegment jsonPtr;
        try {
            jsonPtr = (MemorySegment) NativeLib.KREUZBERG_EXTRACTION_RESULT_TO_JSON.invoke(resultPtr);
        } finally {
            NativeLib.KREUZBERG_EXTRACTION_RESULT_FREE.invoke(resultPtr);
        }
        if (jsonPtr.equals(MemorySegment.NULL)) {
            checkLastError();
            throw new KreuzbergRsException("extractBytesSync: failed to serialize response", null);
        }
        String json;
        try {
            // The native string is NUL-terminated; its length is unknown to Java up front.
            json = jsonPtr.reinterpret(Long.MAX_VALUE).getString(0);
        } finally {
            NativeLib.KREUZBERG_FREE_STRING.invoke(jsonPtr);
        }
        return MAPPER.readValue(json, ExtractionResult.class);
        // CPD-ON
    } catch (KreuzbergRsException e) {
        // Already the documented exception type; re-wrapping would hide its message and details.
        throw e;
    } catch (Throwable e) {
        throw new KreuzbergRsException("FFI call failed", e);
    }
}
/**
 * Synchronous wrapper for {@code batch_extract_files}.
 *
 * <p>Uses the global Tokio runtime for optimal performance.
 * Only available with {@code tokio-runtime} (WASM has no filesystem).
 *
 * <p>The items are sent to the native side as a JSON array; the returned pointer is decoded
 * by {@code readJsonList}. The native config handle is released even if the native call throws.
 *
 * @param items files to extract
 * @param config Batch-level extraction configuration; when {@code null} a NULL config pointer is passed
 * @return the list decoded from the native response
 * @throws KreuzbergRsException if the native call reports an error; any other failure is
 *     wrapped with the message {@code "FFI call failed"}
 */
public static List<ExtractionResult> batchExtractFilesSync(
    final List<BatchFileItem> items,
    final ExtractionConfig config
) throws KreuzbergRsException {
    try (var arena = Arena.ofShared()) {
        var citemsJson = MAPPER.writerFor(MAPPER.getTypeFactory().constructCollectionType(java.util.List.class, BatchFileItem.class)).writeValueAsString(items);
        var citems = arena.allocateFrom(citemsJson);
        var cconfigJson = config != null ? MAPPER.writeValueAsString(config) : null;
        var cconfigJsonSeg = cconfigJson != null ? arena.allocateFrom(cconfigJson) : MemorySegment.NULL;
        var cconfig = cconfigJson != null
            ? (MemorySegment) NativeLib.KREUZBERG_EXTRACTION_CONFIG_FROM_JSON.invoke(cconfigJsonSeg)
            : MemorySegment.NULL;
        if (cconfigJson != null && cconfig.equals(MemorySegment.NULL)) {
            checkLastError();
            throw new IllegalStateException("failed to create config from JSON");
        }
        MemorySegment resultPtr;
        try {
            resultPtr = (MemorySegment) NativeLib.KREUZBERG_BATCH_EXTRACT_FILES_SYNC.invoke(citems, cconfig);
        } finally {
            // Release the native config whether or not the batch call succeeded.
            if (!cconfig.equals(MemorySegment.NULL)) {
                NativeLib.KREUZBERG_EXTRACTION_CONFIG_FREE.invoke(cconfig);
            }
        }
        return readJsonList(resultPtr, new com.fasterxml.jackson.core.type.TypeReference<java.util.List<ExtractionResult>>() { });
    } catch (KreuzbergRsException e) {
        // Already the documented exception type; re-wrapping would hide its message and details.
        throw e;
    } catch (Throwable e) {
        throw new KreuzbergRsException("FFI call failed", e);
    }
}
/**
 * Synchronous wrapper for {@code batch_extract_bytes}.
 *
 * <p>Uses the global Tokio runtime for optimal performance.
 * With the {@code tokio-runtime} feature, this blocks the current thread using the global
 * Tokio runtime. Without it (WASM), this calls a truly synchronous implementation
 * that iterates through items and calls {@code extract_bytes_sync()}.
 *
 * <p>The items are sent to the native side as a JSON array; the returned pointer is decoded
 * by {@code readJsonList}. The native config handle is released even if the native call throws.
 *
 * @param items in-memory documents to extract
 * @param config Batch-level extraction configuration; when {@code null} a NULL config pointer is passed
 * @return the list decoded from the native response
 * @throws KreuzbergRsException if the native call reports an error; any other failure is
 *     wrapped with the message {@code "FFI call failed"}
 */
public static List<ExtractionResult> batchExtractBytesSync(
    final List<BatchBytesItem> items,
    final ExtractionConfig config
) throws KreuzbergRsException {
    try (var arena = Arena.ofShared()) {
        var citemsJson = MAPPER.writerFor(MAPPER.getTypeFactory().constructCollectionType(java.util.List.class, BatchBytesItem.class)).writeValueAsString(items);
        var citems = arena.allocateFrom(citemsJson);
        var cconfigJson = config != null ? MAPPER.writeValueAsString(config) : null;
        var cconfigJsonSeg = cconfigJson != null ? arena.allocateFrom(cconfigJson) : MemorySegment.NULL;
        var cconfig = cconfigJson != null
            ? (MemorySegment) NativeLib.KREUZBERG_EXTRACTION_CONFIG_FROM_JSON.invoke(cconfigJsonSeg)
            : MemorySegment.NULL;
        if (cconfigJson != null && cconfig.equals(MemorySegment.NULL)) {
            checkLastError();
            throw new IllegalStateException("failed to create config from JSON");
        }
        MemorySegment resultPtr;
        try {
            resultPtr = (MemorySegment) NativeLib.KREUZBERG_BATCH_EXTRACT_BYTES_SYNC.invoke(citems, cconfig);
        } finally {
            // Release the native config whether or not the batch call succeeded.
            if (!cconfig.equals(MemorySegment.NULL)) {
                NativeLib.KREUZBERG_EXTRACTION_CONFIG_FREE.invoke(cconfig);
            }
        }
        return readJsonList(resultPtr, new com.fasterxml.jackson.core.type.TypeReference<java.util.List<ExtractionResult>>() { });
    } catch (KreuzbergRsException e) {
        // Already the documented exception type; re-wrapping would hide its message and details.
        throw e;
    } catch (Throwable e) {
        throw new KreuzbergRsException("FFI call failed", e);
    }
}
/**
 * Extract content from multiple files concurrently.
 *
 * <p>This function processes multiple files in parallel, automatically managing
 * concurrency to prevent resource exhaustion. The concurrency limit can be
 * configured via {@code ExtractionConfig.max_concurrent_extractions} or defaults
 * to {@code (num_cpus * 1.5).ceil()}.
 *
 * <p>Each file can optionally specify a FileExtractionConfig that overrides specific
 * fields from the batch-level {@code config}. Pass {@code None} for a file to use the batch defaults.
 * Batch-level settings like {@code max_concurrent_extractions} and {@code use_cache} are always
 * taken from the batch-level {@code config}.
 *
 * <p>The native config handle is released even if the native call throws.
 *
 * @param items List of {@code BatchFileItem}, each containing a path and optional per-file
 *     configuration overrides.
 * @param config Batch-level extraction configuration (provides defaults and batch settings);
 *     when {@code null} a NULL config pointer is passed
 * @return A list of {@code ExtractionResult} in the same order as the input items.
 * @throws KreuzbergRsException Individual file errors are captured in the result metadata. System errors
 *     (IO, RuntimeError equivalents) will bubble up and fail the entire batch.
 */
public static List<ExtractionResult> batchExtractFiles(
    final List<BatchFileItem> items,
    final ExtractionConfig config
) throws KreuzbergRsException {
    try (var arena = Arena.ofShared()) {
        var citemsJson = MAPPER.writerFor(MAPPER.getTypeFactory().constructCollectionType(java.util.List.class, BatchFileItem.class)).writeValueAsString(items);
        var citems = arena.allocateFrom(citemsJson);
        var cconfigJson = config != null ? MAPPER.writeValueAsString(config) : null;
        var cconfigJsonSeg = cconfigJson != null ? arena.allocateFrom(cconfigJson) : MemorySegment.NULL;
        var cconfig = cconfigJson != null
            ? (MemorySegment) NativeLib.KREUZBERG_EXTRACTION_CONFIG_FROM_JSON.invoke(cconfigJsonSeg)
            : MemorySegment.NULL;
        if (cconfigJson != null && cconfig.equals(MemorySegment.NULL)) {
            checkLastError();
            throw new IllegalStateException("failed to create config from JSON");
        }
        MemorySegment resultPtr;
        try {
            resultPtr = (MemorySegment) NativeLib.KREUZBERG_BATCH_EXTRACT_FILES.invoke(citems, cconfig);
        } finally {
            // Release the native config whether or not the batch call succeeded.
            if (!cconfig.equals(MemorySegment.NULL)) {
                NativeLib.KREUZBERG_EXTRACTION_CONFIG_FREE.invoke(cconfig);
            }
        }
        return readJsonList(resultPtr, new com.fasterxml.jackson.core.type.TypeReference<java.util.List<ExtractionResult>>() { });
    } catch (KreuzbergRsException e) {
        // Already the documented exception type; re-wrapping would hide its message and details.
        throw e;
    } catch (Throwable e) {
        throw new KreuzbergRsException("FFI call failed", e);
    }
}
/**
 * Asynchronous variant of {@code batchExtractFiles}.
 *
 * <p>The batch is submitted through {@link CompletableFuture#supplyAsync(java.util.function.Supplier)};
 * anything thrown by the batch call is wrapped in a {@link CompletionException}.
 *
 * @param items files to extract
 * @param config Batch-level extraction configuration
 * @return a future completing with the list returned by {@code batchExtractFiles}
 */
public static CompletableFuture<List<ExtractionResult>> batchExtractFilesAsync(
    final List<BatchFileItem> items,
    final ExtractionConfig config
) {
    final java.util.function.Supplier<List<ExtractionResult>> job = () -> {
        try {
            return batchExtractFiles(items, config);
        } catch (Throwable failure) {
            throw new CompletionException(failure);
        }
    };
    return CompletableFuture.supplyAsync(job);
}
/**
 * Extract content from multiple byte arrays concurrently.
 *
 * <p>This function processes multiple byte arrays in parallel, automatically managing
 * concurrency to prevent resource exhaustion. The concurrency limit can be
 * configured via {@code ExtractionConfig.max_concurrent_extractions} or defaults
 * to {@code (num_cpus * 1.5).ceil()}.
 *
 * <p>Each item can optionally specify a FileExtractionConfig that overrides specific
 * fields from the batch-level {@code config}. Pass {@code None} as the config to use
 * the batch-level defaults for that item.
 *
 * <p>The native config handle is released even if the native call throws.
 *
 * @param items List of {@code BatchBytesItem}, each containing content bytes, MIME type, and
 *     optional per-item configuration overrides.
 * @param config Batch-level extraction configuration; when {@code null} a NULL config pointer is passed
 * @return A list of {@code ExtractionResult} in the same order as the input items.
 * @throws KreuzbergRsException if the native call reports an error; any other failure is
 *     wrapped with the message {@code "FFI call failed"}
 */
public static List<ExtractionResult> batchExtractBytes(
    final List<BatchBytesItem> items,
    final ExtractionConfig config
) throws KreuzbergRsException {
    try (var arena = Arena.ofShared()) {
        var citemsJson = MAPPER.writerFor(MAPPER.getTypeFactory().constructCollectionType(java.util.List.class, BatchBytesItem.class)).writeValueAsString(items);
        var citems = arena.allocateFrom(citemsJson);
        var cconfigJson = config != null ? MAPPER.writeValueAsString(config) : null;
        var cconfigJsonSeg = cconfigJson != null ? arena.allocateFrom(cconfigJson) : MemorySegment.NULL;
        var cconfig = cconfigJson != null
            ? (MemorySegment) NativeLib.KREUZBERG_EXTRACTION_CONFIG_FROM_JSON.invoke(cconfigJsonSeg)
            : MemorySegment.NULL;
        if (cconfigJson != null && cconfig.equals(MemorySegment.NULL)) {
            checkLastError();
            throw new IllegalStateException("failed to create config from JSON");
        }
        MemorySegment resultPtr;
        try {
            resultPtr = (MemorySegment) NativeLib.KREUZBERG_BATCH_EXTRACT_BYTES.invoke(citems, cconfig);
        } finally {
            // Release the native config whether or not the batch call succeeded.
            if (!cconfig.equals(MemorySegment.NULL)) {
                NativeLib.KREUZBERG_EXTRACTION_CONFIG_FREE.invoke(cconfig);
            }
        }
        return readJsonList(resultPtr, new com.fasterxml.jackson.core.type.TypeReference<java.util.List<ExtractionResult>>() { });
    } catch (KreuzbergRsException e) {
        // Already the documented exception type; re-wrapping would hide its message and details.
        throw e;
    } catch (Throwable e) {
        throw new KreuzbergRsException("FFI call failed", e);
    }
}
/**
 * Asynchronous variant of {@code batchExtractBytes}.
 *
 * <p>The batch is submitted through {@link CompletableFuture#supplyAsync(java.util.function.Supplier)};
 * anything thrown by the batch call is wrapped in a {@link CompletionException}.
 *
 * @param items in-memory documents to extract
 * @param config Batch-level extraction configuration
 * @return a future completing with the list returned by {@code batchExtractBytes}
 */
public static CompletableFuture<List<ExtractionResult>> batchExtractBytesAsync(
    final List<BatchBytesItem> items,
    final ExtractionConfig config
) {
    final java.util.function.Supplier<List<ExtractionResult>> job = () -> {
        try {
            return batchExtractBytes(items, config);
        } catch (Throwable failure) {
            throw new CompletionException(failure);
        }
    };
    return CompletableFuture.supplyAsync(job);
}
/**
 * Detect MIME type from raw file bytes.
 *
 * <p>Uses magic byte signatures to detect file type from content.
 * Falls back to {@code infer} crate for comprehensive detection.
 *
 * <p>For ZIP-based files, inspects contents to distinguish Office Open XML
 * formats (DOCX, XLSX, PPTX) from plain ZIP archives.
 *
 * <p>The native string is released even if querying its length or decoding it throws.
 *
 * @param content Raw file bytes
 * @return The detected MIME type string, or {@code null} when the native call returns a NULL
 *     pointer and {@code checkLastError()} does not throw
 * @throws KreuzbergRsException Returns {@code KreuzbergError.UnsupportedFormat} if MIME type
 *     cannot be determined. Any other failure is wrapped with the message {@code "FFI call failed"}.
 */
public static String detectMimeTypeFromBytes(final byte[] content) throws KreuzbergRsException {
    try (var arena = Arena.ofShared()) {
        var ccontent = arena.allocateFrom(ValueLayout.JAVA_BYTE, content);
        var ccontentLen = (long) content.length;
        var resultPtr = (MemorySegment) NativeLib.KREUZBERG_DETECT_MIME_TYPE_FROM_BYTES.invoke(ccontent, ccontentLen);
        if (resultPtr.equals(MemorySegment.NULL)) {
            checkLastError();
            return null;
        }
        try {
            // The length comes from a companion native call that takes the same input.
            long resultLen = (long) NativeLib.KREUZBERG_DETECT_MIME_TYPE_FROM_BYTES_LEN.invoke(ccontent, ccontentLen);
            return readCString(resultPtr, resultLen);
        } finally {
            NativeLib.KREUZBERG_FREE_STRING.invoke(resultPtr);
        }
    } catch (KreuzbergRsException e) {
        // Already the documented exception type; re-wrapping would hide its message and details.
        throw e;
    } catch (Throwable e) {
        throw new KreuzbergRsException("FFI call failed", e);
    }
}
/**
 * Get file extensions for a given MIME type.
 *
 * <p>Returns all known file extensions that map to the specified MIME type.
 *
 * @param mimeType The MIME type to look up
 * @return A list of file extensions (without leading dot) for the MIME type.
 * @throws KreuzbergRsException wrapping anything thrown by the native call or while decoding
 *     its response
 */
public static List<String> getExtensionsForMime(final String mimeType) throws KreuzbergRsException {
    try (var scope = Arena.ofShared()) {
        final MemorySegment nativeMime = scope.allocateFrom(mimeType);
        final MemorySegment extensionsPtr =
            (MemorySegment) NativeLib.KREUZBERG_GET_EXTENSIONS_FOR_MIME.invoke(nativeMime);
        final com.fasterxml.jackson.core.type.TypeReference<java.util.List<String>> listType =
            new com.fasterxml.jackson.core.type.TypeReference<java.util.List<String>>() { };
        return readJsonList(extensionsPtr, listType);
    } catch (Throwable failure) {
        throw new KreuzbergRsException("FFI call failed", failure);
    }
}
/**
 * Clear all embedding backends from the global registry.
 *
 * <p>Calls {@code shutdown()} on every registered backend, then empties the registry.
 *
 * @throws KreuzbergRsException Any error returned by a backend's {@code shutdown()} method. The
 *     first error encountered stops processing of remaining backends. A non-zero native return
 *     code is reported with that code and the native message (or {@code "clear failed (rc=N)"}
 *     when no message is provided); any other failure is wrapped with the message
 *     {@code "FFI call failed"}.
 */
public static void clearEmbeddingBackends() throws KreuzbergRsException {
    try (var arena = Arena.ofShared()) {
        // Out-parameter slot through which the native side hands back an error-message pointer.
        var outErr = arena.allocate(ValueLayout.ADDRESS);
        var primitiveResult = (int) NativeLib.KREUZBERG_CLEAR_EMBEDDING_BACKEND.invoke(outErr);
        if (primitiveResult != 0) {
            MemorySegment errPtr = outErr.get(ValueLayout.ADDRESS, 0);
            // NOTE(review): the native message is not released here - confirm who owns it.
            String msg = errPtr.equals(MemorySegment.NULL)
                ? "clear failed (rc=" + primitiveResult + ")"
                : errPtr.reinterpret(Long.MAX_VALUE).getString(0);
            throw new KreuzbergRsException(primitiveResult, msg);
        }
    } catch (KreuzbergRsException e) {
        // Keep the return code and native message at the top level instead of burying them as a cause.
        throw e;
    } catch (Throwable e) {
        throw new KreuzbergRsException("FFI call failed", e);
    }
}
/**
 * List the names of all registered embedding backends.
 *
 * <p>Used by {@code kreuzberg-cli}, the api/mcp endpoints, and generated language
 * bindings.
 *
 * @return the names decoded from the native response by {@code readJsonList}
 * @throws KreuzbergRsException wrapping anything thrown by the native call or while decoding
 *     its response
 */
public static List<String> listEmbeddingBackends() throws KreuzbergRsException {
    // No arena is opened: the call takes no arguments and nothing is allocated on the Java side.
    try {
        var resultPtr = (MemorySegment) NativeLib.KREUZBERG_LIST_EMBEDDING_BACKENDS.invoke();
        return readJsonList(resultPtr, new com.fasterxml.jackson.core.type.TypeReference<java.util.List<String>>() { });
    } catch (Throwable e) {
        throw new KreuzbergRsException("FFI call failed", e);
    }
}
/**
 * List names of all registered document extractors.
 *
 * @return the names decoded from the native response by {@code readJsonList}
 * @throws KreuzbergRsException wrapping anything thrown by the native call or while decoding
 *     its response
 */
public static List<String> listDocumentExtractors() throws KreuzbergRsException {
    // No arena is opened: the call takes no arguments and nothing is allocated on the Java side.
    try {
        var resultPtr = (MemorySegment) NativeLib.KREUZBERG_LIST_DOCUMENT_EXTRACTORS.invoke();
        return readJsonList(resultPtr, new com.fasterxml.jackson.core.type.TypeReference<java.util.List<String>>() { });
    } catch (Throwable e) {
        throw new KreuzbergRsException("FFI call failed", e);
    }
}
/**
 * Clear all document extractors from the global registry.
 *
 * <p>Calls {@code shutdown()} on every registered extractor, then empties the registry.
 *
 * @throws KreuzbergRsException Any error returned by an extractor's {@code shutdown()} method.
 *     The first error encountered stops processing of remaining extractors. A non-zero native
 *     return code is reported with that code and the native message (or
 *     {@code "clear failed (rc=N)"} when no message is provided); any other failure is wrapped
 *     with the message {@code "FFI call failed"}.
 */
public static void clearDocumentExtractors() throws KreuzbergRsException {
    try (var arena = Arena.ofShared()) {
        // Out-parameter slot through which the native side hands back an error-message pointer.
        var outErr = arena.allocate(ValueLayout.ADDRESS);
        var primitiveResult = (int) NativeLib.KREUZBERG_CLEAR_DOCUMENT_EXTRACTOR.invoke(outErr);
        if (primitiveResult != 0) {
            MemorySegment errPtr = outErr.get(ValueLayout.ADDRESS, 0);
            // NOTE(review): the native message is not released here - confirm who owns it.
            String msg = errPtr.equals(MemorySegment.NULL)
                ? "clear failed (rc=" + primitiveResult + ")"
                : errPtr.reinterpret(Long.MAX_VALUE).getString(0);
            throw new KreuzbergRsException(primitiveResult, msg);
        }
    } catch (KreuzbergRsException e) {
        // Keep the return code and native message at the top level instead of burying them as a cause.
        throw e;
    } catch (Throwable e) {
        throw new KreuzbergRsException("FFI call failed", e);
    }
}
/**
 * List all registered OCR backends.
 *
 * <p>Returns the names of all OCR backends currently registered in the global registry.
 *
 * @return A list of OCR backend names.
 * @throws KreuzbergRsException wrapping anything thrown by the native call or while decoding
 *     its response
 */
public static List<String> listOcrBackends() throws KreuzbergRsException {
    // No arena is opened: the call takes no arguments and nothing is allocated on the Java side.
    try {
        var resultPtr = (MemorySegment) NativeLib.KREUZBERG_LIST_OCR_BACKENDS.invoke();
        return readJsonList(resultPtr, new com.fasterxml.jackson.core.type.TypeReference<java.util.List<String>>() { });
    } catch (Throwable e) {
        throw new KreuzbergRsException("FFI call failed", e);
    }
}
/**
 * Clear all OCR backends from the global registry.
 *
 * <p>Removes all OCR backends and calls their {@code shutdown()} methods. The method returns
 * normally if all backends were cleared successfully.
 *
 * @throws KreuzbergRsException if any shutdown method failed. A non-zero native return code is
 *     reported with that code and the native message (or {@code "clear failed (rc=N)"} when no
 *     message is provided); any other failure is wrapped with the message
 *     {@code "FFI call failed"}.
 */
public static void clearOcrBackends() throws KreuzbergRsException {
    try (var arena = Arena.ofShared()) {
        // Out-parameter slot through which the native side hands back an error-message pointer.
        var outErr = arena.allocate(ValueLayout.ADDRESS);
        var primitiveResult = (int) NativeLib.KREUZBERG_CLEAR_OCR_BACKEND.invoke(outErr);
        if (primitiveResult != 0) {
            MemorySegment errPtr = outErr.get(ValueLayout.ADDRESS, 0);
            // NOTE(review): the native message is not released here - confirm who owns it.
            String msg = errPtr.equals(MemorySegment.NULL)
                ? "clear failed (rc=" + primitiveResult + ")"
                : errPtr.reinterpret(Long.MAX_VALUE).getString(0);
            throw new KreuzbergRsException(primitiveResult, msg);
        }
    } catch (KreuzbergRsException e) {
        // Keep the return code and native message at the top level instead of burying them as a cause.
        throw e;
    } catch (Throwable e) {
        throw new KreuzbergRsException("FFI call failed", e);
    }
}
/**
 * List all registered post-processor names.
 *
 * <p>Returns all post-processor names currently registered in the global registry.
 *
 * @return the post-processor names
 * @throws KreuzbergRsException if the registry lock is poisoned; also wraps anything thrown by
 *     the native call or while decoding its response
 */
public static List<String> listPostProcessors() throws KreuzbergRsException {
    // No arena is opened: the call takes no arguments and nothing is allocated on the Java side.
    try {
        var resultPtr = (MemorySegment) NativeLib.KREUZBERG_LIST_POST_PROCESSORS.invoke();
        return readJsonList(resultPtr, new com.fasterxml.jackson.core.type.TypeReference<java.util.List<String>>() { });
    } catch (Throwable e) {
        throw new KreuzbergRsException("FFI call failed", e);
    }
}
/**
 * Remove all registered post-processors.
 *
 * @throws KreuzbergRsException if the native call returns a non-zero code, carrying that code
 *     and the native message (or {@code "clear failed (rc=N)"} when no message is provided);
 *     any other failure is wrapped with the message {@code "FFI call failed"}
 */
public static void clearPostProcessors() throws KreuzbergRsException {
    try (var arena = Arena.ofShared()) {
        // Out-parameter slot through which the native side hands back an error-message pointer.
        var outErr = arena.allocate(ValueLayout.ADDRESS);
        var primitiveResult = (int) NativeLib.KREUZBERG_CLEAR_POST_PROCESSOR.invoke(outErr);
        if (primitiveResult != 0) {
            MemorySegment errPtr = outErr.get(ValueLayout.ADDRESS, 0);
            // NOTE(review): the native message is not released here - confirm who owns it.
            String msg = errPtr.equals(MemorySegment.NULL)
                ? "clear failed (rc=" + primitiveResult + ")"
                : errPtr.reinterpret(Long.MAX_VALUE).getString(0);
            throw new KreuzbergRsException(primitiveResult, msg);
        }
    } catch (KreuzbergRsException e) {
        // Keep the return code and native message at the top level instead of burying them as a cause.
        throw e;
    } catch (Throwable e) {
        throw new KreuzbergRsException("FFI call failed", e);
    }
}
/**
 * List names of all registered renderers.
 *
 * @return the names decoded from the native response by {@code readJsonList}
 * @throws KreuzbergRsException Returns an error if the registry lock is poisoned; also wraps
 *     anything thrown by the native call or while decoding its response
 */
public static List<String> listRenderers() throws KreuzbergRsException {
    // No arena is opened: the call takes no arguments and nothing is allocated on the Java side.
    try {
        var resultPtr = (MemorySegment) NativeLib.KREUZBERG_LIST_RENDERERS.invoke();
        return readJsonList(resultPtr, new com.fasterxml.jackson.core.type.TypeReference<java.util.List<String>>() { });
    } catch (Throwable e) {
        throw new KreuzbergRsException("FFI call failed", e);
    }
}
/**
 * Clear all renderers from the global registry.
 *
 * <p>Removes every renderer, including the built-in defaults (markdown, html,
 * djot, plain). After calling this no renderers are registered; re-register
 * as needed.
 *
 * @throws KreuzbergRsException Returns an error if the registry lock is poisoned. A non-zero
 *     native return code is reported with that code and the native message (or
 *     {@code "clear failed (rc=N)"} when no message is provided); any other failure is wrapped
 *     with the message {@code "FFI call failed"}.
 */
public static void clearRenderers() throws KreuzbergRsException {
    try (var arena = Arena.ofShared()) {
        // Out-parameter slot through which the native side hands back an error-message pointer.
        var outErr = arena.allocate(ValueLayout.ADDRESS);
        var primitiveResult = (int) NativeLib.KREUZBERG_CLEAR_RENDERER.invoke(outErr);
        if (primitiveResult != 0) {
            MemorySegment errPtr = outErr.get(ValueLayout.ADDRESS, 0);
            // NOTE(review): the native message is not released here - confirm who owns it.
            String msg = errPtr.equals(MemorySegment.NULL)
                ? "clear failed (rc=" + primitiveResult + ")"
                : errPtr.reinterpret(Long.MAX_VALUE).getString(0);
            throw new KreuzbergRsException(primitiveResult, msg);
        }
    } catch (KreuzbergRsException e) {
        // Keep the return code and native message at the top level instead of burying them as a cause.
        throw e;
    } catch (Throwable e) {
        throw new KreuzbergRsException("FFI call failed", e);
    }
}
/**
 * List names of all registered validators.
 *
 * @return the names decoded from the native response by {@code readJsonList}
 * @throws KreuzbergRsException wrapping anything thrown by the native call or while decoding
 *     its response
 */
public static List<String> listValidators() throws KreuzbergRsException {
    // No arena is opened: the call takes no arguments and nothing is allocated on the Java side.
    try {
        var resultPtr = (MemorySegment) NativeLib.KREUZBERG_LIST_VALIDATORS.invoke();
        return readJsonList(resultPtr, new com.fasterxml.jackson.core.type.TypeReference<java.util.List<String>>() { });
    } catch (Throwable e) {
        throw new KreuzbergRsException("FFI call failed", e);
    }
}
/**
 * Remove all registered validators.
 *
 * @throws KreuzbergRsException if the native call returns a non-zero code, carrying that code
 *     and the native message (or {@code "clear failed (rc=N)"} when no message is provided);
 *     any other failure is wrapped with the message {@code "FFI call failed"}
 */
public static void clearValidators() throws KreuzbergRsException {
    try (var arena = Arena.ofShared()) {
        // Out-parameter slot through which the native side hands back an error-message pointer.
        var outErr = arena.allocate(ValueLayout.ADDRESS);
        var primitiveResult = (int) NativeLib.KREUZBERG_CLEAR_VALIDATOR.invoke(outErr);
        if (primitiveResult != 0) {
            MemorySegment errPtr = outErr.get(ValueLayout.ADDRESS, 0);
            // NOTE(review): the native message is not released here - confirm who owns it.
            String msg = errPtr.equals(MemorySegment.NULL)
                ? "clear failed (rc=" + primitiveResult + ")"
                : errPtr.reinterpret(Long.MAX_VALUE).getString(0);
            throw new KreuzbergRsException(primitiveResult, msg);
        }
    } catch (KreuzbergRsException e) {
        // Keep the return code and native message at the top level instead of burying them as a cause.
        throw e;
    } catch (Throwable e) {
        throw new KreuzbergRsException("FFI call failed", e);
    }
}
/**
 * Compare two extraction results and return a structured diff.
 *
 * <p>The comparison is purely structural: no I/O, no side effects. All fields
 * of ExtractionDiff are populated according to the provided DiffOptions.
 *
 * <p>Each non-null argument is serialized to JSON and converted into a native handle; a
 * {@code null} argument is passed as a NULL pointer. All handles created here are released
 * even when a later conversion or the comparison itself fails.
 *
 * @param a the "before" extraction result
 * @param b the "after" extraction result
 * @param opts controls which sections are compared and optional truncation
 * @return the deserialized {@code ExtractionDiff}, or {@code null} when the native call
 *     returns a NULL pointer and {@code checkLastError()} does not throw
 * @throws KreuzbergRsException if the native call reports an error; any other failure is
 *     wrapped with the message {@code "FFI call failed"}
 */
public static ExtractionDiff compare(
    final ExtractionResult a,
    final ExtractionResult b,
    final DiffOptions opts
) throws KreuzbergRsException {
    try (var arena = Arena.ofShared()) {
        MemorySegment ca = MemorySegment.NULL;
        MemorySegment cb = MemorySegment.NULL;
        MemorySegment copts = MemorySegment.NULL;
        MemorySegment resultPtr;
        try {
            var caJson = a != null ? MAPPER.writeValueAsString(a) : null;
            if (caJson != null) {
                ca = (MemorySegment) NativeLib.KREUZBERG_EXTRACTION_RESULT_FROM_JSON.invoke(arena.allocateFrom(caJson));
                if (ca.equals(MemorySegment.NULL)) {
                    checkLastError();
                    throw new IllegalStateException("failed to create a from JSON");
                }
            }
            var cbJson = b != null ? MAPPER.writeValueAsString(b) : null;
            if (cbJson != null) {
                cb = (MemorySegment) NativeLib.KREUZBERG_EXTRACTION_RESULT_FROM_JSON.invoke(arena.allocateFrom(cbJson));
                if (cb.equals(MemorySegment.NULL)) {
                    checkLastError();
                    throw new IllegalStateException("failed to create b from JSON");
                }
            }
            var coptsJson = opts != null ? MAPPER.writeValueAsString(opts) : null;
            if (coptsJson != null) {
                copts = (MemorySegment) NativeLib.KREUZBERG_DIFF_OPTIONS_FROM_JSON.invoke(arena.allocateFrom(coptsJson));
                if (copts.equals(MemorySegment.NULL)) {
                    checkLastError();
                    throw new IllegalStateException("failed to create opts from JSON");
                }
            }
            resultPtr = (MemorySegment) NativeLib.KREUZBERG_COMPARE.invoke(ca, cb, copts);
        } finally {
            // Free whichever handles exist, including after a failure part-way through setup.
            if (!ca.equals(MemorySegment.NULL)) {
                NativeLib.KREUZBERG_EXTRACTION_RESULT_FREE.invoke(ca);
            }
            if (!cb.equals(MemorySegment.NULL)) {
                NativeLib.KREUZBERG_EXTRACTION_RESULT_FREE.invoke(cb);
            }
            if (!copts.equals(MemorySegment.NULL)) {
                NativeLib.KREUZBERG_DIFF_OPTIONS_FREE.invoke(copts);
            }
        }
        if (resultPtr.equals(MemorySegment.NULL)) {
            checkLastError();
            return null;
        }
        // CPD-OFF
        MemorySegment jsonPtr;
        try {
            jsonPtr = (MemorySegment) NativeLib.KREUZBERG_EXTRACTION_DIFF_TO_JSON.invoke(resultPtr);
        } finally {
            NativeLib.KREUZBERG_EXTRACTION_DIFF_FREE.invoke(resultPtr);
        }
        if (jsonPtr.equals(MemorySegment.NULL)) {
            checkLastError();
            throw new KreuzbergRsException("compare: failed to serialize response", null);
        }
        String json;
        try {
            // The native string is NUL-terminated; its length is unknown to Java up front.
            json = jsonPtr.reinterpret(Long.MAX_VALUE).getString(0);
        } finally {
            NativeLib.KREUZBERG_FREE_STRING.invoke(jsonPtr);
        }
        return MAPPER.readValue(json, ExtractionDiff.class);
        // CPD-ON
    } catch (KreuzbergRsException e) {
        // Already the documented exception type; re-wrapping would hide its message and details.
        throw e;
    } catch (Throwable e) {
        throw new KreuzbergRsException("FFI call failed", e);
    }
}
/**
 * Generate embeddings for a list of text strings through the native async entry point.
 *
 * <p>This is the async counterpart to {@code embedTexts}. The native side offloads the blocking
 * ONNX inference work to a dedicated blocking thread pool via Tokio's {@code spawn_blocking},
 * keeping the async executor free. The call made from Java is still synchronous: the result
 * pointer is read immediately after the native function returns.
 *
 * <p>Returns one embedding vector per input text in the same order.
 *
 * @param texts strings to embed; serialized to a JSON array before crossing the FFI boundary
 * @param config embedding configuration specifying model, batch size, and normalization; when
 *     {@code null}, a NULL config pointer is passed to the native function
 * @return the embeddings decoded from the native JSON response, or an empty list when the native
 *     call returns NULL without recording an error
 * @throws KreuzbergRsException if anything fails during the call; the underlying error is attached
 *     as the cause. The native documentation lists {@code KreuzbergError.MissingDependency} if
 *     ONNX Runtime is not installed, and {@code KreuzbergError.Embedding} if the preset name is
 *     unknown, model download fails, or the blocking inference task panics
 */
public static List<List<Float>> embedTextsAsync(final List<String> texts, final EmbeddingConfig config) throws KreuzbergRsException {
    try (var arena = Arena.ofShared()) {
        var ctextsJson = MAPPER.writerFor(MAPPER.getTypeFactory().constructCollectionType(java.util.List.class, String.class)).writeValueAsString(texts);
        var ctexts = arena.allocateFrom(ctextsJson);
        var cconfigJson = config != null ? MAPPER.writeValueAsString(config) : null;
        var cconfigJsonSeg = cconfigJson != null ? arena.allocateFrom(cconfigJson) : MemorySegment.NULL;
        var cconfig = cconfigJson != null
            ? (MemorySegment) NativeLib.KREUZBERG_EMBEDDING_CONFIG_FROM_JSON.invoke(cconfigJsonSeg)
            : MemorySegment.NULL;
        if (cconfigJson != null && cconfig.equals(MemorySegment.NULL)) {
            checkLastError();
            throw new IllegalStateException("failed to create config from JSON");
        }
        MemorySegment resultPtr;
        try {
            resultPtr = (MemorySegment) NativeLib.KREUZBERG_EMBED_TEXTS_ASYNC.invoke(ctexts, cconfig);
        } finally {
            // Release the native config handle even when the embed call throws, so it cannot leak.
            if (!cconfig.equals(MemorySegment.NULL)) {
                NativeLib.KREUZBERG_EMBEDDING_CONFIG_FREE.invoke(cconfig);
            }
        }
        return readJsonList(resultPtr, new com.fasterxml.jackson.core.type.TypeReference<java.util.List<List<Float>>>() { });
    } catch (Throwable e) {
        throw new KreuzbergRsException("FFI call failed", e);
    }
}
/**
 * Non-blocking wrapper around {@code embedTextsAsync}.
 *
 * <p>The embedding call is submitted through {@link CompletableFuture#supplyAsync(java.util.function.Supplier)},
 * so it runs on that method's default asynchronous executor rather than on the calling thread.
 *
 * @param texts strings to embed, forwarded unchanged
 * @param config embedding configuration, forwarded unchanged
 * @return a future that completes with the embeddings, or completes exceptionally with a
 *     {@link CompletionException} wrapping whatever the embedding call threw
 */
public static CompletableFuture<List<List<Float>>> embedTextsAsyncAsync(final List<String> texts, final EmbeddingConfig config) {
    return CompletableFuture.supplyAsync(() -> {
        List<List<Float>> embeddings;
        try {
            embeddings = embedTextsAsync(texts, config);
        } catch (Throwable failure) {
            // Checked exceptions cannot escape a Supplier, so surface them through the future.
            throw new CompletionException(failure);
        }
        return embeddings;
    });
}
/**
 * Render a single PDF page to PNG bytes.
 *
 * <p>Returns raw PNG-encoded bytes for the specified page at the given DPI.
 * Uses pdf_oxide with tiny-skia for pure-Rust rendering.
 *
 * @param pdfBytes raw PDF file bytes; copied into native memory for the duration of the call
 * @param pageIndex zero-based page index
 * @param dpi resolution in dots per inch (default: 150). A {@code null} value is passed to the
 *     native function as {@code Integer.MAX_VALUE} (presumably the sentinel the native side maps
 *     to its default; confirm against the native signature)
 * @param password optional password for encrypted PDFs; {@code null} is passed as a NULL pointer
 * @return the PNG bytes copied out of the native buffer, or {@code null} when the native call
 *     reports failure (non-zero return code or NULL output pointer) without recording an error
 * @throws KreuzbergRsException if anything fails during the call; the underlying error is attached
 *     as the cause. The native documentation lists {@code KreuzbergError.Parsing} if the PDF
 *     cannot be opened, authenticated, or rendered, or if the page index is out of range
 */
public static byte[] renderPdfPageToPng(
    final byte[] pdfBytes,
    final long pageIndex,
    final @Nullable Integer dpi,
    final @Nullable String password
) throws KreuzbergRsException {
    try (var arena = Arena.ofShared()) {
        var cpdfBytes = arena.allocateFrom(ValueLayout.JAVA_BYTE, pdfBytes);
        var cpdfBytesLen = (long) pdfBytes.length;
        int cdpi = (dpi == null) ? Integer.MAX_VALUE : dpi;
        var cpassword = password != null ? arena.allocateFrom(password) : MemorySegment.NULL;
        // Out-parameters the native function fills with the buffer pointer, length, and capacity.
        var outPtrHolder = arena.allocate(ValueLayout.ADDRESS);
        var outLenHolder = arena.allocate(ValueLayout.JAVA_LONG);
        var outCapHolder = arena.allocate(ValueLayout.JAVA_LONG);
        int rc = (int) NativeLib.KREUZBERG_RENDER_PDF_PAGE_TO_PNG.invoke(
            cpdfBytes,
            cpdfBytesLen,
            pageIndex,
            cdpi,
            cpassword,
            outPtrHolder,
            outLenHolder,
            outCapHolder
        );
        if (rc != 0) {
            checkLastError();
            return null;
        }
        var outPtr = outPtrHolder.get(ValueLayout.ADDRESS, 0);
        long outLen = outLenHolder.get(ValueLayout.JAVA_LONG, 0);
        long outCap = outCapHolder.get(ValueLayout.JAVA_LONG, 0);
        if (outPtr.equals(MemorySegment.NULL)) {
            checkLastError();
            return null;
        }
        try {
            return outPtr.reinterpret(outLen).toArray(ValueLayout.JAVA_BYTE);
        } finally {
            // Hand the buffer back to the native allocator even if the copy above throws
            // (for example on an invalid length), so the native memory cannot leak.
            NativeLib.KREUZBERG_FREE_BYTES.invoke(outPtr, outLen, outCap);
        }
    } catch (Throwable e) {
        throw new KreuzbergRsException("FFI call failed", e);
    }
}
/**
 * Detect the MIME type of a file at the given path.
 *
 * <p>Uses the file extension and optionally the file content to determine the MIME type.
 *
 * @param path path of the file to inspect
 * @param checkExists {@code true} to verify the file exists before detection; passed to the
 *     native function as {@code 1}, otherwise {@code 0}
 * @return the detected MIME type, or {@code null} when the native call returns NULL without
 *     recording an error
 * @throws KreuzbergRsException if anything fails during the call; the underlying error is attached
 *     as the cause
 */
public static String detectMimeType(final String path, final boolean checkExists) throws KreuzbergRsException {
    try (var arena = Arena.ofShared()) {
        var cpath = arena.allocateFrom(path);
        var resultPtr = (MemorySegment) NativeLib.KREUZBERG_DETECT_MIME_TYPE.invoke(cpath, (checkExists ? 1 : 0));
        if (resultPtr.equals(MemorySegment.NULL)) {
            checkLastError();
            return null;
        }
        try {
            // The byte length comes from a companion native call taking the same arguments.
            long resultLen = (long) NativeLib.KREUZBERG_DETECT_MIME_TYPE_LEN.invoke(cpath, (checkExists ? 1 : 0));
            return readCString(resultPtr, resultLen);
        } finally {
            // Free the native string even when the length lookup or decoding throws.
            NativeLib.KREUZBERG_FREE_STRING.invoke(resultPtr);
        }
    } catch (Throwable e) {
        throw new KreuzbergRsException("FFI call failed", e);
    }
}
/**
 * Embed a list of texts using the configured embedding model.
 *
 * <p>Returns a 2D list where each inner list is the embedding for the corresponding text.
 *
 * @param texts strings to embed; serialized to a JSON array before crossing the FFI boundary
 * @param config embedding configuration; when {@code null}, a NULL config pointer is passed to
 *     the native function
 * @return the embeddings decoded from the native JSON response, or an empty list when the native
 *     call returns NULL without recording an error
 * @throws KreuzbergRsException if anything fails during the call; the underlying error is attached
 *     as the cause
 */
public static List<List<Float>> embedTexts(final List<String> texts, final EmbeddingConfig config) throws KreuzbergRsException {
    try (var arena = Arena.ofShared()) {
        var ctextsJson = MAPPER.writerFor(MAPPER.getTypeFactory().constructCollectionType(java.util.List.class, String.class)).writeValueAsString(texts);
        var ctexts = arena.allocateFrom(ctextsJson);
        var cconfigJson = config != null ? MAPPER.writeValueAsString(config) : null;
        var cconfigJsonSeg = cconfigJson != null ? arena.allocateFrom(cconfigJson) : MemorySegment.NULL;
        var cconfig = cconfigJson != null
            ? (MemorySegment) NativeLib.KREUZBERG_EMBEDDING_CONFIG_FROM_JSON.invoke(cconfigJsonSeg)
            : MemorySegment.NULL;
        if (cconfigJson != null && cconfig.equals(MemorySegment.NULL)) {
            checkLastError();
            throw new IllegalStateException("failed to create config from JSON");
        }
        MemorySegment resultPtr;
        try {
            resultPtr = (MemorySegment) NativeLib.KREUZBERG_EMBED_TEXTS.invoke(ctexts, cconfig);
        } finally {
            // Release the native config handle even when the embed call throws, so it cannot leak.
            if (!cconfig.equals(MemorySegment.NULL)) {
                NativeLib.KREUZBERG_EMBEDDING_CONFIG_FREE.invoke(cconfig);
            }
        }
        return readJsonList(resultPtr, new com.fasterxml.jackson.core.type.TypeReference<java.util.List<List<Float>>>() { });
    } catch (Throwable e) {
        throw new KreuzbergRsException("FFI call failed", e);
    }
}
/**
 * Look up an embedding preset by name.
 *
 * <p>The native side returns an owned clone of the preset, which is serialized to JSON, freed,
 * and then decoded into an {@code EmbeddingPreset}.
 *
 * @param name name of the preset to look up
 * @return the preset, or {@code Optional.empty()} when the native lookup or its JSON
 *     serialization returns NULL without recording an error
 * @throws KreuzbergRsException if anything fails during the call; the underlying error is attached
 *     as the cause
 */
public static Optional<EmbeddingPreset> getEmbeddingPreset(final String name) throws KreuzbergRsException {
    try (Arena arena = Arena.ofShared()) {
        final MemorySegment nameSeg = arena.allocateFrom(name);
        final MemorySegment presetHandle =
            (MemorySegment) NativeLib.KREUZBERG_GET_EMBEDDING_PRESET.invoke(nameSeg);
        final boolean presetMissing = presetHandle.equals(MemorySegment.NULL);
        if (presetMissing) {
            checkLastError();
            return Optional.empty();
        }
        // CPD-OFF
        final MemorySegment jsonSeg =
            (MemorySegment) NativeLib.KREUZBERG_EMBEDDING_PRESET_TO_JSON.invoke(presetHandle);
        // The preset handle is no longer needed once it has been serialized.
        NativeLib.KREUZBERG_EMBEDDING_PRESET_FREE.invoke(presetHandle);
        final boolean jsonMissing = jsonSeg.equals(MemorySegment.NULL);
        if (jsonMissing) {
            checkLastError();
            return Optional.empty();
        }
        final String presetJson = jsonSeg.reinterpret(Long.MAX_VALUE).getString(0);
        NativeLib.KREUZBERG_FREE_STRING.invoke(jsonSeg);
        final EmbeddingPreset preset = MAPPER.readValue(presetJson, EmbeddingPreset.class);
        return Optional.of(preset);
        // CPD-ON
    } catch (Throwable cause) {
        throw new KreuzbergRsException("FFI call failed", cause);
    }
}
/**
 * List the names of all available embedding presets.
 *
 * <p>The native function takes no arguments, so no native memory needs to be allocated on the
 * Java side for this call.
 *
 * @return the preset names decoded from the native JSON response, or an empty list when the
 *     native call returns NULL without recording an error
 * @throws KreuzbergRsException if anything fails during the call; the underlying error is attached
 *     as the cause
 */
public static List<String> listEmbeddingPresets() throws KreuzbergRsException {
    try {
        var resultPtr = (MemorySegment) NativeLib.KREUZBERG_LIST_EMBEDDING_PRESETS.invoke();
        return readJsonList(resultPtr, new com.fasterxml.jackson.core.type.TypeReference<java.util.List<String>>() { });
    } catch (Throwable e) {
        throw new KreuzbergRsException("FFI call failed", e);
    }
}
// Helper methods for FFI marshalling
/**
 * Throws the exception matching the native library's last recorded error, if there is one.
 *
 * <p>Returns normally when the last error code is {@code 0}. Otherwise the error context string
 * is read and an exception is thrown: code {@code 1} maps to {@code InvalidInputException},
 * code {@code 2} to {@code ConversionErrorException}, and any other code to a
 * {@code KreuzbergRsException} carrying the code.
 *
 * @throws Throwable the mapped exception, or anything thrown by the native downcalls themselves
 */
private static void checkLastError() throws Throwable {
    int errCode = (int) NativeLib.KREUZBERG_LAST_ERROR_CODE.invoke();
    if (errCode == 0) {
        return;
    }
    var ctxPtr = (MemorySegment) NativeLib.KREUZBERG_LAST_ERROR_CONTEXT.invoke();
    // Reading a C string from a NULL pointer would fault inside the JVM instead of raising a
    // Java exception, so fall back to a generic message when no context is available.
    // NOTE(review): the context pointer is not freed here; presumably the native library owns it.
    String msg = ctxPtr.equals(MemorySegment.NULL)
        ? "native error code " + errCode + " (no error context available)"
        : ctxPtr.reinterpret(Long.MAX_VALUE).getString(0);
    switch (errCode) {
        case 1 -> throw new InvalidInputException(msg);
        case 2 -> throw new ConversionErrorException(msg);
        default -> throw new KreuzbergRsException(errCode, msg);
    }
}
/**
 * Builds the Jackson mapper used for every JSON payload exchanged with the native library.
 *
 * <p>Configuration: the JDK 8 module is registered explicitly, further modules are discovered
 * and registered automatically, property names use snake_case, all properties are always
 * included when serializing, and enum values are matched case-insensitively.
 *
 * @return a newly configured mapper
 */
private static com.fasterxml.jackson.databind.ObjectMapper createObjectMapper() {
    final com.fasterxml.jackson.databind.ObjectMapper mapper =
        new com.fasterxml.jackson.databind.ObjectMapper();
    mapper.registerModule(new com.fasterxml.jackson.datatype.jdk8.Jdk8Module());
    mapper.findAndRegisterModules();
    mapper.setPropertyNamingStrategy(
        com.fasterxml.jackson.databind.PropertyNamingStrategies.SNAKE_CASE);
    mapper.setSerializationInclusion(
        com.fasterxml.jackson.annotation.JsonInclude.Include.ALWAYS);
    mapper.configure(
        com.fasterxml.jackson.databind.MapperFeature.ACCEPT_CASE_INSENSITIVE_ENUMS, true);
    return mapper;
}
/** Shared mapper instance, created once when the class is initialized. */
private static final com.fasterxml.jackson.databind.ObjectMapper MAPPER = createObjectMapper();
/**
 * Reads a NUL-terminated string from native memory.
 *
 * @param ptr pointer to the first byte of the string; may be {@code null} or address 0
 * @param byteLen length of the string in bytes, excluding the terminator; the readable window
 *     is {@code byteLen + 1} bytes so the terminator is included
 * @return the decoded string, or {@code null} when {@code ptr} is {@code null} or address 0
 */
private static String readCString(MemorySegment ptr, long byteLen) {
    final boolean hasData = ptr != null && ptr.address() != 0;
    if (!hasData) {
        return null;
    }
    final MemorySegment bounded = ptr.reinterpret(byteLen + 1);
    return bounded.getString(0);
}
/**
 * Decodes a native JSON string into a list and frees the native string.
 *
 * <p>When the pointer is NULL, the last native error is checked first; if none is recorded an
 * empty list is returned.
 *
 * @param resultPtr pointer to a NUL-terminated JSON string returned by a native call
 * @param typeRef Jackson type reference describing the list to decode
 * @param <T> element type of the resulting list
 * @return the decoded list, or an empty list for a NULL pointer with no recorded error
 * @throws KreuzbergRsException if anything fails; the underlying error is attached as the cause
 */
private static <T> java.util.List<T> readJsonList(
    MemorySegment resultPtr,
    com.fasterxml.jackson.core.type.TypeReference<java.util.List<T>> typeRef
) throws KreuzbergRsException {
    try {
        final boolean hasPayload = !resultPtr.equals(MemorySegment.NULL);
        if (hasPayload) {
            // Copy the string into the Java heap before releasing the native allocation.
            final String payload = resultPtr.reinterpret(Long.MAX_VALUE).getString(0);
            NativeLib.KREUZBERG_FREE_STRING.invoke(resultPtr);
            return MAPPER.readValue(payload, typeRef);
        }
        checkLastError();
        return java.util.List.of();
    } catch (Throwable failure) {
        throw new KreuzbergRsException("FFI call failed", failure);
    }
}
}