1028 lines
50 KiB
Java
Generated
1028 lines
50 KiB
Java
Generated
// This file is auto-generated by alef — DO NOT EDIT.
|
|
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
|
// To regenerate: alef generate
|
|
// To verify freshness: alef verify --exit-code
|
|
// Issues & docs: https://github.com/kreuzberg-dev/alef
|
|
|
|
package dev.kreuzberg;
|
|
|
|
import java.lang.foreign.Arena;
|
|
import java.lang.foreign.MemorySegment;
|
|
import java.lang.foreign.ValueLayout;
|
|
import java.util.List;
|
|
import java.util.Optional;
|
|
import java.util.concurrent.CompletableFuture;
|
|
import java.util.concurrent.CompletionException;
|
|
import org.jspecify.annotations.Nullable;
|
|
public final class KreuzbergRs {
|
|
// Private constructor: prevents instantiation; the methods visible in this class are all static.
private KreuzbergRs() { }
|
|
|
|
/**
|
|
* Extract content from a byte array.
|
|
*
|
|
* This is the main entry point for in-memory extraction. It performs the following steps:
|
|
* 1. Validate MIME type
|
|
* 2. Handle legacy format conversion if needed
|
|
* 3. Select appropriate extractor from registry
|
|
* 4. Extract content
|
|
* 5. Run post-processing pipeline
|
|
* {@literal @}param content The byte array to extract
|
|
*
|
|
* {@literal @}param mime_type MIME type of the content
|
|
*
|
|
* {@literal @}param config Extraction configuration
|
|
*
|
|
* {@literal @}return An {@code ExtractionResult} containing the extracted content and metadata.
|
|
*
|
|
* {@literal @}throws KreuzbergRsException Returns {@code KreuzbergError.Validation} if MIME type is invalid.
|
|
* Returns {@code KreuzbergError.UnsupportedFormat} if MIME type is not supported.
|
|
*/
|
|
public static ExtractionResult extractBytes(
|
|
final byte[] content,
|
|
final String mimeType,
|
|
final ExtractionConfig config
|
|
) throws KreuzbergRsException {
|
|
try (var arena = Arena.ofShared()) {
|
|
var ccontent = arena.allocateFrom(ValueLayout.JAVA_BYTE, content);
|
|
var ccontentLen = (long) content.length;
|
|
var cmimeType = arena.allocateFrom(mimeType);
|
|
var cconfigJson = config != null ? MAPPER.writeValueAsString(config) : null;
|
|
var cconfigJsonSeg = cconfigJson != null ? arena.allocateFrom(cconfigJson) : MemorySegment.NULL;
|
|
var cconfig = cconfigJson != null
|
|
? (MemorySegment) NativeLib.KREUZBERG_EXTRACTION_CONFIG_FROM_JSON.invoke(cconfigJsonSeg)
|
|
: MemorySegment.NULL;
|
|
if (cconfigJson != null && cconfig.equals(MemorySegment.NULL)) {
|
|
checkLastError();
|
|
throw new IllegalStateException("failed to create config from JSON");
|
|
}
|
|
var resultPtr = (MemorySegment) NativeLib.KREUZBERG_EXTRACT_BYTES.invoke(ccontent, ccontentLen, cmimeType, cconfig);
|
|
if (!cconfig.equals(MemorySegment.NULL)) {
|
|
NativeLib.KREUZBERG_EXTRACTION_CONFIG_FREE.invoke(cconfig);
|
|
}
|
|
if (resultPtr.equals(MemorySegment.NULL)) {
|
|
checkLastError(); return null; }
|
|
// CPD-OFF
|
|
var jsonPtr = (MemorySegment) NativeLib.KREUZBERG_EXTRACTION_RESULT_TO_JSON.invoke(resultPtr);
|
|
NativeLib.KREUZBERG_EXTRACTION_RESULT_FREE.invoke(resultPtr);
|
|
if (jsonPtr.equals(MemorySegment.NULL)) {
|
|
checkLastError();
|
|
throw new KreuzbergRsException("extractBytes: failed to serialize response", null);
|
|
}
|
|
String json = jsonPtr.reinterpret(Long.MAX_VALUE).getString(0);
|
|
NativeLib.KREUZBERG_FREE_STRING.invoke(jsonPtr);
|
|
return MAPPER.readValue(json, ExtractionResult.class);
|
|
// CPD-ON
|
|
} catch (Throwable e) {
|
|
throw new KreuzbergRsException("FFI call failed", e);
|
|
}
|
|
}
|
|
|
|
public static CompletableFuture<ExtractionResult> extractBytesAsync(
|
|
final byte[] content,
|
|
final String mimeType,
|
|
final ExtractionConfig config
|
|
) {
|
|
return CompletableFuture.supplyAsync(() -> {
|
|
try {
|
|
return extractBytes(content, mimeType, config);
|
|
} catch (Throwable e) {
|
|
throw new CompletionException(e);
|
|
}
|
|
});
|
|
}
|
|
|
|
/**
|
|
* Extract content from a file.
|
|
*
|
|
* This is the main entry point for file-based extraction. It performs the following steps:
|
|
* 1. Check cache for existing result (if caching enabled)
|
|
* 2. Detect or validate MIME type
|
|
* 3. Select appropriate extractor from registry
|
|
* 4. Extract content
|
|
* 5. Run post-processing pipeline
|
|
* 6. Store result in cache (if caching enabled)
|
|
* {@literal @}param path Path to the file to extract
|
|
*
|
|
* {@literal @}param mime_type Optional MIME type override. If null, will be auto-detected
|
|
*
|
|
* {@literal @}param config Extraction configuration
|
|
*
|
|
* {@literal @}return An {@code ExtractionResult} containing the extracted content and metadata.
|
|
*
|
|
* {@literal @}throws KreuzbergRsException Returns {@code KreuzbergError.Io} if the file doesn't exist (NotFound) or for other file I/O
|
|
* errors.
|
|
* Returns {@code KreuzbergError.UnsupportedFormat} if MIME type is not supported.
|
|
*/
|
|
public static ExtractionResult extractFile(
|
|
final java.nio.file.Path path,
|
|
final @Nullable String mimeType,
|
|
final ExtractionConfig config
|
|
) throws KreuzbergRsException {
|
|
try (var arena = Arena.ofShared()) {
|
|
var cpath = arena.allocateFrom(path.toString());
|
|
var cmimeType = mimeType != null ? arena.allocateFrom(mimeType) : MemorySegment.NULL;
|
|
var cconfigJson = config != null ? MAPPER.writeValueAsString(config) : null;
|
|
var cconfigJsonSeg = cconfigJson != null ? arena.allocateFrom(cconfigJson) : MemorySegment.NULL;
|
|
var cconfig = cconfigJson != null
|
|
? (MemorySegment) NativeLib.KREUZBERG_EXTRACTION_CONFIG_FROM_JSON.invoke(cconfigJsonSeg)
|
|
: MemorySegment.NULL;
|
|
if (cconfigJson != null && cconfig.equals(MemorySegment.NULL)) {
|
|
checkLastError();
|
|
throw new IllegalStateException("failed to create config from JSON");
|
|
}
|
|
var resultPtr = (MemorySegment) NativeLib.KREUZBERG_EXTRACT_FILE.invoke(cpath, cmimeType, cconfig);
|
|
if (!cconfig.equals(MemorySegment.NULL)) {
|
|
NativeLib.KREUZBERG_EXTRACTION_CONFIG_FREE.invoke(cconfig);
|
|
}
|
|
if (resultPtr.equals(MemorySegment.NULL)) {
|
|
checkLastError(); return null; }
|
|
// CPD-OFF
|
|
var jsonPtr = (MemorySegment) NativeLib.KREUZBERG_EXTRACTION_RESULT_TO_JSON.invoke(resultPtr);
|
|
NativeLib.KREUZBERG_EXTRACTION_RESULT_FREE.invoke(resultPtr);
|
|
if (jsonPtr.equals(MemorySegment.NULL)) {
|
|
checkLastError();
|
|
throw new KreuzbergRsException("extractFile: failed to serialize response", null);
|
|
}
|
|
String json = jsonPtr.reinterpret(Long.MAX_VALUE).getString(0);
|
|
NativeLib.KREUZBERG_FREE_STRING.invoke(jsonPtr);
|
|
return MAPPER.readValue(json, ExtractionResult.class);
|
|
// CPD-ON
|
|
} catch (Throwable e) {
|
|
throw new KreuzbergRsException("FFI call failed", e);
|
|
}
|
|
}
|
|
|
|
public static CompletableFuture<ExtractionResult> extractFileAsync(
|
|
final java.nio.file.Path path,
|
|
final String mimeType,
|
|
final ExtractionConfig config
|
|
) {
|
|
return CompletableFuture.supplyAsync(() -> {
|
|
try {
|
|
return extractFile(path, mimeType, config);
|
|
} catch (Throwable e) {
|
|
throw new CompletionException(e);
|
|
}
|
|
});
|
|
}
|
|
|
|
/**
|
|
* Synchronous wrapper for {@code extract_file}.
|
|
*
|
|
* This is a convenience function that blocks the current thread until extraction completes.
|
|
* For async code, use {@code extract_file} directly.
|
|
*
|
|
* Uses the global Tokio runtime for 100x+ performance improvement over creating
|
|
* a new runtime per call. Always uses the global runtime to avoid nested runtime issues.
|
|
*
|
|
* This function is only available with the {@code tokio-runtime} feature. For WASM targets,
|
|
* use a truly synchronous extraction approach instead.
|
|
*/
|
|
public static ExtractionResult extractFileSync(
|
|
final java.nio.file.Path path,
|
|
final @Nullable String mimeType,
|
|
final ExtractionConfig config
|
|
) throws KreuzbergRsException {
|
|
try (var arena = Arena.ofShared()) {
|
|
var cpath = arena.allocateFrom(path.toString());
|
|
var cmimeType = mimeType != null ? arena.allocateFrom(mimeType) : MemorySegment.NULL;
|
|
var cconfigJson = config != null ? MAPPER.writeValueAsString(config) : null;
|
|
var cconfigJsonSeg = cconfigJson != null ? arena.allocateFrom(cconfigJson) : MemorySegment.NULL;
|
|
var cconfig = cconfigJson != null
|
|
? (MemorySegment) NativeLib.KREUZBERG_EXTRACTION_CONFIG_FROM_JSON.invoke(cconfigJsonSeg)
|
|
: MemorySegment.NULL;
|
|
if (cconfigJson != null && cconfig.equals(MemorySegment.NULL)) {
|
|
checkLastError();
|
|
throw new IllegalStateException("failed to create config from JSON");
|
|
}
|
|
var resultPtr = (MemorySegment) NativeLib.KREUZBERG_EXTRACT_FILE_SYNC.invoke(cpath, cmimeType, cconfig);
|
|
if (!cconfig.equals(MemorySegment.NULL)) {
|
|
NativeLib.KREUZBERG_EXTRACTION_CONFIG_FREE.invoke(cconfig);
|
|
}
|
|
if (resultPtr.equals(MemorySegment.NULL)) {
|
|
checkLastError(); return null; }
|
|
// CPD-OFF
|
|
var jsonPtr = (MemorySegment) NativeLib.KREUZBERG_EXTRACTION_RESULT_TO_JSON.invoke(resultPtr);
|
|
NativeLib.KREUZBERG_EXTRACTION_RESULT_FREE.invoke(resultPtr);
|
|
if (jsonPtr.equals(MemorySegment.NULL)) {
|
|
checkLastError();
|
|
throw new KreuzbergRsException("extractFileSync: failed to serialize response", null);
|
|
}
|
|
String json = jsonPtr.reinterpret(Long.MAX_VALUE).getString(0);
|
|
NativeLib.KREUZBERG_FREE_STRING.invoke(jsonPtr);
|
|
return MAPPER.readValue(json, ExtractionResult.class);
|
|
// CPD-ON
|
|
} catch (Throwable e) {
|
|
throw new KreuzbergRsException("FFI call failed", e);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Synchronous wrapper for {@code extract_bytes}.
|
|
*
|
|
* Uses the global Tokio runtime for 100x+ performance improvement over creating
|
|
* a new runtime per call.
|
|
*
|
|
* With the {@code tokio-runtime} feature, this blocks the current thread using the global
|
|
* Tokio runtime. Without it (WASM), this calls a truly synchronous implementation.
|
|
*/
|
|
public static ExtractionResult extractBytesSync(
|
|
final byte[] content,
|
|
final String mimeType,
|
|
final ExtractionConfig config
|
|
) throws KreuzbergRsException {
|
|
try (var arena = Arena.ofShared()) {
|
|
var ccontent = arena.allocateFrom(ValueLayout.JAVA_BYTE, content);
|
|
var ccontentLen = (long) content.length;
|
|
var cmimeType = arena.allocateFrom(mimeType);
|
|
var cconfigJson = config != null ? MAPPER.writeValueAsString(config) : null;
|
|
var cconfigJsonSeg = cconfigJson != null ? arena.allocateFrom(cconfigJson) : MemorySegment.NULL;
|
|
var cconfig = cconfigJson != null
|
|
? (MemorySegment) NativeLib.KREUZBERG_EXTRACTION_CONFIG_FROM_JSON.invoke(cconfigJsonSeg)
|
|
: MemorySegment.NULL;
|
|
if (cconfigJson != null && cconfig.equals(MemorySegment.NULL)) {
|
|
checkLastError();
|
|
throw new IllegalStateException("failed to create config from JSON");
|
|
}
|
|
var resultPtr = (MemorySegment) NativeLib.KREUZBERG_EXTRACT_BYTES_SYNC.invoke(ccontent, ccontentLen, cmimeType, cconfig);
|
|
if (!cconfig.equals(MemorySegment.NULL)) {
|
|
NativeLib.KREUZBERG_EXTRACTION_CONFIG_FREE.invoke(cconfig);
|
|
}
|
|
if (resultPtr.equals(MemorySegment.NULL)) {
|
|
checkLastError(); return null; }
|
|
// CPD-OFF
|
|
var jsonPtr = (MemorySegment) NativeLib.KREUZBERG_EXTRACTION_RESULT_TO_JSON.invoke(resultPtr);
|
|
NativeLib.KREUZBERG_EXTRACTION_RESULT_FREE.invoke(resultPtr);
|
|
if (jsonPtr.equals(MemorySegment.NULL)) {
|
|
checkLastError();
|
|
throw new KreuzbergRsException("extractBytesSync: failed to serialize response", null);
|
|
}
|
|
String json = jsonPtr.reinterpret(Long.MAX_VALUE).getString(0);
|
|
NativeLib.KREUZBERG_FREE_STRING.invoke(jsonPtr);
|
|
return MAPPER.readValue(json, ExtractionResult.class);
|
|
// CPD-ON
|
|
} catch (Throwable e) {
|
|
throw new KreuzbergRsException("FFI call failed", e);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Synchronous wrapper for {@code batch_extract_files}.
|
|
*
|
|
* Uses the global Tokio runtime for optimal performance.
|
|
* Only available with {@code tokio-runtime} (WASM has no filesystem).
|
|
*/
|
|
public static List<ExtractionResult> batchExtractFilesSync(
|
|
final List<BatchFileItem> items,
|
|
final ExtractionConfig config
|
|
) throws KreuzbergRsException {
|
|
try (var arena = Arena.ofShared()) {
|
|
var citemsJson = MAPPER.writerFor(MAPPER.getTypeFactory().constructCollectionType(java.util.List.class, BatchFileItem.class)).writeValueAsString(items);
|
|
var citems = arena.allocateFrom(citemsJson);
|
|
var cconfigJson = config != null ? MAPPER.writeValueAsString(config) : null;
|
|
var cconfigJsonSeg = cconfigJson != null ? arena.allocateFrom(cconfigJson) : MemorySegment.NULL;
|
|
var cconfig = cconfigJson != null
|
|
? (MemorySegment) NativeLib.KREUZBERG_EXTRACTION_CONFIG_FROM_JSON.invoke(cconfigJsonSeg)
|
|
: MemorySegment.NULL;
|
|
if (cconfigJson != null && cconfig.equals(MemorySegment.NULL)) {
|
|
checkLastError();
|
|
throw new IllegalStateException("failed to create config from JSON");
|
|
}
|
|
var resultPtr = (MemorySegment) NativeLib.KREUZBERG_BATCH_EXTRACT_FILES_SYNC.invoke(citems, cconfig);
|
|
if (!cconfig.equals(MemorySegment.NULL)) {
|
|
NativeLib.KREUZBERG_EXTRACTION_CONFIG_FREE.invoke(cconfig);
|
|
}
|
|
return readJsonList(resultPtr, new com.fasterxml.jackson.core.type.TypeReference<java.util.List<ExtractionResult>>() { });
|
|
} catch (Throwable e) {
|
|
throw new KreuzbergRsException("FFI call failed", e);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Synchronous wrapper for {@code batch_extract_bytes}.
|
|
*
|
|
* Uses the global Tokio runtime for optimal performance.
|
|
* With the {@code tokio-runtime} feature, this blocks the current thread using the global
|
|
* Tokio runtime. Without it (WASM), this calls a truly synchronous implementation
|
|
* that iterates through items and calls {@code extract_bytes_sync()}.
|
|
*/
|
|
public static List<ExtractionResult> batchExtractBytesSync(
|
|
final List<BatchBytesItem> items,
|
|
final ExtractionConfig config
|
|
) throws KreuzbergRsException {
|
|
try (var arena = Arena.ofShared()) {
|
|
var citemsJson = MAPPER.writerFor(MAPPER.getTypeFactory().constructCollectionType(java.util.List.class, BatchBytesItem.class)).writeValueAsString(items);
|
|
var citems = arena.allocateFrom(citemsJson);
|
|
var cconfigJson = config != null ? MAPPER.writeValueAsString(config) : null;
|
|
var cconfigJsonSeg = cconfigJson != null ? arena.allocateFrom(cconfigJson) : MemorySegment.NULL;
|
|
var cconfig = cconfigJson != null
|
|
? (MemorySegment) NativeLib.KREUZBERG_EXTRACTION_CONFIG_FROM_JSON.invoke(cconfigJsonSeg)
|
|
: MemorySegment.NULL;
|
|
if (cconfigJson != null && cconfig.equals(MemorySegment.NULL)) {
|
|
checkLastError();
|
|
throw new IllegalStateException("failed to create config from JSON");
|
|
}
|
|
var resultPtr = (MemorySegment) NativeLib.KREUZBERG_BATCH_EXTRACT_BYTES_SYNC.invoke(citems, cconfig);
|
|
if (!cconfig.equals(MemorySegment.NULL)) {
|
|
NativeLib.KREUZBERG_EXTRACTION_CONFIG_FREE.invoke(cconfig);
|
|
}
|
|
return readJsonList(resultPtr, new com.fasterxml.jackson.core.type.TypeReference<java.util.List<ExtractionResult>>() { });
|
|
} catch (Throwable e) {
|
|
throw new KreuzbergRsException("FFI call failed", e);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Extract content from multiple files concurrently.
|
|
*
|
|
* This function processes multiple files in parallel, automatically managing
|
|
* concurrency to prevent resource exhaustion. The concurrency limit can be
|
|
* configured via {@code ExtractionConfig.max_concurrent_extractions} or defaults
|
|
* to {@code (num_cpus * 1.5).ceil()}.
|
|
*
|
|
* Each file can optionally specify a FileExtractionConfig that overrides specific
|
|
* fields from the batch-level {@code config}. Pass {@code None} for a file to use the batch defaults.
|
|
* Batch-level settings like {@code max_concurrent_extractions} and {@code use_cache} are always
|
|
* taken from the batch-level {@code config}.
|
|
* {@literal @}param items Vector of {@code BatchFileItem} structs, each containing a path and optional per-file configuration
|
|
* overrides.
|
|
*
|
|
* {@literal @}param config Batch-level extraction configuration (provides defaults and batch settings)
|
|
*
|
|
* {@literal @}return A vector of {@code ExtractionResult} in the same order as the input items.
|
|
*
|
|
* {@literal @}throws KreuzbergRsException Individual file errors are captured in the result metadata. System errors
|
|
* (IO, RuntimeError equivalents) will bubble up and fail the entire batch.
|
|
*/
|
|
public static List<ExtractionResult> batchExtractFiles(
|
|
final List<BatchFileItem> items,
|
|
final ExtractionConfig config
|
|
) throws KreuzbergRsException {
|
|
try (var arena = Arena.ofShared()) {
|
|
var citemsJson = MAPPER.writerFor(MAPPER.getTypeFactory().constructCollectionType(java.util.List.class, BatchFileItem.class)).writeValueAsString(items);
|
|
var citems = arena.allocateFrom(citemsJson);
|
|
var cconfigJson = config != null ? MAPPER.writeValueAsString(config) : null;
|
|
var cconfigJsonSeg = cconfigJson != null ? arena.allocateFrom(cconfigJson) : MemorySegment.NULL;
|
|
var cconfig = cconfigJson != null
|
|
? (MemorySegment) NativeLib.KREUZBERG_EXTRACTION_CONFIG_FROM_JSON.invoke(cconfigJsonSeg)
|
|
: MemorySegment.NULL;
|
|
if (cconfigJson != null && cconfig.equals(MemorySegment.NULL)) {
|
|
checkLastError();
|
|
throw new IllegalStateException("failed to create config from JSON");
|
|
}
|
|
var resultPtr = (MemorySegment) NativeLib.KREUZBERG_BATCH_EXTRACT_FILES.invoke(citems, cconfig);
|
|
if (!cconfig.equals(MemorySegment.NULL)) {
|
|
NativeLib.KREUZBERG_EXTRACTION_CONFIG_FREE.invoke(cconfig);
|
|
}
|
|
return readJsonList(resultPtr, new com.fasterxml.jackson.core.type.TypeReference<java.util.List<ExtractionResult>>() { });
|
|
} catch (Throwable e) {
|
|
throw new KreuzbergRsException("FFI call failed", e);
|
|
}
|
|
}
|
|
|
|
public static CompletableFuture<List<ExtractionResult>> batchExtractFilesAsync(
|
|
final List<BatchFileItem> items,
|
|
final ExtractionConfig config
|
|
) {
|
|
return CompletableFuture.supplyAsync(() -> {
|
|
try {
|
|
return batchExtractFiles(items, config);
|
|
} catch (Throwable e) {
|
|
throw new CompletionException(e);
|
|
}
|
|
});
|
|
}
|
|
|
|
/**
|
|
* Extract content from multiple byte arrays concurrently.
|
|
*
|
|
* This function processes multiple byte arrays in parallel, automatically managing
|
|
* concurrency to prevent resource exhaustion. The concurrency limit can be
|
|
* configured via {@code ExtractionConfig.max_concurrent_extractions} or defaults
|
|
* to {@code (num_cpus * 1.5).ceil()}.
|
|
*
|
|
* Each item can optionally specify a FileExtractionConfig that overrides specific
|
|
* fields from the batch-level {@code config}. Pass {@code None} as the config to use
|
|
* the batch-level defaults for that item.
|
|
* {@literal @}param items Vector of {@code BatchBytesItem} structs, each containing content bytes, MIME type, and optional per-item
|
|
* configuration overrides.
|
|
*
|
|
* {@literal @}param config Batch-level extraction configuration
|
|
*
|
|
* {@literal @}return A vector of {@code ExtractionResult} in the same order as the input items.
|
|
*/
|
|
public static List<ExtractionResult> batchExtractBytes(
|
|
final List<BatchBytesItem> items,
|
|
final ExtractionConfig config
|
|
) throws KreuzbergRsException {
|
|
try (var arena = Arena.ofShared()) {
|
|
var citemsJson = MAPPER.writerFor(MAPPER.getTypeFactory().constructCollectionType(java.util.List.class, BatchBytesItem.class)).writeValueAsString(items);
|
|
var citems = arena.allocateFrom(citemsJson);
|
|
var cconfigJson = config != null ? MAPPER.writeValueAsString(config) : null;
|
|
var cconfigJsonSeg = cconfigJson != null ? arena.allocateFrom(cconfigJson) : MemorySegment.NULL;
|
|
var cconfig = cconfigJson != null
|
|
? (MemorySegment) NativeLib.KREUZBERG_EXTRACTION_CONFIG_FROM_JSON.invoke(cconfigJsonSeg)
|
|
: MemorySegment.NULL;
|
|
if (cconfigJson != null && cconfig.equals(MemorySegment.NULL)) {
|
|
checkLastError();
|
|
throw new IllegalStateException("failed to create config from JSON");
|
|
}
|
|
var resultPtr = (MemorySegment) NativeLib.KREUZBERG_BATCH_EXTRACT_BYTES.invoke(citems, cconfig);
|
|
if (!cconfig.equals(MemorySegment.NULL)) {
|
|
NativeLib.KREUZBERG_EXTRACTION_CONFIG_FREE.invoke(cconfig);
|
|
}
|
|
return readJsonList(resultPtr, new com.fasterxml.jackson.core.type.TypeReference<java.util.List<ExtractionResult>>() { });
|
|
} catch (Throwable e) {
|
|
throw new KreuzbergRsException("FFI call failed", e);
|
|
}
|
|
}
|
|
|
|
public static CompletableFuture<List<ExtractionResult>> batchExtractBytesAsync(
|
|
final List<BatchBytesItem> items,
|
|
final ExtractionConfig config
|
|
) {
|
|
return CompletableFuture.supplyAsync(() -> {
|
|
try {
|
|
return batchExtractBytes(items, config);
|
|
} catch (Throwable e) {
|
|
throw new CompletionException(e);
|
|
}
|
|
});
|
|
}
|
|
|
|
/**
|
|
* Detect MIME type from raw file bytes.
|
|
*
|
|
* Uses magic byte signatures to detect file type from content.
|
|
* Falls back to {@code infer} crate for comprehensive detection.
|
|
*
|
|
* For ZIP-based files, inspects contents to distinguish Office Open XML
|
|
* formats (DOCX, XLSX, PPTX) from plain ZIP archives.
|
|
* {@literal @}param content Raw file bytes
|
|
*
|
|
* {@literal @}return The detected MIME type string.
|
|
*
|
|
* {@literal @}throws KreuzbergRsException Returns {@code KreuzbergError.UnsupportedFormat} if MIME type cannot be determined.
|
|
*/
|
|
public static String detectMimeTypeFromBytes(final byte[] content) throws KreuzbergRsException {
|
|
try (var arena = Arena.ofShared()) {
|
|
var ccontent = arena.allocateFrom(ValueLayout.JAVA_BYTE, content);
|
|
var ccontentLen = (long) content.length;
|
|
var resultPtr = (MemorySegment) NativeLib.KREUZBERG_DETECT_MIME_TYPE_FROM_BYTES.invoke(ccontent, ccontentLen);
|
|
if (resultPtr.equals(MemorySegment.NULL)) {
|
|
checkLastError(); return null; }
|
|
long resultLen = (long) NativeLib.KREUZBERG_DETECT_MIME_TYPE_FROM_BYTES_LEN.invoke(ccontent, ccontentLen);
|
|
String str = readCString(resultPtr, resultLen);
|
|
NativeLib.KREUZBERG_FREE_STRING.invoke(resultPtr);
|
|
return str;
|
|
} catch (Throwable e) {
|
|
throw new KreuzbergRsException("FFI call failed", e);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Get file extensions for a given MIME type.
|
|
*
|
|
* Returns all known file extensions that map to the specified MIME type.
|
|
* {@literal @}param mime_type The MIME type to look up
|
|
*
|
|
* {@literal @}return A vector of file extensions (without leading dot) for the MIME type.
|
|
*/
|
|
public static List<String> getExtensionsForMime(final String mimeType) throws KreuzbergRsException {
|
|
try (var arena = Arena.ofShared()) {
|
|
var cmimeType = arena.allocateFrom(mimeType);
|
|
var resultPtr = (MemorySegment) NativeLib.KREUZBERG_GET_EXTENSIONS_FOR_MIME.invoke(cmimeType);
|
|
return readJsonList(resultPtr, new com.fasterxml.jackson.core.type.TypeReference<java.util.List<String>>() { });
|
|
} catch (Throwable e) {
|
|
throw new KreuzbergRsException("FFI call failed", e);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Clear all embedding backends from the global registry.
|
|
*
|
|
* Calls {@code shutdown()} on every registered backend, then empties the registry.
|
|
* {@literal @}throws KreuzbergRsException - Any error returned by a backend's {@code shutdown()} method. The first error
|
|
* encountered stops processing of remaining backends.
|
|
*/
|
|
public static void clearEmbeddingBackends() throws KreuzbergRsException {
|
|
try (var arena = Arena.ofShared()) {
|
|
var outErr = arena.allocate(ValueLayout.ADDRESS);
|
|
var primitiveResult = (int) NativeLib.KREUZBERG_CLEAR_EMBEDDING_BACKEND.invoke(outErr);
|
|
if (primitiveResult != 0) {
|
|
MemorySegment errPtr = outErr.get(ValueLayout.ADDRESS, 0);
|
|
String msg = errPtr.equals(MemorySegment.NULL)
|
|
? "clear failed (rc=" + primitiveResult + ")"
|
|
: errPtr.reinterpret(Long.MAX_VALUE).getString(0);
|
|
throw new KreuzbergRsException(primitiveResult, msg);
|
|
}
|
|
} catch (Throwable e) {
|
|
throw new KreuzbergRsException("FFI call failed", e);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* List the names of all registered embedding backends.
|
|
*
|
|
* Used by {@code kreuzberg-cli}, the api/mcp endpoints, and generated language
|
|
* bindings.
|
|
*/
|
|
public static List<String> listEmbeddingBackends() throws KreuzbergRsException {
|
|
try (var arena = Arena.ofShared()) {
|
|
var resultPtr = (MemorySegment) NativeLib.KREUZBERG_LIST_EMBEDDING_BACKENDS.invoke();
|
|
return readJsonList(resultPtr, new com.fasterxml.jackson.core.type.TypeReference<java.util.List<String>>() { });
|
|
} catch (Throwable e) {
|
|
throw new KreuzbergRsException("FFI call failed", e);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* List names of all registered document extractors.
|
|
*/
|
|
public static List<String> listDocumentExtractors() throws KreuzbergRsException {
|
|
try (var arena = Arena.ofShared()) {
|
|
var resultPtr = (MemorySegment) NativeLib.KREUZBERG_LIST_DOCUMENT_EXTRACTORS.invoke();
|
|
return readJsonList(resultPtr, new com.fasterxml.jackson.core.type.TypeReference<java.util.List<String>>() { });
|
|
} catch (Throwable e) {
|
|
throw new KreuzbergRsException("FFI call failed", e);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Clear all document extractors from the global registry.
|
|
*
|
|
* Calls {@code shutdown()} on every registered extractor, then empties the registry.
|
|
* {@literal @}throws KreuzbergRsException - Any error returned by an extractor's {@code shutdown()} method. The first error
|
|
* encountered stops processing of remaining extractors.
|
|
*/
|
|
public static void clearDocumentExtractors() throws KreuzbergRsException {
|
|
try (var arena = Arena.ofShared()) {
|
|
var outErr = arena.allocate(ValueLayout.ADDRESS);
|
|
var primitiveResult = (int) NativeLib.KREUZBERG_CLEAR_DOCUMENT_EXTRACTOR.invoke(outErr);
|
|
if (primitiveResult != 0) {
|
|
MemorySegment errPtr = outErr.get(ValueLayout.ADDRESS, 0);
|
|
String msg = errPtr.equals(MemorySegment.NULL)
|
|
? "clear failed (rc=" + primitiveResult + ")"
|
|
: errPtr.reinterpret(Long.MAX_VALUE).getString(0);
|
|
throw new KreuzbergRsException(primitiveResult, msg);
|
|
}
|
|
} catch (Throwable e) {
|
|
throw new KreuzbergRsException("FFI call failed", e);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* List all registered OCR backends.
|
|
*
|
|
* Returns the names of all OCR backends currently registered in the global registry.
|
|
* {@literal @}return A vector of OCR backend names.
|
|
*/
|
|
public static List<String> listOcrBackends() throws KreuzbergRsException {
|
|
try (var arena = Arena.ofShared()) {
|
|
var resultPtr = (MemorySegment) NativeLib.KREUZBERG_LIST_OCR_BACKENDS.invoke();
|
|
return readJsonList(resultPtr, new com.fasterxml.jackson.core.type.TypeReference<java.util.List<String>>() { });
|
|
} catch (Throwable e) {
|
|
throw new KreuzbergRsException("FFI call failed", e);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Clear all OCR backends from the global registry.
|
|
*
|
|
* Removes all OCR backends and calls their {@code shutdown()} methods.
|
|
* {@literal @}return - {@code Ok(())} if all backends were cleared successfully
|
|
* - {@code Err(...)} if any shutdown method failed
|
|
*/
|
|
public static void clearOcrBackends() throws KreuzbergRsException {
|
|
try (var arena = Arena.ofShared()) {
|
|
var outErr = arena.allocate(ValueLayout.ADDRESS);
|
|
var primitiveResult = (int) NativeLib.KREUZBERG_CLEAR_OCR_BACKEND.invoke(outErr);
|
|
if (primitiveResult != 0) {
|
|
MemorySegment errPtr = outErr.get(ValueLayout.ADDRESS, 0);
|
|
String msg = errPtr.equals(MemorySegment.NULL)
|
|
? "clear failed (rc=" + primitiveResult + ")"
|
|
: errPtr.reinterpret(Long.MAX_VALUE).getString(0);
|
|
throw new KreuzbergRsException(primitiveResult, msg);
|
|
}
|
|
} catch (Throwable e) {
|
|
throw new KreuzbergRsException("FFI call failed", e);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* List all registered post-processor names.
|
|
*
|
|
* Returns a vector of all post-processor names currently registered in the
|
|
* global registry.
|
|
* {@literal @}return - {@code Ok(Vec<String>)} - Vector of post-processor names
|
|
* - {@code Err(...)} if the registry lock is poisoned
|
|
*/
|
|
public static List<String> listPostProcessors() throws KreuzbergRsException {
|
|
try (var arena = Arena.ofShared()) {
|
|
var resultPtr = (MemorySegment) NativeLib.KREUZBERG_LIST_POST_PROCESSORS.invoke();
|
|
return readJsonList(resultPtr, new com.fasterxml.jackson.core.type.TypeReference<java.util.List<String>>() { });
|
|
} catch (Throwable e) {
|
|
throw new KreuzbergRsException("FFI call failed", e);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Remove all registered post-processors.
|
|
*/
|
|
public static void clearPostProcessors() throws KreuzbergRsException {
|
|
try (var arena = Arena.ofShared()) {
|
|
var outErr = arena.allocate(ValueLayout.ADDRESS);
|
|
var primitiveResult = (int) NativeLib.KREUZBERG_CLEAR_POST_PROCESSOR.invoke(outErr);
|
|
if (primitiveResult != 0) {
|
|
MemorySegment errPtr = outErr.get(ValueLayout.ADDRESS, 0);
|
|
String msg = errPtr.equals(MemorySegment.NULL)
|
|
? "clear failed (rc=" + primitiveResult + ")"
|
|
: errPtr.reinterpret(Long.MAX_VALUE).getString(0);
|
|
throw new KreuzbergRsException(primitiveResult, msg);
|
|
}
|
|
} catch (Throwable e) {
|
|
throw new KreuzbergRsException("FFI call failed", e);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* List names of all registered renderers.
|
|
* {@literal @}throws KreuzbergRsException Returns an error if the registry lock is poisoned.
|
|
*/
|
|
public static List<String> listRenderers() throws KreuzbergRsException {
|
|
try (var arena = Arena.ofShared()) {
|
|
var resultPtr = (MemorySegment) NativeLib.KREUZBERG_LIST_RENDERERS.invoke();
|
|
return readJsonList(resultPtr, new com.fasterxml.jackson.core.type.TypeReference<java.util.List<String>>() { });
|
|
} catch (Throwable e) {
|
|
throw new KreuzbergRsException("FFI call failed", e);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Clear all renderers from the global registry.
|
|
*
|
|
* Removes every renderer, including the built-in defaults (markdown, html,
|
|
* djot, plain). After calling this no renderers are registered; re-register
|
|
* as needed.
|
|
* {@literal @}throws KreuzbergRsException Returns an error if the registry lock is poisoned.
|
|
*/
|
|
public static void clearRenderers() throws KreuzbergRsException {
|
|
try (var arena = Arena.ofShared()) {
|
|
var outErr = arena.allocate(ValueLayout.ADDRESS);
|
|
var primitiveResult = (int) NativeLib.KREUZBERG_CLEAR_RENDERER.invoke(outErr);
|
|
if (primitiveResult != 0) {
|
|
MemorySegment errPtr = outErr.get(ValueLayout.ADDRESS, 0);
|
|
String msg = errPtr.equals(MemorySegment.NULL)
|
|
? "clear failed (rc=" + primitiveResult + ")"
|
|
: errPtr.reinterpret(Long.MAX_VALUE).getString(0);
|
|
throw new KreuzbergRsException(primitiveResult, msg);
|
|
}
|
|
} catch (Throwable e) {
|
|
throw new KreuzbergRsException("FFI call failed", e);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* List names of all registered validators.
|
|
*/
|
|
public static List<String> listValidators() throws KreuzbergRsException {
|
|
try (var arena = Arena.ofShared()) {
|
|
var resultPtr = (MemorySegment) NativeLib.KREUZBERG_LIST_VALIDATORS.invoke();
|
|
return readJsonList(resultPtr, new com.fasterxml.jackson.core.type.TypeReference<java.util.List<String>>() { });
|
|
} catch (Throwable e) {
|
|
throw new KreuzbergRsException("FFI call failed", e);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Remove all registered validators.
|
|
*/
|
|
public static void clearValidators() throws KreuzbergRsException {
|
|
try (var arena = Arena.ofShared()) {
|
|
var outErr = arena.allocate(ValueLayout.ADDRESS);
|
|
var primitiveResult = (int) NativeLib.KREUZBERG_CLEAR_VALIDATOR.invoke(outErr);
|
|
if (primitiveResult != 0) {
|
|
MemorySegment errPtr = outErr.get(ValueLayout.ADDRESS, 0);
|
|
String msg = errPtr.equals(MemorySegment.NULL)
|
|
? "clear failed (rc=" + primitiveResult + ")"
|
|
: errPtr.reinterpret(Long.MAX_VALUE).getString(0);
|
|
throw new KreuzbergRsException(primitiveResult, msg);
|
|
}
|
|
} catch (Throwable e) {
|
|
throw new KreuzbergRsException("FFI call failed", e);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Compare two extraction results and return a structured diff.
|
|
*
|
|
* The comparison is purely structural — no I/O, no side effects. All fields
|
|
* of ExtractionDiff are populated according to the provided DiffOptions.
|
|
* {@literal @}param a — the "before" extraction result
|
|
*
|
|
* {@literal @}param b — the "after" extraction result
|
|
*
|
|
* {@literal @}param opts — controls which sections are compared and optional truncation
|
|
*/
|
|
public static ExtractionDiff compare(
|
|
final ExtractionResult a,
|
|
final ExtractionResult b,
|
|
final DiffOptions opts
|
|
) throws KreuzbergRsException {
|
|
try (var arena = Arena.ofShared()) {
|
|
var caJson = a != null ? MAPPER.writeValueAsString(a) : null;
|
|
var caJsonSeg = caJson != null ? arena.allocateFrom(caJson) : MemorySegment.NULL;
|
|
var ca = caJson != null
|
|
? (MemorySegment) NativeLib.KREUZBERG_EXTRACTION_RESULT_FROM_JSON.invoke(caJsonSeg)
|
|
: MemorySegment.NULL;
|
|
if (caJson != null && ca.equals(MemorySegment.NULL)) {
|
|
checkLastError();
|
|
throw new IllegalStateException("failed to create a from JSON");
|
|
}
|
|
var cbJson = b != null ? MAPPER.writeValueAsString(b) : null;
|
|
var cbJsonSeg = cbJson != null ? arena.allocateFrom(cbJson) : MemorySegment.NULL;
|
|
var cb = cbJson != null
|
|
? (MemorySegment) NativeLib.KREUZBERG_EXTRACTION_RESULT_FROM_JSON.invoke(cbJsonSeg)
|
|
: MemorySegment.NULL;
|
|
if (cbJson != null && cb.equals(MemorySegment.NULL)) {
|
|
checkLastError();
|
|
throw new IllegalStateException("failed to create b from JSON");
|
|
}
|
|
var coptsJson = opts != null ? MAPPER.writeValueAsString(opts) : null;
|
|
var coptsJsonSeg = coptsJson != null ? arena.allocateFrom(coptsJson) : MemorySegment.NULL;
|
|
var copts = coptsJson != null
|
|
? (MemorySegment) NativeLib.KREUZBERG_DIFF_OPTIONS_FROM_JSON.invoke(coptsJsonSeg)
|
|
: MemorySegment.NULL;
|
|
if (coptsJson != null && copts.equals(MemorySegment.NULL)) {
|
|
checkLastError();
|
|
throw new IllegalStateException("failed to create opts from JSON");
|
|
}
|
|
var resultPtr = (MemorySegment) NativeLib.KREUZBERG_COMPARE.invoke(ca, cb, copts);
|
|
if (!ca.equals(MemorySegment.NULL)) {
|
|
NativeLib.KREUZBERG_EXTRACTION_RESULT_FREE.invoke(ca);
|
|
}
|
|
if (!cb.equals(MemorySegment.NULL)) {
|
|
NativeLib.KREUZBERG_EXTRACTION_RESULT_FREE.invoke(cb);
|
|
}
|
|
if (!copts.equals(MemorySegment.NULL)) {
|
|
NativeLib.KREUZBERG_DIFF_OPTIONS_FREE.invoke(copts);
|
|
}
|
|
if (resultPtr.equals(MemorySegment.NULL)) {
|
|
checkLastError(); return null; }
|
|
// CPD-OFF
|
|
var jsonPtr = (MemorySegment) NativeLib.KREUZBERG_EXTRACTION_DIFF_TO_JSON.invoke(resultPtr);
|
|
NativeLib.KREUZBERG_EXTRACTION_DIFF_FREE.invoke(resultPtr);
|
|
if (jsonPtr.equals(MemorySegment.NULL)) {
|
|
checkLastError();
|
|
throw new KreuzbergRsException("compare: failed to serialize response", null);
|
|
}
|
|
String json = jsonPtr.reinterpret(Long.MAX_VALUE).getString(0);
|
|
NativeLib.KREUZBERG_FREE_STRING.invoke(jsonPtr);
|
|
return MAPPER.readValue(json, ExtractionDiff.class);
|
|
// CPD-ON
|
|
} catch (Throwable e) {
|
|
throw new KreuzbergRsException("FFI call failed", e);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Generate embeddings asynchronously for a list of text strings.
|
|
*
|
|
* This is the async counterpart to embed_texts. It offloads the blocking
|
|
* ONNX inference work to a dedicated blocking thread pool via Tokio's
|
|
* {@code spawn_blocking}, keeping the async executor free.
|
|
*
|
|
* Returns one embedding vector per input text in the same order.
|
|
* {@literal @}param texts Vec of strings to embed (owned, sent to blocking thread)
|
|
*
|
|
* {@literal @}param config Embedding configuration specifying model, batch size, and normalization
|
|
*
|
|
* {@literal @}throws KreuzbergRsException - {@code KreuzbergError.MissingDependency} if ONNX Runtime is not installed
|
|
* - {@code KreuzbergError.Embedding} if the preset name is unknown, model download fails,
|
|
* or the blocking inference task panics
|
|
*/
|
|
public static List<List<Float>> embedTextsAsync(final List<String> texts, final EmbeddingConfig config) throws KreuzbergRsException {
|
|
try (var arena = Arena.ofShared()) {
|
|
var ctextsJson = MAPPER.writerFor(MAPPER.getTypeFactory().constructCollectionType(java.util.List.class, String.class)).writeValueAsString(texts);
|
|
var ctexts = arena.allocateFrom(ctextsJson);
|
|
var cconfigJson = config != null ? MAPPER.writeValueAsString(config) : null;
|
|
var cconfigJsonSeg = cconfigJson != null ? arena.allocateFrom(cconfigJson) : MemorySegment.NULL;
|
|
var cconfig = cconfigJson != null
|
|
? (MemorySegment) NativeLib.KREUZBERG_EMBEDDING_CONFIG_FROM_JSON.invoke(cconfigJsonSeg)
|
|
: MemorySegment.NULL;
|
|
if (cconfigJson != null && cconfig.equals(MemorySegment.NULL)) {
|
|
checkLastError();
|
|
throw new IllegalStateException("failed to create config from JSON");
|
|
}
|
|
var resultPtr = (MemorySegment) NativeLib.KREUZBERG_EMBED_TEXTS_ASYNC.invoke(ctexts, cconfig);
|
|
if (!cconfig.equals(MemorySegment.NULL)) {
|
|
NativeLib.KREUZBERG_EMBEDDING_CONFIG_FREE.invoke(cconfig);
|
|
}
|
|
return readJsonList(resultPtr, new com.fasterxml.jackson.core.type.TypeReference<java.util.List<List<Float>>>() { });
|
|
} catch (Throwable e) {
|
|
throw new KreuzbergRsException("FFI call failed", e);
|
|
}
|
|
}
|
|
|
|
public static CompletableFuture<List<List<Float>>> embedTextsAsyncAsync(final List<String> texts, final EmbeddingConfig config) {
|
|
return CompletableFuture.supplyAsync(() -> {
|
|
try {
|
|
return embedTextsAsync(texts, config);
|
|
} catch (Throwable e) {
|
|
throw new CompletionException(e);
|
|
}
|
|
});
|
|
}
|
|
|
|
/**
|
|
* Render a single PDF page to PNG bytes.
|
|
*
|
|
* Returns raw PNG-encoded bytes for the specified page at the given DPI.
|
|
* Uses pdf_oxide with tiny-skia for pure-Rust rendering.
|
|
* {@literal @}param pdf_bytes Raw PDF file bytes
|
|
*
|
|
* {@literal @}param page_index Zero-based page index
|
|
*
|
|
* {@literal @}param dpi Resolution in dots per inch (default: 150)
|
|
*
|
|
* {@literal @}param password Optional password for encrypted PDFs
|
|
*
|
|
* {@literal @}throws KreuzbergRsException Returns {@code KreuzbergError.Parsing} if the PDF cannot be opened, authenticated,
|
|
* or rendered, or if {@code page_index} is out of range.
|
|
*/
|
|
public static byte[] renderPdfPageToPng(
|
|
final byte[] pdfBytes,
|
|
final long pageIndex,
|
|
final @Nullable Integer dpi,
|
|
final @Nullable String password
|
|
) throws KreuzbergRsException {
|
|
try (var arena = Arena.ofShared()) {
|
|
var cpdfBytes = arena.allocateFrom(ValueLayout.JAVA_BYTE, pdfBytes);
|
|
var cpdfBytesLen = (long) pdfBytes.length;
|
|
int cdpi = (dpi == null) ? Integer.MAX_VALUE : dpi;
|
|
var cpassword = password != null ? arena.allocateFrom(password) : MemorySegment.NULL;
|
|
var outPtrHolder = arena.allocate(ValueLayout.ADDRESS);
|
|
var outLenHolder = arena.allocate(ValueLayout.JAVA_LONG);
|
|
var outCapHolder = arena.allocate(ValueLayout.JAVA_LONG);
|
|
int rc = (int) NativeLib.KREUZBERG_RENDER_PDF_PAGE_TO_PNG.invoke(
|
|
cpdfBytes,
|
|
cpdfBytesLen,
|
|
pageIndex,
|
|
cdpi,
|
|
cpassword,
|
|
outPtrHolder,
|
|
outLenHolder,
|
|
outCapHolder
|
|
);
|
|
if (rc != 0) {
|
|
checkLastError(); return null; }
|
|
var outPtr = outPtrHolder.get(ValueLayout.ADDRESS, 0);
|
|
long outLen = outLenHolder.get(ValueLayout.JAVA_LONG, 0);
|
|
long outCap = outCapHolder.get(ValueLayout.JAVA_LONG, 0);
|
|
if (outPtr.equals(MemorySegment.NULL)) {
|
|
checkLastError(); return null; }
|
|
byte[] result = outPtr.reinterpret(outLen).toArray(ValueLayout.JAVA_BYTE);
|
|
NativeLib.KREUZBERG_FREE_BYTES.invoke(outPtr, outLen, outCap); return result; } catch (Throwable e) {
|
|
throw new KreuzbergRsException("FFI call failed", e);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Detect the MIME type of a file at the given path.
|
|
*
|
|
* Uses the file extension and optionally the file content to determine the MIME type.
|
|
* Set {@code check_exists} to {@code true} to verify the file exists before detection.
|
|
*/
|
|
public static String detectMimeType(final String path, final boolean checkExists) throws KreuzbergRsException {
|
|
try (var arena = Arena.ofShared()) {
|
|
var cpath = arena.allocateFrom(path);
|
|
var resultPtr = (MemorySegment) NativeLib.KREUZBERG_DETECT_MIME_TYPE.invoke(cpath, (checkExists ? 1 : 0));
|
|
if (resultPtr.equals(MemorySegment.NULL)) {
|
|
checkLastError(); return null; }
|
|
long resultLen = (long) NativeLib.KREUZBERG_DETECT_MIME_TYPE_LEN.invoke(cpath, (checkExists ? 1 : 0));
|
|
String str = readCString(resultPtr, resultLen);
|
|
NativeLib.KREUZBERG_FREE_STRING.invoke(resultPtr);
|
|
return str;
|
|
} catch (Throwable e) {
|
|
throw new KreuzbergRsException("FFI call failed", e);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Embed a list of texts using the configured embedding model.
|
|
*
|
|
* Returns a 2D vector where each inner vector is the embedding for the corresponding text.
|
|
*/
|
|
public static List<List<Float>> embedTexts(final List<String> texts, final EmbeddingConfig config) throws KreuzbergRsException {
|
|
try (var arena = Arena.ofShared()) {
|
|
var ctextsJson = MAPPER.writerFor(MAPPER.getTypeFactory().constructCollectionType(java.util.List.class, String.class)).writeValueAsString(texts);
|
|
var ctexts = arena.allocateFrom(ctextsJson);
|
|
var cconfigJson = config != null ? MAPPER.writeValueAsString(config) : null;
|
|
var cconfigJsonSeg = cconfigJson != null ? arena.allocateFrom(cconfigJson) : MemorySegment.NULL;
|
|
var cconfig = cconfigJson != null
|
|
? (MemorySegment) NativeLib.KREUZBERG_EMBEDDING_CONFIG_FROM_JSON.invoke(cconfigJsonSeg)
|
|
: MemorySegment.NULL;
|
|
if (cconfigJson != null && cconfig.equals(MemorySegment.NULL)) {
|
|
checkLastError();
|
|
throw new IllegalStateException("failed to create config from JSON");
|
|
}
|
|
var resultPtr = (MemorySegment) NativeLib.KREUZBERG_EMBED_TEXTS.invoke(ctexts, cconfig);
|
|
if (!cconfig.equals(MemorySegment.NULL)) {
|
|
NativeLib.KREUZBERG_EMBEDDING_CONFIG_FREE.invoke(cconfig);
|
|
}
|
|
return readJsonList(resultPtr, new com.fasterxml.jackson.core.type.TypeReference<java.util.List<List<Float>>>() { });
|
|
} catch (Throwable e) {
|
|
throw new KreuzbergRsException("FFI call failed", e);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Get an embedding preset by name.
|
|
*
|
|
* Returns {@code None} if no preset with the given name exists. Returns an owned
|
|
* clone so the value is safe to pass across FFI boundaries.
|
|
*/
|
|
public static Optional<EmbeddingPreset> getEmbeddingPreset(final String name) throws KreuzbergRsException {
|
|
try (var arena = Arena.ofShared()) {
|
|
var cname = arena.allocateFrom(name);
|
|
var resultPtr = (MemorySegment) NativeLib.KREUZBERG_GET_EMBEDDING_PRESET.invoke(cname);
|
|
if (resultPtr.equals(MemorySegment.NULL)) {
|
|
checkLastError(); return Optional.empty(); }
|
|
// CPD-OFF
|
|
var jsonPtr = (MemorySegment) NativeLib.KREUZBERG_EMBEDDING_PRESET_TO_JSON.invoke(resultPtr);
|
|
NativeLib.KREUZBERG_EMBEDDING_PRESET_FREE.invoke(resultPtr);
|
|
if (jsonPtr.equals(MemorySegment.NULL)) {
|
|
checkLastError();
|
|
return Optional.empty();
|
|
}
|
|
String json = jsonPtr.reinterpret(Long.MAX_VALUE).getString(0);
|
|
NativeLib.KREUZBERG_FREE_STRING.invoke(jsonPtr);
|
|
return Optional.of(MAPPER.readValue(json, EmbeddingPreset.class));
|
|
// CPD-ON
|
|
} catch (Throwable e) {
|
|
throw new KreuzbergRsException("FFI call failed", e);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* List the names of all available embedding presets.
|
|
*
|
|
* Returns owned {@code String}s so the values are safe to pass across FFI boundaries.
|
|
*/
|
|
public static List<String> listEmbeddingPresets() throws KreuzbergRsException {
|
|
try (var arena = Arena.ofShared()) {
|
|
var resultPtr = (MemorySegment) NativeLib.KREUZBERG_LIST_EMBEDDING_PRESETS.invoke();
|
|
return readJsonList(resultPtr, new com.fasterxml.jackson.core.type.TypeReference<java.util.List<String>>() { });
|
|
} catch (Throwable e) {
|
|
throw new KreuzbergRsException("FFI call failed", e);
|
|
}
|
|
}
|
|
|
|
// Helper methods for FFI marshalling
|
|
|
|
private static void checkLastError() throws Throwable {
|
|
int errCode = (int) NativeLib.KREUZBERG_LAST_ERROR_CODE.invoke();
|
|
if (errCode != 0) {
|
|
var ctxPtr = (MemorySegment) NativeLib.KREUZBERG_LAST_ERROR_CONTEXT.invoke();
|
|
String msg = ctxPtr.reinterpret(Long.MAX_VALUE).getString(0);
|
|
switch (errCode) {
|
|
case 1 -> throw new InvalidInputException(msg);
|
|
case 2 -> throw new ConversionErrorException(msg);
|
|
default -> throw new KreuzbergRsException(errCode, msg);
|
|
}
|
|
}
|
|
}
|
|
private static com.fasterxml.jackson.databind.ObjectMapper createObjectMapper() {
|
|
return new com.fasterxml.jackson.databind.ObjectMapper()
|
|
.registerModule(new com.fasterxml.jackson.datatype.jdk8.Jdk8Module())
|
|
.findAndRegisterModules()
|
|
.setPropertyNamingStrategy(com.fasterxml.jackson.databind.PropertyNamingStrategies.SNAKE_CASE)
|
|
.setSerializationInclusion(com.fasterxml.jackson.annotation.JsonInclude.Include.ALWAYS)
|
|
.configure(com.fasterxml.jackson.databind.MapperFeature.ACCEPT_CASE_INSENSITIVE_ENUMS, true);
|
|
}
|
|
|
|
/** Shared Jackson mapper, built once by {@code createObjectMapper()}, used to encode arguments and decode JSON returned by native calls. */
private static final com.fasterxml.jackson.databind.ObjectMapper MAPPER = createObjectMapper();
|
|
private static String readCString(MemorySegment ptr, long byteLen) {
|
|
if (ptr == null || ptr.address() == 0) {
|
|
return null;
|
|
}
|
|
return ptr.reinterpret(byteLen + 1).getString(0);
|
|
}
|
|
private static <T> java.util.List<T> readJsonList(
|
|
MemorySegment resultPtr,
|
|
com.fasterxml.jackson.core.type.TypeReference<java.util.List<T>> typeRef
|
|
) throws KreuzbergRsException {
|
|
try {
|
|
if (resultPtr.equals(MemorySegment.NULL)) {
|
|
checkLastError();
|
|
return java.util.List.of();
|
|
}
|
|
String json = resultPtr.reinterpret(Long.MAX_VALUE).getString(0);
|
|
NativeLib.KREUZBERG_FREE_STRING.invoke(resultPtr);
|
|
return MAPPER.readValue(json, typeRef);
|
|
} catch (Throwable e) {
|
|
throw new KreuzbergRsException("FFI call failed", e);
|
|
}
|
|
}
|
|
}
|