Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

578
packages/java/dev/kreuzberg/Kreuzberg.java generated Normal file
View File

@@ -0,0 +1,578 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
package dev.kreuzberg;
import java.nio.file.Path;
import java.util.List;
import java.util.Objects;
import org.jspecify.annotations.Nullable;
/**
 * Static entry points of the Kreuzberg Java binding.
 *
 * <p>Every method checks its required reference arguments for {@code null} and then delegates to
 * {@code KreuzbergRs} or, for plugin registration, to the matching {@code *Bridge} class. The
 * class holds no state and cannot be instantiated.
 */
public final class Kreuzberg {

    private Kreuzberg() { }

    /**
     * Extracts content from a byte array.
     *
     * <p>This is the main entry point for in-memory extraction. It performs the following steps:
     * <ol>
     *   <li>Validate MIME type</li>
     *   <li>Handle legacy format conversion if needed</li>
     *   <li>Select appropriate extractor from registry</li>
     *   <li>Extract content</li>
     *   <li>Run post-processing pipeline</li>
     * </ol>
     *
     * @param content the bytes to extract; must not be {@code null}
     * @param mimeType MIME type of the content; must not be {@code null}
     * @param config extraction configuration; must not be {@code null}
     * @return an {@code ExtractionResult} containing the extracted content and metadata
     * @throws KreuzbergRsException if the MIME type is invalid ({@code KreuzbergError.Validation})
     *     or not supported ({@code KreuzbergError.UnsupportedFormat})
     * @throws NullPointerException if any argument is {@code null}
     */
    public static ExtractionResult extractBytes(
            final byte[] content,
            final String mimeType,
            final ExtractionConfig config
    ) throws KreuzbergRsException {
        Objects.requireNonNull(content, "content must not be null");
        Objects.requireNonNull(mimeType, "mimeType must not be null");
        Objects.requireNonNull(config, "config must not be null");
        return KreuzbergRs.extractBytes(content, mimeType, config);
    }

    /**
     * Extracts content from a file.
     *
     * <p>This is the main entry point for file-based extraction. It performs the following steps:
     * <ol>
     *   <li>Check cache for existing result (if caching enabled)</li>
     *   <li>Detect or validate MIME type</li>
     *   <li>Select appropriate extractor from registry</li>
     *   <li>Extract content</li>
     *   <li>Run post-processing pipeline</li>
     *   <li>Store result in cache (if caching enabled)</li>
     * </ol>
     *
     * @param path path to the file to extract; must not be {@code null}
     * @param mimeType optional MIME type override; if {@code null}, the type is auto-detected
     * @param config extraction configuration; must not be {@code null}
     * @return an {@code ExtractionResult} containing the extracted content and metadata
     * @throws KreuzbergRsException if the file does not exist or another file I/O error occurs
     *     ({@code KreuzbergError.Io}), or if the MIME type is not supported
     *     ({@code KreuzbergError.UnsupportedFormat})
     * @throws NullPointerException if {@code path} or {@code config} is {@code null}
     */
    public static ExtractionResult extractFile(
            final Path path,
            final @Nullable String mimeType,
            final ExtractionConfig config
    ) throws KreuzbergRsException {
        Objects.requireNonNull(path, "path must not be null");
        Objects.requireNonNull(config, "config must not be null");
        return KreuzbergRs.extractFile(path, mimeType, config);
    }

    /**
     * Extracts content from a file without a MIME type override.
     *
     * <p>Equivalent to {@code extractFile(path, null, config)}.
     *
     * @param path path to the file to extract; must not be {@code null}
     * @param config extraction configuration; must not be {@code null}
     * @return an {@code ExtractionResult} containing the extracted content and metadata
     * @throws KreuzbergRsException if extraction fails
     * @throws NullPointerException if {@code path} or {@code config} is {@code null}
     */
    public static ExtractionResult extractFile(final Path path, final ExtractionConfig config)
            throws KreuzbergRsException {
        // Delegate so this overload gets the same argument validation as the full one.
        return extractFile(path, null, config);
    }

    /**
     * Synchronous wrapper for {@code extract_file}.
     *
     * <p>This is a convenience function that blocks the current thread until extraction
     * completes. For async code, use {@code extract_file} directly.
     *
     * <p>Uses the global Tokio runtime for 100x+ performance improvement over creating a new
     * runtime per call. Always uses the global runtime to avoid nested runtime issues.
     *
     * <p>This function is only available with the {@code tokio-runtime} feature. For WASM
     * targets, use a truly synchronous extraction approach instead.
     *
     * @param path path to the file to extract; must not be {@code null}
     * @param mimeType optional MIME type override; may be {@code null}
     * @param config extraction configuration; must not be {@code null}
     * @return the extraction result
     * @throws KreuzbergRsException if the underlying {@code KreuzbergRs} call fails
     * @throws NullPointerException if {@code path} or {@code config} is {@code null}
     */
    public static ExtractionResult extractFileSync(
            final Path path,
            final @Nullable String mimeType,
            final ExtractionConfig config
    ) throws KreuzbergRsException {
        Objects.requireNonNull(path, "path must not be null");
        Objects.requireNonNull(config, "config must not be null");
        return KreuzbergRs.extractFileSync(path, mimeType, config);
    }

    /**
     * Synchronous file extraction without a MIME type override.
     *
     * <p>Equivalent to {@code extractFileSync(path, null, config)}.
     *
     * @param path path to the file to extract; must not be {@code null}
     * @param config extraction configuration; must not be {@code null}
     * @return the extraction result
     * @throws KreuzbergRsException if the underlying {@code KreuzbergRs} call fails
     * @throws NullPointerException if {@code path} or {@code config} is {@code null}
     */
    public static ExtractionResult extractFileSync(
            final Path path,
            final ExtractionConfig config
    ) throws KreuzbergRsException {
        // Delegate so this overload gets the same argument validation as the full one.
        return extractFileSync(path, null, config);
    }

    /**
     * Synchronous wrapper for {@code extract_bytes}.
     *
     * <p>Uses the global Tokio runtime for 100x+ performance improvement over creating a new
     * runtime per call.
     *
     * <p>With the {@code tokio-runtime} feature, this blocks the current thread using the global
     * Tokio runtime. Without it (WASM), this calls a truly synchronous implementation.
     *
     * @param content the bytes to extract; must not be {@code null}
     * @param mimeType MIME type of the content; must not be {@code null}
     * @param config extraction configuration; must not be {@code null}
     * @return the extraction result
     * @throws KreuzbergRsException if the underlying {@code KreuzbergRs} call fails
     * @throws NullPointerException if any argument is {@code null}
     */
    public static ExtractionResult extractBytesSync(
            final byte[] content,
            final String mimeType,
            final ExtractionConfig config
    ) throws KreuzbergRsException {
        Objects.requireNonNull(content, "content must not be null");
        Objects.requireNonNull(mimeType, "mimeType must not be null");
        Objects.requireNonNull(config, "config must not be null");
        return KreuzbergRs.extractBytesSync(content, mimeType, config);
    }

    /**
     * Synchronous wrapper for {@code batch_extract_files}.
     *
     * <p>Uses the global Tokio runtime for optimal performance. Only available with
     * {@code tokio-runtime} (WASM has no filesystem).
     *
     * @param items the files to extract; must not be {@code null}
     * @param config batch-level extraction configuration; must not be {@code null}
     * @return the extraction results
     * @throws KreuzbergRsException if the underlying {@code KreuzbergRs} call fails
     * @throws NullPointerException if any argument is {@code null}
     */
    public static List<ExtractionResult> batchExtractFilesSync(
            final List<BatchFileItem> items,
            final ExtractionConfig config
    ) throws KreuzbergRsException {
        Objects.requireNonNull(items, "items must not be null");
        Objects.requireNonNull(config, "config must not be null");
        return KreuzbergRs.batchExtractFilesSync(items, config);
    }

    /**
     * Synchronous wrapper for {@code batch_extract_bytes}.
     *
     * <p>Uses the global Tokio runtime for optimal performance. With the {@code tokio-runtime}
     * feature, this blocks the current thread using the global Tokio runtime. Without it (WASM),
     * this calls a truly synchronous implementation that iterates through items and calls
     * {@code extract_bytes_sync()}.
     *
     * @param items the in-memory documents to extract; must not be {@code null}
     * @param config batch-level extraction configuration; must not be {@code null}
     * @return the extraction results
     * @throws KreuzbergRsException if the underlying {@code KreuzbergRs} call fails
     * @throws NullPointerException if any argument is {@code null}
     */
    public static List<ExtractionResult> batchExtractBytesSync(
            final List<BatchBytesItem> items,
            final ExtractionConfig config
    ) throws KreuzbergRsException {
        Objects.requireNonNull(items, "items must not be null");
        Objects.requireNonNull(config, "config must not be null");
        return KreuzbergRs.batchExtractBytesSync(items, config);
    }

    /**
     * Extracts content from multiple files concurrently.
     *
     * <p>This function processes multiple files in parallel, automatically managing concurrency
     * to prevent resource exhaustion. The concurrency limit can be configured via
     * {@code ExtractionConfig.max_concurrent_extractions} or defaults to
     * {@code (num_cpus * 1.5).ceil()}.
     *
     * <p>Each file can optionally specify a FileExtractionConfig that overrides specific fields
     * from the batch-level {@code config}; a file without one uses the batch defaults.
     * Batch-level settings like {@code max_concurrent_extractions} and {@code use_cache} are
     * always taken from the batch-level {@code config}.
     *
     * @param items list of {@code BatchFileItem}, each containing a path and optional per-file
     *     configuration overrides; must not be {@code null}
     * @param config batch-level extraction configuration (provides defaults and batch settings);
     *     must not be {@code null}
     * @return a list of {@code ExtractionResult} in the same order as the input items
     * @throws KreuzbergRsException on system errors (IO, RuntimeError equivalents), which fail
     *     the entire batch; individual file errors are captured in the result metadata instead
     * @throws NullPointerException if any argument is {@code null}
     */
    public static List<ExtractionResult> batchExtractFiles(
            final List<BatchFileItem> items,
            final ExtractionConfig config
    ) throws KreuzbergRsException {
        Objects.requireNonNull(items, "items must not be null");
        Objects.requireNonNull(config, "config must not be null");
        return KreuzbergRs.batchExtractFiles(items, config);
    }

    /**
     * Extracts content from multiple byte arrays concurrently.
     *
     * <p>This function processes multiple byte arrays in parallel, automatically managing
     * concurrency to prevent resource exhaustion. The concurrency limit can be configured via
     * {@code ExtractionConfig.max_concurrent_extractions} or defaults to
     * {@code (num_cpus * 1.5).ceil()}.
     *
     * <p>Each item can optionally specify a FileExtractionConfig that overrides specific fields
     * from the batch-level {@code config}; an item without one uses the batch-level defaults.
     *
     * @param items list of {@code BatchBytesItem}, each containing content bytes, MIME type, and
     *     optional per-item configuration overrides; must not be {@code null}
     * @param config batch-level extraction configuration; must not be {@code null}
     * @return a list of {@code ExtractionResult} in the same order as the input items
     * @throws KreuzbergRsException if the underlying {@code KreuzbergRs} call fails
     * @throws NullPointerException if any argument is {@code null}
     */
    public static List<ExtractionResult> batchExtractBytes(
            final List<BatchBytesItem> items,
            final ExtractionConfig config
    ) throws KreuzbergRsException {
        Objects.requireNonNull(items, "items must not be null");
        Objects.requireNonNull(config, "config must not be null");
        return KreuzbergRs.batchExtractBytes(items, config);
    }

    /**
     * Detects the MIME type from raw file bytes.
     *
     * <p>Uses magic byte signatures to detect file type from content. Falls back to the
     * {@code infer} crate for comprehensive detection.
     *
     * <p>For ZIP-based files, inspects contents to distinguish Office Open XML formats (DOCX,
     * XLSX, PPTX) from plain ZIP archives.
     *
     * @param content raw file bytes; must not be {@code null}
     * @return the detected MIME type string
     * @throws KreuzbergRsException if the MIME type cannot be determined
     *     ({@code KreuzbergError.UnsupportedFormat})
     * @throws NullPointerException if {@code content} is {@code null}
     */
    public static String detectMimeTypeFromBytes(final byte[] content) throws KreuzbergRsException {
        Objects.requireNonNull(content, "content must not be null");
        return KreuzbergRs.detectMimeTypeFromBytes(content);
    }

    /**
     * Gets the file extensions for a given MIME type.
     *
     * <p>Returns all known file extensions that map to the specified MIME type.
     *
     * @param mimeType the MIME type to look up; must not be {@code null}
     * @return the file extensions (without leading dot) for the MIME type
     * @throws KreuzbergRsException if the underlying {@code KreuzbergRs} call fails
     * @throws NullPointerException if {@code mimeType} is {@code null}
     */
    public static List<String> getExtensionsForMime(final String mimeType) throws KreuzbergRsException {
        Objects.requireNonNull(mimeType, "mimeType must not be null");
        return KreuzbergRs.getExtensionsForMime(mimeType);
    }

    /**
     * Clears all embedding backends from the global registry.
     *
     * <p>Calls {@code shutdown()} on every registered backend, then empties the registry.
     *
     * @throws KreuzbergRsException on any error returned by a backend's {@code shutdown()}
     *     method; the first error encountered stops processing of remaining backends
     */
    public static void clearEmbeddingBackends() throws KreuzbergRsException {
        KreuzbergRs.clearEmbeddingBackends();
    }

    /**
     * Lists the names of all registered embedding backends.
     *
     * <p>Used by {@code kreuzberg-cli}, the api/mcp endpoints, and generated language bindings.
     *
     * @return the registered embedding backend names
     * @throws KreuzbergRsException if the underlying {@code KreuzbergRs} call fails
     */
    public static List<String> listEmbeddingBackends() throws KreuzbergRsException {
        return KreuzbergRs.listEmbeddingBackends();
    }

    /**
     * Lists the names of all registered document extractors.
     *
     * @return the registered document extractor names
     * @throws KreuzbergRsException if the underlying {@code KreuzbergRs} call fails
     */
    public static List<String> listDocumentExtractors() throws KreuzbergRsException {
        return KreuzbergRs.listDocumentExtractors();
    }

    /**
     * Clears all document extractors from the global registry.
     *
     * <p>Calls {@code shutdown()} on every registered extractor, then empties the registry.
     *
     * @throws KreuzbergRsException on any error returned by an extractor's {@code shutdown()}
     *     method; the first error encountered stops processing of remaining extractors
     */
    public static void clearDocumentExtractors() throws KreuzbergRsException {
        KreuzbergRs.clearDocumentExtractors();
    }

    /**
     * Lists all registered OCR backends.
     *
     * <p>Returns the names of all OCR backends currently registered in the global registry.
     *
     * @return the OCR backend names
     * @throws KreuzbergRsException if the underlying {@code KreuzbergRs} call fails
     */
    public static List<String> listOcrBackends() throws KreuzbergRsException {
        return KreuzbergRs.listOcrBackends();
    }

    /**
     * Clears all OCR backends from the global registry.
     *
     * <p>Removes all OCR backends and calls their {@code shutdown()} methods.
     *
     * @throws KreuzbergRsException if any shutdown method failed
     */
    public static void clearOcrBackends() throws KreuzbergRsException {
        KreuzbergRs.clearOcrBackends();
    }

    /**
     * Lists all registered post-processor names.
     *
     * <p>Returns the names of all post-processors currently registered in the global registry.
     *
     * @return the post-processor names
     * @throws KreuzbergRsException if the registry lock is poisoned
     */
    public static List<String> listPostProcessors() throws KreuzbergRsException {
        return KreuzbergRs.listPostProcessors();
    }

    /**
     * Removes all registered post-processors.
     *
     * @throws KreuzbergRsException if the underlying {@code KreuzbergRs} call fails
     */
    public static void clearPostProcessors() throws KreuzbergRsException {
        KreuzbergRs.clearPostProcessors();
    }

    /**
     * Lists the names of all registered renderers.
     *
     * @return the registered renderer names
     * @throws KreuzbergRsException if the registry lock is poisoned
     */
    public static List<String> listRenderers() throws KreuzbergRsException {
        return KreuzbergRs.listRenderers();
    }

    /**
     * Clears all renderers from the global registry.
     *
     * <p>Removes every renderer, including the built-in defaults (markdown, html, djot, plain).
     * After calling this no renderers are registered; re-register as needed.
     *
     * @throws KreuzbergRsException if the registry lock is poisoned
     */
    public static void clearRenderers() throws KreuzbergRsException {
        KreuzbergRs.clearRenderers();
    }

    /**
     * Lists the names of all registered validators.
     *
     * @return the registered validator names
     * @throws KreuzbergRsException if the underlying {@code KreuzbergRs} call fails
     */
    public static List<String> listValidators() throws KreuzbergRsException {
        return KreuzbergRs.listValidators();
    }

    /**
     * Removes all registered validators.
     *
     * @throws KreuzbergRsException if the underlying {@code KreuzbergRs} call fails
     */
    public static void clearValidators() throws KreuzbergRsException {
        KreuzbergRs.clearValidators();
    }

    /**
     * Compares two extraction results and returns a structured diff.
     *
     * <p>The comparison is purely structural: no I/O, no side effects. All fields of
     * ExtractionDiff are populated according to the provided DiffOptions.
     *
     * @param a the "before" extraction result; must not be {@code null}
     * @param b the "after" extraction result; must not be {@code null}
     * @param opts controls which sections are compared and optional truncation; must not be
     *     {@code null}
     * @return the structured diff between {@code a} and {@code b}
     * @throws KreuzbergRsException if the underlying {@code KreuzbergRs} call fails
     * @throws NullPointerException if any argument is {@code null}
     */
    public static ExtractionDiff compare(
            final ExtractionResult a,
            final ExtractionResult b,
            final DiffOptions opts
    ) throws KreuzbergRsException {
        Objects.requireNonNull(a, "a must not be null");
        Objects.requireNonNull(b, "b must not be null");
        Objects.requireNonNull(opts, "opts must not be null");
        return KreuzbergRs.compare(a, b, opts);
    }

    /**
     * Generates embeddings for a list of text strings via the native async entry point.
     *
     * <p>This is the async counterpart to embed_texts. It offloads the blocking ONNX inference
     * work to a dedicated blocking thread pool via Tokio's {@code spawn_blocking}, keeping the
     * async executor free. Note that this Java method itself returns the embeddings directly
     * rather than a future.
     *
     * @param texts the strings to embed; must not be {@code null}
     * @param config embedding configuration specifying model, batch size, and normalization;
     *     must not be {@code null}
     * @return one embedding vector per input text, in the same order
     * @throws KreuzbergRsException if ONNX Runtime is not installed
     *     ({@code KreuzbergError.MissingDependency}), or if the preset name is unknown, model
     *     download fails, or the blocking inference task panics
     *     ({@code KreuzbergError.Embedding})
     * @throws NullPointerException if any argument is {@code null}
     */
    public static List<List<Float>> embedTextsAsync(final List<String> texts, final EmbeddingConfig config)
            throws KreuzbergRsException {
        Objects.requireNonNull(texts, "texts must not be null");
        Objects.requireNonNull(config, "config must not be null");
        return KreuzbergRs.embedTextsAsync(texts, config);
    }

    /**
     * Renders a single PDF page to PNG bytes.
     *
     * <p>Returns raw PNG-encoded bytes for the specified page at the given DPI. Uses pdf_oxide
     * with tiny-skia for pure-Rust rendering.
     *
     * @param pdfBytes raw PDF file bytes; must not be {@code null}
     * @param pageIndex zero-based page index
     * @param dpi resolution in dots per inch, or {@code null} for the default (150)
     * @param password optional password for encrypted PDFs; may be {@code null}
     * @return the PNG-encoded page image
     * @throws KreuzbergRsException if the PDF cannot be opened, authenticated, or rendered, or if
     *     {@code pageIndex} is out of range ({@code KreuzbergError.Parsing})
     * @throws NullPointerException if {@code pdfBytes} is {@code null}
     */
    public static byte[] renderPdfPageToPng(
            final byte[] pdfBytes,
            final long pageIndex,
            final @Nullable Integer dpi,
            final @Nullable String password
    ) throws KreuzbergRsException {
        Objects.requireNonNull(pdfBytes, "pdfBytes must not be null");
        // pageIndex is a primitive long and can never be null, so it needs no null check.
        return KreuzbergRs.renderPdfPageToPng(pdfBytes, pageIndex, dpi, password);
    }

    /**
     * Renders a single PDF page to PNG bytes at the default resolution, without a password.
     *
     * <p>Equivalent to {@code renderPdfPageToPng(pdfBytes, pageIndex, null, null)}.
     *
     * @param pdfBytes raw PDF file bytes; must not be {@code null}
     * @param pageIndex zero-based page index
     * @return the PNG-encoded page image
     * @throws KreuzbergRsException if the PDF cannot be opened or rendered, or if
     *     {@code pageIndex} is out of range
     * @throws NullPointerException if {@code pdfBytes} is {@code null}
     */
    public static byte[] renderPdfPageToPng(final byte[] pdfBytes, final long pageIndex)
            throws KreuzbergRsException {
        // Pass null (not 0) for dpi so the documented default applies instead of a 0-DPI render.
        return renderPdfPageToPng(pdfBytes, pageIndex, null, null);
    }

    /**
     * Detects the MIME type of a file at the given path.
     *
     * <p>Uses the file extension and optionally the file content to determine the MIME type.
     *
     * @param path path of the file to inspect; must not be {@code null}
     * @param checkExists {@code true} to verify the file exists before detection
     * @return the detected MIME type string
     * @throws KreuzbergRsException if the underlying {@code KreuzbergRs} call fails
     * @throws NullPointerException if {@code path} is {@code null}
     */
    public static String detectMimeType(final String path, final boolean checkExists) throws KreuzbergRsException {
        Objects.requireNonNull(path, "path must not be null");
        // checkExists is a primitive boolean and can never be null, so it needs no null check.
        return KreuzbergRs.detectMimeType(path, checkExists);
    }

    /**
     * Embeds a list of texts using the configured embedding model.
     *
     * @param texts the strings to embed; must not be {@code null}
     * @param config embedding configuration; must not be {@code null}
     * @return a list where each inner list is the embedding for the corresponding text
     * @throws KreuzbergRsException if the underlying {@code KreuzbergRs} call fails
     * @throws NullPointerException if any argument is {@code null}
     */
    public static List<List<Float>> embedTexts(final List<String> texts, final EmbeddingConfig config)
            throws KreuzbergRsException {
        Objects.requireNonNull(texts, "texts must not be null");
        Objects.requireNonNull(config, "config must not be null");
        return KreuzbergRs.embedTexts(texts, config);
    }

    /**
     * Gets an embedding preset by name.
     *
     * @param name the preset name; must not be {@code null}
     * @return the preset, or {@code null} if no preset with the given name exists
     * @throws KreuzbergRsException if the underlying {@code KreuzbergRs} call fails
     * @throws NullPointerException if {@code name} is {@code null}
     */
    public static @Nullable EmbeddingPreset getEmbeddingPreset(final String name) throws KreuzbergRsException {
        Objects.requireNonNull(name, "name must not be null");
        return KreuzbergRs.getEmbeddingPreset(name).orElse(null);
    }

    /**
     * Lists the names of all available embedding presets.
     *
     * @return the embedding preset names
     * @throws KreuzbergRsException if the underlying {@code KreuzbergRs} call fails
     */
    public static List<String> listEmbeddingPresets() throws KreuzbergRsException {
        return KreuzbergRs.listEmbeddingPresets();
    }

    /**
     * Registers an OCR backend through {@code OcrBackendBridge}.
     *
     * @param impl the backend implementation; must not be {@code null}
     * @throws KreuzbergRsException if the bridge fails to register the backend
     * @throws NullPointerException if {@code impl} is {@code null}
     */
    public static void registerOcrBackend(final IOcrBackend impl) throws KreuzbergRsException {
        Objects.requireNonNull(impl, "impl must not be null");
        try {
            OcrBackendBridge.registerOcrBackend(impl);
        } catch (Exception e) {
            throw wrapBridgeFailure(e);
        }
    }

    /**
     * Unregisters an OCR backend through {@code OcrBackendBridge}.
     *
     * @param name the name of the backend to remove; must not be {@code null}
     * @throws KreuzbergRsException if the bridge fails to unregister the backend
     * @throws NullPointerException if {@code name} is {@code null}
     */
    public static void unregisterOcrBackend(final String name) throws KreuzbergRsException {
        Objects.requireNonNull(name, "name must not be null");
        try {
            OcrBackendBridge.unregisterOcrBackend(name);
        } catch (Exception e) {
            throw wrapBridgeFailure(e);
        }
    }

    /**
     * Registers a post-processor through {@code PostProcessorBridge}.
     *
     * @param impl the post-processor implementation; must not be {@code null}
     * @throws KreuzbergRsException if the bridge fails to register the post-processor
     * @throws NullPointerException if {@code impl} is {@code null}
     */
    public static void registerPostProcessor(final IPostProcessor impl) throws KreuzbergRsException {
        Objects.requireNonNull(impl, "impl must not be null");
        try {
            PostProcessorBridge.registerPostProcessor(impl);
        } catch (Exception e) {
            throw wrapBridgeFailure(e);
        }
    }

    /**
     * Unregisters a post-processor through {@code PostProcessorBridge}.
     *
     * @param name the name of the post-processor to remove; must not be {@code null}
     * @throws KreuzbergRsException if the bridge fails to unregister the post-processor
     * @throws NullPointerException if {@code name} is {@code null}
     */
    public static void unregisterPostProcessor(final String name) throws KreuzbergRsException {
        Objects.requireNonNull(name, "name must not be null");
        try {
            PostProcessorBridge.unregisterPostProcessor(name);
        } catch (Exception e) {
            throw wrapBridgeFailure(e);
        }
    }

    /**
     * Registers a validator through {@code ValidatorBridge}.
     *
     * @param impl the validator implementation; must not be {@code null}
     * @throws KreuzbergRsException if the bridge fails to register the validator
     * @throws NullPointerException if {@code impl} is {@code null}
     */
    public static void registerValidator(final IValidator impl) throws KreuzbergRsException {
        Objects.requireNonNull(impl, "impl must not be null");
        try {
            ValidatorBridge.registerValidator(impl);
        } catch (Exception e) {
            throw wrapBridgeFailure(e);
        }
    }

    /**
     * Unregisters a validator through {@code ValidatorBridge}.
     *
     * @param name the name of the validator to remove; must not be {@code null}
     * @throws KreuzbergRsException if the bridge fails to unregister the validator
     * @throws NullPointerException if {@code name} is {@code null}
     */
    public static void unregisterValidator(final String name) throws KreuzbergRsException {
        Objects.requireNonNull(name, "name must not be null");
        try {
            ValidatorBridge.unregisterValidator(name);
        } catch (Exception e) {
            throw wrapBridgeFailure(e);
        }
    }

    /**
     * Registers an embedding backend through {@code EmbeddingBackendBridge}.
     *
     * @param impl the backend implementation; must not be {@code null}
     * @throws KreuzbergRsException if the bridge fails to register the backend
     * @throws NullPointerException if {@code impl} is {@code null}
     */
    public static void registerEmbeddingBackend(final IEmbeddingBackend impl) throws KreuzbergRsException {
        Objects.requireNonNull(impl, "impl must not be null");
        try {
            EmbeddingBackendBridge.registerEmbeddingBackend(impl);
        } catch (Exception e) {
            throw wrapBridgeFailure(e);
        }
    }

    /**
     * Unregisters an embedding backend through {@code EmbeddingBackendBridge}.
     *
     * @param name the name of the backend to remove; must not be {@code null}
     * @throws KreuzbergRsException if the bridge fails to unregister the backend
     * @throws NullPointerException if {@code name} is {@code null}
     */
    public static void unregisterEmbeddingBackend(final String name) throws KreuzbergRsException {
        Objects.requireNonNull(name, "name must not be null");
        try {
            EmbeddingBackendBridge.unregisterEmbeddingBackend(name);
        } catch (Exception e) {
            throw wrapBridgeFailure(e);
        }
    }

    /**
     * Registers a document extractor through {@code DocumentExtractorBridge}.
     *
     * @param impl the extractor implementation; must not be {@code null}
     * @throws KreuzbergRsException if the bridge fails to register the extractor
     * @throws NullPointerException if {@code impl} is {@code null}
     */
    public static void registerDocumentExtractor(final IDocumentExtractor impl) throws KreuzbergRsException {
        Objects.requireNonNull(impl, "impl must not be null");
        try {
            DocumentExtractorBridge.registerDocumentExtractor(impl);
        } catch (Exception e) {
            throw wrapBridgeFailure(e);
        }
    }

    /**
     * Unregisters a document extractor through {@code DocumentExtractorBridge}.
     *
     * @param name the name of the extractor to remove; must not be {@code null}
     * @throws KreuzbergRsException if the bridge fails to unregister the extractor
     * @throws NullPointerException if {@code name} is {@code null}
     */
    public static void unregisterDocumentExtractor(final String name) throws KreuzbergRsException {
        Objects.requireNonNull(name, "name must not be null");
        try {
            DocumentExtractorBridge.unregisterDocumentExtractor(name);
        } catch (Exception e) {
            throw wrapBridgeFailure(e);
        }
    }

    /**
     * Registers a renderer through {@code RendererBridge}.
     *
     * @param impl the renderer implementation; must not be {@code null}
     * @throws KreuzbergRsException if the bridge fails to register the renderer
     * @throws NullPointerException if {@code impl} is {@code null}
     */
    public static void registerRenderer(final IRenderer impl) throws KreuzbergRsException {
        Objects.requireNonNull(impl, "impl must not be null");
        try {
            RendererBridge.registerRenderer(impl);
        } catch (Exception e) {
            throw wrapBridgeFailure(e);
        }
    }

    /**
     * Unregisters a renderer through {@code RendererBridge}.
     *
     * @param name the name of the renderer to remove; must not be {@code null}
     * @throws KreuzbergRsException if the bridge fails to unregister the renderer
     * @throws NullPointerException if {@code name} is {@code null}
     */
    public static void unregisterRenderer(final String name) throws KreuzbergRsException {
        Objects.requireNonNull(name, "name must not be null");
        try {
            RendererBridge.unregisterRenderer(name);
        } catch (Exception e) {
            throw wrapBridgeFailure(e);
        }
    }

    /**
     * Converts an exception raised by a bridge call into a {@code KreuzbergRsException}.
     *
     * <p>An exception that already is a {@code KreuzbergRsException} is returned unchanged so
     * its concrete type is not hidden behind a second wrapper; anything else is wrapped with its
     * message and kept as the cause.
     */
    private static KreuzbergRsException wrapBridgeFailure(final Exception e) {
        if (e instanceof KreuzbergRsException) {
            return (KreuzbergRsException) e;
        }
        return new KreuzbergRsException(e.getMessage(), e);
    }
}