Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/e2e/r/DESCRIPTION
+++ b/e2e/r/DESCRIPTION
@@ -0,0 +1,6 @@
+Package: e2e.r
+Title: E2E Tests for kreuzberg
+Version: 0.1.0
+Description: End-to-end test suite.
+Suggests: testthat (>= 3.0.0)
+Config/testthat/edition: 3
--- a/e2e/r/run_tests.R
+++ b/e2e/r/run_tests.R
@@ -0,0 +1,11 @@
+# This file is auto-generated by alef — DO NOT EDIT.
+# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
+# To regenerate: alef generate
+# To verify freshness: alef verify --exit-code
+# Issues & docs: https://github.com/kreuzberg-dev/alef
+library(testthat)
+.script_arg <- sub("^--file=", "", grep("^--file=", commandArgs(trailingOnly = FALSE), value = TRUE)) # script path when launched via Rscript
+.script_dir <- tryCatch(dirname(normalizePath(sys.frame(1)$ofile)), error = function(e) tryCatch(dirname(normalizePath(.script_arg[[1]], mustWork = TRUE)), error = function(e2) getwd())) # source()'d file first, then --file=, then the working directory
+devtools::load_all(file.path(.script_dir, "..", "..", "packages", "r")) # resolve the package against this script, not the caller's working directory
+testthat::set_max_fails(Inf)
+test_dir(file.path(.script_dir, "tests"))
--- a/e2e/r/tests/setup-fixtures.R
+++ b/e2e/r/tests/setup-fixtures.R
@@ -0,0 +1,37 @@
+# This file is auto-generated by alef — DO NOT EDIT.
+# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
+# To regenerate: alef generate
+# To verify freshness: alef verify --exit-code
+# Issues & docs: https://github.com/kreuzberg-dev/alef
+
+# Resolve fixture paths against the repo's `test_documents/` directory.
+# testthat sources setup-*.R with the working directory at tests/,
+# so test_documents lives three directories up: tests/ -> e2e/r/ -> e2e/ -> repo root.
+# Each `test_that()` block has its working directory reset back to tests/, so
+# fixture lookups must be performed via this helper rather than relying on `setwd`.
+.alef_test_documents <- normalizePath("../../../test_documents", mustWork = FALSE)
+# Prefix `path` with the shared fixture root when that directory exists; otherwise hand `path` back unchanged.
+.resolve_fixture <- function(path) {
+  if (!dir.exists(.alef_test_documents)) {
+    return(path)
+  }
+  file.path(.alef_test_documents, path)
+}
+
+# Reduce a decoded format value to its name: nested variant `format`, then own `format`, then `format_type`.
+.alef_format_value <- function(x) {
+  if (!is.list(x)) {
+    return(x)
+  }
+  is_name <- function(candidate) !is.null(candidate) && is.character(candidate)
+  for (tag in names(x)) {
+    payload <- x[[tag]]
+    if (is.list(payload) && is_name(payload[["format"]])) {
+      return(payload[["format"]])
+    }
+  }
+  if (is_name(x[["format"]])) {
+    return(x[["format"]])
+  }
+  if (is.null(x[["format_type"]])) x else x[["format_type"]]
+}
--- a/e2e/r/tests/test_async.R
+++ b/e2e/r/tests/test_async.R
@@ -0,0 +1,21 @@
+# This file is auto-generated by alef — DO NOT EDIT.
+# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
+# To regenerate: alef generate
+# To verify freshness: alef verify --exit-code
+# Issues & docs: https://github.com/kreuzberg-dev/alef
+# E2e tests for category: async
+
+test_that("async_extract_bytes: Async extract_bytes call on PDF document", {
+  result <- jsonlite::fromJSON(extract_bytes(content = readBin(.resolve_fixture("pdf/fake_memo.pdf"), what = "raw", n = file.info(.resolve_fixture("pdf/fake_memo.pdf"))$size), mime_type = "application/pdf", config = ExtractionConfig$default()), simplifyVector = FALSE)
+  expect_true(TRUE)
+  expect_equal(trimws(result$mime_type), "application/pdf")
+  expect_true(nchar(result$content) >= 50)
+})
+
+test_that("async_extract_bytes_empty_mime: extract_bytes empty MIME async", {
+  expect_error(extract_bytes(content = readBin(.resolve_fixture("text/plain.txt"), what = "raw", n = file.info(.resolve_fixture("text/plain.txt"))$size), mime_type = "", config = ExtractionConfig$default()))
+})
+
+test_that("async_extract_bytes_invalid_mime: extract_bytes unsupported MIME async", {
+  expect_error(extract_bytes(content = readBin(.resolve_fixture("text/plain.txt"), what = "raw", n = file.info(.resolve_fixture("text/plain.txt"))$size), mime_type = "application/x-nonexistent", config = ExtractionConfig$default()))
+})
--- a/e2e/r/tests/test_batch.R
+++ b/e2e/r/tests/test_batch.R
@@ -0,0 +1,58 @@
+# This file is auto-generated by alef — DO NOT EDIT.
+# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
+# To regenerate: alef generate
+# To verify freshness: alef verify --exit-code
+# Issues & docs: https://github.com/kreuzberg-dev/alef
+# E2e tests for category: batch
+
+test_that("batch_bytes_invalid_mime: batch_extract_bytes_sync invalid MIME", {
+  result <- jsonlite::fromJSON(batch_extract_bytes_sync(items = "[{\"content\":[72,101,108,108,111],\"mime_type\":\"application/x-nonexistent\"}]", config = ExtractionConfig$default()), simplifyVector = FALSE)
+  expect_true(TRUE)
+})
+
+test_that("batch_extract_bytes_happy: batch_extract_bytes: happy path with mixed inputs", {
+  result <- jsonlite::fromJSON(batch_extract_bytes(items = "[{\"content\":[72,101,108,108,111,44,32,119,111,114,108,100,33],\"mime_type\":\"text/plain\"},{\"content\":[60,104,116,109,108,62,60,98,111,100,121,62,84,101,115,116,60,47,98,111,100,121,62,60,47,104,116,109,108,62],\"mime_type\":\"text/html\"}]", config = ExtractionConfig$default()), simplifyVector = FALSE)
+  expect_true(TRUE)
+  expect_true(length(result) >= 1)
+})
+
+test_that("batch_extract_bytes_mixed_format: batch_extract_bytes: handles unsupported MIME gracefully", {
+  result <- jsonlite::fromJSON(batch_extract_bytes(items = "[{\"content\":[80,68,70,32,112,108,97,99,101,104,111,108,100,101,114],\"mime_type\":\"application/x-unknown\"}]", config = ExtractionConfig$default()), simplifyVector = FALSE)
+  expect_true(TRUE)
+})
+
+test_that("batch_extract_bytes_sync_empty_list: batch_extract_bytes_sync: empty batch", {
+  result <- jsonlite::fromJSON(batch_extract_bytes_sync(items = "[]", config = ExtractionConfig$default()), simplifyVector = FALSE)
+  expect_true(TRUE)
+  expect_equal(length(result), 0)
+})
+
+test_that("batch_extract_bytes_sync_invalid_mime: batch_extract_bytes_sync: unsupported MIME", {
+  result <- jsonlite::fromJSON(batch_extract_bytes_sync(items = "[{\"content\":[100,97,116,97],\"mime_type\":\"application/x-unknown\"}]", config = ExtractionConfig$default()), simplifyVector = FALSE)
+  expect_true(TRUE)
+})
+
+test_that("batch_file_async_basic: Extract text from multiple files asynchronously", { # item paths go through .resolve_fixture because tests run with tests/ as the working directory
+  result <- jsonlite::fromJSON(batch_extract_files(items = as.character(jsonlite::toJSON(list(list(path = .resolve_fixture("pdf/fake_memo.pdf")), list(path = .resolve_fixture("text/fake_text.txt"))), auto_unbox = TRUE)), config = ExtractionConfig$default()), simplifyVector = FALSE)
+  expect_true(TRUE)
+})
+
+test_that("batch_file_async_not_found: batch_extract_file async nonexistent", {
+  result <- jsonlite::fromJSON(batch_extract_files(items = "[{\"path\":\"/nonexistent/a.pdf\"}]", config = ExtractionConfig$default()), simplifyVector = FALSE)
+  expect_true(TRUE)
+})
+
+test_that("batch_file_not_found: batch_extract_file_sync nonexistent", {
+  result <- jsonlite::fromJSON(batch_extract_files_sync(items = "[{\"path\":\"/nonexistent/a.pdf\"},{\"path\":\"/nonexistent/b.txt\"}]", config = ExtractionConfig$default()), simplifyVector = FALSE)
+  expect_true(TRUE)
+})
+
+test_that("batch_file_partial: batch_extract_file_sync mixed", { # only the existing fixture is resolved; the second path stays deliberately nonexistent
+  result <- jsonlite::fromJSON(batch_extract_files_sync(items = as.character(jsonlite::toJSON(list(list(path = .resolve_fixture("text/plain.txt")), list(path = "/nonexistent/missing.pdf")), auto_unbox = TRUE)), config = ExtractionConfig$default()), simplifyVector = FALSE)
+  expect_true(TRUE)
+})
+
+test_that("batch_file_sync_basic: Extract text from multiple files synchronously", { # item paths go through .resolve_fixture because tests run with tests/ as the working directory
+  result <- jsonlite::fromJSON(batch_extract_files_sync(items = as.character(jsonlite::toJSON(list(list(path = .resolve_fixture("pdf/fake_memo.pdf")), list(path = .resolve_fixture("text/fake_text.txt"))), auto_unbox = TRUE)), config = ExtractionConfig$default()), simplifyVector = FALSE)
+  expect_true(TRUE)
+})
--- a/e2e/r/tests/test_code.R
+++ b/e2e/r/tests/test_code.R
@@ -0,0 +1,14 @@
+# This file is auto-generated by alef — DO NOT EDIT.
+# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
+# To regenerate: alef generate
+# To verify freshness: alef verify --exit-code
+# Issues & docs: https://github.com/kreuzberg-dev/alef
+# E2e tests for category: code
+
+test_that("code_shebang_detection: Test language detection from shebang line via bytes input", {
+  result <- jsonlite::fromJSON(extract_file_sync(path = .resolve_fixture("code/script.sh"), mime_type = "text/x-source-code", config = ExtractionConfig$default()), simplifyVector = FALSE)
+  expect_equal(trimws(result$mime_type), "text/x-source-code")
+  expect_true(nchar(result$content) >= 10)
+  expect_true(any(grepl("build", result$content, fixed = TRUE)))
+  expect_true(any(grepl("clean", result$content, fixed = TRUE)))
+})
--- a/e2e/r/tests/test_contract.R
+++ b/e2e/r/tests/test_contract.R
@@ -0,0 +1,126 @@
+# This file is auto-generated by alef — DO NOT EDIT.
+# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
+# To regenerate: alef generate
+# To verify freshness: alef verify --exit-code
+# Issues & docs: https://github.com/kreuzberg-dev/alef
+# E2e tests for category: contract
+
+test_that("api_batch_bytes_async: Tests async batch bytes extraction API (batch_extract_bytes)", {
+  result <- jsonlite::fromJSON(extract_file(path = .resolve_fixture("pdf/fake_memo.pdf"), mime_type = NULL, config = ExtractionConfig$default()), simplifyVector = FALSE)
+  expect_equal(trimws(result$mime_type), "application/pdf")
+  expect_true(nchar(result$content) >= 10)
+  expect_true(any(sapply(c("May 5, 2023", "Mallori"), function(v) grepl(v, result$content, fixed = TRUE))))
+})
+
+test_that("api_batch_bytes_with_configs_async: Tests async batch bytes extraction with per-file configs (batch_extract_bytes with file_configs parameter)", {
+  result <- jsonlite::fromJSON(extract_file(path = .resolve_fixture("pdf/fake_memo.pdf"), mime_type = NULL, config = ExtractionConfig$from_json(jsonlite::toJSON(list("output_format" = "markdown"), auto_unbox = TRUE))), simplifyVector = FALSE)
+  expect_equal(trimws(result$mime_type), "application/pdf")
+  expect_true(nchar(result$content) >= 10)
+  # skipped: field 'metadata.output_format' not available on result type
+})
+
+test_that("api_batch_file_async: Tests async batch file extraction API (batch_extract_file)", {
+  result <- jsonlite::fromJSON(extract_file(path = .resolve_fixture("pdf/fake_memo.pdf"), mime_type = NULL, config = ExtractionConfig$default()), simplifyVector = FALSE)
+  expect_equal(trimws(result$mime_type), "application/pdf")
+  expect_true(nchar(result$content) >= 10)
+  expect_true(any(sapply(c("May 5, 2023", "Mallori"), function(v) grepl(v, result$content, fixed = TRUE))))
+})
+
+test_that("api_batch_file_with_configs_async: Tests async batch file extraction with per-file configs (batch_extract_files with file_configs parameter)", {
+  result <- jsonlite::fromJSON(extract_file(path = .resolve_fixture("pdf/fake_memo.pdf"), mime_type = NULL, config = ExtractionConfig$from_json(jsonlite::toJSON(list("output_format" = "markdown"), auto_unbox = TRUE))), simplifyVector = FALSE)
+  expect_equal(trimws(result$mime_type), "application/pdf")
+  expect_true(nchar(result$content) >= 10)
+  # skipped: field 'metadata.output_format' not available on result type
+})
+
+test_that("api_extract_bytes_async: Tests async bytes extraction API (extract_bytes)", {
+  result <- jsonlite::fromJSON(extract_file(path = .resolve_fixture("pdf/fake_memo.pdf"), mime_type = NULL, config = ExtractionConfig$default()), simplifyVector = FALSE)
+  expect_equal(trimws(result$mime_type), "application/pdf")
+  expect_true(nchar(result$content) >= 10)
+  expect_true(any(sapply(c("May 5, 2023", "Mallori"), function(v) grepl(v, result$content, fixed = TRUE))))
+})
+
+test_that("api_extract_file_async: Tests async file extraction API (extract_file)", {
+  result <- jsonlite::fromJSON(extract_file(path = .resolve_fixture("pdf/fake_memo.pdf"), mime_type = NULL, config = ExtractionConfig$default()), simplifyVector = FALSE)
+  expect_equal(trimws(result$mime_type), "application/pdf")
+  expect_true(nchar(result$content) >= 10)
+  expect_true(any(sapply(c("May 5, 2023", "Mallori"), function(v) grepl(v, result$content, fixed = TRUE))))
+})
+
+test_that("config_chunking_prepend_heading_context: Tests markdown chunker prepends heading hierarchy to chunk content", {
+  result <- jsonlite::fromJSON(extract_file_sync(path = .resolve_fixture("markdown/extraction_test.md"), mime_type = NULL, config = ExtractionConfig$from_json(jsonlite::toJSON(list("chunking" = list("chunker_type" = "markdown", "max_chars" = 300, "max_overlap" = 50, "prepend_heading_context" = TRUE)), auto_unbox = TRUE))), simplifyVector = FALSE)
+  expect_true(nchar(result$content) >= 10)
+  # Chunk checks read `result$chunks` using base R only: `%||%` is not in base before R 4.4, and vapply() pins a logical result.
+  expect_true(all(vapply(if (is.null(result$chunks)) list() else result$chunks, function(chunk) nchar(chunk$content) > 0, logical(1))))
+  expect_true(!is.null(result$chunks) && length(result$chunks) > 0 && all(vapply(result$chunks, function(chunk) nchar(chunk$content) > 0, logical(1))))
+  expect_true(!is.null(result$chunks) && length(result$chunks) > 0 && startsWith(trimws(result$chunks[[1]]$content), "#"))
+})
+
+test_that("config_document_structure_with_headings: Tests document structure with DOCX heading-driven nesting", {
+  result <- jsonlite::fromJSON(extract_file_sync(path = .resolve_fixture("docx/fake.docx"), mime_type = NULL, config = ExtractionConfig$from_json(jsonlite::toJSON(list("include_document_structure" = TRUE), auto_unbox = TRUE))), simplifyVector = FALSE)
+  expect_equal(trimws(result$mime_type), "application/vnd.openxmlformats-officedocument.wordprocessingml.document")
+  # skipped: field 'document' not available on result type
+  # skipped: field 'document.nodes' not available on result type
+})
+
+test_that("config_element_types: Tests element-based result format with element type assertions on DOCX", {
+  result <- jsonlite::fromJSON(extract_file_sync(path = .resolve_fixture("docx/unit_test_headers.docx"), mime_type = NULL, config = ExtractionConfig$from_json(jsonlite::toJSON(list("result_format" = "element_based"), auto_unbox = TRUE))), simplifyVector = FALSE)
+  expect_true(any(sapply(c("application/vnd.openxmlformats-officedocument.wordprocessingml.document"), function(v) grepl(v, result$mime_type, fixed = TRUE))))
+  # skipped: field 'elements' not available on result type
+})
+
+test_that("config_extraction_timeout: Tests that extraction_timeout_secs config field is accepted and does not affect fast extractions", {
+  result <- jsonlite::fromJSON(extract_file_sync(path = .resolve_fixture("pdf/fake_memo.pdf"), mime_type = NULL, config = ExtractionConfig$from_json(jsonlite::toJSON(list("extraction_timeout_secs" = 300), auto_unbox = TRUE))), simplifyVector = FALSE)
+  expect_equal(trimws(result$mime_type), "application/pdf")
+  expect_true(nchar(result$content) >= 10)
+})
+
+test_that("config_keywords: Tests keyword extraction via YAKE algorithm", {
+  result <- jsonlite::fromJSON(extract_file_sync(path = .resolve_fixture("pdf/fake_memo.pdf"), mime_type = NULL, config = ExtractionConfig$from_json(jsonlite::toJSON(list("keywords" = list("algorithm" = "yake", "max_keywords" = 10)), auto_unbox = TRUE))), simplifyVector = FALSE)
+  expect_equal(trimws(result$mime_type), "application/pdf")
+  expect_true(nchar(result$content) >= 10)
+  # skipped: field 'keywords' not available on R ExtractionResult
+  # skipped: field 'keywords' not available on R ExtractionResult
+})
+
+test_that("config_pages: Tests page extraction and page marker configuration", {
+  result <- jsonlite::fromJSON(extract_file_sync(path = .resolve_fixture("pdf/fake_memo.pdf"), mime_type = NULL, config = ExtractionConfig$from_json(jsonlite::toJSON(list("pages" = list("extract_pages" = TRUE, "insert_page_markers" = TRUE)), auto_unbox = TRUE))), simplifyVector = FALSE)
+  expect_equal(trimws(result$mime_type), "application/pdf")
+  expect_true(nchar(result$content) >= 10)
+  expect_true(any(sapply(c("PAGE"), function(v) grepl(v, result$content, fixed = TRUE))))
+})
+
+test_that("config_quality_enabled: Tests quality scoring produces a score value in [0.0, 1.0]", {
+  result <- jsonlite::fromJSON(extract_file_sync(path = .resolve_fixture("pdf/fake_memo.pdf"), mime_type = NULL, config = ExtractionConfig$from_json(jsonlite::toJSON(list("enable_quality_processing" = TRUE), auto_unbox = TRUE))), simplifyVector = FALSE)
+  expect_equal(trimws(result$mime_type), "application/pdf")
+  expect_true(nchar(result$content) >= 10)
+  # skipped: field 'quality_score' not available on result type
+  # skipped: field 'quality_score' not available on result type
+  # skipped: field 'quality_score' not available on result type
+})
+
+test_that("config_security_limits: Tests archive extraction with custom security limits", {
+  result <- jsonlite::fromJSON(extract_file_sync(path = .resolve_fixture("archives/documents.zip"), mime_type = NULL, config = ExtractionConfig$from_json(jsonlite::toJSON(list("security_limits" = list("max_archive_size" = 104857600, "max_compression_ratio" = 50, "max_files_in_archive" = 100)), auto_unbox = TRUE))), simplifyVector = FALSE)
+  expect_true(any(sapply(c("application/zip", "application/x-zip-compressed"), function(v) grepl(v, result$mime_type, fixed = TRUE))))
+  expect_true(nchar(result$content) >= 10)
+})
+
+test_that("config_tree_sitter: Tests tree-sitter configuration round-trip", {
+  result <- jsonlite::fromJSON(extract_file_sync(path = .resolve_fixture("code/hello.py"), mime_type = NULL, config = ExtractionConfig$from_json(jsonlite::toJSON(list("tree_sitter" = list("groups" = I(c("web")), "languages" = I(c("python", "rust")), "process" = list("comments" = FALSE, "diagnostics" = FALSE, "docstrings" = FALSE, "exports" = TRUE, "imports" = TRUE, "structure" = TRUE, "symbols" = FALSE))), auto_unbox = TRUE))), simplifyVector = FALSE)
+  expect_equal(trimws(result$mime_type), "text/x-source-code")
+  expect_true(nchar(result$content) >= 5)
+})
+
+test_that("output_format_bytes_markdown: Tests markdown output format via bytes extraction API", {
+  result <- jsonlite::fromJSON(extract_bytes_sync(content = readBin(.resolve_fixture("pdf/fake_memo.pdf"), what = "raw", n = file.info(.resolve_fixture("pdf/fake_memo.pdf"))$size), mime_type = "application/pdf", config = ExtractionConfig$from_json(jsonlite::toJSON(list("output_format" = "markdown"), auto_unbox = TRUE))), simplifyVector = FALSE)
+  expect_equal(trimws(result$mime_type), "application/pdf")
+  expect_true(nchar(result$content) >= 10)
+  # skipped: field 'metadata.output_format' not available on result type
+})
+
+test_that("output_format_markdown: Tests Markdown output format", {
+  result <- jsonlite::fromJSON(extract_file_sync(path = .resolve_fixture("pdf/fake_memo.pdf"), mime_type = NULL, config = ExtractionConfig$from_json(jsonlite::toJSON(list("output_format" = "markdown"), auto_unbox = TRUE))), simplifyVector = FALSE)
+  expect_equal(trimws(result$mime_type), "application/pdf")
+  expect_true(nchar(result$content) >= 10)
+  # skipped: field 'metadata.output_format' not available on result type
+})
--- a/e2e/r/tests/test_detection.R
+++ b/e2e/r/tests/test_detection.R
@@ -0,0 +1,25 @@
+# This file is auto-generated by alef — DO NOT EDIT.
+# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
+# To regenerate: alef generate
+# To verify freshness: alef verify --exit-code
+# Issues & docs: https://github.com/kreuzberg-dev/alef
+# E2e tests for category: detection
+
+test_that("detect_mime_bytes_html: Detect HTML MIME from bytes", {
+  result <- detect_mime_type_from_bytes(content = readBin(.resolve_fixture("html/html.html"), what = "raw", n = file.info(.resolve_fixture("html/html.html"))$size))
+  expect_true(TRUE)
+})
+
+test_that("detect_mime_bytes_pdf: Detect PDF MIME type from bytes", {
+  result <- detect_mime_type_from_bytes(content = readBin(.resolve_fixture("pdf/fake_memo.pdf"), what = "raw", n = file.info(.resolve_fixture("pdf/fake_memo.pdf"))$size))
+  expect_true(TRUE)
+})
+
+test_that("detect_mime_bytes_png: Detect PNG MIME type from bytes", {
+  result <- detect_mime_type_from_bytes(content = readBin(.resolve_fixture("images/test_hello_world.png"), what = "raw", n = file.info(.resolve_fixture("images/test_hello_world.png"))$size))
+  expect_true(TRUE)
+})
+
+test_that("get_extensions_unknown_mime: get_extensions unknown MIME", {
+  expect_error(get_extensions_for_mime(mime_type = "application/x-totally-unknown"))
+})
--- a/e2e/r/tests/test_document_extractor_management.R
+++ b/e2e/r/tests/test_document_extractor_management.R
@@ -0,0 +1,16 @@
+# This file is auto-generated by alef — DO NOT EDIT.
+# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
+# To regenerate: alef generate
+# To verify freshness: alef verify --exit-code
+# Issues & docs: https://github.com/kreuzberg-dev/alef
+# E2e tests for category: document_extractor_management
+
+test_that("document_extractors_clear: Clear all document extractors and verify list is empty", {
+  invisible(clear_document_extractors()) # NOTE(review): if the registry is process-global, later test files may see no extractors - confirm defaults are restored
+  expect_true(TRUE) # placeholder expectation: the extractor list is not inspected after clearing
+})
+
+test_that("extractors_list: List all registered document extractors", {
+  result <- list_document_extractors()
+  expect_true(TRUE)
+})
--- a/e2e/r/tests/test_embed_async_pending.R
+++ b/e2e/r/tests/test_embed_async_pending.R
@@ -0,0 +1,23 @@
+# This file is auto-generated by alef — DO NOT EDIT.
+# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
+# To regenerate: alef generate
+# To verify freshness: alef verify --exit-code
+# Issues & docs: https://github.com/kreuzberg-dev/alef
+# E2e tests for category: embed_async_pending
+
+test_that("embed_texts_async_empty_input: embed_texts_async: empty text list", {
+  result <- embed_texts_async(texts = character(0), config = EmbeddingConfig$default())
+  expect_true(TRUE)
+  expect_equal(length((if (is.character(result) && length(result) == 1) jsonlite::fromJSON(result, simplifyVector = FALSE) else result)), 0)
+})
+
+test_that("embed_texts_async_happy: embed_texts_async: basic async embedding", {
+  result <- embed_texts_async(texts = c("First", "Second"), config = EmbeddingConfig$default())
+  expect_true(TRUE)
+  expect_gte(length((if (is.character(result) && length(result) == 1) jsonlite::fromJSON(result, simplifyVector = FALSE) else result)), 2)
+})
+
+test_that("embed_texts_async_preset_switch: embed_texts_async: preset override", {
+  result <- embed_texts_async(texts = c("Text"), config = EmbeddingConfig$from_json(jsonlite::toJSON(list("model" = list("name" = "balanced", "type" = "preset")), auto_unbox = TRUE)))
+  expect_true(TRUE)
+})
--- a/e2e/r/tests/test_embed_extra.R
+++ b/e2e/r/tests/test_embed_extra.R
@@ -0,0 +1,11 @@
+# This file is auto-generated by alef — DO NOT EDIT.
+# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
+# To regenerate: alef generate
+# To verify freshness: alef verify --exit-code
+# Issues & docs: https://github.com/kreuzberg-dev/alef
+# E2e tests for category: embed_extra
+
+test_that("embed_texts_batch: Batch embed texts", {
+  result <- embed_texts(texts = c("Hello", "World"), config = EmbeddingConfig$from_json(jsonlite::toJSON(list("model" = list("name" = "balanced", "type" = "preset")), auto_unbox = TRUE)))
+  expect_true(TRUE)
+})
--- a/e2e/r/tests/test_embedding_backend_management.R
+++ b/e2e/r/tests/test_embedding_backend_management.R
@@ -0,0 +1,16 @@
+# This file is auto-generated by alef — DO NOT EDIT.
+# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
+# To regenerate: alef generate
+# To verify freshness: alef verify --exit-code
+# Issues & docs: https://github.com/kreuzberg-dev/alef
+# E2e tests for category: embedding_backend_management
+
+test_that("embedding_backends_clear: Clear all embedding backends and verify list is empty", {
+  invisible(clear_embedding_backends()) # NOTE(review): if the registry is process-global, later embedding tests may see no backends - confirm defaults are restored
+  expect_true(TRUE) # placeholder expectation: the backend list is not inspected after clearing
+})
+
+test_that("embedding_backends_list: List all registered embedding backends", {
+  result <- list_embedding_backends()
+  expect_true(TRUE)
+})
--- a/e2e/r/tests/test_embeddings.R
+++ b/e2e/r/tests/test_embeddings.R
@@ -0,0 +1,32 @@
+# This file is auto-generated by alef — DO NOT EDIT.
+# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
+# To regenerate: alef generate
+# To verify freshness: alef verify --exit-code
+# Issues & docs: https://github.com/kreuzberg-dev/alef
+# E2e tests for category: embeddings
+
+test_that("embed_texts_different_preset: embed_texts: multilingual preset", {
+  result <- embed_texts(texts = c("Hello world", "Test"), config = EmbeddingConfig$from_json(jsonlite::toJSON(list("model" = list("name" = "multilingual", "type" = "preset")), auto_unbox = TRUE)))
+  expect_true(TRUE)
+  expect_gte(length((if (is.character(result) && length(result) == 1) jsonlite::fromJSON(result, simplifyVector = FALSE) else result)), 2)
+})
+
+test_that("get_embedding_preset_known: get_embedding_preset: known preset", {
+  result <- get_embedding_preset(name = "balanced")
+  expect_true(TRUE)
+})
+
+test_that("get_embedding_preset_nominal: get_embedding_preset: nominal case", {
+  result <- get_embedding_preset(name = "balanced")
+  expect_true(TRUE)
+})
+
+test_that("get_embedding_preset_unknown: get_embedding_preset: unknown preset fails", {
+  result <- get_embedding_preset(name = "nonexistent-xyz")
+  expect_true(is.null(result) || length(result) == 0 || (length(result) == 1 && (is.na(result) || identical(result, ""))))
+})
+
+test_that("list_embedding_presets_sanity: list_embedding_presets: returns at least one", {
+  result <- list_embedding_presets()
+  expect_true(if (is.character(result)) length(result) > 0 && any(nchar(result) > 0) else length(result) > 0)
+})
--- a/e2e/r/tests/test_error.R
+++ b/e2e/r/tests/test_error.R
@@ -0,0 +1,27 @@
+# This file is auto-generated by alef — DO NOT EDIT.
+# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
+# To regenerate: alef generate
+# To verify freshness: alef verify --exit-code
+# Issues & docs: https://github.com/kreuzberg-dev/alef
+# E2e tests for category: error
+
+test_that("error_empty_bytes: Graceful handling of empty bytes (should not error)", {
+  result <- jsonlite::fromJSON(extract_bytes_sync(content = readBin(.resolve_fixture("text/empty.txt"), what = "raw", n = file.info(.resolve_fixture("text/empty.txt"))$size), mime_type = "text/plain", config = ExtractionConfig$default()), simplifyVector = FALSE)
+  expect_true(TRUE)
+})
+
+test_that("error_empty_mime: Error when extracting with empty MIME type", {
+  expect_error(extract_bytes_sync(content = readBin(.resolve_fixture("text/plain.txt"), what = "raw", n = file.info(.resolve_fixture("text/plain.txt"))$size), mime_type = "", config = ExtractionConfig$default()))
+})
+
+test_that("error_extract_bytes_conflicting_ocr: extract_bytes force+disable OCR", {
+  expect_error(extract_bytes_sync(content = readBin(.resolve_fixture("text/fake_text.txt"), what = "raw", n = file.info(.resolve_fixture("text/fake_text.txt"))$size), mime_type = "text/plain", config = ExtractionConfig$from_json(jsonlite::toJSON(list("disable_ocr" = TRUE, "force_ocr" = TRUE), auto_unbox = TRUE))))
+})
+
+test_that("error_invalid_mime_format: Error when extracting with invalid MIME type format", {
+  expect_error(extract_bytes_sync(content = readBin(.resolve_fixture("text/plain.txt"), what = "raw", n = file.info(.resolve_fixture("text/plain.txt"))$size), mime_type = "not-a-mime", config = ExtractionConfig$default()))
+})
+
+test_that("error_unsupported_mime: Error when extracting with unsupported MIME type", {
+  expect_error(extract_bytes_sync(content = readBin(.resolve_fixture("text/plain.txt"), what = "raw", n = file.info(.resolve_fixture("text/plain.txt"))$size), mime_type = "application/x-nonexistent", config = ExtractionConfig$default()))
+})
--- a/e2e/r/tests/test_format_specific.R
+++ b/e2e/r/tests/test_format_specific.R
@@ -0,0 +1,36 @@
+# This file is auto-generated by alef — DO NOT EDIT.
+# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
+# To regenerate: alef generate
+# To verify freshness: alef verify --exit-code
+# Issues & docs: https://github.com/kreuzberg-dev/alef
+# E2e tests for category: format_specific
+
+test_that("format_docx_standalone: Standalone DOCX extraction using extract_bytes_sync", {
+  result <- jsonlite::fromJSON(extract_bytes_sync(content = readBin(.resolve_fixture("docx/fake.docx"), what = "raw", n = file.info(.resolve_fixture("docx/fake.docx"))$size), mime_type = "application/vnd.openxmlformats-officedocument.wordprocessingml.document", config = ExtractionConfig$default()), simplifyVector = FALSE)
+  expect_true(TRUE)
+  expect_true(nchar(result$content) >= 20)
+})
+
+test_that("format_hwpx_standalone: Standalone HWPX extraction using extract_bytes_sync", {
+  result <- jsonlite::fromJSON(extract_bytes_sync(content = readBin(.resolve_fixture("hwpx/simple.hwpx"), what = "raw", n = file.info(.resolve_fixture("hwpx/simple.hwpx"))$size), mime_type = "application/haansofthwpx", config = ExtractionConfig$default()), simplifyVector = FALSE)
+  expect_true(TRUE)
+  expect_true(nchar(result$content) >= 20)
+  expect_true(grepl("Hello from HWPX", result$content, fixed = TRUE))
+})
+
+test_that("format_pdf_text: Standalone PDF text extraction using extract_bytes_sync", {
+  result <- jsonlite::fromJSON(extract_bytes_sync(content = readBin(.resolve_fixture("pdf/fake_memo.pdf"), what = "raw", n = file.info(.resolve_fixture("pdf/fake_memo.pdf"))$size), mime_type = "application/pdf", config = ExtractionConfig$default()), simplifyVector = FALSE)
+  expect_true(TRUE)
+  expect_true(nchar(result$content) >= 50)
+  expect_true(any(sapply(c("Mallori", "May"), function(v) grepl(v, result$content, fixed = TRUE))))
+})
+
+test_that("format_pptx: PPTX presentation extraction using extract_file_sync", {
+  result <- jsonlite::fromJSON(extract_file_sync(path = .resolve_fixture("pptx/simple.pptx"), mime_type = "application/vnd.openxmlformats-officedocument.presentationml.presentation", config = ExtractionConfig$default()), simplifyVector = FALSE)
+  expect_true(TRUE)
+})
+
+test_that("format_xlsx: XLSX spreadsheet extraction using extract_file_sync", {
+  result <- jsonlite::fromJSON(extract_file_sync(path = .resolve_fixture("xlsx/stanley_cups.xlsx"), mime_type = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", config = ExtractionConfig$default()), simplifyVector = FALSE)
+  expect_true(TRUE)
+})
--- a/e2e/r/tests/test_mime_utilities.R
+++ b/e2e/r/tests/test_mime_utilities.R
@@ -0,0 +1,21 @@
+# This file is auto-generated by alef — DO NOT EDIT.
+# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
+# To regenerate: alef generate
+# To verify freshness: alef verify --exit-code
+# Issues & docs: https://github.com/kreuzberg-dev/alef
+# E2e tests for category: mime_utilities
+
+test_that("mime_detect_bytes: Detect MIME type from file bytes", {
+  result <- detect_mime_type_from_bytes(content = readBin(.resolve_fixture("pdf/fake_memo.pdf"), what = "raw", n = file.info(.resolve_fixture("pdf/fake_memo.pdf"))$size))
+  expect_true(grepl("pdf", result, fixed = TRUE))
+})
+
+test_that("mime_detect_image: Detect MIME type from PNG image bytes", {
+  result <- detect_mime_type_from_bytes(content = readBin(.resolve_fixture("images/test_hello_world.png"), what = "raw", n = file.info(.resolve_fixture("images/test_hello_world.png"))$size))
+  expect_true(grepl("png", result, fixed = TRUE))
+})
+
+test_that("mime_get_extensions: Get file extensions for a MIME type", {
+  result <- get_extensions_for_mime(mime_type = "application/pdf")
+  expect_true(any(grepl("pdf", result, fixed = TRUE))) # any(): still a single TRUE when the result holds several elements
+})
--- a/e2e/r/tests/test_ocr_backend_management.R
+++ b/e2e/r/tests/test_ocr_backend_management.R
@@ -0,0 +1,21 @@
+# This file is auto-generated by alef — DO NOT EDIT.
+# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
+# To regenerate: alef generate
+# To verify freshness: alef verify --exit-code
+# Issues & docs: https://github.com/kreuzberg-dev/alef
+# E2e tests for category: ocr_backend_management
+
+test_that("ocr_backends_clear: Clear all OCR backends and verify list is empty", {
+  # `regexp = NA` makes expect_error() assert that clearing raises nothing.
+  # NOTE(review): the title also promises an emptiness check; the shape of
+  # the listing is not visible here, so only the call itself is asserted.
+  expect_error(clear_ocr_backends(), NA)
+})
+
+test_that("ocr_backends_list: List all registered OCR backends", {
+  # The returned listing is not inspected; the call only has to complete.
+  # `regexp = NA` makes expect_error() assert that no error is raised.
+  expect_error(list_ocr_backends(), NA)
+})
+
+test_that("ocr_backends_unregister: Unregister nonexistent OCR backend gracefully", {
+  # "Gracefully" is checked directly: removing a name that was never
+  # registered must not raise (`regexp = NA` means "expect no error").
+  expect_error(unregister_ocr_backend(name = "nonexistent-backend-xyz"), NA)
+})
--- a/e2e/r/tests/test_pdf.R
+++ b/e2e/r/tests/test_pdf.R
@@ -0,0 +1,16 @@
+# This file is auto-generated by alef — DO NOT EDIT.
+# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
+# To regenerate: alef generate
+# To verify freshness: alef verify --exit-code
+# Issues & docs: https://github.com/kreuzberg-dev/alef
+# E2e tests for category: pdf
+
+test_that("render_pdf_page_first: render_pdf_page_to_png: first page", {
+  # Resolve the fixture once and reuse the path for both the read and the
+  # byte count.
+  pdf_path <- .resolve_fixture("pdf/fake_memo.pdf")
+  pdf_bytes <- readBin(pdf_path, what = "raw", n = file.size(pdf_path))
+  rendered <- render_pdf_page_to_png(pdf_bytes = pdf_bytes, page_index = 0, NULL, NULL)
+  # The length of the output is the only property checked.
+  expect_gte(length(rendered), 100)
+})
+
+test_that("render_pdf_page_out_of_range: render_pdf_page_to_png: page out of range", {
+  # Read the fixture outside expect_error(): a missing or unreadable file
+  # must surface as a test error, not satisfy the expectation by accident.
+  pdf_path <- .resolve_fixture("pdf/fake_memo.pdf")
+  pdf_bytes <- readBin(pdf_path, what = "raw", n = file.size(pdf_path))
+  # Only the render call with the out-of-range index is expected to raise.
+  expect_error(
+    render_pdf_page_to_png(pdf_bytes = pdf_bytes, page_index = 999, NULL, NULL)
+  )
+})
--- a/e2e/r/tests/test_plugin_api.R
+++ b/e2e/r/tests/test_plugin_api.R
@@ -0,0 +1,108 @@
+# This file is auto-generated by alef — DO NOT EDIT.
+# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
+# To regenerate: alef generate
+# To verify freshness: alef verify --exit-code
+# Issues & docs: https://github.com/kreuzberg-dev/alef
+# E2e tests for category: plugin_api
+
+test_that("register_document_extractor_trait_bridge: register_document_extractor: trait bridge", {
+  # Stub backend handed to the trait bridge; its callbacks only return
+  # placeholder values.
+  backend <- list(
+    name = "test-extractor",
+    initialize = function() invisible(NULL),
+    shutdown = function() invisible(NULL),
+    extract_bytes = function(content, mime_type, config) InternalDocument(),
+    supported_mime_types = function() c()
+  )
+  # `regexp = NA` asserts that registration raises nothing. A failed
+  # expectation does not abort the block, so the cleanup below still runs.
+  expect_error(register_document_extractor(r_backend = backend), NA)
+  unregister_document_extractor(name = "test-extractor")
+})
+
+test_that("register_embedding_backend_trait_bridge: register_embedding_backend: trait bridge", {
+  # Stub backend handed to the trait bridge; its callbacks only return
+  # placeholder values.
+  backend <- list(
+    name = "test-embedding-backend",
+    initialize = function() invisible(NULL),
+    shutdown = function() invisible(NULL),
+    dimensions = function() 768,
+    embed = function(texts) c()
+  )
+  # `regexp = NA` asserts that registration raises nothing. A failed
+  # expectation does not abort the block, so the cleanup below still runs.
+  expect_error(register_embedding_backend(r_backend = backend), NA)
+  unregister_embedding_backend(name = "test-embedding-backend")
+})
+
+test_that("register_ocr_backend_trait_bridge: register_ocr_backend: trait bridge", {
+  # Stub backend handed to the trait bridge; its callbacks only return
+  # placeholder values.
+  backend <- list(
+    name = "test-backend",
+    initialize = function() invisible(NULL),
+    shutdown = function() invisible(NULL),
+    process_image = function(image_bytes, config) ExtractionResult(),
+    supports_language = function(lang) FALSE,
+    backend_type = function() OcrBackendType()
+  )
+  # `regexp = NA` asserts that registration raises nothing. A failed
+  # expectation does not abort the block, so the cleanup below still runs.
+  expect_error(register_ocr_backend(r_backend = backend), NA)
+  unregister_ocr_backend(name = "test-backend")
+})
+
+test_that("register_post_processor_trait_bridge: register_post_processor: trait bridge", {
+  # Stub backend handed to the trait bridge; its callbacks only return
+  # placeholder values.
+  backend <- list(
+    name = "test-processor",
+    initialize = function() invisible(NULL),
+    shutdown = function() invisible(NULL),
+    process = function(result, config) NULL,
+    processing_stage = function() ProcessingStage()
+  )
+  # `regexp = NA` asserts that registration raises nothing. A failed
+  # expectation does not abort the block, so the cleanup below still runs.
+  expect_error(register_post_processor(r_backend = backend), NA)
+  unregister_post_processor(name = "test-processor")
+})
+
+test_that("register_renderer_trait_bridge: register_renderer: trait bridge", {
+  # Stub backend handed to the trait bridge; its callbacks only return
+  # placeholder values.
+  backend <- list(
+    name = "test-renderer",
+    initialize = function() invisible(NULL),
+    shutdown = function() invisible(NULL),
+    render = function(doc) ""
+  )
+  # `regexp = NA` asserts that registration raises nothing. A failed
+  # expectation does not abort the block, so the cleanup below still runs.
+  expect_error(register_renderer(r_backend = backend), NA)
+  unregister_renderer(name = "test-renderer")
+})
+
+test_that("register_validator_trait_bridge: register_validator: trait bridge", {
+  # Stub backend handed to the trait bridge; its callbacks only return
+  # placeholder values.
+  backend <- list(
+    name = "test-validator",
+    initialize = function() invisible(NULL),
+    shutdown = function() invisible(NULL),
+    validate = function(result, config) NULL
+  )
+  # `regexp = NA` asserts that registration raises nothing. A failed
+  # expectation does not abort the block, so the cleanup below still runs.
+  expect_error(register_validator(r_backend = backend), NA)
+  unregister_validator(name = "test-validator")
+})
+
+test_that("unregister_document_extractor_after_register: unregister_document_extractor", {
+  # `regexp = NA` makes expect_error() assert that the call raises nothing.
+  # NOTE(review): the registration test in this file already removes
+  # "test-extractor", so this presumably hits an absent entry; confirm.
+  expect_error(unregister_document_extractor(name = "test-extractor"), NA)
+})
+
+test_that("unregister_embedding_backend_after_register: unregister_embedding_backend", {
+  # `regexp = NA` makes expect_error() assert that the call raises nothing.
+  # NOTE(review): the registration test in this file already removes
+  # "test-embedding-backend", so this presumably hits an absent entry; confirm.
+  expect_error(unregister_embedding_backend(name = "test-embedding-backend"), NA)
+})
+
+test_that("unregister_post_processor_after_register: unregister_post_processor", {
+  # `regexp = NA` makes expect_error() assert that the call raises nothing.
+  # NOTE(review): the registration test in this file already removes
+  # "test-processor", so this presumably hits an absent entry; confirm.
+  expect_error(unregister_post_processor(name = "test-processor"), NA)
+})
+
+test_that("unregister_renderer_after_register: unregister_renderer", {
+  # `regexp = NA` makes expect_error() assert that the call raises nothing.
+  # NOTE(review): the registration test in this file already removes
+  # "test-renderer", so this presumably hits an absent entry; confirm.
+  expect_error(unregister_renderer(name = "test-renderer"), NA)
+})
+
+test_that("unregister_validator_after_register: unregister_validator", {
+  # `regexp = NA` makes expect_error() assert that the call raises nothing.
+  # NOTE(review): the registration test in this file already removes
+  # "test-validator", so this presumably hits an absent entry; confirm.
+  expect_error(unregister_validator(name = "test-validator"), NA)
+})
--- a/e2e/r/tests/test_post_processor_management.R
+++ b/e2e/r/tests/test_post_processor_management.R
@@ -0,0 +1,16 @@
+# This file is auto-generated by alef — DO NOT EDIT.
+# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
+# To regenerate: alef generate
+# To verify freshness: alef verify --exit-code
+# Issues & docs: https://github.com/kreuzberg-dev/alef
+# E2e tests for category: post_processor_management
+
+test_that("post_processors_clear: Clear all post-processors and verify list is empty", {
+  # `regexp = NA` makes expect_error() assert that clearing raises nothing.
+  # NOTE(review): the title also promises an emptiness check; the shape of
+  # the listing is not visible here, so only the call itself is asserted.
+  expect_error(clear_post_processors(), NA)
+})
+
+test_that("post_processors_list: List all registered post-processors", {
+  # The returned listing is not inspected; the call only has to complete.
+  # `regexp = NA` makes expect_error() assert that no error is raised.
+  expect_error(list_post_processors(), NA)
+})
--- a/e2e/r/tests/test_registry.R
+++ b/e2e/r/tests/test_registry.R
@@ -0,0 +1,36 @@
+# This file is auto-generated by alef — DO NOT EDIT.
+# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
+# To regenerate: alef generate
+# To verify freshness: alef verify --exit-code
+# Issues & docs: https://github.com/kreuzberg-dev/alef
+# E2e tests for category: registry
+
+test_that("list_document_extractors: List document extractors", {
+  # The returned listing is not inspected; the call only has to complete.
+  # `regexp = NA` makes expect_error() assert that no error is raised.
+  expect_error(list_document_extractors(), NA)
+})
+
+test_that("list_embedding_backends: List embedding backends", {
+  # The returned listing is not inspected; the call only has to complete.
+  # `regexp = NA` makes expect_error() assert that no error is raised.
+  expect_error(list_embedding_backends(), NA)
+})
+
+test_that("list_ocr_backends: List OCR backends", {
+  # The returned listing is not inspected; the call only has to complete.
+  # `regexp = NA` makes expect_error() assert that no error is raised.
+  expect_error(list_ocr_backends(), NA)
+})
+
+test_that("list_post_processors: List post-processors", {
+  # The returned listing is not inspected; the call only has to complete.
+  # `regexp = NA` makes expect_error() assert that no error is raised.
+  expect_error(list_post_processors(), NA)
+})
+
+test_that("list_renderers: List renderers", {
+  # The returned listing is not inspected; the call only has to complete.
+  # `regexp = NA` makes expect_error() assert that no error is raised.
+  expect_error(list_renderers(), NA)
+})
+
+test_that("list_validators: List validators", {
+  # The returned listing is not inspected; the call only has to complete.
+  # `regexp = NA` makes expect_error() assert that no error is raised.
+  expect_error(list_validators(), NA)
+})
--- a/e2e/r/tests/test_registry_operations.R
+++ b/e2e/r/tests/test_registry_operations.R
@@ -0,0 +1,21 @@
+# This file is auto-generated by alef — DO NOT EDIT.
+# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
+# To regenerate: alef generate
+# To verify freshness: alef verify --exit-code
+# Issues & docs: https://github.com/kreuzberg-dev/alef
+# E2e tests for category: registry_operations
+
+test_that("extensions_docx: Get file extensions for DOCX MIME type", {
+  # The returned extensions are not inspected; the lookup only has to
+  # complete (`regexp = NA` means "expect no error").
+  expect_error(
+    get_extensions_for_mime(mime_type = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
+    NA
+  )
+})
+
+test_that("extensions_html: Get file extensions for HTML MIME type", {
+  # The returned extensions are not inspected; the lookup only has to
+  # complete (`regexp = NA` means "expect no error").
+  expect_error(get_extensions_for_mime(mime_type = "text/html"), NA)
+})
+
+test_that("extensions_pdf: Get file extensions for PDF MIME type", {
+  # The returned extensions are not inspected; the lookup only has to
+  # complete (`regexp = NA` means "expect no error").
+  expect_error(get_extensions_for_mime(mime_type = "application/pdf"), NA)
+})
--- a/e2e/r/tests/test_renderer_management.R
+++ b/e2e/r/tests/test_renderer_management.R
@@ -0,0 +1,16 @@
+# This file is auto-generated by alef — DO NOT EDIT.
+# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
+# To regenerate: alef generate
+# To verify freshness: alef verify --exit-code
+# Issues & docs: https://github.com/kreuzberg-dev/alef
+# E2e tests for category: renderer_management
+
+test_that("renderers_clear: Clear all renderers and verify list is empty", {
+  # `regexp = NA` makes expect_error() assert that clearing raises nothing.
+  # NOTE(review): the title also promises an emptiness check; the shape of
+  # the listing is not visible here, so only the call itself is asserted.
+  expect_error(clear_renderers(), NA)
+})
+
+test_that("renderers_list: List all registered renderers", {
+  # The returned listing is not inspected; the call only has to complete.
+  # `regexp = NA` makes expect_error() assert that no error is raised.
+  expect_error(list_renderers(), NA)
+})
--- a/e2e/r/tests/test_smoke.R
+++ b/e2e/r/tests/test_smoke.R
@@ -0,0 +1,69 @@
+# This file is auto-generated by alef — DO NOT EDIT.
+# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
+# To regenerate: alef generate
+# To verify freshness: alef verify --exit-code
+# Issues & docs: https://github.com/kreuzberg-dev/alef
+# E2e tests for category: smoke
+
+test_that("ocr_image_png: OCR: PNG image extraction with OCR enabled. In WASM this exercises the Uint8Array bridge parameter and Promise await in the generated OcrBackend bridge.", {
+  # Resolve the fixture once and reuse the path for both the read and the
+  # byte count.
+  png_path <- .resolve_fixture("images/test_hello_world.png")
+  png_bytes <- readBin(png_path, what = "raw", n = file.size(png_path))
+  result <- jsonlite::fromJSON(
+    extract_bytes(
+      content = png_bytes,
+      mime_type = "image/png",
+      config = ExtractionConfig$default()
+    ),
+    simplifyVector = FALSE
+  )
+  expect_equal(trimws(result$mime_type), "image/png")
+  expect_gte(nchar(result$content), 1)
+  # One matching spelling is enough; vapply() pins each per-word result to a
+  # single logical instead of relying on sapply()'s type guessing.
+  expected_words <- c("Hello", "World", "hello", "world")
+  word_found <- vapply(
+    expected_words,
+    function(word) grepl(word, result$content, fixed = TRUE),
+    logical(1)
+  )
+  expect_true(any(word_found))
+})
+
+test_that("smoke_docx_basic: Smoke test: DOCX with formatted text", {
+  # The same MIME string is sent as the hint and expected back in the result.
+  docx_mime <- "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+  result <- jsonlite::fromJSON(
+    extract_file(
+      path = .resolve_fixture("docx/fake.docx"),
+      mime_type = docx_mime,
+      config = ExtractionConfig$default()
+    ),
+    simplifyVector = FALSE
+  )
+  expect_equal(trimws(result$mime_type), docx_mime)
+  expect_gte(nchar(result$content), 20)
+  # One matching word is enough; vapply() pins each per-word result to a
+  # single logical instead of relying on sapply()'s type guessing.
+  expected_words <- c("Lorem", "ipsum", "document", "text")
+  word_found <- vapply(
+    expected_words,
+    function(word) grepl(word, result$content, fixed = TRUE),
+    logical(1)
+  )
+  expect_true(any(word_found))
+})
+
+test_that("smoke_html_basic: Smoke test: HTML table extraction", {
+  result <- jsonlite::fromJSON(
+    extract_file(
+      path = .resolve_fixture("html/simple_table.html"),
+      mime_type = "text/html",
+      config = ExtractionConfig$default()
+    ),
+    simplifyVector = FALSE
+  )
+  expect_equal(trimws(result$mime_type), "text/html")
+  expect_gte(nchar(result$content), 10)
+  # One matching phrase is enough; vapply() pins each per-phrase result to a
+  # single logical instead of relying on sapply()'s type guessing.
+  expected_phrases <- c("Sample Data Table", "Laptop", "Electronics", "Product")
+  phrase_found <- vapply(
+    expected_phrases,
+    function(phrase) grepl(phrase, result$content, fixed = TRUE),
+    logical(1)
+  )
+  expect_true(any(phrase_found))
+})
+
+test_that("smoke_image_png: Smoke test: PNG image (without OCR, metadata only)", {
+  # OCR is switched off through a config built from JSON, and no MIME hint
+  # is passed (mime_type = NULL).
+  no_ocr_json <- jsonlite::toJSON(list("disable_ocr" = TRUE), auto_unbox = TRUE)
+  raw_json <- extract_file(
+    path = .resolve_fixture("images/sample.png"),
+    mime_type = NULL,
+    config = ExtractionConfig$from_json(no_ocr_json)
+  )
+  parsed <- jsonlite::fromJSON(raw_json, simplifyVector = FALSE)
+  # Only the reported MIME type is asserted.
+  expect_equal(trimws(parsed$mime_type), "image/png")
+})
+
+test_that("smoke_json_basic: Smoke test: JSON file extraction", {
+  result <- jsonlite::fromJSON(
+    extract_file(
+      path = .resolve_fixture("json/simple.json"),
+      mime_type = "application/json",
+      config = ExtractionConfig$default()
+    ),
+    simplifyVector = FALSE
+  )
+  expect_equal(trimws(result$mime_type), "application/json")
+  # expect_gte() reports the actual character count when the check fails.
+  expect_gte(nchar(result$content), 5)
+})
+
+test_that("smoke_pdf_basic: Smoke test: PDF with simple text extraction", {
+  result <- jsonlite::fromJSON(
+    extract_file(
+      path = .resolve_fixture("pdf/fake_memo.pdf"),
+      mime_type = "application/pdf",
+      config = ExtractionConfig$default()
+    ),
+    simplifyVector = FALSE
+  )
+  expect_equal(trimws(result$mime_type), "application/pdf")
+  expect_gte(nchar(result$content), 50)
+  # One matching phrase is enough; vapply() pins each per-phrase result to a
+  # single logical instead of relying on sapply()'s type guessing.
+  expected_phrases <- c("May 5, 2023", "To Whom it May Concern")
+  phrase_found <- vapply(
+    expected_phrases,
+    function(phrase) grepl(phrase, result$content, fixed = TRUE),
+    logical(1)
+  )
+  expect_true(any(phrase_found))
+})
+
+test_that("smoke_txt_basic: Smoke test: Plain text file", {
+  result <- jsonlite::fromJSON(
+    extract_file(
+      path = .resolve_fixture("text/report.txt"),
+      mime_type = "text/plain",
+      config = ExtractionConfig$default()
+    ),
+    simplifyVector = FALSE
+  )
+  expect_equal(trimws(result$mime_type), "text/plain")
+  # expect_gte() reports the actual character count when the check fails.
+  expect_gte(nchar(result$content), 5)
+})
+
+test_that("smoke_xlsx_basic: Smoke test: XLSX with basic spreadsheet data including tables", {
+  # The same MIME string is sent as the hint and expected back in the result.
+  xlsx_mime <- "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+  result <- jsonlite::fromJSON(
+    extract_file(
+      path = .resolve_fixture("xlsx/stanley_cups.xlsx"),
+      mime_type = xlsx_mime,
+      config = ExtractionConfig$default()
+    ),
+    simplifyVector = FALSE
+  )
+  expect_equal(trimws(result$mime_type), xlsx_mime)
+  expect_gte(nchar(result$content), 100)
+  # Every token must appear in the extracted text. One expectation is raised
+  # per token, and expect_match() names the missing token on failure.
+  # `all = FALSE` keeps the "any element matches" rule.
+  required_tokens <- c(
+    "Team", "Location", "Stanley Cups",
+    "Blues", "Flyers", "Maple Leafs",
+    "STL", "PHI", "TOR"
+  )
+  for (token in required_tokens) {
+    expect_match(result$content, token, fixed = TRUE, all = FALSE)
+  }
+  # skipped: field 'tables' not available on result type
+  # skipped: field 'metadata.format.excel.sheet_count' not available on result type
+  # skipped: field 'metadata.format.excel.sheet_names' not available on result type
+})
--- a/e2e/r/tests/test_validator_management.R
+++ b/e2e/r/tests/test_validator_management.R
@@ -0,0 +1,16 @@
+# This file is auto-generated by alef — DO NOT EDIT.
+# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
+# To regenerate: alef generate
+# To verify freshness: alef verify --exit-code
+# Issues & docs: https://github.com/kreuzberg-dev/alef
+# E2e tests for category: validator_management
+
+test_that("validators_clear: Clear all validators and verify list is empty", {
+  # `regexp = NA` makes expect_error() assert that clearing raises nothing.
+  # NOTE(review): the title also promises an emptiness check; the shape of
+  # the listing is not visible here, so only the call itself is asserted.
+  expect_error(clear_validators(), NA)
+})
+
+test_that("validators_list: List all registered validators", {
+  # The returned listing is not inspected; the call only has to complete.
+  # `regexp = NA` makes expect_error() assert that no error is raised.
+  expect_error(list_validators(), NA)
+})