Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/e2e/r/tests/test_contract.R
+++ b/e2e/r/tests/test_contract.R
@@ -0,0 +1,126 @@
+# This file is auto-generated by alef — DO NOT EDIT.
+# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
+# To regenerate: alef generate
+# To verify freshness: alef verify --exit-code
+# Issues & docs: https://github.com/kreuzberg-dev/alef
+# E2e tests for category: contract
+
+test_that("api_batch_bytes_async: Tests async batch bytes extraction API (batch_extract_bytes)", {  # NOTE(review): body calls extract_file, not batch_extract_bytes as the name says - confirm this fallback is intended
+  result <- jsonlite::fromJSON(extract_file(path = .resolve_fixture("pdf/fake_memo.pdf"), mime_type = NULL, config = ExtractionConfig$default()), simplifyVector = FALSE)  # parse the returned JSON into nested lists
+  expect_equal(trimws(result$mime_type), "application/pdf")
+  expect_true(nchar(result$content) >= 10)
+  expect_true(any(vapply(c("May 5, 2023", "Mallori"), function(needle) grepl(needle, result$content, fixed = TRUE), logical(1))))  # at least one literal must occur; vapply enforces exactly one logical per needle
+})
+
+test_that("api_batch_bytes_with_configs_async: Tests async batch bytes extraction with per-file configs (batch_extract_bytes with file_configs parameter)", {  # NOTE(review): body calls extract_file with a single config, not batch_extract_bytes - confirm
+  result <- jsonlite::fromJSON(extract_file(path = .resolve_fixture("pdf/fake_memo.pdf"), mime_type = NULL, config = ExtractionConfig$from_json(jsonlite::toJSON(list(output_format = "markdown"), auto_unbox = TRUE))), simplifyVector = FALSE)
+  expect_equal(trimws(result$mime_type), expected = "application/pdf")
+  expect_true(nchar(result$content) >= 10L)
+  # skipped: no assertion on 'metadata.output_format' (field not available on result type)
+})
+
+test_that("api_batch_file_async: Tests async batch file extraction API (batch_extract_file)", {  # NOTE(review): body calls extract_file, not batch_extract_file as the name says - confirm this fallback is intended
+  result <- jsonlite::fromJSON(extract_file(path = .resolve_fixture("pdf/fake_memo.pdf"), mime_type = NULL, config = ExtractionConfig$default()), simplifyVector = FALSE)  # parse the returned JSON into nested lists
+  expect_equal(trimws(result$mime_type), "application/pdf")
+  expect_true(nchar(result$content) >= 10)
+  expect_true(any(vapply(c("May 5, 2023", "Mallori"), function(needle) grepl(needle, result$content, fixed = TRUE), logical(1))))  # at least one literal must occur; vapply enforces exactly one logical per needle
+})
+
+test_that("api_batch_file_with_configs_async: Tests async batch file extraction with per-file configs (batch_extract_files with file_configs parameter)", {  # NOTE(review): body calls extract_file with a single config, not batch_extract_files - confirm
+  result <- jsonlite::fromJSON(extract_file(path = .resolve_fixture("pdf/fake_memo.pdf"), mime_type = NULL, config = ExtractionConfig$from_json(jsonlite::toJSON(list(output_format = "markdown"), auto_unbox = TRUE))), simplifyVector = FALSE)
+  expect_equal(trimws(result$mime_type), expected = "application/pdf")
+  expect_true(nchar(result$content) >= 10L)
+  # skipped: no assertion on 'metadata.output_format' (field not available on result type)
+})
+
+test_that("api_extract_bytes_async: Tests async bytes extraction API (extract_bytes)", {  # NOTE(review): body calls extract_file on a path, not extract_bytes as the name says - confirm this fallback is intended
+  result <- jsonlite::fromJSON(extract_file(path = .resolve_fixture("pdf/fake_memo.pdf"), mime_type = NULL, config = ExtractionConfig$default()), simplifyVector = FALSE)  # parse the returned JSON into nested lists
+  expect_equal(trimws(result$mime_type), "application/pdf")
+  expect_true(nchar(result$content) >= 10)
+  expect_true(any(vapply(c("May 5, 2023", "Mallori"), function(needle) grepl(needle, result$content, fixed = TRUE), logical(1))))  # at least one literal must occur; vapply enforces exactly one logical per needle
+})
+
+test_that("api_extract_file_async: Tests async file extraction API (extract_file)", {
+  result <- jsonlite::fromJSON(extract_file(path = .resolve_fixture("pdf/fake_memo.pdf"), mime_type = NULL, config = ExtractionConfig$default()), simplifyVector = FALSE)  # parse the returned JSON into nested lists
+  expect_equal(trimws(result$mime_type), "application/pdf")
+  expect_true(nchar(result$content) >= 10)
+  expect_true(any(vapply(c("May 5, 2023", "Mallori"), function(needle) grepl(needle, result$content, fixed = TRUE), logical(1))))  # at least one literal must occur; vapply enforces exactly one logical per needle
+})
+
+test_that("config_chunking_prepend_heading_context: Tests markdown chunker prepends heading hierarchy to chunk content", {
+  result <- jsonlite::fromJSON(extract_file_sync(path = .resolve_fixture("markdown/extraction_test.md"), mime_type = NULL, config = ExtractionConfig$from_json(jsonlite::toJSON(list("chunking" = list("chunker_type" = "markdown", "max_chars" = 300, "max_overlap" = 50, "prepend_heading_context" = TRUE)), auto_unbox = TRUE))), simplifyVector = FALSE)
+  expect_true(nchar(result$content) >= 10)
+  # Chunking is configured above, so `chunks` is asserted directly; both checks fail cleanly if it is missing or empty.
+  chunks <- result$chunks
+  expect_true(length(chunks) > 0 && all(vapply(chunks, function(chunk) isTRUE(nchar(chunk$content) > 0), logical(1))))  # every chunk must carry non-empty content
+  expect_true(length(chunks) > 0 && isTRUE(startsWith(trimws(chunks[[1]]$content), "#")))  # first chunk must open with a markdown heading marker
+})
+
+test_that("config_document_structure_with_headings: Tests document structure with DOCX heading-driven nesting", {
+  result <- jsonlite::fromJSON(extract_file_sync(path = .resolve_fixture("docx/fake.docx"), mime_type = NULL, config = ExtractionConfig$from_json(jsonlite::toJSON(list(include_document_structure = TRUE), auto_unbox = TRUE))), simplifyVector = FALSE)
+  expect_equal(trimws(result$mime_type), expected = "application/vnd.openxmlformats-officedocument.wordprocessingml.document")
+  # skipped: no assertion on 'document' (field not available on result type)
+  # skipped: no assertion on 'document.nodes' (field not available on result type)
+})
+
+test_that("config_element_types: Tests element-based result format with element type assertions on DOCX", {
+  result <- jsonlite::fromJSON(extract_file_sync(path = .resolve_fixture("docx/unit_test_headers.docx"), mime_type = NULL, config = ExtractionConfig$from_json(jsonlite::toJSON(list("result_format" = "element_based"), auto_unbox = TRUE))), simplifyVector = FALSE)  # parse the returned JSON into nested lists
+  expect_true(any(vapply(c("application/vnd.openxmlformats-officedocument.wordprocessingml.document"), function(needle) grepl(needle, result$mime_type, fixed = TRUE), logical(1))))  # substring match on mime_type; vapply enforces one logical per needle
+  # skipped: no assertion on 'elements' (field not available on result type)
+})
+
+test_that("config_extraction_timeout: Tests that extraction_timeout_secs config field is accepted and does not affect fast extractions", {
+  result <- jsonlite::fromJSON(extract_file_sync(path = .resolve_fixture("pdf/fake_memo.pdf"), mime_type = NULL, config = ExtractionConfig$from_json(jsonlite::toJSON(list(extraction_timeout_secs = 300), auto_unbox = TRUE))), simplifyVector = FALSE)
+  expect_equal(trimws(result$mime_type), expected = "application/pdf")
+  expect_true(nchar(result$content) >= 10L)
+})
+
+test_that("config_keywords: Tests keyword extraction via YAKE algorithm", {
+  result <- jsonlite::fromJSON(extract_file_sync(path = .resolve_fixture("pdf/fake_memo.pdf"), mime_type = NULL, config = ExtractionConfig$from_json(jsonlite::toJSON(list(keywords = list(algorithm = "yake", max_keywords = 10)), auto_unbox = TRUE))), simplifyVector = FALSE)
+  expect_equal(trimws(result$mime_type), expected = "application/pdf")
+  expect_true(nchar(result$content) >= 10L)
+  # skipped (note 1 of 2): no assertion on 'keywords' (field not available on R ExtractionResult)
+  # skipped (note 2 of 2): no assertion on 'keywords' (field not available on R ExtractionResult)
+})
+
+test_that("config_pages: Tests page extraction and page marker configuration", {
+  result <- jsonlite::fromJSON(extract_file_sync(path = .resolve_fixture("pdf/fake_memo.pdf"), mime_type = NULL, config = ExtractionConfig$from_json(jsonlite::toJSON(list("pages" = list("extract_pages" = TRUE, "insert_page_markers" = TRUE)), auto_unbox = TRUE))), simplifyVector = FALSE)  # parse the returned JSON into nested lists
+  expect_equal(trimws(result$mime_type), "application/pdf")
+  expect_true(nchar(result$content) >= 10)
+  expect_true(any(vapply(c("PAGE"), function(needle) grepl(needle, result$content, fixed = TRUE), logical(1))))  # content must contain the literal "PAGE"; vapply enforces one logical per needle
+})
+
+test_that("config_quality_enabled: Tests quality scoring produces a score value in [0.0, 1.0]", {
+  result <- jsonlite::fromJSON(extract_file_sync(path = .resolve_fixture("pdf/fake_memo.pdf"), mime_type = NULL, config = ExtractionConfig$from_json(jsonlite::toJSON(list(enable_quality_processing = TRUE), auto_unbox = TRUE))), simplifyVector = FALSE)
+  expect_equal(trimws(result$mime_type), expected = "application/pdf")
+  expect_true(nchar(result$content) >= 10L)
+  # skipped (note 1 of 3): no assertion on 'quality_score' (field not available on result type)
+  # skipped (note 2 of 3): no assertion on 'quality_score' (field not available on result type)
+  # skipped (note 3 of 3): no assertion on 'quality_score' (field not available on result type)
+})
+
+test_that("config_security_limits: Tests archive extraction with custom security limits", {
+  result <- jsonlite::fromJSON(extract_file_sync(path = .resolve_fixture("archives/documents.zip"), mime_type = NULL, config = ExtractionConfig$from_json(jsonlite::toJSON(list("security_limits" = list("max_archive_size" = 104857600, "max_compression_ratio" = 50, "max_files_in_archive" = 100)), auto_unbox = TRUE))), simplifyVector = FALSE)  # parse the returned JSON into nested lists
+  expect_true(any(vapply(c("application/zip", "application/x-zip-compressed"), function(needle) grepl(needle, result$mime_type, fixed = TRUE), logical(1))))  # either zip MIME spelling is accepted; vapply enforces one logical per needle
+  expect_true(nchar(result$content) >= 10)
+})
+
+test_that("config_tree_sitter: Tests tree-sitter configuration round-trip", {
+  result <- jsonlite::fromJSON(extract_file_sync(path = .resolve_fixture("code/hello.py"), mime_type = NULL, config = ExtractionConfig$from_json(jsonlite::toJSON(list(tree_sitter = list(groups = I("web"), languages = I(c("python", "rust")), process = list(comments = FALSE, diagnostics = FALSE, docstrings = FALSE, exports = TRUE, imports = TRUE, structure = TRUE, symbols = FALSE))), auto_unbox = TRUE))), simplifyVector = FALSE)  # I() keeps groups/languages as JSON arrays despite auto_unbox
+  expect_equal(trimws(result$mime_type), expected = "text/x-source-code")
+  expect_true(nchar(result$content) >= 5L)
+})
+
+test_that("output_format_bytes_markdown: Tests markdown output format via bytes extraction API", {
+  result <- jsonlite::fromJSON(extract_bytes_sync(content = readBin(.resolve_fixture("pdf/fake_memo.pdf"), what = "raw", n = file.size(.resolve_fixture("pdf/fake_memo.pdf"))), mime_type = "application/pdf", config = ExtractionConfig$from_json(jsonlite::toJSON(list(output_format = "markdown"), auto_unbox = TRUE))), simplifyVector = FALSE)  # whole fixture is read as raw bytes
+  expect_equal(trimws(result$mime_type), expected = "application/pdf")
+  expect_true(nchar(result$content) >= 10L)
+  # skipped: no assertion on 'metadata.output_format' (field not available on result type)
+})
+
+test_that("output_format_markdown: Tests Markdown output format", {
+  result <- jsonlite::fromJSON(extract_file_sync(path = .resolve_fixture("pdf/fake_memo.pdf"), mime_type = NULL, config = ExtractionConfig$from_json(jsonlite::toJSON(list(output_format = "markdown"), auto_unbox = TRUE))), simplifyVector = FALSE)
+  expect_equal(trimws(result$mime_type), expected = "application/pdf")
+  expect_true(nchar(result$content) >= 10L)
+  # skipped: no assertion on 'metadata.output_format' (field not available on result type)
+})