fil/e2e/r/tests/test_smoke.R

# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
# E2e tests for category: smoke

test_that("ocr_image_png: OCR: PNG image extraction with OCR enabled. In WASM this exercises the Uint8Array bridge parameter and Promise await in the generated OcrBackend bridge.", {
  # Resolve the fixture once and reuse the path for both the size lookup and
  # the binary read.
  fixture_path <- .resolve_fixture("images/test_hello_world.png")
  image_bytes <- readBin(fixture_path, what = "raw", n = file.size(fixture_path))

  # The output of extract_bytes() is decoded with jsonlite;
  # simplifyVector = FALSE keeps the decoded structure as nested lists.
  result <- jsonlite::fromJSON(
    extract_bytes(
      content = image_bytes,
      mime_type = "image/png",
      config = ExtractionConfig$default()
    ),
    simplifyVector = FALSE
  )

  expect_equal(trimws(result$mime_type), "image/png")
  expect_true(nchar(result$content) >= 1)

  # At least one of the expected words must appear in the content.
  # any() collapses each grepl() result to exactly one logical, so vapply()
  # can enforce the return type; a missing `content` then yields a plain
  # expectation failure instead of a type error inside any().
  expected_any <- c("Hello", "World", "hello", "world")
  found <- vapply(
    expected_any,
    function(needle) any(grepl(needle, result$content, fixed = TRUE)),
    logical(1)
  )
  expect_true(any(found))
})

test_that("smoke_docx_basic: Smoke test: DOCX with formatted text", {
  # The output of extract_file() is decoded with jsonlite;
  # simplifyVector = FALSE keeps the decoded structure as nested lists.
  result <- jsonlite::fromJSON(
    extract_file(
      path = .resolve_fixture("docx/fake.docx"),
      mime_type = "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
      config = ExtractionConfig$default()
    ),
    simplifyVector = FALSE
  )

  expect_equal(trimws(result$mime_type), "application/vnd.openxmlformats-officedocument.wordprocessingml.document")
  expect_true(nchar(result$content) >= 20)

  # At least one of the expected words must appear in the content.
  # any() collapses each grepl() result to exactly one logical, so vapply()
  # can enforce the return type; a missing `content` then yields a plain
  # expectation failure instead of a type error inside any().
  expected_any <- c("Lorem", "ipsum", "document", "text")
  found <- vapply(
    expected_any,
    function(needle) any(grepl(needle, result$content, fixed = TRUE)),
    logical(1)
  )
  expect_true(any(found))
})

test_that("smoke_html_basic: Smoke test: HTML table extraction", {
  # The output of extract_file() is decoded with jsonlite;
  # simplifyVector = FALSE keeps the decoded structure as nested lists.
  result <- jsonlite::fromJSON(
    extract_file(
      path = .resolve_fixture("html/simple_table.html"),
      mime_type = "text/html",
      config = ExtractionConfig$default()
    ),
    simplifyVector = FALSE
  )

  expect_equal(trimws(result$mime_type), "text/html")
  expect_true(nchar(result$content) >= 10)

  # At least one of the expected strings must appear in the content.
  # any() collapses each grepl() result to exactly one logical, so vapply()
  # can enforce the return type; a missing `content` then yields a plain
  # expectation failure instead of a type error inside any().
  expected_any <- c("Sample Data Table", "Laptop", "Electronics", "Product")
  found <- vapply(
    expected_any,
    function(needle) any(grepl(needle, result$content, fixed = TRUE)),
    logical(1)
  )
  expect_true(any(found))
})

test_that("smoke_image_png: Smoke test: PNG image (without OCR, metadata only)", {
  fixture_path <- .resolve_fixture("images/sample.png")

  # Build the extraction config from a JSON document that sets disable_ocr.
  config_json <- jsonlite::toJSON(list("disable_ocr" = TRUE), auto_unbox = TRUE)
  no_ocr_config <- ExtractionConfig$from_json(config_json)

  # No MIME type is passed (NULL); the reported type is asserted below.
  raw_json <- extract_file(
    path = fixture_path,
    mime_type = NULL,
    config = no_ocr_config
  )
  parsed <- jsonlite::fromJSON(raw_json, simplifyVector = FALSE)

  expect_equal(trimws(parsed$mime_type), "image/png")
})

test_that("smoke_json_basic: Smoke test: JSON file extraction", {
  fixture_path <- .resolve_fixture("json/simple.json")

  # Run the extractor with the default config, then decode its output into
  # nested lists (simplifyVector = FALSE).
  raw_json <- extract_file(
    path = fixture_path,
    mime_type = "application/json",
    config = ExtractionConfig$default()
  )
  parsed <- jsonlite::fromJSON(raw_json, simplifyVector = FALSE)

  expect_equal(trimws(parsed$mime_type), "application/json")
  # Content must be at least 5 characters long.
  expect_true(nchar(parsed$content) >= 5)
})

test_that("smoke_pdf_basic: Smoke test: PDF with simple text extraction", {
  # The output of extract_file() is decoded with jsonlite;
  # simplifyVector = FALSE keeps the decoded structure as nested lists.
  result <- jsonlite::fromJSON(
    extract_file(
      path = .resolve_fixture("pdf/fake_memo.pdf"),
      mime_type = "application/pdf",
      config = ExtractionConfig$default()
    ),
    simplifyVector = FALSE
  )

  expect_equal(trimws(result$mime_type), "application/pdf")
  expect_true(nchar(result$content) >= 50)

  # At least one of the expected phrases must appear in the content.
  # any() collapses each grepl() result to exactly one logical, so vapply()
  # can enforce the return type; a missing `content` then yields a plain
  # expectation failure instead of a type error inside any().
  expected_any <- c("May 5, 2023", "To Whom it May Concern")
  found <- vapply(
    expected_any,
    function(needle) any(grepl(needle, result$content, fixed = TRUE)),
    logical(1)
  )
  expect_true(any(found))
})

test_that("smoke_txt_basic: Smoke test: Plain text file", {
  fixture_path <- .resolve_fixture("text/report.txt")

  # Run the extractor with the default config, then decode its output into
  # nested lists (simplifyVector = FALSE).
  raw_json <- extract_file(
    path = fixture_path,
    mime_type = "text/plain",
    config = ExtractionConfig$default()
  )
  parsed <- jsonlite::fromJSON(raw_json, simplifyVector = FALSE)

  expect_equal(trimws(parsed$mime_type), "text/plain")
  # Content must be at least 5 characters long.
  expect_true(nchar(parsed$content) >= 5)
})

test_that("smoke_xlsx_basic: Smoke test: XLSX with basic spreadsheet data including tables", {
  fixture_path <- .resolve_fixture("xlsx/stanley_cups.xlsx")

  # Run the extractor with the default config, then decode its output into
  # nested lists (simplifyVector = FALSE).
  raw_json <- extract_file(
    path = fixture_path,
    mime_type = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    config = ExtractionConfig$default()
  )
  parsed <- jsonlite::fromJSON(raw_json, simplifyVector = FALSE)

  expect_equal(trimws(parsed$mime_type), "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")

  sheet_text <- parsed$content
  expect_true(nchar(sheet_text) >= 100)

  # Each of the following literal strings must be present in the content.
  expect_true(any(grepl("Team", sheet_text, fixed = TRUE)))
  expect_true(any(grepl("Location", sheet_text, fixed = TRUE)))
  expect_true(any(grepl("Stanley Cups", sheet_text, fixed = TRUE)))
  expect_true(any(grepl("Blues", sheet_text, fixed = TRUE)))
  expect_true(any(grepl("Flyers", sheet_text, fixed = TRUE)))
  expect_true(any(grepl("Maple Leafs", sheet_text, fixed = TRUE)))
  expect_true(any(grepl("STL", sheet_text, fixed = TRUE)))
  expect_true(any(grepl("PHI", sheet_text, fixed = TRUE)))
  expect_true(any(grepl("TOR", sheet_text, fixed = TRUE)))
  # skipped: field 'tables' not available on result type
  # skipped: field 'metadata.format.excel.sheet_count' not available on result type
  # skipped: field 'metadata.format.excel.sheet_names' not available on result type
})