157 lines
6.6 KiB
PHP
157 lines
6.6 KiB
PHP
|
|
<?php
|
||
|
|
// This file is auto-generated by alef — DO NOT EDIT.
|
||
|
|
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||
|
|
// To regenerate: alef generate
|
||
|
|
// To verify freshness: alef verify --exit-code
|
||
|
|
// Issues & docs: https://github.com/kreuzberg-dev/alef
|
||
|
|
|
||
|
|
|
||
|
|
declare(strict_types=1);
|
||
|
|
|
||
|
|
namespace Kreuzberg\E2e;
|
||
|
|
|
||
|
|
use PHPUnit\Framework\TestCase;
|
||
|
|
use Kreuzberg\Kreuzberg;
|
||
|
|
use Kreuzberg\ExtractionConfig;
|
||
|
|
|
||
|
|
/**
 * E2e tests for category: smoke.
 *
 * Each test feeds one fixture to Kreuzberg with an ExtractionConfig built from JSON and then
 * checks the reported MIME type and, where applicable, a minimum content length (in bytes)
 * and the presence of expected substrings in the extracted content.
 *
 * NOTE(review): fixture paths are relative; presumably the test bootstrap sets the working
 * directory to the fixture root — confirm against the suite's bootstrap.
 */
final class SmokeTest extends TestCase
{
    /**
     * Asserts that the haystack contains at least one of the given needles.
     *
     * Matching is case-sensitive (str_contains); the scan stops at the first hit.
     *
     * @param list<string> $needles  Candidate substrings; one match is enough.
     * @param string       $haystack Text to search.
     */
    private static function assertContainsAnyOf(array $needles, string $haystack): void
    {
        $found = false;
        foreach ($needles as $needle) {
            if (str_contains($haystack, $needle)) {
                $found = true;
                break;
            }
        }

        self::assertTrue($found, 'expected to contain at least one of the specified values');
    }

    /** OCR: PNG image extraction with OCR enabled. In WASM this exercises the Uint8Array bridge parameter and Promise await in the generated OcrBackend bridge. */
    public function test_ocr_image_png(): void
    {
        $fixture = 'images/test_hello_world.png';
        $contentBytes = file_get_contents($fixture);
        if ($contentBytes === false) {
            self::fail("failed to read fixture: {$fixture}");
        }

        $config = ExtractionConfig::from_json('{}');
        $result = Kreuzberg::extractBytes($contentBytes, 'image/png', $config);

        self::assertSame('image/png', trim($result->mimeType));

        // Read the content once and reuse it for every assertion below.
        $content = $result->getContent();
        self::assertGreaterThanOrEqual(1, strlen($content));
        self::assertContainsAnyOf(['Hello', 'World', 'hello', 'world'], $content);
    }

    /** Smoke test: DOCX with formatted text */
    public function test_smoke_docx_basic(): void
    {
        $mimeType = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document';

        $config = ExtractionConfig::from_json('{}');
        $result = Kreuzberg::extractFile('docx/fake.docx', $mimeType, $config);

        self::assertSame($mimeType, trim($result->mimeType));

        $content = $result->getContent();
        self::assertGreaterThanOrEqual(20, strlen($content));
        self::assertContainsAnyOf(['Lorem', 'ipsum', 'document', 'text'], $content);
    }

    /** Smoke test: HTML table extraction */
    public function test_smoke_html_basic(): void
    {
        $config = ExtractionConfig::from_json('{}');
        $result = Kreuzberg::extractFile('html/simple_table.html', 'text/html', $config);

        self::assertSame('text/html', trim($result->mimeType));

        $content = $result->getContent();
        self::assertGreaterThanOrEqual(10, strlen($content));
        self::assertContainsAnyOf(['Sample Data Table', 'Laptop', 'Electronics', 'Product'], $content);
    }

    /** Smoke test: PNG image (without OCR, metadata only) */
    public function test_smoke_image_png(): void
    {
        // JSON_THROW_ON_ERROR: fail loudly instead of handing `false` to from_json().
        $config = ExtractionConfig::from_json(json_encode(['disableOcr' => true], JSON_THROW_ON_ERROR));

        // No MIME type hint is passed (null); the test only checks the type reported back.
        $result = Kreuzberg::extractFile('images/sample.png', null, $config);

        self::assertSame('image/png', trim($result->mimeType));
    }

    /** Smoke test: JSON file extraction */
    public function test_smoke_json_basic(): void
    {
        $config = ExtractionConfig::from_json('{}');
        $result = Kreuzberg::extractFile('json/simple.json', 'application/json', $config);

        self::assertSame('application/json', trim($result->mimeType));
        self::assertGreaterThanOrEqual(5, strlen($result->getContent()));
    }

    /** Smoke test: PDF with simple text extraction */
    public function test_smoke_pdf_basic(): void
    {
        $config = ExtractionConfig::from_json('{}');
        $result = Kreuzberg::extractFile('pdf/fake_memo.pdf', 'application/pdf', $config);

        self::assertSame('application/pdf', trim($result->mimeType));

        $content = $result->getContent();
        self::assertGreaterThanOrEqual(50, strlen($content));
        self::assertContainsAnyOf(['May 5, 2023', 'To Whom it May Concern'], $content);
    }

    /** Smoke test: Plain text file */
    public function test_smoke_txt_basic(): void
    {
        $config = ExtractionConfig::from_json('{}');
        $result = Kreuzberg::extractFile('text/report.txt', 'text/plain', $config);

        self::assertSame('text/plain', trim($result->mimeType));
        self::assertGreaterThanOrEqual(5, strlen($result->getContent()));
    }

    /** Smoke test: XLSX with basic spreadsheet data including tables */
    public function test_smoke_xlsx_basic(): void
    {
        $mimeType = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet';

        $config = ExtractionConfig::from_json('{}');
        $result = Kreuzberg::extractFile('xlsx/stanley_cups.xlsx', $mimeType, $config);

        self::assertSame($mimeType, trim($result->mimeType));

        $content = $result->getContent();
        self::assertGreaterThanOrEqual(100, strlen($content));

        // Unlike the "any of" checks in the other tests, every one of these must be present.
        $requiredSubstrings = [
            'Team',
            'Location',
            'Stanley Cups',
            'Blues',
            'Flyers',
            'Maple Leafs',
            'STL',
            'PHI',
            'TOR',
        ];
        foreach ($requiredSubstrings as $expected) {
            self::assertStringContainsString($expected, $content);
        }

        // skipped: field 'tables' not available on result type
        // skipped: field 'metadata.format.excel.sheet_count' not available on result type
        // skipped: field 'metadata.format.excel.sheet_names' not available on result type
    }
}
|