85 lines
3.1 KiB
PHP
85 lines
3.1 KiB
PHP
|
|
<?php
|
||
|
|
// This file is auto-generated by alef — DO NOT EDIT.
|
||
|
|
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||
|
|
// To regenerate: alef generate
|
||
|
|
// To verify freshness: alef verify --exit-code
|
||
|
|
// Issues & docs: https://github.com/kreuzberg-dev/alef
|
||
|
|
|
||
|
|
|
||
|
|
declare(strict_types=1);
|
||
|
|
|
||
|
|
namespace Kreuzberg\E2e;
|
||
|
|
|
||
|
|
use PHPUnit\Framework\TestCase;
|
||
|
|
use Kreuzberg\Kreuzberg;
|
||
|
|
use Kreuzberg\ExtractionConfig;
|
||
|
|
|
||
|
|
/**
 * End-to-end tests for the "format_specific" category.
 *
 * Each test feeds one fixture document to the Kreuzberg extraction API with a
 * configuration built from an empty JSON object ('{}') and an explicit MIME
 * type. Byte-based tests load their fixture with file_get_contents() using a
 * relative path, so they resolve against the process working directory.
 */
final class FormatSpecificTest extends TestCase
{
    /**
     * Standalone DOCX extraction using extract_bytes_sync.
     *
     * Passes when the extracted content is at least 20 bytes long.
     */
    public function test_format_docx_standalone(): void
    {
        $bytes = file_get_contents("docx/fake.docx");
        if ($bytes === false) {
            // A missing or unreadable fixture is reported as a test failure.
            self::fail("failed to read fixture: docx/fake.docx");
        }

        $extraction = Kreuzberg::extractBytesSync(
            $bytes,
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            ExtractionConfig::from_json('{}'),
        );

        // strlen() counts bytes, so the threshold is a byte length.
        self::assertGreaterThanOrEqual(20, strlen($extraction->getContent()));
    }

    /**
     * Standalone HWPX extraction using extract_bytes_sync.
     *
     * Passes when the extracted content is at least 20 bytes long and contains
     * the text "Hello from HWPX".
     */
    public function test_format_hwpx_standalone(): void
    {
        $bytes = file_get_contents("hwpx/simple.hwpx");
        if ($bytes === false) {
            // A missing or unreadable fixture is reported as a test failure.
            self::fail("failed to read fixture: hwpx/simple.hwpx");
        }

        $extraction = Kreuzberg::extractBytesSync(
            $bytes,
            "application/haansofthwpx",
            ExtractionConfig::from_json('{}'),
        );

        self::assertGreaterThanOrEqual(20, strlen($extraction->getContent()));
        self::assertStringContainsString("Hello from HWPX", $extraction->getContent());
    }

    /**
     * Standalone PDF text extraction using extract_bytes_sync.
     *
     * Passes when the extracted content is at least 50 bytes long and contains
     * at least one of the substrings "Mallori" or "May".
     */
    public function test_format_pdf_text(): void
    {
        $bytes = file_get_contents("pdf/fake_memo.pdf");
        if ($bytes === false) {
            // A missing or unreadable fixture is reported as a test failure.
            self::fail("failed to read fixture: pdf/fake_memo.pdf");
        }

        $extraction = Kreuzberg::extractBytesSync(
            $bytes,
            "application/pdf",
            ExtractionConfig::from_json('{}'),
        );

        self::assertGreaterThanOrEqual(50, strlen($extraction->getContent()));

        // Either marker is sufficient; both lookups are always evaluated.
        $mentionsMallori = str_contains($extraction->getContent(), "Mallori");
        $mentionsMay = str_contains($extraction->getContent(), "May");
        self::assertTrue(
            $mentionsMallori || $mentionsMay,
            'expected to contain at least one of the specified values',
        );
    }

    /**
     * PPTX presentation extraction using extract_file_sync.
     *
     * Makes no assertions: the test only requires that the extraction call
     * completes for the fixture path.
     */
    public function test_format_pptx(): void
    {
        // Tell PHPUnit that the absence of assertions is intentional.
        $this->expectNotToPerformAssertions();

        Kreuzberg::extractFileSync(
            "pptx/simple.pptx",
            "application/vnd.openxmlformats-officedocument.presentationml.presentation",
            ExtractionConfig::from_json('{}'),
        );
    }

    /**
     * XLSX spreadsheet extraction using extract_file_sync.
     *
     * Makes no assertions: the test only requires that the extraction call
     * completes for the fixture path.
     */
    public function test_format_xlsx(): void
    {
        // Tell PHPUnit that the absence of assertions is intentional.
        $this->expectNotToPerformAssertions();

        Kreuzberg::extractFileSync(
            "xlsx/stanley_cups.xlsx",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            ExtractionConfig::from_json('{}'),
        );
    }
}
|