Files
fil/e2e/php/tests/ContractTest.php
Henrik Jess Nielsen b4c07d3693
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s
Nomad changes
2026-06-01 23:40:55 +02:00

261 lines
12 KiB
PHP
Generated

<?php
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
declare(strict_types=1);
namespace Kreuzberg\E2e;
use PHPUnit\Framework\TestCase;
use Kreuzberg\Kreuzberg;
use Kreuzberg\ExtractionConfig;
/**
 * E2e tests for category: contract.
 *
 * Each test extracts a fixture through the Kreuzberg facade and checks the
 * reported MIME type and/or the extracted content.
 *
 * Fixture paths are relative. file_get_contents() resolves them against the
 * current working directory; the Kreuzberg calls presumably do the same
 * (TODO confirm against the e2e runner setup).
 *
 * Lines starting with "skipped:" record assertions from the shared fixture
 * that were not emitted because the named field is not exposed on the PHP
 * result type.
 */
final class ContractTest extends TestCase
{
    /** Relative path of the PDF memo fixture used by most tests. */
    private const FAKE_MEMO_PDF = 'pdf/fake_memo.pdf';

    /** MIME type expected for the PDF fixture. */
    private const PDF_MIME = 'application/pdf';

    /** MIME type expected for the DOCX fixtures. */
    private const DOCX_MIME = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document';

    /** Failure message shared by every "contains at least one of" check. */
    private const ANY_OF_MESSAGE = 'expected to contain at least one of the specified values';

    /**
     * Contract fixture: async batch bytes extraction (batch_extract_bytes).
     *
     * This PHP rendering calls Kreuzberg::extractFile() on the PDF memo with an
     * empty config; no batch or bytes entry point is invoked here.
     */
    public function test_api_batch_bytes_async(): void
    {
        self::assertDefaultMemoExtraction();
    }

    /**
     * Contract fixture: async batch bytes extraction with per-file configs
     * (batch_extract_bytes with file_configs parameter).
     *
     * This PHP rendering calls Kreuzberg::extractFile() on the PDF memo with
     * outputFormat set to "markdown".
     */
    public function test_api_batch_bytes_with_configs_async(): void
    {
        $config = self::configFrom(['outputFormat' => 'markdown']);
        $result = Kreuzberg::extractFile(self::FAKE_MEMO_PDF, null, $config);

        self::assertPdfWithContent($result);
        // skipped: field 'metadata.output_format' not available on result type
    }

    /**
     * Contract fixture: async batch file extraction (batch_extract_file).
     *
     * This PHP rendering calls Kreuzberg::extractFile() on the PDF memo with an
     * empty config; no batch entry point is invoked here.
     */
    public function test_api_batch_file_async(): void
    {
        self::assertDefaultMemoExtraction();
    }

    /**
     * Contract fixture: async batch file extraction with per-file configs
     * (batch_extract_files with file_configs parameter).
     *
     * This PHP rendering calls Kreuzberg::extractFile() on the PDF memo with
     * outputFormat set to "markdown".
     */
    public function test_api_batch_file_with_configs_async(): void
    {
        $config = self::configFrom(['outputFormat' => 'markdown']);
        $result = Kreuzberg::extractFile(self::FAKE_MEMO_PDF, null, $config);

        self::assertPdfWithContent($result);
        // skipped: field 'metadata.output_format' not available on result type
    }

    /**
     * Contract fixture: async bytes extraction (extract_bytes).
     *
     * This PHP rendering calls Kreuzberg::extractFile() on the PDF memo with an
     * empty config; no bytes entry point is invoked here.
     */
    public function test_api_extract_bytes_async(): void
    {
        self::assertDefaultMemoExtraction();
    }

    /**
     * Contract fixture: async file extraction (extract_file).
     *
     * Calls Kreuzberg::extractFile() on the PDF memo with an empty config.
     */
    public function test_api_extract_file_async(): void
    {
        self::assertDefaultMemoExtraction();
    }

    /**
     * Contract fixture: markdown chunker prepends heading hierarchy to chunk content.
     *
     * Only the minimum content length is asserted; every chunk-level check is
     * skipped (see the notes at the end of the body).
     */
    public function test_config_chunking_prepend_heading_context(): void
    {
        $config = self::configFrom([
            'chunking' => [
                'chunkerType' => 'markdown',
                'maxChars' => 300,
                'maxOverlap' => 50,
                'prependHeadingContext' => true,
            ],
        ]);
        $result = Kreuzberg::extractFileSync('markdown/extraction_test.md', null, $config);

        self::assertGreaterThanOrEqual(10, strlen($result->getContent()));
        // skipped: field 'chunks' not available on result type
        // skipped: "every chunk has non-empty content" assertion (reads $result->chunks)
        // skipped: field 'chunks_have_heading_context' not available on result type
        // skipped: field 'first_chunk_starts_with_heading' not available on result type
    }

    /**
     * Contract fixture: document structure with DOCX heading-driven nesting.
     *
     * Only the MIME type is asserted; the structure checks are skipped.
     */
    public function test_config_document_structure_with_headings(): void
    {
        $config = self::configFrom(['includeDocumentStructure' => true]);
        $result = Kreuzberg::extractFileSync('docx/fake.docx', null, $config);

        self::assertSame(self::DOCX_MIME, trim($result->mimeType));
        // skipped: field 'document' not available on result type
        // skipped: field 'document.nodes' not available on result type
    }

    /**
     * Contract fixture: element-based result format with element type assertions on DOCX.
     *
     * Only checks that the MIME type contains the DOCX type; the element
     * assertions are skipped.
     */
    public function test_config_element_types(): void
    {
        $config = self::configFrom(['resultFormat' => 'element_based']);
        $result = Kreuzberg::extractFileSync('docx/unit_test_headers.docx', null, $config);

        self::assertContainsAnyOf($result->mimeType, [self::DOCX_MIME]);
        // skipped: field 'elements' not available on result type
    }

    /**
     * Contract fixture: the extraction_timeout_secs config field is accepted and
     * does not affect fast extractions.
     */
    public function test_config_extraction_timeout(): void
    {
        $config = self::configFrom(['extractionTimeoutSecs' => 300]);
        $result = Kreuzberg::extractFileSync(self::FAKE_MEMO_PDF, null, $config);

        self::assertPdfWithContent($result);
    }

    /**
     * Contract fixture: keyword extraction via the YAKE algorithm.
     *
     * Only MIME type and content length are asserted; the keyword checks are skipped.
     */
    public function test_config_keywords(): void
    {
        $config = self::configFrom(['keywords' => ['algorithm' => 'yake', 'maxKeywords' => 10]]);
        $result = Kreuzberg::extractFileSync(self::FAKE_MEMO_PDF, null, $config);

        self::assertPdfWithContent($result);
        // skipped (2 assertions): field 'keywords' not available on PHP ExtractionResult
    }

    /**
     * Contract fixture: page extraction and page marker configuration.
     *
     * Passes when the extracted content contains the text "PAGE".
     */
    public function test_config_pages(): void
    {
        $config = self::configFrom(['pages' => ['extractPages' => true, 'insertPageMarkers' => true]]);
        $result = Kreuzberg::extractFileSync(self::FAKE_MEMO_PDF, null, $config);

        self::assertPdfWithContent($result);
        self::assertContainsAnyOf($result->getContent(), ['PAGE']);
    }

    /**
     * Contract fixture: quality scoring produces a score value in [0.0, 1.0].
     *
     * Only MIME type and content length are asserted; the score checks are skipped.
     */
    public function test_config_quality_enabled(): void
    {
        $config = self::configFrom(['enableQualityProcessing' => true]);
        $result = Kreuzberg::extractFileSync(self::FAKE_MEMO_PDF, null, $config);

        self::assertPdfWithContent($result);
        // skipped (3 assertions): field 'quality_score' not available on result type
    }

    /**
     * Contract fixture: archive extraction with custom security limits.
     *
     * Accepts either "application/zip" or "application/x-zip-compressed" as the
     * reported MIME type.
     */
    public function test_config_security_limits(): void
    {
        $config = self::configFrom([
            'securityLimits' => [
                'maxArchiveSize' => 104857600,
                'maxCompressionRatio' => 50,
                'maxFilesInArchive' => 100,
            ],
        ]);
        $result = Kreuzberg::extractFileSync('archives/documents.zip', null, $config);

        self::assertContainsAnyOf($result->mimeType, ['application/zip', 'application/x-zip-compressed']);
        self::assertGreaterThanOrEqual(10, strlen($result->getContent()));
    }

    /** Contract fixture: tree-sitter configuration round-trip. */
    public function test_config_tree_sitter(): void
    {
        $config = self::configFrom([
            'treeSitter' => [
                'groups' => ['web'],
                'languages' => ['python', 'rust'],
                'process' => [
                    'comments' => false,
                    'diagnostics' => false,
                    'docstrings' => false,
                    'exports' => true,
                    'imports' => true,
                    'structure' => true,
                    'symbols' => false,
                ],
            ],
        ]);
        $result = Kreuzberg::extractFileSync('code/hello.py', null, $config);

        self::assertSame('text/x-source-code', trim($result->mimeType));
        self::assertGreaterThanOrEqual(5, strlen($result->getContent()));
    }

    /**
     * Contract fixture: markdown output format via the bytes extraction API.
     *
     * Reads the PDF memo into memory and hands the bytes to
     * Kreuzberg::extractBytesSync() together with the PDF MIME type.
     */
    public function test_output_format_bytes_markdown(): void
    {
        $contentBytes = file_get_contents(self::FAKE_MEMO_PDF);
        if ($contentBytes === false) {
            // Fail with the fixture path instead of passing `false` on to the extractor.
            self::fail('failed to read fixture: pdf/fake_memo.pdf');
        }

        $config = self::configFrom(['outputFormat' => 'markdown']);
        $result = Kreuzberg::extractBytesSync($contentBytes, self::PDF_MIME, $config);

        self::assertPdfWithContent($result);
        // skipped: field 'metadata.output_format' not available on result type
    }

    /** Contract fixture: Markdown output format. */
    public function test_output_format_markdown(): void
    {
        $config = self::configFrom(['outputFormat' => 'markdown']);
        $result = Kreuzberg::extractFileSync(self::FAKE_MEMO_PDF, null, $config);

        self::assertPdfWithContent($result);
        // skipped: field 'metadata.output_format' not available on result type
    }

    /**
     * Builds an extraction config from a PHP array by JSON-encoding it.
     *
     * Not used for the empty config: json_encode([]) yields "[]", whereas the
     * default-config tests pass the literal "{}".
     *
     * @param array<string, mixed> $options config keys exactly as they should appear in the JSON
     *
     * @return mixed whatever ExtractionConfig::from_json() returns (presumably an
     *               ExtractionConfig; its signature is not visible from this file)
     *
     * @throws \JsonException when $options cannot be encoded
     */
    private static function configFrom(array $options)
    {
        // JSON_THROW_ON_ERROR: an encoding failure raises here instead of
        // handing `false` to from_json() under strict_types.
        return ExtractionConfig::from_json(json_encode($options, JSON_THROW_ON_ERROR));
    }

    /**
     * Extracts the PDF memo via Kreuzberg::extractFile() with an empty config and
     * checks MIME type, minimum length and the presence of a known memo string.
     */
    private static function assertDefaultMemoExtraction(): void
    {
        $result = Kreuzberg::extractFile(self::FAKE_MEMO_PDF, null, ExtractionConfig::from_json('{}'));

        self::assertPdfWithContent($result);
        self::assertContainsAnyOf($result->getContent(), ['May 5, 2023', 'Mallori']);
    }

    /**
     * Asserts the result reports the PDF MIME type (after trimming) and carries
     * at least 10 bytes of content.
     *
     * @param object $result value returned by a Kreuzberg extract call; only its
     *                       mimeType property and getContent() method are used
     */
    private static function assertPdfWithContent(object $result): void
    {
        self::assertSame(self::PDF_MIME, trim($result->mimeType));
        self::assertGreaterThanOrEqual(10, strlen($result->getContent()));
    }

    /**
     * Asserts that $haystack contains at least one of $needles as a substring.
     *
     * @param string       $haystack text to search
     * @param list<string> $needles  acceptable substrings; an empty list always fails
     */
    private static function assertContainsAnyOf(string $haystack, array $needles): void
    {
        $found = false;
        foreach ($needles as $needle) {
            if (str_contains($haystack, $needle)) {
                $found = true;
                break;
            }
        }

        self::assertTrue($found, self::ANY_OF_MESSAGE);
    }
}