assertEquals("application/pdf", trim($result->mimeType)); $this->assertGreaterThanOrEqual(10, strlen($result->getContent())); $found = false; if (str_contains($result->getContent(), "May 5, 2023")) { $found = true; } if (str_contains($result->getContent(), "Mallori")) { $found = true; } $this->assertTrue($found, 'expected to contain at least one of the specified values'); }

    /**
     * Scenario: async batch bytes extraction with per-file configs
     * (batch_extract_bytes with file_configs parameter).
     *
     * Extracts the pdf/fake_memo.pdf fixture through Kreuzberg::extractFile with
     * outputFormat set to "markdown", then checks the MIME type and that the
     * extracted content is at least 10 bytes long.
     *
     * NOTE(review): the body calls the single-file extractFile entry point, not a
     * batch or bytes API — confirm this is the intended coverage for the scenario.
     */
    public function test_api_batch_bytes_with_configs_async(): void
    {
        $config = \Kreuzberg\ExtractionConfig::from_json(json_encode(["outputFormat" => "markdown"]));
        $result = Kreuzberg::extractFile("pdf/fake_memo.pdf", null, $config);

        $this->assertEquals("application/pdf", trim($result->mimeType));
        $this->assertGreaterThanOrEqual(10, strlen($result->getContent()));
        // skipped: field 'metadata.output_format' not available on result type
    }

    /**
     * Scenario: async batch file extraction API (batch_extract_file).
     *
     * Extracts the pdf/fake_memo.pdf fixture through Kreuzberg::extractFile with an
     * empty JSON config and checks the MIME type, a minimum content length of 10
     * bytes, and that the content mentions "May 5, 2023" or "Mallori".
     *
     * NOTE(review): the body calls the single-file extractFile entry point, not a
     * batch API — confirm this is the intended coverage for the scenario.
     */
    public function test_api_batch_file_async(): void
    {
        $result = Kreuzberg::extractFile("pdf/fake_memo.pdf", null, \Kreuzberg\ExtractionConfig::from_json('{}'));

        $this->assertEquals("application/pdf", trim($result->mimeType));

        $content = $result->getContent();
        $this->assertGreaterThanOrEqual(10, strlen($content));
        // Either marker is sufficient; the test only requires one of them.
        $this->assertTrue(
            str_contains($content, "May 5, 2023") || str_contains($content, "Mallori"),
            'expected to contain at least one of the specified values'
        );
    }

    /**
     * Scenario: async batch file extraction with per-file configs
     * (batch_extract_files with file_configs parameter).
     *
     * Extracts the pdf/fake_memo.pdf fixture through Kreuzberg::extractFile with
     * outputFormat set to "markdown", then checks the MIME type and that the
     * extracted content is at least 10 bytes long.
     *
     * NOTE(review): the body calls the single-file extractFile entry point, not a
     * batch API — confirm this is the intended coverage for the scenario.
     */
    public function test_api_batch_file_with_configs_async(): void
    {
        $config = \Kreuzberg\ExtractionConfig::from_json(json_encode(["outputFormat" => "markdown"]));
        $result = Kreuzberg::extractFile("pdf/fake_memo.pdf", null, $config);

        $this->assertEquals("application/pdf", trim($result->mimeType));
        $this->assertGreaterThanOrEqual(10, strlen($result->getContent()));
        // skipped: field 'metadata.output_format' not available on result type
    }

    /**
     * Scenario: async bytes extraction API (extract_bytes).
     *
     * Extracts the pdf/fake_memo.pdf fixture through Kreuzberg::extractFile with an
     * empty JSON config and checks the MIME type, a minimum content length of 10
     * bytes, and that the content mentions "May 5, 2023" or "Mallori".
     *
     * NOTE(review): the body passes a file path to extractFile rather than raw
     * bytes to a bytes API — confirm this is the intended coverage.
     */
    public function test_api_extract_bytes_async(): void
    {
        $result = Kreuzberg::extractFile("pdf/fake_memo.pdf", null, \Kreuzberg\ExtractionConfig::from_json('{}'));

        $this->assertEquals("application/pdf", trim($result->mimeType));

        $content = $result->getContent();
        $this->assertGreaterThanOrEqual(10, strlen($content));
        // Either marker is sufficient; the test only requires one of them.
        $this->assertTrue(
            str_contains($content, "May 5, 2023") || str_contains($content, "Mallori"),
            'expected to contain at least one of the specified values'
        );
    }

    /**
     * Scenario: async file extraction API (extract_file).
     *
     * Extracts the pdf/fake_memo.pdf fixture through Kreuzberg::extractFile with an
     * empty JSON config and checks the MIME type, a minimum content length of 10
     * bytes, and that the content mentions "May 5, 2023" or "Mallori".
     */
    public function test_api_extract_file_async(): void
    {
        $result = Kreuzberg::extractFile("pdf/fake_memo.pdf", null, \Kreuzberg\ExtractionConfig::from_json('{}'));

        $this->assertEquals("application/pdf", trim($result->mimeType));

        $content = $result->getContent();
        $this->assertGreaterThanOrEqual(10, strlen($content));
        // Either marker is sufficient; the test only requires one of them.
        $this->assertTrue(
            str_contains($content, "May 5, 2023") || str_contains($content, "Mallori"),
            'expected to contain at least one of the specified values'
        );
    }

    /**
     * Scenario: markdown chunker prepends heading hierarchy to chunk content.
     *
     * Extracts markdown/extraction_test.md synchronously with a markdown chunker
     * config (maxChars 300, maxOverlap 50, prependHeadingContext enabled), checks
     * that the content is at least 10 bytes long, and that every entry of
     * $result->chunks has non-empty content.
     */
    public function test_config_chunking_prepend_heading_context(): void
    {
        $config = \Kreuzberg\ExtractionConfig::from_json(json_encode([
            "chunking" => [
                "chunkerType" => "markdown",
                "maxChars" => 300,
                "maxOverlap" => 50,
                "prependHeadingContext" => true,
            ],
        ]));
        $result = Kreuzberg::extractFileSync("markdown/extraction_test.md", null, $config);

        $this->assertGreaterThanOrEqual(10, strlen($result->getContent()));
        // skipped: field 'chunks' not available on result type
        // NOTE(review): with the `?? []` fallback this assertion passes vacuously
        // whenever `chunks` is unset or null — confirm that is acceptable.
        $this->assertTrue(array_reduce(
            $result->chunks ?? [],
            fn($carry, $c) => $carry && !empty($c->content),
            true
        ));
        // skipped: field 'chunks_have_heading_context' not available on result type
        // skipped: field 'first_chunk_starts_with_heading' not available on result type
    }

    /**
     * Scenario: document structure with DOCX heading-driven nesting.
     *
     * Extracts docx/fake.docx synchronously with includeDocumentStructure enabled
     * and checks only the reported MIME type; the structure assertions are skipped.
     */
    public function test_config_document_structure_with_headings(): void
    {
        $config = \Kreuzberg\ExtractionConfig::from_json(json_encode(["includeDocumentStructure" => true]));
        $result = Kreuzberg::extractFileSync("docx/fake.docx", null, $config);

        $this->assertEquals(
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            trim($result->mimeType)
        );
        // skipped: field 'document' not available on result type
        // skipped: field 'document.nodes' not available on result type
    }

    /**
     * Scenario: element-based result format with element type assertions on DOCX.
     *
     * Extracts docx/unit_test_headers.docx synchronously with resultFormat set to
     * "element_based" and checks that the MIME type contains the DOCX media type;
     * the element assertions are skipped.
     */
    public function test_config_element_types(): void
    {
        $config = \Kreuzberg\ExtractionConfig::from_json(json_encode(["resultFormat" => "element_based"]));
        $result = Kreuzberg::extractFileSync("docx/unit_test_headers.docx", null, $config);

        $this->assertTrue(
            str_contains(
                $result->mimeType,
                "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
            ),
            'expected to contain at least one of the specified values'
        );
        // skipped: field 'elements' not available on result type
    }

    /**
     * Scenario: the extraction_timeout_secs config field is accepted and does not
     * affect fast extractions.
     *
     * Extracts pdf/fake_memo.pdf synchronously with extractionTimeoutSecs set to
     * 300 and checks the MIME type and a minimum content length of 10 bytes.
     */
    public function test_config_extraction_timeout(): void
    {
        $config = \Kreuzberg\ExtractionConfig::from_json(json_encode(["extractionTimeoutSecs" => 300]));
        $result = Kreuzberg::extractFileSync("pdf/fake_memo.pdf", null, $config);

        $this->assertEquals("application/pdf", trim($result->mimeType));
        $this->assertGreaterThanOrEqual(10, strlen($result->getContent()));
    }

    /**
     * Scenario: keyword extraction via the YAKE algorithm.
     *
     * Extracts pdf/fake_memo.pdf synchronously with a keywords config (algorithm
     * "yake", maxKeywords 10) and checks the MIME type and a minimum content
     * length of 10 bytes; the keyword assertions themselves are skipped.
     */
    public function test_config_keywords(): void
    {
        $config = \Kreuzberg\ExtractionConfig::from_json(json_encode([
            "keywords" => ["algorithm" => "yake", "maxKeywords" => 10],
        ]));
        $result = Kreuzberg::extractFileSync("pdf/fake_memo.pdf", null, $config);

        $this->assertEquals("application/pdf", trim($result->mimeType));
        $this->assertGreaterThanOrEqual(10, strlen($result->getContent()));
        // skipped (2 assertions): field 'keywords' not available on PHP ExtractionResult
    }

    /**
     * Scenario: page extraction and page marker configuration.
     *
     * Extracts pdf/fake_memo.pdf synchronously with extractPages and
     * insertPageMarkers enabled, then checks the MIME type, a minimum content
     * length of 10 bytes, and that the content contains the text "PAGE".
     */
    public function test_config_pages(): void
    {
        $config = \Kreuzberg\ExtractionConfig::from_json(json_encode([
            "pages" => ["extractPages" => true, "insertPageMarkers" => true],
        ]));
        $result = Kreuzberg::extractFileSync("pdf/fake_memo.pdf", null, $config);

        $this->assertEquals("application/pdf", trim($result->mimeType));

        $content = $result->getContent();
        $this->assertGreaterThanOrEqual(10, strlen($content));
        $this->assertTrue(
            str_contains($content, "PAGE"),
            'expected to contain at least one of the specified values'
        );
    }

    /**
     * Scenario: quality scoring produces a score value in [0.0, 1.0].
     *
     * Extracts pdf/fake_memo.pdf synchronously with enableQualityProcessing
     * enabled and checks the MIME type and a minimum content length of 10 bytes;
     * the quality_score assertions are skipped.
     */
    public function test_config_quality_enabled(): void
    {
        $config = \Kreuzberg\ExtractionConfig::from_json(json_encode(["enableQualityProcessing" => true]));
        $result = Kreuzberg::extractFileSync("pdf/fake_memo.pdf", null, $config);

        $this->assertEquals("application/pdf", trim($result->mimeType));
        $this->assertGreaterThanOrEqual(10, strlen($result->getContent()));
        // skipped (3 assertions): field 'quality_score' not available on result type
    }

    /**
     * Scenario: archive extraction with custom security limits.
     *
     * Extracts archives/documents.zip synchronously with explicit securityLimits
     * and checks that the MIME type contains one of the two accepted ZIP media
     * types and that the content is at least 10 bytes long.
     */
    public function test_config_security_limits(): void
    {
        $config = \Kreuzberg\ExtractionConfig::from_json(json_encode([
            "securityLimits" => [
                "maxArchiveSize" => 104857600,
                "maxCompressionRatio" => 50,
                "maxFilesInArchive" => 100,
            ],
        ]));
        $result = Kreuzberg::extractFileSync("archives/documents.zip", null, $config);

        $mimeType = $result->mimeType;
        // Both spellings are accepted; neither string is a substring of the other.
        $this->assertTrue(
            str_contains($mimeType, "application/zip") || str_contains($mimeType, "application/x-zip-compressed"),
            'expected to contain at least one of the specified values'
        );
        $this->assertGreaterThanOrEqual(10, strlen($result->getContent()));
    }

    /**
     * Scenario: tree-sitter configuration round-trip.
     *
     * Extracts code/hello.py synchronously with a treeSitter config (groups,
     * languages and per-feature process flags) and checks the MIME type and a
     * minimum content length of 5 bytes.
     */
    public function test_config_tree_sitter(): void
    {
        $config = \Kreuzberg\ExtractionConfig::from_json(json_encode([
            "treeSitter" => [
                "groups" => ["web"],
                "languages" => ["python", "rust"],
                "process" => [
                    "comments" => false,
                    "diagnostics" => false,
                    "docstrings" => false,
                    "exports" => true,
                    "imports" => true,
                    "structure" => true,
                    "symbols" => false,
                ],
            ],
        ]));
        $result = Kreuzberg::extractFileSync("code/hello.py", null, $config);

        $this->assertEquals("text/x-source-code", trim($result->mimeType));
        $this->assertGreaterThanOrEqual(5, strlen($result->getContent()));
    }

    /**
     * Scenario: markdown output format via the bytes extraction API.
     *
     * Reads pdf/fake_memo.pdf into memory, fails the test if the fixture cannot
     * be read, then passes the bytes to Kreuzberg::extractBytesSync with the
     * "application/pdf" MIME type and outputFormat set to "markdown". Checks the
     * MIME type and a minimum content length of 10 bytes.
     */
    public function test_output_format_bytes_markdown(): void
    {
        $contentBytes = file_get_contents("pdf/fake_memo.pdf");
        if ($contentBytes === false) {
            // file_get_contents signals failure with false rather than throwing.
            $this->fail("failed to read fixture: pdf/fake_memo.pdf");
        }

        $config = \Kreuzberg\ExtractionConfig::from_json(json_encode(["outputFormat" => "markdown"]));
        $result = Kreuzberg::extractBytesSync($contentBytes, "application/pdf", $config);

        $this->assertEquals("application/pdf", trim($result->mimeType));
        $this->assertGreaterThanOrEqual(10, strlen($result->getContent()));
        // skipped: field 'metadata.output_format' not available on result type
    }

    /**
     * Scenario: Markdown output format.
     *
     * Extracts pdf/fake_memo.pdf synchronously with outputFormat set to
     * "markdown" and checks the MIME type and a minimum content length of 10
     * bytes; the output_format metadata assertion is skipped.
     */
    public function test_output_format_markdown(): void
    {
        $config = \Kreuzberg\ExtractionConfig::from_json(json_encode(["outputFormat" => "markdown"]));
        $result = Kreuzberg::extractFileSync("pdf/fake_memo.pdf", null, $config);

        $this->assertEquals("application/pdf", trim($result->mimeType));
        $this->assertGreaterThanOrEqual(10, strlen($result->getContent()));
        // skipped: field 'metadata.output_format' not available on result type
    }
}