fail("failed to read fixture: images/test_hello_world.png");
        }
        $config = \Kreuzberg\ExtractionConfig::from_json('{}');
        $result = Kreuzberg::extractBytes($contentBytes, "image/png", $config);
        $this->assertEquals("image/png", trim($result->mimeType));
        $this->assertGreaterThanOrEqual(1, strlen($result->getContent()));
        $found = false;
        if (str_contains($result->getContent(), "Hello")) {
            $found = true;
        }
        if (str_contains($result->getContent(), "World")) {
            $found = true;
        }
        if (str_contains($result->getContent(), "hello")) {
            $found = true;
        }
        if (str_contains($result->getContent(), "world")) {
            $found = true;
        }
        $this->assertTrue($found, 'expected to contain at least one of the specified values');
    }

    /**
     * Smoke test: DOCX with formatted text.
     *
     * Extracts docx/fake.docx with an empty JSON config, then checks the reported
     * MIME type, a content length of at least 20 bytes, and that at least one of
     * the expected words occurs in the content.
     */
    public function test_smoke_docx_basic(): void
    {
        $expectedMime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
        $options = \Kreuzberg\ExtractionConfig::from_json('{}');
        $extraction = Kreuzberg::extractFile("docx/fake.docx", $expectedMime, $options);

        $this->assertEquals($expectedMime, trim($extraction->mimeType));
        $this->assertGreaterThanOrEqual(20, strlen($extraction->getContent()));

        $matched = false;
        foreach (["Lorem", "ipsum", "document", "text"] as $needle) {
            // Deliberately no early exit: every candidate is probed in order.
            if (str_contains($extraction->getContent(), $needle)) {
                $matched = true;
            }
        }
        $this->assertTrue($matched, 'expected to contain at least one of the specified values');
    }

    /**
     * Smoke test: HTML table extraction.
     *
     * Extracts html/simple_table.html as text/html with an empty JSON config, then
     * checks the reported MIME type, a content length of at least 10 bytes, and
     * that at least one of the expected table strings occurs in the content.
     */
    public function test_smoke_html_basic(): void
    {
        $options = \Kreuzberg\ExtractionConfig::from_json('{}');
        $extraction = Kreuzberg::extractFile("html/simple_table.html", "text/html", $options);

        $this->assertEquals("text/html", trim($extraction->mimeType));
        $this->assertGreaterThanOrEqual(10, strlen($extraction->getContent()));

        $matched = false;
        foreach (["Sample Data Table", "Laptop", "Electronics", "Product"] as $needle) {
            // Deliberately no early exit: every candidate is probed in order.
            if (str_contains($extraction->getContent(), $needle)) {
                $matched = true;
            }
        }
        $this->assertTrue($matched, 'expected to contain at least one of the specified values');
    }

    /**
     * Smoke test: PNG image (without OCR, metadata only).
     *
     * Builds the config from a JSON document with "disableOcr" set to true, passes
     * null as the MIME type argument, and only asserts that the result reports
     * image/png; the content itself is not inspected.
     */
    public function test_smoke_image_png(): void
    {
        $options = \Kreuzberg\ExtractionConfig::from_json(json_encode(["disableOcr" => true]));
        $extraction = Kreuzberg::extractFile("images/sample.png", null, $options);

        $this->assertEquals("image/png", trim($extraction->mimeType));
    }

    /**
     * Smoke test: JSON file extraction.
     *
     * Extracts json/simple.json as application/json with an empty JSON config and
     * checks the reported MIME type and a content length of at least 5 bytes.
     */
    public function test_smoke_json_basic(): void
    {
        $options = \Kreuzberg\ExtractionConfig::from_json('{}');
        $extraction = Kreuzberg::extractFile("json/simple.json", "application/json", $options);

        $this->assertEquals("application/json", trim($extraction->mimeType));
        $this->assertGreaterThanOrEqual(5, strlen($extraction->getContent()));
    }

    /**
     * Smoke test: PDF with simple text extraction.
     *
     * Extracts pdf/fake_memo.pdf as application/pdf with an empty JSON config, then
     * checks the reported MIME type, a content length of at least 50 bytes, and
     * that at least one of the two expected phrases occurs in the content.
     */
    public function test_smoke_pdf_basic(): void
    {
        $options = \Kreuzberg\ExtractionConfig::from_json('{}');
        $extraction = Kreuzberg::extractFile("pdf/fake_memo.pdf", "application/pdf", $options);

        $this->assertEquals("application/pdf", trim($extraction->mimeType));
        $this->assertGreaterThanOrEqual(50, strlen($extraction->getContent()));

        $matched = false;
        foreach (["May 5, 2023", "To Whom it May Concern"] as $needle) {
            // Deliberately no early exit: every candidate is probed in order.
            if (str_contains($extraction->getContent(), $needle)) {
                $matched = true;
            }
        }
        $this->assertTrue($matched, 'expected to contain at least one of the specified values');
    }

    /**
     * Smoke test: Plain text file.
     *
     * Extracts text/report.txt as text/plain with an empty JSON config and checks
     * the reported MIME type and a content length of at least 5 bytes.
     */
    public function test_smoke_txt_basic(): void
    {
        $options = \Kreuzberg\ExtractionConfig::from_json('{}');
        $extraction = Kreuzberg::extractFile("text/report.txt", "text/plain", $options);

        $this->assertEquals("text/plain", trim($extraction->mimeType));
        $this->assertGreaterThanOrEqual(5, strlen($extraction->getContent()));
    }

    /**
     * Smoke test: XLSX with basic spreadsheet data including tables.
     *
     * Extracts xlsx/stanley_cups.xlsx with an empty JSON config, then checks the
     * reported MIME type, a content length of at least 100 bytes, and that every
     * expected header, team name and abbreviation occurs in the content.
     */
    public function test_smoke_xlsx_basic(): void
    {
        $expectedMime = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
        $options = \Kreuzberg\ExtractionConfig::from_json('{}');
        $extraction = Kreuzberg::extractFile("xlsx/stanley_cups.xlsx", $expectedMime, $options);

        $this->assertEquals($expectedMime, trim($extraction->mimeType));
        $this->assertGreaterThanOrEqual(100, strlen($extraction->getContent()));

        // Order matters: the first missing string is the one reported on failure.
        $requiredStrings = [
            "Team",
            "Location",
            "Stanley Cups",
            "Blues",
            "Flyers",
            "Maple Leafs",
            "STL",
            "PHI",
            "TOR",
        ];
        foreach ($requiredStrings as $needle) {
            $this->assertStringContainsString($needle, $extraction->getContent());
        }
        // skipped: field 'tables' not available on result type
        // skipped: field 'metadata.format.excel.sheet_count' not available on result type
        // skipped: field 'metadata.format.excel.sheet_names' not available on result type
    }
}