Files
fil/docs/snippets/php/extraction/docx_extraction.php

119 lines
3.3 KiB
PHP
Raw Permalink Normal View History

2026-06-01 23:40:55 +02:00
```php title="docx_extraction.php"
<?php
declare(strict_types=1);
/**
* DOCX (Word) Document Extraction
*
* Extract text, tables, and metadata from Microsoft Word documents.
*/
require_once __DIR__ . '/vendor/autoload.php';
use Kreuzberg\Kreuzberg;
use Kreuzberg\Config\ExtractionConfig;
use function Kreuzberg\extract_file;
// Extract the document through the procedural helper, passing no configuration.
$result = extract_file('document.docx');

// Shared 60-character rule printed under each section heading.
$rule = str_repeat('=', 60);

echo "Word Document Extraction:\n", $rule, "\n";
echo "Content:\n", $result->content, "\n\n";

echo "Document Metadata:\n", $rule, "\n";

// Label => printable value. Scalar fields and the author list fall back to
// 'N/A' when unset; an unset keyword list is joined as an empty string.
$fields = [
    'Title' => $result->metadata->title ?? 'N/A',
    'Authors' => isset($result->metadata->authors)
        ? implode(', ', $result->metadata->authors)
        : 'N/A',
    'Created' => $result->metadata->createdAt ?? 'N/A',
    'Modified' => $result->metadata->modifiedAt ?? 'N/A',
    'Subject' => $result->metadata->subject ?? 'N/A',
    'Keywords' => implode(', ', $result->metadata->keywords ?? []),
];

foreach ($fields as $label => $value) {
    echo $label, ': ', $value, "\n";
}

// Blank line separating this section from whatever is printed next.
echo "\n";
// Extract a second document through the object API, with table extraction
// and formatting preservation switched on.
$result = (new Kreuzberg(new ExtractionConfig(
    extractTables: true,
    preserveFormatting: true,
)))->extractFile('report.docx');

// Dashed rule reused under each table title and under the first row.
$divider = str_repeat('-', 60) . "\n";

foreach ($result->tables as $index => $table) {
    // Table numbers are shown 1-based.
    echo 'Table ', $index + 1, ":\n", $divider;

    foreach ($table->cells as $rowIndex => $row) {
        echo implode(' | ', $row), "\n";

        // Only the row with key 0 is followed by a rule.
        if ($rowIndex !== 0) {
            continue;
        }

        echo $divider;
    }

    echo "\n";
}
// Output variants to produce. The key is the label used in the output file
// name; the value is passed to ExtractionConfig as outputFormat (null for the
// 'plain' variant).
$conversions = [
    'plain' => null,
    'markdown' => 'markdown',
];

foreach ($conversions as $name => $format) {
    // Formatting is preserved only when an explicit output format is requested.
    $config = new ExtractionConfig(
        outputFormat: $format,
        preserveFormatting: $format !== null,
    );
    $kreuzberg = new Kreuzberg($config);
    $result = $kreuzberg->extractFile('document.docx');

    $outputFile = "output_$name.txt";

    // file_put_contents() returns false when the write fails (for example an
    // unwritable directory). Report that instead of claiming success, then
    // carry on with the remaining formats.
    if (file_put_contents($outputFile, $result->content) === false) {
        echo "Failed to save $name format to: $outputFile\n";
        continue;
    }

    echo "Saved $name format to: $outputFile\n";
}
use function Kreuzberg\batch_extract_files;
// Collect the .docx files matched by a relative pattern. glob() returns false
// on error, which the empty() check below treats the same as "no matches".
$docxFiles = glob('*.docx');

if (!empty($docxFiles)) {
    echo "\nBatch processing " . count($docxFiles) . " DOCX files...\n";

    $results = batch_extract_files($docxFiles);

    foreach ($results as $index => $result) {
        // NOTE(review): assumes results are keyed in the same order as
        // $docxFiles; confirm against batch_extract_files().
        $filename = basename($docxFiles[$index]);

        $authors = isset($result->metadata->authors)
            ? implode(', ', $result->metadata->authors)
            : 'Unknown';

        echo "\n$filename:\n";
        // mb_strlen() counts characters; strlen() would report bytes and
        // overstate the length of any text containing multibyte characters.
        echo " Characters: " . mb_strlen($result->content) . "\n";
        echo " Tables: " . count($result->tables) . "\n";
        echo " Authors: " . $authors . "\n";
    }
}
// Read the creator and producer metadata of a separate document.
$result = extract_file('reviewed_document.docx');

// Both fields are optional: an unset or empty value prints nothing. The
// "Document Information" heading is emitted together with the creator line.
$createdBy = $result->metadata->createdBy ?? null;
if ($createdBy) {
    echo "\nDocument Information:\n", 'Created by: ', $createdBy, "\n";
}

$producer = $result->metadata->producer ?? null;
if ($producer) {
    echo 'Producer: ', $producer, "\n";
}
// Compute simple text statistics over the extracted content.
$result = extract_file('document.docx');
$content = $result->content;

// Strip outer whitespace so leading or trailing blank lines are not counted
// as extra lines or paragraphs.
$trimmed = trim($content);
$hasText = $trimmed !== '';

$stats = [
    'characters' => mb_strlen($content),
    // A word is a letter followed by letters, combining marks, apostrophes or
    // hyphens. str_word_count() is byte/locale based and would split words at
    // every non-ASCII letter. NOTE(review): with the /u modifier the match
    // fails (counted as 0) if the content is not valid UTF-8.
    'words' => preg_match_all('/\p{L}[\p{L}\p{M}\'-]*/u', $content) ?: 0,
    // N newline separators delimit N + 1 lines; empty text has none.
    'lines' => $hasText ? substr_count($trimmed, "\n") + 1 : 0,
    // Paragraphs are separated by one or more blank lines. Any run of blank
    // lines counts as one separator.
    'paragraphs' => $hasText ? (preg_match_all('/\n\s*\n/', $trimmed) ?: 0) + 1 : 0,
    // Each run of terminal punctuation counts once. preg_match_all() returns
    // false on a PCRE error, which number_format() would reject under
    // strict_types, so fall back to 0.
    'sentences' => preg_match_all('/[.!?]+/', $content) ?: 0,
];

echo "\nDocument Statistics:\n";
echo str_repeat('=', 60) . "\n";

foreach ($stats as $metric => $value) {
    echo ucfirst($metric) . ": " . number_format($value) . "\n";
}
```