Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/tools/benchmark-harness/scripts/TikaExtract.java
+++ b/tools/benchmark-harness/scripts/TikaExtract.java
@@ -0,0 +1,394 @@
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.PrintStream;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Locale;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.ocr.TesseractOCRConfig;
+import org.apache.tika.sax.BodyContentHandler;
+
+/**
+ * Standalone benchmark driver that extracts text from files with Apache Tika and reports
+ * each result as a JSON object on standard output.
+ *
+ * <p>Three modes are supported, selected by the first positional argument:
+ * <ul>
+ *   <li>{@code sync} - extract one file and print a single JSON object;</li>
+ *   <li>{@code batch} - extract every listed file and print a JSON array of the successes;</li>
+ *   <li>{@code server} - print {@code READY}, then answer one request per stdin line with one
+ *       JSON object per stdout line.</li>
+ * </ul>
+ *
+ * <p>All JSON is written as UTF-8 and all numbers are formatted with {@link Locale#ROOT}, so the
+ * output does not depend on the platform charset or default locale. Diagnostics go to stderr.
+ */
+public final class TikaExtract {
+    private static final double NANOS_IN_MILLISECOND = 1_000_000.0;
+    /** Length of the JSON key {@code "path"} including surrounding quotes. */
+    private static final int PATH_KEY_LENGTH = 6;
+    private static final char LAST_CONTROL_CHAR = 0x1F;
+    /** Number of hex digits that follow {@code \}{@code u} in a JSON escape. */
+    private static final int UNICODE_ESCAPE_DIGITS = 4;
+    private static final int HEX_RADIX = 16;
+    /** MIME type reported when Tika does not set a content type. */
+    private static final String DEFAULT_MIME_TYPE = "application/octet-stream";
+    /** Tesseract language used whenever OCR is enabled. */
+    private static final String OCR_LANGUAGE = "eng";
+    /**
+     * Standard output wrapped so that text is always encoded as UTF-8. {@code System.out} itself
+     * encodes with a platform-dependent charset, which would corrupt non-ASCII JSON content.
+     */
+    private static final PrintStream STDOUT = new PrintStream(System.out, true, StandardCharsets.UTF_8);
+
+    private TikaExtract() {
+    }
+
+    /**
+     * Command-line entry point.
+     *
+     * <p>Usage: {@code TikaExtract [--ocr|--no-ocr] <mode> <file1> [file2] ...}. The OCR flags may
+     * appear anywhere; when both are given the last one wins. Exits with status 1 on usage
+     * errors or when processing throws.
+     *
+     * @param args raw command-line arguments
+     */
+    public static void main(String[] args) {
+        boolean ocrEnabled = false;
+        List<String> positionalArgs = new ArrayList<>();
+
+        for (String arg : args) {
+            if ("--ocr".equals(arg)) {
+                ocrEnabled = true;
+            } else if ("--no-ocr".equals(arg)) {
+                ocrEnabled = false;
+            } else {
+                positionalArgs.add(arg);
+            }
+        }
+
+        if (positionalArgs.isEmpty()) {
+            System.err.println("Usage: TikaExtract [--ocr|--no-ocr] <mode> <file1> [file2] ...");
+            System.err.println("Modes: sync, batch, server");
+            System.exit(1);
+        }
+
+        String mode = positionalArgs.get(0);
+        if (!"sync".equals(mode) && !"batch".equals(mode) && !"server".equals(mode)) {
+            System.err.printf("Unsupported mode '%s'%n", mode);
+            System.exit(1);
+        }
+
+        // Enable debug logging if TIKA_BENCHMARK_DEBUG is set
+        boolean debug = "true".equalsIgnoreCase(System.getenv("TIKA_BENCHMARK_DEBUG"));
+
+        if (debug) {
+            debugLog("java.version", System.getProperty("java.version"));
+            debugLog("os.name", System.getProperty("os.name"));
+            debugLog("os.arch", System.getProperty("os.arch"));
+            debugLog("Mode", mode);
+            debugLog("OCR enabled", String.valueOf(ocrEnabled));
+            debugLog("Files to process", String.valueOf(positionalArgs.size() - 1));
+        }
+
+        try {
+            if ("sync".equals(mode)) {
+                // Only the first file is used; additional positional arguments are ignored.
+                if (positionalArgs.size() < 2) {
+                    System.err.println("Sync mode requires exactly one file");
+                    System.exit(1);
+                }
+                processSyncMode(positionalArgs.get(1), ocrEnabled, debug);
+            } else if ("batch".equals(mode)) {
+                processBatchMode(positionalArgs, ocrEnabled, debug);
+            } else {
+                processServerMode(ocrEnabled, debug);
+            }
+        } catch (Exception e) {
+            if (debug) {
+                debugLog("Processing failed with exception", e.getClass().getName());
+            }
+            e.printStackTrace(System.err);
+            System.exit(1);
+        }
+    }
+
+    /**
+     * Extracts a single file and prints its JSON result (without a trailing newline).
+     *
+     * @param filePath   path of the file to extract
+     * @param ocrEnabled whether OCR is enabled for this run
+     * @param debug      whether to write debug diagnostics to stderr
+     * @throws Exception if extraction fails; the exception is rethrown to the caller
+     */
+    private static void processSyncMode(String filePath, boolean ocrEnabled, boolean debug) throws Exception {
+        if (debug) {
+            debugLog("Input file", filePath);
+        }
+
+        Path path = Path.of(filePath);
+        ExtractionData data;
+        long start = System.nanoTime();
+
+        try {
+            if (debug) {
+                debugLog("Starting extraction", "");
+            }
+            data = extractFile(path.toFile(), ocrEnabled, debug);
+            if (debug) {
+                debugLog("Extraction completed", "");
+            }
+        } catch (Exception e) {
+            if (debug) {
+                debugLog("Extraction failed", e.getClass().getName());
+                e.printStackTrace(System.err);
+            }
+            throw e;
+        }
+
+        double elapsedMs = (System.nanoTime() - start) / NANOS_IN_MILLISECOND;
+        STDOUT.print(toJson(data, elapsedMs, ocrEnabled));
+        STDOUT.flush();
+    }
+
+    /**
+     * Extracts every file after the mode argument and prints a JSON array of the results.
+     *
+     * <p>Files that fail are reported on stderr and omitted from the array. Each entry carries
+     * {@code _batch_total_ms}, the time elapsed since the batch started when that entry was
+     * produced. If no file succeeds, the process exits with status 1 and prints no JSON.
+     *
+     * @param positionalArgs positional arguments; index 0 is the mode, the rest are file paths
+     * @param ocrEnabled     whether OCR is enabled for this run
+     * @param debug          whether to write debug diagnostics to stderr
+     */
+    private static void processBatchMode(
+            List<String> positionalArgs, boolean ocrEnabled, boolean debug) throws Exception {
+        List<String> filePaths = positionalArgs.subList(1, positionalArgs.size());
+
+        long batchStart = System.nanoTime();
+        StringBuilder jsonArray = new StringBuilder();
+        jsonArray.append('[');
+
+        boolean first = true;
+        for (String filePath : filePaths) {
+            if (debug) {
+                debugLog("Processing file", filePath);
+            }
+
+            try {
+                Path path = Path.of(filePath);
+                long start = System.nanoTime();
+                ExtractionData data = extractFile(path.toFile(), ocrEnabled, debug);
+                double elapsedMs = (System.nanoTime() - start) / NANOS_IN_MILLISECOND;
+
+                if (!first) {
+                    jsonArray.append(',');
+                }
+                first = false;
+
+                double batchTotalMs = (System.nanoTime() - batchStart) / NANOS_IN_MILLISECOND;
+                jsonArray.append(toJsonWithBatch(data, elapsedMs, batchTotalMs, ocrEnabled));
+
+                if (debug) {
+                    debugLog("File processed", filePath);
+                }
+            } catch (Exception e) {
+                // Best effort: a failing file must not abort the rest of the batch.
+                if (debug) {
+                    debugLog("Failed to process file", filePath);
+                    debugLog("Exception", e.getClass().getName());
+                    e.printStackTrace(System.err);
+                } else {
+                    System.err.printf("Error processing %s: %s%n", filePath, e.getMessage());
+                }
+            }
+        }
+
+        jsonArray.append(']');
+
+        if (first) {
+            System.err.println("No files were successfully processed");
+            System.exit(1);
+            return;
+        }
+
+        STDOUT.print(jsonArray.toString());
+        STDOUT.flush();
+    }
+
+    /**
+     * Runs the line-oriented server loop until stdin reaches end of file.
+     *
+     * <p>Prints {@code READY} once the parser is constructed. Each non-blank input line is either
+     * a plain file path or a JSON object containing a {@code "path"} field. Every request is
+     * answered with exactly one line: the extraction result, or an {@code {"error": ...}} object
+     * if extraction throws.
+     *
+     * @param ocrEnabled whether OCR is enabled for this run
+     * @param debug      whether to write debug diagnostics to stderr
+     */
+    private static void processServerMode(boolean ocrEnabled, boolean debug) throws Exception {
+        // Pre-create shared parser and OCR config to avoid per-file construction overhead.
+        // AutoDetectParser is thread-safe and reusable. Only BodyContentHandler and Metadata
+        // need to be recreated per extraction since they accumulate state.
+        AutoDetectParser sharedParser = new AutoDetectParser();
+        TesseractOCRConfig sharedOcrConfig = buildOcrConfig(ocrEnabled);
+
+        // Signal readiness after JVM + Tika parser initialization
+        STDOUT.println("READY");
+        STDOUT.flush();
+
+        // Decode requests as UTF-8 explicitly; the platform default charset would mangle
+        // non-ASCII file names on systems whose locale is not UTF-8.
+        BufferedReader reader = new BufferedReader(new InputStreamReader(System.in, StandardCharsets.UTF_8));
+        String line;
+        while ((line = reader.readLine()) != null) {
+            String filePath = line.trim();
+            if (filePath.isEmpty()) {
+                continue;
+            }
+            // Parse JSON request if the harness sends {"path":"...", "force_ocr": ...}
+            if (filePath.startsWith("{")) {
+                filePath = parseJsonPath(filePath);
+            }
+            try {
+                Path path = Path.of(filePath);
+                long start = System.nanoTime();
+                ExtractionData data = extractFileWithParser(path.toFile(), sharedParser, sharedOcrConfig, debug);
+                double elapsedMs = (System.nanoTime() - start) / NANOS_IN_MILLISECOND;
+                STDOUT.println(toJson(data, elapsedMs, ocrEnabled));
+                STDOUT.flush();
+            } catch (Exception e) {
+                // Never emit "error":null - fall back to the exception type when there is no message.
+                String message = e.getMessage();
+                if (message == null || message.isEmpty()) {
+                    message = e.getClass().getName();
+                }
+                STDOUT.println("{\"error\":" + quote(message)
+                        + ",\"_extraction_time_ms\":0,\"_ocr_used\":false}");
+                STDOUT.flush();
+            }
+        }
+    }
+
+    /**
+     * Builds the Tesseract configuration for a run: OCR is skipped entirely when disabled,
+     * otherwise the OCR language is set.
+     *
+     * @param ocrEnabled whether OCR is enabled
+     * @return a new configuration object
+     */
+    private static TesseractOCRConfig buildOcrConfig(boolean ocrEnabled) {
+        TesseractOCRConfig ocrConfig = new TesseractOCRConfig();
+        if (ocrEnabled) {
+            ocrConfig.setLanguage(OCR_LANGUAGE);
+        } else {
+            ocrConfig.setSkipOcr(true);
+        }
+        return ocrConfig;
+    }
+
+    /**
+     * Extracts text and MIME type from a file using the supplied parser and OCR configuration.
+     *
+     * @param file      file to parse
+     * @param parser    parser to use; may be shared between calls
+     * @param ocrConfig OCR configuration placed into the parse context
+     * @param debug     whether to write debug diagnostics to stderr
+     * @return extracted content plus the detected MIME type ({@code application/octet-stream}
+     *         when the parser reports none)
+     * @throws IllegalArgumentException if the file does not exist
+     * @throws Exception                if reading or parsing fails
+     */
+    private static ExtractionData extractFileWithParser(
+            File file, AutoDetectParser parser, TesseractOCRConfig ocrConfig, boolean debug) throws Exception {
+        if (!file.exists()) {
+            throw new IllegalArgumentException("File does not exist: " + file.getAbsolutePath());
+        }
+
+        // -1 disables the handler's write limit so large documents are not truncated.
+        BodyContentHandler handler = new BodyContentHandler(-1);
+        Metadata metadata = new Metadata();
+        ParseContext context = new ParseContext();
+        context.set(TesseractOCRConfig.class, ocrConfig);
+
+        try (InputStream stream = new FileInputStream(file)) {
+            parser.parse(stream, handler, metadata, context);
+        }
+
+        String mimeType = metadata.get(Metadata.CONTENT_TYPE);
+        if (mimeType == null) {
+            mimeType = DEFAULT_MIME_TYPE;
+        }
+        if (debug) {
+            debugLog("Detected MIME type", mimeType);
+        }
+
+        return new ExtractionData(handler.toString(), mimeType);
+    }
+
+    /**
+     * Extracts a file with a freshly constructed parser and OCR configuration, so the cost of
+     * building them is part of whatever interval the caller is timing.
+     *
+     * @param file       file to parse
+     * @param ocrEnabled whether OCR is enabled
+     * @param debug      whether to write debug diagnostics to stderr
+     * @return extracted content plus the detected MIME type
+     * @throws IllegalArgumentException if the file does not exist
+     * @throws Exception                if reading or parsing fails
+     */
+    private static ExtractionData extractFile(File file, boolean ocrEnabled, boolean debug) throws Exception {
+        return extractFileWithParser(file, new AutoDetectParser(), buildOcrConfig(ocrEnabled), debug);
+    }
+
+    /**
+     * Determine if OCR was actually used based on MIME type and OCR config.
+     * OCR is used by Tika when enabled and the file is an image type.
+     */
+    private static boolean determineOcrUsed(String mimeType, boolean ocrEnabled) {
+        if (!ocrEnabled) {
+            return false;
+        }
+        return mimeType != null && mimeType.startsWith("image/");
+    }
+
+    /**
+     * Serializes one extraction result as a JSON object with the keys {@code content},
+     * {@code metadata.mimeType}, {@code _extraction_time_ms} and {@code _ocr_used}.
+     */
+    private static String toJson(ExtractionData data, double elapsedMs, boolean ocrEnabled) {
+        return buildJson(data, elapsedMs, null, ocrEnabled);
+    }
+
+    /**
+     * Same as {@link #toJson} but additionally emits {@code _batch_total_ms} between
+     * {@code _extraction_time_ms} and {@code _ocr_used}.
+     */
+    private static String toJsonWithBatch(
+            ExtractionData data, double elapsedMs, double batchTotalMs, boolean ocrEnabled) {
+        return buildJson(data, elapsedMs, batchTotalMs, ocrEnabled);
+    }
+
+    /**
+     * Shared JSON writer behind {@link #toJson} and {@link #toJsonWithBatch}.
+     *
+     * @param batchTotalMs batch time to report, or {@code null} to omit the field
+     */
+    private static String buildJson(
+            ExtractionData data, double elapsedMs, Double batchTotalMs, boolean ocrEnabled) {
+        StringBuilder builder = new StringBuilder();
+        builder.append("{\"content\":").append(quote(data.getContent()));
+        builder.append(",\"metadata\":{\"mimeType\":").append(quote(data.getMimeType())).append('}');
+        builder.append(",\"_extraction_time_ms\":").append(formatMillis(elapsedMs));
+        if (batchTotalMs != null) {
+            builder.append(",\"_batch_total_ms\":").append(formatMillis(batchTotalMs));
+        }
+        builder.append(",\"_ocr_used\":").append(determineOcrUsed(data.getMimeType(), ocrEnabled));
+        builder.append('}');
+        return builder.toString();
+    }
+
+    /**
+     * Formats a millisecond value with three decimals. {@link Locale#ROOT} guarantees a '.'
+     * decimal separator; the default locale could produce ',' and therefore invalid JSON.
+     */
+    private static String formatMillis(double millis) {
+        return String.format(Locale.ROOT, "%.3f", millis);
+    }
+
+    /**
+     * Parse a JSON request line to extract the "path" field.
+     * Minimal JSON parsing to avoid adding a dependency.
+     *
+     * <p>JSON string escapes in the value are decoded, so an escaped quote does not end the
+     * path and {@code \\} yields a single backslash. Unknown or malformed escapes are kept
+     * literally. If no complete {@code "path"} string can be found, the input is returned
+     * unchanged.
+     *
+     * @param json one request line that starts with <code>{</code>
+     * @return the decoded path, or {@code json} itself when the field cannot be located
+     */
+    private static String parseJsonPath(String json) {
+        int keyIndex = json.indexOf("\"path\"");
+        if (keyIndex < 0) {
+            return json;
+        }
+        // Skip past "path" key, colon, optional whitespace, and opening quote
+        int colonIndex = json.indexOf(':', keyIndex + PATH_KEY_LENGTH);
+        if (colonIndex < 0) {
+            return json;
+        }
+        int openQuote = json.indexOf('"', colonIndex + 1);
+        if (openQuote < 0) {
+            return json;
+        }
+
+        StringBuilder value = new StringBuilder();
+        int i = openQuote + 1;
+        while (i < json.length()) {
+            char c = json.charAt(i);
+            if (c == '"') {
+                return value.toString();
+            }
+            if (c != '\\' || i + 1 >= json.length()) {
+                value.append(c);
+                i++;
+                continue;
+            }
+
+            char escaped = json.charAt(i + 1);
+            i += 2;
+            switch (escaped) {
+                case '"':
+                case '\\':
+                case '/':
+                    value.append(escaped);
+                    break;
+                case 'n':
+                    value.append('\n');
+                    break;
+                case 'r':
+                    value.append('\r');
+                    break;
+                case 't':
+                    value.append('\t');
+                    break;
+                case 'b':
+                    value.append('\b');
+                    break;
+                case 'f':
+                    value.append('\f');
+                    break;
+                case 'u':
+                    int codeUnit = parseHexCodeUnit(json, i);
+                    if (codeUnit >= 0) {
+                        value.append((char) codeUnit);
+                        i += UNICODE_ESCAPE_DIGITS;
+                    } else {
+                        value.append("\\u");
+                    }
+                    break;
+                default:
+                    // Not a JSON escape: keep both characters as they were sent.
+                    value.append('\\').append(escaped);
+            }
+        }
+        // Ran off the end without a closing quote.
+        return json;
+    }
+
+    /**
+     * Reads four hex digits starting at {@code offset}.
+     *
+     * @return the UTF-16 code unit they encode, or {@code -1} if fewer than four characters
+     *         remain or any of them is not a hex digit
+     */
+    private static int parseHexCodeUnit(String text, int offset) {
+        if (offset + UNICODE_ESCAPE_DIGITS > text.length()) {
+            return -1;
+        }
+        int codeUnit = 0;
+        for (int k = 0; k < UNICODE_ESCAPE_DIGITS; k++) {
+            int digit = Character.digit(text.charAt(offset + k), HEX_RADIX);
+            if (digit < 0) {
+                return -1;
+            }
+            codeUnit = codeUnit * HEX_RADIX + digit;
+        }
+        return codeUnit;
+    }
+
+    // CPD-OFF: quote() is intentionally duplicated in standalone benchmark scripts (no shared classpath)
+    /**
+     * Renders a string as a JSON string literal, escaping quotes, backslashes and control
+     * characters up to U+001F.
+     *
+     * @param value text to encode; may be {@code null}
+     * @return the quoted literal, or the bare token {@code null} for a {@code null} input
+     */
+    private static String quote(String value) {
+        if (value == null) {
+            return "null";
+        }
+        StringBuilder sb = new StringBuilder(value.length() + 2);
+        sb.append('"');
+        for (int i = 0; i < value.length(); i++) {
+            char c = value.charAt(i);
+            switch (c) {
+                case '\\': sb.append("\\\\"); break;
+                case '"':  sb.append("\\\""); break;
+                case '\n': sb.append("\\n");  break;
+                case '\r': sb.append("\\r");  break;
+                case '\t': sb.append("\\t");  break;
+                case '\b': sb.append("\\b");  break;
+                case '\f': sb.append("\\f");  break;
+                default:
+                    if (c <= LAST_CONTROL_CHAR) {
+                        sb.append(String.format(Locale.ROOT, "\\u%04x", (int) c));
+                    } else {
+                        sb.append(c);
+                    }
+            }
+        }
+        sb.append('"');
+        return sb.toString();
+    }
+    // CPD-ON
+
+    /**
+     * Writes one aligned {@code key = value} diagnostic line to stderr.
+     *
+     * @param key   label for the value
+     * @param value value to print; {@code null} is shown as {@code (null)}
+     */
+    private static void debugLog(String key, String value) {
+        String shown = value == null ? "(null)" : value;
+        System.err.printf("[BENCHMARK_DEBUG] %-30s = %s%n", key, shown);
+    }
+
+    /** Immutable pair of extracted text and detected MIME type. */
+    private static final class ExtractionData {
+        private final String content;
+        private final String mimeType;
+
+        ExtractionData(String content, String mimeType) {
+            this.content = content;
+            this.mimeType = mimeType;
+        }
+
+        String getContent() {
+            return content;
+        }
+
+        String getMimeType() {
+            return mimeType;
+        }
+    }
+}