395 lines
14 KiB
Java
395 lines
14 KiB
Java
|
|
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.ocr.TesseractOCRConfig;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.metadata.Metadata;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
|
||
|
|
|
||
|
|
public final class TikaExtract {
|
||
|
|
private static final double NANOS_IN_MILLISECOND = 1_000_000.0;
|
||
|
|
/** Length of the JSON key {@code "path"} including surrounding quotes. */
|
||
|
|
private static final int PATH_KEY_LENGTH = 6;
|
||
|
|
private static final char LAST_CONTROL_CHAR = 0x1F;
|
||
|
|
|
||
|
|
private TikaExtract() {
|
||
|
|
}
|
||
|
|
|
||
|
|
public static void main(String[] args) {
|
||
|
|
boolean ocrEnabled = false;
|
||
|
|
List<String> positionalArgs = new ArrayList<>();
|
||
|
|
|
||
|
|
for (String arg : args) {
|
||
|
|
if ("--ocr".equals(arg)) {
|
||
|
|
ocrEnabled = true;
|
||
|
|
} else if ("--no-ocr".equals(arg)) {
|
||
|
|
ocrEnabled = false;
|
||
|
|
} else {
|
||
|
|
positionalArgs.add(arg);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
if (positionalArgs.isEmpty()) {
|
||
|
|
System.err.println("Usage: TikaExtract [--ocr|--no-ocr] <mode> <file1> [file2] ...");
|
||
|
|
System.err.println("Modes: sync, batch, server");
|
||
|
|
System.exit(1);
|
||
|
|
}
|
||
|
|
|
||
|
|
String mode = positionalArgs.get(0);
|
||
|
|
if (!"sync".equals(mode) && !"batch".equals(mode) && !"server".equals(mode)) {
|
||
|
|
System.err.printf("Unsupported mode '%s'%n", mode);
|
||
|
|
System.exit(1);
|
||
|
|
}
|
||
|
|
|
||
|
|
// Enable debug logging if TIKA_BENCHMARK_DEBUG is set
|
||
|
|
boolean debug = "true".equalsIgnoreCase(System.getenv("TIKA_BENCHMARK_DEBUG"));
|
||
|
|
|
||
|
|
if (debug) {
|
||
|
|
debugLog("java.version", System.getProperty("java.version"));
|
||
|
|
debugLog("os.name", System.getProperty("os.name"));
|
||
|
|
debugLog("os.arch", System.getProperty("os.arch"));
|
||
|
|
debugLog("Mode", mode);
|
||
|
|
debugLog("OCR enabled", String.valueOf(ocrEnabled));
|
||
|
|
debugLog("Files to process", String.valueOf(positionalArgs.size() - 1));
|
||
|
|
}
|
||
|
|
|
||
|
|
try {
|
||
|
|
if ("sync".equals(mode)) {
|
||
|
|
if (positionalArgs.size() < 2) {
|
||
|
|
System.err.println("Sync mode requires exactly one file");
|
||
|
|
System.exit(1);
|
||
|
|
}
|
||
|
|
processSyncMode(positionalArgs.get(1), ocrEnabled, debug);
|
||
|
|
} else if ("batch".equals(mode)) {
|
||
|
|
processBatchMode(positionalArgs, ocrEnabled, debug);
|
||
|
|
} else {
|
||
|
|
processServerMode(ocrEnabled, debug);
|
||
|
|
}
|
||
|
|
} catch (Exception e) {
|
||
|
|
if (debug) {
|
||
|
|
debugLog("Processing failed with exception", e.getClass().getName());
|
||
|
|
e.printStackTrace(System.err);
|
||
|
|
} else {
|
||
|
|
e.printStackTrace(System.err);
|
||
|
|
}
|
||
|
|
System.exit(1);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
private static void processSyncMode(String filePath, boolean ocrEnabled, boolean debug) throws Exception {
|
||
|
|
if (debug) {
|
||
|
|
debugLog("Input file", filePath);
|
||
|
|
}
|
||
|
|
|
||
|
|
Path path = Path.of(filePath);
|
||
|
|
ExtractionData data;
|
||
|
|
long start = System.nanoTime();
|
||
|
|
|
||
|
|
try {
|
||
|
|
if (debug) {
|
||
|
|
debugLog("Starting extraction", "");
|
||
|
|
}
|
||
|
|
data = extractFile(path.toFile(), ocrEnabled, debug);
|
||
|
|
if (debug) {
|
||
|
|
debugLog("Extraction completed", "");
|
||
|
|
}
|
||
|
|
} catch (Exception e) {
|
||
|
|
if (debug) {
|
||
|
|
debugLog("Extraction failed", e.getClass().getName());
|
||
|
|
e.printStackTrace(System.err);
|
||
|
|
}
|
||
|
|
throw e;
|
||
|
|
}
|
||
|
|
|
||
|
|
double elapsedMs = (System.nanoTime() - start) / NANOS_IN_MILLISECOND;
|
||
|
|
String json = toJson(data, elapsedMs, ocrEnabled);
|
||
|
|
System.out.print(json);
|
||
|
|
}
|
||
|
|
|
||
|
|
private static void processBatchMode(
|
||
|
|
List<String> positionalArgs, boolean ocrEnabled, boolean debug) throws Exception {
|
||
|
|
List<String> filePaths = new ArrayList<>();
|
||
|
|
for (int i = 1; i < positionalArgs.size(); i++) {
|
||
|
|
filePaths.add(positionalArgs.get(i));
|
||
|
|
}
|
||
|
|
|
||
|
|
long batchStart = System.nanoTime();
|
||
|
|
StringBuilder jsonArray = new StringBuilder();
|
||
|
|
jsonArray.append('[');
|
||
|
|
|
||
|
|
boolean first = true;
|
||
|
|
for (String filePath : filePaths) {
|
||
|
|
if (debug) {
|
||
|
|
debugLog("Processing file", filePath);
|
||
|
|
}
|
||
|
|
|
||
|
|
try {
|
||
|
|
Path path = Path.of(filePath);
|
||
|
|
long start = System.nanoTime();
|
||
|
|
ExtractionData data = extractFile(path.toFile(), ocrEnabled, debug);
|
||
|
|
double elapsedMs = (System.nanoTime() - start) / NANOS_IN_MILLISECOND;
|
||
|
|
|
||
|
|
if (!first) {
|
||
|
|
jsonArray.append(',');
|
||
|
|
}
|
||
|
|
first = false;
|
||
|
|
|
||
|
|
double batchTotalMs = (System.nanoTime() - batchStart) / NANOS_IN_MILLISECOND;
|
||
|
|
jsonArray.append(toJsonWithBatch(data, elapsedMs, batchTotalMs, ocrEnabled));
|
||
|
|
|
||
|
|
if (debug) {
|
||
|
|
debugLog("File processed", filePath);
|
||
|
|
}
|
||
|
|
} catch (Exception e) {
|
||
|
|
if (debug) {
|
||
|
|
debugLog("Failed to process file", filePath);
|
||
|
|
debugLog("Exception", e.getClass().getName());
|
||
|
|
e.printStackTrace(System.err);
|
||
|
|
} else {
|
||
|
|
System.err.printf("Error processing %s: %s%n", filePath, e.getMessage());
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
double totalBatchMs = (System.nanoTime() - batchStart) / NANOS_IN_MILLISECOND;
|
||
|
|
jsonArray.append(']');
|
||
|
|
|
||
|
|
if (first) {
|
||
|
|
System.err.println("No files were successfully processed");
|
||
|
|
System.exit(1);
|
||
|
|
return;
|
||
|
|
}
|
||
|
|
|
||
|
|
System.out.print(jsonArray.toString());
|
||
|
|
}
|
||
|
|
|
||
|
|
private static void processServerMode(boolean ocrEnabled, boolean debug) throws Exception {
|
||
|
|
// Pre-create shared parser and OCR config to avoid per-file construction overhead.
|
||
|
|
// AutoDetectParser is thread-safe and reusable. Only BodyContentHandler and Metadata
|
||
|
|
// need to be recreated per extraction since they accumulate state.
|
||
|
|
AutoDetectParser sharedParser = new AutoDetectParser();
|
||
|
|
TesseractOCRConfig sharedOcrConfig = new TesseractOCRConfig();
|
||
|
|
if (!ocrEnabled) {
|
||
|
|
sharedOcrConfig.setSkipOcr(true);
|
||
|
|
} else {
|
||
|
|
sharedOcrConfig.setLanguage("eng");
|
||
|
|
}
|
||
|
|
|
||
|
|
// Signal readiness after JVM + Tika parser initialization
|
||
|
|
System.out.println("READY");
|
||
|
|
System.out.flush();
|
||
|
|
|
||
|
|
BufferedReader reader = new BufferedReader(new InputStreamReader(System.in));
|
||
|
|
String line;
|
||
|
|
while ((line = reader.readLine()) != null) {
|
||
|
|
String filePath = line.trim();
|
||
|
|
if (filePath.isEmpty()) {
|
||
|
|
continue;
|
||
|
|
}
|
||
|
|
// Parse JSON request if the harness sends {"path":"...", "force_ocr": ...}
|
||
|
|
if (filePath.startsWith("{")) {
|
||
|
|
filePath = parseJsonPath(filePath);
|
||
|
|
}
|
||
|
|
try {
|
||
|
|
Path path = Path.of(filePath);
|
||
|
|
long start = System.nanoTime();
|
||
|
|
ExtractionData data = extractFileWithParser(path.toFile(), sharedParser, sharedOcrConfig, debug);
|
||
|
|
double elapsedMs = (System.nanoTime() - start) / NANOS_IN_MILLISECOND;
|
||
|
|
String json = toJson(data, elapsedMs, ocrEnabled);
|
||
|
|
System.out.println(json);
|
||
|
|
System.out.flush();
|
||
|
|
} catch (Exception e) {
|
||
|
|
String errorJson = String.format(
|
||
|
|
"{\"error\":%s,\"_extraction_time_ms\":0,\"_ocr_used\":false}",
|
||
|
|
quote(e.getMessage()));
|
||
|
|
System.out.println(errorJson);
|
||
|
|
System.out.flush();
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
private static ExtractionData extractFileWithParser(
|
||
|
|
File file, AutoDetectParser parser, TesseractOCRConfig ocrConfig, boolean debug) throws Exception {
|
||
|
|
if (!file.exists()) {
|
||
|
|
throw new IllegalArgumentException("File does not exist: " + file.getAbsolutePath());
|
||
|
|
}
|
||
|
|
|
||
|
|
BodyContentHandler handler = new BodyContentHandler(-1);
|
||
|
|
Metadata metadata = new Metadata();
|
||
|
|
ParseContext context = new ParseContext();
|
||
|
|
context.set(TesseractOCRConfig.class, ocrConfig);
|
||
|
|
|
||
|
|
try (InputStream stream = new FileInputStream(file)) {
|
||
|
|
parser.parse(stream, handler, metadata, context);
|
||
|
|
}
|
||
|
|
|
||
|
|
String content = handler.toString();
|
||
|
|
String mimeType = metadata.get(Metadata.CONTENT_TYPE);
|
||
|
|
|
||
|
|
if (mimeType == null) {
|
||
|
|
mimeType = "application/octet-stream";
|
||
|
|
}
|
||
|
|
|
||
|
|
return new ExtractionData(content, mimeType);
|
||
|
|
}
|
||
|
|
|
||
|
|
private static ExtractionData extractFile(File file, boolean ocrEnabled, boolean debug) throws Exception {
|
||
|
|
if (!file.exists()) {
|
||
|
|
throw new IllegalArgumentException("File does not exist: " + file.getAbsolutePath());
|
||
|
|
}
|
||
|
|
|
||
|
|
AutoDetectParser parser = new AutoDetectParser();
|
||
|
|
BodyContentHandler handler = new BodyContentHandler(-1);
|
||
|
|
Metadata metadata = new Metadata();
|
||
|
|
ParseContext context = new ParseContext();
|
||
|
|
|
||
|
|
if (!ocrEnabled) {
|
||
|
|
TesseractOCRConfig ocrConfig = new TesseractOCRConfig();
|
||
|
|
ocrConfig.setSkipOcr(true);
|
||
|
|
context.set(TesseractOCRConfig.class, ocrConfig);
|
||
|
|
} else {
|
||
|
|
TesseractOCRConfig ocrConfig = new TesseractOCRConfig();
|
||
|
|
ocrConfig.setLanguage("eng");
|
||
|
|
context.set(TesseractOCRConfig.class, ocrConfig);
|
||
|
|
}
|
||
|
|
|
||
|
|
try (InputStream stream = new FileInputStream(file)) {
|
||
|
|
parser.parse(stream, handler, metadata, context);
|
||
|
|
}
|
||
|
|
|
||
|
|
String content = handler.toString();
|
||
|
|
String mimeType = metadata.get(Metadata.CONTENT_TYPE);
|
||
|
|
|
||
|
|
if (mimeType == null) {
|
||
|
|
mimeType = "application/octet-stream";
|
||
|
|
}
|
||
|
|
|
||
|
|
return new ExtractionData(content, mimeType);
|
||
|
|
}
|
||
|
|
|
||
|
|
/**
|
||
|
|
* Determine if OCR was actually used based on MIME type and OCR config.
|
||
|
|
* OCR is used by Tika when enabled and the file is an image type.
|
||
|
|
*/
|
||
|
|
private static boolean determineOcrUsed(String mimeType, boolean ocrEnabled) {
|
||
|
|
if (!ocrEnabled) {
|
||
|
|
return false;
|
||
|
|
}
|
||
|
|
return mimeType != null && mimeType.startsWith("image/");
|
||
|
|
}
|
||
|
|
|
||
|
|
private static String toJson(ExtractionData data, double elapsedMs, boolean ocrEnabled) {
|
||
|
|
StringBuilder builder = new StringBuilder();
|
||
|
|
builder.append('{');
|
||
|
|
builder.append("\"content\":").append(quote(data.getContent())).append(',');
|
||
|
|
builder.append("\"metadata\":{");
|
||
|
|
builder.append("\"mimeType\":").append(quote(data.getMimeType()));
|
||
|
|
builder.append("},\"_extraction_time_ms\":").append(String.format("%.3f", elapsedMs));
|
||
|
|
builder.append(",\"_ocr_used\":").append(determineOcrUsed(data.getMimeType(), ocrEnabled));
|
||
|
|
builder.append('}');
|
||
|
|
return builder.toString();
|
||
|
|
}
|
||
|
|
|
||
|
|
private static String toJsonWithBatch(
|
||
|
|
ExtractionData data, double elapsedMs, double batchTotalMs, boolean ocrEnabled) {
|
||
|
|
StringBuilder builder = new StringBuilder();
|
||
|
|
builder.append('{');
|
||
|
|
builder.append("\"content\":").append(quote(data.getContent())).append(',');
|
||
|
|
builder.append("\"metadata\":{");
|
||
|
|
builder.append("\"mimeType\":").append(quote(data.getMimeType()));
|
||
|
|
builder.append("},\"_extraction_time_ms\":").append(String.format("%.3f", elapsedMs));
|
||
|
|
builder.append(",\"_batch_total_ms\":").append(String.format("%.3f", batchTotalMs));
|
||
|
|
builder.append(",\"_ocr_used\":").append(determineOcrUsed(data.getMimeType(), ocrEnabled));
|
||
|
|
builder.append('}');
|
||
|
|
return builder.toString();
|
||
|
|
}
|
||
|
|
|
||
|
|
/**
|
||
|
|
* Parse a JSON request line to extract the "path" field.
|
||
|
|
* Minimal JSON parsing to avoid adding a dependency.
|
||
|
|
*/
|
||
|
|
private static String parseJsonPath(String json) {
|
||
|
|
int idx = json.indexOf("\"path\"");
|
||
|
|
if (idx < 0) {
|
||
|
|
return json;
|
||
|
|
}
|
||
|
|
// Skip past "path" key, colon, optional whitespace, and opening quote
|
||
|
|
idx = json.indexOf(':', idx + PATH_KEY_LENGTH);
|
||
|
|
if (idx < 0) {
|
||
|
|
return json;
|
||
|
|
}
|
||
|
|
idx = json.indexOf('"', idx + 1);
|
||
|
|
if (idx < 0) {
|
||
|
|
return json;
|
||
|
|
}
|
||
|
|
int start = idx + 1;
|
||
|
|
int end = json.indexOf('"', start);
|
||
|
|
if (end < 0) {
|
||
|
|
return json;
|
||
|
|
}
|
||
|
|
return json.substring(start, end);
|
||
|
|
}
|
||
|
|
|
||
|
|
// CPD-OFF: quote() is intentionally duplicated in standalone benchmark scripts (no shared classpath)
|
||
|
|
private static String quote(String value) {
|
||
|
|
if (value == null) {
|
||
|
|
return "null";
|
||
|
|
}
|
||
|
|
StringBuilder sb = new StringBuilder(value.length() + 2);
|
||
|
|
sb.append('"');
|
||
|
|
for (int i = 0; i < value.length(); i++) {
|
||
|
|
char c = value.charAt(i);
|
||
|
|
switch (c) {
|
||
|
|
case '\\': sb.append("\\\\"); break;
|
||
|
|
case '"': sb.append("\\\""); break;
|
||
|
|
case '\n': sb.append("\\n"); break;
|
||
|
|
case '\r': sb.append("\\r"); break;
|
||
|
|
case '\t': sb.append("\\t"); break;
|
||
|
|
case '\b': sb.append("\\b"); break;
|
||
|
|
case '\f': sb.append("\\f"); break;
|
||
|
|
default:
|
||
|
|
if (c <= LAST_CONTROL_CHAR) {
|
||
|
|
sb.append(String.format("\\u%04x", (int) c));
|
||
|
|
} else {
|
||
|
|
sb.append(c);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
sb.append('"');
|
||
|
|
return sb.toString();
|
||
|
|
}
|
||
|
|
// CPD-ON
|
||
|
|
|
||
|
|
private static void debugLog(String key, String value) {
|
||
|
|
if (value == null) {
|
||
|
|
value = "(null)";
|
||
|
|
}
|
||
|
|
System.err.printf("[BENCHMARK_DEBUG] %-30s = %s%n", key, value);
|
||
|
|
}
|
||
|
|
|
||
|
|
private static class ExtractionData {
|
||
|
|
private final String content;
|
||
|
|
private final String mimeType;
|
||
|
|
|
||
|
|
ExtractionData(String content, String mimeType) {
|
||
|
|
this.content = content;
|
||
|
|
this.mimeType = mimeType;
|
||
|
|
}
|
||
|
|
|
||
|
|
String getContent() {
|
||
|
|
return content;
|
||
|
|
}
|
||
|
|
|
||
|
|
String getMimeType() {
|
||
|
|
return mimeType;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|