307 lines
7.9 KiB
Elixir
307 lines
7.9 KiB
Elixir
```elixir title="Elixir"
|
|
# Basic CLI Tool - Simple command-line interface for Kreuzberg
|
|
# Demonstrates how to build a CLI for document extraction
|
|
|
|
defmodule KreuzbergCLI do
|
|
@moduledoc """
|
|
Command-line interface for Kreuzberg document extraction.
|
|
|
|
Provides a simple, user-friendly CLI for extracting documents
|
|
from the command line with progress feedback.
|
|
"""
|
|
|
|
require Logger
|
|
|
|
@doc """
|
|
Main entry point for CLI application.
|
|
|
|
Parses command-line arguments and executes the appropriate command.
|
|
"""
|
|
def main(args) do
|
|
args
|
|
|> parse_args()
|
|
|> execute()
|
|
end
|
|
|
|
defp parse_args(args) do
|
|
case args do
|
|
[] ->
|
|
{:error, :no_command}
|
|
|
|
["extract", file | rest] ->
|
|
opts = parse_options(rest, %{})
|
|
{:extract, file, opts}
|
|
|
|
["batch", dir | rest] ->
|
|
opts = parse_options(rest, %{})
|
|
{:batch, dir, opts}
|
|
|
|
["help"] ->
|
|
{:help}
|
|
|
|
["-h"] ->
|
|
{:help}
|
|
|
|
["--help"] ->
|
|
{:help}
|
|
|
|
[cmd] ->
|
|
{:error, "Unknown command: #{cmd}"}
|
|
|
|
_ ->
|
|
{:error, :invalid_args}
|
|
end
|
|
end
|
|
|
|
defp parse_options([], acc), do: acc
|
|
|
|
defp parse_options(["-v" | rest], acc) do
|
|
parse_options(rest, Map.put(acc, :verbose, true))
|
|
end
|
|
|
|
defp parse_options(["--mime-type", mime | rest], acc) do
|
|
parse_options(rest, Map.put(acc, :mime_type, mime))
|
|
end
|
|
|
|
defp parse_options(["--output", path | rest], acc) do
|
|
parse_options(rest, Map.put(acc, :output, path))
|
|
end
|
|
|
|
defp parse_options(["--ocr" | rest], acc) do
|
|
parse_options(rest, Map.put(acc, :enable_ocr, true))
|
|
end
|
|
|
|
defp parse_options(["--chunks" | rest], acc) do
|
|
parse_options(rest, Map.put(acc, :enable_chunks, true))
|
|
end
|
|
|
|
defp parse_options([_ | rest], acc) do
|
|
parse_options(rest, acc)
|
|
end
|
|
|
|
defp execute({:help}) do
|
|
print_help()
|
|
:ok
|
|
end
|
|
|
|
defp execute({:extract, file, opts}) do
|
|
unless File.exists?(file) do
|
|
IO.puts(:stderr, "Error: File not found: #{file}")
|
|
:error
|
|
else
|
|
extract_file(file, opts)
|
|
end
|
|
end
|
|
|
|
defp execute({:batch, dir, opts}) do
|
|
unless File.dir?(dir) do
|
|
IO.puts(:stderr, "Error: Directory not found: #{dir}")
|
|
:error
|
|
else
|
|
batch_extract(dir, opts)
|
|
end
|
|
end
|
|
|
|
defp execute({:error, reason}) do
|
|
IO.puts(:stderr, "Error: #{inspect(reason)}")
|
|
print_help()
|
|
:error
|
|
end
|
|
|
|
defp extract_file(file_path, opts) do
|
|
verbose = Map.get(opts, :verbose, false)
|
|
mime_type = Map.get(opts, :mime_type, nil)
|
|
output_path = Map.get(opts, :output, nil)
|
|
|
|
config = build_config(opts)
|
|
|
|
IO.puts("Extracting: #{file_path}")
|
|
start_time = System.monotonic_time(:millisecond)
|
|
|
|
case Kreuzberg.extract_file(file_path, mime_type, config) do
|
|
{:ok, result} ->
|
|
elapsed = System.monotonic_time(:millisecond) - start_time
|
|
|
|
print_extraction_result(result, elapsed, verbose)
|
|
|
|
if output_path do
|
|
save_result(result, output_path)
|
|
end
|
|
|
|
:ok
|
|
|
|
{:error, reason} ->
|
|
IO.puts(:stderr, "Extraction failed: #{inspect(reason)}")
|
|
:error
|
|
end
|
|
end
|
|
|
|
defp batch_extract(dir, opts) do
|
|
verbose = Map.get(opts, :verbose, false)
|
|
config = build_config(opts)
|
|
|
|
pattern = Path.join(dir, "**/*.{pdf,docx,xlsx,txt,html,md}")
|
|
files = Path.wildcard(pattern)
|
|
|
|
case files do
|
|
[] ->
|
|
IO.puts("No documents found in #{dir}")
|
|
:ok
|
|
|
|
_ ->
|
|
IO.puts("Found #{length(files)} documents\n")
|
|
|
|
results =
|
|
files
|
|
|> Enum.with_index(1)
|
|
|> Enum.map(fn {file, idx} ->
|
|
IO.write(" [#{idx}/#{length(files)}] ")
|
|
start_time = System.monotonic_time(:millisecond)
|
|
|
|
case Kreuzberg.extract_file(file, nil, config) do
|
|
{:ok, result} ->
|
|
elapsed = System.monotonic_time(:millisecond) - start_time
|
|
IO.puts("#{Path.basename(file)} (#{elapsed}ms)")
|
|
{:ok, file, result, elapsed}
|
|
|
|
{:error, reason} ->
|
|
IO.puts("#{Path.basename(file)} - ERROR")
|
|
if verbose, do: IO.puts(" Error: #{inspect(reason)}")
|
|
{:error, file, reason}
|
|
end
|
|
end)
|
|
|
|
print_batch_summary(results)
|
|
:ok
|
|
end
|
|
end
|
|
|
|
defp build_config(opts) do
|
|
%Kreuzberg.ExtractionConfig{
|
|
ocr:
|
|
if(Map.get(opts, :enable_ocr),
|
|
do: %{"enabled" => true, "backend" => "tesseract"},
|
|
else: nil
|
|
),
|
|
chunking:
|
|
if(Map.get(opts, :enable_chunks),
|
|
do: %{"enabled" => true, "max_characters" => 1000, "overlap" => 100},
|
|
else: nil
|
|
),
|
|
use_cache: true
|
|
}
|
|
end
|
|
|
|
defp print_extraction_result(result, elapsed_ms, verbose) do
|
|
IO.puts("\nExtraction Results:")
|
|
IO.puts(" Content size: #{byte_size(result.content)} bytes")
|
|
IO.puts(" MIME type: #{result.mime_type}")
|
|
IO.puts(" Processing time: #{elapsed_ms}ms")
|
|
|
|
if result.metadata do
|
|
IO.puts(" Metadata keys: #{Enum.count(result.metadata)}")
|
|
end
|
|
|
|
if result.tables && !Enum.empty?(result.tables) do
|
|
IO.puts(" Tables found: #{length(result.tables)}")
|
|
end
|
|
|
|
if result.images && !Enum.empty?(result.images) do
|
|
IO.puts(" Images found: #{length(result.images)}")
|
|
end
|
|
|
|
if result.chunks && !Enum.empty?(result.chunks) do
|
|
IO.puts(" Chunks created: #{length(result.chunks)}")
|
|
end
|
|
|
|
if result.detected_languages && !Enum.empty?(result.detected_languages) do
|
|
IO.puts(" Languages: #{Enum.join(result.detected_languages, ", ")}")
|
|
end
|
|
|
|
if verbose do
|
|
IO.puts("\n Full metadata:")
|
|
IO.inspect(result.metadata, pretty: true)
|
|
end
|
|
|
|
IO.puts("")
|
|
end
|
|
|
|
defp print_batch_summary(results) do
|
|
total = length(results)
|
|
successful = Enum.count(results, &match?({:ok, _, _, _}, &1))
|
|
failed = Enum.count(results, &match?({:error, _, _}, &1))
|
|
|
|
total_time =
|
|
results
|
|
|> Enum.filter(&match?({:ok, _, _, _}, &1))
|
|
|> Enum.map(fn {:ok, _, _, time} -> time end)
|
|
|> Enum.sum()
|
|
|
|
IO.puts("\nBatch Summary:")
|
|
IO.puts(" Total: #{total}")
|
|
IO.puts(" Successful: #{successful}")
|
|
IO.puts(" Failed: #{failed}")
|
|
IO.puts(" Total time: #{total_time}ms")
|
|
IO.puts(" Average time: #{div(total_time, max(successful, 1))}ms/document")
|
|
end
|
|
|
|
defp save_result(result, output_path) do
|
|
output_data = %{
|
|
content: result.content,
|
|
mime_type: result.mime_type,
|
|
metadata: result.metadata,
|
|
tables: result.tables || [],
|
|
images: result.images || [],
|
|
chunks: result.chunks || [],
|
|
detected_languages: result.detected_languages || [],
|
|
extracted_at: DateTime.utc_now()
|
|
}
|
|
|
|
case File.write(output_path, Jason.encode!(output_data, pretty: true)) do
|
|
:ok ->
|
|
IO.puts("Results saved to: #{output_path}")
|
|
|
|
{:error, reason} ->
|
|
IO.puts(:stderr, "Failed to save results: #{inspect(reason)}")
|
|
end
|
|
end
|
|
|
|
defp print_help do
|
|
IO.puts("""
|
|
Kreuzberg CLI - Document Extraction Tool
|
|
|
|
USAGE:
|
|
kreuzberg extract <file> [OPTIONS]
|
|
kreuzberg batch <directory> [OPTIONS]
|
|
kreuzberg help
|
|
|
|
COMMANDS:
|
|
extract <file> Extract content from a single document
|
|
batch <directory> Extract all documents in a directory
|
|
help Show this help message
|
|
|
|
OPTIONS:
|
|
-v, --verbose Show detailed output
|
|
--mime-type <type> Specify MIME type (e.g., application/pdf)
|
|
--output <path> Save results to JSON file
|
|
--ocr Enable OCR for scanned documents
|
|
--chunks Enable document chunking for RAG
|
|
|
|
EXAMPLES:
|
|
kreuzberg extract document.pdf
|
|
kreuzberg extract document.pdf --output results.json
|
|
kreuzberg batch ./documents --ocr --chunks
|
|
kreuzberg batch ./documents -v --output summary.json
|
|
""")
|
|
end
|
|
end
|
|
|
|
# Entry point for escript
|
|
def main(args) do
|
|
case KreuzbergCLI.main(args) do
|
|
:ok -> 0
|
|
:error -> 1
|
|
end
|
|
end
|
|
```
|