134 lines
3.6 KiB
Elixir
134 lines
3.6 KiB
Elixir
```elixir title="Elixir"
|
|
# Extract images from documents for further processing
|
|
# Images are returned as base64-encoded data with format and size information
|
|
|
|
alias Kreuzberg.ExtractionConfig
|
|
|
|
defmodule ImageExtractor do
|
|
@moduledoc """
|
|
Extract and process images from documents.
|
|
Provides utilities for working with extracted image data.
|
|
"""
|
|
|
|
@doc """
|
|
Extract all images from a document.
|
|
|
|
Returns a list of image data with format and metadata.
|
|
"""
|
|
def extract_images(file_path) do
|
|
config = %ExtractionConfig{
|
|
# Enable image extraction
|
|
images: %{
|
|
"extract" => true
|
|
},
|
|
use_cache: true
|
|
}
|
|
|
|
case Kreuzberg.extract_file(file_path, nil, config) do
|
|
{:ok, result} ->
|
|
{:ok, result.images || []}
|
|
|
|
{:error, reason} ->
|
|
{:error, reason}
|
|
end
|
|
end
|
|
|
|
@doc """
|
|
Save extracted images to disk.
|
|
|
|
Takes extracted images and writes them to individual files.
|
|
"""
|
|
def save_images(images, output_dir) do
|
|
File.mkdir_p!(output_dir)
|
|
|
|
Enum.with_index(images, 1) |> Enum.map(fn {image, idx} ->
|
|
format = image["format"] || "png"
|
|
filename = "#{output_dir}/image_#{idx}.#{format}"
|
|
|
|
# Decode base64 image data
|
|
case Base.decode64(image["data"]) do
|
|
{:ok, binary_data} ->
|
|
File.write!(filename, binary_data)
|
|
{:ok, filename}
|
|
|
|
:error ->
|
|
{:error, "Failed to decode image #{idx}"}
|
|
end
|
|
end)
|
|
end
|
|
|
|
@doc """
|
|
Get image statistics from extracted images.
|
|
"""
|
|
def get_image_stats(images) do
|
|
{count, total_size, formats} = Enum.reduce(images, {0, 0, %{}}, fn image, {count, size, formats} ->
|
|
new_count = count + 1
|
|
new_size = size + (image["size"] || 0)
|
|
format = image["format"] || "unknown"
|
|
new_formats = Map.update(formats, format, 1, &(&1 + 1))
|
|
|
|
{new_count, new_size, new_formats}
|
|
end)
|
|
|
|
%{
|
|
total_images: count,
|
|
total_bytes: total_size,
|
|
formats: formats,
|
|
avg_size: if(count > 0, do: div(total_size, count), else: 0)
|
|
}
|
|
end
|
|
end
|
|
|
|
# Usage example
|
|
file_path = "document_with_images.pdf"
|
|
|
|
IO.puts("Extracting images from: #{file_path}\n")
|
|
|
|
case ImageExtractor.extract_images(file_path) do
|
|
{:ok, images} ->
|
|
IO.puts("Found #{length(images)} image(s)\n")
|
|
|
|
# Get image statistics
|
|
stats = ImageExtractor.get_image_stats(images)
|
|
IO.puts("=== Image Statistics ===")
|
|
IO.puts("Total images: #{stats.total_images}")
|
|
IO.puts("Total size: #{stats.total_bytes} bytes (#{div(stats.total_bytes, 1024)} KB)")
|
|
IO.puts("Average size: #{stats.avg_size} bytes")
|
|
IO.puts("Formats: #{inspect(stats.formats)}")
|
|
IO.puts("")
|
|
|
|
# Display individual image information
|
|
IO.puts("=== Individual Images ===")
|
|
Enum.with_index(images, 1) |> Enum.each(fn {image, idx} ->
|
|
IO.puts("Image #{idx}:")
|
|
IO.puts(" Format: #{image["format"]}")
|
|
IO.puts(" Size: #{image["size"]} bytes")
|
|
|
|
# Optional: show dimensions if available
|
|
if image["width"] && image["height"] do
|
|
IO.puts(" Dimensions: #{image["width"]}x#{image["height"]} pixels")
|
|
end
|
|
|
|
# Optional: show DPI if available
|
|
if image["dpi"] do
|
|
IO.puts(" DPI: #{image["dpi"]}")
|
|
end
|
|
|
|
IO.puts("")
|
|
end)
|
|
|
|
# Save images to disk
|
|
case ImageExtractor.save_images(images, "/tmp/extracted_images") do
|
|
results ->
|
|
successful = Enum.count(results, fn
|
|
{:ok, _path} -> true
|
|
_ -> false
|
|
end)
|
|
IO.puts("Saved #{successful}/#{length(results)} images to /tmp/extracted_images")
|
|
end
|
|
|
|
{:error, reason} ->
|
|
IO.puts("Error extracting images: #{reason}")
|
|
end
|
|
```
|