This commit is contained in:
39
docs/snippets/elixir/ocr/cloud_ocr_backend.md
Normal file
39
docs/snippets/elixir/ocr/cloud_ocr_backend.md
Normal file
@@ -0,0 +1,39 @@
|
||||
```elixir title="Elixir"
|
||||
defmodule MyApp.CloudOcrBackend do
  @moduledoc """
  Example OCR backend that hands images to a cloud service.

  The struct holds the API key and the language codes the backend accepts.
  The actual service call is a stub that always returns a fixed result.
  """

  @behaviour Kreuzberg.Plugin

  defstruct api_key: nil, supported_langs: []

  @doc """
  Builds a backend struct from an API key and a list of language codes.
  """
  def new(api_key, supported_langs),
    do: %__MODULE__{api_key: api_key, supported_langs: supported_langs}

  @impl Kreuzberg.Plugin
  def name(_backend) do
    "cloud-ocr"
  end

  @impl Kreuzberg.Plugin
  def version(_backend) do
    "1.0.0"
  end

  @impl Kreuzberg.Plugin
  def initialize(_backend) do
    :ok
  end

  @impl Kreuzberg.Plugin
  def shutdown(_backend) do
    :ok
  end

  @doc """
  Runs OCR on `image_bytes` for the given `language`.

  Delegates to the (stubbed) cloud request and returns its result unchanged.
  """
  def process_image(backend, image_bytes, language),
    do: request_ocr(backend, image_bytes, language)

  @doc """
  Returns `true` when `lang` is one of the backend's supported languages.
  """
  def supports_language(backend, lang), do: lang in backend.supported_langs

  # Placeholder for the real HTTP call; ignores its arguments.
  defp request_ocr(_backend, _image, _language), do: {:ok, "Extracted text"}
end
|
||||
|
||||
# Build the custom backend struct (registering it with Kreuzberg is not shown here)
|
||||
backend = MyApp.CloudOcrBackend.new("api-key", ["en", "de", "fr"])
|
||||
# Use with Kreuzberg extraction...
|
||||
```
|
||||
133
docs/snippets/elixir/ocr/image_extraction.exs
Normal file
133
docs/snippets/elixir/ocr/image_extraction.exs
Normal file
@@ -0,0 +1,133 @@
|
||||
```elixir title="Elixir"
|
||||
# Extract images from documents for further processing
|
||||
# Images are returned as base64-encoded data with format and size information
|
||||
|
||||
alias Kreuzberg.ExtractionConfig
|
||||
|
||||
defmodule ImageExtractor do
  @moduledoc """
  Extract and process images from documents.

  Provides utilities for extracting embedded images, writing them to disk,
  and summarising them. Images are treated as maps with string keys; the
  functions below read `"format"`, `"data"` (base64) and `"size"`.
  """

  @doc """
  Extract all images from a document.

  Returns `{:ok, images}` where `images` is a list (empty when the result
  carries no images), or the `{:error, reason}` tuple produced by
  `Kreuzberg.extract_file/3`.
  """
  def extract_images(file_path) do
    config = %ExtractionConfig{
      # Enable image extraction.
      # NOTE(review): `extract_images` is the flag name used by the JSON form
      # of the image config; confirm against the Kreuzberg image options.
      images: %{"extract_images" => true},
      use_cache: true
    }

    # Errors fall through unchanged; only the success shape is rewritten.
    with {:ok, result} <- Kreuzberg.extract_file(file_path, nil, config) do
      {:ok, result.images || []}
    end
  end

  @doc """
  Save extracted images to disk.

  Creates `output_dir` if needed and writes each image to
  `image_<index>.<format>` (index starts at 1, format defaults to `"png"`).

  Returns one entry per image: `{:ok, filename}` on success or
  `{:error, message}` when the image has no decodable base64 data.
  Raises if the directory cannot be created or a file cannot be written.
  """
  def save_images(images, output_dir) do
    File.mkdir_p!(output_dir)

    images
    |> Enum.with_index(1)
    |> Enum.map(fn {image, idx} -> save_image(image, idx, output_dir) end)
  end

  @doc """
  Get image statistics from extracted images.

  Returns a map with `:total_images`, `:total_bytes`, `:formats`
  (format => count, missing formats counted as `"unknown"`) and
  `:avg_size` (integer division, `0` for an empty collection).
  """
  def get_image_stats(images) do
    count = Enum.count(images)
    total_size = images |> Enum.map(&(&1["size"] || 0)) |> Enum.sum()
    formats = Enum.frequencies_by(images, &(&1["format"] || "unknown"))

    %{
      total_images: count,
      total_bytes: total_size,
      formats: formats,
      avg_size: if(count > 0, do: div(total_size, count), else: 0)
    }
  end

  # Decodes one image and writes it to disk, returning a tagged tuple.
  defp save_image(image, idx, output_dir) do
    format = image["format"] || "png"
    filename = Path.join(output_dir, "image_#{idx}.#{format}")

    # A missing "data" field is reported like a decode failure instead of
    # crashing inside Base.decode64/1.
    with data when is_binary(data) <- image["data"],
         {:ok, binary_data} <- Base.decode64(data) do
      File.write!(filename, binary_data)
      {:ok, filename}
    else
      _ -> {:error, "Failed to decode image #{idx}"}
    end
  end
end
|
||||
|
||||
# Usage example
|
||||
file_path = "document_with_images.pdf"
output_dir = "/tmp/extracted_images"

IO.puts("Extracting images from: #{file_path}\n")

case ImageExtractor.extract_images(file_path) do
  {:ok, images} ->
    IO.puts("Found #{length(images)} image(s)\n")

    # Get image statistics
    stats = ImageExtractor.get_image_stats(images)
    IO.puts("=== Image Statistics ===")
    IO.puts("Total images: #{stats.total_images}")
    IO.puts("Total size: #{stats.total_bytes} bytes (#{div(stats.total_bytes, 1024)} KB)")
    IO.puts("Average size: #{stats.avg_size} bytes")
    IO.puts("Formats: #{inspect(stats.formats)}")
    IO.puts("")

    # Display individual image information
    IO.puts("=== Individual Images ===")

    images
    |> Enum.with_index(1)
    |> Enum.each(fn {image, idx} ->
      IO.puts("Image #{idx}:")
      IO.puts(" Format: #{image["format"]}")
      IO.puts(" Size: #{image["size"]} bytes")

      # Optional: show dimensions if available
      if image["width"] && image["height"] do
        IO.puts(" Dimensions: #{image["width"]}x#{image["height"]} pixels")
      end

      # Optional: show DPI if available
      if image["dpi"] do
        IO.puts(" DPI: #{image["dpi"]}")
      end

      IO.puts("")
    end)

    # Save images to disk; save_images/2 returns one tagged tuple per image
    results = ImageExtractor.save_images(images, output_dir)
    successful = Enum.count(results, &match?({:ok, _path}, &1))
    IO.puts("Saved #{successful}/#{length(results)} images to #{output_dir}")

  {:error, reason} ->
    # inspect/1 because the reason is not guaranteed to be printable as a string
    IO.puts("Error extracting images: #{inspect(reason)}")
end
|
||||
```
|
||||
17
docs/snippets/elixir/ocr/image_extraction.md
Normal file
17
docs/snippets/elixir/ocr/image_extraction.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```elixir title="Elixir"
|
||||
# Image extraction options, kept separate so the JSON wrapper stays small
image_options = %{
  "extract_images" => true,
  "target_dpi" => 200,
  "max_image_dimension" => 2048,
  "inject_placeholders" => true,
  "auto_adjust_dpi" => true
}

config_json = Jason.encode!(%{"images" => image_options})

{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)

content_length = String.length(result.content)
IO.puts("Content length: #{content_length}")

# Only report the image count when the result actually carries images
images = result.images

if images do
  IO.puts("Images extracted: #{length(images)}")
end
|
||||
```
|
||||
174
docs/snippets/elixir/ocr/image_preprocessing.exs
Normal file
174
docs/snippets/elixir/ocr/image_preprocessing.exs
Normal file
@@ -0,0 +1,174 @@
|
||||
```elixir title="Elixir"
|
||||
# Preprocess images before OCR for improved text extraction
|
||||
# Image preprocessing can enhance OCR accuracy on poor quality scans
|
||||
|
||||
alias Kreuzberg.ExtractionConfig
|
||||
|
||||
defmodule ImagePreprocessor do
  @moduledoc """
  Preprocess images for optimal OCR results.

  Provides utilities for image enhancement before text extraction and a
  helper that compares extraction output with and without preprocessing.
  Every extraction in this module uses the same OCR backend, chunking and
  language-detection settings so that results are directly comparable.
  """

  @doc """
  Extract with image preprocessing enabled.

  Preprocessing improves OCR accuracy on documents with:
  - Low contrast text
  - Skewed pages
  - Poor image quality
  - Faded or faint text

  ## Options

    * `:deskew` - defaults to `true`
    * `:denoise` - defaults to `true`
    * `:binarize` - defaults to `false`
    * `:brightness_threshold` - defaults to `50`

  Returns whatever `Kreuzberg.extract_file/3` returns.
  """
  def extract_with_preprocessing(file_path, opts \\ []) do
    # NOTE(review): these option names are passed through as-is; confirm they
    # match the preprocessing options the Tesseract backend accepts.
    preprocessing = %{
      "enabled" => true,
      "deskew" => Keyword.get(opts, :deskew, true),
      "denoise" => Keyword.get(opts, :denoise, true),
      "binarize" => Keyword.get(opts, :binarize, false),
      "brightness_threshold" => Keyword.get(opts, :brightness_threshold, 50)
    }

    Kreuzberg.extract_file(file_path, nil, build_config(preprocessing))
  end

  @doc """
  Extract with aggressive preprocessing for poor quality documents.

  Use this for heavily degraded or difficult-to-read documents. Enables
  deskew, denoise and binarize with a brightness threshold of 75.
  """
  def extract_with_aggressive_preprocessing(file_path) do
    extract_with_preprocessing(file_path,
      deskew: true,
      denoise: true,
      binarize: true,
      brightness_threshold: 75
    )
  end

  @doc """
  Compare extraction quality with and without preprocessing.

  Useful for determining optimal preprocessing settings. Prints a summary
  and returns a map with `:standard_content`, `:preprocessed_content`,
  `:standard_size`, `:preprocessed_size` and `:improvement` (percentage
  change in content size as a float, `0.0` when the standard run is empty).

  Raises `MatchError` if either extraction does not return `{:ok, result}`.
  """
  def compare_preprocessing_quality(file_path) do
    IO.puts("Comparing preprocessing options...\n")

    # Extract without preprocessing. The baseline shares chunking and
    # language-detection settings with the preprocessed run; otherwise the
    # chunk counts below would compare a chunked run with an unchunked one.
    IO.puts("Extracting without preprocessing...")
    config_standard = build_config(%{"enabled" => false})
    {:ok, result_standard} = Kreuzberg.extract_file(file_path, nil, config_standard)

    # Extract with preprocessing
    IO.puts("Extracting with preprocessing...")
    {:ok, result_preprocessed} = extract_with_aggressive_preprocessing(file_path)

    # Compare results
    standard_size = byte_size(result_standard.content)
    preprocessed_size = byte_size(result_preprocessed.content)

    IO.puts("\n=== Preprocessing Comparison ===")
    IO.puts("Standard extraction: #{standard_size} bytes")
    IO.puts("Preprocessed extraction: #{preprocessed_size} bytes")
    IO.puts("Size difference: #{abs(preprocessed_size - standard_size)} bytes")

    # Compare chunk quality
    standard_chunks = result_standard.chunks || []
    preprocessed_chunks = result_preprocessed.chunks || []
    IO.puts("\nStandard chunks: #{length(standard_chunks)}")
    IO.puts("Preprocessed chunks: #{length(preprocessed_chunks)}")

    # Show content comparison
    IO.puts("\n=== Content Comparison ===")
    IO.puts("Standard preview:")
    IO.puts(String.slice(result_standard.content, 0..199))
    IO.puts("\n...")

    IO.puts("\nPreprocessed preview:")
    IO.puts(String.slice(result_preprocessed.content, 0..199))
    IO.puts("\n...")

    # Return comparison data
    %{
      standard_content: result_standard.content,
      preprocessed_content: result_preprocessed.content,
      standard_size: standard_size,
      preprocessed_size: preprocessed_size,
      improvement: improvement_percent(standard_size, preprocessed_size)
    }
  end

  # Builds the extraction config shared by all runs; only the preprocessing
  # map differs between them. Caching is disabled so each run does real work.
  defp build_config(preprocessing) do
    %ExtractionConfig{
      ocr: %{
        "enabled" => true,
        "backend" => "tesseract",
        "language" => "eng",
        "preprocessing" => preprocessing
      },
      chunking: %{
        "enabled" => true,
        "max_characters" => 1500,
        "overlap" => 150
      },
      language_detection: %{
        "enabled" => true
      },
      use_cache: false
    }
  end

  # Percentage change in size relative to the standard run, always a float.
  defp improvement_percent(0, _preprocessed_size), do: 0.0

  defp improvement_percent(standard_size, preprocessed_size) do
    Float.round((preprocessed_size - standard_size) / standard_size * 100, 2)
  end
end
|
||||
|
||||
# Usage examples
|
||||
|
||||
# Example 1: Standard preprocessing
file_path = "scanned_document.pdf"

IO.puts("Example 1: Standard Extraction with Preprocessing\n")

case ImagePreprocessor.extract_with_preprocessing(file_path) do
  {:ok, result} ->
    IO.puts("Extraction successful!")
    IO.puts("Content length: #{byte_size(result.content)} bytes")
    IO.puts("Chunks created: #{length(result.chunks || [])}")
    IO.puts("Preview: #{String.slice(result.content, 0..100)}...\n")

  {:error, reason} ->
    # inspect/1 because the reason is not guaranteed to be printable as a string
    IO.puts("Error: #{inspect(reason)}\n")
end

# Example 2: Aggressive preprocessing for difficult documents
IO.puts("Example 2: Aggressive Preprocessing for Poor Quality\n")

case ImagePreprocessor.extract_with_aggressive_preprocessing(file_path) do
  {:ok, result} ->
    IO.puts("Aggressive preprocessing extraction successful!")
    IO.puts("Content length: #{byte_size(result.content)} bytes")

  {:error, reason} ->
    IO.puts("Error: #{inspect(reason)}\n")
end

# Example 3: Compare preprocessing options
IO.puts("Example 3: Compare Preprocessing Quality\n")

# compare_preprocessing_quality/1 raises when an extraction fails, so the
# script reports the failure here instead of aborting.
try do
  comparison = ImagePreprocessor.compare_preprocessing_quality(file_path)
  IO.puts("\nImprovement with preprocessing: #{comparison.improvement}%")
rescue
  error ->
    IO.puts("Comparison failed: #{Exception.message(error)}")
end
|
||||
```
|
||||
19
docs/snippets/elixir/ocr/image_preprocessing.md
Normal file
19
docs/snippets/elixir/ocr/image_preprocessing.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```elixir title="Elixir"
|
||||
# Preprocessing steps applied before Tesseract reads the page
preprocessing = %{
  "target_dpi" => 300,
  "denoise" => true,
  "deskew" => true,
  "contrast_enhance" => true,
  "binarization_method" => "otsu"
}

ocr_options = %{
  "backend" => "tesseract",
  "tesseract_config" => %{"preprocessing" => preprocessing}
}

config_json = Jason.encode!(%{"ocr" => ocr_options})

{:ok, result} = Kreuzberg.extract_file_sync("scanned.pdf", "application/pdf", config_json)

extracted_text = result.content
IO.puts(extracted_text)
|
||||
```
|
||||
11
docs/snippets/elixir/ocr/ocr_easyocr.md
Normal file
11
docs/snippets/elixir/ocr/ocr_easyocr.md
Normal file
@@ -0,0 +1,11 @@
|
||||
```elixir title="Elixir"
|
||||
# OCR settings for the EasyOCR backend
ocr_options = %{"backend" => "easyocr", "language" => "en"}
config_json = Jason.encode!(%{"ocr" => ocr_options})

# NOTE(review): this match assumes extract_file_async/3 returns {:ok, result}
# directly; if it returns a Task, the task must be awaited first — confirm
# against the Kreuzberg API.
{:ok, result} = Kreuzberg.extract_file_async("document.pdf", "application/pdf", config_json)

extracted_text = result.content
IO.puts("Extracted text: #{extracted_text}")
|
||||
```
|
||||
21
docs/snippets/elixir/ocr/ocr_elements.md
Normal file
21
docs/snippets/elixir/ocr/ocr_elements.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```elixir title="Elixir"
|
||||
alias Kreuzberg.ExtractionConfig
|
||||
|
||||
# PaddleOCR backend, English
ocr_settings = %{"enabled" => true, "backend" => "paddle-ocr", "language" => "en"}
config = %ExtractionConfig{ocr: ocr_settings}

{:ok, result} = Kreuzberg.extract_file("scanned.pdf", nil, config)

# Print every recognised element; a missing element list is treated as empty
elements = result.ocr_elements || []

Enum.each(elements, fn element ->
  recognition = Float.round(element.confidence.recognition, 2)

  IO.puts("Text: #{element.text}")
  IO.puts("Confidence: #{recognition}")
  IO.puts("Geometry: #{inspect(element.geometry)}")

  # Rotation is optional
  if element.rotation do
    IO.puts("Rotation: #{element.rotation.angle}°")
  end

  IO.puts("")
end)
|
||||
```
|
||||
34
docs/snippets/elixir/ocr/ocr_extraction.exs
Normal file
34
docs/snippets/elixir/ocr/ocr_extraction.exs
Normal file
@@ -0,0 +1,34 @@
|
||||
```elixir title="Elixir"
|
||||
alias Kreuzberg.ExtractionConfig
|
||||
|
||||
# Extract text from a scanned PDF using OCR
|
||||
# Tesseract processes the document and returns structured content
|
||||
|
||||
ocr_settings = %{"enabled" => true, "backend" => "tesseract"}
chunking_settings = %{"max_characters" => 1500, "overlap" => 150}

config = %ExtractionConfig{
  ocr: ocr_settings,
  chunking: chunking_settings,
  language_detection: %{"enabled" => true},
  use_cache: true
}

{:ok, result} = Kreuzberg.extract_file("scanned_invoice.pdf", nil, config)

# Pull out the parts of the result used below; chunks and metadata may be absent
content = result.content
chunks = result.chunks || []
metadata = result.metadata || %{}

IO.puts("OCR Extraction Complete:")
IO.puts("Content length: #{byte_size(content)} bytes")
IO.puts("Number of chunks: #{length(chunks)}")
IO.puts("Detected languages: #{inspect(result.detected_languages)}")

creation_date = metadata["creation_date"] || "N/A"
IO.puts("Creation date: #{creation_date}")

IO.puts("\nFirst 200 characters of extracted text:")
preview = String.slice(content, 0..199)
IO.puts(preview)
|
||||
```
|
||||
11
docs/snippets/elixir/ocr/ocr_extraction.md
Normal file
11
docs/snippets/elixir/ocr/ocr_extraction.md
Normal file
@@ -0,0 +1,11 @@
|
||||
```elixir title="Elixir"
|
||||
# Tesseract backend, English
ocr_options = %{"backend" => "tesseract", "language" => "eng"}
config_json = Jason.encode!(%{"ocr" => ocr_options})

{:ok, result} = Kreuzberg.extract_file_sync("scanned.pdf", "application/pdf", config_json)

extracted_text = result.content
IO.puts(extracted_text)
|
||||
```
|
||||
96
docs/snippets/elixir/ocr/ocr_force_all_pages.exs
Normal file
96
docs/snippets/elixir/ocr/ocr_force_all_pages.exs
Normal file
@@ -0,0 +1,96 @@
|
||||
```elixir title="Elixir"
|
||||
# Force OCR processing on all pages of a document
# By default, OCR is only applied when needed (scanned content detected)
# Set force_ocr on the extraction config to ensure OCR runs on every page regardless

alias Kreuzberg.ExtractionConfig

# Configuration with OCR forced on all pages
config = %ExtractionConfig{
  ocr: %{
    "enabled" => true,
    "backend" => "tesseract",
    "language" => "eng"
  },
  chunking: %{
    "enabled" => true,
    "max_characters" => 1500,
    "overlap" => 150
  },
  language_detection: %{
    "enabled" => true
  },
  use_cache: true,
  # Force OCR to run on every page, not just scanned content.
  # This is the top-level flag the other force-OCR configurations use.
  force_ocr: true
}

# Size in bytes of one chunk's text.
# NOTE(review): the chunk shape is not visible here; plain binaries are
# measured directly, maps/structs are read through a content field, and
# anything else counts as 0 — confirm against the Kreuzberg chunk type.
chunk_bytes = fn
  chunk when is_binary(chunk) ->
    byte_size(chunk)

  %{} = chunk ->
    case Map.get(chunk, :content) || Map.get(chunk, "content") do
      text when is_binary(text) -> byte_size(text)
      _ -> 0
    end

  _other ->
    0
end

IO.puts("Starting OCR extraction with force_ocr enabled...")
start_time = System.monotonic_time(:millisecond)

case Kreuzberg.extract_file("mixed_document.pdf", nil, config) do
  {:ok, result} ->
    elapsed_ms = System.monotonic_time(:millisecond) - start_time

    IO.puts("\n=== OCR Extraction Complete ===\n")

    # Show timing information
    IO.puts("Processing time: #{elapsed_ms}ms")
    IO.puts("Content extracted: #{byte_size(result.content)} bytes")

    # Metadata shows OCR was performed
    metadata = result.metadata || %{}

    if metadata["ocr_applied"] do
      IO.puts("OCR applied to all pages: Yes")
    end

    # Show language detection results
    languages = result.detected_languages || []
    IO.puts("\nDetected languages (#{length(languages)}):")

    Enum.each(languages, fn lang ->
      IO.puts(" - #{lang}")
    end)

    # Show chunking results (useful for RAG/search)
    chunks = result.chunks || []
    IO.puts("\nChunks created: #{length(chunks)}")

    avg_chunk_size =
      if Enum.empty?(chunks) do
        0
      else
        total_size = chunks |> Enum.map(chunk_bytes) |> Enum.sum()
        div(total_size, length(chunks))
      end

    IO.puts("Average chunk size: #{avg_chunk_size} bytes")

    # Display content preview
    IO.puts("\nContent preview (first 300 characters):")
    preview = String.slice(result.content, 0..299)
    IO.puts(preview)
    IO.puts("...\n")

    # Show any extracted tables
    tables = result.tables || []

    if not Enum.empty?(tables) do
      IO.puts("Tables found: #{length(tables)}")

      tables
      |> Enum.with_index(1)
      |> Enum.each(fn {table, idx} ->
        cells = table["cells"] || []
        IO.puts(" Table #{idx}: #{length(cells)} rows")
      end)

      IO.puts("")
    end

    # Show any extracted images
    images = result.images || []

    if not Enum.empty?(images) do
      IO.puts("Images extracted: #{length(images)}")

      images
      |> Enum.with_index(1)
      |> Enum.each(fn {image, idx} ->
        IO.puts(" Image #{idx}: #{image["format"]} - #{image["size"]} bytes")
      end)
    end

  {:error, reason} ->
    elapsed_ms = System.monotonic_time(:millisecond) - start_time
    IO.puts("OCR extraction failed after #{elapsed_ms}ms")
    IO.puts("Error: #{inspect(reason)}")
end
|
||||
```
|
||||
11
docs/snippets/elixir/ocr/ocr_force_all_pages.md
Normal file
11
docs/snippets/elixir/ocr/ocr_force_all_pages.md
Normal file
@@ -0,0 +1,11 @@
|
||||
```elixir title="Elixir"
|
||||
# Tesseract backend with OCR forced via the top-level force_ocr flag
extraction_options = %{
  "ocr" => %{"backend" => "tesseract"},
  "force_ocr" => true
}

config_json = Jason.encode!(extraction_options)

{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)

extracted_text = result.content
IO.puts(extracted_text)
|
||||
```
|
||||
39
docs/snippets/elixir/ocr/ocr_multi_language.exs
Normal file
39
docs/snippets/elixir/ocr/ocr_multi_language.exs
Normal file
@@ -0,0 +1,39 @@
|
||||
```elixir title="Elixir"
|
||||
alias Kreuzberg.ExtractionConfig
|
||||
|
||||
# Perform OCR extraction with multiple languages
|
||||
# Tesseract can recognize text in multiple languages simultaneously
|
||||
# Combine language codes with '+' separator: "eng+fra+deu" for English, French, German
|
||||
|
||||
# English, French and German combined with Tesseract's '+' separator
ocr_settings = %{
  "enabled" => true,
  "backend" => "tesseract",
  "language" => "eng+fra+deu"
}

chunking_settings = %{"max_characters" => 2000, "overlap" => 200}

config = %ExtractionConfig{
  ocr: ocr_settings,
  chunking: chunking_settings,
  language_detection: %{"enabled" => true},
  use_cache: true,
  force_ocr: true
}

{:ok, result} = Kreuzberg.extract_file("multilingual_document.pdf", nil, config)

# Results will contain text recognized in all specified languages
IO.puts("Multi-language OCR Extraction:")
IO.puts("Supported languages: English, French, German")
IO.puts("Content extracted: #{byte_size(result.content)} bytes")
IO.puts("Detected languages: #{inspect(result.detected_languages)}")

# A missing chunk list counts as zero chunks
chunk_count = length(result.chunks || [])
IO.puts("Chunks created: #{chunk_count}")

IO.puts("\nExtracted content preview:")
preview = String.slice(result.content, 0..250)
IO.puts(preview)

# Access metadata if available
metadata = result.metadata || %{}
page_count = metadata["pages"] || "Unknown"
format = metadata["format"] || "Unknown"

IO.puts("\nMetadata:")
IO.puts("Pages: #{page_count}")
IO.puts("Format: #{format}")
|
||||
```
|
||||
11
docs/snippets/elixir/ocr/ocr_multi_language.md
Normal file
11
docs/snippets/elixir/ocr/ocr_multi_language.md
Normal file
@@ -0,0 +1,11 @@
|
||||
```elixir title="Elixir"
|
||||
# Tesseract with English, German and French combined
ocr_options = %{"backend" => "tesseract", "language" => "eng+deu+fra"}
config_json = Jason.encode!(%{"ocr" => ocr_options})

{:ok, result} = Kreuzberg.extract_file_sync("multilingual.pdf", "application/pdf", config_json)

extracted_text = result.content
IO.puts(extracted_text)
|
||||
```
|
||||
13
docs/snippets/elixir/ocr/ocr_paddleocr.md
Normal file
13
docs/snippets/elixir/ocr/ocr_paddleocr.md
Normal file
@@ -0,0 +1,13 @@
|
||||
```elixir title="Elixir"
|
||||
alias Kreuzberg.ExtractionConfig
|
||||
|
||||
# PaddleOCR backend, English.
# Add "model_tier" => "server" to this map for max accuracy
ocr_settings = %{"enabled" => true, "backend" => "paddle-ocr", "language" => "en"}

config = %ExtractionConfig{ocr: ocr_settings}

{:ok, result} = Kreuzberg.extract_file("scanned_document.pdf", nil, config)

extracted_text = result.content
IO.puts("OCR Extracted content:")
IO.puts(extracted_text)
|
||||
```
|
||||
14
docs/snippets/elixir/ocr/tesseract_basic.exs
Normal file
14
docs/snippets/elixir/ocr/tesseract_basic.exs
Normal file
@@ -0,0 +1,14 @@
|
||||
```elixir title="Elixir"
|
||||
alias Kreuzberg.ExtractionConfig
|
||||
|
||||
# Minimal Tesseract OCR configuration
ocr_settings = %{"enabled" => true, "backend" => "tesseract"}
config = %ExtractionConfig{ocr: ocr_settings}

{:ok, result} = Kreuzberg.extract_file("scanned_document.pdf", nil, config)

IO.puts("OCR Extracted content:")
IO.puts(result.content)

metadata_summary = inspect(result.metadata)
IO.puts("Metadata: #{metadata_summary}")
|
||||
```
|
||||
Reference in New Issue
Block a user