Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,53 @@
# Advanced Chunking Configuration
#
# Demonstrates fine-grained chunking control (chunk size, overlap, boundary
# handling), followed by a Markdown-aware variant that prepends heading context.
alias Kreuzberg.ExtractionConfig

# Size limits, overlap between neighbouring chunks, and preferred split points.
chunking_opts = %{
  "enabled" => true,
  "max_characters" => 1000,
  "overlap" => 200,
  "min_size" => 100,
  "respect_boundaries" => true,
  "split_on" => ["sentence", "paragraph"]
}

config = %ExtractionConfig{chunking: chunking_opts}

# Any non-{:ok, _} return raises a MatchError here.
{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)

# Treat a missing chunk list as empty.
chunks = result.chunks || []
IO.puts("Total chunks: #{length(chunks)}")

for chunk <- chunks do
  text = chunk["content"]

  IO.inspect(%{
    text_length: String.length(text),
    page: chunk["page"],
    # Heuristic: a chunk ending in whitespace probably stopped mid-boundary.
    boundaries_respected: not String.ends_with?(text, [" ", "\n"])
  })
end

# Markdown chunker configured to prepend heading context to every chunk.
heading_opts = %{
  "enabled" => true,
  "chunker_type" => "markdown",
  "prepend_heading_context" => true
}

config_with_headings = %ExtractionConfig{chunking: heading_opts}

{:ok, result_headings} = Kreuzberg.extract_file("document.md", nil, config_with_headings)

heading_chunks = result_headings.chunks

if heading_chunks do
  IO.puts("Generated #{length(heading_chunks)} chunks with prepended headings")

  for chunk <- heading_chunks do
    # Preview the start of the chunk, where the heading context is expected.
    preview = String.slice(chunk["content"], 0..80)
    IO.puts("Chunk preview: #{preview}...")
  end
end

View File

@@ -0,0 +1,36 @@
# Advanced Language Detection Configuration
#
# Enables detection of every language in a document, prints each detection,
# then counts the detections at or above a 0.8 confidence cut-off.
alias Kreuzberg.ExtractionConfig

detection_opts = %{
  "enabled" => true,
  "detect_all" => true,
  "min_confidence" => 0.8
}

config = %ExtractionConfig{language_detection: detection_opts}

# Any non-{:ok, _} return raises a MatchError here.
{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)

# Treat a missing detection list as empty.
detection_results = result.detected_languages || []

IO.puts("Detected languages:")

for lang_result <- detection_results do
  language = Map.get(lang_result, "language", "unknown")
  confidence = Map.get(lang_result, "confidence", "unknown")
  IO.puts(" - #{language}: #{confidence}")
end

# Entries without a confidence value are treated as 0 and therefore dropped.
high_confidence_languages =
  Enum.filter(detection_results, &(Map.get(&1, "confidence", 0) >= 0.8))

IO.puts("\nHigh confidence languages (>= 0.8): #{length(high_confidence_languages)}")

View File

@@ -0,0 +1,22 @@
# Advanced keyword extraction configuration
#
# Extracts keywords and reports them grouped into score ranges.
alias Kreuzberg.ExtractionConfig

config = %ExtractionConfig{
  keyword_extraction: %{
    "enabled" => true,
    "max_keywords" => 20,
    "min_score" => 0.6,
    "algorithm" => "tfidf"
  }
}

# Any non-{:ok, _} return raises a MatchError here.
{:ok, result} = Kreuzberg.extract_file("research_paper.pdf", nil, config)

if result.keywords do
  # Only numerically scored keywords are bucketed. In Erlang term ordering every
  # atom (including nil) compares greater than any number, so a keyword without
  # a score would otherwise be reported as "high confidence".
  scored = Enum.filter(result.keywords, &is_number(&1["score"]))

  # Group by score ranges: >= 0.8 is high, 0.6 up to (not including) 0.8 is medium.
  {high_score, remainder} = Enum.split_with(scored, &(&1["score"] >= 0.8))
  medium_score = Enum.filter(remainder, &(&1["score"] >= 0.6))

  IO.puts("High confidence keywords (#{length(high_score)}):")
  Enum.each(high_score, fn kw -> IO.puts(" - #{kw["word"]} (#{kw["score"]})") end)

  # The medium group was previously computed but never shown.
  IO.puts("Medium confidence keywords (#{length(medium_score)}):")
  Enum.each(medium_score, fn kw -> IO.puts(" - #{kw["word"]} (#{kw["score"]})") end)
end

View File

@@ -0,0 +1,39 @@
# Quality processing configuration
#
# Runs extraction with quality processing and OCR enabled, then prints a content
# preview, the quality score (when one is reported) and the number of images.
alias Kreuzberg.ExtractionConfig

quality_opts = %{
  "enabled" => true,
  "min_quality_score" => 0.75,
  "remove_noise" => true,
  "enhance_clarity" => true
}

ocr_opts = %{"enabled" => true, "backend" => "tesseract"}

config = %ExtractionConfig{quality_processing: quality_opts, ocr: ocr_opts}

# Any non-{:ok, _} return raises a MatchError here.
{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)

IO.puts("=== Quality Processing Results ===\n")

# Preview the beginning of the extracted content.
content_preview = String.slice(result.content, 0..300)
IO.puts("Extracted Content (first 300 chars):")
IO.puts(content_preview)
IO.puts("\nTotal content size: #{byte_size(result.content)} bytes")

# Report the score only when it is present and positive.
quality_score = result.quality_score

if quality_score && quality_score > 0 do
  status = if quality_score >= 0.75, do: "Acceptable", else: "Below threshold"
  IO.puts("\nQuality Score: #{quality_score}")
  IO.puts("Quality Status: #{status}")
end

# Report the image count when the result carries a non-empty image list.
images = result.images

if images && length(images) > 0 do
  IO.puts("\nImages found: #{length(images)}")
end

View File

@@ -0,0 +1,21 @@
# Token reduction for different LLM context windows
#
# Builds two configurations that differ in token budget and reduction strategy.
alias Kreuzberg.ExtractionConfig

# For GPT-4: small budget, "intelligent" strategy, structure preserved.
gpt4_reduction = %{
  "enabled" => true,
  "target_tokens" => 8000,
  "strategy" => "intelligent",
  "preserve_structure" => true
}

config_gpt4 = %ExtractionConfig{token_reduction: gpt4_reduction}

# For Claude: large budget, "minimal" strategy.
claude_reduction = %{
  "enabled" => true,
  "target_tokens" => 100_000,
  "strategy" => "minimal"
}

config_claude = %ExtractionConfig{token_reduction: claude_reduction}

View File

@@ -0,0 +1,33 @@
# Vector Database Integration
#
# Chunks a document and reshapes every chunk into a content + metadata map that
# is ready to be handed to a vector database client.
alias Kreuzberg.ExtractionConfig

chunking_opts = %{"enabled" => true, "max_characters" => 512, "overlap" => 50}
config = %ExtractionConfig{chunking: chunking_opts}

# Any non-{:ok, _} return raises a MatchError here.
{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)

# One document per chunk; a missing chunk list yields no documents.
documents =
  for chunk <- result.chunks || [] do
    text = chunk["content"]

    %{
      content: text,
      metadata: %{page: chunk["page"], char_count: String.length(text)}
    }
  end

IO.puts("Prepared #{length(documents)} documents for vector DB")

# The documents list can now be sent to your vector database
# Example: documents |> MyVectorDB.index_documents()

View File

@@ -0,0 +1,13 @@
# Extract with chunking and track page boundaries
#
# Prints, for every chunk, the page recorded on it ("unknown" when absent).
chunking_opts = %{"enabled" => true, "max_characters" => 500}

config = %Kreuzberg.ExtractionConfig{
  chunking: chunking_opts,
  track_page_boundaries: true
}

# Any non-{:ok, _} return raises a MatchError here.
{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)

# Number the chunks from 1 and report each chunk's source page.
for {chunk, idx} <- Enum.with_index(result.chunks || [], 1) do
  page = chunk["page"] || "unknown"
  IO.puts("Chunk #{idx} from page #{page}")
end

View File

@@ -0,0 +1,23 @@
```elixir title="Elixir"
# Chunk a PDF and pair every chunk with the page recorded on it.
config_json =
  Jason.encode!(%{
    "chunking" => %{
      "enabled" => true,
      "max_characters" => 1024,
      "overlap" => 128
    }
  })

# Any non-{:ok, _} return raises a MatchError here.
{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)

# Map chunks to their source pages.
# `|| []` guards against a nil chunk list, which would otherwise crash Enum.map/2.
chunks_with_pages =
  Enum.map(result.chunks || [], fn chunk ->
    %{
      "chunk_id" => chunk["id"],
      "content" => chunk["content"],
      "page_number" => chunk["page"]
    }
  end)

IO.inspect(chunks_with_pages, label: "Chunks with Page Mapping")
```

View File

@@ -0,0 +1,45 @@
```elixir title="Elixir"
# Advanced chunking driven by a JSON-encoded configuration.
chunking_opts = %{
  "enabled" => true,
  "max_characters" => 1000,
  "overlap" => 200,
  "min_size" => 100,
  "respect_boundaries" => true,
  "split_on" => ["sentence", "paragraph"]
}

config_json = Jason.encode!(%{"chunking" => chunking_opts})

# Any non-{:ok, _} return raises a MatchError here.
{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)

# Treat a missing chunk list as empty.
chunks = result.chunks || []
IO.puts("Total chunks: #{length(chunks)}")

for chunk <- chunks do
  text = chunk["content"]

  IO.inspect(%{
    text_length: String.length(text),
    page: chunk["page"],
    # Heuristic: a chunk ending in whitespace probably stopped mid-boundary.
    boundaries_respected: not String.ends_with?(text, [" ", "\n"])
  })
end
```
```elixir title="Elixir - Prepend Heading Context"
# Markdown chunker configured to prepend heading context to every chunk.
heading_opts = %{
  "enabled" => true,
  "chunker_type" => "markdown",
  "prepend_heading_context" => true
}

config_json = Jason.encode!(%{"chunking" => heading_opts})

# Any non-{:ok, _} return raises a MatchError here.
{:ok, result} = Kreuzberg.extract_file_sync("document.md", "text/markdown", config_json)

heading_chunks = result.chunks

if heading_chunks do
  IO.puts("Generated #{length(heading_chunks)} chunks with prepended headings")

  for chunk <- heading_chunks do
    preview = String.slice(chunk["content"], 0..80)
    IO.puts("Chunk preview: #{preview}...")
  end
end
```

View File

@@ -0,0 +1,26 @@
# Configure chunking for RAG/vector search
config = %Kreuzberg.ExtractionConfig{
  chunking: %{
    "enabled" => true,
    "max_characters" => 512,
    "overlap" => 50,
    "respect_boundaries" => true
  }
}

# Any non-{:ok, _} return raises a MatchError here.
{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)

# Prepare chunks for vector embedding and search.
# `|| []` guards against a nil chunk list, which would otherwise crash Enum.map/2.
chunks_for_embedding =
  Enum.map(result.chunks || [], fn chunk ->
    %{
      "id" => chunk["id"],
      "content" => chunk["content"],
      "metadata" => %{
        "page" => chunk["page"],
        "source" => "document.pdf"
      }
    }
  end)

IO.inspect(chunks_for_embedding, label: "Chunks Ready for RAG")

View File

@@ -0,0 +1,27 @@
```elixir title="Elixir"
# Configure chunking for RAG/vector search via a JSON-encoded configuration.
config_json =
  Jason.encode!(%{
    "chunking" => %{
      "enabled" => true,
      "max_characters" => 512,
      "overlap" => 50,
      "respect_boundaries" => true
    }
  })

# Any non-{:ok, _} return raises a MatchError here.
{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)

# Prepare chunks for vector embedding and search.
# `|| []` guards against a nil chunk list, which would otherwise crash Enum.map/2.
chunks_for_embedding =
  Enum.map(result.chunks || [], fn chunk ->
    %{
      "id" => chunk["id"],
      "content" => chunk["content"],
      "metadata" => %{
        "page" => chunk["page"],
        "source" => "document.pdf"
      }
    }
  end)

IO.inspect(chunks_for_embedding, label: "Chunks Ready for RAG")
```

View File

@@ -0,0 +1,20 @@
# Use all major features together
#
# Enables OCR, chunking, image extraction, language detection and keyword
# extraction in one configuration, then prints whichever outputs are present.
feature_opts = [
  ocr: %{"enabled" => true},
  chunking: %{"enabled" => true, "max_characters" => 1000},
  images: %{"extract" => true},
  language_detection: %{"enabled" => true},
  keyword_extraction: %{"enabled" => true}
]

config = struct!(Kreuzberg.ExtractionConfig, feature_opts)

# Any non-{:ok, _} return raises a MatchError here.
{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)

# Dump the whole result first.
IO.inspect(result, label: "Extraction Result")

# Then print each feature output only when it is present.
if result.content do
  IO.puts("Text: #{String.slice(result.content, 0..100)}")
end

if result.detected_languages do
  IO.puts("Language: #{inspect(result.detected_languages)}")
end

if result.metadata["keywords"] do
  IO.puts("Keywords: #{inspect(result.metadata["keywords"])}")
end

if result.chunks do
  IO.puts("Chunks: #{length(result.chunks)}")
end

if result.images do
  IO.puts("Images: #{length(result.images)}")
end

View File

@@ -0,0 +1,103 @@
```elixir title="Elixir"
alias Kreuzberg.ExtractionConfig

# Build comprehensive configuration for all features
config = %ExtractionConfig{
  # Enable OCR for scanned documents
  ocr: %{
    "enabled" => true,
    "backend" => "tesseract",
    "language" => "eng",
    "force_all_pages" => false
  },
  # Configure chunking for RAG applications
  chunking: %{
    "enabled" => true,
    "max_characters" => 1000,
    "overlap" => 100
  },
  # Extract images from documents
  images: %{
    "extract" => true
  },
  # Enable language detection
  language_detection: %{
    "enabled" => true
  },
  # Use caching for performance
  use_cache: true
}

# Extract file with full configuration and report every section of the result.
case Kreuzberg.extract_file("document.pdf", nil, config) do
  {:ok, result} ->
    IO.puts("=== Extraction Successful ===\n")

    # 1. Process content
    IO.puts("--- Content ---")
    content_preview = String.slice(result.content, 0..100)
    IO.puts("Preview: #{content_preview}...")
    IO.puts("Total size: #{byte_size(result.content)} bytes\n")

    # 2. Process metadata
    IO.puts("--- Metadata ---")
    IO.puts("MIME type: #{result.mime_type}")
    metadata = result.metadata || %{}
    IO.puts("Metadata keys: #{inspect(Map.keys(metadata))}")

    # Handle PDF-specific metadata
    case metadata["pdf"] do
      pdf_meta when is_map(pdf_meta) ->
        IO.puts(" Pages: #{pdf_meta["page_count"]}")
        IO.puts(" Author: #{pdf_meta["author"]}")
        IO.puts(" Title: #{pdf_meta["title"]}")

      _ ->
        nil
    end

    IO.puts("")

    # 3. Process tables
    IO.puts("--- Tables ---")
    tables = result.tables || []
    IO.puts("Total tables: #{length(tables)}")

    tables
    |> Enum.with_index(1)
    |> Enum.each(fn {table, idx} ->
      cells = table["cells"] || []
      IO.puts(" Table #{idx}: #{length(cells)} rows")
      markdown = table["markdown"]
      if markdown, do: IO.puts(" Markdown: #{String.slice(markdown, 0..50)}...")
    end)

    IO.puts("")

    # 4. Process chunks for RAG
    IO.puts("--- Chunks ---")
    chunks = result.chunks || []
    IO.puts("Total chunks: #{length(chunks)}")

    chunks
    |> Enum.with_index(1)
    |> Enum.each(fn {chunk, idx} ->
      # byte_size/1 only accepts binaries. The other examples read the text from
      # chunk["content"], so a map-shaped chunk is measured through that key;
      # a plain binary chunk is still measured directly.
      chunk_text = if is_binary(chunk), do: chunk, else: chunk["content"] || ""
      IO.puts(" Chunk #{idx}: #{byte_size(chunk_text)} bytes")
    end)

    IO.puts("")

    # 5. Process detected languages
    IO.puts("--- Language Detection ---")
    languages = result.detected_languages || []

    if Enum.empty?(languages) do
      IO.puts("No languages detected")
    else
      Enum.each(languages, fn lang ->
        # Interpolating a map raises; inspect anything that is not a string so
        # both plain language codes and structured entries can be printed.
        label = if is_binary(lang), do: lang, else: inspect(lang)
        IO.puts(" Language: #{label}")
      end)
    end

    IO.puts("")

    # 6. Process images
    IO.puts("--- Images ---")
    images = result.images || []
    IO.puts("Total images: #{length(images)}")

    images
    |> Enum.with_index(1)
    |> Enum.each(fn {image, idx} ->
      IO.puts(" Image #{idx}: #{image["format"]} (#{image["size"]} bytes)")
    end)

  {:error, reason} ->
    IO.puts("Extraction failed!")
    IO.puts("Error: #{inspect(reason)}")
end
```

View File

@@ -0,0 +1,27 @@
# Extract and prepare for embeddings
config = %Kreuzberg.ExtractionConfig{
  chunking: %{"enabled" => true, "max_characters" => 512},
  embeddings: %{"enabled" => true}
}

# Any non-{:ok, _} return raises a MatchError here.
{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)

# Process chunks with embeddings for semantic search.
# `|| []` guards against a nil chunk list, which would otherwise crash the pipeline.
embedded_chunks =
  (result.chunks || [])
  |> Enum.with_index(1)
  |> Enum.map(fn {chunk, idx} ->
    %{
      "chunk_id" => idx,
      "content" => chunk["content"],
      "embedding" => chunk["embedding"],
      "page" => chunk["page"],
      "metadata" => %{
        "document" => "document.pdf",
        "chunk_index" => idx
      }
    }
  end)

# Store embeddings in vector database
IO.puts("Prepared #{length(embedded_chunks)} chunks with embeddings")
IO.inspect(embedded_chunks, label: "Embedded Chunks")

View File

@@ -0,0 +1,32 @@
```elixir title="Elixir"
# Enable chunking and embeddings through a JSON-encoded configuration.
config_json =
  Jason.encode!(%{
    "chunking" => %{
      "enabled" => true,
      "max_characters" => 512
    },
    "embeddings" => %{
      "enabled" => true
    }
  })

# Any non-{:ok, _} return raises a MatchError here.
{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)

# Process chunks with embeddings for semantic search.
# `|| []` guards against a nil chunk list, which would otherwise crash the pipeline.
embedded_chunks =
  (result.chunks || [])
  |> Enum.with_index(1)
  |> Enum.map(fn {chunk, idx} ->
    %{
      "chunk_id" => idx,
      "content" => chunk["content"],
      "embedding" => chunk["embedding"],
      "page" => chunk["page"],
      "metadata" => %{
        "document" => "document.pdf",
        "chunk_index" => idx
      }
    }
  end)

IO.puts("Prepared #{length(embedded_chunks)} chunks with embeddings")
IO.inspect(embedded_chunks, label: "Embedded Chunks")
```

View File

@@ -0,0 +1,15 @@
```elixir title="Elixir"
# Request keyword extraction through a JSON-encoded configuration.
keyword_opts = %{
  "algorithm" => "Yake",
  "max_keywords" => 10,
  "min_score" => 0.3
}

config_json = Jason.encode!(%{"keywords" => keyword_opts})

# Any non-{:ok, _} return raises a MatchError here.
{:ok, result} = Kreuzberg.extract_file_sync("research_paper.pdf", "application/pdf", config_json)

# Print the keyword list only when the result carries one.
keywords = result.keywords

if keywords do
  IO.puts("Keywords: #{inspect(keywords)}")
end
```

View File

@@ -0,0 +1,30 @@
```elixir title="Elixir"
alias Kreuzberg.ExtractionConfig

# Ask for at most 15 keywords.
keyword_opts = %{"enabled" => true, "max_keywords" => 15}
config = %ExtractionConfig{keyword_extraction: keyword_opts}

outcome = Kreuzberg.extract_file("article.pdf", nil, config)

case outcome do
  {:error, reason} ->
    IO.puts("Extraction failed!")
    IO.puts("Error: #{inspect(reason)}")

  {:ok, result} ->
    IO.puts("=== Keyword Extraction ===\n")

    # List every keyword with its score, or say that none were returned.
    keywords = result.keywords

    if keywords do
      IO.puts("Extracted keywords:")

      for kw <- keywords do
        IO.puts("- #{kw["word"]}: #{kw["score"]}")
      end
    else
      IO.puts("No keywords extracted")
    end
end
```

View File

@@ -0,0 +1,18 @@
```elixir title="Elixir"
# Extract keywords and print each one with its score rounded to 4 decimals.
keyword_opts = %{
  "algorithm" => "Yake",
  "max_keywords" => 10,
  "min_score" => 0.3
}

config_json = Jason.encode!(%{"keywords" => keyword_opts})

# Any non-{:ok, _} return raises a MatchError here.
{:ok, result} = Kreuzberg.extract_file_sync("research_paper.pdf", "application/pdf", config_json)

keywords = result.keywords

if keywords do
  # The function head requires both keys; an entry lacking either one raises.
  print_keyword = fn %{"keyword" => kw, "score" => score} ->
    IO.puts("#{kw}: #{Float.round(score, 4)}")
  end

  Enum.each(keywords, print_keyword)
end
```

View File

@@ -0,0 +1,15 @@
```elixir title="Elixir"
# Detect a single language with a minimum confidence of 0.8.
detection_opts = %{
  "enabled" => true,
  "min_confidence" => 0.8,
  "detect_multiple" => false
}

config_json = Jason.encode!(%{"language_detection" => detection_opts})

# Any non-{:ok, _} return raises a MatchError here.
{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)

# Print the language only when one was reported.
language = result.language

if language do
  IO.puts("Detected language: #{language}")
end
```

View File

@@ -0,0 +1,32 @@
```elixir title="Elixir"
alias Kreuzberg.ExtractionConfig

# Detect all languages in multilingual document
detection_opts = %{"enabled" => true, "detect_all" => true}
config = %ExtractionConfig{language_detection: detection_opts}

outcome = Kreuzberg.extract_file("multilingual.pdf", nil, config)

case outcome do
  {:error, reason} ->
    IO.puts("Extraction failed!")
    IO.puts("Error: #{inspect(reason)}")

  {:ok, result} ->
    IO.puts("=== Language Detection ===\n")

    # Treat a missing language list as empty.
    languages = result.detected_languages || []

    if not Enum.empty?(languages) do
      IO.puts("Detected languages:")

      for lang <- languages do
        IO.puts("- #{lang}")
      end

      IO.puts("\nTotal languages: #{length(languages)}")
    else
      IO.puts("No languages detected")
    end
end
```

View File

@@ -0,0 +1,18 @@
```elixir title="Elixir"
# Detect multiple languages and print each with its confidence (4 decimals).
detection_opts = %{
  "enabled" => true,
  "min_confidence" => 0.7,
  "detect_multiple" => true
}

config_json = Jason.encode!(%{"language_detection" => detection_opts})

# Any non-{:ok, _} return raises a MatchError here.
{:ok, result} =
  Kreuzberg.extract_file_sync("multilingual_document.pdf", "application/pdf", config_json)

languages = result.languages

if languages do
  IO.puts("Detected languages:")

  # The function head requires both keys; an entry lacking either one raises.
  print_language = fn %{"language" => lang, "confidence" => conf} ->
    IO.puts(" - #{lang}: #{Float.round(conf, 4)}")
  end

  Enum.each(languages, print_language)
end
```

View File

@@ -0,0 +1,28 @@
```elixir title="Elixir"
# Any non-{:ok, _} return raises a MatchError here.
{:ok, result} = Kreuzberg.extract_file("document.pdf")

# Metadata is flat — format-specific fields are at the top level.
# `|| %{}` guards against nil metadata, which would otherwise crash Map.keys/1.
metadata = result.metadata || %{}
IO.puts("MIME type: #{result.mime_type}")
IO.puts("All metadata keys: #{inspect(Map.keys(metadata))}")

# Access PDF metadata directly from the flat map
page_count = metadata["page_count"]
if page_count, do: IO.puts("Page count: #{page_count}")

authors = metadata["authors"] || []
if authors != [], do: IO.puts("Authors: #{Enum.join(authors, ", ")}")

title = metadata["title"]
if title, do: IO.puts("Title: #{title}")

# Access HTML metadata directly from the flat map
{:ok, html_result} = Kreuzberg.extract_file("page.html")
html_meta = html_result.metadata || %{}

keywords = html_meta["keywords"] || []
if keywords != [], do: IO.puts("Keywords: #{Enum.join(keywords, ", ")}")

description = html_meta["description"]
if description, do: IO.puts("Description: #{description}")
```

View File

@@ -0,0 +1,15 @@
```elixir title="Elixir"
# Enable the "QualityFilter" post-processor through a JSON-encoded configuration.
quality_filter = %{"name" => "QualityFilter", "enabled" => true}
config_json = Jason.encode!(%{"post_processors" => [quality_filter]})

# Any non-{:ok, _} return raises a MatchError here.
{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)

IO.puts("Extraction quality applied")

# Show the text produced with the filter enabled.
result.text |> IO.inspect(label: "Quality-filtered text")
```

View File

@@ -0,0 +1,39 @@
```elixir title="Elixir"
alias Kreuzberg.ExtractionConfig

# Maps a quality score to a human-readable label.
# This is an anonymous function bound before its first use: `defp` is only valid
# inside a `defmodule`, so a private function cannot be defined in a script.
quality_status = fn
  score when score >= 0.9 -> "Excellent"
  score when score >= 0.8 -> "Good"
  score when score >= 0.7 -> "Fair"
  _score -> "Poor"
end

# Process document with quality filtering
config = %ExtractionConfig{
  quality_processing: %{
    "enabled" => true,
    "min_quality_score" => 0.8
  }
}

case Kreuzberg.extract_file("document.pdf", nil, config) do
  {:ok, result} ->
    IO.puts("=== Quality Processing ===\n")

    # Display quality metrics if available
    quality_score = result.quality_score

    if quality_score && quality_score > 0 do
      IO.puts("Quality Score: #{quality_score}")
      IO.puts("Content Quality: #{quality_status.(quality_score)}")
    end

    # Display content with quality assurance
    IO.puts("\n--- Extracted Content ---")
    content_preview = String.slice(result.content, 0..200)
    IO.puts(content_preview)
    IO.puts("\nTotal size: #{byte_size(result.content)} bytes")

  {:error, reason} ->
    IO.puts("Extraction failed!")
    IO.puts("Error: #{inspect(reason)}")
end
```

View File

@@ -0,0 +1,22 @@
```elixir title="Elixir"
# Compare extracted text length with and without the "QualityFilter" post-processor.
config_json =
  Jason.encode!(%{
    "post_processors" => [
      %{
        "name" => "QualityFilter",
        "enabled" => true
      }
    ]
  })

# Any non-{:ok, _} return raises a MatchError here.
{:ok, result_before} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", nil)
{:ok, result_after} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)

# Compare text quality metrics; missing text counts as empty.
text_before = result_before.text || ""
text_after = result_after.text || ""

chars_before = String.length(text_before)
chars_after = String.length(text_after)

IO.puts("Before quality processing: #{chars_before} chars")
IO.puts("After quality processing: #{chars_after} chars")

# An empty baseline makes the ratio undefined; dividing by zero would raise
# ArithmeticError, so that case is reported separately.
if chars_before > 0 do
  improvement = Float.round((1 - chars_after / chars_before) * 100, 2)
  IO.puts("Improvement: #{improvement}%")
else
  IO.puts("Improvement: n/a (no text extracted before quality processing)")
end
```

View File

@@ -0,0 +1,19 @@
```elixir title="Elixir"
# Any non-{:ok, _} return raises a MatchError here.
{:ok, result} = Kreuzberg.extract_file("document.pdf")

# `|| []` guards against a nil table list, which would otherwise crash length/1.
tables = result.tables || []
IO.puts("Total tables found: #{length(tables)}")

tables
|> Enum.with_index(1)
|> Enum.each(fn {table, index} ->
  IO.puts("\n--- Table #{index} ---")

  # Access table cells
  cells = table["cells"] || []
  IO.puts("Rows: #{length(cells)}")

  # Access table markdown representation
  markdown = table["markdown"]
  IO.puts("Markdown representation:")
  IO.puts(markdown)
end)
```

View File

@@ -0,0 +1,17 @@
```elixir title="Elixir"
# Request "moderate" token reduction that preserves Markdown.
reduction_opts = %{"mode" => "moderate", "preserve_markdown" => true}
config_json = Jason.encode!(%{"token_reduction" => reduction_opts})

# Any non-{:ok, _} return raises a MatchError here.
{:ok, result} =
  Kreuzberg.extract_file_sync("verbose_document.pdf", "application/pdf", config_json)

# Print each count only when the result reports it.
original_count = result.original_token_count
if original_count, do: IO.puts("Original tokens: #{original_count}")

reduced_count = result.reduced_token_count
if reduced_count, do: IO.puts("Reduced tokens: #{reduced_count}")
```

View File

@@ -0,0 +1,42 @@
```elixir title="Elixir"
alias Kreuzberg.ExtractionConfig

# Reduce token count for LLM
reduction_opts = %{"enabled" => true, "target_tokens" => 2000}
config = %ExtractionConfig{token_reduction: reduction_opts}

outcome = Kreuzberg.extract_file("document.pdf", nil, config)

case outcome do
  {:error, reason} ->
    IO.puts("Extraction failed!")
    IO.puts("Error: #{inspect(reason)}")

  {:ok, result} ->
    IO.puts("=== Token Reduction ===\n")

    # Rough estimate: 1 token ≈ 4 bytes
    content_size = byte_size(result.content)
    estimated_tokens = div(content_size, 4)

    IO.puts("Content size: #{content_size} bytes")
    IO.puts("Estimated tokens: ~#{estimated_tokens}")
    IO.puts("Target tokens: 2000")

    # Compare the estimate for the returned content against the 2000-token target.
    if estimated_tokens <= 2000 do
      IO.puts("\nNo reduction needed - content already below target")
    else
      reduction_percentage = trunc((1 - 2000 / estimated_tokens) * 100)
      IO.puts("\nToken reduction applied: ~#{reduction_percentage}% reduction")
    end

    # Display reduced content preview
    IO.puts("\n--- Reduced Content ---")
    result.content |> String.slice(0..300) |> IO.puts()
    IO.puts("...")
end
```

View File

@@ -0,0 +1,28 @@
```elixir title="Elixir"
# Request "moderate" token reduction that preserves Markdown.
reduction_opts = %{"mode" => "moderate", "preserve_markdown" => true}
config_json = Jason.encode!(%{"token_reduction" => reduction_opts})

# Any non-{:ok, _} return raises a MatchError here.
{:ok, result} =
  Kreuzberg.extract_file_sync("verbose_document.pdf", "application/pdf", config_json)

# Display token reduction metrics; a missing count is shown as 0.
original = result.original_token_count || 0
reduced = result.reduced_token_count || 0

IO.puts("Original tokens: #{original}")
IO.puts("Reduced tokens: #{reduced}")

# The percentage is only defined for a positive original count.
if original > 0 do
  ratio = reduced / original
  reduction_percent = Float.round((1 - ratio) * 100, 2)
  IO.puts("Reduction: #{reduction_percent}%")
end

# Show sample of reduced text
text = result.text

if text do
  IO.puts("\nSample of reduced text:")
  IO.puts(String.slice(text, 0..200) <> "...")
end
```

View File

@@ -0,0 +1,40 @@
```elixir title="Elixir"
# Chunk a document, embed each chunk and shape the result into vector DB records.
config_json =
  Jason.encode!(%{
    "chunking" => %{
      "enabled" => true,
      "max_characters" => 512,
      "overlap" => 50
    },
    "embeddings" => %{
      "enabled" => true,
      "model" => "all-MiniLM-L6-v2"
    }
  })

# Any non-{:ok, _} return raises a MatchError here.
{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)

# Prepare data for vector database storage.
# `|| []` guards against a nil chunk list, which would otherwise crash the pipeline.
vector_db_records =
  (result.chunks || [])
  |> Enum.with_index(1)
  |> Enum.map(fn {chunk, idx} ->
    %{
      # Record ids combine the document id with the 1-based chunk position.
      "id" => "#{result.document_id}-chunk-#{idx}",
      "vector" => chunk["embedding"],
      "metadata" => %{
        "content" => chunk["content"],
        "page" => chunk["page"],
        "document_id" => result.document_id,
        "chunk_index" => idx
      }
    }
  end)

IO.puts("Generated #{length(vector_db_records)} records for vector database")
IO.inspect(List.first(vector_db_records), label: "Sample Record")

# Example: Insert into Pinecone-like vector database
Enum.each(vector_db_records, fn record ->
  # vector_db_client.upsert(record)
  IO.puts("Would insert: #{record["id"]}")
end)
```

View File

@@ -0,0 +1,25 @@
```elixir title="Elixir"
defmodule Example do
  @moduledoc """
  Batch extraction from in-memory file contents.
  """

  # MIME type per file extension. Previously every file, including the .docx,
  # was submitted as "application/pdf".
  @mime_types %{
    ".pdf" => "application/pdf",
    ".docx" => "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
  }

  @doc """
  Extracts a fixed list of files concurrently and prints one line per file.

  Each file is read and passed to `Kreuzberg.extract_bytes_sync/3` with the MIME
  type matching its extension. Files that cannot be read or have an unknown
  extension are reported as errors instead of crashing the batch.
  """
  def batch_extract_bytes do
    # Note: Batch extraction in Elixir is done via Task.async_stream over sync calls
    files = ["doc1.pdf", "doc2.docx", "report.pdf"]
    config = nil

    results =
      files
      |> Task.async_stream(&extract_one(&1, config), max_concurrency: 4)
      |> Enum.map(fn {:ok, result} -> result end)

    Enum.each(results, fn
      {:ok, content} -> IO.puts("Extracted: #{String.length(content)} chars")
      {:error, reason} -> IO.puts("Error: #{reason}")
    end)
  end

  # Reads one file and extracts it; any failing step's {:error, _} is returned as-is.
  defp extract_one(file, config) do
    with {:ok, mime_type} <- mime_type_for(file),
         {:ok, content} <- File.read(file) do
      Kreuzberg.extract_bytes_sync(content, mime_type, config)
    end
  end

  # Looks up the MIME type for the file's (case-insensitive) extension.
  defp mime_type_for(file) do
    extension = file |> Path.extname() |> String.downcase()

    case Map.fetch(@mime_types, extension) do
      {:ok, mime_type} -> {:ok, mime_type}
      :error -> {:error, "unsupported file extension: #{inspect(extension)}"}
    end
  end
end
```

View File

@@ -0,0 +1,23 @@
```elixir title="Elixir"
defmodule Example do
  @moduledoc """
  Batch extraction from file paths.
  """

  @doc """
  Extracts a fixed list of files with at most four concurrent tasks and prints
  one line per outcome.
  """
  def batch_extract_files do
    config = nil
    files = ["doc1.pdf", "doc2.docx", "report.pdf"]

    extract = fn file -> Kreuzberg.extract_file_sync(file, nil, config) end

    # Each stream element is {:ok, task_return}; unwrap to the task's own return.
    results =
      files
      |> Task.async_stream(extract, max_concurrency: 4)
      |> Enum.map(fn {:ok, result} -> result end)

    report = fn
      {:ok, content} -> IO.puts("File extracted: #{String.length(content)} chars")
      {:error, reason} -> IO.puts("Error: #{reason}")
    end

    Enum.each(results, report)
  end
end
```

View File

@@ -0,0 +1,30 @@
```elixir title="Elixir"
defmodule Example do
  @moduledoc """
  Calls the `/chunk` endpoint of a locally running HTTP server.
  """

  @doc """
  Posts a text and chunking configuration to `http://localhost:8000/chunk`.

  Returns `{:ok, chunks}` for a 2xx response whose body contains a `"chunks"`
  list. Every other outcome (transport error, non-2xx status, undecodable or
  unexpected body) is returned as `{:error, description}`, where `description`
  is the inspected failing value.
  """
  def chunk_text_via_http do
    text = "Your long document text here..."

    config = %{
      "chunking" => %{
        "max_characters" => 800,
        "overlap" => 100,
        "chunker_type" => "Markdown"
      }
    }

    request =
      Req.post(
        "http://localhost:8000/chunk",
        json: %{
          "text" => text,
          "config" => config
        }
      )

    with {:ok, %{status: status, body: raw_body}} when status in 200..299 <- request,
         {:ok, %{"chunks" => chunks}} when is_list(chunks) <- decode_body(raw_body) do
      IO.puts("Created #{length(chunks)} chunks")
      {:ok, chunks}
    else
      error -> {:error, inspect(error)}
    end
  end

  # Req decodes JSON response bodies by default, so the body is normally already
  # a map; Jason.decode/1 is only applied when the body is still raw text.
  defp decode_body(body) when is_binary(body), do: Jason.decode(body)
  defp decode_body(body), do: {:ok, body}
end
```

View File

@@ -0,0 +1,19 @@
```elixir title="Elixir"
defmodule Example do
  @moduledoc """
  Calls the `/extract` endpoint of a locally running HTTP server.
  """

  @doc """
  Uploads `document.pdf` as multipart form data to `http://localhost:8000/extract`.

  Returns `{:ok, body}` with the decoded response body for a 2xx response. Every
  other outcome (missing file, transport error, non-2xx status, undecodable
  body) is returned as `{:error, description}`, where `description` is the
  inspected failing value.
  """
  def extract_via_http do
    file_path = "document.pdf"

    # The file is streamed by Req rather than read into memory up front; the
    # stat check keeps the early failure for a missing or non-regular file.
    with {:ok, %File.Stat{type: :regular}} <- File.stat(file_path),
         {:ok, %{status: status, body: raw_body}} when status in 200..299 <-
           Req.post(
             "http://localhost:8000/extract",
             # A file upload needs multipart encoding; `form:` sends
             # URL-encoded fields and cannot carry a file.
             form_multipart: [file: File.stream!(file_path)]
           ),
         {:ok, body} <- decode_body(raw_body) do
      IO.puts("Extracted content: #{body["content"]}")
      {:ok, body}
    else
      error -> {:error, inspect(error)}
    end
  end

  # Req decodes JSON response bodies by default, so the body is normally already
  # a map; Jason.decode/1 is only applied when the body is still raw text.
  defp decode_body(body) when is_binary(body), do: Jason.decode(body)
  defp decode_body(body), do: {:ok, body}
end
```

View File

@@ -0,0 +1,41 @@
```elixir title="Elixir"
defmodule Example do
  @moduledoc """
  End-to-end extraction with a comprehensive configuration.
  """

  @doc """
  Extracts `report.pdf` with a JSON-encoded configuration and prints the outcome.

  Returns `:ok` on success and `:error` on failure.
  """
  def full_extraction_pipeline do
    ocr = %{"backend" => "tesseract", "language" => "eng"}

    chunking = %{
      "max_characters" => 800,
      "overlap" => 100,
      "chunker_type" => "Markdown",
      "prepend_heading_context" => true
    }

    # Build a comprehensive extraction config and encode it as a JSON string.
    config_json =
      Jason.encode!(%{
        "use_cache" => true,
        "enable_quality_processing" => true,
        "force_ocr" => false,
        "ocr" => ocr,
        "chunking" => chunking,
        "output_format" => "Markdown",
        "include_document_structure" => true,
        "images" => %{"extract_images" => true},
        "language_detection" => %{"detect" => true}
      })

    outcome = Kreuzberg.extract_file_sync("report.pdf", nil, config_json)

    case outcome do
      {:error, reason} ->
        IO.puts("Extraction failed: #{reason}")
        :error

      {:ok, result} ->
        IO.puts("Extraction successful")
        IO.puts("Content length: #{String.length(result)} chars")
        :ok
    end
  end
end
```

View File

@@ -0,0 +1,24 @@
```elixir title="Elixir"
defmodule Example do
  @moduledoc """
  Demonstrates classifying extraction errors.
  """

  @doc """
  Extracts `document.txt` and prints a message describing the outcome.

  String error reasons are classified by their content; any other error reason
  is printed in inspected form.
  """
  def handle_extraction_errors do
    # Extract with no explicit MIME type and no configuration
    case Kreuzberg.extract_file_sync("document.txt", nil, nil) do
      {:ok, result} ->
        IO.puts("Success: #{String.length(result)} chars")

      {:error, reason} when is_binary(reason) ->
        # Error is a string description. String.contains?/2 is a regular
        # function and is not allowed in guards, so the message is classified
        # with `cond` instead of `case ... when`.
        cond do
          String.contains?(reason, "unsupported") ->
            IO.puts("Unsupported format: #{reason}")

          String.contains?(reason, "not found") ->
            IO.puts("File not found: #{reason}")

          true ->
            IO.puts("Extraction failed: #{reason}")
        end

      {:error, reason} ->
        # Non-string reasons previously matched no clause (CaseClauseError).
        IO.puts("Extraction failed: #{inspect(reason)}")
    end
  end
end
```

View File

@@ -0,0 +1,28 @@
```elixir title="Elixir"
defmodule Example do
  @moduledoc """
  Extraction with a descriptive error for each step that can fail.
  """

  @doc """
  Reads `path`, detects its MIME type and extracts its content.

  Returns `{:ok, result}` on success, or `{:error, message}` where `message`
  names the step that failed (file lookup, read, MIME detection, extraction).
  """
  def robust_extract(path) do
    # Each step returns a self-describing {:error, message}, so the `with`
    # needs no `else` block to work out which step failed.
    with {:ok, content} <- read_file(path),
         {:ok, mime_type} <- detect_mime_type(content),
         {:ok, result} <- extract(content, mime_type) do
      {:ok, result}
    end
  end

  # Reads the file directly rather than checking File.exists?/1 first, which
  # left a window in which the file could disappear between check and read.
  defp read_file(path) do
    case File.read(path) do
      {:ok, content} ->
        {:ok, content}

      {:error, reason} when reason in [:enoent, :enotdir] ->
        {:error, "File not found: #{path}"}

      {:error, reason} ->
        {:error, "Failed to read file: #{inspect(reason)}"}
    end
  end

  # Detects the MIME type from the raw bytes.
  defp detect_mime_type(content) do
    case Kreuzberg.detect_mime_type_from_bytes(content) do
      {:ok, mime_type} -> {:ok, mime_type}
      {:error, reason} -> {:error, "MIME detection failed: #{reason}"}
    end
  end

  # Runs the extraction with no extra configuration.
  defp extract(content, mime_type) do
    case Kreuzberg.extract_bytes_sync(content, mime_type, nil) do
      {:ok, result} -> {:ok, result}
      {:error, reason} -> {:error, "Extraction failed: #{reason}"}
    end
  end
end
```

View File

@@ -0,0 +1,22 @@
```elixir title="Elixir"
defmodule Example do
  @moduledoc """
  Extraction from in-memory bytes, run inside a task.
  """

  @doc """
  Reads `document.pdf`, runs `Kreuzberg.extract_bytes_async/3` in a task, waits
  for it and prints the outcome.

  Returns `:ok` on success and `:error` on failure.
  """
  def extract_from_bytes_async do
    content = File.read!("document.pdf")
    config = nil

    extraction = fn -> Kreuzberg.extract_bytes_async(content, "application/pdf", config) end
    outcome = extraction |> Task.async() |> Task.await()

    case outcome do
      {:error, reason} ->
        IO.puts("Error: #{reason}")
        :error

      {:ok, result} ->
        IO.puts("Content: #{result}")
        :ok
    end
  end
end
```

View File

@@ -0,0 +1,18 @@
```elixir title="Elixir"
defmodule Example do
  @moduledoc """
  Synchronous extraction from in-memory bytes.
  """

  @doc """
  Reads `document.pdf`, extracts it with `Kreuzberg.extract_bytes_sync/3` and
  prints the outcome.

  Returns `:ok` on success and `:error` on failure.
  """
  def extract_from_bytes do
    content = File.read!("document.pdf")
    config = nil

    outcome = Kreuzberg.extract_bytes_sync(content, "application/pdf", config)

    case outcome do
      {:error, reason} ->
        IO.puts("Error: #{reason}")
        :error

      {:ok, result} ->
        IO.puts("Content: #{result}")
        :ok
    end
  end
end
```

View File

@@ -0,0 +1,21 @@
```elixir title="Elixir"
defmodule Example do
  @moduledoc """
  Extraction from a file path, run inside a task.
  """

  @doc """
  Runs `Kreuzberg.extract_file_async/3` for `document.pdf` in a task, waits for
  it and prints the outcome.

  Returns `:ok` on success and `:error` on failure.
  """
  def extract_file_async do
    config = nil

    extraction = fn -> Kreuzberg.extract_file_async("document.pdf", nil, config) end
    outcome = extraction |> Task.async() |> Task.await()

    case outcome do
      {:error, reason} ->
        IO.puts("Error: #{reason}")
        :error

      {:ok, result} ->
        IO.puts("Content: #{result}")
        :ok
    end
  end
end
```

View File

@@ -0,0 +1,17 @@
```elixir title="Elixir"
defmodule Example do
  @moduledoc """
  Synchronous extraction from a file path.
  """

  @doc """
  Extracts `document.pdf` with `Kreuzberg.extract_file_sync/3` and prints the
  outcome.

  Returns `:ok` on success and `:error` on failure.
  """
  def extract_file do
    config = nil
    outcome = Kreuzberg.extract_file_sync("document.pdf", nil, config)

    case outcome do
      {:error, reason} ->
        IO.puts("Error: #{reason}")
        :error

      {:ok, result} ->
        IO.puts("Content: #{result}")
        :ok
    end
  end
end
```

View File

@@ -0,0 +1,435 @@
```elixir title="Elixir"
# Disk Caching - Implement persistent disk caching for extraction results
# Demonstrates advanced caching strategies for document extraction
defmodule KreuzbergDiskCache do
@moduledoc """
Disk-based caching layer for Kreuzberg extraction results.
Provides persistent caching of extraction results with features like:
- TTL-based cache expiration
- Compression for large results
- Cache statistics and management
- Multi-tiered caching (memory + disk)
"""
require Logger
defmodule CacheEntry do
  @moduledoc """
  Represents a cached extraction result.

  `created_at` and `accessed_at` are wall-clock Unix timestamps in seconds.
  Monotonic time is only comparable within a single run of the VM, so it is
  unsuitable for TTL checks on entries that may be reloaded after a restart.
  NOTE(review): this assumes entries are persisted together with their
  timestamps; confirm against the disk storage code.
  """
  defstruct [
    :key,
    :result,
    :created_at,
    :accessed_at,
    :ttl_seconds,
    :size_bytes,
    :compressed
  ]

  @doc """
  Create a new cache entry.

  `ttl_seconds` defaults to 86400 (one day). `size_bytes` is the byte size of
  `result.content` when that is a binary, and 0 otherwise.
  """
  def new(key, result, ttl_seconds \\ 86400) do
    now = System.system_time(:second)

    %__MODULE__{
      key: key,
      result: result,
      created_at: now,
      accessed_at: now,
      ttl_seconds: ttl_seconds,
      size_bytes: calculate_size(result),
      compressed: false
    }
  end

  @doc """
  Check if entry has expired, i.e. more than `ttl_seconds` have passed since
  `created_at`.
  """
  def expired?(%__MODULE__{created_at: created_at, ttl_seconds: ttl}) do
    System.system_time(:second) - created_at > ttl
  end

  @doc """
  Update access time. Returns the updated entry; the original is unchanged.
  """
  def touch(%__MODULE__{} = entry) do
    %{entry | accessed_at: System.system_time(:second)}
  end

  # Only binary content has a byte size; anything else counts as 0 instead of
  # raising in byte_size/1.
  defp calculate_size(%{content: content}) when is_binary(content), do: byte_size(content)
  defp calculate_size(_result), do: 0
end
defmodule Cache do
@moduledoc """
Main disk cache implementation.
"""
defstruct [
:cache_dir,
:max_size_bytes,
:ttl_seconds,
:compression_enabled,
:memory_cache
]
@doc """
Initialize disk cache.

Creates `cache_dir` (including parents) and returns a cache with an empty
in-memory map. Options: `:max_size_bytes` (default 1_000_000_000),
`:ttl_seconds` (default 604_800) and `:compression_enabled` (default `true`).
"""
def new(cache_dir, opts \\ []) do
  File.mkdir_p!(cache_dir)

  max_size_bytes = Keyword.get(opts, :max_size_bytes, 1_000_000_000)
  ttl_seconds = Keyword.get(opts, :ttl_seconds, 604_800)
  compression_enabled = Keyword.get(opts, :compression_enabled, true)

  %Cache{
    cache_dir: cache_dir,
    max_size_bytes: max_size_bytes,
    ttl_seconds: ttl_seconds,
    compression_enabled: compression_enabled,
    memory_cache: %{}
  }
end
@doc """
Look up `key`, consulting the in-memory map before the disk tier.

Returns `{:hit, result}` or `:miss`. An expired in-memory entry yields `:miss`
without falling back to disk.
"""
def get(cache, key) do
  case Map.get(cache.memory_cache, key) do
    nil ->
      get_from_disk(cache, key)

    %CacheEntry{} = cached ->
      case CacheEntry.expired?(cached) do
        true ->
          Logger.debug("Cache hit (memory) - expired: #{key}")
          :miss

        false ->
          Logger.debug("Cache hit (memory): #{key}")
          # The touched copy is not stored anywhere; only its result is returned.
          refreshed = CacheEntry.touch(cached)
          {:hit, refreshed.result}
      end
  end
end
@doc """
Store `result` under `key` in both the memory map and on disk.

Returns the updated cache. If the tracked size exceeds `max_size_bytes`, the
least recently used entries are evicted and the returned cache reflects that
eviction (previously the cleaned-up cache was computed and then thrown away).
"""
def put(cache, key, result) do
  entry = CacheEntry.new(key, result, cache.ttl_seconds)

  # Persist first so a crash while writing never leaves a memory-only entry.
  store_on_disk(cache, key, entry)

  cache = %{cache | memory_cache: Map.put(cache.memory_cache, key, entry)}

  # maybe_cleanup/1 returns a new struct; it must be captured to take effect.
  cache = maybe_cleanup(cache)

  Logger.info("Cache stored: #{key}")
  cache
end
@doc """
Remove `key` from the memory map and delete its cache file if one exists.

Returns the updated cache.
"""
def delete(cache, key) do
  path = cache_path(cache, key)

  if File.exists?(path) do
    File.rm(path)
  end

  remaining = Map.delete(cache.memory_cache, key)
  Logger.info("Cache deleted: #{key}")
  %{cache | memory_cache: remaining}
end
@doc """
Drop every entry: the cache directory is removed and recreated, and the
returned cache has an empty memory map.
"""
def clear(cache) do
  dir = cache.cache_dir

  # Wipe the directory wholesale, then recreate it empty.
  File.rm_rf!(dir)
  File.mkdir_p!(dir)

  Logger.info("Cache cleared")
  struct!(cache, memory_cache: %{})
end
@doc """
Get cache statistics.

Returns a map with:

  * `:total_entries` / `:memory_entries` - number of entries in the memory map
  * `:disk_entries` - number of files in the cache directory (0 if unreadable)
  * `:total_size_bytes` - summed `size_bytes` of the in-memory entries
  * `:max_size_bytes` - configured limit
  * `:usage_percent` - `total_size_bytes / max_size_bytes` as a percentage
    rounded to two decimals; `0.0` when the limit is not a positive number
  * `:compression_enabled` - configured flag
"""
def stats(cache) do
  total_size = calculate_total_size(cache)
  memory_entries = map_size(cache.memory_cache)

  disk_entries =
    case File.ls(cache.cache_dir) do
      {:ok, files} -> length(files)
      {:error, _} -> 0
    end

  limit = cache.max_size_bytes

  # Guard the division: a zero limit used to raise ArithmeticError.
  usage_percent =
    if is_number(limit) and limit > 0 do
      Float.round(total_size / limit * 100, 2)
    else
      0.0
    end

  %{
    total_entries: memory_entries,
    memory_entries: memory_entries,
    disk_entries: disk_entries,
    total_size_bytes: total_size,
    max_size_bytes: limit,
    usage_percent: usage_percent,
    compression_enabled: cache.compression_enabled
  }
end
# Private helpers

# Disk-tier lookup. Returns `{:hit, result}` or `:miss`.
#
# A missing file is a silent miss. Unreadable, undecodable or foreign files
# are logged and treated as a miss. Expired entries are deleted from disk.
defp get_from_disk(cache, key) do
  cache_file = cache_path(cache, key)

  # Read directly instead of File.exists?/1 + File.read/1: one syscall less
  # and no window in which the file can vanish between the two calls.
  case File.read(cache_file) do
    {:ok, data} ->
      case deserialize(data, cache.compression_enabled) do
        {:ok, %CacheEntry{} = entry} ->
          if CacheEntry.expired?(entry) do
            File.rm(cache_file)
            Logger.debug("Cache hit (disk) - expired: #{key}")
            :miss
          else
            Logger.debug("Cache hit (disk): #{key}")
            {:hit, CacheEntry.touch(entry).result}
          end

        {:ok, other} ->
          # The file decoded, but not to a cache entry; do not crash on it.
          Logger.warning("Failed to deserialize cache: #{inspect(other)}")
          :miss

        {:error, reason} ->
          Logger.warning("Failed to deserialize cache: #{inspect(reason)}")
          :miss
      end

    {:error, :enoent} ->
      :miss

    {:error, reason} ->
      Logger.warning("Failed to read cache file: #{inspect(reason)}")
      :miss
  end
end
# Serialize `entry` and write it to the file derived from `key`.
# Raises (via File.write!/2) when the file cannot be written.
defp store_on_disk(cache, key, entry) do
  target = cache_path(cache, key)
  payload = serialize(entry, cache.compression_enabled)
  File.write!(target, payload)
end
# File that holds the entry for `key`: "<cache_dir>/<key>.cache".
defp cache_path(cache, key) do
  file_name = "#{key}.cache"
  Path.join(cache.cache_dir, file_name)
end
# Encode `entry` with the Erlang external term format, zlib-compressing the
# bytes whenever `compression_enabled` is truthy.
defp serialize(entry, compression_enabled) do
  encoded = :erlang.term_to_binary(entry)

  case compression_enabled do
    off when off in [nil, false] -> encoded
    _on -> :zlib.compress(encoded)
  end
end
# Inverse of serialize/2. Returns `{:ok, term}` or `{:error, exception}` when
# decompression or decoding raises.
#
# NOTE(review): :erlang.binary_to_term/1 is called without the `:safe` option;
# that is only appropriate if the cache directory is trusted - confirm.
defp deserialize(data, compression_enabled) do
  raw = if compression_enabled, do: :zlib.uncompress(data), else: data
  {:ok, :erlang.binary_to_term(raw)}
rescue
  error -> {:error, error}
end
# Sum of `size_bytes` over every entry held in the memory map.
# Entries that exist only on disk are not counted.
defp calculate_total_size(cache) do
  Enum.reduce(cache.memory_cache, 0, fn {_key, entry}, total ->
    total + entry.size_bytes
  end)
end
# Run LRU eviction when the tracked size is above `max_size_bytes`;
# otherwise hand the cache back untouched.
defp maybe_cleanup(cache) do
  limit = cache.max_size_bytes

  case calculate_total_size(cache) do
    total_size when total_size > limit ->
      Logger.info("Cache size (#{total_size}) exceeds limit, starting cleanup")
      cleanup_lru(cache)

    _within_limit ->
      cache
  end
end
# Evict entries in ascending `accessed_at` order until the tracked size is at
# or below half of `max_size_bytes`. Returns the trimmed cache.
defp cleanup_lru(cache) do
  oldest_first = Enum.sort_by(cache.memory_cache, fn {_key, entry} -> entry.accessed_at end)
  target_size = div(cache.max_size_bytes, 2)
  starting_size = calculate_total_size(cache)

  {trimmed, _final_size} =
    Enum.reduce_while(oldest_first, {cache, starting_size}, fn
      _pair, {_kept, bytes} = done when bytes <= target_size ->
        {:halt, done}

      {key, entry}, {kept, bytes} ->
        {:cont, {delete(kept, key), bytes - entry.size_bytes}}
    end)

  trimmed
end
end
@doc """
Extract `file_path`, serving the result from the cache in `cache_dir` when
one is stored under the key derived from the path and `opts`.

Returns `{:ok, result, cache}` on a hit or a successful extraction, and
`{error, cache}` (where `error` is whatever `Kreuzberg.extract_file/1`
returned) otherwise. A fresh cache struct is built on every call.
"""
def extract_with_cache(file_path, cache_dir, opts \\ []) do
  cache = Cache.new(cache_dir, opts)
  cache_key = compute_cache_key(file_path, opts)

  case Cache.get(cache, cache_key) do
    :miss ->
      Logger.info("Cache miss: #{file_path}")
      extract_and_store(file_path, cache, cache_key)

    {:hit, result} ->
      {:ok, result, cache}
  end
end

# Cache-miss path: run the extraction and store a successful result.
defp extract_and_store(file_path, cache, cache_key) do
  case Kreuzberg.extract_file(file_path) do
    {:ok, result} -> {:ok, result, Cache.put(cache, cache_key, result)}
    error -> {error, cache}
  end
end
@doc """
Extract multiple files with batch caching.

Returns `{results, stats}` where `results` holds one `{:ok, path, result}` or
`{:error, path, reason}` tuple per input path (in input order) and `stats` is
the `Cache.stats/1` map.

Entries stored during this batch are folded into the cache the statistics are
computed from. Previously the stats came from a fresh, empty cache, so the
memory-derived figures were always zero. Files served from an existing disk
entry do not add an in-memory entry, because `extract_with_cache/3` returns
its cache unchanged on a hit.
"""
def batch_extract_with_cache(file_paths, cache_dir, opts \\ []) do
  cache = Cache.new(cache_dir, opts)

  {results, final_cache} =
    Enum.map_reduce(file_paths, cache, fn path, acc ->
      case extract_with_cache(path, cache_dir, opts) do
        {:ok, result, file_cache} ->
          merged = Map.merge(acc.memory_cache, file_cache.memory_cache)
          {{:ok, path, result}, %{acc | memory_cache: merged}}

        {{:error, reason}, _} ->
          {{:error, path, reason}, acc}
      end
    end)

  {results, Cache.stats(final_cache)}
end
@doc """
Manage cache - get stats, clear, etc.

Supported actions:

  * `:stats` - returns the `Cache.stats/1` map
  * `:clear` - empties the cache and returns the cleared cache struct
  * `:list` - returns the file names in `cache_dir`, or `{:error, reason}`
  * `{:delete, key}` - removes one entry and returns the updated cache struct

Any other action returns `{:error, message}`. The message is built with
`inspect/1` so that tuples and other terms that cannot be interpolated no
longer raise `Protocol.UndefinedError`.
"""
def manage_cache(cache_dir, action, opts \\ []) do
  cache = Cache.new(cache_dir, opts)

  case action do
    :stats ->
      Cache.stats(cache)

    :clear ->
      Cache.clear(cache)

    :list ->
      case File.ls(cache_dir) do
        {:ok, files} -> files
        {:error, reason} -> {:error, reason}
      end

    {:delete, key} ->
      Cache.delete(cache, key)

    _ ->
      {:error, "Unknown action: #{inspect(action)}"}
  end
end
# Private helpers

# Lower-case hex SHA-256 of "<file_path>|<inspected opts>". The key depends
# only on the path and the options, not on the file's contents.
defp compute_cache_key(file_path, opts) do
  digest = :crypto.hash(:sha256, "#{file_path}|#{inspect(opts)}")
  Base.encode16(digest, case: :lower)
end
end
# Usage examples
IO.puts("=== Kreuzberg Disk Cache ===\n")
cache_dir = "/tmp/kreuzberg_cache"

# Example 1: Single file extraction with caching
IO.puts("Example 1: Single file extraction with caching")
IO.puts("-" <> String.duplicate("-", 40) <> "\n")

case KreuzbergDiskCache.extract_with_cache("document.pdf", cache_dir) do
  # The returned cache struct is not needed here; stats are re-read below.
  {:ok, result, _cache} ->
    IO.puts("Extraction successful!")
    IO.puts("Content size: #{byte_size(result.content)} bytes")
    stats = KreuzbergDiskCache.manage_cache(cache_dir, :stats)
    IO.puts("\nCache Statistics:")
    IO.puts(" Entries: #{stats.total_entries}")
    IO.puts(" Size: #{stats.total_size_bytes} bytes")
    IO.puts(" Usage: #{stats.usage_percent}%\n")

  {error, _cache} ->
    IO.puts("Extraction failed: #{inspect(error)}\n")
end

# Example 2: Batch extraction with cache statistics
IO.puts("Example 2: Batch extraction with caching")
IO.puts("-" <> String.duplicate("-", 40) <> "\n")
documents = ["doc1.pdf", "doc2.pdf", "doc3.pdf"]
{results, stats} = KreuzbergDiskCache.batch_extract_with_cache(documents, cache_dir)
successful = Enum.count(results, &match?({:ok, _, _}, &1))
IO.puts("Batch results:")
IO.puts(" Processed: #{length(documents)}")
IO.puts(" Successful: #{successful}")
IO.puts("\nCache Statistics:")
IO.puts(" Total entries: #{stats.total_entries}")
IO.puts(" Memory entries: #{stats.memory_entries}")
IO.puts(" Disk entries: #{stats.disk_entries}")
IO.puts(" Total size: #{stats.total_size_bytes} bytes")
IO.puts(" Usage: #{stats.usage_percent}%\n")

# Example 3: Cache management
IO.puts("Example 3: Cache management")
IO.puts("-" <> String.duplicate("-", 40) <> "\n")

# `:list` returns either a list of file names or `{:error, reason}`; the
# error tuple must not be passed to Enum.each/2.
case KreuzbergDiskCache.manage_cache(cache_dir, :list) do
  {:error, reason} ->
    IO.puts("Could not list cached files: #{inspect(reason)}")

  cached_files ->
    IO.puts("Cached files:")
    Enum.each(cached_files, fn file -> IO.puts(" - #{file}") end)
end

IO.puts("\nCache stats:")
stats = KreuzbergDiskCache.manage_cache(cache_dir, :stats)
IO.inspect(stats, pretty: true)
```

View File

@@ -0,0 +1,306 @@
```elixir title="Elixir"
# Basic CLI Tool - Simple command-line interface for Kreuzberg
# Demonstrates how to build a CLI for document extraction
defmodule KreuzbergCLI do
@moduledoc """
Command-line interface for Kreuzberg document extraction.
Provides a simple, user-friendly CLI for extracting documents
from the command line with progress feedback.
"""
require Logger
@doc """
CLI entry point: turns the raw argument list into a command tuple and runs
it, returning whatever the command returns.
"""
def main(args) do
  command = parse_args(args)
  execute(command)
end
# Translate the raw argument list into a command tuple:
#
#   {:extract, file, opts} | {:batch, dir, opts} | {:help} | {:error, reason}
#
# `extract` / `batch` given without their required argument are reported as
# such instead of being mislabelled as an unknown command.
defp parse_args([]), do: {:error, :no_command}

defp parse_args(["extract", file | rest]), do: {:extract, file, parse_options(rest, %{})}

defp parse_args(["batch", dir | rest]), do: {:batch, dir, parse_options(rest, %{})}

defp parse_args([flag]) when flag in ["help", "-h", "--help"], do: {:help}

defp parse_args(["extract"]), do: {:error, "Missing <file> argument for extract"}

defp parse_args(["batch"]), do: {:error, "Missing <directory> argument for batch"}

defp parse_args([cmd]), do: {:error, "Unknown command: #{cmd}"}

defp parse_args(_), do: {:error, :invalid_args}
# Fold the option flags following a command into the `acc` map.
#
# Recognised: -v / --verbose, --mime-type <type>, --output <path>, --ocr,
# --chunks. The help text advertises `--verbose`, so it is accepted alongside
# `-v` (it used to be silently dropped). Unrecognised tokens are skipped.
defp parse_options([], acc), do: acc

defp parse_options([flag | rest], acc) when flag in ["-v", "--verbose"] do
  parse_options(rest, Map.put(acc, :verbose, true))
end

defp parse_options(["--mime-type", mime | rest], acc) do
  parse_options(rest, Map.put(acc, :mime_type, mime))
end

defp parse_options(["--output", path | rest], acc) do
  parse_options(rest, Map.put(acc, :output, path))
end

defp parse_options(["--ocr" | rest], acc) do
  parse_options(rest, Map.put(acc, :enable_ocr, true))
end

defp parse_options(["--chunks" | rest], acc) do
  parse_options(rest, Map.put(acc, :enable_chunks, true))
end

defp parse_options([_unknown | rest], acc) do
  parse_options(rest, acc)
end
# Run a parsed command tuple and return :ok or :error.

# Help: print usage information.
defp execute({:help}) do
  print_help()
  :ok
end

# Single-file extraction; the file must exist.
defp execute({:extract, file, opts}) do
  if File.exists?(file) do
    extract_file(file, opts)
  else
    IO.puts(:stderr, "Error: File not found: #{file}")
    :error
  end
end

# Directory batch extraction; the directory must exist.
defp execute({:batch, dir, opts}) do
  if File.dir?(dir) do
    batch_extract(dir, opts)
  else
    IO.puts(:stderr, "Error: Directory not found: #{dir}")
    :error
  end
end

# Argument-parsing failure: report it, then show the help text.
defp execute({:error, reason}) do
  IO.puts(:stderr, "Error: #{inspect(reason)}")
  print_help()
  :error
end
# Extract one document, print a summary and, when `:output` is set, save the
# result as JSON. Returns :ok on success and :error when extraction fails.
defp extract_file(file_path, opts) do
  verbose = Map.get(opts, :verbose, false)
  mime_type = Map.get(opts, :mime_type, nil)
  destination = Map.get(opts, :output, nil)
  config = build_config(opts)

  IO.puts("Extracting: #{file_path}")
  started_at = System.monotonic_time(:millisecond)
  outcome = Kreuzberg.extract_file(file_path, mime_type, config)

  case outcome do
    {:error, reason} ->
      IO.puts(:stderr, "Extraction failed: #{inspect(reason)}")
      :error

    {:ok, result} ->
      elapsed = System.monotonic_time(:millisecond) - started_at
      print_extraction_result(result, elapsed, verbose)
      destination && save_result(result, destination)
      :ok
  end
end
# Extract every pdf/docx/xlsx/txt/html/md file found under `dir` (searched
# recursively), printing per-file progress and a final summary.
#
# Always returns :ok, including when individual files fail; failures are
# reported in the progress output and the summary.
defp batch_extract(dir, opts) do
  verbose = Map.get(opts, :verbose, false)
  config = build_config(opts)
  pattern = Path.join(dir, "**/*.{pdf,docx,xlsx,txt,html,md}")

  case Path.wildcard(pattern) do
    [] ->
      IO.puts("No documents found in #{dir}")
      :ok

    files ->
      # Computed once: length/1 is O(n) and was re-evaluated for every file.
      total = length(files)
      IO.puts("Found #{total} documents\n")

      results =
        files
        |> Enum.with_index(1)
        |> Enum.map(fn {file, idx} ->
          IO.write(" [#{idx}/#{total}] ")
          start_time = System.monotonic_time(:millisecond)

          case Kreuzberg.extract_file(file, nil, config) do
            {:ok, result} ->
              elapsed = System.monotonic_time(:millisecond) - start_time
              IO.puts("#{Path.basename(file)} (#{elapsed}ms)")
              {:ok, file, result, elapsed}

            {:error, reason} ->
              IO.puts("#{Path.basename(file)} - ERROR")
              if verbose, do: IO.puts(" Error: #{inspect(reason)}")
              {:error, file, reason}
          end
        end)

      print_batch_summary(results)
      :ok
  end
end
# Build the extraction config from the parsed CLI options. OCR and chunking
# are nil unless their flags were given; caching is always requested.
defp build_config(opts) do
  ocr_settings =
    if Map.get(opts, :enable_ocr) do
      %{"enabled" => true, "backend" => "tesseract"}
    end

  chunk_settings =
    if Map.get(opts, :enable_chunks) do
      %{"enabled" => true, "max_characters" => 1000, "overlap" => 100}
    end

  %Kreuzberg.ExtractionConfig{
    use_cache: true,
    ocr: ocr_settings,
    chunking: chunk_settings
  }
end
# Print a human-readable summary of one extraction. Optional collections are
# reported only when present and non-empty; the full metadata is dumped when
# `verbose` is set.
defp print_extraction_result(result, elapsed_ms, verbose) do
  has_items? = fn items -> items && !Enum.empty?(items) end

  IO.puts("\nExtraction Results:")
  IO.puts(" Content size: #{byte_size(result.content)} bytes")
  IO.puts(" MIME type: #{result.mime_type}")
  IO.puts(" Processing time: #{elapsed_ms}ms")

  if result.metadata, do: IO.puts(" Metadata keys: #{Enum.count(result.metadata)}")

  if has_items?.(result.tables), do: IO.puts(" Tables found: #{length(result.tables)}")

  if has_items?.(result.images), do: IO.puts(" Images found: #{length(result.images)}")

  if has_items?.(result.chunks), do: IO.puts(" Chunks created: #{length(result.chunks)}")

  if has_items?.(result.detected_languages) do
    IO.puts(" Languages: #{Enum.join(result.detected_languages, ", ")}")
  end

  if verbose do
    IO.puts("\n Full metadata:")
    IO.inspect(result.metadata, pretty: true)
  end

  IO.puts("")
end
# Print totals for a batch run. `results` holds `{:ok, file, result, ms}` and
# `{:error, file, reason}` tuples; the average divides by at least 1 so an
# all-failed batch does not divide by zero.
defp print_batch_summary(results) do
  {ok_count, failed_count, elapsed_total} =
    Enum.reduce(results, {0, 0, 0}, fn
      {:ok, _file, _result, elapsed}, {ok, failed, time} -> {ok + 1, failed, time + elapsed}
      {:error, _file, _reason}, {ok, failed, time} -> {ok, failed + 1, time}
      _other, totals -> totals
    end)

  IO.puts("\nBatch Summary:")
  IO.puts(" Total: #{length(results)}")
  IO.puts(" Successful: #{ok_count}")
  IO.puts(" Failed: #{failed_count}")
  IO.puts(" Total time: #{elapsed_total}ms")
  IO.puts(" Average time: #{div(elapsed_total, max(ok_count, 1))}ms/document")
end
# Write the extraction result to `output_path` as pretty-printed JSON and
# report the outcome. Missing collections are written as empty lists.
defp save_result(result, output_path) do
  payload = %{
    extracted_at: DateTime.utc_now(),
    content: result.content,
    mime_type: result.mime_type,
    metadata: result.metadata,
    tables: result.tables || [],
    images: result.images || [],
    chunks: result.chunks || [],
    detected_languages: result.detected_languages || []
  }

  json = Jason.encode!(payload, pretty: true)

  with :ok <- File.write(output_path, json) do
    IO.puts("Results saved to: #{output_path}")
  else
    {:error, reason} ->
      IO.puts(:stderr, "Failed to save results: #{inspect(reason)}")
  end
end
# Print the CLI usage text (commands, options and examples) to stdout.
# The heredoc below is emitted verbatim, so its layout is user-visible output.
# NOTE(review): the examples show `batch ... --output summary.json`, but
# batch_extract/2 does not read the :output option - confirm intent.
defp print_help do
IO.puts("""
Kreuzberg CLI - Document Extraction Tool
USAGE:
kreuzberg extract <file> [OPTIONS]
kreuzberg batch <directory> [OPTIONS]
kreuzberg help
COMMANDS:
extract <file> Extract content from a single document
batch <directory> Extract all documents in a directory
help Show this help message
OPTIONS:
-v, --verbose Show detailed output
--mime-type <type> Specify MIME type (e.g., application/pdf)
--output <path> Save results to JSON file
--ocr Enable OCR for scanned documents
--chunks Enable document chunking for RAG
EXAMPLES:
kreuzberg extract document.pdf
kreuzberg extract document.pdf --output results.json
kreuzberg batch ./documents --ocr --chunks
kreuzberg batch ./documents -v --output summary.json
""")
end
end
# Entry point when the file is run as a script.
#
# A `def` cannot appear outside a module, and returning 0/1 from a function
# does not set the process exit status. Run the CLI directly and halt with a
# non-zero status on failure instead.
case KreuzbergCLI.main(System.argv()) do
  :ok -> :ok
  :error -> System.halt(1)
end
```

View File

@@ -0,0 +1,455 @@
```elixir title="Elixir"
# CLI with Configuration - Advanced CLI with config file support
# Demonstrates managing complex extraction configurations via CLI
defmodule KreuzbergAdvancedCLI do
@moduledoc """
Advanced CLI with configuration file support for Kreuzberg.
Allows users to define extraction configurations in YAML/JSON files,
specify preprocessing options, and manage extraction profiles.
"""
require Logger
defmodule ConfigFile do
  @moduledoc """
  Configuration file parser and manager.

  Supports `.yaml` / `.yml` (parsed with `:yamerl`) and `.json` (parsed with
  `Jason`). YAML documents are normalised to maps with binary keys so that the
  string-keyed lookups used throughout (`"profiles"`, `profile["ocr"]`, ...)
  work the same way for both formats.
  """

  defstruct [
    :profiles,
    :default_profile,
    :cache_enabled,
    :cache_dir
  ]

  @doc """
  Load configuration from YAML/JSON file.

  Returns `{:ok, %ConfigFile{}}` or `{:error, message}` when the file is
  missing, has an unsupported extension, cannot be read, cannot be parsed, or
  does not contain a mapping at its root.
  """
  def load(config_path) do
    if File.exists?(config_path) do
      case Path.extname(config_path) do
        ext when ext in [".yaml", ".yml"] -> load_with(config_path, &parse_yaml/1)
        ".json" -> load_with(config_path, &parse_json/1)
        ext -> {:error, "Unsupported config format: #{ext}"}
      end
    else
      {:error, "Config file not found: #{config_path}"}
    end
  end

  # Read the file and hand its content to the format-specific parser.
  defp load_with(path, parser) do
    case File.read(path) do
      {:ok, content} ->
        with {:ok, raw} <- parser.(content), do: parse_config(raw)

      {:error, reason} ->
        {:error, "Failed to read config: #{inspect(reason)}"}
    end
  end

  # `str_node_as_binary` makes yamerl return strings as binaries instead of
  # charlists; without it no key ever equals "profiles" & co. and the whole
  # YAML file silently falls back to defaults. yamerl reports syntax errors
  # by throwing, hence the `catch`.
  defp parse_yaml(content) do
    case :yamerl_constr.string(content, str_node_as_binary: true) do
      [document] -> {:ok, normalize_yaml(document)}
      other -> {:error, "Failed to parse YAML: #{inspect(other)}"}
    end
  catch
    {:yamerl_exception, _details} = error ->
      {:error, "Failed to parse YAML: #{inspect(error)}"}
  end

  defp parse_json(content) do
    case Jason.decode(content) do
      {:ok, config} -> {:ok, config}
      error -> {:error, "Failed to parse JSON: #{inspect(error)}"}
    end
  end

  # yamerl represents mappings as lists of `{key, value}` tuples; convert
  # them (recursively) into maps and YAML null into nil.
  defp normalize_yaml([{_key, _value} | _] = mapping) do
    Map.new(mapping, fn {key, value} -> {normalize_yaml(key), normalize_yaml(value)} end)
  end

  defp normalize_yaml(list) when is_list(list), do: Enum.map(list, &normalize_yaml/1)
  defp normalize_yaml(:null), do: nil
  defp normalize_yaml(scalar), do: scalar

  defp parse_config(raw_config) when is_map(raw_config) do
    {:ok,
     %__MODULE__{
       profiles: profiles_from(raw_config),
       default_profile: Map.get(raw_config, "default_profile", "default"),
       cache_enabled: Map.get(raw_config, "cache_enabled", true),
       cache_dir: Map.get(raw_config, "cache_dir", "/tmp/kreuzberg_cache")
     }}
  end

  # An empty YAML mapping is normalised to []; treat it as "all defaults".
  defp parse_config([]), do: parse_config(%{})

  defp parse_config(other) do
    {:error, "Config root must be a mapping, got: #{inspect(other)}"}
  end

  # Anything other than a map of profiles (missing key, null, empty mapping)
  # means "no profiles".
  defp profiles_from(raw_config) do
    case Map.get(raw_config, "profiles") do
      profiles when is_map(profiles) -> profiles
      _ -> %{}
    end
  end

  @doc """
  Get extraction configuration for a profile.

  A `nil` `profile_name` selects the file's default profile. Returns
  `{:ok, profile}` or `{:error, message}`.
  """
  def get_profile(config_file, profile_name) do
    profile_name = profile_name || config_file.default_profile

    case Map.get(config_file.profiles, profile_name) do
      nil -> {:error, "Profile not found: #{profile_name}"}
      profile -> {:ok, profile}
    end
  end

  @doc """
  List all available profiles.
  """
  def list_profiles(config_file) do
    Map.keys(config_file.profiles)
  end
end
defmodule Extractor do
@moduledoc """
Main extraction engine with profile support.
"""
# Resolve `profile_name` in `config_file` and run the extraction with it.
# Returns `{:ok, result}` or `{:error, reason}`; `opts` accepts `:verbose`.
def extract_with_profile(file_path, config_file, profile_name, opts \\ []) do
  verbose = Keyword.get(opts, :verbose, false)

  with {:ok, profile} <- ConfigFile.get_profile(config_file, profile_name) do
    extract_with_config(file_path, profile, config_file, verbose)
  end
end
# Run one extraction using `profile`: print the run header, preprocess,
# extract, postprocess, print a summary and clean up temp files.
#
# Returns `{:ok, final_result}` or `{:error, reason}`.
defp extract_with_config(file_path, profile, config_file, verbose) do
  if File.exists?(file_path) do
    use_cache = !!config_file.cache_enabled
    cache_dir = config_file.cache_dir

    # Honour the file-level `cache_enabled` switch. The header below already
    # reported it, but the extraction itself always ran with caching on.
    extraction_config = %{build_extraction_config(profile) | use_cache: use_cache}

    IO.puts("Profile: #{profile["name"]}")
    IO.puts("File: #{file_path}")
    IO.puts("Cache: #{if use_cache, do: "enabled (#{cache_dir})", else: "disabled"}")
    IO.puts("")

    # Preprocess if configured
    processed_file = preprocess_if_needed(file_path, profile, verbose)

    start_time = System.monotonic_time(:millisecond)
    outcome = Kreuzberg.extract_file(processed_file, nil, extraction_config)
    elapsed = System.monotonic_time(:millisecond) - start_time

    # Temp files are no longer needed once extraction has returned, whatever
    # the outcome, so clean up in one place.
    cleanup_temp_files(processed_file, file_path)

    case outcome do
      {:ok, result} ->
        # Post-process if configured
        final_result = postprocess_if_needed(result, profile)
        print_results(final_result, elapsed, verbose)
        {:ok, final_result}

      {:error, reason} ->
        {:error, reason}
    end
  else
    {:error, "File not found: #{file_path}"}
  end
end
# Copy the per-feature sections of `profile` into an extraction config.
# Sections the profile does not define end up as nil; caching is always on.
defp build_extraction_config(profile) do
  struct!(Kreuzberg.ExtractionConfig,
    use_cache: true,
    ocr: profile["ocr"],
    chunking: profile["chunking"],
    quality_processing: profile["quality_processing"],
    language_detection: profile["language_detection"],
    keyword_extraction: profile["keyword_extraction"],
    images: profile["images"]
  )
end
# Thread `file_path` through the profile's "preprocessing" steps, if any, and
# return the resulting path. Without such a section the path is returned as is.
defp preprocess_if_needed(file_path, profile, verbose) do
  steps = profile["preprocessing"]

  if is_nil(steps) do
    file_path
  else
    IO.puts("Preprocessing enabled:")
    scratch_path = "/tmp/kreuzberg_#{System.unique_integer([:positive])}"

    Enum.reduce(steps, file_path, fn step, current_path ->
      apply_preprocessing_step(step, current_path, scratch_path, verbose)
    end)
  end
end
# Apply one preprocessing step. Every clause returns `input_path` unchanged;
# recognised steps only announce themselves when `verbose` is set.
defp apply_preprocessing_step(
       %{"type" => "rotate", "degrees" => degrees},
       input_path,
       _temp_path,
       verbose
     ) do
  if verbose, do: IO.puts(" - Rotating #{degrees} degrees")
  input_path
end

defp apply_preprocessing_step(
       %{"type" => "normalize", "target_format" => format},
       input_path,
       _temp_path,
       verbose
     ) do
  if verbose, do: IO.puts(" - Normalizing to #{format}")
  input_path
end

defp apply_preprocessing_step(%{"type" => "deskew"}, input_path, _temp_path, verbose) do
  if verbose, do: IO.puts(" - Deskewing")
  input_path
end

defp apply_preprocessing_step(_unknown_step, input_path, _temp_path, _verbose), do: input_path
# Fold the profile's "postprocessing" steps over `result`; a profile without
# that section leaves the result untouched.
defp postprocess_if_needed(result, profile) do
  steps = profile["postprocessing"]

  if is_nil(steps) do
    result
  else
    Enum.reduce(steps, result, &apply_postprocessing_step/2)
  end
end
# Apply one postprocessing step to `result` and return the updated result.
#
#   * "filter_empty_chunks" - drops chunks without content
#   * "limit_tables"        - keeps at most "max" tables
#
# Unknown steps, and steps whose target field is nil, leave `result` as is.
defp apply_postprocessing_step(%{"type" => "filter_empty_chunks"}, %{chunks: chunks} = result)
     when is_list(chunks) do
  %{result | chunks: Enum.reject(chunks, &empty_chunk?/1)}
end

defp apply_postprocessing_step(
       %{"type" => "limit_tables", "max" => max_tables},
       %{tables: tables} = result
     )
     when is_list(tables) and is_integer(max_tables) do
  %{result | tables: Enum.take(tables, max_tables)}
end

defp apply_postprocessing_step(_, result), do: result

# Chunks are handled as plain binaries or as maps carrying their text under
# "content" / :content. Calling byte_size/1 on a chunk map, as before, raised
# ArgumentError.
defp empty_chunk?(chunk) when is_binary(chunk), do: chunk == ""
defp empty_chunk?(%{"content" => content}), do: content in [nil, ""]
defp empty_chunk?(%{content: content}), do: content in [nil, ""]
defp empty_chunk?(_chunk), do: false
# Delete the processed file only when it differs from the original and lives
# under /tmp/; otherwise do nothing.
defp cleanup_temp_files(processed_path, original_path) do
  temp_copy? =
    processed_path != original_path and String.starts_with?(processed_path, "/tmp/")

  if temp_copy?, do: File.rm(processed_path)
end
# Print a summary of the extraction. Optional collections appear only when
# present and non-empty; with `verbose` the full metadata is dumped too.
defp print_results(result, elapsed_ms, verbose) do
  has_items? = fn items -> items && !Enum.empty?(items) end

  IO.puts("Results:")
  IO.puts(" Content size: #{byte_size(result.content)} bytes")
  IO.puts(" Mime type: #{result.mime_type}")
  IO.puts(" Processing time: #{elapsed_ms}ms")

  if result.metadata, do: IO.puts(" Metadata entries: #{Enum.count(result.metadata)}")

  if has_items?.(result.tables), do: IO.puts(" Tables: #{length(result.tables)}")

  if has_items?.(result.chunks), do: IO.puts(" Chunks: #{length(result.chunks)}")

  if has_items?.(result.detected_languages) do
    IO.puts(" Languages: #{Enum.join(result.detected_languages, ", ")}")
  end

  if verbose && result.metadata do
    IO.puts("\n Full metadata:")
    IO.inspect(result.metadata, pretty: true, limit: :infinity)
  end

  IO.puts("")
end
end
@doc """
CLI entry point: parses `args` and runs the selected command, returning the
command's result.
"""
def main(args) do
  parsed = parse_args(args)
  execute(parsed)
end
# Split the raw arguments into `{options, positional_args}`; switches that
# OptionParser rejects are discarded.
defp parse_args(args) do
  switches = [
    config: :string,
    profile: :string,
    output: :string,
    verbose: :boolean,
    list: :boolean
  ]

  aliases = [c: :config, p: :profile, o: :output, v: :verbose, l: :list]

  {opts, positional, _invalid} = OptionParser.parse(args, switches: switches, aliases: aliases)
  {opts, positional}
end
# Dispatch on the first positional argument. Returns :ok or :error.

# No command at all.
defp execute({_opts, []}) do
  print_usage()
  :error
end

defp execute({opts, ["extract" | rest]}), do: execute_extract(rest, opts)

defp execute({opts, ["profiles" | _rest]}), do: execute_list_profiles(opts)

defp execute({_opts, ["help" | _rest]}) do
  print_help()
  :ok
end

# Anything else is an unknown command.
defp execute({_opts, [command | _rest]}) do
  IO.puts(:stderr, "Unknown command: #{command}")
  print_usage()
  :error
end
# Handle `extract <file>`: load the config file, run the extraction with the
# requested (or default) profile and optionally save the result as JSON.
#
# Returns :ok or :error. Failure reasons are printed with
# format_error_reason/1 so that non-string reasons (atoms, tuples, structs)
# cannot raise during string interpolation.
defp execute_extract(args, opts) do
  config_path = Keyword.get(opts, :config, "kreuzberg.yaml")
  profile = Keyword.get(opts, :profile, nil)
  output_path = Keyword.get(opts, :output, nil)
  verbose = Keyword.get(opts, :verbose, false)

  case ConfigFile.load(config_path) do
    {:ok, config_file} ->
      case args do
        [] ->
          IO.puts(:stderr, "Error: No file specified")
          :error

        [file_path | _] ->
          case Extractor.extract_with_profile(file_path, config_file, profile, verbose: verbose) do
            {:ok, result} ->
              if output_path, do: save_result(result, output_path)
              :ok

            {:error, reason} ->
              IO.puts(:stderr, "Extraction failed: #{format_error_reason(reason)}")
              :error
          end
      end

    {:error, reason} ->
      IO.puts(:stderr, "Configuration error: #{format_error_reason(reason)}")
      :error
  end
end

# Strings are shown as they are; any other term is rendered with inspect/1.
defp format_error_reason(reason) when is_binary(reason), do: reason
defp format_error_reason(reason), do: inspect(reason)
# Handle `profiles`: print the profile names defined in the config file.
# Returns :ok, or :error when the config cannot be loaded.
defp execute_list_profiles(opts) do
  loaded =
    opts
    |> Keyword.get(:config, "kreuzberg.yaml")
    |> ConfigFile.load()

  case loaded do
    {:error, reason} ->
      IO.puts(:stderr, "Configuration error: #{reason}")
      :error

    {:ok, config_file} ->
      names = ConfigFile.list_profiles(config_file)
      IO.puts("Available profiles:")
      Enum.each(names, &IO.puts(" - #{&1}"))
      :ok
  end
end
# Write the extraction result to `output_path` as pretty-printed JSON and
# report the outcome. Missing collections are written as empty lists.
defp save_result(result, output_path) do
  payload = %{
    extracted_at: DateTime.utc_now(),
    content: result.content,
    mime_type: result.mime_type,
    metadata: result.metadata,
    tables: result.tables || [],
    chunks: result.chunks || [],
    images: result.images || [],
    detected_languages: result.detected_languages || []
  }

  json = Jason.encode!(payload, pretty: true)

  with :ok <- File.write(output_path, json) do
    IO.puts("Results saved to: #{output_path}")
  else
    {:error, reason} ->
      IO.puts(:stderr, "Failed to save results: #{inspect(reason)}")
  end
end
# One-line usage reminder shown when no command or an unknown one is given.
defp print_usage, do: IO.puts("Usage: kreuzberg-cli <command> [options] [args]")
# Print the full help text (commands, options and a sample config file) to
# stdout. The heredoc is emitted verbatim, so its layout is user-visible.
# NOTE(review): the YAML sample below has no nesting indentation as shown
# here; confirm that the indentation is intact in the real file.
defp print_help do
IO.puts("""
Kreuzberg Advanced CLI with Configuration
USAGE:
kreuzberg extract <file> [OPTIONS]
kreuzberg profiles [OPTIONS]
kreuzberg help
COMMANDS:
extract <file> Extract with configured profile
profiles List available profiles
help Show this help message
OPTIONS:
-c, --config <path> Config file path (default: kreuzberg.yaml)
-p, --profile <name> Profile name (default: from config)
-o, --output <path> Save results to JSON
-v, --verbose Verbose output
CONFIG FILE EXAMPLE:
default_profile: "standard"
cache_enabled: true
cache_dir: "/tmp/kreuzberg_cache"
profiles:
standard:
name: "Standard Extraction"
ocr:
enabled: false
chunking:
enabled: true
max_characters: 1000
overlap: 100
ocr_intensive:
name: "OCR + Language Detection"
ocr:
enabled: true
backend: tesseract
language_detection:
enabled: true
preprocessing:
- type: deskew
- type: rotate
degrees: 90
""")
end
end
# Script entry point: run the CLI with the process arguments and exit with
# reason 1 when the command reports failure.
status = KreuzbergAdvancedCLI.main(System.argv())

case status do
  :error -> exit(1)
  :ok -> IO.puts("\nDone.")
end
```

View File

@@ -0,0 +1,53 @@
```elixir title="Elixir"
# Full configuration example: every feature section is spelled out, encoded
# to JSON and passed to the synchronous extraction call.
settings = %{
  "use_cache" => true,
  "enable_quality_processing" => true,
  "ocr" => %{
    "backend" => "tesseract",
    "language" => "eng"
  },
  "chunking" => %{
    "max_characters" => 1000,
    "overlap" => 200,
    "embedding" => %{
      "model" => %{"preset" => %{"name" => "balanced"}},
      "batch_size" => 32,
      "normalize" => true,
      "show_download_progress" => false
    }
  },
  "language_detection" => %{
    "enabled" => true,
    "min_confidence" => 0.8,
    "detect_multiple" => false
  },
  "keywords" => %{
    "algorithm" => "Yake",
    "max_keywords" => 10,
    "min_score" => 0.1,
    "ngram_range" => [1, 3],
    "language" => "en"
  },
  "token_reduction" => %{
    "mode" => "moderate",
    "preserve_important_words" => true
  },
  "postprocessor" => %{"enabled" => true}
}

config_json = Jason.encode!(settings)

{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
IO.puts("Content: #{result.content}")

# Languages are only reported when detection produced something.
if result.detected_languages do
  IO.puts("Languages: #{inspect(result.detected_languages)}")
end

# A missing chunk list counts as zero chunks.
chunks_count = length(result.chunks || [])
IO.puts("Chunks: #{chunks_count}")
```

View File

@@ -0,0 +1,15 @@
```elixir title="Elixir"
# Chunking example: split the document into chunks of at most 1000
# characters with 200 characters of overlap.
config_json = Jason.encode!(%{
  "chunking" => %{
    "max_characters" => 1000,
    "overlap" => 200
  }
})

{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)

# `chunks` may be nil (other examples in this file guard for that), and
# length/1 raises on nil, so fall back to an empty list.
chunks = result.chunks || []
IO.puts("Chunks: #{length(chunks)}")

# NOTE(review): other examples read chunk text as chunk["content"]; confirm
# which access form matches the chunk type returned here.
Enum.each(chunks, fn chunk ->
  IO.puts("Length: #{String.length(chunk.content)}")
end)
```

View File

@@ -0,0 +1,9 @@
```elixir title="Elixir"
# Minimal configuration: caching and quality processing switched on.
settings = %{
  "use_cache" => true,
  "enable_quality_processing" => true
}

config_json = Jason.encode!(settings)

{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
IO.puts(result.content)
```

View File

@@ -0,0 +1,5 @@
```elixir title="Elixir"
# Passing nil as the config makes the call use the discovered/default
# configuration.
mime_type = "application/pdf"
{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", mime_type, nil)
IO.puts(result.content)
```

View File

@@ -0,0 +1,12 @@
```elixir title="Elixir"
# OCR example: run Tesseract (English) on a scanned PDF.
config_json = Jason.encode!(%{
  "ocr" => %{
    "backend" => "tesseract",
    "language" => "eng"
  }
})

{:ok, result} = Kreuzberg.extract_file_sync("scanned.pdf", "application/pdf", config_json)
IO.puts("Content length: #{String.length(result.content)}")

# `tables` may be nil (the CLI examples guard for that); length/1 would raise.
IO.puts("Tables detected: #{length(result.tables || [])}")
```

View File

@@ -0,0 +1,20 @@
```elixir title="Elixir"
# Combined example: OCR for English + German with a Tesseract page
# segmentation mode, plus chunking, caching and quality processing.
ocr_settings = %{
  "backend" => "tesseract",
  "language" => "eng+deu",
  "tesseract_config" => %{"psm" => 6}
}

chunking_settings = %{
  "max_characters" => 1000,
  "overlap" => 200
}

config_json =
  Jason.encode!(%{
    "use_cache" => true,
    "ocr" => ocr_settings,
    "chunking" => chunking_settings,
    "enable_quality_processing" => true
  })

{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
IO.puts("Content length: #{String.length(result.content)}")
```

View File

@@ -0,0 +1,13 @@
```elixir title="Document Structure Config (Elixir)"
# Document structure example: request the structured document tree and print
# the type of every node.
config = %Kreuzberg.ExtractionConfig{
  include_document_structure: true
}

# The config goes in the third position; the second is the MIME type (nil
# here), matching the (path, mime_type, config) order every other call in
# these examples uses.
{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", nil, config)

if result.document do
  Enum.each(result.document.nodes, fn node ->
    IO.puts("[#{node.content.node_type}]")
  end)
end
```

View File

@@ -0,0 +1,36 @@
```elixir title="Element-Based Output (Elixir)"
# Configure element-based output
config = %Kreuzberg.ExtractionConfig{
  output_format: :element_based
}

# Extract document. The config goes in the third position; the second is the
# MIME type (nil here), matching the (path, mime_type, config) order every
# other call in these examples uses.
{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", nil, config)

# `elements` is guarded like the other optional collections in these examples.
elements = result.elements || []

# Access elements
Enum.each(elements, fn element ->
  IO.puts("Type: #{element.element_type}")
  text = String.slice(element.text, 0, 100)
  IO.puts("Text: #{text}")

  if element.metadata.page_number do
    IO.puts("Page: #{element.metadata.page_number}")
  end

  if element.metadata.coordinates do
    coords = element.metadata.coordinates
    IO.puts("Coords: (#{coords.left}, #{coords.top}) - (#{coords.right}, #{coords.bottom})")
  end

  IO.puts("---")
end)

# Filter by element type
titles = Enum.filter(elements, fn e -> e.element_type == :title end)

Enum.each(titles, fn title ->
  level = Map.get(title.metadata.additional, "level", "unknown")
  IO.puts("[#{level}] #{title.text}")
end)
```

View File

@@ -0,0 +1,22 @@
```elixir title="Elixir"
# Embedding example: chunk the document and request embeddings from the
# "balanced" preset model, 16 chunks per batch.
embedding_settings = %{
  "model" => %{"preset" => %{"name" => "balanced"}},
  "batch_size" => 16,
  "normalize" => true,
  "show_download_progress" => true
}

config_json =
  Jason.encode!(%{
    "chunking" => %{
      "max_characters" => 1000,
      "overlap" => 200,
      "embedding" => embedding_settings
    }
  })

{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)

# A missing chunk list counts as zero chunks.
chunks_count = length(result.chunks || [])
IO.puts("Chunks with embeddings: #{chunks_count}")
```

View File

@@ -0,0 +1,11 @@
```elixir title="Elixir"
# HTML output example: render the extracted content as HTML with the
# "GitHub" theme.
settings = %{
  "output_format" => "Html",
  "html_output" => %{"theme" => "GitHub"}
}

config_json = Jason.encode!(settings)

{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
IO.puts(result.content)
```

View File

@@ -0,0 +1,14 @@
```elixir title="Elixir"
# Keyword extraction example: YAKE, up to 10 keywords, 1- to 3-word n-grams.
keyword_settings = %{
  "algorithm" => "Yake",
  "max_keywords" => 10,
  "min_score" => 0.1,
  "ngram_range" => [1, 3],
  "language" => "en"
}

config_json = Jason.encode!(%{"keywords" => keyword_settings})

{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
IO.puts("Keywords: #{inspect(result.keywords)}")
```

View File

@@ -0,0 +1,13 @@
```elixir title="Elixir"
# Language detection example: detect multiple languages with a minimum
# confidence of 0.8.
detection_settings = %{
  "enabled" => true,
  "min_confidence" => 0.8,
  "detect_multiple" => true
}

config_json = Jason.encode!(%{"language_detection" => detection_settings})

{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)

# NOTE(review): other examples in this file read `result.detected_languages`;
# confirm that `language` and `language_confidence` exist on the result.
IO.puts("Detected language: #{result.language}")
IO.puts("Confidence: #{result.language_confidence}")
```

View File

@@ -0,0 +1,15 @@
```elixir title="Elixir"
# Image extraction example: extract embedded images at a 300 DPI target,
# letting the DPI auto-adjust between 150 and 600.
config_json = Jason.encode!(%{
  "images" => %{
    "extract_images" => true,
    "target_dpi" => 300,
    "max_image_dimension" => 4096,
    "auto_adjust_dpi" => true,
    "min_dpi" => 150,
    "max_dpi" => 600
  }
})

{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)

# `images` may be nil (the CLI examples guard for that); length/1 would raise.
IO.puts("Extracted images: #{length(result.images || [])}")
```

View File

@@ -0,0 +1,14 @@
```elixir title="Elixir"
# PDF options example: supply a password for an encrypted PDF and extract
# its images and metadata.
pdf_settings = %{
  "extract_images" => true,
  "passwords" => ["password123"],
  "extract_metadata" => true,
  "hierarchy" => %{}
}

config_json = Jason.encode!(%{"pdf_options" => pdf_settings})

{:ok, result} = Kreuzberg.extract_file_sync("encrypted.pdf", "application/pdf", config_json)
metadata = result.metadata
IO.puts("Title: #{inspect(metadata.title)}")
IO.puts("Authors: #{inspect(metadata.authors)}")
```

View File

@@ -0,0 +1,16 @@
```elixir title="Elixir"
# PDF hierarchy example: enable heading-hierarchy detection for levels 1-5.
hierarchy_settings = %{
  "enabled" => true,
  "detection_threshold" => 0.75,
  "ocr_coverage_threshold" => 0.8,
  "min_level" => 1,
  "max_level" => 5
}

config_json = Jason.encode!(%{"pdf_options" => %{"hierarchy" => hierarchy_settings}})

{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)

# NOTE(review): assumes `result.hierarchy` is a list - confirm against the
# result type.
hierarchy = result.hierarchy
IO.puts("Hierarchy levels: #{length(hierarchy)}")
```

View File

@@ -0,0 +1,14 @@
```elixir title="Elixir"
# Post-processor example: enable post-processing restricted to the
# whitespace and unicode normalizers.
processors = ["whitespace_normalizer", "unicode_normalizer"]

config_json =
  Jason.encode!(%{
    "postprocessor" => %{
      "enabled" => true,
      "enabled_processors" => processors
    }
  })

{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
IO.puts("Processed content: #{result.content}")
```

View File

@@ -0,0 +1,10 @@
```elixir title="Elixir"
# Quality processing example: enable quality processing together with caching.
settings = %{
  "enable_quality_processing" => true,
  "use_cache" => true
}

config_json = Jason.encode!(settings)

{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)

# NOTE(review): `quality_score` and `processing_time` are not read anywhere
# else in these examples - confirm they exist on the result.
IO.puts("Quality score: #{result.quality_score}")
IO.puts("Processing time: #{inspect(result.processing_time)}")
```

View File

@@ -0,0 +1,15 @@
```elixir title="Elixir"
# Tesseract tuning example: English + German OCR with explicit page
# segmentation ("psm") and engine ("oem") modes.
tesseract_settings = %{
  "psm" => 6,
  "oem" => 3
}

config_json =
  Jason.encode!(%{
    "ocr" => %{
      "backend" => "tesseract",
      "language" => "eng+deu",
      "tesseract_config" => tesseract_settings
    }
  })

{:ok, result} = Kreuzberg.extract_file_sync("scanned.pdf", "application/pdf", config_json)
IO.puts("OCR text: #{result.content}")
```

View File

@@ -0,0 +1,12 @@
```elixir title="Elixir"
# Settings sent under the "token_reduction" key of the JSON configuration.
token_reduction = %{"mode" => "moderate", "preserve_important_words" => true}
config_json = Jason.encode!(%{"token_reduction" => token_reduction})

{:ok, extraction} =
  Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)

IO.puts("Original tokens: #{extraction.token_count}")
IO.puts("Reduced content: #{extraction.content}")
```

View File

@@ -0,0 +1,78 @@
```elixir title="Elixir"
alias Kreuzberg.ExtractionConfig

# Example 1: Basic character-based chunking for RAG applications
config = %ExtractionConfig{
  chunking: %{
    "enabled" => true,
    "max_characters" => 1000,
    "overlap" => 100,
    "min_size" => 200,
    "respect_boundaries" => true
  }
}

{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)

# Access chunks (guarded: nothing is printed when `chunks` is nil)
if result.chunks do
  IO.puts("Generated #{length(result.chunks)} chunks")

  Enum.each(result.chunks, fn chunk ->
    IO.puts("Chunk: #{String.slice(chunk["content"], 0..50)}...")
  end)
end

# Example 2: Markdown chunker with token-based sizing and heading context
config2 = %ExtractionConfig{
  chunking: %{
    "enabled" => true,
    "chunker_type" => "markdown",
    "sizing" => %{
      "type" => "tokenizer",
      "model" => "Xenova/gpt-4o"
    }
  }
}

{:ok, result2} = Kreuzberg.extract_file("document.md", nil, config2)

if result2.chunks do
  IO.puts("Generated #{length(result2.chunks)} markdown chunks")

  Enum.each(result2.chunks, fn chunk ->
    IO.puts("\nChunk preview: #{String.slice(chunk["content"], 0..60)}...")

    # Access heading context. One pattern match replaces the nested is_map/1
    # checks; any other metadata shape simply yields no headings.
    headings =
      case chunk["metadata"] do
        %{"heading_context" => %{"headings" => found}} when is_list(found) -> found
        _other -> []
      end

    # Compare against [] instead of length/1, which walks the whole list.
    if headings != [] do
      IO.puts(" Headings in context:")

      Enum.each(headings, fn heading ->
        IO.puts(" - Level #{heading["level"]}: #{heading["text"]}")
      end)
    end
  end)
end

# Example 3: Prepend heading context to chunk content
config3 = %ExtractionConfig{
  chunking: %{
    "enabled" => true,
    "chunker_type" => "markdown",
    "prepend_heading_context" => true
  }
}

{:ok, result3} = Kreuzberg.extract_file("document.md", nil, config3)

if result3.chunks do
  IO.puts("Generated #{length(result3.chunks)} chunks with prepended headings")

  Enum.each(result3.chunks, fn chunk ->
    # Each chunk's content is prefixed with its heading breadcrumb
    IO.puts("\nChunk preview: #{String.slice(chunk["content"], 0..80)}...")
  end)
end
```

View File

@@ -0,0 +1,17 @@
```elixir title="Elixir"
alias Kreuzberg.ExtractionConfig

# One configuration combining OCR, chunking, language detection and caching.
config = %ExtractionConfig{
  ocr: %{"enabled" => true, "backend" => "tesseract"},
  chunking: %{"max_characters" => 1000, "overlap" => 100},
  language_detection: %{"enabled" => true},
  use_cache: true,
  force_ocr: false
}

{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)

# String.length/1 counts characters (graphemes); byte_size/1 would report bytes,
# which differs for any non-ASCII text and contradicts the label.
IO.puts("Content length: #{String.length(result.content)} characters")
IO.puts("Detected languages: #{inspect(result.detected_languages)}")
# `chunks` may be nil, so fall back to an empty list before counting.
IO.puts("Chunks: #{length(result.chunks || [])}")
```

View File

@@ -0,0 +1,26 @@
```elixir title="Elixir"
alias Kreuzberg.ExtractionConfig

# Load configuration from file
# Reads ~/.kreuzberg/config.json when it exists and falls back to the struct
# defaults otherwise.
config_path = Path.expand("~/.kreuzberg/config.json")

# Check once, so the loaded configuration and the reported source cannot
# disagree if the file appears or disappears in between.
config_file? = File.exists?(config_path)

config =
  if config_file? do
    config_data = config_path |> File.read!() |> Jason.decode!()

    # Translate JSON keys through the struct's own field names instead of calling
    # String.to_atom/1 on file contents: atoms are never garbage collected, so
    # arbitrary keys must not be turned into new atoms. Unknown keys are dropped,
    # exactly as struct/2 would have dropped them.
    field_by_name =
      %ExtractionConfig{}
      |> Map.from_struct()
      |> Map.new(fn {field, _default} -> {Atom.to_string(field), field} end)

    fields =
      config_data
      |> Map.take(Map.keys(field_by_name))
      |> Map.new(fn {name, value} -> {Map.fetch!(field_by_name, name), value} end)

    struct(ExtractionConfig, fields)
  else
    IO.puts("Config file not found at #{config_path}. Using defaults.")
    %ExtractionConfig{}
  end

IO.puts("Configuration Source: #{if config_file?, do: "#{config_path} (file)", else: "defaults"}")
IO.puts("OCR Enabled: #{inspect(config.ocr["enabled"])}")
IO.puts("Chunking Max Characters: #{inspect(config.chunking["max_characters"])}")
IO.puts("Use Cache: #{inspect(config.use_cache)}")

{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)

IO.puts("Document extraction complete")
IO.puts("Content length: #{byte_size(result.content)} bytes")
IO.puts("Languages detected: #{inspect(result.detected_languages)}")
```

View File

@@ -0,0 +1,28 @@
```elixir title="Elixir"
alias Kreuzberg.ExtractionConfig

# Configure OCR-specific settings
# Includes backend selection, language settings, and preprocessing options
ocr_settings = %{
  "enabled" => true,
  "backend" => "tesseract",
  "language" => "eng",
  "preprocessing" => true
}

chunking_settings = %{"max_characters" => 2000, "overlap" => 200}

config = %ExtractionConfig{
  ocr: ocr_settings,
  chunking: chunking_settings,
  use_cache: true,
  force_ocr: false
}

{:ok, %{content: content, metadata: metadata}} =
  Kreuzberg.extract_file("scanned_document.pdf", nil, config)

# Fixed banner lines first, then the values taken from the result.
Enum.each(
  ["OCR Configuration Applied:", "Backend: tesseract", "Language: eng"],
  &IO.puts/1
)

IO.puts("Content extracted: #{byte_size(content)} bytes")
IO.puts("Metadata: #{inspect(metadata)}")
```

View File

@@ -0,0 +1,44 @@
```elixir title="Elixir"
alias Kreuzberg.ExtractionConfig

# Build configuration dynamically based on runtime conditions
# Useful for environment-specific settings and feature flags
defmodule ConfigBuilder do
  @doc """
  Returns an `ExtractionConfig` for `file_type`, with OCR switched on or off.

  Every result starts from the same base (chunking plus caching). Only the
  combinations of `:pdf` / `:image` with `true` / `false` change the OCR
  fields; any other input returns the base configuration untouched.
  """
  def build_config(file_type, enable_ocr?) do
    with_ocr(base_config(), file_type, enable_ocr?)
  end

  # Settings shared by every configuration this module produces.
  defp base_config do
    %ExtractionConfig{
      chunking: %{"max_characters" => 1000, "overlap" => 100},
      use_cache: true
    }
  end

  # One clause per {file_type, enable_ocr?} combination.
  defp with_ocr(config, :pdf, true) do
    %{config | ocr: %{"enabled" => true, "backend" => "tesseract"}, force_ocr: true}
  end

  defp with_ocr(config, :image, true) do
    %{
      config
      | ocr: %{"enabled" => true, "backend" => "tesseract", "preprocessing" => true},
        force_ocr: true
    }
  end

  defp with_ocr(config, file_type, false) when file_type in [:pdf, :image] do
    %{config | ocr: %{"enabled" => false}}
  end

  defp with_ocr(config, _file_type, _enable_ocr?), do: config
end

# Build configuration based on file type and requirements
config = ConfigBuilder.build_config(:pdf, true)

{:ok, %{content: content}} = Kreuzberg.extract_file("document.pdf", nil, config)

IO.puts("Dynamic configuration applied")
IO.puts("Content: #{String.slice(content, 0..100)}")
```

View File

@@ -0,0 +1,20 @@
```elixir title="Elixir"
alias Kreuzberg.ExtractionConfig

# Configure embeddings for vector search
chunking = %{"enabled" => true, "max_characters" => 512, "overlap" => 50}

embeddings = %{
  "enabled" => true,
  "model" => "sentence-transformers/all-MiniLM-L6-v2"
}

config = %ExtractionConfig{chunking: chunking, embeddings: embeddings}

{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)

# A nil `chunks` field is counted as zero chunks.
chunk_count = length(result.chunks || [])
IO.puts("Extracted chunks with embeddings: #{chunk_count}")
```

View File

@@ -0,0 +1,23 @@
```elixir title="Elixir"
alias Kreuzberg.ExtractionConfig

# Configure keyword extraction
keyword_settings = %{"enabled" => true, "max_keywords" => 10, "min_score" => 0.5}
config = %ExtractionConfig{keyword_extraction: keyword_settings}

{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)

# Access extracted keywords (nothing is printed when the field is nil)
keywords = result.keywords

if keywords do
  IO.puts("Extracted #{length(keywords)} keywords")

  for keyword <- keywords do
    IO.puts("#{keyword["text"]}: #{keyword["score"]}")
  end
end
```

View File

@@ -0,0 +1,23 @@
```elixir title="Elixir"
alias Kreuzberg.ExtractionConfig

# Configure language detection for multilingual documents
# Automatically detect all languages present in the content
language_detection = %{"enabled" => true, "detect_all" => true}
chunking = %{"max_characters" => 1000, "overlap" => 100}

config = %ExtractionConfig{
  language_detection: language_detection,
  chunking: chunking,
  use_cache: true
}

{:ok, %{content: content, detected_languages: languages}} =
  Kreuzberg.extract_file("multilingual.pdf", nil, config)

IO.puts("Detected Languages:")
IO.inspect(languages)
IO.puts("Content: #{String.slice(content, 0..100)}")
```

View File

@@ -0,0 +1,25 @@
```elixir title="Elixir"
alias Kreuzberg.ExtractionConfig

# Configure PDF-specific extraction options
# Extract annotations, metadata, and flatten forms for comprehensive document processing
pdf_settings = %{
  "extract_annotations" => true,
  "extract_metadata" => true,
  "flatten_forms" => true
}

config = %ExtractionConfig{
  pdf: pdf_settings,
  chunking: %{"max_characters" => 1500, "overlap" => 150},
  use_cache: true
}

{:ok, result} = Kreuzberg.extract_file("form_document.pdf", nil, config)

# A missing "annotations" entry is treated as an empty map.
annotations = result.metadata["annotations"] || %{}

IO.puts("PDF Extraction Complete:")
IO.puts("Content length: #{byte_size(result.content)} bytes")
IO.puts("Metadata: #{inspect(result.metadata)}")
IO.puts("Annotations present: #{map_size(annotations) > 0}")
```

View File

@@ -0,0 +1,29 @@
```elixir title="Elixir"
alias Kreuzberg.ExtractionConfig

# Configure post-processing pipeline
# Applies transformations to extracted content after initial processing
postprocessing = %{
  "remove_whitespace" => true,
  "normalize_unicode" => true,
  "fix_encoding" => true
}

config = %ExtractionConfig{
  postprocessing: postprocessing,
  ocr: %{"enabled" => true, "backend" => "tesseract"},
  use_cache: true,
  force_ocr: false
}

{:ok, %{content: content, metadata: metadata}} =
  Kreuzberg.extract_file("document.pdf", nil, config)

# Fixed banner lines echoing the settings chosen above.
Enum.each(
  [
    "Post-Processing Configuration Applied:",
    "Remove Whitespace: true",
    "Normalize Unicode: true",
    "Fix Encoding: true"
  ],
  &IO.puts/1
)

IO.puts("Original content length: #{byte_size(content)} bytes")
IO.puts("Processed content: #{String.slice(content, 0..100)}...")
IO.puts("Metadata: #{inspect(metadata)}")
```

View File

@@ -0,0 +1,30 @@
```elixir title="Elixir"
alias Kreuzberg.ExtractionConfig

# Configure quality processing settings
# Enable noise removal and set minimum quality thresholds for extracted content
quality_processing = %{
  "enabled" => true,
  "min_quality_score" => 0.7,
  "remove_noise" => true
}

config = %ExtractionConfig{
  quality_processing: quality_processing,
  ocr: %{"enabled" => true, "backend" => "tesseract"},
  chunking: %{"max_characters" => 1000, "overlap" => 100},
  use_cache: true
}

{:ok, extraction} = Kreuzberg.extract_file("noisy_document.pdf", nil, config)

IO.puts("Quality Processing Applied:")
IO.puts("Content quality score: #{extraction.quality_score}")
IO.puts("Noise removed: true")
IO.puts("Content length: #{byte_size(extraction.content)} bytes")
# Dumps the whole result for inspection.
IO.puts("Processing complete: #{inspect(extraction)}")
```

View File

@@ -0,0 +1,29 @@
```elixir title="Elixir"
alias Kreuzberg.ExtractionConfig

# Configure Tesseract OCR settings
# Includes language selection, page segmentation mode, OCR engine mode, and DPI settings
tesseract = %{
  "enabled" => true,
  "backend" => "tesseract",
  "language" => "eng",
  "psm" => 3,
  "oem" => 3,
  "dpi" => 300
}

config = %ExtractionConfig{ocr: tesseract, use_cache: true, force_ocr: false}

{:ok, %{content: content, metadata: metadata}} =
  Kreuzberg.extract_file("document.pdf", nil, config)

# Fixed banner lines echoing the settings chosen above.
Enum.each(
  [
    "Tesseract Configuration Applied:",
    "Backend: tesseract",
    "Language: eng",
    "PSM (Page Segmentation Mode): 3",
    "OEM (OCR Engine Mode): 3",
    "DPI: 300"
  ],
  &IO.puts/1
)

IO.puts("Content extracted: #{byte_size(content)} bytes")
IO.puts("Metadata: #{inspect(metadata)}")
```

View File

@@ -0,0 +1,28 @@
```elixir title="Elixir"
alias Kreuzberg.ExtractionConfig

# Configure token reduction for LLM context windows
# Helps manage token usage when working with large language models
token_reduction = %{
  "enabled" => true,
  "target_tokens" => 4000,
  "strategy" => "truncate"
}

config = %ExtractionConfig{
  token_reduction: token_reduction,
  ocr: %{"enabled" => true, "backend" => "tesseract"},
  use_cache: true
}

{:ok, %{content: content, metadata: metadata}} =
  Kreuzberg.extract_file("large_document.pdf", nil, config)

# Fixed banner lines echoing the settings chosen above.
Enum.each(
  [
    "Token Reduction Configuration Applied:",
    "Token Reduction Enabled: true",
    "Target Tokens: 4000",
    "Strategy: truncate"
  ],
  &IO.puts/1
)

IO.puts("Content extracted: #{byte_size(content)} bytes")
# Bracket access yields nil rather than raising when the key is absent.
IO.puts("Tokens reduced: #{inspect(metadata[:token_reduction_applied])}")
IO.puts("Final metadata: #{inspect(metadata)}")
```

View File

@@ -0,0 +1,54 @@
```elixir title="Elixir"
# Extract from multiple binary inputs in a batch operation
# Useful for processing multiple documents in memory

# Prepare binary data from multiple sources
{:ok, pdf_data_1} = File.read("document1.pdf")
{:ok, pdf_data_2} = File.read("document2.pdf")
{:ok, pdf_data_3} = File.read("document3.pdf")
data_list = [pdf_data_1, pdf_data_2, pdf_data_3]

# Option 1: Use single MIME type for all inputs
{:ok, results} = Kreuzberg.batch_extract_bytes(data_list, "application/pdf")

# Process results
Enum.each(results, fn result ->
  # String.length/1 counts characters; byte_size/1 would report bytes and
  # contradict the label for non-ASCII content.
  IO.puts("Content length: #{String.length(result.content)} characters")
  IO.puts("MIME type: #{result.mime_type}")
  IO.puts("Tables found: #{length(result.tables)}")
  IO.puts("---")
end)

IO.puts("Total documents processed: #{length(results)}")

# Option 2: Use different MIME types for each input
# NOTE(review): all three inputs above are read from .pdf files, so the
# "text/html" entry only illustrates the call shape - confirm each entry matches
# its input before copying this.
mime_types = ["application/pdf", "text/html", "application/pdf"]
# The results are not used here; the underscore prefix avoids an unused-variable warning.
{:ok, _mixed_results} = Kreuzberg.batch_extract_bytes(data_list, mime_types)

# Option 3: Batch extraction with configuration
config = %Kreuzberg.ExtractionConfig{
  ocr: %{"enabled" => true, "backend" => "tesseract"},
  extract_images: true
}

case Kreuzberg.batch_extract_bytes(data_list, "application/pdf", config) do
  {:ok, results} ->
    IO.puts("Successfully extracted #{length(results)} documents")

    Enum.each(results, fn result ->
      IO.puts("Content: #{String.slice(result.content, 0..100)}...")
    end)

  {:error, reason} ->
    IO.puts("Batch extraction failed: #{reason}")
end

# Option 4: Using the bang variant (raises on error)
try do
  results = Kreuzberg.batch_extract_bytes!(data_list, "application/pdf")
  IO.puts("Extracted #{length(results)} documents successfully")
rescue
  error in Kreuzberg.Error ->
    # Exception.message/1 goes through the exception's own message/1 callback
    # instead of assuming a :message field exists.
    IO.puts("Error: #{Exception.message(error)}")
end
```

View File

@@ -0,0 +1,14 @@
```elixir title="Elixir"
# Extract several files in one batch call and summarise each result.
file_paths = ["document1.pdf", "document2.pdf", "document3.pdf"]
{:ok, results} = Kreuzberg.batch_extract_files(file_paths)

Enum.each(results, fn result ->
  # NOTE(review): this line prints the MIME type under the label "File:".
  IO.puts("File: #{result.mime_type}")
  # String.length/1 counts characters; byte_size/1 would report bytes and
  # contradict the label for non-ASCII content.
  IO.puts("Content length: #{String.length(result.content)} characters")
  IO.puts("Tables: #{length(result.tables)}")
  IO.puts("---")
end)

IO.puts("Total files processed: #{length(results)}")
```

View File

@@ -0,0 +1,163 @@
```elixir title="Elixir"
# Client wrapper for batch file extraction
# Provides error handling, logging, and result aggregation
defmodule BatchDocumentClient do
  @moduledoc """
  Client wrapper for batch file document extraction.

  Wraps `Kreuzberg.batch_extract_files/3` with debug logging, aggregate
  statistics, and per-result transformation.
  """

  # Logger.debug/1 is a macro, so Logger must be required. The `IO` module has
  # no `debug/1` function; calling it raises UndefinedFunctionError.
  require Logger

  alias Kreuzberg.ExtractionResult

  @doc """
  Extract content from multiple files in batch.

  Passes the paths to `Kreuzberg.batch_extract_files/3` and returns its
  `{:ok, results}` or `{:error, reason}` unchanged, logging the outcome at
  debug level.

  ## Options

    * `:mime_type` - MIME type for all files (optional, `nil` when omitted)
    * `:config` - ExtractionConfig struct with options (optional)
    * `:log_errors` - Whether to log extraction errors (default: true)

  ## Examples

      {:ok, results} = BatchDocumentClient.extract_files(
        ["doc1.pdf", "doc2.pdf", "doc3.pdf"],
        mime_type: "application/pdf"
      )
  """
  @spec extract_files([String.t()], keyword()) ::
          {:ok, [ExtractionResult.t()]} | {:error, String.t()}
  def extract_files(paths, opts \\ []) do
    mime_type = Keyword.get(opts, :mime_type)
    config = Keyword.get(opts, :config)
    log_errors? = Keyword.get(opts, :log_errors, true)

    case Kreuzberg.batch_extract_files(paths, mime_type, config) do
      {:ok, results} ->
        Logger.debug("Successfully extracted #{length(results)} files")
        {:ok, results}

      {:error, reason} ->
        if log_errors?, do: Logger.debug("Batch extraction error: #{reason}")
        {:error, reason}
    end
  end

  @doc """
  Extract files and return detailed statistics.

  On success returns `{:ok, stats}`, where `stats` holds the file count, summed
  content size in bytes, table and image counts, the elapsed wall-clock time in
  milliseconds, the average time per file, and the raw results under `:results`.
  Errors from `extract_files/2` are returned unchanged.
  """
  @spec extract_files_with_stats([String.t()], keyword()) ::
          {:ok, map()} | {:error, String.t()}
  def extract_files_with_stats(paths, opts \\ []) do
    start_time = System.monotonic_time(:millisecond)

    with {:ok, results} <- extract_files(paths, opts) do
      elapsed_ms = System.monotonic_time(:millisecond) - start_time
      file_count = length(results)

      {:ok,
       %{
         total_files: file_count,
         total_content_size: sum_by(results, &byte_size(&1.content)),
         total_tables: sum_by(results, &length(&1.tables)),
         # `images` may be nil, so count it as an empty list.
         total_images: sum_by(results, &length(&1.images || [])),
         processing_time_ms: elapsed_ms,
         # max/2 guards against dividing by zero for an empty batch.
         avg_time_per_file_ms: div(elapsed_ms, max(file_count, 1)),
         results: results
       }}
    end
  end

  @doc """
  Extract files and transform results.

  Applies `transform_fn` to every extraction result. Each exception raised by
  the function is logged at debug level; if any call raised, the first failure
  is returned as `{:error, "Transform failed: ..."}`. Otherwise returns
  `{:ok, transformed}` in the same order as the results.
  """
  @spec extract_and_transform([String.t()], function(), keyword()) ::
          {:ok, [any()]} | {:error, String.t()}
  def extract_and_transform(paths, transform_fn, opts \\ []) do
    with {:ok, results} <- extract_files(paths, opts) do
      results
      |> Enum.map(&safe_transform(transform_fn, &1))
      |> collect_transformed()
    end
  end

  # Sums `fun.(result)` over all results.
  defp sum_by(results, fun), do: Enum.reduce(results, 0, &(fun.(&1) + &2))

  # Runs the caller-supplied function, converting a raised exception into a
  # tagged error. The rescue is deliberately broad because the function is
  # caller-supplied code.
  defp safe_transform(transform_fn, result) do
    {:ok, transform_fn.(result)}
  rescue
    error ->
      Logger.debug("Transform error: #{inspect(error)}")
      {:error, error}
  end

  # Returns all transformed values, or the first failure if any transform raised.
  defp collect_transformed(outcomes) do
    case Enum.find(outcomes, &match?({:error, _}, &1)) do
      nil -> {:ok, Enum.map(outcomes, fn {:ok, value} -> value end)}
      {:error, error} -> {:error, "Transform failed: #{inspect(error)}"}
    end
  end
end
# Usage examples

# Extract multiple files
with {:ok, results} <- BatchDocumentClient.extract_files(["doc1.pdf", "doc2.pdf", "doc3.pdf"]) do
  for result <- results do
    IO.puts("Extracted: #{byte_size(result.content)} bytes")
  end
else
  {:error, reason} -> IO.puts("Error: #{reason}")
end

# Extract with statistics
with {:ok, stats} <- BatchDocumentClient.extract_files_with_stats(["doc1.pdf", "doc2.pdf"]) do
  IO.puts("Total files: #{stats.total_files}")
  IO.puts("Total size: #{stats.total_content_size} bytes")
  IO.puts("Processing time: #{stats.processing_time_ms}ms")
else
  {:error, reason} -> IO.puts("Error: #{reason}")
end

# Extract and transform
# Reduces each result to a small summary map.
transform = fn %{mime_type: mime, content: content, tables: tables} ->
  %{mime: mime, size: byte_size(content), tables: length(tables)}
end

with {:ok, transformed_results} <-
       BatchDocumentClient.extract_and_transform(["doc1.pdf", "doc2.pdf"], transform) do
  IO.inspect(transformed_results)
else
  {:error, reason} -> IO.puts("Error: #{reason}")
end
```

View File

@@ -0,0 +1,107 @@
```elixir title="Elixir"
# Reusable client pattern for single file extraction
# Encapsulates extraction logic with consistent error handling
defmodule DocumentClient do
  @moduledoc """
  Client wrapper for single file document extraction.

  Provides a consistent interface for extracting content from files.
  """

  # Logger.debug/1 is a macro, so Logger must be required. The `IO` module has
  # no `debug/1` function; calling it raises UndefinedFunctionError.
  require Logger

  alias Kreuzberg.ExtractionResult

  @doc """
  Extract content from a single file.

  Calls `Kreuzberg.extract_file/3` and returns its `{:ok, result}` or
  `{:error, reason}` unchanged, logging the outcome at debug level.

  ## Options

    * `:mime_type` - MIME type of the file (optional, `nil` when omitted)
    * `:config` - ExtractionConfig struct with options (optional)

  ## Examples

      {:ok, result} = DocumentClient.extract_file("document.pdf")
      {:ok, result} = DocumentClient.extract_file("document.pdf", mime_type: "application/pdf")
  """
  @spec extract_file(String.t(), keyword()) ::
          {:ok, ExtractionResult.t()} | {:error, String.t()}
  def extract_file(path, opts \\ []) do
    {mime_type, config} = extraction_args(opts)

    case Kreuzberg.extract_file(path, mime_type, config) do
      {:ok, result} ->
        Logger.debug("Successfully extracted file: #{path}")
        {:ok, result}

      {:error, reason} ->
        Logger.debug("Failed to extract file: #{path} - #{reason}")
        {:error, reason}
    end
  end

  @doc """
  Extract content from a file, raising on error.

  Delegates to `Kreuzberg.extract_file!/3` with the same options as
  `extract_file/2`.
  """
  @spec extract_file!(String.t(), keyword()) :: ExtractionResult.t()
  def extract_file!(path, opts \\ []) do
    {mime_type, config} = extraction_args(opts)
    Kreuzberg.extract_file!(path, mime_type, config)
  end

  @doc """
  Extract and process file content.

  On success returns `{:ok, map}` with the content, MIME type, metadata, table
  and image counts, and the elapsed wall-clock time in milliseconds. Errors
  from `extract_file/2` are returned unchanged.
  """
  @spec extract_with_stats(String.t(), keyword()) ::
          {:ok, map()} | {:error, String.t()}
  def extract_with_stats(path, opts \\ []) do
    start_time = System.monotonic_time(:millisecond)

    with {:ok, result} <- extract_file(path, opts) do
      elapsed_ms = System.monotonic_time(:millisecond) - start_time

      {:ok,
       %{
         content: result.content,
         mime_type: result.mime_type,
         metadata: result.metadata,
         table_count: length(result.tables),
         # `images` may be nil, so count it as an empty list.
         image_count: length(result.images || []),
         processing_time_ms: elapsed_ms
       }}
    end
  end

  # Reads the two options shared by every entry point; both default to nil.
  defp extraction_args(opts) do
    {Keyword.get(opts, :mime_type), Keyword.get(opts, :config)}
  end
end
# Usage examples
with {:ok, %{content: content}} <- DocumentClient.extract_file("document.pdf") do
  IO.puts("Content length: #{byte_size(content)} bytes")
else
  {:error, reason} -> IO.puts("Extraction failed: #{reason}")
end

# Extract with statistics
with {:ok, stats} <- DocumentClient.extract_with_stats("document.pdf") do
  IO.puts("Processing time: #{stats.processing_time_ms}ms")
  IO.puts("Tables found: #{stats.table_count}")
else
  {:error, reason} -> IO.puts("Error: #{reason}")
end
```

View File

@@ -0,0 +1,247 @@
```elixir title="Elixir"
# Client wrapper with comprehensive configuration
# Demonstrates advanced extraction patterns with OCR, chunking, and validation
defmodule ConfiguredDocumentClient do
  @moduledoc """
  Client wrapper for document extraction with advanced configuration.

  Supports OCR, chunking, language detection, and custom error handling.
  """

  alias Kreuzberg.{ExtractionConfig, ExtractionResult}

  @doc """
  Extract with OCR enabled for scanned documents.

  `input` is treated as a file path when it names an existing file; otherwise
  it is handed to `Kreuzberg.extract/3` as raw data, with the MIME type
  defaulting to `"application/pdf"`.

  ## Options

    * `:ocr_backend` - OCR backend name (default: `"tesseract"`)
    * `:force_ocr` - value for the config's `force_ocr` field (default: `false`)
    * `:mime_type` - MIME type of the input (optional)
  """
  @spec extract_with_ocr(String.t() | binary(), keyword()) ::
          {:ok, ExtractionResult.t()} | {:error, String.t()}
  def extract_with_ocr(input, opts \\ []) do
    config = %ExtractionConfig{
      ocr: %{
        "enabled" => true,
        "backend" => Keyword.get(opts, :ocr_backend, "tesseract")
      },
      force_ocr: Keyword.get(opts, :force_ocr, false)
    }

    mime_type = Keyword.get(opts, :mime_type)

    # NOTE(review): a path to a missing file falls into the raw-data branch and
    # is extracted as if the path string were the document bytes.
    if is_binary(input) and File.exists?(input) do
      Kreuzberg.extract_file(input, mime_type, config)
    else
      Kreuzberg.extract(input, mime_type || "application/pdf", config)
    end
  end

  @doc """
  Extract with text chunking for embedding or RAG pipelines.

  Returns `{:ok, map}` with the content, the chunk list (empty when the result
  has none), the chunk count, and the metadata.

  ## Options

    * `:chunk_size` - `"max_characters"` setting (default: 1000)
    * `:chunk_overlap` - `"overlap"` setting (default: 100)
    * `:mime_type` - MIME type of the file (optional)
  """
  @spec extract_with_chunking(String.t(), keyword()) ::
          {:ok, map()} | {:error, String.t()}
  def extract_with_chunking(path, opts \\ []) do
    config = %ExtractionConfig{chunking: chunking_settings(opts)}

    with {:ok, result} <- Kreuzberg.extract_file(path, Keyword.get(opts, :mime_type), config) do
      chunks = result.chunks || []

      {:ok,
       %{
         content: result.content,
         chunks: chunks,
         chunk_count: length(chunks),
         metadata: result.metadata
       }}
    end
  end

  @doc """
  Extract with language detection enabled.

  Also enables image extraction. Returns `{:ok, map}` with the content,
  detected languages, MIME type, tables, and images; nil language and image
  fields are replaced by empty lists.
  """
  @spec extract_with_language_detection(String.t(), keyword()) ::
          {:ok, map()} | {:error, String.t()}
  def extract_with_language_detection(path, opts \\ []) do
    config = %ExtractionConfig{
      language_detection: %{"enabled" => true},
      extract_images: true
    }

    with {:ok, result} <- Kreuzberg.extract_file(path, Keyword.get(opts, :mime_type), config) do
      {:ok,
       %{
         content: result.content,
         detected_languages: result.detected_languages || [],
         mime_type: result.mime_type,
         tables: result.tables,
         images: result.images || []
       }}
    end
  end

  @doc """
  Extract with comprehensive configuration for production use.

  Combines OCR, chunking, caching, and language detection. On success returns
  `{:ok, summary}`; on failure the reason is prefixed with
  `"Extraction failed: "`.

  ## Options

    * `:ocr_enabled` (default: `true`), `:ocr_backend` (default: `"tesseract"`)
    * `:chunk_size` (default: 1000), `:chunk_overlap` (default: 100)
    * `:detect_language` (default: `true`)
    * `:use_cache` (default: `true`), `:extract_images` (default: `true`)
    * `:mime_type` - MIME type of the file (optional)
  """
  @spec extract_with_full_config(String.t(), keyword()) ::
          {:ok, map()} | {:error, String.t()}
  def extract_with_full_config(path, opts \\ []) do
    config = %ExtractionConfig{
      # OCR settings
      ocr: %{
        "enabled" => Keyword.get(opts, :ocr_enabled, true),
        "backend" => Keyword.get(opts, :ocr_backend, "tesseract")
      },
      # Chunking for embeddings
      chunking: chunking_settings(opts),
      # Language detection
      language_detection: %{"enabled" => Keyword.get(opts, :detect_language, true)},
      # Cache results
      use_cache: Keyword.get(opts, :use_cache, true),
      # Extract various content types
      extract_images: Keyword.get(opts, :extract_images, true),
      extract_tables: true
    }

    case Kreuzberg.extract_file(path, Keyword.get(opts, :mime_type), config) do
      {:ok, result} -> {:ok, summarize(path, result)}
      {:error, reason} -> {:error, "Extraction failed: #{reason}"}
    end
  end

  @doc """
  Validate file before extraction.

  Returns `:ok` when the file exists and
  `Kreuzberg.detect_mime_type_from_path/1` succeeds for it; otherwise an
  `{:error, message}` describing which check failed.
  """
  @spec validate_file(String.t()) :: :ok | {:error, String.t()}
  def validate_file(path) do
    if File.exists?(path) do
      case Kreuzberg.detect_mime_type_from_path(path) do
        {:ok, _mime_type} -> :ok
        {:error, reason} -> {:error, "Cannot determine MIME type: #{reason}"}
      end
    else
      {:error, "File not found: #{path}"}
    end
  end

  @doc """
  Extract with validation.

  Runs `validate_file/1` first and only then `extract_with_full_config/2`; the
  first `{:error, reason}` encountered is returned as-is.
  """
  @spec extract_safely(String.t(), keyword()) ::
          {:ok, map()} | {:error, String.t()}
  def extract_safely(path, opts \\ []) do
    # Both steps already return self-describing error tuples, so no `else`
    # clause is needed to pass them through.
    with :ok <- validate_file(path) do
      extract_with_full_config(path, opts)
    end
  end

  # Chunking map shared by the chunking-aware entry points.
  defp chunking_settings(opts) do
    %{
      "max_characters" => Keyword.get(opts, :chunk_size, 1000),
      "overlap" => Keyword.get(opts, :chunk_overlap, 100)
    }
  end

  # Condenses an extraction result into the summary map that
  # extract_with_full_config/2 returns.
  defp summarize(path, result) do
    %{
      file_path: path,
      mime_type: result.mime_type,
      content_length: byte_size(result.content),
      content_preview: String.slice(result.content, 0..200),
      detected_languages: result.detected_languages || [],
      table_count: length(result.tables),
      image_count: length(result.images || []),
      chunk_count: length(result.chunks || []),
      metadata: result.metadata
    }
  end
end
# Usage examples

# Extract with OCR
ocr_opts = [ocr_backend: "tesseract"]

with {:ok, result} <- ConfiguredDocumentClient.extract_with_ocr("scanned_document.pdf", ocr_opts) do
  IO.puts("OCR extraction successful")
  IO.puts("Content: #{String.slice(result.content, 0..100)}...")
else
  {:error, reason} -> IO.puts("Error: #{reason}")
end

# Extract with chunking
chunking_opts = [chunk_size: 500, chunk_overlap: 50]

with {:ok, data} <- ConfiguredDocumentClient.extract_with_chunking("document.pdf", chunking_opts) do
  IO.puts("Chunks: #{data.chunk_count}")
else
  {:error, reason} -> IO.puts("Error: #{reason}")
end

# Extract with language detection
with {:ok, data} <- ConfiguredDocumentClient.extract_with_language_detection("multilingual.pdf") do
  IO.puts("Detected languages: #{inspect(data.detected_languages)}")
else
  {:error, reason} -> IO.puts("Error: #{reason}")
end

# Comprehensive extraction with validation
full_opts = [
  ocr_enabled: true,
  detect_language: true,
  extract_images: true,
  use_cache: true
]

with {:ok, summary} <- ConfiguredDocumentClient.extract_safely("document.pdf", full_opts) do
  IO.puts("File: #{summary.file_path}")
  IO.puts("MIME: #{summary.mime_type}")
  IO.puts("Size: #{summary.content_length} bytes")
  IO.puts("Tables: #{summary.table_count}")
  IO.puts("Languages: #{inspect(summary.detected_languages)}")
else
  {:error, reason} -> IO.puts("Error: #{reason}")
end
```

View File

@@ -0,0 +1,31 @@
```elixir title="Elixir"
# Example: Handling extraction errors
case Kreuzberg.extract_file("document.pdf") do
  {:ok, result} ->
    IO.puts("Successfully extracted content")
    # String.length/1 counts characters; byte_size/1 would report bytes and
    # contradict the label for non-ASCII content.
    IO.puts("Content length: #{String.length(result.content)} characters")

  {:error, reason} ->
    IO.puts("Extraction failed: #{reason}")
end

# Example: Handling with custom error message
result = Kreuzberg.extract_file("nonexistent.pdf")

case result do
  # The payload is not used here; the underscore prefix avoids an
  # unused-variable warning.
  {:ok, _data} ->
    IO.puts("File processed successfully")

  {:error, error} ->
    # inspect/1 prints the error whatever its shape.
    IO.puts("Error details: #{inspect(error)}")
end

# Example: Extract with pattern matching
case Kreuzberg.extract(<<>>, "application/pdf") do
  {:ok, result} ->
    IO.puts("Content: #{result.content}")

  # String reasons are interpolated directly; anything else is inspected.
  {:error, msg} when is_binary(msg) ->
    IO.puts("Validation error: #{msg}")

  {:error, reason} ->
    IO.puts("Unknown error: #{inspect(reason)}")
end
```

View File

@@ -0,0 +1,13 @@
```elixir title="Elixir"
# Read file into memory
{:ok, file_content} = File.read("document.pdf")

# Extract from bytes/binary data
{:ok, %{content: content, mime_type: mime_type, tables: tables}} =
  Kreuzberg.extract(file_content, "application/pdf")

IO.puts("Extracted content:")
IO.puts(content)
IO.puts("MIME type: #{mime_type}")
IO.puts("Tables found: #{length(tables)}")
```

View File

@@ -0,0 +1,12 @@
```elixir title="Elixir"
# Start the extraction as a task, then block until it finishes.
task = Kreuzberg.extract_file_async("document.pdf")
{:ok, result} = Task.await(task)

content = result.content
table_count = length(result.tables)
metadata = result.metadata

# String.length/1 counts characters; byte_size/1 would report bytes and
# contradict the label for non-ASCII content.
IO.puts("Content length: #{String.length(content)} characters")
IO.puts("Tables: #{table_count}")
IO.puts("Metadata keys: #{inspect(Map.keys(metadata))}")
```

View File

@@ -0,0 +1,11 @@
```elixir title="Elixir"
# Extract a file and print a short summary of the result.
{:ok, result} = Kreuzberg.extract_file("document.pdf")

content = result.content
table_count = length(result.tables)
metadata = result.metadata

# String.length/1 counts characters; byte_size/1 would report bytes and
# contradict the label for non-ASCII content.
IO.puts("Content length: #{String.length(content)} characters")
IO.puts("Tables: #{table_count}")
IO.puts("Metadata keys: #{inspect(Map.keys(metadata))}")
```

View File

@@ -0,0 +1,13 @@
```elixir title="Elixir"
# Basic document extraction workflow
# Load file -> extract -> access results
{:ok, %{content: content, metadata: metadata, tables: tables}} =
  Kreuzberg.extract_file("document.pdf")

IO.puts("Extracted Content:")
IO.puts(content)
IO.puts("\nMetadata:")
IO.puts("Format: #{inspect(metadata.format)}")
IO.puts("Tables found: #{length(tables)}")
```

View File

@@ -0,0 +1,19 @@
```elixir title="Elixir"
defmodule BasicUsage do
  @doc """
  Extracts "document.pdf" with no MIME type and a `nil` configuration.

  Prints the success payload and returns `:ok`, or prints the failure reason
  and returns `:error`.
  """
  def extract_with_default_config do
    # Use default configuration (nil)
    config = nil

    "document.pdf"
    |> Kreuzberg.extract_file_sync(nil, config)
    |> report()
  end

  # Success: the payload is printed as-is.
  defp report({:ok, content}) do
    IO.puts("Extracted content:")
    IO.puts(content)
    :ok
  end

  defp report({:error, reason}) do
    IO.puts("Extraction failed: #{reason}")
    :error
  end
end
```

View File

@@ -0,0 +1,13 @@
```elixir title="Elixir"
# Extract from different file types (PDF, DOCX, etc.)
with {:ok, %{content: content, metadata: metadata, tables: tables}} <-
       Kreuzberg.extract_file("document.pdf") do
  IO.puts("Content: #{content}")
  IO.puts("Format: #{inspect(metadata.format)}")
  IO.puts("Tables: #{length(tables)}")
else
  {:error, reason} -> IO.puts("Extraction failed: #{inspect(reason)}")
end
```

View File

@@ -0,0 +1,19 @@
```elixir title="Elixir"
defmodule ExtractFile do
  @doc """
  Extracts "document.pdf" and prints its length followed by the content.

  Returns `:ok` on success and `:error` after printing the failure reason.
  """
  def show_extraction_details do
    # Extract from a file
    "document.pdf"
    |> Kreuzberg.extract_file_sync(nil, nil)
    |> print_details()
  end

  # The success payload is handled as a string here: it is measured with
  # String.length/1 and printed directly.
  defp print_details({:ok, text}) do
    IO.puts("Content length: #{String.length(text)} characters")
    IO.puts("---")
    IO.puts(text)
    :ok
  end

  defp print_details({:error, reason}) do
    IO.puts("Failed to extract: #{reason}")
    :error
  end
end
```

View File

@@ -0,0 +1,19 @@
```elixir title="Elixir"
# Extract scanned documents with OCR
# Configure Tesseract for OCR processing
#
# Uses the same configuration struct and call shape as the other examples:
# Kreuzberg.ExtractionConfig with string-keyed option maps, passed as the third
# argument of extract_file/3.
config = %Kreuzberg.ExtractionConfig{
  ocr: %{
    "enabled" => true,
    "backend" => "tesseract",
    "language" => "eng"
  }
}

# The second argument is the MIME type; a keyword list such as `config: config`
# in that position would be taken for the MIME type.
{:ok, result} = Kreuzberg.extract_file("scanned.pdf", nil, config)

IO.puts("Extracted text from scanned document:")
IO.puts(result.content)
IO.puts("Used OCR backend: tesseract")
```

View File

@@ -0,0 +1,19 @@
```elixir title="Elixir"
defmodule ExtractWithOcr do
  @doc """
  Extracts "scanned.pdf" using an OCR configuration given as a JSON string.

  Returns `:ok` after printing the payload, or `:error` after printing the
  failure reason.
  """
  def extract_scanned_document do
    # Build configuration with OCR settings as JSON string
    config = ~s({"ocr": {"backend": "tesseract", "language": "eng"}})

    "scanned.pdf"
    |> Kreuzberg.extract_file_sync(nil, config)
    |> report()
  end

  defp report({:ok, text}) do
    IO.puts("Extracted via OCR:")
    IO.puts(text)
    :ok
  end

  defp report({:error, reason}) do
    IO.puts("OCR extraction failed: #{reason}")
    :error
  end
end
```

View File

@@ -0,0 +1,5 @@
```elixir title="Elixir"
# First Kreuzberg program - extract text from a PDF
# A failed extraction raises a MatchError here instead of being handled.
{:ok, %{content: text}} = Kreuzberg.extract_file("document.pdf")
IO.puts(text)
```

View File

@@ -0,0 +1,14 @@
```elixir title="Elixir"
defmodule HelloWorld do
  @doc """
  Extracts "document.pdf" and prints either the payload or the error reason.
  """
  def main do
    "document.pdf"
    |> Kreuzberg.extract_file_sync(nil, nil)
    |> announce()
  end

  defp announce({:ok, text}) do
    IO.puts("Extraction succeeded!")
    IO.puts(text)
  end

  defp announce({:error, reason}), do: IO.puts("Error: #{reason}")
end
```

View File

@@ -0,0 +1,5 @@
```elixir title="Elixir"
# Verify Kreuzberg is installed and working by extracting a document
{:ok, %{content: content}} = Kreuzberg.extract_file("sample.pdf")
char_count = String.length(content)
IO.puts("Installation verified! Extracted #{char_count} characters")
```

View File

@@ -0,0 +1,18 @@
```elixir title="Elixir"
defmodule InstallVerify do
  @doc """
  Lists the available document extractors, then attempts a sample extraction
  of "test.txt" and prints whether it worked.
  """
  def verify_install do
    # Verify Kreuzberg module is available
    {:ok, extractors} = Kreuzberg.list_document_extractors()
    IO.puts("Available extractors: #{inspect(extractors)}")

    # Verify a simple extraction works
    "test.txt"
    |> Kreuzberg.extract_file_sync(nil, nil)
    |> report()
  end

  defp report({:ok, _result}) do
    IO.puts("Kreuzberg is properly installed and working!")
  end

  defp report({:error, reason}), do: IO.puts("Extraction failed: #{reason}")
end
```

View File

@@ -0,0 +1,22 @@
```elixir title="Elixir"
# Access different parts of result
# Content, metadata, tables, images
{:ok, result} = Kreuzberg.extract_file("document.pdf")

# Access main content
content = result.content
IO.puts("Content length: #{String.length(content)} characters")

# Access tables. Fall back to an empty list so length/1 does not raise when the
# field is nil (the server example in this changeset guards it the same way).
tables = result.tables || []
IO.puts("Tables found: #{length(tables)}")

# Access images, with the same nil guard as for tables.
images = result.images || []
IO.puts("Images found: #{length(images)}")

# Access metadata. A nil metadata value prints as a nil format instead of raising.
format =
  case result.metadata do
    nil -> nil
    metadata -> metadata.format
  end

IO.puts("Format: #{inspect(format)}")
```

View File

@@ -0,0 +1,24 @@
```elixir title="Elixir"
defmodule ReadContent do
  @moduledoc """
  Example: extract a document and walk over its content line by line.
  """

  @doc """
  Extracts `document.pdf` and prints every non-empty line of the result.

  Returns `:ok` on success and `:error` (after printing the reason) on failure.
  """
  def process_extracted_content do
    extraction = Kreuzberg.extract_file_sync("document.pdf", nil, nil)

    case extraction do
      {:error, why} ->
        IO.puts("Error: #{why}")
        :error

      {:ok, text} ->
        IO.puts("Processing extracted content:")

        # trim: true drops the empty strings produced by blank lines.
        for row <- String.split(text, "\n", trim: true) do
          IO.puts(" #{row}")
        end

        :ok
    end
  end
end
```

View File

@@ -0,0 +1,32 @@
<!-- snippet:syntax-only -->
```elixir
# JSON Schema describing the fields the LLM should pull out of the paper.
properties = %{
  "title" => %{"type" => "string"},
  "authors" => %{"type" => "array", "items" => %{"type" => "string"}},
  "date" => %{"type" => "string"}
}

schema = %{
  "type" => "object",
  "properties" => properties,
  "required" => ["title", "authors", "date"],
  "additionalProperties" => false
}

# Structured-extraction settings, serialized to JSON for the extraction call.
structured_settings = %{
  "schema" => schema,
  "schema_name" => "paper_metadata",
  "strict" => true,
  "llm" => %{"model" => "openai/gpt-4o-mini"}
}

config_json = Jason.encode!(%{"structured_extraction" => structured_settings})

{:ok, json} = Kreuzberg.extract_file_async("paper.pdf", nil, config_json)
result = Jason.decode!(json)

# Only nil counts as "absent"; any other value is shown as-is.
structured = result["structured_output"]

if is_nil(structured) do
  IO.puts("no structured output")
else
  IO.inspect(structured, label: "structured")
end
```

View File

@@ -0,0 +1,312 @@
```elixir title="Elixir"
# MCP Custom Client - Connect to Kreuzberg MCP servers
# Demonstrates creating a reusable MCP client for document extraction
defmodule KreuzbergMCPClient do
  @moduledoc """
  MCP client for communicating with Kreuzberg extraction servers.

  Talks to the server's HTTP endpoints (`/extract`, `/extract/file` and
  `/health`) and adds optional on-disk result caching, retries for failed
  requests and tagged-tuple error handling.
  """

  require Logger

  defmodule Config do
    @moduledoc """
    Configuration for MCP client connections.
    """

    defstruct [
      :host,
      :port,
      :timeout_ms,
      :max_retries,
      :retry_delay_ms,
      :cache_dir
    ]

    @type t :: %__MODULE__{
            host: String.t(),
            port: non_neg_integer(),
            timeout_ms: non_neg_integer(),
            max_retries: non_neg_integer(),
            retry_delay_ms: non_neg_integer(),
            cache_dir: String.t() | nil
          }

    @doc """
    Builds a client configuration from keyword options.

    Defaults: `host: "localhost"`, `port: 8080`, `timeout_ms: 30000`,
    `max_retries: 3`, `retry_delay_ms: 1000` and `cache_dir: nil`
    (no cache directory, so caching is never used).
    """
    @spec new(keyword()) :: t()
    def new(opts \\ []) do
      %__MODULE__{
        host: Keyword.get(opts, :host, "localhost"),
        port: Keyword.get(opts, :port, 8080),
        timeout_ms: Keyword.get(opts, :timeout_ms, 30000),
        max_retries: Keyword.get(opts, :max_retries, 3),
        retry_delay_ms: Keyword.get(opts, :retry_delay_ms, 1000),
        cache_dir: Keyword.get(opts, :cache_dir, nil)
      }
    end
  end

  @doc """
  Extract document from file via MCP server.

  Sends an extraction request for `file_path` to the server's `/extract`
  endpoint. The path is sent as-is in the JSON request body; the file content
  is not uploaded (use `upload_and_extract/3` for that).

  ## Options

    * `:mime_type` - MIME type of document
    * `:config` - Extraction configuration map
    * `:use_cache` - Enable result caching (default: false). Only takes effect
      when the client config has a `cache_dir`.

  Returns `{:ok, result_map}` or `{:error, reason}`.
  """
  @spec extract_file(Config.t(), String.t(), keyword()) ::
          {:ok, map()} | {:error, String.t()}
  def extract_file(config, file_path, opts \\ []) do
    mime_type = Keyword.get(opts, :mime_type)
    extraction_config = Keyword.get(opts, :config)
    use_cache = Keyword.get(opts, :use_cache, false)

    if use_cache && config.cache_dir do
      extract_with_cache(config, file_path, mime_type, extraction_config)
    else
      fetch_from_server(config, file_path, mime_type, extraction_config)
    end
  end

  @doc """
  Upload and extract document via MCP server.

  Reads the whole file into memory, posts it as the raw request body to
  `/extract/file` (with the base name in the `X-File-Name` header) and returns
  the server's result. The third argument is accepted for interface symmetry
  with `extract_file/3` but is not used.
  """
  @spec upload_and_extract(Config.t(), String.t(), keyword()) ::
          {:ok, map()} | {:error, String.t()}
  def upload_and_extract(config, file_path, _opts \\ []) do
    # Read directly instead of checking File.exists?/1 first: the check can go
    # stale before the read, and File.read/1 already reports :enoent.
    case File.read(file_path) do
      {:ok, body} ->
        url = "http://#{config.host}:#{config.port}/extract/file"

        headers = [
          {"Content-Type", "application/octet-stream"},
          {"X-File-Name", Path.basename(file_path)}
        ]

        case HTTPoison.post(url, body, headers, http_options(config)) do
          {:ok, response} ->
            handle_response(response)

          {:error, reason} ->
            Logger.error("Upload failed: #{inspect(reason)}")
            {:error, "Upload failed: #{inspect(reason)}"}
        end

      {:error, :enoent} ->
        {:error, "File not found: #{file_path}"}

      {:error, reason} ->
        {:error, "Failed to read file: #{inspect(reason)}"}
    end
  end

  @doc """
  Check health status of MCP server.

  Any `200` response carrying a JSON object counts as healthy and is returned
  as `{:ok, map}`; the payload is not required to contain a `"success"` flag.
  """
  @spec health_check(Config.t()) :: {:ok, map()} | {:error, String.t()}
  def health_check(config) do
    url = "http://#{config.host}:#{config.port}/health"

    case HTTPoison.get(url, [], http_options(config)) do
      {:ok, response} -> handle_health_response(response)
      {:error, reason} -> {:error, "Health check failed: #{inspect(reason)}"}
    end
  end

  @doc """
  Batch extract multiple documents with parallel requests.

  Runs `extract_file/3` for every path concurrently and returns
  `{:ok, results}`, where `results` holds one `{:ok, map}` or `{:error, reason}`
  per input path, in input order. A failing document never fails the batch.
  """
  @spec batch_extract(Config.t(), [String.t()], keyword()) ::
          {:ok, [{:ok, map()} | {:error, String.t()}]} | {:error, String.t()}
  def batch_extract(config, file_paths, opts \\ []) do
    Logger.info("Batch extracting #{length(file_paths)} documents")

    results =
      file_paths
      # Each task is already bounded by the HTTP timeouts and the retry limit,
      # so the 5 second default of Task.async_stream must not cut it short.
      |> Task.async_stream(&extract_file(config, &1, opts), timeout: :infinity)
      |> Enum.map(fn
        {:ok, result} -> result
        {:exit, reason} -> {:error, "Extraction task exited: #{inspect(reason)}"}
      end)

    success_count = Enum.count(results, &match?({:ok, _}, &1))
    Logger.info("Batch extraction complete: #{success_count}/#{length(file_paths)} succeeded")
    {:ok, results}
  end

  # Private helpers

  # HTTPoison's :timeout only covers establishing the connection; :recv_timeout
  # covers waiting for the response. Both are set so timeout_ms applies to each.
  defp http_options(config) do
    [timeout: config.timeout_ms, recv_timeout: config.timeout_ms]
  end

  # Serves the result from the cache directory when present, otherwise fetches
  # it from the server and stores it for next time.
  defp extract_with_cache(config, file_path, mime_type, extraction_config) do
    cache_key = compute_cache_key(file_path, mime_type, extraction_config)

    case get_from_cache(config.cache_dir, cache_key) do
      {:ok, cached_result} ->
        Logger.debug("Cache hit for #{file_path}")
        {:ok, cached_result}

      :miss ->
        with {:ok, result} <- fetch_from_server(config, file_path, mime_type, extraction_config) do
          store_in_cache(config.cache_dir, cache_key, result)
          {:ok, result}
        end
    end
  end

  # Posts the extraction request as JSON to /extract, retrying transport failures.
  defp fetch_from_server(config, file_path, mime_type, extraction_config) do
    url = "http://#{config.host}:#{config.port}/extract"

    body =
      Jason.encode!(%{
        file_path: file_path,
        mime_type: mime_type,
        config: extraction_config
      })

    headers = [{"Content-Type", "application/json"}]
    request_fn = fn -> HTTPoison.post(url, body, headers, http_options(config)) end

    case retry_request(config, request_fn) do
      {:ok, response} -> handle_response(response)
      error -> error
    end
  end

  # Calls request_fn up to max_retries times in total (always at least once).
  # Only transport-level {:error, _} results are retried; an HTTP response of
  # any status is returned to the caller as-is.
  defp retry_request(config, request_fn) do
    retry_request(config, request_fn, 1, max(config.max_retries, 1))
  end

  defp retry_request(config, request_fn, attempt, max_attempts) do
    case request_fn.() do
      {:ok, response} ->
        {:ok, response}

      {:error, reason} ->
        Logger.warning("Request failed (attempt #{attempt}): #{inspect(reason)}")

        if attempt < max_attempts do
          # Sleep only between attempts, never after the final one.
          Process.sleep(config.retry_delay_ms)
          retry_request(config, request_fn, attempt + 1, max_attempts)
        else
          {:error, "Max retries exceeded: #{inspect(reason)}"}
        end
    end
  end

  # Decodes an extraction response. A 200 must carry a JSON object with a
  # truthy "success"; anything else becomes an {:error, _} tuple.
  defp handle_response(%HTTPoison.Response{status_code: 200, body: body}) do
    case Jason.decode(body) do
      {:ok, data} when is_map(data) ->
        if Map.get(data, "success") do
          {:ok, data}
        else
          {:error, Map.get(data, "error", "Unknown error")}
        end

      {:ok, other} ->
        {:error, "Unexpected response payload: #{inspect(other)}"}

      {:error, reason} ->
        {:error, "Failed to decode response: #{inspect(reason)}"}
    end
  end

  defp handle_response(%HTTPoison.Response{status_code: status, body: body}) do
    {:error, "Server error (#{status}): #{body}"}
  end

  # Decodes a health response without requiring the "success" flag that
  # extraction responses carry.
  defp handle_health_response(%HTTPoison.Response{status_code: 200, body: body}) do
    case Jason.decode(body) do
      {:ok, data} when is_map(data) -> {:ok, data}
      {:ok, other} -> {:error, "Unexpected response payload: #{inspect(other)}"}
      {:error, reason} -> {:error, "Failed to decode response: #{inspect(reason)}"}
    end
  end

  defp handle_health_response(response), do: handle_response(response)

  # The key covers the path, MIME type and config only, not the file's
  # content, so a changed file at the same path still hits the old entry.
  defp compute_cache_key(file_path, mime_type, config) do
    content = "#{file_path}|#{mime_type}|#{inspect(config)}"
    :crypto.hash(:sha256, content) |> Base.encode16(case: :lower)
  end

  # A missing, unreadable or corrupt cache file is treated as a cache miss.
  defp get_from_cache(cache_dir, cache_key) do
    cache_file = Path.join(cache_dir, "#{cache_key}.json")

    with {:ok, content} <- File.read(cache_file),
         {:ok, data} <- Jason.decode(content) do
      {:ok, data}
    else
      _ -> :miss
    end
  end

  # Best effort: a failed cache write is logged and must not turn a successful
  # extraction into a crash.
  defp store_in_cache(cache_dir, cache_key, result) do
    cache_file = Path.join(cache_dir, "#{cache_key}.json")

    with :ok <- File.mkdir_p(cache_dir),
         {:ok, json} <- Jason.encode(result),
         :ok <- File.write(cache_file, json) do
      :ok
    else
      {:error, reason} ->
        Logger.warning("Could not write cache entry #{cache_file}: #{inspect(reason)}")
        :ok
    end
  end
end
# Usage examples
IO.puts("=== Kreuzberg MCP Client ===\n")

# Client configuration; options not listed here keep the defaults of Config.new/1.
client_config =
  KreuzbergMCPClient.Config.new(
    host: "localhost",
    port: 8080,
    timeout_ms: 30000,
    max_retries: 3,
    cache_dir: "/tmp/kreuzberg_cache"
  )

# 1. Server health
IO.puts("Checking server health...")
health_outcome = KreuzbergMCPClient.health_check(client_config)

case health_outcome do
  {:error, why} ->
    IO.puts("Health check failed: #{why}\n")

  {:ok, info} ->
    IO.puts("Server status: #{info["status"]}")
    IO.puts("Service: #{info["service"]}\n")
end

# 2. Single document, with result caching enabled
IO.puts("Extracting document...")
single_outcome = KreuzbergMCPClient.extract_file(client_config, "document.pdf", use_cache: true)

case single_outcome do
  {:error, why} ->
    IO.puts("Extraction failed: #{why}")

  {:ok, extraction} ->
    IO.puts("Success!")
    IO.puts("Content size: #{byte_size(extraction["content"])} bytes")
    IO.puts("MIME type: #{extraction["mime_type"]}")
    IO.puts("Tables found: #{length(extraction["tables"])}")
end

IO.puts("")

# 3. Several documents in one batch
IO.puts("Batch extracting multiple documents...")
documents = ["doc1.pdf", "doc2.pdf", "doc3.pdf"]
batch_outcome = KreuzbergMCPClient.batch_extract(client_config, documents)

case batch_outcome do
  {:error, why} ->
    IO.puts("Batch extraction failed: #{why}")

  {:ok, outcomes} ->
    IO.puts("Batch extraction complete!")

    # Each entry is the tagged tuple returned for one document.
    successful = Enum.count(outcomes, fn outcome -> match?({:ok, _}, outcome) end)
    IO.puts("Successful: #{successful}/#{length(outcomes)}")
end
```

View File

@@ -0,0 +1,300 @@
```elixir title="Elixir"
# MCP Server Integration - Start a Kreuzberg MCP server
# Demonstrates how to set up and manage an MCP server for remote document extraction
defmodule KreuzbergMCPServer do
  @moduledoc """
  MCP (Model Context Protocol) server for Kreuzberg document extraction.

  Starts a Cowboy HTTP listener that routes `/extract`, `/extract/file` and
  `/health` to their handler modules.
  """

  require Logger

  @listener :kreuzberg_http

  @doc """
  Start the MCP server on the specified host and port.

  ## Options

    * `:host` - Address to bind, as an IP literal or resolvable host name
      (default: "127.0.0.1")
    * `:port` - Server port (default: 8080)
    * `:max_connections` - Maximum concurrent connections (default: 10)

  Returns `{:ok, message}` once the listener is up, or `{:error, reason}` when
  the host cannot be resolved or the listener fails to start (for example
  because the port is already in use).
  """
  @spec start_server(keyword()) :: {:ok, String.t()} | {:error, term()}
  def start_server(opts \\ []) do
    host = Keyword.get(opts, :host, "127.0.0.1")
    port = Keyword.get(opts, :port, 8080)
    max_connections = Keyword.get(opts, :max_connections, 10)

    Logger.info("Starting Kreuzberg MCP server on #{host}:#{port}")

    with {:ok, ip} <- resolve_host(host),
         {:ok, _pid} <- start_listener(ip, port, max_connections) do
      Logger.info("MCP server started successfully")
      {:ok, "Server running on #{host}:#{port}"}
    else
      {:error, reason} = error ->
        Logger.error("Failed to start MCP server: #{inspect(reason)}")
        error
    end
  end

  @doc """
  Stop the MCP server gracefully.

  Always returns `:ok`, including when no listener was running.
  """
  @spec stop_server() :: :ok
  def stop_server do
    Logger.info("Stopping Kreuzberg MCP server")
    :cowboy.stop_listener(@listener)
    Logger.info("MCP server stopped")
    :ok
  end

  # Cowboy only accepts a routing table compiled by :cowboy_router.compile/1,
  # passed under the :dispatch key of the :env map.
  defp start_listener(ip, port, max_connections) do
    dispatch =
      :cowboy_router.compile([
        {:_,
         [
           {"/extract", KreuzbergMCPServer.Handler, []},
           {"/extract/file", KreuzbergMCPServer.FileHandler, []},
           {"/health", KreuzbergMCPServer.HealthHandler, []}
         ]}
      ])

    # NOTE(review): the map form of the transport options (with :socket_opts
    # and :max_connections) assumes Ranch 1.6 or newer - confirm against the
    # Cowboy/Ranch versions in use.
    transport_opts = %{
      socket_opts: socket_opts(ip, port),
      max_connections: max_connections
    }

    :cowboy.start_clear(@listener, transport_opts, %{env: %{dispatch: dispatch}})
  end

  # An 8-element address tuple is IPv6 and needs the :inet6 socket family.
  defp socket_opts(ip, port) when tuple_size(ip) == 8, do: [:inet6, ip: ip, port: port]
  defp socket_opts(ip, port), do: [ip: ip, port: port]

  # Turns the :host option into the address tuple the socket needs. IP literals
  # are parsed directly; anything else is resolved as an IPv4 host name.
  defp resolve_host(host) when is_binary(host) do
    charlist = String.to_charlist(host)

    case :inet.parse_address(charlist) do
      {:ok, ip} ->
        {:ok, ip}

      {:error, _} ->
        case :inet.getaddr(charlist, :inet) do
          {:ok, ip} -> {:ok, ip}
          {:error, reason} -> {:error, {:invalid_host, host, reason}}
        end
    end
  end

  defp resolve_host(host), do: {:error, {:invalid_host, host, :not_a_string}}
end
# Handler for extraction requests
defmodule KreuzbergMCPServer.Handler do
  @moduledoc """
  HTTP handler for MCP extraction requests.

  Accepts `POST` requests whose JSON body names a `file_path` and optionally a
  `mime_type` and a `config` object, runs the extraction and replies with a
  JSON envelope (`success: true` plus the result, or `success: false` plus an
  `error` message). Every outcome is answered on the request that caused it.
  """

  require Logger

  @json_headers %{"content-type" => "application/json"}

  @doc """
  Cowboy handler entry point: handles the request and sends the JSON reply.
  """
  def init(req, state) do
    # Each helper returns {status, payload, req}, so the request is threaded
    # through and errors are replied to on the real request.
    {status, payload, req} = handle_request(req)
    req = :cowboy_req.reply(status, @json_headers, Jason.encode!(payload), req)
    {:ok, req, state}
  end

  defp handle_request(%{method: "POST"} = req), do: handle_extraction(req)
  defp handle_request(req), do: error_response(405, "Method not allowed", req)

  defp handle_extraction(req) do
    case :cowboy_req.read_body(req) do
      {:ok, body, req} ->
        case Jason.decode(body) do
          {:ok, params} when is_map(params) ->
            extract_from_params(params, req)

          {:ok, _not_an_object} ->
            error_response(400, "Request body must be a JSON object", req)

          {:error, reason} ->
            error_response(400, "Invalid JSON: #{inspect(reason)}", req)
        end

      # read_body/1 answers {:more, ...} when the body exceeds what one call
      # reads; a JSON request that large is rejected rather than buffered.
      {:more, _partial, req} ->
        error_response(413, "Request body too large", req)
    end
  end

  defp extract_from_params(%{"file_path" => file_path} = params, req)
       when is_binary(file_path) do
    # NOTE(security): file_path comes straight from the request and is handed
    # to the extractor unchanged, so a client can name any file the server can
    # read. Restrict it to an allowed directory before exposing this endpoint
    # to untrusted callers.
    mime_type = Map.get(params, "mime_type")
    config = build_config(Map.get(params, "config", %{}))

    case Kreuzberg.extract_file(file_path, mime_type, config) do
      {:ok, result} ->
        response_data = %{
          success: true,
          content: result.content,
          mime_type: result.mime_type,
          metadata: result.metadata || %{},
          tables: result.tables || [],
          chunks: result.chunks || [],
          images: result.images || [],
          detected_languages: result.detected_languages || []
        }

        {200, response_data, req}

      {:error, reason} ->
        Logger.error("Extraction failed for #{file_path}: #{inspect(reason)}")
        error_response(400, "Extraction failed: #{inspect(reason)}", req)
    end
  end

  defp extract_from_params(%{"file_path" => invalid}, req) when not is_nil(invalid) do
    error_response(400, "Invalid parameter: file_path must be a string", req)
  end

  defp extract_from_params(_params, req) do
    error_response(400, "Missing required parameter: file_path", req)
  end

  # Builds the extraction config from the request's "config" object. A value
  # that is not a map (for example JSON null) yields nil, i.e. no config.
  defp build_config(opts) when is_map(opts) do
    %Kreuzberg.ExtractionConfig{
      ocr: opts["ocr"],
      chunking: opts["chunking"],
      quality_processing: opts["quality_processing"],
      language_detection: opts["language_detection"],
      images: opts["images"],
      use_cache: Map.get(opts, "use_cache", true)
    }
  end

  defp build_config(_), do: nil

  defp error_response(status, message, req) do
    {status, %{success: false, error: message}, req}
  end
end
# Health check handler
defmodule KreuzbergMCPServer.HealthHandler do
  @moduledoc """
  Health check endpoint for the MCP server.
  """

  @doc """
  Replies `200` with a JSON status document.

  The payload carries the same `success` flag as the other endpoints'
  responses, so a client that checks that flag on every response also accepts
  the health check. The existing `status`, `service` and `timestamp` fields
  are unchanged.
  """
  def init(req, state) do
    response =
      Jason.encode!(%{
        success: true,
        status: "healthy",
        service: "kreuzberg-mcp",
        timestamp: DateTime.utc_now() |> DateTime.to_iso8601()
      })

    req =
      :cowboy_req.reply(
        200,
        %{"content-type" => "application/json"},
        response,
        req
      )

    {:ok, req, state}
  end
end
# File upload handler
defmodule KreuzbergMCPServer.FileHandler do
  @moduledoc """
  Handler for raw-body file uploads for extraction.

  The request body is taken as the file content (it is not parsed as
  multipart). It is streamed to a temporary file, extracted, and the temporary
  file is removed again whatever the outcome.
  """

  require Logger

  @json_headers %{"content-type" => "application/json"}

  @doc """
  Cowboy handler entry point: accepts `POST` uploads and replies with JSON.
  """
  def init(req, state) do
    {status, payload, req} =
      case req.method do
        "POST" -> handle_file_upload(req)
        _ -> {405, failure("Method not allowed"), req}
      end

    req = :cowboy_req.reply(status, @json_headers, Jason.encode!(payload), req)
    {:ok, req, state}
  end

  defp handle_file_upload(req) do
    temp_path = temp_path(req)

    # :exclusive refuses to open a path that already exists, so the upload is
    # never written through a file or link someone else placed there.
    case File.open(temp_path, [:write, :binary, :exclusive]) do
      {:ok, device} ->
        try do
          receive_and_extract(req, device, temp_path)
        after
          # Runs on success, failure and crashes alike, so no upload is left behind.
          File.rm(temp_path)
        end

      {:error, reason} ->
        Logger.error("Could not create temporary file #{temp_path}: #{inspect(reason)}")
        {500, failure("Failed to create temporary file: #{inspect(reason)}"), req}
    end
  end

  defp receive_and_extract(req, device, temp_path) do
    copied = copy_body(req, device)
    # Close before extracting so the whole upload is flushed to disk.
    File.close(device)

    case copied do
      {:ok, req} ->
        extract_upload(temp_path, req)

      {:error, reason, req} ->
        Logger.error("Could not store upload: #{inspect(reason)}")
        {500, failure("Failed to store upload: #{inspect(reason)}"), req}
    end
  end

  # Streams the request body to the file chunk by chunk. read_body/1 answers
  # {:more, chunk, req} until the final {:ok, chunk, req}, so large uploads
  # are handled without holding the whole body in memory.
  defp copy_body(req, device) do
    {status, chunk, req} = :cowboy_req.read_body(req)

    case IO.binwrite(device, chunk) do
      :ok when status == :more -> copy_body(req, device)
      :ok -> {:ok, req}
      {:error, reason} -> {:error, reason, req}
    end
  end

  defp extract_upload(temp_path, req) do
    case Kreuzberg.extract_file(temp_path) do
      {:ok, result} ->
        payload = %{
          success: true,
          content_size: byte_size(result.content),
          mime_type: result.mime_type,
          metadata: result.metadata
        }

        {200, payload, req}

      {:error, reason} ->
        {400, failure(inspect(reason)), req}
    end
  end

  defp failure(message), do: %{success: false, error: message}

  # Unique, unguessable file name in the system temp directory.
  defp temp_path(req) do
    random = 8 |> :crypto.strong_rand_bytes() |> Base.encode16(case: :lower)
    name = "kreuzberg_#{System.unique_integer([:positive])}_#{random}#{upload_extension(req)}"
    Path.join(System.tmp_dir!(), name)
  end

  # Keeps the extension of the client-supplied X-File-Name header, if it is a
  # plain alphanumeric one, so the temporary file is named like the original.
  # NOTE(review): presumably this helps the extractor recognize the format when
  # no MIME type is passed - confirm against Kreuzberg's detection rules.
  defp upload_extension(req) do
    with name when is_binary(name) <- :cowboy_req.header("x-file-name", req),
         ext = Path.extname(Path.basename(name)),
         true <- Regex.match?(~r/\A\.[A-Za-z0-9]{1,16}\z/, ext) do
      ext
    else
      _ -> ""
    end
  end
end
# Usage example: start the server and block until interrupted.
IO.puts("=== Kreuzberg MCP Server ===\n")

startup = KreuzbergMCPServer.start_server(port: 8080)

case startup do
  {:error, why} ->
    IO.puts("Failed to start server: #{inspect(why)}")

  {:ok, banner} ->
    IO.puts(banner)
    IO.puts("\nServer is running and ready to accept requests:")

    # One line per endpoint the server routes.
    Enum.each(
      [
        " - POST /extract - Extract from file path",
        " - POST /extract/file - Upload and extract",
        " - GET /health - Health check"
      ],
      &IO.puts/1
    )

    # Sleeping forever keeps the script, and with it the listener, alive.
    IO.puts("\nServer started. Press Ctrl+C to stop.")
    Process.sleep(:infinity)
end
```

Some files were not shown because too many files have changed in this diff Show More