This commit is contained in:
53
docs/snippets/elixir/advanced/10_chunking_config.exs
Normal file
53
docs/snippets/elixir/advanced/10_chunking_config.exs
Normal file
@@ -0,0 +1,53 @@
|
||||
# Advanced Chunking Configuration
|
||||
# This example shows how to configure sophisticated document chunking strategies
|
||||
# with fine-grained control over chunk size, overlap, and boundary respect.
|
||||
|
||||
alias Kreuzberg.ExtractionConfig
|
||||
|
||||
# Advanced chunking configuration with multiple parameters
|
||||
config = %ExtractionConfig{
|
||||
chunking: %{
|
||||
"enabled" => true,
|
||||
"max_characters" => 1000,
|
||||
"overlap" => 200,
|
||||
"min_size" => 100,
|
||||
"respect_boundaries" => true,
|
||||
"split_on" => ["sentence", "paragraph"]
|
||||
}
|
||||
}
|
||||
|
||||
# Use the configuration for extraction
|
||||
{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)
|
||||
|
||||
# Process the chunked results
|
||||
chunks = result.chunks || []
|
||||
|
||||
IO.puts("Total chunks: #{length(chunks)}")
|
||||
|
||||
Enum.each(chunks, fn chunk ->
|
||||
IO.inspect(%{
|
||||
text_length: String.length(chunk["content"]),
|
||||
page: chunk["page"],
|
||||
boundaries_respected: !String.ends_with?(chunk["content"], [" ", "\n"])
|
||||
})
|
||||
end)
|
||||
|
||||
# Prepend heading context to chunk content
|
||||
config_with_headings = %ExtractionConfig{
|
||||
chunking: %{
|
||||
"enabled" => true,
|
||||
"chunker_type" => "markdown",
|
||||
"prepend_heading_context" => true
|
||||
}
|
||||
}
|
||||
|
||||
{:ok, result_headings} = Kreuzberg.extract_file("document.md", nil, config_with_headings)
|
||||
|
||||
if result_headings.chunks do
|
||||
IO.puts("Generated #{length(result_headings.chunks)} chunks with prepended headings")
|
||||
|
||||
Enum.each(result_headings.chunks, fn chunk ->
|
||||
# Each chunk's content is prefixed with its heading breadcrumb
|
||||
IO.puts("Chunk preview: #{String.slice(chunk["content"], 0..80)}...")
|
||||
end)
|
||||
end
|
||||
@@ -0,0 +1,36 @@
|
||||
# Advanced Language Detection Configuration
|
||||
# This example demonstrates how to configure language detection with multiple
|
||||
# parameters for detecting and tracking all languages in documents.
|
||||
|
||||
alias Kreuzberg.ExtractionConfig
|
||||
|
||||
# Advanced language detection configuration
|
||||
config = %ExtractionConfig{
|
||||
language_detection: %{
|
||||
"enabled" => true,
|
||||
"detect_all" => true,
|
||||
"min_confidence" => 0.8
|
||||
}
|
||||
}
|
||||
|
||||
# Extract file with language detection enabled
|
||||
{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)
|
||||
|
||||
# Process the detection results
|
||||
detection_results = result.detected_languages || []
|
||||
|
||||
IO.puts("Detected languages:")
|
||||
|
||||
Enum.each(detection_results, fn lang_result ->
|
||||
confidence = Map.get(lang_result, "confidence", "unknown")
|
||||
language = Map.get(lang_result, "language", "unknown")
|
||||
IO.puts(" - #{language}: #{confidence}")
|
||||
end)
|
||||
|
||||
# Filter results by minimum confidence threshold
|
||||
high_confidence_languages = Enum.filter(detection_results, fn lang_result ->
|
||||
confidence = Map.get(lang_result, "confidence", 0)
|
||||
confidence >= 0.8
|
||||
end)
|
||||
|
||||
IO.puts("\nHigh confidence languages (>= 0.8): #{length(high_confidence_languages)}")
|
||||
@@ -0,0 +1,22 @@
|
||||
# Advanced keyword extraction configuration
|
||||
alias Kreuzberg.ExtractionConfig
|
||||
|
||||
config = %ExtractionConfig{
|
||||
keyword_extraction: %{
|
||||
"enabled" => true,
|
||||
"max_keywords" => 20,
|
||||
"min_score" => 0.6,
|
||||
"algorithm" => "tfidf"
|
||||
}
|
||||
}
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file("research_paper.pdf", nil, config)
|
||||
|
||||
if result.keywords do
|
||||
# Group by score ranges
|
||||
high_score = Enum.filter(result.keywords, fn kw -> kw["score"] >= 0.8 end)
|
||||
medium_score = Enum.filter(result.keywords, fn kw -> kw["score"] >= 0.6 and kw["score"] < 0.8 end)
|
||||
|
||||
IO.puts("High confidence keywords (#{length(high_score)}):")
|
||||
Enum.each(high_score, fn kw -> IO.puts(" - #{kw["word"]} (#{kw["score"]})") end)
|
||||
end
|
||||
@@ -0,0 +1,39 @@
|
||||
# Quality processing configuration
|
||||
alias Kreuzberg.ExtractionConfig
|
||||
|
||||
config = %ExtractionConfig{
|
||||
quality_processing: %{
|
||||
"enabled" => true,
|
||||
"min_quality_score" => 0.75,
|
||||
"remove_noise" => true,
|
||||
"enhance_clarity" => true
|
||||
},
|
||||
ocr: %{
|
||||
"enabled" => true,
|
||||
"backend" => "tesseract"
|
||||
}
|
||||
}
|
||||
|
||||
# Extract file with quality processing and OCR
|
||||
{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)
|
||||
|
||||
# Process the quality-processed results
|
||||
IO.puts("=== Quality Processing Results ===\n")
|
||||
|
||||
# Display extracted content with quality processing applied
|
||||
content_preview = String.slice(result.content, 0..300)
|
||||
IO.puts("Extracted Content (first 300 chars):")
|
||||
IO.puts(content_preview)
|
||||
IO.puts("\nTotal content size: #{byte_size(result.content)} bytes")
|
||||
|
||||
# Check quality metrics
|
||||
quality_score = result.quality_score
|
||||
if quality_score && quality_score > 0 do
|
||||
IO.puts("\nQuality Score: #{quality_score}")
|
||||
IO.puts("Quality Status: #{if quality_score >= 0.75, do: "Acceptable", else: "Below threshold"}")
|
||||
end
|
||||
|
||||
# Display any OCR results if applicable
|
||||
if result.images && length(result.images) > 0 do
|
||||
IO.puts("\nImages found: #{length(result.images)}")
|
||||
end
|
||||
21
docs/snippets/elixir/advanced/14_token_reduction_config.exs
Normal file
21
docs/snippets/elixir/advanced/14_token_reduction_config.exs
Normal file
@@ -0,0 +1,21 @@
|
||||
# Token reduction for different LLM context windows
|
||||
alias Kreuzberg.ExtractionConfig
|
||||
|
||||
# For GPT-4
|
||||
config_gpt4 = %ExtractionConfig{
|
||||
token_reduction: %{
|
||||
"enabled" => true,
|
||||
"target_tokens" => 8000,
|
||||
"strategy" => "intelligent",
|
||||
"preserve_structure" => true
|
||||
}
|
||||
}
|
||||
|
||||
# For Claude
|
||||
config_claude = %ExtractionConfig{
|
||||
token_reduction: %{
|
||||
"enabled" => true,
|
||||
"target_tokens" => 100000,
|
||||
"strategy" => "minimal"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,33 @@
|
||||
# Vector Database Integration
|
||||
# This example demonstrates how to prepare document chunks for integration with vector databases
|
||||
# by configuring chunking and processing the extracted content.
|
||||
|
||||
alias Kreuzberg.ExtractionConfig
|
||||
|
||||
# Configure extraction with chunking enabled
|
||||
config = %ExtractionConfig{
|
||||
chunking: %{
|
||||
"enabled" => true,
|
||||
"max_characters" => 512,
|
||||
"overlap" => 50
|
||||
}
|
||||
}
|
||||
|
||||
# Extract file with chunking
|
||||
{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)
|
||||
|
||||
# Prepare chunks for vector database ingestion
|
||||
documents = Enum.map(result.chunks || [], fn chunk ->
|
||||
%{
|
||||
content: chunk["content"],
|
||||
metadata: %{
|
||||
page: chunk["page"],
|
||||
char_count: String.length(chunk["content"])
|
||||
}
|
||||
}
|
||||
end)
|
||||
|
||||
IO.puts("Prepared #{length(documents)} documents for vector DB")
|
||||
|
||||
# The documents list can now be sent to your vector database
|
||||
# Example: documents |> MyVectorDB.index_documents()
|
||||
13
docs/snippets/elixir/advanced/chunk_page_mapping.exs
Normal file
13
docs/snippets/elixir/advanced/chunk_page_mapping.exs
Normal file
@@ -0,0 +1,13 @@
|
||||
# Extract with chunking and track page boundaries
|
||||
config = %Kreuzberg.ExtractionConfig{
|
||||
chunking: %{"enabled" => true, "max_characters" => 500},
|
||||
track_page_boundaries: true
|
||||
}
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)
|
||||
|
||||
# Map each chunk to its source page
|
||||
Enum.with_index(result.chunks || [], 1) |> Enum.each(fn {chunk, idx} ->
|
||||
page = chunk["page"] || "unknown"
|
||||
IO.puts("Chunk #{idx} from page #{page}")
|
||||
end)
|
||||
23
docs/snippets/elixir/advanced/chunk_page_mapping.md
Normal file
23
docs/snippets/elixir/advanced/chunk_page_mapping.md
Normal file
@@ -0,0 +1,23 @@
|
||||
```elixir title="Elixir"
|
||||
config_json = Jason.encode!(%{
|
||||
"chunking" => %{
|
||||
"enabled" => true,
|
||||
"max_characters" => 1024,
|
||||
"overlap" => 128
|
||||
}
|
||||
})
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
|
||||
|
||||
# Map chunks to their source pages
|
||||
# Map every chunk to a record that carries its source page.
# A missing chunk list is treated as empty so Enum.map/2 is never called on nil.
chunks_with_pages =
  (result.chunks || [])
  |> Enum.map(fn chunk ->
    %{
      "chunk_id" => chunk["id"],
      "content" => chunk["content"],
      "page_number" => chunk["page"]
    }
  end)
|
||||
|
||||
IO.inspect(chunks_with_pages, label: "Chunks with Page Mapping")
|
||||
```
|
||||
45
docs/snippets/elixir/advanced/chunking_config.md
Normal file
45
docs/snippets/elixir/advanced/chunking_config.md
Normal file
@@ -0,0 +1,45 @@
|
||||
```elixir title="Elixir"
|
||||
config_json = Jason.encode!(%{
|
||||
"chunking" => %{
|
||||
"enabled" => true,
|
||||
"max_characters" => 1000,
|
||||
"overlap" => 200,
|
||||
"min_size" => 100,
|
||||
"respect_boundaries" => true,
|
||||
"split_on" => ["sentence", "paragraph"]
|
||||
}
|
||||
})
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
|
||||
|
||||
chunks = result.chunks || []
|
||||
IO.puts("Total chunks: #{length(chunks)}")
|
||||
|
||||
Enum.each(chunks, fn chunk ->
|
||||
IO.inspect(%{
|
||||
text_length: String.length(chunk["content"]),
|
||||
page: chunk["page"],
|
||||
boundaries_respected: !String.ends_with?(chunk["content"], [" ", "\n"])
|
||||
})
|
||||
end)
|
||||
```
|
||||
|
||||
```elixir title="Elixir - Prepend Heading Context"
|
||||
config_json = Jason.encode!(%{
|
||||
"chunking" => %{
|
||||
"enabled" => true,
|
||||
"chunker_type" => "markdown",
|
||||
"prepend_heading_context" => true
|
||||
}
|
||||
})
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file_sync("document.md", "text/markdown", config_json)
|
||||
|
||||
if result.chunks do
|
||||
IO.puts("Generated #{length(result.chunks)} chunks with prepended headings")
|
||||
|
||||
Enum.each(result.chunks, fn chunk ->
|
||||
IO.puts("Chunk preview: #{String.slice(chunk["content"], 0..80)}...")
|
||||
end)
|
||||
end
|
||||
```
|
||||
26
docs/snippets/elixir/advanced/chunking_rag.exs
Normal file
26
docs/snippets/elixir/advanced/chunking_rag.exs
Normal file
@@ -0,0 +1,26 @@
|
||||
# Configure chunking for RAG/vector search
|
||||
config = %Kreuzberg.ExtractionConfig{
|
||||
chunking: %{
|
||||
"enabled" => true,
|
||||
"max_characters" => 512,
|
||||
"overlap" => 50,
|
||||
"respect_boundaries" => true
|
||||
}
|
||||
}
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)
|
||||
|
||||
# Prepare chunks for vector embedding and search
|
||||
# Build one embedding-ready record per chunk.
# A missing chunk list is treated as empty so Enum.map/2 is never called on nil.
chunks_for_embedding =
  (result.chunks || [])
  |> Enum.map(fn chunk ->
    %{
      "id" => chunk["id"],
      "content" => chunk["content"],
      "metadata" => %{
        "page" => chunk["page"],
        "source" => "document.pdf"
      }
    }
  end)
|
||||
|
||||
IO.inspect(chunks_for_embedding, label: "Chunks Ready for RAG")
|
||||
27
docs/snippets/elixir/advanced/chunking_rag.md
Normal file
27
docs/snippets/elixir/advanced/chunking_rag.md
Normal file
@@ -0,0 +1,27 @@
|
||||
```elixir title="Elixir"
|
||||
config_json = Jason.encode!(%{
|
||||
"chunking" => %{
|
||||
"enabled" => true,
|
||||
"max_characters" => 512,
|
||||
"overlap" => 50,
|
||||
"respect_boundaries" => true
|
||||
}
|
||||
})
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
|
||||
|
||||
# Prepare chunks for vector embedding and search
|
||||
# Build one embedding-ready record per chunk.
# A missing chunk list is treated as empty so Enum.map/2 is never called on nil.
chunks_for_embedding =
  (result.chunks || [])
  |> Enum.map(fn chunk ->
    %{
      "id" => chunk["id"],
      "content" => chunk["content"],
      "metadata" => %{
        "page" => chunk["page"],
        "source" => "document.pdf"
      }
    }
  end)
|
||||
|
||||
IO.inspect(chunks_for_embedding, label: "Chunks Ready for RAG")
|
||||
```
|
||||
20
docs/snippets/elixir/advanced/combining_all_features.exs
Normal file
20
docs/snippets/elixir/advanced/combining_all_features.exs
Normal file
@@ -0,0 +1,20 @@
|
||||
# Use all major features together
|
||||
config = %Kreuzberg.ExtractionConfig{
|
||||
ocr: %{"enabled" => true},
|
||||
chunking: %{"enabled" => true, "max_characters" => 1000},
|
||||
images: %{"extract" => true},
|
||||
language_detection: %{"enabled" => true},
|
||||
keyword_extraction: %{"enabled" => true}
|
||||
}
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)
|
||||
|
||||
# Process results with all extracted features
|
||||
IO.inspect(result, label: "Extraction Result")
|
||||
|
||||
# Access different feature outputs
|
||||
if result.content, do: IO.puts("Text: #{String.slice(result.content, 0..100)}")
|
||||
if result.detected_languages, do: IO.puts("Language: #{inspect(result.detected_languages)}")
|
||||
if result.metadata["keywords"], do: IO.puts("Keywords: #{inspect(result.metadata["keywords"])}")
|
||||
if result.chunks, do: IO.puts("Chunks: #{length(result.chunks)}")
|
||||
if result.images, do: IO.puts("Images: #{length(result.images)}")
|
||||
103
docs/snippets/elixir/advanced/complete_example.exs
Normal file
103
docs/snippets/elixir/advanced/complete_example.exs
Normal file
@@ -0,0 +1,103 @@
|
||||
```elixir title="Elixir"
|
||||
alias Kreuzberg.ExtractionConfig
|
||||
|
||||
# Build comprehensive configuration for all features
|
||||
config = %ExtractionConfig{
|
||||
# Enable OCR for scanned documents
|
||||
ocr: %{
|
||||
"enabled" => true,
|
||||
"backend" => "tesseract",
|
||||
"language" => "eng",
|
||||
"force_all_pages" => false
|
||||
},
|
||||
# Configure chunking for RAG applications
|
||||
chunking: %{
|
||||
"enabled" => true,
|
||||
"max_characters" => 1000,
|
||||
"overlap" => 100
|
||||
},
|
||||
# Extract images from documents
|
||||
images: %{
|
||||
"extract" => true
|
||||
},
|
||||
# Enable language detection
|
||||
language_detection: %{
|
||||
"enabled" => true
|
||||
},
|
||||
# Use caching for performance
|
||||
use_cache: true
|
||||
}
|
||||
|
||||
# Extract file with full configuration
|
||||
case Kreuzberg.extract_file("document.pdf", nil, config) do
|
||||
{:ok, result} ->
|
||||
IO.puts("=== Extraction Successful ===\n")
|
||||
|
||||
# 1. Process content
|
||||
IO.puts("--- Content ---")
|
||||
content_preview = String.slice(result.content, 0..100)
|
||||
IO.puts("Preview: #{content_preview}...")
|
||||
IO.puts("Total size: #{byte_size(result.content)} bytes\n")
|
||||
|
||||
# 2. Process metadata
|
||||
IO.puts("--- Metadata ---")
|
||||
IO.puts("MIME type: #{result.mime_type}")
|
||||
metadata = result.metadata || %{}
|
||||
IO.puts("Metadata keys: #{inspect(Map.keys(metadata))}")
|
||||
|
||||
# Handle PDF-specific metadata
|
||||
case metadata["pdf"] do
|
||||
pdf_meta when is_map(pdf_meta) ->
|
||||
IO.puts(" Pages: #{pdf_meta["page_count"]}")
|
||||
IO.puts(" Author: #{pdf_meta["author"]}")
|
||||
IO.puts(" Title: #{pdf_meta["title"]}")
|
||||
_ -> nil
|
||||
end
|
||||
IO.puts("")
|
||||
|
||||
# 3. Process tables
|
||||
IO.puts("--- Tables ---")
|
||||
tables = result.tables || []
|
||||
IO.puts("Total tables: #{length(tables)}")
|
||||
Enum.with_index(tables, 1) |> Enum.each(fn {table, idx} ->
|
||||
cells = table["cells"] || []
|
||||
IO.puts(" Table #{idx}: #{length(cells)} rows")
|
||||
markdown = table["markdown"]
|
||||
if markdown, do: IO.puts(" Markdown: #{String.slice(markdown, 0..50)}...")
|
||||
end)
|
||||
IO.puts("")
|
||||
|
||||
# 4. Process chunks for RAG
|
||||
IO.puts("--- Chunks ---")
|
||||
chunks = result.chunks || []
|
||||
IO.puts("Total chunks: #{length(chunks)}")
|
||||
# Report the size of each chunk's text.
# byte_size/1 only accepts binaries, so measure the chunk's "content" string
# rather than the chunk itself; a chunk without content counts as 0 bytes.
Enum.with_index(chunks, 1) |> Enum.each(fn {chunk, idx} ->
  IO.puts(" Chunk #{idx}: #{byte_size(chunk["content"] || "")} bytes")
end)
|
||||
IO.puts("")
|
||||
|
||||
# 5. Process detected languages
|
||||
IO.puts("--- Language Detection ---")
|
||||
languages = result.detected_languages || []
|
||||
if Enum.empty?(languages) do
|
||||
IO.puts("No languages detected")
|
||||
else
|
||||
Enum.each(languages, fn lang ->
|
||||
IO.puts(" Language: #{lang}")
|
||||
end)
|
||||
end
|
||||
IO.puts("")
|
||||
|
||||
# 6. Process images
|
||||
IO.puts("--- Images ---")
|
||||
images = result.images || []
|
||||
IO.puts("Total images: #{length(images)}")
|
||||
Enum.with_index(images, 1) |> Enum.each(fn {image, idx} ->
|
||||
IO.puts(" Image #{idx}: #{image["format"]} (#{image["size"]} bytes)")
|
||||
end)
|
||||
|
||||
{:error, reason} ->
|
||||
IO.puts("Extraction failed!")
|
||||
IO.puts("Error: #{inspect(reason)}")
|
||||
end
|
||||
```
|
||||
27
docs/snippets/elixir/advanced/embedding_with_chunking.exs
Normal file
27
docs/snippets/elixir/advanced/embedding_with_chunking.exs
Normal file
@@ -0,0 +1,27 @@
|
||||
# Extract and prepare for embeddings
|
||||
config = %Kreuzberg.ExtractionConfig{
|
||||
chunking: %{"enabled" => true, "max_characters" => 512},
|
||||
embeddings: %{"enabled" => true}
|
||||
}
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)
|
||||
|
||||
# Process chunks with embeddings for semantic search
|
||||
# Pair each chunk with a 1-based index and collect the fields needed for
# semantic search. A missing chunk list is treated as empty so the pipeline
# never enumerates nil.
embedded_chunks =
  (result.chunks || [])
  |> Enum.with_index(1)
  |> Enum.map(fn {chunk, idx} ->
    %{
      "chunk_id" => idx,
      "content" => chunk["content"],
      "embedding" => chunk["embedding"],
      "page" => chunk["page"],
      "metadata" => %{
        "document" => "document.pdf",
        "chunk_index" => idx
      }
    }
  end)
|
||||
|
||||
# Store embeddings in vector database
|
||||
IO.puts("Prepared #{length(embedded_chunks)} chunks with embeddings")
|
||||
IO.inspect(embedded_chunks, label: "Embedded Chunks")
|
||||
32
docs/snippets/elixir/advanced/embedding_with_chunking.md
Normal file
32
docs/snippets/elixir/advanced/embedding_with_chunking.md
Normal file
@@ -0,0 +1,32 @@
|
||||
```elixir title="Elixir"
|
||||
config_json = Jason.encode!(%{
|
||||
"chunking" => %{
|
||||
"enabled" => true,
|
||||
"max_characters" => 512
|
||||
},
|
||||
"embeddings" => %{
|
||||
"enabled" => true
|
||||
}
|
||||
})
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
|
||||
|
||||
# Process chunks with embeddings for semantic search
|
||||
# Pair each chunk with a 1-based index and collect the fields needed for
# semantic search. A missing chunk list is treated as empty so the pipeline
# never enumerates nil.
embedded_chunks =
  (result.chunks || [])
  |> Enum.with_index(1)
  |> Enum.map(fn {chunk, idx} ->
    %{
      "chunk_id" => idx,
      "content" => chunk["content"],
      "embedding" => chunk["embedding"],
      "page" => chunk["page"],
      "metadata" => %{
        "document" => "document.pdf",
        "chunk_index" => idx
      }
    }
  end)
|
||||
|
||||
IO.puts("Prepared #{length(embedded_chunks)} chunks with embeddings")
|
||||
IO.inspect(embedded_chunks, label: "Embedded Chunks")
|
||||
```
|
||||
15
docs/snippets/elixir/advanced/keyword_extraction_config.md
Normal file
15
docs/snippets/elixir/advanced/keyword_extraction_config.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```elixir title="Elixir"
|
||||
config_json = Jason.encode!(%{
|
||||
"keywords" => %{
|
||||
"algorithm" => "Yake",
|
||||
"max_keywords" => 10,
|
||||
"min_score" => 0.3
|
||||
}
|
||||
})
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file_sync("research_paper.pdf", "application/pdf", config_json)
|
||||
|
||||
if result.keywords do
|
||||
IO.puts("Keywords: #{inspect(result.keywords)}")
|
||||
end
|
||||
```
|
||||
30
docs/snippets/elixir/advanced/keyword_extraction_example.exs
Normal file
30
docs/snippets/elixir/advanced/keyword_extraction_example.exs
Normal file
@@ -0,0 +1,30 @@
|
||||
```elixir title="Elixir"
|
||||
alias Kreuzberg.ExtractionConfig
|
||||
|
||||
# Extract keywords from document
|
||||
config = %ExtractionConfig{
|
||||
keyword_extraction: %{
|
||||
"enabled" => true,
|
||||
"max_keywords" => 15
|
||||
}
|
||||
}
|
||||
|
||||
case Kreuzberg.extract_file("article.pdf", nil, config) do
|
||||
{:ok, result} ->
|
||||
IO.puts("=== Keyword Extraction ===\n")
|
||||
|
||||
# Display extracted keywords
|
||||
if result.keywords do
|
||||
IO.puts("Extracted keywords:")
|
||||
Enum.each(result.keywords, fn kw ->
|
||||
IO.puts("- #{kw["word"]}: #{kw["score"]}")
|
||||
end)
|
||||
else
|
||||
IO.puts("No keywords extracted")
|
||||
end
|
||||
|
||||
{:error, reason} ->
|
||||
IO.puts("Extraction failed!")
|
||||
IO.puts("Error: #{inspect(reason)}")
|
||||
end
|
||||
```
|
||||
18
docs/snippets/elixir/advanced/keyword_extraction_example.md
Normal file
18
docs/snippets/elixir/advanced/keyword_extraction_example.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```elixir title="Elixir"
|
||||
config_json = Jason.encode!(%{
|
||||
"keywords" => %{
|
||||
"algorithm" => "Yake",
|
||||
"max_keywords" => 10,
|
||||
"min_score" => 0.3
|
||||
}
|
||||
})
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file_sync("research_paper.pdf", "application/pdf", config_json)
|
||||
|
||||
if result.keywords do
|
||||
result.keywords
|
||||
|> Enum.each(fn %{"keyword" => kw, "score" => score} ->
|
||||
IO.puts("#{kw}: #{Float.round(score, 4)}")
|
||||
end)
|
||||
end
|
||||
```
|
||||
15
docs/snippets/elixir/advanced/language_detection_config.md
Normal file
15
docs/snippets/elixir/advanced/language_detection_config.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```elixir title="Elixir"
|
||||
config_json = Jason.encode!(%{
|
||||
"language_detection" => %{
|
||||
"enabled" => true,
|
||||
"min_confidence" => 0.8,
|
||||
"detect_multiple" => false
|
||||
}
|
||||
})
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
|
||||
|
||||
if result.language do
|
||||
IO.puts("Detected language: #{result.language}")
|
||||
end
|
||||
```
|
||||
@@ -0,0 +1,32 @@
|
||||
```elixir title="Elixir"
|
||||
alias Kreuzberg.ExtractionConfig
|
||||
|
||||
# Detect all languages in multilingual document
|
||||
config = %ExtractionConfig{
|
||||
language_detection: %{
|
||||
"enabled" => true,
|
||||
"detect_all" => true
|
||||
}
|
||||
}
|
||||
|
||||
case Kreuzberg.extract_file("multilingual.pdf", nil, config) do
|
||||
{:ok, result} ->
|
||||
IO.puts("=== Language Detection ===\n")
|
||||
|
||||
# Display detected languages
|
||||
languages = result.detected_languages || []
|
||||
if Enum.empty?(languages) do
|
||||
IO.puts("No languages detected")
|
||||
else
|
||||
IO.puts("Detected languages:")
|
||||
Enum.each(languages, fn lang ->
|
||||
IO.puts("- #{lang}")
|
||||
end)
|
||||
IO.puts("\nTotal languages: #{length(languages)}")
|
||||
end
|
||||
|
||||
{:error, reason} ->
|
||||
IO.puts("Extraction failed!")
|
||||
IO.puts("Error: #{inspect(reason)}")
|
||||
end
|
||||
```
|
||||
@@ -0,0 +1,18 @@
|
||||
```elixir title="Elixir"
|
||||
config_json = Jason.encode!(%{
|
||||
"language_detection" => %{
|
||||
"enabled" => true,
|
||||
"min_confidence" => 0.7,
|
||||
"detect_multiple" => true
|
||||
}
|
||||
})
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file_sync("multilingual_document.pdf", "application/pdf", config_json)
|
||||
|
||||
if result.languages do
|
||||
IO.puts("Detected languages:")
|
||||
Enum.each(result.languages, fn %{"language" => lang, "confidence" => conf} ->
|
||||
IO.puts(" - #{lang}: #{Float.round(conf, 4)}")
|
||||
end)
|
||||
end
|
||||
```
|
||||
28
docs/snippets/elixir/advanced/metadata_extraction.exs
Normal file
28
docs/snippets/elixir/advanced/metadata_extraction.exs
Normal file
@@ -0,0 +1,28 @@
|
||||
```elixir title="Elixir"
|
||||
{:ok, result} = Kreuzberg.extract_file("document.pdf")
|
||||
|
||||
# Metadata is flat — format-specific fields are at the top level
|
||||
metadata = result.metadata
|
||||
IO.puts("MIME type: #{result.mime_type}")
|
||||
IO.puts("All metadata keys: #{inspect(Map.keys(metadata))}")
|
||||
|
||||
# Access PDF metadata directly from the flat map
|
||||
page_count = metadata["page_count"]
|
||||
if page_count, do: IO.puts("Page count: #{page_count}")
|
||||
|
||||
authors = metadata["authors"] || []
|
||||
if authors != [], do: IO.puts("Authors: #{Enum.join(authors, ", ")}")
|
||||
|
||||
title = metadata["title"]
|
||||
if title, do: IO.puts("Title: #{title}")
|
||||
|
||||
# Access HTML metadata directly from the flat map
|
||||
{:ok, html_result} = Kreuzberg.extract_file("page.html")
|
||||
html_meta = html_result.metadata
|
||||
|
||||
keywords = html_meta["keywords"] || []
|
||||
if keywords != [], do: IO.puts("Keywords: #{Enum.join(keywords, ", ")}")
|
||||
|
||||
description = html_meta["description"]
|
||||
if description, do: IO.puts("Description: #{description}")
|
||||
```
|
||||
15
docs/snippets/elixir/advanced/quality_processing_config.md
Normal file
15
docs/snippets/elixir/advanced/quality_processing_config.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```elixir title="Elixir"
|
||||
config_json = Jason.encode!(%{
|
||||
"post_processors" => [
|
||||
%{
|
||||
"name" => "QualityFilter",
|
||||
"enabled" => true
|
||||
}
|
||||
]
|
||||
})
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
|
||||
|
||||
IO.puts("Extraction quality applied")
|
||||
IO.inspect(result.text, label: "Quality-filtered text")
|
||||
```
|
||||
39
docs/snippets/elixir/advanced/quality_processing_example.exs
Normal file
39
docs/snippets/elixir/advanced/quality_processing_example.exs
Normal file
@@ -0,0 +1,39 @@
|
||||
```elixir title="Elixir"
|
||||
alias Kreuzberg.ExtractionConfig
|
||||
|
||||
# Process document with quality filtering
|
||||
config = %ExtractionConfig{
|
||||
quality_processing: %{
|
||||
"enabled" => true,
|
||||
"min_quality_score" => 0.8
|
||||
}
|
||||
}
|
||||
|
||||
# Maps a numeric quality score to a human-readable label.
# Written as an anonymous function and bound before its first use: `defp` is
# only valid inside a module, so it cannot appear at the top level of a script.
quality_status = fn
  score when score >= 0.9 -> "Excellent"
  score when score >= 0.8 -> "Good"
  score when score >= 0.7 -> "Fair"
  _score -> "Poor"
end

case Kreuzberg.extract_file("document.pdf", nil, config) do
  {:ok, result} ->
    IO.puts("=== Quality Processing ===\n")

    # Display quality metrics if available
    quality_score = result.quality_score

    if quality_score && quality_score > 0 do
      IO.puts("Quality Score: #{quality_score}")
      IO.puts("Content Quality: #{quality_status.(quality_score)}")
    end

    # Display content with quality assurance
    IO.puts("\n--- Extracted Content ---")
    content_preview = String.slice(result.content, 0..200)
    IO.puts(content_preview)
    IO.puts("\nTotal size: #{byte_size(result.content)} bytes")

  {:error, reason} ->
    IO.puts("Extraction failed!")
    IO.puts("Error: #{inspect(reason)}")
end
|
||||
```
|
||||
22
docs/snippets/elixir/advanced/quality_processing_example.md
Normal file
22
docs/snippets/elixir/advanced/quality_processing_example.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```elixir title="Elixir"
|
||||
config_json = Jason.encode!(%{
|
||||
"post_processors" => [
|
||||
%{
|
||||
"name" => "QualityFilter",
|
||||
"enabled" => true
|
||||
}
|
||||
]
|
||||
})
|
||||
|
||||
{:ok, result_before} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", nil)
|
||||
|
||||
{:ok, result_after} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
|
||||
|
||||
# Compare text quality metrics
|
||||
text_before = result_before.text || ""
|
||||
text_after = result_after.text || ""
|
||||
|
||||
# Compare the character counts of the unprocessed and quality-filtered text.
before_len = String.length(text_before)
after_len = String.length(text_after)

IO.puts("Before quality processing: #{before_len} chars")
IO.puts("After quality processing: #{after_len} chars")

# The percentage is only defined for a non-empty baseline; dividing by zero
# would raise ArithmeticError.
if before_len > 0 do
  IO.puts("Improvement: #{Float.round((1 - after_len / before_len) * 100, 2)}%")
else
  IO.puts("Improvement: n/a (no text extracted before processing)")
end
|
||||
```
|
||||
19
docs/snippets/elixir/advanced/table_extraction.exs
Normal file
19
docs/snippets/elixir/advanced/table_extraction.exs
Normal file
@@ -0,0 +1,19 @@
|
||||
```elixir title="Elixir"
|
||||
{:ok, result} = Kreuzberg.extract_file("document.pdf")
|
||||
|
||||
# Treat a missing table list as empty so length/1 and the enumeration below
# never receive nil.
tables = result.tables || []
IO.puts("Total tables found: #{length(tables)}")
|
||||
|
||||
Enum.with_index(tables, 1) |> Enum.each(fn {table, index} ->
|
||||
IO.puts("\n--- Table #{index} ---")
|
||||
|
||||
# Access table cells
|
||||
cells = table["cells"] || []
|
||||
IO.puts("Rows: #{length(cells)}")
|
||||
|
||||
# Access table markdown representation
|
||||
markdown = table["markdown"]
|
||||
IO.puts("Markdown representation:")
|
||||
IO.puts(markdown)
|
||||
end)
|
||||
```
|
||||
17
docs/snippets/elixir/advanced/token_reduction_config.md
Normal file
17
docs/snippets/elixir/advanced/token_reduction_config.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```elixir title="Elixir"
|
||||
config_json = Jason.encode!(%{
|
||||
"token_reduction" => %{
|
||||
"mode" => "moderate",
|
||||
"preserve_markdown" => true
|
||||
}
|
||||
})
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file_sync("verbose_document.pdf", "application/pdf", config_json)
|
||||
|
||||
if result.original_token_count do
|
||||
IO.puts("Original tokens: #{result.original_token_count}")
|
||||
end
|
||||
if result.reduced_token_count do
|
||||
IO.puts("Reduced tokens: #{result.reduced_token_count}")
|
||||
end
|
||||
```
|
||||
42
docs/snippets/elixir/advanced/token_reduction_example.exs
Normal file
42
docs/snippets/elixir/advanced/token_reduction_example.exs
Normal file
@@ -0,0 +1,42 @@
|
||||
```elixir title="Elixir"
|
||||
alias Kreuzberg.ExtractionConfig
|
||||
|
||||
# Reduce token count for LLM
|
||||
config = %ExtractionConfig{
|
||||
token_reduction: %{
|
||||
"enabled" => true,
|
||||
"target_tokens" => 2000
|
||||
}
|
||||
}
|
||||
|
||||
case Kreuzberg.extract_file("document.pdf", nil, config) do
|
||||
{:ok, result} ->
|
||||
IO.puts("=== Token Reduction ===\n")
|
||||
|
||||
# Display content and token information
|
||||
content_size = byte_size(result.content)
|
||||
estimated_tokens = div(content_size, 4) # Rough estimate: 1 token ≈ 4 bytes
|
||||
|
||||
IO.puts("Content size: #{content_size} bytes")
|
||||
IO.puts("Estimated tokens: ~#{estimated_tokens}")
|
||||
IO.puts("Target tokens: 2000")
|
||||
|
||||
# Show reduction status
|
||||
if estimated_tokens > 2000 do
|
||||
reduction_percentage = trunc((1 - 2000 / estimated_tokens) * 100)
|
||||
IO.puts("\nToken reduction applied: ~#{reduction_percentage}% reduction")
|
||||
else
|
||||
IO.puts("\nNo reduction needed - content already below target")
|
||||
end
|
||||
|
||||
# Display reduced content preview
|
||||
IO.puts("\n--- Reduced Content ---")
|
||||
content_preview = String.slice(result.content, 0..300)
|
||||
IO.puts(content_preview)
|
||||
IO.puts("...")
|
||||
|
||||
{:error, reason} ->
|
||||
IO.puts("Extraction failed!")
|
||||
IO.puts("Error: #{inspect(reason)}")
|
||||
end
|
||||
```
|
||||
28
docs/snippets/elixir/advanced/token_reduction_example.md
Normal file
28
docs/snippets/elixir/advanced/token_reduction_example.md
Normal file
@@ -0,0 +1,28 @@
|
||||
```elixir title="Elixir"
|
||||
config_json = Jason.encode!(%{
|
||||
"token_reduction" => %{
|
||||
"mode" => "moderate",
|
||||
"preserve_markdown" => true
|
||||
}
|
||||
})
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file_sync("verbose_document.pdf", "application/pdf", config_json)
|
||||
|
||||
# Display token reduction metrics
|
||||
original = result.original_token_count || 0
|
||||
reduced = result.reduced_token_count || 0
|
||||
|
||||
IO.puts("Original tokens: #{original}")
|
||||
IO.puts("Reduced tokens: #{reduced}")
|
||||
|
||||
if original > 0 do
|
||||
reduction_percent = Float.round((1 - reduced / original) * 100, 2)
|
||||
IO.puts("Reduction: #{reduction_percent}%")
|
||||
end
|
||||
|
||||
# Show sample of reduced text
|
||||
if result.text do
|
||||
IO.puts("\nSample of reduced text:")
|
||||
IO.puts(String.slice(result.text, 0..200) <> "...")
|
||||
end
|
||||
```
|
||||
40
docs/snippets/elixir/advanced/vector_database_integration.md
Normal file
40
docs/snippets/elixir/advanced/vector_database_integration.md
Normal file
@@ -0,0 +1,40 @@
|
||||
```elixir title="Elixir"
|
||||
# Chunk a document, embed every chunk, and shape the result into records
# that can be upserted into a vector database.
config_json =
  Jason.encode!(%{
    "chunking" => %{
      "enabled" => true,
      "max_characters" => 512,
      "overlap" => 50
    },
    "embeddings" => %{
      "enabled" => true,
      "model" => "all-MiniLM-L6-v2"
    }
  })

{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)

# `chunks` may be nil when no chunks were produced; fall back to an empty
# list so the pipeline below yields zero records instead of crashing.
chunks = result.chunks || []

# Prepare data for vector database storage (chunk indices are 1-based).
# NOTE(review): assumes the result exposes `document_id` - confirm.
vector_db_records =
  chunks
  |> Enum.with_index(1)
  |> Enum.map(fn {chunk, idx} ->
    %{
      "id" => "#{result.document_id}-chunk-#{idx}",
      "vector" => chunk["embedding"],
      "metadata" => %{
        "content" => chunk["content"],
        "page" => chunk["page"],
        "document_id" => result.document_id,
        "chunk_index" => idx
      }
    }
  end)

IO.puts("Generated #{length(vector_db_records)} records for vector database")
IO.inspect(List.first(vector_db_records), label: "Sample Record")

# Example: Insert into Pinecone-like vector database
Enum.each(vector_db_records, fn record ->
  # vector_db_client.upsert(record)
  IO.puts("Would insert: #{record["id"]}")
end)
|
||||
```
|
||||
25
docs/snippets/elixir/api/batch_extract_bytes_sync.md
Normal file
25
docs/snippets/elixir/api/batch_extract_bytes_sync.md
Normal file
@@ -0,0 +1,25 @@
|
||||
```elixir title="Elixir"
|
||||
defmodule Example do
  # MIME types for the file extensions used in this example. Each file must be
  # extracted with its own type; sending a .docx as "application/pdf" fails.
  @mime_types %{
    ".pdf" => "application/pdf",
    ".docx" => "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
  }

  @doc """
  Reads several files into memory and extracts them concurrently.

  Prints one line per file and returns `:ok`.
  """
  def batch_extract_bytes do
    # Note: Batch extraction in Elixir is done via Task.async_stream over sync calls
    files = ["doc1.pdf", "doc2.docx", "report.pdf"]
    config = nil

    results =
      files
      |> Task.async_stream(
        fn file ->
          content = File.read!(file)
          Kreuzberg.extract_bytes_sync(content, mime_type_for(file), config)
        end,
        max_concurrency: 4
      )
      |> Enum.map(fn {:ok, result} -> result end)

    Enum.each(results, fn
      {:ok, content} -> IO.puts("Extracted: #{String.length(content)} chars")
      {:error, reason} -> IO.puts("Error: #{reason}")
    end)
  end

  # Looks up the MIME type from the file extension (case-insensitive).
  # Unknown extensions fall back to the generic binary type.
  defp mime_type_for(file) do
    extension = file |> Path.extname() |> String.downcase()
    Map.get(@mime_types, extension, "application/octet-stream")
  end
end
|
||||
```
|
||||
23
docs/snippets/elixir/api/batch_extract_files_sync.md
Normal file
23
docs/snippets/elixir/api/batch_extract_files_sync.md
Normal file
@@ -0,0 +1,23 @@
|
||||
```elixir title="Elixir"
|
||||
defmodule Example do
  @doc """
  Extracts a fixed list of files concurrently (at most four at a time) and
  prints one line per outcome. Returns `:ok`.
  """
  def batch_extract_files do
    config = nil
    paths = ["doc1.pdf", "doc2.docx", "report.pdf"]

    # The MIME type is left as nil for every file.
    extract = fn path -> Kreuzberg.extract_file_sync(path, nil, config) end

    outcomes =
      paths
      |> Task.async_stream(extract, max_concurrency: 4)
      |> Enum.map(fn {:ok, outcome} -> outcome end)

    Enum.each(outcomes, fn outcome ->
      case outcome do
        {:error, reason} -> IO.puts("Error: #{reason}")
        {:ok, content} -> IO.puts("File extracted: #{String.length(content)} chars")
      end
    end)
  end
end
|
||||
```
|
||||
30
docs/snippets/elixir/api/client_chunk_text.md
Normal file
30
docs/snippets/elixir/api/client_chunk_text.md
Normal file
@@ -0,0 +1,30 @@
|
||||
```elixir title="Elixir"
|
||||
defmodule Example do
  @doc """
  Sends text to the `/chunk` endpoint of a locally running server.

  Returns `{:ok, chunks}` on a 2xx response that carries a `"chunks"` list,
  otherwise `{:error, description}` where `description` is the inspected
  failure (transport error or unexpected response).
  """
  def chunk_text_via_http do
    text = "Your long document text here..."

    config = %{
      "chunking" => %{
        "max_characters" => 800,
        "overlap" => 100,
        "chunker_type" => "Markdown"
      }
    }

    payload = %{"text" => text, "config" => config}

    # Req decodes JSON response bodies automatically, so `body` is already a
    # map here; running it through Jason.decode/1 again would fail.
    case Req.post("http://localhost:8000/chunk", json: payload) do
      {:ok, %Req.Response{status: status, body: %{"chunks" => chunks}}}
      when status in 200..299 and is_list(chunks) ->
        IO.puts("Created #{length(chunks)} chunks")
        {:ok, chunks}

      error ->
        {:error, inspect(error)}
    end
  end
end
|
||||
```
|
||||
19
docs/snippets/elixir/api/client_extract_single_file.md
Normal file
19
docs/snippets/elixir/api/client_extract_single_file.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```elixir title="Elixir"
|
||||
defmodule Example do
  @doc """
  Uploads a file to the `/extract` endpoint of a locally running server.

  Returns `{:ok, body}` with the decoded response body on a 2xx response,
  otherwise `{:error, description}` with the inspected failure.
  """
  def extract_via_http do
    file_path = "document.pdf"

    # File.stat/1 verifies the file is accessible without loading it into
    # memory; the upload itself streams from disk.
    # `form_multipart` (Req >= 0.5) sends multipart/form-data; plain `form:`
    # would url-encode the fields and never upload the file contents.
    # Req decodes JSON response bodies automatically, so no Jason.decode/1.
    with {:ok, _stat} <- File.stat(file_path),
         {:ok, %Req.Response{status: status, body: body}} when status in 200..299 <-
           Req.post(
             "http://localhost:8000/extract",
             form_multipart: [file: File.stream!(file_path)]
           ) do
      IO.puts("Extracted content: #{body["content"]}")
      {:ok, body}
    else
      error -> {:error, inspect(error)}
    end
  end
end
|
||||
```
|
||||
41
docs/snippets/elixir/api/combining_all_features.md
Normal file
41
docs/snippets/elixir/api/combining_all_features.md
Normal file
@@ -0,0 +1,41 @@
|
||||
```elixir title="Elixir"
|
||||
defmodule Example do
  @doc """
  Runs one extraction with caching, OCR, chunking, image extraction and
  language detection all configured. Returns `:ok` or `:error`.
  """
  def full_extraction_pipeline do
    # Build a comprehensive extraction config as a JSON string or map
    ocr = %{"backend" => "tesseract", "language" => "eng"}

    chunking = %{
      "max_characters" => 800,
      "overlap" => 100,
      "chunker_type" => "Markdown",
      "prepend_heading_context" => true
    }

    settings = %{
      "use_cache" => true,
      "enable_quality_processing" => true,
      "force_ocr" => false,
      "ocr" => ocr,
      "chunking" => chunking,
      "output_format" => "Markdown",
      "include_document_structure" => true,
      "images" => %{"extract_images" => true},
      "language_detection" => %{"detect" => true}
    }

    config_json = Jason.encode!(settings)

    "report.pdf"
    |> Kreuzberg.extract_file_sync(nil, config_json)
    |> case do
      {:error, reason} ->
        IO.puts("Extraction failed: #{reason}")
        :error

      {:ok, result} ->
        IO.puts("Extraction successful")
        IO.puts("Content length: #{String.length(result)} chars")
        :ok
    end
  end
end
|
||||
```
|
||||
24
docs/snippets/elixir/api/error_handling.md
Normal file
24
docs/snippets/elixir/api/error_handling.md
Normal file
@@ -0,0 +1,24 @@
|
||||
```elixir title="Elixir"
|
||||
defmodule Example do
  @doc """
  Extracts a file and prints a message tailored to the kind of failure.
  """
  def handle_extraction_errors do
    # The MIME type is passed as nil.
    case Kreuzberg.extract_file_sync("document.txt", nil, nil) do
      {:ok, result} ->
        IO.puts("Success: #{String.length(result)} chars")

      {:error, reason} when is_binary(reason) ->
        # Error is a string description
        IO.puts(describe_error(reason))

      {:error, reason} ->
        # Non-string reasons (atoms, tuples, structs) are inspected rather
        # than interpolated, so they can never raise here.
        IO.puts("Extraction failed: #{inspect(reason)}")
    end
  end

  # Classifies a textual error. `String.contains?/2` is a regular function and
  # is not allowed in guards, so the checks live in a `cond` instead of
  # `case ... when` clauses.
  defp describe_error(msg) do
    cond do
      String.contains?(msg, "unsupported") -> "Unsupported format: #{msg}"
      String.contains?(msg, "not found") -> "File not found: #{msg}"
      true -> "Extraction failed: #{msg}"
    end
  end
end
|
||||
```
|
||||
28
docs/snippets/elixir/api/error_handling_extract.md
Normal file
28
docs/snippets/elixir/api/error_handling_extract.md
Normal file
@@ -0,0 +1,28 @@
|
||||
```elixir title="Elixir"
|
||||
defmodule Example do
  @doc """
  Extracts `path` step by step: existence check, read, MIME detection,
  extraction. Each step is tagged so a failure maps to its own message.

  Returns `{:ok, result}` or `{:error, message}`.
  """
  def robust_extract(path) do
    with {:file_exists, true} <- {:file_exists, File.exists?(path)},
         {:read, {:ok, bytes}} <- {:read, File.read(path)},
         {:mime, {:ok, mime}} <- {:mime, Kreuzberg.detect_mime_type_from_bytes(bytes)},
         {:extract, {:ok, extracted}} <-
           {:extract, Kreuzberg.extract_bytes_sync(bytes, mime, nil)} do
      {:ok, extracted}
    else
      {:extract, {:error, why}} -> {:error, "Extraction failed: #{why}"}
      {:mime, {:error, why}} -> {:error, "MIME detection failed: #{why}"}
      {:read, {:error, why}} -> {:error, "Failed to read file: #{inspect(why)}"}
      {:file_exists, false} -> {:error, "File not found: #{path}"}
    end
  end
end
|
||||
```
|
||||
22
docs/snippets/elixir/api/extract_bytes_async.md
Normal file
22
docs/snippets/elixir/api/extract_bytes_async.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```elixir title="Elixir"
|
||||
defmodule Example do
  @doc """
  Reads a PDF into memory, runs the extraction inside a task and prints the
  outcome. Returns `:ok` or `:error`.
  """
  def extract_from_bytes_async do
    pdf_bytes = File.read!("document.pdf")

    job = fn -> Kreuzberg.extract_bytes_async(pdf_bytes, "application/pdf", nil) end

    job
    |> Task.async()
    |> Task.await()
    |> case do
      {:error, reason} ->
        IO.puts("Error: #{reason}")
        :error

      {:ok, result} ->
        IO.puts("Content: #{result}")
        :ok
    end
  end
end
|
||||
```
|
||||
18
docs/snippets/elixir/api/extract_bytes_sync.md
Normal file
18
docs/snippets/elixir/api/extract_bytes_sync.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```elixir title="Elixir"
|
||||
defmodule Example do
  @doc """
  Reads a PDF into memory, extracts it synchronously and prints the outcome.
  Returns `:ok` or `:error`.
  """
  def extract_from_bytes do
    "document.pdf"
    |> File.read!()
    |> Kreuzberg.extract_bytes_sync("application/pdf", nil)
    |> case do
      {:error, reason} ->
        IO.puts("Error: #{reason}")
        :error

      {:ok, result} ->
        IO.puts("Content: #{result}")
        :ok
    end
  end
end
|
||||
```
|
||||
21
docs/snippets/elixir/api/extract_file_async.md
Normal file
21
docs/snippets/elixir/api/extract_file_async.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```elixir title="Elixir"
|
||||
defmodule Example do
  @doc """
  Runs a file extraction inside a task, waits for it and prints the outcome.
  Returns `:ok` or `:error`.
  """
  def extract_file_async do
    job = fn -> Kreuzberg.extract_file_async("document.pdf", nil, nil) end

    job
    |> Task.async()
    |> Task.await()
    |> case do
      {:error, reason} ->
        IO.puts("Error: #{reason}")
        :error

      {:ok, result} ->
        IO.puts("Content: #{result}")
        :ok
    end
  end
end
|
||||
```
|
||||
17
docs/snippets/elixir/api/extract_file_sync.md
Normal file
17
docs/snippets/elixir/api/extract_file_sync.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```elixir title="Elixir"
|
||||
defmodule Example do
  @doc """
  Extracts a file synchronously with no MIME type and no config, then prints
  the outcome. Returns `:ok` or `:error`.
  """
  def extract_file do
    "document.pdf"
    |> Kreuzberg.extract_file_sync(nil, nil)
    |> case do
      {:error, reason} ->
        IO.puts("Error: #{reason}")
        :error

      {:ok, result} ->
        IO.puts("Content: #{result}")
        :ok
    end
  end
end
|
||||
```
|
||||
435
docs/snippets/elixir/cache/disk_cache.exs
vendored
Normal file
435
docs/snippets/elixir/cache/disk_cache.exs
vendored
Normal file
@@ -0,0 +1,435 @@
|
||||
```elixir title="Elixir"
|
||||
# Disk Caching - Implement persistent disk caching for extraction results
|
||||
# Demonstrates advanced caching strategies for document extraction
|
||||
|
||||
defmodule KreuzbergDiskCache do
  @moduledoc """
  Disk-based caching layer for Kreuzberg extraction results.

  Provides persistent caching of extraction results with features like:
  - TTL-based cache expiration
  - Compression for large results
  - Cache statistics and management
  - Multi-tiered caching (memory + disk)
  """

  require Logger

  defmodule CacheEntry do
    @moduledoc """
    Represents a cached extraction result.
    """

    defstruct [
      :key,
      :result,
      :created_at,
      :accessed_at,
      :ttl_seconds,
      :size_bytes,
      :compressed
    ]

    @doc """
    Create a new cache entry.

    Timestamps are wall-clock Unix seconds. Entries are written to disk and
    may be read back by a later VM instance; monotonic time is only
    meaningful inside a single VM run, so it cannot be used for a TTL that
    has to survive restarts.
    """
    def new(key, result, ttl_seconds \\ 86400) do
      now = System.system_time(:second)

      %__MODULE__{
        key: key,
        result: result,
        created_at: now,
        accessed_at: now,
        ttl_seconds: ttl_seconds,
        size_bytes: calculate_size(result),
        compressed: false
      }
    end

    @doc """
    Check if entry has expired, i.e. it is older than its TTL.
    """
    def expired?(%__MODULE__{created_at: created_at, ttl_seconds: ttl}) do
      System.system_time(:second) - created_at > ttl
    end

    @doc """
    Update access time. Returns the updated entry.
    """
    def touch(%__MODULE__{} = entry) do
      %{entry | accessed_at: System.system_time(:second)}
    end

    # The size is the byte size of the `content` field; anything without a
    # binary `content` counts as 0 bytes.
    defp calculate_size(%{content: content}) when is_binary(content), do: byte_size(content)
    defp calculate_size(_result), do: 0
  end

  defmodule Cache do
    @moduledoc """
    Main disk cache implementation.

    The struct is immutable: `put/3`, `delete/2` and `clear/1` return the
    updated cache, which the caller must keep using.
    """

    defstruct [
      :cache_dir,
      :max_size_bytes,
      :ttl_seconds,
      :compression_enabled,
      :memory_cache
    ]

    @doc """
    Initialize disk cache, creating `cache_dir` if needed.

    Options: `:max_size_bytes` (default 1 GB), `:ttl_seconds` (default 7
    days) and `:compression_enabled` (default `true`). The in-memory tier
    always starts empty.
    """
    def new(cache_dir, opts \\ []) do
      File.mkdir_p!(cache_dir)

      %__MODULE__{
        cache_dir: cache_dir,
        max_size_bytes: Keyword.get(opts, :max_size_bytes, 1_000_000_000),
        ttl_seconds: Keyword.get(opts, :ttl_seconds, 604_800),
        compression_enabled: Keyword.get(opts, :compression_enabled, true),
        memory_cache: %{}
      }
    end

    @doc """
    Get cached result by key.

    Returns `{:hit, result}` or `:miss`. The memory tier is consulted first,
    then the disk. Because the cache struct is not returned, access times
    are not updated by a lookup.
    """
    def get(cache, key) do
      # Check memory cache first
      case Map.get(cache.memory_cache, key) do
        %CacheEntry{} = entry ->
          if CacheEntry.expired?(entry) do
            Logger.debug("Cache hit (memory) - expired: #{key}")
            :miss
          else
            Logger.debug("Cache hit (memory): #{key}")
            {:hit, entry.result}
          end

        nil ->
          get_from_disk(cache, key)
      end
    end

    @doc """
    Store result in cache (memory and disk) and return the updated cache.

    If the in-memory size exceeds `max_size_bytes` afterwards, the least
    recently used entries are evicted and the returned cache reflects that.
    """
    def put(cache, key, result) do
      entry = %{
        CacheEntry.new(key, result, cache.ttl_seconds)
        | compressed: cache.compression_enabled
      }

      # Store on disk
      store_on_disk(cache, key, entry)

      # Store in memory, then evict if over the limit. The cleaned-up cache
      # is what gets returned; otherwise evicted entries would linger in
      # memory after their disk files are gone.
      cache =
        %{cache | memory_cache: Map.put(cache.memory_cache, key, entry)}
        |> maybe_cleanup()

      Logger.info("Cache stored: #{key}")
      cache
    end

    @doc """
    Delete cache entry from memory and disk. Returns the updated cache.
    """
    def delete(cache, key) do
      # A missing file is not an error; File.rm/1 simply reports :enoent.
      _ = File.rm(cache_path(cache, key))

      Logger.info("Cache deleted: #{key}")
      %{cache | memory_cache: Map.delete(cache.memory_cache, key)}
    end

    @doc """
    Clear all cache entries. Returns the emptied cache.
    """
    def clear(cache) do
      # Clear disk cache
      File.rm_rf!(cache.cache_dir)
      File.mkdir_p!(cache.cache_dir)

      Logger.info("Cache cleared")
      %{cache | memory_cache: %{}}
    end

    @doc """
    Get cache statistics.

    Entry counts and sizes describe the in-memory tier of this cache struct;
    `disk_entries` is the number of files in the cache directory.
    """
    def stats(cache) do
      total_size = calculate_total_size(cache)
      entry_count = map_size(cache.memory_cache)

      disk_entries =
        case File.ls(cache.cache_dir) do
          {:ok, files} -> length(files)
          {:error, _} -> 0
        end

      # A zero limit would otherwise raise ArithmeticError on division.
      usage_percent =
        if cache.max_size_bytes > 0 do
          Float.round(total_size / cache.max_size_bytes * 100, 2)
        else
          0.0
        end

      %{
        total_entries: entry_count,
        memory_entries: entry_count,
        disk_entries: disk_entries,
        total_size_bytes: total_size,
        max_size_bytes: cache.max_size_bytes,
        usage_percent: usage_percent,
        compression_enabled: cache.compression_enabled
      }
    end

    # Private helpers

    # Reads and validates an entry from disk; expired entries are removed.
    defp get_from_disk(cache, key) do
      cache_file = cache_path(cache, key)

      case File.read(cache_file) do
        {:ok, data} ->
          case deserialize(data, cache.compression_enabled) do
            {:ok, entry} ->
              if CacheEntry.expired?(entry) do
                File.rm(cache_file)
                Logger.debug("Cache hit (disk) - expired: #{key}")
                :miss
              else
                Logger.debug("Cache hit (disk): #{key}")
                {:hit, entry.result}
              end

            {:error, reason} ->
              Logger.warning("Failed to deserialize cache: #{inspect(reason)}")
              :miss
          end

        # No file on disk is the ordinary miss, not a failure worth logging.
        {:error, :enoent} ->
          :miss

        {:error, reason} ->
          Logger.warning("Failed to read cache file: #{inspect(reason)}")
          :miss
      end
    end

    defp store_on_disk(cache, key, entry) do
      File.write!(cache_path(cache, key), serialize(entry, cache.compression_enabled))
    end

    defp cache_path(cache, key) do
      Path.join(cache.cache_dir, "#{key}.cache")
    end

    defp serialize(entry, compression_enabled) do
      data = :erlang.term_to_binary(entry)
      if compression_enabled, do: :zlib.compress(data), else: data
    end

    # Decodes an entry written by serialize/2. Returns {:error, reason} for
    # corrupt data or for a term that is not a CacheEntry.
    defp deserialize(data, compression_enabled) do
      uncompressed = if compression_enabled, do: :zlib.uncompress(data), else: data

      # `:safe` keeps a tampered cache file from creating new atoms or
      # external function references while it is being decoded.
      case :erlang.binary_to_term(uncompressed, [:safe]) do
        %CacheEntry{} = entry -> {:ok, entry}
        other -> {:error, {:unexpected_term, other}}
      end
    rescue
      e -> {:error, e}
    end

    defp calculate_total_size(cache) do
      cache.memory_cache
      |> Map.values()
      |> Enum.map(& &1.size_bytes)
      |> Enum.sum()
    end

    defp maybe_cleanup(cache) do
      total_size = calculate_total_size(cache)

      if total_size > cache.max_size_bytes do
        Logger.info("Cache size (#{total_size}) exceeds limit, starting cleanup")
        cleanup_lru(cache)
      else
        cache
      end
    end

    # Remove least recently used entries until the in-memory size drops to
    # half of the limit, so cleanup does not run again on the very next put.
    defp cleanup_lru(cache) do
      target_size = div(cache.max_size_bytes, 2)
      current_size = calculate_total_size(cache)

      cache.memory_cache
      |> Enum.sort_by(fn {_k, entry} -> entry.accessed_at end)
      |> Enum.reduce_while({cache, current_size}, fn {key, entry}, {acc_cache, size} ->
        if size <= target_size do
          {:halt, {acc_cache, size}}
        else
          {:cont, {delete(acc_cache, key), size - entry.size_bytes}}
        end
      end)
      |> elem(0)
    end
  end

  @doc """
  Initialize cache and extract with caching.

  Returns `{:ok, result, cache}` on a cache hit or a successful extraction,
  and `{error, cache}` when the extraction fails, where `error` is whatever
  `Kreuzberg.extract_file/1` returned.
  """
  def extract_with_cache(file_path, cache_dir, opts \\ []) do
    cache_dir
    |> Cache.new(opts)
    |> fetch_or_extract(file_path, opts)
  end

  @doc """
  Extract multiple files with batch caching.

  A single cache is threaded through the whole batch, so the returned
  statistics describe what this batch stored. Returns `{results, stats}`
  where each result is `{:ok, path, result}` or `{:error, path, reason}`.
  """
  def batch_extract_with_cache(file_paths, cache_dir, opts \\ []) do
    {results, cache} =
      Enum.map_reduce(file_paths, Cache.new(cache_dir, opts), fn path, cache ->
        case fetch_or_extract(cache, path, opts) do
          {:ok, result, cache} -> {{:ok, path, result}, cache}
          {{:error, reason}, cache} -> {{:error, path, reason}, cache}
        end
      end)

    {results, Cache.stats(cache)}
  end

  @doc """
  Manage cache - get stats, clear, etc.

  Supported actions: `:stats`, `:clear`, `:list` and `{:delete, key}`.
  Anything else returns `{:error, message}`. The cache is opened fresh on
  each call, so `:stats` reports an empty memory tier; only `disk_entries`
  reflects earlier runs.
  """
  def manage_cache(cache_dir, action, opts \\ []) do
    cache = Cache.new(cache_dir, opts)

    case action do
      :stats ->
        Cache.stats(cache)

      :clear ->
        Cache.clear(cache)

      :list ->
        case File.ls(cache_dir) do
          {:ok, files} -> files
          {:error, reason} -> {:error, reason}
        end

      {:delete, key} ->
        Cache.delete(cache, key)

      _ ->
        # inspect/1 because the action may be a tuple, which cannot be
        # interpolated directly.
        {:error, "Unknown action: #{inspect(action)}"}
    end
  end

  # Private helpers

  # Looks the file up in `cache` and extracts + stores it on a miss.
  defp fetch_or_extract(cache, file_path, opts) do
    cache_key = compute_cache_key(file_path, opts)

    case Cache.get(cache, cache_key) do
      {:hit, result} ->
        {:ok, result, cache}

      :miss ->
        Logger.info("Cache miss: #{file_path}")

        case Kreuzberg.extract_file(file_path) do
          {:ok, result} -> {:ok, result, Cache.put(cache, cache_key, result)}
          error -> {error, cache}
        end
    end
  end

  defp compute_cache_key(file_path, opts) do
    # Include file path and options in key
    content = "#{file_path}|#{inspect(opts)}"
    :crypto.hash(:sha256, content) |> Base.encode16(case: :lower)
  end
end
|
||||
|
||||
# Usage examples
IO.puts("=== Kreuzberg Disk Cache ===\n")

cache_dir = "/tmp/kreuzberg_cache"

# Example 1: Single file extraction with caching
IO.puts("Example 1: Single file extraction with caching")
IO.puts("-" <> String.duplicate("-", 40) <> "\n")

case KreuzbergDiskCache.extract_with_cache("document.pdf", cache_dir) do
  {:ok, result, cache} ->
    IO.puts("Extraction successful!")
    IO.puts("Content size: #{byte_size(result.content)} bytes")

    # Read the stats from the cache this extraction returned. Asking
    # manage_cache/2 for :stats opens a fresh cache whose memory tier is
    # empty, so it would always report zero entries here.
    stats = KreuzbergDiskCache.Cache.stats(cache)
    IO.puts("\nCache Statistics:")
    IO.puts("  Entries: #{stats.total_entries}")
    IO.puts("  Size: #{stats.total_size_bytes} bytes")
    IO.puts("  Usage: #{stats.usage_percent}%\n")

  {error, _cache} ->
    IO.puts("Extraction failed: #{inspect(error)}\n")
end

# Example 2: Batch extraction with cache statistics
IO.puts("Example 2: Batch extraction with caching")
IO.puts("-" <> String.duplicate("-", 40) <> "\n")

documents = ["doc1.pdf", "doc2.pdf", "doc3.pdf"]

{results, stats} = KreuzbergDiskCache.batch_extract_with_cache(documents, cache_dir)

successful = Enum.count(results, &match?({:ok, _, _}, &1))
IO.puts("Batch results:")
IO.puts("  Processed: #{length(documents)}")
IO.puts("  Successful: #{successful}")
IO.puts("\nCache Statistics:")
IO.puts("  Total entries: #{stats.total_entries}")
IO.puts("  Memory entries: #{stats.memory_entries}")
IO.puts("  Disk entries: #{stats.disk_entries}")
IO.puts("  Total size: #{stats.total_size_bytes} bytes")
IO.puts("  Usage: #{stats.usage_percent}%\n")

# Example 3: Cache management
IO.puts("Example 3: Cache management")
IO.puts("-" <> String.duplicate("-", 40) <> "\n")

# :list returns either a list of file names or {:error, reason}.
case KreuzbergDiskCache.manage_cache(cache_dir, :list) do
  {:error, reason} ->
    IO.puts("Could not list cache directory: #{inspect(reason)}")

  cached_files ->
    IO.puts("Cached files:")
    Enum.each(cached_files, fn file -> IO.puts("  - #{file}") end)
end

IO.puts("\nCache stats:")
stats = KreuzbergDiskCache.manage_cache(cache_dir, :stats)
IO.inspect(stats, pretty: true)
|
||||
```
|
||||
306
docs/snippets/elixir/cli/basic_cli.exs
Normal file
306
docs/snippets/elixir/cli/basic_cli.exs
Normal file
@@ -0,0 +1,306 @@
|
||||
```elixir title="Elixir"
|
||||
# Basic CLI Tool - Simple command-line interface for Kreuzberg
|
||||
# Demonstrates how to build a CLI for document extraction
|
||||
|
||||
defmodule KreuzbergCLI do
  @moduledoc """
  Command-line interface for Kreuzberg document extraction.

  Provides a simple, user-friendly CLI for extracting documents
  from the command line with progress feedback.
  """

  require Logger

  @doc """
  Main entry point for CLI application.

  Parses command-line arguments and executes the appropriate command.
  Returns `:ok` on success and `:error` on any failure.
  """
  def main(args) do
    args
    |> parse_args()
    |> execute()
  end

  # Turns the raw argument list into a command tuple for execute/1.
  defp parse_args([]), do: {:error, :no_command}

  defp parse_args(["extract", file | rest]), do: {:extract, file, parse_options(rest, %{})}

  defp parse_args(["batch", dir | rest]), do: {:batch, dir, parse_options(rest, %{})}

  defp parse_args([flag]) when flag in ["help", "-h", "--help"], do: {:help}

  # A known command without its required argument deserves a precise
  # message instead of "Unknown command".
  defp parse_args(["extract"]), do: {:error, "Missing <file> argument for extract"}

  defp parse_args(["batch"]), do: {:error, "Missing <directory> argument for batch"}

  defp parse_args([cmd]), do: {:error, "Unknown command: #{cmd}"}

  defp parse_args(_), do: {:error, :invalid_args}

  # Folds the option flags into a map. Unrecognized tokens are skipped.
  defp parse_options([], acc), do: acc

  # The help text documents both spellings, so both must be accepted.
  defp parse_options([flag | rest], acc) when flag in ["-v", "--verbose"] do
    parse_options(rest, Map.put(acc, :verbose, true))
  end

  defp parse_options(["--mime-type", mime | rest], acc) do
    parse_options(rest, Map.put(acc, :mime_type, mime))
  end

  defp parse_options(["--output", path | rest], acc) do
    parse_options(rest, Map.put(acc, :output, path))
  end

  defp parse_options(["--ocr" | rest], acc) do
    parse_options(rest, Map.put(acc, :enable_ocr, true))
  end

  defp parse_options(["--chunks" | rest], acc) do
    parse_options(rest, Map.put(acc, :enable_chunks, true))
  end

  defp parse_options([_ | rest], acc) do
    parse_options(rest, acc)
  end

  defp execute({:help}) do
    print_help()
    :ok
  end

  defp execute({:extract, file, opts}) do
    if File.exists?(file) do
      extract_file(file, opts)
    else
      IO.puts(:stderr, "Error: File not found: #{file}")
      :error
    end
  end

  defp execute({:batch, dir, opts}) do
    if File.dir?(dir) do
      batch_extract(dir, opts)
    else
      IO.puts(:stderr, "Error: Directory not found: #{dir}")
      :error
    end
  end

  defp execute({:error, reason}) do
    # String reasons are printed as-is; atoms are inspected.
    message = if is_binary(reason), do: reason, else: inspect(reason)
    IO.puts(:stderr, "Error: #{message}")
    print_help()
    :error
  end

  # Extracts one file, prints a summary and optionally saves it as JSON.
  defp extract_file(file_path, opts) do
    verbose = Map.get(opts, :verbose, false)
    mime_type = Map.get(opts, :mime_type, nil)
    output_path = Map.get(opts, :output, nil)

    config = build_config(opts)

    IO.puts("Extracting: #{file_path}")
    start_time = System.monotonic_time(:millisecond)

    case Kreuzberg.extract_file(file_path, mime_type, config) do
      {:ok, result} ->
        elapsed = System.monotonic_time(:millisecond) - start_time

        print_extraction_result(result, elapsed, verbose)

        if output_path do
          save_result(result, output_path)
        end

        :ok

      {:error, reason} ->
        IO.puts(:stderr, "Extraction failed: #{inspect(reason)}")
        :error
    end
  end

  # Extracts every supported document below `dir`, one after another.
  defp batch_extract(dir, opts) do
    verbose = Map.get(opts, :verbose, false)
    config = build_config(opts)

    pattern = Path.join(dir, "**/*.{pdf,docx,xlsx,txt,html,md}")
    files = Path.wildcard(pattern)

    case files do
      [] ->
        IO.puts("No documents found in #{dir}")
        :ok

      _ ->
        # Computed once; length/1 walks the whole list.
        total = length(files)
        IO.puts("Found #{total} documents\n")

        results =
          files
          |> Enum.with_index(1)
          |> Enum.map(fn {file, idx} ->
            IO.write("  [#{idx}/#{total}] ")
            start_time = System.monotonic_time(:millisecond)

            case Kreuzberg.extract_file(file, nil, config) do
              {:ok, result} ->
                elapsed = System.monotonic_time(:millisecond) - start_time
                IO.puts("#{Path.basename(file)} (#{elapsed}ms)")
                {:ok, file, result, elapsed}

              {:error, reason} ->
                IO.puts("#{Path.basename(file)} - ERROR")
                if verbose, do: IO.puts("    Error: #{inspect(reason)}")
                {:error, file, reason}
            end
          end)

        print_batch_summary(results)
        :ok
    end
  end

  # Builds the extraction config; OCR and chunking stay nil unless the
  # matching flag was given.
  defp build_config(opts) do
    %Kreuzberg.ExtractionConfig{
      ocr:
        if(Map.get(opts, :enable_ocr),
          do: %{"enabled" => true, "backend" => "tesseract"},
          else: nil
        ),
      chunking:
        if(Map.get(opts, :enable_chunks),
          do: %{"enabled" => true, "max_characters" => 1000, "overlap" => 100},
          else: nil
        ),
      use_cache: true
    }
  end

  defp print_extraction_result(result, elapsed_ms, verbose) do
    IO.puts("\nExtraction Results:")
    IO.puts("  Content size: #{byte_size(result.content)} bytes")
    IO.puts("  MIME type: #{result.mime_type}")
    IO.puts("  Processing time: #{elapsed_ms}ms")

    if result.metadata do
      IO.puts("  Metadata keys: #{Enum.count(result.metadata)}")
    end

    if result.tables && !Enum.empty?(result.tables) do
      IO.puts("  Tables found: #{length(result.tables)}")
    end

    if result.images && !Enum.empty?(result.images) do
      IO.puts("  Images found: #{length(result.images)}")
    end

    if result.chunks && !Enum.empty?(result.chunks) do
      IO.puts("  Chunks created: #{length(result.chunks)}")
    end

    if result.detected_languages && !Enum.empty?(result.detected_languages) do
      IO.puts("  Languages: #{Enum.join(result.detected_languages, ", ")}")
    end

    if verbose do
      IO.puts("\n  Full metadata:")
      IO.inspect(result.metadata, pretty: true)
    end

    IO.puts("")
  end

  defp print_batch_summary(results) do
    total = length(results)
    successful = Enum.count(results, &match?({:ok, _, _, _}, &1))
    failed = Enum.count(results, &match?({:error, _, _}, &1))

    total_time =
      results
      |> Enum.filter(&match?({:ok, _, _, _}, &1))
      |> Enum.map(fn {:ok, _, _, time} -> time end)
      |> Enum.sum()

    IO.puts("\nBatch Summary:")
    IO.puts("  Total: #{total}")
    IO.puts("  Successful: #{successful}")
    IO.puts("  Failed: #{failed}")
    IO.puts("  Total time: #{total_time}ms")
    # max/2 avoids dividing by zero when every document failed.
    IO.puts("  Average time: #{div(total_time, max(successful, 1))}ms/document")
  end

  # Writes the extraction result to `output_path` as pretty-printed JSON.
  defp save_result(result, output_path) do
    output_data = %{
      content: result.content,
      mime_type: result.mime_type,
      metadata: result.metadata,
      tables: result.tables || [],
      images: result.images || [],
      chunks: result.chunks || [],
      detected_languages: result.detected_languages || [],
      extracted_at: DateTime.utc_now()
    }

    case File.write(output_path, Jason.encode!(output_data, pretty: true)) do
      :ok ->
        IO.puts("Results saved to: #{output_path}")

      {:error, reason} ->
        IO.puts(:stderr, "Failed to save results: #{inspect(reason)}")
    end
  end

  defp print_help do
    IO.puts("""
    Kreuzberg CLI - Document Extraction Tool

    USAGE:
      kreuzberg extract <file> [OPTIONS]
      kreuzberg batch <directory> [OPTIONS]
      kreuzberg help

    COMMANDS:
      extract <file>        Extract content from a single document
      batch <directory>     Extract all documents in a directory
      help                  Show this help message

    OPTIONS:
      -v, --verbose         Show detailed output
      --mime-type <type>    Specify MIME type (e.g., application/pdf)
      --output <path>       Save results to JSON file
      --ocr                 Enable OCR for scanned documents
      --chunks              Enable document chunking for RAG

    EXAMPLES:
      kreuzberg extract document.pdf
      kreuzberg extract document.pdf --output results.json
      kreuzberg batch ./documents --ocr --chunks
      kreuzberg batch ./documents -v --output summary.json
    """)
  end
end
|
||||
|
||||
# Entry point when the file is run as a script, e.g.
#   elixir basic_cli.exs extract document.pdf
# `def` is only valid inside a module, so the dispatch is a plain top-level
# expression. The CLI outcome becomes the process exit status.
# For an escript build, `KreuzbergCLI.main/1` is the entry point instead.
case KreuzbergCLI.main(System.argv()) do
  :ok -> System.halt(0)
  :error -> System.halt(1)
end
|
||||
```
|
||||
455
docs/snippets/elixir/cli/cli_with_config.exs
Normal file
455
docs/snippets/elixir/cli/cli_with_config.exs
Normal file
@@ -0,0 +1,455 @@
|
||||
```elixir title="Elixir"
|
||||
# CLI with Configuration - Advanced CLI with config file support
|
||||
# Demonstrates managing complex extraction configurations via CLI
|
||||
|
||||
defmodule KreuzbergAdvancedCLI do
|
||||
@moduledoc """
|
||||
Advanced CLI with configuration file support for Kreuzberg.
|
||||
|
||||
Allows users to define extraction configurations in YAML/JSON files,
|
||||
specify preprocessing options, and manage extraction profiles.
|
||||
"""
|
||||
|
||||
require Logger
|
||||
|
||||
defmodule ConfigFile do
|
||||
@moduledoc """
|
||||
Configuration file parser and manager.
|
||||
"""
|
||||
|
||||
defstruct [
|
||||
:profiles,
|
||||
:default_profile,
|
||||
:cache_enabled,
|
||||
:cache_dir
|
||||
]
|
||||
|
||||
@doc """
|
||||
Load configuration from YAML/JSON file.
|
||||
"""
|
||||
# Dispatches on the file extension (case-insensitively) to the matching
# parser. Returns {:ok, %ConfigFile{}} or {:error, message}.
def load(config_path) do
  if File.exists?(config_path) do
    extension = Path.extname(config_path)

    case String.downcase(extension) do
      ext when ext in [".yaml", ".yml"] -> load_yaml(config_path)
      ".json" -> load_json(config_path)
      # Report the extension exactly as the user typed it.
      _other -> {:error, "Unsupported config format: #{extension}"}
    end
  else
    {:error, "Config file not found: #{config_path}"}
  end
end
|
||||
|
||||
# Reads and parses a single-document YAML config file.
# Returns {:ok, %ConfigFile{}} or {:error, message}.
defp load_yaml(path) do
  # By default yamerl builds mappings as proplists with charlist keys, which
  # never match the binary keys ("profiles", "default_profile", ...) that
  # parse_config/1 looks up. Ask for binaries and maps instead.
  yaml_opts = [str_node_as_binary: true, map_node_format: :map]

  with {:ok, content} <- File.read(path),
       [config] when is_map(config) <- :yamerl_constr.string(content, yaml_opts) do
    {:ok, parse_config(config)}
  else
    {:error, reason} -> {:error, "Failed to read config: #{inspect(reason)}"}
    # Empty input, multiple documents, or a non-mapping top-level node.
    other -> {:error, "Failed to parse YAML: #{inspect(other)}"}
  end
catch
  # yamerl reports syntax errors with throw/1 instead of a return value.
  :throw, error -> {:error, "Failed to parse YAML: #{inspect(error)}"}
end
|
||||
|
||||
# Reads and parses a JSON config file.
# Returns {:ok, %ConfigFile{}} or {:error, message}.
defp load_json(path) do
  case File.read(path) do
    {:ok, content} ->
      case Jason.decode(content) do
        # Only a JSON object can describe a config; an array or scalar at the
        # top level would otherwise crash inside parse_config/1.
        {:ok, config} when is_map(config) -> {:ok, parse_config(config)}
        error -> {:error, "Failed to parse JSON: #{inspect(error)}"}
      end

    {:error, reason} ->
      {:error, "Failed to read config: #{inspect(reason)}"}
  end
end
|
||||
|
||||
# Key/value lists are converted to a map and re-dispatched to the map clause.
defp parse_config(raw_config) when is_list(raw_config) do
  raw_config
  |> Map.new()
  |> parse_config()
end

# Builds the struct from string keys, filling in a default for each absent key.
defp parse_config(raw_config) when is_map(raw_config) do
  setting = fn key, default -> Map.get(raw_config, key, default) end

  %ConfigFile{
    profiles: setting.("profiles", %{}),
    default_profile: setting.("default_profile", "default"),
    cache_enabled: setting.("cache_enabled", true),
    cache_dir: setting.("cache_dir", "/tmp/kreuzberg_cache")
  }
end
|
||||
|
||||
@doc """
|
||||
Get extraction configuration for a profile.
|
||||
"""
|
||||
# A nil profile_name selects the config file's default profile.
# Returns {:ok, profile} or {:error, message}.
def get_profile(config_file, profile_name) do
  name = profile_name || config_file.default_profile

  case Map.fetch(config_file.profiles, name) do
    {:ok, profile} when not is_nil(profile) -> {:ok, profile}
    _missing -> {:error, "Profile not found: #{name}"}
  end
end
|
||||
|
||||
@doc """
|
||||
List all available profiles.
|
||||
"""
|
||||
def list_profiles(config_file) do
|
||||
Map.keys(config_file.profiles)
|
||||
end
|
||||
end
|
||||
|
||||
defmodule Extractor do
|
||||
@moduledoc """
|
||||
Main extraction engine with profile support.
|
||||
"""
|
||||
|
||||
# Resolves the profile, then runs the extraction with it.
# Supported option: :verbose (default false). A profile lookup error is
# returned unchanged.
def extract_with_profile(file_path, config_file, profile_name, opts \\ []) do
  verbose = Keyword.get(opts, :verbose, false)

  with {:ok, profile} <- ConfigFile.get_profile(config_file, profile_name) do
    extract_with_config(file_path, profile, config_file, verbose)
  end
end
|
||||
|
||||
# Runs one extraction: builds the config from the profile, prints a summary,
# preprocesses, extracts, postprocesses and prints the results.
# Returns {:ok, result} or {:error, reason}.
defp extract_with_config(file_path, profile, config_file, verbose) do
  if File.exists?(file_path) do
    use_cache = config_file.cache_enabled not in [false, nil]
    cache_dir = config_file.cache_dir

    # build_extraction_config/1 always sets use_cache: true; override it so
    # the config file's cache_enabled flag takes effect and matches what is
    # printed below.
    extraction_config = %{build_extraction_config(profile) | use_cache: use_cache}

    IO.puts("Profile: #{profile["name"]}")
    IO.puts("File: #{file_path}")
    IO.puts("Cache: #{if use_cache, do: "enabled (#{cache_dir})", else: "disabled"}")
    IO.puts("")

    # Preprocess if configured
    processed_file = preprocess_if_needed(file_path, profile, verbose)
    start_time = System.monotonic_time(:millisecond)

    try do
      case Kreuzberg.extract_file(processed_file, nil, extraction_config) do
        {:ok, result} ->
          elapsed = System.monotonic_time(:millisecond) - start_time

          # Post-process if configured
          final_result = postprocess_if_needed(result, profile)
          print_results(final_result, elapsed, verbose)
          {:ok, final_result}

        {:error, reason} ->
          {:error, reason}
      end
    after
      # Runs on success, on error, and if anything above raises.
      cleanup_temp_files(processed_file, file_path)
    end
  else
    {:error, "File not found: #{file_path}"}
  end
end
|
||||
|
||||
# Copies the per-feature sections of a profile (string-keyed map) into an
# ExtractionConfig. Sections missing from the profile come through as nil;
# use_cache is always true.
defp build_extraction_config(profile) do
  struct!(Kreuzberg.ExtractionConfig,
    ocr: profile["ocr"],
    chunking: profile["chunking"],
    quality_processing: profile["quality_processing"],
    language_detection: profile["language_detection"],
    keyword_extraction: profile["keyword_extraction"],
    images: profile["images"],
    use_cache: true
  )
end
|
||||
|
||||
# Threads the file path through each configured preprocessing step and
# returns the resulting path. With no "preprocessing" entry the original
# path is returned untouched.
defp preprocess_if_needed(file_path, profile, verbose) do
  steps = profile["preprocessing"]

  if is_nil(steps) do
    file_path
  else
    IO.puts("Preprocessing enabled:")
    temp_path = "/tmp/kreuzberg_#{System.unique_integer([:positive])}"

    # Apply preprocessing steps in order; each receives the previous step's path.
    Enum.reduce(steps, file_path, &apply_preprocessing_step(&1, &2, temp_path, verbose))
  end
end
|
||||
|
||||
# Logs the step when verbose and returns input_path unchanged; no file is
# actually transformed here. Unrecognised steps are ignored silently.
defp apply_preprocessing_step(step, input_path, _temp_path, verbose) do
  if verbose do
    case step do
      %{"type" => "rotate", "degrees" => degrees} ->
        IO.puts(" - Rotating #{degrees} degrees")

      %{"type" => "normalize", "target_format" => format} ->
        IO.puts(" - Normalizing to #{format}")

      %{"type" => "deskew"} ->
        IO.puts(" - Deskewing")

      _unknown ->
        :ok
    end
  end

  input_path
end
|
||||
|
||||
# Folds the profile's "postprocessing" steps over the result, in order.
# Without that entry the result is returned as-is.
defp postprocess_if_needed(result, profile) do
  steps = profile["postprocessing"]

  if is_nil(steps) do
    result
  else
    Enum.reduce(steps, result, &apply_postprocessing_step/2)
  end
end
|
||||
|
||||
# "filter_empty_chunks": drops chunks that carry no text.
defp apply_postprocessing_step(%{"type" => "filter_empty_chunks"}, result) do
  # byte_size/1 only accepts binaries, so calling it on a chunk map raises.
  # Pull the text out first; plain strings are still accepted.
  # NOTE(review): chunk shape is inferred from other snippets, which read
  # chunk["content"] or chunk.content — confirm against the library.
  chunk_text = fn
    text when is_binary(text) -> text
    %{"content" => text} when is_binary(text) -> text
    %{content: text} when is_binary(text) -> text
    _other -> ""
  end

  case result.chunks do
    nil ->
      result

    chunks ->
      filtered = Enum.reject(chunks, &(chunk_text.(&1) == ""))
      %{result | chunks: filtered}
  end
end
|
||||
|
||||
# "limit_tables": keeps at most `max` tables. The guard rejects negative
# values (Enum.take/2 would take from the tail) and non-integers (it would
# raise); such steps fall through to the no-op clause below.
defp apply_postprocessing_step(%{"type" => "limit_tables", "max" => max_tables}, result)
     when is_integer(max_tables) and max_tables >= 0 do
  case result.tables do
    nil -> result
    tables -> %{result | tables: Enum.take(tables, max_tables)}
  end
end

# Unknown or malformed steps leave the result untouched.
defp apply_postprocessing_step(_, result), do: result
|
||||
|
||||
# Deletes the processed file only when it is a different path from the
# original and lives under /tmp/.
defp cleanup_temp_files(processed_path, original_path) do
  temp_copy? =
    processed_path != original_path and String.starts_with?(processed_path, "/tmp/")

  if temp_copy?, do: File.rm(processed_path)
end
|
||||
|
||||
# Prints a summary of the extraction result. Optional sections (tables,
# chunks, languages) appear only when present and non-empty; the full
# metadata dump appears only in verbose mode.
defp print_results(result, elapsed_ms, verbose) do
  present? = fn items -> items && !Enum.empty?(items) end
  metadata = result.metadata

  IO.puts("Results:")
  IO.puts(" Content size: #{byte_size(result.content)} bytes")
  IO.puts(" Mime type: #{result.mime_type}")
  IO.puts(" Processing time: #{elapsed_ms}ms")

  if metadata, do: IO.puts(" Metadata entries: #{Enum.count(metadata)}")

  if present?.(result.tables), do: IO.puts(" Tables: #{length(result.tables)}")

  if present?.(result.chunks), do: IO.puts(" Chunks: #{length(result.chunks)}")

  if present?.(result.detected_languages) do
    IO.puts(" Languages: #{Enum.join(result.detected_languages, ", ")}")
  end

  if verbose && metadata do
    IO.puts("\n Full metadata:")
    IO.inspect(metadata, pretty: true, limit: :infinity)
  end

  IO.puts("")
end
|
||||
end
|
||||
|
||||
@doc """
|
||||
Main CLI entry point.
|
||||
"""
|
||||
# Parses the raw argument list and hands the result to execute/1.
def main(args) do
  parsed = parse_args(args)
  execute(parsed)
end
|
||||
|
||||
# Splits argv into {options, positional_args}. Invalid switches are discarded.
defp parse_args(args) do
  switches = [
    config: :string,
    profile: :string,
    output: :string,
    verbose: :boolean,
    list: :boolean
  ]

  aliases = [c: :config, p: :profile, o: :output, v: :verbose, l: :list]

  {opts, positional, _invalid} =
    OptionParser.parse(args, switches: switches, aliases: aliases)

  {opts, positional}
end
|
||||
|
||||
# No command given: show usage and fail.
defp execute({_opts, []}) do
  print_usage()
  :error
end

# Dispatch on the first positional argument; the rest go to the command.
defp execute({opts, ["extract" | rest]}), do: execute_extract(rest, opts)

defp execute({opts, ["profiles" | _rest]}), do: execute_list_profiles(opts)

defp execute({_opts, ["help" | _rest]}) do
  print_help()
  :ok
end

defp execute({_opts, [command | _rest]}) do
  IO.puts(:stderr, "Unknown command: #{command}")
  print_usage()
  :error
end
|
||||
|
||||
# Handles `extract <file>`: loads the config file, runs the extraction with
# the selected profile and optionally saves the result as JSON.
# Returns :ok or :error.
defp execute_extract(args, opts) do
  config_path = Keyword.get(opts, :config, "kreuzberg.yaml")
  profile = Keyword.get(opts, :profile)
  output_path = Keyword.get(opts, :output)
  verbose = Keyword.get(opts, :verbose, false)

  # Error reasons from this module are strings, but a reason passed through
  # from Kreuzberg.extract_file/3 may be any term; interpolating a term
  # without a String.Chars implementation would raise.
  describe = fn
    reason when is_binary(reason) -> reason
    reason -> inspect(reason)
  end

  case ConfigFile.load(config_path) do
    {:ok, config_file} ->
      case args do
        [] ->
          IO.puts(:stderr, "Error: No file specified")
          :error

        [file_path | _] ->
          case Extractor.extract_with_profile(file_path, config_file, profile, verbose: verbose) do
            {:ok, result} ->
              if output_path, do: save_result(result, output_path)
              :ok

            {:error, reason} ->
              IO.puts(:stderr, "Extraction failed: #{describe.(reason)}")
              :error
          end
      end

    {:error, reason} ->
      IO.puts(:stderr, "Configuration error: #{describe.(reason)}")
      :error
  end
end
|
||||
|
||||
# Handles `profiles`: prints the profile names found in the config file.
# Returns :ok or :error.
defp execute_list_profiles(opts) do
  opts
  |> Keyword.get(:config, "kreuzberg.yaml")
  |> ConfigFile.load()
  |> case do
    {:ok, config_file} ->
      names = ConfigFile.list_profiles(config_file)
      IO.puts("Available profiles:")
      for name <- names, do: IO.puts(" - #{name}")
      :ok

    {:error, reason} ->
      IO.puts(:stderr, "Configuration error: #{reason}")
      :error
  end
end
|
||||
|
||||
# Serialises the extraction result to pretty-printed JSON and writes it to
# output_path, reporting success or failure on the console.
defp save_result(result, output_path) do
  output_data = %{
    content: result.content,
    mime_type: result.mime_type,
    metadata: result.metadata,
    tables: result.tables || [],
    chunks: result.chunks || [],
    images: result.images || [],
    detected_languages: result.detected_languages || [],
    extracted_at: DateTime.utc_now()
  }

  # Jason.encode/2 returns an error tuple where encode!/2 would raise, so an
  # unencodable value is reported through the same failure branch as a
  # write error instead of crashing the CLI.
  with {:ok, json} <- Jason.encode(output_data, pretty: true),
       :ok <- File.write(output_path, json) do
    IO.puts("Results saved to: #{output_path}")
  else
    {:error, reason} ->
      IO.puts(:stderr, "Failed to save results: #{inspect(reason)}")
  end
end
|
||||
|
||||
defp print_usage do
|
||||
IO.puts("Usage: kreuzberg-cli <command> [options] [args]")
|
||||
end
|
||||
|
||||
defp print_help do
|
||||
IO.puts("""
|
||||
Kreuzberg Advanced CLI with Configuration
|
||||
|
||||
USAGE:
|
||||
kreuzberg extract <file> [OPTIONS]
|
||||
kreuzberg profiles [OPTIONS]
|
||||
kreuzberg help
|
||||
|
||||
COMMANDS:
|
||||
extract <file> Extract with configured profile
|
||||
profiles List available profiles
|
||||
help Show this help message
|
||||
|
||||
OPTIONS:
|
||||
-c, --config <path> Config file path (default: kreuzberg.yaml)
|
||||
-p, --profile <name> Profile name (default: from config)
|
||||
-o, --output <path> Save results to JSON
|
||||
-v, --verbose Verbose output
|
||||
|
||||
CONFIG FILE EXAMPLE:
|
||||
default_profile: "standard"
|
||||
cache_enabled: true
|
||||
cache_dir: "/tmp/kreuzberg_cache"
|
||||
|
||||
profiles:
|
||||
standard:
|
||||
name: "Standard Extraction"
|
||||
ocr:
|
||||
enabled: false
|
||||
chunking:
|
||||
enabled: true
|
||||
max_characters: 1000
|
||||
overlap: 100
|
||||
|
||||
ocr_intensive:
|
||||
name: "OCR + Language Detection"
|
||||
ocr:
|
||||
enabled: true
|
||||
backend: tesseract
|
||||
language_detection:
|
||||
enabled: true
|
||||
preprocessing:
|
||||
- type: deskew
|
||||
- type: rotate
|
||||
degrees: 90
|
||||
""")
|
||||
end
|
||||
end
|
||||
|
||||
# Entry point
|
||||
# Run the CLI with the script's arguments and turn its result into the
# process exit status.
case KreuzbergAdvancedCLI.main(System.argv()) do
  :ok ->
    IO.puts("\nDone.")

  :error ->
    # exit({:shutdown, code}) sets the OS exit status to `code`; a bare
    # exit(1) is treated as an abnormal exit with reason 1.
    exit({:shutdown, 1})
end
|
||||
```
|
||||
53
docs/snippets/elixir/config/advanced_config.md
Normal file
53
docs/snippets/elixir/config/advanced_config.md
Normal file
@@ -0,0 +1,53 @@
|
||||
```elixir title="Elixir"
|
||||
config_json = Jason.encode!(%{
|
||||
"use_cache" => true,
|
||||
"enable_quality_processing" => true,
|
||||
"ocr" => %{
|
||||
"backend" => "tesseract",
|
||||
"language" => "eng"
|
||||
},
|
||||
"chunking" => %{
|
||||
"max_characters" => 1000,
|
||||
"overlap" => 200,
|
||||
"embedding" => %{
|
||||
"model" => %{
|
||||
"preset" => %{
|
||||
"name" => "balanced"
|
||||
}
|
||||
},
|
||||
"batch_size" => 32,
|
||||
"normalize" => true,
|
||||
"show_download_progress" => false
|
||||
}
|
||||
},
|
||||
"language_detection" => %{
|
||||
"enabled" => true,
|
||||
"min_confidence" => 0.8,
|
||||
"detect_multiple" => false
|
||||
},
|
||||
"keywords" => %{
|
||||
"algorithm" => "Yake",
|
||||
"max_keywords" => 10,
|
||||
"min_score" => 0.1,
|
||||
"ngram_range" => [1, 3],
|
||||
"language" => "en"
|
||||
},
|
||||
"token_reduction" => %{
|
||||
"mode" => "moderate",
|
||||
"preserve_important_words" => true
|
||||
},
|
||||
"postprocessor" => %{
|
||||
"enabled" => true
|
||||
}
|
||||
})
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
|
||||
IO.puts("Content: #{result.content}")
|
||||
|
||||
if result.detected_languages do
|
||||
IO.puts("Languages: #{inspect(result.detected_languages)}")
|
||||
end
|
||||
|
||||
chunks_count = if result.chunks, do: length(result.chunks), else: 0
|
||||
IO.puts("Chunks: #{chunks_count}")
|
||||
```
|
||||
15
docs/snippets/elixir/config/chunking_config.md
Normal file
15
docs/snippets/elixir/config/chunking_config.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```elixir title="Elixir"
|
||||
config_json =
  Jason.encode!(%{
    "chunking" => %{
      "max_characters" => 1000,
      "overlap" => 200
    }
  })

{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)

# Other snippets in these docs guard result.chunks against nil; do the same
# so length/1 and Enum.each/2 never receive nil.
chunks = result.chunks || []
IO.puts("Chunks: #{length(chunks)}")

Enum.each(chunks, fn chunk ->
  IO.puts("Length: #{String.length(chunk.content)}")
end)
|
||||
```
|
||||
9
docs/snippets/elixir/config/config_basic.md
Normal file
9
docs/snippets/elixir/config/config_basic.md
Normal file
@@ -0,0 +1,9 @@
|
||||
```elixir title="Elixir"
|
||||
config_json = Jason.encode!(%{
|
||||
"use_cache" => true,
|
||||
"enable_quality_processing" => true
|
||||
})
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
|
||||
IO.puts(result.content)
|
||||
```
|
||||
5
docs/snippets/elixir/config/config_discover.md
Normal file
5
docs/snippets/elixir/config/config_discover.md
Normal file
@@ -0,0 +1,5 @@
|
||||
```elixir title="Elixir"
|
||||
# Extract with nil config to use discovered/default configuration
|
||||
{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", nil)
|
||||
IO.puts(result.content)
|
||||
```
|
||||
12
docs/snippets/elixir/config/config_ocr.md
Normal file
12
docs/snippets/elixir/config/config_ocr.md
Normal file
@@ -0,0 +1,12 @@
|
||||
```elixir title="Elixir"
|
||||
config_json =
  Jason.encode!(%{
    "ocr" => %{
      "backend" => "tesseract",
      "language" => "eng"
    }
  })

{:ok, result} = Kreuzberg.extract_file_sync("scanned.pdf", "application/pdf", config_json)

IO.puts("Content length: #{String.length(result.content)}")
# Other snippets in these docs treat result.tables as possibly nil
# (`result.tables || []`); length/1 would raise on nil.
IO.puts("Tables detected: #{length(result.tables || [])}")
|
||||
```
|
||||
20
docs/snippets/elixir/config/config_programmatic.md
Normal file
20
docs/snippets/elixir/config/config_programmatic.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```elixir title="Elixir"
|
||||
config_json = Jason.encode!(%{
|
||||
"use_cache" => true,
|
||||
"ocr" => %{
|
||||
"backend" => "tesseract",
|
||||
"language" => "eng+deu",
|
||||
"tesseract_config" => %{
|
||||
"psm" => 6
|
||||
}
|
||||
},
|
||||
"chunking" => %{
|
||||
"max_characters" => 1000,
|
||||
"overlap" => 200
|
||||
},
|
||||
"enable_quality_processing" => true
|
||||
})
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
|
||||
IO.puts("Content length: #{String.length(result.content)}")
|
||||
```
|
||||
13
docs/snippets/elixir/config/document_structure_config.md
Normal file
13
docs/snippets/elixir/config/document_structure_config.md
Normal file
@@ -0,0 +1,13 @@
|
||||
```elixir title="Document Structure Config (Elixir)"
|
||||
config = %Kreuzberg.ExtractionConfig{
  include_document_structure: true
}

# Every other call in these docs passes (path, mime_type, config); passing the
# config second would put it in the MIME-type position. nil leaves the MIME
# type unspecified, as in the other struct-config examples.
{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)
|
||||
|
||||
if result.document do
|
||||
Enum.each(result.document.nodes, fn node ->
|
||||
IO.puts("[#{node.content.node_type}]")
|
||||
end)
|
||||
end
|
||||
```
|
||||
36
docs/snippets/elixir/config/element_based_output.md
Normal file
36
docs/snippets/elixir/config/element_based_output.md
Normal file
@@ -0,0 +1,36 @@
|
||||
```elixir title="Element-Based Output (Elixir)"
|
||||
# Configure element-based output
|
||||
config = %Kreuzberg.ExtractionConfig{
  output_format: :element_based
}

# Extract document
# Every other call in these docs passes (path, mime_type, config); passing the
# config second would put it in the MIME-type position.
{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)
|
||||
|
||||
# Access elements
|
||||
Enum.each(result.elements, fn element ->
|
||||
IO.puts("Type: #{element.element_type}")
|
||||
|
||||
text = String.slice(element.text, 0, 100)
|
||||
IO.puts("Text: #{text}")
|
||||
|
||||
if element.metadata.page_number do
|
||||
IO.puts("Page: #{element.metadata.page_number}")
|
||||
end
|
||||
|
||||
if element.metadata.coordinates do
|
||||
coords = element.metadata.coordinates
|
||||
IO.puts("Coords: (#{coords.left}, #{coords.top}) - (#{coords.right}, #{coords.bottom})")
|
||||
end
|
||||
|
||||
IO.puts("---")
|
||||
end)
|
||||
|
||||
# Filter by element type
|
||||
titles = Enum.filter(result.elements, fn e -> e.element_type == :title end)
|
||||
|
||||
Enum.each(titles, fn title ->
|
||||
level = Map.get(title.metadata.additional, "level", "unknown")
|
||||
IO.puts("[#{level}] #{title.text}")
|
||||
end)
|
||||
```
|
||||
22
docs/snippets/elixir/config/embedding_config.md
Normal file
22
docs/snippets/elixir/config/embedding_config.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```elixir title="Elixir"
|
||||
config_json = Jason.encode!(%{
|
||||
"chunking" => %{
|
||||
"max_characters" => 1000,
|
||||
"overlap" => 200,
|
||||
"embedding" => %{
|
||||
"model" => %{
|
||||
"preset" => %{
|
||||
"name" => "balanced"
|
||||
}
|
||||
},
|
||||
"batch_size" => 16,
|
||||
"normalize" => true,
|
||||
"show_download_progress" => true
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
|
||||
chunks_count = if result.chunks, do: length(result.chunks), else: 0
|
||||
IO.puts("Chunks with embeddings: #{chunks_count}")
|
||||
```
|
||||
11
docs/snippets/elixir/config/html_output.md
Normal file
11
docs/snippets/elixir/config/html_output.md
Normal file
@@ -0,0 +1,11 @@
|
||||
```elixir title="Elixir"
|
||||
config_json = Jason.encode!(%{
|
||||
"output_format" => "Html",
|
||||
"html_output" => %{
|
||||
"theme" => "GitHub"
|
||||
}
|
||||
})
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
|
||||
IO.puts(result.content)
|
||||
```
|
||||
14
docs/snippets/elixir/config/keyword_extraction_config.md
Normal file
14
docs/snippets/elixir/config/keyword_extraction_config.md
Normal file
@@ -0,0 +1,14 @@
|
||||
```elixir title="Elixir"
|
||||
config_json = Jason.encode!(%{
|
||||
"keywords" => %{
|
||||
"algorithm" => "Yake",
|
||||
"max_keywords" => 10,
|
||||
"min_score" => 0.1,
|
||||
"ngram_range" => [1, 3],
|
||||
"language" => "en"
|
||||
}
|
||||
})
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
|
||||
IO.puts("Keywords: #{inspect(result.keywords)}")
|
||||
```
|
||||
13
docs/snippets/elixir/config/language_detection_config.md
Normal file
13
docs/snippets/elixir/config/language_detection_config.md
Normal file
@@ -0,0 +1,13 @@
|
||||
```elixir title="Elixir"
|
||||
config_json = Jason.encode!(%{
|
||||
"language_detection" => %{
|
||||
"enabled" => true,
|
||||
"min_confidence" => 0.8,
|
||||
"detect_multiple" => true
|
||||
}
|
||||
})
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
|
||||
IO.puts("Detected language: #{result.language}")
|
||||
IO.puts("Confidence: #{result.language_confidence}")
|
||||
```
|
||||
15
docs/snippets/elixir/config/ocr_dpi_config.md
Normal file
15
docs/snippets/elixir/config/ocr_dpi_config.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```elixir title="Elixir"
|
||||
config_json =
  Jason.encode!(%{
    "images" => %{
      "extract_images" => true,
      "target_dpi" => 300,
      "max_image_dimension" => 4096,
      "auto_adjust_dpi" => true,
      "min_dpi" => 150,
      "max_dpi" => 600
    }
  })

{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)

# Other snippets in these docs treat result.images as possibly nil
# (`result.images || []`); length/1 would raise on nil.
IO.puts("Extracted images: #{length(result.images || [])}")
|
||||
```
|
||||
14
docs/snippets/elixir/config/pdf_config.md
Normal file
14
docs/snippets/elixir/config/pdf_config.md
Normal file
@@ -0,0 +1,14 @@
|
||||
```elixir title="Elixir"
|
||||
config_json = Jason.encode!(%{
|
||||
"pdf_options" => %{
|
||||
"extract_images" => true,
|
||||
"passwords" => ["password123"],
|
||||
"extract_metadata" => true,
|
||||
"hierarchy" => %{}
|
||||
}
|
||||
})
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file_sync("encrypted.pdf", "application/pdf", config_json)
|
||||
IO.puts("Title: #{inspect(result.metadata.title)}")
|
||||
IO.puts("Authors: #{inspect(result.metadata.authors)}")
|
||||
```
|
||||
16
docs/snippets/elixir/config/pdf_hierarchy_config.md
Normal file
16
docs/snippets/elixir/config/pdf_hierarchy_config.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```elixir title="Elixir"
|
||||
config_json = Jason.encode!(%{
|
||||
"pdf_options" => %{
|
||||
"hierarchy" => %{
|
||||
"enabled" => true,
|
||||
"detection_threshold" => 0.75,
|
||||
"ocr_coverage_threshold" => 0.8,
|
||||
"min_level" => 1,
|
||||
"max_level" => 5
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
|
||||
IO.puts("Hierarchy levels: #{length(result.hierarchy)}")
|
||||
```
|
||||
14
docs/snippets/elixir/config/postprocessor_config.md
Normal file
14
docs/snippets/elixir/config/postprocessor_config.md
Normal file
@@ -0,0 +1,14 @@
|
||||
```elixir title="Elixir"
|
||||
config_json = Jason.encode!(%{
|
||||
"postprocessor" => %{
|
||||
"enabled" => true,
|
||||
"enabled_processors" => [
|
||||
"whitespace_normalizer",
|
||||
"unicode_normalizer"
|
||||
]
|
||||
}
|
||||
})
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
|
||||
IO.puts("Processed content: #{result.content}")
|
||||
```
|
||||
10
docs/snippets/elixir/config/quality_processing_config.md
Normal file
10
docs/snippets/elixir/config/quality_processing_config.md
Normal file
@@ -0,0 +1,10 @@
|
||||
```elixir title="Elixir"
|
||||
config_json = Jason.encode!(%{
|
||||
"enable_quality_processing" => true,
|
||||
"use_cache" => true
|
||||
})
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
|
||||
IO.puts("Quality score: #{result.quality_score}")
|
||||
IO.puts("Processing time: #{inspect(result.processing_time)}")
|
||||
```
|
||||
15
docs/snippets/elixir/config/tesseract_config.md
Normal file
15
docs/snippets/elixir/config/tesseract_config.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```elixir title="Elixir"
|
||||
config_json = Jason.encode!(%{
|
||||
"ocr" => %{
|
||||
"backend" => "tesseract",
|
||||
"language" => "eng+deu",
|
||||
"tesseract_config" => %{
|
||||
"psm" => 6,
|
||||
"oem" => 3
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file_sync("scanned.pdf", "application/pdf", config_json)
|
||||
IO.puts("OCR text: #{result.content}")
|
||||
```
|
||||
12
docs/snippets/elixir/config/token_reduction_config.md
Normal file
12
docs/snippets/elixir/config/token_reduction_config.md
Normal file
@@ -0,0 +1,12 @@
|
||||
```elixir title="Elixir"
|
||||
config_json = Jason.encode!(%{
|
||||
"token_reduction" => %{
|
||||
"mode" => "moderate",
|
||||
"preserve_important_words" => true
|
||||
}
|
||||
})
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
|
||||
IO.puts("Original tokens: #{result.token_count}")
|
||||
IO.puts("Reduced content: #{result.content}")
|
||||
```
|
||||
78
docs/snippets/elixir/configuration/chunking_config.exs
Normal file
78
docs/snippets/elixir/configuration/chunking_config.exs
Normal file
@@ -0,0 +1,78 @@
|
||||
```elixir title="Elixir"
|
||||
alias Kreuzberg.ExtractionConfig
|
||||
|
||||
# Example 1: Basic character-based chunking for RAG applications
|
||||
config = %ExtractionConfig{
|
||||
chunking: %{
|
||||
"enabled" => true,
|
||||
"max_characters" => 1000,
|
||||
"overlap" => 100,
|
||||
"min_size" => 200,
|
||||
"respect_boundaries" => true
|
||||
}
|
||||
}
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)
|
||||
|
||||
# Access chunks
|
||||
if result.chunks do
|
||||
IO.puts("Generated #{length(result.chunks)} chunks")
|
||||
|
||||
Enum.each(result.chunks, fn chunk ->
|
||||
IO.puts("Chunk: #{String.slice(chunk["content"], 0..50)}...")
|
||||
end)
|
||||
end
|
||||
|
||||
# Example 2: Markdown chunker with token-based sizing and heading context
|
||||
config2 = %ExtractionConfig{
|
||||
chunking: %{
|
||||
"enabled" => true,
|
||||
"chunker_type" => "markdown",
|
||||
"sizing" => %{
|
||||
"type" => "tokenizer",
|
||||
"model" => "Xenova/gpt-4o"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
{:ok, result2} = Kreuzberg.extract_file("document.md", nil, config2)
|
||||
|
||||
if result2.chunks do
|
||||
IO.puts("Generated #{length(result2.chunks)} markdown chunks")
|
||||
|
||||
Enum.each(result2.chunks, fn chunk ->
|
||||
IO.puts("\nChunk preview: #{String.slice(chunk["content"], 0..60)}...")
|
||||
|
||||
# Access heading context
|
||||
if is_map(chunk["metadata"]) and is_map(chunk["metadata"]["heading_context"]) do
|
||||
headings = chunk["metadata"]["heading_context"]["headings"] || []
|
||||
if length(headings) > 0 do
|
||||
IO.puts(" Headings in context:")
|
||||
Enum.each(headings, fn heading ->
|
||||
IO.puts(" - Level #{heading["level"]}: #{heading["text"]}")
|
||||
end)
|
||||
end
|
||||
end
|
||||
end)
|
||||
end
|
||||
|
||||
# Example 3: Prepend heading context to chunk content
|
||||
config3 = %ExtractionConfig{
|
||||
chunking: %{
|
||||
"enabled" => true,
|
||||
"chunker_type" => "markdown",
|
||||
"prepend_heading_context" => true
|
||||
}
|
||||
}
|
||||
|
||||
{:ok, result3} = Kreuzberg.extract_file("document.md", nil, config3)
|
||||
|
||||
if result3.chunks do
|
||||
IO.puts("Generated #{length(result3.chunks)} chunks with prepended headings")
|
||||
|
||||
Enum.each(result3.chunks, fn chunk ->
|
||||
# Each chunk's content is prefixed with its heading breadcrumb
|
||||
IO.puts("\nChunk preview: #{String.slice(chunk["content"], 0..80)}...")
|
||||
end)
|
||||
end
|
||||
```
|
||||
17
docs/snippets/elixir/configuration/config_basic.exs
Normal file
17
docs/snippets/elixir/configuration/config_basic.exs
Normal file
@@ -0,0 +1,17 @@
|
||||
```elixir title="Elixir"
|
||||
alias Kreuzberg.ExtractionConfig
|
||||
|
||||
config = %ExtractionConfig{
|
||||
ocr: %{"enabled" => true, "backend" => "tesseract"},
|
||||
chunking: %{"max_characters" => 1000, "overlap" => 100},
|
||||
language_detection: %{"enabled" => true},
|
||||
use_cache: true,
|
||||
force_ocr: false
|
||||
}
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)
|
||||
|
||||
# String.length/1 counts characters (graphemes); byte_size/1 would report
# bytes, which differs for any non-ASCII text.
IO.puts("Content length: #{String.length(result.content)} characters")
IO.puts("Detected languages: #{inspect(result.detected_languages)}")
IO.puts("Chunks: #{if result.chunks, do: length(result.chunks), else: 0}")
|
||||
```
|
||||
26
docs/snippets/elixir/configuration/config_discover.exs
Normal file
26
docs/snippets/elixir/configuration/config_discover.exs
Normal file
@@ -0,0 +1,26 @@
|
||||
```elixir title="Elixir"
|
||||
alias Kreuzberg.ExtractionConfig
|
||||
|
||||
# Load configuration from file
|
||||
# Automatically discovers and loads configuration from the user's config directory
|
||||
config_path = Path.expand("~/.kreuzberg/config.json")
|
||||
|
||||
config =
  if File.exists?(config_path) do
    config_data = config_path |> File.read!() |> Jason.decode!()

    # Map JSON keys onto the struct's own field names rather than calling
    # String.to_atom/1 on file contents, which would create a new atom for
    # every arbitrary key. Keys that are not struct fields are dropped, as
    # struct/2 did before.
    known_fields =
      %ExtractionConfig{}
      |> Map.from_struct()
      |> Map.new(fn {field, _default} -> {Atom.to_string(field), field} end)

    fields = for {key, value} <- config_data, field = known_fields[key], do: {field, value}

    struct(ExtractionConfig, fields)
  else
    IO.puts("Config file not found at #{config_path}. Using defaults.")
    %ExtractionConfig{}
  end
|
||||
|
||||
IO.puts("Configuration Source: #{if File.exists?(config_path), do: "#{config_path} (file)", else: "defaults"}")
|
||||
IO.puts("OCR Enabled: #{inspect(config.ocr["enabled"])}")
|
||||
IO.puts("Chunking Max Characters: #{inspect(config.chunking["max_characters"])}")
|
||||
IO.puts("Use Cache: #{inspect(config.use_cache)}")
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)
|
||||
|
||||
IO.puts("Document extraction complete")
|
||||
IO.puts("Content length: #{byte_size(result.content)} bytes")
|
||||
IO.puts("Languages detected: #{inspect(result.detected_languages)}")
|
||||
```
|
||||
28
docs/snippets/elixir/configuration/config_ocr.exs
Normal file
28
docs/snippets/elixir/configuration/config_ocr.exs
Normal file
@@ -0,0 +1,28 @@
|
||||
```elixir title="Elixir"
|
||||
alias Kreuzberg.ExtractionConfig
|
||||
|
||||
# Configure OCR-specific settings
|
||||
# Includes backend selection, language settings, and preprocessing options
|
||||
config = %ExtractionConfig{
|
||||
ocr: %{
|
||||
"enabled" => true,
|
||||
"backend" => "tesseract",
|
||||
"language" => "eng",
|
||||
"preprocessing" => true
|
||||
},
|
||||
chunking: %{
|
||||
"max_characters" => 2000,
|
||||
"overlap" => 200
|
||||
},
|
||||
use_cache: true,
|
||||
force_ocr: false
|
||||
}
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file("scanned_document.pdf", nil, config)
|
||||
|
||||
IO.puts("OCR Configuration Applied:")
|
||||
IO.puts("Backend: tesseract")
|
||||
IO.puts("Language: eng")
|
||||
IO.puts("Content extracted: #{byte_size(result.content)} bytes")
|
||||
IO.puts("Metadata: #{inspect(result.metadata)}")
|
||||
```
|
||||
44
docs/snippets/elixir/configuration/config_programmatic.exs
Normal file
44
docs/snippets/elixir/configuration/config_programmatic.exs
Normal file
@@ -0,0 +1,44 @@
|
||||
```elixir title="Elixir"
|
||||
alias Kreuzberg.ExtractionConfig
|
||||
|
||||
# Build configuration dynamically based on runtime conditions
|
||||
# Useful for environment-specific settings and feature flags
|
||||
|
||||
defmodule ConfigBuilder do
  @moduledoc """
  Builds an `ExtractionConfig` from a file type and an OCR flag.

  Every configuration starts from the same base (chunking with
  `"max_characters" => 1000` and `"overlap" => 100`, caching on); OCR
  settings are layered on top for the `:pdf` and `:image` file types.
  """

  @doc """
  Returns the configuration for `file_type` with OCR switched on or off.

  `:pdf` and `:image` receive an explicit `ocr` map. When `enable_ocr?` is
  `true` they also set `force_ocr: true`, and images additionally enable
  preprocessing. Any other combination returns the base configuration.
  """
  def build_config(file_type, enable_ocr?) do
    with_ocr(base_config(), file_type, enable_ocr?)
  end

  # Settings shared by every configuration this module produces.
  defp base_config do
    %ExtractionConfig{
      chunking: %{"max_characters" => 1000, "overlap" => 100},
      use_cache: true
    }
  end

  # Layers the OCR settings for one {file_type, flag} combination onto `config`.
  defp with_ocr(config, :pdf, true) do
    %{config | ocr: %{"enabled" => true, "backend" => "tesseract"}, force_ocr: true}
  end

  defp with_ocr(config, :image, true) do
    ocr_settings = %{"enabled" => true, "backend" => "tesseract", "preprocessing" => true}
    %{config | ocr: ocr_settings, force_ocr: true}
  end

  defp with_ocr(config, file_type, false) when file_type in [:pdf, :image] do
    %{config | ocr: %{"enabled" => false}}
  end

  # Unknown file types, or a flag that is not a boolean, keep the base configuration.
  defp with_ocr(config, _file_type, _flag), do: config
end
|
||||
|
||||
# Build configuration based on file type and requirements
|
||||
config = ConfigBuilder.build_config(:pdf, true)
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)
|
||||
|
||||
IO.puts("Dynamic configuration applied")
|
||||
IO.puts("Content: #{String.slice(result.content, 0..100)}")
|
||||
```
|
||||
20
docs/snippets/elixir/configuration/embedding_config.exs
Normal file
20
docs/snippets/elixir/configuration/embedding_config.exs
Normal file
@@ -0,0 +1,20 @@
|
||||
```elixir title="Elixir"
|
||||
alias Kreuzberg.ExtractionConfig
|
||||
|
||||
# Configure embeddings for vector search
|
||||
config = %ExtractionConfig{
|
||||
chunking: %{
|
||||
"enabled" => true,
|
||||
"max_characters" => 512,
|
||||
"overlap" => 50
|
||||
},
|
||||
embeddings: %{
|
||||
"enabled" => true,
|
||||
"model" => "sentence-transformers/all-MiniLM-L6-v2"
|
||||
}
|
||||
}
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)
|
||||
|
||||
IO.puts("Extracted chunks with embeddings: #{length(result.chunks || [])}")
|
||||
```
|
||||
@@ -0,0 +1,23 @@
|
||||
```elixir title="Elixir"
|
||||
alias Kreuzberg.ExtractionConfig
|
||||
|
||||
# Configure keyword extraction
|
||||
config = %ExtractionConfig{
|
||||
keyword_extraction: %{
|
||||
"enabled" => true,
|
||||
"max_keywords" => 10,
|
||||
"min_score" => 0.5
|
||||
}
|
||||
}
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)
|
||||
|
||||
# Access extracted keywords
|
||||
if result.keywords do
|
||||
IO.puts("Extracted #{length(result.keywords)} keywords")
|
||||
|
||||
Enum.each(result.keywords, fn keyword ->
|
||||
IO.puts("#{keyword["text"]}: #{keyword["score"]}")
|
||||
end)
|
||||
end
|
||||
```
|
||||
@@ -0,0 +1,23 @@
|
||||
```elixir title="Elixir"
|
||||
alias Kreuzberg.ExtractionConfig
|
||||
|
||||
# Configure language detection for multilingual documents
|
||||
# Automatically detect all languages present in the content
|
||||
config = %ExtractionConfig{
|
||||
language_detection: %{
|
||||
"enabled" => true,
|
||||
"detect_all" => true
|
||||
},
|
||||
chunking: %{
|
||||
"max_characters" => 1000,
|
||||
"overlap" => 100
|
||||
},
|
||||
use_cache: true
|
||||
}
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file("multilingual.pdf", nil, config)
|
||||
|
||||
IO.puts("Detected Languages:")
|
||||
IO.inspect(result.detected_languages)
|
||||
IO.puts("Content: #{String.slice(result.content, 0..100)}")
|
||||
```
|
||||
25
docs/snippets/elixir/configuration/pdf_config.exs
Normal file
25
docs/snippets/elixir/configuration/pdf_config.exs
Normal file
@@ -0,0 +1,25 @@
|
||||
```elixir title="Elixir"
|
||||
alias Kreuzberg.ExtractionConfig
|
||||
|
||||
# Configure PDF-specific extraction options
|
||||
# Extract annotations, metadata, and flatten forms for comprehensive document processing
|
||||
config = %ExtractionConfig{
|
||||
pdf: %{
|
||||
"extract_annotations" => true,
|
||||
"extract_metadata" => true,
|
||||
"flatten_forms" => true
|
||||
},
|
||||
chunking: %{
|
||||
"max_characters" => 1500,
|
||||
"overlap" => 150
|
||||
},
|
||||
use_cache: true
|
||||
}
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file("form_document.pdf", nil, config)
|
||||
|
||||
IO.puts("PDF Extraction Complete:")
|
||||
IO.puts("Content length: #{byte_size(result.content)} bytes")
|
||||
IO.puts("Metadata: #{inspect(result.metadata)}")
|
||||
IO.puts("Annotations present: #{map_size(result.metadata["annotations"] || %{}) > 0}")
|
||||
```
|
||||
29
docs/snippets/elixir/configuration/postprocessor_config.exs
Normal file
29
docs/snippets/elixir/configuration/postprocessor_config.exs
Normal file
@@ -0,0 +1,29 @@
|
||||
```elixir title="Elixir"
|
||||
alias Kreuzberg.ExtractionConfig
|
||||
|
||||
# Configure post-processing pipeline
|
||||
# Applies transformations to extracted content after initial processing
|
||||
config = %ExtractionConfig{
|
||||
postprocessing: %{
|
||||
"remove_whitespace" => true,
|
||||
"normalize_unicode" => true,
|
||||
"fix_encoding" => true
|
||||
},
|
||||
ocr: %{
|
||||
"enabled" => true,
|
||||
"backend" => "tesseract"
|
||||
},
|
||||
use_cache: true,
|
||||
force_ocr: false
|
||||
}
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)
|
||||
|
||||
IO.puts("Post-Processing Configuration Applied:")
|
||||
IO.puts("Remove Whitespace: true")
|
||||
IO.puts("Normalize Unicode: true")
|
||||
IO.puts("Fix Encoding: true")
|
||||
# result.content is the post-processed text, so this is not the "original" length.
IO.puts("Content length: #{byte_size(result.content)} bytes")
|
||||
IO.puts("Processed content: #{String.slice(result.content, 0..100)}...")
|
||||
IO.puts("Metadata: #{inspect(result.metadata)}")
|
||||
```
|
||||
@@ -0,0 +1,30 @@
|
||||
```elixir title="Elixir"
|
||||
alias Kreuzberg.ExtractionConfig
|
||||
|
||||
# Configure quality processing settings
|
||||
# Enable noise removal and set minimum quality thresholds for extracted content
|
||||
config = %ExtractionConfig{
|
||||
quality_processing: %{
|
||||
"enabled" => true,
|
||||
"min_quality_score" => 0.7,
|
||||
"remove_noise" => true
|
||||
},
|
||||
ocr: %{
|
||||
"enabled" => true,
|
||||
"backend" => "tesseract"
|
||||
},
|
||||
chunking: %{
|
||||
"max_characters" => 1000,
|
||||
"overlap" => 100
|
||||
},
|
||||
use_cache: true
|
||||
}
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file("noisy_document.pdf", nil, config)
|
||||
|
||||
IO.puts("Quality Processing Applied:")
|
||||
IO.puts("Content quality score: #{result.quality_score}")
|
||||
IO.puts("Noise removed: true")
|
||||
IO.puts("Content length: #{byte_size(result.content)} bytes")
|
||||
IO.puts("Processing complete: #{inspect(result)}")
|
||||
```
|
||||
29
docs/snippets/elixir/configuration/tesseract_config.exs
Normal file
29
docs/snippets/elixir/configuration/tesseract_config.exs
Normal file
@@ -0,0 +1,29 @@
|
||||
```elixir title="Elixir"
|
||||
alias Kreuzberg.ExtractionConfig
|
||||
|
||||
# Configure Tesseract OCR settings
|
||||
# Includes language selection, page segmentation mode, OCR engine mode, and DPI settings
|
||||
config = %ExtractionConfig{
|
||||
ocr: %{
|
||||
"enabled" => true,
|
||||
"backend" => "tesseract",
|
||||
"language" => "eng",
|
||||
"psm" => 3,
|
||||
"oem" => 3,
|
||||
"dpi" => 300
|
||||
},
|
||||
use_cache: true,
|
||||
force_ocr: false
|
||||
}
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)
|
||||
|
||||
IO.puts("Tesseract Configuration Applied:")
|
||||
IO.puts("Backend: tesseract")
|
||||
IO.puts("Language: eng")
|
||||
IO.puts("PSM (Page Segmentation Mode): 3")
|
||||
IO.puts("OEM (OCR Engine Mode): 3")
|
||||
IO.puts("DPI: 300")
|
||||
IO.puts("Content extracted: #{byte_size(result.content)} bytes")
|
||||
IO.puts("Metadata: #{inspect(result.metadata)}")
|
||||
```
|
||||
@@ -0,0 +1,28 @@
|
||||
```elixir title="Elixir"
|
||||
alias Kreuzberg.ExtractionConfig
|
||||
|
||||
# Configure token reduction for LLM context windows
|
||||
# Helps manage token usage when working with large language models
|
||||
config = %ExtractionConfig{
|
||||
token_reduction: %{
|
||||
"enabled" => true,
|
||||
"target_tokens" => 4000,
|
||||
"strategy" => "truncate"
|
||||
},
|
||||
ocr: %{
|
||||
"enabled" => true,
|
||||
"backend" => "tesseract"
|
||||
},
|
||||
use_cache: true
|
||||
}
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file("large_document.pdf", nil, config)
|
||||
|
||||
IO.puts("Token Reduction Configuration Applied:")
|
||||
IO.puts("Token Reduction Enabled: true")
|
||||
IO.puts("Target Tokens: 4000")
|
||||
IO.puts("Strategy: truncate")
|
||||
IO.puts("Content extracted: #{byte_size(result.content)} bytes")
|
||||
IO.puts("Tokens reduced: #{inspect(result.metadata[:token_reduction_applied])}")
|
||||
IO.puts("Final metadata: #{inspect(result.metadata)}")
|
||||
```
|
||||
54
docs/snippets/elixir/core/batch_extract_bytes_sync.exs
Normal file
54
docs/snippets/elixir/core/batch_extract_bytes_sync.exs
Normal file
@@ -0,0 +1,54 @@
|
||||
```elixir title="Elixir"
|
||||
# Extract from multiple binary inputs in a batch operation
|
||||
# Useful for processing multiple documents in memory
|
||||
|
||||
# Prepare binary data from multiple sources
|
||||
{:ok, pdf_data_1} = File.read("document1.pdf")
|
||||
{:ok, pdf_data_2} = File.read("document2.pdf")
|
||||
{:ok, pdf_data_3} = File.read("document3.pdf")
|
||||
|
||||
data_list = [pdf_data_1, pdf_data_2, pdf_data_3]
|
||||
|
||||
# Option 1: Use single MIME type for all inputs
|
||||
{:ok, results} = Kreuzberg.batch_extract_bytes(data_list, "application/pdf")
|
||||
|
||||
# Process results
|
||||
Enum.each(results, fn result ->
  # byte_size/1 counts bytes, not characters, so label the figure accordingly.
  IO.puts("Content length: #{byte_size(result.content)} bytes")
  IO.puts("MIME type: #{result.mime_type}")
  IO.puts("Tables found: #{length(result.tables)}")
  IO.puts("---")
end)
|
||||
|
||||
IO.puts("Total documents processed: #{length(results)}")
|
||||
|
||||
# Option 2: Use different MIME types for each input
|
||||
mime_types = ["application/pdf", "text/html", "application/pdf"]
|
||||
{:ok, mixed_results} = Kreuzberg.batch_extract_bytes(data_list, mime_types)
|
||||
|
||||
# Option 3: Batch extraction with configuration
|
||||
config = %Kreuzberg.ExtractionConfig{
|
||||
ocr: %{"enabled" => true, "backend" => "tesseract"},
|
||||
extract_images: true
|
||||
}
|
||||
|
||||
case Kreuzberg.batch_extract_bytes(data_list, "application/pdf", config) do
|
||||
{:ok, results} ->
|
||||
IO.puts("Successfully extracted #{length(results)} documents")
|
||||
Enum.each(results, fn result ->
|
||||
IO.puts("Content: #{String.slice(result.content, 0..100)}...")
|
||||
end)
|
||||
|
||||
{:error, reason} ->
|
||||
IO.puts("Batch extraction failed: #{reason}")
|
||||
end
|
||||
|
||||
# Option 4: Using the bang variant (raises on error)
|
||||
try do
|
||||
results = Kreuzberg.batch_extract_bytes!(data_list, "application/pdf")
|
||||
IO.puts("Extracted #{length(results)} documents successfully")
|
||||
rescue
|
||||
error in Kreuzberg.Error ->
|
||||
IO.puts("Error: #{error.message}")
|
||||
end
|
||||
```
|
||||
14
docs/snippets/elixir/core/batch_extract_files_sync.exs
Normal file
14
docs/snippets/elixir/core/batch_extract_files_sync.exs
Normal file
@@ -0,0 +1,14 @@
|
||||
```elixir title="Elixir"
|
||||
file_paths = ["document1.pdf", "document2.pdf", "document3.pdf"]
|
||||
|
||||
{:ok, results} = Kreuzberg.batch_extract_files(file_paths)
|
||||
|
||||
Enum.each(results, fn result ->
  # This line prints the MIME type, so it is labelled as such rather than "File".
  IO.puts("MIME type: #{result.mime_type}")
  # byte_size/1 counts bytes, not characters.
  IO.puts("Content length: #{byte_size(result.content)} bytes")
  IO.puts("Tables: #{length(result.tables)}")
  IO.puts("---")
end)
|
||||
|
||||
IO.puts("Total files processed: #{length(results)}")
|
||||
```
|
||||
163
docs/snippets/elixir/core/client_extract_multiple_files.exs
Normal file
163
docs/snippets/elixir/core/client_extract_multiple_files.exs
Normal file
@@ -0,0 +1,163 @@
|
||||
```elixir title="Elixir"
|
||||
# Client wrapper for batch file extraction
|
||||
# Provides error handling, logging, and result aggregation
|
||||
|
||||
defmodule BatchDocumentClient do
  @moduledoc """
  Client wrapper for batch file document extraction.
  Handles multiple files with comprehensive error handling and logging.
  """

  # Logger's functions are macros, so the module must be required before use.
  require Logger

  alias Kreuzberg.ExtractionResult

  @doc """
  Extract content from multiple files in batch.

  Delegates to `Kreuzberg.batch_extract_files/3` and passes its outcome
  through unchanged: `{:ok, results}` on success or `{:error, reason}` on
  failure. A debug message is logged in both cases.

  ## Options

    * `:mime_type` - MIME type for all files (optional, defaults to auto-detection)
    * `:config` - ExtractionConfig struct with options (optional)
    * `:log_errors` - Whether to log extraction errors (default: true)

  ## Examples

      {:ok, results} = BatchDocumentClient.extract_files(
        ["doc1.pdf", "doc2.pdf", "doc3.pdf"],
        mime_type: "application/pdf"
      )
  """
  @spec extract_files([String.t()], keyword()) ::
          {:ok, [ExtractionResult.t()]} | {:error, String.t()}
  def extract_files(paths, opts \\ []) do
    mime_type = Keyword.get(opts, :mime_type)
    config = Keyword.get(opts, :config)
    log_errors = Keyword.get(opts, :log_errors, true)

    case Kreuzberg.batch_extract_files(paths, mime_type, config) do
      {:ok, results} ->
        Logger.debug("Successfully extracted #{length(results)} files")
        {:ok, results}

      {:error, reason} ->
        if log_errors do
          Logger.debug("Batch extraction error: #{reason}")
        end

        {:error, reason}
    end
  end

  @doc """
  Extract files and return detailed statistics.

  On success returns `{:ok, stats}` where `stats` holds the file count, the
  summed content size in bytes, table and image totals, the elapsed time in
  milliseconds, the average time per file, and the raw `results`.
  Accepts the same options as `extract_files/2`.
  """
  @spec extract_files_with_stats([String.t()], keyword()) ::
          {:ok, map()} | {:error, String.t()}
  def extract_files_with_stats(paths, opts \\ []) do
    start_time = System.monotonic_time(:millisecond)

    case extract_files(paths, opts) do
      {:ok, results} ->
        elapsed_ms = System.monotonic_time(:millisecond) - start_time
        file_count = length(results)

        stats = %{
          total_files: file_count,
          total_content_size: Enum.reduce(results, 0, &(byte_size(&1.content) + &2)),
          total_tables: Enum.reduce(results, 0, &(length(&1.tables) + &2)),
          # images may be nil, so fall back to an empty list before counting
          total_images: Enum.reduce(results, 0, &(length(&1.images || []) + &2)),
          processing_time_ms: elapsed_ms,
          # max/2 guards against dividing by zero when no results came back
          avg_time_per_file_ms: div(elapsed_ms, max(file_count, 1)),
          results: results
        }

        {:ok, stats}

      {:error, reason} ->
        {:error, reason}
    end
  end

  @doc """
  Extract files and transform results.

  Applies `transform_fn` to every extraction result. Returns
  `{:ok, transformed}` when all calls succeed. If any call raises, every
  failure is logged and the first one is returned as
  `{:error, "Transform failed: ..."}`.
  Accepts the same options as `extract_files/2`.
  """
  @spec extract_and_transform([String.t()], (ExtractionResult.t() -> any()), keyword()) ::
          {:ok, [any()]} | {:error, String.t()}
  def extract_and_transform(paths, transform_fn, opts \\ []) do
    case extract_files(paths, opts) do
      {:ok, results} ->
        results
        |> Enum.map(&safe_transform(transform_fn, &1))
        |> collect_transformed()

      {:error, reason} ->
        {:error, reason}
    end
  end

  # Runs the transform on one result, turning a raised exception into an error tuple.
  defp safe_transform(transform_fn, result) do
    {:ok, transform_fn.(result)}
  rescue
    error ->
      Logger.debug("Transform error: #{inspect(error)}")
      {:error, error}
  end

  # Unwraps the values when every transform succeeded; otherwise reports the first failure.
  defp collect_transformed(transformed) do
    case Enum.find(transformed, &match?({:error, _}, &1)) do
      nil ->
        {:ok, Enum.map(transformed, fn {:ok, value} -> value end)}

      {:error, error} ->
        {:error, "Transform failed: #{inspect(error)}"}
    end
  end
end
|
||||
|
||||
# Usage examples
|
||||
|
||||
# Extract multiple files
|
||||
case BatchDocumentClient.extract_files(["doc1.pdf", "doc2.pdf", "doc3.pdf"]) do
|
||||
{:ok, results} ->
|
||||
Enum.each(results, fn result ->
|
||||
IO.puts("Extracted: #{byte_size(result.content)} bytes")
|
||||
end)
|
||||
|
||||
{:error, reason} ->
|
||||
IO.puts("Error: #{reason}")
|
||||
end
|
||||
|
||||
# Extract with statistics
|
||||
case BatchDocumentClient.extract_files_with_stats(["doc1.pdf", "doc2.pdf"]) do
|
||||
{:ok, stats} ->
|
||||
IO.puts("Total files: #{stats.total_files}")
|
||||
IO.puts("Total size: #{stats.total_content_size} bytes")
|
||||
IO.puts("Processing time: #{stats.processing_time_ms}ms")
|
||||
|
||||
{:error, reason} ->
|
||||
IO.puts("Error: #{reason}")
|
||||
end
|
||||
|
||||
# Extract and transform
|
||||
transform = fn result ->
|
||||
%{
|
||||
mime: result.mime_type,
|
||||
size: byte_size(result.content),
|
||||
tables: length(result.tables)
|
||||
}
|
||||
end
|
||||
|
||||
case BatchDocumentClient.extract_and_transform(["doc1.pdf", "doc2.pdf"], transform) do
|
||||
{:ok, transformed_results} ->
|
||||
IO.inspect(transformed_results)
|
||||
|
||||
{:error, reason} ->
|
||||
IO.puts("Error: #{reason}")
|
||||
end
|
||||
```
|
||||
107
docs/snippets/elixir/core/client_extract_single_file.exs
Normal file
107
docs/snippets/elixir/core/client_extract_single_file.exs
Normal file
@@ -0,0 +1,107 @@
|
||||
```elixir title="Elixir"
|
||||
# Reusable client pattern for single file extraction
|
||||
# Encapsulates extraction logic with consistent error handling
|
||||
|
||||
defmodule DocumentClient do
  @moduledoc """
  Client wrapper for single file document extraction.
  Provides a consistent interface for extracting content from files.
  """

  # Logger's functions are macros, so the module must be required before use.
  require Logger

  alias Kreuzberg.ExtractionResult

  @doc """
  Extract content from a single file.

  Returns a result tuple with the extraction outcome and logs a debug
  message for both success and failure.
  Supports explicit MIME type specification or auto-detection.

  ## Options

    * `:mime_type` - MIME type of the file (optional, defaults to auto-detection)
    * `:config` - ExtractionConfig struct with options (optional)

  ## Examples

      {:ok, result} = DocumentClient.extract_file("document.pdf")
      {:ok, result} = DocumentClient.extract_file("document.pdf", mime_type: "application/pdf")
  """
  @spec extract_file(String.t(), keyword()) ::
          {:ok, ExtractionResult.t()} | {:error, String.t()}
  def extract_file(path, opts \\ []) do
    mime_type = Keyword.get(opts, :mime_type)
    config = Keyword.get(opts, :config)

    case Kreuzberg.extract_file(path, mime_type, config) do
      {:ok, result} ->
        Logger.debug("Successfully extracted file: #{path}")
        {:ok, result}

      {:error, reason} ->
        Logger.debug("Failed to extract file: #{path} - #{reason}")
        {:error, reason}
    end
  end

  @doc """
  Extract content from a file, raising on error.

  Raises Kreuzberg.Error if extraction fails.
  Accepts the same options as `extract_file/2`.
  """
  @spec extract_file!(String.t(), keyword()) :: ExtractionResult.t()
  def extract_file!(path, opts \\ []) do
    mime_type = Keyword.get(opts, :mime_type)
    config = Keyword.get(opts, :config)

    Kreuzberg.extract_file!(path, mime_type, config)
  end

  @doc """
  Extract and process file content.

  Returns a map with extracted content, metadata, and processing statistics
  (table count, image count, and elapsed time in milliseconds).
  Accepts the same options as `extract_file/2`.
  """
  @spec extract_with_stats(String.t(), keyword()) ::
          {:ok, map()} | {:error, String.t()}
  def extract_with_stats(path, opts \\ []) do
    start_time = System.monotonic_time(:millisecond)

    case extract_file(path, opts) do
      {:ok, result} ->
        elapsed_ms = System.monotonic_time(:millisecond) - start_time

        {:ok,
         %{
           content: result.content,
           mime_type: result.mime_type,
           metadata: result.metadata,
           table_count: length(result.tables),
           # images may be nil, so fall back to an empty list before counting
           image_count: length(result.images || []),
           processing_time_ms: elapsed_ms
         }}

      {:error, reason} ->
        {:error, reason}
    end
  end
end
|
||||
|
||||
# Usage examples
|
||||
case DocumentClient.extract_file("document.pdf") do
|
||||
{:ok, result} ->
|
||||
IO.puts("Content length: #{byte_size(result.content)} bytes")
|
||||
|
||||
{:error, reason} ->
|
||||
IO.puts("Extraction failed: #{reason}")
|
||||
end
|
||||
|
||||
# Extract with statistics
|
||||
case DocumentClient.extract_with_stats("document.pdf") do
|
||||
{:ok, stats} ->
|
||||
IO.puts("Processing time: #{stats.processing_time_ms}ms")
|
||||
IO.puts("Tables found: #{stats.table_count}")
|
||||
|
||||
{:error, reason} ->
|
||||
IO.puts("Error: #{reason}")
|
||||
end
|
||||
```
|
||||
247
docs/snippets/elixir/core/client_extract_with_config.exs
Normal file
247
docs/snippets/elixir/core/client_extract_with_config.exs
Normal file
@@ -0,0 +1,247 @@
|
||||
```elixir title="Elixir"
|
||||
# Client wrapper with comprehensive configuration
|
||||
# Demonstrates advanced extraction patterns with OCR, chunking, and validation
|
||||
|
||||
defmodule ConfiguredDocumentClient do
  @moduledoc """
  Client wrapper for document extraction with advanced configuration.
  Supports OCR, chunking, language detection, and custom error handling.
  """

  alias Kreuzberg.{ExtractionConfig, ExtractionResult}

  @doc """
  Extract with OCR enabled for scanned documents.

  `input` is treated as a file path when a file exists at that path;
  otherwise it is handed to `Kreuzberg.extract/3` as the document data.

  ## Options

    * `:ocr_backend` - OCR backend name (default: `"tesseract"`)
    * `:force_ocr` - value for the config's `force_ocr` field (default: `false`)
    * `:mime_type` - MIME type of the input; non-file input falls back to
      `"application/pdf"` when it is not given
  """
  @spec extract_with_ocr(String.t() | binary(), keyword()) ::
          {:ok, ExtractionResult.t()} | {:error, String.t()}
  def extract_with_ocr(input, opts \\ []) do
    config = %ExtractionConfig{
      ocr: %{
        "enabled" => true,
        "backend" => Keyword.get(opts, :ocr_backend, "tesseract")
      },
      force_ocr: Keyword.get(opts, :force_ocr, false)
    }

    mime_type = Keyword.get(opts, :mime_type)

    if is_binary(input) and File.exists?(input) do
      Kreuzberg.extract_file(input, mime_type, config)
    else
      Kreuzberg.extract(input, mime_type || "application/pdf", config)
    end
  end

  @doc """
  Extract with text chunking for embedding or RAG pipelines.

  Splits extracted text into chunks with configurable size and overlap.

  ## Options

    * `:chunk_size` - value for `"max_characters"` (default: 1000)
    * `:chunk_overlap` - value for `"overlap"` (default: 100)
    * `:mime_type` - MIME type of the file (optional)
  """
  @spec extract_with_chunking(String.t(), keyword()) ::
          {:ok, map()} | {:error, String.t()}
  def extract_with_chunking(path, opts \\ []) do
    config = %ExtractionConfig{
      chunking: %{
        "max_characters" => Keyword.get(opts, :chunk_size, 1000),
        "overlap" => Keyword.get(opts, :chunk_overlap, 100)
      }
    }

    case Kreuzberg.extract_file(path, Keyword.get(opts, :mime_type), config) do
      {:ok, result} ->
        # chunks may be nil, so normalise to a list before counting
        chunks = result.chunks || []

        {:ok,
         %{
           content: result.content,
           chunks: chunks,
           chunk_count: length(chunks),
           metadata: result.metadata
         }}

      {:error, reason} ->
        {:error, reason}
    end
  end

  @doc """
  Extract with language detection enabled.

  Runs extraction with language detection and image extraction switched on,
  and returns the content together with the detected languages, MIME type,
  tables, and images.

  ## Options

    * `:mime_type` - MIME type of the file (optional)
  """
  @spec extract_with_language_detection(String.t(), keyword()) ::
          {:ok, map()} | {:error, String.t()}
  def extract_with_language_detection(path, opts \\ []) do
    config = %ExtractionConfig{
      language_detection: %{"enabled" => true},
      extract_images: true
    }

    case Kreuzberg.extract_file(path, Keyword.get(opts, :mime_type), config) do
      {:ok, result} ->
        {:ok,
         %{
           content: result.content,
           detected_languages: result.detected_languages || [],
           mime_type: result.mime_type,
           tables: result.tables,
           images: result.images || []
         }}

      {:error, reason} ->
        {:error, reason}
    end
  end

  @doc """
  Extract with comprehensive configuration for production use.

  Combines OCR, chunking, caching, and language detection with error handling.
  On success returns `{:ok, summary}`; on failure the reason is prefixed
  with `"Extraction failed: "`.

  ## Options

    * `:ocr_enabled` - enable OCR (default: `true`)
    * `:ocr_backend` - OCR backend name (default: `"tesseract"`)
    * `:chunk_size` - value for `"max_characters"` (default: 1000)
    * `:chunk_overlap` - value for `"overlap"` (default: 100)
    * `:detect_language` - enable language detection (default: `true`)
    * `:use_cache` - cache results (default: `true`)
    * `:extract_images` - extract images (default: `true`)
    * `:mime_type` - MIME type of the file (optional)
  """
  @spec extract_with_full_config(String.t(), keyword()) ::
          {:ok, map()} | {:error, String.t()}
  def extract_with_full_config(path, opts \\ []) do
    config = %ExtractionConfig{
      # OCR settings
      ocr: %{
        "enabled" => Keyword.get(opts, :ocr_enabled, true),
        "backend" => Keyword.get(opts, :ocr_backend, "tesseract")
      },
      # Chunking for embeddings
      chunking: %{
        "max_characters" => Keyword.get(opts, :chunk_size, 1000),
        "overlap" => Keyword.get(opts, :chunk_overlap, 100)
      },
      # Language detection
      language_detection: %{"enabled" => Keyword.get(opts, :detect_language, true)},
      # Cache results
      use_cache: Keyword.get(opts, :use_cache, true),
      # Extract various content types
      extract_images: Keyword.get(opts, :extract_images, true),
      extract_tables: true
    }

    case Kreuzberg.extract_file(path, Keyword.get(opts, :mime_type), config) do
      {:ok, result} ->
        summary = %{
          file_path: path,
          mime_type: result.mime_type,
          content_length: byte_size(result.content),
          content_preview: String.slice(result.content, 0..200),
          detected_languages: result.detected_languages || [],
          table_count: length(result.tables),
          image_count: length(result.images || []),
          chunk_count: length(result.chunks || []),
          metadata: result.metadata
        }

        {:ok, summary}

      {:error, reason} ->
        {:error, "Extraction failed: #{reason}"}
    end
  end

  @doc """
  Validate file before extraction.

  Checks that the file exists and that
  `Kreuzberg.detect_mime_type_from_path/1` can determine a MIME type for it.
  """
  @spec validate_file(String.t()) :: :ok | {:error, String.t()}
  def validate_file(path) do
    if File.exists?(path) do
      case Kreuzberg.detect_mime_type_from_path(path) do
        {:ok, _mime_type} ->
          :ok

        {:error, reason} ->
          {:error, "Cannot determine MIME type: #{reason}"}
      end
    else
      {:error, "File not found: #{path}"}
    end
  end

  @doc """
  Extract with validation and error recovery.

  Validates the file with `validate_file/1` and, when that succeeds, runs
  `extract_with_full_config/2` with the same options. The first
  `{:error, reason}` encountered is returned as-is.
  """
  @spec extract_safely(String.t(), keyword()) ::
          {:ok, map()} | {:error, String.t()}
  def extract_safely(path, opts \\ []) do
    # A failed validation falls straight through `with` as {:error, reason}.
    with :ok <- validate_file(path) do
      extract_with_full_config(path, opts)
    end
  end
end
|
||||
|
||||
# Usage examples
|
||||
|
||||
# Extract with OCR
|
||||
case ConfiguredDocumentClient.extract_with_ocr("scanned_document.pdf",
|
||||
ocr_backend: "tesseract"
|
||||
) do
|
||||
{:ok, result} ->
|
||||
IO.puts("OCR extraction successful")
|
||||
IO.puts("Content: #{String.slice(result.content, 0..100)}...")
|
||||
|
||||
{:error, reason} ->
|
||||
IO.puts("Error: #{reason}")
|
||||
end
|
||||
|
||||
# Extract with chunking
|
||||
case ConfiguredDocumentClient.extract_with_chunking("document.pdf",
|
||||
chunk_size: 500,
|
||||
chunk_overlap: 50
|
||||
) do
|
||||
{:ok, data} ->
|
||||
IO.puts("Chunks: #{data.chunk_count}")
|
||||
|
||||
{:error, reason} ->
|
||||
IO.puts("Error: #{reason}")
|
||||
end
|
||||
|
||||
# Extract with language detection
|
||||
case ConfiguredDocumentClient.extract_with_language_detection("multilingual.pdf") do
|
||||
{:ok, data} ->
|
||||
IO.puts("Detected languages: #{inspect(data.detected_languages)}")
|
||||
|
||||
{:error, reason} ->
|
||||
IO.puts("Error: #{reason}")
|
||||
end
|
||||
|
||||
# Comprehensive extraction with validation
|
||||
case ConfiguredDocumentClient.extract_safely("document.pdf",
|
||||
ocr_enabled: true,
|
||||
detect_language: true,
|
||||
extract_images: true,
|
||||
use_cache: true
|
||||
) do
|
||||
{:ok, summary} ->
|
||||
IO.puts("File: #{summary.file_path}")
|
||||
IO.puts("MIME: #{summary.mime_type}")
|
||||
IO.puts("Size: #{summary.content_length} bytes")
|
||||
IO.puts("Tables: #{summary.table_count}")
|
||||
IO.puts("Languages: #{inspect(summary.detected_languages)}")
|
||||
|
||||
{:error, reason} ->
|
||||
IO.puts("Error: #{reason}")
|
||||
end
|
||||
```
|
||||
31
docs/snippets/elixir/core/error_handling.exs
Normal file
31
docs/snippets/elixir/core/error_handling.exs
Normal file
@@ -0,0 +1,31 @@
|
||||
```elixir title="Elixir"
|
||||
# Example: Handling extraction errors
|
||||
case Kreuzberg.extract_file("document.pdf") do
  {:ok, result} ->
    IO.puts("Successfully extracted content")
    # byte_size/1 counts bytes, not characters, so label the figure accordingly.
    IO.puts("Content length: #{byte_size(result.content)} bytes")

  {:error, reason} ->
    IO.puts("Extraction failed: #{reason}")
end
|
||||
|
||||
# Example: Handling with custom error message
|
||||
result = Kreuzberg.extract_file("nonexistent.pdf")
|
||||
|
||||
case result do
  # The payload is not used here; the underscore prefix avoids an unused-variable warning.
  {:ok, _data} ->
    IO.puts("File processed successfully")

  {:error, error} ->
    IO.puts("Error details: #{inspect(error)}")
end
|
||||
|
||||
# Example: Extract with pattern matching
|
||||
case Kreuzberg.extract(<<>>, "application/pdf") do
|
||||
{:ok, result} ->
|
||||
IO.puts("Content: #{result.content}")
|
||||
{:error, msg} when is_binary(msg) ->
|
||||
IO.puts("Validation error: #{msg}")
|
||||
{:error, reason} ->
|
||||
IO.puts("Unknown error: #{inspect(reason)}")
|
||||
end
|
||||
```
|
||||
13
docs/snippets/elixir/core/extract_bytes_sync.exs
Normal file
13
docs/snippets/elixir/core/extract_bytes_sync.exs
Normal file
@@ -0,0 +1,13 @@
|
||||
```elixir title="Elixir"
|
||||
# Read file into memory
|
||||
{:ok, file_content} = File.read("document.pdf")
|
||||
|
||||
# Extract from bytes/binary data
|
||||
{:ok, result} = Kreuzberg.extract(file_content, "application/pdf")
|
||||
|
||||
content = result.content
|
||||
IO.puts("Extracted content:")
|
||||
IO.puts(content)
|
||||
IO.puts("MIME type: #{result.mime_type}")
|
||||
IO.puts("Tables found: #{length(result.tables)}")
|
||||
```
|
||||
12
docs/snippets/elixir/core/extract_file_async.exs
Normal file
12
docs/snippets/elixir/core/extract_file_async.exs
Normal file
@@ -0,0 +1,12 @@
|
||||
```elixir title="Elixir"
|
||||
task = Kreuzberg.extract_file_async("document.pdf")
|
||||
{:ok, result} = Task.await(task)
|
||||
|
||||
content = result.content
|
||||
table_count = length(result.tables)
|
||||
metadata = result.metadata
|
||||
|
||||
# byte_size/1 counts bytes, not characters, so label the figure accordingly.
IO.puts("Content length: #{byte_size(content)} bytes")
|
||||
IO.puts("Tables: #{table_count}")
|
||||
IO.puts("Metadata keys: #{inspect(Map.keys(metadata))}")
|
||||
```
|
||||
11
docs/snippets/elixir/core/extract_file_sync.exs
Normal file
11
docs/snippets/elixir/core/extract_file_sync.exs
Normal file
@@ -0,0 +1,11 @@
|
||||
```elixir title="Elixir"
|
||||
{:ok, result} = Kreuzberg.extract_file("document.pdf")
|
||||
|
||||
content = result.content
|
||||
table_count = length(result.tables)
|
||||
metadata = result.metadata
|
||||
|
||||
# byte_size/1 counts bytes, not characters, so label the figure accordingly.
IO.puts("Content length: #{byte_size(content)} bytes")
|
||||
IO.puts("Tables: #{table_count}")
|
||||
IO.puts("Metadata keys: #{inspect(Map.keys(metadata))}")
|
||||
```
|
||||
13
docs/snippets/elixir/getting-started/basic_usage.exs
Normal file
13
docs/snippets/elixir/getting-started/basic_usage.exs
Normal file
@@ -0,0 +1,13 @@
|
||||
```elixir title="Elixir"
|
||||
# Minimal end-to-end flow: extract a file, then print its text and a short
# summary of what else was found.

{:ok, extraction} = Kreuzberg.extract_file("document.pdf")

IO.puts("Extracted Content:")
extraction.content |> IO.puts()

IO.puts("\nMetadata:")
format = extraction.metadata.format
IO.puts("Format: #{inspect(format)}")
table_total = length(extraction.tables)
IO.puts("Tables found: #{table_total}")
|
||||
```
|
||||
19
docs/snippets/elixir/getting-started/basic_usage.md
Normal file
19
docs/snippets/elixir/getting-started/basic_usage.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```elixir title="Elixir"
|
||||
defmodule BasicUsage do
  @doc """
  Extracts `document.pdf` with no explicit configuration and prints the text.

  Returns `:ok` after printing the extracted content, or `:error` after
  printing the failure reason.
  """
  def extract_with_default_config do
    # nil stands for "use the default configuration"
    default_config = nil

    "document.pdf"
    |> Kreuzberg.extract_file_sync(nil, default_config)
    |> report()
  end

  # Prints the outcome of the extraction and maps it to :ok / :error.
  defp report({:ok, content}) do
    IO.puts("Extracted content:")
    IO.puts(content)
    :ok
  end

  defp report({:error, reason}) do
    IO.puts("Extraction failed: #{reason}")
    :error
  end
end
|
||||
```
|
||||
13
docs/snippets/elixir/getting-started/extract_file.exs
Normal file
13
docs/snippets/elixir/getting-started/extract_file.exs
Normal file
@@ -0,0 +1,13 @@
|
||||
```elixir title="Elixir"
|
||||
# Extract a document and report either a short summary or the failure.
# (The same call is used for other formats such as DOCX.)

outcome = Kreuzberg.extract_file("document.pdf")

case outcome do
  {:error, reason} ->
    IO.puts("Extraction failed: #{inspect(reason)}")

  {:ok, extraction} ->
    IO.puts("Content: #{extraction.content}")
    IO.puts("Format: #{inspect(extraction.metadata.format)}")
    IO.puts("Tables: #{length(extraction.tables)}")
end
|
||||
```
|
||||
19
docs/snippets/elixir/getting-started/extract_file.md
Normal file
19
docs/snippets/elixir/getting-started/extract_file.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```elixir title="Elixir"
|
||||
defmodule ExtractFile do
  @doc """
  Extracts `document.pdf` and prints its length followed by the text itself.

  Returns `:ok` on success and `:error` (after printing the reason) on failure.
  """
  def show_extraction_details do
    "document.pdf"
    |> Kreuzberg.extract_file_sync(nil, nil)
    |> print_details()
  end

  # On success the payload is used as a string holding the extracted text.
  defp print_details({:ok, text}) do
    char_count = String.length(text)
    IO.puts("Content length: #{char_count} characters")
    IO.puts("---")
    IO.puts(text)
    :ok
  end

  defp print_details({:error, reason}) do
    IO.puts("Failed to extract: #{reason}")
    :error
  end
end
|
||||
```
|
||||
19
docs/snippets/elixir/getting-started/extract_with_ocr.exs
Normal file
19
docs/snippets/elixir/getting-started/extract_with_ocr.exs
Normal file
@@ -0,0 +1,19 @@
|
||||
```elixir title="Elixir"
|
||||
# Extract scanned documents with OCR
# Configure Tesseract for OCR processing

# Uses the same config struct and string-keyed option maps as the other
# snippets (see the chunking example), instead of separate Config.* structs.
config = %Kreuzberg.ExtractionConfig{
  ocr: %{
    "backend" => "tesseract",
    "language" => "eng"
  }
}

# extract_file/3 takes (path, mime_type, config); the MIME type is left as nil
# here, exactly as in the other snippets that pass a config.
{:ok, result} = Kreuzberg.extract_file("scanned.pdf", nil, config)

IO.puts("Extracted text from scanned document:")
IO.puts(result.content)
IO.puts("Used OCR backend: tesseract")
|
||||
```
|
||||
19
docs/snippets/elixir/getting-started/extract_with_ocr.md
Normal file
19
docs/snippets/elixir/getting-started/extract_with_ocr.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```elixir title="Elixir"
|
||||
defmodule ExtractWithOcr do
  @doc """
  Runs OCR-backed extraction on `scanned.pdf` and prints the recognised text.

  Returns `:ok` on success and `:error` (after printing the reason) on failure.
  """
  def extract_scanned_document do
    # The configuration is passed as a JSON string carrying the OCR settings.
    ocr_json = ~s({"ocr": {"backend": "tesseract", "language": "eng"}})

    outcome = Kreuzberg.extract_file_sync("scanned.pdf", nil, ocr_json)
    handle_outcome(outcome)
  end

  defp handle_outcome({:error, reason}) do
    IO.puts("OCR extraction failed: #{reason}")
    :error
  end

  defp handle_outcome({:ok, text}) do
    IO.puts("Extracted via OCR:")
    IO.puts(text)
    :ok
  end
end
|
||||
```
|
||||
5
docs/snippets/elixir/getting-started/hello_world.exs
Normal file
5
docs/snippets/elixir/getting-started/hello_world.exs
Normal file
@@ -0,0 +1,5 @@
|
||||
```elixir title="Elixir"
|
||||
# Smallest possible Kreuzberg script: extract a PDF and print its text.
{:ok, extraction} = Kreuzberg.extract_file("document.pdf")
extraction.content |> IO.puts()
|
||||
```
|
||||
14
docs/snippets/elixir/getting-started/hello_world.md
Normal file
14
docs/snippets/elixir/getting-started/hello_world.md
Normal file
@@ -0,0 +1,14 @@
|
||||
```elixir title="Elixir"
|
||||
defmodule HelloWorld do
  @doc """
  Extracts `document.pdf` and prints either the extracted text or the error.
  """
  def main do
    "document.pdf"
    |> Kreuzberg.extract_file_sync(nil, nil)
    |> announce()
  end

  defp announce({:ok, text}) do
    IO.puts("Extraction succeeded!")
    IO.puts(text)
  end

  defp announce({:error, reason}), do: IO.puts("Error: #{reason}")
end
|
||||
```
|
||||
5
docs/snippets/elixir/getting-started/install_verify.exs
Normal file
5
docs/snippets/elixir/getting-started/install_verify.exs
Normal file
@@ -0,0 +1,5 @@
|
||||
```elixir title="Elixir"
|
||||
# Smoke test for the installation: a successful extraction proves the
# library is installed and working.
{:ok, extraction} = Kreuzberg.extract_file("sample.pdf")
char_count = String.length(extraction.content)
IO.puts("Installation verified! Extracted #{char_count} characters")
|
||||
```
|
||||
18
docs/snippets/elixir/getting-started/install_verify.md
Normal file
18
docs/snippets/elixir/getting-started/install_verify.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```elixir title="Elixir"
|
||||
defmodule InstallVerify do
  @doc """
  Checks the installation in two steps: list the registered document
  extractors, then run one extraction against `test.txt`.
  """
  def verify_install do
    # Step 1: the module is available and can enumerate its extractors.
    {:ok, available} = Kreuzberg.list_document_extractors()
    IO.puts("Available extractors: #{inspect(available)}")

    # Step 2: an actual extraction round-trips.
    outcome = Kreuzberg.extract_file_sync("test.txt", nil, nil)

    case outcome do
      {:error, reason} ->
        IO.puts("Extraction failed: #{reason}")

      {:ok, _ignored} ->
        IO.puts("Kreuzberg is properly installed and working!")
    end
  end
end
|
||||
```
|
||||
22
docs/snippets/elixir/getting-started/read_content.exs
Normal file
22
docs/snippets/elixir/getting-started/read_content.exs
Normal file
@@ -0,0 +1,22 @@
|
||||
```elixir title="Elixir"
|
||||
# Tour of the extraction result: text, tables, images and metadata.

{:ok, extraction} = Kreuzberg.extract_file("document.pdf")

# Main text
text = extraction.content
IO.puts("Content length: #{String.length(text)} characters")

# Tables
table_list = extraction.tables
IO.puts("Tables found: #{length(table_list)}")

# Images
image_list = extraction.images
IO.puts("Images found: #{length(image_list)}")

# Metadata
doc_format = extraction.metadata.format
IO.puts("Format: #{inspect(doc_format)}")
|
||||
```
|
||||
24
docs/snippets/elixir/getting-started/read_content.md
Normal file
24
docs/snippets/elixir/getting-started/read_content.md
Normal file
@@ -0,0 +1,24 @@
|
||||
```elixir title="Elixir"
|
||||
defmodule ReadContent do
  @doc """
  Extracts `document.pdf` and prints every non-empty line of the text.

  Returns `:ok` on success and `:error` (after printing the reason) on failure.
  """
  def process_extracted_content do
    case Kreuzberg.extract_file_sync("document.pdf", nil, nil) do
      {:error, reason} ->
        IO.puts("Error: #{reason}")
        :error

      {:ok, text} ->
        IO.puts("Processing extracted content:")
        print_lines(text)
        :ok
    end
  end

  # Splits on newlines (dropping empty segments) and prints each line.
  defp print_lines(text) do
    for line <- String.split(text, "\n", trim: true) do
      IO.puts(" #{line}")
    end
  end
end
|
||||
```
|
||||
32
docs/snippets/elixir/llm/structured_extraction.md
Normal file
32
docs/snippets/elixir/llm/structured_extraction.md
Normal file
@@ -0,0 +1,32 @@
|
||||
<!-- snippet:syntax-only -->
|
||||
|
||||
```elixir
|
||||
# JSON Schema describing the fields the LLM should return.
string_type = %{"type" => "string"}

schema = %{
  "type" => "object",
  "properties" => %{
    "title" => string_type,
    "authors" => %{"type" => "array", "items" => string_type},
    "date" => string_type
  },
  "required" => ["title", "authors", "date"],
  "additionalProperties" => false
}

# Structured-extraction settings, serialised to the JSON config string.
structured_settings = %{
  "schema" => schema,
  "schema_name" => "paper_metadata",
  "strict" => true,
  "llm" => %{"model" => "openai/gpt-4o-mini"}
}

config_json = Jason.encode!(%{"structured_extraction" => structured_settings})

# NOTE(review): the result is matched directly as {:ok, json} here, whereas the
# extract_file_async snippet awaits a task first; confirm which shape applies.
{:ok, json} = Kreuzberg.extract_file_async("paper.pdf", nil, config_json)
result = Jason.decode!(json)

case result["structured_output"] do
  output when not is_nil(output) -> IO.inspect(output, label: "structured")
  nil -> IO.puts("no structured output")
end
|
||||
```
|
||||
312
docs/snippets/elixir/mcp/mcp_custom_client.exs
Normal file
312
docs/snippets/elixir/mcp/mcp_custom_client.exs
Normal file
@@ -0,0 +1,312 @@
|
||||
```elixir title="Elixir"
|
||||
# MCP Custom Client - Connect to Kreuzberg MCP servers
|
||||
# Demonstrates creating a reusable MCP client for document extraction
|
||||
|
||||
defmodule KreuzbergMCPClient do
  @moduledoc """
  HTTP client for a Kreuzberg extraction server.

  Wraps the server's `/extract`, `/extract/file` and `/health` endpoints and
  adds optional on-disk result caching plus bounded retries for `/extract`.
  """

  require Logger

  defmodule Config do
    @moduledoc """
    Connection settings for `KreuzbergMCPClient`.
    """

    defstruct [
      :host,
      :port,
      :timeout_ms,
      :max_retries,
      :retry_delay_ms,
      :cache_dir
    ]

    @type t :: %__MODULE__{
            host: String.t() | nil,
            port: non_neg_integer() | nil,
            timeout_ms: non_neg_integer() | nil,
            max_retries: non_neg_integer() | nil,
            retry_delay_ms: non_neg_integer() | nil,
            cache_dir: String.t() | nil
          }

    @doc """
    Builds a config from keyword options, filling in defaults:
    `host: "localhost"`, `port: 8080`, `timeout_ms: 30000`, `max_retries: 3`,
    `retry_delay_ms: 1000`, `cache_dir: nil` (caching disabled).
    """
    @spec new(keyword()) :: t()
    def new(opts \\ []) do
      %__MODULE__{
        host: Keyword.get(opts, :host, "localhost"),
        port: Keyword.get(opts, :port, 8080),
        timeout_ms: Keyword.get(opts, :timeout_ms, 30_000),
        max_retries: Keyword.get(opts, :max_retries, 3),
        retry_delay_ms: Keyword.get(opts, :retry_delay_ms, 1000),
        cache_dir: Keyword.get(opts, :cache_dir, nil)
      }
    end
  end

  @typedoc "Outcome of a single request."
  @type result :: {:ok, map()} | {:error, String.t()}

  @doc """
  Extract a document by path via the server's `/extract` endpoint.

  The path is sent to the server as-is, so it must be readable by the server.

  ## Options

    * `:mime_type` - MIME type of document
    * `:config` - Extraction configuration map
    * `:use_cache` - Enable result caching (default: false); only takes effect
      when the client config has a `cache_dir`
  """
  @spec extract_file(Config.t(), String.t(), keyword()) :: result()
  def extract_file(config, file_path, opts \\ []) do
    mime_type = Keyword.get(opts, :mime_type)
    extraction_config = Keyword.get(opts, :config)
    use_cache = Keyword.get(opts, :use_cache, false)

    if use_cache && config.cache_dir do
      cache_key = compute_cache_key(file_path, mime_type, extraction_config)

      case get_from_cache(config.cache_dir, cache_key) do
        {:ok, cached_result} ->
          Logger.debug("Cache hit for #{file_path}")
          {:ok, cached_result}

        :miss ->
          with {:ok, fresh} <- fetch_from_server(config, file_path, mime_type, extraction_config) do
            store_in_cache(config.cache_dir, cache_key, fresh)
            {:ok, fresh}
          end
      end
    else
      fetch_from_server(config, file_path, mime_type, extraction_config)
    end
  end

  @doc """
  Read a local file and POST its bytes to the server's `/extract/file` endpoint.

  The file's base name is sent in the `X-File-Name` header. The options
  argument is accepted for symmetry with `extract_file/3` and is not used.
  """
  @spec upload_and_extract(Config.t(), String.t(), keyword()) :: result()
  def upload_and_extract(config, file_path, _opts \\ []) do
    case File.read(file_path) do
      {:ok, body} ->
        url = "http://#{config.host}:#{config.port}/extract/file"

        headers = [
          {"Content-Type", "application/octet-stream"},
          {"X-File-Name", Path.basename(file_path)}
        ]

        case HTTPoison.post(url, body, headers, http_options(config)) do
          {:ok, response} ->
            handle_response(response)

          {:error, reason} ->
            Logger.error("Upload failed: #{inspect(reason)}")
            {:error, "Upload failed: #{inspect(reason)}"}
        end

      {:error, :enoent} ->
        {:error, "File not found: #{file_path}"}

      {:error, reason} ->
        {:error, "Failed to read file: #{inspect(reason)}"}
    end
  end

  @doc """
  Check health status of the server.

  Any JSON object returned with HTTP 200 is treated as healthy and returned
  unchanged; unlike extraction responses it is not required to carry a
  `"success"` flag.
  """
  @spec health_check(Config.t()) :: result()
  def health_check(config) do
    url = "http://#{config.host}:#{config.port}/health"

    case HTTPoison.get(url, [], http_options(config)) do
      {:ok, %HTTPoison.Response{status_code: 200, body: body}} ->
        decode_json_object(body)

      {:ok, %HTTPoison.Response{status_code: status, body: body}} ->
        {:error, "Server error (#{status}): #{body}"}

      {:error, reason} ->
        {:error, "Health check failed: #{inspect(reason)}"}
    end
  end

  @doc """
  Extract several documents concurrently.

  Always returns `{:ok, results}` where `results` holds one
  `{:ok, map} | {:error, message}` entry per input path, in input order.
  `opts` are forwarded to `extract_file/3` for every path.
  """
  @spec batch_extract(Config.t(), [String.t()], keyword()) :: {:ok, [result()]}
  def batch_extract(config, file_paths, opts \\ []) do
    Logger.info("Batch extracting #{length(file_paths)} documents")

    results =
      file_paths
      # The stream's default 5 s per-task timeout is shorter than one request
      # may legitimately take. Each task is already bounded by the HTTP
      # timeouts and the retry limit, so no extra limit is imposed here.
      |> Task.async_stream(fn path -> extract_file(config, path, opts) end, timeout: :infinity)
      |> Enum.map(fn
        {:ok, outcome} -> outcome
        # Only seen when the caller traps exits; report it instead of crashing.
        {:exit, reason} -> {:error, "Extraction task exited: #{inspect(reason)}"}
      end)

    success_count = Enum.count(results, &match?({:ok, _}, &1))
    Logger.info("Batch extraction complete: #{success_count}/#{length(file_paths)} succeeded")

    {:ok, results}
  end

  # Private helpers

  # HTTPoison's :timeout covers only connection setup; :recv_timeout bounds the
  # wait for the response. Both are tied to the configured timeout.
  defp http_options(config) do
    [timeout: config.timeout_ms, recv_timeout: config.timeout_ms]
  end

  # POSTs the extraction request as JSON, retrying transport-level failures.
  defp fetch_from_server(config, file_path, mime_type, extraction_config) do
    url = "http://#{config.host}:#{config.port}/extract"

    body =
      Jason.encode!(%{
        file_path: file_path,
        mime_type: mime_type,
        config: extraction_config
      })

    headers = [{"Content-Type", "application/json"}]

    request = fn -> HTTPoison.post(url, body, headers, http_options(config)) end

    case retry_request(config, request) do
      {:ok, response} -> handle_response(response)
      error -> error
    end
  end

  # Runs `request_fn` up to `config.max_retries` times, pausing
  # `config.retry_delay_ms` between attempts (not after the last one).
  defp retry_request(config, request_fn) do
    retry_request(config, request_fn, 0, nil)
  end

  defp retry_request(config, request_fn, attempt, _last_reason)
       when attempt < config.max_retries do
    case request_fn.() do
      {:ok, response} ->
        {:ok, response}

      {:error, reason} ->
        Logger.warning("Request failed (attempt #{attempt + 1}): #{inspect(reason)}")
        if attempt + 1 < config.max_retries, do: Process.sleep(config.retry_delay_ms)
        retry_request(config, request_fn, attempt + 1, reason)
    end
  end

  defp retry_request(_config, _request_fn, _attempt, nil) do
    {:error, "Max retries exceeded"}
  end

  defp retry_request(_config, _request_fn, _attempt, last_reason) do
    {:error, "Max retries exceeded: #{inspect(last_reason)}"}
  end

  # Extraction responses must be HTTP 200 with a JSON object whose "success"
  # field is truthy; anything else becomes an error tuple.
  defp handle_response(%HTTPoison.Response{status_code: 200, body: body}) do
    with {:ok, data} <- decode_json_object(body) do
      if Map.get(data, "success") do
        {:ok, data}
      else
        {:error, Map.get(data, "error", "Unknown error")}
      end
    end
  end

  defp handle_response(%HTTPoison.Response{status_code: status, body: body}) do
    {:error, "Server error (#{status}): #{body}"}
  end

  # Decodes a response body and insists on a JSON object, so callers can use
  # Map functions on the result safely.
  defp decode_json_object(body) do
    case Jason.decode(body) do
      {:ok, data} when is_map(data) ->
        {:ok, data}

      {:ok, other} ->
        {:error, "Failed to decode response: expected a JSON object, got #{inspect(other)}"}

      {:error, reason} ->
        {:error, "Failed to decode response: #{inspect(reason)}"}
    end
  end

  # The key is derived from the path, MIME type and config only, not from the
  # file's contents, so a changed file at the same path reuses its old entry.
  defp compute_cache_key(file_path, mime_type, config) do
    content = "#{file_path}|#{mime_type}|#{inspect(config)}"
    :crypto.hash(:sha256, content) |> Base.encode16(case: :lower)
  end

  # A missing, unreadable or corrupt cache file is treated as a miss.
  defp get_from_cache(cache_dir, cache_key) do
    cache_file = Path.join(cache_dir, "#{cache_key}.json")

    with {:ok, content} <- File.read(cache_file),
         {:ok, decoded} <- Jason.decode(content) do
      {:ok, decoded}
    else
      _ -> :miss
    end
  end

  # Best effort: a failed cache write is logged but never turns a successful
  # extraction into a failure.
  defp store_in_cache(cache_dir, cache_key, result) do
    cache_file = Path.join(cache_dir, "#{cache_key}.json")

    with :ok <- File.mkdir_p(cache_dir),
         {:ok, encoded} <- Jason.encode(result),
         :ok <- File.write(cache_file, encoded) do
      :ok
    else
      {:error, reason} ->
        Logger.warning("Could not write cache entry #{cache_file}: #{inspect(reason)}")
        :ok
    end
  end
end
|
||||
|
||||
# Usage examples
|
||||
IO.puts("=== Kreuzberg MCP Client ===\n")

# Client settings for a local server with on-disk caching enabled
client_opts = [
  host: "localhost",
  port: 8080,
  timeout_ms: 30_000,
  max_retries: 3,
  cache_dir: "/tmp/kreuzberg_cache"
]

config = KreuzbergMCPClient.Config.new(client_opts)

# 1. Health check
IO.puts("Checking server health...")

health_outcome = KreuzbergMCPClient.health_check(config)

case health_outcome do
  {:error, reason} ->
    IO.puts("Health check failed: #{reason}\n")

  {:ok, health} ->
    IO.puts("Server status: #{health["status"]}")
    IO.puts("Service: #{health["service"]}\n")
end

# 2. Single document, with caching
IO.puts("Extracting document...")

single_outcome = KreuzbergMCPClient.extract_file(config, "document.pdf", use_cache: true)

case single_outcome do
  {:error, reason} ->
    IO.puts("Extraction failed: #{reason}")

  {:ok, extraction} ->
    IO.puts("Success!")
    IO.puts("Content size: #{byte_size(extraction["content"])} bytes")
    IO.puts("MIME type: #{extraction["mime_type"]}")
    IO.puts("Tables found: #{length(extraction["tables"])}")
end

IO.puts("")

# 3. Several documents at once
IO.puts("Batch extracting multiple documents...")

documents = ~w(doc1.pdf doc2.pdf doc3.pdf)

case KreuzbergMCPClient.batch_extract(config, documents) do
  {:error, reason} ->
    IO.puts("Batch extraction failed: #{reason}")

  {:ok, outcomes} ->
    IO.puts("Batch extraction complete!")

    # Each entry is itself an {:ok, _} / {:error, _} tuple.
    successful =
      Enum.count(outcomes, fn
        {:ok, _} -> true
        _ -> false
      end)

    IO.puts("Successful: #{successful}/#{length(outcomes)}")
end
|
||||
```
|
||||
300
docs/snippets/elixir/mcp/mcp_server_start.exs
Normal file
300
docs/snippets/elixir/mcp/mcp_server_start.exs
Normal file
@@ -0,0 +1,300 @@
|
||||
```elixir title="Elixir"
|
||||
# MCP Server Integration - Start a Kreuzberg MCP server
|
||||
# Demonstrates how to set up and manage an MCP server for remote document extraction
|
||||
|
||||
defmodule KreuzbergMCPServer do
  @moduledoc """
  HTTP server exposing Kreuzberg document extraction to remote clients.

  Starts a Cowboy listener with three routes: `/extract`, `/extract/file`
  and `/health`.
  """

  require Logger

  # Name under which the Cowboy listener is registered.
  @listener :kreuzberg_http

  @doc """
  Start the server.

  Returns `{:ok, message}` once the listener is up, or `{:error, reason}` if
  the host cannot be resolved or the listener cannot be started.

  ## Options

    * `:host` - Address or host name to bind to (default: "127.0.0.1")
    * `:port` - Server port (default: 8080)
    * `:max_connections` - Maximum concurrent connections (default: 10)
  """
  def start_server(opts \\ []) do
    host = Keyword.get(opts, :host, "127.0.0.1")
    port = Keyword.get(opts, :port, 8080)
    max_connections = Keyword.get(opts, :max_connections, 10)

    Logger.info("Starting Kreuzberg MCP server on #{host}:#{port}")

    # Cowboy expects the routing table in compiled form, stored under
    # env.dispatch as a map.
    dispatch =
      :cowboy_router.compile([
        {:_,
         [
           {"/extract", KreuzbergMCPServer.Handler, []},
           {"/extract/file", KreuzbergMCPServer.FileHandler, []},
           {"/health", KreuzbergMCPServer.HealthHandler, []}
         ]}
      ])

    with {:ok, ip} <- resolve_host(host),
         # Bind to the requested address and enforce the connection limit.
         transport_opts = %{socket_opts: [ip: ip, port: port], max_connections: max_connections},
         {:ok, _pid} <- :cowboy.start_clear(@listener, transport_opts, %{env: %{dispatch: dispatch}}) do
      Logger.info("MCP server started successfully")
      {:ok, "Server running on #{host}:#{port}"}
    else
      {:error, reason} ->
        Logger.error("Failed to start MCP server: #{inspect(reason)}")
        {:error, reason}
    end
  end

  @doc """
  Stop the server's listener. Always returns `:ok`, even if it was not running.
  """
  def stop_server do
    Logger.info("Stopping Kreuzberg MCP server")
    :cowboy.stop_listener(@listener)
    Logger.info("MCP server stopped")
    :ok
  end

  # Turns a host string into an IP tuple: literal addresses are parsed,
  # anything else is resolved as an IPv4 host name.
  defp resolve_host(host) when is_binary(host) do
    charlist = String.to_charlist(host)

    case :inet.parse_address(charlist) do
      {:ok, ip} -> {:ok, ip}
      {:error, _} -> :inet.getaddr(charlist, :inet)
    end
  end
end
|
||||
|
||||
# Handler for extraction requests
|
||||
defmodule KreuzbergMCPServer.Handler do
  @moduledoc """
  Cowboy handler for `POST /extract`.

  Expects a JSON object with `file_path` (required), `mime_type` and `config`
  (both optional), runs the extraction and replies with a JSON document.
  Every reply, success or error, is sent on the request it belongs to.
  """

  require Logger

  @json_headers %{"content-type" => "application/json"}

  @doc """
  Cowboy entry point. Only POST is accepted; other methods get a 405.
  """
  def init(req, state) do
    req =
      case :cowboy_req.method(req) do
        "POST" -> handle_extraction(req)
        _ -> error_response(req, 405, "Method not allowed")
      end

    {:ok, req, state}
  end

  # Reads and decodes the request body, then dispatches to the extraction.
  defp handle_extraction(req) do
    {body, req} = read_full_body(req)

    case Jason.decode(body) do
      {:ok, params} when is_map(params) ->
        extract_from_params(params, req)

      {:ok, _other} ->
        error_response(req, 400, "Invalid JSON: expected an object")

      {:error, reason} ->
        error_response(req, 400, "Invalid JSON: #{inspect(reason)}")
    end
  end

  # read_body/1 hands large bodies over in pieces ({:more, ...}); keep reading
  # until the final {:ok, ...} chunk arrives.
  defp read_full_body(req, acc \\ []) do
    case :cowboy_req.read_body(req) do
      {:ok, data, req} -> {IO.iodata_to_binary([acc, data]), req}
      {:more, data, req} -> read_full_body(req, [acc, data])
    end
  end

  # NOTE(review): `file_path` comes straight from the client and is opened on
  # the server; restrict it to an allowed directory before exposing this
  # endpoint to untrusted callers.
  defp extract_from_params(%{"file_path" => file_path} = params, req)
       when is_binary(file_path) do
    mime_type = Map.get(params, "mime_type")
    config = params |> Map.get("config", %{}) |> build_config()

    case Kreuzberg.extract_file(file_path, mime_type, config) do
      {:ok, result} ->
        success_response(req, 200, %{
          success: true,
          content: result.content,
          mime_type: result.mime_type,
          metadata: result.metadata || %{},
          tables: result.tables || [],
          chunks: result.chunks || [],
          images: result.images || [],
          detected_languages: result.detected_languages || []
        })

      {:error, reason} ->
        Logger.error("Extraction failed for #{file_path}: #{inspect(reason)}")
        error_response(req, 400, "Extraction failed: #{inspect(reason)}")
    end
  end

  defp extract_from_params(_params, req) do
    error_response(req, 400, "Missing required parameter: file_path")
  end

  # Maps the request's string-keyed "config" object onto the extraction config
  # struct; anything that is not a map yields nil.
  defp build_config(opts) when is_map(opts) do
    %Kreuzberg.ExtractionConfig{
      ocr: opts["ocr"],
      chunking: opts["chunking"],
      quality_processing: opts["quality_processing"],
      language_detection: opts["language_detection"],
      images: opts["images"],
      use_cache: Map.get(opts, "use_cache", true)
    }
  end

  defp build_config(_), do: nil

  # Sends `data` as a JSON reply and returns the updated request.
  defp success_response(req, status, data) do
    :cowboy_req.reply(status, @json_headers, Jason.encode!(data), req)
  end

  # Sends a JSON error document and returns the updated request.
  defp error_response(req, status, message) do
    body = Jason.encode!(%{success: false, error: message})
    :cowboy_req.reply(status, @json_headers, body, req)
  end
end
|
||||
|
||||
# Health check handler
|
||||
defmodule KreuzbergMCPServer.HealthHandler do
  @moduledoc """
  Cowboy handler for the `/health` route.

  Always answers 200 with a JSON document containing a fixed status, the
  service name and the current UTC time.
  """

  def init(req, state) do
    now_iso = DateTime.to_iso8601(DateTime.utc_now())

    payload = %{
      status: "healthy",
      service: "kreuzberg-mcp",
      timestamp: now_iso
    }

    headers = %{"content-type" => "application/json"}
    replied = :cowboy_req.reply(200, headers, Jason.encode!(payload), req)

    {:ok, replied, state}
  end
end
|
||||
|
||||
# File upload handler
|
||||
defmodule KreuzbergMCPServer.FileHandler do
  @moduledoc """
  Cowboy handler for `POST /extract/file`.

  Treats the raw request body as the document, writes it to a temporary file,
  runs the extraction on that file and replies with a JSON summary. The
  temporary file is removed afterwards, whatever the outcome.
  """

  require Logger

  @json_headers %{"content-type" => "application/json"}

  @doc """
  Cowboy entry point. Only POST is accepted; other methods get a 405.
  """
  def init(req, state) do
    req =
      case :cowboy_req.method(req) do
        "POST" -> handle_file_upload(req)
        _ -> :cowboy_req.reply(405, %{}, "Method not allowed", req)
      end

    {:ok, req, state}
  end

  defp handle_file_upload(req) do
    {body, req} = read_full_body(req)
    temp_path = temp_path_for(req)

    try do
      case File.write(temp_path, body) do
        :ok ->
          extract_and_reply(temp_path, req)

        {:error, reason} ->
          Logger.error("Could not store upload at #{temp_path}: #{inspect(reason)}")
          json_reply(req, 500, %{success: false, error: inspect(reason)})
      end
    after
      # Runs even if extraction raises, so uploads never pile up on disk.
      File.rm(temp_path)
    end
  end

  defp extract_and_reply(temp_path, req) do
    case Kreuzberg.extract_file(temp_path) do
      {:ok, result} ->
        json_reply(req, 200, %{
          success: true,
          content_size: byte_size(result.content),
          mime_type: result.mime_type,
          metadata: result.metadata
        })

      {:error, reason} ->
        json_reply(req, 400, %{success: false, error: inspect(reason)})
    end
  end

  # read_body/1 hands large bodies over in pieces ({:more, ...}); keep reading
  # until the final {:ok, ...} chunk arrives.
  defp read_full_body(req, acc \\ []) do
    case :cowboy_req.read_body(req) do
      {:ok, data, req} -> {IO.iodata_to_binary([acc, data]), req}
      {:more, data, req} -> read_full_body(req, [acc, data])
    end
  end

  # Builds a unique path in the system temp directory. A short alphanumeric
  # extension from the client's X-File-Name header is kept (presumably useful
  # for format detection - TODO confirm); nothing else from the header is used.
  defp temp_path_for(req) do
    client_name = :cowboy_req.header("x-file-name", req, "")
    ext = client_name |> Path.basename() |> Path.extname()
    safe_ext = if Regex.match?(~r/\A\.[A-Za-z0-9]{1,10}\z/, ext), do: ext, else: ""

    Path.join(System.tmp_dir!(), "kreuzberg_#{System.unique_integer([:positive])}#{safe_ext}")
  end

  # Sends `payload` as a JSON reply and returns the updated request.
  defp json_reply(req, status, payload) do
    :cowboy_req.reply(status, @json_headers, Jason.encode!(payload), req)
  end
end
|
||||
|
||||
# Usage example - start the server
|
||||
IO.puts("=== Kreuzberg MCP Server ===\n")

start_outcome = KreuzbergMCPServer.start_server(port: 8080)

case start_outcome do
  {:error, reason} ->
    IO.puts("Failed to start server: #{inspect(reason)}")

  {:ok, message} ->
    IO.puts(message)
    IO.puts("\nServer is running and ready to accept requests:")

    # One line per route served by the listener
    route_lines = [
      " - POST /extract - Extract from file path",
      " - POST /extract/file - Upload and extract",
      " - GET /health - Health check"
    ]

    Enum.each(route_lines, &IO.puts/1)

    # Block this process forever so the script (and the listener) stay alive.
    IO.puts("\nServer started. Press Ctrl+C to stop.")
    Process.sleep(:infinity)
end
|
||||
```
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user