Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/docs/snippets/elixir/advanced/10_chunking_config.exs
+++ b/docs/snippets/elixir/advanced/10_chunking_config.exs
@@ -0,0 +1,53 @@
+# Advanced Chunking Configuration
+# This example shows how to configure sophisticated document chunking strategies
+# with fine-grained control over chunk size, overlap, and boundary respect.
+
+alias Kreuzberg.ExtractionConfig
+
+# Chunking options: size limits, overlap, and the boundary kinds to split on
+chunking_opts = %{
+  "enabled" => true,
+  "max_characters" => 1000,
+  "overlap" => 200,
+  "min_size" => 100,
+  "respect_boundaries" => true,
+  "split_on" => ["sentence", "paragraph"]
+}
+
+config = %ExtractionConfig{chunking: chunking_opts}
+
+# Run the extraction with the chunking configuration
+{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)
+
+# A nil chunk list is treated as empty
+chunks = result.chunks || []
+
+IO.puts("Total chunks: #{length(chunks)}")
+
+# Print each chunk's length and page, and whether it ends without trailing whitespace
+for chunk <- chunks do
+  text = chunk["content"]
+  summary = %{text_length: String.length(text), page: chunk["page"]}
+
+  IO.inspect(Map.put(summary, :boundaries_respected, !String.ends_with?(text, [" ", "\n"])))
+end
+
+# Markdown chunker with heading context prepended to chunk content
+heading_chunking = %{
+  "enabled" => true,
+  "chunker_type" => "markdown",
+  "prepend_heading_context" => true
+}
+
+config_with_headings = %ExtractionConfig{chunking: heading_chunking}
+
+{:ok, result_headings} = Kreuzberg.extract_file("document.md", nil, config_with_headings)
+heading_chunks = result_headings.chunks
+
+if heading_chunks do
+  IO.puts("Generated #{length(heading_chunks)} chunks with prepended headings")
+  # Each chunk's content is prefixed with its heading breadcrumb
+  for chunk <- heading_chunks do
+    IO.puts("Chunk preview: #{String.slice(chunk["content"], 0..80)}...")
+  end
+end
--- a/docs/snippets/elixir/advanced/11_language_detection_config.exs
+++ b/docs/snippets/elixir/advanced/11_language_detection_config.exs
@@ -0,0 +1,36 @@
+# Advanced Language Detection Configuration
+# This example demonstrates how to configure language detection with multiple
+# parameters for detecting and tracking all languages in documents.
+
+alias Kreuzberg.ExtractionConfig
+
+# Language detection options used for this extraction
+detection_opts = %{
+  "enabled" => true,
+  "detect_all" => true,
+  "min_confidence" => 0.8
+}
+
+config = %ExtractionConfig{language_detection: detection_opts}
+
+# Run the extraction with language detection switched on
+{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)
+
+# A nil result list is treated as empty
+detection_results = result.detected_languages || []
+
+IO.puts("Detected languages:")
+
+# Missing fields are printed as "unknown"
+for lang_result <- detection_results do
+  confidence = Map.get(lang_result, "confidence", "unknown")
+  language = Map.get(lang_result, "language", "unknown")
+  IO.puts("  - #{language}: #{confidence}")
+end
+
+# Keep entries at or above the 0.8 threshold; a missing confidence counts as 0
+confident? = fn lang_result -> Map.get(lang_result, "confidence", 0) >= 0.8 end
+
+high_confidence_languages = Enum.filter(detection_results, confident?)
+
+IO.puts("\nHigh confidence languages (>= 0.8): #{length(high_confidence_languages)}")
--- a/docs/snippets/elixir/advanced/12_keyword_extraction_config.exs
+++ b/docs/snippets/elixir/advanced/12_keyword_extraction_config.exs
@@ -0,0 +1,25 @@
+# Advanced keyword extraction configuration
+alias Kreuzberg.ExtractionConfig
+
+config = %ExtractionConfig{
+  keyword_extraction: %{
+    "enabled" => true,
+    "max_keywords" => 20,
+    "min_score" => 0.6,
+    "algorithm" => "tfidf"
+  }
+}
+
+{:ok, result} = Kreuzberg.extract_file("research_paper.pdf", nil, config)
+
+if result.keywords do
+  # Group by score ranges: high is >= 0.8, medium is >= 0.6 and < 0.8
+  {high_score, rest} = Enum.split_with(result.keywords, fn kw -> kw["score"] >= 0.8 end)
+  medium_score = Enum.filter(rest, fn kw -> kw["score"] >= 0.6 end)
+
+  # Report both groups so that every computed bucket is actually shown
+  for {label, keywords} <- [{"High", high_score}, {"Medium", medium_score}] do
+    IO.puts("#{label} confidence keywords (#{length(keywords)}):")
+    Enum.each(keywords, fn kw -> IO.puts("  - #{kw["word"]} (#{kw["score"]})") end)
+  end
+end
--- a/docs/snippets/elixir/advanced/13_quality_processing_config.exs
+++ b/docs/snippets/elixir/advanced/13_quality_processing_config.exs
@@ -0,0 +1,39 @@
+# Quality processing configuration
+alias Kreuzberg.ExtractionConfig
+
+# OCR and quality-processing options for the extraction
+config = %ExtractionConfig{
+  quality_processing: %{
+    "enabled" => true,
+    "min_quality_score" => 0.75,
+    "remove_noise" => true,
+    "enhance_clarity" => true
+  },
+  ocr: %{"enabled" => true, "backend" => "tesseract"}
+}
+
+# Extract file with quality processing and OCR
+{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)
+
+IO.puts("=== Quality Processing Results ===\n")
+
+# A start/length slice yields exactly 300 characters, matching the label below;
+# the inclusive range 0..300 would yield 301.
+content_preview = String.slice(result.content, 0, 300)
+IO.puts("Extracted Content (first 300 chars):")
+IO.puts(content_preview)
+IO.puts("\nTotal content size: #{byte_size(result.content)} bytes")
+
+# Quality metrics are reported only when a positive score is present
+quality_score = result.quality_score
+
+if quality_score && quality_score > 0 do
+  IO.puts("\nQuality Score: #{quality_score}")
+  IO.puts("Quality Status: #{if quality_score >= 0.75, do: "Acceptable", else: "Below threshold"}")
+end
+
+# Report extracted images, if any. Matching on a non-empty list avoids calling
+# length/1 just to test for emptiness.
+with [_ | _] = images <- result.images do
+  IO.puts("\nImages found: #{length(images)}")
+end
--- a/docs/snippets/elixir/advanced/14_token_reduction_config.exs
+++ b/docs/snippets/elixir/advanced/14_token_reduction_config.exs
@@ -0,0 +1,21 @@
+# Token reduction for different LLM context windows
+alias Kreuzberg.ExtractionConfig
+
+# For GPT-4: "intelligent" strategy with an 8000-token target, structure preserved
+config_gpt4 = %ExtractionConfig{
+  token_reduction: %{
+    "enabled" => true,
+    "strategy" => "intelligent",
+    "target_tokens" => 8_000,
+    "preserve_structure" => true
+  }
+}
+
+# For Claude: "minimal" strategy with a 100000-token target
+config_claude = %ExtractionConfig{
+  token_reduction: %{
+    "enabled" => true,
+    "strategy" => "minimal",
+    "target_tokens" => 100_000
+  }
+}
--- a/docs/snippets/elixir/advanced/9_vector_database_integration.exs
+++ b/docs/snippets/elixir/advanced/9_vector_database_integration.exs
@@ -0,0 +1,33 @@
+# Vector Database Integration
+# This example demonstrates how to prepare document chunks for integration with vector databases
+# by configuring chunking and processing the extracted content.
+
+alias Kreuzberg.ExtractionConfig
+
+# Chunking options for the extraction
+chunking_opts = %{
+  "enabled" => true,
+  "max_characters" => 512,
+  "overlap" => 50
+}
+config = %ExtractionConfig{chunking: chunking_opts}
+
+# Run the extraction with chunking enabled
+{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)
+
+# Shape one chunk into a record for vector database ingestion
+to_document = fn chunk ->
+  text = chunk["content"]
+  %{
+    content: text,
+    metadata: %{page: chunk["page"], char_count: String.length(text)}
+  }
+end
+
+# A nil chunk list is treated as empty
+documents = Enum.map(result.chunks || [], to_document)
+
+IO.puts("Prepared #{length(documents)} documents for vector DB")
+
+# The documents list can now be sent to your vector database
+# Example: documents |> MyVectorDB.index_documents()
--- a/docs/snippets/elixir/advanced/chunk_page_mapping.exs
+++ b/docs/snippets/elixir/advanced/chunk_page_mapping.exs
@@ -0,0 +1,13 @@
+# Extract with chunking and track page boundaries
+chunking = %{"enabled" => true, "max_characters" => 500}
+config = %Kreuzberg.ExtractionConfig{chunking: chunking, track_page_boundaries: true}
+
+{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)
+
+# Report the source page of each chunk, numbering chunks from 1
+(result.chunks || [])
+|> Enum.with_index(1)
+|> Enum.each(fn {chunk, position} ->
+  source_page = chunk["page"] || "unknown"
+  IO.puts("Chunk #{position} from page #{source_page}")
+end)
--- a/docs/snippets/elixir/advanced/chunk_page_mapping.md
+++ b/docs/snippets/elixir/advanced/chunk_page_mapping.md
@@ -0,0 +1,23 @@
+```elixir title="Elixir"
+config_json = Jason.encode!(%{
+  "chunking" => %{
+    "enabled" => true,
+    "max_characters" => 1024,
+    "overlap" => 128
+  }
+})
+
+{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
+
+# Map chunks to their source pages; a nil chunk list falls back to [] so Enum.map/2 cannot crash
+chunks_with_pages =
+  Enum.map(result.chunks || [], fn chunk ->
+    %{
+      "chunk_id" => chunk["id"],
+      "content" => chunk["content"],
+      "page_number" => chunk["page"]
+    }
+  end)
+
+IO.inspect(chunks_with_pages, label: "Chunks with Page Mapping")
+```
--- a/docs/snippets/elixir/advanced/chunking_config.md
+++ b/docs/snippets/elixir/advanced/chunking_config.md
@@ -0,0 +1,45 @@
+```elixir title="Elixir"
+# Chunking options, JSON-encoded for the sync API
+chunking_opts = %{
+  "enabled" => true,
+  "max_characters" => 1000,
+  "overlap" => 200,
+  "min_size" => 100,
+  "respect_boundaries" => true,
+  "split_on" => ["sentence", "paragraph"]
+}
+
+config_json = Jason.encode!(%{"chunking" => chunking_opts})
+
+{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
+
+# A nil chunk list is treated as empty
+chunks = result.chunks || []
+IO.puts("Total chunks: #{length(chunks)}")
+
+for chunk <- chunks do
+  text = chunk["content"]
+  summary = %{text_length: String.length(text), page: chunk["page"]}
+  IO.inspect(Map.put(summary, :boundaries_respected, !String.ends_with?(text, [" ", "\n"])))
+end
+```
+
+```elixir title="Elixir - Prepend Heading Context"
+# Markdown chunker with heading context prepended to chunk content
+heading_chunking = %{
+  "enabled" => true,
+  "chunker_type" => "markdown",
+  "prepend_heading_context" => true
+}
+
+config_json = Jason.encode!(%{"chunking" => heading_chunking})
+
+{:ok, result} = Kreuzberg.extract_file_sync("document.md", "text/markdown", config_json)
+heading_chunks = result.chunks
+
+if heading_chunks do
+  IO.puts("Generated #{length(heading_chunks)} chunks with prepended headings")
+
+  for chunk <- heading_chunks, do: IO.puts("Chunk preview: #{String.slice(chunk["content"], 0..80)}...")
+end
+```
--- a/docs/snippets/elixir/advanced/chunking_rag.exs
+++ b/docs/snippets/elixir/advanced/chunking_rag.exs
@@ -0,0 +1,26 @@
+# Configure chunking for RAG/vector search
+config = %Kreuzberg.ExtractionConfig{
+  chunking: %{
+    "enabled" => true,
+    "max_characters" => 512,
+    "overlap" => 50,
+    "respect_boundaries" => true
+  }
+}
+
+{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)
+
+# Prepare chunks for embedding; a nil chunk list falls back to [] so Enum.map/2 cannot crash
+chunks_for_embedding =
+  Enum.map(result.chunks || [], fn chunk ->
+    %{
+      "id" => chunk["id"],
+      "content" => chunk["content"],
+      "metadata" => %{
+        "page" => chunk["page"],
+        "source" => "document.pdf"
+      }
+    }
+  end)
+
+IO.inspect(chunks_for_embedding, label: "Chunks Ready for RAG")
--- a/docs/snippets/elixir/advanced/chunking_rag.md
+++ b/docs/snippets/elixir/advanced/chunking_rag.md
@@ -0,0 +1,27 @@
+```elixir title="Elixir"
+config_json = Jason.encode!(%{
+  "chunking" => %{
+    "enabled" => true,
+    "max_characters" => 512,
+    "overlap" => 50,
+    "respect_boundaries" => true
+  }
+})
+
+{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
+
+# Prepare chunks for embedding; a nil chunk list falls back to [] so Enum.map/2 cannot crash
+chunks_for_embedding =
+  Enum.map(result.chunks || [], fn chunk ->
+    %{
+      "id" => chunk["id"],
+      "content" => chunk["content"],
+      "metadata" => %{
+        "page" => chunk["page"],
+        "source" => "document.pdf"
+      }
+    }
+  end)
+
+IO.inspect(chunks_for_embedding, label: "Chunks Ready for RAG")
+```
--- a/docs/snippets/elixir/advanced/combining_all_features.exs
+++ b/docs/snippets/elixir/advanced/combining_all_features.exs
@@ -0,0 +1,20 @@
+# Enable OCR, chunking, image extraction, language detection, and keyword extraction together
+config = %Kreuzberg.ExtractionConfig{
+  keyword_extraction: %{"enabled" => true},
+  language_detection: %{"enabled" => true},
+  images: %{"extract" => true},
+  chunking: %{"enabled" => true, "max_characters" => 1000},
+  ocr: %{"enabled" => true}
+}
+
+{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)
+
+# Dump the complete result first
+IO.inspect(result, label: "Extraction Result")
+
+# Then print a one-line summary for each feature output that is present
+result.content && IO.puts("Text: #{String.slice(result.content, 0..100)}")
+result.detected_languages && IO.puts("Language: #{inspect(result.detected_languages)}")
+result.metadata["keywords"] && IO.puts("Keywords: #{inspect(result.metadata["keywords"])}")
+result.chunks && IO.puts("Chunks: #{length(result.chunks)}")
+result.images && IO.puts("Images: #{length(result.images)}")
--- a/docs/snippets/elixir/advanced/complete_example.exs
+++ b/docs/snippets/elixir/advanced/complete_example.exs
@@ -0,0 +1,103 @@
+```elixir title="Elixir"
+alias Kreuzberg.ExtractionConfig
+
+# Comprehensive configuration: every feature below is set up for one extraction
+ocr_opts = %{
+  "enabled" => true,
+  "backend" => "tesseract",
+  "language" => "eng",
+  "force_all_pages" => false
+}
+
+chunking_opts = %{
+  "enabled" => true,
+  "max_characters" => 1000,
+  "overlap" => 100
+}
+
+config = %ExtractionConfig{
+  # OCR for scanned documents
+  ocr: ocr_opts,
+  # Chunking for RAG applications
+  chunking: chunking_opts,
+  # Image extraction
+  images: %{"extract" => true},
+  # Language detection
+  language_detection: %{"enabled" => true},
+  # Result caching for performance
+  use_cache: true
+}
+
+# Extract the file with the full configuration and report every part of the result
+case Kreuzberg.extract_file("document.pdf", nil, config) do
+  {:ok, result} ->
+    IO.puts("=== Extraction Successful ===\n")
+
+    # 1. Process content
+    IO.puts("--- Content ---")
+    content_preview = String.slice(result.content, 0..100)
+    IO.puts("Preview: #{content_preview}...")
+    IO.puts("Total size: #{byte_size(result.content)} bytes\n")
+
+    # 2. Process metadata (a nil metadata map is treated as empty)
+    IO.puts("--- Metadata ---")
+    IO.puts("MIME type: #{result.mime_type}")
+    metadata = result.metadata || %{}
+    IO.puts("Metadata keys: #{inspect(Map.keys(metadata))}")
+
+    # Handle PDF-specific metadata
+    case metadata["pdf"] do
+      pdf_meta when is_map(pdf_meta) ->
+        IO.puts("  Pages: #{pdf_meta["page_count"]}")
+        IO.puts("  Author: #{pdf_meta["author"]}")
+        IO.puts("  Title: #{pdf_meta["title"]}")
+      _ -> nil
+    end
+    IO.puts("")
+
+    # 3. Process tables
+    IO.puts("--- Tables ---")
+    tables = result.tables || []
+    IO.puts("Total tables: #{length(tables)}")
+    Enum.with_index(tables, 1) |> Enum.each(fn {table, idx} ->
+      cells = table["cells"] || []
+      IO.puts("  Table #{idx}: #{length(cells)} rows")
+      markdown = table["markdown"]
+      if markdown, do: IO.puts("    Markdown: #{String.slice(markdown, 0..50)}...")
+    end)
+    IO.puts("")
+
+    # 4. Process chunks for RAG (a chunk is a map; byte_size/1 needs its "content" text)
+    IO.puts("--- Chunks ---")
+    chunks = result.chunks || []
+    IO.puts("Total chunks: #{length(chunks)}")
+    Enum.with_index(chunks, 1) |> Enum.each(fn {chunk, idx} ->
+      IO.puts("  Chunk #{idx}: #{byte_size(chunk["content"])} bytes")
+    end)
+    IO.puts("")
+
+    # 5. Process detected languages
+    IO.puts("--- Language Detection ---")
+    languages = result.detected_languages || []
+    if Enum.empty?(languages) do
+      IO.puts("No languages detected")
+    else
+      Enum.each(languages, fn lang ->
+        IO.puts("  Language: #{lang}")
+      end)
+    end
+    IO.puts("")
+
+    # 6. Process images
+    IO.puts("--- Images ---")
+    images = result.images || []
+    IO.puts("Total images: #{length(images)}")
+    Enum.with_index(images, 1) |> Enum.each(fn {image, idx} ->
+      IO.puts("  Image #{idx}: #{image["format"]} (#{image["size"]} bytes)")
+    end)
+
+  {:error, reason} ->
+    IO.puts("Extraction failed!")
+    IO.puts("Error: #{inspect(reason)}")
+end
+```
--- a/docs/snippets/elixir/advanced/embedding_with_chunking.exs
+++ b/docs/snippets/elixir/advanced/embedding_with_chunking.exs
@@ -0,0 +1,27 @@
+# Extract and prepare for embeddings
+config = %Kreuzberg.ExtractionConfig{
+  chunking: %{"enabled" => true, "max_characters" => 512},
+  embeddings: %{"enabled" => true}
+}
+
+{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)
+
+# Process chunks with embeddings for semantic search.
+# A nil chunk list falls back to [] so the pipeline cannot crash.
+embedded_chunks =
+  (result.chunks || [])
+  |> Enum.with_index(1)
+  |> Enum.map(fn {chunk, idx} ->
+    # idx is the chunk's 1-based position in the list
+    %{
+      "chunk_id" => idx,
+      "content" => chunk["content"],
+      "embedding" => chunk["embedding"],
+      "page" => chunk["page"],
+      "metadata" => %{"document" => "document.pdf", "chunk_index" => idx}
+    }
+  end)
+
+# Store embeddings in vector database
+IO.puts("Prepared #{length(embedded_chunks)} chunks with embeddings")
+IO.inspect(embedded_chunks, label: "Embedded Chunks")
--- a/docs/snippets/elixir/advanced/embedding_with_chunking.md
+++ b/docs/snippets/elixir/advanced/embedding_with_chunking.md
@@ -0,0 +1,32 @@
+```elixir title="Elixir"
+config_json = Jason.encode!(%{
+  "chunking" => %{
+    "enabled" => true,
+    "max_characters" => 512
+  },
+  "embeddings" => %{
+    "enabled" => true
+  }
+})
+
+{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
+
+# Process chunks with embeddings for semantic search.
+# A nil chunk list falls back to [] so the pipeline cannot crash.
+embedded_chunks =
+  (result.chunks || [])
+  |> Enum.with_index(1)
+  |> Enum.map(fn {chunk, idx} ->
+    # idx is the chunk's 1-based position in the list
+    %{
+      "chunk_id" => idx,
+      "content" => chunk["content"],
+      "embedding" => chunk["embedding"],
+      "page" => chunk["page"],
+      "metadata" => %{"document" => "document.pdf", "chunk_index" => idx}
+    }
+  end)
+
+IO.puts("Prepared #{length(embedded_chunks)} chunks with embeddings")
+IO.inspect(embedded_chunks, label: "Embedded Chunks")
+```
--- a/docs/snippets/elixir/advanced/keyword_extraction_config.md
+++ b/docs/snippets/elixir/advanced/keyword_extraction_config.md
@@ -0,0 +1,15 @@
+```elixir title="Elixir"
+# Keyword extraction options, JSON-encoded for the sync API
+keyword_opts = %{"algorithm" => "Yake", "max_keywords" => 10, "min_score" => 0.3}
+
+config_json = Jason.encode!(%{"keywords" => keyword_opts})
+
+{:ok, result} = Kreuzberg.extract_file_sync("research_paper.pdf", "application/pdf", config_json)
+
+# Print the keywords only when the result carries some
+keywords = result.keywords
+
+if keywords do
+  IO.puts("Keywords: #{inspect(keywords)}")
+end
+```
--- a/docs/snippets/elixir/advanced/keyword_extraction_example.exs
+++ b/docs/snippets/elixir/advanced/keyword_extraction_example.exs
@@ -0,0 +1,30 @@
+```elixir title="Elixir"
+alias Kreuzberg.ExtractionConfig
+
+# Keyword extraction options for this run
+config = %ExtractionConfig{
+  keyword_extraction: %{"enabled" => true, "max_keywords" => 15}
+}
+
+# Prints one keyword per line as "- word: score"
+print_keyword = fn kw -> IO.puts("- #{kw["word"]}: #{kw["score"]}") end
+
+case Kreuzberg.extract_file("article.pdf", nil, config) do
+  {:error, reason} ->
+    IO.puts("Extraction failed!")
+    IO.puts("Error: #{inspect(reason)}")
+
+  {:ok, result} ->
+    IO.puts("=== Keyword Extraction ===\n")
+
+    # Display the extracted keywords, or say that none were found
+    case result.keywords do
+      absent when absent in [nil, false] ->
+        IO.puts("No keywords extracted")
+
+      keywords ->
+        IO.puts("Extracted keywords:")
+        Enum.each(keywords, print_keyword)
+    end
+end
+```
--- a/docs/snippets/elixir/advanced/keyword_extraction_example.md
+++ b/docs/snippets/elixir/advanced/keyword_extraction_example.md
@@ -0,0 +1,18 @@
+```elixir title="Elixir"
+# Keyword extraction options, JSON-encoded for the sync API
+config_json =
+  Jason.encode!(%{
+    "keywords" => %{"algorithm" => "Yake", "max_keywords" => 10, "min_score" => 0.3}
+  })
+
+{:ok, result} = Kreuzberg.extract_file_sync("research_paper.pdf", "application/pdf", config_json)
+
+# Print each keyword with its score rounded to four decimal places
+keywords = result.keywords
+
+if keywords do
+  Enum.each(keywords, fn %{"keyword" => term, "score" => weight} ->
+    IO.puts("#{term}: #{Float.round(weight, 4)}")
+  end)
+end
+```
--- a/docs/snippets/elixir/advanced/language_detection_config.md
+++ b/docs/snippets/elixir/advanced/language_detection_config.md
@@ -0,0 +1,15 @@
+```elixir title="Elixir"
+# Language detection options, JSON-encoded for the sync API
+detection_opts = %{"enabled" => true, "min_confidence" => 0.8, "detect_multiple" => false}
+
+config_json = Jason.encode!(%{"language_detection" => detection_opts})
+
+{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
+
+# Print the detected language when one is present
+language = result.language
+
+if language do
+  IO.puts("Detected language: #{language}")
+end
+```
--- a/docs/snippets/elixir/advanced/language_detection_multilingual.exs
+++ b/docs/snippets/elixir/advanced/language_detection_multilingual.exs
@@ -0,0 +1,32 @@
+```elixir title="Elixir"
+alias Kreuzberg.ExtractionConfig
+
+# Language detection options with "detect_all" switched on
+config = %ExtractionConfig{
+  language_detection: %{"enabled" => true, "detect_all" => true}
+}
+
+case Kreuzberg.extract_file("multilingual.pdf", nil, config) do
+  {:error, reason} ->
+    IO.puts("Extraction failed!")
+    IO.puts("Error: #{inspect(reason)}")
+
+  {:ok, result} ->
+    IO.puts("=== Language Detection ===\n")
+
+    # A nil language list is treated as empty
+    languages = result.detected_languages || []
+
+    case Enum.empty?(languages) do
+      true ->
+        IO.puts("No languages detected")
+
+      false ->
+        IO.puts("Detected languages:")
+        Enum.each(languages, fn language ->
+          IO.puts("- #{language}")
+        end)
+        IO.puts("\nTotal languages: #{length(languages)}")
+    end
+end
+```
--- a/docs/snippets/elixir/advanced/language_detection_multilingual.md
+++ b/docs/snippets/elixir/advanced/language_detection_multilingual.md
@@ -0,0 +1,18 @@
+```elixir title="Elixir"
+# Language detection options, JSON-encoded for the sync API
+detection_opts = %{"enabled" => true, "min_confidence" => 0.7, "detect_multiple" => true}
+
+config_json = Jason.encode!(%{"language_detection" => detection_opts})
+
+{:ok, result} = Kreuzberg.extract_file_sync("multilingual_document.pdf", "application/pdf", config_json)
+
+# Print each detected language with its confidence rounded to four decimal places
+languages = result.languages
+
+if languages do
+  IO.puts("Detected languages:")
+  Enum.each(languages, fn %{"language" => code, "confidence" => certainty} ->
+    IO.puts("  - #{code}: #{Float.round(certainty, 4)}")
+  end)
+end
+```
--- a/docs/snippets/elixir/advanced/metadata_extraction.exs
+++ b/docs/snippets/elixir/advanced/metadata_extraction.exs
@@ -0,0 +1,28 @@
+```elixir title="Elixir"
+{:ok, result} = Kreuzberg.extract_file("document.pdf")
+
+# Metadata is flat — format-specific fields are at the top level
+metadata = result.metadata
+IO.puts("MIME type: #{result.mime_type}")
+IO.puts("All metadata keys: #{inspect(Map.keys(metadata))}")
+
+# Prints "label: value" only when the value is present (neither nil nor false)
+print_field = fn label, value -> if value, do: IO.puts("#{label}: #{value}") end
+# Prints "label: a, b, c" only when the list is non-empty; a missing list counts as empty
+print_list = fn label, values ->
+  items = values || []
+  if items != [], do: IO.puts("#{label}: #{Enum.join(items, ", ")}")
+end
+
+# PDF metadata, read directly from the flat map
+print_field.("Page count", metadata["page_count"])
+print_list.("Authors", metadata["authors"])
+print_field.("Title", metadata["title"])
+
+# HTML metadata, read directly from the flat map
+{:ok, html_result} = Kreuzberg.extract_file("page.html")
+html_meta = html_result.metadata
+
+print_list.("Keywords", html_meta["keywords"])
+print_field.("Description", html_meta["description"])
+```
--- a/docs/snippets/elixir/advanced/quality_processing_config.md
+++ b/docs/snippets/elixir/advanced/quality_processing_config.md
@@ -0,0 +1,15 @@
+```elixir title="Elixir"
+# Post-processor list with the "QualityFilter" entry enabled, JSON-encoded for the sync API
+quality_filter = %{
+  "name" => "QualityFilter",
+  "enabled" => true
+}
+
+config_json = Jason.encode!(%{"post_processors" => [quality_filter]})
+
+{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
+
+# Show the resulting text under a label
+IO.puts("Extraction quality applied")
+IO.inspect(result.text, label: "Quality-filtered text")
+```
--- a/docs/snippets/elixir/advanced/quality_processing_example.exs
+++ b/docs/snippets/elixir/advanced/quality_processing_example.exs
@@ -0,0 +1,39 @@
+```elixir title="Elixir"
+alias Kreuzberg.ExtractionConfig
+
+# Maps a quality score to a status label. It is an anonymous function bound before
+# its use because def/defp cannot appear outside a module in a script.
+quality_status = fn
+  score when score >= 0.9 -> "Excellent"
+  score when score >= 0.8 -> "Good"
+  score when score >= 0.7 -> "Fair"
+  _score -> "Poor"
+end
+
+# Process document with quality filtering
+config = %ExtractionConfig{
+  quality_processing: %{"enabled" => true, "min_quality_score" => 0.8}
+}
+
+case Kreuzberg.extract_file("document.pdf", nil, config) do
+  {:ok, result} ->
+    IO.puts("=== Quality Processing ===\n")
+
+    # Display quality metrics if available
+    quality_score = result.quality_score
+    if quality_score && quality_score > 0 do
+      IO.puts("Quality Score: #{quality_score}")
+      IO.puts("Content Quality: #{quality_status.(quality_score)}")
+    end
+
+    # Display content with quality assurance
+    IO.puts("\n--- Extracted Content ---")
+    content_preview = String.slice(result.content, 0..200)
+    IO.puts(content_preview)
+    IO.puts("\nTotal size: #{byte_size(result.content)} bytes")
+
+  {:error, reason} ->
+    IO.puts("Extraction failed!")
+    IO.puts("Error: #{inspect(reason)}")
+end
+```
--- a/docs/snippets/elixir/advanced/quality_processing_example.md
+++ b/docs/snippets/elixir/advanced/quality_processing_example.md
@@ -0,0 +1,22 @@
+```elixir title="Elixir"
+config_json = Jason.encode!(%{
+  "post_processors" => [%{"name" => "QualityFilter", "enabled" => true}]
+})
+
+{:ok, result_before} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", nil)
+
+{:ok, result_after} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
+
+# Compare text quality metrics; a nil text counts as empty
+chars_before = String.length(result_before.text || "")
+chars_after = String.length(result_after.text || "")
+
+IO.puts("Before quality processing: #{chars_before} chars")
+IO.puts("After quality processing: #{chars_after} chars")
+
+# The percentage is undefined for an empty baseline: dividing by zero raises
+# ArithmeticError, so it is only reported when the unprocessed text is non-empty.
+if chars_before > 0 do
+  IO.puts("Improvement: #{Float.round((1 - chars_after / chars_before) * 100, 2)}%")
+end
+```
--- a/docs/snippets/elixir/advanced/table_extraction.exs
+++ b/docs/snippets/elixir/advanced/table_extraction.exs
@@ -0,0 +1,19 @@
+```elixir title="Elixir"
+{:ok, result} = Kreuzberg.extract_file("document.pdf")
+
+# Default to an empty list so a nil table list cannot crash length/1
+tables = result.tables || []
+IO.puts("Total tables found: #{length(tables)}")
+
+for {table, index} <- Enum.with_index(tables, 1) do
+  IO.puts("\n--- Table #{index} ---")
+
+  # Access table cells (a missing cell list counts as empty)
+  cells = table["cells"] || []
+  IO.puts("Rows: #{length(cells)}")
+
+  # Print the markdown representation only when the table carries one
+  markdown = table["markdown"]
+  if markdown, do: IO.puts("Markdown representation:\n#{markdown}")
+end
+```
--- a/docs/snippets/elixir/advanced/token_reduction_config.md
+++ b/docs/snippets/elixir/advanced/token_reduction_config.md
@@ -0,0 +1,17 @@
+```elixir title="Elixir"
+# Token reduction options, JSON-encoded for the sync API
+reduction_opts = %{
+  "mode" => "moderate",
+  "preserve_markdown" => true
+}
+config_json = Jason.encode!(%{"token_reduction" => reduction_opts})
+
+{:ok, result} = Kreuzberg.extract_file_sync("verbose_document.pdf", "application/pdf", config_json)
+
+# Print each token count that is present on the result
+original_tokens = result.original_token_count
+if original_tokens, do: IO.puts("Original tokens: #{original_tokens}")
+
+reduced_tokens = result.reduced_token_count
+if reduced_tokens, do: IO.puts("Reduced tokens: #{reduced_tokens}")
+```
--- a/docs/snippets/elixir/advanced/token_reduction_example.exs
+++ b/docs/snippets/elixir/advanced/token_reduction_example.exs
@@ -0,0 +1,42 @@
+```elixir title="Elixir"
+alias Kreuzberg.ExtractionConfig
+
+# Token budget shared by the configuration and the report below
+target_tokens = 2000
+
+config = %ExtractionConfig{
+  token_reduction: %{"enabled" => true, "target_tokens" => target_tokens}
+}
+
+case Kreuzberg.extract_file("document.pdf", nil, config) do
+  {:error, reason} ->
+    IO.puts("Extraction failed!")
+    IO.puts("Error: #{inspect(reason)}")
+
+  {:ok, result} ->
+    IO.puts("=== Token Reduction ===\n")
+
+    # Rough estimate: 1 token ≈ 4 bytes
+    content_size = byte_size(result.content)
+    estimated_tokens = div(content_size, 4)
+
+    IO.puts("Content size: #{content_size} bytes")
+    IO.puts("Estimated tokens: ~#{estimated_tokens}")
+    IO.puts("Target tokens: #{target_tokens}")
+
+    # Report how far the estimate exceeds the target, if it does
+    status =
+      if estimated_tokens > target_tokens do
+        reduction_percentage = trunc((1 - target_tokens / estimated_tokens) * 100)
+        "\nToken reduction applied: ~#{reduction_percentage}% reduction"
+      else
+        "\nNo reduction needed - content already below target"
+      end
+    IO.puts(status)
+
+    # Display reduced content preview
+    IO.puts("\n--- Reduced Content ---")
+    IO.puts(String.slice(result.content, 0..300))
+    IO.puts("...")
+end
+```
--- a/docs/snippets/elixir/advanced/token_reduction_example.md
+++ b/docs/snippets/elixir/advanced/token_reduction_example.md
@@ -0,0 +1,28 @@
+```elixir title="Elixir"
+# Token reduction options, JSON-encoded for the sync API
+reduction_opts = %{"mode" => "moderate", "preserve_markdown" => true}
+config_json = Jason.encode!(%{"token_reduction" => reduction_opts})
+
+{:ok, result} = Kreuzberg.extract_file_sync("verbose_document.pdf", "application/pdf", config_json)
+
+# Token counts; a missing count is reported as 0
+original = result.original_token_count || 0
+reduced = result.reduced_token_count || 0
+
+IO.puts("Original tokens: #{original}")
+IO.puts("Reduced tokens: #{reduced}")
+
+# The percentage is only computed for a positive original count
+if original > 0 do
+  saved_fraction = 1 - reduced / original
+  IO.puts("Reduction: #{Float.round(saved_fraction * 100, 2)}%")
+end
+
+# Show sample of reduced text
+reduced_text = result.text
+
+if reduced_text do
+  IO.puts("\nSample of reduced text:")
+  IO.puts(String.slice(reduced_text, 0..200) <> "...")
+end
+```
--- a/docs/snippets/elixir/advanced/vector_database_integration.md
+++ b/docs/snippets/elixir/advanced/vector_database_integration.md
@@ -0,0 +1,40 @@
+```elixir title="Elixir"
+config_json = Jason.encode!(%{
+  "chunking" => %{
+    "enabled" => true,
+    "max_characters" => 512,
+    "overlap" => 50
+  },
+  "embeddings" => %{"enabled" => true, "model" => "all-MiniLM-L6-v2"}
+})
+
+{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
+
+# Prepare data for vector database storage.
+# A nil chunk list falls back to [] so the pipeline cannot crash.
+vector_db_records =
+  (result.chunks || [])
+  |> Enum.with_index(1)
+  |> Enum.map(fn {chunk, idx} ->
+    # Record ids are "<document_id>-chunk-<n>", with n counted from 1
+    %{
+      "id" => "#{result.document_id}-chunk-#{idx}",
+      "vector" => chunk["embedding"],
+      "metadata" => %{
+        "content" => chunk["content"],
+        "page" => chunk["page"],
+        "document_id" => result.document_id,
+        "chunk_index" => idx
+      }
+    }
+  end)
+
+IO.puts("Generated #{length(vector_db_records)} records for vector database")
+IO.inspect(List.first(vector_db_records), label: "Sample Record")
+
+# Example: Insert into Pinecone-like vector database
+Enum.each(vector_db_records, fn record ->
+  # vector_db_client.upsert(record)
+  IO.puts("Would insert: #{record["id"]}")
+end)
+```
--- a/docs/snippets/elixir/api/batch_extract_bytes_sync.md
+++ b/docs/snippets/elixir/api/batch_extract_bytes_sync.md
@@ -0,0 +1,43 @@
+```elixir title="Elixir"
+defmodule Example do
+  @moduledoc """
+  Batch extraction from in-memory bytes using `Task.async_stream/3` over sync calls.
+  """
+
+  # MIME types for the file extensions used in this example.
+  @mime_types %{
+    ".pdf" => "application/pdf",
+    ".docx" => "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+  }
+
+  @doc """
+  Reads each file, extracts it concurrently (at most four at a time), and prints
+  one line per file: the extracted character count, or the error.
+  """
+  def batch_extract_bytes do
+    # Note: Batch extraction in Elixir is done via Task.async_stream over sync calls
+    files = ["doc1.pdf", "doc2.docx", "report.pdf"]
+    config = nil
+
+    results =
+      files
+      |> Task.async_stream(
+        fn file ->
+          content = File.read!(file)
+          # Each file gets the MIME type matching its extension, not a fixed PDF type
+          mime_type = Map.fetch!(@mime_types, Path.extname(file))
+          Kreuzberg.extract_bytes_sync(content, mime_type, config)
+        end,
+        max_concurrency: 4
+      )
+      |> Enum.map(fn {:ok, result} -> result end)
+
+    Enum.each(results, fn
+      # The success value is an extraction result; its text is under `content`
+      {:ok, result} -> IO.puts("Extracted: #{String.length(result.content)} chars")
+      # inspect/1 keeps non-string error terms printable
+      {:error, reason} -> IO.puts("Error: #{inspect(reason)}")
+    end)
+  end
+end
+```
--- a/docs/snippets/elixir/api/batch_extract_files_sync.md
+++ b/docs/snippets/elixir/api/batch_extract_files_sync.md
@@ -0,0 +1,23 @@
+```elixir title="Elixir"
+defmodule Example do
+  # Extracts several files concurrently (at most four at a time) and prints one
+  # line per file: the extracted character count, or the error.
+  def batch_extract_files do
+    files = ["doc1.pdf", "doc2.docx", "report.pdf"]
+    config = nil
+
+    # A nil MIME type is passed for every file
+    results =
+      files
+      |> Task.async_stream(&Kreuzberg.extract_file_sync(&1, nil, config), max_concurrency: 4)
+      |> Enum.map(fn {:ok, result} -> result end)
+
+    Enum.each(results, fn
+      # The success value is an extraction result; its text is under `content`
+      {:ok, result} -> IO.puts("File extracted: #{String.length(result.content)} chars")
+      # inspect/1 keeps non-string error terms printable
+      {:error, reason} -> IO.puts("Error: #{inspect(reason)}")
+    end)
+  end
+end
+```
--- a/docs/snippets/elixir/api/client_chunk_text.md
+++ b/docs/snippets/elixir/api/client_chunk_text.md
@@ -0,0 +1,30 @@
+```elixir title="Elixir"
+defmodule Example do
+  @moduledoc """
+  Chunks text through the HTTP API using `Req`.
+  """
+
+  @doc """
+  Posts a text and a chunking config to the local `/chunk` endpoint.
+
+  Returns `{:ok, chunks}` on success, or `{:error, description}` where
+  `description` is the inspected failure (transport error, undecodable body,
+  or a body without a `"chunks"` list).
+  """
+  def chunk_text_via_http do
+    text = "Your long document text here..."
+
+    config = %{
+      "chunking" => %{
+        "max_characters" => 800,
+        "overlap" => 100,
+        "chunker_type" => "Markdown"
+      }
+    }
+
+    with {:ok, response} <-
+           Req.post(
+             "http://localhost:8000/chunk",
+             json: %{
+               "text" => text,
+               "config" => config
+             }
+           ),
+         {:ok, body} <- decode_body(response.body),
+         %{"chunks" => chunks} when is_list(chunks) <- body do
+      IO.puts("Created #{length(chunks)} chunks")
+      {:ok, chunks}
+    else
+      error -> {:error, inspect(error)}
+    end
+  end
+
+  # Req decodes JSON responses itself, so the body is normally already a map;
+  # passing that map to Jason.decode/1 would raise. Decode by hand only when
+  # the body is still a raw binary.
+  defp decode_body(body) when is_binary(body), do: Jason.decode(body)
+  defp decode_body(body), do: {:ok, body}
+end
+```
--- a/docs/snippets/elixir/api/client_extract_single_file.md
+++ b/docs/snippets/elixir/api/client_extract_single_file.md
@@ -0,0 +1,19 @@
+```elixir title="Elixir"
+defmodule Example do
+  @moduledoc """
+  Uploads a single document to the HTTP API using `Req`.
+  """
+
+  @doc """
+  Reads `document.pdf` and posts it as a multipart upload to `/extract`.
+
+  Returns `{:ok, body}` with the decoded response, or `{:error, description}`
+  where `description` is the inspected failure.
+  """
+  def extract_via_http do
+    file_path = "document.pdf"
+
+    # `form:` would URL-encode the fields; file uploads need `form_multipart:`.
+    # The bytes read from disk are sent as the part body, with the file name
+    # attached so the server can see what was uploaded.
+    with {:ok, file} <- File.read(file_path),
+         {:ok, response} <-
+           Req.post(
+             "http://localhost:8000/extract",
+             form_multipart: [file: {file, filename: Path.basename(file_path)}]
+           ),
+         {:ok, body} <- decode_body(response.body) do
+      IO.puts("Extracted content: #{body["content"]}")
+      {:ok, body}
+    else
+      error -> {:error, inspect(error)}
+    end
+  end
+
+  # Req decodes JSON responses itself; decode by hand only for raw binaries.
+  defp decode_body(body) when is_binary(body), do: Jason.decode(body)
+  defp decode_body(body), do: {:ok, body}
+end
+```
--- a/docs/snippets/elixir/api/combining_all_features.md
+++ b/docs/snippets/elixir/api/combining_all_features.md
@@ -0,0 +1,41 @@
+```elixir title="Elixir"
+defmodule Example do
+  @moduledoc """
+  Shows caching, OCR, chunking, image extraction and language detection
+  combined in a single extraction call.
+  """
+
+  @doc """
+  Extracts `report.pdf` with a JSON-encoded configuration.
+
+  Prints a short report and returns `:ok` on success or `:error` on failure.
+  """
+  def full_extraction_pipeline do
+    ocr = %{"backend" => "tesseract", "language" => "eng"}
+
+    chunking = %{
+      "max_characters" => 800,
+      "overlap" => 100,
+      "chunker_type" => "Markdown",
+      "prepend_heading_context" => true
+    }
+
+    # The full configuration is handed to the extractor as a JSON string
+    config_json =
+      Jason.encode!(%{
+        "use_cache" => true,
+        "enable_quality_processing" => true,
+        "force_ocr" => false,
+        "ocr" => ocr,
+        "chunking" => chunking,
+        "output_format" => "Markdown",
+        "include_document_structure" => true,
+        "images" => %{"extract_images" => true},
+        "language_detection" => %{"detect" => true}
+      })
+
+    outcome = Kreuzberg.extract_file_sync("report.pdf", nil, config_json)
+
+    case outcome do
+      {:error, reason} ->
+        IO.puts("Extraction failed: #{reason}")
+        :error
+
+      {:ok, result} ->
+        IO.puts("Extraction successful")
+        IO.puts("Content length: #{String.length(result)} chars")
+        :ok
+    end
+  end
+end
+```
--- a/docs/snippets/elixir/api/error_handling.md
+++ b/docs/snippets/elixir/api/error_handling.md
@@ -0,0 +1,24 @@
+```elixir title="Elixir"
+defmodule Example do
+  @moduledoc """
+  Shows how to branch on the error returned by an extraction call.
+  """
+
+  @doc """
+  Extracts `document.txt` and prints either the content length or a message
+  chosen from the error text.
+  """
+  def handle_extraction_errors do
+    # MIME type and config are left as nil
+    case Kreuzberg.extract_file_sync("document.txt", nil, nil) do
+      {:ok, result} ->
+        IO.puts("Success: #{String.length(result)} chars")
+
+      {:error, reason} when is_binary(reason) ->
+        # Error is a string description
+        IO.puts(describe_error(reason))
+
+      {:error, reason} ->
+        # Non-string reasons (atoms, structs, ...) are shown via inspect/1
+        IO.puts("Extraction failed: #{inspect(reason)}")
+    end
+  end
+
+  # Only guard-safe functions may appear after `when`; `String.contains?/2`
+  # is not one of them, so the message is classified with `cond` instead.
+  defp describe_error(msg) do
+    cond do
+      String.contains?(msg, "unsupported") -> "Unsupported format: #{msg}"
+      String.contains?(msg, "not found") -> "File not found: #{msg}"
+      true -> "Extraction failed: #{msg}"
+    end
+  end
+end
+```
--- a/docs/snippets/elixir/api/error_handling_extract.md
+++ b/docs/snippets/elixir/api/error_handling_extract.md
@@ -0,0 +1,28 @@
+```elixir title="Elixir"
+defmodule Example do
+  # Extracts the file at `path` step by step: existence check, read, MIME
+  # detection, extraction. Every step is wrapped in a tagged tuple so the
+  # `else` branch can tell which step failed and build a matching message.
+  #
+  # Returns `{:ok, result}` or `{:error, message}` with a string message.
+  def robust_extract(path) do
+    with {:file_exists, true} <- {:file_exists, File.exists?(path)},
+         {:read, {:ok, content}} <- {:read, File.read(path)},
+         {:mime, {:ok, mime_type}} <- {:mime, detect_mime_type(content)},
+         {:extract, {:ok, result}} <- {:extract, Kreuzberg.extract_bytes_sync(content, mime_type, nil)} do
+      {:ok, result}
+    else
+      {:file_exists, false} ->
+        {:error, "File not found: #{path}"}
+
+      # File.read/1 reasons are atoms such as :eacces, hence inspect/1
+      {:read, {:error, reason}} ->
+        {:error, "Failed to read file: #{inspect(reason)}"}
+
+      # NOTE(review): interpolating `reason` directly assumes it implements
+      # String.Chars (string or atom) - confirm against the library
+      {:mime, {:error, reason}} ->
+        {:error, "MIME detection failed: #{reason}"}
+
+      {:extract, {:error, reason}} ->
+        {:error, "Extraction failed: #{reason}"}
+    end
+  end
+
+  # Thin wrapper around the library's byte-based MIME detection.
+  defp detect_mime_type(content) do
+    Kreuzberg.detect_mime_type_from_bytes(content)
+  end
+end
+```
--- a/docs/snippets/elixir/api/extract_bytes_async.md
+++ b/docs/snippets/elixir/api/extract_bytes_async.md
@@ -0,0 +1,22 @@
+```elixir title="Elixir"
+defmodule Example do
+  @moduledoc """
+  Extraction from in-memory bytes, run inside a `Task`.
+  """
+
+  @doc """
+  Reads `document.pdf`, extracts it in a task and prints the outcome.
+
+  Returns `:ok` on success and `:error` on failure.
+  """
+  def extract_from_bytes_async do
+    bytes = File.read!("document.pdf")
+
+    job = fn -> Kreuzberg.extract_bytes_async(bytes, "application/pdf", nil) end
+    outcome = job |> Task.async() |> Task.await()
+
+    case outcome do
+      {:error, reason} ->
+        IO.puts("Error: #{reason}")
+        :error
+
+      {:ok, result} ->
+        IO.puts("Content: #{result}")
+        :ok
+    end
+  end
+end
+```
--- a/docs/snippets/elixir/api/extract_bytes_sync.md
+++ b/docs/snippets/elixir/api/extract_bytes_sync.md
@@ -0,0 +1,18 @@
+```elixir title="Elixir"
+defmodule Example do
+  @moduledoc """
+  Synchronous extraction from in-memory bytes.
+  """
+
+  @doc """
+  Reads `document.pdf`, extracts it and prints the outcome.
+
+  Returns `:ok` on success and `:error` on failure.
+  """
+  def extract_from_bytes do
+    outcome =
+      "document.pdf"
+      |> File.read!()
+      |> Kreuzberg.extract_bytes_sync("application/pdf", nil)
+
+    case outcome do
+      {:error, reason} ->
+        IO.puts("Error: #{reason}")
+        :error
+
+      {:ok, result} ->
+        IO.puts("Content: #{result}")
+        :ok
+    end
+  end
+end
+```
--- a/docs/snippets/elixir/api/extract_file_async.md
+++ b/docs/snippets/elixir/api/extract_file_async.md
@@ -0,0 +1,21 @@
+```elixir title="Elixir"
+defmodule Example do
+  @moduledoc """
+  File extraction run inside a `Task`.
+  """
+
+  @doc """
+  Extracts `document.pdf` in a task and prints the outcome.
+
+  Returns `:ok` on success and `:error` on failure.
+  """
+  def extract_file_async do
+    job = fn -> Kreuzberg.extract_file_async("document.pdf", nil, nil) end
+    outcome = job |> Task.async() |> Task.await()
+
+    case outcome do
+      {:error, reason} ->
+        IO.puts("Error: #{reason}")
+        :error
+
+      {:ok, result} ->
+        IO.puts("Content: #{result}")
+        :ok
+    end
+  end
+end
+```
--- a/docs/snippets/elixir/api/extract_file_sync.md
+++ b/docs/snippets/elixir/api/extract_file_sync.md
@@ -0,0 +1,17 @@
+```elixir title="Elixir"
+defmodule Example do
+  @moduledoc """
+  Synchronous file extraction.
+  """
+
+  @doc """
+  Extracts `document.pdf` and prints the outcome.
+
+  Returns `:ok` on success and `:error` on failure.
+  """
+  def extract_file do
+    outcome = Kreuzberg.extract_file_sync("document.pdf", nil, nil)
+
+    case outcome do
+      {:error, reason} ->
+        IO.puts("Error: #{reason}")
+        :error
+
+      {:ok, result} ->
+        IO.puts("Content: #{result}")
+        :ok
+    end
+  end
+end
+```
--- a/docs/snippets/elixir/cache/disk_cache.exs
+++ b/docs/snippets/elixir/cache/disk_cache.exs
@@ -0,0 +1,435 @@
+```elixir title="Elixir"
+# Disk Caching - Implement persistent disk caching for extraction results
+# Demonstrates advanced caching strategies for document extraction
+
+defmodule KreuzbergDiskCache do
+  @moduledoc """
+  Disk-based caching layer for Kreuzberg extraction results.
+
+  Provides persistent caching of extraction results with features like:
+  - TTL-based cache expiration
+  - Compression for large results
+  - Cache statistics and management
+  - Multi-tiered caching (memory + disk)
+  """
+
+  require Logger
+
+  defmodule CacheEntry do
+    @moduledoc """
+    Represents a cached extraction result.
+
+    Timestamps are wall-clock Unix seconds from `System.system_time/1`.
+    Entries are serialized to disk and may be read back by a later VM run;
+    `System.monotonic_time/1` values are only meaningful within a single run,
+    so they cannot be used for a persisted TTL.
+    """
+
+    defstruct [
+      :key,
+      :result,
+      :created_at,
+      :accessed_at,
+      :ttl_seconds,
+      :size_bytes,
+      :compressed
+    ]
+
+    @doc """
+    Create a new cache entry.
+
+    `ttl_seconds` defaults to one day. `size_bytes` is the byte size of the
+    result's `:content` when that is a binary, otherwise `0`.
+    """
+    def new(key, result, ttl_seconds \\ 86400) do
+      now = System.system_time(:second)
+
+      %__MODULE__{
+        key: key,
+        result: result,
+        created_at: now,
+        accessed_at: now,
+        ttl_seconds: ttl_seconds,
+        size_bytes: calculate_size(result),
+        compressed: false
+      }
+    end
+
+    @doc """
+    Check if entry has expired, i.e. more than `ttl_seconds` have passed since
+    it was created.
+    """
+    def expired?(%__MODULE__{created_at: created_at, ttl_seconds: ttl}) do
+      System.system_time(:second) - created_at > ttl
+    end
+
+    @doc """
+    Update access time. Returns the updated entry.
+    """
+    def touch(%__MODULE__{} = entry) do
+      %{entry | accessed_at: System.system_time(:second)}
+    end
+
+    # Results without binary content (including `content: nil`) count as 0
+    # bytes instead of crashing in byte_size/1.
+    defp calculate_size(%{content: content}) when is_binary(content), do: byte_size(content)
+    defp calculate_size(_result), do: 0
+  end
+
+  defmodule Cache do
+    @moduledoc """
+    Main disk cache implementation.
+    """
+
+    defstruct [
+      :cache_dir,
+      :max_size_bytes,
+      :ttl_seconds,
+      :compression_enabled,
+      :memory_cache
+    ]
+
+    @doc """
+    Builds a cache rooted at `cache_dir`, creating the directory if needed.
+
+    Options: `:max_size_bytes` (default `1_000_000_000`), `:ttl_seconds`
+    (default `604_800`) and `:compression_enabled` (default `true`).
+    """
+    def new(cache_dir, opts \\ []) do
+      File.mkdir_p!(cache_dir)
+
+      max_size = Keyword.get(opts, :max_size_bytes, 1_000_000_000)
+      ttl = Keyword.get(opts, :ttl_seconds, 604_800)
+      compress? = Keyword.get(opts, :compression_enabled, true)
+
+      %Cache{
+        cache_dir: cache_dir,
+        max_size_bytes: max_size,
+        ttl_seconds: ttl,
+        compression_enabled: compress?,
+        memory_cache: %{}
+      }
+    end
+
+    @doc """
+    Looks up `key`, trying the in-memory map before the disk.
+
+    Returns `{:hit, result}` or `:miss`. An expired in-memory entry is
+    reported as a miss without consulting the disk.
+    """
+    def get(cache, key) do
+      case Map.get(cache.memory_cache, key) do
+        nil ->
+          get_from_disk(cache, key)
+
+        %CacheEntry{} = entry ->
+          expired = CacheEntry.expired?(entry)
+
+          if expired do
+            Logger.debug("Cache hit (memory) - expired: #{key}")
+            :miss
+          else
+            Logger.debug("Cache hit (memory): #{key}")
+            {:hit, entry.result}
+          end
+      end
+    end
+
+    @doc """
+    Store result in cache, both in memory and on disk.
+
+    Returns the updated cache. If the in-memory size now exceeds
+    `max_size_bytes`, the returned cache has already had its least recently
+    used entries evicted.
+    """
+    def put(cache, key, result) do
+      entry = CacheEntry.new(key, result, cache.ttl_seconds)
+
+      # Store on disk
+      store_on_disk(cache, key, entry)
+
+      # Store in memory, then keep the cache returned by the cleanup: dropping
+      # it would leave evicted entries in the struct handed back to the caller
+      # even though their files are already gone.
+      updated = maybe_cleanup(%{cache | memory_cache: Map.put(cache.memory_cache, key, entry)})
+
+      Logger.info("Cache stored: #{key}")
+      updated
+    end
+
+    @doc """
+    Removes `key` from the in-memory map and deletes its cache file, if any.
+
+    Returns the updated cache.
+    """
+    def delete(cache, key) do
+      path = cache_path(cache, key)
+
+      if File.exists?(path) do
+        File.rm(path)
+      end
+
+      Logger.info("Cache deleted: #{key}")
+
+      remaining = Map.delete(cache.memory_cache, key)
+      %{cache | memory_cache: remaining}
+    end
+
+    @doc """
+    Drops every entry: the cache directory is removed and recreated, and the
+    in-memory map is emptied. Returns the emptied cache.
+    """
+    def clear(cache) do
+      dir = cache.cache_dir
+
+      # Wipe the directory, then put an empty one back in its place
+      File.rm_rf!(dir)
+      File.mkdir_p!(dir)
+
+      Logger.info("Cache cleared")
+      %{cache | memory_cache: %{}}
+    end
+
+    @doc """
+    Summarises the cache as a map.
+
+    Entry counts and sizes describe the in-memory map; `disk_entries` is the
+    number of files in the cache directory (`0` if it cannot be listed).
+    """
+    def stats(cache) do
+      bytes_used = calculate_total_size(cache)
+      in_memory = map_size(cache.memory_cache)
+
+      on_disk =
+        case File.ls(cache.cache_dir) do
+          {:error, _} -> 0
+          {:ok, files} -> length(files)
+        end
+
+      usage = Float.round(bytes_used / cache.max_size_bytes * 100, 2)
+
+      %{
+        total_entries: in_memory,
+        memory_entries: in_memory,
+        disk_entries: on_disk,
+        total_size_bytes: bytes_used,
+        max_size_bytes: cache.max_size_bytes,
+        usage_percent: usage,
+        compression_enabled: cache.compression_enabled
+      }
+    end
+
+    # Private helpers
+
+    # Looks `key` up on disk. Returns `{:hit, result}` or `:miss`; expired
+    # entries are deleted, unreadable or undecodable files are logged and
+    # treated as a miss.
+    defp get_from_disk(cache, key) do
+      cache_file = cache_path(cache, key)
+
+      # Read directly instead of checking File.exists?/1 first: the file can
+      # vanish between the check and the read, and :enoent already says so.
+      case File.read(cache_file) do
+        {:ok, data} ->
+          case deserialize(data, cache.compression_enabled) do
+            {:ok, %CacheEntry{} = entry} ->
+              if CacheEntry.expired?(entry) do
+                File.rm(cache_file)
+                Logger.debug("Cache hit (disk) - expired: #{key}")
+                :miss
+              else
+                Logger.debug("Cache hit (disk): #{key}")
+                {:hit, entry.result}
+              end
+
+            {:ok, other} ->
+              # A file that decodes to something else is not one of ours
+              Logger.warning("Failed to deserialize cache: #{inspect(other)}")
+              :miss
+
+            {:error, reason} ->
+              Logger.warning("Failed to deserialize cache: #{inspect(reason)}")
+              :miss
+          end
+
+        {:error, :enoent} ->
+          :miss
+
+        {:error, reason} ->
+          Logger.warning("Failed to read cache file: #{inspect(reason)}")
+          :miss
+      end
+    end
+
+    # Serializes `entry` and writes it to the file for `key`; raises if the
+    # write fails.
+    defp store_on_disk(cache, key, entry) do
+      payload = serialize(entry, cache.compression_enabled)
+
+      cache
+      |> cache_path(key)
+      |> File.write!(payload)
+    end
+
+    # File that holds the entry for `key` inside the cache directory.
+    defp cache_path(cache, key) do
+      file_name = "#{key}.cache"
+      Path.join(cache.cache_dir, file_name)
+    end
+
+    # Encodes `entry` in the external term format, zlib-compressed when
+    # compression is enabled.
+    defp serialize(entry, compression_enabled) do
+      binary = :erlang.term_to_binary(entry)
+      if compression_enabled, do: :zlib.compress(binary), else: binary
+    end
+
+    # Inverse of serialize/2. Returns `{:ok, term}`, or `{:error, exception}`
+    # when decompression or decoding raises.
+    #
+    # NOTE(review): :erlang.binary_to_term/1 is not safe on untrusted input;
+    # if the cache directory can be written by others, consider
+    # :erlang.binary_to_term(bin, [:safe]).
+    defp deserialize(data, compression_enabled) do
+      raw = if compression_enabled, do: :zlib.uncompress(data), else: data
+      {:ok, :erlang.binary_to_term(raw)}
+    rescue
+      e -> {:error, e}
+    end
+
+    # Sum of `size_bytes` over every in-memory entry.
+    defp calculate_total_size(cache) do
+      Enum.reduce(cache.memory_cache, 0, fn {_key, entry}, total ->
+        total + entry.size_bytes
+      end)
+    end
+
+    # Runs the LRU cleanup when the in-memory size exceeds the limit;
+    # otherwise returns the cache untouched.
+    defp maybe_cleanup(cache) do
+      used = calculate_total_size(cache)
+      over_limit? = used > cache.max_size_bytes
+
+      if over_limit? do
+        Logger.info("Cache size (#{used}) exceeds limit, starting cleanup")
+        cleanup_lru(cache)
+      else
+        cache
+      end
+    end
+
+    # Evicts entries, least recently accessed first, until the in-memory size
+    # is at most half of `max_size_bytes`. Returns the trimmed cache.
+    defp cleanup_lru(cache) do
+      limit = div(cache.max_size_bytes, 2)
+      starting_size = calculate_total_size(cache)
+
+      oldest_first =
+        Enum.sort_by(cache.memory_cache, fn {_key, entry} -> entry.accessed_at end)
+
+      {trimmed, _remaining} =
+        Enum.reduce_while(oldest_first, {cache, starting_size}, fn
+          _pair, {_kept, bytes} = done when bytes <= limit ->
+            {:halt, done}
+
+          {key, entry}, {kept, bytes} ->
+            {:cont, {delete(kept, key), bytes - entry.size_bytes}}
+        end)
+
+      trimmed
+    end
+  end
+
+  @doc """
+  Extracts `file_path`, serving the result from the cache when possible.
+
+  A fresh cache struct is built for `cache_dir` on every call. Returns
+  `{:ok, result, cache}` on a hit or a successful extraction, and
+  `{error, cache}` when extraction fails, where `error` is whatever the
+  extractor returned.
+  """
+  def extract_with_cache(file_path, cache_dir, opts \\ []) do
+    cache = Cache.new(cache_dir, opts)
+    cache_key = compute_cache_key(file_path, opts)
+    lookup = Cache.get(cache, cache_key)
+
+    case lookup do
+      :miss ->
+        Logger.info("Cache miss: #{file_path}")
+        extraction = Kreuzberg.extract_file(file_path)
+
+        case extraction do
+          {:ok, result} -> {:ok, result, Cache.put(cache, cache_key, result)}
+          error -> {error, cache}
+        end
+
+      {:hit, result} ->
+        {:ok, result, cache}
+    end
+  end
+
+  @doc """
+  Extract multiple files with batch caching.
+
+  Returns `{results, stats}`. `results` holds one `{:ok, path, result}` or
+  `{:error, path, reason}` tuple per input path, in input order; `stats` is
+  `Cache.stats/1` over the entries gathered during this batch.
+  """
+  def batch_extract_with_cache(file_paths, cache_dir, opts \\ []) do
+    # extract_with_cache/3 builds its own cache struct on every call, so fold
+    # the in-memory entries it returns into a single accumulator. Without
+    # this the stats below would always describe an empty memory cache.
+    {results, cache} =
+      Enum.map_reduce(file_paths, Cache.new(cache_dir, opts), fn path, acc ->
+        case extract_with_cache(path, cache_dir, opts) do
+          {:ok, result, used} ->
+            merged = Map.merge(acc.memory_cache, used.memory_cache)
+            {{:ok, path, result}, %{acc | memory_cache: merged}}
+
+          {{:error, reason}, _} ->
+            {{:error, path, reason}, acc}
+        end
+      end)
+
+    {results, Cache.stats(cache)}
+  end
+
+  @doc """
+  Manage cache - get stats, clear, etc.
+
+  Supported actions:
+
+    * `:stats` - returns the `Cache.stats/1` map
+    * `:clear` - empties the cache and returns it
+    * `:list` - returns the file names in `cache_dir`, or `{:error, reason}`
+    * `{:delete, key}` - removes one entry and returns the cache
+
+  Any other action returns `{:error, message}`.
+  """
+  def manage_cache(cache_dir, action, opts \\ []) do
+    cache = Cache.new(cache_dir, opts)
+
+    case action do
+      :stats ->
+        Cache.stats(cache)
+
+      :clear ->
+        Cache.clear(cache)
+
+      :list ->
+        case File.ls(cache_dir) do
+          {:ok, files} -> files
+          {:error, reason} -> {:error, reason}
+        end
+
+      {:delete, key} ->
+        Cache.delete(cache, key)
+
+      other ->
+        # inspect/1 rather than plain interpolation: an unknown action may be
+        # a tuple or another term without a String.Chars implementation,
+        # which would raise instead of returning this error.
+        {:error, "Unknown action: #{inspect(other)}"}
+    end
+  end
+
+  # Private helpers
+
+  # Lower-case hex SHA-256 of the file path and the inspected options, so the
+  # same file extracted with different options gets a different key.
+  defp compute_cache_key(file_path, opts) do
+    fingerprint = "#{file_path}|#{inspect(opts)}"
+    digest = :crypto.hash(:sha256, fingerprint)
+    Base.encode16(digest, case: :lower)
+  end
+end
+
+# Usage examples
+# Everything below runs at script load time and exercises the public
+# functions of KreuzbergDiskCache against a cache directory under /tmp.
+IO.puts("=== Kreuzberg Disk Cache ===\n")
+
+cache_dir = "/tmp/kreuzberg_cache"
+
+# Example 1: Single file extraction with caching
+IO.puts("Example 1: Single file extraction with caching")
+IO.puts("-" <> String.duplicate("-", 40) <> "\n")
+
+case KreuzbergDiskCache.extract_with_cache("document.pdf", cache_dir) do
+  # Success is a 3-tuple; the returned cache struct is not used further here
+  {:ok, result, cache} ->
+    IO.puts("Extraction successful!")
+    IO.puts("Content size: #{byte_size(result.content)} bytes")
+
+    # manage_cache/3 builds a fresh cache struct, so the entry and size
+    # figures below come from an empty in-memory map
+    stats = KreuzbergDiskCache.manage_cache(cache_dir, :stats)
+    IO.puts("\nCache Statistics:")
+    IO.puts("  Entries: #{stats.total_entries}")
+    IO.puts("  Size: #{stats.total_size_bytes} bytes")
+    IO.puts("  Usage: #{stats.usage_percent}%\n")
+
+  # Failure is a 2-tuple of the extractor's error and the cache
+  {error, _cache} ->
+    IO.puts("Extraction failed: #{inspect(error)}\n")
+end
+
+# Example 2: Batch extraction with cache statistics
+IO.puts("Example 2: Batch extraction with caching")
+IO.puts("-" <> String.duplicate("-", 40) <> "\n")
+
+documents = ["doc1.pdf", "doc2.pdf", "doc3.pdf"]
+
+{results, stats} = KreuzbergDiskCache.batch_extract_with_cache(documents, cache_dir)
+
+# Successful items are {:ok, path, result}; failures are {:error, path, reason}
+successful = Enum.count(results, &match?({:ok, _, _}, &1))
+IO.puts("Batch results:")
+IO.puts("  Processed: #{length(documents)}")
+IO.puts("  Successful: #{successful}")
+IO.puts("\nCache Statistics:")
+IO.puts("  Total entries: #{stats.total_entries}")
+IO.puts("  Memory entries: #{stats.memory_entries}")
+IO.puts("  Disk entries: #{stats.disk_entries}")
+IO.puts("  Total size: #{stats.total_size_bytes} bytes")
+IO.puts("  Usage: #{stats.usage_percent}%\n")
+
+# Example 3: Cache management
+IO.puts("Example 3: Cache management")
+IO.puts("-" <> String.duplicate("-", 40) <> "\n")
+
+# NOTE(review): :list returns {:error, reason} when the directory cannot be
+# listed, which Enum.each/2 below would not accept
+cached_files = KreuzbergDiskCache.manage_cache(cache_dir, :list)
+IO.puts("Cached files:")
+Enum.each(cached_files, fn file -> IO.puts("  - #{file}") end)
+
+IO.puts("\nCache stats:")
+stats = KreuzbergDiskCache.manage_cache(cache_dir, :stats)
+IO.inspect(stats, pretty: true)
+```
--- a/docs/snippets/elixir/cli/basic_cli.exs
+++ b/docs/snippets/elixir/cli/basic_cli.exs
@@ -0,0 +1,306 @@
+```elixir title="Elixir"
+# Basic CLI Tool - Simple command-line interface for Kreuzberg
+# Demonstrates how to build a CLI for document extraction
+
+defmodule KreuzbergCLI do
+  @moduledoc """
+  Command-line interface for Kreuzberg document extraction.
+
+  Provides a simple, user-friendly CLI for extracting documents
+  from the command line with progress feedback.
+  """
+
+  require Logger
+
+  @doc """
+  Entry point of the CLI.
+
+  Turns the raw argument list into a command and runs it. Returns `:ok` or
+  `:error`.
+  """
+  def main(args) do
+    command = parse_args(args)
+    execute(command)
+  end
+
+  # Maps the raw argument list onto a command tuple:
+  # `{:extract, file, opts}`, `{:batch, dir, opts}`, `{:help}` or
+  # `{:error, reason}`.
+  defp parse_args([]), do: {:error, :no_command}
+
+  defp parse_args(["extract", file | rest]), do: {:extract, file, parse_options(rest, %{})}
+
+  defp parse_args(["batch", dir | rest]), do: {:batch, dir, parse_options(rest, %{})}
+
+  defp parse_args([flag]) when flag in ["help", "-h", "--help"], do: {:help}
+
+  defp parse_args([cmd]), do: {:error, "Unknown command: #{cmd}"}
+
+  defp parse_args(_args), do: {:error, :invalid_args}
+
+  # Folds the option arguments into a map. Recognised flags:
+  #   -v / --verbose       -> verbose: true
+  #   --mime-type <type>   -> mime_type: type
+  #   --output <path>      -> output: path
+  #   --ocr                -> enable_ocr: true
+  #   --chunks             -> enable_chunks: true
+  # Anything else is skipped.
+  defp parse_options([], acc), do: acc
+
+  # The help text advertises both spellings, so accept both; the long form
+  # used to fall through to the catch-all clause and was silently dropped.
+  defp parse_options([flag | rest], acc) when flag in ["-v", "--verbose"] do
+    parse_options(rest, Map.put(acc, :verbose, true))
+  end
+
+  defp parse_options(["--mime-type", mime | rest], acc) do
+    parse_options(rest, Map.put(acc, :mime_type, mime))
+  end
+
+  defp parse_options(["--output", path | rest], acc) do
+    parse_options(rest, Map.put(acc, :output, path))
+  end
+
+  defp parse_options(["--ocr" | rest], acc) do
+    parse_options(rest, Map.put(acc, :enable_ocr, true))
+  end
+
+  defp parse_options(["--chunks" | rest], acc) do
+    parse_options(rest, Map.put(acc, :enable_chunks, true))
+  end
+
+  # Unknown argument: ignore it and keep going
+  defp parse_options([_ | rest], acc) do
+    parse_options(rest, acc)
+  end
+
+  # Runs a parsed command and returns `:ok` or `:error`.
+  defp execute({:help}) do
+    print_help()
+    :ok
+  end
+
+  defp execute({:extract, file, opts}) do
+    if File.exists?(file) do
+      extract_file(file, opts)
+    else
+      IO.puts(:stderr, "Error: File not found: #{file}")
+      :error
+    end
+  end
+
+  defp execute({:batch, dir, opts}) do
+    if File.dir?(dir) do
+      batch_extract(dir, opts)
+    else
+      IO.puts(:stderr, "Error: Directory not found: #{dir}")
+      :error
+    end
+  end
+
+  # Parse failures: report the reason, show the usage text, signal failure
+  defp execute({:error, reason}) do
+    message = "Error: #{inspect(reason)}"
+    IO.puts(:stderr, message)
+    print_help()
+    :error
+  end
+
+  # Extracts one file, prints a report with the elapsed time and, when an
+  # output path was given, saves the result as JSON. Returns `:ok` or `:error`.
+  defp extract_file(file_path, opts) do
+    config = build_config(opts)
+
+    IO.puts("Extracting: #{file_path}")
+    started_at = System.monotonic_time(:millisecond)
+    outcome = Kreuzberg.extract_file(file_path, Map.get(opts, :mime_type, nil), config)
+
+    case outcome do
+      {:error, reason} ->
+        IO.puts(:stderr, "Extraction failed: #{inspect(reason)}")
+        :error
+
+      {:ok, result} ->
+        took_ms = System.monotonic_time(:millisecond) - started_at
+        print_extraction_result(result, took_ms, Map.get(opts, :verbose, false))
+
+        destination = Map.get(opts, :output, nil)
+        if destination, do: save_result(result, destination)
+
+        :ok
+    end
+  end
+
+  # Extracts every supported document found (recursively) under `dir`,
+  # printing a progress line per file and a summary at the end.
+  # Always returns `:ok`; per-file failures are reported, not propagated.
+  defp batch_extract(dir, opts) do
+    verbose = Map.get(opts, :verbose, false)
+    config = build_config(opts)
+
+    pattern = Path.join(dir, "**/*.{pdf,docx,xlsx,txt,html,md}")
+    files = Path.wildcard(pattern)
+
+    case files do
+      [] ->
+        IO.puts("No documents found in #{dir}")
+        :ok
+
+      _ ->
+        # Count once: length/1 walks the whole list, and the progress prefix
+        # below is printed for every file.
+        total = length(files)
+        IO.puts("Found #{total} documents\n")
+
+        results =
+          files
+          |> Enum.with_index(1)
+          |> Enum.map(fn {file, idx} ->
+            IO.write("  [#{idx}/#{total}] ")
+            start_time = System.monotonic_time(:millisecond)
+
+            case Kreuzberg.extract_file(file, nil, config) do
+              {:ok, result} ->
+                elapsed = System.monotonic_time(:millisecond) - start_time
+                IO.puts("#{Path.basename(file)} (#{elapsed}ms)")
+                {:ok, file, result, elapsed}
+
+              {:error, reason} ->
+                IO.puts("#{Path.basename(file)} - ERROR")
+                if verbose, do: IO.puts("  Error: #{inspect(reason)}")
+                {:error, file, reason}
+            end
+          end)
+
+        print_batch_summary(results)
+        :ok
+    end
+  end
+
+  # Builds the extraction config from the parsed CLI options. OCR and
+  # chunking stay `nil` unless their flags were given; caching is always on.
+  defp build_config(opts) do
+    ocr =
+      if Map.get(opts, :enable_ocr) do
+        %{"enabled" => true, "backend" => "tesseract"}
+      else
+        nil
+      end
+
+    chunking =
+      if Map.get(opts, :enable_chunks) do
+        %{"enabled" => true, "max_characters" => 1000, "overlap" => 100}
+      else
+        nil
+      end
+
+    %Kreuzberg.ExtractionConfig{ocr: ocr, chunking: chunking, use_cache: true}
+  end
+
+  # Prints a report for one extraction. Optional sections (metadata, tables,
+  # images, chunks, languages) appear only when present and non-empty; the
+  # full metadata is dumped when `verbose` is set.
+  defp print_extraction_result(result, elapsed_ms, verbose) do
+    # Prints "  <label>: <count>" for a non-nil, non-empty collection
+    print_count = fn label, items ->
+      if items && !Enum.empty?(items) do
+        IO.puts("  #{label}: #{length(items)}")
+      end
+    end
+
+    IO.puts("\nExtraction Results:")
+    IO.puts("  Content size: #{byte_size(result.content)} bytes")
+    IO.puts("  MIME type: #{result.mime_type}")
+    IO.puts("  Processing time: #{elapsed_ms}ms")
+
+    if result.metadata, do: IO.puts("  Metadata keys: #{Enum.count(result.metadata)}")
+
+    print_count.("Tables found", result.tables)
+    print_count.("Images found", result.images)
+    print_count.("Chunks created", result.chunks)
+
+    languages = result.detected_languages
+
+    if languages && !Enum.empty?(languages) do
+      IO.puts("  Languages: #{Enum.join(languages, ", ")}")
+    end
+
+    if verbose do
+      IO.puts("\n  Full metadata:")
+      IO.inspect(result.metadata, pretty: true)
+    end
+
+    IO.puts("")
+  end
+
+  # Prints totals for a batch run. `results` holds `{:ok, file, result, ms}`
+  # and `{:error, file, reason}` tuples; timing covers successful files only.
+  defp print_batch_summary(results) do
+    durations = for {:ok, _, _, ms} <- results, do: ms
+
+    succeeded = length(durations)
+    failed = Enum.count(results, &match?({:error, _, _}, &1))
+    total_time = Enum.sum(durations)
+    # max/2 keeps the divisor at 1 when nothing succeeded
+    average = div(total_time, max(succeeded, 1))
+
+    IO.puts("\nBatch Summary:")
+    IO.puts("  Total: #{length(results)}")
+    IO.puts("  Successful: #{succeeded}")
+    IO.puts("  Failed: #{failed}")
+    IO.puts("  Total time: #{total_time}ms")
+    IO.puts("  Average time: #{average}ms/document")
+  end
+
+  # Writes the result to `output_path` as pretty-printed JSON, stamped with
+  # the current UTC time. Missing collections are written as empty lists.
+  # Reports success or failure on the console.
+  defp save_result(result, output_path) do
+    json =
+      Jason.encode!(
+        %{
+          content: result.content,
+          mime_type: result.mime_type,
+          metadata: result.metadata,
+          tables: result.tables || [],
+          images: result.images || [],
+          chunks: result.chunks || [],
+          detected_languages: result.detected_languages || [],
+          extracted_at: DateTime.utc_now()
+        },
+        pretty: true
+      )
+
+    written = File.write(output_path, json)
+
+    case written do
+      {:error, reason} ->
+        IO.puts(:stderr, "Failed to save results: #{inspect(reason)}")
+
+      :ok ->
+        IO.puts("Results saved to: #{output_path}")
+    end
+  end
+
+  # Prints the usage text to standard output.
+  # NOTE(review): the last example passes --output to `batch`, but only the
+  # single-file `extract` path saves results - confirm whether that is intended.
+  defp print_help do
+    IO.puts("""
+    Kreuzberg CLI - Document Extraction Tool
+
+    USAGE:
+      kreuzberg extract <file> [OPTIONS]
+      kreuzberg batch <directory> [OPTIONS]
+      kreuzberg help
+
+    COMMANDS:
+      extract <file>        Extract content from a single document
+      batch <directory>     Extract all documents in a directory
+      help                  Show this help message
+
+    OPTIONS:
+      -v, --verbose         Show detailed output
+      --mime-type <type>    Specify MIME type (e.g., application/pdf)
+      --output <path>       Save results to JSON file
+      --ocr                 Enable OCR for scanned documents
+      --chunks              Enable document chunking for RAG
+
+    EXAMPLES:
+      kreuzberg extract document.pdf
+      kreuzberg extract document.pdf --output results.json
+      kreuzberg batch ./documents --ocr --chunks
+      kreuzberg batch ./documents -v --output summary.json
+    """)
+  end
+end
+
+# Entry point when run as a script, e.g. `elixir basic_cli.exs extract document.pdf`.
+# `def` is only valid inside a module, so call `KreuzbergCLI.main/1` directly
+# and turn its `:ok | :error` result into the process exit status.
+exit_status =
+  case KreuzbergCLI.main(System.argv()) do
+    :ok -> 0
+    :error -> 1
+  end
+
+System.halt(exit_status)
+```
--- a/docs/snippets/elixir/cli/cli_with_config.exs
+++ b/docs/snippets/elixir/cli/cli_with_config.exs
@@ -0,0 +1,455 @@
+```elixir title="Elixir"
+# CLI with Configuration - Advanced CLI with config file support
+# Demonstrates managing complex extraction configurations via CLI
+
+defmodule KreuzbergAdvancedCLI do
+  @moduledoc """
+  Advanced CLI with configuration file support for Kreuzberg.
+
+  Allows users to define extraction configurations in YAML/JSON files,
+  specify preprocessing options, and manage extraction profiles.
+  """
+
+  require Logger
+
+  defmodule ConfigFile do
+    @moduledoc """
+    Configuration file parser and manager.
+
+    Reads YAML (`.yaml`, `.yml`) and JSON (`.json`) files and normalizes both
+    into the same shape: a struct whose `profiles` field is a string-keyed map.
+    """
+
+    defstruct [
+      :profiles,
+      :default_profile,
+      :cache_enabled,
+      :cache_dir
+    ]
+
+    @doc """
+    Load configuration from YAML/JSON file.
+
+    Returns `{:ok, config_file}` on success, or `{:error, message}` when the
+    file is missing, has an unsupported extension, cannot be read, or does not
+    parse into a top-level mapping.
+    """
+    def load(config_path) do
+      if File.exists?(config_path) do
+        case Path.extname(config_path) do
+          ext when ext in [".yaml", ".yml"] -> load_yaml(config_path)
+          ".json" -> load_json(config_path)
+          ext -> {:error, "Unsupported config format: #{ext}"}
+        end
+      else
+        {:error, "Config file not found: #{config_path}"}
+      end
+    end
+
+    defp load_yaml(path) do
+      with {:ok, content} <- read_config(path),
+           {:ok, document} <- decode_yaml(content) do
+        parse_config(document)
+      end
+    end
+
+    defp load_json(path) do
+      with {:ok, content} <- read_config(path) do
+        case Jason.decode(content) do
+          {:ok, config} -> parse_config(config)
+          error -> {:error, "Failed to parse JSON: #{inspect(error)}"}
+        end
+      end
+    end
+
+    defp read_config(path) do
+      case File.read(path) do
+        {:ok, content} -> {:ok, content}
+        {:error, reason} -> {:error, "Failed to read config: #{inspect(reason)}"}
+      end
+    end
+
+    # yamerl reports malformed input by throwing `{:yamerl_exception, errors}`
+    # rather than returning an error value, so the throw is converted here.
+    # `str_node_as_binary` asks yamerl for Elixir strings instead of charlists,
+    # which the string-keyed lookups in `parse_config/1` depend on.
+    defp decode_yaml(content) do
+      case :yamerl_constr.string(content, str_node_as_binary: true) do
+        [document] -> {:ok, normalize_yaml(document)}
+        other -> {:error, "Failed to parse YAML: #{inspect(other)}"}
+      end
+    catch
+      :throw, {:yamerl_exception, errors} ->
+        {:error, "Failed to parse YAML: #{inspect(errors)}"}
+    end
+
+    # yamerl represents mappings as lists of `{key, value}` tuples. Convert them
+    # (recursively) into string-keyed maps so YAML and JSON configs look alike.
+    defp normalize_yaml([{_key, _value} | _rest] = pairs) do
+      if Enum.all?(pairs, &match?({_key, _value}, &1)) do
+        Map.new(pairs, fn {key, value} -> {yaml_key(key), normalize_yaml(value)} end)
+      else
+        Enum.map(pairs, &normalize_yaml/1)
+      end
+    end
+
+    defp normalize_yaml(items) when is_list(items), do: Enum.map(items, &normalize_yaml/1)
+    defp normalize_yaml(scalar), do: scalar
+
+    defp yaml_key(key) when is_binary(key), do: key
+    defp yaml_key(key) when is_atom(key) or is_number(key), do: to_string(key)
+    defp yaml_key(key), do: inspect(key)
+
+    defp parse_config(raw_config) when is_map(raw_config) do
+      profiles =
+        case Map.get(raw_config, "profiles") do
+          # A missing section, or an empty YAML mapping (which yamerl returns as
+          # an empty list), both mean "no profiles".
+          empty when empty in [nil, []] -> %{}
+          other -> other
+        end
+
+      if is_map(profiles) do
+        {:ok,
+         %__MODULE__{
+           profiles: profiles,
+           default_profile: Map.get(raw_config, "default_profile", "default"),
+           cache_enabled: Map.get(raw_config, "cache_enabled", true),
+           cache_dir: Map.get(raw_config, "cache_dir", "/tmp/kreuzberg_cache")
+         }}
+      else
+        {:error, "Invalid profiles section: #{inspect(profiles)}"}
+      end
+    end
+
+    defp parse_config(other) do
+      {:error, "Config root must be a mapping, got: #{inspect(other)}"}
+    end
+
+    @doc """
+    Get extraction configuration for a profile.
+
+    A `nil` `profile_name` selects the config file's `default_profile`.
+    Returns `{:ok, profile}` or `{:error, message}` when no such profile exists.
+    """
+    def get_profile(config_file, profile_name) do
+      profile_name = profile_name || config_file.default_profile
+
+      case Map.get(config_file.profiles, profile_name) do
+        nil -> {:error, "Profile not found: #{profile_name}"}
+        profile -> {:ok, profile}
+      end
+    end
+
+    @doc """
+    List all available profiles.
+    """
+    def list_profiles(config_file) do
+      Map.keys(config_file.profiles)
+    end
+  end
+
+  defmodule Extractor do
+    @moduledoc """
+    Main extraction engine with profile support.
+    """
+
+    @doc """
+    Extracts `file_path` using the profile `profile_name` from `config_file`.
+
+    Supported option: `:verbose` (boolean, default `false`). Returns
+    `{:ok, result}` or `{:error, reason}`.
+    """
+    def extract_with_profile(file_path, config_file, profile_name, opts \\ []) do
+      verbose = Keyword.get(opts, :verbose, false)
+
+      with {:ok, profile} <- ConfigFile.get_profile(config_file, profile_name) do
+        extract_with_config(file_path, profile, config_file, verbose)
+      end
+    end
+
+    defp extract_with_config(file_path, profile, config_file, verbose) do
+      if File.exists?(file_path) do
+        # Any truthy value enables the cache, matching the banner printed below.
+        use_cache = config_file.cache_enabled not in [nil, false]
+        cache_dir = config_file.cache_dir
+
+        # Build extraction config from profile, honoring the cache switch
+        # instead of always enabling the cache.
+        extraction_config = build_extraction_config(profile, use_cache)
+
+        IO.puts("Profile: #{profile["name"]}")
+        IO.puts("File: #{file_path}")
+        IO.puts("Cache: #{if use_cache, do: "enabled (#{cache_dir})", else: "disabled"}")
+        IO.puts("")
+
+        # Preprocess if configured
+        processed_file = preprocess_if_needed(file_path, profile, verbose)
+
+        start_time = System.monotonic_time(:millisecond)
+
+        case Kreuzberg.extract_file(processed_file, nil, extraction_config) do
+          {:ok, result} ->
+            elapsed = System.monotonic_time(:millisecond) - start_time
+
+            # Post-process if configured
+            final_result = postprocess_if_needed(result, profile)
+
+            print_results(final_result, elapsed, verbose)
+            cleanup_temp_files(processed_file, file_path)
+            {:ok, final_result}
+
+          {:error, reason} ->
+            cleanup_temp_files(processed_file, file_path)
+            {:error, reason}
+        end
+      else
+        {:error, "File not found: #{file_path}"}
+      end
+    end
+
+    defp build_extraction_config(profile, use_cache) do
+      %Kreuzberg.ExtractionConfig{
+        ocr: profile["ocr"],
+        chunking: profile["chunking"],
+        quality_processing: profile["quality_processing"],
+        language_detection: profile["language_detection"],
+        keyword_extraction: profile["keyword_extraction"],
+        images: profile["images"],
+        use_cache: use_cache
+      }
+    end
+
+    # Runs the profile's "preprocessing" steps in order and returns the path of
+    # the file to extract (the original path when no steps are configured).
+    defp preprocess_if_needed(file_path, profile, verbose) do
+      case profile["preprocessing"] do
+        nil ->
+          file_path
+
+        preprocessing ->
+          IO.puts("Preprocessing enabled:")
+          temp_path = "/tmp/kreuzberg_#{System.unique_integer([:positive])}"
+
+          # Apply preprocessing steps
+          Enum.reduce(preprocessing, file_path, fn step, path ->
+            apply_preprocessing_step(step, path, temp_path, verbose)
+          end)
+      end
+    end
+
+    # Placeholder implementation: each recognized step only logs (when verbose)
+    # and returns the input path unchanged.
+    defp apply_preprocessing_step(step, input_path, _temp_path, verbose) do
+      case step do
+        %{"type" => "rotate", "degrees" => degrees} ->
+          if verbose, do: IO.puts("  - Rotating #{degrees} degrees")
+          input_path
+
+        %{"type" => "normalize", "target_format" => format} ->
+          if verbose, do: IO.puts("  - Normalizing to #{format}")
+          input_path
+
+        %{"type" => "deskew"} ->
+          if verbose, do: IO.puts("  - Deskewing")
+          input_path
+
+        _ ->
+          input_path
+      end
+    end
+
+    defp postprocess_if_needed(result, profile) do
+      case profile["postprocessing"] do
+        nil ->
+          result
+
+        postprocessing ->
+          Enum.reduce(postprocessing, result, fn step, acc_result ->
+            apply_postprocessing_step(step, acc_result)
+          end)
+      end
+    end
+
+    defp apply_postprocessing_step(%{"type" => "filter_empty_chunks"}, result) do
+      case result.chunks do
+        nil -> result
+        chunks -> %{result | chunks: Enum.reject(chunks, &empty_chunk?/1)}
+      end
+    end
+
+    defp apply_postprocessing_step(%{"type" => "limit_tables", "max" => max_tables}, result) do
+      case result.tables do
+        nil -> result
+        tables -> %{result | tables: Enum.take(tables, max_tables)}
+      end
+    end
+
+    defp apply_postprocessing_step(_, result), do: result
+
+    # A chunk may be a bare string or a map/struct carrying its text under a
+    # `content` key; calling `byte_size/1` directly on a map would raise.
+    defp empty_chunk?(chunk) when is_binary(chunk), do: chunk == ""
+    defp empty_chunk?(%{"content" => content}), do: content in [nil, ""]
+    defp empty_chunk?(%{content: content}), do: content in [nil, ""]
+    defp empty_chunk?(_chunk), do: false
+
+    # Removes the preprocessed file only when it differs from the original and
+    # lives under /tmp/.
+    defp cleanup_temp_files(processed_path, original_path) do
+      if processed_path != original_path && String.starts_with?(processed_path, "/tmp/") do
+        File.rm(processed_path)
+      end
+    end
+
+    defp print_results(result, elapsed_ms, verbose) do
+      IO.puts("Results:")
+      IO.puts("  Content size: #{byte_size(result.content)} bytes")
+      IO.puts("  Mime type: #{result.mime_type}")
+      IO.puts("  Processing time: #{elapsed_ms}ms")
+
+      if result.metadata do
+        IO.puts("  Metadata entries: #{Enum.count(result.metadata)}")
+      end
+
+      if result.tables && !Enum.empty?(result.tables) do
+        IO.puts("  Tables: #{length(result.tables)}")
+      end
+
+      if result.chunks && !Enum.empty?(result.chunks) do
+        IO.puts("  Chunks: #{length(result.chunks)}")
+      end
+
+      if result.detected_languages && !Enum.empty?(result.detected_languages) do
+        IO.puts("  Languages: #{Enum.join(result.detected_languages, ", ")}")
+      end
+
+      if verbose && result.metadata do
+        IO.puts("\n  Full metadata:")
+        IO.inspect(result.metadata, pretty: true, limit: :infinity)
+      end
+
+      IO.puts("")
+    end
+  end
+
+  @doc """
+  Main CLI entry point.
+
+  Parses `args` and dispatches to the requested command.
+  """
+  def main(args) do
+    parsed = parse_args(args)
+    execute(parsed)
+  end
+
+  # Splits raw argv into `{switches, positional_args}`; whatever OptionParser
+  # reports as invalid is discarded.
+  defp parse_args(args) do
+    switches = [
+      config: :string,
+      profile: :string,
+      output: :string,
+      verbose: :boolean,
+      list: :boolean
+    ]
+
+    aliases = [c: :config, p: :profile, o: :output, v: :verbose, l: :list]
+
+    {parsed, positional, _invalid} =
+      OptionParser.parse(args, switches: switches, aliases: aliases)
+
+    {parsed, positional}
+  end
+
+  # Dispatches on the first positional argument (the command name).
+  # No command at all prints the usage line and fails.
+  defp execute({_opts, []}) do
+    print_usage()
+    :error
+  end
+
+  defp execute({opts, ["extract" | rest]}), do: execute_extract(rest, opts)
+
+  defp execute({opts, ["profiles" | _rest]}), do: execute_list_profiles(opts)
+
+  defp execute({_opts, ["help" | _rest]}) do
+    print_help()
+    :ok
+  end
+
+  defp execute({_opts, [command | _rest]}) do
+    IO.puts(:stderr, "Unknown command: #{command}")
+    print_usage()
+    :error
+  end
+
+  # Handles the `extract` command: loads the config file, extracts the first
+  # positional file with the selected profile, and optionally saves the result.
+  # Returns `:ok` on success and `:error` otherwise.
+  defp execute_extract(args, opts) do
+    config_path = Keyword.get(opts, :config, "kreuzberg.yaml")
+    profile = Keyword.get(opts, :profile)
+    output_path = Keyword.get(opts, :output)
+    verbose = Keyword.get(opts, :verbose, false)
+
+    case ConfigFile.load(config_path) do
+      {:ok, config_file} ->
+        run_extract(args, config_file, profile, output_path, verbose)
+
+      {:error, reason} ->
+        IO.puts(:stderr, "Configuration error: #{format_reason(reason)}")
+        :error
+    end
+  end
+
+  defp run_extract([], _config_file, _profile, _output_path, _verbose) do
+    IO.puts(:stderr, "Error: No file specified")
+    :error
+  end
+
+  defp run_extract([file_path | _], config_file, profile, output_path, verbose) do
+    case Extractor.extract_with_profile(file_path, config_file, profile, verbose: verbose) do
+      {:ok, result} ->
+        if output_path, do: save_result(result, output_path)
+        :ok
+
+      {:error, reason} ->
+        IO.puts(:stderr, "Extraction failed: #{format_reason(reason)}")
+        :error
+    end
+  end
+
+  # Error reasons are not guaranteed to be strings; interpolating a tuple or a
+  # map directly would raise, so anything that is not a binary is inspected.
+  defp format_reason(reason) when is_binary(reason), do: reason
+  defp format_reason(reason), do: inspect(reason)
+
+  # Handles the `profiles` command: prints the profile names found in the
+  # config file given by `--config` (default "kreuzberg.yaml").
+  defp execute_list_profiles(opts) do
+    loaded =
+      opts
+      |> Keyword.get(:config, "kreuzberg.yaml")
+      |> ConfigFile.load()
+
+    case loaded do
+      {:error, reason} ->
+        IO.puts(:stderr, "Configuration error: #{reason}")
+        :error
+
+      {:ok, config_file} ->
+        names = ConfigFile.list_profiles(config_file)
+        IO.puts("Available profiles:")
+
+        for name <- names do
+          IO.puts("  - #{name}")
+        end
+
+        :ok
+    end
+  end
+
+  # Writes the extraction result to `output_path` as pretty-printed JSON.
+  #
+  # Encoding and write failures are both reported on stderr instead of raising,
+  # so a value Jason cannot encode (e.g. a struct without a `Jason.Encoder`
+  # implementation or a non-UTF-8 binary) does not crash the CLI after the
+  # extraction itself succeeded.
+  defp save_result(result, output_path) do
+    output_data = %{
+      content: result.content,
+      mime_type: result.mime_type,
+      metadata: result.metadata,
+      tables: result.tables || [],
+      chunks: result.chunks || [],
+      images: result.images || [],
+      detected_languages: result.detected_languages || [],
+      extracted_at: DateTime.utc_now()
+    }
+
+    with {:ok, json} <- Jason.encode(output_data, pretty: true),
+         :ok <- File.write(output_path, json) do
+      IO.puts("Results saved to: #{output_path}")
+    else
+      {:error, reason} ->
+        IO.puts(:stderr, "Failed to save results: #{inspect(reason)}")
+    end
+  end
+
+  # Prints the one-line usage summary to stdout.
+  defp print_usage, do: IO.puts("Usage: kreuzberg-cli <command> [options] [args]")
+
+  # Prints the full help text to stdout: usage, commands, options, and an
+  # example YAML config file with two profiles.
+  defp print_help do
+    IO.puts("""
+    Kreuzberg Advanced CLI with Configuration
+
+    USAGE:
+      kreuzberg extract <file> [OPTIONS]
+      kreuzberg profiles [OPTIONS]
+      kreuzberg help
+
+    COMMANDS:
+      extract <file>        Extract with configured profile
+      profiles              List available profiles
+      help                  Show this help message
+
+    OPTIONS:
+      -c, --config <path>   Config file path (default: kreuzberg.yaml)
+      -p, --profile <name>  Profile name (default: from config)
+      -o, --output <path>   Save results to JSON
+      -v, --verbose         Verbose output
+
+    CONFIG FILE EXAMPLE:
+      default_profile: "standard"
+      cache_enabled: true
+      cache_dir: "/tmp/kreuzberg_cache"
+
+      profiles:
+        standard:
+          name: "Standard Extraction"
+          ocr:
+            enabled: false
+          chunking:
+            enabled: true
+            max_characters: 1000
+            overlap: 100
+
+        ocr_intensive:
+          name: "OCR + Language Detection"
+          ocr:
+            enabled: true
+            backend: tesseract
+          language_detection:
+            enabled: true
+          preprocessing:
+            - type: deskew
+            - type: rotate
+              degrees: 90
+    """)
+  end
+end
+
+# Entry point
+# `exit({:shutdown, 1})` ends the script with status 1 without the crash report
+# that a bare `exit(1)` prints.
+case KreuzbergAdvancedCLI.main(System.argv()) do
+  :ok -> IO.puts("\nDone.")
+  :error -> exit({:shutdown, 1})
+end
+```
--- a/docs/snippets/elixir/config/advanced_config.md
+++ b/docs/snippets/elixir/config/advanced_config.md
@@ -0,0 +1,53 @@
+```elixir title="Elixir"
+# Build each configuration section separately, then encode the combined map.
+ocr = %{"backend" => "tesseract", "language" => "eng"}
+
+embedding = %{
+  "model" => %{"preset" => %{"name" => "balanced"}},
+  "batch_size" => 32,
+  "normalize" => true,
+  "show_download_progress" => false
+}
+
+chunking = %{"max_characters" => 1000, "overlap" => 200, "embedding" => embedding}
+
+language_detection = %{
+  "enabled" => true,
+  "min_confidence" => 0.8,
+  "detect_multiple" => false
+}
+
+keywords = %{
+  "algorithm" => "Yake",
+  "max_keywords" => 10,
+  "min_score" => 0.1,
+  "ngram_range" => [1, 3],
+  "language" => "en"
+}
+
+token_reduction = %{"mode" => "moderate", "preserve_important_words" => true}
+
+config_json =
+  Jason.encode!(%{
+    "use_cache" => true,
+    "enable_quality_processing" => true,
+    "ocr" => ocr,
+    "chunking" => chunking,
+    "language_detection" => language_detection,
+    "keywords" => keywords,
+    "token_reduction" => token_reduction,
+    "postprocessor" => %{"enabled" => true}
+  })
+
+{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
+IO.puts("Content: #{result.content}")
+
+if result.detected_languages do
+  IO.puts("Languages: #{inspect(result.detected_languages)}")
+end
+
+# A missing chunk list counts as zero chunks.
+chunks_count = length(result.chunks || [])
+IO.puts("Chunks: #{chunks_count}")
+```
--- a/docs/snippets/elixir/config/chunking_config.md
+++ b/docs/snippets/elixir/config/chunking_config.md
@@ -0,0 +1,15 @@
+```elixir title="Elixir"
+config_json = Jason.encode!(%{
+  "chunking" => %{
+    "max_characters" => 1000,
+    "overlap" => 200
+  }
+})
+
+{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
+
+# Fall back to an empty list so a nil `chunks` field does not crash `length/1`.
+chunks = result.chunks || []
+IO.puts("Chunks: #{length(chunks)}")
+
+Enum.each(chunks, fn chunk ->
+  IO.puts("Length: #{String.length(chunk.content)}")
+end)
+```
--- a/docs/snippets/elixir/config/config_basic.md
+++ b/docs/snippets/elixir/config/config_basic.md
@@ -0,0 +1,9 @@
+```elixir title="Elixir"
+# Set the `use_cache` and `enable_quality_processing` options, encoded as JSON.
+options = %{
+  "use_cache" => true,
+  "enable_quality_processing" => true
+}
+
+config_json = Jason.encode!(options)
+
+{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
+IO.puts(result.content)
+```
--- a/docs/snippets/elixir/config/config_discover.md
+++ b/docs/snippets/elixir/config/config_discover.md
@@ -0,0 +1,5 @@
+```elixir title="Elixir"
+# Passing nil as the config argument uses the discovered/default configuration
+mime_type = "application/pdf"
+{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", mime_type, nil)
+IO.puts(result.content)
+```
--- a/docs/snippets/elixir/config/config_ocr.md
+++ b/docs/snippets/elixir/config/config_ocr.md
@@ -0,0 +1,12 @@
+```elixir title="Elixir"
+config_json = Jason.encode!(%{
+  "ocr" => %{
+    "backend" => "tesseract",
+    "language" => "eng"
+  }
+})
+
+{:ok, result} = Kreuzberg.extract_file_sync("scanned.pdf", "application/pdf", config_json)
+IO.puts("Content length: #{String.length(result.content)}")
+# Fall back to an empty list so a nil `tables` field does not crash `length/1`.
+IO.puts("Tables detected: #{length(result.tables || [])}")
+```
--- a/docs/snippets/elixir/config/config_programmatic.md
+++ b/docs/snippets/elixir/config/config_programmatic.md
@@ -0,0 +1,20 @@
+```elixir title="Elixir"
+# OCR and chunking sections are built first, then merged into the top-level config.
+ocr = %{
+  "backend" => "tesseract",
+  "language" => "eng+deu",
+  "tesseract_config" => %{"psm" => 6}
+}
+
+chunking = %{"max_characters" => 1000, "overlap" => 200}
+
+config_json =
+  Jason.encode!(%{
+    "use_cache" => true,
+    "ocr" => ocr,
+    "chunking" => chunking,
+    "enable_quality_processing" => true
+  })
+
+{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
+IO.puts("Content length: #{String.length(result.content)}")
+```
--- a/docs/snippets/elixir/config/document_structure_config.md
+++ b/docs/snippets/elixir/config/document_structure_config.md
@@ -0,0 +1,13 @@
+```elixir title="Document Structure Config (Elixir)"
+config = %Kreuzberg.ExtractionConfig{
+  include_document_structure: true
+}
+
+# The config goes in the third position; the second argument is the MIME type
+# (nil here), matching the (path, mime_type, config) call shape used elsewhere.
+{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", nil, config)
+
+if result.document do
+  Enum.each(result.document.nodes, fn node ->
+    IO.puts("[#{node.content.node_type}]")
+  end)
+end
+```
--- a/docs/snippets/elixir/config/element_based_output.md
+++ b/docs/snippets/elixir/config/element_based_output.md
@@ -0,0 +1,36 @@
+```elixir title="Element-Based Output (Elixir)"
+# Configure element-based output
+config = %Kreuzberg.ExtractionConfig{
+  output_format: :element_based
+}
+
+# Extract document. The config goes in the third position; the second argument
+# is the MIME type (nil here), matching the call shape used elsewhere.
+{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", nil, config)
+
+# Access elements
+Enum.each(result.elements, fn element ->
+  IO.puts("Type: #{element.element_type}")
+
+  text = String.slice(element.text, 0, 100)
+  IO.puts("Text: #{text}")
+
+  if element.metadata.page_number do
+    IO.puts("Page: #{element.metadata.page_number}")
+  end
+
+  if element.metadata.coordinates do
+    coords = element.metadata.coordinates
+    IO.puts("Coords: (#{coords.left}, #{coords.top}) - (#{coords.right}, #{coords.bottom})")
+  end
+
+  IO.puts("---")
+end)
+
+# Filter by element type
+titles = Enum.filter(result.elements, fn e -> e.element_type == :title end)
+
+Enum.each(titles, fn title ->
+  level = Map.get(title.metadata.additional, "level", "unknown")
+  IO.puts("[#{level}] #{title.text}")
+end)
+```
--- a/docs/snippets/elixir/config/embedding_config.md
+++ b/docs/snippets/elixir/config/embedding_config.md
@@ -0,0 +1,22 @@
+```elixir title="Elixir"
+# Embedding settings are nested inside the chunking section.
+embedding = %{
+  "model" => %{"preset" => %{"name" => "balanced"}},
+  "batch_size" => 16,
+  "normalize" => true,
+  "show_download_progress" => true
+}
+
+config_json =
+  Jason.encode!(%{
+    "chunking" => %{
+      "max_characters" => 1000,
+      "overlap" => 200,
+      "embedding" => embedding
+    }
+  })
+
+{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
+# A missing chunk list counts as zero chunks.
+chunks_count = length(result.chunks || [])
+IO.puts("Chunks with embeddings: #{chunks_count}")
+```
--- a/docs/snippets/elixir/config/html_output.md
+++ b/docs/snippets/elixir/config/html_output.md
@@ -0,0 +1,11 @@
+```elixir title="Elixir"
+# Request the "Html" output format with the "GitHub" theme.
+html_output = %{"theme" => "GitHub"}
+
+config_json = Jason.encode!(%{"output_format" => "Html", "html_output" => html_output})
+
+{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
+IO.puts(result.content)
+```
--- a/docs/snippets/elixir/config/keyword_extraction_config.md
+++ b/docs/snippets/elixir/config/keyword_extraction_config.md
@@ -0,0 +1,14 @@
+```elixir title="Elixir"
+# Keyword options are collected in their own map before encoding.
+keywords = %{
+  "algorithm" => "Yake",
+  "max_keywords" => 10,
+  "min_score" => 0.1,
+  "ngram_range" => [1, 3],
+  "language" => "en"
+}
+
+config_json = Jason.encode!(%{"keywords" => keywords})
+
+{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
+IO.puts("Keywords: #{inspect(result.keywords)}")
+```
--- a/docs/snippets/elixir/config/language_detection_config.md
+++ b/docs/snippets/elixir/config/language_detection_config.md
@@ -0,0 +1,13 @@
+```elixir title="Elixir"
+config_json = Jason.encode!(%{
+  "language_detection" => %{
+    "enabled" => true,
+    "min_confidence" => 0.8,
+    "detect_multiple" => true
+  }
+})
+
+{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
+# NOTE(review): the other snippets read detection output from
+# `detected_languages`; confirm the result exposes no separate
+# `language` / `language_confidence` fields.
+IO.puts("Detected languages: #{inspect(result.detected_languages)}")
+```
--- a/docs/snippets/elixir/config/ocr_dpi_config.md
+++ b/docs/snippets/elixir/config/ocr_dpi_config.md
@@ -0,0 +1,15 @@
+```elixir title="Elixir"
+config_json = Jason.encode!(%{
+  "images" => %{
+    "extract_images" => true,
+    "target_dpi" => 300,
+    "max_image_dimension" => 4096,
+    "auto_adjust_dpi" => true,
+    "min_dpi" => 150,
+    "max_dpi" => 600
+  }
+})
+
+{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
+# Fall back to an empty list so a nil `images` field does not crash `length/1`.
+IO.puts("Extracted images: #{length(result.images || [])}")
+```
--- a/docs/snippets/elixir/config/pdf_config.md
+++ b/docs/snippets/elixir/config/pdf_config.md
@@ -0,0 +1,14 @@
+```elixir title="Elixir"
+# PDF-specific options live under the "pdf_options" key.
+pdf_options = %{
+  "extract_images" => true,
+  "passwords" => ["password123"],
+  "extract_metadata" => true,
+  "hierarchy" => %{}
+}
+
+config_json = Jason.encode!(%{"pdf_options" => pdf_options})
+
+{:ok, result} = Kreuzberg.extract_file_sync("encrypted.pdf", "application/pdf", config_json)
+metadata = result.metadata
+IO.puts("Title: #{inspect(metadata.title)}")
+IO.puts("Authors: #{inspect(metadata.authors)}")
+```
--- a/docs/snippets/elixir/config/pdf_hierarchy_config.md
+++ b/docs/snippets/elixir/config/pdf_hierarchy_config.md
@@ -0,0 +1,16 @@
+```elixir title="Elixir"
+# Hierarchy detection settings are nested under "pdf_options".
+hierarchy = %{
+  "enabled" => true,
+  "detection_threshold" => 0.75,
+  "ocr_coverage_threshold" => 0.8,
+  "min_level" => 1,
+  "max_level" => 5
+}
+
+config_json = Jason.encode!(%{"pdf_options" => %{"hierarchy" => hierarchy}})
+
+{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
+IO.puts("Hierarchy levels: #{length(result.hierarchy)}")
+```
--- a/docs/snippets/elixir/config/postprocessor_config.md
+++ b/docs/snippets/elixir/config/postprocessor_config.md
@@ -0,0 +1,14 @@
+```elixir title="Elixir"
+# Names of the post-processors to enable.
+enabled_processors = ["whitespace_normalizer", "unicode_normalizer"]
+
+config_json =
+  Jason.encode!(%{
+    "postprocessor" => %{
+      "enabled" => true,
+      "enabled_processors" => enabled_processors
+    }
+  })
+
+{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
+IO.puts("Processed content: #{result.content}")
+```
--- a/docs/snippets/elixir/config/quality_processing_config.md
+++ b/docs/snippets/elixir/config/quality_processing_config.md
@@ -0,0 +1,10 @@
+```elixir title="Elixir"
+# Set the `enable_quality_processing` and `use_cache` options, encoded as JSON.
+options = %{"enable_quality_processing" => true, "use_cache" => true}
+config_json = Jason.encode!(options)
+
+{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
+IO.puts("Quality score: #{result.quality_score}")
+IO.puts("Processing time: #{inspect(result.processing_time)}")
+```
--- a/docs/snippets/elixir/config/tesseract_config.md
+++ b/docs/snippets/elixir/config/tesseract_config.md
@@ -0,0 +1,15 @@
+```elixir title="Elixir"
+# Tesseract-specific settings are nested inside the OCR section.
+tesseract_config = %{"psm" => 6, "oem" => 3}
+
+ocr = %{
+  "backend" => "tesseract",
+  "language" => "eng+deu",
+  "tesseract_config" => tesseract_config
+}
+
+config_json = Jason.encode!(%{"ocr" => ocr})
+
+{:ok, result} = Kreuzberg.extract_file_sync("scanned.pdf", "application/pdf", config_json)
+IO.puts("OCR text: #{result.content}")
+```
--- a/docs/snippets/elixir/config/token_reduction_config.md
+++ b/docs/snippets/elixir/config/token_reduction_config.md
@@ -0,0 +1,12 @@
+```elixir title="Elixir"
+# Token reduction options are collected in their own map before encoding.
+token_reduction = %{"mode" => "moderate", "preserve_important_words" => true}
+config_json = Jason.encode!(%{"token_reduction" => token_reduction})
+
+{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
+IO.puts("Original tokens: #{result.token_count}")
+IO.puts("Reduced content: #{result.content}")
+```
--- a/docs/snippets/elixir/configuration/chunking_config.exs
+++ b/docs/snippets/elixir/configuration/chunking_config.exs
@@ -0,0 +1,78 @@
+```elixir title="Elixir"
+alias Kreuzberg.ExtractionConfig
+
+# Example 1: Basic character-based chunking for RAG applications
+chunking_options = %{
+  "enabled" => true,
+  "max_characters" => 1000,
+  "overlap" => 100,
+  "min_size" => 200,
+  "respect_boundaries" => true
+}
+
+config = %ExtractionConfig{chunking: chunking_options}
+
+{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)
+
+# Print a short preview of every chunk (skipped when no chunks were returned)
+chunks = result.chunks
+
+if chunks do
+  IO.puts("Generated #{length(chunks)} chunks")
+
+  for chunk <- chunks do
+    IO.puts("Chunk: #{String.slice(chunk["content"], 0..50)}...")
+  end
+end
+
+# Example 2: Markdown chunker with token-based sizing and heading context
+config2 = %ExtractionConfig{
+  chunking: %{
+    "enabled" => true,
+    "chunker_type" => "markdown",
+    "sizing" => %{
+      "type" => "tokenizer",
+      "model" => "Xenova/gpt-4o"
+    }
+  }
+}
+
+{:ok, result2} = Kreuzberg.extract_file("document.md", nil, config2)
+
+if result2.chunks do
+  IO.puts("Generated #{length(result2.chunks)} markdown chunks")
+
+  Enum.each(result2.chunks, fn chunk ->
+    IO.puts("\nChunk preview: #{String.slice(chunk["content"], 0..60)}...")
+
+    # Access heading context
+    if is_map(chunk["metadata"]) and is_map(chunk["metadata"]["heading_context"]) do
+      headings = chunk["metadata"]["heading_context"]["headings"] || []
+
+      # Compare against the empty list instead of walking it with length/1.
+      if headings != [] do
+        IO.puts("  Headings in context:")
+        Enum.each(headings, fn heading ->
+          IO.puts("    - Level #{heading["level"]}: #{heading["text"]}")
+        end)
+      end
+    end
+  end)
+end
+
+# Example 3: Prepend heading context to chunk content
+heading_chunking = %{
+  "enabled" => true,
+  "chunker_type" => "markdown",
+  "prepend_heading_context" => true
+}
+
+config3 = %ExtractionConfig{chunking: heading_chunking}
+
+{:ok, result3} = Kreuzberg.extract_file("document.md", nil, config3)
+
+heading_chunks = result3.chunks
+
+if heading_chunks do
+  IO.puts("Generated #{length(heading_chunks)} chunks with prepended headings")
+
+  # Each chunk's content is prefixed with its heading breadcrumb
+  for chunk <- heading_chunks do
+    IO.puts("\nChunk preview: #{String.slice(chunk["content"], 0..80)}...")
+  end
+end
+```
--- a/docs/snippets/elixir/configuration/config_basic.exs
+++ b/docs/snippets/elixir/configuration/config_basic.exs
@@ -0,0 +1,17 @@
+```elixir title="Elixir"
+alias Kreuzberg.ExtractionConfig
+
+config = %ExtractionConfig{
+  ocr: %{"enabled" => true, "backend" => "tesseract"},
+  chunking: %{"max_characters" => 1000, "overlap" => 100},
+  language_detection: %{"enabled" => true},
+  use_cache: true,
+  force_ocr: false
+}
+
+{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)
+
+# String.length/1 counts characters (graphemes); byte_size/1 would report bytes,
+# which differs for any non-ASCII text.
+IO.puts("Content length: #{String.length(result.content)} characters")
+IO.puts("Detected languages: #{inspect(result.detected_languages)}")
+IO.puts("Chunks: #{if result.chunks, do: length(result.chunks), else: 0}")
+```
--- a/docs/snippets/elixir/configuration/config_discover.exs
+++ b/docs/snippets/elixir/configuration/config_discover.exs
@@ -0,0 +1,26 @@
+```elixir title="Elixir"
+alias Kreuzberg.ExtractionConfig
+
+# Load configuration from file
+# Reads the user's config file when it exists; otherwise falls back to defaults
+config_path = Path.expand("~/.kreuzberg/config.json")
+
+# Check once so the load decision and the report below cannot disagree.
+config_file_found = File.exists?(config_path)
+
+config =
+  if config_file_found do
+    config_data = config_path |> File.read!() |> Jason.decode!()
+
+    # Map JSON keys onto the struct's existing fields only. String.to_atom/1 on
+    # file contents would create a new atom for every arbitrary key, and atoms
+    # are never garbage collected.
+    known_fields =
+      %ExtractionConfig{}
+      |> Map.from_struct()
+      |> Map.new(fn {field, _default} -> {Atom.to_string(field), field} end)
+
+    fields = for {key, value} <- config_data, field = known_fields[key], do: {field, value}
+    struct(ExtractionConfig, fields)
+  else
+    IO.puts("Config file not found at #{config_path}. Using defaults.")
+    %ExtractionConfig{}
+  end
+
+IO.puts("Configuration Source: #{if config_file_found, do: "#{config_path} (file)", else: "defaults"}")
+IO.puts("OCR Enabled: #{inspect(config.ocr["enabled"])}")
+IO.puts("Chunking Max Characters: #{inspect(config.chunking["max_characters"])}")
+IO.puts("Use Cache: #{inspect(config.use_cache)}")
+
+{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)
+
+IO.puts("Document extraction complete")
+IO.puts("Content length: #{byte_size(result.content)} bytes")
+IO.puts("Languages detected: #{inspect(result.detected_languages)}")
+```
--- a/docs/snippets/elixir/configuration/config_ocr.exs
+++ b/docs/snippets/elixir/configuration/config_ocr.exs
@@ -0,0 +1,28 @@
+```elixir title="Elixir"
+alias Kreuzberg.ExtractionConfig
+
+# Configure OCR-specific settings
+# The OCR and chunking sections are built first, then placed in the config struct
+ocr_options = %{
+  "enabled" => true,
+  "backend" => "tesseract",
+  "language" => "eng",
+  "preprocessing" => true
+}
+
+chunking_options = %{"max_characters" => 2000, "overlap" => 200}
+
+config = %ExtractionConfig{
+  ocr: ocr_options,
+  chunking: chunking_options,
+  use_cache: true,
+  force_ocr: false
+}
+
+{:ok, result} = Kreuzberg.extract_file("scanned_document.pdf", nil, config)
+
+content_bytes = byte_size(result.content)
+
+IO.puts("OCR Configuration Applied:")
+IO.puts("Backend: tesseract")
+IO.puts("Language: eng")
+IO.puts("Content extracted: #{content_bytes} bytes")
+IO.puts("Metadata: #{inspect(result.metadata)}")
+```
--- a/docs/snippets/elixir/configuration/config_programmatic.exs
+++ b/docs/snippets/elixir/configuration/config_programmatic.exs
@@ -0,0 +1,44 @@
+```elixir title="Elixir"
+alias Kreuzberg.ExtractionConfig
+
+# Build configuration dynamically based on runtime conditions
+# Useful for environment-specific settings and feature flags
+
+defmodule ConfigBuilder do
+  # Builds an extraction config for `file_type` (`:pdf`, `:image`, or anything
+  # else) with OCR switched on or off. Unrecognized combinations return the
+  # base config unchanged.
+  def build_config(file_type, enable_ocr?) do
+    base = %ExtractionConfig{
+      chunking: %{"max_characters" => 1000, "overlap" => 100},
+      use_cache: true
+    }
+
+    with_ocr(base, file_type, enable_ocr?)
+  end
+
+  defp with_ocr(config, :pdf, true) do
+    %{config | ocr: %{"enabled" => true, "backend" => "tesseract"}, force_ocr: true}
+  end
+
+  defp with_ocr(config, :image, true) do
+    ocr = %{"enabled" => true, "backend" => "tesseract", "preprocessing" => true}
+    %{config | ocr: ocr, force_ocr: true}
+  end
+
+  defp with_ocr(config, file_type, false) when file_type in [:pdf, :image] do
+    %{config | ocr: %{"enabled" => false}}
+  end
+
+  defp with_ocr(config, _file_type, _enable_ocr?), do: config
+end
+
+# Build a PDF configuration with OCR enabled, then run the extraction with it
+config = ConfigBuilder.build_config(:pdf, true)
+
+{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)
+
+preview = String.slice(result.content, 0..100)
+
+IO.puts("Dynamic configuration applied")
+IO.puts("Content: #{preview}")
+```
--- a/docs/snippets/elixir/configuration/embedding_config.exs
+++ b/docs/snippets/elixir/configuration/embedding_config.exs
@@ -0,0 +1,20 @@
+```elixir title="Elixir"
+alias Kreuzberg.ExtractionConfig
+
+# Configure embeddings for vector search
+chunking_options = %{"enabled" => true, "max_characters" => 512, "overlap" => 50}
+
+embedding_options = %{
+  "enabled" => true,
+  "model" => "sentence-transformers/all-MiniLM-L6-v2"
+}
+
+config = %ExtractionConfig{chunking: chunking_options, embeddings: embedding_options}
+
+{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)
+
+# A missing chunk list counts as zero chunks.
+chunk_count = length(result.chunks || [])
+IO.puts("Extracted chunks with embeddings: #{chunk_count}")
+```
--- a/docs/snippets/elixir/configuration/keyword_extraction_config.exs
+++ b/docs/snippets/elixir/configuration/keyword_extraction_config.exs
@@ -0,0 +1,23 @@
+```elixir title="Elixir"
+alias Kreuzberg.ExtractionConfig
+
+# Configure keyword extraction
+keyword_options = %{"enabled" => true, "max_keywords" => 10, "min_score" => 0.5}
+config = %ExtractionConfig{keyword_extraction: keyword_options}
+
+{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)
+
+# Print each keyword with its score (skipped when no keywords were returned)
+keywords = result.keywords
+
+if keywords do
+  IO.puts("Extracted #{length(keywords)} keywords")
+
+  for keyword <- keywords do
+    IO.puts("#{keyword["text"]}: #{keyword["score"]}")
+  end
+end
+```
--- a/docs/snippets/elixir/configuration/language_detection_config.exs
+++ b/docs/snippets/elixir/configuration/language_detection_config.exs
@@ -0,0 +1,23 @@
+```elixir title="Elixir"
+alias Kreuzberg.ExtractionConfig
+
+# Configure language detection for multilingual documents
+# Automatically detect all languages present in the content
+config = %ExtractionConfig{
+  language_detection: %{
+    "enabled" => true,
+    "detect_all" => true
+  },
+  chunking: %{
+    "max_characters" => 1000,
+    "overlap" => 100
+  },
+  use_cache: true
+}
+
+{:ok, result} = Kreuzberg.extract_file("multilingual.pdf", nil, config)
+
+IO.puts("Detected Languages:")
+IO.inspect(result.detected_languages)
+IO.puts("Content: #{String.slice(result.content, 0..100)}")
+```
--- a/docs/snippets/elixir/configuration/pdf_config.exs
+++ b/docs/snippets/elixir/configuration/pdf_config.exs
@@ -0,0 +1,25 @@
+```elixir title="Elixir"
+alias Kreuzberg.ExtractionConfig
+
+# Configure PDF-specific extraction options
+# Extract annotations, metadata, and flatten forms for comprehensive document processing
+config = %ExtractionConfig{
+  pdf: %{
+    "extract_annotations" => true,
+    "extract_metadata" => true,
+    "flatten_forms" => true
+  },
+  chunking: %{
+    "max_characters" => 1500,
+    "overlap" => 150
+  },
+  use_cache: true
+}
+
+{:ok, result} = Kreuzberg.extract_file("form_document.pdf", nil, config)
+
+IO.puts("PDF Extraction Complete:")
+IO.puts("Content length: #{byte_size(result.content)} bytes")
+IO.puts("Metadata: #{inspect(result.metadata)}")
+IO.puts("Annotations present: #{map_size(result.metadata["annotations"] || %{}) > 0}")
+```
--- a/docs/snippets/elixir/configuration/postprocessor_config.exs
+++ b/docs/snippets/elixir/configuration/postprocessor_config.exs
@@ -0,0 +1,29 @@
+```elixir title="Elixir"
+alias Kreuzberg.ExtractionConfig
+
+# Configure post-processing pipeline
+# Applies transformations to extracted content after initial processing
+config = %ExtractionConfig{
+  postprocessing: %{
+    "remove_whitespace" => true,
+    "normalize_unicode" => true,
+    "fix_encoding" => true
+  },
+  ocr: %{
+    "enabled" => true,
+    "backend" => "tesseract"
+  },
+  use_cache: true,
+  force_ocr: false
+}
+
+{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)
+
+IO.puts("Post-Processing Configuration Applied:")
+IO.puts("Remove Whitespace: true")
+IO.puts("Normalize Unicode: true")
+IO.puts("Fix Encoding: true")
+IO.puts("Original content length: #{byte_size(result.content)} bytes")
+IO.puts("Processed content: #{String.slice(result.content, 0..100)}...")
+IO.puts("Metadata: #{inspect(result.metadata)}")
+```
--- a/docs/snippets/elixir/configuration/quality_processing_config.exs
+++ b/docs/snippets/elixir/configuration/quality_processing_config.exs
@@ -0,0 +1,30 @@
+```elixir title="Elixir"
+alias Kreuzberg.ExtractionConfig
+
+# Configure quality processing settings
+# Enable noise removal and set minimum quality thresholds for extracted content
+config = %ExtractionConfig{
+  quality_processing: %{
+    "enabled" => true,
+    "min_quality_score" => 0.7,
+    "remove_noise" => true
+  },
+  ocr: %{
+    "enabled" => true,
+    "backend" => "tesseract"
+  },
+  chunking: %{
+    "max_characters" => 1000,
+    "overlap" => 100
+  },
+  use_cache: true
+}
+
+{:ok, result} = Kreuzberg.extract_file("noisy_document.pdf", nil, config)
+
+IO.puts("Quality Processing Applied:")
+IO.puts("Content quality score: #{result.quality_score}")
+IO.puts("Noise removed: true")
+IO.puts("Content length: #{byte_size(result.content)} bytes")
+IO.puts("Processing complete: #{inspect(result)}")
+```
--- a/docs/snippets/elixir/configuration/tesseract_config.exs
+++ b/docs/snippets/elixir/configuration/tesseract_config.exs
@@ -0,0 +1,29 @@
+```elixir title="Elixir"
+alias Kreuzberg.ExtractionConfig
+
+# Configure Tesseract OCR settings
+# Includes language selection, page segmentation mode, OCR engine mode, and DPI settings
+config = %ExtractionConfig{
+  ocr: %{
+    "enabled" => true,
+    "backend" => "tesseract",
+    "language" => "eng",
+    "psm" => 3,
+    "oem" => 3,
+    "dpi" => 300
+  },
+  use_cache: true,
+  force_ocr: false
+}
+
+{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)
+
+IO.puts("Tesseract Configuration Applied:")
+IO.puts("Backend: tesseract")
+IO.puts("Language: eng")
+IO.puts("PSM (Page Segmentation Mode): 3")
+IO.puts("OEM (OCR Engine Mode): 3")
+IO.puts("DPI: 300")
+IO.puts("Content extracted: #{byte_size(result.content)} bytes")
+IO.puts("Metadata: #{inspect(result.metadata)}")
+```
--- a/docs/snippets/elixir/configuration/token_reduction_config.exs
+++ b/docs/snippets/elixir/configuration/token_reduction_config.exs
@@ -0,0 +1,28 @@
+```elixir title="Elixir"
+alias Kreuzberg.ExtractionConfig
+
+# Configure token reduction for LLM context windows
+# Helps manage token usage when working with large language models
+config = %ExtractionConfig{
+  token_reduction: %{
+    "enabled" => true,
+    "target_tokens" => 4000,
+    "strategy" => "truncate"
+  },
+  ocr: %{
+    "enabled" => true,
+    "backend" => "tesseract"
+  },
+  use_cache: true
+}
+
+{:ok, result} = Kreuzberg.extract_file("large_document.pdf", nil, config)
+
+IO.puts("Token Reduction Configuration Applied:")
+IO.puts("Token Reduction Enabled: true")
+IO.puts("Target Tokens: 4000")
+IO.puts("Strategy: truncate")
+IO.puts("Content extracted: #{byte_size(result.content)} bytes")
+IO.puts("Tokens reduced: #{inspect(result.metadata[:token_reduction_applied])}")
+IO.puts("Final metadata: #{inspect(result.metadata)}")
+```
--- a/docs/snippets/elixir/core/batch_extract_bytes_sync.exs
+++ b/docs/snippets/elixir/core/batch_extract_bytes_sync.exs
@@ -0,0 +1,54 @@
+```elixir title="Elixir"
+# Extract from multiple binary inputs in a batch operation
+# Useful for processing multiple documents in memory
+
+# Prepare binary data from multiple sources
+{:ok, pdf_data_1} = File.read("document1.pdf")
+{:ok, pdf_data_2} = File.read("document2.pdf")
+{:ok, pdf_data_3} = File.read("document3.pdf")
+
+data_list = [pdf_data_1, pdf_data_2, pdf_data_3]
+
+# Option 1: Use single MIME type for all inputs
+{:ok, results} = Kreuzberg.batch_extract_bytes(data_list, "application/pdf")
+
+# Process results
+Enum.each(results, fn result ->
+  IO.puts("Content length: #{byte_size(result.content)} bytes")
+  IO.puts("MIME type: #{result.mime_type}")
+  IO.puts("Tables found: #{length(result.tables)}")
+  IO.puts("---")
+end)
+
+IO.puts("Total documents processed: #{length(results)}")
+
+# Option 2: Use different MIME types for each input (one entry per item in data_list)
+mime_types = ["application/pdf", "text/html", "application/pdf"]
+{:ok, _mixed_results} = Kreuzberg.batch_extract_bytes(data_list, mime_types)
+
+# Option 3: Batch extraction with configuration
+config = %Kreuzberg.ExtractionConfig{
+  ocr: %{"enabled" => true, "backend" => "tesseract"},
+  extract_images: true
+}
+
+case Kreuzberg.batch_extract_bytes(data_list, "application/pdf", config) do
+  {:ok, results} ->
+    IO.puts("Successfully extracted #{length(results)} documents")
+    Enum.each(results, fn result ->
+      IO.puts("Content: #{String.slice(result.content, 0..100)}...")
+    end)
+
+  {:error, reason} ->
+    IO.puts("Batch extraction failed: #{reason}")
+end
+
+# Option 4: Using the bang variant (raises on error)
+try do
+  results = Kreuzberg.batch_extract_bytes!(data_list, "application/pdf")
+  IO.puts("Extracted #{length(results)} documents successfully")
+rescue
+  error in Kreuzberg.Error ->
+    IO.puts("Error: #{error.message}")
+end
+```
--- a/docs/snippets/elixir/core/batch_extract_files_sync.exs
+++ b/docs/snippets/elixir/core/batch_extract_files_sync.exs
@@ -0,0 +1,14 @@
+```elixir title="Elixir"
+file_paths = ["document1.pdf", "document2.pdf", "document3.pdf"]
+
+{:ok, results} = Kreuzberg.batch_extract_files(file_paths)
+
+Enum.each(results, fn result ->
+  IO.puts("MIME type: #{result.mime_type}")
+  IO.puts("Content length: #{byte_size(result.content)} bytes")
+  IO.puts("Tables: #{length(result.tables)}")
+  IO.puts("---")
+end)
+
+IO.puts("Total files processed: #{length(results)}")
+```
--- a/docs/snippets/elixir/core/client_extract_multiple_files.exs
+++ b/docs/snippets/elixir/core/client_extract_multiple_files.exs
@@ -0,0 +1,163 @@
+```elixir title="Elixir"
+# Client wrapper for batch file extraction
+# Provides error handling, logging, and result aggregation
+
+defmodule BatchDocumentClient do
+  @moduledoc """
+  Client wrapper for batch file document extraction.
+  Handles multiple files with comprehensive error handling and logging.
+  """
+
+  alias Kreuzberg.ExtractionResult
+
+  @doc """
+  Extract content from multiple files in batch.
+
+  Returns `{:ok, results}` when the batch call succeeds. On failure the
+  error is optionally logged and returned unchanged as `{:error, reason}`.
+
+  ## Options
+
+    * `:mime_type` - MIME type for all files (optional, defaults to auto-detection)
+    * `:config` - ExtractionConfig struct with options (optional)
+    * `:log_errors` - Whether to log extraction errors (default: true)
+    * `:fail_fast` - Accepted for compatibility but currently ignored by this wrapper
+
+  ## Examples
+
+      {:ok, results} = BatchDocumentClient.extract_files(
+        ["doc1.pdf", "doc2.pdf", "doc3.pdf"],
+        mime_type: "application/pdf"
+      )
+  """
+  @spec extract_files([String.t()], keyword()) ::
+          {:ok, [ExtractionResult.t()]} | {:error, String.t()}
+  def extract_files(paths, opts \\ []) do
+    mime_type = Keyword.get(opts, :mime_type, nil)
+    config = Keyword.get(opts, :config, nil)
+    log_errors = Keyword.get(opts, :log_errors, true)
+
+    case Kreuzberg.batch_extract_files(paths, mime_type, config) do
+      {:ok, results} ->
+        IO.puts("Successfully extracted #{length(results)} files")
+        {:ok, results}
+
+      {:error, reason} ->
+        if log_errors do
+          IO.puts("Batch extraction error: #{reason}")
+        end
+        {:error, reason}
+    end
+  end
+
+  @doc """
+  Extract files and return detailed statistics.
+
+  Returns aggregated metrics about all processed files.
+  """
+  @spec extract_files_with_stats([String.t()], keyword()) ::
+          {:ok, map()} | {:error, String.t()}
+  def extract_files_with_stats(paths, opts \\ []) do
+    start_time = System.monotonic_time(:millisecond)
+
+    case extract_files(paths, opts) do
+      {:ok, results} ->
+        elapsed_ms = System.monotonic_time(:millisecond) - start_time
+
+        stats = %{
+          total_files: length(results),
+          total_content_size: Enum.reduce(results, 0, &(byte_size(&1.content) + &2)),
+          total_tables: Enum.reduce(results, 0, &(length(&1.tables) + &2)),
+          total_images: Enum.reduce(results, 0, &(length(&1.images || []) + &2)),
+          processing_time_ms: elapsed_ms,
+          avg_time_per_file_ms: div(elapsed_ms, max(length(results), 1)),
+          results: results
+        }
+
+        {:ok, stats}
+
+      {:error, reason} ->
+        {:error, reason}
+    end
+  end
+
+  @doc """
+  Extract files and transform results.
+
+  Applies a transformation function to each extraction result.
+  Useful for custom processing or formatting of results.
+  """
+  @spec extract_and_transform([String.t()], function(), keyword()) ::
+          {:ok, [any()]} | {:error, String.t()}
+  def extract_and_transform(paths, transform_fn, opts \\ []) do
+    case extract_files(paths, opts) do
+      {:ok, results} ->
+        transformed =
+          results
+          |> Enum.map(fn result ->
+            try do
+              {:ok, transform_fn.(result)}
+            rescue
+              error ->
+                IO.puts("Transform error: #{inspect(error)}")
+                {:error, error}
+            end
+          end)
+
+        # Report the first failed transform, if any
+        case Enum.find(transformed, fn r -> match?({:error, _}, r) end) do
+          nil ->
+            # All succeeded, so unwrap the {:ok, value} tuples
+            {:ok, Enum.map(transformed, fn {:ok, value} -> value end)}
+
+          {:error, error} ->
+            {:error, "Transform failed: #{inspect(error)}"}
+        end
+
+      {:error, reason} ->
+        {:error, reason}
+    end
+  end
+end
+
+# Usage examples
+
+# Extract multiple files
+case BatchDocumentClient.extract_files(["doc1.pdf", "doc2.pdf", "doc3.pdf"]) do
+  {:ok, results} ->
+    Enum.each(results, fn result ->
+      IO.puts("Extracted: #{byte_size(result.content)} bytes")
+    end)
+
+  {:error, reason} ->
+    IO.puts("Error: #{reason}")
+end
+
+# Extract with statistics
+case BatchDocumentClient.extract_files_with_stats(["doc1.pdf", "doc2.pdf"]) do
+  {:ok, stats} ->
+    IO.puts("Total files: #{stats.total_files}")
+    IO.puts("Total size: #{stats.total_content_size} bytes")
+    IO.puts("Processing time: #{stats.processing_time_ms}ms")
+
+  {:error, reason} ->
+    IO.puts("Error: #{reason}")
+end
+
+# Extract and transform
+transform = fn result ->
+  %{
+    mime: result.mime_type,
+    size: byte_size(result.content),
+    tables: length(result.tables)
+  }
+end
+
+case BatchDocumentClient.extract_and_transform(["doc1.pdf", "doc2.pdf"], transform) do
+  {:ok, transformed_results} ->
+    IO.inspect(transformed_results)
+
+  {:error, reason} ->
+    IO.puts("Error: #{reason}")
+end
+```
--- a/docs/snippets/elixir/core/client_extract_single_file.exs
+++ b/docs/snippets/elixir/core/client_extract_single_file.exs
@@ -0,0 +1,107 @@
+```elixir title="Elixir"
+# Reusable client pattern for single file extraction
+# Encapsulates extraction logic with consistent error handling
+
+defmodule DocumentClient do
+  @moduledoc """
+  Client wrapper for single file document extraction.
+  Provides a consistent interface for extracting content from files.
+  """
+
+  alias Kreuzberg.ExtractionResult
+
+  @doc """
+  Extract content from a single file.
+
+  Returns a result tuple with the extraction outcome.
+  Supports explicit MIME type specification or auto-detection.
+
+  ## Options
+
+    * `:mime_type` - MIME type of the file (optional, defaults to auto-detection)
+    * `:config` - ExtractionConfig struct with options (optional)
+
+  ## Examples
+
+      {:ok, result} = DocumentClient.extract_file("document.pdf")
+      {:ok, result} = DocumentClient.extract_file("document.pdf", mime_type: "application/pdf")
+  """
+  @spec extract_file(String.t(), keyword()) ::
+          {:ok, ExtractionResult.t()} | {:error, String.t()}
+  def extract_file(path, opts \\ []) do
+    mime_type = Keyword.get(opts, :mime_type, nil)
+    config = Keyword.get(opts, :config, nil)
+
+    case Kreuzberg.extract_file(path, mime_type, config) do
+      {:ok, result} ->
+        IO.puts("Successfully extracted file: #{path}")
+        {:ok, result}
+
+      {:error, reason} ->
+        IO.puts("Failed to extract file: #{path} - #{reason}")
+        {:error, reason}
+    end
+  end
+
+  @doc """
+  Extract content from a file, raising on error.
+
+  Raises Kreuzberg.Error if extraction fails.
+  """
+  @spec extract_file!(String.t(), keyword()) :: ExtractionResult.t()
+  def extract_file!(path, opts \\ []) do
+    mime_type = Keyword.get(opts, :mime_type, nil)
+    config = Keyword.get(opts, :config, nil)
+
+    Kreuzberg.extract_file!(path, mime_type, config)
+  end
+
+  @doc """
+  Extract and process file content.
+
+  Returns a map with extracted content, metadata, and processing statistics.
+  """
+  @spec extract_with_stats(String.t(), keyword()) ::
+          {:ok, map()} | {:error, String.t()}
+  def extract_with_stats(path, opts \\ []) do
+    start_time = System.monotonic_time(:millisecond)
+
+    case extract_file(path, opts) do
+      {:ok, result} ->
+        elapsed_ms = System.monotonic_time(:millisecond) - start_time
+
+        {:ok,
+         %{
+           content: result.content,
+           mime_type: result.mime_type,
+           metadata: result.metadata,
+           table_count: length(result.tables),
+           image_count: length(result.images || []),
+           processing_time_ms: elapsed_ms
+         }}
+
+      {:error, reason} ->
+        {:error, reason}
+    end
+  end
+end
+
+# Usage examples
+case DocumentClient.extract_file("document.pdf") do
+  {:ok, result} ->
+    IO.puts("Content length: #{byte_size(result.content)} bytes")
+
+  {:error, reason} ->
+    IO.puts("Extraction failed: #{reason}")
+end
+
+# Extract with statistics
+case DocumentClient.extract_with_stats("document.pdf") do
+  {:ok, stats} ->
+    IO.puts("Processing time: #{stats.processing_time_ms}ms")
+    IO.puts("Tables found: #{stats.table_count}")
+
+  {:error, reason} ->
+    IO.puts("Error: #{reason}")
+end
+```
--- a/docs/snippets/elixir/core/client_extract_with_config.exs
+++ b/docs/snippets/elixir/core/client_extract_with_config.exs
@@ -0,0 +1,247 @@
+```elixir title="Elixir"
+# Client wrapper with comprehensive configuration
+# Demonstrates advanced extraction patterns with OCR, chunking, and validation
+
+defmodule ConfiguredDocumentClient do
+  @moduledoc """
+  Client wrapper for document extraction with advanced configuration.
+  Supports OCR, chunking, language detection, and custom error handling.
+  """
+
+  alias Kreuzberg.{ExtractionConfig, ExtractionResult}
+
+  @doc """
+  Extract with OCR enabled for scanned documents.
+
+  Uses Tesseract OCR backend for text extraction from images.
+  """
+  @spec extract_with_ocr(String.t() | binary(), keyword()) ::
+          {:ok, ExtractionResult.t()} | {:error, String.t()}
+  def extract_with_ocr(input, opts \\ []) do
+    backend = Keyword.get(opts, :ocr_backend, "tesseract")
+    force? = Keyword.get(opts, :force_ocr, false)
+    mime_type = Keyword.get(opts, :mime_type, nil)
+
+    config = %ExtractionConfig{
+      ocr: %{"enabled" => true, "backend" => backend},
+      force_ocr: force?
+    }
+
+    # An existing path is extracted from disk; any other binary is passed
+    # to Kreuzberg.extract/3 (as PDF unless a MIME type is supplied).
+    if is_binary(input) and File.exists?(input) do
+      Kreuzberg.extract_file(input, mime_type, config)
+    else
+      Kreuzberg.extract(input, mime_type || "application/pdf", config)
+    end
+  end
+
+  @doc """
+  Extract with text chunking for embedding or RAG pipelines.
+
+  Splits extracted text into chunks with configurable size and overlap.
+  """
+  @spec extract_with_chunking(String.t(), keyword()) ::
+          {:ok, map()} | {:error, String.t()}
+  def extract_with_chunking(path, opts \\ []) do
+    config = %ExtractionConfig{
+      chunking: %{
+        "max_characters" => Keyword.get(opts, :chunk_size, 1000),
+        "overlap" => Keyword.get(opts, :chunk_overlap, 100)
+      }
+    }
+
+    mime_type = Keyword.get(opts, :mime_type, nil)
+
+    case Kreuzberg.extract_file(path, mime_type, config) do
+      {:ok, result} ->
+        chunks = result.chunks || []
+
+        {:ok,
+         %{
+           content: result.content,
+           chunks: chunks,
+           chunk_count: length(chunks),
+           metadata: result.metadata
+         }}
+
+      {:error, reason} ->
+        {:error, reason}
+    end
+  end
+
+  @doc """
+  Extract with language detection and multi-language support.
+
+  Detects document languages and can extract from specific languages.
+  """
+  @spec extract_with_language_detection(String.t(), keyword()) ::
+          {:ok, map()} | {:error, String.t()}
+  def extract_with_language_detection(path, opts \\ []) do
+    config = %ExtractionConfig{
+      language_detection: %{"enabled" => true},
+      extract_images: true
+    }
+
+    mime_type = Keyword.get(opts, :mime_type, nil)
+
+    case Kreuzberg.extract_file(path, mime_type, config) do
+      {:ok, result} ->
+        {:ok,
+         %{
+           content: result.content,
+           detected_languages: result.detected_languages || [],
+           mime_type: result.mime_type,
+           tables: result.tables,
+           images: result.images || []
+         }}
+
+      {:error, reason} ->
+        {:error, reason}
+    end
+  end
+
+  @doc """
+  Extract with comprehensive configuration for production use.
+
+  Combines OCR, chunking, caching, and language detection with error handling.
+  """
+  @spec extract_with_full_config(String.t(), keyword()) ::
+          {:ok, map()} | {:error, String.t()}
+  def extract_with_full_config(path, opts \\ []) do
+    config = %ExtractionConfig{
+      # OCR settings
+      ocr: %{
+        "enabled" => Keyword.get(opts, :ocr_enabled, true),
+        "backend" => Keyword.get(opts, :ocr_backend, "tesseract")
+      },
+      # Chunking for embeddings
+      chunking: %{
+        "max_characters" => Keyword.get(opts, :chunk_size, 1000),
+        "overlap" => Keyword.get(opts, :chunk_overlap, 100)
+      },
+      # Language detection
+      language_detection: %{"enabled" => Keyword.get(opts, :detect_language, true)},
+      # Cache results
+      use_cache: Keyword.get(opts, :use_cache, true),
+      # Extract various content types
+      extract_images: Keyword.get(opts, :extract_images, true),
+      extract_tables: true
+    }
+
+    mime_type = Keyword.get(opts, :mime_type, nil)
+
+    case Kreuzberg.extract_file(path, mime_type, config) do
+      {:ok, result} ->
+        summary = %{
+          file_path: path,
+          mime_type: result.mime_type,
+          content_length: byte_size(result.content),
+          content_preview: String.slice(result.content, 0..200),
+          detected_languages: result.detected_languages || [],
+          table_count: length(result.tables),
+          image_count: length(result.images || []),
+          chunk_count: length(result.chunks || []),
+          metadata: result.metadata
+        }
+
+        {:ok, summary}
+
+      {:error, reason} ->
+        {:error, "Extraction failed: #{reason}"}
+    end
+  end
+
+  @doc """
+  Validate file before extraction.
+
+  Checks file existence and MIME type compatibility.
+  """
+  @spec validate_file(String.t()) :: :ok | {:error, String.t()}
+  def validate_file(path) do
+    cond do
+      not File.exists?(path) ->
+        {:error, "File not found: #{path}"}
+
+      true ->
+        case Kreuzberg.detect_mime_type_from_path(path) do
+          {:ok, _mime_type} ->
+            :ok
+
+          {:error, reason} ->
+            {:error, "Cannot determine MIME type: #{reason}"}
+        end
+    end
+  end
+
+  @doc """
+  Extract with validation and error recovery.
+
+  Validates file before extraction and provides detailed error information.
+  """
+  @spec extract_safely(String.t(), keyword()) ::
+          {:ok, map()} | {:error, String.t()}
+  def extract_safely(path, opts \\ []) do
+    # An `{:error, reason}` from validate_file/1 does not match `:ok`, so
+    # `with` returns it unchanged. extract_with_full_config/2 already
+    # returns the final `{:ok, summary}` / `{:error, reason}` tuple, so
+    # neither an `else` clause nor re-wrapping of the result is needed.
+    with :ok <- validate_file(path) do
+      extract_with_full_config(path, opts)
+    end
+  end
+end
+
+# Usage examples
+
+# Extract with OCR
+case ConfiguredDocumentClient.extract_with_ocr("scanned_document.pdf",
+  ocr_backend: "tesseract"
+) do
+  {:ok, result} ->
+    IO.puts("OCR extraction successful")
+    IO.puts("Content: #{String.slice(result.content, 0..100)}...")
+
+  {:error, reason} ->
+    IO.puts("Error: #{reason}")
+end
+
+# Extract with chunking
+case ConfiguredDocumentClient.extract_with_chunking("document.pdf",
+  chunk_size: 500,
+  chunk_overlap: 50
+) do
+  {:ok, data} ->
+    IO.puts("Chunks: #{data.chunk_count}")
+
+  {:error, reason} ->
+    IO.puts("Error: #{reason}")
+end
+
+# Extract with language detection
+case ConfiguredDocumentClient.extract_with_language_detection("multilingual.pdf") do
+  {:ok, data} ->
+    IO.puts("Detected languages: #{inspect(data.detected_languages)}")
+
+  {:error, reason} ->
+    IO.puts("Error: #{reason}")
+end
+
+# Comprehensive extraction with validation
+case ConfiguredDocumentClient.extract_safely("document.pdf",
+  ocr_enabled: true,
+  detect_language: true,
+  extract_images: true,
+  use_cache: true
+) do
+  {:ok, summary} ->
+    IO.puts("File: #{summary.file_path}")
+    IO.puts("MIME: #{summary.mime_type}")
+    IO.puts("Size: #{summary.content_length} bytes")
+    IO.puts("Tables: #{summary.table_count}")
+    IO.puts("Languages: #{inspect(summary.detected_languages)}")
+
+  {:error, reason} ->
+    IO.puts("Error: #{reason}")
+end
+```
--- a/docs/snippets/elixir/core/error_handling.exs
+++ b/docs/snippets/elixir/core/error_handling.exs
@@ -0,0 +1,31 @@
+```elixir title="Elixir"
+# Example: Handling extraction errors
+case Kreuzberg.extract_file("document.pdf") do
+  {:ok, result} ->
+    IO.puts("Successfully extracted content")
+    IO.puts("Content length: #{byte_size(result.content)} bytes")
+
+  {:error, reason} ->
+    IO.puts("Extraction failed: #{inspect(reason)}")
+end
+
+# Example: Handling with custom error message
+result = Kreuzberg.extract_file("nonexistent.pdf")
+
+case result do
+  {:ok, _data} ->
+    IO.puts("File processed successfully")
+  {:error, error} ->
+    IO.puts("Error details: #{inspect(error)}")
+end
+
+# Example: Extract with pattern matching
+case Kreuzberg.extract(<<>>, "application/pdf") do
+  {:ok, result} ->
+    IO.puts("Content: #{result.content}")
+  {:error, msg} when is_binary(msg) ->
+    IO.puts("Validation error: #{msg}")
+  {:error, reason} ->
+    IO.puts("Unknown error: #{inspect(reason)}")
+end
+```
--- a/docs/snippets/elixir/core/extract_bytes_sync.exs
+++ b/docs/snippets/elixir/core/extract_bytes_sync.exs
@@ -0,0 +1,13 @@
+```elixir title="Elixir"
+# Read file into memory
+{:ok, file_content} = File.read("document.pdf")
+
+# Extract from bytes/binary data
+{:ok, result} = Kreuzberg.extract(file_content, "application/pdf")
+
+content = result.content
+IO.puts("Extracted content:")
+IO.puts(content)
+IO.puts("MIME type: #{result.mime_type}")
+IO.puts("Tables found: #{length(result.tables)}")
+```
--- a/docs/snippets/elixir/core/extract_file_async.exs
+++ b/docs/snippets/elixir/core/extract_file_async.exs
@@ -0,0 +1,12 @@
+```elixir title="Elixir"
+task = Kreuzberg.extract_file_async("document.pdf")
+{:ok, result} = Task.await(task)
+
+content = result.content
+table_count = length(result.tables)
+metadata = result.metadata
+
+IO.puts("Content length: #{byte_size(content)} bytes")
+IO.puts("Tables: #{table_count}")
+IO.puts("Metadata keys: #{inspect(Map.keys(metadata))}")
+```
--- a/docs/snippets/elixir/core/extract_file_sync.exs
+++ b/docs/snippets/elixir/core/extract_file_sync.exs
@@ -0,0 +1,11 @@
+```elixir title="Elixir"
+{:ok, result} = Kreuzberg.extract_file("document.pdf")
+
+content = result.content
+table_count = length(result.tables)
+metadata = result.metadata
+
+IO.puts("Content length: #{byte_size(content)} bytes")
+IO.puts("Tables: #{table_count}")
+IO.puts("Metadata keys: #{inspect(Map.keys(metadata))}")
+```
--- a/docs/snippets/elixir/getting-started/basic_usage.exs
+++ b/docs/snippets/elixir/getting-started/basic_usage.exs
@@ -0,0 +1,13 @@
+```elixir title="Elixir"
+# Basic document extraction workflow
+# Load file -> extract -> access results
+
+{:ok, result} = Kreuzberg.extract_file("document.pdf")
+
+IO.puts("Extracted Content:")
+IO.puts(result.content)
+
+IO.puts("\nMetadata:")
+IO.puts("Format: #{inspect(result.metadata.format)}")
+IO.puts("Tables found: #{length(result.tables)}")
+```
--- a/docs/snippets/elixir/getting-started/basic_usage.md
+++ b/docs/snippets/elixir/getting-started/basic_usage.md
@@ -0,0 +1,19 @@
+```elixir title="Elixir"
+defmodule BasicUsage do
+  def extract_with_default_config do
+    # Use default configuration (nil)
+    config = nil
+
+    case Kreuzberg.extract_file_sync("document.pdf", nil, config) do
+      {:ok, content} ->
+        IO.puts("Extracted content:")
+        IO.puts(content)
+        :ok
+
+      {:error, reason} ->
+        IO.puts("Extraction failed: #{reason}")
+        :error
+    end
+  end
+end
+```
--- a/docs/snippets/elixir/getting-started/extract_file.exs
+++ b/docs/snippets/elixir/getting-started/extract_file.exs
@@ -0,0 +1,13 @@
+```elixir title="Elixir"
+# Extract from different file types (PDF, DOCX, etc.)
+
+case Kreuzberg.extract_file("document.pdf") do
+  {:ok, result} ->
+    IO.puts("Content: #{result.content}")
+    IO.puts("Format: #{inspect(result.metadata.format)}")
+    IO.puts("Tables: #{length(result.tables)}")
+
+  {:error, reason} ->
+    IO.puts("Extraction failed: #{inspect(reason)}")
+end
+```
--- a/docs/snippets/elixir/getting-started/extract_file.md
+++ b/docs/snippets/elixir/getting-started/extract_file.md
@@ -0,0 +1,19 @@
+```elixir title="Elixir"
+defmodule ExtractFile do
+  def show_extraction_details do
+    # Extract from a file
+    case Kreuzberg.extract_file_sync("document.pdf", nil, nil) do
+      {:ok, result} ->
+        # Result is a string containing extracted content
+        IO.puts("Content length: #{String.length(result)} characters")
+        IO.puts("---")
+        IO.puts(result)
+        :ok
+
+      {:error, reason} ->
+        IO.puts("Failed to extract: #{reason}")
+        :error
+    end
+  end
+end
+```
--- a/docs/snippets/elixir/getting-started/extract_with_ocr.exs
+++ b/docs/snippets/elixir/getting-started/extract_with_ocr.exs
@@ -0,0 +1,19 @@
+```elixir title="Elixir"
+# Extract scanned documents with OCR
+# Configure Tesseract for OCR processing
+
+ocr_config = %{
+  "enabled" => true,
+  "backend" => "tesseract",
+  "language" => "eng"
+}
+
+# The config struct is passed as the third positional argument (path, mime_type, config)
+config = %Kreuzberg.ExtractionConfig{ocr: ocr_config}
+
+{:ok, result} = Kreuzberg.extract_file("scanned.pdf", nil, config)
+
+IO.puts("Extracted text from scanned document:")
+IO.puts(result.content)
+IO.puts("Used OCR backend: tesseract")
+```
--- a/docs/snippets/elixir/getting-started/extract_with_ocr.md
+++ b/docs/snippets/elixir/getting-started/extract_with_ocr.md
@@ -0,0 +1,19 @@
+```elixir title="Elixir"
+defmodule ExtractWithOcr do
+  def extract_scanned_document do
+    # Build configuration with OCR settings as JSON string
+    config = ~s({"ocr": {"backend": "tesseract", "language": "eng"}})
+
+    case Kreuzberg.extract_file_sync("scanned.pdf", nil, config) do
+      {:ok, result} ->
+        IO.puts("Extracted via OCR:")
+        IO.puts(result)
+        :ok
+
+      {:error, reason} ->
+        IO.puts("OCR extraction failed: #{reason}")
+        :error
+    end
+  end
+end
+```
--- a/docs/snippets/elixir/getting-started/hello_world.exs
+++ b/docs/snippets/elixir/getting-started/hello_world.exs
@@ -0,0 +1,5 @@
+```elixir title="Elixir"
+# First Kreuzberg program - extract text from a PDF
+{:ok, result} = Kreuzberg.extract_file("document.pdf")
+IO.puts(result.content)
+```
--- a/docs/snippets/elixir/getting-started/hello_world.md
+++ b/docs/snippets/elixir/getting-started/hello_world.md
@@ -0,0 +1,14 @@
+```elixir title="Elixir"
+defmodule HelloWorld do
+  def main do
+    case Kreuzberg.extract_file_sync("document.pdf", nil, nil) do
+      {:ok, result} ->
+        IO.puts("Extraction succeeded!")
+        IO.puts(result)
+
+      {:error, reason} ->
+        IO.puts("Error: #{reason}")
+    end
+  end
+end
+```
--- a/docs/snippets/elixir/getting-started/install_verify.exs
+++ b/docs/snippets/elixir/getting-started/install_verify.exs
@@ -0,0 +1,5 @@
+```elixir title="Elixir"
+# Verify Kreuzberg is installed and working by extracting a document
+{:ok, result} = Kreuzberg.extract_file("sample.pdf")
+IO.puts("Installation verified! Extracted #{String.length(result.content)} characters")
+```
--- a/docs/snippets/elixir/getting-started/install_verify.md
+++ b/docs/snippets/elixir/getting-started/install_verify.md
@@ -0,0 +1,18 @@
+```elixir title="Elixir"
+defmodule InstallVerify do
+  def verify_install do
+    # Verify Kreuzberg module is available
+    {:ok, extractors} = Kreuzberg.list_document_extractors()
+    IO.puts("Available extractors: #{inspect(extractors)}")
+
+    # Verify a simple extraction works
+    case Kreuzberg.extract_file_sync("test.txt", nil, nil) do
+      {:ok, _result} ->
+        IO.puts("Kreuzberg is properly installed and working!")
+
+      {:error, reason} ->
+        IO.puts("Extraction failed: #{reason}")
+    end
+  end
+end
+```
--- a/docs/snippets/elixir/getting-started/read_content.exs
+++ b/docs/snippets/elixir/getting-started/read_content.exs
@@ -0,0 +1,22 @@
+```elixir title="Elixir"
+# Access the different parts of an extraction result:
+# content, tables, images and metadata.
+
+# The match asserts success; a failed extraction raises MatchError.
+{:ok, result} = Kreuzberg.extract_file("document.pdf")
+
+# Main text content
+content = result.content
+IO.puts("Content length: #{String.length(content)} characters")
+
+# Tables - fall back to [] so a nil field cannot crash length/1
+tables = result.tables || []
+IO.puts("Tables found: #{length(tables)}")
+
+# Images - same guard; presumably nil when image extraction is not enabled
+# (TODO confirm against the ExtractionResult docs)
+images = result.images || []
+IO.puts("Images found: #{length(images)}")
+
+# Metadata
+format = result.metadata.format
+IO.puts("Format: #{inspect(format)}")
+```
--- a/docs/snippets/elixir/getting-started/read_content.md
+++ b/docs/snippets/elixir/getting-started/read_content.md
@@ -0,0 +1,24 @@
+```elixir title="Elixir"
+defmodule ReadContent do
+  @moduledoc """
+  Example: extract a document and print its content line by line.
+  """
+
+  @doc """
+  Extracts `document.pdf` and prints every non-empty line, indented by two
+  spaces. Returns `:ok` on success and `:error` when extraction fails.
+  """
+  def process_extracted_content do
+    "document.pdf"
+    |> Kreuzberg.extract_file_sync(nil, nil)
+    |> handle_extraction()
+  end
+
+  # NOTE(review): String.split/3 needs a binary - confirm that the success
+  # value of extract_file_sync/3 is the raw text rather than a result struct.
+  defp handle_extraction({:ok, content}) do
+    IO.puts("Processing extracted content:")
+
+    for line <- String.split(content, "\n", trim: true) do
+      IO.puts("  #{line}")
+    end
+
+    :ok
+  end
+
+  defp handle_extraction({:error, reason}) do
+    IO.puts("Error: #{reason}")
+    :error
+  end
+end
+```
--- a/docs/snippets/elixir/llm/structured_extraction.md
+++ b/docs/snippets/elixir/llm/structured_extraction.md
@@ -0,0 +1,32 @@
+<!-- snippet:syntax-only -->
+
+```elixir
+# JSON Schema describing the fields the LLM must return
+string_type = %{"type" => "string"}
+
+schema = %{
+  "type" => "object",
+  "properties" => %{
+    "title" => string_type,
+    "authors" => %{"type" => "array", "items" => string_type},
+    "date" => string_type
+  },
+  "required" => ~w(title authors date),
+  "additionalProperties" => false
+}
+
+# Structured-extraction settings, passed to Kreuzberg as a JSON string
+structured_settings = %{
+  "schema" => schema,
+  "schema_name" => "paper_metadata",
+  "strict" => true,
+  "llm" => %{"model" => "openai/gpt-4o-mini"}
+}
+
+config_json = Jason.encode!(%{"structured_extraction" => structured_settings})
+
+# The call returns JSON text here, which is decoded before use
+{:ok, json} = Kreuzberg.extract_file_async("paper.pdf", nil, config_json)
+result = Jason.decode!(json)
+
+structured = result["structured_output"]
+
+if is_nil(structured) do
+  IO.puts("no structured output")
+else
+  IO.inspect(structured, label: "structured")
+end
+```
--- a/docs/snippets/elixir/mcp/mcp_custom_client.exs
+++ b/docs/snippets/elixir/mcp/mcp_custom_client.exs
@@ -0,0 +1,312 @@
+```elixir title="Elixir"
+# MCP Custom Client - Connect to Kreuzberg MCP servers
+# Demonstrates creating a reusable MCP client for document extraction
+
+defmodule KreuzbergMCPClient do
+  @moduledoc """
+  MCP client for communicating with Kreuzberg extraction servers.
+
+  Wraps the server's `/extract`, `/extract/file` and `/health` HTTP endpoints
+  and adds optional on-disk result caching and bounded retries for `/extract`.
+  """
+
+  require Logger
+
+  defmodule Config do
+    @moduledoc """
+    Configuration for MCP client connections.
+    """
+
+    defstruct [
+      :host,
+      :port,
+      :timeout_ms,
+      :max_retries,
+      :retry_delay_ms,
+      :cache_dir
+    ]
+
+    @type t :: %__MODULE__{
+            host: String.t() | nil,
+            port: :inet.port_number() | nil,
+            timeout_ms: non_neg_integer() | nil,
+            max_retries: non_neg_integer() | nil,
+            retry_delay_ms: non_neg_integer() | nil,
+            cache_dir: String.t() | nil
+          }
+
+    @doc """
+    Builds a configuration from keyword options.
+
+    Defaults: host `"localhost"`, port `8080`, timeout 30 s, 3 attempts,
+    1 s between attempts, caching disabled (`cache_dir: nil`).
+    """
+    @spec new(keyword()) :: t()
+    def new(opts \\ []) do
+      %__MODULE__{
+        host: Keyword.get(opts, :host, "localhost"),
+        port: Keyword.get(opts, :port, 8080),
+        timeout_ms: Keyword.get(opts, :timeout_ms, 30_000),
+        max_retries: Keyword.get(opts, :max_retries, 3),
+        retry_delay_ms: Keyword.get(opts, :retry_delay_ms, 1000),
+        cache_dir: Keyword.get(opts, :cache_dir, nil)
+      }
+    end
+  end
+
+  @doc """
+  Extract document from file via MCP server.
+
+  Sends an extraction request for `file_path` to the server's `/extract`
+  endpoint and returns the decoded response map. With caching enabled, a
+  previously stored result for the same path/MIME type/config is returned
+  without contacting the server.
+
+  ## Options
+
+    * `:mime_type` - MIME type of document
+    * `:config` - Extraction configuration map
+    * `:use_cache` - Enable result caching (default: false); only effective
+      when the config has a `cache_dir`
+  """
+  @spec extract_file(Config.t(), String.t(), keyword()) ::
+          {:ok, map()} | {:error, String.t()}
+  def extract_file(config, file_path, opts \\ []) do
+    mime_type = Keyword.get(opts, :mime_type)
+    extraction_config = Keyword.get(opts, :config)
+
+    # `&&` rather than `and`: a nil or non-boolean :use_cache disables caching
+    # instead of raising BadBooleanError.
+    if Keyword.get(opts, :use_cache, false) && config.cache_dir do
+      cached_extract(config, file_path, mime_type, extraction_config)
+    else
+      fetch_from_server(config, file_path, mime_type, extraction_config)
+    end
+  end
+
+  @doc """
+  Upload and extract document via MCP server.
+
+  Reads the file from disk, posts its bytes to `/extract/file` (with the base
+  name in the `X-File-Name` header) and returns the decoded response map.
+  The third argument is accepted for symmetry with `extract_file/3` and is
+  currently unused. Uploads are not retried.
+  """
+  @spec upload_and_extract(Config.t(), String.t(), keyword()) ::
+          {:ok, map()} | {:error, String.t()}
+  def upload_and_extract(config, file_path, _opts \\ []) do
+    case File.read(file_path) do
+      {:ok, body} ->
+        post_upload(config, file_path, body)
+
+      {:error, :enoent} ->
+        {:error, "File not found: #{file_path}"}
+
+      {:error, reason} ->
+        {:error, "Failed to read file: #{inspect(reason)}"}
+    end
+  end
+
+  @doc """
+  Check health status of MCP server.
+
+  Returns the decoded JSON body of `GET /health`. The health payload is not
+  required to carry a `"success"` flag, so any 200 response with valid JSON
+  counts as healthy.
+  """
+  @spec health_check(Config.t()) :: {:ok, map()} | {:error, String.t()}
+  def health_check(config) do
+    url = "http://#{config.host}:#{config.port}/health"
+
+    case HTTPoison.get(url, [], http_options(config)) do
+      {:ok, %HTTPoison.Response{status_code: 200, body: body}} ->
+        decode_json(body)
+
+      {:ok, response} ->
+        handle_response(response)
+
+      {:error, reason} ->
+        {:error, "Health check failed: #{inspect(reason)}"}
+    end
+  end
+
+  @doc """
+  Batch extract multiple documents with parallel requests.
+
+  Runs `extract_file/3` concurrently for every path and returns the
+  per-document results in input order. Individual failures do not abort the
+  batch; they appear as `{:error, reason}` entries in the list.
+  """
+  @spec batch_extract(Config.t(), [String.t()], keyword()) ::
+          {:ok, [{:ok, map()} | {:error, String.t()}]}
+  def batch_extract(config, file_paths, opts \\ []) do
+    total = length(file_paths)
+    Logger.info("Batch extracting #{total} documents")
+
+    results =
+      file_paths
+      # Task.async_stream/3 defaults to a 5 s per-task timeout, which would
+      # kill the caller before the HTTP timeouts and retries have run their
+      # course. Each request is already bounded by those, so wait for it.
+      |> Task.async_stream(&extract_file(config, &1, opts), timeout: :infinity)
+      |> Enum.map(fn
+        {:ok, result} -> result
+        # Only seen when the caller traps exits; keep the batch going.
+        {:exit, reason} -> {:error, "Extraction task exited: #{inspect(reason)}"}
+      end)
+
+    success_count = Enum.count(results, &match?({:ok, _}, &1))
+    Logger.info("Batch extraction complete: #{success_count}/#{total} succeeded")
+
+    {:ok, results}
+  end
+
+  # Private helpers
+
+  # Serves from the cache when possible, otherwise fetches and stores.
+  defp cached_extract(config, file_path, mime_type, extraction_config) do
+    cache_key = compute_cache_key(file_path, mime_type, extraction_config)
+
+    case get_from_cache(config.cache_dir, cache_key) do
+      {:ok, cached_result} ->
+        Logger.debug("Cache hit for #{file_path}")
+        {:ok, cached_result}
+
+      :miss ->
+        with {:ok, result} <-
+               fetch_from_server(config, file_path, mime_type, extraction_config) do
+          store_in_cache(config.cache_dir, cache_key, result)
+          {:ok, result}
+        end
+    end
+  end
+
+  # HTTPoison's `:timeout` only covers connecting; `:recv_timeout` bounds the
+  # wait for the response and would otherwise stay at the library default.
+  defp http_options(config) do
+    [timeout: config.timeout_ms, recv_timeout: config.timeout_ms]
+  end
+
+  defp post_upload(config, file_path, body) do
+    url = "http://#{config.host}:#{config.port}/extract/file"
+
+    headers = [
+      {"Content-Type", "application/octet-stream"},
+      {"X-File-Name", Path.basename(file_path)}
+    ]
+
+    case HTTPoison.post(url, body, headers, http_options(config)) do
+      {:ok, response} ->
+        handle_response(response)
+
+      {:error, reason} ->
+        Logger.error("Upload failed: #{inspect(reason)}")
+        {:error, "Upload failed: #{inspect(reason)}"}
+    end
+  end
+
+  defp fetch_from_server(config, file_path, mime_type, extraction_config) do
+    url = "http://#{config.host}:#{config.port}/extract"
+
+    body =
+      Jason.encode!(%{
+        file_path: file_path,
+        mime_type: mime_type,
+        config: extraction_config
+      })
+
+    headers = [{"Content-Type", "application/json"}]
+    request = fn -> HTTPoison.post(url, body, headers, http_options(config)) end
+
+    case retry_request(config, request) do
+      {:ok, response} -> handle_response(response)
+      {:error, _reason} = error -> error
+    end
+  end
+
+  # Runs `request_fn` up to `config.max_retries` times (always at least once),
+  # sleeping `config.retry_delay_ms` between attempts but not after the last.
+  defp retry_request(config, request_fn, attempt \\ 1) do
+    case request_fn.() do
+      {:ok, response} ->
+        {:ok, response}
+
+      {:error, reason} ->
+        Logger.warning("Request failed (attempt #{attempt}): #{inspect(reason)}")
+
+        if attempt < config.max_retries do
+          Process.sleep(config.retry_delay_ms)
+          retry_request(config, request_fn, attempt + 1)
+        else
+          {:error, "Max retries exceeded"}
+        end
+    end
+  end
+
+  # Interprets an extraction response: 200 + JSON with a truthy "success".
+  defp handle_response(%HTTPoison.Response{status_code: 200, body: body}) do
+    with {:ok, data} <- decode_json(body) do
+      interpret_payload(data)
+    end
+  end
+
+  defp handle_response(%HTTPoison.Response{status_code: status, body: body}) do
+    {:error, "Server error (#{status}): #{body}"}
+  end
+
+  defp interpret_payload(%{"success" => flag} = data) when flag not in [nil, false] do
+    {:ok, data}
+  end
+
+  defp interpret_payload(%{"error" => message}), do: {:error, message}
+  # Also covers JSON that is not an object (list, number, ...).
+  defp interpret_payload(_other), do: {:error, "Unknown error"}
+
+  defp decode_json(body) do
+    case Jason.decode(body) do
+      {:ok, data} -> {:ok, data}
+      {:error, reason} -> {:error, "Failed to decode response: #{inspect(reason)}"}
+    end
+  end
+
+  # The key covers the path, MIME type and config - not the file's bytes - so
+  # a file modified in place keeps hitting its old cache entry.
+  defp compute_cache_key(file_path, mime_type, config) do
+    content = "#{file_path}|#{mime_type}|#{inspect(config)}"
+    :crypto.hash(:sha256, content) |> Base.encode16(case: :lower)
+  end
+
+  # A missing, unreadable or corrupt entry is treated as a cache miss.
+  defp get_from_cache(cache_dir, cache_key) do
+    cache_file = Path.join(cache_dir, "#{cache_key}.json")
+
+    with {:ok, content} <- File.read(cache_file),
+         {:ok, data} <- Jason.decode(content) do
+      {:ok, data}
+    else
+      _missing_or_corrupt -> :miss
+    end
+  end
+
+  # Best effort: failing to write the cache must not fail an extraction that
+  # already succeeded, so problems are logged and swallowed.
+  defp store_in_cache(cache_dir, cache_key, result) do
+    cache_file = Path.join(cache_dir, "#{cache_key}.json")
+
+    with :ok <- File.mkdir_p(cache_dir),
+         {:ok, json} <- Jason.encode(result),
+         :ok <- File.write(cache_file, json) do
+      :ok
+    else
+      {:error, reason} ->
+        Logger.warning("Could not cache result in #{cache_file}: #{inspect(reason)}")
+        :ok
+    end
+  end
+end
+
+# Usage examples
+IO.puts("=== Kreuzberg MCP Client ===\n")
+
+# Client settings: local server, three attempts per request, on-disk cache
+client_opts = [
+  host: "localhost",
+  port: 8080,
+  timeout_ms: 30000,
+  max_retries: 3,
+  cache_dir: "/tmp/kreuzberg_cache"
+]
+
+config = KreuzbergMCPClient.Config.new(client_opts)
+
+# 1. Health check
+IO.puts("Checking server health...")
+
+with {:ok, health} <- KreuzbergMCPClient.health_check(config) do
+  IO.puts("Server status: #{health["status"]}")
+  IO.puts("Service: #{health["service"]}\n")
+else
+  {:error, reason} ->
+    IO.puts("Health check failed: #{reason}\n")
+end
+
+# 2. Single document, served from the cache when possible
+IO.puts("Extracting document...")
+
+with {:ok, result} <- KreuzbergMCPClient.extract_file(config, "document.pdf", use_cache: true) do
+  IO.puts("Success!")
+  IO.puts("Content size: #{byte_size(result["content"])} bytes")
+  IO.puts("MIME type: #{result["mime_type"]}")
+  IO.puts("Tables found: #{length(result["tables"])}")
+else
+  {:error, reason} ->
+    IO.puts("Extraction failed: #{reason}")
+end
+
+IO.puts("")
+
+# 3. Several documents in one batch
+IO.puts("Batch extracting multiple documents...")
+
+documents = ["doc1.pdf", "doc2.pdf", "doc3.pdf"]
+
+with {:ok, results} <- KreuzbergMCPClient.batch_extract(config, documents) do
+  IO.puts("Batch extraction complete!")
+  # Each entry is itself an {:ok, _} / {:error, _} tuple; count the successes
+  successful = length(for {:ok, _} <- results, do: :ok)
+  IO.puts("Successful: #{successful}/#{length(results)}")
+else
+  {:error, reason} ->
+    IO.puts("Batch extraction failed: #{reason}")
+end
+```
--- a/docs/snippets/elixir/mcp/mcp_server_start.exs
+++ b/docs/snippets/elixir/mcp/mcp_server_start.exs
@@ -0,0 +1,300 @@
+```elixir title="Elixir"
+# MCP Server Integration - Start a Kreuzberg MCP server
+# Demonstrates how to set up and manage an MCP server for remote document extraction
+
+defmodule KreuzbergMCPServer do
+  @moduledoc """
+  MCP (Model Context Protocol) server for Kreuzberg document extraction.
+
+  Starts a Cowboy HTTP listener that routes `/extract`, `/extract/file` and
+  `/health` to the handler modules under this namespace.
+  """
+
+  require Logger
+
+  # Name under which the Cowboy listener is registered.
+  @listener :kreuzberg_http
+
+  @doc """
+  Start the MCP server on the specified host and port.
+
+  Returns `{:ok, message}` once the listener is up, or `{:error, reason}` when
+  Cowboy cannot be started, the host cannot be resolved, or the listener
+  cannot bind (for example because the port is already in use).
+
+  ## Options
+
+    * `:host` - Address or host name to bind to (default: "127.0.0.1")
+    * `:port` - Server port (default: 8080)
+    * `:max_connections` - Maximum concurrent connections (default: 10)
+  """
+  @spec start_server(keyword()) :: {:ok, String.t()} | {:error, term()}
+  def start_server(opts \\ []) do
+    host = Keyword.get(opts, :host, "127.0.0.1")
+    port = Keyword.get(opts, :port, 8080)
+    max_connections = Keyword.get(opts, :max_connections, 10)
+
+    Logger.info("Starting Kreuzberg MCP server on #{host}:#{port}")
+
+    with {:ok, _apps} <- Application.ensure_all_started(:cowboy),
+         # Accepts dotted IPv4 strings as well as resolvable host names.
+         {:ok, ip} <- :inet.getaddr(String.to_charlist(host), :inet),
+         {:ok, _pid} <-
+           :cowboy.start_clear(
+             @listener,
+             %{socket_opts: [ip: ip, port: port], max_connections: max_connections},
+             %{env: %{dispatch: dispatch()}}
+           ) do
+      Logger.info("MCP server started successfully")
+      {:ok, "Server running on #{host}:#{port}"}
+    else
+      {:error, reason} ->
+        Logger.error("Failed to start MCP server: #{inspect(reason)}")
+        {:error, reason}
+    end
+  end
+
+  @doc """
+  Stop the MCP server gracefully.
+
+  Always returns `:ok`, including when no listener was running.
+  """
+  @spec stop_server() :: :ok
+  def stop_server do
+    Logger.info("Stopping Kreuzberg MCP server")
+
+    case :cowboy.stop_listener(@listener) do
+      :ok -> Logger.info("MCP server stopped")
+      {:error, :not_found} -> Logger.info("MCP server was not running")
+    end
+
+    :ok
+  end
+
+  # Cowboy requires routes to be compiled before they are placed in the
+  # listener's `env`; raw route lists are not usable at request time.
+  defp dispatch do
+    :cowboy_router.compile([
+      {:_,
+       [
+         {"/extract", KreuzbergMCPServer.Handler, []},
+         {"/extract/file", KreuzbergMCPServer.FileHandler, []},
+         {"/health", KreuzbergMCPServer.HealthHandler, []}
+       ]}
+    ])
+  end
+end
+
+# Handler for extraction requests
+defmodule KreuzbergMCPServer.Handler do
+  @moduledoc """
+  HTTP handler for MCP extraction requests.
+
+  Expects a `POST` whose body is a JSON object with a required `"file_path"`
+  and optional `"mime_type"` and `"config"` entries, and replies with a JSON
+  document carrying either the extraction result or an error message.
+
+  NOTE(review): `"file_path"` is passed to `Kreuzberg.extract_file/3` as
+  received, so a client can request any file the server process can read.
+  Restrict accepted paths before exposing this endpoint to untrusted callers.
+  """
+
+  require Logger
+
+  # Requested cap for the JSON request body; larger requests get a 413.
+  @max_body_bytes 1_048_576
+  @json_headers %{"content-type" => "application/json"}
+
+  @doc """
+  Cowboy handler entry point. Always replies and returns `{:ok, req, state}`.
+  """
+  def init(req, state) do
+    req =
+      case :cowboy_req.method(req) do
+        "POST" -> handle_extraction(req)
+        _other -> error_response(req, 405, "Method not allowed")
+      end
+
+    {:ok, req, state}
+  end
+
+  defp handle_extraction(req) do
+    case read_limited_body(req) do
+      {:ok, body, req} ->
+        case Jason.decode(body) do
+          {:ok, params} when is_map(params) ->
+            extract_from_params(params, req)
+
+          {:ok, _not_an_object} ->
+            error_response(req, 400, "Request body must be a JSON object")
+
+          {:error, reason} ->
+            error_response(req, 400, "Invalid JSON: #{inspect(reason)}")
+        end
+
+      {:error, :body_too_large, req} ->
+        error_response(req, 413, "Request body too large")
+    end
+  end
+
+  # read_body answers {:more, ...} when the body exceeds the requested
+  # length; for this endpoint that is treated as an oversized request.
+  defp read_limited_body(req) do
+    case :cowboy_req.read_body(req, %{length: @max_body_bytes}) do
+      {:ok, body, req} -> {:ok, body, req}
+      {:more, _partial, req} -> {:error, :body_too_large, req}
+    end
+  end
+
+  defp extract_from_params(%{"file_path" => file_path} = params, req)
+       when is_binary(file_path) do
+    mime_type = Map.get(params, "mime_type")
+    config = build_config(Map.get(params, "config", %{}))
+
+    case Kreuzberg.extract_file(file_path, mime_type, config) do
+      {:ok, result} ->
+        json_reply(req, 200, %{
+          success: true,
+          content: result.content,
+          mime_type: result.mime_type,
+          # Optional fields are normalised so clients always get a collection
+          metadata: result.metadata || %{},
+          tables: result.tables || [],
+          chunks: result.chunks || [],
+          images: result.images || [],
+          detected_languages: result.detected_languages || []
+        })
+
+      {:error, reason} ->
+        Logger.warning("Extraction failed for #{file_path}: #{inspect(reason)}")
+        error_response(req, 400, "Extraction failed: #{inspect(reason)}")
+    end
+  end
+
+  defp extract_from_params(_params, req) do
+    error_response(req, 400, "Missing required parameter: file_path")
+  end
+
+  # Builds an extraction config from the request's "config" object; anything
+  # that is not a map (including JSON null) means "no config".
+  defp build_config(opts) when is_map(opts) do
+    %Kreuzberg.ExtractionConfig{
+      ocr: opts["ocr"],
+      chunking: opts["chunking"],
+      quality_processing: opts["quality_processing"],
+      language_detection: opts["language_detection"],
+      images: opts["images"],
+      use_cache: Map.get(opts, "use_cache", true)
+    }
+  end
+
+  defp build_config(_), do: nil
+
+  defp error_response(req, status, message) do
+    json_reply(req, status, %{success: false, error: message})
+  end
+
+  # Sends `data` as a JSON response and returns the updated request.
+  defp json_reply(req, status, data) do
+    :cowboy_req.reply(status, @json_headers, Jason.encode!(data), req)
+  end
+end
+
+# Health check handler
+defmodule KreuzbergMCPServer.HealthHandler do
+  @moduledoc """
+  Health check endpoint for the MCP server.
+
+  Replies with a small JSON document: a fixed status and service name plus
+  the current UTC time in ISO 8601 format.
+  """
+
+  @doc """
+  Cowboy handler entry point; answers every request with HTTP 200.
+  """
+  def init(req, state) do
+    payload = %{
+      status: "healthy",
+      service: "kreuzberg-mcp",
+      timestamp: DateTime.to_iso8601(DateTime.utc_now())
+    }
+
+    headers = %{"content-type" => "application/json"}
+    replied = :cowboy_req.reply(200, headers, Jason.encode!(payload), req)
+
+    {:ok, replied, state}
+  end
+end
+
+# File upload handler
+defmodule KreuzbergMCPServer.FileHandler do
+  @moduledoc """
+  Handler for raw file uploads for extraction.
+
+  The request body is taken as the file's bytes (not multipart). It is
+  streamed to a temporary file, extracted, and the temporary file is removed
+  again whether or not extraction succeeds.
+  """
+
+  require Logger
+
+  @json_headers %{"content-type" => "application/json"}
+
+  @doc """
+  Cowboy handler entry point. Always replies and returns `{:ok, req, state}`.
+  """
+  def init(req, state) do
+    req =
+      case :cowboy_req.method(req) do
+        "POST" -> handle_file_upload(req)
+        _other -> :cowboy_req.reply(405, %{}, "Method not allowed", req)
+      end
+
+    {:ok, req, state}
+  end
+
+  defp handle_file_upload(req) do
+    temp_path = temp_path_for(req)
+
+    try do
+      case save_body(req, temp_path) do
+        {:ok, req} ->
+          extract_and_reply(req, temp_path)
+
+        {:error, reason} ->
+          Logger.error("Could not store upload at #{temp_path}: #{inspect(reason)}")
+          json_reply(req, 500, %{success: false, error: "Failed to store upload"})
+      end
+    after
+      # Runs on every path, including when extraction raises.
+      File.rm(temp_path)
+    end
+  end
+
+  defp extract_and_reply(req, path) do
+    case Kreuzberg.extract_file(path) do
+      {:ok, result} ->
+        json_reply(req, 200, %{
+          success: true,
+          content_size: byte_size(result.content),
+          mime_type: result.mime_type,
+          metadata: result.metadata
+        })
+
+      {:error, reason} ->
+        json_reply(req, 400, %{success: false, error: inspect(reason)})
+    end
+  end
+
+  # Streams the request body into `path`. Returns {:ok, req} with the updated
+  # request, or {:error, reason} when the file cannot be opened.
+  defp save_body(req, path) do
+    File.open(path, [:write, :binary], &write_body(req, &1))
+  end
+
+  # read_body delivers large bodies in pieces ({:more, ...}); each piece is
+  # written immediately so the upload is never held in memory as a whole.
+  defp write_body(req, device) do
+    case :cowboy_req.read_body(req) do
+      {:ok, data, req} ->
+        :ok = IO.binwrite(device, data)
+        req
+
+      {:more, data, req} ->
+        :ok = IO.binwrite(device, data)
+        write_body(req, device)
+    end
+  end
+
+  defp temp_path_for(req) do
+    file_name = "kreuzberg_#{System.unique_integer([:positive])}#{upload_extension(req)}"
+    Path.join(System.tmp_dir!(), file_name)
+  end
+
+  # Keeps the extension of the client-supplied X-File-Name, presumably needed
+  # for MIME detection since no MIME type is passed to extract_file (TODO
+  # confirm). Only a short alphanumeric extension is accepted, so the header
+  # cannot influence where the file is written.
+  defp upload_extension(req) do
+    extension =
+      "x-file-name"
+      |> :cowboy_req.header(req, "")
+      |> Path.basename()
+      |> Path.extname()
+
+    if extension =~ ~r/\A\.[A-Za-z0-9]{1,16}\z/, do: extension, else: ""
+  end
+
+  # Sends `data` as a JSON response and returns the updated request.
+  defp json_reply(req, status, data) do
+    :cowboy_req.reply(status, @json_headers, Jason.encode!(data), req)
+  end
+end
+
+# Usage example - start the server and block until interrupted
+IO.puts("=== Kreuzberg MCP Server ===\n")
+
+with {:ok, message} <- KreuzbergMCPServer.start_server(port: 8080) do
+  IO.puts(message)
+
+  Enum.each(
+    [
+      "\nServer is running and ready to accept requests:",
+      "  - POST /extract - Extract from file path",
+      "  - POST /extract/file - Upload and extract",
+      "  - GET /health - Health check",
+      "\nServer started. Press Ctrl+C to stop."
+    ],
+    &IO.puts/1
+  )
+
+  # Block the script process so the listener keeps serving
+  Process.sleep(:infinity)
+else
+  {:error, reason} ->
+    IO.puts("Failed to start server: #{inspect(reason)}")
+end
+```
--- a/Show More
+++ b/Show More