This commit is contained in:
53
docs/snippets/elixir/advanced/10_chunking_config.exs
Normal file
53
docs/snippets/elixir/advanced/10_chunking_config.exs
Normal file
@@ -0,0 +1,53 @@
|
||||
# Advanced Chunking Configuration
|
||||
# This example shows how to configure sophisticated document chunking strategies
|
||||
# with fine-grained control over chunk size, overlap, and boundary respect.
|
||||
|
||||
alias Kreuzberg.ExtractionConfig
|
||||
|
||||
# Advanced chunking configuration with multiple parameters
|
||||
config = %ExtractionConfig{
|
||||
chunking: %{
|
||||
"enabled" => true,
|
||||
"max_characters" => 1000,
|
||||
"overlap" => 200,
|
||||
"min_size" => 100,
|
||||
"respect_boundaries" => true,
|
||||
"split_on" => ["sentence", "paragraph"]
|
||||
}
|
||||
}
|
||||
|
||||
# Use the configuration for extraction
|
||||
{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)
|
||||
|
||||
# Process the chunked results
|
||||
chunks = result.chunks || []
|
||||
|
||||
IO.puts("Total chunks: #{length(chunks)}")
|
||||
|
||||
Enum.each(chunks, fn chunk ->
|
||||
IO.inspect(%{
|
||||
text_length: String.length(chunk["content"]),
|
||||
page: chunk["page"],
|
||||
boundaries_respected: !String.ends_with?(chunk["content"], [" ", "\n"])
|
||||
})
|
||||
end)
|
||||
|
||||
# Prepend heading context to chunk content
|
||||
config_with_headings = %ExtractionConfig{
|
||||
chunking: %{
|
||||
"enabled" => true,
|
||||
"chunker_type" => "markdown",
|
||||
"prepend_heading_context" => true
|
||||
}
|
||||
}
|
||||
|
||||
{:ok, result_headings} = Kreuzberg.extract_file("document.md", nil, config_with_headings)
|
||||
|
||||
if result_headings.chunks do
|
||||
IO.puts("Generated #{length(result_headings.chunks)} chunks with prepended headings")
|
||||
|
||||
Enum.each(result_headings.chunks, fn chunk ->
|
||||
# Each chunk's content is prefixed with its heading breadcrumb
|
||||
IO.puts("Chunk preview: #{String.slice(chunk["content"], 0..80)}...")
|
||||
end)
|
||||
end
|
||||
@@ -0,0 +1,36 @@
|
||||
# Advanced Language Detection Configuration
|
||||
# This example demonstrates how to configure language detection with multiple
|
||||
# parameters for detecting and tracking all languages in documents.
|
||||
|
||||
alias Kreuzberg.ExtractionConfig
|
||||
|
||||
# Advanced language detection configuration
|
||||
config = %ExtractionConfig{
|
||||
language_detection: %{
|
||||
"enabled" => true,
|
||||
"detect_all" => true,
|
||||
"min_confidence" => 0.8
|
||||
}
|
||||
}
|
||||
|
||||
# Extract file with language detection enabled
|
||||
{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)
|
||||
|
||||
# Process the detection results
|
||||
detection_results = result.detected_languages || []
|
||||
|
||||
IO.puts("Detected languages:")
|
||||
|
||||
Enum.each(detection_results, fn lang_result ->
|
||||
confidence = Map.get(lang_result, "confidence", "unknown")
|
||||
language = Map.get(lang_result, "language", "unknown")
|
||||
IO.puts(" - #{language}: #{confidence}")
|
||||
end)
|
||||
|
||||
# Filter results by minimum confidence threshold
|
||||
high_confidence_languages = Enum.filter(detection_results, fn lang_result ->
|
||||
confidence = Map.get(lang_result, "confidence", 0)
|
||||
confidence >= 0.8
|
||||
end)
|
||||
|
||||
IO.puts("\nHigh confidence languages (>= 0.8): #{length(high_confidence_languages)}")
|
||||
@@ -0,0 +1,22 @@
|
||||
# Advanced keyword extraction configuration
|
||||
alias Kreuzberg.ExtractionConfig
|
||||
|
||||
config = %ExtractionConfig{
|
||||
keyword_extraction: %{
|
||||
"enabled" => true,
|
||||
"max_keywords" => 20,
|
||||
"min_score" => 0.6,
|
||||
"algorithm" => "tfidf"
|
||||
}
|
||||
}
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file("research_paper.pdf", nil, config)
|
||||
|
||||
if result.keywords do
|
||||
# Group by score ranges
|
||||
high_score = Enum.filter(result.keywords, fn kw -> kw["score"] >= 0.8 end)
|
||||
medium_score = Enum.filter(result.keywords, fn kw -> kw["score"] >= 0.6 and kw["score"] < 0.8 end)
|
||||
|
||||
IO.puts("High confidence keywords (#{length(high_score)}):")
|
||||
Enum.each(high_score, fn kw -> IO.puts(" - #{kw["word"]} (#{kw["score"]})") end)
|
||||
end
|
||||
@@ -0,0 +1,39 @@
|
||||
# Quality processing configuration
|
||||
alias Kreuzberg.ExtractionConfig
|
||||
|
||||
config = %ExtractionConfig{
|
||||
quality_processing: %{
|
||||
"enabled" => true,
|
||||
"min_quality_score" => 0.75,
|
||||
"remove_noise" => true,
|
||||
"enhance_clarity" => true
|
||||
},
|
||||
ocr: %{
|
||||
"enabled" => true,
|
||||
"backend" => "tesseract"
|
||||
}
|
||||
}
|
||||
|
||||
# Extract file with quality processing and OCR
|
||||
{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)
|
||||
|
||||
# Process the quality-processed results
|
||||
IO.puts("=== Quality Processing Results ===\n")
|
||||
|
||||
# Display extracted content with quality processing applied
|
||||
# Take exactly 300 graphemes to match the "first 300 chars" label printed below
# (the inclusive range 0..300 would return 301).
content_preview = String.slice(result.content, 0, 300)
|
||||
IO.puts("Extracted Content (first 300 chars):")
|
||||
IO.puts(content_preview)
|
||||
IO.puts("\nTotal content size: #{byte_size(result.content)} bytes")
|
||||
|
||||
# Check quality metrics
|
||||
quality_score = result.quality_score
|
||||
if quality_score && quality_score > 0 do
|
||||
IO.puts("\nQuality Score: #{quality_score}")
|
||||
IO.puts("Quality Status: #{if quality_score >= 0.75, do: "Acceptable", else: "Below threshold"}")
|
||||
end
|
||||
|
||||
# Display any OCR results if applicable
|
||||
if result.images && length(result.images) > 0 do
|
||||
IO.puts("\nImages found: #{length(result.images)}")
|
||||
end
|
||||
21
docs/snippets/elixir/advanced/14_token_reduction_config.exs
Normal file
21
docs/snippets/elixir/advanced/14_token_reduction_config.exs
Normal file
@@ -0,0 +1,21 @@
|
||||
# Token reduction for different LLM context windows
|
||||
alias Kreuzberg.ExtractionConfig
|
||||
|
||||
# For GPT-4
|
||||
config_gpt4 = %ExtractionConfig{
|
||||
token_reduction: %{
|
||||
"enabled" => true,
|
||||
"target_tokens" => 8000,
|
||||
"strategy" => "intelligent",
|
||||
"preserve_structure" => true
|
||||
}
|
||||
}
|
||||
|
||||
# For Claude
|
||||
config_claude = %ExtractionConfig{
|
||||
token_reduction: %{
|
||||
"enabled" => true,
|
||||
"target_tokens" => 100000,
|
||||
"strategy" => "minimal"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,33 @@
|
||||
# Vector Database Integration
|
||||
# This example demonstrates how to prepare document chunks for integration with vector databases
|
||||
# by configuring chunking and processing the extracted content.
|
||||
|
||||
alias Kreuzberg.ExtractionConfig
|
||||
|
||||
# Configure extraction with chunking enabled
|
||||
config = %ExtractionConfig{
|
||||
chunking: %{
|
||||
"enabled" => true,
|
||||
"max_characters" => 512,
|
||||
"overlap" => 50
|
||||
}
|
||||
}
|
||||
|
||||
# Extract file with chunking
|
||||
{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)
|
||||
|
||||
# Prepare chunks for vector database ingestion
|
||||
documents = Enum.map(result.chunks || [], fn chunk ->
|
||||
%{
|
||||
content: chunk["content"],
|
||||
metadata: %{
|
||||
page: chunk["page"],
|
||||
char_count: String.length(chunk["content"])
|
||||
}
|
||||
}
|
||||
end)
|
||||
|
||||
IO.puts("Prepared #{length(documents)} documents for vector DB")
|
||||
|
||||
# The documents list can now be sent to your vector database
|
||||
# Example: documents |> MyVectorDB.index_documents()
|
||||
13
docs/snippets/elixir/advanced/chunk_page_mapping.exs
Normal file
13
docs/snippets/elixir/advanced/chunk_page_mapping.exs
Normal file
@@ -0,0 +1,13 @@
|
||||
# Extract with chunking and track page boundaries
|
||||
config = %Kreuzberg.ExtractionConfig{
|
||||
chunking: %{"enabled" => true, "max_characters" => 500},
|
||||
track_page_boundaries: true
|
||||
}
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)
|
||||
|
||||
# Map each chunk to its source page
|
||||
Enum.with_index(result.chunks || [], 1) |> Enum.each(fn {chunk, idx} ->
|
||||
page = chunk["page"] || "unknown"
|
||||
IO.puts("Chunk #{idx} from page #{page}")
|
||||
end)
|
||||
23
docs/snippets/elixir/advanced/chunk_page_mapping.md
Normal file
23
docs/snippets/elixir/advanced/chunk_page_mapping.md
Normal file
@@ -0,0 +1,23 @@
|
||||
```elixir title="Elixir"
|
||||
config_json = Jason.encode!(%{
|
||||
"chunking" => %{
|
||||
"enabled" => true,
|
||||
"max_characters" => 1024,
|
||||
"overlap" => 128
|
||||
}
|
||||
})
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
|
||||
|
||||
# Map chunks to their source pages
|
||||
# Fall back to [] so the Enum pipeline does not raise if chunks is nil.
chunks_with_pages = (result.chunks || [])
|
||||
|> Enum.map(fn chunk ->
|
||||
%{
|
||||
"chunk_id" => chunk["id"],
|
||||
"content" => chunk["content"],
|
||||
"page_number" => chunk["page"]
|
||||
}
|
||||
end)
|
||||
|
||||
IO.inspect(chunks_with_pages, label: "Chunks with Page Mapping")
|
||||
```
|
||||
45
docs/snippets/elixir/advanced/chunking_config.md
Normal file
45
docs/snippets/elixir/advanced/chunking_config.md
Normal file
@@ -0,0 +1,45 @@
|
||||
```elixir title="Elixir"
|
||||
config_json = Jason.encode!(%{
|
||||
"chunking" => %{
|
||||
"enabled" => true,
|
||||
"max_characters" => 1000,
|
||||
"overlap" => 200,
|
||||
"min_size" => 100,
|
||||
"respect_boundaries" => true,
|
||||
"split_on" => ["sentence", "paragraph"]
|
||||
}
|
||||
})
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
|
||||
|
||||
chunks = result.chunks || []
|
||||
IO.puts("Total chunks: #{length(chunks)}")
|
||||
|
||||
Enum.each(chunks, fn chunk ->
|
||||
IO.inspect(%{
|
||||
text_length: String.length(chunk["content"]),
|
||||
page: chunk["page"],
|
||||
boundaries_respected: !String.ends_with?(chunk["content"], [" ", "\n"])
|
||||
})
|
||||
end)
|
||||
```
|
||||
|
||||
```elixir title="Elixir - Prepend Heading Context"
|
||||
config_json = Jason.encode!(%{
|
||||
"chunking" => %{
|
||||
"enabled" => true,
|
||||
"chunker_type" => "markdown",
|
||||
"prepend_heading_context" => true
|
||||
}
|
||||
})
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file_sync("document.md", "text/markdown", config_json)
|
||||
|
||||
if result.chunks do
|
||||
IO.puts("Generated #{length(result.chunks)} chunks with prepended headings")
|
||||
|
||||
Enum.each(result.chunks, fn chunk ->
|
||||
IO.puts("Chunk preview: #{String.slice(chunk["content"], 0..80)}...")
|
||||
end)
|
||||
end
|
||||
```
|
||||
26
docs/snippets/elixir/advanced/chunking_rag.exs
Normal file
26
docs/snippets/elixir/advanced/chunking_rag.exs
Normal file
@@ -0,0 +1,26 @@
|
||||
# Configure chunking for RAG/vector search
|
||||
config = %Kreuzberg.ExtractionConfig{
|
||||
chunking: %{
|
||||
"enabled" => true,
|
||||
"max_characters" => 512,
|
||||
"overlap" => 50,
|
||||
"respect_boundaries" => true
|
||||
}
|
||||
}
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)
|
||||
|
||||
# Prepare chunks for vector embedding and search
|
||||
# Fall back to [] so the Enum pipeline does not raise if chunks is nil.
chunks_for_embedding = (result.chunks || [])
|
||||
|> Enum.map(fn chunk ->
|
||||
%{
|
||||
"id" => chunk["id"],
|
||||
"content" => chunk["content"],
|
||||
"metadata" => %{
|
||||
"page" => chunk["page"],
|
||||
"source" => "document.pdf"
|
||||
}
|
||||
}
|
||||
end)
|
||||
|
||||
IO.inspect(chunks_for_embedding, label: "Chunks Ready for RAG")
|
||||
27
docs/snippets/elixir/advanced/chunking_rag.md
Normal file
27
docs/snippets/elixir/advanced/chunking_rag.md
Normal file
@@ -0,0 +1,27 @@
|
||||
```elixir title="Elixir"
|
||||
config_json = Jason.encode!(%{
|
||||
"chunking" => %{
|
||||
"enabled" => true,
|
||||
"max_characters" => 512,
|
||||
"overlap" => 50,
|
||||
"respect_boundaries" => true
|
||||
}
|
||||
})
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
|
||||
|
||||
# Prepare chunks for vector embedding and search
|
||||
# Fall back to [] so the Enum pipeline does not raise if chunks is nil.
chunks_for_embedding = (result.chunks || [])
|
||||
|> Enum.map(fn chunk ->
|
||||
%{
|
||||
"id" => chunk["id"],
|
||||
"content" => chunk["content"],
|
||||
"metadata" => %{
|
||||
"page" => chunk["page"],
|
||||
"source" => "document.pdf"
|
||||
}
|
||||
}
|
||||
end)
|
||||
|
||||
IO.inspect(chunks_for_embedding, label: "Chunks Ready for RAG")
|
||||
```
|
||||
20
docs/snippets/elixir/advanced/combining_all_features.exs
Normal file
20
docs/snippets/elixir/advanced/combining_all_features.exs
Normal file
@@ -0,0 +1,20 @@
|
||||
# Use all major features together
|
||||
config = %Kreuzberg.ExtractionConfig{
|
||||
ocr: %{"enabled" => true},
|
||||
chunking: %{"enabled" => true, "max_characters" => 1000},
|
||||
images: %{"extract" => true},
|
||||
language_detection: %{"enabled" => true},
|
||||
keyword_extraction: %{"enabled" => true}
|
||||
}
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)
|
||||
|
||||
# Process results with all extracted features
|
||||
IO.inspect(result, label: "Extraction Result")
|
||||
|
||||
# Access different feature outputs
|
||||
if result.content, do: IO.puts("Text: #{String.slice(result.content, 0..100)}")
|
||||
if result.detected_languages, do: IO.puts("Language: #{inspect(result.detected_languages)}")
|
||||
if result.metadata["keywords"], do: IO.puts("Keywords: #{inspect(result.metadata["keywords"])}")
|
||||
if result.chunks, do: IO.puts("Chunks: #{length(result.chunks)}")
|
||||
if result.images, do: IO.puts("Images: #{length(result.images)}")
|
||||
103
docs/snippets/elixir/advanced/complete_example.exs
Normal file
103
docs/snippets/elixir/advanced/complete_example.exs
Normal file
@@ -0,0 +1,103 @@
|
||||
```elixir title="Elixir"
|
||||
alias Kreuzberg.ExtractionConfig
|
||||
|
||||
# Build comprehensive configuration for all features
|
||||
config = %ExtractionConfig{
|
||||
# Enable OCR for scanned documents
|
||||
ocr: %{
|
||||
"enabled" => true,
|
||||
"backend" => "tesseract",
|
||||
"language" => "eng",
|
||||
"force_all_pages" => false
|
||||
},
|
||||
# Configure chunking for RAG applications
|
||||
chunking: %{
|
||||
"enabled" => true,
|
||||
"max_characters" => 1000,
|
||||
"overlap" => 100
|
||||
},
|
||||
# Extract images from documents
|
||||
images: %{
|
||||
"extract" => true
|
||||
},
|
||||
# Enable language detection
|
||||
language_detection: %{
|
||||
"enabled" => true
|
||||
},
|
||||
# Use caching for performance
|
||||
use_cache: true
|
||||
}
|
||||
|
||||
# Extract file with full configuration
|
||||
case Kreuzberg.extract_file("document.pdf", nil, config) do
|
||||
{:ok, result} ->
|
||||
IO.puts("=== Extraction Successful ===\n")
|
||||
|
||||
# 1. Process content
|
||||
IO.puts("--- Content ---")
|
||||
content_preview = String.slice(result.content, 0..100)
|
||||
IO.puts("Preview: #{content_preview}...")
|
||||
IO.puts("Total size: #{byte_size(result.content)} bytes\n")
|
||||
|
||||
# 2. Process metadata
|
||||
IO.puts("--- Metadata ---")
|
||||
IO.puts("MIME type: #{result.mime_type}")
|
||||
metadata = result.metadata || %{}
|
||||
IO.puts("Metadata keys: #{inspect(Map.keys(metadata))}")
|
||||
|
||||
# Handle PDF-specific metadata
|
||||
case metadata["pdf"] do
|
||||
pdf_meta when is_map(pdf_meta) ->
|
||||
IO.puts(" Pages: #{pdf_meta["page_count"]}")
|
||||
IO.puts(" Author: #{pdf_meta["author"]}")
|
||||
IO.puts(" Title: #{pdf_meta["title"]}")
|
||||
_ -> nil
|
||||
end
|
||||
IO.puts("")
|
||||
|
||||
# 3. Process tables
|
||||
IO.puts("--- Tables ---")
|
||||
tables = result.tables || []
|
||||
IO.puts("Total tables: #{length(tables)}")
|
||||
Enum.with_index(tables, 1) |> Enum.each(fn {table, idx} ->
|
||||
cells = table["cells"] || []
|
||||
IO.puts(" Table #{idx}: #{length(cells)} rows")
|
||||
markdown = table["markdown"]
|
||||
if markdown, do: IO.puts(" Markdown: #{String.slice(markdown, 0..50)}...")
|
||||
end)
|
||||
IO.puts("")
|
||||
|
||||
# 4. Process chunks for RAG
|
||||
IO.puts("--- Chunks ---")
|
||||
chunks = result.chunks || []
|
||||
IO.puts("Total chunks: #{length(chunks)}")
|
||||
Enum.with_index(chunks, 1) |> Enum.each(fn {chunk, idx} ->
|
||||
# Each chunk is a map; measure its text content (byte_size/1 raises on a map).
IO.puts(" Chunk #{idx}: #{byte_size(chunk["content"])} bytes")
|
||||
end)
|
||||
IO.puts("")
|
||||
|
||||
# 5. Process detected languages
|
||||
IO.puts("--- Language Detection ---")
|
||||
languages = result.detected_languages || []
|
||||
if Enum.empty?(languages) do
|
||||
IO.puts("No languages detected")
|
||||
else
|
||||
Enum.each(languages, fn lang ->
|
||||
IO.puts(" Language: #{lang}")
|
||||
end)
|
||||
end
|
||||
IO.puts("")
|
||||
|
||||
# 6. Process images
|
||||
IO.puts("--- Images ---")
|
||||
images = result.images || []
|
||||
IO.puts("Total images: #{length(images)}")
|
||||
Enum.with_index(images, 1) |> Enum.each(fn {image, idx} ->
|
||||
IO.puts(" Image #{idx}: #{image["format"]} (#{image["size"]} bytes)")
|
||||
end)
|
||||
|
||||
{:error, reason} ->
|
||||
IO.puts("Extraction failed!")
|
||||
IO.puts("Error: #{inspect(reason)}")
|
||||
end
|
||||
```
|
||||
27
docs/snippets/elixir/advanced/embedding_with_chunking.exs
Normal file
27
docs/snippets/elixir/advanced/embedding_with_chunking.exs
Normal file
@@ -0,0 +1,27 @@
|
||||
# Extract and prepare for embeddings
|
||||
config = %Kreuzberg.ExtractionConfig{
|
||||
chunking: %{"enabled" => true, "max_characters" => 512},
|
||||
embeddings: %{"enabled" => true}
|
||||
}
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)
|
||||
|
||||
# Process chunks with embeddings for semantic search
|
||||
# Fall back to [] so the Enum pipeline does not raise if chunks is nil.
embedded_chunks = (result.chunks || [])
|
||||
|> Enum.with_index(1)
|
||||
|> Enum.map(fn {chunk, idx} ->
|
||||
%{
|
||||
"chunk_id" => idx,
|
||||
"content" => chunk["content"],
|
||||
"embedding" => chunk["embedding"],
|
||||
"page" => chunk["page"],
|
||||
"metadata" => %{
|
||||
"document" => "document.pdf",
|
||||
"chunk_index" => idx
|
||||
}
|
||||
}
|
||||
end)
|
||||
|
||||
# Store embeddings in vector database
|
||||
IO.puts("Prepared #{length(embedded_chunks)} chunks with embeddings")
|
||||
IO.inspect(embedded_chunks, label: "Embedded Chunks")
|
||||
32
docs/snippets/elixir/advanced/embedding_with_chunking.md
Normal file
32
docs/snippets/elixir/advanced/embedding_with_chunking.md
Normal file
@@ -0,0 +1,32 @@
|
||||
```elixir title="Elixir"
|
||||
config_json = Jason.encode!(%{
|
||||
"chunking" => %{
|
||||
"enabled" => true,
|
||||
"max_characters" => 512
|
||||
},
|
||||
"embeddings" => %{
|
||||
"enabled" => true
|
||||
}
|
||||
})
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
|
||||
|
||||
# Process chunks with embeddings for semantic search
|
||||
# Fall back to [] so the Enum pipeline does not raise if chunks is nil.
embedded_chunks = (result.chunks || [])
|
||||
|> Enum.with_index(1)
|
||||
|> Enum.map(fn {chunk, idx} ->
|
||||
%{
|
||||
"chunk_id" => idx,
|
||||
"content" => chunk["content"],
|
||||
"embedding" => chunk["embedding"],
|
||||
"page" => chunk["page"],
|
||||
"metadata" => %{
|
||||
"document" => "document.pdf",
|
||||
"chunk_index" => idx
|
||||
}
|
||||
}
|
||||
end)
|
||||
|
||||
IO.puts("Prepared #{length(embedded_chunks)} chunks with embeddings")
|
||||
IO.inspect(embedded_chunks, label: "Embedded Chunks")
|
||||
```
|
||||
15
docs/snippets/elixir/advanced/keyword_extraction_config.md
Normal file
15
docs/snippets/elixir/advanced/keyword_extraction_config.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```elixir title="Elixir"
|
||||
config_json = Jason.encode!(%{
|
||||
"keywords" => %{
|
||||
"algorithm" => "Yake",
|
||||
"max_keywords" => 10,
|
||||
"min_score" => 0.3
|
||||
}
|
||||
})
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file_sync("research_paper.pdf", "application/pdf", config_json)
|
||||
|
||||
if result.keywords do
|
||||
IO.puts("Keywords: #{inspect(result.keywords)}")
|
||||
end
|
||||
```
|
||||
30
docs/snippets/elixir/advanced/keyword_extraction_example.exs
Normal file
30
docs/snippets/elixir/advanced/keyword_extraction_example.exs
Normal file
@@ -0,0 +1,30 @@
|
||||
```elixir title="Elixir"
|
||||
alias Kreuzberg.ExtractionConfig
|
||||
|
||||
# Extract keywords from document
|
||||
config = %ExtractionConfig{
|
||||
keyword_extraction: %{
|
||||
"enabled" => true,
|
||||
"max_keywords" => 15
|
||||
}
|
||||
}
|
||||
|
||||
case Kreuzberg.extract_file("article.pdf", nil, config) do
|
||||
{:ok, result} ->
|
||||
IO.puts("=== Keyword Extraction ===\n")
|
||||
|
||||
# Display extracted keywords
|
||||
if result.keywords do
|
||||
IO.puts("Extracted keywords:")
|
||||
Enum.each(result.keywords, fn kw ->
|
||||
IO.puts("- #{kw["word"]}: #{kw["score"]}")
|
||||
end)
|
||||
else
|
||||
IO.puts("No keywords extracted")
|
||||
end
|
||||
|
||||
{:error, reason} ->
|
||||
IO.puts("Extraction failed!")
|
||||
IO.puts("Error: #{inspect(reason)}")
|
||||
end
|
||||
```
|
||||
18
docs/snippets/elixir/advanced/keyword_extraction_example.md
Normal file
18
docs/snippets/elixir/advanced/keyword_extraction_example.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```elixir title="Elixir"
|
||||
config_json = Jason.encode!(%{
|
||||
"keywords" => %{
|
||||
"algorithm" => "Yake",
|
||||
"max_keywords" => 10,
|
||||
"min_score" => 0.3
|
||||
}
|
||||
})
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file_sync("research_paper.pdf", "application/pdf", config_json)
|
||||
|
||||
if result.keywords do
|
||||
result.keywords
|
||||
|> Enum.each(fn %{"keyword" => kw, "score" => score} ->
|
||||
IO.puts("#{kw}: #{Float.round(score, 4)}")
|
||||
end)
|
||||
end
|
||||
```
|
||||
15
docs/snippets/elixir/advanced/language_detection_config.md
Normal file
15
docs/snippets/elixir/advanced/language_detection_config.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```elixir title="Elixir"
|
||||
config_json = Jason.encode!(%{
|
||||
"language_detection" => %{
|
||||
"enabled" => true,
|
||||
"min_confidence" => 0.8,
|
||||
"detect_multiple" => false
|
||||
}
|
||||
})
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
|
||||
|
||||
if result.language do
|
||||
IO.puts("Detected language: #{result.language}")
|
||||
end
|
||||
```
|
||||
@@ -0,0 +1,32 @@
|
||||
```elixir title="Elixir"
|
||||
alias Kreuzberg.ExtractionConfig
|
||||
|
||||
# Detect all languages in multilingual document
|
||||
config = %ExtractionConfig{
|
||||
language_detection: %{
|
||||
"enabled" => true,
|
||||
"detect_all" => true
|
||||
}
|
||||
}
|
||||
|
||||
case Kreuzberg.extract_file("multilingual.pdf", nil, config) do
|
||||
{:ok, result} ->
|
||||
IO.puts("=== Language Detection ===\n")
|
||||
|
||||
# Display detected languages
|
||||
languages = result.detected_languages || []
|
||||
if Enum.empty?(languages) do
|
||||
IO.puts("No languages detected")
|
||||
else
|
||||
IO.puts("Detected languages:")
|
||||
Enum.each(languages, fn lang ->
|
||||
IO.puts("- #{lang}")
|
||||
end)
|
||||
IO.puts("\nTotal languages: #{length(languages)}")
|
||||
end
|
||||
|
||||
{:error, reason} ->
|
||||
IO.puts("Extraction failed!")
|
||||
IO.puts("Error: #{inspect(reason)}")
|
||||
end
|
||||
```
|
||||
@@ -0,0 +1,18 @@
|
||||
```elixir title="Elixir"
|
||||
config_json = Jason.encode!(%{
|
||||
"language_detection" => %{
|
||||
"enabled" => true,
|
||||
"min_confidence" => 0.7,
|
||||
"detect_multiple" => true
|
||||
}
|
||||
})
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file_sync("multilingual_document.pdf", "application/pdf", config_json)
|
||||
|
||||
if result.languages do
|
||||
IO.puts("Detected languages:")
|
||||
Enum.each(result.languages, fn %{"language" => lang, "confidence" => conf} ->
|
||||
IO.puts(" - #{lang}: #{Float.round(conf, 4)}")
|
||||
end)
|
||||
end
|
||||
```
|
||||
28
docs/snippets/elixir/advanced/metadata_extraction.exs
Normal file
28
docs/snippets/elixir/advanced/metadata_extraction.exs
Normal file
@@ -0,0 +1,28 @@
|
||||
```elixir title="Elixir"
|
||||
{:ok, result} = Kreuzberg.extract_file("document.pdf")
|
||||
|
||||
# Metadata is flat — format-specific fields are at the top level
|
||||
metadata = result.metadata
|
||||
IO.puts("MIME type: #{result.mime_type}")
|
||||
IO.puts("All metadata keys: #{inspect(Map.keys(metadata))}")
|
||||
|
||||
# Access PDF metadata directly from the flat map
|
||||
page_count = metadata["page_count"]
|
||||
if page_count, do: IO.puts("Page count: #{page_count}")
|
||||
|
||||
authors = metadata["authors"] || []
|
||||
if authors != [], do: IO.puts("Authors: #{Enum.join(authors, ", ")}")
|
||||
|
||||
title = metadata["title"]
|
||||
if title, do: IO.puts("Title: #{title}")
|
||||
|
||||
# Access HTML metadata directly from the flat map
|
||||
{:ok, html_result} = Kreuzberg.extract_file("page.html")
|
||||
html_meta = html_result.metadata
|
||||
|
||||
keywords = html_meta["keywords"] || []
|
||||
if keywords != [], do: IO.puts("Keywords: #{Enum.join(keywords, ", ")}")
|
||||
|
||||
description = html_meta["description"]
|
||||
if description, do: IO.puts("Description: #{description}")
|
||||
```
|
||||
15
docs/snippets/elixir/advanced/quality_processing_config.md
Normal file
15
docs/snippets/elixir/advanced/quality_processing_config.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```elixir title="Elixir"
|
||||
config_json = Jason.encode!(%{
|
||||
"post_processors" => [
|
||||
%{
|
||||
"name" => "QualityFilter",
|
||||
"enabled" => true
|
||||
}
|
||||
]
|
||||
})
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
|
||||
|
||||
IO.puts("Extraction quality applied")
|
||||
IO.inspect(result.text, label: "Quality-filtered text")
|
||||
```
|
||||
39
docs/snippets/elixir/advanced/quality_processing_example.exs
Normal file
39
docs/snippets/elixir/advanced/quality_processing_example.exs
Normal file
@@ -0,0 +1,39 @@
|
||||
```elixir title="Elixir"
|
||||
alias Kreuzberg.ExtractionConfig

# Maps a numeric quality score to a human-readable label.
# This is an anonymous function because `defp` is only valid inside a module,
# and this snippet is a flat script; it must be bound before its first use.
quality_status = fn
  score when score >= 0.9 -> "Excellent"
  score when score >= 0.8 -> "Good"
  score when score >= 0.7 -> "Fair"
  _score -> "Poor"
end

# Process document with quality filtering
config = %ExtractionConfig{
  quality_processing: %{
    "enabled" => true,
    "min_quality_score" => 0.8
  }
}

case Kreuzberg.extract_file("document.pdf", nil, config) do
  {:ok, result} ->
    IO.puts("=== Quality Processing ===\n")

    # Display quality metrics if available
    quality_score = result.quality_score

    if quality_score && quality_score > 0 do
      IO.puts("Quality Score: #{quality_score}")
      IO.puts("Content Quality: #{quality_status.(quality_score)}")
    end

    # Display content with quality assurance
    IO.puts("\n--- Extracted Content ---")
    content_preview = String.slice(result.content, 0..200)
    IO.puts(content_preview)
    IO.puts("\nTotal size: #{byte_size(result.content)} bytes")

  {:error, reason} ->
    IO.puts("Extraction failed!")
    IO.puts("Error: #{inspect(reason)}")
end
|
||||
```
|
||||
22
docs/snippets/elixir/advanced/quality_processing_example.md
Normal file
22
docs/snippets/elixir/advanced/quality_processing_example.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```elixir title="Elixir"
|
||||
config_json = Jason.encode!(%{
|
||||
"post_processors" => [
|
||||
%{
|
||||
"name" => "QualityFilter",
|
||||
"enabled" => true
|
||||
}
|
||||
]
|
||||
})
|
||||
|
||||
{:ok, result_before} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", nil)
|
||||
|
||||
{:ok, result_after} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
|
||||
|
||||
# Compare text quality metrics
|
||||
text_before = result_before.text || ""
|
||||
text_after = result_after.text || ""
|
||||
|
||||
IO.puts("Before quality processing: #{String.length(text_before)} chars")
|
||||
IO.puts("After quality processing: #{String.length(text_after)} chars")
|
||||
# Guard the ratio: text_before falls back to "" above, and dividing by a
# zero length would raise ArithmeticError.
before_len = String.length(text_before)
after_len = String.length(text_after)

if before_len > 0 do
  IO.puts("Improvement: #{Float.round((1 - after_len / before_len) * 100, 2)}%")
else
  IO.puts("Improvement: n/a (no text before quality processing)")
end
|
||||
```
|
||||
19
docs/snippets/elixir/advanced/table_extraction.exs
Normal file
19
docs/snippets/elixir/advanced/table_extraction.exs
Normal file
@@ -0,0 +1,19 @@
|
||||
```elixir title="Elixir"
|
||||
{:ok, result} = Kreuzberg.extract_file("document.pdf")
|
||||
|
||||
# Fall back to [] so length/1 and Enum below do not raise if tables is nil.
tables = result.tables || []
|
||||
IO.puts("Total tables found: #{length(tables)}")
|
||||
|
||||
Enum.with_index(tables, 1) |> Enum.each(fn {table, index} ->
|
||||
IO.puts("\n--- Table #{index} ---")
|
||||
|
||||
# Access table cells
|
||||
cells = table["cells"] || []
|
||||
IO.puts("Rows: #{length(cells)}")
|
||||
|
||||
# Access table markdown representation
|
||||
markdown = table["markdown"]
|
||||
IO.puts("Markdown representation:")
|
||||
IO.puts(markdown)
|
||||
end)
|
||||
```
|
||||
17
docs/snippets/elixir/advanced/token_reduction_config.md
Normal file
17
docs/snippets/elixir/advanced/token_reduction_config.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```elixir title="Elixir"
|
||||
config_json = Jason.encode!(%{
|
||||
"token_reduction" => %{
|
||||
"mode" => "moderate",
|
||||
"preserve_markdown" => true
|
||||
}
|
||||
})
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file_sync("verbose_document.pdf", "application/pdf", config_json)
|
||||
|
||||
if result.original_token_count do
|
||||
IO.puts("Original tokens: #{result.original_token_count}")
|
||||
end
|
||||
if result.reduced_token_count do
|
||||
IO.puts("Reduced tokens: #{result.reduced_token_count}")
|
||||
end
|
||||
```
|
||||
42
docs/snippets/elixir/advanced/token_reduction_example.exs
Normal file
42
docs/snippets/elixir/advanced/token_reduction_example.exs
Normal file
@@ -0,0 +1,42 @@
|
||||
```elixir title="Elixir"
|
||||
alias Kreuzberg.ExtractionConfig
|
||||
|
||||
# Reduce token count for LLM
|
||||
config = %ExtractionConfig{
|
||||
token_reduction: %{
|
||||
"enabled" => true,
|
||||
"target_tokens" => 2000
|
||||
}
|
||||
}
|
||||
|
||||
case Kreuzberg.extract_file("document.pdf", nil, config) do
|
||||
{:ok, result} ->
|
||||
IO.puts("=== Token Reduction ===\n")
|
||||
|
||||
# Display content and token information
|
||||
content_size = byte_size(result.content)
|
||||
estimated_tokens = div(content_size, 4) # Rough estimate: 1 token ≈ 4 bytes
|
||||
|
||||
IO.puts("Content size: #{content_size} bytes")
|
||||
IO.puts("Estimated tokens: ~#{estimated_tokens}")
|
||||
IO.puts("Target tokens: 2000")
|
||||
|
||||
# Show reduction status
|
||||
if estimated_tokens > 2000 do
|
||||
reduction_percentage = trunc((1 - 2000 / estimated_tokens) * 100)
|
||||
IO.puts("\nToken reduction applied: ~#{reduction_percentage}% reduction")
|
||||
else
|
||||
IO.puts("\nNo reduction needed - content already below target")
|
||||
end
|
||||
|
||||
# Display reduced content preview
|
||||
IO.puts("\n--- Reduced Content ---")
|
||||
content_preview = String.slice(result.content, 0..300)
|
||||
IO.puts(content_preview)
|
||||
IO.puts("...")
|
||||
|
||||
{:error, reason} ->
|
||||
IO.puts("Extraction failed!")
|
||||
IO.puts("Error: #{inspect(reason)}")
|
||||
end
|
||||
```
|
||||
28
docs/snippets/elixir/advanced/token_reduction_example.md
Normal file
28
docs/snippets/elixir/advanced/token_reduction_example.md
Normal file
@@ -0,0 +1,28 @@
|
||||
```elixir title="Elixir"
|
||||
config_json = Jason.encode!(%{
|
||||
"token_reduction" => %{
|
||||
"mode" => "moderate",
|
||||
"preserve_markdown" => true
|
||||
}
|
||||
})
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file_sync("verbose_document.pdf", "application/pdf", config_json)
|
||||
|
||||
# Display token reduction metrics
|
||||
original = result.original_token_count || 0
|
||||
reduced = result.reduced_token_count || 0
|
||||
|
||||
IO.puts("Original tokens: #{original}")
|
||||
IO.puts("Reduced tokens: #{reduced}")
|
||||
|
||||
if original > 0 do
|
||||
reduction_percent = Float.round((1 - reduced / original) * 100, 2)
|
||||
IO.puts("Reduction: #{reduction_percent}%")
|
||||
end
|
||||
|
||||
# Show sample of reduced text
|
||||
if result.text do
|
||||
IO.puts("\nSample of reduced text:")
|
||||
IO.puts(String.slice(result.text, 0..200) <> "...")
|
||||
end
|
||||
```
|
||||
40
docs/snippets/elixir/advanced/vector_database_integration.md
Normal file
40
docs/snippets/elixir/advanced/vector_database_integration.md
Normal file
@@ -0,0 +1,40 @@
|
||||
```elixir title="Elixir"
|
||||
config_json = Jason.encode!(%{
|
||||
"chunking" => %{
|
||||
"enabled" => true,
|
||||
"max_characters" => 512,
|
||||
"overlap" => 50
|
||||
},
|
||||
"embeddings" => %{
|
||||
"enabled" => true,
|
||||
"model" => "all-MiniLM-L6-v2"
|
||||
}
|
||||
})
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
|
||||
|
||||
# Prepare data for vector database storage
|
||||
# Fall back to [] so the Enum pipeline does not raise if chunks is nil.
vector_db_records = (result.chunks || [])
|
||||
|> Enum.with_index(1)
|
||||
|> Enum.map(fn {chunk, idx} ->
|
||||
%{
|
||||
"id" => "#{result.document_id}-chunk-#{idx}",
|
||||
"vector" => chunk["embedding"],
|
||||
"metadata" => %{
|
||||
"content" => chunk["content"],
|
||||
"page" => chunk["page"],
|
||||
"document_id" => result.document_id,
|
||||
"chunk_index" => idx
|
||||
}
|
||||
}
|
||||
end)
|
||||
|
||||
IO.puts("Generated #{length(vector_db_records)} records for vector database")
|
||||
IO.inspect(List.first(vector_db_records), label: "Sample Record")
|
||||
|
||||
# Example: Insert into Pinecone-like vector database
|
||||
Enum.each(vector_db_records, fn record ->
|
||||
# vector_db_client.upsert(record)
|
||||
IO.puts("Would insert: #{record["id"]}")
|
||||
end)
|
||||
```
|
||||
Reference in New Issue
Block a user