This commit is contained in:
78
docs/snippets/elixir/configuration/chunking_config.exs
Normal file
78
docs/snippets/elixir/configuration/chunking_config.exs
Normal file
@@ -0,0 +1,78 @@
|
||||
```elixir title="Elixir"
|
||||
alias Kreuzberg.ExtractionConfig
|
||||
|
||||
# Example 1: Basic character-based chunking for RAG applications
|
||||
config = %ExtractionConfig{
|
||||
chunking: %{
|
||||
"enabled" => true,
|
||||
"max_characters" => 1000,
|
||||
"overlap" => 100,
|
||||
"min_size" => 200,
|
||||
"respect_boundaries" => true
|
||||
}
|
||||
}
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)
|
||||
|
||||
# Access chunks
|
||||
if result.chunks do
|
||||
IO.puts("Generated #{length(result.chunks)} chunks")
|
||||
|
||||
Enum.each(result.chunks, fn chunk ->
|
||||
IO.puts("Chunk: #{String.slice(chunk["content"], 0..50)}...")
|
||||
end)
|
||||
end
|
||||
|
||||
# Example 2: Markdown chunker with token-based sizing and heading context
|
||||
config2 = %ExtractionConfig{
|
||||
chunking: %{
|
||||
"enabled" => true,
|
||||
"chunker_type" => "markdown",
|
||||
"sizing" => %{
|
||||
"type" => "tokenizer",
|
||||
"model" => "Xenova/gpt-4o"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
{:ok, result2} = Kreuzberg.extract_file("document.md", nil, config2)
|
||||
|
||||
if result2.chunks do
|
||||
IO.puts("Generated #{length(result2.chunks)} markdown chunks")
|
||||
|
||||
Enum.each(result2.chunks, fn chunk ->
|
||||
IO.puts("\nChunk preview: #{String.slice(chunk["content"], 0..60)}...")
|
||||
|
||||
# Access heading context
|
||||
# Print the headings attached to this chunk, when there are any. Both lookups are
# guarded with is_map/1, so a chunk whose "metadata" or "heading_context" is nil or
# not a map is skipped instead of raising.
if is_map(chunk["metadata"]) and is_map(chunk["metadata"]["heading_context"]) do
  headings = chunk["metadata"]["heading_context"]["headings"] || []

  # Compare against [] rather than calling length/1: only emptiness matters here,
  # and length/1 walks the whole list.
  if headings != [] do
    IO.puts(" Headings in context:")

    Enum.each(headings, fn heading ->
      IO.puts(" - Level #{heading["level"]}: #{heading["text"]}")
    end)
  end
end
|
||||
end)
|
||||
end
|
||||
|
||||
# Example 3: Prepend heading context to chunk content
|
||||
config3 = %ExtractionConfig{
|
||||
chunking: %{
|
||||
"enabled" => true,
|
||||
"chunker_type" => "markdown",
|
||||
"prepend_heading_context" => true
|
||||
}
|
||||
}
|
||||
|
||||
{:ok, result3} = Kreuzberg.extract_file("document.md", nil, config3)
|
||||
|
||||
if result3.chunks do
|
||||
IO.puts("Generated #{length(result3.chunks)} chunks with prepended headings")
|
||||
|
||||
Enum.each(result3.chunks, fn chunk ->
|
||||
# Each chunk's content is prefixed with its heading breadcrumb
|
||||
IO.puts("\nChunk preview: #{String.slice(chunk["content"], 0..80)}...")
|
||||
end)
|
||||
end
|
||||
```
|
||||
17
docs/snippets/elixir/configuration/config_basic.exs
Normal file
17
docs/snippets/elixir/configuration/config_basic.exs
Normal file
@@ -0,0 +1,17 @@
|
||||
```elixir title="Elixir"
|
||||
alias Kreuzberg.ExtractionConfig
|
||||
|
||||
config = %ExtractionConfig{
|
||||
ocr: %{"enabled" => true, "backend" => "tesseract"},
|
||||
chunking: %{"max_characters" => 1000, "overlap" => 100},
|
||||
language_detection: %{"enabled" => true},
|
||||
use_cache: true,
|
||||
force_ocr: false
|
||||
}
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)
|
||||
|
||||
# byte_size/1 counts bytes, not characters (a multi-byte UTF-8 character counts more
# than once), so the label says "bytes".
IO.puts("Content length: #{byte_size(result.content)} bytes")
|
||||
IO.puts("Detected languages: #{inspect(result.detected_languages)}")
|
||||
IO.puts("Chunks: #{if result.chunks, do: length(result.chunks), else: 0}")
|
||||
```
|
||||
26
docs/snippets/elixir/configuration/config_discover.exs
Normal file
26
docs/snippets/elixir/configuration/config_discover.exs
Normal file
@@ -0,0 +1,26 @@
|
||||
```elixir title="Elixir"
|
||||
alias Kreuzberg.ExtractionConfig
|
||||
|
||||
# Load configuration from file
|
||||
# Reads ~/.kreuzberg/config.json when that file exists; otherwise falls back to the default ExtractionConfig
|
||||
config_path = Path.expand("~/.kreuzberg/config.json")
|
||||
|
||||
# Build the config from the JSON file when it exists, otherwise use the struct defaults.
config =
  if File.exists?(config_path) do
    config_data = config_path |> File.read!() |> Jason.decode!()

    # Map each declared struct field's name (as a string) to its atom. Looking JSON keys
    # up in this table avoids String.to_atom/1 on file contents: atoms are never
    # garbage-collected, so converting arbitrary external keys can exhaust the atom table.
    known_fields =
      %ExtractionConfig{}
      |> Map.from_struct()
      |> Map.new(fn {field, _default} -> {Atom.to_string(field), field} end)

    # Keys that are not struct fields are dropped, which matches what struct/2 did with
    # them before.
    overrides =
      config_data
      |> Map.take(Map.keys(known_fields))
      |> Map.new(fn {key, value} -> {Map.fetch!(known_fields, key), value} end)

    struct(ExtractionConfig, overrides)
  else
    IO.puts("Config file not found at #{config_path}. Using defaults.")
    %ExtractionConfig{}
  end
|
||||
|
||||
IO.puts("Configuration Source: #{if File.exists?(config_path), do: "#{config_path} (file)", else: "defaults"}")
|
||||
IO.puts("OCR Enabled: #{inspect(config.ocr["enabled"])}")
|
||||
IO.puts("Chunking Max Characters: #{inspect(config.chunking["max_characters"])}")
|
||||
IO.puts("Use Cache: #{inspect(config.use_cache)}")
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)
|
||||
|
||||
IO.puts("Document extraction complete")
|
||||
IO.puts("Content length: #{byte_size(result.content)} bytes")
|
||||
IO.puts("Languages detected: #{inspect(result.detected_languages)}")
|
||||
```
|
||||
28
docs/snippets/elixir/configuration/config_ocr.exs
Normal file
28
docs/snippets/elixir/configuration/config_ocr.exs
Normal file
@@ -0,0 +1,28 @@
|
||||
```elixir title="Elixir"
|
||||
alias Kreuzberg.ExtractionConfig
|
||||
|
||||
# Configure OCR-specific settings
|
||||
# Includes backend selection, language settings, and preprocessing options
|
||||
config = %ExtractionConfig{
|
||||
ocr: %{
|
||||
"enabled" => true,
|
||||
"backend" => "tesseract",
|
||||
"language" => "eng",
|
||||
"preprocessing" => true
|
||||
},
|
||||
chunking: %{
|
||||
"max_characters" => 2000,
|
||||
"overlap" => 200
|
||||
},
|
||||
use_cache: true,
|
||||
force_ocr: false
|
||||
}
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file("scanned_document.pdf", nil, config)
|
||||
|
||||
IO.puts("OCR Configuration Applied:")
|
||||
IO.puts("Backend: tesseract")
|
||||
IO.puts("Language: eng")
|
||||
IO.puts("Content extracted: #{byte_size(result.content)} bytes")
|
||||
IO.puts("Metadata: #{inspect(result.metadata)}")
|
||||
```
|
||||
44
docs/snippets/elixir/configuration/config_programmatic.exs
Normal file
44
docs/snippets/elixir/configuration/config_programmatic.exs
Normal file
@@ -0,0 +1,44 @@
|
||||
```elixir title="Elixir"
|
||||
alias Kreuzberg.ExtractionConfig
|
||||
|
||||
# Build configuration dynamically based on runtime conditions
|
||||
# Useful for environment-specific settings and feature flags
|
||||
|
||||
defmodule ConfigBuilder do
  @moduledoc """
  Picks an extraction configuration from a file type and an OCR flag.
  """

  @doc """
  Returns an `ExtractionConfig` for `file_type` and `enable_ocr?`.

  Every result starts from the same base: chunking with `"max_characters"` 1000 and
  `"overlap"` 100, and `use_cache` set to `true`. On top of that:

    * `:pdf` with `true` - Tesseract OCR enabled and `force_ocr` set
    * `:image` with `true` - Tesseract OCR with preprocessing and `force_ocr` set
    * `:pdf` or `:image` with `false` - OCR disabled
    * any other combination - the base configuration unchanged
  """
  def build_config(:pdf, true) do
    %{base_config() | ocr: %{"enabled" => true, "backend" => "tesseract"}, force_ocr: true}
  end

  def build_config(:image, true) do
    ocr_settings = %{"enabled" => true, "backend" => "tesseract", "preprocessing" => true}
    %{base_config() | ocr: ocr_settings, force_ocr: true}
  end

  def build_config(file_type, false) when file_type in [:pdf, :image] do
    %{base_config() | ocr: %{"enabled" => false}}
  end

  def build_config(_file_type, _enable_ocr?), do: base_config()

  # Settings shared by every configuration this module returns.
  defp base_config do
    %ExtractionConfig{
      chunking: %{"max_characters" => 1000, "overlap" => 100},
      use_cache: true
    }
  end
end
|
||||
|
||||
# Build configuration based on file type and requirements
|
||||
config = ConfigBuilder.build_config(:pdf, true)
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)
|
||||
|
||||
IO.puts("Dynamic configuration applied")
|
||||
IO.puts("Content: #{String.slice(result.content, 0..100)}")
|
||||
```
|
||||
20
docs/snippets/elixir/configuration/embedding_config.exs
Normal file
20
docs/snippets/elixir/configuration/embedding_config.exs
Normal file
@@ -0,0 +1,20 @@
|
||||
```elixir title="Elixir"
|
||||
alias Kreuzberg.ExtractionConfig
|
||||
|
||||
# Configure embeddings for vector search
|
||||
config = %ExtractionConfig{
|
||||
chunking: %{
|
||||
"enabled" => true,
|
||||
"max_characters" => 512,
|
||||
"overlap" => 50
|
||||
},
|
||||
embeddings: %{
|
||||
"enabled" => true,
|
||||
"model" => "sentence-transformers/all-MiniLM-L6-v2"
|
||||
}
|
||||
}
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)
|
||||
|
||||
IO.puts("Extracted chunks with embeddings: #{length(result.chunks || [])}")
|
||||
```
|
||||
@@ -0,0 +1,23 @@
|
||||
```elixir title="Elixir"
|
||||
alias Kreuzberg.ExtractionConfig
|
||||
|
||||
# Configure keyword extraction
|
||||
config = %ExtractionConfig{
|
||||
keyword_extraction: %{
|
||||
"enabled" => true,
|
||||
"max_keywords" => 10,
|
||||
"min_score" => 0.5
|
||||
}
|
||||
}
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)
|
||||
|
||||
# Access extracted keywords
|
||||
if result.keywords do
|
||||
IO.puts("Extracted #{length(result.keywords)} keywords")
|
||||
|
||||
Enum.each(result.keywords, fn keyword ->
|
||||
IO.puts("#{keyword["text"]}: #{keyword["score"]}")
|
||||
end)
|
||||
end
|
||||
```
|
||||
@@ -0,0 +1,23 @@
|
||||
```elixir title="Elixir"
|
||||
alias Kreuzberg.ExtractionConfig
|
||||
|
||||
# Configure language detection for multilingual documents
|
||||
# Automatically detect all languages present in the content
|
||||
config = %ExtractionConfig{
|
||||
language_detection: %{
|
||||
"enabled" => true,
|
||||
"detect_all" => true
|
||||
},
|
||||
chunking: %{
|
||||
"max_characters" => 1000,
|
||||
"overlap" => 100
|
||||
},
|
||||
use_cache: true
|
||||
}
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file("multilingual.pdf", nil, config)
|
||||
|
||||
IO.puts("Detected Languages:")
|
||||
IO.inspect(result.detected_languages)
|
||||
IO.puts("Content: #{String.slice(result.content, 0..100)}")
|
||||
```
|
||||
25
docs/snippets/elixir/configuration/pdf_config.exs
Normal file
25
docs/snippets/elixir/configuration/pdf_config.exs
Normal file
@@ -0,0 +1,25 @@
|
||||
```elixir title="Elixir"
|
||||
alias Kreuzberg.ExtractionConfig
|
||||
|
||||
# Configure PDF-specific extraction options
|
||||
# Extract annotations, metadata, and flatten forms for comprehensive document processing
|
||||
config = %ExtractionConfig{
|
||||
pdf: %{
|
||||
"extract_annotations" => true,
|
||||
"extract_metadata" => true,
|
||||
"flatten_forms" => true
|
||||
},
|
||||
chunking: %{
|
||||
"max_characters" => 1500,
|
||||
"overlap" => 150
|
||||
},
|
||||
use_cache: true
|
||||
}
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file("form_document.pdf", nil, config)
|
||||
|
||||
IO.puts("PDF Extraction Complete:")
|
||||
IO.puts("Content length: #{byte_size(result.content)} bytes")
|
||||
IO.puts("Metadata: #{inspect(result.metadata)}")
|
||||
IO.puts("Annotations present: #{map_size(result.metadata["annotations"] || %{}) > 0}")
|
||||
```
|
||||
29
docs/snippets/elixir/configuration/postprocessor_config.exs
Normal file
29
docs/snippets/elixir/configuration/postprocessor_config.exs
Normal file
@@ -0,0 +1,29 @@
|
||||
```elixir title="Elixir"
|
||||
alias Kreuzberg.ExtractionConfig
|
||||
|
||||
# Configure post-processing pipeline
|
||||
# Applies transformations to extracted content after initial processing
|
||||
config = %ExtractionConfig{
|
||||
postprocessing: %{
|
||||
"remove_whitespace" => true,
|
||||
"normalize_unicode" => true,
|
||||
"fix_encoding" => true
|
||||
},
|
||||
ocr: %{
|
||||
"enabled" => true,
|
||||
"backend" => "tesseract"
|
||||
},
|
||||
use_cache: true,
|
||||
force_ocr: false
|
||||
}
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)
|
||||
|
||||
IO.puts("Post-Processing Configuration Applied:")
|
||||
IO.puts("Remove Whitespace: true")
|
||||
IO.puts("Normalize Unicode: true")
|
||||
IO.puts("Fix Encoding: true")
|
||||
# This snippet only has result.content, which the next line prints as the processed
# content; there is no separate original text to measure, so the label does not say
# "Original".
IO.puts("Processed content length: #{byte_size(result.content)} bytes")
|
||||
IO.puts("Processed content: #{String.slice(result.content, 0..100)}...")
|
||||
IO.puts("Metadata: #{inspect(result.metadata)}")
|
||||
```
|
||||
@@ -0,0 +1,30 @@
|
||||
```elixir title="Elixir"
|
||||
alias Kreuzberg.ExtractionConfig
|
||||
|
||||
# Configure quality processing settings
|
||||
# Enable noise removal and set minimum quality thresholds for extracted content
|
||||
config = %ExtractionConfig{
|
||||
quality_processing: %{
|
||||
"enabled" => true,
|
||||
"min_quality_score" => 0.7,
|
||||
"remove_noise" => true
|
||||
},
|
||||
ocr: %{
|
||||
"enabled" => true,
|
||||
"backend" => "tesseract"
|
||||
},
|
||||
chunking: %{
|
||||
"max_characters" => 1000,
|
||||
"overlap" => 100
|
||||
},
|
||||
use_cache: true
|
||||
}
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file("noisy_document.pdf", nil, config)
|
||||
|
||||
IO.puts("Quality Processing Applied:")
|
||||
IO.puts("Content quality score: #{result.quality_score}")
|
||||
IO.puts("Noise removed: true")
|
||||
IO.puts("Content length: #{byte_size(result.content)} bytes")
|
||||
IO.puts("Processing complete: #{inspect(result)}")
|
||||
```
|
||||
29
docs/snippets/elixir/configuration/tesseract_config.exs
Normal file
29
docs/snippets/elixir/configuration/tesseract_config.exs
Normal file
@@ -0,0 +1,29 @@
|
||||
```elixir title="Elixir"
|
||||
alias Kreuzberg.ExtractionConfig
|
||||
|
||||
# Configure Tesseract OCR settings
|
||||
# Includes language selection, page segmentation mode, OCR engine mode, and DPI settings
|
||||
config = %ExtractionConfig{
|
||||
ocr: %{
|
||||
"enabled" => true,
|
||||
"backend" => "tesseract",
|
||||
"language" => "eng",
|
||||
"psm" => 3,
|
||||
"oem" => 3,
|
||||
"dpi" => 300
|
||||
},
|
||||
use_cache: true,
|
||||
force_ocr: false
|
||||
}
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)
|
||||
|
||||
IO.puts("Tesseract Configuration Applied:")
|
||||
IO.puts("Backend: tesseract")
|
||||
IO.puts("Language: eng")
|
||||
IO.puts("PSM (Page Segmentation Mode): 3")
|
||||
IO.puts("OEM (OCR Engine Mode): 3")
|
||||
IO.puts("DPI: 300")
|
||||
IO.puts("Content extracted: #{byte_size(result.content)} bytes")
|
||||
IO.puts("Metadata: #{inspect(result.metadata)}")
|
||||
```
|
||||
@@ -0,0 +1,28 @@
|
||||
```elixir title="Elixir"
|
||||
alias Kreuzberg.ExtractionConfig
|
||||
|
||||
# Configure token reduction for LLM context windows
|
||||
# Helps manage token usage when working with large language models
|
||||
config = %ExtractionConfig{
|
||||
token_reduction: %{
|
||||
"enabled" => true,
|
||||
"target_tokens" => 4000,
|
||||
"strategy" => "truncate"
|
||||
},
|
||||
ocr: %{
|
||||
"enabled" => true,
|
||||
"backend" => "tesseract"
|
||||
},
|
||||
use_cache: true
|
||||
}
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file("large_document.pdf", nil, config)
|
||||
|
||||
IO.puts("Token Reduction Configuration Applied:")
|
||||
IO.puts("Token Reduction Enabled: true")
|
||||
IO.puts("Target Tokens: 4000")
|
||||
IO.puts("Strategy: truncate")
|
||||
IO.puts("Content extracted: #{byte_size(result.content)} bytes")
|
||||
# Read the flag with a string key, the same way result.metadata["annotations"] is read
# in the PDF snippet; an atom key would miss a string-keyed entry and always print nil.
# NOTE(review): confirm "token_reduction_applied" is the key the library actually emits.
IO.puts("Tokens reduced: #{inspect(result.metadata["token_reduction_applied"])}")
|
||||
IO.puts("Final metadata: #{inspect(result.metadata)}")
|
||||
```
|
||||
Reference in New Issue
Block a user