Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,78 @@
```elixir title="Elixir"
alias Kreuzberg.ExtractionConfig
# Example 1: character-based chunking, a typical setup for RAG applications
chunking_options = %{
  "enabled" => true,
  "max_characters" => 1000,
  "overlap" => 100,
  "min_size" => 200,
  "respect_boundaries" => true
}

config = %ExtractionConfig{chunking: chunking_options}
{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)

# Report the chunks only when the result carries some (nil/false means none)
case result.chunks do
  missing when missing in [nil, false] ->
    nil

  chunks ->
    IO.puts("Generated #{length(chunks)} chunks")

    Enum.each(chunks, fn chunk ->
      preview = String.slice(chunk["content"], 0..50)
      IO.puts("Chunk: #{preview}...")
    end)
end
# Example 2: Markdown chunker with token-based sizing and heading context
config2 = %ExtractionConfig{
  chunking: %{
    "enabled" => true,
    "chunker_type" => "markdown",
    "sizing" => %{
      "type" => "tokenizer",
      "model" => "Xenova/gpt-4o"
    }
  }
}

{:ok, result2} = Kreuzberg.extract_file("document.md", nil, config2)

if result2.chunks do
  IO.puts("Generated #{length(result2.chunks)} markdown chunks")

  Enum.each(result2.chunks, fn chunk ->
    IO.puts("\nChunk preview: #{String.slice(chunk["content"], 0..60)}...")

    # Access heading context.
    # A single pattern match replaces the nested is_map/1 checks and the
    # `length(headings) > 0` test: `[_ | _]` only matches a non-empty list,
    # so chunks with missing, nil, or empty headings fall through untouched.
    case chunk do
      %{"metadata" => %{"heading_context" => %{"headings" => [_ | _] = headings}}} ->
        IO.puts(" Headings in context:")

        Enum.each(headings, fn heading ->
          IO.puts(" - Level #{heading["level"]}: #{heading["text"]}")
        end)

      _ ->
        :ok
    end
  end)
end
# Example 3: have the markdown chunker prepend heading context to each chunk
heading_chunking = %{
  "enabled" => true,
  "chunker_type" => "markdown",
  "prepend_heading_context" => true
}

config3 = %ExtractionConfig{chunking: heading_chunking}
{:ok, result3} = Kreuzberg.extract_file("document.md", nil, config3)

case result3.chunks do
  missing when missing in [nil, false] ->
    nil

  chunks ->
    IO.puts("Generated #{length(chunks)} chunks with prepended headings")

    Enum.each(chunks, fn chunk ->
      # With this option each chunk's content starts with its heading breadcrumb
      preview = String.slice(chunk["content"], 0..80)
      IO.puts("\nChunk preview: #{preview}...")
    end)
end
```

View File

@@ -0,0 +1,17 @@
```elixir title="Elixir"
alias Kreuzberg.ExtractionConfig
# Combine OCR, chunking, language detection and caching in one configuration
config = %ExtractionConfig{
  ocr: %{"enabled" => true, "backend" => "tesseract"},
  chunking: %{"max_characters" => 1000, "overlap" => 100},
  language_detection: %{"enabled" => true},
  use_cache: true,
  force_ocr: false
}

{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)

# String.length/1 counts characters (graphemes). byte_size/1 would report
# UTF-8 bytes, which differs from the character count for non-ASCII text.
IO.puts("Content length: #{String.length(result.content)} characters")
IO.puts("Detected languages: #{inspect(result.detected_languages)}")

# Fall back to an empty list when the result carries no chunks
IO.puts("Chunks: #{length(result.chunks || [])}")
```

View File

@@ -0,0 +1,26 @@
```elixir title="Elixir"
alias Kreuzberg.ExtractionConfig
# Load configuration from a JSON file in the user's config directory,
# falling back to the default ExtractionConfig when the file does not exist
config_path = Path.expand("~/.kreuzberg/config.json")

# Check once, so the loaded config and the reported source cannot disagree
config_file? = File.exists?(config_path)

config =
  if config_file? do
    config_data = config_path |> File.read!() |> Jason.decode!()

    # Look JSON keys up among the struct's own field names instead of calling
    # String.to_atom/1 on file contents: atoms are never garbage collected, so
    # converting arbitrary external strings can exhaust the atom table.
    fields_by_name =
      %ExtractionConfig{}
      |> Map.from_struct()
      |> Map.keys()
      |> Map.new(&{Atom.to_string(&1), &1})

    # Keys that are not struct fields are skipped (struct/2 discards them too)
    known_fields =
      for {key, value} <- config_data,
          field = fields_by_name[key],
          do: {field, value}

    struct(ExtractionConfig, known_fields)
  else
    IO.puts("Config file not found at #{config_path}. Using defaults.")
    %ExtractionConfig{}
  end

config_source = if config_file?, do: "#{config_path} (file)", else: "defaults"

IO.puts("Configuration Source: #{config_source}")
IO.puts("OCR Enabled: #{inspect(config.ocr["enabled"])}")
IO.puts("Chunking Max Characters: #{inspect(config.chunking["max_characters"])}")
IO.puts("Use Cache: #{inspect(config.use_cache)}")

{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)

IO.puts("Document extraction complete")
IO.puts("Content length: #{byte_size(result.content)} bytes")
IO.puts("Languages detected: #{inspect(result.detected_languages)}")
```

View File

@@ -0,0 +1,28 @@
```elixir title="Elixir"
alias Kreuzberg.ExtractionConfig
# OCR-focused configuration: backend, language and preprocessing are set on
# the "ocr" map, alongside chunking and cache settings
ocr_settings = %{
  "enabled" => true,
  "backend" => "tesseract",
  "language" => "eng",
  "preprocessing" => true
}

chunk_settings = %{"max_characters" => 2000, "overlap" => 200}

config = %ExtractionConfig{
  ocr: ocr_settings,
  chunking: chunk_settings,
  use_cache: true,
  force_ocr: false
}

{:ok, result} = Kreuzberg.extract_file("scanned_document.pdf", nil, config)

# Fixed summary lines mirroring the settings above
Enum.each(
  ["OCR Configuration Applied:", "Backend: tesseract", "Language: eng"],
  &IO.puts/1
)

extracted_bytes = byte_size(result.content)
IO.puts("Content extracted: #{extracted_bytes} bytes")
IO.puts("Metadata: #{inspect(result.metadata)}")
```

View File

@@ -0,0 +1,44 @@
```elixir title="Elixir"
alias Kreuzberg.ExtractionConfig
# Build configuration dynamically based on runtime conditions
# Useful for environment-specific settings and feature flags
defmodule ConfigBuilder do
  @moduledoc """
  Builds an `ExtractionConfig` from a file type and an OCR flag.
  """

  @doc """
  Returns the base configuration (chunking plus caching) adjusted for
  `file_type` and `enable_ocr?`.

  `:pdf` and `:image` with `true` enable and force Tesseract OCR (images also
  turn on preprocessing); with `false` they disable OCR. Any other combination
  returns the base configuration unchanged.
  """
  def build_config(file_type, enable_ocr?) do
    defaults = %ExtractionConfig{
      chunking: %{"max_characters" => 1000, "overlap" => 100},
      use_cache: true
    }

    with_ocr(defaults, file_type, enable_ocr?)
  end

  # One clause per {file_type, enable_ocr?} combination
  defp with_ocr(config, :pdf, true) do
    %{config | ocr: %{"enabled" => true, "backend" => "tesseract"}, force_ocr: true}
  end

  defp with_ocr(config, :pdf, false) do
    %{config | ocr: %{"enabled" => false}}
  end

  defp with_ocr(config, :image, true) do
    image_ocr = %{"enabled" => true, "backend" => "tesseract", "preprocessing" => true}
    %{config | ocr: image_ocr, force_ocr: true}
  end

  defp with_ocr(config, :image, false) do
    %{config | ocr: %{"enabled" => false}}
  end

  # Unknown file types or non-boolean flags keep the defaults
  defp with_ocr(config, _file_type, _enable_ocr?), do: config
end
# Pick the configuration for a PDF with OCR enabled, then run the extraction
file_type = :pdf
config = ConfigBuilder.build_config(file_type, true)
{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)

IO.puts("Dynamic configuration applied")

# Print the leading part of the content (range 0..100)
preview = String.slice(result.content, 0..100)
IO.puts("Content: " <> preview)
```

View File

@@ -0,0 +1,20 @@
```elixir title="Elixir"
alias Kreuzberg.ExtractionConfig
# Enable chunking and embeddings together for vector search
chunking_options = %{"enabled" => true, "max_characters" => 512, "overlap" => 50}

embedding_options = %{
  "enabled" => true,
  "model" => "sentence-transformers/all-MiniLM-L6-v2"
}

config = %ExtractionConfig{chunking: chunking_options, embeddings: embedding_options}
{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)

# Count zero chunks when the result carries none
chunk_count = length(result.chunks || [])
IO.puts("Extracted chunks with embeddings: #{chunk_count}")
```

View File

@@ -0,0 +1,23 @@
```elixir title="Elixir"
alias Kreuzberg.ExtractionConfig
# Enable keyword extraction with a cap on count and a minimum score
keyword_options = %{
  "enabled" => true,
  "max_keywords" => 10,
  "min_score" => 0.5
}

config = %ExtractionConfig{keyword_extraction: keyword_options}
{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)

# Print each keyword with its score, if any were returned
case result.keywords do
  missing when missing in [nil, false] ->
    nil

  keywords ->
    IO.puts("Extracted #{length(keywords)} keywords")

    Enum.each(keywords, fn keyword ->
      IO.puts("#{keyword["text"]}: #{keyword["score"]}")
    end)
end
```

View File

@@ -0,0 +1,23 @@
```elixir title="Elixir"
alias Kreuzberg.ExtractionConfig
# Language detection for multilingual documents: "detect_all" is switched on
# in addition to enabling detection itself
detection_options = %{"enabled" => true, "detect_all" => true}
chunking_options = %{"max_characters" => 1000, "overlap" => 100}

config = %ExtractionConfig{
  language_detection: detection_options,
  chunking: chunking_options,
  use_cache: true
}

{:ok, result} = Kreuzberg.extract_file("multilingual.pdf", nil, config)

IO.puts("Detected Languages:")
result.detected_languages |> IO.inspect()

content_preview = String.slice(result.content, 0..100)
IO.puts("Content: #{content_preview}")
```

View File

@@ -0,0 +1,25 @@
```elixir title="Elixir"
alias Kreuzberg.ExtractionConfig
# PDF-specific options: annotation and metadata extraction plus form
# flattening, combined with chunking and caching
pdf_options = %{
  "extract_annotations" => true,
  "extract_metadata" => true,
  "flatten_forms" => true
}

chunking_options = %{"max_characters" => 1500, "overlap" => 150}

config = %ExtractionConfig{
  pdf: pdf_options,
  chunking: chunking_options,
  use_cache: true
}

{:ok, result} = Kreuzberg.extract_file("form_document.pdf", nil, config)

IO.puts("PDF Extraction Complete:")
IO.puts("Content length: #{byte_size(result.content)} bytes")
IO.puts("Metadata: #{inspect(result.metadata)}")

# A missing "annotations" entry is treated as an empty map
annotations = result.metadata["annotations"] || %{}
annotations_present? = map_size(annotations) > 0
IO.puts("Annotations present: #{annotations_present?}")
```

View File

@@ -0,0 +1,29 @@
```elixir title="Elixir"
alias Kreuzberg.ExtractionConfig
# Post-processing pipeline configuration, combined with Tesseract OCR
postprocessing_options = %{
  "remove_whitespace" => true,
  "normalize_unicode" => true,
  "fix_encoding" => true
}

ocr_options = %{"enabled" => true, "backend" => "tesseract"}

config = %ExtractionConfig{
  postprocessing: postprocessing_options,
  ocr: ocr_options,
  use_cache: true,
  force_ocr: false
}

{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)

# Fixed summary lines mirroring the settings above
Enum.each(
  [
    "Post-Processing Configuration Applied:",
    "Remove Whitespace: true",
    "Normalize Unicode: true",
    "Fix Encoding: true"
  ],
  &IO.puts/1
)

# NOTE(review): the label says "Original", but result.content is read after
# extraction ran with post-processing configured; confirm the wording.
content_bytes = byte_size(result.content)
IO.puts("Original content length: #{content_bytes} bytes")

processed_preview = String.slice(result.content, 0..100)
IO.puts("Processed content: #{processed_preview}...")
IO.puts("Metadata: #{inspect(result.metadata)}")
```

View File

@@ -0,0 +1,30 @@
```elixir title="Elixir"
alias Kreuzberg.ExtractionConfig
# Quality processing: noise removal with a minimum quality score, combined
# with Tesseract OCR, chunking and caching
quality_options = %{
  "enabled" => true,
  "min_quality_score" => 0.7,
  "remove_noise" => true
}

ocr_options = %{"enabled" => true, "backend" => "tesseract"}
chunking_options = %{"max_characters" => 1000, "overlap" => 100}

config = %ExtractionConfig{
  quality_processing: quality_options,
  ocr: ocr_options,
  chunking: chunking_options,
  use_cache: true
}

{:ok, result} = Kreuzberg.extract_file("noisy_document.pdf", nil, config)

IO.puts("Quality Processing Applied:")

quality_score = result.quality_score
IO.puts("Content quality score: #{quality_score}")
IO.puts("Noise removed: true")

content_bytes = byte_size(result.content)
IO.puts("Content length: #{content_bytes} bytes")
IO.puts("Processing complete: #{inspect(result)}")
```

View File

@@ -0,0 +1,29 @@
```elixir title="Elixir"
alias Kreuzberg.ExtractionConfig
# Tesseract OCR settings: language, page segmentation mode (psm),
# OCR engine mode (oem) and DPI
tesseract_options = %{
  "enabled" => true,
  "backend" => "tesseract",
  "language" => "eng",
  "psm" => 3,
  "oem" => 3,
  "dpi" => 300
}

config = %ExtractionConfig{ocr: tesseract_options, use_cache: true, force_ocr: false}
{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)

# Fixed summary lines mirroring the settings above
Enum.each(
  [
    "Tesseract Configuration Applied:",
    "Backend: tesseract",
    "Language: eng",
    "PSM (Page Segmentation Mode): 3",
    "OEM (OCR Engine Mode): 3",
    "DPI: 300"
  ],
  &IO.puts/1
)

extracted_bytes = byte_size(result.content)
IO.puts("Content extracted: #{extracted_bytes} bytes")
IO.puts("Metadata: #{inspect(result.metadata)}")
```

View File

@@ -0,0 +1,28 @@
```elixir title="Elixir"
alias Kreuzberg.ExtractionConfig
# Token reduction settings aimed at fitting content into LLM context windows,
# combined with Tesseract OCR and caching
reduction_options = %{
  "enabled" => true,
  "target_tokens" => 4000,
  "strategy" => "truncate"
}

ocr_options = %{"enabled" => true, "backend" => "tesseract"}

config = %ExtractionConfig{
  token_reduction: reduction_options,
  ocr: ocr_options,
  use_cache: true
}

{:ok, result} = Kreuzberg.extract_file("large_document.pdf", nil, config)

# Fixed summary lines mirroring the settings above
Enum.each(
  [
    "Token Reduction Configuration Applied:",
    "Token Reduction Enabled: true",
    "Target Tokens: 4000",
    "Strategy: truncate"
  ],
  &IO.puts/1
)

extracted_bytes = byte_size(result.content)
IO.puts("Content extracted: #{extracted_bytes} bytes")

# NOTE(review): metadata is indexed with an atom key here; confirm the key
# type the library actually returns.
reduction_applied = result.metadata[:token_reduction_applied]
IO.puts("Tokens reduced: #{inspect(reduction_applied)}")
IO.puts("Final metadata: #{inspect(result.metadata)}")
```