Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,53 @@
# Advanced Chunking Configuration
# Shows two chunking setups: one tuned through size, overlap and boundary
# options, and one using the markdown chunker with heading context prepended.
alias Kreuzberg.ExtractionConfig

# Chunking options combining several parameters
chunking_options = %{
  "enabled" => true,
  "max_characters" => 1000,
  "overlap" => 200,
  "min_size" => 100,
  "respect_boundaries" => true,
  "split_on" => ["sentence", "paragraph"]
}

config = %ExtractionConfig{chunking: chunking_options}

# Run the extraction with the configuration above
{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)

# Summarise each chunk; a missing chunk list is treated as empty
chunks = result.chunks || []
IO.puts("Total chunks: #{length(chunks)}")

summarize_chunk = fn chunk ->
  content = chunk["content"]

  IO.inspect(%{
    text_length: String.length(content),
    page: chunk["page"],
    # true when the content does not end in a space or a newline
    boundaries_respected: not String.ends_with?(content, [" ", "\n"])
  })
end

Enum.each(chunks, summarize_chunk)

# Markdown chunker with heading context prepended to the chunk content
config_with_headings = %ExtractionConfig{
  chunking: %{
    "enabled" => true,
    "chunker_type" => "markdown",
    "prepend_heading_context" => true
  }
}

{:ok, result_headings} = Kreuzberg.extract_file("document.md", nil, config_with_headings)

heading_chunks = result_headings.chunks

if heading_chunks do
  IO.puts("Generated #{length(heading_chunks)} chunks with prepended headings")

  # Show the leading 81 characters of every chunk
  Enum.each(heading_chunks, fn chunk ->
    preview = String.slice(chunk["content"], 0, 81)
    IO.puts("Chunk preview: #{preview}...")
  end)
end

View File

@@ -0,0 +1,36 @@
# Advanced Language Detection Configuration
# Configures language detection with several parameters, prints every detected
# language and counts the entries whose confidence is at least 0.8.
alias Kreuzberg.ExtractionConfig

# Advanced language detection configuration
config = %ExtractionConfig{
  language_detection: %{
    "enabled" => true,
    "detect_all" => true,
    "min_confidence" => 0.8
  }
}

# Extract file with language detection enabled
{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)

# A missing result list is treated as empty
detection_results = result.detected_languages || []

# NOTE(review): other examples print `detected_languages` entries directly as
# strings, so an entry may be a plain language code rather than a map with
# "language"/"confidence" keys — confirm the actual shape. Both are handled
# here because `Map.get/3` raises `BadMapError` on a non-map entry.
print_entry = fn
  entry when is_map(entry) ->
    language = Map.get(entry, "language", "unknown")
    confidence = Map.get(entry, "confidence", "unknown")
    IO.puts(" - #{language}: #{confidence}")

  language ->
    IO.puts(" - #{language}: unknown")
end

IO.puts("Detected languages:")
Enum.each(detection_results, print_entry)

# Filter results by minimum confidence threshold.
# Only numeric confidences are compared: under Elixir's term ordering a
# non-number such as nil would otherwise compare as greater than 0.8.
high_confidence_languages =
  Enum.filter(detection_results, fn entry ->
    confidence = is_map(entry) && Map.get(entry, "confidence", 0)
    is_number(confidence) and confidence >= 0.8
  end)

IO.puts("\nHigh confidence languages (>= 0.8): #{length(high_confidence_languages)}")

View File

@@ -0,0 +1,22 @@
# Advanced keyword extraction configuration
# Extracts keywords and reports them grouped into high (>= 0.8) and
# medium (>= 0.6 and < 0.8) score ranges.
alias Kreuzberg.ExtractionConfig

config = %ExtractionConfig{
  keyword_extraction: %{
    "enabled" => true,
    "max_keywords" => 20,
    "min_score" => 0.6,
    "algorithm" => "tfidf"
  }
}

{:ok, result} = Kreuzberg.extract_file("research_paper.pdf", nil, config)

if result.keywords do
  # Keep only numeric scores: under Elixir's term ordering a missing (nil)
  # score would compare as >= 0.8 and be reported as high confidence.
  scored = Enum.filter(result.keywords, fn kw -> is_number(kw["score"]) end)

  # Group by score ranges
  {high_score, remaining} = Enum.split_with(scored, fn kw -> kw["score"] >= 0.8 end)
  medium_score = Enum.filter(remaining, fn kw -> kw["score"] >= 0.6 end)

  # NOTE(review): another example in this file reads the keyword text from the
  # "keyword" key instead of "word" — confirm which key the library emits.
  print_group = fn label, keywords ->
    IO.puts("#{label} (#{length(keywords)}):")
    Enum.each(keywords, fn kw -> IO.puts(" - #{kw["word"]} (#{kw["score"]})") end)
  end

  print_group.("High confidence keywords", high_score)
  # The medium group was previously computed but never reported
  print_group.("Medium confidence keywords", medium_score)
end

View File

@@ -0,0 +1,39 @@
# Quality processing configuration
# Extracts a document with quality processing and OCR options set, then prints
# a content preview, the quality score (when present) and the image count.
alias Kreuzberg.ExtractionConfig

config = %ExtractionConfig{
  quality_processing: %{
    "enabled" => true,
    "min_quality_score" => 0.75,
    "remove_noise" => true,
    "enhance_clarity" => true
  },
  ocr: %{
    "enabled" => true,
    "backend" => "tesseract"
  }
}

# Extract file with quality processing and OCR
{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)

# Process the quality-processed results
IO.puts("=== Quality Processing Results ===\n")

# Display extracted content. `String.slice/3` with a length of 300 returns
# exactly the 300 characters the label promises (the range 0..300 is inclusive
# and would return 301).
content_preview = String.slice(result.content, 0, 300)
IO.puts("Extracted Content (first 300 chars):")
IO.puts(content_preview)
IO.puts("\nTotal content size: #{byte_size(result.content)} bytes")

# Check quality metrics; a nil or non-positive score is not reported
quality_score = result.quality_score

if quality_score && quality_score > 0 do
  quality_label = if quality_score >= 0.75, do: "Acceptable", else: "Below threshold"
  IO.puts("\nQuality Score: #{quality_score}")
  IO.puts("Quality Status: #{quality_label}")
end

# Report images only when at least one is present; comparing against []
# avoids walking the list just to test for emptiness
images = result.images || []

if images != [] do
  IO.puts("\nImages found: #{length(images)}")
end

View File

@@ -0,0 +1,21 @@
# Token reduction presets for different LLM context windows
alias Kreuzberg.ExtractionConfig

# Preset labelled for GPT-4: 8000 target tokens, "intelligent" strategy,
# structure preservation switched on
gpt4_reduction = %{
  "enabled" => true,
  "target_tokens" => 8000,
  "strategy" => "intelligent",
  "preserve_structure" => true
}

config_gpt4 = %ExtractionConfig{token_reduction: gpt4_reduction}

# Preset labelled for Claude: 100000 target tokens, "minimal" strategy
claude_reduction = %{
  "enabled" => true,
  "target_tokens" => 100_000,
  "strategy" => "minimal"
}

config_claude = %ExtractionConfig{token_reduction: claude_reduction}

View File

@@ -0,0 +1,33 @@
# Vector Database Integration
# Chunks a document and reshapes every chunk into a content/metadata map that
# can be handed to a vector database.
alias Kreuzberg.ExtractionConfig

# Chunking is enabled with a 512 character limit and an overlap of 50
config = %ExtractionConfig{
  chunking: %{
    "enabled" => true,
    "max_characters" => 512,
    "overlap" => 50
  }
}

# Run the extraction with chunking
{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)

# Build one document per chunk; a missing chunk list yields no documents
documents =
  for chunk <- result.chunks || [] do
    content = chunk["content"]

    %{
      content: content,
      metadata: %{
        page: chunk["page"],
        char_count: String.length(content)
      }
    }
  end

IO.puts("Prepared #{length(documents)} documents for vector DB")
# `documents` is now ready to be sent to a vector database
# Example: documents |> MyVectorDB.index_documents()

View File

@@ -0,0 +1,13 @@
# Chunk a document with page-boundary tracking switched on
config = %Kreuzberg.ExtractionConfig{
  track_page_boundaries: true,
  chunking: %{"enabled" => true, "max_characters" => 500}
}

{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)

# Print the source page of every chunk, numbering chunks from 1;
# chunks without a page are reported as "unknown"
(result.chunks || [])
|> Enum.with_index(1)
|> Enum.each(fn {chunk, position} ->
  source_page = chunk["page"] || "unknown"
  IO.puts("Chunk #{position} from page #{source_page}")
end)

View File

@@ -0,0 +1,23 @@
```elixir title="Elixir"
# Chunking configuration encoded as JSON
config_json =
  Jason.encode!(%{
    "chunking" => %{
      "enabled" => true,
      "max_characters" => 1024,
      "overlap" => 128
    }
  })

{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)

# Map chunks to their source pages.
# Fall back to an empty list when `result.chunks` is nil (other examples in
# this file guard the same way); piping nil into `Enum.map/2` would raise.
chunks_with_pages =
  Enum.map(result.chunks || [], fn chunk ->
    %{
      "chunk_id" => chunk["id"],
      "content" => chunk["content"],
      "page_number" => chunk["page"]
    }
  end)

IO.inspect(chunks_with_pages, label: "Chunks with Page Mapping")
```

View File

@@ -0,0 +1,45 @@
```elixir title="Elixir"
# Chunking options combining size, overlap and boundary parameters
chunking_options = %{
  "enabled" => true,
  "max_characters" => 1000,
  "overlap" => 200,
  "min_size" => 100,
  "respect_boundaries" => true,
  "split_on" => ["sentence", "paragraph"]
}

config_json = Jason.encode!(%{"chunking" => chunking_options})

{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)

# A missing chunk list is treated as empty
chunks = result.chunks || []
IO.puts("Total chunks: #{length(chunks)}")

summarize_chunk = fn chunk ->
  content = chunk["content"]

  IO.inspect(%{
    text_length: String.length(content),
    page: chunk["page"],
    # true when the content does not end in a space or a newline
    boundaries_respected: not String.ends_with?(content, [" ", "\n"])
  })
end

Enum.each(chunks, summarize_chunk)
```
```elixir title="Elixir - Prepend Heading Context"
# Markdown chunker with heading context prepended to the chunk content
heading_chunking = %{
  "enabled" => true,
  "chunker_type" => "markdown",
  "prepend_heading_context" => true
}

config_json = Jason.encode!(%{"chunking" => heading_chunking})

{:ok, result} = Kreuzberg.extract_file_sync("document.md", "text/markdown", config_json)

heading_chunks = result.chunks

if heading_chunks do
  IO.puts("Generated #{length(heading_chunks)} chunks with prepended headings")

  # Show the leading 81 characters of every chunk
  Enum.each(heading_chunks, fn chunk ->
    preview = String.slice(chunk["content"], 0, 81)
    IO.puts("Chunk preview: #{preview}...")
  end)
end
```

View File

@@ -0,0 +1,26 @@
# Configure chunking for RAG/vector search
config = %Kreuzberg.ExtractionConfig{
  chunking: %{
    "enabled" => true,
    "max_characters" => 512,
    "overlap" => 50,
    "respect_boundaries" => true
  }
}

{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)

# Prepare chunks for vector embedding and search.
# Fall back to an empty list when `result.chunks` is nil (other examples in
# this file guard the same way); piping nil into `Enum.map/2` would raise.
chunks_for_embedding =
  Enum.map(result.chunks || [], fn chunk ->
    %{
      "id" => chunk["id"],
      "content" => chunk["content"],
      "metadata" => %{
        "page" => chunk["page"],
        "source" => "document.pdf"
      }
    }
  end)

IO.inspect(chunks_for_embedding, label: "Chunks Ready for RAG")

View File

@@ -0,0 +1,27 @@
```elixir title="Elixir"
# Chunking configuration for RAG, encoded as JSON
config_json =
  Jason.encode!(%{
    "chunking" => %{
      "enabled" => true,
      "max_characters" => 512,
      "overlap" => 50,
      "respect_boundaries" => true
    }
  })

{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)

# Prepare chunks for vector embedding and search.
# Fall back to an empty list when `result.chunks` is nil (other examples in
# this file guard the same way); piping nil into `Enum.map/2` would raise.
chunks_for_embedding =
  Enum.map(result.chunks || [], fn chunk ->
    %{
      "id" => chunk["id"],
      "content" => chunk["content"],
      "metadata" => %{
        "page" => chunk["page"],
        "source" => "document.pdf"
      }
    }
  end)

IO.inspect(chunks_for_embedding, label: "Chunks Ready for RAG")
```

View File

@@ -0,0 +1,20 @@
# Enable OCR, chunking, image extraction, language detection and keyword
# extraction in a single configuration
config = %Kreuzberg.ExtractionConfig{
  ocr: %{"enabled" => true},
  chunking: %{"enabled" => true, "max_characters" => 1000},
  images: %{"extract" => true},
  language_detection: %{"enabled" => true},
  keyword_extraction: %{"enabled" => true}
}

{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)

# Dump the whole result first
IO.inspect(result, label: "Extraction Result")

# Then print each feature's output, skipping the ones that are absent
if result.content do
  IO.puts("Text: #{String.slice(result.content, 0, 101)}")
end

if result.detected_languages do
  IO.puts("Language: #{inspect(result.detected_languages)}")
end

keywords = result.metadata["keywords"]

if keywords do
  IO.puts("Keywords: #{inspect(keywords)}")
end

if result.chunks do
  IO.puts("Chunks: #{length(result.chunks)}")
end

if result.images do
  IO.puts("Images: #{length(result.images)}")
end

View File

@@ -0,0 +1,103 @@
```elixir title="Elixir"
# Extracts a document with OCR, chunking, image extraction, language detection
# and caching configured, then reports each part of the result section by
# section. Extraction errors are printed instead of raising.
alias Kreuzberg.ExtractionConfig

# Build comprehensive configuration for all features
config = %ExtractionConfig{
  # OCR settings
  ocr: %{
    "enabled" => true,
    "backend" => "tesseract",
    "language" => "eng",
    "force_all_pages" => false
  },
  # Chunking settings
  chunking: %{
    "enabled" => true,
    "max_characters" => 1000,
    "overlap" => 100
  },
  # Image extraction
  images: %{
    "extract" => true
  },
  # Language detection
  language_detection: %{
    "enabled" => true
  },
  # Caching switched on
  use_cache: true
}

# Extract file with full configuration
case Kreuzberg.extract_file("document.pdf", nil, config) do
  {:ok, result} ->
    IO.puts("=== Extraction Successful ===\n")

    # 1. Process content
    IO.puts("--- Content ---")
    content_preview = String.slice(result.content, 0..100)
    IO.puts("Preview: #{content_preview}...")
    IO.puts("Total size: #{byte_size(result.content)} bytes\n")

    # 2. Process metadata
    IO.puts("--- Metadata ---")
    IO.puts("MIME type: #{result.mime_type}")
    metadata = result.metadata || %{}
    IO.puts("Metadata keys: #{inspect(Map.keys(metadata))}")

    # PDF-specific metadata is printed only when a "pdf" map is present
    case metadata["pdf"] do
      pdf_meta when is_map(pdf_meta) ->
        IO.puts(" Pages: #{pdf_meta["page_count"]}")
        IO.puts(" Author: #{pdf_meta["author"]}")
        IO.puts(" Title: #{pdf_meta["title"]}")

      _ ->
        nil
    end

    IO.puts("")

    # 3. Process tables
    IO.puts("--- Tables ---")
    tables = result.tables || []
    IO.puts("Total tables: #{length(tables)}")

    tables
    |> Enum.with_index(1)
    |> Enum.each(fn {table, idx} ->
      cells = table["cells"] || []
      IO.puts(" Table #{idx}: #{length(cells)} rows")
      markdown = table["markdown"]
      if markdown, do: IO.puts(" Markdown: #{String.slice(markdown, 0..50)}...")
    end)

    IO.puts("")

    # 4. Process chunks
    IO.puts("--- Chunks ---")
    chunks = result.chunks || []
    IO.puts("Total chunks: #{length(chunks)}")

    chunks
    |> Enum.with_index(1)
    |> Enum.each(fn {chunk, idx} ->
      # Measure the chunk's text: the other examples in this file read chunks
      # as maps with a "content" key, and `byte_size/1` raises on a map.
      IO.puts(" Chunk #{idx}: #{byte_size(chunk["content"] || "")} bytes")
    end)

    IO.puts("")

    # 5. Process detected languages
    # NOTE(review): assumes each entry can be interpolated directly (e.g. a
    # language code string) — confirm against the library's result shape.
    IO.puts("--- Language Detection ---")
    languages = result.detected_languages || []

    if Enum.empty?(languages) do
      IO.puts("No languages detected")
    else
      Enum.each(languages, fn lang ->
        IO.puts(" Language: #{lang}")
      end)
    end

    IO.puts("")

    # 6. Process images
    IO.puts("--- Images ---")
    images = result.images || []
    IO.puts("Total images: #{length(images)}")

    images
    |> Enum.with_index(1)
    |> Enum.each(fn {image, idx} ->
      IO.puts(" Image #{idx}: #{image["format"]} (#{image["size"]} bytes)")
    end)

  {:error, reason} ->
    IO.puts("Extraction failed!")
    IO.puts("Error: #{inspect(reason)}")
end
```

View File

@@ -0,0 +1,27 @@
# Extract and prepare for embeddings
config = %Kreuzberg.ExtractionConfig{
  chunking: %{"enabled" => true, "max_characters" => 512},
  embeddings: %{"enabled" => true}
}

{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)

# Process chunks with embeddings for semantic search, numbering them from 1.
# Fall back to an empty list when `result.chunks` is nil (other examples in
# this file guard the same way); piping nil into `Enum.with_index/2` would raise.
embedded_chunks =
  (result.chunks || [])
  |> Enum.with_index(1)
  |> Enum.map(fn {chunk, idx} ->
    %{
      "chunk_id" => idx,
      "content" => chunk["content"],
      "embedding" => chunk["embedding"],
      "page" => chunk["page"],
      "metadata" => %{
        "document" => "document.pdf",
        "chunk_index" => idx
      }
    }
  end)

# The records are only printed here; storing them is left to the caller
IO.puts("Prepared #{length(embedded_chunks)} chunks with embeddings")
IO.inspect(embedded_chunks, label: "Embedded Chunks")

View File

@@ -0,0 +1,32 @@
```elixir title="Elixir"
# Chunking and embeddings configuration encoded as JSON
config_json =
  Jason.encode!(%{
    "chunking" => %{
      "enabled" => true,
      "max_characters" => 512
    },
    "embeddings" => %{
      "enabled" => true
    }
  })

{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)

# Process chunks with embeddings for semantic search, numbering them from 1.
# Fall back to an empty list when `result.chunks` is nil (other examples in
# this file guard the same way); piping nil into `Enum.with_index/2` would raise.
embedded_chunks =
  (result.chunks || [])
  |> Enum.with_index(1)
  |> Enum.map(fn {chunk, idx} ->
    %{
      "chunk_id" => idx,
      "content" => chunk["content"],
      "embedding" => chunk["embedding"],
      "page" => chunk["page"],
      "metadata" => %{
        "document" => "document.pdf",
        "chunk_index" => idx
      }
    }
  end)

IO.puts("Prepared #{length(embedded_chunks)} chunks with embeddings")
IO.inspect(embedded_chunks, label: "Embedded Chunks")
```

View File

@@ -0,0 +1,15 @@
```elixir title="Elixir"
# Keyword options: "Yake" algorithm, at most 10 keywords, minimum score 0.3
keyword_options = %{
  "algorithm" => "Yake",
  "max_keywords" => 10,
  "min_score" => 0.3
}

config_json = Jason.encode!(%{"keywords" => keyword_options})

{:ok, result} =
  Kreuzberg.extract_file_sync("research_paper.pdf", "application/pdf", config_json)

# Print the keywords only when the result carries any
keywords = result.keywords

if keywords do
  IO.puts("Keywords: #{inspect(keywords)}")
end
```

View File

@@ -0,0 +1,30 @@
```elixir title="Elixir"
alias Kreuzberg.ExtractionConfig

# Keyword extraction enabled, limited to 15 keywords
keyword_options = %{"enabled" => true, "max_keywords" => 15}
config = %ExtractionConfig{keyword_extraction: keyword_options}

case Kreuzberg.extract_file("article.pdf", nil, config) do
  {:error, reason} ->
    IO.puts("Extraction failed!")
    IO.puts("Error: #{inspect(reason)}")

  {:ok, result} ->
    IO.puts("=== Keyword Extraction ===\n")

    # List every keyword with its score, or say that none were returned
    case result.keywords do
      missing when missing in [nil, false] ->
        IO.puts("No keywords extracted")

      keywords ->
        IO.puts("Extracted keywords:")

        Enum.each(keywords, fn entry ->
          IO.puts("- #{entry["word"]}: #{entry["score"]}")
        end)
    end
end
```

View File

@@ -0,0 +1,18 @@
```elixir title="Elixir"
# Keyword options: "Yake" algorithm, at most 10 keywords, minimum score 0.3
keyword_options = %{
  "algorithm" => "Yake",
  "max_keywords" => 10,
  "min_score" => 0.3
}

config_json = Jason.encode!(%{"keywords" => keyword_options})

{:ok, result} =
  Kreuzberg.extract_file_sync("research_paper.pdf", "application/pdf", config_json)

# Prints one "keyword: score" line, with the score rounded to 4 decimals
print_keyword = fn %{"keyword" => kw, "score" => score} ->
  IO.puts("#{kw}: #{Float.round(score, 4)}")
end

keywords = result.keywords

if keywords do
  Enum.each(keywords, print_keyword)
end
```

View File

@@ -0,0 +1,15 @@
```elixir title="Elixir"
# Language detection with a 0.8 minimum confidence and multiple detection off
detection_options = %{
  "enabled" => true,
  "min_confidence" => 0.8,
  "detect_multiple" => false
}

config_json = Jason.encode!(%{"language_detection" => detection_options})

{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)

# Print the language only when the result carries one
language = result.language

if language do
  IO.puts("Detected language: #{language}")
end
```

View File

@@ -0,0 +1,32 @@
```elixir title="Elixir"
alias Kreuzberg.ExtractionConfig

# Language detection enabled with "detect_all" switched on
detection_options = %{"enabled" => true, "detect_all" => true}
config = %ExtractionConfig{language_detection: detection_options}

case Kreuzberg.extract_file("multilingual.pdf", nil, config) do
  {:error, reason} ->
    IO.puts("Extraction failed!")
    IO.puts("Error: #{inspect(reason)}")

  {:ok, result} ->
    IO.puts("=== Language Detection ===\n")

    # A missing language list is treated as empty
    languages = result.detected_languages || []

    if not Enum.empty?(languages) do
      IO.puts("Detected languages:")

      Enum.each(languages, fn language ->
        IO.puts("- #{language}")
      end)

      IO.puts("\nTotal languages: #{length(languages)}")
    else
      IO.puts("No languages detected")
    end
end
```

View File

@@ -0,0 +1,18 @@
```elixir title="Elixir"
# Language detection with a 0.7 minimum confidence and multiple detection on
detection_options = %{
  "enabled" => true,
  "min_confidence" => 0.7,
  "detect_multiple" => true
}

config_json = Jason.encode!(%{"language_detection" => detection_options})

{:ok, result} =
  Kreuzberg.extract_file_sync("multilingual_document.pdf", "application/pdf", config_json)

# Prints one language with its confidence rounded to 4 decimals
print_language = fn %{"language" => lang, "confidence" => conf} ->
  rounded = Float.round(conf, 4)
  IO.puts(" - #{lang}: #{rounded}")
end

detected = result.languages

if detected do
  IO.puts("Detected languages:")
  Enum.each(detected, print_language)
end
```

View File

@@ -0,0 +1,28 @@
```elixir title="Elixir"
{:ok, result} = Kreuzberg.extract_file("document.pdf")

# Fields are read straight from the top level of the metadata map
metadata = result.metadata
IO.puts("MIME type: #{result.mime_type}")
IO.puts("All metadata keys: #{inspect(Map.keys(metadata))}")

# PDF fields: page count, authors and title, each printed only when present
page_count = metadata["page_count"]

if page_count do
  IO.puts("Page count: #{page_count}")
end

authors = metadata["authors"] || []

if authors != [] do
  IO.puts("Authors: #{Enum.join(authors, ", ")}")
end

title = metadata["title"]

if title do
  IO.puts("Title: #{title}")
end

# HTML fields: keywords and description, read the same way
{:ok, html_result} = Kreuzberg.extract_file("page.html")
html_meta = html_result.metadata

keywords = html_meta["keywords"] || []

if keywords != [] do
  IO.puts("Keywords: #{Enum.join(keywords, ", ")}")
end

description = html_meta["description"]

if description do
  IO.puts("Description: #{description}")
end
```

View File

@@ -0,0 +1,15 @@
```elixir title="Elixir"
# A single enabled post-processor named "QualityFilter"
quality_filter = %{"name" => "QualityFilter", "enabled" => true}
config_json = Jason.encode!(%{"post_processors" => [quality_filter]})

{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)

IO.puts("Extraction quality applied")
IO.inspect(result.text, label: "Quality-filtered text")
```

View File

@@ -0,0 +1,39 @@
```elixir title="Elixir"
alias Kreuzberg.ExtractionConfig

# Maps a quality score to a label: >= 0.9 "Excellent", >= 0.8 "Good",
# >= 0.7 "Fair", anything else "Poor".
# Written as an anonymous function and bound before its first use: `defp` is
# only valid inside a module, so it cannot appear in a top-level snippet.
quality_status = fn
  score when score >= 0.9 -> "Excellent"
  score when score >= 0.8 -> "Good"
  score when score >= 0.7 -> "Fair"
  _score -> "Poor"
end

# Process document with quality filtering
config = %ExtractionConfig{
  quality_processing: %{
    "enabled" => true,
    "min_quality_score" => 0.8
  }
}

case Kreuzberg.extract_file("document.pdf", nil, config) do
  {:ok, result} ->
    IO.puts("=== Quality Processing ===\n")

    # Display quality metrics; a nil or non-positive score is not reported
    quality_score = result.quality_score

    if quality_score && quality_score > 0 do
      IO.puts("Quality Score: #{quality_score}")
      IO.puts("Content Quality: #{quality_status.(quality_score)}")
    end

    # Display a preview of the extracted content and its total size
    IO.puts("\n--- Extracted Content ---")
    content_preview = String.slice(result.content, 0..200)
    IO.puts(content_preview)
    IO.puts("\nTotal size: #{byte_size(result.content)} bytes")

  {:error, reason} ->
    IO.puts("Extraction failed!")
    IO.puts("Error: #{inspect(reason)}")
end
```

View File

@@ -0,0 +1,22 @@
```elixir title="Elixir"
# A single enabled post-processor named "QualityFilter"
config_json =
  Jason.encode!(%{
    "post_processors" => [
      %{
        "name" => "QualityFilter",
        "enabled" => true
      }
    ]
  })

# Extract the same document without and with the configuration
{:ok, result_before} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", nil)
{:ok, result_after} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)

# Compare text lengths; missing text is treated as empty
text_before = result_before.text || ""
text_after = result_after.text || ""
chars_before = String.length(text_before)
chars_after = String.length(text_after)

IO.puts("Before quality processing: #{chars_before} chars")
IO.puts("After quality processing: #{chars_after} chars")

# The percentage divides by the "before" length, which is 0 when no text was
# extracted; guard against the resulting ArithmeticError.
if chars_before > 0 do
  improvement = Float.round((1 - chars_after / chars_before) * 100, 2)
  IO.puts("Improvement: #{improvement}%")
else
  IO.puts("Improvement: n/a (no text before quality processing)")
end
```

View File

@@ -0,0 +1,19 @@
```elixir title="Elixir"
{:ok, result} = Kreuzberg.extract_file("document.pdf")

# Fall back to an empty list when `result.tables` is nil (another example in
# this file guards the same way); `length/1` raises on nil.
tables = result.tables || []
IO.puts("Total tables found: #{length(tables)}")

# Print each table's row count and markdown rendering, numbering from 1
tables
|> Enum.with_index(1)
|> Enum.each(fn {table, index} ->
  IO.puts("\n--- Table #{index} ---")

  # Access table cells; a missing "cells" entry counts as zero rows
  cells = table["cells"] || []
  IO.puts("Rows: #{length(cells)}")

  # Access table markdown representation
  markdown = table["markdown"]
  IO.puts("Markdown representation:")
  IO.puts(markdown)
end)
```

View File

@@ -0,0 +1,17 @@
```elixir title="Elixir"
# Token reduction in "moderate" mode with markdown preservation on
reduction_options = %{"mode" => "moderate", "preserve_markdown" => true}
config_json = Jason.encode!(%{"token_reduction" => reduction_options})

{:ok, result} =
  Kreuzberg.extract_file_sync("verbose_document.pdf", "application/pdf", config_json)

# Each count is printed only when the result carries it
original_count = result.original_token_count

if original_count do
  IO.puts("Original tokens: #{original_count}")
end

reduced_count = result.reduced_token_count

if reduced_count do
  IO.puts("Reduced tokens: #{reduced_count}")
end
```

View File

@@ -0,0 +1,42 @@
```elixir title="Elixir"
alias Kreuzberg.ExtractionConfig

# Token reduction enabled with a 2000 token target
reduction_options = %{"enabled" => true, "target_tokens" => 2000}
config = %ExtractionConfig{token_reduction: reduction_options}

case Kreuzberg.extract_file("document.pdf", nil, config) do
  {:error, reason} ->
    IO.puts("Extraction failed!")
    IO.puts("Error: #{inspect(reason)}")

  {:ok, result} ->
    IO.puts("=== Token Reduction ===\n")

    # Size of the returned content and a token estimate derived from it
    content = result.content
    content_size = byte_size(content)
    # Rough estimate: 1 token ≈ 4 bytes
    estimated_tokens = div(content_size, 4)

    IO.puts("Content size: #{content_size} bytes")
    IO.puts("Estimated tokens: ~#{estimated_tokens}")
    IO.puts("Target tokens: 2000")

    # Compare the estimate with the target and report accordingly
    if estimated_tokens <= 2000 do
      IO.puts("\nNo reduction needed - content already below target")
    else
      reduction_percentage = trunc((1 - 2000 / estimated_tokens) * 100)
      IO.puts("\nToken reduction applied: ~#{reduction_percentage}% reduction")
    end

    # Preview of the returned content (leading 301 characters)
    IO.puts("\n--- Reduced Content ---")
    content_preview = String.slice(content, 0, 301)
    IO.puts(content_preview)
    IO.puts("...")
end
```

View File

@@ -0,0 +1,28 @@
```elixir title="Elixir"
# Token reduction in "moderate" mode with markdown preservation on
reduction_options = %{"mode" => "moderate", "preserve_markdown" => true}
config_json = Jason.encode!(%{"token_reduction" => reduction_options})

{:ok, result} =
  Kreuzberg.extract_file_sync("verbose_document.pdf", "application/pdf", config_json)

# Token counts, with missing values treated as 0
original = result.original_token_count || 0
reduced = result.reduced_token_count || 0

IO.puts("Original tokens: #{original}")
IO.puts("Reduced tokens: #{reduced}")

# The percentage is only computed when there is a positive original count
if original > 0 do
  remaining_ratio = reduced / original
  reduction_percent = Float.round((1 - remaining_ratio) * 100, 2)
  IO.puts("Reduction: #{reduction_percent}%")
end

# Leading 201 characters of the text, when the result carries any
text = result.text

if text do
  IO.puts("\nSample of reduced text:")
  IO.puts("#{String.slice(text, 0, 201)}...")
end
```

View File

@@ -0,0 +1,40 @@
```elixir title="Elixir"
# Chunking plus embeddings configuration encoded as JSON
config_json =
  Jason.encode!(%{
    "chunking" => %{
      "enabled" => true,
      "max_characters" => 512,
      "overlap" => 50
    },
    "embeddings" => %{
      "enabled" => true,
      "model" => "all-MiniLM-L6-v2"
    }
  })

{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)

# Prepare data for vector database storage: one record per chunk, with an id
# built from the document id and the 1-based chunk index.
# Fall back to an empty list when `result.chunks` is nil (other examples in
# this file guard the same way); piping nil into `Enum.with_index/2` would raise.
vector_db_records =
  (result.chunks || [])
  |> Enum.with_index(1)
  |> Enum.map(fn {chunk, idx} ->
    %{
      "id" => "#{result.document_id}-chunk-#{idx}",
      "vector" => chunk["embedding"],
      "metadata" => %{
        "content" => chunk["content"],
        "page" => chunk["page"],
        "document_id" => result.document_id,
        "chunk_index" => idx
      }
    }
  end)

IO.puts("Generated #{length(vector_db_records)} records for vector database")
# `List.first/1` returns nil when there are no records
IO.inspect(List.first(vector_db_records), label: "Sample Record")

# Example: insert into a Pinecone-like vector database (only printed here)
Enum.each(vector_db_records, fn record ->
  # vector_db_client.upsert(record)
  IO.puts("Would insert: #{record["id"]}")
end)
```