Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,53 @@
```elixir title="Elixir"
# Full-featured example: every major option group is set in a single config.
# Each group is built on its own so it can be read (or dropped) independently.
ocr = %{"backend" => "tesseract", "language" => "eng"}

embedding = %{
  "model" => %{"preset" => %{"name" => "balanced"}},
  "batch_size" => 32,
  "normalize" => true,
  "show_download_progress" => false
}

chunking = %{
  "max_characters" => 1000,
  "overlap" => 200,
  "embedding" => embedding
}

language_detection = %{
  "enabled" => true,
  "min_confidence" => 0.8,
  "detect_multiple" => false
}

keywords = %{
  "algorithm" => "Yake",
  "max_keywords" => 10,
  "min_score" => 0.1,
  "ngram_range" => [1, 3],
  "language" => "en"
}

token_reduction = %{"mode" => "moderate", "preserve_important_words" => true}

# The config is handed to the extractor as a JSON string.
config_json =
  Jason.encode!(%{
    "use_cache" => true,
    "enable_quality_processing" => true,
    "ocr" => ocr,
    "chunking" => chunking,
    "language_detection" => language_detection,
    "keywords" => keywords,
    "token_reduction" => token_reduction,
    "postprocessor" => %{"enabled" => true}
  })

# Assertive match: anything other than {:ok, _} raises a MatchError here.
{:ok, extraction} =
  Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)

IO.puts("Content: #{extraction.content}")

# Only report languages when the field is set.
languages = extraction.detected_languages
if languages, do: IO.puts("Languages: #{inspect(languages)}")

# A missing chunk list counts as zero chunks.
chunks_count = length(extraction.chunks || [])
IO.puts("Chunks: #{chunks_count}")
```

View File

@@ -0,0 +1,15 @@
```elixir title="Elixir"
# Chunking example: request chunked output and print the size of every chunk.
config_json =
  Jason.encode!(%{
    "chunking" => %{
      "max_characters" => 1000,
      "overlap" => 200
    }
  })

# Assertive match: anything other than {:ok, _} raises a MatchError here.
{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)

# Fall back to an empty list so an unset chunks field cannot make length/1 or
# Enum.each/2 raise; the other chunking examples guard this field the same way.
chunks = result.chunks || []

IO.puts("Chunks: #{length(chunks)}")

Enum.each(chunks, fn chunk ->
  IO.puts("Length: #{String.length(chunk.content)}")
end)
```

View File

@@ -0,0 +1,9 @@
```elixir title="Elixir"
# Minimal explicit configuration: enable the cache and quality processing.
options = %{
  "use_cache" => true,
  "enable_quality_processing" => true
}

# The config is handed to the extractor as a JSON string.
config_json = Jason.encode!(options)

# Assertive match: anything other than {:ok, _} raises a MatchError here.
{:ok, extraction} =
  Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)

IO.puts(extraction.content)
```

View File

@@ -0,0 +1,5 @@
```elixir title="Elixir"
# Passing nil as the config argument makes the extractor use the
# discovered/default configuration instead of an explicit one.
{:ok, extraction} =
  Kreuzberg.extract_file_sync("document.pdf", "application/pdf", nil)

IO.puts(extraction.content)
```

View File

@@ -0,0 +1,12 @@
```elixir title="Elixir"
# OCR example: run the Tesseract backend with the "eng" language setting.
ocr_options = %{"backend" => "tesseract", "language" => "eng"}

# The config is handed to the extractor as a JSON string.
config_json = Jason.encode!(%{"ocr" => ocr_options})

# Assertive match: anything other than {:ok, _} raises a MatchError here.
{:ok, extraction} =
  Kreuzberg.extract_file_sync("scanned.pdf", "application/pdf", config_json)

content_length = String.length(extraction.content)
IO.puts("Content length: #{content_length}")

table_count = length(extraction.tables)
IO.puts("Tables detected: #{table_count}")
```

View File

@@ -0,0 +1,20 @@
```elixir title="Elixir"
# Combined example: caching, Tesseract OCR, chunking and quality processing.
ocr_options = %{
  "backend" => "tesseract",
  "language" => "eng+deu",
  "tesseract_config" => %{"psm" => 6}
}

chunking_options = %{"max_characters" => 1000, "overlap" => 200}

# The config is handed to the extractor as a JSON string.
config_json =
  Jason.encode!(%{
    "use_cache" => true,
    "ocr" => ocr_options,
    "chunking" => chunking_options,
    "enable_quality_processing" => true
  })

# Assertive match: anything other than {:ok, _} raises a MatchError here.
{:ok, extraction} =
  Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)

content_length = String.length(extraction.content)
IO.puts("Content length: #{content_length}")
```

View File

@@ -0,0 +1,13 @@
```elixir title="Document Structure Config (Elixir)"
# Document-structure example: ask for the structured document tree and print
# the type of every node in it.
#
# NOTE(review): this call passes the config struct as the second argument;
# confirm that a (path, config) form of extract_file_sync exists, or switch to
# the (path, mime_type, config) form.
config = %Kreuzberg.ExtractionConfig{include_document_structure: true}

# Assertive match: anything other than {:ok, _} raises a MatchError here.
{:ok, extraction} = Kreuzberg.extract_file_sync("document.pdf", config)

# The document field is only walked when it is set.
if extraction.document do
  Enum.each(extraction.document.nodes, &IO.puts("[#{&1.content.node_type}]"))
end
```

View File

@@ -0,0 +1,36 @@
```elixir title="Element-Based Output (Elixir)"
# Element-based output example: list every extracted element, then print the
# title elements together with their level.
#
# NOTE(review): this call passes the config struct as the second argument;
# confirm that a (path, config) form of extract_file_sync exists, or switch to
# the (path, mime_type, config) form.
config = %Kreuzberg.ExtractionConfig{output_format: :element_based}

# Assertive match: anything other than {:ok, _} raises a MatchError here.
{:ok, extraction} = Kreuzberg.extract_file_sync("document.pdf", config)

# Dump each element: type, a preview of the text, then optional location info.
Enum.each(extraction.elements, fn item ->
  IO.puts("Type: #{item.element_type}")

  # Only the first 100 characters are shown to keep the output short.
  preview = String.slice(item.text, 0, 100)
  IO.puts("Text: #{preview}")

  if item.metadata.page_number do
    IO.puts("Page: #{item.metadata.page_number}")
  end

  if item.metadata.coordinates do
    box = item.metadata.coordinates
    IO.puts("Coords: (#{box.left}, #{box.top}) - (#{box.right}, #{box.bottom})")
  end

  IO.puts("---")
end)

# Keep only the title elements and print each one with its level.
extraction.elements
|> Enum.filter(&(&1.element_type == :title))
|> Enum.each(fn heading ->
  # The level is read from the string-keyed additional metadata map.
  level = Map.get(heading.metadata.additional, "level", "unknown")
  IO.puts("[#{level}] #{heading.text}")
end)
```

View File

@@ -0,0 +1,22 @@
```elixir title="Elixir"
# Embedding example: chunk the document with the "balanced" embedding preset.
embedding = %{
  "model" => %{"preset" => %{"name" => "balanced"}},
  "batch_size" => 16,
  "normalize" => true,
  "show_download_progress" => true
}

chunking = %{
  "max_characters" => 1000,
  "overlap" => 200,
  "embedding" => embedding
}

# The config is handed to the extractor as a JSON string.
config_json = Jason.encode!(%{"chunking" => chunking})

# Assertive match: anything other than {:ok, _} raises a MatchError here.
{:ok, extraction} =
  Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)

# A missing chunk list counts as zero chunks.
chunks_count = length(extraction.chunks || [])
IO.puts("Chunks with embeddings: #{chunks_count}")
```

View File

@@ -0,0 +1,11 @@
```elixir title="Elixir"
# HTML output example: request "Html" output rendered with the "GitHub" theme.
options = %{
  "output_format" => "Html",
  "html_output" => %{"theme" => "GitHub"}
}

# The config is handed to the extractor as a JSON string.
config_json = Jason.encode!(options)

# Assertive match: anything other than {:ok, _} raises a MatchError here.
{:ok, extraction} =
  Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)

IO.puts(extraction.content)
```

View File

@@ -0,0 +1,14 @@
```elixir title="Elixir"
# Keyword extraction example using the "Yake" algorithm setting.
keyword_options = %{
  "algorithm" => "Yake",
  "max_keywords" => 10,
  "min_score" => 0.1,
  "ngram_range" => [1, 3],
  "language" => "en"
}

# The config is handed to the extractor as a JSON string.
config_json = Jason.encode!(%{"keywords" => keyword_options})

# Assertive match: anything other than {:ok, _} raises a MatchError here.
{:ok, extraction} =
  Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)

# inspect/1 is used because the keywords field is not a plain string.
IO.puts("Keywords: #{inspect(extraction.keywords)}")
```

View File

@@ -0,0 +1,13 @@
```elixir title="Elixir"
# Language detection example: enable detection with multiple languages allowed
# and print whatever languages were detected.
config_json =
  Jason.encode!(%{
    "language_detection" => %{
      "enabled" => true,
      "min_confidence" => 0.8,
      "detect_multiple" => true
    }
  })

# Assertive match: anything other than {:ok, _} raises a MatchError here.
{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)

# Read the detected_languages field (the one the full configuration example
# reads for this same option group) and only print it when it is set.
if result.detected_languages do
  IO.puts("Detected languages: #{inspect(result.detected_languages)}")
end
```

View File

@@ -0,0 +1,15 @@
```elixir title="Elixir"
# Image extraction example: extract embedded images with explicit DPI settings.
image_options = %{
  "extract_images" => true,
  "target_dpi" => 300,
  "max_image_dimension" => 4096,
  "auto_adjust_dpi" => true,
  "min_dpi" => 150,
  "max_dpi" => 600
}

# The config is handed to the extractor as a JSON string.
config_json = Jason.encode!(%{"images" => image_options})

# Assertive match: anything other than {:ok, _} raises a MatchError here.
{:ok, extraction} =
  Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)

image_count = length(extraction.images)
IO.puts("Extracted images: #{image_count}")
```

View File

@@ -0,0 +1,14 @@
```elixir title="Elixir"
# PDF options example: supply a password list for an encrypted PDF and print
# the title and authors from the resulting metadata.
pdf_options = %{
  "extract_images" => true,
  "passwords" => ["password123"],
  "extract_metadata" => true,
  "hierarchy" => %{}
}

# The config is handed to the extractor as a JSON string.
config_json = Jason.encode!(%{"pdf_options" => pdf_options})

# Assertive match: anything other than {:ok, _} raises a MatchError here.
{:ok, extraction} =
  Kreuzberg.extract_file_sync("encrypted.pdf", "application/pdf", config_json)

metadata = extraction.metadata

# inspect/1 keeps the output readable whatever the field values are.
IO.puts("Title: #{inspect(metadata.title)}")
IO.puts("Authors: #{inspect(metadata.authors)}")
```

View File

@@ -0,0 +1,16 @@
```elixir title="Elixir"
# PDF hierarchy example: enable hierarchy detection with explicit thresholds
# and level bounds.
hierarchy_options = %{
  "enabled" => true,
  "detection_threshold" => 0.75,
  "ocr_coverage_threshold" => 0.8,
  "min_level" => 1,
  "max_level" => 5
}

# The config is handed to the extractor as a JSON string.
config_json =
  Jason.encode!(%{"pdf_options" => %{"hierarchy" => hierarchy_options}})

# Assertive match: anything other than {:ok, _} raises a MatchError here.
{:ok, extraction} =
  Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)

level_count = length(extraction.hierarchy)
IO.puts("Hierarchy levels: #{level_count}")
```

View File

@@ -0,0 +1,14 @@
```elixir title="Elixir"
# Postprocessor example: enable postprocessing and name the processors to run.
enabled_processors = ["whitespace_normalizer", "unicode_normalizer"]

postprocessor_options = %{
  "enabled" => true,
  "enabled_processors" => enabled_processors
}

# The config is handed to the extractor as a JSON string.
config_json = Jason.encode!(%{"postprocessor" => postprocessor_options})

# Assertive match: anything other than {:ok, _} raises a MatchError here.
{:ok, extraction} =
  Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)

IO.puts("Processed content: #{extraction.content}")
```

View File

@@ -0,0 +1,10 @@
```elixir title="Elixir"
# Quality processing example: enable quality processing (with caching) and
# print the quality score and processing time from the result.
options = %{
  "enable_quality_processing" => true,
  "use_cache" => true
}

# The config is handed to the extractor as a JSON string.
config_json = Jason.encode!(options)

# Assertive match: anything other than {:ok, _} raises a MatchError here.
{:ok, extraction} =
  Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)

IO.puts("Quality score: #{extraction.quality_score}")

# inspect/1 is used because processing_time is not interpolated directly.
IO.puts("Processing time: #{inspect(extraction.processing_time)}")
```

View File

@@ -0,0 +1,15 @@
```elixir title="Elixir"
# Tesseract OCR example: set the language string plus the psm and oem values
# in the backend-specific tesseract_config map.
tesseract_config = %{"psm" => 6, "oem" => 3}

ocr_options = %{
  "backend" => "tesseract",
  "language" => "eng+deu",
  "tesseract_config" => tesseract_config
}

# The config is handed to the extractor as a JSON string.
config_json = Jason.encode!(%{"ocr" => ocr_options})

# Assertive match: anything other than {:ok, _} raises a MatchError here.
{:ok, extraction} =
  Kreuzberg.extract_file_sync("scanned.pdf", "application/pdf", config_json)

IO.puts("OCR text: #{extraction.content}")
```

View File

@@ -0,0 +1,12 @@
```elixir title="Elixir"
# Token reduction example: apply the "moderate" mode while preserving
# important words, then print the token count and the resulting content.
reduction_options = %{
  "mode" => "moderate",
  "preserve_important_words" => true
}

# The config is handed to the extractor as a JSON string.
config_json = Jason.encode!(%{"token_reduction" => reduction_options})

# Assertive match: anything other than {:ok, _} raises a MatchError here.
{:ok, extraction} =
  Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)

IO.puts("Original tokens: #{extraction.token_count}")
IO.puts("Reduced content: #{extraction.content}")
```