This commit is contained in:
53
docs/snippets/elixir/config/advanced_config.md
Normal file
53
docs/snippets/elixir/config/advanced_config.md
Normal file
@@ -0,0 +1,53 @@
|
||||
```elixir title="Elixir"
|
||||
# Per-feature settings, built separately and combined into one config below.
ocr = %{"backend" => "tesseract", "language" => "eng"}

embedding = %{
  "model" => %{"preset" => %{"name" => "balanced"}},
  "batch_size" => 32,
  "normalize" => true,
  "show_download_progress" => false
}

chunking = %{"max_characters" => 1000, "overlap" => 200, "embedding" => embedding}

language_detection = %{
  "enabled" => true,
  "min_confidence" => 0.8,
  "detect_multiple" => false
}

keywords = %{
  "algorithm" => "Yake",
  "max_keywords" => 10,
  "min_score" => 0.1,
  "ngram_range" => [1, 3],
  "language" => "en"
}

token_reduction = %{"mode" => "moderate", "preserve_important_words" => true}

# The configuration is passed to the extractor as a JSON-encoded string.
config_json =
  %{
    "use_cache" => true,
    "enable_quality_processing" => true,
    "ocr" => ocr,
    "chunking" => chunking,
    "language_detection" => language_detection,
    "keywords" => keywords,
    "token_reduction" => token_reduction,
    "postprocessor" => %{"enabled" => true}
  }
  |> Jason.encode!()

{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
IO.puts("Content: #{result.content}")

# Languages are printed only when the field is set.
if result.detected_languages, do: IO.puts("Languages: #{inspect(result.detected_languages)}")

# A missing chunk list is reported as zero chunks.
chunks_count = (result.chunks && length(result.chunks)) || 0
IO.puts("Chunks: #{chunks_count}")
|
||||
```
|
||||
15
docs/snippets/elixir/config/chunking_config.md
Normal file
15
docs/snippets/elixir/config/chunking_config.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```elixir title="Elixir"
|
||||
# Chunking settings, passed to the extractor as a JSON-encoded string.
config_json =
  Jason.encode!(%{
    "chunking" => %{
      "max_characters" => 1000,
      "overlap" => 200
    }
  })

{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)

# Guard against `result.chunks` being nil: fall back to an empty list so that
# `length/1` and `Enum.each/2` below cannot raise.
chunks = result.chunks || []
IO.puts("Chunks: #{length(chunks)}")

# Print the character length of each chunk's content.
Enum.each(chunks, fn chunk ->
  IO.puts("Length: #{String.length(chunk.content)}")
end)
|
||||
```
|
||||
9
docs/snippets/elixir/config/config_basic.md
Normal file
9
docs/snippets/elixir/config/config_basic.md
Normal file
@@ -0,0 +1,9 @@
|
||||
```elixir title="Elixir"
|
||||
# Encode the settings map as JSON and hand it to the extractor.
config_json =
  %{"use_cache" => true, "enable_quality_processing" => true}
  |> Jason.encode!()

{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)

result.content |> IO.puts()
|
||||
```
|
||||
5
docs/snippets/elixir/config/config_discover.md
Normal file
5
docs/snippets/elixir/config/config_discover.md
Normal file
@@ -0,0 +1,5 @@
|
||||
```elixir title="Elixir"
|
||||
# No explicit configuration: a nil config means the discovered/default
# configuration is used.
config = nil

{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config)

result.content |> IO.puts()
|
||||
```
|
||||
12
docs/snippets/elixir/config/config_ocr.md
Normal file
12
docs/snippets/elixir/config/config_ocr.md
Normal file
@@ -0,0 +1,12 @@
|
||||
```elixir title="Elixir"
|
||||
# OCR settings, wrapped under the "ocr" key and JSON-encoded.
ocr = %{"backend" => "tesseract", "language" => "eng"}
config_json = %{"ocr" => ocr} |> Jason.encode!()

{:ok, result} = Kreuzberg.extract_file_sync("scanned.pdf", "application/pdf", config_json)

content_length = String.length(result.content)
IO.puts("Content length: #{content_length}")

table_count = length(result.tables)
IO.puts("Tables detected: #{table_count}")
|
||||
```
|
||||
20
docs/snippets/elixir/config/config_programmatic.md
Normal file
20
docs/snippets/elixir/config/config_programmatic.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```elixir title="Elixir"
|
||||
# Nested sections are built first, then merged into the top-level config.
ocr = %{
  "backend" => "tesseract",
  "language" => "eng+deu",
  "tesseract_config" => %{"psm" => 6}
}

chunking = %{"max_characters" => 1000, "overlap" => 200}

config_json =
  %{
    "use_cache" => true,
    "ocr" => ocr,
    "chunking" => chunking,
    "enable_quality_processing" => true
  }
  |> Jason.encode!()

{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)

content_length = String.length(result.content)
IO.puts("Content length: #{content_length}")
|
||||
```
|
||||
13
docs/snippets/elixir/config/document_structure_config.md
Normal file
13
docs/snippets/elixir/config/document_structure_config.md
Normal file
@@ -0,0 +1,13 @@
|
||||
```elixir title="Document Structure Config (Elixir)"
|
||||
# Ask the extractor to include the document structure in the result.
config = %Kreuzberg.ExtractionConfig{include_document_structure: true}

# NOTE(review): the config struct is passed as the second argument here;
# confirm that extract_file_sync accepts a config (rather than a MIME type)
# in that position.
{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", config)

# Print the type of every node, but only when a document is present.
if result.document do
  result.document.nodes
  |> Enum.each(&IO.puts("[#{&1.content.node_type}]"))
end
|
||||
```
|
||||
36
docs/snippets/elixir/config/element_based_output.md
Normal file
36
docs/snippets/elixir/config/element_based_output.md
Normal file
@@ -0,0 +1,36 @@
|
||||
```elixir title="Element-Based Output (Elixir)"
|
||||
# Request element-based output.
config = %Kreuzberg.ExtractionConfig{output_format: :element_based}

# Run the extraction.
# NOTE(review): the config struct is passed as the second argument here;
# confirm that extract_file_sync accepts a config (rather than a MIME type)
# in that position.
{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", config)

# Walk every element: type, the first 100 characters of its text, then the
# optional page number and coordinates.
Enum.each(result.elements, fn element ->
  IO.puts("Type: #{element.element_type}")

  preview = String.slice(element.text, 0, 100)
  IO.puts("Text: #{preview}")

  if element.metadata.page_number, do: IO.puts("Page: #{element.metadata.page_number}")

  box = element.metadata.coordinates

  if box do
    IO.puts("Coords: (#{box.left}, #{box.top}) - (#{box.right}, #{box.bottom})")
  end

  IO.puts("---")
end)

# Keep only the elements whose type is :title.
titles = for e <- result.elements, e.element_type == :title, do: e

# "level" is read from the additional metadata, defaulting to "unknown".
Enum.each(titles, fn title ->
  level = Map.get(title.metadata.additional, "level", "unknown")
  IO.puts("[#{level}] #{title.text}")
end)
|
||||
```
|
||||
22
docs/snippets/elixir/config/embedding_config.md
Normal file
22
docs/snippets/elixir/config/embedding_config.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```elixir title="Elixir"
|
||||
# Embedding settings live inside the chunking section.
embedding = %{
  "model" => %{"preset" => %{"name" => "balanced"}},
  "batch_size" => 16,
  "normalize" => true,
  "show_download_progress" => true
}

chunking = %{"max_characters" => 1000, "overlap" => 200, "embedding" => embedding}

config_json = %{"chunking" => chunking} |> Jason.encode!()

{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)

# A missing chunk list is reported as zero chunks.
chunks_count = (result.chunks && length(result.chunks)) || 0
IO.puts("Chunks with embeddings: #{chunks_count}")
|
||||
```
|
||||
11
docs/snippets/elixir/config/html_output.md
Normal file
11
docs/snippets/elixir/config/html_output.md
Normal file
@@ -0,0 +1,11 @@
|
||||
```elixir title="Elixir"
|
||||
# HTML output format with its theme option, JSON-encoded for the extractor.
html_output = %{"theme" => "GitHub"}

config_json =
  %{"output_format" => "Html", "html_output" => html_output}
  |> Jason.encode!()

{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)

result.content |> IO.puts()
|
||||
```
|
||||
14
docs/snippets/elixir/config/keyword_extraction_config.md
Normal file
14
docs/snippets/elixir/config/keyword_extraction_config.md
Normal file
@@ -0,0 +1,14 @@
|
||||
```elixir title="Elixir"
|
||||
# Keyword-extraction settings, wrapped under the "keywords" key.
keywords = %{
  "algorithm" => "Yake",
  "max_keywords" => 10,
  "min_score" => 0.1,
  "ngram_range" => [1, 3],
  "language" => "en"
}

config_json = %{"keywords" => keywords} |> Jason.encode!()

{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)

extracted = inspect(result.keywords)
IO.puts("Keywords: #{extracted}")
|
||||
```
|
||||
13
docs/snippets/elixir/config/language_detection_config.md
Normal file
13
docs/snippets/elixir/config/language_detection_config.md
Normal file
@@ -0,0 +1,13 @@
|
||||
```elixir title="Elixir"
|
||||
# Language-detection settings, passed to the extractor as a JSON-encoded string.
config_json =
  Jason.encode!(%{
    "language_detection" => %{
      "enabled" => true,
      "min_confidence" => 0.8,
      "detect_multiple" => true
    }
  })

{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)

# "detect_multiple" is enabled, so report the whole `detected_languages`
# value; nothing is printed when the field is not set.
# NOTE(review): no per-language confidence is printed — confirm whether the
# result exposes one before adding it back.
if result.detected_languages do
  IO.puts("Detected languages: #{inspect(result.detected_languages)}")
end
|
||||
```
|
||||
15
docs/snippets/elixir/config/ocr_dpi_config.md
Normal file
15
docs/snippets/elixir/config/ocr_dpi_config.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```elixir title="Elixir"
|
||||
# Image extraction and DPI settings, wrapped under the "images" key.
images = %{
  "extract_images" => true,
  "target_dpi" => 300,
  "max_image_dimension" => 4096,
  "auto_adjust_dpi" => true,
  "min_dpi" => 150,
  "max_dpi" => 600
}

config_json = %{"images" => images} |> Jason.encode!()

{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)

image_count = length(result.images)
IO.puts("Extracted images: #{image_count}")
|
||||
```
|
||||
14
docs/snippets/elixir/config/pdf_config.md
Normal file
14
docs/snippets/elixir/config/pdf_config.md
Normal file
@@ -0,0 +1,14 @@
|
||||
```elixir title="Elixir"
|
||||
# PDF-specific options, wrapped under the "pdf_options" key.
pdf_options = %{
  "extract_images" => true,
  "passwords" => ["password123"],
  "extract_metadata" => true,
  "hierarchy" => %{}
}

config_json = %{"pdf_options" => pdf_options} |> Jason.encode!()

{:ok, result} = Kreuzberg.extract_file_sync("encrypted.pdf", "application/pdf", config_json)

metadata = result.metadata
IO.puts("Title: #{inspect(metadata.title)}")
IO.puts("Authors: #{inspect(metadata.authors)}")
|
||||
```
|
||||
16
docs/snippets/elixir/config/pdf_hierarchy_config.md
Normal file
16
docs/snippets/elixir/config/pdf_hierarchy_config.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```elixir title="Elixir"
|
||||
# Hierarchy settings are nested inside the PDF options.
hierarchy = %{
  "enabled" => true,
  "detection_threshold" => 0.75,
  "ocr_coverage_threshold" => 0.8,
  "min_level" => 1,
  "max_level" => 5
}

config_json =
  %{"pdf_options" => %{"hierarchy" => hierarchy}}
  |> Jason.encode!()

{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)

level_count = length(result.hierarchy)
IO.puts("Hierarchy levels: #{level_count}")
|
||||
```
|
||||
14
docs/snippets/elixir/config/postprocessor_config.md
Normal file
14
docs/snippets/elixir/config/postprocessor_config.md
Normal file
@@ -0,0 +1,14 @@
|
||||
```elixir title="Elixir"
|
||||
# Names of the post-processors to enable.
enabled_processors = ["whitespace_normalizer", "unicode_normalizer"]

config_json =
  %{"postprocessor" => %{"enabled" => true, "enabled_processors" => enabled_processors}}
  |> Jason.encode!()

{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)

processed = result.content
IO.puts("Processed content: #{processed}")
|
||||
```
|
||||
10
docs/snippets/elixir/config/quality_processing_config.md
Normal file
10
docs/snippets/elixir/config/quality_processing_config.md
Normal file
@@ -0,0 +1,10 @@
|
||||
```elixir title="Elixir"
|
||||
# Encode the settings map as JSON and hand it to the extractor.
config_json =
  %{"enable_quality_processing" => true, "use_cache" => true}
  |> Jason.encode!()

{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)

IO.puts("Quality score: #{result.quality_score}")

elapsed = inspect(result.processing_time)
IO.puts("Processing time: #{elapsed}")
|
||||
```
|
||||
15
docs/snippets/elixir/config/tesseract_config.md
Normal file
15
docs/snippets/elixir/config/tesseract_config.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```elixir title="Elixir"
|
||||
# Tesseract-specific options are nested inside the OCR section.
tesseract_config = %{"psm" => 6, "oem" => 3}

ocr = %{
  "backend" => "tesseract",
  "language" => "eng+deu",
  "tesseract_config" => tesseract_config
}

config_json = %{"ocr" => ocr} |> Jason.encode!()

{:ok, result} = Kreuzberg.extract_file_sync("scanned.pdf", "application/pdf", config_json)

ocr_text = result.content
IO.puts("OCR text: #{ocr_text}")
|
||||
```
|
||||
12
docs/snippets/elixir/config/token_reduction_config.md
Normal file
12
docs/snippets/elixir/config/token_reduction_config.md
Normal file
@@ -0,0 +1,12 @@
|
||||
```elixir title="Elixir"
|
||||
# Token-reduction settings, wrapped under the "token_reduction" key.
token_reduction = %{"mode" => "moderate", "preserve_important_words" => true}

config_json = %{"token_reduction" => token_reduction} |> Jason.encode!()

{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)

token_count = result.token_count
IO.puts("Original tokens: #{token_count}")

reduced = result.content
IO.puts("Reduced content: #{reduced}")
|
||||
```
|
||||
Reference in New Issue
Block a user