This commit is contained in:
92
docs/snippets/elixir/utils/chunking.exs
Normal file
92
docs/snippets/elixir/utils/chunking.exs
Normal file
@@ -0,0 +1,92 @@
|
||||
# Utility functions for working with chunks
|
||||
defmodule ChunkingUtils do
  @moduledoc """
  Helpers for inspecting and reshaping lists of chunks.

  Chunks are handled as maps whose text is stored under the `"content"` key;
  a chunk without binary text under that key is treated as empty.
  """

  @doc """
  Calculate statistics for a list of chunks.

  Returns a map with `:min`, `:max`, `:avg` and `:count`. Sizes are measured
  with `String.length/1` and `:avg` uses integer division, so it is rounded
  down. An empty list yields zero for every key, which keeps the map shape
  identical for all inputs.
  """
  def chunk_size_stats(chunks) when is_list(chunks) do
    case Enum.map(chunks, &chunk_text_length/1) do
      [] ->
        %{min: 0, max: 0, avg: 0, count: 0}

      sizes ->
        {smallest, largest} = Enum.min_max(sizes)
        count = length(sizes)

        %{min: smallest, max: largest, avg: div(Enum.sum(sizes), count), count: count}
    end
  end

  @doc """
  Filter chunks by a minimum size threshold.

  Keeps the chunks whose text length is greater than or equal to `min_size`,
  preserving their order.
  """
  def filter_by_min_size(chunks, min_size) do
    Enum.filter(chunks, &(chunk_text_length(&1) >= min_size))
  end

  @doc """
  Merge adjacent chunks if they are below a size threshold.

  Walks the list once. Whenever the most recently emitted chunk is shorter
  than `threshold`, the current chunk is folded into it: the two texts are
  joined with a single space and every other key is taken from the current
  (later) chunk.
  """
  def merge_small_chunks(chunks, threshold) do
    chunks
    # The accumulator is built newest-first, so its head is the previous chunk.
    |> Enum.reduce([], fn
      chunk, [] ->
        [chunk]

      chunk, [last | rest] = acc ->
        if chunk_text_length(last) < threshold do
          merged_text = "#{last["content"]} #{chunk["content"]}"
          [Map.put(chunk, "content", merged_text) | rest]
        else
          [chunk | acc]
        end
    end)
    |> Enum.reverse()
  end

  @doc """
  Group chunks by document section (if metadata contains section info).

  The section is read from `chunk["metadata"]["section"]`. Chunks with no
  metadata map or no `"section"` entry are grouped under `"general"`.
  """
  def group_by_section(chunks) do
    Enum.group_by(chunks, fn
      %{"metadata" => %{"section" => section}} -> section
      # Missing or nil metadata must not crash the grouping.
      _chunk -> "general"
    end)
  end

  # Length of the chunk's text; anything other than a binary counts as 0.
  defp chunk_text_length(chunk) do
    case Map.get(chunk, "content") do
      text when is_binary(text) -> String.length(text)
      _ -> 0
    end
  end
end
|
||||
|
||||
# Example usage
|
||||
extraction_config = %Kreuzberg.ExtractionConfig{
  chunking: %{"enabled" => true, "max_characters" => 1000}
}

{:ok, extraction} = Kreuzberg.extract_file("doc.pdf", nil, extraction_config)

# Fall back to an empty list when the result carries no chunks.
document_chunks = extraction.chunks || []

IO.puts("=== Chunk Statistics ===")
document_chunks |> ChunkingUtils.chunk_size_stats() |> IO.inspect()

IO.puts("\n=== Chunks by Section ===")
document_chunks |> ChunkingUtils.group_by_section() |> IO.inspect()

IO.puts("\n=== Filtering chunks >= 500 chars ===")
document_chunks |> ChunkingUtils.filter_by_min_size(500) |> IO.inspect()

IO.puts("\n=== Merging small chunks < 200 chars ===")
document_chunks |> ChunkingUtils.merge_small_chunks(200) |> IO.inspect()
|
||||
31
docs/snippets/elixir/utils/chunking_rag.md
Normal file
31
docs/snippets/elixir/utils/chunking_rag.md
Normal file
@@ -0,0 +1,31 @@
|
||||
<!-- snippet:syntax-only -->
|
||||
|
||||
```elixir
|
||||
# Chunking settings with per-chunk embeddings, encoded as a JSON config.
embedding_settings = %{
  "model" => %{"preset" => %{"name" => "balanced"}},
  "normalize" => true,
  "batch_size" => 16
}

config_json =
  Jason.encode!(%{
    "chunking" => %{
      "max_characters" => 500,
      "overlap" => 50,
      "embedding" => embedding_settings
    }
  })

{:ok, json} = Kreuzberg.extract_file_async("research_paper.pdf", nil, config_json)
result = Jason.decode!(json)

# Keep only the chunks that actually carry an embedding list.
chunks_with_embeddings =
  (result["chunks"] || [])
  |> Enum.filter(fn chunk -> is_list(chunk["embedding"]) end)
  |> Enum.map(fn chunk ->
    %{
      content: String.slice(chunk["content"] || "", 0, 100),
      embedding_dims: length(chunk["embedding"])
    }
  end)

IO.puts("Chunks with embeddings: #{length(chunks_with_embeddings)}")
|
||||
```
|
||||
21
docs/snippets/elixir/utils/embedding_with_chunking.md
Normal file
21
docs/snippets/elixir/utils/embedding_with_chunking.md
Normal file
@@ -0,0 +1,21 @@
|
||||
<!-- snippet:syntax-only -->
|
||||
|
||||
```elixir
|
||||
# Embedding options applied to every chunk produced by the chunker.
embedding_settings = %{
  "model" => %{"preset" => %{"name" => "balanced"}},
  "normalize" => true,
  "batch_size" => 32,
  "show_download_progress" => false
}

config_json =
  Jason.encode!(%{
    "chunking" => %{
      "max_characters" => 1024,
      "overlap" => 100,
      "embedding" => embedding_settings
    }
  })

{:ok, json} = Kreuzberg.extract_file_async("document.pdf", nil, config_json)
result = Jason.decode!(json)

chunk_count = length(result["chunks"] || [])
IO.puts("chunks: #{chunk_count}")
|
||||
```
|
||||
155
docs/snippets/elixir/utils/error_handling.exs
Normal file
155
docs/snippets/elixir/utils/error_handling.exs
Normal file
@@ -0,0 +1,155 @@
|
||||
# Error handling patterns for Kreuzberg extraction
|
||||
defmodule ErrorHandlingUtils do
  @moduledoc """
  Error handling patterns around `Kreuzberg.extract_file/3`: retries, batch
  extraction, result validation, error formatting and metric logging.
  """

  # Delay before the second attempt; it doubles for each later attempt.
  @base_backoff_ms 100

  @doc """
  Safely extract a file with retry logic and error recovery.

  Calls `Kreuzberg.extract_file(file_path, nil, config)` up to `max_retries`
  times. Each failure is printed, and the process sleeps with exponential
  backoff (100ms, 200ms, 400ms, ...) before the next attempt. No sleep happens
  after the final attempt.

  Returns `{:ok, result}` on the first success, or
  `{:error, "Max retries (N) exceeded"}` once every attempt has failed. With
  `max_retries` below 1 no extraction is attempted at all.
  """
  def extract_with_retry(file_path, config, max_retries \\ 3) do
    do_extract_with_retry(file_path, config, max_retries, 1)
  end

  defp do_extract_with_retry(_file_path, _config, max_retries, attempt)
       when attempt > max_retries do
    {:error, "Max retries (#{max_retries}) exceeded"}
  end

  defp do_extract_with_retry(file_path, config, max_retries, attempt) do
    case Kreuzberg.extract_file(file_path, nil, config) do
      {:ok, result} ->
        {:ok, result}

      {:error, reason} ->
        IO.puts("Attempt #{attempt} failed: #{inspect(reason)}")

        # Only wait when another attempt will follow; sleeping before giving
        # up would just delay the error.
        if attempt < max_retries do
          Process.sleep(@base_backoff_ms * Integer.pow(2, attempt - 1))
        end

        do_extract_with_retry(file_path, config, max_retries, attempt + 1)
    end
  end

  @doc """
  Extract multiple files and collect results and errors separately.

  Returns `%{successes: [{file, data}], failures: [{file, reason}]}`, with
  both lists in the same order as `files`.
  """
  def extract_multiple(files, config) do
    {successes, failures} =
      Enum.reduce(files, {[], []}, fn file, {oks, errors} ->
        case Kreuzberg.extract_file(file, nil, config) do
          {:ok, data} -> {[{file, data} | oks], errors}
          {:error, reason} -> {oks, [{file, reason} | errors]}
        end
      end)

    # Both lists were built by prepending, so restore input order.
    %{successes: Enum.reverse(successes), failures: Enum.reverse(failures)}
  end

  @doc """
  Validate extraction result and return detailed error information.

  Expects the tagged tuple returned by an extraction call, not the unwrapped
  data:

    * `{:ok, data}` - returned unchanged when every key in `required_fields`
      is present in `data` (checked with `Map.has_key?/2`), otherwise
      `{:error, "Missing required fields: [...]"}`.
    * `{:error, reason}` - returned with `reason` passed through
      `format_error/1`.
  """
  def validate_result(result, required_fields \\ ["text", "metadata"]) do
    case result do
      {:ok, data} ->
        case Enum.reject(required_fields, &Map.has_key?(data, &1)) do
          [] -> {:ok, data}
          missing -> {:error, "Missing required fields: #{inspect(missing)}"}
        end

      {:error, reason} ->
        {:error, format_error(reason)}
    end
  end

  @doc """
  Format errors into human-readable messages.

  Binaries are returned as-is, the known error atoms map to fixed sentences,
  any other atom becomes `"Unknown error: ..."`, and every other term is
  rendered with `inspect/1`.
  """
  def format_error(reason) when is_binary(reason), do: reason
  def format_error(:file_not_found), do: "The specified file could not be found"
  def format_error(:invalid_format), do: "The file format is not supported"
  def format_error(:extraction_failed), do: "Failed to extract content from the file"
  def format_error(:timeout), do: "Extraction operation timed out"
  def format_error(:permission_denied), do: "Permission denied when accessing the file"
  def format_error(reason) when is_atom(reason), do: "Unknown error: #{inspect(reason)}"
  def format_error(reason), do: inspect(reason)

  @doc """
  Log extraction metrics for debugging and monitoring.

  Prints a timestamped header with the file, status and duration. It then
  prints the chunk count and text length for `{:ok, data}` (read from
  `data.chunks` and `data.content`), or the formatted error for
  `{:error, reason}`.
  """
  def log_metrics(file_path, result, duration_ms) do
    status =
      case result do
        {:ok, _} -> "success"
        {:error, _} -> "failure"
      end

    IO.puts("""
    [#{DateTime.utc_now()}] Extraction Metrics
    - File: #{file_path}
    - Status: #{status}
    - Duration: #{duration_ms}ms
    """)

    case result do
      {:ok, data} ->
        IO.puts("- Chunks: #{length(data.chunks || [])}")
        IO.puts("- Text length: #{String.length(data.content || "")}")

      {:error, reason} ->
        IO.puts("- Error: #{format_error(reason)}")
    end
  end
end
|
||||
|
||||
# Example usage with error handling
|
||||
config = %Kreuzberg.ExtractionConfig{
  chunking: %{"enabled" => true, "max_characters" => 1000}
}

IO.puts("=== Extract with Retry ===")

case ErrorHandlingUtils.extract_with_retry("document.pdf", config, 3) do
  {:ok, result} ->
    IO.puts("Extraction succeeded")
    IO.inspect(result)

  {:error, reason} ->
    IO.puts("Extraction failed: #{reason}")
end

IO.puts("\n=== Extract Multiple Files ===")

files = ["doc1.pdf", "doc2.pdf", "doc3.pdf"]

# extract_multiple/2 always returns a map, so there is nothing to branch on.
results = ErrorHandlingUtils.extract_multiple(files, config)
IO.puts("Successes: #{length(results.successes)}")
IO.puts("Failures: #{length(results.failures)}")
IO.inspect(results)

IO.puts("\n=== Validate Result ===")

# validate_result/2 matches on the tagged {:ok, _} / {:error, _} tuple, so the
# extraction outcome is passed through as returned instead of being unwrapped
# first (an unwrapped result matches neither clause and raises).
extraction = Kreuzberg.extract_file("test.pdf", nil, config)

case ErrorHandlingUtils.validate_result(extraction, ["text", "chunks"]) do
  {:ok, _data} ->
    IO.puts("Validation passed")

  {:error, reason} ->
    IO.puts("Validation failed: #{reason}")
end
|
||||
21
docs/snippets/elixir/utils/keyword_extraction_example.md
Normal file
21
docs/snippets/elixir/utils/keyword_extraction_example.md
Normal file
@@ -0,0 +1,21 @@
|
||||
<!-- snippet:syntax-only -->
|
||||
|
||||
```elixir
|
||||
# YAKE keyword extraction: at most 10 keywords with a minimum score of 0.3.
keyword_settings = %{
  "algorithm" => "yake",
  "max_keywords" => 10,
  "min_score" => 0.3
}

config_json = Jason.encode!(%{"keywords" => keyword_settings})

{:ok, json} = Kreuzberg.extract_file_async("research_paper.pdf", nil, config_json)
result = Jason.decode!(json)

# Print one "text: score" line per keyword, score shown with three decimals.
Enum.each(result["extracted_keywords"] || [], fn keyword ->
  text = keyword["text"] || ""
  score = keyword["score"] || 0.0
  formatted_score = :io_lib.format("~.3f", [score])

  IO.puts("#{text}: #{formatted_score}")
end)
|
||||
```
|
||||
187
docs/snippets/elixir/utils/language_detection.exs
Normal file
187
docs/snippets/elixir/utils/language_detection.exs
Normal file
@@ -0,0 +1,187 @@
|
||||
# Language detection and processing utilities
|
||||
defmodule LanguageDetectionUtils do
  @moduledoc """
  Keyword-based language detection for extracted text and chunks.

  Detection counts how many words of a text appear in a short list of common
  words per language. It is a heuristic for examples and rough routing, not a
  statistical language model.
  """

  # Result returned whenever no language can be determined.
  @unknown {"unknown", 0.0}

  @language_indicators %{
    "en" => %{
      patterns: ~w[the and to of a in is that],
      common_words: ~w[english language detection],
      min_match: 2
    },
    "es" => %{
      patterns: ~w[de la el y en que],
      common_words: ~w[español detección idioma],
      min_match: 2
    },
    "fr" => %{
      patterns: ~w[le la de et une est],
      common_words: ~w[français détection langue],
      min_match: 2
    },
    "de" => %{
      patterns: ~w[der die und das in ein],
      common_words: ~w[deutsch erkennung sprache],
      min_match: 2
    }
  }

  @doc """
  Detect the language of extracted text using pattern matching.

  Returns a `{language_code, confidence}` tuple where `confidence` is a
  percentage between `0.0` and `100.0`. A language's confidence is its number
  of matching words divided by its `min_match`, capped at 100%. When several
  languages reach the same confidence, the one with more matching words wins.

  Returns `{"unknown", 0.0}` for empty, non-binary or invalid UTF-8 input, and
  when no language scores above 30%.
  """
  def detect_language(text) when is_binary(text) and byte_size(text) > 0 do
    if String.valid?(text) do
      text
      |> String.downcase()
      # The `u` modifier makes `\W` Unicode-aware, so accented words such as
      # "está" stay whole instead of being cut at their non-ASCII bytes.
      |> String.split(~r/\W+/u, trim: true)
      |> best_language_match()
    else
      @unknown
    end
  end

  def detect_language(_), do: @unknown

  @doc """
  Detect language for each chunk and add it to metadata.

  The chunk text is read from `"content"`, falling back to `"text"`. The
  result is stored in the chunk's `"metadata"` map under
  `"detected_language"` and `"language_confidence"` (a percentage).
  """
  def detect_chunk_languages(chunks) when is_list(chunks) do
    Enum.map(chunks, fn chunk ->
      text = Map.get(chunk, "content") || Map.get(chunk, "text", "")
      {lang, confidence} = detect_language(text)

      updated_metadata =
        (Map.get(chunk, "metadata") || %{})
        |> Map.put("detected_language", lang)
        |> Map.put("language_confidence", confidence)

      Map.put(chunk, "metadata", updated_metadata)
    end)
  end

  @doc """
  Group chunks by detected language.

  Returns a map from language code to the annotated chunks of that language.
  """
  def group_chunks_by_language(chunks) do
    chunks
    |> detect_chunk_languages()
    |> Enum.group_by(fn chunk ->
      chunk
      |> Map.get("metadata", %{})
      |> Map.get("detected_language", "unknown")
    end)
  end

  @doc """
  Filter chunks by language with optional confidence threshold.

  `min_confidence` is a fraction between 0 and 1; the stored percentage is
  divided by 100 before the comparison. The returned chunks carry the
  detection metadata.
  """
  def filter_by_language(chunks, target_language, min_confidence \\ 0.7) do
    chunks
    |> detect_chunk_languages()
    |> Enum.filter(fn chunk ->
      metadata = Map.get(chunk, "metadata", %{})
      detected = Map.get(metadata, "detected_language", "unknown")
      confidence = Map.get(metadata, "language_confidence", 0.0) / 100

      detected == target_language and confidence >= min_confidence
    end)
  end

  @doc """
  Summarize language distribution across chunks.

  Returns one map per detected language with `:language`, `:chunk_count` and
  `:avg_confidence` (a percentage rounded to two decimals), sorted by chunk
  count in descending order.
  """
  def language_summary(chunks) do
    chunks
    |> group_chunks_by_language()
    |> Enum.map(fn {lang, group} ->
      confidences =
        Enum.map(group, fn chunk ->
          chunk
          |> Map.get("metadata", %{})
          |> Map.get("language_confidence", 0.0)
        end)

      # `Enum.group_by/2` never yields an empty group, so the division is safe.
      %{
        language: lang,
        chunk_count: length(group),
        avg_confidence: Float.round(Enum.sum(confidences) / length(confidences), 2)
      }
    end)
    |> Enum.sort_by(& &1.chunk_count, :desc)
  end

  @doc """
  Determine if text is mostly in a single language.

  Returns `true` when the most common language accounts for at least
  `threshold` (a fraction) of all chunks, and `false` for an empty list.
  """
  def is_single_language?(chunks, threshold \\ 0.8) do
    case language_summary(chunks) do
      [] ->
        false

      [top | _] = summary ->
        total = summary |> Enum.map(& &1.chunk_count) |> Enum.sum()
        top.chunk_count / total >= threshold
    end
  end

  # Scores every language against `words` and returns the best candidate.
  defp best_language_match(words) do
    {lang, _matched, confidence} =
      @language_indicators
      |> Enum.map(fn {lang, indicators} ->
        matched = count_pattern_matches(words, indicators.patterns)
        {lang, matched, min(matched / indicators.min_match, 1.0)}
      end)
      # Confidence is capped at 1.0, so ties are common; the raw match count
      # breaks them in favour of the language with more evidence.
      |> Enum.max_by(fn {_lang, matched, confidence} -> {confidence, matched} end)

    if confidence > 0.3, do: {lang, confidence * 100}, else: @unknown
  end

  # Number of words (duplicates included) that appear in `patterns`.
  defp count_pattern_matches(words, patterns) do
    Enum.count(words, &Enum.member?(patterns, &1))
  end
end
|
||||
|
||||
# Example usage
|
||||
extraction_config = %Kreuzberg.ExtractionConfig{
  chunking: %{"enabled" => true, "max_characters" => 1000}
}

{:ok, extraction} = Kreuzberg.extract_file("multilingual_doc.pdf", nil, extraction_config)

# Fall back to an empty list when the result carries no chunks.
document_chunks = extraction.chunks || []

IO.puts("=== Language Detection ===")

# detect_language/1 always returns a two-element tuple, so destructure directly.
{lang, confidence} = LanguageDetectionUtils.detect_language(extraction.content || "")
IO.puts("Detected Language: #{lang}")
IO.puts("Confidence: #{Float.round(confidence, 2)}%")

IO.puts("\n=== Language Summary ===")
document_chunks |> LanguageDetectionUtils.language_summary() |> IO.inspect()

IO.puts("\n=== Group by Language ===")

for {language, group} <- LanguageDetectionUtils.group_chunks_by_language(document_chunks) do
  IO.puts("Language: #{language} - Chunks: #{length(group)}")
end

IO.puts("\n=== Filter English Chunks (min 80% confidence) ===")
document_chunks |> LanguageDetectionUtils.filter_by_language("en", 0.8) |> IO.inspect()

IO.puts("\n=== Check if Single Language ===")
document_chunks |> LanguageDetectionUtils.is_single_language?(0.8) |> IO.puts()
|
||||
17
docs/snippets/elixir/utils/quality_processing_example.md
Normal file
17
docs/snippets/elixir/utils/quality_processing_example.md
Normal file
@@ -0,0 +1,17 @@
|
||||
<!-- snippet:syntax-only -->
|
||||
|
||||
```elixir
|
||||
config_json = Jason.encode!(%{"enable_quality_processing" => true})

{:ok, json} = Kreuzberg.extract_file_async("scanned_document.pdf", nil, config_json)

# A missing quality score is treated as 0.0.
quality_score = Jason.decode!(json)["quality_score"] || 0.0
formatted_score = :io_lib.format("~.2f", [quality_score])

cond do
  quality_score < 0.5 ->
    IO.puts("Warning: Low quality extraction (#{formatted_score})")
    IO.puts("Consider re-scanning with higher DPI or adjusting OCR settings")

  true ->
    IO.puts("Quality score: #{formatted_score}")
end
|
||||
```
|
||||
11
docs/snippets/elixir/utils/standalone_embed.md
Normal file
11
docs/snippets/elixir/utils/standalone_embed.md
Normal file
@@ -0,0 +1,11 @@
|
||||
```elixir
|
||||
# Default configuration: embed a batch of texts.
{:ok, embeddings} = Kreuzberg.embed(["Hello world", "How are you?"])

# Explicit configuration: pick the "fast" preset.
config = %Kreuzberg.EmbeddingConfig{model: {:preset, "fast"}}
{:ok, embeddings} = ["Hello world"] |> Kreuzberg.embed(config)

# Bang variant: returns the embeddings directly and raises on error.
embeddings = Kreuzberg.embed!(["Hello world"])
|
||||
```
|
||||
15
docs/snippets/elixir/utils/token_reduction.md
Normal file
15
docs/snippets/elixir/utils/token_reduction.md
Normal file
@@ -0,0 +1,15 @@
|
||||
<!-- snippet:syntax-only -->
|
||||
|
||||
```elixir
|
||||
# Moderate token reduction that keeps important words.
reduction_settings = %{
  "mode" => "moderate",
  "preserve_important_words" => true
}

config_json = Jason.encode!(%{"token_reduction" => reduction_settings})

{:ok, json} = Kreuzberg.extract_file_async("document.pdf", nil, config_json)
result = Jason.decode!(json)

content_length = String.length(result["content"] || "")
IO.puts("Content length: #{content_length}")
|
||||
```
|
||||
22
docs/snippets/elixir/utils/token_reduction_example.md
Normal file
22
docs/snippets/elixir/utils/token_reduction_example.md
Normal file
@@ -0,0 +1,22 @@
|
||||
<!-- snippet:syntax-only -->
|
||||
|
||||
```elixir
|
||||
# Moderate token reduction that keeps important words.
reduction_settings = %{
  "mode" => "moderate",
  "preserve_important_words" => true
}

config_json = Jason.encode!(%{"token_reduction" => reduction_settings})

{:ok, json} = Kreuzberg.extract_file_async("verbose_document.pdf", nil, config_json)
result = Jason.decode!(json)

# Every metric falls back to zero when the metadata does not report it.
metadata = result["metadata"] || %{}
original = metadata["original_token_count"] || 0
reduced = metadata["token_count"] || 0
ratio = metadata["token_reduction_ratio"] || 0.0

percent = :io_lib.format("~.1f", [ratio * 100])

IO.puts("Reduced from #{original} to #{reduced} tokens")
IO.puts("Reduction: #{percent}%")
|
||||
```
|
||||
26
docs/snippets/elixir/utils/vector_database_integration.md
Normal file
26
docs/snippets/elixir/utils/vector_database_integration.md
Normal file
@@ -0,0 +1,26 @@
|
||||
<!-- snippet:syntax-only -->
|
||||
|
||||
```elixir
|
||||
# Chunking with normalized embeddings, ready to be stored in a vector database.
embedding_settings = %{
  "model" => %{"preset" => %{"name" => "balanced"}},
  "normalize" => true
}

config_json =
  Jason.encode!(%{
    "chunking" => %{
      "max_characters" => 512,
      "overlap" => 50,
      "embedding" => embedding_settings
    }
  })

{:ok, json} = Kreuzberg.extract_file_async("document.pdf", nil, config_json)
result = Jason.decode!(json)

# Give every chunk a positional id and print a 50-character preview.
for {chunk, index} <- Enum.with_index(result["chunks"] || []) do
  chunk_id = "doc_chunk_#{index}"
  preview = String.slice(chunk["content"] || "", 0, 50)

  IO.puts("Chunk #{chunk_id}: #{preview}")
end
|
||||
```
|
||||
Reference in New Issue
Block a user