Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,27 @@
```elixir title="Elixir"
# Language detection settings, serialized to JSON for the extraction call.
config =
  Jason.encode!(%{
    "language_detection" => %{
      "enabled" => true,
      "min_confidence" => 0.9,
      "detect_multiple" => true
    }
  })

case Kreuzberg.extract_file_sync("document.pdf", nil, config) do
  {:error, reason} ->
    IO.puts("Error: #{reason}")

  {:ok, result} ->
    # The result is decoded from JSON before the language list is read.
    case Jason.decode!(result) do
      %{"detected_languages" => languages} when is_list(languages) ->
        IO.inspect(languages, label: "Detected languages")

      _ ->
        IO.puts("No language detection results")
    end
end
```

View File

@@ -0,0 +1,27 @@
```elixir title="Elixir"
# Language detection settings for a multilingual document, serialized to JSON.
detection_settings = %{
  "enabled" => true,
  "min_confidence" => 0.8,
  "detect_multiple" => true
}

config = Jason.encode!(%{"language_detection" => detection_settings})

case Kreuzberg.extract_file_sync("multilingual_document.pdf", nil, config) do
  {:error, reason} ->
    IO.puts("Error: #{reason}")

  {:ok, result} ->
    # Only a list under "detected_languages" counts as a detection result.
    case Jason.decode!(result) do
      %{"detected_languages" => languages} when is_list(languages) ->
        IO.inspect(languages, label: "Detected languages")

      _ ->
        IO.puts("No language detection results")
    end
end
```

View File

@@ -0,0 +1,215 @@
# General metadata extraction and manipulation utilities
defmodule MetadataUtils do
  @moduledoc """
  General metadata extraction and manipulation utilities.

  Functions that take an extraction result accept either a string-keyed map
  (`"metadata"`, `"chunks"`) or a map/struct that exposes the same data under
  the `:metadata` and `:chunks` atom keys. Chunks and metadata maps themselves
  are read through string keys (`"text"`, `"metadata"`, `"title"`, ...).
  """

  @doc """
  Extract standard metadata fields from extraction result.

  Returns an atom-keyed map. `:title` and `:author` default to `"Unknown"`,
  `:page_count` to `0` and `:format` to `"unknown"` when the key is absent;
  `:creation_date`, `:modification_date` and `:language` have no default.
  Entries whose value is `nil` are removed from the returned map.
  """
  @spec extract_standard_metadata(map()) :: map()
  def extract_standard_metadata(result) do
    metadata = result_field(result, "metadata", :metadata, %{})

    fields = %{
      title: Map.get(metadata, "title", "Unknown"),
      author: Map.get(metadata, "author", "Unknown"),
      creation_date: Map.get(metadata, "creation_date"),
      modification_date: Map.get(metadata, "modification_date"),
      language: Map.get(metadata, "language"),
      page_count: Map.get(metadata, "page_count", 0),
      format: Map.get(metadata, "format", "unknown")
    }

    fields
    |> Enum.reject(fn {_field, value} -> is_nil(value) end)
    |> Map.new()
  end

  @doc """
  Merge metadata from multiple extraction results.

  Returns a map from every metadata key to the list of values found for that
  key, in the order of `results`.
  """
  @spec merge_metadata([map()]) :: %{optional(term()) => [term()]}
  def merge_metadata(results) when is_list(results) do
    results
    |> Enum.flat_map(fn result ->
      result |> result_field("metadata", :metadata, %{}) |> Enum.to_list()
    end)
    # group_by keeps the encounter order inside each group, so values stay in
    # result order without appending to the end of a list on every step.
    |> Enum.group_by(fn {key, _value} -> key end, fn {_key, value} -> value end)
  end

  @doc """
  Enrich chunks with additional metadata.

  Every non-`nil` entry of `source_metadata` is written into each chunk's
  `"metadata"` map, overriding an existing entry with the same key.
  """
  @spec enrich_chunks_metadata([map()], Enumerable.t()) :: [map()]
  def enrich_chunks_metadata(chunks, source_metadata) do
    # The additions are the same for every chunk, so build them once.
    additions =
      source_metadata
      |> Enum.reject(fn {_key, value} -> is_nil(value) end)
      |> Map.new()

    Enum.map(chunks, fn chunk ->
      Map.update(chunk, "metadata", additions, &Map.merge(&1, additions))
    end)
  end

  @doc """
  Extract metadata specific to a chunk's context (position, size, etc).

  Adds `"chunk_index"`, `"chunk_number"`, `"total_chunks"`,
  `"position_percentage"`, `"text_length"`, `"word_count"`, `"has_headings"`
  and `"has_lists"` to each chunk's `"metadata"` map.
  """
  @spec extract_chunk_context([map()]) :: [map()]
  def extract_chunk_context(chunks) do
    total_chunks = length(chunks)

    chunks
    |> Enum.with_index()
    |> Enum.map(fn {chunk, index} ->
      text = Map.get(chunk, "text", "")

      context = %{
        "chunk_index" => index,
        "chunk_number" => index + 1,
        "total_chunks" => total_chunks,
        "position_percentage" => Float.round((index + 1) / total_chunks * 100, 2),
        "text_length" => String.length(text),
        "word_count" => text |> String.split() |> length(),
        "has_headings" => String.contains?(text, ~w[## # ===]),
        "has_lists" => String.contains?(text, ["- ", "* "])
      }

      Map.update(chunk, "metadata", context, &Map.merge(&1, context))
    end)
  end

  @doc """
  Filter metadata by a set of allowed keys.

  Returns `data` with its `"metadata"` map reduced to the entries whose key is
  in `allowed_keys` (an empty map when `data` has no `"metadata"`).
  """
  @spec filter_metadata_keys(map(), [term()]) :: map()
  def filter_metadata_keys(data, allowed_keys) when is_list(allowed_keys) do
    Map.update(data, "metadata", %{}, &Map.take(&1, allowed_keys))
  end

  @doc """
  Create a metadata summary across all chunks.

  Returns a map with `:total_chunks`, `:total_text_length`, `:avg_chunk_size`
  (integer division, `0` for no chunks), `:metadata_fields` (unique metadata
  keys) and `:enrichment_level` (`"high"`, `"medium"` or `"low"`).
  """
  @spec metadata_summary([map()]) :: map()
  def metadata_summary(chunks) do
    %{
      total_chunks: length(chunks),
      total_text_length: total_text_length(chunks),
      avg_chunk_size: calculate_avg_size(chunks),
      metadata_fields: extract_all_metadata_fields(chunks),
      enrichment_level: assess_enrichment(chunks)
    }
  end

  @doc """
  Generate a human-readable metadata report.

  The report lists the standard metadata fields, chunk statistics and the
  metadata summary of the result's chunks.
  """
  @spec generate_report(map()) :: String.t()
  def generate_report(result) do
    standard = extract_standard_metadata(result)
    chunks = result_field(result, "chunks", :chunks, [])

    """
    === Document Metadata Report ===
    Standard Fields:
    #{format_dict(standard)}
    Chunk Statistics:
    - Total Chunks: #{length(chunks)}
    - Avg Chunk Size: #{calculate_avg_size(chunks)} characters
    Metadata Summary:
    #{format_dict(metadata_summary(chunks))}
    """
  end

  # Private helpers

  # Reads a top-level section of an extraction result. The string key is tried
  # first (decoded JSON maps), then the atom key (structs or atom-keyed maps);
  # a missing or nil section yields `default`.
  defp result_field(result, string_key, atom_key, default) do
    Map.get(result, string_key) || Map.get(result, atom_key) || default
  end

  # Sum of the grapheme lengths of every chunk's "text".
  defp total_text_length(chunks) do
    Enum.reduce(chunks, 0, fn chunk, acc ->
      acc + String.length(Map.get(chunk, "text", ""))
    end)
  end

  # Average text length per chunk, truncated to an integer.
  defp calculate_avg_size([]), do: 0
  defp calculate_avg_size(chunks), do: div(total_text_length(chunks), length(chunks))

  # Unique metadata keys used by any chunk, in first-seen order.
  defp extract_all_metadata_fields(chunks) do
    chunks
    |> Enum.flat_map(fn chunk -> chunk |> Map.get("metadata", %{}) |> Map.keys() end)
    |> Enum.uniq()
  end

  # Classifies chunks by their average number of metadata fields
  # (integer division): >= 5 is "high", >= 2 is "medium", otherwise "low".
  defp assess_enrichment(chunks) do
    field_counts = Enum.map(chunks, fn chunk -> chunk |> Map.get("metadata", %{}) |> map_size() end)

    avg_fields =
      case field_counts do
        [] -> 0
        counts -> div(Enum.sum(counts), length(counts))
      end

    cond do
      avg_fields >= 5 -> "high"
      avg_fields >= 2 -> "medium"
      true -> "low"
    end
  end

  # Renders one "key: inspected value" line per map entry.
  defp format_dict(dict) when is_map(dict) do
    Enum.map_join(dict, "\n", fn {key, value} -> " #{key}: #{inspect(value)}" end)
  end
end
# Example usage
config = %Kreuzberg.ExtractionConfig{
  chunking: %{"enabled" => true, "max_characters" => 1000}
}

{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)
chunks = result.chunks || []

IO.puts("=== Standard Metadata ===")
IO.inspect(MetadataUtils.extract_standard_metadata(result))

IO.puts("\n=== Chunk Context ===")
enhanced_chunks = MetadataUtils.extract_chunk_context(chunks)
# List.first/1 returns nil for an empty list; hd/1 would raise when the
# extraction produced no chunks.
IO.inspect(List.first(enhanced_chunks))

IO.puts("\n=== Metadata Summary ===")
IO.inspect(MetadataUtils.metadata_summary(chunks))

IO.puts("\n=== Report ===")
IO.puts(MetadataUtils.generate_report(result))

View File

@@ -0,0 +1,154 @@
```elixir title="Elixir"
config = Jason.encode!(%{})

# Prints "<label>: <value>" when `meta` is a map that has `key`; otherwise
# prints nothing and returns nil.
print_field = fn meta, key, label ->
  case meta do
    %{^key => value} -> IO.puts("#{label}: #{value}")
    _ -> nil
  end
end

# Prints one formatted line per entry when `meta[key]` is a non-empty list.
print_entries = fn meta, key, format ->
  case meta do
    %{^key => [_ | _] = entries} -> Enum.each(entries, &IO.puts(format.(&1)))
    _ -> nil
  end
end

# PDF metadata
case Kreuzberg.extract_file_sync("document.pdf", nil, config) do
  {:error, reason} ->
    IO.puts("Error: #{reason}")

  {:ok, result} ->
    case Jason.decode!(result) do
      %{"metadata" => %{"pdf" => pdf_meta}} ->
        print_field.(pdf_meta, "page_count", "Pages")
        print_field.(pdf_meta, "author", "Author")
        print_field.(pdf_meta, "title", "Title")

      other ->
        other
    end
end

# HTML metadata
case Kreuzberg.extract_file_sync("page.html", nil, config) do
  {:error, reason} ->
    IO.puts("Error: #{reason}")

  {:ok, result} ->
    case Jason.decode!(result) do
      %{"metadata" => %{"html" => html_meta}} ->
        print_field.(html_meta, "title", "Title")
        print_field.(html_meta, "description", "Description")

        # Access keywords array
        case html_meta do
          %{"keywords" => keywords} -> IO.inspect(keywords, label: "Keywords")
          _ -> nil
        end

        # Access canonical URL
        print_field.(html_meta, "canonical_url", "Canonical URL")

        # Access Open Graph fields as a map
        case html_meta do
          %{"open_graph" => %{} = og} ->
            print_field.(og, "image", "Open Graph Image")
            print_field.(og, "title", "Open Graph Title")

          _ ->
            nil
        end

        # Access Twitter Card fields as a map
        case html_meta do
          %{"twitter_card" => %{} = tc} -> print_field.(tc, "card", "Twitter Card Type")
          _ -> nil
        end

        # Access language
        print_field.(html_meta, "language", "Language")

        # Access headers
        print_entries.(html_meta, "headers", fn header ->
          "Header (level #{header["level"]}): #{header["text"]}"
        end)

        # Access links
        print_entries.(html_meta, "links", fn link ->
          "Link: #{link["href"]} (#{link["text"]})"
        end)

        # Access images
        print_entries.(html_meta, "images", fn image -> "Image: #{image["src"]}" end)

        # Access structured data
        case html_meta do
          %{"structured_data" => sd} when is_list(sd) ->
            IO.puts("Structured data items: #{length(sd)}")

          _ ->
            nil
        end

      other ->
        other
    end
end
```

View File

@@ -0,0 +1,264 @@
# Page boundary tracking and analysis utilities
defmodule PageBoundaryUtils do
  @moduledoc """
  Page boundary tracking and analysis utilities.

  Chunks are string-keyed maps with an optional `"text"` string and an
  optional `"metadata"` map that may hold a `"page_number"`.
  """

  @doc """
  Track page boundaries throughout the document extraction.
  Assumes chunks contain page_number metadata.

  Returns a map from page number to the (zero-based) indexes of the chunks on
  that page, in document order. Chunks without a page number are skipped.
  """
  @spec track_page_boundaries([map()]) :: %{optional(term()) => [non_neg_integer()]}
  def track_page_boundaries(chunks) when is_list(chunks) do
    chunks
    |> Enum.with_index()
    |> Enum.map(fn {chunk, index} -> {extract_page_number(chunk), index} end)
    |> Enum.filter(fn {page, _index} -> page end)
    # group_by keeps encounter order, so no end-of-list appends are needed.
    |> Enum.group_by(fn {page, _index} -> page end, fn {_page, index} -> index end)
  end

  @doc """
  Get all chunks that belong to a specific page.
  """
  @spec get_chunks_by_page([map()], term()) :: [map()]
  def get_chunks_by_page(chunks, page_number) when is_list(chunks) do
    Enum.filter(chunks, &(extract_page_number(&1) == page_number))
  end

  @doc """
  Identify page boundaries and create page-level segments.

  Each run of consecutive chunks with the same page number becomes one
  `%{page: page, content: text, chunks: chunks}` segment, where `content` is
  the chunk texts joined with a single space.
  """
  @spec identify_page_segments([map()]) :: [map()]
  def identify_page_segments(chunks) when is_list(chunks) do
    chunks
    |> Enum.chunk_by(&extract_page_number/1)
    |> Enum.map(fn [first | _rest] = segment_chunks ->
      %{
        page: extract_page_number(first),
        content: Enum.map_join(segment_chunks, " ", &Map.get(&1, "text", "")),
        chunks: segment_chunks
      }
    end)
  end

  @doc """
  Analyze transitions between pages (e.g., page breaks).

  Returns `:total_pages`, the sorted unique `:page_numbers` and the `:gaps`
  between them as `{before, after}` tuples. Chunks without a page number are
  ignored.
  """
  @spec analyze_page_transitions([map()]) :: map()
  def analyze_page_transitions(chunks) do
    pages =
      chunks
      |> Enum.map(&extract_page_number/1)
      # A nil page would sort after every number and break the gap arithmetic.
      |> Enum.reject(&is_nil/1)
      |> Enum.uniq()
      |> Enum.sort()

    %{
      total_pages: length(pages),
      page_numbers: pages,
      gaps: find_page_gaps(pages)
    }
  end

  @doc """
  Get statistics for each page (chunk count, text length, etc).

  Returns one map per page, sorted by `:page_number`, with `:chunk_count`,
  `:total_text_length` and `:avg_chunk_size` (integer division).
  """
  @spec page_statistics([map()]) :: [map()]
  def page_statistics(chunks) when is_list(chunks) do
    chunks
    |> Enum.group_by(&extract_page_number/1)
    |> Enum.map(fn {page, page_chunks} ->
      chunk_count = length(page_chunks)

      total_length =
        Enum.reduce(page_chunks, 0, fn chunk, acc ->
          acc + String.length(Map.get(chunk, "text", ""))
        end)

      %{
        page_number: page,
        chunk_count: chunk_count,
        total_text_length: total_length,
        # Groups produced by group_by are never empty, so this cannot divide by zero.
        avg_chunk_size: div(total_length, chunk_count)
      }
    end)
    |> Enum.sort_by(& &1.page_number)
  end

  @doc """
  Extract content from a page range.

  Returns the chunks whose page number lies in `start_page..end_page`
  (inclusive). Chunks without a page number are excluded.
  """
  @spec extract_page_range([map()], term(), term()) :: [map()]
  def extract_page_range(chunks, start_page, end_page) when is_list(chunks) do
    Enum.filter(chunks, fn chunk ->
      page = extract_page_number(chunk)
      page && page >= start_page && page <= end_page
    end)
  end

  @doc """
  Add page boundary markers to chunks for processing.

  Writes `"page_number"`, `"is_first_on_page"`, `"is_last_on_page"` and the
  one-based `"position_on_page"` into each chunk's `"metadata"`. Positions are
  derived from chunk indexes, so chunks with equal content and pages that
  appear in several separate runs are still numbered correctly.
  """
  @spec add_page_markers([map()]) :: [map()]
  def add_page_markers(chunks) when is_list(chunks) do
    indexed = Enum.with_index(chunks)

    # chunk index => {position on its page, number of chunks on that page}
    placements =
      indexed
      |> Enum.group_by(
        fn {chunk, _index} -> extract_page_number(chunk) end,
        fn {_chunk, index} -> index end
      )
      |> Enum.flat_map(fn {_page, indexes} ->
        page_size = length(indexes)

        indexes
        |> Enum.with_index(1)
        |> Enum.map(fn {index, position} -> {index, {position, page_size}} end)
      end)
      |> Map.new()

    Enum.map(indexed, fn {chunk, index} ->
      {position, page_size} = Map.fetch!(placements, index)

      markers = %{
        "page_number" => extract_page_number(chunk),
        "is_first_on_page" => position == 1,
        "is_last_on_page" => position == page_size,
        "position_on_page" => position
      }

      Map.update(chunk, "metadata", markers, &Map.merge(&1, markers))
    end)
  end

  @doc """
  Generate a page index for quick access.

  Same mapping as `track_page_boundaries/1`: page number to chunk indexes.
  """
  @spec generate_page_index([map()]) :: %{optional(term()) => [non_neg_integer()]}
  def generate_page_index(chunks) when is_list(chunks), do: track_page_boundaries(chunks)

  @doc """
  Create a table of contents based on page boundaries and content structure.

  Returns one `%{page: page, content: first_line, content_type: type}` entry
  for the first chunk of every page; `first_line` is trimmed and truncated to
  100 characters followed by `"..."`.
  """
  @spec create_page_toc([map()]) :: [map()]
  def create_page_toc(chunks) do
    chunks
    |> add_page_markers()
    |> Enum.filter(fn chunk ->
      chunk |> Map.get("metadata", %{}) |> Map.get("is_first_on_page", false)
    end)
    |> Enum.map(fn chunk ->
      text = Map.get(chunk, "text", "")
      metadata = Map.get(chunk, "metadata", %{})

      %{
        page: Map.get(metadata, "page_number", "unknown"),
        content: toc_entry(text),
        content_type: detect_content_type(text)
      }
    end)
  end

  # Private helpers

  # Page number stored in the chunk's metadata, or nil when there is none.
  defp extract_page_number(chunk) do
    chunk
    |> Map.get("metadata", %{})
    |> Map.get("page_number")
  end

  # Pairs of neighbouring pages (in a sorted list) that are more than one apart.
  defp find_page_gaps(pages) do
    for [a, b] <- Enum.chunk_every(pages, 2, 1, :discard), b - a > 1, do: {a, b}
  end

  # First line of the text, trimmed and capped at 100 characters.
  defp toc_entry(text) do
    line =
      text
      |> String.split("\n", parts: 2)
      |> hd()
      |> String.trim()

    if String.length(line) > 100 do
      String.slice(line, 0, 100) <> "..."
    else
      line
    end
  end

  # Rough classification based on marker characters and text length.
  defp detect_content_type(text) do
    cond do
      String.contains?(text, ~w[# ## ===]) -> "heading"
      String.contains?(text, ~w[- * •]) -> "list"
      String.length(text) < 100 -> "snippet"
      true -> "body"
    end
  end
end
# Example usage
config = %Kreuzberg.ExtractionConfig{
  chunking: %{"enabled" => true, "max_characters" => 1000}
}

{:ok, result} = Kreuzberg.extract_file("multipage_doc.pdf", nil, config)
chunks = result.chunks || []

IO.puts("=== Page Boundaries ===")
IO.inspect(PageBoundaryUtils.track_page_boundaries(chunks))

IO.puts("\n=== Page Statistics ===")
IO.inspect(PageBoundaryUtils.page_statistics(chunks))

IO.puts("\n=== Page Transitions Analysis ===")
IO.inspect(PageBoundaryUtils.analyze_page_transitions(chunks))

IO.puts("\n=== Chunks from Page 2-4 ===")
IO.inspect(chunks |> PageBoundaryUtils.extract_page_range(2, 4) |> length())

IO.puts("\n=== Page Markers Added ===")
marked_chunks = PageBoundaryUtils.add_page_markers(chunks)
# List.first/1 returns nil for an empty list; hd/1 would raise when the
# extraction produced no chunks.
IO.inspect(List.first(marked_chunks))

IO.puts("\n=== Page Index ===")
IO.inspect(PageBoundaryUtils.generate_page_index(chunks))

IO.puts("\n=== Table of Contents ===")
IO.inspect(PageBoundaryUtils.create_page_toc(chunks))

View File

@@ -0,0 +1,35 @@
```elixir title="Elixir"
config = Jason.encode!(%{})

case Kreuzberg.extract_file_sync("document.pdf", nil, config) do
  {:ok, result} ->
    case Jason.decode!(result) do
      %{"metadata" => %{"pages" => %{"boundaries" => boundaries}}, "content" => content}
      when is_list(boundaries) ->
        # Preview only the first three page boundaries.
        boundaries
        |> Enum.take(3)
        |> Enum.each(fn boundary ->
          byte_start = boundary["byte_start"]
          byte_end = boundary["byte_end"]
          page_number = boundary["page_number"]

          # The boundaries are byte offsets, so the page text is cut out of the
          # binary by bytes. String.slice/3 counts graphemes and would select
          # the wrong text once the content holds multi-byte characters.
          # binary_part/3 raises if the range lies outside the content.
          page_text = binary_part(content, byte_start, byte_end - byte_start)

          # The preview itself is at most 100 characters (graphemes).
          preview = String.slice(page_text, 0, 100)

          IO.puts("Page #{page_number}:")
          IO.puts(" Byte range: #{byte_start}-#{byte_end}")
          IO.puts(" Preview: #{preview}...")
        end)

      _ ->
        nil
    end

  {:error, reason} ->
    IO.puts("Error: #{reason}")
end
```

View File

@@ -0,0 +1,71 @@
```elixir title="Elixir"
# Page tracking example: print the page-related information that the
# extraction result exposes for the document, its chunks, tables and images.
alias Kreuzberg.ExtractionConfig

config = %ExtractionConfig{use_cache: true}
{:ok, result} = Kreuzberg.extract_file("multi_page_document.pdf", nil, config)

# Document-level PDF metadata, when the result carries any.
metadata = result.metadata || %{}

case metadata["pdf"] do
  %{} = pdf_meta ->
    IO.puts("Total pages in document: #{pdf_meta["page_count"]}")
    IO.puts("Document title: #{pdf_meta["title"]}")
    IO.puts("Document author: #{pdf_meta["author"]}")

  _ ->
    IO.puts("No PDF metadata available")
end

# Chunks: report the size and a short preview of each one.
chunks = result.chunks || []
IO.puts("\nTotal chunks extracted: #{length(chunks)}")

chunks
|> Enum.with_index(1)
|> Enum.each(fn {chunk, position} ->
  chunk_text = Map.get(chunk, "text", "")
  IO.puts("Chunk #{position}: #{byte_size(chunk_text)} bytes")
  IO.puts(" Content preview: #{String.slice(chunk_text, 0..80)}...")
end)

# Tables: the source page is read from the table's metadata when present.
tables = result.tables || []
IO.puts("\nTotal tables found: #{length(tables)}")

tables
|> Enum.with_index(1)
|> Enum.each(fn {table, position} ->
  rows = table["cells"] || []
  IO.puts("Table #{position}: #{length(rows)} rows")

  source_page =
    case table["metadata"] do
      %{} = table_meta -> table_meta["page"] || "Unknown"
      _ -> "Unknown"
    end

  IO.puts(" Page: #{source_page}")
end)

# Images: format, size and (when available) the page they came from.
images = result.images || []
IO.puts("\nTotal images found: #{length(images)}")

images
|> Enum.with_index(1)
|> Enum.each(fn {image, position} ->
  IO.puts("Image #{position}:")
  IO.puts(" Format: #{image["format"]}")
  IO.puts(" Size: #{image["size"]} bytes")

  if image_page = image["page"] do
    IO.puts(" Page: #{image_page}")
  end
end)
```

View File

@@ -0,0 +1,31 @@
```elixir title="Elixir"
# Configuration with page extraction switched on, serialized to JSON.
config = Jason.encode!(%{"pages" => %{"extract_pages" => true}})

case Kreuzberg.extract_file_sync("document.pdf", nil, config) do
  {:error, reason} ->
    IO.puts("Error: #{reason}")

  {:ok, result} ->
    case Jason.decode!(result) do
      %{"pages" => pages} when is_list(pages) ->
        # One summary per page; missing table/image lists count as empty.
        Enum.each(pages, fn page ->
          IO.puts("Page #{page["page_number"]}:")
          IO.puts(" Content: #{String.length(page["content"])} chars")
          IO.puts(" Tables: #{length(page["tables"] || [])}")
          IO.puts(" Images: #{length(page["images"] || [])}")
        end)

      _ ->
        nil
    end
end
```

View File

@@ -0,0 +1,108 @@
```elixir title="Elixir"
# Extract and process comprehensive PDF metadata
# Useful for document indexing, cataloging, and metadata-driven workflows
alias Kreuzberg.ExtractionConfig
defmodule PDFMetadataExtractor do
  @moduledoc """
  Extract and process PDF metadata from documents.
  Provides structured access to PDF properties and document information.
  """

  @doc """
  Extract PDF metadata from a file.

  Returns `{:ok, metadata}` with a map of normalized metadata fields, or
  `{:error, reason}` when the extraction fails or the result carries no PDF
  metadata.
  """
  def extract_metadata(file_path) do
    config = %ExtractionConfig{use_cache: true}

    case Kreuzberg.extract_file(file_path, nil, config) do
      {:ok, result} -> process_metadata(result.metadata || %{})
      {:error, reason} -> {:error, reason}
    end
  end

  @doc """
  Format metadata for display.

  Takes the atom-keyed map returned by `extract_metadata/1` and returns a
  multi-line string. Missing text fields are shown as `"N/A"`.
  """
  @spec format_metadata(map()) :: String.t()
  def format_metadata(metadata) when is_map(metadata) do
    """
    === PDF Metadata ===
    Title: #{metadata[:title] || "N/A"}
    Author: #{metadata[:author] || "N/A"}
    Subject: #{metadata[:subject] || "N/A"}
    Keywords: #{format_keywords(metadata[:keywords])}
    Creator: #{metadata[:creator] || "N/A"}
    Producer: #{metadata[:producer] || "N/A"}
    === Document Properties ===
    Pages: #{metadata[:page_count]}
    Version: #{metadata[:version] || "N/A"}
    Encrypted: #{metadata[:is_encrypted]}
    Tagged (Accessible): #{metadata[:is_tagged]}
    === Dates ===
    Created: #{metadata[:creation_date] || "N/A"}
    Modified: #{metadata[:modification_date] || "N/A"}
    """
  end

  # Process and normalize PDF metadata.
  # Reads the "pdf" entry of the metadata map; `page_count` defaults to 0 and
  # the two boolean flags to false. (A plain comment is used because @doc is
  # discarded, with a compiler warning, on private functions.)
  defp process_metadata(metadata) do
    case metadata["pdf"] do
      pdf_meta when is_map(pdf_meta) ->
        {:ok,
         %{
           page_count: pdf_meta["page_count"] || 0,
           title: pdf_meta["title"],
           author: pdf_meta["author"],
           subject: pdf_meta["subject"],
           keywords: pdf_meta["keywords"],
           creator: pdf_meta["creator"],
           producer: pdf_meta["producer"],
           creation_date: pdf_meta["creation_date"],
           modification_date: pdf_meta["modification_date"],
           version: pdf_meta["version"],
           is_encrypted: pdf_meta["is_encrypted"] || false,
           is_tagged: pdf_meta["is_tagged"] || false
         }}

      _ ->
        {:error, "No PDF metadata found"}
    end
  end

  # inspect(nil) is the truthy string "nil", so the "N/A" fallback has to be
  # chosen before inspecting.
  defp format_keywords(nil), do: "N/A"
  defp format_keywords(keywords), do: inspect(keywords)
end
# Usage example
file_path = "document.pdf"

case PDFMetadataExtractor.extract_metadata(file_path) do
  {:error, reason} ->
    IO.puts("Error extracting metadata: #{reason}")

  {:ok, metadata} ->
    IO.puts(PDFMetadataExtractor.format_metadata(metadata))

    # Metadata-driven notes: each message is printed only when its condition holds.
    page_count = metadata[:page_count]

    notes = [
      {page_count > 100, "Note: Document is large (#{page_count} pages)"},
      {metadata[:is_encrypted], "Note: Document is password-protected"},
      {metadata[:is_tagged], "Note: Document is accessible with tags"}
    ]

    notes
    |> Enum.filter(fn {applies?, _note} -> applies? end)
    |> Enum.each(fn {_applies?, note} -> IO.puts(note) end)
end
```

View File

@@ -0,0 +1,29 @@
```elixir title="Elixir"
config = Jason.encode!(%{})

case Kreuzberg.extract_file_sync("document.pdf", nil, config) do
  {:error, reason} ->
    IO.puts("Error: #{reason}")

  {:ok, result} ->
    case Jason.decode!(result) do
      %{"tables" => tables} when is_list(tables) ->
        # For every table: row count, its markdown rendering, then each row.
        Enum.each(tables, fn table ->
          rows = table["cells"] || []
          markdown = table["markdown"] || ""

          IO.puts("Table with #{length(rows)} rows")
          IO.puts("#{markdown}")
          Enum.each(rows, &IO.inspect/1)
        end)

      _ ->
        nil
    end
end
```

View File

@@ -0,0 +1,66 @@
```elixir title="Elixir"
defmodule VectorRecord do
  @moduledoc """
  A record with an id, its content, an embedding and a metadata map.
  All fields default to `nil`.
  """

  defstruct id: nil, content: nil, embedding: nil, metadata: nil
end
defmodule VectorIntegration do
  @moduledoc """
  Turns the embedded chunks of an extracted document into `VectorRecord`s.
  """

  @doc """
  Extracts `document_path` with chunking and embedding configured and builds
  one `VectorRecord` per chunk that has both a list `"embedding"` and a
  `"content"` entry.

  Returns `{:ok, records}`, `{:error, "No chunks in extraction result"}` when
  the decoded result has no `"chunks"` list, or the `{:error, reason}` of the
  extraction call.
  """
  def extract_and_vectorize(document_path, document_id) do
    case Kreuzberg.extract_file_sync(document_path, nil, chunking_config()) do
      {:error, reason} ->
        {:error, reason}

      {:ok, result} ->
        result
        |> Jason.decode!()
        |> build_records(document_id)
    end
  end

  # JSON-encoded chunking and embedding settings for the extraction call.
  defp chunking_config do
    Jason.encode!(%{
      "chunking" => %{
        "max_characters" => 512,
        "overlap" => 50,
        "embedding" => %{
          "model" => %{"preset" => %{"name" => "balanced"}},
          "normalize" => true,
          "batch_size" => 32
        }
      }
    })
  end

  # Builds the records from a decoded result; chunks that lack a list
  # embedding or a content entry are skipped but still consume an index.
  defp build_records(%{"chunks" => chunks}, document_id) when is_list(chunks) do
    records =
      for {%{"embedding" => embedding, "content" => content}, index} <- Enum.with_index(chunks),
          is_list(embedding) do
        %VectorRecord{
          id: "#{document_id}_chunk_#{index}",
          content: content,
          embedding: embedding,
          metadata: %{
            "document_id" => document_id,
            "chunk_index" => Integer.to_string(index),
            "content_length" => Integer.to_string(String.length(content))
          }
        }
      end

    {:ok, records}
  end

  defp build_records(_decoded, _document_id), do: {:error, "No chunks in extraction result"}
end
```