Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,215 @@
# General metadata extraction and manipulation utilities
defmodule MetadataUtils do
@doc """
Extract standard metadata fields from extraction result.
"""
def extract_standard_metadata(result) do
metadata = Map.get(result, "metadata", %{})
%{
title: Map.get(metadata, "title", "Unknown"),
author: Map.get(metadata, "author", "Unknown"),
creation_date: Map.get(metadata, "creation_date"),
modification_date: Map.get(metadata, "modification_date"),
language: Map.get(metadata, "language"),
page_count: Map.get(metadata, "page_count", 0),
format: Map.get(metadata, "format", "unknown")
}
|> Enum.reject(fn {_k, v} -> is_nil(v) end)
|> Enum.into(%{})
end
@doc """
Merge metadata from multiple extraction results.
"""
def merge_metadata(results) when is_list(results) do
results
|> Enum.reduce(%{}, fn result, acc ->
metadata = Map.get(result, "metadata", %{})
Enum.reduce(metadata, acc, fn {key, value}, meta_acc ->
existing = Map.get(meta_acc, key, [])
updated_value =
case existing do
[] -> [value]
list when is_list(list) -> list ++ [value]
single -> [single, value]
end
Map.put(meta_acc, key, updated_value)
end)
end)
end
@doc """
Enrich chunks with additional metadata.
"""
def enrich_chunks_metadata(chunks, source_metadata) do
Enum.map(chunks, fn chunk ->
chunk_metadata = Map.get(chunk, "metadata", %{})
enriched =
source_metadata
|> Enum.reject(fn {_k, v} -> is_nil(v) end)
|> Enum.into(chunk_metadata)
Map.put(chunk, "metadata", enriched)
end)
end
@doc """
Extract metadata specific to a chunk's context (position, size, etc).
"""
def extract_chunk_context(chunks) do
total_chunks = length(chunks)
chunks
|> Enum.with_index()
|> Enum.map(fn {chunk, index} ->
metadata = Map.get(chunk, "metadata", %{})
text = Map.get(chunk, "text", "")
context = %{
"chunk_index" => index,
"chunk_number" => index + 1,
"total_chunks" => total_chunks,
"position_percentage" => Float.round((index + 1) / total_chunks * 100, 2),
"text_length" => String.length(text),
"word_count" => String.split(text) |> length(),
"has_headings" => String.contains?(text, ~w[## # ===]),
"has_lists" => String.contains?(text, ["- ", "* "])
}
enriched_metadata = Map.merge(metadata, context)
Map.put(chunk, "metadata", enriched_metadata)
end)
end
@doc """
Filter metadata by a set of allowed keys.
"""
def filter_metadata_keys(data, allowed_keys) when is_list(allowed_keys) do
metadata = Map.get(data, "metadata", %{})
filtered =
metadata
|> Enum.filter(fn {key, _value} -> Enum.member?(allowed_keys, key) end)
|> Enum.into(%{})
Map.put(data, "metadata", filtered)
end
@doc """
Create a metadata summary across all chunks.
"""
def metadata_summary(chunks) do
%{
total_chunks: length(chunks),
total_text_length: Enum.reduce(chunks, 0, fn chunk, acc ->
String.length(Map.get(chunk, "text", "")) + acc
end),
avg_chunk_size: calculate_avg_size(chunks),
metadata_fields: extract_all_metadata_fields(chunks),
enrichment_level: assess_enrichment(chunks)
}
end
@doc """
Generate a human-readable metadata report.
"""
def generate_report(result) do
standard = extract_standard_metadata(result)
chunks = Map.get(result, "chunks", [])
"""
=== Document Metadata Report ===
Standard Fields:
#{format_dict(standard)}
Chunk Statistics:
- Total Chunks: #{length(chunks)}
- Avg Chunk Size: #{calculate_avg_size(chunks)} characters
Metadata Summary:
#{format_dict(metadata_summary(chunks))}
"""
end
# Private helpers
defp calculate_avg_size(chunks) do
case chunks do
[] ->
0
chunks ->
total = Enum.reduce(chunks, 0, fn chunk, acc ->
String.length(Map.get(chunk, "text", "")) + acc
end)
div(total, length(chunks))
end
end
defp extract_all_metadata_fields(chunks) do
chunks
|> Enum.flat_map(fn chunk ->
chunk
|> Map.get("metadata", %{})
|> Map.keys()
end)
|> Enum.uniq()
end
defp assess_enrichment(chunks) do
avg_fields =
chunks
|> Enum.map(fn chunk ->
chunk
|> Map.get("metadata", %{})
|> map_size()
end)
|> then(fn sizes ->
if Enum.empty?(sizes) do
0
else
div(Enum.sum(sizes), length(sizes))
end
end)
case avg_fields do
count when count >= 5 -> "high"
count when count >= 2 -> "medium"
_ -> "low"
end
end
defp format_dict(dict) when is_map(dict) do
dict
|> Enum.map(fn {key, value} ->
" #{key}: #{inspect(value)}"
end)
|> Enum.join("\n")
end
end
# Example usage
config = %Kreuzberg.ExtractionConfig{
chunking: %{"enabled" => true, "max_characters" => 1000}
}
{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)
IO.puts("=== Standard Metadata ===")
IO.inspect(MetadataUtils.extract_standard_metadata(result))
IO.puts("\n=== Chunk Context ===")
enhanced_chunks = MetadataUtils.extract_chunk_context(result.chunks || [])
IO.inspect(hd(enhanced_chunks))
IO.puts("\n=== Metadata Summary ===")
IO.inspect(MetadataUtils.metadata_summary(result.chunks || []))
IO.puts("\n=== Report ===")
IO.puts(MetadataUtils.generate_report(result))