216 lines
5.5 KiB
Elixir
216 lines
5.5 KiB
Elixir
|
|
# General metadata extraction and manipulation utilities
|
||
|
|
defmodule MetadataUtils do
|
||
|
|
@doc """
|
||
|
|
Extract standard metadata fields from extraction result.
|
||
|
|
"""
|
||
|
|
def extract_standard_metadata(result) do
|
||
|
|
metadata = Map.get(result, "metadata", %{})
|
||
|
|
%{
|
||
|
|
title: Map.get(metadata, "title", "Unknown"),
|
||
|
|
author: Map.get(metadata, "author", "Unknown"),
|
||
|
|
creation_date: Map.get(metadata, "creation_date"),
|
||
|
|
modification_date: Map.get(metadata, "modification_date"),
|
||
|
|
language: Map.get(metadata, "language"),
|
||
|
|
page_count: Map.get(metadata, "page_count", 0),
|
||
|
|
format: Map.get(metadata, "format", "unknown")
|
||
|
|
}
|
||
|
|
|> Enum.reject(fn {_k, v} -> is_nil(v) end)
|
||
|
|
|> Enum.into(%{})
|
||
|
|
end
|
||
|
|
|
||
|
|
@doc """
|
||
|
|
Merge metadata from multiple extraction results.
|
||
|
|
"""
|
||
|
|
def merge_metadata(results) when is_list(results) do
|
||
|
|
results
|
||
|
|
|> Enum.reduce(%{}, fn result, acc ->
|
||
|
|
metadata = Map.get(result, "metadata", %{})
|
||
|
|
|
||
|
|
Enum.reduce(metadata, acc, fn {key, value}, meta_acc ->
|
||
|
|
existing = Map.get(meta_acc, key, [])
|
||
|
|
|
||
|
|
updated_value =
|
||
|
|
case existing do
|
||
|
|
[] -> [value]
|
||
|
|
list when is_list(list) -> list ++ [value]
|
||
|
|
single -> [single, value]
|
||
|
|
end
|
||
|
|
|
||
|
|
Map.put(meta_acc, key, updated_value)
|
||
|
|
end)
|
||
|
|
end)
|
||
|
|
end
|
||
|
|
|
||
|
|
@doc """
|
||
|
|
Enrich chunks with additional metadata.
|
||
|
|
"""
|
||
|
|
def enrich_chunks_metadata(chunks, source_metadata) do
|
||
|
|
Enum.map(chunks, fn chunk ->
|
||
|
|
chunk_metadata = Map.get(chunk, "metadata", %{})
|
||
|
|
|
||
|
|
enriched =
|
||
|
|
source_metadata
|
||
|
|
|> Enum.reject(fn {_k, v} -> is_nil(v) end)
|
||
|
|
|> Enum.into(chunk_metadata)
|
||
|
|
|
||
|
|
Map.put(chunk, "metadata", enriched)
|
||
|
|
end)
|
||
|
|
end
|
||
|
|
|
||
|
|
@doc """
|
||
|
|
Extract metadata specific to a chunk's context (position, size, etc).
|
||
|
|
"""
|
||
|
|
def extract_chunk_context(chunks) do
|
||
|
|
total_chunks = length(chunks)
|
||
|
|
|
||
|
|
chunks
|
||
|
|
|> Enum.with_index()
|
||
|
|
|> Enum.map(fn {chunk, index} ->
|
||
|
|
metadata = Map.get(chunk, "metadata", %{})
|
||
|
|
text = Map.get(chunk, "text", "")
|
||
|
|
|
||
|
|
context = %{
|
||
|
|
"chunk_index" => index,
|
||
|
|
"chunk_number" => index + 1,
|
||
|
|
"total_chunks" => total_chunks,
|
||
|
|
"position_percentage" => Float.round((index + 1) / total_chunks * 100, 2),
|
||
|
|
"text_length" => String.length(text),
|
||
|
|
"word_count" => String.split(text) |> length(),
|
||
|
|
"has_headings" => String.contains?(text, ~w[## # ===]),
|
||
|
|
"has_lists" => String.contains?(text, ["- ", "* "])
|
||
|
|
}
|
||
|
|
|
||
|
|
enriched_metadata = Map.merge(metadata, context)
|
||
|
|
Map.put(chunk, "metadata", enriched_metadata)
|
||
|
|
end)
|
||
|
|
end
|
||
|
|
|
||
|
|
@doc """
|
||
|
|
Filter metadata by a set of allowed keys.
|
||
|
|
"""
|
||
|
|
def filter_metadata_keys(data, allowed_keys) when is_list(allowed_keys) do
|
||
|
|
metadata = Map.get(data, "metadata", %{})
|
||
|
|
|
||
|
|
filtered =
|
||
|
|
metadata
|
||
|
|
|> Enum.filter(fn {key, _value} -> Enum.member?(allowed_keys, key) end)
|
||
|
|
|> Enum.into(%{})
|
||
|
|
|
||
|
|
Map.put(data, "metadata", filtered)
|
||
|
|
end
|
||
|
|
|
||
|
|
@doc """
|
||
|
|
Create a metadata summary across all chunks.
|
||
|
|
"""
|
||
|
|
def metadata_summary(chunks) do
|
||
|
|
%{
|
||
|
|
total_chunks: length(chunks),
|
||
|
|
total_text_length: Enum.reduce(chunks, 0, fn chunk, acc ->
|
||
|
|
String.length(Map.get(chunk, "text", "")) + acc
|
||
|
|
end),
|
||
|
|
avg_chunk_size: calculate_avg_size(chunks),
|
||
|
|
metadata_fields: extract_all_metadata_fields(chunks),
|
||
|
|
enrichment_level: assess_enrichment(chunks)
|
||
|
|
}
|
||
|
|
end
|
||
|
|
|
||
|
|
@doc """
|
||
|
|
Generate a human-readable metadata report.
|
||
|
|
"""
|
||
|
|
def generate_report(result) do
|
||
|
|
standard = extract_standard_metadata(result)
|
||
|
|
chunks = Map.get(result, "chunks", [])
|
||
|
|
|
||
|
|
"""
|
||
|
|
=== Document Metadata Report ===
|
||
|
|
|
||
|
|
Standard Fields:
|
||
|
|
#{format_dict(standard)}
|
||
|
|
|
||
|
|
Chunk Statistics:
|
||
|
|
- Total Chunks: #{length(chunks)}
|
||
|
|
- Avg Chunk Size: #{calculate_avg_size(chunks)} characters
|
||
|
|
|
||
|
|
Metadata Summary:
|
||
|
|
#{format_dict(metadata_summary(chunks))}
|
||
|
|
"""
|
||
|
|
end
|
||
|
|
|
||
|
|
# Private helpers
|
||
|
|
defp calculate_avg_size(chunks) do
|
||
|
|
case chunks do
|
||
|
|
[] ->
|
||
|
|
0
|
||
|
|
|
||
|
|
chunks ->
|
||
|
|
total = Enum.reduce(chunks, 0, fn chunk, acc ->
|
||
|
|
String.length(Map.get(chunk, "text", "")) + acc
|
||
|
|
end)
|
||
|
|
|
||
|
|
div(total, length(chunks))
|
||
|
|
end
|
||
|
|
end
|
||
|
|
|
||
|
|
defp extract_all_metadata_fields(chunks) do
|
||
|
|
chunks
|
||
|
|
|> Enum.flat_map(fn chunk ->
|
||
|
|
chunk
|
||
|
|
|> Map.get("metadata", %{})
|
||
|
|
|> Map.keys()
|
||
|
|
end)
|
||
|
|
|> Enum.uniq()
|
||
|
|
end
|
||
|
|
|
||
|
|
defp assess_enrichment(chunks) do
|
||
|
|
avg_fields =
|
||
|
|
chunks
|
||
|
|
|> Enum.map(fn chunk ->
|
||
|
|
chunk
|
||
|
|
|> Map.get("metadata", %{})
|
||
|
|
|> map_size()
|
||
|
|
end)
|
||
|
|
|> then(fn sizes ->
|
||
|
|
if Enum.empty?(sizes) do
|
||
|
|
0
|
||
|
|
else
|
||
|
|
div(Enum.sum(sizes), length(sizes))
|
||
|
|
end
|
||
|
|
end)
|
||
|
|
|
||
|
|
case avg_fields do
|
||
|
|
count when count >= 5 -> "high"
|
||
|
|
count when count >= 2 -> "medium"
|
||
|
|
_ -> "low"
|
||
|
|
end
|
||
|
|
end
|
||
|
|
|
||
|
|
defp format_dict(dict) when is_map(dict) do
|
||
|
|
dict
|
||
|
|
|> Enum.map(fn {key, value} ->
|
||
|
|
" #{key}: #{inspect(value)}"
|
||
|
|
end)
|
||
|
|
|> Enum.join("\n")
|
||
|
|
end
|
||
|
|
end
|
||
|
|
|
||
|
|
# Example usage
|
||
|
|
config = %Kreuzberg.ExtractionConfig{
|
||
|
|
chunking: %{"enabled" => true, "max_characters" => 1000}
|
||
|
|
}
|
||
|
|
|
||
|
|
{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)
|
||
|
|
|
||
|
|
IO.puts("=== Standard Metadata ===")
|
||
|
|
IO.inspect(MetadataUtils.extract_standard_metadata(result))
|
||
|
|
|
||
|
|
IO.puts("\n=== Chunk Context ===")
|
||
|
|
|
||
|
|
enhanced_chunks = MetadataUtils.extract_chunk_context(result.chunks || [])
|
||
|
|
IO.inspect(hd(enhanced_chunks))
|
||
|
|
|
||
|
|
IO.puts("\n=== Metadata Summary ===")
|
||
|
|
IO.inspect(MetadataUtils.metadata_summary(result.chunks || []))
|
||
|
|
|
||
|
|
IO.puts("\n=== Report ===")
|
||
|
|
IO.puts(MetadataUtils.generate_report(result))
|