This commit is contained in:
215
docs/snippets/elixir/metadata/metadata.exs
Normal file
215
docs/snippets/elixir/metadata/metadata.exs
Normal file
@@ -0,0 +1,215 @@
|
||||
# General metadata extraction and manipulation utilities
|
||||
defmodule MetadataUtils do
|
||||
@doc """
|
||||
Extract standard metadata fields from extraction result.
|
||||
"""
|
||||
def extract_standard_metadata(result) do
|
||||
metadata = Map.get(result, "metadata", %{})
|
||||
%{
|
||||
title: Map.get(metadata, "title", "Unknown"),
|
||||
author: Map.get(metadata, "author", "Unknown"),
|
||||
creation_date: Map.get(metadata, "creation_date"),
|
||||
modification_date: Map.get(metadata, "modification_date"),
|
||||
language: Map.get(metadata, "language"),
|
||||
page_count: Map.get(metadata, "page_count", 0),
|
||||
format: Map.get(metadata, "format", "unknown")
|
||||
}
|
||||
|> Enum.reject(fn {_k, v} -> is_nil(v) end)
|
||||
|> Enum.into(%{})
|
||||
end
|
||||
|
||||
@doc """
|
||||
Merge metadata from multiple extraction results.
|
||||
"""
|
||||
def merge_metadata(results) when is_list(results) do
|
||||
results
|
||||
|> Enum.reduce(%{}, fn result, acc ->
|
||||
metadata = Map.get(result, "metadata", %{})
|
||||
|
||||
Enum.reduce(metadata, acc, fn {key, value}, meta_acc ->
|
||||
existing = Map.get(meta_acc, key, [])
|
||||
|
||||
updated_value =
|
||||
case existing do
|
||||
[] -> [value]
|
||||
list when is_list(list) -> list ++ [value]
|
||||
single -> [single, value]
|
||||
end
|
||||
|
||||
Map.put(meta_acc, key, updated_value)
|
||||
end)
|
||||
end)
|
||||
end
|
||||
|
||||
@doc """
|
||||
Enrich chunks with additional metadata.
|
||||
"""
|
||||
def enrich_chunks_metadata(chunks, source_metadata) do
|
||||
Enum.map(chunks, fn chunk ->
|
||||
chunk_metadata = Map.get(chunk, "metadata", %{})
|
||||
|
||||
enriched =
|
||||
source_metadata
|
||||
|> Enum.reject(fn {_k, v} -> is_nil(v) end)
|
||||
|> Enum.into(chunk_metadata)
|
||||
|
||||
Map.put(chunk, "metadata", enriched)
|
||||
end)
|
||||
end
|
||||
|
||||
@doc """
|
||||
Extract metadata specific to a chunk's context (position, size, etc).
|
||||
"""
|
||||
def extract_chunk_context(chunks) do
|
||||
total_chunks = length(chunks)
|
||||
|
||||
chunks
|
||||
|> Enum.with_index()
|
||||
|> Enum.map(fn {chunk, index} ->
|
||||
metadata = Map.get(chunk, "metadata", %{})
|
||||
text = Map.get(chunk, "text", "")
|
||||
|
||||
context = %{
|
||||
"chunk_index" => index,
|
||||
"chunk_number" => index + 1,
|
||||
"total_chunks" => total_chunks,
|
||||
"position_percentage" => Float.round((index + 1) / total_chunks * 100, 2),
|
||||
"text_length" => String.length(text),
|
||||
"word_count" => String.split(text) |> length(),
|
||||
"has_headings" => String.contains?(text, ~w[## # ===]),
|
||||
"has_lists" => String.contains?(text, ["- ", "* "])
|
||||
}
|
||||
|
||||
enriched_metadata = Map.merge(metadata, context)
|
||||
Map.put(chunk, "metadata", enriched_metadata)
|
||||
end)
|
||||
end
|
||||
|
||||
@doc """
|
||||
Filter metadata by a set of allowed keys.
|
||||
"""
|
||||
def filter_metadata_keys(data, allowed_keys) when is_list(allowed_keys) do
|
||||
metadata = Map.get(data, "metadata", %{})
|
||||
|
||||
filtered =
|
||||
metadata
|
||||
|> Enum.filter(fn {key, _value} -> Enum.member?(allowed_keys, key) end)
|
||||
|> Enum.into(%{})
|
||||
|
||||
Map.put(data, "metadata", filtered)
|
||||
end
|
||||
|
||||
@doc """
|
||||
Create a metadata summary across all chunks.
|
||||
"""
|
||||
def metadata_summary(chunks) do
|
||||
%{
|
||||
total_chunks: length(chunks),
|
||||
total_text_length: Enum.reduce(chunks, 0, fn chunk, acc ->
|
||||
String.length(Map.get(chunk, "text", "")) + acc
|
||||
end),
|
||||
avg_chunk_size: calculate_avg_size(chunks),
|
||||
metadata_fields: extract_all_metadata_fields(chunks),
|
||||
enrichment_level: assess_enrichment(chunks)
|
||||
}
|
||||
end
|
||||
|
||||
@doc """
|
||||
Generate a human-readable metadata report.
|
||||
"""
|
||||
def generate_report(result) do
|
||||
standard = extract_standard_metadata(result)
|
||||
chunks = Map.get(result, "chunks", [])
|
||||
|
||||
"""
|
||||
=== Document Metadata Report ===
|
||||
|
||||
Standard Fields:
|
||||
#{format_dict(standard)}
|
||||
|
||||
Chunk Statistics:
|
||||
- Total Chunks: #{length(chunks)}
|
||||
- Avg Chunk Size: #{calculate_avg_size(chunks)} characters
|
||||
|
||||
Metadata Summary:
|
||||
#{format_dict(metadata_summary(chunks))}
|
||||
"""
|
||||
end
|
||||
|
||||
# Private helpers
|
||||
defp calculate_avg_size(chunks) do
|
||||
case chunks do
|
||||
[] ->
|
||||
0
|
||||
|
||||
chunks ->
|
||||
total = Enum.reduce(chunks, 0, fn chunk, acc ->
|
||||
String.length(Map.get(chunk, "text", "")) + acc
|
||||
end)
|
||||
|
||||
div(total, length(chunks))
|
||||
end
|
||||
end
|
||||
|
||||
defp extract_all_metadata_fields(chunks) do
|
||||
chunks
|
||||
|> Enum.flat_map(fn chunk ->
|
||||
chunk
|
||||
|> Map.get("metadata", %{})
|
||||
|> Map.keys()
|
||||
end)
|
||||
|> Enum.uniq()
|
||||
end
|
||||
|
||||
defp assess_enrichment(chunks) do
|
||||
avg_fields =
|
||||
chunks
|
||||
|> Enum.map(fn chunk ->
|
||||
chunk
|
||||
|> Map.get("metadata", %{})
|
||||
|> map_size()
|
||||
end)
|
||||
|> then(fn sizes ->
|
||||
if Enum.empty?(sizes) do
|
||||
0
|
||||
else
|
||||
div(Enum.sum(sizes), length(sizes))
|
||||
end
|
||||
end)
|
||||
|
||||
case avg_fields do
|
||||
count when count >= 5 -> "high"
|
||||
count when count >= 2 -> "medium"
|
||||
_ -> "low"
|
||||
end
|
||||
end
|
||||
|
||||
defp format_dict(dict) when is_map(dict) do
|
||||
dict
|
||||
|> Enum.map(fn {key, value} ->
|
||||
" #{key}: #{inspect(value)}"
|
||||
end)
|
||||
|> Enum.join("\n")
|
||||
end
|
||||
end
|
||||
|
||||
# Example usage
|
||||
config = %Kreuzberg.ExtractionConfig{
|
||||
chunking: %{"enabled" => true, "max_characters" => 1000}
|
||||
}
|
||||
|
||||
{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)
|
||||
|
||||
IO.puts("=== Standard Metadata ===")
|
||||
IO.inspect(MetadataUtils.extract_standard_metadata(result))
|
||||
|
||||
IO.puts("\n=== Chunk Context ===")
|
||||
|
||||
enhanced_chunks = MetadataUtils.extract_chunk_context(result.chunks || [])
|
||||
IO.inspect(hd(enhanced_chunks))
|
||||
|
||||
IO.puts("\n=== Metadata Summary ===")
|
||||
IO.inspect(MetadataUtils.metadata_summary(result.chunks || []))
|
||||
|
||||
IO.puts("\n=== Report ===")
|
||||
IO.puts(MetadataUtils.generate_report(result))
|
||||
Reference in New Issue
Block a user