This commit is contained in:
27
docs/snippets/elixir/metadata/language_detection.md
Normal file
27
docs/snippets/elixir/metadata/language_detection.md
Normal file
@@ -0,0 +1,27 @@
|
||||
```elixir title="Elixir"
|
||||
# Language detection settings: enabled, with a 0.9 confidence floor, and
# allowed to report more than one language. Kreuzberg receives them as JSON.
config =
  Jason.encode!(%{
    "language_detection" => %{
      "enabled" => true,
      "min_confidence" => 0.9,
      "detect_multiple" => true
    }
  })

# Prints the detected languages when the decoded result carries them as a
# list; any other shape is reported as "no results".
report_languages = fn
  %{"detected_languages" => languages} when is_list(languages) ->
    IO.inspect(languages, label: "Detected languages")

  _other ->
    IO.puts("No language detection results")
end

case Kreuzberg.extract_file_sync("document.pdf", nil, config) do
  {:ok, json} -> json |> Jason.decode!() |> report_languages.()
  {:error, reason} -> IO.puts("Error: #{reason}")
end
|
||||
```
|
||||
@@ -0,0 +1,27 @@
|
||||
```elixir title="Elixir"
|
||||
# Same setup as a single-language run, but with a lower confidence floor
# (0.8) so secondary languages in a mixed document are still reported.
detection_options = %{
  "enabled" => true,
  "min_confidence" => 0.8,
  "detect_multiple" => true
}

config = Jason.encode!(%{"language_detection" => detection_options})

case Kreuzberg.extract_file_sync("multilingual_document.pdf", nil, config) do
  {:error, reason} ->
    IO.puts("Error: #{reason}")

  {:ok, result} ->
    # The extraction result arrives as a JSON string; decode before matching.
    case Jason.decode!(result) do
      %{"detected_languages" => languages} when is_list(languages) ->
        IO.inspect(languages, label: "Detected languages")

      _ ->
        IO.puts("No language detection results")
    end
end
|
||||
```
|
||||
215
docs/snippets/elixir/metadata/metadata.exs
Normal file
215
docs/snippets/elixir/metadata/metadata.exs
Normal file
@@ -0,0 +1,215 @@
|
||||
# General metadata extraction and manipulation utilities
|
||||
defmodule MetadataUtils do
  @moduledoc """
  Helpers for reading, merging and summarising the string-keyed `"metadata"`
  maps attached to extraction results and to their chunks.

  All functions work on plain maps with string keys (`"metadata"`, `"text"`,
  `"chunks"`). A key that is missing or explicitly `nil` is treated as empty.
  """

  # Substrings whose presence flags a chunk as containing headings / list items.
  @heading_markers ~w[## # ===]
  @list_markers ["- ", "* "]

  @doc """
  Extract standard metadata fields from extraction result.

  Returns an atom-keyed map with `:title`, `:author`, `:creation_date`,
  `:modification_date`, `:language`, `:page_count` and `:format`. Fields whose
  value ends up `nil` are dropped from the returned map.
  """
  def extract_standard_metadata(result) do
    metadata = metadata_of(result)

    %{
      title: Map.get(metadata, "title", "Unknown"),
      author: Map.get(metadata, "author", "Unknown"),
      creation_date: Map.get(metadata, "creation_date"),
      modification_date: Map.get(metadata, "modification_date"),
      language: Map.get(metadata, "language"),
      page_count: Map.get(metadata, "page_count", 0),
      format: Map.get(metadata, "format", "unknown")
    }
    |> Enum.reject(fn {_field, value} -> is_nil(value) end)
    |> Map.new()
  end

  @doc """
  Merge metadata from multiple extraction results.

  Returns a map from each metadata key to the list of values seen for it, in
  the order of `results`. A key that appears once still maps to a
  one-element list.
  """
  def merge_metadata(results) when is_list(results) do
    results
    |> Enum.flat_map(fn result -> result |> metadata_of() |> Map.to_list() end)
    |> Enum.group_by(fn {key, _value} -> key end, fn {_key, value} -> value end)
  end

  @doc """
  Enrich chunks with additional metadata.

  Every non-nil entry of `source_metadata` is written into each chunk's
  `"metadata"` map, overriding a chunk-level value stored under the same key.
  """
  def enrich_chunks_metadata(chunks, source_metadata) do
    # The additions are the same for every chunk, so build them once.
    additions =
      source_metadata
      |> Enum.reject(fn {_key, value} -> is_nil(value) end)
      |> Map.new()

    Enum.map(chunks, fn chunk ->
      Map.put(chunk, "metadata", Map.merge(metadata_of(chunk), additions))
    end)
  end

  @doc """
  Extract metadata specific to a chunk's context (position, size, etc).

  Adds `"chunk_index"` (0-based), `"chunk_number"` (1-based),
  `"total_chunks"`, `"position_percentage"` (rounded to two decimals),
  `"text_length"`, `"word_count"`, `"has_headings"` and `"has_lists"` to each
  chunk's `"metadata"`, overriding existing values under those keys.
  """
  def extract_chunk_context(chunks) do
    total_chunks = length(chunks)

    chunks
    |> Enum.with_index()
    |> Enum.map(fn {chunk, index} ->
      text = text_of(chunk)

      context = %{
        "chunk_index" => index,
        "chunk_number" => index + 1,
        "total_chunks" => total_chunks,
        "position_percentage" => Float.round((index + 1) / total_chunks * 100, 2),
        "text_length" => String.length(text),
        "word_count" => text |> String.split() |> length(),
        "has_headings" => String.contains?(text, @heading_markers),
        "has_lists" => String.contains?(text, @list_markers)
      }

      Map.put(chunk, "metadata", Map.merge(metadata_of(chunk), context))
    end)
  end

  @doc """
  Filter metadata by a set of allowed keys.

  Returns `data` with its `"metadata"` map reduced to the keys listed in
  `allowed_keys`.
  """
  def filter_metadata_keys(data, allowed_keys) when is_list(allowed_keys) do
    Map.put(data, "metadata", Map.take(metadata_of(data), allowed_keys))
  end

  @doc """
  Create a metadata summary across all chunks.

  Returns `:total_chunks`, `:total_text_length`, `:avg_chunk_size` (integer
  division), `:metadata_fields` (distinct metadata keys) and
  `:enrichment_level` (`"high"`, `"medium"` or `"low"`).
  """
  def metadata_summary(chunks) do
    %{
      total_chunks: length(chunks),
      total_text_length: total_text_length(chunks),
      avg_chunk_size: calculate_avg_size(chunks),
      metadata_fields: extract_all_metadata_fields(chunks),
      enrichment_level: assess_enrichment(chunks)
    }
  end

  @doc """
  Generate a human-readable metadata report.

  A result without chunks (key missing or `nil`) is reported as zero chunks.
  """
  def generate_report(result) do
    standard = extract_standard_metadata(result)
    # `Map.get/3`'s default only covers a missing key; a decoded `null` still
    # comes back as nil and would crash `length/1` below.
    chunks = Map.get(result, "chunks") || []

    """
    === Document Metadata Report ===

    Standard Fields:
    #{format_dict(standard)}

    Chunk Statistics:
    - Total Chunks: #{length(chunks)}
    - Avg Chunk Size: #{calculate_avg_size(chunks)} characters

    Metadata Summary:
    #{format_dict(metadata_summary(chunks))}
    """
  end

  # Private helpers

  # The "metadata" map of a result or chunk; missing or nil becomes %{}.
  defp metadata_of(map), do: Map.get(map, "metadata") || %{}

  # The "text" of a chunk; missing or nil becomes "".
  defp text_of(chunk), do: Map.get(chunk, "text") || ""

  # Sum of the text lengths (in graphemes) of all chunks.
  defp total_text_length(chunks) do
    Enum.reduce(chunks, 0, fn chunk, acc -> String.length(text_of(chunk)) + acc end)
  end

  # Average chunk text length, truncated to an integer; 0 for no chunks.
  defp calculate_avg_size([]), do: 0
  defp calculate_avg_size(chunks), do: div(total_text_length(chunks), length(chunks))

  # Distinct metadata keys across all chunks, in first-seen order.
  defp extract_all_metadata_fields(chunks) do
    chunks
    |> Enum.flat_map(fn chunk -> chunk |> metadata_of() |> Map.keys() end)
    |> Enum.uniq()
  end

  # Grades chunks by their average (truncated) number of metadata fields:
  # 5 or more is "high", 2 or more is "medium", anything else is "low".
  defp assess_enrichment([]), do: "low"

  defp assess_enrichment(chunks) do
    total_fields =
      chunks
      |> Enum.map(fn chunk -> chunk |> metadata_of() |> map_size() end)
      |> Enum.sum()

    case div(total_fields, length(chunks)) do
      avg when avg >= 5 -> "high"
      avg when avg >= 2 -> "medium"
      _ -> "low"
    end
  end

  # Renders one "key: inspected value" line per map entry.
  defp format_dict(dict) when is_map(dict) do
    Enum.map_join(dict, "\n", fn {key, value} -> " #{key}: #{inspect(value)}" end)
  end
end
|
||||
|
||||
# Example usage
|
||||
config = %Kreuzberg.ExtractionConfig{
  chunking: %{"enabled" => true, "max_characters" => 1000}
}

{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)

# NOTE(review): MetadataUtils reads string keys ("metadata", "text", "chunks")
# while `result` is accessed with dot syntax here - confirm the result shape
# matches what the helpers expect.

# Chunking may yield nothing; normalise to a list once so every call below
# can rely on it.
chunks = result.chunks || []

IO.puts("=== Standard Metadata ===")
IO.inspect(MetadataUtils.extract_standard_metadata(result))

IO.puts("\n=== Chunk Context ===")

enhanced_chunks = MetadataUtils.extract_chunk_context(chunks)
# List.first/1 returns nil for an empty list, whereas hd/1 would raise.
IO.inspect(List.first(enhanced_chunks))

IO.puts("\n=== Metadata Summary ===")
IO.inspect(MetadataUtils.metadata_summary(chunks))

IO.puts("\n=== Report ===")
IO.puts(MetadataUtils.generate_report(result))
|
||||
154
docs/snippets/elixir/metadata/metadata.md
Normal file
154
docs/snippets/elixir/metadata/metadata.md
Normal file
@@ -0,0 +1,154 @@
|
||||
```elixir title="Elixir"
|
||||
config = Jason.encode!(%{})

# Prints "<label>: <value>" when `key` is present in `meta`; does nothing when
# the key is absent or `meta` is not a map.
print_field = fn meta, key, label ->
  case meta do
    %{^key => value} -> IO.puts("#{label}: #{value}")
    _ -> nil
  end
end

# Runs `fun` for every item stored under `key`, but only when that value is a
# non-empty list.
each_item = fn meta, key, fun ->
  case meta do
    %{^key => [_ | _] = items} -> Enum.each(items, fun)
    _ -> nil
  end
end

# PDF metadata lives under "metadata" -> "pdf" in the decoded result.
case Kreuzberg.extract_file_sync("document.pdf", nil, config) do
  {:ok, json} ->
    with %{"metadata" => %{"pdf" => pdf_meta}} <- Jason.decode!(json) do
      print_field.(pdf_meta, "page_count", "Pages")
      print_field.(pdf_meta, "author", "Author")
      print_field.(pdf_meta, "title", "Title")
    end

  {:error, reason} ->
    IO.puts("Error: #{reason}")
end

# HTML metadata lives under "metadata" -> "html" in the decoded result.
case Kreuzberg.extract_file_sync("page.html", nil, config) do
  {:ok, json} ->
    with %{"metadata" => %{"html" => html_meta}} <- Jason.decode!(json) do
      print_field.(html_meta, "title", "Title")
      print_field.(html_meta, "description", "Description")

      # Keywords are inspected rather than interpolated, so any term prints.
      with %{"keywords" => keywords} <- html_meta do
        IO.inspect(keywords, label: "Keywords")
      end

      print_field.(html_meta, "canonical_url", "Canonical URL")

      # Open Graph fields are read from a nested map.
      with %{"open_graph" => og} when is_map(og) <- html_meta do
        print_field.(og, "image", "Open Graph Image")
        print_field.(og, "title", "Open Graph Title")
      end

      # Twitter Card fields are read from a nested map.
      with %{"twitter_card" => tc} when is_map(tc) <- html_meta do
        print_field.(tc, "card", "Twitter Card Type")
      end

      print_field.(html_meta, "language", "Language")

      each_item.(html_meta, "headers", fn header ->
        IO.puts("Header (level #{header["level"]}): #{header["text"]}")
      end)

      each_item.(html_meta, "links", fn link ->
        IO.puts("Link: #{link["href"]} (#{link["text"]})")
      end)

      each_item.(html_meta, "images", fn image ->
        IO.puts("Image: #{image["src"]}")
      end)

      # Structured data is only counted, not printed.
      case html_meta do
        %{"structured_data" => sd} when is_list(sd) ->
          IO.puts("Structured data items: #{length(sd)}")

        _ ->
          nil
      end
    end

  {:error, reason} ->
    IO.puts("Error: #{reason}")
end
|
||||
```
|
||||
264
docs/snippets/elixir/metadata/page_boundaries.exs
Normal file
264
docs/snippets/elixir/metadata/page_boundaries.exs
Normal file
@@ -0,0 +1,264 @@
|
||||
# Page boundary tracking and analysis utilities
|
||||
defmodule PageBoundaryUtils do
  @moduledoc """
  Page boundary tracking and analysis over a list of chunks.

  Each chunk is a string-keyed map; its page is read from
  `chunk["metadata"]["page_number"]` and is `nil` when absent.
  """

  # Substrings used by the content-type heuristic in the table of contents.
  @heading_markers ~w[# ## ===]
  @list_markers ~w[- * •]

  # Table-of-contents entries are cut to this many characters.
  @toc_line_limit 100

  @doc """
  Track page boundaries throughout the document extraction.
  Assumes chunks contain page_number metadata.

  Returns a map from page number to the ascending list of chunk indices
  (0-based) on that page. Chunks without a page number are skipped.
  """
  def track_page_boundaries(chunks) when is_list(chunks) do
    chunks
    |> Enum.with_index()
    |> Enum.filter(fn {chunk, _index} -> extract_page_number(chunk) end)
    |> Enum.group_by(
      fn {chunk, _index} -> extract_page_number(chunk) end,
      fn {_chunk, index} -> index end
    )
  end

  @doc """
  Get all chunks that belong to a specific page.
  """
  def get_chunks_by_page(chunks, page_number) when is_list(chunks) do
    Enum.filter(chunks, fn chunk -> extract_page_number(chunk) == page_number end)
  end

  @doc """
  Identify page boundaries and create page-level segments.

  Consecutive chunks with the same page number form one segment
  `%{page: page, content: text, chunks: chunks}`, where `content` is the
  chunk texts joined by a single space. Segments keep document order, so a
  page that reappears later produces a second segment.
  """
  def identify_page_segments(chunks) when is_list(chunks) do
    chunks
    |> Enum.chunk_by(&extract_page_number/1)
    |> Enum.map(fn [first | _] = run ->
      %{
        page: extract_page_number(first),
        content: Enum.map_join(run, " ", &Map.get(&1, "text", "")),
        chunks: run
      }
    end)
  end

  @doc """
  Analyze transitions between pages (e.g., page breaks).

  Returns `:total_pages`, the sorted distinct `:page_numbers`, and `:gaps` as
  `{before, after}` tuples wherever consecutive page numbers differ by more
  than one. Chunks without a page number are ignored.
  """
  def analyze_page_transitions(chunks) do
    pages =
      chunks
      |> Enum.map(&extract_page_number/1)
      # A nil page would sort after every number and then crash the
      # subtraction in find_page_gaps/1.
      |> Enum.reject(&is_nil/1)
      |> Enum.uniq()
      |> Enum.sort()

    %{
      total_pages: length(pages),
      page_numbers: pages,
      gaps: find_page_gaps(pages)
    }
  end

  @doc """
  Get statistics for each page (chunk count, text length, etc).

  Returns one map per page, sorted by page number, with `:page_number`,
  `:chunk_count`, `:total_text_length` and `:avg_chunk_size` (integer
  division). Chunks without a page number are grouped under `nil`.
  """
  def page_statistics(chunks) when is_list(chunks) do
    chunks
    |> Enum.group_by(&extract_page_number/1)
    |> Enum.map(fn {page, page_chunks} ->
      # Groups produced by group_by are never empty, so the division is safe.
      chunk_count = length(page_chunks)

      total_length =
        page_chunks
        |> Enum.map(&String.length(Map.get(&1, "text", "")))
        |> Enum.sum()

      %{
        page_number: page,
        chunk_count: chunk_count,
        total_text_length: total_length,
        avg_chunk_size: div(total_length, chunk_count)
      }
    end)
    |> Enum.sort_by(& &1.page_number)
  end

  @doc """
  Extract content from a page range.

  Keeps the chunks whose page number lies in `start_page..end_page`
  (inclusive); chunks without a page number are dropped.
  """
  def extract_page_range(chunks, start_page, end_page) when is_list(chunks) do
    Enum.filter(chunks, fn chunk ->
      page = extract_page_number(chunk)
      page && page >= start_page && page <= end_page
    end)
  end

  @doc """
  Add page boundary markers to chunks for processing.

  Each chunk's `"metadata"` gains `"page_number"`, `"is_first_on_page"`,
  `"is_last_on_page"` and `"position_on_page"` (1-based). Positions are
  computed per segment from `identify_page_segments/1`, i.e. per run of
  consecutive chunks sharing a page number, so identical chunks and pages
  that reappear later in the document are each marked by their own position.
  """
  def add_page_markers(chunks) when is_list(chunks) do
    chunks
    |> identify_page_segments()
    |> Enum.flat_map(fn %{page: page, chunks: run} ->
      last_position = length(run)

      run
      |> Enum.with_index(1)
      |> Enum.map(fn {chunk, position} ->
        markers = %{
          "page_number" => page,
          "is_first_on_page" => position == 1,
          "is_last_on_page" => position == last_position,
          "position_on_page" => position
        }

        Map.put(chunk, "metadata", Map.merge(Map.get(chunk, "metadata", %{}), markers))
      end)
    end)
  end

  @doc """
  Generate a page index for quick access.

  Same result as `track_page_boundaries/1`: page number to chunk indices.
  """
  def generate_page_index(chunks) when is_list(chunks) do
    track_page_boundaries(chunks)
  end

  @doc """
  Create a table of contents based on page boundaries and content structure.

  Returns one `%{page: page, content: first_line, content_type: type}` entry
  per chunk marked as first on its page. `first_line` is the trimmed first
  line of the chunk text, cut to 100 characters plus `"..."` when longer.
  """
  def create_page_toc(chunks) do
    chunks
    |> add_page_markers()
    |> Enum.filter(&get_in(&1, ["metadata", "is_first_on_page"]))
    |> Enum.map(fn chunk ->
      text = Map.get(chunk, "text", "")
      page = chunk |> Map.get("metadata", %{}) |> Map.get("page_number", "unknown")

      %{
        page: page,
        content: toc_line(text),
        content_type: detect_content_type(text)
      }
    end)
  end

  # Private helpers

  # Page number recorded in the chunk's metadata, or nil.
  defp extract_page_number(chunk) do
    chunk
    |> Map.get("metadata", %{})
    |> Map.get("page_number")
  end

  # Pairs of neighbouring page numbers (from a sorted list) that skip pages.
  defp find_page_gaps(pages) do
    pages
    |> Enum.chunk_every(2, 1, :discard)
    |> Enum.filter(fn [a, b] -> b - a > 1 end)
    |> Enum.map(fn [a, b] -> {a, b} end)
  end

  # Trimmed first line of `text`, truncated for use as a TOC entry.
  defp toc_line(text) do
    line = text |> String.split("\n") |> hd() |> String.trim()

    if String.length(line) > @toc_line_limit do
      String.slice(line, 0, @toc_line_limit) <> "..."
    else
      line
    end
  end

  # Rough classification by marker substrings. These match anywhere in the
  # text, so e.g. any hyphen classifies a chunk as "list".
  defp detect_content_type(text) do
    cond do
      String.contains?(text, @heading_markers) -> "heading"
      String.contains?(text, @list_markers) -> "list"
      String.length(text) < 100 -> "snippet"
      true -> "body"
    end
  end
end
|
||||
|
||||
# Example usage
|
||||
config = %Kreuzberg.ExtractionConfig{
  chunking: %{"enabled" => true, "max_characters" => 1000}
}

{:ok, result} = Kreuzberg.extract_file("multipage_doc.pdf", nil, config)
# Chunking may yield nothing; fall back to an empty list.
chunks = result.chunks || []

IO.puts("=== Page Boundaries ===")
IO.inspect(PageBoundaryUtils.track_page_boundaries(chunks))

IO.puts("\n=== Page Statistics ===")
IO.inspect(PageBoundaryUtils.page_statistics(chunks))

IO.puts("\n=== Page Transitions Analysis ===")
IO.inspect(PageBoundaryUtils.analyze_page_transitions(chunks))

IO.puts("\n=== Chunks from Page 2-4 ===")
IO.inspect(chunks |> PageBoundaryUtils.extract_page_range(2, 4) |> length())

IO.puts("\n=== Page Markers Added ===")

marked_chunks = PageBoundaryUtils.add_page_markers(chunks)
# List.first/1 returns nil for an empty list, whereas hd/1 would raise.
IO.inspect(List.first(marked_chunks))

IO.puts("\n=== Page Index ===")
IO.inspect(PageBoundaryUtils.generate_page_index(chunks))

IO.puts("\n=== Table of Contents ===")
IO.inspect(PageBoundaryUtils.create_page_toc(chunks))
|
||||
35
docs/snippets/elixir/metadata/page_boundaries.md
Normal file
35
docs/snippets/elixir/metadata/page_boundaries.md
Normal file
@@ -0,0 +1,35 @@
|
||||
```elixir title="Elixir"
|
||||
config = Jason.encode!(%{})

case Kreuzberg.extract_file_sync("document.pdf", nil, config) do
  {:ok, result} ->
    case Jason.decode!(result) do
      %{"metadata" => %{"pages" => %{"boundaries" => boundaries}}, "content" => content}
      when is_list(boundaries) and is_binary(content) ->
        content_size = byte_size(content)

        # Show the first three pages only.
        boundaries
        |> Enum.take(3)
        |> Enum.each(fn boundary ->
          byte_start = boundary["byte_start"]
          byte_end = boundary["byte_end"]
          page_number = boundary["page_number"]

          # The boundaries are byte offsets, so slice the binary by bytes.
          # String.slice/3 counts graphemes and would drift on non-ASCII text.
          # Offsets are clamped to the content size so a stale boundary yields
          # a shorter (or empty) page instead of raising.
          start = min(byte_start, content_size)
          len = max(min(byte_end, content_size) - start, 0)
          page_text = binary_part(content, start, len)

          # String.slice/3 already stops at the end of a shorter string.
          preview = String.slice(page_text, 0, 100)

          IO.puts("Page #{page_number}:")
          IO.puts(" Byte range: #{byte_start}-#{byte_end}")
          IO.puts(" Preview: #{preview}...")
        end)

      _ ->
        nil
    end

  {:error, reason} ->
    IO.puts("Error: #{reason}")
end
|
||||
```
|
||||
71
docs/snippets/elixir/metadata/page_tracking_basic.exs
Normal file
71
docs/snippets/elixir/metadata/page_tracking_basic.exs
Normal file
@@ -0,0 +1,71 @@
|
||||
```elixir title="Elixir"
|
||||
# Walk through an extraction result and print whatever page-related
# information it exposes: PDF-level metadata, chunks, tables and images.

alias Kreuzberg.ExtractionConfig

# Plain extraction with caching enabled.
config = %ExtractionConfig{use_cache: true}

{:ok, result} = Kreuzberg.extract_file("multi_page_document.pdf", nil, config)

metadata = result.metadata || %{}

# PDF-specific properties, when present, sit under the "pdf" key.
pdf_meta = metadata["pdf"]

if is_map(pdf_meta) do
  IO.puts("Total pages in document: #{pdf_meta["page_count"]}")
  IO.puts("Document title: #{pdf_meta["title"]}")
  IO.puts("Document author: #{pdf_meta["author"]}")
else
  IO.puts("No PDF metadata available")
end

# Chunks: print the size and a short preview of each one. No page number is
# derived here; only the chunk's own text is inspected.
chunks = result.chunks || []
IO.puts("\nTotal chunks extracted: #{length(chunks)}")

chunks
|> Enum.with_index(1)
|> Enum.each(fn {chunk, index} ->
  text = Map.get(chunk, "text", "")
  IO.puts("Chunk #{index}: #{byte_size(text)} bytes")
  IO.puts(" Content preview: #{String.slice(text, 0..80)}...")
end)

# Tables: print the row count and the page from the table's metadata, if any.
tables = result.tables || []
IO.puts("\nTotal tables found: #{length(tables)}")

tables
|> Enum.with_index(1)
|> Enum.each(fn {table, idx} ->
  cells = table["cells"] || []
  IO.puts("Table #{idx}: #{length(cells)} rows")

  table_meta = table["metadata"]
  page = if is_map(table_meta), do: table_meta["page"] || "Unknown", else: "Unknown"
  IO.puts(" Page: #{page}")
end)

# Images: print format and size, plus the page when the image carries one.
images = result.images || []
IO.puts("\nTotal images found: #{length(images)}")

images
|> Enum.with_index(1)
|> Enum.each(fn {image, idx} ->
  IO.puts("Image #{idx}:")
  IO.puts(" Format: #{image["format"]}")
  IO.puts(" Size: #{image["size"]} bytes")

  if page = image["page"] do
    IO.puts(" Page: #{page}")
  end
end)
|
||||
```
|
||||
31
docs/snippets/elixir/metadata/page_tracking_basic.md
Normal file
31
docs/snippets/elixir/metadata/page_tracking_basic.md
Normal file
@@ -0,0 +1,31 @@
|
||||
```elixir title="Elixir"
|
||||
# Ask Kreuzberg to return per-page content alongside the document content.
config = Jason.encode!(%{"pages" => %{"extract_pages" => true}})

case Kreuzberg.extract_file_sync("document.pdf", nil, config) do
  {:ok, result} ->
    case Jason.decode!(result) do
      %{"pages" => pages} when is_list(pages) ->
        Enum.each(pages, fn page ->
          page_number = page["page_number"]
          # Default every per-page field, so a page without text is reported
          # as 0 chars instead of crashing String.length/1 on nil.
          content = page["content"] || ""
          tables = page["tables"] || []
          images = page["images"] || []

          IO.puts("Page #{page_number}:")
          IO.puts(" Content: #{String.length(content)} chars")
          IO.puts(" Tables: #{length(tables)}")
          IO.puts(" Images: #{length(images)}")
        end)

      _ ->
        nil
    end

  {:error, reason} ->
    IO.puts("Error: #{reason}")
end
|
||||
```
|
||||
108
docs/snippets/elixir/metadata/pdf_metadata_extractor.exs
Normal file
108
docs/snippets/elixir/metadata/pdf_metadata_extractor.exs
Normal file
@@ -0,0 +1,108 @@
|
||||
```elixir title="Elixir"
|
||||
# Extract and process comprehensive PDF metadata
|
||||
# Useful for document indexing, cataloging, and metadata-driven workflows
|
||||
|
||||
alias Kreuzberg.ExtractionConfig
|
||||
|
||||
defmodule PDFMetadataExtractor do
  @moduledoc """
  Extract and process PDF metadata from documents.
  Provides structured access to PDF properties and document information.
  """

  @doc """
  Extract PDF metadata from a file.

  Returns `{:ok, metadata}` with a map of normalized metadata fields, or
  `{:error, reason}` when extraction fails or the result carries no PDF
  metadata.
  """
  def extract_metadata(file_path) do
    config = %ExtractionConfig{
      use_cache: true
    }

    # An `{:error, reason}` from the extraction is returned unchanged.
    with {:ok, result} <- Kreuzberg.extract_file(file_path, nil, config) do
      process_metadata(result.metadata || %{})
    end
  end

  # Process and normalize PDF metadata.
  #
  # Reads the map under "pdf" and returns it as an atom-keyed map in an :ok
  # tuple. `page_count` defaults to 0 and the two boolean flags to false;
  # every other missing field stays nil.
  # (A plain comment on purpose: @doc on a private function is discarded by
  # the compiler with a warning.)
  defp process_metadata(metadata) do
    case metadata["pdf"] do
      pdf_meta when is_map(pdf_meta) ->
        {:ok,
         %{
           page_count: pdf_meta["page_count"] || 0,
           title: pdf_meta["title"],
           author: pdf_meta["author"],
           subject: pdf_meta["subject"],
           keywords: pdf_meta["keywords"],
           creator: pdf_meta["creator"],
           producer: pdf_meta["producer"],
           creation_date: pdf_meta["creation_date"],
           modification_date: pdf_meta["modification_date"],
           version: pdf_meta["version"],
           is_encrypted: pdf_meta["is_encrypted"] || false,
           is_tagged: pdf_meta["is_tagged"] || false
         }}

      _ ->
        {:error, "No PDF metadata found"}
    end
  end

  @doc """
  Format metadata for display.

  Expects the atom-keyed map returned by `extract_metadata/1`. Missing text
  fields are shown as `N/A`.
  """
  def format_metadata(metadata) when is_map(metadata) do
    # inspect/1 turns nil into the truthy string "nil", so the fallback has
    # to be decided before inspecting.
    keywords =
      case metadata[:keywords] do
        nil -> "N/A"
        value -> inspect(value)
      end

    """
    === PDF Metadata ===
    Title: #{metadata[:title] || "N/A"}
    Author: #{metadata[:author] || "N/A"}
    Subject: #{metadata[:subject] || "N/A"}
    Keywords: #{keywords}
    Creator: #{metadata[:creator] || "N/A"}
    Producer: #{metadata[:producer] || "N/A"}

    === Document Properties ===
    Pages: #{metadata[:page_count]}
    Version: #{metadata[:version] || "N/A"}
    Encrypted: #{metadata[:is_encrypted]}
    Tagged (Accessible): #{metadata[:is_tagged]}

    === Dates ===
    Created: #{metadata[:creation_date] || "N/A"}
    Modified: #{metadata[:modification_date] || "N/A"}
    """
  end
end
|
||||
|
||||
# Usage example
|
||||
file_path = "document.pdf"

case PDFMetadataExtractor.extract_metadata(file_path) do
  {:error, reason} ->
    IO.puts("Error extracting metadata: #{reason}")

  {:ok, metadata} ->
    metadata
    |> PDFMetadataExtractor.format_metadata()
    |> IO.puts()

    # Follow-up notes driven by the extracted metadata; each is printed only
    # when its condition holds.
    notes = [
      {metadata[:page_count] > 100, "Note: Document is large (#{metadata[:page_count]} pages)"},
      {metadata[:is_encrypted], "Note: Document is password-protected"},
      {metadata[:is_tagged], "Note: Document is accessible with tags"}
    ]

    for {applies, note} <- notes, applies do
      IO.puts(note)
    end
end
|
||||
```
|
||||
29
docs/snippets/elixir/metadata/tables.md
Normal file
29
docs/snippets/elixir/metadata/tables.md
Normal file
@@ -0,0 +1,29 @@
|
||||
```elixir title="Elixir"
|
||||
config = Jason.encode!(%{})

# Prints one table: its row count, its markdown rendering, then each row.
print_table = fn table ->
  cells = table["cells"] || []
  markdown = table["markdown"] || ""

  IO.puts("Table with #{length(cells)} rows")
  IO.puts("#{markdown}")

  Enum.each(cells, &IO.inspect/1)
end

case Kreuzberg.extract_file_sync("document.pdf", nil, config) do
  {:error, reason} ->
    IO.puts("Error: #{reason}")

  {:ok, result} ->
    # Tables are only printed when the decoded result holds them as a list.
    case Jason.decode!(result) do
      %{"tables" => tables} when is_list(tables) -> Enum.each(tables, print_table)
      _ -> nil
    end
end
|
||||
```
|
||||
66
docs/snippets/elixir/metadata/vector_database_integration.md
Normal file
66
docs/snippets/elixir/metadata/vector_database_integration.md
Normal file
@@ -0,0 +1,66 @@
|
||||
```elixir title="Elixir"
|
||||
defmodule VectorRecord do
  @moduledoc """
  One entry destined for a vector store: an `id`, the chunk `content`, its
  `embedding`, and a `metadata` map. All fields default to `nil`.
  """

  defstruct id: nil, content: nil, embedding: nil, metadata: nil
end
|
||||
|
||||
defmodule VectorIntegration do
  @moduledoc """
  Turns a document into `VectorRecord`s by extracting it with chunking and
  embeddings enabled.
  """

  @doc """
  Extracts `document_path` and builds one `VectorRecord` per embedded chunk.

  Returns `{:ok, records}`; chunks that lack a list `"embedding"` or a
  `"content"` key are skipped. Returns `{:error, reason}` when extraction
  fails, or `{:error, "No chunks in extraction result"}` when the decoded
  result has no chunk list.
  """
  def extract_and_vectorize(document_path, document_id) do
    case Kreuzberg.extract_file_sync(document_path, nil, extraction_config()) do
      {:ok, json} -> json |> Jason.decode!() |> build_records(document_id)
      {:error, reason} -> {:error, reason}
    end
  end

  # JSON config: 512-character chunks with 50 overlap, embedded with the
  # "balanced" preset, normalized, in batches of 32.
  defp extraction_config do
    embedding = %{
      "model" => %{"preset" => %{"name" => "balanced"}},
      "normalize" => true,
      "batch_size" => 32
    }

    Jason.encode!(%{
      "chunking" => %{
        "max_characters" => 512,
        "overlap" => 50,
        "embedding" => embedding
      }
    })
  end

  # Builds the records from a decoded result. The index in each record is the
  # chunk's position in the full chunk list, so skipped chunks leave gaps.
  defp build_records(%{"chunks" => chunks}, document_id) when is_list(chunks) do
    records =
      for {%{"embedding" => embedding, "content" => content}, index} <-
            Enum.with_index(chunks),
          is_list(embedding) do
        %VectorRecord{
          id: "#{document_id}_chunk_#{index}",
          content: content,
          embedding: embedding,
          metadata: %{
            "document_id" => document_id,
            "chunk_index" => Integer.to_string(index),
            "content_length" => Integer.to_string(String.length(content))
          }
        }
      end

    {:ok, records}
  end

  defp build_records(_decoded, _document_id) do
    {:error, "No chunks in extraction result"}
  end
end
|
||||
```
|
||||
Reference in New Issue
Block a user