Files
fil/docs/snippets/elixir/metadata/page_boundaries.exs

265 lines
6.7 KiB
Elixir
Raw Permalink Normal View History

2026-06-01 23:40:55 +02:00
# Page boundary tracking and analysis utilities
defmodule PageBoundaryUtils do
@doc """
Track page boundaries throughout the document extraction.
Assumes chunks contain page_number metadata.
"""
def track_page_boundaries(chunks) when is_list(chunks) do
chunks
|> Enum.with_index()
|> Enum.reduce(%{}, fn {chunk, index}, acc ->
page = extract_page_number(chunk)
if page do
acc
|> Map.update(page, [index], &(&1 ++ [index]))
else
acc
end
end)
end
@doc """
Get all chunks that belong to a specific page.
"""
def get_chunks_by_page(chunks, page_number) when is_list(chunks) do
chunks
|> Enum.filter(fn chunk ->
extract_page_number(chunk) == page_number
end)
end
@doc """
Identify page boundaries and create page-level segments.
"""
def identify_page_segments(chunks) when is_list(chunks) do
chunks
|> Enum.reduce([], fn chunk, segments ->
page = extract_page_number(chunk)
text = Map.get(chunk, "text", "")
case segments do
[] ->
[%{page: page, content: text, chunks: [chunk]}]
[current | rest] ->
if current.page == page do
# Same page, append to current segment
updated = %{
current
| content: current.content <> " " <> text,
chunks: current.chunks ++ [chunk]
}
[updated | rest]
else
# New page, create new segment
[%{page: page, content: text, chunks: [chunk]}, current | rest]
end
end
end)
|> Enum.reverse()
end
@doc """
Analyze transitions between pages (e.g., page breaks).
"""
def analyze_page_transitions(chunks) do
chunks
|> Enum.map(&extract_page_number/1)
|> Enum.uniq()
|> Enum.sort()
|> then(fn pages ->
%{
total_pages: length(pages),
page_numbers: pages,
gaps: find_page_gaps(pages)
}
end)
end
@doc """
Get statistics for each page (chunk count, text length, etc).
"""
def page_statistics(chunks) when is_list(chunks) do
chunks
|> Enum.group_by(&extract_page_number/1)
|> Enum.map(fn {page, page_chunks} ->
total_length =
page_chunks
|> Enum.map(&String.length(Map.get(&1, "text", "")))
|> Enum.sum()
%{
page_number: page,
chunk_count: length(page_chunks),
total_text_length: total_length,
avg_chunk_size: if(length(page_chunks) > 0, do: div(total_length, length(page_chunks)), else: 0)
}
end)
|> Enum.sort_by(&Map.get(&1, :page_number))
end
@doc """
Extract content from a page range.
"""
def extract_page_range(chunks, start_page, end_page) when is_list(chunks) do
chunks
|> Enum.filter(fn chunk ->
page = extract_page_number(chunk)
page && page >= start_page && page <= end_page
end)
end
@doc """
Add page boundary markers to chunks for processing.
"""
def add_page_markers(chunks) when is_list(chunks) do
page_segments = identify_page_segments(chunks)
segments_map =
page_segments
|> Enum.reduce(%{}, fn segment, acc ->
Map.put(acc, segment.page, segment)
end)
chunks
|> Enum.map(fn chunk ->
page = extract_page_number(chunk)
metadata = Map.get(chunk, "metadata", %{})
updated_metadata =
case segments_map[page] do
nil ->
metadata
segment ->
metadata
|> Map.put("page_number", page)
|> Map.put("is_first_on_page", hd(segment.chunks) == chunk)
|> Map.put("is_last_on_page", List.last(segment.chunks) == chunk)
|> Map.put("position_on_page", find_chunk_position(segment.chunks, chunk))
end
Map.put(chunk, "metadata", updated_metadata)
end)
end
@doc """
Generate a page index for quick access.
"""
def generate_page_index(chunks) when is_list(chunks) do
chunks
|> Enum.with_index()
|> Enum.reduce(%{}, fn {chunk, index}, acc ->
page = extract_page_number(chunk)
if page do
Map.update(acc, page, [index], &(&1 ++ [index]))
else
acc
end
end)
end
@doc """
Create a table of contents based on page boundaries and content structure.
"""
def create_page_toc(chunks) do
chunks
|> add_page_markers()
|> Enum.filter(fn chunk ->
metadata = Map.get(chunk, "metadata", %{})
Map.get(metadata, "is_first_on_page", false)
end)
|> Enum.map(fn chunk ->
text = Map.get(chunk, "text", "")
metadata = Map.get(chunk, "metadata", %{})
page = Map.get(metadata, "page_number", "unknown")
# Extract first line or heading as TOC entry
first_line =
text
|> String.split("\n")
|> hd()
|> String.trim()
|> then(fn line ->
if String.length(line) > 100 do
String.slice(line, 0, 100) <> "..."
else
line
end
end)
%{
page: page,
content: first_line,
content_type: detect_content_type(text)
}
end)
end
# Private helpers
defp extract_page_number(chunk) do
chunk
|> Map.get("metadata", %{})
|> Map.get("page_number")
end
defp find_page_gaps(pages) do
pages
|> Enum.chunk_every(2, 1, :discard)
|> Enum.filter(fn [a, b] -> b - a > 1 end)
|> Enum.map(fn [a, b] -> {a, b} end)
end
defp find_chunk_position(chunks, target_chunk) do
chunks
|> Enum.find_index(&(&1 == target_chunk))
|> then(fn
nil -> nil
index -> index + 1
end)
end
defp detect_content_type(text) do
cond do
String.contains?(text, ~w[# ## ===]) -> "heading"
String.contains?(text, ~w[- * •]) -> "list"
String.length(text) < 100 -> "snippet"
true -> "body"
end
end
end
# Example usage
config = %Kreuzberg.ExtractionConfig{
chunking: %{"enabled" => true, "max_characters" => 1000}
}
{:ok, result} = Kreuzberg.extract_file("multipage_doc.pdf", nil, config)
chunks = result.chunks || []
IO.puts("=== Page Boundaries ===")
IO.inspect(PageBoundaryUtils.track_page_boundaries(chunks))
IO.puts("\n=== Page Statistics ===")
IO.inspect(PageBoundaryUtils.page_statistics(chunks))
IO.puts("\n=== Page Transitions Analysis ===")
IO.inspect(PageBoundaryUtils.analyze_page_transitions(chunks))
IO.puts("\n=== Chunks from Page 2-4 ===")
IO.inspect(PageBoundaryUtils.extract_page_range(chunks, 2, 4) |> length())
IO.puts("\n=== Page Markers Added ===")
marked_chunks = PageBoundaryUtils.add_page_markers(chunks)
IO.inspect(hd(marked_chunks))
IO.puts("\n=== Page Index ===")
IO.inspect(PageBoundaryUtils.generate_page_index(chunks))
IO.puts("\n=== Table of Contents ===")
IO.inspect(PageBoundaryUtils.create_page_toc(chunks))