265 lines
6.7 KiB
Elixir
265 lines
6.7 KiB
Elixir
# Page boundary tracking and analysis utilities
|
|
defmodule PageBoundaryUtils do
|
|
@doc """
|
|
Track page boundaries throughout the document extraction.
|
|
Assumes chunks contain page_number metadata.
|
|
"""
|
|
def track_page_boundaries(chunks) when is_list(chunks) do
|
|
chunks
|
|
|> Enum.with_index()
|
|
|> Enum.reduce(%{}, fn {chunk, index}, acc ->
|
|
page = extract_page_number(chunk)
|
|
|
|
if page do
|
|
acc
|
|
|> Map.update(page, [index], &(&1 ++ [index]))
|
|
else
|
|
acc
|
|
end
|
|
end)
|
|
end
|
|
|
|
@doc """
|
|
Get all chunks that belong to a specific page.
|
|
"""
|
|
def get_chunks_by_page(chunks, page_number) when is_list(chunks) do
|
|
chunks
|
|
|> Enum.filter(fn chunk ->
|
|
extract_page_number(chunk) == page_number
|
|
end)
|
|
end
|
|
|
|
@doc """
|
|
Identify page boundaries and create page-level segments.
|
|
"""
|
|
def identify_page_segments(chunks) when is_list(chunks) do
|
|
chunks
|
|
|> Enum.reduce([], fn chunk, segments ->
|
|
page = extract_page_number(chunk)
|
|
text = Map.get(chunk, "text", "")
|
|
|
|
case segments do
|
|
[] ->
|
|
[%{page: page, content: text, chunks: [chunk]}]
|
|
|
|
[current | rest] ->
|
|
if current.page == page do
|
|
# Same page, append to current segment
|
|
updated = %{
|
|
current
|
|
| content: current.content <> " " <> text,
|
|
chunks: current.chunks ++ [chunk]
|
|
}
|
|
|
|
[updated | rest]
|
|
else
|
|
# New page, create new segment
|
|
[%{page: page, content: text, chunks: [chunk]}, current | rest]
|
|
end
|
|
end
|
|
end)
|
|
|> Enum.reverse()
|
|
end
|
|
|
|
@doc """
|
|
Analyze transitions between pages (e.g., page breaks).
|
|
"""
|
|
def analyze_page_transitions(chunks) do
|
|
chunks
|
|
|> Enum.map(&extract_page_number/1)
|
|
|> Enum.uniq()
|
|
|> Enum.sort()
|
|
|> then(fn pages ->
|
|
%{
|
|
total_pages: length(pages),
|
|
page_numbers: pages,
|
|
gaps: find_page_gaps(pages)
|
|
}
|
|
end)
|
|
end
|
|
|
|
@doc """
|
|
Get statistics for each page (chunk count, text length, etc).
|
|
"""
|
|
def page_statistics(chunks) when is_list(chunks) do
|
|
chunks
|
|
|> Enum.group_by(&extract_page_number/1)
|
|
|> Enum.map(fn {page, page_chunks} ->
|
|
total_length =
|
|
page_chunks
|
|
|> Enum.map(&String.length(Map.get(&1, "text", "")))
|
|
|> Enum.sum()
|
|
|
|
%{
|
|
page_number: page,
|
|
chunk_count: length(page_chunks),
|
|
total_text_length: total_length,
|
|
avg_chunk_size: if(length(page_chunks) > 0, do: div(total_length, length(page_chunks)), else: 0)
|
|
}
|
|
end)
|
|
|> Enum.sort_by(&Map.get(&1, :page_number))
|
|
end
|
|
|
|
@doc """
|
|
Extract content from a page range.
|
|
"""
|
|
def extract_page_range(chunks, start_page, end_page) when is_list(chunks) do
|
|
chunks
|
|
|> Enum.filter(fn chunk ->
|
|
page = extract_page_number(chunk)
|
|
page && page >= start_page && page <= end_page
|
|
end)
|
|
end
|
|
|
|
@doc """
|
|
Add page boundary markers to chunks for processing.
|
|
"""
|
|
def add_page_markers(chunks) when is_list(chunks) do
|
|
page_segments = identify_page_segments(chunks)
|
|
|
|
segments_map =
|
|
page_segments
|
|
|> Enum.reduce(%{}, fn segment, acc ->
|
|
Map.put(acc, segment.page, segment)
|
|
end)
|
|
|
|
chunks
|
|
|> Enum.map(fn chunk ->
|
|
page = extract_page_number(chunk)
|
|
metadata = Map.get(chunk, "metadata", %{})
|
|
|
|
updated_metadata =
|
|
case segments_map[page] do
|
|
nil ->
|
|
metadata
|
|
|
|
segment ->
|
|
metadata
|
|
|> Map.put("page_number", page)
|
|
|> Map.put("is_first_on_page", hd(segment.chunks) == chunk)
|
|
|> Map.put("is_last_on_page", List.last(segment.chunks) == chunk)
|
|
|> Map.put("position_on_page", find_chunk_position(segment.chunks, chunk))
|
|
end
|
|
|
|
Map.put(chunk, "metadata", updated_metadata)
|
|
end)
|
|
end
|
|
|
|
@doc """
|
|
Generate a page index for quick access.
|
|
"""
|
|
def generate_page_index(chunks) when is_list(chunks) do
|
|
chunks
|
|
|> Enum.with_index()
|
|
|> Enum.reduce(%{}, fn {chunk, index}, acc ->
|
|
page = extract_page_number(chunk)
|
|
|
|
if page do
|
|
Map.update(acc, page, [index], &(&1 ++ [index]))
|
|
else
|
|
acc
|
|
end
|
|
end)
|
|
end
|
|
|
|
@doc """
|
|
Create a table of contents based on page boundaries and content structure.
|
|
"""
|
|
def create_page_toc(chunks) do
|
|
chunks
|
|
|> add_page_markers()
|
|
|> Enum.filter(fn chunk ->
|
|
metadata = Map.get(chunk, "metadata", %{})
|
|
Map.get(metadata, "is_first_on_page", false)
|
|
end)
|
|
|> Enum.map(fn chunk ->
|
|
text = Map.get(chunk, "text", "")
|
|
metadata = Map.get(chunk, "metadata", %{})
|
|
page = Map.get(metadata, "page_number", "unknown")
|
|
|
|
# Extract first line or heading as TOC entry
|
|
first_line =
|
|
text
|
|
|> String.split("\n")
|
|
|> hd()
|
|
|> String.trim()
|
|
|> then(fn line ->
|
|
if String.length(line) > 100 do
|
|
String.slice(line, 0, 100) <> "..."
|
|
else
|
|
line
|
|
end
|
|
end)
|
|
|
|
%{
|
|
page: page,
|
|
content: first_line,
|
|
content_type: detect_content_type(text)
|
|
}
|
|
end)
|
|
end
|
|
|
|
# Private helpers
|
|
defp extract_page_number(chunk) do
|
|
chunk
|
|
|> Map.get("metadata", %{})
|
|
|> Map.get("page_number")
|
|
end
|
|
|
|
defp find_page_gaps(pages) do
|
|
pages
|
|
|> Enum.chunk_every(2, 1, :discard)
|
|
|> Enum.filter(fn [a, b] -> b - a > 1 end)
|
|
|> Enum.map(fn [a, b] -> {a, b} end)
|
|
end
|
|
|
|
defp find_chunk_position(chunks, target_chunk) do
|
|
chunks
|
|
|> Enum.find_index(&(&1 == target_chunk))
|
|
|> then(fn
|
|
nil -> nil
|
|
index -> index + 1
|
|
end)
|
|
end
|
|
|
|
defp detect_content_type(text) do
|
|
cond do
|
|
String.contains?(text, ~w[# ## ===]) -> "heading"
|
|
String.contains?(text, ~w[- * •]) -> "list"
|
|
String.length(text) < 100 -> "snippet"
|
|
true -> "body"
|
|
end
|
|
end
|
|
end
|
|
|
|
# Example usage
|
|
config = %Kreuzberg.ExtractionConfig{
|
|
chunking: %{"enabled" => true, "max_characters" => 1000}
|
|
}
|
|
|
|
{:ok, result} = Kreuzberg.extract_file("multipage_doc.pdf", nil, config)
|
|
chunks = result.chunks || []
|
|
|
|
IO.puts("=== Page Boundaries ===")
|
|
IO.inspect(PageBoundaryUtils.track_page_boundaries(chunks))
|
|
|
|
IO.puts("\n=== Page Statistics ===")
|
|
IO.inspect(PageBoundaryUtils.page_statistics(chunks))
|
|
|
|
IO.puts("\n=== Page Transitions Analysis ===")
|
|
IO.inspect(PageBoundaryUtils.analyze_page_transitions(chunks))
|
|
|
|
IO.puts("\n=== Chunks from Page 2-4 ===")
|
|
IO.inspect(PageBoundaryUtils.extract_page_range(chunks, 2, 4) |> length())
|
|
|
|
IO.puts("\n=== Page Markers Added ===")
|
|
|
|
marked_chunks = PageBoundaryUtils.add_page_markers(chunks)
|
|
IO.inspect(hd(marked_chunks))
|
|
|
|
IO.puts("\n=== Page Index ===")
|
|
IO.inspect(PageBoundaryUtils.generate_page_index(chunks))
|
|
|
|
IO.puts("\n=== Table of Contents ===")
|
|
IO.inspect(PageBoundaryUtils.create_page_toc(chunks))
|