This commit is contained in:
71
docs/snippets/elixir/metadata/page_tracking_basic.exs
Normal file
71
docs/snippets/elixir/metadata/page_tracking_basic.exs
Normal file
@@ -0,0 +1,71 @@
|
||||
```elixir title="Elixir"
|
||||
# Page-tracking example: extract a multi-page PDF so that the reporting code
# further down can relate metadata, chunks, tables and images to source pages.

alias Kreuzberg.ExtractionConfig

# Only result caching is configured here.
# NOTE(review): no chunking or page-specific option is set in this config, so
# `result.chunks` may come back nil/empty - confirm against the Kreuzberg docs.
config = %ExtractionConfig{use_cache: true}

# The match raises if the extraction does not return an {:ok, _} tuple.
{:ok, result} = Kreuzberg.extract_file("multi_page_document.pdf", nil, config)
|
||||
|
||||
# Fall back to an empty map when the result carries no metadata
metadata = result.metadata || %{}

# PDF details are read from the "pdf" entry, but only when that entry is a map
pdf_meta = metadata["pdf"]

if is_map(pdf_meta) do
  IO.puts("Total pages in document: #{pdf_meta["page_count"]}")
  IO.puts("Document title: #{pdf_meta["title"]}")
  IO.puts("Document author: #{pdf_meta["author"]}")
else
  IO.puts("No PDF metadata available")
end
|
||||
|
||||
# Chunks are optional on the result; treat a missing list as empty
chunks = result.chunks || []
IO.puts("\nTotal chunks extracted: #{length(chunks)}")

# Report every chunk with its 1-based position, byte size and a short preview.
# No page number is derived here; only the chunk's position and text are shown.
chunks
|> Enum.with_index(1)
|> Enum.each(fn {chunk, index} ->
  text = Map.get(chunk, "text", "")
  size = byte_size(text)

  IO.puts("Chunk #{index}: #{size} bytes")
  IO.puts(" Content preview: #{String.slice(text, 0..80)}...")
end)
|
||||
|
||||
# Tables are optional on the result; treat a missing list as empty
tables = result.tables || []
IO.puts("\nTotal tables found: #{length(tables)}")

tables
|> Enum.with_index(1)
|> Enum.each(fn {table, idx} ->
  cells = table["cells"] || []
  IO.puts("Table #{idx}: #{length(cells)} rows")

  # The source page is taken from the table's "metadata" map when present;
  # anything else (no map, or no "page" entry) is reported as "Unknown".
  page =
    case table["metadata"] do
      %{} = meta -> meta["page"] || "Unknown"
      _ -> "Unknown"
    end

  IO.puts(" Page: #{page}")
end)
|
||||
|
||||
# Images are optional on the result; treat a missing list as empty
images = result.images || []
IO.puts("\nTotal images found: #{length(images)}")

images
|> Enum.with_index(1)
|> Enum.each(fn {image, idx} ->
  IO.puts("Image #{idx}:")
  IO.puts(" Format: #{image["format"]}")
  IO.puts(" Size: #{image["size"]} bytes")

  # The page line is printed only when the image carries a truthy "page" value
  page = image["page"]
  if page, do: IO.puts(" Page: #{page}")
end)
|
||||
```
|
||||
Reference in New Issue
Block a user