Files
fil/docs/snippets/elixir/metadata/pdf_metadata_extractor.exs
Henrik Jess Nielsen b4c07d3693
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s
Nomad changes
2026-06-01 23:40:55 +02:00

109 lines
2.9 KiB
Elixir

```elixir title="Elixir"
# Extract and process comprehensive PDF metadata
# Useful for document indexing, cataloging, and metadata-driven workflows
alias Kreuzberg.ExtractionConfig
defmodule PDFMetadataExtractor do
  @moduledoc """
  Extract and process PDF metadata from documents.

  Provides structured access to PDF properties and document information.
  """

  @doc """
  Extract PDF metadata from a file.

  Calls `Kreuzberg.extract_file/3` on `file_path` with caching enabled and
  normalizes the `"pdf"` entry of the result's metadata into an atom-keyed map.

  Returns:

    * `{:ok, metadata}` - the normalized metadata map
    * `{:error, "No PDF metadata found"}` - the result has no `"pdf"` map
    * `{:error, reason}` - the extraction error, passed through unchanged
  """
  @spec extract_metadata(String.t()) :: {:ok, map()} | {:error, term()}
  def extract_metadata(file_path) do
    config = %ExtractionConfig{use_cache: true}

    case Kreuzberg.extract_file(file_path, nil, config) do
      {:ok, result} ->
        # A nil metadata field is treated like an empty map so the lookup
        # below reports "No PDF metadata found" instead of crashing.
        process_metadata(result.metadata || %{})

      {:error, reason} ->
        {:error, reason}
    end
  end

  # Process and normalize PDF metadata.
  #
  # Kept as a plain comment: `@doc` on a private function is discarded by the
  # compiler with a warning.
  defp process_metadata(metadata) do
    case metadata["pdf"] do
      pdf_meta when is_map(pdf_meta) ->
        {:ok,
         %{
           # Default to 0 so numeric comparisons on the page count never see nil.
           page_count: pdf_meta["page_count"] || 0,
           title: pdf_meta["title"],
           author: pdf_meta["author"],
           subject: pdf_meta["subject"],
           keywords: pdf_meta["keywords"],
           creator: pdf_meta["creator"],
           producer: pdf_meta["producer"],
           creation_date: pdf_meta["creation_date"],
           modification_date: pdf_meta["modification_date"],
           version: pdf_meta["version"],
           # Missing flags are reported as false rather than nil.
           is_encrypted: pdf_meta["is_encrypted"] || false,
           is_tagged: pdf_meta["is_tagged"] || false
         }}

      _ ->
        {:error, "No PDF metadata found"}
    end
  end

  @doc """
  Format metadata for display.

  Takes the atom-keyed map produced by `extract_metadata/1` and returns a
  multi-line string. Text fields that are `nil` (or `false`) are rendered as
  `"N/A"`; keywords that are `nil` are rendered as `"N/A"`, any other value is
  rendered with `inspect/1`.
  """
  @spec format_metadata(map()) :: String.t()
  def format_metadata(metadata) when is_map(metadata) do
    """
    === PDF Metadata ===
    Title: #{metadata[:title] || "N/A"}
    Author: #{metadata[:author] || "N/A"}
    Subject: #{metadata[:subject] || "N/A"}
    Keywords: #{format_keywords(metadata[:keywords])}
    Creator: #{metadata[:creator] || "N/A"}
    Producer: #{metadata[:producer] || "N/A"}
    === Document Properties ===
    Pages: #{metadata[:page_count]}
    Version: #{metadata[:version] || "N/A"}
    Encrypted: #{metadata[:is_encrypted]}
    Tagged (Accessible): #{metadata[:is_tagged]}
    === Dates ===
    Created: #{metadata[:creation_date] || "N/A"}
    Modified: #{metadata[:modification_date] || "N/A"}
    """
  end

  # Renders the keywords field. The nil check must happen before inspect/1:
  # inspect(nil) returns the truthy string "nil", so `inspect(x) || "N/A"`
  # can never fall back.
  defp format_keywords(nil), do: "N/A"
  defp format_keywords(keywords), do: inspect(keywords)
end
# Usage example: extract metadata from a PDF, print it, and emit a few
# notes driven by the extracted properties.
file_path = "document.pdf"

case PDFMetadataExtractor.extract_metadata(file_path) do
  {:ok, metadata} ->
    IO.puts(PDFMetadataExtractor.format_metadata(metadata))

    # Perform metadata-driven operations
    if metadata[:page_count] > 100 do
      IO.puts("Note: Document is large (#{metadata[:page_count]} pages)")
    end

    if metadata[:is_encrypted] do
      IO.puts("Note: Document is password-protected")
    end

    if metadata[:is_tagged] do
      IO.puts("Note: Document is accessible with tags")
    end

  {:error, reason} ->
    # The error reason is forwarded from the extractor and is not guaranteed
    # to be a string; interpolating a term without a String.Chars
    # implementation (tuple, map, ...) would raise. Strings are printed
    # verbatim, everything else is rendered with inspect/1.
    message = if is_binary(reason), do: reason, else: inspect(reason)
    IO.puts("Error extracting metadata: #{message}")
end
```