Files
fil/docs/snippets/elixir/cache/disk_cache.exs
Henrik Jess Nielsen b4c07d3693
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s
Nomad changes
2026-06-01 23:40:55 +02:00

436 lines
11 KiB
Elixir

```elixir title="Elixir"
# Disk Caching - Implement persistent disk caching for extraction results
# Demonstrates advanced caching strategies for document extraction
defmodule KreuzbergDiskCache do
@moduledoc """
Disk-based caching layer for Kreuzberg extraction results.
Provides persistent caching of extraction results with features like:
- TTL-based cache expiration
- Compression for large results
- Cache statistics and management
- Multi-tiered caching (memory + disk)
"""
require Logger
defmodule CacheEntry do
  @moduledoc """
  Represents a cached extraction result together with its bookkeeping data.

  Timestamps are wall-clock Unix seconds (`System.system_time/1`). Entries are
  serialized to disk and may be read back by a different VM instance;
  monotonic time is only meaningful within a single VM run, so it cannot be
  used to decide expiry of persisted entries.
  """

  defstruct [
    :key,
    :result,
    :created_at,
    :accessed_at,
    :ttl_seconds,
    :size_bytes,
    :compressed
  ]

  @doc """
  Create a new cache entry.

  `created_at` and `accessed_at` are both set to the current time,
  `size_bytes` is derived from `result`, and `compressed` starts as `false`.
  `ttl_seconds` defaults to 86400 (one day).
  """
  def new(key, result, ttl_seconds \\ 86400) do
    now = now_seconds()

    %__MODULE__{
      key: key,
      result: result,
      created_at: now,
      accessed_at: now,
      ttl_seconds: ttl_seconds,
      size_bytes: calculate_size(result),
      compressed: false
    }
  end

  @doc """
  Check if entry has expired.

  Returns `true` once more than `ttl_seconds` have elapsed since `created_at`.
  """
  def expired?(%__MODULE__{created_at: created_at, ttl_seconds: ttl}) do
    now_seconds() - created_at > ttl
  end

  @doc """
  Update access time.

  Returns a copy of the entry whose `accessed_at` is the current time.
  """
  def touch(%__MODULE__{} = entry) do
    %{entry | accessed_at: now_seconds()}
  end

  # Wall-clock seconds since the Unix epoch; comparable across VM restarts.
  defp now_seconds, do: System.system_time(:second)

  # Size is the byte size of the `content` field when it is a binary.
  # Anything else (no `content` key, `nil`, non-binary) counts as 0 instead of
  # raising from `byte_size/1`.
  defp calculate_size(%{content: content}) when is_binary(content), do: byte_size(content)
  defp calculate_size(_result), do: 0
end
defmodule Cache do
  @moduledoc """
  Main disk cache implementation.

  The cache is a plain immutable struct with two tiers: `memory_cache`, a map
  of key => `CacheEntry`, and one `<key>.cache` file per entry under
  `cache_dir`. Functions that modify the memory tier (`put/3`, `delete/2`,
  `clear/1`) return an updated struct; callers must keep the returned value.
  """

  defstruct [
    :cache_dir,
    :max_size_bytes,
    :ttl_seconds,
    :compression_enabled,
    :memory_cache
  ]

  @doc """
  Initialize disk cache.

  Creates `cache_dir` (including missing parents) and returns a cache with an
  empty memory tier. Raises if the directory cannot be created.

  ## Options

    * `:max_size_bytes` - size limit that triggers cleanup (default `1_000_000_000`)
    * `:ttl_seconds` - lifetime given to new entries (default `604_800`)
    * `:compression_enabled` - zlib-compress entries on disk (default `true`)
  """
  def new(cache_dir, opts \\ []) do
    File.mkdir_p!(cache_dir)

    %__MODULE__{
      cache_dir: cache_dir,
      max_size_bytes: Keyword.get(opts, :max_size_bytes, 1_000_000_000),
      ttl_seconds: Keyword.get(opts, :ttl_seconds, 604_800),
      compression_enabled: Keyword.get(opts, :compression_enabled, true),
      memory_cache: %{}
    }
  end

  @doc """
  Get cached result by key.

  Looks in the memory tier first and falls back to disk when the key is not
  in memory. Returns `{:hit, result}` or `:miss`. An expired entry counts as
  a miss; an expired disk entry is also removed from disk.
  """
  def get(cache, key) do
    case Map.get(cache.memory_cache, key) do
      %CacheEntry{} = entry ->
        if CacheEntry.expired?(entry) do
          Logger.debug("Cache hit (memory) - expired: #{key}")
          :miss
        else
          Logger.debug("Cache hit (memory): #{key}")
          # The access time is not refreshed here: this function does not
          # return the cache struct, so an updated entry could not be kept.
          {:hit, entry.result}
        end

      nil ->
        get_from_disk(cache, key)
    end
  end

  @doc """
  Store result in cache.

  Writes the entry to disk (raising if the write fails), adds it to the
  memory tier, and evicts least-recently-used entries when the memory tier
  exceeds `max_size_bytes`. Returns the updated cache.
  """
  def put(cache, key, result) do
    entry = CacheEntry.new(key, result, cache.ttl_seconds)
    store_on_disk(cache, key, entry)

    # Keep the struct returned by cleanup; data is immutable, so dropping it
    # would silently discard the evictions from the memory tier.
    cache =
      maybe_cleanup(%{cache | memory_cache: Map.put(cache.memory_cache, key, entry)})

    Logger.info("Cache stored: #{key}")
    cache
  end

  @doc """
  Delete cache entry.

  Removes the key from the memory tier and deletes its file on disk if one
  exists. Returns the updated cache.
  """
  def delete(cache, key) do
    # Remove unconditionally instead of checking existence first: a missing
    # file just yields `{:error, :enoent}`, and there is no check-then-act race.
    _ = File.rm(cache_path(cache, key))
    Logger.info("Cache deleted: #{key}")
    %{cache | memory_cache: Map.delete(cache.memory_cache, key)}
  end

  @doc """
  Clear all cache entries.

  Recursively deletes `cache_dir`, recreates it empty, and returns the cache
  with an empty memory tier.
  """
  def clear(cache) do
    File.rm_rf!(cache.cache_dir)
    File.mkdir_p!(cache.cache_dir)
    Logger.info("Cache cleared")
    %{cache | memory_cache: %{}}
  end

  @doc """
  Get cache statistics.

  `total_entries`, `memory_entries`, `total_size_bytes` and `usage_percent`
  describe the memory tier of this struct only; `disk_entries` is the number
  of files currently in `cache_dir`. `usage_percent` is `0.0` when
  `max_size_bytes` is not a positive number.
  """
  def stats(cache) do
    total_size = calculate_total_size(cache)
    entry_count = map_size(cache.memory_cache)

    disk_entries =
      case File.ls(cache.cache_dir) do
        {:ok, files} -> length(files)
        {:error, _} -> 0
      end

    %{
      total_entries: entry_count,
      memory_entries: entry_count,
      disk_entries: disk_entries,
      total_size_bytes: total_size,
      max_size_bytes: cache.max_size_bytes,
      usage_percent: usage_percent(total_size, cache.max_size_bytes),
      compression_enabled: cache.compression_enabled
    }
  end

  # Private helpers

  # Percentage of the size limit in use, rounded to two decimals. Guards
  # against a zero (or unset) limit, which would raise ArithmeticError.
  defp usage_percent(total_size, max_size) when is_number(max_size) and max_size > 0 do
    Float.round(total_size / max_size * 100, 2)
  end

  defp usage_percent(_total_size, _max_size), do: 0.0

  # Reads the entry file for `key`. A missing file is an ordinary miss; any
  # other read failure is logged and also reported as a miss.
  defp get_from_disk(cache, key) do
    cache_file = cache_path(cache, key)

    case File.read(cache_file) do
      {:ok, data} ->
        load_entry(data, cache, cache_file, key)

      {:error, :enoent} ->
        :miss

      {:error, reason} ->
        Logger.warning("Failed to read cache file: #{inspect(reason)}")
        :miss
    end
  end

  # Decodes file contents and applies the expiry check.
  defp load_entry(data, cache, cache_file, key) do
    case deserialize(data, cache.compression_enabled) do
      {:ok, entry} ->
        if CacheEntry.expired?(entry) do
          File.rm(cache_file)
          Logger.debug("Cache hit (disk) - expired: #{key}")
          :miss
        else
          Logger.debug("Cache hit (disk): #{key}")
          {:hit, entry.result}
        end

      {:error, reason} ->
        Logger.warning("Failed to deserialize cache: #{inspect(reason)}")
        :miss
    end
  end

  # Writes the serialized entry; raises if the file cannot be written.
  defp store_on_disk(cache, key, entry) do
    File.write!(cache_path(cache, key), serialize(entry, cache.compression_enabled))
  end

  defp cache_path(cache, key) do
    Path.join(cache.cache_dir, "#{key}.cache")
  end

  defp serialize(entry, compression_enabled) do
    data = :erlang.term_to_binary(entry)
    if compression_enabled, do: :zlib.compress(data), else: data
  end

  # Returns `{:ok, %CacheEntry{}}` or `{:error, reason}`. Decoding failures
  # (corrupt data, or a compression setting different from the one used when
  # the file was written) are deliberately converted into an error tuple so a
  # bad file behaves as a cache miss instead of crashing the caller.
  defp deserialize(data, compression_enabled) do
    uncompressed = if compression_enabled, do: :zlib.uncompress(data), else: data

    # NOTE(review): binary_to_term/1 trusts the file contents. If the cache
    # directory can be written by untrusted parties, consider
    # :erlang.binary_to_term(bin, [:safe]) - but confirm first that every atom
    # used by cached results already exists when the cache is read.
    case :erlang.binary_to_term(uncompressed) do
      %CacheEntry{} = entry -> {:ok, entry}
      _other -> {:error, :not_a_cache_entry}
    end
  rescue
    e -> {:error, e}
  end

  # Sum of `size_bytes` over the memory tier.
  defp calculate_total_size(cache) do
    Enum.reduce(cache.memory_cache, 0, fn {_key, entry}, acc -> acc + entry.size_bytes end)
  end

  # Runs LRU eviction only when the memory tier is over the limit.
  defp maybe_cleanup(cache) do
    total_size = calculate_total_size(cache)

    if total_size > cache.max_size_bytes do
      Logger.info("Cache size (#{total_size}) exceeds limit, starting cleanup")
      cleanup_lru(cache)
    else
      cache
    end
  end

  # Deletes entries from least to most recently accessed until the tracked
  # size is at most half of `max_size_bytes`, and returns the resulting cache.
  defp cleanup_lru(cache) do
    target_size = div(cache.max_size_bytes, 2)

    cache.memory_cache
    |> Enum.sort_by(fn {_key, entry} -> entry.accessed_at end)
    |> Enum.reduce_while({cache, calculate_total_size(cache)}, fn {key, entry}, {acc, size} ->
      if size <= target_size do
        {:halt, {acc, size}}
      else
        {:cont, {delete(acc, key), size - entry.size_bytes}}
      end
    end)
    |> elem(0)
  end
end
@doc """
Initialize cache and extract with caching.

Opens the cache at `cache_dir`, derives a key from `file_path` and `opts`,
and returns the cached result when one is found. Otherwise the file is
extracted with `Kreuzberg.extract_file/1` and a successful result is stored.

Returns `{:ok, result, cache}` on success, or `{error, cache}` where `error`
is whatever the failed extraction returned.
"""
def extract_with_cache(file_path, cache_dir, opts \\ []) do
  cache = Cache.new(cache_dir, opts)
  key = compute_cache_key(file_path, opts)

  cache
  |> Cache.get(key)
  |> resolve_lookup(cache, key, file_path)
end

# Cache hit: hand back the stored result with the cache unchanged.
defp resolve_lookup({:hit, result}, cache, _key, _file_path), do: {:ok, result, cache}

# Cache miss: run the extraction and store the result only when it succeeds.
defp resolve_lookup(:miss, cache, key, file_path) do
  Logger.info("Cache miss: #{file_path}")

  case Kreuzberg.extract_file(file_path) do
    {:ok, result} -> {:ok, result, Cache.put(cache, key, result)}
    failure -> {failure, cache}
  end
end
@doc """
Extract multiple files with batch caching.

Runs `extract_with_cache/3` for every path and returns `{results, stats}`.
`results` holds one `{:ok, path, result}` or `{:error, path, reason}` per
input path, in input order. `stats` is `Cache.stats/1` for a cache that
accumulates the memory-tier entries returned by the individual extractions.
"""
def batch_extract_with_cache(file_paths, cache_dir, opts \\ []) do
  # Thread one cache struct through the batch. Structs are immutable, so the
  # cache created up front never sees entries stored by the per-file calls
  # unless their returned caches are merged back in.
  {results, cache} =
    Enum.map_reduce(file_paths, Cache.new(cache_dir, opts), fn path, acc ->
      case extract_with_cache(path, cache_dir, opts) do
        {:ok, result, updated} ->
          merged = Map.merge(acc.memory_cache, updated.memory_cache)
          {{:ok, path, result}, %{acc | memory_cache: merged}}

        {{:error, reason}, _cache} ->
          {{:error, path, reason}, acc}
      end
    end)

  {results, Cache.stats(cache)}
end
@doc """
Manage cache - get stats, clear, etc.

Opens the cache at `cache_dir` and performs `action`:

  * `:stats` - returns `Cache.stats/1`
  * `:clear` - removes every entry and returns the emptied cache
  * `:list` - returns the file names in `cache_dir`, or `{:error, reason}`
  * `{:delete, key}` - removes one entry and returns the updated cache

Any other action returns `{:error, message}`.
"""
def manage_cache(cache_dir, action, opts \\ []) do
  cache = Cache.new(cache_dir, opts)

  case action do
    :stats ->
      Cache.stats(cache)

    :clear ->
      Cache.clear(cache)

    :list ->
      case File.ls(cache_dir) do
        {:ok, files} -> files
        {:error, reason} -> {:error, reason}
      end

    {:delete, key} ->
      Cache.delete(cache, key)

    other ->
      # inspect/1 works for any term; plain interpolation raises
      # Protocol.UndefinedError for tuples, maps and similar terms.
      {:error, "Unknown action: #{inspect(other)}"}
  end
end
# Private helpers

# Builds the cache key: the lowercase hex SHA-256 digest of the file path and
# the inspected options, joined by "|".
defp compute_cache_key(file_path, opts) do
  digest = :crypto.hash(:sha256, "#{file_path}|#{inspect(opts)}")
  Base.encode16(digest, case: :lower)
end
end
# Usage examples
IO.puts("=== Kreuzberg Disk Cache ===\n")
cache_dir = "/tmp/kreuzberg_cache"

# Example 1: Single file extraction with caching
IO.puts("Example 1: Single file extraction with caching")
IO.puts("-" <> String.duplicate("-", 40) <> "\n")

case KreuzbergDiskCache.extract_with_cache("document.pdf", cache_dir) do
  {:ok, result, cache} ->
    IO.puts("Extraction successful!")
    IO.puts("Content size: #{byte_size(result.content)} bytes")

    # Report on the cache returned by the extraction. A cache freshly opened
    # on the same directory starts with an empty memory tier, so its entry
    # and size figures would always be zero.
    stats = KreuzbergDiskCache.Cache.stats(cache)
    IO.puts("\nCache Statistics:")
    IO.puts(" Entries: #{stats.total_entries}")
    IO.puts(" Size: #{stats.total_size_bytes} bytes")
    IO.puts(" Usage: #{stats.usage_percent}%\n")

  {error, _cache} ->
    IO.puts("Extraction failed: #{inspect(error)}\n")
end
# Example 2: Batch extraction with cache statistics
# Extracts three documents in one call, then prints how many succeeded and
# the cache statistics returned alongside the results.
IO.puts("Example 2: Batch extraction with caching")
IO.puts("-" <> String.duplicate("-", 40) <> "\n")

documents = ["doc1.pdf", "doc2.pdf", "doc3.pdf"]
{results, stats} = KreuzbergDiskCache.batch_extract_with_cache(documents, cache_dir)

successful =
  Enum.count(results, fn
    {:ok, _, _} -> true
    _ -> false
  end)

Enum.each(
  [
    "Batch results:",
    " Processed: #{length(documents)}",
    " Successful: #{successful}",
    "\nCache Statistics:",
    " Total entries: #{stats.total_entries}",
    " Memory entries: #{stats.memory_entries}",
    " Disk entries: #{stats.disk_entries}",
    " Total size: #{stats.total_size_bytes} bytes",
    " Usage: #{stats.usage_percent}%\n"
  ],
  &IO.puts/1
)
# Example 3: Cache management
# Lists the files in the cache directory, then prints the cache statistics.
IO.puts("Example 3: Cache management")
IO.puts("-" <> String.duplicate("-", 40) <> "\n")

# The :list action returns either a list of file names or {:error, reason};
# an error tuple is not enumerable, so it is handled before iterating.
cached_files =
  case KreuzbergDiskCache.manage_cache(cache_dir, :list) do
    {:error, reason} ->
      IO.puts("Could not list cached files: #{inspect(reason)}")
      []

    files ->
      files
  end

IO.puts("Cached files:")
Enum.each(cached_files, fn file -> IO.puts(" - #{file}") end)
IO.puts("\nCache stats:")
stats = KreuzbergDiskCache.manage_cache(cache_dir, :stats)
IO.inspect(stats, pretty: true)
```