Files
fil/docs/snippets/elixir/utils/language_detection.exs
Henrik Jess Nielsen b4c07d3693
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s
Nomad changes
2026-06-01 23:40:55 +02:00

188 lines
5.0 KiB
Elixir

# Language detection and processing utilities
defmodule LanguageDetectionUtils do
  @moduledoc """
  Heuristic, stop-word based language detection for text and text chunks.

  A language is scored by counting how many words of the input appear in that
  language's list of very common words. This is a coarse classifier intended
  for routing chunks, not a precise linguistic identification.

  Chunk functions read the `"text"` and `"metadata"` string keys of each chunk
  map and write the detection result back into `"metadata"`.
  """

  @typedoc ~s(A chunk map; the `"text"` and `"metadata"` string keys are read.)
  @type chunk :: map()

  # Only `:patterns` and `:min_match` are consulted by `detect_language/1`;
  # `:common_words` is kept as descriptive data and does not affect scoring.
  @language_indicators %{
    "en" => %{
      patterns: ~w[the and to of a in is that],
      common_words: ~w[english language detection],
      min_match: 2
    },
    "es" => %{
      patterns: ~w[de la el y en que],
      common_words: ~w[español detección idioma],
      min_match: 2
    },
    "fr" => %{
      patterns: ~w[le la de et une est],
      common_words: ~w[français détection langue],
      min_match: 2
    },
    "de" => %{
      patterns: ~w[der die und das in ein],
      common_words: ~w[deutsch erkennung sprache],
      min_match: 2
    }
  }

  @unknown_language "unknown"

  # A best score at or below this fraction is reported as "unknown".
  @min_detection_confidence 0.3

  @doc """
  Detects the language of `text` by counting stop-word matches.

  Returns a `{language_code, confidence}` tuple where `confidence` is a
  percentage (`0.0..100.0`): the number of matched words divided by the
  language's `min_match`, capped at 100%.

  Because the confidence saturates after `min_match` hits, several languages
  can tie; ties are broken in favour of the language with more matched words.

  Anything that is not a non-empty binary, or that matches no language well
  enough, yields `{"unknown", 0.0}`.

  ## Examples

      iex> LanguageDetectionUtils.detect_language("the cat is in the house")
      {"en", 100.0}

      iex> LanguageDetectionUtils.detect_language("")
      {"unknown", 0.0}
  """
  @spec detect_language(term()) :: {String.t(), float()}
  def detect_language(text) when is_binary(text) and byte_size(text) > 0 do
    words = tokenize(text)

    {lang, confidence, _matched} =
      @language_indicators
      |> Enum.map(fn {lang, indicators} ->
        matched = count_pattern_matches(words, indicators.patterns)
        {lang, min(matched / indicators.min_match, 1.0), matched}
      end)
      # Compare on {confidence, matched} so a saturated tie (both 1.0) is decided
      # by the raw hit count instead of by map iteration order.
      |> Enum.max_by(fn {_lang, confidence, matched} -> {confidence, matched} end)

    if confidence > @min_detection_confidence do
      {lang, confidence * 100.0}
    else
      {@unknown_language, 0.0}
    end
  end

  def detect_language(_), do: {@unknown_language, 0.0}

  @doc """
  Detects the language of every chunk and records it in the chunk's metadata.

  For each chunk the `"text"` value is classified with `detect_language/1` and
  the result is stored under `"metadata"` as `"detected_language"` and
  `"language_confidence"` (a percentage). Existing metadata keys are kept; a
  missing or `nil` `"metadata"` is treated as an empty map.
  """
  @spec detect_chunk_languages([chunk()]) :: [chunk()]
  def detect_chunk_languages(chunks) when is_list(chunks) do
    Enum.map(chunks, fn chunk ->
      {lang, confidence} = chunk |> Map.get("text", "") |> detect_language()

      # `|| %{}` also covers a "metadata" key that is present but nil.
      metadata =
        (Map.get(chunk, "metadata") || %{})
        |> Map.put("detected_language", lang)
        |> Map.put("language_confidence", confidence)

      Map.put(chunk, "metadata", metadata)
    end)
  end

  @doc """
  Groups chunks by their detected language.

  Returns a map from language code (including `"unknown"`) to the list of
  annotated chunks, as produced by `detect_chunk_languages/1`.
  """
  @spec group_chunks_by_language([chunk()]) :: %{optional(String.t()) => [chunk()]}
  def group_chunks_by_language(chunks) do
    chunks
    |> detect_chunk_languages()
    |> Enum.group_by(fn chunk ->
      chunk
      |> Map.get("metadata", %{})
      |> Map.get("detected_language", @unknown_language)
    end)
  end

  @doc """
  Keeps only the chunks detected as `target_language`.

  `min_confidence` is a fraction in `0.0..1.0`; the stored percentage is
  divided by 100 before the comparison. Returned chunks carry the detection
  metadata added by `detect_chunk_languages/1`.
  """
  @spec filter_by_language([chunk()], String.t(), number()) :: [chunk()]
  def filter_by_language(chunks, target_language, min_confidence \\ 0.7) do
    chunks
    |> detect_chunk_languages()
    |> Enum.filter(fn chunk ->
      metadata = Map.get(chunk, "metadata", %{})
      detected = Map.get(metadata, "detected_language", @unknown_language)
      confidence = Map.get(metadata, "language_confidence", 0.0) / 100

      detected == target_language and confidence >= min_confidence
    end)
  end

  @doc """
  Summarizes the language distribution across `chunks`.

  Returns one map per detected language with `:language`, `:chunk_count` and
  `:avg_confidence` (mean percentage rounded to two decimals), sorted by
  `:chunk_count` in descending order. Returns `[]` for an empty list.
  """
  @spec language_summary([chunk()]) :: [
          %{language: String.t(), chunk_count: pos_integer(), avg_confidence: float()}
        ]
  def language_summary(chunks) do
    chunks
    |> group_chunks_by_language()
    |> Enum.map(fn {lang, group} ->
      # `Enum.group_by/2` never produces an empty group, so the division is safe.
      chunk_count = length(group)

      total_confidence =
        group
        |> Enum.map(&get_in(&1, ["metadata", "language_confidence"]))
        |> Enum.sum()

      %{
        language: lang,
        chunk_count: chunk_count,
        avg_confidence: Float.round(total_confidence / chunk_count, 2)
      }
    end)
    |> Enum.sort_by(& &1.chunk_count, :desc)
  end

  @doc """
  Returns `true` when one language accounts for at least `threshold` of the chunks.

  The ratio is the chunk count of the most frequent language divided by the
  total number of chunks. `"unknown"` is counted like any other language
  bucket. An empty chunk list returns `false`.

  The `is_` prefix is kept for compatibility with existing callers.
  """
  @spec is_single_language?([chunk()], number()) :: boolean()
  def is_single_language?(chunks, threshold \\ 0.8) do
    case language_summary(chunks) do
      [top | _] = summary ->
        total = summary |> Enum.map(& &1.chunk_count) |> Enum.sum()
        top.chunk_count / total >= threshold

      [] ->
        false
    end
  end

  # Private helpers

  # Lowercases `text` and splits it into words. The `u` modifier makes `\W`
  # Unicode-aware so accented letters stay inside their word ("niña" is one
  # token, not "ni" + "a"). Unicode regexes raise on invalid UTF-8, so such
  # binaries fall back to the byte-wise pattern.
  defp tokenize(text) do
    lowered = String.downcase(text)

    if String.valid?(text) do
      String.split(lowered, ~r/\W+/u, trim: true)
    else
      String.split(lowered, ~r/\W+/, trim: true)
    end
  end

  # Counts how many entries of `words` (duplicates included) occur in `patterns`.
  defp count_pattern_matches(words, patterns) do
    Enum.count(words, &(&1 in patterns))
  end
end
# Example usage
#
# Extracts a document with chunking enabled, then prints the detection result
# for the whole content followed by per-chunk summaries.
extraction_config = %Kreuzberg.ExtractionConfig{
  chunking: %{"enabled" => true, "max_characters" => 1000}
}

{:ok, extraction} = Kreuzberg.extract_file("multilingual_doc.pdf", nil, extraction_config)

# Fall back to empty values when the result carries no content or chunks.
full_text = extraction.content || ""
chunks = extraction.chunks || []

IO.puts("=== Language Detection ===")
{lang, confidence} = LanguageDetectionUtils.detect_language(full_text)
IO.puts("Detected Language: #{lang}")
IO.puts("Confidence: #{Float.round(confidence, 2)}%")

IO.puts("\n=== Language Summary ===")

chunks
|> LanguageDetectionUtils.language_summary()
|> IO.inspect()

IO.puts("\n=== Group by Language ===")

chunks
|> LanguageDetectionUtils.group_chunks_by_language()
|> Enum.each(fn {language, members} ->
  IO.puts("Language: #{language} - Chunks: #{length(members)}")
end)

IO.puts("\n=== Filter English Chunks (min 80% confidence) ===")

chunks
|> LanguageDetectionUtils.filter_by_language("en", 0.8)
|> IO.inspect()

IO.puts("\n=== Check if Single Language ===")

chunks
|> LanguageDetectionUtils.is_single_language?(0.8)
|> IO.puts()