# Language detection and processing utilities

defmodule LanguageDetectionUtils do
  @moduledoc """
  Heuristic language detection for extracted text and lists of text chunks.

  Detection is a plain word-list lookup: the input is lower-cased, split into
  words, and every word is checked against a short list of very common words
  for each supported language (`"de"`, `"en"`, `"es"`, `"fr"`). It is a cheap
  heuristic, not a statistical model.

  Confidence values produced by this module are percentages in the range
  `0.0..100.0`.

  Chunks are maps with string keys: the text is read from `"text"` and the
  detection result is written into the map stored under `"metadata"`.
  """

  @typedoc ~s(A language code such as "en", or "unknown" when nothing matched.)
  @type language :: String.t()

  @typedoc "Detection confidence as a percentage between 0.0 and 100.0."
  @type confidence :: float()

  @typedoc "A chunk map with string keys; `\"text\"` and `\"metadata\"` are the keys used here."
  @type chunk :: map()

  @typedoc "One entry of the list returned by `language_summary/1`."
  @type summary_entry :: %{
          language: language(),
          chunk_count: pos_integer(),
          avg_confidence: float()
        }

  # A detection is reported only when the capped match ratio exceeds this value.
  @min_match_ratio 0.3

  # Per-language indicator data:
  #   * `patterns`     - common words counted by `detect_language/1`
  #   * `common_words` - kept as data only; no function in this module reads it
  #   * `min_match`    - number of pattern hits that maps to full confidence
  @language_indicators %{
    "en" => %{
      patterns: ~w[the and to of a in is that],
      common_words: ~w[english language detection],
      min_match: 2
    },
    "es" => %{
      patterns: ~w[de la el y en que],
      common_words: ~w[español detección idioma],
      min_match: 2
    },
    "fr" => %{
      patterns: ~w[le la de et une est],
      common_words: ~w[français détection langue],
      min_match: 2
    },
    "de" => %{
      patterns: ~w[der die und das in ein],
      common_words: ~w[deutsch erkennung sprache],
      min_match: 2
    }
  }

  @doc """
  Detects the language of `text` by counting common-word matches.

  Returns a `{language_code, confidence}` tuple where `confidence` is a
  percentage (`0.0..100.0`). Languages are ranked by their *uncapped* ratio of
  matched words to `min_match`, so a language with more hits always beats one
  with fewer; exact ties are broken by language code to keep the result
  deterministic. The reported confidence is that ratio capped at 100%.

  Returns `{"unknown", 0.0}` when `text` is not a non-empty binary or when no
  language reaches the minimum match ratio.

  ## Examples

      iex> LanguageDetectionUtils.detect_language("The cat is in the house")
      {"en", 100.0}

      iex> LanguageDetectionUtils.detect_language("12345")
      {"unknown", 0.0}

  """
  @spec detect_language(term()) :: {language(), confidence()}
  def detect_language(text) when is_binary(text) and byte_size(text) > 0 do
    words = tokenize(text)

    {lang, score} =
      @language_indicators
      |> Enum.map(fn {lang, %{patterns: patterns, min_match: min_match}} ->
        {lang, count_pattern_matches(words, patterns) / min_match}
      end)
      # Rank on the uncapped score: capping first would make every language
      # with >= min_match hits tie, regardless of how many words matched.
      |> Enum.min_by(fn {lang, score} -> {-score, lang} end)

    ratio = min(score, 1.0)

    if ratio > @min_match_ratio do
      {lang, ratio * 100}
    else
      {"unknown", 0.0}
    end
  end

  def detect_language(_), do: {"unknown", 0.0}

  @doc """
  Detects the language of every chunk and records it in the chunk's metadata.

  For each chunk the text under `"text"` is passed to `detect_language/1` and
  the result is stored in the `"metadata"` map as `"detected_language"` and
  `"language_confidence"` (a percentage). A missing or `nil` `"metadata"`
  entry is treated as an empty map; existing metadata keys are preserved.
  """
  @spec detect_chunk_languages([chunk()]) :: [chunk()]
  def detect_chunk_languages(chunks) when is_list(chunks) do
    Enum.map(chunks, fn chunk ->
      {lang, confidence} = chunk |> Map.get("text", "") |> detect_language()

      metadata =
        chunk
        |> chunk_metadata()
        |> Map.put("detected_language", lang)
        |> Map.put("language_confidence", confidence)

      Map.put(chunk, "metadata", metadata)
    end)
  end

  @doc """
  Groups chunks by detected language.

  Runs `detect_chunk_languages/1` first, then returns a map from language code
  (including `"unknown"`) to the list of annotated chunks in that language.
  """
  @spec group_chunks_by_language([chunk()]) :: %{optional(language()) => [chunk()]}
  def group_chunks_by_language(chunks) do
    chunks
    |> detect_chunk_languages()
    |> Enum.group_by(&detected_language/1)
  end

  @doc """
  Keeps only the chunks detected as `target_language`.

  `min_confidence` is a fraction between `0.0` and `1.0` (default `0.7`); the
  stored percentage confidence is divided by 100 before the comparison. The
  returned chunks carry the detection metadata.
  """
  @spec filter_by_language([chunk()], language(), number()) :: [chunk()]
  def filter_by_language(chunks, target_language, min_confidence \\ 0.7) do
    chunks
    |> detect_chunk_languages()
    |> Enum.filter(fn chunk ->
      detected_language(chunk) == target_language and
        language_confidence(chunk) / 100 >= min_confidence
    end)
  end

  @doc """
  Summarizes the language distribution across chunks.

  Returns one map per detected language with `:language`, `:chunk_count` and
  `:avg_confidence` (mean percentage confidence, rounded to two decimals).
  Entries are sorted by `:chunk_count` descending; equal counts are ordered by
  language code.
  """
  @spec language_summary([chunk()]) :: [summary_entry()]
  def language_summary(chunks) do
    chunks
    |> group_chunks_by_language()
    |> Enum.map(fn {lang, group} ->
      # `Enum.group_by/2` never produces an empty group, so `count` is >= 1.
      count = length(group)
      total = group |> Enum.map(&language_confidence/1) |> Enum.sum()

      %{
        language: lang,
        chunk_count: count,
        avg_confidence: Float.round(total / count, 2)
      }
    end)
    |> Enum.sort_by(fn entry -> {-entry.chunk_count, entry.language} end)
  end

  @doc """
  Determines whether the chunks are mostly in a single language.

  Returns `true` when the most frequent detected language accounts for at
  least `threshold` (a fraction, default `0.8`) of all chunks. Chunks detected
  as `"unknown"` form their own group and count toward the total. Returns
  `false` for an empty list.
  """
  @spec is_single_language?([chunk()], number()) :: boolean()
  def is_single_language?(chunks, threshold \\ 0.8) do
    case language_summary(chunks) do
      [] ->
        false

      [top | _] = summary ->
        total = summary |> Enum.map(& &1.chunk_count) |> Enum.sum()
        top.chunk_count / total >= threshold
    end
  end

  # Private helpers

  # Lower-cases `text` and splits it into words. Valid UTF-8 is split on runs
  # of characters that are not Unicode letters, digits or underscores, so
  # accented words such as "año" stay in one piece. Other binaries fall back to
  # the byte-wise `\W+` split, since a Unicode-mode regex cannot be applied to
  # invalid UTF-8.
  defp tokenize(text) do
    separator =
      if String.valid?(text) do
        ~r/[^\p{L}\p{N}_]+/u
      else
        ~r/\W+/
      end

    text
    |> String.downcase()
    |> String.split(separator, trim: true)
  end

  # Counts how many entries of `words` (duplicates included) occur in `patterns`.
  defp count_pattern_matches(words, patterns) do
    Enum.count(words, &(&1 in patterns))
  end

  # Returns the chunk's metadata map, treating a missing or nil entry as empty.
  defp chunk_metadata(chunk), do: Map.get(chunk, "metadata") || %{}

  # Readers for the fields written by `detect_chunk_languages/1`.
  defp detected_language(%{"metadata" => %{"detected_language" => lang}}), do: lang

  defp language_confidence(%{"metadata" => %{"language_confidence" => confidence}}),
    do: confidence
end

# Example usage

# Extract a document with chunking enabled, then run every
# LanguageDetectionUtils helper on the result and print what it returns.
config = %Kreuzberg.ExtractionConfig{
  chunking: %{"enabled" => true, "max_characters" => 1000}
}

# Assertive match: any non-`{:ok, _}` result aborts the example here.
{:ok, result} = Kreuzberg.extract_file("multilingual_doc.pdf", nil, config)

# NOTE(review): LanguageDetectionUtils reads the string keys "text" and
# "metadata" from each chunk - confirm that Kreuzberg chunks have that shape.
chunks = result.chunks || []

IO.puts("=== Language Detection ===")

# `confidence` is already a percentage, hence the "%" suffix below.
{lang, confidence} = LanguageDetectionUtils.detect_language(result.content || "")
IO.puts("Detected Language: #{lang}")
IO.puts("Confidence: #{Float.round(confidence, 2)}%")

IO.puts("\n=== Language Summary ===")
IO.inspect(LanguageDetectionUtils.language_summary(chunks))

IO.puts("\n=== Group by Language ===")

chunks
|> LanguageDetectionUtils.group_chunks_by_language()
|> Enum.each(fn {group_lang, group} ->
  IO.puts("Language: #{group_lang} - Chunks: #{length(group)}")
end)

IO.puts("\n=== Filter English Chunks (min 80% confidence) ===")
IO.inspect(LanguageDetectionUtils.filter_by_language(chunks, "en", 0.8))

IO.puts("\n=== Check if Single Language ===")
IO.puts(LanguageDetectionUtils.is_single_language?(chunks, 0.8))