Files
fil/docs/snippets/csharp/plugins/custom_postprocessor_plugin.cs
Henrik Jess Nielsen b4c07d3693
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s
Nomad changes
2026-06-01 23:40:55 +02:00

125 lines
3.5 KiB
C#

using Kreuzberg;
using System.Text.RegularExpressions;
/// <summary>
/// Post-processor that counts the whitespace-separated words in the extracted
/// content and records the total under the <c>word_count</c> key of the
/// result's additional metadata.
/// </summary>
class WordCountPostProcessor : IPostProcessor
{
    /// <summary>Name this processor exposes through <see cref="IPostProcessor"/>.</summary>
    public string Name => "word-count";

    /// <summary>Priority value this processor exposes through <see cref="IPostProcessor"/>.</summary>
    public int Priority => 10;

    /// <summary>
    /// Counts the words in <paramref name="result"/>'s content and stores the count
    /// in its additional metadata.
    /// </summary>
    /// <param name="result">Extraction result to annotate; it is modified in place.</param>
    /// <returns>
    /// The same <paramref name="result"/> instance. It is returned untouched when the
    /// content is null or empty.
    /// </returns>
    public ExtractionResult Process(ExtractionResult result)
    {
        if (string.IsNullOrEmpty(result.Content))
        {
            return result;
        }

        // An empty separator array makes Split use every Unicode white-space
        // character as a delimiter, so words separated by form feeds, vertical
        // tabs or non-breaking spaces are no longer merged into one "word".
        var wordCount = result.Content
            .Split(Array.Empty<char>(), StringSplitOptions.RemoveEmptyEntries)
            .Length;

        // Create the metadata dictionary on first use.
        result.Metadata.Additional ??= new Dictionary<string, System.Text.Json.Nodes.JsonNode?>();
        result.Metadata.Additional["word_count"] = System.Text.Json.Nodes.JsonValue.Create(wordCount);
        return result;
    }
}
/// <summary>
/// Post-processor that normalizes the extracted content: it removes every
/// character that is not a word character, whitespace, or one of
/// <c>. , ! ? -</c>, then collapses whitespace runs to a single space and
/// trims the ends.
/// </summary>
class CleanupPostProcessor : IPostProcessor
{
    // Matches any single character outside the allowed set.
    private static readonly Regex DisallowedCharacters = new Regex(@"[^\w\s\.\,\!\?\-]");

    // Matches one or more consecutive whitespace characters.
    private static readonly Regex WhitespaceRun = new Regex(@"\s+");

    /// <summary>Name this processor exposes through <see cref="IPostProcessor"/>.</summary>
    public string Name => "text-cleanup";

    /// <summary>Priority value this processor exposes through <see cref="IPostProcessor"/>.</summary>
    public int Priority => 5;

    /// <summary>
    /// Replaces <paramref name="result"/>'s content with its cleaned-up form.
    /// </summary>
    /// <param name="result">Extraction result to clean; it is modified in place.</param>
    /// <returns>
    /// The same <paramref name="result"/> instance. It is returned untouched when the
    /// content is null or empty.
    /// </returns>
    public ExtractionResult Process(ExtractionResult result)
    {
        if (string.IsNullOrEmpty(result.Content))
        {
            return result;
        }

        // Strip first, collapse second: removing a symbol that sat between two
        // spaces (or at either end) would otherwise leave a double space or an
        // untrimmed edge behind in the final text.
        var stripped = DisallowedCharacters.Replace(result.Content, "");
        result.Content = WhitespaceRun.Replace(stripped, " ").Trim();
        return result;
    }
}
/// <summary>
/// Post-processor that makes a naive English-or-unknown guess from the
/// extracted content and records it under the <c>detected_language</c> key of
/// the result's additional metadata.
/// </summary>
class LanguageDetectionPostProcessor : IPostProcessor
{
    // Words whose presence is taken as evidence of English text.
    private static readonly string[] CommonEnglishWords =
    {
        "the", "is", "and", "to", "of", "a", "in", "that"
    };

    // The text is labelled English only when MORE than this many of the
    // listed words occur in it.
    private const int EnglishMatchThreshold = 5;

    /// <summary>Name this processor exposes through <see cref="IPostProcessor"/>.</summary>
    public string Name => "language-detection";

    /// <summary>Priority value this processor exposes through <see cref="IPostProcessor"/>.</summary>
    public int Priority => 1;

    /// <summary>
    /// Detects the language of <paramref name="result"/>'s content and stores the
    /// guess in its additional metadata.
    /// </summary>
    /// <param name="result">Extraction result to annotate; it is modified in place.</param>
    /// <returns>
    /// The same <paramref name="result"/> instance. It is returned untouched when the
    /// content is null or empty.
    /// </returns>
    public ExtractionResult Process(ExtractionResult result)
    {
        if (string.IsNullOrEmpty(result.Content))
        {
            return result;
        }

        var detectedLanguage = DetectLanguage(result.Content);

        // Create the metadata dictionary on first use.
        result.Metadata.Additional ??= new Dictionary<string, System.Text.Json.Nodes.JsonNode?>();
        result.Metadata.Additional["detected_language"] = System.Text.Json.Nodes.JsonValue.Create(detectedLanguage);
        return result;
    }

    /// <summary>
    /// Returns <c>"en"</c> when more than <see cref="EnglishMatchThreshold"/> of the
    /// common English words appear in <paramref name="text"/> as whole words,
    /// otherwise <c>"unknown"</c>.
    /// </summary>
    private static string DetectLanguage(string text)
    {
        // Invariant lower-casing keeps the result independent of the current
        // culture; a culture-sensitive ToLower() under a Turkish culture turns
        // 'I' into a dotless 'ı', so upper-case "IS" / "IN" would never match.
        var lowerText = text.ToLowerInvariant();

        var matches = CommonEnglishWords.Count(word =>
            Regex.IsMatch(lowerText, $@"\b{word}\b"));

        return matches > EnglishMatchThreshold ? "en" : "unknown";
    }
}
/// <summary>
/// Example entry point: registers the three post-processors, extracts
/// <c>document.pdf</c>, and prints the content length together with any
/// metadata the processors attached.
/// </summary>
class Program
{
    static void Main()
    {
        var counter = new WordCountPostProcessor();
        var cleaner = new CleanupPostProcessor();
        var detector = new LanguageDetectionPostProcessor();

        // Registration order is kept as: word count, cleanup, language detection.
        KreuzbergLib.RegisterPostProcessor(counter);
        KreuzbergLib.RegisterPostProcessor(cleaner);
        KreuzbergLib.RegisterPostProcessor(detector);

        try
        {
            var extraction = KreuzbergLib.ExtractFileSync("document.pdf", new ExtractionConfig());
            Console.WriteLine($"Original content length: {extraction.Content.Length}");

            var extras = extraction.Metadata.Additional;
            if (extras == null)
            {
                // No additional metadata was attached; nothing more to report.
                return;
            }

            if (extras.TryGetValue("word_count", out var wc))
            {
                Console.WriteLine($"Word count: {wc}");
            }

            if (extras.TryGetValue("detected_language", out var lang))
            {
                Console.WriteLine($"Detected language: {lang}");
            }
        }
        catch (KreuzbergException failure)
        {
            Console.WriteLine($"Error: {failure.Message}");
        }
    }
}