This commit is contained in:
124
docs/snippets/csharp/plugins/custom_postprocessor_plugin.cs
Normal file
124
docs/snippets/csharp/plugins/custom_postprocessor_plugin.cs
Normal file
@@ -0,0 +1,124 @@
|
||||
using Kreuzberg;
|
||||
using System.Text.RegularExpressions;
|
||||
|
||||
/// <summary>
/// Post-processor that counts the whitespace-separated words in the extracted
/// content and stores the total in the result's additional metadata under the
/// <c>word_count</c> key.
/// </summary>
class WordCountPostProcessor : IPostProcessor
{
    // Characters that separate one word from the next when tokenizing content.
    private static readonly char[] Separators = { ' ', '\n', '\r', '\t' };

    /// <summary>Identifier this processor reports to the library.</summary>
    public string Name => "word-count";

    /// <summary>Priority value this processor reports to the library.</summary>
    public int Priority => 10;

    /// <summary>
    /// Adds a <c>word_count</c> entry to <paramref name="result"/>'s additional
    /// metadata. Results whose content is null or empty are returned untouched.
    /// </summary>
    /// <param name="result">Extraction result to annotate in place.</param>
    /// <returns>The same <paramref name="result"/> instance.</returns>
    public ExtractionResult Process(ExtractionResult result)
    {
        if (!string.IsNullOrEmpty(result.Content))
        {
            // Empty entries are dropped so runs of separators do not inflate the count.
            var tokens = result.Content.Split(Separators, StringSplitOptions.RemoveEmptyEntries);

            // The metadata dictionary is created lazily the first time it is needed.
            result.Metadata.Additional ??= new Dictionary<string, System.Text.Json.Nodes.JsonNode?>();
            result.Metadata.Additional["word_count"] = System.Text.Json.Nodes.JsonValue.Create(tokens.Length);
        }

        return result;
    }
}
|
||||
|
||||
/// <summary>
/// Post-processor that normalizes extracted text: it removes every character
/// that is not a word character, whitespace, or one of <c>. , ! ? -</c>, then
/// collapses whitespace runs to a single space and trims the ends.
/// </summary>
class CleanupPostProcessor : IPostProcessor
{
    /// <summary>Identifier this processor reports to the library.</summary>
    public string Name => "text-cleanup";

    /// <summary>Priority value this processor reports to the library.</summary>
    public int Priority => 5;

    /// <summary>
    /// Replaces <paramref name="result"/>'s content with its cleaned form.
    /// Results whose content is null or empty are returned untouched.
    /// </summary>
    /// <param name="result">Extraction result whose content is rewritten in place.</param>
    /// <returns>The same <paramref name="result"/> instance.</returns>
    public ExtractionResult Process(ExtractionResult result)
    {
        if (string.IsNullOrEmpty(result.Content))
        {
            return result;
        }

        // Strip disallowed characters BEFORE normalizing whitespace. Removing a
        // symbol that sits between two spaces (or at either end of the text)
        // would otherwise leave a double space or a leading/trailing space in
        // the supposedly cleaned output.
        var stripped = Regex.Replace(result.Content, @"[^\w\s\.\,\!\?\-]", "");

        // Collapse every whitespace run to one space and drop the outer whitespace.
        result.Content = Regex.Replace(stripped, @"\s+", " ").Trim();

        return result;
    }
}
|
||||
|
||||
/// <summary>
/// Post-processor that applies a simple stop-word heuristic to the extracted
/// content and stores the outcome in the result's additional metadata under
/// the <c>detected_language</c> key.
/// </summary>
class LanguageDetectionPostProcessor : IPostProcessor
{
    // Frequent English words used as the detection signal.
    private static readonly string[] CommonEnglishWords =
    {
        "the", "is", "and", "to", "of", "a", "in", "that"
    };

    /// <summary>Identifier this processor reports to the library.</summary>
    public string Name => "language-detection";

    /// <summary>Priority value this processor reports to the library.</summary>
    public int Priority => 1;

    /// <summary>
    /// Adds a <c>detected_language</c> entry to <paramref name="result"/>'s
    /// additional metadata. Results whose content is null or empty are
    /// returned untouched.
    /// </summary>
    /// <param name="result">Extraction result to annotate in place.</param>
    /// <returns>The same <paramref name="result"/> instance.</returns>
    public ExtractionResult Process(ExtractionResult result)
    {
        if (string.IsNullOrEmpty(result.Content))
        {
            return result;
        }

        var detectedLanguage = DetectLanguage(result.Content);

        // The metadata dictionary is created lazily the first time it is needed.
        if (result.Metadata.Additional == null)
        {
            result.Metadata.Additional = new Dictionary<string, System.Text.Json.Nodes.JsonNode?>();
        }
        result.Metadata.Additional["detected_language"] = System.Text.Json.Nodes.JsonValue.Create(detectedLanguage);

        return result;
    }

    /// <summary>
    /// Returns <c>"en"</c> when more than five of the common English words
    /// occur in <paramref name="text"/> as whole words, otherwise
    /// <c>"unknown"</c>.
    /// </summary>
    /// <param name="text">Text to inspect.</param>
    /// <returns><c>"en"</c> or <c>"unknown"</c>.</returns>
    private static string DetectLanguage(string text)
    {
        // Lower-case with the invariant culture: the culture-sensitive ToLower()
        // maps 'I' to a dotless 'ı' under Turkish cultures, which would stop
        // "IS" and "IN" from matching the ASCII word list below.
        var lowerText = text.ToLowerInvariant();

        // Count how many distinct list words appear, not how often each occurs.
        var matches = CommonEnglishWords.Count(word =>
            Regex.IsMatch(lowerText, $@"\b{word}\b")
        );

        return matches > 5 ? "en" : "unknown";
    }
}
|
||||
|
||||
/// <summary>
/// Example entry point: registers the three post-processors, extracts
/// "document.pdf", and prints the content length together with any
/// <c>word_count</c> and <c>detected_language</c> metadata found on the result.
/// </summary>
class Program
{
    static void Main()
    {
        // Registration order matches the original snippet.
        KreuzbergLib.RegisterPostProcessor(new WordCountPostProcessor());
        KreuzbergLib.RegisterPostProcessor(new CleanupPostProcessor());
        KreuzbergLib.RegisterPostProcessor(new LanguageDetectionPostProcessor());

        try
        {
            var extraction = KreuzbergLib.ExtractFileSync("document.pdf", new ExtractionConfig());

            Console.WriteLine($"Original content length: {extraction.Content.Length}");

            // Additional metadata may be absent, so every lookup is guarded.
            var extras = extraction.Metadata.Additional;
            if (extras != null)
            {
                if (extras.TryGetValue("word_count", out var wordCount))
                {
                    Console.WriteLine($"Word count: {wordCount}");
                }

                if (extras.TryGetValue("detected_language", out var language))
                {
                    Console.WriteLine($"Detected language: {language}");
                }
            }
        }
        catch (KreuzbergException ex)
        {
            // Only library errors are reported here; other exceptions propagate.
            Console.WriteLine($"Error: {ex.Message}");
        }
    }
}
|
||||
Reference in New Issue
Block a user