This commit is contained in:
10
docs/snippets/csharp/plugins/clear_plugins.md
Normal file
10
docs/snippets/csharp/plugins/clear_plugins.md
Normal file
@@ -0,0 +1,10 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
// Clear each plugin category in turn: extractors, OCR backends,
// post-processors, then validators.
KreuzbergLib.ClearDocumentExtractors();
KreuzbergLib.ClearOcrBackends();
KreuzbergLib.ClearPostProcessors();
KreuzbergLib.ClearValidators();

// Confirmation for the reader of the console output.
Console.WriteLine("All plugins cleared");
|
||||
```
|
||||
103
docs/snippets/csharp/plugins/custom_cache_plugin.cs
Normal file
103
docs/snippets/csharp/plugins/custom_cache_plugin.cs
Normal file
@@ -0,0 +1,103 @@
|
||||
using Kreuzberg;
|
||||
using System.Collections.Generic;
|
||||
using System.Security.Cryptography;
|
||||
using System.Text;
|
||||
|
||||
// NOTE: ICacheBackend interface is not available in C# bindings
|
||||
|
||||
/// <summary>
/// In-memory cache with time-based expiry that sits in front of
/// <c>KreuzbergLib.ExtractFileSync</c>.
/// </summary>
/// <remarks>
/// Entries are held in a plain <see cref="Dictionary{TKey,TValue}"/>; callers
/// that share one instance across threads must synchronize access themselves.
/// </remarks>
class CustomCacheWrapper
{
    private readonly Dictionary<string, (ExtractionResult result, DateTime timestamp)> _cache = new();
    private readonly TimeSpan _cacheExpiration;

    /// <summary>Creates a cache whose entries expire after the given duration.</summary>
    /// <param name="cacheExpiration">Entry lifetime; one hour when omitted.</param>
    public CustomCacheWrapper(TimeSpan? cacheExpiration = null)
    {
        _cacheExpiration = cacheExpiration ?? TimeSpan.FromHours(1);
    }

    /// <summary>Looks up a cached result.</summary>
    /// <param name="key">Cache key, typically produced by <see cref="GenerateKey"/>.</param>
    /// <returns>The cached result, or <c>null</c> when the key is absent or the entry has expired.</returns>
    public ExtractionResult? Get(string key)
    {
        if (!_cache.TryGetValue(key, out var entry))
        {
            return null;
        }

        if (DateTime.UtcNow - entry.timestamp < _cacheExpiration)
        {
            return entry.result;
        }

        // The entry outlived its expiry window: evict it so it is not checked again.
        _cache.Remove(key);
        return null;
    }

    /// <summary>Stores (or overwrites) a result under the key, stamped with the current UTC time.</summary>
    public void Set(string key, ExtractionResult result)
    {
        _cache[key] = (result, DateTime.UtcNow);
    }

    /// <summary>Removes a single entry; does nothing when the key is absent.</summary>
    public void Delete(string key)
    {
        _cache.Remove(key);
    }

    /// <summary>Removes every entry.</summary>
    public void Clear()
    {
        _cache.Clear();
    }

    /// <summary>
    /// Builds a cache key: the upper-case hex SHA-256 of <c>"{filePath}:{configHash}"</c>.
    /// </summary>
    /// <param name="filePath">Path of the document being extracted.</param>
    /// <param name="config">Extraction configuration; <c>null</c> contributes <c>0</c>.</param>
    public string GenerateKey(string filePath, ExtractionConfig? config)
    {
        // NOTE(review): GetHashCode() is only as stable as ExtractionConfig's own
        // implementation. If the type does not override it, two equal-looking
        // configs produce different keys, and hash codes are never guaranteed to
        // be stable across processes. Confirm before persisting these keys.
        var keyData = $"{filePath}:{config?.GetHashCode() ?? 0}";

        // One-shot static hashing: no disposable SHA256 instance to manage.
        var hashBytes = SHA256.HashData(Encoding.UTF8.GetBytes(keyData));
        return Convert.ToHexString(hashBytes);
    }

    /// <summary>
    /// Returns the cached result for the file/config pair, extracting and caching it on a miss.
    /// </summary>
    /// <param name="filePath">Path of the document to extract.</param>
    /// <param name="config">Optional extraction configuration.</param>
    public ExtractionResult GetOrExtract(string filePath, ExtractionConfig? config = null)
    {
        var cacheKey = GenerateKey(filePath, config);

        var cached = Get(cacheKey);
        if (cached != null)
        {
            Console.WriteLine("Retrieved from cache");
            return cached;
        }

        var result = KreuzbergLib.ExtractFileSync(filePath, config);
        Set(cacheKey, result);
        Console.WriteLine("Extracted and cached");

        return result;
    }
}
|
||||
|
||||
/// <summary>Demonstrates a cache miss followed by a cache hit for the same document.</summary>
class Program
{
    static void Main()
    {
        const string documentPath = "document.pdf";
        var wrapper = new CustomCacheWrapper(TimeSpan.FromMinutes(30));

        try
        {
            var options = new ExtractionConfig { UseCache = true };

            // First call goes through the extractor and populates the cache.
            var first = wrapper.GetOrExtract(documentPath, options);
            Console.WriteLine($"First extraction: {first.Content.Length} chars");

            // Second call with the same path and config instance.
            var second = wrapper.GetOrExtract(documentPath, options);
            Console.WriteLine($"Second extraction: {second.Content.Length} chars");

            wrapper.Clear();
            Console.WriteLine("Cache cleared");
        }
        catch (KreuzbergException error)
        {
            Console.WriteLine($"Error: {error.Message}");
        }
    }
}
|
||||
70
docs/snippets/csharp/plugins/custom_extractor_plugin.cs
Normal file
70
docs/snippets/csharp/plugins/custom_extractor_plugin.cs
Normal file
@@ -0,0 +1,70 @@
|
||||
using Kreuzberg;
|
||||
using System.Text.Json;
|
||||
|
||||
// NOTE: IDocumentExtractor interface is not available in C# bindings
|
||||
|
||||
/// <summary>
/// Stand-alone helper that flattens the string values of a JSON payload into
/// an <c>ExtractionResult</c>.
/// </summary>
class CustomJsonProcessor
{
    /// <summary>Parses UTF-8 JSON bytes and collects every string value, one per line.</summary>
    /// <param name="content">UTF-8 encoded JSON document.</param>
    /// <param name="mimeType">MIME type copied onto the result.</param>
    /// <returns>A result whose <c>Content</c> is the concatenated string values.</returns>
    /// <exception cref="KreuzbergParsingException">The payload is not valid JSON.</exception>
    public static ExtractionResult ProcessJson(byte[] content, string mimeType)
    {
        try
        {
            var jsonContent = System.Text.Encoding.UTF8.GetString(content);

            // JsonDocument is IDisposable; dispose it once the text has been
            // copied out into an ordinary string.
            using var document = JsonDocument.Parse(jsonContent);

            var text = ExtractText(document.RootElement);

            return new ExtractionResult
            {
                Content = text,
                MimeType = mimeType,
                Metadata = new Metadata(),
                Tables = new List<Table>(),
                Success = true
            };
        }
        catch (JsonException ex)
        {
            throw new KreuzbergParsingException($"Failed to parse JSON: {ex.Message}");
        }
    }

    /// <summary>
    /// Recursively gathers string values: each string contributes itself plus a
    /// newline, arrays and objects contribute their children in order, and every
    /// other value kind contributes nothing.
    /// </summary>
    private static string ExtractText(JsonElement element)
    {
        return element.ValueKind switch
        {
            JsonValueKind.String => element.GetString() + "\n",
            JsonValueKind.Array => string.Concat(
                element.EnumerateArray().Select(ExtractText)
            ),
            JsonValueKind.Object => string.Concat(
                element.EnumerateObject()
                    .Select(p => ExtractText(p.Value))
            ),
            _ => ""
        };
    }
}
|
||||
|
||||
/// <summary>Serializes a small anonymous object and runs it through <c>CustomJsonProcessor</c>.</summary>
class Program
{
    static void Main()
    {
        try
        {
            var payload = new { message = "Hello, world!", timestamp = DateTime.UtcNow };

            // Serialize to text first, then encode that text as UTF-8 bytes.
            var serialized = JsonSerializer.Serialize(payload);
            var payloadBytes = System.Text.Encoding.UTF8.GetBytes(serialized);

            var extraction = CustomJsonProcessor.ProcessJson(payloadBytes, "application/json");

            Console.WriteLine($"Extracted: {extraction.Content}");
            Console.WriteLine($"MIME type: {extraction.MimeType}");
        }
        catch (KreuzbergException error)
        {
            Console.WriteLine($"Error: {error.Message}");
        }
    }
}
|
||||
90
docs/snippets/csharp/plugins/custom_ocr_plugin.cs
Normal file
90
docs/snippets/csharp/plugins/custom_ocr_plugin.cs
Normal file
@@ -0,0 +1,90 @@
|
||||
using Kreuzberg;
|
||||
using System.Net.Http;
|
||||
using System.Text.Json;
|
||||
|
||||
/// <summary>
/// OCR backend that posts the image to an HTTP endpoint and returns the raw
/// response body.
/// </summary>
/// <remarks>
/// Declares <see cref="IDisposable"/> explicitly so the instance can be used
/// in a <c>using</c> statement; the owned <see cref="HttpClient"/> is released
/// in <see cref="Dispose"/>.
/// </remarks>
class CloudOcrBackend : IOcrBackend, IDisposable
{
    private readonly string _apiKey;
    private readonly HttpClient _httpClient;
    private readonly string _apiEndpoint;

    /// <summary>Creates the backend.</summary>
    /// <param name="apiKey">Bearer token sent in the <c>Authorization</c> header.</param>
    /// <param name="apiEndpoint">URL that receives the multipart POST.</param>
    public CloudOcrBackend(string apiKey, string apiEndpoint = "https://api.example.com/ocr")
    {
        _apiKey = apiKey;
        _apiEndpoint = apiEndpoint;
        _httpClient = new HttpClient();
    }

    public string Name => "cloud-ocr-backend";

    /// <summary>Uploads the image and returns the service's response body as a string.</summary>
    /// <param name="imageBytes">Encoded image data.</param>
    /// <param name="config">OCR configuration; not used by this backend.</param>
    /// <exception cref="KreuzbergOcrException">The HTTP request failed.</exception>
    public string Process(ReadOnlySpan<byte> imageBytes, OcrConfig? config)
    {
        // A ReadOnlySpan<byte> is a ref struct and cannot be captured by a
        // lambda or used in an async method, so copy it to an array first.
        var bytes = imageBytes.ToArray();

        // The interface is synchronous; run the async upload on the thread pool
        // and block for its result. GetAwaiter().GetResult() rethrows the
        // original exception rather than an AggregateException.
        return Task.Run(() => UploadAsync(bytes)).GetAwaiter().GetResult();
    }

    /// <summary>Sends the multipart request and reads the response body.</summary>
    private async Task<string> UploadAsync(byte[] bytes)
    {
        try
        {
            using var content = new MultipartFormDataContent();
            content.Add(new ByteArrayContent(bytes), "image");

            using var request = new HttpRequestMessage(HttpMethod.Post, _apiEndpoint)
            {
                Content = content,
                Headers =
                {
                    { "Authorization", $"Bearer {_apiKey}" }
                }
            };

            using var response = await _httpClient.SendAsync(request);
            response.EnsureSuccessStatusCode();

            return await response.Content.ReadAsStringAsync();
        }
        catch (HttpRequestException ex)
        {
            throw new KreuzbergOcrException($"Cloud OCR service error: {ex.Message}");
        }
    }

    /// <summary>Releases the owned <see cref="HttpClient"/>.</summary>
    public void Dispose()
    {
        _httpClient.Dispose();
    }
}
|
||||
|
||||
/// <summary>Registers the cloud OCR backend and extracts a PDF through it.</summary>
class Program
{
    static void Main()
    {
        using var backend = new CloudOcrBackend(apiKey: "your-api-key-here");
        KreuzbergLib.RegisterOcrBackend(backend);

        try
        {
            // Select the backend by the name it reports.
            var ocrOptions = new OcrConfig { Backend = "cloud-ocr-backend" };
            var options = new ExtractionConfig { Ocr = ocrOptions };

            var extraction = KreuzbergLib.ExtractFileSync("document.pdf", options);
            Console.WriteLine($"OCR text: {extraction.Content}");
        }
        catch (KreuzbergOcrException ocrError)
        {
            // Listed before the general handler, as in the original ordering.
            Console.WriteLine($"OCR error: {ocrError.Message}");
        }
        catch (KreuzbergException error)
        {
            Console.WriteLine($"Error: {error.Message}");
        }
    }
}
|
||||
124
docs/snippets/csharp/plugins/custom_postprocessor_plugin.cs
Normal file
124
docs/snippets/csharp/plugins/custom_postprocessor_plugin.cs
Normal file
@@ -0,0 +1,124 @@
|
||||
using Kreuzberg;
|
||||
using System.Text.RegularExpressions;
|
||||
|
||||
/// <summary>Post-processor that records a whitespace-delimited word count in the result metadata.</summary>
class WordCountPostProcessor : IPostProcessor
{
    public string Name => "word-count";

    public int Priority => 10;

    /// <summary>
    /// Stores the number of space/newline/carriage-return/tab separated tokens
    /// under <c>"word_count"</c>; results with no content are returned untouched.
    /// </summary>
    public ExtractionResult Process(ExtractionResult result)
    {
        if (!string.IsNullOrEmpty(result.Content))
        {
            char[] separators = { ' ', '\n', '\r', '\t' };
            var words = result.Content.Split(separators, StringSplitOptions.RemoveEmptyEntries);

            // Create the metadata bag on first use.
            result.Metadata.Additional ??= new Dictionary<string, System.Text.Json.Nodes.JsonNode?>();
            result.Metadata.Additional["word_count"] = System.Text.Json.Nodes.JsonValue.Create(words.Length);
        }

        return result;
    }
}
|
||||
|
||||
/// <summary>Post-processor that normalizes whitespace and strips unusual punctuation.</summary>
class CleanupPostProcessor : IPostProcessor
{
    public string Name => "text-cleanup";

    public int Priority => 5;

    /// <summary>
    /// Collapses whitespace runs to single spaces, trims the ends, then removes
    /// every character that is not a word character, whitespace, or one of
    /// <c>. , ! ? -</c>. Results with no content are returned untouched.
    /// </summary>
    public ExtractionResult Process(ExtractionResult result)
    {
        if (!string.IsNullOrEmpty(result.Content))
        {
            var collapsed = Regex.Replace(result.Content, @"\s+", " ").Trim();
            result.Content = Regex.Replace(collapsed, @"[^\w\s\.\,\!\?\-]", "");
        }

        return result;
    }
}
|
||||
|
||||
/// <summary>
/// Post-processor that tags the result with a crude language guess based on
/// the presence of common English words.
/// </summary>
class LanguageDetectionPostProcessor : IPostProcessor
{
    // Built once instead of on every call.
    private static readonly string[] CommonEnglishWords =
        { "the", "is", "and", "to", "of", "a", "in", "that" };

    public string Name => "language-detection";

    public int Priority => 1;

    /// <summary>
    /// Stores <c>"en"</c> or <c>"unknown"</c> under <c>"detected_language"</c>;
    /// results with no content are returned untouched.
    /// </summary>
    public ExtractionResult Process(ExtractionResult result)
    {
        if (string.IsNullOrEmpty(result.Content))
        {
            return result;
        }

        var detectedLanguage = DetectLanguage(result.Content);

        // Create the metadata bag on first use.
        result.Metadata.Additional ??= new Dictionary<string, System.Text.Json.Nodes.JsonNode?>();
        result.Metadata.Additional["detected_language"] = System.Text.Json.Nodes.JsonValue.Create(detectedLanguage);

        return result;
    }

    /// <summary>
    /// Returns <c>"en"</c> when more than five of the eight marker words occur
    /// as whole words in the text, otherwise <c>"unknown"</c>.
    /// </summary>
    private string DetectLanguage(string text)
    {
        // Invariant lower-casing: a culture-sensitive ToLower() maps 'I' to a
        // dotless 'ı' under e.g. tr-TR, which would stop "IS"/"IN" from matching.
        var lowerText = text.ToLowerInvariant();

        var matches = CommonEnglishWords.Count(word =>
            Regex.IsMatch(lowerText, $@"\b{word}\b")
        );

        return matches > 5 ? "en" : "unknown";
    }
}
|
||||
|
||||
/// <summary>Registers the three sample post-processors and prints the metadata they add.</summary>
class Program
{
    static void Main()
    {
        // Registration order is kept: word count, cleanup, language detection.
        KreuzbergLib.RegisterPostProcessor(new WordCountPostProcessor());
        KreuzbergLib.RegisterPostProcessor(new CleanupPostProcessor());
        KreuzbergLib.RegisterPostProcessor(new LanguageDetectionPostProcessor());

        try
        {
            var extraction = KreuzbergLib.ExtractFileSync("document.pdf", new ExtractionConfig());

            Console.WriteLine($"Original content length: {extraction.Content.Length}");

            var extras = extraction.Metadata.Additional;
            if (extras == null)
            {
                return;
            }

            if (extras.TryGetValue("word_count", out var wordCount))
            {
                Console.WriteLine($"Word count: {wordCount}");
            }

            if (extras.TryGetValue("detected_language", out var language))
            {
                Console.WriteLine($"Detected language: {language}");
            }
        }
        catch (KreuzbergException error)
        {
            Console.WriteLine($"Error: {error.Message}");
        }
    }
}
|
||||
103
docs/snippets/csharp/plugins/custom_validator_plugin.cs
Normal file
103
docs/snippets/csharp/plugins/custom_validator_plugin.cs
Normal file
@@ -0,0 +1,103 @@
|
||||
using Kreuzberg;
|
||||
|
||||
/// <summary>Validator that rejects results whose content is shorter than a configured length.</summary>
class MinLengthValidator : IValidator
{
    private readonly int _minLength;

    /// <param name="minLength">Smallest acceptable content length, in characters.</param>
    public MinLengthValidator(int minLength) => _minLength = minLength;

    public string Name => "min-length";

    public int Priority => 10;

    /// <exception cref="KreuzbergValidationException">Content is shorter than the minimum.</exception>
    public void Validate(ExtractionResult result)
    {
        var actualLength = result.Content.Length;
        if (actualLength >= _minLength)
        {
            return;
        }

        throw new KreuzbergValidationException(
            $"Content too short: {actualLength} < {_minLength}"
        );
    }
}
|
||||
|
||||
/// <summary>Validator that rejects results whose <c>QualityScore</c> is below a configured threshold.</summary>
class QualityScoreValidator : IValidator
{
    private readonly double _minScore;

    /// <param name="minScore">Lowest acceptable quality score.</param>
    public QualityScoreValidator(double minScore) => _minScore = minScore;

    public string Name => "quality-score";

    public int Priority => 5;

    /// <exception cref="KreuzbergValidationException">The score is below the threshold.</exception>
    public void Validate(ExtractionResult result)
    {
        var actualScore = result.QualityScore;

        // Comparison direction is kept as written: the score's declared type is
        // not visible here, so the condition is not inverted.
        if (actualScore < _minScore)
        {
            throw new KreuzbergValidationException(
                $"Quality score too low: {actualScore:F2} < {_minScore:F2}"
            );
        }
    }
}
|
||||
|
||||
/// <summary>Validator that rejects blank content and content shorter than ten characters.</summary>
class ContentValidValidator : IValidator
{
    public string Name => "content-valid";

    public int Priority => 20;

    /// <exception cref="KreuzbergValidationException">Content is blank or under ten characters.</exception>
    public void Validate(ExtractionResult result)
    {
        var text = result.Content;

        // The blank check runs first, so whitespace-only content reports the
        // "empty" message rather than the "too short" one.
        if (string.IsNullOrWhiteSpace(text))
        {
            throw new KreuzbergValidationException("Extracted content is empty or whitespace");
        }

        if (text.Length < 10)
        {
            throw new KreuzbergValidationException("Extracted content is too short (minimum 10 characters)");
        }
    }
}
|
||||
|
||||
/// <summary>Registers the three sample validators and runs an extraction against them.</summary>
class Program
{
    static void Main()
    {
        // Registration order is kept: min length, quality score, content validity.
        KreuzbergLib.RegisterValidator(new MinLengthValidator(minLength: 50));
        KreuzbergLib.RegisterValidator(new QualityScoreValidator(minScore: 0.7));
        KreuzbergLib.RegisterValidator(new ContentValidValidator());

        try
        {
            var options = new ExtractionConfig { EnableQualityProcessing = true };
            var extraction = KreuzbergLib.ExtractFileSync("document.pdf", options);

            Console.WriteLine("All validations passed");
            Console.WriteLine($"Content length: {extraction.Content.Length}");
        }
        catch (KreuzbergValidationException validationError)
        {
            // Listed before the general handler, as in the original ordering.
            Console.WriteLine($"Validation failed: {validationError.Message}");
        }
        catch (KreuzbergException error)
        {
            Console.WriteLine($"Error: {error.Message}");
        }
    }
}
|
||||
43
docs/snippets/csharp/plugins/embedding_backend.md
Normal file
43
docs/snippets/csharp/plugins/embedding_backend.md
Normal file
@@ -0,0 +1,43 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
using System.Collections.Generic;
|
||||
|
||||
var backend = new CustomEmbeddingBackend();
EmbeddingBackendRegistry.Register(backend);

/// <summary>
/// Sample embedding backend that produces deterministic placeholder vectors
/// derived only from each text's length.
/// </summary>
public class CustomEmbeddingBackend : IEmbeddingBackend
{
    // Single source of truth for the vector width, so Dimensions() and Embed()
    // cannot drift apart.
    private const int EmbeddingDimensions = 384;

    public string Name => "custom-embeddings";
    public string Version => "1.0.0";

    public void Initialize()
    {
        Console.WriteLine("Embedding backend initialized");
    }

    public void Shutdown()
    {
        Console.WriteLine("Embedding backend shut down");
    }

    /// <summary>Number of components in every vector returned by <see cref="Embed"/>.</summary>
    public ulong Dimensions()
    {
        return EmbeddingDimensions;
    }

    /// <summary>Returns one vector per input text, in input order.</summary>
    /// <param name="texts">Texts to embed.</param>
    /// <returns>
    /// For each text, a vector whose component <c>i</c> is
    /// <c>(text.Length % (i + 1)) / (i + 1)</c>.
    /// </returns>
    public List<List<float>> Embed(List<string> texts)
    {
        // Sizes are known up front, so presize both levels.
        var embeddings = new List<List<float>>(texts.Count);

        foreach (var text in texts)
        {
            var embedding = new List<float>(EmbeddingDimensions);
            for (int i = 0; i < EmbeddingDimensions; i++)
            {
                embedding.Add((float)(text.Length % (i + 1)) / (float)(i + 1));
            }

            embeddings.Add(embedding);
        }

        return embeddings;
    }
}
|
||||
```
|
||||
51
docs/snippets/csharp/plugins/extractor_registration.md
Normal file
51
docs/snippets/csharp/plugins/extractor_registration.md
Normal file
@@ -0,0 +1,51 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
KreuzbergLib.RegisterDocumentExtractor(new JsonDocumentExtractor());

/// <summary>Sample extractor that returns JSON documents as their raw UTF-8 text.</summary>
public class JsonDocumentExtractor : IDocumentExtractor
{
    public string Name => "json-extractor";

    public string Version => "1.0.0";

    public void Initialize() => Console.WriteLine("JSON extractor initialized");

    public void Shutdown() => Console.WriteLine("JSON extractor shut down");

    /// <summary>Decodes the bytes as UTF-8 and returns the text unchanged as the content.</summary>
    public ExtractionResult ExtractBytes(byte[] content, string mimeType, ExtractionConfig config)
    {
        var decoded = System.Text.Encoding.UTF8.GetString(content);

        return new ExtractionResult
        {
            Content = decoded,
            MimeType = mimeType,
            DetectedLanguages = null
        };
    }

    /// <summary>Reads the whole file and delegates to <see cref="ExtractBytes"/>.</summary>
    public ExtractionResult ExtractFile(string path, string mimeType, ExtractionConfig config) =>
        ExtractBytes(System.IO.File.ReadAllBytes(path), mimeType, config);

    /// <summary>MIME types this extractor declares.</summary>
    public string[] SupportedMimeTypes() => new[] { "application/json", "text/json" };

    public int Priority() => 50;
}
|
||||
```
|
||||
15
docs/snippets/csharp/plugins/list_plugins.md
Normal file
15
docs/snippets/csharp/plugins/list_plugins.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
// Print the names registered in each plugin category, comma-separated.
var extractorNames = KreuzbergLib.ListDocumentExtractors();
Console.WriteLine($"Registered extractors: {string.Join(", ", extractorNames)}");

var ocrBackendNames = KreuzbergLib.ListOcrBackends();
Console.WriteLine($"Registered OCR backends: {string.Join(", ", ocrBackendNames)}");

var postProcessorNames = KreuzbergLib.ListPostProcessors();
Console.WriteLine($"Registered post-processors: {string.Join(", ", postProcessorNames)}");

var validatorNames = KreuzbergLib.ListValidators();
Console.WriteLine($"Registered validators: {string.Join(", ", validatorNames)}");
|
||||
```
|
||||
45
docs/snippets/csharp/plugins/min_length_validator.md
Normal file
45
docs/snippets/csharp/plugins/min_length_validator.md
Normal file
@@ -0,0 +1,45 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
ValidatorRegistry.Register(new MinimumLengthValidator());

/// <summary>Validator that rejects results whose content is shorter than ten characters.</summary>
public class MinimumLengthValidator : IValidator
{
    private const int MinimumLength = 10;

    public string Name => "min-length-validator";

    public string Version => "1.0.0";

    public void Initialize() =>
        Console.WriteLine($"Minimum length validator initialized (min: {MinimumLength})");

    public void Shutdown() =>
        Console.WriteLine("Minimum length validator shut down");

    /// <exception cref="KreuzbergException">Content is shorter than the minimum (code 1001).</exception>
    public void Validate(ExtractionResult result, ExtractionConfig config)
    {
        var actualLength = result.Content.Length;
        if (actualLength >= MinimumLength)
        {
            return;
        }

        throw new KreuzbergException(
            $"Content length {actualLength} is below minimum {MinimumLength}",
            1001
        );
    }

    /// <summary>Returns <c>true</c> only when the result has non-empty content.</summary>
    public bool ShouldValidate(ExtractionResult result, ExtractionConfig config) =>
        !string.IsNullOrEmpty(result.Content);

    public int Priority() => 50;
}
|
||||
```
|
||||
58
docs/snippets/csharp/plugins/pdf_metadata_extractor.md
Normal file
58
docs/snippets/csharp/plugins/pdf_metadata_extractor.md
Normal file
@@ -0,0 +1,58 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
var enricher = new PdfMetadataEnricher();
PostProcessorRegistry.Register(enricher);

/// <summary>
/// Post-processor that fills in a default author on PDF results and counts how
/// many PDFs it has handled.
/// </summary>
public class PdfMetadataEnricher : IPostProcessor
{
    private int _processedCount = 0;

    public string Name => "pdf-metadata-enricher";
    public string Version => "1.0.0";

    public void Initialize()
    {
        Console.WriteLine("PDF metadata enricher initialized");
        _processedCount = 0;
    }

    public void Shutdown()
    {
        Console.WriteLine($"PDF metadata enricher processed {_processedCount} documents");
    }

    /// <summary>
    /// For results whose MIME type is <c>application/pdf</c>: bumps the counter,
    /// creates the metadata object if absent, and sets the author to
    /// <c>"Unknown"</c> when none is present. Other results are left unchanged.
    /// </summary>
    public void Process(ExtractionResult result, ExtractionConfig config)
    {
        if (result.MimeType != "application/pdf")
        {
            return;
        }

        _processedCount++;
        result.Metadata ??= new Metadata();
        result.Metadata.Author ??= "Unknown";
    }

    public ProcessingStage ProcessingStage()
    {
        // Inside this class the simple name "ProcessingStage" binds to this
        // method, so "ProcessingStage.Early" does not compile (CS0119); the
        // enum must be namespace-qualified.
        // NOTE(review): assumes the enum is declared in the Kreuzberg namespace,
        // the only namespace this snippet imports — confirm.
        return Kreuzberg.ProcessingStage.Early;
    }

    /// <summary>Returns <c>true</c> only for results whose MIME type is <c>application/pdf</c>.</summary>
    public bool ShouldProcess(ExtractionResult result, ExtractionConfig config)
    {
        return result.MimeType == "application/pdf";
    }

    public ulong EstimatedDurationMs(ExtractionResult result)
    {
        return 50;
    }

    public int Priority()
    {
        return 50;
    }
}
|
||||
```
|
||||
54
docs/snippets/csharp/plugins/pdf_only_processor.md
Normal file
54
docs/snippets/csharp/plugins/pdf_only_processor.md
Normal file
@@ -0,0 +1,54 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
/// <summary>
/// Post-processor that declares itself applicable only to PDF results and logs
/// any non-PDF result it is nevertheless handed.
/// </summary>
public class PdfOnlyProcessor : IPostProcessor
{
    public string Name => "pdf-only-processor";
    public string Version => "1.0.0";

    public void Initialize()
    {
    }

    public void Shutdown()
    {
    }

    /// <summary>Writes a "skipping" line for non-PDF results; PDF results are left unchanged.</summary>
    public void Process(ExtractionResult result, ExtractionConfig config)
    {
        if (result.MimeType != "application/pdf")
        {
            Console.WriteLine($"Skipping non-PDF: {result.MimeType}");
        }
    }

    public ProcessingStage ProcessingStage()
    {
        // Inside this class the simple name "ProcessingStage" binds to this
        // method, so "ProcessingStage.Middle" does not compile (CS0119); the
        // enum must be namespace-qualified.
        // NOTE(review): assumes the enum is declared in the Kreuzberg namespace,
        // the only namespace this snippet imports — confirm.
        return Kreuzberg.ProcessingStage.Middle;
    }

    /// <summary>Returns <c>true</c> only for results whose MIME type is <c>application/pdf</c>.</summary>
    public bool ShouldProcess(ExtractionResult result, ExtractionConfig config)
    {
        return result.MimeType == "application/pdf";
    }

    public ulong EstimatedDurationMs(ExtractionResult result)
    {
        return 10;
    }

    public int Priority()
    {
        return 50;
    }
}
|
||||
|
||||
/// <summary>Registers a <c>PdfOnlyProcessor</c> with the post-processor registry.</summary>
class Program
{
    static void Main()
    {
        PostProcessorRegistry.Register(new PdfOnlyProcessor());
    }
}
|
||||
```
|
||||
50
docs/snippets/csharp/plugins/plugin_extractor.md
Normal file
50
docs/snippets/csharp/plugins/plugin_extractor.md
Normal file
@@ -0,0 +1,50 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
var extractor = new CustomTextExtractor();
KreuzbergLib.RegisterDocumentExtractor(extractor);

/// <summary>Sample extractor that returns plain-text input upper-cased.</summary>
public class CustomTextExtractor : IDocumentExtractor
{
    public string Name => "custom-text-extractor";
    public string Version => "1.0.0";

    public void Initialize()
    {
        Console.WriteLine("Custom text extractor initialized");
    }

    public void Shutdown()
    {
        Console.WriteLine("Custom text extractor shut down");
    }

    /// <summary>Decodes the bytes as UTF-8 and returns the upper-cased text as the content.</summary>
    public ExtractionResult ExtractBytes(byte[] content, string mimeType, ExtractionConfig config)
    {
        var text = System.Text.Encoding.UTF8.GetString(content);

        return new ExtractionResult
        {
            // Invariant casing keeps the output identical on every machine; a
            // culture-sensitive ToUpper() varies with the current culture
            // (e.g. 'i' under tr-TR).
            Content = text.ToUpperInvariant(),
            MimeType = mimeType,
            DetectedLanguages = null
        };
    }

    /// <summary>Reads the whole file and delegates to <see cref="ExtractBytes"/>.</summary>
    public ExtractionResult ExtractFile(string path, string mimeType, ExtractionConfig config)
    {
        var content = System.IO.File.ReadAllBytes(path);
        return ExtractBytes(content, mimeType, config);
    }

    /// <summary>MIME types this extractor declares.</summary>
    public string[] SupportedMimeTypes()
    {
        return new[] { "text/plain" };
    }

    public int Priority()
    {
        return 50;
    }
}
|
||||
```
|
||||
52
docs/snippets/csharp/plugins/plugin_logging.md
Normal file
52
docs/snippets/csharp/plugins/plugin_logging.md
Normal file
@@ -0,0 +1,52 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
var processor = new LoggingPostProcessor();
PostProcessorRegistry.Register(processor);

/// <summary>Post-processor that logs the MIME type and content length of every result.</summary>
public class LoggingPostProcessor : IPostProcessor
{
    public string Name => "logging-processor";
    public string Version => "1.0.0";

    public void Initialize()
    {
        Console.WriteLine("Logging post-processor initialized");
    }

    public void Shutdown()
    {
        Console.WriteLine("Logging post-processor shut down");
    }

    /// <summary>
    /// Writes one line describing the result, plus a warning when the content
    /// is null or empty. The result itself is not modified.
    /// </summary>
    public void Process(ExtractionResult result, ExtractionConfig config)
    {
        // Read the length null-safely: the original dereferenced Content before
        // its own null/empty check, so a null Content threw instead of
        // reaching the warning.
        var contentLength = result.Content?.Length ?? 0;

        Console.WriteLine($"Processing: {result.MimeType}, Content length: {contentLength}");

        if (contentLength == 0)
        {
            Console.WriteLine("Warning: Extracted content is empty");
        }
    }

    public ProcessingStage ProcessingStage()
    {
        // Inside this class the simple name "ProcessingStage" binds to this
        // method, so "ProcessingStage.Early" does not compile (CS0119); the
        // enum must be namespace-qualified.
        // NOTE(review): assumes the enum is declared in the Kreuzberg namespace,
        // the only namespace this snippet imports — confirm.
        return Kreuzberg.ProcessingStage.Early;
    }

    public bool ShouldProcess(ExtractionResult result, ExtractionConfig config)
    {
        return true;
    }

    public ulong EstimatedDurationMs(ExtractionResult result)
    {
        return 1;
    }

    public int Priority()
    {
        return 10;
    }
}
|
||||
```
|
||||
59
docs/snippets/csharp/plugins/plugin_testing.md
Normal file
59
docs/snippets/csharp/plugins/plugin_testing.md
Normal file
@@ -0,0 +1,59 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
using Xunit;
|
||||
|
||||
/// <summary>xUnit tests covering registration and direct invocation of <c>TestValidator</c>.</summary>
public class CustomValidatorTests
{
    /// <summary>A registered validator's name appears in the validator listing.</summary>
    [Fact]
    public void TestValidatorRegistration()
    {
        ValidatorRegistry.Register(new TestValidator());

        Assert.Contains("test-validator", KreuzbergLib.ListValidators());
    }

    /// <summary>A non-empty result is eligible for validation and validates without throwing.</summary>
    [Fact]
    public void TestValidatorProcessing()
    {
        // Arrange
        var subject = new TestValidator();
        var options = new ExtractionConfig();
        var sample = new ExtractionResult
        {
            Content = "Test content with some length",
            MimeType = "text/plain"
        };

        // Act / Assert: walk the lifecycle in order.
        subject.Initialize();
        Assert.True(subject.ShouldValidate(sample, options));
        subject.Validate(sample, options);
        subject.Shutdown();
    }
}
|
||||
|
||||
/// <summary>Minimal validator used by the tests: rejects null or empty content.</summary>
public class TestValidator : IValidator
{
    public string Name => "test-validator";

    public string Version => "1.0.0";

    public void Initialize()
    {
    }

    public void Shutdown()
    {
    }

    /// <exception cref="KreuzbergException">Content is null or empty (code 1000).</exception>
    public void Validate(ExtractionResult result, ExtractionConfig config)
    {
        var hasContent = !string.IsNullOrEmpty(result.Content);
        if (hasContent)
        {
            return;
        }

        throw new KreuzbergException("Content cannot be empty", 1000);
    }

    /// <summary>Returns <c>true</c> only when the result has non-empty content.</summary>
    public bool ShouldValidate(ExtractionResult result, ExtractionConfig config) =>
        !string.IsNullOrEmpty(result.Content);

    public int Priority()
    {
        return 50;
    }
}
|
||||
```
|
||||
50
docs/snippets/csharp/plugins/plugin_validator.md
Normal file
50
docs/snippets/csharp/plugins/plugin_validator.md
Normal file
@@ -0,0 +1,50 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
var validator = new ContentTypeValidator("application/pdf", "text/plain");
ValidatorRegistry.Register(validator);

/// <summary>Validator that rejects results whose MIME type is not on an allow-list.</summary>
public class ContentTypeValidator : IValidator
{
    private readonly string[] _allowedMimeTypes;

    /// <param name="allowedMimeTypes">MIME types that pass validation.</param>
    public ContentTypeValidator(params string[] allowedMimeTypes)
    {
        _allowedMimeTypes = allowedMimeTypes;
    }

    public string Name => "content-type-validator";
    public string Version => "1.0.0";

    public void Initialize()
    {
        Console.WriteLine($"Content type validator initialized with types: {string.Join(", ", _allowedMimeTypes)}");
    }

    public void Shutdown()
    {
        Console.WriteLine("Content type validator shut down");
    }

    /// <exception cref="KreuzbergException">The result's MIME type is not allowed (code 1002).</exception>
    public void Validate(ExtractionResult result, ExtractionConfig config)
    {
        // MIME type and subtype names are case-insensitive (RFC 2045 §5.1), so
        // "Application/PDF" must match an allow-list entry of "application/pdf".
        if (!_allowedMimeTypes.Contains(result.MimeType, StringComparer.OrdinalIgnoreCase))
        {
            throw new KreuzbergException(
                $"MIME type {result.MimeType} not allowed. Allowed types: {string.Join(", ", _allowedMimeTypes)}",
                1002
            );
        }
    }

    public bool ShouldValidate(ExtractionResult result, ExtractionConfig config)
    {
        return true;
    }

    public int Priority()
    {
        return 50;
    }
}
|
||||
```
|
||||
62
docs/snippets/csharp/plugins/quality_score_validator.md
Normal file
62
docs/snippets/csharp/plugins/quality_score_validator.md
Normal file
@@ -0,0 +1,62 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
/// <summary>
/// Validator that derives a heuristic quality score from content length and
/// metadata presence, and rejects results scoring below 0.7.
/// </summary>
public class QualityScoreValidator : IValidator
{
    private const float MinimumQuality = 0.7f;

    public string Name => "quality-score-validator";

    public string Version => "1.0.0";

    public void Initialize() =>
        Console.WriteLine($"Quality score validator initialized (min score: {MinimumQuality})");

    public void Shutdown() =>
        Console.WriteLine("Quality score validator shut down");

    /// <exception cref="KreuzbergException">The computed score is below the minimum (code 1003).</exception>
    public void Validate(ExtractionResult result, ExtractionConfig config)
    {
        var computed = CalculateQualityScore(result);
        if (computed < MinimumQuality)
        {
            throw new KreuzbergException(
                $"Quality score {computed:F2} below minimum {MinimumQuality}",
                1003
            );
        }
    }

    /// <summary>Returns <c>true</c> only when the result has non-empty content.</summary>
    public bool ShouldValidate(ExtractionResult result, ExtractionConfig config) =>
        !string.IsNullOrEmpty(result.Content);

    public int Priority() => 50;

    /// <summary>
    /// Scores 0.8 for content longer than 100 characters (0.5 otherwise), adds
    /// 0.2 when metadata is present, and caps the total at 1.0.
    /// </summary>
    private float CalculateQualityScore(ExtractionResult result)
    {
        var lengthPart = result.Content.Length > 100 ? 0.8f : 0.5f;
        var metadataPart = result.Metadata != null ? 0.2f : 0.0f;

        return Math.Min(lengthPart + metadataPart, 1.0f);
    }
}
|
||||
|
||||
/// <summary>Registers a <c>QualityScoreValidator</c> with the validator registry.</summary>
class Program
{
    static void Main()
    {
        ValidatorRegistry.Register(new QualityScoreValidator());
    }
}
|
||||
```
|
||||
59
docs/snippets/csharp/plugins/stateful_plugin.md
Normal file
59
docs/snippets/csharp/plugins/stateful_plugin.md
Normal file
@@ -0,0 +1,59 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
using System.Collections.Concurrent;
|
||||
|
||||
var processor = new StatefulPostProcessor();
PostProcessorRegistry.Register(processor);

/// <summary>
/// Post-processor that keeps state across calls: a call counter and a map from
/// call number to the MIME type seen on that call.
/// </summary>
public class StatefulPostProcessor : IPostProcessor
{
    private int _callCount = 0;
    private readonly ConcurrentDictionary<string, string> _cache = new();

    public string Name => "stateful-processor";
    public string Version => "1.0.0";

    public void Initialize()
    {
        Console.WriteLine("Stateful processor initialized");
        _callCount = 0;
        _cache.Clear();
    }

    public void Shutdown()
    {
        Console.WriteLine($"Stateful processor called {_callCount} times");
        Console.WriteLine($"Cache contains {_cache.Count} entries");
    }

    /// <summary>Records the result's MIME type under a key derived from the call number and logs it.</summary>
    public void Process(ExtractionResult result, ExtractionConfig config)
    {
        // Atomic increment, matching the thread-safe dictionary next to it:
        // with a plain "_callCount++", concurrent calls could read the same
        // number and collide on the same key.
        var callNumber = System.Threading.Interlocked.Increment(ref _callCount);

        var key = $"last_mime_{callNumber}";
        _cache.TryAdd(key, result.MimeType);

        Console.WriteLine($"Processing #{callNumber}: {result.MimeType}");
    }

    public ProcessingStage ProcessingStage()
    {
        // Inside this class the simple name "ProcessingStage" binds to this
        // method, so "ProcessingStage.Middle" does not compile (CS0119); the
        // enum must be namespace-qualified.
        // NOTE(review): assumes the enum is declared in the Kreuzberg namespace,
        // the only namespace this snippet imports — confirm.
        return Kreuzberg.ProcessingStage.Middle;
    }

    public bool ShouldProcess(ExtractionResult result, ExtractionConfig config)
    {
        return true;
    }

    public ulong EstimatedDurationMs(ExtractionResult result)
    {
        return 5;
    }

    public int Priority()
    {
        return 50;
    }
}
|
||||
```
|
||||
35
docs/snippets/csharp/plugins/unregister_plugins.md
Normal file
35
docs/snippets/csharp/plugins/unregister_plugins.md
Normal file
@@ -0,0 +1,35 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
// Register a processor, list what is active, unregister it by name, list again.
UnregisterableProcessor processor = new();
PostProcessorRegistry.Register(processor);
Console.WriteLine("Processor registered");
PrintActiveProcessors();

PostProcessorRegistry.Unregister(processor.Name);
Console.WriteLine("Processor unregistered");
PrintActiveProcessors();

// Prints the names currently returned by KreuzbergLib.ListPostProcessors().
static void PrintActiveProcessors()
{
    var active = KreuzbergLib.ListPostProcessors();
    Console.WriteLine($"Active processors: {string.Join(", ", active)}");
}
|
||||
|
||||
/// <summary>
/// Minimal post-processor used to demonstrate registering and then
/// unregistering a plugin by name.
/// </summary>
public class UnregisterableProcessor : IPostProcessor
{
    public string Name => "removable-processor";
    public string Version => "1.0.0";

    /// <summary>No setup is required.</summary>
    public void Initialize() { }

    /// <summary>No teardown is required.</summary>
    public void Shutdown() { }

    /// <summary>Prints a progress line; the result is not modified.</summary>
    public void Process(ExtractionResult result, ExtractionConfig config)
    {
        Console.WriteLine("Processing...");
    }

    // Inside this class the simple name ProcessingStage binds to this method,
    // so the enum must be qualified (otherwise CS0119).
    // Assumes the enum lives in the Kreuzberg namespace imported by the snippet.
    public ProcessingStage ProcessingStage() => Kreuzberg.ProcessingStage.Middle;

    public bool ShouldProcess(ExtractionResult result, ExtractionConfig config) => true;

    public ulong EstimatedDurationMs(ExtractionResult result) => 10;

    public int Priority() => 50;
}
|
||||
```
|
||||
62
docs/snippets/csharp/plugins/word_count_processor.md
Normal file
62
docs/snippets/csharp/plugins/word_count_processor.md
Normal file
@@ -0,0 +1,62 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
// Create the processor and register it with the post-processor registry.
WordCountProcessor processor = new();
PostProcessorRegistry.Register(processor);
|
||||
|
||||
/// <summary>
/// Post-processor that counts the whitespace-separated words in the extracted
/// content and prints the total.
/// </summary>
public class WordCountProcessor : IPostProcessor
{
    public string Name => "word-count";
    public string Version => "1.0.0";

    /// <summary>Prints an initialization message.</summary>
    public void Initialize()
    {
        Console.WriteLine("Word count processor initialized");
    }

    /// <summary>Prints a shutdown message.</summary>
    public void Shutdown()
    {
        Console.WriteLine("Word count processor shut down");
    }

    /// <summary>
    /// Counts the words in <c>result.Content</c>, makes sure the result has a
    /// metadata object, and prints the count.
    /// </summary>
    /// <param name="result">Extraction result to inspect.</param>
    /// <param name="config">Extraction configuration; not read by this method.</param>
    public void Process(ExtractionResult result, ExtractionConfig config)
    {
        var wordCount = CountWords(result.Content);

        result.Metadata ??= new Metadata();
        // NOTE(review): the count is only printed; nothing is written to
        // result.Metadata. If it is meant to be stored there, confirm which
        // Metadata member should receive it.

        Console.WriteLine($"Document contains {wordCount} words");
    }

    /// <summary>Returns the stage this processor runs in.</summary>
    public ProcessingStage ProcessingStage()
    {
        // Inside this class the simple name ProcessingStage binds to this method,
        // so the enum must be qualified (otherwise CS0119).
        // Assumes the enum lives in the Kreuzberg namespace imported by the snippet.
        return Kreuzberg.ProcessingStage.Early;
    }

    /// <summary>Returns <c>true</c> when the result has non-null, non-empty content.</summary>
    public bool ShouldProcess(ExtractionResult result, ExtractionConfig config)
    {
        return !string.IsNullOrEmpty(result.Content);
    }

    /// <summary>Returns a fixed estimate of 5.</summary>
    public ulong EstimatedDurationMs(ExtractionResult result)
    {
        return 5;
    }

    /// <summary>Returns the fixed priority value 50.</summary>
    public int Priority()
    {
        return 50;
    }

    /// <summary>
    /// Counts maximal runs of non-whitespace characters in <paramref name="content"/>.
    /// </summary>
    /// <param name="content">Text to scan; may be null or blank.</param>
    /// <returns>The number of words, or 0 for null/blank input.</returns>
    private static int CountWords(string content)
    {
        if (string.IsNullOrWhiteSpace(content))
        {
            return 0;
        }

        // Scan once using the same whitespace definition as the guard above
        // (char.IsWhiteSpace), instead of splitting on a fixed set of four
        // characters and allocating an array of every word.
        var count = 0;
        var inWord = false;
        foreach (var ch in content)
        {
            if (char.IsWhiteSpace(ch))
            {
                inWord = false;
            }
            else if (!inWord)
            {
                inWord = true;
                count++;
            }
        }

        return count;
    }
}
|
||||
```
|
||||
Reference in New Issue
Block a user