Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,24 @@
using Kreuzberg;
// Chunk a PDF with page extraction enabled, then print a short preview of each
// chunk together with the page (or page range) recorded in its metadata.
// NOTE(review): other snippets name the chunk-size options MaxChars/MaxOverlap
// (or MaxCharacters) - confirm ChunkSize/Overlap against the installed bindings.
var config = new ExtractionConfig
{
    Chunking = new ChunkingConfig { ChunkSize = 500, Overlap = 50 },
    Pages = new PageConfig { ExtractPages = true }
};

// KreuzbergLib is the class the other snippets call; "Kreuzberg" alone is the
// namespace brought in by the using directive.
var result = KreuzbergLib.ExtractFileSync("document.pdf", config);

if (result.Chunks != null)
{
    foreach (var chunk in result.Chunks)
    {
        // Both ends of the range are needed to print a meaningful label.
        if (chunk.Metadata.FirstPage.HasValue && chunk.Metadata.LastPage.HasValue)
        {
            var first = chunk.Metadata.FirstPage.Value;
            var last = chunk.Metadata.LastPage.Value;
            var pageRange = first == last
                ? $"Page {first}"
                : $"Pages {first}-{last}";

            // Clamp the preview: a fixed [..50] slice throws
            // ArgumentOutOfRangeException on chunks shorter than 50 characters.
            var preview = chunk.Content[..Math.Min(50, chunk.Content.Length)];
            Console.WriteLine($"Chunk: {preview}... ({pageRange})");
        }
    }
}

View File

@@ -0,0 +1,33 @@
using Kreuzberg;
/// <summary>
/// Awaits a single extraction and reports on it, then starts three extractions
/// and awaits them together.
/// </summary>
class Program
{
    static async Task Main()
    {
        try
        {
            await ReportSingleAsync();
            await ReportManyAsync();
        }
        catch (KreuzbergException failure)
        {
            Console.WriteLine($"Extraction failed: {failure.Message}");
        }
    }

    /// <summary>Extracts one document and prints its content length and MIME type.</summary>
    static async Task ReportSingleAsync()
    {
        var extracted = await KreuzbergLib.ExtractFileAsync("document.pdf");
        Console.WriteLine($"Content length: {extracted.Content.Length}");
        Console.WriteLine($"MIME type: {extracted.MimeType}");
    }

    /// <summary>Starts three extractions, waits for all of them, and prints each length.</summary>
    static async Task ReportManyAsync()
    {
        var first = KreuzbergLib.ExtractFileAsync("file1.pdf");
        var second = KreuzbergLib.ExtractFileAsync("file2.pdf");
        var third = KreuzbergLib.ExtractFileAsync("file3.pdf");

        var completed = await Task.WhenAll(first, second, third);
        foreach (var item in completed)
        {
            Console.WriteLine($"Extracted {item.Content.Length} characters");
        }
    }
}

View File

@@ -0,0 +1,46 @@
using Kreuzberg;
using System.Collections.Generic;
/// <summary>
/// Extracts the same three PDFs twice with one shared configuration: first one
/// file at a time, then all files concurrently with a combined character total.
/// </summary>
class Program
{
    static async Task Main()
    {
        var settings = new ExtractionConfig
        {
            UseCache = true,
            EnableQualityProcessing = true
        };
        string[] inputs = { "document1.pdf", "document2.pdf", "document3.pdf" };

        try
        {
            await RunOneByOneAsync(inputs, settings);
            await RunTogetherAsync(inputs, settings);
        }
        catch (KreuzbergException failure)
        {
            Console.WriteLine($"Batch processing error: {failure.Message}");
        }
    }

    /// <summary>Awaits each extraction before starting the next and logs every file.</summary>
    static async Task RunOneByOneAsync(string[] inputs, ExtractionConfig settings)
    {
        var collected = new List<ExtractionResult>();
        foreach (var input in inputs)
        {
            var extracted = await KreuzbergLib.ExtractFileAsync(input, settings);
            collected.Add(extracted);
            Console.WriteLine($"Processed {input}: {extracted.Content.Length} chars");
        }
    }

    /// <summary>Starts every extraction up front, awaits them all, and prints the summed length.</summary>
    static async Task RunTogetherAsync(string[] inputs, ExtractionConfig settings)
    {
        var pending = inputs
            .Select(input => KreuzbergLib.ExtractFileAsync(input, settings))
            .ToArray();
        var finished = await Task.WhenAll(pending);

        var combinedLength = finished.Sum(item => item.Content.Length);
        Console.WriteLine($"Total extracted: {combinedLength} characters");
    }
}

View File

@@ -0,0 +1,52 @@
```csharp title="C#"
using Kreuzberg;
/// <summary>
/// Extracts a PDF with chunking and page extraction enabled and prints, for every
/// chunk that carries a full page range, a preview of up to 50 characters.
/// </summary>
class Program
{
    static async Task Main()
    {
        var chunkingOptions = new ChunkingConfig
        {
            MaxCharacters = 500,
            Overlap = 50
        };
        var pageOptions = new PageConfig { ExtractPages = true };
        var config = new ExtractionConfig
        {
            Chunking = chunkingOptions,
            Pages = pageOptions
        };

        try
        {
            var extraction = await KreuzbergLib
                .ExtractFileAsync("document.pdf", config)
                .ConfigureAwait(false);

            // Nothing to print when no chunk list was produced.
            if (extraction.Chunks == null)
            {
                return;
            }

            foreach (var piece in extraction.Chunks)
            {
                // Skip chunks that lack either end of the page range.
                if (!piece.Metadata.FirstPage.HasValue || !piece.Metadata.LastPage.HasValue)
                {
                    continue;
                }

                var startPage = piece.Metadata.FirstPage.Value;
                var endPage = piece.Metadata.LastPage.Value;

                string pageRange;
                if (startPage == endPage)
                {
                    pageRange = $"Page {startPage}";
                }
                else
                {
                    pageRange = $"Pages {startPage}-{endPage}";
                }

                var previewLength = Math.Min(50, piece.Content.Length);
                var preview = piece.Content[..previewLength];
                Console.WriteLine($"Chunk: {preview}... ({pageRange})");
            }
        }
        catch (KreuzbergException failure)
        {
            Console.WriteLine($"Error: {failure.Message}");
        }
    }
}
```

View File

@@ -0,0 +1,75 @@
using Kreuzberg;
/// <summary>
/// Two chunking examples: <see cref="Main"/> requests embeddings for each chunk,
/// and <see cref="PrependHeadingContextExample"/> enables PrependHeadingContext.
/// </summary>
class Program
{
    /// <summary>
    /// Extracts a PDF with an embedding preset configured and prints the chunk
    /// count, each chunk's content length, and its embedding size when present.
    /// </summary>
    static async Task Main()
    {
        var config = new ExtractionConfig
        {
            Chunking = new ChunkingConfig
            {
                MaxChars = 1000,
                MaxOverlap = 200,
                Embedding = new EmbeddingConfig
                {
                    Model = EmbeddingModelType.Preset("all-minilm-l6-v2"),
                    Normalize = true,
                    BatchSize = 32
                }
            }
        };

        try
        {
            var result = await KreuzbergLib.ExtractFileAsync(
                "document.pdf",
                config
            ).ConfigureAwait(false);

            // Other snippets treat Chunks as nullable, so guard before dereferencing
            // instead of failing with a NullReferenceException.
            Console.WriteLine($"Chunks: {result.Chunks?.Count ?? 0}");
            if (result.Chunks != null)
            {
                foreach (var chunk in result.Chunks)
                {
                    Console.WriteLine($"Content length: {chunk.Content.Length}");
                    if (chunk.Embedding != null)
                    {
                        Console.WriteLine($"Embedding dimensions: {chunk.Embedding.Length}");
                    }
                }
            }
        }
        catch (KreuzbergException ex)
        {
            Console.WriteLine($"Error: {ex.Message}");
        }
    }

    /// <summary>
    /// Extracts a Markdown file with PrependHeadingContext enabled and prints up
    /// to the first 100 characters of every chunk.
    /// </summary>
    static async Task PrependHeadingContextExample()
    {
        var config = new ExtractionConfig
        {
            Chunking = new ChunkingConfig
            {
                MaxChars = 500,
                MaxOverlap = 50,
                PrependHeadingContext = true
            }
        };

        try
        {
            var result = await KreuzbergLib.ExtractFileAsync(
                "document.md",
                config
            ).ConfigureAwait(false);

            // No chunk list means there is nothing to preview.
            if (result.Chunks == null)
            {
                return;
            }

            foreach (var chunk in result.Chunks)
            {
                // Each chunk's content is prefixed with its heading breadcrumb
                Console.WriteLine(chunk.Content[..Math.Min(100, chunk.Content.Length)]);
            }
        }
        catch (KreuzbergException ex)
        {
            Console.WriteLine($"Error: {ex.Message}");
        }
    }
}

View File

@@ -0,0 +1,132 @@
```csharp title="C#"
using Kreuzberg;
/// <summary>
/// Extracts a PDF with an embedding preset configured and prints the chunk count,
/// each chunk's content length, and its embedding size when present.
/// </summary>
class Program
{
    static async Task Main()
    {
        var config = new ExtractionConfig
        {
            Chunking = new ChunkingConfig
            {
                MaxChars = 1000,
                MaxOverlap = 200,
                Embedding = new EmbeddingConfig
                {
                    Model = EmbeddingModelType.Preset("all-minilm-l6-v2"),
                    Normalize = true,
                    BatchSize = 32
                }
            }
        };

        try
        {
            var result = await KreuzbergLib.ExtractFileAsync(
                "document.pdf",
                config
            ).ConfigureAwait(false);

            // Other snippets treat Chunks as nullable, so guard before dereferencing
            // instead of failing with a NullReferenceException.
            Console.WriteLine($"Chunks: {result.Chunks?.Count ?? 0}");
            if (result.Chunks != null)
            {
                foreach (var chunk in result.Chunks)
                {
                    Console.WriteLine($"Content length: {chunk.Content.Length}");
                    if (chunk.Embedding != null)
                    {
                        Console.WriteLine($"Embedding dimensions: {chunk.Embedding.Length}");
                    }
                }
            }
        }
        catch (KreuzbergException ex)
        {
            Console.WriteLine($"Error: {ex.Message}");
        }
    }
}
```
```csharp title="C# - Markdown with Heading Context"
using Kreuzberg;
/// <summary>
/// Extracts a Markdown file with tokenizer-based chunk sizing and prints the
/// headings attached to each chunk's heading context.
/// </summary>
class Program
{
    static async Task Main()
    {
        var config = new ExtractionConfig
        {
            Chunking = new ChunkingConfig
            {
                MaxChars = 500,
                MaxOverlap = 50,
                Sizing = new ChunkSizingConfig
                {
                    Type = "tokenizer",
                    Model = "Xenova/gpt-4o"
                }
            }
        };

        try
        {
            var result = await KreuzbergLib.ExtractFileAsync(
                "document.md",
                config
            ).ConfigureAwait(false);

            // Other snippets treat Chunks as nullable; bail out rather than
            // enumerating a null list.
            if (result.Chunks == null)
            {
                return;
            }

            foreach (var chunk in result.Chunks)
            {
                // Chunks without a heading context (or without headings) are skipped.
                if (chunk.HeadingContext?.Headings != null)
                {
                    Console.WriteLine("Headings:");
                    foreach (var heading in chunk.HeadingContext.Headings)
                    {
                        Console.WriteLine($"  Level {heading.Level}: {heading.Text}");
                    }
                }
            }
        }
        catch (KreuzbergException ex)
        {
            Console.WriteLine($"Error: {ex.Message}");
        }
    }
}
```
```csharp title="C# - Prepend Heading Context"
using Kreuzberg;
/// <summary>
/// Extracts a Markdown file with PrependHeadingContext enabled and prints up to
/// the first 100 characters of every chunk.
/// </summary>
class Program
{
    static async Task Main()
    {
        var config = new ExtractionConfig
        {
            Chunking = new ChunkingConfig
            {
                MaxChars = 500,
                MaxOverlap = 50,
                PrependHeadingContext = true
            }
        };

        try
        {
            var result = await KreuzbergLib.ExtractFileAsync(
                "document.md",
                config
            ).ConfigureAwait(false);

            // Other snippets treat Chunks as nullable; bail out rather than
            // enumerating a null list.
            if (result.Chunks == null)
            {
                return;
            }

            foreach (var chunk in result.Chunks)
            {
                // Each chunk's content is prefixed with its heading breadcrumb
                Console.WriteLine(chunk.Content[..Math.Min(100, chunk.Content.Length)]);
            }
        }
        catch (KreuzbergException ex)
        {
            Console.WriteLine($"Error: {ex.Message}");
        }
    }
}
```

View File

@@ -0,0 +1,83 @@
using Kreuzberg;
using System.Collections.Generic;
using System.Linq;
/// <summary>
/// Sketch of a retrieval pipeline: extract a PDF into embedded chunks, copy them
/// into an in-memory list, and print the first three entries a search returns.
/// </summary>
class RagPipelineExample
{
    static async Task Main()
    {
        var config = new ExtractionConfig
        {
            Chunking = new ChunkingConfig
            {
                MaxChars = 500,
                MaxOverlap = 50,
                Embedding = new EmbeddingConfig
                {
                    Model = EmbeddingModelType.Preset("all-mpnet-base-v2"),
                    Normalize = true,
                    BatchSize = 16
                }
            }
        };

        try
        {
            var result = await KreuzbergLib.ExtractFileAsync(
                "research_paper.pdf",
                config
            ).ConfigureAwait(false);

            var vectorStore = await BuildVectorStoreAsync(result.Chunks)
                .ConfigureAwait(false);

            var query = "machine learning optimization";
            var relevantChunks = await SearchAsync(vectorStore, query)
                .ConfigureAwait(false);

            Console.WriteLine($"Found {relevantChunks.Count} relevant chunks");
            foreach (var chunk in relevantChunks.Take(3))
            {
                // Clamp the preview: a fixed [..80] slice throws
                // ArgumentOutOfRangeException on chunks shorter than 80 characters.
                var preview = chunk.Content[..Math.Min(80, chunk.Content.Length)];
                Console.WriteLine($"Content: {preview}...");
                Console.WriteLine($"Similarity: {chunk.Similarity:F3}\n");
            }
        }
        catch (KreuzbergException ex)
        {
            Console.WriteLine($"Error: {ex.Message}");
        }
    }

    /// <summary>
    /// Copies each chunk's content and embedding into a <see cref="VectorEntry"/>
    /// with a similarity of zero.
    /// </summary>
    /// <param name="chunks">Chunks to store; a null sequence yields an empty store.</param>
    /// <returns>One entry per chunk, in input order.</returns>
    static async Task<List<VectorEntry>> BuildVectorStoreAsync(
        IEnumerable<Chunk> chunks)
    {
        // Other snippets treat the extraction's chunk list as nullable, so a null
        // input becomes an empty store instead of an exception inside Select.
        var source = chunks ?? Enumerable.Empty<Chunk>();

        return await Task.Run(() =>
        {
            return source.Select(c => new VectorEntry
            {
                Content = c.Content,
                // Chunks without an embedding get an empty vector.
                Embedding = c.Embedding?.ToArray() ?? Array.Empty<float>(),
                Similarity = 0f
            }).ToList();
        }).ConfigureAwait(false);
    }

    /// <summary>
    /// Returns the store ordered by descending <see cref="VectorEntry.Similarity"/>.
    /// </summary>
    /// <param name="store">Entries to rank.</param>
    /// <param name="query">Currently unused: no query embedding or scoring is performed here.</param>
    static async Task<List<VectorEntry>> SearchAsync(
        List<VectorEntry> store,
        string query)
    {
        return await Task.Run(() =>
        {
            return store
                .OrderByDescending(e => e.Similarity)
                .ToList();
        }).ConfigureAwait(false);
    }

    /// <summary>In-memory record holding a chunk's text, embedding, and similarity score.</summary>
    class VectorEntry
    {
        public string Content { get; set; } = string.Empty;
        public float[] Embedding { get; set; } = Array.Empty<float>();
        public float Similarity { get; set; }
    }
}

View File

@@ -0,0 +1,85 @@
```csharp title="C#"
using Kreuzberg;
using System.Collections.Generic;
using System.Linq;
/// <summary>
/// Sketch of a retrieval pipeline: extract a PDF into embedded chunks, copy them
/// into an in-memory list, and print the first three entries a search returns.
/// </summary>
class RagPipelineExample
{
    static async Task Main()
    {
        var config = new ExtractionConfig
        {
            Chunking = new ChunkingConfig
            {
                MaxChars = 500,
                MaxOverlap = 50,
                Embedding = new EmbeddingConfig
                {
                    Model = EmbeddingModelType.Preset("all-mpnet-base-v2"),
                    Normalize = true,
                    BatchSize = 16
                }
            }
        };

        try
        {
            var result = await KreuzbergLib.ExtractFileAsync(
                "research_paper.pdf",
                config
            ).ConfigureAwait(false);

            var vectorStore = await BuildVectorStoreAsync(result.Chunks)
                .ConfigureAwait(false);

            var query = "machine learning optimization";
            var relevantChunks = await SearchAsync(vectorStore, query)
                .ConfigureAwait(false);

            Console.WriteLine($"Found {relevantChunks.Count} relevant chunks");
            foreach (var chunk in relevantChunks.Take(3))
            {
                // Clamp the preview: a fixed [..80] slice throws
                // ArgumentOutOfRangeException on chunks shorter than 80 characters.
                var preview = chunk.Content[..Math.Min(80, chunk.Content.Length)];
                Console.WriteLine($"Content: {preview}...");
                Console.WriteLine($"Similarity: {chunk.Similarity:F3}\n");
            }
        }
        catch (KreuzbergException ex)
        {
            Console.WriteLine($"Error: {ex.Message}");
        }
    }

    /// <summary>
    /// Copies each chunk's content and embedding into a <see cref="VectorEntry"/>
    /// with a similarity of zero.
    /// </summary>
    /// <param name="chunks">Chunks to store; a null sequence yields an empty store.</param>
    /// <returns>One entry per chunk, in input order.</returns>
    static async Task<List<VectorEntry>> BuildVectorStoreAsync(
        IEnumerable<Chunk> chunks)
    {
        // Other snippets treat the extraction's chunk list as nullable, so a null
        // input becomes an empty store instead of an exception inside Select.
        var source = chunks ?? Enumerable.Empty<Chunk>();

        return await Task.Run(() =>
        {
            return source.Select(c => new VectorEntry
            {
                Content = c.Content,
                // Chunks without an embedding get an empty vector.
                Embedding = c.Embedding?.ToArray() ?? Array.Empty<float>(),
                Similarity = 0f
            }).ToList();
        }).ConfigureAwait(false);
    }

    /// <summary>
    /// Returns the store ordered by descending <see cref="VectorEntry.Similarity"/>.
    /// </summary>
    /// <param name="store">Entries to rank.</param>
    /// <param name="query">Currently unused: no query embedding or scoring is performed here.</param>
    static async Task<List<VectorEntry>> SearchAsync(
        List<VectorEntry> store,
        string query)
    {
        return await Task.Run(() =>
        {
            return store
                .OrderByDescending(e => e.Similarity)
                .ToList();
        }).ConfigureAwait(false);
    }

    /// <summary>In-memory record holding a chunk's text, embedding, and similarity score.</summary>
    class VectorEntry
    {
        public string Content { get; set; } = string.Empty;
        public float[] Embedding { get; set; } = Array.Empty<float>();
        public float Similarity { get; set; }
    }
}
```

View File

@@ -0,0 +1,72 @@
```csharp title="C#"
using System;
using System.Threading.Tasks;
using Kreuzberg;
// Runs one extraction with quality processing, language detection, token
// reduction, chunking with embeddings, and keyword extraction all configured,
// then prints a summary of whatever the result contains.
async Task RunRagPipeline()
{
    var config = new ExtractionConfig
    {
        EnableQualityProcessing = true,
        LanguageDetection = new LanguageDetectionConfig
        {
            Enabled = true,
            DetectMultiple = true,
            MinConfidence = 0.8,
        },
        TokenReduction = new TokenReductionConfig
        {
            Mode = "moderate",
            PreserveImportantWords = true,
        },
        Chunking = new ChunkingConfig
        {
            MaxChars = 512,
            MaxOverlap = 50,
            // Use the typed embedding config like the other snippets do. The
            // previous Dictionary-based value relied on a namespace this snippet
            // never imports and did not match how Embedding is assigned elsewhere.
            Embedding = new EmbeddingConfig
            {
                Model = EmbeddingModelType.Preset("balanced"),
            },
            Enabled = true,
        },
        Keywords = new KeywordConfig
        {
            Algorithm = "yake",
            MaxKeywords = 10,
        },
    };

    var result = await KreuzbergLib.ExtractFileAsync("document.pdf", config);

    Console.WriteLine($"Content length: {result.Content.Length} characters");

    // Each optional section is printed only when it is present and non-empty.
    if (result.DetectedLanguages?.Count > 0)
    {
        Console.WriteLine($"Languages: {string.Join(", ", result.DetectedLanguages)}");
    }

    if (result.Chunks?.Count > 0)
    {
        Console.WriteLine($"Total chunks: {result.Chunks.Count}");
        var firstChunk = result.Chunks[0];
        Console.WriteLine($"First chunk tokens: {firstChunk.Metadata.TokenCount}");
        if (firstChunk.Embedding?.Length > 0)
        {
            Console.WriteLine($"Embedding dimensions: {firstChunk.Embedding.Length}");
        }
    }

    Console.WriteLine($"Quality score: {result.QualityScore}");

    if (result.ExtractedKeywords?.Count > 0)
    {
        Console.WriteLine($"Keywords: {string.Join(", ", result.ExtractedKeywords)}");
    }
}

await RunRagPipeline();
```

View File

@@ -0,0 +1,63 @@
using Kreuzberg;
using System.Collections.Generic;
/// <summary>
/// In-memory cache in front of <c>KreuzbergLib.ExtractFileAsync</c>, keyed by file
/// path plus configuration. Not synchronized; intended for single-threaded use.
/// </summary>
class CustomCacheBackend
{
    private readonly Dictionary<string, ExtractionResult> _cache = new();

    /// <summary>
    /// Returns the cached result for this path/config pair, extracting and
    /// caching it first when no entry exists.
    /// </summary>
    /// <param name="filePath">Path of the document to extract.</param>
    /// <param name="config">Extraction settings; part of the cache key.</param>
    public async Task<ExtractionResult> GetOrExtractAsync(
        string filePath,
        ExtractionConfig config)
    {
        var cacheKey = GenerateCacheKey(filePath, config);
        if (_cache.TryGetValue(cacheKey, out var cachedResult))
        {
            Console.WriteLine("Using cached result");
            return cachedResult;
        }

        var result = await KreuzbergLib.ExtractFileAsync(filePath, config);
        _cache[cacheKey] = result;
        Console.WriteLine("Result cached");
        return result;
    }

    /// <summary>Builds the cache key from the path and the serialized configuration.</summary>
    private static string GenerateCacheKey(string filePath, ExtractionConfig config)
    {
        // The earlier key used config.ToString().GetHashCode(): a 32-bit string
        // hash that can collide, and unless ToString is overridden it is the same
        // for every config, so different settings shared one cache entry.
        // NOTE(review): assumes ExtractionConfig's public properties serialize
        // with System.Text.Json - confirm against the bindings.
        var configJson = System.Text.Json.JsonSerializer.Serialize(config);
        return $"{filePath}:{configJson}";
    }

    /// <summary>Removes every cached result.</summary>
    public void ClearCache()
    {
        _cache.Clear();
        Console.WriteLine("Cache cleared");
    }
}
/// <summary>
/// Requests the same document twice through <c>CustomCacheBackend</c>, printing
/// the content length each time, then clears the cache.
/// </summary>
class Program
{
    static async Task Main()
    {
        var cache = new CustomCacheBackend();
        var settings = new ExtractionConfig { UseCache = true };

        try
        {
            // Two identical requests: the second one exercises the cached path.
            for (var attempt = 1; attempt <= 2; attempt++)
            {
                var extracted = await cache.GetOrExtractAsync("document.pdf", settings);
                Console.WriteLine($"Result {attempt}: {extracted.Content.Length} chars");
            }

            cache.ClearCache();
        }
        catch (KreuzbergException failure)
        {
            Console.WriteLine($"Error: {failure.Message}");
        }
    }
}

View File

@@ -0,0 +1,68 @@
using Kreuzberg;
using System.Text.Json;
// NOTE: IDocumentExtractor interface is not available in C# bindings
/// <summary>
/// Turns a UTF-8 JSON payload into an <c>ExtractionResult</c> whose content is
/// every string value in the document, one per line.
/// </summary>
class CustomJsonProcessor
{
    /// <summary>
    /// Parses <paramref name="content"/> as JSON and collects its string values.
    /// </summary>
    /// <param name="content">UTF-8 encoded JSON bytes.</param>
    /// <param name="mimeType">MIME type copied onto the result.</param>
    /// <returns>A result carrying the collected text and empty metadata/tables.</returns>
    /// <exception cref="KreuzbergParsingException">The payload is not valid JSON.</exception>
    public static ExtractionResult ProcessJson(byte[] content, string mimeType)
    {
        try
        {
            var jsonContent = System.Text.Encoding.UTF8.GetString(content);

            // JsonDocument is IDisposable; dispose it once the text has been
            // copied out so its underlying buffers are released.
            using var document = JsonDocument.Parse(jsonContent);
            var text = ExtractText(document.RootElement);

            return new ExtractionResult
            {
                Content = text,
                MimeType = mimeType,
                Metadata = new Metadata(),
                Tables = new List<Table>(),
                Success = true
            };
        }
        catch (JsonException ex)
        {
            throw new KreuzbergParsingException($"Failed to parse JSON: {ex.Message}");
        }
    }

    /// <summary>
    /// Recursively concatenates string values: strings contribute their value plus
    /// a newline, arrays and objects contribute their children, anything else nothing.
    /// </summary>
    private static string ExtractText(JsonElement element)
    {
        return element.ValueKind switch
        {
            JsonValueKind.String => element.GetString() + "\n",
            JsonValueKind.Array => string.Concat(
                element.EnumerateArray().Select(ExtractText)
            ),
            JsonValueKind.Object => string.Concat(
                element.EnumerateObject()
                    .Select(p => ExtractText(p.Value))
            ),
            // Numbers, booleans and null carry no text.
            _ => ""
        };
    }
}
/// <summary>
/// Feeds a small JSON object through <c>CustomJsonProcessor.ProcessJson</c> and
/// prints the extracted content.
/// </summary>
class Program
{
    static void Main()
    {
        try
        {
            var sampleJson = @"{""name"": ""John"", ""age"": 30}";
            var payload = System.Text.Encoding.UTF8.GetBytes(sampleJson);

            var processed = CustomJsonProcessor.ProcessJson(payload, "application/json");
            Console.WriteLine($"Extracted: {processed.Content}");
        }
        catch (KreuzbergException failure)
        {
            Console.WriteLine($"Error: {failure.Message}");
        }
    }
}

View File

@@ -0,0 +1,84 @@
using Kreuzberg;
using System.Net.Http;
using System.Text.Json;
/// <summary>
/// OCR backend that posts image bytes to an HTTP endpoint and returns the
/// response body as the recognized text.
/// </summary>
class CloudOcrBackend : IOcrBackend, IDisposable
{
    private readonly string _apiKey;
    private readonly HttpClient _httpClient;

    /// <param name="apiKey">Bearer token sent with every request.</param>
    public CloudOcrBackend(string apiKey)
    {
        _apiKey = apiKey;
        _httpClient = new HttpClient();
    }

    public string Name => "cloud-ocr";

    /// <summary>
    /// Uploads the image and blocks until the service has answered.
    /// </summary>
    /// <param name="imageBytes">Raw image data to recognize.</param>
    /// <param name="config">OCR settings; not used by this backend.</param>
    /// <returns>The response body returned by the service.</returns>
    /// <exception cref="KreuzbergOcrException">The HTTP request failed.</exception>
    public string Process(ReadOnlySpan<byte> imageBytes, OcrConfig? config)
    {
        // A ReadOnlySpan cannot be captured by a lambda or used inside an async
        // method, so copy it to an array before leaving the synchronous path.
        var payload = imageBytes.ToArray();

        // The signature here is synchronous, so the async call is run on the
        // thread pool and waited on.
        return Task.Run(() => SendAsync(payload)).GetAwaiter().GetResult();
    }

    /// <summary>Posts the image as multipart form data and returns the response body.</summary>
    private async Task<string> SendAsync(byte[] payload)
    {
        try
        {
            using var content = new MultipartFormDataContent();
            content.Add(new ByteArrayContent(payload), "image");

            // Request and response are disposable; release them deterministically.
            using var request = new HttpRequestMessage(
                HttpMethod.Post,
                "https://api.example.com/ocr"
            )
            {
                Content = content
            };
            request.Headers.Add("Authorization", $"Bearer {_apiKey}");

            using var response = await _httpClient.SendAsync(request).ConfigureAwait(false);
            response.EnsureSuccessStatusCode();

            return await response.Content.ReadAsStringAsync().ConfigureAwait(false);
        }
        catch (HttpRequestException ex)
        {
            throw new KreuzbergOcrException($"Cloud OCR service error: {ex.Message}");
        }
    }

    /// <summary>Releases the owned <see cref="HttpClient"/>.</summary>
    public void Dispose()
    {
        _httpClient?.Dispose();
    }
}
/// <summary>
/// Registers a <c>CloudOcrBackend</c>, selects it by name in the OCR
/// configuration, and prints the text extracted from a PDF.
/// </summary>
class Program
{
    static void Main()
    {
        using var cloudBackend = new CloudOcrBackend("your-api-key");
        KreuzbergLib.RegisterOcrBackend(cloudBackend);

        try
        {
            // The backend name must match the one the registered backend reports.
            var ocrOptions = new OcrConfig { Backend = "cloud-ocr" };
            var settings = new ExtractionConfig { Ocr = ocrOptions };

            var extracted = KreuzbergLib.ExtractFileSync("document.pdf", settings);
            Console.WriteLine($"OCR text: {extracted.Content}");
        }
        catch (KreuzbergException failure)
        {
            Console.WriteLine($"Error: {failure.Message}");
        }
    }
}

View File

@@ -0,0 +1,80 @@
using Kreuzberg;
/// <summary>
/// Post-processor that counts whitespace-separated words in the content and
/// stores the count under the "word_count" key of the additional metadata.
/// </summary>
class WordCountPostProcessor : IPostProcessor
{
    public string Name => "word-count";

    public int Priority => 10;

    /// <summary>Adds the word count to the result's metadata and returns the same result.</summary>
    public ExtractionResult Process(ExtractionResult result)
    {
        var separators = new[] { ' ', '\n', '\r', '\t' };
        var words = result.Content.Split(separators, StringSplitOptions.RemoveEmptyEntries);
        var total = words.Length;

        // Create the additional-metadata dictionary on first use.
        result.Metadata.Additional ??=
            new Dictionary<string, System.Text.Json.Nodes.JsonNode?>();

        result.Metadata.Additional["word_count"] =
            System.Text.Json.Nodes.JsonValue.Create(total);
        return result;
    }
}
/// <summary>
/// Post-processor that stores a sentiment label under the "sentiment" key of the
/// additional metadata. The label is "neutral" for non-empty content and
/// "unknown" for empty content.
/// </summary>
class SentimentPostProcessor : IPostProcessor
{
    public string Name => "sentiment-analyzer";

    public int Priority => 5;

    /// <summary>Adds the sentiment label to the result's metadata and returns the same result.</summary>
    public ExtractionResult Process(ExtractionResult result)
    {
        var label = AnalyzeSentiment(result.Content);

        // Create the additional-metadata dictionary on first use.
        result.Metadata.Additional ??=
            new Dictionary<string, System.Text.Json.Nodes.JsonNode?>();

        result.Metadata.Additional["sentiment"] =
            System.Text.Json.Nodes.JsonValue.Create(label);
        return result;
    }

    /// <summary>Returns "neutral" when there is any text, otherwise "unknown".</summary>
    private string AnalyzeSentiment(string text)
    {
        if (text.Length > 0)
        {
            return "neutral";
        }

        return "unknown";
    }
}
/// <summary>
/// Registers the word-count and sentiment post-processors, extracts a PDF, and
/// prints whichever of their metadata entries are present.
/// </summary>
class Program
{
    static void Main()
    {
        var counter = new WordCountPostProcessor();
        var analyzer = new SentimentPostProcessor();
        KreuzbergLib.RegisterPostProcessor(counter);
        KreuzbergLib.RegisterPostProcessor(analyzer);

        try
        {
            var extracted = KreuzbergLib.ExtractFileSync("document.pdf");
            var extras = extracted.Metadata.Additional;

            // Without additional metadata there is nothing to report.
            if (extras == null)
            {
                return;
            }

            if (extras.TryGetValue("word_count", out var wordCount))
            {
                Console.WriteLine($"Word count: {wordCount}");
            }

            if (extras.TryGetValue("sentiment", out var sentiment))
            {
                Console.WriteLine($"Sentiment: {sentiment}");
            }
        }
        catch (KreuzbergException failure)
        {
            Console.WriteLine($"Error: {failure.Message}");
        }
    }
}

View File

@@ -0,0 +1,82 @@
using Kreuzberg;
/// <summary>
/// Validator that rejects results whose content is shorter than a minimum length.
/// </summary>
class MinLengthValidator : IValidator
{
    private readonly int _minLength;

    /// <param name="minLength">Smallest accepted content length, in characters.</param>
    public MinLengthValidator(int minLength) => _minLength = minLength;

    public string Name => "min-length";

    public int Priority => 10;

    /// <exception cref="KreuzbergValidationException">The content is shorter than the minimum.</exception>
    public void Validate(ExtractionResult result)
    {
        var actualLength = result.Content.Length;
        if (actualLength >= _minLength)
        {
            return;
        }

        throw new KreuzbergValidationException(
            $"Content too short: {actualLength} < {_minLength}"
        );
    }
}
/// <summary>
/// Validator that rejects results whose quality score compares below a minimum.
/// </summary>
class QualityScoreValidator : IValidator
{
    private readonly double _minScore;

    /// <param name="minScore">Lowest accepted quality score.</param>
    public QualityScoreValidator(double minScore) => _minScore = minScore;

    public string Name => "quality-score";

    public int Priority => 5;

    /// <exception cref="KreuzbergValidationException">The score is below the minimum.</exception>
    public void Validate(ExtractionResult result)
    {
        var actual = result.QualityScore;

        // Kept as a "less than" test on purpose: only a score that actually
        // compares below the threshold is rejected.
        var belowThreshold = actual < _minScore;
        if (!belowThreshold)
        {
            return;
        }

        throw new KreuzbergValidationException(
            $"Quality score too low: {actual:F2} < {_minScore:F2}"
        );
    }
}
/// <summary>
/// Registers a minimum-length and a quality-score validator, then extracts a PDF
/// and reports whether validation passed.
/// </summary>
class Program
{
    static void Main()
    {
        var lengthCheck = new MinLengthValidator(minLength: 50);
        var scoreCheck = new QualityScoreValidator(minScore: 0.7);
        KreuzbergLib.RegisterValidator(lengthCheck);
        KreuzbergLib.RegisterValidator(scoreCheck);

        try
        {
            var settings = new ExtractionConfig { EnableQualityProcessing = true };
            var extracted = KreuzbergLib.ExtractFileSync("document.pdf", settings);

            Console.WriteLine("Validation passed");
            Console.WriteLine($"Content length: {extracted.Content.Length}");
        }
        // The validation failure is caught before the general library exception
        // so it gets its own message.
        catch (KreuzbergValidationException rejected)
        {
            Console.WriteLine($"Validation failed: {rejected.Message}");
        }
        catch (KreuzbergException failure)
        {
            Console.WriteLine($"Error: {failure.Message}");
        }
    }
}

View File

@@ -0,0 +1,18 @@
```csharp title="C#"
using Kreuzberg;
// Embedding settings: preset model, batch size, normalization, download progress.
var embeddingOptions = new EmbeddingConfig
{
    Model = EmbeddingModelType.Preset("all-mpnet-base-v2"),
    BatchSize = 16,
    Normalize = true,
    ShowDownloadProgress = true
};

// Chunking settings that carry the embedding options.
var chunkingOptions = new ChunkingConfig
{
    MaxChars = 1000,
    Embedding = embeddingOptions
};

// Top-level extraction configuration.
var config = new ExtractionConfig { Chunking = chunkingOptions };
```

View File

@@ -0,0 +1,49 @@
```csharp title="C#"
using Kreuzberg;
using System;
using System.Collections.Generic;
using System.Threading.Tasks;
// Extract a PDF into embedded chunks and print, for each chunk, a generated id,
// a preview of up to 50 characters, and the embedding size when one is present.
var config = new ExtractionConfig
{
    Chunking = new ChunkingConfig
    {
        MaxChars = 512,
        MaxOverlap = 50,
        Embedding = new EmbeddingConfig
        {
            Model = EmbeddingModelType.Preset("balanced"),
            Normalize = true,
            BatchSize = 32,
            ShowDownloadProgress = false
        }
    }
};

// KreuzbergLib is the class the other snippets call; "Kreuzberg" alone is the
// namespace brought in by the using directive.
var result = await KreuzbergLib.ExtractFileAsync("document.pdf", config);

// Fall back to an empty list so the loop below needs no null check.
var chunks = result.Chunks ?? new List<Chunk>();

foreach (var (index, chunk) in chunks.WithIndex())
{
    var chunkId = $"doc_chunk_{index}";
    Console.WriteLine($"Chunk {chunkId}: {chunk.Content[..Math.Min(50, chunk.Content.Length)]}");
    if (chunk.Embedding != null)
    {
        Console.WriteLine($"  Embedding dimensions: {chunk.Embedding.Length}");
    }
}
/// <summary>Helpers for enumerating a sequence together with element positions.</summary>
internal static class EnumerableExtensions
{
    /// <summary>
    /// Lazily pairs every element with its zero-based position.
    /// </summary>
    /// <param name="items">Sequence to enumerate.</param>
    /// <returns>Tuples of (position, element) in source order.</returns>
    public static IEnumerable<(int Index, T Item)> WithIndex<T>(
        this IEnumerable<T> items)
    {
        using var cursor = items.GetEnumerator();
        for (var position = 0; cursor.MoveNext(); position++)
        {
            yield return (position, cursor.Current);
        }
    }
}
```

View File

@@ -0,0 +1,72 @@
using Kreuzberg;
/// <summary>
/// Three error-handling scenarios: extracting a file, extracting from a byte
/// buffer, and extracting a file that does not exist.
/// </summary>
class Program
{
    static async Task Main()
    {
        await ExtractFromFileAsync();
        await ExtractFromBytesAsync();
        await ExtractMissingFileAsync();
    }

    /// <summary>Extracts a file, with the specific exception types handled before the base type.</summary>
    static async Task ExtractFromFileAsync()
    {
        try
        {
            var extracted = await KreuzbergLib.ExtractFileAsync("document.pdf");
            Console.WriteLine($"Extracted {extracted.Content.Length} characters");
        }
        catch (KreuzbergParsingException parseError)
        {
            Console.WriteLine($"Failed to parse document: {parseError.Message}");
        }
        catch (KreuzbergOcrException ocrError)
        {
            Console.WriteLine($"OCR processing failed: {ocrError.Message}");
        }
        catch (KreuzbergMissingDependencyException dependencyError)
        {
            Console.WriteLine($"Missing dependency: {dependencyError.Message}");
        }
        catch (KreuzbergException failure)
        {
            Console.WriteLine($"Extraction error: {failure.Message}");
        }
    }

    /// <summary>Extracts from a four-byte "%PDF" buffer and prints at most 100 characters.</summary>
    static async Task ExtractFromBytesAsync()
    {
        try
        {
            var settings = new ExtractionConfig();
            byte[] header = { 0x25, 0x50, 0x44, 0x46 };
            var extracted = await KreuzbergLib.ExtractBytesAsync(
                header,
                "application/pdf",
                settings
            );

            string preview;
            if (extracted.Content.Length > 100)
            {
                preview = extracted.Content[..100] + "...";
            }
            else
            {
                preview = extracted.Content;
            }

            Console.WriteLine($"Extracted: {preview}");
        }
        catch (KreuzbergValidationException invalid)
        {
            Console.WriteLine($"Invalid configuration: {invalid.Message}");
        }
        catch (KreuzbergOcrException ocrError)
        {
            Console.WriteLine($"OCR failed: {ocrError.Message}");
        }
        catch (KreuzbergException failure)
        {
            Console.WriteLine($"Extraction failed: {failure.Message}");
        }
    }

    /// <summary>Attempts to extract a missing file; the result itself is not used.</summary>
    static async Task ExtractMissingFileAsync()
    {
        try
        {
            _ = await KreuzbergLib.ExtractFileAsync("nonexistent.pdf");
        }
        catch (KreuzbergIOException)
        {
            Console.WriteLine("File not found");
        }
        catch (Exception unexpected)
        {
            Console.WriteLine($"Unexpected error: {unexpected.Message}");
        }
    }
}

View File

@@ -0,0 +1,66 @@
using Kreuzberg;
/// <summary>
/// Extraction from in-memory bytes: a PDF with and without configuration, an
/// image, and a small set of files loaded into a dictionary.
/// </summary>
class Program
{
    static async Task Main()
    {
        try
        {
            var pdfBytes = await File.ReadAllBytesAsync("document.pdf");

            var result = await KreuzbergLib.ExtractBytesAsync(
                pdfBytes,
                "application/pdf"
            );
            Console.WriteLine($"Content: {result.Content}");
            Console.WriteLine($"MIME type: {result.MimeType}");

            var config = new ExtractionConfig
            {
                UseCache = true,
                EnableQualityProcessing = true
            };
            var result2 = await KreuzbergLib.ExtractBytesAsync(
                pdfBytes,
                "application/pdf",
                config
            );
            Console.WriteLine($"Configured extraction: {result2.Content.Length} chars");

            // Load actual image data. The earlier zero-length array held no image
            // at all, so this step could not produce any text.
            var imageBytes = await File.ReadAllBytesAsync("image.jpg");
            var imageResult = await KreuzbergLib.ExtractBytesAsync(
                imageBytes,
                "image/jpeg"
            );
            Console.WriteLine($"Image text: {imageResult.Content}");

            // Each entry maps a label to the file's bytes and its MIME type.
            var multipleFiles = new Dictionary<string, (byte[], string)>
            {
                { "file1", (await File.ReadAllBytesAsync("file1.pdf"), "application/pdf") },
                { "file2", (await File.ReadAllBytesAsync("file2.pdf"), "application/pdf") }
            };

            foreach (var (name, (bytes, mimeType)) in multipleFiles)
            {
                var extractResult = await KreuzbergLib.ExtractBytesAsync(
                    bytes,
                    mimeType
                );
                Console.WriteLine($"{name}: {extractResult.Content.Length} chars");
            }
        }
        catch (KreuzbergException ex)
        {
            Console.WriteLine($"Extraction error: {ex.Message}");
        }
        catch (IOException ex)
        {
            Console.WriteLine($"File I/O error: {ex.Message}");
        }
    }
}

View File

@@ -0,0 +1,73 @@
using Kreuzberg;
using System.Net.Http;
/// <summary>
/// Downloads documents over HTTP and extracts them from the downloaded bytes:
/// one URL with and without configuration, then three URLs concurrently.
/// </summary>
class Program
{
    static async Task Main()
    {
        using var http = new HttpClient();

        try
        {
            var payload = await http.GetByteArrayAsync("https://example.com/document.pdf");

            var plain = await KreuzbergLib.ExtractBytesAsync(payload, "application/pdf");
            Console.WriteLine($"Extracted from URL: {plain.Content.Length} chars");

            var qualitySettings = new ExtractionConfig { EnableQualityProcessing = true };
            var scored = await KreuzbergLib.ExtractBytesAsync(
                payload,
                "application/pdf",
                qualitySettings
            );
            Console.WriteLine($"Quality score: {scored.QualityScore}");

            string[] sources =
            {
                "https://example.com/doc1.pdf",
                "https://example.com/doc2.pdf",
                "https://example.com/doc3.pdf"
            };

            // A failed download is logged and reported as null, so the remaining
            // URLs still complete.
            var pending = sources.Select(async source =>
            {
                try
                {
                    var downloaded = await http.GetByteArrayAsync(source);
                    return await KreuzbergLib.ExtractBytesAsync(downloaded, "application/pdf");
                }
                catch (HttpRequestException downloadError)
                {
                    Console.WriteLine($"Download failed for {source}: {downloadError.Message}");
                    return null;
                }
            });

            var outcomes = await Task.WhenAll(pending);
            var succeeded = outcomes.Count(outcome => outcome != null);
            Console.WriteLine($"Successfully processed {succeeded} documents");
        }
        catch (HttpRequestException httpError)
        {
            Console.WriteLine($"HTTP error: {httpError.Message}");
        }
        catch (KreuzbergException failure)
        {
            Console.WriteLine($"Extraction error: {failure.Message}");
        }
    }
}

View File

@@ -0,0 +1,98 @@
using Kreuzberg;
/// <summary>
/// Extracts a PDF with every configuration section populated and prints a short
/// summary of the result.
/// </summary>
class Program
{
    static async Task Main()
    {
        try
        {
            var settings = BuildConfig();
            var extracted = await KreuzbergLib.ExtractFileAsync("document.pdf", settings);

            Console.WriteLine($"Content length: {extracted.Content.Length}");
            Console.WriteLine($"MIME type: {extracted.MimeType}");
            Console.WriteLine($"Format type: {extracted.Metadata.FormatType}");

            if (extracted.Tables.Any())
            {
                Console.WriteLine($"Found {extracted.Tables.Count} tables");
            }

            if (extracted.Chunks?.Any() == true)
            {
                Console.WriteLine($"Created {extracted.Chunks.Count} chunks");
            }
        }
        catch (KreuzbergException failure)
        {
            Console.WriteLine($"Extraction error: {failure.Message}");
        }
    }

    /// <summary>
    /// Builds the configuration: caching, OCR, PDF, image, chunking, token
    /// reduction, language detection, and post-processing sections.
    /// </summary>
    static ExtractionConfig BuildConfig()
    {
        return new ExtractionConfig
        {
            UseCache = true,
            EnableQualityProcessing = true,
            ForceOcr = false,
            Ocr = new OcrConfig
            {
                Backend = "tesseract",
                Language = "eng+fra",
                TesseractConfig = new TesseractConfig
                {
                    Psm = 3,
                    Oem = 3,
                    MinConfidence = 0.8,
                    Preprocessing = new ImagePreprocessingConfig
                    {
                        TargetDpi = 300,
                        Denoise = true,
                        Deskew = true,
                        ContrastEnhance = true
                    },
                    EnableTableDetection = true
                }
            },
            PdfOptions = new PdfConfig { ExtractImages = true, ExtractMetadata = true },
            Images = new ImageExtractionConfig
            {
                ExtractImages = true,
                TargetDpi = 150,
                MaxImageDimension = 4096
            },
            Chunking = new ChunkingConfig
            {
                MaxChars = 1000,
                MaxOverlap = 200,
                Preset = "default"
            },
            TokenReduction = new TokenReductionConfig
            {
                Mode = "moderate",
                PreserveImportantWords = true
            },
            LanguageDetection = new LanguageDetectionConfig
            {
                Enabled = true,
                MinConfidence = 0.8,
                DetectMultiple = false
            },
            Postprocessor = new PostProcessorConfig { Enabled = true }
        };
    }
}

View File

@@ -0,0 +1,15 @@
```csharp title="C#"
using Kreuzberg;
// Keyword extraction settings: algorithm, result limit, score floor, n-gram
// range, and language.
var keywordOptions = new KeywordConfig
{
    Algorithm = KeywordAlgorithm.Yake,
    MaxKeywords = 10,
    MinScore = 0.3,
    NgramRange = (1, 3),
    Language = "en"
};

// Top-level extraction configuration carrying the keyword settings.
var config = new ExtractionConfig { Keywords = keywordOptions };
```

View File

@@ -0,0 +1,30 @@
```csharp title="C#"
using Kreuzberg;
using System.Collections.Generic;
// Extract a PDF with keyword extraction enabled and print each keyword with its
// score to three decimal places.
var config = new ExtractionConfig
{
    Keywords = new KeywordConfig
    {
        Algorithm = KeywordAlgorithm.Yake,
        MaxKeywords = 10,
        MinScore = 0.3
    }
};

var result = await KreuzbergLib.ExtractFileAsync(
    "research_paper.pdf",
    config
);

// One lookup plus a type test replaces ContainsKey + indexer + hard cast, which
// searched twice and threw InvalidCastException on an unexpected value shape.
// NOTE(review): other snippets access Metadata as a typed object (.Additional,
// .FormatType) - confirm that dictionary-style access is valid here.
if (result.Metadata.TryGetValue("keywords", out var rawKeywords)
    && rawKeywords is List<Dictionary<string, object>> keywords)
{
    foreach (var kw in keywords)
    {
        // Skip entries missing either field instead of throwing on the lookup.
        if (kw.TryGetValue("text", out var rawText)
            && rawText is string text
            && kw.TryGetValue("score", out var rawScore))
        {
            // Convert handles any boxed numeric type; a direct unboxing cast to
            // double only works when the boxed value is exactly a double.
            var score = Convert.ToDouble(
                rawScore,
                System.Globalization.CultureInfo.InvariantCulture);
            Console.WriteLine($"{text}: {score:F3}");
        }
    }
}
```

View File

@@ -0,0 +1,37 @@
using Kreuzberg;
/// <summary>
/// Extracts a PDF with single-language detection enabled and prints the first
/// detected language, or a notice when none was detected.
/// </summary>
class Program
{
    static async Task Main()
    {
        var detection = new LanguageDetectionConfig
        {
            Enabled = true,
            MinConfidence = 0.8m,
            DetectMultiple = false
        };
        var settings = new ExtractionConfig { LanguageDetection = detection };

        try
        {
            var extracted = await KreuzbergLib.ExtractFileAsync("document.pdf", settings);

            // A null or empty language list both count as "nothing detected".
            var hasLanguage = extracted.DetectedLanguages?.Count > 0;
            var summary = hasLanguage
                ? $"Detected Language: {extracted.DetectedLanguages[0]}"
                : "No language detected";
            Console.WriteLine(summary);

            Console.WriteLine($"Content length: {extracted.Content.Length} characters");
        }
        catch (KreuzbergException failure)
        {
            Console.WriteLine($"Extraction failed: {failure.Message}");
        }
    }
}

View File

@@ -0,0 +1,39 @@
```csharp title="C#"
using Kreuzberg;
/// <summary>
/// Extracts a PDF with single-language detection enabled and prints the first
/// detected language, or a notice when none was detected.
/// </summary>
class Program
{
    static async Task Main()
    {
        var detection = new LanguageDetectionConfig
        {
            Enabled = true,
            MinConfidence = 0.8m,
            DetectMultiple = false
        };
        var settings = new ExtractionConfig { LanguageDetection = detection };

        try
        {
            var extracted = await KreuzbergLib.ExtractFileAsync("document.pdf", settings);

            // A null or empty language list both count as "nothing detected".
            var hasLanguage = extracted.DetectedLanguages?.Count > 0;
            var summary = hasLanguage
                ? $"Detected Language: {extracted.DetectedLanguages[0]}"
                : "No language detected";
            Console.WriteLine(summary);

            Console.WriteLine($"Content length: {extracted.Content.Length} characters");
        }
        catch (KreuzbergException failure)
        {
            Console.WriteLine($"Extraction failed: {failure.Message}");
        }
    }
}
```

View File

@@ -0,0 +1,40 @@
using Kreuzberg;
/// <summary>
/// Extracts a PDF with multi-language detection enabled and prints every
/// detected language along with the content length and MIME type.
/// </summary>
class Program
{
    static async Task Main()
    {
        var detection = new LanguageDetectionConfig
        {
            Enabled = true,
            MinConfidence = 0.8m,
            DetectMultiple = true
        };
        var settings = new ExtractionConfig { LanguageDetection = detection };

        try
        {
            var extracted = await KreuzbergLib.ExtractFileAsync(
                "multilingual_document.pdf",
                settings);

            // Treat a missing language list as an empty one.
            var found = extracted.DetectedLanguages ?? new List<string>();
            var summary = found.Count > 0
                ? $"Detected {found.Count} language(s): {string.Join(", ", found)}"
                : "No languages detected";
            Console.WriteLine(summary);

            Console.WriteLine($"Total content: {extracted.Content.Length} characters");
            Console.WriteLine($"MIME type: {extracted.MimeType}");
        }
        catch (KreuzbergException failure)
        {
            Console.WriteLine($"Processing failed: {failure.Message}");
        }
    }
}

View File

@@ -0,0 +1,42 @@
```csharp title="C#"
using Kreuzberg;
/// <summary>
/// Extracts a PDF with multi-language detection enabled and prints every
/// detected language along with the content length and MIME type.
/// </summary>
class Program
{
    static async Task Main()
    {
        var detection = new LanguageDetectionConfig
        {
            Enabled = true,
            MinConfidence = 0.8m,
            DetectMultiple = true
        };
        var settings = new ExtractionConfig { LanguageDetection = detection };

        try
        {
            var extracted = await KreuzbergLib.ExtractFileAsync(
                "multilingual_document.pdf",
                settings);

            // Treat a missing language list as an empty one.
            var found = extracted.DetectedLanguages ?? new List<string>();
            var summary = found.Count > 0
                ? $"Detected {found.Count} language(s): {string.Join(", ", found)}"
                : "No languages detected";
            Console.WriteLine(summary);

            Console.WriteLine($"Total content: {extracted.Content.Length} characters");
            Console.WriteLine($"MIME type: {extracted.MimeType}");
        }
        catch (KreuzbergException failure)
        {
            Console.WriteLine($"Processing failed: {failure.Message}");
        }
    }
}
```

View File

@@ -0,0 +1,65 @@
using Kreuzberg;
using System.Collections.Generic;
/// <summary>
/// Lists the registered extractors, OCR backends, post-processors and validators,
/// then registers and unregisters a custom post-processor and clears all validators.
/// </summary>
class Program
{
    static void Main()
    {
        try
        {
            PrintSection("Registered Document Extractors:", KreuzbergLib.ListDocumentExtractors());
            PrintSection("\nRegistered OCR Backends:", KreuzbergLib.ListOcrBackends());
            PrintSection("\nRegistered Post-Processors:", KreuzbergLib.ListPostProcessors());
            PrintSection("\nRegistered Validators:", KreuzbergLib.ListValidators());

            var custom = new CustomPostProcessor();
            KreuzbergLib.RegisterPostProcessor(custom);
            Console.WriteLine($"\nRegistered custom post-processor: {custom.Name}");

            KreuzbergLib.UnregisterPostProcessor(custom.Name);
            Console.WriteLine($"Unregistered post-processor: {custom.Name}");

            KreuzbergLib.ClearValidators();
            Console.WriteLine("All validators cleared");
        }
        catch (KreuzbergException failure)
        {
            Console.WriteLine($"Plugin registry error: {failure.Message}");
        }
    }

    /// <summary>Prints a heading followed by one indented line per entry.</summary>
    static void PrintSection<T>(string heading, IEnumerable<T> entries)
    {
        Console.WriteLine(heading);
        foreach (var entry in entries)
        {
            Console.WriteLine($"  - {entry}");
        }
    }
}
/// <summary>
/// Post-processor that upper-cases the extracted content in place.
/// </summary>
class CustomPostProcessor : IPostProcessor
{
    public string Name => "custom-processor";

    public int Priority => 50;

    /// <summary>Replaces the content with its upper-case form and returns the same result.</summary>
    public ExtractionResult Process(ExtractionResult result)
    {
        // Use the invariant culture so the output does not depend on the machine's
        // current culture (ToUpper maps "i" differently under Turkish cultures).
        result.Content = result.Content.ToUpperInvariant();
        return result;
    }
}

View File

@@ -0,0 +1,17 @@
```csharp title="C#"
using Kreuzberg;
// Extract the document with quality processing switched on.
var result = await KreuzbergLib.ExtractFileAsync(
    "document.pdf",
    new ExtractionConfig { EnableQualityProcessing = true });

// Report the score with two decimal places.
Console.WriteLine($"Quality score: {result.QualityScore:F2}");
```

View File

@@ -0,0 +1,29 @@
```csharp title="C#"
using Kreuzberg;
// Scores below this value are reported as low-quality extractions.
const double LowQualityThreshold = 0.5;

// Extract the scanned document with quality processing switched on.
var result = KreuzbergLib.ExtractFile(
    "scanned_document.pdf",
    new ExtractionConfig { EnableQualityProcessing = true });

var score = result.QualityScore;
var isLowQuality = score < LowQualityThreshold;

if (!isLowQuality)
{
    Console.WriteLine($"Quality score: {score:F2}");
}
else
{
    // Warn and suggest a remedy when the score falls under the threshold.
    Console.WriteLine($"Warning: Low quality extraction ({score:F2})");
    Console.WriteLine("Consider re-scanning with higher DPI or adjusting OCR settings");
}
```

View File

@@ -0,0 +1,108 @@
using Kreuzberg;
using System.IO;
/// <summary>
/// Sample program that extracts a document and walks its text in fixed-size
/// pieces, plus an async-stream variant that yields chunks one at a time.
/// </summary>
class Program
{
    /// <summary>Number of leading characters shown when previewing a chunk.</summary>
    private const int PreviewLength = 50;

    /// <summary>Piece size used when the extraction result has no chunks of its own.</summary>
    private const int FallbackChunkSize = 512;

    /// <summary>
    /// Entry point: processes "large_document.pdf" and prints any failure to the console.
    /// </summary>
    static async Task Main()
    {
        try
        {
            var filePath = "large_document.pdf";
            await ProcessLargeFileAsync(filePath);
        }
        catch (Exception ex)
        {
            Console.WriteLine($"Error: {ex.Message}");
        }
    }

    /// <summary>
    /// Extracts <paramref name="filePath"/> with quality processing enabled, splits the
    /// extracted text into 1000-character pieces and processes them in order.
    /// </summary>
    /// <param name="filePath">Path of the document to extract.</param>
    static async Task ProcessLargeFileAsync(string filePath)
    {
        var config = new ExtractionConfig
        {
            EnableQualityProcessing = true
        };
        var result = await KreuzbergLib.ExtractFileAsync(filePath, config);

        var contentChunks = ChunkContent(result.Content, chunkSize: 1000);
        Console.WriteLine($"Processing {contentChunks.Count} chunks");

        // A plain indexed loop gives the position directly, without building index tuples.
        for (var index = 0; index < contentChunks.Count; index++)
        {
            var chunk = contentChunks[index];
            Console.WriteLine($"Chunk {index}: {chunk.Length} characters");
            await ProcessChunkAsync(chunk);
        }
    }

    /// <summary>
    /// Prints the number of words in <paramref name="chunk"/>, where words are
    /// separated by spaces, line feeds or carriage returns.
    /// </summary>
    /// <param name="chunk">Text piece to analyse.</param>
    static async Task ProcessChunkAsync(string chunk)
    {
        var wordCount = chunk.Split(
            new[] { ' ', '\n', '\r' },
            StringSplitOptions.RemoveEmptyEntries
        ).Length;
        Console.WriteLine($" Words: {wordCount}");

        // NOTE(review): the short delay presumably stands in for real asynchronous work.
        await Task.Delay(10);
    }

    /// <summary>
    /// Splits <paramref name="content"/> into consecutive pieces of at most
    /// <paramref name="chunkSize"/> characters; the last piece may be shorter.
    /// </summary>
    /// <param name="content">Text to split. Null or empty text yields an empty list.</param>
    /// <param name="chunkSize">Maximum length of each piece; must be positive.</param>
    /// <returns>The pieces in their original order.</returns>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="chunkSize"/> is zero or negative.
    /// </exception>
    static List<string> ChunkContent(string content, int chunkSize)
    {
        // A zero step would never advance the loop, and a negative one would
        // request a negative substring length, so reject both up front.
        if (chunkSize <= 0)
        {
            throw new ArgumentOutOfRangeException(
                nameof(chunkSize), chunkSize, "Chunk size must be positive.");
        }

        var chunks = new List<string>();
        if (string.IsNullOrEmpty(content))
        {
            return chunks;
        }

        for (var offset = 0; offset < content.Length; offset += chunkSize)
        {
            var length = Math.Min(chunkSize, content.Length - offset);
            chunks.Add(content.Substring(offset, length));
        }
        return chunks;
    }

    /// <summary>
    /// Extracts <paramref name="filePath"/> and yields its text one chunk at a time.
    /// Chunks reported by the extraction result are used when there are any;
    /// otherwise the full content is cut into 512-character pieces.
    /// </summary>
    /// <param name="filePath">Path of the document to extract.</param>
    /// <returns>An async stream of chunk texts.</returns>
    static async IAsyncEnumerable<string> StreamExtractedChunksAsync(
        string filePath)
    {
        var result = await KreuzbergLib.ExtractFileAsync(filePath);

        if (result.Chunks?.Any() == true)
        {
            foreach (var chunk in result.Chunks)
            {
                yield return chunk.Content;
                await Task.Yield();
            }
        }
        else
        {
            // Reuse the shared splitter instead of repeating the substring loop.
            foreach (var piece in ChunkContent(result.Content, FallbackChunkSize))
            {
                yield return piece;
                await Task.Yield();
            }
        }
    }

    /// <summary>
    /// Consumes the chunk stream for "document.pdf" and prints a short preview of each chunk.
    /// </summary>
    static async Task StreamProcessingExample()
    {
        var streamEnumerator = StreamExtractedChunksAsync("document.pdf");
        int index = 0;
        await foreach (var chunk in streamEnumerator)
        {
            Console.WriteLine($"Chunk {index++}: {Preview(chunk)}...");
        }
    }

    /// <summary>
    /// Returns at most the first <see cref="PreviewLength"/> characters of <paramref name="text"/>.
    /// Slicing a shorter string with a fixed range would throw, so short text is returned whole.
    /// </summary>
    static string Preview(string text) =>
        text.Length <= PreviewLength ? text : text[..PreviewLength];
}

View File

@@ -0,0 +1,14 @@
```csharp title="C#"
using Kreuzberg;
// Token-reduction settings. Mode accepts "off", "moderate", or "aggressive".
var tokenReduction = new TokenReductionConfig
{
    Mode = "moderate",
    PreserveMarkdown = true,
    PreserveCode = true,
    LanguageHint = "eng"
};

// Attach the token-reduction settings to the extraction configuration.
var config = new ExtractionConfig { TokenReduction = tokenReduction };
```

View File

@@ -0,0 +1,32 @@
```csharp title="C#"
using Kreuzberg;
// Extract with moderate token reduction while preserving markdown.
var config = new ExtractionConfig
{
    TokenReduction = new TokenReductionConfig
    {
        Mode = "moderate",
        PreserveMarkdown = true
    }
};
var result = await KreuzbergLib.ExtractFileAsync(
    "verbose_document.pdf",
    config
);

// Read the token statistics from the result metadata. TryGetValue does one
// lookup per key instead of ContainsKey followed by the indexer; a missing
// key falls back to zero.
// NOTE(review): the casts assume the stored values are an int / a double — confirm
// against the library's metadata types.
var original = result.Metadata.TryGetValue("original_token_count", out var originalValue)
    ? (int)originalValue
    : 0;
var reduced = result.Metadata.TryGetValue("token_count", out var reducedValue)
    ? (int)reducedValue
    : 0;
var ratio = result.Metadata.TryGetValue("token_reduction_ratio", out var ratioValue)
    ? (double)ratioValue
    : 0.0;

Console.WriteLine($"Reduced from {original} to {reduced} tokens");
Console.WriteLine($"Reduction: {ratio * 100:F1}%");
```

View File

@@ -0,0 +1,74 @@
```csharp title="C#"
using Kreuzberg;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
/// <summary>
/// Example of preparing a document for a vector database: extracts the document
/// with chunking and embedding configured, maps every chunk to a
/// <see cref="VectorRecord"/>, and passes the records to a storage step.
/// </summary>
public class VectorDatabaseIntegration
{
    /// <summary>
    /// One document chunk together with its embedding and descriptive metadata.
    /// </summary>
    public class VectorRecord
    {
        /// <summary>Record identifier, built as "{documentId}_chunk_{index}".</summary>
        public string Id { get; set; }

        /// <summary>
        /// Embedding copied from the chunk. The storage step treats a null or
        /// empty array as "no embedding".
        /// </summary>
        public float[] Embedding { get; set; }

        /// <summary>Text content of the chunk.</summary>
        public string Content { get; set; }

        /// <summary>String key/value pairs: document_id, chunk_index and content_length.</summary>
        public Dictionary<string, string> Metadata { get; set; }
    }

    /// <summary>
    /// Extracts <paramref name="documentPath"/>, converts each resulting chunk into a
    /// <see cref="VectorRecord"/> and sends the records to the storage step.
    /// </summary>
    /// <param name="documentPath">Path of the document to extract.</param>
    /// <param name="documentId">Identifier used as the prefix of every record id.</param>
    /// <returns>The records that were built, in chunk order.</returns>
    public async Task<List<VectorRecord>> ExtractAndVectorize(
        string documentPath,
        string documentId)
    {
        var config = new ExtractionConfig
        {
            Chunking = new ChunkingConfig
            {
                MaxChars = 512,
                MaxOverlap = 50,
                Embedding = new EmbeddingConfig
                {
                    Model = EmbeddingModelType.Preset("balanced"),
                    Normalize = true,
                    BatchSize = 32
                }
            }
        };

        // `Kreuzberg` on its own is the namespace imported by `using Kreuzberg;`,
        // so the call goes through the KreuzbergLib static class.
        var result = await KreuzbergLib.ExtractFileAsync(documentPath, config);

        // A result without chunks produces an empty record list.
        var chunks = result.Chunks ?? new List<Chunk>();

        var vectorRecords = chunks
            .Select((chunk, index) => new VectorRecord
            {
                Id = $"{documentId}_chunk_{index}",
                Content = chunk.Content,
                Embedding = chunk.Embedding,
                Metadata = new Dictionary<string, string>
                {
                    { "document_id", documentId },
                    { "chunk_index", index.ToString() },
                    { "content_length", chunk.Content.Length.ToString() }
                }
            })
            .ToList();

        await StoreInVectorDatabase(vectorRecords);
        return vectorRecords;
    }

    /// <summary>
    /// Storage step of the example: writes one console line for every record that
    /// has a non-empty embedding; records without one are skipped.
    /// </summary>
    /// <param name="records">Records to store.</param>
    private async Task StoreInVectorDatabase(List<VectorRecord> records)
    {
        foreach (var record in records)
        {
            if (record.Embedding != null && record.Embedding.Length > 0)
            {
                Console.WriteLine(
                    $"Storing {record.Id}: {record.Content.Length} chars, " +
                    $"{record.Embedding.Length} dims");
            }
        }

        // NOTE(review): no database write happens here yet; the await only keeps
        // the method asynchronous until a real client call replaces it.
        await Task.CompletedTask;
    }
}
```