Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,24 @@
using Kreuzberg;
// Extract a PDF with chunking and page tracking enabled, then print a short
// preview of each chunk together with the page range recorded in its metadata.
// NOTE(review): other examples configure chunk limits with differently named
// properties (MaxChars/MaxOverlap, MaxCharacters/Overlap) — confirm which names
// the installed ChunkingConfig exposes.
var config = new ExtractionConfig
{
    Chunking = new ChunkingConfig { ChunkSize = 500, Overlap = 50 },
    Pages = new PageConfig { ExtractPages = true }
};

// The static entry point used throughout the examples is KreuzbergLib;
// "Kreuzberg" on its own is the namespace imported above.
var result = KreuzbergLib.ExtractFileSync("document.pdf", config);
if (result.Chunks != null)
{
    foreach (var chunk in result.Chunks)
    {
        // Chunks without a first page number are not reported.
        if (chunk.Metadata.FirstPage.HasValue)
        {
            var pageRange = chunk.Metadata.FirstPage == chunk.Metadata.LastPage
                ? $"Page {chunk.Metadata.FirstPage}"
                : $"Pages {chunk.Metadata.FirstPage}-{chunk.Metadata.LastPage}";

            // Clamp the slice: a fixed [..50] range throws
            // ArgumentOutOfRangeException for chunks shorter than 50 characters.
            var preview = chunk.Content[..Math.Min(50, chunk.Content.Length)];
            Console.WriteLine($"Chunk: {preview}... ({pageRange})");
        }
    }
}

View File

@@ -0,0 +1,33 @@
using Kreuzberg;
/// <summary>
/// Asynchronous extraction example: one file on its own, then three files
/// awaited together with <c>Task.WhenAll</c>.
/// </summary>
class Program
{
    static async Task Main()
    {
        try
        {
            var single = await KreuzbergLib.ExtractFileAsync("document.pdf");
            Console.WriteLine($"Content length: {single.Content.Length}");
            Console.WriteLine($"MIME type: {single.MimeType}");

            // All three extractions are started before any of them is awaited.
            var first = KreuzbergLib.ExtractFileAsync("file1.pdf");
            var second = KreuzbergLib.ExtractFileAsync("file2.pdf");
            var third = KreuzbergLib.ExtractFileAsync("file3.pdf");
            var pending = new[] { first, second, third };

            var extracted = await Task.WhenAll(pending);
            for (var i = 0; i < extracted.Length; i++)
            {
                Console.WriteLine($"Extracted {extracted[i].Content.Length} characters");
            }
        }
        catch (KreuzbergException ex)
        {
            Console.WriteLine($"Extraction failed: {ex.Message}");
        }
    }
}

View File

@@ -0,0 +1,46 @@
using Kreuzberg;
using System.Collections.Generic;
/// <summary>
/// Batch example: extracts the same set of files twice with a shared
/// configuration — first one after another, then all together via
/// <c>Task.WhenAll</c> — and prints character counts.
/// </summary>
class Program
{
    static async Task Main()
    {
        var filePaths = new[] { "document1.pdf", "document2.pdf", "document3.pdf" };
        var config = new ExtractionConfig
        {
            UseCache = true,
            EnableQualityProcessing = true
        };

        try
        {
            // Sequential pass: each file is awaited before the next one starts.
            var batchResults = new List<ExtractionResult>();
            for (var i = 0; i < filePaths.Length; i++)
            {
                var current = filePaths[i];
                var extracted = await KreuzbergLib.ExtractFileAsync(current, config);
                batchResults.Add(extracted);
                Console.WriteLine($"Processed {current}: {extracted.Content.Length} chars");
            }

            // Concurrent pass: every extraction is started up front, then awaited together.
            var pending = filePaths
                .Select(path => KreuzbergLib.ExtractFileAsync(path, config))
                .ToArray();
            var completed = await Task.WhenAll(pending);

            var totalChars = completed.Sum(item => item.Content.Length);
            Console.WriteLine($"Total extracted: {totalChars} characters");
        }
        catch (KreuzbergException ex)
        {
            Console.WriteLine($"Batch processing error: {ex.Message}");
        }
    }
}

View File

@@ -0,0 +1,52 @@
```csharp title="C#"
using Kreuzberg;
/// <summary>
/// Extracts a PDF with chunking and page extraction enabled and prints, for
/// every chunk that has both a first and a last page number, a preview of at
/// most 50 characters plus its page range.
/// </summary>
class Program
{
    static async Task Main()
    {
        var chunking = new ChunkingConfig { MaxCharacters = 500, Overlap = 50 };
        var pages = new PageConfig { ExtractPages = true };
        var config = new ExtractionConfig { Chunking = chunking, Pages = pages };

        try
        {
            var result = await KreuzbergLib
                .ExtractFileAsync("document.pdf", config)
                .ConfigureAwait(false);

            if (result.Chunks == null)
            {
                return;
            }

            foreach (var chunk in result.Chunks)
            {
                // Only chunks with complete page information are reported.
                if (!chunk.Metadata.FirstPage.HasValue || !chunk.Metadata.LastPage.HasValue)
                {
                    continue;
                }

                var first = chunk.Metadata.FirstPage.Value;
                var last = chunk.Metadata.LastPage.Value;

                string pageRange;
                if (first == last)
                {
                    pageRange = $"Page {first}";
                }
                else
                {
                    pageRange = $"Pages {first}-{last}";
                }

                var previewLength = Math.Min(50, chunk.Content.Length);
                var preview = chunk.Content[..previewLength];
                Console.WriteLine($"Chunk: {preview}... ({pageRange})");
            }
        }
        catch (KreuzbergException ex)
        {
            Console.WriteLine($"Error: {ex.Message}");
        }
    }
}
```

View File

@@ -0,0 +1,75 @@
using Kreuzberg;
/// <summary>
/// Chunking examples: <see cref="Main"/> requests embeddings for every chunk,
/// <see cref="PrependHeadingContextExample"/> enables
/// <c>PrependHeadingContext</c> and prints the start of each chunk.
/// </summary>
class Program
{
    /// <summary>
    /// Extracts <c>document.pdf</c> with chunking and embeddings configured and
    /// prints the chunk count, each chunk's content length and, when present,
    /// the embedding length.
    /// </summary>
    static async Task Main()
    {
        var config = new ExtractionConfig
        {
            Chunking = new ChunkingConfig
            {
                MaxChars = 1000,
                MaxOverlap = 200,
                Embedding = new EmbeddingConfig
                {
                    Model = EmbeddingModelType.Preset("all-minilm-l6-v2"),
                    Normalize = true,
                    BatchSize = 32
                }
            }
        };
        try
        {
            var result = await KreuzbergLib.ExtractFileAsync(
                "document.pdf",
                config
            ).ConfigureAwait(false);

            // Other examples treat Chunks as possibly null; guard here as well
            // instead of dereferencing it unconditionally.
            Console.WriteLine($"Chunks: {result.Chunks?.Count ?? 0}");
            if (result.Chunks != null)
            {
                foreach (var chunk in result.Chunks)
                {
                    Console.WriteLine($"Content length: {chunk.Content.Length}");
                    if (chunk.Embedding != null)
                    {
                        Console.WriteLine($"Embedding dimensions: {chunk.Embedding.Length}");
                    }
                }
            }
        }
        catch (KreuzbergException ex)
        {
            Console.WriteLine($"Error: {ex.Message}");
        }
    }

    /// <summary>
    /// Extracts <c>document.md</c> with <c>PrependHeadingContext</c> enabled and
    /// prints up to the first 100 characters of every chunk.
    /// </summary>
    static async Task PrependHeadingContextExample()
    {
        var config = new ExtractionConfig
        {
            Chunking = new ChunkingConfig
            {
                MaxChars = 500,
                MaxOverlap = 50,
                PrependHeadingContext = true
            }
        };
        try
        {
            var result = await KreuzbergLib.ExtractFileAsync(
                "document.md",
                config
            ).ConfigureAwait(false);

            // Nothing to print when no chunks were produced.
            if (result.Chunks != null)
            {
                foreach (var chunk in result.Chunks)
                {
                    // Each chunk's content is prefixed with its heading breadcrumb
                    Console.WriteLine(chunk.Content[..Math.Min(100, chunk.Content.Length)]);
                }
            }
        }
        catch (KreuzbergException ex)
        {
            Console.WriteLine($"Error: {ex.Message}");
        }
    }
}

View File

@@ -0,0 +1,132 @@
```csharp title="C#"
using Kreuzberg;
/// <summary>
/// Extracts <c>document.pdf</c> with chunking and embeddings configured and
/// prints the chunk count, each chunk's content length and, when present, the
/// embedding length.
/// </summary>
class Program
{
    static async Task Main()
    {
        var config = new ExtractionConfig
        {
            Chunking = new ChunkingConfig
            {
                MaxChars = 1000,
                MaxOverlap = 200,
                Embedding = new EmbeddingConfig
                {
                    Model = EmbeddingModelType.Preset("all-minilm-l6-v2"),
                    Normalize = true,
                    BatchSize = 32
                }
            }
        };
        try
        {
            var result = await KreuzbergLib.ExtractFileAsync(
                "document.pdf",
                config
            ).ConfigureAwait(false);

            // Other examples treat Chunks as possibly null; guard here as well
            // instead of dereferencing it unconditionally.
            Console.WriteLine($"Chunks: {result.Chunks?.Count ?? 0}");
            if (result.Chunks != null)
            {
                foreach (var chunk in result.Chunks)
                {
                    Console.WriteLine($"Content length: {chunk.Content.Length}");
                    if (chunk.Embedding != null)
                    {
                        Console.WriteLine($"Embedding dimensions: {chunk.Embedding.Length}");
                    }
                }
            }
        }
        catch (KreuzbergException ex)
        {
            Console.WriteLine($"Error: {ex.Message}");
        }
    }
}
```
```csharp title="C# - Markdown with Heading Context"
using Kreuzberg;
/// <summary>
/// Extracts <c>document.md</c> with tokenizer-based chunk sizing and prints the
/// headings attached to each chunk's heading context.
/// </summary>
class Program
{
    static async Task Main()
    {
        var config = new ExtractionConfig
        {
            Chunking = new ChunkingConfig
            {
                MaxChars = 500,
                MaxOverlap = 50,
                Sizing = new ChunkSizingConfig
                {
                    Type = "tokenizer",
                    Model = "Xenova/gpt-4o"
                }
            }
        };
        try
        {
            var result = await KreuzbergLib.ExtractFileAsync(
                "document.md",
                config
            ).ConfigureAwait(false);

            // Other examples treat Chunks as possibly null; skip the loop
            // rather than dereferencing it unconditionally.
            if (result.Chunks != null)
            {
                foreach (var chunk in result.Chunks)
                {
                    // Chunks without heading information are skipped silently.
                    if (chunk.HeadingContext?.Headings != null)
                    {
                        Console.WriteLine("Headings:");
                        foreach (var heading in chunk.HeadingContext.Headings)
                        {
                            Console.WriteLine($" Level {heading.Level}: {heading.Text}");
                        }
                    }
                }
            }
        }
        catch (KreuzbergException ex)
        {
            Console.WriteLine($"Error: {ex.Message}");
        }
    }
}
```
```csharp title="C# - Prepend Heading Context"
using Kreuzberg;
/// <summary>
/// Extracts <c>document.md</c> with <c>PrependHeadingContext</c> enabled and
/// prints up to the first 100 characters of every chunk.
/// </summary>
class Program
{
    static async Task Main()
    {
        var config = new ExtractionConfig
        {
            Chunking = new ChunkingConfig
            {
                MaxChars = 500,
                MaxOverlap = 50,
                PrependHeadingContext = true
            }
        };
        try
        {
            var result = await KreuzbergLib.ExtractFileAsync(
                "document.md",
                config
            ).ConfigureAwait(false);

            // Other examples treat Chunks as possibly null; skip the loop
            // rather than dereferencing it unconditionally.
            if (result.Chunks != null)
            {
                foreach (var chunk in result.Chunks)
                {
                    // Each chunk's content is prefixed with its heading breadcrumb
                    Console.WriteLine(chunk.Content[..Math.Min(100, chunk.Content.Length)]);
                }
            }
        }
        catch (KreuzbergException ex)
        {
            Console.WriteLine($"Error: {ex.Message}");
        }
    }
}
```

View File

@@ -0,0 +1,83 @@
using Kreuzberg;
using System.Collections.Generic;
using System.Linq;
/// <summary>
/// Skeleton of a retrieval pipeline: extract a PDF into embedded chunks, copy
/// them into an in-memory list of <see cref="VectorEntry"/>, and print the
/// first three entries returned by <see cref="SearchAsync"/>.
/// </summary>
class RagPipelineExample
{
    static async Task Main()
    {
        var config = new ExtractionConfig
        {
            Chunking = new ChunkingConfig
            {
                MaxChars = 500,
                MaxOverlap = 50,
                Embedding = new EmbeddingConfig
                {
                    Model = EmbeddingModelType.Preset("all-mpnet-base-v2"),
                    Normalize = true,
                    BatchSize = 16
                }
            }
        };
        try
        {
            var result = await KreuzbergLib.ExtractFileAsync(
                "research_paper.pdf",
                config
            ).ConfigureAwait(false);

            // Other examples treat Chunks as possibly null; fall back to an
            // empty sequence so the store builder never enumerates null.
            IEnumerable<Chunk> chunks = result.Chunks ?? Enumerable.Empty<Chunk>();
            var vectorStore = await BuildVectorStoreAsync(chunks)
                .ConfigureAwait(false);

            var query = "machine learning optimization";
            var relevantChunks = await SearchAsync(vectorStore, query)
                .ConfigureAwait(false);

            Console.WriteLine($"Found {relevantChunks.Count} relevant chunks");
            foreach (var chunk in relevantChunks.Take(3))
            {
                // Clamp the preview: a fixed [..80] range throws for content
                // shorter than 80 characters.
                var preview = chunk.Content[..Math.Min(80, chunk.Content.Length)];
                Console.WriteLine($"Content: {preview}...");
                Console.WriteLine($"Similarity: {chunk.Similarity:F3}\n");
            }
        }
        catch (KreuzbergException ex)
        {
            Console.WriteLine($"Error: {ex.Message}");
        }
    }

    /// <summary>
    /// Copies each chunk's content and embedding into a <see cref="VectorEntry"/>.
    /// A missing embedding becomes an empty array; similarity starts at 0.
    /// </summary>
    /// <param name="chunks">Chunks to copy into the store.</param>
    /// <returns>One entry per chunk, in input order.</returns>
    static async Task<List<VectorEntry>> BuildVectorStoreAsync(
        IEnumerable<Chunk> chunks)
    {
        return await Task.Run(() =>
        {
            return chunks.Select(c => new VectorEntry
            {
                Content = c.Content,
                Embedding = c.Embedding?.ToArray() ?? Array.Empty<float>(),
                Similarity = 0f
            }).ToList();
        }).ConfigureAwait(false);
    }

    /// <summary>
    /// Returns the store ordered by descending <see cref="VectorEntry.Similarity"/>.
    /// The <paramref name="query"/> is not used by this placeholder: no
    /// similarity is computed here, so entries keep whatever score they carry.
    /// </summary>
    /// <param name="store">Entries to order.</param>
    /// <param name="query">Search text (currently unused).</param>
    /// <returns>A new list sorted by similarity, highest first.</returns>
    static async Task<List<VectorEntry>> SearchAsync(
        List<VectorEntry> store,
        string query)
    {
        return await Task.Run(() =>
        {
            return store
                .OrderByDescending(e => e.Similarity)
                .ToList();
        }).ConfigureAwait(false);
    }

    /// <summary>In-memory record holding a chunk's text, embedding and score.</summary>
    class VectorEntry
    {
        public string Content { get; set; } = string.Empty;
        public float[] Embedding { get; set; } = Array.Empty<float>();
        public float Similarity { get; set; }
    }
}

View File

@@ -0,0 +1,85 @@
```csharp title="C#"
using Kreuzberg;
using System.Collections.Generic;
using System.Linq;
/// <summary>
/// Skeleton of a retrieval pipeline: extract a PDF into embedded chunks, copy
/// them into an in-memory list of <see cref="VectorEntry"/>, and print the
/// first three entries returned by <see cref="SearchAsync"/>.
/// </summary>
class RagPipelineExample
{
    static async Task Main()
    {
        var config = new ExtractionConfig
        {
            Chunking = new ChunkingConfig
            {
                MaxChars = 500,
                MaxOverlap = 50,
                Embedding = new EmbeddingConfig
                {
                    Model = EmbeddingModelType.Preset("all-mpnet-base-v2"),
                    Normalize = true,
                    BatchSize = 16
                }
            }
        };
        try
        {
            var result = await KreuzbergLib.ExtractFileAsync(
                "research_paper.pdf",
                config
            ).ConfigureAwait(false);

            // Other examples treat Chunks as possibly null; fall back to an
            // empty sequence so the store builder never enumerates null.
            IEnumerable<Chunk> chunks = result.Chunks ?? Enumerable.Empty<Chunk>();
            var vectorStore = await BuildVectorStoreAsync(chunks)
                .ConfigureAwait(false);

            var query = "machine learning optimization";
            var relevantChunks = await SearchAsync(vectorStore, query)
                .ConfigureAwait(false);

            Console.WriteLine($"Found {relevantChunks.Count} relevant chunks");
            foreach (var chunk in relevantChunks.Take(3))
            {
                // Clamp the preview: a fixed [..80] range throws for content
                // shorter than 80 characters.
                var preview = chunk.Content[..Math.Min(80, chunk.Content.Length)];
                Console.WriteLine($"Content: {preview}...");
                Console.WriteLine($"Similarity: {chunk.Similarity:F3}\n");
            }
        }
        catch (KreuzbergException ex)
        {
            Console.WriteLine($"Error: {ex.Message}");
        }
    }

    /// <summary>
    /// Copies each chunk's content and embedding into a <see cref="VectorEntry"/>.
    /// A missing embedding becomes an empty array; similarity starts at 0.
    /// </summary>
    /// <param name="chunks">Chunks to copy into the store.</param>
    /// <returns>One entry per chunk, in input order.</returns>
    static async Task<List<VectorEntry>> BuildVectorStoreAsync(
        IEnumerable<Chunk> chunks)
    {
        return await Task.Run(() =>
        {
            return chunks.Select(c => new VectorEntry
            {
                Content = c.Content,
                Embedding = c.Embedding?.ToArray() ?? Array.Empty<float>(),
                Similarity = 0f
            }).ToList();
        }).ConfigureAwait(false);
    }

    /// <summary>
    /// Returns the store ordered by descending <see cref="VectorEntry.Similarity"/>.
    /// The <paramref name="query"/> is not used by this placeholder: no
    /// similarity is computed here, so entries keep whatever score they carry.
    /// </summary>
    /// <param name="store">Entries to order.</param>
    /// <param name="query">Search text (currently unused).</param>
    /// <returns>A new list sorted by similarity, highest first.</returns>
    static async Task<List<VectorEntry>> SearchAsync(
        List<VectorEntry> store,
        string query)
    {
        return await Task.Run(() =>
        {
            return store
                .OrderByDescending(e => e.Similarity)
                .ToList();
        }).ConfigureAwait(false);
    }

    /// <summary>In-memory record holding a chunk's text, embedding and score.</summary>
    class VectorEntry
    {
        public string Content { get; set; } = string.Empty;
        public float[] Embedding { get; set; } = Array.Empty<float>();
        public float Similarity { get; set; }
    }
}
```

View File

@@ -0,0 +1,72 @@
```csharp title="C#"
using System;
using System.Threading.Tasks;
using Kreuzberg;
// Runs one extraction with quality processing, language detection, token
// reduction, chunking with embeddings, and keyword extraction all configured,
// then prints a summary of what came back.
// NOTE(review): the keyword algorithm is given as a string here while other
// examples use KeywordAlgorithm.Yake — confirm which form KeywordConfig expects.
async Task RunRagPipeline()
{
    var config = new ExtractionConfig
    {
        EnableQualityProcessing = true,
        LanguageDetection = new LanguageDetectionConfig
        {
            Enabled = true,
            DetectMultiple = true,
            MinConfidence = 0.8,
        },
        TokenReduction = new TokenReductionConfig
        {
            Mode = "moderate",
            PreserveImportantWords = true,
        },
        Chunking = new ChunkingConfig
        {
            MaxChars = 512,
            MaxOverlap = 50,
            // Typed embedding settings, as in the other embedding examples,
            // instead of an untyped dictionary.
            Embedding = new EmbeddingConfig
            {
                Model = EmbeddingModelType.Preset("balanced"),
            },
            Enabled = true,
        },
        Keywords = new KeywordConfig
        {
            Algorithm = "yake",
            MaxKeywords = 10,
        },
    };

    var result = await KreuzbergLib.ExtractFileAsync("document.pdf", config);
    Console.WriteLine($"Content length: {result.Content.Length} characters");

    if (result.DetectedLanguages?.Count > 0)
    {
        Console.WriteLine($"Languages: {string.Join(", ", result.DetectedLanguages)}");
    }

    if (result.Chunks?.Count > 0)
    {
        Console.WriteLine($"Total chunks: {result.Chunks.Count}");
        var firstChunk = result.Chunks[0];
        Console.WriteLine($"First chunk tokens: {firstChunk.Metadata.TokenCount}");
        if (firstChunk.Embedding?.Length > 0)
        {
            Console.WriteLine($"Embedding dimensions: {firstChunk.Embedding.Length}");
        }
    }

    Console.WriteLine($"Quality score: {result.QualityScore}");

    if (result.ExtractedKeywords?.Count > 0)
    {
        Console.WriteLine($"Keywords: {string.Join(", ", result.ExtractedKeywords)}");
    }
}

await RunRagPipeline();
```

View File

@@ -0,0 +1,63 @@
using Kreuzberg;
using System.Collections.Generic;
/// <summary>
/// In-memory cache in front of <c>KreuzbergLib.ExtractFileAsync</c>, keyed by
/// file path plus the serialized extraction configuration.
/// </summary>
class CustomCacheBackend
{
    private readonly Dictionary<string, ExtractionResult> _cache = new();

    /// <summary>
    /// Returns the cached result for this path/config pair, or extracts the
    /// file, stores the result and returns it.
    /// </summary>
    /// <param name="filePath">Path of the document to extract.</param>
    /// <param name="config">Extraction settings; part of the cache key.</param>
    /// <returns>The cached or freshly extracted result.</returns>
    public async Task<ExtractionResult> GetOrExtractAsync(
        string filePath,
        ExtractionConfig config)
    {
        var cacheKey = GenerateCacheKey(filePath, config);
        if (_cache.TryGetValue(cacheKey, out var cachedResult))
        {
            Console.WriteLine("Using cached result");
            return cachedResult;
        }

        var result = await KreuzbergLib.ExtractFileAsync(filePath, config);
        _cache[cacheKey] = result;
        Console.WriteLine("Result cached");
        return result;
    }

    /// <summary>
    /// Builds the cache key from the path and the JSON form of the config.
    /// </summary>
    /// <remarks>
    /// The key uses the serialized config rather than
    /// <c>config.ToString().GetHashCode()</c>: unless the type overrides
    /// <c>ToString</c>, that yields the same value for every configuration,
    /// and a 32-bit hash can collide even when the strings differ.
    /// NOTE(review): assumes ExtractionConfig serializes with System.Text.Json
    /// defaults — confirm against the bindings.
    /// </remarks>
    private string GenerateCacheKey(string filePath, ExtractionConfig config)
    {
        var configJson = System.Text.Json.JsonSerializer.Serialize(config);
        return $"{filePath}:{configJson}";
    }

    /// <summary>Removes every cached result.</summary>
    public void ClearCache()
    {
        _cache.Clear();
        Console.WriteLine("Cache cleared");
    }
}
/// <summary>
/// Requests the same document twice through <c>CustomCacheBackend</c>, prints
/// the content length each time, then clears the cache.
/// </summary>
class Program
{
    static async Task Main()
    {
        var config = new ExtractionConfig { UseCache = true };
        var cacheBackend = new CustomCacheBackend();

        try
        {
            // Same path and config on both passes.
            for (var attempt = 1; attempt <= 2; attempt++)
            {
                var extracted = await cacheBackend.GetOrExtractAsync("document.pdf", config);
                Console.WriteLine($"Result {attempt}: {extracted.Content.Length} chars");
            }

            cacheBackend.ClearCache();
        }
        catch (KreuzbergException ex)
        {
            Console.WriteLine($"Error: {ex.Message}");
        }
    }
}

View File

@@ -0,0 +1,68 @@
using Kreuzberg;
using System.Text.Json;
// NOTE: IDocumentExtractor interface is not available in C# bindings
/// <summary>
/// Stand-alone helper that turns a UTF-8 JSON payload into an
/// <c>ExtractionResult</c> whose content is every string value found in the
/// document, one per line.
/// </summary>
class CustomJsonProcessor
{
    /// <summary>
    /// Parses <paramref name="content"/> as UTF-8 JSON and collects its string values.
    /// </summary>
    /// <param name="content">Raw JSON bytes, decoded as UTF-8.</param>
    /// <param name="mimeType">MIME type copied onto the result.</param>
    /// <returns>A result with the collected text, empty metadata and no tables.</returns>
    /// <exception cref="KreuzbergParsingException">The payload is not valid JSON.</exception>
    public static ExtractionResult ProcessJson(byte[] content, string mimeType)
    {
        try
        {
            var jsonContent = System.Text.Encoding.UTF8.GetString(content);

            // JsonDocument rents pooled buffers and must be disposed; the text
            // is fully materialized before the document goes out of scope.
            using var document = JsonDocument.Parse(jsonContent);
            var text = ExtractText(document.RootElement);

            return new ExtractionResult
            {
                Content = text,
                MimeType = mimeType,
                Metadata = new Metadata(),
                Tables = new List<Table>(),
                Success = true
            };
        }
        catch (JsonException ex)
        {
            throw new KreuzbergParsingException($"Failed to parse JSON: {ex.Message}");
        }
    }

    /// <summary>
    /// Recursively concatenates string values: strings are emitted followed by
    /// a newline, arrays and objects are walked, everything else yields "".
    /// </summary>
    private static string ExtractText(JsonElement element)
    {
        return element.ValueKind switch
        {
            JsonValueKind.String => element.GetString() + "\n",
            JsonValueKind.Array => string.Concat(
                element.EnumerateArray().Select(ExtractText)
            ),
            JsonValueKind.Object => string.Concat(
                element.EnumerateObject()
                    .Select(p => ExtractText(p.Value))
            ),
            _ => ""
        };
    }
}
/// <summary>
/// Feeds a small inline JSON document through
/// <c>CustomJsonProcessor.ProcessJson</c> and prints the extracted text.
/// </summary>
class Program
{
    static void Main()
    {
        try
        {
            var utf8 = System.Text.Encoding.UTF8;
            var jsonBytes = utf8.GetBytes(@"{""name"": ""John"", ""age"": 30}");

            var extracted = CustomJsonProcessor.ProcessJson(jsonBytes, "application/json");
            Console.WriteLine($"Extracted: {extracted.Content}");
        }
        catch (KreuzbergException ex)
        {
            Console.WriteLine($"Error: {ex.Message}");
        }
    }
}

View File

@@ -0,0 +1,84 @@
using Kreuzberg;
using System.Net.Http;
using System.Text.Json;
/// <summary>
/// OCR backend that posts the image to a remote HTTP endpoint and returns the
/// response body. Owns an <see cref="HttpClient"/>, released in <see cref="Dispose"/>.
/// </summary>
class CloudOcrBackend : IOcrBackend, IDisposable
{
    private readonly string _apiKey;
    private readonly HttpClient _httpClient;

    /// <param name="apiKey">Bearer token sent in the Authorization header.</param>
    public CloudOcrBackend(string apiKey)
    {
        _apiKey = apiKey;
        _httpClient = new HttpClient();
    }

    /// <summary>Name under which this backend is selected in <c>OcrConfig.Backend</c>.</summary>
    public string Name => "cloud-ocr";

    /// <summary>
    /// Uploads the image as multipart form data and returns the raw response
    /// body. Blocks the calling thread until the HTTP round-trip completes.
    /// </summary>
    /// <param name="imageBytes">Image data to upload.</param>
    /// <param name="config">OCR settings; not used by this backend.</param>
    /// <returns>The response body as a string.</returns>
    /// <exception cref="KreuzbergOcrException">The HTTP request failed.</exception>
    public string Process(ReadOnlySpan<byte> imageBytes, OcrConfig? config)
    {
        // ReadOnlySpan<T> is a ref struct: it cannot be captured by a lambda
        // or used in an async method, so copy it before entering the task.
        var bytes = imageBytes.ToArray();

        return Task.Run(async () =>
        {
            try
            {
                using var content = new MultipartFormDataContent();
                content.Add(new ByteArrayContent(bytes), "image");

                using var request = new HttpRequestMessage(
                    HttpMethod.Post,
                    "https://api.example.com/ocr"
                )
                {
                    Content = content,
                    Headers =
                    {
                        { "Authorization", $"Bearer {_apiKey}" }
                    }
                };

                using var response = await _httpClient.SendAsync(request);
                response.EnsureSuccessStatusCode();
                return await response.Content.ReadAsStringAsync();
            }
            catch (HttpRequestException ex)
            {
                throw new KreuzbergOcrException($"Cloud OCR service error: {ex.Message}");
            }
        }).GetAwaiter().GetResult();
    }

    /// <summary>Disposes the underlying HTTP client.</summary>
    public void Dispose()
    {
        _httpClient?.Dispose();
    }
}
/// <summary>
/// Registers <c>CloudOcrBackend</c>, selects it by name in the OCR
/// configuration and prints the text extracted from a PDF.
/// </summary>
class Program
{
    static void Main()
    {
        using var backend = new CloudOcrBackend("your-api-key");
        KreuzbergLib.RegisterOcrBackend(backend);

        try
        {
            var ocrSettings = new OcrConfig { Backend = "cloud-ocr" };
            var config = new ExtractionConfig { Ocr = ocrSettings };

            var extracted = KreuzbergLib.ExtractFileSync("document.pdf", config);
            Console.WriteLine($"OCR text: {extracted.Content}");
        }
        catch (KreuzbergException ex)
        {
            Console.WriteLine($"Error: {ex.Message}");
        }
    }
}

View File

@@ -0,0 +1,80 @@
using Kreuzberg;
/// <summary>
/// Post-processor that counts whitespace-separated words in the extracted
/// content and stores the count under the <c>word_count</c> key of
/// <c>Metadata.Additional</c>.
/// </summary>
class WordCountPostProcessor : IPostProcessor
{
    // Space, newline, carriage return and tab separate words.
    private static readonly char[] WordSeparators = { ' ', '\n', '\r', '\t' };

    public string Name => "word-count";

    public int Priority => 10;

    /// <summary>Adds the word count to the result's additional metadata and returns the same result.</summary>
    public ExtractionResult Process(ExtractionResult result)
    {
        var words = result.Content.Split(WordSeparators, StringSplitOptions.RemoveEmptyEntries);

        // Create the dictionary on first use.
        result.Metadata.Additional ??= new Dictionary<string, System.Text.Json.Nodes.JsonNode?>();
        result.Metadata.Additional["word_count"] = System.Text.Json.Nodes.JsonValue.Create(words.Length);
        return result;
    }
}
/// <summary>
/// Post-processor that stores a placeholder sentiment label under the
/// <c>sentiment</c> key of <c>Metadata.Additional</c>.
/// </summary>
class SentimentPostProcessor : IPostProcessor
{
    public string Name => "sentiment-analyzer";

    public int Priority => 5;

    /// <summary>Adds the sentiment label to the result's additional metadata and returns the same result.</summary>
    public ExtractionResult Process(ExtractionResult result)
    {
        var label = AnalyzeSentiment(result.Content);

        // Create the dictionary on first use.
        result.Metadata.Additional ??= new Dictionary<string, System.Text.Json.Nodes.JsonNode?>();
        result.Metadata.Additional["sentiment"] = System.Text.Json.Nodes.JsonValue.Create(label);
        return result;
    }

    // Placeholder: any non-empty text is "neutral", empty text is "unknown".
    private string AnalyzeSentiment(string text) =>
        text.Length > 0 ? "neutral" : "unknown";
}
/// <summary>
/// Registers the word-count and sentiment post-processors, extracts a PDF and
/// prints whichever of their metadata entries are present on the result.
/// </summary>
class Program
{
    static void Main()
    {
        // Word-count is registered before sentiment.
        KreuzbergLib.RegisterPostProcessor(new WordCountPostProcessor());
        KreuzbergLib.RegisterPostProcessor(new SentimentPostProcessor());

        try
        {
            var extracted = KreuzbergLib.ExtractFileSync("document.pdf");
            var extras = extracted.Metadata.Additional;
            if (extras == null)
            {
                return;
            }

            if (extras.TryGetValue("word_count", out var words))
            {
                Console.WriteLine($"Word count: {words}");
            }

            if (extras.TryGetValue("sentiment", out var mood))
            {
                Console.WriteLine($"Sentiment: {mood}");
            }
        }
        catch (KreuzbergException ex)
        {
            Console.WriteLine($"Error: {ex.Message}");
        }
    }
}

View File

@@ -0,0 +1,82 @@
using Kreuzberg;
/// <summary>
/// Validator that rejects results whose content is shorter than a configured
/// number of characters.
/// </summary>
class MinLengthValidator : IValidator
{
    private readonly int _minLength;

    /// <param name="minLength">Minimum accepted content length, in characters.</param>
    public MinLengthValidator(int minLength) => _minLength = minLength;

    public string Name => "min-length";

    public int Priority => 10;

    /// <exception cref="KreuzbergValidationException">The content is shorter than the minimum.</exception>
    public void Validate(ExtractionResult result)
    {
        var actual = result.Content.Length;
        if (actual >= _minLength)
        {
            return;
        }

        throw new KreuzbergValidationException(
            $"Content too short: {actual} < {_minLength}"
        );
    }
}
/// <summary>
/// Validator that rejects results whose quality score is below a configured
/// threshold.
/// </summary>
class QualityScoreValidator : IValidator
{
    private readonly double _minScore;

    /// <param name="minScore">Lowest accepted quality score.</param>
    public QualityScoreValidator(double minScore) => _minScore = minScore;

    public string Name => "quality-score";

    public int Priority => 5;

    /// <exception cref="KreuzbergValidationException">The score is below the threshold.</exception>
    public void Validate(ExtractionResult result)
    {
        var score = result.QualityScore;

        // Keep the comparison in this exact form so results for which
        // "score < threshold" is false are accepted.
        if (score < _minScore)
        {
            var message = $"Quality score too low: {score:F2} < {_minScore:F2}";
            throw new KreuzbergValidationException(message);
        }
    }
}
/// <summary>
/// Registers a minimum-length and a quality-score validator, then extracts a
/// PDF and reports whether validation passed.
/// </summary>
class Program
{
    static void Main()
    {
        // Construct both validators first, then register in the same order.
        IValidator lengthCheck = new MinLengthValidator(minLength: 50);
        IValidator qualityCheck = new QualityScoreValidator(minScore: 0.7);
        KreuzbergLib.RegisterValidator(lengthCheck);
        KreuzbergLib.RegisterValidator(qualityCheck);

        try
        {
            var extracted = KreuzbergLib.ExtractFileSync(
                "document.pdf",
                new ExtractionConfig { EnableQualityProcessing = true });

            Console.WriteLine("Validation passed");
            Console.WriteLine($"Content length: {extracted.Content.Length}");
        }
        catch (KreuzbergValidationException ex)
        {
            // Listed before the general Kreuzberg catch so validation failures
            // get their own message.
            Console.WriteLine($"Validation failed: {ex.Message}");
        }
        catch (KreuzbergException ex)
        {
            Console.WriteLine($"Error: {ex.Message}");
        }
    }
}

View File

@@ -0,0 +1,18 @@
```csharp title="C#"
using Kreuzberg;
// Embedding settings: preset model, batches of 16, normalization on, and
// download progress shown.
var embedding = new EmbeddingConfig
{
    Model = EmbeddingModelType.Preset("all-mpnet-base-v2"),
    BatchSize = 16,
    Normalize = true,
    ShowDownloadProgress = true
};

// Chunks of up to 1000 characters, each embedded with the settings above.
var config = new ExtractionConfig
{
    Chunking = new ChunkingConfig { MaxChars = 1000, Embedding = embedding }
};
```

View File

@@ -0,0 +1,49 @@
```csharp title="C#"
using Kreuzberg;
using System;
using System.Collections.Generic;
using System.Threading.Tasks;
// Extract a PDF with chunking and embeddings configured, then print an id, a
// preview of at most 50 characters and (when present) the embedding length for
// every chunk.
var config = new ExtractionConfig
{
    Chunking = new ChunkingConfig
    {
        MaxChars = 512,
        MaxOverlap = 50,
        Embedding = new EmbeddingConfig
        {
            Model = EmbeddingModelType.Preset("balanced"),
            Normalize = true,
            BatchSize = 32,
            ShowDownloadProgress = false
        }
    }
};

// The static entry point used throughout the examples is KreuzbergLib;
// "Kreuzberg" on its own is the namespace imported above.
var result = await KreuzbergLib.ExtractFileAsync("document.pdf", config);

// Treat a missing chunk list as empty.
var chunks = result.Chunks ?? new List<Chunk>();
foreach (var (index, chunk) in chunks.WithIndex())
{
    var chunkId = $"doc_chunk_{index}";
    Console.WriteLine($"Chunk {chunkId}: {chunk.Content[..Math.Min(50, chunk.Content.Length)]}");
    if (chunk.Embedding != null)
    {
        Console.WriteLine($" Embedding dimensions: {chunk.Embedding.Length}");
    }
}
/// <summary>Helpers for enumerating sequences together with item positions.</summary>
internal static class EnumerableExtensions
{
    /// <summary>
    /// Lazily pairs every item with its zero-based position in the sequence.
    /// </summary>
    /// <param name="items">Sequence to enumerate.</param>
    /// <returns>Tuples of (position, item) in source order.</returns>
    public static IEnumerable<(int Index, T Item)> WithIndex<T>(
        this IEnumerable<T> items)
    {
        var position = -1;
        foreach (var element in items)
        {
            position += 1;
            yield return (position, element);
        }
    }
}
```

View File

@@ -0,0 +1,72 @@
using Kreuzberg;
/// <summary>
/// Error-handling walkthrough: three independent extraction attempts, each
/// with its own set of catch clauses ordered from specific to general.
/// </summary>
class Program
{
    static async Task Main()
    {
        await ExtractFromFileAsync();
        await ExtractFromBytesAsync();
        await ExtractMissingFileAsync();
    }

    // File extraction with dedicated handlers for parsing, OCR and
    // missing-dependency failures.
    private static async Task ExtractFromFileAsync()
    {
        try
        {
            var extracted = await KreuzbergLib.ExtractFileAsync("document.pdf");
            Console.WriteLine($"Extracted {extracted.Content.Length} characters");
        }
        catch (KreuzbergParsingException ex)
        {
            Console.WriteLine($"Failed to parse document: {ex.Message}");
        }
        catch (KreuzbergOcrException ex)
        {
            Console.WriteLine($"OCR processing failed: {ex.Message}");
        }
        catch (KreuzbergMissingDependencyException ex)
        {
            Console.WriteLine($"Missing dependency: {ex.Message}");
        }
        catch (KreuzbergException ex)
        {
            Console.WriteLine($"Extraction error: {ex.Message}");
        }
    }

    // Byte extraction from a four-byte buffer ("%PDF"), printing at most
    // 100 characters of the content.
    private static async Task ExtractFromBytesAsync()
    {
        try
        {
            var config = new ExtractionConfig();
            var pdfBytes = new byte[] { 0x25, 0x50, 0x44, 0x46 };
            var extracted = await KreuzbergLib.ExtractBytesAsync(
                pdfBytes,
                "application/pdf",
                config
            );

            string preview;
            if (extracted.Content.Length > 100)
            {
                preview = extracted.Content[..100] + "...";
            }
            else
            {
                preview = extracted.Content;
            }

            Console.WriteLine($"Extracted: {preview}");
        }
        catch (KreuzbergValidationException ex)
        {
            Console.WriteLine($"Invalid configuration: {ex.Message}");
        }
        catch (KreuzbergOcrException ex)
        {
            Console.WriteLine($"OCR failed: {ex.Message}");
        }
        catch (KreuzbergException ex)
        {
            Console.WriteLine($"Extraction failed: {ex.Message}");
        }
    }

    // Extraction of a path that is expected not to exist; the result is discarded.
    private static async Task ExtractMissingFileAsync()
    {
        try
        {
            _ = await KreuzbergLib.ExtractFileAsync("nonexistent.pdf");
        }
        catch (KreuzbergIOException)
        {
            Console.WriteLine("File not found");
        }
        catch (Exception ex)
        {
            Console.WriteLine($"Unexpected error: {ex.Message}");
        }
    }
}

View File

@@ -0,0 +1,66 @@
using Kreuzberg;
/// <summary>
/// Byte-based extraction examples: a PDF read from disk (with and without a
/// configuration), an empty image buffer, and a small named set of files.
/// </summary>
class Program
{
    static async Task Main()
    {
        const string pdfMime = "application/pdf";

        try
        {
            var pdfBytes = await File.ReadAllBytesAsync("document.pdf");

            var plain = await KreuzbergLib.ExtractBytesAsync(pdfBytes, pdfMime);
            Console.WriteLine($"Content: {plain.Content}");
            Console.WriteLine($"MIME type: {plain.MimeType}");

            var config = new ExtractionConfig
            {
                UseCache = true,
                EnableQualityProcessing = true
            };
            var configured = await KreuzbergLib.ExtractBytesAsync(pdfBytes, pdfMime, config);
            Console.WriteLine($"Configured extraction: {configured.Content.Length} chars");

            // Placeholder: an empty buffer is passed as the image.
            var imageBytes = new byte[0];
            var imageResult = await KreuzbergLib.ExtractBytesAsync(imageBytes, "image/jpeg");
            Console.WriteLine($"Image text: {imageResult.Content}");

            var multipleFiles = new Dictionary<string, (byte[], string)>();
            multipleFiles.Add("file1", (await File.ReadAllBytesAsync("file1.pdf"), pdfMime));
            multipleFiles.Add("file2", (await File.ReadAllBytesAsync("file2.pdf"), pdfMime));

            foreach (var entry in multipleFiles)
            {
                var (bytes, mimeType) = entry.Value;
                var extractResult = await KreuzbergLib.ExtractBytesAsync(bytes, mimeType);
                Console.WriteLine($"{entry.Key}: {extractResult.Content.Length} chars");
            }
        }
        catch (KreuzbergException ex)
        {
            Console.WriteLine($"Extraction error: {ex.Message}");
        }
        catch (IOException ex)
        {
            Console.WriteLine($"File I/O error: {ex.Message}");
        }
    }
}

View File

@@ -0,0 +1,73 @@
using Kreuzberg;
using System.Net.Http;
/// <summary>
/// Downloads documents over HTTP and extracts them from bytes: a single URL
/// (with and without quality processing), then three URLs concurrently, where
/// a failed download is logged and counted as unsuccessful.
/// </summary>
class Program
{
    static async Task Main()
    {
        using var httpClient = new HttpClient();

        // Downloads and extracts one URL; a failed download yields null.
        async Task<ExtractionResult?> DownloadAndExtractAsync(string u)
        {
            try
            {
                var bytes = await httpClient.GetByteArrayAsync(u);
                return await KreuzbergLib.ExtractBytesAsync(
                    bytes,
                    "application/pdf"
                );
            }
            catch (HttpRequestException ex)
            {
                Console.WriteLine($"Download failed for {u}: {ex.Message}");
                return null;
            }
        }

        try
        {
            var url = "https://example.com/document.pdf";
            var documentBytes = await httpClient.GetByteArrayAsync(url);

            var plain = await KreuzbergLib.ExtractBytesAsync(documentBytes, "application/pdf");
            Console.WriteLine($"Extracted from URL: {plain.Content.Length} chars");

            var config = new ExtractionConfig { EnableQualityProcessing = true };
            var scored = await KreuzbergLib.ExtractBytesAsync(
                documentBytes,
                "application/pdf",
                config
            );
            Console.WriteLine($"Quality score: {scored.QualityScore}");

            var urls = new[]
            {
                "https://example.com/doc1.pdf",
                "https://example.com/doc2.pdf",
                "https://example.com/doc3.pdf"
            };

            var downloadTasks = urls.Select(DownloadAndExtractAsync);
            var results = await Task.WhenAll(downloadTasks);

            var successCount = results.Count(r => r != null);
            Console.WriteLine($"Successfully processed {successCount} documents");
        }
        catch (HttpRequestException ex)
        {
            Console.WriteLine($"HTTP error: {ex.Message}");
        }
        catch (KreuzbergException ex)
        {
            Console.WriteLine($"Extraction error: {ex.Message}");
        }
    }
}

View File

@@ -0,0 +1,98 @@
using Kreuzberg;
/// <summary>
/// Extraction with most configuration sections set explicitly (OCR, PDF,
/// images, chunking, token reduction, language detection, post-processing),
/// followed by a short summary of the result.
/// </summary>
class Program
{
    static async Task Main()
    {
        try
        {
            var config = BuildConfig();
            var result = await KreuzbergLib.ExtractFileAsync("document.pdf", config);

            Console.WriteLine($"Content length: {result.Content.Length}");
            Console.WriteLine($"MIME type: {result.MimeType}");
            Console.WriteLine($"Format type: {result.Metadata.FormatType}");

            if (result.Tables.Any())
            {
                Console.WriteLine($"Found {result.Tables.Count} tables");
            }

            // Chunks may be null; only report when at least one exists.
            var hasChunks = result.Chunks?.Any() == true;
            if (hasChunks)
            {
                Console.WriteLine($"Created {result.Chunks.Count} chunks");
            }
        }
        catch (KreuzbergException ex)
        {
            Console.WriteLine($"Extraction error: {ex.Message}");
        }
    }

    // Assembles the full configuration from per-section objects.
    private static ExtractionConfig BuildConfig()
    {
        var preprocessing = new ImagePreprocessingConfig
        {
            TargetDpi = 300,
            Denoise = true,
            Deskew = true,
            ContrastEnhance = true
        };

        var tesseract = new TesseractConfig
        {
            Psm = 3,
            Oem = 3,
            MinConfidence = 0.8,
            Preprocessing = preprocessing,
            EnableTableDetection = true
        };

        var ocr = new OcrConfig
        {
            Backend = "tesseract",
            Language = "eng+fra",
            TesseractConfig = tesseract
        };

        var pdf = new PdfConfig { ExtractImages = true, ExtractMetadata = true };

        var images = new ImageExtractionConfig
        {
            ExtractImages = true,
            TargetDpi = 150,
            MaxImageDimension = 4096
        };

        var chunking = new ChunkingConfig
        {
            MaxChars = 1000,
            MaxOverlap = 200,
            Preset = "default"
        };

        var tokenReduction = new TokenReductionConfig
        {
            Mode = "moderate",
            PreserveImportantWords = true
        };

        var languageDetection = new LanguageDetectionConfig
        {
            Enabled = true,
            MinConfidence = 0.8,
            DetectMultiple = false
        };

        return new ExtractionConfig
        {
            UseCache = true,
            EnableQualityProcessing = true,
            ForceOcr = false,
            Ocr = ocr,
            PdfOptions = pdf,
            Images = images,
            Chunking = chunking,
            TokenReduction = tokenReduction,
            LanguageDetection = languageDetection,
            Postprocessor = new PostProcessorConfig { Enabled = true }
        };
    }
}

View File

@@ -0,0 +1,15 @@
```csharp title="C#"
using Kreuzberg;
// Keyword settings: YAKE algorithm, at most 10 keywords, minimum score 0.3,
// n-gram range (1, 3), language "en".
var keywordSettings = new KeywordConfig
{
    Algorithm = KeywordAlgorithm.Yake,
    MaxKeywords = 10,
    MinScore = 0.3,
    NgramRange = (1, 3),
    Language = "en"
};

var config = new ExtractionConfig { Keywords = keywordSettings };
```

View File

@@ -0,0 +1,30 @@
```csharp title="C#"
using Kreuzberg;
using System.Collections.Generic;
// Extract a document with YAKE keyword extraction configured and print each
// keyword's text and score (three decimals).
var keywordSettings = new KeywordConfig
{
    Algorithm = KeywordAlgorithm.Yake,
    MaxKeywords = 10,
    MinScore = 0.3
};
var config = new ExtractionConfig { Keywords = keywordSettings };

var result = await KreuzbergLib.ExtractFileAsync("research_paper.pdf", config);

// NOTE(review): this reads keywords through dictionary-style access on
// Metadata, while other examples use result.ExtractedKeywords and treat
// Metadata as an object with properties — confirm which shape the bindings expose.
if (result.Metadata.ContainsKey("keywords"))
{
    var keywords = (List<Dictionary<string, object>>)result.Metadata["keywords"];
    foreach (var entry in keywords)
    {
        var label = (string)entry["text"];
        var weight = (double)entry["score"];
        Console.WriteLine($"{label}: {weight:F3}");
    }
}
```

View File

@@ -0,0 +1,37 @@
using Kreuzberg;
/// <summary>
/// Single-language detection example: extracts a PDF with language detection
/// enabled and prints the first detected language, if any, plus the content length.
/// </summary>
class Program
{
    static async Task Main()
    {
        var detection = new LanguageDetectionConfig
        {
            Enabled = true,
            MinConfidence = 0.8m,
            DetectMultiple = false
        };
        var config = new ExtractionConfig { LanguageDetection = detection };

        try
        {
            var result = await KreuzbergLib.ExtractFileAsync("document.pdf", config);

            // DetectedLanguages may be null or empty when nothing was detected.
            var languages = result.DetectedLanguages;
            var summary = languages?.Count > 0
                ? $"Detected Language: {languages[0]}"
                : "No language detected";
            Console.WriteLine(summary);

            Console.WriteLine($"Content length: {result.Content.Length} characters");
        }
        catch (KreuzbergException ex)
        {
            Console.WriteLine($"Extraction failed: {ex.Message}");
        }
    }
}

View File

@@ -0,0 +1,39 @@
```csharp title="C#"
using Kreuzberg;
/// <summary>
/// Single-language detection example: extracts a PDF with language detection
/// enabled and prints the first detected language, if any, plus the content length.
/// </summary>
class Program
{
    static async Task Main()
    {
        var detection = new LanguageDetectionConfig
        {
            Enabled = true,
            MinConfidence = 0.8m,
            DetectMultiple = false
        };
        var config = new ExtractionConfig { LanguageDetection = detection };

        try
        {
            var result = await KreuzbergLib.ExtractFileAsync("document.pdf", config);

            // DetectedLanguages may be null or empty when nothing was detected.
            var languages = result.DetectedLanguages;
            var summary = languages?.Count > 0
                ? $"Detected Language: {languages[0]}"
                : "No language detected";
            Console.WriteLine(summary);

            Console.WriteLine($"Content length: {result.Content.Length} characters");
        }
        catch (KreuzbergException ex)
        {
            Console.WriteLine($"Extraction failed: {ex.Message}");
        }
    }
}
```

View File

@@ -0,0 +1,40 @@
using Kreuzberg;
/// <summary>
/// Multi-language detection example: extracts a PDF with
/// <c>DetectMultiple</c> enabled and prints all detected languages, the
/// content length and the MIME type.
/// </summary>
class Program
{
    static async Task Main()
    {
        var detection = new LanguageDetectionConfig
        {
            Enabled = true,
            MinConfidence = 0.8m,
            DetectMultiple = true
        };
        var config = new ExtractionConfig { LanguageDetection = detection };

        try
        {
            var result = await KreuzbergLib.ExtractFileAsync("multilingual_document.pdf", config);

            // A missing language list is treated the same as an empty one.
            var languages = result.DetectedLanguages ?? new List<string>();
            var summary = languages.Count > 0
                ? $"Detected {languages.Count} language(s): {string.Join(", ", languages)}"
                : "No languages detected";
            Console.WriteLine(summary);

            Console.WriteLine($"Total content: {result.Content.Length} characters");
            Console.WriteLine($"MIME type: {result.MimeType}");
        }
        catch (KreuzbergException ex)
        {
            Console.WriteLine($"Processing failed: {ex.Message}");
        }
    }
}

View File

@@ -0,0 +1,42 @@
```csharp title="C#"
using Kreuzberg;
/// <summary>
/// Multi-language detection example: extracts a PDF with
/// <c>DetectMultiple</c> enabled and prints all detected languages, the
/// content length and the MIME type.
/// </summary>
class Program
{
    static async Task Main()
    {
        var detection = new LanguageDetectionConfig
        {
            Enabled = true,
            MinConfidence = 0.8m,
            DetectMultiple = true
        };
        var config = new ExtractionConfig { LanguageDetection = detection };

        try
        {
            var result = await KreuzbergLib.ExtractFileAsync("multilingual_document.pdf", config);

            // A missing language list is treated the same as an empty one.
            var languages = result.DetectedLanguages ?? new List<string>();
            var summary = languages.Count > 0
                ? $"Detected {languages.Count} language(s): {string.Join(", ", languages)}"
                : "No languages detected";
            Console.WriteLine(summary);

            Console.WriteLine($"Total content: {result.Content.Length} characters");
            Console.WriteLine($"MIME type: {result.MimeType}");
        }
        catch (KreuzbergException ex)
        {
            Console.WriteLine($"Processing failed: {ex.Message}");
        }
    }
}
```

View File

@@ -0,0 +1,65 @@
using Kreuzberg;
using System.Collections.Generic;
/// <summary>
/// Plugin registry tour: lists registered extractors, OCR backends,
/// post-processors and validators, then registers and unregisters a custom
/// post-processor and clears all validators.
/// </summary>
class Program
{
    static void Main()
    {
        try
        {
            PrintEntries("Registered Document Extractors:", KreuzbergLib.ListDocumentExtractors());
            PrintEntries("\nRegistered OCR Backends:", KreuzbergLib.ListOcrBackends());
            PrintEntries("\nRegistered Post-Processors:", KreuzbergLib.ListPostProcessors());
            PrintEntries("\nRegistered Validators:", KreuzbergLib.ListValidators());

            var customProcessor = new CustomPostProcessor();
            KreuzbergLib.RegisterPostProcessor(customProcessor);
            Console.WriteLine($"\nRegistered custom post-processor: {customProcessor.Name}");

            KreuzbergLib.UnregisterPostProcessor(customProcessor.Name);
            Console.WriteLine($"Unregistered post-processor: {customProcessor.Name}");

            KreuzbergLib.ClearValidators();
            Console.WriteLine("All validators cleared");
        }
        catch (KreuzbergException ex)
        {
            Console.WriteLine($"Plugin registry error: {ex.Message}");
        }
    }

    // Prints a heading followed by one " - item" line per entry. The registry
    // is queried by the caller before the heading is written.
    private static void PrintEntries<T>(string heading, IEnumerable<T> entries)
    {
        Console.WriteLine(heading);
        foreach (var entry in entries)
        {
            Console.WriteLine($" - {entry}");
        }
    }
}
/// <summary>
/// Sample post-processor that upper-cases the extracted content.
/// </summary>
class CustomPostProcessor : IPostProcessor
{
    public string Name => "custom-processor";

    public int Priority => 50;

    /// <summary>Replaces the content with its upper-case form and returns the same result.</summary>
    public ExtractionResult Process(ExtractionResult result)
    {
        // Invariant casing keeps the output independent of the machine's
        // current culture (e.g. Turkish dotted/dotless i).
        result.Content = result.Content.ToUpperInvariant();
        return result;
    }
}

View File

@@ -0,0 +1,17 @@
```csharp title="C#"
using Kreuzberg;
// Extract with quality processing enabled and print the score reported on the result.
ExtractionConfig config = new() { EnableQualityProcessing = true };
var extraction = await KreuzbergLib.ExtractFileAsync("document.pdf", config);
Console.WriteLine($"Quality score: {extraction.QualityScore:F2}");
```

View File

@@ -0,0 +1,29 @@
```csharp title="C#"
using Kreuzberg;
// Extract a scanned document and warn when the reported quality score is low.
var config = new ExtractionConfig
{
    EnableQualityProcessing = true
};

// Synchronous call: use the *Sync entry point. ExtractFile is awaited in the
// other samples, so its return value has no QualityScore to read directly.
var result = KreuzbergLib.ExtractFileSync(
    "scanned_document.pdf",
    config
);

var qualityScore = result.QualityScore;
if (qualityScore < 0.5)
{
    Console.WriteLine(
        $"Warning: Low quality extraction ({qualityScore:F2})"
    );
    Console.WriteLine(
        "Consider re-scanning with higher DPI or adjusting OCR settings"
    );
}
else
{
    Console.WriteLine($"Quality score: {qualityScore:F2}");
}
```

View File

@@ -0,0 +1,108 @@
using Kreuzberg;
using System.IO;
/// <summary>
/// Demonstrates handling a large document in fixed-size pieces, either by
/// slicing the extracted text eagerly or by streaming pieces lazily.
/// </summary>
class Program
{
    /// <summary>Maximum number of characters shown when previewing a streamed chunk.</summary>
    private const int PreviewLength = 50;

    /// <summary>Slice size used when the extraction result carries no chunks of its own.</summary>
    private const int FallbackChunkSize = 512;

    static async Task Main()
    {
        try
        {
            var filePath = "large_document.pdf";
            await ProcessLargeFileAsync(filePath);
        }
        catch (Exception ex)
        {
            Console.WriteLine($"Error: {ex.Message}");
        }
    }

    /// <summary>Extracts <paramref name="filePath"/> and processes its text in 1000-character slices.</summary>
    static async Task ProcessLargeFileAsync(string filePath)
    {
        var config = new ExtractionConfig
        {
            EnableQualityProcessing = true
        };
        var result = await KreuzbergLib.ExtractFileAsync(filePath, config);
        var contentChunks = ChunkContent(result.Content, chunkSize: 1000);
        Console.WriteLine($"Processing {contentChunks.Count} chunks");

        // Plain index loop: no LINQ needed just to pair each slice with its position.
        for (int index = 0; index < contentChunks.Count; index++)
        {
            var chunk = contentChunks[index];
            Console.WriteLine($"Chunk {index}: {chunk.Length} characters");
            await ProcessChunkAsync(chunk);
        }
    }

    /// <summary>Prints the whitespace-separated word count of <paramref name="chunk"/>.</summary>
    static async Task ProcessChunkAsync(string chunk)
    {
        var wordCount = chunk.Split(
            new[] { ' ', '\n', '\r' },
            StringSplitOptions.RemoveEmptyEntries
        ).Length;
        Console.WriteLine($" Words: {wordCount}");
        await Task.Delay(10);
    }

    /// <summary>
    /// Splits <paramref name="content"/> into consecutive slices of at most
    /// <paramref name="chunkSize"/> characters; the last slice may be shorter.
    /// </summary>
    /// <returns>The slices in order; empty when <paramref name="content"/> is null or empty.</returns>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="chunkSize"/> is not positive.</exception>
    static List<string> ChunkContent(string content, int chunkSize)
    {
        // A non-positive step would never advance the loop below.
        if (chunkSize <= 0)
        {
            throw new ArgumentOutOfRangeException(nameof(chunkSize), chunkSize, "Chunk size must be positive.");
        }

        var chunks = new List<string>();
        if (string.IsNullOrEmpty(content))
        {
            return chunks;
        }

        for (int i = 0; i < content.Length; i += chunkSize)
        {
            chunks.Add(content.Substring(i, Math.Min(chunkSize, content.Length - i)));
        }
        return chunks;
    }

    /// <summary>
    /// Yields the chunks of the extraction result when it has any; otherwise
    /// yields the raw content in <see cref="FallbackChunkSize"/>-character slices.
    /// </summary>
    static async IAsyncEnumerable<string> StreamExtractedChunksAsync(
        string filePath)
    {
        var result = await KreuzbergLib.ExtractFileAsync(filePath);
        if (result.Chunks is { Count: > 0 })
        {
            foreach (var chunk in result.Chunks)
            {
                yield return chunk.Content;
                await Task.Yield();
            }
        }
        else
        {
            // Reuse the shared slicing helper instead of repeating its loop here.
            foreach (var slice in ChunkContent(result.Content, FallbackChunkSize))
            {
                yield return slice;
                await Task.Yield();
            }
        }
    }

    /// <summary>Consumes the chunk stream and prints a short preview of every chunk.</summary>
    static async Task StreamProcessingExample()
    {
        var streamEnumerator = StreamExtractedChunksAsync("document.pdf");
        int index = 0;
        await foreach (var chunk in streamEnumerator)
        {
            // Clamp the range: the final chunk (or any short one) can be under
            // PreviewLength characters, and an unclamped [..50] would throw.
            var preview = chunk[..Math.Min(PreviewLength, chunk.Length)];
            Console.WriteLine($"Chunk {index++}: {preview}...");
        }
    }
}

View File

@@ -0,0 +1,14 @@
```csharp title="C#"
using Kreuzberg;
// Token-reduction settings are built on their own, then attached to the extraction config.
var tokenReduction = new TokenReductionConfig
{
    Mode = "moderate", // "off", "moderate", or "aggressive"
    PreserveMarkdown = true,
    PreserveCode = true,
    LanguageHint = "eng"
};
var config = new ExtractionConfig { TokenReduction = tokenReduction };
```

View File

@@ -0,0 +1,32 @@
```csharp title="C#"
using Kreuzberg;
// Extract with token reduction enabled and report how many tokens were removed.
var config = new ExtractionConfig
{
    TokenReduction = new TokenReductionConfig
    {
        Mode = "moderate",
        PreserveMarkdown = true
    }
};
var result = await KreuzbergLib.ExtractFileAsync(
    "verbose_document.pdf",
    config
);

// TryGetValue does one lookup per key. Convert accepts any boxed numeric
// type, whereas a hard (int)/(double) unboxing cast throws unless the boxed
// type matches exactly. Missing keys fall back to zero.
var invariant = System.Globalization.CultureInfo.InvariantCulture;
var original = result.Metadata.TryGetValue("original_token_count", out var originalRaw)
    ? Convert.ToInt32(originalRaw, invariant)
    : 0;
var reduced = result.Metadata.TryGetValue("token_count", out var reducedRaw)
    ? Convert.ToInt32(reducedRaw, invariant)
    : 0;
var ratio = result.Metadata.TryGetValue("token_reduction_ratio", out var ratioRaw)
    ? Convert.ToDouble(ratioRaw, invariant)
    : 0.0;

Console.WriteLine($"Reduced from {original} to {reduced} tokens");
Console.WriteLine($"Reduction: {ratio * 100:F1}%");
```

View File

@@ -0,0 +1,74 @@
```csharp title="C#"
using Kreuzberg;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
/// <summary>
/// Extracts a document into embedded chunks and hands them to a
/// (placeholder) vector store.
/// </summary>
public class VectorDatabaseIntegration
{
    /// <summary>One chunk of a document, ready to be written to a vector store.</summary>
    public class VectorRecord
    {
        /// <summary>Identifier of the form <c>{documentId}_chunk_{index}</c>.</summary>
        public string Id { get; set; }

        /// <summary>Embedding copied from the chunk; may be null or empty.</summary>
        public float[] Embedding { get; set; }

        /// <summary>Text content of the chunk.</summary>
        public string Content { get; set; }

        /// <summary>Document id, chunk index and content length, all as strings.</summary>
        public Dictionary<string, string> Metadata { get; set; }
    }

    /// <summary>
    /// Extracts <paramref name="documentPath"/> with chunking and embeddings
    /// configured, converts each chunk to a <see cref="VectorRecord"/> and stores the records.
    /// </summary>
    /// <param name="documentPath">Path of the document to extract.</param>
    /// <param name="documentId">Identifier used as prefix for record ids and in the metadata.</param>
    /// <returns>The records that were passed to the store, in chunk order.</returns>
    public async Task<List<VectorRecord>> ExtractAndVectorize(
        string documentPath,
        string documentId)
    {
        var config = new ExtractionConfig
        {
            Chunking = new ChunkingConfig
            {
                MaxChars = 512,
                MaxOverlap = 50,
                Embedding = new EmbeddingConfig
                {
                    Model = EmbeddingModelType.Preset("balanced"),
                    Normalize = true,
                    BatchSize = 32
                }
            }
        };

        // The static entry point is KreuzbergLib; "Kreuzberg" alone is the namespace.
        var result = await KreuzbergLib.ExtractFileAsync(documentPath, config);
        var chunks = result.Chunks ?? new List<Chunk>();

        var vectorRecords = chunks
            .Select((chunk, index) => new VectorRecord
            {
                Id = $"{documentId}_chunk_{index}",
                Content = chunk.Content,
                Embedding = chunk.Embedding,
                Metadata = new Dictionary<string, string>
                {
                    { "document_id", documentId },
                    { "chunk_index", index.ToString() },
                    { "content_length", chunk.Content.Length.ToString() }
                }
            })
            .ToList();

        await StoreInVectorDatabase(vectorRecords);
        return vectorRecords;
    }

    /// <summary>
    /// Placeholder store: logs every record that carries a non-empty embedding.
    /// </summary>
    private Task StoreInVectorDatabase(List<VectorRecord> records)
    {
        foreach (var record in records)
        {
            if (record.Embedding != null && record.Embedding.Length > 0)
            {
                Console.WriteLine(
                    $"Storing {record.Id}: {record.Content.Length} chars, " +
                    $"{record.Embedding.Length} dims");
            }
        }

        // Nothing asynchronous happens here, so return a completed task
        // instead of paying for an async state machine.
        return Task.CompletedTask;
    }
}
```

View File

@@ -0,0 +1,29 @@
```csharp title="C#"
using Kreuzberg;
// Combine OCR, chunking, token reduction and language detection in one config.
var config = new ExtractionConfig
{
    Ocr = new OcrConfig { Backend = "tesseract", Language = "eng+deu" },
    Chunking = new ChunkingConfig { MaxChars = 1000, MaxOverlap = 100 },
    TokenReduction = new TokenReductionConfig { Enabled = true },
    LanguageDetection = new LanguageDetectionConfig
    {
        Enabled = true,
        DetectMultiple = true
    },
    UseCache = true,
    EnableQualityProcessing = true
};
var result = KreuzbergLib.ExtractFileSync("document.pdf", config);

// Skip the chunk listing when no chunk list was returned instead of failing
// with a NullReferenceException in the foreach.
if (result.Chunks is not null)
{
    foreach (var chunk in result.Chunks)
    {
        Console.WriteLine($"Chunk: {chunk.Content[..Math.Min(100, chunk.Content.Length)]}");
    }
}

if (result.DetectedLanguages?.Count > 0)
{
    Console.WriteLine($"Languages: {string.Join(", ", result.DetectedLanguages)}");
}
```

View File

@@ -0,0 +1,17 @@
```csharp title="C#"
using Kreuzberg;
// Read both inputs first, then describe each one as a batch item.
var pdfBytes = await File.ReadAllBytesAsync("doc1.pdf");
var textBytes = await File.ReadAllBytesAsync("doc2.txt");
var items = new List<BatchBytesItem>
{
    new BatchBytesItem { Content = pdfBytes, MimeType = "application/pdf", Config = null },
    new BatchBytesItem { Content = textBytes, MimeType = "text/plain", Config = null }
};

// The second argument is the config passed alongside the per-item Config values.
var config = new ExtractionConfig { OutputFormat = OutputFormat.Text };
foreach (var extraction in KreuzbergLib.BatchExtractBytesSync(items, config))
{
    Console.WriteLine($"Content length: {extraction.Content.Length}");
}
```

View File

@@ -0,0 +1,21 @@
```csharp title="C#"
using Kreuzberg;
// The first file carries no per-item config; the second one forces OCR.
var plainItem = new BatchFileItem { Path = "document1.pdf", Config = null };
var ocrItem = new BatchFileItem
{
    Path = "document2.pdf",
    Config = new FileExtractionConfig { ForceOcr = true }
};
var items = new List<BatchFileItem> { plainItem, ocrItem };

var config = new ExtractionConfig { OutputFormat = OutputFormat.Text };
foreach (var extraction in KreuzbergLib.BatchExtractFilesSync(items, config))
{
    Console.WriteLine($"Content length: {extraction.Content.Length}");
}
```

View File

@@ -0,0 +1,45 @@
```csharp title="C#"
using System.Net.Http;
using System.Net.Http.Json;
using System.Text.Json.Serialization;
/// <summary>
/// JSON body posted to the <c>/chunk</c> endpoint. Property names are mapped
/// to snake_case keys via <see cref="JsonPropertyNameAttribute"/>.
/// </summary>
public record ChunkRequest(
[property: JsonPropertyName("text")] string Text,
[property: JsonPropertyName("max_characters")] int? MaxCharacters = null,
[property: JsonPropertyName("overlap")] int? Overlap = null,
[property: JsonPropertyName("chunker_type")] string? ChunkerType = null
);
/// <summary>JSON body read back from the <c>/chunk</c> endpoint.</summary>
public record ChunkResponse(
[property: JsonPropertyName("chunks")] List<ChunkItem> Chunks,
[property: JsonPropertyName("chunk_count")] int ChunkCount
);
/// <summary>A single chunk inside a <see cref="ChunkResponse"/>.</summary>
public record ChunkItem(
[property: JsonPropertyName("content")] string Content,
[property: JsonPropertyName("chunk_index")] int ChunkIndex
);
/// <summary>
/// Posts a text to the local <c>/chunk</c> endpoint and prints a preview of
/// every chunk in the response.
/// </summary>
class Program
{
    static async Task Main()
    {
        // Dispose the client and the response once the sample is done.
        using var client = new HttpClient();
        var request = new ChunkRequest(
            Text: "Your long text content here...",
            MaxCharacters: 1000,
            Overlap: 50,
            ChunkerType: "text"
        );

        using var response = await client.PostAsJsonAsync("http://localhost:8000/chunk", request);

        // An error response does not contain a ChunkResponse payload, so
        // report the status instead of trying to deserialize it.
        if (!response.IsSuccessStatusCode)
        {
            Console.WriteLine($"Error: {response.StatusCode}");
            return;
        }

        var result = await response.Content.ReadFromJsonAsync<ChunkResponse>();
        Console.WriteLine($"Created {result?.ChunkCount} chunks");
        foreach (var chunk in result?.Chunks ?? [])
        {
            Console.WriteLine($"Chunk {chunk.ChunkIndex}: {chunk.Content[..Math.Min(50, chunk.Content.Length)]}...");
        }
    }
}
```

View File

@@ -0,0 +1,25 @@
```csharp title="C#"
using System.Net.Http;
using System.Net.Http.Json;
// Upload a document to the local /extract endpoint as multipart form data.
// Using declarations release the client, file handle, form and response at
// the end of the script; previously the client and response were never disposed.
using var client = new HttpClient();
using var fileStream = File.OpenRead("document.pdf");
using var content = new MultipartFormDataContent();
content.Add(new StreamContent(fileStream), "files", "document.pdf");

using var response = await client.PostAsync("http://localhost:8000/extract", content);
if (response.IsSuccessStatusCode)
{
    var json = await response.Content.ReadAsStringAsync();
    Console.WriteLine(json);
}
else
{
    Console.WriteLine($"Error: {response.StatusCode}");
}
```

View File

@@ -0,0 +1,44 @@
```csharp title="C#"
using Kreuzberg;
// Each feature area gets its own settings object; they are assembled below.
var ocr = new OcrConfig
{
    Enabled = true,
    Backend = OcrBackendType.Tesseract,
    Languages = ["eng"]
};
var images = new ImageExtractionConfig
{
    Enabled = true,
    MinImageHeight = 100,
    MinImageWidth = 100
};
var chunking = new ChunkingConfig
{
    Enabled = true,
    ChunkerType = ChunkerType.Text,
    MaxCharacters = 2000,
    Overlap = 100
};
var languageDetection = new LanguageDetectionConfig { Enabled = true };

var config = new ExtractionConfig
{
    OutputFormat = OutputFormat.Markdown,
    UseCache = true,
    Ocr = ocr,
    ImageExtraction = images,
    Chunking = chunking,
    LanguageDetection = languageDetection
};

try
{
    var extraction = await KreuzbergLib.ExtractFile("document.pdf", null, config);
    Console.WriteLine($"Content: {extraction.Content}");
    Console.WriteLine($"Language: {extraction.Metadata?.LanguageDetection}");
    Console.WriteLine($"Format: {extraction.OutputFormat}");
}
catch (KreuzbergException extractionError)
{
    Console.WriteLine($"Extraction error: {extractionError.Message}");
}
```

View File

@@ -0,0 +1,18 @@
```csharp title="C#"
using Kreuzberg;
// Library failures are reported with their Code and Message; any other
// exception falls through to the generic handler.
try
{
    Console.WriteLine(KreuzbergLib.ExtractFileSync("nonexistent.pdf", null, null).Content);
}
catch (KreuzbergException extractionError)
{
    Console.WriteLine($"Error Code: {extractionError.Code}");
    Console.WriteLine($"Error Message: {extractionError.Message}");
}
catch (Exception unexpected)
{
    Console.WriteLine($"Unexpected error: {unexpected.Message}");
}
```

View File

@@ -0,0 +1,22 @@
```csharp title="C#"
using Kreuzberg;
// Try to extract an unsupported payload and map the library error code to a message.
try
{
    var payload = File.ReadAllBytes("document.unsupported");
    var extraction = KreuzbergLib.ExtractBytesSync(payload, "application/x-custom", null);
    Console.WriteLine(extraction.Content);
}
catch (KreuzbergException ex)
{
    // Codes 1 and 2 get fixed messages; every other code is printed verbatim.
    var message = ex.Code switch
    {
        1 => "Validation error: Invalid MIME type",
        2 => "Format error: MIME type not supported",
        _ => $"Extraction failed with error {ex.Code}: {ex.Message}",
    };
    Console.WriteLine(message);
}
```

View File

@@ -0,0 +1,10 @@
```csharp title="C#"
using Kreuzberg;
// Load the document into memory and extract it asynchronously as plain text.
byte[] pdfBytes = await File.ReadAllBytesAsync("document.pdf");
ExtractionConfig textConfig = new() { OutputFormat = OutputFormat.Text };
var extraction = await KreuzbergLib.ExtractBytes(pdfBytes, "application/pdf", textConfig);
Console.WriteLine(extraction.Content);
Console.WriteLine($"MIME Type: {extraction.MimeType}");
```

View File

@@ -0,0 +1,10 @@
```csharp title="C#"
using Kreuzberg;
// Load the document into memory and extract it synchronously as plain text.
byte[] pdfBytes = File.ReadAllBytes("document.pdf");
ExtractionConfig textConfig = new() { OutputFormat = OutputFormat.Text };
var extraction = KreuzbergLib.ExtractBytesSync(pdfBytes, "application/pdf", textConfig);
Console.WriteLine(extraction.Content);
Console.WriteLine($"MIME Type: {extraction.MimeType}");
```

View File

@@ -0,0 +1,9 @@
```csharp title="C#"
using Kreuzberg;
// Asynchronous file extraction with plain-text output; no MIME type is passed (null).
ExtractionConfig textConfig = new() { OutputFormat = OutputFormat.Text };
var extraction = await KreuzbergLib.ExtractFile("document.pdf", null, textConfig);
Console.WriteLine(extraction.Content);
Console.WriteLine($"MIME Type: {extraction.MimeType}");
```

View File

@@ -0,0 +1,9 @@
```csharp title="C#"
using Kreuzberg;
// Synchronous file extraction with plain-text output; no MIME type is passed (null).
ExtractionConfig textConfig = new() { OutputFormat = OutputFormat.Text };
var extraction = KreuzbergLib.ExtractFileSync("document.pdf", null, textConfig);
Console.WriteLine(extraction.Content);
Console.WriteLine($"MIME Type: {extraction.MimeType}");
```

View File

@@ -0,0 +1,13 @@
```csharp title="C#"
using Kreuzberg;
// Pair each in-memory document with its MIME type, then extract the batch in one call.
var pdfBytes = await File.ReadAllBytesAsync("doc1.pdf");
var docxBytes = await File.ReadAllBytesAsync("doc2.docx");
BytesWithMime[] documents =
{
    new BytesWithMime(pdfBytes, "application/pdf"),
    new BytesWithMime(docxBytes, "application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
};

var defaultConfig = new ExtractionConfig();
var results = KreuzbergLib.BatchExtractBytesSync(documents, defaultConfig);
Console.WriteLine($"Processed {results.Count} documents");
```

View File

@@ -0,0 +1,11 @@
```csharp title="C#"
using Kreuzberg;
// Extract several files in one batch call using a default configuration.
string[] files = { "doc1.pdf", "doc2.docx", "doc3.pptx" };
var defaultConfig = new ExtractionConfig();
foreach (var extraction in KreuzbergLib.BatchExtractFilesSync(files, defaultConfig))
{
    Console.WriteLine($"Content length: {extraction.Content.Length}");
}
```

View File

@@ -0,0 +1,102 @@
```csharp title="simple_benchmark.cs"
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Running;
using Kreuzberg;
using System;
using System.Diagnostics;
using System.Threading.Tasks;
/// <summary>
/// BenchmarkDotNet suite comparing synchronous, asynchronous, OCR-forced and
/// cache-enabled extraction of the same document.
/// </summary>
[MemoryDiagnoser]
[SimpleJob(warmupCount: 3, targetCount: 5)]
public class KreuzbergBenchmark
{
    private string _testFilePath;
    private ExtractionConfig _config;

    /// <summary>Sets the input path and the shared, cache-disabled configuration.</summary>
    [GlobalSetup]
    public void Setup()
    {
        _config = new ExtractionConfig
        {
            UseCache = false,
            EnableQualityProcessing = true,
        };
        _testFilePath = "document.pdf";
    }

    /// <summary>Synchronous extraction with the shared configuration.</summary>
    [Benchmark]
    public void ExtractFileSync()
    {
        // Read the length so the extracted content is actually touched.
        _ = KreuzbergLib.ExtractFileSync(_testFilePath, _config).Content.Length;
    }

    /// <summary>Asynchronous extraction with the shared configuration.</summary>
    [Benchmark]
    public async Task ExtractFileAsync()
    {
        var extraction = await KreuzbergLib.ExtractFileAsync(_testFilePath, _config);
        _ = extraction.Content.Length;
    }

    /// <summary>Asynchronous extraction with OCR forced through the tesseract backend.</summary>
    [Benchmark]
    public async Task ExtractWithOcr()
    {
        var tesseract = new OcrConfig
        {
            Backend = "tesseract",
            Language = "eng",
        };
        var forcedOcr = new ExtractionConfig
        {
            ForceOcr = true,
            Ocr = tesseract
        };
        var extraction = await KreuzbergLib.ExtractFileAsync(_testFilePath, forcedOcr);
        _ = extraction.Content.Length;
    }

    /// <summary>Asynchronous extraction with caching switched on.</summary>
    [Benchmark]
    public async Task ExtractWithCache()
    {
        var cached = new ExtractionConfig
        {
            UseCache = true,
            EnableQualityProcessing = true,
        };
        var extraction = await KreuzbergLib.ExtractFileAsync(_testFilePath, cached);
        _ = extraction.Content.Length;
    }
}
/// <summary>
/// Stopwatch-based timing of sequential and parallel extraction, followed by
/// the BenchmarkDotNet run of <see cref="KreuzbergBenchmark"/>.
/// </summary>
public class ManualBenchmark
{
    /// <summary>Number of extractions per timed section; also the divisor for the average.</summary>
    private const int Runs = 10;

    public static async Task Main(string[] args)
    {
        var filePath = "document.pdf";
        var config = new ExtractionConfig();

        // One untimed extraction before measuring.
        await KreuzbergLib.ExtractFileAsync(filePath, config);

        var sw = Stopwatch.StartNew();
        for (int i = 0; i < Runs; i++)
        {
            KreuzbergLib.ExtractFileSync(filePath, config);
        }
        sw.Stop();
        Console.WriteLine($"Sync extraction ({Runs} runs): {sw.ElapsedMilliseconds}ms avg {sw.ElapsedMilliseconds / (float)Runs}ms");

        sw.Restart();
        var tasks = new Task[Runs];
        for (int i = 0; i < Runs; i++)
        {
            tasks[i] = KreuzbergLib.ExtractFileAsync(filePath, config);
        }
        await Task.WhenAll(tasks);
        sw.Stop();
        Console.WriteLine($"Async extraction ({Runs} parallel runs): {sw.ElapsedMilliseconds}ms");

        // The returned summary is not inspected; BenchmarkDotNet prints its own report.
        BenchmarkRunner.Run<KreuzbergBenchmark>();
    }
}
```

View File

@@ -0,0 +1,42 @@
```csharp title="disk_cache.cs"
using Kreuzberg;
using System;
using System.IO;
using System.Threading.Tasks;
// Extract the same document twice with the disk cache enabled, report cache
// statistics, then clear the cache.
var config = new ExtractionConfig
{
    UseCache = true,
    CacheConfig = new CacheConfig
    {
        CachePath = Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.LocalApplicationData), "kreuzberg_cache"),
        MaxCacheSize = 1024 * 1024 * 500,
        CacheTtlSeconds = 86400 * 7,
        EnableCompression = true
    }
};

Console.WriteLine("First extraction (will be cached)...");
var result1 = await KreuzbergLib.ExtractFileAsync("document.pdf", config);
Console.WriteLine($" - Content length: {result1.Content.Length}");
Console.WriteLine($" - Cached: {result1.Metadata.WasCached}");

Console.WriteLine("\nSecond extraction (from cache)...");
var result2 = await KreuzbergLib.ExtractFileAsync("document.pdf", config);
Console.WriteLine($" - Content length: {result2.Content.Length}");
Console.WriteLine($" - Cached: {result2.Metadata.WasCached}");
Console.WriteLine($"\nResults are identical: {result1.Content == result2.Content}");

// Read the statistics before clearing: queried after ClearAllCacheAsync they
// would presumably describe an already-emptied cache.
var cacheStats = await KreuzbergLib.GetCacheStatsAsync();
Console.WriteLine($"\nCache Statistics:");
Console.WriteLine($" - Total entries: {cacheStats.TotalEntries}");
Console.WriteLine($" - Cache size: {cacheStats.CacheSizeBytes / 1024 / 1024} MB");
Console.WriteLine($" - Hit rate: {cacheStats.HitRate:P}");

await KreuzbergLib.ClearCacheAsync("document.pdf");
Console.WriteLine("\nCache cleared for document.pdf");
await KreuzbergLib.ClearAllCacheAsync();
Console.WriteLine("All cache cleared");
```

View File

@@ -0,0 +1,10 @@
```csharp title="C#"
using Kreuzberg;
// Call each Clear* registry method in turn (post-processors, validators,
// OCR backends, document extractors), then confirm on the console.
KreuzbergLib.ClearPostProcessors();
KreuzbergLib.ClearValidators();
KreuzbergLib.ClearOcrBackends();
KreuzbergLib.ClearDocumentExtractors();
Console.WriteLine("All plugins cleared");
```

View File

@@ -0,0 +1,46 @@
```csharp title="basic_cli.cs"
using System;
using System.CommandLine;
using System.CommandLine.Invocation;
using System.Threading.Tasks;
using Kreuzberg;
// Command-line front end: `extract-file <path> [-f|--format text|json]`.
var pathArgument = new Argument<string>("path", "Path to the document file");
var formatOption = new Option<string>(
    new[] { "-f", "--format" },
    getDefaultValue: () => "text",
    "Output format (text, json)"
);

var extractFileCommand = new Command("extract-file", "Extract text from a document file");
extractFileCommand.AddArgument(pathArgument);
extractFileCommand.AddOption(formatOption);
extractFileCommand.SetHandler(async (path, format) =>
{
    try
    {
        var extraction = await KreuzbergLib.ExtractFileAsync(path);

        // "json" serializes the whole result; any other value prints the content only.
        var output = format == "json"
            ? System.Text.Json.JsonSerializer.Serialize(extraction)
            : extraction.Content;
        Console.WriteLine(output);
    }
    catch (Exception failure)
    {
        Console.Error.WriteLine($"Error: {failure.Message}");
        Environment.Exit(1);
    }
}, pathArgument, formatOption);

var rootCommand = new RootCommand("Kreuzberg document extraction CLI");
rootCommand.AddCommand(extractFileCommand);
return await rootCommand.InvokeAsync(args);
```

View File

@@ -0,0 +1,75 @@
```csharp title="cli_with_config.cs"
using System;
using System.CommandLine;
using System.Text.Json;
using System.Threading.Tasks;
using Kreuzberg;
// Command-line front end: `extract <path> [-c file] [--force-ocr] [--use-cache]`.
var rootCommand = new RootCommand("Kreuzberg with configuration");
var extractCommand = new Command("extract", "Extract with custom configuration");
var filePath = new Argument<string>("path", "Document file path");
var configPath = new Option<string>(
    new[] { "-c", "--config" },
    "Path to JSON configuration file"
);
var forceOcr = new Option<bool>(
    new[] { "--force-ocr" },
    "Force OCR processing"
);
var useCache = new Option<bool>(
    new[] { "--use-cache" },
    getDefaultValue: () => true,
    "Use caching (default: true)"
);
extractCommand.AddArgument(filePath);
extractCommand.AddOption(configPath);
extractCommand.AddOption(forceOcr);
extractCommand.AddOption(useCache);
extractCommand.SetHandler(async (path, config, ocr, cache) =>
{
    try
    {
        // A configuration file, when given, replaces the --force-ocr / --use-cache flags.
        ExtractionConfig extractionConfig;
        if (!string.IsNullOrEmpty(config))
        {
            var json = await System.IO.File.ReadAllTextAsync(config);

            // Deserialize returns null for a JSON document that is the literal
            // `null`; fail with a clear message instead of a NullReferenceException below.
            extractionConfig = JsonSerializer.Deserialize<ExtractionConfig>(json)
                ?? throw new InvalidOperationException(
                    $"Configuration file '{config}' does not contain a configuration object.");
        }
        else
        {
            extractionConfig = new ExtractionConfig
            {
                UseCache = cache,
                ForceOcr = ocr,
            };
        }

        Console.WriteLine("Extracting with configuration:");
        Console.WriteLine($" - File: {path}");
        Console.WriteLine($" - Force OCR: {extractionConfig.ForceOcr}");
        Console.WriteLine($" - Use Cache: {extractionConfig.UseCache}");

        var result = await KreuzbergLib.ExtractFileAsync(path, extractionConfig);

        // string.Join throws on a null sequence; print an empty list instead.
        var languages = result.DetectedLanguages is null
            ? string.Empty
            : string.Join(", ", result.DetectedLanguages);

        Console.WriteLine($"\nExtraction complete:");
        Console.WriteLine($" - Content length: {result.Content.Length}");
        Console.WriteLine($" - Format: {result.Metadata.FormatType}");
        Console.WriteLine($" - Languages: {languages}");
        Console.WriteLine($"\n{result.Content}");
    }
    catch (Exception ex)
    {
        Console.Error.WriteLine($"Error: {ex.Message}");
        Environment.Exit(1);
    }
}, filePath, configPath, forceOcr, useCache);
rootCommand.AddCommand(extractCommand);
return await rootCommand.InvokeAsync(args);
```

View File

@@ -0,0 +1,68 @@
```csharp title="C#"
using System.Net.Http.Json;
using System.Text.Json;
using System.Text.Json.Serialization;
// Request models
/// <summary>
/// JSON body posted to the <c>/chunk</c> endpoint: the text to split, an
/// optional chunker type and optional chunking settings.
/// </summary>
public record ChunkRequest(
[property: JsonPropertyName("text")] string Text,
[property: JsonPropertyName("chunker_type")] string? ChunkerType = null,
[property: JsonPropertyName("config")] ChunkConfig? Config = null
);
/// <summary>Optional chunking settings nested under the request's <c>config</c> key.</summary>
public record ChunkConfig(
[property: JsonPropertyName("max_characters")] int? MaxCharacters = null,
[property: JsonPropertyName("overlap")] int? Overlap = null,
[property: JsonPropertyName("trim")] bool? Trim = null
);
// Response models
/// <summary>JSON body read back from the <c>/chunk</c> endpoint.</summary>
public record ChunkResponse(
[property: JsonPropertyName("chunks")] List<ChunkItem> Chunks,
[property: JsonPropertyName("chunk_count")] int ChunkCount,
[property: JsonPropertyName("input_size_bytes")] int InputSizeBytes,
[property: JsonPropertyName("chunker_type")] string ChunkerType
);
/// <summary>
/// A single chunk inside a <see cref="ChunkResponse"/>, with its byte range,
/// position and optional page range.
/// </summary>
public record ChunkItem(
[property: JsonPropertyName("content")] string Content,
[property: JsonPropertyName("byte_start")] int ByteStart,
[property: JsonPropertyName("byte_end")] int ByteEnd,
[property: JsonPropertyName("chunk_index")] int ChunkIndex,
[property: JsonPropertyName("total_chunks")] int TotalChunks,
[property: JsonPropertyName("first_page")] int? FirstPage,
[property: JsonPropertyName("last_page")] int? LastPage
);
/// <summary>
/// Posts a text with nested chunking settings to the local <c>/chunk</c>
/// endpoint and prints a preview of every returned chunk.
/// </summary>
class Program
{
    static async Task Main()
    {
        using var client = new HttpClient();
        var request = new ChunkRequest(
            Text: "Your long text content here...",
            ChunkerType: "text",
            Config: new ChunkConfig(
                MaxCharacters: 1000,
                Overlap: 50,
                Trim: true
            )
        );

        using var response = await client.PostAsJsonAsync(
            "http://localhost:8000/chunk",
            request
        );

        // An error response does not contain a ChunkResponse payload, so
        // report the status instead of trying to deserialize it.
        if (!response.IsSuccessStatusCode)
        {
            Console.WriteLine($"Error: {response.StatusCode}");
            return;
        }

        var result = await response.Content.ReadFromJsonAsync<ChunkResponse>();
        Console.WriteLine($"Created {result?.ChunkCount} chunks");
        foreach (var chunk in result?.Chunks ?? [])
        {
            var preview = chunk.Content[..Math.Min(50, chunk.Content.Length)];
            Console.WriteLine($"Chunk {chunk.ChunkIndex}: {preview}...");
        }
    }
}
```

View File

@@ -0,0 +1,20 @@
```csharp title="C#"
using System;
using System.IO;
using System.Net.Http;
// Upload a document to the local /extract endpoint and print the JSON reply.
// Using declarations release the client, file handle, form and response at
// the end of the script; previously the client and response were never disposed.
using var client = new HttpClient();
using var fileStream = File.OpenRead("document.pdf");
using var content = new MultipartFormDataContent();
content.Add(new StreamContent(fileStream), "files", "document.pdf");

using var response = await client.PostAsync("http://localhost:8000/extract", content);
if (response.IsSuccessStatusCode)
{
    var json = await response.Content.ReadAsStringAsync();
    Console.WriteLine(json);
}
else
{
    // Make a failed upload visible instead of printing the error body as if it were a result.
    Console.WriteLine($"Error: {response.StatusCode}");
}
```

View File

@@ -0,0 +1,56 @@
```csharp title="C#"
using Kreuzberg;
using System.Net.Http;
using System.Text.Json;
/// <summary>
/// Sample OCR backend that forwards images to a remote HTTP OCR service and
/// returns the recognized text.
/// </summary>
public class CloudOcrBackend : IOcrBackend
{
    private const string DefaultLanguage = "eng";

    // One client shared by all calls: creating a client per image can exhaust
    // sockets under load.
    private static readonly HttpClient s_http = new();

    private readonly string _apiKey;
    private readonly List<string> _langs = new() { "eng", "deu", "fra" };

    /// <param name="apiKey">Credential sent with every OCR request.</param>
    public CloudOcrBackend(string apiKey)
    {
        _apiKey = apiKey ?? throw new ArgumentNullException(nameof(apiKey));
    }

    public string Name() => "cloud-ocr";

    public string Version() => "1.0.0";

    public List<string> SupportedLanguages() => _langs;

    /// <summary>
    /// Uploads <paramref name="imageBytes"/> to the OCR service and returns its text.
    /// </summary>
    /// <param name="imageBytes">Raw image data to recognize.</param>
    /// <param name="config">Optional settings; the <c>language</c> key selects the OCR language (default "eng").</param>
    /// <returns>A dictionary with <c>content</c> (recognized text) and <c>mime_type</c> ("text/plain").</returns>
    /// <exception cref="HttpRequestException">The service could not be reached or returned an error status.</exception>
    public Dictionary<string, object> ProcessImage(byte[] imageBytes, Dictionary<string, object> config)
    {
        if (imageBytes is null)
        {
            throw new ArgumentNullException(nameof(imageBytes));
        }

        // Single lookup; a missing config, missing key or null value falls back to the default.
        var lang = config is not null && config.TryGetValue("language", out var requested)
            ? requested?.ToString() ?? DefaultLanguage
            : DefaultLanguage;

        var form = new MultipartFormDataContent();
        form.Add(new ByteArrayContent(imageBytes), "image");
        form.Add(new StringContent(lang), "language");

        // Disposing the request also disposes the form assigned as its content.
        using var request = new HttpRequestMessage(HttpMethod.Post, "https://api.example.com/ocr")
        {
            Content = form
        };

        // The key was previously stored but never sent.
        // NOTE(review): bearer scheme is assumed — confirm against the OCR provider's API.
        request.Headers.Authorization =
            new System.Net.Http.Headers.AuthenticationHeaderValue("Bearer", _apiKey);

        // The interface is synchronous, so block here. GetAwaiter().GetResult()
        // surfaces the original exception rather than an AggregateException.
        using var response = s_http.SendAsync(request).GetAwaiter().GetResult();
        response.EnsureSuccessStatusCode();
        var json = response.Content.ReadAsStringAsync().GetAwaiter().GetResult();

        using var doc = JsonDocument.Parse(json);
        var text = doc.RootElement.GetProperty("text").GetString() ?? string.Empty;

        return new Dictionary<string, object>
        {
            { "content", text },
            { "mime_type", "text/plain" }
        };
    }

    public void Initialize() { }

    public void Shutdown() { }
}
/// <summary>Registers the cloud OCR backend with Kreuzberg.</summary>
class Program
{
    static void Main() =>
        KreuzbergLib.RegisterOcrBackend(new CloudOcrBackend(apiKey: "your-api-key"));
}
```

View File

@@ -0,0 +1,28 @@
```csharp title="C#"
using Kreuzberg;
// OCR, PDF and chunking settings are built separately and assembled below.
var ocr = new OcrConfig
{
    Backend = "tesseract",
    Language = "eng+fra",
    TesseractConfig = new TesseractConfig { Psm = 3 }
};
var pdf = new PdfConfig { ExtractImages = true };
var chunking = new ChunkingConfig
{
    MaxChars = 1000,
    MaxOverlap = 200,
    Embedding = new EmbeddingConfig
    {
        Model = EmbeddingModelType.Preset("all-MiniLM-L6-v2")
    }
};

var config = new ExtractionConfig
{
    UseCache = true,
    EnableQualityProcessing = true,
    Ocr = ocr,
    PdfOptions = pdf,
    Chunking = chunking
};

var extraction = await KreuzbergLib.ExtractFileAsync("document.pdf", config);

// Print at most the first 100 characters of the content.
var previewLength = Math.Min(100, extraction.Content.Length);
Console.WriteLine($"Content: {extraction.Content[..previewLength]}");
```

View File

@@ -0,0 +1,48 @@
```csharp title="Element-Based Output (C#)"
using Kreuzberg;
// Configure element-based output
var config = new ExtractionConfig
{
    OutputFormat = OutputFormat.ElementBased
};

// Extract document. The static entry point is KreuzbergLib; "Kreuzberg" alone
// is the namespace.
var result = KreuzbergLib.ExtractFileSync("document.pdf", config);

// Skip element processing when no element list was returned, instead of
// failing with a NullReferenceException.
if (result.Elements is not null)
{
    // Access elements
    foreach (var element in result.Elements)
    {
        Console.WriteLine($"Type: {element.ElementType}");

        // Show at most the first 100 characters of the element text.
        var text = element.Text.Length > 100
            ? element.Text.Substring(0, 100)
            : element.Text;
        Console.WriteLine($"Text: {text}");

        if (element.Metadata.PageNumber.HasValue)
        {
            Console.WriteLine($"Page: {element.Metadata.PageNumber}");
        }
        if (element.Metadata.Coordinates != null)
        {
            var coords = element.Metadata.Coordinates;
            Console.WriteLine($"Coords: ({coords.Left}, {coords.Top}) - ({coords.Right}, {coords.Bottom})");
        }
        Console.WriteLine("---");
    }

    // Filter by element type
    var titles = result.Elements
        .Where(e => e.ElementType == "title");
    foreach (var title in titles)
    {
        // A present-but-null "level" entry is reported as "unknown" too.
        var level = title.Metadata.Additional.TryGetValue("level", out var levelValue)
            ? levelValue?.ToString() ?? "unknown"
            : "unknown";
        Console.WriteLine($"[{level}] {title.Text}");
    }
}
```

View File

@@ -0,0 +1,41 @@
```csharp title="C#"
using Kreuzberg;
// Configuration combining OCR, chunking, language detection, token reduction
// and keyword extraction.
var config = new ExtractionConfig
{
    UseCache = true,
    EnableQualityProcessing = true,
    Ocr = new OcrConfig
    {
        Backend = "tesseract",
        Language = "eng+deu"
    },
    Chunking = new ChunkingConfig
    {
        MaxCharacters = 1000,
        Overlap = 200
    },
    LanguageDetection = new LanguageDetectionConfig
    {
        Enabled = true,
        DetectMultiple = true
    },
    // NOTE(review): renamed from TokenReductionOptions to match the type used
    // by the other token-reduction samples — confirm against the library.
    TokenReduction = new TokenReductionConfig
    {
        Mode = "moderate"
    },
    Keywords = new KeywordConfig
    {
        MaxKeywords = 10,
        MinScore = 0.1f
    }
};
var result = await KreuzbergLib.ExtractFile("document.pdf", null, config);
Console.WriteLine(result.Content);
if (result.DetectedLanguages?.Count > 0)
{
    Console.WriteLine($"Languages: {string.Join(", ", result.DetectedLanguages)}");
}
```

View File

@@ -0,0 +1,9 @@
using Kreuzberg;
// Synchronous extraction with caching and quality processing switched on.
ExtractionConfig config = new() { UseCache = true, EnableQualityProcessing = true };
var result = KreuzbergLib.ExtractFileSync("document.pdf", config);

View File

@@ -0,0 +1,47 @@
```csharp title="C#"
using Kreuzberg;
// Plain-text chunking: the settings are built first, then attached to the config.
var chunking = new ChunkingConfig
{
    MaxCharacters = 1000,
    Overlap = 200,
    ChunkerType = ChunkerType.Text
};
var config = new ExtractionConfig { Chunking = chunking };

var extraction = await KreuzbergLib.ExtractFile("document.pdf", null, config);
if (extraction.Chunks is { } chunks)
{
    Console.WriteLine($"Total chunks: {chunks.Count}");
    foreach (var piece in chunks)
    {
        Console.WriteLine($"Chunk length: {piece.Content.Length}");
    }
}
```
```csharp title="C# - Markdown with Heading Context"
using Kreuzberg;
// Markdown chunking with PrependHeadingContext enabled.
var chunking = new ChunkingConfig
{
    MaxCharacters = 500,
    Overlap = 50,
    ChunkerType = ChunkerType.Markdown,
    PrependHeadingContext = true
};
var config = new ExtractionConfig { Chunking = chunking };

var extraction = await KreuzbergLib.ExtractFile("document.md", null, config);
if (extraction.Chunks is { } chunks)
{
    foreach (var piece in chunks)
    {
        // Print at most the first 100 characters of each chunk.
        var previewLength = Math.Min(100, piece.Content.Length);
        Console.WriteLine($"Content: {piece.Content[..previewLength]}");
    }
}
```

View File

@@ -0,0 +1,12 @@
```csharp title="C#"
using Kreuzberg;
// Asynchronous extraction with caching and quality processing switched on.
ExtractionConfig config = new() { UseCache = true, EnableQualityProcessing = true };
var extraction = await KreuzbergLib.ExtractFile("document.pdf", null, config);
Console.WriteLine(extraction.Content);
```

View File

@@ -0,0 +1,8 @@
```csharp title="C#"
using Kreuzberg;
// Use the configuration returned by Discover(), or a default one when it returns null.
var discovered = ExtractionConfig.Discover();
var config = discovered ?? new ExtractionConfig();
var extraction = await KreuzbergLib.ExtractFile("document.pdf", null, config);
Console.WriteLine(extraction.Content);
```

View File

@@ -0,0 +1,19 @@
```csharp title="C#"
using Kreuzberg;
// OCR a scanned PDF with the tesseract backend and report what was found.
var ocr = new OcrConfig { Backend = "tesseract", Language = "eng" };
var config = new ExtractionConfig { Ocr = ocr };

var extraction = await KreuzbergLib.ExtractFile("scanned.pdf", null, config);
Console.WriteLine($"Content length: {extraction.Content.Length}");
if (extraction.Tables is { } tables)
{
    Console.WriteLine($"Tables detected: {tables.Count}");
}
```

View File

@@ -0,0 +1,26 @@
```csharp title="C#"
using Kreuzberg;
// OCR and chunking settings are built separately and assembled below.
var ocr = new OcrConfig
{
    Backend = "tesseract",
    Language = "eng+deu",
    TesseractConfig = new TesseractConfig { Psm = 6 }
};
var chunking = new ChunkingConfig { MaxCharacters = 1000, Overlap = 200 };

var config = new ExtractionConfig
{
    UseCache = true,
    EnableQualityProcessing = true,
    Ocr = ocr,
    Chunking = chunking
};

var extraction = await KreuzbergLib.ExtractFile("document.pdf", null, config);
Console.WriteLine($"Content length: {extraction.Content.Length}");
```

View File

@@ -0,0 +1,14 @@
using Kreuzberg;
// Extract an in-memory PDF synchronously and read back the MIME type.
var config = new ExtractionConfig
{
    UseCache = true,
    EnableQualityProcessing = true
};

// The snippet previously used fileBytes without defining it; load the bytes
// here so it compiles on its own.
var fileBytes = System.IO.File.ReadAllBytes("document.pdf");

var result = KreuzbergLib.ExtractBytesSync(
    new BytesWithMime(fileBytes, "application/pdf"),
    config
);
var mimeType = result.MimeType;

View File

@@ -0,0 +1,8 @@
using Kreuzberg;
// Synchronous extraction with caching turned off.
ExtractionConfig config = new() { UseCache = false };
var result = KreuzbergLib.ExtractFileSync("document.pdf", config);

View File

@@ -0,0 +1,18 @@
```csharp title="Document Structure Config (C#)"
using Kreuzberg;
// Request the document structure and print the node type of every node.
ExtractionConfig config = new() { IncludeDocumentStructure = true };
var extraction = KreuzbergLib.ExtractFileSync("document.pdf", config);

if (extraction.Document is { } document)
{
    foreach (var node in document.Nodes)
    {
        Console.WriteLine($"[{node.Content.NodeType}]");
    }
}
```

View File

@@ -0,0 +1,37 @@
```csharp title="C#"
using Kreuzberg;
// Element-based result format: print each element, then count the titles.
ExtractionConfig config = new() { ResultFormat = ResultFormat.ElementBased };
var extraction = await KreuzbergLib.ExtractFile("document.pdf", null, config);

if (extraction.Elements is { } elements)
{
    foreach (var element in elements)
    {
        // Show at most the first 100 characters of the element text.
        var previewLength = Math.Min(100, element.Text.Length);

        Console.WriteLine($"Type: {element.ElementType}");
        Console.WriteLine($"Text: {element.Text[..previewLength]}");

        if (element.Metadata.PageNumber is { } pageNumber)
        {
            Console.WriteLine($"Page: {pageNumber}");
        }
        if (element.Metadata.Coordinates is { } coords)
        {
            Console.WriteLine($"Coords: ({coords.X0}, {coords.Y0})");
        }
        Console.WriteLine("---");
    }

    var titleCount = elements.Count(e => e.ElementType == ElementType.Title);
    Console.WriteLine($"Found {titleCount} titles");
}
```

View File

@@ -0,0 +1,106 @@
using Kreuzberg.Config;
/// <summary>
/// Shows the different ways of building an <c>EmbeddingConfig</c> and how to
/// attach one to a chunking configuration. Nothing is extracted here.
/// </summary>
public class EmbeddingConfigExample
{
    public static void Main()
    {
        // Example 1: preset model (recommended).
        // Presets are ready-made configurations for common use cases:
        // - "fast" (384 dims): quick prototyping, development, resource-constrained
        // - "balanced" (768 dims): production, general-purpose RAG, English documents
        // - "quality" (1024 dims): complex documents, maximum accuracy
        // - "multilingual" (768 dims): international documents, 100+ languages
        var presetEmbedding = new EmbeddingConfig
        {
            Model = new EmbeddingModelType.Preset { Name = "balanced" },
            BatchSize = 32,
            Normalize = true,
            ShowDownloadProgress = true,
            CacheDir = "~/.cache/kreuzberg/embeddings"
        };

        // Example 2: custom ONNX model (requires the embeddings feature).
        // Selects a specific HuggingFace ONNX model with explicit dimensions.
        // Popular ONNX-compatible models:
        // - "BAAI/bge-small-en-v1.5" (384 dims): fast, efficient
        // - "BAAI/bge-base-en-v1.5" (768 dims): balanced quality/speed
        // - "BAAI/bge-large-en-v1.5" (1024 dims): high quality, slower
        // - "sentence-transformers/paraphrase-multilingual-mpnet-base-v2" (768 dims): multilingual support
        var smallCustomEmbedding = new EmbeddingConfig
        {
            Model = new EmbeddingModelType.Custom
            {
                ModelId = "BAAI/bge-small-en-v1.5",
                Dimensions = 384
            },
            BatchSize = 32,
            Normalize = true,
            ShowDownloadProgress = true,
            CacheDir = null // Uses default: .kreuzberg/embeddings/
        };

        // Example 3: alternative custom ONNX model, for advanced users who
        // want a different embedding model.
        var mpnetEmbedding = new EmbeddingConfig
        {
            Model = new EmbeddingModelType.Custom
            {
                ModelId = "sentence-transformers/all-mpnet-base-v2",
                Dimensions = 768
            },
            BatchSize = 16, // Larger model requires smaller batch size
            Normalize = true,
            ShowDownloadProgress = true,
            CacheDir = "/var/cache/embeddings"
        };

        // Integration with ChunkingConfig: embeddings are attached to the
        // chunking settings, which in turn go on the extraction config.
        var chunkEmbedding = new EmbeddingConfig
        {
            Model = new EmbeddingModelType.Preset { Name = "balanced" },
            BatchSize = 32,
            Normalize = true
        };
        var chunkingConfig = new ChunkingConfig
        {
            MaxChars = 1024,
            MaxOverlap = 100,
            Preset = "balanced",
            Embedding = chunkEmbedding
        };
        var extractionConfig = new ExtractionConfig
        {
            Chunking = chunkingConfig
        };
    }
}
// Key parameter explanations:
//
// BatchSize: Number of texts to embed at once (32-128 typical)
// - Larger batches are faster but use more memory
// - Smaller batches for resource-constrained environments
//
// Normalize: Whether to normalize vectors (L2 norm)
// - true (recommended): Enables cosine similarity in vector DBs
// - false: Raw embedding values
//
// CacheDir: Where to store downloaded models
// - null: Uses .kreuzberg/embeddings/ in current directory
// - String path: Custom directory for model storage
//
// ShowDownloadProgress: Display download progress bar
// - Useful for monitoring large model downloads

View File

@@ -0,0 +1,25 @@
```csharp title="C#"
using Kreuzberg;
// Build the configuration inside-out: embedding settings -> chunking -> extraction.
var embedding = new EmbeddingConfig
{
    Normalize = true,
    BatchSize = 16,
    ShowDownloadProgress = true,
    CacheDir = null
};
var chunking = new ChunkingConfig
{
    MaxCharacters = 1000,
    Overlap = 200,
    Embedding = embedding
};
var config = new ExtractionConfig { Chunking = chunking };

var result = await KreuzbergLib.ExtractFile("document.pdf", null, config);

// Chunks may be absent, so only report a count when they are present.
if (result.Chunks is { } chunks)
{
    Console.WriteLine($"Chunks with embeddings: {chunks.Count}");
}
```

View File

@@ -0,0 +1,8 @@
using Kreuzberg;
// Turn on the extraction cache and run a synchronous extraction.
ExtractionConfig config = new() { UseCache = true };
var result = KreuzbergLib.ExtractFileSync("document.pdf", config);

View File

@@ -0,0 +1,60 @@
using Kreuzberg;
// Each section of the configuration is built as its own local and then
// assembled into a single ExtractionConfig at the end.

// OCR: Tesseract engine options, including image preprocessing.
var preprocessing = new ImagePreprocessingConfig
{
    TargetDpi = 300,
    Denoise = true,
    Deskew = true,
    ContrastEnhance = true
};
var tesseract = new TesseractConfig
{
    Psm = 3,
    Oem = 3,
    MinConfidence = 0.8,
    Preprocessing = preprocessing,
    EnableTableDetection = true
};
var ocr = new OcrConfig
{
    Backend = "tesseract",
    Language = "eng+fra",
    TesseractConfig = tesseract
};

// PDF-specific options and image extraction.
var pdfOptions = new PdfConfig
{
    ExtractImages = true,
    ExtractMetadata = true
};
var images = new ImageExtractionConfig
{
    ExtractImages = true,
    TargetDpi = 150,
    MaxImageDimension = 4096
};

// Text post-processing: chunking, token reduction, language detection.
var chunking = new ChunkingConfig
{
    MaxChars = 1000,
    MaxOverlap = 200
};
var tokenReduction = new TokenReductionConfig
{
    Mode = "moderate",
    PreserveImportantWords = true
};
var languageDetection = new LanguageDetectionConfig
{
    Enabled = true,
    MinConfidence = 0.8,
    DetectMultiple = false
};
var postprocessor = new PostProcessorConfig { Enabled = true };

var config = new ExtractionConfig
{
    UseCache = true,
    EnableQualityProcessing = true,
    ForceOcr = false,
    Ocr = ocr,
    PdfOptions = pdfOptions,
    Images = images,
    Chunking = chunking,
    TokenReduction = tokenReduction,
    LanguageDetection = languageDetection,
    Postprocessor = postprocessor
};

var result = KreuzbergLib.ExtractFileSync("document.pdf", config);

View File

@@ -0,0 +1,99 @@
using Kreuzberg.Config;
using Kreuzberg;
/// <summary>
/// Builds three <c>HierarchyConfig</c> variants and wraps each one in a
/// <c>PdfConfig</c> and an <c>ExtractionConfig</c>. The configurations are only
/// constructed here; the extraction call is shown as a comment.
/// </summary>
public class HierarchyConfigExample
{
    public static void Main()
    {
        // Example 1: Basic hierarchy extraction
        // Enabled with KClusters=6 for a standard H1-H6 heading hierarchy.
        // Bounding boxes are included for spatial layout awareness.
        var hierarchyConfigBasic = new HierarchyConfig
        {
            Enabled = true,
            KClusters = 6,               // creates 6 font size clusters (H1-H6 structure)
            IncludeBbox = true,          // include bounding box coordinates
            OcrCoverageThreshold = null  // no OCR coverage threshold
        };
        var pdfConfigBasic = new PdfConfig
        {
            Hierarchy = hierarchyConfigBasic
        };
        var extractionConfigBasic = new ExtractionConfig
        {
            PdfOptions = pdfConfigBasic
        };

        // `Kreuzberg` is a namespace, so it cannot be instantiated; extraction goes
        // through the static KreuzbergLib entry point with the config passed as an argument:
        // var result = KreuzbergLib.ExtractFileSync("document.pdf", extractionConfigBasic);

        // Example 2: Custom KClusters for minimal structure
        // Three clusters give a simpler hierarchy, useful when only major
        // section divisions (Main, Subsection, Detail) are needed.
        var hierarchyConfigMinimal = new HierarchyConfig
        {
            Enabled = true,
            KClusters = 3,               // minimal clustering: just 3 levels
            IncludeBbox = true,
            OcrCoverageThreshold = null
        };
        var pdfConfigMinimal = new PdfConfig
        {
            Hierarchy = hierarchyConfigMinimal
        };
        var extractionConfigMinimal = new ExtractionConfig
        {
            PdfOptions = pdfConfigMinimal
        };

        // Example 3: With OCR coverage threshold
        // Trigger OCR if less than 50% of text has font data.
        // Useful for documents with mixed digital and scanned content.
        var hierarchyConfigOcr = new HierarchyConfig
        {
            Enabled = true,
            KClusters = 6,
            IncludeBbox = true,
            OcrCoverageThreshold = 0.5f  // trigger OCR if text coverage < 50%
        };
        var pdfConfigOcr = new PdfConfig
        {
            Hierarchy = hierarchyConfigOcr
        };
        var extractionConfigOcr = new ExtractionConfig
        {
            PdfOptions = pdfConfigOcr
        };
    }
}
// Field descriptions:
//
// Enabled: bool (default: true)
// - Enable or disable hierarchy extraction
// - When false, hierarchy structure is not analyzed
//
// KClusters: int (default: 6, valid: 1-7)
// - Number of font size clusters for hierarchy levels
// - 6 provides H1-H6 heading levels with body text
// - Higher values create more fine-grained hierarchy
// - Lower values create simpler structure
//
// IncludeBbox: bool (default: true)
// - Include bounding box coordinates in hierarchy blocks
// - Required for spatial layout awareness and document structure
// - Set to false only if space optimization is critical
//
// OcrCoverageThreshold: float? (default: null)
// - Range: 0.0 to 1.0
// - Triggers OCR when text block coverage falls below this fraction
// - Example: 0.5f means "run OCR if less than 50% of page has text data"
// - null means no OCR coverage-based triggering

View File

@@ -0,0 +1,17 @@
```csharp title="C#"
using Kreuzberg;
// Request HTML output and describe how the generated HTML should be styled.
var htmlOutput = new HtmlOutputConfig
{
    Theme = HtmlTheme.GitHub,
    EmbedCss = true,
    ClassPrefix = "kb-"
};
var config = new ExtractionConfig
{
    OutputFormat = OutputFormat.Html,
    HtmlOutput = htmlOutput
};

var extraction = await KreuzbergLib.ExtractFile("document.pdf", null, config);
Console.WriteLine(extraction.Content);
```

View File

@@ -0,0 +1,19 @@
using Kreuzberg;
// Extract with caching and English Tesseract OCR, then read two metadata fields.
var ocr = new OcrConfig
{
    Backend = "tesseract",
    Language = "eng"
};
var config = new ExtractionConfig
{
    UseCache = true,
    Ocr = ocr
};

var result = KreuzbergLib.ExtractFileSync("document.pdf", config);

// Metadata is only read when it is present.
if (result.Metadata is { } metadata)
{
    var language = metadata.Language;
    var format = metadata.FormatType;
}

View File

@@ -0,0 +1,66 @@
using Kreuzberg;
using Kreuzberg.Keywords;
// Example 1: basic YAKE
// YAKE algorithm with no algorithm-specific parameter overrides, language "en".
var basicKeywords = new KeywordConfig
{
    Algorithm = KeywordAlgorithm.Yake,
    MaxKeywords = 10,
    MinScore = 0.0f,
    NgramRange = (1, 3),
    Language = "en",
    YakeParams = null,
    RakeParams = null,
};
var basicYakeConfig = new ExtractionConfig { Keywords = basicKeywords };
var result = KreuzbergLib.ExtractFileSync("document.pdf", basicYakeConfig);
Console.WriteLine($"Keywords: {string.Join(", ", result.Keywords)}");

// Example 2: advanced YAKE
// Same algorithm, with an explicit window size, a narrower n-gram range and
// a non-zero minimum score.
var advancedKeywords = new KeywordConfig
{
    Algorithm = KeywordAlgorithm.Yake,
    MaxKeywords = 15,
    MinScore = 0.1f,
    NgramRange = (1, 2),
    Language = "en",
    YakeParams = new YakeParams { WindowSize = 1, },
    RakeParams = null,
};
var advancedYakeConfig = new ExtractionConfig { Keywords = advancedKeywords };
result = KreuzbergLib.ExtractFileSync("document.pdf", advancedYakeConfig);
Console.WriteLine($"Keywords: {string.Join(", ", result.Keywords)}");

// Example 3: RAKE
// RAKE algorithm with limits on word length and words per phrase.
var rakeKeywords = new KeywordConfig
{
    Algorithm = KeywordAlgorithm.Rake,
    MaxKeywords = 10,
    MinScore = 5.0f,
    NgramRange = (1, 3),
    Language = "en",
    YakeParams = null,
    RakeParams = new RakeParams
    {
        MinWordLength = 1,
        MaxWordsPerPhrase = 3,
    },
};
var rakeConfig = new ExtractionConfig { Keywords = rakeKeywords };
result = KreuzbergLib.ExtractFileSync("document.pdf", rakeConfig);
Console.WriteLine($"Keywords: {string.Join(", ", result.Keywords)}");

View File

@@ -0,0 +1,21 @@
```csharp title="C#"
using Kreuzberg;
// YAKE keyword extraction: up to 10 keywords, n-grams of 1 to 3 words.
var keywordSettings = new KeywordConfig
{
    Algorithm = KeywordAlgorithm.Yake,
    MaxKeywords = 10,
    MinScore = 0.1f,
    NgramRange = [1, 3],
    Language = "en"
};
var config = new ExtractionConfig { Keywords = keywordSettings };

var result = await KreuzbergLib.ExtractFile("document.pdf", null, config);

// Keywords may be absent, so print only when they are present.
if (result.Keywords is { } keywords)
{
    Console.WriteLine($"Keywords: {string.Join(", ", keywords)}");
}
```

View File

@@ -0,0 +1,20 @@
```csharp title="C#"
using Kreuzberg;
// Enable language detection and allow more than one language to be reported.
var detection = new LanguageDetectionConfig
{
    Enabled = true,
    MinConfidence = 0.8,
    DetectMultiple = true
};
var config = new ExtractionConfig { LanguageDetection = detection };

var result = await KreuzbergLib.ExtractFile("document.pdf", null, config);
Console.WriteLine($"Detected language: {result.Language}");

// The full list is optional; print it only when present.
if (result.DetectedLanguages is { } allLanguages)
{
    Console.WriteLine($"All detected: {string.Join(", ", allLanguages)}");
}
```

View File

@@ -0,0 +1,22 @@
```csharp title="C#"
using Kreuzberg;
// Image extraction with a 300 DPI target and automatic DPI adjustment
// bounded by MinDpi/MaxDpi.
var imageSettings = new ImageExtractionConfig
{
    ExtractImages = true,
    TargetDpi = 300,
    MaxImageDimension = 4096,
    AutoAdjustDpi = true,
    MinDpi = 150,
    MaxDpi = 600
};
var config = new ExtractionConfig { Images = imageSettings };

var result = await KreuzbergLib.ExtractFile("document.pdf", null, config);

// Images may be absent, so report a count only when they are present.
if (result.Images is { } images)
{
    Console.WriteLine($"Extracted images: {images.Count}");
}
```

View File

@@ -0,0 +1,12 @@
using Kreuzberg;
// OCR through the Tesseract backend with the language string "eng+fra".
var ocr = new OcrConfig
{
    Backend = "tesseract",
    Language = "eng+fra"
};
ExtractionConfig config = new() { Ocr = ocr };
var result = KreuzbergLib.ExtractFileSync("document.pdf", config);

View File

@@ -0,0 +1,17 @@
using Kreuzberg;
// Extract an HTML document and walk the links recorded in its text metadata.
ExtractionConfig config = new() { UseCache = true };
var result = KreuzbergLib.ExtractFileSync("document.html", config);

// Each link is indexed positionally: element 0 is read as the text, element 1 as the URL.
if (result.Metadata?.Format.Text?.Links is { } links)
{
    foreach (var entry in links)
    {
        var text = entry[0];
        var url = entry[1];
    }
}

View File

@@ -0,0 +1,18 @@
using Kreuzberg;
// Ask for PDF metadata, then read title, author and page count when available.
var pdfOptions = new PdfConfig { ExtractMetadata = true };
var config = new ExtractionConfig { PdfOptions = pdfOptions };

var result = KreuzbergLib.ExtractFileSync("document.pdf", config);

// PDF-specific metadata is only read when it is present.
if (result.Metadata?.Format.Pdf is { } pdf)
{
    var title = pdf.Title;
    var author = pdf.Author;
    var pageCount = pdf.PageCount;
}

View File

@@ -0,0 +1,21 @@
```csharp title="C#"
using Kreuzberg;
// Open a password-protected PDF by supplying candidate passwords.
var pdfOptions = new PdfConfig
{
    ExtractImages = true,
    ExtractMetadata = true,
    ExtractAnnotations = false,
    Passwords = new List<string> { "password123" }
};
var config = new ExtractionConfig { PdfOptions = pdfOptions };

var result = await KreuzbergLib.ExtractFile("encrypted.pdf", null, config);

if (result.Metadata is { } metadata)
{
    // Fall back to an empty list so the join never receives null.
    var authors = metadata.Authors ?? new List<string>();
    Console.WriteLine($"Title: {metadata.Title}");
    Console.WriteLine($"Authors: {string.Join(", ", authors)}");
}
```

View File

@@ -0,0 +1,74 @@
```csharp title="C#"
using Kreuzberg;
// Four hierarchy-detection configurations run back to back. Each extraction
// result gets its own variable: all of these are top-level statements in one
// scope, so reusing `var result` would not compile.

// Basic hierarchy configuration with properties
var config = new ExtractionConfig
{
    PdfOptions = new PdfConfig
    {
        ExtractImages = true,
        Hierarchy = new HierarchyConfig
        {
            Enabled = true,
            KClusters = 6,
            IncludeBbox = true,
            OcrCoverageThreshold = 0.8f
        }
    }
};
var basicResult = await KreuzbergLib.ExtractFileAsync("document.pdf", config);
Console.WriteLine($"Content length: {basicResult.Content.Length}");

// Advanced hierarchy detection with custom parameters
var advancedConfig = new ExtractionConfig
{
    PdfOptions = new PdfConfig
    {
        ExtractImages = true,
        Hierarchy = new HierarchyConfig
        {
            Enabled = true,
            // More clusters for a more detailed hierarchy.
            // NOTE(review): other Kreuzberg examples document the valid range as 1-7 — confirm 12 is accepted.
            KClusters = 12,
            IncludeBbox = true,          // include bounding box coordinates
            OcrCoverageThreshold = 0.7f  // lower than the 0.8f used in the basic example above
        }
    }
};
var advancedResult = await KreuzbergLib.ExtractFileAsync("complex_document.pdf", advancedConfig);
Console.WriteLine($"Advanced hierarchy detection completed: {advancedResult.Content.Length} chars");

// Minimal configuration with only enabled flag
var minimalConfig = new ExtractionConfig
{
    PdfOptions = new PdfConfig
    {
        Hierarchy = new HierarchyConfig
        {
            Enabled = true,
            // Other properties use defaults:
            // KClusters = 6
            // IncludeBbox = true
        }
    }
};
var minimalResult = await KreuzbergLib.ExtractFileAsync("document.pdf", minimalConfig);
Console.WriteLine("Extraction with default hierarchy settings complete");

// Disabling hierarchy detection
var noHierarchyConfig = new ExtractionConfig
{
    PdfOptions = new PdfConfig
    {
        Hierarchy = new HierarchyConfig
        {
            Enabled = false
        }
    }
};
var noHierarchyResult = await KreuzbergLib.ExtractFileAsync("document.pdf", noHierarchyConfig);
Console.WriteLine("Extraction without hierarchy detection complete");
```

View File

@@ -0,0 +1,13 @@
using Kreuzberg;
// Enable post-processing and name the processors that should run.
var enabledProcessors = new List<string> { "normalize_whitespace", "remove_diacritics" };
var postprocessor = new PostProcessorConfig
{
    Enabled = true,
    EnabledProcessors = enabledProcessors
};
var config = new ExtractionConfig
{
    UseCache = true,
    Postprocessor = postprocessor
};
var result = KreuzbergLib.ExtractFileSync("document.pdf", config);

View File

@@ -0,0 +1,20 @@
```csharp title="C#"
using Kreuzberg;
// Run two named post-processors and print at most the first 100 characters.
var postprocessor = new PostProcessorConfig
{
    Enabled = true,
    EnabledProcessors = new List<string> { "whitespace_normalizer", "unicode_normalizer" },
    DisabledProcessors = null
};
var config = new ExtractionConfig { Postprocessor = postprocessor };

var result = await KreuzbergLib.ExtractFile("document.pdf", null, config);

// Clamp the preview length so short documents do not overrun the string.
var previewLength = Math.Min(100, result.Content.Length);
Console.WriteLine($"Processed content: {result.Content[..previewLength]}");
```

View File

@@ -0,0 +1,13 @@
```csharp title="C#"
using Kreuzberg;
// Enable quality processing and caching, then print the score and content size.
ExtractionConfig config = new()
{
    EnableQualityProcessing = true,
    UseCache = true
};
var extraction = await KreuzbergLib.ExtractFile("document.pdf", null, config);
Console.WriteLine($"Quality score: {extraction.QualityScore}");
Console.WriteLine($"Content length: {extraction.Content.Length}");
```

View File

@@ -0,0 +1,22 @@
```csharp title="C#"
using Kreuzberg;
// Tesseract OCR with explicit engine settings.
// NOTE(review): the OCR-level language is "eng+deu" while the Tesseract-level
// language is "eng" — confirm which one takes effect and whether the mismatch is intended.
var tesseract = new TesseractConfig
{
    Psm = 6,
    Oem = 3,
    MinConfidence = 0.5,
    Language = "eng"
};
var ocr = new OcrConfig
{
    Backend = "tesseract",
    Language = "eng+deu",
    TesseractConfig = tesseract
};
var config = new ExtractionConfig { Ocr = ocr };

var result = await KreuzbergLib.ExtractFile("scanned.pdf", null, config);

// Print at most the first 100 characters of the recognized text.
var previewLength = Math.Min(100, result.Content.Length);
Console.WriteLine($"OCR text: {result.Content[..previewLength]}");
```

View File

@@ -0,0 +1,16 @@
```csharp title="C#"
using Kreuzberg;
// Apply "moderate" token reduction, then print the resulting size and a short preview.
var reduction = new TokenReductionOptions
{
    Mode = "moderate",
    PreserveImportantWords = true
};
var config = new ExtractionConfig { TokenReduction = reduction };

var result = await KreuzbergLib.ExtractFile("document.pdf", null, config);
var reduced = result.Content;
Console.WriteLine($"Reduced content length: {reduced.Length}");
// The preview is clamped to the content length.
Console.WriteLine($"Content: {reduced[..Math.Min(100, reduced.Length)]}");
```

View File

@@ -0,0 +1,18 @@
using Kreuzberg;
// Inspect the error details attached to an unsuccessful extraction result.
ExtractionConfig config = new()
{
    UseCache = true,
    EnableQualityProcessing = true
};
var result = KreuzbergLib.ExtractFileSync("document.pdf", config);

// Error details are only read when the result failed and carries error metadata.
if (!result.Success && result.Metadata?.Error is { } error)
{
    var errorType = error.ErrorType;
    var errorMessage = error.Message;
}

View File

@@ -0,0 +1,13 @@
using Kreuzberg;
// Cached extraction with English Tesseract OCR.
var ocr = new OcrConfig
{
    Backend = "tesseract",
    Language = "eng"
};
ExtractionConfig config = new()
{
    UseCache = true,
    Ocr = ocr
};
var result = KreuzbergLib.ExtractFileSync("document.pdf", config);

View File

@@ -0,0 +1,10 @@
using Kreuzberg;
// Asynchronous extraction that is cancelled if it takes longer than 30 seconds.
var config = new ExtractionConfig
{
    UseCache = true,
    EnableQualityProcessing = true
};

// A CancellationTokenSource created with a delay owns a timer, so dispose it
// once the extraction has finished.
using var cts = new System.Threading.CancellationTokenSource(TimeSpan.FromSeconds(30));
var result = await KreuzbergLib.ExtractFileAsync("document.pdf", config, cts.Token);

View File

@@ -0,0 +1,12 @@
```csharp title="C#"
using Kreuzberg;
// Asynchronous extraction with caching and quality processing; prints the full content.
ExtractionConfig config = new()
{
    UseCache = true,
    EnableQualityProcessing = true
};
var extraction = await KreuzbergLib.ExtractFileAsync("document.pdf", config);
Console.WriteLine(extraction.Content);
```

View File

@@ -0,0 +1,9 @@
```csharp title="C#"
using Kreuzberg;
// Extract with a default configuration, then print a preview and the total length.
ExtractionConfig config = new();
var result = await KreuzbergLib.ExtractFileAsync("document.pdf", config);

var text = result.Content;
// The preview is at most 100 characters, clamped to the content length.
var previewLength = Math.Min(100, text.Length);
Console.WriteLine(text.Substring(0, previewLength));
Console.WriteLine($"Total length: {text.Length}");
```

View File

@@ -0,0 +1,16 @@
```csharp title="C#"
using Kreuzberg;
// Asynchronous extraction with Tesseract OCR, language string "eng+fra", PSM 3.
var tesseract = new TesseractConfig { Psm = 3 };
var ocr = new OcrConfig
{
    Backend = "tesseract",
    Language = "eng+fra",
    TesseractConfig = tesseract
};
var config = new ExtractionConfig { Ocr = ocr };

var extraction = await KreuzbergLib.ExtractFileAsync("document.pdf", config);
Console.WriteLine(extraction.Content);
```

View File

@@ -0,0 +1,96 @@
```csharp title="usage.cs"
using System;
using System.Diagnostics;
using System.IO;
using System.Text.Json;
using System.Threading.Tasks;
// Drive the Docker-hosted API: start the container, extract one file, and
// always stop the container afterwards, even if extraction throws.
var docker = new DockerKreuzbergLib();
try
{
    await docker.StartContainerAsync();

    // Fixed two-second pause before the first request.
    await Task.Delay(2000);

    var extracted = await docker.ExtractFileAsync("document.pdf");
    Console.WriteLine($"Extracted content:\n{extracted}");
}
finally
{
    await docker.StopContainerAsync();
}
/// <summary>
/// Small helper that runs the Kreuzberg API in a Docker container via the
/// <c>docker</c> CLI and posts files to it over HTTP.
/// </summary>
class DockerKreuzbergLib
{
    private const string ContainerName = "kreuzberg-api";
    private const string ContainerImage = "kreuzberg:latest";
    private const int ApiPort = 8000;

    // One client for the lifetime of the process instead of a new HttpClient per request.
    private static readonly HttpClient Http = new HttpClient();

    /// <summary>Starts the container in detached mode with the API port published.</summary>
    /// <exception cref="InvalidOperationException">
    /// Thrown when <c>docker run</c> cannot be launched or exits with a non-zero code.
    /// </exception>
    public async Task StartContainerAsync()
    {
        Console.WriteLine("Starting Kreuzberg Docker container...");

        // stdout is captured here, as in the original start-up call.
        var exitCode = await RunDockerAsync(
            $"run -d --name {ContainerName} -p {ApiPort}:8000 {ContainerImage}",
            captureOutput: true);
        if (exitCode != 0)
        {
            // Do not report success when the container never came up.
            throw new InvalidOperationException(
                $"'docker run' exited with code {exitCode}; the container was not started.");
        }

        Console.WriteLine($"Container started on http://localhost:{ApiPort}");
    }

    /// <summary>Uploads a file as multipart form data and returns the "content" field of the JSON reply.</summary>
    /// <param name="filePath">Path of the file to upload.</param>
    /// <returns>The extracted content, or an empty string when the field is JSON null.</returns>
    /// <exception cref="HttpRequestException">Thrown when the server answers with a non-success status.</exception>
    public async Task<string> ExtractFileAsync(string filePath)
    {
        var fileBytes = await File.ReadAllBytesAsync(filePath);

        using var content = new MultipartFormDataContent();
        content.Add(new ByteArrayContent(fileBytes), "file", Path.GetFileName(filePath));

        // NOTE(review): the plain HTTP example in these docs posts to /extract with a
        // "files" field — confirm which route and field name this image exposes.
        using var response = await Http.PostAsync(
            $"http://localhost:{ApiPort}/api/extract",
            content);
        response.EnsureSuccessStatusCode();

        var json = await response.Content.ReadAsStringAsync();
        var result = JsonSerializer.Deserialize<JsonElement>(json);

        // GetString() yields null for a JSON null; keep the declared non-null return.
        return result.GetProperty("content").GetString() ?? string.Empty;
    }

    /// <summary>
    /// Stops and removes the container. Non-zero exit codes are reported to stderr
    /// instead of being thrown, because this is typically called from a finally block.
    /// </summary>
    public async Task StopContainerAsync()
    {
        Console.WriteLine("Stopping Kreuzberg Docker container...");

        var stopExit = await RunDockerAsync($"stop {ContainerName}", captureOutput: false);
        // Removal is attempted even if stopping failed, matching the original sequence.
        var removeExit = await RunDockerAsync($"rm {ContainerName}", captureOutput: false);

        if (stopExit == 0 && removeExit == 0)
        {
            Console.WriteLine("Container stopped and removed");
        }
        else
        {
            Console.Error.WriteLine(
                $"Container cleanup incomplete (stop exit code {stopExit}, rm exit code {removeExit})");
        }
    }

    // Runs `docker <arguments>`, waits for it to finish and returns its exit code.
    private static async Task<int> RunDockerAsync(string arguments, bool captureOutput)
    {
        var processInfo = new ProcessStartInfo
        {
            FileName = "docker",
            Arguments = arguments,
            UseShellExecute = false,
            RedirectStandardOutput = captureOutput,
        };

        using var process = Process.Start(processInfo)
            ?? throw new InvalidOperationException("Failed to launch the 'docker' executable.");

        if (captureOutput)
        {
            // Drain the redirected pipe so the child can never block on a full buffer.
            await process.StandardOutput.ReadToEndAsync();
        }

        await process.WaitForExitAsync();
        return process.ExitCode;
    }
}
```

View File

@@ -0,0 +1,23 @@
```csharp title="C#"
using Kreuzberg;
// Extraction with one handler per Kreuzberg exception type.
try
{
    var extraction = KreuzbergLib.ExtractFileSync("missing.pdf");
    Console.WriteLine(extraction.Content);
}
catch (KreuzbergValidationException validationError)
{
    // Reported only; this handler does not rethrow.
    Console.Error.WriteLine($"Validation error: {validationError.Message}");
}
catch (KreuzbergIOException ioError)
{
    // Reported, then rethrown with the original stack trace.
    Console.Error.WriteLine($"IO error: {ioError.Message}");
    throw;
}
catch (KreuzbergException extractionError)
{
    // Any other Kreuzberg failure: reported, then rethrown.
    Console.Error.WriteLine($"Extraction failed: {extractionError.Message}");
    throw;
}
```

View File

@@ -0,0 +1,39 @@
```csharp title="C#"
using System;
using System.IO;
using System.Net.Http;
using System.Text.Json;
// Post a document to a running extraction server and print either the JSON
// result or a readable error. Client, request content, response and parsed
// JSON are all disposed when they go out of scope.
using var client = new HttpClient();
try
{
    using var fileStream = File.OpenRead("document.pdf");
    using var content = new MultipartFormDataContent();
    content.Add(new StreamContent(fileStream), "files", "document.pdf");

    using var response = await client.PostAsync("http://localhost:8000/extract", content);
    var body = await response.Content.ReadAsStringAsync();

    if (!response.IsSuccessStatusCode)
    {
        // Fallbacks used when the error body is not the expected JSON object.
        var errorType = $"HTTP {(int)response.StatusCode}";
        var message = body;
        try
        {
            using var errorDoc = JsonDocument.Parse(body);
            var root = errorDoc.RootElement;
            if (root.ValueKind == JsonValueKind.Object)
            {
                // TryGetProperty avoids a KeyNotFoundException when a field is missing.
                if (root.TryGetProperty("error_type", out var typeElement))
                {
                    errorType = typeElement.ToString();
                }
                if (root.TryGetProperty("message", out var messageElement))
                {
                    message = messageElement.ToString();
                }
            }
        }
        catch (JsonException)
        {
            // Non-JSON error body (for example a proxy error page): keep the raw text.
        }

        Console.WriteLine($"Error: {errorType}: {message}");
        return;
    }

    Console.WriteLine($"Success: {body}");
}
catch (HttpRequestException e)
{
    // Connection-level failures such as a refused connection or DNS error.
    Console.WriteLine($"Request failed: {e.Message}");
}
```

View File

@@ -0,0 +1,9 @@
```csharp title="C#"
using Kreuzberg;
// Read the document into memory and extract from the raw bytes, passing the MIME type explicitly.
byte[] pdfBytes = await File.ReadAllBytesAsync("document.pdf");
var extraction = await KreuzbergLib.ExtractBytesAsync(pdfBytes, "application/pdf");

Console.WriteLine(extraction.Content);
Console.WriteLine(extraction.MimeType);
```

View File

@@ -0,0 +1,9 @@
```csharp title="C#"
using Kreuzberg;
// Fully synchronous variant: the file is read with the blocking API so the
// snippet contains no await and can be used from a non-async method.
var data = File.ReadAllBytes("document.pdf");
var result = KreuzbergLib.ExtractBytesSync(data, "application/pdf");
Console.WriteLine(result.Content);
Console.WriteLine(result.MimeType);
```

Some files were not shown because too many files have changed in this diff Show More