This commit is contained in:
24
docs/snippets/csharp/advanced/ChunkPageMapping.cs
Normal file
24
docs/snippets/csharp/advanced/ChunkPageMapping.cs
Normal file
@@ -0,0 +1,24 @@
|
||||
using Kreuzberg;
|
||||
|
||||
// Enable chunking together with page extraction so that chunk metadata can
// carry the first/last page of each chunk.
var config = new ExtractionConfig
{
    Chunking = new ChunkingConfig { MaxChars = 500, MaxOverlap = 50 },
    Pages = new PageConfig { ExtractPages = true }
};

var result = KreuzbergLib.ExtractFileSync("document.pdf", config);

// Chunks may be null, so guard before iterating.
if (result.Chunks != null)
{
    foreach (var chunk in result.Chunks)
    {
        // Only report chunks whose page range is fully known.
        if (!chunk.Metadata.FirstPage.HasValue || !chunk.Metadata.LastPage.HasValue)
        {
            continue;
        }

        var first = chunk.Metadata.FirstPage.Value;
        var last = chunk.Metadata.LastPage.Value;
        var pageRange = first == last
            ? $"Page {first}"
            : $"Pages {first}-{last}";

        // Clamp the preview length: a fixed [..50] range throws
        // ArgumentOutOfRangeException for chunks shorter than 50 characters.
        var preview = chunk.Content[..Math.Min(50, chunk.Content.Length)];
        Console.WriteLine($"Chunk: {preview}... ({pageRange})");
    }
}
|
||||
33
docs/snippets/csharp/advanced/async_extraction.cs
Normal file
33
docs/snippets/csharp/advanced/async_extraction.cs
Normal file
@@ -0,0 +1,33 @@
|
||||
using Kreuzberg;
|
||||
|
||||
/// <summary>
/// Awaits one extraction on its own, then runs three extractions concurrently.
/// </summary>
class Program
{
    static async Task Main()
    {
        try
        {
            var single = await KreuzbergLib.ExtractFileAsync("document.pdf");
            Console.WriteLine($"Content length: {single.Content.Length}");
            Console.WriteLine($"MIME type: {single.MimeType}");

            // Start every extraction before awaiting any of them.
            var first = KreuzbergLib.ExtractFileAsync("file1.pdf");
            var second = KreuzbergLib.ExtractFileAsync("file2.pdf");
            var third = KreuzbergLib.ExtractFileAsync("file3.pdf");

            var extractions = await Task.WhenAll(first, second, third);
            foreach (var extraction in extractions)
            {
                Console.WriteLine($"Extracted {extraction.Content.Length} characters");
            }
        }
        catch (KreuzbergException failure)
        {
            Console.WriteLine($"Extraction failed: {failure.Message}");
        }
    }
}
|
||||
46
docs/snippets/csharp/advanced/batch_processing.cs
Normal file
46
docs/snippets/csharp/advanced/batch_processing.cs
Normal file
@@ -0,0 +1,46 @@
|
||||
using Kreuzberg;
|
||||
using System.Collections.Generic;
|
||||
|
||||
/// <summary>
/// Extracts the same three files twice: first one at a time, then all at once
/// via <c>Task.WhenAll</c>.
/// </summary>
class Program
{
    static async Task Main()
    {
        var config = new ExtractionConfig { UseCache = true, EnableQualityProcessing = true };
        var filePaths = new[] { "document1.pdf", "document2.pdf", "document3.pdf" };

        try
        {
            // Pass 1: sequential. Each file is awaited before the next one starts.
            var batchResults = new List<ExtractionResult>();
            foreach (var filePath in filePaths)
            {
                var extracted = await KreuzbergLib.ExtractFileAsync(filePath, config);
                batchResults.Add(extracted);
                Console.WriteLine($"Processed {filePath}: {extracted.Content.Length} chars");
            }

            // Pass 2: concurrent. Every extraction is started, then awaited together.
            var pending = filePaths
                .Select(path => KreuzbergLib.ExtractFileAsync(path, config))
                .ToArray();
            var completed = await Task.WhenAll(pending);

            var totalChars = completed.Select(item => item.Content.Length).Sum();
            Console.WriteLine($"Total extracted: {totalChars} characters");
        }
        catch (KreuzbergException failure)
        {
            Console.WriteLine($"Batch processing error: {failure.Message}");
        }
    }
}
|
||||
52
docs/snippets/csharp/advanced/chunk_page_mapping.md
Normal file
52
docs/snippets/csharp/advanced/chunk_page_mapping.md
Normal file
@@ -0,0 +1,52 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
/// <summary>
/// Extracts a PDF with chunking and page extraction enabled, then prints a
/// short preview of every chunk together with the page range it covers.
/// </summary>
class Program
{
    static async Task Main()
    {
        var config = new ExtractionConfig
        {
            // Property names match the other chunking snippets (MaxChars / MaxOverlap).
            Chunking = new ChunkingConfig
            {
                MaxChars = 500,
                MaxOverlap = 50
            },
            Pages = new PageConfig
            {
                ExtractPages = true
            }
        };

        try
        {
            var result = await KreuzbergLib.ExtractFileAsync(
                "document.pdf",
                config
            ).ConfigureAwait(false);

            if (result.Chunks != null)
            {
                foreach (var chunk in result.Chunks)
                {
                    // Only report chunks whose page range is fully known.
                    if (chunk.Metadata.FirstPage.HasValue && chunk.Metadata.LastPage.HasValue)
                    {
                        var first = chunk.Metadata.FirstPage.Value;
                        var last = chunk.Metadata.LastPage.Value;
                        var pageRange = first == last
                            ? $"Page {first}"
                            : $"Pages {first}-{last}";

                        // Clamp so chunks shorter than 50 characters do not throw.
                        var preview = chunk.Content[..Math.Min(50, chunk.Content.Length)];
                        Console.WriteLine($"Chunk: {preview}... ({pageRange})");
                    }
                }
            }
        }
        catch (KreuzbergException ex)
        {
            Console.WriteLine($"Error: {ex.Message}");
        }
    }
}
|
||||
```
|
||||
75
docs/snippets/csharp/advanced/chunking_config.cs
Normal file
75
docs/snippets/csharp/advanced/chunking_config.cs
Normal file
@@ -0,0 +1,75 @@
|
||||
using Kreuzberg;
|
||||
|
||||
/// <summary>
/// Chunking configuration examples: chunking with embeddings, and chunking
/// with <c>PrependHeadingContext</c> enabled.
/// </summary>
class Program
{
    /// <summary>
    /// Extracts a PDF with chunking and embeddings configured and prints the
    /// size of every chunk and, when present, of its embedding.
    /// </summary>
    static async Task Main()
    {
        var config = new ExtractionConfig
        {
            Chunking = new ChunkingConfig
            {
                MaxChars = 1000,
                MaxOverlap = 200,
                Embedding = new EmbeddingConfig
                {
                    Model = EmbeddingModelType.Preset("all-minilm-l6-v2"),
                    Normalize = true,
                    BatchSize = 32
                }
            }
        };

        try
        {
            var result = await KreuzbergLib.ExtractFileAsync(
                "document.pdf",
                config
            ).ConfigureAwait(false);

            // Other snippets treat Chunks as nullable; guard instead of
            // risking a NullReferenceException.
            var chunks = result.Chunks;
            if (chunks == null)
            {
                Console.WriteLine("Chunks: 0");
                return;
            }

            Console.WriteLine($"Chunks: {chunks.Count}");
            foreach (var chunk in chunks)
            {
                Console.WriteLine($"Content length: {chunk.Content.Length}");
                if (chunk.Embedding != null)
                {
                    Console.WriteLine($"Embedding dimensions: {chunk.Embedding.Length}");
                }
            }
        }
        catch (KreuzbergException ex)
        {
            Console.WriteLine($"Error: {ex.Message}");
        }
    }

    /// <summary>
    /// Extracts a Markdown file with <c>PrependHeadingContext</c> enabled and
    /// prints up to the first 100 characters of every chunk.
    /// </summary>
    static async Task PrependHeadingContextExample()
    {
        var config = new ExtractionConfig
        {
            Chunking = new ChunkingConfig
            {
                MaxChars = 500,
                MaxOverlap = 50,
                PrependHeadingContext = true
            }
        };

        try
        {
            var result = await KreuzbergLib.ExtractFileAsync(
                "document.md",
                config
            ).ConfigureAwait(false);

            // Nothing to print when no chunks were produced.
            if (result.Chunks == null)
            {
                return;
            }

            foreach (var chunk in result.Chunks)
            {
                // Each chunk's content is prefixed with its heading breadcrumb
                Console.WriteLine(chunk.Content[..Math.Min(100, chunk.Content.Length)]);
            }
        }
        catch (KreuzbergException ex)
        {
            Console.WriteLine($"Error: {ex.Message}");
        }
    }
}
|
||||
132
docs/snippets/csharp/advanced/chunking_config.md
Normal file
132
docs/snippets/csharp/advanced/chunking_config.md
Normal file
@@ -0,0 +1,132 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
/// <summary>
/// Extracts a PDF with chunking and embeddings configured and prints the size
/// of every chunk and, when present, of its embedding.
/// </summary>
class Program
{
    static async Task Main()
    {
        var config = new ExtractionConfig
        {
            Chunking = new ChunkingConfig
            {
                MaxChars = 1000,
                MaxOverlap = 200,
                Embedding = new EmbeddingConfig
                {
                    Model = EmbeddingModelType.Preset("all-minilm-l6-v2"),
                    Normalize = true,
                    BatchSize = 32
                }
            }
        };

        try
        {
            var result = await KreuzbergLib.ExtractFileAsync(
                "document.pdf",
                config
            ).ConfigureAwait(false);

            // Other snippets treat Chunks as nullable; guard instead of
            // risking a NullReferenceException.
            var chunks = result.Chunks;
            if (chunks == null)
            {
                Console.WriteLine("Chunks: 0");
                return;
            }

            Console.WriteLine($"Chunks: {chunks.Count}");
            foreach (var chunk in chunks)
            {
                Console.WriteLine($"Content length: {chunk.Content.Length}");
                if (chunk.Embedding != null)
                {
                    Console.WriteLine($"Embedding dimensions: {chunk.Embedding.Length}");
                }
            }
        }
        catch (KreuzbergException ex)
        {
            Console.WriteLine($"Error: {ex.Message}");
        }
    }
}
|
||||
```
|
||||
|
||||
```csharp title="C# - Markdown with Heading Context"
|
||||
using Kreuzberg;
|
||||
|
||||
/// <summary>
/// Extracts a Markdown file with tokenizer-based chunk sizing and prints the
/// headings attached to each chunk's heading context.
/// </summary>
class Program
{
    static async Task Main()
    {
        var config = new ExtractionConfig
        {
            Chunking = new ChunkingConfig
            {
                MaxChars = 500,
                MaxOverlap = 50,
                Sizing = new ChunkSizingConfig
                {
                    Type = "tokenizer",
                    Model = "Xenova/gpt-4o"
                }
            }
        };

        try
        {
            var result = await KreuzbergLib.ExtractFileAsync(
                "document.md",
                config
            ).ConfigureAwait(false);

            // Nothing to print when no chunks were produced.
            if (result.Chunks == null)
            {
                return;
            }

            foreach (var chunk in result.Chunks)
            {
                // Chunks without heading information are skipped.
                if (chunk.HeadingContext?.Headings != null)
                {
                    Console.WriteLine("Headings:");
                    foreach (var heading in chunk.HeadingContext.Headings)
                    {
                        Console.WriteLine($"  Level {heading.Level}: {heading.Text}");
                    }
                }
            }
        }
        catch (KreuzbergException ex)
        {
            Console.WriteLine($"Error: {ex.Message}");
        }
    }
}
|
||||
```
|
||||
|
||||
```csharp title="C# - Prepend Heading Context"
|
||||
using Kreuzberg;
|
||||
|
||||
/// <summary>
/// Extracts a Markdown file with <c>PrependHeadingContext</c> enabled and
/// prints up to the first 100 characters of every chunk.
/// </summary>
class Program
{
    static async Task Main()
    {
        var config = new ExtractionConfig
        {
            Chunking = new ChunkingConfig
            {
                MaxChars = 500,
                MaxOverlap = 50,
                PrependHeadingContext = true
            }
        };

        try
        {
            var result = await KreuzbergLib.ExtractFileAsync(
                "document.md",
                config
            ).ConfigureAwait(false);

            // Nothing to print when no chunks were produced.
            if (result.Chunks == null)
            {
                return;
            }

            foreach (var chunk in result.Chunks)
            {
                // Each chunk's content is prefixed with its heading breadcrumb
                Console.WriteLine(chunk.Content[..Math.Min(100, chunk.Content.Length)]);
            }
        }
        catch (KreuzbergException ex)
        {
            Console.WriteLine($"Error: {ex.Message}");
        }
    }
}
|
||||
```
|
||||
83
docs/snippets/csharp/advanced/chunking_rag.cs
Normal file
83
docs/snippets/csharp/advanced/chunking_rag.cs
Normal file
@@ -0,0 +1,83 @@
|
||||
using Kreuzberg;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
|
||||
/// <summary>
/// Skeleton of a retrieval pipeline: extract chunks with embeddings, copy them
/// into an in-memory store, and list the top entries for a query.
/// </summary>
class RagPipelineExample
{
    static async Task Main()
    {
        var config = new ExtractionConfig
        {
            Chunking = new ChunkingConfig
            {
                MaxChars = 500,
                MaxOverlap = 50,
                Embedding = new EmbeddingConfig
                {
                    Model = EmbeddingModelType.Preset("all-mpnet-base-v2"),
                    Normalize = true,
                    BatchSize = 16
                }
            }
        };

        try
        {
            var result = await KreuzbergLib.ExtractFileAsync(
                "research_paper.pdf",
                config
            ).ConfigureAwait(false);

            // Treat a missing chunk list as empty instead of passing null to LINQ.
            var vectorStore = await BuildVectorStoreAsync(
                    result.Chunks ?? Enumerable.Empty<Chunk>())
                .ConfigureAwait(false);

            var query = "machine learning optimization";
            var relevantChunks = await SearchAsync(vectorStore, query)
                .ConfigureAwait(false);

            Console.WriteLine($"Found {relevantChunks.Count} relevant chunks");
            foreach (var chunk in relevantChunks.Take(3))
            {
                // Clamp the preview: a fixed [..80] range throws for shorter content.
                var preview = chunk.Content[..Math.Min(80, chunk.Content.Length)];
                Console.WriteLine($"Content: {preview}...");
                Console.WriteLine($"Similarity: {chunk.Similarity:F3}\n");
            }
        }
        catch (KreuzbergException ex)
        {
            Console.WriteLine($"Error: {ex.Message}");
        }
    }

    /// <summary>
    /// Copies each chunk's content and embedding into a <see cref="VectorEntry"/>.
    /// Chunks without an embedding get an empty vector; similarity starts at 0.
    /// </summary>
    static async Task<List<VectorEntry>> BuildVectorStoreAsync(
        IEnumerable<Chunk> chunks)
    {
        return await Task.Run(() =>
        {
            return chunks.Select(c => new VectorEntry
            {
                Content = c.Content,
                Embedding = c.Embedding?.ToArray() ?? Array.Empty<float>(),
                Similarity = 0f
            }).ToList();
        }).ConfigureAwait(false);
    }

    /// <summary>
    /// Returns the store ordered by descending <c>Similarity</c>.
    /// </summary>
    /// <remarks>
    /// Placeholder: <paramref name="query"/> is not used and no similarity is
    /// computed here, so entries keep the score they were stored with.
    /// </remarks>
    static async Task<List<VectorEntry>> SearchAsync(
        List<VectorEntry> store,
        string query)
    {
        return await Task.Run(() =>
        {
            return store
                .OrderByDescending(e => e.Similarity)
                .ToList();
        }).ConfigureAwait(false);
    }

    /// <summary>One stored chunk: its text, its embedding, and a similarity score.</summary>
    class VectorEntry
    {
        public string Content { get; set; } = string.Empty;
        public float[] Embedding { get; set; } = Array.Empty<float>();
        public float Similarity { get; set; }
    }
}
|
||||
85
docs/snippets/csharp/advanced/chunking_rag.md
Normal file
85
docs/snippets/csharp/advanced/chunking_rag.md
Normal file
@@ -0,0 +1,85 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
|
||||
/// <summary>
/// Skeleton of a retrieval pipeline: extract chunks with embeddings, copy them
/// into an in-memory store, and list the top entries for a query.
/// </summary>
class RagPipelineExample
{
    static async Task Main()
    {
        var config = new ExtractionConfig
        {
            Chunking = new ChunkingConfig
            {
                MaxChars = 500,
                MaxOverlap = 50,
                Embedding = new EmbeddingConfig
                {
                    Model = EmbeddingModelType.Preset("all-mpnet-base-v2"),
                    Normalize = true,
                    BatchSize = 16
                }
            }
        };

        try
        {
            var result = await KreuzbergLib.ExtractFileAsync(
                "research_paper.pdf",
                config
            ).ConfigureAwait(false);

            // Treat a missing chunk list as empty instead of passing null to LINQ.
            var vectorStore = await BuildVectorStoreAsync(
                    result.Chunks ?? Enumerable.Empty<Chunk>())
                .ConfigureAwait(false);

            var query = "machine learning optimization";
            var relevantChunks = await SearchAsync(vectorStore, query)
                .ConfigureAwait(false);

            Console.WriteLine($"Found {relevantChunks.Count} relevant chunks");
            foreach (var chunk in relevantChunks.Take(3))
            {
                // Clamp the preview: a fixed [..80] range throws for shorter content.
                var preview = chunk.Content[..Math.Min(80, chunk.Content.Length)];
                Console.WriteLine($"Content: {preview}...");
                Console.WriteLine($"Similarity: {chunk.Similarity:F3}\n");
            }
        }
        catch (KreuzbergException ex)
        {
            Console.WriteLine($"Error: {ex.Message}");
        }
    }

    /// <summary>
    /// Copies each chunk's content and embedding into a <see cref="VectorEntry"/>.
    /// Chunks without an embedding get an empty vector; similarity starts at 0.
    /// </summary>
    static async Task<List<VectorEntry>> BuildVectorStoreAsync(
        IEnumerable<Chunk> chunks)
    {
        return await Task.Run(() =>
        {
            return chunks.Select(c => new VectorEntry
            {
                Content = c.Content,
                Embedding = c.Embedding?.ToArray() ?? Array.Empty<float>(),
                Similarity = 0f
            }).ToList();
        }).ConfigureAwait(false);
    }

    /// <summary>
    /// Returns the store ordered by descending <c>Similarity</c>.
    /// </summary>
    /// <remarks>
    /// Placeholder: <paramref name="query"/> is not used and no similarity is
    /// computed here, so entries keep the score they were stored with.
    /// </remarks>
    static async Task<List<VectorEntry>> SearchAsync(
        List<VectorEntry> store,
        string query)
    {
        return await Task.Run(() =>
        {
            return store
                .OrderByDescending(e => e.Similarity)
                .ToList();
        }).ConfigureAwait(false);
    }

    /// <summary>One stored chunk: its text, its embedding, and a similarity score.</summary>
    class VectorEntry
    {
        public string Content { get; set; } = string.Empty;
        public float[] Embedding { get; set; } = Array.Empty<float>();
        public float Similarity { get; set; }
    }
}
|
||||
```
|
||||
72
docs/snippets/csharp/advanced/combining_all_features.md
Normal file
72
docs/snippets/csharp/advanced/combining_all_features.md
Normal file
@@ -0,0 +1,72 @@
|
||||
```csharp title="C#"
|
||||
using System;
|
||||
using System.Threading.Tasks;
|
||||
using Kreuzberg;
|
||||
|
||||
// Runs a single extraction with quality processing, language detection, token
// reduction, chunking with embeddings, and keyword extraction all configured,
// then prints a summary of what came back.
async Task RunRagPipeline()
{
    var config = new ExtractionConfig
    {
        EnableQualityProcessing = true,

        LanguageDetection = new LanguageDetectionConfig
        {
            Enabled = true,
            DetectMultiple = true,
            MinConfidence = 0.8,
        },

        TokenReduction = new TokenReductionConfig
        {
            Mode = "moderate",
            PreserveImportantWords = true,
        },

        Chunking = new ChunkingConfig
        {
            MaxChars = 512,
            MaxOverlap = 50,
            // Typed embedding config, as in the other embedding snippets.
            // NOTE(review): confirm the "balanced" preset name against the bindings.
            Embedding = new EmbeddingConfig
            {
                Model = EmbeddingModelType.Preset("balanced"),
            },
            Enabled = true,
        },

        Keywords = new KeywordConfig
        {
            Algorithm = "yake",
            MaxKeywords = 10,
        },
    };

    var result = await KreuzbergLib.ExtractFileAsync("document.pdf", config);

    Console.WriteLine($"Content length: {result.Content.Length} characters");

    // Each optional section is printed only when it is present and non-empty.
    if (result.DetectedLanguages?.Count > 0)
    {
        Console.WriteLine($"Languages: {string.Join(", ", result.DetectedLanguages)}");
    }

    if (result.Chunks?.Count > 0)
    {
        Console.WriteLine($"Total chunks: {result.Chunks.Count}");
        var firstChunk = result.Chunks[0];
        Console.WriteLine($"First chunk tokens: {firstChunk.Metadata.TokenCount}");
        if (firstChunk.Embedding?.Length > 0)
        {
            Console.WriteLine($"Embedding dimensions: {firstChunk.Embedding.Length}");
        }
    }

    Console.WriteLine($"Quality score: {result.QualityScore}");

    if (result.ExtractedKeywords?.Count > 0)
    {
        Console.WriteLine($"Keywords: {string.Join(", ", result.ExtractedKeywords)}");
    }
}

await RunRagPipeline();
|
||||
```
|
||||
63
docs/snippets/csharp/advanced/custom_cache.cs
Normal file
63
docs/snippets/csharp/advanced/custom_cache.cs
Normal file
@@ -0,0 +1,63 @@
|
||||
using Kreuzberg;
|
||||
using System.Collections.Generic;
|
||||
|
||||
/// <summary>
/// In-memory cache in front of <c>KreuzbergLib.ExtractFileAsync</c>, keyed by
/// file path and extraction configuration. Not synchronized for concurrent use.
/// </summary>
class CustomCacheBackend
{
    private readonly Dictionary<string, ExtractionResult> _cache = new();

    /// <summary>
    /// Returns the cached result for this file/config pair, extracting and
    /// caching it first when no entry exists.
    /// </summary>
    /// <param name="filePath">Path of the document to extract.</param>
    /// <param name="config">Extraction configuration; part of the cache key.</param>
    public async Task<ExtractionResult> GetOrExtractAsync(
        string filePath,
        ExtractionConfig config)
    {
        var cacheKey = GenerateCacheKey(filePath, config);

        if (_cache.TryGetValue(cacheKey, out var cachedResult))
        {
            Console.WriteLine("Using cached result");
            return cachedResult;
        }

        var result = await KreuzbergLib.ExtractFileAsync(filePath, config);

        _cache[cacheKey] = result;
        Console.WriteLine("Result cached");

        return result;
    }

    /// <summary>
    /// Builds the cache key from the file path and the serialized configuration.
    /// </summary>
    private static string GenerateCacheKey(string filePath, ExtractionConfig config)
    {
        // config.ToString().GetHashCode() only reflects the configuration if
        // ToString() is overridden, and a 32-bit hash can collide. Using the
        // serialized settings makes different configs produce different keys.
        // NOTE(review): assumes ExtractionConfig is serializable by System.Text.Json — confirm.
        var configJson = System.Text.Json.JsonSerializer.Serialize(config);
        return $"{filePath}:{configJson}";
    }

    /// <summary>Removes every cached result.</summary>
    public void ClearCache()
    {
        _cache.Clear();
        Console.WriteLine("Cache cleared");
    }
}
|
||||
|
||||
/// <summary>
/// Requests the same document twice through <c>CustomCacheBackend</c>, then
/// clears the cache.
/// </summary>
class Program
{
    static async Task Main()
    {
        var config = new ExtractionConfig { UseCache = true };
        var cacheBackend = new CustomCacheBackend();

        try
        {
            // First request for this file/config pair.
            var firstLookup = await cacheBackend.GetOrExtractAsync("document.pdf", config);
            Console.WriteLine($"Result 1: {firstLookup.Content.Length} chars");

            // Same file and config again.
            var secondLookup = await cacheBackend.GetOrExtractAsync("document.pdf", config);
            Console.WriteLine($"Result 2: {secondLookup.Content.Length} chars");

            cacheBackend.ClearCache();
        }
        catch (KreuzbergException failure)
        {
            Console.WriteLine($"Error: {failure.Message}");
        }
    }
}
|
||||
68
docs/snippets/csharp/advanced/custom_extractor.cs
Normal file
68
docs/snippets/csharp/advanced/custom_extractor.cs
Normal file
@@ -0,0 +1,68 @@
|
||||
using Kreuzberg;
|
||||
using System.Text.Json;
|
||||
|
||||
// NOTE: IDocumentExtractor interface is not available in C# bindings
|
||||
|
||||
/// <summary>
/// Turns a JSON payload into an <c>ExtractionResult</c> whose content is every
/// string value found in the document, one per line.
/// </summary>
class CustomJsonProcessor
{
    /// <summary>
    /// Decodes <paramref name="content"/> as UTF-8 JSON and collects its string values.
    /// </summary>
    /// <param name="content">Raw JSON bytes.</param>
    /// <param name="mimeType">MIME type copied onto the result.</param>
    /// <exception cref="KreuzbergParsingException">The bytes are not valid JSON.</exception>
    public static ExtractionResult ProcessJson(byte[] content, string mimeType)
    {
        try
        {
            var jsonContent = System.Text.Encoding.UTF8.GetString(content);

            // JsonDocument is IDisposable; dispose it once the text has been
            // copied out into a string.
            using var document = JsonDocument.Parse(jsonContent);

            var text = ExtractText(document.RootElement);

            return new ExtractionResult
            {
                Content = text,
                MimeType = mimeType,
                Metadata = new Metadata(),
                Tables = new List<Table>(),
                Success = true
            };
        }
        catch (JsonException ex)
        {
            throw new KreuzbergParsingException($"Failed to parse JSON: {ex.Message}");
        }
    }

    /// <summary>
    /// Recursively concatenates every string value under <paramref name="element"/>,
    /// each followed by a newline. Numbers, booleans and nulls contribute nothing.
    /// </summary>
    private static string ExtractText(JsonElement element)
    {
        return element.ValueKind switch
        {
            JsonValueKind.String => element.GetString() + "\n",
            JsonValueKind.Array => string.Concat(
                element.EnumerateArray().Select(ExtractText)
            ),
            JsonValueKind.Object => string.Concat(
                element.EnumerateObject()
                    .Select(p => ExtractText(p.Value))
            ),
            _ => ""
        };
    }
}
|
||||
|
||||
/// <summary>
/// Feeds a small inline JSON document through <c>CustomJsonProcessor</c> and
/// prints the extracted text.
/// </summary>
class Program
{
    static void Main()
    {
        try
        {
            var payload = @"{""name"": ""John"", ""age"": 30}";
            var jsonBytes = System.Text.Encoding.UTF8.GetBytes(payload);

            var extracted = CustomJsonProcessor.ProcessJson(jsonBytes, "application/json");
            Console.WriteLine($"Extracted: {extracted.Content}");
        }
        catch (KreuzbergException failure)
        {
            Console.WriteLine($"Error: {failure.Message}");
        }
    }
}
|
||||
84
docs/snippets/csharp/advanced/custom_ocr_backend.cs
Normal file
84
docs/snippets/csharp/advanced/custom_ocr_backend.cs
Normal file
@@ -0,0 +1,84 @@
|
||||
using Kreuzberg;
|
||||
using System.Net.Http;
|
||||
using System.Text.Json;
|
||||
|
||||
/// <summary>
/// OCR backend that posts the image to an HTTP endpoint and returns the
/// response body as the OCR output.
/// </summary>
/// <remarks>
/// Declares <see cref="IDisposable"/> explicitly because the class owns an
/// <see cref="HttpClient"/> and is consumed with <c>using var</c>.
/// </remarks>
class CloudOcrBackend : IOcrBackend, IDisposable
{
    private readonly string _apiKey;
    private readonly HttpClient _httpClient;

    /// <param name="apiKey">Bearer token sent in the Authorization header.</param>
    public CloudOcrBackend(string apiKey)
    {
        _apiKey = apiKey;
        _httpClient = new HttpClient();
    }

    public string Name => "cloud-ocr";

    /// <summary>
    /// Uploads <paramref name="imageBytes"/> and returns the raw response body.
    /// </summary>
    /// <exception cref="KreuzbergOcrException">The HTTP request failed.</exception>
    public string Process(ReadOnlySpan<byte> imageBytes, OcrConfig? config)
    {
        // ReadOnlySpan<byte> is a ref struct: it cannot be captured by a lambda
        // or used in an async method, so copy it to an array up front.
        var bytes = imageBytes.ToArray();

        // This method is synchronous, so the async HTTP call is run on the
        // thread pool and blocked on here.
        return Task.Run(() => SendAsync(bytes)).GetAwaiter().GetResult();
    }

    /// <summary>Posts the image as multipart form data and reads the response body.</summary>
    private async Task<string> SendAsync(byte[] bytes)
    {
        try
        {
            using var content = new MultipartFormDataContent();
            content.Add(new ByteArrayContent(bytes), "image");

            using var request = new HttpRequestMessage(
                HttpMethod.Post,
                "https://api.example.com/ocr"
            )
            {
                Content = content,
                Headers =
                {
                    { "Authorization", $"Bearer {_apiKey}" }
                }
            };

            using var response = await _httpClient.SendAsync(request).ConfigureAwait(false);
            response.EnsureSuccessStatusCode();

            return await response.Content.ReadAsStringAsync().ConfigureAwait(false);
        }
        catch (HttpRequestException ex)
        {
            throw new KreuzbergOcrException($"Cloud OCR service error: {ex.Message}");
        }
    }

    /// <summary>Releases the owned <see cref="HttpClient"/>.</summary>
    public void Dispose()
    {
        _httpClient?.Dispose();
    }
}
|
||||
|
||||
/// <summary>
/// Registers <c>CloudOcrBackend</c> and runs an extraction that selects it by name.
/// </summary>
class Program
{
    static void Main()
    {
        using var backend = new CloudOcrBackend("your-api-key");
        KreuzbergLib.RegisterOcrBackend(backend);

        try
        {
            // "cloud-ocr" is the Name exposed by CloudOcrBackend.
            var ocrSettings = new OcrConfig { Backend = "cloud-ocr" };
            var extractionConfig = new ExtractionConfig { Ocr = ocrSettings };

            var extracted = KreuzbergLib.ExtractFileSync("document.pdf", extractionConfig);
            Console.WriteLine($"OCR text: {extracted.Content}");
        }
        catch (KreuzbergException failure)
        {
            Console.WriteLine($"Error: {failure.Message}");
        }
    }
}
|
||||
80
docs/snippets/csharp/advanced/custom_postprocessor.cs
Normal file
80
docs/snippets/csharp/advanced/custom_postprocessor.cs
Normal file
@@ -0,0 +1,80 @@
|
||||
using Kreuzberg;
|
||||
|
||||
/// <summary>
/// Post-processor that stores the number of whitespace-separated words of the
/// content under the <c>word_count</c> key of the additional metadata.
/// </summary>
class WordCountPostProcessor : IPostProcessor
{
    public string Name => "word-count";
    public int Priority => 10;

    public ExtractionResult Process(ExtractionResult result)
    {
        // Split on space, newline, carriage return and tab; empty pieces are dropped.
        var separators = new[] { ' ', '\n', '\r', '\t' };
        var words = result.Content.Split(separators, StringSplitOptions.RemoveEmptyEntries);

        // Create the metadata dictionary on first use.
        result.Metadata.Additional ??= new Dictionary<string, System.Text.Json.Nodes.JsonNode?>();
        result.Metadata.Additional["word_count"] = System.Text.Json.Nodes.JsonValue.Create(words.Length);

        return result;
    }
}
|
||||
|
||||
/// <summary>
/// Post-processor that stores a sentiment label under the <c>sentiment</c> key
/// of the additional metadata.
/// </summary>
class SentimentPostProcessor : IPostProcessor
{
    public string Name => "sentiment-analyzer";
    public int Priority => 5;

    public ExtractionResult Process(ExtractionResult result)
    {
        var label = AnalyzeSentiment(result.Content);

        // Create the metadata dictionary on first use.
        result.Metadata.Additional ??= new Dictionary<string, System.Text.Json.Nodes.JsonNode?>();
        result.Metadata.Additional["sentiment"] = System.Text.Json.Nodes.JsonValue.Create(label);

        return result;
    }

    /// <summary>Stub analysis: "unknown" for empty text, "neutral" otherwise.</summary>
    private string AnalyzeSentiment(string text)
    {
        if (text.Length > 0)
        {
            return "neutral";
        }

        return "unknown";
    }
}
|
||||
|
||||
/// <summary>
/// Registers both post-processors, extracts a document, and prints the
/// metadata entries they added.
/// </summary>
class Program
{
    static void Main()
    {
        KreuzbergLib.RegisterPostProcessor(new WordCountPostProcessor());
        KreuzbergLib.RegisterPostProcessor(new SentimentPostProcessor());

        try
        {
            var extracted = KreuzbergLib.ExtractFileSync("document.pdf");

            var extras = extracted.Metadata.Additional;
            if (extras == null)
            {
                return;
            }

            if (extras.TryGetValue("word_count", out var wordCount))
            {
                Console.WriteLine($"Word count: {wordCount}");
            }

            if (extras.TryGetValue("sentiment", out var sentiment))
            {
                Console.WriteLine($"Sentiment: {sentiment}");
            }
        }
        catch (KreuzbergException failure)
        {
            Console.WriteLine($"Error: {failure.Message}");
        }
    }
}
|
||||
82
docs/snippets/csharp/advanced/custom_validator.cs
Normal file
82
docs/snippets/csharp/advanced/custom_validator.cs
Normal file
@@ -0,0 +1,82 @@
|
||||
using Kreuzberg;
|
||||
|
||||
/// <summary>
/// Validator that rejects results whose content is shorter than a minimum length.
/// </summary>
class MinLengthValidator : IValidator
{
    private readonly int _minLength;

    /// <param name="minLength">Smallest accepted content length, in characters.</param>
    public MinLengthValidator(int minLength) => _minLength = minLength;

    public string Name => "min-length";
    public int Priority => 10;

    /// <exception cref="KreuzbergValidationException">Content is shorter than the minimum.</exception>
    public void Validate(ExtractionResult result)
    {
        // Long enough: nothing to report.
        if (result.Content.Length >= _minLength)
        {
            return;
        }

        throw new KreuzbergValidationException(
            $"Content too short: {result.Content.Length} < {_minLength}"
        );
    }
}
|
||||
|
||||
/// <summary>
/// Validator that rejects results whose quality score is below a minimum.
/// </summary>
class QualityScoreValidator : IValidator
{
    private readonly double _minScore;

    /// <param name="minScore">Smallest accepted quality score.</param>
    public QualityScoreValidator(double minScore) => _minScore = minScore;

    public string Name => "quality-score";
    public int Priority => 5;

    /// <exception cref="KreuzbergValidationException">The score is below the minimum.</exception>
    public void Validate(ExtractionResult result)
    {
        var score = result.QualityScore;

        // Written as a negated "<" on purpose: anything that does not compare
        // as strictly below the minimum is accepted.
        if (!(score < _minScore))
        {
            return;
        }

        throw new KreuzbergValidationException(
            $"Quality score too low: {score:F2} < {_minScore:F2}"
        );
    }
}
|
||||
|
||||
/// <summary>
/// Registers both validators and runs an extraction with quality processing
/// enabled, reporting validation failures separately from other errors.
/// </summary>
class Program
{
    static void Main()
    {
        KreuzbergLib.RegisterValidator(new MinLengthValidator(minLength: 50));
        KreuzbergLib.RegisterValidator(new QualityScoreValidator(minScore: 0.7));

        try
        {
            var settings = new ExtractionConfig { EnableQualityProcessing = true };
            var extracted = KreuzbergLib.ExtractFileSync("document.pdf", settings);

            Console.WriteLine("Validation passed");
            Console.WriteLine($"Content length: {extracted.Content.Length}");
        }
        // The more specific handler comes first so validation failures are
        // reported separately from other library errors.
        catch (KreuzbergValidationException rejected)
        {
            Console.WriteLine($"Validation failed: {rejected.Message}");
        }
        catch (KreuzbergException failure)
        {
            Console.WriteLine($"Error: {failure.Message}");
        }
    }
}
|
||||
18
docs/snippets/csharp/advanced/embedding_config.md
Normal file
18
docs/snippets/csharp/advanced/embedding_config.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
// Embedding settings, built on their own for readability and then attached
// to the chunking configuration below.
var embedding = new EmbeddingConfig
{
    Model = EmbeddingModelType.Preset("all-mpnet-base-v2"),
    BatchSize = 16,
    Normalize = true,
    ShowDownloadProgress = true
};

var config = new ExtractionConfig
{
    Chunking = new ChunkingConfig { MaxChars = 1000, Embedding = embedding }
};
|
||||
```
|
||||
49
docs/snippets/csharp/advanced/embedding_with_chunking.md
Normal file
49
docs/snippets/csharp/advanced/embedding_with_chunking.md
Normal file
@@ -0,0 +1,49 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Threading.Tasks;
|
||||
|
||||
// Chunk the document and request an embedding for every chunk.
var config = new ExtractionConfig
{
    Chunking = new ChunkingConfig
    {
        MaxChars = 512,
        MaxOverlap = 50,
        Embedding = new EmbeddingConfig
        {
            Model = EmbeddingModelType.Preset("balanced"),
            Normalize = true,
            BatchSize = 32,
            ShowDownloadProgress = false
        }
    }
};

// `Kreuzberg` is the namespace imported above; the extraction entry points
// used by the other snippets live on the KreuzbergLib class.
var result = await KreuzbergLib.ExtractFileAsync("document.pdf", config);

// Treat a missing chunk list as empty.
var chunks = result.Chunks ?? new List<Chunk>();
foreach (var (index, chunk) in chunks.WithIndex())
{
    var chunkId = $"doc_chunk_{index}";
    Console.WriteLine($"Chunk {chunkId}: {chunk.Content[..Math.Min(50, chunk.Content.Length)]}");

    if (chunk.Embedding != null)
    {
        Console.WriteLine($"  Embedding dimensions: {chunk.Embedding.Length}");
    }
}
|
||||
|
||||
/// <summary>Sequence helpers used by the snippet above.</summary>
internal static class EnumerableExtensions
{
    /// <summary>
    /// Lazily pairs every element of <paramref name="items"/> with its
    /// zero-based position in the sequence.
    /// </summary>
    public static IEnumerable<(int Index, T Item)> WithIndex<T>(
        this IEnumerable<T> items)
    {
        using var cursor = items.GetEnumerator();
        for (var position = 0; cursor.MoveNext(); position++)
        {
            yield return (position, cursor.Current);
        }
    }
}
|
||||
```
|
||||
72
docs/snippets/csharp/advanced/error_handling.cs
Normal file
72
docs/snippets/csharp/advanced/error_handling.cs
Normal file
@@ -0,0 +1,72 @@
|
||||
using Kreuzberg;
|
||||
|
||||
/// <summary>
/// Three error-handling scenarios, run in order: extracting a file, extracting
/// from bytes, and extracting a path that does not exist.
/// </summary>
class Program
{
    static async Task Main()
    {
        await ExtractFileScenarioAsync();
        await ExtractBytesScenarioAsync();
        await MissingFileScenarioAsync();
    }

    /// <summary>File extraction; specific handlers come before the general one.</summary>
    private static async Task ExtractFileScenarioAsync()
    {
        try
        {
            var extracted = await KreuzbergLib.ExtractFileAsync("document.pdf");
            Console.WriteLine($"Extracted {extracted.Content.Length} characters");
        }
        catch (KreuzbergParsingException parsing)
        {
            Console.WriteLine($"Failed to parse document: {parsing.Message}");
        }
        catch (KreuzbergOcrException ocr)
        {
            Console.WriteLine($"OCR processing failed: {ocr.Message}");
        }
        catch (KreuzbergMissingDependencyException missing)
        {
            Console.WriteLine($"Missing dependency: {missing.Message}");
        }
        catch (KreuzbergException failure)
        {
            Console.WriteLine($"Extraction error: {failure.Message}");
        }
    }

    /// <summary>Byte extraction; prints at most the first 100 characters.</summary>
    private static async Task ExtractBytesScenarioAsync()
    {
        try
        {
            var settings = new ExtractionConfig();
            // The four bytes are ASCII "%PDF".
            var pdfBytes = new byte[] { 0x25, 0x50, 0x44, 0x46 };

            var extracted = await KreuzbergLib.ExtractBytesAsync(
                pdfBytes,
                "application/pdf",
                settings
            );

            string preview;
            if (extracted.Content.Length > 100)
            {
                preview = extracted.Content[..100] + "...";
            }
            else
            {
                preview = extracted.Content;
            }

            Console.WriteLine($"Extracted: {preview}");
        }
        catch (KreuzbergValidationException invalid)
        {
            Console.WriteLine($"Invalid configuration: {invalid.Message}");
        }
        catch (KreuzbergOcrException ocr)
        {
            Console.WriteLine($"OCR failed: {ocr.Message}");
        }
        catch (KreuzbergException failure)
        {
            Console.WriteLine($"Extraction failed: {failure.Message}");
        }
    }

    /// <summary>Extraction of a non-existent path; the result is discarded.</summary>
    private static async Task MissingFileScenarioAsync()
    {
        try
        {
            _ = await KreuzbergLib.ExtractFileAsync("nonexistent.pdf");
        }
        catch (KreuzbergIOException)
        {
            Console.WriteLine("File not found");
        }
        catch (Exception unexpected)
        {
            Console.WriteLine($"Unexpected error: {unexpected.Message}");
        }
    }
}
|
||||
66
docs/snippets/csharp/advanced/extract_from_bytes.cs
Normal file
66
docs/snippets/csharp/advanced/extract_from_bytes.cs
Normal file
@@ -0,0 +1,66 @@
|
||||
using Kreuzberg;
|
||||
|
||||
/// <summary>
/// Shows extraction from in-memory byte buffers: a PDF with default settings,
/// the same PDF with an explicit configuration, an image, and a small batch of
/// named buffers.
/// </summary>
class Program
{
    /// <summary>
    /// Runs the byte-extraction examples and reports Kreuzberg and file I/O
    /// failures on the console.
    /// </summary>
    static async Task Main()
    {
        try
        {
            var pdfBytes = await File.ReadAllBytesAsync("document.pdf");

            // Default extraction: only the bytes and their MIME type are supplied.
            var result = await KreuzbergLib.ExtractBytesAsync(
                pdfBytes,
                "application/pdf"
            );

            Console.WriteLine($"Content: {result.Content}");
            Console.WriteLine($"MIME type: {result.MimeType}");

            // Same buffer, this time with caching and quality processing switched on.
            var config = new ExtractionConfig
            {
                UseCache = true,
                EnableQualityProcessing = true
            };

            var result2 = await KreuzbergLib.ExtractBytesAsync(
                pdfBytes,
                "application/pdf",
                config
            );

            Console.WriteLine($"Configured extraction: {result2.Content.Length} chars");

            // Load real image data: an empty buffer labelled image/jpeg contains
            // nothing to extract, so the example could never produce text.
            var imageBytes = await File.ReadAllBytesAsync("image.jpg");

            var imageResult = await KreuzbergLib.ExtractBytesAsync(
                imageBytes,
                "image/jpeg"
            );

            Console.WriteLine($"Image text: {imageResult.Content}");

            // Each entry maps a display name to (file contents, MIME type).
            var multipleFiles = new Dictionary<string, (byte[], string)>
            {
                { "file1", (await File.ReadAllBytesAsync("file1.pdf"), "application/pdf") },
                { "file2", (await File.ReadAllBytesAsync("file2.pdf"), "application/pdf") }
            };

            foreach (var (name, (bytes, mimeType)) in multipleFiles)
            {
                var extractResult = await KreuzbergLib.ExtractBytesAsync(
                    bytes,
                    mimeType
                );
                Console.WriteLine($"{name}: {extractResult.Content.Length} chars");
            }
        }
        catch (KreuzbergException ex)
        {
            Console.WriteLine($"Extraction error: {ex.Message}");
        }
        catch (IOException ex)
        {
            // Raised by File.ReadAllBytesAsync, e.g. when an input file is missing.
            Console.WriteLine($"File I/O error: {ex.Message}");
        }
    }
}
|
||||
73
docs/snippets/csharp/advanced/extract_from_url.cs
Normal file
73
docs/snippets/csharp/advanced/extract_from_url.cs
Normal file
@@ -0,0 +1,73 @@
|
||||
using Kreuzberg;
|
||||
using System.Net.Http;
|
||||
|
||||
/// <summary>
/// Downloads documents over HTTP and extracts their content from the
/// downloaded bytes, first one at a time and then as a concurrent batch.
/// </summary>
class Program
{
    /// <summary>
    /// Runs the URL-extraction examples. A single HttpClient is shared by all
    /// downloads and disposed when the method exits.
    /// </summary>
    static async Task Main()
    {
        using var httpClient = new HttpClient();

        try
        {
            var url = "https://example.com/document.pdf";
            var documentBytes = await httpClient.GetByteArrayAsync(url);

            var result = await KreuzbergLib.ExtractBytesAsync(
                documentBytes,
                "application/pdf"
            );

            Console.WriteLine($"Extracted from URL: {result.Content.Length} chars");

            var config = new ExtractionConfig
            {
                EnableQualityProcessing = true
            };

            var result2 = await KreuzbergLib.ExtractBytesAsync(
                documentBytes,
                "application/pdf",
                config
            );

            Console.WriteLine($"Quality score: {result2.QualityScore}");

            var urls = new[]
            {
                "https://example.com/doc1.pdf",
                "https://example.com/doc2.pdf",
                "https://example.com/doc3.pdf"
            };

            // Each task yields the extraction result, or null when that document
            // could not be downloaded or extracted. Failures are handled per
            // document so one bad URL does not fault Task.WhenAll and hide the
            // documents that succeeded.
            var downloadTasks = urls.Select(async u =>
            {
                try
                {
                    var bytes = await httpClient.GetByteArrayAsync(u);
                    return await KreuzbergLib.ExtractBytesAsync(
                        bytes,
                        "application/pdf"
                    );
                }
                catch (HttpRequestException ex)
                {
                    Console.WriteLine($"Download failed for {u}: {ex.Message}");
                    return null;
                }
                catch (KreuzbergException ex)
                {
                    Console.WriteLine($"Extraction failed for {u}: {ex.Message}");
                    return null;
                }
            });

            var results = await Task.WhenAll(downloadTasks);

            var successCount = results.Count(r => r != null);
            Console.WriteLine($"Successfully processed {successCount} documents");
        }
        catch (HttpRequestException ex)
        {
            Console.WriteLine($"HTTP error: {ex.Message}");
        }
        catch (KreuzbergException ex)
        {
            Console.WriteLine($"Extraction error: {ex.Message}");
        }
    }
}
|
||||
98
docs/snippets/csharp/advanced/extract_with_config.cs
Normal file
98
docs/snippets/csharp/advanced/extract_with_config.cs
Normal file
@@ -0,0 +1,98 @@
|
||||
using Kreuzberg;
|
||||
|
||||
/// <summary>
/// Extracts a PDF using a fully populated <c>ExtractionConfig</c> and prints a
/// short summary of the result.
/// </summary>
class Program
{
    /// <summary>
    /// Runs the extraction and reports the content length, MIME type, format
    /// type, and (when present) the number of tables and chunks.
    /// </summary>
    static async Task Main()
    {
        try
        {
            var result = await KreuzbergLib.ExtractFileAsync("document.pdf", CreateConfig());

            Console.WriteLine($"Content length: {result.Content.Length}");
            Console.WriteLine($"MIME type: {result.MimeType}");
            Console.WriteLine($"Format type: {result.Metadata.FormatType}");

            if (result.Tables.Count > 0)
            {
                Console.WriteLine($"Found {result.Tables.Count} tables");
            }

            // Chunks may be null, so check before counting.
            var chunks = result.Chunks;
            if (chunks != null && chunks.Count > 0)
            {
                Console.WriteLine($"Created {chunks.Count} chunks");
            }
        }
        catch (KreuzbergException ex)
        {
            Console.WriteLine($"Extraction error: {ex.Message}");
        }
    }

    /// <summary>
    /// Builds the configuration used by <see cref="Main"/>, setting every
    /// section this example demonstrates: OCR, PDF options, image extraction,
    /// chunking, token reduction, language detection, and post-processing.
    /// </summary>
    static ExtractionConfig CreateConfig()
    {
        return new ExtractionConfig
        {
            UseCache = true,
            EnableQualityProcessing = true,
            ForceOcr = false,

            Ocr = new OcrConfig
            {
                Backend = "tesseract",
                Language = "eng+fra",
                TesseractConfig = new TesseractConfig
                {
                    Psm = 3,
                    Oem = 3,
                    MinConfidence = 0.8,
                    Preprocessing = new ImagePreprocessingConfig
                    {
                        TargetDpi = 300,
                        Denoise = true,
                        Deskew = true,
                        ContrastEnhance = true
                    },
                    EnableTableDetection = true
                }
            },

            PdfOptions = new PdfConfig
            {
                ExtractImages = true,
                ExtractMetadata = true
            },

            Images = new ImageExtractionConfig
            {
                ExtractImages = true,
                TargetDpi = 150,
                MaxImageDimension = 4096
            },

            Chunking = new ChunkingConfig
            {
                MaxChars = 1000,
                MaxOverlap = 200,
                Preset = "default"
            },

            TokenReduction = new TokenReductionConfig
            {
                Mode = "moderate",
                PreserveImportantWords = true
            },

            LanguageDetection = new LanguageDetectionConfig
            {
                Enabled = true,
                MinConfidence = 0.8,
                DetectMultiple = false
            },

            Postprocessor = new PostProcessorConfig
            {
                Enabled = true
            }
        };
    }
}
|
||||
15
docs/snippets/csharp/advanced/keyword_extraction_config.md
Normal file
15
docs/snippets/csharp/advanced/keyword_extraction_config.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
// Keyword extraction settings, built separately and then attached to the
// extraction configuration.
var keywordConfig = new KeywordConfig
{
    Algorithm = KeywordAlgorithm.Yake,
    MaxKeywords = 10,
    MinScore = 0.3,
    NgramRange = (1, 3),
    Language = "en"
};

var config = new ExtractionConfig { Keywords = keywordConfig };
|
||||
```
|
||||
30
docs/snippets/csharp/advanced/keyword_extraction_example.md
Normal file
30
docs/snippets/csharp/advanced/keyword_extraction_example.md
Normal file
@@ -0,0 +1,30 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
using System.Collections.Generic;
|
||||
|
||||
// Enable YAKE keyword extraction for this run.
var config = new ExtractionConfig
{
    Keywords = new KeywordConfig
    {
        Algorithm = KeywordAlgorithm.Yake,
        MaxKeywords = 10,
        MinScore = 0.3
    }
};

var result = await KreuzbergLib.ExtractFileAsync(
    "research_paper.pdf",
    config
);

// TryGetValue performs a single lookup, where ContainsKey followed by the
// indexer performed two.
if (result.Metadata.TryGetValue("keywords", out var rawKeywords))
{
    // The casts are kept so an unexpected metadata shape fails loudly
    // instead of being silently skipped.
    var keywords = (List<Dictionary<string, object>>)rawKeywords;
    foreach (var kw in keywords)
    {
        var text = (string)kw["text"];
        var score = (double)kw["score"];
        Console.WriteLine($"{text}: {score:F3}");
    }
}
|
||||
```
|
||||
37
docs/snippets/csharp/advanced/language_detection_config.cs
Normal file
37
docs/snippets/csharp/advanced/language_detection_config.cs
Normal file
@@ -0,0 +1,37 @@
|
||||
using Kreuzberg;
|
||||
|
||||
/// <summary>
/// Extracts a document with language detection enabled and prints the first
/// detected language, if any.
/// </summary>
class Program
{
    /// <summary>
    /// Configures single-language detection, runs the extraction, and reports
    /// the outcome on the console.
    /// </summary>
    static async Task Main()
    {
        var detection = new LanguageDetectionConfig
        {
            Enabled = true,
            MinConfidence = 0.8m,
            DetectMultiple = false
        };
        var config = new ExtractionConfig { LanguageDetection = detection };

        try
        {
            var result = await KreuzbergLib.ExtractFileAsync("document.pdf", config);

            // DetectedLanguages may be null, so guard before indexing.
            var detected = result.DetectedLanguages;
            var message = detected != null && detected.Count > 0
                ? $"Detected Language: {detected[0]}"
                : "No language detected";
            Console.WriteLine(message);

            Console.WriteLine($"Content length: {result.Content.Length} characters");
        }
        catch (KreuzbergException ex)
        {
            Console.WriteLine($"Extraction failed: {ex.Message}");
        }
    }
}
|
||||
39
docs/snippets/csharp/advanced/language_detection_config.md
Normal file
39
docs/snippets/csharp/advanced/language_detection_config.md
Normal file
@@ -0,0 +1,39 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
/// <summary>
/// Extracts a document with language detection enabled and prints the first
/// detected language, if any.
/// </summary>
class Program
{
    /// <summary>
    /// Configures single-language detection, runs the extraction, and reports
    /// the outcome on the console.
    /// </summary>
    static async Task Main()
    {
        var detection = new LanguageDetectionConfig
        {
            Enabled = true,
            MinConfidence = 0.8m,
            DetectMultiple = false
        };
        var config = new ExtractionConfig { LanguageDetection = detection };

        try
        {
            var result = await KreuzbergLib.ExtractFileAsync("document.pdf", config);

            // DetectedLanguages may be null, so guard before indexing.
            var detected = result.DetectedLanguages;
            var message = detected != null && detected.Count > 0
                ? $"Detected Language: {detected[0]}"
                : "No language detected";
            Console.WriteLine(message);

            Console.WriteLine($"Content length: {result.Content.Length} characters");
        }
        catch (KreuzbergException ex)
        {
            Console.WriteLine($"Extraction failed: {ex.Message}");
        }
    }
}
|
||||
```
|
||||
@@ -0,0 +1,40 @@
|
||||
using Kreuzberg;
|
||||
|
||||
/// <summary>
/// Extracts a multilingual document with multi-language detection enabled and
/// lists every language that was detected.
/// </summary>
class Program
{
    /// <summary>
    /// Configures multi-language detection, runs the extraction, and prints the
    /// detected languages followed by the content length and MIME type.
    /// </summary>
    static async Task Main()
    {
        var detection = new LanguageDetectionConfig
        {
            Enabled = true,
            MinConfidence = 0.8m,
            DetectMultiple = true
        };
        var config = new ExtractionConfig { LanguageDetection = detection };

        try
        {
            var result = await KreuzbergLib.ExtractFileAsync("multilingual_document.pdf", config);

            // Treat a missing language list as an empty one.
            var languages = result.DetectedLanguages ?? new List<string>();

            var summary = languages.Count > 0
                ? $"Detected {languages.Count} language(s): {string.Join(", ", languages)}"
                : "No languages detected";
            Console.WriteLine(summary);

            Console.WriteLine($"Total content: {result.Content.Length} characters");
            Console.WriteLine($"MIME type: {result.MimeType}");
        }
        catch (KreuzbergException ex)
        {
            Console.WriteLine($"Processing failed: {ex.Message}");
        }
    }
}
|
||||
@@ -0,0 +1,42 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
/// <summary>
/// Extracts a multilingual document with multi-language detection enabled and
/// lists every language that was detected.
/// </summary>
class Program
{
    /// <summary>
    /// Configures multi-language detection, runs the extraction, and prints the
    /// detected languages followed by the content length and MIME type.
    /// </summary>
    static async Task Main()
    {
        var detection = new LanguageDetectionConfig
        {
            Enabled = true,
            MinConfidence = 0.8m,
            DetectMultiple = true
        };
        var config = new ExtractionConfig { LanguageDetection = detection };

        try
        {
            var result = await KreuzbergLib.ExtractFileAsync("multilingual_document.pdf", config);

            // Treat a missing language list as an empty one.
            var languages = result.DetectedLanguages ?? new List<string>();

            var summary = languages.Count > 0
                ? $"Detected {languages.Count} language(s): {string.Join(", ", languages)}"
                : "No languages detected";
            Console.WriteLine(summary);

            Console.WriteLine($"Total content: {result.Content.Length} characters");
            Console.WriteLine($"MIME type: {result.MimeType}");
        }
        catch (KreuzbergException ex)
        {
            Console.WriteLine($"Processing failed: {ex.Message}");
        }
    }
}
|
||||
```
|
||||
65
docs/snippets/csharp/advanced/plugin_registry.cs
Normal file
65
docs/snippets/csharp/advanced/plugin_registry.cs
Normal file
@@ -0,0 +1,65 @@
|
||||
using Kreuzberg;
|
||||
using System.Collections.Generic;
|
||||
|
||||
/// <summary>
/// Lists the plugins currently registered with Kreuzberg, then registers and
/// unregisters a custom post-processor and clears all validators.
/// </summary>
class Program
{
    /// <summary>
    /// Runs the registry walkthrough; any <c>KreuzbergException</c> is reported
    /// on the console.
    /// </summary>
    static void Main()
    {
        try
        {
            PrintRegistered("Registered Document Extractors:", KreuzbergLib.ListDocumentExtractors());
            PrintRegistered("\nRegistered OCR Backends:", KreuzbergLib.ListOcrBackends());
            PrintRegistered("\nRegistered Post-Processors:", KreuzbergLib.ListPostProcessors());
            PrintRegistered("\nRegistered Validators:", KreuzbergLib.ListValidators());

            // Register the custom processor, then remove it again by name.
            var customProcessor = new CustomPostProcessor();
            KreuzbergLib.RegisterPostProcessor(customProcessor);
            Console.WriteLine($"\nRegistered custom post-processor: {customProcessor.Name}");

            KreuzbergLib.UnregisterPostProcessor(customProcessor.Name);
            Console.WriteLine($"Unregistered post-processor: {customProcessor.Name}");

            KreuzbergLib.ClearValidators();
            Console.WriteLine("All validators cleared");
        }
        catch (KreuzbergException ex)
        {
            Console.WriteLine($"Plugin registry error: {ex.Message}");
        }
    }

    /// <summary>
    /// Writes <paramref name="heading"/> followed by one " - item" line per entry.
    /// </summary>
    /// <param name="heading">Line printed before the entries.</param>
    /// <param name="entries">Registered plugin entries to list.</param>
    static void PrintRegistered<T>(string heading, IEnumerable<T> entries)
    {
        Console.WriteLine(heading);
        foreach (var entry in entries)
        {
            Console.WriteLine($" - {entry}");
        }
    }
}
|
||||
|
||||
/// <summary>
/// Example post-processor that upper-cases the extracted content.
/// </summary>
class CustomPostProcessor : IPostProcessor
{
    /// <summary>Name under which the processor is registered and unregistered.</summary>
    public string Name => "custom-processor";

    /// <summary>Priority value reported to the registry.</summary>
    public int Priority => 50;

    /// <summary>
    /// Replaces <c>result.Content</c> with its upper-case form and returns the
    /// same result instance.
    /// </summary>
    /// <param name="result">Extraction result to modify in place.</param>
    /// <returns>The <paramref name="result"/> that was passed in.</returns>
    public ExtractionResult Process(ExtractionResult result)
    {
        // ToUpperInvariant keeps the output independent of the machine's current
        // culture; ToUpper() maps some letters differently, e.g. 'i' under tr-TR.
        result.Content = result.Content.ToUpperInvariant();
        return result;
    }
}
|
||||
17
docs/snippets/csharp/advanced/quality_processing_config.md
Normal file
17
docs/snippets/csharp/advanced/quality_processing_config.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
// Turn on quality processing for this extraction.
var config = new ExtractionConfig { EnableQualityProcessing = true };

var result = await KreuzbergLib.ExtractFileAsync("document.pdf", config);

// Read the score from the result and print it with two decimals.
var qualityScore = result.QualityScore;
Console.WriteLine($"Quality score: {qualityScore:F2}");
|
||||
```
|
||||
29
docs/snippets/csharp/advanced/quality_processing_example.md
Normal file
29
docs/snippets/csharp/advanced/quality_processing_example.md
Normal file
@@ -0,0 +1,29 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
// Turn on quality processing for this extraction.
var config = new ExtractionConfig { EnableQualityProcessing = true };

var result = KreuzbergLib.ExtractFile("scanned_document.pdf", config);

var qualityScore = result.QualityScore;

// Scores below 0.5 are treated as low quality and trigger a warning with a hint.
if (qualityScore < 0.5)
{
    Console.WriteLine($"Warning: Low quality extraction ({qualityScore:F2})");
    Console.WriteLine("Consider re-scanning with higher DPI or adjusting OCR settings");
}
else
{
    Console.WriteLine($"Quality score: {qualityScore:F2}");
}
|
||||
```
|
||||
108
docs/snippets/csharp/advanced/streaming.cs
Normal file
108
docs/snippets/csharp/advanced/streaming.cs
Normal file
@@ -0,0 +1,108 @@
|
||||
using Kreuzberg;
|
||||
using System.IO;
|
||||
|
||||
class Program
|
||||
{
|
||||
/// <summary>
/// Entry point: processes one large document and prints any error that occurs.
/// </summary>
static async Task Main()
{
    const string filePath = "large_document.pdf";

    try
    {
        await ProcessLargeFileAsync(filePath);
    }
    catch (Exception ex)
    {
        Console.WriteLine($"Error: {ex.Message}");
    }
}
|
||||
|
||||
/// <summary>
/// Extracts <paramref name="filePath"/> with quality processing enabled, splits
/// the extracted text into 1000-character pieces, and processes them in order.
/// </summary>
/// <param name="filePath">Path of the document to extract.</param>
static async Task ProcessLargeFileAsync(string filePath)
{
    var config = new ExtractionConfig { EnableQualityProcessing = true };
    var result = await KreuzbergLib.ExtractFileAsync(filePath, config);

    var contentChunks = ChunkContent(result.Content, chunkSize: 1000);
    Console.WriteLine($"Processing {contentChunks.Count} chunks");

    // Pieces are handled one at a time, in their original order.
    for (int index = 0; index < contentChunks.Count; index++)
    {
        var chunk = contentChunks[index];
        Console.WriteLine($"Chunk {index}: {chunk.Length} characters");
        await ProcessChunkAsync(chunk);
    }
}
|
||||
|
||||
/// <summary>
/// Prints the number of words in <paramref name="chunk"/>, then waits 10 ms.
/// </summary>
/// <param name="chunk">Text whose words are counted.</param>
static async Task ProcessChunkAsync(string chunk)
{
    // Words are separated by spaces or line breaks; empty entries are dropped.
    char[] separators = { ' ', '\n', '\r' };
    string[] words = chunk.Split(separators, StringSplitOptions.RemoveEmptyEntries);

    Console.WriteLine($" Words: {words.Length}");

    await Task.Delay(10);
}
|
||||
|
||||
/// <summary>
/// Splits <paramref name="content"/> into consecutive pieces of at most
/// <paramref name="chunkSize"/> characters. Only the last piece may be shorter.
/// </summary>
/// <param name="content">Text to split. An empty string yields an empty list.</param>
/// <param name="chunkSize">Maximum length of each piece; must be positive.</param>
/// <returns>The pieces, in their original order.</returns>
/// <exception cref="ArgumentNullException"><paramref name="content"/> is null.</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="chunkSize"/> is zero or negative.</exception>
static List<string> ChunkContent(string content, int chunkSize)
{
    if (content is null)
    {
        throw new ArgumentNullException(nameof(content));
    }

    // A zero step would never advance the loop below and would grow the list
    // until memory ran out, so reject it (and negative sizes) up front.
    if (chunkSize <= 0)
    {
        throw new ArgumentOutOfRangeException(nameof(chunkSize), chunkSize, "Chunk size must be positive.");
    }

    var chunks = new List<string>();

    for (int i = 0; i < content.Length; i += chunkSize)
    {
        // The final piece takes whatever is left when less than chunkSize remains.
        var length = Math.Min(chunkSize, content.Length - i);
        chunks.Add(content.Substring(i, length));
    }

    return chunks;
}
|
||||
|
||||
/// <summary>
/// Extracts <paramref name="filePath"/> and yields its text piece by piece:
/// the content of each chunk on the result when chunks exist, otherwise
/// fixed 512-character slices of the full content.
/// </summary>
/// <param name="filePath">Path of the document to extract.</param>
/// <returns>An asynchronous sequence of text pieces.</returns>
static async IAsyncEnumerable<string> StreamExtractedChunksAsync(
    string filePath)
{
    var result = await KreuzbergLib.ExtractFileAsync(filePath);

    // Preferred path: use the chunks already present on the result.
    if (result.Chunks?.Any() == true)
    {
        foreach (var extractedChunk in result.Chunks)
        {
            yield return extractedChunk.Content;
            await Task.Yield();
        }

        yield break;
    }

    // Fallback: slice the full content into fixed-size pieces.
    const int chunkSize = 512;
    var text = result.Content;
    var offset = 0;

    while (offset < text.Length)
    {
        var length = Math.Min(chunkSize, text.Length - offset);
        yield return text.Substring(offset, length);
        await Task.Yield();
        offset += chunkSize;
    }
}
|
||||
|
||||
/// <summary>
/// Consumes <c>StreamExtractedChunksAsync</c> for "document.pdf" and prints a
/// numbered preview (at most 50 characters) of each piece.
/// </summary>
static async Task StreamProcessingExample()
{
    const int previewLength = 50;

    var streamEnumerator = StreamExtractedChunksAsync("document.pdf");

    int index = 0;
    await foreach (var chunk in streamEnumerator)
    {
        // chunk[..50] throws ArgumentOutOfRangeException for pieces shorter than
        // 50 characters (typically the final one), so truncate only when needed
        // and add the ellipsis only when text was actually cut off.
        var preview = chunk.Length > previewLength
            ? chunk[..previewLength] + "..."
            : chunk;

        Console.WriteLine($"Chunk {index++}: {preview}");
    }
}
|
||||
}
|
||||
14
docs/snippets/csharp/advanced/token_reduction_config.md
Normal file
14
docs/snippets/csharp/advanced/token_reduction_config.md
Normal file
@@ -0,0 +1,14 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
// Token reduction settings, built separately and then attached to the
// extraction configuration.
var tokenReduction = new TokenReductionConfig
{
    // "off", "moderate", or "aggressive"
    Mode = "moderate",
    PreserveMarkdown = true,
    PreserveCode = true,
    LanguageHint = "eng"
};

var config = new ExtractionConfig { TokenReduction = tokenReduction };
|
||||
```
|
||||
32
docs/snippets/csharp/advanced/token_reduction_example.md
Normal file
32
docs/snippets/csharp/advanced/token_reduction_example.md
Normal file
@@ -0,0 +1,32 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
// Enable moderate token reduction while preserving markdown.
var config = new ExtractionConfig
{
    TokenReduction = new TokenReductionConfig
    {
        Mode = "moderate",
        PreserveMarkdown = true
    }
};

var result = await KreuzbergLib.ExtractFileAsync(
    "verbose_document.pdf",
    config
);

// Each statistic is read with one TryGetValue lookup, where ContainsKey
// followed by the indexer performed two. A missing key falls back to zero.
var original = result.Metadata.TryGetValue("original_token_count", out var originalValue)
    ? (int)originalValue
    : 0;

var reduced = result.Metadata.TryGetValue("token_count", out var reducedValue)
    ? (int)reducedValue
    : 0;

var ratio = result.Metadata.TryGetValue("token_reduction_ratio", out var ratioValue)
    ? (double)ratioValue
    : 0.0;

Console.WriteLine($"Reduced from {original} to {reduced} tokens");
Console.WriteLine($"Reduction: {ratio * 100:F1}%");
|
||||
```
|
||||
74
docs/snippets/csharp/advanced/vector_database_integration.md
Normal file
74
docs/snippets/csharp/advanced/vector_database_integration.md
Normal file
@@ -0,0 +1,74 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using System.Threading.Tasks;
|
||||
|
||||
/// <summary>
/// Example pipeline that extracts a document into embedded chunks and turns
/// each chunk into a record ready to be stored in a vector database.
/// </summary>
public class VectorDatabaseIntegration
{
    /// <summary>One chunk of a document together with its embedding and metadata.</summary>
    public class VectorRecord
    {
        /// <summary>Identifier in the form "{documentId}_chunk_{index}".</summary>
        public string Id { get; set; }

        /// <summary>Embedding copied from the chunk.</summary>
        public float[] Embedding { get; set; }

        /// <summary>Text content of the chunk.</summary>
        public string Content { get; set; }

        /// <summary>String metadata: document id, chunk index, and content length.</summary>
        public Dictionary<string, string> Metadata { get; set; }
    }

    /// <summary>
    /// Extracts <paramref name="documentPath"/> with chunking and embeddings
    /// enabled, builds one <see cref="VectorRecord"/> per chunk, stores the
    /// records, and returns them.
    /// </summary>
    /// <param name="documentPath">Path of the document to extract.</param>
    /// <param name="documentId">Identifier used as the prefix of each record id.</param>
    /// <returns>The records that were built, in chunk order.</returns>
    public async Task<List<VectorRecord>> ExtractAndVectorize(
        string documentPath,
        string documentId)
    {
        var config = new ExtractionConfig
        {
            Chunking = new ChunkingConfig
            {
                MaxChars = 512,
                MaxOverlap = 50,
                Embedding = new EmbeddingConfig
                {
                    Model = EmbeddingModelType.Preset("balanced"),
                    Normalize = true,
                    BatchSize = 32
                }
            }
        };

        // `Kreuzberg` is the namespace imported by `using Kreuzberg;`, so it cannot
        // be the target of a method call. Use KreuzbergLib like the other snippets.
        var result = await KreuzbergLib.ExtractFileAsync(documentPath, config);

        // Treat a missing chunk list as an empty one.
        var chunks = result.Chunks ?? new List<Chunk>();

        var vectorRecords = chunks
            .Select((chunk, index) => new VectorRecord
            {
                Id = $"{documentId}_chunk_{index}",
                Content = chunk.Content,
                Embedding = chunk.Embedding,
                Metadata = new Dictionary<string, string>
                {
                    { "document_id", documentId },
                    { "chunk_index", index.ToString() },
                    { "content_length", chunk.Content.Length.ToString() }
                }
            })
            .ToList();

        await StoreInVectorDatabase(vectorRecords);
        return vectorRecords;
    }

    /// <summary>
    /// Placeholder for the database write: logs every record that carries a
    /// non-empty embedding. Records without an embedding are skipped.
    /// </summary>
    /// <param name="records">Records to store.</param>
    private async Task StoreInVectorDatabase(List<VectorRecord> records)
    {
        foreach (var record in records)
        {
            if (record.Embedding != null && record.Embedding.Length > 0)
            {
                Console.WriteLine(
                    $"Storing {record.Id}: {record.Content.Length} chars, " +
                    $"{record.Embedding.Length} dims");
            }
        }

        // No real asynchronous work happens here; this keeps the async signature.
        await Task.CompletedTask;
    }
}
|
||||
```
|
||||
Reference in New Issue
Block a user