This commit is contained in:
24
docs/snippets/csharp/advanced/ChunkPageMapping.cs
Normal file
24
docs/snippets/csharp/advanced/ChunkPageMapping.cs
Normal file
@@ -0,0 +1,24 @@
|
||||
using Kreuzberg;
|
||||
|
||||
// Enable chunking together with page extraction so that each chunk's
// metadata can report the page range it came from.
var config = new ExtractionConfig
{
    Chunking = new ChunkingConfig { MaxChars = 500, MaxOverlap = 50 },
    Pages = new PageConfig { ExtractPages = true }
};

var result = KreuzbergLib.ExtractFileSync("document.pdf", config);

if (result.Chunks != null)
{
    foreach (var chunk in result.Chunks)
    {
        // Chunks without a first page carry no page information to print.
        if (!chunk.Metadata.FirstPage.HasValue)
        {
            continue;
        }

        var first = chunk.Metadata.FirstPage.Value;
        // Fall back to the first page when no last page is reported, so the
        // output never shows a dangling "Pages N-".
        var last = chunk.Metadata.LastPage ?? first;
        var pageRange = first == last
            ? $"Page {first}"
            : $"Pages {first}-{last}";

        // Clamp the preview length: a fixed [..50] throws on shorter chunks.
        var preview = chunk.Content[..Math.Min(50, chunk.Content.Length)];
        Console.WriteLine($"Chunk: {preview}... ({pageRange})");
    }
}
|
||||
33
docs/snippets/csharp/advanced/async_extraction.cs
Normal file
33
docs/snippets/csharp/advanced/async_extraction.cs
Normal file
@@ -0,0 +1,33 @@
|
||||
using Kreuzberg;
|
||||
|
||||
/// <summary>
/// Asynchronous extraction example: one document first, then three documents
/// whose extractions are all started before any of them is awaited.
/// </summary>
class Program
{
    static async Task Main()
    {
        try
        {
            // Single document.
            var single = await KreuzbergLib.ExtractFileAsync("document.pdf");

            Console.WriteLine($"Content length: {single.Content.Length}");
            Console.WriteLine($"MIME type: {single.MimeType}");

            // Several documents: every call is issued up front and the
            // resulting tasks are awaited together.
            var extracted = await Task.WhenAll(
                KreuzbergLib.ExtractFileAsync("file1.pdf"),
                KreuzbergLib.ExtractFileAsync("file2.pdf"),
                KreuzbergLib.ExtractFileAsync("file3.pdf"));

            foreach (var item in extracted)
            {
                Console.WriteLine($"Extracted {item.Content.Length} characters");
            }
        }
        catch (KreuzbergException failure)
        {
            Console.WriteLine($"Extraction failed: {failure.Message}");
        }
    }
}
|
||||
46
docs/snippets/csharp/advanced/batch_processing.cs
Normal file
46
docs/snippets/csharp/advanced/batch_processing.cs
Normal file
@@ -0,0 +1,46 @@
|
||||
using Kreuzberg;
|
||||
using System.Collections.Generic;
|
||||
|
||||
/// <summary>
/// Batch processing example: the same set of files is extracted one after
/// another and then again with all extractions awaited together.
/// </summary>
class Program
{
    static async Task Main()
    {
        var settings = new ExtractionConfig
        {
            UseCache = true,
            EnableQualityProcessing = true
        };

        string[] documents = { "document1.pdf", "document2.pdf", "document3.pdf" };

        try
        {
            // Pass 1: sequential, reporting each file as it finishes.
            var collected = new List<ExtractionResult>();

            foreach (var document in documents)
            {
                var extraction = await KreuzbergLib.ExtractFileAsync(document, settings);
                collected.Add(extraction);
                Console.WriteLine($"Processed {document}: {extraction.Content.Length} chars");
            }

            // Pass 2: start every extraction, then await them as a group.
            var pending = (from document in documents
                           select KreuzbergLib.ExtractFileAsync(document, settings)).ToArray();

            var finished = await Task.WhenAll(pending);

            var characterCount = finished.Select(item => item.Content.Length).Sum();
            Console.WriteLine($"Total extracted: {characterCount} characters");
        }
        catch (KreuzbergException failure)
        {
            Console.WriteLine($"Batch processing error: {failure.Message}");
        }
    }
}
|
||||
52
docs/snippets/csharp/advanced/chunk_page_mapping.md
Normal file
52
docs/snippets/csharp/advanced/chunk_page_mapping.md
Normal file
@@ -0,0 +1,52 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
/// <summary>
/// Extracts a PDF with chunking and page extraction enabled, then prints a
/// short preview of every chunk together with the page range it covers.
/// </summary>
class Program
{
    static async Task Main()
    {
        // Property names follow the MaxChars / MaxOverlap naming used by the
        // other chunking snippets.
        var config = new ExtractionConfig
        {
            Chunking = new ChunkingConfig
            {
                MaxChars = 500,
                MaxOverlap = 50
            },
            Pages = new PageConfig
            {
                ExtractPages = true
            }
        };

        try
        {
            var result = await KreuzbergLib.ExtractFileAsync(
                "document.pdf",
                config
            ).ConfigureAwait(false);

            if (result.Chunks != null)
            {
                foreach (var chunk in result.Chunks)
                {
                    // Only chunks with a complete page range are reported.
                    if (chunk.Metadata.FirstPage.HasValue && chunk.Metadata.LastPage.HasValue)
                    {
                        var first = chunk.Metadata.FirstPage.Value;
                        var last = chunk.Metadata.LastPage.Value;
                        var pageRange = first == last
                            ? $"Page {first}"
                            : $"Pages {first}-{last}";

                        // Clamp so chunks shorter than 50 characters do not throw.
                        var preview = chunk.Content[..Math.Min(50, chunk.Content.Length)];
                        Console.WriteLine($"Chunk: {preview}... ({pageRange})");
                    }
                }
            }
        }
        catch (KreuzbergException ex)
        {
            Console.WriteLine($"Error: {ex.Message}");
        }
    }
}
|
||||
```
|
||||
75
docs/snippets/csharp/advanced/chunking_config.cs
Normal file
75
docs/snippets/csharp/advanced/chunking_config.cs
Normal file
@@ -0,0 +1,75 @@
|
||||
using Kreuzberg;
|
||||
|
||||
/// <summary>
/// Chunking configuration examples: chunking with per-chunk embeddings, and
/// chunking with the heading context prepended to each chunk.
/// </summary>
class Program
{
    /// <summary>
    /// Extracts document.pdf with chunking and embeddings configured and
    /// prints the chunk count, each chunk's length and its embedding size.
    /// </summary>
    static async Task Main()
    {
        var config = new ExtractionConfig
        {
            Chunking = new ChunkingConfig
            {
                MaxChars = 1000,
                MaxOverlap = 200,
                Embedding = new EmbeddingConfig
                {
                    Model = EmbeddingModelType.Preset("all-minilm-l6-v2"),
                    Normalize = true,
                    BatchSize = 32
                }
            }
        };

        try
        {
            var result = await KreuzbergLib.ExtractFileAsync(
                "document.pdf",
                config
            ).ConfigureAwait(false);

            // Guard against a missing chunk list instead of dereferencing it.
            var chunks = result.Chunks;
            Console.WriteLine($"Chunks: {chunks?.Count ?? 0}");

            if (chunks != null)
            {
                foreach (var chunk in chunks)
                {
                    Console.WriteLine($"Content length: {chunk.Content.Length}");
                    if (chunk.Embedding != null)
                    {
                        Console.WriteLine($"Embedding dimensions: {chunk.Embedding.Length}");
                    }
                }
            }
        }
        catch (KreuzbergException ex)
        {
            Console.WriteLine($"Error: {ex.Message}");
        }
    }

    /// <summary>
    /// Extracts document.md with PrependHeadingContext enabled and prints up
    /// to the first 100 characters of every chunk.
    /// </summary>
    static async Task PrependHeadingContextExample()
    {
        var config = new ExtractionConfig
        {
            Chunking = new ChunkingConfig
            {
                MaxChars = 500,
                MaxOverlap = 50,
                PrependHeadingContext = true
            }
        };

        try
        {
            var result = await KreuzbergLib.ExtractFileAsync(
                "document.md",
                config
            ).ConfigureAwait(false);

            if (result.Chunks != null)
            {
                foreach (var chunk in result.Chunks)
                {
                    // Each chunk's content is prefixed with its heading breadcrumb
                    Console.WriteLine(chunk.Content[..Math.Min(100, chunk.Content.Length)]);
                }
            }
        }
        catch (KreuzbergException ex)
        {
            Console.WriteLine($"Error: {ex.Message}");
        }
    }
}
|
||||
132
docs/snippets/csharp/advanced/chunking_config.md
Normal file
132
docs/snippets/csharp/advanced/chunking_config.md
Normal file
@@ -0,0 +1,132 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
/// <summary>
/// Extracts document.pdf with chunking and per-chunk embeddings configured,
/// then prints the chunk count, each chunk's length and its embedding size.
/// </summary>
class Program
{
    static async Task Main()
    {
        var config = new ExtractionConfig
        {
            Chunking = new ChunkingConfig
            {
                MaxChars = 1000,
                MaxOverlap = 200,
                Embedding = new EmbeddingConfig
                {
                    Model = EmbeddingModelType.Preset("all-minilm-l6-v2"),
                    Normalize = true,
                    BatchSize = 32
                }
            }
        };

        try
        {
            var result = await KreuzbergLib.ExtractFileAsync(
                "document.pdf",
                config
            ).ConfigureAwait(false);

            // Guard against a missing chunk list instead of dereferencing it.
            var chunks = result.Chunks;
            Console.WriteLine($"Chunks: {chunks?.Count ?? 0}");

            if (chunks != null)
            {
                foreach (var chunk in chunks)
                {
                    Console.WriteLine($"Content length: {chunk.Content.Length}");
                    if (chunk.Embedding != null)
                    {
                        Console.WriteLine($"Embedding dimensions: {chunk.Embedding.Length}");
                    }
                }
            }
        }
        catch (KreuzbergException ex)
        {
            Console.WriteLine($"Error: {ex.Message}");
        }
    }
}
|
||||
```
|
||||
|
||||
```csharp title="C# - Markdown with Heading Context"
|
||||
using Kreuzberg;
|
||||
|
||||
/// <summary>
/// Extracts document.md with tokenizer-based chunk sizing and prints the
/// heading hierarchy attached to each chunk.
/// </summary>
class Program
{
    static async Task Main()
    {
        var config = new ExtractionConfig
        {
            Chunking = new ChunkingConfig
            {
                MaxChars = 500,
                MaxOverlap = 50,
                Sizing = new ChunkSizingConfig
                {
                    Type = "tokenizer",
                    Model = "Xenova/gpt-4o"
                }
            }
        };

        try
        {
            var result = await KreuzbergLib.ExtractFileAsync(
                "document.md",
                config
            ).ConfigureAwait(false);

            // Guard against a missing chunk list instead of enumerating null.
            if (result.Chunks != null)
            {
                foreach (var chunk in result.Chunks)
                {
                    // Chunks without heading information are skipped silently.
                    if (chunk.HeadingContext?.Headings != null)
                    {
                        Console.WriteLine("Headings:");
                        foreach (var heading in chunk.HeadingContext.Headings)
                        {
                            Console.WriteLine($"  Level {heading.Level}: {heading.Text}");
                        }
                    }
                }
            }
        }
        catch (KreuzbergException ex)
        {
            Console.WriteLine($"Error: {ex.Message}");
        }
    }
}
|
||||
```
|
||||
|
||||
```csharp title="C# - Prepend Heading Context"
|
||||
using Kreuzberg;
|
||||
|
||||
/// <summary>
/// Extracts document.md with PrependHeadingContext enabled and prints up to
/// the first 100 characters of every chunk.
/// </summary>
class Program
{
    static async Task Main()
    {
        var config = new ExtractionConfig
        {
            Chunking = new ChunkingConfig
            {
                MaxChars = 500,
                MaxOverlap = 50,
                PrependHeadingContext = true
            }
        };

        try
        {
            var result = await KreuzbergLib.ExtractFileAsync(
                "document.md",
                config
            ).ConfigureAwait(false);

            // Guard against a missing chunk list instead of enumerating null.
            if (result.Chunks != null)
            {
                foreach (var chunk in result.Chunks)
                {
                    // Each chunk's content is prefixed with its heading breadcrumb
                    Console.WriteLine(chunk.Content[..Math.Min(100, chunk.Content.Length)]);
                }
            }
        }
        catch (KreuzbergException ex)
        {
            Console.WriteLine($"Error: {ex.Message}");
        }
    }
}
|
||||
```
|
||||
83
docs/snippets/csharp/advanced/chunking_rag.cs
Normal file
83
docs/snippets/csharp/advanced/chunking_rag.cs
Normal file
@@ -0,0 +1,83 @@
|
||||
using Kreuzberg;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
|
||||
/// <summary>
/// Skeleton of a retrieval pipeline: extract a document into embedded
/// chunks, copy them into an in-memory store, and list the top entries.
/// </summary>
class RagPipelineExample
{
    static async Task Main()
    {
        var config = new ExtractionConfig
        {
            Chunking = new ChunkingConfig
            {
                MaxChars = 500,
                MaxOverlap = 50,
                Embedding = new EmbeddingConfig
                {
                    Model = EmbeddingModelType.Preset("all-mpnet-base-v2"),
                    Normalize = true,
                    BatchSize = 16
                }
            }
        };

        try
        {
            var result = await KreuzbergLib.ExtractFileAsync(
                "research_paper.pdf",
                config
            ).ConfigureAwait(false);

            // Treat a missing chunk list as empty so the store builder never
            // enumerates null.
            var vectorStore = await BuildVectorStoreAsync(
                    result.Chunks ?? Enumerable.Empty<Chunk>())
                .ConfigureAwait(false);

            var query = "machine learning optimization";
            var relevantChunks = await SearchAsync(vectorStore, query)
                .ConfigureAwait(false);

            Console.WriteLine($"Found {relevantChunks.Count} relevant chunks");
            foreach (var chunk in relevantChunks.Take(3))
            {
                // Clamp the preview: a fixed [..80] throws on shorter chunks.
                var preview = chunk.Content[..Math.Min(80, chunk.Content.Length)];
                Console.WriteLine($"Content: {preview}...");
                Console.WriteLine($"Similarity: {chunk.Similarity:F3}\n");
            }
        }
        catch (KreuzbergException ex)
        {
            Console.WriteLine($"Error: {ex.Message}");
        }
    }

    /// <summary>
    /// Copies each chunk's content and embedding into a <see cref="VectorEntry"/>.
    /// Chunks without an embedding get an empty vector; similarity starts at 0.
    /// </summary>
    /// <param name="chunks">Chunks to convert.</param>
    /// <returns>One entry per chunk, in input order.</returns>
    static async Task<List<VectorEntry>> BuildVectorStoreAsync(
        IEnumerable<Chunk> chunks)
    {
        return await Task.Run(() =>
        {
            return chunks.Select(c => new VectorEntry
            {
                Content = c.Content,
                Embedding = c.Embedding?.ToArray() ?? Array.Empty<float>(),
                Similarity = 0f
            }).ToList();
        }).ConfigureAwait(false);
    }

    /// <summary>
    /// Returns the store ordered by descending <see cref="VectorEntry.Similarity"/>.
    /// </summary>
    /// <param name="store">Entries to rank.</param>
    /// <param name="query">Search text. It is not used by this placeholder,
    /// which only sorts by the similarity values already on the entries.</param>
    /// <returns>A new list with the ranked entries.</returns>
    static async Task<List<VectorEntry>> SearchAsync(
        List<VectorEntry> store,
        string query)
    {
        return await Task.Run(() =>
        {
            return store
                .OrderByDescending(e => e.Similarity)
                .ToList();
        }).ConfigureAwait(false);
    }

    /// <summary>One stored chunk: its text, its embedding and a similarity score.</summary>
    class VectorEntry
    {
        public string Content { get; set; } = string.Empty;
        public float[] Embedding { get; set; } = Array.Empty<float>();
        public float Similarity { get; set; }
    }
}
|
||||
85
docs/snippets/csharp/advanced/chunking_rag.md
Normal file
85
docs/snippets/csharp/advanced/chunking_rag.md
Normal file
@@ -0,0 +1,85 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
|
||||
/// <summary>
/// Skeleton of a retrieval pipeline: extract a document into embedded
/// chunks, copy them into an in-memory store, and list the top entries.
/// </summary>
class RagPipelineExample
{
    static async Task Main()
    {
        var config = new ExtractionConfig
        {
            Chunking = new ChunkingConfig
            {
                MaxChars = 500,
                MaxOverlap = 50,
                Embedding = new EmbeddingConfig
                {
                    Model = EmbeddingModelType.Preset("all-mpnet-base-v2"),
                    Normalize = true,
                    BatchSize = 16
                }
            }
        };

        try
        {
            var result = await KreuzbergLib.ExtractFileAsync(
                "research_paper.pdf",
                config
            ).ConfigureAwait(false);

            // Treat a missing chunk list as empty so the store builder never
            // enumerates null.
            var vectorStore = await BuildVectorStoreAsync(
                    result.Chunks ?? Enumerable.Empty<Chunk>())
                .ConfigureAwait(false);

            var query = "machine learning optimization";
            var relevantChunks = await SearchAsync(vectorStore, query)
                .ConfigureAwait(false);

            Console.WriteLine($"Found {relevantChunks.Count} relevant chunks");
            foreach (var chunk in relevantChunks.Take(3))
            {
                // Clamp the preview: a fixed [..80] throws on shorter chunks.
                var preview = chunk.Content[..Math.Min(80, chunk.Content.Length)];
                Console.WriteLine($"Content: {preview}...");
                Console.WriteLine($"Similarity: {chunk.Similarity:F3}\n");
            }
        }
        catch (KreuzbergException ex)
        {
            Console.WriteLine($"Error: {ex.Message}");
        }
    }

    /// <summary>
    /// Copies each chunk's content and embedding into a <see cref="VectorEntry"/>.
    /// Chunks without an embedding get an empty vector; similarity starts at 0.
    /// </summary>
    /// <param name="chunks">Chunks to convert.</param>
    /// <returns>One entry per chunk, in input order.</returns>
    static async Task<List<VectorEntry>> BuildVectorStoreAsync(
        IEnumerable<Chunk> chunks)
    {
        return await Task.Run(() =>
        {
            return chunks.Select(c => new VectorEntry
            {
                Content = c.Content,
                Embedding = c.Embedding?.ToArray() ?? Array.Empty<float>(),
                Similarity = 0f
            }).ToList();
        }).ConfigureAwait(false);
    }

    /// <summary>
    /// Returns the store ordered by descending <see cref="VectorEntry.Similarity"/>.
    /// </summary>
    /// <param name="store">Entries to rank.</param>
    /// <param name="query">Search text. It is not used by this placeholder,
    /// which only sorts by the similarity values already on the entries.</param>
    /// <returns>A new list with the ranked entries.</returns>
    static async Task<List<VectorEntry>> SearchAsync(
        List<VectorEntry> store,
        string query)
    {
        return await Task.Run(() =>
        {
            return store
                .OrderByDescending(e => e.Similarity)
                .ToList();
        }).ConfigureAwait(false);
    }

    /// <summary>One stored chunk: its text, its embedding and a similarity score.</summary>
    class VectorEntry
    {
        public string Content { get; set; } = string.Empty;
        public float[] Embedding { get; set; } = Array.Empty<float>();
        public float Similarity { get; set; }
    }
}
|
||||
```
|
||||
72
docs/snippets/csharp/advanced/combining_all_features.md
Normal file
72
docs/snippets/csharp/advanced/combining_all_features.md
Normal file
@@ -0,0 +1,72 @@
|
||||
```csharp title="C#"
|
||||
using System;
|
||||
using System.Threading.Tasks;
|
||||
using Kreuzberg;
|
||||
|
||||
// Runs one extraction with quality processing, language detection, token
// reduction, chunking with embeddings and keyword extraction all configured,
// then prints a summary of each part of the result that is present.
async Task RunRagPipeline()
{
    var config = new ExtractionConfig
    {
        EnableQualityProcessing = true,

        LanguageDetection = new LanguageDetectionConfig
        {
            Enabled = true,
            DetectMultiple = true,
            MinConfidence = 0.8,
        },

        TokenReduction = new TokenReductionConfig
        {
            Mode = "moderate",
            PreserveImportantWords = true,
        },

        Chunking = new ChunkingConfig
        {
            MaxChars = 512,
            MaxOverlap = 50,
            // Typed embedding configuration, matching the other snippets,
            // instead of an untyped dictionary.
            Embedding = new EmbeddingConfig
            {
                Model = EmbeddingModelType.Preset("balanced"),
            },
            Enabled = true,
        },

        Keywords = new KeywordConfig
        {
            Algorithm = "yake",
            MaxKeywords = 10,
        },
    };

    var result = await KreuzbergLib.ExtractFileAsync("document.pdf", config);

    Console.WriteLine($"Content length: {result.Content.Length} characters");

    if (result.DetectedLanguages?.Count > 0)
    {
        Console.WriteLine($"Languages: {string.Join(", ", result.DetectedLanguages)}");
    }

    if (result.Chunks?.Count > 0)
    {
        Console.WriteLine($"Total chunks: {result.Chunks.Count}");
        var firstChunk = result.Chunks[0];
        Console.WriteLine($"First chunk tokens: {firstChunk.Metadata.TokenCount}");
        if (firstChunk.Embedding?.Length > 0)
        {
            Console.WriteLine($"Embedding dimensions: {firstChunk.Embedding.Length}");
        }
    }

    Console.WriteLine($"Quality score: {result.QualityScore}");

    if (result.ExtractedKeywords?.Count > 0)
    {
        Console.WriteLine($"Keywords: {string.Join(", ", result.ExtractedKeywords)}");
    }
}

await RunRagPipeline();
|
||||
```
|
||||
63
docs/snippets/csharp/advanced/custom_cache.cs
Normal file
63
docs/snippets/csharp/advanced/custom_cache.cs
Normal file
@@ -0,0 +1,63 @@
|
||||
using Kreuzberg;
|
||||
using System.Collections.Generic;
|
||||
|
||||
/// <summary>
/// In-memory cache in front of <c>KreuzbergLib.ExtractFileAsync</c>, keyed by
/// file path plus the textual form of the configuration.
/// </summary>
class CustomCacheBackend
{
    // Assigned once; only the dictionary's contents change.
    private readonly Dictionary<string, ExtractionResult> _cache = new();

    /// <summary>
    /// Returns the cached result for this path and configuration, or runs the
    /// extraction, stores its result and returns it.
    /// </summary>
    /// <param name="filePath">Path of the document to extract.</param>
    /// <param name="config">Extraction configuration; part of the cache key.</param>
    /// <returns>The cached or freshly extracted result.</returns>
    public async Task<ExtractionResult> GetOrExtractAsync(
        string filePath,
        ExtractionConfig config)
    {
        var cacheKey = GenerateCacheKey(filePath, config);

        if (_cache.TryGetValue(cacheKey, out var cachedResult))
        {
            Console.WriteLine("Using cached result");
            return cachedResult;
        }

        var result = await KreuzbergLib.ExtractFileAsync(filePath, config);

        _cache[cacheKey] = result;
        Console.WriteLine("Result cached");

        return result;
    }

    /// <summary>
    /// Builds the cache key from the path and the configuration's string form.
    /// </summary>
    private string GenerateCacheKey(string filePath, ExtractionConfig config)
    {
        // Use the full string rather than its 32-bit hash code: two different
        // configurations whose hashes collide would otherwise share an entry.
        // NOTE(review): this relies on ExtractionConfig.ToString() reflecting
        // the configured values - confirm against the type's definition.
        return $"{filePath}:{config}";
    }

    /// <summary>Removes every cached result.</summary>
    public void ClearCache()
    {
        _cache.Clear();
        Console.WriteLine("Cache cleared");
    }
}
|
||||
|
||||
/// <summary>
/// Requests the same document twice through <c>CustomCacheBackend</c> and
/// then clears the cache.
/// </summary>
class Program
{
    static async Task Main()
    {
        var cache = new CustomCacheBackend();
        var settings = new ExtractionConfig { UseCache = true };

        try
        {
            // Two identical requests, numbered 1 and 2 in the output.
            for (var attempt = 1; attempt <= 2; attempt++)
            {
                var extraction = await cache.GetOrExtractAsync("document.pdf", settings);
                Console.WriteLine($"Result {attempt}: {extraction.Content.Length} chars");
            }

            cache.ClearCache();
        }
        catch (KreuzbergException failure)
        {
            Console.WriteLine($"Error: {failure.Message}");
        }
    }
}
|
||||
68
docs/snippets/csharp/advanced/custom_extractor.cs
Normal file
68
docs/snippets/csharp/advanced/custom_extractor.cs
Normal file
@@ -0,0 +1,68 @@
|
||||
using Kreuzberg;
|
||||
using System.Text.Json;
|
||||
|
||||
// NOTE: IDocumentExtractor interface is not available in C# bindings
|
||||
|
||||
/// <summary>
/// Turns a JSON payload into an <c>ExtractionResult</c> whose content is the
/// concatenation of every string value found in the document.
/// </summary>
class CustomJsonProcessor
{
    /// <summary>
    /// Decodes <paramref name="content"/> as UTF-8 JSON and collects its
    /// string values into the result's content.
    /// </summary>
    /// <param name="content">Raw JSON bytes.</param>
    /// <param name="mimeType">MIME type copied onto the result.</param>
    /// <returns>A successful result with empty metadata and no tables.</returns>
    /// <exception cref="KreuzbergParsingException">The payload is not valid JSON.</exception>
    public static ExtractionResult ProcessJson(byte[] content, string mimeType)
    {
        try
        {
            var jsonContent = System.Text.Encoding.UTF8.GetString(content);

            // JsonDocument is disposable and must be released after use.
            // ExtractText builds its string eagerly, so nothing reads the
            // document after this scope ends.
            using var document = JsonDocument.Parse(jsonContent);

            var text = ExtractText(document.RootElement);

            return new ExtractionResult
            {
                Content = text,
                MimeType = mimeType,
                Metadata = new Metadata(),
                Tables = new List<Table>(),
                Success = true
            };
        }
        catch (JsonException ex)
        {
            throw new KreuzbergParsingException($"Failed to parse JSON: {ex.Message}");
        }
    }

    /// <summary>
    /// Recursively gathers string values: each string contributes its text
    /// plus a newline, arrays and objects contribute their children in
    /// order, and every other value kind contributes nothing.
    /// </summary>
    private static string ExtractText(JsonElement element)
    {
        return element.ValueKind switch
        {
            JsonValueKind.String => element.GetString() + "\n",
            JsonValueKind.Array => string.Concat(
                element.EnumerateArray().Select(ExtractText)
            ),
            JsonValueKind.Object => string.Concat(
                element.EnumerateObject()
                    .Select(p => ExtractText(p.Value))
            ),
            _ => ""
        };
    }
}
|
||||
|
||||
/// <summary>
/// Feeds a small JSON document to <c>CustomJsonProcessor.ProcessJson</c> and
/// prints the extracted text.
/// </summary>
class Program
{
    static void Main()
    {
        try
        {
            // Sample payload, encoded as UTF-8 before processing.
            var payload = @"{""name"": ""John"", ""age"": 30}";
            var encoded = System.Text.Encoding.UTF8.GetBytes(payload);

            var processed = CustomJsonProcessor.ProcessJson(encoded, "application/json");

            Console.WriteLine($"Extracted: {processed.Content}");
        }
        catch (KreuzbergException failure)
        {
            Console.WriteLine($"Error: {failure.Message}");
        }
    }
}
|
||||
84
docs/snippets/csharp/advanced/custom_ocr_backend.cs
Normal file
84
docs/snippets/csharp/advanced/custom_ocr_backend.cs
Normal file
@@ -0,0 +1,84 @@
|
||||
using Kreuzberg;
|
||||
using System.Net.Http;
|
||||
using System.Text.Json;
|
||||
|
||||
/// <summary>
/// OCR backend that posts the image to a remote HTTP endpoint and returns the
/// response body as the OCR output.
/// </summary>
class CloudOcrBackend : IOcrBackend, IDisposable
{
    private readonly string _apiKey;
    private readonly HttpClient _httpClient;

    /// <summary>Creates the backend with the API key sent as a bearer token.</summary>
    /// <param name="apiKey">Key placed in the Authorization header.</param>
    public CloudOcrBackend(string apiKey)
    {
        _apiKey = apiKey;
        _httpClient = new HttpClient();
    }

    /// <summary>Name under which this backend is selected in <c>OcrConfig.Backend</c>.</summary>
    public string Name => "cloud-ocr";

    /// <summary>
    /// Sends the image to the OCR endpoint and blocks until the response
    /// body is available.
    /// </summary>
    /// <param name="imageBytes">Encoded image data.</param>
    /// <param name="config">OCR configuration; not used by this backend.</param>
    /// <returns>The raw response body returned by the service.</returns>
    /// <exception cref="KreuzbergOcrException">The HTTP request failed.</exception>
    public string Process(ReadOnlySpan<byte> imageBytes, OcrConfig? config)
    {
        // A span is a ref struct and cannot be captured by a lambda, so the
        // bytes are copied to an array before the work is handed off.
        var bytes = imageBytes.ToArray();

        // The method is synchronous, so the async upload is run on the
        // thread pool and waited for here.
        return Task.Run(() => UploadAsync(bytes)).GetAwaiter().GetResult();
    }

    /// <summary>Posts the image as multipart form data and reads the response body.</summary>
    private async Task<string> UploadAsync(byte[] bytes)
    {
        try
        {
            using var content = new MultipartFormDataContent();
            content.Add(new ByteArrayContent(bytes), "image");

            using var request = new HttpRequestMessage(
                HttpMethod.Post,
                "https://api.example.com/ocr"
            )
            {
                Content = content
            };
            request.Headers.Add("Authorization", $"Bearer {_apiKey}");

            using var response = await _httpClient.SendAsync(request).ConfigureAwait(false);
            response.EnsureSuccessStatusCode();

            return await response.Content.ReadAsStringAsync().ConfigureAwait(false);
        }
        catch (HttpRequestException ex)
        {
            throw new KreuzbergOcrException($"Cloud OCR service error: {ex.Message}");
        }
    }

    /// <summary>Releases the underlying <see cref="HttpClient"/>.</summary>
    public void Dispose()
    {
        _httpClient.Dispose();
    }
}
|
||||
|
||||
/// <summary>
/// Registers <c>CloudOcrBackend</c> and extracts a PDF with that backend
/// selected by name.
/// </summary>
class Program
{
    static void Main()
    {
        using var cloudBackend = new CloudOcrBackend("your-api-key");
        KreuzbergLib.RegisterOcrBackend(cloudBackend);

        try
        {
            // Select the registered backend through its name.
            var ocrSettings = new OcrConfig { Backend = "cloud-ocr" };
            var settings = new ExtractionConfig { Ocr = ocrSettings };

            var extraction = KreuzbergLib.ExtractFileSync("document.pdf", settings);
            Console.WriteLine($"OCR text: {extraction.Content}");
        }
        catch (KreuzbergException failure)
        {
            Console.WriteLine($"Error: {failure.Message}");
        }
    }
}
|
||||
80
docs/snippets/csharp/advanced/custom_postprocessor.cs
Normal file
80
docs/snippets/csharp/advanced/custom_postprocessor.cs
Normal file
@@ -0,0 +1,80 @@
|
||||
using Kreuzberg;
|
||||
|
||||
/// <summary>
/// Post-processor that stores the number of whitespace-separated words of
/// the content under the "word_count" key of the additional metadata.
/// </summary>
class WordCountPostProcessor : IPostProcessor
{
    public string Name => "word-count";
    public int Priority => 10;

    /// <summary>Counts the words in the content and records the count.</summary>
    /// <param name="result">Result to annotate; modified in place.</param>
    /// <returns>The same result instance.</returns>
    public ExtractionResult Process(ExtractionResult result)
    {
        // Split on space, newline, carriage return and tab; empty pieces
        // produced by consecutive separators are dropped.
        char[] separators = { ' ', '\n', '\r', '\t' };
        var words = result.Content.Split(separators, StringSplitOptions.RemoveEmptyEntries);
        var total = words.Length;

        // Create the metadata dictionary on first use.
        result.Metadata.Additional ??= new Dictionary<string, System.Text.Json.Nodes.JsonNode?>();
        result.Metadata.Additional["word_count"] = System.Text.Json.Nodes.JsonValue.Create(total);

        return result;
    }
}
|
||||
|
||||
/// <summary>
/// Post-processor that stores a sentiment label under the "sentiment" key of
/// the additional metadata.
/// </summary>
class SentimentPostProcessor : IPostProcessor
{
    public string Name => "sentiment-analyzer";
    public int Priority => 5;

    /// <summary>Labels the content and records the label.</summary>
    /// <param name="result">Result to annotate; modified in place.</param>
    /// <returns>The same result instance.</returns>
    public ExtractionResult Process(ExtractionResult result)
    {
        var label = AnalyzeSentiment(result.Content);

        // Create the metadata dictionary on first use.
        result.Metadata.Additional ??= new Dictionary<string, System.Text.Json.Nodes.JsonNode?>();
        result.Metadata.Additional["sentiment"] = System.Text.Json.Nodes.JsonValue.Create(label);

        return result;
    }

    /// <summary>Placeholder: "neutral" for non-empty text, "unknown" for empty text.</summary>
    private string AnalyzeSentiment(string text)
    {
        if (text.Length > 0)
        {
            return "neutral";
        }

        return "unknown";
    }
}
|
||||
|
||||
/// <summary>
/// Registers the word-count and sentiment post-processors, extracts a PDF
/// and prints whichever of the two metadata entries are present.
/// </summary>
class Program
{
    static void Main()
    {
        var counter = new WordCountPostProcessor();
        var analyzer = new SentimentPostProcessor();

        KreuzbergLib.RegisterPostProcessor(counter);
        KreuzbergLib.RegisterPostProcessor(analyzer);

        try
        {
            var extraction = KreuzbergLib.ExtractFileSync("document.pdf");

            // Nothing to report when no additional metadata was attached.
            var extras = extraction.Metadata.Additional;
            if (extras == null)
            {
                return;
            }

            if (extras.TryGetValue("word_count", out var words))
            {
                Console.WriteLine($"Word count: {words}");
            }

            if (extras.TryGetValue("sentiment", out var mood))
            {
                Console.WriteLine($"Sentiment: {mood}");
            }
        }
        catch (KreuzbergException failure)
        {
            Console.WriteLine($"Error: {failure.Message}");
        }
    }
}
|
||||
82
docs/snippets/csharp/advanced/custom_validator.cs
Normal file
82
docs/snippets/csharp/advanced/custom_validator.cs
Normal file
@@ -0,0 +1,82 @@
|
||||
using Kreuzberg;
|
||||
|
||||
/// <summary>
/// Validator that rejects results whose content is shorter than a configured
/// number of characters.
/// </summary>
class MinLengthValidator : IValidator
{
    private readonly int _minLength;

    /// <summary>Creates the validator with the minimum accepted content length.</summary>
    public MinLengthValidator(int minLength) => _minLength = minLength;

    public string Name => "min-length";
    public int Priority => 10;

    /// <summary>Throws when the content length is below the minimum.</summary>
    /// <exception cref="KreuzbergValidationException">The content is too short.</exception>
    public void Validate(ExtractionResult result)
    {
        // Long enough: nothing to report.
        if (result.Content.Length >= _minLength)
        {
            return;
        }

        throw new KreuzbergValidationException(
            $"Content too short: {result.Content.Length} < {_minLength}"
        );
    }
}
|
||||
|
||||
/// <summary>
/// Validator that rejects results whose quality score is below a configured
/// threshold.
/// </summary>
class QualityScoreValidator : IValidator
{
    private readonly double _minScore;

    /// <summary>Creates the validator with the minimum accepted quality score.</summary>
    public QualityScoreValidator(double minScore) => _minScore = minScore;

    public string Name => "quality-score";
    public int Priority => 5;

    /// <summary>Throws when the result's quality score is below the threshold.</summary>
    /// <exception cref="KreuzbergValidationException">The score is too low.</exception>
    public void Validate(ExtractionResult result)
    {
        var score = result.QualityScore;

        // Written as a negated "less than" so that any value for which the
        // comparison is false is accepted, exactly as a plain `if (score < min)`.
        var belowThreshold = score < _minScore;
        if (!belowThreshold)
        {
            return;
        }

        throw new KreuzbergValidationException(
            $"Quality score too low: {score:F2} < {_minScore:F2}"
        );
    }
}
|
||||
|
||||
/// <summary>
/// Registers the length and quality validators, then extracts a PDF and
/// reports whether validation passed.
/// </summary>
class Program
{
    static void Main()
    {
        var lengthCheck = new MinLengthValidator(minLength: 50);
        var qualityCheck = new QualityScoreValidator(minScore: 0.7);

        KreuzbergLib.RegisterValidator(lengthCheck);
        KreuzbergLib.RegisterValidator(qualityCheck);

        try
        {
            var settings = new ExtractionConfig { EnableQualityProcessing = true };
            var extraction = KreuzbergLib.ExtractFileSync("document.pdf", settings);

            Console.WriteLine("Validation passed");
            Console.WriteLine($"Content length: {extraction.Content.Length}");
        }
        // Validation failures are reported separately from other errors.
        catch (KreuzbergValidationException rejected)
        {
            Console.WriteLine($"Validation failed: {rejected.Message}");
        }
        catch (KreuzbergException failure)
        {
            Console.WriteLine($"Error: {failure.Message}");
        }
    }
}
|
||||
18
docs/snippets/csharp/advanced/embedding_config.md
Normal file
18
docs/snippets/csharp/advanced/embedding_config.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
// Embedding settings: preset model, batch size, normalization and download
// progress reporting.
var embedding = new EmbeddingConfig
{
    Model = EmbeddingModelType.Preset("all-mpnet-base-v2"),
    BatchSize = 16,
    Normalize = true,
    ShowDownloadProgress = true
};

// Attach the embedding settings to a chunking configuration.
var config = new ExtractionConfig
{
    Chunking = new ChunkingConfig { MaxChars = 1000, Embedding = embedding }
};
|
||||
```
|
||||
49
docs/snippets/csharp/advanced/embedding_with_chunking.md
Normal file
49
docs/snippets/csharp/advanced/embedding_with_chunking.md
Normal file
@@ -0,0 +1,49 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Threading.Tasks;
|
||||
|
||||
// Chunk the document and compute an embedding for every chunk.
var config = new ExtractionConfig
{
    Chunking = new ChunkingConfig
    {
        MaxChars = 512,
        MaxOverlap = 50,
        Embedding = new EmbeddingConfig
        {
            Model = EmbeddingModelType.Preset("balanced"),
            Normalize = true,
            BatchSize = 32,
            ShowDownloadProgress = false
        }
    }
};

// Call the KreuzbergLib entry point used by the other snippets; with
// `using Kreuzberg;` the bare name `Kreuzberg` refers to the namespace.
var result = await KreuzbergLib.ExtractFileAsync("document.pdf", config);

// Treat a missing chunk list as empty.
var chunks = result.Chunks ?? new List<Chunk>();
foreach (var (index, chunk) in chunks.WithIndex())
{
    var chunkId = $"doc_chunk_{index}";
    Console.WriteLine($"Chunk {chunkId}: {chunk.Content[..Math.Min(50, chunk.Content.Length)]}");

    if (chunk.Embedding != null)
    {
        Console.WriteLine($"  Embedding dimensions: {chunk.Embedding.Length}");
    }
}
|
||||
|
||||
/// <summary>Helpers for enumerating sequences.</summary>
internal static class EnumerableExtensions
{
    /// <summary>
    /// Pairs every element with its zero-based position in the sequence.
    /// </summary>
    /// <typeparam name="T">Element type.</typeparam>
    /// <param name="items">Sequence to enumerate.</param>
    /// <returns>A lazily evaluated sequence of (position, element) tuples.</returns>
    public static IEnumerable<(int Index, T Item)> WithIndex<T>(
        this IEnumerable<T> items)
    {
        using var cursor = items.GetEnumerator();
        for (var position = 0; cursor.MoveNext(); position++)
        {
            yield return (position, cursor.Current);
        }
    }
}
|
||||
```
|
||||
72
docs/snippets/csharp/advanced/error_handling.cs
Normal file
72
docs/snippets/csharp/advanced/error_handling.cs
Normal file
@@ -0,0 +1,72 @@
|
||||
using Kreuzberg;
|
||||
|
||||
/// <summary>
/// Error-handling examples: each scenario catches the most specific
/// Kreuzberg exception types first and a general one last.
/// </summary>
class Program
{
    static async Task Main()
    {
        await ExtractFileWithSpecificHandlersAsync();
        await ExtractBytesWithSpecificHandlersAsync();
        await ExtractMissingFileAsync();
    }

    /// <summary>File extraction with parsing, OCR and dependency handlers.</summary>
    private static async Task ExtractFileWithSpecificHandlersAsync()
    {
        try
        {
            var extraction = await KreuzbergLib.ExtractFileAsync("document.pdf");
            Console.WriteLine($"Extracted {extraction.Content.Length} characters");
        }
        catch (KreuzbergParsingException parsing)
        {
            Console.WriteLine($"Failed to parse document: {parsing.Message}");
        }
        catch (KreuzbergOcrException ocr)
        {
            Console.WriteLine($"OCR processing failed: {ocr.Message}");
        }
        catch (KreuzbergMissingDependencyException missing)
        {
            Console.WriteLine($"Missing dependency: {missing.Message}");
        }
        catch (KreuzbergException general)
        {
            Console.WriteLine($"Extraction error: {general.Message}");
        }
    }

    /// <summary>In-memory extraction with validation and OCR handlers.</summary>
    private static async Task ExtractBytesWithSpecificHandlersAsync()
    {
        try
        {
            var settings = new ExtractionConfig();
            byte[] header = { 0x25, 0x50, 0x44, 0x46 };

            var extraction = await KreuzbergLib.ExtractBytesAsync(
                header,
                "application/pdf",
                settings
            );

            // Show at most the first 100 characters, marking truncation.
            string preview;
            if (extraction.Content.Length > 100)
            {
                preview = extraction.Content[..100] + "...";
            }
            else
            {
                preview = extraction.Content;
            }

            Console.WriteLine($"Extracted: {preview}");
        }
        catch (KreuzbergValidationException invalid)
        {
            Console.WriteLine($"Invalid configuration: {invalid.Message}");
        }
        catch (KreuzbergOcrException ocr)
        {
            Console.WriteLine($"OCR failed: {ocr.Message}");
        }
        catch (KreuzbergException general)
        {
            Console.WriteLine($"Extraction failed: {general.Message}");
        }
    }

    /// <summary>Extraction of a file named "nonexistent.pdf" with an I/O handler.</summary>
    private static async Task ExtractMissingFileAsync()
    {
        try
        {
            await KreuzbergLib.ExtractFileAsync("nonexistent.pdf");
        }
        catch (KreuzbergIOException)
        {
            Console.WriteLine("File not found");
        }
        catch (Exception unexpected)
        {
            Console.WriteLine($"Unexpected error: {unexpected.Message}");
        }
    }
}
|
||||
66
docs/snippets/csharp/advanced/extract_from_bytes.cs
Normal file
66
docs/snippets/csharp/advanced/extract_from_bytes.cs
Normal file
@@ -0,0 +1,66 @@
|
||||
using Kreuzberg;
|
||||
|
||||
/// <summary>
/// Demonstrates extracting content from in-memory byte buffers: a plain
/// call, a call with an explicit <c>ExtractionConfig</c>, an image buffer,
/// and a small set of named buffers processed one after another.
/// </summary>
class Program
{
    /// <summary>
    /// Entry point. Kreuzberg and file I/O failures are caught and reported
    /// on the console.
    /// </summary>
    static async Task Main()
    {
        try
        {
            var pdfBytes = await File.ReadAllBytesAsync("document.pdf");

            // Simplest form: the buffer plus the MIME type describing it.
            var result = await KreuzbergLib.ExtractBytesAsync(
                pdfBytes,
                "application/pdf"
            );

            Console.WriteLine($"Content: {result.Content}");
            Console.WriteLine($"MIME type: {result.MimeType}");

            var config = new ExtractionConfig
            {
                UseCache = true,
                EnableQualityProcessing = true
            };

            // Same buffer again, this time with an explicit configuration.
            var result2 = await KreuzbergLib.ExtractBytesAsync(
                pdfBytes,
                "application/pdf",
                config
            );

            Console.WriteLine($"Configured extraction: {result2.Content.Length} chars");

            // Read a real image: an empty buffer can never be a valid JPEG, and a
            // failure here would skip everything that follows in this try block.
            var imageBytes = await File.ReadAllBytesAsync("image.jpg");

            var imageResult = await KreuzbergLib.ExtractBytesAsync(
                imageBytes,
                "image/jpeg"
            );

            Console.WriteLine($"Image text: {imageResult.Content}");

            // Name -> (content, MIME type) for several documents.
            var multipleFiles = new Dictionary<string, (byte[], string)>
            {
                { "file1", (await File.ReadAllBytesAsync("file1.pdf"), "application/pdf") },
                { "file2", (await File.ReadAllBytesAsync("file2.pdf"), "application/pdf") }
            };

            foreach (var (name, (bytes, mimeType)) in multipleFiles)
            {
                var extractResult = await KreuzbergLib.ExtractBytesAsync(
                    bytes,
                    mimeType
                );
                Console.WriteLine($"{name}: {extractResult.Content.Length} chars");
            }
        }
        catch (KreuzbergException ex)
        {
            Console.WriteLine($"Extraction error: {ex.Message}");
        }
        catch (IOException ex)
        {
            Console.WriteLine($"File I/O error: {ex.Message}");
        }
    }
}
|
||||
73
docs/snippets/csharp/advanced/extract_from_url.cs
Normal file
73
docs/snippets/csharp/advanced/extract_from_url.cs
Normal file
@@ -0,0 +1,73 @@
|
||||
using Kreuzberg;
|
||||
using System.Net.Http;
|
||||
|
||||
/// <summary>
/// Downloads documents over HTTP and extracts their content from the
/// downloaded bytes, first one at a time and then for several URLs in
/// parallel.
/// </summary>
class Program
{
    /// <summary>
    /// Entry point. HTTP and Kreuzberg failures are reported on the console.
    /// </summary>
    static async Task Main()
    {
        using var httpClient = new HttpClient();

        try
        {
            var url = "https://example.com/document.pdf";
            var documentBytes = await httpClient.GetByteArrayAsync(url);

            var result = await KreuzbergLib.ExtractBytesAsync(
                documentBytes,
                "application/pdf"
            );

            Console.WriteLine($"Extracted from URL: {result.Content.Length} chars");

            var config = new ExtractionConfig
            {
                EnableQualityProcessing = true
            };

            var result2 = await KreuzbergLib.ExtractBytesAsync(
                documentBytes,
                "application/pdf",
                config
            );

            Console.WriteLine($"Quality score: {result2.QualityScore}");

            var urls = new[]
            {
                "https://example.com/doc1.pdf",
                "https://example.com/doc2.pdf",
                "https://example.com/doc3.pdf"
            };

            // Each task handles its own failures and yields null for that URL,
            // so one bad document cannot fault Task.WhenAll and discard the
            // results of the documents that succeeded.
            var downloadTasks = urls.Select(async u =>
            {
                try
                {
                    var bytes = await httpClient.GetByteArrayAsync(u);
                    return await KreuzbergLib.ExtractBytesAsync(
                        bytes,
                        "application/pdf"
                    );
                }
                catch (HttpRequestException ex)
                {
                    Console.WriteLine($"Download failed for {u}: {ex.Message}");
                    return null;
                }
                catch (KreuzbergException ex)
                {
                    Console.WriteLine($"Extraction failed for {u}: {ex.Message}");
                    return null;
                }
            });

            var results = await Task.WhenAll(downloadTasks);

            var successCount = results.Count(r => r != null);
            Console.WriteLine($"Successfully processed {successCount} documents");
        }
        catch (HttpRequestException ex)
        {
            Console.WriteLine($"HTTP error: {ex.Message}");
        }
        catch (KreuzbergException ex)
        {
            Console.WriteLine($"Extraction error: {ex.Message}");
        }
    }
}
|
||||
98
docs/snippets/csharp/advanced/extract_with_config.cs
Normal file
98
docs/snippets/csharp/advanced/extract_with_config.cs
Normal file
@@ -0,0 +1,98 @@
|
||||
using Kreuzberg;
|
||||
|
||||
/// <summary>
/// Builds an <c>ExtractionConfig</c> with OCR, PDF, image, chunking,
/// token-reduction, language-detection and post-processing sections, runs
/// one extraction with it and prints a short summary of the result.
/// </summary>
class Program
{
    static async Task Main()
    {
        try
        {
            // Each section is built on its own so the final assembly below
            // reads as a table of contents.
            var preprocessing = new ImagePreprocessingConfig
            {
                TargetDpi = 300,
                Denoise = true,
                Deskew = true,
                ContrastEnhance = true
            };

            var tesseract = new TesseractConfig
            {
                Psm = 3,
                Oem = 3,
                MinConfidence = 0.8,
                Preprocessing = preprocessing,
                EnableTableDetection = true
            };

            var ocr = new OcrConfig
            {
                Backend = "tesseract",
                Language = "eng+fra",
                TesseractConfig = tesseract
            };

            var pdf = new PdfConfig
            {
                ExtractImages = true,
                ExtractMetadata = true
            };

            var images = new ImageExtractionConfig
            {
                ExtractImages = true,
                TargetDpi = 150,
                MaxImageDimension = 4096
            };

            var chunking = new ChunkingConfig
            {
                MaxChars = 1000,
                MaxOverlap = 200,
                Preset = "default"
            };

            var tokenReduction = new TokenReductionConfig
            {
                Mode = "moderate",
                PreserveImportantWords = true
            };

            var languageDetection = new LanguageDetectionConfig
            {
                Enabled = true,
                MinConfidence = 0.8,
                DetectMultiple = false
            };

            var settings = new ExtractionConfig
            {
                UseCache = true,
                EnableQualityProcessing = true,
                ForceOcr = false,
                Ocr = ocr,
                PdfOptions = pdf,
                Images = images,
                Chunking = chunking,
                TokenReduction = tokenReduction,
                LanguageDetection = languageDetection,
                Postprocessor = new PostProcessorConfig { Enabled = true }
            };

            var extraction = await KreuzbergLib.ExtractFileAsync("document.pdf", settings);

            Console.WriteLine($"Content length: {extraction.Content.Length}");
            Console.WriteLine($"MIME type: {extraction.MimeType}");
            Console.WriteLine($"Format type: {extraction.Metadata.FormatType}");

            // Tables and chunks are only mentioned when there are any.
            if (extraction.Tables.Count > 0)
            {
                Console.WriteLine($"Found {extraction.Tables.Count} tables");
            }

            if (extraction.Chunks is { Count: > 0 } chunks)
            {
                Console.WriteLine($"Created {chunks.Count} chunks");
            }
        }
        catch (KreuzbergException failure)
        {
            Console.WriteLine($"Extraction error: {failure.Message}");
        }
    }
}
|
||||
15
docs/snippets/csharp/advanced/keyword_extraction_config.md
Normal file
15
docs/snippets/csharp/advanced/keyword_extraction_config.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
// Keyword extraction settings: algorithm, result limit, score threshold,
// n-gram range and language.
var keywordSettings = new KeywordConfig
{
    Algorithm = KeywordAlgorithm.Yake,
    MaxKeywords = 10,
    MinScore = 0.3,
    NgramRange = (1, 3),
    Language = "en"
};

// Attach the keyword settings to the extraction configuration.
var config = new ExtractionConfig { Keywords = keywordSettings };
|
||||
```
|
||||
30
docs/snippets/csharp/advanced/keyword_extraction_example.md
Normal file
30
docs/snippets/csharp/advanced/keyword_extraction_example.md
Normal file
@@ -0,0 +1,30 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
using System.Collections.Generic;
|
||||
|
||||
// Enable keyword extraction with the YAKE algorithm.
var config = new ExtractionConfig
{
    Keywords = new KeywordConfig
    {
        Algorithm = KeywordAlgorithm.Yake,
        MaxKeywords = 10,
        MinScore = 0.3
    }
};

var result = await KreuzbergLib.ExtractFileAsync(
    "research_paper.pdf",
    config
);

// The keywords are read from the "keywords" metadata entry. TryGetValue
// looks the key up once instead of ContainsKey followed by the indexer.
if (result.Metadata.TryGetValue("keywords", out var rawKeywords))
{
    var keywords = (List<Dictionary<string, object>>)rawKeywords;
    foreach (var kw in keywords)
    {
        var text = (string)kw["text"];
        var score = (double)kw["score"];
        Console.WriteLine($"{text}: {score:F3}");
    }
}
|
||||
```
|
||||
37
docs/snippets/csharp/advanced/language_detection_config.cs
Normal file
37
docs/snippets/csharp/advanced/language_detection_config.cs
Normal file
@@ -0,0 +1,37 @@
|
||||
using Kreuzberg;
|
||||
|
||||
/// <summary>
/// Extracts a document with single-language detection enabled and prints
/// the first detected language, if any.
/// </summary>
class Program
{
    static async Task Main()
    {
        var detection = new LanguageDetectionConfig
        {
            Enabled = true,
            MinConfidence = 0.8m,
            DetectMultiple = false
        };
        var settings = new ExtractionConfig { LanguageDetection = detection };

        try
        {
            var extraction = await KreuzbergLib.ExtractFileAsync("document.pdf", settings);

            // A missing list and an empty list are reported the same way.
            var summary = extraction.DetectedLanguages is { Count: > 0 } detected
                ? $"Detected Language: {detected[0]}"
                : "No language detected";
            Console.WriteLine(summary);

            Console.WriteLine($"Content length: {extraction.Content.Length} characters");
        }
        catch (KreuzbergException failure)
        {
            Console.WriteLine($"Extraction failed: {failure.Message}");
        }
    }
}
|
||||
39
docs/snippets/csharp/advanced/language_detection_config.md
Normal file
39
docs/snippets/csharp/advanced/language_detection_config.md
Normal file
@@ -0,0 +1,39 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
/// <summary>
/// Extracts a document with single-language detection enabled and prints
/// the first detected language, if any.
/// </summary>
class Program
{
    static async Task Main()
    {
        var detection = new LanguageDetectionConfig
        {
            Enabled = true,
            MinConfidence = 0.8m,
            DetectMultiple = false
        };
        var settings = new ExtractionConfig { LanguageDetection = detection };

        try
        {
            var extraction = await KreuzbergLib.ExtractFileAsync("document.pdf", settings);

            // A missing list and an empty list are reported the same way.
            var summary = extraction.DetectedLanguages is { Count: > 0 } detected
                ? $"Detected Language: {detected[0]}"
                : "No language detected";
            Console.WriteLine(summary);

            Console.WriteLine($"Content length: {extraction.Content.Length} characters");
        }
        catch (KreuzbergException failure)
        {
            Console.WriteLine($"Extraction failed: {failure.Message}");
        }
    }
}
|
||||
```
|
||||
@@ -0,0 +1,40 @@
|
||||
using Kreuzberg;
|
||||
|
||||
/// <summary>
/// Extracts a document with multi-language detection enabled and prints
/// every detected language as a comma-separated list.
/// </summary>
class Program
{
    static async Task Main()
    {
        var detection = new LanguageDetectionConfig
        {
            Enabled = true,
            MinConfidence = 0.8m,
            DetectMultiple = true
        };
        var settings = new ExtractionConfig { LanguageDetection = detection };

        try
        {
            var extraction = await KreuzbergLib.ExtractFileAsync("multilingual_document.pdf", settings);

            // A missing list is treated exactly like an empty one.
            if (extraction.DetectedLanguages is { Count: > 0 } detected)
            {
                Console.WriteLine($"Detected {detected.Count} language(s): {string.Join(", ", detected)}");
            }
            else
            {
                Console.WriteLine("No languages detected");
            }

            Console.WriteLine($"Total content: {extraction.Content.Length} characters");
            Console.WriteLine($"MIME type: {extraction.MimeType}");
        }
        catch (KreuzbergException failure)
        {
            Console.WriteLine($"Processing failed: {failure.Message}");
        }
    }
}
|
||||
@@ -0,0 +1,42 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
/// <summary>
/// Extracts a document with multi-language detection enabled and prints
/// every detected language as a comma-separated list.
/// </summary>
class Program
{
    static async Task Main()
    {
        var detection = new LanguageDetectionConfig
        {
            Enabled = true,
            MinConfidence = 0.8m,
            DetectMultiple = true
        };
        var settings = new ExtractionConfig { LanguageDetection = detection };

        try
        {
            var extraction = await KreuzbergLib.ExtractFileAsync("multilingual_document.pdf", settings);

            // A missing list is treated exactly like an empty one.
            if (extraction.DetectedLanguages is { Count: > 0 } detected)
            {
                Console.WriteLine($"Detected {detected.Count} language(s): {string.Join(", ", detected)}");
            }
            else
            {
                Console.WriteLine("No languages detected");
            }

            Console.WriteLine($"Total content: {extraction.Content.Length} characters");
            Console.WriteLine($"MIME type: {extraction.MimeType}");
        }
        catch (KreuzbergException failure)
        {
            Console.WriteLine($"Processing failed: {failure.Message}");
        }
    }
}
|
||||
```
|
||||
65
docs/snippets/csharp/advanced/plugin_registry.cs
Normal file
65
docs/snippets/csharp/advanced/plugin_registry.cs
Normal file
@@ -0,0 +1,65 @@
|
||||
using Kreuzberg;
|
||||
using System.Collections.Generic;
|
||||
|
||||
/// <summary>
/// Lists the registered document extractors, OCR backends, post-processors
/// and validators, then registers and unregisters a custom post-processor
/// and clears all validators.
/// </summary>
class Program
{
    static void Main()
    {
        try
        {
            PrintSection("Registered Document Extractors:", KreuzbergLib.ListDocumentExtractors());
            PrintSection("\nRegistered OCR Backends:", KreuzbergLib.ListOcrBackends());
            PrintSection("\nRegistered Post-Processors:", KreuzbergLib.ListPostProcessors());
            PrintSection("\nRegistered Validators:", KreuzbergLib.ListValidators());

            // Register a custom post-processor, then remove it again by name.
            var uppercaser = new CustomPostProcessor();
            KreuzbergLib.RegisterPostProcessor(uppercaser);
            Console.WriteLine($"\nRegistered custom post-processor: {uppercaser.Name}");

            KreuzbergLib.UnregisterPostProcessor(uppercaser.Name);
            Console.WriteLine($"Unregistered post-processor: {uppercaser.Name}");

            KreuzbergLib.ClearValidators();
            Console.WriteLine("All validators cleared");
        }
        catch (KreuzbergException failure)
        {
            Console.WriteLine($"Plugin registry error: {failure.Message}");
        }

        // Prints a heading followed by one " - entry" line per item.
        static void PrintSection<T>(string heading, IEnumerable<T> entries)
        {
            Console.WriteLine(heading);
            foreach (var entry in entries)
            {
                Console.WriteLine($" - {entry}");
            }
        }
    }
}
|
||||
|
||||
/// <summary>
/// Example post-processor that upper-cases the extracted content.
/// </summary>
class CustomPostProcessor : IPostProcessor
{
    /// <summary>Name under which the processor is registered and unregistered.</summary>
    public string Name => "custom-processor";

    /// <summary>Priority value reported to the registry.</summary>
    public int Priority => 50;

    /// <summary>
    /// Replaces <c>result.Content</c> with its upper-case form and returns the
    /// same result instance.
    /// </summary>
    /// <param name="result">The extraction result to modify in place.</param>
    /// <returns>The modified <paramref name="result"/>.</returns>
    public ExtractionResult Process(ExtractionResult result)
    {
        // ToUpperInvariant keeps the output independent of the machine's
        // current culture (e.g. the Turkish dotted/dotless "i" mapping).
        result.Content = result.Content.ToUpperInvariant();
        return result;
    }
}
|
||||
17
docs/snippets/csharp/advanced/quality_processing_config.md
Normal file
17
docs/snippets/csharp/advanced/quality_processing_config.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
// Turn on quality processing for this extraction.
ExtractionConfig config = new()
{
    EnableQualityProcessing = true
};

var result = await KreuzbergLib.ExtractFileAsync("document.pdf", config);

// Print the quality score with two decimals.
var qualityScore = result.QualityScore;
Console.WriteLine($"Quality score: {qualityScore:F2}");
|
||||
```
|
||||
29
docs/snippets/csharp/advanced/quality_processing_example.md
Normal file
29
docs/snippets/csharp/advanced/quality_processing_example.md
Normal file
@@ -0,0 +1,29 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
// Turn on quality processing so the result carries a quality score.
var config = new ExtractionConfig
{
    EnableQualityProcessing = true
};

// Synchronous extraction: use the *Sync entry point. ExtractFile is the
// awaited variant in the other snippets, so calling it without await would
// not yield an extraction result here.
var result = KreuzbergLib.ExtractFileSync(
    "scanned_document.pdf",
    config
);

var qualityScore = result.QualityScore;

// Scores below 0.5 are reported as a low-quality extraction.
if (qualityScore < 0.5)
{
    Console.WriteLine(
        $"Warning: Low quality extraction ({qualityScore:F2})"
    );
    Console.WriteLine(
        "Consider re-scanning with higher DPI or adjusting OCR settings"
    );
}
else
{
    Console.WriteLine($"Quality score: {qualityScore:F2}");
}
|
||||
```
|
||||
108
docs/snippets/csharp/advanced/streaming.cs
Normal file
108
docs/snippets/csharp/advanced/streaming.cs
Normal file
@@ -0,0 +1,108 @@
|
||||
using Kreuzberg;
|
||||
using System.IO;
|
||||
|
||||
/// <summary>
/// Shows two ways of consuming an extraction result piece by piece:
/// splitting the full content into fixed-size chunks, and exposing chunks
/// through an <c>IAsyncEnumerable&lt;string&gt;</c>.
/// </summary>
class Program
{
    /// <summary>Maximum number of characters shown when previewing a chunk.</summary>
    const int PreviewLength = 50;

    /// <summary>Entry point; any failure is reported on the console.</summary>
    static async Task Main()
    {
        try
        {
            var filePath = "large_document.pdf";

            await ProcessLargeFileAsync(filePath);
        }
        catch (Exception ex)
        {
            Console.WriteLine($"Error: {ex.Message}");
        }
    }

    /// <summary>
    /// Extracts <paramref name="filePath"/>, splits the content into
    /// 1000-character chunks and processes them in order.
    /// </summary>
    static async Task ProcessLargeFileAsync(string filePath)
    {
        var config = new ExtractionConfig
        {
            EnableQualityProcessing = true
        };

        var result = await KreuzbergLib.ExtractFileAsync(filePath, config);

        var contentChunks = ChunkContent(result.Content, chunkSize: 1000);

        Console.WriteLine($"Processing {contentChunks.Count} chunks");

        for (int index = 0; index < contentChunks.Count; index++)
        {
            var chunk = contentChunks[index];
            Console.WriteLine($"Chunk {index}: {chunk.Length} characters");
            await ProcessChunkAsync(chunk);
        }
    }

    /// <summary>
    /// Prints the number of whitespace-separated words in
    /// <paramref name="chunk"/>, then waits 10 ms.
    /// </summary>
    static async Task ProcessChunkAsync(string chunk)
    {
        var wordCount = chunk.Split(
            new[] { ' ', '\n', '\r' },
            StringSplitOptions.RemoveEmptyEntries
        ).Length;

        Console.WriteLine($" Words: {wordCount}");

        await Task.Delay(10);
    }

    /// <summary>
    /// Splits <paramref name="content"/> into consecutive pieces of at most
    /// <paramref name="chunkSize"/> characters; the last piece may be shorter.
    /// </summary>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="chunkSize"/> is zero or negative.
    /// </exception>
    static List<string> ChunkContent(string content, int chunkSize)
    {
        // A zero step would never advance the loop below and a negative one
        // would make Substring fail, so reject both up front.
        if (chunkSize <= 0)
        {
            throw new ArgumentOutOfRangeException(
                nameof(chunkSize), chunkSize, "Chunk size must be positive.");
        }

        var chunks = new List<string>();

        for (int i = 0; i < content.Length; i += chunkSize)
        {
            var chunk = content.Substring(
                i,
                Math.Min(chunkSize, content.Length - i)
            );
            chunks.Add(chunk);
        }

        return chunks;
    }

    /// <summary>
    /// Extracts <paramref name="filePath"/> and yields the content of each
    /// chunk on the result; when the result has no chunks, yields the full
    /// content in 512-character pieces instead.
    /// </summary>
    static async IAsyncEnumerable<string> StreamExtractedChunksAsync(
        string filePath)
    {
        var result = await KreuzbergLib.ExtractFileAsync(filePath);

        if (result.Chunks?.Any() == true)
        {
            foreach (var chunk in result.Chunks)
            {
                yield return chunk.Content;
                await Task.Yield();
            }
        }
        else
        {
            // Reuse the shared splitter rather than repeating its loop here.
            foreach (var piece in ChunkContent(result.Content, chunkSize: 512))
            {
                yield return piece;
                await Task.Yield();
            }
        }
    }

    /// <summary>
    /// Consumes <see cref="StreamExtractedChunksAsync"/> and prints a short
    /// preview of every chunk.
    /// </summary>
    static async Task StreamProcessingExample()
    {
        var streamEnumerator = StreamExtractedChunksAsync("document.pdf");

        int index = 0;
        await foreach (var chunk in streamEnumerator)
        {
            // Chunks may be shorter than the preview length (typically the
            // last one); slicing past the end would throw.
            var preview = chunk.Length > PreviewLength ? chunk[..PreviewLength] : chunk;
            Console.WriteLine($"Chunk {index++}: {preview}...");
        }
    }
}
|
||||
14
docs/snippets/csharp/advanced/token_reduction_config.md
Normal file
14
docs/snippets/csharp/advanced/token_reduction_config.md
Normal file
@@ -0,0 +1,14 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
// Token reduction settings. Mode is one of "off", "moderate" or "aggressive".
var reduction = new TokenReductionConfig
{
    Mode = "moderate",
    PreserveMarkdown = true,
    PreserveCode = true,
    LanguageHint = "eng"
};

// Attach the token reduction settings to the extraction configuration.
var config = new ExtractionConfig { TokenReduction = reduction };
|
||||
```
|
||||
32
docs/snippets/csharp/advanced/token_reduction_example.md
Normal file
32
docs/snippets/csharp/advanced/token_reduction_example.md
Normal file
@@ -0,0 +1,32 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
// Enable moderate token reduction while preserving Markdown.
var config = new ExtractionConfig
{
    TokenReduction = new TokenReductionConfig
    {
        Mode = "moderate",
        PreserveMarkdown = true
    }
};

var result = await KreuzbergLib.ExtractFileAsync(
    "verbose_document.pdf",
    config
);

// Each statistic is read with a single TryGetValue lookup (instead of
// ContainsKey followed by the indexer) and falls back to zero when the
// key is absent.
var original = result.Metadata.TryGetValue("original_token_count", out var originalValue)
    ? (int)originalValue
    : 0;

var reduced = result.Metadata.TryGetValue("token_count", out var reducedValue)
    ? (int)reducedValue
    : 0;

var ratio = result.Metadata.TryGetValue("token_reduction_ratio", out var ratioValue)
    ? (double)ratioValue
    : 0.0;

Console.WriteLine($"Reduced from {original} to {reduced} tokens");
Console.WriteLine($"Reduction: {ratio * 100:F1}%");
|
||||
```
|
||||
74
docs/snippets/csharp/advanced/vector_database_integration.md
Normal file
74
docs/snippets/csharp/advanced/vector_database_integration.md
Normal file
@@ -0,0 +1,74 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using System.Threading.Tasks;
|
||||
|
||||
/// <summary>
/// Extracts a document into embedded chunks and turns each chunk into a
/// <see cref="VectorRecord"/> ready to be written to a vector store.
/// </summary>
public class VectorDatabaseIntegration
{
    /// <summary>One chunk of a document together with its embedding.</summary>
    public class VectorRecord
    {
        /// <summary>Identifier of the form <c>{documentId}_chunk_{index}</c>.</summary>
        public string Id { get; set; }

        /// <summary>Embedding copied from the chunk.</summary>
        public float[] Embedding { get; set; }

        /// <summary>Text content of the chunk.</summary>
        public string Content { get; set; }

        /// <summary>Document id, chunk index and content length, all as strings.</summary>
        public Dictionary<string, string> Metadata { get; set; }
    }

    /// <summary>
    /// Extracts <paramref name="documentPath"/> with chunking and embeddings
    /// enabled, converts every chunk into a <see cref="VectorRecord"/>, passes
    /// the records to <c>StoreInVectorDatabase</c> and returns them.
    /// </summary>
    /// <param name="documentPath">Path of the document to extract.</param>
    /// <param name="documentId">Identifier used as the prefix of every record id.</param>
    /// <returns>One record per chunk; empty when the result has no chunks.</returns>
    public async Task<List<VectorRecord>> ExtractAndVectorize(
        string documentPath,
        string documentId)
    {
        var config = new ExtractionConfig
        {
            Chunking = new ChunkingConfig
            {
                MaxChars = 512,
                MaxOverlap = 50,
                Embedding = new EmbeddingConfig
                {
                    Model = EmbeddingModelType.Preset("balanced"),
                    Normalize = true,
                    BatchSize = 32
                }
            }
        };

        // Use the KreuzbergLib entry point, as every other snippet does;
        // "Kreuzberg" is the namespace imported by the using directive.
        var result = await KreuzbergLib.ExtractFileAsync(documentPath, config);
        var chunks = result.Chunks ?? new List<Chunk>();

        var vectorRecords = chunks
            .Select((chunk, index) => new VectorRecord
            {
                Id = $"{documentId}_chunk_{index}",
                Content = chunk.Content,
                Embedding = chunk.Embedding,
                Metadata = new Dictionary<string, string>
                {
                    { "document_id", documentId },
                    { "chunk_index", index.ToString() },
                    { "content_length", chunk.Content.Length.ToString() }
                }
            })
            .ToList();

        await StoreInVectorDatabase(vectorRecords);
        return vectorRecords;
    }

    /// <summary>
    /// Placeholder for the real storage call: prints one line for every
    /// record that carries a non-empty embedding.
    /// </summary>
    private async Task StoreInVectorDatabase(List<VectorRecord> records)
    {
        foreach (var record in records)
        {
            // Records without an embedding are skipped.
            if (record.Embedding != null && record.Embedding.Length > 0)
            {
                Console.WriteLine(
                    $"Storing {record.Id}: {record.Content.Length} chars, " +
                    $"{record.Embedding.Length} dims");
            }
        }

        await Task.CompletedTask;
    }
}
|
||||
```
|
||||
29
docs/snippets/csharp/advanced_config.md
Normal file
29
docs/snippets/csharp/advanced_config.md
Normal file
@@ -0,0 +1,29 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
// One configuration combining OCR, chunking, token reduction, language
// detection, caching and quality processing.
var config = new ExtractionConfig
{
    Ocr = new OcrConfig { Backend = "tesseract", Language = "eng+deu" },
    Chunking = new ChunkingConfig { MaxChars = 1000, MaxOverlap = 100 },
    TokenReduction = new TokenReductionConfig { Enabled = true },
    LanguageDetection = new LanguageDetectionConfig
    {
        Enabled = true,
        DetectMultiple = true
    },
    UseCache = true,
    EnableQualityProcessing = true
};

var result = KreuzbergLib.ExtractFileSync("document.pdf", config);

// Other snippets treat Chunks as possibly null, so guard before
// enumerating it instead of risking a NullReferenceException.
if (result.Chunks != null)
{
    foreach (var chunk in result.Chunks)
    {
        // Print at most the first 100 characters of each chunk.
        Console.WriteLine($"Chunk: {chunk.Content[..Math.Min(100, chunk.Content.Length)]}");
    }
}

if (result.DetectedLanguages?.Count > 0)
{
    Console.WriteLine($"Languages: {string.Join(", ", result.DetectedLanguages)}");
}
|
||||
```
|
||||
17
docs/snippets/csharp/api/batch_extract_bytes_sync.md
Normal file
17
docs/snippets/csharp/api/batch_extract_bytes_sync.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
// Load both documents into memory before building the batch.
var pdfContent = await File.ReadAllBytesAsync("doc1.pdf");
var textContent = await File.ReadAllBytesAsync("doc2.txt");

// One item per document; neither item carries its own configuration.
var items = new List<BatchBytesItem>
{
    new BatchBytesItem { Content = pdfContent, MimeType = "application/pdf", Config = null },
    new BatchBytesItem { Content = textContent, MimeType = "text/plain", Config = null }
};

// Configuration passed to the batch call alongside the items.
var config = new ExtractionConfig { OutputFormat = OutputFormat.Text };

foreach (var extraction in KreuzbergLib.BatchExtractBytesSync(items, config))
{
    Console.WriteLine($"Content length: {extraction.Content.Length}");
}
|
||||
```
|
||||
21
docs/snippets/csharp/api/batch_extract_files_sync.md
Normal file
21
docs/snippets/csharp/api/batch_extract_files_sync.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
// The first file carries no configuration of its own.
var plainItem = new BatchFileItem { Path = "document1.pdf", Config = null };

// The second file forces OCR through its own per-file configuration.
var ocrItem = new BatchFileItem
{
    Path = "document2.pdf",
    Config = new FileExtractionConfig { ForceOcr = true }
};

var items = new List<BatchFileItem> { plainItem, ocrItem };

// Configuration passed to the batch call alongside the items.
var config = new ExtractionConfig { OutputFormat = OutputFormat.Text };

foreach (var extraction in KreuzbergLib.BatchExtractFilesSync(items, config))
{
    Console.WriteLine($"Content length: {extraction.Content.Length}");
}
|
||||
```
|
||||
45
docs/snippets/csharp/api/client_chunk_text.md
Normal file
45
docs/snippets/csharp/api/client_chunk_text.md
Normal file
@@ -0,0 +1,45 @@
|
||||
```csharp title="C#"
|
||||
using System.Net.Http;
|
||||
using System.Net.Http.Json;
|
||||
using System.Text.Json.Serialization;
|
||||
|
||||
/// <summary>
/// Request body posted to the <c>/chunk</c> endpoint. Only <c>Text</c> is
/// mandatory; the remaining members default to <c>null</c>.
/// </summary>
/// <param name="Text">Text to split; serialized as <c>text</c>.</param>
/// <param name="MaxCharacters">Serialized as <c>max_characters</c>.</param>
/// <param name="Overlap">Serialized as <c>overlap</c>.</param>
/// <param name="ChunkerType">Serialized as <c>chunker_type</c>.</param>
public record ChunkRequest(
    [property: JsonPropertyName("text")]
    string Text,

    [property: JsonPropertyName("max_characters")]
    int? MaxCharacters = null,

    [property: JsonPropertyName("overlap")]
    int? Overlap = null,

    [property: JsonPropertyName("chunker_type")]
    string? ChunkerType = null);
|
||||
|
||||
/// <summary>
/// Response body read back from the <c>/chunk</c> endpoint.
/// </summary>
/// <param name="Chunks">The chunk items; deserialized from <c>chunks</c>.</param>
/// <param name="ChunkCount">Deserialized from <c>chunk_count</c>.</param>
public record ChunkResponse(
    [property: JsonPropertyName("chunks")]
    List<ChunkItem> Chunks,

    [property: JsonPropertyName("chunk_count")]
    int ChunkCount);
|
||||
|
||||
/// <summary>
/// A single chunk inside a <c>ChunkResponse</c>.
/// </summary>
/// <param name="Content">Chunk text; deserialized from <c>content</c>.</param>
/// <param name="ChunkIndex">Deserialized from <c>chunk_index</c>.</param>
public record ChunkItem(
    [property: JsonPropertyName("content")]
    string Content,

    [property: JsonPropertyName("chunk_index")]
    int ChunkIndex);
|
||||
|
||||
/// <summary>
/// Posts a <c>ChunkRequest</c> to a locally running server's <c>/chunk</c>
/// endpoint and prints a preview of every returned chunk.
/// </summary>
class Program
{
    static async Task Main()
    {
        // Dispose the client and the response once the request is finished.
        using var client = new HttpClient();
        var request = new ChunkRequest(
            Text: "Your long text content here...",
            MaxCharacters: 1000,
            Overlap: 50,
            ChunkerType: "text"
        );

        using var response = await client.PostAsJsonAsync("http://localhost:8000/chunk", request);

        // Report a failed request instead of trying to deserialize an
        // error body as a ChunkResponse.
        if (!response.IsSuccessStatusCode)
        {
            Console.WriteLine($"Error: {response.StatusCode}");
            return;
        }

        var result = await response.Content.ReadFromJsonAsync<ChunkResponse>();

        Console.WriteLine($"Created {result?.ChunkCount} chunks");
        foreach (var chunk in result?.Chunks ?? [])
        {
            // Show at most the first 50 characters of each chunk.
            Console.WriteLine($"Chunk {chunk.ChunkIndex}: {chunk.Content[..Math.Min(50, chunk.Content.Length)]}...");
        }
    }
}
|
||||
```
|
||||
25
docs/snippets/csharp/api/client_extract_single_file.md
Normal file
25
docs/snippets/csharp/api/client_extract_single_file.md
Normal file
@@ -0,0 +1,25 @@
|
||||
```csharp title="C#"
|
||||
using System.Net.Http;
|
||||
using System.Net.Http.Json;
|
||||
|
||||
// Every disposable involved in the request (client, file stream, multipart
// body and response) is released when the snippet finishes.
using var client = new HttpClient();
using var fileStream = File.OpenRead("document.pdf");
using var content = new MultipartFormDataContent();

// Upload the file under the "files" form field.
content.Add(new StreamContent(fileStream), "files", "document.pdf");

using var response = await client.PostAsync("http://localhost:8000/extract", content);

if (response.IsSuccessStatusCode)
{
    // Print the raw response body.
    var json = await response.Content.ReadAsStringAsync();
    Console.WriteLine(json);
}
else
{
    Console.WriteLine($"Error: {response.StatusCode}");
}
|
||||
```
|
||||
44
docs/snippets/csharp/api/combining_all_features.md
Normal file
44
docs/snippets/csharp/api/combining_all_features.md
Normal file
@@ -0,0 +1,44 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
// Build each feature section separately, then assemble the configuration.
var ocr = new OcrConfig
{
    Enabled = true,
    Backend = OcrBackendType.Tesseract,
    Languages = ["eng"]
};

var imageExtraction = new ImageExtractionConfig
{
    Enabled = true,
    MinImageHeight = 100,
    MinImageWidth = 100
};

var chunking = new ChunkingConfig
{
    Enabled = true,
    ChunkerType = ChunkerType.Text,
    MaxCharacters = 2000,
    Overlap = 100
};

var config = new ExtractionConfig
{
    OutputFormat = OutputFormat.Markdown,
    UseCache = true,
    Ocr = ocr,
    ImageExtraction = imageExtraction,
    Chunking = chunking,
    LanguageDetection = new LanguageDetectionConfig { Enabled = true }
};

try
{
    var extraction = await KreuzbergLib.ExtractFile("document.pdf", null, config);

    // Print the content, the language-detection metadata and the output format.
    Console.WriteLine($"Content: {extraction.Content}");
    Console.WriteLine($"Language: {extraction.Metadata?.LanguageDetection}");
    Console.WriteLine($"Format: {extraction.OutputFormat}");
}
catch (KreuzbergException failure)
{
    Console.WriteLine($"Extraction error: {failure.Message}");
}
|
||||
```
|
||||
18
docs/snippets/csharp/api/error_handling.md
Normal file
18
docs/snippets/csharp/api/error_handling.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
try
{
    var extraction = KreuzbergLib.ExtractFileSync("nonexistent.pdf", null, null);
    Console.WriteLine(extraction.Content);
}
catch (Exception error)
{
    // Kreuzberg failures expose an error code next to the message; every
    // other exception is reported as unexpected.
    if (error is KreuzbergException extractionError)
    {
        Console.WriteLine($"Error Code: {extractionError.Code}");
        Console.WriteLine($"Error Message: {extractionError.Message}");
    }
    else
    {
        Console.WriteLine($"Unexpected error: {error.Message}");
    }
}
|
||||
```
|
||||
22
docs/snippets/csharp/api/error_handling_extract.md
Normal file
22
docs/snippets/csharp/api/error_handling_extract.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
try
{
    var payload = File.ReadAllBytes("document.unsupported");
    var extraction = KreuzbergLib.ExtractBytesSync(payload, "application/x-custom", null);
    Console.WriteLine(extraction.Content);
}
catch (KreuzbergException failure)
{
    // Codes 1 and 2 get dedicated messages; any other code falls back to
    // the generic message that includes the code and the exception text.
    var message = failure.Code switch
    {
        1 => "Validation error: Invalid MIME type",
        2 => "Format error: MIME type not supported",
        _ => $"Extraction failed with error {failure.Code}: {failure.Message}"
    };
    Console.WriteLine(message);
}
|
||||
```
|
||||
10
docs/snippets/csharp/api/extract_bytes_async.md
Normal file
10
docs/snippets/csharp/api/extract_bytes_async.md
Normal file
@@ -0,0 +1,10 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
// Read the document into memory and extract it asynchronously as PDF.
var documentBytes = await File.ReadAllBytesAsync("document.pdf");
ExtractionConfig settings = new() { OutputFormat = OutputFormat.Text };

var extraction = await KreuzbergLib.ExtractBytes(
    documentBytes,
    "application/pdf",
    settings);

// Print the extracted content followed by the result's MIME type.
Console.WriteLine(extraction.Content);
Console.WriteLine($"MIME Type: {extraction.MimeType}");
|
||||
```
|
||||
10
docs/snippets/csharp/api/extract_bytes_sync.md
Normal file
10
docs/snippets/csharp/api/extract_bytes_sync.md
Normal file
@@ -0,0 +1,10 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
// Read the document into memory and extract it synchronously as PDF.
var documentBytes = File.ReadAllBytes("document.pdf");
ExtractionConfig settings = new() { OutputFormat = OutputFormat.Text };

var extraction = KreuzbergLib.ExtractBytesSync(
    documentBytes,
    "application/pdf",
    settings);

// Print the extracted content followed by the result's MIME type.
Console.WriteLine(extraction.Content);
Console.WriteLine($"MIME Type: {extraction.MimeType}");
|
||||
```
|
||||
9
docs/snippets/csharp/api/extract_file_async.md
Normal file
9
docs/snippets/csharp/api/extract_file_async.md
Normal file
@@ -0,0 +1,9 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
// Extract a file asynchronously; null is passed as the second argument.
ExtractionConfig settings = new() { OutputFormat = OutputFormat.Text };

var extraction = await KreuzbergLib.ExtractFile(
    "document.pdf",
    null,
    settings);

// Print the extracted content followed by the result's MIME type.
Console.WriteLine(extraction.Content);
Console.WriteLine($"MIME Type: {extraction.MimeType}");
|
||||
```
|
||||
9
docs/snippets/csharp/api/extract_file_sync.md
Normal file
9
docs/snippets/csharp/api/extract_file_sync.md
Normal file
@@ -0,0 +1,9 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
var config = new ExtractionConfig { OutputFormat = OutputFormat.Text };
|
||||
var result = KreuzbergLib.ExtractFileSync("document.pdf", null, config);
|
||||
|
||||
Console.WriteLine(result.Content);
|
||||
Console.WriteLine($"MIME Type: {result.MimeType}");
|
||||
```
|
||||
13
docs/snippets/csharp/batch_extract_bytes_sync.md
Normal file
13
docs/snippets/csharp/batch_extract_bytes_sync.md
Normal file
@@ -0,0 +1,13 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
var documents = new[]
|
||||
{
|
||||
new BytesWithMime(await File.ReadAllBytesAsync("doc1.pdf"), "application/pdf"),
|
||||
new BytesWithMime(await File.ReadAllBytesAsync("doc2.docx"), "application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
|
||||
};
|
||||
|
||||
var results = KreuzbergLib.BatchExtractBytesSync(documents, new ExtractionConfig());
|
||||
|
||||
Console.WriteLine($"Processed {results.Count} documents");
|
||||
```
|
||||
11
docs/snippets/csharp/batch_extract_files_sync.md
Normal file
11
docs/snippets/csharp/batch_extract_files_sync.md
Normal file
@@ -0,0 +1,11 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
var files = new[] { "doc1.pdf", "doc2.docx", "doc3.pptx" };
|
||||
var results = KreuzbergLib.BatchExtractFilesSync(files, new ExtractionConfig());
|
||||
|
||||
foreach (var result in results)
|
||||
{
|
||||
Console.WriteLine($"Content length: {result.Content.Length}");
|
||||
}
|
||||
```
|
||||
102
docs/snippets/csharp/benchmarking/simple_benchmark.cs
Normal file
102
docs/snippets/csharp/benchmarking/simple_benchmark.cs
Normal file
@@ -0,0 +1,102 @@
|
||||
```csharp title="simple_benchmark.cs"
|
||||
using BenchmarkDotNet.Attributes;
|
||||
using BenchmarkDotNet.Running;
|
||||
using Kreuzberg;
|
||||
using System;
|
||||
using System.Diagnostics;
|
||||
using System.Threading.Tasks;
|
||||
|
||||
/// <summary>
/// BenchmarkDotNet suite that measures Kreuzberg file extraction in four
/// configurations: synchronous, asynchronous, OCR-forced and cache-enabled.
/// </summary>
[MemoryDiagnoser]
// "iterationCount" is the current name of the SimpleJob parameter that older
// BenchmarkDotNet releases called "targetCount".
[SimpleJob(warmupCount: 3, iterationCount: 5)]
public class KreuzbergBenchmark
{
    // Both fields are (re)assigned in Setup(); the initialisers only keep the
    // non-nullable fields definitely assigned.
    private string _testFilePath = string.Empty;
    private ExtractionConfig _config = new();

    /// <summary>
    /// Prepares the shared input path and the baseline configuration used by
    /// the sync and async benchmarks.
    /// </summary>
    [GlobalSetup]
    public void Setup()
    {
        _testFilePath = "document.pdf";
        _config = new ExtractionConfig
        {
            UseCache = false,
            EnableQualityProcessing = true,
        };
    }

    /// <summary>Synchronous extraction with the baseline configuration.</summary>
    [Benchmark]
    public void ExtractFileSync()
    {
        var result = KreuzbergLib.ExtractFileSync(_testFilePath, _config);

        // Touch the result so the extraction cannot be treated as dead code.
        _ = result.Content.Length;
    }

    /// <summary>Asynchronous extraction with the baseline configuration.</summary>
    [Benchmark]
    public async Task ExtractFileAsync()
    {
        var result = await KreuzbergLib.ExtractFileAsync(_testFilePath, _config);
        _ = result.Content.Length;
    }

    /// <summary>Asynchronous extraction with OCR forced through the Tesseract backend.</summary>
    [Benchmark]
    public async Task ExtractWithOcr()
    {
        var ocrConfig = new ExtractionConfig
        {
            ForceOcr = true,
            Ocr = new OcrConfig
            {
                Backend = "tesseract",
                Language = "eng",
            }
        };

        var result = await KreuzbergLib.ExtractFileAsync(_testFilePath, ocrConfig);
        _ = result.Content.Length;
    }

    /// <summary>Asynchronous extraction with <c>UseCache</c> switched on.</summary>
    [Benchmark]
    public async Task ExtractWithCache()
    {
        var cacheConfig = new ExtractionConfig
        {
            UseCache = true,
            EnableQualityProcessing = true,
        };

        var result = await KreuzbergLib.ExtractFileAsync(_testFilePath, cacheConfig);
        _ = result.Content.Length;
    }
}
|
||||
|
||||
public class ManualBenchmark
|
||||
{
|
||||
public static async Task Main(string[] args)
|
||||
{
|
||||
var filePath = "document.pdf";
|
||||
var config = new ExtractionConfig();
|
||||
|
||||
await KreuzbergLib.ExtractFileAsync(filePath, config);
|
||||
|
||||
var sw = Stopwatch.StartNew();
|
||||
for (int i = 0; i < 10; i++)
|
||||
{
|
||||
KreuzbergLib.ExtractFileSync(filePath, config);
|
||||
}
|
||||
sw.Stop();
|
||||
Console.WriteLine($"Sync extraction (10 runs): {sw.ElapsedMilliseconds}ms avg {sw.ElapsedMilliseconds / 10f}ms");
|
||||
|
||||
sw.Restart();
|
||||
var tasks = new System.Collections.Generic.List<Task>();
|
||||
for (int i = 0; i < 10; i++)
|
||||
{
|
||||
tasks.Add(KreuzbergLib.ExtractFileAsync(filePath, config));
|
||||
}
|
||||
await Task.WhenAll(tasks);
|
||||
sw.Stop();
|
||||
Console.WriteLine($"Async extraction (10 parallel runs): {sw.ElapsedMilliseconds}ms");
|
||||
|
||||
var summary = BenchmarkRunner.Run<KreuzbergBenchmark>();
|
||||
}
|
||||
}
|
||||
```
|
||||
42
docs/snippets/csharp/cache/disk_cache.cs
vendored
Normal file
42
docs/snippets/csharp/cache/disk_cache.cs
vendored
Normal file
@@ -0,0 +1,42 @@
|
||||
```csharp title="disk_cache.cs"
|
||||
using Kreuzberg;
|
||||
using System;
|
||||
using System.IO;
|
||||
using System.Threading.Tasks;
|
||||
|
||||
var config = new ExtractionConfig
|
||||
{
|
||||
UseCache = true,
|
||||
CacheConfig = new CacheConfig
|
||||
{
|
||||
CachePath = Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.LocalApplicationData), "kreuzberg_cache"),
|
||||
MaxCacheSize = 1024 * 1024 * 500,
|
||||
CacheTtlSeconds = 86400 * 7,
|
||||
EnableCompression = true
|
||||
}
|
||||
};
|
||||
|
||||
Console.WriteLine("First extraction (will be cached)...");
|
||||
var result1 = await KreuzbergLib.ExtractFileAsync("document.pdf", config);
|
||||
Console.WriteLine($" - Content length: {result1.Content.Length}");
|
||||
Console.WriteLine($" - Cached: {result1.Metadata.WasCached}");
|
||||
|
||||
Console.WriteLine("\nSecond extraction (from cache)...");
|
||||
var result2 = await KreuzbergLib.ExtractFileAsync("document.pdf", config);
|
||||
Console.WriteLine($" - Content length: {result2.Content.Length}");
|
||||
Console.WriteLine($" - Cached: {result2.Metadata.WasCached}");
|
||||
|
||||
Console.WriteLine($"\nResults are identical: {result1.Content == result2.Content}");
|
||||
|
||||
// Read the statistics while the cache is still populated. Querying them after
// the ClearAllCacheAsync call below would only ever describe an empty cache.
var cacheStats = await KreuzbergLib.GetCacheStatsAsync();
Console.WriteLine("\nCache Statistics:");
Console.WriteLine($"  - Total entries: {cacheStats.TotalEntries}");
Console.WriteLine($"  - Cache size: {cacheStats.CacheSizeBytes / 1024 / 1024} MB");
Console.WriteLine($"  - Hit rate: {cacheStats.HitRate:P}");

// Remove the cached entry for one document...
await KreuzbergLib.ClearCacheAsync("document.pdf");
Console.WriteLine("\nCache cleared for document.pdf");

// ...or drop everything that has been cached.
await KreuzbergLib.ClearAllCacheAsync();
Console.WriteLine("All cache cleared");
|
||||
```
|
||||
10
docs/snippets/csharp/clear_plugins.md
Normal file
10
docs/snippets/csharp/clear_plugins.md
Normal file
@@ -0,0 +1,10 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
KreuzbergLib.ClearPostProcessors();
|
||||
KreuzbergLib.ClearValidators();
|
||||
KreuzbergLib.ClearOcrBackends();
|
||||
KreuzbergLib.ClearDocumentExtractors();
|
||||
|
||||
Console.WriteLine("All plugins cleared");
|
||||
```
|
||||
46
docs/snippets/csharp/cli/basic_cli.cs
Normal file
46
docs/snippets/csharp/cli/basic_cli.cs
Normal file
@@ -0,0 +1,46 @@
|
||||
```csharp title="basic_cli.cs"
|
||||
using System;
|
||||
using System.CommandLine;
|
||||
using System.CommandLine.Invocation;
|
||||
using System.Threading.Tasks;
|
||||
using Kreuzberg;
|
||||
|
||||
var rootCommand = new RootCommand("Kreuzberg document extraction CLI");
|
||||
|
||||
var extractFileCommand = new Command("extract-file", "Extract text from a document file");
|
||||
var filePath = new Argument<string>("path", "Path to the document file");
|
||||
var outputFormat = new Option<string>(
|
||||
new[] { "-f", "--format" },
|
||||
getDefaultValue: () => "text",
|
||||
"Output format (text, json)"
|
||||
);
|
||||
|
||||
extractFileCommand.AddArgument(filePath);
|
||||
extractFileCommand.AddOption(outputFormat);
|
||||
|
||||
extractFileCommand.SetHandler(async (path, format) =>
|
||||
{
|
||||
try
|
||||
{
|
||||
var result = await KreuzbergLib.ExtractFileAsync(path);
|
||||
|
||||
if (format == "json")
|
||||
{
|
||||
Console.WriteLine(System.Text.Json.JsonSerializer.Serialize(result));
|
||||
}
|
||||
else
|
||||
{
|
||||
Console.WriteLine(result.Content);
|
||||
}
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
Console.Error.WriteLine($"Error: {ex.Message}");
|
||||
Environment.Exit(1);
|
||||
}
|
||||
}, filePath, outputFormat);
|
||||
|
||||
rootCommand.AddCommand(extractFileCommand);
|
||||
|
||||
return await rootCommand.InvokeAsync(args);
|
||||
```
|
||||
75
docs/snippets/csharp/cli/cli_with_config.cs
Normal file
75
docs/snippets/csharp/cli/cli_with_config.cs
Normal file
@@ -0,0 +1,75 @@
|
||||
```csharp title="cli_with_config.cs"
|
||||
using System;
|
||||
using System.CommandLine;
|
||||
using System.Text.Json;
|
||||
using System.Threading.Tasks;
|
||||
using Kreuzberg;
|
||||
|
||||
var rootCommand = new RootCommand("Kreuzberg with configuration");
|
||||
|
||||
var extractCommand = new Command("extract", "Extract with custom configuration");
|
||||
var filePath = new Argument<string>("path", "Document file path");
|
||||
var configPath = new Option<string>(
|
||||
new[] { "-c", "--config" },
|
||||
"Path to JSON configuration file"
|
||||
);
|
||||
var forceOcr = new Option<bool>(
|
||||
new[] { "--force-ocr" },
|
||||
"Force OCR processing"
|
||||
);
|
||||
var useCache = new Option<bool>(
|
||||
new[] { "--use-cache" },
|
||||
getDefaultValue: () => true,
|
||||
"Use caching (default: true)"
|
||||
);
|
||||
|
||||
extractCommand.AddArgument(filePath);
|
||||
extractCommand.AddOption(configPath);
|
||||
extractCommand.AddOption(forceOcr);
|
||||
extractCommand.AddOption(useCache);
|
||||
|
||||
// Handler for the "extract" command: builds an ExtractionConfig (from the JSON
// file when --config is given, otherwise from the command-line flags), runs
// the extraction and prints a short summary followed by the extracted content.
extractCommand.SetHandler(async (path, config, ocr, cache) =>
{
    try
    {
        ExtractionConfig extractionConfig;

        if (!string.IsNullOrEmpty(config))
        {
            // When a config file is supplied it is used as-is; the
            // --force-ocr / --use-cache flags are not layered on top of it.
            var json = await System.IO.File.ReadAllTextAsync(config);

            // Deserialize returns null for a JSON "null" document; fail with a
            // clear message instead of a NullReferenceException further down.
            extractionConfig = JsonSerializer.Deserialize<ExtractionConfig>(json)
                ?? throw new InvalidOperationException(
                    $"Configuration file '{config}' does not contain a configuration object.");
        }
        else
        {
            extractionConfig = new ExtractionConfig
            {
                UseCache = cache,
                ForceOcr = ocr,
            };
        }

        Console.WriteLine("Extracting with configuration:");
        Console.WriteLine($"  - File: {path}");
        Console.WriteLine($"  - Force OCR: {extractionConfig.ForceOcr}");
        Console.WriteLine($"  - Use Cache: {extractionConfig.UseCache}");

        var result = await KreuzbergLib.ExtractFileAsync(path, extractionConfig);

        // string.Join throws on a null sequence, so guard the case where no
        // languages were reported.
        var languages = result.DetectedLanguages is { } detected
            ? string.Join(", ", detected)
            : "none";

        Console.WriteLine("\nExtraction complete:");
        Console.WriteLine($"  - Content length: {result.Content.Length}");
        Console.WriteLine($"  - Format: {result.Metadata.FormatType}");
        Console.WriteLine($"  - Languages: {languages}");

        Console.WriteLine($"\n{result.Content}");
    }
    catch (Exception ex)
    {
        // Report the failure on stderr and signal it to the shell.
        Console.Error.WriteLine($"Error: {ex.Message}");
        Environment.Exit(1);
    }
}, filePath, configPath, forceOcr, useCache);
|
||||
|
||||
rootCommand.AddCommand(extractCommand);
|
||||
|
||||
return await rootCommand.InvokeAsync(args);
|
||||
```
|
||||
68
docs/snippets/csharp/client_chunk_text.md
Normal file
68
docs/snippets/csharp/client_chunk_text.md
Normal file
@@ -0,0 +1,68 @@
|
||||
```csharp title="C#"
|
||||
using System.Net.Http.Json;
|
||||
using System.Text.Json;
|
||||
using System.Text.Json.Serialization;
|
||||
|
||||
// Request models
|
||||
public record ChunkRequest(
|
||||
[property: JsonPropertyName("text")] string Text,
|
||||
[property: JsonPropertyName("chunker_type")] string? ChunkerType = null,
|
||||
[property: JsonPropertyName("config")] ChunkConfig? Config = null
|
||||
);
|
||||
|
||||
public record ChunkConfig(
|
||||
[property: JsonPropertyName("max_characters")] int? MaxCharacters = null,
|
||||
[property: JsonPropertyName("overlap")] int? Overlap = null,
|
||||
[property: JsonPropertyName("trim")] bool? Trim = null
|
||||
);
|
||||
|
||||
// Response models
|
||||
public record ChunkResponse(
|
||||
[property: JsonPropertyName("chunks")] List<ChunkItem> Chunks,
|
||||
[property: JsonPropertyName("chunk_count")] int ChunkCount,
|
||||
[property: JsonPropertyName("input_size_bytes")] int InputSizeBytes,
|
||||
[property: JsonPropertyName("chunker_type")] string ChunkerType
|
||||
);
|
||||
|
||||
public record ChunkItem(
|
||||
[property: JsonPropertyName("content")] string Content,
|
||||
[property: JsonPropertyName("byte_start")] int ByteStart,
|
||||
[property: JsonPropertyName("byte_end")] int ByteEnd,
|
||||
[property: JsonPropertyName("chunk_index")] int ChunkIndex,
|
||||
[property: JsonPropertyName("total_chunks")] int TotalChunks,
|
||||
[property: JsonPropertyName("first_page")] int? FirstPage,
|
||||
[property: JsonPropertyName("last_page")] int? LastPage
|
||||
);
|
||||
|
||||
/// <summary>
/// Sends a text to the server's <c>/chunk</c> endpoint and prints a short
/// preview of every chunk that comes back.
/// </summary>
class Program
{
    static async Task Main()
    {
        using var client = new HttpClient();

        var request = new ChunkRequest(
            Text: "Your long text content here...",
            ChunkerType: "text",
            Config: new ChunkConfig(
                MaxCharacters: 1000,
                Overlap: 50,
                Trim: true
            )
        );

        using var response = await client.PostAsJsonAsync(
            "http://localhost:8000/chunk",
            request
        );

        // Fail fast on 4xx/5xx instead of trying to parse an error payload as
        // a ChunkResponse.
        response.EnsureSuccessStatusCode();

        var result = await response.Content.ReadFromJsonAsync<ChunkResponse>();
        if (result is null)
        {
            // A literal JSON "null" body deserialises to null.
            Console.Error.WriteLine("The server returned an empty response.");
            return;
        }

        Console.WriteLine($"Created {result.ChunkCount} chunks");
        foreach (var chunk in result.Chunks ?? [])
        {
            // Clamp the preview so short chunks do not throw on the range.
            var preview = chunk.Content[..Math.Min(50, chunk.Content.Length)];
            Console.WriteLine($"Chunk {chunk.ChunkIndex}: {preview}...");
        }
    }
}
|
||||
```
|
||||
20
docs/snippets/csharp/client_extract_single_file.md
Normal file
20
docs/snippets/csharp/client_extract_single_file.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```csharp title="C#"
|
||||
using System;
|
||||
using System.IO;
|
||||
using System.Net.Http;
|
||||
|
||||
// Upload one document to the extraction endpoint as multipart/form-data and
// print the raw JSON reply. Using declarations dispose the client, the file
// stream, the form content and the response when the script ends.
using var client = new HttpClient();
using var fileStream = File.OpenRead("document.pdf");
using var content = new MultipartFormDataContent();

content.Add(new StreamContent(fileStream), "files", "document.pdf");

using var response = await client.PostAsync("http://localhost:8000/extract", content);

// Surface HTTP errors instead of printing an error body as if it were a result.
response.EnsureSuccessStatusCode();

var json = await response.Content.ReadAsStringAsync();
Console.WriteLine(json);
|
||||
```
|
||||
56
docs/snippets/csharp/cloud_ocr_backend.md
Normal file
56
docs/snippets/csharp/cloud_ocr_backend.md
Normal file
@@ -0,0 +1,56 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
using System.Net.Http;
|
||||
using System.Text.Json;
|
||||
|
||||
/// <summary>
/// Example OCR backend that forwards images to a remote HTTP OCR service and
/// returns the recognised text.
/// </summary>
public class CloudOcrBackend : IOcrBackend
{
    // A single shared client: creating an HttpClient per call can exhaust
    // sockets when many images are processed.
    private static readonly HttpClient s_httpClient = new();

    private readonly string _apiKey;
    private readonly List<string> _langs = new() { "eng", "deu", "fra" };

    /// <param name="apiKey">Credential sent with every OCR request.</param>
    /// <exception cref="ArgumentNullException"><paramref name="apiKey"/> is null.</exception>
    public CloudOcrBackend(string apiKey)
    {
        _apiKey = apiKey ?? throw new ArgumentNullException(nameof(apiKey));
    }

    public string Name() => "cloud-ocr";

    public string Version() => "1.0.0";

    public List<string> SupportedLanguages() => _langs;

    /// <summary>
    /// Posts the image and the requested language to the OCR endpoint and maps
    /// the JSON reply to a result dictionary.
    /// </summary>
    /// <param name="imageBytes">Encoded image to recognise.</param>
    /// <param name="config">Backend options; the optional "language" entry defaults to "eng".</param>
    /// <returns>A dictionary with the keys "content" and "mime_type".</returns>
    /// <exception cref="HttpRequestException">The service answered with a non-success status.</exception>
    public Dictionary<string, object> ProcessImage(byte[] imageBytes, Dictionary<string, object> config)
    {
        // Single lookup instead of ContainsKey + indexer.
        var lang = "eng";
        if (config.TryGetValue("language", out var configured) && configured?.ToString() is { } requested)
        {
            lang = requested;
        }

        using var form = new MultipartFormDataContent();
        form.Add(new ByteArrayContent(imageBytes), "image");
        form.Add(new StringContent(lang), "language");

        using var request = new HttpRequestMessage(HttpMethod.Post, "https://api.example.com/ocr")
        {
            Content = form
        };

        // NOTE(review): Bearer authentication is an assumption for this
        // placeholder endpoint - replace it with your provider's scheme.
        request.Headers.Authorization =
            new System.Net.Http.Headers.AuthenticationHeaderValue("Bearer", _apiKey);

        // ProcessImage is synchronous, so the call has to block here.
        // GetAwaiter().GetResult() rethrows the original exception rather than
        // wrapping it in an AggregateException as .Result does.
        using var response = s_httpClient.SendAsync(request).GetAwaiter().GetResult();
        response.EnsureSuccessStatusCode();

        var json = response.Content.ReadAsStringAsync().GetAwaiter().GetResult();
        using var doc = JsonDocument.Parse(json);

        // A JSON null "text" becomes an empty string so the dictionary never
        // carries a null value.
        var text = doc.RootElement.GetProperty("text").GetString() ?? string.Empty;

        return new Dictionary<string, object>
        {
            { "content", text },
            { "mime_type", "text/plain" }
        };
    }

    public void Initialize() { }

    public void Shutdown() { }
}
|
||||
|
||||
class Program
|
||||
{
|
||||
static void Main()
|
||||
{
|
||||
var backend = new CloudOcrBackend(apiKey: "your-api-key");
|
||||
KreuzbergLib.RegisterOcrBackend(backend);
|
||||
}
|
||||
}
|
||||
```
|
||||
28
docs/snippets/csharp/complete_example.md
Normal file
28
docs/snippets/csharp/complete_example.md
Normal file
@@ -0,0 +1,28 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
var config = new ExtractionConfig
|
||||
{
|
||||
UseCache = true,
|
||||
EnableQualityProcessing = true,
|
||||
Ocr = new OcrConfig
|
||||
{
|
||||
Backend = "tesseract",
|
||||
Language = "eng+fra",
|
||||
TesseractConfig = new TesseractConfig { Psm = 3 }
|
||||
},
|
||||
PdfOptions = new PdfConfig { ExtractImages = true },
|
||||
Chunking = new ChunkingConfig
|
||||
{
|
||||
MaxChars = 1000,
|
||||
MaxOverlap = 200,
|
||||
Embedding = new EmbeddingConfig
|
||||
{
|
||||
Model = EmbeddingModelType.Preset("all-MiniLM-L6-v2")
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
var result = await KreuzbergLib.ExtractFileAsync("document.pdf", config);
|
||||
Console.WriteLine($"Content: {result.Content[..Math.Min(100, result.Content.Length)]}");
|
||||
```
|
||||
48
docs/snippets/csharp/config/ElementBasedOutput.md
Normal file
48
docs/snippets/csharp/config/ElementBasedOutput.md
Normal file
@@ -0,0 +1,48 @@
|
||||
```csharp title="Element-Based Output (C#)"
|
||||
using Kreuzberg;
|
||||
|
||||
// Configure element-based output
var config = new ExtractionConfig
{
    OutputFormat = OutputFormat.ElementBased
};

// Extract document. The entry point is the KreuzbergLib class; "Kreuzberg" on
// its own is the namespace imported above.
var result = KreuzbergLib.ExtractFileSync("document.pdf", config);

// Elements may be absent, so guard before enumerating.
if (result.Elements != null)
{
    // Access elements
    foreach (var element in result.Elements)
    {
        Console.WriteLine($"Type: {element.ElementType}");

        // Print at most the first 100 characters of the element text.
        var text = element.Text.Length > 100
            ? element.Text.Substring(0, 100)
            : element.Text;
        Console.WriteLine($"Text: {text}");

        if (element.Metadata.PageNumber.HasValue)
        {
            Console.WriteLine($"Page: {element.Metadata.PageNumber}");
        }

        if (element.Metadata.Coordinates != null)
        {
            var coords = element.Metadata.Coordinates;
            Console.WriteLine($"Coords: ({coords.Left}, {coords.Top}) - ({coords.Right}, {coords.Bottom})");
        }

        Console.WriteLine("---");
    }

    // Filter by element type
    var titles = result.Elements
        .Where(e => e.ElementType == "title");

    foreach (var title in titles)
    {
        // "level" is optional metadata; fall back to "unknown" when it is missing.
        var level = title.Metadata.Additional.TryGetValue("level", out var levelValue)
            ? levelValue.ToString()
            : "unknown";
        Console.WriteLine($"[{level}] {title.Text}");
    }
}
|
||||
```
|
||||
41
docs/snippets/csharp/config/advanced_config.md
Normal file
41
docs/snippets/csharp/config/advanced_config.md
Normal file
@@ -0,0 +1,41 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
var config = new ExtractionConfig
|
||||
{
|
||||
UseCache = true,
|
||||
EnableQualityProcessing = true,
|
||||
Ocr = new OcrConfig
|
||||
{
|
||||
Backend = "tesseract",
|
||||
Language = "eng+deu"
|
||||
},
|
||||
Chunking = new ChunkingConfig
|
||||
{
|
||||
MaxCharacters = 1000,
|
||||
Overlap = 200
|
||||
},
|
||||
LanguageDetection = new LanguageDetectionConfig
|
||||
{
|
||||
Enabled = true,
|
||||
DetectMultiple = true
|
||||
},
|
||||
TokenReduction = new TokenReductionOptions
|
||||
{
|
||||
Mode = "moderate"
|
||||
},
|
||||
Keywords = new KeywordConfig
|
||||
{
|
||||
MaxKeywords = 10,
|
||||
MinScore = 0.1f
|
||||
}
|
||||
};
|
||||
|
||||
var result = await KreuzbergLib.ExtractFile("document.pdf", null, config);
|
||||
Console.WriteLine(result.Content);
|
||||
|
||||
if (result.DetectedLanguages?.Count > 0)
|
||||
{
|
||||
Console.WriteLine($"Languages: {string.Join(", ", result.DetectedLanguages)}");
|
||||
}
|
||||
```
|
||||
9
docs/snippets/csharp/config/basic.cs
Normal file
9
docs/snippets/csharp/config/basic.cs
Normal file
@@ -0,0 +1,9 @@
|
||||
using Kreuzberg;
|
||||
|
||||
var config = new ExtractionConfig
|
||||
{
|
||||
UseCache = true,
|
||||
EnableQualityProcessing = true
|
||||
};
|
||||
|
||||
var result = KreuzbergLib.ExtractFileSync("document.pdf", config);
|
||||
47
docs/snippets/csharp/config/chunking_config.md
Normal file
47
docs/snippets/csharp/config/chunking_config.md
Normal file
@@ -0,0 +1,47 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
var config = new ExtractionConfig
|
||||
{
|
||||
Chunking = new ChunkingConfig
|
||||
{
|
||||
MaxCharacters = 1000,
|
||||
Overlap = 200,
|
||||
ChunkerType = ChunkerType.Text
|
||||
}
|
||||
};
|
||||
|
||||
var result = await KreuzbergLib.ExtractFile("document.pdf", null, config);
|
||||
if (result.Chunks != null)
|
||||
{
|
||||
Console.WriteLine($"Total chunks: {result.Chunks.Count}");
|
||||
foreach (var chunk in result.Chunks)
|
||||
{
|
||||
Console.WriteLine($"Chunk length: {chunk.Content.Length}");
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
```csharp title="C# - Markdown with Heading Context"
|
||||
using Kreuzberg;
|
||||
|
||||
var config = new ExtractionConfig
|
||||
{
|
||||
Chunking = new ChunkingConfig
|
||||
{
|
||||
MaxCharacters = 500,
|
||||
Overlap = 50,
|
||||
ChunkerType = ChunkerType.Markdown,
|
||||
PrependHeadingContext = true
|
||||
}
|
||||
};
|
||||
|
||||
var result = await KreuzbergLib.ExtractFile("document.md", null, config);
|
||||
if (result.Chunks != null)
|
||||
{
|
||||
foreach (var chunk in result.Chunks)
|
||||
{
|
||||
Console.WriteLine($"Content: {chunk.Content.Substring(0, Math.Min(100, chunk.Content.Length))}");
|
||||
}
|
||||
}
|
||||
```
|
||||
12
docs/snippets/csharp/config/config_basic.md
Normal file
12
docs/snippets/csharp/config/config_basic.md
Normal file
@@ -0,0 +1,12 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
var config = new ExtractionConfig
|
||||
{
|
||||
UseCache = true,
|
||||
EnableQualityProcessing = true
|
||||
};
|
||||
|
||||
var result = await KreuzbergLib.ExtractFile("document.pdf", null, config);
|
||||
Console.WriteLine(result.Content);
|
||||
```
|
||||
8
docs/snippets/csharp/config/config_discover.md
Normal file
8
docs/snippets/csharp/config/config_discover.md
Normal file
@@ -0,0 +1,8 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
var config = ExtractionConfig.Discover() ?? new ExtractionConfig();
|
||||
|
||||
var result = await KreuzbergLib.ExtractFile("document.pdf", null, config);
|
||||
Console.WriteLine(result.Content);
|
||||
```
|
||||
19
docs/snippets/csharp/config/config_ocr.md
Normal file
19
docs/snippets/csharp/config/config_ocr.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
var config = new ExtractionConfig
|
||||
{
|
||||
Ocr = new OcrConfig
|
||||
{
|
||||
Backend = "tesseract",
|
||||
Language = "eng"
|
||||
}
|
||||
};
|
||||
|
||||
var result = await KreuzbergLib.ExtractFile("scanned.pdf", null, config);
|
||||
Console.WriteLine($"Content length: {result.Content.Length}");
|
||||
if (result.Tables != null)
|
||||
{
|
||||
Console.WriteLine($"Tables detected: {result.Tables.Count}");
|
||||
}
|
||||
```
|
||||
26
docs/snippets/csharp/config/config_programmatic.md
Normal file
26
docs/snippets/csharp/config/config_programmatic.md
Normal file
@@ -0,0 +1,26 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
var config = new ExtractionConfig
|
||||
{
|
||||
UseCache = true,
|
||||
EnableQualityProcessing = true,
|
||||
Ocr = new OcrConfig
|
||||
{
|
||||
Backend = "tesseract",
|
||||
Language = "eng+deu",
|
||||
TesseractConfig = new TesseractConfig
|
||||
{
|
||||
Psm = 6
|
||||
}
|
||||
},
|
||||
Chunking = new ChunkingConfig
|
||||
{
|
||||
MaxCharacters = 1000,
|
||||
Overlap = 200
|
||||
}
|
||||
};
|
||||
|
||||
var result = await KreuzbergLib.ExtractFile("document.pdf", null, config);
|
||||
Console.WriteLine($"Content length: {result.Content.Length}");
|
||||
```
|
||||
14
docs/snippets/csharp/config/custom_mime_types.cs
Normal file
14
docs/snippets/csharp/config/custom_mime_types.cs
Normal file
@@ -0,0 +1,14 @@
|
||||
using Kreuzberg;
|
||||
|
||||
var config = new ExtractionConfig
|
||||
{
|
||||
UseCache = true,
|
||||
EnableQualityProcessing = true
|
||||
};
|
||||
|
||||
// Load the document into memory; the MIME type is passed explicitly alongside
// the raw bytes.
var fileBytes = System.IO.File.ReadAllBytes("document.pdf");

var result = KreuzbergLib.ExtractBytesSync(fileBytes, "application/pdf", config);

// MIME type reported on the extraction result.
var mimeType = result.MimeType;
|
||||
8
docs/snippets/csharp/config/disable_cache.cs
Normal file
8
docs/snippets/csharp/config/disable_cache.cs
Normal file
@@ -0,0 +1,8 @@
|
||||
using Kreuzberg;
|
||||
|
||||
var config = new ExtractionConfig
|
||||
{
|
||||
UseCache = false
|
||||
};
|
||||
|
||||
var result = KreuzbergLib.ExtractFileSync("document.pdf", config);
|
||||
18
docs/snippets/csharp/config/document_structure_config.md
Normal file
18
docs/snippets/csharp/config/document_structure_config.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```csharp title="Document Structure Config (C#)"
|
||||
using Kreuzberg;
|
||||
|
||||
var config = new ExtractionConfig
|
||||
{
|
||||
IncludeDocumentStructure = true
|
||||
};
|
||||
|
||||
var result = KreuzbergLib.ExtractFileSync("document.pdf", config);
|
||||
|
||||
if (result.Document is not null)
|
||||
{
|
||||
foreach (var node in result.Document.Nodes)
|
||||
{
|
||||
Console.WriteLine($"[{node.Content.NodeType}]");
|
||||
}
|
||||
}
|
||||
```
|
||||
37
docs/snippets/csharp/config/element_based_output.md
Normal file
37
docs/snippets/csharp/config/element_based_output.md
Normal file
@@ -0,0 +1,37 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
var config = new ExtractionConfig
|
||||
{
|
||||
ResultFormat = ResultFormat.ElementBased
|
||||
};
|
||||
|
||||
var result = await KreuzbergLib.ExtractFile("document.pdf", null, config);
|
||||
|
||||
if (result.Elements != null)
|
||||
{
|
||||
foreach (var element in result.Elements)
|
||||
{
|
||||
Console.WriteLine($"Type: {element.ElementType}");
|
||||
Console.WriteLine($"Text: {element.Text.Substring(0, Math.Min(100, element.Text.Length))}");
|
||||
|
||||
if (element.Metadata.PageNumber.HasValue)
|
||||
{
|
||||
Console.WriteLine($"Page: {element.Metadata.PageNumber}");
|
||||
}
|
||||
|
||||
if (element.Metadata.Coordinates != null)
|
||||
{
|
||||
Console.WriteLine($"Coords: ({element.Metadata.Coordinates.X0}, {element.Metadata.Coordinates.Y0})");
|
||||
}
|
||||
|
||||
Console.WriteLine("---");
|
||||
}
|
||||
|
||||
var titles = result.Elements
|
||||
.Where(e => e.ElementType == ElementType.Title)
|
||||
.ToList();
|
||||
|
||||
Console.WriteLine($"Found {titles.Count} titles");
|
||||
}
|
||||
```
|
||||
106
docs/snippets/csharp/config/embedding_config.cs
Normal file
106
docs/snippets/csharp/config/embedding_config.cs
Normal file
@@ -0,0 +1,106 @@
|
||||
using Kreuzberg.Config;
|
||||
|
||||
public class EmbeddingConfigExample
|
||||
{
|
||||
public static void Main()
|
||||
{
|
||||
// Example 1: Preset model (recommended)
|
||||
// Fast, balanced, or quality preset configurations optimized for common use cases.
|
||||
var embeddingConfig = new EmbeddingConfig
|
||||
{
|
||||
Model = new EmbeddingModelType.Preset
|
||||
{
|
||||
Name = "balanced"
|
||||
},
|
||||
BatchSize = 32,
|
||||
Normalize = true,
|
||||
ShowDownloadProgress = true,
|
||||
CacheDir = "~/.cache/kreuzberg/embeddings"
|
||||
};
|
||||
|
||||
// Available presets:
|
||||
// - "fast" (384 dims): Quick prototyping, development, resource-constrained
|
||||
// - "balanced" (768 dims): Production, general-purpose RAG, English documents
|
||||
// - "quality" (1024 dims): Complex documents, maximum accuracy
|
||||
// - "multilingual" (768 dims): International documents, 100+ languages
|
||||
|
||||
|
||||
// Example 2: Custom ONNX model (requires embeddings feature)
|
||||
// Direct access to specific ONNX embedding models from HuggingFace with custom dimensions.
|
||||
embeddingConfig = new EmbeddingConfig
|
||||
{
|
||||
Model = new EmbeddingModelType.Custom
|
||||
{
|
||||
ModelId = "BAAI/bge-small-en-v1.5",
|
||||
Dimensions = 384
|
||||
},
|
||||
BatchSize = 32,
|
||||
Normalize = true,
|
||||
ShowDownloadProgress = true,
|
||||
CacheDir = null // Uses default: .kreuzberg/embeddings/
|
||||
};
|
||||
|
||||
// Popular ONNX-compatible models:
|
||||
// - "BAAI/bge-small-en-v1.5" (384 dims): Fast, efficient
|
||||
// - "BAAI/bge-base-en-v1.5" (768 dims): Balanced quality/speed
|
||||
// - "BAAI/bge-large-en-v1.5" (1024 dims): High quality, slower
|
||||
// - "sentence-transformers/paraphrase-multilingual-mpnet-base-v2" (768 dims): Multilingual support
|
||||
|
||||
|
||||
// Example 3: Alternative Custom ONNX Model
|
||||
// For advanced users wanting different ONNX embedding models.
|
||||
embeddingConfig = new EmbeddingConfig
|
||||
{
|
||||
Model = new EmbeddingModelType.Custom
|
||||
{
|
||||
ModelId = "sentence-transformers/all-mpnet-base-v2",
|
||||
Dimensions = 768
|
||||
},
|
||||
BatchSize = 16, // Larger model requires smaller batch size
|
||||
Normalize = true,
|
||||
ShowDownloadProgress = true,
|
||||
CacheDir = "/var/cache/embeddings"
|
||||
};
|
||||
|
||||
|
||||
// Integration with ChunkingConfig
|
||||
// Add embeddings to your chunking configuration:
|
||||
var chunkingConfig = new ChunkingConfig
|
||||
{
|
||||
MaxChars = 1024,
|
||||
MaxOverlap = 100,
|
||||
Preset = "balanced",
|
||||
Embedding = new EmbeddingConfig
|
||||
{
|
||||
Model = new EmbeddingModelType.Preset
|
||||
{
|
||||
Name = "balanced"
|
||||
},
|
||||
BatchSize = 32,
|
||||
Normalize = true
|
||||
}
|
||||
};
|
||||
|
||||
var extractionConfig = new ExtractionConfig
|
||||
{
|
||||
Chunking = chunkingConfig
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
// Key parameter explanations:
|
||||
//
|
||||
// BatchSize: Number of texts to embed at once (32-128 typical)
|
||||
// - Larger batches are faster but use more memory
|
||||
// - Smaller batches for resource-constrained environments
|
||||
//
|
||||
// Normalize: Whether to normalize vectors (L2 norm)
|
||||
// - true (recommended): Enables cosine similarity in vector DBs
|
||||
// - false: Raw embedding values
|
||||
//
|
||||
// CacheDir: Where to store downloaded models
|
||||
// - null: Uses .kreuzberg/embeddings/ in current directory
|
||||
// - String path: Custom directory for model storage
|
||||
//
|
||||
// ShowDownloadProgress: Display download progress bar
|
||||
// - Useful for monitoring large model downloads
|
||||
25
docs/snippets/csharp/config/embedding_config.md
Normal file
25
docs/snippets/csharp/config/embedding_config.md
Normal file
@@ -0,0 +1,25 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
var config = new ExtractionConfig
|
||||
{
|
||||
Chunking = new ChunkingConfig
|
||||
{
|
||||
MaxCharacters = 1000,
|
||||
Overlap = 200,
|
||||
Embedding = new EmbeddingConfig
|
||||
{
|
||||
Normalize = true,
|
||||
BatchSize = 16,
|
||||
ShowDownloadProgress = true,
|
||||
CacheDir = null
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
var result = await KreuzbergLib.ExtractFile("document.pdf", null, config);
|
||||
if (result.Chunks != null)
|
||||
{
|
||||
Console.WriteLine($"Chunks with embeddings: {result.Chunks.Count}");
|
||||
}
|
||||
```
|
||||
8
docs/snippets/csharp/config/enable_cache.cs
Normal file
8
docs/snippets/csharp/config/enable_cache.cs
Normal file
@@ -0,0 +1,8 @@
|
||||
using Kreuzberg;
|
||||
|
||||
var config = new ExtractionConfig
|
||||
{
|
||||
UseCache = true
|
||||
};
|
||||
|
||||
var result = KreuzbergLib.ExtractFileSync("document.pdf", config);
|
||||
60
docs/snippets/csharp/config/full_example.cs
Normal file
60
docs/snippets/csharp/config/full_example.cs
Normal file
@@ -0,0 +1,60 @@
|
||||
using Kreuzberg;
|
||||
|
||||
var config = new ExtractionConfig
|
||||
{
|
||||
UseCache = true,
|
||||
EnableQualityProcessing = true,
|
||||
ForceOcr = false,
|
||||
Ocr = new OcrConfig
|
||||
{
|
||||
Backend = "tesseract",
|
||||
Language = "eng+fra",
|
||||
TesseractConfig = new TesseractConfig
|
||||
{
|
||||
Psm = 3,
|
||||
Oem = 3,
|
||||
MinConfidence = 0.8,
|
||||
Preprocessing = new ImagePreprocessingConfig
|
||||
{
|
||||
TargetDpi = 300,
|
||||
Denoise = true,
|
||||
Deskew = true,
|
||||
ContrastEnhance = true
|
||||
},
|
||||
EnableTableDetection = true
|
||||
}
|
||||
},
|
||||
PdfOptions = new PdfConfig
|
||||
{
|
||||
ExtractImages = true,
|
||||
ExtractMetadata = true
|
||||
},
|
||||
Images = new ImageExtractionConfig
|
||||
{
|
||||
ExtractImages = true,
|
||||
TargetDpi = 150,
|
||||
MaxImageDimension = 4096
|
||||
},
|
||||
Chunking = new ChunkingConfig
|
||||
{
|
||||
MaxChars = 1000,
|
||||
MaxOverlap = 200
|
||||
},
|
||||
TokenReduction = new TokenReductionConfig
|
||||
{
|
||||
Mode = "moderate",
|
||||
PreserveImportantWords = true
|
||||
},
|
||||
LanguageDetection = new LanguageDetectionConfig
|
||||
{
|
||||
Enabled = true,
|
||||
MinConfidence = 0.8,
|
||||
DetectMultiple = false
|
||||
},
|
||||
Postprocessor = new PostProcessorConfig
|
||||
{
|
||||
Enabled = true
|
||||
}
|
||||
};
|
||||
|
||||
var result = KreuzbergLib.ExtractFileSync("document.pdf", config);
|
||||
99
docs/snippets/csharp/config/hierarchy_config.cs
Normal file
99
docs/snippets/csharp/config/hierarchy_config.cs
Normal file
@@ -0,0 +1,99 @@
|
||||
using Kreuzberg.Config;
|
||||
using Kreuzberg;
|
||||
|
||||
/// <summary>
/// Demonstrates three <c>HierarchyConfig</c> setups and how each one is wired
/// into an <c>ExtractionConfig</c> through <c>PdfConfig.Hierarchy</c>.
/// </summary>
public class HierarchyConfigExample
{
    /// <summary>
    /// Builds the three example configurations. No extraction is performed;
    /// the extraction call is shown commented out.
    /// </summary>
    public static void Main()
    {
        // Example 1: Basic hierarchy extraction
        // Enabled with default KClusters=6 for standard H1-H6 heading hierarchy.
        // Extract bounding box information for spatial layout awareness.
        var hierarchyConfigBasic = new HierarchyConfig
        {
            Enabled = true,
            KClusters = 6,                 // Default: creates 6 font size clusters (H1-H6 structure)
            IncludeBbox = true,            // Include bounding box coordinates
            OcrCoverageThreshold = null    // No OCR coverage threshold
        };

        var pdfConfigBasic = new PdfConfig
        {
            Hierarchy = hierarchyConfigBasic
        };

        var extractionConfigBasic = new ExtractionConfig
        {
            PdfOptions = pdfConfigBasic
        };

        // `Kreuzberg` is the imported namespace, not a constructible type, so the
        // configuration is passed to the static extraction entry point instead.
        // var result = KreuzbergLib.ExtractFileSync("document.pdf", extractionConfigBasic);

        // Example 2: Custom KClusters for minimal structure
        // Use 3 clusters for simpler hierarchy with minimal structure.
        // Useful when you only need major section divisions (Main, Subsection, Detail).
        var hierarchyConfigMinimal = new HierarchyConfig
        {
            Enabled = true,
            KClusters = 3,                 // Minimal clustering: just 3 levels
            IncludeBbox = true,
            OcrCoverageThreshold = null
        };

        var pdfConfigMinimal = new PdfConfig
        {
            Hierarchy = hierarchyConfigMinimal
        };

        var extractionConfigMinimal = new ExtractionConfig
        {
            PdfOptions = pdfConfigMinimal
        };

        // Example 3: With OCR coverage threshold
        // Trigger OCR if less than 50% of text has font data.
        // Useful for documents with mixed digital and scanned content.
        var hierarchyConfigOcr = new HierarchyConfig
        {
            Enabled = true,
            KClusters = 6,
            IncludeBbox = true,
            OcrCoverageThreshold = 0.5f    // Trigger OCR if text coverage < 50%
        };

        var pdfConfigOcr = new PdfConfig
        {
            Hierarchy = hierarchyConfigOcr
        };

        var extractionConfigOcr = new ExtractionConfig
        {
            PdfOptions = pdfConfigOcr
        };
    }
}
|
||||
|
||||
// Field descriptions:
|
||||
//
|
||||
// Enabled: bool (default: true)
|
||||
// - Enable or disable hierarchy extraction
|
||||
// - When false, hierarchy structure is not analyzed
|
||||
//
|
||||
// KClusters: int (default: 6, valid: 1-7)
|
||||
// - Number of font size clusters for hierarchy levels
|
||||
// - 6 provides H1-H6 heading levels with body text
|
||||
// - Higher values create more fine-grained hierarchy
|
||||
// - Lower values create simpler structure
|
||||
//
|
||||
// IncludeBbox: bool (default: true)
|
||||
// - Include bounding box coordinates in hierarchy blocks
|
||||
// - Required for spatial layout awareness and document structure
|
||||
// - Set to false only if space optimization is critical
|
||||
//
|
||||
// OcrCoverageThreshold: float? (default: null)
|
||||
// - Range: 0.0 to 1.0
|
||||
// - Triggers OCR when text block coverage falls below this fraction
|
||||
// - Example: 0.5f means "run OCR if less than 50% of page has text data"
|
||||
// - null means no OCR coverage-based triggering
|
||||
17
docs/snippets/csharp/config/html_output.md
Normal file
17
docs/snippets/csharp/config/html_output.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
var config = new ExtractionConfig
|
||||
{
|
||||
OutputFormat = OutputFormat.Html,
|
||||
HtmlOutput = new HtmlOutputConfig
|
||||
{
|
||||
Theme = HtmlTheme.GitHub,
|
||||
EmbedCss = true,
|
||||
ClassPrefix = "kb-"
|
||||
}
|
||||
};
|
||||
|
||||
var result = await KreuzbergLib.ExtractFile("document.pdf", null, config);
|
||||
Console.WriteLine(result.Content);
|
||||
```
|
||||
19
docs/snippets/csharp/config/include_meta.cs
Normal file
19
docs/snippets/csharp/config/include_meta.cs
Normal file
@@ -0,0 +1,19 @@
|
||||
using Kreuzberg;
|
||||
|
||||
var config = new ExtractionConfig
|
||||
{
|
||||
UseCache = true,
|
||||
Ocr = new OcrConfig
|
||||
{
|
||||
Backend = "tesseract",
|
||||
Language = "eng"
|
||||
}
|
||||
};
|
||||
|
||||
var result = KreuzbergLib.ExtractFileSync("document.pdf", config);
|
||||
|
||||
if (result.Metadata != null)
|
||||
{
|
||||
var language = result.Metadata.Language;
|
||||
var format = result.Metadata.FormatType;
|
||||
}
|
||||
66
docs/snippets/csharp/config/keyword_config.cs
Normal file
66
docs/snippets/csharp/config/keyword_config.cs
Normal file
@@ -0,0 +1,66 @@
|
||||
using Kreuzberg;
|
||||
using Kreuzberg.Keywords;
|
||||
|
||||
// Example 1: Basic YAKE configuration
|
||||
// Uses YAKE algorithm with default parameters and English stopword filtering
|
||||
var basicYakeConfig = new ExtractionConfig
|
||||
{
|
||||
Keywords = new KeywordConfig
|
||||
{
|
||||
Algorithm = KeywordAlgorithm.Yake,
|
||||
MaxKeywords = 10,
|
||||
MinScore = 0.0f,
|
||||
NgramRange = (1, 3),
|
||||
Language = "en",
|
||||
YakeParams = null,
|
||||
RakeParams = null,
|
||||
}
|
||||
};
|
||||
|
||||
var result = KreuzbergLib.ExtractFileSync("document.pdf", basicYakeConfig);
|
||||
Console.WriteLine($"Keywords: {string.Join(", ", result.Keywords)}");
|
||||
|
||||
// Example 2: Advanced YAKE with custom parameters
|
||||
// Fine-tunes YAKE with custom window size for co-occurrence analysis
|
||||
var advancedYakeConfig = new ExtractionConfig
|
||||
{
|
||||
Keywords = new KeywordConfig
|
||||
{
|
||||
Algorithm = KeywordAlgorithm.Yake,
|
||||
MaxKeywords = 15,
|
||||
MinScore = 0.1f,
|
||||
NgramRange = (1, 2),
|
||||
Language = "en",
|
||||
YakeParams = new YakeParams
|
||||
{
|
||||
WindowSize = 1,
|
||||
},
|
||||
RakeParams = null,
|
||||
}
|
||||
};
|
||||
|
||||
result = KreuzbergLib.ExtractFileSync("document.pdf", advancedYakeConfig);
|
||||
Console.WriteLine($"Keywords: {string.Join(", ", result.Keywords)}");
|
||||
|
||||
// Example 3: RAKE configuration
|
||||
// Uses RAKE algorithm for rapid keyword extraction with phrase constraints
|
||||
var rakeConfig = new ExtractionConfig
|
||||
{
|
||||
Keywords = new KeywordConfig
|
||||
{
|
||||
Algorithm = KeywordAlgorithm.Rake,
|
||||
MaxKeywords = 10,
|
||||
MinScore = 5.0f,
|
||||
NgramRange = (1, 3),
|
||||
Language = "en",
|
||||
YakeParams = null,
|
||||
RakeParams = new RakeParams
|
||||
{
|
||||
MinWordLength = 1,
|
||||
MaxWordsPerPhrase = 3,
|
||||
},
|
||||
}
|
||||
};
|
||||
|
||||
result = KreuzbergLib.ExtractFileSync("document.pdf", rakeConfig);
|
||||
Console.WriteLine($"Keywords: {string.Join(", ", result.Keywords)}");
|
||||
21
docs/snippets/csharp/config/keyword_extraction_config.md
Normal file
21
docs/snippets/csharp/config/keyword_extraction_config.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
var config = new ExtractionConfig
|
||||
{
|
||||
Keywords = new KeywordConfig
|
||||
{
|
||||
Algorithm = KeywordAlgorithm.Yake,
|
||||
MaxKeywords = 10,
|
||||
MinScore = 0.1f,
|
||||
NgramRange = [1, 3],
|
||||
Language = "en"
|
||||
}
|
||||
};
|
||||
|
||||
var result = await KreuzbergLib.ExtractFile("document.pdf", null, config);
|
||||
if (result.Keywords != null)
|
||||
{
|
||||
Console.WriteLine($"Keywords: {string.Join(", ", result.Keywords)}");
|
||||
}
|
||||
```
|
||||
20
docs/snippets/csharp/config/language_detection_config.md
Normal file
20
docs/snippets/csharp/config/language_detection_config.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
var config = new ExtractionConfig
|
||||
{
|
||||
LanguageDetection = new LanguageDetectionConfig
|
||||
{
|
||||
Enabled = true,
|
||||
MinConfidence = 0.8,
|
||||
DetectMultiple = true
|
||||
}
|
||||
};
|
||||
|
||||
var result = await KreuzbergLib.ExtractFile("document.pdf", null, config);
|
||||
Console.WriteLine($"Detected language: {result.Language}");
|
||||
if (result.DetectedLanguages != null)
|
||||
{
|
||||
Console.WriteLine($"All detected: {string.Join(", ", result.DetectedLanguages)}");
|
||||
}
|
||||
```
|
||||
22
docs/snippets/csharp/config/ocr_dpi_config.md
Normal file
22
docs/snippets/csharp/config/ocr_dpi_config.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
var config = new ExtractionConfig
|
||||
{
|
||||
Images = new ImageExtractionConfig
|
||||
{
|
||||
ExtractImages = true,
|
||||
TargetDpi = 300,
|
||||
MaxImageDimension = 4096,
|
||||
AutoAdjustDpi = true,
|
||||
MinDpi = 150,
|
||||
MaxDpi = 600
|
||||
}
|
||||
};
|
||||
|
||||
var result = await KreuzbergLib.ExtractFile("document.pdf", null, config);
|
||||
if (result.Images != null)
|
||||
{
|
||||
Console.WriteLine($"Extracted images: {result.Images.Count}");
|
||||
}
|
||||
```
|
||||
12
docs/snippets/csharp/config/ocr_lang.cs
Normal file
12
docs/snippets/csharp/config/ocr_lang.cs
Normal file
@@ -0,0 +1,12 @@
|
||||
using Kreuzberg;
|
||||
|
||||
var config = new ExtractionConfig
|
||||
{
|
||||
Ocr = new OcrConfig
|
||||
{
|
||||
Backend = "tesseract",
|
||||
Language = "eng+fra"
|
||||
}
|
||||
};
|
||||
|
||||
var result = KreuzbergLib.ExtractFileSync("document.pdf", config);
|
||||
17
docs/snippets/csharp/config/parse_links.cs
Normal file
17
docs/snippets/csharp/config/parse_links.cs
Normal file
@@ -0,0 +1,17 @@
|
||||
using Kreuzberg;
|
||||
|
||||
var config = new ExtractionConfig
|
||||
{
|
||||
UseCache = true
|
||||
};
|
||||
|
||||
var result = KreuzbergLib.ExtractFileSync("document.html", config);
|
||||
|
||||
if (result.Metadata?.Format.Text?.Links != null)
|
||||
{
|
||||
foreach (var link in result.Metadata.Format.Text.Links)
|
||||
{
|
||||
var text = link[0];
|
||||
var url = link[1];
|
||||
}
|
||||
}
|
||||
18
docs/snippets/csharp/config/parse_metadata.cs
Normal file
18
docs/snippets/csharp/config/parse_metadata.cs
Normal file
@@ -0,0 +1,18 @@
|
||||
using Kreuzberg;
|
||||
|
||||
var config = new ExtractionConfig
|
||||
{
|
||||
PdfOptions = new PdfConfig
|
||||
{
|
||||
ExtractMetadata = true
|
||||
}
|
||||
};
|
||||
|
||||
var result = KreuzbergLib.ExtractFileSync("document.pdf", config);
|
||||
|
||||
if (result.Metadata?.Format.Pdf != null)
|
||||
{
|
||||
var title = result.Metadata.Format.Pdf.Title;
|
||||
var author = result.Metadata.Format.Pdf.Author;
|
||||
var pageCount = result.Metadata.Format.Pdf.PageCount;
|
||||
}
|
||||
21
docs/snippets/csharp/config/pdf_config.md
Normal file
21
docs/snippets/csharp/config/pdf_config.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
var config = new ExtractionConfig
|
||||
{
|
||||
PdfOptions = new PdfConfig
|
||||
{
|
||||
ExtractImages = true,
|
||||
ExtractMetadata = true,
|
||||
ExtractAnnotations = false,
|
||||
Passwords = new List<string> { "password123" }
|
||||
}
|
||||
};
|
||||
|
||||
var result = await KreuzbergLib.ExtractFile("encrypted.pdf", null, config);
|
||||
if (result.Metadata != null)
|
||||
{
|
||||
Console.WriteLine($"Title: {result.Metadata.Title}");
|
||||
Console.WriteLine($"Authors: {string.Join(", ", result.Metadata.Authors ?? new List<string>())}");
|
||||
}
|
||||
```
|
||||
74
docs/snippets/csharp/config/pdf_hierarchy_config.md
Normal file
74
docs/snippets/csharp/config/pdf_hierarchy_config.md
Normal file
@@ -0,0 +1,74 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
// Basic hierarchy configuration with properties
var config = new ExtractionConfig
{
    PdfOptions = new PdfConfig
    {
        ExtractImages = true,
        Hierarchy = new HierarchyConfig
        {
            Enabled = true,
            KClusters = 6,
            IncludeBbox = true,
            OcrCoverageThreshold = 0.8f
        }
    }
};

var result = await KreuzbergLib.ExtractFileAsync("document.pdf", config);
Console.WriteLine($"Content length: {result.Content.Length}");

// Advanced hierarchy detection with custom parameters
var advancedConfig = new ExtractionConfig
{
    PdfOptions = new PdfConfig
    {
        ExtractImages = true,
        Hierarchy = new HierarchyConfig
        {
            Enabled = true,
            KClusters = 7,               // Maximum of the documented 1-7 range: most detailed hierarchy
            IncludeBbox = true,          // Include bounding box coordinates
            OcrCoverageThreshold = 0.7f  // Trigger OCR when text coverage falls below 70%
        }
    }
};

// Each example stores its result in its own variable: redeclaring `result`
// in the same top-level scope does not compile.
var advancedResult = await KreuzbergLib.ExtractFileAsync("complex_document.pdf", advancedConfig);
Console.WriteLine($"Advanced hierarchy detection completed: {advancedResult.Content.Length} chars");

// Minimal configuration with only enabled flag
var minimalConfig = new ExtractionConfig
{
    PdfOptions = new PdfConfig
    {
        Hierarchy = new HierarchyConfig
        {
            Enabled = true,
            // Other properties use defaults:
            // KClusters = 6
            // IncludeBbox = true
        }
    }
};

var minimalResult = await KreuzbergLib.ExtractFileAsync("document.pdf", minimalConfig);
Console.WriteLine("Extraction with default hierarchy settings complete");

// Disabling hierarchy detection
var noHierarchyConfig = new ExtractionConfig
{
    PdfOptions = new PdfConfig
    {
        Hierarchy = new HierarchyConfig
        {
            Enabled = false
        }
    }
};

var noHierarchyResult = await KreuzbergLib.ExtractFileAsync("document.pdf", noHierarchyConfig);
Console.WriteLine("Extraction without hierarchy detection complete");
|
||||
```
|
||||
13
docs/snippets/csharp/config/postprocessor.cs
Normal file
13
docs/snippets/csharp/config/postprocessor.cs
Normal file
@@ -0,0 +1,13 @@
|
||||
using Kreuzberg;
|
||||
|
||||
var config = new ExtractionConfig
|
||||
{
|
||||
UseCache = true,
|
||||
Postprocessor = new PostProcessorConfig
|
||||
{
|
||||
Enabled = true,
|
||||
EnabledProcessors = new List<string> { "normalize_whitespace", "remove_diacritics" }
|
||||
}
|
||||
};
|
||||
|
||||
var result = KreuzbergLib.ExtractFileSync("document.pdf", config);
|
||||
20
docs/snippets/csharp/config/postprocessor_config.md
Normal file
20
docs/snippets/csharp/config/postprocessor_config.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
var config = new ExtractionConfig
|
||||
{
|
||||
Postprocessor = new PostProcessorConfig
|
||||
{
|
||||
Enabled = true,
|
||||
EnabledProcessors = new List<string>
|
||||
{
|
||||
"whitespace_normalizer",
|
||||
"unicode_normalizer"
|
||||
},
|
||||
DisabledProcessors = null
|
||||
}
|
||||
};
|
||||
|
||||
var result = await KreuzbergLib.ExtractFile("document.pdf", null, config);
|
||||
Console.WriteLine($"Processed content: {result.Content.Substring(0, Math.Min(100, result.Content.Length))}");
|
||||
```
|
||||
13
docs/snippets/csharp/config/quality_processing_config.md
Normal file
13
docs/snippets/csharp/config/quality_processing_config.md
Normal file
@@ -0,0 +1,13 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
var config = new ExtractionConfig
|
||||
{
|
||||
EnableQualityProcessing = true,
|
||||
UseCache = true
|
||||
};
|
||||
|
||||
var result = await KreuzbergLib.ExtractFile("document.pdf", null, config);
|
||||
Console.WriteLine($"Quality score: {result.QualityScore}");
|
||||
Console.WriteLine($"Content length: {result.Content.Length}");
|
||||
```
|
||||
22
docs/snippets/csharp/config/tesseract_config.md
Normal file
22
docs/snippets/csharp/config/tesseract_config.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
var config = new ExtractionConfig
|
||||
{
|
||||
Ocr = new OcrConfig
|
||||
{
|
||||
Backend = "tesseract",
|
||||
Language = "eng+deu",
|
||||
TesseractConfig = new TesseractConfig
|
||||
{
|
||||
Psm = 6,
|
||||
Oem = 3,
|
||||
MinConfidence = 0.5,
|
||||
Language = "eng"
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
var result = await KreuzbergLib.ExtractFile("scanned.pdf", null, config);
|
||||
Console.WriteLine($"OCR text: {result.Content.Substring(0, Math.Min(100, result.Content.Length))}");
|
||||
```
|
||||
16
docs/snippets/csharp/config/token_reduction_config.md
Normal file
16
docs/snippets/csharp/config/token_reduction_config.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
// Enable token reduction; the type name follows the *Config naming used by
// every other configuration type (see the TokenReduction entry in the full example).
var config = new ExtractionConfig
{
    TokenReduction = new TokenReductionConfig
    {
        Mode = "moderate",
        PreserveImportantWords = true
    }
};

var result = await KreuzbergLib.ExtractFile("document.pdf", null, config);
Console.WriteLine($"Reduced content length: {result.Content.Length}");

// Print at most the first 100 characters; Math.Min avoids going past the end of short content.
Console.WriteLine($"Content: {result.Content.Substring(0, Math.Min(100, result.Content.Length))}");
|
||||
```
|
||||
18
docs/snippets/csharp/config/validator.cs
Normal file
18
docs/snippets/csharp/config/validator.cs
Normal file
@@ -0,0 +1,18 @@
|
||||
using Kreuzberg;
|
||||
|
||||
var config = new ExtractionConfig
|
||||
{
|
||||
UseCache = true,
|
||||
EnableQualityProcessing = true
|
||||
};
|
||||
|
||||
var result = KreuzbergLib.ExtractFileSync("document.pdf", config);
|
||||
|
||||
if (!result.Success)
|
||||
{
|
||||
if (result.Metadata?.Error != null)
|
||||
{
|
||||
var errorType = result.Metadata.Error.ErrorType;
|
||||
var errorMessage = result.Metadata.Error.Message;
|
||||
}
|
||||
}
|
||||
13
docs/snippets/csharp/config/with_cache.cs
Normal file
13
docs/snippets/csharp/config/with_cache.cs
Normal file
@@ -0,0 +1,13 @@
|
||||
using Kreuzberg;
|
||||
|
||||
var config = new ExtractionConfig
|
||||
{
|
||||
UseCache = true,
|
||||
Ocr = new OcrConfig
|
||||
{
|
||||
Backend = "tesseract",
|
||||
Language = "eng"
|
||||
}
|
||||
};
|
||||
|
||||
var result = KreuzbergLib.ExtractFileSync("document.pdf", config);
|
||||
10
docs/snippets/csharp/config/with_timeout.cs
Normal file
10
docs/snippets/csharp/config/with_timeout.cs
Normal file
@@ -0,0 +1,10 @@
|
||||
using Kreuzberg;
|
||||
|
||||
var config = new ExtractionConfig
{
    UseCache = true,
    EnableQualityProcessing = true
};

// The token is cancelled automatically after 30 seconds. The source owns a timer,
// so it is disposed at the end of the scope via the using declaration.
using var cts = new System.Threading.CancellationTokenSource(TimeSpan.FromSeconds(30));
var result = await KreuzbergLib.ExtractFileAsync("document.pdf", config, cts.Token);
|
||||
12
docs/snippets/csharp/config_basic.md
Normal file
12
docs/snippets/csharp/config_basic.md
Normal file
@@ -0,0 +1,12 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
var config = new ExtractionConfig
|
||||
{
|
||||
UseCache = true,
|
||||
EnableQualityProcessing = true
|
||||
};
|
||||
|
||||
var result = await KreuzbergLib.ExtractFileAsync("document.pdf", config);
|
||||
Console.WriteLine(result.Content);
|
||||
```
|
||||
9
docs/snippets/csharp/config_discover.md
Normal file
9
docs/snippets/csharp/config_discover.md
Normal file
@@ -0,0 +1,9 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
var config = new ExtractionConfig();
|
||||
var result = await KreuzbergLib.ExtractFileAsync("document.pdf", config);
|
||||
|
||||
Console.WriteLine(result.Content[..Math.Min(100, result.Content.Length)]);
|
||||
Console.WriteLine($"Total length: {result.Content.Length}");
|
||||
```
|
||||
16
docs/snippets/csharp/config_ocr.md
Normal file
16
docs/snippets/csharp/config_ocr.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
var config = new ExtractionConfig
|
||||
{
|
||||
Ocr = new OcrConfig
|
||||
{
|
||||
Backend = "tesseract",
|
||||
Language = "eng+fra",
|
||||
TesseractConfig = new TesseractConfig { Psm = 3 }
|
||||
}
|
||||
};
|
||||
|
||||
var result = await KreuzbergLib.ExtractFileAsync("document.pdf", config);
|
||||
Console.WriteLine(result.Content);
|
||||
```
|
||||
96
docs/snippets/csharp/docker/usage.cs
Normal file
96
docs/snippets/csharp/docker/usage.cs
Normal file
@@ -0,0 +1,96 @@
|
||||
```csharp title="usage.cs"
|
||||
using System;
using System.Diagnostics;
using System.IO;
using System.Net.Http;
using System.Text.Json;
using System.Threading.Tasks;
|
||||
|
||||
var dockerClient = new DockerKreuzbergLib();
|
||||
|
||||
try
|
||||
{
|
||||
await dockerClient.StartContainerAsync();
|
||||
await Task.Delay(2000);
|
||||
|
||||
var content = await dockerClient.ExtractFileAsync("document.pdf");
|
||||
Console.WriteLine($"Extracted content:\n{content}");
|
||||
}
|
||||
finally
|
||||
{
|
||||
await dockerClient.StopContainerAsync();
|
||||
}
|
||||
|
||||
/// <summary>
/// Small helper that drives the Kreuzberg API container through the
/// <c>docker</c> command line and posts files to its HTTP endpoint.
/// </summary>
class DockerKreuzbergLib
{
    private const string ContainerName = "kreuzberg-api";
    private const string ContainerImage = "kreuzberg:latest";
    private const int ApiPort = 8000;

    // One shared client for all requests instead of a new HttpClient per call.
    private static readonly HttpClient Http = new HttpClient();

    /// <summary>
    /// Runs <c>docker run -d</c> for the Kreuzberg image and maps the API port.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// The docker process could not be started or exited with a non-zero code.
    /// </exception>
    public async Task StartContainerAsync()
    {
        Console.WriteLine("Starting Kreuzberg Docker container...");

        var exitCode = await RunDockerAsync(
            $"run -d --name {ContainerName} -p {ApiPort}:8000 {ContainerImage}",
            captureOutput: true);

        if (exitCode != 0)
        {
            throw new InvalidOperationException($"'docker run' exited with code {exitCode}.");
        }

        Console.WriteLine($"Container started on http://localhost:{ApiPort}");
    }

    /// <summary>
    /// Uploads a file as multipart form data to the container's extract
    /// endpoint and returns the <c>content</c> property of the JSON response.
    /// </summary>
    /// <param name="filePath">Path of the file to upload.</param>
    /// <returns>The extracted content, or an empty string when the property is JSON null.</returns>
    /// <exception cref="HttpRequestException">The API returned a non-success status code.</exception>
    public async Task<string> ExtractFileAsync(string filePath)
    {
        var fileBytes = await File.ReadAllBytesAsync(filePath);

        using var content = new MultipartFormDataContent();
        content.Add(new ByteArrayContent(fileBytes), "file", Path.GetFileName(filePath));

        using var response = await Http.PostAsync(
            $"http://localhost:{ApiPort}/api/extract",
            content);

        response.EnsureSuccessStatusCode();

        var json = await response.Content.ReadAsStringAsync();
        var result = JsonSerializer.Deserialize<JsonElement>(json);

        // GetString() yields null for a JSON null; normalise to an empty string.
        return result.GetProperty("content").GetString() ?? string.Empty;
    }

    /// <summary>
    /// Stops and then removes the container. Exit codes are intentionally
    /// ignored so cleanup stays best-effort (for example when the container
    /// never started).
    /// </summary>
    public async Task StopContainerAsync()
    {
        Console.WriteLine("Stopping Kreuzberg Docker container...");

        await RunDockerAsync($"stop {ContainerName}");
        await RunDockerAsync($"rm {ContainerName}");

        Console.WriteLine("Container stopped and removed");
    }

    // Runs `docker <arguments>`, waits for it to finish and returns its exit code.
    private static async Task<int> RunDockerAsync(string arguments, bool captureOutput = false)
    {
        var startInfo = new ProcessStartInfo
        {
            FileName = "docker",
            Arguments = arguments,
            UseShellExecute = false,
            RedirectStandardOutput = captureOutput,
        };

        using var process = Process.Start(startInfo)
            ?? throw new InvalidOperationException("Failed to start the 'docker' process.");

        if (captureOutput)
        {
            // Drain the redirected stream before waiting so the child process
            // cannot block on a full output pipe.
            await process.StandardOutput.ReadToEndAsync();
        }

        await process.WaitForExitAsync();
        return process.ExitCode;
    }
}
|
||||
```
|
||||
23
docs/snippets/csharp/error_handling.md
Normal file
23
docs/snippets/csharp/error_handling.md
Normal file
@@ -0,0 +1,23 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
try
|
||||
{
|
||||
var result = KreuzbergLib.ExtractFileSync("missing.pdf");
|
||||
Console.WriteLine(result.Content);
|
||||
}
|
||||
catch (KreuzbergValidationException ex)
|
||||
{
|
||||
Console.Error.WriteLine($"Validation error: {ex.Message}");
|
||||
}
|
||||
catch (KreuzbergIOException ex)
|
||||
{
|
||||
Console.Error.WriteLine($"IO error: {ex.Message}");
|
||||
throw;
|
||||
}
|
||||
catch (KreuzbergException ex)
|
||||
{
|
||||
Console.Error.WriteLine($"Extraction failed: {ex.Message}");
|
||||
throw;
|
||||
}
|
||||
```
|
||||
39
docs/snippets/csharp/error_handling_extract.md
Normal file
39
docs/snippets/csharp/error_handling_extract.md
Normal file
@@ -0,0 +1,39 @@
|
||||
```csharp title="C#"
|
||||
using System;
|
||||
using System.IO;
|
||||
using System.Net.Http;
|
||||
using System.Text.Json;
|
||||
|
||||
// The client, request content, response and parsed error document are all
// disposable; using declarations release them when their scope ends.
using var client = new HttpClient();

try
{
    using var fileStream = File.OpenRead("document.pdf");
    using var content = new MultipartFormDataContent();
    content.Add(new StreamContent(fileStream), "files", "document.pdf");

    using var response = await client.PostAsync("http://localhost:8000/extract", content);

    if (!response.IsSuccessStatusCode)
    {
        // On failure the body is read as JSON with "error_type" and "message" fields.
        var errorJson = await response.Content.ReadAsStringAsync();
        using var errorDoc = JsonDocument.Parse(errorJson);
        var errorType = errorDoc.RootElement.GetProperty("error_type").GetString();
        var message = errorDoc.RootElement.GetProperty("message").GetString();

        Console.WriteLine($"Error: {errorType}: {message}");
        return;
    }

    var json = await response.Content.ReadAsStringAsync();
    Console.WriteLine($"Success: {json}");
}
catch (HttpRequestException e)
{
    // Failures raised by the HTTP request itself (PostAsync throws HttpRequestException).
    Console.WriteLine($"Request failed: {e.Message}");
}
|
||||
```
|
||||
9
docs/snippets/csharp/extract_bytes_async.md
Normal file
9
docs/snippets/csharp/extract_bytes_async.md
Normal file
@@ -0,0 +1,9 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
var data = await File.ReadAllBytesAsync("document.pdf");
|
||||
var result = await KreuzbergLib.ExtractBytesAsync(data, "application/pdf");
|
||||
|
||||
Console.WriteLine(result.Content);
|
||||
Console.WriteLine(result.MimeType);
|
||||
```
|
||||
9
docs/snippets/csharp/extract_bytes_sync.md
Normal file
9
docs/snippets/csharp/extract_bytes_sync.md
Normal file
@@ -0,0 +1,9 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
// Synchronous variant end to end: read the file with the blocking API so the
// snippet needs no await, matching the ExtractBytesSync call below.
var data = File.ReadAllBytes("document.pdf");
var result = KreuzbergLib.ExtractBytesSync(data, "application/pdf");

Console.WriteLine(result.Content);
Console.WriteLine(result.MimeType);
|
||||
```
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user