This commit is contained in:
16
docs/snippets/csharp/utils/chunking.md
Normal file
16
docs/snippets/csharp/utils/chunking.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
// Embedding settings attached to the chunking step; "balanced" selects a model preset by name.
var embedding = new EmbeddingConfig
{
    Model = new EmbeddingModelType.Preset("balanced"),
};

// Chunking settings: MaxCharacters = 1500, Overlap = 200.
var chunking = new ChunkingConfig
{
    MaxCharacters = 1500,
    Overlap = 200,
    Embedding = embedding,
};

// Top-level extraction configuration that enables chunking with the settings above.
var config = new ExtractionConfig { Chunking = chunking };
|
||||
```
|
||||
32
docs/snippets/csharp/utils/chunking_rag.md
Normal file
32
docs/snippets/csharp/utils/chunking_rag.md
Normal file
@@ -0,0 +1,32 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
// Embedding settings used for every chunk produced by the extraction below.
var embeddingOptions = new EmbeddingConfig
{
    Model = new EmbeddingModelType.Preset("balanced"),
    Normalize = true,
    BatchSize = 16,
};

var config = new ExtractionConfig
{
    Chunking = new ChunkingConfig
    {
        MaxCharacters = 500,
        Overlap = 50,
        Embedding = embeddingOptions,
    },
};

var result = await KreuzbergLib.ExtractFile("research_paper.pdf", null, config);

// Collect a short text preview and the embedding length for each chunk
// that carries a non-empty embedding; chunks without one are skipped.
var chunksWithEmbeddings = new List<(string Preview, int Dimensions)>();
var allChunks = result.Chunks ?? new List<Chunk>();
foreach (var chunk in allChunks)
{
    if (chunk.Embedding is { Count: > 0 } vector)
    {
        var text = chunk.Content;

        // Previews are limited to the first 100 characters of the chunk text.
        var preview = text.Length > 100 ? text[..100] : text;
        chunksWithEmbeddings.Add((preview, vector.Count));
    }
}

Console.WriteLine($"Chunks with embeddings: {chunksWithEmbeddings.Count}");
|
||||
```
|
||||
46
docs/snippets/csharp/utils/detect_language.cs
Normal file
46
docs/snippets/csharp/utils/detect_language.cs
Normal file
@@ -0,0 +1,46 @@
|
||||
```csharp title="detect_language.cs"
|
||||
using Kreuzberg;
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
|
||||
// Example 1: language detection with DetectMultiple disabled.
var config = new ExtractionConfig
{
    LanguageDetection = new LanguageDetectionConfig
    {
        Enabled = true,
        MinConfidence = 0.9,
        DetectMultiple = false
    }
};

var result = KreuzbergLib.ExtractFileSync("document.pdf", config);

Console.WriteLine("Detected Language:");
foreach (var lang in result.DetectedLanguages)
{
    Console.WriteLine($" - {lang}");
}

// Example 2: same pipeline with DetectMultiple enabled and a lower confidence threshold.
var multiLangConfig = new ExtractionConfig
{
    LanguageDetection = new LanguageDetectionConfig
    {
        Enabled = true,
        MinConfidence = 0.8,
        DetectMultiple = true
    }
};

var multiResult = KreuzbergLib.ExtractFileSync("multilingual_document.pdf", multiLangConfig);

Console.WriteLine("Detected Languages:");
foreach (var lang in multiResult.DetectedLanguages)
{
    Console.WriteLine($" - {lang}");
}

// Build the content preview defensively: Substring(0, 100) throws
// ArgumentOutOfRangeException when the extracted text has fewer than 100 characters,
// so only slice when the content is actually longer than the preview length.
var content = multiResult.Content;
var contentPreview = content.Length > 100 ? content[..100] : content;

Console.WriteLine($"\nLanguage Detection Summary:");
Console.WriteLine($" - Content: {contentPreview}...");
Console.WriteLine($" - Languages: {string.Join(", ", multiResult.DetectedLanguages)}");
Console.WriteLine($" - Quality Score: {multiResult.Metadata.QualityScore}");
|
||||
```
|
||||
19
docs/snippets/csharp/utils/embedding_with_chunking.md
Normal file
19
docs/snippets/csharp/utils/embedding_with_chunking.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
// Embedding settings for chunk vectors; download progress output is switched off.
var embedding = new EmbeddingConfig
{
    Model = new EmbeddingModelType.Preset("balanced"),
    Normalize = true,
    BatchSize = 32,
    ShowDownloadProgress = false,
};

// Chunking settings: MaxCharacters = 1024, Overlap = 100, with embeddings attached.
var chunking = new ChunkingConfig
{
    MaxCharacters = 1024,
    Overlap = 100,
    Embedding = embedding,
};

var config = new ExtractionConfig { Chunking = chunking };
|
||||
```
|
||||
62
docs/snippets/csharp/utils/extract_keywords.cs
Normal file
62
docs/snippets/csharp/utils/extract_keywords.cs
Normal file
@@ -0,0 +1,62 @@
|
||||
```csharp title="extract_keywords.cs"
|
||||
using Kreuzberg;
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
|
||||
// Example 1: keyword extraction with the YAKE algorithm.
var config = new ExtractionConfig
{
    Keywords = new KeywordConfig
    {
        Algorithm = KeywordAlgorithm.YAKE,
        MaxKeywords = 10,
        MinScore = 0.3
    }
};

var result = KreuzbergLib.ExtractFileSync("research_paper.pdf", config);

Console.WriteLine("Extracted Keywords:");
if (result.Metadata.Keywords != null)
{
    // List keywords from highest to lowest score.
    foreach (var keyword in result.Metadata.Keywords.OrderByDescending(k => k.Score))
    {
        Console.WriteLine($" - {keyword.Text}: {keyword.Score:F3}");
    }
}
else
{
    Console.WriteLine(" (No keywords extracted)");
}

// Example 2: keyword extraction with the TF-IDF algorithm.
var tfidfConfig = new ExtractionConfig
{
    Keywords = new KeywordConfig
    {
        Algorithm = KeywordAlgorithm.TfIdf,
        MaxKeywords = 15,
        MinScore = 0.2
    }
};

var tfidfResult = KreuzbergLib.ExtractFileSync("document.pdf", tfidfConfig);

Console.WriteLine("\nTF-IDF Keywords:");
if (tfidfResult.Metadata.Keywords != null)
{
    // Only the ten highest-scoring keywords are printed.
    var topKeywords = tfidfResult.Metadata.Keywords
        .OrderByDescending(k => k.Score)
        .Take(10)
        .ToList();

    foreach (var keyword in topKeywords)
    {
        Console.WriteLine($" - {keyword.Text}: {keyword.Score:F3}");
    }
}
else
{
    // Same fallback as the YAKE section so an empty result is visible in the output.
    Console.WriteLine(" (No keywords extracted)");
}

// Summary of the YAKE run. The top keyword is the highest-scoring one, using the same
// ordering as the listing above; taking the first element of the unsorted collection
// would report an arbitrary keyword instead.
var yakeKeywords = result.Metadata.Keywords;
var topKeyword = yakeKeywords?.OrderByDescending(k => k.Score).FirstOrDefault();

Console.WriteLine($"\nKeyword Extraction Summary:");
Console.WriteLine($" - Algorithm: YAKE");
Console.WriteLine($" - Total Keywords: {yakeKeywords?.Count ?? 0}");
Console.WriteLine($" - Top Keyword: {topKeyword?.Text}");
|
||||
```
|
||||
20
docs/snippets/csharp/utils/keyword_extraction_example.md
Normal file
20
docs/snippets/csharp/utils/keyword_extraction_example.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
// Keyword extraction settings: YAKE algorithm, MaxKeywords = 10, MinScore = 0.3f.
var keywordOptions = new KeywordConfig
{
    Algorithm = KeywordAlgorithm.YAKE,
    MaxKeywords = 10,
    MinScore = 0.3f,
};
var config = new ExtractionConfig { Keywords = keywordOptions };

var result = await KreuzbergLib.ExtractFile("research_paper.pdf", null, config);

// Fall back to an empty list when the result carries no keywords, so the loop is a no-op.
var keywords = result.ExtractedKeywords ?? new List<Keyword>();
foreach (var entry in keywords)
{
    Console.WriteLine($"{entry.Text}: {entry.Score:F3}");
}
|
||||
```
|
||||
18
docs/snippets/csharp/utils/quality_processing_example.md
Normal file
18
docs/snippets/csharp/utils/quality_processing_example.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
var config = new ExtractionConfig
{
    EnableQualityProcessing = true,
};
var result = await KreuzbergLib.ExtractFile("scanned_document.pdf", null, config);

// A missing score is treated as 0.0, which falls into the low-quality branch below.
var qualityScore = result.QualityScore ?? 0.0;
var isLowQuality = qualityScore < 0.5;

if (!isLowQuality)
{
    Console.WriteLine($"Quality score: {qualityScore:F2}");
}
else
{
    Console.WriteLine($"Warning: Low quality extraction ({qualityScore:F2})");
    Console.WriteLine("Consider re-scanning with higher DPI or adjusting OCR settings");
}
|
||||
```
|
||||
17
docs/snippets/csharp/utils/standalone_embed.md
Normal file
17
docs/snippets/csharp/utils/standalone_embed.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
var client = new KreuzbergLib();

// The preset is constructed with `new`, matching how every other snippet in this
// documentation set creates it (`new EmbeddingModelType.Preset("balanced")`).
var config = new EmbeddingConfig
{
    Model = new EmbeddingModelType.Preset("balanced"),
    Normalize = true,
};
var texts = new[] { "Hello, world!", "Kreuzberg is fast" };

// Synchronous: one embedding is returned per input text.
var embeddings = client.EmbedSync(texts, config).ToList();
Console.WriteLine(embeddings.Count); // 2
Console.WriteLine(embeddings[0].Length); // 768

// Asynchronous: same call shape, awaited.
var asyncEmbeddings = await client.EmbedAsync(texts, config);
Console.WriteLine(asyncEmbeddings.First().Length); // 768
|
||||
```
|
||||
15
docs/snippets/csharp/utils/token_reduction.md
Normal file
15
docs/snippets/csharp/utils/token_reduction.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
// Token reduction settings: "moderate" mode with PreserveImportantWords enabled.
var reductionOptions = new TokenReductionOptions
{
    Mode = "moderate",
    PreserveImportantWords = true,
};
var config = new ExtractionConfig { TokenReduction = reductionOptions };

var result = await KreuzbergLib.ExtractFile("document.pdf", null, config);

// Report the length of the extracted content.
var contentLength = result.Content.Length;
Console.WriteLine($"Content length: {contentLength}");
|
||||
```
|
||||
22
docs/snippets/csharp/utils/token_reduction_example.md
Normal file
22
docs/snippets/csharp/utils/token_reduction_example.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
// Token reduction settings: "moderate" mode with PreserveImportantWords enabled.
var reductionOptions = new TokenReductionOptions
{
    Mode = "moderate",
    PreserveImportantWords = true,
};
var config = new ExtractionConfig { TokenReduction = reductionOptions };

var result = await KreuzbergLib.ExtractFile("verbose_document.pdf", null, config);

// Token statistics are looked up in the metadata's Additional dictionary;
// each lookup falls back to zero when its key is absent.
var stats = result.Metadata.Additional;
var tokensBefore = stats.TryGetValue("original_token_count", out var beforeValue) ? beforeValue : 0;
var tokensAfter = stats.TryGetValue("token_count", out var afterValue) ? afterValue : 0;
var reductionRatio = stats.TryGetValue("token_reduction_ratio", out var ratioValue) ? ratioValue : 0.0;

Console.WriteLine($"Reduced from {tokensBefore} to {tokensAfter} tokens");

// The ratio is converted to a percentage for display.
var reductionPercent = Convert.ToDouble(reductionRatio) * 100;
Console.WriteLine($"Reduction: {reductionPercent:F1}%");
|
||||
```
|
||||
29
docs/snippets/csharp/utils/vector_database_integration.md
Normal file
29
docs/snippets/csharp/utils/vector_database_integration.md
Normal file
@@ -0,0 +1,29 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
// Embedding settings used for each chunk.
var embedding = new EmbeddingConfig
{
    Model = new EmbeddingModelType.Preset("balanced"),
    Normalize = true,
};

var config = new ExtractionConfig
{
    Chunking = new ChunkingConfig
    {
        MaxCharacters = 512,
        Overlap = 50,
        Embedding = embedding,
    },
};

var result = await KreuzbergLib.ExtractFile("document.pdf", null, config);

// Walk the chunks in order, assigning each a positional identifier and
// printing the first 50 characters of its text as a preview.
var chunks = result.Chunks ?? new List<Chunk>();
var position = 0;
foreach (var chunk in chunks)
{
    var chunkId = $"doc_chunk_{position}";
    var text = chunk.Content;
    var preview = text.Length > 50 ? text[..50] : text;

    Console.WriteLine($"Chunk {chunkId}: {preview}");
    position++;
}
|
||||
}
|
||||
```
|
||||
Reference in New Issue
Block a user