Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,16 @@
```csharp title="C#"
using Kreuzberg;
// Embedding model is selected through the "balanced" preset.
var embedding = new EmbeddingConfig
{
    Model = new EmbeddingModelType.Preset("balanced"),
};

// Chunk sizing, with the embedding settings attached.
var chunking = new ChunkingConfig
{
    MaxCharacters = 1500,
    Overlap = 200,
    Embedding = embedding,
};

var config = new ExtractionConfig { Chunking = chunking };
```

View File

@@ -0,0 +1,32 @@
```csharp title="C#"
using Kreuzberg;
// Chunking configuration with embedding settings attached.
var embeddingOptions = new EmbeddingConfig
{
    Model = new EmbeddingModelType.Preset("balanced"),
    Normalize = true,
    BatchSize = 16,
};
var config = new ExtractionConfig
{
    Chunking = new ChunkingConfig
    {
        MaxCharacters = 500,
        Overlap = 50,
        Embedding = embeddingOptions,
    },
};

var result = await KreuzbergLib.ExtractFile("research_paper.pdf", null, config);

// For every chunk that carries a non-empty embedding, record a short
// content preview together with the embedding's element count.
const int maxPreviewLength = 100;
var chunksWithEmbeddings = new List<(string Preview, int Dimensions)>();
foreach (var chunk in result.Chunks ?? new List<Chunk>())
{
    if (chunk.Embedding is not { Count: > 0 } vector)
    {
        // Embedding is null or empty: nothing to record for this chunk.
        continue;
    }

    var text = chunk.Content;
    var preview = text.Length > maxPreviewLength ? text[..maxPreviewLength] : text;
    chunksWithEmbeddings.Add((preview, vector.Count));
}

Console.WriteLine($"Chunks with embeddings: {chunksWithEmbeddings.Count}");
```

View File

@@ -0,0 +1,46 @@
```csharp title="detect_language.cs"
using Kreuzberg;
using System;
using System.Collections.Generic;
// Pass 1: language detection with DetectMultiple disabled.
var config = new ExtractionConfig
{
    LanguageDetection = new LanguageDetectionConfig
    {
        Enabled = true,
        MinConfidence = 0.9,
        DetectMultiple = false
    }
};
var result = KreuzbergLib.ExtractFileSync("document.pdf", config);
Console.WriteLine("Detected Language:");
foreach (var lang in result.DetectedLanguages)
{
    Console.WriteLine($" - {lang}");
}

// Pass 2: language detection with DetectMultiple enabled and a lower
// confidence threshold.
var multiLangConfig = new ExtractionConfig
{
    LanguageDetection = new LanguageDetectionConfig
    {
        Enabled = true,
        MinConfidence = 0.8,
        DetectMultiple = true
    }
};
var multiResult = KreuzbergLib.ExtractFileSync("multilingual_document.pdf", multiLangConfig);
Console.WriteLine("Detected Languages:");
foreach (var lang in multiResult.DetectedLanguages)
{
    Console.WriteLine($" - {lang}");
}

// Build the preview with a length guard: Substring(0, 100) throws
// ArgumentOutOfRangeException when the content is shorter than 100 characters.
const int previewLength = 100;
var content = multiResult.Content;
var contentPreview = content.Length > previewLength ? content[..previewLength] : content;

Console.WriteLine("\nLanguage Detection Summary:");
Console.WriteLine($" - Content: {contentPreview}...");
Console.WriteLine($" - Languages: {string.Join(", ", multiResult.DetectedLanguages)}");
Console.WriteLine($" - Quality Score: {multiResult.Metadata.QualityScore}");
```

View File

@@ -0,0 +1,19 @@
```csharp title="C#"
using Kreuzberg;
// Embedding settings: "balanced" preset, normalization on, batches of 32,
// download progress output turned off.
var embedding = new EmbeddingConfig
{
    Model = new EmbeddingModelType.Preset("balanced"),
    Normalize = true,
    BatchSize = 32,
    ShowDownloadProgress = false,
};

// Chunk sizing, with the embedding settings attached.
var chunking = new ChunkingConfig
{
    MaxCharacters = 1024,
    Overlap = 100,
    Embedding = embedding,
};

var config = new ExtractionConfig { Chunking = chunking };
```

View File

@@ -0,0 +1,62 @@
```csharp title="extract_keywords.cs"
using Kreuzberg;
using System;
using System.Collections.Generic;
using System.Linq;
// Pass 1: YAKE keyword extraction.
var config = new ExtractionConfig
{
    Keywords = new KeywordConfig
    {
        Algorithm = KeywordAlgorithm.YAKE,
        MaxKeywords = 10,
        MinScore = 0.3
    }
};
var result = KreuzbergLib.ExtractFileSync("research_paper.pdf", config);

// Rank once by descending score; the same ranking feeds both the listing
// and the summary so "Top Keyword" is the highest-scoring entry rather than
// whichever element happens to come first in the unordered collection.
// Stays null when no keyword collection was returned.
var rankedKeywords = result.Metadata.Keywords?
    .OrderByDescending(k => k.Score)
    .ToList();

Console.WriteLine("Extracted Keywords:");
if (rankedKeywords is not null)
{
    foreach (var keyword in rankedKeywords)
    {
        Console.WriteLine($" - {keyword.Text}: {keyword.Score:F3}");
    }
}
else
{
    Console.WriteLine(" (No keywords extracted)");
}

// Pass 2: TF-IDF keyword extraction on a second document.
var tfidfConfig = new ExtractionConfig
{
    Keywords = new KeywordConfig
    {
        Algorithm = KeywordAlgorithm.TfIdf,
        MaxKeywords = 15,
        MinScore = 0.2
    }
};
var tfidfResult = KreuzbergLib.ExtractFileSync("document.pdf", tfidfConfig);
Console.WriteLine("\nTF-IDF Keywords:");
if (tfidfResult.Metadata.Keywords != null)
{
    // Only the ten highest-scoring keywords are printed.
    var topKeywords = tfidfResult.Metadata.Keywords
        .OrderByDescending(k => k.Score)
        .Take(10)
        .ToList();
    foreach (var keyword in topKeywords)
    {
        Console.WriteLine($" - {keyword.Text}: {keyword.Score:F3}");
    }
}

// Summary of the YAKE pass.
Console.WriteLine("\nKeyword Extraction Summary:");
Console.WriteLine(" - Algorithm: YAKE");
Console.WriteLine($" - Total Keywords: {rankedKeywords?.Count ?? 0}");
Console.WriteLine($" - Top Keyword: {rankedKeywords?.FirstOrDefault()?.Text}");
```

View File

@@ -0,0 +1,20 @@
```csharp title="C#"
using Kreuzberg;
// YAKE keyword extraction settings.
var keywordOptions = new KeywordConfig
{
    Algorithm = KeywordAlgorithm.YAKE,
    MaxKeywords = 10,
    MinScore = 0.3f,
};
var config = new ExtractionConfig { Keywords = keywordOptions };

var result = await KreuzbergLib.ExtractFile("research_paper.pdf", null, config);

// Fall back to an empty list so the loop is a no-op when nothing was extracted.
var keywords = result.ExtractedKeywords ?? new List<Keyword>();
foreach (var entry in keywords)
{
    Console.WriteLine($"{entry.Text}: {entry.Score:F3}");
}
```

View File

@@ -0,0 +1,18 @@
```csharp title="C#"
using Kreuzberg;
var config = new ExtractionConfig
{
    EnableQualityProcessing = true,
};
var result = await KreuzbergLib.ExtractFile("scanned_document.pdf", null, config);

// A missing score is treated as 0.0, which lands in the low-quality branch.
var qualityScore = result.QualityScore ?? 0.0;
var isLowQuality = qualityScore < 0.5;

if (isLowQuality)
{
    Console.WriteLine($"Warning: Low quality extraction ({qualityScore:F2})");
    Console.WriteLine("Consider re-scanning with higher DPI or adjusting OCR settings");
}
else
{
    Console.WriteLine($"Quality score: {qualityScore:F2}");
}
```

View File

@@ -0,0 +1,17 @@
```csharp title="C#"
using Kreuzberg;
var client = new KreuzbergLib();

// Embedding settings: "balanced" preset with normalization enabled.
var config = new EmbeddingConfig
{
    Model = new EmbeddingModelType.Preset("balanced"),
    Normalize = true,
};
var texts = new[] { "Hello, world!", "Kreuzberg is fast" };

// Synchronous
var embeddings = client.EmbedSync(texts, config).ToList();
Console.WriteLine(embeddings.Count); // 2
Console.WriteLine(embeddings[0].Length); // 768

// Asynchronous
var asyncEmbeddings = await client.EmbedAsync(texts, config);
Console.WriteLine(asyncEmbeddings.First().Length); // 768
```

View File

@@ -0,0 +1,15 @@
```csharp title="C#"
using Kreuzberg;
// Token reduction in "moderate" mode, with PreserveImportantWords enabled.
var reduction = new TokenReductionOptions
{
    Mode = "moderate",
    PreserveImportantWords = true,
};
var config = new ExtractionConfig { TokenReduction = reduction };

var result = await KreuzbergLib.ExtractFile("document.pdf", null, config);

var contentLength = result.Content.Length;
Console.WriteLine($"Content length: {contentLength}");
```

View File

@@ -0,0 +1,22 @@
```csharp title="C#"
using Kreuzberg;
// Token reduction in "moderate" mode, with PreserveImportantWords enabled.
var reduction = new TokenReductionOptions
{
    Mode = "moderate",
    PreserveImportantWords = true,
};
var config = new ExtractionConfig { TokenReduction = reduction };

var result = await KreuzbergLib.ExtractFile("verbose_document.pdf", null, config);

// Read the token statistics from the additional metadata, defaulting each
// one to zero when its key is absent.
var stats = result.Metadata.Additional;
var original = stats.TryGetValue("original_token_count", out var originalCount) ? originalCount : 0;
var reduced = stats.TryGetValue("token_count", out var reducedCount) ? reducedCount : 0;
var ratio = stats.TryGetValue("token_reduction_ratio", out var reductionRatio) ? reductionRatio : 0.0;

// The ratio is scaled by 100 for display as a percentage.
var reductionPercent = Convert.ToDouble(ratio) * 100;

Console.WriteLine($"Reduced from {original} to {reduced} tokens");
Console.WriteLine($"Reduction: {reductionPercent:F1}%");
```

View File

@@ -0,0 +1,29 @@
```csharp title="C#"
using Kreuzberg;
// Chunking configuration with embedding settings attached.
var embedding = new EmbeddingConfig
{
    Model = new EmbeddingModelType.Preset("balanced"),
    Normalize = true,
};
var config = new ExtractionConfig
{
    Chunking = new ChunkingConfig
    {
        MaxCharacters = 512,
        Overlap = 50,
        Embedding = embedding,
    },
};

var result = await KreuzbergLib.ExtractFile("document.pdf", null, config);

// Print an id (derived from the chunk's position) and a preview of at most
// 50 characters for each chunk; a missing chunk list is treated as empty.
const int maxPreviewLength = 50;
var chunks = result.Chunks ?? new List<Chunk>();
var position = 0;
foreach (var chunk in chunks)
{
    var chunkId = $"doc_chunk_{position}";
    var text = chunk.Content;
    var preview = text.Length > maxPreviewLength ? text[..maxPreviewLength] : text;
    Console.WriteLine($"Chunk {chunkId}: {preview}");
    position++;
}
```