Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,48 @@
```csharp title="Element-Based Output (C#)"
using Kreuzberg;
// Configure element-based output.
// NOTE(review): ResultFormat is assumed to be the setting that selects the
// element-based result shape, with OutputFormat reserved for content rendering
// (e.g. Html) — confirm against the installed Kreuzberg version.
var config = new ExtractionConfig
{
    ResultFormat = ResultFormat.ElementBased
};

// Extract document. Calls go through the static KreuzbergLib class: `Kreuzberg`
// is the namespace imported above, so `Kreuzberg.ExtractFileSync` cannot resolve.
var result = KreuzbergLib.ExtractFileSync("document.pdf", config);

// Guard against a missing element list before enumerating it.
if (result.Elements != null)
{
    // Access elements
    foreach (var element in result.Elements)
    {
        Console.WriteLine($"Type: {element.ElementType}");

        // Print at most the first 100 characters of each element.
        var text = element.Text.Length > 100
            ? element.Text.Substring(0, 100)
            : element.Text;
        Console.WriteLine($"Text: {text}");

        if (element.Metadata.PageNumber.HasValue)
        {
            Console.WriteLine($"Page: {element.Metadata.PageNumber}");
        }

        if (element.Metadata.Coordinates != null)
        {
            // NOTE(review): corner properties assumed to be X0/Y0/X1/Y1 — confirm.
            var coords = element.Metadata.Coordinates;
            Console.WriteLine($"Coords: ({coords.X0}, {coords.Y0}) - ({coords.X1}, {coords.Y1})");
        }

        Console.WriteLine("---");
    }

    // Filter by element type
    var titles = result.Elements
        .Where(e => e.ElementType == ElementType.Title);
    foreach (var title in titles)
    {
        // Fall back to "unknown" when the element carries no "level" entry.
        var level = title.Metadata.Additional.TryGetValue("level", out var levelValue)
            ? levelValue.ToString()
            : "unknown";
        Console.WriteLine($"[{level}] {title.Text}");
    }
}
```

View File

@@ -0,0 +1,41 @@
```csharp title="C#"
using Kreuzberg;
// Build each feature's settings on its own, then compose them into one ExtractionConfig.
var ocr = new OcrConfig { Backend = "tesseract", Language = "eng+deu" };
var chunking = new ChunkingConfig { MaxCharacters = 1000, Overlap = 200 };
var languageDetection = new LanguageDetectionConfig { Enabled = true, DetectMultiple = true };
var tokenReduction = new TokenReductionOptions { Mode = "moderate" };
var keywords = new KeywordConfig { MaxKeywords = 10, MinScore = 0.1f };

ExtractionConfig config = new()
{
    UseCache = true,
    EnableQualityProcessing = true,
    Ocr = ocr,
    Chunking = chunking,
    LanguageDetection = languageDetection,
    TokenReduction = tokenReduction,
    Keywords = keywords
};

var result = await KreuzbergLib.ExtractFile("document.pdf", null, config);
Console.WriteLine(result.Content);

// Only print the language line when the list exists and has at least one entry.
if (result.DetectedLanguages is { Count: > 0 } languages)
{
    Console.WriteLine($"Languages: {string.Join(", ", languages)}");
}
```

View File

@@ -0,0 +1,9 @@
using Kreuzberg;
// Extraction settings with both UseCache and EnableQualityProcessing switched on.
ExtractionConfig config = new()
{
    EnableQualityProcessing = true,
    UseCache = true
};

// Run ExtractFileSync on the sample PDF with the settings above.
var result = KreuzbergLib.ExtractFileSync("document.pdf", config);

View File

@@ -0,0 +1,47 @@
```csharp title="C#"
using Kreuzberg;
// Chunk settings: text chunker, 1000-character limit, 200 overlap.
var chunking = new ChunkingConfig
{
    ChunkerType = ChunkerType.Text,
    MaxCharacters = 1000,
    Overlap = 200
};
ExtractionConfig config = new() { Chunking = chunking };

var result = await KreuzbergLib.ExtractFile("document.pdf", null, config);

// Chunks may be null, so bind the list only when it is present.
if (result.Chunks is { } chunks)
{
    Console.WriteLine($"Total chunks: {chunks.Count}");
    foreach (var piece in chunks)
    {
        Console.WriteLine($"Chunk length: {piece.Content.Length}");
    }
}
```
```csharp title="C# - Markdown with Heading Context"
using Kreuzberg;
// Chunk settings: markdown chunker with PrependHeadingContext enabled.
var chunking = new ChunkingConfig
{
    ChunkerType = ChunkerType.Markdown,
    PrependHeadingContext = true,
    MaxCharacters = 500,
    Overlap = 50
};
ExtractionConfig config = new() { Chunking = chunking };

var result = await KreuzbergLib.ExtractFile("document.md", null, config);

if (result.Chunks is { } chunks)
{
    foreach (var piece in chunks)
    {
        // Show at most the first 100 characters of each chunk.
        var body = piece.Content;
        var previewLength = Math.Min(100, body.Length);
        var preview = body.Substring(0, previewLength);
        Console.WriteLine($"Content: {preview}");
    }
}
```

View File

@@ -0,0 +1,12 @@
```csharp title="C#"
using Kreuzberg;
// Extraction settings with both UseCache and EnableQualityProcessing switched on.
ExtractionConfig config = new()
{
    EnableQualityProcessing = true,
    UseCache = true
};

// Await the extraction, then print the extracted content.
var result = await KreuzbergLib.ExtractFile("document.pdf", null, config);
var content = result.Content;
Console.WriteLine(content);
```

View File

@@ -0,0 +1,8 @@
```csharp title="C#"
using Kreuzberg;
// Use the configuration returned by Discover(); when it returns null,
// fall back to a default-constructed ExtractionConfig.
var config = ExtractionConfig.Discover();
config ??= new ExtractionConfig();

var result = await KreuzbergLib.ExtractFile("document.pdf", null, config);
var content = result.Content;
Console.WriteLine(content);
```

View File

@@ -0,0 +1,19 @@
```csharp title="C#"
using Kreuzberg;
// OCR settings: tesseract backend, English language pack.
var ocr = new OcrConfig { Backend = "tesseract", Language = "eng" };
ExtractionConfig config = new() { Ocr = ocr };

var result = await KreuzbergLib.ExtractFile("scanned.pdf", null, config);
var contentLength = result.Content.Length;
Console.WriteLine($"Content length: {contentLength}");

// Tables may be null, so report the count only when the list is present.
if (result.Tables is { } tables)
{
    Console.WriteLine($"Tables detected: {tables.Count}");
}
```

View File

@@ -0,0 +1,26 @@
```csharp title="C#"
using Kreuzberg;
// Tesseract-specific tuning, wrapped by the general OCR settings below.
var tesseract = new TesseractConfig { Psm = 6 };
var ocr = new OcrConfig
{
    Backend = "tesseract",
    Language = "eng+deu",
    TesseractConfig = tesseract
};
var chunking = new ChunkingConfig { MaxCharacters = 1000, Overlap = 200 };

ExtractionConfig config = new()
{
    UseCache = true,
    EnableQualityProcessing = true,
    Ocr = ocr,
    Chunking = chunking
};

var result = await KreuzbergLib.ExtractFile("document.pdf", null, config);
var contentLength = result.Content.Length;
Console.WriteLine($"Content length: {contentLength}");
```

View File

@@ -0,0 +1,14 @@
using Kreuzberg;
// Extraction settings with both UseCache and EnableQualityProcessing switched on.
var config = new ExtractionConfig
{
    UseCache = true,
    EnableQualityProcessing = true
};

// Load the document into memory so the snippet is self-contained; the original
// referenced `fileBytes` without ever declaring it.
var fileBytes = System.IO.File.ReadAllBytes("document.pdf");

// Extract from the in-memory bytes, passing the MIME type alongside them.
var result = KreuzbergLib.ExtractBytesSync(
    new BytesWithMime(fileBytes, "application/pdf"),
    config
);

// Read the MIME type reported on the result.
var mimeType = result.MimeType;

View File

@@ -0,0 +1,8 @@
using Kreuzberg;
// Extraction settings with UseCache switched off.
ExtractionConfig config = new() { UseCache = false };

// Run ExtractFileSync on the sample PDF with the settings above.
var result = KreuzbergLib.ExtractFileSync("document.pdf", config);

View File

@@ -0,0 +1,18 @@
```csharp title="Document Structure Config (C#)"
using Kreuzberg;
// Request the document structure alongside the extracted content.
ExtractionConfig config = new() { IncludeDocumentStructure = true };

var result = KreuzbergLib.ExtractFileSync("document.pdf", config);

// Document may be null, so bind it only when present.
if (result.Document is { } document)
{
    // Print the node type of every node in the document.
    foreach (var entry in document.Nodes)
    {
        var nodeType = entry.Content.NodeType;
        Console.WriteLine($"[{nodeType}]");
    }
}
```

View File

@@ -0,0 +1,37 @@
```csharp title="C#"
using Kreuzberg;
// Select the element-based result format.
ExtractionConfig config = new() { ResultFormat = ResultFormat.ElementBased };

var result = await KreuzbergLib.ExtractFile("document.pdf", null, config);

// Elements may be null, so bind the list only when present.
if (result.Elements is { } elements)
{
    foreach (var item in elements)
    {
        Console.WriteLine($"Type: {item.ElementType}");

        // Show at most the first 100 characters of the element text.
        var previewLength = Math.Min(100, item.Text.Length);
        var preview = item.Text.Substring(0, previewLength);
        Console.WriteLine($"Text: {preview}");

        var meta = item.Metadata;
        if (meta.PageNumber.HasValue)
        {
            Console.WriteLine($"Page: {meta.PageNumber}");
        }

        if (meta.Coordinates != null)
        {
            Console.WriteLine($"Coords: ({meta.Coordinates.X0}, {meta.Coordinates.Y0})");
        }

        Console.WriteLine("---");
    }

    // Collect the elements whose type is Title and report how many there are.
    var titles = (from candidate in elements
                  where candidate.ElementType == ElementType.Title
                  select candidate).ToList();
    Console.WriteLine($"Found {titles.Count} titles");
}
```

View File

@@ -0,0 +1,106 @@
using Kreuzberg.Config;
/// <summary>
/// Sample program that constructs several <c>EmbeddingConfig</c> variants and
/// attaches one to a <c>ChunkingConfig</c>. It only builds configuration
/// objects; no extraction is run.
/// </summary>
public class EmbeddingConfigExample
{
    public static void Main()
    {
        // Example 1: Preset model (recommended)
        // Presets are named configurations tuned for common use cases.
        //
        // Available presets:
        // - "fast" (384 dims): Quick prototyping, development, resource-constrained
        // - "balanced" (768 dims): Production, general-purpose RAG, English documents
        // - "quality" (1024 dims): Complex documents, maximum accuracy
        // - "multilingual" (768 dims): International documents, 100+ languages
        var presetModel = new EmbeddingModelType.Preset { Name = "balanced" };
        var presetEmbedding = new EmbeddingConfig
        {
            Model = presetModel,
            BatchSize = 32,
            Normalize = true,
            ShowDownloadProgress = true,
            CacheDir = "~/.cache/kreuzberg/embeddings"
        };

        // Example 2: Custom ONNX model (requires embeddings feature)
        // Points at a specific ONNX embedding model on HuggingFace and states its dimensions.
        //
        // Popular ONNX-compatible models:
        // - "BAAI/bge-small-en-v1.5" (384 dims): Fast, efficient
        // - "BAAI/bge-base-en-v1.5" (768 dims): Balanced quality/speed
        // - "BAAI/bge-large-en-v1.5" (1024 dims): High quality, slower
        // - "sentence-transformers/paraphrase-multilingual-mpnet-base-v2" (768 dims): Multilingual support
        var smallOnnxModel = new EmbeddingModelType.Custom
        {
            ModelId = "BAAI/bge-small-en-v1.5",
            Dimensions = 384
        };
        var customEmbedding = new EmbeddingConfig
        {
            Model = smallOnnxModel,
            BatchSize = 32,
            Normalize = true,
            ShowDownloadProgress = true,
            CacheDir = null // Uses default: .kreuzberg/embeddings/
        };

        // Example 3: Alternative Custom ONNX Model
        // For advanced users who want a different ONNX embedding model.
        var mpnetModel = new EmbeddingModelType.Custom
        {
            ModelId = "sentence-transformers/all-mpnet-base-v2",
            Dimensions = 768
        };
        var alternativeEmbedding = new EmbeddingConfig
        {
            Model = mpnetModel,
            BatchSize = 16, // Larger model requires smaller batch size
            Normalize = true,
            ShowDownloadProgress = true,
            CacheDir = "/var/cache/embeddings"
        };

        // Integration with ChunkingConfig
        // Embeddings are attached through the chunking configuration.
        var chunkEmbedding = new EmbeddingConfig
        {
            Model = new EmbeddingModelType.Preset { Name = "balanced" },
            BatchSize = 32,
            Normalize = true
        };
        var chunkingConfig = new ChunkingConfig
        {
            MaxChars = 1024,
            MaxOverlap = 100,
            Preset = "balanced",
            Embedding = chunkEmbedding
        };
        var extractionConfig = new ExtractionConfig { Chunking = chunkingConfig };
    }
}
// Key parameter explanations:
//
// BatchSize: Number of texts to embed at once (32-128 typical)
// - Larger batches are faster but use more memory
// - Smaller batches for resource-constrained environments
//
// Normalize: Whether to normalize vectors (L2 norm)
// - true (recommended): Enables cosine similarity in vector DBs
// - false: Raw embedding values
//
// CacheDir: Where to store downloaded models
// - null: Uses .kreuzberg/embeddings/ in current directory
// - String path: Custom directory for model storage
//
// ShowDownloadProgress: Display download progress bar
// - Useful for monitoring large model downloads

View File

@@ -0,0 +1,25 @@
```csharp title="C#"
using Kreuzberg;
// Embedding settings attached to the chunker; CacheDir is explicitly left null.
var embedding = new EmbeddingConfig
{
    BatchSize = 16,
    Normalize = true,
    ShowDownloadProgress = true,
    CacheDir = null
};
var chunking = new ChunkingConfig
{
    MaxCharacters = 1000,
    Overlap = 200,
    Embedding = embedding
};
ExtractionConfig config = new() { Chunking = chunking };

var result = await KreuzbergLib.ExtractFile("document.pdf", null, config);

// Chunks may be null, so report the count only when the list is present.
if (result.Chunks is { } chunks)
{
    Console.WriteLine($"Chunks with embeddings: {chunks.Count}");
}
```

View File

@@ -0,0 +1,8 @@
using Kreuzberg;
// Extraction settings with UseCache switched on.
ExtractionConfig config = new() { UseCache = true };

// Run ExtractFileSync on the sample PDF with the settings above.
var result = KreuzbergLib.ExtractFileSync("document.pdf", config);

View File

@@ -0,0 +1,60 @@
using Kreuzberg;
// Image preprocessing settings handed to the tesseract configuration.
var preprocessing = new ImagePreprocessingConfig
{
    TargetDpi = 300,
    Denoise = true,
    Deskew = true,
    ContrastEnhance = true
};

// Tesseract-specific settings.
var tesseract = new TesseractConfig
{
    Psm = 3,
    Oem = 3,
    MinConfidence = 0.8,
    Preprocessing = preprocessing,
    EnableTableDetection = true
};

// General OCR settings wrapping the tesseract configuration.
var ocr = new OcrConfig
{
    Backend = "tesseract",
    Language = "eng+fra",
    TesseractConfig = tesseract
};

var pdfOptions = new PdfConfig { ExtractImages = true, ExtractMetadata = true };
var images = new ImageExtractionConfig
{
    ExtractImages = true,
    TargetDpi = 150,
    MaxImageDimension = 4096
};
var chunking = new ChunkingConfig { MaxChars = 1000, MaxOverlap = 200 };
var tokenReduction = new TokenReductionConfig
{
    Mode = "moderate",
    PreserveImportantWords = true
};
var languageDetection = new LanguageDetectionConfig
{
    Enabled = true,
    MinConfidence = 0.8,
    DetectMultiple = false
};
var postprocessor = new PostProcessorConfig { Enabled = true };

// Compose every section into the top-level extraction configuration.
ExtractionConfig config = new()
{
    UseCache = true,
    EnableQualityProcessing = true,
    ForceOcr = false,
    Ocr = ocr,
    PdfOptions = pdfOptions,
    Images = images,
    Chunking = chunking,
    TokenReduction = tokenReduction,
    LanguageDetection = languageDetection,
    Postprocessor = postprocessor
};

var result = KreuzbergLib.ExtractFileSync("document.pdf", config);

View File

@@ -0,0 +1,99 @@
using Kreuzberg.Config;
using Kreuzberg;
/// <summary>
/// Sample program that builds three <c>HierarchyConfig</c> variants and wraps
/// each in a <c>PdfConfig</c> and an <c>ExtractionConfig</c>. It only constructs
/// configuration objects; the extraction call is left commented out.
/// </summary>
public class HierarchyConfigExample
{
    public static void Main()
    {
        // Example 1: Basic hierarchy extraction
        // Enabled with default KClusters=6 for standard H1-H6 heading hierarchy.
        // Extract bounding box information for spatial layout awareness.
        var hierarchyConfigBasic = new HierarchyConfig
        {
            Enabled = true,
            KClusters = 6, // Default: creates 6 font size clusters (H1-H6 structure)
            IncludeBbox = true, // Include bounding box coordinates
            OcrCoverageThreshold = null // No OCR coverage threshold
        };
        var pdfConfigBasic = new PdfConfig
        {
            Hierarchy = hierarchyConfigBasic
        };
        var extractionConfigBasic = new ExtractionConfig
        {
            PdfOptions = pdfConfigBasic
        };

        // `Kreuzberg` is a namespace and cannot be instantiated; extraction goes
        // through the static KreuzbergLib class, which takes the config as an argument.
        // var result = KreuzbergLib.ExtractFileSync("document.pdf", extractionConfigBasic);

        // Example 2: Custom KClusters for minimal structure
        // Use 3 clusters for simpler hierarchy with minimal structure.
        // Useful when you only need major section divisions (Main, Subsection, Detail).
        var hierarchyConfigMinimal = new HierarchyConfig
        {
            Enabled = true,
            KClusters = 3, // Minimal clustering: just 3 levels
            IncludeBbox = true,
            OcrCoverageThreshold = null
        };
        var pdfConfigMinimal = new PdfConfig
        {
            Hierarchy = hierarchyConfigMinimal
        };
        var extractionConfigMinimal = new ExtractionConfig
        {
            PdfOptions = pdfConfigMinimal
        };

        // Example 3: With OCR coverage threshold
        // Trigger OCR if less than 50% of text has font data.
        // Useful for documents with mixed digital and scanned content.
        var hierarchyConfigOcr = new HierarchyConfig
        {
            Enabled = true,
            KClusters = 6,
            IncludeBbox = true,
            OcrCoverageThreshold = 0.5f // Trigger OCR if text coverage < 50%
        };
        var pdfConfigOcr = new PdfConfig
        {
            Hierarchy = hierarchyConfigOcr
        };
        var extractionConfigOcr = new ExtractionConfig
        {
            PdfOptions = pdfConfigOcr
        };
    }
}
// Field descriptions:
//
// Enabled: bool (default: true)
// - Enable or disable hierarchy extraction
// - When false, hierarchy structure is not analyzed
//
// KClusters: int (default: 6, valid: 1-7)
// - Number of font size clusters for hierarchy levels
// - 6 provides H1-H6 heading levels with body text
// - Higher values create more fine-grained hierarchy
// - Lower values create simpler structure
//
// IncludeBbox: bool (default: true)
// - Include bounding box coordinates in hierarchy blocks
// - Required for spatial layout awareness and document structure
// - Set to false only if space optimization is critical
//
// OcrCoverageThreshold: float? (default: null)
// - Range: 0.0 to 1.0
// - Triggers OCR when text block coverage falls below this fraction
// - Example: 0.5f means "run OCR if less than 50% of page has text data"
// - null means no OCR coverage-based triggering

View File

@@ -0,0 +1,17 @@
```csharp title="C#"
using Kreuzberg;
// HTML rendering options: GitHub theme, EmbedCss on, "kb-" class prefix.
var htmlOutput = new HtmlOutputConfig
{
    ClassPrefix = "kb-",
    EmbedCss = true,
    Theme = HtmlTheme.GitHub
};

// Select HTML as the output format and attach the rendering options.
ExtractionConfig config = new()
{
    OutputFormat = OutputFormat.Html,
    HtmlOutput = htmlOutput
};

var result = await KreuzbergLib.ExtractFile("document.pdf", null, config);
var content = result.Content;
Console.WriteLine(content);
```

View File

@@ -0,0 +1,19 @@
using Kreuzberg;
// OCR settings: tesseract backend, English language pack.
var ocr = new OcrConfig { Backend = "tesseract", Language = "eng" };
ExtractionConfig config = new()
{
    UseCache = true,
    Ocr = ocr
};

var result = KreuzbergLib.ExtractFileSync("document.pdf", config);

// Metadata may be null, so read its fields only when it is present.
if (result.Metadata is { } metadata)
{
    var language = metadata.Language;
    var format = metadata.FormatType;
}

View File

@@ -0,0 +1,66 @@
using Kreuzberg;
using Kreuzberg.Keywords;
// Example 1: Basic YAKE configuration
// YAKE algorithm with no algorithm-specific parameter objects supplied.
var basicKeywords = new KeywordConfig
{
    Algorithm = KeywordAlgorithm.Yake,
    MaxKeywords = 10,
    MinScore = 0.0f,
    NgramRange = (1, 3),
    Language = "en",
    YakeParams = null,
    RakeParams = null,
};
var basicYakeConfig = new ExtractionConfig { Keywords = basicKeywords };

var result = KreuzbergLib.ExtractFileSync("document.pdf", basicYakeConfig);
Console.WriteLine("Keywords: " + string.Join(", ", result.Keywords));

// Example 2: Advanced YAKE with custom parameters
// Supplies a YakeParams object with a custom window size.
var yakeParams = new YakeParams { WindowSize = 1, };
var advancedKeywords = new KeywordConfig
{
    Algorithm = KeywordAlgorithm.Yake,
    MaxKeywords = 15,
    MinScore = 0.1f,
    NgramRange = (1, 2),
    Language = "en",
    YakeParams = yakeParams,
    RakeParams = null,
};
var advancedYakeConfig = new ExtractionConfig { Keywords = advancedKeywords };

result = KreuzbergLib.ExtractFileSync("document.pdf", advancedYakeConfig);
Console.WriteLine("Keywords: " + string.Join(", ", result.Keywords));

// Example 3: RAKE configuration
// RAKE algorithm with word-length and phrase-length constraints.
var rakeParams = new RakeParams
{
    MinWordLength = 1,
    MaxWordsPerPhrase = 3,
};
var rakeKeywords = new KeywordConfig
{
    Algorithm = KeywordAlgorithm.Rake,
    MaxKeywords = 10,
    MinScore = 5.0f,
    NgramRange = (1, 3),
    Language = "en",
    YakeParams = null,
    RakeParams = rakeParams,
};
var rakeConfig = new ExtractionConfig { Keywords = rakeKeywords };

result = KreuzbergLib.ExtractFileSync("document.pdf", rakeConfig);
Console.WriteLine("Keywords: " + string.Join(", ", result.Keywords));

View File

@@ -0,0 +1,21 @@
```csharp title="C#"
using Kreuzberg;
// Keyword settings: YAKE algorithm, up to 10 keywords, minimum score 0.1.
var keywordSettings = new KeywordConfig
{
    Algorithm = KeywordAlgorithm.Yake,
    Language = "en",
    MaxKeywords = 10,
    MinScore = 0.1f,
    NgramRange = [1, 3]
};
ExtractionConfig config = new() { Keywords = keywordSettings };

var result = await KreuzbergLib.ExtractFile("document.pdf", null, config);

// Keywords may be null, so print them only when the collection is present.
if (result.Keywords is { } keywords)
{
    Console.WriteLine($"Keywords: {string.Join(", ", keywords)}");
}
```

View File

@@ -0,0 +1,20 @@
```csharp title="C#"
using Kreuzberg;
// Language detection settings: enabled, 0.8 minimum confidence, multiple languages allowed.
var detection = new LanguageDetectionConfig
{
    Enabled = true,
    DetectMultiple = true,
    MinConfidence = 0.8
};
ExtractionConfig config = new() { LanguageDetection = detection };

var result = await KreuzbergLib.ExtractFile("document.pdf", null, config);
Console.WriteLine($"Detected language: {result.Language}");

// DetectedLanguages may be null, so list them only when present.
if (result.DetectedLanguages is { } detected)
{
    Console.WriteLine($"All detected: {string.Join(", ", detected)}");
}
```

View File

@@ -0,0 +1,22 @@
```csharp title="C#"
using Kreuzberg;
// Image extraction settings: 300 target DPI, auto-adjusted within 150-600,
// with image dimensions capped at 4096.
var imageSettings = new ImageExtractionConfig
{
    ExtractImages = true,
    MaxImageDimension = 4096,
    TargetDpi = 300,
    AutoAdjustDpi = true,
    MinDpi = 150,
    MaxDpi = 600
};
ExtractionConfig config = new() { Images = imageSettings };

var result = await KreuzbergLib.ExtractFile("document.pdf", null, config);

// Images may be null, so report the count only when the list is present.
if (result.Images is { } images)
{
    Console.WriteLine($"Extracted images: {images.Count}");
}
```

View File

@@ -0,0 +1,12 @@
using Kreuzberg;
// OCR settings: tesseract backend with the "eng+fra" language string.
var ocr = new OcrConfig
{
    Language = "eng+fra",
    Backend = "tesseract"
};
ExtractionConfig config = new() { Ocr = ocr };

var result = KreuzbergLib.ExtractFileSync("document.pdf", config);

View File

@@ -0,0 +1,17 @@
using Kreuzberg;
// Extraction settings with UseCache switched on.
ExtractionConfig config = new() { UseCache = true };

var result = KreuzbergLib.ExtractFileSync("document.html", config);

// Metadata, its text format section and the link list may each be missing;
// bind the links only when the whole chain is present.
if (result.Metadata?.Format.Text?.Links is { } links)
{
    foreach (var entry in links)
    {
        // Each link is indexed as [0] = text, [1] = url.
        var text = entry[0];
        var url = entry[1];
    }
}
}

View File

@@ -0,0 +1,18 @@
using Kreuzberg;
// PDF settings with metadata extraction switched on.
var pdfOptions = new PdfConfig { ExtractMetadata = true };
ExtractionConfig config = new() { PdfOptions = pdfOptions };

var result = KreuzbergLib.ExtractFileSync("document.pdf", config);

// Metadata and its PDF section may be missing; bind the section only when present.
if (result.Metadata?.Format.Pdf is { } pdf)
{
    var title = pdf.Title;
    var author = pdf.Author;
    var pageCount = pdf.PageCount;
}

View File

@@ -0,0 +1,21 @@
```csharp title="C#"
using Kreuzberg;
// Passwords supplied for the encrypted PDF.
var passwords = new List<string> { "password123" };

// PDF settings: images and metadata on, annotations off.
var pdfOptions = new PdfConfig
{
    ExtractImages = true,
    ExtractMetadata = true,
    ExtractAnnotations = false,
    Passwords = passwords
};
ExtractionConfig config = new() { PdfOptions = pdfOptions };

var result = await KreuzbergLib.ExtractFile("encrypted.pdf", null, config);

// Metadata may be null, so print its fields only when it is present.
if (result.Metadata is { } metadata)
{
    Console.WriteLine($"Title: {metadata.Title}");

    // A null author list is printed as an empty list.
    var authors = metadata.Authors ?? new List<string>();
    Console.WriteLine($"Authors: {string.Join(", ", authors)}");
}
```

View File

@@ -0,0 +1,74 @@
```csharp title="C#"
using Kreuzberg;
// Basic hierarchy configuration with properties
var config = new ExtractionConfig
{
    PdfOptions = new PdfConfig
    {
        ExtractImages = true,
        Hierarchy = new HierarchyConfig
        {
            Enabled = true,
            KClusters = 6,
            IncludeBbox = true,
            OcrCoverageThreshold = 0.8f
        }
    }
};
var result = await KreuzbergLib.ExtractFileAsync("document.pdf", config);
Console.WriteLine($"Content length: {result.Content.Length}");

// Advanced hierarchy detection with custom parameters.
// Each example stores its result in its own variable: redeclaring `var result`
// in the same scope does not compile.
var advancedConfig = new ExtractionConfig
{
    PdfOptions = new PdfConfig
    {
        ExtractImages = true,
        Hierarchy = new HierarchyConfig
        {
            Enabled = true,
            // NOTE(review): the hierarchy field reference lists 1-7 as the valid
            // range, so the previous value of 12 was out of range — confirm.
            KClusters = 7, // More clusters for detailed hierarchy
            IncludeBbox = true, // Include bounding box coordinates
            OcrCoverageThreshold = 0.7f // Lower coverage threshold than the basic example
        }
    }
};
var advancedResult = await KreuzbergLib.ExtractFileAsync("complex_document.pdf", advancedConfig);
Console.WriteLine($"Advanced hierarchy detection completed: {advancedResult.Content.Length} chars");

// Minimal configuration with only enabled flag
var minimalConfig = new ExtractionConfig
{
    PdfOptions = new PdfConfig
    {
        Hierarchy = new HierarchyConfig
        {
            Enabled = true,
            // Other properties use defaults:
            // KClusters = 6
            // IncludeBbox = true
        }
    }
};
var minimalResult = await KreuzbergLib.ExtractFileAsync("document.pdf", minimalConfig);
Console.WriteLine("Extraction with default hierarchy settings complete");

// Disabling hierarchy detection
var noHierarchyConfig = new ExtractionConfig
{
    PdfOptions = new PdfConfig
    {
        Hierarchy = new HierarchyConfig
        {
            Enabled = false
        }
    }
};
var noHierarchyResult = await KreuzbergLib.ExtractFileAsync("document.pdf", noHierarchyConfig);
Console.WriteLine("Extraction without hierarchy detection complete");
```

View File

@@ -0,0 +1,13 @@
using Kreuzberg;
// Names of the post-processors to enable.
var enabledProcessors = new List<string> { "normalize_whitespace", "remove_diacritics" };

var postprocessor = new PostProcessorConfig
{
    Enabled = true,
    EnabledProcessors = enabledProcessors
};
ExtractionConfig config = new()
{
    UseCache = true,
    Postprocessor = postprocessor
};

var result = KreuzbergLib.ExtractFileSync("document.pdf", config);

View File

@@ -0,0 +1,20 @@
```csharp title="C#"
using Kreuzberg;
// Names of the post-processors to enable; DisabledProcessors is explicitly left null.
var enabledProcessors = new List<string>
{
    "whitespace_normalizer",
    "unicode_normalizer"
};
var postprocessor = new PostProcessorConfig
{
    Enabled = true,
    EnabledProcessors = enabledProcessors,
    DisabledProcessors = null
};
ExtractionConfig config = new() { Postprocessor = postprocessor };

var result = await KreuzbergLib.ExtractFile("document.pdf", null, config);

// Show at most the first 100 characters of the processed content.
var content = result.Content;
var preview = content.Substring(0, Math.Min(100, content.Length));
Console.WriteLine($"Processed content: {preview}");
```

View File

@@ -0,0 +1,13 @@
```csharp title="C#"
using Kreuzberg;
// Extraction settings with both UseCache and EnableQualityProcessing switched on.
ExtractionConfig config = new()
{
    UseCache = true,
    EnableQualityProcessing = true
};

var result = await KreuzbergLib.ExtractFile("document.pdf", null, config);

// Print the quality score, then the length of the extracted content.
var score = result.QualityScore;
var contentLength = result.Content.Length;
Console.WriteLine($"Quality score: {score}");
Console.WriteLine($"Content length: {contentLength}");
```

View File

@@ -0,0 +1,22 @@
```csharp title="C#"
using Kreuzberg;
// Tesseract-specific settings; note it carries its own Language value ("eng")
// in addition to the one on the enclosing OcrConfig ("eng+deu").
var tesseract = new TesseractConfig
{
    Language = "eng",
    MinConfidence = 0.5,
    Oem = 3,
    Psm = 6
};
var ocr = new OcrConfig
{
    Backend = "tesseract",
    Language = "eng+deu",
    TesseractConfig = tesseract
};
ExtractionConfig config = new() { Ocr = ocr };

var result = await KreuzbergLib.ExtractFile("scanned.pdf", null, config);

// Show at most the first 100 characters of the extracted text.
var content = result.Content;
var preview = content.Substring(0, Math.Min(100, content.Length));
Console.WriteLine($"OCR text: {preview}");
```

View File

@@ -0,0 +1,16 @@
```csharp title="C#"
using Kreuzberg;
// Token reduction settings: "moderate" mode with PreserveImportantWords on.
var tokenReduction = new TokenReductionOptions
{
    PreserveImportantWords = true,
    Mode = "moderate"
};
ExtractionConfig config = new() { TokenReduction = tokenReduction };

var result = await KreuzbergLib.ExtractFile("document.pdf", null, config);

// Print the content length, then at most the first 100 characters of the content.
var content = result.Content;
Console.WriteLine($"Reduced content length: {content.Length}");
var preview = content.Substring(0, Math.Min(100, content.Length));
Console.WriteLine($"Content: {preview}");
```

View File

@@ -0,0 +1,18 @@
using Kreuzberg;
// Extraction settings with both UseCache and EnableQualityProcessing switched on.
ExtractionConfig config = new()
{
    EnableQualityProcessing = true,
    UseCache = true
};

var result = KreuzbergLib.ExtractFileSync("document.pdf", config);

// Read the error details only when the extraction did not succeed and the
// metadata actually carries an error entry.
if (!result.Success && result.Metadata?.Error is { } error)
{
    var errorType = error.ErrorType;
    var errorMessage = error.Message;
}

View File

@@ -0,0 +1,13 @@
using Kreuzberg;
// OCR settings: tesseract backend, English language pack.
var ocr = new OcrConfig
{
    Language = "eng",
    Backend = "tesseract"
};
ExtractionConfig config = new()
{
    UseCache = true,
    Ocr = ocr
};

var result = KreuzbergLib.ExtractFileSync("document.pdf", config);

View File

@@ -0,0 +1,10 @@
using Kreuzberg;
// Extraction settings with both UseCache and EnableQualityProcessing switched on.
var config = new ExtractionConfig
{
    UseCache = true,
    EnableQualityProcessing = true
};

// Request cancellation automatically after 30 seconds. CancellationTokenSource
// is IDisposable, so a using declaration releases it when the scope ends
// instead of leaving it undisposed.
using var cts = new System.Threading.CancellationTokenSource(TimeSpan.FromSeconds(30));

// Pass the token so the extraction call can observe the cancellation request.
var result = await KreuzbergLib.ExtractFileAsync("document.pdf", config, cts.Token);