Files
fil/docs/snippets/csharp/advanced/extract_with_config.cs

99 lines
2.8 KiB
C#
Raw Permalink Normal View History

2026-06-01 23:40:55 +02:00
using Kreuzberg;
/// <summary>
/// Example program: extracts <c>document.pdf</c> using an <see cref="ExtractionConfig"/>
/// with every configuration section populated, then prints a short summary of the result.
/// </summary>
class Program
{
    /// <summary>
    /// Builds the configuration, runs the asynchronous extraction and writes a summary
    /// (content length, MIME type, format type, table and chunk counts) to the console.
    /// </summary>
    /// <remarks>
    /// A <see cref="KreuzbergException"/> is caught and reported as a single console line;
    /// any other exception type propagates to the caller.
    /// </remarks>
    static async Task Main()
    {
        try
        {
            // Built inside the try block so a KreuzbergException raised while
            // constructing the configuration is reported the same way as an extraction failure.
            var config = BuildConfig();

            var result = await KreuzbergLib.ExtractFileAsync("document.pdf", config);

            Console.WriteLine($"Content length: {result.Content.Length}");
            Console.WriteLine($"MIME type: {result.MimeType}");
            Console.WriteLine($"Format type: {result.Metadata.FormatType}");

            // Check Count directly instead of LINQ's Any(): the collection already
            // exposes Count (it is printed below), and this avoids needing System.Linq.
            if (result.Tables.Count > 0)
            {
                Console.WriteLine($"Found {result.Tables.Count} tables");
            }

            // Chunks may be null; the lifted comparison evaluates to false in that case.
            if (result.Chunks?.Count > 0)
            {
                Console.WriteLine($"Created {result.Chunks.Count} chunks");
            }
        }
        catch (KreuzbergException ex)
        {
            Console.WriteLine($"Extraction error: {ex.Message}");
        }
    }

    /// <summary>
    /// Creates the extraction configuration used by this example, setting a value in
    /// each section: OCR, PDF, images, chunking, token reduction, language detection
    /// and post-processing.
    /// </summary>
    /// <returns>A new, fully populated <see cref="ExtractionConfig"/>.</returns>
    private static ExtractionConfig BuildConfig()
    {
        return new ExtractionConfig
        {
            UseCache = true,
            EnableQualityProcessing = true,
            ForceOcr = false,

            Ocr = new OcrConfig
            {
                Backend = "tesseract",
                // NOTE(review): presumably Tesseract's "+"-joined language codes
                // (English and French) - confirm against the library documentation.
                Language = "eng+fra",
                TesseractConfig = new TesseractConfig
                {
                    // NOTE(review): Psm/Oem presumably map to Tesseract's page segmentation
                    // mode and OCR engine mode - confirm against the library documentation.
                    Psm = 3,
                    Oem = 3,
                    MinConfidence = 0.8,
                    Preprocessing = new ImagePreprocessingConfig
                    {
                        TargetDpi = 300,
                        Denoise = true,
                        Deskew = true,
                        ContrastEnhance = true
                    },
                    EnableTableDetection = true
                }
            },

            PdfOptions = new PdfConfig
            {
                ExtractImages = true,
                ExtractMetadata = true
            },

            Images = new ImageExtractionConfig
            {
                ExtractImages = true,
                TargetDpi = 150,
                MaxImageDimension = 4096
            },

            Chunking = new ChunkingConfig
            {
                MaxChars = 1000,
                MaxOverlap = 200,
                Preset = "default"
            },

            TokenReduction = new TokenReductionConfig
            {
                Mode = "moderate",
                PreserveImportantWords = true
            },

            LanguageDetection = new LanguageDetectionConfig
            {
                Enabled = true,
                MinConfidence = 0.8,
                DetectMultiple = false
            },

            Postprocessor = new PostProcessorConfig
            {
                Enabled = true
            }
        };
    }
}