This commit is contained in:
13
docs/snippets/csharp/ocr/auto_ocr.cs
Normal file
13
docs/snippets/csharp/ocr/auto_ocr.cs
Normal file
@@ -0,0 +1,13 @@
|
||||
using Kreuzberg;
|
||||
|
||||
// OCR settings: backend "auto", language "en".
var ocrSettings = new OcrConfig
{
    Backend = "auto",
    Language = "en",
};
var extractionConfig = new ExtractionConfig { Ocr = ocrSettings };

// Run the synchronous extraction and print the extracted text.
var extraction = KreuzbergLib.ExtractFileSync("document.pdf", extractionConfig);
Console.WriteLine(extraction.Content);
|
||||
78
docs/snippets/csharp/ocr/cloud_ocr_backend.md
Normal file
78
docs/snippets/csharp/ocr/cloud_ocr_backend.md
Normal file
@@ -0,0 +1,78 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
using System.Collections.Generic;
|
||||
|
||||
/// <summary>
/// Skeleton of a custom OCR backend intended to delegate recognition to a cloud service.
/// <see cref="ProcessImage"/> is left unimplemented and must be filled in with the actual API call.
/// </summary>
public class CloudOcrBackend : IOcrBackend
{
    // Single shared copy of the advertised language codes, so membership checks
    // do not allocate a new list on every call.
    private static readonly string[] s_supportedLanguages = { "eng", "deu", "fra" };

    // Set once in the constructor and never reassigned.
    private readonly string _apiKey;

    /// <summary>Name reported for this backend.</summary>
    public string Name => "cloud-ocr";

    /// <summary>Version string reported for this backend.</summary>
    public string Version => "1.0.0";

    /// <summary>Creates the backend with the API key to be used for the cloud service.</summary>
    /// <param name="apiKey">API key for the cloud OCR service; must not be null.</param>
    /// <exception cref="ArgumentNullException"><paramref name="apiKey"/> is null.</exception>
    public CloudOcrBackend(string apiKey)
    {
        _apiKey = apiKey ?? throw new ArgumentNullException(nameof(apiKey));
    }

    /// <summary>No set-up work is needed for this sample backend.</summary>
    public void Initialize()
    {
    }

    /// <summary>No tear-down work is needed for this sample backend.</summary>
    public void Shutdown()
    {
    }

    /// <summary>Placeholder for the actual recognition call.</summary>
    /// <param name="imageBytes">Raw bytes of the image to recognise.</param>
    /// <param name="config">OCR configuration supplied by the caller.</param>
    /// <exception cref="NotImplementedException">Always, until the cloud call is implemented.</exception>
    public ExtractionResult ProcessImage(byte[] imageBytes, OcrConfig config)
    {
        // Call cloud OCR API with imageBytes and config.Language
        // Return ExtractionResult with extracted text
        throw new NotImplementedException();
    }

    /// <summary>Reads the whole file at <paramref name="path"/> and forwards its bytes to <see cref="ProcessImage"/>.</summary>
    public ExtractionResult ProcessImageFile(string path, OcrConfig config)
    {
        var imageBytes = File.ReadAllBytes(path);
        return ProcessImage(imageBytes, config);
    }

    /// <summary>Returns true when <paramref name="language"/> exactly matches one of the supported codes.</summary>
    public bool SupportsLanguage(string language)
    {
        return Array.IndexOf(s_supportedLanguages, language) >= 0;
    }

    /// <summary>Reports this backend as a cloud backend.</summary>
    public OcrBackendType BackendType()
    {
        return OcrBackendType.Cloud;
    }

    /// <summary>Returns a fresh list of the supported language codes; callers may modify it freely.</summary>
    public List<string> SupportedLanguages()
    {
        return new List<string>(s_supportedLanguages);
    }

    /// <summary>Table detection is not offered by this backend.</summary>
    public bool SupportsTableDetection()
    {
        return false;
    }

    /// <summary>Whole-document processing is not offered by this backend.</summary>
    public bool SupportsDocumentProcessing()
    {
        return false;
    }

    /// <summary>Always throws, consistent with <see cref="SupportsDocumentProcessing"/> returning false.</summary>
    /// <exception cref="NotSupportedException">Always.</exception>
    public ExtractionResult ProcessDocument(string path, OcrConfig config)
    {
        throw new NotSupportedException("Document processing not supported by CloudOcrBackend");
    }
}
|
||||
|
||||
/// <summary>Sample entry point that registers the custom OCR backend.</summary>
class Program
{
    static void Main()
    {
        // Build the backend and pass it straight to the bridge for registration.
        OcrBackendBridge.Register(new CloudOcrBackend(apiKey: "your-api-key"));
    }
}
|
||||
```
|
||||
14
docs/snippets/csharp/ocr/easyocr_backend.cs
Normal file
14
docs/snippets/csharp/ocr/easyocr_backend.cs
Normal file
@@ -0,0 +1,14 @@
|
||||
using Kreuzberg;
|
||||
|
||||
// OCR settings: backend "easyocr", language "en", GPU flag enabled.
var ocrSettings = new OcrConfig
{
    Backend = "easyocr",
    Language = "en",
    UseGpu = true,
};
var extractionConfig = new ExtractionConfig { Ocr = ocrSettings };

// Run the synchronous extraction and print the extracted text.
var extraction = KreuzbergLib.ExtractFileSync("scanned.pdf", extractionConfig);
Console.WriteLine(extraction.Content);
|
||||
14
docs/snippets/csharp/ocr/force_ocr.cs
Normal file
14
docs/snippets/csharp/ocr/force_ocr.cs
Normal file
@@ -0,0 +1,14 @@
|
||||
using Kreuzberg;
|
||||
|
||||
// ForceOcr is switched on alongside a Tesseract configuration for language "eng".
var extractionConfig = new ExtractionConfig
{
    ForceOcr = true,
    Ocr = new OcrConfig { Backend = "tesseract", Language = "eng" },
};

// Run the synchronous extraction and print the extracted text.
var extraction = KreuzbergLib.ExtractFileSync("document.pdf", extractionConfig);
Console.WriteLine(extraction.Content);
|
||||
21
docs/snippets/csharp/ocr/image_extraction.md
Normal file
21
docs/snippets/csharp/ocr/image_extraction.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
// Image extraction options, built separately and then attached to the extraction config.
var imageOptions = new ImageExtractionConfig
{
    ExtractImages = true,
    TargetDpi = 200,
    MaxImageDimension = 2048,
    // Set InjectPlaceholders to false to extract images without markdown references.
    InjectPlaceholders = true,
    AutoAdjustDpi = true,
};
var extractionConfig = new ExtractionConfig { Images = imageOptions };

// The second argument is passed as null, exactly as in the original call.
var extraction = KreuzbergLib.ExtractFileSync("document.pdf", null, extractionConfig);

// Print at most the first 100 characters of the extracted text.
string text = extraction.Content;
string preview = text.Length <= 100 ? text : text.Substring(0, 100);
Console.WriteLine($"Extracted: {preview}");
|
||||
```
|
||||
27
docs/snippets/csharp/ocr/image_preprocessing.md
Normal file
27
docs/snippets/csharp/ocr/image_preprocessing.md
Normal file
@@ -0,0 +1,27 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
// Preprocessing options, nested under the Tesseract configuration below.
var preprocessing = new ImagePreprocessingConfig
{
    TargetDpi = 300,
    Denoise = true,
    Deskew = true,
    ContrastEnhance = true,
    BinarizationMethod = "otsu",
};
var tesseract = new TesseractConfig { Preprocessing = preprocessing };
var extractionConfig = new ExtractionConfig
{
    Ocr = new OcrConfig { TesseractConfig = tesseract },
};

// The second argument is passed as null, exactly as in the original call.
var extraction = KreuzbergLib.ExtractFileSync("scanned.pdf", null, extractionConfig);

// Print at most the first 100 characters of the extracted text.
string text = extraction.Content;
string preview = text.Length <= 100 ? text : text.Substring(0, 100);
Console.WriteLine($"Content: {preview}");
|
||||
```
|
||||
23
docs/snippets/csharp/ocr/ocr_easyocr.md
Normal file
23
docs/snippets/csharp/ocr/ocr_easyocr.md
Normal file
@@ -0,0 +1,23 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
// OCR settings: backend "easyocr", language "en".
var ocrSettings = new OcrConfig
{
    Backend = "easyocr",
    Language = "en",
};
var extractionConfig = new ExtractionConfig { Ocr = ocrSettings };

// EasyOCR-specific options (use_gpu, beam_width, etc.) can be supplied through
// OcrConfig's EasyocrConfig field if available, or via backend-specific configuration.
var extraction = KreuzbergLib.ExtractFileSync("scanned.pdf", null, extractionConfig);

// Report a preview of at most 100 characters plus the full length.
string text = extraction.Content;
int totalLength = text.Length;
string preview = totalLength <= 100 ? text : text.Substring(0, 100);

Console.WriteLine($"Extracted content (preview): {preview}");
Console.WriteLine($"Total characters: {totalLength}");
|
||||
```
|
||||
29
docs/snippets/csharp/ocr/ocr_elements.md
Normal file
29
docs/snippets/csharp/ocr/ocr_elements.md
Normal file
@@ -0,0 +1,29 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
// OCR settings: backend "paddle-ocr", language "en".
var ocrSettings = new OcrConfig
{
    Backend = "paddle-ocr",
    Language = "en",
};
var extractionConfig = new ExtractionConfig { Ocr = ocrSettings };

var extraction = KreuzbergLib.ExtractFileSync("scanned.pdf", extractionConfig);

// OcrElements may be null; only iterate when elements are present.
if (extraction.OcrElements is { } elements)
{
    foreach (var item in elements)
    {
        Console.WriteLine($"Text: {item.Text}");
        Console.WriteLine($"Confidence: {item.Confidence.Recognition:F2}");
        Console.WriteLine($"Geometry: {item.Geometry}");

        // Rotation is optional per element.
        if (item.Rotation is { } rotation)
        {
            Console.WriteLine($"Rotation: {rotation.Angle}°");
        }

        // Blank line between elements.
        Console.WriteLine();
    }
}
|
||||
```
|
||||
21
docs/snippets/csharp/ocr/ocr_extraction.md
Normal file
21
docs/snippets/csharp/ocr/ocr_extraction.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
// OCR settings: backend "tesseract", language "eng".
var ocrSettings = new OcrConfig
{
    Backend = "tesseract",
    Language = "eng",
};
var extractionConfig = new ExtractionConfig { Ocr = ocrSettings };

// The second argument is passed as null, exactly as in the original call.
var extraction = KreuzbergLib.ExtractFileSync("scanned.pdf", null, extractionConfig);

// Report a preview of at most 100 characters plus the full length.
string text = extraction.Content;
int totalLength = text.Length;
string preview = totalLength <= 100 ? text : text.Substring(0, 100);

Console.WriteLine($"Extracted content (preview): {preview}");
Console.WriteLine($"Total characters: {totalLength}");
|
||||
```
|
||||
21
docs/snippets/csharp/ocr/ocr_force_all_pages.md
Normal file
21
docs/snippets/csharp/ocr/ocr_force_all_pages.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
// Tesseract backend with ForceOcr switched on.
var extractionConfig = new ExtractionConfig
{
    Ocr = new OcrConfig { Backend = "tesseract" },
    ForceOcr = true,
};

// The second argument is passed as null, exactly as in the original call.
var extraction = KreuzbergLib.ExtractFileSync("document.pdf", null, extractionConfig);

// Report a preview of at most 100 characters plus the full length.
string text = extraction.Content;
int totalLength = text.Length;
string preview = totalLength <= 100 ? text : text.Substring(0, 100);

Console.WriteLine($"Extracted content (preview): {preview}");
Console.WriteLine($"Total characters: {totalLength}");
|
||||
```
|
||||
21
docs/snippets/csharp/ocr/ocr_multi_language.md
Normal file
21
docs/snippets/csharp/ocr/ocr_multi_language.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
// Tesseract backend with three language codes joined by '+'.
var ocrSettings = new OcrConfig
{
    Backend = "tesseract",
    Language = "eng+deu+fra",
};
var extractionConfig = new ExtractionConfig { Ocr = ocrSettings };

// The second argument is passed as null, exactly as in the original call.
var extraction = KreuzbergLib.ExtractFileSync("multilingual.pdf", null, extractionConfig);

// Report a preview of at most 100 characters plus the full length.
string text = extraction.Content;
int totalLength = text.Length;
string preview = totalLength <= 100 ? text : text.Substring(0, 100);

Console.WriteLine($"Extracted content (preview): {preview}");
Console.WriteLine($"Total characters: {totalLength}");
|
||||
```
|
||||
16
docs/snippets/csharp/ocr/ocr_paddleocr.md
Normal file
16
docs/snippets/csharp/ocr/ocr_paddleocr.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
// OCR settings: backend "paddle-ocr", language "en".
var ocrSettings = new OcrConfig
{
    Backend = "paddle-ocr",
    Language = "en",
    // For max accuracy, optionally add:
    // PaddleOcrConfig = new PaddleOcrConfig { ModelTier = "server" }
};
var extractionConfig = new ExtractionConfig { Ocr = ocrSettings };

// Run the synchronous extraction and print the extracted text.
var extraction = KreuzbergLib.ExtractFileSync("scanned.pdf", extractionConfig);
Console.WriteLine(extraction.Content);
|
||||
```
|
||||
17
docs/snippets/csharp/ocr/tesseract_backend.cs
Normal file
17
docs/snippets/csharp/ocr/tesseract_backend.cs
Normal file
@@ -0,0 +1,17 @@
|
||||
using Kreuzberg;
|
||||
|
||||
// Tesseract-specific options: Psm is set to 3.
var tesseract = new TesseractConfig { Psm = 3 };

// Tesseract backend with three language codes joined by '+'.
var ocrSettings = new OcrConfig
{
    Backend = "tesseract",
    Language = "eng+deu+fra",
    TesseractConfig = tesseract,
};
var extractionConfig = new ExtractionConfig { Ocr = ocrSettings };

// Run the synchronous extraction and print the extracted text.
var extraction = KreuzbergLib.ExtractFileSync("document.pdf", extractionConfig);
Console.WriteLine(extraction.Content);
|
||||
Reference in New Issue
Block a user