This commit is contained in:
48
docs/snippets/csharp/config/ElementBasedOutput.md
Normal file
48
docs/snippets/csharp/config/ElementBasedOutput.md
Normal file
@@ -0,0 +1,48 @@
|
||||
```csharp title="Element-Based Output (C#)"
|
||||
using Kreuzberg;

// Configure element-based output: the result carries a list of typed
// elements (titles, paragraphs, ...) in addition to the extracted content.
var config = new ExtractionConfig
{
    ResultFormat = ResultFormat.ElementBased
};

// Extract document
var result = KreuzbergLib.ExtractFileSync("document.pdf", config);

// Guard against a missing element list before enumerating it.
if (result.Elements != null)
{
    // Access elements
    foreach (var element in result.Elements)
    {
        Console.WriteLine($"Type: {element.ElementType}");

        // Print at most the first 100 characters as a preview.
        var text = element.Text.Length > 100
            ? element.Text.Substring(0, 100)
            : element.Text;
        Console.WriteLine($"Text: {text}");

        if (element.Metadata.PageNumber.HasValue)
        {
            Console.WriteLine($"Page: {element.Metadata.PageNumber}");
        }

        if (element.Metadata.Coordinates != null)
        {
            var coords = element.Metadata.Coordinates;
            // NOTE(review): X1/Y1 are assumed to mirror X0/Y0 as the opposite
            // corner of the bounding box — confirm against the coordinates type.
            Console.WriteLine($"Coords: ({coords.X0}, {coords.Y0}) - ({coords.X1}, {coords.Y1})");
        }

        Console.WriteLine("---");
    }

    // Filter by element type
    var titles = result.Elements
        .Where(e => e.ElementType == ElementType.Title);

    foreach (var title in titles)
    {
        // The heading level, when present, is read from the free-form
        // "level" entry of the element's additional metadata.
        var level = title.Metadata.Additional.TryGetValue("level", out var levelValue)
            ? levelValue.ToString()
            : "unknown";
        Console.WriteLine($"[{level}] {title.Text}");
    }
}
|
||||
```
|
||||
41
docs/snippets/csharp/config/advanced_config.md
Normal file
41
docs/snippets/csharp/config/advanced_config.md
Normal file
@@ -0,0 +1,41 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
var config = new ExtractionConfig
|
||||
{
|
||||
UseCache = true,
|
||||
EnableQualityProcessing = true,
|
||||
Ocr = new OcrConfig
|
||||
{
|
||||
Backend = "tesseract",
|
||||
Language = "eng+deu"
|
||||
},
|
||||
Chunking = new ChunkingConfig
|
||||
{
|
||||
MaxCharacters = 1000,
|
||||
Overlap = 200
|
||||
},
|
||||
LanguageDetection = new LanguageDetectionConfig
|
||||
{
|
||||
Enabled = true,
|
||||
DetectMultiple = true
|
||||
},
|
||||
TokenReduction = new TokenReductionOptions
|
||||
{
|
||||
Mode = "moderate"
|
||||
},
|
||||
Keywords = new KeywordConfig
|
||||
{
|
||||
MaxKeywords = 10,
|
||||
MinScore = 0.1f
|
||||
}
|
||||
};
|
||||
|
||||
var result = await KreuzbergLib.ExtractFile("document.pdf", null, config);
|
||||
Console.WriteLine(result.Content);
|
||||
|
||||
if (result.DetectedLanguages?.Count > 0)
|
||||
{
|
||||
Console.WriteLine($"Languages: {string.Join(", ", result.DetectedLanguages)}");
|
||||
}
|
||||
```
|
||||
9
docs/snippets/csharp/config/basic.cs
Normal file
9
docs/snippets/csharp/config/basic.cs
Normal file
@@ -0,0 +1,9 @@
|
||||
using Kreuzberg;
|
||||
|
||||
var config = new ExtractionConfig
|
||||
{
|
||||
UseCache = true,
|
||||
EnableQualityProcessing = true
|
||||
};
|
||||
|
||||
var result = KreuzbergLib.ExtractFileSync("document.pdf", config);
|
||||
47
docs/snippets/csharp/config/chunking_config.md
Normal file
47
docs/snippets/csharp/config/chunking_config.md
Normal file
@@ -0,0 +1,47 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
var config = new ExtractionConfig
|
||||
{
|
||||
Chunking = new ChunkingConfig
|
||||
{
|
||||
MaxCharacters = 1000,
|
||||
Overlap = 200,
|
||||
ChunkerType = ChunkerType.Text
|
||||
}
|
||||
};
|
||||
|
||||
var result = await KreuzbergLib.ExtractFile("document.pdf", null, config);
|
||||
if (result.Chunks != null)
|
||||
{
|
||||
Console.WriteLine($"Total chunks: {result.Chunks.Count}");
|
||||
foreach (var chunk in result.Chunks)
|
||||
{
|
||||
Console.WriteLine($"Chunk length: {chunk.Content.Length}");
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
```csharp title="C# - Markdown with Heading Context"
|
||||
using Kreuzberg;
|
||||
|
||||
var config = new ExtractionConfig
|
||||
{
|
||||
Chunking = new ChunkingConfig
|
||||
{
|
||||
MaxCharacters = 500,
|
||||
Overlap = 50,
|
||||
ChunkerType = ChunkerType.Markdown,
|
||||
PrependHeadingContext = true
|
||||
}
|
||||
};
|
||||
|
||||
var result = await KreuzbergLib.ExtractFile("document.md", null, config);
|
||||
if (result.Chunks != null)
|
||||
{
|
||||
foreach (var chunk in result.Chunks)
|
||||
{
|
||||
Console.WriteLine($"Content: {chunk.Content.Substring(0, Math.Min(100, chunk.Content.Length))}");
|
||||
}
|
||||
}
|
||||
```
|
||||
12
docs/snippets/csharp/config/config_basic.md
Normal file
12
docs/snippets/csharp/config/config_basic.md
Normal file
@@ -0,0 +1,12 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
var config = new ExtractionConfig
|
||||
{
|
||||
UseCache = true,
|
||||
EnableQualityProcessing = true
|
||||
};
|
||||
|
||||
var result = await KreuzbergLib.ExtractFile("document.pdf", null, config);
|
||||
Console.WriteLine(result.Content);
|
||||
```
|
||||
8
docs/snippets/csharp/config/config_discover.md
Normal file
8
docs/snippets/csharp/config/config_discover.md
Normal file
@@ -0,0 +1,8 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
var config = ExtractionConfig.Discover() ?? new ExtractionConfig();
|
||||
|
||||
var result = await KreuzbergLib.ExtractFile("document.pdf", null, config);
|
||||
Console.WriteLine(result.Content);
|
||||
```
|
||||
19
docs/snippets/csharp/config/config_ocr.md
Normal file
19
docs/snippets/csharp/config/config_ocr.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
var config = new ExtractionConfig
|
||||
{
|
||||
Ocr = new OcrConfig
|
||||
{
|
||||
Backend = "tesseract",
|
||||
Language = "eng"
|
||||
}
|
||||
};
|
||||
|
||||
var result = await KreuzbergLib.ExtractFile("scanned.pdf", null, config);
|
||||
Console.WriteLine($"Content length: {result.Content.Length}");
|
||||
if (result.Tables != null)
|
||||
{
|
||||
Console.WriteLine($"Tables detected: {result.Tables.Count}");
|
||||
}
|
||||
```
|
||||
26
docs/snippets/csharp/config/config_programmatic.md
Normal file
26
docs/snippets/csharp/config/config_programmatic.md
Normal file
@@ -0,0 +1,26 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
var config = new ExtractionConfig
|
||||
{
|
||||
UseCache = true,
|
||||
EnableQualityProcessing = true,
|
||||
Ocr = new OcrConfig
|
||||
{
|
||||
Backend = "tesseract",
|
||||
Language = "eng+deu",
|
||||
TesseractConfig = new TesseractConfig
|
||||
{
|
||||
Psm = 6
|
||||
}
|
||||
},
|
||||
Chunking = new ChunkingConfig
|
||||
{
|
||||
MaxCharacters = 1000,
|
||||
Overlap = 200
|
||||
}
|
||||
};
|
||||
|
||||
var result = await KreuzbergLib.ExtractFile("document.pdf", null, config);
|
||||
Console.WriteLine($"Content length: {result.Content.Length}");
|
||||
```
|
||||
14
docs/snippets/csharp/config/custom_mime_types.cs
Normal file
14
docs/snippets/csharp/config/custom_mime_types.cs
Normal file
@@ -0,0 +1,14 @@
|
||||
using Kreuzberg;

var config = new ExtractionConfig
{
    UseCache = true,
    EnableQualityProcessing = true
};

// Load the raw document bytes that are handed to the extractor together
// with an explicit MIME type.
var fileBytes = System.IO.File.ReadAllBytes("document.pdf");

var result = KreuzbergLib.ExtractBytesSync(
    new BytesWithMime(fileBytes, "application/pdf"),
    config
);

// MIME type reported on the extraction result.
var mimeType = result.MimeType;
|
||||
8
docs/snippets/csharp/config/disable_cache.cs
Normal file
8
docs/snippets/csharp/config/disable_cache.cs
Normal file
@@ -0,0 +1,8 @@
|
||||
using Kreuzberg;
|
||||
|
||||
var config = new ExtractionConfig
|
||||
{
|
||||
UseCache = false
|
||||
};
|
||||
|
||||
var result = KreuzbergLib.ExtractFileSync("document.pdf", config);
|
||||
18
docs/snippets/csharp/config/document_structure_config.md
Normal file
18
docs/snippets/csharp/config/document_structure_config.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```csharp title="Document Structure Config (C#)"
|
||||
using Kreuzberg;
|
||||
|
||||
var config = new ExtractionConfig
|
||||
{
|
||||
IncludeDocumentStructure = true
|
||||
};
|
||||
|
||||
var result = KreuzbergLib.ExtractFileSync("document.pdf", config);
|
||||
|
||||
if (result.Document is not null)
|
||||
{
|
||||
foreach (var node in result.Document.Nodes)
|
||||
{
|
||||
Console.WriteLine($"[{node.Content.NodeType}]");
|
||||
}
|
||||
}
|
||||
```
|
||||
37
docs/snippets/csharp/config/element_based_output.md
Normal file
37
docs/snippets/csharp/config/element_based_output.md
Normal file
@@ -0,0 +1,37 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
var config = new ExtractionConfig
|
||||
{
|
||||
ResultFormat = ResultFormat.ElementBased
|
||||
};
|
||||
|
||||
var result = await KreuzbergLib.ExtractFile("document.pdf", null, config);
|
||||
|
||||
if (result.Elements != null)
|
||||
{
|
||||
foreach (var element in result.Elements)
|
||||
{
|
||||
Console.WriteLine($"Type: {element.ElementType}");
|
||||
Console.WriteLine($"Text: {element.Text.Substring(0, Math.Min(100, element.Text.Length))}");
|
||||
|
||||
if (element.Metadata.PageNumber.HasValue)
|
||||
{
|
||||
Console.WriteLine($"Page: {element.Metadata.PageNumber}");
|
||||
}
|
||||
|
||||
if (element.Metadata.Coordinates != null)
|
||||
{
|
||||
Console.WriteLine($"Coords: ({element.Metadata.Coordinates.X0}, {element.Metadata.Coordinates.Y0})");
|
||||
}
|
||||
|
||||
Console.WriteLine("---");
|
||||
}
|
||||
|
||||
var titles = result.Elements
|
||||
.Where(e => e.ElementType == ElementType.Title)
|
||||
.ToList();
|
||||
|
||||
Console.WriteLine($"Found {titles.Count} titles");
|
||||
}
|
||||
```
|
||||
106
docs/snippets/csharp/config/embedding_config.cs
Normal file
106
docs/snippets/csharp/config/embedding_config.cs
Normal file
@@ -0,0 +1,106 @@
|
||||
using Kreuzberg.Config;
|
||||
|
||||
public class EmbeddingConfigExample
|
||||
{
|
||||
public static void Main()
|
||||
{
|
||||
// Example 1: Preset model (recommended)
|
||||
// Fast, balanced, or quality preset configurations optimized for common use cases.
|
||||
var embeddingConfig = new EmbeddingConfig
|
||||
{
|
||||
Model = new EmbeddingModelType.Preset
|
||||
{
|
||||
Name = "balanced"
|
||||
},
|
||||
BatchSize = 32,
|
||||
Normalize = true,
|
||||
ShowDownloadProgress = true,
|
||||
CacheDir = "~/.cache/kreuzberg/embeddings"
|
||||
};
|
||||
|
||||
// Available presets:
|
||||
// - "fast" (384 dims): Quick prototyping, development, resource-constrained
|
||||
// - "balanced" (768 dims): Production, general-purpose RAG, English documents
|
||||
// - "quality" (1024 dims): Complex documents, maximum accuracy
|
||||
// - "multilingual" (768 dims): International documents, 100+ languages
|
||||
|
||||
|
||||
// Example 2: Custom ONNX model (requires embeddings feature)
|
||||
// Direct access to specific ONNX embedding models from HuggingFace with custom dimensions.
|
||||
embeddingConfig = new EmbeddingConfig
|
||||
{
|
||||
Model = new EmbeddingModelType.Custom
|
||||
{
|
||||
ModelId = "BAAI/bge-small-en-v1.5",
|
||||
Dimensions = 384
|
||||
},
|
||||
BatchSize = 32,
|
||||
Normalize = true,
|
||||
ShowDownloadProgress = true,
|
||||
CacheDir = null // Uses default: .kreuzberg/embeddings/
|
||||
};
|
||||
|
||||
// Popular ONNX-compatible models:
|
||||
// - "BAAI/bge-small-en-v1.5" (384 dims): Fast, efficient
|
||||
// - "BAAI/bge-base-en-v1.5" (768 dims): Balanced quality/speed
|
||||
// - "BAAI/bge-large-en-v1.5" (1024 dims): High quality, slower
|
||||
// - "sentence-transformers/paraphrase-multilingual-mpnet-base-v2" (768 dims): Multilingual support
|
||||
|
||||
|
||||
// Example 3: Alternative Custom ONNX Model
|
||||
// For advanced users wanting different ONNX embedding models.
|
||||
embeddingConfig = new EmbeddingConfig
|
||||
{
|
||||
Model = new EmbeddingModelType.Custom
|
||||
{
|
||||
ModelId = "sentence-transformers/all-mpnet-base-v2",
|
||||
Dimensions = 768
|
||||
},
|
||||
BatchSize = 16, // Larger model requires smaller batch size
|
||||
Normalize = true,
|
||||
ShowDownloadProgress = true,
|
||||
CacheDir = "/var/cache/embeddings"
|
||||
};
|
||||
|
||||
|
||||
// Integration with ChunkingConfig
|
||||
// Add embeddings to your chunking configuration:
|
||||
var chunkingConfig = new ChunkingConfig
|
||||
{
|
||||
MaxChars = 1024,
|
||||
MaxOverlap = 100,
|
||||
Preset = "balanced",
|
||||
Embedding = new EmbeddingConfig
|
||||
{
|
||||
Model = new EmbeddingModelType.Preset
|
||||
{
|
||||
Name = "balanced"
|
||||
},
|
||||
BatchSize = 32,
|
||||
Normalize = true
|
||||
}
|
||||
};
|
||||
|
||||
var extractionConfig = new ExtractionConfig
|
||||
{
|
||||
Chunking = chunkingConfig
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
// Key parameter explanations:
|
||||
//
|
||||
// BatchSize: Number of texts to embed at once (32-128 typical)
|
||||
// - Larger batches are faster but use more memory
|
||||
// - Smaller batches for resource-constrained environments
|
||||
//
|
||||
// Normalize: Whether to normalize vectors (L2 norm)
|
||||
// - true (recommended): Enables cosine similarity in vector DBs
|
||||
// - false: Raw embedding values
|
||||
//
|
||||
// CacheDir: Where to store downloaded models
|
||||
// - null: Uses .kreuzberg/embeddings/ in current directory
|
||||
// - String path: Custom directory for model storage
|
||||
//
|
||||
// ShowDownloadProgress: Display download progress bar
|
||||
// - Useful for monitoring large model downloads
|
||||
25
docs/snippets/csharp/config/embedding_config.md
Normal file
25
docs/snippets/csharp/config/embedding_config.md
Normal file
@@ -0,0 +1,25 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
var config = new ExtractionConfig
|
||||
{
|
||||
Chunking = new ChunkingConfig
|
||||
{
|
||||
MaxCharacters = 1000,
|
||||
Overlap = 200,
|
||||
Embedding = new EmbeddingConfig
|
||||
{
|
||||
Normalize = true,
|
||||
BatchSize = 16,
|
||||
ShowDownloadProgress = true,
|
||||
CacheDir = null
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
var result = await KreuzbergLib.ExtractFile("document.pdf", null, config);
|
||||
if (result.Chunks != null)
|
||||
{
|
||||
Console.WriteLine($"Chunks with embeddings: {result.Chunks.Count}");
|
||||
}
|
||||
```
|
||||
8
docs/snippets/csharp/config/enable_cache.cs
Normal file
8
docs/snippets/csharp/config/enable_cache.cs
Normal file
@@ -0,0 +1,8 @@
|
||||
using Kreuzberg;
|
||||
|
||||
var config = new ExtractionConfig
|
||||
{
|
||||
UseCache = true
|
||||
};
|
||||
|
||||
var result = KreuzbergLib.ExtractFileSync("document.pdf", config);
|
||||
60
docs/snippets/csharp/config/full_example.cs
Normal file
60
docs/snippets/csharp/config/full_example.cs
Normal file
@@ -0,0 +1,60 @@
|
||||
using Kreuzberg;
|
||||
|
||||
var config = new ExtractionConfig
|
||||
{
|
||||
UseCache = true,
|
||||
EnableQualityProcessing = true,
|
||||
ForceOcr = false,
|
||||
Ocr = new OcrConfig
|
||||
{
|
||||
Backend = "tesseract",
|
||||
Language = "eng+fra",
|
||||
TesseractConfig = new TesseractConfig
|
||||
{
|
||||
Psm = 3,
|
||||
Oem = 3,
|
||||
MinConfidence = 0.8,
|
||||
Preprocessing = new ImagePreprocessingConfig
|
||||
{
|
||||
TargetDpi = 300,
|
||||
Denoise = true,
|
||||
Deskew = true,
|
||||
ContrastEnhance = true
|
||||
},
|
||||
EnableTableDetection = true
|
||||
}
|
||||
},
|
||||
PdfOptions = new PdfConfig
|
||||
{
|
||||
ExtractImages = true,
|
||||
ExtractMetadata = true
|
||||
},
|
||||
Images = new ImageExtractionConfig
|
||||
{
|
||||
ExtractImages = true,
|
||||
TargetDpi = 150,
|
||||
MaxImageDimension = 4096
|
||||
},
|
||||
Chunking = new ChunkingConfig
|
||||
{
|
||||
MaxChars = 1000,
|
||||
MaxOverlap = 200
|
||||
},
|
||||
TokenReduction = new TokenReductionConfig
|
||||
{
|
||||
Mode = "moderate",
|
||||
PreserveImportantWords = true
|
||||
},
|
||||
LanguageDetection = new LanguageDetectionConfig
|
||||
{
|
||||
Enabled = true,
|
||||
MinConfidence = 0.8,
|
||||
DetectMultiple = false
|
||||
},
|
||||
Postprocessor = new PostProcessorConfig
|
||||
{
|
||||
Enabled = true
|
||||
}
|
||||
};
|
||||
|
||||
var result = KreuzbergLib.ExtractFileSync("document.pdf", config);
|
||||
99
docs/snippets/csharp/config/hierarchy_config.cs
Normal file
99
docs/snippets/csharp/config/hierarchy_config.cs
Normal file
@@ -0,0 +1,99 @@
|
||||
using Kreuzberg.Config;
|
||||
using Kreuzberg;
|
||||
|
||||
/// <summary>
/// Shows three ways to configure PDF hierarchy extraction: the default
/// six-cluster setup, a reduced three-cluster setup, and a setup with an
/// OCR coverage threshold.
/// </summary>
public class HierarchyConfigExample
{
    public static void Main()
    {
        // Example 1: Basic hierarchy extraction
        // Enabled with default KClusters=6 for standard H1-H6 heading hierarchy.
        // Extract bounding box information for spatial layout awareness.
        var hierarchyConfigBasic = new HierarchyConfig
        {
            Enabled = true,
            KClusters = 6,               // Default: creates 6 font size clusters (H1-H6 structure)
            IncludeBbox = true,          // Include bounding box coordinates
            OcrCoverageThreshold = null  // No OCR coverage threshold
        };

        var pdfConfigBasic = new PdfConfig
        {
            Hierarchy = hierarchyConfigBasic
        };

        var extractionConfigBasic = new ExtractionConfig
        {
            PdfOptions = pdfConfigBasic
        };

        // The configuration is passed to the static extraction entry point:
        // var result = KreuzbergLib.ExtractFileSync("document.pdf", extractionConfigBasic);


        // Example 2: Custom KClusters for minimal structure
        // Use 3 clusters for simpler hierarchy with minimal structure.
        // Useful when you only need major section divisions (Main, Subsection, Detail).
        var hierarchyConfigMinimal = new HierarchyConfig
        {
            Enabled = true,
            KClusters = 3,  // Minimal clustering: just 3 levels
            IncludeBbox = true,
            OcrCoverageThreshold = null
        };

        var pdfConfigMinimal = new PdfConfig
        {
            Hierarchy = hierarchyConfigMinimal
        };

        var extractionConfigMinimal = new ExtractionConfig
        {
            PdfOptions = pdfConfigMinimal
        };


        // Example 3: With OCR coverage threshold
        // Trigger OCR if less than 50% of text has font data.
        // Useful for documents with mixed digital and scanned content.
        var hierarchyConfigOcr = new HierarchyConfig
        {
            Enabled = true,
            KClusters = 6,
            IncludeBbox = true,
            OcrCoverageThreshold = 0.5f  // Trigger OCR if text coverage < 50%
        };

        var pdfConfigOcr = new PdfConfig
        {
            Hierarchy = hierarchyConfigOcr
        };

        var extractionConfigOcr = new ExtractionConfig
        {
            PdfOptions = pdfConfigOcr
        };
    }
}
|
||||
|
||||
// Field descriptions:
|
||||
//
|
||||
// Enabled: bool (default: true)
|
||||
// - Enable or disable hierarchy extraction
|
||||
// - When false, hierarchy structure is not analyzed
|
||||
//
|
||||
// KClusters: int (default: 6, valid: 1-7)
|
||||
// - Number of font size clusters for hierarchy levels
|
||||
// - 6 provides H1-H6 heading levels with body text
|
||||
// - Higher values create more fine-grained hierarchy
|
||||
// - Lower values create simpler structure
|
||||
//
|
||||
// IncludeBbox: bool (default: true)
|
||||
// - Include bounding box coordinates in hierarchy blocks
|
||||
// - Required for spatial layout awareness and document structure
|
||||
// - Set to false only if space optimization is critical
|
||||
//
|
||||
// OcrCoverageThreshold: float? (default: null)
|
||||
// - Range: 0.0 to 1.0
|
||||
// - Triggers OCR when text block coverage falls below this fraction
|
||||
// - Example: 0.5f means "run OCR if less than 50% of page has text data"
|
||||
// - null means no OCR coverage-based triggering
|
||||
17
docs/snippets/csharp/config/html_output.md
Normal file
17
docs/snippets/csharp/config/html_output.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
var config = new ExtractionConfig
|
||||
{
|
||||
OutputFormat = OutputFormat.Html,
|
||||
HtmlOutput = new HtmlOutputConfig
|
||||
{
|
||||
Theme = HtmlTheme.GitHub,
|
||||
EmbedCss = true,
|
||||
ClassPrefix = "kb-"
|
||||
}
|
||||
};
|
||||
|
||||
var result = await KreuzbergLib.ExtractFile("document.pdf", null, config);
|
||||
Console.WriteLine(result.Content);
|
||||
```
|
||||
19
docs/snippets/csharp/config/include_meta.cs
Normal file
19
docs/snippets/csharp/config/include_meta.cs
Normal file
@@ -0,0 +1,19 @@
|
||||
using Kreuzberg;
|
||||
|
||||
var config = new ExtractionConfig
|
||||
{
|
||||
UseCache = true,
|
||||
Ocr = new OcrConfig
|
||||
{
|
||||
Backend = "tesseract",
|
||||
Language = "eng"
|
||||
}
|
||||
};
|
||||
|
||||
var result = KreuzbergLib.ExtractFileSync("document.pdf", config);
|
||||
|
||||
if (result.Metadata != null)
|
||||
{
|
||||
var language = result.Metadata.Language;
|
||||
var format = result.Metadata.FormatType;
|
||||
}
|
||||
66
docs/snippets/csharp/config/keyword_config.cs
Normal file
66
docs/snippets/csharp/config/keyword_config.cs
Normal file
@@ -0,0 +1,66 @@
|
||||
using Kreuzberg;
using Kreuzberg.Keywords;

// Example 1: Basic YAKE configuration
// Uses YAKE algorithm with default parameters and English stopword filtering
var basicYakeConfig = new ExtractionConfig
{
    Keywords = new KeywordConfig
    {
        Algorithm = KeywordAlgorithm.Yake,
        MaxKeywords = 10,
        MinScore = 0.0f,
        NgramRange = (1, 3),
        Language = "en",
        YakeParams = null,
        RakeParams = null,
    }
};

var result = KreuzbergLib.ExtractFileSync("document.pdf", basicYakeConfig);
// Guard before joining: string.Join throws on a null sequence.
if (result.Keywords is not null)
{
    Console.WriteLine($"Keywords: {string.Join(", ", result.Keywords)}");
}

// Example 2: Advanced YAKE with custom parameters
// Fine-tunes YAKE with custom window size for co-occurrence analysis
var advancedYakeConfig = new ExtractionConfig
{
    Keywords = new KeywordConfig
    {
        Algorithm = KeywordAlgorithm.Yake,
        MaxKeywords = 15,
        MinScore = 0.1f,
        NgramRange = (1, 2),
        Language = "en",
        YakeParams = new YakeParams
        {
            WindowSize = 1,
        },
        RakeParams = null,
    }
};

result = KreuzbergLib.ExtractFileSync("document.pdf", advancedYakeConfig);
if (result.Keywords is not null)
{
    Console.WriteLine($"Keywords: {string.Join(", ", result.Keywords)}");
}

// Example 3: RAKE configuration
// Uses RAKE algorithm for rapid keyword extraction with phrase constraints
var rakeConfig = new ExtractionConfig
{
    Keywords = new KeywordConfig
    {
        Algorithm = KeywordAlgorithm.Rake,
        MaxKeywords = 10,
        MinScore = 5.0f,
        NgramRange = (1, 3),
        Language = "en",
        YakeParams = null,
        RakeParams = new RakeParams
        {
            MinWordLength = 1,
            MaxWordsPerPhrase = 3,
        },
    }
};

result = KreuzbergLib.ExtractFileSync("document.pdf", rakeConfig);
if (result.Keywords is not null)
{
    Console.WriteLine($"Keywords: {string.Join(", ", result.Keywords)}");
}
|
||||
21
docs/snippets/csharp/config/keyword_extraction_config.md
Normal file
21
docs/snippets/csharp/config/keyword_extraction_config.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
var config = new ExtractionConfig
|
||||
{
|
||||
Keywords = new KeywordConfig
|
||||
{
|
||||
Algorithm = KeywordAlgorithm.Yake,
|
||||
MaxKeywords = 10,
|
||||
MinScore = 0.1f,
|
||||
NgramRange = [1, 3],
|
||||
Language = "en"
|
||||
}
|
||||
};
|
||||
|
||||
var result = await KreuzbergLib.ExtractFile("document.pdf", null, config);
|
||||
if (result.Keywords != null)
|
||||
{
|
||||
Console.WriteLine($"Keywords: {string.Join(", ", result.Keywords)}");
|
||||
}
|
||||
```
|
||||
20
docs/snippets/csharp/config/language_detection_config.md
Normal file
20
docs/snippets/csharp/config/language_detection_config.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
var config = new ExtractionConfig
|
||||
{
|
||||
LanguageDetection = new LanguageDetectionConfig
|
||||
{
|
||||
Enabled = true,
|
||||
MinConfidence = 0.8,
|
||||
DetectMultiple = true
|
||||
}
|
||||
};
|
||||
|
||||
var result = await KreuzbergLib.ExtractFile("document.pdf", null, config);
|
||||
Console.WriteLine($"Detected language: {result.Language}");
|
||||
if (result.DetectedLanguages != null)
|
||||
{
|
||||
Console.WriteLine($"All detected: {string.Join(", ", result.DetectedLanguages)}");
|
||||
}
|
||||
```
|
||||
22
docs/snippets/csharp/config/ocr_dpi_config.md
Normal file
22
docs/snippets/csharp/config/ocr_dpi_config.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
var config = new ExtractionConfig
|
||||
{
|
||||
Images = new ImageExtractionConfig
|
||||
{
|
||||
ExtractImages = true,
|
||||
TargetDpi = 300,
|
||||
MaxImageDimension = 4096,
|
||||
AutoAdjustDpi = true,
|
||||
MinDpi = 150,
|
||||
MaxDpi = 600
|
||||
}
|
||||
};
|
||||
|
||||
var result = await KreuzbergLib.ExtractFile("document.pdf", null, config);
|
||||
if (result.Images != null)
|
||||
{
|
||||
Console.WriteLine($"Extracted images: {result.Images.Count}");
|
||||
}
|
||||
```
|
||||
12
docs/snippets/csharp/config/ocr_lang.cs
Normal file
12
docs/snippets/csharp/config/ocr_lang.cs
Normal file
@@ -0,0 +1,12 @@
|
||||
using Kreuzberg;
|
||||
|
||||
var config = new ExtractionConfig
|
||||
{
|
||||
Ocr = new OcrConfig
|
||||
{
|
||||
Backend = "tesseract",
|
||||
Language = "eng+fra"
|
||||
}
|
||||
};
|
||||
|
||||
var result = KreuzbergLib.ExtractFileSync("document.pdf", config);
|
||||
17
docs/snippets/csharp/config/parse_links.cs
Normal file
17
docs/snippets/csharp/config/parse_links.cs
Normal file
@@ -0,0 +1,17 @@
|
||||
using Kreuzberg;
|
||||
|
||||
var config = new ExtractionConfig
|
||||
{
|
||||
UseCache = true
|
||||
};
|
||||
|
||||
var result = KreuzbergLib.ExtractFileSync("document.html", config);
|
||||
|
||||
if (result.Metadata?.Format.Text?.Links != null)
|
||||
{
|
||||
foreach (var link in result.Metadata.Format.Text.Links)
|
||||
{
|
||||
var text = link[0];
|
||||
var url = link[1];
|
||||
}
|
||||
}
|
||||
18
docs/snippets/csharp/config/parse_metadata.cs
Normal file
18
docs/snippets/csharp/config/parse_metadata.cs
Normal file
@@ -0,0 +1,18 @@
|
||||
using Kreuzberg;
|
||||
|
||||
var config = new ExtractionConfig
|
||||
{
|
||||
PdfOptions = new PdfConfig
|
||||
{
|
||||
ExtractMetadata = true
|
||||
}
|
||||
};
|
||||
|
||||
var result = KreuzbergLib.ExtractFileSync("document.pdf", config);
|
||||
|
||||
if (result.Metadata?.Format.Pdf != null)
|
||||
{
|
||||
var title = result.Metadata.Format.Pdf.Title;
|
||||
var author = result.Metadata.Format.Pdf.Author;
|
||||
var pageCount = result.Metadata.Format.Pdf.PageCount;
|
||||
}
|
||||
21
docs/snippets/csharp/config/pdf_config.md
Normal file
21
docs/snippets/csharp/config/pdf_config.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
var config = new ExtractionConfig
|
||||
{
|
||||
PdfOptions = new PdfConfig
|
||||
{
|
||||
ExtractImages = true,
|
||||
ExtractMetadata = true,
|
||||
ExtractAnnotations = false,
|
||||
Passwords = new List<string> { "password123" }
|
||||
}
|
||||
};
|
||||
|
||||
var result = await KreuzbergLib.ExtractFile("encrypted.pdf", null, config);
|
||||
if (result.Metadata != null)
|
||||
{
|
||||
Console.WriteLine($"Title: {result.Metadata.Title}");
|
||||
Console.WriteLine($"Authors: {string.Join(", ", result.Metadata.Authors ?? new List<string>())}");
|
||||
}
|
||||
```
|
||||
74
docs/snippets/csharp/config/pdf_hierarchy_config.md
Normal file
74
docs/snippets/csharp/config/pdf_hierarchy_config.md
Normal file
@@ -0,0 +1,74 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;

// Basic hierarchy configuration with properties
var config = new ExtractionConfig
{
    PdfOptions = new PdfConfig
    {
        ExtractImages = true,
        Hierarchy = new HierarchyConfig
        {
            Enabled = true,
            KClusters = 6,
            IncludeBbox = true,
            OcrCoverageThreshold = 0.8f
        }
    }
};

var result = await KreuzbergLib.ExtractFileAsync("document.pdf", config);
Console.WriteLine($"Content length: {result.Content.Length}");

// Advanced hierarchy detection with custom parameters
var advancedConfig = new ExtractionConfig
{
    PdfOptions = new PdfConfig
    {
        ExtractImages = true,
        Hierarchy = new HierarchyConfig
        {
            Enabled = true,
            KClusters = 7,               // More clusters for detailed hierarchy (valid range: 1-7)
            IncludeBbox = true,          // Include bounding box coordinates
            OcrCoverageThreshold = 0.7f  // Trigger OCR when text coverage falls below 70%
        }
    }
};

// Each extraction gets its own variable: top-level statements share one
// scope, so re-declaring `result` would not compile.
var advancedResult = await KreuzbergLib.ExtractFileAsync("complex_document.pdf", advancedConfig);
Console.WriteLine($"Advanced hierarchy detection completed: {advancedResult.Content.Length} chars");

// Minimal configuration with only enabled flag
var minimalConfig = new ExtractionConfig
{
    PdfOptions = new PdfConfig
    {
        Hierarchy = new HierarchyConfig
        {
            Enabled = true,
            // Other properties use defaults:
            // KClusters = 6
            // IncludeBbox = true
        }
    }
};

var minimalResult = await KreuzbergLib.ExtractFileAsync("document.pdf", minimalConfig);
Console.WriteLine("Extraction with default hierarchy settings complete");

// Disabling hierarchy detection
var noHierarchyConfig = new ExtractionConfig
{
    PdfOptions = new PdfConfig
    {
        Hierarchy = new HierarchyConfig
        {
            Enabled = false
        }
    }
};

var noHierarchyResult = await KreuzbergLib.ExtractFileAsync("document.pdf", noHierarchyConfig);
Console.WriteLine("Extraction without hierarchy detection complete");
|
||||
```
|
||||
13
docs/snippets/csharp/config/postprocessor.cs
Normal file
13
docs/snippets/csharp/config/postprocessor.cs
Normal file
@@ -0,0 +1,13 @@
|
||||
using Kreuzberg;
|
||||
|
||||
var config = new ExtractionConfig
|
||||
{
|
||||
UseCache = true,
|
||||
Postprocessor = new PostProcessorConfig
|
||||
{
|
||||
Enabled = true,
|
||||
EnabledProcessors = new List<string> { "normalize_whitespace", "remove_diacritics" }
|
||||
}
|
||||
};
|
||||
|
||||
var result = KreuzbergLib.ExtractFileSync("document.pdf", config);
|
||||
20
docs/snippets/csharp/config/postprocessor_config.md
Normal file
20
docs/snippets/csharp/config/postprocessor_config.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
var config = new ExtractionConfig
|
||||
{
|
||||
Postprocessor = new PostProcessorConfig
|
||||
{
|
||||
Enabled = true,
|
||||
EnabledProcessors = new List<string>
|
||||
{
|
||||
"whitespace_normalizer",
|
||||
"unicode_normalizer"
|
||||
},
|
||||
DisabledProcessors = null
|
||||
}
|
||||
};
|
||||
|
||||
var result = await KreuzbergLib.ExtractFile("document.pdf", null, config);
|
||||
Console.WriteLine($"Processed content: {result.Content.Substring(0, Math.Min(100, result.Content.Length))}");
|
||||
```
|
||||
13
docs/snippets/csharp/config/quality_processing_config.md
Normal file
13
docs/snippets/csharp/config/quality_processing_config.md
Normal file
@@ -0,0 +1,13 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
var config = new ExtractionConfig
|
||||
{
|
||||
EnableQualityProcessing = true,
|
||||
UseCache = true
|
||||
};
|
||||
|
||||
var result = await KreuzbergLib.ExtractFile("document.pdf", null, config);
|
||||
Console.WriteLine($"Quality score: {result.QualityScore}");
|
||||
Console.WriteLine($"Content length: {result.Content.Length}");
|
||||
```
|
||||
22
docs/snippets/csharp/config/tesseract_config.md
Normal file
22
docs/snippets/csharp/config/tesseract_config.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
var config = new ExtractionConfig
|
||||
{
|
||||
Ocr = new OcrConfig
|
||||
{
|
||||
Backend = "tesseract",
|
||||
Language = "eng+deu",
|
||||
TesseractConfig = new TesseractConfig
|
||||
{
|
||||
Psm = 6,
|
||||
Oem = 3,
|
||||
MinConfidence = 0.5,
|
||||
Language = "eng"
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
var result = await KreuzbergLib.ExtractFile("scanned.pdf", null, config);
|
||||
Console.WriteLine($"OCR text: {result.Content.Substring(0, Math.Min(100, result.Content.Length))}");
|
||||
```
|
||||
16
docs/snippets/csharp/config/token_reduction_config.md
Normal file
16
docs/snippets/csharp/config/token_reduction_config.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
|
||||
|
||||
var config = new ExtractionConfig
|
||||
{
|
||||
TokenReduction = new TokenReductionOptions
|
||||
{
|
||||
Mode = "moderate",
|
||||
PreserveImportantWords = true
|
||||
}
|
||||
};
|
||||
|
||||
var result = await KreuzbergLib.ExtractFile("document.pdf", null, config);
|
||||
Console.WriteLine($"Reduced content length: {result.Content.Length}");
|
||||
Console.WriteLine($"Content: {result.Content.Substring(0, Math.Min(100, result.Content.Length))}");
|
||||
```
|
||||
18
docs/snippets/csharp/config/validator.cs
Normal file
18
docs/snippets/csharp/config/validator.cs
Normal file
@@ -0,0 +1,18 @@
|
||||
using Kreuzberg;
|
||||
|
||||
var config = new ExtractionConfig
|
||||
{
|
||||
UseCache = true,
|
||||
EnableQualityProcessing = true
|
||||
};
|
||||
|
||||
var result = KreuzbergLib.ExtractFileSync("document.pdf", config);
|
||||
|
||||
if (!result.Success)
|
||||
{
|
||||
if (result.Metadata?.Error != null)
|
||||
{
|
||||
var errorType = result.Metadata.Error.ErrorType;
|
||||
var errorMessage = result.Metadata.Error.Message;
|
||||
}
|
||||
}
|
||||
13
docs/snippets/csharp/config/with_cache.cs
Normal file
13
docs/snippets/csharp/config/with_cache.cs
Normal file
@@ -0,0 +1,13 @@
|
||||
using Kreuzberg;
|
||||
|
||||
var config = new ExtractionConfig
|
||||
{
|
||||
UseCache = true,
|
||||
Ocr = new OcrConfig
|
||||
{
|
||||
Backend = "tesseract",
|
||||
Language = "eng"
|
||||
}
|
||||
};
|
||||
|
||||
var result = KreuzbergLib.ExtractFileSync("document.pdf", config);
|
||||
10
docs/snippets/csharp/config/with_timeout.cs
Normal file
10
docs/snippets/csharp/config/with_timeout.cs
Normal file
@@ -0,0 +1,10 @@
|
||||
using Kreuzberg;

var config = new ExtractionConfig
{
    UseCache = true,
    EnableQualityProcessing = true
};

// Cancel the extraction if it has not completed within 30 seconds.
// CancellationTokenSource is IDisposable, so release it when the scope ends.
using var cts = new System.Threading.CancellationTokenSource(TimeSpan.FromSeconds(30));
var result = await KreuzbergLib.ExtractFileAsync("document.pdf", config, cts.Token);
|
||||
Reference in New Issue
Block a user