This commit is contained in:
19
docs/snippets/csharp/metadata/PageBoundaries.cs
Normal file
19
docs/snippets/csharp/metadata/PageBoundaries.cs
Normal file
@@ -0,0 +1,19 @@
|
||||
using Kreuzberg;
using System.Text;

// Prints the byte range and a short text preview for the first three page
// boundaries reported for "document.pdf".
//
// NOTE(review): the entry point is spelled KreuzbergLib to agree with the other
// C# snippets. `Kreuzberg` is the namespace imported above, so it cannot be used
// as a type name here. Confirm against the binding's public API.
var result = KreuzbergLib.ExtractFileSync("document.pdf");

// Metadata is guarded as well as Pages, matching the markdown variant of this snippet.
if (result.Metadata?.Pages?.Boundaries != null)
{
    // Boundaries are expressed in bytes, so slice the encoded bytes instead of
    // the UTF-16 string.
    // NOTE(review): assumes the offsets index the UTF-8 encoding of Content — confirm.
    var contentBytes = Encoding.UTF8.GetBytes(result.Content);

    foreach (var boundary in result.Metadata.Pages.Boundaries.Take(3))
    {
        // Clamp both ends so an out-of-range boundary cannot throw.
        var byteEnd = Math.Clamp((int)boundary.ByteEnd, 0, contentBytes.Length);
        var byteStart = Math.Clamp((int)boundary.ByteStart, 0, byteEnd);

        var pageText = Encoding.UTF8.GetString(contentBytes, byteStart, byteEnd - byteStart);

        // Pages shorter than 100 characters are shown in full instead of throwing.
        var preview = pageText[..Math.Min(100, pageText.Length)];

        Console.WriteLine($"Page {boundary.PageNumber}:");
        Console.WriteLine($" Byte range: {boundary.ByteStart}-{boundary.ByteEnd}");
        Console.WriteLine($" Preview: {preview}...");
    }
}
|
||||
22
docs/snippets/csharp/metadata/PageTrackingBasic.cs
Normal file
22
docs/snippets/csharp/metadata/PageTrackingBasic.cs
Normal file
@@ -0,0 +1,22 @@
|
||||
using Kreuzberg;

// Turns on page extraction and prints, for every extracted page, its content
// length and the number of tables and images found on it.
var config = new ExtractionConfig
{
    Pages = new PageConfig
    {
        ExtractPages = true
    }
};

// NOTE(review): the entry point is spelled KreuzbergLib to agree with the other
// C# snippets. `Kreuzberg` is the namespace imported above, so it cannot be used
// as a type name here. Confirm against the binding's public API.
var result = KreuzbergLib.ExtractFileSync("document.pdf", config);

// Pages is treated as optional; nothing is printed when it is null.
if (result.Pages != null)
{
    foreach (var page in result.Pages)
    {
        Console.WriteLine($"Page {page.PageNumber}:");
        Console.WriteLine($" Content: {page.Content.Length} chars");
        Console.WriteLine($" Tables: {page.Tables.Count}");
        Console.WriteLine($" Images: {page.Images.Count}");
    }
}
|
||||
17
docs/snippets/csharp/metadata/author.cs
Normal file
17
docs/snippets/csharp/metadata/author.cs
Normal file
@@ -0,0 +1,17 @@
|
||||
using Kreuzberg;

// Extracts "document.pdf" with PDF metadata extraction switched on and prints
// the author recorded in the PDF metadata.
var pdfOptions = new PdfConfig { ExtractMetadata = true };
var config = new ExtractionConfig { PdfOptions = pdfOptions };

var result = KreuzbergLib.ExtractFileSync("document.pdf", config);

// Nothing is printed when the PDF metadata is missing.
if (result.Metadata?.Format.Pdf is { } pdf)
{
    Console.WriteLine($"Author: {pdf.Author}");
}
|
||||
20
docs/snippets/csharp/metadata/created_date.cs
Normal file
20
docs/snippets/csharp/metadata/created_date.cs
Normal file
@@ -0,0 +1,20 @@
|
||||
using Kreuzberg;

// Extracts "document.pdf" with PDF metadata extraction switched on and prints
// the creation date in round-trip ("O") format when one is present.
var pdfOptions = new PdfConfig { ExtractMetadata = true };
var config = new ExtractionConfig { PdfOptions = pdfOptions };

var result = KreuzbergLib.ExtractFileSync("document.pdf", config);

// Both the PDF metadata and the date itself may be absent; print only when both exist.
if (result.Metadata?.Format.Pdf is { } pdf && pdf.CreatedDate is { } created)
{
    Console.WriteLine($"Created: {created:O}");
}
|
||||
20
docs/snippets/csharp/metadata/language_detection.md
Normal file
20
docs/snippets/csharp/metadata/language_detection.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;

// Enables language detection (MinConfidence 0.9, DetectMultiple off) and prints
// the first detected language.
var detection = new LanguageDetectionConfig
{
    Enabled = true,
    MinConfidence = 0.9,
    DetectMultiple = false
};
var config = new ExtractionConfig { LanguageDetection = detection };

var result = KreuzbergLib.ExtractFileSync("document.pdf", null, config);

// Skip the output when the list is null or empty.
if (result.DetectedLanguages is { Count: > 0 } languages)
{
    Console.WriteLine($"Primary language: {languages[0]}");
}
|
||||
```
|
||||
@@ -0,0 +1,24 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;

// Enables language detection (MinConfidence 0.8, DetectMultiple on) and prints
// every detected language, first on one line and then one per line.
var detection = new LanguageDetectionConfig
{
    Enabled = true,
    MinConfidence = 0.8,
    DetectMultiple = true
};
var config = new ExtractionConfig { LanguageDetection = detection };

var result = KreuzbergLib.ExtractFileSync("document.pdf", null, config);

// Skip the output when the list is null or empty.
if (result.DetectedLanguages is { Count: > 0 } languages)
{
    var joined = string.Join(", ", languages);
    Console.WriteLine($"Detected languages: {joined}");

    foreach (var language in languages)
    {
        Console.WriteLine($" - {language}");
    }
}
|
||||
```
|
||||
65
docs/snippets/csharp/metadata/metadata.md
Normal file
65
docs/snippets/csharp/metadata/metadata.md
Normal file
@@ -0,0 +1,65 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;

// Prints PDF metadata for "document.pdf" and HTML metadata for "page.html".
// The same configuration object is passed to both extraction calls.
var config = new ExtractionConfig
{
    PdfOptions = new PdfConfig { ExtractMetadata = true }
};

var result = KreuzbergLib.ExtractFileSync("document.pdf", null, config);

// PDF section: printed only when PDF-format metadata is present.
if (result.Metadata?.Format?.Pdf is { } pdfMeta)
{
    Console.WriteLine($"Pages: {pdfMeta.PageCount}");
    Console.WriteLine($"Author: {pdfMeta.Author}");
    Console.WriteLine($"Title: {pdfMeta.Title}");
}

var htmlResult = KreuzbergLib.ExtractFileSync("page.html", null, config);

// HTML section: every optional field is checked before it is printed.
if (htmlResult.Metadata?.Format?.Html is { } htmlMeta)
{
    Console.WriteLine($"Title: {htmlMeta.Title}");
    Console.WriteLine($"Description: {htmlMeta.Description}");

    if (htmlMeta.Keywords is { Count: > 0 } keywords)
    {
        Console.WriteLine($"Keywords: {string.Join(", ", keywords)}");
    }

    if (htmlMeta.CanonicalUrl is { } canonicalUrl)
    {
        Console.WriteLine($"Canonical URL: {canonicalUrl}");
    }

    // Open Graph entries are looked up by key; absent keys print nothing.
    if (htmlMeta.OpenGraph is { Count: > 0 } openGraph)
    {
        if (openGraph.TryGetValue("image", out var ogImage))
        {
            Console.WriteLine($"Open Graph Image: {ogImage}");
        }

        if (openGraph.TryGetValue("title", out var ogTitle))
        {
            Console.WriteLine($"Open Graph Title: {ogTitle}");
        }
    }

    if (htmlMeta.TwitterCard is { Count: > 0 } twitterCard
        && twitterCard.TryGetValue("card", out var cardType))
    {
        Console.WriteLine($"Twitter Card Type: {cardType}");
    }

    if (htmlMeta.Language is { } language)
    {
        Console.WriteLine($"Language: {language}");
    }

    if (htmlMeta.Headers is { Count: > 0 } headers)
    {
        var headerTexts = headers.Select(h => h.Text);
        Console.WriteLine($"Headers: {string.Join(", ", headerTexts)}");
    }

    if (htmlMeta.Links is { Count: > 0 } links)
    {
        foreach (var link in links)
        {
            Console.WriteLine($"Link: {link.Href} ({link.Text})");
        }
    }

    if (htmlMeta.Images is { Count: > 0 } images)
    {
        var imageSources = images.Select(i => i.Src);
        Console.WriteLine($"Images: {string.Join(", ", imageSources)}");
    }
}
|
||||
```
|
||||
26
docs/snippets/csharp/metadata/page_boundaries.md
Normal file
26
docs/snippets/csharp/metadata/page_boundaries.md
Normal file
@@ -0,0 +1,26 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;
using System.Text;

// Prints the byte range and a short text preview for the first three page
// boundaries reported for "document.pdf".
var config = new ExtractionConfig();
var result = KreuzbergLib.ExtractFileSync("document.pdf", null, config);

if (result.Metadata?.Pages?.Boundaries != null)
{
    // ByteStart/ByteEnd are byte offsets, while string indices count UTF-16 code
    // units; the two diverge as soon as the text contains non-ASCII characters.
    // Slice the encoded bytes and decode the slice instead of using Substring.
    // NOTE(review): assumes the offsets index the UTF-8 encoding of Content — confirm.
    var contentBytes = Encoding.UTF8.GetBytes(result.Content);

    foreach (var boundary in result.Metadata.Pages.Boundaries.Take(3))
    {
        // Clamp both ends so an out-of-range boundary cannot throw.
        var pageEnd = Math.Clamp((int)boundary.ByteEnd, 0, contentBytes.Length);
        var pageStart = Math.Clamp((int)boundary.ByteStart, 0, pageEnd);

        var pageText = Encoding.UTF8.GetString(contentBytes, pageStart, pageEnd - pageStart);

        // Show at most the first 100 characters of the page.
        var previewEnd = Math.Min(100, pageText.Length);
        var preview = pageText.Substring(0, previewEnd);

        Console.WriteLine($"Page {boundary.PageNumber}:");
        Console.WriteLine($" Byte range: {boundary.ByteStart}-{boundary.ByteEnd}");
        Console.WriteLine($" Preview: {preview}...");
    }
}
|
||||
```
|
||||
24
docs/snippets/csharp/metadata/page_tracking_basic.md
Normal file
24
docs/snippets/csharp/metadata/page_tracking_basic.md
Normal file
@@ -0,0 +1,24 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;

// Turns on page extraction and prints, for every extracted page, its content
// length and the number of tables and images found on it.
var pageOptions = new PageConfig { ExtractPages = true };
var config = new ExtractionConfig { Pages = pageOptions };

var result = KreuzbergLib.ExtractFileSync("document.pdf", null, config);

// Nothing is printed when no page list is returned.
if (result.Pages is { } pages)
{
    foreach (var page in pages)
    {
        var contentLength = page.Content.Length;
        var tableCount = page.Tables.Count;
        var imageCount = page.Images.Count;

        Console.WriteLine($"Page {page.PageNumber}:");
        Console.WriteLine($" Content: {contentLength} chars");
        Console.WriteLine($" Tables: {tableCount}");
        Console.WriteLine($" Images: {imageCount}");
    }
}
|
||||
```
|
||||
31
docs/snippets/csharp/metadata/parse_all.cs
Normal file
31
docs/snippets/csharp/metadata/parse_all.cs
Normal file
@@ -0,0 +1,31 @@
|
||||
using Kreuzberg;

// Prints PDF metadata for "document.pdf" and a few HTML metadata fields for
// "page.html". The same configuration object is passed to both extraction calls.
var pdfOptions = new PdfConfig { ExtractMetadata = true };
var config = new ExtractionConfig { PdfOptions = pdfOptions };

var result = KreuzbergLib.ExtractFileSync("document.pdf", config);

// PDF section: printed only when PDF-format metadata is present.
if (result.Metadata?.Format.Pdf is { } pdfMeta)
{
    Console.WriteLine($"Pages: {pdfMeta.PageCount}");
    Console.WriteLine($"Author: {pdfMeta.Author}");
    Console.WriteLine($"Title: {pdfMeta.Title}");
    Console.WriteLine($"Subject: {pdfMeta.Subject}");
    Console.WriteLine($"Created: {pdfMeta.CreatedDate:O}");
}

var htmlResult = KreuzbergLib.ExtractFileSync("page.html", config);

// HTML section: the Open Graph image is printed only when that key exists.
if (htmlResult.Metadata?.Format.Html is { } htmlMeta)
{
    Console.WriteLine($"Title: {htmlMeta.Title}");
    Console.WriteLine($"Description: {htmlMeta.Description}");

    if (htmlMeta.OpenGraph is { } openGraph
        && openGraph.TryGetValue("image", out var ogImage))
    {
        Console.WriteLine($"Open Graph Image: {ogImage}");
    }
}
|
||||
16
docs/snippets/csharp/metadata/tables.md
Normal file
16
docs/snippets/csharp/metadata/tables.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;

// Prints every table extracted from "document.pdf": its row count, its Markdown
// rendering, and then each row with cells separated by " | ".
var config = new ExtractionConfig();
var result = KreuzbergLib.ExtractFileSync("document.pdf", null, config);

foreach (var table in result.Tables)
{
    var rows = table.Cells;

    Console.WriteLine($"Table with {rows.Count} rows");
    Console.WriteLine(table.Markdown);

    foreach (var row in rows)
    {
        var line = string.Join(" | ", row);
        Console.WriteLine(line);
    }
}
|
||||
```
|
||||
17
docs/snippets/csharp/metadata/title.cs
Normal file
17
docs/snippets/csharp/metadata/title.cs
Normal file
@@ -0,0 +1,17 @@
|
||||
using Kreuzberg;

// Extracts "document.pdf" with PDF metadata extraction switched on and prints
// the title recorded in the PDF metadata.
var pdfOptions = new PdfConfig { ExtractMetadata = true };
var config = new ExtractionConfig { PdfOptions = pdfOptions };

var result = KreuzbergLib.ExtractFileSync("document.pdf", config);

// Nothing is printed when the PDF metadata is missing.
if (result.Metadata?.Format.Pdf is { } pdf)
{
    Console.WriteLine($"Title: {pdf.Title}");
}
|
||||
41
docs/snippets/csharp/metadata/vector_database_integration.md
Normal file
41
docs/snippets/csharp/metadata/vector_database_integration.md
Normal file
@@ -0,0 +1,41 @@
|
||||
```csharp title="C#"
|
||||
using Kreuzberg;

// Enables chunking (ChunkSize 512, OverlapSize 50) together with embeddings,
// then prints a preview, the embedding size and selected metadata per chunk.
var chunking = new ChunkingConfig
{
    Enabled = true,
    ChunkSize = 512,
    OverlapSize = 50
};
var embeddings = new EmbeddingConfig { Enabled = true };
var config = new ExtractionConfig
{
    Chunking = chunking,
    Embeddings = embeddings
};

var result = KreuzbergLib.ExtractFileSync("document.pdf", null, config);

// Nothing is printed when no chunk list is returned.
if (result.Chunks is { } chunks)
{
    foreach (var chunk in chunks)
    {
        // Show at most the first 50 characters of the chunk text.
        var previewLength = Math.Min(50, chunk.Text.Length);
        Console.WriteLine($"Chunk: {chunk.Text[..previewLength]}...");

        if (chunk.Embeddings is { Count: > 0 } vector)
        {
            Console.WriteLine($" Embedding dimensions: {vector.Count}");
            Console.WriteLine($" First values: {string.Join(", ", vector.Take(5))}");
        }

        // Metadata entries are looked up by key; absent keys print nothing.
        if (chunk.Metadata is { } metadata)
        {
            if (metadata.TryGetValue("page_number", out var pageNumber))
            {
                Console.WriteLine($" Page: {pageNumber}");
            }

            if (metadata.TryGetValue("token_count", out var tokenCount))
            {
                Console.WriteLine($" Tokens: {tokenCount}");
            }
        }
    }
}
|
||||
```
|
||||
Reference in New Issue
Block a user