Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,19 @@
using Kreuzberg;
using System.Text;
// Extract the document and print a short preview of its first three pages.
var result = Kreuzberg.ExtractFileSync("document.pdf");
if (result.Metadata.Pages?.Boundaries != null)
{
    // The boundaries are byte ranges, so slice the encoded bytes of the
    // content rather than the string itself.
    // NOTE(review): assumes the offsets refer to the UTF-8 encoding of Content - confirm.
    var contentBytes = Encoding.UTF8.GetBytes(result.Content);
    foreach (var boundary in result.Metadata.Pages.Boundaries.Take(3))
    {
        // Clamp both ends so a boundary outside the buffer cannot throw.
        var byteStart = Math.Clamp((int)boundary.ByteStart, 0, contentBytes.Length);
        var byteEnd = Math.Clamp((int)boundary.ByteEnd, byteStart, contentBytes.Length);
        var pageText = Encoding.UTF8.GetString(contentBytes, byteStart, byteEnd - byteStart);

        // A page can hold fewer than 100 characters; cap the preview length
        // instead of slicing a fixed [..100].
        var preview = pageText[..Math.Min(100, pageText.Length)];

        Console.WriteLine($"Page {boundary.PageNumber}:");
        Console.WriteLine($" Byte range: {boundary.ByteStart}-{boundary.ByteEnd}");
        Console.WriteLine($" Preview: {preview}...");
    }
}

View File

@@ -0,0 +1,22 @@
using Kreuzberg;
// Build an extraction config with ExtractPages switched on.
var pageSettings = new PageConfig { ExtractPages = true };
var config = new ExtractionConfig { Pages = pageSettings };

var result = Kreuzberg.ExtractFileSync("document.pdf", config);

// Pages may be null; only report when per-page results are present.
if (result.Pages is { } pages)
{
    foreach (var page in pages)
    {
        // One header line per page, followed by its content/table/image counts.
        Console.WriteLine($"Page {page.PageNumber}:");
        Console.WriteLine($" Content: {page.Content.Length} chars");
        Console.WriteLine($" Tables: {page.Tables.Count}");
        Console.WriteLine($" Images: {page.Images.Count}");
    }
}

View File

@@ -0,0 +1,17 @@
using Kreuzberg;
// Enable PDF metadata extraction.
var pdfOptions = new PdfConfig { ExtractMetadata = true };
var config = new ExtractionConfig { PdfOptions = pdfOptions };

var result = KreuzbergLib.ExtractFileSync("document.pdf", config);

// Print the author only when PDF-specific metadata is available.
if (result.Metadata?.Format.Pdf is { } pdf)
{
    Console.WriteLine($"Author: {pdf.Author}");
}

View File

@@ -0,0 +1,20 @@
using Kreuzberg;
// Enable PDF metadata extraction.
var pdfOptions = new PdfConfig { ExtractMetadata = true };
var config = new ExtractionConfig { PdfOptions = pdfOptions };

var result = KreuzbergLib.ExtractFileSync("document.pdf", config);

// Print the creation timestamp in round-trip ("O") format, but only when
// both the PDF metadata and the date itself are present.
if (result.Metadata?.Format.Pdf is { } pdf)
{
    if (pdf.CreatedDate is { } created)
    {
        Console.WriteLine($"Created: {created:O}");
    }
}

View File

@@ -0,0 +1,20 @@
```csharp title="C#"
using Kreuzberg;
// Language detection: enabled, 0.9 minimum confidence, single-language mode.
var detection = new LanguageDetectionConfig
{
    Enabled = true,
    MinConfidence = 0.9,
    DetectMultiple = false,
};
var config = new ExtractionConfig { LanguageDetection = detection };

var result = KreuzbergLib.ExtractFileSync("document.pdf", null, config);

// Report the first detected language when the list is non-null and non-empty.
if (result.DetectedLanguages is { Count: > 0 } languages)
{
    Console.WriteLine($"Primary language: {languages[0]}");
}
```

View File

@@ -0,0 +1,24 @@
```csharp title="C#"
using Kreuzberg;
// Language detection: enabled, 0.8 minimum confidence, multi-language mode.
var detection = new LanguageDetectionConfig
{
    Enabled = true,
    MinConfidence = 0.8,
    DetectMultiple = true,
};
var config = new ExtractionConfig { LanguageDetection = detection };

var result = KreuzbergLib.ExtractFileSync("document.pdf", null, config);

// Print a comma-separated summary, then one bullet line per language.
if (result.DetectedLanguages is { Count: > 0 } languages)
{
    var summary = string.Join(", ", languages);
    Console.WriteLine($"Detected languages: {summary}");
    foreach (var code in languages)
    {
        Console.WriteLine($" - {code}");
    }
}
```

View File

@@ -0,0 +1,65 @@
```csharp title="C#"
using Kreuzberg;
// A single config (PDF metadata extraction enabled) is used for both files below.
var config = new ExtractionConfig
{
    PdfOptions = new PdfConfig { ExtractMetadata = true }
};

// --- PDF metadata ---
var result = KreuzbergLib.ExtractFileSync("document.pdf", null, config);
if (result.Metadata?.Format?.Pdf is { } pdf)
{
    Console.WriteLine($"Pages: {pdf.PageCount}");
    Console.WriteLine($"Author: {pdf.Author}");
    Console.WriteLine($"Title: {pdf.Title}");
}

// --- HTML metadata ---
var htmlResult = KreuzbergLib.ExtractFileSync("page.html", null, config);
if (htmlResult.Metadata?.Format?.Html is { } html)
{
    Console.WriteLine($"Title: {html.Title}");
    Console.WriteLine($"Description: {html.Description}");

    // Optional fields: each is printed only when present (and non-empty for collections).
    if (html.Keywords is { Count: > 0 } keywords)
    {
        Console.WriteLine($"Keywords: {string.Join(", ", keywords)}");
    }

    if (html.CanonicalUrl is { } canonicalUrl)
    {
        Console.WriteLine($"Canonical URL: {canonicalUrl}");
    }

    // Open Graph entries are looked up by key; missing keys are skipped.
    if (html.OpenGraph is { Count: > 0 } openGraph)
    {
        if (openGraph.TryGetValue("image", out var ogImage))
        {
            Console.WriteLine($"Open Graph Image: {ogImage}");
        }

        if (openGraph.TryGetValue("title", out var ogTitle))
        {
            Console.WriteLine($"Open Graph Title: {ogTitle}");
        }
    }

    if (html.TwitterCard is { Count: > 0 } twitterCard)
    {
        if (twitterCard.TryGetValue("card", out var cardType))
        {
            Console.WriteLine($"Twitter Card Type: {cardType}");
        }
    }

    if (html.Language is { } language)
    {
        Console.WriteLine($"Language: {language}");
    }

    if (html.Headers is { Count: > 0 } headers)
    {
        var headerTexts = headers.Select(h => h.Text);
        Console.WriteLine($"Headers: {string.Join(", ", headerTexts)}");
    }

    if (html.Links is { Count: > 0 } links)
    {
        foreach (var link in links)
        {
            Console.WriteLine($"Link: {link.Href} ({link.Text})");
        }
    }

    if (html.Images is { Count: > 0 } images)
    {
        var imageSources = images.Select(i => i.Src);
        Console.WriteLine($"Images: {string.Join(", ", imageSources)}");
    }
}
```

View File

@@ -0,0 +1,26 @@
```csharp title="C#"
using Kreuzberg;
// Extract the document and print a short preview of its first three pages.
var config = new ExtractionConfig();
var result = KreuzbergLib.ExtractFileSync("document.pdf", null, config);
if (result.Metadata?.Pages?.Boundaries != null)
{
    // ByteStart/ByteEnd are byte offsets, not char indices: using them with
    // string.Substring returns the wrong text as soon as the content contains
    // a multi-byte character. Slice the encoded bytes instead.
    // NOTE(review): assumes the offsets refer to the UTF-8 encoding of Content - confirm.
    var contentBytes = System.Text.Encoding.UTF8.GetBytes(result.Content);
    foreach (var boundary in result.Metadata.Pages.Boundaries.Take(3))
    {
        // Clamp both ends so a boundary outside the buffer cannot throw.
        var pageStart = Math.Clamp((int)boundary.ByteStart, 0, contentBytes.Length);
        var pageEnd = Math.Clamp((int)boundary.ByteEnd, pageStart, contentBytes.Length);
        var pageText = System.Text.Encoding.UTF8.GetString(contentBytes, pageStart, pageEnd - pageStart);

        // Show at most the first 100 characters of the page.
        var previewEnd = Math.Min(100, pageText.Length);
        var preview = pageText.Substring(0, previewEnd);

        Console.WriteLine($"Page {boundary.PageNumber}:");
        Console.WriteLine($" Byte range: {boundary.ByteStart}-{boundary.ByteEnd}");
        Console.WriteLine($" Preview: {preview}...");
    }
}
```

View File

@@ -0,0 +1,24 @@
```csharp title="C#"
using Kreuzberg;
// Configure extraction with ExtractPages enabled.
var pageConfig = new PageConfig { ExtractPages = true };
var config = new ExtractionConfig { Pages = pageConfig };

var result = KreuzbergLib.ExtractFileSync("document.pdf", null, config);

// Nothing is printed when Pages is null.
if (result.Pages is not null)
{
    foreach (var currentPage in result.Pages)
    {
        // Header line, then the size of the page's content, tables and images.
        Console.WriteLine($"Page {currentPage.PageNumber}:");
        Console.WriteLine($" Content: {currentPage.Content.Length} chars");
        Console.WriteLine($" Tables: {currentPage.Tables.Count}");
        Console.WriteLine($" Images: {currentPage.Images.Count}");
    }
}
```

View File

@@ -0,0 +1,31 @@
using Kreuzberg;
// Enable PDF metadata extraction; the same config is passed for both files below.
var pdfOptions = new PdfConfig { ExtractMetadata = true };
var config = new ExtractionConfig { PdfOptions = pdfOptions };

// --- PDF metadata ---
var result = KreuzbergLib.ExtractFileSync("document.pdf", config);
if (result.Metadata?.Format.Pdf is { } pdf)
{
    Console.WriteLine($"Pages: {pdf.PageCount}");
    Console.WriteLine($"Author: {pdf.Author}");
    Console.WriteLine($"Title: {pdf.Title}");
    Console.WriteLine($"Subject: {pdf.Subject}");
    // "O" is the round-trip date/time format specifier.
    Console.WriteLine($"Created: {pdf.CreatedDate:O}");
}

// --- HTML metadata ---
var htmlResult = KreuzbergLib.ExtractFileSync("page.html", config);
if (htmlResult.Metadata?.Format.Html is { } html)
{
    Console.WriteLine($"Title: {html.Title}");
    Console.WriteLine($"Description: {html.Description}");

    // The Open Graph image is optional: print it only when the key exists.
    if (html.OpenGraph is { } openGraph && openGraph.TryGetValue("image", out var ogImage))
    {
        Console.WriteLine($"Open Graph Image: {ogImage}");
    }
}

View File

@@ -0,0 +1,16 @@
```csharp title="C#"
using Kreuzberg;
// Extract with a default configuration and dump every table found.
var defaultConfig = new ExtractionConfig();
var result = KreuzbergLib.ExtractFileSync("document.pdf", null, defaultConfig);

foreach (var table in result.Tables)
{
    // Row count first, then the table's Markdown rendering.
    var rows = table.Cells;
    Console.WriteLine($"Table with {rows.Count} rows");
    Console.WriteLine(table.Markdown);

    // Finally, each row with its cells separated by " | ".
    foreach (var cells in rows)
    {
        Console.WriteLine(string.Join(" | ", cells));
    }
}
```

View File

@@ -0,0 +1,17 @@
using Kreuzberg;
// Enable PDF metadata extraction.
var config = new ExtractionConfig
{
    PdfOptions = new PdfConfig { ExtractMetadata = true }
};

var result = KreuzbergLib.ExtractFileSync("document.pdf", config);

// The title is printed only when PDF-specific metadata is available.
var pdfMetadata = result.Metadata?.Format.Pdf;
if (pdfMetadata is not null)
{
    Console.WriteLine($"Title: {pdfMetadata.Title}");
}

View File

@@ -0,0 +1,41 @@
```csharp title="C#"
using Kreuzberg;
// Chunking (size 512, overlap 50) with embeddings enabled.
var chunking = new ChunkingConfig
{
    Enabled = true,
    ChunkSize = 512,
    OverlapSize = 50,
};
var embeddings = new EmbeddingConfig { Enabled = true };
var config = new ExtractionConfig
{
    Chunking = chunking,
    Embeddings = embeddings,
};

var result = KreuzbergLib.ExtractFileSync("document.pdf", null, config);

if (result.Chunks is { } chunks)
{
    foreach (var chunk in chunks)
    {
        // Show at most the first 50 characters of the chunk text.
        Console.WriteLine($"Chunk: {chunk.Text[..Math.Min(50, chunk.Text.Length)]}...");

        // Embedding vector: its length and the first five values.
        if (chunk.Embeddings is { Count: > 0 } vector)
        {
            Console.WriteLine($" Embedding dimensions: {vector.Count}");
            Console.WriteLine($" First values: {string.Join(", ", vector.Take(5))}");
        }

        // Optional per-chunk metadata entries, printed when the key is present.
        if (chunk.Metadata is { } metadata)
        {
            if (metadata.ContainsKey("page_number"))
            {
                Console.WriteLine($" Page: {metadata["page_number"]}");
            }

            if (metadata.ContainsKey("token_count"))
            {
                Console.WriteLine($" Tokens: {metadata["token_count"]}");
            }
        }
    }
}
```