This commit is contained in:
26
docs/snippets/go/metadata/language_detection.md
Normal file
26
docs/snippets/go/metadata/language_detection.md
Normal file
@@ -0,0 +1,26 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
enabled := true
|
||||
minConfidence := 0.9
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", nil, kreuzberg.ExtractionConfig{
|
||||
LanguageDetection: &kreuzberg.LanguageDetectionConfig{
|
||||
Enabled: &enabled,
|
||||
MinConfidence: &minConfidence,
|
||||
DetectMultiple: true,
|
||||
},
|
||||
})
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
|
||||
log.Println("content length:", len(result.Content))
|
||||
}
|
||||
```
|
||||
29
docs/snippets/go/metadata/language_detection_multilingual.md
Normal file
29
docs/snippets/go/metadata/language_detection_multilingual.md
Normal file
@@ -0,0 +1,29 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
minConfidence := 0.8
|
||||
config := &kreuzberg.ExtractionConfig{
|
||||
LanguageDetection: &kreuzberg.LanguageDetectionConfig{
|
||||
Enabled: true,
|
||||
MinConfidence: &minConfidence,
|
||||
DetectMultiple: true,
|
||||
},
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("multilingual_document.pdf", config)
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
|
||||
fmt.Printf("Detected languages: %v\n", result.DetectedLanguages)
|
||||
// Output: [eng fra deu]
|
||||
}
|
||||
```
|
||||
115
docs/snippets/go/metadata/metadata.md
Normal file
115
docs/snippets/go/metadata/metadata.md
Normal file
@@ -0,0 +1,115 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
"strings"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", nil)
|
||||
if err != nil {
|
||||
log.Fatalf("extract pdf: %v", err)
|
||||
}
|
||||
|
||||
// Access PDF metadata
|
||||
if pdf, ok := result.Metadata.PdfMetadata(); ok {
|
||||
if pdf.PageCount != nil {
|
||||
fmt.Printf("Pages: %d\n", *pdf.PageCount)
|
||||
}
|
||||
if pdf.Author != nil {
|
||||
fmt.Printf("Author: %s\n", *pdf.Author)
|
||||
}
|
||||
if pdf.Title != nil {
|
||||
fmt.Printf("Title: %s\n", *pdf.Title)
|
||||
}
|
||||
}
|
||||
|
||||
// Access HTML metadata
|
||||
htmlResult, err := kreuzberg.ExtractFileSync("page.html", nil)
|
||||
if err != nil {
|
||||
log.Fatalf("extract html: %v", err)
|
||||
}
|
||||
if html, ok := htmlResult.Metadata.HTMLMetadata(); ok {
|
||||
if html.Title != nil {
|
||||
fmt.Printf("Title: %s\n", *html.Title)
|
||||
}
|
||||
if html.Description != nil {
|
||||
fmt.Printf("Description: %s\n", *html.Description)
|
||||
}
|
||||
|
||||
// Access keywords as array
|
||||
if len(html.Keywords) > 0 {
|
||||
fmt.Printf("Keywords: %s\n", strings.Join(html.Keywords, ", "))
|
||||
}
|
||||
|
||||
// Access canonical URL (renamed from canonical)
|
||||
if html.CanonicalURL != nil {
|
||||
fmt.Printf("Canonical URL: %s\n", *html.CanonicalURL)
|
||||
}
|
||||
|
||||
// Access Open Graph fields from map
|
||||
if len(html.OpenGraph) > 0 {
|
||||
if image, ok := html.OpenGraph["image"]; ok {
|
||||
fmt.Printf("Open Graph Image: %s\n", image)
|
||||
}
|
||||
if ogTitle, ok := html.OpenGraph["title"]; ok {
|
||||
fmt.Printf("Open Graph Title: %s\n", ogTitle)
|
||||
}
|
||||
if ogType, ok := html.OpenGraph["type"]; ok {
|
||||
fmt.Printf("Open Graph Type: %s\n", ogType)
|
||||
}
|
||||
}
|
||||
|
||||
// Access Twitter Card fields from map
|
||||
if len(html.TwitterCard) > 0 {
|
||||
if card, ok := html.TwitterCard["card"]; ok {
|
||||
fmt.Printf("Twitter Card Type: %s\n", card)
|
||||
}
|
||||
if creator, ok := html.TwitterCard["creator"]; ok {
|
||||
fmt.Printf("Twitter Creator: %s\n", creator)
|
||||
}
|
||||
}
|
||||
|
||||
// Access new fields
|
||||
if html.Language != nil {
|
||||
fmt.Printf("Language: %s\n", *html.Language)
|
||||
}
|
||||
|
||||
if html.TextDirection != nil {
|
||||
fmt.Printf("Text Direction: %s\n", *html.TextDirection)
|
||||
}
|
||||
|
||||
// Access headers
|
||||
if len(html.Headers) > 0 {
|
||||
headers := make([]string, len(html.Headers))
|
||||
for i, h := range html.Headers {
|
||||
headers[i] = h.Text
|
||||
}
|
||||
fmt.Printf("Headers: %s\n", strings.Join(headers, ", "))
|
||||
}
|
||||
|
||||
// Access links
|
||||
if len(html.Links) > 0 {
|
||||
for _, link := range html.Links {
|
||||
fmt.Printf("Link: %s (%s)\n", link.Href, link.Text)
|
||||
}
|
||||
}
|
||||
|
||||
// Access images
|
||||
if len(html.Images) > 0 {
|
||||
for _, image := range html.Images {
|
||||
fmt.Printf("Image: %s\n", image.Src)
|
||||
}
|
||||
}
|
||||
|
||||
// Access structured data
|
||||
if len(html.StructuredData) > 0 {
|
||||
fmt.Printf("Structured data items: %d\n", len(html.StructuredData))
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
37
docs/snippets/go/metadata/page_boundaries.md
Normal file
37
docs/snippets/go/metadata/page_boundaries.md
Normal file
@@ -0,0 +1,37 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", nil)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
if result.Metadata.Pages == nil || result.Metadata.Pages.Boundaries == nil {
|
||||
return
|
||||
}
|
||||
|
||||
contentBytes := []byte(result.Content)
|
||||
for i, boundary := range result.Metadata.Pages.Boundaries {
|
||||
if i >= 3 {
|
||||
break
|
||||
}
|
||||
pageText := string(contentBytes[boundary.ByteStart:boundary.ByteEnd])
|
||||
preview := pageText
|
||||
if len(preview) > 100 {
|
||||
preview = preview[:100]
|
||||
}
|
||||
|
||||
fmt.Printf("Page %d:\n", boundary.PageNumber)
|
||||
fmt.Printf(" Byte range: %d-%d\n", boundary.ByteStart, boundary.ByteEnd)
|
||||
fmt.Printf(" Preview: %s...\n", preview)
|
||||
}
|
||||
}
|
||||
```
|
||||
29
docs/snippets/go/metadata/page_tracking_basic.md
Normal file
29
docs/snippets/go/metadata/page_tracking_basic.md
Normal file
@@ -0,0 +1,29 @@
|
||||
Package main
|
||||
|
||||
Import (
|
||||
"fmt"
|
||||
"Kreuzberg"
|
||||
)
|
||||
|
||||
Func main() {
|
||||
config := &kreuzberg.ExtractionConfig{
|
||||
Pages: &kreuzberg.PageConfig{
|
||||
ExtractPages: true,
|
||||
},
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", config)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
if result.Pages != nil {
|
||||
for _, page := range result.Pages {
|
||||
fmt.Printf("Page %d:\n", page.PageNumber)
|
||||
fmt.Printf(" Content: %d chars\n", len(page.Content))
|
||||
fmt.Printf(" Tables: %d\n", len(page.Tables))
|
||||
fmt.Printf(" Images: %d\n", len(page.Images))
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
28
docs/snippets/go/metadata/tables.md
Normal file
28
docs/snippets/go/metadata/tables.md
Normal file
@@ -0,0 +1,28 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", nil)
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
|
||||
// Iterate over tables
|
||||
for _, table := range result.Tables {
|
||||
fmt.Printf("Table with %d rows\n", len(table.Cells))
|
||||
fmt.Println(table.Markdown) // Markdown representation
|
||||
|
||||
// Access cells
|
||||
for _, row := range table.Cells {
|
||||
fmt.Println(row)
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
39
docs/snippets/go/metadata/vector_database_integration.md
Normal file
39
docs/snippets/go/metadata/vector_database_integration.md
Normal file
@@ -0,0 +1,39 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
maxChars := 512
|
||||
maxOverlap := 50
|
||||
config := &kreuzberg.ExtractionConfig{
|
||||
Chunking: &kreuzberg.ChunkingConfig{
|
||||
MaxChars: &maxChars,
|
||||
MaxOverlap: &maxOverlap,
|
||||
Embedding: &kreuzberg.EmbeddingConfig{
|
||||
Model: "balanced",
|
||||
Normalize: true,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", config)
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
|
||||
if result.Chunks != nil {
|
||||
for i, chunk := range result.Chunks {
|
||||
if chunk.Embedding != nil {
|
||||
fmt.Printf("Chunk %d: %d dimensions\n", i, len(chunk.Embedding))
|
||||
// Store in vector database
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
Reference in New Issue
Block a user