This commit is contained in:
38
docs/snippets/go/utils/chunking.md
Normal file
38
docs/snippets/go/utils/chunking.md
Normal file
@@ -0,0 +1,38 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
maxChars := 1000
|
||||
maxOverlap := 200
|
||||
config := &kreuzberg.ExtractionConfig{
|
||||
Chunking: &kreuzberg.ChunkingConfig{
|
||||
MaxChars: &maxChars,
|
||||
MaxOverlap: &maxOverlap,
|
||||
},
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", config)
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
|
||||
for i, chunk := range result.Chunks {
|
||||
fmt.Printf("Chunk %d/%d (%d-%d)\n", i+1, chunk.Metadata.TotalChunks, chunk.Metadata.CharStart, chunk.Metadata.CharEnd)
|
||||
fmt.Printf("%s...\n", chunk.Content[:min(len(chunk.Content), 100)])
|
||||
}
|
||||
}
|
||||
|
||||
// min returns the smaller of the two integers a and b.
func min(a, b int) int {
	if b <= a {
		return b
	}
	return a
}
|
||||
```
|
||||
45
docs/snippets/go/utils/chunking_rag.md
Normal file
45
docs/snippets/go/utils/chunking_rag.md
Normal file
@@ -0,0 +1,45 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
maxChars := 500
|
||||
maxOverlap := 50
|
||||
config := &kreuzberg.ExtractionConfig{
|
||||
Chunking: &kreuzberg.ChunkingConfig{
|
||||
MaxChars: &maxChars,
|
||||
MaxOverlap: &maxOverlap,
|
||||
Embedding: &kreuzberg.EmbeddingConfig{
|
||||
Model: "balanced",
|
||||
Normalize: true,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("research_paper.pdf", config)
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
|
||||
for i, chunk := range result.Chunks {
|
||||
fmt.Printf("Chunk %d/%d (%d-%d)\n", i+1, chunk.Metadata.TotalChunks, chunk.Metadata.CharStart, chunk.Metadata.CharEnd)
|
||||
fmt.Printf("Content: %s...\n", chunk.Content[:min(len(chunk.Content), 100)])
|
||||
if chunk.Embedding != nil {
|
||||
fmt.Printf("Embedding: %d dimensions\n", len(chunk.Embedding))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// min reports whichever of a and b is smaller.
func min(a, b int) int {
	smaller := b
	if a < b {
		smaller = a
	}
	return smaller
}
|
||||
```
|
||||
32
docs/snippets/go/utils/embedding_with_chunking.md
Normal file
32
docs/snippets/go/utils/embedding_with_chunking.md
Normal file
@@ -0,0 +1,32 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
maxChars := 1024
|
||||
maxOverlap := 100
|
||||
batchSize := int32(32)
|
||||
config := &kreuzberg.ExtractionConfig{
|
||||
Chunking: &kreuzberg.ChunkingConfig{
|
||||
MaxChars: &maxChars,
|
||||
MaxOverlap: &maxOverlap,
|
||||
Embedding: &kreuzberg.EmbeddingConfig{
|
||||
Model: "balanced",
|
||||
Normalize: true,
|
||||
BatchSize: &batchSize,
|
||||
ShowDownloadProgress: false,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
fmt.Printf("Config: MaxChars=%d, MaxOverlap=%d, Model=%s\n",
|
||||
*config.Chunking.MaxChars,
|
||||
*config.Chunking.MaxOverlap,
|
||||
config.Chunking.Embedding.Model)
|
||||
}
|
||||
```
|
||||
29
docs/snippets/go/utils/keyword_extraction_example.md
Normal file
29
docs/snippets/go/utils/keyword_extraction_example.md
Normal file
@@ -0,0 +1,29 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
config := &kreuzberg.ExtractionConfig{
|
||||
Keywords: &kreuzberg.KeywordConfig{
|
||||
Algorithm: "YAKE",
|
||||
MaxKeywords: 10,
|
||||
MinScore: 0.3,
|
||||
},
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("research_paper.pdf", config)
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
|
||||
if keywords, ok := result.Metadata.Additional["keywords"]; ok {
|
||||
fmt.Printf("Keywords: %v\n", keywords)
|
||||
}
|
||||
}
|
||||
```
|
||||
33
docs/snippets/go/utils/quality_processing_example.md
Normal file
33
docs/snippets/go/utils/quality_processing_example.md
Normal file
@@ -0,0 +1,33 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
config := &kreuzberg.ExtractionConfig{
|
||||
EnableQualityProcessing: true,
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("scanned_document.pdf", config)
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
|
||||
qualityScore := 0.0
|
||||
if result.QualityScore != nil {
|
||||
qualityScore = *result.QualityScore
|
||||
}
|
||||
|
||||
if qualityScore < 0.5 {
|
||||
fmt.Printf("Warning: Low quality extraction (%.2f)\n", qualityScore)
|
||||
fmt.Println("Consider re-scanning with higher DPI or adjusting OCR settings")
|
||||
} else {
|
||||
fmt.Printf("Quality score: %.2f\n", qualityScore)
|
||||
}
|
||||
}
|
||||
```
|
||||
36
docs/snippets/go/utils/standalone_embed.md
Normal file
36
docs/snippets/go/utils/standalone_embed.md
Normal file
@@ -0,0 +1,36 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
preset := "balanced"
|
||||
normalize := true
|
||||
config := kreuzberg.EmbeddingConfig{
|
||||
Model: kreuzberg.EmbeddingModelType{
|
||||
Type: "preset",
|
||||
Name: &preset,
|
||||
},
|
||||
Normalize: &normalize,
|
||||
}
|
||||
|
||||
// Synchronous
|
||||
embeddings, err := kreuzberg.EmbedTexts([]string{"Hello, world!", "Kreuzberg is fast"}, config)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
fmt.Println(len(embeddings)) // 2
|
||||
fmt.Println(len(embeddings[0])) // 768
|
||||
|
||||
// Asynchronous
|
||||
embeddings, err = kreuzberg.EmbedTextsAsync([]string{"Hello, world!"}, config)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
fmt.Println(len(embeddings[0])) // 768
|
||||
}
|
||||
```
|
||||
24
docs/snippets/go/utils/token_reduction.md
Normal file
24
docs/snippets/go/utils/token_reduction.md
Normal file
@@ -0,0 +1,24 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
preserve := true
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", &kreuzberg.ExtractionConfig{
|
||||
TokenReduction: &kreuzberg.TokenReductionConfig{
|
||||
Mode: "moderate",
|
||||
PreserveImportantWords: &preserve,
|
||||
},
|
||||
})
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
|
||||
log.Println("content length:", len(result.Content))
|
||||
}
|
||||
```
|
||||
28
docs/snippets/go/utils/token_reduction_example.md
Normal file
28
docs/snippets/go/utils/token_reduction_example.md
Normal file
@@ -0,0 +1,28 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
config := &kreuzberg.ExtractionConfig{
|
||||
TokenReduction: &kreuzberg.TokenReductionConfig{
|
||||
Mode: "moderate",
|
||||
PreserveMarkdown: true,
|
||||
},
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("verbose_document.pdf", config)
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
|
||||
fmt.Printf("Original tokens: %v\n", result.Metadata.Additional["original_token_count"])
|
||||
fmt.Printf("Reduced tokens: %v\n", result.Metadata.Additional["token_count"])
|
||||
fmt.Printf("Reduction ratio: %v\n", result.Metadata.Additional["token_reduction_ratio"])
|
||||
}
|
||||
```
|
||||
Reference in New Issue
Block a user