This commit is contained in:
44
docs/snippets/go/advanced/chunk_page_mapping.md
Normal file
44
docs/snippets/go/advanced/chunk_page_mapping.md
Normal file
@@ -0,0 +1,44 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
maxChars := uint(500)
|
||||
overlap := uint(50)
|
||||
config := &kreuzberg.ExtractionConfig{
|
||||
Chunking: &kreuzberg.ChunkingConfig{
|
||||
MaxCharacters: &maxChars,
|
||||
Overlap: &overlap,
|
||||
},
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", config)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
for _, chunk := range result.Chunks {
|
||||
first := chunk.Metadata.FirstPage
|
||||
last := chunk.Metadata.LastPage
|
||||
if first == nil {
|
||||
continue
|
||||
}
|
||||
pageRange := fmt.Sprintf("Page %d", *first)
|
||||
if last != nil && *first != *last {
|
||||
pageRange = fmt.Sprintf("Pages %d-%d", *first, *last)
|
||||
}
|
||||
|
||||
preview := chunk.Content
|
||||
if len(preview) > 50 {
|
||||
preview = preview[:50]
|
||||
}
|
||||
fmt.Printf("Chunk: %s... (%s)\n", preview, pageRange)
|
||||
}
|
||||
}
|
||||
```
|
||||
27
docs/snippets/go/advanced/chunking_config.md
Normal file
27
docs/snippets/go/advanced/chunking_config.md
Normal file
@@ -0,0 +1,27 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
maxChars := 1000
|
||||
maxOverlap := 200
|
||||
normalize := true
|
||||
batchSize := int32(32)
|
||||
|
||||
config := &kreuzberg.ExtractionConfig{
|
||||
Chunking: &kreuzberg.ChunkingConfig{
|
||||
MaxChars: &maxChars,
|
||||
MaxOverlap: &maxOverlap,
|
||||
Embedding: &kreuzberg.EmbeddingConfig{
|
||||
Model: kreuzberg.EmbeddingModelType_Preset("all-minilm-l6-v2"),
|
||||
Normalize: &normalize,
|
||||
BatchSize: &batchSize,
|
||||
},
|
||||
},
|
||||
}
|
||||
_ = config
|
||||
}
|
||||
```
|
||||
46
docs/snippets/go/advanced/chunking_rag.md
Normal file
46
docs/snippets/go/advanced/chunking_rag.md
Normal file
@@ -0,0 +1,46 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
maxChars := 500
|
||||
maxOverlap := 50
|
||||
normalize := true
|
||||
batchSize := int32(16)
|
||||
|
||||
config := &kreuzberg.ExtractionConfig{
|
||||
Chunking: &kreuzberg.ChunkingConfig{
|
||||
MaxChars: &maxChars,
|
||||
MaxOverlap: &maxOverlap,
|
||||
Embedding: &kreuzberg.EmbeddingConfig{
|
||||
Model: kreuzberg.EmbeddingModelType_Preset("all-mpnet-base-v2"),
|
||||
Normalize: &normalize,
|
||||
BatchSize: &batchSize,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("research_paper.pdf", config)
|
||||
if err != nil {
|
||||
log.Fatalf("RAG extraction failed: %v", err)
|
||||
}
|
||||
|
||||
chunks := result.Chunks
|
||||
fmt.Printf("Found %d chunks for RAG pipeline\n", len(chunks))
|
||||
|
||||
for i := 0; i < len(chunks) && i < 3; i++ {
|
||||
chunk := chunks[i]
|
||||
content := chunk.Content
|
||||
if len(content) > 80 {
|
||||
content = content[:80]
|
||||
}
|
||||
fmt.Printf("Chunk %d: %s...\n", i, content)
|
||||
}
|
||||
}
|
||||
```
|
||||
49
docs/snippets/go/advanced/embedding_with_chunking.md
Normal file
49
docs/snippets/go/advanced/embedding_with_chunking.md
Normal file
@@ -0,0 +1,49 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
maxChars := 512
|
||||
maxOverlap := 50
|
||||
normalize := true
|
||||
batchSize := int32(32)
|
||||
showProgress := false
|
||||
|
||||
config := &kreuzberg.ExtractionConfig{
|
||||
Chunking: &kreuzberg.ChunkingConfig{
|
||||
MaxChars: &maxChars,
|
||||
MaxOverlap: &maxOverlap,
|
||||
Embedding: &kreuzberg.EmbeddingConfig{
|
||||
Model: kreuzberg.EmbeddingModelType_Preset("balanced"),
|
||||
Normalize: &normalize,
|
||||
BatchSize: &batchSize,
|
||||
ShowDownloadProgress: &showProgress,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", config)
|
||||
if err != nil {
|
||||
fmt.Printf("Error: %v\n", err)
|
||||
return
|
||||
}
|
||||
|
||||
for index, chunk := range result.Chunks {
|
||||
chunkID := fmt.Sprintf("doc_chunk_%d", index)
|
||||
content := chunk.Content
|
||||
if len(content) > 50 {
|
||||
content = content[:50]
|
||||
}
|
||||
fmt.Printf("Chunk %s: %s\n", chunkID, content)
|
||||
|
||||
if chunk.Embedding != nil && len(chunk.Embedding) > 0 {
|
||||
fmt.Printf(" Embedding dimensions: %d\n", len(chunk.Embedding))
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
23
docs/snippets/go/advanced/keyword_extraction_config.md
Normal file
23
docs/snippets/go/advanced/keyword_extraction_config.md
Normal file
@@ -0,0 +1,23 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
maxKeywords := int32(10)
|
||||
minScore := 0.3
|
||||
language := "en"
|
||||
|
||||
config := &kreuzberg.ExtractionConfig{
|
||||
Keywords: &kreuzberg.KeywordConfig{
|
||||
Algorithm: kreuzberg.KeywordAlgorithm_YAKE,
|
||||
MaxKeywords: &maxKeywords,
|
||||
MinScore: &minScore,
|
||||
Language: &language,
|
||||
},
|
||||
}
|
||||
_ = config
|
||||
}
|
||||
```
|
||||
37
docs/snippets/go/advanced/keyword_extraction_example.md
Normal file
37
docs/snippets/go/advanced/keyword_extraction_example.md
Normal file
@@ -0,0 +1,37 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
maxKeywords := int32(10)
|
||||
minScore := 0.3
|
||||
|
||||
config := &kreuzberg.ExtractionConfig{
|
||||
Keywords: &kreuzberg.KeywordConfig{
|
||||
Algorithm: kreuzberg.KeywordAlgorithm_YAKE,
|
||||
MaxKeywords: &maxKeywords,
|
||||
MinScore: &minScore,
|
||||
},
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("research_paper.pdf", config)
|
||||
if err != nil {
|
||||
log.Fatalf("extraction failed: %v", err)
|
||||
}
|
||||
|
||||
if keywords, ok := result.Metadata["keywords"]; ok {
|
||||
keywordList := keywords.([]map[string]interface{})
|
||||
for _, kw := range keywordList {
|
||||
text := kw["text"].(string)
|
||||
score := kw["score"].(float64)
|
||||
fmt.Printf("%s: %.3f\n", text, score)
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
22
docs/snippets/go/advanced/language_detection_config.md
Normal file
22
docs/snippets/go/advanced/language_detection_config.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
enabled := true
|
||||
detectMultiple := false
|
||||
minConfidence := 0.8
|
||||
|
||||
config := &kreuzberg.ExtractionConfig{
|
||||
LanguageDetection: &kreuzberg.LanguageDetectionConfig{
|
||||
Enabled: &enabled,
|
||||
MinConfidence: &minConfidence,
|
||||
DetectMultiple: &detectMultiple,
|
||||
},
|
||||
}
|
||||
_ = config
|
||||
}
|
||||
```
|
||||
40
docs/snippets/go/advanced/language_detection_multilingual.md
Normal file
40
docs/snippets/go/advanced/language_detection_multilingual.md
Normal file
@@ -0,0 +1,40 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
"strings"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
enabled := true
|
||||
detectMultiple := true
|
||||
minConfidence := 0.8
|
||||
|
||||
config := &kreuzberg.ExtractionConfig{
|
||||
LanguageDetection: &kreuzberg.LanguageDetectionConfig{
|
||||
Enabled: &enabled,
|
||||
MinConfidence: &minConfidence,
|
||||
DetectMultiple: &detectMultiple,
|
||||
},
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("multilingual_document.pdf", config)
|
||||
if err != nil {
|
||||
log.Fatalf("Processing failed: %v", err)
|
||||
}
|
||||
|
||||
languages := result.DetectedLanguages
|
||||
if len(languages) > 0 {
|
||||
fmt.Printf("Detected %d language(s): %s\n", len(languages), strings.Join(languages, ", "))
|
||||
} else {
|
||||
fmt.Println("No languages detected")
|
||||
}
|
||||
|
||||
fmt.Printf("Total content: %d characters\n", len(result.Content))
|
||||
fmt.Printf("MIME type: %s\n", result.MimeType)
|
||||
}
|
||||
```
|
||||
16
docs/snippets/go/advanced/quality_processing_config.md
Normal file
16
docs/snippets/go/advanced/quality_processing_config.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
enableQualityProcessing := true
|
||||
|
||||
config := &kreuzberg.ExtractionConfig{
|
||||
EnableQualityProcessing: &enableQualityProcessing,
|
||||
}
|
||||
_ = config
|
||||
}
|
||||
```
|
||||
35
docs/snippets/go/advanced/quality_processing_example.md
Normal file
35
docs/snippets/go/advanced/quality_processing_example.md
Normal file
@@ -0,0 +1,35 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
enableQualityProcessing := true
|
||||
|
||||
config := &kreuzberg.ExtractionConfig{
|
||||
EnableQualityProcessing: &enableQualityProcessing,
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("scanned_document.pdf", config)
|
||||
if err != nil {
|
||||
log.Fatalf("extraction failed: %v", err)
|
||||
}
|
||||
|
||||
qualityScore := 0.0
|
||||
if result.QualityScore != nil {
|
||||
qualityScore = *result.QualityScore
|
||||
}
|
||||
|
||||
if qualityScore < 0.5 {
|
||||
fmt.Printf("Warning: Low quality extraction (%.2f)\n", qualityScore)
|
||||
fmt.Println("Consider re-scanning with higher DPI or adjusting OCR settings")
|
||||
} else {
|
||||
fmt.Printf("Quality score: %.2f\n", qualityScore)
|
||||
}
|
||||
}
|
||||
```
|
||||
24
docs/snippets/go/advanced/token_reduction_config.md
Normal file
24
docs/snippets/go/advanced/token_reduction_config.md
Normal file
@@ -0,0 +1,24 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
preserveMarkdown := true
|
||||
preserveCode := true
|
||||
mode := "moderate"
|
||||
languageHint := "eng"
|
||||
|
||||
config := &kreuzberg.ExtractionConfig{
|
||||
TokenReduction: &kreuzberg.TokenReductionConfig{
|
||||
Mode: &mode,
|
||||
PreserveMarkdown: &preserveMarkdown,
|
||||
PreserveCode: &preserveCode,
|
||||
LanguageHint: &languageHint,
|
||||
},
|
||||
}
|
||||
_ = config
|
||||
}
|
||||
```
|
||||
46
docs/snippets/go/advanced/token_reduction_example.md
Normal file
46
docs/snippets/go/advanced/token_reduction_example.md
Normal file
@@ -0,0 +1,46 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
preserveMarkdown := true
|
||||
mode := "moderate"
|
||||
|
||||
config := &kreuzberg.ExtractionConfig{
|
||||
TokenReduction: &kreuzberg.TokenReductionConfig{
|
||||
Mode: &mode,
|
||||
PreserveMarkdown: &preserveMarkdown,
|
||||
},
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("verbose_document.pdf", config)
|
||||
if err != nil {
|
||||
log.Fatalf("extraction failed: %v", err)
|
||||
}
|
||||
|
||||
original := 0
|
||||
reduced := 0
|
||||
ratio := 0.0
|
||||
|
||||
if val, ok := result.Metadata["original_token_count"]; ok {
|
||||
original = val.(int)
|
||||
}
|
||||
|
||||
if val, ok := result.Metadata["token_count"]; ok {
|
||||
reduced = val.(int)
|
||||
}
|
||||
|
||||
if val, ok := result.Metadata["token_reduction_ratio"]; ok {
|
||||
ratio = val.(float64)
|
||||
}
|
||||
|
||||
fmt.Printf("Reduced from %d to %d tokens\n", original, reduced)
|
||||
fmt.Printf("Reduction: %.1f%%\n", ratio*100)
|
||||
}
|
||||
```
|
||||
67
docs/snippets/go/advanced/vector_database_integration.md
Normal file
67
docs/snippets/go/advanced/vector_database_integration.md
Normal file
@@ -0,0 +1,67 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
// VectorRecord bundles one chunk's text and embedding with an ID and
// string-valued metadata.
type VectorRecord struct {
	ID        string            // record identifier
	Embedding []float32         // embedding vector for Content
	Content   string            // chunk text
	Metadata  map[string]string // additional key/value attributes
}
|
||||
|
||||
func extractAndVectorize(documentPath string, documentID string) ([]VectorRecord, error) {
|
||||
maxChars := 512
|
||||
maxOverlap := 50
|
||||
normalize := true
|
||||
batchSize := int32(32)
|
||||
|
||||
config := &kreuzberg.ExtractionConfig{
|
||||
Chunking: &kreuzberg.ChunkingConfig{
|
||||
MaxChars: &maxChars,
|
||||
MaxOverlap: &maxOverlap,
|
||||
Embedding: &kreuzberg.EmbeddingConfig{
|
||||
Model: kreuzberg.EmbeddingModelType_Preset("balanced"),
|
||||
Normalize: &normalize,
|
||||
BatchSize: &batchSize,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync(documentPath, config)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var vectorRecords []VectorRecord
|
||||
for index, chunk := range result.Chunks {
|
||||
record := VectorRecord{
|
||||
ID: fmt.Sprintf("%s_chunk_%d", documentID, index),
|
||||
Content: chunk.Content,
|
||||
Embedding: chunk.Embedding,
|
||||
Metadata: map[string]string{
|
||||
"document_id": documentID,
|
||||
"chunk_index": fmt.Sprintf("%d", index),
|
||||
"content_length": fmt.Sprintf("%d", len(chunk.Content)),
|
||||
},
|
||||
}
|
||||
vectorRecords = append(vectorRecords, record)
|
||||
}
|
||||
|
||||
storeInVectorDatabase(vectorRecords)
|
||||
return vectorRecords, nil
|
||||
}
|
||||
|
||||
func storeInVectorDatabase(records []VectorRecord) {
|
||||
for _, record := range records {
|
||||
if len(record.Embedding) > 0 {
|
||||
fmt.Printf("Storing %s: %d chars, %d dims\n",
|
||||
record.ID, len(record.Content), len(record.Embedding))
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
Reference in New Issue
Block a user