This commit is contained in:
44
docs/snippets/go/advanced/chunk_page_mapping.md
Normal file
44
docs/snippets/go/advanced/chunk_page_mapping.md
Normal file
@@ -0,0 +1,44 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
maxChars := uint(500)
|
||||
overlap := uint(50)
|
||||
config := &kreuzberg.ExtractionConfig{
|
||||
Chunking: &kreuzberg.ChunkingConfig{
|
||||
MaxCharacters: &maxChars,
|
||||
Overlap: &overlap,
|
||||
},
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", config)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
for _, chunk := range result.Chunks {
|
||||
first := chunk.Metadata.FirstPage
|
||||
last := chunk.Metadata.LastPage
|
||||
if first == nil {
|
||||
continue
|
||||
}
|
||||
pageRange := fmt.Sprintf("Page %d", *first)
|
||||
if last != nil && *first != *last {
|
||||
pageRange = fmt.Sprintf("Pages %d-%d", *first, *last)
|
||||
}
|
||||
|
||||
preview := chunk.Content
|
||||
if len(preview) > 50 {
|
||||
preview = preview[:50]
|
||||
}
|
||||
fmt.Printf("Chunk: %s... (%s)\n", preview, pageRange)
|
||||
}
|
||||
}
|
||||
```
|
||||
27
docs/snippets/go/advanced/chunking_config.md
Normal file
27
docs/snippets/go/advanced/chunking_config.md
Normal file
@@ -0,0 +1,27 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
maxChars := 1000
|
||||
maxOverlap := 200
|
||||
normalize := true
|
||||
batchSize := int32(32)
|
||||
|
||||
config := &kreuzberg.ExtractionConfig{
|
||||
Chunking: &kreuzberg.ChunkingConfig{
|
||||
MaxChars: &maxChars,
|
||||
MaxOverlap: &maxOverlap,
|
||||
Embedding: &kreuzberg.EmbeddingConfig{
|
||||
Model: kreuzberg.EmbeddingModelType_Preset("all-minilm-l6-v2"),
|
||||
Normalize: &normalize,
|
||||
BatchSize: &batchSize,
|
||||
},
|
||||
},
|
||||
}
|
||||
_ = config
|
||||
}
|
||||
```
|
||||
46
docs/snippets/go/advanced/chunking_rag.md
Normal file
46
docs/snippets/go/advanced/chunking_rag.md
Normal file
@@ -0,0 +1,46 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
maxChars := 500
|
||||
maxOverlap := 50
|
||||
normalize := true
|
||||
batchSize := int32(16)
|
||||
|
||||
config := &kreuzberg.ExtractionConfig{
|
||||
Chunking: &kreuzberg.ChunkingConfig{
|
||||
MaxChars: &maxChars,
|
||||
MaxOverlap: &maxOverlap,
|
||||
Embedding: &kreuzberg.EmbeddingConfig{
|
||||
Model: kreuzberg.EmbeddingModelType_Preset("all-mpnet-base-v2"),
|
||||
Normalize: &normalize,
|
||||
BatchSize: &batchSize,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("research_paper.pdf", config)
|
||||
if err != nil {
|
||||
log.Fatalf("RAG extraction failed: %v", err)
|
||||
}
|
||||
|
||||
chunks := result.Chunks
|
||||
fmt.Printf("Found %d chunks for RAG pipeline\n", len(chunks))
|
||||
|
||||
for i := 0; i < len(chunks) && i < 3; i++ {
|
||||
chunk := chunks[i]
|
||||
content := chunk.Content
|
||||
if len(content) > 80 {
|
||||
content = content[:80]
|
||||
}
|
||||
fmt.Printf("Chunk %d: %s...\n", i, content)
|
||||
}
|
||||
}
|
||||
```
|
||||
49
docs/snippets/go/advanced/embedding_with_chunking.md
Normal file
49
docs/snippets/go/advanced/embedding_with_chunking.md
Normal file
@@ -0,0 +1,49 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
maxChars := 512
|
||||
maxOverlap := 50
|
||||
normalize := true
|
||||
batchSize := int32(32)
|
||||
showProgress := false
|
||||
|
||||
config := &kreuzberg.ExtractionConfig{
|
||||
Chunking: &kreuzberg.ChunkingConfig{
|
||||
MaxChars: &maxChars,
|
||||
MaxOverlap: &maxOverlap,
|
||||
Embedding: &kreuzberg.EmbeddingConfig{
|
||||
Model: kreuzberg.EmbeddingModelType_Preset("balanced"),
|
||||
Normalize: &normalize,
|
||||
BatchSize: &batchSize,
|
||||
ShowDownloadProgress: &showProgress,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", config)
|
||||
if err != nil {
|
||||
fmt.Printf("Error: %v\n", err)
|
||||
return
|
||||
}
|
||||
|
||||
for index, chunk := range result.Chunks {
|
||||
chunkID := fmt.Sprintf("doc_chunk_%d", index)
|
||||
content := chunk.Content
|
||||
if len(content) > 50 {
|
||||
content = content[:50]
|
||||
}
|
||||
fmt.Printf("Chunk %s: %s\n", chunkID, content)
|
||||
|
||||
if chunk.Embedding != nil && len(chunk.Embedding) > 0 {
|
||||
fmt.Printf(" Embedding dimensions: %d\n", len(chunk.Embedding))
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
23
docs/snippets/go/advanced/keyword_extraction_config.md
Normal file
23
docs/snippets/go/advanced/keyword_extraction_config.md
Normal file
@@ -0,0 +1,23 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
maxKeywords := int32(10)
|
||||
minScore := 0.3
|
||||
language := "en"
|
||||
|
||||
config := &kreuzberg.ExtractionConfig{
|
||||
Keywords: &kreuzberg.KeywordConfig{
|
||||
Algorithm: kreuzberg.KeywordAlgorithm_YAKE,
|
||||
MaxKeywords: &maxKeywords,
|
||||
MinScore: &minScore,
|
||||
Language: &language,
|
||||
},
|
||||
}
|
||||
_ = config
|
||||
}
|
||||
```
|
||||
37
docs/snippets/go/advanced/keyword_extraction_example.md
Normal file
37
docs/snippets/go/advanced/keyword_extraction_example.md
Normal file
@@ -0,0 +1,37 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
maxKeywords := int32(10)
|
||||
minScore := 0.3
|
||||
|
||||
config := &kreuzberg.ExtractionConfig{
|
||||
Keywords: &kreuzberg.KeywordConfig{
|
||||
Algorithm: kreuzberg.KeywordAlgorithm_YAKE,
|
||||
MaxKeywords: &maxKeywords,
|
||||
MinScore: &minScore,
|
||||
},
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("research_paper.pdf", config)
|
||||
if err != nil {
|
||||
log.Fatalf("extraction failed: %v", err)
|
||||
}
|
||||
|
||||
if keywords, ok := result.Metadata["keywords"]; ok {
|
||||
keywordList := keywords.([]map[string]interface{})
|
||||
for _, kw := range keywordList {
|
||||
text := kw["text"].(string)
|
||||
score := kw["score"].(float64)
|
||||
fmt.Printf("%s: %.3f\n", text, score)
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
22
docs/snippets/go/advanced/language_detection_config.md
Normal file
22
docs/snippets/go/advanced/language_detection_config.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
enabled := true
|
||||
detectMultiple := false
|
||||
minConfidence := 0.8
|
||||
|
||||
config := &kreuzberg.ExtractionConfig{
|
||||
LanguageDetection: &kreuzberg.LanguageDetectionConfig{
|
||||
Enabled: &enabled,
|
||||
MinConfidence: &minConfidence,
|
||||
DetectMultiple: &detectMultiple,
|
||||
},
|
||||
}
|
||||
_ = config
|
||||
}
|
||||
```
|
||||
40
docs/snippets/go/advanced/language_detection_multilingual.md
Normal file
40
docs/snippets/go/advanced/language_detection_multilingual.md
Normal file
@@ -0,0 +1,40 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
"strings"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
enabled := true
|
||||
detectMultiple := true
|
||||
minConfidence := 0.8
|
||||
|
||||
config := &kreuzberg.ExtractionConfig{
|
||||
LanguageDetection: &kreuzberg.LanguageDetectionConfig{
|
||||
Enabled: &enabled,
|
||||
MinConfidence: &minConfidence,
|
||||
DetectMultiple: &detectMultiple,
|
||||
},
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("multilingual_document.pdf", config)
|
||||
if err != nil {
|
||||
log.Fatalf("Processing failed: %v", err)
|
||||
}
|
||||
|
||||
languages := result.DetectedLanguages
|
||||
if len(languages) > 0 {
|
||||
fmt.Printf("Detected %d language(s): %s\n", len(languages), strings.Join(languages, ", "))
|
||||
} else {
|
||||
fmt.Println("No languages detected")
|
||||
}
|
||||
|
||||
fmt.Printf("Total content: %d characters\n", len(result.Content))
|
||||
fmt.Printf("MIME type: %s\n", result.MimeType)
|
||||
}
|
||||
```
|
||||
16
docs/snippets/go/advanced/quality_processing_config.md
Normal file
16
docs/snippets/go/advanced/quality_processing_config.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
enableQualityProcessing := true
|
||||
|
||||
config := &kreuzberg.ExtractionConfig{
|
||||
EnableQualityProcessing: &enableQualityProcessing,
|
||||
}
|
||||
_ = config
|
||||
}
|
||||
```
|
||||
35
docs/snippets/go/advanced/quality_processing_example.md
Normal file
35
docs/snippets/go/advanced/quality_processing_example.md
Normal file
@@ -0,0 +1,35 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
enableQualityProcessing := true
|
||||
|
||||
config := &kreuzberg.ExtractionConfig{
|
||||
EnableQualityProcessing: &enableQualityProcessing,
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("scanned_document.pdf", config)
|
||||
if err != nil {
|
||||
log.Fatalf("extraction failed: %v", err)
|
||||
}
|
||||
|
||||
qualityScore := 0.0
|
||||
if result.QualityScore != nil {
|
||||
qualityScore = *result.QualityScore
|
||||
}
|
||||
|
||||
if qualityScore < 0.5 {
|
||||
fmt.Printf("Warning: Low quality extraction (%.2f)\n", qualityScore)
|
||||
fmt.Println("Consider re-scanning with higher DPI or adjusting OCR settings")
|
||||
} else {
|
||||
fmt.Printf("Quality score: %.2f\n", qualityScore)
|
||||
}
|
||||
}
|
||||
```
|
||||
24
docs/snippets/go/advanced/token_reduction_config.md
Normal file
24
docs/snippets/go/advanced/token_reduction_config.md
Normal file
@@ -0,0 +1,24 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
preserveMarkdown := true
|
||||
preserveCode := true
|
||||
mode := "moderate"
|
||||
languageHint := "eng"
|
||||
|
||||
config := &kreuzberg.ExtractionConfig{
|
||||
TokenReduction: &kreuzberg.TokenReductionConfig{
|
||||
Mode: &mode,
|
||||
PreserveMarkdown: &preserveMarkdown,
|
||||
PreserveCode: &preserveCode,
|
||||
LanguageHint: &languageHint,
|
||||
},
|
||||
}
|
||||
_ = config
|
||||
}
|
||||
```
|
||||
46
docs/snippets/go/advanced/token_reduction_example.md
Normal file
46
docs/snippets/go/advanced/token_reduction_example.md
Normal file
@@ -0,0 +1,46 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
preserveMarkdown := true
|
||||
mode := "moderate"
|
||||
|
||||
config := &kreuzberg.ExtractionConfig{
|
||||
TokenReduction: &kreuzberg.TokenReductionConfig{
|
||||
Mode: &mode,
|
||||
PreserveMarkdown: &preserveMarkdown,
|
||||
},
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("verbose_document.pdf", config)
|
||||
if err != nil {
|
||||
log.Fatalf("extraction failed: %v", err)
|
||||
}
|
||||
|
||||
original := 0
|
||||
reduced := 0
|
||||
ratio := 0.0
|
||||
|
||||
if val, ok := result.Metadata["original_token_count"]; ok {
|
||||
original = val.(int)
|
||||
}
|
||||
|
||||
if val, ok := result.Metadata["token_count"]; ok {
|
||||
reduced = val.(int)
|
||||
}
|
||||
|
||||
if val, ok := result.Metadata["token_reduction_ratio"]; ok {
|
||||
ratio = val.(float64)
|
||||
}
|
||||
|
||||
fmt.Printf("Reduced from %d to %d tokens\n", original, reduced)
|
||||
fmt.Printf("Reduction: %.1f%%\n", ratio*100)
|
||||
}
|
||||
```
|
||||
67
docs/snippets/go/advanced/vector_database_integration.md
Normal file
67
docs/snippets/go/advanced/vector_database_integration.md
Normal file
@@ -0,0 +1,67 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
// VectorRecord is one entry prepared for a vector store: a chunk's text, its
// embedding vector, and string metadata describing where the chunk came from.
type VectorRecord struct {
	// ID identifies the record.
	ID string
	// Embedding is the vector associated with Content.
	Embedding []float32
	// Content is the chunk text.
	Content string
	// Metadata holds additional string key/value attributes for the record.
	Metadata map[string]string
}
|
||||
|
||||
func extractAndVectorize(documentPath string, documentID string) ([]VectorRecord, error) {
|
||||
maxChars := 512
|
||||
maxOverlap := 50
|
||||
normalize := true
|
||||
batchSize := int32(32)
|
||||
|
||||
config := &kreuzberg.ExtractionConfig{
|
||||
Chunking: &kreuzberg.ChunkingConfig{
|
||||
MaxChars: &maxChars,
|
||||
MaxOverlap: &maxOverlap,
|
||||
Embedding: &kreuzberg.EmbeddingConfig{
|
||||
Model: kreuzberg.EmbeddingModelType_Preset("balanced"),
|
||||
Normalize: &normalize,
|
||||
BatchSize: &batchSize,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync(documentPath, config)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var vectorRecords []VectorRecord
|
||||
for index, chunk := range result.Chunks {
|
||||
record := VectorRecord{
|
||||
ID: fmt.Sprintf("%s_chunk_%d", documentID, index),
|
||||
Content: chunk.Content,
|
||||
Embedding: chunk.Embedding,
|
||||
Metadata: map[string]string{
|
||||
"document_id": documentID,
|
||||
"chunk_index": fmt.Sprintf("%d", index),
|
||||
"content_length": fmt.Sprintf("%d", len(chunk.Content)),
|
||||
},
|
||||
}
|
||||
vectorRecords = append(vectorRecords, record)
|
||||
}
|
||||
|
||||
storeInVectorDatabase(vectorRecords)
|
||||
return vectorRecords, nil
|
||||
}
|
||||
|
||||
func storeInVectorDatabase(records []VectorRecord) {
|
||||
for _, record := range records {
|
||||
if len(record.Embedding) > 0 {
|
||||
fmt.Printf("Storing %s: %d chars, %d dims\n",
|
||||
record.ID, len(record.Content), len(record.Embedding))
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
27
docs/snippets/go/api/batch_extract_bytes_sync.md
Normal file
27
docs/snippets/go/api/batch_extract_bytes_sync.md
Normal file
@@ -0,0 +1,27 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"log"
|
||||
"os"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
doc1, _ := os.ReadFile("doc1.pdf")
|
||||
doc2, _ := os.ReadFile("doc2.docx")
|
||||
|
||||
items := []kreuzberg.BatchBytesItem{
|
||||
{Content: doc1, MimeType: "application/pdf"},
|
||||
{Content: doc2, MimeType: "application/vnd.openxmlformats-officedocument.wordprocessingml.document"},
|
||||
}
|
||||
|
||||
results, err := kreuzberg.BatchExtractBytesSync(items, kreuzberg.ExtractionConfig{})
|
||||
if err != nil {
|
||||
log.Fatalf("batch extraction failed: %v", err)
|
||||
}
|
||||
|
||||
println("Processed", len(results), "documents")
|
||||
}
|
||||
```
|
||||
26
docs/snippets/go/api/batch_extract_files_sync.md
Normal file
26
docs/snippets/go/api/batch_extract_files_sync.md
Normal file
@@ -0,0 +1,26 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
items := []kreuzberg.BatchFileItem{
|
||||
{Path: "doc1.pdf"},
|
||||
{Path: "doc2.docx"},
|
||||
{Path: "doc3.pptx"},
|
||||
}
|
||||
|
||||
results, err := kreuzberg.BatchExtractFilesSync(items, kreuzberg.ExtractionConfig{})
|
||||
if err != nil {
|
||||
log.Fatalf("batch extraction failed: %v", err)
|
||||
}
|
||||
|
||||
for i, result := range results {
|
||||
println("Doc", i, "content length:", len(result.Content))
|
||||
}
|
||||
}
|
||||
```
|
||||
42
docs/snippets/go/api/client_chunk_text.md
Normal file
42
docs/snippets/go/api/client_chunk_text.md
Normal file
@@ -0,0 +1,42 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"io"
|
||||
"log"
|
||||
"net/http"
|
||||
)
|
||||
|
||||
// main posts a JSON chunking request to http://localhost:8000/chunk and prints
// the content of every chunk in the response's "chunks" array.
func main() {
	client := &http.Client{}

	payload := map[string]interface{}{
		"text":         "Your long text content here...",
		"chunker_type": "text",
		"config": map[string]interface{}{
			"max_characters": 1000,
			"overlap":        50,
			"trim":           true,
		},
	}

	data, err := json.Marshal(payload)
	if err != nil {
		log.Fatalf("failed to encode request: %v", err)
	}

	resp, err := client.Post("http://localhost:8000/chunk", "application/json", bytes.NewReader(data))
	if err != nil {
		log.Fatalf("request failed: %v", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		// Best effort: the body is only used to enrich the error message, so a
		// read failure here is deliberately ignored.
		msg, _ := io.ReadAll(resp.Body)
		log.Fatalf("server returned %s: %s", resp.Status, msg)
	}

	var result map[string]interface{}
	if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
		log.Fatalf("failed to decode response: %v", err)
	}

	// Checked assertions: a response without a "chunks" array must not panic.
	chunks, ok := result["chunks"].([]interface{})
	if !ok {
		log.Fatalf("unexpected response: no chunks array")
	}
	log.Printf("Created %d chunks", len(chunks))

	for _, chunk := range chunks {
		c, ok := chunk.(map[string]interface{})
		if !ok {
			continue
		}
		content, ok := c["content"].(string)
		if !ok {
			continue
		}
		println("Chunk content:", content)
	}
}
|
||||
```
|
||||
34
docs/snippets/go/api/client_extract_single_file.md
Normal file
34
docs/snippets/go/api/client_extract_single_file.md
Normal file
@@ -0,0 +1,34 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"io"
|
||||
"log"
|
||||
"mime/multipart"
|
||||
"net/http"
|
||||
"os"
|
||||
)
|
||||
|
||||
// main uploads "document.pdf" as a multipart form field named "files" to
// http://localhost:8000/extract and copies the response body to standard
// output.
func main() {
	file, err := os.Open("document.pdf")
	if err != nil {
		log.Fatalf("failed to open file: %v", err)
	}
	defer file.Close()

	body := &bytes.Buffer{}
	writer := multipart.NewWriter(body)

	part, err := writer.CreateFormFile("files", "document.pdf")
	if err != nil {
		log.Fatalf("failed to create form file: %v", err)
	}
	if _, err := io.Copy(part, file); err != nil {
		log.Fatalf("failed to copy file into request: %v", err)
	}
	// Close writes the trailing multipart boundary; it must succeed before the
	// body is sent.
	if err := writer.Close(); err != nil {
		log.Fatalf("failed to finalize request body: %v", err)
	}

	resp, err := http.Post("http://localhost:8000/extract", writer.FormDataContentType(), body)
	if err != nil {
		log.Fatalf("request failed: %v", err)
	}
	defer resp.Body.Close()

	if _, err := io.Copy(os.Stdout, resp.Body); err != nil {
		log.Fatalf("failed to read response: %v", err)
	}
	if resp.StatusCode != http.StatusOK {
		log.Fatalf("server returned %s", resp.Status)
	}
}
|
||||
```
|
||||
35
docs/snippets/go/api/combining_all_features.md
Normal file
35
docs/snippets/go/api/combining_all_features.md
Normal file
@@ -0,0 +1,35 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
trueVal := true
|
||||
maxChars := uint(1000)
|
||||
overlap := uint(200)
|
||||
config := kreuzberg.ExtractionConfig{
|
||||
UseCache: &trueVal,
|
||||
EnableQualityProcessing: &trueVal,
|
||||
Ocr: &kreuzberg.OcrConfig{
|
||||
Backend: "tesseract",
|
||||
Language: "eng",
|
||||
},
|
||||
Chunking: &kreuzberg.ChunkingConfig{
|
||||
MaxCharacters: &maxChars,
|
||||
Overlap: &overlap,
|
||||
},
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", nil, config)
|
||||
if err != nil {
|
||||
log.Fatalf("extraction failed: %v", err)
|
||||
}
|
||||
|
||||
println("Content length:", len(result.Content))
|
||||
println("Chunks:", len(result.Chunks))
|
||||
}
|
||||
```
|
||||
26
docs/snippets/go/api/error_handling.md
Normal file
26
docs/snippets/go/api/error_handling.md
Normal file
@@ -0,0 +1,26 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
result, err := kreuzberg.ExtractFileSync("missing.pdf", nil, kreuzberg.ExtractionConfig{})
|
||||
if err != nil {
|
||||
if errors.Is(err, kreuzberg.ErrIo) {
|
||||
log.Printf("file not found: %v", err)
|
||||
} else if errors.Is(err, kreuzberg.ErrUnsupportedFormat) {
|
||||
log.Printf("unsupported format: %v", err)
|
||||
} else {
|
||||
log.Printf("extraction error: %v", err)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
println("Content:", result.Content)
|
||||
}
|
||||
```
|
||||
43
docs/snippets/go/api/error_handling_extract.md
Normal file
43
docs/snippets/go/api/error_handling_extract.md
Normal file
@@ -0,0 +1,43 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"io"
|
||||
"log"
|
||||
"mime/multipart"
|
||||
"net/http"
|
||||
"os"
|
||||
)
|
||||
|
||||
// main uploads "document.pdf" as a multipart form field named "files" to
// http://localhost:8000/extract. On a non-200 status it decodes the JSON error
// body and exits with its "error_type" and "message"; otherwise it prints the
// "content" field of the JSON response.
func main() {
	file, err := os.Open("document.pdf")
	if err != nil {
		log.Fatalf("failed to open file: %v", err)
	}
	defer file.Close()

	body := &bytes.Buffer{}
	writer := multipart.NewWriter(body)

	part, err := writer.CreateFormFile("files", "document.pdf")
	if err != nil {
		log.Fatalf("failed to create form file: %v", err)
	}
	if _, err := io.Copy(part, file); err != nil {
		log.Fatalf("failed to copy file into request: %v", err)
	}
	// Close writes the trailing multipart boundary; it must succeed before the
	// body is sent.
	if err := writer.Close(); err != nil {
		log.Fatalf("failed to finalize request body: %v", err)
	}

	resp, err := http.Post("http://localhost:8000/extract", writer.FormDataContentType(), body)
	if err != nil {
		log.Fatalf("request failed: %v", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		var errResp map[string]string
		if err := json.NewDecoder(resp.Body).Decode(&errResp); err != nil {
			// The error body could not be decoded into string fields; report
			// the status rather than printing empty fields.
			log.Fatalf("error: server returned %s (undecodable body: %v)", resp.Status, err)
		}
		log.Fatalf("error: %s: %s", errResp["error_type"], errResp["message"])
	}

	var result map[string]interface{}
	if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
		log.Fatalf("failed to decode response: %v", err)
	}

	// Checked assertion: a response without a string "content" must not panic.
	content, ok := result["content"].(string)
	if !ok {
		log.Fatalf("unexpected response: no content string")
	}
	println("Success:", content)
}
|
||||
```
|
||||
24
docs/snippets/go/api/extract_bytes_async.md
Normal file
24
docs/snippets/go/api/extract_bytes_async.md
Normal file
@@ -0,0 +1,24 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"log"
|
||||
"os"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
content, err := os.ReadFile("document.pdf")
|
||||
if err != nil {
|
||||
log.Fatalf("failed to read file: %v", err)
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractBytes(content, "application/pdf", kreuzberg.ExtractionConfig{})
|
||||
if err != nil {
|
||||
log.Fatalf("extraction failed: %v", err)
|
||||
}
|
||||
|
||||
println("Content:", result.Content)
|
||||
}
|
||||
```
|
||||
24
docs/snippets/go/api/extract_bytes_sync.md
Normal file
24
docs/snippets/go/api/extract_bytes_sync.md
Normal file
@@ -0,0 +1,24 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"log"
|
||||
"os"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
content, err := os.ReadFile("document.pdf")
|
||||
if err != nil {
|
||||
log.Fatalf("failed to read file: %v", err)
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractBytesSync(content, "application/pdf", kreuzberg.ExtractionConfig{})
|
||||
if err != nil {
|
||||
log.Fatalf("extraction failed: %v", err)
|
||||
}
|
||||
|
||||
println("Content:", result.Content)
|
||||
}
|
||||
```
|
||||
19
docs/snippets/go/api/extract_file_async.md
Normal file
19
docs/snippets/go/api/extract_file_async.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
result, err := kreuzberg.ExtractFile("document.pdf", nil, kreuzberg.ExtractionConfig{})
|
||||
if err != nil {
|
||||
log.Fatalf("extraction failed: %v", err)
|
||||
}
|
||||
|
||||
println("Content:", result.Content)
|
||||
println("MIME type:", result.MimeType)
|
||||
}
|
||||
```
|
||||
18
docs/snippets/go/api/extract_file_sync.md
Normal file
18
docs/snippets/go/api/extract_file_sync.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", nil, kreuzberg.ExtractionConfig{})
|
||||
if err != nil {
|
||||
log.Fatalf("extraction failed: %v", err)
|
||||
}
|
||||
|
||||
println("Content:", result.Content)
|
||||
}
|
||||
```
|
||||
85
docs/snippets/go/benchmarking/simple_benchmark.go
Normal file
85
docs/snippets/go/benchmarking/simple_benchmark.go
Normal file
@@ -0,0 +1,85 @@
|
||||
```go title="simple_benchmark.go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"kreuzberg"
|
||||
)
|
||||
|
||||
func main() {
|
||||
config := &kreuzberg.ExtractionConfig{
|
||||
UseCache: false,
|
||||
}
|
||||
client, err := kreuzberg.New(config)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
defer client.Close()
|
||||
|
||||
filePath := "document.pdf"
|
||||
numRuns := 10
|
||||
|
||||
fmt.Printf("Sync extraction (%d runs):\n", numRuns)
|
||||
start := time.Now()
|
||||
for i := 0; i < numRuns; i++ {
|
||||
_, err := client.ExtractFile(filePath)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
syncDuration := time.Since(start).Seconds()
|
||||
avgSync := syncDuration / float64(numRuns)
|
||||
fmt.Printf(" - Total time: %.3fs\n", syncDuration)
|
||||
fmt.Printf(" - Average: %.3fs per extraction\n", avgSync)
|
||||
|
||||
fmt.Printf("\nAsync extraction (%d parallel runs):\n", numRuns)
|
||||
start = time.Now()
|
||||
var wg sync.WaitGroup
|
||||
wg.Add(numRuns)
|
||||
for i := 0; i < numRuns; i++ {
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
_, err := client.ExtractFile(filePath)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}()
|
||||
}
|
||||
wg.Wait()
|
||||
asyncDuration := time.Since(start).Seconds()
|
||||
fmt.Printf(" - Total time: %.3fs\n", asyncDuration)
|
||||
fmt.Printf(" - Average: %.3fs per extraction\n", asyncDuration/float64(numRuns))
|
||||
fmt.Printf(" - Speedup: %.1fx\n", syncDuration/asyncDuration)
|
||||
|
||||
cacheConfig := &kreuzberg.ExtractionConfig{
|
||||
UseCache: true,
|
||||
}
|
||||
clientCached, err := kreuzberg.New(cacheConfig)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
defer clientCached.Close()
|
||||
|
||||
fmt.Println("\nFirst extraction (populates cache)...")
|
||||
start = time.Now()
|
||||
_, err = clientCached.ExtractFile(filePath)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
firstDuration := time.Since(start).Seconds()
|
||||
fmt.Printf(" - Time: %.3fs\n", firstDuration)
|
||||
|
||||
fmt.Println("Second extraction (from cache)...")
|
||||
start = time.Now()
|
||||
_, err = clientCached.ExtractFile(filePath)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
cachedDuration := time.Since(start).Seconds()
|
||||
fmt.Printf(" - Time: %.3fs\n", cachedDuration)
|
||||
fmt.Printf(" - Cache speedup: %.1fx\n", firstDuration/cachedDuration)
|
||||
}
|
||||
```
|
||||
38
docs/snippets/go/cache/disk_cache.go
vendored
Normal file
38
docs/snippets/go/cache/disk_cache.go
vendored
Normal file
@@ -0,0 +1,38 @@
|
||||
```go title="disk_cache.go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
useCache := true
|
||||
namespace := "documents"
|
||||
ttl := uint64(7 * 86400)
|
||||
|
||||
config := kreuzberg.ExtractionConfig{
|
||||
UseCache: &useCache,
|
||||
CacheNamespace: &namespace,
|
||||
CacheTTLSecs: &ttl,
|
||||
}
|
||||
|
||||
fmt.Println("First extraction (will be cached)...")
|
||||
result1, err := kreuzberg.ExtractFileSync("document.pdf", nil, config)
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
fmt.Printf(" - Content length: %d\n", len(result1.Content))
|
||||
|
||||
fmt.Println("\nSecond extraction (from cache)...")
|
||||
result2, err := kreuzberg.ExtractFileSync("document.pdf", nil, config)
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
fmt.Printf(" - Content length: %d\n", len(result2.Content))
|
||||
|
||||
fmt.Printf("\nResults are identical: %v\n", result1.Content == result2.Content)
|
||||
}
|
||||
```
|
||||
35
docs/snippets/go/cli/basic_cli.go
Normal file
35
docs/snippets/go/cli/basic_cli.go
Normal file
@@ -0,0 +1,35 @@
|
||||
```go title="basic_cli.go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os/exec"
|
||||
)
|
||||
|
||||
// extractWithCli runs `kreuzberg extract <filePath> --format <outputFormat>`
// and returns the command's combined stdout and stderr as a string. If the
// command fails, the error wraps the underlying failure and includes whatever
// output was captured.
func extractWithCli(filePath string, outputFormat string) (string, error) {
	args := []string{"extract", filePath, "--format", outputFormat}

	raw, err := exec.Command("kreuzberg", args...).CombinedOutput()
	if err != nil {
		return "", fmt.Errorf("CLI error: %w, output: %s", err, string(raw))
	}
	return string(raw), nil
}
|
||||
|
||||
func main() {
|
||||
document := "document.pdf"
|
||||
|
||||
textOutput, err := extractWithCli(document, "text")
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
fmt.Printf("Extracted: %d characters\n", len(textOutput))
|
||||
|
||||
jsonOutput, err := extractWithCli(document, "json")
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
fmt.Printf("JSON output received: %d bytes\n", len(jsonOutput))
|
||||
}
|
||||
```
|
||||
54
docs/snippets/go/cli/cli_with_config.go
Normal file
54
docs/snippets/go/cli/cli_with_config.go
Normal file
@@ -0,0 +1,54 @@
|
||||
```go title="cli_with_config.go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os/exec"
|
||||
)
|
||||
|
||||
// ExtractionResult is the subset of the CLI's JSON output that this example
// decodes.
type ExtractionResult struct {
	// Content is decoded from the "content" key.
	Content string `json:"content"`
	// Format is decoded from the "format" key.
	Format string `json:"format"`
	// Languages is decoded from the "languages" key.
	Languages []string `json:"languages"`
}
|
||||
|
||||
func extractWithConfig(filePath string, configPath string) (*ExtractionResult, error) {
|
||||
cmd := exec.Command(
|
||||
"kreuzberg",
|
||||
"extract",
|
||||
filePath,
|
||||
"--config",
|
||||
configPath,
|
||||
"--format",
|
||||
"json",
|
||||
)
|
||||
|
||||
output, err := cmd.CombinedOutput()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("CLI error: %w, output: %s", err, string(output))
|
||||
}
|
||||
|
||||
var result ExtractionResult
|
||||
if err := json.Unmarshal(output, &result); err != nil {
|
||||
return nil, fmt.Errorf("JSON parse error: %w", err)
|
||||
}
|
||||
|
||||
return &result, nil
|
||||
}
|
||||
|
||||
func main() {
|
||||
configFile := "kreuzberg.toml"
|
||||
document := "document.pdf"
|
||||
|
||||
fmt.Printf("Extracting %s with config %s\n", document, configFile)
|
||||
result, err := extractWithConfig(document, configFile)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
fmt.Printf("Content length: %d\n", len(result.Content))
|
||||
fmt.Printf("Format: %s\n", result.Format)
|
||||
fmt.Printf("Languages: %v\n", result.Languages)
|
||||
}
|
||||
```
|
||||
54
docs/snippets/go/config/advanced_config.md
Normal file
54
docs/snippets/go/config/advanced_config.md
Normal file
@@ -0,0 +1,54 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
maxChars := uint(1000)
|
||||
maxOverlap := uint(100)
|
||||
useCache := true
|
||||
enableQuality := true
|
||||
languageDetectionEnabled := true
|
||||
|
||||
config := kreuzberg.ExtractionConfig{
|
||||
Ocr: &kreuzberg.OcrConfig{
|
||||
Backend: "tesseract",
|
||||
Language: "eng+deu",
|
||||
},
|
||||
Chunking: &kreuzberg.ChunkingConfig{
|
||||
MaxCharacters: &maxChars,
|
||||
Overlap: &maxOverlap,
|
||||
},
|
||||
LanguageDetection: &kreuzberg.LanguageDetectionConfig{
|
||||
Enabled: &languageDetectionEnabled,
|
||||
DetectMultiple: true,
|
||||
},
|
||||
UseCache: &useCache,
|
||||
EnableQualityProcessing: &enableQuality,
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", nil, config)
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
|
||||
// Access chunks
|
||||
if len(result.Chunks) > 0 {
|
||||
snippet := result.Chunks[0].Content
|
||||
if len(snippet) > 100 {
|
||||
snippet = snippet[:100]
|
||||
}
|
||||
fmt.Printf("First chunk: %s...\n", snippet)
|
||||
}
|
||||
|
||||
// Access detected languages
|
||||
if len(result.DetectedLanguages) > 0 {
|
||||
fmt.Printf("Languages: %v\n", result.DetectedLanguages)
|
||||
}
|
||||
}
|
||||
```
|
||||
103
docs/snippets/go/config/chunking_config.md
Normal file
103
docs/snippets/go/config/chunking_config.md
Normal file
@@ -0,0 +1,103 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
maxChars := uint(1000)
|
||||
overlap := uint(200)
|
||||
config := kreuzberg.ExtractionConfig{
|
||||
Chunking: &kreuzberg.ChunkingConfig{
|
||||
MaxCharacters: &maxChars,
|
||||
Overlap: &overlap,
|
||||
},
|
||||
}
|
||||
|
||||
fmt.Printf("Config: MaxCharacters=%d, Overlap=%d\n",
|
||||
*config.Chunking.MaxCharacters, *config.Chunking.Overlap)
|
||||
}
|
||||
```
|
||||
|
||||
```go title="Go - Markdown with Heading Context"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
maxChars := uint(500)
|
||||
overlap := uint(50)
|
||||
model := "Xenova/gpt-4o"
|
||||
chunkerType := kreuzberg.ChunkerTypeMarkdown
|
||||
|
||||
config := kreuzberg.ExtractionConfig{
|
||||
Chunking: &kreuzberg.ChunkingConfig{
|
||||
MaxCharacters: &maxChars,
|
||||
Overlap: &overlap,
|
||||
ChunkerType: &chunkerType,
|
||||
Sizing: kreuzberg.ChunkSizing{
|
||||
Type: "tokenizer",
|
||||
Model: &model,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFile("document.md", nil, config)
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
|
||||
for _, chunk := range result.Chunks {
|
||||
if chunk.Metadata.HeadingContext != nil {
|
||||
for _, heading := range chunk.Metadata.HeadingContext.Headings {
|
||||
fmt.Printf("Heading L%d: %s\n", heading.Level, heading.Text)
|
||||
}
|
||||
}
|
||||
fmt.Printf("Content: %.100s...\n", chunk.Content)
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
```go title="Go - Prepend Heading Context"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
maxChars := uint(500)
|
||||
overlap := uint(50)
|
||||
chunkerType := kreuzberg.ChunkerTypeMarkdown
|
||||
|
||||
config := kreuzberg.ExtractionConfig{
|
||||
Chunking: &kreuzberg.ChunkingConfig{
|
||||
MaxCharacters: &maxChars,
|
||||
Overlap: &overlap,
|
||||
ChunkerType: &chunkerType,
|
||||
PrependHeadingContext: true,
|
||||
},
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFile("document.md", nil, config)
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
|
||||
for _, chunk := range result.Chunks {
|
||||
// Each chunk's content is prefixed with its heading breadcrumb
|
||||
fmt.Printf("Content: %.100s...\n", chunk.Content)
|
||||
}
|
||||
}
|
||||
```
|
||||
24
docs/snippets/go/config/config_basic.md
Normal file
24
docs/snippets/go/config/config_basic.md
Normal file
@@ -0,0 +1,24 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
useCache := true
|
||||
enableQP := true
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", &kreuzberg.ExtractionConfig{
|
||||
UseCache: &useCache,
|
||||
EnableQualityProcessing: &enableQP,
|
||||
})
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
|
||||
log.Println("content length:", len(result.Content))
|
||||
}
|
||||
```
|
||||
23
docs/snippets/go/config/config_discover.md
Normal file
23
docs/snippets/go/config/config_discover.md
Normal file
@@ -0,0 +1,23 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
config, err := kreuzberg.LoadExtractionConfigFromFile("")
|
||||
if err != nil {
|
||||
log.Fatalf("discover config failed: %v", err)
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", config)
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
|
||||
log.Printf("Content length: %d", len(result.Content))
|
||||
}
|
||||
```
|
||||
23
docs/snippets/go/config/config_file.md
Normal file
23
docs/snippets/go/config/config_file.md
Normal file
@@ -0,0 +1,23 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
config, err := kreuzberg.LoadExtractionConfigFromFile("kreuzberg.toml")
|
||||
if err != nil {
|
||||
log.Fatalf("load config failed: %v", err)
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", config)
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
|
||||
log.Printf("Detected MIME: %s", result.MimeType)
|
||||
}
|
||||
```
|
||||
19
docs/snippets/go/config/config_ocr.md
Normal file
19
docs/snippets/go/config/config_ocr.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import "github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
|
||||
func main() {
|
||||
psm := int32(3)
|
||||
|
||||
_ = kreuzberg.ExtractionConfig{
|
||||
Ocr: &kreuzberg.OcrConfig{
|
||||
Backend: "tesseract",
|
||||
Language: "eng+fra",
|
||||
TesseractConfig: &kreuzberg.TesseractConfig{
|
||||
Psm: &psm,
|
||||
},
|
||||
},
|
||||
}
|
||||
}
|
||||
```
|
||||
37
docs/snippets/go/config/config_programmatic.md
Normal file
37
docs/snippets/go/config/config_programmatic.md
Normal file
@@ -0,0 +1,37 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
psm := int32(6)
|
||||
maxChars := uint(1000)
|
||||
overlap := uint(200)
|
||||
useCache := true
|
||||
|
||||
config := kreuzberg.ExtractionConfig{
|
||||
UseCache: &useCache,
|
||||
Ocr: &kreuzberg.OcrConfig{
|
||||
Backend: "tesseract",
|
||||
TesseractConfig: &kreuzberg.TesseractConfig{
|
||||
Psm: &psm,
|
||||
},
|
||||
},
|
||||
Chunking: &kreuzberg.ChunkingConfig{
|
||||
MaxCharacters: &maxChars,
|
||||
Overlap: &overlap,
|
||||
},
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", nil, config)
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
|
||||
log.Printf("Content length: %d", len(result.Content))
|
||||
}
|
||||
```
|
||||
25
docs/snippets/go/config/document_structure_config.md
Normal file
25
docs/snippets/go/config/document_structure_config.md
Normal file
@@ -0,0 +1,25 @@
|
||||
```go title="Document Structure Config (Go)"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
kreuzberg "github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
config := kreuzberg.NewExtractionConfig(
|
||||
kreuzberg.WithIncludeDocumentStructure(true),
|
||||
)
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", config)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
if result.Document != nil {
|
||||
for _, node := range result.Document.Nodes {
|
||||
fmt.Printf("[%s]\n", node.Content.NodeType)
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
60
docs/snippets/go/config/element_based_output.md
Normal file
60
docs/snippets/go/config/element_based_output.md
Normal file
@@ -0,0 +1,60 @@
|
||||
```go title="Element-Based Output (Go)"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"kreuzberg"
|
||||
)
|
||||
|
||||
func main() {
|
||||
// Configure element-based output
|
||||
config := &kreuzberg.ExtractionConfig{
|
||||
OutputFormat: "element_based",
|
||||
}
|
||||
|
||||
// Extract document
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", config)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
// Access elements
|
||||
for _, element := range result.Elements {
|
||||
fmt.Printf("Type: %s\n", element.ElementType)
|
||||
|
||||
text := element.Text
|
||||
if len(text) > 100 {
|
||||
text = text[:100]
|
||||
}
|
||||
fmt.Printf("Text: %s\n", text)
|
||||
|
||||
if element.Metadata.PageNumber != nil {
|
||||
fmt.Printf("Page: %d\n", *element.Metadata.PageNumber)
|
||||
}
|
||||
|
||||
if element.Metadata.Coordinates != nil {
|
||||
coords := element.Metadata.Coordinates
|
||||
fmt.Printf("Coords: (%f, %f) - (%f, %f)\n",
|
||||
coords.Left, coords.Top, coords.Right, coords.Bottom)
|
||||
}
|
||||
|
||||
fmt.Println("---")
|
||||
}
|
||||
|
||||
// Filter by element type
|
||||
var titles []kreuzberg.Element
|
||||
for _, element := range result.Elements {
|
||||
if element.ElementType == "title" {
|
||||
titles = append(titles, element)
|
||||
}
|
||||
}
|
||||
|
||||
for _, title := range titles {
|
||||
level, ok := title.Metadata.Additional["level"].(string)
|
||||
if !ok {
|
||||
level = "unknown"
|
||||
}
|
||||
fmt.Printf("[%s] %s\n", level, title.Text)
|
||||
}
|
||||
}
|
||||
```
|
||||
100
docs/snippets/go/config/embedding_config.go
Normal file
100
docs/snippets/go/config/embedding_config.go
Normal file
@@ -0,0 +1,100 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"kreuzberg"
|
||||
)
|
||||
|
||||
func main() {
|
||||
// Example 1: Preset model (recommended)
|
||||
// Fast, balanced, or quality preset configurations optimized for common use cases.
|
||||
embeddingConfig := kreuzberg.EmbeddingConfig{
|
||||
Model: kreuzberg.EmbeddingModelType{
|
||||
Type: "preset",
|
||||
Name: "balanced",
|
||||
},
|
||||
BatchSize: 32,
|
||||
Normalize: true,
|
||||
ShowDownloadProgress: true,
|
||||
CacheDir: "~/.cache/kreuzberg/embeddings",
|
||||
}
|
||||
|
||||
// Available presets:
|
||||
// - "fast" (384 dims): Quick prototyping, development, resource-constrained
|
||||
// - "balanced" (768 dims): Production, general-purpose RAG, English documents
|
||||
// - "quality" (1024 dims): Complex documents, maximum accuracy
|
||||
// - "multilingual" (768 dims): International documents, 100+ languages
|
||||
|
||||
// Example 2: Custom ONNX model (requires embeddings feature)
|
||||
// Direct access to specific ONNX embedding models from HuggingFace with custom dimensions.
|
||||
embeddingConfig = kreuzberg.EmbeddingConfig{
|
||||
Model: kreuzberg.EmbeddingModelType{
|
||||
Type: "custom",
|
||||
ModelID: "BAAI/bge-small-en-v1.5",
|
||||
Dimensions: 384,
|
||||
},
|
||||
BatchSize: 32,
|
||||
Normalize: true,
|
||||
ShowDownloadProgress: true,
|
||||
CacheDir: "", // Uses default: .kreuzberg/embeddings/
|
||||
}
|
||||
|
||||
// Popular ONNX-compatible models:
|
||||
// - "BAAI/bge-small-en-v1.5" (384 dims): Fast, efficient
|
||||
// - "BAAI/bge-base-en-v1.5" (768 dims): Balanced quality/speed
|
||||
// - "BAAI/bge-large-en-v1.5" (1024 dims): High quality, slower
|
||||
// - "sentence-transformers/paraphrase-multilingual-mpnet-base-v2" (768 dims): Multilingual support
|
||||
|
||||
// Example 3: Alternative Custom ONNX Model
|
||||
// For advanced users wanting different ONNX embedding models.
|
||||
embeddingConfig = kreuzberg.EmbeddingConfig{
|
||||
Model: kreuzberg.EmbeddingModelType{
|
||||
Type: "custom",
|
||||
ModelID: "sentence-transformers/all-mpnet-base-v2",
|
||||
Dimensions: 768,
|
||||
},
|
||||
BatchSize: 16, // Larger model requires smaller batch size
|
||||
Normalize: true,
|
||||
ShowDownloadProgress: true,
|
||||
CacheDir: "/var/cache/embeddings",
|
||||
}
|
||||
|
||||
// Integration with ChunkingConfig
|
||||
// Add embeddings to your chunking configuration:
|
||||
chunkingConfig := kreuzberg.ChunkingConfig{
|
||||
MaxChars: 1024,
|
||||
MaxOverlap: 100,
|
||||
Preset: "balanced",
|
||||
Embedding: &kreuzberg.EmbeddingConfig{
|
||||
Model: kreuzberg.EmbeddingModelType{
|
||||
Type: "preset",
|
||||
Name: "balanced",
|
||||
},
|
||||
BatchSize: 32,
|
||||
Normalize: true,
|
||||
},
|
||||
}
|
||||
|
||||
extractionConfig := kreuzberg.ExtractionConfig{
|
||||
Chunking: &chunkingConfig,
|
||||
}
|
||||
|
||||
_ = embeddingConfig
|
||||
_ = extractionConfig
|
||||
}
|
||||
|
||||
// Key parameter explanations:
|
||||
//
|
||||
// BatchSize: Number of texts to embed at once (32-128 typical)
|
||||
// - Larger batches are faster but use more memory
|
||||
// - Smaller batches for resource-constrained environments
|
||||
//
|
||||
// Normalize: Whether to normalize vectors (L2 norm)
|
||||
// - true (recommended): Enables cosine similarity in vector DBs
|
||||
// - false: Raw embedding values
|
||||
//
|
||||
// CacheDir: Where to store downloaded models
|
||||
// - Empty string: Uses .kreuzberg/embeddings/ in current directory
|
||||
// - Non-empty: Custom directory for model storage
|
||||
//
|
||||
// ShowDownloadProgress: Display download progress bar
|
||||
// - Useful for monitoring large model downloads
|
||||
37
docs/snippets/go/config/embedding_config.md
Normal file
37
docs/snippets/go/config/embedding_config.md
Normal file
@@ -0,0 +1,37 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
maxChars := uint(1000)
|
||||
batchSize := uint(16)
|
||||
normalize := true
|
||||
modelName := "all-mpnet-base-v2"
|
||||
|
||||
cfg := kreuzberg.ExtractionConfig{
|
||||
Chunking: &kreuzberg.ChunkingConfig{
|
||||
MaxCharacters: &maxChars,
|
||||
Embedding: &kreuzberg.EmbeddingConfig{
|
||||
Model: kreuzberg.EmbeddingModelType{
|
||||
Type: "preset",
|
||||
Name: &modelName,
|
||||
},
|
||||
BatchSize: &batchSize,
|
||||
Normalize: &normalize,
|
||||
ShowDownloadProgress: true,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", nil, cfg)
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
log.Println("content length:", len(result.Content))
|
||||
}
|
||||
```
|
||||
94
docs/snippets/go/config/hierarchy_config.go
Normal file
94
docs/snippets/go/config/hierarchy_config.go
Normal file
@@ -0,0 +1,94 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"kreuzberg"
|
||||
)
|
||||
|
||||
func main() {
|
||||
// Example 1: Basic hierarchy extraction
|
||||
// Enabled with default KClusters=6 for standard H1-H6 heading hierarchy.
|
||||
// Extract bounding box information for spatial layout awareness.
|
||||
hierarchyConfigBasic := kreuzberg.HierarchyConfig{
|
||||
Enabled: func(b bool) *bool { return &b }(true),
|
||||
KClusters: func(i int) *int { return &i }(6), // Default: creates 6 font size clusters (H1-H6 structure)
|
||||
IncludeBbox: func(b bool) *bool { return &b }(true), // Include bounding box coordinates
|
||||
OcrCoverageThreshold: nil, // No OCR coverage threshold
|
||||
}
|
||||
|
||||
pdfConfigBasic := kreuzberg.PdfConfig{
|
||||
Hierarchy: &hierarchyConfigBasic,
|
||||
}
|
||||
|
||||
extractionConfigBasic := kreuzberg.ExtractionConfig{
|
||||
PdfOptions: &pdfConfigBasic,
|
||||
}
|
||||
|
||||
// Use with ExtractFileSync or ExtractBytesSync
|
||||
// result, err := kreuzberg.ExtractFileSync("document.pdf", extractionConfigBasic)
|
||||
|
||||
|
||||
// Example 2: Custom KClusters for minimal structure
|
||||
// Use 3 clusters for simpler hierarchy with minimal structure.
|
||||
// Useful when you only need major section divisions (Main, Subsection, Detail).
|
||||
hierarchyConfigMinimal := kreuzberg.HierarchyConfig{
|
||||
Enabled: func(b bool) *bool { return &b }(true),
|
||||
KClusters: func(i int) *int { return &i }(3), // Minimal clustering: just 3 levels
|
||||
IncludeBbox: func(b bool) *bool { return &b }(true),
|
||||
OcrCoverageThreshold: nil,
|
||||
}
|
||||
|
||||
pdfConfigMinimal := kreuzberg.PdfConfig{
|
||||
Hierarchy: &hierarchyConfigMinimal,
|
||||
}
|
||||
|
||||
extractionConfigMinimal := kreuzberg.ExtractionConfig{
|
||||
PdfOptions: &pdfConfigMinimal,
|
||||
}
|
||||
|
||||
_ = extractionConfigMinimal
|
||||
|
||||
|
||||
// Example 3: With OCR coverage threshold
|
||||
// Trigger OCR if less than 50% of text has font data.
|
||||
// Useful for documents with mixed digital and scanned content.
|
||||
ocrThreshold := 0.5
|
||||
hierarchyConfigOcr := kreuzberg.HierarchyConfig{
|
||||
Enabled: func(b bool) *bool { return &b }(true),
|
||||
KClusters: func(i int) *int { return &i }(6),
|
||||
IncludeBbox: func(b bool) *bool { return &b }(true),
|
||||
OcrCoverageThreshold: &ocrThreshold, // Trigger OCR if text coverage < 50%
|
||||
}
|
||||
|
||||
pdfConfigOcr := kreuzberg.PdfConfig{
|
||||
Hierarchy: &hierarchyConfigOcr,
|
||||
}
|
||||
|
||||
extractionConfigOcr := kreuzberg.ExtractionConfig{
|
||||
PdfOptions: &pdfConfigOcr,
|
||||
}
|
||||
|
||||
_ = extractionConfigOcr
|
||||
}
|
||||
|
||||
// Field descriptions:
|
||||
//
|
||||
// Enabled: *bool (default: true)
|
||||
// - Enable or disable hierarchy extraction
|
||||
// - When false, hierarchy structure is not analyzed
|
||||
//
|
||||
// KClusters: *int (default: 6, valid: 1-7)
|
||||
// - Number of font size clusters for hierarchy levels
|
||||
// - 6 provides H1-H6 heading levels with body text
|
||||
// - Higher values create more fine-grained hierarchy
|
||||
// - Lower values create simpler structure
|
||||
//
|
||||
// IncludeBbox: *bool (default: true)
|
||||
// - Include bounding box coordinates in hierarchy blocks
|
||||
// - Required for spatial layout awareness and document structure
|
||||
// - Set to false only if space optimization is critical
|
||||
//
|
||||
// OcrCoverageThreshold: *float64 (default: nil)
|
||||
// - Range: 0.0 to 1.0
|
||||
// - Triggers OCR when text block coverage falls below this fraction
|
||||
// - Example: 0.5 means "run OCR if less than 50% of page has text data"
|
||||
// - nil means no OCR coverage-based triggering
|
||||
31
docs/snippets/go/config/html_output.md
Normal file
31
docs/snippets/go/config/html_output.md
Normal file
@@ -0,0 +1,31 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
outputFormat := kreuzberg.OutputFormatHTML
|
||||
theme := kreuzberg.HTMLThemeGitHub
|
||||
embedCSS := true
|
||||
|
||||
config := &kreuzberg.ExtractionConfig{
|
||||
OutputFormat: &outputFormat,
|
||||
HTMLOutput: &kreuzberg.HTMLOutputConfig{
|
||||
Theme: &theme,
|
||||
EmbedCSS: &embedCSS,
|
||||
},
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", config)
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
|
||||
fmt.Println(result.Content) // HTML with kb-* classes
|
||||
}
|
||||
```
|
||||
89
docs/snippets/go/config/keyword_config.go
Normal file
89
docs/snippets/go/config/keyword_config.go
Normal file
@@ -0,0 +1,89 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"kreuzberg"
|
||||
)
|
||||
|
||||
// Example 1: Basic YAKE configuration
|
||||
// Uses YAKE algorithm with default parameters and English stopword filtering
|
||||
func basicYake() error {
|
||||
config := &kreuzberg.ExtractionConfig{
|
||||
Keywords: &kreuzberg.KeywordConfig{
|
||||
Algorithm: "yake",
|
||||
MaxKeywords: 10,
|
||||
MinScore: 0.0,
|
||||
NgramRange: [2]int{1, 3},
|
||||
Language: "en",
|
||||
YakeParams: nil,
|
||||
RakeParams: nil,
|
||||
},
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", config)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
fmt.Printf("Keywords: %v\n", result.Keywords)
|
||||
return nil
|
||||
}
|
||||
|
||||
// Example 2: Advanced YAKE with custom parameters
|
||||
// Fine-tunes YAKE with custom window size for co-occurrence analysis
|
||||
func advancedYake() error {
|
||||
config := &kreuzberg.ExtractionConfig{
|
||||
Keywords: &kreuzberg.KeywordConfig{
|
||||
Algorithm: "yake",
|
||||
MaxKeywords: 15,
|
||||
MinScore: 0.1,
|
||||
NgramRange: [2]int{1, 2},
|
||||
Language: "en",
|
||||
YakeParams: &kreuzberg.YakeParams{
|
||||
WindowSize: 1,
|
||||
},
|
||||
RakeParams: nil,
|
||||
},
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", config)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
fmt.Printf("Keywords: %v\n", result.Keywords)
|
||||
return nil
|
||||
}
|
||||
|
||||
// Example 3: RAKE configuration
|
||||
// Uses RAKE algorithm for rapid keyword extraction with phrase constraints
|
||||
func rakeConfig() error {
|
||||
config := &kreuzberg.ExtractionConfig{
|
||||
Keywords: &kreuzberg.KeywordConfig{
|
||||
Algorithm: "rake",
|
||||
MaxKeywords: 10,
|
||||
MinScore: 5.0,
|
||||
NgramRange: [2]int{1, 3},
|
||||
Language: "en",
|
||||
YakeParams: nil,
|
||||
RakeParams: &kreuzberg.RakeParams{
|
||||
MinWordLength: 1,
|
||||
MaxWordsPerPhrase: 3,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", config)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
fmt.Printf("Keywords: %v\n", result.Keywords)
|
||||
return nil
|
||||
}
|
||||
|
||||
func main() {
|
||||
if err := basicYake(); err != nil {
|
||||
fmt.Println("Error:", err)
|
||||
}
|
||||
}
|
||||
26
docs/snippets/go/config/keyword_extraction_config.md
Normal file
26
docs/snippets/go/config/keyword_extraction_config.md
Normal file
@@ -0,0 +1,26 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
config := &kreuzberg.ExtractionConfig{
|
||||
Keywords: &kreuzberg.KeywordConfig{
|
||||
Algorithm: "YAKE",
|
||||
MaxKeywords: 10,
|
||||
MinScore: 0.3,
|
||||
NgramRange: "1,3",
|
||||
Language: "en",
|
||||
},
|
||||
}
|
||||
|
||||
fmt.Printf("Keywords config: Algorithm=%s, MaxKeywords=%d, MinScore=%f\n",
|
||||
config.Keywords.Algorithm,
|
||||
config.Keywords.MaxKeywords,
|
||||
config.Keywords.MinScore)
|
||||
}
|
||||
```
|
||||
23
docs/snippets/go/config/language_detection_config.md
Normal file
23
docs/snippets/go/config/language_detection_config.md
Normal file
@@ -0,0 +1,23 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
minConfidence := 0.8
|
||||
config := &kreuzberg.ExtractionConfig{
|
||||
LanguageDetection: &kreuzberg.LanguageDetectionConfig{
|
||||
Enabled: true,
|
||||
MinConfidence: &minConfidence,
|
||||
DetectMultiple: false,
|
||||
},
|
||||
}
|
||||
|
||||
fmt.Printf("Language detection enabled: %v\n", config.LanguageDetection.Enabled)
|
||||
fmt.Printf("Min confidence: %f\n", *config.LanguageDetection.MinConfidence)
|
||||
}
|
||||
```
|
||||
28
docs/snippets/go/config/ocr_dpi_config.md
Normal file
28
docs/snippets/go/config/ocr_dpi_config.md
Normal file
@@ -0,0 +1,28 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
targetDpi := int32(300)
|
||||
result, err := kreuzberg.ExtractFileSync("scanned.pdf", nil, kreuzberg.ExtractionConfig{
|
||||
Ocr: &kreuzberg.OcrConfig{
|
||||
Backend: "tesseract",
|
||||
TesseractConfig: &kreuzberg.TesseractConfig{
|
||||
Preprocessing: &kreuzberg.ImagePreprocessingConfig{
|
||||
TargetDpi: &targetDpi,
|
||||
},
|
||||
},
|
||||
},
|
||||
})
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
|
||||
log.Println("content length:", len(result.Content))
|
||||
}
|
||||
```
|
||||
26
docs/snippets/go/config/pdf_config.md
Normal file
26
docs/snippets/go/config/pdf_config.md
Normal file
@@ -0,0 +1,26 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
extractMetadata := true
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", nil, kreuzberg.ExtractionConfig{
|
||||
PdfOptions: &kreuzberg.PdfConfig{
|
||||
ExtractImages: true,
|
||||
ExtractMetadata: &extractMetadata,
|
||||
Passwords: []string{"password1", "password2"},
|
||||
Hierarchy: &kreuzberg.HierarchyConfig{},
|
||||
},
|
||||
})
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
|
||||
log.Println("content length:", len(result.Content))
|
||||
}
|
||||
```
|
||||
42
docs/snippets/go/config/pdf_hierarchy_config.md
Normal file
42
docs/snippets/go/config/pdf_hierarchy_config.md
Normal file
@@ -0,0 +1,42 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import "github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
|
||||
func main() {
|
||||
enabled := true
|
||||
includeBbox := true
|
||||
kClusters := uint(6)
|
||||
kClustersAdvanced := uint(12)
|
||||
threshold := float32(0.8)
|
||||
|
||||
// Basic hierarchy configuration
|
||||
config := kreuzberg.ExtractionConfig{
|
||||
PdfOptions: &kreuzberg.PdfConfig{
|
||||
ExtractImages: true,
|
||||
Hierarchy: &kreuzberg.HierarchyConfig{
|
||||
Enabled: &enabled,
|
||||
KClusters: &kClusters,
|
||||
IncludeBbox: &includeBbox,
|
||||
OcrCoverageThreshold: &threshold,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
// Advanced hierarchy configuration with more clusters
|
||||
advancedConfig := kreuzberg.ExtractionConfig{
|
||||
PdfOptions: &kreuzberg.PdfConfig{
|
||||
ExtractImages: true,
|
||||
Hierarchy: &kreuzberg.HierarchyConfig{
|
||||
Enabled: &enabled,
|
||||
KClusters: &kClustersAdvanced,
|
||||
IncludeBbox: &includeBbox,
|
||||
OcrCoverageThreshold: &threshold,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
_ = config
|
||||
_ = advancedConfig
|
||||
}
|
||||
```
|
||||
18
docs/snippets/go/config/postprocessor_config.md
Normal file
18
docs/snippets/go/config/postprocessor_config.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import "github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
|
||||
func main() {
|
||||
enabled := true
|
||||
cfg := &kreuzberg.ExtractionConfig{
|
||||
Postprocessor: &kreuzberg.PostProcessorConfig{
|
||||
Enabled: &enabled,
|
||||
EnabledProcessors: []string{"deduplication", "whitespace_normalization"},
|
||||
DisabledProcessors: []string{"mojibake_fix"},
|
||||
},
|
||||
}
|
||||
|
||||
_ = cfg
|
||||
}
|
||||
```
|
||||
17
docs/snippets/go/config/quality_processing_config.md
Normal file
17
docs/snippets/go/config/quality_processing_config.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
config := &kreuzberg.ExtractionConfig{
|
||||
EnableQualityProcessing: true, // Default
|
||||
}
|
||||
|
||||
fmt.Printf("Quality processing enabled: %v\n", config.EnableQualityProcessing)
|
||||
}
|
||||
```
|
||||
37
docs/snippets/go/config/tesseract_config.md
Normal file
37
docs/snippets/go/config/tesseract_config.md
Normal file
@@ -0,0 +1,37 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
psm := int32(6)
|
||||
oem := int32(1)
|
||||
enableTableDetection := true
|
||||
whitelist := "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789 .,!?"
|
||||
|
||||
config := kreuzberg.ExtractionConfig{
|
||||
Ocr: &kreuzberg.OcrConfig{
|
||||
Backend: "tesseract",
|
||||
Language: "eng+fra+deu",
|
||||
TesseractConfig: &kreuzberg.TesseractConfig{
|
||||
Psm: &psm,
|
||||
Oem: &oem,
|
||||
MinConfidence: 0.8,
|
||||
EnableTableDetection: &enableTableDetection,
|
||||
TesseditCharWhitelist: whitelist,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", nil, config)
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
|
||||
log.Println("content length:", len(result.Content))
|
||||
}
|
||||
```
|
||||
23
docs/snippets/go/config/token_reduction_config.md
Normal file
23
docs/snippets/go/config/token_reduction_config.md
Normal file
@@ -0,0 +1,23 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
preserveImportant := true
|
||||
config := kreuzberg.ExtractionConfig{
|
||||
TokenReduction: &kreuzberg.TokenReductionOptions{
|
||||
Mode: "moderate",
|
||||
PreserveImportantWords: &preserveImportant,
|
||||
},
|
||||
}
|
||||
|
||||
fmt.Printf("Mode: %s, Preserve Important Words: %v\n",
|
||||
config.TokenReduction.Mode,
|
||||
*config.TokenReduction.PreserveImportantWords)
|
||||
}
|
||||
```
|
||||
118
docs/snippets/go/docker/usage.go
Normal file
118
docs/snippets/go/docker/usage.go
Normal file
@@ -0,0 +1,118 @@
|
||||
```go title="usage.go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"mime/multipart"
|
||||
"net/http"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"time"
|
||||
)
|
||||
|
||||
// DockerKreuzbergClient manages a Kreuzberg API container through the docker
// CLI and talks to the HTTP API that the container publishes on localhost.
type DockerKreuzbergClient struct {
	containerName  string // name used for `docker run --name`, `docker stop` and `docker rm`
	containerImage string // image reference handed to `docker run`
	apiPort        int    // host port mapped to the container's port 8000
}

// NewDockerKreuzbergClient returns a client for the given container name,
// image and host port. It does not start anything; call StartContainer.
func NewDockerKreuzbergClient(containerName, image string, port int) *DockerKreuzbergClient {
	return &DockerKreuzbergClient{
		containerName:  containerName,
		containerImage: image,
		apiPort:        port,
	}
}

// StartContainer launches the image detached via `docker run -d`, publishing
// the container's port 8000 on the client's host port. On failure the
// returned error wraps the exec error and includes docker's output.
func (c *DockerKreuzbergClient) StartContainer() error {
	fmt.Println("Starting Kreuzberg Docker container...")
	cmd := exec.Command("docker", "run", "-d",
		"--name", c.containerName,
		"-p", fmt.Sprintf("%d:8000", c.apiPort),
		c.containerImage)

	// Capture docker's output so its diagnostics (name conflict, missing
	// image, ...) end up in the error instead of being discarded.
	if out, err := cmd.CombinedOutput(); err != nil {
		return fmt.Errorf("failed to start container: %w: %s", err, bytes.TrimSpace(out))
	}

	fmt.Printf("Container started on http://localhost:%d\n", c.apiPort)
	return nil
}

// ExtractFile uploads the file at filePath as the multipart form field "file"
// to the API's /api/extract endpoint and returns the "content" string of the
// JSON response. It returns an error when the file cannot be read, the
// request fails, the server answers with a non-2xx status, or the response
// body is not a JSON object with a string "content" field.
func (c *DockerKreuzbergClient) ExtractFile(filePath string) (string, error) {
	file, err := os.Open(filePath)
	if err != nil {
		return "", err
	}
	defer file.Close()

	var buf bytes.Buffer
	writer := multipart.NewWriter(&buf)

	part, err := writer.CreateFormFile("file", filepath.Base(filePath))
	if err != nil {
		return "", err
	}

	// Stream the file straight into the form part.
	if _, err := io.Copy(part, file); err != nil {
		return "", err
	}

	// Close writes the trailing multipart boundary; it must happen before sending.
	if err := writer.Close(); err != nil {
		return "", err
	}

	resp, err := http.Post(
		fmt.Sprintf("http://localhost:%d/api/extract", c.apiPort),
		writer.FormDataContentType(),
		&buf,
	)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	// An error response must not be mistaken for an extraction result.
	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		// Best effort: the body is only used to enrich the error message, so
		// a read failure here is deliberately ignored.
		detail, _ := io.ReadAll(io.LimitReader(resp.Body, 4096))
		return "", fmt.Errorf("extract request failed: %s: %s", resp.Status, bytes.TrimSpace(detail))
	}

	// Decode only the field we need; a map[string]string would reject any
	// response that also carries non-string fields.
	var payload struct {
		Content string `json:"content"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&payload); err != nil {
		return "", err
	}

	return payload.Content, nil
}

// StopContainer stops the container and then removes it via the docker CLI.
// It returns the first failure, wrapped with the step that failed.
func (c *DockerKreuzbergClient) StopContainer() error {
	fmt.Println("Stopping Kreuzberg Docker container...")
	if err := exec.Command("docker", "stop", c.containerName).Run(); err != nil {
		return fmt.Errorf("stopping container %q: %w", c.containerName, err)
	}
	if err := exec.Command("docker", "rm", c.containerName).Run(); err != nil {
		return fmt.Errorf("removing container %q: %w", c.containerName, err)
	}
	fmt.Println("Container stopped and removed")
	return nil
}
|
||||
|
||||
func main() {
|
||||
client := NewDockerKreuzbergClient("kreuzberg-api", "kreuzberg:latest", 8000)
|
||||
|
||||
if err := client.StartContainer(); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
time.Sleep(2 * time.Second)
|
||||
|
||||
content, err := client.ExtractFile("document.pdf")
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
fmt.Printf("Extracted content:\n%s\n", content)
|
||||
|
||||
if err := client.StopContainer(); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
```
|
||||
29
docs/snippets/go/getting-started/basic_usage.md
Normal file
29
docs/snippets/go/getting-started/basic_usage.md
Normal file
@@ -0,0 +1,29 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", nil)
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
|
||||
fmt.Println("Content:")
|
||||
fmt.Println(result.Content)
|
||||
|
||||
fmt.Println("\nMetadata:")
|
||||
if result.Metadata != nil {
|
||||
fmt.Printf("Title: %v\n", result.Metadata["title"])
|
||||
fmt.Printf("Author: %v\n", result.Metadata["author"])
|
||||
}
|
||||
|
||||
fmt.Printf("\nTables found: %d\n", len(result.Tables))
|
||||
fmt.Printf("Images found: %d\n", len(result.Images))
|
||||
}
|
||||
```
|
||||
31
docs/snippets/go/getting-started/extract_file.md
Normal file
31
docs/snippets/go/getting-started/extract_file.md
Normal file
@@ -0,0 +1,31 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
useCache := true
|
||||
enableQP := true
|
||||
|
||||
config := &kreuzberg.ExtractionConfig{
|
||||
UseCache: &useCache,
|
||||
EnableQualityProcessing: &enableQP,
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("contract.pdf", config)
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
|
||||
fmt.Printf("Extracted %d characters\n", len(result.Content))
|
||||
if result.QualityScore != nil {
|
||||
fmt.Printf("Quality score: %.2f\n", *result.QualityScore)
|
||||
}
|
||||
fmt.Printf("Processing time: %v\n", result.ProcessingTime)
|
||||
}
|
||||
```
|
||||
30
docs/snippets/go/getting-started/extract_with_ocr.md
Normal file
30
docs/snippets/go/getting-started/extract_with_ocr.md
Normal file
@@ -0,0 +1,30 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
ocrConfig := &kreuzberg.OcrConfig{
|
||||
Backend: "tesseract",
|
||||
Language: "eng",
|
||||
}
|
||||
|
||||
config := kreuzberg.ExtractionConfig{
|
||||
Ocr: ocrConfig,
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("scanned.pdf", nil, config)
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
|
||||
fmt.Println("Extracted text from scanned document:")
|
||||
fmt.Println(result.Content)
|
||||
fmt.Println("Used OCR backend: tesseract")
|
||||
}
|
||||
```
|
||||
24
docs/snippets/go/getting-started/hello_world.md
Normal file
24
docs/snippets/go/getting-started/hello_world.md
Normal file
@@ -0,0 +1,24 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", nil)
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
|
||||
fmt.Println("Extracted content:")
|
||||
if len(result.Content) > 200 {
|
||||
fmt.Println(result.Content[:200])
|
||||
} else {
|
||||
fmt.Println(result.Content)
|
||||
}
|
||||
}
|
||||
```
|
||||
22
docs/snippets/go/getting-started/install_verify.md
Normal file
22
docs/snippets/go/getting-started/install_verify.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
fmt.Println("Kreuzberg CGO bindings loaded successfully")
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("sample.pdf", nil)
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
|
||||
fmt.Println("Installation verified!")
|
||||
fmt.Printf("Extracted %d characters\n", len(result.Content))
|
||||
}
|
||||
```
|
||||
34
docs/snippets/go/getting-started/read_content.md
Normal file
34
docs/snippets/go/getting-started/read_content.md
Normal file
@@ -0,0 +1,34 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", nil)
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
|
||||
content := result.Content
|
||||
tables := result.Tables
|
||||
images := result.Images
|
||||
metadata := result.Metadata
|
||||
|
||||
fmt.Printf("Content: %d characters\n", len(content))
|
||||
fmt.Printf("Tables: %d\n", len(tables))
|
||||
fmt.Printf("Images: %d\n", len(images))
|
||||
|
||||
if metadata != nil {
|
||||
fmt.Print("Metadata keys: ")
|
||||
for key := range metadata {
|
||||
fmt.Print(key + " ")
|
||||
}
|
||||
fmt.Println()
|
||||
}
|
||||
}
|
||||
```
|
||||
47
docs/snippets/go/llm/structured_extraction.md
Normal file
47
docs/snippets/go/llm/structured_extraction.md
Normal file
@@ -0,0 +1,47 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
schema, err := json.Marshal(map[string]any{
|
||||
"type": "object",
|
||||
"properties": map[string]any{
|
||||
"title": map[string]string{"type": "string"},
|
||||
"authors": map[string]any{"type": "array", "items": map[string]string{"type": "string"}},
|
||||
"date": map[string]string{"type": "string"},
|
||||
},
|
||||
"required": []string{"title", "authors", "date"},
|
||||
"additionalProperties": false,
|
||||
})
|
||||
if err != nil {
|
||||
log.Fatalf("marshal schema: %v", err)
|
||||
}
|
||||
|
||||
config := kreuzberg.ExtractionConfig{
|
||||
StructuredExtraction: &kreuzberg.StructuredExtractionConfig{
|
||||
Schema: schema,
|
||||
SchemaName: "PaperMetadata",
|
||||
Strict: true,
|
||||
Llm: kreuzberg.LlmConfig{
|
||||
Model: "openai/gpt-4o-mini",
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFile("paper.pdf", nil, config)
|
||||
if err != nil {
|
||||
log.Fatalf("extract: %v", err)
|
||||
}
|
||||
|
||||
if result.StructuredOutput != nil {
|
||||
fmt.Println(string(*result.StructuredOutput))
|
||||
}
|
||||
}
|
||||
```
|
||||
63
docs/snippets/go/mcp/mcp_custom_client.md
Normal file
63
docs/snippets/go/mcp/mcp_custom_client.md
Normal file
@@ -0,0 +1,63 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"log"
|
||||
"os/exec"
|
||||
)
|
||||
|
||||
type (
	// MCPRequest is the JSON message written to the MCP server's stdin: a
	// method name plus that method's parameters.
	MCPRequest struct {
		Method string    `json:"method"`
		Params MCPParams `json:"params"`
	}

	// MCPParams names the tool to invoke and carries its arguments as a
	// free-form JSON object.
	MCPParams struct {
		Name      string                 `json:"name"`
		Arguments map[string]interface{} `json:"arguments"`
	}
)
|
||||
|
||||
func main() {
|
||||
cmd := exec.Command("kreuzberg", "mcp")
|
||||
stdin, err := cmd.StdinPipe()
|
||||
if err != nil {
|
||||
log.Fatalf("create stdin pipe: %v", err)
|
||||
}
|
||||
stdout, err := cmd.StdoutPipe()
|
||||
if err != nil {
|
||||
log.Fatalf("create stdout pipe: %v", err)
|
||||
}
|
||||
|
||||
if err := cmd.Start(); err != nil {
|
||||
log.Fatalf("start command: %v", err)
|
||||
}
|
||||
|
||||
request := MCPRequest{
|
||||
Method: "tools/call",
|
||||
Params: MCPParams{
|
||||
Name: "extract_file",
|
||||
Arguments: map[string]interface{}{
|
||||
"path": "document.pdf",
|
||||
"async": true,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
data, err := json.Marshal(request)
|
||||
if err != nil {
|
||||
log.Fatalf("marshal request: %v", err)
|
||||
}
|
||||
fmt.Fprintf(stdin, "%s\n", string(data))
|
||||
|
||||
scanner := bufio.NewScanner(stdout)
|
||||
if scanner.Scan() {
|
||||
fmt.Println(scanner.Text())
|
||||
}
|
||||
|
||||
if err := cmd.Wait(); err != nil {
|
||||
log.Fatalf("wait for command: %v", err)
|
||||
}
|
||||
}
|
||||
```
|
||||
19
docs/snippets/go/mcp/mcp_server_start.md
Normal file
19
docs/snippets/go/mcp/mcp_server_start.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
)
|
||||
|
||||
// main runs `kreuzberg mcp` in the foreground with this process's standard
// streams attached and exits non-zero when the server fails.
func main() {
	cmd := exec.Command("kreuzberg", "mcp")
	// With a nil Stdin the child reads from the null device, so the server
	// would never receive the requests sent to this process's stdin.
	cmd.Stdin = os.Stdin
	cmd.Stdout = os.Stdout
	cmd.Stderr = os.Stderr

	if err := cmd.Run(); err != nil {
		fmt.Fprintf(os.Stderr, "Failed to start MCP server: %v\n", err)
		// Propagate the failure to the caller instead of exiting with 0.
		os.Exit(1)
	}
}
|
||||
```
|
||||
26
docs/snippets/go/metadata/language_detection.md
Normal file
26
docs/snippets/go/metadata/language_detection.md
Normal file
@@ -0,0 +1,26 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
enabled := true
|
||||
minConfidence := 0.9
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", nil, kreuzberg.ExtractionConfig{
|
||||
LanguageDetection: &kreuzberg.LanguageDetectionConfig{
|
||||
Enabled: &enabled,
|
||||
MinConfidence: &minConfidence,
|
||||
DetectMultiple: true,
|
||||
},
|
||||
})
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
|
||||
log.Println("content length:", len(result.Content))
|
||||
}
|
||||
```
|
||||
29
docs/snippets/go/metadata/language_detection_multilingual.md
Normal file
29
docs/snippets/go/metadata/language_detection_multilingual.md
Normal file
@@ -0,0 +1,29 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
minConfidence := 0.8
|
||||
config := &kreuzberg.ExtractionConfig{
|
||||
LanguageDetection: &kreuzberg.LanguageDetectionConfig{
|
||||
Enabled: true,
|
||||
MinConfidence: &minConfidence,
|
||||
DetectMultiple: true,
|
||||
},
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("multilingual_document.pdf", config)
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
|
||||
fmt.Printf("Detected languages: %v\n", result.DetectedLanguages)
|
||||
// Output: [eng fra deu]
|
||||
}
|
||||
```
|
||||
115
docs/snippets/go/metadata/metadata.md
Normal file
115
docs/snippets/go/metadata/metadata.md
Normal file
@@ -0,0 +1,115 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
"strings"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", nil)
|
||||
if err != nil {
|
||||
log.Fatalf("extract pdf: %v", err)
|
||||
}
|
||||
|
||||
// Access PDF metadata
|
||||
if pdf, ok := result.Metadata.PdfMetadata(); ok {
|
||||
if pdf.PageCount != nil {
|
||||
fmt.Printf("Pages: %d\n", *pdf.PageCount)
|
||||
}
|
||||
if pdf.Author != nil {
|
||||
fmt.Printf("Author: %s\n", *pdf.Author)
|
||||
}
|
||||
if pdf.Title != nil {
|
||||
fmt.Printf("Title: %s\n", *pdf.Title)
|
||||
}
|
||||
}
|
||||
|
||||
// Access HTML metadata
|
||||
htmlResult, err := kreuzberg.ExtractFileSync("page.html", nil)
|
||||
if err != nil {
|
||||
log.Fatalf("extract html: %v", err)
|
||||
}
|
||||
if html, ok := htmlResult.Metadata.HTMLMetadata(); ok {
|
||||
if html.Title != nil {
|
||||
fmt.Printf("Title: %s\n", *html.Title)
|
||||
}
|
||||
if html.Description != nil {
|
||||
fmt.Printf("Description: %s\n", *html.Description)
|
||||
}
|
||||
|
||||
// Access keywords as array
|
||||
if len(html.Keywords) > 0 {
|
||||
fmt.Printf("Keywords: %s\n", strings.Join(html.Keywords, ", "))
|
||||
}
|
||||
|
||||
// Access canonical URL (renamed from canonical)
|
||||
if html.CanonicalURL != nil {
|
||||
fmt.Printf("Canonical URL: %s\n", *html.CanonicalURL)
|
||||
}
|
||||
|
||||
// Access Open Graph fields from map
|
||||
if len(html.OpenGraph) > 0 {
|
||||
if image, ok := html.OpenGraph["image"]; ok {
|
||||
fmt.Printf("Open Graph Image: %s\n", image)
|
||||
}
|
||||
if ogTitle, ok := html.OpenGraph["title"]; ok {
|
||||
fmt.Printf("Open Graph Title: %s\n", ogTitle)
|
||||
}
|
||||
if ogType, ok := html.OpenGraph["type"]; ok {
|
||||
fmt.Printf("Open Graph Type: %s\n", ogType)
|
||||
}
|
||||
}
|
||||
|
||||
// Access Twitter Card fields from map
|
||||
if len(html.TwitterCard) > 0 {
|
||||
if card, ok := html.TwitterCard["card"]; ok {
|
||||
fmt.Printf("Twitter Card Type: %s\n", card)
|
||||
}
|
||||
if creator, ok := html.TwitterCard["creator"]; ok {
|
||||
fmt.Printf("Twitter Creator: %s\n", creator)
|
||||
}
|
||||
}
|
||||
|
||||
// Access new fields
|
||||
if html.Language != nil {
|
||||
fmt.Printf("Language: %s\n", *html.Language)
|
||||
}
|
||||
|
||||
if html.TextDirection != nil {
|
||||
fmt.Printf("Text Direction: %s\n", *html.TextDirection)
|
||||
}
|
||||
|
||||
// Access headers
|
||||
if len(html.Headers) > 0 {
|
||||
headers := make([]string, len(html.Headers))
|
||||
for i, h := range html.Headers {
|
||||
headers[i] = h.Text
|
||||
}
|
||||
fmt.Printf("Headers: %s\n", strings.Join(headers, ", "))
|
||||
}
|
||||
|
||||
// Access links
|
||||
if len(html.Links) > 0 {
|
||||
for _, link := range html.Links {
|
||||
fmt.Printf("Link: %s (%s)\n", link.Href, link.Text)
|
||||
}
|
||||
}
|
||||
|
||||
// Access images
|
||||
if len(html.Images) > 0 {
|
||||
for _, image := range html.Images {
|
||||
fmt.Printf("Image: %s\n", image.Src)
|
||||
}
|
||||
}
|
||||
|
||||
// Access structured data
|
||||
if len(html.StructuredData) > 0 {
|
||||
fmt.Printf("Structured data items: %d\n", len(html.StructuredData))
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
37
docs/snippets/go/metadata/page_boundaries.md
Normal file
37
docs/snippets/go/metadata/page_boundaries.md
Normal file
@@ -0,0 +1,37 @@
|
||||
```go title="Go"
|
||||
package main

import (
	"fmt"
	"log"

	"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)

// main extracts document.pdf and, for at most the first three pages, prints
// the page number, its byte range within the content and a preview of up to
// 100 characters of the page text.
func main() {
	result, err := kreuzberg.ExtractFileSync("document.pdf", nil)
	if err != nil {
		log.Fatal(err)
	}

	// Page boundaries are optional; without them there is nothing to map.
	if result.Metadata.Pages == nil || result.Metadata.Pages.Boundaries == nil {
		return
	}

	for i, boundary := range result.Metadata.Pages.Boundaries {
		if i >= 3 {
			break
		}

		// Boundaries are byte offsets, so slice the content string directly;
		// no intermediate []byte copy is needed.
		pageText := result.Content[boundary.ByteStart:boundary.ByteEnd]

		// Cut the preview on a rune boundary. A fixed byte offset could split
		// a multi-byte UTF-8 character and print invalid text.
		preview := pageText
		seen := 0
		for offset := range preview {
			if seen == 100 {
				preview = preview[:offset]
				break
			}
			seen++
		}

		fmt.Printf("Page %d:\n", boundary.PageNumber)
		fmt.Printf(" Byte range: %d-%d\n", boundary.ByteStart, boundary.ByteEnd)
		fmt.Printf(" Preview: %s...\n", preview)
	}
}
|
||||
```
|
||||
29
docs/snippets/go/metadata/page_tracking_basic.md
Normal file
29
docs/snippets/go/metadata/page_tracking_basic.md
Normal file
@@ -0,0 +1,29 @@
|
||||
```go title="Go"
package main

import (
	"fmt"

	"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)

func main() {
	config := &kreuzberg.ExtractionConfig{
		Pages: &kreuzberg.PageConfig{
			ExtractPages: true,
		},
	}

	result, err := kreuzberg.ExtractFileSync("document.pdf", config)
	if err != nil {
		panic(err)
	}

	if result.Pages != nil {
		for _, page := range result.Pages {
			fmt.Printf("Page %d:\n", page.PageNumber)
			fmt.Printf(" Content: %d chars\n", len(page.Content))
			fmt.Printf(" Tables: %d\n", len(page.Tables))
			fmt.Printf(" Images: %d\n", len(page.Images))
		}
	}
}
```
|
||||
28
docs/snippets/go/metadata/tables.md
Normal file
28
docs/snippets/go/metadata/tables.md
Normal file
@@ -0,0 +1,28 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", nil)
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
|
||||
// Iterate over tables
|
||||
for _, table := range result.Tables {
|
||||
fmt.Printf("Table with %d rows\n", len(table.Cells))
|
||||
fmt.Println(table.Markdown) // Markdown representation
|
||||
|
||||
// Access cells
|
||||
for _, row := range table.Cells {
|
||||
fmt.Println(row)
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
39
docs/snippets/go/metadata/vector_database_integration.md
Normal file
39
docs/snippets/go/metadata/vector_database_integration.md
Normal file
@@ -0,0 +1,39 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
maxChars := 512
|
||||
maxOverlap := 50
|
||||
config := &kreuzberg.ExtractionConfig{
|
||||
Chunking: &kreuzberg.ChunkingConfig{
|
||||
MaxChars: &maxChars,
|
||||
MaxOverlap: &maxOverlap,
|
||||
Embedding: &kreuzberg.EmbeddingConfig{
|
||||
Model: "balanced",
|
||||
Normalize: true,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", config)
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
|
||||
if result.Chunks != nil {
|
||||
for i, chunk := range result.Chunks {
|
||||
if chunk.Embedding != nil {
|
||||
fmt.Printf("Chunk %d: %d dimensions\n", i, len(chunk.Embedding))
|
||||
// Store in vector database
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
25
docs/snippets/go/ocr/cloud_ocr_backend.md
Normal file
25
docs/snippets/go/ocr/cloud_ocr_backend.md
Normal file
@@ -0,0 +1,25 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
// The Go binding does not currently expose plugin OCR backend registration.
|
||||
// Use one of the built-in backends ("tesseract", "paddle-ocr", or VLM via "vlm").
|
||||
func main() {
|
||||
result, err := kreuzberg.ExtractFileSync("scanned.pdf", nil, kreuzberg.ExtractionConfig{
|
||||
Ocr: &kreuzberg.OcrConfig{
|
||||
Backend: "tesseract",
|
||||
Language: "eng",
|
||||
},
|
||||
})
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
|
||||
log.Println("content length:", len(result.Content))
|
||||
}
|
||||
```
|
||||
32
docs/snippets/go/ocr/image_extraction.md
Normal file
32
docs/snippets/go/ocr/image_extraction.md
Normal file
@@ -0,0 +1,32 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
extractImages := true
|
||||
injectPlaceholders := true
|
||||
autoAdjustDpi := true
|
||||
targetDpi := int32(200)
|
||||
maxDim := int32(2048)
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", nil, kreuzberg.ExtractionConfig{
|
||||
Images: &kreuzberg.ImageExtractionConfig{
|
||||
ExtractImages: &extractImages,
|
||||
TargetDpi: &targetDpi,
|
||||
MaxImageDimension: &maxDim,
|
||||
InjectPlaceholders: &injectPlaceholders, // set to false to extract images without markdown references
|
||||
AutoAdjustDpi: &autoAdjustDpi,
|
||||
},
|
||||
})
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
|
||||
log.Println("content length:", len(result.Content))
|
||||
}
|
||||
```
|
||||
36
docs/snippets/go/ocr/image_preprocessing.md
Normal file
36
docs/snippets/go/ocr/image_preprocessing.md
Normal file
@@ -0,0 +1,36 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
targetDpi := int32(300)
|
||||
deskew := true
|
||||
binarization := "otsu"
|
||||
|
||||
config := kreuzberg.ExtractionConfig{
|
||||
Ocr: &kreuzberg.OcrConfig{
|
||||
TesseractConfig: &kreuzberg.TesseractConfig{
|
||||
Preprocessing: &kreuzberg.ImagePreprocessingConfig{
|
||||
TargetDpi: &targetDpi,
|
||||
Denoise: true,
|
||||
Deskew: &deskew,
|
||||
ContrastEnhance: true,
|
||||
BinarizationMethod: &binarization,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", nil, config)
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
|
||||
log.Println("content length:", len(result.Content))
|
||||
}
|
||||
```
|
||||
1
docs/snippets/go/ocr/ocr_easyocr.md
Normal file
1
docs/snippets/go/ocr/ocr_easyocr.md
Normal file
@@ -0,0 +1 @@
|
||||
EasyOCR is only available in Python.
|
||||
34
docs/snippets/go/ocr/ocr_elements.md
Normal file
34
docs/snippets/go/ocr/ocr_elements.md
Normal file
@@ -0,0 +1,34 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
cfg := kreuzberg.ExtractionConfig{
|
||||
Ocr: &kreuzberg.OcrConfig{
|
||||
Backend: "paddle-ocr",
|
||||
Language: "en",
|
||||
},
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("scanned.pdf", nil, cfg)
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
|
||||
for _, element := range result.OcrElements {
|
||||
fmt.Printf("Text: %s\n", element.Text)
|
||||
fmt.Printf("Confidence: %.2f\n", element.Confidence.Recognition)
|
||||
fmt.Printf("Geometry: %+v\n", element.Geometry)
|
||||
if element.Rotation != nil {
|
||||
fmt.Printf("Rotation: %.1f°\n", element.Rotation.AngleDegrees)
|
||||
}
|
||||
fmt.Println()
|
||||
}
|
||||
}
|
||||
```
|
||||
24
docs/snippets/go/ocr/ocr_extraction.md
Normal file
24
docs/snippets/go/ocr/ocr_extraction.md
Normal file
@@ -0,0 +1,24 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
cfg := kreuzberg.ExtractionConfig{
|
||||
Ocr: &kreuzberg.OcrConfig{
|
||||
Backend: "tesseract",
|
||||
Language: "eng",
|
||||
},
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("scanned.pdf", nil, cfg)
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
log.Println(len(result.Content))
|
||||
}
|
||||
```
|
||||
24
docs/snippets/go/ocr/ocr_force_all_pages.md
Normal file
24
docs/snippets/go/ocr/ocr_force_all_pages.md
Normal file
@@ -0,0 +1,24 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", nil, kreuzberg.ExtractionConfig{
|
||||
Ocr: &kreuzberg.OcrConfig{
|
||||
Backend: "tesseract",
|
||||
},
|
||||
ForceOcr: true,
|
||||
})
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
|
||||
fmt.Println(result.Content)
|
||||
}
|
||||
```
|
||||
23
docs/snippets/go/ocr/ocr_multi_language.md
Normal file
23
docs/snippets/go/ocr/ocr_multi_language.md
Normal file
@@ -0,0 +1,23 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
result, err := kreuzberg.ExtractFileSync("multilingual.pdf", nil, kreuzberg.ExtractionConfig{
|
||||
Ocr: &kreuzberg.OcrConfig{
|
||||
Backend: "tesseract",
|
||||
Language: "eng+deu+fra",
|
||||
},
|
||||
})
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
|
||||
log.Println(result.Content)
|
||||
}
|
||||
```
|
||||
24
docs/snippets/go/ocr/ocr_paddleocr.md
Normal file
24
docs/snippets/go/ocr/ocr_paddleocr.md
Normal file
@@ -0,0 +1,24 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
cfg := kreuzberg.ExtractionConfig{
|
||||
Ocr: &kreuzberg.OcrConfig{
|
||||
Backend: "paddle-ocr",
|
||||
Language: "en",
|
||||
},
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("scanned.pdf", nil, cfg)
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
log.Println(len(result.Content))
|
||||
}
|
||||
```
|
||||
32
docs/snippets/go/plugins/clear_plugins.md
Normal file
32
docs/snippets/go/plugins/clear_plugins.md
Normal file
@@ -0,0 +1,32 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
// Clear all plugins of a specific type
|
||||
if err := kreuzberg.ClearPostProcessors(); err != nil {
|
||||
log.Fatalf("clear post processors: %v", err)
|
||||
}
|
||||
log.Println("Post processors cleared")
|
||||
|
||||
if err := kreuzberg.ClearValidators(); err != nil {
|
||||
log.Fatalf("clear validators: %v", err)
|
||||
}
|
||||
log.Println("Validators cleared")
|
||||
|
||||
if err := kreuzberg.ClearOCRBackends(); err != nil {
|
||||
log.Fatalf("clear OCR backends: %v", err)
|
||||
}
|
||||
log.Println("OCR backends cleared")
|
||||
|
||||
if err := kreuzberg.ClearDocumentExtractors(); err != nil {
|
||||
log.Fatalf("clear document extractors: %v", err)
|
||||
}
|
||||
log.Println("Document extractors cleared")
|
||||
}
|
||||
```
|
||||
64
docs/snippets/go/plugins/embedding_backend.md
Normal file
64
docs/snippets/go/plugins/embedding_backend.md
Normal file
@@ -0,0 +1,64 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
// MyEmbedder adapts an embedder that is already loaded in the host program so
// that kreuzberg can call back into it, both during chunking and for
// standalone embed requests. It implements kreuzberg.EmbeddingBackend.
type MyEmbedder struct{}

// Name returns the identifier the backend is registered under.
func (e *MyEmbedder) Name() string {
	return "my-embedder"
}

// Version returns the backend's version string.
func (e *MyEmbedder) Version() string {
	return "1.0.0"
}

// Initialize is an optional warm-up hook; it runs once at registration,
// before Dimensions() is cached.
func (e *MyEmbedder) Initialize() error {
	return nil
}

// Shutdown releases backend resources; this example holds none.
func (e *MyEmbedder) Shutdown() error {
	return nil
}

// Dimensions reports the length of every vector Embed produces. The value is
// captured once at registration and used by the dispatcher for shape
// validation.
func (e *MyEmbedder) Dimensions() uint {
	return 768
}

// Embed returns one vector per input text. A real backend would delegate to
// the already-loaded host model; this example returns zero vectors of the
// advertised dimension.
func (e *MyEmbedder) Embed(texts []string) ([][]float32, error) {
	vectors := make([][]float32, 0, len(texts))
	for range texts {
		vectors = append(vectors, make([]float32, e.Dimensions()))
	}
	return vectors, nil
}
|
||||
|
||||
func main() {
|
||||
// Register once at startup.
|
||||
if err := kreuzberg.RegisterEmbeddingBackend(&MyEmbedder{}); err != nil {
|
||||
log.Fatalf("failed to register embedding backend: %v", err)
|
||||
}
|
||||
defer func() {
|
||||
if err := kreuzberg.UnregisterEmbeddingBackend("my-embedder"); err != nil {
|
||||
log.Printf("warning: failed to unregister embedding backend: %v", err)
|
||||
}
|
||||
}()
|
||||
|
||||
maxDuration := uint64(30)
|
||||
embedderName := "my-embedder"
|
||||
config := kreuzberg.EmbeddingConfig{
|
||||
Model: kreuzberg.EmbeddingModelType{
|
||||
Variant: "plugin",
|
||||
Type: "plugin",
|
||||
Name: &embedderName,
|
||||
},
|
||||
// Optional: bound the wait on a hung backend (default 60s; nil disables).
|
||||
MaxEmbedDurationSecs: &maxDuration,
|
||||
}
|
||||
|
||||
vectors, err := kreuzberg.EmbedTexts([]string{"Hello, world!", "Second text"}, config)
|
||||
if err != nil {
|
||||
log.Fatalf("embed failed: %v", err)
|
||||
}
|
||||
log.Printf("Generated %d vectors", len(vectors))
|
||||
}
|
||||
```
|
||||
22
docs/snippets/go/plugins/extractor_registration.md
Normal file
22
docs/snippets/go/plugins/extractor_registration.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
// Register custom extractor with priority 50
|
||||
if err := kreuzberg.RegisterDocumentExtractor("custom-json-extractor", 50); err != nil {
|
||||
log.Fatalf("register extractor failed: %v", err)
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("document.json", nil)
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
log.Printf("Extracted content length: %d", len(result.Content))
|
||||
}
|
||||
```
|
||||
52
docs/snippets/go/plugins/list_plugins.md
Normal file
52
docs/snippets/go/plugins/list_plugins.md
Normal file
@@ -0,0 +1,52 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
// List all registered document extractors
|
||||
extractors, err := kreuzberg.ListDocumentExtractors()
|
||||
if err != nil {
|
||||
log.Fatalf("list document extractors: %v", err)
|
||||
}
|
||||
fmt.Println("Document Extractors:")
|
||||
for _, extractor := range extractors {
|
||||
fmt.Printf(" - %s\n", extractor)
|
||||
}
|
||||
|
||||
// List all registered post-processors
|
||||
processors, err := kreuzberg.ListPostProcessors()
|
||||
if err != nil {
|
||||
log.Fatalf("list post processors: %v", err)
|
||||
}
|
||||
fmt.Println("\nPost-Processors:")
|
||||
for _, processor := range processors {
|
||||
fmt.Printf(" - %s\n", processor)
|
||||
}
|
||||
|
||||
// List all registered OCR backends
|
||||
backends, err := kreuzberg.ListOCRBackends()
|
||||
if err != nil {
|
||||
log.Fatalf("list OCR backends: %v", err)
|
||||
}
|
||||
fmt.Println("\nOCR Backends:")
|
||||
for _, backend := range backends {
|
||||
fmt.Printf(" - %s\n", backend)
|
||||
}
|
||||
|
||||
// List all registered validators
|
||||
validators, err := kreuzberg.ListValidators()
|
||||
if err != nil {
|
||||
log.Fatalf("list validators: %v", err)
|
||||
}
|
||||
fmt.Println("\nValidators:")
|
||||
for _, validator := range validators {
|
||||
fmt.Printf(" - %s\n", validator)
|
||||
}
|
||||
}
|
||||
```
|
||||
72
docs/snippets/go/plugins/min_length_validator.md
Normal file
72
docs/snippets/go/plugins/min_length_validator.md
Normal file
@@ -0,0 +1,72 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"log"
|
||||
"unsafe"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
/*
|
||||
#cgo CFLAGS: -I${SRCDIR}/../../../crates/kreuzberg-ffi
|
||||
#cgo LDFLAGS: -L${SRCDIR}/../../../target/release -L${SRCDIR}/../../../target/debug -lkreuzberg_ffi
|
||||
#include "../../../crates/kreuzberg-ffi/kreuzberg.h"
|
||||
#include <stdlib.h>
|
||||
*/
|
||||
import "C"
|
||||
|
||||
// minLengthConfig holds the configuration for the minimum length validator.
// It is package-level state because the exported C callback below cannot
// receive extra Go arguments; minLength is the threshold compared against
// len(content), i.e. a length in bytes.
var minLengthConfig = struct {
	minLength int
}{
	minLength: 100,
}
|
||||
|
||||
// minLengthValidator validates that extracted content meets minimum length requirement
|
||||
//export minLengthValidator
|
||||
func minLengthValidator(resultJSON *C.char) *C.char {
|
||||
jsonStr := C.GoString(resultJSON)
|
||||
var result map[string]interface{}
|
||||
|
||||
if err := json.Unmarshal([]byte(jsonStr), &result); err != nil {
|
||||
return C.CString("Failed to parse result JSON")
|
||||
}
|
||||
|
||||
content, ok := result["content"].(string)
|
||||
if !ok {
|
||||
return C.CString("Missing content field in result")
|
||||
}
|
||||
|
||||
if len(content) < minLengthConfig.minLength {
|
||||
errMsg := fmt.Sprintf("Content too short: %d < %d", len(content), minLengthConfig.minLength)
|
||||
return C.CString(errMsg)
|
||||
}
|
||||
|
||||
// Validation passed
|
||||
return nil
|
||||
}
|
||||
|
||||
func main() {
|
||||
// Register the validator with priority 100 (runs early)
|
||||
if err := kreuzberg.RegisterValidator("min_length_validator", 100,
|
||||
(C.ValidatorCallback)(C.minLengthValidator)); err != nil {
|
||||
log.Fatalf("failed to register validator: %v", err)
|
||||
}
|
||||
defer func() {
|
||||
if err := kreuzberg.UnregisterValidator("min_length_validator"); err != nil {
|
||||
log.Printf("warning: failed to unregister validator: %v", err)
|
||||
}
|
||||
}()
|
||||
|
||||
// Extract and validate
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", nil)
|
||||
if err != nil {
|
||||
log.Fatalf("extraction failed: %v", err)
|
||||
}
|
||||
|
||||
log.Printf("Validation passed. Content length: %d", len(result.Content))
|
||||
}
|
||||
```
|
||||
114
docs/snippets/go/plugins/pdf_metadata_extractor.md
Normal file
114
docs/snippets/go/plugins/pdf_metadata_extractor.md
Normal file
@@ -0,0 +1,114 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"log"
|
||||
"sync/atomic"
|
||||
"unsafe"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
/*
|
||||
#cgo CFLAGS: -I${SRCDIR}/../../../crates/kreuzberg-ffi
|
||||
#cgo LDFLAGS: -L${SRCDIR}/../../../target/release -L${SRCDIR}/../../../target/debug -lkreuzberg_ffi
|
||||
#include "../../../crates/kreuzberg-ffi/kreuzberg.h"
|
||||
#include <stdlib.h>
|
||||
*/
|
||||
import "C"
|
||||
|
||||
// pdfMetadataState holds statistics about PDF processing. processedCount
// starts at zero and counts the PDF results handled so far.
var pdfMetadataState struct {
	processedCount int64
}
|
||||
|
||||
// pdfMetadataExtractor enriches PDF extraction results with additional metadata
|
||||
//export pdfMetadataExtractor
|
||||
func pdfMetadataExtractor(resultJSON *C.char) *C.char {
|
||||
jsonStr := C.GoString(resultJSON)
|
||||
var result map[string]interface{}
|
||||
|
||||
if err := json.Unmarshal([]byte(jsonStr), &result); err != nil {
|
||||
return C.CString("{\"error\":\"Failed to parse result JSON\"}")
|
||||
}
|
||||
|
||||
// Only process PDFs
|
||||
mimeType, ok := result["mime_type"].(string)
|
||||
if !ok || mimeType != "application/pdf" {
|
||||
// Return unchanged for non-PDF documents
|
||||
outputJSON, err := json.Marshal(result)
|
||||
if err != nil {
|
||||
return C.CString("{\"error\":\"Failed to serialize result\"}")
|
||||
}
|
||||
return C.CString(string(outputJSON))
|
||||
}
|
||||
|
||||
// Process PDF-specific metadata
|
||||
metadata, ok := result["metadata"].(map[string]interface{})
|
||||
if !ok {
|
||||
metadata = make(map[string]interface{})
|
||||
}
|
||||
|
||||
// Mark as processed by this processor
|
||||
metadata["pdf_processed"] = true
|
||||
|
||||
// Add content statistics
|
||||
content, ok := result["content"].(string)
|
||||
if ok {
|
||||
metadata["content_length"] = len(content)
|
||||
}
|
||||
|
||||
// Increment processed count atomically
|
||||
atomic.AddInt64(&pdfMetadataState.processedCount, 1)
|
||||
metadata["pdf_processor_version"] = "1.0.0"
|
||||
|
||||
result["metadata"] = metadata
|
||||
|
||||
// Serialize back to JSON
|
||||
outputJSON, err := json.Marshal(result)
|
||||
if err != nil {
|
||||
return C.CString("{\"error\":\"Failed to serialize result\"}")
|
||||
}
|
||||
|
||||
return C.CString(string(outputJSON))
|
||||
}
|
||||
|
||||
func main() {
|
||||
// Register the post-processor with priority 80, early stage
|
||||
if err := kreuzberg.RegisterPostProcessor("pdf_metadata_extractor", 80,
|
||||
(C.PostProcessorCallback)(C.pdfMetadataExtractor)); err != nil {
|
||||
log.Fatalf("failed to register post-processor: %v", err)
|
||||
}
|
||||
defer func() {
|
||||
if err := kreuzberg.UnregisterPostProcessor("pdf_metadata_extractor"); err != nil {
|
||||
log.Printf("warning: failed to unregister post-processor: %v", err)
|
||||
}
|
||||
|
||||
log.Printf("Total PDFs processed: %d", atomic.LoadInt64(&pdfMetadataState.processedCount))
|
||||
}()
|
||||
|
||||
// Extract PDF document
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", nil)
|
||||
if err != nil {
|
||||
log.Fatalf("extraction failed: %v", err)
|
||||
}
|
||||
|
||||
log.Printf("PDF MIME type: %s", result.MimeType)
|
||||
|
||||
// Parse and display metadata
|
||||
var metadata map[string]interface{}
|
||||
if metaJSON, ok := result.MetadataJSON.(string); ok {
|
||||
if err := json.Unmarshal([]byte(metaJSON), &metadata); err == nil {
|
||||
if pdfProcessed, ok := metadata["pdf_processed"].(bool); ok && pdfProcessed {
|
||||
log.Printf("PDF metadata extracted successfully")
|
||||
if contentLen, ok := metadata["content_length"].(float64); ok {
|
||||
log.Printf("Content length: %.0f bytes", contentLen)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
116
docs/snippets/go/plugins/pdf_only_processor.md
Normal file
116
docs/snippets/go/plugins/pdf_only_processor.md
Normal file
@@ -0,0 +1,116 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"log"
|
||||
"unsafe"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
/*
|
||||
#cgo CFLAGS: -I${SRCDIR}/../../../crates/kreuzberg-ffi
|
||||
#cgo LDFLAGS: -L${SRCDIR}/../../../target/release -L${SRCDIR}/../../../target/debug -lkreuzberg_ffi
|
||||
#include "../../../crates/kreuzberg-ffi/kreuzberg.h"
|
||||
#include <stdlib.h>
|
||||
*/
|
||||
import "C"
|
||||
|
||||
// pdfOnlyProcessor applies PDF-specific processing logic only to PDF documents
|
||||
//export pdfOnlyProcessor
|
||||
func pdfOnlyProcessor(resultJSON *C.char) *C.char {
|
||||
jsonStr := C.GoString(resultJSON)
|
||||
var result map[string]interface{}
|
||||
|
||||
if err := json.Unmarshal([]byte(jsonStr), &result); err != nil {
|
||||
return C.CString("{\"error\":\"Failed to parse result JSON\"}")
|
||||
}
|
||||
|
||||
// Check MIME type - only process PDFs
|
||||
mimeType, ok := result["mime_type"].(string)
|
||||
if !ok || mimeType != "application/pdf" {
|
||||
// Return unchanged for non-PDF documents
|
||||
outputJSON, err := json.Marshal(result)
|
||||
if err != nil {
|
||||
return C.CString("{\"error\":\"Failed to serialize result\"}")
|
||||
}
|
||||
return C.CString(string(outputJSON))
|
||||
}
|
||||
|
||||
// Perform PDF-specific processing
|
||||
metadata, ok := result["metadata"].(map[string]interface{})
|
||||
if !ok {
|
||||
metadata = make(map[string]interface{})
|
||||
}
|
||||
|
||||
// Example PDF-specific processing:
|
||||
// - Extract tables as structured data
|
||||
// - Handle PDF-specific formatting
|
||||
// - Preserve document hierarchy
|
||||
|
||||
metadata["pdf_specific_processing"] = true
|
||||
metadata["processor_type"] = "pdf_only"
|
||||
|
||||
// Check for tables in PDF
|
||||
if tablesJSON, ok := result["tables_json"].(string); ok && tablesJSON != "" {
|
||||
var tables []interface{}
|
||||
if err := json.Unmarshal([]byte(tablesJSON), &tables); err == nil {
|
||||
metadata["table_count"] = len(tables)
|
||||
}
|
||||
}
|
||||
|
||||
result["metadata"] = metadata
|
||||
|
||||
// Serialize back to JSON
|
||||
outputJSON, err := json.Marshal(result)
|
||||
if err != nil {
|
||||
return C.CString("{\"error\":\"Failed to serialize result\"}")
|
||||
}
|
||||
|
||||
return C.CString(string(outputJSON))
|
||||
}
|
||||
|
||||
func main() {
|
||||
// Register the post-processor with priority 70
|
||||
if err := kreuzberg.RegisterPostProcessor("pdf_only_processor", 70,
|
||||
(C.PostProcessorCallback)(C.pdfOnlyProcessor)); err != nil {
|
||||
log.Fatalf("failed to register post-processor: %v", err)
|
||||
}
|
||||
defer func() {
|
||||
if err := kreuzberg.UnregisterPostProcessor("pdf_only_processor"); err != nil {
|
||||
log.Printf("warning: failed to unregister post-processor: %v", err)
|
||||
}
|
||||
}()
|
||||
|
||||
// Process multiple documents - processor will only affect PDFs
|
||||
files := []string{
|
||||
"document.pdf",
|
||||
"image.jpg",
|
||||
"spreadsheet.xlsx",
|
||||
}
|
||||
|
||||
for _, file := range files {
|
||||
result, err := kreuzberg.ExtractFileSync(file, nil)
|
||||
if err != nil {
|
||||
log.Printf("Warning: extraction failed for %s: %v", file, err)
|
||||
continue
|
||||
}
|
||||
|
||||
// Parse metadata to check if PDF processing occurred
|
||||
var metadata map[string]interface{}
|
||||
if metaJSON, ok := result.MetadataJSON.(string); ok {
|
||||
if err := json.Unmarshal([]byte(metaJSON), &metadata); err == nil {
|
||||
if pdfProcessing, ok := metadata["pdf_specific_processing"].(bool); ok && pdfProcessing {
|
||||
log.Printf("PDF-specific processing applied to: %s", file)
|
||||
if tableCount, ok := metadata["table_count"].(float64); ok {
|
||||
log.Printf(" Tables found: %.0f", tableCount)
|
||||
}
|
||||
} else {
|
||||
log.Printf("Skipped PDF processor for: %s (MIME: %s)", file, result.MimeType)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
13
docs/snippets/go/plugins/plugin_extractor.md
Normal file
13
docs/snippets/go/plugins/plugin_extractor.md
Normal file
@@ -0,0 +1,13 @@
|
||||
<!-- snippet:skip reason="Go bindings do not support custom document extractor plugins" -->
|
||||
```markdown title="Markdown"
|
||||
!!! note "Not Supported"
|
||||
The Go binding is a thin CGO wrapper and does not currently support
|
||||
custom document extractors. Custom plugins must be implemented in Rust.
|
||||
|
||||
See the [Rust plugin documentation](../../rust/plugins/plugin_extractor.md) for details on creating custom document extractors.
|
||||
|
||||
Go currently supports:
|
||||
- **PostProcessor** - Transform extraction results
|
||||
- **Validator** - Validate extraction results
|
||||
- **OcrBackend** - Custom OCR implementations
|
||||
```
|
||||
92
docs/snippets/go/plugins/plugin_logging.md
Normal file
92
docs/snippets/go/plugins/plugin_logging.md
Normal file
@@ -0,0 +1,92 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"C"
|
||||
"encoding/json"
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
//export loggingPostProcessor
|
||||
func loggingPostProcessor(resultJSON *C.char) *C.char {
|
||||
log.Println("[PostProcessor] Processing extraction result")
|
||||
|
||||
jsonStr := C.GoString(resultJSON)
|
||||
var result map[string]interface{}
|
||||
if err := json.Unmarshal([]byte(jsonStr), &result); err != nil {
|
||||
log.Printf("[PostProcessor] Error parsing result: %v", err)
|
||||
return nil
|
||||
}
|
||||
|
||||
if content, ok := result["content"].(string); ok {
|
||||
log.Printf("[PostProcessor] Content length: %d bytes", len(content))
|
||||
if len(content) == 0 {
|
||||
log.Println("[PostProcessor] Warning: Empty content extracted")
|
||||
}
|
||||
}
|
||||
|
||||
if mimeType, ok := result["mime_type"].(string); ok {
|
||||
log.Printf("[PostProcessor] Processing %s", mimeType)
|
||||
}
|
||||
|
||||
// Return NULL to indicate success (no modification)
|
||||
return nil
|
||||
}
|
||||
|
||||
//export loggingValidator
|
||||
func loggingValidator(resultJSON *C.char) *C.char {
|
||||
log.Println("[Validator] Validating extraction result")
|
||||
|
||||
jsonStr := C.GoString(resultJSON)
|
||||
var result map[string]interface{}
|
||||
if err := json.Unmarshal([]byte(jsonStr), &result); err != nil {
|
||||
log.Printf("[Validator] Error parsing result: %v", err)
|
||||
errMsg := "Failed to parse validation input"
|
||||
return C.CString(errMsg)
|
||||
}
|
||||
|
||||
if content, ok := result["content"].(string); ok {
|
||||
log.Printf("[Validator] Content length: %d bytes", len(content))
|
||||
if len(content) < 50 {
|
||||
log.Println("[Validator] Error: Content below minimum threshold")
|
||||
errMsg := "Content too short (minimum 50 characters)"
|
||||
return C.CString(errMsg)
|
||||
}
|
||||
}
|
||||
|
||||
// Return NULL to indicate validation passed
|
||||
return nil
|
||||
}
|
||||
|
||||
func main() {
|
||||
// Register post processor with logging
|
||||
if err := kreuzberg.RegisterPostProcessor(
|
||||
"logging-processor",
|
||||
100, // priority
|
||||
(C.PostProcessorCallback)(C.loggingPostProcessor),
|
||||
); err != nil {
|
||||
log.Fatalf("register post processor failed: %v", err)
|
||||
}
|
||||
log.Println("[Main] PostProcessor registered with logging enabled")
|
||||
|
||||
// Register validator with logging
|
||||
if err := kreuzberg.RegisterValidator(
|
||||
"logging-validator",
|
||||
50, // priority
|
||||
(C.ValidatorCallback)(C.loggingValidator),
|
||||
); err != nil {
|
||||
log.Fatalf("register validator failed: %v", err)
|
||||
}
|
||||
log.Println("[Main] Validator registered with logging enabled")
|
||||
|
||||
// Extract with logging
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", nil)
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
|
||||
log.Printf("[Main] Extraction complete: %d bytes content", len(result.Content))
|
||||
}
|
||||
```
|
||||
213
docs/snippets/go/plugins/plugin_testing.md
Normal file
213
docs/snippets/go/plugins/plugin_testing.md
Normal file
@@ -0,0 +1,213 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"C"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"testing"
|
||||
"unsafe"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
// TestPostProcessor tests custom post processor behavior
|
||||
func TestPostProcessor(t *testing.T) {
|
||||
// Create a post processor that adds metadata
|
||||
metricsMap := make(map[string]int64)
|
||||
|
||||
//export testPostProcessor
|
||||
testPostProcessor := func(resultJSON *C.char) *C.char {
|
||||
jsonStr := C.GoString(resultJSON)
|
||||
var result map[string]interface{}
|
||||
if err := json.Unmarshal([]byte(jsonStr), &result); err != nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
if content, ok := result["content"].(string); ok {
|
||||
metricsMap["content_length"] = int64(len(content))
|
||||
metricsMap["processed"] = 1
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Register the processor
|
||||
err := kreuzberg.RegisterPostProcessor(
|
||||
"test-processor",
|
||||
10,
|
||||
(C.PostProcessorCallback)(C.testPostProcessor),
|
||||
)
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to register post processor: %v", err)
|
||||
}
|
||||
|
||||
// Simulate a mock result
|
||||
mockResult := map[string]interface{}{
|
||||
"content": "Test extraction content",
|
||||
"mime_type": "text/plain",
|
||||
"metadata": map[string]interface{}{},
|
||||
"tables": []interface{}{},
|
||||
"detected_languages": []interface{}{},
|
||||
}
|
||||
|
||||
resultJSON, err := json.Marshal(mockResult)
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to marshal mock result: %v", err)
|
||||
}
|
||||
cResultJSON := C.CString(string(resultJSON))
|
||||
defer C.free(unsafe.Pointer(cResultJSON))
|
||||
|
||||
// Call the processor
|
||||
testPostProcessor(cResultJSON)
|
||||
|
||||
// Verify metrics were recorded
|
||||
if metricsMap["content_length"] != 22 {
|
||||
t.Errorf("Expected content_length 22, got %d", metricsMap["content_length"])
|
||||
}
|
||||
if metricsMap["processed"] != 1 {
|
||||
t.Errorf("Expected processed flag to be 1")
|
||||
}
|
||||
|
||||
// Cleanup
|
||||
_ = kreuzberg.UnregisterPostProcessor("test-processor")
|
||||
}
|
||||
|
||||
// TestValidator tests custom validator behavior
|
||||
func TestValidator(t *testing.T) {
|
||||
validatorCalled := false
|
||||
|
||||
//export testValidator
|
||||
testValidator := func(resultJSON *C.char) *C.char {
|
||||
validatorCalled = true
|
||||
jsonStr := C.GoString(resultJSON)
|
||||
var result map[string]interface{}
|
||||
if err := json.Unmarshal([]byte(jsonStr), &result); err != nil {
|
||||
return C.CString("Failed to parse validation input")
|
||||
}
|
||||
|
||||
if content, ok := result["content"].(string); ok {
|
||||
if len(content) < 10 {
|
||||
return C.CString("Content too short")
|
||||
}
|
||||
}
|
||||
|
||||
return nil // Success
|
||||
}
|
||||
|
||||
// Register the validator
|
||||
err := kreuzberg.RegisterValidator(
|
||||
"test-validator",
|
||||
50,
|
||||
(C.ValidatorCallback)(C.testValidator),
|
||||
)
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to register validator: %v", err)
|
||||
}
|
||||
|
||||
// Test 1: Valid content
|
||||
validContent := map[string]interface{}{
|
||||
"content": "This is valid content",
|
||||
"mime_type": "text/plain",
|
||||
"metadata": map[string]interface{}{},
|
||||
"tables": []interface{}{},
|
||||
"detected_languages": []interface{}{},
|
||||
}
|
||||
|
||||
validJSON, err := json.Marshal(validContent)
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to marshal valid content: %v", err)
|
||||
}
|
||||
cValidJSON := C.CString(string(validJSON))
|
||||
defer C.free(unsafe.Pointer(cValidJSON))
|
||||
|
||||
result := testValidator(cValidJSON)
|
||||
if result != nil {
|
||||
t.Errorf("Expected nil (success), got error: %s", C.GoString(result))
|
||||
}
|
||||
|
||||
if !validatorCalled {
|
||||
t.Errorf("Validator was not called")
|
||||
}
|
||||
|
||||
// Test 2: Invalid content (too short)
|
||||
invalidContent := map[string]interface{}{
|
||||
"content": "Short",
|
||||
"mime_type": "text/plain",
|
||||
"metadata": map[string]interface{}{},
|
||||
"tables": []interface{}{},
|
||||
"detected_languages": []interface{}{},
|
||||
}
|
||||
|
||||
invalidJSON, err := json.Marshal(invalidContent)
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to marshal invalid content: %v", err)
|
||||
}
|
||||
cInvalidJSON := C.CString(string(invalidJSON))
|
||||
defer C.free(unsafe.Pointer(cInvalidJSON))
|
||||
|
||||
result = testValidator(cInvalidJSON)
|
||||
if result == nil {
|
||||
t.Errorf("Expected error for short content, got nil")
|
||||
} else {
|
||||
errorMsg := C.GoString(result)
|
||||
if errorMsg != "Content too short" {
|
||||
t.Errorf("Expected 'Content too short', got: %s", errorMsg)
|
||||
}
|
||||
}
|
||||
|
||||
// Cleanup
|
||||
_ = kreuzberg.UnregisterValidator("test-validator")
|
||||
}
|
||||
|
||||
// TestValidatorIntegration tests validator with actual extraction
|
||||
func TestValidatorIntegration(t *testing.T) {
|
||||
//export integrationValidator
|
||||
integrationValidator := func(resultJSON *C.char) *C.char {
|
||||
jsonStr := C.GoString(resultJSON)
|
||||
var result map[string]interface{}
|
||||
if err := json.Unmarshal([]byte(jsonStr), &result); err != nil {
|
||||
return C.CString(fmt.Sprintf("Parse error: %v", err))
|
||||
}
|
||||
|
||||
// Validate that mime_type is set
|
||||
if _, ok := result["mime_type"]; !ok {
|
||||
return C.CString("Missing mime_type in result")
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Register validator
|
||||
err := kreuzberg.RegisterValidator(
|
||||
"integration-validator",
|
||||
100,
|
||||
(C.ValidatorCallback)(C.integrationValidator),
|
||||
)
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to register validator: %v", err)
|
||||
}
|
||||
|
||||
// The validator will be called automatically during extraction
|
||||
// This test verifies the registration was successful
|
||||
validators, err := kreuzberg.ListValidators()
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to list validators: %v", err)
|
||||
}
|
||||
|
||||
found := false
|
||||
for _, v := range validators {
|
||||
if v == "integration-validator" {
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if !found {
|
||||
t.Errorf("Validator not found in registered validators list")
|
||||
}
|
||||
|
||||
// Cleanup
|
||||
_ = kreuzberg.UnregisterValidator("integration-validator")
|
||||
}
|
||||
```
|
||||
35
docs/snippets/go/plugins/plugin_validator.md
Normal file
35
docs/snippets/go/plugins/plugin_validator.md
Normal file
@@ -0,0 +1,35 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
/*
|
||||
#cgo CFLAGS: -I${SRCDIR}/../../../crates/kreuzberg-ffi
|
||||
#cgo LDFLAGS: -L${SRCDIR}/../../../target/release -lkreuzberg_ffi
|
||||
#include "../../../crates/kreuzberg-ffi/kreuzberg.h"
|
||||
#include <stdlib.h>
|
||||
*/
|
||||
import "C"
|
||||
import (
|
||||
"log"
|
||||
"unsafe"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
//export customValidator
|
||||
func customValidator(resultJSON *C.char) *C.char {
|
||||
// Inspect resultJSON, return error message or NULL
|
||||
return nil
|
||||
}
|
||||
|
||||
func main() {
|
||||
if err := kreuzberg.RegisterValidator("go-validator", 50, (C.ValidatorCallback)(C.customValidator)); err != nil {
|
||||
log.Fatalf("register validator failed: %v", err)
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", nil)
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
log.Printf("Content length: %d", len(result.Content))
|
||||
}
|
||||
```
|
||||
77
docs/snippets/go/plugins/quality_score_validator.md
Normal file
77
docs/snippets/go/plugins/quality_score_validator.md
Normal file
@@ -0,0 +1,77 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"log"
|
||||
"unsafe"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
/*
|
||||
#cgo CFLAGS: -I${SRCDIR}/../../../crates/kreuzberg-ffi
|
||||
#cgo LDFLAGS: -L${SRCDIR}/../../../target/release -L${SRCDIR}/../../../target/debug -lkreuzberg_ffi
|
||||
#include "../../../crates/kreuzberg-ffi/kreuzberg.h"
|
||||
#include <stdlib.h>
|
||||
*/
|
||||
import "C"
|
||||
|
||||
// qualityThreshold is the minimum acceptable quality score
|
||||
const qualityThreshold = 0.5
|
||||
|
||||
// qualityScoreValidator validates that extraction quality meets minimum threshold
|
||||
//export qualityScoreValidator
|
||||
func qualityScoreValidator(resultJSON *C.char) *C.char {
|
||||
jsonStr := C.GoString(resultJSON)
|
||||
var result map[string]interface{}
|
||||
|
||||
if err := json.Unmarshal([]byte(jsonStr), &result); err != nil {
|
||||
return C.CString("Failed to parse result JSON")
|
||||
}
|
||||
|
||||
// Extract metadata object
|
||||
metadata, ok := result["metadata"].(map[string]interface{})
|
||||
if !ok {
|
||||
// No metadata is not an error, just skip quality check
|
||||
return nil
|
||||
}
|
||||
|
||||
// Get quality score from result
|
||||
qualityScore := 0.0
|
||||
if score, ok := result["quality_score"].(float64); ok {
|
||||
qualityScore = score
|
||||
}
|
||||
|
||||
// Validate against threshold
|
||||
if qualityScore < qualityThreshold {
|
||||
errMsg := fmt.Sprintf("Quality score too low: %.0f%% < %.0f%%", qualityScore*100, qualityThreshold*100)
|
||||
return C.CString(errMsg)
|
||||
}
|
||||
|
||||
// Validation passed
|
||||
return nil
|
||||
}
|
||||
|
||||
func main() {
|
||||
// Register the validator with priority 50
|
||||
if err := kreuzberg.RegisterValidator("quality_score_validator", 50,
|
||||
(C.ValidatorCallback)(C.qualityScoreValidator)); err != nil {
|
||||
log.Fatalf("failed to register validator: %v", err)
|
||||
}
|
||||
defer func() {
|
||||
if err := kreuzberg.UnregisterValidator("quality_score_validator"); err != nil {
|
||||
log.Printf("warning: failed to unregister validator: %v", err)
|
||||
}
|
||||
}()
|
||||
|
||||
// Extract and validate
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", nil)
|
||||
if err != nil {
|
||||
log.Fatalf("extraction or validation failed: %v", err)
|
||||
}
|
||||
|
||||
log.Printf("Quality validation passed for: %s", result.MimeType)
|
||||
}
|
||||
```
|
||||
165
docs/snippets/go/plugins/stateful_plugin.md
Normal file
165
docs/snippets/go/plugins/stateful_plugin.md
Normal file
@@ -0,0 +1,165 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"log"
|
||||
"sync"
|
||||
"unsafe"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
/*
|
||||
#cgo CFLAGS: -I${SRCDIR}/../../../crates/kreuzberg-ffi
|
||||
#cgo LDFLAGS: -L${SRCDIR}/../../../target/release -L${SRCDIR}/../../../target/debug -lkreuzberg_ffi
|
||||
#include "../../../crates/kreuzberg-ffi/kreuzberg.h"
|
||||
#include <stdlib.h>
|
||||
*/
|
||||
import "C"
|
||||
|
||||
// PluginState is the mutable state shared by every invocation of the stateful
// plugin. All fields other than mu must only be accessed while mu is held.
type PluginState struct {
	mu sync.Mutex

	callCount    int               // number of plugin invocations so far
	lastMimeType string            // MIME type seen by the most recent invocation
	cache        map[string]string // one entry per MIME type seen
}

// globalState is the single PluginState instance that persists across plugin
// calls; its cache starts out empty but non-nil so it can be written to.
var globalState = &PluginState{cache: map[string]string{}}
|
||||
|
||||
// statefulPlugin demonstrates a thread-safe plugin with persistent state
|
||||
//export statefulPlugin
|
||||
func statefulPlugin(resultJSON *C.char) *C.char {
|
||||
jsonStr := C.GoString(resultJSON)
|
||||
var result map[string]interface{}
|
||||
|
||||
if err := json.Unmarshal([]byte(jsonStr), &result); err != nil {
|
||||
return C.CString("{\"error\":\"Failed to parse result JSON\"}")
|
||||
}
|
||||
|
||||
// Acquire lock to safely modify state
|
||||
globalState.mu.Lock()
|
||||
defer globalState.mu.Unlock()
|
||||
|
||||
// Increment call counter
|
||||
globalState.callCount++
|
||||
|
||||
// Extract and store MIME type
|
||||
if mimeType, ok := result["mime_type"].(string); ok {
|
||||
globalState.lastMimeType = mimeType
|
||||
globalState.cache[mimeType] = "processed"
|
||||
}
|
||||
|
||||
// Ensure metadata exists
|
||||
metadata, ok := result["metadata"].(map[string]interface{})
|
||||
if !ok {
|
||||
metadata = make(map[string]interface{})
|
||||
}
|
||||
|
||||
// Add state information to metadata
|
||||
metadata["plugin_call_count"] = globalState.callCount
|
||||
metadata["last_mime_type"] = globalState.lastMimeType
|
||||
metadata["cached_types_count"] = len(globalState.cache)
|
||||
metadata["plugin_version"] = "1.0.0"
|
||||
|
||||
result["metadata"] = metadata
|
||||
|
||||
// Serialize back to JSON
|
||||
outputJSON, err := json.Marshal(result)
|
||||
if err != nil {
|
||||
return C.CString("{\"error\":\"Failed to serialize result\"}")
|
||||
}
|
||||
|
||||
return C.CString(string(outputJSON))
|
||||
}
|
||||
|
||||
// GetPluginStats safely retrieves the current plugin state for logging
|
||||
func GetPluginStats() (int, string, []string) {
|
||||
globalState.mu.Lock()
|
||||
defer globalState.mu.Unlock()
|
||||
|
||||
callCount := globalState.callCount
|
||||
lastMime := globalState.lastMimeType
|
||||
|
||||
mimeTypes := make([]string, 0, len(globalState.cache))
|
||||
for mimeType := range globalState.cache {
|
||||
mimeTypes = append(mimeTypes, mimeType)
|
||||
}
|
||||
|
||||
return callCount, lastMime, mimeTypes
|
||||
}
|
||||
|
||||
// ResetPluginState clears the plugin state - useful for testing
|
||||
func ResetPluginState() {
|
||||
globalState.mu.Lock()
|
||||
defer globalState.mu.Unlock()
|
||||
|
||||
globalState.callCount = 0
|
||||
globalState.lastMimeType = ""
|
||||
globalState.cache = make(map[string]string)
|
||||
}
|
||||
|
||||
func main() {
|
||||
// Register the stateful post-processor with priority 60
|
||||
if err := kreuzberg.RegisterPostProcessor("stateful_plugin", 60,
|
||||
(C.PostProcessorCallback)(C.statefulPlugin)); err != nil {
|
||||
log.Fatalf("failed to register post-processor: %v", err)
|
||||
}
|
||||
defer func() {
|
||||
if err := kreuzberg.UnregisterPostProcessor("stateful_plugin"); err != nil {
|
||||
log.Printf("warning: failed to unregister post-processor: %v", err)
|
||||
}
|
||||
|
||||
// Print final statistics
|
||||
callCount, lastMime, mimeTypes := GetPluginStats()
|
||||
log.Printf("Plugin Statistics:")
|
||||
log.Printf(" Total calls: %d", callCount)
|
||||
log.Printf(" Last MIME type: %s", lastMime)
|
||||
log.Printf(" Unique MIME types processed: %d", len(mimeTypes))
|
||||
if len(mimeTypes) > 0 {
|
||||
log.Printf(" Processed types: %v", mimeTypes)
|
||||
}
|
||||
}()
|
||||
|
||||
// Process multiple documents to demonstrate state accumulation
|
||||
files := []string{
|
||||
"document1.pdf",
|
||||
"document2.pdf",
|
||||
"image.png",
|
||||
"document3.txt",
|
||||
}
|
||||
|
||||
for _, file := range files {
|
||||
log.Printf("Processing: %s", file)
|
||||
result, err := kreuzberg.ExtractFileSync(file, nil)
|
||||
if err != nil {
|
||||
log.Printf(" Warning: extraction failed: %v", err)
|
||||
continue
|
||||
}
|
||||
|
||||
// Parse and display metadata
|
||||
var metadata map[string]interface{}
|
||||
if metaJSON, ok := result.MetadataJSON.(string); ok {
|
||||
if err := json.Unmarshal([]byte(metaJSON), &metadata); err == nil {
|
||||
if callCount, ok := metadata["plugin_call_count"].(float64); ok {
|
||||
log.Printf(" Plugin call count: %.0f", callCount)
|
||||
}
|
||||
if cachedCount, ok := metadata["cached_types_count"].(float64); ok {
|
||||
log.Printf(" Cached MIME types: %.0f", cachedCount)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Demonstrate thread-safe state access
|
||||
callCount, lastMime, mimeTypes := GetPluginStats()
|
||||
log.Printf("\nFinal Plugin State:")
|
||||
log.Printf(" Total calls: %d", callCount)
|
||||
log.Printf(" Last MIME type: %s", lastMime)
|
||||
log.Printf(" Processed MIME types: %v", mimeTypes)
|
||||
}
|
||||
```
|
||||
55
docs/snippets/go/plugins/unregister_plugins.md
Normal file
55
docs/snippets/go/plugins/unregister_plugins.md
Normal file
@@ -0,0 +1,55 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
// Unregister a specific document extractor
|
||||
if err := kreuzberg.UnregisterDocumentExtractor("custom-json-extractor"); err != nil {
|
||||
var validErr *kreuzberg.ValidationError
|
||||
if errors.As(err, &validErr) {
|
||||
log.Printf("validation error: %v", err)
|
||||
} else {
|
||||
log.Fatalf("unregister document extractor: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// Unregister a specific post-processor
|
||||
if err := kreuzberg.UnregisterPostProcessor("word_count"); err != nil {
|
||||
var validErr *kreuzberg.ValidationError
|
||||
if errors.As(err, &validErr) {
|
||||
log.Printf("validation error: %v", err)
|
||||
} else {
|
||||
log.Fatalf("unregister post processor: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// Unregister a specific OCR backend
|
||||
if err := kreuzberg.UnregisterOCRBackend("cloud-ocr"); err != nil {
|
||||
var validErr *kreuzberg.ValidationError
|
||||
if errors.As(err, &validErr) {
|
||||
log.Printf("validation error: %v", err)
|
||||
} else {
|
||||
log.Fatalf("unregister OCR backend: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// Unregister a specific validator
|
||||
if err := kreuzberg.UnregisterValidator("min_length_validator"); err != nil {
|
||||
var validErr *kreuzberg.ValidationError
|
||||
if errors.As(err, &validErr) {
|
||||
log.Printf("validation error: %v", err)
|
||||
} else {
|
||||
log.Fatalf("unregister validator: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
fmt.Println("Plugins unregistered successfully")
|
||||
}
|
||||
```
|
||||
90
docs/snippets/go/plugins/word_count_processor.md
Normal file
90
docs/snippets/go/plugins/word_count_processor.md
Normal file
@@ -0,0 +1,90 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"log"
|
||||
"strings"
|
||||
"unsafe"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
/*
|
||||
#cgo CFLAGS: -I${SRCDIR}/../../../crates/kreuzberg-ffi
|
||||
#cgo LDFLAGS: -L${SRCDIR}/../../../target/release -L${SRCDIR}/../../../target/debug -lkreuzberg_ffi
|
||||
#include "../../../crates/kreuzberg-ffi/kreuzberg.h"
|
||||
#include <stdlib.h>
|
||||
*/
|
||||
import "C"
|
||||
|
||||
// wordCountProcessor adds word count metadata to extraction results
|
||||
//export wordCountProcessor
|
||||
func wordCountProcessor(resultJSON *C.char) *C.char {
|
||||
jsonStr := C.GoString(resultJSON)
|
||||
var result map[string]interface{}
|
||||
|
||||
if err := json.Unmarshal([]byte(jsonStr), &result); err != nil {
|
||||
return C.CString("{\"error\":\"Failed to parse result JSON\"}")
|
||||
}
|
||||
|
||||
// Extract content
|
||||
content, ok := result["content"].(string)
|
||||
if !ok {
|
||||
return C.CString("{\"error\":\"Missing content field\"}")
|
||||
}
|
||||
|
||||
// Count words by splitting on whitespace
|
||||
words := strings.Fields(content)
|
||||
wordCount := len(words)
|
||||
|
||||
// Ensure metadata exists
|
||||
metadata, ok := result["metadata"].(map[string]interface{})
|
||||
if !ok {
|
||||
metadata = make(map[string]interface{})
|
||||
}
|
||||
|
||||
// Add word count to metadata
|
||||
metadata["word_count"] = wordCount
|
||||
|
||||
// Update result
|
||||
result["metadata"] = metadata
|
||||
|
||||
// Serialize back to JSON
|
||||
outputJSON, err := json.Marshal(result)
|
||||
if err != nil {
|
||||
return C.CString("{\"error\":\"Failed to serialize result\"}")
|
||||
}
|
||||
|
||||
return C.CString(string(outputJSON))
|
||||
}
|
||||
|
||||
func main() {
|
||||
// Register the post-processor with priority 100, early stage
|
||||
if err := kreuzberg.RegisterPostProcessor("word_count_processor", 100,
|
||||
(C.PostProcessorCallback)(C.wordCountProcessor)); err != nil {
|
||||
log.Fatalf("failed to register post-processor: %v", err)
|
||||
}
|
||||
defer func() {
|
||||
if err := kreuzberg.UnregisterPostProcessor("word_count_processor"); err != nil {
|
||||
log.Printf("warning: failed to unregister post-processor: %v", err)
|
||||
}
|
||||
}()
|
||||
|
||||
// Extract document
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", nil)
|
||||
if err != nil {
|
||||
log.Fatalf("extraction failed: %v", err)
|
||||
}
|
||||
|
||||
// Access word count from metadata
|
||||
var metadata map[string]interface{}
|
||||
if metaJSON, ok := result.MetadataJSON.(string); ok {
|
||||
if err := json.Unmarshal([]byte(metaJSON), &metadata); err == nil {
|
||||
if wordCount, ok := metadata["word_count"].(float64); ok {
|
||||
log.Printf("Word count: %.0f", wordCount)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
38
docs/snippets/go/utils/chunking.md
Normal file
38
docs/snippets/go/utils/chunking.md
Normal file
@@ -0,0 +1,38 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
maxChars := 1000
|
||||
maxOverlap := 200
|
||||
config := &kreuzberg.ExtractionConfig{
|
||||
Chunking: &kreuzberg.ChunkingConfig{
|
||||
MaxChars: &maxChars,
|
||||
MaxOverlap: &maxOverlap,
|
||||
},
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", config)
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
|
||||
for i, chunk := range result.Chunks {
|
||||
fmt.Printf("Chunk %d/%d (%d-%d)\n", i+1, chunk.Metadata.TotalChunks, chunk.Metadata.CharStart, chunk.Metadata.CharEnd)
|
||||
fmt.Printf("%s...\n", chunk.Content[:min(len(chunk.Content), 100)])
|
||||
}
|
||||
}
|
||||
|
||||
// min returns the smaller of a and b. When both are equal, that shared value
// is returned.
func min(a, b int) int {
	if b <= a {
		return b
	}
	return a
}
|
||||
```
|
||||
45
docs/snippets/go/utils/chunking_rag.md
Normal file
45
docs/snippets/go/utils/chunking_rag.md
Normal file
@@ -0,0 +1,45 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
maxChars := 500
|
||||
maxOverlap := 50
|
||||
config := &kreuzberg.ExtractionConfig{
|
||||
Chunking: &kreuzberg.ChunkingConfig{
|
||||
MaxChars: &maxChars,
|
||||
MaxOverlap: &maxOverlap,
|
||||
Embedding: &kreuzberg.EmbeddingConfig{
|
||||
Model: "balanced",
|
||||
Normalize: true,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("research_paper.pdf", config)
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
|
||||
for i, chunk := range result.Chunks {
|
||||
fmt.Printf("Chunk %d/%d (%d-%d)\n", i+1, chunk.Metadata.TotalChunks, chunk.Metadata.CharStart, chunk.Metadata.CharEnd)
|
||||
fmt.Printf("Content: %s...\n", chunk.Content[:min(len(chunk.Content), 100)])
|
||||
if chunk.Embedding != nil {
|
||||
fmt.Printf("Embedding: %d dimensions\n", len(chunk.Embedding))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// min reports the lesser of its two arguments; equal arguments yield that
// same value.
func min(a, b int) int {
	smaller := b
	if a < b {
		smaller = a
	}
	return smaller
}
|
||||
```
|
||||
32
docs/snippets/go/utils/embedding_with_chunking.md
Normal file
32
docs/snippets/go/utils/embedding_with_chunking.md
Normal file
@@ -0,0 +1,32 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
maxChars := 1024
|
||||
maxOverlap := 100
|
||||
batchSize := int32(32)
|
||||
config := &kreuzberg.ExtractionConfig{
|
||||
Chunking: &kreuzberg.ChunkingConfig{
|
||||
MaxChars: &maxChars,
|
||||
MaxOverlap: &maxOverlap,
|
||||
Embedding: &kreuzberg.EmbeddingConfig{
|
||||
Model: "balanced",
|
||||
Normalize: true,
|
||||
BatchSize: &batchSize,
|
||||
ShowDownloadProgress: false,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
fmt.Printf("Config: MaxChars=%d, MaxOverlap=%d, Model=%s\n",
|
||||
*config.Chunking.MaxChars,
|
||||
*config.Chunking.MaxOverlap,
|
||||
config.Chunking.Embedding.Model)
|
||||
}
|
||||
```
|
||||
29
docs/snippets/go/utils/keyword_extraction_example.md
Normal file
29
docs/snippets/go/utils/keyword_extraction_example.md
Normal file
@@ -0,0 +1,29 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
config := &kreuzberg.ExtractionConfig{
|
||||
Keywords: &kreuzberg.KeywordConfig{
|
||||
Algorithm: "YAKE",
|
||||
MaxKeywords: 10,
|
||||
MinScore: 0.3,
|
||||
},
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("research_paper.pdf", config)
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
|
||||
if keywords, ok := result.Metadata.Additional["keywords"]; ok {
|
||||
fmt.Printf("Keywords: %v\n", keywords)
|
||||
}
|
||||
}
|
||||
```
|
||||
33
docs/snippets/go/utils/quality_processing_example.md
Normal file
33
docs/snippets/go/utils/quality_processing_example.md
Normal file
@@ -0,0 +1,33 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
config := &kreuzberg.ExtractionConfig{
|
||||
EnableQualityProcessing: true,
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("scanned_document.pdf", config)
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
|
||||
qualityScore := 0.0
|
||||
if result.QualityScore != nil {
|
||||
qualityScore = *result.QualityScore
|
||||
}
|
||||
|
||||
if qualityScore < 0.5 {
|
||||
fmt.Printf("Warning: Low quality extraction (%.2f)\n", qualityScore)
|
||||
fmt.Println("Consider re-scanning with higher DPI or adjusting OCR settings")
|
||||
} else {
|
||||
fmt.Printf("Quality score: %.2f\n", qualityScore)
|
||||
}
|
||||
}
|
||||
```
|
||||
36
docs/snippets/go/utils/standalone_embed.md
Normal file
36
docs/snippets/go/utils/standalone_embed.md
Normal file
@@ -0,0 +1,36 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
preset := "balanced"
|
||||
normalize := true
|
||||
config := kreuzberg.EmbeddingConfig{
|
||||
Model: kreuzberg.EmbeddingModelType{
|
||||
Type: "preset",
|
||||
Name: &preset,
|
||||
},
|
||||
Normalize: &normalize,
|
||||
}
|
||||
|
||||
// Synchronous
|
||||
embeddings, err := kreuzberg.EmbedTexts([]string{"Hello, world!", "Kreuzberg is fast"}, config)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
fmt.Println(len(embeddings)) // 2
|
||||
fmt.Println(len(embeddings[0])) // 768
|
||||
|
||||
// Asynchronous
|
||||
embeddings, err = kreuzberg.EmbedTextsAsync([]string{"Hello, world!"}, config)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
fmt.Println(len(embeddings[0])) // 768
|
||||
}
|
||||
```
|
||||
24
docs/snippets/go/utils/token_reduction.md
Normal file
24
docs/snippets/go/utils/token_reduction.md
Normal file
@@ -0,0 +1,24 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
preserve := true
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", &kreuzberg.ExtractionConfig{
|
||||
TokenReduction: &kreuzberg.TokenReductionConfig{
|
||||
Mode: "moderate",
|
||||
PreserveImportantWords: &preserve,
|
||||
},
|
||||
})
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
|
||||
log.Println("content length:", len(result.Content))
|
||||
}
|
||||
```
|
||||
28
docs/snippets/go/utils/token_reduction_example.md
Normal file
28
docs/snippets/go/utils/token_reduction_example.md
Normal file
@@ -0,0 +1,28 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
config := &kreuzberg.ExtractionConfig{
|
||||
TokenReduction: &kreuzberg.TokenReductionConfig{
|
||||
Mode: "moderate",
|
||||
PreserveMarkdown: true,
|
||||
},
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("verbose_document.pdf", config)
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
|
||||
fmt.Printf("Original tokens: %v\n", result.Metadata.Additional["original_token_count"])
|
||||
fmt.Printf("Reduced tokens: %v\n", result.Metadata.Additional["token_count"])
|
||||
fmt.Printf("Reduction ratio: %v\n", result.Metadata.Additional["token_reduction_ratio"])
|
||||
}
|
||||
```
|
||||
Reference in New Issue
Block a user