Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,15 @@
```r title="R"
# Extract a PDF with chunking enabled and report the size of the first chunks.
library(kreuzberg)

# Chunking options passed through to the extractor.
config <- list(
  chunking = list(max_characters = 1000L, overlap = 200L)
)

# The extraction result arrives as JSON; simplifyVector = FALSE keeps every
# chunk as its own nested list instead of collapsing them into vectors.
json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

chunks <- result$chunks
cat(sprintf("Total chunks: %d\n", length(chunks)))

# Preview at most the first five chunks; seq_len() gives an empty loop when
# there are no chunks at all.
for (i in seq_len(min(5L, length(chunks)))) {
  # A chunk is a list (content, embedding, metadata), so measure its text
  # field. nchar() on the whole list would return one count per field.
  cat(sprintf("Chunk %d: %d characters\n", i, nchar(chunks[[i]]$content)))
}
```

View File

@@ -0,0 +1,25 @@
```r title="R"
# Chunk a PDF, request embeddings per chunk, and print each chunk's
# embedding size when one is present.
library(kreuzberg)

# Embedding settings live under the chunking options.
embedding_opts <- list(
  model = list(type = "preset", name = "balanced"),
  normalize = TRUE
)
config <- list(
  chunking = list(
    max_characters = 500L,
    overlap = 50L,
    embedding = embedding_opts
  )
)

# Parse the JSON result without simplifying nested lists.
json <- extract_file_sync("research_paper.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

chunks <- result$chunks
total <- length(chunks)
for (idx in seq_len(total)) {
  cat(sprintf("Chunk %d/%d\n", idx, total))
  embedding <- chunks[[idx]]$embedding
  # Only report dimensions for chunks that actually carry an embedding.
  if (!is.null(embedding)) {
    cat(sprintf(" Embedding: %d dimensions\n", length(embedding)))
  }
}
```

View File

@@ -0,0 +1,20 @@
```r title="R"
# Chunk a PDF with batched embedding generation and report how many chunks
# came back with an embedding attached.
library(kreuzberg)

# Embedding settings nested under the chunking options.
embedding_opts <- list(
  model = list(type = "preset", name = "balanced"),
  normalize = TRUE,
  batch_size = 32L
)
config <- list(
  chunking = list(
    max_characters = 1024L,
    overlap = 100L,
    embedding = embedding_opts
  )
)

# Parse the JSON result without simplifying nested lists.
json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

# Count only chunks whose embedding field is populated, so the number matches
# the "with embeddings" label. vapply() returns logical(0) for an empty or
# missing chunk list, which sums to 0.
has_embedding <- vapply(
  result$chunks,
  function(chunk) !is.null(chunk$embedding),
  logical(1)
)
cat(sprintf("Chunks with embeddings: %d\n", sum(has_embedding)))
```

View File

@@ -0,0 +1,21 @@
```r title="R"
# Extract a PDF with keyword extraction enabled and list the keywords found
# in the result metadata.
library(kreuzberg)

keyword_opts <- list(
  algorithm = "yake",
  max_keywords = 10L,
  min_score = 0.3
)
config <- list(keywords = keyword_opts)

# Parse the JSON result without simplifying nested lists.
json <- extract_file_sync("research_paper.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

cat(sprintf("Content length: %d characters\n", nchar(result$content)))

# Keywords are read from the metadata; nothing is printed when absent.
# NOTE(review): each entry is formatted with %s as-is; confirm the element
# shape against the package documentation.
keywords <- result$metadata$keywords
if (!is.null(keywords)) {
  for (keyword in keywords) {
    cat(sprintf(" - %s\n", keyword))
  }
}
```

View File

@@ -0,0 +1,15 @@
```r title="R"
# Extract a scanned PDF with quality processing switched on and flag
# low-scoring results.
library(kreuzberg)

config <- list(enable_quality_processing = TRUE)

# Parse the JSON result without simplifying nested lists.
json <- extract_file_sync("scanned_document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

cat(sprintf("Content length: %d characters\n", nchar(result$content)))

# The score is optional in the result; skip reporting when it is missing.
score <- result$quality_score
if (!is.null(score)) {
  cat(sprintf("Quality score: %.2f\n", score))
  # Scores below 0.5 are treated as a low-quality extraction.
  if (score < 0.5) {
    cat("Warning: low quality extraction\n")
  }
}
```

View File

@@ -0,0 +1,15 @@
```r title="R"
# Embed two short strings directly and print the size of each embedding.
library(kreuzberg)

model_spec <- list(type = "preset", name = "balanced")
config <- list(
  model = model_spec,
  normalize = TRUE
)

texts <- c("Hello, world!", "Kreuzberg is fast")
embeddings <- embed_texts(texts, config)

# Two inputs were supplied, so exactly two embeddings are expected back.
stopifnot(length(embeddings) == 2L)

first_dims <- length(embeddings[[1]])
second_dims <- length(embeddings[[2]])
cat(sprintf("Embedding 1: %d dimensions\n", first_dims))
cat(sprintf("Embedding 2: %d dimensions\n", second_dims))
```

View File

@@ -0,0 +1,15 @@
```r title="R"
# Extract a PDF with token reduction configured and print the resulting text.
library(kreuzberg)

reduction_opts <- list(
  mode = "moderate",
  preserve_important_words = TRUE
)
config <- list(token_reduction = reduction_opts)

# Parse the JSON result without simplifying nested lists.
json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

# Write the extracted content to standard output.
content <- result$content
cat(content)
```

View File

@@ -0,0 +1,16 @@
```r title="R"
# Extract a PDF with token reduction configured and report the content
# length together with the MIME type from the result.
library(kreuzberg)

reduction_opts <- list(
  mode = "moderate",
  preserve_important_words = TRUE
)
config <- list(token_reduction = reduction_opts)

# Parse the JSON result without simplifying nested lists.
json <- extract_file_sync("verbose_document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

content_chars <- nchar(result$content)
cat(sprintf("Reduced content length: %d characters\n", content_chars))
cat(sprintf("MIME type: %s\n", result$mime_type))
```

View File

@@ -0,0 +1,26 @@
```r title="R"
# Chunk and embed a PDF so the chunks can be handed to a vector database.
library(kreuzberg)

# Identifier to attach to every chunk of this document.
document_id <- "doc-001"

embedding_opts <- list(
  model = list(type = "preset", name = "balanced"),
  normalize = TRUE,
  batch_size = 32L
)
config <- list(
  chunking = list(
    max_characters = 512L,
    overlap = 50L,
    embedding = embedding_opts
  )
)

# Parse the JSON result without simplifying nested lists.
json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

# Every chunk carries $content, $embedding, and $metadata. Hand these straight
# to a vector database client (pgvector, Qdrant, Pinecone, etc.), storing
# document_id as a metadata field alongside them.
chunk_count <- length(result$chunks)
cat(sprintf("document_id: %s\n", document_id))
cat(sprintf("chunks ready for upsert: %d\n", chunk_count))
```