Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,13 @@
```r
library(kreuzberg)

# Run a single batch extraction over several documents, then print the MIME
# type and the extracted text length reported for each input path.
paths <- c("report.pdf", "slides.pptx", "data.xlsx")
extracted <- batch_extract_files_sync(paths)

for (idx in seq_along(extracted)) {
  # Results are read positionally, pairing element `idx` with `paths[idx]`.
  item <- extracted[[idx]]
  cat(sprintf("File: %s\n", paths[idx]))
  cat(sprintf(" MIME: %s\n", item$mime_type))
  cat(sprintf(" Length: %d chars\n\n", nchar(item$content)))
}
```

View File

@@ -0,0 +1,27 @@
```r title="R"
library(kreuzberg)

# Extract a PDF with chunking and `extract_pages = TRUE`, then print a short
# preview of every chunk together with the page (or page range) recorded in
# its metadata entry.
config <- list(
  chunking = list(max_characters = 500L, overlap = 50L),
  pages = list(extract_pages = TRUE)
)
json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

for (idx in seq_along(result$chunks)) {
  chunk_text <- result$chunks[[idx]]
  # Metadata is looked up at the same index as the chunk it describes.
  meta <- result$chunk_metadata[[idx]]

  # Chunks missing either page bound are skipped without any output.
  if (is.null(meta$first_page) || is.null(meta$last_page)) {
    next
  }

  if (meta$first_page == meta$last_page) {
    page_label <- sprintf("Page %d", meta$first_page)
  } else {
    page_label <- sprintf("Pages %d-%d", meta$first_page, meta$last_page)
  }

  # Preview is at most the first 50 characters of the chunk.
  preview_end <- min(50L, nchar(chunk_text))
  snippet <- substr(chunk_text, 1L, preview_end)
  cat(sprintf("Chunk: %s... (%s)\n", snippet, page_label))
}
```

View File

@@ -0,0 +1,37 @@
```r title="R"
library(kreuzberg)

# Extract a PDF with chunking configured, report how many chunks came back,
# and print the character length of (at most) the first three.
chunking_options <- list(max_characters = 1000L, overlap = 200L)
config <- list(chunking = chunking_options)

json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

chunks <- result$chunks
cat(sprintf("Chunks produced: %d\n", length(chunks)))

# seq_len() keeps the loop empty when no chunks were produced.
n_shown <- min(3L, length(chunks))
for (idx in seq_len(n_shown)) {
  cat(sprintf("Chunk %d length: %d characters\n", idx, nchar(chunks[[idx]])))
}
```
```r title="R - Prepend Heading Context"
library(kreuzberg)

# Extract a Markdown document using the "markdown" chunker with
# `prepend_heading_context = TRUE`, then print a preview (up to 100
# characters) of at most the first three chunks.
chunking_options <- list(
  max_characters = 500L,
  overlap = 50L,
  chunker_type = "markdown",
  prepend_heading_context = TRUE
)
config <- list(chunking = chunking_options)

json <- extract_file_sync("document.md", "text/markdown", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

n_shown <- min(3L, length(result$chunks))
for (idx in seq_len(n_shown)) {
  chunk_text <- result$chunks[[idx]]
  preview_end <- min(100L, nchar(chunk_text))
  cat(sprintf("%s\n", substr(chunk_text, 1L, preview_end)))
}
```

View File

@@ -0,0 +1,18 @@
```r title="R"
library(kreuzberg)

# Chunk a PDF for a RAG pipeline: print the total number of chunks, then the
# character count of (at most) the first three.
config <- list(chunking = list(max_characters = 800L, overlap = 150L))
json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

n_chunks <- length(result$chunks)
cat(sprintf("Total chunks: %d\n", n_chunks))
cat("Processing chunks for RAG pipeline:\n")

for (idx in seq_len(min(3L, n_chunks))) {
  chunk_text <- result$chunks[[idx]]
  cat(sprintf("Chunk %d: %d characters\n", idx, nchar(chunk_text)))
}
```

View File

@@ -0,0 +1,23 @@
```r title="R"
library(kreuzberg)

# Chunk a PDF and build one record per chunk (id, text, character length) as
# input for a downstream embedding step.
config <- list(
  chunking = list(max_characters = 1000L, overlap = 200L)
)
json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

cat(sprintf("Preparing %d chunks for embedding:\n", length(result$chunks)))

# lapply() allocates the result list once instead of growing it element by
# element inside a for loop; an empty chunk list still yields an empty list.
embeddings_data <- lapply(seq_along(result$chunks), function(i) {
  chunk_text <- result$chunks[[i]]
  list(
    chunk_id = i,
    text = chunk_text,
    length = nchar(chunk_text)
  )
})

cat(sprintf("Ready to embed %d chunks\n", length(embeddings_data)))
```

View File

@@ -0,0 +1,18 @@
```r title="R"
library(kreuzberg)

# Extract a PDF with an explicit keyword-extraction configuration and report
# how many keywords the result contains.
keyword_options <- list(
  algorithm = "yake",
  max_keywords = 10L,
  min_score = 0.3,
  ngram_range = c(1L, 3L),
  language = "en"
)
config <- list(keywords = keyword_options)

json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

n_keywords <- length(result$keywords)
cat(sprintf("Keywords extracted: %d\n", n_keywords))
```

View File

@@ -0,0 +1,19 @@
```r title="R"
library(kreuzberg)

# Extract a PDF with keyword extraction enabled, print the keyword count, and
# list up to the first ten keywords when any were returned.
config <- list(keywords = list(enabled = TRUE))
json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

n_keywords <- length(result$keywords)
cat(sprintf("Keywords extracted: %d\n", n_keywords))

if (n_keywords > 0) {
  cat("Top keywords:\n")
  n_shown <- min(10L, n_keywords)
  for (rank in seq_len(n_shown)) {
    keyword <- result$keywords[[rank]]
    cat(sprintf(" %d. %s\n", rank, keyword))
  }
}
```

View File

@@ -0,0 +1,22 @@
```r title="R"
library(kreuzberg)

# Extract a PDF with language detection configured, print the first detected
# language (or a fallback message), then the length of the extracted content.
detection_options <- list(
  enabled = TRUE,
  min_confidence = 0.8,
  detect_multiple = FALSE
)
config <- list(language_detection = detection_options)

json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

languages <- result$detected_languages
if (length(languages) == 0) {
  cat("No language detected\n")
} else {
  # Only the first entry is reported.
  cat(sprintf("Detected language: %s\n", languages[[1]]))
}

cat(sprintf("Content length: %d characters\n", nchar(result$content)))
```

View File

@@ -0,0 +1,13 @@
```r title="R"
library(kreuzberg)

# Run language detection over several PDFs and print exactly one summary line
# per file.
files <- c("english.pdf", "spanish.pdf", "french.pdf")
config <- list(language_detection = list(enabled = TRUE))

for (file in files) {
  json <- extract_file_sync(file, "application/pdf", config)
  result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

  # Exact-name lookup with `[[` avoids relying on `$` partial matching.
  # NOTE(review): the field is assumed to be `detected_languages` (a list of
  # language codes) -- confirm against the package's result schema.
  languages <- result[["detected_languages"]]

  # An empty or missing value would make sprintf() return a zero-length
  # result and print nothing, so fall back to an explicit label instead.
  label <- if (length(languages) > 0) {
    paste(unlist(languages), collapse = ", ")
  } else {
    "none"
  }

  cat(sprintf("%s: detected language = %s\n", file, label))
}
```

View File

@@ -0,0 +1,10 @@
```r title="R"
library(kreuzberg)

# Extract a PDF with quality processing enabled and print its quality score.
config <- list(enable_quality_processing = TRUE)
json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

# sprintf() returns a zero-length result for a NULL argument, which would
# print nothing at all; report a missing score explicitly instead.
score <- result$quality_score
if (is.null(score)) {
  cat("Quality score: unavailable\n")
} else {
  cat(sprintf("Quality score: %.2f\n", score))
}
```

View File

@@ -0,0 +1,13 @@
```r title="R"
library(kreuzberg)

# Extract a PDF with quality processing enabled and print a short metrics
# summary: quality score, content length, and number of page entries.
config <- list(enable_quality_processing = TRUE)
json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

cat("Quality Metrics:\n")

# sprintf() returns a zero-length result for a NULL argument, which would
# silently drop this line; report a missing score explicitly instead.
score <- result$quality_score
if (is.null(score)) {
  cat("Quality Score: unavailable\n")
} else {
  cat(sprintf("Quality Score: %.2f\n", score))
}

cat(sprintf("Content Length: %d characters\n", nchar(result$content)))
cat(sprintf("Pages: %d\n", length(result$pages)))
```

View File

@@ -0,0 +1,17 @@
```r title="R"
library(kreuzberg)

# Extract a PDF with an explicit token-reduction configuration and print the
# length of the resulting content.
reduction_options <- list(
  mode = "moderate",
  preserve_markdown = TRUE,
  preserve_code = TRUE,
  language_hint = "eng"
)
config <- list(token_reduction = reduction_options)

json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

reduced_length <- nchar(result$content)
cat(sprintf("Reduced content length: %d characters\n", reduced_length))
```

View File

@@ -0,0 +1,14 @@
```r title="R"
library(kreuzberg)

# Extract a PDF with token reduction enabled, then print the length of the
# resulting content followed by a short preview.
config <- list(token_reduction = list(enabled = TRUE))
json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

reduced <- result$content
cat("Token-reduced content:\n")
cat(sprintf("Length: %d characters\n", nchar(reduced)))
# The `%.60s` precision caps how much of the content sprintf() emits.
cat(sprintf("Preview: %.60s...\n", reduced))
```

View File

@@ -0,0 +1,24 @@
```r title="R"
library(kreuzberg)

# Chunk a PDF and, for at most the first three chunks, assemble a record
# shaped for a vector database (id, text, metadata) and print its size.
config <- list(chunking = list(max_characters = 1000L, overlap = 200L))
json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

n_shown <- min(3L, length(result$chunks))
for (idx in seq_len(n_shown)) {
  chunk_text <- result$chunks[[idx]]
  chunk_length <- nchar(chunk_text)

  entry_metadata <- list(
    source = "document.pdf",
    chunk_index = idx,
    length = chunk_length
  )
  # The record is only constructed here for illustration; it is not stored
  # or sent anywhere by this snippet.
  vector_doc <- list(
    id = sprintf("doc_%d", idx),
    text = chunk_text,
    metadata = entry_metadata
  )

  cat(sprintf("Vector DB entry %d: %d chars\n", idx, chunk_length))
}
```