Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,13 @@
```r
library(kreuzberg)
# Batch extract from multiple files.
# The call shape used here (a JSON array of {path} items in, a JSON string
# out) follows the other batch snippets in these docs; the reply is parsed
# before it is indexed.
files <- c("report.pdf", "slides.pptx", "data.xlsx")
items <- jsonlite::toJSON(
  lapply(files, function(path) list(path = path)),
  auto_unbox = TRUE
)
json <- batch_extract_files_sync(
  items = items,
  config = ExtractionConfig$default()
)
results <- jsonlite::fromJSON(json, simplifyVector = FALSE)
for (i in seq_along(results)) {
  cat(sprintf("File: %s\n", files[i]))
  cat(sprintf(" MIME: %s\n", results[[i]]$mime_type))
  cat(sprintf(" Length: %d chars\n\n", nchar(results[[i]]$content)))
}
```

View File

@@ -0,0 +1,27 @@
```r title="R"
library(kreuzberg)
config <- list(
chunking = list(max_characters = 500L, overlap = 50L),
pages = list(extract_pages = TRUE)
)
json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
for (i in seq_along(result$chunks)) {
chunk <- result$chunks[[i]]
metadata <- result$chunk_metadata[[i]]
if (!is.null(metadata$first_page) && !is.null(metadata$last_page)) {
page_range <- if (metadata$first_page == metadata$last_page) {
sprintf("Page %d", metadata$first_page)
} else {
sprintf("Pages %d-%d", metadata$first_page, metadata$last_page)
}
preview <- substr(chunk, 1L, min(50L, nchar(chunk)))
cat(sprintf("Chunk: %s... (%s)\n", preview, page_range))
}
}
```

View File

@@ -0,0 +1,37 @@
```r title="R"
library(kreuzberg)
config <- list(
chunking = list(max_characters = 1000L, overlap = 200L)
)
json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
cat(sprintf("Chunks produced: %d\n", length(result$chunks)))
for (i in seq_len(min(3L, length(result$chunks)))) {
cat(sprintf("Chunk %d length: %d characters\n", i, nchar(result$chunks[[i]])))
}
```
```r title="R - Prepend Heading Context"
library(kreuzberg)
config <- list(
chunking = list(
max_characters = 500L,
overlap = 50L,
chunker_type = "markdown",
prepend_heading_context = TRUE
)
)
json <- extract_file_sync("document.md", "text/markdown", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
for (i in seq_len(min(3L, length(result$chunks)))) {
chunk <- result$chunks[[i]]
preview <- substr(chunk, 1L, min(100L, nchar(chunk)))
cat(sprintf("%s\n", preview))
}
```

View File

@@ -0,0 +1,18 @@
```r title="R"
library(kreuzberg)
config <- list(
chunking = list(max_characters = 800L, overlap = 150L)
)
json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
cat(sprintf("Total chunks: %d\n", length(result$chunks)))
cat("Processing chunks for RAG pipeline:\n")
for (i in seq_len(min(3L, length(result$chunks)))) {
chunk <- result$chunks[[i]]
cat(sprintf("Chunk %d: %d characters\n", i, nchar(chunk)))
}
```

View File

@@ -0,0 +1,23 @@
```r title="R"
library(kreuzberg)
config <- list(
  chunking = list(max_characters = 1000L, overlap = 200L)
)
json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
cat(sprintf("Preparing %d chunks for embedding:\n", length(result$chunks)))
# Build one record per chunk. lapply() allocates the result list once instead
# of growing it element by element inside a for loop.
embeddings_data <- lapply(seq_along(result$chunks), function(i) {
  chunk <- result$chunks[[i]]
  list(
    chunk_id = i,
    text = chunk,
    length = nchar(chunk)
  )
})
cat(sprintf("Ready to embed %d chunks\n", length(embeddings_data)))
```

View File

@@ -0,0 +1,18 @@
```r title="R"
library(kreuzberg)
config <- list(
keywords = list(
algorithm = "yake",
max_keywords = 10L,
min_score = 0.3,
ngram_range = c(1L, 3L),
language = "en"
)
)
json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
cat(sprintf("Keywords extracted: %d\n", length(result$keywords)))
```

View File

@@ -0,0 +1,19 @@
```r title="R"
library(kreuzberg)
config <- list(
keywords = list(enabled = TRUE)
)
json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
cat(sprintf("Keywords extracted: %d\n", length(result$keywords)))
if (length(result$keywords) > 0) {
cat("Top keywords:\n")
for (i in seq_len(min(10L, length(result$keywords)))) {
cat(sprintf(" %d. %s\n", i, result$keywords[[i]]))
}
}
```

View File

@@ -0,0 +1,22 @@
```r title="R"
library(kreuzberg)
config <- list(
language_detection = list(
enabled = TRUE,
min_confidence = 0.8,
detect_multiple = FALSE
)
)
json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
if (length(result$detected_languages) > 0) {
cat(sprintf("Detected language: %s\n", result$detected_languages[[1]]))
} else {
cat("No language detected\n")
}
cat(sprintf("Content length: %d characters\n", nchar(result$content)))
```

View File

@@ -0,0 +1,13 @@
```r title="R"
library(kreuzberg)
files <- c("english.pdf", "spanish.pdf", "french.pdf")
config <- list(language_detection = list(enabled = TRUE))
for (file in files) {
  json <- extract_file_sync(file, "application/pdf", config)
  result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
  # sprintf() returns character(0) when an argument is NULL, which would drop
  # the whole line for a file with no detected language. Flatten the field and
  # fall back to a placeholder so every file gets exactly one line.
  detected <- unlist(result$detected_language)
  language_label <- if (length(detected) > 0) {
    paste(detected, collapse = ", ")
  } else {
    "<none>"
  }
  cat(sprintf("%s: detected language = %s\n", file, language_label))
}
```

View File

@@ -0,0 +1,10 @@
```r title="R"
library(kreuzberg)
config <- list(enable_quality_processing = TRUE)
json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
# A field that is absent from the JSON reads back as NULL, and sprintf() on
# NULL produces no output at all, so report the missing score explicitly.
if (is.null(result$quality_score)) {
  cat("Quality score: <none>\n")
} else {
  cat(sprintf("Quality score: %.2f\n", result$quality_score))
}
```

View File

@@ -0,0 +1,13 @@
```r title="R"
library(kreuzberg)
config <- list(enable_quality_processing = TRUE)
json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
cat("Quality Metrics:\n")
cat(sprintf("Quality Score: %.2f\n", result$quality_score))
cat(sprintf("Content Length: %d characters\n", nchar(result$content)))
cat(sprintf("Pages: %d\n", length(result$pages)))
```

View File

@@ -0,0 +1,17 @@
```r title="R"
library(kreuzberg)
config <- list(
token_reduction = list(
mode = "moderate",
preserve_markdown = TRUE,
preserve_code = TRUE,
language_hint = "eng"
)
)
json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
cat(sprintf("Reduced content length: %d characters\n", nchar(result$content)))
```

View File

@@ -0,0 +1,14 @@
```r title="R"
library(kreuzberg)
config <- list(
token_reduction = list(enabled = TRUE)
)
json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
cat("Token-reduced content:\n")
cat(sprintf("Length: %d characters\n", nchar(result$content)))
cat(sprintf("Preview: %.60s...\n", result$content))
```

View File

@@ -0,0 +1,24 @@
```r title="R"
library(kreuzberg)
config <- list(
  chunking = list(max_characters = 1000L, overlap = 200L)
)
json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
# Build the vector-store records for the first three chunks (or fewer) and
# keep them in `vector_docs`, so they are available for an upsert afterwards
# instead of being overwritten on every loop iteration.
n_entries <- min(3L, length(result$chunks))
vector_docs <- lapply(seq_len(n_entries), function(i) {
  chunk <- result$chunks[[i]]
  list(
    id = sprintf("doc_%d", i),
    text = chunk,
    metadata = list(
      source = "document.pdf",
      chunk_index = i,
      length = nchar(chunk)
    )
  )
})
for (doc in vector_docs) {
  cat(sprintf(
    "Vector DB entry %d: %d chars\n",
    doc$metadata$chunk_index, doc$metadata$length
  ))
}
```

View File

@@ -0,0 +1,19 @@
```r title="R"
library(kreuzberg)
paths <- c("report.pdf", "notes.txt")
mimes <- c("application/pdf", "text/plain")
items <- jsonlite::toJSON(lapply(seq_along(paths), function(i) {
bytes <- readBin(paths[i], what = "raw", n = file.info(paths[i])$size)
list(content = as.integer(bytes), mime_type = mimes[i])
}), auto_unbox = TRUE)
json <- batch_extract_bytes_sync(items = items, config = ExtractionConfig$default())
results <- jsonlite::fromJSON(json, simplifyVector = FALSE)
for (i in seq_along(results)) {
cat(sprintf("[%d] mime=%s chars=%d\n",
i, results[[i]]$mime_type, nchar(results[[i]]$content)))
}
```

View File

@@ -0,0 +1,17 @@
```r title="R"
library(kreuzberg)
items <- jsonlite::toJSON(list(
list(path = "report.pdf"),
list(path = "slides.pptx"),
list(path = "data.xlsx")
), auto_unbox = TRUE)
json <- batch_extract_files_sync(items = items, config = ExtractionConfig$default())
results <- jsonlite::fromJSON(json, simplifyVector = FALSE)
for (i in seq_along(results)) {
cat(sprintf("[%d] mime=%s chars=%d\n",
i, results[[i]]$mime_type, nchar(results[[i]]$content)))
}
```

View File

@@ -0,0 +1,29 @@
<!-- snippet:syntax-only -->
```r title="R"
library(kreuzberg)
library(httr2)
payload <- list(
text = "Your long text content here...",
chunker_type = "text",
config = list(
max_characters = 1000,
overlap = 50,
trim = TRUE
)
)
response <- request("http://localhost:8000/chunk") |>
req_method("POST") |>
req_body_json(payload) |>
req_perform()
result <- resp_body_json(response)
cat(sprintf("Created %d chunks\n", result$chunk_count))
for (chunk in result$chunks) {
preview <- substr(chunk$content, 1, 50)
cat(sprintf("Chunk %d: %s...\n", chunk$chunk_index, preview))
}
```

View File

@@ -0,0 +1,18 @@
<!-- snippet:syntax-only -->
```r title="R"
library(kreuzberg)
library(httr2)
# Upload the document as a multipart/form-data field named "files".
# httr2 builds multipart bodies with req_body_multipart(); a file part is
# described with curl::form_file(), which also carries its MIME type.
response <- request("http://localhost:8000/extract") |>
  req_method("POST") |>
  req_body_multipart(
    files = curl::form_file("document.pdf", type = "application/pdf")
  ) |>
  req_perform()
data <- resp_body_json(response)
cat(jsonlite::toJSON(data, auto_unbox = TRUE, pretty = TRUE))
```

View File

@@ -0,0 +1,34 @@
```r title="R"
library(kreuzberg)
config_json <- jsonlite::toJSON(list(
output_format = "markdown",
force_ocr = TRUE,
extract_tables = TRUE,
extract_metadata = TRUE,
ocr = list(
backend = "tesseract",
language = "eng",
dpi = 300L
),
chunking = list(
chunker_type = "markdown",
max_characters = 1000L,
overlap = 200L
)
), auto_unbox = TRUE)
config <- ExtractionConfig$from_json(config_json)
json <- extract_file_sync(
path = "scanned_report.pdf",
mime_type = "application/pdf",
config = config
)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
cat(sprintf("Chunks: %d\n", length(result$chunks)))
cat(sprintf("Tables: %d\n", length(result$tables)))
title <- if (!is.null(result$metadata$title)) result$metadata$title else "<none>"
cat(sprintf("Title: %s\n", title))
```

View File

@@ -0,0 +1,26 @@
```r title="R"
library(kreuzberg)
content <- charToRaw("Hello, world!")
result <- tryCatch(
{
json <- extract_bytes_sync(
content = content,
mime_type = "application/x-nonexistent",
config = ExtractionConfig$default()
)
jsonlite::fromJSON(json, simplifyVector = FALSE)
},
error = function(e) {
message(sprintf("Extraction failed: %s", conditionMessage(e)))
NULL
}
)
if (is.null(result)) {
cat("No content extracted; falling back to original bytes\n")
} else {
cat(sprintf("Extracted %d characters\n", nchar(result$content)))
}
```

View File

@@ -0,0 +1,35 @@
```r title="R"
library(kreuzberg)
items <- jsonlite::toJSON(list(
list(path = "doc1.pdf"),
list(path = "doc2.docx"),
list(path = "missing.html")
), auto_unbox = TRUE)
result <- tryCatch(
{
json <- batch_extract_files_sync(items = items, config = ExtractionConfig$default())
jsonlite::fromJSON(json, simplifyVector = FALSE)
},
error = function(e) {
message(sprintf("Batch extraction failed: %s", conditionMessage(e)))
NULL
}
)
if (is.null(result)) {
cat("No results returned\n")
} else {
for (i in seq_along(result)) {
item <- result[[i]]
err <- item$metadata$error
if (!is.null(err)) {
cat(sprintf("Document %d: ERROR - %s\n", i, err))
} else {
cat(sprintf("Document %d: %d chars, %d tables\n",
i, nchar(item$content), length(item$tables)))
}
}
}
```

View File

@@ -0,0 +1,18 @@
```r title="R"
library(kreuzberg)
# extract_bytes is the async variant; the call blocks the calling R thread
# until the underlying tokio task completes. Use future/promises if you need
# to fan out without blocking.
path <- "document.pdf"
content <- readBin(path, what = "raw", n = file.info(path)$size)
json <- extract_bytes(
content = content,
mime_type = "application/pdf",
config = ExtractionConfig$default()
)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
cat(sprintf("Extracted %d characters\n", nchar(result$content)))
```

View File

@@ -0,0 +1,16 @@
```r title="R"
library(kreuzberg)
path <- "document.pdf"
content <- readBin(path, what = "raw", n = file.info(path)$size)
json <- extract_bytes_sync(
content = content,
mime_type = "application/pdf",
config = ExtractionConfig$default()
)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
cat(sprintf("MIME type: %s\n", result$mime_type))
cat(sprintf("Content preview: %s\n", substr(result$content, 1, 200)))
```

View File

@@ -0,0 +1,15 @@
```r title="R"
library(kreuzberg)
# extract_file is the async variant; extendr drives the tokio runtime so the
# call returns once extraction completes. R has no native async, so wrap with
# the future/promises packages if non-blocking dispatch is required.
json <- extract_file(
path = "document.pdf",
mime_type = "application/pdf",
config = ExtractionConfig$default()
)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
cat(sprintf("Extracted %d characters from %s\n", nchar(result$content), result$mime_type))
```

View File

@@ -0,0 +1,13 @@
```r title="R"
library(kreuzberg)
# Blocking extraction with the default configuration; the JSON reply is parsed
# into a nested list before its fields are read.
default_config <- ExtractionConfig$default()
raw_json <- extract_file_sync(
  path = "document.pdf",
  mime_type = "application/pdf",
  config = default_config
)
parsed <- jsonlite::fromJSON(raw_json, simplifyVector = FALSE)
cat(
  sprintf("MIME type: %s\n", parsed$mime_type),
  sprintf("Content length: %d characters\n", nchar(parsed$content)),
  sep = ""
)
```

View File

@@ -0,0 +1,18 @@
```r title="R"
library(kreuzberg)
config <- list(
ocr = list(backend = "tesseract", language = "eng"),
chunking = list(max_characters = 1500L, overlap = 300L),
output_format = "markdown",
include_document_structure = TRUE,
force_ocr = TRUE
)
json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
cat(sprintf("Format: %s\n", result$mime_type))
cat(sprintf("Chunks: %d\n", length(result$chunks)))
cat(sprintf("Content preview: %.50s...\n", result$content))
```

View File

@@ -0,0 +1,50 @@
```r title="R"
library(kreuzberg)
# Example 1: Basic character-based chunking
config <- list(
chunking = list(max_characters = 1000L, overlap = 200L)
)
json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
num_chunks <- length(result$chunks)
cat(sprintf("Document split into %d chunks\n", num_chunks))
for (i in seq_len(min(3L, num_chunks))) {
cat(sprintf("Chunk %d: %d characters\n", i, nchar(result$chunks[[i]])))
}
```
```r title="R - Markdown chunker with token-based sizing"
library(kreuzberg)
config <- list(
chunking = list(
chunker_type = "markdown",
sizing = list(
type = "tokenizer",
model = "Xenova/gpt-4o"
)
)
)
json <- extract_file_sync("document.md", "text/markdown", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
cat(sprintf("Markdown document split into %d chunks\n", length(result$chunks)))
```
```r title="R - Prepend heading context"
library(kreuzberg)
config <- list(
chunking = list(
chunker_type = "markdown",
prepend_heading_context = TRUE
)
)
json <- extract_file_sync("document.md", "text/markdown", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
cat(sprintf("Document split into %d chunks with prepended headings\n", length(result$chunks)))
```

View File

@@ -0,0 +1,33 @@
```r
library(kreuzberg)
# Configure text chunking for RAG pipelines
config <- list(
chunking = list(
max_characters = 1000L,
overlap = 200L
)
)
json <- extract_file_sync("large_document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
cat("Number of chunks:", length(result$chunks), "\n")
```
```r title="R - Prepend Heading Context"
library(kreuzberg)
# Prepend heading context to chunk content for structured documents
config <- list(
chunking = list(
chunker_type = "markdown",
max_characters = 500L,
overlap = 50L,
prepend_heading_context = TRUE
)
)
json <- extract_file_sync("document.md", "text/markdown", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
cat("Number of chunks:", length(result$chunks), "\n")
```

View File

@@ -0,0 +1,15 @@
```r title="R"
library(kreuzberg)
config <- list(
output_format = "markdown"
)
json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
cat(sprintf("MIME type: %s\n", result$mime_type))
cat(sprintf("Content length: %d characters\n", nchar(result$content)))
cat("Content preview:\n")
cat(substr(result$content, 1, 200))
```

View File

@@ -0,0 +1,11 @@
```r title="R"
library(kreuzberg)
# Load configuration from a JSON file and pass it to extract_file_sync.
config_json <- paste(readLines("kreuzberg.json"), collapse = "\n")
config <- ExtractionConfig$from_json(config_json)
json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
cat(sprintf("Extracted %d characters\n", nchar(result$content)))
```

View File

@@ -0,0 +1,14 @@
```r title="R"
library(kreuzberg)
config <- list(
  force_ocr = TRUE,
  ocr = list(backend = "tesseract", language = "eng")
)
json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
cat(sprintf("Extracted content length: %d\n", nchar(result$content)))
# This config does not turn language detection on, so the field may well be
# NULL. sprintf() returns character(0) for a NULL argument and the line would
# vanish silently; print an explicit placeholder instead.
detected <- unlist(result$detected_language)
language_label <- if (length(detected) > 0) {
  paste(detected, collapse = ", ")
} else {
  "<none>"
}
cat(sprintf("Detected language: %s\n", language_label))
```

View File

@@ -0,0 +1,20 @@
```r title="R"
library(kreuzberg)
config <- list(
force_ocr = TRUE,
ocr = list(
backend = "tesseract",
language = "eng"
),
chunking = list(
max_characters = 2000L,
overlap = 300L
),
output_format = "markdown"
)
json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
cat(sprintf("MIME type: %s\n", result$mime_type))
```

View File

@@ -0,0 +1,21 @@
```r title="R"
library(kreuzberg)
config <- list(
include_document_structure = TRUE,
output_format = "markdown"
)
json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
cat(sprintf("Total pages: %d\n", length(result$pages)))
cat(sprintf("MIME type: %s\n\n", result$mime_type))
for (i in seq_along(result$pages)) {
page <- result$pages[[i]]
cat(sprintf("Page %d structure:\n", i))
cat(sprintf(" Content: %s\n", substr(page$content, 1, 100)))
cat("\n")
}
```

View File

@@ -0,0 +1,20 @@
```r title="R"
library(kreuzberg)
config <- list(
result_format = "element_based",
output_format = "markdown"
)
json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
cat(sprintf("Total elements: %d\n\n", length(result$elements)))
for (i in seq_along(result$elements)) {
element <- result$elements[[i]]
cat(sprintf("Element %d:\n", i))
cat(sprintf(" Type: %s\n", element$element_type))
cat(sprintf(" Content: %s\n\n", substr(element$content, 1, 100)))
}
```

View File

@@ -0,0 +1,20 @@
```r title="R"
library(kreuzberg)
config <- list(
chunking = list(
max_characters = 1000L,
overlap = 200L,
embedding = list(
model = list(type = "preset", name = "balanced"),
batch_size = 16L,
normalize = TRUE,
show_download_progress = TRUE
)
)
)
json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
cat(sprintf("Chunks with embeddings: %d\n", length(result$chunks)))
```

View File

@@ -0,0 +1,15 @@
```r title="R"
library(kreuzberg)
config <- list(
output_format = "html",
html_output = list(
theme = "git_hub",
embed_css = TRUE
)
)
json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
cat(result$content) # HTML with kb-* classes
```

View File

@@ -0,0 +1,17 @@
```r title="R"
library(kreuzberg)
config <- list(
keywords = list(enabled = TRUE)
)
json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
cat(sprintf("Extracted %d keywords\n", length(result$keywords)))
if (length(result$keywords) > 0) {
for (i in seq_len(min(5L, length(result$keywords)))) {
cat(sprintf(" - %s\n", result$keywords[[i]]))
}
}
```

View File

@@ -0,0 +1,13 @@
```r title="R"
library(kreuzberg)
config <- list(
  language_detection = list(enabled = TRUE)
)
json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
# Flatten the field and guard against NULL: sprintf() prints nothing for a
# NULL argument and emits one line per element for a multi-valued one.
detected <- unlist(result$detected_language)
language_label <- if (length(detected) > 0) {
  paste(detected, collapse = ", ")
} else {
  "<none>"
}
cat(sprintf("Detected language: %s\n", language_label))
cat(sprintf("Content preview: %.60s...\n", result$content))
```

View File

@@ -0,0 +1,16 @@
```r
library(kreuzberg)
# Configure OCR with Tesseract
config <- list(
force_ocr = TRUE,
ocr = list(
backend = "tesseract",
language = "eng+deu"
)
)
json <- extract_file_sync("scanned_document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
cat(result$content)
```

View File

@@ -0,0 +1,16 @@
```r title="R"
library(kreuzberg)
# Tesseract OCR via the kreuzberg R bindings does not expose a DPI setting in
# the high-level config; PDF rasterization DPI is determined by the pipeline.
# This example demonstrates running Tesseract OCR end-to-end on a PDF.
config <- list(
force_ocr = TRUE,
ocr = list(backend = "tesseract", language = "eng")
)
json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
cat(sprintf("Characters extracted: %d\n", nchar(result$content)))
```

View File

@@ -0,0 +1,13 @@
```r title="R"
library(kreuzberg)
config <- list(
pdf_options = list(extract_images = TRUE, extract_metadata = TRUE)
)
json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
cat(sprintf("Tables extracted: %d\n", length(result$tables)))
cat(sprintf("Content preview: %.50s...\n", result$content))
```

View File

@@ -0,0 +1,19 @@
```r title="R"
library(kreuzberg)
config <- list(
pdf_options = list(
extract_metadata = TRUE,
hierarchy = list(
enabled = TRUE,
k_clusters = 6L,
include_bbox = TRUE,
ocr_coverage_threshold = 0.8
)
)
)
json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
cat(sprintf("Pages: %d\n", length(result$pages)))
```

View File

@@ -0,0 +1,13 @@
```r title="R"
library(kreuzberg)
config <- list(
postprocessor = list(enabled = TRUE)
)
json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
cat(sprintf("Content length: %d characters\n", nchar(result$content)))
cat(sprintf("Mime type: %s\n", result$mime_type))
```

View File

@@ -0,0 +1,11 @@
```r title="R"
library(kreuzberg)
config <- list(enable_quality_processing = TRUE)
json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
cat(sprintf("Quality score: %.2f\n", result$quality_score))
cat(sprintf("Content length: %d characters\n", nchar(result$content)))
```

View File

@@ -0,0 +1,17 @@
```r title="R"
library(kreuzberg)
config <- list(
  force_ocr = TRUE,
  ocr = list(
    backend = "tesseract",
    language = "eng+deu"
  )
)
json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
# The config only selects OCR languages, so a detected language may be absent.
# sprintf() on NULL prints nothing, hence the explicit placeholder.
detected <- unlist(result$detected_language)
language_label <- if (length(detected) > 0) {
  paste(detected, collapse = ", ")
} else {
  "<none>"
}
cat(sprintf("Detected language: %s\n", language_label))
cat(sprintf("Content length: %d characters\n", nchar(result$content)))
```

View File

@@ -0,0 +1,13 @@
```r title="R"
library(kreuzberg)
config <- list(
token_reduction = list(enabled = TRUE)
)
json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
cat(sprintf("Original content length: %d characters\n", nchar(result$content)))
cat(sprintf("Content preview: %.60s...\n", result$content))
```

View File

@@ -0,0 +1,7 @@
```r
library(kreuzberg)
# Extract text from a PDF file.
# Same call shape as the other extract_file_sync() snippets in these docs: the
# function's return value is treated as a JSON string and parsed before the
# `content` field is read.
json <- extract_file_sync(
  path = "document.pdf",
  mime_type = NULL,
  config = ExtractionConfig$default()
)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
cat(result$content)
```

View File

@@ -0,0 +1,15 @@
```r title="R"
library(kreuzberg)
config <- ExtractionConfig$default()
json <- extract_file_sync(
path = "document.pdf",
mime_type = NULL,
config = config
)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
cat(result$content)
cat(sprintf("\nMIME Type: %s\n", result$mime_type))
```

View File

@@ -0,0 +1,14 @@
```r title="R"
library(kreuzberg)
# Extract a file and inspect the result
result <- extract_file_sync("document.pdf")
# Print result information
cat(sprintf("MIME type: %s\n", mime_type(result)))
cat(sprintf("Content length: %d characters\n", nchar(content(result))))
cat(sprintf("Page count: %d\n", page_count(result)))
# View additional metadata
cat(sprintf("Detected language: %s\n", detected_language(result)))
```

View File

@@ -0,0 +1,19 @@
```r title="R"
library(kreuzberg)
# Configure OCR settings via a plain list mirroring the config JSON.
config <- list(
force_ocr = TRUE,
ocr = list(
backend = "tesseract",
language = "eng"
)
)
# Extract an image file with OCR enabled
json <- extract_file_sync("image.png", "image/png", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
cat("Extracted text from image:\n")
cat(result$content)
```

View File

@@ -0,0 +1,12 @@
```r title="R"
library(kreuzberg)
# Extract a PDF file
result <- extract_file_sync("example.pdf")
# Print a preview of the extracted content
content_preview <- substr(content(result), 1L, 200L)
cat("Content preview:\n")
cat(content_preview)
cat("\n...\n")
```

View File

@@ -0,0 +1,7 @@
```r title="R"
library(kreuzberg)
# Smoke test: ask the package for its registered document extractors and
# report how many came back.
registered <- list_document_extractors()
n_registered <- length(registered)
cat(sprintf(
  "kreuzberg ready: %d document extractors registered\n",
  n_registered
))
```

View File

@@ -0,0 +1,7 @@
```r
# Install from source (requires Rust toolchain)
# install.packages("kreuzberg")
# Or install from GitHub
# remotes::install_github("kreuzberg-dev/kreuzberg", subdir = "packages/r")
```

View File

@@ -0,0 +1,20 @@
```r title="R"
library(kreuzberg)
# Extract a document
result <- extract_file_sync("document.docx")
# Access core content fields
cat(sprintf("MIME type: %s\n", mime_type(result)))
cat(sprintf("Content length: %d characters\n", nchar(content(result))))
# Access structured data
cat(sprintf("Number of tables: %d\n", length(result$tables)))
cat(sprintf("Detected language: %s\n", detected_language(result)))
# Access metadata
author <- metadata_field(result, "author")
if (!is.null(author)) {
cat(sprintf("Document author: %s\n", author))
}
```

View File

@@ -0,0 +1,29 @@
<!-- snippet:syntax-only --> Requires network access to the configured LLM provider and a valid API key in the host environment.
```r title="R"
library(kreuzberg)
# JSON Schema describing the object the LLM is asked to return.
schema <- list(
  type = "object",
  properties = list(
    title = list(type = "string"),
    authors = list(type = "array", items = list(type = "string")),
    date = list(type = "string")
  ),
  required = c("title", "authors", "date"),
  additionalProperties = FALSE
)
config <- list(
  structured_extraction = list(
    schema = schema,
    llm = list(model = "openai/gpt-4o-mini"),
    strict = TRUE
  )
)
json <- extract_file_sync("paper.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
# With simplifyVector = FALSE an object that has an array field (`authors`)
# parses to a nested list, which cat() refuses to print. Re-serialise the
# value for display instead of passing it to cat() directly.
cat(
  jsonlite::toJSON(result$structured_output, auto_unbox = TRUE, pretty = TRUE),
  "\n"
)
```

View File

@@ -0,0 +1,25 @@
```r title="R"
# The kreuzberg R bindings ship no MCP client. Drive the kreuzberg CLI's
# stdio MCP transport from R by sending newline-delimited JSON-RPC messages on
# the process's stdin and collecting what it writes to stdout.
#
# pipe() connections in R are one-directional, so a single connection cannot
# both send the request and read the reply. system2() with `input` and
# `stdout = TRUE` covers both directions in one blocking call.
encode_message <- function(message) {
  as.character(jsonlite::toJSON(message, auto_unbox = TRUE))
}
# A zero-length *named* list serialises as {} rather than [].
empty_object <- structure(list(), names = character(0))
messages <- list(
  # MCP expects an initialize handshake before any tool call.
  list(
    jsonrpc = "2.0",
    id = 1L,
    method = "initialize",
    params = list(
      protocolVersion = "2024-11-05",
      capabilities = empty_object,
      clientInfo = list(name = "kreuzberg-r-example", version = "0.0.0")
    )
  ),
  list(jsonrpc = "2.0", method = "notifications/initialized"),
  list(
    jsonrpc = "2.0",
    id = 2L,
    method = "tools/call",
    params = list(
      name = "extract_file",
      arguments = list(
        path = "document.pdf",
        async = TRUE
      )
    )
  )
)
# NOTE(review): stdin reaches EOF right after the last message; confirm the
# server writes the tools/call response before it shuts down.
response_lines <- system2(
  "kreuzberg",
  args = "mcp",
  input = vapply(messages, encode_message, character(1)),
  stdout = TRUE
)
cat(response_lines, sep = "\n")
```
<!-- snippet:syntax-only --> The R bindings have no MCP client; this snippet drives the MCP CLI over stdio. Requires the `jsonlite` package.

View File

@@ -0,0 +1,11 @@
```r title="R"
# MCP is served by the standalone kreuzberg CLI (a Rust binary), not by the R
# bindings. Launch it from the R session that also uses the kreuzberg package
# for in-process extraction; the call blocks until the server process exits.
mcp_exit_code <- system2(
  command = "kreuzberg",
  args = "mcp",
  stdout = "",
  stderr = ""
)
server_failed <- mcp_exit_code != 0L
if (server_failed) {
  stop(sprintf("MCP server exited with status %d", mcp_exit_code))
}
```
<!-- snippet:syntax-only --> The R bindings expose extraction primitives only; MCP transport requires the standalone kreuzberg CLI.

View File

@@ -0,0 +1,23 @@
```r title="R"
library(kreuzberg)
result <- extract_file_sync("document.pdf")
cat("Language Detection Results:\n\n")
cat("Using direct field access:\n")
cat("Detected Language:", result$detected_language, "\n\n")
cat("Using S3 helper function:\n")
lang <- detected_language(result)
cat("Language (via helper):", lang, "\n\n")
cat("Language Information:\n")
# `if (lang == "en")` stops with "argument is of length zero" when nothing was
# detected, so handle the missing case before dispatching on the code.
if (length(lang) == 0L || is.na(lang[[1]])) {
  cat("No language was detected\n")
} else {
  lang_code <- as.character(lang[[1]])
  cat(switch(
    lang_code,
    en = "This is an English document\n",
    es = "This is a Spanish document\n",
    sprintf("This is a %s document\n", lang_code)
  ))
}
```

View File

@@ -0,0 +1,13 @@
```r title="R"
library(kreuzberg)
files <- c("english.pdf", "spanish.pdf", "french.pdf")
config <- list(language_detection = list(enabled = TRUE))
for (file in files) {
  json <- extract_file_sync(file, "application/pdf", config)
  result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
  # sprintf() returns character(0) when an argument is NULL, which would drop
  # the whole line for a file with no detected language. Flatten the field and
  # fall back to a placeholder so every file gets exactly one line.
  detected <- unlist(result$detected_language)
  language_label <- if (length(detected) > 0) {
    paste(detected, collapse = ", ")
  } else {
    "<none>"
  }
  cat(sprintf("%s: detected language = %s\n", file, language_label))
}
```

View File

@@ -0,0 +1,25 @@
```r title="R"
library(kreuzberg)
result <- extract_file_sync("document.pdf")
cat("Detected Language:", result$detected_language, "\n")
cat("Quality Score:", result$quality_score, "\n")
cat("Keywords:", paste(result$keywords, collapse=", "), "\n\n")
cat("Metadata fields:\n")
authors <- metadata_field(result, "authors")
if (!is.null(authors)) {
cat("Authors:", paste(authors, collapse=", "), "\n")
}
created <- metadata_field(result, "created_date")
if (!is.null(created)) {
cat("Created Date:", created, "\n")
}
pages_meta <- metadata_field(result, "page_count")
if (!is.null(pages_meta)) {
cat("Pages:", pages_meta, "\n")
}
```

View File

@@ -0,0 +1,22 @@
```r title="R"
library(kreuzberg)
result <- extract_file_sync("document.pdf")
boundaries <- result$metadata$pages$boundaries
if (!is.null(boundaries) && length(boundaries) > 0L) {
  content_bytes <- charToRaw(result$content)
  # Show at most the first three page boundaries.
  for (i in seq_len(min(3L, length(boundaries)))) {
    boundary <- boundaries[[i]]
    # The offsets are used as a 0-based, end-exclusive byte range. seq_len()
    # gives an empty index when the range is empty, whereas
    # (byte_start + 1):byte_end would count backwards in that case and pick
    # up bytes that lie outside the page.
    n_bytes <- max(0L, boundary$byte_end - boundary$byte_start)
    page_bytes <- content_bytes[boundary$byte_start + seq_len(n_bytes)]
    page_text <- rawToChar(page_bytes)
    preview_end <- min(100L, nchar(page_text))
    cat(sprintf("Page %d:\n", boundary$page_number))
    cat(sprintf(
      " Byte range: %d-%d\n",
      boundary$byte_start, boundary$byte_end
    ))
    cat(sprintf(" Preview: %s...\n", substr(page_text, 1L, preview_end)))
  }
}
```

View File

@@ -0,0 +1,20 @@
```r title="R"
library(kreuzberg)
result <- extract_file_sync("document.pdf")
cat("Total pages:", page_count(result), "\n\n")
for (i in seq_along(result$pages)) {
page <- result$pages[[i]]
cat(sprintf("Page %d:\n", i))
cat(" Elements:", length(page$elements), "\n")
cat(" Text content length:", nchar(page$content), "chars\n")
if (nchar(page$content) > 0L) {
preview <- substr(page$content, 1L, 100L)
cat(sprintf(" Preview: %s...\n", preview))
}
cat("\n")
}
```

View File

@@ -0,0 +1,22 @@
```r title="R"
library(kreuzberg)
result <- extract_file_sync("spreadsheet.xlsx")
cat("Tables extracted:", length(result$tables), "\n\n")
for (i in seq_along(result$tables)) {
table <- result$tables[[i]]
cat(sprintf("Table %d:\n", i))
cat(" Rows:", nrow(table), "\n")
cat(" Columns:", ncol(table), "\n")
cat(" Column names:", paste(colnames(table), collapse=", "), "\n")
cat("\n")
if (nrow(table) > 0L) {
cat(" Preview (first 3 rows):\n")
print(head(table, 3L))
cat("\n")
}
}
```

View File

@@ -0,0 +1,24 @@
```r title="R"
library(kreuzberg)

config <- list(
  chunking = list(max_characters = 1000L, overlap = 200L)
)

json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

# Build one vector-database record per chunk (first three only) and keep the
# records in a list so they can be handed to a database client afterwards.
preview_indices <- seq_len(min(3L, length(result$chunks)))
vector_docs <- lapply(preview_indices, function(i) {
  # Each chunk is a record; its text lives in the `content` field.
  chunk_text <- result$chunks[[i]]$content
  list(
    id = sprintf("doc_%d", i),
    text = chunk_text,
    metadata = list(
      source = "document.pdf",
      chunk_index = i,
      length = nchar(chunk_text)
    )
  )
})

for (doc in vector_docs) {
  cat(sprintf(
    "Vector DB entry %d: %d chars\n",
    doc$metadata$chunk_index, doc$metadata$length
  ))
}
```

View File

@@ -0,0 +1,19 @@
```r title="R"
library(kreuzberg)

# Stand-in OCR backend: logs the image path it was given and returns a
# placeholder string. The `language` argument is accepted but not used.
custom_ocr_backend <- function(image_path, language) {
  cat(sprintf("Processing image: %s\n", image_path))
  sprintf("Extracted text from %s", image_path)
}

register_ocr_backend("custom_cloud", custom_ocr_backend)

# Force OCR and select the backend by the name it was registered under.
ocr_settings <- list(backend = "custom_cloud", language = "en")
config <- list(force_ocr = TRUE, ocr = ocr_settings)

payload <- extract_file_sync("document.pdf", "application/pdf", config)
extraction <- jsonlite::fromJSON(payload, simplifyVector = FALSE)
cat(sprintf("Custom backend result: %d chars\n", nchar(extraction$content)))
```

View File

@@ -0,0 +1,16 @@
```r title="R"
library(kreuzberg)

config <- list(
  force_ocr = TRUE,
  ocr = list(backend = "tesseract", language = "eng")
)

json <- extract_file_sync("scan.png", "image/png", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

# sprintf() returns character(0) when handed NULL, which would make cat()
# print nothing at all, so substitute a placeholder when the result carries
# no detected language.
detected_language <- result$detected_language
if (is.null(detected_language)) {
  detected_language <- "unknown"
}

cat("Image extraction via OCR:\n")
cat(sprintf("Content length: %d characters\n", nchar(result$content)))
cat(sprintf("Mime type: %s\n", result$mime_type))
cat(sprintf("Detected language: %s\n", detected_language))
```

View File

@@ -0,0 +1,16 @@
```r title="R"
library(kreuzberg)

# Force Tesseract OCR with quality processing switched on.
ocr_options <- list(backend = "tesseract", language = "eng")
config <- list(
  force_ocr = TRUE,
  ocr = ocr_options,
  enable_quality_processing = TRUE
)

payload <- extract_file_sync("scan.png", "image/png", config)
extraction <- jsonlite::fromJSON(payload, simplifyVector = FALSE)

# Fall back to 0 when the result has no quality score.
quality <- extraction$quality_score %||% 0
content_chars <- nchar(extraction$content)
cat(sprintf("Quality: %.2f, Length: %d\n", quality, content_chars))
```

View File

@@ -0,0 +1,16 @@
```r title="R"
library(kreuzberg)

# Note: EasyOCR backend requires Python to be installed
config <- list(
  force_ocr = TRUE,
  ocr = list(backend = "easyocr", language = "en")
)

json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

# sprintf() returns character(0) for a NULL argument, so without this guard
# the "Detected language" line would silently vanish when the field is
# absent.
detected_language <- result$detected_language
if (is.null(detected_language)) {
  detected_language <- "unknown"
}

cat("EasyOCR extraction:\n")
cat(sprintf("Content length: %d characters\n", nchar(result$content)))
cat(sprintf("Detected language: %s\n", detected_language))
```

View File

@@ -0,0 +1,23 @@
```r title="R"
library(kreuzberg)

# Ask for per-element OCR output in addition to the plain text.
element_options <- list(include_elements = TRUE)
config <- list(
  ocr = list(
    backend = "paddleocr",
    language = "en",
    element_config = element_options
  )
)

payload <- extract_file_sync("scanned.pdf", "application/pdf", config)
extraction <- jsonlite::fromJSON(payload, simplifyVector = FALSE)

# Print the text and recognition confidence of every element, if any were
# returned.
ocr_elements <- extraction$ocr_elements
if (!is.null(ocr_elements)) {
  for (item in ocr_elements) {
    cat(sprintf("Text: %s\n", item$text))
    cat(sprintf("Confidence: %.2f\n", item$confidence$recognition))
    cat("\n")
  }
}
```

View File

@@ -0,0 +1,17 @@
```r title="R"
library(kreuzberg)

# OCR settings: always run OCR, using Tesseract with the English model.
tesseract_options <- list(backend = "tesseract", language = "eng")
config <- list(force_ocr = TRUE, ocr = tesseract_options)

# Run the extraction on a scanned image and decode the JSON reply.
payload <- extract_file_sync("scan.png", "image/png", config)
extraction <- jsonlite::fromJSON(payload, simplifyVector = FALSE)

text <- extraction$content
cat(sprintf("Extracted %d characters\n", nchar(text)))
cat("Content preview:\n")
cat(substr(text, 1L, 200L))
```

View File

@@ -0,0 +1,12 @@
```r title="R"
library(kreuzberg)

config <- list(force_ocr = TRUE)

json <- extract_file_sync("multipage_document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

# sprintf() returns character(0) for a NULL argument, so guard the optional
# field to keep the "Detected language" line from silently disappearing.
detected_language <- result$detected_language
if (is.null(detected_language)) {
  detected_language <- "unknown"
}

cat(sprintf("Total pages: %d\n", length(result$pages)))
cat(sprintf(
  "Content extracted via OCR: %d characters\n",
  nchar(result$content)
))
cat(sprintf("Detected language: %s\n", detected_language))
```

View File

@@ -0,0 +1,18 @@
```r title="R"
library(kreuzberg)

# Configure multi-language OCR (English, French, German)
config <- list(
  force_ocr = TRUE,
  ocr = list(backend = "tesseract", language = "eng+fra+deu")
)

# Extract from a multilingual document
json <- extract_file_sync("multilingual.png", "image/png", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

# sprintf() returns character(0) for a NULL argument, so substitute a
# placeholder when no language is reported; otherwise cat() prints nothing.
detected_language <- result$detected_language
if (is.null(detected_language)) {
  detected_language <- "unknown"
}

cat(sprintf("Detected language: %s\n", detected_language))
cat(sprintf("Extracted %d characters\n", nchar(result$content)))
cat("Content preview:\n")
cat(substr(result$content, 1L, 200L))
```

View File

@@ -0,0 +1,18 @@
```r title="R"
library(kreuzberg)

# Select the PaddleOCR backend (defaults to mobile tier) and force OCR.
paddle_options <- list(backend = "paddle-ocr", language = "en")
config <- list(force_ocr = TRUE, ocr = paddle_options)

# Run the image through PaddleOCR and decode the JSON reply.
payload <- extract_file_sync("document.jpg", "image/jpeg", config)
extraction <- jsonlite::fromJSON(payload, simplifyVector = FALSE)

text <- extraction$content
cat(sprintf("Extracted %d characters\n", nchar(text)))
cat(sprintf("MIME type: %s\n", extraction$mime_type))
cat("Content preview:\n")
cat(substr(text, 1L, 200L))
```

View File

@@ -0,0 +1,19 @@
```r title="R"
library(kreuzberg)
# Empty each plugin registry in turn. Every clear_* call is followed by a
# confirmation line, so the output shows how far the reset got.
# Clear all custom OCR backends
clear_ocr_backends()
cat("OCR backends cleared\n")
# Clear all custom validators
clear_validators()
cat("Validators cleared\n")
# Clear all custom post-processors
clear_post_processors()
cat("Post-processors cleared\n")
# Clear all custom document extractors
clear_document_extractors()
cat("Document extractors cleared\n")
```

View File

@@ -0,0 +1,15 @@
```r
library(kreuzberg)

# Show the OCR backends that can currently be selected.
available_backends <- list_ocr_backends()
cat("Available backends:", toString(available_backends), "\n")

# Show the post-processors that are currently registered.
registered_processors <- list_post_processors()
cat("Post-processors:", toString(registered_processors), "\n")

# Drop the custom post-processors and validators again.
clear_post_processors()
clear_validators()
```

View File

@@ -0,0 +1,29 @@
<!-- snippet:syntax-only -->
```r title="R"
library(kreuzberg)

# Describe a host-side embedder (for example an ONNX session that is already
# loaded) so kreuzberg can call it for chunk embeddings and for standalone
# embed requests.
embedding_dims <- 768L
my_embedder <- list(
  name = "my-embedder",
  version = "1.0.0",
  dimensions = embedding_dims,
  embed = function(texts) {
    # Placeholder for the real model call: one all-zero vector per text.
    lapply(texts, function(text) numeric(embedding_dims))
  }
)
register_embedding_backend(my_embedder)

# Refer to the plugin by the name it was registered under.
plugin_model <- list(type = "plugin", name = "my-embedder")
config <- list(
  embedding = list(model = plugin_model, max_embed_duration_secs = 30L)
)

vectors <- embed_texts(c("Hello, world!", "Second text"), config)
cat(sprintf("Generated %d embedding vectors\n", length(vectors)))
```

View File

@@ -0,0 +1,20 @@
```r title="R"
library(kreuzberg)

# Minimal document extractor: echoes the path and MIME type back as the
# content and reports a single page.
custom_extractor <- function(path, mime_type) {
  list(
    content = sprintf("Extracted from %s (%s)", path, mime_type),
    mime_type = mime_type,
    pages = 1L
  )
}

register_document_extractor("custom_format", custom_extractor)

extraction <- extract_file_sync(
  "custom_document.xyz", "application/custom", NULL
)
cat("Custom extractor result:\n")
cat(sprintf("Content: %s\n", extraction$content))
cat(sprintf("Mime type: %s\n", extraction$mime_type))
```

View File

@@ -0,0 +1,15 @@
```r title="R"
library(kreuzberg)

# Print the names held in each plugin registry as a comma-separated list.
backend_names <- list_ocr_backends()
cat(sprintf("OCR backends: %s\n", toString(backend_names)))

validator_names <- list_validators()
cat(sprintf("Validators: %s\n", toString(validator_names)))

processor_names <- list_post_processors()
cat(sprintf("Post-processors: %s\n", toString(processor_names)))

extractor_names <- list_document_extractors()
cat(sprintf("Document extractors: %s\n", toString(extractor_names)))
```

View File

@@ -0,0 +1,27 @@
<!-- snippet:syntax-only -->
```r title="R"
library(kreuzberg)
# Validator that rejects results whose content is shorter than 50 characters.
#
# `result` is the extraction result; only its `content` field is read.
# Returns a list with `valid` (TRUE/FALSE) and a human-readable `message`.
min_length_validator <- function(result) {
  min_length <- 50L

  # Treat a missing content field as empty. nchar(NULL) is integer(0), which
  # would make the comparison below zero-length and abort the `if`.
  content <- result$content
  content_length <- if (is.null(content)) 0L else nchar(content)

  if (content_length < min_length) {
    return(list(
      valid = FALSE,
      message = sprintf(
        "Content too short: %d < %d characters",
        content_length, min_length
      )
    ))
  }

  list(valid = TRUE, message = "Content length validation passed")
}
# Register the validator under the name "min_length", then run an extraction
# with the default configuration and report the content length.
register_validator("min_length", min_length_validator)

default_config <- ExtractionConfig$default()
payload <- extract_file_sync("document.pdf", "application/pdf", default_config)
extraction <- jsonlite::fromJSON(payload, simplifyVector = FALSE)
cat(sprintf("Content length: %d characters\n", nchar(extraction$content)))
```

View File

@@ -0,0 +1,22 @@
```r title="R"
library(kreuzberg)
# Post-processor that prints every metadata entry of a result and returns the
# result unchanged.
#
# `result` is the extraction result; only its `metadata` field is read.
extract_pdf_metadata <- function(result) {
  metadata <- result$metadata

  if (!is.null(metadata)) {
    cat("PDF Metadata:\n")
    for (key in names(metadata)) {
      # A value may hold several items or be nested. Flatten it to a single
      # comma-separated string so each key prints on exactly one line
      # instead of failing or repeating the key.
      value <- toString(unlist(metadata[[key]]))
      cat(sprintf(" %s: %s\n", key, value))
    }
  }

  result
}
# Register the post-processor as "pdf_metadata" and run an extraction with
# post-processing enabled.
register_post_processor("pdf_metadata", extract_pdf_metadata)

postprocessing_config <- list(postprocessor = list(enabled = TRUE))
payload <- extract_file_sync(
  "document.pdf", "application/pdf", postprocessing_config
)
extraction <- jsonlite::fromJSON(payload, simplifyVector = FALSE)
cat(sprintf("Extraction complete: %d characters\n", nchar(extraction$content)))
```

View File

@@ -0,0 +1,21 @@
<!-- snippet:syntax-only -->
```r title="R"
library(kreuzberg)

# Post-processor skeleton that only acts on PDF results. The result is
# returned unchanged in both cases; the branch marks where PDF-specific work
# would go.
pdf_only_processor <- function(result) {
  mime <- result$mime_type
  is_pdf <- !is.null(mime) && mime == "application/pdf"
  if (is_pdf) {
    # PDF-specific processing goes here.
  }
  result
}

register_post_processor("pdf_only", pdf_only_processor)

postprocessing_config <- list(postprocessor = list(enabled = TRUE))
payload <- extract_file_sync(
  "document.pdf", "application/pdf", postprocessing_config
)
extraction <- jsonlite::fromJSON(payload, simplifyVector = FALSE)
cat(sprintf("Processed PDF: %d characters\n", nchar(extraction$content)))
```

View File

@@ -0,0 +1,25 @@
<!-- snippet:syntax-only -->
```r title="R"
library(kreuzberg)

# Document extractor for JSON files: reads the file, parses it, and joins all
# leaf values with newlines to form the content. The `mime_type` argument is
# not used; the reported MIME type is always "application/json".
custom_json_extractor <- function(path, mime_type) {
  json_text <- paste(readLines(path, warn = FALSE), collapse = "\n")
  leaf_values <- unlist(jsonlite::fromJSON(json_text))

  list(
    content = paste(leaf_values, collapse = "\n"),
    mime_type = "application/json",
    pages = 1L,
    metadata = list(extractor = "custom-json-extractor")
  )
}

register_document_extractor("custom-json-extractor", custom_json_extractor)

extraction <- extract_file_sync("data.json", "application/json", NULL)
cat(sprintf(
  "Extracted %d characters from JSON\n",
  nchar(extraction$content)
))
```

View File

@@ -0,0 +1,30 @@
<!-- snippet:syntax-only -->
```r title="R"
library(kreuzberg)

# Post-processor that logs the MIME type and content size via message() and
# passes the result through untouched.
logging_processor <- function(result) {
  mime <- result$mime_type %||% "unknown"
  content_chars <- nchar(result$content)
  message(sprintf(
    "[plugin] processing mime=%s content_chars=%d",
    mime, content_chars
  ))
  result
}

# Validator that logs the MIME type and always accepts the result.
logging_validator <- function(result) {
  mime <- result$mime_type %||% "unknown"
  message(sprintf("[plugin] validating mime=%s", mime))
  list(valid = TRUE, message = "ok")
}

register_post_processor("logging_processor", logging_processor)
register_validator("logging_validator", logging_validator)

postprocessing_config <- list(postprocessor = list(enabled = TRUE))
payload <- extract_file_sync(
  "document.pdf", "application/pdf", postprocessing_config
)
extraction <- jsonlite::fromJSON(payload, simplifyVector = FALSE)
cat(sprintf("Done: %d characters\n", nchar(extraction$content)))
```

View File

@@ -0,0 +1,34 @@
<!-- snippet:syntax-only -->
```r title="R"
library(kreuzberg)
library(testthat)

# Post-processor under test: upper-cases the content of a result.
uppercase_processor <- function(result) {
  result$content <- toupper(result$content)
  result
}

# Unit test: call the processor directly on a hand-built result.
test_that("uppercase processor uppercases content", {
  stub <- list(
    content = "hello world",
    mime_type = "text/plain",
    metadata = list()
  )
  expect_equal(uppercase_processor(stub)$content, "HELLO WORLD")
})

# Integration test: register the processor, run a real extraction, and
# unregister it again when the test block exits.
test_that("post processor registers and runs", {
  register_post_processor("uppercase", uppercase_processor)
  on.exit(unregister_post_processor("uppercase"), add = TRUE)

  postprocessing_config <- list(postprocessor = list(enabled = TRUE))
  input_bytes <- charToRaw("hello world")
  payload <- extract_bytes_sync(input_bytes, "text/plain", postprocessing_config)
  extraction <- jsonlite::fromJSON(payload, simplifyVector = FALSE)

  expect_match(extraction$content, "HELLO WORLD", fixed = TRUE)
})
```

View File

@@ -0,0 +1,23 @@
```r title="R"
library(kreuzberg)
# Validator that rejects results whose content is shorter than 100
# characters.
#
# `result` is the extraction result; only its `content` field is read.
# Returns a list with `valid` (TRUE/FALSE) and a human-readable `message`.
min_content_validator <- function(result) {
  min_length <- 100L

  # Treat a missing content field as empty. nchar(NULL) is integer(0), which
  # would make the comparison below zero-length and abort the `if`.
  content <- result$content
  content_length <- if (is.null(content)) 0L else nchar(content)

  if (content_length < min_length) {
    return(list(
      valid = FALSE,
      message = sprintf(
        "Content too short: %d < %d",
        content_length, min_length
      )
    ))
  }

  list(valid = TRUE, message = "Content validation passed")
}
# Register the validator under the name "min_content", then extract a
# document with the default configuration and report the content length.
register_validator("min_content", min_content_validator)

default_config <- ExtractionConfig$default()
payload <- extract_file_sync("document.pdf", "application/pdf", default_config)
extraction <- jsonlite::fromJSON(payload, simplifyVector = FALSE)
cat(sprintf("Content length: %d characters\n", nchar(extraction$content)))
```

View File

@@ -0,0 +1,29 @@
<!-- snippet:syntax-only -->
```r title="R"
library(kreuzberg)

# Validator that accepts a result only when its metadata quality score is at
# least 0.5. A missing score is treated as 0.
quality_score_validator <- function(result) {
  min_score <- 0.5
  score <- as.numeric(result$metadata$quality_score %||% 0)

  if (score >= min_score) {
    return(list(valid = TRUE, message = "Quality score validation passed"))
  }

  list(
    valid = FALSE,
    message = sprintf(
      "Quality score too low: %.2f < %.2f",
      score, min_score
    )
  )
}

register_validator("quality_score", quality_score_validator)

default_config <- ExtractionConfig$default()
payload <- extract_file_sync("document.pdf", "application/pdf", default_config)
extraction <- jsonlite::fromJSON(payload, simplifyVector = FALSE)
cat(sprintf(
  "Validated extraction: %d characters\n",
  nchar(extraction$content)
))
```

View File

@@ -0,0 +1,27 @@
<!-- snippet:syntax-only -->
```r title="R"
library(kreuzberg)

# Build a post-processor that counts how often it has been called. The
# counter lives in an environment, which the closure can modify in place
# across calls.
#
# Returns a list with `process` (the post-processor function) and `count`
# (a function returning the current call count).
make_stateful_plugin <- function() {
  state <- new.env(parent = emptyenv())
  state$count <- 0L

  process <- function(result) {
    state$count <- state$count + 1L
    result
  }

  list(process = process, count = function() state$count)
}

plugin <- make_stateful_plugin()
register_post_processor("stateful_counter", plugin$process)

config <- list(postprocessor = list(enabled = TRUE))

# Only the side effect on the counter matters here. invisible() keeps the
# unused return value from being auto-printed when the script is run at top
# level (for example with Rscript).
invisible(extract_file_sync("document.pdf", "application/pdf", config))

cat(sprintf("Processed: %d\n", plugin$count()))
```

View File

@@ -0,0 +1,11 @@
<!-- snippet:syntax-only -->
```r title="R"
library(kreuzberg)
# Remove plugins by their registered name.
# Each plugin kind has its own unregister_* function; the string argument is
# the name of the plugin to remove.
unregister_post_processor("metadata_enrichment")
unregister_validator("min_length")
unregister_ocr_backend("custom_ocr_backend")
unregister_document_extractor("custom_format")
```

View File

@@ -0,0 +1,20 @@
<!-- snippet:syntax-only -->
```r title="R"
library(kreuzberg)
# Post-processor that appends a `word_count` entry to the result's metadata.
#
# `result` is the extraction result; its `content` is split on runs of
# whitespace. Returns the result with the extended metadata.
word_count_processor <- function(result) {
  tokens <- strsplit(result$content, "\\s+")[[1]]
  # Content that starts with whitespace (or is only whitespace) produces an
  # empty first token; count only non-empty tokens so it is not taken for a
  # word.
  word_count <- sum(nzchar(tokens))

  result$metadata <- c(result$metadata, list(word_count = word_count))
  result
}
# Register the processor as "word_count", run an extraction with
# post-processing enabled, and print the count it stored in the metadata.
register_post_processor("word_count", word_count_processor)

postprocessing_config <- list(postprocessor = list(enabled = TRUE))
payload <- extract_file_sync(
  "document.pdf", "application/pdf", postprocessing_config
)
extraction <- jsonlite::fromJSON(payload, simplifyVector = FALSE)
cat(sprintf("Word count: %d\n", extraction$metadata$word_count))
```

View File

@@ -0,0 +1,15 @@
```r title="R"
library(kreuzberg)

config <- list(
  chunking = list(max_characters = 1000L, overlap = 200L)
)

json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

chunks <- result$chunks
cat(sprintf("Total chunks: %d\n", length(chunks)))

# Report the size of the first five chunks. Each chunk is a record, so
# measure its `content` field rather than the record itself.
for (i in seq_len(min(5L, length(chunks)))) {
  cat(sprintf("Chunk %d: %d characters\n", i, nchar(chunks[[i]]$content)))
}
```

View File

@@ -0,0 +1,25 @@
```r title="R"
library(kreuzberg)

# Chunk the document and request an embedding for every chunk using the
# "balanced" preset, with normalization turned on.
embedding_options <- list(
  model = list(type = "preset", name = "balanced"),
  normalize = TRUE
)
config <- list(
  chunking = list(
    max_characters = 500L,
    overlap = 50L,
    embedding = embedding_options
  )
)

payload <- extract_file_sync("research_paper.pdf", "application/pdf", config)
extraction <- jsonlite::fromJSON(payload, simplifyVector = FALSE)

# Print a progress line per chunk and, where present, the embedding size.
chunks <- extraction$chunks
total_chunks <- length(chunks)
for (chunk_no in seq_along(chunks)) {
  current_chunk <- chunks[[chunk_no]]
  cat(sprintf("Chunk %d/%d\n", chunk_no, total_chunks))
  if (!is.null(current_chunk$embedding)) {
    cat(sprintf(
      " Embedding: %d dimensions\n",
      length(current_chunk$embedding)
    ))
  }
}
```

View File

@@ -0,0 +1,20 @@
```r title="R"
library(kreuzberg)

# Embedding settings: "balanced" preset, normalization on, batches of 32.
embedding_options <- list(
  model = list(type = "preset", name = "balanced"),
  normalize = TRUE,
  batch_size = 32L
)
config <- list(
  chunking = list(
    max_characters = 1024L,
    overlap = 100L,
    embedding = embedding_options
  )
)

payload <- extract_file_sync("document.pdf", "application/pdf", config)
extraction <- jsonlite::fromJSON(payload, simplifyVector = FALSE)
cat(sprintf("Chunks with embeddings: %d\n", length(extraction$chunks)))
```

View File

@@ -0,0 +1,21 @@
```r title="R"
library(kreuzberg)

# Keyword settings: YAKE algorithm, at most ten keywords, minimum score 0.3.
keyword_options <- list(
  algorithm = "yake",
  max_keywords = 10L,
  min_score = 0.3
)
config <- list(keywords = keyword_options)

payload <- extract_file_sync("research_paper.pdf", "application/pdf", config)
extraction <- jsonlite::fromJSON(payload, simplifyVector = FALSE)
cat(sprintf("Content length: %d characters\n", nchar(extraction$content)))

# List the keywords stored in the metadata, if there are any.
keywords <- extraction$metadata$keywords
if (!is.null(keywords)) {
  for (keyword in keywords) {
    cat(sprintf(" - %s\n", keyword))
  }
}
```

View File

@@ -0,0 +1,15 @@
```r title="R"
library(kreuzberg)

# Extract with quality processing switched on.
config <- list(enable_quality_processing = TRUE)
payload <- extract_file_sync("scanned_document.pdf", "application/pdf", config)
extraction <- jsonlite::fromJSON(payload, simplifyVector = FALSE)
cat(sprintf("Content length: %d characters\n", nchar(extraction$content)))

# Print the quality score when one is present and warn when it is below 0.5.
quality <- extraction$quality_score
if (!is.null(quality)) {
  cat(sprintf("Quality score: %.2f\n", quality))
  if (quality < 0.5) {
    cat("Warning: low quality extraction\n")
  }
}
```

View File

@@ -0,0 +1,15 @@
```r title="R"
library(kreuzberg)

# Embed two strings directly, without extracting a document first.
embedding_config <- list(
  model = list(type = "preset", name = "balanced"),
  normalize = TRUE
)
texts <- c("Hello, world!", "Kreuzberg is fast")
embeddings <- embed_texts(texts, embedding_config)

# Expect exactly one embedding per input text.
stopifnot(length(embeddings) == 2L)

for (idx in seq_along(embeddings)) {
  cat(sprintf(
    "Embedding %d: %d dimensions\n",
    idx, length(embeddings[[idx]])
  ))
}
```

View File

@@ -0,0 +1,15 @@
```r title="R"
library(kreuzberg)

# Token reduction in "moderate" mode, keeping important words.
reduction_options <- list(
  mode = "moderate",
  preserve_important_words = TRUE
)
config <- list(token_reduction = reduction_options)

payload <- extract_file_sync("document.pdf", "application/pdf", config)
extraction <- jsonlite::fromJSON(payload, simplifyVector = FALSE)

# Print the extracted content as-is.
cat(extraction$content)
```

View File

@@ -0,0 +1,16 @@
```r title="R"
library(kreuzberg)

# Token reduction in "moderate" mode, keeping important words.
reduction_options <- list(
  mode = "moderate",
  preserve_important_words = TRUE
)
config <- list(token_reduction = reduction_options)

payload <- extract_file_sync("verbose_document.pdf", "application/pdf", config)
extraction <- jsonlite::fromJSON(payload, simplifyVector = FALSE)

# Report the length of the returned content and the MIME type.
cat(sprintf(
  "Reduced content length: %d characters\n",
  nchar(extraction$content)
))
cat(sprintf("MIME type: %s\n", extraction$mime_type))
```

View File

@@ -0,0 +1,26 @@
```r title="R"
library(kreuzberg)

# Identifier to store alongside every chunk of this document.
document_id <- "doc-001"

# Chunk the document and embed each chunk with the "balanced" preset.
embedding_options <- list(
  model = list(type = "preset", name = "balanced"),
  normalize = TRUE,
  batch_size = 32L
)
config <- list(
  chunking = list(
    max_characters = 512L,
    overlap = 50L,
    embedding = embedding_options
  )
)

payload <- extract_file_sync("document.pdf", "application/pdf", config)
extraction <- jsonlite::fromJSON(payload, simplifyVector = FALSE)

# Every chunk carries $content, $embedding and $metadata. Hand these to a
# vector database client (pgvector, Qdrant, Pinecone, etc.) and store
# document_id as a metadata field.
chunk_total <- length(extraction$chunks)
cat(sprintf("document_id: %s\n", document_id))
cat(sprintf("chunks ready for upsert: %d\n", chunk_total))
```