Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/docs/snippets/r/advanced/batch_extraction.md
+++ b/docs/snippets/r/advanced/batch_extraction.md
@@ -0,0 +1,13 @@
+```r
+library(kreuzberg)
+
+# Extract several documents in one batch call, then report on each result.
+paths <- c("report.pdf", "slides.pptx", "data.xlsx")
+results <- batch_extract_files_sync(paths)
+for (idx in seq_along(results)) {
+  entry <- results[[idx]]
+  cat(sprintf("File: %s\n", paths[idx]))
+  cat(sprintf("  MIME: %s\n", entry$mime_type))
+  cat(sprintf("  Length: %d chars\n\n", nchar(entry$content)))
+}
+```
--- a/docs/snippets/r/advanced/chunk_page_mapping.md
+++ b/docs/snippets/r/advanced/chunk_page_mapping.md
@@ -0,0 +1,27 @@
+```r title="R"
+library(kreuzberg)
+
+# Enable chunking together with page extraction so chunks can carry page spans.
+config <- list(
+  chunking = list(max_characters = 500L, overlap = 50L),
+  pages = list(extract_pages = TRUE)
+)
+
+json <- extract_file_sync("document.pdf", "application/pdf", config)
+result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
+
+for (chunk in result$chunks) {
+  # Each chunk is an object: its text is `content`, its page span is `metadata`.
+  metadata <- chunk$metadata
+  if (is.null(metadata$first_page) || is.null(metadata$last_page)) {
+    next # no page information recorded for this chunk
+  }
+
+  page_range <- if (metadata$first_page == metadata$last_page) {
+    sprintf("Page %d", metadata$first_page)
+  } else {
+    sprintf("Pages %d-%d", metadata$first_page, metadata$last_page)
+  }
+  cat(sprintf("Chunk: %s... (%s)\n", substr(chunk$content, 1L, 50L), page_range))
+}
+```
--- a/docs/snippets/r/advanced/chunking_config.md
+++ b/docs/snippets/r/advanced/chunking_config.md
@@ -0,0 +1,37 @@
+```r title="R"
+library(kreuzberg)
+
+config <- list(
+  chunking = list(max_characters = 1000L, overlap = 200L)
+)
+json <- extract_file_sync("document.pdf", "application/pdf", config)
+result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
+
+cat(sprintf("Chunks produced: %d\n", length(result$chunks)))
+for (i in seq_len(min(3L, length(result$chunks)))) {
+  # A chunk is an object; measure its text, which is stored in `content`.
+  cat(sprintf("Chunk %d length: %d characters\n", i, nchar(result$chunks[[i]]$content)))
+}
+```
+
+```r title="R - Prepend Heading Context"
+library(kreuzberg)
+
+# Markdown chunker configured to prepend heading context to each chunk.
+config <- list(
+  chunking = list(
+    max_characters = 500L,
+    overlap = 50L,
+    chunker_type = "markdown",
+    prepend_heading_context = TRUE
+  )
+)
+json <- extract_file_sync("document.md", "text/markdown", config)
+result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
+
+# Preview the first three chunks; a chunk's text is its `content` field.
+for (chunk in head(result$chunks, 3L)) {
+  # substr() stops at the end of the string, so no length clamp is needed.
+  cat(sprintf("%s\n", substr(chunk$content, 1L, 100L)))
+}
+```
--- a/docs/snippets/r/advanced/chunking_rag.md
+++ b/docs/snippets/r/advanced/chunking_rag.md
@@ -0,0 +1,18 @@
+```r title="R"
+library(kreuzberg)
+
+# Chunking limits used for this RAG example.
+config <- list(
+  chunking = list(max_characters = 800L, overlap = 150L)
+)
+json <- extract_file_sync("document.pdf", "application/pdf", config)
+result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
+
+cat(sprintf("Total chunks: %d\n", length(result$chunks)))
+cat("Processing chunks for RAG pipeline:\n")
+
+for (i in seq_len(min(3L, length(result$chunks)))) {
+  text <- result$chunks[[i]]$content # a chunk is an object; text is `content`
+  cat(sprintf("Chunk %d: %d characters\n", i, nchar(text)))
+}
+```
--- a/docs/snippets/r/advanced/embedding_with_chunking.md
+++ b/docs/snippets/r/advanced/embedding_with_chunking.md
@@ -0,0 +1,23 @@
+```r title="R"
+library(kreuzberg)
+
+config <- list(
+  chunking = list(max_characters = 1000L, overlap = 200L)
+)
+
+json <- extract_file_sync("document.pdf", "application/pdf", config)
+result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
+
+cat(sprintf("Preparing %d chunks for embedding:\n", length(result$chunks)))
+# Build one record per chunk with lapply() instead of growing a list in a loop.
+embeddings_data <- lapply(seq_along(result$chunks), function(i) {
+  text <- result$chunks[[i]]$content # chunk text is the `content` field
+  list(
+    chunk_id = i,
+    text = text,
+    length = nchar(text)
+  )
+})
+
+cat(sprintf("Ready to embed %d chunks\n", length(embeddings_data)))
+```
--- a/docs/snippets/r/advanced/keyword_extraction_config.md
+++ b/docs/snippets/r/advanced/keyword_extraction_config.md
@@ -0,0 +1,18 @@
+```r title="R"
+library(kreuzberg)
+
+# Keyword settings are named first, then placed under the `keywords` key.
+keyword_settings <- list(
+  algorithm = "yake",
+  max_keywords = 10L,
+  min_score = 0.3,
+  ngram_range = c(1L, 3L),
+  language = "en"
+)
+config <- list(keywords = keyword_settings)
+
+doc <- extract_file_sync("document.pdf", "application/pdf", config) |>
+  jsonlite::fromJSON(simplifyVector = FALSE)
+
+cat(sprintf("Keywords extracted: %d\n", length(doc$keywords)))
+```
--- a/docs/snippets/r/advanced/keyword_extraction_example.md
+++ b/docs/snippets/r/advanced/keyword_extraction_example.md
@@ -0,0 +1,19 @@
+```r title="R"
+library(kreuzberg)
+
+config <- list(keywords = list(enabled = TRUE))
+
+doc <- extract_file_sync("document.pdf", "application/pdf", config) |>
+  jsonlite::fromJSON(simplifyVector = FALSE)
+keywords <- doc$keywords
+n_keywords <- length(keywords)
+
+cat(sprintf("Keywords extracted: %d\n", n_keywords))
+if (n_keywords > 0) {
+  cat("Top keywords:\n")
+  # List at most the first ten keywords, numbered from 1.
+  for (rank in seq_len(min(10L, n_keywords))) {
+    cat(sprintf("  %d. %s\n", rank, keywords[[rank]]))
+  }
+}
+```
--- a/docs/snippets/r/advanced/language_detection_config.md
+++ b/docs/snippets/r/advanced/language_detection_config.md
@@ -0,0 +1,22 @@
+```r title="R"
+library(kreuzberg)
+
+# Detection settings are named first, then placed under `language_detection`.
+detection <- list(
+  enabled = TRUE,
+  min_confidence = 0.8,
+  detect_multiple = FALSE
+)
+config <- list(language_detection = detection)
+
+doc <- extract_file_sync("document.pdf", "application/pdf", config) |>
+  jsonlite::fromJSON(simplifyVector = FALSE)
+languages <- doc$detected_languages
+
+if (length(languages) == 0) {
+  cat("No language detected\n")
+} else {
+  cat(sprintf("Detected language: %s\n", languages[[1]]))
+}
+cat(sprintf("Content length: %d characters\n", nchar(doc$content)))
+```
--- a/docs/snippets/r/advanced/language_detection_multilingual.md
+++ b/docs/snippets/r/advanced/language_detection_multilingual.md
@@ -0,0 +1,13 @@
+```r title="R"
+library(kreuzberg)
+
+files <- c("english.pdf", "spanish.pdf", "french.pdf")
+config <- list(language_detection = list(enabled = TRUE))
+for (file in files) {
+  json <- extract_file_sync(file, "application/pdf", config)
+  result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
+  # The result field is the plural `detected_languages` and may be empty.
+  langs <- toString(unlist(result$detected_languages))
+  cat(sprintf("%s: detected language = %s\n", file, if (nzchar(langs)) langs else "<none>"))
+}
+```
--- a/docs/snippets/r/advanced/quality_processing_config.md
+++ b/docs/snippets/r/advanced/quality_processing_config.md
@@ -0,0 +1,10 @@
+```r title="R"
+library(kreuzberg)
+
+# Enable quality processing, then print the score found on the result.
+config <- list(enable_quality_processing = TRUE)
+doc <- extract_file_sync("document.pdf", "application/pdf", config) |>
+  jsonlite::fromJSON(simplifyVector = FALSE)
+
+cat(sprintf("Quality score: %.2f\n", doc$quality_score))
+```
--- a/docs/snippets/r/advanced/quality_processing_example.md
+++ b/docs/snippets/r/advanced/quality_processing_example.md
@@ -0,0 +1,13 @@
+```r title="R"
+library(kreuzberg)
+
+config <- list(enable_quality_processing = TRUE)
+
+parsed <- extract_file_sync("document.pdf", "application/pdf", config) |>
+  jsonlite::fromJSON(simplifyVector = FALSE)
+# Summarise the extraction: score, text size, and page count.
+cat("Quality Metrics:\n")
+cat(sprintf("Quality Score: %.2f\n", parsed$quality_score))
+cat(sprintf("Content Length: %d characters\n", nchar(parsed$content)))
+cat(sprintf("Pages: %d\n", length(parsed$pages)))
+```
--- a/docs/snippets/r/advanced/token_reduction_config.md
+++ b/docs/snippets/r/advanced/token_reduction_config.md
@@ -0,0 +1,17 @@
+```r title="R"
+library(kreuzberg)
+
+reduction <- list(
+  mode = "moderate",
+  preserve_markdown = TRUE,
+  preserve_code = TRUE,
+  language_hint = "eng"
+)
+config <- list(token_reduction = reduction)
+
+doc <- extract_file_sync("document.pdf", "application/pdf", config) |>
+  jsonlite::fromJSON(simplifyVector = FALSE)
+
+# Report the size of the text returned under these token-reduction settings.
+cat(sprintf("Reduced content length: %d characters\n", nchar(doc$content)))
+```
--- a/docs/snippets/r/advanced/token_reduction_example.md
+++ b/docs/snippets/r/advanced/token_reduction_example.md
@@ -0,0 +1,14 @@
+```r title="R"
+library(kreuzberg)
+
+config <- list(token_reduction = list(enabled = TRUE))
+
+reduced <- extract_file_sync("document.pdf", "application/pdf", config) |>
+  jsonlite::fromJSON(simplifyVector = FALSE)
+text <- reduced$content
+
+# Show the size of the text and a preview truncated by the %.60s format.
+cat("Token-reduced content:\n")
+cat(sprintf("Length: %d characters\n", nchar(text)))
+cat(sprintf("Preview: %.60s...\n", text))
+```
--- a/docs/snippets/r/advanced/vector_database_integration.md
+++ b/docs/snippets/r/advanced/vector_database_integration.md
@@ -0,0 +1,24 @@
+```r title="R"
+library(kreuzberg)
+
+config <- list(
+  chunking = list(max_characters = 1000L, overlap = 200L)
+)
+json <- extract_file_sync("document.pdf", "application/pdf", config)
+result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
+
+for (i in seq_len(min(3L, length(result$chunks)))) {
+  text <- result$chunks[[i]]$content # a chunk is an object; text is `content`
+  # Record to hand to a vector store (the upsert call itself is not shown).
+  vector_doc <- list(
+    id = sprintf("doc_%d", i),
+    text = text,
+    metadata = list(
+      source = "document.pdf",
+      chunk_index = i,
+      length = nchar(text)
+    )
+  )
+  cat(sprintf("Vector DB entry %d: %d chars\n", i, nchar(text)))
+}
+```
--- a/docs/snippets/r/api/batch_extract_bytes_sync.md
+++ b/docs/snippets/r/api/batch_extract_bytes_sync.md
@@ -0,0 +1,19 @@
+```r title="R"
+library(kreuzberg)
+
+paths <- c("report.pdf", "notes.txt")
+mimes <- c("application/pdf", "text/plain")
+
+# Pair each file's bytes (as integers) with its MIME type, as an unnamed list.
+payload <- mapply(function(path, mime) {
+  bytes <- readBin(path, what = "raw", n = file.size(path))
+  list(content = as.integer(bytes), mime_type = mime)
+}, paths, mimes, SIMPLIFY = FALSE, USE.NAMES = FALSE)
+items <- jsonlite::toJSON(payload, auto_unbox = TRUE)
+json <- batch_extract_bytes_sync(items = items, config = ExtractionConfig$default())
+results <- jsonlite::fromJSON(json, simplifyVector = FALSE)
+for (k in seq_along(results)) {
+  entry <- results[[k]]
+  cat(sprintf("[%d] mime=%s chars=%d\n", k, entry$mime_type, nchar(entry$content)))
+}
+```
--- a/docs/snippets/r/api/batch_extract_files_sync.md
+++ b/docs/snippets/r/api/batch_extract_files_sync.md
@@ -0,0 +1,17 @@
+```r title="R"
+library(kreuzberg)
+
+# One `list(path = ...)` entry per document, serialised as a JSON array.
+paths <- c("report.pdf", "slides.pptx", "data.xlsx")
+items <- jsonlite::toJSON(lapply(paths, function(p) list(path = p)), auto_unbox = TRUE)
+
+results <- batch_extract_files_sync(items = items, config = ExtractionConfig$default()) |>
+  jsonlite::fromJSON(simplifyVector = FALSE)
+
+for (k in seq_along(results)) {
+  entry <- results[[k]]
+  mime <- entry$mime_type
+  n_chars <- nchar(entry$content)
+  cat(sprintf("[%d] mime=%s chars=%d\n", k, mime, n_chars))
+}
+```
--- a/docs/snippets/r/api/client_chunk_text.md
+++ b/docs/snippets/r/api/client_chunk_text.md
@@ -0,0 +1,29 @@
+<!-- snippet:syntax-only -->
+
+```r title="R"
+library(kreuzberg)
+library(httr2)
+
+# Request body for the /chunk endpoint: the text plus the chunker settings.
+chunk_settings <- list(
+  max_characters = 1000,
+  overlap = 50,
+  trim = TRUE
+)
+payload <- list(
+  text = "Your long text content here...",
+  chunker_type = "text",
+  config = chunk_settings
+)
+
+chunk_request <- request("http://localhost:8000/chunk")
+chunk_request <- req_method(chunk_request, "POST")
+chunk_request <- req_body_json(chunk_request, payload)
+result <- resp_body_json(req_perform(chunk_request))
+
+cat(sprintf("Created %d chunks\n", result$chunk_count))
+# Print the index and the first 50 characters of every returned chunk.
+for (piece in result$chunks) {
+  cat(sprintf("Chunk %d: %s...\n", piece$chunk_index, substr(piece$content, 1, 50)))
+}
+```
--- a/docs/snippets/r/api/client_extract_single_file.md
+++ b/docs/snippets/r/api/client_extract_single_file.md
@@ -0,0 +1,18 @@
+<!-- snippet:syntax-only -->
+
+```r title="R"
+library(kreuzberg)
+library(httr2)
+
+# Upload the PDF as the multipart form field `files`. httr2 builds multipart
+# bodies with req_body_multipart(); file parts come from curl::form_file().
+response <- request("http://localhost:8000/extract") |>
+  req_method("POST") |>
+  req_body_multipart(
+    files = curl::form_file("document.pdf", type = "application/pdf")
+  ) |>
+  req_perform()
+
+data <- resp_body_json(response)
+cat(jsonlite::toJSON(data, auto_unbox = TRUE, pretty = TRUE))
+```
--- a/docs/snippets/r/api/combining_all_features.md
+++ b/docs/snippets/r/api/combining_all_features.md
@@ -0,0 +1,34 @@
+```r title="R"
+library(kreuzberg)
+
+# OCR and chunking settings are named first, then merged into one config.
+ocr_settings <- list(backend = "tesseract", language = "eng", dpi = 300L)
+chunk_settings <- list(
+  chunker_type = "markdown",
+  max_characters = 1000L,
+  overlap = 200L
+)
+settings <- list(
+  output_format = "markdown",
+  force_ocr = TRUE,
+  extract_tables = TRUE,
+  extract_metadata = TRUE,
+  ocr = ocr_settings,
+  chunking = chunk_settings
+)
+config <- ExtractionConfig$from_json(jsonlite::toJSON(settings, auto_unbox = TRUE))
+result <- extract_file_sync(
+  path = "scanned_report.pdf",
+  mime_type = "application/pdf",
+  config = config
+) |>
+  jsonlite::fromJSON(simplifyVector = FALSE)
+
+cat(sprintf("Chunks: %d\n", length(result$chunks)))
+cat(sprintf("Tables: %d\n", length(result$tables)))
+title <- result$metadata$title
+if (is.null(title)) {
+  title <- "<none>"
+}
+cat(sprintf("Title: %s\n", title))
+```
--- a/docs/snippets/r/api/error_handling.md
+++ b/docs/snippets/r/api/error_handling.md
@@ -0,0 +1,26 @@
+```r title="R"
+library(kreuzberg)
+
+content <- charToRaw("Hello, world!")
+
+# Error handler: log the failure and yield NULL so the script can continue.
+report_failure <- function(e) {
+  message(sprintf("Extraction failed: %s", conditionMessage(e)))
+  NULL
+}
+result <- tryCatch(
+  extract_bytes_sync(
+    content = content,
+    mime_type = "application/x-nonexistent",
+    config = ExtractionConfig$default()
+  ) |>
+    jsonlite::fromJSON(simplifyVector = FALSE),
+  error = report_failure
+)
+
+if (!is.null(result)) {
+  cat(sprintf("Extracted %d characters\n", nchar(result$content)))
+} else {
+  cat("No content extracted; falling back to original bytes\n")
+}
+```
--- a/docs/snippets/r/api/error_handling_extract.md
+++ b/docs/snippets/r/api/error_handling_extract.md
@@ -0,0 +1,35 @@
+```r title="R"
+library(kreuzberg)
+
+# Three inputs, one of them named missing.html.
+paths <- c("doc1.pdf", "doc2.docx", "missing.html")
+items <- jsonlite::toJSON(lapply(paths, function(p) list(path = p)), auto_unbox = TRUE)
+
+# A failure of the whole batch call is logged and mapped to NULL.
+on_batch_error <- function(e) {
+  message(sprintf("Batch extraction failed: %s", conditionMessage(e)))
+  NULL
+}
+result <- tryCatch(
+  batch_extract_files_sync(items = items, config = ExtractionConfig$default()) |>
+    jsonlite::fromJSON(simplifyVector = FALSE),
+  error = on_batch_error
+)
+
+if (is.null(result)) {
+  cat("No results returned\n")
+}
+
+# Per document: an error recorded in its metadata, or content statistics.
+for (idx in seq_along(result)) {
+  doc <- result[[idx]]
+  failure <- doc$metadata$error
+  if (is.null(failure)) {
+    n_chars <- nchar(doc$content)
+    n_tables <- length(doc$tables)
+    cat(sprintf("Document %d: %d chars, %d tables\n", idx, n_chars, n_tables))
+  } else {
+    cat(sprintf("Document %d: ERROR - %s\n", idx, failure))
+  }
+}
+```
--- a/docs/snippets/r/api/extract_bytes_async.md
+++ b/docs/snippets/r/api/extract_bytes_async.md
@@ -0,0 +1,18 @@
+```r title="R"
+library(kreuzberg)
+
+# extract_bytes is the async variant, but from R the call still blocks until
+# the underlying tokio task has finished. Reach for future/promises when you
+# need to fan out without blocking.
+pdf_path <- "document.pdf"
+pdf_bytes <- readBin(pdf_path, what = "raw", n = file.size(pdf_path))
+
+parsed <- extract_bytes(
+  content = pdf_bytes,
+  mime_type = "application/pdf",
+  config = ExtractionConfig$default()
+) |>
+  jsonlite::fromJSON(simplifyVector = FALSE)
+
+cat(sprintf("Extracted %d characters\n", nchar(parsed$content)))
+```
--- a/docs/snippets/r/api/extract_bytes_sync.md
+++ b/docs/snippets/r/api/extract_bytes_sync.md
@@ -0,0 +1,16 @@
+```r title="R"
+library(kreuzberg)
+
+# Read the whole file into a raw vector and extract from those bytes.
+source_path <- "document.pdf"
+raw_bytes <- readBin(source_path, what = "raw", n = file.size(source_path))
+default_config <- ExtractionConfig$default()
+json <- extract_bytes_sync(
+  content = raw_bytes, mime_type = "application/pdf", config = default_config
+)
+doc <- jsonlite::fromJSON(json, simplifyVector = FALSE)
+
+preview <- substr(doc$content, 1, 200)
+cat(sprintf("MIME type: %s\n", doc$mime_type))
+cat(sprintf("Content preview: %s\n", preview))
+```
--- a/docs/snippets/r/api/extract_file_async.md
+++ b/docs/snippets/r/api/extract_file_async.md
@@ -0,0 +1,15 @@
+```r title="R"
+library(kreuzberg)
+
+# extract_file is the async variant. extendr drives the tokio runtime, so the
+# call only returns once extraction has completed; R has no native async, so
+# wrap it with the future/promises packages for non-blocking dispatch.
+default_config <- ExtractionConfig$default()
+doc <- extract_file(
+  path = "document.pdf", mime_type = "application/pdf", config = default_config
+) |>
+  jsonlite::fromJSON(simplifyVector = FALSE)
+n_chars <- nchar(doc$content)
+mime <- doc$mime_type
+cat(sprintf("Extracted %d characters from %s\n", n_chars, mime))
+```
--- a/docs/snippets/r/api/extract_file_sync.md
+++ b/docs/snippets/r/api/extract_file_sync.md
@@ -0,0 +1,13 @@
+```r title="R"
+library(kreuzberg)
+
+# Extract with the default configuration and parse the JSON reply.
+default_config <- ExtractionConfig$default()
+doc <- extract_file_sync(
+  path = "document.pdf", mime_type = "application/pdf", config = default_config
+) |>
+  jsonlite::fromJSON(simplifyVector = FALSE)
+
+cat(sprintf("MIME type: %s\n", doc$mime_type))
+cat(sprintf("Content length: %d characters\n", nchar(doc$content)))
+```
--- a/docs/snippets/r/config/advanced_config.md
+++ b/docs/snippets/r/config/advanced_config.md
@@ -0,0 +1,18 @@
+```r title="R"
+library(kreuzberg)
+
+# OCR, chunking and output options combined in a single configuration.
+chunk_settings <- list(max_characters = 1500L, overlap = 300L)
+config <- list(
+  ocr = list(backend = "tesseract", language = "eng"),
+  chunking = chunk_settings,
+  output_format = "markdown",
+  include_document_structure = TRUE,
+  force_ocr = TRUE
+)
+doc <- extract_file_sync("document.pdf", "application/pdf", config) |>
+  jsonlite::fromJSON(simplifyVector = FALSE)
+cat(sprintf("Format: %s\n", doc$mime_type))
+cat(sprintf("Chunks: %d\n", length(doc$chunks)))
+cat(sprintf("Content preview: %.50s...\n", doc$content))
+```
--- a/docs/snippets/r/config/chunking_config.md
+++ b/docs/snippets/r/config/chunking_config.md
@@ -0,0 +1,50 @@
+```r title="R"
+library(kreuzberg)
+
+# Example 1: Basic character-based chunking
+config <- list(
+  chunking = list(max_characters = 1000L, overlap = 200L)
+)
+
+json <- extract_file_sync("document.pdf", "application/pdf", config)
+result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
+num_chunks <- length(result$chunks)
+cat(sprintf("Document split into %d chunks\n", num_chunks))
+for (i in seq_len(min(3L, num_chunks))) {
+  # A chunk is an object; measure its text, which is stored in `content`.
+  cat(sprintf("Chunk %d: %d characters\n", i, nchar(result$chunks[[i]]$content)))
+}
+```
+
+```r title="R - Markdown chunker with token-based sizing"
+library(kreuzberg)
+
+# Chunk sizing is delegated to a tokenizer, identified by its model name.
+token_sizing <- list(type = "tokenizer", model = "Xenova/gpt-4o")
+config <- list(
+  chunking = list(
+    chunker_type = "markdown",
+    sizing = token_sizing
+  )
+)
+
+doc <- extract_file_sync("document.md", "text/markdown", config) |>
+  jsonlite::fromJSON(simplifyVector = FALSE)
+n_chunks <- length(doc$chunks)
+cat(sprintf("Markdown document split into %d chunks\n", n_chunks))
+```
+
+```r title="R - Prepend heading context"
+library(kreuzberg)
+
+# Markdown chunker with `prepend_heading_context` switched on.
+chunking <- list(chunker_type = "markdown", prepend_heading_context = TRUE)
+config <- list(chunking = chunking)
+
+doc <- extract_file_sync("document.md", "text/markdown", config) |>
+  jsonlite::fromJSON(simplifyVector = FALSE)
+n_chunks <- length(doc$chunks)
+
+# Report how many chunks were produced.
+cat(sprintf("Document split into %d chunks with prepended headings\n", n_chunks))
+```
--- a/docs/snippets/r/config/chunking_configuration.md
+++ b/docs/snippets/r/config/chunking_configuration.md
@@ -0,0 +1,33 @@
+```r
+library(kreuzberg)
+
+# Text chunking settings for RAG pipelines.
+chunking <- list(max_characters = 1000L, overlap = 200L)
+config <- list(chunking = chunking)
+
+doc <- extract_file_sync("large_document.pdf", "application/pdf", config) |>
+  jsonlite::fromJSON(simplifyVector = FALSE)
+
+# Count the chunks and print the total; cat() joins its arguments with
+# single spaces.
+n_chunks <- length(doc$chunks)
+cat("Number of chunks:", n_chunks, "\n")
+```
+
+```r title="R - Prepend Heading Context"
+library(kreuzberg)
+
+# For structured documents, prepend heading context to the chunk content.
+chunking <- list(
+  chunker_type = "markdown",
+  max_characters = 500L,
+  overlap = 50L,
+  prepend_heading_context = TRUE
+)
+config <- list(chunking = chunking)
+
+doc <- extract_file_sync("document.md", "text/markdown", config) |>
+  jsonlite::fromJSON(simplifyVector = FALSE)
+n_chunks <- length(doc$chunks)
+cat("Number of chunks:", n_chunks, "\n")
+```
--- a/docs/snippets/r/config/config_basic.md
+++ b/docs/snippets/r/config/config_basic.md
@@ -0,0 +1,15 @@
+```r title="R"
+library(kreuzberg)
+
+# Request Markdown as the output format.
+config <- list(output_format = "markdown")
+
+doc <- extract_file_sync("document.pdf", "application/pdf", config) |>
+  jsonlite::fromJSON(simplifyVector = FALSE)
+text <- doc$content
+
+cat(sprintf("MIME type: %s\n", doc$mime_type))
+cat(sprintf("Content length: %d characters\n", nchar(text)))
+cat("Content preview:\n")
+cat(substr(text, 1, 200))
+```
--- a/docs/snippets/r/config/config_discover.md
+++ b/docs/snippets/r/config/config_discover.md
@@ -0,0 +1,11 @@
+```r title="R"
+library(kreuzberg)
+
+# Read kreuzberg.json, build an ExtractionConfig from it, then extract with it.
+config_lines <- readLines("kreuzberg.json")
+config <- ExtractionConfig$from_json(paste(config_lines, collapse = "\n"))
+
+doc <- extract_file_sync("document.pdf", "application/pdf", config) |>
+  jsonlite::fromJSON(simplifyVector = FALSE)
+cat(sprintf("Extracted %d characters\n", nchar(doc$content)))
+```
--- a/docs/snippets/r/config/config_ocr.md
+++ b/docs/snippets/r/config/config_ocr.md
@@ -0,0 +1,14 @@
+```r title="R"
+library(kreuzberg)
+
+config <- list(
+  force_ocr = TRUE,
+  ocr = list(backend = "tesseract", language = "eng")
+)
+json <- extract_file_sync("document.pdf", "application/pdf", config)
+result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
+cat(sprintf("Extracted content length: %d\n", nchar(result$content)))
+# The result field is the plural `detected_languages` and may be empty.
+langs <- toString(unlist(result$detected_languages))
+cat(sprintf("Detected language: %s\n", if (nzchar(langs)) langs else "<none>"))
+```
--- a/docs/snippets/r/config/config_programmatic.md
+++ b/docs/snippets/r/config/config_programmatic.md
@@ -0,0 +1,20 @@
+```r title="R"
+library(kreuzberg)
+
+# Assemble the configuration piece by piece, then combine the parts.
+ocr_settings <- list(backend = "tesseract", language = "eng")
+chunk_settings <- list(max_characters = 2000L, overlap = 300L)
+config <- list(
+  force_ocr = TRUE,
+  ocr = ocr_settings,
+  chunking = chunk_settings,
+  output_format = "markdown"
+)
+
+doc <- extract_file_sync(
+  "document.pdf", "application/pdf", config
+) |>
+  jsonlite::fromJSON(simplifyVector = FALSE)
+
+cat(sprintf("MIME type: %s\n", doc$mime_type))
+```
--- a/docs/snippets/r/config/document_structure_config.md
+++ b/docs/snippets/r/config/document_structure_config.md
@@ -0,0 +1,21 @@
+```r title="R"
+library(kreuzberg)
+
+# Markdown output with `include_document_structure` switched on.
+config <- list(include_document_structure = TRUE, output_format = "markdown")
+
+doc <- extract_file_sync("document.pdf", "application/pdf", config) |>
+  jsonlite::fromJSON(simplifyVector = FALSE)
+pages <- doc$pages
+
+cat(sprintf("Total pages: %d\n", length(pages)))
+cat(sprintf("MIME type: %s\n\n", doc$mime_type))
+
+# For every page, print a header and the first 100 characters of its content.
+for (page_no in seq_along(pages)) {
+  cat(sprintf("Page %d structure:\n", page_no))
+  snippet <- substr(pages[[page_no]]$content, 1, 100)
+  cat(sprintf("  Content: %s\n", snippet))
+  cat("\n")
+}
+```
--- a/docs/snippets/r/config/element_based_output.md
+++ b/docs/snippets/r/config/element_based_output.md
@@ -0,0 +1,20 @@
+```r title="R"
+library(kreuzberg)
+
+# Ask for the "element_based" result format with Markdown output.
+config <- list(result_format = "element_based", output_format = "markdown")
+
+doc <- extract_file_sync("document.pdf", "application/pdf", config) |>
+  jsonlite::fromJSON(simplifyVector = FALSE)
+elements <- doc$elements
+
+cat(sprintf("Total elements: %d\n\n", length(elements)))
+
+# Print each element's position, type, and first 100 characters of content.
+for (pos in seq_along(elements)) {
+  item <- elements[[pos]]
+  cat(sprintf("Element %d:\n", pos))
+  cat(sprintf("  Type: %s\n", item$element_type))
+  cat(sprintf("  Content: %s\n\n", substr(item$content, 1, 100)))
+}
+```
--- a/docs/snippets/r/config/embedding_config.md
+++ b/docs/snippets/r/config/embedding_config.md
@@ -0,0 +1,20 @@
+```r title="R"
+library(kreuzberg)
+
+# Embedding settings are nested inside the chunking configuration.
+embedding <- list(
+  model = list(type = "preset", name = "balanced"),
+  batch_size = 16L,
+  normalize = TRUE,
+  show_download_progress = TRUE
+)
+config <- list(
+  chunking = list(max_characters = 1000L, overlap = 200L, embedding = embedding)
+)
+
+doc <- extract_file_sync("document.pdf", "application/pdf", config) |>
+  jsonlite::fromJSON(simplifyVector = FALSE)
+n_chunks <- length(doc$chunks)
+
+cat(sprintf("Chunks with embeddings: %d\n", n_chunks))
+```
--- a/docs/snippets/r/config/html_output.md
+++ b/docs/snippets/r/config/html_output.md
@@ -0,0 +1,15 @@
+```r title="R"
+library(kreuzberg)
+
+# HTML output using the "git_hub" theme with `embed_css` switched on.
+html_settings <- list(theme = "git_hub", embed_css = TRUE)
+config <- list(
+  output_format = "html",
+  html_output = html_settings
+)
+
+doc <- extract_file_sync("document.pdf", "application/pdf", config) |>
+  jsonlite::fromJSON(simplifyVector = FALSE)
+html <- doc$content # HTML with kb-* classes
+cat(html)
+```
--- a/docs/snippets/r/config/keyword_extraction_config.md
+++ b/docs/snippets/r/config/keyword_extraction_config.md
@@ -0,0 +1,17 @@
+```r title="R"
+library(kreuzberg)
+
+# Enable keyword extraction.
+config <- list(keywords = list(enabled = TRUE))
+
+doc <- extract_file_sync("document.pdf", "application/pdf", config) |>
+  jsonlite::fromJSON(simplifyVector = FALSE)
+keywords <- doc$keywords
+
+cat(sprintf("Extracted %d keywords\n", length(keywords)))
+
+# Show at most the first five keywords; an empty result prints nothing more.
+for (keyword in head(keywords, 5L)) {
+  cat(sprintf("  - %s\n", keyword))
+}
+```
--- a/docs/snippets/r/config/language_detection_config.md
+++ b/docs/snippets/r/config/language_detection_config.md
@@ -0,0 +1,13 @@
+```r title="R"
+library(kreuzberg)
+
+config <- list(language_detection = list(enabled = TRUE))
+
+json <- extract_file_sync("document.pdf", "application/pdf", config)
+result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
+
+# The result field is the plural `detected_languages` and may be empty.
+langs <- toString(unlist(result$detected_languages))
+cat(sprintf("Detected language: %s\n", if (nzchar(langs)) langs else "<none>"))
+cat(sprintf("Content preview: %.60s...\n", result$content))
+```
--- a/docs/snippets/r/config/ocr_configuration.md
+++ b/docs/snippets/r/config/ocr_configuration.md
@@ -0,0 +1,16 @@
+```r
+library(kreuzberg)
+
+# Tesseract OCR, forced on, using the "eng+deu" language setting.
+ocr_settings <- list(backend = "tesseract", language = "eng+deu")
+config <- list(
+  force_ocr = TRUE,
+  ocr = ocr_settings
+)
+
+doc <- extract_file_sync("scanned_document.pdf", "application/pdf", config) |>
+  jsonlite::fromJSON(simplifyVector = FALSE)
+
+# Print the extracted text.
+cat(doc$content)
+```
--- a/docs/snippets/r/config/ocr_dpi_config.md
+++ b/docs/snippets/r/config/ocr_dpi_config.md
@@ -0,0 +1,16 @@
+```r title="R"
+library(kreuzberg)
+
+# The high-level config of the kreuzberg R bindings exposes no DPI setting for
+# Tesseract OCR; the pipeline determines the PDF rasterization DPI. This
+# example simply runs Tesseract OCR end-to-end on a PDF.
+ocr_settings <- list(backend = "tesseract", language = "eng")
+config <- list(force_ocr = TRUE, ocr = ocr_settings)
+
+doc <- extract_file_sync("document.pdf", "application/pdf", config) |>
+  jsonlite::fromJSON(simplifyVector = FALSE)
+n_chars <- nchar(doc$content)
+
+# Report how much text came back.
+cat(sprintf("Characters extracted: %d\n", n_chars))
+```
--- a/docs/snippets/r/config/pdf_config.md
+++ b/docs/snippets/r/config/pdf_config.md
@@ -0,0 +1,13 @@
+```r title="R"
+library(kreuzberg)
+
+# PDF-specific options: `extract_images` and `extract_metadata` switched on.
+pdf_options <- list(extract_images = TRUE, extract_metadata = TRUE)
+config <- list(pdf_options = pdf_options)
+
+doc <- extract_file_sync("document.pdf", "application/pdf", config) |>
+  jsonlite::fromJSON(simplifyVector = FALSE)
+
+cat(sprintf("Tables extracted: %d\n", length(doc$tables)))
+cat(sprintf("Content preview: %.50s...\n", doc$content))
+```
--- a/docs/snippets/r/config/pdf_hierarchy_config.md
+++ b/docs/snippets/r/config/pdf_hierarchy_config.md
@@ -0,0 +1,19 @@
+```r title="R"
+library(kreuzberg)
+
+# Hierarchy settings, nested under the PDF options.
+hierarchy <- list(
+  enabled = TRUE,
+  k_clusters = 6L,
+  include_bbox = TRUE,
+  ocr_coverage_threshold = 0.8
+)
+config <- list(
+  pdf_options = list(extract_metadata = TRUE, hierarchy = hierarchy)
+)
+
+doc <- extract_file_sync("document.pdf", "application/pdf", config) |>
+  jsonlite::fromJSON(simplifyVector = FALSE)
+n_pages <- length(doc$pages)
+cat(sprintf("Pages: %d\n", n_pages))
+```
--- a/docs/snippets/r/config/postprocessor_config.md
+++ b/docs/snippets/r/config/postprocessor_config.md
@@ -0,0 +1,13 @@
+```r title="R"
+library(kreuzberg)
+
+# Switch the `postprocessor` section on.
+config <- list(postprocessor = list(enabled = TRUE))
+
+doc <- extract_file_sync("document.pdf", "application/pdf", config) |>
+  jsonlite::fromJSON(simplifyVector = FALSE)
+n_chars <- nchar(doc$content)
+
+cat(sprintf("Content length: %d characters\n", n_chars))
+cat(sprintf("Mime type: %s\n", doc$mime_type))
+```
--- a/docs/snippets/r/config/quality_processing_config.md
+++ b/docs/snippets/r/config/quality_processing_config.md
@@ -0,0 +1,11 @@
+```r title="R"
+library(kreuzberg)
+
+# Turn on quality processing, then print the score and the text size.
+config <- list(enable_quality_processing = TRUE)
+doc <- extract_file_sync("document.pdf", "application/pdf", config) |>
+  jsonlite::fromJSON(simplifyVector = FALSE)
+
+cat(sprintf("Quality score: %.2f\n", doc$quality_score))
+cat(sprintf("Content length: %d characters\n", nchar(doc$content)))
+```
--- a/docs/snippets/r/config/tesseract_config.md
+++ b/docs/snippets/r/config/tesseract_config.md
@@ -0,0 +1,17 @@
+```r title="R"
+library(kreuzberg)
+
+# Forced Tesseract OCR using the "eng+deu" language setting.
+config <- list(
+  force_ocr = TRUE,
+  ocr = list(backend = "tesseract", language = "eng+deu")
+)
+
+json <- extract_file_sync("document.pdf", "application/pdf", config)
+result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
+
+# The result field is the plural `detected_languages` and may be empty.
+langs <- toString(unlist(result$detected_languages))
+cat(sprintf("Detected language: %s\n", if (nzchar(langs)) langs else "<none>"))
+cat(sprintf("Content length: %d characters\n", nchar(result$content)))
+```
--- a/docs/snippets/r/config/token_reduction_config.md
+++ b/docs/snippets/r/config/token_reduction_config.md
@@ -0,0 +1,13 @@
+```r title="R"
+library(kreuzberg)
+
+# Enable token reduction.
+config <- list(token_reduction = list(enabled = TRUE))
+
+doc <- extract_file_sync("document.pdf", "application/pdf", config) |>
+  jsonlite::fromJSON(simplifyVector = FALSE)
+text <- doc$content
+
+cat(sprintf("Original content length: %d characters\n", nchar(text)))
+cat(sprintf("Content preview: %.60s...\n", text))
+```
--- a/docs/snippets/r/getting-started/basic_extraction.md
+++ b/docs/snippets/r/getting-started/basic_extraction.md
@@ -0,0 +1,7 @@
+```r
+library(kreuzberg)
+
+# Extract a PDF and print the text it contains.
+extraction <- extract_file_sync("document.pdf")
+cat(extraction$content)
+```
--- a/docs/snippets/r/getting-started/basic_usage.md
+++ b/docs/snippets/r/getting-started/basic_usage.md
@@ -0,0 +1,15 @@
+```r title="R"
+library(kreuzberg)
+
+# Default configuration; a NULL mime_type is passed instead of a fixed type.
+config <- ExtractionConfig$default()
+
+doc <- extract_file_sync(
+  path = "document.pdf", mime_type = NULL, config = config
+) |>
+  jsonlite::fromJSON(simplifyVector = FALSE)
+
+# Print the text first, then the MIME type on its own line.
+cat(doc$content)
+cat(sprintf("\nMIME Type: %s\n", doc$mime_type))
+```
--- a/docs/snippets/r/getting-started/extract_file.md
+++ b/docs/snippets/r/getting-started/extract_file.md
@@ -0,0 +1,14 @@
+```r title="R"
+library(kreuzberg)
+
+# Run the extraction once; the accessor calls below read from its result.
+extraction <- extract_file_sync("document.pdf")
+
+# Basic facts about the extracted document
+cat(sprintf("MIME type: %s\n", mime_type(extraction)))
+cat(sprintf("Content length: %d characters\n", nchar(content(extraction))))
+cat(sprintf("Page count: %d\n", page_count(extraction)))
+
+# Language reported for the document
+cat(sprintf("Detected language: %s\n", detected_language(extraction)))
+```
--- a/docs/snippets/r/getting-started/extract_with_ocr.md
+++ b/docs/snippets/r/getting-started/extract_with_ocr.md
@@ -0,0 +1,19 @@
+```r title="R"
+library(kreuzberg)
+
+# OCR settings are given as a plain list that mirrors the config JSON.
+ocr_settings <- list(backend = "tesseract", language = "eng")
+config <- list(
+  force_ocr = TRUE,
+  ocr = ocr_settings
+)
+
+# Run the extraction on a PNG image with OCR forced on.
+doc <- extract_file_sync("image.png", "image/png", config) |>
+  jsonlite::fromJSON(simplifyVector = FALSE)
+ocr_text <- doc$content
+
+# Heading line, then the extracted text.
+cat("Extracted text from image:\n")
+cat(ocr_text)
+```
--- a/docs/snippets/r/getting-started/hello_world.md
+++ b/docs/snippets/r/getting-started/hello_world.md
@@ -0,0 +1,12 @@
+```r title="R"
+library(kreuzberg)
+
+# Extract a PDF file
+result <- extract_file_sync("example.pdf")
+
+# Print a preview of the extracted content
+content_preview <- substr(content(result), 1L, 200L)
+cat("Content preview:\n")
+cat(content_preview)
+cat("\n...\n")
+```
--- a/docs/snippets/r/getting-started/install_verify.md
+++ b/docs/snippets/r/getting-started/install_verify.md
@@ -0,0 +1,7 @@
+```r title="R"
+library(kreuzberg)
+
+# Confirm the native extension loaded by listing registered extractors
+extractors <- list_document_extractors()
+cat(sprintf("kreuzberg ready: %d document extractors registered\n", length(extractors)))
+```
--- a/docs/snippets/r/getting-started/installation.md
+++ b/docs/snippets/r/getting-started/installation.md
@@ -0,0 +1,7 @@
+```r
+# Install from source (requires Rust toolchain)
+# install.packages("kreuzberg")
+
+# Or install from GitHub
+# remotes::install_github("kreuzberg-dev/kreuzberg", subdir = "packages/r")
+```
--- a/docs/snippets/r/getting-started/read_content.md
+++ b/docs/snippets/r/getting-started/read_content.md
@@ -0,0 +1,20 @@
+```r title="R"
+library(kreuzberg)
+
+# Extract a document
+result <- extract_file_sync("document.docx")
+
+# Access core content fields
+cat(sprintf("MIME type: %s\n", mime_type(result)))
+cat(sprintf("Content length: %d characters\n", nchar(content(result))))
+
+# Access structured data
+cat(sprintf("Number of tables: %d\n", length(result$tables)))
+cat(sprintf("Detected language: %s\n", detected_language(result)))
+
+# Access metadata
+author <- metadata_field(result, "author")
+if (!is.null(author)) {
+  cat(sprintf("Document author: %s\n", author))
+}
+```
--- a/docs/snippets/r/llm/structured_extraction.md
+++ b/docs/snippets/r/llm/structured_extraction.md
@@ -0,0 +1,29 @@
+<!-- snippet:syntax-only --> Requires network access to the configured LLM provider and a valid API key in the host environment.
+
+```r title="R"
+library(kreuzberg)
+
+schema <- list(
+  type = "object",
+  properties = list(
+    title = list(type = "string"),
+    authors = list(type = "array", items = list(type = "string")),
+    date = list(type = "string")
+  ),
+  required = c("title", "authors", "date"),
+  additionalProperties = FALSE
+)
+
+config <- list(
+  structured_extraction = list(
+    schema = schema,
+    llm = list(model = "openai/gpt-4o-mini"),
+    strict = TRUE
+  )
+)
+
+json <- extract_file_sync("paper.pdf", "application/pdf", config)
+result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
+
+cat(result$structured_output, "\n")
+```
--- a/docs/snippets/r/mcp/mcp_custom_client.md
+++ b/docs/snippets/r/mcp/mcp_custom_client.md
@@ -0,0 +1,25 @@
+```r title="R"
+# The kreuzberg R bindings ship no MCP client. Drive the kreuzberg CLI's
+# stdio MCP transport from R. R pipe() connections are one-way, so the
+# request goes in on stdin and stdout is captured via system2().
+request <- list(
+  jsonrpc = "2.0",
+  id = 1L,
+  method = "tools/call",
+  params = list(
+    name = "extract_file",
+    arguments = list(
+      path = "document.pdf",
+      async = TRUE
+    )
+  )
+)
+
+payload <- as.character(jsonlite::toJSON(request, auto_unbox = TRUE))
+response_lines <- system2("kreuzberg", "mcp", stdout = TRUE, input = payload)
+
+# Show the first line the server wrote back.
+cat(response_lines[1L], "\n")
+```
+
+<!-- snippet:syntax-only --> The R bindings have no MCP client; this snippet drives the MCP CLI over stdio. Requires the `jsonlite` package.
--- a/docs/snippets/r/mcp/mcp_server_start.md
+++ b/docs/snippets/r/mcp/mcp_server_start.md
@@ -0,0 +1,11 @@
+```r title="R"
+# The kreuzberg R bindings do not embed an MCP server: MCP is provided by the
+# kreuzberg CLI (Rust binary). Spawn it from the same R session that uses the
+# kreuzberg package for in-process extraction.
+status <- system2("kreuzberg", args = "mcp", stdout = "", stderr = "")
+if (status != 0L) {
+  stop(sprintf("MCP server exited with status %d", status))
+}
+```
+
+<!-- snippet:syntax-only --> The R bindings expose extraction primitives only; MCP transport requires the standalone kreuzberg CLI.
--- a/docs/snippets/r/metadata/language_detection.md
+++ b/docs/snippets/r/metadata/language_detection.md
@@ -0,0 +1,23 @@
+```r title="R"
+library(kreuzberg)
+
+result <- extract_file_sync("document.pdf")
+
+cat("Language Detection Results:\n\n")
+
+cat("Using direct field access:\n")
+cat("Detected Language:", result$detected_language, "\n\n")
+
+cat("Using S3 helper function:\n")
+lang <- detected_language(result)
+cat("Language (via helper):", lang, "\n\n")
+
+cat("Language Information:\n")
+if (length(lang) != 1L || is.na(lang)) {  # `lang == "en"` fails on NULL/NA
+  cat("No single language was detected\n")
+} else {
+  known <- c(en = "an English", es = "a Spanish")
+  phrase <- if (lang %in% names(known)) known[[lang]] else paste("a", lang)
+  cat(sprintf("This is %s document\n", phrase))
+}
+```
--- a/docs/snippets/r/metadata/language_detection_multilingual.md
+++ b/docs/snippets/r/metadata/language_detection_multilingual.md
@@ -0,0 +1,13 @@
+```r title="R"
+library(kreuzberg)
+
+files <- c("english.pdf", "spanish.pdf", "french.pdf")
+config <- list(language_detection = list(enabled = TRUE))
+
+for (file in files) {
+  json <- extract_file_sync(file, "application/pdf", config)
+  result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
+  cat(sprintf("%s: detected language = %s\n",
+              file, result$detected_language))
+}
+```
--- a/docs/snippets/r/metadata/metadata.md
+++ b/docs/snippets/r/metadata/metadata.md
@@ -0,0 +1,25 @@
+```r title="R"
+library(kreuzberg)
+
+result <- extract_file_sync("document.pdf")
+
+cat("Detected Language:", result$detected_language, "\n")
+cat("Quality Score:", result$quality_score, "\n")
+cat("Keywords:", toString(result$keywords), "\n\n")
+
+cat("Metadata fields:\n")
+author_list <- metadata_field(result, "authors")
+if (!is.null(author_list)) {
+  cat("Authors:", toString(author_list), "\n")
+}
+
+created_on <- metadata_field(result, "created_date")
+if (!is.null(created_on)) {
+  cat("Created Date:", created_on, "\n")
+}
+
+n_pages <- metadata_field(result, "page_count")
+if (!is.null(n_pages)) {
+  cat("Pages:", n_pages, "\n")
+}
+```
--- a/docs/snippets/r/metadata/page_boundaries.md
+++ b/docs/snippets/r/metadata/page_boundaries.md
@@ -0,0 +1,22 @@
+```r title="R"
+library(kreuzberg)
+
+result <- extract_file_sync("document.pdf")
+
+boundaries <- result$metadata$pages$boundaries
+
+if (!is.null(boundaries) && length(boundaries) > 0L) {
+  content_bytes <- charToRaw(result$content)
+  for (i in seq_len(min(3L, length(boundaries)))) {
+    boundary <- boundaries[[i]]
+    # seq_len() keeps a zero-length page empty; `(start + 1):end` would
+    # count backwards when byte_start equals byte_end.
+    span <- seq_len(boundary$byte_end - boundary$byte_start)
+    page_text <- rawToChar(content_bytes[boundary$byte_start + span])
+
+    cat(sprintf("Page %d:\n", boundary$page_number))
+    cat(sprintf("  Byte range: %d-%d\n", boundary$byte_start, boundary$byte_end))
+    cat(sprintf("  Preview: %s...\n", substr(page_text, 1L, 100L)))
+  }
+}
+```
--- a/docs/snippets/r/metadata/page_tracking_basic.md
+++ b/docs/snippets/r/metadata/page_tracking_basic.md
@@ -0,0 +1,20 @@
+```r title="R"
+library(kreuzberg)
+
+result <- extract_file_sync("document.pdf")
+
+cat("Total pages:", page_count(result), "\n\n")
+
+for (i in seq_along(result$pages)) {
+  page <- result$pages[[i]]
+  cat(sprintf("Page %d:\n", i))
+  cat("  Elements:", length(page$elements), "\n")
+  cat("  Text content length:", nchar(page$content), "chars\n")
+
+  if (nchar(page$content) > 0L) {
+    preview <- substr(page$content, 1L, 100L)
+    cat(sprintf("  Preview: %s...\n", preview))
+  }
+  cat("\n")
+}
+```
--- a/docs/snippets/r/metadata/tables.md
+++ b/docs/snippets/r/metadata/tables.md
@@ -0,0 +1,22 @@
+```r title="R"
+library(kreuzberg)
+
+result <- extract_file_sync("spreadsheet.xlsx")
+
+cat("Tables extracted:", length(result$tables), "\n\n")
+
+for (idx in seq_along(result$tables)) {
+  tbl <- result$tables[[idx]]  # local name avoids shadowing base::table()
+  cat(sprintf("Table %d:\n", idx))
+  cat("  Rows:", nrow(tbl), "\n")
+  cat("  Columns:", ncol(tbl), "\n")
+  cat("  Column names:", toString(colnames(tbl)), "\n")
+  cat("\n")
+
+  if (nrow(tbl) > 0L) {
+    cat("  Preview (first 3 rows):\n")
+    print(head(tbl, 3L))
+    cat("\n")
+  }
+}
+```
--- a/docs/snippets/r/metadata/vector_database_integration.md
+++ b/docs/snippets/r/metadata/vector_database_integration.md
@@ -0,0 +1,24 @@
+```r title="R"
+library(kreuzberg)
+
+config <- list(
+  chunking = list(max_characters = 1000L, overlap = 200L)
+)
+
+json <- extract_file_sync("document.pdf", "application/pdf", config)
+result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
+
+for (i in seq_len(min(3L, length(result$chunks)))) {
+  chunk <- result$chunks[[i]]  # a list; the chunk text lives in $content
+  vector_doc <- list(
+    id = sprintf("doc_%d", i),
+    text = chunk$content,
+    metadata = list(
+      source = "document.pdf",
+      chunk_index = i,
+      length = nchar(chunk$content)
+    )
+  )
+  cat(sprintf("Vector DB entry %d: %d chars\n", i, vector_doc$metadata$length))
+}
+```
--- a/docs/snippets/r/ocr/cloud_ocr_backend.md
+++ b/docs/snippets/r/ocr/cloud_ocr_backend.md
@@ -0,0 +1,19 @@
+```r title="R"
+library(kreuzberg)
+
+custom_ocr_backend <- function(image_path, language) {
+  cat(sprintf("Processing image: %s\n", image_path))
+  return(sprintf("Extracted text from %s", image_path))
+}
+
+register_ocr_backend("custom_cloud", custom_ocr_backend)
+
+config <- list(
+  force_ocr = TRUE,
+  ocr = list(backend = "custom_cloud", language = "en")
+)
+
+json <- extract_file_sync("document.pdf", "application/pdf", config)
+result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
+cat(sprintf("Custom backend result: %d chars\n", nchar(result$content)))
+```
--- a/docs/snippets/r/ocr/image_extraction.md
+++ b/docs/snippets/r/ocr/image_extraction.md
@@ -0,0 +1,16 @@
+```r title="R"
+library(kreuzberg)
+
+config <- list(
+  force_ocr = TRUE,
+  ocr = list(backend = "tesseract", language = "eng")
+)
+
+json <- extract_file_sync("scan.png", "image/png", config)
+result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
+
+cat("Image extraction via OCR:\n")
+cat(sprintf("Content length: %d characters\n", nchar(result$content)))
+cat(sprintf("Mime type: %s\n", result$mime_type))
+cat(sprintf("Detected language: %s\n", result$detected_language))
+```
--- a/docs/snippets/r/ocr/image_preprocessing.md
+++ b/docs/snippets/r/ocr/image_preprocessing.md
@@ -0,0 +1,16 @@
+```r title="R"
+library(kreuzberg)
+
+config <- list(
+  force_ocr = TRUE,
+  ocr = list(backend = "tesseract", language = "eng"),
+  enable_quality_processing = TRUE
+)
+
+json <- extract_file_sync("scan.png", "image/png", config)
+result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
+
+quality <- result$quality_score  # may be NULL; `%||%` needs base R >= 4.4
+if (is.null(quality)) quality <- 0
+cat(sprintf("Quality: %.2f, Length: %d\n", quality, nchar(result$content)))
+```
--- a/docs/snippets/r/ocr/ocr_easyocr.md
+++ b/docs/snippets/r/ocr/ocr_easyocr.md
@@ -0,0 +1,16 @@
+```r title="R"
+library(kreuzberg)
+
+# Note: EasyOCR backend requires Python to be installed
+config <- list(
+  force_ocr = TRUE,
+  ocr = list(backend = "easyocr", language = "en")
+)
+
+json <- extract_file_sync("document.pdf", "application/pdf", config)
+result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
+
+cat("EasyOCR extraction:\n")
+cat(sprintf("Content length: %d characters\n", nchar(result$content)))
+cat(sprintf("Detected language: %s\n", result$detected_language))
+```
--- a/docs/snippets/r/ocr/ocr_elements.md
+++ b/docs/snippets/r/ocr/ocr_elements.md
@@ -0,0 +1,23 @@
+```r title="R"
+library(kreuzberg)
+
+# Enable structured OCR elements alongside text extraction
+config <- list(
+  ocr = list(
+    backend = "paddle-ocr",  # same identifier as the PaddleOCR snippet
+    language = "en",
+    element_config = list(include_elements = TRUE)
+  )
+)
+
+json <- extract_file_sync("scanned.pdf", "application/pdf", config)
+result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
+
+if (!is.null(result$ocr_elements)) {
+  for (element in result$ocr_elements) {
+    cat(sprintf("Text: %s\n", element$text))
+    cat(sprintf("Confidence: %.2f\n", element$confidence$recognition))
+    cat("\n")
+  }
+}
+```
--- a/docs/snippets/r/ocr/ocr_extraction.md
+++ b/docs/snippets/r/ocr/ocr_extraction.md
@@ -0,0 +1,17 @@
+```r title="R"
+library(kreuzberg)
+
+# Configure Tesseract OCR
+config <- list(
+  force_ocr = TRUE,
+  ocr = list(backend = "tesseract", language = "eng")
+)
+
+# Extract text from a scanned image
+json <- extract_file_sync("scan.png", "image/png", config)
+result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
+
+cat(sprintf("Extracted %d characters\n", nchar(result$content)))
+cat("Content preview:\n")
+cat(substr(result$content, 1, 200))
+```
--- a/docs/snippets/r/ocr/ocr_force_all_pages.md
+++ b/docs/snippets/r/ocr/ocr_force_all_pages.md
@@ -0,0 +1,12 @@
+```r title="R"
+library(kreuzberg)
+
+config <- list(force_ocr = TRUE)
+
+json <- extract_file_sync("multipage_document.pdf", "application/pdf", config)
+result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
+
+cat(sprintf("Total pages: %d\n", length(result$pages)))
+cat(sprintf("Content extracted via OCR: %d characters\n", nchar(result$content)))
+cat(sprintf("Detected language: %s\n", result$detected_language))
+```
--- a/docs/snippets/r/ocr/ocr_multi_language.md
+++ b/docs/snippets/r/ocr/ocr_multi_language.md
@@ -0,0 +1,18 @@
+```r title="R"
+library(kreuzberg)
+
+# Configure multi-language OCR (English, French, German)
+config <- list(
+  force_ocr = TRUE,
+  ocr = list(backend = "tesseract", language = "eng+fra+deu")
+)
+
+# Extract from a multilingual document
+json <- extract_file_sync("multilingual.png", "image/png", config)
+result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
+
+cat(sprintf("Detected language: %s\n", result$detected_language))
+cat(sprintf("Extracted %d characters\n", nchar(result$content)))
+cat("Content preview:\n")
+cat(substr(result$content, 1, 200))
+```
--- a/docs/snippets/r/ocr/ocr_paddleocr.md
+++ b/docs/snippets/r/ocr/ocr_paddleocr.md
@@ -0,0 +1,18 @@
+```r title="R"
+library(kreuzberg)
+
+# Configure PaddleOCR backend (defaults to mobile tier)
+config <- list(
+  force_ocr = TRUE,
+  ocr = list(backend = "paddle-ocr", language = "en")
+)
+
+# Extract text from an image using PaddleOCR
+json <- extract_file_sync("document.jpg", "image/jpeg", config)
+result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
+
+cat(sprintf("Extracted %d characters\n", nchar(result$content)))
+cat(sprintf("MIME type: %s\n", result$mime_type))
+cat("Content preview:\n")
+cat(substr(result$content, 1, 200))
+```
--- a/docs/snippets/r/plugins/clear_plugins.md
+++ b/docs/snippets/r/plugins/clear_plugins.md
@@ -0,0 +1,19 @@
+```r title="R"
+library(kreuzberg)
+
+# Clear all custom OCR backends
+clear_ocr_backends()
+cat("OCR backends cleared\n")
+
+# Clear all custom validators
+clear_validators()
+cat("Validators cleared\n")
+
+# Clear all custom post-processors
+clear_post_processors()
+cat("Post-processors cleared\n")
+
+# Clear all custom document extractors
+clear_document_extractors()
+cat("Document extractors cleared\n")
+```
--- a/docs/snippets/r/plugins/custom_ocr_backend.md
+++ b/docs/snippets/r/plugins/custom_ocr_backend.md
@@ -0,0 +1,15 @@
+```r
+library(kreuzberg)
+
+# List available OCR backends
+backends <- list_ocr_backends()
+cat("Available backends:", paste(backends, collapse = ", "), "\n")
+
+# List registered post-processors
+processors <- list_post_processors()
+cat("Post-processors:", paste(processors, collapse = ", "), "\n")
+
+# Clear all custom registrations
+clear_post_processors()
+clear_validators()
+```
--- a/docs/snippets/r/plugins/embedding_backend.md
+++ b/docs/snippets/r/plugins/embedding_backend.md
@@ -0,0 +1,29 @@
+<!-- snippet:syntax-only -->
+
+```r title="R"
+library(kreuzberg)
+
+# Wrap an already-loaded embedder (e.g. an ONNX session) so kreuzberg can
+# call back into it during chunking and standalone embed requests.
+my_embedder <- list(
+  name = "my-embedder",
+  version = "1.0.0",
+  dimensions = 768L,
+  embed = function(texts) {
+    # Delegate to the already-loaded host model.
+    lapply(texts, function(.) rep(0.0, 768))
+  }
+)
+
+register_embedding_backend(my_embedder)
+
+config <- list(
+  embedding = list(
+    model = list(type = "plugin", name = "my-embedder"),
+    max_embed_duration_secs = 30L
+  )
+)
+
+vectors <- embed_texts(c("Hello, world!", "Second text"), config)
+cat(sprintf("Generated %d embedding vectors\n", length(vectors)))
+```
--- a/docs/snippets/r/plugins/extractor_registration.md
+++ b/docs/snippets/r/plugins/extractor_registration.md
@@ -0,0 +1,20 @@
+```r title="R"
+library(kreuzberg)
+
+custom_extractor <- function(path, mime_type) {
+  content <- sprintf("Extracted from %s (%s)", path, mime_type)
+  return(list(
+    content = content,
+    mime_type = mime_type,
+    pages = 1L
+  ))
+}
+
+register_document_extractor("custom_format", custom_extractor)
+
+result <- extract_file_sync("custom_document.xyz", "application/custom", NULL)
+
+cat(sprintf("Custom extractor result:\n"))
+cat(sprintf("Content: %s\n", result$content))
+cat(sprintf("Mime type: %s\n", result$mime_type))
+```
--- a/docs/snippets/r/plugins/list_plugins.md
+++ b/docs/snippets/r/plugins/list_plugins.md
@@ -0,0 +1,15 @@
+```r title="R"
+library(kreuzberg)
+
+ocr_backends <- list_ocr_backends()
+cat(sprintf("OCR backends: %s\n", toString(ocr_backends)))
+
+validators <- list_validators()
+cat(sprintf("Validators: %s\n", toString(validators)))
+
+post_processors <- list_post_processors()
+cat(sprintf("Post-processors: %s\n", toString(post_processors)))
+
+extractors <- list_document_extractors()
+cat(sprintf("Document extractors: %s\n", toString(extractors)))
+```
--- a/docs/snippets/r/plugins/min_length_validator.md
+++ b/docs/snippets/r/plugins/min_length_validator.md
@@ -0,0 +1,27 @@
+<!-- snippet:syntax-only -->
+
+```r title="R"
+library(kreuzberg)
+
+min_length_validator <- function(result) {
+  min_length <- 50L
+  if (nchar(result$content) < min_length) {
+    return(list(
+      valid = FALSE,
+      message = sprintf(
+        "Content too short: %d < %d characters",
+        nchar(result$content), min_length
+      )
+    ))
+  }
+  return(list(valid = TRUE, message = "Content length validation passed"))
+}
+
+register_validator("min_length", min_length_validator)
+
+config <- ExtractionConfig$default()
+json <- extract_file_sync("document.pdf", "application/pdf", config)
+result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
+
+cat(sprintf("Content length: %d characters\n", nchar(result$content)))
+```
--- a/docs/snippets/r/plugins/pdf_metadata_extractor.md
+++ b/docs/snippets/r/plugins/pdf_metadata_extractor.md
@@ -0,0 +1,22 @@
+```r title="R"
+library(kreuzberg)
+
+extract_pdf_metadata <- function(result) {
+  meta <- result$metadata
+  if (!is.null(meta)) {
+    cat("PDF Metadata:\n")
+    for (key in names(meta)) {  # values may be nested lists, vectors, or NULL
+      cat(sprintf("  %s: %s\n", key, toString(unlist(meta[[key]]))))
+    }
+  }
+  result
+}
+
+register_post_processor("pdf_metadata", extract_pdf_metadata)
+
+config <- list(postprocessor = list(enabled = TRUE))
+json <- extract_file_sync("document.pdf", "application/pdf", config)
+result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
+
+cat(sprintf("Extraction complete: %d characters\n", nchar(result$content)))
+```
--- a/docs/snippets/r/plugins/pdf_only_processor.md
+++ b/docs/snippets/r/plugins/pdf_only_processor.md
@@ -0,0 +1,21 @@
+<!-- snippet:syntax-only -->
+
+```r title="R"
+library(kreuzberg)
+
+pdf_only_processor <- function(result) {
+  # Results that are not PDFs (or carry no MIME type) leave via the guard.
+  mime <- result$mime_type
+  is_pdf <- !is.null(mime) && mime == "application/pdf"
+  if (!is_pdf) return(result)
+  return(result)
+}
+
+register_post_processor("pdf_only", pdf_only_processor)
+
+config <- list(postprocessor = list(enabled = TRUE))
+json <- extract_file_sync("document.pdf", "application/pdf", config)
+result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
+
+cat(sprintf("Processed PDF: %d characters\n", nchar(result$content)))
+```
--- a/docs/snippets/r/plugins/plugin_extractor.md
+++ b/docs/snippets/r/plugins/plugin_extractor.md
@@ -0,0 +1,25 @@
+<!-- snippet:syntax-only -->
+
+```r title="R"
+library(kreuzberg)
+
+custom_json_extractor <- function(path, mime_type) {
+  raw <- readLines(path, warn = FALSE)
+  parsed <- jsonlite::fromJSON(paste(raw, collapse = "\n"))
+
+  text <- paste(unlist(parsed), collapse = "\n")
+
+  return(list(
+    content = text,
+    mime_type = "application/json",
+    pages = 1L,
+    metadata = list(extractor = "custom-json-extractor")
+  ))
+}
+
+register_document_extractor("custom-json-extractor", custom_json_extractor)
+
+result <- extract_file_sync("data.json", "application/json", NULL)
+
+cat(sprintf("Extracted %d characters from JSON\n", nchar(result$content)))
+```
--- a/docs/snippets/r/plugins/plugin_logging.md
+++ b/docs/snippets/r/plugins/plugin_logging.md
@@ -0,0 +1,30 @@
+<!-- snippet:syntax-only -->
+
+```r title="R"
+library(kreuzberg)
+
+logging_processor <- function(result) {
+  # Explicit NULL check: `%||%` is only in base R from 4.4.0 onwards.
+  mime <- if (is.null(result$mime_type)) "unknown" else result$mime_type
+  message(sprintf("[plugin] processing mime=%s content_chars=%d",
+                  mime, nchar(result$content)))
+  result
+}
+
+logging_validator <- function(result) {
+  # Explicit NULL check: `%||%` is only in base R from 4.4.0 onwards.
+  mime <- if (is.null(result$mime_type)) "unknown" else result$mime_type
+  message(sprintf("[plugin] validating mime=%s", mime))
+  # Always reports success; this validator only logs.
+  list(valid = TRUE, message = "ok")
+}
+
+register_post_processor("logging_processor", logging_processor)
+register_validator("logging_validator", logging_validator)
+
+config <- list(postprocessor = list(enabled = TRUE))
+json <- extract_file_sync("document.pdf", "application/pdf", config)
+result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
+
+cat(sprintf("Done: %d characters\n", nchar(result$content)))
+```
--- a/docs/snippets/r/plugins/plugin_testing.md
+++ b/docs/snippets/r/plugins/plugin_testing.md
@@ -0,0 +1,34 @@
+<!-- snippet:syntax-only -->
+
+```r title="R"
+library(kreuzberg)
+library(testthat)
+
+uppercase_processor <- function(result) {
+  result$content <- toupper(result$content)
+  return(result)
+}
+
+test_that("uppercase processor uppercases content", {
+  fake_result <- list(
+    content = "hello world",
+    mime_type = "text/plain",
+    metadata = list()
+  )
+  processed <- uppercase_processor(fake_result)
+  expect_equal(processed$content, "HELLO WORLD")
+})
+
+test_that("post processor registers and runs", {
+  register_post_processor("uppercase", uppercase_processor)
+  on.exit(unregister_post_processor("uppercase"), add = TRUE)
+
+  config <- list(postprocessor = list(enabled = TRUE))
+  json <- extract_bytes_sync(
+    charToRaw("hello world"), "text/plain", config
+  )
+  result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
+
+  expect_match(result$content, "HELLO WORLD", fixed = TRUE)
+})
+```
--- a/docs/snippets/r/plugins/plugin_validator.md
+++ b/docs/snippets/r/plugins/plugin_validator.md
@@ -0,0 +1,23 @@
+```r title="R"
+library(kreuzberg)
+
+min_content_validator <- function(result) {
+  min_length <- 100L
+  if (nchar(result$content) < min_length) {
+    return(list(
+      valid = FALSE,
+      message = sprintf("Content too short: %d < %d",
+                       nchar(result$content), min_length)
+    ))
+  }
+  return(list(valid = TRUE, message = "Content validation passed"))
+}
+
+register_validator("min_content", min_content_validator)
+
+config <- ExtractionConfig$default()
+json <- extract_file_sync("document.pdf", "application/pdf", config)
+result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
+
+cat(sprintf("Content length: %d characters\n", nchar(result$content)))
+```
--- a/docs/snippets/r/plugins/quality_score_validator.md
+++ b/docs/snippets/r/plugins/quality_score_validator.md
@@ -0,0 +1,29 @@
+<!-- snippet:syntax-only -->
+
+```r title="R"
+library(kreuzberg)
+
+quality_score_validator <- function(result) {
+  min_score <- 0.5
+  # A missing score counts as 0 (explicit check; `%||%` needs base R >= 4.4).
+  score <- result$metadata$quality_score
+  score <- if (is.null(score)) 0 else as.numeric(score)
+
+  if (score < min_score) {
+    return(list(
+      valid = FALSE,
+      message = sprintf("Quality score too low: %.2f < %.2f",
+                        score, min_score)
+    ))
+  }
+  list(valid = TRUE, message = "Quality score validation passed")
+}
+
+register_validator("quality_score", quality_score_validator)
+
+config <- ExtractionConfig$default()
+json <- extract_file_sync("document.pdf", "application/pdf", config)
+result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
+
+cat(sprintf("Validated extraction: %d characters\n", nchar(result$content)))
+```
--- a/docs/snippets/r/plugins/stateful_plugin.md
+++ b/docs/snippets/r/plugins/stateful_plugin.md
@@ -0,0 +1,27 @@
+<!-- snippet:syntax-only -->
+
+```r title="R"
+library(kreuzberg)
+
+# Encapsulate mutable counter state in an environment so the plugin function
+# can update it across calls.
+make_stateful_plugin <- function() {
+  state <- new.env(parent = emptyenv())
+  state$count <- 0L
+
+  process <- function(result) {
+    state$count <- state$count + 1L
+    return(result)
+  }
+
+  list(process = process, count = function() state$count)
+}
+
+plugin <- make_stateful_plugin()
+register_post_processor("stateful_counter", plugin$process)
+
+config <- list(postprocessor = list(enabled = TRUE))
+extract_file_sync("document.pdf", "application/pdf", config)
+
+cat(sprintf("Processed: %d\n", plugin$count()))
+```
--- a/docs/snippets/r/plugins/unregister_plugins.md
+++ b/docs/snippets/r/plugins/unregister_plugins.md
@@ -0,0 +1,11 @@
+<!-- snippet:syntax-only -->
+
+```r title="R"
+library(kreuzberg)
+
+# Remove plugins by their registered name.
+unregister_post_processor("metadata_enrichment")
+unregister_validator("min_length")
+unregister_ocr_backend("custom_ocr_backend")
+unregister_document_extractor("custom_format")
+```
--- a/docs/snippets/r/plugins/word_count_processor.md
+++ b/docs/snippets/r/plugins/word_count_processor.md
@@ -0,0 +1,20 @@
+<!-- snippet:syntax-only -->
+
+```r title="R"
+library(kreuzberg)
+
+word_count_processor <- function(result) {
+  # Count non-empty tokens only; assign by name so reruns never duplicate.
+  tokens <- strsplit(result$content, "\\s+")[[1]]
+  result$metadata$word_count <- sum(nzchar(tokens))
+  result
+}
+
+register_post_processor("word_count", word_count_processor)
+
+config <- list(postprocessor = list(enabled = TRUE))
+json <- extract_file_sync("document.pdf", "application/pdf", config)
+result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
+
+cat(sprintf("Word count: %d\n", result$metadata$word_count))
+```
--- a/docs/snippets/r/utils/chunking.md
+++ b/docs/snippets/r/utils/chunking.md
@@ -0,0 +1,15 @@
+```r title="R"
+library(kreuzberg)
+
+config <- list(
+  chunking = list(max_characters = 1000L, overlap = 200L)
+)
+
+json <- extract_file_sync("document.pdf", "application/pdf", config)
+result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
+
+cat(sprintf("Total chunks: %d\n", length(result$chunks)))
+for (i in seq_len(min(5L, length(result$chunks)))) {  # text is in $content
+  cat(sprintf("Chunk %d: %d characters\n", i, nchar(result$chunks[[i]]$content)))
+}
+```
--- a/docs/snippets/r/utils/chunking_rag.md
+++ b/docs/snippets/r/utils/chunking_rag.md
@@ -0,0 +1,25 @@
+```r title="R"
+library(kreuzberg)
+
+config <- list(
+  chunking = list(
+    max_characters = 500L,
+    overlap = 50L,
+    embedding = list(
+      model = list(type = "preset", name = "balanced"),
+      normalize = TRUE
+    )
+  )
+)
+
+json <- extract_file_sync("research_paper.pdf", "application/pdf", config)
+result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
+
+for (i in seq_along(result$chunks)) {
+  chunk <- result$chunks[[i]]
+  cat(sprintf("Chunk %d/%d\n", i, length(result$chunks)))
+  if (!is.null(chunk$embedding)) {
+    cat(sprintf("  Embedding: %d dimensions\n", length(chunk$embedding)))
+  }
+}
+```
--- a/docs/snippets/r/utils/embedding_with_chunking.md
+++ b/docs/snippets/r/utils/embedding_with_chunking.md
@@ -0,0 +1,20 @@
+```r title="R"
+library(kreuzberg)
+
+config <- list(
+  chunking = list(
+    max_characters = 1024L,
+    overlap = 100L,
+    embedding = list(
+      model = list(type = "preset", name = "balanced"),
+      normalize = TRUE,
+      batch_size = 32L
+    )
+  )
+)
+
+json <- extract_file_sync("document.pdf", "application/pdf", config)
+result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
+
+cat(sprintf("Chunks with embeddings: %d\n", length(result$chunks)))
+```
--- a/docs/snippets/r/utils/keyword_extraction_example.md
+++ b/docs/snippets/r/utils/keyword_extraction_example.md
@@ -0,0 +1,21 @@
+```r title="R"
+library(kreuzberg)
+
+config <- list(
+  keywords = list(
+    algorithm = "yake",
+    max_keywords = 10L,
+    min_score = 0.3
+  )
+)
+
+json <- extract_file_sync("research_paper.pdf", "application/pdf", config)
+result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
+
+cat(sprintf("Content length: %d characters\n", nchar(result$content)))
+if (!is.null(result$metadata$keywords)) {
+  for (kw in result$metadata$keywords) {
+    cat(sprintf("  - %s\n", kw))
+  }
+}
+```
--- a/docs/snippets/r/utils/quality_processing_example.md
+++ b/docs/snippets/r/utils/quality_processing_example.md
@@ -0,0 +1,15 @@
+```r title="R"
+library(kreuzberg)
+
+config <- list(enable_quality_processing = TRUE)
+json <- extract_file_sync("scanned_document.pdf", "application/pdf", config)
+result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
+
+cat(sprintf("Content length: %d characters\n", nchar(result$content)))
+if (!is.null(result$quality_score)) {
+  cat(sprintf("Quality score: %.2f\n", result$quality_score))
+  if (result$quality_score < 0.5) {
+    cat("Warning: low quality extraction\n")
+  }
+}
+```
--- a/docs/snippets/r/utils/standalone_embed.md
+++ b/docs/snippets/r/utils/standalone_embed.md
@@ -0,0 +1,15 @@
+```r title="R"
+library(kreuzberg)
+
+config <- list(
+  model = list(type = "preset", name = "balanced"),
+  normalize = TRUE
+)
+
+texts <- c("Hello, world!", "Kreuzberg is fast")
+embeddings <- embed_texts(texts, config)
+
+stopifnot(length(embeddings) == 2L)
+cat(sprintf("Embedding 1: %d dimensions\n", length(embeddings[[1]])))
+cat(sprintf("Embedding 2: %d dimensions\n", length(embeddings[[2]])))
+```
--- a/docs/snippets/r/utils/token_reduction.md
+++ b/docs/snippets/r/utils/token_reduction.md
@@ -0,0 +1,15 @@
+```r title="R"
+library(kreuzberg)
+
+config <- list(
+  token_reduction = list(
+    mode = "moderate",
+    preserve_important_words = TRUE
+  )
+)
+
+json <- extract_file_sync("document.pdf", "application/pdf", config)
+result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
+
+cat(result$content)
+```
--- a/docs/snippets/r/utils/token_reduction_example.md
+++ b/docs/snippets/r/utils/token_reduction_example.md
@@ -0,0 +1,16 @@
+```r title="R"
+library(kreuzberg)
+
+config <- list(
+  token_reduction = list(
+    mode = "moderate",
+    preserve_important_words = TRUE
+  )
+)
+
+json <- extract_file_sync("verbose_document.pdf", "application/pdf", config)
+result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
+
+cat(sprintf("Reduced content length: %d characters\n", nchar(result$content)))
+cat(sprintf("MIME type: %s\n", result$mime_type))
+```
--- a/docs/snippets/r/utils/vector_database_integration.md
+++ b/docs/snippets/r/utils/vector_database_integration.md
@@ -0,0 +1,26 @@
+```r title="R"
+library(kreuzberg)
+
+document_id <- "doc-001"
+
+config <- list(
+  chunking = list(
+    max_characters = 512L,
+    overlap = 50L,
+    embedding = list(
+      model = list(type = "preset", name = "balanced"),
+      normalize = TRUE,
+      batch_size = 32L
+    )
+  )
+)
+
+json <- extract_file_sync("document.pdf", "application/pdf", config)
+result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
+
+# Each chunk has $content, $embedding, and $metadata. Pass these directly
+# to a vector database client (pgvector, Qdrant, Pinecone, etc.) along with
+# the document_id stored as a metadata field.
+cat(sprintf("document_id: %s\n", document_id))
+cat(sprintf("chunks ready for upsert: %d\n", length(result$chunks)))
+```