Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/docs/snippets/r/api/batch_extract_bytes_sync.md
+++ b/docs/snippets/r/api/batch_extract_bytes_sync.md
@@ -0,0 +1,19 @@
+```r title="R"
+library(kreuzberg)
+
+paths <- c("report.pdf", "notes.txt")
+mimes <- c("application/pdf", "text/plain")
+# I() stops auto_unbox from turning a one-byte file's content into a scalar.
+items <- jsonlite::toJSON(lapply(seq_along(paths), function(i) {
+  bytes <- readBin(paths[i], what = "raw", n = file.size(paths[i]))
+  list(content = I(as.integer(bytes)), mime_type = mimes[i])
+}), auto_unbox = TRUE)
+
+json <- batch_extract_bytes_sync(items = items, config = ExtractionConfig$default())
+results <- jsonlite::fromJSON(json, simplifyVector = FALSE)
+
+for (i in seq_along(results)) {
+  cat(sprintf("[%d] mime=%s chars=%d\n",
+              i, results[[i]]$mime_type, nchar(results[[i]]$content)))
+}
+```
--- a/docs/snippets/r/api/batch_extract_files_sync.md
+++ b/docs/snippets/r/api/batch_extract_files_sync.md
@@ -0,0 +1,17 @@
+```r title="R"
+library(kreuzberg)
+
+# One {"path": ...} entry per document in the batch.
+paths <- c("report.pdf", "slides.pptx", "data.xlsx")
+batch <- lapply(paths, function(p) list(path = p))
+items <- jsonlite::toJSON(batch, auto_unbox = TRUE)
+
+json <- batch_extract_files_sync(items = items, config = ExtractionConfig$default())
+results <- jsonlite::fromJSON(json, simplifyVector = FALSE)
+
+# Print each result's position, MIME type, and extracted character count.
+for (i in seq_along(results)) {
+  doc <- results[[i]]
+  cat(sprintf("[%d] mime=%s chars=%d\n", i, doc$mime_type, nchar(doc$content)))
+}
+```
--- a/docs/snippets/r/api/client_chunk_text.md
+++ b/docs/snippets/r/api/client_chunk_text.md
@@ -0,0 +1,29 @@
+<!-- snippet:syntax-only -->
+
+```r title="R"
+library(kreuzberg)
+library(httr2)
+
+# Chunking options, nested under `config` in the request body.
+chunk_options <- list(max_characters = 1000, overlap = 50, trim = TRUE)
+request_body <- list(
+  text = "Your long text content here...",
+  chunker_type = "text",
+  config = chunk_options
+)
+
+# Build the POST request step by step, then send it.
+req <- request("http://localhost:8000/chunk")
+req <- req_method(req, "POST")
+req <- req_body_json(req, request_body)
+response <- req_perform(req)
+
+result <- resp_body_json(response)
+
+cat(sprintf("Created %d chunks\n", result$chunk_count))
+for (piece in result$chunks) {
+  # Show at most the first 50 characters of each chunk.
+  preview <- substr(piece$content, 1, 50)
+  cat(sprintf("Chunk %d: %s...\n", piece$chunk_index, preview))
+}
+```
--- a/docs/snippets/r/api/client_extract_single_file.md
+++ b/docs/snippets/r/api/client_extract_single_file.md
@@ -0,0 +1,18 @@
+<!-- snippet:syntax-only -->
+
+```r title="R"
+library(kreuzberg)
+library(httr2)
+
+# Multipart/form-data uploads go through req_body_multipart(); the file is
+# attached under the "files" field, with its MIME type, via curl::form_file().
+response <- request("http://localhost:8000/extract") |>
+  req_method("POST") |>
+  req_body_multipart(
+    files = curl::form_file("document.pdf", type = "application/pdf")
+  ) |>
+  req_perform()
+
+data <- resp_body_json(response)
+cat(jsonlite::toJSON(data, auto_unbox = TRUE, pretty = TRUE))
+```
--- a/docs/snippets/r/api/combining_all_features.md
+++ b/docs/snippets/r/api/combining_all_features.md
@@ -0,0 +1,34 @@
+```r title="R"
+library(kreuzberg)
+
+# Nested sections are built first, then embedded in the top-level settings.
+ocr_options <- list(backend = "tesseract", language = "eng", dpi = 300L)
+chunking_options <- list(
+  chunker_type = "markdown",
+  max_characters = 1000L,
+  overlap = 200L
+)
+settings <- list(
+  output_format = "markdown",
+  force_ocr = TRUE,
+  extract_tables = TRUE,
+  extract_metadata = TRUE,
+  ocr = ocr_options,
+  chunking = chunking_options
+)
+config_json <- jsonlite::toJSON(settings, auto_unbox = TRUE)
+config <- ExtractionConfig$from_json(config_json)
+
+json <- extract_file_sync(
+  path = "scanned_report.pdf",
+  mime_type = "application/pdf",
+  config = config
+)
+result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
+
+title <- result$metadata$title
+if (is.null(title)) title <- "<none>"
+cat(sprintf("Chunks: %d\n", length(result$chunks)))
+cat(sprintf("Tables: %d\n", length(result$tables)))
+cat(sprintf("Title: %s\n", title))
+```
--- a/docs/snippets/r/api/error_handling.md
+++ b/docs/snippets/r/api/error_handling.md
@@ -0,0 +1,26 @@
+```r title="R"
+library(kreuzberg)
+
+content <- charToRaw("Hello, world!")
+# On failure, report the condition message and use NULL as the sentinel.
+result <- tryCatch(
+  jsonlite::fromJSON(
+    extract_bytes_sync(
+      content = content,
+      mime_type = "application/x-nonexistent",
+      config = ExtractionConfig$default()
+    ),
+    simplifyVector = FALSE
+  ),
+  error = function(err) {
+    message(sprintf("Extraction failed: %s", conditionMessage(err)))
+    NULL
+  }
+)
+
+if (!is.null(result)) {
+  cat(sprintf("Extracted %d characters\n", nchar(result$content)))
+} else {
+  cat("No content extracted; falling back to original bytes\n")
+}
+```
--- a/docs/snippets/r/api/error_handling_extract.md
+++ b/docs/snippets/r/api/error_handling_extract.md
@@ -0,0 +1,35 @@
+```r title="R"
+library(kreuzberg)
+
+# "missing.html" is presumably absent, to demonstrate a per-document error.
+paths <- c("doc1.pdf", "doc2.docx", "missing.html")
+items <- jsonlite::toJSON(
+  lapply(paths, function(p) list(path = p)),
+  auto_unbox = TRUE
+)
+
+result <- tryCatch(
+  jsonlite::fromJSON(
+    batch_extract_files_sync(items = items, config = ExtractionConfig$default()),
+    simplifyVector = FALSE
+  ),
+  error = function(err) {
+    message(sprintf("Batch extraction failed: %s", conditionMessage(err)))
+    NULL
+  }
+)
+
+# A failed batch leaves `result` NULL; seq_along(NULL) is empty, so the loop
+# below is skipped in that case.
+if (is.null(result)) cat("No results returned\n")
+for (i in seq_along(result)) {
+  doc <- result[[i]]
+  failure <- doc$metadata$error
+  if (!is.null(failure)) {
+    cat(sprintf("Document %d: ERROR - %s\n", i, failure))
+    next
+  }
+  cat(sprintf("Document %d: %d chars, %d tables\n",
+              i, nchar(doc$content), length(doc$tables)))
+}
+```
--- a/docs/snippets/r/api/extract_bytes_async.md
+++ b/docs/snippets/r/api/extract_bytes_async.md
@@ -0,0 +1,18 @@
+```r title="R"
+library(kreuzberg)
+
+# Despite being the async variant, extract_bytes keeps the calling R thread
+# waiting until the underlying tokio task completes. Use future/promises if
+# you need to fan out without blocking.
+pdf_path <- "document.pdf"
+pdf_bytes <- readBin(pdf_path, what = "raw", n = file.size(pdf_path))
+
+result <- extract_bytes(
+  content = pdf_bytes,
+  mime_type = "application/pdf",
+  config = ExtractionConfig$default()
+) |>
+  jsonlite::fromJSON(simplifyVector = FALSE)
+
+cat(sprintf("Extracted %d characters\n", nchar(result$content)))
+```
--- a/docs/snippets/r/api/extract_bytes_sync.md
+++ b/docs/snippets/r/api/extract_bytes_sync.md
@@ -0,0 +1,16 @@
+```r title="R"
+library(kreuzberg)
+
+# Load the file as a raw vector sized by its length on disk.
+source_path <- "document.pdf"
+raw_bytes <- readBin(source_path, what = "raw", n = file.size(source_path))
+result <- extract_bytes_sync(
+  content = raw_bytes,
+  mime_type = "application/pdf",
+  config = ExtractionConfig$default()
+) |>
+  jsonlite::fromJSON(simplifyVector = FALSE)
+
+cat(sprintf("MIME type: %s\n", result$mime_type))
+cat(sprintf("Content preview: %s\n", substr(result$content, 1, 200)))
+```
--- a/docs/snippets/r/api/extract_file_async.md
+++ b/docs/snippets/r/api/extract_file_async.md
@@ -0,0 +1,15 @@
+```r title="R"
+library(kreuzberg)
+
+# extract_file is the async variant. extendr drives the tokio runtime, so the
+# call returns only once extraction completes; R has no native async, so wrap
+# it with the future/promises packages if non-blocking dispatch is required.
+result <- extract_file(
+  path = "document.pdf",
+  mime_type = "application/pdf",
+  config = ExtractionConfig$default()
+) |>
+  jsonlite::fromJSON(simplifyVector = FALSE)
+
+cat(sprintf("Extracted %d characters from %s\n", nchar(result$content), result$mime_type))
+```
--- a/docs/snippets/r/api/extract_file_sync.md
+++ b/docs/snippets/r/api/extract_file_sync.md
@@ -0,0 +1,13 @@
+```r title="R"
+library(kreuzberg)
+
+# Run the synchronous extraction and decode its JSON result into nested lists.
+result <- extract_file_sync(
+  path = "document.pdf",
+  mime_type = "application/pdf",
+  config = ExtractionConfig$default()
+) |>
+  jsonlite::fromJSON(simplifyVector = FALSE)
+cat(sprintf("MIME type: %s\n", result$mime_type))
+cat(sprintf("Content length: %d characters\n", nchar(result$content)))
+```