Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/docs/snippets/r/utils/chunking.md
+++ b/docs/snippets/r/utils/chunking.md
@@ -0,0 +1,15 @@
+```r title="R"
+library(kreuzberg)
+
+config <- list(
+  chunking = list(max_characters = 1000L, overlap = 200L)
+)
+# simplifyVector = FALSE keeps every chunk as a list rather than a data frame.
+json <- extract_file_sync("document.pdf", "application/pdf", config)
+result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
+cat(sprintf("Total chunks: %d\n", length(result$chunks)))
+for (i in seq_len(min(5L, length(result$chunks)))) {
+  chunk <- result$chunks[[i]]  # a list; its text lives in $content
+  cat(sprintf("Chunk %d: %d characters\n", i, nchar(chunk$content)))
+}
+```
--- a/docs/snippets/r/utils/chunking_rag.md
+++ b/docs/snippets/r/utils/chunking_rag.md
@@ -0,0 +1,25 @@
+```r title="R"
+library(kreuzberg)
+
+# Embedding settings attached to the chunking configuration below.
+embedding <- list(
+  model = list(type = "preset", name = "balanced"),
+  normalize = TRUE
+)
+config <- list(
+  chunking = list(max_characters = 500L, overlap = 50L, embedding = embedding)
+)
+
+json <- extract_file_sync("research_paper.pdf", "application/pdf", config)
+result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
+
+# Print each chunk's position and, when one is present, its embedding length.
+total <- length(result$chunks)
+for (idx in seq_len(total)) {
+  cat(sprintf("Chunk %d/%d\n", idx, total))
+  vec <- result$chunks[[idx]]$embedding
+  if (!is.null(vec)) {
+    cat(sprintf("  Embedding: %d dimensions\n", length(vec)))
+  }
+}
+```
--- a/docs/snippets/r/utils/embedding_with_chunking.md
+++ b/docs/snippets/r/utils/embedding_with_chunking.md
@@ -0,0 +1,20 @@
+```r title="R"
+library(kreuzberg)
+
+embedding <- list(
+  model = list(type = "preset", name = "balanced"),
+  normalize = TRUE,
+  batch_size = 32L
+)
+config <- list(
+  chunking = list(max_characters = 1024L, overlap = 100L, embedding = embedding)
+)
+
+json <- extract_file_sync("document.pdf", "application/pdf", config)
+result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
+
+# Count only chunks that carry an embedding so the printed label is accurate.
+has_embedding <- function(chunk) !is.null(chunk$embedding)
+embedded <- vapply(result$chunks, has_embedding, logical(1))
+cat(sprintf("Chunks with embeddings: %d\n", sum(embedded)))
+```
--- a/docs/snippets/r/utils/keyword_extraction_example.md
+++ b/docs/snippets/r/utils/keyword_extraction_example.md
@@ -0,0 +1,21 @@
+```r title="R"
+library(kreuzberg)
+
+config <- list(
+  keywords = list(algorithm = "yake", max_keywords = 10L, min_score = 0.3)
+)
+
+json <- extract_file_sync("research_paper.pdf", "application/pdf", config)
+result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
+
+cat(sprintf("Content length: %d characters\n", nchar(result$content)))
+
+# NOTE(review): keywords may be plain strings or records with a "text" field;
+# confirm against the result schema. Both shapes are handled below.
+if (!is.null(result$metadata$keywords)) {
+  for (kw in result$metadata$keywords) {
+    label <- if (is.list(kw)) kw[["text"]] else kw
+    cat(sprintf("  - %s\n", label))
+  }
+}
+```
--- a/docs/snippets/r/utils/quality_processing_example.md
+++ b/docs/snippets/r/utils/quality_processing_example.md
@@ -0,0 +1,15 @@
+```r title="R"
+library(kreuzberg)
+
+config <- list(enable_quality_processing = TRUE)
+json <- extract_file_sync("scanned_document.pdf", "application/pdf", config)
+result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
+
+cat(sprintf("Content length: %d characters\n", nchar(result$content)))
+# The score may be absent, so guard before formatting or comparing it.
+score <- result$quality_score
+if (!is.null(score)) {
+  cat(sprintf("Quality score: %.2f\n", score))
+  if (score < 0.5) cat("Warning: low quality extraction\n")
+}
+```
--- a/docs/snippets/r/utils/standalone_embed.md
+++ b/docs/snippets/r/utils/standalone_embed.md
@@ -0,0 +1,15 @@
+```r title="R"
+library(kreuzberg)
+
+config <- list(
+  model = list(type = "preset", name = "balanced"),
+  normalize = TRUE
+)
+
+texts <- c("Hello, world!", "Kreuzberg is fast")
+embeddings <- embed_texts(texts, config)
+
+stopifnot(length(embeddings) == length(texts))  # one embedding per input text
+dims <- lengths(embeddings)
+cat(sprintf("Embedding %d: %d dimensions\n", seq_along(dims), dims), sep = "")
+```
--- a/docs/snippets/r/utils/token_reduction.md
+++ b/docs/snippets/r/utils/token_reduction.md
@@ -0,0 +1,15 @@
+```r title="R"
+library(kreuzberg)
+
+# Token-reduction options, nested under the token_reduction config key.
+reduction <- list(mode = "moderate", preserve_important_words = TRUE)
+config <- list(token_reduction = reduction)
+
+# Run the extraction, then parse the returned JSON into nested R lists.
+json <- extract_file_sync("document.pdf", "application/pdf", config)
+result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
+
+# Print the content as-is; cat() does not append a trailing newline.
+text <- result$content
+cat(text)
+```
--- a/docs/snippets/r/utils/token_reduction_example.md
+++ b/docs/snippets/r/utils/token_reduction_example.md
@@ -0,0 +1,16 @@
+```r title="R"
+library(kreuzberg)
+
+# Token-reduction options, nested under the token_reduction config key.
+reduction <- list(mode = "moderate", preserve_important_words = TRUE)
+config <- list(token_reduction = reduction)
+
+json <- extract_file_sync("verbose_document.pdf", "application/pdf", config)
+result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
+
+# Summarize the outcome: size of the returned content and its MIME type.
+reduced_chars <- nchar(result$content)
+cat(sprintf("Reduced content length: %d characters\n", reduced_chars))
+mime <- result$mime_type
+cat(sprintf("MIME type: %s\n", mime))
+```
--- a/docs/snippets/r/utils/vector_database_integration.md
+++ b/docs/snippets/r/utils/vector_database_integration.md
@@ -0,0 +1,26 @@
+```r title="R"
+library(kreuzberg)
+
+document_id <- "doc-001"
+
+# Embedding options nested inside the chunking configuration.
+embedding <- list(
+  model = list(type = "preset", name = "balanced"),
+  normalize = TRUE,
+  batch_size = 32L
+)
+config <- list(
+  chunking = list(max_characters = 512L, overlap = 50L, embedding = embedding)
+)
+
+json <- extract_file_sync("document.pdf", "application/pdf", config)
+result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
+
+n_chunks <- length(result$chunks)
+
+# Every chunk carries $content, $embedding, and $metadata. Hand these straight
+# to your vector database client (pgvector, Qdrant, Pinecone, etc.), storing
+# document_id alongside them as a metadata field.
+cat(sprintf("document_id: %s\n", document_id))
+cat(sprintf("chunks ready for upsert: %d\n", n_chunks))
+```