This commit is contained in:
15
docs/snippets/r/utils/chunking.md
Normal file
15
docs/snippets/r/utils/chunking.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
config <- list(
|
||||
chunking = list(max_characters = 1000L, overlap = 200L)
|
||||
)
|
||||
|
||||
json <- extract_file_sync("document.pdf", "application/pdf", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
|
||||
cat(sprintf("Total chunks: %d\n", length(result$chunks)))
|
||||
for (i in seq_len(min(5L, length(result$chunks)))) {
|
||||
# Each chunk is a list; measure its text field, not the list itself.
cat(sprintf("Chunk %d: %d characters\n", i, nchar(result$chunks[[i]]$content)))
|
||||
}
|
||||
```
|
||||
25
docs/snippets/r/utils/chunking_rag.md
Normal file
25
docs/snippets/r/utils/chunking_rag.md
Normal file
@@ -0,0 +1,25 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
config <- list(
|
||||
chunking = list(
|
||||
max_characters = 500L,
|
||||
overlap = 50L,
|
||||
embedding = list(
|
||||
model = list(type = "preset", name = "balanced"),
|
||||
normalize = TRUE
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
json <- extract_file_sync("research_paper.pdf", "application/pdf", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
|
||||
for (i in seq_along(result$chunks)) {
|
||||
chunk <- result$chunks[[i]]
|
||||
cat(sprintf("Chunk %d/%d\n", i, length(result$chunks)))
|
||||
if (!is.null(chunk$embedding)) {
|
||||
cat(sprintf(" Embedding: %d dimensions\n", length(chunk$embedding)))
|
||||
}
|
||||
}
|
||||
```
|
||||
20
docs/snippets/r/utils/embedding_with_chunking.md
Normal file
20
docs/snippets/r/utils/embedding_with_chunking.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
config <- list(
|
||||
chunking = list(
|
||||
max_characters = 1024L,
|
||||
overlap = 100L,
|
||||
embedding = list(
|
||||
model = list(type = "preset", name = "balanced"),
|
||||
normalize = TRUE,
|
||||
batch_size = 32L
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
json <- extract_file_sync("document.pdf", "application/pdf", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
|
||||
# Count only the chunks that actually carry an embedding, so the number
# matches the label even if some chunks came back without one.
has_embedding <- vapply(
  result$chunks,
  function(chunk) !is.null(chunk$embedding),
  logical(1L)
)
cat(sprintf("Chunks with embeddings: %d\n", sum(has_embedding)))
|
||||
```
|
||||
21
docs/snippets/r/utils/keyword_extraction_example.md
Normal file
21
docs/snippets/r/utils/keyword_extraction_example.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
config <- list(
|
||||
keywords = list(
|
||||
algorithm = "yake",
|
||||
max_keywords = 10L,
|
||||
min_score = 0.3
|
||||
)
|
||||
)
|
||||
|
||||
json <- extract_file_sync("research_paper.pdf", "application/pdf", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
|
||||
cat(sprintf("Content length: %d characters\n", nchar(result$content)))
|
||||
if (!is.null(result$metadata$keywords)) {
|
||||
for (kw in result$metadata$keywords) {
|
||||
cat(sprintf(" - %s\n", kw))
|
||||
}
|
||||
}
|
||||
```
|
||||
15
docs/snippets/r/utils/quality_processing_example.md
Normal file
15
docs/snippets/r/utils/quality_processing_example.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
config <- list(enable_quality_processing = TRUE)
|
||||
json <- extract_file_sync("scanned_document.pdf", "application/pdf", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
|
||||
cat(sprintf("Content length: %d characters\n", nchar(result$content)))
|
||||
if (!is.null(result$quality_score)) {
|
||||
cat(sprintf("Quality score: %.2f\n", result$quality_score))
|
||||
if (result$quality_score < 0.5) {
|
||||
cat("Warning: low quality extraction\n")
|
||||
}
|
||||
}
|
||||
```
|
||||
15
docs/snippets/r/utils/standalone_embed.md
Normal file
15
docs/snippets/r/utils/standalone_embed.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
config <- list(
|
||||
model = list(type = "preset", name = "balanced"),
|
||||
normalize = TRUE
|
||||
)
|
||||
|
||||
texts <- c("Hello, world!", "Kreuzberg is fast")
|
||||
embeddings <- embed_texts(texts, config)
|
||||
|
||||
stopifnot(length(embeddings) == 2L)
|
||||
cat(sprintf("Embedding 1: %d dimensions\n", length(embeddings[[1]])))
|
||||
cat(sprintf("Embedding 2: %d dimensions\n", length(embeddings[[2]])))
|
||||
```
|
||||
15
docs/snippets/r/utils/token_reduction.md
Normal file
15
docs/snippets/r/utils/token_reduction.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
config <- list(
|
||||
token_reduction = list(
|
||||
mode = "moderate",
|
||||
preserve_important_words = TRUE
|
||||
)
|
||||
)
|
||||
|
||||
json <- extract_file_sync("document.pdf", "application/pdf", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
|
||||
cat(result$content)
|
||||
```
|
||||
16
docs/snippets/r/utils/token_reduction_example.md
Normal file
16
docs/snippets/r/utils/token_reduction_example.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
config <- list(
|
||||
token_reduction = list(
|
||||
mode = "moderate",
|
||||
preserve_important_words = TRUE
|
||||
)
|
||||
)
|
||||
|
||||
json <- extract_file_sync("verbose_document.pdf", "application/pdf", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
|
||||
cat(sprintf("Reduced content length: %d characters\n", nchar(result$content)))
|
||||
cat(sprintf("MIME type: %s\n", result$mime_type))
|
||||
```
|
||||
26
docs/snippets/r/utils/vector_database_integration.md
Normal file
26
docs/snippets/r/utils/vector_database_integration.md
Normal file
@@ -0,0 +1,26 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
document_id <- "doc-001"
|
||||
|
||||
config <- list(
|
||||
chunking = list(
|
||||
max_characters = 512L,
|
||||
overlap = 50L,
|
||||
embedding = list(
|
||||
model = list(type = "preset", name = "balanced"),
|
||||
normalize = TRUE,
|
||||
batch_size = 32L
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
json <- extract_file_sync("document.pdf", "application/pdf", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
|
||||
# Each chunk has $content, $embedding, and $metadata. Pass these directly
|
||||
# to a vector database client (pgvector, Qdrant, Pinecone, etc.) along with
|
||||
# the document_id stored as a metadata field.
|
||||
cat(sprintf("document_id: %s\n", document_id))
|
||||
cat(sprintf("chunks ready for upsert: %d\n", length(result$chunks)))
|
||||
```
|
||||
Reference in New Issue
Block a user