This commit is contained in:
23
docs/snippets/r/metadata/language_detection.md
Normal file
23
docs/snippets/r/metadata/language_detection.md
Normal file
@@ -0,0 +1,23 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# Extract the document, then read the detected language two ways: as a plain
# field on the result and through the detected_language() helper.
result <- extract_file_sync("document.pdf")

cat("Language Detection Results:\n\n")

cat("Using direct field access:\n")
cat("Detected Language:", result$detected_language, "\n\n")

cat("Using S3 helper function:\n")
lang <- detected_language(result)
cat("Language (via helper):", lang, "\n\n")

cat("Language Information:\n")
# `if` requires a single, non-NA TRUE/FALSE. Guard against a missing or
# multi-valued result before comparing language codes, otherwise the
# comparison below would stop with an error.
lang <- unlist(lang)
lang <- lang[!is.na(lang)]

if (length(lang) == 0L) {
  cat("No language was detected\n")
} else if (length(lang) > 1L) {
  cat(sprintf("This document contains several languages: %s\n", toString(lang)))
} else if (lang == "en") {
  cat("This is an English document\n")
} else if (lang == "es") {
  cat("This is a Spanish document\n")
} else {
  cat(sprintf("This is a %s document\n", lang))
}
|
||||
```
|
||||
13
docs/snippets/r/metadata/language_detection_multilingual.md
Normal file
13
docs/snippets/r/metadata/language_detection_multilingual.md
Normal file
@@ -0,0 +1,13 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# Detect the language of several PDFs with language detection enabled.
files <- c("english.pdf", "spanish.pdf", "french.pdf")
config <- list(language_detection = list(enabled = TRUE))

for (file in files) {
  json <- extract_file_sync(file, "application/pdf", config)
  result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

  # sprintf() returns character(0) when any argument is empty and recycles
  # over arguments longer than one, so a file without a detected language
  # would print nothing at all. Collapse the field to one label first.
  detected <- unlist(result$detected_language)
  label <- if (length(detected) > 0L) toString(detected) else "unknown"

  cat(sprintf("%s: detected language = %s\n", file, label))
}
|
||||
```
|
||||
25
docs/snippets/r/metadata/metadata.md
Normal file
25
docs/snippets/r/metadata/metadata.md
Normal file
@@ -0,0 +1,25 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# Extract once; every value printed below is read from this result.
doc <- extract_file_sync("document.pdf")

# Top-level fields on the result.
cat("Detected Language:", doc$detected_language, "\n")
cat("Quality Score:", doc$quality_score, "\n")
cat("Keywords:", paste(doc$keywords, collapse = ", "), "\n\n")

# Individual metadata entries, printed only when metadata_field() returns
# something other than NULL.
cat("Metadata fields:\n")

doc_authors <- metadata_field(doc, "authors")
if (!is.null(doc_authors)) {
  cat("Authors:", paste(doc_authors, collapse = ", "), "\n")
}

doc_created <- metadata_field(doc, "created_date")
if (!is.null(doc_created)) {
  cat("Created Date:", doc_created, "\n")
}

doc_page_count <- metadata_field(doc, "page_count")
if (!is.null(doc_page_count)) {
  cat("Pages:", doc_page_count, "\n")
}
|
||||
```
|
||||
22
docs/snippets/r/metadata/page_boundaries.md
Normal file
22
docs/snippets/r/metadata/page_boundaries.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# Slice the extracted text into pages using the byte offsets stored in the
# page boundary metadata, and preview the first three pages.
result <- extract_file_sync("document.pdf")

boundaries <- result$metadata$pages$boundaries

# length(NULL) is 0, so this also covers a result without boundaries.
if (length(boundaries) > 0L) {
  # The offsets are applied to raw bytes, not characters. enc2utf8() keeps
  # the byte layout independent of the session locale.
  # NOTE(review): assumes the offsets refer to UTF-8 bytes - confirm.
  content_bytes <- charToRaw(enc2utf8(result$content))

  for (i in seq_len(min(3L, length(boundaries)))) {
    boundary <- boundaries[[i]]

    # byte_start is treated as a 0-based inclusive offset and byte_end as
    # exclusive. Build the index with seq_len() rather than `start:end`:
    # for an empty page (byte_start == byte_end) the colon form would count
    # backwards and select two bytes that do not belong to the page.
    n_bytes <- max(0L, boundary$byte_end - boundary$byte_start)
    page_bytes <- content_bytes[boundary$byte_start + seq_len(n_bytes)]

    page_text <- rawToChar(page_bytes)
    # rawToChar() does not declare an encoding; mark it so substr() counts
    # characters instead of bytes.
    Encoding(page_text) <- "UTF-8"

    cat(sprintf("Page %d:\n", boundary$page_number))
    cat(sprintf("  Byte range: %d-%d\n", boundary$byte_start, boundary$byte_end))
    # substr() already stops at the end of the string, so no length clamp
    # is needed for pages shorter than 100 characters.
    cat(sprintf("  Preview: %s...\n", substr(page_text, 1L, 100L)))
  }
}
|
||||
```
|
||||
20
docs/snippets/r/metadata/page_tracking_basic.md
Normal file
20
docs/snippets/r/metadata/page_tracking_basic.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# Extract the document and report per-page statistics.
doc <- extract_file_sync("document.pdf")

cat("Total pages:", page_count(doc), "\n\n")

for (page_no in seq_along(doc$pages)) {
  current <- doc$pages[[page_no]]
  page_text <- current$content
  text_length <- nchar(page_text)

  cat(sprintf("Page %d:\n", page_no))
  cat("  Elements:", length(current$elements), "\n")
  cat("  Text content length:", text_length, "chars\n")

  # Show up to the first 100 characters of pages that contain text.
  if (text_length > 0L) {
    cat(sprintf("  Preview: %s...\n", substr(page_text, 1L, 100L)))
  }
  cat("\n")
}
|
||||
```
|
||||
22
docs/snippets/r/metadata/tables.md
Normal file
22
docs/snippets/r/metadata/tables.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# Extract a spreadsheet and summarise every table found in it.
doc <- extract_file_sync("spreadsheet.xlsx")

cat("Tables extracted:", length(doc$tables), "\n\n")

for (idx in seq_along(doc$tables)) {
  # Named `tbl` so the loop variable does not mask base::table().
  tbl <- doc$tables[[idx]]
  row_total <- nrow(tbl)

  cat(sprintf("Table %d:\n", idx))
  cat("  Rows:", row_total, "\n")
  cat("  Columns:", ncol(tbl), "\n")
  cat("  Column names:", paste(colnames(tbl), collapse = ", "), "\n")
  cat("\n")

  # Print a short preview of tables that have at least one row.
  if (row_total > 0L) {
    cat("  Preview (first 3 rows):\n")
    print(head(tbl, 3L))
    cat("\n")
  }
}
|
||||
```
|
||||
24
docs/snippets/r/metadata/vector_database_integration.md
Normal file
24
docs/snippets/r/metadata/vector_database_integration.md
Normal file
@@ -0,0 +1,24 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# Chunk a document and shape the first few chunks into records of the kind
# a vector database would ingest.
source_file <- "document.pdf"

config <- list(
  chunking = list(max_characters = 1000L, overlap = 200L)
)

json <- extract_file_sync(source_file, "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

# Only the first three chunks are turned into records.
n_records <- min(3L, length(result$chunks))

for (i in seq_len(n_records)) {
  chunk <- result$chunks[[i]]

  chunk_meta <- list(
    source = source_file,
    chunk_index = i,
    length = nchar(chunk)
  )
  vector_doc <- list(
    id = sprintf("doc_%d", i),
    text = chunk,
    metadata = chunk_meta
  )

  cat(sprintf("Vector DB entry %d: %d chars\n", i, vector_doc$metadata$length))
}
|
||||
```
|
||||
Reference in New Issue
Block a user