Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

36
e2e/ruby/.rubocop.yaml generated Normal file
View File

@@ -0,0 +1,36 @@
# Generated by alef e2e — do not edit.
AllCops:
  NewCops: enable
  TargetRubyVersion: 3.2
  SuggestExtensions: false

plugins:
  - rubocop-rspec

# --- Justified suppressions for generated test code ---

# Generated examples bundle their setup with several assertions, so size
# metrics and line-length limits are switched off.
Metrics/BlockLength:
  Enabled: false
Metrics/MethodLength:
  Enabled: false
Layout/LineLength:
  Enabled: false

# Each generated example checks several facts about a single result.
RSpec/MultipleExpectations:
  Enabled: false
RSpec/ExampleLength:
  Enabled: false

# Describe blocks are named after fixture categories (strings), not classes.
RSpec/DescribeClass:
  Enabled: false

# Different fixtures may legitimately generate identical example bodies.
RSpec/RepeatedExample:
  Enabled: false

# The generator may emit a bare raise_error when the exception type is unknown.
RSpec/UnspecifiedException:
  Enabled: false

9
e2e/ruby/Gemfile generated Normal file
View File

@@ -0,0 +1,9 @@
# frozen_string_literal: true

source 'https://rubygems.org'

# Library under test, loaded from the in-repo Ruby package rather than a released gem.
gem 'kreuzberg', path: '../../packages/ruby'

# HTTP client.
# NOTE(review): no spec visible here calls it directly — presumably used by spec_helper; confirm.
gem 'faraday', '~> 2.0'

# Test runner and linters for the generated specs.
gem 'rspec', '~> 3.13'
gem 'rubocop', '~> 1.86'
gem 'rubocop-rspec', '~> 3.9'

119
e2e/ruby/Gemfile.lock generated Normal file
View File

@@ -0,0 +1,119 @@
PATH
remote: ../../packages/ruby
specs:
kreuzberg (5.0.0.pre.rc.3)
rb_sys (~> 0.9)
sorbet-runtime (~> 0.5)
GEM
remote: https://rubygems.org/
specs:
ast (2.4.3)
diff-lcs (1.6.2)
faraday (2.14.2)
faraday-net_http (>= 2.0, < 3.5)
json
logger
faraday-net_http (3.4.2)
net-http (~> 0.5)
json (2.19.5)
language_server-protocol (3.17.0.5)
lint_roller (1.1.0)
logger (1.7.0)
net-http (0.9.1)
uri (>= 0.11.1)
parallel (2.1.0)
parser (3.3.11.1)
ast (~> 2.4.1)
racc
prism (1.9.0)
racc (1.8.1)
rainbow (3.1.1)
rake-compiler-dock (1.12.0)
rb_sys (0.9.128)
rake-compiler-dock (= 1.12.0)
regexp_parser (2.12.0)
rspec (3.13.2)
rspec-core (~> 3.13.0)
rspec-expectations (~> 3.13.0)
rspec-mocks (~> 3.13.0)
rspec-core (3.13.6)
rspec-support (~> 3.13.0)
rspec-expectations (3.13.5)
diff-lcs (>= 1.2.0, < 2.0)
rspec-support (~> 3.13.0)
rspec-mocks (3.13.8)
diff-lcs (>= 1.2.0, < 2.0)
rspec-support (~> 3.13.0)
rspec-support (3.13.7)
rubocop (1.86.1)
json (~> 2.3)
language_server-protocol (~> 3.17.0.2)
lint_roller (~> 1.1.0)
parallel (>= 1.10)
parser (>= 3.3.0.2)
rainbow (>= 2.2.2, < 4.0)
regexp_parser (>= 2.9.3, < 3.0)
rubocop-ast (>= 1.49.0, < 2.0)
ruby-progressbar (~> 1.7)
unicode-display_width (>= 2.4.0, < 4.0)
rubocop-ast (1.49.1)
parser (>= 3.3.7.2)
prism (~> 1.7)
rubocop-rspec (3.9.0)
lint_roller (~> 1.1)
rubocop (~> 1.81)
ruby-progressbar (1.13.0)
sorbet-runtime (0.6.13224)
unicode-display_width (3.2.0)
unicode-emoji (~> 4.1)
unicode-emoji (4.2.0)
uri (1.1.1)
PLATFORMS
arm64-darwin-25
ruby
DEPENDENCIES
faraday (~> 2.0)
kreuzberg!
rspec (~> 3.13)
rubocop (~> 1.86)
rubocop-rspec (~> 3.9)
CHECKSUMS
ast (2.4.3) sha256=954615157c1d6a382bc27d690d973195e79db7f55e9765ac7c481c60bdb4d383
bundler (4.0.11) sha256=5bcec0fb78302e48d02ee46f10ee6e6942be647ba5b44a6d1ddfda9a240ce785
diff-lcs (1.6.2) sha256=9ae0d2cba7d4df3075fe8cd8602a8604993efc0dfa934cff568969efb1909962
faraday (2.14.2) sha256=73ccb9994a9e8648f010e32eca2ae82e41c57860aa10932cda29418b9e0223ad
faraday-net_http (3.4.2) sha256=f147758260d3526939bf57ecf911682f94926a3666502e24c69992765875906c
json (2.19.5) sha256=218a18553e4801d579ca7e0f5bc72bafd776d7397238a1fb4e74db5b0a812c59
kreuzberg (5.0.0.pre.rc.3)
language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
logger (1.7.0) sha256=196edec7cc44b66cfb40f9755ce11b392f21f7967696af15d274dde7edff0203
net-http (0.9.1) sha256=25ba0b67c63e89df626ed8fac771d0ad24ad151a858af2cc8e6a716ca4336996
parallel (2.1.0) sha256=b35258865c2e31134c5ecb708beaaf6772adf9d5efae28e93e99260877b09356
parser (3.3.11.1) sha256=d17ace7aabe3e72c3cc94043714be27cc6f852f104d81aa284c2281aecc65d54
prism (1.9.0) sha256=7b530c6a9f92c24300014919c9dcbc055bf4cdf51ec30aed099b06cd6674ef85
racc (1.8.1) sha256=4a7f6929691dbec8b5209a0b373bc2614882b55fc5d2e447a21aaa691303d62f
rainbow (3.1.1) sha256=039491aa3a89f42efa1d6dec2fc4e62ede96eb6acd95e52f1ad581182b79bc6a
rake-compiler-dock (1.12.0) sha256=f13205c2738f3d2053afcd03491a9e4541b22a59a0bfc53fc8bc883bd8188023
rb_sys (0.9.128) sha256=9ab81f4d6d4e1895de18762232362d1264475aa7035756b50441e442130538fd
regexp_parser (2.12.0) sha256=35a916a1d63190ab5c9009457136ae5f3c0c7512d60291d0d1378ba18ce08ebb
rspec (3.13.2) sha256=206284a08ad798e61f86d7ca3e376718d52c0bc944626b2349266f239f820587
rspec-core (3.13.6) sha256=a8823c6411667b60a8bca135364351dda34cd55e44ff94c4be4633b37d828b2d
rspec-expectations (3.13.5) sha256=33a4d3a1d95060aea4c94e9f237030a8f9eae5615e9bd85718fe3a09e4b58836
rspec-mocks (3.13.8) sha256=086ad3d3d17533f4237643de0b5c42f04b66348c28bf6b9c2d3f4a3b01af1d47
rspec-support (3.13.7) sha256=0640e5570872aafefd79867901deeeeb40b0c9875a36b983d85f54fb7381c47c
rubocop (1.86.1) sha256=44415f3f01d01a21e01132248d2fd0867572475b566ca188a0a42133a08d4531
rubocop-ast (1.49.1) sha256=4412f3ee70f6fe4546cc489548e0f6fcf76cafcfa80fa03af67098ffed755035
rubocop-rspec (3.9.0) sha256=8fa70a3619408237d789aeecfb9beef40576acc855173e60939d63332fdb55e2
ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33
sorbet-runtime (0.6.13224) sha256=25214f47e16bf13028519e04209ac2b1a59333657bdf01f816c1b4bcbec7b56c
unicode-display_width (3.2.0) sha256=0cdd96b5681a5949cdbc2c55e7b420facae74c4aaf9a9815eee1087cb1853c42
unicode-emoji (4.2.0) sha256=519e69150f75652e40bf736106cfbc8f0f73aa3fb6a65afe62fefa7f80b0f80f
uri (1.1.1) sha256=379fa58d27ffb1387eaada68c749d1426738bd0f654d812fcc07e7568f5c57c6
BUNDLED WITH
4.0.11

34
e2e/ruby/spec/async_spec.rb generated Normal file
View File

@@ -0,0 +1,34 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
# frozen_string_literal: true
require 'kreuzberg'
require 'json'
require 'spec_helper'
# End-to-end examples for the bytes-based async extraction call.
RSpec.describe 'async' do
  it 'async_extract_bytes: Async extract_bytes call on PDF document' do
    # binread opens the fixture in binary mode, so the PDF bytes are passed on
    # untouched (text-mode File.read translates line endings on Windows).
    content = File.binread('pdf/fake_memo.pdf').bytes
    result = Kreuzberg.extract_bytes_async(content, 'application/pdf')
    expect(result.mime_type.to_s.strip).to eq('application/pdf')
    expect(result.content.length).to be >= 50
  end

  it 'async_extract_bytes_empty_mime: extract_bytes empty MIME async' do
    # The fixture is read outside the expectation so that only the extraction
    # call itself is covered by raise_error.
    content = File.binread('text/plain.txt').bytes
    expect do
      Kreuzberg.extract_bytes_async(content, '', Kreuzberg::ExtractionConfig.new)
    end.to raise_error(RuntimeError)
  end

  it 'async_extract_bytes_invalid_mime: extract_bytes unsupported MIME async' do
    content = File.binread('text/plain.txt').bytes
    expect do
      Kreuzberg.extract_bytes_async(content, 'application/x-nonexistent', Kreuzberg::ExtractionConfig.new)
    end.to raise_error(RuntimeError)
  end
end

72
e2e/ruby/spec/batch_spec.rb generated Normal file
View File

@@ -0,0 +1,72 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
# frozen_string_literal: true
require 'kreuzberg'
require 'json'
require 'spec_helper'
# End-to-end examples for the batch extraction calls (bytes and files, sync and async).
# Most examples only assert that a result object comes back, including for inputs
# that cannot be extracted.
RSpec.describe 'batch' do
  it 'batch_bytes_invalid_mime: batch_extract_bytes_sync invalid MIME' do
    items = [
      Kreuzberg::BatchBytesItem.new(content: 'Hello'.bytes, mime_type: 'application/x-nonexistent', config: nil)
    ]
    outcome = Kreuzberg.batch_extract_bytes_sync(items)
    expect(outcome).not_to be_nil
  end

  it 'batch_extract_bytes_happy: batch_extract_bytes: happy path with mixed inputs' do
    plain_item = Kreuzberg::BatchBytesItem.new(
      content: 'Hello, world!'.bytes,
      mime_type: 'text/plain',
      config: nil
    )
    html_item = Kreuzberg::BatchBytesItem.new(
      content: '<html><body>Test</body></html>'.bytes,
      mime_type: 'text/html',
      config: nil
    )
    outcome = Kreuzberg.batch_extract_bytes_async([plain_item, html_item])
    expect(outcome.length).to be >= 1
  end

  it 'batch_extract_bytes_mixed_format: batch_extract_bytes: handles unsupported MIME gracefully' do
    items = [
      Kreuzberg::BatchBytesItem.new(content: 'PDF placeholder'.bytes, mime_type: 'application/x-unknown', config: nil)
    ]
    outcome = Kreuzberg.batch_extract_bytes_async(items)
    expect(outcome).not_to be_nil
  end

  it 'batch_extract_bytes_sync_empty_list: batch_extract_bytes_sync: empty batch' do
    outcome = Kreuzberg.batch_extract_bytes_sync([])
    expect(outcome.length).to eq(0)
  end

  it 'batch_extract_bytes_sync_invalid_mime: batch_extract_bytes_sync: unsupported MIME' do
    items = [
      Kreuzberg::BatchBytesItem.new(content: 'data'.bytes, mime_type: 'application/x-unknown', config: nil)
    ]
    outcome = Kreuzberg.batch_extract_bytes_sync(items)
    expect(outcome).not_to be_nil
  end

  it 'batch_file_async_basic: Extract text from multiple files asynchronously' do
    items = [
      Kreuzberg::BatchFileItem.new(path: 'pdf/fake_memo.pdf', config: nil),
      Kreuzberg::BatchFileItem.new(path: 'text/fake_text.txt', config: nil)
    ]
    outcome = Kreuzberg.batch_extract_files_async(items)
    expect(outcome).not_to be_nil
  end

  it 'batch_file_async_not_found: batch_extract_file async nonexistent' do
    items = [Kreuzberg::BatchFileItem.new(path: '/nonexistent/a.pdf', config: nil)]
    outcome = Kreuzberg.batch_extract_files_async(items)
    expect(outcome).not_to be_nil
  end

  it 'batch_file_not_found: batch_extract_file_sync nonexistent' do
    items = [
      Kreuzberg::BatchFileItem.new(path: '/nonexistent/a.pdf', config: nil),
      Kreuzberg::BatchFileItem.new(path: '/nonexistent/b.txt', config: nil)
    ]
    outcome = Kreuzberg.batch_extract_files_sync(items)
    expect(outcome).not_to be_nil
  end

  it 'batch_file_partial: batch_extract_file_sync mixed' do
    items = [
      Kreuzberg::BatchFileItem.new(path: 'text/plain.txt', config: nil),
      Kreuzberg::BatchFileItem.new(path: '/nonexistent/missing.pdf', config: nil)
    ]
    outcome = Kreuzberg.batch_extract_files_sync(items)
    expect(outcome).not_to be_nil
  end

  it 'batch_file_sync_basic: Extract text from multiple files synchronously' do
    items = [
      Kreuzberg::BatchFileItem.new(path: 'pdf/fake_memo.pdf', config: nil),
      Kreuzberg::BatchFileItem.new(path: 'text/fake_text.txt', config: nil)
    ]
    outcome = Kreuzberg.batch_extract_files_sync(items)
    expect(outcome).not_to be_nil
  end
end

21
e2e/ruby/spec/code_spec.rb generated Normal file
View File

@@ -0,0 +1,21 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
# frozen_string_literal: true
require 'kreuzberg'
require 'json'
require 'spec_helper'
# End-to-end example for source-code extraction of a shell script fixture.
RSpec.describe 'code' do
  # NOTE(review): the title mentions "bytes input", but the example goes through
  # the file-based extract_file_sync call — confirm against the fixture definition.
  it 'code_shebang_detection: Test language detection from shebang line via bytes input' do
    extraction = Kreuzberg.extract_file_sync('code/script.sh', 'text/x-source-code')

    expect(extraction.mime_type.to_s.strip).to eq('text/x-source-code')
    expect(extraction.content.length).to be >= 10
    # The extracted text has to mention both of these words.
    expect(extraction.content.to_s).to include('build')
    expect(extraction.content.to_s).to include('clean')
  end
end

150
e2e/ruby/spec/contract_spec.rb generated Normal file
View File

@@ -0,0 +1,150 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
# frozen_string_literal: true
require 'kreuzberg'
require 'json'
require 'spec_helper'
# End-to-end contract examples: each one runs an extraction with a particular
# ExtractionConfig option and checks the MIME type and/or content of the result.
#
# NOTE(review): the api_batch_* and api_extract_bytes_async titles name batch or
# bytes APIs, yet each of those examples calls Kreuzberg.extract_file_async —
# confirm against the fixture definitions whether that is intended.
RSpec.describe 'contract' do
  it 'api_batch_bytes_async: Tests async batch bytes extraction API (batch_extract_bytes)' do
    result = Kreuzberg.extract_file_async('pdf/fake_memo.pdf')
    expect(result.mime_type.to_s.strip).to eq('application/pdf')
    expect(result.content.length).to be >= 10
    expect(['May 5, 2023', 'Mallori'].any? { |v| result.content.to_s.include?(v) }).to be(true)
  end

  it 'api_batch_bytes_with_configs_async: Tests async batch bytes extraction with per-file configs (batch_extract_bytes with file_configs parameter)' do
    config = Kreuzberg::ExtractionConfig.new(output_format: 'markdown')
    result = Kreuzberg.extract_file_async('pdf/fake_memo.pdf', nil, config)
    expect(result.mime_type.to_s.strip).to eq('application/pdf')
    expect(result.content.length).to be >= 10
    # skipped: field 'metadata.output_format' not available on result type
  end

  it 'api_batch_file_async: Tests async batch file extraction API (batch_extract_file)' do
    result = Kreuzberg.extract_file_async('pdf/fake_memo.pdf')
    expect(result.mime_type.to_s.strip).to eq('application/pdf')
    expect(result.content.length).to be >= 10
    expect(['May 5, 2023', 'Mallori'].any? { |v| result.content.to_s.include?(v) }).to be(true)
  end

  it 'api_batch_file_with_configs_async: Tests async batch file extraction with per-file configs (batch_extract_files with file_configs parameter)' do
    config = Kreuzberg::ExtractionConfig.new(output_format: 'markdown')
    result = Kreuzberg.extract_file_async('pdf/fake_memo.pdf', nil, config)
    expect(result.mime_type.to_s.strip).to eq('application/pdf')
    expect(result.content.length).to be >= 10
    # skipped: field 'metadata.output_format' not available on result type
  end

  it 'api_extract_bytes_async: Tests async bytes extraction API (extract_bytes)' do
    result = Kreuzberg.extract_file_async('pdf/fake_memo.pdf')
    expect(result.mime_type.to_s.strip).to eq('application/pdf')
    expect(result.content.length).to be >= 10
    expect(['May 5, 2023', 'Mallori'].any? { |v| result.content.to_s.include?(v) }).to be(true)
  end

  it 'api_extract_file_async: Tests async file extraction API (extract_file)' do
    result = Kreuzberg.extract_file_async('pdf/fake_memo.pdf')
    expect(result.mime_type.to_s.strip).to eq('application/pdf')
    expect(result.content.length).to be >= 10
    expect(['May 5, 2023', 'Mallori'].any? { |v| result.content.to_s.include?(v) }).to be(true)
  end

  it 'config_chunking_prepend_heading_context: Tests markdown chunker prepends heading hierarchy to chunk content' do
    chunking = { 'chunker_type' => 'markdown', 'max_chars' => 300, 'max_overlap' => 50, 'prepend_heading_context' => true }
    result = Kreuzberg.extract_file_sync('markdown/extraction_test.md', nil, Kreuzberg::ExtractionConfig.new(chunking: chunking))
    expect(result.content.length).to be >= 10
    # NOTE(review): the generator emitted "skipped: field 'chunks' not available on
    # result type" at this point, although the assertion below reads result.chunks —
    # confirm which of the two is intended. The assertion also passes when
    # result.chunks is nil or empty.
    expect((result.chunks || []).all? { |c| c.content && !c.content.empty? }).to be(true)
    # skipped: synthetic field 'chunks_have_heading_context' not available on Ruby Chunk binding
    # skipped: synthetic field 'first_chunk_starts_with_heading' not available on Ruby Chunk binding
  end

  it 'config_document_structure_with_headings: Tests document structure with DOCX heading-driven nesting' do
    config = Kreuzberg::ExtractionConfig.new(include_document_structure: true)
    result = Kreuzberg.extract_file_sync('docx/fake.docx', nil, config)
    expect(result.mime_type.to_s.strip).to eq('application/vnd.openxmlformats-officedocument.wordprocessingml.document')
    # skipped: field 'document' not available on result type
    # skipped: field 'document.nodes' not available on result type
  end

  it 'config_element_types: Tests element-based result format with element type assertions on DOCX' do
    config = Kreuzberg::ExtractionConfig.new(result_format: 'element_based')
    result = Kreuzberg.extract_file_sync('docx/unit_test_headers.docx', nil, config)
    expect(['application/vnd.openxmlformats-officedocument.wordprocessingml.document'].any? { |v| result.mime_type.to_s.include?(v) }).to be(true)
    # skipped: field 'elements' not available on result type
  end

  it 'config_extraction_timeout: Tests that extraction_timeout_secs config field is accepted and does not affect fast extractions' do
    config = Kreuzberg::ExtractionConfig.new(extraction_timeout_secs: 300)
    result = Kreuzberg.extract_file_sync('pdf/fake_memo.pdf', nil, config)
    expect(result.mime_type.to_s.strip).to eq('application/pdf')
    expect(result.content.length).to be >= 10
  end

  it 'config_keywords: Tests keyword extraction via YAKE algorithm' do
    config = Kreuzberg::ExtractionConfig.new(keywords: { 'algorithm' => 'yake', 'max_keywords' => 10 })
    result = Kreuzberg.extract_file_sync('pdf/fake_memo.pdf', nil, config)
    expect(result.mime_type.to_s.strip).to eq('application/pdf')
    expect(result.content.length).to be >= 10
    # skipped: field 'keywords' not available on Ruby ExtractionResult
    # skipped: field 'keywords' not available on Ruby ExtractionResult
  end

  it 'config_pages: Tests page extraction and page marker configuration' do
    config = Kreuzberg::ExtractionConfig.new(pages: { 'extract_pages' => true, 'insert_page_markers' => true })
    result = Kreuzberg.extract_file_sync('pdf/fake_memo.pdf', nil, config)
    expect(result.mime_type.to_s.strip).to eq('application/pdf')
    expect(result.content.length).to be >= 10
    expect(['PAGE'].any? { |v| result.content.to_s.include?(v) }).to be(true)
  end

  it 'config_quality_enabled: Tests quality scoring produces a score value in [0.0, 1.0]' do
    config = Kreuzberg::ExtractionConfig.new(enable_quality_processing: true)
    result = Kreuzberg.extract_file_sync('pdf/fake_memo.pdf', nil, config)
    expect(result.mime_type.to_s.strip).to eq('application/pdf')
    expect(result.content.length).to be >= 10
    # skipped: field 'quality_score' not available on result type
    # skipped: field 'quality_score' not available on result type
    # skipped: field 'quality_score' not available on result type
  end

  it 'config_security_limits: Tests archive extraction with custom security limits' do
    limits = { 'max_archive_size' => 104_857_600, 'max_compression_ratio' => 50, 'max_files_in_archive' => 100 }
    result = Kreuzberg.extract_file_sync('archives/documents.zip', nil, Kreuzberg::ExtractionConfig.new(security_limits: limits))
    expect(['application/zip', 'application/x-zip-compressed'].any? { |v| result.mime_type.to_s.include?(v) }).to be(true)
    expect(result.content.length).to be >= 10
  end

  it 'config_tree_sitter: Tests tree-sitter configuration round-trip' do
    tree_sitter = {
      'groups' => ['web'],
      'languages' => ['python', 'rust'],
      'process' => {
        'comments' => false,
        'diagnostics' => false,
        'docstrings' => false,
        'exports' => true,
        'imports' => true,
        'structure' => true,
        'symbols' => false
      }
    }
    result = Kreuzberg.extract_file_sync('code/hello.py', nil, Kreuzberg::ExtractionConfig.new(tree_sitter: tree_sitter))
    expect(result.mime_type.to_s.strip).to eq('text/x-source-code')
    expect(result.content.length).to be >= 5
  end

  it 'output_format_bytes_markdown: Tests markdown output format via bytes extraction API' do
    # binread opens the fixture in binary mode, so the PDF bytes are passed on
    # untouched (text-mode File.read translates line endings on Windows).
    content = File.binread('pdf/fake_memo.pdf').bytes
    config = Kreuzberg::ExtractionConfig.new(output_format: 'markdown')
    result = Kreuzberg.extract_bytes_sync(content, 'application/pdf', config)
    expect(result.mime_type.to_s.strip).to eq('application/pdf')
    expect(result.content.length).to be >= 10
    # skipped: field 'metadata.output_format' not available on result type
  end

  it 'output_format_markdown: Tests Markdown output format' do
    config = Kreuzberg::ExtractionConfig.new(output_format: 'markdown')
    result = Kreuzberg.extract_file_sync('pdf/fake_memo.pdf', nil, config)
    expect(result.mime_type.to_s.strip).to eq('application/pdf')
    expect(result.content.length).to be >= 10
    # skipped: field 'metadata.output_format' not available on result type
  end
end

39
e2e/ruby/spec/detection_spec.rb generated Normal file
View File

@@ -0,0 +1,39 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
# frozen_string_literal: true
require 'kreuzberg'
require 'json'
require 'spec_helper'
# End-to-end examples for MIME detection from raw bytes and for the
# MIME-to-extension lookup.
RSpec.describe 'detection' do
  # Fixtures are loaded with File.binread so the bytes handed to the detector are
  # exactly the bytes on disk (text-mode File.read translates line endings on Windows).
  it 'detect_mime_bytes_html: Detect HTML MIME from bytes' do
    content = File.binread('html/html.html').bytes
    result = Kreuzberg.detect_mime_type_from_bytes(content)
    expect(result).not_to be_nil
  end

  it 'detect_mime_bytes_pdf: Detect PDF MIME type from bytes' do
    content = File.binread('pdf/fake_memo.pdf').bytes
    result = Kreuzberg.detect_mime_type_from_bytes(content)
    expect(result).not_to be_nil
  end

  it 'detect_mime_bytes_png: Detect PNG MIME type from bytes' do
    content = File.binread('images/test_hello_world.png').bytes
    result = Kreuzberg.detect_mime_type_from_bytes(content)
    expect(result).not_to be_nil
  end

  it 'get_extensions_unknown_mime: get_extensions unknown MIME' do
    expect do
      Kreuzberg.get_extensions_for_mime('application/x-totally-unknown')
    end.to raise_error(RuntimeError)
  end
end

View File

@@ -0,0 +1,23 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
# frozen_string_literal: true
require 'kreuzberg'
require 'json'
require 'spec_helper'
# End-to-end examples for clearing and listing document extractors.
RSpec.describe 'document_extractor_management' do
  it 'document_extractors_clear: Clear all document extractors and verify list is empty' do
    # The example used to contain no expectation at all; "does not raise" is now
    # stated explicitly so the example asserts something.
    # NOTE(review): the title promises the list is verified empty, but no such
    # check is generated — confirm whether one should be added.
    expect { Kreuzberg.clear_document_extractors }.not_to raise_error
  end

  it 'extractors_list: List all registered document extractors' do
    result = Kreuzberg.list_document_extractors
    expect(result).not_to be_nil
  end
end

View File

@@ -0,0 +1,32 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
# frozen_string_literal: true
require 'kreuzberg'
require 'json'
require 'spec_helper'
# End-to-end examples for the async text-embedding call.
RSpec.describe 'embed_async_pending' do
  # In each example the nil check runs before any method is called on the result,
  # so a nil result fails as an expectation failure instead of a NoMethodError.
  it 'embed_texts_async_empty_input: embed_texts_async: empty text list' do
    result = Kreuzberg.embed_texts_async([])
    expect(result).not_to be_nil
    expect(result.length).to eq(0)
  end

  it 'embed_texts_async_happy: embed_texts_async: basic async embedding' do
    result = Kreuzberg.embed_texts_async(['First', 'Second'])
    expect(result).not_to be_nil
    expect(result.length).to be >= 2
  end

  it 'embed_texts_async_preset_switch: embed_texts_async: preset override' do
    # The config is passed as a positional hash, exactly as the generator emitted it.
    result = Kreuzberg.embed_texts_async(['Text'], { model: { 'name' => 'balanced', 'type' => 'preset' } })
    expect(result).not_to be_nil
  end
end

18
e2e/ruby/spec/embed_extra_spec.rb generated Normal file
View File

@@ -0,0 +1,18 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
# frozen_string_literal: true
require 'kreuzberg'
require 'json'
require 'spec_helper'
# End-to-end example for embedding a small batch of texts with a preset model.
RSpec.describe 'embed_extra' do
  it 'embed_texts_batch: Batch embed texts' do
    # The config is handed over as a positional hash.
    preset_config = { model: { 'name' => 'balanced', 'type' => 'preset' } }
    embeddings = Kreuzberg.embed_texts(%w[Hello World], preset_config)

    expect(embeddings).not_to be_nil
  end
end

View File

@@ -0,0 +1,23 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
# frozen_string_literal: true
require 'kreuzberg'
require 'json'
require 'spec_helper'
# End-to-end examples for clearing and listing embedding backends.
RSpec.describe 'embedding_backend_management' do
  it 'embedding_backends_clear: Clear all embedding backends and verify list is empty' do
    # The example used to contain no expectation at all; "does not raise" is now
    # stated explicitly so the example asserts something.
    # NOTE(review): the title promises the list is verified empty, but no such
    # check is generated — confirm whether one should be added.
    expect { Kreuzberg.clear_embedding_backends }.not_to raise_error
  end

  it 'embedding_backends_list: List all registered embedding backends' do
    result = Kreuzberg.list_embedding_backends
    expect(result).not_to be_nil
  end
end

43
e2e/ruby/spec/embeddings_spec.rb generated Normal file
View File

@@ -0,0 +1,43 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
# frozen_string_literal: true
require 'kreuzberg'
require 'json'
require 'spec_helper'
# End-to-end examples for text embedding and embedding-preset lookup.
RSpec.describe 'embeddings' do
  it 'embed_texts_different_preset: embed_texts: multilingual preset' do
    result = Kreuzberg.embed_texts(['Hello world', 'Test'], { model: { 'name' => 'multilingual', 'type' => 'preset' } })
    # Nil check first, so a nil result fails as an expectation failure rather
    # than as a NoMethodError from calling #length on nil.
    expect(result).not_to be_nil
    expect(result.length).to be >= 2
  end

  it 'get_embedding_preset_known: get_embedding_preset: known preset' do
    result = Kreuzberg.get_embedding_preset('balanced')
    expect(result).not_to be_nil
  end

  it 'get_embedding_preset_nominal: get_embedding_preset: nominal case' do
    result = Kreuzberg.get_embedding_preset('balanced')
    expect(result).not_to be_nil
  end

  it 'get_embedding_preset_unknown: get_embedding_preset: unknown preset fails' do
    result = Kreuzberg.get_embedding_preset('nonexistent-xyz')
    # Either a nil or an empty result counts as "not found".
    expect(result.nil? || result.empty?).to be(true)
  end

  it 'list_embedding_presets_sanity: list_embedding_presets: returns at least one' do
    result = Kreuzberg.list_embedding_presets
    # NOTE(review): this checks the string form of the result; if the call returns
    # an Array, an empty list still stringifies to "[]" and passes — confirm the
    # return type and consider asserting on the collection itself.
    expect(result.to_s).not_to be_empty
  end
end

47
e2e/ruby/spec/error_spec.rb generated Normal file
View File

@@ -0,0 +1,47 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
# frozen_string_literal: true
require 'kreuzberg'
require 'json'
require 'spec_helper'
# End-to-end examples for error handling of the synchronous bytes extraction call.
RSpec.describe 'error' do
  # Fixtures are loaded with File.binread so the bytes handed to the extractor are
  # exactly the bytes on disk (text-mode File.read translates line endings on
  # Windows). They are read outside the expect blocks so that raise_error only
  # covers the extraction call itself.
  it 'error_empty_bytes: Graceful handling of empty bytes (should not error)' do
    content = File.binread('text/empty.txt').bytes
    result = Kreuzberg.extract_bytes_sync(content, 'text/plain', Kreuzberg::ExtractionConfig.new)
    expect(result).not_to be_nil
  end

  it 'error_empty_mime: Error when extracting with empty MIME type' do
    content = File.binread('text/plain.txt').bytes
    expect do
      Kreuzberg.extract_bytes_sync(content, '', Kreuzberg::ExtractionConfig.new)
    end.to raise_error(RuntimeError)
  end

  it 'error_extract_bytes_conflicting_ocr: extract_bytes force+disable OCR' do
    content = File.binread('text/fake_text.txt').bytes
    # Both OCR flags are set at once; the call is expected to reject the combination.
    conflicting = Kreuzberg::ExtractionConfig.new(disable_ocr: true, force_ocr: true)
    expect do
      Kreuzberg.extract_bytes_sync(content, 'text/plain', conflicting)
    end.to raise_error(RuntimeError)
  end

  it 'error_invalid_mime_format: Error when extracting with invalid MIME type format' do
    content = File.binread('text/plain.txt').bytes
    expect do
      Kreuzberg.extract_bytes_sync(content, 'not-a-mime', Kreuzberg::ExtractionConfig.new)
    end.to raise_error(RuntimeError)
  end

  it 'error_unsupported_mime: Error when extracting with unsupported MIME type' do
    content = File.binread('text/plain.txt').bytes
    expect do
      Kreuzberg.extract_bytes_sync(content, 'application/x-nonexistent', Kreuzberg::ExtractionConfig.new)
    end.to raise_error(RuntimeError)
  end
end

47
e2e/ruby/spec/format_specific_spec.rb generated Normal file
View File

@@ -0,0 +1,47 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
# frozen_string_literal: true
require 'kreuzberg'
require 'json'
require 'spec_helper'
# End-to-end examples for individual document formats (DOCX, HWPX, PDF, PPTX, XLSX).
RSpec.describe 'format_specific' do
  # Binary fixtures are loaded with File.binread so the bytes handed to the
  # extractor are exactly the bytes on disk (text-mode File.read translates line
  # endings on Windows).
  it 'format_docx_standalone: Standalone DOCX extraction using extract_bytes_sync' do
    content = File.binread('docx/fake.docx').bytes
    result = Kreuzberg.extract_bytes_sync(content, 'application/vnd.openxmlformats-officedocument.wordprocessingml.document')
    expect(result.content.length).to be >= 20
  end

  it 'format_hwpx_standalone: Standalone HWPX extraction using extract_bytes_sync' do
    content = File.binread('hwpx/simple.hwpx').bytes
    result = Kreuzberg.extract_bytes_sync(content, 'application/haansofthwpx')
    expect(result.content.length).to be >= 20
    expect(result.content.to_s).to include('Hello from HWPX')
  end

  it 'format_pdf_text: Standalone PDF text extraction using extract_bytes_sync' do
    content = File.binread('pdf/fake_memo.pdf').bytes
    result = Kreuzberg.extract_bytes_sync(content, 'application/pdf')
    expect(result.content.length).to be >= 50
    # At least one of the expected words has to appear in the extracted text.
    expect(['Mallori', 'May'].any? { |v| result.content.to_s.include?(v) }).to be(true)
  end

  it 'format_pptx: PPTX presentation extraction using extract_file_sync' do
    result = Kreuzberg.extract_file_sync('pptx/simple.pptx', 'application/vnd.openxmlformats-officedocument.presentationml.presentation')
    expect(result).not_to be_nil
  end

  it 'format_xlsx: XLSX spreadsheet extraction using extract_file_sync' do
    result = Kreuzberg.extract_file_sync('xlsx/stanley_cups.xlsx', 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet')
    expect(result).not_to be_nil
  end
end

32
e2e/ruby/spec/mime_utilities_spec.rb generated Normal file
View File

@@ -0,0 +1,32 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
# frozen_string_literal: true
require 'kreuzberg'
require 'json'
require 'spec_helper'
# End-to-end examples for the MIME helper calls: detection from bytes and
# extension lookup.
RSpec.describe 'mime_utilities' do
  # Binary fixtures are loaded with File.binread so the detector sees exactly the
  # bytes on disk (text-mode File.read translates line endings on Windows).
  it 'mime_detect_bytes: Detect MIME type from file bytes' do
    content = File.binread('pdf/fake_memo.pdf').bytes
    result = Kreuzberg.detect_mime_type_from_bytes(content)
    expect(result).to include('pdf')
  end

  it 'mime_detect_image: Detect MIME type from PNG image bytes' do
    content = File.binread('images/test_hello_world.png').bytes
    result = Kreuzberg.detect_mime_type_from_bytes(content)
    expect(result).to include('png')
  end

  it 'mime_get_extensions: Get file extensions for a MIME type' do
    result = Kreuzberg.get_extensions_for_mime('application/pdf')
    expect(result).to include('pdf')
  end
end

View File

@@ -0,0 +1,28 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
# frozen_string_literal: true
require 'kreuzberg'
require 'json'
require 'spec_helper'
# End-to-end examples for clearing, listing and unregistering OCR backends.
RSpec.describe 'ocr_backend_management' do
  it 'ocr_backends_clear: Clear all OCR backends and verify list is empty' do
    # The example used to contain no expectation at all; "does not raise" is now
    # stated explicitly so the example asserts something.
    # NOTE(review): the title promises the list is verified empty, but no such
    # check is generated — confirm whether one should be added.
    expect { Kreuzberg.clear_ocr_backends }.not_to raise_error
  end

  it 'ocr_backends_list: List all registered OCR backends' do
    result = Kreuzberg.list_ocr_backends
    expect(result).not_to be_nil
  end

  it 'ocr_backends_unregister: Unregister nonexistent OCR backend gracefully' do
    # "Gracefully" is spelled out as an explicit no-exception expectation.
    expect { Kreuzberg.unregister_ocr_backend('nonexistent-backend-xyz') }.not_to raise_error
  end
end

26
e2e/ruby/spec/pdf_spec.rb generated Normal file
View File

@@ -0,0 +1,26 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
# frozen_string_literal: true
require 'kreuzberg'
require 'json'
require 'spec_helper'
# End-to-end examples for rendering a single PDF page to PNG.
RSpec.describe 'pdf' do
  # The PDF is loaded with File.binread so the renderer receives exactly the
  # bytes on disk (text-mode File.read translates line endings on Windows).
  it 'render_pdf_page_first: render_pdf_page_to_png: first page' do
    pdf_bytes = File.binread('pdf/fake_memo.pdf').bytes
    result = Kreuzberg.render_pdf_page_to_png(pdf_bytes, 0)
    expect(result.length).to be >= 100
  end

  it 'render_pdf_page_out_of_range: render_pdf_page_to_png: page out of range' do
    # Read outside the expectation so raise_error only covers the render call.
    pdf_bytes = File.binread('pdf/fake_memo.pdf').bytes
    expect do
      Kreuzberg.render_pdf_page_to_png(pdf_bytes, 999)
    end.to raise_error(RuntimeError)
  end
end

144
e2e/ruby/spec/plugin_api_spec.rb generated Normal file
View File

@@ -0,0 +1,144 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
# frozen_string_literal: true
require 'kreuzberg'
require 'json'
require 'spec_helper'
# Registers a minimal stub object through each register_* call of the Kreuzberg plugin
# API and unregisters it again under the same name. The trailing examples call the
# unregister_* functions on their own, without a preceding registration in that example.
RSpec.describe 'plugin_api' do
  it 'register_document_extractor_trait_bridge: register_document_extractor: trait bridge' do
    extractor_class = Class.new do
      def initialize; end

      def name
        'test-extractor'
      end

      def version
        '1.0.0'
      end

      def shutdown
        nil
      end

      def extract_bytes(content, mime_type, config)
        '{}'
      end

      def supported_mime_types
        []
      end
    end
    Kreuzberg.register_document_extractor(extractor_class.new, 'test-extractor')
    Kreuzberg.unregister_document_extractor('test-extractor')
  end

  it 'register_embedding_backend_trait_bridge: register_embedding_backend: trait bridge' do
    embedding_class = Class.new do
      def initialize; end

      def name
        'test-embedding-backend'
      end

      def version
        '1.0.0'
      end

      def shutdown
        nil
      end

      def dimensions
        1
      end

      def embed(texts)
        []
      end
    end
    Kreuzberg.register_embedding_backend(embedding_class.new, 'test-embedding-backend')
    Kreuzberg.unregister_embedding_backend('test-embedding-backend')
  end

  it 'register_ocr_backend_trait_bridge: register_ocr_backend: trait bridge' do
    ocr_class = Class.new do
      def initialize; end

      def name
        'test-backend'
      end

      def version
        '1.0.0'
      end

      def shutdown
        nil
      end

      def process_image(image_bytes, config)
        '{}'
      end

      def supports_language(lang)
        false
      end

      def backend_type
        '{}'
      end
    end
    Kreuzberg.register_ocr_backend(ocr_class.new, 'test-backend')
    Kreuzberg.unregister_ocr_backend('test-backend')
  end

  it 'register_post_processor_trait_bridge: register_post_processor: trait bridge' do
    processor_class = Class.new do
      def initialize; end

      def name
        'test-processor'
      end

      def version
        '1.0.0'
      end

      def shutdown
        nil
      end

      def process(result, config)
        nil
      end

      def processing_stage
        '{}'
      end
    end
    Kreuzberg.register_post_processor(processor_class.new, 'test-processor')
    Kreuzberg.unregister_post_processor('test-processor')
  end

  it 'register_renderer_trait_bridge: register_renderer: trait bridge' do
    renderer_class = Class.new do
      def initialize; end

      def name
        'test-renderer'
      end

      def version
        '1.0.0'
      end

      def shutdown
        nil
      end

      def render(doc)
        ''
      end
    end
    Kreuzberg.register_renderer(renderer_class.new, 'test-renderer')
    Kreuzberg.unregister_renderer('test-renderer')
  end

  it 'register_validator_trait_bridge: register_validator: trait bridge' do
    validator_class = Class.new do
      def initialize; end

      def name
        'test-validator'
      end

      def version
        '1.0.0'
      end

      def shutdown
        nil
      end

      def validate(result, config)
        nil
      end
    end
    Kreuzberg.register_validator(validator_class.new, 'test-validator')
    Kreuzberg.unregister_validator('test-validator')
  end

  it 'unregister_document_extractor_after_register: unregister_document_extractor' do
    extractor_name = 'test-extractor'
    Kreuzberg.unregister_document_extractor(extractor_name)
  end

  it 'unregister_embedding_backend_after_register: unregister_embedding_backend' do
    backend_name = 'test-embedding-backend'
    Kreuzberg.unregister_embedding_backend(backend_name)
  end

  it 'unregister_post_processor_after_register: unregister_post_processor' do
    processor_name = 'test-processor'
    Kreuzberg.unregister_post_processor(processor_name)
  end

  it 'unregister_renderer_after_register: unregister_renderer' do
    renderer_name = 'test-renderer'
    Kreuzberg.unregister_renderer(renderer_name)
  end

  it 'unregister_validator_after_register: unregister_validator' do
    validator_name = 'test-validator'
    Kreuzberg.unregister_validator(validator_name)
  end
end

View File

@@ -0,0 +1,23 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
# frozen_string_literal: true
require 'kreuzberg'
require 'json'
require 'spec_helper'
# Exercises the post-processor registry management calls: clear and list.
RSpec.describe 'post_processor_management' do
  it 'post_processors_clear: Clear all post-processors and verify list is empty' do
    Kreuzberg.clear_post_processors
    # The example title promises verification, so assert the registry is empty afterwards.
    expect(Kreuzberg.list_post_processors).to be_empty
  end

  it 'post_processors_list: List all registered post-processors' do
    expect(Kreuzberg.list_post_processors).not_to be_nil
  end
end

View File

@@ -0,0 +1,30 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
# frozen_string_literal: true
require 'kreuzberg'
require 'json'
require 'spec_helper'
# Looks up file extensions for three MIME types and checks each lookup returns a non-nil value.
RSpec.describe 'registry_operations' do
  it 'extensions_docx: Get file extensions for DOCX MIME type' do
    docx_mime = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
    expect(Kreuzberg.get_extensions_for_mime(docx_mime)).not_to be_nil
  end

  it 'extensions_html: Get file extensions for HTML MIME type' do
    html_mime = 'text/html'
    expect(Kreuzberg.get_extensions_for_mime(html_mime)).not_to be_nil
  end

  it 'extensions_pdf: Get file extensions for PDF MIME type' do
    pdf_mime = 'application/pdf'
    expect(Kreuzberg.get_extensions_for_mime(pdf_mime)).not_to be_nil
  end
end

48
e2e/ruby/spec/registry_spec.rb generated Normal file
View File

@@ -0,0 +1,48 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
# frozen_string_literal: true
require 'kreuzberg'
require 'json'
require 'spec_helper'
# Calls each list_* registry function and checks that it returns a non-nil value.
RSpec.describe 'registry' do
  it 'list_document_extractors: List document extractors' do
    expect(Kreuzberg.list_document_extractors).not_to be_nil
  end

  it 'list_embedding_backends: List embedding backends' do
    expect(Kreuzberg.list_embedding_backends).not_to be_nil
  end

  it 'list_ocr_backends: List OCR backends' do
    expect(Kreuzberg.list_ocr_backends).not_to be_nil
  end

  it 'list_post_processors: List post-processors' do
    expect(Kreuzberg.list_post_processors).not_to be_nil
  end

  it 'list_renderers: List renderers' do
    expect(Kreuzberg.list_renderers).not_to be_nil
  end

  it 'list_validators: List validators' do
    expect(Kreuzberg.list_validators).not_to be_nil
  end
end

View File

@@ -0,0 +1,23 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
# frozen_string_literal: true
require 'kreuzberg'
require 'json'
require 'spec_helper'
# Exercises the renderer registry management calls: clear and list.
RSpec.describe 'renderer_management' do
  it 'renderers_clear: Clear all renderers and verify list is empty' do
    Kreuzberg.clear_renderers
    # The example title promises verification, so assert the registry is empty afterwards.
    expect(Kreuzberg.list_renderers).to be_empty
  end

  it 'renderers_list: List all registered renderers' do
    expect(Kreuzberg.list_renderers).not_to be_nil
  end
end

90
e2e/ruby/spec/smoke_spec.rb generated Normal file
View File

@@ -0,0 +1,90 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
# frozen_string_literal: true
require 'kreuzberg'
require 'json'
require 'spec_helper'
RSpec.describe 'smoke' do
# Collects the textual representations of +item+ used for matching in assertions.
#
# Strings, Symbols and Numerics yield a one-element array holding their #to_s.
# Any other object yields the stringified value of each of kind, name, signature,
# path, alias, text and source that it responds to (in that order), followed by
# the object's own #to_s.
def alef_e2e_item_texts(item)
  is_scalar = [String, Symbol, Numeric].any? { |klass| item.is_a?(klass) }
  return [item.to_s] if is_scalar

  collected = %i[kind name signature path alias text source].each_with_object([]) do |accessor, acc|
    acc << item.send(accessor).to_s if item.respond_to?(accessor)
  end
  collected << item.to_s
end
it 'ocr_image_png: OCR: PNG image extraction with OCR enabled. In WASM this exercises the Uint8Array bridge parameter and Promise await in the generated OcrBackend bridge.' do
  # Read the fixture in binary mode: text-mode File.read can alter binary data
  # on platforms that translate line endings.
  content = File.binread('images/test_hello_world.png').bytes
  result = Kreuzberg.extract_bytes_async(content, 'image/png', Kreuzberg::ExtractionConfig.new)
  expect(result.mime_type.to_s.strip).to eq('image/png')
  expect(result.content.length).to be >= 1
  # Any one of the expected words in the extracted text is sufficient.
  expect(%w[Hello World hello world].any? { |v| result.content.to_s.include?(v) }).to be(true)
end
it 'smoke_docx_basic: Smoke test: DOCX with formatted text' do
  docx_mime = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
  config = Kreuzberg::ExtractionConfig.new
  result = Kreuzberg.extract_file_async('docx/fake.docx', docx_mime, config)
  expect(result.mime_type.to_s.strip).to eq(docx_mime)
  expect(result.content.length).to be >= 20
  # At least one of the expected words must appear in the extracted text.
  found = %w[Lorem ipsum document text].any? { |word| result.content.to_s.include?(word) }
  expect(found).to be(true)
end
it 'smoke_html_basic: Smoke test: HTML table extraction' do
  config = Kreuzberg::ExtractionConfig.new
  result = Kreuzberg.extract_file_async('html/simple_table.html', 'text/html', config)
  expect(result.mime_type.to_s.strip).to eq('text/html')
  expect(result.content.length).to be >= 10
  # At least one of the expected phrases must appear in the extracted text.
  phrases = ['Sample Data Table', 'Laptop', 'Electronics', 'Product']
  found = phrases.any? { |phrase| result.content.to_s.include?(phrase) }
  expect(found).to be(true)
end
it 'smoke_image_png: Smoke test: PNG image (without OCR, metadata only)' do
  # No MIME type is passed (nil) and OCR is disabled through the config.
  config = Kreuzberg::ExtractionConfig.new(disable_ocr: true)
  result = Kreuzberg.extract_file_async('images/sample.png', nil, config)
  expect(result.mime_type.to_s.strip).to eq('image/png')
end
it 'smoke_json_basic: Smoke test: JSON file extraction' do
  config = Kreuzberg::ExtractionConfig.new
  result = Kreuzberg.extract_file_async('json/simple.json', 'application/json', config)
  expect(result.mime_type.to_s.strip).to eq('application/json')
  expect(result.content.length).to be >= 5
end
it 'smoke_pdf_basic: Smoke test: PDF with simple text extraction' do
  config = Kreuzberg::ExtractionConfig.new
  result = Kreuzberg.extract_file_async('pdf/fake_memo.pdf', 'application/pdf', config)
  expect(result.mime_type.to_s.strip).to eq('application/pdf')
  expect(result.content.length).to be >= 50
  # At least one of the expected phrases must appear in the extracted text.
  phrases = ['May 5, 2023', 'To Whom it May Concern']
  found = phrases.any? { |phrase| result.content.to_s.include?(phrase) }
  expect(found).to be(true)
end
it 'smoke_txt_basic: Smoke test: Plain text file' do
  config = Kreuzberg::ExtractionConfig.new
  result = Kreuzberg.extract_file_async('text/report.txt', 'text/plain', config)
  expect(result.mime_type.to_s.strip).to eq('text/plain')
  expect(result.content.length).to be >= 5
end
it 'smoke_xlsx_basic: Smoke test: XLSX with basic spreadsheet data including tables' do
  xlsx_mime = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
  config = Kreuzberg::ExtractionConfig.new
  result = Kreuzberg.extract_file_async('xlsx/stanley_cups.xlsx', xlsx_mime, config)
  expect(result.mime_type.to_s.strip).to eq(xlsx_mime)
  expect(result.content.length).to be >= 100
  # Every one of these cell values must appear in the extracted text.
  ['Team', 'Location', 'Stanley Cups', 'Blues', 'Flyers', 'Maple Leafs', 'STL', 'PHI', 'TOR'].each do |cell_text|
    expect(result.content.to_s).to include(cell_text)
  end
  # skipped: field 'tables' not available on result type
  # skipped: enum variant accessor 'metadata.format.excel.sheet_count' not available on Ruby (serialized to Hash)
  # skipped: enum variant accessor 'metadata.format.excel.sheet_names' not available on Ruby (serialized to Hash)
end
end

73
e2e/ruby/spec/spec_helper.rb generated Normal file
View File

@@ -0,0 +1,73 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
# frozen_string_literal: true
# RSpec hooks to isolate plugin registry state between tests.
# Unregisters any test-prefixed backends (test-*, test_*) after each test
# to prevent pollution from one test affecting subsequent tests.
begin
  require 'kreuzberg'

  # Maps each registry's listing function to the function that removes one entry by name.
  alef_registry_api = {
    list_ocr_backends: :unregister_ocr_backend,
    list_embedding_backends: :unregister_embedding_backend,
    list_document_extractors: :unregister_document_extractor,
    list_renderers: :unregister_renderer,
    list_validators: :unregister_validator,
    list_post_processors: :unregister_post_processor
  }.freeze

  # Returns the current names of one registry as a Set, or an empty Set when the
  # listing call fails (for example because the function is not available).
  alef_registry_names = lambda do |lister|
    Kreuzberg.public_send(lister).to_set
  rescue StandardError
    Set.new
  end

  RSpec.configure do |config|
    # Snapshot every registry before each example so the after hook can tell which
    # entries the example added.
    config.before(:each) do
      @_initial_registries = alef_registry_api.keys.to_h do |lister|
        [lister, alef_registry_names.call(lister)]
      end
    end

    # Unregister entries that appeared during the example and carry a test prefix
    # (test-*, test_*). Each registry is handled independently, so a failure in one
    # does not skip cleanup of the others.
    config.after(:each) do
      initial = @_initial_registries || {}
      alef_registry_api.each do |lister, unregister|
        added = alef_registry_names.call(lister) - initial.fetch(lister, Set.new)
        added.each do |name|
          Kreuzberg.public_send(unregister, name) if name.to_s.start_with?('test-', 'test_')
        end
      rescue StandardError
        # Cleanup failures are non-fatal; continue with the next registry.
      end
    end
  end
rescue LoadError
  # Kreuzberg not available; skip registry cleanup
end
# Fixture paths in the specs (for example "pdf/fake_memo.pdf") are relative, so switch
# the working directory to the test_documents directory three levels above this file's
# directory (e2e/ruby/spec/). The chdir is skipped when that directory does not exist.
_test_documents = File.expand_path(File.join('..', '..', '..', 'test_documents'), __dir__)
Dir.chdir(_test_documents) if File.directory?(_test_documents)

View File

@@ -0,0 +1,23 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
# frozen_string_literal: true
require 'kreuzberg'
require 'json'
require 'spec_helper'
# Exercises the validator registry management calls: clear and list.
RSpec.describe 'validator_management' do
  it 'validators_clear: Clear all validators and verify list is empty' do
    Kreuzberg.clear_validators
    # The example title promises verification, so assert the registry is empty afterwards.
    expect(Kreuzberg.list_validators).to be_empty
  end

  it 'validators_list: List all registered validators' do
    expect(Kreuzberg.list_validators).not_to be_nil
  end
end