This commit is contained in:
36
e2e/ruby/.rubocop.yaml
generated
Normal file
36
e2e/ruby/.rubocop.yaml
generated
Normal file
@@ -0,0 +1,36 @@
|
||||
# Generated by alef e2e — do not edit.
|
||||
AllCops:
|
||||
NewCops: enable
|
||||
TargetRubyVersion: 3.2
|
||||
SuggestExtensions: false
|
||||
|
||||
plugins:
|
||||
- rubocop-rspec
|
||||
|
||||
# --- Justified suppressions for generated test code ---
|
||||
|
||||
# Generated tests are verbose by nature (setup + multiple assertions).
|
||||
Metrics/BlockLength:
|
||||
Enabled: false
|
||||
Metrics/MethodLength:
|
||||
Enabled: false
|
||||
Layout/LineLength:
|
||||
Enabled: false
|
||||
|
||||
# Generated tests use multiple assertions per example for thorough verification.
|
||||
RSpec/MultipleExpectations:
|
||||
Enabled: false
|
||||
RSpec/ExampleLength:
|
||||
Enabled: false
|
||||
|
||||
# Generated tests describe categories as strings, not classes.
|
||||
RSpec/DescribeClass:
|
||||
Enabled: false
|
||||
|
||||
# Fixture-driven tests may produce identical assertion bodies for different inputs.
|
||||
RSpec/RepeatedExample:
|
||||
Enabled: false
|
||||
|
||||
# Error-handling tests use bare raise_error (exception type not known at generation time).
|
||||
RSpec/UnspecifiedException:
|
||||
Enabled: false
|
||||
9
e2e/ruby/Gemfile
generated
Normal file
9
e2e/ruby/Gemfile
generated
Normal file
@@ -0,0 +1,9 @@
|
||||
# frozen_string_literal: true

# Gem dependencies for the generated Ruby e2e suite.
source 'https://rubygems.org'

# The binding under test is resolved from a local path, not from RubyGems.
gem 'kreuzberg', path: '../../packages/ruby'

# Remaining dependencies, listed alphabetically.
gem 'faraday', '~> 2.0'
gem 'rspec', '~> 3.13'
gem 'rubocop', '~> 1.86'
gem 'rubocop-rspec', '~> 3.9'
|
||||
119
e2e/ruby/Gemfile.lock
generated
Normal file
119
e2e/ruby/Gemfile.lock
generated
Normal file
@@ -0,0 +1,119 @@
|
||||
PATH
|
||||
remote: ../../packages/ruby
|
||||
specs:
|
||||
kreuzberg (5.0.0.pre.rc.3)
|
||||
rb_sys (~> 0.9)
|
||||
sorbet-runtime (~> 0.5)
|
||||
|
||||
GEM
|
||||
remote: https://rubygems.org/
|
||||
specs:
|
||||
ast (2.4.3)
|
||||
diff-lcs (1.6.2)
|
||||
faraday (2.14.2)
|
||||
faraday-net_http (>= 2.0, < 3.5)
|
||||
json
|
||||
logger
|
||||
faraday-net_http (3.4.2)
|
||||
net-http (~> 0.5)
|
||||
json (2.19.5)
|
||||
language_server-protocol (3.17.0.5)
|
||||
lint_roller (1.1.0)
|
||||
logger (1.7.0)
|
||||
net-http (0.9.1)
|
||||
uri (>= 0.11.1)
|
||||
parallel (2.1.0)
|
||||
parser (3.3.11.1)
|
||||
ast (~> 2.4.1)
|
||||
racc
|
||||
prism (1.9.0)
|
||||
racc (1.8.1)
|
||||
rainbow (3.1.1)
|
||||
rake-compiler-dock (1.12.0)
|
||||
rb_sys (0.9.128)
|
||||
rake-compiler-dock (= 1.12.0)
|
||||
regexp_parser (2.12.0)
|
||||
rspec (3.13.2)
|
||||
rspec-core (~> 3.13.0)
|
||||
rspec-expectations (~> 3.13.0)
|
||||
rspec-mocks (~> 3.13.0)
|
||||
rspec-core (3.13.6)
|
||||
rspec-support (~> 3.13.0)
|
||||
rspec-expectations (3.13.5)
|
||||
diff-lcs (>= 1.2.0, < 2.0)
|
||||
rspec-support (~> 3.13.0)
|
||||
rspec-mocks (3.13.8)
|
||||
diff-lcs (>= 1.2.0, < 2.0)
|
||||
rspec-support (~> 3.13.0)
|
||||
rspec-support (3.13.7)
|
||||
rubocop (1.86.1)
|
||||
json (~> 2.3)
|
||||
language_server-protocol (~> 3.17.0.2)
|
||||
lint_roller (~> 1.1.0)
|
||||
parallel (>= 1.10)
|
||||
parser (>= 3.3.0.2)
|
||||
rainbow (>= 2.2.2, < 4.0)
|
||||
regexp_parser (>= 2.9.3, < 3.0)
|
||||
rubocop-ast (>= 1.49.0, < 2.0)
|
||||
ruby-progressbar (~> 1.7)
|
||||
unicode-display_width (>= 2.4.0, < 4.0)
|
||||
rubocop-ast (1.49.1)
|
||||
parser (>= 3.3.7.2)
|
||||
prism (~> 1.7)
|
||||
rubocop-rspec (3.9.0)
|
||||
lint_roller (~> 1.1)
|
||||
rubocop (~> 1.81)
|
||||
ruby-progressbar (1.13.0)
|
||||
sorbet-runtime (0.6.13224)
|
||||
unicode-display_width (3.2.0)
|
||||
unicode-emoji (~> 4.1)
|
||||
unicode-emoji (4.2.0)
|
||||
uri (1.1.1)
|
||||
|
||||
PLATFORMS
|
||||
arm64-darwin-25
|
||||
ruby
|
||||
|
||||
DEPENDENCIES
|
||||
faraday (~> 2.0)
|
||||
kreuzberg!
|
||||
rspec (~> 3.13)
|
||||
rubocop (~> 1.86)
|
||||
rubocop-rspec (~> 3.9)
|
||||
|
||||
CHECKSUMS
|
||||
ast (2.4.3) sha256=954615157c1d6a382bc27d690d973195e79db7f55e9765ac7c481c60bdb4d383
|
||||
bundler (4.0.11) sha256=5bcec0fb78302e48d02ee46f10ee6e6942be647ba5b44a6d1ddfda9a240ce785
|
||||
diff-lcs (1.6.2) sha256=9ae0d2cba7d4df3075fe8cd8602a8604993efc0dfa934cff568969efb1909962
|
||||
faraday (2.14.2) sha256=73ccb9994a9e8648f010e32eca2ae82e41c57860aa10932cda29418b9e0223ad
|
||||
faraday-net_http (3.4.2) sha256=f147758260d3526939bf57ecf911682f94926a3666502e24c69992765875906c
|
||||
json (2.19.5) sha256=218a18553e4801d579ca7e0f5bc72bafd776d7397238a1fb4e74db5b0a812c59
|
||||
kreuzberg (5.0.0.pre.rc.3)
|
||||
language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
|
||||
lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
|
||||
logger (1.7.0) sha256=196edec7cc44b66cfb40f9755ce11b392f21f7967696af15d274dde7edff0203
|
||||
net-http (0.9.1) sha256=25ba0b67c63e89df626ed8fac771d0ad24ad151a858af2cc8e6a716ca4336996
|
||||
parallel (2.1.0) sha256=b35258865c2e31134c5ecb708beaaf6772adf9d5efae28e93e99260877b09356
|
||||
parser (3.3.11.1) sha256=d17ace7aabe3e72c3cc94043714be27cc6f852f104d81aa284c2281aecc65d54
|
||||
prism (1.9.0) sha256=7b530c6a9f92c24300014919c9dcbc055bf4cdf51ec30aed099b06cd6674ef85
|
||||
racc (1.8.1) sha256=4a7f6929691dbec8b5209a0b373bc2614882b55fc5d2e447a21aaa691303d62f
|
||||
rainbow (3.1.1) sha256=039491aa3a89f42efa1d6dec2fc4e62ede96eb6acd95e52f1ad581182b79bc6a
|
||||
rake-compiler-dock (1.12.0) sha256=f13205c2738f3d2053afcd03491a9e4541b22a59a0bfc53fc8bc883bd8188023
|
||||
rb_sys (0.9.128) sha256=9ab81f4d6d4e1895de18762232362d1264475aa7035756b50441e442130538fd
|
||||
regexp_parser (2.12.0) sha256=35a916a1d63190ab5c9009457136ae5f3c0c7512d60291d0d1378ba18ce08ebb
|
||||
rspec (3.13.2) sha256=206284a08ad798e61f86d7ca3e376718d52c0bc944626b2349266f239f820587
|
||||
rspec-core (3.13.6) sha256=a8823c6411667b60a8bca135364351dda34cd55e44ff94c4be4633b37d828b2d
|
||||
rspec-expectations (3.13.5) sha256=33a4d3a1d95060aea4c94e9f237030a8f9eae5615e9bd85718fe3a09e4b58836
|
||||
rspec-mocks (3.13.8) sha256=086ad3d3d17533f4237643de0b5c42f04b66348c28bf6b9c2d3f4a3b01af1d47
|
||||
rspec-support (3.13.7) sha256=0640e5570872aafefd79867901deeeeb40b0c9875a36b983d85f54fb7381c47c
|
||||
rubocop (1.86.1) sha256=44415f3f01d01a21e01132248d2fd0867572475b566ca188a0a42133a08d4531
|
||||
rubocop-ast (1.49.1) sha256=4412f3ee70f6fe4546cc489548e0f6fcf76cafcfa80fa03af67098ffed755035
|
||||
rubocop-rspec (3.9.0) sha256=8fa70a3619408237d789aeecfb9beef40576acc855173e60939d63332fdb55e2
|
||||
ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33
|
||||
sorbet-runtime (0.6.13224) sha256=25214f47e16bf13028519e04209ac2b1a59333657bdf01f816c1b4bcbec7b56c
|
||||
unicode-display_width (3.2.0) sha256=0cdd96b5681a5949cdbc2c55e7b420facae74c4aaf9a9815eee1087cb1853c42
|
||||
unicode-emoji (4.2.0) sha256=519e69150f75652e40bf736106cfbc8f0f73aa3fb6a65afe62fefa7f80b0f80f
|
||||
uri (1.1.1) sha256=379fa58d27ffb1387eaada68c749d1426738bd0f654d812fcc07e7568f5c57c6
|
||||
|
||||
BUNDLED WITH
|
||||
4.0.11
|
||||
34
e2e/ruby/spec/async_spec.rb
generated
Normal file
34
e2e/ruby/spec/async_spec.rb
generated
Normal file
@@ -0,0 +1,34 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
# frozen_string_literal: true
|
||||
|
||||
require 'kreuzberg'
|
||||
require 'json'
|
||||
require 'spec_helper'
|
||||
|
||||
RSpec.describe 'async' do
  # Fixture paths are relative, so they resolve against the process working directory.

  it 'async_extract_bytes: Async extract_bytes call on PDF document' do
    # binread: the PDF is binary, so read it without text-mode newline/encoding handling.
    content = File.binread('pdf/fake_memo.pdf').bytes
    result = Kreuzberg.extract_bytes_async(content, 'application/pdf')

    expect(result.mime_type.to_s.strip).to eq('application/pdf')
    expect(result.content.length).to be >= 50
  end

  it 'async_extract_bytes_empty_mime: extract_bytes empty MIME async' do
    # Load the fixture outside the expect block so only the call under test is
    # covered by the error expectation.
    content = File.binread('text/plain.txt').bytes

    expect { Kreuzberg.extract_bytes_async(content, '', Kreuzberg::ExtractionConfig.new) }
      .to raise_error(RuntimeError)
  end

  it 'async_extract_bytes_invalid_mime: extract_bytes unsupported MIME async' do
    content = File.binread('text/plain.txt').bytes

    expect { Kreuzberg.extract_bytes_async(content, 'application/x-nonexistent', Kreuzberg::ExtractionConfig.new) }
      .to raise_error(RuntimeError)
  end
end
|
||||
72
e2e/ruby/spec/batch_spec.rb
generated
Normal file
72
e2e/ruby/spec/batch_spec.rb
generated
Normal file
@@ -0,0 +1,72 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
# frozen_string_literal: true
|
||||
|
||||
require 'kreuzberg'
|
||||
require 'json'
|
||||
require 'spec_helper'
|
||||
|
||||
RSpec.describe 'batch' do
  # Byte payloads are written as ASCII string literals converted with #bytes;
  # each yields the same integer array the generator emitted.

  it 'batch_bytes_invalid_mime: batch_extract_bytes_sync invalid MIME' do
    items = [
      Kreuzberg::BatchBytesItem.new(content: 'Hello'.bytes, mime_type: 'application/x-nonexistent', config: nil)
    ]

    expect(Kreuzberg.batch_extract_bytes_sync(items)).not_to be_nil
  end

  it 'batch_extract_bytes_happy: batch_extract_bytes: happy path with mixed inputs' do
    plain_item = Kreuzberg::BatchBytesItem.new(
      content: 'Hello, world!'.bytes, mime_type: 'text/plain', config: nil
    )
    html_item = Kreuzberg::BatchBytesItem.new(
      content: '<html><body>Test</body></html>'.bytes, mime_type: 'text/html', config: nil
    )

    outcome = Kreuzberg.batch_extract_bytes_async([plain_item, html_item])

    expect(outcome.length).to be >= 1
  end

  it 'batch_extract_bytes_mixed_format: batch_extract_bytes: handles unsupported MIME gracefully' do
    items = [
      Kreuzberg::BatchBytesItem.new(content: 'PDF placeholder'.bytes, mime_type: 'application/x-unknown', config: nil)
    ]

    expect(Kreuzberg.batch_extract_bytes_async(items)).not_to be_nil
  end

  it 'batch_extract_bytes_sync_empty_list: batch_extract_bytes_sync: empty batch' do
    outcome = Kreuzberg.batch_extract_bytes_sync([])

    expect(outcome.length).to eq(0)
  end

  it 'batch_extract_bytes_sync_invalid_mime: batch_extract_bytes_sync: unsupported MIME' do
    items = [
      Kreuzberg::BatchBytesItem.new(content: 'data'.bytes, mime_type: 'application/x-unknown', config: nil)
    ]

    expect(Kreuzberg.batch_extract_bytes_sync(items)).not_to be_nil
  end

  it 'batch_file_async_basic: Extract text from multiple files asynchronously' do
    items = [
      Kreuzberg::BatchFileItem.new(path: 'pdf/fake_memo.pdf', config: nil),
      Kreuzberg::BatchFileItem.new(path: 'text/fake_text.txt', config: nil)
    ]

    expect(Kreuzberg.batch_extract_files_async(items)).not_to be_nil
  end

  it 'batch_file_async_not_found: batch_extract_file async nonexistent' do
    items = [Kreuzberg::BatchFileItem.new(path: '/nonexistent/a.pdf', config: nil)]

    expect(Kreuzberg.batch_extract_files_async(items)).not_to be_nil
  end

  it 'batch_file_not_found: batch_extract_file_sync nonexistent' do
    items = [
      Kreuzberg::BatchFileItem.new(path: '/nonexistent/a.pdf', config: nil),
      Kreuzberg::BatchFileItem.new(path: '/nonexistent/b.txt', config: nil)
    ]

    expect(Kreuzberg.batch_extract_files_sync(items)).not_to be_nil
  end

  it 'batch_file_partial: batch_extract_file_sync mixed' do
    items = [
      Kreuzberg::BatchFileItem.new(path: 'text/plain.txt', config: nil),
      Kreuzberg::BatchFileItem.new(path: '/nonexistent/missing.pdf', config: nil)
    ]

    expect(Kreuzberg.batch_extract_files_sync(items)).not_to be_nil
  end

  it 'batch_file_sync_basic: Extract text from multiple files synchronously' do
    items = [
      Kreuzberg::BatchFileItem.new(path: 'pdf/fake_memo.pdf', config: nil),
      Kreuzberg::BatchFileItem.new(path: 'text/fake_text.txt', config: nil)
    ]

    expect(Kreuzberg.batch_extract_files_sync(items)).not_to be_nil
  end
end
|
||||
21
e2e/ruby/spec/code_spec.rb
generated
Normal file
21
e2e/ruby/spec/code_spec.rb
generated
Normal file
@@ -0,0 +1,21 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
# frozen_string_literal: true
|
||||
|
||||
require 'kreuzberg'
|
||||
require 'json'
|
||||
require 'spec_helper'
|
||||
|
||||
RSpec.describe 'code' do
  # Extracts code/script.sh with an explicit source-code MIME type and checks
  # the reported MIME type, a minimum content length and two expected words.
  it 'code_shebang_detection: Test language detection from shebang line via bytes input' do
    extraction = Kreuzberg.extract_file_sync('code/script.sh', 'text/x-source-code')
    reported_mime = extraction.mime_type.to_s.strip
    text = extraction.content.to_s

    expect(reported_mime).to eq('text/x-source-code')
    expect(extraction.content.length).to be >= 10
    expect(text).to include('build')
    expect(text).to include('clean')
  end
end
|
||||
150
e2e/ruby/spec/contract_spec.rb
generated
Normal file
150
e2e/ruby/spec/contract_spec.rb
generated
Normal file
@@ -0,0 +1,150 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
# frozen_string_literal: true
|
||||
|
||||
require 'kreuzberg'
|
||||
require 'json'
|
||||
require 'spec_helper'
|
||||
|
||||
RSpec.describe 'contract' do
  # NOTE(review): the first four example titles mention the batch APIs, but the
  # calls below go through Kreuzberg.extract_file_async — confirm with the
  # fixture definitions whether that mapping is intended.

  it 'api_batch_bytes_async: Tests async batch bytes extraction API (batch_extract_bytes)' do
    result = Kreuzberg.extract_file_async('pdf/fake_memo.pdf')

    expect(result.mime_type.to_s.strip).to eq('application/pdf')
    expect(result.content.length).to be >= 10
    # Either marker is sufficient; the compound matcher reports the actual text on failure.
    expect(result.content.to_s).to include('May 5, 2023').or include('Mallori')
  end

  it 'api_batch_bytes_with_configs_async: Tests async batch bytes extraction with per-file configs (batch_extract_bytes with file_configs parameter)' do
    config = Kreuzberg::ExtractionConfig.new(output_format: 'markdown')
    result = Kreuzberg.extract_file_async('pdf/fake_memo.pdf', nil, config)

    expect(result.mime_type.to_s.strip).to eq('application/pdf')
    expect(result.content.length).to be >= 10
    # skipped: field 'metadata.output_format' not available on result type
  end

  it 'api_batch_file_async: Tests async batch file extraction API (batch_extract_file)' do
    result = Kreuzberg.extract_file_async('pdf/fake_memo.pdf')

    expect(result.mime_type.to_s.strip).to eq('application/pdf')
    expect(result.content.length).to be >= 10
    expect(result.content.to_s).to include('May 5, 2023').or include('Mallori')
  end

  it 'api_batch_file_with_configs_async: Tests async batch file extraction with per-file configs (batch_extract_files with file_configs parameter)' do
    config = Kreuzberg::ExtractionConfig.new(output_format: 'markdown')
    result = Kreuzberg.extract_file_async('pdf/fake_memo.pdf', nil, config)

    expect(result.mime_type.to_s.strip).to eq('application/pdf')
    expect(result.content.length).to be >= 10
    # skipped: field 'metadata.output_format' not available on result type
  end

  it 'api_extract_bytes_async: Tests async bytes extraction API (extract_bytes)' do
    result = Kreuzberg.extract_file_async('pdf/fake_memo.pdf')

    expect(result.mime_type.to_s.strip).to eq('application/pdf')
    expect(result.content.length).to be >= 10
    expect(result.content.to_s).to include('May 5, 2023').or include('Mallori')
  end

  it 'api_extract_file_async: Tests async file extraction API (extract_file)' do
    result = Kreuzberg.extract_file_async('pdf/fake_memo.pdf')

    expect(result.mime_type.to_s.strip).to eq('application/pdf')
    expect(result.content.length).to be >= 10
    expect(result.content.to_s).to include('May 5, 2023').or include('Mallori')
  end

  it 'config_chunking_prepend_heading_context: Tests markdown chunker prepends heading hierarchy to chunk content' do
    config = Kreuzberg::ExtractionConfig.new(
      chunking: { 'chunker_type' => 'markdown', 'max_chars' => 300, 'max_overlap' => 50, 'prepend_heading_context' => true }
    )
    result = Kreuzberg.extract_file_sync('markdown/extraction_test.md', nil, config)

    expect(result.content.length).to be >= 10
    # Every returned chunk must carry non-empty content; a nil chunk list is treated as empty.
    expect((result.chunks || []).all? { |c| c.content && !c.content.empty? }).to be(true)
    # skipped: synthetic field 'chunks_have_heading_context' not available on Ruby Chunk binding
    # skipped: synthetic field 'first_chunk_starts_with_heading' not available on Ruby Chunk binding
  end

  it 'config_document_structure_with_headings: Tests document structure with DOCX heading-driven nesting' do
    config = Kreuzberg::ExtractionConfig.new(include_document_structure: true)
    result = Kreuzberg.extract_file_sync('docx/fake.docx', nil, config)

    expect(result.mime_type.to_s.strip)
      .to eq('application/vnd.openxmlformats-officedocument.wordprocessingml.document')
    # skipped: field 'document' not available on result type
    # skipped: field 'document.nodes' not available on result type
  end

  it 'config_element_types: Tests element-based result format with element type assertions on DOCX' do
    config = Kreuzberg::ExtractionConfig.new(result_format: 'element_based')
    result = Kreuzberg.extract_file_sync('docx/unit_test_headers.docx', nil, config)

    expect(result.mime_type.to_s)
      .to include('application/vnd.openxmlformats-officedocument.wordprocessingml.document')
    # skipped: field 'elements' not available on result type
  end

  it 'config_extraction_timeout: Tests that extraction_timeout_secs config field is accepted and does not affect fast extractions' do
    config = Kreuzberg::ExtractionConfig.new(extraction_timeout_secs: 300)
    result = Kreuzberg.extract_file_sync('pdf/fake_memo.pdf', nil, config)

    expect(result.mime_type.to_s.strip).to eq('application/pdf')
    expect(result.content.length).to be >= 10
  end

  it 'config_keywords: Tests keyword extraction via YAKE algorithm' do
    config = Kreuzberg::ExtractionConfig.new(keywords: { 'algorithm' => 'yake', 'max_keywords' => 10 })
    result = Kreuzberg.extract_file_sync('pdf/fake_memo.pdf', nil, config)

    expect(result.mime_type.to_s.strip).to eq('application/pdf')
    expect(result.content.length).to be >= 10
    # skipped: field 'keywords' not available on Ruby ExtractionResult
  end

  it 'config_pages: Tests page extraction and page marker configuration' do
    config = Kreuzberg::ExtractionConfig.new(pages: { 'extract_pages' => true, 'insert_page_markers' => true })
    result = Kreuzberg.extract_file_sync('pdf/fake_memo.pdf', nil, config)

    expect(result.mime_type.to_s.strip).to eq('application/pdf')
    expect(result.content.length).to be >= 10
    expect(result.content.to_s).to include('PAGE')
  end

  it 'config_quality_enabled: Tests quality scoring produces a score value in [0.0, 1.0]' do
    config = Kreuzberg::ExtractionConfig.new(enable_quality_processing: true)
    result = Kreuzberg.extract_file_sync('pdf/fake_memo.pdf', nil, config)

    expect(result.mime_type.to_s.strip).to eq('application/pdf')
    expect(result.content.length).to be >= 10
    # skipped: field 'quality_score' not available on result type
  end

  it 'config_security_limits: Tests archive extraction with custom security limits' do
    config = Kreuzberg::ExtractionConfig.new(
      security_limits: { 'max_archive_size' => 104_857_600, 'max_compression_ratio' => 50, 'max_files_in_archive' => 100 }
    )
    result = Kreuzberg.extract_file_sync('archives/documents.zip', nil, config)

    expect(result.mime_type.to_s).to include('application/zip').or include('application/x-zip-compressed')
    expect(result.content.length).to be >= 10
  end

  it 'config_tree_sitter: Tests tree-sitter configuration round-trip' do
    config = Kreuzberg::ExtractionConfig.new(
      tree_sitter: {
        'groups' => ['web'],
        'languages' => ['python', 'rust'],
        'process' => {
          'comments' => false,
          'diagnostics' => false,
          'docstrings' => false,
          'exports' => true,
          'imports' => true,
          'structure' => true,
          'symbols' => false
        }
      }
    )
    result = Kreuzberg.extract_file_sync('code/hello.py', nil, config)

    expect(result.mime_type.to_s.strip).to eq('text/x-source-code')
    expect(result.content.length).to be >= 5
  end

  it 'output_format_bytes_markdown: Tests markdown output format via bytes extraction API' do
    # binread: the PDF is binary, so read it without text-mode newline/encoding handling.
    content = File.binread('pdf/fake_memo.pdf').bytes
    config = Kreuzberg::ExtractionConfig.new(output_format: 'markdown')
    result = Kreuzberg.extract_bytes_sync(content, 'application/pdf', config)

    expect(result.mime_type.to_s.strip).to eq('application/pdf')
    expect(result.content.length).to be >= 10
    # skipped: field 'metadata.output_format' not available on result type
  end

  it 'output_format_markdown: Tests Markdown output format' do
    config = Kreuzberg::ExtractionConfig.new(output_format: 'markdown')
    result = Kreuzberg.extract_file_sync('pdf/fake_memo.pdf', nil, config)

    expect(result.mime_type.to_s.strip).to eq('application/pdf')
    expect(result.content.length).to be >= 10
    # skipped: field 'metadata.output_format' not available on result type
  end
end
|
||||
39
e2e/ruby/spec/detection_spec.rb
generated
Normal file
39
e2e/ruby/spec/detection_spec.rb
generated
Normal file
@@ -0,0 +1,39 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
# frozen_string_literal: true
|
||||
|
||||
require 'kreuzberg'
|
||||
require 'json'
|
||||
require 'spec_helper'
|
||||
|
||||
RSpec.describe 'detection' do
  # Fixtures are loaded with File.binread because MIME sniffing needs the exact
  # on-disk bytes; a text-mode read may alter binary content on some platforms.

  it 'detect_mime_bytes_html: Detect HTML MIME from bytes' do
    content = File.binread('html/html.html').bytes

    expect(Kreuzberg.detect_mime_type_from_bytes(content)).not_to be_nil
  end

  it 'detect_mime_bytes_pdf: Detect PDF MIME type from bytes' do
    content = File.binread('pdf/fake_memo.pdf').bytes

    expect(Kreuzberg.detect_mime_type_from_bytes(content)).not_to be_nil
  end

  it 'detect_mime_bytes_png: Detect PNG MIME type from bytes' do
    content = File.binread('images/test_hello_world.png').bytes

    expect(Kreuzberg.detect_mime_type_from_bytes(content)).not_to be_nil
  end

  it 'get_extensions_unknown_mime: get_extensions unknown MIME' do
    expect { Kreuzberg.get_extensions_for_mime('application/x-totally-unknown') }
      .to raise_error(RuntimeError)
  end
end
|
||||
23
e2e/ruby/spec/document_extractor_management_spec.rb
generated
Normal file
23
e2e/ruby/spec/document_extractor_management_spec.rb
generated
Normal file
@@ -0,0 +1,23 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
# frozen_string_literal: true
|
||||
|
||||
require 'kreuzberg'
|
||||
require 'json'
|
||||
require 'spec_helper'
|
||||
|
||||
RSpec.describe 'document_extractor_management' do
  it 'document_extractors_clear: Clear all document extractors and verify list is empty' do
    # The example previously had no expectation; state the actual contract
    # being checked (the call completes without raising).
    # NOTE(review): the title also promises an "empty list" check, which is
    # not asserted here — confirm the return shape of list_document_extractors
    # before adding it.
    expect { Kreuzberg.clear_document_extractors }.not_to raise_error
  end

  it 'extractors_list: List all registered document extractors' do
    expect(Kreuzberg.list_document_extractors).not_to be_nil
  end
end
|
||||
32
e2e/ruby/spec/embed_async_pending_spec.rb
generated
Normal file
32
e2e/ruby/spec/embed_async_pending_spec.rb
generated
Normal file
@@ -0,0 +1,32 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
# frozen_string_literal: true
|
||||
|
||||
require 'kreuzberg'
|
||||
require 'json'
|
||||
require 'spec_helper'
|
||||
|
||||
RSpec.describe 'embed_async_pending' do
  # Each example calls Kreuzberg.embed_texts_async and inspects the returned collection.

  it 'embed_texts_async_empty_input: embed_texts_async: empty text list' do
    embeddings = Kreuzberg.embed_texts_async([])

    expect(embeddings.length).to eq(0)
    expect(embeddings).not_to be_nil
  end

  it 'embed_texts_async_happy: embed_texts_async: basic async embedding' do
    texts = ['First', 'Second']
    embeddings = Kreuzberg.embed_texts_async(texts)

    expect(embeddings.length).to be >= 2
    expect(embeddings).not_to be_nil
  end

  it 'embed_texts_async_preset_switch: embed_texts_async: preset override' do
    # Passed as a positional hash, exactly as the generator emitted it.
    options = { model: { 'name' => 'balanced', 'type' => 'preset' } }

    expect(Kreuzberg.embed_texts_async(['Text'], options)).not_to be_nil
  end
end
|
||||
18
e2e/ruby/spec/embed_extra_spec.rb
generated
Normal file
18
e2e/ruby/spec/embed_extra_spec.rb
generated
Normal file
@@ -0,0 +1,18 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
# frozen_string_literal: true
|
||||
|
||||
require 'kreuzberg'
|
||||
require 'json'
|
||||
require 'spec_helper'
|
||||
|
||||
RSpec.describe 'embed_extra' do
  # Embeds two short texts with the 'balanced' preset and checks that a
  # non-nil value comes back.
  it 'embed_texts_batch: Batch embed texts' do
    texts = ['Hello', 'World']
    options = { model: { 'name' => 'balanced', 'type' => 'preset' } }

    expect(Kreuzberg.embed_texts(texts, options)).not_to be_nil
  end
end
|
||||
23
e2e/ruby/spec/embedding_backend_management_spec.rb
generated
Normal file
23
e2e/ruby/spec/embedding_backend_management_spec.rb
generated
Normal file
@@ -0,0 +1,23 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
# frozen_string_literal: true
|
||||
|
||||
require 'kreuzberg'
|
||||
require 'json'
|
||||
require 'spec_helper'
|
||||
|
||||
RSpec.describe 'embedding_backend_management' do
  it 'embedding_backends_clear: Clear all embedding backends and verify list is empty' do
    # The example previously had no expectation; state the actual contract
    # being checked (the call completes without raising).
    # NOTE(review): the title also promises an "empty list" check, which is
    # not asserted here — confirm the return shape of list_embedding_backends
    # before adding it.
    expect { Kreuzberg.clear_embedding_backends }.not_to raise_error
  end

  it 'embedding_backends_list: List all registered embedding backends' do
    expect(Kreuzberg.list_embedding_backends).not_to be_nil
  end
end
|
||||
43
e2e/ruby/spec/embeddings_spec.rb
generated
Normal file
43
e2e/ruby/spec/embeddings_spec.rb
generated
Normal file
@@ -0,0 +1,43 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
# frozen_string_literal: true
|
||||
|
||||
require 'kreuzberg'
|
||||
require 'json'
|
||||
require 'spec_helper'
|
||||
|
||||
RSpec.describe 'embeddings' do
  it 'embed_texts_different_preset: embed_texts: multilingual preset' do
    options = { model: { 'name' => 'multilingual', 'type' => 'preset' } }
    embeddings = Kreuzberg.embed_texts(['Hello world', 'Test'], options)

    expect(embeddings.length).to be >= 2
    expect(embeddings).not_to be_nil
  end

  it 'get_embedding_preset_known: get_embedding_preset: known preset' do
    expect(Kreuzberg.get_embedding_preset('balanced')).not_to be_nil
  end

  it 'get_embedding_preset_nominal: get_embedding_preset: nominal case' do
    expect(Kreuzberg.get_embedding_preset('balanced')).not_to be_nil
  end

  it 'get_embedding_preset_unknown: get_embedding_preset: unknown preset fails' do
    preset = Kreuzberg.get_embedding_preset('nonexistent-xyz')

    # An unknown preset is accepted as either nil or an empty value.
    expect(preset.nil? || preset.empty?).to be(true)
  end

  it 'list_embedding_presets_sanity: list_embedding_presets: returns at least one' do
    presets = Kreuzberg.list_embedding_presets

    expect(presets.to_s).not_to be_empty
  end
end
|
||||
47
e2e/ruby/spec/error_spec.rb
generated
Normal file
47
e2e/ruby/spec/error_spec.rb
generated
Normal file
@@ -0,0 +1,47 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
# frozen_string_literal: true
|
||||
|
||||
require 'kreuzberg'
|
||||
require 'json'
|
||||
require 'spec_helper'
|
||||
|
||||
RSpec.describe 'error' do
  # Fixtures are read with File.binread and loaded outside the expect blocks so
  # each error expectation covers only the extraction call (config construction
  # stays inside the block, as in the generated original).

  it 'error_empty_bytes: Graceful handling of empty bytes (should not error)' do
    content = File.binread('text/empty.txt').bytes
    result = Kreuzberg.extract_bytes_sync(content, 'text/plain', Kreuzberg::ExtractionConfig.new)

    expect(result).not_to be_nil
  end

  it 'error_empty_mime: Error when extracting with empty MIME type' do
    content = File.binread('text/plain.txt').bytes

    expect { Kreuzberg.extract_bytes_sync(content, '', Kreuzberg::ExtractionConfig.new) }
      .to raise_error(RuntimeError)
  end

  it 'error_extract_bytes_conflicting_ocr: extract_bytes force+disable OCR' do
    content = File.binread('text/fake_text.txt').bytes

    expect do
      # The conflicting config is built inside the block: SOURCE does not show
      # whether construction or extraction is the step that raises.
      config = Kreuzberg::ExtractionConfig.new(disable_ocr: true, force_ocr: true)
      Kreuzberg.extract_bytes_sync(content, 'text/plain', config)
    end.to raise_error(RuntimeError)
  end

  it 'error_invalid_mime_format: Error when extracting with invalid MIME type format' do
    content = File.binread('text/plain.txt').bytes

    expect { Kreuzberg.extract_bytes_sync(content, 'not-a-mime', Kreuzberg::ExtractionConfig.new) }
      .to raise_error(RuntimeError)
  end

  it 'error_unsupported_mime: Error when extracting with unsupported MIME type' do
    content = File.binread('text/plain.txt').bytes

    expect { Kreuzberg.extract_bytes_sync(content, 'application/x-nonexistent', Kreuzberg::ExtractionConfig.new) }
      .to raise_error(RuntimeError)
  end
end
|
||||
47
e2e/ruby/spec/format_specific_spec.rb
generated
Normal file
47
e2e/ruby/spec/format_specific_spec.rb
generated
Normal file
@@ -0,0 +1,47 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
# frozen_string_literal: true
|
||||
|
||||
require 'kreuzberg'
|
||||
require 'json'
|
||||
require 'spec_helper'
|
||||
|
||||
# Format-specific extraction scenarios (DOCX, HWPX, PDF, PPTX, XLSX).
#
# Byte-based examples load their fixtures with File.binread: these are binary
# container formats, and a text-mode read may translate newline bytes or stop
# early on platforms that distinguish text from binary I/O.
RSpec.describe 'format_specific' do
  it 'format_docx_standalone: Standalone DOCX extraction using extract_bytes_sync' do
    content = File.binread('docx/fake.docx').bytes
    result = Kreuzberg.extract_bytes_sync(
      content,
      'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
    )

    expect(result.content.length).to be >= 20
  end

  it 'format_hwpx_standalone: Standalone HWPX extraction using extract_bytes_sync' do
    content = File.binread('hwpx/simple.hwpx').bytes
    result = Kreuzberg.extract_bytes_sync(content, 'application/haansofthwpx')

    expect(result.content.length).to be >= 20
    expect(result.content.to_s).to include('Hello from HWPX')
  end

  it 'format_pdf_text: Standalone PDF text extraction using extract_bytes_sync' do
    content = File.binread('pdf/fake_memo.pdf').bytes
    result = Kreuzberg.extract_bytes_sync(content, 'application/pdf')

    expect(result.content.length).to be >= 50
    # At least one of the expected fragments must appear in the extracted text.
    expect(%w[Mallori May].any? { |fragment| result.content.to_s.include?(fragment) }).to be(true)
  end

  it 'format_pptx: PPTX presentation extraction using extract_file_sync' do
    result = Kreuzberg.extract_file_sync(
      'pptx/simple.pptx',
      'application/vnd.openxmlformats-officedocument.presentationml.presentation'
    )

    expect(result).not_to be_nil
  end

  it 'format_xlsx: XLSX spreadsheet extraction using extract_file_sync' do
    result = Kreuzberg.extract_file_sync(
      'xlsx/stanley_cups.xlsx',
      'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
    )

    expect(result).not_to be_nil
  end
end
|
||||
32
e2e/ruby/spec/mime_utilities_spec.rb
generated
Normal file
32
e2e/ruby/spec/mime_utilities_spec.rb
generated
Normal file
@@ -0,0 +1,32 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
# frozen_string_literal: true
|
||||
|
||||
require 'kreuzberg'
|
||||
require 'json'
|
||||
require 'spec_helper'
|
||||
|
||||
# MIME detection and MIME-to-extension lookup scenarios.
#
# Detection works on the raw bytes of a file, so the PDF and PNG fixtures are
# loaded with File.binread to guarantee an untranslated byte sequence.
RSpec.describe 'mime_utilities' do
  it 'mime_detect_bytes: Detect MIME type from file bytes' do
    content = File.binread('pdf/fake_memo.pdf').bytes
    result = Kreuzberg.detect_mime_type_from_bytes(content)

    expect(result).to include('pdf')
  end

  it 'mime_detect_image: Detect MIME type from PNG image bytes' do
    content = File.binread('images/test_hello_world.png').bytes
    result = Kreuzberg.detect_mime_type_from_bytes(content)

    expect(result).to include('png')
  end

  it 'mime_get_extensions: Get file extensions for a MIME type' do
    result = Kreuzberg.get_extensions_for_mime('application/pdf')

    expect(result).to include('pdf')
  end
end
|
||||
28
e2e/ruby/spec/ocr_backend_management_spec.rb
generated
Normal file
28
e2e/ruby/spec/ocr_backend_management_spec.rb
generated
Normal file
@@ -0,0 +1,28 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
# frozen_string_literal: true
|
||||
|
||||
require 'kreuzberg'
|
||||
require 'json'
|
||||
require 'spec_helper'
|
||||
|
||||
# OCR backend registry management scenarios.
RSpec.describe 'ocr_backend_management' do
  it 'ocr_backends_clear: Clear all OCR backends and verify list is empty' do
    Kreuzberg.clear_ocr_backends

    # The example promises to verify the registry is empty; without this
    # expectation it only proved that the call does not raise.
    # NOTE(review): assumes list_ocr_backends returns a collection that
    # responds to #empty? — confirm against the binding.
    expect(Kreuzberg.list_ocr_backends).to be_empty
  end

  it 'ocr_backends_list: List all registered OCR backends' do
    result = Kreuzberg.list_ocr_backends

    expect(result).not_to be_nil
  end

  it 'ocr_backends_unregister: Unregister nonexistent OCR backend gracefully' do
    # "Gracefully" here means: removing an unknown name must not raise.
    expect { Kreuzberg.unregister_ocr_backend('nonexistent-backend-xyz') }.not_to raise_error
  end
end
|
||||
26
e2e/ruby/spec/pdf_spec.rb
generated
Normal file
26
e2e/ruby/spec/pdf_spec.rb
generated
Normal file
@@ -0,0 +1,26 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
# frozen_string_literal: true
|
||||
|
||||
require 'kreuzberg'
|
||||
require 'json'
|
||||
require 'spec_helper'
|
||||
|
||||
# PDF page rendering scenarios for Kreuzberg.render_pdf_page_to_png.
#
# The PDF fixture is binary, so it is loaded with File.binread. In the failure
# scenario the fixture is read before the `expect { }` block so that only the
# render call itself is evaluated against `raise_error`.
RSpec.describe 'pdf' do
  it 'render_pdf_page_first: render_pdf_page_to_png: first page' do
    pdf_bytes = File.binread('pdf/fake_memo.pdf').bytes
    result = Kreuzberg.render_pdf_page_to_png(pdf_bytes, 0)

    expect(result.length).to be >= 100
  end

  it 'render_pdf_page_out_of_range: render_pdf_page_to_png: page out of range' do
    pdf_bytes = File.binread('pdf/fake_memo.pdf').bytes

    expect { Kreuzberg.render_pdf_page_to_png(pdf_bytes, 999) }.to raise_error(RuntimeError)
  end
end
|
||||
144
e2e/ruby/spec/plugin_api_spec.rb
generated
Normal file
144
e2e/ruby/spec/plugin_api_spec.rb
generated
Normal file
@@ -0,0 +1,144 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
# frozen_string_literal: true
|
||||
|
||||
require 'kreuzberg'
|
||||
require 'json'
|
||||
require 'spec_helper'
|
||||
|
||||
# Plugin registration scenarios.
#
# Each "trait bridge" example builds a minimal anonymous stub object exposing
# the methods listed in its class body, registers it under a test-prefixed
# name, and unregisters it again. The stub methods return fixed placeholder
# values and ignore their arguments (hence the underscore-prefixed parameters).
RSpec.describe 'plugin_api' do
  it 'register_document_extractor_trait_bridge: register_document_extractor: trait bridge' do
    extractor_class = Class.new do
      def name = 'test-extractor'
      def initialize = nil
      def shutdown = nil
      def version = '1.0.0'
      def extract_bytes(_content, _mime_type, _config) = '{}'
      def supported_mime_types = []
    end

    Kreuzberg.register_document_extractor(extractor_class.new, 'test-extractor')
    Kreuzberg.unregister_document_extractor('test-extractor')
  end

  it 'register_embedding_backend_trait_bridge: register_embedding_backend: trait bridge' do
    embedding_class = Class.new do
      def name = 'test-embedding-backend'
      def initialize = nil
      def shutdown = nil
      def version = '1.0.0'
      def dimensions = 1
      def embed(_texts) = []
    end

    Kreuzberg.register_embedding_backend(embedding_class.new, 'test-embedding-backend')
    Kreuzberg.unregister_embedding_backend('test-embedding-backend')
  end

  it 'register_ocr_backend_trait_bridge: register_ocr_backend: trait bridge' do
    ocr_class = Class.new do
      def name = 'test-backend'
      def initialize = nil
      def shutdown = nil
      def version = '1.0.0'
      def process_image(_image_bytes, _config) = '{}'
      def supports_language(_lang) = false
      def backend_type = '{}'
    end

    Kreuzberg.register_ocr_backend(ocr_class.new, 'test-backend')
    Kreuzberg.unregister_ocr_backend('test-backend')
  end

  it 'register_post_processor_trait_bridge: register_post_processor: trait bridge' do
    processor_class = Class.new do
      def name = 'test-processor'
      def initialize = nil
      def shutdown = nil
      def version = '1.0.0'
      def process(_result, _config) = nil
      def processing_stage = '{}'
    end

    Kreuzberg.register_post_processor(processor_class.new, 'test-processor')
    Kreuzberg.unregister_post_processor('test-processor')
  end

  it 'register_renderer_trait_bridge: register_renderer: trait bridge' do
    renderer_class = Class.new do
      def name = 'test-renderer'
      def initialize = nil
      def shutdown = nil
      def version = '1.0.0'
      def render(_doc) = ''
    end

    Kreuzberg.register_renderer(renderer_class.new, 'test-renderer')
    Kreuzberg.unregister_renderer('test-renderer')
  end

  it 'register_validator_trait_bridge: register_validator: trait bridge' do
    validator_class = Class.new do
      def name = 'test-validator'
      def initialize = nil
      def shutdown = nil
      def version = '1.0.0'
      def validate(_result, _config) = nil
    end

    Kreuzberg.register_validator(validator_class.new, 'test-validator')
    Kreuzberg.unregister_validator('test-validator')
  end

  # The examples below only call the unregister functions; they pass as long
  # as the call does not raise.
  it 'unregister_document_extractor_after_register: unregister_document_extractor' do
    Kreuzberg.unregister_document_extractor('test-extractor')
  end

  it 'unregister_embedding_backend_after_register: unregister_embedding_backend' do
    Kreuzberg.unregister_embedding_backend('test-embedding-backend')
  end

  it 'unregister_post_processor_after_register: unregister_post_processor' do
    Kreuzberg.unregister_post_processor('test-processor')
  end

  it 'unregister_renderer_after_register: unregister_renderer' do
    Kreuzberg.unregister_renderer('test-renderer')
  end

  it 'unregister_validator_after_register: unregister_validator' do
    Kreuzberg.unregister_validator('test-validator')
  end
end
|
||||
23
e2e/ruby/spec/post_processor_management_spec.rb
generated
Normal file
23
e2e/ruby/spec/post_processor_management_spec.rb
generated
Normal file
@@ -0,0 +1,23 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
# frozen_string_literal: true
|
||||
|
||||
require 'kreuzberg'
|
||||
require 'json'
|
||||
require 'spec_helper'
|
||||
|
||||
# Post-processor registry management scenarios.
RSpec.describe 'post_processor_management' do
  it 'post_processors_clear: Clear all post-processors and verify list is empty' do
    Kreuzberg.clear_post_processors

    # The example promises to verify the registry is empty; without this
    # expectation it only proved that the call does not raise.
    # NOTE(review): assumes list_post_processors returns a collection that
    # responds to #empty? — confirm against the binding.
    expect(Kreuzberg.list_post_processors).to be_empty
  end

  it 'post_processors_list: List all registered post-processors' do
    result = Kreuzberg.list_post_processors

    expect(result).not_to be_nil
  end
end
|
||||
30
e2e/ruby/spec/registry_operations_spec.rb
generated
Normal file
30
e2e/ruby/spec/registry_operations_spec.rb
generated
Normal file
@@ -0,0 +1,30 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
# frozen_string_literal: true
|
||||
|
||||
require 'kreuzberg'
|
||||
require 'json'
|
||||
require 'spec_helper'
|
||||
|
||||
# MIME-type-to-extension lookups: each example only checks that the lookup
# returns a non-nil value for a known MIME type.
RSpec.describe 'registry_operations' do
  it 'extensions_docx: Get file extensions for DOCX MIME type' do
    docx_mime = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'

    expect(Kreuzberg.get_extensions_for_mime(docx_mime)).not_to be_nil
  end

  it 'extensions_html: Get file extensions for HTML MIME type' do
    expect(Kreuzberg.get_extensions_for_mime('text/html')).not_to be_nil
  end

  it 'extensions_pdf: Get file extensions for PDF MIME type' do
    expect(Kreuzberg.get_extensions_for_mime('application/pdf')).not_to be_nil
  end
end
|
||||
48
e2e/ruby/spec/registry_spec.rb
generated
Normal file
48
e2e/ruby/spec/registry_spec.rb
generated
Normal file
@@ -0,0 +1,48 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
# frozen_string_literal: true
|
||||
|
||||
require 'kreuzberg'
|
||||
require 'json'
|
||||
require 'spec_helper'
|
||||
|
||||
# Registry listing scenarios: every listing function must return a non-nil value.
RSpec.describe 'registry' do
  it 'list_document_extractors: List document extractors' do
    expect(Kreuzberg.list_document_extractors).not_to be_nil
  end

  it 'list_embedding_backends: List embedding backends' do
    expect(Kreuzberg.list_embedding_backends).not_to be_nil
  end

  it 'list_ocr_backends: List OCR backends' do
    expect(Kreuzberg.list_ocr_backends).not_to be_nil
  end

  it 'list_post_processors: List post-processors' do
    expect(Kreuzberg.list_post_processors).not_to be_nil
  end

  it 'list_renderers: List renderers' do
    expect(Kreuzberg.list_renderers).not_to be_nil
  end

  it 'list_validators: List validators' do
    expect(Kreuzberg.list_validators).not_to be_nil
  end
end
|
||||
23
e2e/ruby/spec/renderer_management_spec.rb
generated
Normal file
23
e2e/ruby/spec/renderer_management_spec.rb
generated
Normal file
@@ -0,0 +1,23 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
# frozen_string_literal: true
|
||||
|
||||
require 'kreuzberg'
|
||||
require 'json'
|
||||
require 'spec_helper'
|
||||
|
||||
# Renderer registry management scenarios.
RSpec.describe 'renderer_management' do
  it 'renderers_clear: Clear all renderers and verify list is empty' do
    Kreuzberg.clear_renderers

    # The example promises to verify the registry is empty; without this
    # expectation it only proved that the call does not raise.
    # NOTE(review): assumes list_renderers returns a collection that responds
    # to #empty? — confirm against the binding.
    expect(Kreuzberg.list_renderers).to be_empty
  end

  it 'renderers_list: List all registered renderers' do
    result = Kreuzberg.list_renderers

    expect(result).not_to be_nil
  end
end
|
||||
90
e2e/ruby/spec/smoke_spec.rb
generated
Normal file
90
e2e/ruby/spec/smoke_spec.rb
generated
Normal file
@@ -0,0 +1,90 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
# frozen_string_literal: true
|
||||
|
||||
require 'kreuzberg'
|
||||
require 'json'
|
||||
require 'spec_helper'
|
||||
|
||||
RSpec.describe 'smoke' do
|
||||
# Collects the string forms of +item+ for text-based comparisons.
#
# Strings, Symbols and Numerics yield a one-element array holding their
# +to_s+. Any other object yields the +to_s+ of each of the attributes
# kind, name, signature, path, alias, text and source that it responds to
# (in that order), followed by the object's own +to_s+.
def alef_e2e_item_texts(item)
  case item
  when String, Symbol, Numeric
    [item.to_s]
  else
    texts = []
    %i[kind name signature path alias text source].each do |attr|
      texts << item.send(attr).to_s if item.respond_to?(attr)
    end
    texts << item.to_s
  end
end
|
||||
it 'ocr_image_png: OCR: PNG image extraction with OCR enabled. In WASM this exercises the Uint8Array bridge parameter and Promise await in the generated OcrBackend bridge.' do
  # PNG is a binary format: read it with File.binread so the extractor
  # receives the exact on-disk bytes on every platform.
  content = File.binread('images/test_hello_world.png').bytes
  result = Kreuzberg.extract_bytes_async(content, 'image/png', Kreuzberg::ExtractionConfig.new)

  expect(result.mime_type.to_s.strip).to eq('image/png')
  expect(result.content.length).to be >= 1
  # At least one of the expected words must appear in the recognised text.
  expect(%w[Hello World hello world].any? { |word| result.content.to_s.include?(word) }).to be(true)
end
|
||||
|
||||
it 'smoke_docx_basic: Smoke test: DOCX with formatted text' do
  docx_mime = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
  result = Kreuzberg.extract_file_async('docx/fake.docx', docx_mime, Kreuzberg::ExtractionConfig.new)

  expect(result.mime_type.to_s.strip).to eq(docx_mime)
  expect(result.content.length).to be >= 20
  # At least one of the expected words must appear in the extracted text.
  expect(%w[Lorem ipsum document text].any? { |word| result.content.to_s.include?(word) }).to be(true)
end
|
||||
|
||||
it 'smoke_html_basic: Smoke test: HTML table extraction' do
  result = Kreuzberg.extract_file_async('html/simple_table.html', 'text/html', Kreuzberg::ExtractionConfig.new)
  expected_fragments = ['Sample Data Table', 'Laptop', 'Electronics', 'Product']

  expect(result.mime_type.to_s.strip).to eq('text/html')
  expect(result.content.length).to be >= 10
  # At least one of the expected fragments must appear in the extracted text.
  expect(expected_fragments.any? { |fragment| result.content.to_s.include?(fragment) }).to be(true)
end
|
||||
|
||||
it 'smoke_image_png: Smoke test: PNG image (without OCR, metadata only)' do
  # No MIME type is supplied (nil) and OCR is disabled; only the reported
  # MIME type of the result is checked.
  config = Kreuzberg::ExtractionConfig.new(disable_ocr: true)
  result = Kreuzberg.extract_file_async('images/sample.png', nil, config)

  expect(result.mime_type.to_s.strip).to eq('image/png')
end
|
||||
|
||||
it 'smoke_json_basic: Smoke test: JSON file extraction' do
  result = Kreuzberg.extract_file_async('json/simple.json', 'application/json', Kreuzberg::ExtractionConfig.new)

  expect(result.mime_type.to_s.strip).to eq('application/json')
  expect(result.content.length).to be >= 5
end
|
||||
|
||||
it 'smoke_pdf_basic: Smoke test: PDF with simple text extraction' do
  result = Kreuzberg.extract_file_async('pdf/fake_memo.pdf', 'application/pdf', Kreuzberg::ExtractionConfig.new)
  expected_fragments = ['May 5, 2023', 'To Whom it May Concern']

  expect(result.mime_type.to_s.strip).to eq('application/pdf')
  expect(result.content.length).to be >= 50
  # At least one of the expected fragments must appear in the extracted text.
  expect(expected_fragments.any? { |fragment| result.content.to_s.include?(fragment) }).to be(true)
end
|
||||
|
||||
it 'smoke_txt_basic: Smoke test: Plain text file' do
  result = Kreuzberg.extract_file_async('text/report.txt', 'text/plain', Kreuzberg::ExtractionConfig.new)

  expect(result.mime_type.to_s.strip).to eq('text/plain')
  expect(result.content.length).to be >= 5
end
|
||||
|
||||
it 'smoke_xlsx_basic: Smoke test: XLSX with basic spreadsheet data including tables' do
  xlsx_mime = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
  result = Kreuzberg.extract_file_async('xlsx/stanley_cups.xlsx', xlsx_mime, Kreuzberg::ExtractionConfig.new)

  expect(result.mime_type.to_s.strip).to eq(xlsx_mime)
  expect(result.content.length).to be >= 100

  # Every one of these fragments must be present in the extracted text.
  required_fragments = ['Team', 'Location', 'Stanley Cups', 'Blues', 'Flyers', 'Maple Leafs', 'STL', 'PHI', 'TOR']
  required_fragments.each do |fragment|
    expect(result.content.to_s).to include(fragment)
  end

  # skipped: field 'tables' not available on result type
  # skipped: enum variant accessor 'metadata.format.excel.sheet_count' not available on Ruby (serialized to Hash)
  # skipped: enum variant accessor 'metadata.format.excel.sheet_names' not available on Ruby (serialized to Hash)
end
|
||||
end
|
||||
73
e2e/ruby/spec/spec_helper.rb
generated
Normal file
73
e2e/ruby/spec/spec_helper.rb
generated
Normal file
@@ -0,0 +1,73 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
# frozen_string_literal: true
|
||||
|
||||
# RSpec hooks to isolate plugin registry state between tests.
|
||||
# Unregisters any test-prefixed backends (test-*, test_*) after each test
|
||||
# to prevent pollution from one test affecting subsequent tests.
|
||||
begin
  require 'kreuzberg'

  # Maps each registry's listing method to the method that removes a single
  # named entry from that registry.
  registries = {
    list_ocr_backends: :unregister_ocr_backend,
    list_embedding_backends: :unregister_embedding_backend,
    list_document_extractors: :unregister_document_extractor,
    list_renderers: :unregister_renderer,
    list_validators: :unregister_validator,
    list_post_processors: :unregister_post_processor
  }.freeze

  # Returns the names currently held by one registry as a Set. A failing
  # listing call is treated as "nothing registered" so that the hooks stay
  # best-effort and never fail an example on their own.
  snapshot = lambda do |list_method|
    Kreuzberg.public_send(list_method).to_set
  rescue StandardError
    Set.new
  end

  RSpec.configure do |config|
    # Track initial registry state before each test
    config.before(:each) do
      @_initial_registries = registries.keys.to_h do |list_method|
        [list_method, snapshot.call(list_method)]
      end
    end

    # Clean up test-prefixed backends after each test
    config.after(:each) do
      initial = @_initial_registries || {}

      registries.each do |list_method, unregister_method|
        # Only entries that appeared during the example are candidates.
        added = snapshot.call(list_method) - initial.fetch(list_method, Set.new)
        added.each do |name|
          Kreuzberg.public_send(unregister_method, name) if name.to_s.start_with?('test-', 'test_')
        end
      rescue StandardError
        # Cleanup failures are non-fatal. Rescuing per registry means a
        # failure in one registry no longer skips cleanup of the others.
      end
    end
  end
rescue LoadError
  # Kreuzberg not available; skip registry cleanup
end
|
||||
|
||||
# Change to the configured test-documents directory so that fixture file paths like
|
||||
# "pdf/fake_memo.pdf" resolve correctly when running rspec from e2e/ruby/.
|
||||
# spec_helper.rb lives in e2e/ruby/spec/; the fixtures dir resolves three directories up.
|
||||
_test_documents = File.expand_path('../../../test_documents', __dir__)
if Dir.exist?(_test_documents)
  Dir.chdir(_test_documents)
else
  # Stay best-effort (no crash), but say so: otherwise every fixture-based
  # example later fails with an unexplained "No such file or directory".
  warn "spec_helper: fixtures directory not found at #{_test_documents}; " \
       'relative fixture paths will not resolve'
end
|
||||
23
e2e/ruby/spec/validator_management_spec.rb
generated
Normal file
23
e2e/ruby/spec/validator_management_spec.rb
generated
Normal file
@@ -0,0 +1,23 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
# frozen_string_literal: true
|
||||
|
||||
require 'kreuzberg'
|
||||
require 'json'
|
||||
require 'spec_helper'
|
||||
|
||||
# Validator registry management scenarios.
RSpec.describe 'validator_management' do
  it 'validators_clear: Clear all validators and verify list is empty' do
    Kreuzberg.clear_validators

    # The example promises to verify the registry is empty; without this
    # expectation it only proved that the call does not raise.
    # NOTE(review): assumes list_validators returns a collection that responds
    # to #empty? — confirm against the binding.
    expect(Kreuzberg.list_validators).to be_empty
  end

  it 'validators_list: List all registered validators' do
    result = Kreuzberg.list_validators

    expect(result).not_to be_nil
  end
end
|
||||
Reference in New Issue
Block a user