This commit is contained in:
34
e2e/ruby/spec/async_spec.rb
generated
Normal file
34
e2e/ruby/spec/async_spec.rb
generated
Normal file
@@ -0,0 +1,34 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
# frozen_string_literal: true
|
||||
|
||||
require 'kreuzberg'
|
||||
require 'json'
|
||||
require 'spec_helper'
|
||||
|
||||
# End-to-end examples for Kreuzberg.extract_bytes_async.
RSpec.describe 'async' do
  it 'async_extract_bytes: Async extract_bytes call on PDF document' do
    # binread: a PDF is binary data, so read it without text-mode translation.
    content = File.binread('pdf/fake_memo.pdf').bytes
    result = Kreuzberg.extract_bytes_async(content, 'application/pdf')

    expect(result.mime_type.to_s.strip).to eq('application/pdf')
    expect(result.content.length).to be >= 50
  end

  it 'async_extract_bytes_empty_mime: extract_bytes empty MIME async' do
    # The fixture is loaded outside the expectation so that only the
    # extraction call itself is checked for the RuntimeError.
    content = File.binread('text/plain.txt').bytes

    expect {
      Kreuzberg.extract_bytes_async(content, '', Kreuzberg::ExtractionConfig.new)
    }.to raise_error(RuntimeError)
  end

  it 'async_extract_bytes_invalid_mime: extract_bytes unsupported MIME async' do
    content = File.binread('text/plain.txt').bytes

    expect {
      Kreuzberg.extract_bytes_async(content, 'application/x-nonexistent', Kreuzberg::ExtractionConfig.new)
    }.to raise_error(RuntimeError)
  end
end
|
||||
72
e2e/ruby/spec/batch_spec.rb
generated
Normal file
72
e2e/ruby/spec/batch_spec.rb
generated
Normal file
@@ -0,0 +1,72 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
# frozen_string_literal: true
|
||||
|
||||
require 'kreuzberg'
|
||||
require 'json'
|
||||
require 'spec_helper'
|
||||
|
||||
# Batch extraction examples. Each example builds its batch items first and
# then passes them to the sync or async batch entry point.
RSpec.describe 'batch' do
  it 'batch_bytes_invalid_mime: batch_extract_bytes_sync invalid MIME' do
    items = [
      Kreuzberg::BatchBytesItem.new(
        content: [72, 101, 108, 108, 111], # ASCII "Hello"
        mime_type: "application/x-nonexistent",
        config: nil
      )
    ]

    outcome = Kreuzberg.batch_extract_bytes_sync(items)

    expect(outcome).not_to be_nil
  end

  it 'batch_extract_bytes_happy: batch_extract_bytes: happy path with mixed inputs' do
    plain_item = Kreuzberg::BatchBytesItem.new(
      content: [72, 101, 108, 108, 111, 44, 32, 119, 111, 114, 108, 100, 33], # ASCII "Hello, world!"
      mime_type: "text/plain",
      config: nil
    )
    html_item = Kreuzberg::BatchBytesItem.new(
      # ASCII "<html><body>Test</body></html>"
      content: [60, 104, 116, 109, 108, 62, 60, 98, 111, 100, 121, 62, 84, 101, 115, 116,
                60, 47, 98, 111, 100, 121, 62, 60, 47, 104, 116, 109, 108, 62],
      mime_type: "text/html",
      config: nil
    )

    outcome = Kreuzberg.batch_extract_bytes_async([plain_item, html_item])

    expect(outcome.length).to be >= 1
  end

  it 'batch_extract_bytes_mixed_format: batch_extract_bytes: handles unsupported MIME gracefully' do
    items = [
      Kreuzberg::BatchBytesItem.new(
        content: [80, 68, 70, 32, 112, 108, 97, 99, 101, 104, 111, 108, 100, 101, 114], # ASCII "PDF placeholder"
        mime_type: "application/x-unknown",
        config: nil
      )
    ]

    outcome = Kreuzberg.batch_extract_bytes_async(items)

    expect(outcome).not_to be_nil
  end

  it 'batch_extract_bytes_sync_empty_list: batch_extract_bytes_sync: empty batch' do
    outcome = Kreuzberg.batch_extract_bytes_sync([])

    expect(outcome.length).to eq(0)
  end

  it 'batch_extract_bytes_sync_invalid_mime: batch_extract_bytes_sync: unsupported MIME' do
    items = [
      Kreuzberg::BatchBytesItem.new(
        content: [100, 97, 116, 97], # ASCII "data"
        mime_type: "application/x-unknown",
        config: nil
      )
    ]

    outcome = Kreuzberg.batch_extract_bytes_sync(items)

    expect(outcome).not_to be_nil
  end

  it 'batch_file_async_basic: Extract text from multiple files asynchronously' do
    items = ["pdf/fake_memo.pdf", "text/fake_text.txt"].map do |path|
      Kreuzberg::BatchFileItem.new(path: path, config: nil)
    end

    outcome = Kreuzberg.batch_extract_files_async(items)

    expect(outcome).not_to be_nil
  end

  it 'batch_file_async_not_found: batch_extract_file async nonexistent' do
    items = [Kreuzberg::BatchFileItem.new(path: "/nonexistent/a.pdf", config: nil)]

    outcome = Kreuzberg.batch_extract_files_async(items)

    expect(outcome).not_to be_nil
  end

  it 'batch_file_not_found: batch_extract_file_sync nonexistent' do
    items = ["/nonexistent/a.pdf", "/nonexistent/b.txt"].map do |path|
      Kreuzberg::BatchFileItem.new(path: path, config: nil)
    end

    outcome = Kreuzberg.batch_extract_files_sync(items)

    expect(outcome).not_to be_nil
  end

  it 'batch_file_partial: batch_extract_file_sync mixed' do
    items = ["text/plain.txt", "/nonexistent/missing.pdf"].map do |path|
      Kreuzberg::BatchFileItem.new(path: path, config: nil)
    end

    outcome = Kreuzberg.batch_extract_files_sync(items)

    expect(outcome).not_to be_nil
  end

  it 'batch_file_sync_basic: Extract text from multiple files synchronously' do
    items = ["pdf/fake_memo.pdf", "text/fake_text.txt"].map do |path|
      Kreuzberg::BatchFileItem.new(path: path, config: nil)
    end

    outcome = Kreuzberg.batch_extract_files_sync(items)

    expect(outcome).not_to be_nil
  end
end
|
||||
21
e2e/ruby/spec/code_spec.rb
generated
Normal file
21
e2e/ruby/spec/code_spec.rb
generated
Normal file
@@ -0,0 +1,21 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
# frozen_string_literal: true
|
||||
|
||||
require 'kreuzberg'
|
||||
require 'json'
|
||||
require 'spec_helper'
|
||||
|
||||
# Source-code extraction example for a shell script fixture.
RSpec.describe 'code' do
  it 'code_shebang_detection: Test language detection from shebang line via bytes input' do
    # NOTE(review): the description mentions bytes input, but the call below
    # goes through extract_file_sync with a path — confirm which is intended.
    extraction = Kreuzberg.extract_file_sync('code/script.sh', 'text/x-source-code')

    expect(extraction.mime_type.to_s.strip).to eq('text/x-source-code')
    expect(extraction.content.length).to be >= 10

    # Both words must appear in the extracted text.
    %w[build clean].each do |word|
      expect(extraction.content.to_s).to include(word)
    end
  end
end
|
||||
150
e2e/ruby/spec/contract_spec.rb
generated
Normal file
150
e2e/ruby/spec/contract_spec.rb
generated
Normal file
@@ -0,0 +1,150 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
# frozen_string_literal: true
|
||||
|
||||
require 'kreuzberg'
|
||||
require 'json'
|
||||
require 'spec_helper'
|
||||
|
||||
# Contract examples: call the public extraction API with a range of
# ExtractionConfig options against fixture documents and check the MIME type
# and content of the result.
RSpec.describe 'contract' do
  it 'api_batch_bytes_async: Tests async batch bytes extraction API (batch_extract_bytes)' do
    # NOTE(review): the description names batch_extract_bytes, but the call
    # below is extract_file_async — confirm against the fixture definition.
    result = Kreuzberg.extract_file_async('pdf/fake_memo.pdf')

    expect(result.mime_type.to_s.strip).to eq('application/pdf')
    expect(result.content.length).to be >= 10
    expect(['May 5, 2023', 'Mallori'].any? { |v| result.content.to_s.include?(v) }).to be(true)
  end

  it 'api_batch_bytes_with_configs_async: Tests async batch bytes extraction with per-file configs (batch_extract_bytes with file_configs parameter)' do
    # NOTE(review): description names batch_extract_bytes; the call is extract_file_async.
    config = Kreuzberg::ExtractionConfig.new(output_format: 'markdown')
    result = Kreuzberg.extract_file_async('pdf/fake_memo.pdf', nil, config)

    expect(result.mime_type.to_s.strip).to eq('application/pdf')
    expect(result.content.length).to be >= 10
    # skipped: field 'metadata.output_format' not available on result type
  end

  it 'api_batch_file_async: Tests async batch file extraction API (batch_extract_file)' do
    # NOTE(review): description names batch_extract_file; the call is extract_file_async.
    result = Kreuzberg.extract_file_async('pdf/fake_memo.pdf')

    expect(result.mime_type.to_s.strip).to eq('application/pdf')
    expect(result.content.length).to be >= 10
    expect(['May 5, 2023', 'Mallori'].any? { |v| result.content.to_s.include?(v) }).to be(true)
  end

  it 'api_batch_file_with_configs_async: Tests async batch file extraction with per-file configs (batch_extract_files with file_configs parameter)' do
    # NOTE(review): description names batch_extract_files; the call is extract_file_async.
    config = Kreuzberg::ExtractionConfig.new(output_format: 'markdown')
    result = Kreuzberg.extract_file_async('pdf/fake_memo.pdf', nil, config)

    expect(result.mime_type.to_s.strip).to eq('application/pdf')
    expect(result.content.length).to be >= 10
    # skipped: field 'metadata.output_format' not available on result type
  end

  it 'api_extract_bytes_async: Tests async bytes extraction API (extract_bytes)' do
    # NOTE(review): description names extract_bytes; the call is extract_file_async.
    result = Kreuzberg.extract_file_async('pdf/fake_memo.pdf')

    expect(result.mime_type.to_s.strip).to eq('application/pdf')
    expect(result.content.length).to be >= 10
    expect(['May 5, 2023', 'Mallori'].any? { |v| result.content.to_s.include?(v) }).to be(true)
  end

  it 'api_extract_file_async: Tests async file extraction API (extract_file)' do
    result = Kreuzberg.extract_file_async('pdf/fake_memo.pdf')

    expect(result.mime_type.to_s.strip).to eq('application/pdf')
    expect(result.content.length).to be >= 10
    expect(['May 5, 2023', 'Mallori'].any? { |v| result.content.to_s.include?(v) }).to be(true)
  end

  it 'config_chunking_prepend_heading_context: Tests markdown chunker prepends heading hierarchy to chunk content' do
    config = Kreuzberg::ExtractionConfig.new(
      chunking: { 'chunker_type' => 'markdown', 'max_chars' => 300, 'max_overlap' => 50, 'prepend_heading_context' => true }
    )
    result = Kreuzberg.extract_file_sync('markdown/extraction_test.md', nil, config)

    expect(result.content.length).to be >= 10
    # NOTE(review): the generator emitted "skipped: field 'chunks' not available
    # on result type" here, yet the next expectation reads result.chunks. The
    # expectation also passes when chunks is nil or empty — confirm whether a
    # non-empty check was intended.
    expect((result.chunks || []).all? { |c| c.content && !c.content.empty? }).to be(true)
    # skipped: synthetic field 'chunks_have_heading_context' not available on Ruby Chunk binding
    # skipped: synthetic field 'first_chunk_starts_with_heading' not available on Ruby Chunk binding
  end

  it 'config_document_structure_with_headings: Tests document structure with DOCX heading-driven nesting' do
    config = Kreuzberg::ExtractionConfig.new(include_document_structure: true)
    result = Kreuzberg.extract_file_sync('docx/fake.docx', nil, config)

    expect(result.mime_type.to_s.strip).to eq('application/vnd.openxmlformats-officedocument.wordprocessingml.document')
    # skipped: field 'document' not available on result type
    # skipped: field 'document.nodes' not available on result type
  end

  it 'config_element_types: Tests element-based result format with element type assertions on DOCX' do
    config = Kreuzberg::ExtractionConfig.new(result_format: 'element_based')
    result = Kreuzberg.extract_file_sync('docx/unit_test_headers.docx', nil, config)

    expect(result.mime_type.to_s).to include('application/vnd.openxmlformats-officedocument.wordprocessingml.document')
    # skipped: field 'elements' not available on result type
  end

  it 'config_extraction_timeout: Tests that extraction_timeout_secs config field is accepted and does not affect fast extractions' do
    config = Kreuzberg::ExtractionConfig.new(extraction_timeout_secs: 300)
    result = Kreuzberg.extract_file_sync('pdf/fake_memo.pdf', nil, config)

    expect(result.mime_type.to_s.strip).to eq('application/pdf')
    expect(result.content.length).to be >= 10
  end

  it 'config_keywords: Tests keyword extraction via YAKE algorithm' do
    config = Kreuzberg::ExtractionConfig.new(keywords: { 'algorithm' => 'yake', 'max_keywords' => 10 })
    result = Kreuzberg.extract_file_sync('pdf/fake_memo.pdf', nil, config)

    expect(result.mime_type.to_s.strip).to eq('application/pdf')
    expect(result.content.length).to be >= 10
    # skipped: field 'keywords' not available on Ruby ExtractionResult
    # skipped: field 'keywords' not available on Ruby ExtractionResult
  end

  it 'config_pages: Tests page extraction and page marker configuration' do
    config = Kreuzberg::ExtractionConfig.new(pages: { 'extract_pages' => true, 'insert_page_markers' => true })
    result = Kreuzberg.extract_file_sync('pdf/fake_memo.pdf', nil, config)

    expect(result.mime_type.to_s.strip).to eq('application/pdf')
    expect(result.content.length).to be >= 10
    expect(result.content.to_s).to include('PAGE')
  end

  it 'config_quality_enabled: Tests quality scoring produces a score value in [0.0, 1.0]' do
    config = Kreuzberg::ExtractionConfig.new(enable_quality_processing: true)
    result = Kreuzberg.extract_file_sync('pdf/fake_memo.pdf', nil, config)

    expect(result.mime_type.to_s.strip).to eq('application/pdf')
    expect(result.content.length).to be >= 10
    # skipped: field 'quality_score' not available on result type
    # skipped: field 'quality_score' not available on result type
    # skipped: field 'quality_score' not available on result type
  end

  it 'config_security_limits: Tests archive extraction with custom security limits' do
    config = Kreuzberg::ExtractionConfig.new(
      security_limits: { 'max_archive_size' => 104_857_600, 'max_compression_ratio' => 50, 'max_files_in_archive' => 100 }
    )
    result = Kreuzberg.extract_file_sync('archives/documents.zip', nil, config)

    expect(['application/zip', 'application/x-zip-compressed'].any? { |v| result.mime_type.to_s.include?(v) }).to be(true)
    expect(result.content.length).to be >= 10
  end

  it 'config_tree_sitter: Tests tree-sitter configuration round-trip' do
    config = Kreuzberg::ExtractionConfig.new(
      tree_sitter: {
        'groups' => ['web'],
        'languages' => ['python', 'rust'],
        'process' => {
          'comments' => false,
          'diagnostics' => false,
          'docstrings' => false,
          'exports' => true,
          'imports' => true,
          'structure' => true,
          'symbols' => false
        }
      }
    )
    result = Kreuzberg.extract_file_sync('code/hello.py', nil, config)

    expect(result.mime_type.to_s.strip).to eq('text/x-source-code')
    expect(result.content.length).to be >= 5
  end

  it 'output_format_bytes_markdown: Tests markdown output format via bytes extraction API' do
    # binread: a PDF is binary data, so read it without text-mode translation.
    content = File.binread('pdf/fake_memo.pdf').bytes
    config = Kreuzberg::ExtractionConfig.new(output_format: 'markdown')
    result = Kreuzberg.extract_bytes_sync(content, 'application/pdf', config)

    expect(result.mime_type.to_s.strip).to eq('application/pdf')
    expect(result.content.length).to be >= 10
    # skipped: field 'metadata.output_format' not available on result type
  end

  it 'output_format_markdown: Tests Markdown output format' do
    config = Kreuzberg::ExtractionConfig.new(output_format: 'markdown')
    result = Kreuzberg.extract_file_sync('pdf/fake_memo.pdf', nil, config)

    expect(result.mime_type.to_s.strip).to eq('application/pdf')
    expect(result.content.length).to be >= 10
    # skipped: field 'metadata.output_format' not available on result type
  end
end
|
||||
39
e2e/ruby/spec/detection_spec.rb
generated
Normal file
39
e2e/ruby/spec/detection_spec.rb
generated
Normal file
@@ -0,0 +1,39 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
# frozen_string_literal: true
|
||||
|
||||
require 'kreuzberg'
|
||||
require 'json'
|
||||
require 'spec_helper'
|
||||
|
||||
# MIME detection examples: detect a type from raw fixture bytes and look up
# extensions for a MIME type.
RSpec.describe 'detection' do
  it 'detect_mime_bytes_html: Detect HTML MIME from bytes' do
    # binread keeps the fixture bytes exactly as stored on disk.
    content = File.binread('html/html.html').bytes
    result = Kreuzberg.detect_mime_type_from_bytes(content)

    expect(result).not_to be_nil
  end

  it 'detect_mime_bytes_pdf: Detect PDF MIME type from bytes' do
    content = File.binread('pdf/fake_memo.pdf').bytes
    result = Kreuzberg.detect_mime_type_from_bytes(content)

    expect(result).not_to be_nil
  end

  it 'detect_mime_bytes_png: Detect PNG MIME type from bytes' do
    content = File.binread('images/test_hello_world.png').bytes
    result = Kreuzberg.detect_mime_type_from_bytes(content)

    expect(result).not_to be_nil
  end

  it 'get_extensions_unknown_mime: get_extensions unknown MIME' do
    expect {
      Kreuzberg.get_extensions_for_mime('application/x-totally-unknown')
    }.to raise_error(RuntimeError)
  end
end
|
||||
23
e2e/ruby/spec/document_extractor_management_spec.rb
generated
Normal file
23
e2e/ruby/spec/document_extractor_management_spec.rb
generated
Normal file
@@ -0,0 +1,23 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
# frozen_string_literal: true
|
||||
|
||||
require 'kreuzberg'
|
||||
require 'json'
|
||||
require 'spec_helper'
|
||||
|
||||
# Registry management examples for document extractors.
RSpec.describe 'document_extractor_management' do
  it 'document_extractors_clear: Clear all document extractors and verify list is empty' do
    # NOTE(review): the description says the list is verified to be empty, but
    # no expectation is made here; the example passes if the call does not raise.
    Kreuzberg.clear_document_extractors
  end

  it 'extractors_list: List all registered document extractors' do
    registered = Kreuzberg.list_document_extractors

    expect(registered).not_to be_nil
  end
end
|
||||
32
e2e/ruby/spec/embed_async_pending_spec.rb
generated
Normal file
32
e2e/ruby/spec/embed_async_pending_spec.rb
generated
Normal file
@@ -0,0 +1,32 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
# frozen_string_literal: true
|
||||
|
||||
require 'kreuzberg'
|
||||
require 'json'
|
||||
require 'spec_helper'
|
||||
|
||||
# Examples for Kreuzberg.embed_texts_async.
RSpec.describe 'embed_async_pending' do
  it 'embed_texts_async_empty_input: embed_texts_async: empty text list' do
    result = Kreuzberg.embed_texts_async([])

    # Nil check first: placed after a successful #length call it could never
    # be the expectation that fails.
    expect(result).not_to be_nil
    expect(result.length).to eq(0)
  end

  it 'embed_texts_async_happy: embed_texts_async: basic async embedding' do
    result = Kreuzberg.embed_texts_async(['First', 'Second'])

    expect(result).not_to be_nil
    expect(result.length).to be >= 2
  end

  it 'embed_texts_async_preset_switch: embed_texts_async: preset override' do
    # The options are passed as an explicit positional Hash, as in the generated call.
    result = Kreuzberg.embed_texts_async(['Text'], { model: { 'name' => 'balanced', 'type' => 'preset' } })

    expect(result).not_to be_nil
  end
end
|
||||
18
e2e/ruby/spec/embed_extra_spec.rb
generated
Normal file
18
e2e/ruby/spec/embed_extra_spec.rb
generated
Normal file
@@ -0,0 +1,18 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
# frozen_string_literal: true
|
||||
|
||||
require 'kreuzberg'
|
||||
require 'json'
|
||||
require 'spec_helper'
|
||||
|
||||
# Additional example for Kreuzberg.embed_texts with an explicit preset.
RSpec.describe 'embed_extra' do
  it 'embed_texts_batch: Batch embed texts' do
    texts = ['Hello', 'World']
    # Kept as a positional Hash argument, matching the generated call shape.
    options = { model: { 'name' => 'balanced', 'type' => 'preset' } }

    embeddings = Kreuzberg.embed_texts(texts, options)

    expect(embeddings).not_to be_nil
  end
end
|
||||
23
e2e/ruby/spec/embedding_backend_management_spec.rb
generated
Normal file
23
e2e/ruby/spec/embedding_backend_management_spec.rb
generated
Normal file
@@ -0,0 +1,23 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
# frozen_string_literal: true
|
||||
|
||||
require 'kreuzberg'
|
||||
require 'json'
|
||||
require 'spec_helper'
|
||||
|
||||
# Registry management examples for embedding backends.
RSpec.describe 'embedding_backend_management' do
  it 'embedding_backends_clear: Clear all embedding backends and verify list is empty' do
    # NOTE(review): the description says the list is verified to be empty, but
    # no expectation is made here; the example passes if the call does not raise.
    Kreuzberg.clear_embedding_backends
  end

  it 'embedding_backends_list: List all registered embedding backends' do
    registered = Kreuzberg.list_embedding_backends

    expect(registered).not_to be_nil
  end
end
|
||||
43
e2e/ruby/spec/embeddings_spec.rb
generated
Normal file
43
e2e/ruby/spec/embeddings_spec.rb
generated
Normal file
@@ -0,0 +1,43 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
# frozen_string_literal: true
|
||||
|
||||
require 'kreuzberg'
|
||||
require 'json'
|
||||
require 'spec_helper'
|
||||
|
||||
# Examples for synchronous embedding and embedding-preset lookup.
RSpec.describe 'embeddings' do
  it 'embed_texts_different_preset: embed_texts: multilingual preset' do
    result = Kreuzberg.embed_texts(['Hello world', 'Test'], { model: { 'name' => 'multilingual', 'type' => 'preset' } })

    # Nil check first: placed after a successful #length call it could never
    # be the expectation that fails.
    expect(result).not_to be_nil
    expect(result.length).to be >= 2
  end

  it 'get_embedding_preset_known: get_embedding_preset: known preset' do
    result = Kreuzberg.get_embedding_preset('balanced')

    expect(result).not_to be_nil
  end

  it 'get_embedding_preset_nominal: get_embedding_preset: nominal case' do
    result = Kreuzberg.get_embedding_preset('balanced')

    expect(result).not_to be_nil
  end

  it 'get_embedding_preset_unknown: get_embedding_preset: unknown preset fails' do
    result = Kreuzberg.get_embedding_preset('nonexistent-xyz')

    expect(result.nil? || result.empty?).to be(true)
  end

  it 'list_embedding_presets_sanity: list_embedding_presets: returns at least one' do
    result = Kreuzberg.list_embedding_presets

    # Assert on the returned collection itself. The string form of an empty
    # Array is "[]", which is not an empty string, so checking `result.to_s`
    # would pass even when no presets are returned.
    expect(result).not_to be_empty
  end
end
|
||||
47
e2e/ruby/spec/error_spec.rb
generated
Normal file
47
e2e/ruby/spec/error_spec.rb
generated
Normal file
@@ -0,0 +1,47 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
# frozen_string_literal: true
|
||||
|
||||
require 'kreuzberg'
|
||||
require 'json'
|
||||
require 'spec_helper'
|
||||
|
||||
# Error-path examples for Kreuzberg.extract_bytes_sync.
#
# Fixtures are read with File.binread so the bytes handed to the extractor are
# exactly the bytes on disk, and they are loaded outside the `expect { }`
# blocks so that only the extraction call is checked for the RuntimeError.
RSpec.describe 'error' do
  it 'error_empty_bytes: Graceful handling of empty bytes (should not error)' do
    content = File.binread('text/empty.txt').bytes
    result = Kreuzberg.extract_bytes_sync(content, 'text/plain', Kreuzberg::ExtractionConfig.new)

    expect(result).not_to be_nil
  end

  it 'error_empty_mime: Error when extracting with empty MIME type' do
    content = File.binread('text/plain.txt').bytes

    expect {
      Kreuzberg.extract_bytes_sync(content, '', Kreuzberg::ExtractionConfig.new)
    }.to raise_error(RuntimeError)
  end

  it 'error_extract_bytes_conflicting_ocr: extract_bytes force+disable OCR' do
    content = File.binread('text/fake_text.txt').bytes
    # Both OCR flags are set at once; the call is expected to raise.
    config = Kreuzberg::ExtractionConfig.new(disable_ocr: true, force_ocr: true)

    expect {
      Kreuzberg.extract_bytes_sync(content, 'text/plain', config)
    }.to raise_error(RuntimeError)
  end

  it 'error_invalid_mime_format: Error when extracting with invalid MIME type format' do
    content = File.binread('text/plain.txt').bytes

    expect {
      Kreuzberg.extract_bytes_sync(content, 'not-a-mime', Kreuzberg::ExtractionConfig.new)
    }.to raise_error(RuntimeError)
  end

  it 'error_unsupported_mime: Error when extracting with unsupported MIME type' do
    content = File.binread('text/plain.txt').bytes

    expect {
      Kreuzberg.extract_bytes_sync(content, 'application/x-nonexistent', Kreuzberg::ExtractionConfig.new)
    }.to raise_error(RuntimeError)
  end
end
|
||||
47
e2e/ruby/spec/format_specific_spec.rb
generated
Normal file
47
e2e/ruby/spec/format_specific_spec.rb
generated
Normal file
@@ -0,0 +1,47 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
# frozen_string_literal: true
|
||||
|
||||
require 'kreuzberg'
|
||||
require 'json'
|
||||
require 'spec_helper'
|
||||
|
||||
# Per-format extraction examples (DOCX, HWPX, PDF, PPTX, XLSX).
RSpec.describe 'format_specific' do
  it 'format_docx_standalone: Standalone DOCX extraction using extract_bytes_sync' do
    # binread: the fixture is binary data, so read it without text-mode translation.
    content = File.binread('docx/fake.docx').bytes
    result = Kreuzberg.extract_bytes_sync(content, 'application/vnd.openxmlformats-officedocument.wordprocessingml.document')

    expect(result.content.length).to be >= 20
  end

  it 'format_hwpx_standalone: Standalone HWPX extraction using extract_bytes_sync' do
    content = File.binread('hwpx/simple.hwpx').bytes
    result = Kreuzberg.extract_bytes_sync(content, 'application/haansofthwpx')

    expect(result.content.length).to be >= 20
    expect(result.content.to_s).to include('Hello from HWPX')
  end

  it 'format_pdf_text: Standalone PDF text extraction using extract_bytes_sync' do
    content = File.binread('pdf/fake_memo.pdf').bytes
    result = Kreuzberg.extract_bytes_sync(content, 'application/pdf')

    expect(result.content.length).to be >= 50
    expect(['Mallori', 'May'].any? { |v| result.content.to_s.include?(v) }).to be(true)
  end

  it 'format_pptx: PPTX presentation extraction using extract_file_sync' do
    result = Kreuzberg.extract_file_sync('pptx/simple.pptx', 'application/vnd.openxmlformats-officedocument.presentationml.presentation')

    expect(result).not_to be_nil
  end

  it 'format_xlsx: XLSX spreadsheet extraction using extract_file_sync' do
    result = Kreuzberg.extract_file_sync('xlsx/stanley_cups.xlsx', 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet')

    expect(result).not_to be_nil
  end
end
|
||||
32
e2e/ruby/spec/mime_utilities_spec.rb
generated
Normal file
32
e2e/ruby/spec/mime_utilities_spec.rb
generated
Normal file
@@ -0,0 +1,32 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
# frozen_string_literal: true
|
||||
|
||||
require 'kreuzberg'
|
||||
require 'json'
|
||||
require 'spec_helper'
|
||||
|
||||
# MIME utility examples: detection from bytes and extension lookup.
RSpec.describe 'mime_utilities' do
  it 'mime_detect_bytes: Detect MIME type from file bytes' do
    # binread: the fixture is binary data, so read it without text-mode translation.
    content = File.binread('pdf/fake_memo.pdf').bytes
    result = Kreuzberg.detect_mime_type_from_bytes(content)

    expect(result).to include('pdf')
  end

  it 'mime_detect_image: Detect MIME type from PNG image bytes' do
    content = File.binread('images/test_hello_world.png').bytes
    result = Kreuzberg.detect_mime_type_from_bytes(content)

    expect(result).to include('png')
  end

  it 'mime_get_extensions: Get file extensions for a MIME type' do
    result = Kreuzberg.get_extensions_for_mime('application/pdf')

    expect(result).to include('pdf')
  end
end
|
||||
28
e2e/ruby/spec/ocr_backend_management_spec.rb
generated
Normal file
28
e2e/ruby/spec/ocr_backend_management_spec.rb
generated
Normal file
@@ -0,0 +1,28 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
# frozen_string_literal: true
|
||||
|
||||
require 'kreuzberg'
|
||||
require 'json'
|
||||
require 'spec_helper'
|
||||
|
||||
# Registry management examples for OCR backends.
RSpec.describe 'ocr_backend_management' do
  it 'ocr_backends_clear: Clear all OCR backends and verify list is empty' do
    # NOTE(review): the description says the list is verified to be empty, but
    # no expectation is made here; the example passes if the call does not raise.
    Kreuzberg.clear_ocr_backends
  end

  it 'ocr_backends_list: List all registered OCR backends' do
    registered = Kreuzberg.list_ocr_backends

    expect(registered).not_to be_nil
  end

  it 'ocr_backends_unregister: Unregister nonexistent OCR backend gracefully' do
    # No expectation: the example passes as long as this call does not raise.
    Kreuzberg.unregister_ocr_backend('nonexistent-backend-xyz')
  end
end
|
||||
26
e2e/ruby/spec/pdf_spec.rb
generated
Normal file
26
e2e/ruby/spec/pdf_spec.rb
generated
Normal file
@@ -0,0 +1,26 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
# frozen_string_literal: true
|
||||
|
||||
require 'kreuzberg'
|
||||
require 'json'
|
||||
require 'spec_helper'
|
||||
|
||||
# Examples for Kreuzberg.render_pdf_page_to_png.
RSpec.describe 'pdf' do
  it 'render_pdf_page_first: render_pdf_page_to_png: first page' do
    # binread: a PDF is binary data, so read it without text-mode translation.
    pdf_bytes = File.binread('pdf/fake_memo.pdf').bytes
    result = Kreuzberg.render_pdf_page_to_png(pdf_bytes, 0)

    expect(result.length).to be >= 100
  end

  it 'render_pdf_page_out_of_range: render_pdf_page_to_png: page out of range' do
    # The fixture is loaded outside the expectation so that only the render
    # call is checked for the RuntimeError.
    pdf_bytes = File.binread('pdf/fake_memo.pdf').bytes

    expect {
      Kreuzberg.render_pdf_page_to_png(pdf_bytes, 999)
    }.to raise_error(RuntimeError)
  end
end
|
||||
144
e2e/ruby/spec/plugin_api_spec.rb
generated
Normal file
144
e2e/ruby/spec/plugin_api_spec.rb
generated
Normal file
@@ -0,0 +1,144 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
# frozen_string_literal: true
|
||||
|
||||
require 'kreuzberg'
|
||||
require 'json'
|
||||
require 'spec_helper'
|
||||
|
||||
# Exercises the plugin registration API: each "trait bridge" example builds an
# anonymous stub object exposing the methods listed in its body, registers it
# under a test-prefixed name and unregisters it again. The examples pass as
# long as none of the calls raise.
RSpec.describe 'plugin_api' do
  it 'register_document_extractor_trait_bridge: register_document_extractor: trait bridge' do
    # Stub extractor returning fixed placeholder values.
    stub_register_document_extractor_trait_bridge = Class.new do
      def name = 'test-extractor'

      def initialize
        nil
      end

      def shutdown
        nil
      end

      def version = '1.0.0'
      def extract_bytes(content, mime_type, config) = '{}'
      def supported_mime_types = []
    end.new

    Kreuzberg.register_document_extractor(stub_register_document_extractor_trait_bridge, 'test-extractor')
    Kreuzberg.unregister_document_extractor('test-extractor')
  end

  it 'register_embedding_backend_trait_bridge: register_embedding_backend: trait bridge' do
    # Stub embedding backend returning fixed placeholder values.
    stub_register_embedding_backend_trait_bridge = Class.new do
      def name = 'test-embedding-backend'

      def initialize
        nil
      end

      def shutdown
        nil
      end

      def version = '1.0.0'
      def dimensions = 1
      def embed(texts) = []
    end.new

    Kreuzberg.register_embedding_backend(stub_register_embedding_backend_trait_bridge, 'test-embedding-backend')
    Kreuzberg.unregister_embedding_backend('test-embedding-backend')
  end

  it 'register_ocr_backend_trait_bridge: register_ocr_backend: trait bridge' do
    # Stub OCR backend returning fixed placeholder values.
    stub_register_ocr_backend_trait_bridge = Class.new do
      def name = 'test-backend'

      def initialize
        nil
      end

      def shutdown
        nil
      end

      def version = '1.0.0'
      def process_image(image_bytes, config) = '{}'
      def supports_language(lang) = false
      def backend_type = '{}'
    end.new

    Kreuzberg.register_ocr_backend(stub_register_ocr_backend_trait_bridge, 'test-backend')
    Kreuzberg.unregister_ocr_backend('test-backend')
  end

  it 'register_post_processor_trait_bridge: register_post_processor: trait bridge' do
    # Stub post-processor returning fixed placeholder values.
    stub_register_post_processor_trait_bridge = Class.new do
      def name = 'test-processor'

      def initialize
        nil
      end

      def shutdown
        nil
      end

      def version = '1.0.0'
      def process(result, config) = nil
      def processing_stage = '{}'
    end.new

    Kreuzberg.register_post_processor(stub_register_post_processor_trait_bridge, 'test-processor')
    Kreuzberg.unregister_post_processor('test-processor')
  end

  it 'register_renderer_trait_bridge: register_renderer: trait bridge' do
    # Stub renderer returning fixed placeholder values.
    stub_register_renderer_trait_bridge = Class.new do
      def name = 'test-renderer'

      def initialize
        nil
      end

      def shutdown
        nil
      end

      def version = '1.0.0'
      def render(doc) = ''
    end.new

    Kreuzberg.register_renderer(stub_register_renderer_trait_bridge, 'test-renderer')
    Kreuzberg.unregister_renderer('test-renderer')
  end

  it 'register_validator_trait_bridge: register_validator: trait bridge' do
    # Stub validator returning fixed placeholder values.
    stub_register_validator_trait_bridge = Class.new do
      def name = 'test-validator'

      def initialize
        nil
      end

      def shutdown
        nil
      end

      def version = '1.0.0'
      def validate(result, config) = nil
    end.new

    Kreuzberg.register_validator(stub_register_validator_trait_bridge, 'test-validator')
    Kreuzberg.unregister_validator('test-validator')
  end

  # The examples below only call the unregister functions; nothing is
  # registered inside the example itself.
  # NOTE(review): the "after_register" names suggest a prior registration is
  # expected — confirm whether that fixture setup is intentionally absent.
  it 'unregister_document_extractor_after_register: unregister_document_extractor' do
    Kreuzberg.unregister_document_extractor('test-extractor')
  end

  it 'unregister_embedding_backend_after_register: unregister_embedding_backend' do
    Kreuzberg.unregister_embedding_backend('test-embedding-backend')
  end

  it 'unregister_post_processor_after_register: unregister_post_processor' do
    Kreuzberg.unregister_post_processor('test-processor')
  end

  it 'unregister_renderer_after_register: unregister_renderer' do
    Kreuzberg.unregister_renderer('test-renderer')
  end

  it 'unregister_validator_after_register: unregister_validator' do
    Kreuzberg.unregister_validator('test-validator')
  end
end
|
||||
23
e2e/ruby/spec/post_processor_management_spec.rb
generated
Normal file
23
e2e/ruby/spec/post_processor_management_spec.rb
generated
Normal file
@@ -0,0 +1,23 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
# frozen_string_literal: true
|
||||
|
||||
require 'kreuzberg'
|
||||
require 'json'
|
||||
require 'spec_helper'
|
||||
|
||||
# Management calls for the post-processor registry: clearing and listing.
RSpec.describe 'post_processor_management' do
  it 'post_processors_clear: Clear all post-processors and verify list is empty' do
    Kreuzberg.clear_post_processors
    # The description promises an empty registry, so assert it instead of
    # only checking that the call does not raise.
    # NOTE(review): assumes list_post_processors returns a collection that
    # responds to #empty? — confirm against the binding.
    expect(Kreuzberg.list_post_processors).to be_empty
  end

  it 'post_processors_list: List all registered post-processors' do
    result = Kreuzberg.list_post_processors
    expect(result).not_to be_nil
  end
end
|
||||
30
e2e/ruby/spec/registry_operations_spec.rb
generated
Normal file
30
e2e/ruby/spec/registry_operations_spec.rb
generated
Normal file
@@ -0,0 +1,30 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
# frozen_string_literal: true
|
||||
|
||||
require 'kreuzberg'
|
||||
require 'json'
|
||||
require 'spec_helper'
|
||||
|
||||
# Looks up file extensions by MIME type; each example only requires a
# non-nil answer from Kreuzberg.get_extensions_for_mime.
RSpec.describe 'registry_operations' do
  it 'extensions_docx: Get file extensions for DOCX MIME type' do
    result = Kreuzberg.get_extensions_for_mime(
      'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
    )
    expect(result).not_to be_nil
  end

  it 'extensions_html: Get file extensions for HTML MIME type' do
    result = Kreuzberg.get_extensions_for_mime('text/html')
    expect(result).not_to be_nil
  end

  it 'extensions_pdf: Get file extensions for PDF MIME type' do
    result = Kreuzberg.get_extensions_for_mime('application/pdf')
    expect(result).not_to be_nil
  end
end
|
||||
48
e2e/ruby/spec/registry_spec.rb
generated
Normal file
48
e2e/ruby/spec/registry_spec.rb
generated
Normal file
@@ -0,0 +1,48 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
# frozen_string_literal: true
|
||||
|
||||
require 'kreuzberg'
|
||||
require 'json'
|
||||
require 'spec_helper'
|
||||
|
||||
# Smoke checks for the registry listing functions: every list call must
# return a non-nil value.
RSpec.describe 'registry' do
  it 'list_document_extractors: List document extractors' do
    result = Kreuzberg.list_document_extractors
    expect(result).not_to be_nil
  end

  it 'list_embedding_backends: List embedding backends' do
    result = Kreuzberg.list_embedding_backends
    expect(result).not_to be_nil
  end

  it 'list_ocr_backends: List OCR backends' do
    result = Kreuzberg.list_ocr_backends
    expect(result).not_to be_nil
  end

  it 'list_post_processors: List post-processors' do
    result = Kreuzberg.list_post_processors
    expect(result).not_to be_nil
  end

  it 'list_renderers: List renderers' do
    result = Kreuzberg.list_renderers
    expect(result).not_to be_nil
  end

  it 'list_validators: List validators' do
    result = Kreuzberg.list_validators
    expect(result).not_to be_nil
  end
end
|
||||
23
e2e/ruby/spec/renderer_management_spec.rb
generated
Normal file
23
e2e/ruby/spec/renderer_management_spec.rb
generated
Normal file
@@ -0,0 +1,23 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
# frozen_string_literal: true
|
||||
|
||||
require 'kreuzberg'
|
||||
require 'json'
|
||||
require 'spec_helper'
|
||||
|
||||
# Management calls for the renderer registry: clearing and listing.
RSpec.describe 'renderer_management' do
  it 'renderers_clear: Clear all renderers and verify list is empty' do
    Kreuzberg.clear_renderers
    # The description promises an empty registry, so assert it instead of
    # only checking that the call does not raise.
    # NOTE(review): assumes list_renderers returns a collection that responds
    # to #empty? — confirm against the binding.
    expect(Kreuzberg.list_renderers).to be_empty
  end

  it 'renderers_list: List all registered renderers' do
    result = Kreuzberg.list_renderers
    expect(result).not_to be_nil
  end
end
|
||||
90
e2e/ruby/spec/smoke_spec.rb
generated
Normal file
90
e2e/ruby/spec/smoke_spec.rb
generated
Normal file
@@ -0,0 +1,90 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
# frozen_string_literal: true
|
||||
|
||||
require 'kreuzberg'
|
||||
require 'json'
|
||||
require 'spec_helper'
|
||||
|
||||
RSpec.describe 'smoke' do
|
||||
# Collects the string forms of +item+.
#
# Strings, Symbols and Numerics yield a one-element array holding their
# +to_s+. Any other object yields the +to_s+ of each attribute among kind,
# name, signature, path, alias, text and source that it responds to (in that
# order), followed by the object's own +to_s+.
#
# @param item [Object] value to describe
# @return [Array<String>] string forms of +item+
def alef_e2e_item_texts(item)
  case item
  when String, Symbol, Numeric
    return [item.to_s]
  end

  attribute_texts = %i[kind name signature path alias text source].filter_map do |attr|
    item.send(attr).to_s if item.respond_to?(attr)
  end
  [*attribute_texts, item.to_s]
end
|
||||
# Smoke examples: each one extracts a fixture (paths are relative to the
# current working directory) and checks the reported MIME type plus, where
# listed, a minimum content length and expected substrings.
it 'ocr_image_png: OCR: PNG image extraction with OCR enabled. In WASM this exercises the Uint8Array bridge parameter and Promise await in the generated OcrBackend bridge.' do
  # binread: the fixture is binary, so avoid text-mode newline/EOF translation.
  content = File.binread('images/test_hello_world.png').bytes
  result = Kreuzberg.extract_bytes_async(content, 'image/png', Kreuzberg::ExtractionConfig.new)
  expect(result.mime_type.to_s.strip).to eq('image/png')
  expect(result.content.length).to be >= 1
  # At least one of the candidate words must appear in the extracted text.
  expect(['Hello', 'World', 'hello', 'world'].any? { |v| result.content.to_s.include?(v) }).to be(true)
end

it 'smoke_docx_basic: Smoke test: DOCX with formatted text' do
  result = Kreuzberg.extract_file_async(
    'docx/fake.docx',
    'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
    Kreuzberg::ExtractionConfig.new
  )
  expect(result.mime_type.to_s.strip).to eq('application/vnd.openxmlformats-officedocument.wordprocessingml.document')
  expect(result.content.length).to be >= 20
  expect(['Lorem', 'ipsum', 'document', 'text'].any? { |v| result.content.to_s.include?(v) }).to be(true)
end

it 'smoke_html_basic: Smoke test: HTML table extraction' do
  result = Kreuzberg.extract_file_async('html/simple_table.html', 'text/html', Kreuzberg::ExtractionConfig.new)
  expect(result.mime_type.to_s.strip).to eq('text/html')
  expect(result.content.length).to be >= 10
  expect(['Sample Data Table', 'Laptop', 'Electronics', 'Product'].any? { |v| result.content.to_s.include?(v) }).to be(true)
end

it 'smoke_image_png: Smoke test: PNG image (without OCR, metadata only)' do
  # No MIME type is passed (nil); only the reported MIME type is checked.
  result = Kreuzberg.extract_file_async('images/sample.png', nil, Kreuzberg::ExtractionConfig.new(disable_ocr: true))
  expect(result.mime_type.to_s.strip).to eq('image/png')
end

it 'smoke_json_basic: Smoke test: JSON file extraction' do
  result = Kreuzberg.extract_file_async('json/simple.json', 'application/json', Kreuzberg::ExtractionConfig.new)
  expect(result.mime_type.to_s.strip).to eq('application/json')
  expect(result.content.length).to be >= 5
end

it 'smoke_pdf_basic: Smoke test: PDF with simple text extraction' do
  result = Kreuzberg.extract_file_async('pdf/fake_memo.pdf', 'application/pdf', Kreuzberg::ExtractionConfig.new)
  expect(result.mime_type.to_s.strip).to eq('application/pdf')
  expect(result.content.length).to be >= 50
  expect(['May 5, 2023', 'To Whom it May Concern'].any? { |v| result.content.to_s.include?(v) }).to be(true)
end

it 'smoke_txt_basic: Smoke test: Plain text file' do
  result = Kreuzberg.extract_file_async('text/report.txt', 'text/plain', Kreuzberg::ExtractionConfig.new)
  expect(result.mime_type.to_s.strip).to eq('text/plain')
  expect(result.content.length).to be >= 5
end

it 'smoke_xlsx_basic: Smoke test: XLSX with basic spreadsheet data including tables' do
  result = Kreuzberg.extract_file_async(
    'xlsx/stanley_cups.xlsx',
    'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
    Kreuzberg::ExtractionConfig.new
  )
  expect(result.mime_type.to_s.strip).to eq('application/vnd.openxmlformats-officedocument.spreadsheetml.sheet')
  expect(result.content.length).to be >= 100
  # Every one of these strings must appear in the extracted text.
  expect(result.content.to_s).to include('Team')
  expect(result.content.to_s).to include('Location')
  expect(result.content.to_s).to include('Stanley Cups')
  expect(result.content.to_s).to include('Blues')
  expect(result.content.to_s).to include('Flyers')
  expect(result.content.to_s).to include('Maple Leafs')
  expect(result.content.to_s).to include('STL')
  expect(result.content.to_s).to include('PHI')
  expect(result.content.to_s).to include('TOR')
  # skipped: field 'tables' not available on result type
  # skipped: enum variant accessor 'metadata.format.excel.sheet_count' not available on Ruby (serialized to Hash)
  # skipped: enum variant accessor 'metadata.format.excel.sheet_names' not available on Ruby (serialized to Hash)
end
|
||||
end
|
||||
73
e2e/ruby/spec/spec_helper.rb
generated
Normal file
73
e2e/ruby/spec/spec_helper.rb
generated
Normal file
@@ -0,0 +1,73 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
# frozen_string_literal: true
|
||||
|
||||
# RSpec hooks to isolate plugin registry state between tests.
|
||||
# Unregisters any test-prefixed backends (test-*, test_*) after each test
|
||||
# to prevent pollution from one test affecting subsequent tests.
|
||||
# Best-effort registry isolation: snapshot every plugin registry before each
# example and afterwards unregister any entry the example added whose name
# starts with 'test-' or 'test_'. Entries present before the example and
# entries without a test prefix are left alone.
begin
  # Set is only autoloaded on Ruby >= 3.2; require it explicitly so the
  # snapshots below also work on older interpreters.
  require 'set'
  require 'kreuzberg'

  # Registry kind => [list method, unregister method] on the Kreuzberg module.
  alef_registries = {
    ocr_backends: %i[list_ocr_backends unregister_ocr_backend],
    embedding_backends: %i[list_embedding_backends unregister_embedding_backend],
    document_extractors: %i[list_document_extractors unregister_document_extractor],
    renderers: %i[list_renderers unregister_renderer],
    validators: %i[list_validators unregister_validator],
    post_processors: %i[list_post_processors unregister_post_processor]
  }.freeze

  # Names currently present in one registry; an empty set when listing fails.
  alef_registry_names = lambda do |list_method|
    Kreuzberg.public_send(list_method).to_set
  rescue StandardError
    Set.new
  end

  RSpec.configure do |config|
    # Track initial registry state before each test
    config.before(:each) do
      @_initial_registry_names = alef_registries.transform_values do |methods|
        alef_registry_names.call(methods.first)
      end
    end

    # Clean up test-prefixed backends after each test
    config.after(:each) do
      # The snapshot may be missing if the before hook did not run.
      initial = @_initial_registry_names || {}

      alef_registries.each do |kind, (list_method, unregister_method)|
        added = alef_registry_names.call(list_method) - initial.fetch(kind, Set.new)
        added.each do |name|
          next unless name.to_s.start_with?('test-', 'test_')

          Kreuzberg.public_send(unregister_method, name)
        end
      rescue StandardError
        # Cleanup failures are non-fatal; carry on with the next registry.
      end
    end
  end
rescue LoadError
  # Kreuzberg not available; skip registry cleanup
end
|
||||
|
||||
# Change to the configured test-documents directory so that fixture file paths like
# "pdf/fake_memo.pdf" resolve correctly when running rspec from e2e/ruby/.
# The path is '../../../test_documents' expanded against this file's directory
# (e2e/ruby/spec/). When that directory does not exist, the working directory
# is left unchanged.
_test_documents = File.expand_path('../../../test_documents', __dir__)
Dir.chdir(_test_documents) if Dir.exist?(_test_documents)
|
||||
23
e2e/ruby/spec/validator_management_spec.rb
generated
Normal file
23
e2e/ruby/spec/validator_management_spec.rb
generated
Normal file
@@ -0,0 +1,23 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
# frozen_string_literal: true
|
||||
|
||||
require 'kreuzberg'
|
||||
require 'json'
|
||||
require 'spec_helper'
|
||||
|
||||
# Management calls for the validator registry: clearing and listing.
RSpec.describe 'validator_management' do
  it 'validators_clear: Clear all validators and verify list is empty' do
    Kreuzberg.clear_validators
    # The description promises an empty registry, so assert it instead of
    # only checking that the call does not raise.
    # NOTE(review): assumes list_validators returns a collection that responds
    # to #empty? — confirm against the binding.
    expect(Kreuzberg.list_validators).to be_empty
  end

  it 'validators_list: List all registered validators' do
    result = Kreuzberg.list_validators
    expect(result).not_to be_nil
  end
end
|
||||
Reference in New Issue
Block a user