This commit is contained in:
90
e2e/ruby/spec/smoke_spec.rb
generated
Normal file
90
e2e/ruby/spec/smoke_spec.rb
generated
Normal file
@@ -0,0 +1,90 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
# frozen_string_literal: true
|
||||
|
||||
require 'kreuzberg'
|
||||
require 'json'
|
||||
require 'spec_helper'
|
||||
|
||||
RSpec.describe 'smoke' do
|
||||
# Builds the list of strings that can represent +item+ in text assertions.
#
# Strings, symbols and numbers are represented by their +to_s+ form alone.
# For any other object, the value of each descriptive attribute the object
# publicly responds to (kind, name, signature, path, alias, text, source —
# in that order) is stringified and collected, followed by the object's own
# +to_s+. A nil attribute value contributes an empty string.
#
# @param item [Object] the value to describe
# @return [Array<String>] candidate text representations of +item+
def alef_e2e_item_texts(item)
  case item
  when String, Symbol, Numeric
    return [item.to_s]
  end

  attribute_texts = %i[kind name signature path alias text source].filter_map do |attr|
    # public_send keeps the dynamic call within the object's public interface,
    # matching what respond_to? just checked.
    item.public_send(attr).to_s if item.respond_to?(attr)
  end

  attribute_texts << item.to_s
end
|
||||
# OCR smoke test: passes the raw bytes of a PNG fixture to the bytes-based
# extraction entry point, then checks the reported MIME type and that the
# extracted content mentions one of the expected words.
it 'ocr_image_png: OCR: PNG image extraction with OCR enabled. In WASM this exercises the Uint8Array bridge parameter and Promise await in the generated OcrBackend bridge.' do
  # PNG is binary data, so read it in binary mode: binread never applies
  # text-mode newline translation or encoding conversion to the bytes.
  content = File.binread('images/test_hello_world.png').bytes
  result = Kreuzberg.extract_bytes_async(content, 'image/png', Kreuzberg::ExtractionConfig.new)

  expect(result.mime_type.to_s.strip).to eq('image/png')
  expect(result.content.length).to be >= 1
  # Passing requires at least one of these words to appear in the content.
  expect(%w[Hello World hello world].any? { |v| result.content.to_s.include?(v) }).to be(true)
end
|
||||
|
||||
# DOCX smoke test: extracts a DOCX fixture with a default configuration and
# checks the MIME type, a minimum content length, and one expected keyword.
it 'smoke_docx_basic: Smoke test: DOCX with formatted text' do
  docx_mime = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
  config = Kreuzberg::ExtractionConfig.new
  extraction = Kreuzberg.extract_file_async('docx/fake.docx', docx_mime, config)

  expect(extraction.mime_type.to_s.strip).to eq(docx_mime)
  expect(extraction.content.length).to be >= 20

  # Any single keyword appearing in the content is sufficient.
  keyword_found = %w[Lorem ipsum document text].any? do |keyword|
    extraction.content.to_s.include?(keyword)
  end
  expect(keyword_found).to be(true)
end
|
||||
|
||||
# HTML smoke test: extracts an HTML table fixture and checks the MIME type,
# a minimum content length, and that one expected phrase is present.
it 'smoke_html_basic: Smoke test: HTML table extraction' do
  html_mime = 'text/html'
  config = Kreuzberg::ExtractionConfig.new
  extraction = Kreuzberg.extract_file_async('html/simple_table.html', html_mime, config)

  expect(extraction.mime_type.to_s.strip).to eq(html_mime)
  expect(extraction.content.length).to be >= 10

  # Any single phrase appearing in the content is sufficient.
  phrases = ['Sample Data Table', 'Laptop', 'Electronics', 'Product']
  phrase_found = phrases.any? { |phrase| extraction.content.to_s.include?(phrase) }
  expect(phrase_found).to be(true)
end
|
||||
|
||||
# PNG smoke test with OCR disabled: no MIME type is supplied to the
# extraction call (nil), and only the reported MIME type is asserted.
it 'smoke_image_png: Smoke test: PNG image (without OCR, metadata only)' do
  config = Kreuzberg::ExtractionConfig.new(disable_ocr: true)
  extraction = Kreuzberg.extract_file_async('images/sample.png', nil, config)

  reported_mime = extraction.mime_type.to_s.strip
  expect(reported_mime).to eq('image/png')
end
|
||||
|
||||
# JSON smoke test: extracts a JSON fixture and checks the MIME type and a
# minimum content length.
it 'smoke_json_basic: Smoke test: JSON file extraction' do
  json_mime = 'application/json'
  config = Kreuzberg::ExtractionConfig.new
  extraction = Kreuzberg.extract_file_async('json/simple.json', json_mime, config)

  expect(extraction.mime_type.to_s.strip).to eq(json_mime)
  expect(extraction.content.length).to be >= 5
end
|
||||
|
||||
# PDF smoke test: extracts a PDF fixture and checks the MIME type, a minimum
# content length, and that one expected phrase is present.
it 'smoke_pdf_basic: Smoke test: PDF with simple text extraction' do
  pdf_mime = 'application/pdf'
  config = Kreuzberg::ExtractionConfig.new
  extraction = Kreuzberg.extract_file_async('pdf/fake_memo.pdf', pdf_mime, config)

  expect(extraction.mime_type.to_s.strip).to eq(pdf_mime)
  expect(extraction.content.length).to be >= 50

  # Either phrase appearing in the content is sufficient.
  phrases = ['May 5, 2023', 'To Whom it May Concern']
  phrase_found = phrases.any? { |phrase| extraction.content.to_s.include?(phrase) }
  expect(phrase_found).to be(true)
end
|
||||
|
||||
# Plain-text smoke test: extracts a text fixture and checks the MIME type
# and a minimum content length.
it 'smoke_txt_basic: Smoke test: Plain text file' do
  text_mime = 'text/plain'
  config = Kreuzberg::ExtractionConfig.new
  extraction = Kreuzberg.extract_file_async('text/report.txt', text_mime, config)

  expect(extraction.mime_type.to_s.strip).to eq(text_mime)
  expect(extraction.content.length).to be >= 5
end
|
||||
|
||||
# XLSX smoke test: extracts a spreadsheet fixture and checks the MIME type,
# a minimum content length, and that every expected cell value is present.
it 'smoke_xlsx_basic: Smoke test: XLSX with basic spreadsheet data including tables' do
  xlsx_mime = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
  config = Kreuzberg::ExtractionConfig.new
  extraction = Kreuzberg.extract_file_async('xlsx/stanley_cups.xlsx', xlsx_mime, config)

  expect(extraction.mime_type.to_s.strip).to eq(xlsx_mime)
  expect(extraction.content.length).to be >= 100

  # Every value must be present; they are checked in this order, so the
  # first missing value is the one reported.
  required_values = ['Team', 'Location', 'Stanley Cups', 'Blues', 'Flyers', 'Maple Leafs', 'STL', 'PHI', 'TOR']
  required_values.each do |value|
    expect(extraction.content.to_s).to include(value)
  end

  # skipped: field 'tables' not available on result type
  # skipped: enum variant accessor 'metadata.format.excel.sheet_count' not available on Ruby (serialized to Hash)
  # skipped: enum variant accessor 'metadata.format.excel.sheet_names' not available on Ruby (serialized to Hash)
end
|
||||
end
|
||||
Reference in New Issue
Block a user