This commit is contained in:
150
e2e/ruby/spec/contract_spec.rb
generated
Normal file
150
e2e/ruby/spec/contract_spec.rb
generated
Normal file
@@ -0,0 +1,150 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
# frozen_string_literal: true
|
||||
|
||||
require 'kreuzberg'
|
||||
require 'json'
|
||||
require 'spec_helper'
|
||||
|
||||
# Contract examples for the Kreuzberg Ruby binding. Each example extracts one
# fixture document and checks the reported MIME type and/or the extracted
# content. Assertions the generator could not express for Ruby are kept as
# "skipped" comments.
#
# NOTE(review): the file header marks this spec as generated; changes made here
# presumably have to be mirrored in the generator, or regeneration will
# overwrite them.
# NOTE(review): fixture paths are relative, so these examples presumably rely
# on spec_helper (or the runner) setting the working directory — confirm.
RSpec.describe 'contract' do
  # NOTE(review): the api_batch_* and api_extract_bytes_async examples are
  # described as batch/bytes API tests, yet every one of them calls
  # Kreuzberg.extract_file_async on a single file — confirm this is intended.
  it 'api_batch_bytes_async: Tests async batch bytes extraction API (batch_extract_bytes)' do
    result = Kreuzberg.extract_file_async('pdf/fake_memo.pdf')

    expect(result.mime_type.to_s.strip).to eq('application/pdf')
    expect(result.content.length).to be >= 10
    # Passes when at least one of the expected phrases is present.
    expect(result.content.to_s).to include('May 5, 2023').or(include('Mallori'))
  end

  it 'api_batch_bytes_with_configs_async: Tests async batch bytes extraction with per-file configs (batch_extract_bytes with file_configs parameter)' do
    config = Kreuzberg::ExtractionConfig.new(output_format: 'markdown')
    result = Kreuzberg.extract_file_async('pdf/fake_memo.pdf', nil, config)

    expect(result.mime_type.to_s.strip).to eq('application/pdf')
    expect(result.content.length).to be >= 10
    # skipped: field 'metadata.output_format' not available on result type
  end

  it 'api_batch_file_async: Tests async batch file extraction API (batch_extract_file)' do
    result = Kreuzberg.extract_file_async('pdf/fake_memo.pdf')

    expect(result.mime_type.to_s.strip).to eq('application/pdf')
    expect(result.content.length).to be >= 10
    # Passes when at least one of the expected phrases is present.
    expect(result.content.to_s).to include('May 5, 2023').or(include('Mallori'))
  end

  it 'api_batch_file_with_configs_async: Tests async batch file extraction with per-file configs (batch_extract_files with file_configs parameter)' do
    config = Kreuzberg::ExtractionConfig.new(output_format: 'markdown')
    result = Kreuzberg.extract_file_async('pdf/fake_memo.pdf', nil, config)

    expect(result.mime_type.to_s.strip).to eq('application/pdf')
    expect(result.content.length).to be >= 10
    # skipped: field 'metadata.output_format' not available on result type
  end

  it 'api_extract_bytes_async: Tests async bytes extraction API (extract_bytes)' do
    result = Kreuzberg.extract_file_async('pdf/fake_memo.pdf')

    expect(result.mime_type.to_s.strip).to eq('application/pdf')
    expect(result.content.length).to be >= 10
    # Passes when at least one of the expected phrases is present.
    expect(result.content.to_s).to include('May 5, 2023').or(include('Mallori'))
  end

  it 'api_extract_file_async: Tests async file extraction API (extract_file)' do
    result = Kreuzberg.extract_file_async('pdf/fake_memo.pdf')

    expect(result.mime_type.to_s.strip).to eq('application/pdf')
    expect(result.content.length).to be >= 10
    # Passes when at least one of the expected phrases is present.
    expect(result.content.to_s).to include('May 5, 2023').or(include('Mallori'))
  end

  it 'config_chunking_prepend_heading_context: Tests markdown chunker prepends heading hierarchy to chunk content' do
    config = Kreuzberg::ExtractionConfig.new(
      chunking: {
        'chunker_type' => 'markdown',
        'max_chars' => 300,
        'max_overlap' => 50,
        'prepend_heading_context' => true
      }
    )
    result = Kreuzberg.extract_file_sync('markdown/extraction_test.md', nil, config)

    expect(result.content.length).to be >= 10
    # skipped: field 'chunks' not available on result type
    # NOTE(review): the skip note above conflicts with the assertion below,
    # which does read result.chunks. The assertion is also vacuously true when
    # chunks is nil or empty — confirm which behavior is intended.
    expect((result.chunks || []).all? { |c| c.content && !c.content.empty? }).to be(true)
    # skipped: synthetic field 'chunks_have_heading_context' not available on Ruby Chunk binding
    # skipped: synthetic field 'first_chunk_starts_with_heading' not available on Ruby Chunk binding
  end

  it 'config_document_structure_with_headings: Tests document structure with DOCX heading-driven nesting' do
    config = Kreuzberg::ExtractionConfig.new(include_document_structure: true)
    result = Kreuzberg.extract_file_sync('docx/fake.docx', nil, config)

    expect(result.mime_type.to_s.strip)
      .to eq('application/vnd.openxmlformats-officedocument.wordprocessingml.document')
    # skipped: field 'document' not available on result type
    # skipped: field 'document.nodes' not available on result type
  end

  it 'config_element_types: Tests element-based result format with element type assertions on DOCX' do
    config = Kreuzberg::ExtractionConfig.new(result_format: 'element_based')
    result = Kreuzberg.extract_file_sync('docx/unit_test_headers.docx', nil, config)

    # Substring match (not equality), as in the generated assertion.
    expect(result.mime_type.to_s)
      .to include('application/vnd.openxmlformats-officedocument.wordprocessingml.document')
    # skipped: field 'elements' not available on result type
  end

  it 'config_extraction_timeout: Tests that extraction_timeout_secs config field is accepted and does not affect fast extractions' do
    config = Kreuzberg::ExtractionConfig.new(extraction_timeout_secs: 300)
    result = Kreuzberg.extract_file_sync('pdf/fake_memo.pdf', nil, config)

    expect(result.mime_type.to_s.strip).to eq('application/pdf')
    expect(result.content.length).to be >= 10
  end

  it 'config_keywords: Tests keyword extraction via YAKE algorithm' do
    config = Kreuzberg::ExtractionConfig.new(keywords: { 'algorithm' => 'yake', 'max_keywords' => 10 })
    result = Kreuzberg.extract_file_sync('pdf/fake_memo.pdf', nil, config)

    expect(result.mime_type.to_s.strip).to eq('application/pdf')
    expect(result.content.length).to be >= 10
    # skipped: field 'keywords' not available on Ruby ExtractionResult
    # skipped: field 'keywords' not available on Ruby ExtractionResult
  end

  it 'config_pages: Tests page extraction and page marker configuration' do
    config = Kreuzberg::ExtractionConfig.new(pages: { 'extract_pages' => true, 'insert_page_markers' => true })
    result = Kreuzberg.extract_file_sync('pdf/fake_memo.pdf', nil, config)

    expect(result.mime_type.to_s.strip).to eq('application/pdf')
    expect(result.content.length).to be >= 10
    expect(result.content.to_s).to include('PAGE')
  end

  it 'config_quality_enabled: Tests quality scoring produces a score value in [0.0, 1.0]' do
    config = Kreuzberg::ExtractionConfig.new(enable_quality_processing: true)
    result = Kreuzberg.extract_file_sync('pdf/fake_memo.pdf', nil, config)

    expect(result.mime_type.to_s.strip).to eq('application/pdf')
    expect(result.content.length).to be >= 10
    # skipped: field 'quality_score' not available on result type
    # skipped: field 'quality_score' not available on result type
    # skipped: field 'quality_score' not available on result type
  end

  it 'config_security_limits: Tests archive extraction with custom security limits' do
    config = Kreuzberg::ExtractionConfig.new(
      security_limits: {
        'max_archive_size' => 104_857_600,
        'max_compression_ratio' => 50,
        'max_files_in_archive' => 100
      }
    )
    result = Kreuzberg.extract_file_sync('archives/documents.zip', nil, config)

    # Either ZIP MIME spelling is accepted (substring match).
    expect(result.mime_type.to_s)
      .to include('application/zip').or(include('application/x-zip-compressed'))
    expect(result.content.length).to be >= 10
  end

  it 'config_tree_sitter: Tests tree-sitter configuration round-trip' do
    config = Kreuzberg::ExtractionConfig.new(
      tree_sitter: {
        'groups' => ['web'],
        'languages' => ['python', 'rust'],
        'process' => {
          'comments' => false,
          'diagnostics' => false,
          'docstrings' => false,
          'exports' => true,
          'imports' => true,
          'structure' => true,
          'symbols' => false
        }
      }
    )
    result = Kreuzberg.extract_file_sync('code/hello.py', nil, config)

    expect(result.mime_type.to_s.strip).to eq('text/x-source-code')
    expect(result.content.length).to be >= 5
  end

  it 'output_format_bytes_markdown: Tests markdown output format via bytes extraction API' do
    # The PDF is binary data: binread avoids text-mode newline translation and
    # encoding handling that File.read would apply on some platforms.
    # NOTE(review): .bytes yields an Array of Integers — confirm that
    # extract_bytes_sync expects that rather than a binary String.
    content = File.binread('pdf/fake_memo.pdf').bytes
    config = Kreuzberg::ExtractionConfig.new(output_format: 'markdown')
    result = Kreuzberg.extract_bytes_sync(content, 'application/pdf', config)

    expect(result.mime_type.to_s.strip).to eq('application/pdf')
    expect(result.content.length).to be >= 10
    # skipped: field 'metadata.output_format' not available on result type
  end

  it 'output_format_markdown: Tests Markdown output format' do
    config = Kreuzberg::ExtractionConfig.new(output_format: 'markdown')
    result = Kreuzberg.extract_file_sync('pdf/fake_memo.pdf', nil, config)

    expect(result.mime_type.to_s.strip).to eq('application/pdf')
    expect(result.content.length).to be >= 10
    # skipped: field 'metadata.output_format' not available on result type
  end
end
|
||||
Reference in New Issue
Block a user