Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

150
e2e/ruby/spec/contract_spec.rb generated Normal file
View File

@@ -0,0 +1,150 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
# frozen_string_literal: true
require 'kreuzberg'
require 'json'
require 'spec_helper'
# Contract specs for the Kreuzberg Ruby binding. Each example runs a single
# extraction against a fixture path and asserts on properties of the result
# (MIME type, minimum content length, expected substrings).
#
# NOTE(review): the file header marks this file as generated by alef. Changes
# made here should be mirrored in the generator, otherwise regenerating the
# file will presumably overwrite them.
#
# NOTE(review): several api_* examples are named after batch or bytes APIs,
# yet all six call Kreuzberg.extract_file_async on the same fixture — confirm
# this mapping is intended.
RSpec.describe 'contract' do
  it 'api_batch_bytes_async: Tests async batch bytes extraction API (batch_extract_bytes)' do
    result = Kreuzberg.extract_file_async('pdf/fake_memo.pdf')
    expect(result.mime_type.to_s.strip).to eq('application/pdf')
    expect(result.content.length).to be >= 10
    # Passes when at least one of the expected substrings is present.
    expect(result.content.to_s).to include('May 5, 2023').or(include('Mallori'))
  end

  it 'api_batch_bytes_with_configs_async: Tests async batch bytes extraction with per-file configs (batch_extract_bytes with file_configs parameter)' do
    config = Kreuzberg::ExtractionConfig.new(output_format: 'markdown')
    result = Kreuzberg.extract_file_async('pdf/fake_memo.pdf', nil, config)
    expect(result.mime_type.to_s.strip).to eq('application/pdf')
    expect(result.content.length).to be >= 10
    # skipped: field 'metadata.output_format' not available on result type
  end

  it 'api_batch_file_async: Tests async batch file extraction API (batch_extract_file)' do
    result = Kreuzberg.extract_file_async('pdf/fake_memo.pdf')
    expect(result.mime_type.to_s.strip).to eq('application/pdf')
    expect(result.content.length).to be >= 10
    # Passes when at least one of the expected substrings is present.
    expect(result.content.to_s).to include('May 5, 2023').or(include('Mallori'))
  end

  it 'api_batch_file_with_configs_async: Tests async batch file extraction with per-file configs (batch_extract_files with file_configs parameter)' do
    config = Kreuzberg::ExtractionConfig.new(output_format: 'markdown')
    result = Kreuzberg.extract_file_async('pdf/fake_memo.pdf', nil, config)
    expect(result.mime_type.to_s.strip).to eq('application/pdf')
    expect(result.content.length).to be >= 10
    # skipped: field 'metadata.output_format' not available on result type
  end

  it 'api_extract_bytes_async: Tests async bytes extraction API (extract_bytes)' do
    result = Kreuzberg.extract_file_async('pdf/fake_memo.pdf')
    expect(result.mime_type.to_s.strip).to eq('application/pdf')
    expect(result.content.length).to be >= 10
    # Passes when at least one of the expected substrings is present.
    expect(result.content.to_s).to include('May 5, 2023').or(include('Mallori'))
  end

  it 'api_extract_file_async: Tests async file extraction API (extract_file)' do
    result = Kreuzberg.extract_file_async('pdf/fake_memo.pdf')
    expect(result.mime_type.to_s.strip).to eq('application/pdf')
    expect(result.content.length).to be >= 10
    # Passes when at least one of the expected substrings is present.
    expect(result.content.to_s).to include('May 5, 2023').or(include('Mallori'))
  end

  it 'config_chunking_prepend_heading_context: Tests markdown chunker prepends heading hierarchy to chunk content' do
    config = Kreuzberg::ExtractionConfig.new(
      chunking: {
        'chunker_type' => 'markdown',
        'max_chars' => 300,
        'max_overlap' => 50,
        'prepend_heading_context' => true
      }
    )
    result = Kreuzberg.extract_file_sync('markdown/extraction_test.md', nil, config)
    expect(result.content.length).to be >= 10
    # NOTE(review): the generator emitted "skipped: field 'chunks' not available
    # on result type" at this point, yet the assertion below reads result.chunks —
    # confirm which is correct. As written, the check passes vacuously when
    # chunks is nil or empty.
    expect((result.chunks || []).all? { |c| c.content && !c.content.empty? }).to be(true)
    # skipped: synthetic field 'chunks_have_heading_context' not available on Ruby Chunk binding
    # skipped: synthetic field 'first_chunk_starts_with_heading' not available on Ruby Chunk binding
  end

  it 'config_document_structure_with_headings: Tests document structure with DOCX heading-driven nesting' do
    config = Kreuzberg::ExtractionConfig.new(include_document_structure: true)
    result = Kreuzberg.extract_file_sync('docx/fake.docx', nil, config)
    expect(result.mime_type.to_s.strip).to eq('application/vnd.openxmlformats-officedocument.wordprocessingml.document')
    # skipped: field 'document' not available on result type
    # skipped: field 'document.nodes' not available on result type
  end

  it 'config_element_types: Tests element-based result format with element type assertions on DOCX' do
    config = Kreuzberg::ExtractionConfig.new(result_format: 'element_based')
    result = Kreuzberg.extract_file_sync('docx/unit_test_headers.docx', nil, config)
    # Substring match (not equality) on the reported MIME type.
    expect(result.mime_type.to_s).to include('application/vnd.openxmlformats-officedocument.wordprocessingml.document')
    # skipped: field 'elements' not available on result type
  end

  it 'config_extraction_timeout: Tests that extraction_timeout_secs config field is accepted and does not affect fast extractions' do
    config = Kreuzberg::ExtractionConfig.new(extraction_timeout_secs: 300)
    result = Kreuzberg.extract_file_sync('pdf/fake_memo.pdf', nil, config)
    expect(result.mime_type.to_s.strip).to eq('application/pdf')
    expect(result.content.length).to be >= 10
  end

  it 'config_keywords: Tests keyword extraction via YAKE algorithm' do
    config = Kreuzberg::ExtractionConfig.new(keywords: { 'algorithm' => 'yake', 'max_keywords' => 10 })
    result = Kreuzberg.extract_file_sync('pdf/fake_memo.pdf', nil, config)
    expect(result.mime_type.to_s.strip).to eq('application/pdf')
    expect(result.content.length).to be >= 10
    # skipped: field 'keywords' not available on Ruby ExtractionResult
    # skipped: field 'keywords' not available on Ruby ExtractionResult
  end

  it 'config_pages: Tests page extraction and page marker configuration' do
    config = Kreuzberg::ExtractionConfig.new(pages: { 'extract_pages' => true, 'insert_page_markers' => true })
    result = Kreuzberg.extract_file_sync('pdf/fake_memo.pdf', nil, config)
    expect(result.mime_type.to_s.strip).to eq('application/pdf')
    expect(result.content.length).to be >= 10
    expect(result.content.to_s).to include('PAGE')
  end

  it 'config_quality_enabled: Tests quality scoring produces a score value in [0.0, 1.0]' do
    config = Kreuzberg::ExtractionConfig.new(enable_quality_processing: true)
    result = Kreuzberg.extract_file_sync('pdf/fake_memo.pdf', nil, config)
    expect(result.mime_type.to_s.strip).to eq('application/pdf')
    expect(result.content.length).to be >= 10
    # skipped: field 'quality_score' not available on result type
    # skipped: field 'quality_score' not available on result type
    # skipped: field 'quality_score' not available on result type
  end

  it 'config_security_limits: Tests archive extraction with custom security limits' do
    config = Kreuzberg::ExtractionConfig.new(
      security_limits: {
        'max_archive_size' => 104_857_600,
        'max_compression_ratio' => 50,
        'max_files_in_archive' => 100
      }
    )
    result = Kreuzberg.extract_file_sync('archives/documents.zip', nil, config)
    # Either ZIP MIME spelling is accepted; substring match, not equality.
    expect(result.mime_type.to_s).to include('application/zip').or(include('application/x-zip-compressed'))
    expect(result.content.length).to be >= 10
  end

  it 'config_tree_sitter: Tests tree-sitter configuration round-trip' do
    config = Kreuzberg::ExtractionConfig.new(
      tree_sitter: {
        'groups' => ['web'],
        'languages' => %w[python rust],
        'process' => {
          'comments' => false,
          'diagnostics' => false,
          'docstrings' => false,
          'exports' => true,
          'imports' => true,
          'structure' => true,
          'symbols' => false
        }
      }
    )
    result = Kreuzberg.extract_file_sync('code/hello.py', nil, config)
    expect(result.mime_type.to_s.strip).to eq('text/x-source-code')
    expect(result.content.length).to be >= 5
  end

  it 'output_format_bytes_markdown: Tests markdown output format via bytes extraction API' do
    # Read the PDF in binary mode: File.read opens in text mode, which can
    # apply newline conversion on Windows and alter the raw bytes.
    content = File.binread('pdf/fake_memo.pdf').bytes
    config = Kreuzberg::ExtractionConfig.new(output_format: 'markdown')
    result = Kreuzberg.extract_bytes_sync(content, 'application/pdf', config)
    expect(result.mime_type.to_s.strip).to eq('application/pdf')
    expect(result.content.length).to be >= 10
    # skipped: field 'metadata.output_format' not available on result type
  end

  it 'output_format_markdown: Tests Markdown output format' do
    config = Kreuzberg::ExtractionConfig.new(output_format: 'markdown')
    result = Kreuzberg.extract_file_sync('pdf/fake_memo.pdf', nil, config)
    expect(result.mime_type.to_s.strip).to eq('application/pdf')
    expect(result.content.length).to be >= 10
    # skipped: field 'metadata.output_format' not available on result type
  end
end