This commit is contained in:
183
e2e/elixir/test/contract_test.exs
generated
Normal file
183
e2e/elixir/test/contract_test.exs
generated
Normal file
@@ -0,0 +1,183 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
# E2e tests for category: contract
|
||||
defmodule E2e.ContractTest do
|
||||
use ExUnit.Case, async: false
|
||||
|
||||
# Turns a format value into a display string.
#
# * a binary is returned unchanged;
# * otherwise `.image`, `.pdf` and `.html` are probed in that order: an image
#   map carrying a binary `:format` yields that format, a map under `.pdf`
#   yields "PDF", a map under `.html` yields "HTML";
# * anything else falls back to `inspect/1` of the whole value.
#
# `cond` evaluates its clauses top to bottom and stops at the first truthy one,
# so `.pdf` and `.html` are only read when the earlier probes did not match.
defp alef_e2e_format_to_string(value) do
  cond do
    is_binary(value) -> value
    match?(%{format: fmt} when is_binary(fmt), value.image) -> value.image.format
    is_map(value.pdf) -> "PDF"
    is_map(value.html) -> "HTML"
    true -> inspect(value)
  end
end
|
||||
|
||||
describe "api_batch_bytes_async" do
  # Runs the fake memo PDF through Kreuzberg.extract_file_async/1 and checks the
  # trimmed MIME type, a minimum content size (bytes for a binary, element count
  # for a list) and that at least one known marker string is present.
  # NOTE(review): the test name says "batch_bytes" but the call is the
  # single-file async API — confirm this is what the generator intends.
  test "api_batch_bytes_async" do
    pdf_path = "../../test_documents/pdf/fake_memo.pdf"
    {:ok, result} = Kreuzberg.extract_file_async(pdf_path)

    mime = String.trim(result.mime_type)
    assert mime == "application/pdf"

    content = result.content

    long_enough? =
      cond do
        is_binary(content) -> byte_size(content) >= 10
        is_list(content) -> length(content) >= 10
        true -> String.length(content) >= 10
      end

    assert long_enough?

    assert String.contains?(to_string(content), ["May 5, 2023", "Mallori"])
  end
end
|
||||
|
||||
describe "api_batch_bytes_with_configs_async" do
  # Extracts the fake memo PDF via Kreuzberg.extract_file_async/2 with a JSON
  # config requesting markdown output, then checks the trimmed MIME type and a
  # minimum content size (bytes for a binary, element count for a list).
  # NOTE(review): the test name says "batch_bytes" but the call is the
  # single-file async API — confirm this is what the generator intends.
  test "api_batch_bytes_with_configs_async" do
    pdf_path = "../../test_documents/pdf/fake_memo.pdf"
    config_json = "{\"output_format\":\"markdown\"}"
    {:ok, result} = Kreuzberg.extract_file_async(pdf_path, config: config_json)

    mime = String.trim(result.mime_type)
    assert mime == "application/pdf"

    content = result.content

    long_enough? =
      cond do
        is_binary(content) -> byte_size(content) >= 10
        is_list(content) -> length(content) >= 10
        true -> String.length(content) >= 10
      end

    assert long_enough?

    # skipped: field 'metadata.output_format' not available on result type
  end
end
|
||||
|
||||
describe "api_batch_file_async" do
  # Runs the fake memo PDF through Kreuzberg.extract_file_async/1 and checks the
  # trimmed MIME type, a minimum content size (bytes for a binary, element count
  # for a list) and that at least one known marker string is present.
  # NOTE(review): the test name says "batch" but a single file is extracted —
  # confirm this is what the generator intends.
  test "api_batch_file_async" do
    pdf_path = "../../test_documents/pdf/fake_memo.pdf"
    {:ok, result} = Kreuzberg.extract_file_async(pdf_path)

    mime = String.trim(result.mime_type)
    assert mime == "application/pdf"

    content = result.content

    long_enough? =
      cond do
        is_binary(content) -> byte_size(content) >= 10
        is_list(content) -> length(content) >= 10
        true -> String.length(content) >= 10
      end

    assert long_enough?

    assert String.contains?(to_string(content), ["May 5, 2023", "Mallori"])
  end
end
|
||||
|
||||
describe "api_batch_file_with_configs_async" do
  # Extracts the fake memo PDF via Kreuzberg.extract_file_async/2 with a JSON
  # config requesting markdown output, then checks the trimmed MIME type and a
  # minimum content size (bytes for a binary, element count for a list).
  # NOTE(review): the test name says "batch" but a single file is extracted —
  # confirm this is what the generator intends.
  test "api_batch_file_with_configs_async" do
    pdf_path = "../../test_documents/pdf/fake_memo.pdf"
    config_json = "{\"output_format\":\"markdown\"}"
    {:ok, result} = Kreuzberg.extract_file_async(pdf_path, config: config_json)

    mime = String.trim(result.mime_type)
    assert mime == "application/pdf"

    content = result.content

    long_enough? =
      cond do
        is_binary(content) -> byte_size(content) >= 10
        is_list(content) -> length(content) >= 10
        true -> String.length(content) >= 10
      end

    assert long_enough?

    # skipped: field 'metadata.output_format' not available on result type
  end
end
|
||||
|
||||
describe "api_extract_bytes_async" do
  # Runs the fake memo PDF through Kreuzberg.extract_file_async/1 and checks the
  # trimmed MIME type, a minimum content size (bytes for a binary, element count
  # for a list) and that at least one known marker string is present.
  # NOTE(review): the test name says "extract_bytes" but the call is the
  # file-path API — confirm this is what the generator intends.
  test "api_extract_bytes_async" do
    pdf_path = "../../test_documents/pdf/fake_memo.pdf"
    {:ok, result} = Kreuzberg.extract_file_async(pdf_path)

    mime = String.trim(result.mime_type)
    assert mime == "application/pdf"

    content = result.content

    long_enough? =
      cond do
        is_binary(content) -> byte_size(content) >= 10
        is_list(content) -> length(content) >= 10
        true -> String.length(content) >= 10
      end

    assert long_enough?

    assert String.contains?(to_string(content), ["May 5, 2023", "Mallori"])
  end
end
|
||||
|
||||
describe "api_extract_file_async" do
  # Runs the fake memo PDF through Kreuzberg.extract_file_async/1 and checks the
  # trimmed MIME type, a minimum content size (bytes for a binary, element count
  # for a list) and that at least one known marker string is present.
  test "api_extract_file_async" do
    pdf_path = "../../test_documents/pdf/fake_memo.pdf"
    {:ok, result} = Kreuzberg.extract_file_async(pdf_path)

    mime = String.trim(result.mime_type)
    assert mime == "application/pdf"

    content = result.content

    long_enough? =
      cond do
        is_binary(content) -> byte_size(content) >= 10
        is_list(content) -> length(content) >= 10
        true -> String.length(content) >= 10
      end

    assert long_enough?

    assert String.contains?(to_string(content), ["May 5, 2023", "Mallori"])
  end
end
|
||||
|
||||
describe "config_chunking_prepend_heading_context" do
  # Extracts a markdown document with the markdown chunker configured
  # (max_chars 300, max_overlap 50, prepend_heading_context true) and checks:
  #   * a minimum content size (bytes for a binary, element count for a list);
  #   * the result carries a non-empty list of chunks;
  #   * every chunk has non-empty content and a non-nil heading context;
  #   * the first chunk's content, ignoring leading whitespace, starts with "#".
  test "config_chunking_prepend_heading_context" do
    md_path = "../../test_documents/markdown/extraction_test.md"

    config_json =
      "{\"chunking\":{\"chunker_type\":\"markdown\",\"max_chars\":300,\"max_overlap\":50,\"prepend_heading_context\":true}}"

    {:ok, result} = Kreuzberg.extract_file_sync(md_path, config: config_json)

    content = result.content

    long_enough? =
      cond do
        is_binary(content) -> byte_size(content) >= 10
        is_list(content) -> length(content) >= 10
        true -> String.length(content) >= 10
      end

    assert long_enough?

    # NOTE(review): the generator emitted "skipped: field 'chunks' not available
    # on result type" here, yet the assertions below read `result.chunks` —
    # confirm the generator's field map for the Elixir result type.
    chunks = result.chunks

    # Enum.all?/2 is true for an empty enumerable, so without this check the two
    # per-chunk assertions below would pass vacuously on a nil/empty chunk list
    # and the problem would only show up as a bare `false` in the last check.
    assert is_list(chunks) and chunks != []

    assert Enum.all?(chunks, fn c -> c.content != nil and c.content != "" end)

    assert Enum.all?(chunks, fn c ->
             c.metadata != nil and c.metadata.heading_context != nil
           end)

    [first_chunk | _] = chunks
    assert is_map(first_chunk)

    first_text = String.trim_leading(first_chunk.content || "")
    assert String.starts_with?(first_text, "#")
  end
end
|
||||
|
||||
describe "config_document_structure_with_headings" do
  # Extracts a DOCX file with `include_document_structure` enabled and checks
  # the trimmed MIME type. The structure-related assertions were skipped by the
  # generator (see the markers below).
  test "config_document_structure_with_headings" do
    docx_path = "../../test_documents/docx/fake.docx"
    config_json = "{\"include_document_structure\":true}"
    {:ok, result} = Kreuzberg.extract_file_sync(docx_path, config: config_json)

    mime = String.trim(result.mime_type)
    assert mime == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"

    # skipped: field 'document' not available on result type
    # skipped: field 'document.nodes' not available on result type
  end
end
|
||||
|
||||
describe "config_element_types" do
  # Extracts a DOCX file with `result_format` set to "element_based" and checks
  # that the reported MIME type contains the wordprocessingml document type.
  test "config_element_types" do
    docx_path = "../../test_documents/docx/unit_test_headers.docx"
    config_json = "{\"result_format\":\"element_based\"}"
    {:ok, result} = Kreuzberg.extract_file_sync(docx_path, config: config_json)

    accepted_mimes = ["application/vnd.openxmlformats-officedocument.wordprocessingml.document"]
    assert String.contains?(to_string(result.mime_type), accepted_mimes)

    # skipped: field 'elements' not available on result type
  end
end
|
||||
|
||||
describe "config_extraction_timeout" do
  # Extracts the fake memo PDF with `extraction_timeout_secs` set to 300 and
  # checks the trimmed MIME type and a minimum content size (bytes for a binary,
  # element count for a list).
  test "config_extraction_timeout" do
    pdf_path = "../../test_documents/pdf/fake_memo.pdf"
    config_json = "{\"extraction_timeout_secs\":300}"
    {:ok, result} = Kreuzberg.extract_file_sync(pdf_path, config: config_json)

    mime = String.trim(result.mime_type)
    assert mime == "application/pdf"

    content = result.content

    long_enough? =
      cond do
        is_binary(content) -> byte_size(content) >= 10
        is_list(content) -> length(content) >= 10
        true -> String.length(content) >= 10
      end

    assert long_enough?
  end
end
|
||||
|
||||
describe "config_keywords" do
  # Extracts the fake memo PDF with a keywords config (algorithm "yake",
  # max_keywords 10) and checks the trimmed MIME type and a minimum content size
  # (bytes for a binary, element count for a list). Both keyword assertions were
  # skipped by the generator (see the markers below).
  test "config_keywords" do
    pdf_path = "../../test_documents/pdf/fake_memo.pdf"
    config_json = "{\"keywords\":{\"algorithm\":\"yake\",\"max_keywords\":10}}"
    {:ok, result} = Kreuzberg.extract_file_sync(pdf_path, config: config_json)

    mime = String.trim(result.mime_type)
    assert mime == "application/pdf"

    content = result.content

    long_enough? =
      cond do
        is_binary(content) -> byte_size(content) >= 10
        is_list(content) -> length(content) >= 10
        true -> String.length(content) >= 10
      end

    assert long_enough?

    # skipped: field 'keywords' not available on Elixir ExtractionResult
    # skipped: field 'keywords' not available on Elixir ExtractionResult
  end
end
|
||||
|
||||
describe "config_pages" do
  # Extracts the fake memo PDF with page extraction and page markers enabled,
  # then checks the trimmed MIME type, a minimum content size (bytes for a
  # binary, element count for a list) and that the content contains "PAGE".
  test "config_pages" do
    pdf_path = "../../test_documents/pdf/fake_memo.pdf"
    config_json = "{\"pages\":{\"extract_pages\":true,\"insert_page_markers\":true}}"
    {:ok, result} = Kreuzberg.extract_file_sync(pdf_path, config: config_json)

    mime = String.trim(result.mime_type)
    assert mime == "application/pdf"

    content = result.content

    long_enough? =
      cond do
        is_binary(content) -> byte_size(content) >= 10
        is_list(content) -> length(content) >= 10
        true -> String.length(content) >= 10
      end

    assert long_enough?

    assert String.contains?(to_string(content), ["PAGE"])
  end
end
|
||||
|
||||
describe "config_quality_enabled" do
  # Extracts the fake memo PDF with `enable_quality_processing` turned on and
  # checks the trimmed MIME type and a minimum content size (bytes for a binary,
  # element count for a list). All three quality_score assertions were skipped
  # by the generator (see the markers below).
  test "config_quality_enabled" do
    pdf_path = "../../test_documents/pdf/fake_memo.pdf"
    config_json = "{\"enable_quality_processing\":true}"
    {:ok, result} = Kreuzberg.extract_file_sync(pdf_path, config: config_json)

    mime = String.trim(result.mime_type)
    assert mime == "application/pdf"

    content = result.content

    long_enough? =
      cond do
        is_binary(content) -> byte_size(content) >= 10
        is_list(content) -> length(content) >= 10
        true -> String.length(content) >= 10
      end

    assert long_enough?

    # skipped: field 'quality_score' not available on result type
    # skipped: field 'quality_score' not available on result type
    # skipped: field 'quality_score' not available on result type
  end
end
|
||||
|
||||
describe "config_security_limits" do
  # Extracts a ZIP archive with explicit security limits (archive size,
  # compression ratio, file count) and checks that the MIME type contains one of
  # the accepted zip types plus a minimum content size (bytes for a binary,
  # element count for a list).
  test "config_security_limits" do
    zip_path = "../../test_documents/archives/documents.zip"

    config_json =
      "{\"security_limits\":{\"max_archive_size\":104857600,\"max_compression_ratio\":50,\"max_files_in_archive\":100}}"

    {:ok, result} = Kreuzberg.extract_file_sync(zip_path, config: config_json)

    accepted_mimes = ["application/zip", "application/x-zip-compressed"]
    assert String.contains?(to_string(result.mime_type), accepted_mimes)

    content = result.content

    long_enough? =
      cond do
        is_binary(content) -> byte_size(content) >= 10
        is_list(content) -> length(content) >= 10
        true -> String.length(content) >= 10
      end

    assert long_enough?
  end
end
|
||||
|
||||
describe "config_tree_sitter" do
  # Extracts a Python source file with a tree_sitter config and checks the
  # trimmed MIME type and a minimum content size of 5 (bytes for a binary,
  # element count for a list).
  test "config_tree_sitter" do
    source_path = "../../test_documents/code/hello.py"

    config_json =
      "{\"tree_sitter\":{\"groups\":[\"web\"],\"languages\":[\"python\",\"rust\"],\"process\":{\"comments\":false,\"diagnostics\":false,\"docstrings\":false,\"exports\":true,\"imports\":true,\"structure\":true,\"symbols\":false}}}"

    {:ok, result} = Kreuzberg.extract_file_sync(source_path, config: config_json)

    mime = String.trim(result.mime_type)
    assert mime == "text/x-source-code"

    content = result.content

    long_enough? =
      cond do
        is_binary(content) -> byte_size(content) >= 5
        is_list(content) -> length(content) >= 5
        true -> String.length(content) >= 5
      end

    assert long_enough?
  end
end
|
||||
|
||||
describe "output_format_bytes_markdown" do
  # Reads the fake memo PDF into memory, extracts it through
  # Kreuzberg.extract_bytes_sync/3 with a markdown output config, and checks the
  # trimmed MIME type and a minimum content size (bytes for a binary, element
  # count for a list).
  # NOTE(review): the config is passed here as a third positional argument,
  # while the file-based tests in this module pass it as a `config:` option —
  # confirm against the extract_bytes_sync signature.
  test "output_format_bytes_markdown" do
    pdf_bytes = File.read!("../../test_documents/pdf/fake_memo.pdf")
    config_json = "{\"output_format\":\"markdown\"}"
    {:ok, result} = Kreuzberg.extract_bytes_sync(pdf_bytes, "application/pdf", config_json)

    mime = String.trim(result.mime_type)
    assert mime == "application/pdf"

    extracted = result.content

    long_enough? =
      cond do
        is_binary(extracted) -> byte_size(extracted) >= 10
        is_list(extracted) -> length(extracted) >= 10
        true -> String.length(extracted) >= 10
      end

    assert long_enough?

    # skipped: field 'metadata.output_format' not available on result type
  end
end
|
||||
|
||||
describe "output_format_markdown" do
  # Extracts the fake memo PDF via Kreuzberg.extract_file_sync/2 with a markdown
  # output config and checks the trimmed MIME type and a minimum content size
  # (bytes for a binary, element count for a list).
  test "output_format_markdown" do
    pdf_path = "../../test_documents/pdf/fake_memo.pdf"
    config_json = "{\"output_format\":\"markdown\"}"
    {:ok, result} = Kreuzberg.extract_file_sync(pdf_path, config: config_json)

    mime = String.trim(result.mime_type)
    assert mime == "application/pdf"

    content = result.content

    long_enough? =
      cond do
        is_binary(content) -> byte_size(content) >= 10
        is_list(content) -> length(content) >= 10
        true -> String.length(content) >= 10
      end

    assert long_enough?

    # skipped: field 'metadata.output_format' not available on result type
  end
end
|
||||
end
|
||||
Reference in New Issue
Block a user