# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef

# E2e tests for category: contract
defmodule E2e.ContractTest do
  use ExUnit.Case, async: false

  # Fixture path and config payload shared by most tests in this module.
  @fake_memo_pdf "../../test_documents/pdf/fake_memo.pdf"
  @markdown_config "{\"output_format\":\"markdown\"}"

  # Renders a format value as a string.
  #
  # Binaries are returned unchanged. Any other value is read through its
  # `image`, `pdf` and `html` fields, in that order:
  #   * `image` is a map with a binary `:format` -> that format string
  #   * `pdf` is a map                           -> "PDF"
  #   * `html` is a map                          -> "HTML"
  #   * otherwise                                -> `inspect/1` of the value
  #
  # Field access uses dot syntax, so a map lacking one of these keys raises
  # `KeyError`. No test in this module currently calls this helper.
  defp alef_e2e_format_to_string(value) when is_binary(value), do: value

  defp alef_e2e_format_to_string(metadata) do
    case metadata.image do
      %{format: fmt} when is_binary(fmt) ->
        fmt

      _ ->
        case metadata.pdf do
          %{} ->
            "PDF"

          _ ->
            case metadata.html do
              %{} -> "HTML"
              _ -> inspect(metadata)
            end
        end
    end
  end

  # Returns true when `content` holds at least `min` units: bytes for a
  # binary, elements for a list. Any other shape yields false so the calling
  # `assert` fails cleanly; the previous inline expression fell through to
  # `String.length/1` on a non-binary, which can only raise.
  defp content_min_size?(content, min) when is_binary(content), do: byte_size(content) >= min
  defp content_min_size?(content, min) when is_list(content), do: length(content) >= min
  defp content_min_size?(_content, _min), do: false

  describe "api_batch_bytes_async" do
    test "api_batch_bytes_async" do
      {:ok, result} = Kreuzberg.extract_file_async(@fake_memo_pdf)

      assert String.trim(result.mime_type) == "application/pdf"
      assert content_min_size?(result.content, 10)

      assert Enum.any?(["May 5, 2023", "Mallori"], fn v ->
               String.contains?(to_string(result.content), v)
             end)
    end
  end

  describe "api_batch_bytes_with_configs_async" do
    test "api_batch_bytes_with_configs_async" do
      {:ok, result} = Kreuzberg.extract_file_async(@fake_memo_pdf, config: @markdown_config)

      assert String.trim(result.mime_type) == "application/pdf"
      assert content_min_size?(result.content, 10)
      # skipped: field 'metadata.output_format' not available on result type
    end
  end

  describe "api_batch_file_async" do
    test "api_batch_file_async" do
      {:ok, result} = Kreuzberg.extract_file_async(@fake_memo_pdf)

      assert String.trim(result.mime_type) == "application/pdf"
      assert content_min_size?(result.content, 10)

      assert Enum.any?(["May 5, 2023", "Mallori"], fn v ->
               String.contains?(to_string(result.content), v)
             end)
    end
  end

  describe "api_batch_file_with_configs_async" do
    test "api_batch_file_with_configs_async" do
      {:ok, result} = Kreuzberg.extract_file_async(@fake_memo_pdf, config: @markdown_config)

      assert String.trim(result.mime_type) == "application/pdf"
      assert content_min_size?(result.content, 10)
      # skipped: field 'metadata.output_format' not available on result type
    end
  end

  describe "api_extract_bytes_async" do
    test "api_extract_bytes_async" do
      {:ok, result} = Kreuzberg.extract_file_async(@fake_memo_pdf)

      assert String.trim(result.mime_type) == "application/pdf"
      assert content_min_size?(result.content, 10)

      assert Enum.any?(["May 5, 2023", "Mallori"], fn v ->
               String.contains?(to_string(result.content), v)
             end)
    end
  end

  describe "api_extract_file_async" do
    test "api_extract_file_async" do
      {:ok, result} = Kreuzberg.extract_file_async(@fake_memo_pdf)

      assert String.trim(result.mime_type) == "application/pdf"
      assert content_min_size?(result.content, 10)

      assert Enum.any?(["May 5, 2023", "Mallori"], fn v ->
               String.contains?(to_string(result.content), v)
             end)
    end
  end

  describe "config_chunking_prepend_heading_context" do
    test "config_chunking_prepend_heading_context" do
      {:ok, result} =
        Kreuzberg.extract_file_sync("../../test_documents/markdown/extraction_test.md",
          config:
            "{\"chunking\":{\"chunker_type\":\"markdown\",\"max_chars\":300,\"max_overlap\":50,\"prepend_heading_context\":true}}"
        )

      assert content_min_size?(result.content, 10)

      # skipped: field 'chunks' not available on result type
      # NOTE(review): despite the skip marker above, the assertions below do
      # read `result.chunks` — confirm the field exists on the result type.
      chunks = result.chunks || []

      assert Enum.all?(chunks, fn c -> c.content != nil and c.content != "" end)

      assert Enum.all?(chunks, fn c ->
               c.metadata != nil and c.metadata.heading_context != nil
             end)

      # With heading context prepended, the first chunk is expected to open
      # with a markdown heading marker; an empty chunk list fails here.
      first_chunk_starts_with_heading =
        case List.first(chunks) do
          chunk when is_map(chunk) ->
            (chunk.content || "")
            |> String.trim_leading()
            |> String.starts_with?("#")

          _ ->
            false
        end

      assert first_chunk_starts_with_heading
    end
  end

  describe "config_document_structure_with_headings" do
    test "config_document_structure_with_headings" do
      {:ok, result} =
        Kreuzberg.extract_file_sync("../../test_documents/docx/fake.docx",
          config: "{\"include_document_structure\":true}"
        )

      assert String.trim(result.mime_type) ==
               "application/vnd.openxmlformats-officedocument.wordprocessingml.document"

      # skipped: field 'document' not available on result type
      # skipped: field 'document.nodes' not available on result type
    end
  end

  describe "config_element_types" do
    test "config_element_types" do
      {:ok, result} =
        Kreuzberg.extract_file_sync("../../test_documents/docx/unit_test_headers.docx",
          config: "{\"result_format\":\"element_based\"}"
        )

      assert Enum.any?(
               ["application/vnd.openxmlformats-officedocument.wordprocessingml.document"],
               fn v -> String.contains?(to_string(result.mime_type), v) end
             )

      # skipped: field 'elements' not available on result type
    end
  end

  describe "config_extraction_timeout" do
    test "config_extraction_timeout" do
      {:ok, result} =
        Kreuzberg.extract_file_sync(@fake_memo_pdf,
          config: "{\"extraction_timeout_secs\":300}"
        )

      assert String.trim(result.mime_type) == "application/pdf"
      assert content_min_size?(result.content, 10)
    end
  end

  describe "config_keywords" do
    test "config_keywords" do
      {:ok, result} =
        Kreuzberg.extract_file_sync(@fake_memo_pdf,
          config: "{\"keywords\":{\"algorithm\":\"yake\",\"max_keywords\":10}}"
        )

      assert String.trim(result.mime_type) == "application/pdf"
      assert content_min_size?(result.content, 10)
      # skipped: field 'keywords' not available on Elixir ExtractionResult
      # skipped: field 'keywords' not available on Elixir ExtractionResult
    end
  end

  describe "config_pages" do
    test "config_pages" do
      {:ok, result} =
        Kreuzberg.extract_file_sync(@fake_memo_pdf,
          config: "{\"pages\":{\"extract_pages\":true,\"insert_page_markers\":true}}"
        )

      assert String.trim(result.mime_type) == "application/pdf"
      assert content_min_size?(result.content, 10)

      assert Enum.any?(["PAGE"], fn v ->
               String.contains?(to_string(result.content), v)
             end)
    end
  end

  describe "config_quality_enabled" do
    test "config_quality_enabled" do
      {:ok, result} =
        Kreuzberg.extract_file_sync(@fake_memo_pdf,
          config: "{\"enable_quality_processing\":true}"
        )

      assert String.trim(result.mime_type) == "application/pdf"
      assert content_min_size?(result.content, 10)
      # skipped: field 'quality_score' not available on result type
      # skipped: field 'quality_score' not available on result type
      # skipped: field 'quality_score' not available on result type
    end
  end

  describe "config_security_limits" do
    test "config_security_limits" do
      {:ok, result} =
        Kreuzberg.extract_file_sync("../../test_documents/archives/documents.zip",
          config:
            "{\"security_limits\":{\"max_archive_size\":104857600,\"max_compression_ratio\":50,\"max_files_in_archive\":100}}"
        )

      assert Enum.any?(["application/zip", "application/x-zip-compressed"], fn v ->
               String.contains?(to_string(result.mime_type), v)
             end)

      assert content_min_size?(result.content, 10)
    end
  end

  describe "config_tree_sitter" do
    test "config_tree_sitter" do
      {:ok, result} =
        Kreuzberg.extract_file_sync("../../test_documents/code/hello.py",
          config:
            "{\"tree_sitter\":{\"groups\":[\"web\"],\"languages\":[\"python\",\"rust\"],\"process\":{\"comments\":false,\"diagnostics\":false,\"docstrings\":false,\"exports\":true,\"imports\":true,\"structure\":true,\"symbols\":false}}}"
        )

      assert String.trim(result.mime_type) == "text/x-source-code"
      assert content_min_size?(result.content, 5)
    end
  end

  describe "output_format_bytes_markdown" do
    test "output_format_bytes_markdown" do
      content = File.read!(@fake_memo_pdf)

      {:ok, result} = Kreuzberg.extract_bytes_sync(content, "application/pdf", @markdown_config)

      assert String.trim(result.mime_type) == "application/pdf"
      assert content_min_size?(result.content, 10)
      # skipped: field 'metadata.output_format' not available on result type
    end
  end

  describe "output_format_markdown" do
    test "output_format_markdown" do
      {:ok, result} = Kreuzberg.extract_file_sync(@fake_memo_pdf, config: @markdown_config)

      assert String.trim(result.mime_type) == "application/pdf"
      assert content_min_size?(result.content, 10)
      # skipped: field 'metadata.output_format' not available on result type
    end
  end
end