diff --git a/test/data/s3/conftest.py b/test/data/s3/conftest.py index 43c00c26466..f4e6123c624 100644 --- a/test/data/s3/conftest.py +++ b/test/data/s3/conftest.py @@ -9,11 +9,13 @@ S3ROOT.rstrip("/"), S3ROOT if S3ROOT.endswith("/") else S3ROOT + "/", ] + S3ROOT_IDS = ["no_slash", "with_slash"] else: S3ROOT_VARIANTS = [None] + S3ROOT_IDS = ["no_s3root"] -@pytest.fixture(params=S3ROOT_VARIANTS, ids=["no_slash", "with_slash"]) +@pytest.fixture(params=S3ROOT_VARIANTS, ids=S3ROOT_IDS) def s3root(request): """ Fixture that provides S3ROOT with and without trailing slash. @@ -30,9 +32,8 @@ def reset_current_env(): Fixture to ensure the metaflow current environment is clean between tests. This prevents test pollution when tests manipulate the global current state. - The fixture runs automatically for all tests in this directory. """ - # Setup: Save all private attributes that might be set by current._set_env + # Setup: Capture internal state saved_state = { attr: getattr(current, attr, None) for attr in dir(current) @@ -41,27 +42,29 @@ def reset_current_env(): yield - # Teardown: Clear all current environment attributes - # First, remove any new attributes that were added - for attr in dir(current): - if ( - attr.startswith("_") - and not attr.startswith("__") - and attr not in saved_state - ): + # Teardown: Restore original attributes and clean up new ones + current_attrs = [ + attr + for attr in dir(current) + if attr.startswith("_") and not attr.startswith("__") + ] + + # 1. Clean up attributes created during the test + for attr in current_attrs: + if attr not in saved_state: try: delattr(current, attr) except AttributeError: - pass # Some attributes may be read-only + pass - # Then restore original values + # 2. Restore all original attributes (re-creating any that were deleted) for attr, value in saved_state.items(): try: if value is None: - # Remove attribute if it didn't exist before + # Remove if it was strictly None/non-existent before if hasattr(current, attr): delattr(current, attr) else: setattr(current, attr, value) - except AttributeError: - pass # Some attributes may be read-only + except (AttributeError, TypeError): + pass diff --git a/test/data/s3/s3_data.py b/test/data/s3/s3_data.py index e7869bdf9e2..2edb06984bc 100644 --- a/test/data/s3/s3_data.py +++ b/test/data/s3/s3_data.py @@ -499,7 +499,9 @@ def pytest_put_blobs_case(meta=None): def ensure_test_data(): - # update S3ROOT in __init__.py to get a fresh set of data + if S3ROOT is None: + print("S3ROOT is None. Skipping test data generation.") + return print("Ensuring that test data exists at %s" % S3ROOT) mark = urlparse(os.path.join(S3ROOT, "ALL_OK")) try: diff --git a/test/data/s3/test_s3.py b/test/data/s3/test_s3.py index c14823e63eb..b617ebf0cd3 100644 --- a/test/data/s3/test_s3.py +++ b/test/data/s3/test_s3.py @@ -1,12 +1,10 @@ import os -from re import I import shutil from hashlib import sha1 from tempfile import mkdtemp from itertools import groupby, starmap import random from uuid import uuid4 -from metaflow.plugins.datatools import s3 import pytest @@ -30,10 +28,32 @@ try: # python2 from urlparse import urlparse -except: +except ImportError: # python3 from urllib.parse import urlparse +INJECT_FAILURE_RATES = [0, 10, 50, 90] +INJECT_FAILURE_RATE_IDS = [ + "fail_rate_0", + "fail_rate_10", + "fail_rate_50", + "fail_rate_90", +] + + +@pytest.fixture +def tempdir(): + tmpdir = mkdtemp(dir=".", prefix="metaflow.test.tmp") + yield tmpdir + shutil.rmtree(tmpdir) + + +@pytest.fixture +def s3_server_side_encryption(): + if os.environ.get("MINIO_TEST", "0") != "0": + return None + return "AES256" + def s3_get_object_from_url_range(url, range_info): if range_info is None: @@ -173,13 +193,6 @@ def deranged_shuffle(objs): return shuffled_objs -@pytest.fixture -def tempdir(): - tmpdir = mkdtemp(dir=".", prefix="metaflow.test.tmp") - yield tmpdir - shutil.rmtree(tmpdir) - - @pytest.mark.parametrize( argnames=["pathspecs", "expected"], **s3_data.pytest_benchmark_case() ) @@ -198,7 +211,9 @@ def _do(): assert_results(res, expected_urls, info_only=True) -@pytest.mark.parametrize("inject_failure_rate", [0, 10, 50, 90]) +@pytest.mark.parametrize( + "inject_failure_rate", INJECT_FAILURE_RATES, ids=INJECT_FAILURE_RATE_IDS +) @pytest.mark.parametrize( argnames=["pathspecs", "expected"], **s3_data.pytest_benchmark_many_case() ) @@ -239,12 +254,11 @@ def _do(): return res res = benchmark(_do) - # We do not actually check results because the files will be cleared - # Could be improved if we want to be real precise - # assert_results(res, expected_urls, info_should_be_empty=True) -@pytest.mark.parametrize("inject_failure_rate", [0, 10, 50, 90]) +@pytest.mark.parametrize( + "inject_failure_rate", INJECT_FAILURE_RATES, ids=INJECT_FAILURE_RATE_IDS +) @pytest.mark.parametrize( argnames=["pathspecs", "expected"], **s3_data.pytest_benchmark_many_case() ) @@ -267,14 +281,13 @@ def _do(): return res res = benchmark(_do) - # assert_results(res, check_expected, info_should_be_empty=True) @pytest.mark.parametrize( argnames=["blobs", "expected"], **s3_data.pytest_benchmark_put_case() ) @pytest.mark.benchmark(max_time=60) -def test_put_one_benchmark(benchmark, tempdir, blobs, expected): +def test_put_one_benchmark(benchmark, tempdir, s3root, blobs, expected): # We generate the files here to avoid having them saved in the benchmark # result file which then prevents comparisons def _generate_files(blobs): @@ -301,12 +314,16 @@ def _do(): res = benchmark(_do) -@pytest.mark.parametrize("inject_failure_rate", [0, 10, 50, 90]) +@pytest.mark.parametrize( + "inject_failure_rate", INJECT_FAILURE_RATES, ids=INJECT_FAILURE_RATE_IDS +) @pytest.mark.parametrize( argnames=["blobs", "expected"], **s3_data.pytest_benchmark_put_many_case() ) @pytest.mark.benchmark(max_time=60) -def test_put_many_benchmark(benchmark, tempdir, inject_failure_rate, blobs, expected): +def test_put_many_benchmark( + benchmark, tempdir, s3root, inject_failure_rate, blobs, expected +): def _generate_files(blobs): generated_paths = {} for blob in blobs: @@ -336,7 +353,9 @@ def _do(): res = benchmark(_do) -@pytest.mark.parametrize("inject_failure_rate", [0, 10, 50, 90]) +@pytest.mark.parametrize( + "inject_failure_rate", INJECT_FAILURE_RATES, ids=INJECT_FAILURE_RATE_IDS +) @pytest.mark.parametrize( argnames=["pathspecs", "expected"], **s3_data.pytest_fakerun_cases() ) @@ -450,7 +469,9 @@ def test_info_one(s3root, prefixes, expected): assert_results([s3obj], {url: expected_urls[url]}, info_only=True) -@pytest.mark.parametrize("inject_failure_rate", [0, 10, 50, 90]) +@pytest.mark.parametrize( + "inject_failure_rate", INJECT_FAILURE_RATES, ids=INJECT_FAILURE_RATE_IDS +) @pytest.mark.parametrize( argnames=["prefixes", "expected"], **s3_data.pytest_basic_case() ) @@ -489,7 +510,9 @@ def test_info_many(s3root, inject_failure_rate, prefixes, expected): assert_results(s3objs, expected_urls, info_only=True) -@pytest.mark.parametrize("inject_failure_rate", [0, 10, 50, 90]) +@pytest.mark.parametrize( + "inject_failure_rate", INJECT_FAILURE_RATES, ids=INJECT_FAILURE_RATE_IDS +) @pytest.mark.parametrize( argnames=["prefixes", "expected"], **s3_data.pytest_fakerun_cases() ) @@ -583,7 +606,9 @@ def test_get_one_wo_meta(s3root, prefixes, expected): ) -@pytest.mark.parametrize("inject_failure_rate", [0, 10, 50, 90]) +@pytest.mark.parametrize( + "inject_failure_rate", INJECT_FAILURE_RATES, ids=INJECT_FAILURE_RATE_IDS +) @pytest.mark.parametrize( argnames=["prefixes", "expected"], **s3_data.pytest_large_case() ) @@ -602,7 +627,9 @@ def test_get_all(inject_failure_rate, s3root, prefixes, expected): assert_results(s3objs, expected_exists, info_should_be_empty=True) -@pytest.mark.parametrize("inject_failure_rate", [0, 10, 50, 90]) +@pytest.mark.parametrize( + "inject_failure_rate", INJECT_FAILURE_RATES, ids=INJECT_FAILURE_RATE_IDS +) @pytest.mark.parametrize( argnames=["prefixes", "expected"], **s3_data.pytest_basic_case() ) @@ -621,7 +648,9 @@ def test_get_all_with_meta(inject_failure_rate, s3root, prefixes, expected): assert_results(s3objs, expected_exists) -@pytest.mark.parametrize("inject_failure_rate", [0, 10, 50, 90]) +@pytest.mark.parametrize( + "inject_failure_rate", INJECT_FAILURE_RATES, ids=INJECT_FAILURE_RATE_IDS +) @pytest.mark.parametrize( argnames=["prefixes", "expected"], **s3_data.pytest_basic_case() ) @@ -795,7 +824,9 @@ def test_list_recursive(s3root, prefixes, expected): assert all(e.exists for e in s3objs) -@pytest.mark.parametrize("inject_failure_rate", [0, 10, 50, 90]) +@pytest.mark.parametrize( + "inject_failure_rate", INJECT_FAILURE_RATES, ids=INJECT_FAILURE_RATE_IDS +) @pytest.mark.parametrize( argnames=["prefixes", "expected"], **s3_data.pytest_many_prefixes_case() ) @@ -845,7 +876,9 @@ def test_get_recursive(s3root, inject_failure_rate, prefixes, expected): assert not os.path.exists(path) -@pytest.mark.parametrize("inject_failure_rate", [0, 10, 50, 90]) +@pytest.mark.parametrize( + "inject_failure_rate", INJECT_FAILURE_RATES, ids=INJECT_FAILURE_RATE_IDS +) def test_put_exceptions(inject_failure_rate): with S3(inject_failure_rate=inject_failure_rate) as s3: with pytest.raises(MetaflowS3InvalidObject): @@ -858,14 +891,7 @@ def test_put_exceptions(inject_failure_rate): s3.put_many([("foo", "bar")]) -@pytest.fixture -def s3_server_side_encryption(): - if os.environ.get("MINIO_TEST", "0") != "0": - return None - return "AES256" - - -@pytest.mark.parametrize("inject_failure_rate", [0]) +@pytest.mark.parametrize("inject_failure_rate", [0], ids=["fail_rate_0"]) @pytest.mark.parametrize( argnames=["objs", "expected"], **s3_data.pytest_put_strings_case() ) @@ -934,7 +960,9 @@ def test_put_one(s3root, objs, expected, s3_server_side_encryption): assert s3obj.blob == to_bytes(obj) -@pytest.mark.parametrize("inject_failure_rate", [0, 10, 50, 90]) +@pytest.mark.parametrize( + "inject_failure_rate", INJECT_FAILURE_RATES, ids=INJECT_FAILURE_RATE_IDS +) @pytest.mark.parametrize( argnames=["blobs", "expected"], **s3_data.pytest_put_blobs_case() ) @@ -1003,7 +1031,9 @@ def _files(blobs): assert {s3obj.key for s3obj in s3objs} == {key for key, _ in shuffled_blobs} -@pytest.mark.parametrize("inject_failure_rate", [0, 10, 50, 90]) +@pytest.mark.parametrize( + "inject_failure_rate", INJECT_FAILURE_RATES, ids=INJECT_FAILURE_RATE_IDS +) def test_list_recursive_sibling_prefix_filtering(s3root, inject_failure_rate): test_prefix = f"test_log_filtering_{uuid4().hex[:8]}" diff --git a/test/data/s3/test_s3op.py b/test/data/s3/test_s3op.py index 859a0ed8937..e09d1d37583 100644 --- a/test/data/s3/test_s3op.py +++ b/test/data/s3/test_s3op.py @@ -22,9 +22,9 @@ def test_convert_to_client_error(): assert client_error.operation_name == "CompleteMultipartUpload" -def test_generate_local_path_length_limits(): - """Test that generate_local_path produces filenames under 255 characters.""" - test_cases = [ +@pytest.mark.parametrize( + "url, range_val, suffix", + [ ("s3://bucket/file.txt", "whole", None), ("s3://bucket/日本語ファイル名.txt", "whole", None), ("s3://bucket/" + "日本語" * 50 + ".txt", "whole", None), @@ -33,11 +33,22 @@ def test_generate_local_path_length_limits(): ("s3://bucket/file.txt", "bytes=0-1000", None), ("s3://bucket/file.txt", "whole", "info"), ("s3://bucket/" + "x" * 300 + ".txt", "bytes=0-9999", "meta"), - ] - - for url, range_val, suffix in test_cases: - local_path = generate_local_path(url, range=range_val, suffix=suffix) - assert len(local_path) <= 255 + ], + ids=[ + "standard_url", + "japanese_short", + "japanese_long", + "chinese_long", + "ascii_long", + "with_byte_range", + "with_info_suffix", + "long_with_range_and_suffix", + ], +) +def test_generate_local_path_length_limits(url, range_val, suffix): + """Test that generate_local_path produces filenames under 255 characters.""" + local_path = generate_local_path(url, range=range_val, suffix=suffix) + assert len(local_path) <= 255 def test_generate_local_path_uniqueness(): @@ -58,12 +69,15 @@ def test_generate_local_path_uniqueness(): assert len(hashes) == len(set(hashes)) -def test_generate_local_path_truncation_indicator(): +def test_generate_local_path_adds_indicator_for_long_urls(): """Test that truncated filenames have '...' indicator.""" long_url = "s3://bucket/" + "日本語ファイル名" * 30 + ".txt" local_path = generate_local_path(long_url) assert "..." in local_path + +def test_generate_local_path_no_indicator_for_short_urls(): + """Test that short filenames do not get a truncation indicator.""" short_url = "s3://bucket/short.txt" short_local_path = generate_local_path(short_url) assert "..." not in short_local_path @@ -139,20 +153,16 @@ def untruncated_generate_local_path(url): test_content = b"Test data for long filename handling" - try: - with S3(s3root=s3_prefix) as s3: - s3.put(problematic_filename, test_content, overwrite=True) - - with S3(s3root=s3_prefix) as s3: - objs = s3.get_many([problematic_filename]) - assert len(objs) == 1 - obj = objs[0] - assert obj.blob == test_content - assert obj.key == problematic_filename - - with S3(s3root=s3_prefix) as s3: - obj = s3.get(problematic_filename) - assert obj.blob == test_content - - finally: - pass + with S3(s3root=s3_prefix) as s3: + s3.put(problematic_filename, test_content, overwrite=True) + + with S3(s3root=s3_prefix) as s3: + objs = s3.get_many([problematic_filename]) + assert len(objs) == 1 + obj = objs[0] + assert obj.blob == test_content + assert obj.key == problematic_filename + + with S3(s3root=s3_prefix) as s3: + obj = s3.get(problematic_filename) + assert obj.blob == test_content