From d8b4e518b3fb255bc5b50d95abf4a1e1095fa0a2 Mon Sep 17 00:00:00 2001 From: Alex Ott Date: Sun, 12 Apr 2026 11:18:45 +0200 Subject: [PATCH 1/2] Improvements for PR #100 - ignore_metadata for assert_approx_df_equality PR #100 started to add support for `ignore_metadata` in `assert_approx_df_equality`, but most of the work was already merged in #182. This PR adds the missing piece in the `assert_approx_df_equality` implementation. --- chispa/dataframe_comparer.py | 3 ++- tests/test_dataframe_comparer.py | 13 +++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/chispa/dataframe_comparer.py b/chispa/dataframe_comparer.py index 6b5f465..d39b705 100644 --- a/chispa/dataframe_comparer.py +++ b/chispa/dataframe_comparer.py @@ -125,6 +125,7 @@ def assert_approx_df_equality( ignore_column_order: bool = False, ignore_row_order: bool = False, ignore_columns: list[str] | None = None, + ignore_metadata: bool = False, formats: FormattingConfig | None = None, ) -> None: if not formats: @@ -144,7 +145,7 @@ def assert_approx_df_equality( df1 = reduce(lambda acc, fn: fn(acc), transforms, df1) df2 = reduce(lambda acc, fn: fn(acc), transforms, df2) - assert_schema_equality(df1.schema, df2.schema, ignore_nullable) + assert_schema_equality(df1.schema, df2.schema, ignore_nullable, ignore_metadata) if precision != 0: assert_generic_rows_equality( diff --git a/tests/test_dataframe_comparer.py b/tests/test_dataframe_comparer.py index 4ff9143..c66b098 100644 --- a/tests/test_dataframe_comparer.py +++ b/tests/test_dataframe_comparer.py @@ -287,6 +287,19 @@ def it_throws_when_dfs_are_not_same_with_ignored_columns(spark: SparkSession): with pytest.raises(DataFramesNotEqualError): assert assert_approx_df_equality(df1, df2, 0.1, ignore_columns=["name"]) + def it_can_ignore_metadata(spark: SparkSession): + schema1 = StructType([ + StructField("num", IntegerType(), True, {"comment": "a"}), + StructField("name", StringType(), True), + ]) + schema2 = StructType([
StructField("num", IntegerType(), True, {"comment": "b"}), + StructField("name", StringType(), True), + ]) + df1 = spark.createDataFrame([(1, "jose"), (2, "li")], schema=schema1) + df2 = spark.createDataFrame([(1, "jose"), (2, "li")], schema=schema2) + assert_approx_df_equality(df1, df2, 0.1, ignore_metadata=True) + def it_does_not_throw_with_struct_columns_and_ignore_row_order(spark: SparkSession): data1 = [((1.0, "jose"),), ((1.1, "li"),)] df1 = spark.createDataFrame(data1, ["person"]) From daabb8aada9db3d5e2c1c085a61ef34b27a7bd47 Mon Sep 17 00:00:00 2001 From: Alex Ott Date: Sun, 12 Apr 2026 16:42:12 +0200 Subject: [PATCH 2/2] unify order of arguments with `assert_df_equality` --- chispa/dataframe_comparer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/chispa/dataframe_comparer.py b/chispa/dataframe_comparer.py index d39b705..2105343 100644 --- a/chispa/dataframe_comparer.py +++ b/chispa/dataframe_comparer.py @@ -124,8 +124,8 @@ def assert_approx_df_equality( allow_nan_equality: bool = False, ignore_column_order: bool = False, ignore_row_order: bool = False, - ignore_columns: list[str] | None = None, ignore_metadata: bool = False, + ignore_columns: list[str] | None = None, formats: FormattingConfig | None = None, ) -> None: if not formats: