Add an `ignore_metadata` flag to `assert_approx_df_equality`.

The flag defaults to False and is forwarded to `assert_schema_equality` as its
fourth positional argument. A new test builds two DataFrames whose "num" field
differs only in its metadata ({"comment": "a"} vs {"comment": "b"}) and calls
`assert_approx_df_equality(..., ignore_metadata=True)` on them.

NOTE(review): the leading indentation of the context/added lines below was
reconstructed from a whitespace-collapsed copy of this patch; confirm it
matches the target files before applying.
NOTE(review): `ignore_metadata` is inserted before `ignore_columns` and
`formats`; callers passing those two positionally would shift. Confirm this
ordering is intended.

diff --git a/chispa/dataframe_comparer.py b/chispa/dataframe_comparer.py
index 6b5f465..2105343 100644
--- a/chispa/dataframe_comparer.py
+++ b/chispa/dataframe_comparer.py
@@ -124,6 +124,7 @@ def assert_approx_df_equality(
     allow_nan_equality: bool = False,
     ignore_column_order: bool = False,
     ignore_row_order: bool = False,
+    ignore_metadata: bool = False,
     ignore_columns: list[str] | None = None,
     formats: FormattingConfig | None = None,
 ) -> None:
@@ -144,7 +145,7 @@ def assert_approx_df_equality(
     df1 = reduce(lambda acc, fn: fn(acc), transforms, df1)
     df2 = reduce(lambda acc, fn: fn(acc), transforms, df2)
 
-    assert_schema_equality(df1.schema, df2.schema, ignore_nullable)
+    assert_schema_equality(df1.schema, df2.schema, ignore_nullable, ignore_metadata)
 
     if precision != 0:
         assert_generic_rows_equality(
diff --git a/tests/test_dataframe_comparer.py b/tests/test_dataframe_comparer.py
index 4ff9143..c66b098 100644
--- a/tests/test_dataframe_comparer.py
+++ b/tests/test_dataframe_comparer.py
@@ -287,6 +287,19 @@ def it_throws_when_dfs_are_not_same_with_ignored_columns(spark: SparkSession):
         with pytest.raises(DataFramesNotEqualError):
             assert assert_approx_df_equality(df1, df2, 0.1, ignore_columns=["name"])
 
+    def it_can_ignore_metadata(spark: SparkSession):
+        schema1 = StructType([
+            StructField("num", IntegerType(), True, {"comment": "a"}),
+            StructField("name", StringType(), True),
+        ])
+        schema2 = StructType([
+            StructField("num", IntegerType(), True, {"comment": "b"}),
+            StructField("name", StringType(), True),
+        ])
+        df1 = spark.createDataFrame([(1, "jose"), (2, "li")], schema=schema1)
+        df2 = spark.createDataFrame([(1, "jose"), (2, "li")], schema=schema2)
+        assert_approx_df_equality(df1, df2, 0.1, ignore_metadata=True)
+
     def it_does_not_throw_with_struct_columns_and_ignore_row_order(spark: SparkSession):
         data1 = [((1.0, "jose"),), ((1.1, "li"),)]
         df1 = spark.createDataFrame(data1, ["person"])