Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion chispa/dataframe_comparer.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ def assert_approx_df_equality(
ignore_column_order: bool = False,
ignore_row_order: bool = False,
ignore_columns: list[str] | None = None,
ignore_metadata: bool = False,
formats: FormattingConfig | None = None,
) -> None:
Comment on lines 118 to 130
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed by moving `ignore_metadata` after the existing positional tail in both DataFrame assertion functions and in the `Chispa.assert_df_equality` wrapper. Added regression coverage for legacy calls that pass `formats` positionally, so the old call order stays compatible.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure about this: the change makes it compatible with https://github.com/MrPowers/chispa/blob/main/chispa/dataframe_comparer.py#L69 and https://github.com/MrPowers/chispa/blob/main/chispa/__init__.py#L43, although we still need to unify the parameter order so that `ignore_metadata` comes before `ignore_columns`.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would say backward compatibility is the first priority. In theory it may be a good idea to prepare something like a 1.0 release, but given the lack of maintenance I'm against it for now.

if not formats:
Expand All @@ -144,7 +145,7 @@ def assert_approx_df_equality(
df1 = reduce(lambda acc, fn: fn(acc), transforms, df1)
df2 = reduce(lambda acc, fn: fn(acc), transforms, df2)

assert_schema_equality(df1.schema, df2.schema, ignore_nullable)
assert_schema_equality(df1.schema, df2.schema, ignore_nullable, ignore_metadata)

if precision != 0:
assert_generic_rows_equality(
Expand Down
13 changes: 13 additions & 0 deletions tests/test_dataframe_comparer.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,19 @@ def it_throws_when_dfs_are_not_same_with_ignored_columns(spark: SparkSession):
with pytest.raises(DataFramesNotEqualError):
assert assert_approx_df_equality(df1, df2, 0.1, ignore_columns=["name"])

def it_can_ignore_metadata(spark: SparkSession):
    """Check that ``ignore_metadata=True`` tolerates differing field metadata.

    Two DataFrames are built from the same rows; their schemas differ only
    in the ``comment`` metadata attached to the ``num`` field. The approximate
    comparison is expected to complete without raising.
    """

    def frame_with_comment(text):
        # Only the metadata on "num" varies between the two frames.
        schema = StructType([
            StructField("num", IntegerType(), True, {"comment": text}),
            StructField("name", StringType(), True),
        ])
        return spark.createDataFrame([(1, "jose"), (2, "li")], schema=schema)

    left = frame_with_comment("a")
    right = frame_with_comment("b")

    assert_approx_df_equality(left, right, 0.1, ignore_metadata=True)

def it_does_not_throw_with_struct_columns_and_ignore_row_order(spark: SparkSession):
data1 = [((1.0, "jose"),), ((1.1, "li"),)]
df1 = spark.createDataFrame(data1, ["person"])
Expand Down
Loading