From 438724324e2fcb7680d5987fe89f6e27373e9292 Mon Sep 17 00:00:00 2001 From: comphead Date: Mon, 8 Jun 2026 18:15:24 -0700 Subject: [PATCH 1/5] feat: Support IEEE 754 for SQL ops --- datafusion/common/src/hash_utils.rs | 10 +- datafusion/common/src/scalar/mod.rs | 45 +++- datafusion/common/src/utils/mod.rs | 58 +++++ datafusion/functions-nested/src/except.rs | 15 +- datafusion/functions-nested/src/set_ops.rs | 31 ++- datafusion/physical-expr-common/src/datum.rs | 18 +- .../group_values/multi_group_by/primitive.rs | 25 ++- .../src/aggregates/group_values/row.rs | 8 + .../group_values/single_group_by/primitive.rs | 33 ++- datafusion/physical-plan/src/joins/utils.rs | 12 +- .../test_files/array/array_distinct.slt | 42 ++++ .../test_files/array/array_except.slt | 48 ++++ .../test_files/array/array_union.slt | 60 +++++ .../sqllogictest/test_files/negative_zero.slt | 206 ++++++++++++++++++ 14 files changed, 574 insertions(+), 37 deletions(-) create mode 100644 datafusion/sqllogictest/test_files/negative_zero.slt diff --git a/datafusion/common/src/hash_utils.rs b/datafusion/common/src/hash_utils.rs index fcc2e919b6cc2..02db75498af49 100644 --- a/datafusion/common/src/hash_utils.rs +++ b/datafusion/common/src/hash_utils.rs @@ -188,10 +188,16 @@ macro_rules! hash_float_value { ($(($t:ty, $i:ty)),+) => { $(impl HashValue for $t { fn hash_one(&self, state: &RandomState) -> u64 { - state.hash_one(<$i>::from_ne_bytes(self.to_ne_bytes())) + // +0.0 and -0.0 differ only in the sign bit but compare equal + // under IEEE 754; normalize -0.0 → +0.0 so Hash agrees with Eq. + let bits = <$i>::from_ne_bytes(self.to_ne_bytes()); + let bits = if bits << 1 == 0 { 0 } else { bits }; + state.hash_one(bits) } fn hash_write(&self, hasher: &mut impl Hasher) { - hasher.write(&self.to_ne_bytes()) + let bits = <$i>::from_ne_bytes(self.to_ne_bytes()); + let bits: $i = if bits << 1 == 0 { 0 } else { bits }; + hasher.write(&bits.to_ne_bytes()) } })+ }; diff --git a/datafusion/common/src/scalar/mod.rs b/datafusion/common/src/scalar/mod.rs index c9013af72619c..c94e5170f92b8 100644 --- a/datafusion/common/src/scalar/mod.rs +++ b/datafusion/common/src/scalar/mod.rs @@ -469,7 +469,11 @@ pub enum ScalarValue { impl Hash for Fl { fn hash(&self, state: &mut H) { - self.0.to_bits().hash(state); + let bits = self.0.to_bits(); + // +0.0 and -0.0 differ only in the sign bit but compare equal under + // IEEE 754; normalize -0.0 → +0.0 so Hash agrees with Eq. + let bits = if bits << 1 == 0 { 0 } else { bits }; + bits.hash(state); } } @@ -500,17 +504,24 @@ impl PartialEq for ScalarValue { (Boolean(v1), Boolean(v2)) => v1.eq(v2), (Boolean(_), _) => false, (Float32(v1), Float32(v2)) => match (v1, v2) { - (Some(f1), Some(f2)) => f1.to_bits() == f2.to_bits(), + (Some(f1), Some(f2)) => { + *f1 == 0.0 && *f2 == 0.0 || f1.to_bits() == f2.to_bits() + } _ => v1.eq(v2), }, (Float16(v1), Float16(v2)) => match (v1, v2) { - (Some(f1), Some(f2)) => f1.to_bits() == f2.to_bits(), + (Some(f1), Some(f2)) => { + let (b1, b2) = (f1.to_bits(), f2.to_bits()); + ((b1 << 1) == 0 && (b2 << 1) == 0) || b1 == b2 + } _ => v1.eq(v2), }, (Float32(_), _) => false, (Float16(_), _) => false, (Float64(v1), Float64(v2)) => match (v1, v2) { - (Some(f1), Some(f2)) => f1.to_bits() == f2.to_bits(), + (Some(f1), Some(f2)) => { + *f1 == 0.0 && *f2 == 0.0 || f1.to_bits() == f2.to_bits() + } _ => v1.eq(v2), }, (Float64(_), _) => false, @@ -656,17 +667,31 @@ impl PartialOrd for ScalarValue { (Boolean(v1), Boolean(v2)) => v1.partial_cmp(v2), (Boolean(_), _) => None, (Float32(v1), Float32(v2)) => match (v1, v2) { - (Some(f1), Some(f2)) => Some(f1.total_cmp(f2)), + (Some(a), Some(b)) => Some(if *a == 0.0 && *b == 0.0 { + Ordering::Equal + } else { + a.total_cmp(b) + }), _ => v1.partial_cmp(v2), }, (Float16(v1), Float16(v2)) => match (v1, v2) { - (Some(f1), Some(f2)) => Some(f1.total_cmp(f2)), + (Some(a), Some(b)) => { + Some(if a.to_bits() << 1 == 0 && b.to_bits() << 1 == 0 { + Ordering::Equal + } else { + a.total_cmp(b) + }) + } _ => v1.partial_cmp(v2), }, (Float32(_), _) => None, (Float16(_), _) => None, (Float64(v1), Float64(v2)) => match (v1, v2) { - (Some(f1), Some(f2)) => Some(f1.total_cmp(f2)), + (Some(a), Some(b)) => Some(if *a == 0.0 && *b == 0.0 { + Ordering::Equal + } else { + a.total_cmp(b) + }), _ => v1.partial_cmp(v2), }, (Float64(_), _) => None, @@ -941,7 +966,11 @@ macro_rules! hash_float_value { $(impl std::hash::Hash for Fl<$t> { #[inline] fn hash(&self, state: &mut H) { - state.write(&<$i>::from_ne_bytes(self.0.to_ne_bytes()).to_ne_bytes()) + let bits = <$i>::from_ne_bytes(self.0.to_ne_bytes()); + // +0.0 and -0.0 differ only in the sign bit but compare equal + // under IEEE 754; normalize -0.0 → +0.0 so Hash agrees with Eq. + let bits: $i = if bits << 1 == 0 { 0 } else { bits }; + state.write(&bits.to_ne_bytes()) } })+ }; diff --git a/datafusion/common/src/utils/mod.rs b/datafusion/common/src/utils/mod.rs index 99c5bdbc54388..f2f3e0397a31e 100644 --- a/datafusion/common/src/utils/mod.rs +++ b/datafusion/common/src/utils/mod.rs @@ -1386,6 +1386,64 @@ fn fsl_values_row_number(list_size: i32, array_len: usize) -> Result Ok(PrimitiveArray::new(rows_number.into(), None)) } +/// Replace `-0.0` with `+0.0` in any `Float16`, `Float32`, or `Float64` array. +/// For non-float arrays returns the input unchanged. NaN payloads are +/// preserved. +/// +/// Arrow's comparison kernels (`arrow::compute::kernels::cmp::eq` etc.) and +/// row-encoding (`arrow::row::RowConverter`) use IEEE 754 totalOrder +/// semantics, which treats `-0.0` and `+0.0` as distinct. SQL semantics +/// (PostgreSQL / IEEE 754 equality) require them to compare equal, so +/// callers normalize before invoking those kernels. +pub fn normalize_float_zero(array: &ArrayRef) -> ArrayRef { + use arrow::array::{Float16Array, Float32Array, Float64Array}; + use arrow::datatypes::{Float16Type, Float32Type, Float64Type}; + match array.data_type() { + DataType::Float32 => { + let arr: &Float32Array = array.as_primitive::(); + let normalized: Float32Array = + arr.unary(|v| if v.to_bits() << 1 == 0 { 0.0_f32 } else { v }); + Arc::new(normalized) + } + DataType::Float64 => { + let arr: &Float64Array = array.as_primitive::(); + let normalized: Float64Array = + arr.unary(|v| if v.to_bits() << 1 == 0 { 0.0_f64 } else { v }); + Arc::new(normalized) + } + DataType::Float16 => { + let arr: &Float16Array = array.as_primitive::(); + let normalized: Float16Array = arr.unary(|v| { + if v.to_bits() << 1 == 0 { + half::f16::from_bits(0) + } else { + v + } + }); + Arc::new(normalized) + } + _ => Arc::clone(array), + } +} + +/// Replace `-0.0` with `+0.0` in `Float16`, `Float32`, or `Float64` scalar +/// values. Other variants are returned unchanged. See [`normalize_float_zero`] +/// for context. +pub fn normalize_float_zero_scalar(scalar: ScalarValue) -> ScalarValue { + match scalar { + ScalarValue::Float32(Some(v)) if v.to_bits() << 1 == 0 => { + ScalarValue::Float32(Some(0.0)) + } + ScalarValue::Float64(Some(v)) if v.to_bits() << 1 == 0 => { + ScalarValue::Float64(Some(0.0)) + } + ScalarValue::Float16(Some(v)) if v.to_bits() << 1 == 0 => { + ScalarValue::Float16(Some(half::f16::from_bits(0))) + } + other => other, + } +} + #[cfg(test)] mod tests { use std::sync::Arc; diff --git a/datafusion/functions-nested/src/except.rs b/datafusion/functions-nested/src/except.rs index 12ed6c2e186f4..dbf815c0ec539 100644 --- a/datafusion/functions-nested/src/except.rs +++ b/datafusion/functions-nested/src/except.rs @@ -27,7 +27,7 @@ use arrow::buffer::{NullBuffer, OffsetBuffer}; use arrow::compute::take; use arrow::datatypes::{DataType, FieldRef}; use arrow::row::{RowConverter, SortField}; -use datafusion_common::utils::{ListCoercion, take_function_args}; +use datafusion_common::utils::{ListCoercion, normalize_float_zero, take_function_args}; use datafusion_common::{HashSet, Result, internal_err}; use datafusion_expr::{ ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature, @@ -169,16 +169,21 @@ fn general_except( ) -> Result> { let converter = RowConverter::new(vec![SortField::new(l.value_type())])?; + // Normalize -0.0 → +0.0 so RowConverter (IEEE 754 totalOrder) groups + // ±0 together for both the rhs lookup set and the lhs probe. + let l_values_norm = normalize_float_zero(l.values()); + let r_values_norm = normalize_float_zero(r.values()); + // Only convert the visible portion of the values array. For sliced // ListArrays, values() returns the full underlying array but only // elements between the first and last offset are referenced. let l_first = l.offsets()[0].as_usize(); let l_len = l.offsets()[l.len()].as_usize() - l_first; - let l_values = converter.convert_columns(&[l.values().slice(l_first, l_len)])?; + let l_values = converter.convert_columns(&[l_values_norm.slice(l_first, l_len)])?; let r_first = r.offsets()[0].as_usize(); let r_len = r.offsets()[r.len()].as_usize() - r_first; - let r_values = converter.convert_columns(&[r.values().slice(r_first, r_len)])?; + let r_values = converter.convert_columns(&[r_values_norm.slice(r_first, r_len)])?; let mut offsets = Vec::::with_capacity(l.len() + 1); offsets.push(OffsetSize::usize_as(0)); @@ -223,11 +228,11 @@ fn general_except( } else if OffsetSize::IS_LARGE { let indices = UInt64Array::from(indices.into_iter().map(|i| i as u64).collect::>()); - take(l.values().as_ref(), &indices, None)? + take(l_values_norm.as_ref(), &indices, None)? } else { let indices = UInt32Array::from(indices.into_iter().map(|i| i as u32).collect::>()); - take(l.values().as_ref(), &indices, None)? + take(l_values_norm.as_ref(), &indices, None)? }; Ok(GenericListArray::::new( diff --git a/datafusion/functions-nested/src/set_ops.rs b/datafusion/functions-nested/src/set_ops.rs index 2ad08e2d43c02..6ae1fc97bcaf0 100644 --- a/datafusion/functions-nested/src/set_ops.rs +++ b/datafusion/functions-nested/src/set_ops.rs @@ -28,7 +28,7 @@ use arrow::datatypes::DataType::{LargeList, List, Null}; use arrow::datatypes::{DataType, Field, FieldRef}; use arrow::row::{RowConverter, SortField}; use datafusion_common::cast::{as_large_list_array, as_list_array}; -use datafusion_common::utils::ListCoercion; +use datafusion_common::utils::{ListCoercion, normalize_float_zero}; use datafusion_common::{ Result, assert_eq_or_internal_err, exec_err, internal_err, utils::take_function_args, }; @@ -351,21 +351,27 @@ fn generic_set_lists( let converter = RowConverter::new(vec![SortField::new(l.value_type())])?; + // Normalize -0.0 → +0.0 so RowConverter (which uses IEEE 754 totalOrder + // and treats ±0 as distinct) groups them together. Use the normalized + // arrays for both row conversion and the final output values. + let l_values_norm = normalize_float_zero(l.values()); + let r_values_norm = normalize_float_zero(r.values()); + // Only convert the visible portion of the values array. For sliced // ListArrays, values() returns the full underlying array but only // elements between the first and last offset are referenced. let l_first = l.offsets()[0].as_usize(); let l_len = l.offsets()[l.len()].as_usize() - l_first; - let rows_l = converter.convert_columns(&[l.values().slice(l_first, l_len)])?; + let rows_l = converter.convert_columns(&[l_values_norm.slice(l_first, l_len)])?; let r_first = r.offsets()[0].as_usize(); let r_len = r.offsets()[r.len()].as_usize() - r_first; - let rows_r = converter.convert_columns(&[r.values().slice(r_first, r_len)])?; + let rows_r = converter.convert_columns(&[r_values_norm.slice(r_first, r_len)])?; // Combine the *sliced* value arrays so 0-based indices from the row // converter map directly into the concatenated array. - let l_values = l.values().slice(l_first, l_len); - let r_values = r.values().slice(r_first, r_len); + let l_values = l_values_norm.slice(l_first, l_len); + let r_values = r_values_norm.slice(r_first, r_len); let combined_values = concat(&[l_values.as_ref(), r_values.as_ref()])?; let r_offset = l_len; @@ -558,13 +564,18 @@ fn general_array_distinct( let converter = RowConverter::new(vec![SortField::new(dt.clone())])?; + // Normalize -0.0 → +0.0 so RowConverter (which uses IEEE 754 totalOrder + // and treats ±0 as distinct) groups them together, and so the output + // carries the canonical sign. + let values_norm = normalize_float_zero(array.values()); + // Only convert the visible portion of the values array. For sliced // ListArrays, values() returns the full underlying array but only // elements between the first and last offset are referenced. let first_offset = value_offsets[0].as_usize(); let visible_len = value_offsets[array.len()].as_usize() - first_offset; let rows = - converter.convert_columns(&[array.values().slice(first_offset, visible_len)])?; + converter.convert_columns(&[values_norm.slice(first_offset, visible_len)])?; let mut indices: Vec = Vec::with_capacity(rows.num_rows()); let mut seen = HashSet::new(); @@ -593,19 +604,19 @@ fn general_array_distinct( } // Gather distinct values in a single pass, using the computed `indices`. - // Indices are absolute positions in array.values() (first_offset was added - // back when collecting them), so we can take directly from the full values. + // Indices are absolute positions in the (normalized) values array, so we + // can take directly from the full values. // Use UInt64Array for LargeList to support values arrays exceeding u32::MAX. let final_values = if indices.is_empty() { new_empty_array(&dt) } else if OffsetSize::IS_LARGE { let indices = UInt64Array::from(indices.into_iter().map(|i| i as u64).collect::>()); - take(array.values().as_ref(), &indices, None)? + take(values_norm.as_ref(), &indices, None)? } else { let indices = UInt32Array::from(indices.into_iter().map(|i| i as u32).collect::>()); - take(array.values().as_ref(), &indices, None)? + take(values_norm.as_ref(), &indices, None)? }; Ok(Arc::new(GenericListArray::::try_new( diff --git a/datafusion/physical-expr-common/src/datum.rs b/datafusion/physical-expr-common/src/datum.rs index bd5790507f662..d23fb30db6c4a 100644 --- a/datafusion/physical-expr-common/src/datum.rs +++ b/datafusion/physical-expr-common/src/datum.rs @@ -23,6 +23,7 @@ use arrow::compute::kernels::cmp::{ }; use arrow::compute::{SortOptions, ilike, like, nilike, nlike}; use arrow::error::ArrowError; +use datafusion_common::utils::{normalize_float_zero, normalize_float_zero_scalar}; use datafusion_common::{Result, ScalarValue}; use datafusion_common::{arrow_datafusion_err, assert_or_internal_err, internal_err}; use datafusion_expr_common::columnar_value::ColumnarValue; @@ -84,7 +85,22 @@ pub fn apply_cmp( } }; - apply(lhs, rhs, |l, r| Ok(Arc::new(f(l, r)?))) + // Arrow's comparison kernels use IEEE 754 totalOrder semantics for + // floats, which treats `-0.0` and `+0.0` as distinct. Normalize float + // operands so SQL semantics (`+0.0 == -0.0`) hold. No-op for + // non-float types. + let lhs = normalize_cmp_input(lhs); + let rhs = normalize_cmp_input(rhs); + apply(&lhs, &rhs, |l, r| Ok(Arc::new(f(l, r)?))) + } +} + +fn normalize_cmp_input(cv: &ColumnarValue) -> ColumnarValue { + match cv { + ColumnarValue::Array(a) => ColumnarValue::Array(normalize_float_zero(a)), + ColumnarValue::Scalar(s) => { + ColumnarValue::Scalar(normalize_float_zero_scalar(s.clone())) + } } } diff --git a/datafusion/physical-plan/src/aggregates/group_values/multi_group_by/primitive.rs b/datafusion/physical-plan/src/aggregates/group_values/multi_group_by/primitive.rs index 1913371845772..068b849cb240f 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/multi_group_by/primitive.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/multi_group_by/primitive.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. +use crate::aggregates::group_values::HashValue; use crate::aggregates::group_values::multi_group_by::{ GroupColumn, Nulls, nulls_equal_to, }; @@ -51,6 +52,7 @@ pub struct PrimitiveGroupValueBuilder PrimitiveGroupValueBuilder where T: ArrowPrimitiveType, + T::Native: HashValue, { /// Create a new `PrimitiveGroupValueBuilder` pub fn new(data_type: DataType) -> Self { @@ -91,7 +93,9 @@ where } else { unsafe { *array_values.get_unchecked(rhs_row) } }; - if left.is_eq(right) { + // `left` was already canonicalized on append; canonicalize the + // input so ±0 (and any future equivalence class) compares equal. + if left.is_eq(right.canonicalize()) { cmp_buf[i / 8] |= 1 << (i % 8); } } @@ -133,7 +137,7 @@ where continue; } - if !self.group_values[lhs_row].is_eq(array.value(rhs_row)) { + if !self.group_values[lhs_row].is_eq(array.value(rhs_row).canonicalize()) { equal_to_results.set_bit(idx, false); } } @@ -142,6 +146,8 @@ where impl GroupColumn for PrimitiveGroupValueBuilder +where + T::Native: HashValue, { fn equal_to(&self, lhs_row: usize, array: &ArrayRef, rhs_row: usize) -> bool { // Perf: skip null check (by short circuit) if input is not nullable @@ -154,7 +160,8 @@ impl GroupColumn // Otherwise, we need to check their values } - self.group_values[lhs_row].is_eq(array.as_primitive::().value(rhs_row)) + self.group_values[lhs_row] + .is_eq(array.as_primitive::().value(rhs_row).canonicalize()) } fn append_val(&mut self, array: &ArrayRef, row: usize) -> Result<()> { @@ -165,10 +172,12 @@ impl GroupColumn self.group_values.push(T::default_value()); } else { self.nulls.append(false); - self.group_values.push(array.as_primitive::().value(row)); + self.group_values + .push(array.as_primitive::().value(row).canonicalize()); } } else { - self.group_values.push(array.as_primitive::().value(row)); + self.group_values + .push(array.as_primitive::().value(row).canonicalize()); } Ok(()) @@ -214,7 +223,7 @@ impl GroupColumn self.group_values.push(T::default_value()); } else { self.nulls.append(false); - self.group_values.push(arr.value(row)); + self.group_values.push(arr.value(row).canonicalize()); } } } @@ -222,7 +231,7 @@ impl GroupColumn (true, Nulls::None) => { self.nulls.append_n(rows.len(), false); for &row in rows { - self.group_values.push(arr.value(row)); + self.group_values.push(arr.value(row).canonicalize()); } } @@ -234,7 +243,7 @@ impl GroupColumn (false, _) => { for &row in rows { - self.group_values.push(arr.value(row)); + self.group_values.push(arr.value(row).canonicalize()); } } } diff --git a/datafusion/physical-plan/src/aggregates/group_values/row.rs b/datafusion/physical-plan/src/aggregates/group_values/row.rs index a3bd31f76c233..4976a098ecee5 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/row.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/row.rs @@ -26,6 +26,7 @@ use arrow::row::{RowConverter, Rows, SortField}; use datafusion_common::Result; use datafusion_common::hash_utils::RandomState; use datafusion_common::hash_utils::create_hashes; +use datafusion_common::utils::normalize_float_zero; use datafusion_execution::memory_pool::proxy::{HashTableAllocExt, VecAllocExt}; use datafusion_expr::EmitTo; use hashbrown::hash_table::HashTable; @@ -116,6 +117,13 @@ impl GroupValuesRows { impl GroupValues for GroupValuesRows { fn intern(&mut self, cols: &[ArrayRef], groups: &mut Vec) -> Result<()> { + // Normalize -0.0 → +0.0 so RowConverter (IEEE 754 totalOrder) and + // primitive hashing both group ±0 together. No-op for non-float + // columns. + let normalized_cols: Vec = + cols.iter().map(normalize_float_zero).collect(); + let cols = normalized_cols.as_slice(); + // Convert the group keys into the row format let group_rows = &mut self.rows_buffer; group_rows.clear(); diff --git a/datafusion/physical-plan/src/aggregates/group_values/single_group_by/primitive.rs b/datafusion/physical-plan/src/aggregates/group_values/single_group_by/primitive.rs index 07535cfdaa6de..206ba16038b5c 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/single_group_by/primitive.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/single_group_by/primitive.rs @@ -35,8 +35,21 @@ use std::mem::size_of; use std::sync::Arc; /// A trait to allow hashing of floating point numbers -pub(crate) trait HashValue { +pub trait HashValue { fn hash(&self, state: &RandomState) -> u64; + + /// Return a canonical representative whose bit pattern is identical for + /// all values that should be grouped together. Default is the identity; + /// floats override this to fold `-0.0` into `+0.0` so the bit-equal + /// `is_eq` check used during insertion treats them as the same group. + /// NaN payload bits are preserved. + #[inline] + fn canonicalize(self) -> Self + where + Self: Sized, + { + self + } } macro_rules! hash_integer { @@ -63,13 +76,25 @@ macro_rules! hash_float { $(impl HashValue for $t { #[cfg(not(feature = "force_hash_collisions"))] fn hash(&self, state: &RandomState) -> u64 { - state.hash_one(self.to_bits()) + let bits = self.to_bits(); + // +0.0 and -0.0 differ only in the sign bit but compare equal + // under IEEE 754; normalize -0.0 → +0.0 so the hash agrees + // with the equality check used by `GroupValuesPrimitive`. + let bits = if bits << 1 == 0 { 0 } else { bits }; + state.hash_one(bits) } #[cfg(feature = "force_hash_collisions")] fn hash(&self, _state: &RandomState) -> u64 { 0 } + + #[inline] + fn canonicalize(self) -> Self { + let bits = self.to_bits(); + let bits = if bits << 1 == 0 { 0 } else { bits }; + Self::from_bits(bits) + } })+ }; } @@ -127,6 +152,10 @@ where group_id }), Some(key) => { + // Fold equivalence-class duplicates (e.g. `-0.0` → `+0.0`) + // so the bit-equal `is_eq` matches and the stored value is + // the canonical representative. + let key = key.canonicalize(); let state = &self.random_state; let hash = key.hash(state); let insert = self.map.entry( diff --git a/datafusion/physical-plan/src/joins/utils.rs b/datafusion/physical-plan/src/joins/utils.rs index 9d302a60610b1..6510460bb0411 100644 --- a/datafusion/physical-plan/src/joins/utils.rs +++ b/datafusion/physical-plan/src/joins/utils.rs @@ -43,7 +43,7 @@ pub use crate::joins::{JoinOn, JoinOnRef}; use arrow::array::{ Array, ArrowPrimitiveType, BooleanBufferBuilder, NativeAdapter, PrimitiveArray, RecordBatch, RecordBatchOptions, UInt32Array, UInt32Builder, UInt64Array, - builder::UInt64Builder, downcast_array, new_null_array, + builder::UInt64Builder, downcast_array, make_array, new_null_array, }; use arrow::array::{ ArrayRef, BinaryArray, BinaryViewArray, BooleanArray, Date32Array, Date64Array, @@ -65,6 +65,7 @@ use datafusion_common::cast::as_boolean_array; use datafusion_common::hash_utils::RandomState; use datafusion_common::hash_utils::create_hashes; use datafusion_common::stats::Precision; +use datafusion_common::utils::normalize_float_zero; use datafusion_common::{ DataFusionError, JoinSide, JoinType, NullEquality, Result, SharedResult, not_impl_err, plan_err, @@ -2194,6 +2195,15 @@ fn eq_dyn_null( }; return Ok(compare_op_for_nested(op, &left, &right)?); } + // Arrow's `eq` / `not_distinct` use IEEE 754 totalOrder semantics for + // floats, so `-0.0` and `+0.0` would compare unequal. Normalize the + // operands so SQL semantics (`+0.0 == -0.0`) hold; no-op for non-floats. + let left_arr: ArrayRef = make_array(left.to_data()); + let right_arr: ArrayRef = make_array(right.to_data()); + let left_norm = normalize_float_zero(&left_arr); + let right_norm = normalize_float_zero(&right_arr); + let left = left_norm.as_ref(); + let right = right_norm.as_ref(); match null_equality { NullEquality::NullEqualsNothing => eq(&left, &right), NullEquality::NullEqualsNull => not_distinct(&left, &right), diff --git a/datafusion/sqllogictest/test_files/array/array_distinct.slt b/datafusion/sqllogictest/test_files/array/array_distinct.slt index 88ffdf7f2ff78..7b7033139d767 100644 --- a/datafusion/sqllogictest/test_files/array/array_distinct.slt +++ b/datafusion/sqllogictest/test_files/array/array_distinct.slt @@ -210,5 +210,47 @@ select array_compact(arrow_cast(make_array(NULL, NULL, NULL), 'FixedSizeList(3, ---- [] +# Negative zero (-0.0) and positive zero (+0.0) compare equal under +# IEEE 754, but their bit patterns differ. array_distinct must normalize +# the sign so the canonical representative (+0.0) is used; otherwise +# group-by / dedup hashing on the raw bits keeps both as distinct +# elements. PostgreSQL / IEEE 754 expected output below. + +# array_distinct collapses +0.0 and -0.0 into a single element. +query ? +select array_distinct([0.0, -0.0]); +---- +[0.0] + +# General case with extra elements. +query ? +select array_distinct([0.0, -0.0, 0.0, 1.0, -0.0]); +---- +[0.0, 1.0] + +# array_length(array_distinct(...)) for {+0.0, -0.0, +0.0} must be 1. +query I +select array_length(array_distinct([0.0, -0.0, 0.0])); +---- +1 + +# Float32 list. +query ? +select array_distinct(arrow_cast([0.0, -0.0], 'List(Float32)')); +---- +[0.0] + +# LargeList(Float64). +query ? +select array_distinct(arrow_cast([0.0, -0.0], 'LargeList(Float64)')); +---- +[0.0] + +# FixedSizeList(Float64). +query ? +select array_distinct(arrow_cast([0.0, -0.0], 'FixedSizeList(2, Float64)')); +---- +[0.0] + include ./cleanup.slt.part diff --git a/datafusion/sqllogictest/test_files/array/array_except.slt b/datafusion/sqllogictest/test_files/array/array_except.slt index a718723e58c38..1d41a5a79d15f 100644 --- a/datafusion/sqllogictest/test_files/array/array_except.slt +++ b/datafusion/sqllogictest/test_files/array/array_except.slt @@ -156,4 +156,52 @@ select array_except(arrow_cast([1, 2, 3, 4], 'FixedSizeList(4, Int64)'), arrow_c [1, 2] +# Negative zero (-0.0) and positive zero (+0.0) compare equal under +# IEEE 754, but their bit patterns differ. array_except must treat +# +0.0 and -0.0 as the same element when subtracting. PostgreSQL / +# IEEE 754 expected output below. + +# -0.0 in rhs removes +0.0 in lhs. +query ? +select array_except([0.0], [-0.0]); +---- +[] + +# Reverse direction. +query ? +select array_except([-0.0], [0.0]); +---- +[] + +# -0.0 in rhs also removes the +0.0 element from the lhs. +query ? +select array_except([0.0, -0.0], [-0.0]); +---- +[] + +# +0.0 in rhs also removes the -0.0 element from the lhs. +query ? +select array_except([0.0, -0.0], [0.0]); +---- +[] + +# More general case with extra unmatched element. +query ? +select array_except([0.0, -0.0, 1.0], [-0.0]); +---- +[1.0] + +# Float32 list. +query ? +select array_except(arrow_cast([0.0, -0.0], 'List(Float32)'), arrow_cast([0.0], 'List(Float32)')); +---- +[] + +# LargeList(Float64). +query ? +select array_except(arrow_cast([0.0, -0.0], 'LargeList(Float64)'), arrow_cast([-0.0], 'LargeList(Float64)')); +---- +[] + + include ./cleanup.slt.part diff --git a/datafusion/sqllogictest/test_files/array/array_union.slt b/datafusion/sqllogictest/test_files/array/array_union.slt index 6a0fdc546e7d7..edb90705af940 100644 --- a/datafusion/sqllogictest/test_files/array/array_union.slt +++ b/datafusion/sqllogictest/test_files/array/array_union.slt @@ -236,4 +236,64 @@ select array_except([1, 2], arrow_cast(null, 'List(Int64)')); NULL +# Negative zero (-0.0) and positive zero (+0.0) compare equal under +# IEEE 754, but their bit patterns differ. array_union and array_intersect +# must normalize the sign for dedup / matching; the canonical +# representative is +0.0. PostgreSQL / IEEE 754 expected output below. + +# array_union with +0.0 / -0.0 +query ? +select array_union([0.0], [-0.0]); +---- +[0.0] + +query ? +select array_union([0.0, 1.0], [-0.0]); +---- +[0.0, 1.0] + +query ? +select array_union([0.0, -0.0, 1.0], [-0.0, 1.0]); +---- +[0.0, 1.0] + +# Float32 list. +query ? +select array_union(arrow_cast([0.0], 'List(Float32)'), arrow_cast([-0.0], 'List(Float32)')); +---- +[0.0] + +# LargeList(Float64). +query ? +select array_union(arrow_cast([0.0], 'LargeList(Float64)'), arrow_cast([-0.0], 'LargeList(Float64)')); +---- +[0.0] + + +# array_intersect with +0.0 / -0.0 +# +0.0 in lhs matches -0.0 in rhs. +query ? +select array_intersect([0.0, 1.0], [-0.0]); +---- +[0.0] + +# Either +0.0 or -0.0 in lhs matches +0.0 in rhs (canonicalized to +0.0). +query ? +select array_intersect([0.0, -0.0], [0.0]); +---- +[0.0] + +# Same with -0.0 in rhs. +query ? +select array_intersect([0.0, -0.0], [-0.0]); +---- +[0.0] + +# Float32 list. +query ? +select array_intersect(arrow_cast([0.0], 'List(Float32)'), arrow_cast([-0.0], 'List(Float32)')); +---- +[0.0] + + include ./cleanup.slt.part diff --git a/datafusion/sqllogictest/test_files/negative_zero.slt b/datafusion/sqllogictest/test_files/negative_zero.slt new file mode 100644 index 0000000000000..a9e16e1d56237 --- /dev/null +++ b/datafusion/sqllogictest/test_files/negative_zero.slt @@ -0,0 +1,206 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +########## +## Negative Zero (-0.0) vs. Positive Zero (+0.0) Behavior +########## +# +# IEEE 754 specifies +0.0 == -0.0 (they compare equal). PostgreSQL follows +# this and treats them as the same value for DISTINCT, GROUP BY, UNION, +# INTERSECT, EXCEPT, and equality predicates. The bit patterns differ in +# the sign bit, so any code path that hashes / compares the raw bits (e.g. +# `f64::to_bits` or `f64::to_ne_bytes`) will treat them as distinct values +# and must be normalized before grouping / dedup. +# +# Note: the sqllogictest formatter renders both `-0.0` and `+0.0` as `0`, +# so the visible scalar values look identical in the expected output. The +# behavior is asserted via row counts and via auxiliary `1.0 / a` +# (`Infinity` vs `-Infinity`) columns that expose the sign. + +##### +## Equality and ordering predicates +##### + +# +0.0 == -0.0 is TRUE; +0.0 < -0.0 and +0.0 > -0.0 are both FALSE. +query BBB +SELECT 0.0 = -0.0 AS eq, 0.0 < -0.0 AS lt, 0.0 > -0.0 AS gt; +---- +true false false + +# 0.0 IS DISTINCT FROM -0.0 must be FALSE because the values are equal. +query B +SELECT 0.0 IS DISTINCT FROM -0.0 AS is_distinct; +---- +false + +##### +## SELECT DISTINCT with +0.0 / -0.0 (Float64) +##### + +# DISTINCT must collapse +0.0 and -0.0 into a single row. +query R rowsort +SELECT DISTINCT a +FROM (SELECT 0.0 AS a UNION ALL SELECT -0.0 UNION ALL SELECT 0.0); +---- +0 + +# Same query, with `1.0 / a` to expose the sign in the projection. The +# tuples `(+0.0, +Infinity)` and `(-0.0, -Infinity)` are not equal — the +# zero columns compare equal but `+Infinity != -Infinity` — so DISTINCT +# keeps both rows. PG returns the same two rows. +query RR rowsort +SELECT DISTINCT a, 1.0 / a AS inv +FROM (SELECT 0.0 AS a UNION ALL SELECT -0.0 UNION ALL SELECT 0.0); +---- +0 -Infinity +0 Infinity + +# COUNT(DISTINCT) over {+0.0, -0.0} must return 1. +query I +SELECT COUNT(DISTINCT a) +FROM (SELECT 0.0 AS a UNION ALL SELECT -0.0 UNION ALL SELECT 0.0); +---- +1 + +# GROUP BY must put +0.0 and -0.0 in the same group. +query RRI rowsort +SELECT a, 1.0 / a AS inv, COUNT(*) +FROM (SELECT 0.0 AS a UNION ALL SELECT -0.0 UNION ALL SELECT 0.0) +GROUP BY a; +---- +0 Infinity 3 + +# Multi-column DISTINCT; (+0.0, 1) and (-0.0, 1) must collapse. +query RI rowsort +SELECT DISTINCT a, b +FROM (SELECT 0.0 AS a, 1 AS b UNION ALL SELECT -0.0, 1 UNION ALL SELECT 0.0, 2); +---- +0 1 +0 2 + +##### +## SELECT DISTINCT with +0.0 / -0.0 (Float32 / REAL) +##### + +# DISTINCT for Float32: same collapse to a single row. +query R rowsort +SELECT DISTINCT a +FROM ( + SELECT arrow_cast(0.0, 'Float32') AS a + UNION ALL SELECT arrow_cast(-0.0, 'Float32') + UNION ALL SELECT arrow_cast(0.0, 'Float32') +); +---- +0 + +# COUNT(DISTINCT) for Float32: must be 1. +query I +SELECT COUNT(DISTINCT a) +FROM ( + SELECT arrow_cast(0.0, 'Float32') AS a + UNION ALL SELECT arrow_cast(-0.0, 'Float32') +); +---- +1 + +##### +## UNION (set semantics) with +0.0 / -0.0 +##### + +# UNION (DISTINCT) must collapse +0.0 / -0.0 into a single row. +query R rowsort +SELECT 0.0 AS a UNION SELECT -0.0 UNION SELECT 0.0; +---- +0 + +# UNION ALL preserves every input row regardless of sign — baseline. +query R rowsort +SELECT 0.0 AS a UNION ALL SELECT -0.0 UNION ALL SELECT 0.0; +---- +0 +0 +0 + +# UNION on Float32 must also collapse to a single row. +query R rowsort +SELECT arrow_cast(0.0, 'Float32') AS a +UNION +SELECT arrow_cast(-0.0, 'Float32'); +---- +0 + +##### +## INTERSECT with +0.0 / -0.0 +##### + +# INTERSECT treats +0.0 and -0.0 as equal — one matching row. +query R rowsort +SELECT 0.0 AS a INTERSECT SELECT -0.0; +---- +0 + +# INTERSECT ALL with multiplicities min(1,1) = 1. +query R rowsort +SELECT 0.0 AS a INTERSECT ALL SELECT -0.0; +---- +0 + +# INTERSECT for Float32: same matching behavior. +query R rowsort +SELECT arrow_cast(0.0, 'Float32') AS a +INTERSECT +SELECT arrow_cast(-0.0, 'Float32'); +---- +0 + +##### +## EXCEPT with +0.0 / -0.0 +##### + +# EXCEPT treats +0.0 and -0.0 as equal — zero rows after subtraction. +query R rowsort +SELECT 0.0 AS a EXCEPT SELECT -0.0; +---- + +# Reverse direction: also zero rows. +query R rowsort +SELECT -0.0 AS a EXCEPT SELECT 0.0; +---- + +# EXCEPT for Float32: zero rows. +query R rowsort +SELECT arrow_cast(0.0, 'Float32') AS a +EXCEPT +SELECT arrow_cast(-0.0, 'Float32'); +---- + +# EXCEPT ALL with matching multiplicities: zero rows. +query R rowsort +SELECT 0.0 AS a EXCEPT ALL SELECT -0.0; +---- + +##### +## INNER JOIN ON equality with +0.0 / -0.0 +##### + +# Equi-join on a = b matches +0.0 against -0.0. +query RR +SELECT t1.a, t2.b +FROM (SELECT 0.0 AS a) t1 +JOIN (SELECT -0.0 AS b) t2 ON t1.a = t2.b; +---- +0 0 From 1a564ec59d418d5b3413277f3d5a5d01c00d8781 Mon Sep 17 00:00:00 2001 From: comphead Date: Mon, 8 Jun 2026 19:01:26 -0700 Subject: [PATCH 2/5] feat: Support IEEE 754 for SQL ops --- datafusion/common/src/scalar/mod.rs | 30 +++++++++++-------- datafusion/functions-nested/src/set_ops.rs | 13 ++++---- .../group_values/single_group_by/primitive.rs | 7 +---- datafusion/physical-plan/src/joins/utils.rs | 14 +++++++-- 4 files changed, 38 insertions(+), 26 deletions(-) diff --git a/datafusion/common/src/scalar/mod.rs b/datafusion/common/src/scalar/mod.rs index c94e5170f92b8..f0d3740151488 100644 --- a/datafusion/common/src/scalar/mod.rs +++ b/datafusion/common/src/scalar/mod.rs @@ -505,7 +505,8 @@ impl PartialEq for ScalarValue { (Boolean(_), _) => false, (Float32(v1), Float32(v2)) => match (v1, v2) { (Some(f1), Some(f2)) => { - *f1 == 0.0 && *f2 == 0.0 || f1.to_bits() == f2.to_bits() + let (b1, b2) = (f1.to_bits(), f2.to_bits()); + ((b1 << 1) == 0 && (b2 << 1) == 0) || b1 == b2 } _ => v1.eq(v2), }, @@ -520,7 +521,8 @@ impl PartialEq for ScalarValue { (Float16(_), _) => false, (Float64(v1), Float64(v2)) => match (v1, v2) { (Some(f1), Some(f2)) => { - *f1 == 0.0 && *f2 == 0.0 || f1.to_bits() == f2.to_bits() + let (b1, b2) = (f1.to_bits(), f2.to_bits()); + ((b1 << 1) == 0 && (b2 << 1) == 0) || b1 == b2 } _ => v1.eq(v2), }, @@ -667,11 +669,13 @@ impl PartialOrd for ScalarValue { (Boolean(v1), Boolean(v2)) => v1.partial_cmp(v2), (Boolean(_), _) => None, (Float32(v1), Float32(v2)) => match (v1, v2) { - (Some(a), Some(b)) => Some(if *a == 0.0 && *b == 0.0 { - Ordering::Equal - } else { - a.total_cmp(b) - }), + (Some(a), Some(b)) => { + Some(if a.to_bits() << 1 == 0 && b.to_bits() << 1 == 0 { + Ordering::Equal + } else { + a.total_cmp(b) + }) + } _ => v1.partial_cmp(v2), }, (Float16(v1), Float16(v2)) => match (v1, v2) { @@ -687,11 +691,13 @@ impl PartialOrd for ScalarValue { (Float32(_), _) => None, (Float16(_), _) => None, (Float64(v1), Float64(v2)) => match (v1, v2) { - (Some(a), Some(b)) => Some(if *a == 0.0 && *b == 0.0 { - Ordering::Equal - } else { - a.total_cmp(b) - }), + (Some(a), Some(b)) => { + Some(if a.to_bits() << 1 == 0 && b.to_bits() << 1 == 0 { + Ordering::Equal + } else { + a.total_cmp(b) + }) + } _ => v1.partial_cmp(v2), }, (Float64(_), _) => None, diff --git a/datafusion/functions-nested/src/set_ops.rs b/datafusion/functions-nested/src/set_ops.rs index 6ae1fc97bcaf0..2214d3d35bb7b 100644 --- a/datafusion/functions-nested/src/set_ops.rs +++ b/datafusion/functions-nested/src/set_ops.rs @@ -362,16 +362,17 @@ fn generic_set_lists( // elements between the first and last offset are referenced. let l_first = l.offsets()[0].as_usize(); let l_len = l.offsets()[l.len()].as_usize() - l_first; - let rows_l = converter.convert_columns(&[l_values_norm.slice(l_first, l_len)])?; + let l_values = l_values_norm.slice(l_first, l_len); + let rows_l = converter.convert_columns(&[Arc::clone(&l_values)])?; let r_first = r.offsets()[0].as_usize(); let r_len = r.offsets()[r.len()].as_usize() - r_first; - let rows_r = converter.convert_columns(&[r_values_norm.slice(r_first, r_len)])?; - - // Combine the *sliced* value arrays so 0-based indices from the row - // converter map directly into the concatenated array. - let l_values = l_values_norm.slice(l_first, l_len); let r_values = r_values_norm.slice(r_first, r_len); + let rows_r = converter.convert_columns(&[Arc::clone(&r_values)])?; + + // Indices from the row converter are 0-based in the per-side slice; + // concatenating those same slices lets indices map directly into the + // combined values array. let combined_values = concat(&[l_values.as_ref(), r_values.as_ref()])?; let r_offset = l_len; diff --git a/datafusion/physical-plan/src/aggregates/group_values/single_group_by/primitive.rs b/datafusion/physical-plan/src/aggregates/group_values/single_group_by/primitive.rs index 206ba16038b5c..e254aebcfd7ce 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/single_group_by/primitive.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/single_group_by/primitive.rs @@ -76,12 +76,7 @@ macro_rules! hash_float { $(impl HashValue for $t { #[cfg(not(feature = "force_hash_collisions"))] fn hash(&self, state: &RandomState) -> u64 { - let bits = self.to_bits(); - // +0.0 and -0.0 differ only in the sign bit but compare equal - // under IEEE 754; normalize -0.0 → +0.0 so the hash agrees - // with the equality check used by `GroupValuesPrimitive`. - let bits = if bits << 1 == 0 { 0 } else { bits }; - state.hash_one(bits) + state.hash_one(self.canonicalize().to_bits()) } #[cfg(feature = "force_hash_collisions")] diff --git a/datafusion/physical-plan/src/joins/utils.rs b/datafusion/physical-plan/src/joins/utils.rs index 6510460bb0411..1deba68ee0542 100644 --- a/datafusion/physical-plan/src/joins/utils.rs +++ b/datafusion/physical-plan/src/joins/utils.rs @@ -2196,8 +2196,18 @@ fn eq_dyn_null( return Ok(compare_op_for_nested(op, &left, &right)?); } // Arrow's `eq` / `not_distinct` use IEEE 754 totalOrder semantics for - // floats, so `-0.0` and `+0.0` would compare unequal. Normalize the - // operands so SQL semantics (`+0.0 == -0.0`) hold; no-op for non-floats. + // floats, so `-0.0` and `+0.0` would compare unequal. Normalize float + // operands first; non-float types dispatch directly to avoid the + // `make_array(to_data())` round-trip. + if !matches!( + left.data_type(), + DataType::Float16 | DataType::Float32 | DataType::Float64 + ) { + return match null_equality { + NullEquality::NullEqualsNothing => eq(&left, &right), + NullEquality::NullEqualsNull => not_distinct(&left, &right), + }; + } let left_arr: ArrayRef = make_array(left.to_data()); let right_arr: ArrayRef = make_array(right.to_data()); let left_norm = normalize_float_zero(&left_arr); From beb87507393baad0bf7d0da1cdbe1196d5ef1b28 Mon Sep 17 00:00:00 2001 From: comphead Date: Mon, 8 Jun 2026 19:38:43 -0700 Subject: [PATCH 3/5] feat: Support IEEE 754 for SQL ops --- datafusion/common/src/scalar/mod.rs | 51 +++++------------------------ 1 file changed, 8 insertions(+), 43 deletions(-) diff --git a/datafusion/common/src/scalar/mod.rs b/datafusion/common/src/scalar/mod.rs index f0d3740151488..c9013af72619c 100644 --- a/datafusion/common/src/scalar/mod.rs +++ b/datafusion/common/src/scalar/mod.rs @@ -469,11 +469,7 @@ pub enum ScalarValue { impl Hash for Fl { fn hash(&self, state: &mut H) { - let bits = self.0.to_bits(); - // +0.0 and -0.0 differ only in the sign bit but compare equal under - // IEEE 754; normalize -0.0 → +0.0 so Hash agrees with Eq. - let bits = if bits << 1 == 0 { 0 } else { bits }; - bits.hash(state); + self.0.to_bits().hash(state); } } @@ -504,26 +500,17 @@ impl PartialEq for ScalarValue { (Boolean(v1), Boolean(v2)) => v1.eq(v2), (Boolean(_), _) => false, (Float32(v1), Float32(v2)) => match (v1, v2) { - (Some(f1), Some(f2)) => { - let (b1, b2) = (f1.to_bits(), f2.to_bits()); - ((b1 << 1) == 0 && (b2 << 1) == 0) || b1 == b2 - } + (Some(f1), Some(f2)) => f1.to_bits() == f2.to_bits(), _ => v1.eq(v2), }, (Float16(v1), Float16(v2)) => match (v1, v2) { - (Some(f1), Some(f2)) => { - let (b1, b2) = (f1.to_bits(), f2.to_bits()); - ((b1 << 1) == 0 && (b2 << 1) == 0) || b1 == b2 - } + (Some(f1), Some(f2)) => f1.to_bits() == f2.to_bits(), _ => v1.eq(v2), }, (Float32(_), _) => false, (Float16(_), _) => false, (Float64(v1), Float64(v2)) => match (v1, v2) { - (Some(f1), Some(f2)) => { - let (b1, b2) = (f1.to_bits(), f2.to_bits()); - ((b1 << 1) == 0 && (b2 << 1) == 0) || b1 == b2 - } + (Some(f1), Some(f2)) => f1.to_bits() == f2.to_bits(), _ => v1.eq(v2), }, (Float64(_), _) => false, @@ -669,35 +656,17 @@ impl PartialOrd for ScalarValue { (Boolean(v1), Boolean(v2)) => v1.partial_cmp(v2), (Boolean(_), _) => None, (Float32(v1), Float32(v2)) => match (v1, v2) { - (Some(a), Some(b)) => { - Some(if a.to_bits() << 1 == 0 && b.to_bits() << 1 == 0 { - Ordering::Equal - } else { - a.total_cmp(b) - }) - } + (Some(f1), Some(f2)) => Some(f1.total_cmp(f2)), _ => v1.partial_cmp(v2), }, (Float16(v1), Float16(v2)) => match (v1, v2) { - (Some(a), Some(b)) => { - Some(if a.to_bits() << 1 == 0 && b.to_bits() << 1 == 0 { - Ordering::Equal - } else { - a.total_cmp(b) - }) - } + (Some(f1), Some(f2)) => Some(f1.total_cmp(f2)), _ => v1.partial_cmp(v2), }, (Float32(_), _) => None, (Float16(_), _) => None, (Float64(v1), Float64(v2)) => match (v1, v2) { - (Some(a), Some(b)) => { - Some(if a.to_bits() << 1 == 0 && b.to_bits() << 1 == 0 { - Ordering::Equal - } else { - a.total_cmp(b) - }) - } + (Some(f1), Some(f2)) => Some(f1.total_cmp(f2)), _ => v1.partial_cmp(v2), }, (Float64(_), _) => None, @@ -972,11 +941,7 @@ macro_rules! hash_float_value { $(impl std::hash::Hash for Fl<$t> { #[inline] fn hash(&self, state: &mut H) { - let bits = <$i>::from_ne_bytes(self.0.to_ne_bytes()); - // +0.0 and -0.0 differ only in the sign bit but compare equal - // under IEEE 754; normalize -0.0 → +0.0 so Hash agrees with Eq. - let bits: $i = if bits << 1 == 0 { 0 } else { bits }; - state.write(&bits.to_ne_bytes()) + state.write(&<$i>::from_ne_bytes(self.0.to_ne_bytes()).to_ne_bytes()) } })+ }; From 72de105b2432f6c4e884e8d8d2039ffbefb39e3a Mon Sep 17 00:00:00 2001 From: comphead Date: Tue, 9 Jun 2026 14:21:48 -0700 Subject: [PATCH 4/5] feat: Support IEEE 754 for SQL ops --- datafusion/common/src/utils/mod.rs | 31 +++++++++++++++++++ datafusion/physical-plan/src/joins/utils.rs | 11 ++++++- .../sqllogictest/test_files/negative_zero.slt | 25 +++++++++++++++ 3 files changed, 66 insertions(+), 1 deletion(-) diff --git a/datafusion/common/src/utils/mod.rs b/datafusion/common/src/utils/mod.rs index f2f3e0397a31e..c79d889ff9749 100644 --- a/datafusion/common/src/utils/mod.rs +++ b/datafusion/common/src/utils/mod.rs @@ -1395,24 +1395,55 @@ fn fsl_values_row_number(list_size: i32, array_len: usize) -> Result /// semantics, which treats `-0.0` and `+0.0` as distinct. SQL semantics /// (PostgreSQL / IEEE 754 equality) require them to compare equal, so /// callers normalize before invoking those kernels. +/// +/// The common case — no `-0.0` present — is allocation-free: a single +/// read-only scan of the underlying buffer (auto-vectorizable to an +/// OR-reduction) decides whether to fall through to the rewriting path. +/// Only arrays that actually contain `-0.0` pay for a new buffer. pub fn normalize_float_zero(array: &ArrayRef) -> ArrayRef { use arrow::array::{Float16Array, Float32Array, Float64Array}; use arrow::datatypes::{Float16Type, Float32Type, Float64Type}; + // -0.0 has only the sign bit set; no other finite or NaN value shares + // this bit pattern, so a strict-equality scan reliably gates the rewrite. + const NEG_ZERO_F16_BITS: u16 = half::f16::NEG_ZERO.to_bits(); + const NEG_ZERO_F32_BITS: u32 = (-0.0_f32).to_bits(); + const NEG_ZERO_F64_BITS: u64 = (-0.0_f64).to_bits(); match array.data_type() { DataType::Float32 => { let arr: &Float32Array = array.as_primitive::(); + if !arr + .values() + .iter() + .any(|v| v.to_bits() == NEG_ZERO_F32_BITS) + { + return Arc::clone(array); + } let normalized: Float32Array = arr.unary(|v| if v.to_bits() << 1 == 0 { 0.0_f32 } else { v }); Arc::new(normalized) } DataType::Float64 => { let arr: &Float64Array = array.as_primitive::(); + if !arr + .values() + .iter() + .any(|v| v.to_bits() == NEG_ZERO_F64_BITS) + { + return Arc::clone(array); + } let normalized: Float64Array = arr.unary(|v| if v.to_bits() << 1 == 0 { 0.0_f64 } else { v }); Arc::new(normalized) } DataType::Float16 => { let arr: &Float16Array = array.as_primitive::(); + if !arr + .values() + .iter() + .any(|v| v.to_bits() == NEG_ZERO_F16_BITS) + { + return Arc::clone(array); + } let normalized: Float16Array = arr.unary(|v| { if v.to_bits() << 1 == 0 { half::f16::from_bits(0) diff --git a/datafusion/physical-plan/src/joins/utils.rs b/datafusion/physical-plan/src/joins/utils.rs index 1deba68ee0542..5687be04ad867 100644 --- a/datafusion/physical-plan/src/joins/utils.rs +++ b/datafusion/physical-plan/src/joins/utils.rs @@ -2262,7 +2262,16 @@ impl JoinKeyComparator { .zip(right_arrays.iter()) .zip(sort_options.iter()) .map(|((l, r), opts)| { - let inner = make_comparator(l.as_ref(), r.as_ref(), *opts)?; + // `make_comparator` uses IEEE 754 totalOrder for floats and + // treats `-0.0` / `+0.0` as distinct. Normalize float arrays + // so SMJ / piecewise-merge equi-keys honor SQL equality; + // no-op (Arc::clone) for non-floats and for float arrays + // that contain no `-0.0`. `normalize_float_zero` preserves + // null positions, so the original null masks below remain + // valid. + let l_norm = normalize_float_zero(l); + let r_norm = normalize_float_zero(r); + let inner = make_comparator(l_norm.as_ref(), r_norm.as_ref(), *opts)?; if null_equality == NullEquality::NullEqualsNothing { let ln = l.logical_nulls().filter(|n| n.null_count() > 0); let rn = r.logical_nulls().filter(|n| n.null_count() > 0); diff --git a/datafusion/sqllogictest/test_files/negative_zero.slt b/datafusion/sqllogictest/test_files/negative_zero.slt index a9e16e1d56237..8ea1122880e14 100644 --- a/datafusion/sqllogictest/test_files/negative_zero.slt +++ b/datafusion/sqllogictest/test_files/negative_zero.slt @@ -204,3 +204,28 @@ FROM (SELECT 0.0 AS a) t1 JOIN (SELECT -0.0 AS b) t2 ON t1.a = t2.b; ---- 0 0 + +# Sort-merge join must also match +0.0 against -0.0. SMJ builds equi-key +# matchers via `JoinKeyComparator`, which calls Arrow's `make_comparator` +# (IEEE 754 totalOrder); without normalization, +0.0 and -0.0 produce +# different orderings and miss the match. +statement ok +set datafusion.optimizer.prefer_hash_join = false; + +query RR +SELECT t1.a, t2.b +FROM (SELECT 0.0 AS a) t1 +JOIN (SELECT -0.0 AS b) t2 ON t1.a = t2.b; +---- +0 0 + +# Float32 SMJ equi-join. +query RR +SELECT t1.a, t2.b +FROM (SELECT arrow_cast(0.0, 'Float32') AS a) t1 +JOIN (SELECT arrow_cast(-0.0, 'Float32') AS b) t2 ON t1.a = t2.b; +---- +0 0 + +statement ok +reset datafusion.optimizer.prefer_hash_join; From 06f10f0b2705ca038aaaebd4273f8b8e955ad7eb Mon Sep 17 00:00:00 2001 From: comphead Date: Tue, 9 Jun 2026 14:30:50 -0700 Subject: [PATCH 5/5] feat: Support IEEE 754 for SQL ops --- datafusion/common/src/utils/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/common/src/utils/mod.rs b/datafusion/common/src/utils/mod.rs index c79d889ff9749..12b3f44fe796a 100644 --- a/datafusion/common/src/utils/mod.rs +++ b/datafusion/common/src/utils/mod.rs @@ -1396,7 +1396,7 @@ fn fsl_values_row_number(list_size: i32, array_len: usize) -> Result /// (PostgreSQL / IEEE 754 equality) require them to compare equal, so /// callers normalize before invoking those kernels. /// -/// The common case — no `-0.0` present — is allocation-free: a single +/// The common case - no `-0.0` present - is allocation-free: a single /// read-only scan of the underlying buffer (auto-vectorizable to an /// OR-reduction) decides whether to fall through to the rewriting path. /// Only arrays that actually contain `-0.0` pay for a new buffer.