Skip to content

Commit 55d283c

Browse files
Merge branch 'master' into add-sparse-similar
2 parents 76692d5 + e0e295f commit 55d283c

117 files changed

Lines changed: 5344 additions & 5220 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

.buildkite/pipeline.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -348,7 +348,7 @@ steps:
348348
- JuliaCI/julia#v1:
349349
version: "1.12"
350350
- JuliaCI/julia-test#v1:
351-
test_args: "--gpu=0,1 --quickfail core libraries/cublas/xt"
351+
test_args: "--quickfail core libraries/cublas/xt"
352352
- JuliaCI/julia-coverage#v1:
353353
dirs:
354354
- src

.pre-commit-config.yaml

Lines changed: 0 additions & 5 deletions
This file was deleted.

CUDACore/Project.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ CUDA_Runtime_jll = "0.21"
5050
ChainRulesCore = "1"
5151
EnzymeCore = "0.8.2"
5252
ExprTools = "0.1"
53-
GPUArrays = "11.4.1"
53+
GPUArrays = "11.5"
5454
GPUCompiler = "1.4"
5555
GPUToolbox = "1.1"
5656
KernelAbstractions = "0.9.38"

CUDACore/src/CUDACore.jl

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,6 @@ include("accumulate.jl")
112112
include("reverse.jl")
113113
include("iterator.jl")
114114
include("sorting.jl")
115-
include("random.jl")
116115

117116
# shared library types
118117
include("complex.jl")

CUDACore/src/device/intrinsics/math.jl

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
@public fma, rsqrt, saturate, byte_perm, assume
44

5-
using Base: FastMath
5+
using Base: FastMath, @assume_effects
66

77

88
## helpers
@@ -248,17 +248,42 @@ end
248248
@device_override Base.:(^)(x::Float64, y::Float64) = ccall("extern __nv_pow", llvmcall, Cdouble, (Cdouble, Cdouble), x, y)
249249
@device_override Base.:(^)(x::Float32, y::Float32) = ccall("extern __nv_powf", llvmcall, Cfloat, (Cfloat, Cfloat), x, y)
250250
@device_override FastMath.pow_fast(x::Float32, y::Float32) = ccall("extern __nv_fast_powf", llvmcall, Cfloat, (Cfloat, Cfloat), x, y)
251+
# pow_fast: Base methods call llvm.powi which NVPTX cannot lower (#3065)
252+
@device_override @assume_effects :foldable @inline function FastMath.pow_fast(x::Float64, y::Integer)
253+
y == -1 && return inv(x)
254+
y == 0 && return one(x)
255+
y == 1 && return x
256+
y == 2 && return x*x
257+
y == 3 && return x*x*x
258+
x ^ y # no fast variant for Float64; uses __nv_powi
259+
end
260+
@device_override @assume_effects :foldable @inline function FastMath.pow_fast(x::Float32, y::Integer)
261+
y == -1 && return inv(x)
262+
y == 0 && return one(x)
263+
y == 1 && return x
264+
y == 2 && return x*x
265+
y == 3 && return x*x*x
266+
FastMath.pow_fast(x, Float32(y)) # uses __nv_fast_powf
267+
end
268+
@device_override @assume_effects :foldable @inline function FastMath.pow_fast(x::Float16, y::Integer)
269+
y == -1 && return inv(x)
270+
y == 0 && return one(x)
271+
y == 1 && return x
272+
y == 2 && return x*x
273+
y == 3 && return x*x*x
274+
Float16(FastMath.pow_fast(Float32(x), Float32(y)))
275+
end
251276
@device_override Base.:(^)(x::Float64, y::Int32) = ccall("extern __nv_powi", llvmcall, Cdouble, (Cdouble, Int32), x, y)
252277
@device_override Base.:(^)(x::Float32, y::Int32) = ccall("extern __nv_powif", llvmcall, Cfloat, (Cfloat, Int32), x, y)
253-
@device_override @inline function Base.:(^)(x::Float32, y::Int64)
278+
@device_override @assume_effects :foldable @inline function Base.:(^)(x::Float32, y::Int64)
254279
y == -1 && return inv(x)
255280
y == 0 && return one(x)
256281
y == 1 && return x
257282
y == 2 && return x*x
258283
y == 3 && return x*x*x
259284
x ^ Float32(y)
260285
end
261-
@device_override @inline function Base.:(^)(x::Float64, y::Int64)
286+
@device_override @assume_effects :foldable @inline function Base.:(^)(x::Float64, y::Int64)
262287
y == -1 && return inv(x)
263288
y == 0 && return one(x)
264289
y == 1 && return x

CUDACore/src/device/quirks.jl

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,3 +103,7 @@ for op in (:(<), :(<=), :cmp)
103103
@device_override Base.$op(q::Rational, x::AbstractFloat) = $op(float(q), x)
104104
end
105105
end
106+
107+
# reshape.jl
108+
@device_override Base._throw_dmrs(n, str, dims) =
109+
@gputhrow "DimensionMismatch" "Dimensions mismatch when reshaping. New dimensions must be consistent with array size"

CUDACore/src/device/random.jl

Lines changed: 125 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,9 @@ end
9393

9494
@device_override @inline Random.default_rng() = Philox2x32()
9595

96+
# default to Float32 on GPU (matches CUDA convention, avoids expensive FP64)
97+
@device_override @inline Random.rand(rng::AbstractRNG) = Random.rand(rng, Float32)
98+
9699
"""
97100
Random.seed!(rng::Philox2x32, seed::Integer, [counter::Integer=0])
98101
@@ -123,14 +126,9 @@ else
123126
Random.seed!(Random.default_rng(), seed)
124127
end
125128

126-
"""
127-
Random.rand(rng::Philox2x32, UInt32)
128-
129-
Generate a byte of random data using the on-device Tausworthe generator.
130-
"""
131-
function Random.rand(rng::Philox2x32{R},::Type{UInt64}) where {R}
132-
ctr1, ctr2, key = rng.ctr1, rng.ctr2, rng.key
133-
129+
# R rounds of Philox2x32, unrolled at compile time
130+
@inline function philox2x_rounds(::Val{R}, ctr1::UInt32, ctr2::UInt32,
131+
key::UInt32) where R
134132
if R > 0 ctr1, ctr2 = philox2x_round(ctr1, ctr2, key); end
135133
if R > 1 key = philox2x_bumpkey(key); ctr1, ctr2 = philox2x_round(ctr1, ctr2, key); end
136134
if R > 2 key = philox2x_bumpkey(key); ctr1, ctr2 = philox2x_round(ctr1, ctr2, key); end
@@ -147,6 +145,16 @@ function Random.rand(rng::Philox2x32{R},::Type{UInt64}) where {R}
147145
if R > 13 key = philox2x_bumpkey(key); ctr1, ctr2 = philox2x_round(ctr1, ctr2, key); end
148146
if R > 14 key = philox2x_bumpkey(key); ctr1, ctr2 = philox2x_round(ctr1, ctr2, key); end
149147
if R > 15 key = philox2x_bumpkey(key); ctr1, ctr2 = philox2x_round(ctr1, ctr2, key); end
148+
ctr1, ctr2
149+
end
150+
151+
"""
152+
Random.rand(rng::Philox2x32, UInt64)
153+
154+
Generate 64 bits of random data using the on-device Philox2x32 generator.
155+
"""
156+
function Random.rand(rng::Philox2x32{R}, ::Type{UInt64}) where {R}
157+
ctr1, ctr2 = philox2x_rounds(Val(R), rng.ctr1, rng.ctr2, rng.key)
150158

151159
# update the warp counter
152160
# NOTE: this performs the same update on every thread in the warp, but each warp writes
@@ -201,7 +209,7 @@ function emit_constant_array(name::Symbol, data::AbstractArray{T}) where {T}
201209
end
202210
end
203211

204-
for var in [:ki, :wi, :fi, :ke, :we, :fe]
212+
for var in [:ke, :we, :fe]
205213
val = getfield(Random, var)
206214
gpu_var = Symbol("gpu_$var")
207215
arr_typ = :(CuDeviceArray{$(eltype(val)),$(ndims(val)),AS.Constant})
@@ -211,39 +219,119 @@ for var in [:ki, :wi, :fi, :ke, :we, :fe]
211219
end
212220
end
213221

214-
## randn
222+
## Box-Muller helpers
223+
#
224+
# Vendored from GPUArrays.jl, which uses them in its host-side Philox4x32-10
225+
# batched randn kernel. Keep constants in sync when upstream tunes them.
226+
227+
using Base: FastMath
228+
229+
# unsigned int → uniform float in (0, 1), strictly positive
230+
231+
@inline u01(::Type{Float32}, u::UInt32) =
232+
fma(Float32(u), Float32(2)^(-32), Float32(2)^(-33))
233+
234+
# Bit-pattern construction avoids Float64(::UInt64) + FMA on consumer GPUs
235+
# (FP64 throughput as low as 1:64). Low mantissa bit set so result ∈ (0, 1) —
236+
# Box-Muller needs log(u) ≠ -Inf.
237+
@inline u01(::Type{Float64}, u::UInt64) =
238+
reinterpret(Float64, ((u >> 12) | 0x1) | 0x3ff0000000000000) - 1.0
239+
240+
# Polynomial sincospi(Float32): branchless, stays in Float32 (Base.sincospi
241+
# widens internally). Bottom 3 bits of u pick an octant (swap/negate); top
242+
# 29 bits give the reduced argument (+0.5-biased so y ≠ 0).
243+
244+
const SP_F32 = (3.1415927f0, -5.167708f0, 2.5497673f0, -0.58907866f0)
245+
const CP_F32 = (1.0f0, -4.934788f0, 4.057578f0, -1.3061346f0)
246+
247+
@inline function fast_sincospi(::Type{Float32}, u::UInt32)
248+
oct = (u % Int32) & Int32(7)
249+
y = fma(Float32(u & ~UInt32(7)), Float32(2)^(-34), Float32(2)^(-32))
250+
sp = y * evalpoly(y * y, SP_F32)
251+
cp = evalpoly(y * y, CP_F32)
252+
swap = !iszero(oct & Int32(1))
253+
sin_neg = !iszero(oct & Int32(2))
254+
cos_neg = !iszero(oct & Int32(4))
255+
s_raw = ifelse(swap, cp, sp)
256+
c_raw = ifelse(swap, sp, cp)
257+
(ifelse(sin_neg, -s_raw, s_raw), ifelse(cos_neg, -c_raw, c_raw))
258+
end
215259

216-
@device_override function Random.randn(rng::AbstractRNG)
217-
while true
218-
r = Random.rand(rng, Random.UInt52Raw()) % UInt64
219-
@inbounds begin
220-
r &= 0x000fffffffffffff
221-
rabs = Int64(r>>1) # One bit for the sign
222-
idx = rabs & 0xFF
223-
x = ifelse(r % Bool, -rabs, rabs)*gpu_wi()[idx+1]
224-
rabs < gpu_ki()[idx+1] && return x # 99.3% of the time we return here 1st try
225-
result = randn_unlikely(rng, idx, rabs, x)
226-
result !== nothing && return result
227-
end
228-
end
260+
# Polynomial log(Float32), fdlibm-based. Consumes the raw UInt32 output; u01
261+
# is folded into the first FMA so there's no intermediate float.
262+
263+
const SQRT_HALF_I32 = reinterpret(Int32, Float32(sqrt(0.5)))
264+
const LOG_ODD_F32 = (reinterpret(Float32, Int32(0x3f2aaaaa)),
265+
reinterpret(Float32, Int32(0x3e91e9ee)))
266+
const LOG_EVEN_F32 = (reinterpret(Float32, Int32(0x3eccce13)),
267+
reinterpret(Float32, Int32(0x3e789e26)))
268+
269+
@inline function fast_log(::Type{Float32}, u::UInt32)
270+
x = fma(Float32(u), Float32(2)^(-32), Float32(2)^(-33))
271+
ix = reinterpret(Int32, x) - SQRT_HALF_I32
272+
k = ix >> Int32(23)
273+
f_std = reinterpret(Float32, (ix & Int32(0x007fffff)) + SQRT_HALF_I32) - 1.0f0
274+
f_comp = -fma(Float32(~u), Float32(2)^(-32), Float32(2)^(-33))
275+
f = ifelse(k == Int32(0), f_comp, f_std)
276+
s = f / (2.0f0 + f)
277+
z = s * s; w = z * z
278+
R = z * evalpoly(w, LOG_ODD_F32) + w * evalpoly(w, LOG_EVEN_F32)
279+
hfsq = 0.5f0 * f * f
280+
Float32(k) * reinterpret(Float32, Int32(0x3f317180)) -
281+
((hfsq - (s * (hfsq + R) +
282+
Float32(k) * reinterpret(Float32, Int32(0x3717f7d1)))) - f)
229283
end
230284

231-
# this unlikely branch is put in a separate function for better efficiency
232-
@noinline function randn_unlikely(rng, idx, rabs, x)
233-
@inbounds if idx == 0
234-
while true
235-
xx = -Random.ziggurat_nor_inv_r*log(Random.rand(rng))
236-
yy = -log(Random.rand(rng))
237-
yy+yy > xx*xx &&
238-
return (rabs >> 8) % Bool ? -Random.ziggurat_nor_r-xx : Random.ziggurat_nor_r+xx
239-
end
240-
elseif (gpu_fi()[idx] - gpu_fi()[idx+1])*Random.rand(rng) + gpu_fi()[idx+1] < exp(-0.5*x*x)
241-
return x # return from the triangular area
242-
else
243-
return # retry
244-
end
285+
# Box-Muller: pair of uniforms → pair of standard normals
286+
287+
@inline function boxmuller(::Type{T}, u1::UInt32, u2::UInt32) where T <: Union{Float16,Float32}
288+
r = sqrt(-2f0 * fast_log(Float32, u2))
289+
s, c = fast_sincospi(Float32, u1)
290+
(T(r * s), T(r * c))
291+
end
292+
293+
@inline function boxmuller(::Type{Float64}, u1::Float64, u2::Float64)
294+
r = sqrt(-2.0 * FastMath.log_fast(u1))
295+
s, c = sincospi(2 * u2)
296+
(r * s, r * c)
297+
end
298+
299+
300+
## randn — Box-Muller transform
301+
#
302+
# Uses Box-Muller instead of Ziggurat: rejection sampling would warp-diverge,
303+
# and the Ziggurat tables aren't device-accessible.
304+
305+
# Specialization for Philox2x32: one Philox call produces exactly the pair of
306+
# UInt32s Box-Muller needs, halving the Philox work vs the generic path.
307+
@device_override @inline function Random.randn(rng::Philox2x32{R},
308+
::Type{T}) where {R, T <: Union{Float16,Float32}}
309+
ctr1, ctr2 = philox2x_rounds(Val(R), rng.ctr1, rng.ctr2, rng.key)
310+
rng.ctr1 += 1i32
311+
n, _ = boxmuller(T, ctr1, ctr2)
312+
n
245313
end
246314

315+
# Float64 fundamentally needs 64 bits of entropy per uniform, so 2 Philox
316+
# calls. The u01 bit-trick avoids the expensive Float64(::UInt64) conversion.
317+
@device_override @inline function Random.randn(rng::Philox2x32{R},
318+
::Type{Float64}) where R
319+
u1 = u01(Float64, Random.rand(rng, UInt64))
320+
u2 = u01(Float64, Random.rand(rng, UInt64))
321+
n, _ = boxmuller(Float64, u1, u2)
322+
n
323+
end
324+
325+
# Generic fallback for user-defined AbstractFloat types.
326+
@device_override @inline function Random.randn(rng::AbstractRNG, ::Type{T}) where T <: AbstractFloat
327+
U1 = max(Random.rand(rng, T), floatmin(T)) # avoid log(0)
328+
U2 = Random.rand(rng, T)
329+
sqrt(T(-2) * FastMath.log_fast(U1)) * first(sincospi(T(2) * U2))
330+
end
331+
332+
# untyped randn() defaults to Float32 on GPU
333+
@device_override @inline Random.randn(rng::AbstractRNG) = Random.randn(rng, Float32)
334+
247335
## randexp
248336

249337
@device_override function Random.randexp(rng::AbstractRNG)

CUDACore/src/random.jl

Lines changed: 0 additions & 43 deletions
This file was deleted.

0 commit comments

Comments (0)