JuliaGPU
diff --git a/Project.toml b/Project.toml
Lines changed: 1 addition & 1 deletion
diff --git a/README.md b/README.md
Lines changed: 12 additions & 8 deletions
diff --git a/examples/batchmatmul.jl b/examples/batchmatmul.jl
Lines changed: 8 additions & 9 deletions
diff --git a/examples/fft.jl b/examples/fft.jl
Lines changed: 20 additions & 20 deletions
diff --git a/examples/layernorm.jl b/examples/layernorm.jl
Lines changed: 34 additions & 36 deletions
diff --git a/examples/matmul.jl b/examples/matmul.jl
Lines changed: 6 additions & 6 deletions
diff --git a/examples/transpose.jl b/examples/transpose.jl
Lines changed: 2 additions & 2 deletions
@@ -25,6 +25,6 @@ DLFP8TypesExt = "DLFP8Types"
 BFloat16s = "0.6"
 CUDA_Compiler_jll = "0.4"
 CUDA_Tile_jll = "13.1"
-CompilerCaching = "0.1"
+CompilerCaching = "0.1.2"
 IRStructurizer = "0.1"
 julia = "1.11"
@@ -34,10 +34,10 @@ using CUDA
 import cuTile as ct
 
 # Define kernel
-function vadd(a, b, c, tile_size::ct.Constant{Int})
+function vadd(a, b, c, tile_size::Int)
     pid = ct.bid(1)
-    tile_a = ct.load(a, pid, (tile_size[],))
-    tile_b = ct.load(b, pid, (tile_size[],))
+    tile_a = ct.load(a, pid, (tile_size,))
+    tile_b = ct.load(b, pid, (tile_size,))
     ct.store(c, pid, tile_a + tile_b)
     return
 end
@@ -297,28 +297,32 @@ permutedims(tile, (3, 1, 2))
 
 This applies to `bid`, `num_blocks`, `permutedims`, `reshape`, dimension arguments, etc.
 
-### `Val`-like constants
+### Compile-time constants
 
-CuTile.jl uses `ct.Constant{T}` to encode compile-time constant values in the type domain, similar to how `Val` works. An explicit `[]` is needed to extract the value at runtime:
+Python annotates constant parameters in the kernel signature and passes plain values at launch.
+Julia is the reverse: kernel signatures use plain types, and constants are wrapped in `ct.Constant` at launch:
 
 ```python
 # Python
 @ct.kernel
-def kernel(a, b, tile_size):
+def kernel(a, b, tile_size: ct.Constant[int]):
     tile = ct.load(a, index=(0,), shape=(tile_size,))
 
 ct.launch(stream, grid, kernel, (a, b, 16))
 ```
 
 ```julia
 # Julia
-function kernel(a, b, tile_size::ct.Constant{Int})
-    tile = ct.load(a, 1, (tile_size[],))
+function kernel(a, b, tile_size::Int)
+    tile = ct.load(a, 1, (tile_size,))
 end
 
 ct.launch(kernel, grid, a, b, ct.Constant(16))
 ```
 
+`ct.Constant` arguments generate no kernel parameter; the value is embedded directly in
+the compiled code. Different constant values produce different kernel specializations.
+
 ### Broadcasting and Math Functions
 
 Python's operators and math functions work directly on tiles with automatic broadcasting.
 
@@ -12,32 +12,31 @@ import cuTile as ct
 # A: (M, K, Batch), B: (K, N, Batch), C: (M, N, Batch)
 # Grid: (M_tiles, N_tiles, Batch)
 function batch_matmul_kernel(A::ct.TileArray{T,3}, B::ct.TileArray{T,3}, C::ct.TileArray{T,3},
-                             tm::ct.Constant{Int}, tn::ct.Constant{Int},
-                             tk::ct.Constant{Int}) where {T}
+                             tm::Int, tn::Int, tk::Int) where {T}
     # Grid dimensions (1-indexed)
     bid_m = ct.bid(1)      # M tile index
     bid_n = ct.bid(2)      # N tile index
     pid_batch = ct.bid(3)  # Batch index
 
     # Number of K tiles to iterate over
     K = size(A, 2)
-    num_k = cld(K, Int32(tk[]))
+    num_k = cld(K, Int32(tk))
 
     # Initialize accumulator with Float32 for precision
-    acc = ct.full((tm[], tn[]), zero(Float32), Float32)
+    acc = ct.full((tm, tn), zero(Float32), Float32)
 
     # K reduction loop
     k = Int32(1)
     while k <= num_k
         # Load 3D tiles: (tm, tk, 1) and (tk, tn, 1)
-        a = ct.load(A, (bid_m, k, pid_batch), (tm[], tk[], 1);
+        a = ct.load(A, (bid_m, k, pid_batch), (tm, tk, 1);
                     padding_mode=ct.PaddingMode.Zero)
-        b = ct.load(B, (k, bid_n, pid_batch), (tk[], tn[], 1);
+        b = ct.load(B, (k, bid_n, pid_batch), (tk, tn, 1);
                     padding_mode=ct.PaddingMode.Zero)
 
         # Reshape 3D tiles to 2D for mma
-        a_2d = reshape(a, (tm[], tk[]))
-        b_2d = reshape(b, (tk[], tn[]))
+        a_2d = reshape(a, (tm, tk))
+        b_2d = reshape(b, (tk, tn))
 
         # Convert to TF32 for tensor cores (Float32 inputs only)
         if T === Float32
@@ -51,7 +50,7 @@ function batch_matmul_kernel(A::ct.TileArray{T,3}, B::ct.TileArray{T,3}, C::ct.T
 
     # Convert to output type, reshape to 3D, and store
     result = convert(ct.Tile{T}, acc)
-    result_3d = reshape(result, (tm[], tn[], 1))
+    result_3d = reshape(result, (tm, tn, 1))
     ct.store(C, (bid_m, bid_n, pid_batch), result_3d)
 
     return nothing
 
@@ -29,28 +29,28 @@ function fft_kernel(
     W2::ct.TileArray{Float32, 3},            # W2 (F2, F2, 2)
     T0::ct.TileArray{Float32, 3},            # T0 (F1F2, F0, 2) twiddle factors
     T1::ct.TileArray{Float32, 3},            # T1 (F0F2, F1, 2) twiddle factors
-    n_const::ct.Constant{Int},
-    f0_const::ct.Constant{Int},
-    f1_const::ct.Constant{Int},
-    f2_const::ct.Constant{Int},
-    f0f1_const::ct.Constant{Int},
-    f1f2_const::ct.Constant{Int},
-    f0f2_const::ct.Constant{Int},
-    bs_const::ct.Constant{Int},
-    d_const::ct.Constant{Int},
-    n2d_const::ct.Constant{Int}
+    n_const::Int,
+    f0_const::Int,
+    f1_const::Int,
+    f2_const::Int,
+    f0f1_const::Int,
+    f1f2_const::Int,
+    f0f2_const::Int,
+    bs_const::Int,
+    d_const::Int,
+    n2d_const::Int
 )
     # Extract constant values
-    N = n_const[]
-    F0 = f0_const[]
-    F1 = f1_const[]
-    F2 = f2_const[]
-    F0F1 = f0f1_const[]
-    F1F2 = f1f2_const[]
-    F0F2 = f0f2_const[]
-    BS = bs_const[]
-    D = d_const[]
-    N2D = n2d_const[]
+    N = n_const
+    F0 = f0_const
+    F1 = f1_const
+    F2 = f2_const
+    F0F1 = f0f1_const
+    F1F2 = f1f2_const
+    F0F2 = f0f2_const
+    BS = bs_const
+    D = d_const
+    N2D = n2d_const
 
     bid = ct.bid(1)
 
 
@@ -5,8 +5,6 @@
 using CUDA
 import cuTile as ct
 
-const ConstInt = ct.Constant{Int}
-
 #=============================================================================
  LayerNorm Forward Kernel
 
@@ -25,43 +23,43 @@ const ConstInt = ct.Constant{Int}
 function layer_norm_fwd(X::ct.TileArray{Float32, 2}, W::ct.TileArray{Float32, 1},
                         B::ct.TileArray{Float32, 1}, Y::ct.TileArray{Float32, 2},
                         Mean::ct.TileArray{Float32, 1}, Rstd::ct.TileArray{Float32, 1},
-                        eps::ct.Constant{Float32}, TILE_N::ConstInt)
+                        eps::Float32, TILE_N::Int)
     bid_m = ct.bid(1)
-    num_tiles = ct.num_tiles(X, 2, (1, TILE_N[]))
+    num_tiles = ct.num_tiles(X, 2, (1, TILE_N))
     N = size(X, 2)
 
     # Compute mean
-    mean = ct.full((1, TILE_N[]), 0.0f0, Float32)
+    mean = ct.full((1, TILE_N), 0.0f0, Float32)
     j = Int32(1)
     while j <= num_tiles
-        tx = ct.load(X, (bid_m, j), (1, TILE_N[]); padding_mode=ct.PaddingMode.Zero)
+        tx = ct.load(X, (bid_m, j), (1, TILE_N); padding_mode=ct.PaddingMode.Zero)
         mean = mean .+ tx
         j += Int32(1)
     end
     mean = sum(mean; dims=2) / N
     ct.store(Mean, bid_m, mean)
 
     # Compute variance
-    var = ct.full((1, TILE_N[]), 0.0f0, Float32)
+    var = ct.full((1, TILE_N), 0.0f0, Float32)
     j = Int32(1)
     while j <= num_tiles
-        tx = ct.load(X, (bid_m, j), (1, TILE_N[]); padding_mode=ct.PaddingMode.Zero)
+        tx = ct.load(X, (bid_m, j), (1, TILE_N); padding_mode=ct.PaddingMode.Zero)
         # Mask for valid elements
-        mask = ct.broadcast_to(((j - Int32(1)) * Int32(TILE_N[]) .+ ct.arange((TILE_N[],), Int32)) .<= N, (1, TILE_N[]))
+        mask = ct.broadcast_to(((j - Int32(1)) * Int32(TILE_N) .+ ct.arange((TILE_N,), Int32)) .<= N, (1, TILE_N))
         centered_tx = ifelse.(mask, tx .- mean, 0.0f0)
         var = var .+ (centered_tx .^ 2.0f0)
         j += Int32(1)
     end
     var = sum(var; dims=2) / N
-    rstd = 1.0f0 ./ sqrt.(var .+ eps[])
+    rstd = 1.0f0 ./ sqrt.(var .+ eps)
     ct.store(Rstd, bid_m, rstd)
 
     # Normalize and apply affine transformation
     j = Int32(1)
     while j <= num_tiles
-        tx = ct.load(X, (bid_m, j), (1, TILE_N[]); padding_mode=ct.PaddingMode.Zero)
-        tw = reshape(ct.load(W, j, (TILE_N[],); padding_mode=ct.PaddingMode.Zero), (1, TILE_N[]))
-        tb = reshape(ct.load(B, j, (TILE_N[],); padding_mode=ct.PaddingMode.Zero), (1, TILE_N[]))
+        tx = ct.load(X, (bid_m, j), (1, TILE_N); padding_mode=ct.PaddingMode.Zero)
+        tw = reshape(ct.load(W, j, (TILE_N,); padding_mode=ct.PaddingMode.Zero), (1, TILE_N))
+        tb = reshape(ct.load(B, j, (TILE_N,); padding_mode=ct.PaddingMode.Zero), (1, TILE_N))
         ty = (tx .- mean) .* rstd
         ty = ty .* tw .+ tb
         ct.store(Y, (bid_m, j), ty)
@@ -123,21 +121,21 @@ Args:
 function layer_norm_bwd_dx(DX::ct.TileArray{Float32, 2}, DY::ct.TileArray{Float32, 2},
                            X::ct.TileArray{Float32, 2}, W::ct.TileArray{Float32, 1},
                            Mean::ct.TileArray{Float32, 1}, Rstd::ct.TileArray{Float32, 1},
-                           TILE_N::ConstInt)
+                           TILE_N::Int)
     bid_m = ct.bid(1)
-    num_tiles = ct.num_tiles(X, 2, (1, TILE_N[]))
+    num_tiles = ct.num_tiles(X, 2, (1, TILE_N))
     N = size(X, 2)
 
     # Load mean and rstd for this row
     mean = ct.load(Mean, bid_m, (1,); padding_mode=ct.PaddingMode.Zero)
     rstd = ct.load(Rstd, bid_m, (1,); padding_mode=ct.PaddingMode.Zero)
 
     # First pass: compute c1 and c2 reduction terms
-    c1 = ct.full((1, TILE_N[]), 0.0f0, Float32)
-    c2 = ct.full((1, TILE_N[]), 0.0f0, Float32)
+    c1 = ct.full((1, TILE_N), 0.0f0, Float32)
+    c2 = ct.full((1, TILE_N), 0.0f0, Float32)
     j = Int32(1)
     while j <= num_tiles
-        _, xhat, wdy = bwd_helper(X, W, DY, bid_m, j, mean, rstd, TILE_N[], N)
+        _, xhat, wdy = bwd_helper(X, W, DY, bid_m, j, mean, rstd, TILE_N, N)
         c1 = c1 .+ (xhat .* wdy)
         c2 = c2 .+ wdy
         j += Int32(1)
@@ -148,7 +146,7 @@ function layer_norm_bwd_dx(DX::ct.TileArray{Float32, 2}, DY::ct.TileArray{Float3
     # Second pass: compute dX
     j = Int32(1)
     while j <= num_tiles
-        _, xhat, wdy = bwd_helper(X, W, DY, bid_m, j, mean, rstd, TILE_N[], N)
+        _, xhat, wdy = bwd_helper(X, W, DY, bid_m, j, mean, rstd, TILE_N, N)
         tdx = (wdy .- (xhat .* c1 .+ c2)) .* rstd
         ct.store(DX, (bid_m, j), tdx)
         j += Int32(1)
@@ -181,22 +179,22 @@ function layer_norm_bwd_dx_partial_dwdb(DX::ct.TileArray{Float32, 2}, DY::ct.Til
                                          X::ct.TileArray{Float32, 2}, W::ct.TileArray{Float32, 1},
                                          Mean::ct.TileArray{Float32, 1}, Rstd::ct.TileArray{Float32, 1},
                                          Locks::ct.TileArray{Int, 1},
-                                         GROUP_SIZE_M::ConstInt, TILE_N::ConstInt)
+                                         GROUP_SIZE_M::Int, TILE_N::Int)
     bid_m = ct.bid(1)
-    num_tiles = ct.num_tiles(X, 2, (1, TILE_N[]))
+    num_tiles = ct.num_tiles(X, 2, (1, TILE_N))
     N = size(X, 2)
-    group_bid_m = ((bid_m - Int32(1)) % Int32(GROUP_SIZE_M[])) + Int32(1)
+    group_bid_m = ((bid_m - Int32(1)) % Int32(GROUP_SIZE_M)) + Int32(1)
 
     # Load mean and rstd for this row
     mean = ct.load(Mean, bid_m, (1,); padding_mode=ct.PaddingMode.Zero)
     rstd = ct.load(Rstd, bid_m, (1,); padding_mode=ct.PaddingMode.Zero)
 
     # First pass: compute c1 and c2 reduction terms
-    c1 = ct.full((1, TILE_N[]), 0.0f0, Float32)
-    c2 = ct.full((1, TILE_N[]), 0.0f0, Float32)
+    c1 = ct.full((1, TILE_N), 0.0f0, Float32)
+    c2 = ct.full((1, TILE_N), 0.0f0, Float32)
     j = Int32(1)
     while j <= num_tiles
-        _, xhat, wdy = bwd_helper(X, W, DY, bid_m, j, mean, rstd, TILE_N[], N)
+        _, xhat, wdy = bwd_helper(X, W, DY, bid_m, j, mean, rstd, TILE_N, N)
         c1 = c1 .+ (xhat .* wdy)
         c2 = c2 .+ wdy
         j += Int32(1)
@@ -207,12 +205,12 @@ function layer_norm_bwd_dx_partial_dwdb(DX::ct.TileArray{Float32, 2}, DY::ct.Til
     # Second pass: compute dX and partial dW/dB
     j = Int32(1)
     while j <= num_tiles
-        tdy, xhat, wdy = bwd_helper(X, W, DY, bid_m, j, mean, rstd, TILE_N[], N)
+        tdy, xhat, wdy = bwd_helper(X, W, DY, bid_m, j, mean, rstd, TILE_N, N)
         tdx = (wdy .- (xhat .* c1 .+ c2)) .* rstd
         ct.store(DX, (bid_m, j), tdx)
 
-        partial_dw = reshape(tdy .* xhat, (TILE_N[], 1))
-        partial_db = reshape(tdy, (TILE_N[], 1))
+        partial_dw = reshape(tdy .* xhat, (TILE_N, 1))
+        partial_db = reshape(tdy, (TILE_N, 1))
 
         # Acquire spinlock
         while ct.atomic_cas(Locks, group_bid_m, 0, 1;
@@ -221,8 +219,8 @@ function layer_norm_bwd_dx_partial_dwdb(DX::ct.TileArray{Float32, 2}, DY::ct.Til
         end
 
         # Critical section: accumulate partial gradients
-        partial_dw = partial_dw .+ ct.load(DW, (j, group_bid_m), (TILE_N[], 1); padding_mode=ct.PaddingMode.Zero)
-        partial_db = partial_db .+ ct.load(DB, (j, group_bid_m), (TILE_N[], 1); padding_mode=ct.PaddingMode.Zero)
+        partial_dw = partial_dw .+ ct.load(DW, (j, group_bid_m), (TILE_N, 1); padding_mode=ct.PaddingMode.Zero)
+        partial_db = partial_db .+ ct.load(DB, (j, group_bid_m), (TILE_N, 1); padding_mode=ct.PaddingMode.Zero)
         ct.store(DW, (j, group_bid_m), partial_dw)
         ct.store(DB, (j, group_bid_m), partial_db)
 
@@ -251,16 +249,16 @@ Args:
 """
 function layer_norm_bwd_dwdb(DW::ct.TileArray{Float32, 2}, DB::ct.TileArray{Float32, 2},
                               FINAL_DW::ct.TileArray{Float32, 1}, FINAL_DB::ct.TileArray{Float32, 1},
-                              TILE_M::ConstInt, TILE_N::ConstInt)
+                              TILE_M::Int, TILE_N::Int)
     bid_n = ct.bid(1)
-    num_tiles = ct.num_tiles(DW, 2, (TILE_N[], TILE_M[]))
+    num_tiles = ct.num_tiles(DW, 2, (TILE_N, TILE_M))
 
-    dw = ct.zeros((TILE_N[], TILE_M[]), Float32)
-    db = ct.zeros((TILE_N[], TILE_M[]), Float32)
+    dw = ct.zeros((TILE_N, TILE_M), Float32)
+    db = ct.zeros((TILE_N, TILE_M), Float32)
     i = Int32(1)
     while i <= num_tiles
-        dw = dw .+ ct.load(DW, (bid_n, i), (TILE_N[], TILE_M[]); padding_mode=ct.PaddingMode.Zero)
-        db = db .+ ct.load(DB, (bid_n, i), (TILE_N[], TILE_M[]); padding_mode=ct.PaddingMode.Zero)
+        dw = dw .+ ct.load(DW, (bid_n, i), (TILE_N, TILE_M); padding_mode=ct.PaddingMode.Zero)
+        db = db .+ ct.load(DB, (bid_n, i), (TILE_N, TILE_M); padding_mode=ct.PaddingMode.Zero)
         i += Int32(1)
     end
     sum_dw = sum(dw; dims=2)
 
@@ -23,22 +23,22 @@ end
 # Matrix multiplication kernel with K reduction loop and 2D swizzle
 # C = A @ B where A is (M, K), B is (K, N), C is (M, N)
 function matmul_kernel(A::ct.TileArray{T,2}, B::ct.TileArray{T,2}, C::ct.TileArray{T,2},
-                       tm::ct.Constant{Int}, tn::ct.Constant{Int}, tk::ct.Constant{Int}) where {T}
+                       tm::Int, tn::Int, tk::Int) where {T}
     # Use 1D grid with swizzle for better cache locality
     bid = ct.bid(1)
     M = size(A, 1)
     N = size(B, 2)
     # swizzle_2d expects 0-indexed bid, returns 0-indexed tile coords
-    bid_m_0, bid_n_0 = swizzle_2d(M, N, tm[], tn[], 8, bid - Int32(1))
+    bid_m_0, bid_n_0 = swizzle_2d(M, N, tm, tn, 8, bid - Int32(1))
     # Convert to 1-indexed tile coordinates
     bid_m = bid_m_0 + Int32(1)
     bid_n = bid_n_0 + Int32(1)
 
     # Number of K tiles to iterate over
-    num_k = ct.num_tiles(A, 2, (tm[], tk[]))
+    num_k = ct.num_tiles(A, 2, (tm, tk))
 
     # Initialize accumulator with Float32 for precision
-    acc = ct.full((tm[], tn[]), zero(Float32), Float32)
+    acc = ct.full((tm, tn), zero(Float32), Float32)
 
     # K reduction loop - accumulate partial products
     # NOTE: Uses while-loop pattern. Native `for k in 0:n` syntax generates complex
@@ -47,8 +47,8 @@ function matmul_kernel(A::ct.TileArray{T,2}, B::ct.TileArray{T,2}, C::ct.TileArr
     while k <= num_k
         # Load and convert to TF32 for tensor cores (Float32 only)
         # padding_mode=Zero ensures out-of-bounds reads return zero (for non-aligned dimensions)
-        a = ct.load(A, (bid_m, k), (tm[], tk[]); padding_mode=ct.PaddingMode.Zero)
-        b = ct.load(B, (k, bid_n), (tk[], tn[]); padding_mode=ct.PaddingMode.Zero)
+        a = ct.load(A, (bid_m, k), (tm, tk); padding_mode=ct.PaddingMode.Zero)
+        b = ct.load(B, (k, bid_n), (tk, tn); padding_mode=ct.PaddingMode.Zero)
         if T === Float32
             a = convert(ct.Tile{ct.TFloat32}, a)
             b = convert(ct.Tile{ct.TFloat32}, b)
 
@@ -8,10 +8,10 @@ import cuTile as ct
 # Transpose kernel with TileArray and constant tile sizes
 # TileArray carries size/stride metadata, Constant parameters are ghost types
 function transpose_kernel(x::ct.TileArray{T,2}, y::ct.TileArray{T,2},
-                          tm::ct.Constant{Int}, tn::ct.Constant{Int}) where {T}
+                          tm::Int, tn::Int) where {T}
     bidx = ct.bid(1)
     bidy = ct.bid(2)
-    input_tile = ct.load(x, (bidx, bidy), (tm[], tn[]))
+    input_tile = ct.load(x, (bidx, bidy), (tm, tn))
     transposed_tile = transpose(input_tile)
     ct.store(y, (bidy, bidx), transposed_tile)
     return