Skip to content

Commit 84f8fa8

Browse files
maleadt and claude authored
Use Julia-native for loops (#174)
The new IRStructurizer engine handles Julia's for-in-range iterator protocol, so counting while loops are no longer needed. Convert all `k = Int32(1); while k <= n; ...; k += Int32(1); end` patterns to `for k in Int32(1):n; ...; end` across examples and tests. Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 93bc239 commit 84f8fa8

File tree

13 files changed

+141
-187
lines changed

13 files changed

+141
-187
lines changed

Project.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,5 +30,5 @@ CUDA_Tile_jll = "13.1"
3030
CompilerCaching = "0.2"
3131
EnumX = "1.0"
3232
GPUArrays = "11"
33-
IRStructurizer = "0.4"
33+
IRStructurizer = "0.5"
3434
julia = "1.11"

README.md

Lines changed: 0 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -511,30 +511,6 @@ standard Julia silently produce truncated or wrapped results instead:
511511
Assertions may be added in the future for testing purposes.
512512

513513

514-
## Limitations
515-
516-
### `for` loops
517-
518-
The compiler recognizes simple while-loop patterns but not Julia's iterator-based `for` loops. Write such loops as:
519-
520-
```julia
521-
# Do this:
522-
i = 1
523-
while i <= n
524-
# ...
525-
i += 1
526-
end
527-
528-
# Not this:
529-
for i in 1:n
530-
# ...
531-
end
532-
```
533-
534-
Also make sure `i`, `n`, and the increment all have the same type.
535-
536-
537-
538514
## Host-level operations
539515

540516
cuTile.jl also provides a limited set of host-level APIs to use cuTile without

examples/batchmatmul.jl

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,7 @@ function batch_matmul_kernel(A::ct.TileArray{T,3}, B::ct.TileArray{T,3}, C::ct.T
2626
acc = zeros(Float32, tm, tn)
2727

2828
# K reduction loop
29-
k = Int32(1)
30-
while k <= num_k
29+
for k in Int32(1):num_k
3130
# Load 3D tiles: (tm, tk, 1) and (tk, tn, 1)
3231
a = ct.load(A; index=(bid_m, k, pid_batch), shape=(tm, tk, 1),
3332
padding_mode=ct.PaddingMode.Zero)
@@ -45,7 +44,6 @@ function batch_matmul_kernel(A::ct.TileArray{T,3}, B::ct.TileArray{T,3}, C::ct.T
4544
end
4645

4746
acc = muladd(a_2d, b_2d, acc)
48-
k += Int32(1)
4947
end
5048

5149
# Convert to output type, reshape to 3D, and store

examples/fmha.jl

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -73,8 +73,7 @@ function fmha_kernel(Q::ct.TileArray{T, 4}, K::ct.TileArray{T, 4},
7373
end
7474

7575
# Loop over K, V blocks
76-
j = Int32(0)
77-
while j < Tc
76+
for j in Int32(0):Tc-Int32(1)
7877
# QK product
7978
# K is (D_k, SeqLen_KV, KVH, Batch)
8079
# Load (TILE_N, TILE_D, 1, 1) with order=(2,1,3,4) to transpose D and N
@@ -123,8 +122,6 @@ function fmha_kernel(Q::ct.TileArray{T, 4}, K::ct.TileArray{T, 4},
123122
# (TILE_D, TILE_N) @ (TILE_N, TILE_M) = (TILE_D, TILE_M)
124123
acc = muladd(v, p, acc)
125124
m_i = m_ij
126-
127-
j += Int32(1)
128125
end
129126

130127
# Final normalization and store

examples/layernorm.jl

Lines changed: 8 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -33,40 +33,34 @@ function layer_norm_fwd(X::ct.TileArray{Float32, 2}, W::ct.TileArray{Float32, 1}
3333

3434
# Compute mean
3535
mean = zeros(Float32, (TILE_N, 1))
36-
j = Int32(1)
37-
while j <= num_tiles
36+
for j in Int32(1):num_tiles
3837
tx = ct.load(X; index=(j, bid_m), shape=(TILE_N, 1), padding_mode=ct.PaddingMode.Zero)
3938
mean = mean .+ tx
40-
j += Int32(1)
4139
end
4240
mean = sum(mean; dims=1) / N
4341
ct.store(Mean; index=bid_m, tile=mean)
4442

4543
# Compute variance
4644
var = zeros(Float32, (TILE_N, 1))
47-
j = Int32(1)
48-
while j <= num_tiles
45+
for j in Int32(1):num_tiles
4946
tx = ct.load(X; index=(j, bid_m), shape=(TILE_N, 1), padding_mode=ct.PaddingMode.Zero)
5047
# Mask for valid elements
5148
mask = reshape(((j - Int32(1)) * Int32(TILE_N) .+ ct.arange(TILE_N)) .<= N, (TILE_N, 1))
5249
centered_tx = ifelse.(mask, tx .- mean, 0.0f0)
5350
var = var .+ (centered_tx .^ 2.0f0)
54-
j += Int32(1)
5551
end
5652
var = sum(var; dims=1) / N
5753
rstd = 1.0f0 ./ sqrt.(var .+ eps)
5854
ct.store(Rstd; index=bid_m, tile=rstd)
5955

6056
# Normalize and apply affine transformation
61-
j = Int32(1)
62-
while j <= num_tiles
57+
for j in Int32(1):num_tiles
6358
tx = ct.load(X; index=(j, bid_m), shape=(TILE_N, 1), padding_mode=ct.PaddingMode.Zero)
6459
tw = reshape(ct.load(W; index=j, shape=(TILE_N,), padding_mode=ct.PaddingMode.Zero), (TILE_N, 1))
6560
tb = reshape(ct.load(B; index=j, shape=(TILE_N,), padding_mode=ct.PaddingMode.Zero), (TILE_N, 1))
6661
ty = (tx .- mean) .* rstd
6762
ty = ty .* tw .+ tb
6863
ct.store(Y; index=(j, bid_m), tile=ty)
69-
j += Int32(1)
7064
end
7165

7266
return
@@ -136,23 +130,19 @@ function layer_norm_bwd_dx(DX::ct.TileArray{Float32, 2}, DY::ct.TileArray{Float3
136130
# First pass: compute c1 and c2 reduction terms
137131
c1 = zeros(Float32, (TILE_N, 1))
138132
c2 = zeros(Float32, (TILE_N, 1))
139-
j = Int32(1)
140-
while j <= num_tiles
133+
for j in Int32(1):num_tiles
141134
_, xhat, wdy = bwd_helper(X, W, DY, bid_m, j, mean, rstd, TILE_N, N)
142135
c1 = c1 .+ (xhat .* wdy)
143136
c2 = c2 .+ wdy
144-
j += Int32(1)
145137
end
146138
c1 = sum(c1; dims=1) / N
147139
c2 = sum(c2; dims=1) / N
148140

149141
# Second pass: compute dX
150-
j = Int32(1)
151-
while j <= num_tiles
142+
for j in Int32(1):num_tiles
152143
_, xhat, wdy = bwd_helper(X, W, DY, bid_m, j, mean, rstd, TILE_N, N)
153144
tdx = (wdy .- (xhat .* c1 .+ c2)) .* rstd
154145
ct.store(DX; index=(j, bid_m), tile=tdx)
155-
j += Int32(1)
156146
end
157147

158148
return
@@ -195,19 +185,16 @@ function layer_norm_bwd_dx_partial_dwdb(DX::ct.TileArray{Float32, 2}, DY::ct.Til
195185
# First pass: compute c1 and c2 reduction terms
196186
c1 = zeros(Float32, (TILE_N, 1))
197187
c2 = zeros(Float32, (TILE_N, 1))
198-
j = Int32(1)
199-
while j <= num_tiles
188+
for j in Int32(1):num_tiles
200189
_, xhat, wdy = bwd_helper(X, W, DY, bid_m, j, mean, rstd, TILE_N, N)
201190
c1 = c1 .+ (xhat .* wdy)
202191
c2 = c2 .+ wdy
203-
j += Int32(1)
204192
end
205193
c1 = sum(c1; dims=1) / N
206194
c2 = sum(c2; dims=1) / N
207195

208196
# Second pass: compute dX and partial dW/dB
209-
j = Int32(1)
210-
while j <= num_tiles
197+
for j in Int32(1):num_tiles
211198
tdy, xhat, wdy = bwd_helper(X, W, DY, bid_m, j, mean, rstd, TILE_N, N)
212199
tdx = (wdy .- (xhat .* c1 .+ c2)) .* rstd
213200
ct.store(DX; index=(j, bid_m), tile=tdx)
@@ -230,8 +217,6 @@ function layer_norm_bwd_dx_partial_dwdb(DX::ct.TileArray{Float32, 2}, DY::ct.Til
230217
# Release spinlock
231218
ct.atomic_xchg(Locks, group_bid_m, 0;
232219
memory_order=ct.MemoryOrder.Release)
233-
234-
j += Int32(1)
235220
end
236221

237222
return
@@ -258,11 +243,9 @@ function layer_norm_bwd_dwdb(DW::ct.TileArray{Float32, 2}, DB::ct.TileArray{Floa
258243

259244
dw = zeros(Float32, (TILE_N, TILE_M))
260245
db = zeros(Float32, (TILE_N, TILE_M))
261-
i = Int32(1)
262-
while i <= num_tiles
246+
for i in Int32(1):num_tiles
263247
dw = dw .+ ct.load(DW; index=(bid_n, i), shape=(TILE_N, TILE_M), padding_mode=ct.PaddingMode.Zero)
264248
db = db .+ ct.load(DB; index=(bid_n, i), shape=(TILE_N, TILE_M), padding_mode=ct.PaddingMode.Zero)
265-
i += Int32(1)
266249
end
267250
sum_dw = sum(dw; dims=2)
268251
sum_db = sum(db; dims=2)

examples/matmul.jl

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -42,10 +42,7 @@ function matmul_kernel(A::ct.TileArray{T,2}, B::ct.TileArray{T,2}, C::ct.TileArr
4242
acc = zeros(Float32, tm, tn)
4343

4444
# K reduction loop - accumulate partial products
45-
# NOTE: Uses while-loop pattern. Native `for k in 0:n` syntax generates complex
46-
# iterator protocol IR that doesn't map cleanly to ForOp. Use while-loops for now.
47-
k = Int32(1)
48-
while k <= num_k
45+
for k in Int32(1):num_k
4946
# Load and convert to TF32 for tensor cores (Float32 only)
5047
# padding_mode=Zero ensures out-of-bounds reads return zero (for non-aligned dimensions)
5148
a = ct.load(A; index=(bid_m, k), shape=(tm, tk), padding_mode=ct.PaddingMode.Zero)
@@ -55,7 +52,6 @@ function matmul_kernel(A::ct.TileArray{T,2}, B::ct.TileArray{T,2}, C::ct.TileArr
5552
b = convert(ct.Tile{ct.TFloat32}, b)
5653
end
5754
acc = muladd(a, b, acc)
58-
k += Int32(1)
5955
end
6056

6157
# Convert accumulator to output type and store

examples/moe.jl

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -68,8 +68,7 @@ function fused_moe_kernel(A::ct.TileArray{T, 2}, B::ct.TileArray{T, 3},
6868
acc = zeros(Float32, TILE_N, TILE_M)
6969
num_k = cld(K, Int32(TILE_K))
7070

71-
k = Int32(1)
72-
while k <= num_k
71+
for k in Int32(1):num_k
7372
# 1-indexed row indices into A's K dimension
7473
a_k_indices = (k - Int32(1)) * Int32(TILE_K) .+ ct.arange(TILE_K)
7574

@@ -87,7 +86,6 @@ function fused_moe_kernel(A::ct.TileArray{T, 2}, B::ct.TileArray{T, 3},
8786

8887
# acc(N,M) += b(N,K) @ a(K,M)
8988
acc = muladd(b, a, acc)
90-
k += Int32(1)
9189
end
9290

9391
if mul_routed_weight

src/language/overlays.jl

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,29 @@ macro overlay(ex)
88
end
99

1010

11+
#=============================================================================
12+
StepRange Construction
13+
=============================================================================#
14+
15+
# GPU-safe replacement for Base.steprange_last to enable `for i in start:step:stop`.
16+
# The original pulls in ArgumentError, @noinline overflow_case, and checked_srem_int.
17+
# This overlay uses unsigned arithmetic (bitcast → unsigned rem → bitcast) which
18+
# produces identical results and maps cleanly to Tile IR (signless integers make
19+
# signed↔unsigned bitcasts no-ops).
20+
@overlay function Base.steprange_last(start::T, step::T, stop::T) where {T<:Base.BitInteger}
21+
stop == start && return stop
22+
if step > zero(step)
23+
stop < start && return start - oneunit(step) # empty range
24+
remain = signed(unsigned(stop - start) % unsigned(step))
25+
return stop - remain
26+
else
27+
stop > start && return start + oneunit(step) # empty range
28+
remain = signed(unsigned(start - stop) % unsigned(-step))
29+
return stop + remain
30+
end
31+
end
32+
33+
1134
#=============================================================================
1235
Broadcasting
1336
=============================================================================#

src/mapreduce.jl

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,10 +27,7 @@ _atomic_op(_, ::Type) = nothing
2727
start_d = idx_d
2828
end
2929

30-
@nwhileloops($N,
31-
d -> (idx_d <= n_d),
32-
d -> (idx_d = start_d),
33-
d -> (idx_d = idx_d + reduce_stride[d]),
30+
@nloops($N, idx, d -> (start_d:reduce_stride[d]:n_d),
3431
begin
3532
tile = load(src, (@ntuple $N d -> idx_d), tile_size; padding_mode=pad_mode)
3633
acc = op.(acc, f.(tile))

src/utils.jl

Lines changed: 1 addition & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
using Base.Cartesian: @nexprs, @ntuple, inlineanonymous
1+
using Base.Cartesian: @nexprs, @ntuple, @nloops
22

33
#=============================================================================
44
Grid and tile sizing helpers (used by broadcast and mapreduce)
@@ -71,61 +71,3 @@ function _compute_tile_sizes(input_size::NTuple{N,Int}, dim_order; budget::Int=4
7171
return NTuple{N,Int}(ts)
7272
end
7373

74-
#=============================================================================
75-
@nwhileloops — while-loop variant of Base.Cartesian.@nloops
76-
=============================================================================#
77-
78-
"""
79-
@nwhileloops N condexpr [preexpr [postexpr]] body
80-
81-
Generate N nested `while` loops, analogous to `Base.Cartesian.@nloops` but
82-
using `while` instead of `for`. This is needed because the cuTile compiler
83-
only recognizes while-loop patterns for structured control flow.
84-
85-
`condexpr` and the optional `preexpr`/`postexpr` are `d->` anonymous functions
86-
specialized per dimension with Cartesian `_d` suffix naming. If you want just
87-
a post-expression, supply `nothing` for the pre-expression.
88-
89-
# Example
90-
```julia
91-
@nwhileloops 2 d->(idx_d <= n_d) d->(idx_d = start_d) d->(idx_d += stride[d]) begin
92-
# innermost body
93-
end
94-
```
95-
generates:
96-
```julia
97-
idx_2 = start_2
98-
while idx_2 <= n_2
99-
idx_1 = start_1
100-
while idx_1 <= n_1
101-
# innermost body
102-
idx_1 += stride[1]
103-
end
104-
idx_2 += stride[2]
105-
end
106-
```
107-
"""
108-
macro nwhileloops(N, condexpr, args...)
109-
_nwhileloops(N, condexpr, args...)
110-
end
111-
112-
function _nwhileloops(N::Int, condexpr::Expr, args::Expr...)
113-
if !(1 <= length(args) <= 3)
114-
throw(ArgumentError("expected 1 to 3 trailing arguments (body, or pre+body, or pre+post+body), got $(length(args))"))
115-
end
116-
body = args[end]
117-
ex = Expr(:escape, body)
118-
for d in 1:N
119-
cond = esc(inlineanonymous(condexpr, d))
120-
preexpr = length(args) > 1 ? esc(inlineanonymous(args[1], d)) : nothing
121-
postexpr = length(args) > 2 ? esc(inlineanonymous(args[2], d)) : nothing
122-
ex = quote
123-
$preexpr
124-
while $cond
125-
$ex
126-
$postexpr
127-
end
128-
end
129-
end
130-
ex
131-
end

0 commit comments

Comments (0)