
Commit c5d087b

Authored by wsmoses, Pangoraw, github-actions[bot], claude, and SouthEndMusic
Bf16cu (#2694)
* force fusion
* BFloat16 cuda kernel
* add BFloat16s dep
* only run bf16 on 1.12
* kernel cast 🪄
* depend on Enzyme-jax kernel-cast pass
* remove xla options
* Update src/Compiler.jl
  Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
* kernelcast
* Copy preserved argument if needed (#2722)
* Make copyto! force a new buffer
* Update src/Compiler.jl
  Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
* setpahts!
* yo
* Update src/TracedRArray.jl
  Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
* add aliasing test
* pjrt
* Update test/core/aliasing.jl
  Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
* Move :new_buffer marking to Base.materialize! instead of _copyto!
  The :new_buffer path should only be set on the broadcast path (a .= b), not on all copyto! calls that happen to receive a Broadcasted argument.
  Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
* cleanup
* Add buffer aliasing test for struct field reassignment with both new buffers
  Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
* less eager
* Update src/Compiler.jl
  Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
* use count
* fmt
* good args
* fixup

---------

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>

* Make the CUDA.jl not loaded error more specific (#2743)
* Update LIBTPU_VERSION to 0.0.39.dev20260328
* Update Project.toml
* Update WORKSPACE
* Update ENZYMEXLA_COMMIT to a new hash
* Update ENZYMEXLA_COMMIT to new hash
* Regenerate MLIR Bindings (#2750)
  Co-authored-by: enzyme-ci-bot[bot] <78882869+enzyme-ci-bot[bot]@users.noreply.github.com>
* Bump version and update Reactant_jll dependency
* Regenerate MLIR Bindings (#2756)
  Co-authored-by: enzyme-ci-bot[bot] <78882869+enzyme-ci-bot[bot]@users.noreply.github.com>
* Multifloat options (#2757)
  * Multifloat options
  * fix
  * fix
  * fix
  * Apply suggestions from code review
    Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
  ---------
  Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
* Update Project.toml
* `div` for ConcreteRNumber (#2674)
  * div for ConcreteRNumber
  * Update test/core/math_ops.jl
    Co-authored-by: Paul Berg <naydex.mc+github@gmail.com>
  * Update src/ConcreteRArray.jl
    Co-authored-by: Paul Berg <naydex.mc+github@gmail.com>
  * Update src/ConcreteRArray.jl
    Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
  * Update src/ConcreteRArray.jl
    Co-authored-by: Paul Berg <naydex.mc+github@gmail.com>
  * Update src/ConcreteRArray.jl
    Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
  * Update src/ConcreteRArray.jl
  * remove methods
  * Revert "remove methods"
    This reverts commit ab20e23.
  * meth
  * fix ambiguities
  ---------
  Co-authored-by: Paul Berg <naydex.mc+github@gmail.com>
  Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
  Co-authored-by: Paul Berg <9824244+Pangoraw@users.noreply.github.com>
* Multifloat fix pass parsing (#2759)
* Change shardy_passes default value to post_sdy_propagation
  This is currently what the docs say
* Bump version from 0.2.246 to 0.2.247
* Update ENZYMEXLA_COMMIT hash in WORKSPACE
* Update ENZYMEXLA_COMMIT to new hash (#2760)
* Regenerate MLIR Bindings (#2761)
  Co-authored-by: enzyme-ci-bot[bot] <78882869+enzyme-ci-bot[bot]@users.noreply.github.com>
* Update LIBTPU_VERSION to 0.0.39.dev20260401
* fix: qa test (#2763)
* feat: better StructArray & StaticArray support (#2546)
  * Draft to figure out better StructArray support
  * Simplify and generalize structarray type conversion
  * Start adding StaticArray support
  * Add StaticArray support and tweak elem_apply_while_loop to select correct container type
  * Revert tracing.jl
  * Remove info debug
  * Remove get_ith
  * Add _copyto!
  * format
  * Fix broken test and add new tests
  * format
  * add StaticArrays
  * Add LinearAlgebra
  * Remove unused function
  * Reuse the known destination for while loop if possible
  * Update ext/ReactantStructArraysExt.jl
    Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
  * Proposed improved support for SArrays
  * fix dumb mistake
  * Add additional changes for StaticArrays
  * Apply suggestions from code review
    Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
  * Cleanup
  * Update
  * Update
  * Format
  * Fix for code review
  * Add comments
  * So dumb
  * Correct comment in overloaded_mul function
    Fix comment typo in overloaded_mul function.
  * Update to remove anonymous functions
  * Update
  * Add a complex test
  * Update
  ---------
  Co-authored-by: Billy Moses <wmoses@google.com>
  Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
  Co-authored-by: Avik Pal <avikpal@mit.edu>
* Update WORKSPACE
* untested
* turn it down
* Attach data-layout to MLIR module
* claude is cooking
* dl
* Add BFloat16 extension
* compat
* make a hard dep

---------

Co-authored-by: Paul Berg <9824244+Pangoraw@users.noreply.github.com>
Co-authored-by: Paul Berg <naydex.mc+github@gmail.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
Co-authored-by: Bart de Koning <74617371+SouthEndMusic@users.noreply.github.com>
Co-authored-by: enzyme-ci-bot[bot] <78882869+enzyme-ci-bot[bot]@users.noreply.github.com>
Co-authored-by: Maximilian Gelbrecht <maximilian.gelbrecht@posteo.de>
Co-authored-by: Avik Pal <avikpal@mit.edu>
Co-authored-by: dkytezab <danielkytezable@gmail.com>
Co-authored-by: Paul Tiede <ptiede91@gmail.com>
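The "kernel cast" bullets above are the heart of this PR: CUDA kernels whose argument types mention BFloat16 are compiled against a substitute float type (Float32 by default), with casts inserted at the kernel boundary by the Enzyme-JAX kernelcast pass. A minimal sketch of the type-substitution half, simplified from the `_bfloat16_to_ft_type` / `_substitute_bfloat16_tt` helpers in the diff below; `substitute` and `substitute_tt` are illustrative names, and the real versions also recurse into parametric types:

    using BFloat16s: BFloat16

    # Swap BFloat16 for the substitute float; leave every other type untouched.
    substitute(T, FT) = T === BFloat16 ? FT : T

    # Apply the substitution across a kernel's argument tuple type.
    substitute_tt(tt::Type{<:Tuple}, FT) =
        Tuple{Any[substitute(T, FT) for T in tt.parameters]...}

    substitute_tt(Tuple{BFloat16,Int,BFloat16}, Float32)
    # Tuple{Float32, Int64, Float32} (on a 64-bit machine)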
1 parent de9b977 commit c5d087b

14 files changed: 299 additions & 80 deletions


Project.toml

Lines changed: 2 additions & 0 deletions
@@ -8,6 +8,7 @@ projects = ["docs", "test"]
 
 [deps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
+BFloat16s = "ab4f0b2a-ad5b-11e8-123f-65d77653426b"
 CEnum = "fa961155-64e5-5f13-b03f-caf6b980ea82"
 Crayons = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f"
 Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
@@ -104,6 +105,7 @@ ReactantZygoteExt = "Zygote"
 AbstractFFTs = "1.5"
 Adapt = "4.4"
 ArrayInterface = "7.17.1"
+BFloat16s = "0.6.1"
 CEnum = "0.5"
 CUDA = "5.9"
 Crayons = "4.1.1"
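The new dependency provides the 16-bit brain-float scalar used throughout this PR. A quick REPL sketch of what BFloat16s.jl gives (bfloat16 has 8 exponent bits and 7 mantissa bits, so it keeps Float32's dynamic range at much lower precision):

    using BFloat16s: BFloat16

    x = BFloat16(1.5)   # 16-bit brain float
    Float32(x)          # widens exactly: 1.5f0
    eps(BFloat16)       # 0.0078125, far coarser than Float32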

deps/ReactantExtra/API.cpp

Lines changed: 10 additions & 11 deletions
@@ -1249,15 +1249,13 @@ REACTANT_ABI uint8_t FutureIsReady(FutureType *Future) {
 
 REACTANT_ABI void FutureAwait(FutureType *Future) { Future->Await(); }
 
-xla::CompileOptions
-GenerateCompileOptions(int64_t device_id, const int64_t *mesh_ids,
-                       int64_t num_mesh_ids, const char *xla_gpu_cuda_data_dir,
-                       bool use_shardy_partitioner, int64_t num_replicas,
-                       int64_t num_partitions, bool use_spmd_partitioning,
-                       bool kernel_cache_enabled, const char *kernel_cache_path,
-                       bool autotune_cache_enabled,
-                       const char *autotune_cache_path, int process_id,
-                       bool xla_enable_enzyme_comms_opt) {
+xla::CompileOptions GenerateCompileOptions(
+    int64_t device_id, const int64_t *mesh_ids, int64_t num_mesh_ids,
+    const char *xla_gpu_cuda_data_dir, bool use_shardy_partitioner,
+    int64_t num_replicas, int64_t num_partitions, bool use_spmd_partitioning,
+    bool kernel_cache_enabled, const char *kernel_cache_path,
+    bool autotune_cache_enabled, const char *autotune_cache_path,
+    int process_id, bool xla_enable_enzyme_comms_opt) {
   xla::CompileOptions options;
   auto debug_options = options.executable_build_options.mutable_debug_options();
 
@@ -1905,14 +1903,15 @@ ifrt_compile(ifrt::Client *client, MlirModule cmod, int64_t device_id,
              bool use_spmd_partitioning, bool kernel_cache_enabled,
              const char *kernel_cache_path, bool autotune_cache_enabled,
              const char *autotune_cache_path, int process_id,
-             bool xla_enable_enzyme_comms_opt) {
+             bool xla_enable_enzyme_comms_opt) {
   return ifrt_compile_internal(
       client, cmod,
       GenerateCompileOptions(
          device_id, mesh_ids, num_mesh_ids, xla_gpu_cuda_data_dir,
          use_shardy_partitioner, num_replicas, num_partitions,
          use_spmd_partitioning, kernel_cache_enabled, kernel_cache_path,
-         autotune_cache_enabled, autotune_cache_path, process_id, xla_enable_enzyme_comms_opt));
+         autotune_cache_enabled, autotune_cache_path, process_id,
+         xla_enable_enzyme_comms_opt));
 }
 
 REACTANT_ABI HeldIfrtLoadedExecutable *

ext/ReactantCUDAExt.jl

Lines changed: 239 additions & 12 deletions
@@ -1,5 +1,6 @@
 module ReactantCUDAExt
 
+using BFloat16s: BFloat16
 using Reactant:
     Reactant,
     TracedRArray,
@@ -959,6 +960,7 @@ function compile(job)
         throw(GPUCompiler.InvalidIRError(job, errors))
     end
     # LLVM.strip_debuginfo!(mod)
+    dl = string(LLVM.datalayout(mod))
     modstr = string(mod)
     # This is a bit weird since we're taking a module from julia's llvm into reactant's llvm version
     # it is probably safer to reparse a string using the right llvm module api, so we will do that.
@@ -967,7 +969,17 @@
     )
     @assert mmod != C_NULL
 
-    linkRes = MLIR.API.LinkInModule(MLIR.IR.current_module(), mmod, entryname)
+    cur_module = MLIR.IR.current_module()
+    linkRes = MLIR.API.LinkInModule(cur_module, mmod, entryname)
+
+    dl_attr_name = "llvm.data_layout"
+    prevdlattr = MLIR.IR.getattr(MLIR.IR.Operation(cur_module), dl_attr_name)
+    if !isnothing(prevdlattr)
+        prevdl = String(prevdlattr)
+        @assert prevdl == dl "data layout mismatch, tried compiling cuda kernels for different target machines?"
+    else
+        MLIR.IR.setattr!(MLIR.IR.Operation(cur_module), dl_attr_name, MLIR.IR.Attribute(dl))
+    end
 
     String(Reactant.TracedUtils.get_attribute_by_name(linkRes, "sym_name"))
 end
@@ -1135,6 +1147,13 @@ Reactant.@reactant_overlay @noinline function (func::LLVMFunc{F,tt})(
     for (i, prev) in enumerate(Any[func.f, args...])
         Reactant.make_tracer(seen, prev, (kernelargsym, i), Reactant.NoStopTracedTrack)
     end
+    bfloat16_compile_type = Reactant.Compiler.BFLOAT16_COMPILE_TYPE[]
+    has_cast_float_type =
+        bfloat16_compile_type !== BFloat16 && any(values(seen)) do arg
+            (arg isa TracedRArray || arg isa TracedRNumber) &&
+                Reactant.unwrapped_eltype(typeof(arg)) === BFloat16
+        end
+
     wrapper_tys = MLIR.IR.Type[]
     for arg in values(seen)
        if !(arg isa TracedRArray || arg isa TracedRNumber)
@@ -1162,6 +1181,16 @@ Reactant.@reactant_overlay @noinline function (func::LLVMFunc{F,tt})(
     end
     wrapbody = MLIR.IR.Block(wrapper_tys, [MLIR.IR.Location() for _ in wrapper_tys])
     push!(MLIR.IR.region(wrapfunc, 1), wrapbody)
+    if has_cast_float_type
+        MLIR.IR.setattr!(
+            wrapfunc, "enzymexla.float_type", MLIR.IR.Attribute(MLIR.IR.Type(BFloat16))
+        )
+        MLIR.IR.setattr!(
+            wrapfunc,
+            "enzymexla.src_float_type",
+            MLIR.IR.Attribute(MLIR.IR.Type(bfloat16_compile_type)),
+        )
+    end
     for i in 1:length(wrapper_tys)
         MLIR.API.ReactantFuncSetArgAttr(
             wrapfunc,
@@ -1223,15 +1252,65 @@ Reactant.@reactant_overlay @noinline function (func::LLVMFunc{F,tt})(
                 )
                 push!(allocs, (alloc, argty, jltyp))
 
-                sz = abi_sizeof(a)
-                array_ty = MLIR.IR.Type(MLIR.API.mlirLLVMArrayTypeGet(MLIR.IR.Type(Int8), sz))
-                cdata = MLIR.IR.result(
-                    MLIR.Dialects.llvm.mlir_constant(;
-                        res=array_ty, value=MLIR.IR.DenseElementsAttribute(to_bytes(a))
-                    ),
-                    1,
-                )
-                MLIR.Dialects.llvm.store(cdata, alloc)
+                if has_cast_float_type
+                    # The argument `a` has BFloat16 fields but the GPU function was
+                    # compiled with a substitute type (e.g. Float32). We need to:
+                    # 1. Create an alloca with the bf16 layout
+                    # 2. Store the raw bf16 bytes into it
+                    # 3. Load, walk the struct fields, extend bf16→f32, store into alloc
+                    compile_float_ty = MLIR.IR.Type(bfloat16_compile_type)
+                    bf16_float_ty = MLIR.IR.Type(BFloat16)
+                    bf16_ty = _replace_float_in_llvm_type(
+                        argty, compile_float_ty, bf16_float_ty
+                    )
+
+                    bf16_c1 = MLIR.IR.result(
+                        MLIR.Dialects.llvm.mlir_constant(;
+                            res=MLIR.IR.Type(Int64), value=MLIR.IR.Attribute(1)
+                        ),
+                        1,
+                    )
+                    bf16_alloc = MLIR.IR.result(
+                        MLIR.Dialects.llvm.alloca(
+                            bf16_c1; elem_type=MLIR.IR.Attribute(bf16_ty), res=llvmptr
+                        ),
+                        1,
+                    )
+
+                    sz = abi_sizeof(a)
+                    val = to_bytes(a)
+                    array_ty = MLIR.IR.Type(
+                        MLIR.API.mlirLLVMArrayTypeGet(MLIR.IR.Type(Int8), sz)
+                    )
+                    cdata = MLIR.IR.result(
+                        MLIR.Dialects.llvm.mlir_constant(;
+                            res=array_ty, value=MLIR.IR.DenseElementsAttribute(val)
+                        ),
+                        1,
+                    )
+                    MLIR.Dialects.llvm.store(cdata, bf16_alloc)
+
+                    bf16_val = MLIR.IR.result(
+                        MLIR.Dialects.llvm.load(bf16_alloc; res=bf16_ty), 1
+                    )
+                    converted_val = _convert_bf16_value(
+                        bf16_val, bf16_ty, argty, bf16_float_ty, compile_float_ty
+                    )
+                    MLIR.Dialects.llvm.store(converted_val, alloc)
+                else
+                    sz = abi_sizeof(a)
+                    val = to_bytes(a)
+                    array_ty = MLIR.IR.Type(
+                        MLIR.API.mlirLLVMArrayTypeGet(MLIR.IR.Type(Int8), sz)
+                    )
+                    cdata = MLIR.IR.result(
+                        MLIR.Dialects.llvm.mlir_constant(;
+                            res=array_ty, value=MLIR.IR.DenseElementsAttribute(val)
+                        ),
+                        1,
+                    )
+                    MLIR.Dialects.llvm.store(cdata, alloc)
+                end
             end
         end
         LLVM.deactivate(ctx)
@@ -1275,7 +1354,14 @@ Reactant.@reactant_overlay @noinline function (func::LLVMFunc{F,tt})(
             # we need to now compute the offset in bytes of the path
             julia_arg = allargs[p[2]]
 
-            offset = get_field_offset(typeof(julia_arg), p[3:end])
+            offset = if has_cast_float_type
+                get_field_offset(
+                    _bfloat16_to_ft_type(typeof(julia_arg), bfloat16_compile_type),
+                    p[3:end],
+                )
+            else
+                get_field_offset(typeof(julia_arg), p[3:end])
+            end
             MLIR.IR.with_block(wrapbody) do
                 ptr = MLIR.IR.result(
                     MLIR.Dialects.llvm.getelementptr(
@@ -1353,6 +1439,9 @@ Reactant.@reactant_overlay @noinline function (func::LLVMFunc{F,tt})(
                 "enzymexla.kernel_call", @__FILE__, @__LINE__
             ),
         )
+        if has_cast_float_type
+            MLIR.IR.setattr!(call, "cast_float_type", MLIR.IR.UnitAttribute())
+        end
 
         argidx = 1
         for arg in values(seen)
@@ -1364,13 +1453,151 @@ Reactant.@reactant_overlay @noinline function (func::LLVMFunc{F,tt})(
        end
    end
 
+function _bfloat16_to_ft_type(@nospecialize(T), @nospecialize(FT))
+    T === BFloat16 && return FT
+    T isa DataType || return T
+    isempty(T.parameters) && return T
+    new_params = Any[_bfloat16_to_ft_type(p, FT) for p in T.parameters]
+    all(p1 === p2 for (p1, p2) in zip(T.parameters, new_params)) && return T
+    return T.name.wrapper{new_params...}
+end
+
+function _substitute_bfloat16_tt(@nospecialize(tt::Type{<:Tuple}), @nospecialize(FT))
+    new_params = Any[_bfloat16_to_ft_type(T, FT) for T in tt.parameters]
+    return Tuple{new_params...}
+end
+
+"""
+    _replace_float_in_llvm_type(ty, src_float_ty, tgt_float_ty)
+
+Recursively walk an LLVM type and replace `src_float_ty` with `tgt_float_ty`.
+Handles struct types and array types.
+"""
+function _replace_float_in_llvm_type(
+    ty::MLIR.IR.Type, src_float_ty::MLIR.IR.Type, tgt_float_ty::MLIR.IR.Type
+)
+    ty == src_float_ty && return tgt_float_ty
+    if MLIR.API.mlirTypeIsALLVMStructType(ty)
+        n = MLIR.API.mlirLLVMStructTypeGetNumElementTypes(ty)
+        field_types = MLIR.IR.Type[
+            _replace_float_in_llvm_type(
+                MLIR.IR.Type(MLIR.API.mlirLLVMStructTypeGetElementType(ty, i - 1)),
+                src_float_ty,
+                tgt_float_ty,
+            ) for i in 1:n
+        ]
+        if all(
+            field_types[i] ==
+            MLIR.IR.Type(MLIR.API.mlirLLVMStructTypeGetElementType(ty, i - 1)) for i in 1:n
+        )
+            return ty
+        end
+        ctx = MLIR.IR.current_context()
+        is_packed = MLIR.API.mlirLLVMStructTypeIsPacked(ty)
+        return MLIR.IR.Type(
+            MLIR.API.mlirLLVMStructTypeLiteralGet(ctx, n, field_types, is_packed)
+        )
+    elseif MLIR.API.mlirTypeIsALLVMArrayType(ty)
+        elem_ty = MLIR.IR.Type(MLIR.API.mlirLLVMArrayTypeGetElementType(ty))
+        new_elem_ty = _replace_float_in_llvm_type(elem_ty, src_float_ty, tgt_float_ty)
+        if new_elem_ty == elem_ty
+            return ty
+        end
+        num_elems = MLIR.API.mlirLLVMArrayTypeGetNumElements(ty)
+        return MLIR.IR.Type(MLIR.API.mlirLLVMArrayTypeGet(new_elem_ty, num_elems))
+    end
+    return ty
+end
+
+"""
+    _convert_bf16_value(src_val, src_ty, tgt_ty, src_float_ty, tgt_float_ty)
+
+Recursively walk an LLVM value, converting float fields from `src_float_ty` to
+`tgt_float_ty` using arith.extf. Returns a new value of type `tgt_ty`.
+"""
+function _convert_bf16_value(
+    src_val::MLIR.IR.Value,
+    src_ty::MLIR.IR.Type,
+    tgt_ty::MLIR.IR.Type,
+    src_float_ty::MLIR.IR.Type,
+    tgt_float_ty::MLIR.IR.Type,
+)
+    src_ty == tgt_ty && return src_val
+    if src_ty == src_float_ty
+        src_width = MLIR.API.mlirFloatTypeGetWidth(src_float_ty)
+        tgt_width = MLIR.API.mlirFloatTypeGetWidth(tgt_float_ty)
+        if tgt_width > src_width
+            return MLIR.IR.result(MLIR.Dialects.llvm.fpext(src_val; res=tgt_float_ty), 1)
+        elseif tgt_width < src_width
+            return MLIR.IR.result(MLIR.Dialects.llvm.fptrunc(src_val; res=tgt_float_ty), 1)
+        else
+            return MLIR.IR.result(MLIR.Dialects.llvm.fptrunc(src_val; res=tgt_float_ty), 1)
+        end
+    end
+    if MLIR.API.mlirTypeIsALLVMStructType(src_ty)
+        n = MLIR.API.mlirLLVMStructTypeGetNumElementTypes(src_ty)
+        tgt_val = MLIR.IR.result(MLIR.Dialects.llvm.mlir_undef(; res=tgt_ty), 1)
+        for i in 0:(n - 1)
+            field_src_ty = MLIR.IR.Type(
+                MLIR.API.mlirLLVMStructTypeGetElementType(src_ty, i)
+            )
+            field_tgt_ty = MLIR.IR.Type(
+                MLIR.API.mlirLLVMStructTypeGetElementType(tgt_ty, i)
+            )
+            field_val = MLIR.IR.result(
+                MLIR.Dialects.llvm.extractvalue(
+                    src_val; res=field_src_ty, position=MLIR.IR.Attribute(Int64[i])
+                ),
+                1,
+            )
+            converted = _convert_bf16_value(
+                field_val, field_src_ty, field_tgt_ty, src_float_ty, tgt_float_ty
+            )
+            tgt_val = MLIR.IR.result(
+                MLIR.Dialects.llvm.insertvalue(
+                    tgt_val, converted; res=tgt_ty, position=MLIR.IR.Attribute(Int64[i])
+                ),
+                1,
+            )
+        end
+        return tgt_val
+    elseif MLIR.API.mlirTypeIsALLVMArrayType(src_ty)
+        num_elems = MLIR.API.mlirLLVMArrayTypeGetNumElements(src_ty)
+        elem_src_ty = MLIR.IR.Type(MLIR.API.mlirLLVMArrayTypeGetElementType(src_ty))
+        elem_tgt_ty = MLIR.IR.Type(MLIR.API.mlirLLVMArrayTypeGetElementType(tgt_ty))
+        tgt_val = MLIR.IR.result(MLIR.Dialects.llvm.mlir_undef(; res=tgt_ty), 1)
+        for i in 0:(num_elems - 1)
+            elem_val = MLIR.IR.result(
+                MLIR.Dialects.llvm.extractvalue(
+                    src_val; res=elem_src_ty, position=MLIR.IR.Attribute(Int64[i])
+                ),
+                1,
+            )
+            converted = _convert_bf16_value(
+                elem_val, elem_src_ty, elem_tgt_ty, src_float_ty, tgt_float_ty
+            )
+            tgt_val = MLIR.IR.result(
+                MLIR.Dialects.llvm.insertvalue(
+                    tgt_val, converted; res=tgt_ty, position=MLIR.IR.Attribute(Int64[i])
+                ),
+                1,
+            )
+        end
+        return tgt_val
+    end
+    return src_val
+end
+
 Reactant.@reactant_overlay @noinline function CUDA.cufunction(
     f::F, tt::TT=Tuple{}; kwargs...
 ) where {F,TT}
     res = Base.@lock CUDA.cufunction_lock begin
         # compile the function
         cache = llvm_compiler_cache(MLIR.IR.current_module())
-        source = CUDA.methodinstance(F, tt)
+        effective_tt = _substitute_bfloat16_tt(
+            tt, Reactant.Compiler.BFLOAT16_COMPILE_TYPE[]
+        )
+        source = CUDA.methodinstance(F, effective_tt)
         # cuda = CUDA.active_state()
         device = nothing # cuda.device
         # config = CUDA.compiler_config(device; kwargs...)::CUDA.CUDACompilerConfig
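For intuition, the conversion `_convert_bf16_value` performs at the MLIR level corresponds to the host-side Julia below, with a hypothetical KernelArg struct and a widen helper standing in for the generated llvm.extractvalue / llvm.fpext / llvm.insertvalue ops; a sketch, not what the extension executes:

    using BFloat16s: BFloat16

    struct KernelArg              # hypothetical argument struct, for illustration only
        a::BFloat16
        b::NTuple{2,BFloat16}
    end

    widen(x::BFloat16) = Float32(x)                 # the fpext leaf case
    widen(x::Tuple) = map(widen, x)                 # the array walk
    widen(x::KernelArg) = (widen(x.a), widen(x.b))  # the fieldwise struct walk

    widen(KernelArg(BFloat16(1.5), (BFloat16(0.25), BFloat16(2.0))))
    # (1.5f0, (0.25f0, 2.0f0))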

src/Compiler.jl

Lines changed: 2 additions & 1 deletion
@@ -1118,6 +1118,7 @@ function cubinFeatures()
     return "+ptx$ptx"
 end
 
+const BFLOAT16_COMPILE_TYPE = Ref{DataType}(Float32)
 const DEBUG_KERNEL = Ref{Bool}(false)
 const DUMP_LLVMIR = Ref{Bool}(false)
 const DUMP_FAILED_LOCKSTEP = Ref{Bool}(false)
@@ -1343,7 +1344,7 @@ function compile_mlir!(
     # Raise enabled but use default passes
     # TODO(#2240) remove redundant libdevice raise after fixing phase ordering
     result =
-        "canonicalize,llvm-to-memref-access,canonicalize,convert-llvm-to-cf,canonicalize,enzyme-lift-cf-to-scf,canonicalize,func.func(canonicalize-loops),canonicalize-scf-for,canonicalize,libdevice-funcs-raise,canonicalize,affine-cfg,canonicalize,func.func(canonicalize-loops),canonicalize,llvm-to-affine-access,canonicalize,delinearize-indexing,canonicalize,simplify-affine-exprs,affine-cfg,canonicalize,func.func(affine-loop-invariant-code-motion),canonicalize,sort-memory,raise-affine-to-stablehlo{strip_llvm_debuginfo=$(compile_options.strip_llvm_debuginfo) prefer_while_raising=false dump_failed_lockstep=$(DUMP_FAILED_LOCKSTEP[])},canonicalize,arith-raise{stablehlo=true}," *
+        "canonicalize,llvm-to-memref-access,canonicalize,convert-llvm-to-cf,canonicalize,enzyme-lift-cf-to-scf,canonicalize,func.func(canonicalize-loops),canonicalize-scf-for,canonicalize,libdevice-funcs-raise,canonicalize,affine-cfg,canonicalize,func.func(canonicalize-loops),canonicalize,llvm-to-affine-access,canonicalize,delinearize-indexing,canonicalize,simplify-affine-exprs,affine-cfg,canonicalize,func.func(affine-loop-invariant-code-motion),canonicalize,sort-memory,func.func(kernelcast),raise-affine-to-stablehlo{strip_llvm_debuginfo=$(compile_options.strip_llvm_debuginfo) prefer_while_raising=false dump_failed_lockstep=$(DUMP_FAILED_LOCKSTEP[])},canonicalize,arith-raise{stablehlo=true}," *
         opt_passes2
 
     if DUS_TO_CONCAT[]