diff --git a/csrc/sm100/prefill/dense/kernel/sm100_fmha_bwd_kernel_tma_warpspecialized.hpp b/csrc/sm100/prefill/dense/kernel/sm100_fmha_bwd_kernel_tma_warpspecialized.hpp index c34713ba..c17a7110 100644 --- a/csrc/sm100/prefill/dense/kernel/sm100_fmha_bwd_kernel_tma_warpspecialized.hpp +++ b/csrc/sm100/prefill/dense/kernel/sm100_fmha_bwd_kernel_tma_warpspecialized.hpp @@ -949,7 +949,7 @@ struct Sm100FmhaBwdKernelTmaWarpSpecialized { TensorC const& coord, TensorShape const& tensor_shape) { - // TODO: Performance of FlashMLA on sm90 is dropped with latest cutlass, so here revert the to the old version. + // TODO: Performance of FlashMLA on sm90 is dropped with latest cutlass, so here revert to the old version. // Tensor preds = cute::lazy::transform(coord, [&](auto const& c) { return elem_less(c, tensor_shape); }); auto copy_op = make_cotiled_copy( diff --git a/csrc/sm100/prefill/dense/kernel/sm100_fmha_bwd_mla_kernel_tma_warpspecialized.hpp b/csrc/sm100/prefill/dense/kernel/sm100_fmha_bwd_mla_kernel_tma_warpspecialized.hpp index c25d638d..d45d2af7 100644 --- a/csrc/sm100/prefill/dense/kernel/sm100_fmha_bwd_mla_kernel_tma_warpspecialized.hpp +++ b/csrc/sm100/prefill/dense/kernel/sm100_fmha_bwd_mla_kernel_tma_warpspecialized.hpp @@ -954,7 +954,7 @@ struct Sm100FmhaBwdMlaKernelTmaWarpSpecialized { TensorC const& coord, TensorShape const& tensor_shape) { - // TODO: Performance of FlashMLA on sm90 is dropped with latest cutlass, so here revert the to the old version. + // TODO: Performance of FlashMLA on sm90 is dropped with latest cutlass, so here revert to the old version. // Tensor preds = cute::lazy::transform(coord, [&](auto const& c) { return elem_less(c, tensor_shape); }); auto copy_op = make_cotiled_copy(