Skip to content

Commit 7dd8be5

Browse files
Create recipe for flux2pro running on AMD (#4200)
Differential Revision: D98537991 Pull Request resolved: #4200
1 parent da257b5 commit 7dd8be5

3 files changed

Lines changed: 118 additions & 2 deletions

File tree

test/quantization/quantize_/workflows/float8/test_float8_tensor.py

Lines changed: 114 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -38,6 +38,7 @@
3838
is_sm_at_least_100,
3939
torch_version_at_least,
4040
)
41+
from unittest.mock import patch
4142

4243
# Needed since changing args to function causes recompiles
4344
torch._dynamo.config.cache_size_limit = 128
@@ -1489,5 +1490,118 @@ def test_create_tensor_out_of_inference_mode(self):
14891490

14901491
common_utils.instantiate_parametrized_tests(TestFloat8Tensor)
14911492

1493+
1494+
class TestMI350HardwareSupport(common_utils.TestCase):
    """Tests that MI350 (gfx950) is accepted by FP8 hardware checks.

    Uses mocking so the tests run on any hardware without needing an actual
    MI350 GPU.
    """

    def _hardware_patches(self, *, mi350):
        """Build the patchers for a simulated CUDA-only environment.

        Args:
            mi350: when True, simulate a machine whose only FP8-capable
                accelerator is MI350; when False, simulate a CUDA machine
                with no supported FP8 accelerator at all.

        Returns:
            A list of *unstarted* ``unittest.mock.patch`` objects (not a
            context manager); activate them with ``_start_patches``.
        """
        return [
            patch("torchao.float8.inference.is_MI350", return_value=mi350),
            patch("torchao.float8.inference.is_MI300", return_value=False),
            patch("torchao.float8.inference.is_sm_at_least_89", return_value=False),
            patch("torch.cuda.is_available", return_value=True),
            patch("torch.xpu.is_available", return_value=False),
        ]

    def _start_patches(self, patches):
        """Start each patcher, registering teardown via ``addCleanup`` so the
        patches are undone even if the test body (or a later ``start``) raises."""
        for p in patches:
            p.start()
            self.addCleanup(p.stop)

    def test_check_hardware_support_mi350_per_tensor(self):
        from torchao.float8.inference import _check_hardware_support

        self._start_patches(self._hardware_patches(mi350=True))
        # Must not raise: MI350 satisfies the FP8 hardware requirement.
        _check_hardware_support((PerTensor(), PerTensor()))

    def test_check_hardware_support_mi350_per_row(self):
        from torchao.float8.inference import _check_hardware_support

        self._start_patches(self._hardware_patches(mi350=True))
        # Must not raise: per-row FP8 is also gated by the same hardware check.
        _check_hardware_support((PerRow(), PerRow()))

    def test_check_hardware_support_rejects_unsupported_hw(self):
        from torchao.float8.inference import _check_hardware_support

        self._start_patches(self._hardware_patches(mi350=False))
        with self.assertRaises(AssertionError):
            _check_hardware_support((PerRow(), PerRow()))

    def test_quant_api_hardware_gate_mi350(self):
        """The assertion in _float8_dynamic_activation_float8_weight_transform
        should pass on MI350.

        NOTE(review): this patches the ``quant_api`` module attributes and
        re-evaluates the same boolean gate the transform uses, so it verifies
        the gate expression rather than the transform end to end.
        """
        with (
            patch("torchao.quantization.quant_api.is_MI350", return_value=True),
            patch("torchao.quantization.quant_api.is_MI300", return_value=False),
            patch(
                "torchao.quantization.quant_api.is_sm_at_least_89",
                return_value=False,
            ),
            patch("torch.cuda.is_available", return_value=True),
        ):
            from torchao.quantization.quant_api import (
                is_MI300,
                is_MI350,
                is_sm_at_least_89,
            )

            self.assertTrue(is_sm_at_least_89() or is_MI300() or is_MI350())

    def test_quant_api_hardware_gate_rejects_unsupported(self):
        """With no supported accelerator, the quant_api hardware gate is False."""
        with (
            patch("torchao.quantization.quant_api.is_MI350", return_value=False),
            patch("torchao.quantization.quant_api.is_MI300", return_value=False),
            patch(
                "torchao.quantization.quant_api.is_sm_at_least_89",
                return_value=False,
            ),
            patch("torch.cuda.is_available", return_value=True),
        ):
            from torchao.quantization.quant_api import (
                is_MI300,
                is_MI350,
                is_sm_at_least_89,
            )

            self.assertFalse(is_sm_at_least_89() or is_MI300() or is_MI350())
14921606
# Allow running this test file directly (outside the pytest/buck runners).
if __name__ == "__main__":
    run_tests()

torchao/float8/inference.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -16,6 +16,7 @@
1616
from torchao.float8.types import FP8Granularity
1717
from torchao.utils import (
1818
is_MI300,
19+
is_MI350,
1920
is_sm_at_least_89,
2021
)
2122

@@ -295,7 +296,7 @@ def _check_hardware_support(
295296

296297
if is_per_tensor or is_per_row:
297298
assert torch.xpu.is_available() or (
298-
torch.cuda.is_available() and is_sm_at_least_89() or is_MI300()
299+
torch.cuda.is_available() and is_sm_at_least_89() or is_MI300() or is_MI350()
299300
), (
300301
"Float8 dynamic quantization requires CUDA compute capability ≥8.9 or MI300+ or XPU."
301302
)

torchao/quantization/quant_api.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -91,6 +91,7 @@
9191
)
9292
from torchao.utils import (
9393
is_MI300,
94+
is_MI350,
9495
is_sm_at_least_89,
9596
)
9697

@@ -1527,7 +1528,7 @@ def _float8_dynamic_activation_float8_weight_transform(
15271528
parameter_name: str = "weight",
15281529
):
15291530
if torch.cuda.is_available():
1530-
assert is_sm_at_least_89() or is_MI300(), (
1531+
assert is_sm_at_least_89() or is_MI300() or is_MI350(), (
15311532
"Float8 dynamic activation quantization is only supported on CUDA>=8.9 and MI300+"
15321533
)
15331534
if config.set_inductor_config:

0 commit comments

Comments
 (0)