Skip to content

Commit f81f003

Browse files
author
Andrija Kolic
committed
[GR-71966] Support matplotlib benchmarks by handling flaky fork failures and enabling end-to-end time measuring
PullRequest: graal/23247
2 parents f1b7165 + ce22c01 commit f81f003

3 files changed

Lines changed: 121 additions & 21 deletions

File tree

sdk/mx.sdk/mx_sdk_benchmark.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -818,7 +818,7 @@ def stage_aware_run(self, args, out=None, err=None, cwd=None, nonZeroIsFatal=Fal
818818
self._prepare_for_running(args, out, err, cwd, nonZeroIsFatal)
819819
self.run_single_stage()
820820

821-
if self.stages_info.failed:
821+
if self.stages_info.failed and not self.bmSuite.ignore_benchmark_failure(out.data, bm_exec_context().get("benchmarks"), bm_exec_context().get("bm_suite_args")):
822822
mx.abort('Exiting the benchmark due to the failure.')
823823

824824
def prepare_stages(self, bm_suite: NativeImageBenchmarkMixin, bm_suite_args) -> tuple[list[Stage], list[Stage]]:
@@ -2078,7 +2078,7 @@ class GraalHostPolyBenchStagingVm(PolyBenchStagingVm):
20782078
GRAALHOST_FSMAPPING_TEMPLATE: Template = Template("""
20792079
{
20802080
"fsmappings": [
2081-
{"concrete": "${path}", "virt": "${path}"}
2081+
{"concrete": "${path}", "virt": "${path}", "mutable": true}
20822082
]
20832083
}
20842084
""")

truffle/mx.truffle/mx_polybench/model.py

Lines changed: 118 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -930,7 +930,7 @@ def verify_and_process_id_score_function(self, datapoint: DataPoint):
930930
class NonNativeImageBenchmarkSummaryPostProcessor(FinalDispatchFinalStageAverageWithOutlierRemovalPostProcessor):
931931
"""
932932
Post-processor that calculates the outlier-excluded average of the "avg-time" metric across dispatches
933-
and produces a final "time" metric for a benchmark.
933+
and produces a final "time"/"one-shot" metric for a benchmark.
934934
Should only be used when running a benchmark in server (non-native) mode.
935935
"""
936936

@@ -944,7 +944,10 @@ def key_fn(dp):
944944
field = "metric.value"
945945

946946
def update_fn(dp):
947-
dp["metric.name"] = "time"
947+
metric_name = "time"
948+
if self._suite._use_wall_clock_time():
949+
metric_name = "one-shot"
950+
dp["metric.name"] = metric_name
948951
if "metric.object" in dp:
949952
del dp["metric.object"]
950953
if "metric.fork-number" in dp:
@@ -994,7 +997,7 @@ def determine_stable_run_outlier_exclusion_percentiles(self, bench_config: Bench
994997
class NativeModeBenchmarkSummaryPostProcessor(FinalDispatchFinalStageAverageWithOutlierRemovalPostProcessor):
995998
"""
996999
Post-processor that calculates the outlier-excluded average of the "avg-time" metric across image builds
997-
and produces a final "time" metric for a benchmark (separate "run" and "instrument-run" datapoints).
1000+
and produces a final "time"/"one-shot" metric for a benchmark (separate "run" and "instrument-run" datapoints).
9981001
Should only be used when running a benchmark in native mode.
9991002
"""
10001003

@@ -1008,7 +1011,10 @@ def key_fn(dp):
10081011
field = "metric.value"
10091012

10101013
def update_fn(dp):
1011-
dp["metric.name"] = "time"
1014+
metric_name = "time"
1015+
if self._suite._use_wall_clock_time():
1016+
metric_name = "one-shot"
1017+
dp["metric.name"] = metric_name
10121018
if "metric.fork-number" in dp:
10131019
del dp["metric.fork-number"]
10141020
if "native-image.image-fork-number" in dp:
@@ -1048,6 +1054,38 @@ def select_datapoints(self, datapoints: DataPoints) -> DataPoints:
10481054
)
10491055
return super().select_datapoints(datapoints)
10501056

1057+
def group_datapoints(self, datapoints: DataPoints) -> Dict[Any, DataPoints]:
1058+
groups = super().group_datapoints(datapoints)
1059+
flaky_tolerance = self._suite.polybench_bench_suite_args(
1060+
bm_exec_context().get("bm_suite_args")
1061+
).flaky_failure_stability_tolerance
1062+
config = bm_exec_context().get(PolybenchBenchmarkSuite.STABLE_CONFIG)
1063+
if flaky_tolerance is None or config is None:
1064+
return groups
1065+
1066+
# Filter out groups belonging to benchmarks which had too many flaky failures
1067+
flaky_tolerance = int(flaky_tolerance)
1068+
groups_within_tolerance = {}
1069+
for key, group in groups.items():
1070+
bench = key[0]
1071+
stage = key[1]
1072+
if stage != "run":
1073+
# Flaky tolerance only applies to datapoints from the 'run' stage
1074+
groups_within_tolerance[key] = group
1075+
continue
1076+
total_forks = config.get_benchmark(bench).builds * config.get_benchmark(bench).forks
1077+
required_count = total_forks - flaky_tolerance
1078+
if len(group) < required_count:
1079+
mx.log(
1080+
f"Not generating the stabilized metric for benchmark '{bench}' due to too few successful runs ({len(group)} < {required_count})."
1081+
)
1082+
else:
1083+
mx.log(
1084+
f"Generating the stabilized metric for benchmark '{bench}' using the {len(group)} available datapoints (out of {total_forks})."
1085+
)
1086+
groups_within_tolerance[key] = group
1087+
return groups_within_tolerance
1088+
10511089
def determine_stable_run_outlier_exclusion_percentiles(self, bench_config: BenchmarkStableRunConfig):
10521090
self._lower_percentile = bench_config.build_outlier_exclusion.lower_percentile
10531091
self._upper_percentile = bench_config.build_outlier_exclusion.upper_percentile
@@ -1300,6 +1338,18 @@ def _ensure_instrumentation_profile_name_is_benchmark_specific(
13001338
# Store the profile for use in upcoming IMAGE stages
13011339
bm_exec_context().get(PolybenchBenchmarkSuite.PGO_PROFILES).append(bench_unique_profile_path)
13021340

1341+
def flakySkipPatterns(self, benchmarks, bmSuiteArgs):
1342+
polybench_patterns = []
1343+
if self._allow_flaky_skip_pattern("GR-73428"):
1344+
# Transient glibc malloc errors (double free/invalid size) in NumPy during matplotlib benchmarks [GR-73428]
1345+
polybench_patterns.append(
1346+
r"\*\*\* Error in `[^`]+': (free\(\): invalid size)|(double free or corruption \([^\)]+\))|(corrupted size vs\. prev_size): 0x[a-z\d]+ \*\*\*"
1347+
)
1348+
polybench_patterns.append(
1349+
r"_GraalPyMem_RawFree: freed memory size \(\d+\) is larger than allocated memory size \(\d+\)"
1350+
)
1351+
return polybench_patterns + super().flakySkipPatterns(benchmarks, bmSuiteArgs)
1352+
13031353
def run(self, benchmarks, bmSuiteArgs) -> DataPoints:
13041354
# name used by NativeImageBenchmarkMixin
13051355
self.benchmark_name = benchmarks[0]
@@ -1341,6 +1391,15 @@ def _resolve_stable_run_config(self) -> Optional[SuiteStableRunConfig]:
13411391
return None
13421392
return SuiteStableRunConfig(config_path)
13431393

1394+
def _use_wall_clock_time(self) -> bool:
1395+
return self.polybench_bench_suite_args(bm_exec_context().get("bm_suite_args")).disable_time_tracker_rule
1396+
1397+
def _allow_flaky_skip_pattern(self, pattern_id: str) -> bool:
1398+
enabled_patterns = self.polybench_bench_suite_args(
1399+
bm_exec_context().get("bm_suite_args")
1400+
).allow_flaky_polybench_patterns
1401+
return enabled_patterns is not None and pattern_id in enabled_patterns.split(",")
1402+
13441403
@staticmethod
13451404
def _prepare_distributions(
13461405
working_directory: str, resolved_benchmark: ResolvedPolybenchBenchmark
@@ -1507,6 +1566,7 @@ def rules(self, output, benchmarks, bmSuiteArgs):
15071566
# - "warmup" (per-iteration data for "warmup" and "run" iterations)
15081567
# - "time-sample" (per-iteration data for only the "run" iterations)
15091568
# - "avg-time" (aggregation of per-iteration data for the "run" iterations after outlier removal)
1569+
# The post-processors use the "avg-time" data to produce a final "time"/"one-shot" datapoint.
15101570
rules += [
15111571
mx_benchmark.StdOutRule(
15121572
r"\[.*\] iteration ([0-9]*): (?P<value>.*) (?P<unit>.*)",
@@ -1535,21 +1595,40 @@ def rules(self, output, benchmarks, bmSuiteArgs):
15351595
},
15361596
startPattern=r"::: Running :::",
15371597
),
1538-
ExcludeWarmupRule(
1539-
r"\[.*\] run aggregate summary: (?P<value>.*) (?P<unit>.*)",
1540-
{
1541-
"benchmark": benchmark_name,
1542-
"metric.better": "lower",
1543-
"metric.name": "avg-time",
1544-
"metric.object": "fork",
1545-
"metric.unit": ("<unit>", str),
1546-
"metric.value": ("<value>", float),
1547-
"metric.type": "numeric",
1548-
"metric.score-function": "id",
1549-
},
1550-
startPattern=r"::: Running :::",
1551-
),
15521598
]
1599+
if self._use_wall_clock_time():
1600+
rules.append(
1601+
mx_benchmark.StdOutRule(
1602+
r"Wall-clock time: (?P<value>.*) sec",
1603+
{
1604+
"benchmark": benchmark_name,
1605+
"metric.better": "lower",
1606+
"metric.name": "avg-time",
1607+
"metric.object": "fork",
1608+
"metric.unit": "ms",
1609+
"metric.value": ("<value>", lambda x: float(x) * 1000),
1610+
"metric.type": "numeric",
1611+
"metric.score-function": "id",
1612+
},
1613+
)
1614+
)
1615+
else:
1616+
rules.append(
1617+
ExcludeWarmupRule(
1618+
r"\[.*\] run aggregate summary: (?P<value>.*) (?P<unit>.*)",
1619+
{
1620+
"benchmark": benchmark_name,
1621+
"metric.better": "lower",
1622+
"metric.name": "avg-time",
1623+
"metric.object": "fork",
1624+
"metric.unit": ("<unit>", str),
1625+
"metric.value": ("<value>", float),
1626+
"metric.type": "numeric",
1627+
"metric.score-function": "id",
1628+
},
1629+
startPattern=r"::: Running :::",
1630+
)
1631+
)
15531632
elif metric_name in ("allocated-memory", "metaspace-memory", "application-memory", "instructions"):
15541633
rules += [
15551634
ExcludeWarmupRule(
@@ -1692,6 +1771,27 @@ def _get_metric_name(bench_output) -> Optional[str]:
16921771
"profile from the first fork. Relevant only for PGO benchmarks that run multiple forks."
16931772
),
16941773
)
1774+
_polybench_bench_suite_parser.parser.add_argument(
1775+
"--disable-time-tracker-rule",
1776+
action="store_true",
1777+
help="Use the wall-clock time recorded by the TimeTracker to generate 'avg-time' datapoints.",
1778+
)
1779+
_polybench_bench_suite_parser.parser.add_argument(
1780+
"--allow-flaky-polybench-patterns",
1781+
help=(
1782+
"A comma-separated list of pattern identifiers which should be used for flaky failure identification. "
1783+
"This option is useful to regulate whether to allow or not known flaky failures."
1784+
),
1785+
)
1786+
_polybench_bench_suite_parser.parser.add_argument(
1787+
"--flaky-failure-stability-tolerance",
1788+
help=(
1789+
"The number of flaky failures allowed per-benchmark for generating the stabilized metrics. "
1790+
"The stabilized metrics are not produced for benchmarks which exceed the tolerance, "
1791+
"but the benchmark execution itself is not failed. "
1792+
"By default, the stabilized metric will be produced as long as any forks were successful."
1793+
),
1794+
)
16951795
add_parser(PolybenchBenchmarkSuite.POLYBENCH_BENCH_SUITE_PARSER_NAME, _polybench_bench_suite_parser)
16961796

16971797

vm/mx.vm/suite.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@
5858
},
5959
{
6060
"name": "graalpython",
61-
"version": "8107d5a57cf868f0960f2e56f7e5a7ff9b223ce1",
61+
"version": "c4ec8a2c0c70ad6a85a3d4ee8443eab591e456ac",
6262
"dynamic": True,
6363
"urls": [
6464
{"url": "https://github.com/graalvm/graalpython.git", "kind": "git"},

0 commit comments

Comments (0)