Skip to content

Commit f81f003

Browse files
author
Andrija Kolic
committed
[GR-71966] Support matplotlib benchmarks by handling flaky fork failures and enabling end-to-end time measuring
PullRequest: graal/23247
2 parents f1b7165 + ce22c01 commit f81f003

3 files changed

Lines changed: 121 additions & 21 deletions

File tree

sdk/mx.sdk/mx_sdk_benchmark.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -818,7 +818,7 @@ def stage_aware_run(self, args, out=None, err=None, cwd=None, nonZeroIsFatal=Fal
818818
self._prepare_for_running(args, out, err, cwd, nonZeroIsFatal)
819819
self.run_single_stage()
820820

821-
if self.stages_info.failed:
821+
if self.stages_info.failed and not self.bmSuite.ignore_benchmark_failure(out.data, bm_exec_context().get("benchmarks"), bm_exec_context().get("bm_suite_args")):
822822
mx.abort('Exiting the benchmark due to the failure.')
823823

824824
def prepare_stages(self, bm_suite: NativeImageBenchmarkMixin, bm_suite_args) -> tuple[list[Stage], list[Stage]]:
@@ -2078,7 +2078,7 @@ class GraalHostPolyBenchStagingVm(PolyBenchStagingVm):
20782078
GRAALHOST_FSMAPPING_TEMPLATE: Template = Template("""
20792079
{
20802080
"fsmappings": [
2081-
{"concrete": "${path}", "virt": "${path}"}
2081+
{"concrete": "${path}", "virt": "${path}", "mutable": true}
20822082
]
20832083
}
20842084
""")

truffle/mx.truffle/mx_polybench/model.py

Lines changed: 118 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -930,7 +930,7 @@ def verify_and_process_id_score_function(self, datapoint: DataPoint):
930930
class NonNativeImageBenchmarkSummaryPostProcessor(FinalDispatchFinalStageAverageWithOutlierRemovalPostProcessor):
931931
"""
932932
Post-processor that calculates the outlier-excluded average of the "avg-time" metric across dispatches
933-
and produces a final "time" metric for a benchmark.
933+
and produces a final "time"/"one-shot" metric for a benchmark.
934934
Should only be used when running a benchmark in server (non-native) mode.
935935
"""
936936

@@ -944,7 +944,10 @@ def key_fn(dp):
944944
field = "metric.value"
945945

946946
def update_fn(dp):
947-
dp["metric.name"] = "time"
947+
metric_name = "time"
948+
if self._suite._use_wall_clock_time():
949+
metric_name = "one-shot"
950+
dp["metric.name"] = metric_name
948951
if "metric.object" in dp:
949952
del dp["metric.object"]
950953
if "metric.fork-number" in dp:
@@ -994,7 +997,7 @@ def determine_stable_run_outlier_exclusion_percentiles(self, bench_config: Bench
994997
class NativeModeBenchmarkSummaryPostProcessor(FinalDispatchFinalStageAverageWithOutlierRemovalPostProcessor):
995998
"""
996999
Post-processor that calculates the outlier-excluded average of the "avg-time" metric across image builds
997-
and produces a final "time" metric for a benchmark (separate "run" and "instrument-run" datapoints).
1000+
and produces a final "time"/"one-shot" metric for a benchmark (separate "run" and "instrument-run" datapoints).
9981001
Should only be used when running a benchmark in native mode.
9991002
"""
10001003

@@ -1008,7 +1011,10 @@ def key_fn(dp):
10081011
field = "metric.value"
10091012

10101013
def update_fn(dp):
1011-
dp["metric.name"] = "time"
1014+
metric_name = "time"
1015+
if self._suite._use_wall_clock_time():
1016+
metric_name = "one-shot"
1017+
dp["metric.name"] = metric_name
10121018
if "metric.fork-number" in dp:
10131019
del dp["metric.fork-number"]
10141020
if "native-image.image-fork-number" in dp:
@@ -1048,6 +1054,38 @@ def select_datapoints(self, datapoints: DataPoints) -> DataPoints:
10481054
)
10491055
return super().select_datapoints(datapoints)
10501056

1057+
def group_datapoints(self, datapoints: DataPoints) -> Dict[Any, DataPoints]:
1058+
groups = super().group_datapoints(datapoints)
1059+
flaky_tolerance = self._suite.polybench_bench_suite_args(
1060+
bm_exec_context().get("bm_suite_args")
1061+
).flaky_failure_stability_tolerance
1062+
config = bm_exec_context().get(PolybenchBenchmarkSuite.STABLE_CONFIG)
1063+
if flaky_tolerance is None or config is None:
1064+
return groups
1065+
1066+
# Filter out groups belonging to benchmarks which had too many flaky failures
1067+
flaky_tolerance = int(flaky_tolerance)
1068+
groups_within_tolerance = {}
1069+
for key, group in groups.items():
1070+
bench = key[0]
1071+
stage = key[1]
1072+
if stage != "run":
1073+
# Flaky tolerance only applies to datapoints from the 'run' stage
1074+
groups_within_tolerance[key] = group
1075+
continue
1076+
total_forks = config.get_benchmark(bench).builds * config.get_benchmark(bench).forks
1077+
required_count = total_forks - flaky_tolerance
1078+
if len(group) < required_count:
1079+
mx.log(
1080+
f"Not generating the stabilized metric for benchmark '{bench}' due to too few successful runs ({len(group)} < {required_count})."
1081+
)
1082+
else:
1083+
mx.log(
1084+
f"Generating the stabilized metric for benchmark '{bench}' using the {len(group)} available datapoints (out of {total_forks})."
1085+
)
1086+
groups_within_tolerance[key] = group
1087+
return groups_within_tolerance
1088+
10511089
def determine_stable_run_outlier_exclusion_percentiles(self, bench_config: BenchmarkStableRunConfig):
10521090
self._lower_percentile = bench_config.build_outlier_exclusion.lower_percentile
10531091
self._upper_percentile = bench_config.build_outlier_exclusion.upper_percentile
@@ -1300,6 +1338,18 @@ def _ensure_instrumentation_profile_name_is_benchmark_specific(
13001338
# Store the profile for use in upcoming IMAGE stages
13011339
bm_exec_context().get(PolybenchBenchmarkSuite.PGO_PROFILES).append(bench_unique_profile_path)
13021340

1341+
def flakySkipPatterns(self, benchmarks, bmSuiteArgs):
1342+
polybench_patterns = []
1343+
if self._allow_flaky_skip_pattern("GR-73428"):
1344+
# Transient glibc malloc errors (double free/invalid size) in NumPy during matplotlib benchmarks [GR-73428]
1345+
polybench_patterns.append(
1346+
r"\*\*\* Error in `[^`]+': (free\(\): invalid size)|(double free or corruption \([^\)]+\))|(corrupted size vs\. prev_size): 0x[a-z\d]+ \*\*\*"
1347+
)
1348+
polybench_patterns.append(
1349+
r"_GraalPyMem_RawFree: freed memory size \(\d+\) is larger than allocated memory size \(\d+\)"
1350+
)
1351+
return polybench_patterns + super().flakySkipPatterns(benchmarks, bmSuiteArgs)
1352+
13031353
def run(self, benchmarks, bmSuiteArgs) -> DataPoints:
13041354
# name used by NativeImageBenchmarkMixin
13051355
self.benchmark_name = benchmarks[0]
@@ -1341,6 +1391,15 @@ def _resolve_stable_run_config(self) -> Optional[SuiteStableRunConfig]:
13411391
return None
13421392
return SuiteStableRunConfig(config_path)
13431393

1394+
def _use_wall_clock_time(self) -> bool:
1395+
return self.polybench_bench_suite_args(bm_exec_context().get("bm_suite_args")).disable_time_tracker_rule
1396+
1397+
def _allow_flaky_skip_pattern(self, pattern_id: str) -> bool:
1398+
enabled_patterns = self.polybench_bench_suite_args(
1399+
bm_exec_context().get("bm_suite_args")
1400+
).allow_flaky_polybench_patterns
1401+
return enabled_patterns is not None and pattern_id in enabled_patterns.split(",")
1402+
13441403
@staticmethod
13451404
def _prepare_distributions(
13461405
working_directory: str, resolved_benchmark: ResolvedPolybenchBenchmark
@@ -1507,6 +1566,7 @@ def rules(self, output, benchmarks, bmSuiteArgs):
15071566
# - "warmup" (per-iteration data for "warmup" and "run" iterations)
15081567
# - "time-sample" (per-iteration data for only the "run" iterations)
15091568
# - "avg-time" (aggregation of per-iteration data for the "run" iterations after outlier removal)
1569+
# The post-processors use the "avg-time" data to produce a final "time"/"one-shot" datapoint.
15101570
rules += [
15111571
mx_benchmark.StdOutRule(
15121572
r"\[.*\] iteration ([0-9]*): (?P<value>.*) (?P<unit>.*)",
@@ -1535,21 +1595,40 @@ def rules(self, output, benchmarks, bmSuiteArgs):
15351595
},
15361596
startPattern=r"::: Running :::",
15371597
),
1538-
ExcludeWarmupRule(
1539-
r"\[.*\] run aggregate summary: (?P<value>.*) (?P<unit>.*)",
1540-
{
1541-
"benchmark": benchmark_name,
1542-
"metric.better": "lower",
1543-
"metric.name": "avg-time",
1544-
"metric.object": "fork",
1545-
"metric.unit": ("<unit>", str),
1546-
"metric.value": ("<value>", float),
1547-
"metric.type": "numeric",
1548-
"metric.score-function": "id",
1549-
},
1550-
startPattern=r"::: Running :::",
1551-
),
15521598
]
1599+
if self._use_wall_clock_time():
1600+
rules.append(
1601+
mx_benchmark.StdOutRule(
1602+
r"Wall-clock time: (?P<value>.*) sec",
1603+
{
1604+
"benchmark": benchmark_name,
1605+
"metric.better": "lower",
1606+
"metric.name": "avg-time",
1607+
"metric.object": "fork",
1608+
"metric.unit": "ms",
1609+
"metric.value": ("<value>", lambda x: float(x) * 1000),
1610+
"metric.type": "numeric",
1611+
"metric.score-function": "id",
1612+
},
1613+
)
1614+
)
1615+
else:
1616+
rules.append(
1617+
ExcludeWarmupRule(
1618+
r"\[.*\] run aggregate summary: (?P<value>.*) (?P<unit>.*)",
1619+
{
1620+
"benchmark": benchmark_name,
1621+
"metric.better": "lower",
1622+
"metric.name": "avg-time",
1623+
"metric.object": "fork",
1624+
"metric.unit": ("<unit>", str),
1625+
"metric.value": ("<value>", float),
1626+
"metric.type": "numeric",
1627+
"metric.score-function": "id",
1628+
},
1629+
startPattern=r"::: Running :::",
1630+
)
1631+
)
15531632
elif metric_name in ("allocated-memory", "metaspace-memory", "application-memory", "instructions"):
15541633
rules += [
15551634
ExcludeWarmupRule(
@@ -1692,6 +1771,27 @@ def _get_metric_name(bench_output) -> Optional[str]:
16921771
"profile from the first fork. Relevant only for PGO benchmarks that run multiple forks."
16931772
),
16941773
)
1774+
_polybench_bench_suite_parser.parser.add_argument(
1775+
"--disable-time-tracker-rule",
1776+
action="store_true",
1777+
help="Use the wall-clock time recorded by the TimeTracker to generate 'avg-time' datapoints.",
1778+
)
1779+
_polybench_bench_suite_parser.parser.add_argument(
1780+
"--allow-flaky-polybench-patterns",
1781+
help=(
1782+
"A comma-separated list of pattern identifiers which should be used for flaky failure identification. "
1783+
"This option is useful to regulate whether to allow or not known flaky failures."
1784+
),
1785+
)
1786+
_polybench_bench_suite_parser.parser.add_argument(
1787+
"--flaky-failure-stability-tolerance",
1788+
help=(
1789+
"The number of flaky failures allowed per-benchmark for generating the stabilized metrics. "
1790+
"The stabilized metrics are not produced for benchmarks which exceed the tolerance, "
1791+
"but the benchmark execution itself is not failed. "
1792+
"By default, the stabilized metric will be produced as long as any forks were successful."
1793+
),
1794+
)
16951795
add_parser(PolybenchBenchmarkSuite.POLYBENCH_BENCH_SUITE_PARSER_NAME, _polybench_bench_suite_parser)
16961796

16971797

vm/mx.vm/suite.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@
5858
},
5959
{
6060
"name": "graalpython",
61-
"version": "8107d5a57cf868f0960f2e56f7e5a7ff9b223ce1",
61+
"version": "c4ec8a2c0c70ad6a85a3d4ee8443eab591e456ac",
6262
"dynamic": True,
6363
"urls": [
6464
{"url": "https://github.com/graalvm/graalpython.git", "kind": "git"},

0 commit comments

Comments (0)