@@ -930,7 +930,7 @@ def verify_and_process_id_score_function(self, datapoint: DataPoint):
930930class NonNativeImageBenchmarkSummaryPostProcessor (FinalDispatchFinalStageAverageWithOutlierRemovalPostProcessor ):
931931 """
932932 Post-processor that calculates the outlier-excluded average of the "avg-time" metric across dispatches
933- and produces a final "time" metric for a benchmark.
933+ and produces a final "time"/"one-shot" metric for a benchmark.
934934 Should only be used when running a benchmark in server (non-native) mode.
935935 """
936936
@@ -944,7 +944,10 @@ def key_fn(dp):
944944 field = "metric.value"
945945
946946 def update_fn (dp ):
947- dp ["metric.name" ] = "time"
947+ metric_name = "time"
948+ if self ._suite ._use_wall_clock_time ():
949+ metric_name = "one-shot"
950+ dp ["metric.name" ] = metric_name
948951 if "metric.object" in dp :
949952 del dp ["metric.object" ]
950953 if "metric.fork-number" in dp :
@@ -994,7 +997,7 @@ def determine_stable_run_outlier_exclusion_percentiles(self, bench_config: Bench
994997class NativeModeBenchmarkSummaryPostProcessor (FinalDispatchFinalStageAverageWithOutlierRemovalPostProcessor ):
995998 """
996999 Post-processor that calculates the outlier-excluded average of the "avg-time" metric across image builds
997- and produces a final "time" metric for a benchmark (separate "run" and "instrument-run" datapoints).
1000+ and produces a final "time"/"one-shot" metric for a benchmark (separate "run" and "instrument-run" datapoints).
9981001 Should only be used when running a benchmark in native mode.
9991002 """
10001003
@@ -1008,7 +1011,10 @@ def key_fn(dp):
10081011 field = "metric.value"
10091012
10101013 def update_fn (dp ):
1011- dp ["metric.name" ] = "time"
1014+ metric_name = "time"
1015+ if self ._suite ._use_wall_clock_time ():
1016+ metric_name = "one-shot"
1017+ dp ["metric.name" ] = metric_name
10121018 if "metric.fork-number" in dp :
10131019 del dp ["metric.fork-number" ]
10141020 if "native-image.image-fork-number" in dp :
@@ -1048,6 +1054,38 @@ def select_datapoints(self, datapoints: DataPoints) -> DataPoints:
10481054 )
10491055 return super ().select_datapoints (datapoints )
10501056
1057+ def group_datapoints (self , datapoints : DataPoints ) -> Dict [Any , DataPoints ]:
1058+ groups = super ().group_datapoints (datapoints )
1059+ flaky_tolerance = self ._suite .polybench_bench_suite_args (
1060+ bm_exec_context ().get ("bm_suite_args" )
1061+ ).flaky_failure_stability_tolerance
1062+ config = bm_exec_context ().get (PolybenchBenchmarkSuite .STABLE_CONFIG )
1063+ if flaky_tolerance is None or config is None :
1064+ return groups
1065+
1066+ # Filter out groups belonging to benchmarks which had too many flaky failures
1067+ flaky_tolerance = int (flaky_tolerance )
1068+ groups_within_tolerance = {}
1069+ for key , group in groups .items ():
1070+ bench = key [0 ]
1071+ stage = key [1 ]
1072+ if stage != "run" :
1073+ # Flaky tolerance only applies to datapoints from the 'run' stage
1074+ groups_within_tolerance [key ] = group
1075+ continue
1076+ total_forks = config .get_benchmark (bench ).builds * config .get_benchmark (bench ).forks
1077+ required_count = total_forks - flaky_tolerance
1078+ if len (group ) < required_count :
1079+ mx .log (
1080+ f"Not generating the stabilized metric for benchmark '{ bench } ' due to too few successful runs ({ len (group )} < { required_count } )."
1081+ )
1082+ else :
1083+ mx .log (
1084+ f"Generating the stabilized metric for benchmark '{ bench } ' using the { len (group )} available datapoints (out of { total_forks } )."
1085+ )
1086+ groups_within_tolerance [key ] = group
1087+ return groups_within_tolerance
1088+
10511089 def determine_stable_run_outlier_exclusion_percentiles (self , bench_config : BenchmarkStableRunConfig ):
10521090 self ._lower_percentile = bench_config .build_outlier_exclusion .lower_percentile
10531091 self ._upper_percentile = bench_config .build_outlier_exclusion .upper_percentile
@@ -1300,6 +1338,18 @@ def _ensure_instrumentation_profile_name_is_benchmark_specific(
13001338 # Store the profile for use in upcoming IMAGE stages
13011339 bm_exec_context ().get (PolybenchBenchmarkSuite .PGO_PROFILES ).append (bench_unique_profile_path )
13021340
1341+ def flakySkipPatterns (self , benchmarks , bmSuiteArgs ):
1342+ polybench_patterns = []
1343+ if self ._allow_flaky_skip_pattern ("GR-73428" ):
1344+ # Transient glibc malloc errors (double free/invalid size) in NumPy during matplotlib benchmarks [GR-73428]
1345+ polybench_patterns .append (
1346+ r"\*\*\* Error in `[^`]+': (free\(\): invalid size)|(double free or corruption \([^\)]+\))|(corrupted size vs\. prev_size): 0x[a-z\d]+ \*\*\*"
1347+ )
1348+ polybench_patterns .append (
1349+ r"_GraalPyMem_RawFree: freed memory size \(\d+\) is larger than allocated memory size \(\d+\)"
1350+ )
1351+ return polybench_patterns + super ().flakySkipPatterns (benchmarks , bmSuiteArgs )
1352+
13031353 def run (self , benchmarks , bmSuiteArgs ) -> DataPoints :
13041354 # name used by NativeImageBenchmarkMixin
13051355 self .benchmark_name = benchmarks [0 ]
@@ -1341,6 +1391,15 @@ def _resolve_stable_run_config(self) -> Optional[SuiteStableRunConfig]:
13411391 return None
13421392 return SuiteStableRunConfig (config_path )
13431393
1394+ def _use_wall_clock_time (self ) -> bool :
1395+ return self .polybench_bench_suite_args (bm_exec_context ().get ("bm_suite_args" )).disable_time_tracker_rule
1396+
1397+ def _allow_flaky_skip_pattern (self , pattern_id : str ) -> bool :
1398+ enabled_patterns = self .polybench_bench_suite_args (
1399+ bm_exec_context ().get ("bm_suite_args" )
1400+ ).allow_flaky_polybench_patterns
1401+ return enabled_patterns is not None and pattern_id in enabled_patterns .split ("," )
1402+
13441403 @staticmethod
13451404 def _prepare_distributions (
13461405 working_directory : str , resolved_benchmark : ResolvedPolybenchBenchmark
@@ -1507,6 +1566,7 @@ def rules(self, output, benchmarks, bmSuiteArgs):
15071566 # - "warmup" (per-iteration data for "warmup" and "run" iterations)
15081567 # - "time-sample" (per-iteration data for only the "run" iterations)
15091568 # - "avg-time" (aggregation of per-iteration data for the "run" iterations after outlier removal)
1569+ # The post-processors use the "avg-time" data to produce a final "time"/"one-shot" datapoint.
15101570 rules += [
15111571 mx_benchmark .StdOutRule (
15121572 r"\[.*\] iteration ([0-9]*): (?P<value>.*) (?P<unit>.*)" ,
@@ -1535,21 +1595,40 @@ def rules(self, output, benchmarks, bmSuiteArgs):
15351595 },
15361596 startPattern = r"::: Running :::" ,
15371597 ),
1538- ExcludeWarmupRule (
1539- r"\[.*\] run aggregate summary: (?P<value>.*) (?P<unit>.*)" ,
1540- {
1541- "benchmark" : benchmark_name ,
1542- "metric.better" : "lower" ,
1543- "metric.name" : "avg-time" ,
1544- "metric.object" : "fork" ,
1545- "metric.unit" : ("<unit>" , str ),
1546- "metric.value" : ("<value>" , float ),
1547- "metric.type" : "numeric" ,
1548- "metric.score-function" : "id" ,
1549- },
1550- startPattern = r"::: Running :::" ,
1551- ),
15521598 ]
1599+ if self ._use_wall_clock_time ():
1600+ rules .append (
1601+ mx_benchmark .StdOutRule (
1602+ r"Wall-clock time: (?P<value>.*) sec" ,
1603+ {
1604+ "benchmark" : benchmark_name ,
1605+ "metric.better" : "lower" ,
1606+ "metric.name" : "avg-time" ,
1607+ "metric.object" : "fork" ,
1608+ "metric.unit" : "ms" ,
1609+ "metric.value" : ("<value>" , lambda x : float (x ) * 1000 ),
1610+ "metric.type" : "numeric" ,
1611+ "metric.score-function" : "id" ,
1612+ },
1613+ )
1614+ )
1615+ else :
1616+ rules .append (
1617+ ExcludeWarmupRule (
1618+ r"\[.*\] run aggregate summary: (?P<value>.*) (?P<unit>.*)" ,
1619+ {
1620+ "benchmark" : benchmark_name ,
1621+ "metric.better" : "lower" ,
1622+ "metric.name" : "avg-time" ,
1623+ "metric.object" : "fork" ,
1624+ "metric.unit" : ("<unit>" , str ),
1625+ "metric.value" : ("<value>" , float ),
1626+ "metric.type" : "numeric" ,
1627+ "metric.score-function" : "id" ,
1628+ },
1629+ startPattern = r"::: Running :::" ,
1630+ )
1631+ )
15531632 elif metric_name in ("allocated-memory" , "metaspace-memory" , "application-memory" , "instructions" ):
15541633 rules += [
15551634 ExcludeWarmupRule (
@@ -1692,6 +1771,27 @@ def _get_metric_name(bench_output) -> Optional[str]:
16921771 "profile from the first fork. Relevant only for PGO benchmarks that run multiple forks."
16931772 ),
16941773)
# Register the Polybench-specific command-line options for wall-clock timing and
# flaky-failure handling on the suite's argument parser.
_parser = _polybench_bench_suite_parser.parser
_parser.add_argument(
    "--disable-time-tracker-rule",
    action="store_true",
    help="Use the wall-clock time recorded by the TimeTracker to generate 'avg-time' datapoints.",
)
_parser.add_argument(
    "--allow-flaky-polybench-patterns",
    help=(
        "A comma-separated list of pattern identifiers which should be used for flaky failure identification. "
        "This option is useful to regulate whether to allow or not known flaky failures."
    ),
)
_parser.add_argument(
    "--flaky-failure-stability-tolerance",
    help=(
        "The number of flaky failures allowed per-benchmark for generating the stabilized metrics. "
        "The stabilized metrics are not produced for benchmarks which exceed the tolerance, "
        "but the benchmark execution itself is not failed. "
        "By default, the stabilized metric will be produced as long as any forks were successful."
    ),
)
16951795add_parser (PolybenchBenchmarkSuite .POLYBENCH_BENCH_SUITE_PARSER_NAME , _polybench_bench_suite_parser )
16961796
16971797
0 commit comments