From 4f8a934c4fa97479c24f756ef8ee40542f978102 Mon Sep 17 00:00:00 2001
From: Nick Fitzgerald <fitzgen@gmail.com>
Date: Thu, 25 Jun 2026 12:25:12 -0700
Subject: [PATCH] Adjust `pca.R` to choose a cluster representative closest to
 100M Wasm instructions executed

Until now, each cluster's representative was the member that executed the fewest
Wasm instructions. The 100M target was chosen previously as a number large enough
to avoid lots of noise and to fill our callgrind-simulated caches, but small
enough to run individual iterations relatively quickly.
---
 benchmarks/README.md | 20 ++++------------
 scripts/pca.R        | 56 ++++++++++++++++++++++++++++++--------------
 2 files changed, 43 insertions(+), 33 deletions(-)

diff --git a/benchmarks/README.md b/benchmarks/README.md
index 85ec9090..9356e653 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -64,6 +64,11 @@ record its execution, it must:
   sibling file located next to the `benchmark.wasm` file. The runner will assert
   that the actual execution's output matches the expectation.
 
+* The benchmark should dynamically execute ~100,000,000 wasm instructions per
+  execution. You can check this by running
+  `cargo run -- pca-metrics path/to/benchmark` and looking at the
+  `dynamic_total_inst_count` column of the resulting CSV output.
+
 Many of the above requirements can be checked by running the `.wasm` file
 through the `validate` command:
 
@@ -119,21 +124,6 @@ following requirements:
   ballpark numbers for whether further investment in an optimization is worth
   it, without waiting for the full, thorough benchmark suite to complete.
 
-* Each workload must have an expected result, so that we can validate executions
-  and avoid accepting "fast" but incorrect results.
-
-* Compiling and instantiating the candidate program and then executing its
-  workload should take *roughly* one to six seconds total.
-
-  > Napkin math: We want the full benchmark to run in a reasonable amount of
-  > time, say twenty to thirty minutes, and we want somewhere around ten to
-  > twenty programs altogether in the benchmark suite to balance diversity,
-  > simplicity, and time spent in execution versus compilation and
-  > instantiation. Additionally, for good statistical analyses, we need *at
-  > least* 30 samples (ideally more like 100) from each benchmark program. That
-  > leaves an average of about one to six seconds for each benchmark program to
-  > compile, instantiate, and execute the workload.
-
 * Inputs should be given through I/O and results reported through I/O. This
   ensures that the compiler cannot optimize the benchmark program away.
 
diff --git a/scripts/pca.R b/scripts/pca.R
index 9927f19f..f7c259ea 100755
--- a/scripts/pca.R
+++ b/scripts/pca.R
@@ -24,11 +24,12 @@
 # Euclidean distance between their principal-component scores, as in the paper.
 #
 # Finally, we recommend a subset of the suite. Each cluster is represented by
-# its cheapest member (the benchmark that executes the fewest dynamic wasm
-# instructions). Sweeping the number of clusters traces a Pareto trade-off
-# between clustering error (SSE) and the cost of running the subset (its total
-# dynamic instructions); the knee of that curve is the Pareto-optimal cluster
-# size.
+# the member whose dynamic wasm instruction count is closest to
+# `TARGET_INST_COUNT`, so the subset runs benchmarks of a representative,
+# substantial size rather than each cluster's briefest (and noisiest) member.
+# Sweeping the number of clusters traces a Pareto trade-off between clustering
+# error (SSE) and the cost of running the subset (its total dynamic
+# instructions); the knee of that curve is the Pareto-optimal cluster size.
 #
 # Outputs (written to the current working directory):
 #
@@ -70,10 +71,17 @@ CLUSTER_VAR_THRESHOLD <- 0.9
 # characteristic, so it is excluded from the PCA itself.
 COST_COLUMN <- "dynamic_total_inst_count"
 
+# Each cluster is represented by the benchmark whose dynamic instruction count
+# is closest to this target, rather than by its cheapest member. Choosing a
+# representative near this size keeps the subset's benchmarks long enough to be
+# meaningful while steering away from each cluster's most expensive members.
+TARGET_INST_COUNT <- 100000000
+
 # Benchmarks executing fewer than this many dynamic instructions run too briefly
-# to characterize reliably -- their instruction-mix ratios are dominated by
-# noise -- so they are filtered out before the analysis.
-MIN_DYNAMIC_INST_COUNT <- 1000000
+# to characterize reliably, so they are filtered out before the analysis. The
+# floor is half the representative target: a benchmark smaller than that is too
+# far below the target size to stand in for its cluster anyway.
+MIN_DYNAMIC_INST_COUNT <- TARGET_INST_COUNT / 2
 
 # Turn a benchmark path into a short, unique label.
 #
@@ -236,21 +244,29 @@ within_cluster_sse <- function(scores, assignment) {
     }, numeric(1)))
 }
 
-# Cost of representing every cluster by its cheapest member.
+# Index, within a vector of dynamic instruction counts, of a cluster's
+# representative: the member closest to `TARGET_INST_COUNT` (ties -> cheaper).
+representative_index <- function(cost) {
+    order(abs(cost - TARGET_INST_COUNT), cost)[1L]
+}
+
+# Total cost of running one representative benchmark from every cluster.
 #
-# The benchmark executing the fewest dynamic instructions stands in for its
-# whole cluster. This total rises as the number of clusters grows (more
-# representatives to run).
+# `cost` holds each benchmark's dynamic instruction count and `assignment` its
+# cluster id; each cluster contributes the count of the member closest to
+# `TARGET_INST_COUNT`, and the result is the sum of those counts.
 subset_cost <- function(cost, assignment) {
     clusters <- split(seq_along(assignment), assignment)
-    sum(vapply(clusters, function(idx) min(cost[idx]), numeric(1)))
+    sum(vapply(clusters, function(idx) {
+        cluster_cost <- cost[idx]
+        cluster_cost[representative_index(cluster_cost)]
+    }, numeric(1)))
 }
 
 # Group benchmarks by cluster at size k.
 #
 # Returns a list with one data frame per cluster: member full paths and dynamic
-# instruction counts, sorted ascending by count so each cluster's cheapest
-# member (its representative) comes first.
+# instruction counts, sorted ascending by count for a readable cost breakdown.
 cluster_members <- function(clustering, cost, paths, k) {
     assignment <- cutree(clustering, k = k)
     lapply(split(seq_along(assignment), assignment), function(idx) {
@@ -420,7 +436,9 @@ main <- function() {
     # With the graphs written, report the suggested subset per cluster.
     clusters <- cluster_members(clustering, cost, paths, analysis$best_k)
     cat(sprintf(
-        paste0("\nSuggested subset: cheapest benchmark in each of the %d clusters:\n\n"),
+        paste0("\nSuggested subset: the benchmark closest to %s dynamic ",
+               "instructions in each of the %d clusters:\n\n"),
+        format(TARGET_INST_COUNT, big.mark = ",", scientific = FALSE),
         analysis$best_k
     ))
     cat("```\n")
@@ -441,8 +459,10 @@ main <- function() {
                 "    ", members$benchmark[j], "\n"
             ))
         }
-        # The representative: the cheapest member, listed first after sorting.
-        cat(members$benchmark[1], "\n", sep = "")
+        # The representative: the member whose instruction count is closest to
+        # `TARGET_INST_COUNT`.
+        rep_idx <- representative_index(members$dynamic_insts)
+        cat(members$benchmark[rep_idx], "\n", sep = "")
     }
     cat("```\n")
 }