From 4f8a934c4fa97479c24f756ef8ee40542f978102 Mon Sep 17 00:00:00 2001 From: Nick Fitzgerald Date: Thu, 25 Jun 2026 12:25:12 -0700 Subject: [PATCH] Adjust `pca.R` to choose a cluster representative closest to 100M Wasm instructions executed Rather than the fewest Wasm instructions executed. 100M was chosen previously as a large enough number to avoid lots of noise and to fill our callgrind-simulated caches, but small enough to run individual iterations relatively quickly. --- benchmarks/README.md | 20 ++++------------ scripts/pca.R | 56 ++++++++++++++++++++++++++++++-------------- 2 files changed, 43 insertions(+), 33 deletions(-) diff --git a/benchmarks/README.md b/benchmarks/README.md index 85ec9090..9356e653 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -64,6 +64,11 @@ record its execution, it must: sibling file located next to the `benchmark.wasm` file. The runner will assert that the actual execution's output matches the expectation. +* The benchmark should dynamically execute ~100,000,000 wasm instructions per + execution. You can check this by running + `cargo run -- pca-metrics path/to/benchmark` and looking at the + `dynamic_total_inst_count` column of the resulting CSV output. + Many of the above requirements can be checked by running the `.wasm` file through the `validate` command: @@ -119,21 +124,6 @@ following requirements: ballpark numbers for whether further investment in an optimization is worth it, without waiting for the full, thorough benchmark suite to complete. -* Each workload must have an expected result, so that we can validate executions - and avoid accepting "fast" but incorrect results. - -* Compiling and instantiating the candidate program and then executing its - workload should take *roughly* one to six seconds total. 
- - > Napkin math: We want the full benchmark to run in a reasonable amount of - > time, say twenty to thirty minutes, and we want somewhere around ten to - > twenty programs altogether in the benchmark suite to balance diversity, - > simplicity, and time spent in execution versus compilation and - > instantiation. Additionally, for good statistical analyses, we need *at - > least* 30 samples (ideally more like 100) from each benchmark program. That - > leaves an average of about one to six seconds for each benchmark program to - > compile, instantiate, and execute the workload. - * Inputs should be given through I/O and results reported through I/O. This ensures that the compiler cannot optimize the benchmark program away. diff --git a/scripts/pca.R b/scripts/pca.R index 9927f19f..f7c259ea 100755 --- a/scripts/pca.R +++ b/scripts/pca.R @@ -24,11 +24,12 @@ # Euclidean distance between their principal-component scores, as in the paper. # # Finally, we recommend a subset of the suite. Each cluster is represented by -# its cheapest member (the benchmark that executes the fewest dynamic wasm -# instructions). Sweeping the number of clusters traces a Pareto trade-off -# between clustering error (SSE) and the cost of running the subset (its total -# dynamic instructions); the knee of that curve is the Pareto-optimal cluster -# size. +# the member whose dynamic wasm instruction count is closest to +# `TARGET_INST_COUNT`, so the subset runs benchmarks of a representative, +# substantial size rather than each cluster's briefest (and noisiest) member. +# Sweeping the number of clusters traces a Pareto trade-off between clustering +# error (SSE) and the cost of running the subset (its total dynamic +# instructions); the knee of that curve is the Pareto-optimal cluster size. # # Outputs (written to the current working directory): # @@ -70,10 +71,17 @@ CLUSTER_VAR_THRESHOLD <- 0.9 # characteristic, so it is excluded from the PCA itself. 
COST_COLUMN <- "dynamic_total_inst_count" +# Each cluster is represented by the benchmark whose dynamic instruction count +# is closest to this target, rather than by its cheapest member. Choosing a +# representative near this size keeps the subset's benchmarks long enough to be +# meaningful while steering away from each cluster's most expensive members. +TARGET_INST_COUNT <- 100000000 + # Benchmarks executing fewer than this many dynamic instructions run too briefly -# to characterize reliably -- their instruction-mix ratios are dominated by -# noise -- so they are filtered out before the analysis. -MIN_DYNAMIC_INST_COUNT <- 1000000 +# to characterize reliably so they are filtered out before the analysis. The +# floor is half the representative target: a benchmark smaller than that is too +# far below the target size to stand in for its cluster anyway. +MIN_DYNAMIC_INST_COUNT <- TARGET_INST_COUNT / 2 # Turn a benchmark path into a short, unique label. # @@ -236,21 +244,29 @@ within_cluster_sse <- function(scores, assignment) { }, numeric(1))) } -# Cost of representing every cluster by its cheapest member. +# Index, within a vector of dynamic instruction counts, of a cluster's +# representative: the member whose count is closest to `TARGET_INST_COUNT`. +representative_index <- function(cost) { + which.min(abs(cost - TARGET_INST_COUNT)) +} + +# Cost of representing every cluster by its representative member. # -# The benchmark executing the fewest dynamic instructions stands in for its -# whole cluster. This total rises as the number of clusters grows (more -# representatives to run). +# Each cluster's representative -- the benchmark whose dynamic instruction count +# is closest to `TARGET_INST_COUNT` -- stands in for the whole cluster, so this +# total is the sum of the representatives' instruction counts. 
subset_cost <- function(cost, assignment) { clusters <- split(seq_along(assignment), assignment) - sum(vapply(clusters, function(idx) min(cost[idx]), numeric(1))) + sum(vapply(clusters, function(idx) { + cluster_cost <- cost[idx] + cluster_cost[representative_index(cluster_cost)] + }, numeric(1))) } # Group benchmarks by cluster at size k. # # Returns a list with one data frame per cluster: member full paths and dynamic -# instruction counts, sorted ascending by count so each cluster's cheapest -# member (its representative) comes first. +# instruction counts, sorted ascending by count for a readable cost breakdown. cluster_members <- function(clustering, cost, paths, k) { assignment <- cutree(clustering, k = k) lapply(split(seq_along(assignment), assignment), function(idx) { @@ -420,7 +436,9 @@ main <- function() { # With the graphs written, report the suggested subset per cluster. clusters <- cluster_members(clustering, cost, paths, analysis$best_k) cat(sprintf( - paste0("\nSuggested subset: cheapest benchmark in each of the %d clusters:\n\n"), + paste0("\nSuggested subset: the benchmark closest to %s dynamic ", + "instructions in each of the %d clusters:\n\n"), + format(TARGET_INST_COUNT, big.mark = ",", scientific = FALSE), analysis$best_k )) cat("```\n") @@ -441,8 +459,10 @@ main <- function() { " ", members$benchmark[j], "\n" )) } - # The representative: the cheapest member, listed first after sorting. - cat(members$benchmark[1], "\n", sep = "") + # The representative: the member whose instruction count is closest to + # `TARGET_INST_COUNT`. + rep_idx <- representative_index(members$dynamic_insts) + cat(members$benchmark[rep_idx], "\n", sep = "") } cat("```\n") }