//! The implementation of this crate is kept in a separate module
//! so that it is easy to publish this crate as part of GPUI's dependencies.

use collections::HashMap;
use serde::{Deserialize, Serialize};
use std::{num::NonZero, time::Duration};

pub mod consts {
    //! Preset identifiers and constants so that the profiler and proc macro agree
    //! on their communication protocol.
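    //!
    //! As an illustration only (the exact line format is an assumption, not
    //! something these constants define): a test's iteration count might be
    //! reported on a line built from `MDATA_LINE_PREF` plus
    //! `ITER_COUNT_LINE_NAME` plus the value, e.g. `ZED_MDATA_iter_count: 128`.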

    /// The suffix on the actual test function.
    pub const SUF_NORMAL: &str = "__ZED_PERF_FN";
    /// The suffix on an extra function which prints metadata about a test to stdout.
    pub const SUF_MDATA: &str = "__ZED_PERF_MDATA";
    /// The env var in which we pass the iteration count to our tests.
    pub const ITER_ENV_VAR: &str = "ZED_PERF_ITER";
    /// The prefix printed on all benchmark test metadata lines, to distinguish them
    /// from possible output by the test harness itself.
    pub const MDATA_LINE_PREF: &str = "ZED_MDATA_";
    /// The version number for the data returned from the test metadata function.
    /// Increment on non-backwards-compatible changes.
    pub const MDATA_VER: u32 = 0;
    /// The default weight, if none is specified.
    pub const WEIGHT_DEFAULT: u8 = 50;
    /// How long a test must have run to be assumed to be reliable-ish.
    pub const NOISE_CUTOFF: std::time::Duration = std::time::Duration::from_millis(250);

    /// Identifier for the iteration count in test metadata.
    pub const ITER_COUNT_LINE_NAME: &str = "iter_count";
    /// Identifier for the weight in test metadata.
    pub const WEIGHT_LINE_NAME: &str = "weight";
    /// Identifier for importance in test metadata.
    pub const IMPORTANCE_LINE_NAME: &str = "importance";
    /// Identifier for the test metadata version.
    pub const VERSION_LINE_NAME: &str = "version";

    /// Where to save json run information.
    pub const RUNS_DIR: &str = ".perf-runs";
}

/// How relevant a benchmark is.
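///
/// Note that the derived `Ord` orders variants by their discriminants, so
/// `Critical` compares greatest and `Fluff` least; the sorting and report code
/// in this module rely on this ordering. A small illustrative check (marked
/// `ignore` since the crate path to import `Importance` from is not assumed here):
///
/// ```ignore
/// assert!(Importance::Critical > Importance::Average);
/// assert!(Importance::Iffy > Importance::Fluff);
/// ```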
#[derive(Clone, Debug, Default, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize, Deserialize)]
pub enum Importance {
    /// Regressions shouldn't be accepted without good reason.
    Critical = 4,
    /// Regressions should be paid extra attention.
    Important = 3,
    /// No extra attention should be paid to regressions, but they might still
    /// be indicative of something happening.
    #[default]
    Average = 2,
    /// Unclear if regressions are likely to be meaningful, but still worth keeping
    /// an eye on. Lowest level that's checked by default by the profiler.
    Iffy = 1,
    /// Regressions are likely to be spurious or don't affect core functionality.
    /// Only relevant if a lot of them happen, or as supplemental evidence for a
    /// higher-importance benchmark regressing. Not checked by default.
    Fluff = 0,
}

impl std::fmt::Display for Importance {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Importance::Critical => f.write_str("critical"),
            Importance::Important => f.write_str("important"),
            Importance::Average => f.write_str("average"),
            Importance::Iffy => f.write_str("iffy"),
            Importance::Fluff => f.write_str("fluff"),
        }
    }
}

/// Why or when did this test fail?
#[derive(Clone, Debug, Serialize, Deserialize)]
pub enum FailKind {
    /// Failed while triaging it to determine the iteration count.
    Triage,
    /// Failed while profiling it.
    Profile,
    /// Failed due to an incompatible version for the test.
    VersionMismatch,
    /// Could not parse metadata for a test.
    BadMetadata,
    /// Skipped due to filters applied on the perf run.
    Skipped,
}

impl std::fmt::Display for FailKind {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            FailKind::Triage => f.write_str("errored in triage"),
            FailKind::Profile => f.write_str("errored while profiling"),
            FailKind::VersionMismatch => f.write_str("test version mismatch"),
            FailKind::BadMetadata => f.write_str("bad test metadata"),
            FailKind::Skipped => f.write_str("skipped"),
        }
    }
}

/// Information about a given perf test.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct TestMdata {
    /// A version number for when the test was generated. If this is greater
    /// than the version this test handler expects, one of the following will
    /// happen in an unspecified manner:
    /// - The test is skipped silently.
    /// - The handler exits with an error message indicating the version mismatch
    ///   or inability to parse the metadata.
    ///
    /// INVARIANT: If `version` <= `MDATA_VER`, this tool *must* be able to
    /// correctly parse the output of this test.
    pub version: u32,
    /// How many iterations to run this test for, if preset, or how many
    /// iterations the test ended up running, if determined at runtime.
    pub iterations: Option<NonZero<usize>>,
    /// The importance of this particular test. See the docs on `Importance` for
    /// details.
    pub importance: Importance,
    /// The weight of this particular test within its importance category. Used
    /// when comparing across runs.
    pub weight: u8,
}

/// The actual timings of a test, as measured by Hyperfine.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct Timings {
    /// Mean runtime for the full batch of iterations of this test.
    pub mean: Duration,
    /// Standard deviation for the above.
    pub stddev: Duration,
}

impl Timings {
    /// How many iterations does this test seem to do per second?
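    ///
    /// As a worked example (hypothetical numbers): if the full batch of 50
    /// iterations has a mean runtime of 100 ms, this returns
    /// `(1000. / 100.) * 50. = 500.` iterations per second.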
    #[expect(
        clippy::cast_precision_loss,
        reason = "We only care about a couple sig figs anyways"
    )]
    #[must_use]
    pub fn iters_per_sec(&self, total_iters: NonZero<usize>) -> f64 {
        (1000. / self.mean.as_millis() as f64) * total_iters.get() as f64
    }
}

/// Aggregate results, meant to be used for a given importance category. Each
/// test name corresponds to its benchmark results, iteration count, and weight.
type CategoryInfo = HashMap<String, (Timings, NonZero<usize>, u8)>;

/// Aggregate output of all tests run by this handler.
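///
/// A minimal usage sketch (marked `ignore` since the crate path `zed_perf` and
/// the benchmark name below are placeholders, not items defined here):
///
/// ```ignore
/// use std::{num::NonZero, time::Duration};
/// use zed_perf::{Importance, Output, TestMdata, Timings, consts};
///
/// let mut run = Output::blank();
/// run.success(
///     "my_crate::my_benchmark",
///     TestMdata {
///         version: consts::MDATA_VER,
///         iterations: None,
///         importance: Importance::Average,
///         weight: consts::WEIGHT_DEFAULT,
///     },
///     NonZero::new(1000).unwrap(),
///     Timings {
///         mean: Duration::from_millis(300),
///         stddev: Duration::from_millis(10),
///     },
/// );
/// // `Output` implements `Display` as a markdown table.
/// println!("{run}");
/// ```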
#[derive(Clone, Debug, Default, Serialize, Deserialize)]
pub struct Output {
    /// A list of test outputs. Format is `(test_name, mdata, timings)`.
    /// The last field being `Ok(_)` indicates the test succeeded.
    ///
    /// INVARIANT: If the test succeeded, the second field is `Some(mdata)` and
    /// `mdata.iterations` is `Some(_)`.
    tests: Vec<(String, Option<TestMdata>, Result<Timings, FailKind>)>,
}

impl Output {
    /// Instantiates an empty `Output`. Useful for merging.
    #[must_use]
    pub fn blank() -> Self {
        Output { tests: Vec::new() }
    }

    /// Reports a success and adds it to this run's `Output`.
    pub fn success(
        &mut self,
        name: impl AsRef<str>,
        mut mdata: TestMdata,
        iters: NonZero<usize>,
        timings: Timings,
    ) {
        mdata.iterations = Some(iters);
        self.tests
            .push((name.as_ref().to_string(), Some(mdata), Ok(timings)));
    }

    /// Reports a failure and adds it to this run's `Output`. If this test was tried
    /// with some number of iterations (i.e. this was not a version mismatch or skipped
    /// test), that count should also be reported.
    ///
    /// Using the `fail!()` macro is usually more convenient.
    pub fn failure(
        &mut self,
        name: impl AsRef<str>,
        mut mdata: Option<TestMdata>,
        attempted_iters: Option<NonZero<usize>>,
        kind: FailKind,
    ) {
        if let Some(ref mut mdata) = mdata {
            mdata.iterations = attempted_iters;
        }
        self.tests
            .push((name.as_ref().to_string(), mdata, Err(kind)));
    }

    /// True if no tests were executed in this run.
    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.tests.is_empty()
    }

    /// Sorts the runs in the output in the order that we want them printed.
    pub fn sort(&mut self) {
        self.tests.sort_unstable_by(|a, b| match (a, b) {
            // Tests where we got no metadata go at the end.
            ((_, Some(_), _), (_, None, _)) => std::cmp::Ordering::Less,
            ((_, None, _), (_, Some(_), _)) => std::cmp::Ordering::Greater,
            // Then sort by importance, then weight, highest first.
            ((_, Some(a_mdata), _), (_, Some(b_mdata), _)) => {
                let c = b_mdata.importance.cmp(&a_mdata.importance);
                if matches!(c, std::cmp::Ordering::Equal) {
                    b_mdata.weight.cmp(&a_mdata.weight)
                } else {
                    c
                }
            }
            // Lastly by name.
            ((a_name, ..), (b_name, ..)) => a_name.cmp(b_name),
        });
    }

    /// Merges the output of two runs, appending a prefix to the results of the new run.
    /// To be used in conjunction with `Output::blank()`, or else only some tests will have
    /// a prefix set.
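    ///
    /// A sketch of the intended pattern (the per-crate outputs named here are
    /// placeholders):
    ///
    /// ```ignore
    /// let mut combined = Output::blank();
    /// combined.merge(gpui_results, "gpui");
    /// combined.merge(editor_results, "editor");
    /// // Test names are now prefixed with "crates/gpui::" and "crates/editor::".
    /// ```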
    pub fn merge<'a>(&mut self, other: Self, pref_other: impl Into<Option<&'a str>>) {
        let pref = if let Some(pref) = pref_other.into() {
            "crates/".to_string() + pref + "::"
        } else {
            String::new()
        };
        self.tests = std::mem::take(&mut self.tests)
            .into_iter()
            .chain(
                other
                    .tests
                    .into_iter()
                    .map(|(name, md, tm)| (pref.clone() + &name, md, tm)),
            )
            .collect();
    }

    /// Evaluates the performance of `self` against `baseline`. The latter is taken
    /// as the comparison point, i.e. a positive resulting `PerfReport` means that
    /// `self` performed better.
    ///
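    /// As a sketch of the arithmetic (hypothetical numbers): a test running at
    /// 120 iterations/sec in `self` but 100 iterations/sec in `baseline`
    /// contributes a shift of `120. / 100. - 1. = 0.2`, i.e. a 20% improvement,
    /// which is then weighted by the test's weight within its category.
    ///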
    /// # Panics
    /// `self` and `baseline` are assumed to have the `iterations` field on all
    /// `TestMdata`s set to `Some(_)` wherever a `TestMdata` is present.
    #[must_use]
    pub fn compare_perf(self, baseline: Self) -> PerfReport {
        let self_categories = self.collapse();
        let mut other_categories = baseline.collapse();

        let deltas = self_categories
            .into_iter()
            .filter_map(|(cat, self_data)| {
                // Only compare categories where both runs have data.
                let mut other_data = other_categories.remove(&cat)?;
                let mut max = f64::MIN;
                let mut min = f64::MAX;

                // Running totals for averaging out tests.
                let mut r_total_numerator = 0.;
                let mut r_total_denominator = 0;
                // Yeah this is O(n^2), but realistically it'll hardly be a bottleneck.
                for (name, (s_timings, s_iters, weight)) in self_data {
                    // If the weights conflict, prefer the new run's.
                    let Some((o_timings, o_iters, _)) = other_data.remove(&name) else {
                        continue;
                    };
                    let shift =
                        (s_timings.iters_per_sec(s_iters) / o_timings.iters_per_sec(o_iters)) - 1.;
                    if shift > max {
                        max = shift;
                    }
                    if shift < min {
                        min = shift;
                    }
                    r_total_numerator += shift * f64::from(weight);
                    r_total_denominator += u32::from(weight);
                }
                // No matching tests (with nonzero weight) in this category.
                if r_total_denominator == 0 {
                    None
                } else {
                    let mean = r_total_numerator / f64::from(r_total_denominator);
                    // TODO: also aggregate standard deviation? That's harder to keep
                    // meaningful, though, since we don't know which tests are correlated.
                    Some((cat, PerfDelta { max, mean, min }))
                }
            })
            .collect();

        PerfReport { deltas }
    }

    /// Collapses this `Output` into a `HashMap` over `Importance`, with each
    /// importance category mapping to the tests it contains.
    fn collapse(self) -> HashMap<Importance, CategoryInfo> {
        let mut categories = HashMap::<Importance, HashMap<String, _>>::default();
        for entry in self.tests {
            if let Some(mdata) = entry.1
                && let Ok(timings) = entry.2
            {
                if let Some(handle) = categories.get_mut(&mdata.importance) {
                    handle.insert(entry.0, (timings, mdata.iterations.unwrap(), mdata.weight));
                } else {
                    let mut new = HashMap::default();
                    new.insert(entry.0, (timings, mdata.iterations.unwrap(), mdata.weight));
                    categories.insert(mdata.importance, new);
                }
            }
        }

        categories
    }
}

impl std::fmt::Display for Output {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Don't print the header for an empty run.
        if self.tests.is_empty() {
            return Ok(());
        }

        // We want to print important tests at the top, then alphabetical.
        let mut sorted = self.clone();
        sorted.sort();
        // Markdown header for making a nice little table :>
        writeln!(
            f,
            "| Command | Iter/sec | Mean [ms] | SD [ms] | Iterations | Importance (weight) |",
        )?;
        writeln!(f, "|:---|---:|---:|---:|---:|---:|")?;
        for (name, metadata, timings) in &sorted.tests {
            match metadata {
                Some(metadata) => match timings {
                    // Happy path.
                    Ok(timings) => {
                        // If the test succeeded, then metadata.iterations is Some(_).
                        writeln!(
                            f,
                            "| {} | {:.2} | {} | {:.2} | {} | {} ({}) |",
                            name,
                            timings.iters_per_sec(metadata.iterations.unwrap()),
                            {
                                // Very small mean runtimes will give inaccurate
                                // results. Should probably also penalise weight.
                                let mean = timings.mean.as_secs_f64() * 1000.;
                                if mean < consts::NOISE_CUTOFF.as_secs_f64() * 1000. / 8. {
                                    format!("{mean:.2} (unreliable)")
                                } else {
                                    format!("{mean:.2}")
                                }
                            },
                            timings.stddev.as_secs_f64() * 1000.,
                            metadata.iterations.unwrap(),
                            metadata.importance,
                            metadata.weight,
                        )?;
                    }
                    // We have (some) metadata, but the test errored.
                    Err(err) => writeln!(
                        f,
                        "| ({}) {} | N/A | N/A | N/A | {} | {} ({}) |",
                        err,
                        name,
                        metadata
                            .iterations
                            .map_or_else(|| "N/A".to_owned(), |i| format!("{i}")),
                        metadata.importance,
                        metadata.weight
                    )?,
                },
                // No metadata, couldn't even parse the test output.
                None => writeln!(
                    f,
                    "| ({}) {} | N/A | N/A | N/A | N/A | N/A |",
                    timings.as_ref().unwrap_err(),
                    name
                )?,
            }
        }
        Ok(())
    }
}

/// The difference in performance between two runs within a given importance
/// category.
struct PerfDelta {
    /// The biggest improvement / least bad regression.
    max: f64,
    /// The weighted average change in test times.
    mean: f64,
    /// The worst regression / smallest improvement.
    min: f64,
}

/// Shim type for reporting all performance deltas across importance categories.
pub struct PerfReport {
    /// Inner (group, diff) pairing.
    deltas: HashMap<Importance, PerfDelta>,
}

impl std::fmt::Display for PerfReport {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        if self.deltas.is_empty() {
            return write!(f, "(no matching tests)");
        }
        let mut sorted = self.deltas.iter().collect::<Vec<_>>();
        sorted.sort_by(|a, b| a.0.cmp(b.0));
        writeln!(f, "| Category | Max | Mean | Min |")?;
        // We don't want to print too many newlines at the end, so handle newlines
        // a little jankily like this.
        write!(f, "|:---|---:|---:|---:|")?;
        for (cat, delta) in sorted.into_iter().rev() {
            const SIGN_POS: &str = "↑";
            const SIGN_NEG: &str = "↓";
            const SIGN_NEUTRAL_POS: &str = "±↑";
            const SIGN_NEUTRAL_NEG: &str = "±↓";

            let prettify = |time: f64| {
                let sign = if time > 0.05 {
                    SIGN_POS
                } else if time > 0. {
                    SIGN_NEUTRAL_POS
                } else if time > -0.05 {
                    SIGN_NEUTRAL_NEG
                } else {
                    SIGN_NEG
                };
                format!("{} {:.1}%", sign, time.abs() * 100.)
            };

            // Pretty-print these instead of just using the float display impl.
            write!(
                f,
                "\n| {cat} | {} | {} | {} |",
                prettify(delta.max),
                prettify(delta.mean),
                prettify(delta.min)
            )?;
        }
        Ok(())
    }
}