//! Some constants and datatypes used in the Zed perf profiler. Should only be
//! consumed by the crate providing the matching macros.
//!
//! For usage documentation, see the docs on this crate's binary.

use collections::HashMap;
use serde::{Deserialize, Serialize};
use std::{num::NonZero, time::Duration};

pub mod consts {
    //! Preset identifiers and constants so that the profiler and proc macro agree
    //! on their communication protocol.
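    //!
    //! As a rough, illustrative sketch only (the exact generated code and line
    //! format are owned by the proc macro crate, and the values below are made up):
    //!
    //! ```ignore
    //! // A generated benchmark shim might read its iteration count like this:
    //! let iters: usize = std::env::var(consts::ITER_ENV_VAR)
    //!     .expect("the profiler sets this before spawning the test")
    //!     .parse()
    //!     .expect("iteration count is a plain integer");
    //! for _ in 0..iters {
    //!     // ...run the benchmarked body...
    //! }
    //!
    //! // The matching metadata shim prints lines the profiler can pick out of the
    //! // harness output by their prefix, e.g. the protocol version:
    //! println!("{}{} {}", consts::MDATA_LINE_PREF, consts::VERSION_LINE_NAME, consts::MDATA_VER);
    //! ```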

    /// The suffix on the actual test function.
    pub const SUF_NORMAL: &str = "__ZED_PERF_FN";
    /// The suffix on an extra function which prints metadata about a test to stdout.
    pub const SUF_MDATA: &str = "__ZED_PERF_MDATA";
    /// The env var in which we pass the iteration count to our tests.
    pub const ITER_ENV_VAR: &str = "ZED_PERF_ITER";
    /// The prefix printed on all benchmark test metadata lines, to distinguish them
    /// from possible output by the test harness itself.
    pub const MDATA_LINE_PREF: &str = "ZED_MDATA_";
    /// The version number for the data returned from the test metadata function.
    /// Increment on non-backwards-compatible changes.
    pub const MDATA_VER: u32 = 0;
    /// The default weight, if none is specified.
    pub const WEIGHT_DEFAULT: u8 = 50;
    /// How long a test must have run to be assumed to be reliable-ish.
    pub const NOISE_CUTOFF: std::time::Duration = std::time::Duration::from_millis(250);

    /// Identifier for the iteration count in test metadata.
    pub const ITER_COUNT_LINE_NAME: &str = "iter_count";
    /// Identifier for the weight in test metadata.
    pub const WEIGHT_LINE_NAME: &str = "weight";
    /// Identifier for the importance in test metadata.
    pub const IMPORTANCE_LINE_NAME: &str = "importance";
    /// Identifier for the test metadata version.
    pub const VERSION_LINE_NAME: &str = "version";

    /// Where to save json run information.
    pub const RUNS_DIR: &str = ".perf-runs";
}

/// How relevant a benchmark is.
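///
/// The explicit discriminants give the derived `Ord` the order we want, i.e.
/// more important variants compare as greater. A quick illustrative sketch:
///
/// ```ignore
/// assert!(Importance::Critical > Importance::Average);
/// assert!(Importance::Iffy > Importance::Fluff);
/// ```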
#[derive(Clone, Debug, Default, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize, Deserialize)]
pub enum Importance {
    /// Regressions shouldn't be accepted without good reason.
    Critical = 4,
    /// Regressions should be paid extra attention.
    Important = 3,
    /// No extra attention should be paid to regressions, but they might still
    /// be indicative of something happening.
    #[default]
    Average = 2,
    /// Unclear if regressions are likely to be meaningful, but still worth keeping
    /// an eye on. Lowest level that's checked by default by the profiler.
    Iffy = 1,
    /// Regressions are likely to be spurious or don't affect core functionality.
    /// Only relevant if a lot of them happen, or as supplemental evidence for a
    /// higher-importance benchmark regressing. Not checked by default.
    Fluff = 0,
}

impl std::fmt::Display for Importance {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Importance::Critical => f.write_str("critical"),
            Importance::Important => f.write_str("important"),
            Importance::Average => f.write_str("average"),
            Importance::Iffy => f.write_str("iffy"),
            Importance::Fluff => f.write_str("fluff"),
        }
    }
}

/// Why or when did this test fail?
#[derive(Clone, Debug, Serialize, Deserialize)]
pub enum FailKind {
    /// Failed while triaging it to determine the iteration count.
    Triage,
    /// Failed while profiling it.
    Profile,
    /// Failed due to an incompatible version for the test.
    VersionMismatch,
    /// Could not parse metadata for a test.
    BadMetadata,
    /// Skipped due to filters applied on the perf run.
    Skipped,
}

impl std::fmt::Display for FailKind {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            FailKind::Triage => f.write_str("errored in triage"),
            FailKind::Profile => f.write_str("errored while profiling"),
            FailKind::VersionMismatch => f.write_str("test version mismatch"),
            FailKind::BadMetadata => f.write_str("bad test metadata"),
            FailKind::Skipped => f.write_str("skipped"),
        }
    }
}

/// Information about a given perf test.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct TestMdata {
    /// A version number for when the test was generated. If this is greater
    /// than the version this test handler expects, one of the following will
    /// happen in an unspecified manner:
    /// - The test is skipped silently.
    /// - The handler exits with an error message indicating the version mismatch
    ///   or inability to parse the metadata.
    ///
    /// INVARIANT: If `version` <= `MDATA_VER`, this tool *must* be able to
    /// correctly parse the output of this test.
    pub version: u32,
    /// How many iterations to pass to this test, if preset, or how many
    /// iterations the test ended up running, if that was determined at runtime.
    pub iterations: Option<NonZero<usize>>,
    /// The importance of this particular test. See the docs on `Importance` for
    /// details.
    pub importance: Importance,
    /// The weight of this particular test within its importance category. Used
    /// when comparing across runs.
    pub weight: u8,
}

/// The actual timings of a test, as measured by Hyperfine.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct Timings {
    /// Mean wall-clock time for one full run of this test, i.e. all its iterations.
    pub mean: Duration,
    /// Standard deviation for the above.
    pub stddev: Duration,
}

impl Timings {
    /// How many iterations does this test seem to do per second?
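    ///
    /// A rough sketch of the arithmetic (values are made up for illustration):
    ///
    /// ```ignore
    /// let timings = Timings {
    ///     // One full run (all iterations) took half a second on average.
    ///     mean: Duration::from_millis(500),
    ///     stddev: Duration::from_millis(10),
    /// };
    /// // (1000 / 500) * 200 iterations = 400 iterations per second.
    /// assert_eq!(timings.iters_per_sec(NonZero::new(200).unwrap()) as u64, 400);
    /// ```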
    #[expect(
        clippy::cast_precision_loss,
        reason = "We only care about a couple sig figs anyways"
    )]
    #[must_use]
    pub fn iters_per_sec(&self, total_iters: NonZero<usize>) -> f64 {
        (1000. / self.mean.as_millis() as f64) * total_iters.get() as f64
    }
}

/// Aggregate results, meant to be used for a given importance category. Each
/// test name corresponds to its benchmark results, iteration count, and weight.
type CategoryInfo = HashMap<String, (Timings, NonZero<usize>, u8)>;

/// Aggregate output of all tests run by this handler.
#[derive(Clone, Debug, Default, Serialize, Deserialize)]
pub struct Output {
    /// A list of test outputs. Format is `(test_name, mdata, timings)`.
    /// The latter being `Ok(_)` indicates the test succeeded.
    ///
    /// INVARIANT: If the test succeeded, the second field is `Some(mdata)` and
    /// `mdata.iterations` is `Some(_)`.
    tests: Vec<(String, Option<TestMdata>, Result<Timings, FailKind>)>,
}

impl Output {
    /// Instantiates an empty "output". Useful for merging.
    #[must_use]
    pub fn blank() -> Self {
        Output { tests: Vec::new() }
    }

    /// Reports a success and adds it to this run's `Output`.
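    ///
    /// A minimal usage sketch (test name and numbers are made up):
    ///
    /// ```ignore
    /// let mut output = Output::blank();
    /// let mdata = TestMdata {
    ///     version: consts::MDATA_VER,
    ///     iterations: None, // filled in by `success()`
    ///     importance: Importance::Average,
    ///     weight: consts::WEIGHT_DEFAULT,
    /// };
    /// let timings = Timings {
    ///     mean: Duration::from_millis(400),
    ///     stddev: Duration::from_millis(12),
    /// };
    /// output.success("editor::bench_open_buffer", mdata, NonZero::new(64).unwrap(), timings);
    /// assert!(!output.is_empty());
    /// ```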
    pub fn success(
        &mut self,
        name: impl AsRef<str>,
        mut mdata: TestMdata,
        iters: NonZero<usize>,
        timings: Timings,
    ) {
        mdata.iterations = Some(iters);
        self.tests
            .push((name.as_ref().to_string(), Some(mdata), Ok(timings)));
    }

    /// Reports a failure and adds it to this run's `Output`. If this test was tried
    /// with some number of iterations (i.e. this was not a version mismatch or skipped
    /// test), that count should also be reported.
    ///
    /// Using the `fail!()` macro is usually more convenient.
    pub fn failure(
        &mut self,
        name: impl AsRef<str>,
        mut mdata: Option<TestMdata>,
        attempted_iters: Option<NonZero<usize>>,
        kind: FailKind,
    ) {
        if let Some(ref mut mdata) = mdata {
            mdata.iterations = attempted_iters;
        }
        self.tests
            .push((name.as_ref().to_string(), mdata, Err(kind)));
    }

    /// Returns `true` if no tests were executed this run.
    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.tests.is_empty()
    }

    /// Sorts the runs in the output in the order that we want them printed:
    /// most important first, heaviest weight first within a category, and tests
    /// with no metadata at the end.
    pub fn sort(&mut self) {
        self.tests.sort_unstable_by(|a, b| match (a, b) {
            // Tests where we got no metadata go at the end.
            ((_, Some(_), _), (_, None, _)) => std::cmp::Ordering::Less,
            ((_, None, _), (_, Some(_), _)) => std::cmp::Ordering::Greater,
            // Then sort by descending importance, then descending weight.
            ((_, Some(a_mdata), _), (_, Some(b_mdata), _)) => {
                let c = b_mdata.importance.cmp(&a_mdata.importance);
                if matches!(c, std::cmp::Ordering::Equal) {
                    b_mdata.weight.cmp(&a_mdata.weight)
                } else {
                    c
                }
            }
            // Lastly by name.
            ((a_name, ..), (b_name, ..)) => a_name.cmp(b_name),
        });
    }

    /// Merges the output of two runs, appending a prefix to the results of the new run.
    /// To be used in conjunction with `Output::blank()`, or else only some tests will
    /// have a prefix set.
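    ///
    /// An illustrative sketch (run and crate names are made up):
    ///
    /// ```ignore
    /// let mut combined = Output::blank();
    /// // Results from benchmarks in a crate named `editor`; their test names get
    /// // the prefix `crates/editor::`.
    /// combined.merge(editor_run, "editor");
    /// // Merged without a prefix.
    /// combined.merge(misc_run, None);
    /// ```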
    pub fn merge<'a>(&mut self, other: Self, pref_other: impl Into<Option<&'a str>>) {
        let pref = if let Some(pref) = pref_other.into() {
            "crates/".to_string() + pref + "::"
        } else {
            String::new()
        };
        self.tests = std::mem::take(&mut self.tests)
            .into_iter()
            .chain(
                other
                    .tests
                    .into_iter()
                    .map(|(name, md, tm)| (pref.clone() + &name, md, tm)),
            )
            .collect();
    }

    /// Evaluates the performance of `self` against `baseline`. The latter is taken
    /// as the comparison point, i.e. a positive resulting `PerfReport` means that
    /// `self` performed better.
    ///
    /// # Panics
    /// Panics if any `TestMdata` attached to a successful test in `self` or
    /// `baseline` has its `iterations` field set to `None`.
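    ///
    /// A rough sketch of how the deltas come out (names are illustrative):
    ///
    /// ```ignore
    /// // `current` and `baseline` are two `Output`s covering the same tests.
    /// let report = current.compare_perf(baseline);
    /// // A test running at 120 iters/sec in `current` vs 100 iters/sec in
    /// // `baseline` contributes (120 / 100) - 1 = +20% to its importance
    /// // category's delta, weighted by the test's weight.
    /// println!("{report}");
    /// ```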
    #[must_use]
    pub fn compare_perf(self, baseline: Self) -> PerfReport {
        let self_categories = self.collapse();
        let mut other_categories = baseline.collapse();

        let deltas = self_categories
            .into_iter()
            .filter_map(|(cat, self_data)| {
                // Only compare categories where both runs have data.
                let mut other_data = other_categories.remove(&cat)?;
                let mut max = f64::MIN;
                let mut min = f64::MAX;

                // Running totals for averaging out tests.
                let mut r_total_numerator = 0.;
                let mut r_total_denominator = 0;
                // Yeah, this does a hash lookup per test, but realistically it'll
                // hardly be a bottleneck.
                for (name, (s_timings, s_iters, weight)) in self_data {
                    // If the two runs disagree on a test's weight, prefer the
                    // new run's.
                    let Some((o_timings, o_iters, _)) = other_data.remove(&name) else {
                        continue;
                    };
                    let shift =
                        (s_timings.iters_per_sec(s_iters) / o_timings.iters_per_sec(o_iters)) - 1.;
                    if shift > max {
                        max = shift;
                    }
                    if shift < min {
                        min = shift;
                    }
                    r_total_numerator += shift * f64::from(weight);
                    r_total_denominator += u32::from(weight);
                }
                // There were no runs here!
                if r_total_denominator == 0 {
                    None
                } else {
                    let mean = r_total_numerator / f64::from(r_total_denominator);
                    // TODO: also aggregate standard deviation? That's harder to keep
                    // meaningful, though, since we don't know which tests are correlated.
                    Some((cat, PerfDelta { max, mean, min }))
                }
            })
            .collect();

        PerfReport { deltas }
    }

    /// Collapses this `Output` into a `HashMap` keyed by `Importance`, with
    /// each importance category mapping test names to their results.
    fn collapse(self) -> HashMap<Importance, CategoryInfo> {
        let mut categories = HashMap::<Importance, HashMap<String, _>>::default();
        for entry in self.tests {
            if let Some(mdata) = entry.1
                && let Ok(timings) = entry.2
            {
                if let Some(handle) = categories.get_mut(&mdata.importance) {
                    handle.insert(entry.0, (timings, mdata.iterations.unwrap(), mdata.weight));
                } else {
                    let mut new = HashMap::default();
                    new.insert(entry.0, (timings, mdata.iterations.unwrap(), mdata.weight));
                    categories.insert(mdata.importance, new);
                }
            }
        }

        categories
    }
}

impl std::fmt::Display for Output {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Don't print the header for an empty run.
        if self.tests.is_empty() {
            return Ok(());
        }

        // We want to print important tests at the top, then alphabetical.
        let mut sorted = self.clone();
        sorted.sort();
        // Markdown header for making a nice little table :>
        writeln!(
            f,
            "| Command | Iter/sec | Mean [ms] | SD [ms] | Iterations | Importance (weight) |",
        )?;
        writeln!(f, "|:---|---:|---:|---:|---:|---:|")?;
        for (name, metadata, timings) in &sorted.tests {
            match metadata {
                Some(metadata) => match timings {
                    // Happy path.
                    Ok(timings) => {
                        // If the test succeeded, then metadata.iterations is Some(_).
                        writeln!(
                            f,
                            "| {} | {:.2} | {} | {:.2} | {} | {} ({}) |",
                            name,
                            timings.iters_per_sec(metadata.iterations.unwrap()),
                            {
                                // Very small mean runtimes will give inaccurate
                                // results. Should probably also penalise weight.
                                let mean = timings.mean.as_secs_f64() * 1000.;
                                if mean < consts::NOISE_CUTOFF.as_secs_f64() * 1000. / 8. {
                                    format!("{mean:.2} (unreliable)")
                                } else {
                                    format!("{mean:.2}")
                                }
                            },
                            timings.stddev.as_secs_f64() * 1000.,
                            metadata.iterations.unwrap(),
                            metadata.importance,
                            metadata.weight,
                        )?;
                    }
                    // We have (some) metadata, but the test errored.
                    Err(err) => writeln!(
                        f,
                        "| ({}) {} | N/A | N/A | N/A | {} | {} ({}) |",
                        err,
                        name,
                        metadata
                            .iterations
                            .map_or_else(|| "N/A".to_owned(), |i| format!("{i}")),
                        metadata.importance,
                        metadata.weight
                    )?,
                },
                // No metadata, couldn't even parse the test output.
                None => writeln!(
                    f,
                    "| ({}) {} | N/A | N/A | N/A | N/A | N/A |",
                    timings.as_ref().unwrap_err(),
                    name
                )?,
            }
        }
        Ok(())
    }
}

/// The difference in performance between two runs within a given importance
/// category.
struct PerfDelta {
    /// The biggest improvement / least bad regression.
    max: f64,
    /// The weighted average change in test times.
    mean: f64,
    /// The worst regression / smallest improvement.
    min: f64,
}

/// Shim type for reporting all performance deltas across importance categories.
pub struct PerfReport {
    /// Inner (group, diff) pairing.
    deltas: HashMap<Importance, PerfDelta>,
}

impl std::fmt::Display for PerfReport {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        if self.deltas.is_empty() {
            return write!(f, "(no matching tests)");
        }
        // Sort ascending by importance; the loop below iterates in reverse, so
        // the most important categories get printed first.
        let mut sorted = self.deltas.iter().collect::<Vec<_>>();
        sorted.sort_unstable_by(|a, b| a.0.cmp(b.0));
        writeln!(f, "| Category | Max | Mean | Min |")?;
        // We don't want to print too many newlines at the end, so handle newlines
        // a little jankily like this.
        write!(f, "|:---|---:|---:|---:|")?;
        for (cat, delta) in sorted.into_iter().rev() {
            const SIGN_POS: &str = "↑";
            const SIGN_NEG: &str = "↓";
            const SIGN_NEUTRAL: &str = "±";

            let prettify = |time: f64| {
                // Treat shifts within ±5% as neutral.
                let sign = if time > 0.05 {
                    SIGN_POS
                } else if time >= -0.05 {
                    SIGN_NEUTRAL
                } else {
                    SIGN_NEG
                };
                format!("{} {:.1}%", sign, time.abs() * 100.)
            };

            // Pretty-print these instead of just using the float display impl.
            write!(
                f,
                "\n| {cat} | {} | {} | {} |",
                prettify(delta.max),
                prettify(delta.mean),
                prettify(delta.min)
            )?;
        }
        Ok(())
    }
}