//! Some constants and datatypes used in the Zed perf profiler. Should only be
//! consumed by the crate providing the matching macros.
//!
//! For usage documentation, see the docs on this crate's binary.

use collections::HashMap;
use serde::{Deserialize, Serialize};
use std::{num::NonZero, time::Duration};

pub mod consts {
    //! Preset identifiers and constants so that the profiler and proc macro agree
    //! on their communication protocol.
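    //!
    //! As a rough, illustrative sketch only (the exact generated code and line
    //! format are owned by the proc macro crate, and the values below are made up):
    //!
    //! ```ignore
    //! // A generated benchmark shim might read its iteration count like this:
    //! let iters: usize = std::env::var(consts::ITER_ENV_VAR)
    //!     .expect("the profiler sets this before spawning the test")
    //!     .parse()
    //!     .expect("iteration count is a plain integer");
    //! for _ in 0..iters {
    //!     // ...run the benchmarked body...
    //! }
    //!
    //! // The matching metadata shim prints lines the profiler can pick out of the
    //! // harness output by their prefix, e.g. the protocol version:
    //! println!("{}{} {}", consts::MDATA_LINE_PREF, consts::VERSION_LINE_NAME, consts::MDATA_VER);
    //! ```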

    /// The suffix on the actual test function.
    pub const SUF_NORMAL: &str = "__ZED_PERF_FN";
    /// The suffix on an extra function which prints metadata about a test to stdout.
    pub const SUF_MDATA: &str = "__ZED_PERF_MDATA";
    /// The env var in which we pass the iteration count to our tests.
    pub const ITER_ENV_VAR: &str = "ZED_PERF_ITER";
    /// The prefix printed on all benchmark test metadata lines, to distinguish them
    /// from possible output by the test harness itself.
    pub const MDATA_LINE_PREF: &str = "ZED_MDATA_";
    /// The version number for the data returned from the test metadata function.
    /// Increment on non-backwards-compatible changes.
    pub const MDATA_VER: u32 = 0;
    /// The default weight, if none is specified.
    pub const WEIGHT_DEFAULT: u8 = 50;
    /// How long a test must have run to be assumed to be reliable-ish.
    pub const NOISE_CUTOFF: std::time::Duration = std::time::Duration::from_millis(250);

    /// Identifier for the iteration count in test metadata.
    pub const ITER_COUNT_LINE_NAME: &str = "iter_count";
    /// Identifier for the weight in test metadata.
    pub const WEIGHT_LINE_NAME: &str = "weight";
    /// Identifier for the importance in test metadata.
    pub const IMPORTANCE_LINE_NAME: &str = "importance";
    /// Identifier for the test metadata version.
    pub const VERSION_LINE_NAME: &str = "version";

    /// Where to save json run information.
    pub const RUNS_DIR: &str = ".perf-runs";
}

/// How relevant a benchmark is.
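///
/// The explicit discriminants give the derived `Ord` the order we want, i.e.
/// more important variants compare as greater. A quick illustrative sketch:
///
/// ```ignore
/// assert!(Importance::Critical > Importance::Average);
/// assert!(Importance::Iffy > Importance::Fluff);
/// ```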
#[derive(Clone, Debug, Default, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize, Deserialize)]
pub enum Importance {
    /// Regressions shouldn't be accepted without good reason.
    Critical = 4,
    /// Regressions should be paid extra attention.
    Important = 3,
    /// No extra attention should be paid to regressions, but they might still
    /// be indicative of something happening.
    #[default]
    Average = 2,
    /// Unclear if regressions are likely to be meaningful, but still worth keeping
    /// an eye on. Lowest level that's checked by default by the profiler.
    Iffy = 1,
    /// Regressions are likely to be spurious or don't affect core functionality.
    /// Only relevant if a lot of them happen, or as supplemental evidence for a
    /// higher-importance benchmark regressing. Not checked by default.
    Fluff = 0,
}

impl std::fmt::Display for Importance {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Importance::Critical => f.write_str("critical"),
            Importance::Important => f.write_str("important"),
            Importance::Average => f.write_str("average"),
            Importance::Iffy => f.write_str("iffy"),
            Importance::Fluff => f.write_str("fluff"),
        }
    }
}

/// Why or when did this test fail?
#[derive(Clone, Debug, Serialize, Deserialize)]
pub enum FailKind {
    /// Failed while triaging it to determine the iteration count.
    Triage,
    /// Failed while profiling it.
    Profile,
    /// Failed due to an incompatible version for the test.
    VersionMismatch,
    /// Could not parse metadata for a test.
    BadMetadata,
    /// Skipped due to filters applied on the perf run.
    Skipped,
}

impl std::fmt::Display for FailKind {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            FailKind::Triage => f.write_str("errored in triage"),
            FailKind::Profile => f.write_str("errored while profiling"),
            FailKind::VersionMismatch => f.write_str("test version mismatch"),
            FailKind::BadMetadata => f.write_str("bad test metadata"),
            FailKind::Skipped => f.write_str("skipped"),
        }
    }
}

/// Information about a given perf test.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct TestMdata {
    /// A version number for when the test was generated. If this is greater
    /// than the version this test handler expects, one of the following will
    /// happen in an unspecified manner:
    /// - The test is skipped silently.
    /// - The handler exits with an error message indicating the version mismatch
    ///   or inability to parse the metadata.
    ///
    /// INVARIANT: If `version` <= `MDATA_VER`, this tool *must* be able to
    /// correctly parse the output of this test.
    pub version: u32,
    /// How many iterations to pass to this test, if preset, or how many
    /// iterations the test ended up running, if that was determined at runtime.
    pub iterations: Option<NonZero<usize>>,
    /// The importance of this particular test. See the docs on `Importance` for
    /// details.
    pub importance: Importance,
    /// The weight of this particular test within its importance category. Used
    /// when comparing across runs.
    pub weight: u8,
}

/// The actual timings of a test, as measured by Hyperfine.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct Timings {
    /// Mean wall-clock time for one full run of this test, i.e. all its iterations.
    pub mean: Duration,
    /// Standard deviation for the above.
    pub stddev: Duration,
}

impl Timings {
    /// How many iterations does this test seem to do per second?
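    ///
    /// A rough sketch of the arithmetic (values are made up for illustration):
    ///
    /// ```ignore
    /// let timings = Timings {
    ///     // One full run (all iterations) took half a second on average.
    ///     mean: Duration::from_millis(500),
    ///     stddev: Duration::from_millis(10),
    /// };
    /// // (1000 / 500) * 200 iterations = 400 iterations per second.
    /// assert_eq!(timings.iters_per_sec(NonZero::new(200).unwrap()) as u64, 400);
    /// ```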
    #[expect(
        clippy::cast_precision_loss,
        reason = "We only care about a couple sig figs anyways"
    )]
    #[must_use]
    pub fn iters_per_sec(&self, total_iters: NonZero<usize>) -> f64 {
        (1000. / self.mean.as_millis() as f64) * total_iters.get() as f64
    }
}

/// Aggregate results, meant to be used for a given importance category. Each
/// test name corresponds to its benchmark results, iteration count, and weight.
type CategoryInfo = HashMap<String, (Timings, NonZero<usize>, u8)>;

/// Aggregate output of all tests run by this handler.
#[derive(Clone, Debug, Default, Serialize, Deserialize)]
pub struct Output {
    /// A list of test outputs. Format is `(test_name, mdata, timings)`.
    /// The latter being `Ok(_)` indicates the test succeeded.
    ///
    /// INVARIANT: If the test succeeded, the second field is `Some(mdata)` and
    /// `mdata.iterations` is `Some(_)`.
    tests: Vec<(String, Option<TestMdata>, Result<Timings, FailKind>)>,
}

impl Output {
    /// Instantiates an empty "output". Useful for merging.
    #[must_use]
    pub fn blank() -> Self {
        Output { tests: Vec::new() }
    }

    /// Reports a success and adds it to this run's `Output`.
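    ///
    /// A minimal usage sketch (test name and numbers are made up):
    ///
    /// ```ignore
    /// let mut output = Output::blank();
    /// let mdata = TestMdata {
    ///     version: consts::MDATA_VER,
    ///     iterations: None, // filled in by `success()`
    ///     importance: Importance::Average,
    ///     weight: consts::WEIGHT_DEFAULT,
    /// };
    /// let timings = Timings {
    ///     mean: Duration::from_millis(400),
    ///     stddev: Duration::from_millis(12),
    /// };
    /// output.success("editor::bench_open_buffer", mdata, NonZero::new(64).unwrap(), timings);
    /// assert!(!output.is_empty());
    /// ```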
    pub fn success(
        &mut self,
        name: impl AsRef<str>,
        mut mdata: TestMdata,
        iters: NonZero<usize>,
        timings: Timings,
    ) {
        mdata.iterations = Some(iters);
        self.tests
            .push((name.as_ref().to_string(), Some(mdata), Ok(timings)));
    }

    /// Reports a failure and adds it to this run's `Output`. If this test was tried
    /// with some number of iterations (i.e. this was not a version mismatch or skipped
    /// test), that count should also be reported.
    ///
    /// Using the `fail!()` macro is usually more convenient.
    pub fn failure(
        &mut self,
        name: impl AsRef<str>,
        mut mdata: Option<TestMdata>,
        attempted_iters: Option<NonZero<usize>>,
        kind: FailKind,
    ) {
        if let Some(ref mut mdata) = mdata {
            mdata.iterations = attempted_iters;
        }
        self.tests
            .push((name.as_ref().to_string(), mdata, Err(kind)));
    }

    /// Returns `true` if no tests were executed this run.
    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.tests.is_empty()
    }

    /// Sorts the runs in the output in the order that we want them printed:
    /// most important first, heaviest weight first within a category, and tests
    /// with no metadata at the end.
    pub fn sort(&mut self) {
        self.tests.sort_unstable_by(|a, b| match (a, b) {
            // Tests where we got no metadata go at the end.
            ((_, Some(_), _), (_, None, _)) => std::cmp::Ordering::Less,
            ((_, None, _), (_, Some(_), _)) => std::cmp::Ordering::Greater,
            // Then sort by descending importance, then descending weight.
            ((_, Some(a_mdata), _), (_, Some(b_mdata), _)) => {
                let c = b_mdata.importance.cmp(&a_mdata.importance);
                if matches!(c, std::cmp::Ordering::Equal) {
                    b_mdata.weight.cmp(&a_mdata.weight)
                } else {
                    c
                }
            }
            // Lastly by name.
            ((a_name, ..), (b_name, ..)) => a_name.cmp(b_name),
        });
    }

    /// Merges the output of two runs, appending a prefix to the results of the new run.
    /// To be used in conjunction with `Output::blank()`, or else only some tests will
    /// have a prefix set.
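    ///
    /// An illustrative sketch (run and crate names are made up):
    ///
    /// ```ignore
    /// let mut combined = Output::blank();
    /// // Results from benchmarks in a crate named `editor`; their test names get
    /// // the prefix `crates/editor::`.
    /// combined.merge(editor_run, "editor");
    /// // Merged without a prefix.
    /// combined.merge(misc_run, None);
    /// ```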
    pub fn merge<'a>(&mut self, other: Self, pref_other: impl Into<Option<&'a str>>) {
        let pref = if let Some(pref) = pref_other.into() {
            "crates/".to_string() + pref + "::"
        } else {
            String::new()
        };
        self.tests = std::mem::take(&mut self.tests)
            .into_iter()
            .chain(
                other
                    .tests
                    .into_iter()
                    .map(|(name, md, tm)| (pref.clone() + &name, md, tm)),
            )
            .collect();
    }

    /// Evaluates the performance of `self` against `baseline`. The latter is taken
    /// as the comparison point, i.e. a positive resulting `PerfReport` means that
    /// `self` performed better.
    ///
    /// # Panics
    /// Panics if any `TestMdata` attached to a successful test in `self` or
    /// `baseline` has its `iterations` field set to `None`.
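    ///
    /// A rough sketch of how the deltas come out (names are illustrative):
    ///
    /// ```ignore
    /// // `current` and `baseline` are two `Output`s covering the same tests.
    /// let report = current.compare_perf(baseline);
    /// // A test running at 120 iters/sec in `current` vs 100 iters/sec in
    /// // `baseline` contributes (120 / 100) - 1 = +20% to its importance
    /// // category's delta, weighted by the test's weight.
    /// println!("{report}");
    /// ```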
    #[must_use]
    pub fn compare_perf(self, baseline: Self) -> PerfReport {
        let self_categories = self.collapse();
        let mut other_categories = baseline.collapse();

        let deltas = self_categories
            .into_iter()
            .filter_map(|(cat, self_data)| {
                // Only compare categories where both runs have data.
                let mut other_data = other_categories.remove(&cat)?;
                let mut max = f64::MIN;
                let mut min = f64::MAX;

                // Running totals for averaging out tests.
                let mut r_total_numerator = 0.;
                let mut r_total_denominator = 0;
                // Yeah, this does a hash lookup per test, but realistically it'll
                // hardly be a bottleneck.
                for (name, (s_timings, s_iters, weight)) in self_data {
                    // If the two runs disagree on a test's weight, prefer the
                    // new run's.
                    let Some((o_timings, o_iters, _)) = other_data.remove(&name) else {
                        continue;
                    };
                    let shift =
                        (s_timings.iters_per_sec(s_iters) / o_timings.iters_per_sec(o_iters)) - 1.;
                    if shift > max {
                        max = shift;
                    }
                    if shift < min {
                        min = shift;
                    }
                    r_total_numerator += shift * f64::from(weight);
                    r_total_denominator += u32::from(weight);
                }
                // There were no runs here!
                if r_total_denominator == 0 {
                    None
                } else {
                    let mean = r_total_numerator / f64::from(r_total_denominator);
                    // TODO: also aggregate standard deviation? That's harder to keep
                    // meaningful, though, since we don't know which tests are correlated.
                    Some((cat, PerfDelta { max, mean, min }))
                }
            })
            .collect();

        PerfReport { deltas }
    }

    /// Collapses this `Output` into a `HashMap` keyed by `Importance`, with
    /// each importance category mapping test names to their results.
    fn collapse(self) -> HashMap<Importance, CategoryInfo> {
        let mut categories = HashMap::<Importance, HashMap<String, _>>::default();
        for entry in self.tests {
            if let Some(mdata) = entry.1
                && let Ok(timings) = entry.2
            {
                if let Some(handle) = categories.get_mut(&mdata.importance) {
                    handle.insert(entry.0, (timings, mdata.iterations.unwrap(), mdata.weight));
                } else {
                    let mut new = HashMap::default();
                    new.insert(entry.0, (timings, mdata.iterations.unwrap(), mdata.weight));
                    categories.insert(mdata.importance, new);
                }
            }
        }

        categories
    }
}

impl std::fmt::Display for Output {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Don't print the header for an empty run.
        if self.tests.is_empty() {
            return Ok(());
        }

        // We want to print important tests at the top, then alphabetical.
        let mut sorted = self.clone();
        sorted.sort();
        // Markdown header for making a nice little table :>
        writeln!(
            f,
            "| Command | Iter/sec | Mean [ms] | SD [ms] | Iterations | Importance (weight) |",
        )?;
        writeln!(f, "|:---|---:|---:|---:|---:|---:|")?;
        for (name, metadata, timings) in &sorted.tests {
            match metadata {
                Some(metadata) => match timings {
                    // Happy path.
                    Ok(timings) => {
                        // If the test succeeded, then metadata.iterations is Some(_).
                        writeln!(
                            f,
                            "| {} | {:.2} | {} | {:.2} | {} | {} ({}) |",
                            name,
                            timings.iters_per_sec(metadata.iterations.unwrap()),
                            {
                                // Very small mean runtimes will give inaccurate
                                // results. Should probably also penalise weight.
                                let mean = timings.mean.as_secs_f64() * 1000.;
                                if mean < consts::NOISE_CUTOFF.as_secs_f64() * 1000. / 8. {
                                    format!("{mean:.2} (unreliable)")
                                } else {
                                    format!("{mean:.2}")
                                }
                            },
                            timings.stddev.as_secs_f64() * 1000.,
                            metadata.iterations.unwrap(),
                            metadata.importance,
                            metadata.weight,
                        )?;
                    }
                    // We have (some) metadata, but the test errored.
                    Err(err) => writeln!(
                        f,
                        "| ({}) {} | N/A | N/A | N/A | {} | {} ({}) |",
                        err,
                        name,
                        metadata
                            .iterations
                            .map_or_else(|| "N/A".to_owned(), |i| format!("{i}")),
                        metadata.importance,
                        metadata.weight
                    )?,
                },
                // No metadata, couldn't even parse the test output.
                None => writeln!(
                    f,
                    "| ({}) {} | N/A | N/A | N/A | N/A | N/A |",
                    timings.as_ref().unwrap_err(),
                    name
                )?,
            }
        }
        Ok(())
    }
}

/// The difference in performance between two runs within a given importance
/// category.
struct PerfDelta {
    /// The biggest improvement / least bad regression.
    max: f64,
    /// The weighted average change in test times.
    mean: f64,
    /// The worst regression / smallest improvement.
    min: f64,
}

/// Shim type for reporting all performance deltas across importance categories.
pub struct PerfReport {
    /// Inner (group, diff) pairing.
    deltas: HashMap<Importance, PerfDelta>,
}

impl std::fmt::Display for PerfReport {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        if self.deltas.is_empty() {
            return write!(f, "(no matching tests)");
        }
        // Sort ascending by importance; the loop below iterates in reverse, so
        // the most important categories get printed first.
        let mut sorted = self.deltas.iter().collect::<Vec<_>>();
        sorted.sort_unstable_by(|a, b| a.0.cmp(b.0));
        writeln!(f, "| Category | Max | Mean | Min |")?;
        // We don't want to print too many newlines at the end, so handle newlines
        // a little jankily like this.
        write!(f, "|:---|---:|---:|---:|")?;
        for (cat, delta) in sorted.into_iter().rev() {
            const SIGN_POS: &str = "↑";
            const SIGN_NEG: &str = "↓";
            const SIGN_NEUTRAL: &str = "±";

            let prettify = |time: f64| {
                // Treat shifts within ±5% as neutral.
                let sign = if time > 0.05 {
                    SIGN_POS
                } else if time >= -0.05 {
                    SIGN_NEUTRAL
                } else {
                    SIGN_NEG
                };
                format!("{} {:.1}%", sign, time.abs() * 100.)
            };

            // Pretty-print these instead of just using the float display impl.
            write!(
                f,
                "\n| {cat} | {} | {} | {} |",
                prettify(delta.max),
                prettify(delta.mean),
                prettify(delta.min)
            )?;
        }
        Ok(())
    }
}