lib.rs

//! Some constants and datatypes used in the Zed perf profiler. Should only be
//! consumed by the crate providing the matching macros.

use collections::HashMap;
use serde::{Deserialize, Serialize};
use std::time::Duration;

pub mod consts {
    //! Preset identifiers and constants so that the profiler and proc macro agree
    //! on their communication protocol.
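    //!
    //! As a rough sketch of how these pieces fit together (the exact value
    //! encoding on each line is decided by the macro side, so treat this as
    //! illustrative only), a metadata line's identifier is the line prefix
    //! followed by a line name:
    //!
    //! ```ignore
    //! // Hypothetical usage: "ZED_MDATA_" + "version" -> "ZED_MDATA_version".
    //! let line_id = format!("{MDATA_LINE_PREF}{VERSION_LINE_NAME}");
    //! ```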

    /// The suffix on the actual test function.
    pub const SUF_NORMAL: &str = "__ZED_PERF_FN";
    /// The suffix on an extra function which prints metadata about a test to stdout.
    pub const SUF_MDATA: &str = "__ZED_PERF_MDATA";
    /// The env var in which we pass the iteration count to our tests.
    pub const ITER_ENV_VAR: &str = "ZED_PERF_ITER";
    /// The prefix printed on all benchmark test metadata lines, to distinguish it from
    /// possible output by the test harness itself.
    pub const MDATA_LINE_PREF: &str = "ZED_MDATA_";
    /// The version number for the data returned from the test metadata function.
    /// Increment on non-backwards-compatible changes.
    pub const MDATA_VER: u32 = 0;
    /// The default weight, if none is specified.
    pub const WEIGHT_DEFAULT: u8 = 50;
    /// How long a test must have run to be assumed to be reliable-ish.
    pub const NOISE_CUTOFF: std::time::Duration = std::time::Duration::from_millis(250);

    /// Identifier for the iteration count line in test metadata.
    pub const ITER_COUNT_LINE_NAME: &str = "iter_count";
    /// Identifier for the weight line in test metadata.
    pub const WEIGHT_LINE_NAME: &str = "weight";
    /// Identifier for the importance line in test metadata.
    pub const IMPORTANCE_LINE_NAME: &str = "importance";
    /// Identifier for the version line in test metadata.
    pub const VERSION_LINE_NAME: &str = "version";

    /// Where to save json run information.
    pub const RUNS_DIR: &str = ".perf-runs";
}

/// How relevant a benchmark is.
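///
/// Variants are ordered by how much attention a regression deserves, so the
/// derived `Ord` ranks `Fluff` lowest and `Critical` highest. A small
/// illustrative check:
///
/// ```ignore
/// assert!(Importance::Critical > Importance::Average);
/// assert!(Importance::Iffy > Importance::Fluff);
/// ```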
#[derive(Clone, Debug, Default, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize, Deserialize)]
pub enum Importance {
    /// Regressions shouldn't be accepted without good reason.
    Critical = 4,
    /// Regressions should be paid extra attention.
    Important = 3,
    /// No extra attention should be paid to regressions, but they might still
    /// be indicative of something happening.
    #[default]
    Average = 2,
    /// Unclear if regressions are likely to be meaningful, but still worth keeping
    /// an eye on. Lowest level that's checked by default by the profiler.
    Iffy = 1,
    /// Regressions are likely to be spurious or don't affect core functionality.
    /// Only relevant if a lot of them happen, or as supplemental evidence for a
    /// higher-importance benchmark regressing. Not checked by default.
    Fluff = 0,
}

impl std::fmt::Display for Importance {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Importance::Critical => f.write_str("critical"),
            Importance::Important => f.write_str("important"),
            Importance::Average => f.write_str("average"),
            Importance::Iffy => f.write_str("iffy"),
            Importance::Fluff => f.write_str("fluff"),
        }
    }
}

/// Why or when did this test fail?
#[derive(Clone, Debug, Serialize, Deserialize)]
pub enum FailKind {
    /// Failed while triaging it to determine the iteration count.
    Triage,
    /// Failed while profiling it.
    Profile,
    /// Failed due to an incompatible version for the test.
    VersionMismatch,
    /// Skipped due to filters applied on the perf run.
    Skipped,
}

impl std::fmt::Display for FailKind {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            FailKind::Triage => f.write_str("failed in triage"),
            FailKind::Profile => f.write_str("failed while profiling"),
            FailKind::VersionMismatch => f.write_str("test version mismatch"),
            FailKind::Skipped => f.write_str("skipped"),
        }
    }
}

/// Information about a given perf test.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct TestMdata {
    /// A version number for when the test was generated. If this is greater
    /// than the version this test handler expects, one of the following will
    /// happen in an unspecified manner:
    /// - The test is skipped silently.
    /// - The handler exits with an error message indicating the version mismatch
    ///   or inability to parse the metadata.
    ///
    /// INVARIANT: If `version` <= `MDATA_VER`, this tool *must* be able to
    /// correctly parse the output of this test.
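    ///
    /// As an illustrative sketch (not the profiler's actual code), a handler
    /// might gate on this like so:
    ///
    /// ```ignore
    /// if mdata.version > consts::MDATA_VER {
    ///     // Too new for us; report a FailKind::VersionMismatch or skip.
    /// }
    /// ```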
    pub version: u32,
    /// How many iterations to pass to this test, if this is preset.
    pub iterations: Option<usize>,
    /// The importance of this particular test. See the docs on `Importance` for
    /// details.
    pub importance: Importance,
    /// The weight of this particular test within its importance category. Used
    /// when comparing across runs.
    pub weight: u8,
}

/// The actual timings of a test, as measured by Hyperfine.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct Timings {
    /// Mean runtime for a full run of this test (i.e. all of its iterations).
    pub mean: Duration,
    /// Standard deviation for the above.
    pub stddev: Duration,
}

impl Timings {
    /// How many iterations does this test seem to do per second?
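    ///
    /// A quick worked example (values are made up): a mean of 500 ms covering
    /// 100 iterations works out to `(1000 / 500) * 100 = 200` iterations per
    /// second.
    ///
    /// ```ignore
    /// let timings = Timings {
    ///     mean: Duration::from_millis(500),
    ///     stddev: Duration::ZERO,
    /// };
    /// assert!((timings.iters_per_sec(100) - 200.0).abs() < 1e-9);
    /// ```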
    #[expect(
        clippy::cast_precision_loss,
        reason = "We only care about a couple sig figs anyways"
    )]
    #[must_use]
    pub fn iters_per_sec(&self, total_iters: usize) -> f64 {
        (1000. / self.mean.as_millis() as f64) * total_iters as f64
    }
}

/// Aggregate output of all tests run by this handler.
#[derive(Clone, Debug, Default, Serialize, Deserialize)]
pub struct Output {
    /// A list of test outputs. Format is `(test_name, metadata, timings)`;
    /// an `Ok` in the last field indicates the test succeeded.
    ///
    /// INVARIANT: If the test succeeded, the second field is `Some(mdata)` and
    /// `mdata.iterations` is `Some(_)`.
    tests: Vec<(String, Option<TestMdata>, Result<Timings, FailKind>)>,
}

impl Output {
    /// Instantiates an empty "output". Useful for merging.
    #[must_use]
    pub fn blank() -> Self {
        Output { tests: Vec::new() }
    }

    /// Reports a success and adds it to this run's `Output`.
    pub fn success(
        &mut self,
        name: impl AsRef<str>,
        mut mdata: TestMdata,
        iters: usize,
        timings: Timings,
    ) {
        mdata.iterations = Some(iters);
        self.tests
            .push((name.as_ref().to_string(), Some(mdata), Ok(timings)));
    }

    /// Reports a failure and adds it to this run's `Output`. If this test was tried
    /// with some number of iterations (i.e. this was not a version mismatch or skipped
    /// test), that iteration count should be reported as well.
    ///
    /// Using the `fail!()` macro is usually more convenient.
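    ///
    /// Illustrative call (the test name here is hypothetical):
    ///
    /// ```ignore
    /// output.failure("my_crate::my_bench", None, None, FailKind::Skipped);
    /// ```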
    pub fn failure(
        &mut self,
        name: impl AsRef<str>,
        mut mdata: Option<TestMdata>,
        attempted_iters: Option<usize>,
        kind: FailKind,
    ) {
        if let Some(ref mut mdata) = mdata {
            mdata.iterations = attempted_iters;
        }
        self.tests
            .push((name.as_ref().to_string(), mdata, Err(kind)));
    }

    /// True if no tests executed this run.
    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.tests.is_empty()
    }

    /// Sorts the runs in the output in the order that we want it printed.
    pub fn sort(&mut self) {
        self.tests.sort_unstable_by(|a, b| match (a, b) {
            // Tests where we got no metadata go at the end.
            ((_, Some(_), _), (_, None, _)) => std::cmp::Ordering::Less,
            ((_, None, _), (_, Some(_), _)) => std::cmp::Ordering::Greater,
            // Then sort by descending importance, then descending weight.
            ((_, Some(a_mdata), _), (_, Some(b_mdata), _)) => {
                let c = b_mdata.importance.cmp(&a_mdata.importance);
                if matches!(c, std::cmp::Ordering::Equal) {
                    b_mdata.weight.cmp(&a_mdata.weight)
                } else {
                    c
                }
            }
            // Lastly by name.
            ((a_name, ..), (b_name, ..)) => a_name.cmp(b_name),
        });
    }

    /// Merges the output of two runs, appending a prefix to the results of the new run.
    /// To be used in conjunction with `Output::blank()`, or else only some tests will have
    /// a prefix set.
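    ///
    /// Illustrative sketch (names are made up): merging in a run from the crate
    /// `editor` renames its test `open_buffer` to `crates/editor::open_buffer`.
    ///
    /// ```ignore
    /// let mut combined = Output::blank();
    /// combined.merge(editor_run, "editor");
    /// ```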
    pub fn merge(&mut self, other: Self, pref_other: impl AsRef<str>) {
        self.tests = std::mem::take(&mut self.tests)
            .into_iter()
            .chain(other.tests.into_iter().map(|(name, md, tm)| {
                let mut new_name = "crates/".to_string();
                new_name.push_str(pref_other.as_ref());
                new_name.push_str("::");
                new_name.push_str(&name);
                (new_name, md, tm)
            }))
            .collect();
    }

    /// Evaluates the performance of `self` against `baseline`. The latter is taken
    /// as the comparison point, i.e. a positive resulting `PerfReport` means that
    /// `self` performed better.
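    ///
    /// As a worked example of the scale: if a test in `self` manages 110
    /// iterations per second where `baseline` managed 100, its shift is
    /// `110 / 100 - 1 = 0.10`, i.e. a 10% improvement feeding into that
    /// category's `PerfDelta`.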
    ///
    /// # Panics
    /// `self` and `baseline` are assumed to have the iterations field on all
    /// `TestMdata`s set to `Some(_)` if the `TestMdata` is present itself.
    #[must_use]
    pub fn compare_perf(self, baseline: Self) -> PerfReport {
        let self_categories = self.collapse();
        let mut other_categories = baseline.collapse();

        let deltas = self_categories
            .into_iter()
            .filter_map(|(cat, self_data)| {
                // Only compare categories where both runs have data.
                let mut other_data = other_categories.remove(&cat)?;
                let mut max = 0.;
                let mut min = 0.;

                // Running totals for averaging out tests.
                let mut r_total_numerator = 0.;
                let mut r_total_denominator = 0;
                // Yeah this is O(n^2), but realistically it'll hardly be a bottleneck.
                for (name, (s_timings, s_iters, weight)) in self_data {
                    // If the runs disagree on a test's weight, prefer the new run's.
                    let Some((o_timings, o_iters, _)) = other_data.remove(&name) else {
                        continue;
                    };
                    let shift =
                        (s_timings.iters_per_sec(s_iters) / o_timings.iters_per_sec(o_iters)) - 1.;
                    if shift > max {
                        max = shift;
                    }
                    if shift < min {
                        min = shift;
                    }
                    r_total_numerator += shift * f64::from(weight);
                    r_total_denominator += u32::from(weight);
                }
                let mean = r_total_numerator / f64::from(r_total_denominator);
                // TODO: also aggregate standard deviation? That's harder to keep
                // meaningful, though, since we don't know which tests are correlated.
                Some((cat, PerfDelta { max, mean, min }))
            })
            .collect();

        PerfReport { deltas }
    }

    /// Collapses this `Output` into a `HashMap` keyed by `Importance`, with each
    /// category's tests represented as a map of `name -> (Timings, iterations, weight)`.
    fn collapse(self) -> HashMap<Importance, HashMap<String, (Timings, usize, u8)>> {
        let mut categories = HashMap::<Importance, HashMap<String, _>>::default();
        for entry in self.tests {
            if let Some(mdata) = entry.1
                && let Ok(timings) = entry.2
            {
                if let Some(handle) = categories.get_mut(&mdata.importance) {
                    handle.insert(entry.0, (timings, mdata.iterations.unwrap(), mdata.weight));
                } else {
                    let mut new = HashMap::default();
                    new.insert(entry.0, (timings, mdata.iterations.unwrap(), mdata.weight));
                    categories.insert(mdata.importance, new);
                }
            }
        }

        categories
    }
}

impl std::fmt::Display for Output {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Don't print the header for an empty run.
        if self.tests.is_empty() {
            return Ok(());
        }

        // We want to print important tests at the top, then alphabetical.
        let mut sorted = self.clone();
        sorted.sort();
        // Markdown header for making a nice little table :>
        writeln!(
            f,
            "| Command | Iter/sec | Mean [ms] | SD [ms] | Iterations | Importance (weight) |",
        )?;
        writeln!(f, "|:---|---:|---:|---:|---:|---:|")?;
        for (name, metadata, timings) in &sorted.tests {
            match metadata {
                Some(metadata) => match timings {
                    // Happy path.
                    Ok(timings) => {
                        // If the test succeeded, then metadata.iterations is Some(_).
                        writeln!(
                            f,
                            "| {} | {:.2} | {} | {:.2} | {} | {} ({}) |",
                            name,
                            timings.iters_per_sec(metadata.iterations.unwrap()),
                            {
                                // Very small mean runtimes will give inaccurate
                                // results. Should probably also penalise weight.
                                let mean = timings.mean.as_secs_f64() * 1000.;
                                if mean < consts::NOISE_CUTOFF.as_secs_f64() * 1000. / 8. {
                                    format!("{mean:.2} (unreliable)")
                                } else {
                                    format!("{mean:.2}")
                                }
                            },
                            timings.stddev.as_secs_f64() * 1000.,
                            metadata.iterations.unwrap(),
                            metadata.importance,
                            metadata.weight,
                        )?;
                    }
                    // We have (some) metadata, but the test errored.
                    Err(err) => writeln!(
                        f,
                        "| ({}) {} | N/A | N/A | N/A | {} | {} ({}) |",
                        err,
                        name,
                        metadata
                            .iterations
                            .map_or_else(|| "N/A".to_owned(), |i| format!("{i}")),
                        metadata.importance,
                        metadata.weight
                    )?,
                },
                // No metadata, couldn't even parse the test output.
                None => writeln!(
                    f,
                    "| ({}) {} | N/A | N/A | N/A | N/A | N/A |",
                    timings.as_ref().unwrap_err(),
                    name
                )?,
            }
        }
        writeln!(f)?;
        Ok(())
    }
}

/// The difference in performance between two runs within a given importance
/// category.
struct PerfDelta {
    /// The biggest improvement / least bad regression.
    max: f64,
    /// The weighted average change in test throughput.
    mean: f64,
    /// The worst regression / smallest improvement.
    min: f64,
}

/// Shim type for reporting all performance deltas across importance categories.
pub struct PerfReport {
    /// Inner (group, diff) pairing.
    deltas: HashMap<Importance, PerfDelta>,
}

impl std::fmt::Display for PerfReport {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        if self.deltas.is_empty() {
            return write!(f, "(no matching tests)");
        }
        // Sort ascending by importance; the reversed iteration below then prints
        // the most important categories first.
        let mut sorted = self.deltas.iter().collect::<Vec<_>>();
        sorted.sort_unstable_by(|a, b| a.0.cmp(b.0));
        writeln!(f, "| Category | Max | Mean | Min |")?;
        // We don't want to print too many newlines at the end, so handle newlines
        // a little jankily like this.
        write!(f, "|:---|---:|---:|---:|")?;
        for (cat, delta) in sorted.into_iter().rev() {
            write!(
                f,
                "\n| {cat} | {:.3} | {:.3} | {:.3} |",
                delta.max, delta.mean, delta.min
            )?;
        }
        Ok(())
    }
}