implementation.rs

//! The implementation of this crate is kept in a separate module
//! so that it is easy to publish this crate as part of GPUI's dependencies.

use collections::HashMap;
use serde::{Deserialize, Serialize};
use std::{num::NonZero, time::Duration};

pub mod consts {
    //! Preset identifiers and constants so that the profiler and proc macro agree
    //! on their communication protocol.

    /// The suffix on the actual test function.
    pub const SUF_NORMAL: &str = "__ZED_PERF_FN";
    /// The suffix on an extra function which prints metadata about a test to stdout.
    pub const SUF_MDATA: &str = "__ZED_PERF_MDATA";
    /// The env var in which we pass the iteration count to our tests.
    pub const ITER_ENV_VAR: &str = "ZED_PERF_ITER";
    /// The prefix printed on all benchmark test metadata lines, to distinguish them from
    /// possible output by the test harness itself.
    pub const MDATA_LINE_PREF: &str = "ZED_MDATA_";
    /// The version number for the data returned from the test metadata function.
    /// Increment on non-backwards-compatible changes.
    pub const MDATA_VER: u32 = 0;
    /// The default weight, if none is specified.
    pub const WEIGHT_DEFAULT: u8 = 50;
    /// How long a test must have run to be assumed to be reliable-ish.
    pub const NOISE_CUTOFF: std::time::Duration = std::time::Duration::from_millis(250);

    /// Identifier for the iteration count in test metadata.
    pub const ITER_COUNT_LINE_NAME: &str = "iter_count";
    /// Identifier for the weight in test metadata.
    pub const WEIGHT_LINE_NAME: &str = "weight";
    /// Identifier for importance in test metadata.
    pub const IMPORTANCE_LINE_NAME: &str = "importance";
    /// Identifier for the test metadata version.
    pub const VERSION_LINE_NAME: &str = "version";

    /// Where to save JSON run information.
    pub const RUNS_DIR: &str = ".perf-runs";
}
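
// Illustrative sketch (not part of the original crate): metadata lines are told apart
// from ordinary test-harness output by `MDATA_LINE_PREF`, so a consumer can filter on
// that prefix and then look for the known line names. The exact key/value layout after
// the prefix is assumed here purely for illustration.
#[cfg(test)]
mod consts_example {
    use super::consts;

    #[test]
    fn metadata_lines_are_distinguishable() {
        // A line the metadata function might print (assumed layout).
        let line = format!("{}{} 0", consts::MDATA_LINE_PREF, consts::VERSION_LINE_NAME);
        // Ordinary harness output lacks the prefix and can be ignored.
        let noise = "running 1 test";

        assert!(noise.strip_prefix(consts::MDATA_LINE_PREF).is_none());
        let payload = line.strip_prefix(consts::MDATA_LINE_PREF).unwrap();
        assert!(payload.starts_with(consts::VERSION_LINE_NAME));
    }
}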

/// How relevant a benchmark is.
#[derive(Clone, Debug, Default, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize, Deserialize)]
pub enum Importance {
    /// Regressions shouldn't be accepted without good reason.
    Critical = 4,
    /// Regressions should be paid extra attention.
    Important = 3,
    /// No extra attention should be paid to regressions, but they might still
    /// be indicative of something happening.
    #[default]
    Average = 2,
    /// Unclear if regressions are likely to be meaningful, but still worth keeping
    /// an eye on. Lowest level that's checked by default by the profiler.
    Iffy = 1,
    /// Regressions are likely to be spurious or don't affect core functionality.
    /// Only relevant if a lot of them happen, or as supplemental evidence for a
    /// higher-importance benchmark regressing. Not checked by default.
    Fluff = 0,
}
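
// A small check (not from the original source): the derived `Ord` follows the explicit
// discriminants above, so `Critical` compares greater than `Fluff` even though it is
// declared first. The sorting and comparison logic below relies on this ordering.
#[cfg(test)]
mod importance_ordering_example {
    use super::Importance;

    #[test]
    fn ordering_follows_discriminants() {
        assert!(Importance::Critical > Importance::Important);
        assert!(Importance::Important > Importance::Average);
        assert!(Importance::Average > Importance::Iffy);
        assert!(Importance::Iffy > Importance::Fluff);
    }
}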

impl std::fmt::Display for Importance {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Importance::Critical => f.write_str("critical"),
            Importance::Important => f.write_str("important"),
            Importance::Average => f.write_str("average"),
            Importance::Iffy => f.write_str("iffy"),
            Importance::Fluff => f.write_str("fluff"),
        }
    }
}

/// Why or when did this test fail?
#[derive(Clone, Debug, Serialize, Deserialize)]
pub enum FailKind {
    /// Failed while triaging it to determine the iteration count.
    Triage,
    /// Failed while profiling it.
    Profile,
    /// Failed due to an incompatible version for the test.
    VersionMismatch,
    /// Could not parse metadata for a test.
    BadMetadata,
    /// Skipped due to filters applied on the perf run.
    Skipped,
}

impl std::fmt::Display for FailKind {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            FailKind::Triage => f.write_str("errored in triage"),
            FailKind::Profile => f.write_str("errored while profiling"),
            FailKind::VersionMismatch => f.write_str("test version mismatch"),
            FailKind::BadMetadata => f.write_str("bad test metadata"),
            FailKind::Skipped => f.write_str("skipped"),
        }
    }
}

/// Information about a given perf test.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct TestMdata {
    /// A version number for when the test was generated. If this is greater
    /// than the version this test handler expects, one of the following will
    /// happen in an unspecified manner:
    /// - The test is skipped silently.
    /// - The handler exits with an error message indicating the version mismatch
    ///   or inability to parse the metadata.
    ///
    /// INVARIANT: If `version` <= `MDATA_VER`, this tool *must* be able to
    /// correctly parse the output of this test.
    pub version: u32,
    /// How many iterations to pass to this test if the count is preset, or how many
    /// iterations the test ended up running if determined at runtime.
    pub iterations: Option<NonZero<usize>>,
    /// The importance of this particular test. See the docs on `Importance` for
    /// details.
    pub importance: Importance,
    /// The weight of this particular test within its importance category. Used
    /// when comparing across runs.
    pub weight: u8,
}
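
// Hedged sketch (not from the original source) of how a handler might apply the version
// invariant documented on `TestMdata::version`: metadata newer than `MDATA_VER` cannot
// be assumed parseable, so such a test would be skipped or reported as a mismatch. The
// `check_version` helper is hypothetical.
#[cfg(test)]
mod mdata_version_example {
    use super::*;

    fn check_version(mdata: &TestMdata) -> Result<(), FailKind> {
        if mdata.version <= consts::MDATA_VER {
            Ok(())
        } else {
            Err(FailKind::VersionMismatch)
        }
    }

    #[test]
    fn newer_metadata_is_rejected() {
        let mdata = TestMdata {
            version: consts::MDATA_VER + 1,
            iterations: None,
            importance: Importance::Average,
            weight: consts::WEIGHT_DEFAULT,
        };
        assert!(matches!(check_version(&mdata), Err(FailKind::VersionMismatch)));
    }
}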

/// The actual timings of a test, as measured by Hyperfine.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct Timings {
    /// Mean runtime for `self.iter_total` runs of this test.
    pub mean: Duration,
    /// Standard deviation for the above.
    pub stddev: Duration,
}

impl Timings {
    /// How many iterations does this test seem to do per second?
    #[expect(
        clippy::cast_precision_loss,
        reason = "We only care about a couple sig figs anyways"
    )]
    #[must_use]
    pub fn iters_per_sec(&self, total_iters: NonZero<usize>) -> f64 {
        (1000. / self.mean.as_millis() as f64) * total_iters.get() as f64
    }
}
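
// A worked example (not from the original source) of the arithmetic above: a mean of
// 250 ms for a batch of 500 iterations comes out to (1000 / 250) * 500 = 2000
// iterations per second.
#[cfg(test)]
mod timings_example {
    use super::*;
    use std::{num::NonZero, time::Duration};

    #[test]
    fn iters_per_sec_arithmetic() {
        let timings = Timings {
            mean: Duration::from_millis(250),
            stddev: Duration::from_millis(5),
        };
        let iters = NonZero::new(500_usize).unwrap();
        assert!((timings.iters_per_sec(iters) - 2000.).abs() < 1e-9);
    }
}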

/// Aggregate results, meant to be used for a given importance category. Each
/// test name corresponds to its benchmark results, iteration count, and weight.
type CategoryInfo = HashMap<String, (Timings, NonZero<usize>, u8)>;

/// Aggregate output of all tests run by this handler.
#[derive(Clone, Debug, Default, Serialize, Deserialize)]
pub struct Output {
    /// A list of test outputs. Format is `(test_name, mdata, timings)`.
    /// `timings` being `Ok(_)` indicates the test succeeded.
    ///
    /// INVARIANT: If the test succeeded, the second field is `Some(mdata)` and
    /// `mdata.iterations` is `Some(_)`.
    tests: Vec<(String, Option<TestMdata>, Result<Timings, FailKind>)>,
}

impl Output {
    /// Instantiates an empty "output". Useful for merging.
    #[must_use]
    pub fn blank() -> Self {
        Output { tests: Vec::new() }
    }

    /// Reports a success and adds it to this run's `Output`.
    pub fn success(
        &mut self,
        name: impl AsRef<str>,
        mut mdata: TestMdata,
        iters: NonZero<usize>,
        timings: Timings,
    ) {
        mdata.iterations = Some(iters);
        self.tests
            .push((name.as_ref().to_string(), Some(mdata), Ok(timings)));
    }

    /// Reports a failure and adds it to this run's `Output`. If this test was tried
    /// with some number of iterations (i.e. this was not a version mismatch or skipped
    /// test), that iteration count should also be reported.
    ///
    /// Using the `fail!()` macro is usually more convenient.
    pub fn failure(
        &mut self,
        name: impl AsRef<str>,
        mut mdata: Option<TestMdata>,
        attempted_iters: Option<NonZero<usize>>,
        kind: FailKind,
    ) {
        if let Some(ref mut mdata) = mdata {
            mdata.iterations = attempted_iters;
        }
        self.tests
            .push((name.as_ref().to_string(), mdata, Err(kind)));
    }

    /// True if no tests were executed this run.
    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.tests.is_empty()
    }

    /// Sorts the runs in the output in the order that we want them printed.
    pub fn sort(&mut self) {
        self.tests.sort_unstable_by(|a, b| match (a, b) {
            // Tests where we got no metadata go at the end.
            ((_, Some(_), _), (_, None, _)) => std::cmp::Ordering::Less,
            ((_, None, _), (_, Some(_), _)) => std::cmp::Ordering::Greater,
            // Then sort by importance, then weight, highest first.
            ((_, Some(a_mdata), _), (_, Some(b_mdata), _)) => {
                let c = b_mdata.importance.cmp(&a_mdata.importance);
                if matches!(c, std::cmp::Ordering::Equal) {
                    b_mdata.weight.cmp(&a_mdata.weight)
                } else {
                    c
                }
            }
            // Lastly by name.
            ((a_name, ..), (b_name, ..)) => a_name.cmp(b_name),
        });
    }

    /// Merges the output of two runs, appending a prefix to the results of the new run.
    /// To be used in conjunction with `Output::blank()`, or else only some tests will have
    /// a prefix set.
    pub fn merge<'a>(&mut self, other: Self, pref_other: impl Into<Option<&'a str>>) {
        let pref = if let Some(pref) = pref_other.into() {
            "crates/".to_string() + pref + "::"
        } else {
            String::new()
        };
        self.tests = std::mem::take(&mut self.tests)
            .into_iter()
            .chain(
                other
                    .tests
                    .into_iter()
                    .map(|(name, md, tm)| (pref.clone() + &name, md, tm)),
            )
            .collect();
    }

    /// Evaluates the performance of `self` against `baseline`. The latter is taken
    /// as the comparison point, i.e. a positive resulting `PerfReport` means that
    /// `self` performed better.
    ///
    /// # Panics
    /// Panics if any `TestMdata` present in `self` or `baseline` has its
    /// `iterations` field set to `None`.
    #[must_use]
    pub fn compare_perf(self, baseline: Self) -> PerfReport {
        let self_categories = self.collapse();
        let mut other_categories = baseline.collapse();

        let deltas = self_categories
            .into_iter()
            .filter_map(|(cat, self_data)| {
                // Only compare categories where both runs have data.
                let mut other_data = other_categories.remove(&cat)?;
                let mut max = f64::MIN;
                let mut min = f64::MAX;

                // Running totals for averaging out tests.
                let mut r_total_numerator = 0.;
                let mut r_total_denominator = 0;
                // Yeah this is O(n^2), but realistically it'll hardly be a bottleneck.
                for (name, (s_timings, s_iters, weight)) in self_data {
                    // If the two runs disagree on a test's weight, the new run's weight wins.
                    let Some((o_timings, o_iters, _)) = other_data.remove(&name) else {
                        continue;
                    };
                    let shift =
                        (o_timings.iters_per_sec(o_iters) / s_timings.iters_per_sec(s_iters)) - 1.;
                    if shift > max {
                        max = shift;
                    }
                    if shift < min {
                        min = shift;
                    }
                    r_total_numerator += shift * f64::from(weight);
                    r_total_denominator += u32::from(weight);
                }
                // There were no matching runs in this category!
                if r_total_denominator == 0 {
                    None
                } else {
                    let mean = r_total_numerator / f64::from(r_total_denominator);
                    // TODO: also aggregate standard deviation? That's harder to keep
                    // meaningful, though, since we don't know which tests are correlated.
                    Some((cat, PerfDelta { max, mean, min }))
                }
            })
            .collect();

        PerfReport { deltas }
    }

    /// Collapses this `Output` into a `HashMap` keyed by `Importance`, with
    /// each importance category containing its tests' data.
    fn collapse(self) -> HashMap<Importance, CategoryInfo> {
        let mut categories = HashMap::<Importance, CategoryInfo>::default();
        for (name, mdata, timings) in self.tests {
            if let Some(mdata) = mdata
                && let Ok(timings) = timings
            {
                categories
                    .entry(mdata.importance)
                    .or_default()
                    .insert(name, (timings, mdata.iterations.unwrap(), mdata.weight));
            }
        }

        categories
    }
}
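
// A minimal usage sketch (not from the original crate) tying the pieces above together:
// record one successful benchmark, merge it into an aggregate run under a crate prefix,
// and render the markdown table. The names ("editor", "open_buffer") and numbers are
// made up for illustration.
#[cfg(test)]
mod output_example {
    use super::*;
    use std::{num::NonZero, time::Duration};

    #[test]
    fn record_and_render() {
        let mut crate_run = Output::blank();
        crate_run.success(
            "open_buffer",
            TestMdata {
                version: consts::MDATA_VER,
                iterations: None, // Filled in by `success()`.
                importance: Importance::Average,
                weight: consts::WEIGHT_DEFAULT,
            },
            NonZero::new(1_000).unwrap(),
            Timings {
                mean: Duration::from_millis(400),
                stddev: Duration::from_millis(10),
            },
        );

        // Aggregate runs start blank so that every test ends up prefixed.
        let mut all = Output::blank();
        all.merge(crate_run, "editor");
        assert!(!all.is_empty());
        // Rendered as a markdown table; the merged name carries the prefix.
        assert!(all.to_string().contains("crates/editor::open_buffer"));
    }
}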

impl std::fmt::Display for Output {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Don't print the header for an empty run.
        if self.tests.is_empty() {
            return Ok(());
        }

        // We want to print important tests at the top, then alphabetical.
        let mut sorted = self.clone();
        sorted.sort();
        // Markdown header for making a nice little table :>
        writeln!(
            f,
            "| Command | Iter/sec | Mean [ms] | SD [ms] | Iterations | Importance (weight) |",
        )?;
        writeln!(f, "|:---|---:|---:|---:|---:|---:|")?;
        for (name, metadata, timings) in &sorted.tests {
            match metadata {
                Some(metadata) => match timings {
                    // Happy path.
                    Ok(timings) => {
                        // If the test succeeded, then metadata.iterations is Some(_).
                        writeln!(
                            f,
                            "| {} | {:.2} | {} | {:.2} | {} | {} ({}) |",
                            name,
                            timings.iters_per_sec(metadata.iterations.unwrap()),
                            {
                                // Very small mean runtimes will give inaccurate
                                // results. Should probably also penalise weight.
                                let mean = timings.mean.as_secs_f64() * 1000.;
                                if mean < consts::NOISE_CUTOFF.as_secs_f64() * 1000. / 8. {
                                    format!("{mean:.2} (unreliable)")
                                } else {
                                    format!("{mean:.2}")
                                }
                            },
                            timings.stddev.as_secs_f64() * 1000.,
                            metadata.iterations.unwrap(),
                            metadata.importance,
                            metadata.weight,
                        )?;
                    }
                    // We have (some) metadata, but the test errored.
                    Err(err) => writeln!(
                        f,
                        "| ({}) {} | N/A | N/A | N/A | {} | {} ({}) |",
                        err,
                        name,
                        metadata
                            .iterations
                            .map_or_else(|| "N/A".to_owned(), |i| format!("{i}")),
                        metadata.importance,
                        metadata.weight
                    )?,
                },
                // No metadata, couldn't even parse the test output.
                None => writeln!(
                    f,
                    "| ({}) {} | N/A | N/A | N/A | N/A | N/A |",
                    timings.as_ref().unwrap_err(),
                    name
                )?,
            }
        }
        Ok(())
    }
}

/// The difference in performance between two runs within a given importance
/// category.
struct PerfDelta {
    /// The biggest improvement / least bad regression.
    max: f64,
    /// The weighted average change in test times.
    mean: f64,
    /// The worst regression / smallest improvement.
    min: f64,
}

/// Shim type for reporting all performance deltas across importance categories.
pub struct PerfReport {
    /// Inner (group, diff) pairing.
    deltas: HashMap<Importance, PerfDelta>,
}

impl std::fmt::Display for PerfReport {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        if self.deltas.is_empty() {
            return write!(f, "(no matching tests)");
        }
        let mut sorted = self.deltas.iter().collect::<Vec<_>>();
        // Sort ascending by importance; we iterate in reverse below so that the
        // most important categories are printed first.
        sorted.sort_unstable_by(|a, b| a.0.cmp(b.0));
        writeln!(f, "| Category | Max | Mean | Min |")?;
        // We don't want to print too many newlines at the end, so handle newlines
        // a little jankily like this.
        write!(f, "|:---|---:|---:|---:|")?;
        for (cat, delta) in sorted.into_iter().rev() {
            const SIGN_POS: &str = "↑";
            const SIGN_NEG: &str = "↓";
            const SIGN_NEUTRAL_POS: &str = "±↑";
            const SIGN_NEUTRAL_NEG: &str = "±↓";

            let prettify = |time: f64| {
                let sign = if time > 0.05 {
                    SIGN_POS
                } else if time > 0. {
                    SIGN_NEUTRAL_POS
                } else if time > -0.05 {
                    SIGN_NEUTRAL_NEG
                } else {
                    SIGN_NEG
                };
                format!("{} {:.1}%", sign, time.abs() * 100.)
            };

            // Pretty-print these instead of just using the float display impl.
            write!(
                f,
                "\n| {cat} | {} | {} | {} |",
                prettify(delta.max),
                prettify(delta.mean),
                prettify(delta.min)
            )?;
        }
        Ok(())
    }
}
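
// End-to-end sketch (not from the original source) of producing a `PerfReport`: two
// runs containing the same test name are compared and rendered. The test name and
// numbers are made up, and no claim is made here about the sign convention of the
// resulting deltas.
#[cfg(test)]
mod perf_report_example {
    use super::*;
    use std::{num::NonZero, time::Duration};

    fn run_with_mean(mean_ms: u64) -> Output {
        let mut run = Output::blank();
        run.success(
            "example_bench",
            TestMdata {
                version: consts::MDATA_VER,
                iterations: None,
                importance: Importance::Average,
                weight: consts::WEIGHT_DEFAULT,
            },
            NonZero::new(1_000).unwrap(),
            Timings {
                mean: Duration::from_millis(mean_ms),
                stddev: Duration::from_millis(10),
            },
        );
        run
    }

    #[test]
    fn render_report() {
        let report = run_with_mean(400).compare_perf(run_with_mean(500));
        let rendered = report.to_string();
        // Both runs share the "average" category and the same test name,
        // so the report contains a row for that category.
        assert!(rendered.contains("| average |"));
    }
}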