reliability.rs

  1use anyhow::{Context as _, Result};
  2use client::{TelemetrySettings, telemetry::MINIDUMP_ENDPOINT};
  3use futures::AsyncReadExt;
  4use gpui::{App, AppContext as _};
  5use http_client::{self, HttpClient, HttpClientWithUrl};
  6use project::Project;
  7use proto::{CrashReport, GetCrashFilesResponse};
  8use reqwest::multipart::{Form, Part};
  9use settings::Settings;
 10use smol::stream::StreamExt;
 11use std::{ffi::OsStr, fs, sync::Arc};
 12use util::ResultExt;
 13
 14pub fn init(http_client: Arc<HttpClientWithUrl>, installation_id: Option<String>, cx: &mut App) {
 15    #[cfg(target_os = "macos")]
 16    monitor_main_thread_hangs(http_client.clone(), installation_id.clone(), cx);
 17
 18    if client::TelemetrySettings::get_global(cx).diagnostics {
 19        let client = http_client.clone();
 20        let id = installation_id.clone();
 21        cx.background_spawn(async move {
 22            upload_previous_minidumps(client, id).await.warn_on_err();
 23        })
 24        .detach()
 25    }
 26
 27    cx.observe_new(move |project: &mut Project, _, cx| {
 28        let http_client = http_client.clone();
 29        let installation_id = installation_id.clone();
 30
 31        let Some(remote_client) = project.remote_client() else {
 32            return;
 33        };
 34        remote_client.update(cx, |client, cx| {
 35            if !TelemetrySettings::get_global(cx).diagnostics {
 36                return;
 37            }
 38            let request = client.proto_client().request(proto::GetCrashFiles {});
 39            cx.background_spawn(async move {
 40                let GetCrashFilesResponse { crashes } = request.await?;
 41
 42                let Some(endpoint) = MINIDUMP_ENDPOINT.as_ref() else {
 43                    return Ok(());
 44                };
 45                for CrashReport {
 46                    metadata,
 47                    minidump_contents,
 48                } in crashes
 49                {
 50                    if let Some(metadata) = serde_json::from_str(&metadata).log_err() {
 51                        upload_minidump(
 52                            http_client.clone(),
 53                            endpoint,
 54                            minidump_contents,
 55                            &metadata,
 56                            installation_id.clone(),
 57                        )
 58                        .await
 59                        .log_err();
 60                    }
 61                }
 62
 63                anyhow::Ok(())
 64            })
 65            .detach_and_log_err(cx);
 66        })
 67    })
 68    .detach();
 69}
 70
/// Watches the main thread for hangs and reports a backtrace when one is suspected.
///
/// Does nothing unless the release channel is Dev, Nightly or Preview.
///
/// Three cooperating pieces are set up:
///
/// 1. A `SIGUSR2` handler that records the interrupted thread's stack frames into the
///    static `BACKTRACE` buffer and then notifies `backtrace_rx`.
/// 2. A heartbeat: a foreground task drains a bounded channel while a background task
///    pushes into it once per second. If the channel is reported full, `SIGUSR2` is sent
///    to the thread that called this function. The background task stops after the first
///    failed send.
/// 3. A background task that waits on `backtrace_rx`, symbolicates the captured frames,
///    logs them, and POSTs a `HangReport` to `/telemetry/hangs`.
///
/// `installation_id` is attached to each report; `http_client` is used to build the URL
/// and send the request.
#[cfg(target_os = "macos")]
pub fn monitor_main_thread_hangs(
    http_client: Arc<HttpClientWithUrl>,
    installation_id: Option<String>,
    cx: &App,
) {
    // This is too noisy to ship to stable for now.
    if !matches!(
        ReleaseChannel::global(cx),
        ReleaseChannel::Dev | ReleaseChannel::Nightly | ReleaseChannel::Preview
    ) {
        return;
    }

    use nix::sys::signal::{
        SaFlags, SigAction, SigHandler, SigSet,
        Signal::{self, SIGUSR2},
        sigaction,
    };

    use parking_lot::Mutex;

    use http_client::Method;
    use release_channel::ReleaseChannel;
    use std::{
        ffi::c_int,
        sync::{OnceLock, mpsc},
        time::Duration,
    };
    use telemetry_events::{BacktraceFrame, HangReport};

    use nix::sys::pthread;

    let foreground_executor = cx.foreground_executor();
    let background_executor = cx.background_executor();
    // Copied once here; the reporting task below checks this snapshot rather than
    // re-reading the global settings.
    let telemetry_settings = *client::TelemetrySettings::get_global(cx);

    // Initialize SIGUSR2 handler to send a backtrace to a channel.
    let (backtrace_tx, backtrace_rx) = mpsc::channel();
    static BACKTRACE: Mutex<Vec<backtrace::Frame>> = Mutex::new(Vec::new());
    static BACKTRACE_SENDER: OnceLock<mpsc::Sender<()>> = OnceLock::new();
    // NOTE(review): if this function were called a second time, `get_or_init` would keep
    // the first sender and drop `backtrace_tx`, leaving this call's `backtrace_rx`
    // disconnected — confirm the function is only ever called once.
    BACKTRACE_SENDER.get_or_init(|| backtrace_tx);
    // Reserve space up front; the signal handler only pushes while `len < capacity`.
    BACKTRACE.lock().reserve(100);

    // Installs (or re-installs) the SIGUSR2 handler. Because of SA_RESETHAND the handler
    // is removed after it fires, so this must be called again to re-arm it.
    fn handle_backtrace_signal() {
        unsafe {
            extern "C" fn handle_sigusr2(_i: c_int) {
                unsafe {
                    // ASYNC SIGNAL SAFETY: This lock is only accessed one other time,
                    // which can only be triggered by this signal handler. In addition,
                    // this signal handler is immediately removed by SA_RESETHAND, and this
                    // signal handler cannot be re-entrant due to the SIGUSR2 mask defined
                    // below.
                    let mut bt = BACKTRACE.lock();
                    bt.clear();
                    // Stop walking the stack once the pre-reserved capacity is used up.
                    backtrace::trace_unsynchronized(|frame| {
                        if bt.len() < bt.capacity() {
                            bt.push(frame.clone());
                            true
                        } else {
                            false
                        }
                    });
                }

                // Wake the reporting task; a send failure is ignored.
                BACKTRACE_SENDER.get().unwrap().send(()).ok();
            }

            // Block SIGUSR2 while the handler itself is running.
            let mut mask = SigSet::empty();
            mask.add(SIGUSR2);
            sigaction(
                Signal::SIGUSR2,
                &SigAction::new(
                    SigHandler::Handler(handle_sigusr2),
                    SaFlags::SA_RESTART | SaFlags::SA_RESETHAND,
                    mask,
                ),
            )
            .log_err();
        }
    }

    handle_backtrace_signal();
    // The thread calling this function is the one that will receive SIGUSR2.
    let main_thread = pthread::pthread_self();

    // Heartbeat channel: the foreground task consumes, the background task produces.
    let (mut tx, mut rx) = futures::channel::mpsc::channel(3);
    foreground_executor
        .spawn(async move { while (rx.next().await).is_some() {} })
        .detach();

    background_executor
        .spawn({
            let background_executor = background_executor.clone();
            async move {
                loop {
                    background_executor.timer(Duration::from_secs(1)).await;
                    match tx.try_send(()) {
                        Ok(_) => continue,
                        Err(e) => {
                            // A full channel means the foreground task has not been
                            // draining heartbeats; ask the main thread for a backtrace.
                            if e.into_send_error().is_full() {
                                pthread::pthread_kill(main_thread, SIGUSR2).log_err();
                            }
                            // Only detect the first hang
                            break;
                        }
                    }
                }
            }
        })
        .detach();

    let app_version = release_channel::AppVersion::global(cx);
    let os_name = client::telemetry::os_name();

    background_executor
        .clone()
        .spawn(async move {
            let os_version = client::telemetry::os_version();

            // NOTE(review): `recv()` blocks this task's thread, and if it ever returned
            // `Err` the outer `loop` would re-enter the `while` immediately and spin —
            // confirm the sender can never be disconnected.
            loop {
                while backtrace_rx.recv().is_ok() {
                    // Uses the settings snapshot taken when monitoring started.
                    if !telemetry_settings.diagnostics {
                        return;
                    }

                    // ASYNC SIGNAL SAFETY: This lock is only accessed _after_
                    // the backtrace transmitter has fired, which itself is only done
                    // by the signal handler. And due to SA_RESETHAND the signal handler
                    // will not run again until `handle_backtrace_signal` is called.
                    let raw_backtrace = BACKTRACE.lock().drain(..).collect::<Vec<_>>();
                    // Symbol resolution happens here, on the background task, rather
                    // than inside the signal handler.
                    let backtrace: Vec<_> = raw_backtrace
                        .into_iter()
                        .map(|frame| {
                            let mut btf = BacktraceFrame {
                                ip: frame.ip() as usize,
                                symbol_addr: frame.symbol_address() as usize,
                                base: frame.module_base_address().map(|addr| addr as usize),
                                symbols: vec![],
                            };

                            backtrace::resolve_frame(&frame, |symbol| {
                                if let Some(name) = symbol.name() {
                                    btf.symbols.push(name.to_string());
                                }
                            });

                            btf
                        })
                        .collect();

                    // IMPORTANT: Don't move this to before `BACKTRACE.lock()`
                    handle_backtrace_signal();

                    // Log only the first resolved symbol of each frame, one per line.
                    log::error!(
                        "Suspected hang on main thread:\n{}",
                        backtrace
                            .iter()
                            .flat_map(|bt| bt.symbols.first().as_ref().map(|s| s.as_str()))
                            .collect::<Vec<_>>()
                            .join("\n")
                    );

                    let report = HangReport {
                        backtrace,
                        app_version: Some(app_version),
                        os_name: os_name.clone(),
                        os_version: Some(os_version.clone()),
                        architecture: std::env::consts::ARCH.into(),
                        installation_id: installation_id.clone(),
                    };

                    // Any failure while preparing the request drops this report and
                    // goes back to waiting for the next signal.
                    let Some(json_bytes) = serde_json::to_vec(&report).log_err() else {
                        continue;
                    };

                    let Some(checksum) = client::telemetry::calculate_json_checksum(&json_bytes)
                    else {
                        continue;
                    };

                    let Ok(url) = http_client.build_zed_api_url("/telemetry/hangs", &[]) else {
                        continue;
                    };

                    let Ok(request) = http_client::Request::builder()
                        .method(Method::POST)
                        .uri(url.as_ref())
                        .header("x-zed-checksum", checksum)
                        .body(json_bytes.into())
                    else {
                        continue;
                    };

                    // Transport errors are logged by `log_err`; non-200 responses are
                    // logged explicitly.
                    if let Some(response) = http_client.send(request).await.log_err()
                        && response.status() != 200
                    {
                        log::error!("Failed to send hang report: HTTP {:?}", response.status());
                    }
                }
            }
        })
        .detach()
}
274
275pub async fn upload_previous_minidumps(
276    http: Arc<HttpClientWithUrl>,
277    installation_id: Option<String>,
278) -> anyhow::Result<()> {
279    let Some(minidump_endpoint) = MINIDUMP_ENDPOINT.as_ref() else {
280        log::warn!("Minidump endpoint not set");
281        return Ok(());
282    };
283
284    let mut children = smol::fs::read_dir(paths::logs_dir()).await?;
285    while let Some(child) = children.next().await {
286        let child = child?;
287        let child_path = child.path();
288        if child_path.extension() != Some(OsStr::new("dmp")) {
289            continue;
290        }
291        let mut json_path = child_path.clone();
292        json_path.set_extension("json");
293        if let Ok(metadata) = serde_json::from_slice(&smol::fs::read(&json_path).await?)
294            && upload_minidump(
295                http.clone(),
296                minidump_endpoint,
297                smol::fs::read(&child_path)
298                    .await
299                    .context("Failed to read minidump")?,
300                &metadata,
301                installation_id.clone(),
302            )
303            .await
304            .log_err()
305            .is_some()
306        {
307            fs::remove_file(child_path).ok();
308            fs::remove_file(json_path).ok();
309        }
310    }
311    Ok(())
312}
313
314async fn upload_minidump(
315    http: Arc<HttpClientWithUrl>,
316    endpoint: &str,
317    minidump: Vec<u8>,
318    metadata: &crashes::CrashInfo,
319    installation_id: Option<String>,
320) -> Result<()> {
321    let mut form = Form::new()
322        .part(
323            "upload_file_minidump",
324            Part::bytes(minidump)
325                .file_name("minidump.dmp")
326                .mime_str("application/octet-stream")?,
327        )
328        .text(
329            "sentry[tags][channel]",
330            metadata.init.release_channel.clone(),
331        )
332        .text("sentry[tags][version]", metadata.init.zed_version.clone())
333        .text("sentry[tags][binary]", metadata.init.binary.clone())
334        .text("sentry[release]", metadata.init.commit_sha.clone())
335        .text("platform", "rust");
336    let mut panic_message = "".to_owned();
337    if let Some(panic_info) = metadata.panic.as_ref() {
338        panic_message = panic_info.message.clone();
339        form = form
340            .text("sentry[logentry][formatted]", panic_info.message.clone())
341            .text("span", panic_info.span.clone());
342    }
343    if let Some(minidump_error) = metadata.minidump_error.clone() {
344        form = form.text("minidump_error", minidump_error);
345    }
346    if let Some(id) = installation_id.clone() {
347        form = form.text("sentry[user][id]", id)
348    }
349
350    ::telemetry::event!(
351        "Minidump Uploaded",
352        panic_message = panic_message,
353        crashed_version = metadata.init.zed_version.clone(),
354        commit_sha = metadata.init.commit_sha.clone(),
355    );
356
357    let gpu_count = metadata.gpus.len();
358    for (index, gpu) in metadata.gpus.iter().cloned().enumerate() {
359        let system_specs::GpuInfo {
360            device_name,
361            device_pci_id,
362            vendor_name,
363            vendor_pci_id,
364            driver_version,
365            driver_name,
366        } = gpu;
367        let num = if gpu_count == 1 && metadata.active_gpu.is_none() {
368            String::new()
369        } else {
370            index.to_string()
371        };
372        let name = format!("gpu{num}");
373        let root = format!("sentry[contexts][{name}]");
374        form = form
375            .text(
376                format!("{root}[Description]"),
377                "A GPU found on the users system. May or may not be the GPU Zed is running on",
378            )
379            .text(format!("{root}[type]"), "gpu")
380            .text(format!("{root}[name]"), device_name.unwrap_or(name))
381            .text(format!("{root}[id]"), format!("{:#06x}", device_pci_id))
382            .text(
383                format!("{root}[vendor_id]"),
384                format!("{:#06x}", vendor_pci_id),
385            )
386            .text_if_some(format!("{root}[vendor_name]"), vendor_name)
387            .text_if_some(format!("{root}[driver_version]"), driver_version)
388            .text_if_some(format!("{root}[driver_name]"), driver_name);
389    }
390    if let Some(active_gpu) = metadata.active_gpu.clone() {
391        form = form
392            .text(
393                "sentry[contexts][Active_GPU][Description]",
394                "The GPU Zed is running on",
395            )
396            .text("sentry[contexts][Active_GPU][type]", "gpu")
397            .text("sentry[contexts][Active_GPU][name]", active_gpu.device_name)
398            .text(
399                "sentry[contexts][Active_GPU][driver_version]",
400                active_gpu.driver_info,
401            )
402            .text(
403                "sentry[contexts][Active_GPU][driver_name]",
404                active_gpu.driver_name,
405            )
406            .text(
407                "sentry[contexts][Active_GPU][is_software_emulated]",
408                active_gpu.is_software_emulated.to_string(),
409            );
410    }
411
412    // TODO: feature-flag-context, and more of device-context like screen resolution, available ram, device model, etc
413
414    let mut response_text = String::new();
415    let mut response = http.send_multipart_form(endpoint, form).await?;
416    response
417        .body_mut()
418        .read_to_string(&mut response_text)
419        .await?;
420    if !response.status().is_success() {
421        anyhow::bail!("failed to upload minidump: {response_text}");
422    }
423    log::info!("Uploaded minidump. event id: {response_text}");
424    Ok(())
425}
426
/// Extension methods for building multipart forms with optional fields.
trait FormExt {
    /// Adds a text field named `label` when `value` is `Some`; returns `self` unchanged
    /// when it is `None`.
    fn text_if_some(
        self,
        label: impl Into<std::borrow::Cow<'static, str>>,
        value: Option<impl Into<std::borrow::Cow<'static, str>>>,
    ) -> Self;
}
434
435impl FormExt for Form {
436    fn text_if_some(
437        self,
438        label: impl Into<std::borrow::Cow<'static, str>>,
439        value: Option<impl Into<std::borrow::Cow<'static, str>>>,
440    ) -> Self {
441        match value {
442            Some(value) => self.text(label.into(), value.into()),
443            None => self,
444        }
445    }
446}