Add error handling to keep Zed running in the face of transient wgpu errors (#50539)

Conrad Irwin created

Release Notes:

- linux: Reduce crashes in the face of transient GPU errors

Change summary

crates/gpui/src/platform/wgpu/wgpu_renderer.rs | 27 +++++++++++++++++++
1 file changed, 26 insertions(+), 1 deletion(-)

Detailed changes

crates/gpui/src/platform/wgpu/wgpu_renderer.rs 🔗

@@ -8,7 +8,7 @@ use bytemuck::{Pod, Zeroable};
 use log::warn;
 use raw_window_handle::{HasDisplayHandle, HasWindowHandle};
 use std::num::NonZeroU64;
-use std::sync::Arc;
+use std::sync::{Arc, Mutex};
 
 #[repr(C)]
 #[derive(Clone, Copy, Pod, Zeroable)]
@@ -118,6 +118,8 @@ pub struct WgpuRenderer {
     transparent_alpha_mode: wgpu::CompositeAlphaMode,
     opaque_alpha_mode: wgpu::CompositeAlphaMode,
     max_texture_size: u32,
+    last_error: Arc<Mutex<Option<String>>>,
+    failed_frame_count: u32,
 }
 
 impl WgpuRenderer {
@@ -348,6 +350,13 @@ impl WgpuRenderer {
 
         let adapter_info = context.adapter.get_info();
 
+        let last_error: Arc<Mutex<Option<String>>> = Arc::new(Mutex::new(None));
+        let last_error_clone = Arc::clone(&last_error);
+        device.on_uncaptured_error(Arc::new(move |error| {
+            let mut guard = last_error_clone.lock().unwrap();
+            *guard = Some(error.to_string());
+        }));
+
         Ok(Self {
             device,
             queue,
@@ -378,6 +387,8 @@ impl WgpuRenderer {
             transparent_alpha_mode,
             opaque_alpha_mode,
             max_texture_size,
+            last_error,
+            failed_frame_count: 0,
         })
     }
 
@@ -914,6 +925,20 @@ impl WgpuRenderer {
     }
 
     pub fn draw(&mut self, scene: &Scene) {
+        let last_error = self.last_error.lock().unwrap().take();
+        if let Some(error) = last_error {
+            self.failed_frame_count += 1;
+            log::error!(
+                "GPU error during frame (failure {} of 20): {error}",
+                self.failed_frame_count
+            );
+            if self.failed_frame_count > 20 {
+                panic!("Too many consecutive GPU errors. Last error: {error}");
+            }
+        } else {
+            self.failed_frame_count = 0;
+        }
+
         self.atlas.before_frame();
 
         let frame = match self.surface.get_current_texture() {