Try to recover even harder from linux GPU errors (#54349)

Conrad Irwin created

Release Notes:

- N/A

Change summary

crates/gpui_linux/src/linux/wayland/window.rs |  4 +
crates/gpui_linux/src/linux/x11/window.rs     |  4 +
crates/gpui_wgpu/src/wgpu_atlas.rs            |  9 ++++
crates/gpui_wgpu/src/wgpu_renderer.rs         | 45 ++++++++++++++------
4 files changed, 48 insertions(+), 14 deletions(-)

Detailed changes

crates/gpui_linux/src/linux/wayland/window.rs 🔗

@@ -1385,6 +1385,10 @@ impl PlatformWindow for WaylandWindow {
         }
 
         state.renderer.draw(scene);
+
+        if state.renderer.needs_redraw() {
+            state.force_render_after_recovery = true;
+        }
     }
 
     fn completed_frame(&self) {

crates/gpui_linux/src/linux/x11/window.rs 🔗

@@ -1680,6 +1680,10 @@ impl PlatformWindow for X11Window {
         }
 
         inner.renderer.draw(scene);
+
+        if inner.renderer.needs_redraw() {
+            inner.force_render_after_recovery = true;
+        }
     }
 
     fn sprite_atlas(&self) -> Arc<dyn PlatformAtlas> {

crates/gpui_wgpu/src/wgpu_atlas.rs 🔗

@@ -82,6 +82,15 @@ impl WgpuAtlas {
         }
     }
 
+    /// Clears all cached textures and tiles, forcing them to be recreated.
+    /// Use this for incremental recovery when the device is still valid.
+    pub fn clear(&self) {
+        let mut lock = self.0.lock();
+        lock.storage = WgpuAtlasStorage::default();
+        lock.tiles_by_key.clear();
+        lock.pending_uploads.clear();
+    }
+
     /// Handles device lost by clearing all textures and cached tiles.
     /// The atlas will lazily recreate textures as needed on subsequent frames.
     pub fn handle_device_lost(&self, context: &WgpuContext) {

crates/gpui_wgpu/src/wgpu_renderer.rs 🔗

@@ -121,6 +121,15 @@ struct WgpuResources {
     path_msaa_view: Option<wgpu::TextureView>,
 }
 
+impl WgpuResources {
+    fn invalidate_intermediate_textures(&mut self) {
+        self.path_intermediate_texture = None;
+        self.path_intermediate_view = None;
+        self.path_msaa_texture = None;
+        self.path_msaa_view = None;
+    }
+}
+
 pub struct WgpuRenderer {
     /// Shared GPU context for device recovery coordination (unused on WASM).
     #[allow(dead_code)]
@@ -146,6 +155,7 @@ pub struct WgpuRenderer {
     failed_frame_count: u32,
     device_lost: std::sync::Arc<std::sync::atomic::AtomicBool>,
     surface_configured: bool,
+    needs_redraw: bool,
 }
 
 impl WgpuRenderer {
@@ -474,6 +484,7 @@ impl WgpuRenderer {
             failed_frame_count: 0,
             device_lost: context.device_lost_flag(),
             surface_configured: true,
+            needs_redraw: false,
         })
     }
 
@@ -973,10 +984,7 @@ impl WgpuRenderer {
             // Invalidate intermediate textures - they will be lazily recreated
             // in draw() after we confirm the surface is healthy. This avoids
             // panics when the device/surface is in an invalid state during resize.
-            resources.path_intermediate_texture = None;
-            resources.path_intermediate_view = None;
-            resources.path_msaa_texture = None;
-            resources.path_msaa_view = None;
+            resources.invalidate_intermediate_textures();
         }
     }
 
@@ -1077,10 +1085,19 @@ impl WgpuRenderer {
         if let Some(error) = last_error {
             self.failed_frame_count += 1;
             log::error!(
-                "GPU error during frame (failure {} of 20): {error}",
+                "GPU error during frame (failure {} of 10): {error}",
                 self.failed_frame_count
             );
-            if self.failed_frame_count > 20 {
+
+            // TBD. Does retrying more actually help?
+            if self.failed_frame_count > 5 {
+                if let Some(res) = self.resources.as_mut() {
+                    res.invalidate_intermediate_textures();
+                }
+                self.atlas.clear();
+                self.needs_redraw = true;
+                return;
+            } else if self.failed_frame_count > 10 {
                 panic!("Too many consecutive GPU errors. Last error: {error}");
             }
         } else {
@@ -1668,10 +1685,7 @@ impl WgpuRenderer {
         self.surface_configured = false;
         // Drop intermediate textures since they reference the old surface size.
         if let Some(res) = self.resources.as_mut() {
-            res.path_intermediate_texture = None;
-            res.path_intermediate_view = None;
-            res.path_msaa_texture = None;
-            res.path_msaa_view = None;
+            res.invalidate_intermediate_textures();
         }
     }
 
@@ -1721,10 +1735,7 @@ impl WgpuRenderer {
             res.surface = surface;
 
             // Invalidate intermediate textures — they'll be recreated lazily.
-            res.path_intermediate_texture = None;
-            res.path_intermediate_view = None;
-            res.path_msaa_texture = None;
-            res.path_msaa_view = None;
+            res.invalidate_intermediate_textures();
         }
 
         self.surface_configured = true;
@@ -1743,6 +1754,12 @@ impl WgpuRenderer {
         self.device_lost.load(std::sync::atomic::Ordering::SeqCst)
     }
 
+    /// Returns true if a redraw is needed because GPU state was cleared.
+    /// Calling this method clears the flag.
+    pub fn needs_redraw(&mut self) -> bool {
+        std::mem::take(&mut self.needs_redraw)
+    }
+
     /// Recovers from a lost GPU device by recreating the renderer with a new context.
     ///
     /// Call this after detecting `device_lost()` returns true.