gpui: Take advantage of unified memory on Apple silicon (#44273)

Marco Mihai Condrache created

Metal chooses a buffer’s default storage mode based on the type of GPU
in use.
On Apple GPUs, the default mode is shared, which allows the CPU and GPU
to access the same memory without requiring explicit synchronization.
On discrete or external GPUs, Metal instead defaults to managed storage,
which does require explicit CPU–GPU memory synchronization.

This change aligns our buffer usage with Metal’s default behavior and
avoids unnecessary synchronization on Apple-silicon Macs. As a result,
memory usage on Apple hardware is reduced and performance improves due
to fewer sync operations.

Ref:
https://developer.apple.com/documentation/metal/setting-resource-storage-modes
Ref:
https://developer.apple.com/documentation/metal/synchronizing-a-managed-resource-in-macos

With the storage mode:

<img width="356" height="74" alt="image"
src="https://github.com/user-attachments/assets/e5a5bf9a-f339-417b-b5ab-818d8f692bd1"
/>

On main branch:

<img width="356" height="74" alt="image"
src="https://github.com/user-attachments/assets/6ccd77fe-7929-4423-9696-671d185ceffb"
/>

That's a 44% reduction of memory usage.

Release Notes:

- Reduced memory usage on Apple-silicon Macs by using shared memory
where appropriate

---------

Signed-off-by: Marco Mihai Condrache <52580954+marcocondrache@users.noreply.github.com>

Change summary

crates/gpui/src/platform/mac/metal_atlas.rs    |  9 ++
crates/gpui/src/platform/mac/metal_renderer.rs | 61 ++++++++++++++++---
2 files changed, 58 insertions(+), 12 deletions(-)

Detailed changes

crates/gpui/src/platform/mac/metal_atlas.rs 🔗

@@ -15,6 +15,9 @@ pub(crate) struct MetalAtlas(Mutex<MetalAtlasState>);
 impl MetalAtlas {
     pub(crate) fn new(device: Device) -> Self {
         MetalAtlas(Mutex::new(MetalAtlasState {
+            // Shared memory can be used only if CPU and GPU share the same memory space.
+            // https://developer.apple.com/documentation/metal/setting-resource-storage-modes
+            unified_memory: device.has_unified_memory(),
             device: AssertSend(device),
             monochrome_textures: Default::default(),
             polychrome_textures: Default::default(),
@@ -29,6 +32,7 @@ impl MetalAtlas {
 
 struct MetalAtlasState {
     device: AssertSend<Device>,
+    unified_memory: bool,
     monochrome_textures: AtlasTextureList<MetalAtlasTexture>,
     polychrome_textures: AtlasTextureList<MetalAtlasTexture>,
     tiles_by_key: FxHashMap<AtlasKey, AtlasTile>,
@@ -146,6 +150,11 @@ impl MetalAtlasState {
         }
         texture_descriptor.set_pixel_format(pixel_format);
         texture_descriptor.set_usage(usage);
+        texture_descriptor.set_storage_mode(if self.unified_memory {
+            metal::MTLStorageMode::Shared
+        } else {
+            metal::MTLStorageMode::Managed
+        });
         let metal_texture = self.device.new_texture(&texture_descriptor);
 
         let texture_list = match kind {

crates/gpui/src/platform/mac/metal_renderer.rs 🔗

@@ -76,12 +76,22 @@ impl InstanceBufferPool {
         self.buffers.clear();
     }
 
-    pub(crate) fn acquire(&mut self, device: &metal::Device) -> InstanceBuffer {
+    pub(crate) fn acquire(
+        &mut self,
+        device: &metal::Device,
+        unified_memory: bool,
+    ) -> InstanceBuffer {
         let buffer = self.buffers.pop().unwrap_or_else(|| {
-            device.new_buffer(
-                self.buffer_size as u64,
-                MTLResourceOptions::StorageModeManaged,
-            )
+            let options = if unified_memory {
+                MTLResourceOptions::StorageModeShared
+                    // Buffers are write only which can benefit from the combined cache
+                    // https://developer.apple.com/documentation/metal/mtlresourceoptions/cpucachemodewritecombined
+                    | MTLResourceOptions::CPUCacheModeWriteCombined
+            } else {
+                MTLResourceOptions::StorageModeManaged
+            };
+
+            device.new_buffer(self.buffer_size as u64, options)
         });
         InstanceBuffer {
             metal_buffer: buffer,
@@ -99,6 +109,7 @@ impl InstanceBufferPool {
 pub(crate) struct MetalRenderer {
     device: metal::Device,
     layer: metal::MetalLayer,
+    unified_memory: bool,
     presents_with_transaction: bool,
     command_queue: CommandQueue,
     paths_rasterization_pipeline_state: metal::RenderPipelineState,
@@ -179,6 +190,10 @@ impl MetalRenderer {
             output
         }
 
+        // Shared memory can be used only if CPU and GPU share the same memory space.
+        // https://developer.apple.com/documentation/metal/setting-resource-storage-modes
+        let unified_memory = device.has_unified_memory();
+
         let unit_vertices = [
             to_float2_bits(point(0., 0.)),
             to_float2_bits(point(1., 0.)),
@@ -190,7 +205,12 @@ impl MetalRenderer {
         let unit_vertices = device.new_buffer_with_data(
             unit_vertices.as_ptr() as *const c_void,
             mem::size_of_val(&unit_vertices) as u64,
-            MTLResourceOptions::StorageModeManaged,
+            if unified_memory {
+                MTLResourceOptions::StorageModeShared
+                    | MTLResourceOptions::CPUCacheModeWriteCombined
+            } else {
+                MTLResourceOptions::StorageModeManaged
+            },
         );
 
         let paths_rasterization_pipeline_state = build_path_rasterization_pipeline_state(
@@ -268,6 +288,7 @@ impl MetalRenderer {
             device,
             layer,
             presents_with_transaction: false,
+            unified_memory,
             command_queue,
             paths_rasterization_pipeline_state,
             path_sprites_pipeline_state,
@@ -337,14 +358,23 @@ impl MetalRenderer {
         texture_descriptor.set_width(size.width.0 as u64);
         texture_descriptor.set_height(size.height.0 as u64);
         texture_descriptor.set_pixel_format(metal::MTLPixelFormat::BGRA8Unorm);
+        texture_descriptor.set_storage_mode(metal::MTLStorageMode::Private);
         texture_descriptor
             .set_usage(metal::MTLTextureUsage::RenderTarget | metal::MTLTextureUsage::ShaderRead);
         self.path_intermediate_texture = Some(self.device.new_texture(&texture_descriptor));
 
         if self.path_sample_count > 1 {
+            // https://developer.apple.com/documentation/metal/choosing-a-resource-storage-mode-for-apple-gpus
+            // Rendering MSAA textures are done in a single pass, so we can use memory-less storage on Apple Silicon
+            let storage_mode = if self.unified_memory {
+                metal::MTLStorageMode::Memoryless
+            } else {
+                metal::MTLStorageMode::Private
+            };
+
             let mut msaa_descriptor = texture_descriptor;
             msaa_descriptor.set_texture_type(metal::MTLTextureType::D2Multisample);
-            msaa_descriptor.set_storage_mode(metal::MTLStorageMode::Private);
+            msaa_descriptor.set_storage_mode(storage_mode);
             msaa_descriptor.set_sample_count(self.path_sample_count as _);
             self.path_intermediate_msaa_texture = Some(self.device.new_texture(&msaa_descriptor));
         } else {
@@ -378,7 +408,10 @@ impl MetalRenderer {
         };
 
         loop {
-            let mut instance_buffer = self.instance_buffer_pool.lock().acquire(&self.device);
+            let mut instance_buffer = self
+                .instance_buffer_pool
+                .lock()
+                .acquire(&self.device, self.unified_memory);
 
             let command_buffer =
                 self.draw_primitives(scene, &mut instance_buffer, drawable, viewport_size);
@@ -550,10 +583,14 @@ impl MetalRenderer {
 
         command_encoder.end_encoding();
 
-        instance_buffer.metal_buffer.did_modify_range(NSRange {
-            location: 0,
-            length: instance_offset as NSUInteger,
-        });
+        if !self.unified_memory {
+            // Sync the instance buffer to the GPU
+            instance_buffer.metal_buffer.did_modify_range(NSRange {
+                location: 0,
+                length: instance_offset as NSUInteger,
+            });
+        }
+
         Ok(command_buffer.to_owned())
     }