fixes denoise setting, prepares for migration to 16kHz

David Kleingeld created 2 months ago

Change summary

crates/audio/src/audio.rs                                   |  40 +
crates/audio/src/audio_settings.rs                          | 133 +++++-
crates/livekit_client/src/livekit_client.rs                 |  20 
crates/livekit_client/src/livekit_client/playback.rs        |  89 +++-
crates/livekit_client/src/livekit_client/playback/source.rs |  40 +
5 files changed, 237 insertions(+), 85 deletions(-)

Detailed changes

crates/audio/src/audio.rs 🔗

@@ -31,18 +31,20 @@ pub use rodio_ext::RodioExt;
 
 use crate::audio_settings::LIVE_SETTINGS;
 
-// NOTE: We used to use WebRTC's mixer which only supported
-// 16kHz, 32kHz and 48kHz. As 48 is the most common "next step up"
-// for audio output devices like speakers/bluetooth, we just hard-code
-// this; and downsample when we need to.
+// We are migrating to 16kHz sample rate from 48kHz. In the future
+// once we are reasonably sure most users have upgraded we will
+// remove the LEGACY parameters.
 //
-// Since most noise cancelling requires 16kHz we will move to
-// that in the future.
-pub const SAMPLE_RATE: NonZero<u32> = nz!(48000);
-pub const CHANNEL_COUNT: NonZero<u16> = nz!(2);
+// We migrate to 16kHz because it is sufficient for speech and required
+// by the denoiser and future Speech to Text layers.
+pub const SAMPLE_RATE: NonZero<u32> = nz!(16000);
+pub const CHANNEL_COUNT: NonZero<u16> = nz!(1);
 pub const BUFFER_SIZE: usize = // echo canceller and livekit want 10ms of audio
     (SAMPLE_RATE.get() as usize / 100) * CHANNEL_COUNT.get() as usize;
 
+pub const LEGACY_SAMPLE_RATE: NonZero<u32> = nz!(48000);
+pub const LEGACY_CHANNEL_COUNT: NonZero<u16> = nz!(2);
+
 pub const REPLAY_DURATION: Duration = Duration::from_secs(30);
 
 pub fn init(cx: &mut App) {
@@ -160,8 +162,13 @@ impl Audio {
         let stream = rodio::microphone::MicrophoneBuilder::new()
             .default_device()?
             .default_config()?
-            .prefer_sample_rates([SAMPLE_RATE, SAMPLE_RATE.saturating_mul(nz!(2))])
-            .prefer_channel_counts([nz!(1), nz!(2)])
+            .prefer_sample_rates([
+                SAMPLE_RATE, // sample rates trivially resamplable to `SAMPLE_RATE`
+                SAMPLE_RATE.saturating_mul(nz!(2)),
+                SAMPLE_RATE.saturating_mul(nz!(3)),
+                SAMPLE_RATE.saturating_mul(nz!(4)),
+            ])
+            .prefer_channel_counts([CHANNEL_COUNT, CHANNEL_COUNT.saturating_mul(nz!(2))])
             .prefer_buffer_sizes(512..)
             .open_stream()?;
         info!("Opened microphone: {:?}", stream.config());
@@ -189,15 +196,24 @@ impl Audio {
                     }
                 }
             })
+            .denoise()
+            .context("Could not set up denoiser")?
+            .periodic_access(Duration::from_millis(100), move |denoise| {
+                denoise.set_enabled(LIVE_SETTINGS.denoise.load(Ordering::Relaxed));
+            })
             .automatic_gain_control(1.0, 4.0, 0.0, 5.0)
             .periodic_access(Duration::from_millis(100), move |agc_source| {
-                agc_source.set_enabled(LIVE_SETTINGS.control_input_volume.load(Ordering::Relaxed));
+                agc_source
+                    .set_enabled(LIVE_SETTINGS.auto_microphone_volume.load(Ordering::Relaxed));
             })
             .replayable(REPLAY_DURATION)?;
 
         voip_parts
             .replays
             .add_voip_stream("local microphone".to_string(), replay);
+
+        let stream = stream.constant_params(LEGACY_CHANNEL_COUNT, LEGACY_SAMPLE_RATE);
+
         Ok(stream)
     }
 
@@ -210,7 +226,7 @@ impl Audio {
         let (replay_source, source) = source
             .automatic_gain_control(1.0, 4.0, 0.0, 5.0)
             .periodic_access(Duration::from_millis(100), move |agc_source| {
-                agc_source.set_enabled(LIVE_SETTINGS.control_input_volume.load(Ordering::Relaxed));
+                agc_source.set_enabled(LIVE_SETTINGS.auto_speaker_volume.load(Ordering::Relaxed));
             })
             .replayable(REPLAY_DURATION)
             .expect("REPLAY_DURATION is longer than 100ms");

crates/audio/src/audio_settings.rs 🔗

@@ -9,44 +9,107 @@ use settings::{Settings, SettingsKey, SettingsSources, SettingsStore, SettingsUi
 #[derive(Clone, Default, Serialize, Deserialize, JsonSchema, Debug, SettingsUi)]
 pub struct AudioSettings {
     /// Opt into the new audio system.
+    ///
+    /// You need to rejoin a call for this setting to apply
     #[serde(rename = "experimental.rodio_audio", default)]
     pub rodio_audio: bool, // default is false
     /// Requires 'rodio_audio: true'
     ///
-    /// Use the new audio systems automatic gain control for your microphone.
-    /// This affects how loud you sound to others.
-    #[serde(rename = "experimental.control_input_volume", default)]
-    pub control_input_volume: bool,
+    /// Automatically increase or decrease you microphone's volume. This affects how
+    /// loud you sound to others.
+    ///
+    /// Recommended: off (default)
+    /// Microphones are too quite in zed, until everyone is on experimental
+    /// audio and has auto speaker volume on this will make you very loud
+    /// compared to other speakers.
+    #[serde(
+        rename = "experimental.auto_microphone_volume",
+        default = "default_false"
+    )]
+    pub auto_microphone_volume: bool,
+    /// Requires 'rodio_audio: true'
+    ///
+    /// Automatically increate or decrease the volume of other call members.
+    /// This only affects how things sound for you.
+    #[serde(rename = "experimental.auto_speaker_volume", default = "default_true")]
+    pub auto_speaker_volume: bool,
+    /// Requires 'rodio_audio: true'
+    ///
+    /// Remove background noises. Works great for typing, cars, dogs, AC. Does
+    /// not work well on music.
+    #[serde(rename = "experimental.denoise", default = "default_false")]
+    pub denoise: bool,
     /// Requires 'rodio_audio: true'
     ///
-    /// Use the new audio systems automatic gain control on everyone in the
-    /// call. This makes call members who are too quite louder and those who are
-    /// too loud quieter. This only affects how things sound for you.
-    #[serde(rename = "experimental.control_output_volume", default)]
-    pub control_output_volume: bool,
+    /// Use audio parameters compatible with the previous versions of
+    /// experimental audio and non-experimental audio. When this is false you
+    /// will sound strange to anyone not on the latest experimental audio. In
+    /// the future we will migrate by setting this to false
+    ///
+    /// You need to rejoin a call for this setting to apply
+    #[serde(
+        rename = "experimental.legacy_audio_compatible",
+        default = "default_true"
+    )]
+    pub legacy_audio_compatible: bool,
 }
-
 /// Configuration of audio in Zed.
 #[derive(Clone, Default, Serialize, Deserialize, JsonSchema, Debug, SettingsUi, SettingsKey)]
 #[serde(default)]
 #[settings_key(key = "audio")]
 pub struct AudioSettingsContent {
     /// Opt into the new audio system.
+    ///
+    /// You need to rejoin a call for this setting to apply
     #[serde(rename = "experimental.rodio_audio", default)]
     pub rodio_audio: bool, // default is false
     /// Requires 'rodio_audio: true'
     ///
-    /// Use the new audio systems automatic gain control for your microphone.
-    /// This affects how loud you sound to others.
-    #[serde(rename = "experimental.control_input_volume", default)]
-    pub control_input_volume: bool,
+    /// Automatically increase or decrease you microphone's volume. This affects how
+    /// loud you sound to others.
+    ///
+    /// Recommended: off (default)
+    /// Microphones are too quite in zed, until everyone is on experimental
+    /// audio and has auto speaker volume on this will make you very loud
+    /// compared to other speakers.
+    #[serde(
+        rename = "experimental.auto_microphone_volume",
+        default = "default_false"
+    )]
+    pub auto_microphone_volume: bool,
+    /// Requires 'rodio_audio: true'
+    ///
+    /// Automatically increate or decrease the volume of other call members.
+    /// This only affects how things sound for you.
+    #[serde(rename = "experimental.auto_speaker_volume", default = "default_true")]
+    pub auto_speaker_volume: bool,
     /// Requires 'rodio_audio: true'
     ///
-    /// Use the new audio systems automatic gain control on everyone in the
-    /// call. This makes call members who are too quite louder and those who are
-    /// too loud quieter. This only affects how things sound for you.
-    #[serde(rename = "experimental.control_output_volume", default)]
-    pub control_output_volume: bool,
+    /// Remove background noises. Works great for typing, cars, dogs, AC. Does
+    /// not work well on music.
+    #[serde(rename = "experimental.denoise", default = "default_false")]
+    pub denoise: bool,
+    /// Requires 'rodio_audio: true'
+    ///
+    /// Use audio parameters compatible with the previous versions of
+    /// experimental audio and non-experimental audio. When this is false you
+    /// will sound strange to anyone not on the latest experimental audio. In
+    /// the future we will migrate by setting this to false
+    ///
+    /// You need to rejoin a call for this setting to apply
+    #[serde(
+        rename = "experimental.legacy_audio_compatible",
+        default = "default_true"
+    )]
+    pub legacy_audio_compatible: bool,
+}
+
+fn default_true() -> bool {
+    true
+}
+
+fn default_false() -> bool {
+    false
 }
 
 impl Settings for AudioSettings {
@@ -61,31 +124,38 @@ impl Settings for AudioSettings {
 
 /// See docs on [LIVE_SETTINGS]
 pub(crate) struct LiveSettings {
-    pub(crate) control_input_volume: AtomicBool,
-    pub(crate) control_output_volume: AtomicBool,
+    pub(crate) auto_microphone_volume: AtomicBool,
+    pub(crate) auto_speaker_volume: AtomicBool,
+    pub(crate) denoise: AtomicBool,
 }
 
 impl LiveSettings {
     pub(crate) fn initialize(&self, cx: &mut App) {
         cx.observe_global::<SettingsStore>(move |cx| {
-            LIVE_SETTINGS.control_input_volume.store(
-                AudioSettings::get_global(cx).control_input_volume,
+            LIVE_SETTINGS.auto_microphone_volume.store(
+                AudioSettings::get_global(cx).auto_microphone_volume,
                 Ordering::Relaxed,
             );
-            LIVE_SETTINGS.control_output_volume.store(
-                AudioSettings::get_global(cx).control_output_volume,
+            LIVE_SETTINGS.auto_speaker_volume.store(
+                AudioSettings::get_global(cx).auto_speaker_volume,
                 Ordering::Relaxed,
             );
+            LIVE_SETTINGS
+                .denoise
+                .store(AudioSettings::get_global(cx).denoise, Ordering::Relaxed);
         })
         .detach();
 
         let init_settings = AudioSettings::get_global(cx);
         LIVE_SETTINGS
-            .control_input_volume
-            .store(init_settings.control_input_volume, Ordering::Relaxed);
+            .auto_microphone_volume
+            .store(init_settings.auto_microphone_volume, Ordering::Relaxed);
+        LIVE_SETTINGS
+            .auto_speaker_volume
+            .store(init_settings.auto_speaker_volume, Ordering::Relaxed);
         LIVE_SETTINGS
-            .control_output_volume
-            .store(init_settings.control_output_volume, Ordering::Relaxed);
+            .denoise
+            .store(init_settings.denoise, Ordering::Relaxed);
     }
 }
 
@@ -94,6 +164,7 @@ impl LiveSettings {
 /// real time and must each run in a dedicated OS thread, therefore we can not
 /// use the background executor.
 pub(crate) static LIVE_SETTINGS: LiveSettings = LiveSettings {
-    control_input_volume: AtomicBool::new(true),
-    control_output_volume: AtomicBool::new(true),
+    auto_microphone_volume: AtomicBool::new(true),
+    auto_speaker_volume: AtomicBool::new(true),
+    denoise: AtomicBool::new(true),
 };

crates/livekit_client/src/livekit_client.rs 🔗

@@ -1,6 +1,6 @@
 use std::sync::Arc;
 
-use anyhow::{Context as _, Result};
+use anyhow::{Context as _, Result, anyhow};
 use audio::AudioSettings;
 use collections::HashMap;
 use futures::{SinkExt, channel::mpsc};
@@ -12,7 +12,10 @@ use settings::Settings;
 
 mod playback;
 
-use crate::{LocalTrack, Participant, RemoteTrack, RoomEvent, TrackPublication};
+use crate::{
+    LocalTrack, Participant, RemoteTrack, RoomEvent, TrackPublication,
+    livekit_client::playback::Speaker,
+};
 pub use playback::AudioStream;
 pub(crate) use playback::{RemoteVideoFrame, play_remote_video_track};
 
@@ -132,11 +135,20 @@ impl Room {
         track: &RemoteAudioTrack,
         cx: &mut App,
     ) -> Result<playback::AudioStream> {
+        let speaker: Speaker =
+            serde_urlencoded::from_str(&track.0.name()).unwrap_or_else(|_| Speaker {
+                name: track.0.name(),
+                is_staff: false,
+                legacy_audio_compatible: true,
+            });
+
         if AudioSettings::get_global(cx).rodio_audio {
             info!("Using experimental.rodio_audio audio pipeline for output");
-            playback::play_remote_audio_track(&track.0, cx)
-        } else {
+            playback::play_remote_audio_track(&track.0, speaker, cx)
+        } else if speaker.legacy_audio_compatible {
             Ok(self.playback.play_remote_audio_track(&track.0))
+        } else {
+            Err(anyhow!("Client version too old to play audio in call"))
         }
     }
 }

crates/livekit_client/src/livekit_client/playback.rs 🔗

@@ -1,6 +1,6 @@
 use anyhow::{Context as _, Result};
 
-use audio::{AudioSettings, CHANNEL_COUNT, SAMPLE_RATE};
+use audio::{AudioSettings, CHANNEL_COUNT, LEGACY_CHANNEL_COUNT, LEGACY_SAMPLE_RATE, SAMPLE_RATE};
 use cpal::traits::{DeviceTrait, StreamTrait as _};
 use futures::channel::mpsc::UnboundedSender;
 use futures::{Stream, StreamExt as _};
@@ -43,12 +43,17 @@ pub(crate) struct AudioStack {
 
 pub(crate) fn play_remote_audio_track(
     track: &livekit::track::RemoteAudioTrack,
+    speaker: Speaker,
     cx: &mut gpui::App,
 ) -> Result<AudioStream> {
+    let stream = source::LiveKitStream::new(
+        cx.background_executor(),
+        track,
+        speaker.legacy_audio_compatible,
+    );
+
     let stop_handle = Arc::new(AtomicBool::new(false));
     let stop_handle_clone = stop_handle.clone();
-    let stream = source::LiveKitStream::new(cx.background_executor(), track);
-
     let stream = stream
         .stoppable()
         .periodic_access(Duration::from_millis(50), move |s| {
@@ -57,10 +62,6 @@ pub(crate) fn play_remote_audio_track(
             }
         });
 
-    let speaker: Speaker = serde_urlencoded::from_str(&track.name()).unwrap_or_else(|_| Speaker {
-        name: track.name(),
-        is_staff: false,
-    });
     audio::Audio::play_voip_stream(stream, speaker.name, speaker.is_staff, cx)
         .context("Could not play audio")?;
 
@@ -152,17 +153,32 @@ impl AudioStack {
         is_staff: bool,
         cx: &AsyncApp,
     ) -> Result<(crate::LocalAudioTrack, AudioStream)> {
-        let source = NativeAudioSource::new(
-            // n.b. this struct's options are always ignored, noise cancellation is provided by apm.
-            AudioSourceOptions::default(),
-            SAMPLE_RATE.get(),
-            CHANNEL_COUNT.get().into(),
-            10,
-        );
+        let legacy_audio_compatible =
+            AudioSettings::try_read_global(cx, |setting| setting.legacy_audio_compatible)
+                .unwrap_or_default();
+
+        let source = if legacy_audio_compatible {
+            NativeAudioSource::new(
+                // n.b. this struct's options are always ignored, noise cancellation is provided by apm.
+                AudioSourceOptions::default(),
+                LEGACY_SAMPLE_RATE.get(),
+                LEGACY_CHANNEL_COUNT.get().into(),
+                10,
+            )
+        } else {
+            NativeAudioSource::new(
+                // n.b. this struct's options are always ignored, noise cancellation is provided by apm.
+                AudioSourceOptions::default(),
+                SAMPLE_RATE.get(),
+                CHANNEL_COUNT.get().into(),
+                10,
+            )
+        };
 
         let track_name = serde_urlencoded::to_string(Speaker {
             name: user_name,
             is_staff,
+            legacy_audio_compatible,
         })
         .context("Could not encode user information in track name")?;
 
@@ -186,6 +202,7 @@ impl AudioStack {
         let capture_task = if rodio_pipeline {
             info!("Using experimental.rodio_audio audio pipeline");
             let voip_parts = audio::VoipParts::new(cx)?;
+<<<<<<< HEAD
             // Audio needs to run real-time and should never be paused. That is why we are using a
             // normal std::thread and not a background task
             thread::Builder::new()
@@ -197,11 +214,32 @@ impl AudioStack {
                     Ok::<(), anyhow::Error>(())
                 })
                 .unwrap();
+=======
+            // Audio needs to run real-time and should never be paused. That is
+            // why we are using a normal std::thread and not a background task
+            thread::spawn(move || {
+                // microphone is non send on mac
+                let microphone = match audio::Audio::open_microphone(voip_parts) {
+                    Ok(m) => m,
+                    Err(e) => {
+                        log::error!("Could not open microphone: {e}");
+                        return;
+                    }
+                };
+                send_to_livekit(frame_tx, microphone);
+                Ok::<(), anyhow::Error>(())
+            });
+>>>>>>> e459c8f30c (fixes denoise setting, prepares for migration to 16kHz)
             Task::ready(Ok(()))
         } else {
             self.executor.spawn(async move {
-                Self::capture_input(apm, frame_tx, SAMPLE_RATE.get(), CHANNEL_COUNT.get().into())
-                    .await
+                Self::capture_input(
+                    apm,
+                    frame_tx,
+                    LEGACY_SAMPLE_RATE.get(),
+                    LEGACY_CHANNEL_COUNT.get().into(),
+                )
+                .await
             })
         };
 
@@ -389,25 +427,30 @@ impl AudioStack {
 }
 
 #[derive(Serialize, Deserialize)]
-struct Speaker {
-    name: String,
-    is_staff: bool,
+pub struct Speaker {
+    pub name: String,
+    pub is_staff: bool,
+    pub legacy_audio_compatible: bool,
 }
 
 fn send_to_livekit(frame_tx: UnboundedSender<AudioFrame<'static>>, mut microphone: impl Source) {
     use cpal::Sample;
+    let sample_rate = microphone.sample_rate().get();
+    let num_channels = microphone.channels().get() as u32;
+    let buffer_size = sample_rate / 100 * num_channels;
+
     loop {
         let sampled: Vec<_> = microphone
             .by_ref()
-            .take(audio::BUFFER_SIZE)
+            .take(buffer_size as usize)
             .map(|s| s.to_sample())
             .collect();
 
         if frame_tx
             .unbounded_send(AudioFrame {
-                sample_rate: SAMPLE_RATE.get(),
-                num_channels: CHANNEL_COUNT.get() as u32,
-                samples_per_channel: sampled.len() as u32 / CHANNEL_COUNT.get() as u32,
+                sample_rate,
+                num_channels,
+                samples_per_channel: sampled.len() as u32 / num_channels,
                 data: Cow::Owned(sampled),
             })
             .is_err()

crates/livekit_client/src/livekit_client/playback/source.rs 🔗

@@ -3,17 +3,19 @@ use std::num::NonZero;
 use futures::StreamExt;
 use libwebrtc::{audio_stream::native::NativeAudioStream, prelude::AudioFrame};
 use livekit::track::RemoteAudioTrack;
-use rodio::{Source, buffer::SamplesBuffer, conversions::SampleTypeConverter, nz};
+use rodio::{
+    ChannelCount, SampleRate, Source, buffer::SamplesBuffer, conversions::SampleTypeConverter,
+};
 
-use audio::{CHANNEL_COUNT, SAMPLE_RATE};
+use audio::{CHANNEL_COUNT, LEGACY_CHANNEL_COUNT, LEGACY_SAMPLE_RATE, SAMPLE_RATE};
 
 fn frame_to_samplesbuffer(frame: AudioFrame) -> SamplesBuffer {
     let samples = frame.data.iter().copied();
     let samples = SampleTypeConverter::<_, _>::new(samples);
     let samples: Vec<f32> = samples.collect();
     SamplesBuffer::new(
-        nz!(2), // frame always has two channels
-        NonZero::new(frame.sample_rate).expect("audio frame sample rate is nonzero"),
+        NonZero::new(frame.num_channels as u16).expect("zero channels is nonsense"),
+        NonZero::new(frame.sample_rate).expect("samplerate zero is nonsense"),
         samples,
     )
 }
@@ -22,14 +24,26 @@ pub struct LiveKitStream {
     // shared_buffer: SharedBuffer,
     inner: rodio::queue::SourcesQueueOutput,
     _receiver_task: gpui::Task<()>,
+    channel_count: ChannelCount,
+    sample_rate: SampleRate,
 }
 
 impl LiveKitStream {
-    pub fn new(executor: &gpui::BackgroundExecutor, track: &RemoteAudioTrack) -> Self {
+    pub fn new(
+        executor: &gpui::BackgroundExecutor,
+        track: &RemoteAudioTrack,
+        legacy: bool,
+    ) -> Self {
+        let (channel_count, sample_rate) = if legacy {
+            (LEGACY_CHANNEL_COUNT, LEGACY_SAMPLE_RATE)
+        } else {
+            (CHANNEL_COUNT, SAMPLE_RATE)
+        };
+
         let mut stream = NativeAudioStream::new(
             track.rtc_track(),
-            SAMPLE_RATE.get() as i32,
-            CHANNEL_COUNT.get().into(),
+            sample_rate.get() as i32,
+            channel_count.get().into(),
         );
         let (queue_input, queue_output) = rodio::queue::queue(true);
         // spawn rtc stream
@@ -45,6 +59,8 @@ impl LiveKitStream {
         LiveKitStream {
             _receiver_task: receiver_task,
             inner: queue_output,
+            sample_rate,
+            channel_count,
         }
     }
 }
@@ -63,17 +79,11 @@ impl Source for LiveKitStream {
     }
 
     fn channels(&self) -> rodio::ChannelCount {
-        // This must be hardcoded because the playback source assumes constant
-        // sample rate and channel count. The queue upon which this is build
-        // will however report different counts and rates. Even though we put in
-        // only items with our (constant) CHANNEL_COUNT & SAMPLE_RATE this will
-        // play silence on one channel and at 44100 which is not what our
-        // constants are.
-        CHANNEL_COUNT
+        self.channel_count
     }
 
     fn sample_rate(&self) -> rodio::SampleRate {
-        SAMPLE_RATE // see comment on channels
+        self.sample_rate
     }
 
     fn total_duration(&self) -> Option<std::time::Duration> {