Define readiness probe to know when the new server can accept traffic

Antonio Scandurra created

Change summary

crates/collab/k8s/manifest.template.yml | 5 +++++
crates/collab/src/integration_tests.rs  | 8 ++++----
crates/collab/src/main.rs               | 2 +-
crates/collab/src/rpc.rs                | 7 ++++---
4 files changed, 14 insertions(+), 8 deletions(-)

Detailed changes

crates/collab/k8s/manifest.template.yml 🔗

@@ -59,6 +59,13 @@ spec:
           ports:
             - containerPort: 8080
               protocol: TCP
+          # Mark the pod ready only once an HTTP GET to "/" on the container port succeeds.
+          readinessProbe:
+            httpGet:
+              path: /
+              port: 8080
+            initialDelaySeconds: 5
+            periodSeconds: 5
           env:
             - name: HTTP_PORT
               value: "8080"

crates/collab/src/integration_tests.rs 🔗

@@ -685,7 +685,7 @@ async fn test_server_restarts(
     );
 
     // The server finishes restarting, cleaning up stale connections.
-    server.start();
+    server.start().await.unwrap();
     deterministic.advance_clock(CLEANUP_TIMEOUT);
     assert_eq!(
         room_participants(&room_a, cx_a),
@@ -805,7 +805,7 @@ async fn test_server_restarts(
 
     // The server finishes restarting, cleaning up stale connections and canceling the
     // call to user D because the room has become empty.
-    server.start();
+    server.start().await.unwrap();
     deterministic.advance_clock(CLEANUP_TIMEOUT);
     assert!(incoming_call_d.next().await.unwrap().is_none());
 }
@@ -6124,7 +6124,7 @@ async fn test_random_collaboration(
                 log::info!("Simulating server restart");
                 server.teardown();
                 deterministic.advance_clock(RECEIVE_TIMEOUT + RECONNECT_TIMEOUT);
-                server.start();
+                server.start().await.unwrap();
                 deterministic.advance_clock(CLEANUP_TIMEOUT);
             }
             _ if !op_start_signals.is_empty() => {
@@ -6324,7 +6324,7 @@ impl TestServer {
             app_state.clone(),
             Executor::Deterministic(deterministic.build_background()),
         );
-        server.start();
+        server.start().await.unwrap();
         // Advance clock to ensure the server's cleanup task is finished.
         deterministic.advance_clock(CLEANUP_TIMEOUT);
         Self {

crates/collab/src/main.rs 🔗

@@ -58,7 +58,7 @@ async fn main() -> Result<()> {
                 .expect("failed to bind TCP listener");
 
             let rpc_server = collab::rpc::Server::new(state.clone(), Executor::Production);
-            rpc_server.start();
+            rpc_server.start().await?;
 
             let app = collab::api::routes(rpc_server.clone(), state.clone())
                 .merge(collab::rpc::routes(rpc_server.clone()))

crates/collab/src/rpc.rs 🔗

@@ -238,14 +238,14 @@ impl Server {
         Arc::new(server)
     }
 
-    pub fn start(&self) {
+    pub async fn start(&self) -> Result<()> {
+        self.app_state.db.delete_stale_projects().await?;
         let db = self.app_state.db.clone();
         let peer = self.peer.clone();
+        let timeout = self.executor.sleep(CLEANUP_TIMEOUT);
         let pool = self.connection_pool.clone();
         let live_kit_client = self.app_state.live_kit_client.clone();
-        let timeout = self.executor.sleep(CLEANUP_TIMEOUT);
         self.executor.spawn_detached(async move {
-            db.delete_stale_projects().await.trace_err();
             timeout.await;
             if let Some(room_ids) = db.stale_room_ids().await.trace_err() {
                 for room_id in room_ids {
@@ -321,6 +321,7 @@ impl Server {
                 }
             }
         });
+        Ok(())
     }
 
     pub fn teardown(&self) {