From 6d538468248827608472c6ce3498b083104c55f7 Mon Sep 17 00:00:00 2001
From: Conrad Irwin <conrad.irwin@gmail.com>
Date: Tue, 5 Mar 2024 21:58:00 -0700
Subject: [PATCH] 0-downtime collab deploys? (#8926)

Before this change, Kubernetes would send a SIGTERM to the old server
before the new one was ready. Now it waits until the new server is ready.

From my reading, it seems like a startupProbe should not be necessary if
we have a readinessProbe; but from testing, it seems that without a
startupProbe we still drop requests when using `rollout restart`.

Release Notes:

- Fixed connectivity issues during Zed deploys.
---
 crates/collab/k8s/collab.template.yml | 13 +++++++++++++
 crates/collab/src/rpc.rs              |  4 +++-
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/crates/collab/k8s/collab.template.yml b/crates/collab/k8s/collab.template.yml
index f0484bf57e6814fc72a1358a466043cf27bee828..41584e94a7c08ad47348f39386dbea368212d68b 100644
--- a/crates/collab/k8s/collab.template.yml
+++ b/crates/collab/k8s/collab.template.yml
@@ -33,6 +33,11 @@ metadata:
 
 spec:
   replicas: 1
+  strategy:
+    type: RollingUpdate
+    rollingUpdate:
+      maxSurge: 1
+      maxUnavailable: 0
   selector:
     matchLabels:
       app: ${ZED_SERVICE_NAME}
@@ -78,6 +83,13 @@ spec:
               port: 8080
             initialDelaySeconds: 1
             periodSeconds: 1
+          startupProbe:
+            httpGet:
+              path: /
+              port: 8080
+            initialDelaySeconds: 1
+            periodSeconds: 1
+            failureThreshold: 15
           env:
             - name: HTTP_PORT
               value: "8080"
@@ -173,6 +185,7 @@ spec:
               value: "true"
             - name: ZED_ENVIRONMENT
               value: ${ZED_ENVIRONMENT}
+          terminationGracePeriodSeconds: 10
           securityContext:
             capabilities:
               # FIXME - Switch to the more restrictive `PERFMON` capability.
diff --git a/crates/collab/src/rpc.rs b/crates/collab/src/rpc.rs
index 2cb69c264b601d4e0d47b25989b625964ee3345b..c3df8d12fd4c0896cf337c5af36492f677ce358b 100644
--- a/crates/collab/src/rpc.rs
+++ b/crates/collab/src/rpc.rs
@@ -67,7 +67,9 @@ use tracing::{field, info_span, instrument, Instrument};
 use util::SemanticVersion;
 
 pub const RECONNECT_TIMEOUT: Duration = Duration::from_secs(30);
-pub const CLEANUP_TIMEOUT: Duration = Duration::from_secs(10);
+
+// Kubernetes gives terminated pods 10s to shut down gracefully (see `terminationGracePeriodSeconds` in k8s/collab.template.yml). After they're gone, we can clean up old resources.
+pub const CLEANUP_TIMEOUT: Duration = Duration::from_secs(15);
 
 const MESSAGE_COUNT_PER_PAGE: usize = 100;
 const MAX_MESSAGE_LEN: usize = 1024;