0-downtime collab deploys? (#8926)

Conrad Irwin created

Before this change, Kubernetes would send a SIGTERM to the old server
before the new one was ready. Now it waits until the new server is ready.

From my reading, it seems like startupProbe should not be necessary if we
have a readinessProbe; but from testing, it seems that without startupProbe
we still drop requests when using `rollout restart`.

Release Notes:

- Fixed connectivity issues during Zed deploys.

Change summary

crates/collab/k8s/collab.template.yml | 13 +++++++++++++
crates/collab/src/rpc.rs              |  4 +++-
2 files changed, 16 insertions(+), 1 deletion(-)

Detailed changes

crates/collab/k8s/collab.template.yml 🔗

@@ -33,6 +33,11 @@ metadata:
 
 spec:
   replicas: 1
+  strategy:
+    type: RollingUpdate
+    rollingUpdate:
+      maxSurge: 1
+      maxUnavailable: 0
   selector:
     matchLabels:
       app: ${ZED_SERVICE_NAME}
@@ -78,6 +83,13 @@ spec:
               port: 8080
             initialDelaySeconds: 1
             periodSeconds: 1
+          startupProbe:
+            httpGet:
+              path: /
+              port: 8080
+            initialDelaySeconds: 1
+            periodSeconds: 1
+            failureThreshold: 15
           env:
             - name: HTTP_PORT
               value: "8080"
@@ -173,6 +185,7 @@ spec:
               value: "true"
             - name: ZED_ENVIRONMENT
               value: ${ZED_ENVIRONMENT}
+          terminationGracePeriodSeconds: 10
           securityContext:
             capabilities:
               # FIXME - Switch to the more restrictive `PERFMON` capability.

crates/collab/src/rpc.rs 🔗

@@ -67,7 +67,9 @@ use tracing::{field, info_span, instrument, Instrument};
 use util::SemanticVersion;
 
 pub const RECONNECT_TIMEOUT: Duration = Duration::from_secs(30);
-pub const CLEANUP_TIMEOUT: Duration = Duration::from_secs(10);
+
+// kubernetes gives terminated pods 10s to shutdown gracefully. After they're gone, we can clean up old resources.
+pub const CLEANUP_TIMEOUT: Duration = Duration::from_secs(15);
 
 const MESSAGE_COUNT_PER_PAGE: usize = 100;
 const MAX_MESSAGE_LEN: usize = 1024;