From 6d538468248827608472c6ce3498b083104c55f7 Mon Sep 17 00:00:00 2001 From: Conrad Irwin Date: Tue, 5 Mar 2024 21:58:00 -0700 Subject: [PATCH] 0-downtime collab deploys? (#8926) Before this change Kubernetes would send a SIGTERM to the old server before the new one was ready. Now it will wait. From my reading it seems like startupProbe should not be necessary if we have a readinessProbe; but from testing it seems like without startupProbe we still drop requests when using `rollout restart` Release Notes: - Fixed connectivity issues during Zed deploys. --- crates/collab/k8s/collab.template.yml | 13 +++++++++++++ crates/collab/src/rpc.rs | 4 +++- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/crates/collab/k8s/collab.template.yml b/crates/collab/k8s/collab.template.yml index f0484bf57e6814fc72a1358a466043cf27bee828..41584e94a7c08ad47348f39386dbea368212d68b 100644 --- a/crates/collab/k8s/collab.template.yml +++ b/crates/collab/k8s/collab.template.yml @@ -33,6 +33,11 @@ metadata: spec: replicas: 1 + strategy: + type: RollingUpdate + rollingUpdate: + maxSurge: 1 + maxUnavailable: 0 selector: matchLabels: app: ${ZED_SERVICE_NAME} @@ -78,6 +83,13 @@ spec: port: 8080 initialDelaySeconds: 1 periodSeconds: 1 + startupProbe: + httpGet: + path: / + port: 8080 + initialDelaySeconds: 1 + periodSeconds: 1 + failureThreshold: 15 env: - name: HTTP_PORT value: "8080" @@ -173,6 +185,7 @@ spec: value: "true" - name: ZED_ENVIRONMENT value: ${ZED_ENVIRONMENT} + terminationGracePeriodSeconds: 10 securityContext: capabilities: # FIXME - Switch to the more restrictive `PERFMON` capability. 
diff --git a/crates/collab/src/rpc.rs b/crates/collab/src/rpc.rs index 2cb69c264b601d4e0d47b25989b625964ee3345b..c3df8d12fd4c0896cf337c5af36492f677ce358b 100644 --- a/crates/collab/src/rpc.rs +++ b/crates/collab/src/rpc.rs @@ -67,7 +67,9 @@ use tracing::{field, info_span, instrument, Instrument}; use util::SemanticVersion; pub const RECONNECT_TIMEOUT: Duration = Duration::from_secs(30); -pub const CLEANUP_TIMEOUT: Duration = Duration::from_secs(10); + +// Kubernetes gives terminated pods 10s to shut down gracefully. After they're gone, we can clean up old resources. +pub const CLEANUP_TIMEOUT: Duration = Duration::from_secs(15); const MESSAGE_COUNT_PER_PAGE: usize = 100; const MAX_MESSAGE_LEN: usize = 1024;