events.rs

  1use super::ips_file::IpsFile;
  2use crate::api::CloudflareIpCountryHeader;
  3use crate::{AppState, Error, Result, api::slack};
  4use anyhow::anyhow;
  5use aws_sdk_s3::primitives::ByteStream;
  6use axum::{
  7    Extension, Router, TypedHeader,
  8    body::Bytes,
  9    headers::Header,
 10    http::{HeaderMap, HeaderName, StatusCode},
 11    routing::post,
 12};
 13use chrono::Duration;
 14use semantic_version::SemanticVersion;
 15use serde::{Deserialize, Serialize};
 16use serde_json::json;
 17use sha2::{Digest, Sha256};
 18use std::sync::{Arc, OnceLock};
 19use telemetry_events::{Event, EventRequestBody, Panic};
 20use util::ResultExt;
 21use uuid::Uuid;
 22
 23const CRASH_REPORTS_BUCKET: &str = "zed-crash-reports";
 24
 25pub fn router() -> Router {
 26    Router::new()
 27        .route("/telemetry/events", post(post_events))
 28        .route("/telemetry/crashes", post(post_crash))
 29        .route("/telemetry/panics", post(post_panic))
 30        .route("/telemetry/hangs", post(post_hang))
 31}
 32
 33pub struct ZedChecksumHeader(Vec<u8>);
 34
 35impl Header for ZedChecksumHeader {
 36    fn name() -> &'static HeaderName {
 37        static ZED_CHECKSUM_HEADER: OnceLock<HeaderName> = OnceLock::new();
 38        ZED_CHECKSUM_HEADER.get_or_init(|| HeaderName::from_static("x-zed-checksum"))
 39    }
 40
 41    fn decode<'i, I>(values: &mut I) -> Result<Self, axum::headers::Error>
 42    where
 43        Self: Sized,
 44        I: Iterator<Item = &'i axum::http::HeaderValue>,
 45    {
 46        let checksum = values
 47            .next()
 48            .ok_or_else(axum::headers::Error::invalid)?
 49            .to_str()
 50            .map_err(|_| axum::headers::Error::invalid())?;
 51
 52        let bytes = hex::decode(checksum).map_err(|_| axum::headers::Error::invalid())?;
 53        Ok(Self(bytes))
 54    }
 55
 56    fn encode<E: Extend<axum::http::HeaderValue>>(&self, _values: &mut E) {
 57        unimplemented!()
 58    }
 59}
 60
 61pub async fn post_crash(
 62    Extension(app): Extension<Arc<AppState>>,
 63    headers: HeaderMap,
 64    body: Bytes,
 65) -> Result<()> {
 66    let report = IpsFile::parse(&body)?;
 67    let version_threshold = SemanticVersion::new(0, 123, 0);
 68
 69    let bundle_id = &report.header.bundle_id;
 70    let app_version = &report.app_version();
 71
 72    if bundle_id == "dev.zed.Zed-Dev" {
 73        log::error!("Crash uploads from {} are ignored.", bundle_id);
 74        return Ok(());
 75    }
 76
 77    if app_version.is_none() || app_version.unwrap() < version_threshold {
 78        log::error!(
 79            "Crash uploads from {} are ignored.",
 80            report.header.app_version
 81        );
 82        return Ok(());
 83    }
 84    let app_version = app_version.unwrap();
 85
 86    if let Some(blob_store_client) = app.blob_store_client.as_ref() {
 87        let response = blob_store_client
 88            .head_object()
 89            .bucket(CRASH_REPORTS_BUCKET)
 90            .key(report.header.incident_id.clone() + ".ips")
 91            .send()
 92            .await;
 93
 94        if response.is_ok() {
 95            log::info!("We've already uploaded this crash");
 96            return Ok(());
 97        }
 98
 99        blob_store_client
100            .put_object()
101            .bucket(CRASH_REPORTS_BUCKET)
102            .key(report.header.incident_id.clone() + ".ips")
103            .acl(aws_sdk_s3::types::ObjectCannedAcl::PublicRead)
104            .body(ByteStream::from(body.to_vec()))
105            .send()
106            .await
107            .map_err(|e| log::error!("Failed to upload crash: {}", e))
108            .ok();
109    }
110
111    let recent_panic_on: Option<i64> = headers
112        .get("x-zed-panicked-on")
113        .and_then(|h| h.to_str().ok())
114        .and_then(|s| s.parse().ok());
115
116    let installation_id = headers
117        .get("x-zed-installation-id")
118        .and_then(|h| h.to_str().ok())
119        .map(|s| s.to_string())
120        .unwrap_or_default();
121
122    let mut recent_panic = None;
123
124    if let Some(recent_panic_on) = recent_panic_on {
125        let crashed_at = match report.timestamp() {
126            Ok(t) => Some(t),
127            Err(e) => {
128                log::error!("Can't parse {}: {}", report.header.timestamp, e);
129                None
130            }
131        };
132        if crashed_at.is_some_and(|t| (t.timestamp_millis() - recent_panic_on).abs() <= 30000) {
133            recent_panic = headers.get("x-zed-panic").and_then(|h| h.to_str().ok());
134        }
135    }
136
137    let description = report.description(recent_panic);
138    let summary = report.backtrace_summary();
139
140    tracing::error!(
141        service = "client",
142        version = %report.header.app_version,
143        os_version = %report.header.os_version,
144        bundle_id = %report.header.bundle_id,
145        incident_id = %report.header.incident_id,
146        installation_id = %installation_id,
147        description = %description,
148        backtrace = %summary,
149        "crash report"
150    );
151
152    if let Some(kinesis_client) = app.kinesis_client.clone()
153        && let Some(stream) = app.config.kinesis_stream.clone()
154    {
155        let properties = json!({
156            "app_version": report.header.app_version,
157            "os_version": report.header.os_version,
158            "os_name": "macOS",
159            "bundle_id": report.header.bundle_id,
160            "incident_id": report.header.incident_id,
161            "installation_id": installation_id,
162            "description": description,
163            "backtrace": summary,
164        });
165        let row = SnowflakeRow::new(
166            "Crash Reported",
167            None,
168            false,
169            Some(installation_id),
170            properties,
171        );
172        let data = serde_json::to_vec(&row)?;
173        kinesis_client
174            .put_record()
175            .stream_name(stream)
176            .partition_key(row.insert_id.unwrap_or_default())
177            .data(data.into())
178            .send()
179            .await
180            .log_err();
181    }
182
183    if let Some(slack_panics_webhook) = app.config.slack_panics_webhook.clone() {
184        let payload = slack::WebhookBody::new(|w| {
185            w.add_section(|s| s.text(slack::Text::markdown(description)))
186                .add_section(|s| {
187                    s.add_field(slack::Text::markdown(format!(
188                        "*Version:*\n{} ({})",
189                        bundle_id, app_version
190                    )))
191                    .add_field({
192                        let hostname = app.config.blob_store_url.clone().unwrap_or_default();
193                        let hostname = hostname.strip_prefix("https://").unwrap_or_else(|| {
194                            hostname.strip_prefix("http://").unwrap_or_default()
195                        });
196
197                        slack::Text::markdown(format!(
198                            "*Incident:*\n<https://{}.{}/{}.ips|{}…>",
199                            CRASH_REPORTS_BUCKET,
200                            hostname,
201                            report.header.incident_id,
202                            report
203                                .header
204                                .incident_id
205                                .chars()
206                                .take(8)
207                                .collect::<String>(),
208                        ))
209                    })
210                })
211                .add_rich_text(|r| r.add_preformatted(|p| p.add_text(summary)))
212        });
213        let payload_json = serde_json::to_string(&payload).map_err(|err| {
214            log::error!("Failed to serialize payload to JSON: {err}");
215            Error::Internal(anyhow!(err))
216        })?;
217
218        reqwest::Client::new()
219            .post(slack_panics_webhook)
220            .header("Content-Type", "application/json")
221            .body(payload_json)
222            .send()
223            .await
224            .map_err(|err| {
225                log::error!("Failed to send payload to Slack: {err}");
226                Error::Internal(anyhow!(err))
227            })?;
228    }
229
230    Ok(())
231}
232
233pub async fn post_hang(
234    Extension(app): Extension<Arc<AppState>>,
235    TypedHeader(ZedChecksumHeader(checksum)): TypedHeader<ZedChecksumHeader>,
236    body: Bytes,
237) -> Result<()> {
238    let Some(expected) = calculate_json_checksum(app.clone(), &body) else {
239        return Err(Error::http(
240            StatusCode::INTERNAL_SERVER_ERROR,
241            "events not enabled".into(),
242        ))?;
243    };
244
245    if checksum != expected {
246        return Err(Error::http(
247            StatusCode::BAD_REQUEST,
248            "invalid checksum".into(),
249        ))?;
250    }
251
252    let incident_id = Uuid::new_v4().to_string();
253
254    // dump JSON into S3 so we can get frame offsets if we need to.
255    if let Some(blob_store_client) = app.blob_store_client.as_ref() {
256        blob_store_client
257            .put_object()
258            .bucket(CRASH_REPORTS_BUCKET)
259            .key(incident_id.clone() + ".hang.json")
260            .acl(aws_sdk_s3::types::ObjectCannedAcl::PublicRead)
261            .body(ByteStream::from(body.to_vec()))
262            .send()
263            .await
264            .map_err(|e| log::error!("Failed to upload crash: {}", e))
265            .ok();
266    }
267
268    let report: telemetry_events::HangReport = serde_json::from_slice(&body).map_err(|err| {
269        log::error!("can't parse report json: {err}");
270        Error::Internal(anyhow!(err))
271    })?;
272
273    let mut backtrace = "Possible hang detected on main thread:".to_string();
274    let unknown = "<unknown>".to_string();
275    for frame in report.backtrace.iter() {
276        backtrace.push_str(&format!("\n{}", frame.symbols.first().unwrap_or(&unknown)));
277    }
278
279    tracing::error!(
280        service = "client",
281        version = %report.app_version.unwrap_or_default().to_string(),
282        os_name = %report.os_name,
283        os_version = report.os_version.unwrap_or_default(),
284        incident_id = %incident_id,
285        installation_id = %report.installation_id.unwrap_or_default(),
286        backtrace = %backtrace,
287        "hang report");
288
289    Ok(())
290}
291
292pub async fn post_panic(
293    Extension(app): Extension<Arc<AppState>>,
294    TypedHeader(ZedChecksumHeader(checksum)): TypedHeader<ZedChecksumHeader>,
295    body: Bytes,
296) -> Result<()> {
297    let Some(expected) = calculate_json_checksum(app.clone(), &body) else {
298        return Err(Error::http(
299            StatusCode::INTERNAL_SERVER_ERROR,
300            "events not enabled".into(),
301        ))?;
302    };
303
304    if checksum != expected {
305        return Err(Error::http(
306            StatusCode::BAD_REQUEST,
307            "invalid checksum".into(),
308        ))?;
309    }
310
311    let report: telemetry_events::PanicRequest = serde_json::from_slice(&body)
312        .map_err(|_| Error::http(StatusCode::BAD_REQUEST, "invalid json".into()))?;
313    let incident_id = uuid::Uuid::new_v4().to_string();
314    let panic = report.panic;
315
316    if panic.os_name == "Linux" && panic.os_version == Some("1.0.0".to_string()) {
317        return Err(Error::http(
318            StatusCode::BAD_REQUEST,
319            "invalid os version".into(),
320        ))?;
321    }
322
323    if let Some(blob_store_client) = app.blob_store_client.as_ref() {
324        let response = blob_store_client
325            .head_object()
326            .bucket(CRASH_REPORTS_BUCKET)
327            .key(incident_id.clone() + ".json")
328            .send()
329            .await;
330
331        if response.is_ok() {
332            log::info!("We've already uploaded this crash");
333            return Ok(());
334        }
335
336        blob_store_client
337            .put_object()
338            .bucket(CRASH_REPORTS_BUCKET)
339            .key(incident_id.clone() + ".json")
340            .acl(aws_sdk_s3::types::ObjectCannedAcl::PublicRead)
341            .body(ByteStream::from(body.to_vec()))
342            .send()
343            .await
344            .map_err(|e| log::error!("Failed to upload crash: {}", e))
345            .ok();
346    }
347
348    let backtrace = panic.backtrace.join("\n");
349
350    tracing::error!(
351        service = "client",
352        version = %panic.app_version,
353        os_name = %panic.os_name,
354        os_version = %panic.os_version.clone().unwrap_or_default(),
355        incident_id = %incident_id,
356        installation_id = %panic.installation_id.clone().unwrap_or_default(),
357        description = %panic.payload,
358        backtrace = %backtrace,
359        "panic report"
360    );
361
362    if let Some(kinesis_client) = app.kinesis_client.clone()
363        && let Some(stream) = app.config.kinesis_stream.clone()
364    {
365        let properties = json!({
366            "app_version": panic.app_version,
367            "os_name": panic.os_name,
368            "os_version": panic.os_version,
369            "incident_id": incident_id,
370            "installation_id": panic.installation_id,
371            "description": panic.payload,
372            "backtrace": backtrace,
373        });
374        let row = SnowflakeRow::new(
375            "Panic Reported",
376            None,
377            false,
378            panic.installation_id.clone(),
379            properties,
380        );
381        let data = serde_json::to_vec(&row)?;
382        kinesis_client
383            .put_record()
384            .stream_name(stream)
385            .partition_key(row.insert_id.unwrap_or_default())
386            .data(data.into())
387            .send()
388            .await
389            .log_err();
390    }
391
392    if !report_to_slack(&panic) {
393        return Ok(());
394    }
395
396    if let Some(slack_panics_webhook) = app.config.slack_panics_webhook.clone() {
397        let backtrace = if panic.backtrace.len() > 25 {
398            let total = panic.backtrace.len();
399            format!(
400                "{}\n   and {} more",
401                panic
402                    .backtrace
403                    .iter()
404                    .take(20)
405                    .cloned()
406                    .collect::<Vec<_>>()
407                    .join("\n"),
408                total - 20
409            )
410        } else {
411            panic.backtrace.join("\n")
412        };
413        let backtrace_with_summary = panic.payload + "\n" + &backtrace;
414
415        let version = if panic.release_channel == "nightly"
416            && !panic.app_version.contains("remote-server")
417            && let Some(sha) = panic.app_commit_sha
418        {
419            format!("Zed Nightly {}", sha.chars().take(7).collect::<String>())
420        } else {
421            panic.app_version
422        };
423
424        let payload = slack::WebhookBody::new(|w| {
425            w.add_section(|s| s.text(slack::Text::markdown("Panic request".to_string())))
426                .add_section(|s| {
427                    s.add_field(slack::Text::markdown(format!("*Version:*\n {version} ",)))
428                        .add_field({
429                            let hostname = app.config.blob_store_url.clone().unwrap_or_default();
430                            let hostname = hostname.strip_prefix("https://").unwrap_or_else(|| {
431                                hostname.strip_prefix("http://").unwrap_or_default()
432                            });
433
434                            slack::Text::markdown(format!(
435                                "*{} {}:*\n<https://{}.{}/{}.json|{}…>",
436                                panic.os_name,
437                                panic.os_version.unwrap_or_default(),
438                                CRASH_REPORTS_BUCKET,
439                                hostname,
440                                incident_id,
441                                incident_id.chars().take(8).collect::<String>(),
442                            ))
443                        })
444                })
445                .add_rich_text(|r| r.add_preformatted(|p| p.add_text(backtrace_with_summary)))
446        });
447        let payload_json = serde_json::to_string(&payload).map_err(|err| {
448            log::error!("Failed to serialize payload to JSON: {err}");
449            Error::Internal(anyhow!(err))
450        })?;
451
452        reqwest::Client::new()
453            .post(slack_panics_webhook)
454            .header("Content-Type", "application/json")
455            .body(payload_json)
456            .send()
457            .await
458            .map_err(|err| {
459                log::error!("Failed to send payload to Slack: {err}");
460                Error::Internal(anyhow!(err))
461            })?;
462    }
463
464    Ok(())
465}
466
467fn report_to_slack(panic: &Panic) -> bool {
468    // Panics on macOS should make their way to Slack as a crash report,
469    // so we don't need to send them a second time via this channel.
470    if panic.os_name == "macOS" {
471        return false;
472    }
473
474    if panic.payload.contains("ERROR_SURFACE_LOST_KHR") {
475        return false;
476    }
477
478    if panic.payload.contains("ERROR_INITIALIZATION_FAILED") {
479        return false;
480    }
481
482    if panic
483        .payload
484        .contains("GPU has crashed, and no debug information is available")
485    {
486        return false;
487    }
488
489    true
490}
491
492pub async fn post_events(
493    Extension(app): Extension<Arc<AppState>>,
494    TypedHeader(ZedChecksumHeader(checksum)): TypedHeader<ZedChecksumHeader>,
495    country_code_header: Option<TypedHeader<CloudflareIpCountryHeader>>,
496    body: Bytes,
497) -> Result<()> {
498    let Some(expected) = calculate_json_checksum(app.clone(), &body) else {
499        return Err(Error::http(
500            StatusCode::INTERNAL_SERVER_ERROR,
501            "events not enabled".into(),
502        ))?;
503    };
504
505    let checksum_matched = checksum == expected;
506
507    let request_body: telemetry_events::EventRequestBody =
508        serde_json::from_slice(&body).map_err(|err| {
509            log::error!("can't parse event json: {err}");
510            Error::Internal(anyhow!(err))
511        })?;
512
513    let Some(last_event) = request_body.events.last() else {
514        return Err(Error::http(StatusCode::BAD_REQUEST, "no events".into()))?;
515    };
516    let country_code = country_code_header.map(|h| h.to_string());
517
518    let first_event_at = chrono::Utc::now()
519        - chrono::Duration::milliseconds(last_event.milliseconds_since_first_event);
520
521    if let Some(kinesis_client) = app.kinesis_client.clone()
522        && let Some(stream) = app.config.kinesis_stream.clone()
523    {
524        let mut request = kinesis_client.put_records().stream_name(stream);
525        let mut has_records = false;
526        for row in for_snowflake(
527            request_body.clone(),
528            first_event_at,
529            country_code.clone(),
530            checksum_matched,
531        ) {
532            if let Some(data) = serde_json::to_vec(&row).log_err() {
533                request = request.records(
534                    aws_sdk_kinesis::types::PutRecordsRequestEntry::builder()
535                        .partition_key(request_body.system_id.clone().unwrap_or_default())
536                        .data(data.into())
537                        .build()
538                        .unwrap(),
539                );
540                has_records = true;
541            }
542        }
543        if has_records {
544            request.send().await.log_err();
545        }
546    };
547
548    Ok(())
549}
550
551pub fn calculate_json_checksum(app: Arc<AppState>, json: &impl AsRef<[u8]>) -> Option<Vec<u8>> {
552    let checksum_seed = app.config.zed_client_checksum_seed.as_ref()?;
553
554    let mut summer = Sha256::new();
555    summer.update(checksum_seed);
556    summer.update(json);
557    summer.update(checksum_seed);
558    Some(summer.finalize().into_iter().collect())
559}
560
561fn for_snowflake(
562    body: EventRequestBody,
563    first_event_at: chrono::DateTime<chrono::Utc>,
564    country_code: Option<String>,
565    checksum_matched: bool,
566) -> impl Iterator<Item = SnowflakeRow> {
567    body.events.into_iter().map(move |event| {
568        let timestamp =
569            first_event_at + Duration::milliseconds(event.milliseconds_since_first_event);
570        let (event_type, mut event_properties) = match &event.event {
571            Event::Flexible(e) => (
572                e.event_type.clone(),
573                serde_json::to_value(&e.event_properties).unwrap(),
574            ),
575        };
576
577        if let serde_json::Value::Object(ref mut map) = event_properties {
578            map.insert("app_version".to_string(), body.app_version.clone().into());
579            map.insert("os_name".to_string(), body.os_name.clone().into());
580            map.insert("os_version".to_string(), body.os_version.clone().into());
581            map.insert("architecture".to_string(), body.architecture.clone().into());
582            map.insert(
583                "release_channel".to_string(),
584                body.release_channel.clone().into(),
585            );
586            map.insert("signed_in".to_string(), event.signed_in.into());
587            map.insert("checksum_matched".to_string(), checksum_matched.into());
588            if let Some(country_code) = country_code.as_ref() {
589                map.insert("country".to_string(), country_code.clone().into());
590            }
591        }
592
593        // NOTE: most amplitude user properties are read out of our event_properties
594        // dictionary. See https://app.amplitude.com/data/zed/Zed/sources/detail/production/falcon%3A159998
595        // for how that is configured.
596        let user_properties = body.is_staff.map(|is_staff| {
597            serde_json::json!({
598                "is_staff": is_staff,
599            })
600        });
601
602        SnowflakeRow {
603            time: timestamp,
604            user_id: body.metrics_id.clone(),
605            device_id: body.system_id.clone(),
606            event_type,
607            event_properties,
608            user_properties,
609            insert_id: Some(Uuid::new_v4().to_string()),
610        }
611    })
612}
613
614#[derive(Serialize, Deserialize, Debug)]
615pub struct SnowflakeRow {
616    pub time: chrono::DateTime<chrono::Utc>,
617    pub user_id: Option<String>,
618    pub device_id: Option<String>,
619    pub event_type: String,
620    pub event_properties: serde_json::Value,
621    pub user_properties: Option<serde_json::Value>,
622    pub insert_id: Option<String>,
623}
624
625impl SnowflakeRow {
626    pub fn new(
627        event_type: impl Into<String>,
628        metrics_id: Option<Uuid>,
629        is_staff: bool,
630        system_id: Option<String>,
631        event_properties: serde_json::Value,
632    ) -> Self {
633        Self {
634            time: chrono::Utc::now(),
635            event_type: event_type.into(),
636            device_id: system_id,
637            user_id: metrics_id.map(|id| id.to_string()),
638            insert_id: Some(uuid::Uuid::new_v4().to_string()),
639            event_properties,
640            user_properties: Some(json!({"is_staff": is_staff})),
641        }
642    }
643
644    pub async fn write(
645        self,
646        client: &Option<aws_sdk_kinesis::Client>,
647        stream: &Option<String>,
648    ) -> anyhow::Result<()> {
649        let Some((client, stream)) = client.as_ref().zip(stream.as_ref()) else {
650            return Ok(());
651        };
652        let row = serde_json::to_vec(&self)?;
653        client
654            .put_record()
655            .stream_name(stream)
656            .partition_key(&self.user_id.unwrap_or_default())
657            .data(row.into())
658            .send()
659            .await?;
660        Ok(())
661    }
662}