use super::ips_file::IpsFile;
use crate::api::CloudflareIpCountryHeader;
use crate::{AppState, Error, Result, api::slack};
use anyhow::anyhow;
use aws_sdk_s3::primitives::ByteStream;
use axum::{
    Extension, Router, TypedHeader,
    body::Bytes,
    headers::Header,
    http::{HeaderMap, HeaderName, StatusCode},
    routing::post,
};
use chrono::Duration;
use semantic_version::SemanticVersion;
use serde::{Deserialize, Serialize};
use serde_json::json;
use sha2::{Digest, Sha256};
use std::sync::{Arc, OnceLock};
use telemetry_events::{Event, EventRequestBody, Panic};
use util::ResultExt;
use uuid::Uuid;

const CRASH_REPORTS_BUCKET: &str = "zed-crash-reports";

pub fn router() -> Router {
    Router::new()
        .route("/telemetry/events", post(post_events))
        .route("/telemetry/crashes", post(post_crash))
        .route("/telemetry/panics", post(post_panic))
        .route("/telemetry/hangs", post(post_hang))
}

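/// Typed header for the hex-encoded checksum the Zed client sends in `x-zed-checksum`,
/// used to verify that a telemetry payload was produced by a real client
/// (see `calculate_json_checksum` below).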
pub struct ZedChecksumHeader(Vec<u8>);

impl Header for ZedChecksumHeader {
    fn name() -> &'static HeaderName {
        static ZED_CHECKSUM_HEADER: OnceLock<HeaderName> = OnceLock::new();
        ZED_CHECKSUM_HEADER.get_or_init(|| HeaderName::from_static("x-zed-checksum"))
    }

    fn decode<'i, I>(values: &mut I) -> Result<Self, axum::headers::Error>
    where
        Self: Sized,
        I: Iterator<Item = &'i axum::http::HeaderValue>,
    {
        let checksum = values
            .next()
            .ok_or_else(axum::headers::Error::invalid)?
            .to_str()
            .map_err(|_| axum::headers::Error::invalid())?;

        let bytes = hex::decode(checksum).map_err(|_| axum::headers::Error::invalid())?;
        Ok(Self(bytes))
    }

    fn encode<E: Extend<axum::http::HeaderValue>>(&self, _values: &mut E) {
        unimplemented!()
    }
}

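/// Handles macOS crash reports uploaded by the client as Apple `.ips` files.
///
/// Reports from dev-channel builds and from versions older than 0.123.0 are ignored.
/// Accepted reports are uploaded to the crash-reports bucket (if blob storage is configured),
/// logged, forwarded to Kinesis as a "Crash Reported" event, and posted to the panics
/// Slack webhook.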
pub async fn post_crash(
    Extension(app): Extension<Arc<AppState>>,
    headers: HeaderMap,
    body: Bytes,
) -> Result<()> {
    let report = IpsFile::parse(&body)?;
    let version_threshold = SemanticVersion::new(0, 123, 0);

    let bundle_id = &report.header.bundle_id;
    let app_version = report.app_version();

    if bundle_id == "dev.zed.Zed-Dev" {
        log::error!("Crash uploads from {} are ignored.", bundle_id);
        return Ok(());
    }

    let Some(app_version) = app_version.filter(|version| *version >= version_threshold) else {
        log::error!(
            "Crash uploads from {} are ignored.",
            report.header.app_version
        );
        return Ok(());
    };

    if let Some(blob_store_client) = app.blob_store_client.as_ref() {
        let response = blob_store_client
            .head_object()
            .bucket(CRASH_REPORTS_BUCKET)
            .key(report.header.incident_id.clone() + ".ips")
            .send()
            .await;

        if response.is_ok() {
            log::info!("We've already uploaded this crash");
            return Ok(());
        }

        blob_store_client
            .put_object()
            .bucket(CRASH_REPORTS_BUCKET)
            .key(report.header.incident_id.clone() + ".ips")
            .acl(aws_sdk_s3::types::ObjectCannedAcl::PublicRead)
            .body(ByteStream::from(body.to_vec()))
            .send()
            .await
            .map_err(|e| log::error!("Failed to upload crash: {}", e))
            .ok();
    }

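    // The client reports when it last panicked via `x-zed-panicked-on` (a Unix timestamp in
    // milliseconds). If that panic happened within 30 seconds of the crash timestamp, attach
    // the panic message from `x-zed-panic` to the crash description.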
    let recent_panic_on: Option<i64> = headers
        .get("x-zed-panicked-on")
        .and_then(|h| h.to_str().ok())
        .and_then(|s| s.parse().ok());

    let installation_id = headers
        .get("x-zed-installation-id")
        .and_then(|h| h.to_str().ok())
        .map(|s| s.to_string())
        .unwrap_or_default();

    let mut recent_panic = None;

    if let Some(recent_panic_on) = recent_panic_on {
        let crashed_at = match report.timestamp() {
            Ok(t) => Some(t),
            Err(e) => {
                log::error!("Can't parse {}: {}", report.header.timestamp, e);
                None
            }
        };
        if crashed_at.is_some_and(|t| (t.timestamp_millis() - recent_panic_on).abs() <= 30000) {
            recent_panic = headers.get("x-zed-panic").and_then(|h| h.to_str().ok());
        }
    }

    let description = report.description(recent_panic);
    let summary = report.backtrace_summary();

    tracing::error!(
        service = "client",
        version = %report.header.app_version,
        os_version = %report.header.os_version,
        bundle_id = %report.header.bundle_id,
        incident_id = %report.header.incident_id,
        installation_id = %installation_id,
        description = %description,
        backtrace = %summary,
        "crash report"
    );

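    // Mirror the crash into the analytics pipeline as a "Crash Reported" event.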
    if let Some(kinesis_client) = app.kinesis_client.clone()
        && let Some(stream) = app.config.kinesis_stream.clone()
    {
        let properties = json!({
            "app_version": report.header.app_version,
            "os_version": report.header.os_version,
            "os_name": "macOS",
            "bundle_id": report.header.bundle_id,
            "incident_id": report.header.incident_id,
            "installation_id": installation_id,
            "description": description,
            "backtrace": summary,
        });
        let row = SnowflakeRow::new(
            "Crash Reported",
            None,
            false,
            Some(installation_id),
            properties,
        );
        let data = serde_json::to_vec(&row)?;
        kinesis_client
            .put_record()
            .stream_name(stream)
            .partition_key(row.insert_id.unwrap_or_default())
            .data(data.into())
            .send()
            .await
            .log_err();
    }

    if let Some(slack_panics_webhook) = app.config.slack_panics_webhook.clone() {
        let payload = slack::WebhookBody::new(|w| {
            w.add_section(|s| s.text(slack::Text::markdown(description)))
                .add_section(|s| {
                    s.add_field(slack::Text::markdown(format!(
                        "*Version:*\n{} ({})",
                        bundle_id, app_version
                    )))
                    .add_field({
                        let hostname = app.config.blob_store_url.clone().unwrap_or_default();
                        let hostname = hostname.strip_prefix("https://").unwrap_or_else(|| {
                            hostname.strip_prefix("http://").unwrap_or_default()
                        });

                        slack::Text::markdown(format!(
                            "*Incident:*\n<https://{}.{}/{}.ips|{}…>",
                            CRASH_REPORTS_BUCKET,
                            hostname,
                            report.header.incident_id,
                            report
                                .header
                                .incident_id
                                .chars()
                                .take(8)
                                .collect::<String>(),
                        ))
                    })
                })
                .add_rich_text(|r| r.add_preformatted(|p| p.add_text(summary)))
        });
        let payload_json = serde_json::to_string(&payload).map_err(|err| {
            log::error!("Failed to serialize payload to JSON: {err}");
            Error::Internal(anyhow!(err))
        })?;

        reqwest::Client::new()
            .post(slack_panics_webhook)
            .header("Content-Type", "application/json")
            .body(payload_json)
            .send()
            .await
            .map_err(|err| {
                log::error!("Failed to send payload to Slack: {err}");
                Error::Internal(anyhow!(err))
            })?;
    }

    Ok(())
}

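/// Handles hang reports (a JSON-encoded `telemetry_events::HangReport`) from the client.
///
/// The request body must carry a valid `x-zed-checksum`. The raw JSON is archived in the
/// crash-reports bucket under `<uuid>.hang.json`, and a summary of the main-thread backtrace
/// is logged.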
pub async fn post_hang(
    Extension(app): Extension<Arc<AppState>>,
    TypedHeader(ZedChecksumHeader(checksum)): TypedHeader<ZedChecksumHeader>,
    body: Bytes,
) -> Result<()> {
    let Some(expected) = calculate_json_checksum(app.clone(), &body) else {
        return Err(Error::http(
            StatusCode::INTERNAL_SERVER_ERROR,
            "events not enabled".into(),
        ))?;
    };

    if checksum != expected {
        return Err(Error::http(
            StatusCode::BAD_REQUEST,
            "invalid checksum".into(),
        ))?;
    }

    let incident_id = Uuid::new_v4().to_string();

    // Dump the JSON into S3 so we can get frame offsets if we need to.
    if let Some(blob_store_client) = app.blob_store_client.as_ref() {
        blob_store_client
            .put_object()
            .bucket(CRASH_REPORTS_BUCKET)
            .key(incident_id.clone() + ".hang.json")
            .acl(aws_sdk_s3::types::ObjectCannedAcl::PublicRead)
            .body(ByteStream::from(body.to_vec()))
            .send()
            .await
            .map_err(|e| log::error!("Failed to upload hang report: {}", e))
            .ok();
    }

    let report: telemetry_events::HangReport = serde_json::from_slice(&body).map_err(|err| {
        log::error!("can't parse report json: {err}");
        Error::Internal(anyhow!(err))
    })?;

    let mut backtrace = "Possible hang detected on main thread:".to_string();
    let unknown = "<unknown>".to_string();
    for frame in report.backtrace.iter() {
        backtrace.push_str(&format!("\n{}", frame.symbols.first().unwrap_or(&unknown)));
    }

    tracing::error!(
        service = "client",
        version = %report.app_version.unwrap_or_default().to_string(),
        os_name = %report.os_name,
        os_version = %report.os_version.unwrap_or_default(),
        incident_id = %incident_id,
        installation_id = %report.installation_id.unwrap_or_default(),
        backtrace = %backtrace,
        "hang report"
    );

    Ok(())
}

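/// Handles panic reports (a JSON-encoded `telemetry_events::PanicRequest`) from the client.
///
/// After checksum validation the raw report is archived in the crash-reports bucket, logged,
/// forwarded to Kinesis as a "Panic Reported" event, and, unless filtered out by
/// `report_to_slack`, posted to the panics Slack webhook.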
pub async fn post_panic(
    Extension(app): Extension<Arc<AppState>>,
    TypedHeader(ZedChecksumHeader(checksum)): TypedHeader<ZedChecksumHeader>,
    body: Bytes,
) -> Result<()> {
    let Some(expected) = calculate_json_checksum(app.clone(), &body) else {
        return Err(Error::http(
            StatusCode::INTERNAL_SERVER_ERROR,
            "events not enabled".into(),
        ))?;
    };

    if checksum != expected {
        return Err(Error::http(
            StatusCode::BAD_REQUEST,
            "invalid checksum".into(),
        ))?;
    }

    let report: telemetry_events::PanicRequest = serde_json::from_slice(&body)
        .map_err(|_| Error::http(StatusCode::BAD_REQUEST, "invalid json".into()))?;
    let incident_id = Uuid::new_v4().to_string();
    let panic = report.panic;

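    // Reports claiming Linux OS version "1.0.0" are treated as invalid input and rejected.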
    if panic.os_name == "Linux" && panic.os_version == Some("1.0.0".to_string()) {
        return Err(Error::http(
            StatusCode::BAD_REQUEST,
            "invalid os version".into(),
        ))?;
    }

    if let Some(blob_store_client) = app.blob_store_client.as_ref() {
        let response = blob_store_client
            .head_object()
            .bucket(CRASH_REPORTS_BUCKET)
            .key(incident_id.clone() + ".json")
            .send()
            .await;

        if response.is_ok() {
            log::info!("We've already uploaded this crash");
            return Ok(());
        }

        blob_store_client
            .put_object()
            .bucket(CRASH_REPORTS_BUCKET)
            .key(incident_id.clone() + ".json")
            .acl(aws_sdk_s3::types::ObjectCannedAcl::PublicRead)
            .body(ByteStream::from(body.to_vec()))
            .send()
            .await
            .map_err(|e| log::error!("Failed to upload panic: {}", e))
            .ok();
    }

    let backtrace = panic.backtrace.join("\n");

    tracing::error!(
        service = "client",
        version = %panic.app_version,
        os_name = %panic.os_name,
        os_version = %panic.os_version.clone().unwrap_or_default(),
        incident_id = %incident_id,
        installation_id = %panic.installation_id.clone().unwrap_or_default(),
        description = %panic.payload,
        backtrace = %backtrace,
        "panic report"
    );

    if let Some(kinesis_client) = app.kinesis_client.clone()
        && let Some(stream) = app.config.kinesis_stream.clone()
    {
        let properties = json!({
            "app_version": panic.app_version,
            "os_name": panic.os_name,
            "os_version": panic.os_version,
            "incident_id": incident_id,
            "installation_id": panic.installation_id,
            "description": panic.payload,
            "backtrace": backtrace,
        });
        let row = SnowflakeRow::new(
            "Panic Reported",
            None,
            false,
            panic.installation_id.clone(),
            properties,
        );
        let data = serde_json::to_vec(&row)?;
        kinesis_client
            .put_record()
            .stream_name(stream)
            .partition_key(row.insert_id.unwrap_or_default())
            .data(data.into())
            .send()
            .await
            .log_err();
    }

    if !report_to_slack(&panic) {
        return Ok(());
    }

    if let Some(slack_panics_webhook) = app.config.slack_panics_webhook.clone() {
        let backtrace = if panic.backtrace.len() > 25 {
            let total = panic.backtrace.len();
            format!(
                "{}\n and {} more",
                panic
                    .backtrace
                    .iter()
                    .take(20)
                    .cloned()
                    .collect::<Vec<_>>()
                    .join("\n"),
                total - 20
            )
        } else {
            panic.backtrace.join("\n")
        };
        let backtrace_with_summary = panic.payload + "\n" + &backtrace;

        let version = if panic.release_channel == "nightly"
            && !panic.app_version.contains("remote-server")
            && let Some(sha) = panic.app_commit_sha
        {
            format!("Zed Nightly {}", sha.chars().take(7).collect::<String>())
        } else {
            panic.app_version
        };

        let payload = slack::WebhookBody::new(|w| {
            w.add_section(|s| s.text(slack::Text::markdown("Panic request".to_string())))
                .add_section(|s| {
                    s.add_field(slack::Text::markdown(format!("*Version:*\n {version} ")))
                        .add_field({
                            let hostname = app.config.blob_store_url.clone().unwrap_or_default();
                            let hostname = hostname.strip_prefix("https://").unwrap_or_else(|| {
                                hostname.strip_prefix("http://").unwrap_or_default()
                            });

                            slack::Text::markdown(format!(
                                "*{} {}:*\n<https://{}.{}/{}.json|{}…>",
                                panic.os_name,
                                panic.os_version.unwrap_or_default(),
                                CRASH_REPORTS_BUCKET,
                                hostname,
                                incident_id,
                                incident_id.chars().take(8).collect::<String>(),
                            ))
                        })
                })
                .add_rich_text(|r| r.add_preformatted(|p| p.add_text(backtrace_with_summary)))
        });
        let payload_json = serde_json::to_string(&payload).map_err(|err| {
            log::error!("Failed to serialize payload to JSON: {err}");
            Error::Internal(anyhow!(err))
        })?;

        reqwest::Client::new()
            .post(slack_panics_webhook)
            .header("Content-Type", "application/json")
            .body(payload_json)
            .send()
            .await
            .map_err(|err| {
                log::error!("Failed to send payload to Slack: {err}");
                Error::Internal(anyhow!(err))
            })?;
    }

    Ok(())
}

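/// Returns whether a panic is worth posting to Slack, filtering out macOS panics (those
/// already arrive via crash reports) and known GPU/driver failure messages.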
fn report_to_slack(panic: &Panic) -> bool {
    // Panics on macOS should make their way to Slack as a crash report,
    // so we don't need to send them a second time via this channel.
    if panic.os_name == "macOS" {
        return false;
    }

    if panic.payload.contains("ERROR_SURFACE_LOST_KHR") {
        return false;
    }

    if panic.payload.contains("ERROR_INITIALIZATION_FAILED") {
        return false;
    }

    if panic
        .payload
        .contains("GPU has crashed, and no debug information is available")
    {
        return false;
    }

    true
}

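/// Handles a batch of telemetry events (a JSON-encoded `telemetry_events::EventRequestBody`).
///
/// The checksum is recorded rather than enforced: a mismatch marks each row with
/// `checksum_matched = false` instead of rejecting the request. Events are flattened into
/// `SnowflakeRow`s and written to the configured Kinesis stream in a single `PutRecords` call.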
pub async fn post_events(
    Extension(app): Extension<Arc<AppState>>,
    TypedHeader(ZedChecksumHeader(checksum)): TypedHeader<ZedChecksumHeader>,
    country_code_header: Option<TypedHeader<CloudflareIpCountryHeader>>,
    body: Bytes,
) -> Result<()> {
    let Some(expected) = calculate_json_checksum(app.clone(), &body) else {
        return Err(Error::http(
            StatusCode::INTERNAL_SERVER_ERROR,
            "events not enabled".into(),
        ))?;
    };

    let checksum_matched = checksum == expected;

    let request_body: telemetry_events::EventRequestBody =
        serde_json::from_slice(&body).map_err(|err| {
            log::error!("can't parse event json: {err}");
            Error::Internal(anyhow!(err))
        })?;

    let Some(last_event) = request_body.events.last() else {
        return Err(Error::http(StatusCode::BAD_REQUEST, "no events".into()))?;
    };
    let country_code = country_code_header.map(|h| h.to_string());

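    // Each event carries its offset (in milliseconds) from the first event in the batch, so
    // subtracting the last event's offset from "now" approximates when the batch started;
    // per-event timestamps are then reconstructed from that base in `for_snowflake`.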
    let first_event_at = chrono::Utc::now()
        - chrono::Duration::milliseconds(last_event.milliseconds_since_first_event);

    if let Some(kinesis_client) = app.kinesis_client.clone()
        && let Some(stream) = app.config.kinesis_stream.clone()
    {
        let mut request = kinesis_client.put_records().stream_name(stream);
        let mut has_records = false;
        for row in for_snowflake(
            request_body.clone(),
            first_event_at,
            country_code.clone(),
            checksum_matched,
        ) {
            if let Some(data) = serde_json::to_vec(&row).log_err() {
                request = request.records(
                    aws_sdk_kinesis::types::PutRecordsRequestEntry::builder()
                        .partition_key(request_body.system_id.clone().unwrap_or_default())
                        .data(data.into())
                        .build()
                        .unwrap(),
                );
                has_records = true;
            }
        }
        if has_records {
            request.send().await.log_err();
        }
    }

    Ok(())
}

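/// Computes the expected checksum for a telemetry payload: a SHA-256 digest of the payload
/// with the shared client checksum seed prepended and appended. Returns `None` when no
/// `zed_client_checksum_seed` is configured, i.e. when telemetry verification is disabled.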
pub fn calculate_json_checksum(app: Arc<AppState>, json: &impl AsRef<[u8]>) -> Option<Vec<u8>> {
    let checksum_seed = app.config.zed_client_checksum_seed.as_ref()?;

    let mut summer = Sha256::new();
    summer.update(checksum_seed);
    summer.update(json);
    summer.update(checksum_seed);
    Some(summer.finalize().into_iter().collect())
}

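/// Flattens an event batch into one `SnowflakeRow` per event, stamping each row with a
/// reconstructed timestamp and copying batch-level metadata (app version, OS, architecture,
/// release channel, country, checksum status) into its `event_properties`.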
fn for_snowflake(
    body: EventRequestBody,
    first_event_at: chrono::DateTime<chrono::Utc>,
    country_code: Option<String>,
    checksum_matched: bool,
) -> impl Iterator<Item = SnowflakeRow> {
    body.events.into_iter().map(move |event| {
        let timestamp =
            first_event_at + Duration::milliseconds(event.milliseconds_since_first_event);
        let (event_type, mut event_properties) = match &event.event {
            Event::Flexible(e) => (
                e.event_type.clone(),
                serde_json::to_value(&e.event_properties).unwrap(),
            ),
        };

        if let serde_json::Value::Object(ref mut map) = event_properties {
            map.insert("app_version".to_string(), body.app_version.clone().into());
            map.insert("os_name".to_string(), body.os_name.clone().into());
            map.insert("os_version".to_string(), body.os_version.clone().into());
            map.insert("architecture".to_string(), body.architecture.clone().into());
            map.insert(
                "release_channel".to_string(),
                body.release_channel.clone().into(),
            );
            map.insert("signed_in".to_string(), event.signed_in.into());
            map.insert("checksum_matched".to_string(), checksum_matched.into());
            if let Some(country_code) = country_code.as_ref() {
                map.insert("country".to_string(), country_code.clone().into());
            }
        }

        // NOTE: most Amplitude user properties are read out of our event_properties
        // dictionary. See https://app.amplitude.com/data/zed/Zed/sources/detail/production/falcon%3A159998
        // for how that is configured.
        let user_properties = body.is_staff.map(|is_staff| {
            serde_json::json!({
                "is_staff": is_staff,
            })
        });

        SnowflakeRow {
            time: timestamp,
            user_id: body.metrics_id.clone(),
            device_id: body.system_id.clone(),
            event_type,
            event_properties,
            user_properties,
            insert_id: Some(Uuid::new_v4().to_string()),
        }
    })
}

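/// A single analytics event in the shape our Snowflake ingestion pipeline expects; rows are
/// serialized to JSON and written to the configured Kinesis stream.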
#[derive(Serialize, Deserialize, Debug)]
pub struct SnowflakeRow {
    pub time: chrono::DateTime<chrono::Utc>,
    pub user_id: Option<String>,
    pub device_id: Option<String>,
    pub event_type: String,
    pub event_properties: serde_json::Value,
    pub user_properties: Option<serde_json::Value>,
    pub insert_id: Option<String>,
}

impl SnowflakeRow {
    pub fn new(
        event_type: impl Into<String>,
        metrics_id: Option<Uuid>,
        is_staff: bool,
        system_id: Option<String>,
        event_properties: serde_json::Value,
    ) -> Self {
        Self {
            time: chrono::Utc::now(),
            event_type: event_type.into(),
            device_id: system_id,
            user_id: metrics_id.map(|id| id.to_string()),
            insert_id: Some(Uuid::new_v4().to_string()),
            event_properties,
            user_properties: Some(json!({"is_staff": is_staff})),
        }
    }

    pub async fn write(
        self,
        client: &Option<aws_sdk_kinesis::Client>,
        stream: &Option<String>,
    ) -> anyhow::Result<()> {
        let Some((client, stream)) = client.as_ref().zip(stream.as_ref()) else {
            return Ok(());
        };
        let row = serde_json::to_vec(&self)?;
        client
            .put_record()
            .stream_name(stream)
            .partition_key(&self.user_id.unwrap_or_default())
            .data(row.into())
            .send()
            .await?;
        Ok(())
    }
}