evals: Retry on Anthropic's internal and transient I/O errors (#35395)

Oleksiy Syvokon created

Release Notes:

- N/A

Change summary

crates/assistant_tools/src/edit_agent/evals.rs | 59 +++++++++++--------
1 file changed, 34 insertions(+), 25 deletions(-)

Detailed changes

crates/assistant_tools/src/edit_agent/evals.rs 🔗

@@ -1658,23 +1658,24 @@ impl EditAgentTest {
 }
 
 async fn retry_on_rate_limit<R>(mut request: impl AsyncFnMut() -> Result<R>) -> Result<R> {
+    const MAX_RETRIES: usize = 20;
     let mut attempt = 0;
+
     loop {
         attempt += 1;
-        match request().await {
-            Ok(result) => return Ok(result),
-            Err(err) => match err.downcast::<LanguageModelCompletionError>() {
-                Ok(err) => match &err {
+        let response = request().await;
+
+        if attempt >= MAX_RETRIES {
+            return response;
+        }
+
+        let retry_delay = match &response {
+            Ok(_) => None,
+            Err(err) => match err.downcast_ref::<LanguageModelCompletionError>() {
+                Some(err) => match &err {
                     LanguageModelCompletionError::RateLimitExceeded { retry_after, .. }
                     | LanguageModelCompletionError::ServerOverloaded { retry_after, .. } => {
-                        let retry_after = retry_after.unwrap_or(Duration::from_secs(5));
-                        // Wait for the duration supplied, with some jitter to avoid all requests being made at the same time.
-                        let jitter = retry_after.mul_f64(rand::thread_rng().gen_range(0.0..1.0));
-                        eprintln!(
-                            "Attempt #{attempt}: {err}. Retry after {retry_after:?} + jitter of {jitter:?}"
-                        );
-                        Timer::after(retry_after + jitter).await;
-                        continue;
+                        Some(retry_after.unwrap_or(Duration::from_secs(5)))
                     }
                     LanguageModelCompletionError::UpstreamProviderError {
                         status,
@@ -1687,23 +1688,31 @@ async fn retry_on_rate_limit<R>(mut request: impl AsyncFnMut() -> Result<R>) ->
                             StatusCode::TOO_MANY_REQUESTS | StatusCode::SERVICE_UNAVAILABLE
                         ) || status.as_u16() == 529;
 
-                        if !should_retry {
-                            return Err(err.into());
+                        if should_retry {
+                            // Use server-provided retry_after if available, otherwise use default
+                            Some(retry_after.unwrap_or(Duration::from_secs(5)))
+                        } else {
+                            None
                         }
-
-                        // Use server-provided retry_after if available, otherwise use default
-                        let retry_after = retry_after.unwrap_or(Duration::from_secs(5));
-                        let jitter = retry_after.mul_f64(rand::thread_rng().gen_range(0.0..1.0));
-                        eprintln!(
-                            "Attempt #{attempt}: {err}. Retry after {retry_after:?} + jitter of {jitter:?}"
-                        );
-                        Timer::after(retry_after + jitter).await;
-                        continue;
                     }
-                    _ => return Err(err.into()),
+                    LanguageModelCompletionError::ApiReadResponseError { .. }
+                    | LanguageModelCompletionError::ApiInternalServerError { .. }
+                    | LanguageModelCompletionError::HttpSend { .. } => {
+                        // Exponential backoff for transient I/O and internal server errors
+                        Some(Duration::from_secs(2_u64.pow((attempt - 1) as u32).min(30)))
+                    }
+                    _ => None,
                 },
-                Err(err) => return Err(err),
+                _ => None,
             },
+        };
+
+        if let Some(retry_after) = retry_delay {
+            let jitter = retry_after.mul_f64(rand::thread_rng().gen_range(0.0..1.0));
+            eprintln!("Attempt #{attempt}: Retry after {retry_after:?} + jitter of {jitter:?}");
+            Timer::after(retry_after + jitter).await;
+        } else {
+            return response;
         }
     }
 }