Add retry logic to docs_suggestions workflow for transient Factory API failures (#49594)

morgankrey created

Add retry logic with linearly increasing backoff (up to 3 attempts, waiting
5s and then 10s between them) to the Droid CLI installation and
docs-suggest script execution steps in both the batch-suggestions and
cherry-pick-suggestions jobs.

This handles intermittent Factory API authentication issues that can
cause workflow failures when the API is temporarily unavailable or
rate-limited.

Release Notes:

- N/A

Change summary

.github/workflows/docs_suggestions.yml | 82 +++++++++++++++++++++++----
1 file changed, 68 insertions(+), 14 deletions(-)

Detailed changes

.github/workflows/docs_suggestions.yml 🔗

@@ -70,7 +70,20 @@ jobs:
 
       - name: Install Droid CLI
         run: |
-          curl -fsSL https://app.factory.ai/cli | sh
+          # Retry transient failures (waits 5s, then 10s); pipefail makes a curl error fail the attempt
+          MAX_RETRIES=3
+          for i in $(seq 1 "$MAX_RETRIES"); do
+            echo "Attempt $i of $MAX_RETRIES to install Droid CLI..."
+            if (set -o pipefail; curl -fsSL https://app.factory.ai/cli | sh); then
+              echo "Droid CLI installed successfully"
+              break
+            fi
+            if [ "$i" -eq "$MAX_RETRIES" ]; then
+              echo "Failed to install Droid CLI after $MAX_RETRIES attempts"
+              exit 1
+            fi
+            sleep $((i * 5))
+          done
           echo "${HOME}/.local/bin" >> "$GITHUB_PATH"
         env:
           FACTORY_API_KEY: ${{ secrets.FACTORY_API_KEY }}
@@ -100,12 +113,26 @@ jobs:
           
           OUTPUT_FILE=$(mktemp)
           
-          ./script/docs-suggest \
-            --pr "${{ steps.pr.outputs.number }}" \
-            --immediate \
-            --preview \
-            --output "$OUTPUT_FILE" \
-            --verbose
+          # Retry with linear backoff (5s, then 10s) for transient Factory API failures
+          MAX_RETRIES=3
+          for i in $(seq 1 "$MAX_RETRIES"); do
+            echo "Attempt $i of $MAX_RETRIES to analyze PR..."
+            if ./script/docs-suggest \
+              --pr "${{ steps.pr.outputs.number }}" \
+              --immediate \
+              --preview \
+              --output "$OUTPUT_FILE" \
+              --verbose; then
+              echo "Analysis completed successfully"
+              break
+            fi
+            if [ "$i" -eq "$MAX_RETRIES" ]; then
+              echo "Analysis failed after $MAX_RETRIES attempts"
+              exit 1
+            fi
+            echo "Retrying in $((i * 5)) seconds..."
+            sleep $((i * 5))
+          done
           
           # Check if we got actionable suggestions (not "no updates needed")
           if grep -q "Documentation Suggestions" "$OUTPUT_FILE" && \
@@ -251,7 +278,20 @@ jobs:
 
       - name: Install Droid CLI
         run: |
-          curl -fsSL https://app.factory.ai/cli | sh
+          # Retry transient failures (waits 5s, then 10s); pipefail makes a curl error fail the attempt
+          MAX_RETRIES=3
+          for i in $(seq 1 "$MAX_RETRIES"); do
+            echo "Attempt $i of $MAX_RETRIES to install Droid CLI..."
+            if (set -o pipefail; curl -fsSL https://app.factory.ai/cli | sh); then
+              echo "Droid CLI installed successfully"
+              break
+            fi
+            if [ "$i" -eq "$MAX_RETRIES" ]; then
+              echo "Failed to install Droid CLI after $MAX_RETRIES attempts"
+              exit 1
+            fi
+            sleep $((i * 5))
+          done
           echo "${HOME}/.local/bin" >> "$GITHUB_PATH"
         env:
           FACTORY_API_KEY: ${{ secrets.FACTORY_API_KEY }}
@@ -275,12 +315,26 @@ jobs:
           OUTPUT_FILE=$(mktemp)
           
           # Cherry-picks don't get preview callout
-          ./script/docs-suggest \
-            --pr "${{ steps.pr.outputs.number }}" \
-            --immediate \
-            --no-preview \
-            --output "$OUTPUT_FILE" \
-            --verbose
+          # Retry with linear backoff (5s, then 10s) for transient Factory API failures
+          MAX_RETRIES=3
+          for i in $(seq 1 "$MAX_RETRIES"); do
+            echo "Attempt $i of $MAX_RETRIES to analyze PR..."
+            if ./script/docs-suggest \
+              --pr "${{ steps.pr.outputs.number }}" \
+              --immediate \
+              --no-preview \
+              --output "$OUTPUT_FILE" \
+              --verbose; then
+              echo "Analysis completed successfully"
+              break
+            fi
+            if [ "$i" -eq "$MAX_RETRIES" ]; then
+              echo "Analysis failed after $MAX_RETRIES attempts"
+              exit 1
+            fi
+            echo "Retrying in $((i * 5)) seconds..."
+            sleep $((i * 5))
+          done
           
           # Check if we got actionable suggestions
           if [ -s "$OUTPUT_FILE" ] && \