Post comments on duplicate bug reports (#49482)

Created by Lena

Release Notes:

- N/A

Change summary

.github/workflows/comment_on_potential_duplicate_issues.yml |  70 
.github/workflows/identify_potential_duplicate_issues.yml   | 692 -------
.github/workflows/update_duplicate_magnets.yml              |  27 
script/github-check-new-issue-for-duplicates.py             | 534 +++++
script/github-find-top-duplicated-bugs.py                   | 223 ++
5 files changed, 854 insertions(+), 692 deletions(-)

Detailed changes

.github/workflows/comment_on_potential_duplicate_issues.yml 🔗

@@ -0,0 +1,70 @@
+name: Comment on potential duplicate bug/crash reports
+
+on:
+  issues:
+    types: [opened]
+  workflow_dispatch:
+    inputs:
+      issue_number:
+        description: "Issue number to analyze"
+        required: true
+        type: number
+
+concurrency:
+  group: potential-duplicate-check-${{ github.event.issue.number || inputs.issue_number }}
+  cancel-in-progress: true
+
+jobs:
+  identify-duplicates:
+    # For manual testing, allow running on any branch; for automatic runs, only on main repo
+    if: github.event_name == 'workflow_dispatch' || github.repository == 'zed-industries/zed'
+    runs-on: ubuntu-latest
+    timeout-minutes: 5
+
+    permissions:
+      contents: read
+      issues: write
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          sparse-checkout: script/github-check-new-issue-for-duplicates.py
+          sparse-checkout-cone-mode: false
+
+      - name: Get github app token
+        id: get-app-token
+        uses: actions/create-github-app-token@bef1eaf1c0ac2b148ee2a0a74c65fbe6db0631f1 # v2.1.4
+        with:
+          app-id: ${{ secrets.ZED_COMMUNITY_BOT_APP_ID }}
+          private-key: ${{ secrets.ZED_COMMUNITY_BOT_PRIVATE_KEY }}
+          owner: zed-industries
+
+      - name: Set up Python
+        uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
+        with:
+          python-version: "3.12"
+
+      - name: Install dependencies
+        run: pip install requests
+
+      - name: Run duplicate detection
+        id: detect
+        env:
+          GITHUB_TOKEN: ${{ steps.get-app-token.outputs.token }}
+          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY_ISSUE_DEDUP }}
+          ISSUE_NUMBER: ${{ github.event.issue.number || inputs.issue_number }}
+        run: |
+          python script/github-check-new-issue-for-duplicates.py "$ISSUE_NUMBER" > result.json
+          cat result.json
+
+      - name: Write job summary
+        if: always()
+        run: |
+          echo '```json' >> "$GITHUB_STEP_SUMMARY"
+          if [[ -f result.json ]] && jq empty result.json 2>/dev/null; then
+            jq . result.json >> "$GITHUB_STEP_SUMMARY"
+          else
+            echo '{"error": "No valid result.json generated. Check logs for details."}' >> "$GITHUB_STEP_SUMMARY"
+          fi
+          echo '```' >> "$GITHUB_STEP_SUMMARY"

.github/workflows/identify_potential_duplicate_issues.yml 🔗

@@ -1,692 +0,0 @@
-name: Identify potential duplicates among new bug/crash reports
-
-on:
-  issues:
-    types: [opened]
-  workflow_dispatch:
-    inputs:
-      issue_number:
-        description: "Issue number to analyze (for testing)"
-        required: true
-        type: number
-
-concurrency:
-  group: potential-duplicate-check-${{ github.event.issue.number || inputs.issue_number }}
-  cancel-in-progress: true
-
-jobs:
-  identify-duplicates:
-    # For manual testing, allow running on any branch; for automatic runs, only on main repo
-    if: github.event_name == 'workflow_dispatch' || github.repository == 'zed-industries/zed'
-    runs-on: ubuntu-latest
-    timeout-minutes: 5
-
-    permissions:
-      contents: read
-      issues: read
-
-    steps:
-      - name: Get github app token
-        id: get-app-token
-        uses: actions/create-github-app-token@bef1eaf1c0ac2b148ee2a0a74c65fbe6db0631f1 # v2.1.4
-        with:
-          app-id: ${{ secrets.ZED_COMMUNITY_BOT_APP_ID }}
-          private-key: ${{ secrets.ZED_COMMUNITY_BOT_PRIVATE_KEY }}
-          owner: zed-industries
-
-      - name: Fetch issue and check eligibility
-        id: fetch-issue
-        uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
-        with:
-          github-token: ${{ steps.get-app-token.outputs.token }}
-          script: |
-            const issueNumber = context.payload.issue?.number || ${{ inputs.issue_number || 0 }};
-            if (!issueNumber) {
-              core.setFailed('No issue number provided');
-              return;
-            }
-
-            const { data: issue } = await github.rest.issues.get({
-              owner: context.repo.owner,
-              repo: context.repo.repo,
-              issue_number: issueNumber
-            });
-
-            const typeName = issue.type?.name;
-            const isTargetType = typeName === 'Bug' || typeName === 'Crash';
-
-            console.log(`Issue #${issueNumber}: "${issue.title}"`);
-            console.log(`Issue type: ${typeName || '(none)'}`);
-            console.log(`Is target type (Bug/Crash): ${isTargetType}`);
-
-            // Set default outputs for all paths
-            core.setOutput('issue_number', issueNumber);
-            core.setOutput('issue_title', issue.title);
-            core.setOutput('issue_body', (issue.body || '').slice(0, 6000));
-            core.setOutput('is_target_type', String(isTargetType));
-            core.setOutput('is_staff', 'false');
-            core.setOutput('should_continue', 'false');
-
-            if (!isTargetType) {
-              console.log('::notice::Skipping - issue type is not Bug or Crash');
-              return;
-            }
-
-            // Check if author is staff (skip if so - they know what they're doing)
-            const author = issue.user?.login || '';
-            let isStaff = false;
-            if (author) {
-              try {
-                const response = await github.rest.teams.getMembershipForUserInOrg({
-                  org: 'zed-industries',
-                  team_slug: 'staff',
-                  username: author
-                });
-                isStaff = response.data.state === 'active';
-              } catch (error) {
-                if (error.status !== 404) throw error;
-              }
-            }
-
-            core.setOutput('is_staff', String(isStaff));
-            if (isStaff) {
-              console.log(`::notice::Skipping - author @${author} is a staff member`);
-              return;
-            }
-
-            core.setOutput('should_continue', 'true');
-
-      # ========================================================================
-      # PASS 1: Detect areas using Claude with the full area taxonomy
-      # ========================================================================
-      - name: "Pass 1: Detect areas with Claude"
-        if: steps.fetch-issue.outputs.should_continue == 'true'
-        id: detect-areas
-        env:
-          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY_ISSUE_DEDUP }}
-          ISSUE_TITLE: ${{ steps.fetch-issue.outputs.issue_title }}
-          ISSUE_BODY: ${{ steps.fetch-issue.outputs.issue_body }}
-        run: |
-          # shellcheck disable=SC2016
-          cat > /tmp/area_prompt.txt << 'PROMPT_EOF'
-          You are classifying a GitHub issue for the Zed code editor into area categories.
-
-          ## Issue Title
-          ISSUE_TITLE_PLACEHOLDER
-
-          ## Issue Body
-          ISSUE_BODY_PLACEHOLDER
-
-          ## Available Area Labels
-          (descriptions provided only where the label name isn't self-explanatory)
-
-          accessibility
-          ai, ai/acp (Agent Communication Protocol), ai/agent thread, ai/anthropic, ai/assistant, ai/bedrock, ai/codex, ai/copilot, ai/deepseek, ai/edit prediction, ai/gemini, ai/inline assistant, ai/lmstudio, ai/mcp (Model Context Protocol), ai/mistral, ai/ollama, ai/openai, ai/openai compatible, ai/openrouter, ai/qwen, ai/supermaven, ai/text thread, ai/zeta
-          auth
-          autocompletions
-          billing
-          cli
-          code actions
-          code folding
-          collab - real-time collaboration with other Zed users (screen sharing, shared editing). NOT for remote development over SSH.
-          collab/audio, collab/chat
-          command palette
-          controls/ime, controls/keybinds, controls/mouse
-          debugger, debugger/dap/CodeLLDB, debugger/dap/debugpy, debugger/dap/gdb, debugger/dap/javascript
-          design papercut - small UI/UX polish issues
-          dev containers - Docker-based development environments
-          diagnostics - LSP errors/warnings display
-          discoverability
-          editor, editor/brackets, editor/linked edits
-          extensions/infrastructure
-          file finder - fuzzy file search (Cmd/Ctrl+P)
-          gpui - Zed's internal UI rendering framework
-          inlay hints - inline hints from LSP (type annotations, parameter names)
-          installer-updater
-          integrations/environment - shell environment, PATH, env vars
-          integrations/git, integrations/git/blame, integrations/terminal
-          internationalization, internationalization/rtl support
-          keymap editor
-          language server, language server/server failure
-          languages/* - language-specific syntax, grammar, or LSP issues (e.g., languages/python, languages/rust, languages/typescript)
-          legal
-          logging
-          multi-buffer - viewing multiple files or search results in a single editor pane
-          multi-cursor
-          navigation - go to definition, find references, symbol search
-          network - proxy settings, connectivity, SSL certificates. NOT for collab.
-          onboarding
-          outline - document symbols/structure sidebar
-          parity/* - feature parity requests comparing to other editors (parity/vscode, parity/vim, parity/emacs, parity/jetbrains, parity/helix)
-          performance, performance/memory leak
-          permissions
-          popovers - hover cards, tooltips, autocomplete dropdowns
-          preview/images, preview/markdown
-          project panel - file tree sidebar
-          release notes
-          repl
-          search - project-wide search, find/replace
-          security & privacy, security & privacy/workspace trust
-          serialization - saving/restoring workspace state, undo history, folding state across restarts
-          settings, settings/ui
-          snippets
-          status bar
-          tasks - task runner integration
-          telemetry
-          tooling/* - external tool integrations (tooling/emmet, tooling/eslint, tooling/prettier, tooling/flatpak, tooling/nix)
-          tree-sitter - syntax parsing and highlighting engine
-          ui/animations, ui/dock, ui/file icons, ui/font, ui/menus, ui/minimap, ui/panel, ui/scaling, ui/scrolling, ui/tabs, ui/themes
-          workspace - window management, pane layout, project handling
-          zed account
-          zed.dev
-
-          ## Your Task
-
-          Based on the issue title and body, identify which areas this issue relates to.
-          - Select 1-5 areas that best match the issue
-          - Prefer more specific sub-areas when applicable (e.g., "ai/gemini" over just "ai")
-          - Only select areas that are clearly relevant
-
-          ## Response Format
-
-          Return ONLY a JSON object (no markdown fences, no explanation):
-          {
-            "areas": ["area1", "area2"],
-            "reasoning": "Brief explanation of why these areas were selected"
-          }
-          PROMPT_EOF
-
-          # Single quotes are intentional to prevent bash expansion; node reads env vars via process.env
-          # shellcheck disable=SC2016
-          node << 'SCRIPT_EOF'
-          const fs = require('fs');
-          let prompt = fs.readFileSync('/tmp/area_prompt.txt', 'utf8');
-          prompt = prompt.replace('ISSUE_TITLE_PLACEHOLDER', process.env.ISSUE_TITLE || '');
-          prompt = prompt.replace('ISSUE_BODY_PLACEHOLDER', process.env.ISSUE_BODY || '');
-          fs.writeFileSync('/tmp/area_prompt_final.txt', prompt);
-          SCRIPT_EOF
-
-          HTTP_CODE=$(curl -s -w "%{http_code}" -o /tmp/area_response.json -X POST "https://api.anthropic.com/v1/messages" \
-            -H "Content-Type: application/json" \
-            -H "x-api-key: $ANTHROPIC_API_KEY" \
-            -H "anthropic-version: 2023-06-01" \
-            --data-binary @- << EOF
-          {
-            "model": "claude-sonnet-4-5-20250929",
-            "max_tokens": 256,
-            "messages": [{"role": "user", "content": $(jq -Rs . < /tmp/area_prompt_final.txt)}]
-          }
-          EOF
-          )
-
-          RESPONSE=$(< /tmp/area_response.json)
-
-          if [ "$HTTP_CODE" -lt 200 ] || [ "$HTTP_CODE" -ge 300 ]; then
-            echo "HTTP Error: $HTTP_CODE"
-            echo "$RESPONSE" | jq . 2>/dev/null || echo "$RESPONSE"
-            exit 1
-          fi
-
-          if echo "$RESPONSE" | jq -e '.error' > /dev/null 2>&1; then
-            echo "API Error:"
-            echo "$RESPONSE" | jq .
-            exit 1
-          fi
-
-          AREA_RESULT=$(echo "$RESPONSE" | jq -r '.content[0].text // empty')
-
-          if [ -z "$AREA_RESULT" ]; then
-            echo "Error: No response from Claude for area detection"
-            echo "$RESPONSE" | jq .
-            exit 1
-          fi
-
-          echo "Area detection result: $AREA_RESULT"
-
-          # Extract just the areas array, handling potential markdown fences
-          # shellcheck disable=SC2016
-          CLEAN_JSON=$(echo "$AREA_RESULT" | sed 's/^```json//; s/^```//; s/```$//' | tr -d '\n')
-          AREAS=$(echo "$CLEAN_JSON" | jq -r '.areas // [] | join(",")')
-          echo "Detected areas: $AREAS"
-
-          echo "detected_areas=$AREAS" >> "$GITHUB_OUTPUT"
-
-          INPUT_TOKENS=$(echo "$RESPONSE" | jq -r '.usage.input_tokens')
-          OUTPUT_TOKENS=$(echo "$RESPONSE" | jq -r '.usage.output_tokens')
-          echo "Pass 1 token usage - Input: $INPUT_TOKENS, Output: $OUTPUT_TOKENS"
-
-      # ========================================================================
-      # Use detected areas to filter magnets and search for candidates
-      # ========================================================================
-      - name: Filter magnets and search for candidates
-        if: steps.fetch-issue.outputs.should_continue == 'true'
-        id: gather-candidates
-        uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
-        with:
-          github-token: ${{ steps.get-app-token.outputs.token }}
-          script: |
-            // ============================================================
-            // KNOWN DUPLICATE MAGNETS (from #46355)
-            // ============================================================
-            const DUPLICATE_MAGNETS = [
-              { number: 37074, title: "Support history with external ACP agents", areas: ["ai", "ai/gemini", "ai/acp"] },
-              { number: 35780, title: "Zed consumes a lot of memory and CPU when opening ~/ or other large file trees", areas: ["workspace", "performance", "performance/memory leak", "integrations/git"] },
-              { number: 16965, title: "Support for non UTF-8 text encodings", areas: ["editor", "internationalization"] },
-              { number: 38109, title: "Zed out of sync with changes made outside of editor", areas: ["workspace"] },
-              { number: 16727, title: "Select text in markdown preview", areas: ["preview/markdown", "languages/markdown"] },
-              { number: 31102, title: "RTL Right-to-Left Text Input/Rendering Support", areas: ["internationalization"] },
-              { number: 7371, title: "Restarts should be non-destructive on workspace restore/reload", areas: ["workspace", "serialization"] },
-              { number: 7992, title: "Font rendering on LoDPI displays", areas: ["ui/font"] },
-              { number: 40018, title: "Windows Beta: Terminal overwrites text when resized and window overflow", areas: ["integrations/terminal"] },
-              { number: 29962, title: "Agent Panel: Cannot access zed hosted models (via Cloudflare HKG)", areas: ["ai", "network"] },
-              { number: 15097, title: "Serialize undo history (local and remote projects)", areas: ["workspace", "serialization"] },
-              { number: 29846, title: "Collapsed code blocks are not restored properly", areas: ["editor", "serialization", "code folding"] },
-              { number: 38799, title: "Poor search performance in large repositories", areas: ["performance", "search"] },
-              { number: 27283, title: "Inefficient memory use when opening large file in Zed", areas: ["performance"] },
-              { number: 39806, title: "Raspberry Pi OS (Trixie) Zed 0.207.3 Video Memory Corruption on Start", areas: ["gpui"] },
-              { number: 29970, title: "Unable to download any extensions (due to potential DigitalOcean IP block or ISP block)", areas: ["network"] },
-              { number: 29026, title: "Ability to copy/paste files from the system file manager", areas: ["workspace"] },
-              { number: 7940, title: "Zed is sometimes unresponsive when the OS awakes from sleep", areas: ["workspace"] },
-              { number: 37025, title: "Failed to generate thread summary", areas: ["ai"] },
-              { number: 16156, title: "Support for project settings to enable/disable/control AI features", areas: ["ai", "settings"] },
-              { number: 24752, title: "Extra horizontal scrolling when inline blame is enabled with soft wrapping", areas: ["editor"] },
-              { number: 20970, title: "Excessive memory consumption on project search with large files present", areas: ["performance/memory leak", "search", "multi-buffer"] },
-              { number: 12176, title: "Only some ligatures are being applied", areas: ["ui/font", "settings"] },
-              { number: 13564, title: "blade: Text is rendered either too thick or too thin", areas: ["ui/font"] },
-              { number: 38901, title: "Terminal freezes in Linux session when Ctrl+C is pressed before exit", areas: ["controls/keybinds", "integrations/terminal"] },
-              { number: 20167, title: "Support unsetting default keybindings", areas: ["controls/keybinds"] },
-              { number: 25469, title: "Tracking - Linux non-QWERTY keyboard support", areas: ["controls/keybinds"] },
-              { number: 29598, title: "Manual refresh on unsupported filesystems (nfs, fuse, exfat) without inotify/fsevents", areas: ["project panel"] },
-              { number: 14428, title: "Ordering of search tokens in file finder fuzzy match", areas: ["file finder"] },
-              { number: 20771, title: "Workspace: Reload to respect the desktop/workspace Zed windows were in after reload", areas: ["workspace", "serialization"] },
-              { number: 7465, title: "Lines with RTL text aren't rendered correctly", areas: ["editor", "internationalization/rtl support", "parity/vscode"] },
-              { number: 16120, title: "Large files without newlines (all on one line) cause Zed to hang/crash", areas: ["editor"] },
-              { number: 22703, title: "Syntax aware folding (folds.scm support)", areas: ["editor", "tree-sitter"] },
-              { number: 38927, title: "Find & Replace memory leak on large files", areas: ["performance", "performance/memory leak"] },
-              { number: 4560, title: "Improve streaming search speed", areas: ["performance", "search"] },
-              { number: 14053, title: "Linux Shortcuts don't work with non-latin / international keyboard layouts", areas: ["internationalization", "controls/keybinds"] },
-              { number: 31637, title: "High memory consumption in Project Search with large codebases", areas: ["performance/memory leak", "search"] },
-              { number: 11744, title: "Incorrect spacing of terminal font", areas: ["ui/font", "integrations/terminal"] },
-              { number: 4746, title: "Terminal Nerd Font rendering incorrect line height", areas: ["ui/font", "integrations/terminal"] },
-              { number: 10647, title: "User configurable mouse bindings (like keymap for key+mouse)", areas: ["controls/keybinds", "controls/mouse", "accessibility"] },
-              { number: 34865, title: "ctrl-w with pane::CloseActiveItem binding closes the project panel instead of the active pane", areas: ["controls/keybinds", "ui/panel"] },
-              { number: 12163, title: "Cannot see list of installed extensions when offline / disconnected", areas: ["network"] },
-              { number: 44630, title: "Tables do not render all columns in markdown preview", areas: ["preview/markdown"] },
-              { number: 39435, title: "Windows: Low fps in many cases", areas: ["gpui"] },
-              { number: 36227, title: "Zed becomes unresponsive when closing", areas: ["workspace"] },
-              { number: 44962, title: "Can not open file in zed if filename includes (1)", areas: ["workspace"] },
-              { number: 32318, title: "Zed hangs after exiting sleep mode in Linux", areas: ["workspace"] },
-              { number: 5120, title: "Add options to hide title and status bar", areas: ["settings", "status bar"] },
-              { number: 29323, title: "uv: Failed to detect Python venv correctly", areas: ["language server", "languages/python", "integrations/environment"] },
-              { number: 7450, title: "Support LSP Semantic Tokens", areas: ["language server", "languages", "ui/themes"] },
-              { number: 31846, title: "LSP: triggerCharacters for signature help declared by servers do not seem to be respected", areas: ["language server"] },
-              { number: 32792, title: "[SWAY] Zed window flashes rapidly on Sway/wlroots", areas: ["gpui"] },
-              { number: 28398, title: "Stale buffers should be removed from search multibuffer", areas: ["search", "multi-buffer"] },
-              { number: 35011, title: "Delete Key against remote Hosts Doesn't Delete Folders", areas: ["project panel"] },
-              { number: 8626, title: "Palette File Navigation - Preview File Content", areas: ["file finder"] },
-              { number: 31468, title: "Certain LSP features are not activated till you trigger them manually when working with a remote project", areas: ["language server/server failure", "autocompletions"] },
-              { number: 9789, title: "Zed checks for LSP updates when offline and disables LSPs irreversibly in the process", areas: ["language server/server failure"] },
-              { number: 21403, title: "Completions and code actions should not use uniform lists", areas: ["autocompletions", "popovers", "diagnostics"] },
-              { number: 15196, title: "Remote Project REPL support", areas: ["repl"] },
-            ];
-
-            const MAX_SEARCHES = 5;
-
-            const issueNumber = parseInt('${{ steps.fetch-issue.outputs.issue_number }}', 10);
-            const title = process.env.ISSUE_TITLE || '';
-            const body = process.env.ISSUE_BODY || '';
-            const detectedAreasStr = '${{ steps.detect-areas.outputs.detected_areas }}';
-            const detectedAreas = new Set(detectedAreasStr.split(',').filter(a => a.trim()));
-
-            console.log(`Detected areas from Claude: ${[...detectedAreas].join(', ') || '(none)'}`);
-
-            // Helper: check if two areas match (handles hierarchy like "ai" matching "ai/gemini")
-            function areasMatch(detected, magnetArea) {
-              if (detected === magnetArea) return true;
-              if (magnetArea.startsWith(detected + '/')) return true;
-              if (detected.startsWith(magnetArea + '/')) return true;
-              return false;
-            }
-
-            // Filter magnets based on detected areas
-            const relevantMagnets = DUPLICATE_MAGNETS.filter(magnet => {
-              if (detectedAreas.size === 0) return true;
-              return magnet.areas.some(magnetArea =>
-                [...detectedAreas].some(detected => areasMatch(detected, magnetArea))
-              );
-            }).slice(0, 20);
-
-            console.log(`Relevant duplicate magnets: ${relevantMagnets.length}`);
-
-            // Build search queries
-            const searchQueries = [];
-            const thirtyDaysAgo = new Date(Date.now() - 30 * 24 * 60 * 60 * 1000).toISOString().split('T')[0];
-
-            // 1. Keyword search from title
-            const stopwords = ['with', 'that', 'this', 'from', 'have', 'been', 'were', 'what', 'when',
-                               'where', 'which', 'while', 'does', 'doesn', 'should', 'would', 'could',
-                               'about', 'after', 'before', 'between', 'into', 'through', 'during',
-                               'above', 'below', 'under', 'again', 'further', 'then', 'once', 'here',
-                               'there', 'some', 'such', 'only', 'same', 'than', 'very', 'just', 'also',
-                               'work', 'working', 'works', 'issue', 'problem', 'error', 'bug', 'zed'];
-            const titleKeywords = title
-              .toLowerCase()
-              .replace(/[^\w\s]/g, ' ')
-              .split(/\s+/)
-              .filter(w => w.length >= 3 && !stopwords.includes(w))
-              .slice(0, 5);
-
-            if (titleKeywords.length >= 2) {
-              searchQueries.push({
-                type: 'keyword',
-                query: `repo:zed-industries/zed is:issue created:>${thirtyDaysAgo} ${titleKeywords.join(' ')}`
-              });
-            }
-
-            // 2. Area-based searches (using Claude-detected areas)
-            for (const area of [...detectedAreas].slice(0, 3)) {
-              searchQueries.push({
-                type: 'area',
-                query: `repo:zed-industries/zed is:issue is:open label:"area:${area}" created:>${thirtyDaysAgo}`
-              });
-            }
-
-            // 3. Look for error patterns in the body
-            const errorPatterns = body.match(/(?:error|panic|crash|failed|exception)[:\s]+[^\n]{10,100}/gi) || [];
-            if (errorPatterns.length > 0) {
-              const errorSnippet = errorPatterns[0]
-                .slice(0, 60)
-                .replace(/[^\w\s]/g, ' ')
-                .replace(/\s+/g, ' ')
-                .trim();
-              if (errorSnippet.length > 15) {
-                searchQueries.push({
-                  type: 'error',
-                  query: `repo:zed-industries/zed is:issue "${errorSnippet.slice(0, 40)}"`
-                });
-              }
-            }
-
-            // Execute searches and collect candidates
-            const candidates = [];
-            const seenIssues = new Set([issueNumber]);
-
-            for (const { type, query } of searchQueries.slice(0, MAX_SEARCHES)) {
-              try {
-                console.log(`Search (${type}): ${query}`);
-                const { data: results } = await github.rest.search.issuesAndPullRequests({
-                  q: query,
-                  sort: 'created',
-                  order: 'desc',
-                  per_page: 10
-                });
-
-                for (const item of results.items) {
-                  if (!seenIssues.has(item.number) && !item.pull_request) {
-                    seenIssues.add(item.number);
-                    candidates.push({
-                      number: item.number,
-                      title: item.title,
-                      state: item.state,
-                      created_at: item.created_at,
-                      body_preview: (item.body || '').slice(0, 800),
-                      source: type
-                    });
-                  }
-                }
-              } catch (error) {
-                console.log(`Search failed (${type}): ${error.message}`);
-              }
-            }
-
-            console.log(`Found ${candidates.length} candidates from searches`);
-
-            // Prepare issue data for Claude
-            const issueData = {
-              number: issueNumber,
-              title: title,
-              body: body.slice(0, 4000),
-            };
-
-            // Prepare output
-            core.setOutput('issue_data', JSON.stringify(issueData));
-            core.setOutput('duplicate_magnets', JSON.stringify(relevantMagnets));
-            core.setOutput('candidates', JSON.stringify(candidates.slice(0, 12)));
-            core.setOutput('detected_areas', [...detectedAreas].join(', '));
-            core.setOutput('should_analyze', (relevantMagnets.length > 0 || candidates.length > 0) ? 'true' : 'false');
-        env:
-          ISSUE_TITLE: ${{ steps.fetch-issue.outputs.issue_title }}
-          ISSUE_BODY: ${{ steps.fetch-issue.outputs.issue_body }}
-
-      # ========================================================================
-      # PASS 2: Analyze duplicates with Claude
-      # ========================================================================
-      - name: "Pass 2: Analyze duplicates with Claude"
-        if: |
-          steps.fetch-issue.outputs.should_continue == 'true' &&
-          steps.gather-candidates.outputs.should_analyze == 'true'
-        id: analyze
-        env:
-          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY_ISSUE_DEDUP }}
-          ISSUE_DATA: ${{ steps.gather-candidates.outputs.issue_data }}
-          DUPLICATE_MAGNETS: ${{ steps.gather-candidates.outputs.duplicate_magnets }}
-          CANDIDATES: ${{ steps.gather-candidates.outputs.candidates }}
-        run: |
-          # shellcheck disable=SC2016
-          cat > /tmp/prompt.txt << 'PROMPT_EOF'
-          You are analyzing a GitHub issue to determine if it might be a duplicate of an existing issue.
-
-          ## New Issue Being Analyzed
-          ISSUE_DATA_PLACEHOLDER
-
-          ## Known Frequently-Duplicated Issues (High Priority)
-          These issues have historically received many duplicate reports. Check these first.
-          DUPLICATE_MAGNETS_PLACEHOLDER
-
-          ## Recent Similar Issues Found by Search
-          CANDIDATES_PLACEHOLDER
-
-          ## Your Task
-
-          1. First, understand what the new issue is about:
-             - What specific bug or problem is being reported?
-             - What error messages, stack traces, or specific behaviors are mentioned?
-             - What component/feature is affected?
-
-          2. Check against the frequently-duplicated issues first (high priority):
-             - These are known "duplicate magnets" that often get re-reported
-             - If the new issue describes the same problem, it's likely a duplicate
-
-          3. Then check the recent similar issues:
-             - Look for issues describing the SAME bug, not just related topics
-
-          ## Duplicate Criteria (be strict!)
-
-          An issue IS a duplicate if:
-          - It describes the EXACT same bug with the same root cause
-          - It has the same error message or stack trace
-          - It has the same reproduction steps leading to the same outcome
-
-          An issue is NOT a duplicate if:
-          - It's merely related to the same feature/area
-          - It has similar symptoms but potentially different causes
-          - It mentions similar things but describes a different problem
-
-          Be VERY conservative. It's better to miss a duplicate than to incorrectly flag a unique issue.
-
-          ## Response Format
-
-          Return ONLY a JSON object (no markdown fences, no explanation before or after):
-          {
-            "is_potential_duplicate": boolean,
-            "confidence": "high" | "medium" | "low" | "none",
-            "potential_duplicates": [
-              {"number": integer, "title": "string", "similarity_reason": "string explaining why this might be the same bug"}
-            ],
-            "analysis_summary": "Brief explanation of what the new issue is about and your conclusion",
-            "recommendation": "flag_as_duplicate" | "needs_human_review" | "not_a_duplicate"
-          }
-          PROMPT_EOF
-
-          # Single quotes are intentional to prevent bash expansion; node reads env vars via process.env
-          # shellcheck disable=SC2016
-          node << 'SCRIPT_EOF'
-          const fs = require('fs');
-
-          let prompt = fs.readFileSync('/tmp/prompt.txt', 'utf8');
-          prompt = prompt.replace('ISSUE_DATA_PLACEHOLDER', process.env.ISSUE_DATA);
-          prompt = prompt.replace('DUPLICATE_MAGNETS_PLACEHOLDER', process.env.DUPLICATE_MAGNETS);
-          prompt = prompt.replace('CANDIDATES_PLACEHOLDER', process.env.CANDIDATES);
-
-          fs.writeFileSync('/tmp/prompt_final.txt', prompt);
-          SCRIPT_EOF
-
-          HTTP_CODE=$(curl -s -w "%{http_code}" -o /tmp/response.json -X POST "https://api.anthropic.com/v1/messages" \
-            -H "Content-Type: application/json" \
-            -H "x-api-key: $ANTHROPIC_API_KEY" \
-            -H "anthropic-version: 2023-06-01" \
-            --data-binary @- << EOF
-          {
-            "model": "claude-sonnet-4-5-20250929",
-            "max_tokens": 1024,
-            "messages": [{"role": "user", "content": $(jq -Rs . < /tmp/prompt_final.txt)}]
-          }
-          EOF
-          )
-
-          RESPONSE=$(< /tmp/response.json)
-
-          if [ "$HTTP_CODE" -lt 200 ] || [ "$HTTP_CODE" -ge 300 ]; then
-            echo "HTTP Error: $HTTP_CODE"
-            echo "$RESPONSE" | jq . 2>/dev/null || echo "$RESPONSE"
-            exit 1
-          fi
-
-          if echo "$RESPONSE" | jq -e '.error' > /dev/null 2>&1; then
-            echo "API Error:"
-            echo "$RESPONSE" | jq .
-            exit 1
-          fi
-
-          ANALYSIS=$(echo "$RESPONSE" | jq -r '.content[0].text // empty')
-
-          if [ -z "$ANALYSIS" ]; then
-            echo "Error: No response from Claude"
-            echo "$RESPONSE" | jq .
-            exit 1
-          fi
-
-          {
-            echo "analysis<<ANALYSIS_EOF"
-            echo "$ANALYSIS"
-            echo "ANALYSIS_EOF"
-          } >> "$GITHUB_OUTPUT"
-
-          INPUT_TOKENS=$(echo "$RESPONSE" | jq -r '.usage.input_tokens')
-          OUTPUT_TOKENS=$(echo "$RESPONSE" | jq -r '.usage.output_tokens')
-          echo "Pass 2 token usage - Input: $INPUT_TOKENS, Output: $OUTPUT_TOKENS"
-
-      # ========================================================================
-      # Log results
-      # ========================================================================
-      - name: Log analysis results
-        if: |
-          steps.fetch-issue.outputs.should_continue == 'true' &&
-          !cancelled()
-        uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
-        with:
-          script: |
-            const issueNumber = parseInt('${{ steps.fetch-issue.outputs.issue_number }}', 10) || 0;
-            const issueTitle = process.env.ISSUE_TITLE || '';
-            const detectedAreas = '${{ steps.gather-candidates.outputs.detected_areas }}' || '(none)';
-            const shouldAnalyze = '${{ steps.gather-candidates.outputs.should_analyze }}' === 'true';
-            const analysisRaw = process.env.ANALYSIS_OUTPUT || '';
-
-            console.log('='.repeat(60));
-            console.log('DUPLICATE DETECTION RESULTS (TWO-PASS)');
-            console.log('='.repeat(60));
-            console.log(`Issue: #${issueNumber} - ${issueTitle}`);
-            console.log(`URL: https://github.com/zed-industries/zed/issues/${issueNumber}`);
-            console.log(`Detected Areas: ${detectedAreas}`);
-
-            if (!shouldAnalyze) {
-              console.log('\nNo duplicate magnets or candidates found - skipping analysis');
-              core.summary.addHeading(`✅ Issue #${issueNumber}: No similar issues found`, 2);
-              core.summary.addRaw(`\n**Title:** ${issueTitle}\n\n`);
-              core.summary.addRaw(`**Detected Areas:** ${detectedAreas}\n\n`);
-              core.summary.addRaw('No potential duplicates were found by search or in the known duplicate magnets list.\n');
-              await core.summary.write();
-              return;
-            }
-
-            if (!analysisRaw) {
-              console.log('\nNo analysis output received');
-              core.summary.addHeading(`⚠️ Issue #${issueNumber}: Analysis incomplete`, 2);
-              core.summary.addRaw(`**Detected Areas:** ${detectedAreas}\n\n`);
-              core.summary.addRaw('The Claude analysis step did not produce output. Check workflow logs.\n');
-              await core.summary.write();
-              return;
-            }
-
-            try {
-              let cleanJson = analysisRaw.trim();
-              if (cleanJson.startsWith('```')) {
-                cleanJson = cleanJson.replace(/^```(?:json)?\n?/, '').replace(/\n?```$/, '');
-              }
-
-              const analysis = JSON.parse(cleanJson);
-
-              console.log(`\nIs Potential Duplicate: ${analysis.is_potential_duplicate}`);
-              console.log(`Confidence: ${analysis.confidence}`);
-              console.log(`Recommendation: ${analysis.recommendation}`);
-              console.log(`\nAnalysis Summary:\n${analysis.analysis_summary}`);
-
-              if (analysis.potential_duplicates && analysis.potential_duplicates.length > 0) {
-                console.log(`\nPotential Duplicates Found: ${analysis.potential_duplicates.length}`);
-                for (const dup of analysis.potential_duplicates) {
-                  console.log(`  - #${dup.number}: ${dup.title}`);
-                  console.log(`    Reason: ${dup.similarity_reason}`);
-                }
-              } else {
-                console.log('\nNo potential duplicates identified by analysis.');
-              }
-
-              console.log('\n' + '='.repeat(60));
-
-              const summaryIcon = analysis.is_potential_duplicate ? '⚠️' : '✅';
-              const summaryText = analysis.is_potential_duplicate
-                ? `Potential duplicate detected (${analysis.confidence} confidence)`
-                : 'No likely duplicates found';
-
-              core.summary.addHeading(`${summaryIcon} Issue #${issueNumber}: ${summaryText}`, 2);
-              core.summary.addRaw(`\n**Title:** ${issueTitle}\n\n`);
-              core.summary.addRaw(`**Detected Areas:** ${detectedAreas}\n\n`);
-              core.summary.addRaw(`**Recommendation:** \`${analysis.recommendation}\`\n\n`);
-              core.summary.addRaw(`**Summary:** ${analysis.analysis_summary}\n\n`);
-
-              if (analysis.potential_duplicates && analysis.potential_duplicates.length > 0) {
-                core.summary.addHeading('Potential Duplicates', 3);
-                const rows = analysis.potential_duplicates.map(d => [
-                  `[#${d.number}](https://github.com/zed-industries/zed/issues/${d.number})`,
-                  d.title.slice(0, 60) + (d.title.length > 60 ? '...' : ''),
-                  d.similarity_reason
-                ]);
-                core.summary.addTable([
-                  [{data: 'Issue', header: true}, {data: 'Title', header: true}, {data: 'Similarity Reason', header: true}],
-                  ...rows
-                ]);
-              }
-
-              await core.summary.write();
-
-            } catch (e) {
-              console.log('Failed to parse analysis output:', e.message);
-              console.log('Raw output:', analysisRaw);
-              core.summary.addHeading(`⚠️ Issue #${issueNumber}: Failed to parse analysis`, 2);
-              core.summary.addRaw(`**Detected Areas:** ${detectedAreas}\n\n`);
-              core.summary.addRaw(`Error: ${e.message}\n\nRaw output:\n\`\`\`\n${analysisRaw.slice(0, 1000)}\n\`\`\``);
-              await core.summary.write();
-            }
-        env:
-          ISSUE_TITLE: ${{ steps.fetch-issue.outputs.issue_title }}
-          ANALYSIS_OUTPUT: ${{ steps.analyze.outputs.analysis }}

.github/workflows/update_duplicate_magnets.yml 🔗

@@ -0,0 +1,27 @@
name: Update Duplicate Magnets Issue

on:
  schedule:
    - cron: "0 6 * * 1,4" # Mondays and Thursdays at 6 AM UTC
  workflow_dispatch:

jobs:
  update-duplicate-magnets:
    runs-on: ubuntu-latest
    # Only run on the main repository, not on forks
    if: github.repository == 'zed-industries/zed'
    # Bound the job so a hung API call cannot hold the runner indefinitely
    timeout-minutes: 10
    # Restrict the default token to what the script needs: reading repo
    # data and editing the tracking issue
    permissions:
      contents: read
      issues: write
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4

      - name: Set up Python
        uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5
        with:
          python-version: "3.12"

      - name: Install dependencies
        run: pip install requests

      - name: Update duplicate magnets issue
        # Quote the interpolated secret so the shell never word-splits it
        run: |
          python script/github-find-top-duplicated-bugs.py \
            --github-token "${{ secrets.GITHUB_TOKEN }}" \
            --issue-number 46355

script/github-check-new-issue-for-duplicates.py 🔗

@@ -0,0 +1,534 @@
+#!/usr/bin/env python3
+"""
+Comment on newly opened issues that might be duplicates of an existing issue.
+
+This script is run by a GitHub Actions workflow when a new bug or crash report
+is opened. It:
+1. Checks eligibility (must be bug/crash type, non-staff author)
+2. Detects relevant areas using Claude + the area label taxonomy
+3. Parses known "duplicate magnets" from tracking issue #46355
+4. Searches for similar recent issues by title keywords, area labels, and error patterns
+5. Asks Claude to analyze potential duplicates (magnets + search results)
+6. Posts a comment on the issue if high-confidence duplicates are found
+
+Requires:
+    requests (pip install requests)
+
+Usage:
+    python github-check-new-issue-for-duplicates.py <issue_number>
+
+Environment variables:
+    GITHUB_TOKEN       - GitHub token (org members: read, issues: read & write)
+    ANTHROPIC_API_KEY  - Anthropic API key for Claude
+
+"""
+
+import argparse
+import json
+import os
+import re
+import sys
+from datetime import datetime, timedelta
+
+import requests
+
+GITHUB_API = "https://api.github.com"
+REPO_OWNER = "zed-industries"
+REPO_NAME = "zed"
+TRACKING_ISSUE_NUMBER = 46355
+STAFF_TEAM_SLUG = "staff"
+
+# area prefixes to collapse in taxonomy (show summary instead of all sub-labels)
+PREFIXES_TO_COLLAPSE = ["languages", "parity", "tooling"]
+
+# stopwords to filter from title keyword searches (short words handled by len > 2 filter)
+STOPWORDS = {
+    "after", "all", "also", "and", "any", "but", "can't", "does", "doesn't",
+    "don't", "for", "from", "have", "just", "not", "only", "some", "that",
+    "the", "this", "when", "while", "with", "won't", "work", "working", "zed",
+}
+
+
def log(message):
    """Write a line to stderr so it doesn't interfere with JSON output on stdout."""
    sys.stderr.write(f"{message}\n")
+
+
def github_api_get(path, params=None):
    """Fetch JSON from the GitHub API. Raises on non-2xx status or timeout.

    NOTE(review): relies on the module-level GITHUB_HEADERS dict, which is
    only assigned in the __main__ block — callers importing this module must
    set it up first.
    """
    url = f"{GITHUB_API}/{path.lstrip('/')}"
    # An explicit timeout prevents a stalled connection from hanging the
    # CI job until the workflow-level timeout kills it.
    response = requests.get(url, headers=GITHUB_HEADERS, params=params, timeout=30)
    response.raise_for_status()
    return response.json()
+
+
def github_search_issues(query, per_page=15):
    """Search issues, returning most recently created first."""
    search_params = {
        "q": query,
        "sort": "created",
        "order": "desc",
        "per_page": per_page,
    }
    return github_api_get("/search/issues", search_params).get("items", [])
+
+
def check_team_membership(org, team_slug, username):
    """Check if user is an active member of a team."""
    membership_path = f"/orgs/{org}/teams/{team_slug}/memberships/{username}"
    try:
        membership = github_api_get(membership_path)
    except requests.HTTPError as error:
        # GitHub answers 404 both for non-members and unknown users
        if error.response.status_code == 404:
            return False
        raise
    return membership.get("state") == "active"
+
+
def post_comment(issue_number: int, body):
    """Post a comment with the given body on the issue. Raises on non-2xx status."""
    url = f"{GITHUB_API.rstrip('/')}/repos/{REPO_OWNER}/{REPO_NAME}/issues/{issue_number}/comments"
    # Timeout guards against a hung connection stalling the CI job.
    response = requests.post(url, headers=GITHUB_HEADERS, json={"body": body}, timeout=30)
    response.raise_for_status()
    log(f"  Posted comment on #{issue_number}")
+
+
def build_duplicate_comment(matches):
    """Build the comment body for potential duplicates.

    Each match must carry "number" and "explanation" keys.
    """
    bullet_lines = []
    explanation_blocks = []
    for match in matches:
        bullet_lines.append(f"- #{match['number']}")
        explanation_blocks.append(f"**#{match['number']}:** {match['explanation']}")

    match_list = "\n".join(bullet_lines)
    explanations = "\n\n".join(explanation_blocks)

    return f"""This issue appears to be a duplicate of:

{match_list}

**If this is indeed a duplicate:**
Please close this issue and subscribe to the linked issue for updates (select "Close as not planned" → "Duplicate")

**If this is a different issue:**
No action needed. A maintainer will review this shortly.

<details>
<summary>Why were these issues selected?</summary>

{explanations}

</details>

---
<sub>This is an automated analysis and might be incorrect.</sub>"""
+
+
def call_claude(api_key, system, user_content, max_tokens=1024):
    """Send a message to Claude and return the text response.

    Raises on non-2xx status or timeout. Returns "" when the response has no
    leading text content block.
    """
    response = requests.post(
        "https://api.anthropic.com/v1/messages",
        headers={
            "x-api-key": api_key,
            "anthropic-version": "2023-06-01",
            "content-type": "application/json",
        },
        json={
            "model": "claude-sonnet-4-20250514",
            "max_tokens": max_tokens,
            # temperature 0 keeps the analysis as deterministic as possible
            "temperature": 0.0,
            "system": system,
            "messages": [{"role": "user", "content": user_content}],
        },
        # Generation can be slow; cap the wait so a stalled request cannot
        # hang the CI job indefinitely.
        timeout=120,
    )
    response.raise_for_status()
    data = response.json()

    usage = data.get("usage", {})
    log(f"  Token usage - Input: {usage.get('input_tokens', 'N/A')}, Output: {usage.get('output_tokens', 'N/A')}")

    content = data.get("content", [])
    if content and content[0].get("type") == "text":
        return content[0].get("text") or ""
    return ""
+
+
def fetch_issue(issue_number: int):
    """Fetch issue from GitHub and return as a dict."""
    log(f"Fetching issue #{issue_number}")

    raw = github_api_get(f"/repos/{REPO_OWNER}/{REPO_NAME}/issues/{issue_number}")
    user = raw.get("user") or {}
    issue_type = raw.get("type") or {}
    issue = {
        "number": issue_number,
        "title": raw["title"],
        "body": raw.get("body") or "",
        "author": user.get("login") or "",
        "type": issue_type.get("name"),
    }

    log(f"  Title: {issue['title']}\n  Type: {issue['type']}\n  Author: {issue['author']}")
    return issue
+
+
def should_skip(issue):
    """Check if issue should be skipped in duplicate detection process."""
    issue_type = issue["type"]
    if issue_type not in ("Bug", "Crash"):
        log(f"  Skipping: issue type '{issue_type}' is not a bug/crash report")
        return True

    author = issue["author"]
    if author and check_team_membership(REPO_OWNER, STAFF_TEAM_SLUG, author):
        log(f"  Skipping: author '{author}' is a {STAFF_TEAM_SLUG} member")
        return True

    return False
+
+
def fetch_area_labels():
    """Fetch area:* labels from the repository. Returns list of {name, description} dicts."""
    log("Fetching area labels")

    all_labels = []
    page = 1
    while True:
        batch = github_api_get(
            f"/repos/{REPO_OWNER}/{REPO_NAME}/labels",
            params={"per_page": 100, "page": page},
        )
        if not batch:
            break
        all_labels.extend(batch)
        page += 1

    prefix = "area:"
    area_labels = []
    for label in all_labels:
        if label["name"].startswith(prefix):
            area_labels.append({
                # strip the "area:" prefix from the label name
                "name": label["name"][len(prefix):],
                "description": label.get("description") or "",
            })

    log(f"  Found {len(area_labels)} area labels")
    return area_labels
+
+
def format_taxonomy_for_claude(area_labels, prefixes_to_collapse=None):
    """Format area labels into a string for Claude, collapsing certain prefixes.

    Args:
        area_labels: list of {"name", "description"} dicts (names without the
            "area:" prefix, as returned by fetch_area_labels()).
        prefixes_to_collapse: prefixes whose sub-labels are summarized as a
            single "prefix/*" line. Defaults to the module-level
            PREFIXES_TO_COLLAPSE (backward compatible).

    Returns:
        Newline-joined, sorted taxonomy lines; a set deduplicates the
        collapsed-prefix summary lines.
    """
    if prefixes_to_collapse is None:
        prefixes_to_collapse = PREFIXES_TO_COLLAPSE

    lines = set()

    for area in area_labels:
        name = area["name"]
        collapsible_prefix = next(
            (p for p in prefixes_to_collapse if name.startswith(f"{p}/")), None)

        if collapsible_prefix:
            # one summary line per collapsed prefix instead of every sub-label
            lines.add(f"- {collapsible_prefix}/* (multiple specific sub-labels exist)")
        else:
            desc = area["description"]
            lines.add(f"- {name}: {desc}" if desc else f"- {name}")

    return "\n".join(sorted(lines))
+
+
def detect_areas(anthropic_key, issue, taxonomy):
    """Use Claude to detect relevant areas for the issue."""
    log("Detecting areas with Claude")

    system_prompt = """You analyze GitHub issues to identify which area labels apply.

Given an issue and a taxonomy of areas, output ONLY a comma-separated list of matching area names.
- Output at most 3 areas, ranked by relevance
- Use exact area names from the taxonomy
- If no areas clearly match, output: none
- For languages/*, tooling/*, or parity/*, use the specific sub-label (e.g., "languages/rust",
tooling/eslint, parity/vscode)

Example outputs:
- "editor, parity/vim"
- "ai, ai/agent panel"
- "none"
"""

    user_content = f"""## Area Taxonomy
{taxonomy}

# Issue Title
{issue['title']}

# Issue Body
{issue['body'][:4000]}"""

    answer = call_claude(anthropic_key, system_prompt, user_content, max_tokens=100).strip()
    log(f"  Detected areas: {answer}")

    # "none" is the model's explicit no-match sentinel
    if answer.lower() == "none":
        return []
    return [area.strip() for area in answer.split(",")]
+
+
def parse_duplicate_magnets():
    """Parse known duplicate magnets from tracking issue #46355.

    Returns a list of magnets sorted by duplicate count (most duplicated first).
    Magnets only have number, areas, and dupe_count — use enrich_magnets() to fetch
    title and body_preview for the ones you need.
    """
    log(f"Parsing duplicate magnets from #{TRACKING_ISSUE_NUMBER}")

    tracking_issue = github_api_get(f"/repos/{REPO_OWNER}/{REPO_NAME}/issues/{TRACKING_ISSUE_NUMBER}")
    body = tracking_issue.get("body") or ""

    # Body format:
    #   ## area_name
    #   -   [N dupes] https://github.com/zed-industries/zed/issues/NUMBER
    magnets_by_number = {}
    current_area = None

    for line in body.split("\n"):
        if line.startswith("## "):
            current_area = line[3:].strip()
            continue

        is_magnet_line = current_area and line.startswith("-") and "/issues/" in line
        if not is_magnet_line:
            continue

        # extract the dupe count from "[N dupes]" and the issue number from the URL
        try:
            dupe_count = int(line.split("[")[1].split()[0])
            number = int(line.split("/issues/")[1].split()[0].rstrip(")"))
        except (ValueError, IndexError):
            continue

        # "(unlabeled)" magnets carry no area so they match every issue
        unlabeled = current_area == "(unlabeled)"

        existing = magnets_by_number.get(number)
        if existing is None:
            magnets_by_number[number] = {
                "number": number,
                "areas": [] if unlabeled else [current_area],
                "dupe_count": dupe_count,
            }
        elif not unlabeled:
            existing["areas"].append(current_area)

    magnet_list = sorted(magnets_by_number.values(), key=lambda m: m["dupe_count"], reverse=True)
    log(f"  Parsed {len(magnet_list)} duplicate magnets")
    return magnet_list
+
+
def enrich_magnets(magnets):
    """Fetch title and body_preview (first 500 chars) for magnets from the API.

    Mutates the magnet dicts in place.
    """
    log(f"  Fetching details for {len(magnets)} magnets")
    for magnet in magnets:
        details = github_api_get(f"/repos/{REPO_OWNER}/{REPO_NAME}/issues/{magnet['number']}")
        magnet["title"] = details["title"]
        magnet["body_preview"] = (details.get("body") or "")[:500]
+
+
def areas_match(detected, magnet_area):
    """Check if detected area matches magnet area. Matches broadly across hierarchy levels."""
    if detected == magnet_area:
        return True
    # parent/child matches in either direction, e.g. "languages" matches
    # "languages/rust" and vice versa
    return magnet_area.startswith(detected + "/") or detected.startswith(magnet_area + "/")
+
+
def filter_magnets_by_areas(magnets, detected_areas):
    """Filter magnets based on detected areas.

    With no detected areas, all magnets are kept.
    """
    if not detected_areas:
        return magnets

    def is_relevant(magnet):
        magnet_areas = magnet["areas"]
        # unlabeled magnets (empty areas) match everything
        if not magnet_areas:
            return True
        for detected in detected_areas:
            for magnet_area in magnet_areas:
                if areas_match(detected, magnet_area):
                    return True
        return False

    return [magnet for magnet in magnets if is_relevant(magnet)]
+
+
def search_for_similar_issues(issue, detected_areas, max_searches=6):
    """Search for similar issues that might be duplicates.

    Searches by title keywords, area labels (last 60 days), and error patterns.
    max_searches caps the total number of queries to keep token usage and context size under control.

    Returns a list of dicts with number/title/state/created_at/body_preview/source,
    deduplicated across the individual searches.
    """
    log("Searching for similar issues")

    sixty_days_ago = (datetime.now() - timedelta(days=60)).strftime("%Y-%m-%d")
    base_query = f"repo:{REPO_OWNER}/{REPO_NAME} is:issue is:open"
    seen_issues = {}
    queries = []

    title_keywords = [word for word in issue["title"].split() if word.lower() not in STOPWORDS and len(word) > 2]

    if title_keywords:
        keywords_query = " ".join(title_keywords)
        queries.append(("title_keywords", f"{base_query} {keywords_query}"))

    for area in detected_areas:
        queries.append(("area_label", f'{base_query} label:"area:{area}" created:>{sixty_days_ago}'))

    # error pattern search: capture 5–90 chars after keyword, colon optional
    error_pattern = r"(?i:\b(?:error|panicked|panic|failed)\b)\s*([^\n]{5,90})"
    match = re.search(error_pattern, issue["body"])
    if match:
        # Drop embedded double quotes: the snippet is wrapped in a quoted
        # search term below, and a stray " would terminate the term early
        # and produce a malformed search query.
        error_snippet = match.group(1).strip().replace('"', "")
        if error_snippet:
            queries.append(("error_pattern", f'{base_query} in:body "{error_snippet}"'))

    for search_type, query in queries[:max_searches]:
        log(f"  Search ({search_type}): {query}")
        try:
            results = github_search_issues(query, per_page=15)
            for item in results:
                number = item["number"]
                # skip the issue under analysis and issues already collected
                if number != issue["number"] and number not in seen_issues:
                    body = item.get("body") or ""
                    seen_issues[number] = {
                        "number": number,
                        "title": item["title"],
                        "state": item.get("state", ""),
                        "created_at": item.get("created_at", ""),
                        "body_preview": body[:500],
                        "source": search_type,
                    }
        except requests.RequestException as e:
            # best-effort: a failed search shouldn't abort duplicate detection
            log(f"  Search failed: {e}")

    similar_issues = list(seen_issues.values())
    log(f"  Found {len(similar_issues)} similar issues")
    return similar_issues
+
+
def analyze_duplicates(anthropic_key, issue, magnets, search_results):
    """Use Claude to analyze potential duplicates.

    Sends the new issue plus up to 10 enriched magnets and up to 10 search
    results to Claude. Returns (matches, summary); each match is expected to
    have "number", "confidence", and "explanation" keys, though that is not
    guaranteed if the model misbehaves.
    """
    log("Analyzing duplicates with Claude")

    top_magnets = magnets[:10]
    enrich_magnets(top_magnets)
    magnet_numbers = {m["number"] for m in top_magnets}

    candidates = [
        {"number": m["number"], "title": m["title"], "body_preview": m["body_preview"], "source": "known_duplicate_magnet"}
        for m in top_magnets
    ] + [
        {"number": r["number"], "title": r["title"], "body_preview": r["body_preview"], "source": "search_result"}
        for r in search_results[:10]
        if r["number"] not in magnet_numbers
    ]

    if not candidates:
        return [], "No candidates to analyze"

    system_prompt = """You analyze GitHub issues to identify potential duplicates.

Given a new issue and a list of existing issues, identify which existing issues might be duplicates.

For each potential duplicate, assess confidence:
- "high": Very likely the same issue (same root cause, same symptoms)
- "medium": Possibly related (likely to be the same root cause)
- Do NOT include tangentially related issues (same general area but probably different issues)

Output only valid JSON (no markdown code blocks) with this structure:
{
  "matches": [
    {
      "number": 12345,
      "confidence": "high|medium",
      "explanation": "Brief explanation of why this might be a duplicate"
    }
  ],
  "summary": "One sentence summary of findings"
}

Only include matches with "high" or "medium" confidence. Return empty matches array if none found."""

    user_content = f"""## New Issue #{issue['number']}
**Title:** {issue['title']}

**Body:**
{issue['body'][:3000]}

## Existing Issues to Compare
{json.dumps(candidates, indent=2)}"""

    response = call_claude(anthropic_key, system_prompt, user_content, max_tokens=2048)

    # Despite the "no markdown code blocks" instruction, models occasionally
    # wrap JSON output in ``` fences; strip them before parsing so a
    # well-formed answer isn't discarded.
    cleaned = response.strip()
    if cleaned.startswith("```"):
        cleaned = re.sub(r"^```(?:json)?\n?", "", cleaned)
        cleaned = re.sub(r"\n?```$", "", cleaned)

    try:
        data = json.loads(cleaned)
    except json.JSONDecodeError as e:
        log(f"  Failed to parse response: {e}")
        log(f"  Raw response: {response}")
        return [], "Failed to parse analysis"

    matches = data.get("matches", [])
    summary = data.get("summary", "Analysis complete")
    log(f"  Found {len(matches)} potential matches")
    return matches, summary
+
+
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Identify potential duplicate issues")
    parser.add_argument("issue_number", type=int, help="Issue number to analyze")
    parser.add_argument("--dry-run", action="store_true", help="Skip posting comment, just log what would be posted")
    args = parser.parse_args()

    github_token = os.environ.get("GITHUB_TOKEN")
    anthropic_key = os.environ.get("ANTHROPIC_API_KEY")

    if not github_token:
        log("Error: GITHUB_TOKEN not set")
        sys.exit(1)
    if not anthropic_key:
        log("Error: ANTHROPIC_API_KEY not set")
        sys.exit(1)

    # Module-level headers consumed by the GitHub API helpers above.
    GITHUB_HEADERS = {
        "Authorization": f"Bearer {github_token}",
        "Accept": "application/vnd.github+json",
        "X-GitHub-Api-Version": "2022-11-28",
    }

    issue = fetch_issue(args.issue_number)
    if should_skip(issue):
        print(json.dumps({"skipped": True}))
        sys.exit(0)

    # detect areas
    taxonomy = format_taxonomy_for_claude(fetch_area_labels())
    detected_areas = detect_areas(anthropic_key, issue, taxonomy)

    # search for potential duplicates
    all_magnets = parse_duplicate_magnets()
    relevant_magnets = filter_magnets_by_areas(all_magnets, detected_areas)
    search_results = search_for_similar_issues(issue, detected_areas)

    # analyze potential duplicates
    if relevant_magnets or search_results:
        matches, summary = analyze_duplicates(anthropic_key, issue, relevant_magnets, search_results)
    else:
        matches, summary = [], "No potential duplicates to analyze"

    # Matches come from model-generated JSON, so use .get() instead of
    # indexing: a match missing the "confidence" key must not crash the
    # whole run with a KeyError.
    high_confidence_matches = [m for m in matches if m.get("confidence") == "high"]
    commented = False

    if high_confidence_matches:
        comment_body = build_duplicate_comment(high_confidence_matches)
        if args.dry_run:
            log("Dry run - would post comment:\n" + "-" * 40 + "\n" + comment_body + "\n" + "-" * 40)
        else:
            log("Posting comment for high-confidence match(es)")
            try:
                post_comment(issue["number"], comment_body)
                commented = True
            except requests.RequestException as e:
                # best-effort: still emit the JSON result below
                log(f"  Failed to post comment: {e}")

    # Machine-readable result on stdout (all logging goes to stderr via log()).
    print(json.dumps({
        "skipped": False,
        "issue": {
            "number": issue["number"],
            "title": issue["title"],
            "author": issue["author"],
            "type": issue["type"],
        },
        "detected_areas": detected_areas,
        "magnets_count": len(relevant_magnets),
        "search_results_count": len(search_results),
        "matches": matches,
        "summary": summary,
        "commented": commented,
    }))

script/github-find-top-duplicated-bugs.py 🔗

@@ -0,0 +1,223 @@
+#!/usr/bin/env python3
+"""
+Find open issues that have the most duplicates filed against them and update
+a GitHub issue with the results.
+
+Queries open issues and looks for MarkedAsDuplicateEvent in their timelines.
+Only includes issues that have been re-reported at least twice (2+ duplicates
+closed against them). Groups results by area: label. The output is formatted
+as markdown with issue URLs (GitHub renders the titles automatically).
+
+This script is run regularly by the update_duplicate_magnets.yml workflow.
+
+Requires: requests (pip install requests)
+GitHub token permissions: issues:write
+
+Usage:
+    # Print to stdout only for testing:
+    python github-find-top-duplicated-bugs.py --github-token ghp_xxx
+
+    # Update a GitHub issue:
+    python github-find-top-duplicated-bugs.py --github-token ghp_xxx --issue-number 46355
+"""
+
+import argparse
+import os
+import sys
+from collections import Counter, defaultdict
+
+import requests
+
+OWNER = "zed-industries"
+REPO = "zed"
+
+GRAPHQL_URL = "https://api.github.com/graphql"
+REST_API_URL = "https://api.github.com"
+
+headers = None
+
+ISSUES_WITH_DUPLICATES_QUERY = """
+query($owner: String!, $repo: String!, $cursor: String) {
+  repository(owner: $owner, name: $repo) {
+    issues(
+      first: 100
+      after: $cursor
+      states: [OPEN]
+      orderBy: {field: UPDATED_AT, direction: DESC}
+    ) {
+      pageInfo {
+        hasNextPage
+        endCursor
+      }
+      nodes {
+        number
+        url
+        labels(first: 20) {
+          nodes {
+            name
+          }
+        }
+        timelineItems(first: 100, itemTypes: [MARKED_AS_DUPLICATE_EVENT]) {
+          nodes {
+            ... on MarkedAsDuplicateEvent {
+              duplicate {
+                ... on Issue {
+                  number
+                  state
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+"""
+
+
def extract_duplicate_info(issue):
    """Extract duplicate count and info from an issue. Returns None if < 2 duplicates."""
    # Collect the numbers of issues that were closed as duplicates of this one.
    closed_dupes = set()
    for item in issue["timelineItems"]["nodes"]:
        try:
            dup = item["duplicate"]
            if dup["state"] == "CLOSED":
                closed_dupes.add(dup["number"])
        except (KeyError, TypeError):
            # Timeline entry without usable duplicate data — ignore it.
            continue

    # Only issues re-reported at least twice are interesting.
    if len(closed_dupes) < 2:
        return None

    label_names = (label["name"] for label in issue["labels"]["nodes"])
    areas = [name.replace("area:", "") for name in label_names if name.startswith("area:")]

    return {
        "number": issue["number"],
        "url": issue["url"],
        "areas": areas or ["(unlabeled)"],
        "duplicate_count": len(closed_dupes),
    }
+
+
def fetch_canonical_issues_with_duplicates(max_pages=100):
    """Fetch open issues and count how many duplicates point to each.

    Pages through the repository's open issues (most recently updated first)
    and keeps those with 2+ closed duplicates (see extract_duplicate_info).
    Stops early on GraphQL errors, returning whatever was collected so far.

    Args:
        max_pages: upper bound on GraphQL pages fetched (100 issues/page).

    Returns:
        List of info dicts as produced by extract_duplicate_info.
    """
    print(f"Finding open issues with the most duplicates in {OWNER}/{REPO}")

    cursor = None
    duplicate_magnets = []
    total_issues_scanned = 0

    for page in range(max_pages):
        response = requests.post(
            GRAPHQL_URL,
            headers=headers,
            json={
                "query": ISSUES_WITH_DUPLICATES_QUERY,
                "variables": {"owner": OWNER, "repo": REPO, "cursor": cursor},
            },
            # Without a timeout a stalled connection hangs the job until the
            # workflow-level timeout kills it; fail fast instead.
            timeout=30,
        )
        response.raise_for_status()
        data = response.json()

        if "errors" in data:
            print(f"GraphQL errors: {data['errors']}")
            break

        issues = data["data"]["repository"]["issues"]
        total_issues_scanned += len(issues["nodes"])

        for issue in issues["nodes"]:
            if info := extract_duplicate_info(issue):
                duplicate_magnets.append(info)

        page_info = issues["pageInfo"]
        if not page_info["hasNextPage"]:
            print(f"Done: scanned {total_issues_scanned} open issues")
            break
        cursor = page_info["endCursor"]

        print(
            f"Page {page + 1}: scanned {total_issues_scanned} open issues, "
            f"{len(duplicate_magnets)} have duplicates"
        )

    return duplicate_magnets
+
+
def build_markdown_body(duplicate_magnets):
    """Group results by area and build markdown body for the GitHub issue.

    Areas are ordered by their total duplicate count (descending); within an
    area, issues are ordered by their own duplicate count (descending).

    NOTE: the output format is parsed by fetch_duplicate_magnets() in
    github-check-new-issue-for-duplicates.py — update that if you change this.
    """
    grouped = defaultdict(list)
    dupes_per_area = Counter()
    for entry in duplicate_magnets:
        for area in entry["areas"]:
            grouped[area].append(entry)
            dupes_per_area[area] += entry["duplicate_count"]

    parts = [
        "These are the issues that are frequently re-reported. "
        "The list is generated regularly by running a script."
    ]

    for area, _total in dupes_per_area.most_common():
        parts.extend(["", f"## {area}", ""])
        ranked = sorted(grouped[area], key=lambda e: e["duplicate_count"], reverse=True)
        parts.extend(
            f"-   [{entry['duplicate_count']:2d} dupes] {entry['url']}"
            for entry in ranked
        )

    return "\n".join(parts)
+
+
def update_github_issue(issue_number, body):
    """Update the body of a GitHub issue via the REST API.

    Args:
        issue_number: number of the issue to update.
        body: new markdown body for the issue.

    Raises:
        requests.HTTPError: if the PATCH request fails.
    """
    url = f"{REST_API_URL}/repos/{OWNER}/{REPO}/issues/{issue_number}"
    # timeout keeps a stalled connection from hanging the workflow run.
    response = requests.patch(url, headers=headers, json={"body": body}, timeout=30)
    response.raise_for_status()
    print(f"Updated issue #{issue_number}")
+
+
def parse_args():
    """Parse command-line arguments.

    --github-token falls back to the GITHUB_TOKEN environment variable;
    --issue-number is optional (results print to stdout when omitted).
    """
    parser = argparse.ArgumentParser(
        description="Find open issues with the most duplicates filed against them."
    )
    token_default = os.environ.get("GITHUB_TOKEN")
    parser.add_argument(
        "--github-token",
        default=token_default,
        help="GitHub token (or set GITHUB_TOKEN env var)",
    )
    parser.add_argument(
        "--issue-number",
        type=int,
        help="GitHub issue number to update (if not provided, prints to stdout)",
    )
    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+
+    if not args.github_token:
+        print("Error: --github-token is required (or set GITHUB_TOKEN env var)")
+        sys.exit(1)
+
+    headers = {
+        "Authorization": f"Bearer {args.github_token}",
+        "Content-Type": "application/json",
+    }
+
+    if duplicate_magnets := fetch_canonical_issues_with_duplicates():
+        body = build_markdown_body(duplicate_magnets)
+        if args.issue_number:
+            update_github_issue(args.issue_number, body)
+        else:
+            print(body)