From 16aeee6361968905cfd0043d4a36a6e86774c674 Mon Sep 17 00:00:00 2001 From: Lena <241371603+zelenenka@users.noreply.github.com> Date: Wed, 18 Feb 2026 17:15:15 +0000 Subject: [PATCH] Post comments on duplicate bug reports (#49482) Release Notes: - N/A --- .../comment_on_potential_duplicate_issues.yml | 70 ++ .../identify_potential_duplicate_issues.yml | 692 ------------------ .../workflows/update_duplicate_magnets.yml | 27 + .../github-check-new-issue-for-duplicates.py | 534 ++++++++++++++ script/github-find-top-duplicated-bugs.py | 223 ++++++ 5 files changed, 854 insertions(+), 692 deletions(-) create mode 100644 .github/workflows/comment_on_potential_duplicate_issues.yml delete mode 100644 .github/workflows/identify_potential_duplicate_issues.yml create mode 100644 .github/workflows/update_duplicate_magnets.yml create mode 100644 script/github-check-new-issue-for-duplicates.py create mode 100644 script/github-find-top-duplicated-bugs.py diff --git a/.github/workflows/comment_on_potential_duplicate_issues.yml b/.github/workflows/comment_on_potential_duplicate_issues.yml new file mode 100644 index 0000000000000000000000000000000000000000..de51cb1105c98901237ec88d47c34c69ea5c8080 --- /dev/null +++ b/.github/workflows/comment_on_potential_duplicate_issues.yml @@ -0,0 +1,70 @@ +name: Comment on potential duplicate bug/crash reports + +on: + issues: + types: [opened] + workflow_dispatch: + inputs: + issue_number: + description: "Issue number to analyze" + required: true + type: number + +concurrency: + group: potential-duplicate-check-${{ github.event.issue.number || inputs.issue_number }} + cancel-in-progress: true + +jobs: + identify-duplicates: + # For manual testing, allow running on any branch; for automatic runs, only on main repo + if: github.event_name == 'workflow_dispatch' || github.repository == 'zed-industries/zed' + runs-on: ubuntu-latest + timeout-minutes: 5 + + permissions: + contents: read + issues: write + + steps: + - name: Checkout repository + 
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + sparse-checkout: script/github-check-new-issue-for-duplicates.py + sparse-checkout-cone-mode: false + + - name: Get github app token + id: get-app-token + uses: actions/create-github-app-token@bef1eaf1c0ac2b148ee2a0a74c65fbe6db0631f1 # v1.11.7 + with: + app-id: ${{ secrets.ZED_COMMUNITY_BOT_APP_ID }} + private-key: ${{ secrets.ZED_COMMUNITY_BOT_PRIVATE_KEY }} + owner: zed-industries + + - name: Set up Python + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 + with: + python-version: "3.12" + + - name: Install dependencies + run: pip install requests + + - name: Run duplicate detection + id: detect + env: + GITHUB_TOKEN: ${{ steps.get-app-token.outputs.token }} + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY_ISSUE_DEDUP }} + ISSUE_NUMBER: ${{ github.event.issue.number || inputs.issue_number }} + run: | + python script/github-check-new-issue-for-duplicates.py "$ISSUE_NUMBER" > result.json + cat result.json + + - name: Write job summary + if: always() + run: | + echo '```json' >> "$GITHUB_STEP_SUMMARY" + if [[ -f result.json ]] && jq empty result.json 2>/dev/null; then + jq . result.json >> "$GITHUB_STEP_SUMMARY" + else + echo '{"error": "No valid result.json generated. 
Check logs for details."}' >> "$GITHUB_STEP_SUMMARY" + fi + echo '```' >> "$GITHUB_STEP_SUMMARY" diff --git a/.github/workflows/identify_potential_duplicate_issues.yml b/.github/workflows/identify_potential_duplicate_issues.yml deleted file mode 100644 index bfc0e5cdca633635bc683814da2738f943f4d590..0000000000000000000000000000000000000000 --- a/.github/workflows/identify_potential_duplicate_issues.yml +++ /dev/null @@ -1,692 +0,0 @@ -name: Identify potential duplicates among new bug/crash reports - -on: - issues: - types: [opened] - workflow_dispatch: - inputs: - issue_number: - description: "Issue number to analyze (for testing)" - required: true - type: number - -concurrency: - group: potential-duplicate-check-${{ github.event.issue.number || inputs.issue_number }} - cancel-in-progress: true - -jobs: - identify-duplicates: - # For manual testing, allow running on any branch; for automatic runs, only on main repo - if: github.event_name == 'workflow_dispatch' || github.repository == 'zed-industries/zed' - runs-on: ubuntu-latest - timeout-minutes: 5 - - permissions: - contents: read - issues: read - - steps: - - name: Get github app token - id: get-app-token - uses: actions/create-github-app-token@bef1eaf1c0ac2b148ee2a0a74c65fbe6db0631f1 # v2.1.4 - with: - app-id: ${{ secrets.ZED_COMMUNITY_BOT_APP_ID }} - private-key: ${{ secrets.ZED_COMMUNITY_BOT_PRIVATE_KEY }} - owner: zed-industries - - - name: Fetch issue and check eligibility - id: fetch-issue - uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 - with: - github-token: ${{ steps.get-app-token.outputs.token }} - script: | - const issueNumber = context.payload.issue?.number || ${{ inputs.issue_number || 0 }}; - if (!issueNumber) { - core.setFailed('No issue number provided'); - return; - } - - const { data: issue } = await github.rest.issues.get({ - owner: context.repo.owner, - repo: context.repo.repo, - issue_number: issueNumber - }); - - const typeName = issue.type?.name; - const 
isTargetType = typeName === 'Bug' || typeName === 'Crash'; - - console.log(`Issue #${issueNumber}: "${issue.title}"`); - console.log(`Issue type: ${typeName || '(none)'}`); - console.log(`Is target type (Bug/Crash): ${isTargetType}`); - - // Set default outputs for all paths - core.setOutput('issue_number', issueNumber); - core.setOutput('issue_title', issue.title); - core.setOutput('issue_body', (issue.body || '').slice(0, 6000)); - core.setOutput('is_target_type', String(isTargetType)); - core.setOutput('is_staff', 'false'); - core.setOutput('should_continue', 'false'); - - if (!isTargetType) { - console.log('::notice::Skipping - issue type is not Bug or Crash'); - return; - } - - // Check if author is staff (skip if so - they know what they're doing) - const author = issue.user?.login || ''; - let isStaff = false; - if (author) { - try { - const response = await github.rest.teams.getMembershipForUserInOrg({ - org: 'zed-industries', - team_slug: 'staff', - username: author - }); - isStaff = response.data.state === 'active'; - } catch (error) { - if (error.status !== 404) throw error; - } - } - - core.setOutput('is_staff', String(isStaff)); - if (isStaff) { - console.log(`::notice::Skipping - author @${author} is a staff member`); - return; - } - - core.setOutput('should_continue', 'true'); - - # ======================================================================== - # PASS 1: Detect areas using Claude with the full area taxonomy - # ======================================================================== - - name: "Pass 1: Detect areas with Claude" - if: steps.fetch-issue.outputs.should_continue == 'true' - id: detect-areas - env: - ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY_ISSUE_DEDUP }} - ISSUE_TITLE: ${{ steps.fetch-issue.outputs.issue_title }} - ISSUE_BODY: ${{ steps.fetch-issue.outputs.issue_body }} - run: | - # shellcheck disable=SC2016 - cat > /tmp/area_prompt.txt << 'PROMPT_EOF' - You are classifying a GitHub issue for the Zed code editor into 
area categories. - - ## Issue Title - ISSUE_TITLE_PLACEHOLDER - - ## Issue Body - ISSUE_BODY_PLACEHOLDER - - ## Available Area Labels - (descriptions provided only where the label name isn't self-explanatory) - - accessibility - ai, ai/acp (Agent Communication Protocol), ai/agent thread, ai/anthropic, ai/assistant, ai/bedrock, ai/codex, ai/copilot, ai/deepseek, ai/edit prediction, ai/gemini, ai/inline assistant, ai/lmstudio, ai/mcp (Model Context Protocol), ai/mistral, ai/ollama, ai/openai, ai/openai compatible, ai/openrouter, ai/qwen, ai/supermaven, ai/text thread, ai/zeta - auth - autocompletions - billing - cli - code actions - code folding - collab - real-time collaboration with other Zed users (screen sharing, shared editing). NOT for remote development over SSH. - collab/audio, collab/chat - command palette - controls/ime, controls/keybinds, controls/mouse - debugger, debugger/dap/CodeLLDB, debugger/dap/debugpy, debugger/dap/gdb, debugger/dap/javascript - design papercut - small UI/UX polish issues - dev containers - Docker-based development environments - diagnostics - LSP errors/warnings display - discoverability - editor, editor/brackets, editor/linked edits - extensions/infrastructure - file finder - fuzzy file search (Cmd/Ctrl+P) - gpui - Zed's internal UI rendering framework - inlay hints - inline hints from LSP (type annotations, parameter names) - installer-updater - integrations/environment - shell environment, PATH, env vars - integrations/git, integrations/git/blame, integrations/terminal - internationalization, internationalization/rtl support - keymap editor - language server, language server/server failure - languages/* - language-specific syntax, grammar, or LSP issues (e.g., languages/python, languages/rust, languages/typescript) - legal - logging - multi-buffer - viewing multiple files or search results in a single editor pane - multi-cursor - navigation - go to definition, find references, symbol search - network - proxy settings, 
connectivity, SSL certificates. NOT for collab. - onboarding - outline - document symbols/structure sidebar - parity/* - feature parity requests comparing to other editors (parity/vscode, parity/vim, parity/emacs, parity/jetbrains, parity/helix) - performance, performance/memory leak - permissions - popovers - hover cards, tooltips, autocomplete dropdowns - preview/images, preview/markdown - project panel - file tree sidebar - release notes - repl - search - project-wide search, find/replace - security & privacy, security & privacy/workspace trust - serialization - saving/restoring workspace state, undo history, folding state across restarts - settings, settings/ui - snippets - status bar - tasks - task runner integration - telemetry - tooling/* - external tool integrations (tooling/emmet, tooling/eslint, tooling/prettier, tooling/flatpak, tooling/nix) - tree-sitter - syntax parsing and highlighting engine - ui/animations, ui/dock, ui/file icons, ui/font, ui/menus, ui/minimap, ui/panel, ui/scaling, ui/scrolling, ui/tabs, ui/themes - workspace - window management, pane layout, project handling - zed account - zed.dev - - ## Your Task - - Based on the issue title and body, identify which areas this issue relates to. 
- - Select 1-5 areas that best match the issue - - Prefer more specific sub-areas when applicable (e.g., "ai/gemini" over just "ai") - - Only select areas that are clearly relevant - - ## Response Format - - Return ONLY a JSON object (no markdown fences, no explanation): - { - "areas": ["area1", "area2"], - "reasoning": "Brief explanation of why these areas were selected" - } - PROMPT_EOF - - # Single quotes are intentional to prevent bash expansion; node reads env vars via process.env - # shellcheck disable=SC2016 - node << 'SCRIPT_EOF' - const fs = require('fs'); - let prompt = fs.readFileSync('/tmp/area_prompt.txt', 'utf8'); - prompt = prompt.replace('ISSUE_TITLE_PLACEHOLDER', process.env.ISSUE_TITLE || ''); - prompt = prompt.replace('ISSUE_BODY_PLACEHOLDER', process.env.ISSUE_BODY || ''); - fs.writeFileSync('/tmp/area_prompt_final.txt', prompt); - SCRIPT_EOF - - HTTP_CODE=$(curl -s -w "%{http_code}" -o /tmp/area_response.json -X POST "https://api.anthropic.com/v1/messages" \ - -H "Content-Type: application/json" \ - -H "x-api-key: $ANTHROPIC_API_KEY" \ - -H "anthropic-version: 2023-06-01" \ - --data-binary @- << EOF - { - "model": "claude-sonnet-4-5-20250929", - "max_tokens": 256, - "messages": [{"role": "user", "content": $(jq -Rs . < /tmp/area_prompt_final.txt)}] - } - EOF - ) - - RESPONSE=$(< /tmp/area_response.json) - - if [ "$HTTP_CODE" -lt 200 ] || [ "$HTTP_CODE" -ge 300 ]; then - echo "HTTP Error: $HTTP_CODE" - echo "$RESPONSE" | jq . 2>/dev/null || echo "$RESPONSE" - exit 1 - fi - - if echo "$RESPONSE" | jq -e '.error' > /dev/null 2>&1; then - echo "API Error:" - echo "$RESPONSE" | jq . - exit 1 - fi - - AREA_RESULT=$(echo "$RESPONSE" | jq -r '.content[0].text // empty') - - if [ -z "$AREA_RESULT" ]; then - echo "Error: No response from Claude for area detection" - echo "$RESPONSE" | jq . 
- exit 1 - fi - - echo "Area detection result: $AREA_RESULT" - - # Extract just the areas array, handling potential markdown fences - # shellcheck disable=SC2016 - CLEAN_JSON=$(echo "$AREA_RESULT" | sed 's/^```json//; s/^```//; s/```$//' | tr -d '\n') - AREAS=$(echo "$CLEAN_JSON" | jq -r '.areas // [] | join(",")') - echo "Detected areas: $AREAS" - - echo "detected_areas=$AREAS" >> "$GITHUB_OUTPUT" - - INPUT_TOKENS=$(echo "$RESPONSE" | jq -r '.usage.input_tokens') - OUTPUT_TOKENS=$(echo "$RESPONSE" | jq -r '.usage.output_tokens') - echo "Pass 1 token usage - Input: $INPUT_TOKENS, Output: $OUTPUT_TOKENS" - - # ======================================================================== - # Use detected areas to filter magnets and search for candidates - # ======================================================================== - - name: Filter magnets and search for candidates - if: steps.fetch-issue.outputs.should_continue == 'true' - id: gather-candidates - uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 - with: - github-token: ${{ steps.get-app-token.outputs.token }} - script: | - // ============================================================ - // KNOWN DUPLICATE MAGNETS (from #46355) - // ============================================================ - const DUPLICATE_MAGNETS = [ - { number: 37074, title: "Support history with external ACP agents", areas: ["ai", "ai/gemini", "ai/acp"] }, - { number: 35780, title: "Zed consumes a lot of memory and CPU when opening ~/ or other large file trees", areas: ["workspace", "performance", "performance/memory leak", "integrations/git"] }, - { number: 16965, title: "Support for non UTF-8 text encodings", areas: ["editor", "internationalization"] }, - { number: 38109, title: "Zed out of sync with changes made outside of editor", areas: ["workspace"] }, - { number: 16727, title: "Select text in markdown preview", areas: ["preview/markdown", "languages/markdown"] }, - { number: 31102, title: "RTL 
Right-to-Left Text Input/Rendering Support", areas: ["internationalization"] }, - { number: 7371, title: "Restarts should be non-destructive on workspace restore/reload", areas: ["workspace", "serialization"] }, - { number: 7992, title: "Font rendering on LoDPI displays", areas: ["ui/font"] }, - { number: 40018, title: "Windows Beta: Terminal overwrites text when resized and window overflow", areas: ["integrations/terminal"] }, - { number: 29962, title: "Agent Panel: Cannot access zed hosted models (via Cloudflare HKG)", areas: ["ai", "network"] }, - { number: 15097, title: "Serialize undo history (local and remote projects)", areas: ["workspace", "serialization"] }, - { number: 29846, title: "Collapsed code blocks are not restored properly", areas: ["editor", "serialization", "code folding"] }, - { number: 38799, title: "Poor search performance in large repositories", areas: ["performance", "search"] }, - { number: 27283, title: "Inefficient memory use when opening large file in Zed", areas: ["performance"] }, - { number: 39806, title: "Raspberry Pi OS (Trixie) Zed 0.207.3 Video Memory Corruption on Start", areas: ["gpui"] }, - { number: 29970, title: "Unable to download any extensions (due to potential DigitalOcean IP block or ISP block)", areas: ["network"] }, - { number: 29026, title: "Ability to copy/paste files from the system file manager", areas: ["workspace"] }, - { number: 7940, title: "Zed is sometimes unresponsive when the OS awakes from sleep", areas: ["workspace"] }, - { number: 37025, title: "Failed to generate thread summary", areas: ["ai"] }, - { number: 16156, title: "Support for project settings to enable/disable/control AI features", areas: ["ai", "settings"] }, - { number: 24752, title: "Extra horizontal scrolling when inline blame is enabled with soft wrapping", areas: ["editor"] }, - { number: 20970, title: "Excessive memory consumption on project search with large files present", areas: ["performance/memory leak", "search", "multi-buffer"] 
}, - { number: 12176, title: "Only some ligatures are being applied", areas: ["ui/font", "settings"] }, - { number: 13564, title: "blade: Text is rendered either too thick or too thin", areas: ["ui/font"] }, - { number: 38901, title: "Terminal freezes in Linux session when Ctrl+C is pressed before exit", areas: ["controls/keybinds", "integrations/terminal"] }, - { number: 20167, title: "Support unsetting default keybindings", areas: ["controls/keybinds"] }, - { number: 25469, title: "Tracking - Linux non-QWERTY keyboard support", areas: ["controls/keybinds"] }, - { number: 29598, title: "Manual refresh on unsupported filesystems (nfs, fuse, exfat) without inotify/fsevents", areas: ["project panel"] }, - { number: 14428, title: "Ordering of search tokens in file finder fuzzy match", areas: ["file finder"] }, - { number: 20771, title: "Workspace: Reload to respect the desktop/workspace Zed windows were in after reload", areas: ["workspace", "serialization"] }, - { number: 7465, title: "Lines with RTL text aren't rendered correctly", areas: ["editor", "internationalization/rtl support", "parity/vscode"] }, - { number: 16120, title: "Large files without newlines (all on one line) cause Zed to hang/crash", areas: ["editor"] }, - { number: 22703, title: "Syntax aware folding (folds.scm support)", areas: ["editor", "tree-sitter"] }, - { number: 38927, title: "Find & Replace memory leak on large files", areas: ["performance", "performance/memory leak"] }, - { number: 4560, title: "Improve streaming search speed", areas: ["performance", "search"] }, - { number: 14053, title: "Linux Shortcuts don't work with non-latin / international keyboard layouts", areas: ["internationalization", "controls/keybinds"] }, - { number: 31637, title: "High memory consumption in Project Search with large codebases", areas: ["performance/memory leak", "search"] }, - { number: 11744, title: "Incorrect spacing of terminal font", areas: ["ui/font", "integrations/terminal"] }, - { number: 4746, 
title: "Terminal Nerd Font rendering incorrect line height", areas: ["ui/font", "integrations/terminal"] }, - { number: 10647, title: "User configurable mouse bindings (like keymap for key+mouse)", areas: ["controls/keybinds", "controls/mouse", "accessibility"] }, - { number: 34865, title: "ctrl-w with pane::CloseActiveItem binding closes the project panel instead of the active pane", areas: ["controls/keybinds", "ui/panel"] }, - { number: 12163, title: "Cannot see list of installed extensions when offline / disconnected", areas: ["network"] }, - { number: 44630, title: "Tables do not render all columns in markdown preview", areas: ["preview/markdown"] }, - { number: 39435, title: "Windows: Low fps in many cases", areas: ["gpui"] }, - { number: 36227, title: "Zed becomes unresponsive when closing", areas: ["workspace"] }, - { number: 44962, title: "Can not open file in zed if filename includes (1)", areas: ["workspace"] }, - { number: 32318, title: "Zed hangs after exiting sleep mode in Linux", areas: ["workspace"] }, - { number: 5120, title: "Add options to hide title and status bar", areas: ["settings", "status bar"] }, - { number: 29323, title: "uv: Failed to detect Python venv correctly", areas: ["language server", "languages/python", "integrations/environment"] }, - { number: 7450, title: "Support LSP Semantic Tokens", areas: ["language server", "languages", "ui/themes"] }, - { number: 31846, title: "LSP: triggerCharacters for signature help declared by servers do not seem to be respected", areas: ["language server"] }, - { number: 32792, title: "[SWAY] Zed window flashes rapidly on Sway/wlroots", areas: ["gpui"] }, - { number: 28398, title: "Stale buffers should be removed from search multibuffer", areas: ["search", "multi-buffer"] }, - { number: 35011, title: "Delete Key against remote Hosts Doesn't Delete Folders", areas: ["project panel"] }, - { number: 8626, title: "Palette File Navigation - Preview File Content", areas: ["file finder"] }, - { number: 
31468, title: "Certain LSP features are not activated till you trigger them manually when working with a remote project", areas: ["language server/server failure", "autocompletions"] }, - { number: 9789, title: "Zed checks for LSP updates when offline and disables LSPs irreversibly in the process", areas: ["language server/server failure"] }, - { number: 21403, title: "Completions and code actions should not use uniform lists", areas: ["autocompletions", "popovers", "diagnostics"] }, - { number: 15196, title: "Remote Project REPL support", areas: ["repl"] }, - ]; - - const MAX_SEARCHES = 5; - - const issueNumber = parseInt('${{ steps.fetch-issue.outputs.issue_number }}', 10); - const title = process.env.ISSUE_TITLE || ''; - const body = process.env.ISSUE_BODY || ''; - const detectedAreasStr = '${{ steps.detect-areas.outputs.detected_areas }}'; - const detectedAreas = new Set(detectedAreasStr.split(',').filter(a => a.trim())); - - console.log(`Detected areas from Claude: ${[...detectedAreas].join(', ') || '(none)'}`); - - // Helper: check if two areas match (handles hierarchy like "ai" matching "ai/gemini") - function areasMatch(detected, magnetArea) { - if (detected === magnetArea) return true; - if (magnetArea.startsWith(detected + '/')) return true; - if (detected.startsWith(magnetArea + '/')) return true; - return false; - } - - // Filter magnets based on detected areas - const relevantMagnets = DUPLICATE_MAGNETS.filter(magnet => { - if (detectedAreas.size === 0) return true; - return magnet.areas.some(magnetArea => - [...detectedAreas].some(detected => areasMatch(detected, magnetArea)) - ); - }).slice(0, 20); - - console.log(`Relevant duplicate magnets: ${relevantMagnets.length}`); - - // Build search queries - const searchQueries = []; - const thirtyDaysAgo = new Date(Date.now() - 30 * 24 * 60 * 60 * 1000).toISOString().split('T')[0]; - - // 1. 
Keyword search from title - const stopwords = ['with', 'that', 'this', 'from', 'have', 'been', 'were', 'what', 'when', - 'where', 'which', 'while', 'does', 'doesn', 'should', 'would', 'could', - 'about', 'after', 'before', 'between', 'into', 'through', 'during', - 'above', 'below', 'under', 'again', 'further', 'then', 'once', 'here', - 'there', 'some', 'such', 'only', 'same', 'than', 'very', 'just', 'also', - 'work', 'working', 'works', 'issue', 'problem', 'error', 'bug', 'zed']; - const titleKeywords = title - .toLowerCase() - .replace(/[^\w\s]/g, ' ') - .split(/\s+/) - .filter(w => w.length >= 3 && !stopwords.includes(w)) - .slice(0, 5); - - if (titleKeywords.length >= 2) { - searchQueries.push({ - type: 'keyword', - query: `repo:zed-industries/zed is:issue created:>${thirtyDaysAgo} ${titleKeywords.join(' ')}` - }); - } - - // 2. Area-based searches (using Claude-detected areas) - for (const area of [...detectedAreas].slice(0, 3)) { - searchQueries.push({ - type: 'area', - query: `repo:zed-industries/zed is:issue is:open label:"area:${area}" created:>${thirtyDaysAgo}` - }); - } - - // 3. 
Look for error patterns in the body - const errorPatterns = body.match(/(?:error|panic|crash|failed|exception)[:\s]+[^\n]{10,100}/gi) || []; - if (errorPatterns.length > 0) { - const errorSnippet = errorPatterns[0] - .slice(0, 60) - .replace(/[^\w\s]/g, ' ') - .replace(/\s+/g, ' ') - .trim(); - if (errorSnippet.length > 15) { - searchQueries.push({ - type: 'error', - query: `repo:zed-industries/zed is:issue "${errorSnippet.slice(0, 40)}"` - }); - } - } - - // Execute searches and collect candidates - const candidates = []; - const seenIssues = new Set([issueNumber]); - - for (const { type, query } of searchQueries.slice(0, MAX_SEARCHES)) { - try { - console.log(`Search (${type}): ${query}`); - const { data: results } = await github.rest.search.issuesAndPullRequests({ - q: query, - sort: 'created', - order: 'desc', - per_page: 10 - }); - - for (const item of results.items) { - if (!seenIssues.has(item.number) && !item.pull_request) { - seenIssues.add(item.number); - candidates.push({ - number: item.number, - title: item.title, - state: item.state, - created_at: item.created_at, - body_preview: (item.body || '').slice(0, 800), - source: type - }); - } - } - } catch (error) { - console.log(`Search failed (${type}): ${error.message}`); - } - } - - console.log(`Found ${candidates.length} candidates from searches`); - - // Prepare issue data for Claude - const issueData = { - number: issueNumber, - title: title, - body: body.slice(0, 4000), - }; - - // Prepare output - core.setOutput('issue_data', JSON.stringify(issueData)); - core.setOutput('duplicate_magnets', JSON.stringify(relevantMagnets)); - core.setOutput('candidates', JSON.stringify(candidates.slice(0, 12))); - core.setOutput('detected_areas', [...detectedAreas].join(', ')); - core.setOutput('should_analyze', (relevantMagnets.length > 0 || candidates.length > 0) ? 
'true' : 'false'); - env: - ISSUE_TITLE: ${{ steps.fetch-issue.outputs.issue_title }} - ISSUE_BODY: ${{ steps.fetch-issue.outputs.issue_body }} - - # ======================================================================== - # PASS 2: Analyze duplicates with Claude - # ======================================================================== - - name: "Pass 2: Analyze duplicates with Claude" - if: | - steps.fetch-issue.outputs.should_continue == 'true' && - steps.gather-candidates.outputs.should_analyze == 'true' - id: analyze - env: - ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY_ISSUE_DEDUP }} - ISSUE_DATA: ${{ steps.gather-candidates.outputs.issue_data }} - DUPLICATE_MAGNETS: ${{ steps.gather-candidates.outputs.duplicate_magnets }} - CANDIDATES: ${{ steps.gather-candidates.outputs.candidates }} - run: | - # shellcheck disable=SC2016 - cat > /tmp/prompt.txt << 'PROMPT_EOF' - You are analyzing a GitHub issue to determine if it might be a duplicate of an existing issue. - - ## New Issue Being Analyzed - ISSUE_DATA_PLACEHOLDER - - ## Known Frequently-Duplicated Issues (High Priority) - These issues have historically received many duplicate reports. Check these first. - DUPLICATE_MAGNETS_PLACEHOLDER - - ## Recent Similar Issues Found by Search - CANDIDATES_PLACEHOLDER - - ## Your Task - - 1. First, understand what the new issue is about: - - What specific bug or problem is being reported? - - What error messages, stack traces, or specific behaviors are mentioned? - - What component/feature is affected? - - 2. Check against the frequently-duplicated issues first (high priority): - - These are known "duplicate magnets" that often get re-reported - - If the new issue describes the same problem, it's likely a duplicate - - 3. Then check the recent similar issues: - - Look for issues describing the SAME bug, not just related topics - - ## Duplicate Criteria (be strict!) 
- - An issue IS a duplicate if: - - It describes the EXACT same bug with the same root cause - - It has the same error message or stack trace - - It has the same reproduction steps leading to the same outcome - - An issue is NOT a duplicate if: - - It's merely related to the same feature/area - - It has similar symptoms but potentially different causes - - It mentions similar things but describes a different problem - - Be VERY conservative. It's better to miss a duplicate than to incorrectly flag a unique issue. - - ## Response Format - - Return ONLY a JSON object (no markdown fences, no explanation before or after): - { - "is_potential_duplicate": boolean, - "confidence": "high" | "medium" | "low" | "none", - "potential_duplicates": [ - {"number": integer, "title": "string", "similarity_reason": "string explaining why this might be the same bug"} - ], - "analysis_summary": "Brief explanation of what the new issue is about and your conclusion", - "recommendation": "flag_as_duplicate" | "needs_human_review" | "not_a_duplicate" - } - PROMPT_EOF - - # Single quotes are intentional to prevent bash expansion; node reads env vars via process.env - # shellcheck disable=SC2016 - node << 'SCRIPT_EOF' - const fs = require('fs'); - - let prompt = fs.readFileSync('/tmp/prompt.txt', 'utf8'); - prompt = prompt.replace('ISSUE_DATA_PLACEHOLDER', process.env.ISSUE_DATA); - prompt = prompt.replace('DUPLICATE_MAGNETS_PLACEHOLDER', process.env.DUPLICATE_MAGNETS); - prompt = prompt.replace('CANDIDATES_PLACEHOLDER', process.env.CANDIDATES); - - fs.writeFileSync('/tmp/prompt_final.txt', prompt); - SCRIPT_EOF - - HTTP_CODE=$(curl -s -w "%{http_code}" -o /tmp/response.json -X POST "https://api.anthropic.com/v1/messages" \ - -H "Content-Type: application/json" \ - -H "x-api-key: $ANTHROPIC_API_KEY" \ - -H "anthropic-version: 2023-06-01" \ - --data-binary @- << EOF - { - "model": "claude-sonnet-4-5-20250929", - "max_tokens": 1024, - "messages": [{"role": "user", "content": $(jq -Rs . 
< /tmp/prompt_final.txt)}] - } - EOF - ) - - RESPONSE=$(< /tmp/response.json) - - if [ "$HTTP_CODE" -lt 200 ] || [ "$HTTP_CODE" -ge 300 ]; then - echo "HTTP Error: $HTTP_CODE" - echo "$RESPONSE" | jq . 2>/dev/null || echo "$RESPONSE" - exit 1 - fi - - if echo "$RESPONSE" | jq -e '.error' > /dev/null 2>&1; then - echo "API Error:" - echo "$RESPONSE" | jq . - exit 1 - fi - - ANALYSIS=$(echo "$RESPONSE" | jq -r '.content[0].text // empty') - - if [ -z "$ANALYSIS" ]; then - echo "Error: No response from Claude" - echo "$RESPONSE" | jq . - exit 1 - fi - - { - echo "analysis<> "$GITHUB_OUTPUT" - - INPUT_TOKENS=$(echo "$RESPONSE" | jq -r '.usage.input_tokens') - OUTPUT_TOKENS=$(echo "$RESPONSE" | jq -r '.usage.output_tokens') - echo "Pass 2 token usage - Input: $INPUT_TOKENS, Output: $OUTPUT_TOKENS" - - # ======================================================================== - # Log results - # ======================================================================== - - name: Log analysis results - if: | - steps.fetch-issue.outputs.should_continue == 'true' && - !cancelled() - uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 - with: - script: | - const issueNumber = parseInt('${{ steps.fetch-issue.outputs.issue_number }}', 10) || 0; - const issueTitle = process.env.ISSUE_TITLE || ''; - const detectedAreas = '${{ steps.gather-candidates.outputs.detected_areas }}' || '(none)'; - const shouldAnalyze = '${{ steps.gather-candidates.outputs.should_analyze }}' === 'true'; - const analysisRaw = process.env.ANALYSIS_OUTPUT || ''; - - console.log('='.repeat(60)); - console.log('DUPLICATE DETECTION RESULTS (TWO-PASS)'); - console.log('='.repeat(60)); - console.log(`Issue: #${issueNumber} - ${issueTitle}`); - console.log(`URL: https://github.com/zed-industries/zed/issues/${issueNumber}`); - console.log(`Detected Areas: ${detectedAreas}`); - - if (!shouldAnalyze) { - console.log('\nNo duplicate magnets or candidates found - skipping analysis'); - 
core.summary.addHeading(`✅ Issue #${issueNumber}: No similar issues found`, 2); - core.summary.addRaw(`\n**Title:** ${issueTitle}\n\n`); - core.summary.addRaw(`**Detected Areas:** ${detectedAreas}\n\n`); - core.summary.addRaw('No potential duplicates were found by search or in the known duplicate magnets list.\n'); - await core.summary.write(); - return; - } - - if (!analysisRaw) { - console.log('\nNo analysis output received'); - core.summary.addHeading(`⚠️ Issue #${issueNumber}: Analysis incomplete`, 2); - core.summary.addRaw(`**Detected Areas:** ${detectedAreas}\n\n`); - core.summary.addRaw('The Claude analysis step did not produce output. Check workflow logs.\n'); - await core.summary.write(); - return; - } - - try { - let cleanJson = analysisRaw.trim(); - if (cleanJson.startsWith('```')) { - cleanJson = cleanJson.replace(/^```(?:json)?\n?/, '').replace(/\n?```$/, ''); - } - - const analysis = JSON.parse(cleanJson); - - console.log(`\nIs Potential Duplicate: ${analysis.is_potential_duplicate}`); - console.log(`Confidence: ${analysis.confidence}`); - console.log(`Recommendation: ${analysis.recommendation}`); - console.log(`\nAnalysis Summary:\n${analysis.analysis_summary}`); - - if (analysis.potential_duplicates && analysis.potential_duplicates.length > 0) { - console.log(`\nPotential Duplicates Found: ${analysis.potential_duplicates.length}`); - for (const dup of analysis.potential_duplicates) { - console.log(` - #${dup.number}: ${dup.title}`); - console.log(` Reason: ${dup.similarity_reason}`); - } - } else { - console.log('\nNo potential duplicates identified by analysis.'); - } - - console.log('\n' + '='.repeat(60)); - - const summaryIcon = analysis.is_potential_duplicate ? '⚠️' : '✅'; - const summaryText = analysis.is_potential_duplicate - ? 
`Potential duplicate detected (${analysis.confidence} confidence)` - : 'No likely duplicates found'; - - core.summary.addHeading(`${summaryIcon} Issue #${issueNumber}: ${summaryText}`, 2); - core.summary.addRaw(`\n**Title:** ${issueTitle}\n\n`); - core.summary.addRaw(`**Detected Areas:** ${detectedAreas}\n\n`); - core.summary.addRaw(`**Recommendation:** \`${analysis.recommendation}\`\n\n`); - core.summary.addRaw(`**Summary:** ${analysis.analysis_summary}\n\n`); - - if (analysis.potential_duplicates && analysis.potential_duplicates.length > 0) { - core.summary.addHeading('Potential Duplicates', 3); - const rows = analysis.potential_duplicates.map(d => [ - `[#${d.number}](https://github.com/zed-industries/zed/issues/${d.number})`, - d.title.slice(0, 60) + (d.title.length > 60 ? '...' : ''), - d.similarity_reason - ]); - core.summary.addTable([ - [{data: 'Issue', header: true}, {data: 'Title', header: true}, {data: 'Similarity Reason', header: true}], - ...rows - ]); - } - - await core.summary.write(); - - } catch (e) { - console.log('Failed to parse analysis output:', e.message); - console.log('Raw output:', analysisRaw); - core.summary.addHeading(`⚠️ Issue #${issueNumber}: Failed to parse analysis`, 2); - core.summary.addRaw(`**Detected Areas:** ${detectedAreas}\n\n`); - core.summary.addRaw(`Error: ${e.message}\n\nRaw output:\n\`\`\`\n${analysisRaw.slice(0, 1000)}\n\`\`\``); - await core.summary.write(); - } - env: - ISSUE_TITLE: ${{ steps.fetch-issue.outputs.issue_title }} - ANALYSIS_OUTPUT: ${{ steps.analyze.outputs.analysis }} diff --git a/.github/workflows/update_duplicate_magnets.yml b/.github/workflows/update_duplicate_magnets.yml new file mode 100644 index 0000000000000000000000000000000000000000..1c6c5a562532891eb97ceb11f44b81f35612c026 --- /dev/null +++ b/.github/workflows/update_duplicate_magnets.yml @@ -0,0 +1,27 @@ +name: Update Duplicate Magnets Issue + +on: + schedule: + - cron: "0 6 * * 1,4" # Mondays and Thursdays at 6 AM UTC + workflow_dispatch: + 
+jobs: + update-duplicate-magnets: + runs-on: ubuntu-latest + if: github.repository == 'zed-industries/zed' + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4 + + - name: Set up Python + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5 + with: + python-version: "3.12" + + - name: Install dependencies + run: pip install requests + + - name: Update duplicate magnets issue + run: | + python script/github-find-top-duplicated-bugs.py \ + --github-token ${{ secrets.GITHUB_TOKEN }} \ + --issue-number 46355 diff --git a/script/github-check-new-issue-for-duplicates.py b/script/github-check-new-issue-for-duplicates.py new file mode 100644 index 0000000000000000000000000000000000000000..c8978d421b0c68779dfcd98f628ff1af6f57c91e --- /dev/null +++ b/script/github-check-new-issue-for-duplicates.py @@ -0,0 +1,534 @@ +#!/usr/bin/env python3 +""" +Comment on newly opened issues that might be duplicates of an existing issue. + +This script is run by a GitHub Actions workflow when a new bug or crash report +is opened. It: +1. Checks eligibility (must be bug/crash type, non-staff author) +2. Detects relevant areas using Claude + the area label taxonomy +3. Parses known "duplicate magnets" from tracking issue #46355 +4. Searches for similar recent issues by title keywords, area labels, and error patterns +5. Asks Claude to analyze potential duplicates (magnets + search results) +6. 
#!/usr/bin/env python3
"""
Comment on newly opened issues that might be duplicates of an existing issue.

This script is run by a GitHub Actions workflow when a new bug or crash report
is opened. It:
1. Checks eligibility (must be bug/crash type, non-staff author)
2. Detects relevant areas using Claude + the area label taxonomy
3. Parses known "duplicate magnets" from tracking issue #46355
4. Searches for similar recent issues by title keywords, area labels, and error patterns
5. Asks Claude to analyze potential duplicates (magnets + search results)
6. Posts a comment on the issue if high-confidence duplicates are found

Requires:
    requests (pip install requests)

Usage:
    python github-check-new-issue-for-duplicates.py ISSUE_NUMBER

Environment variables:
    GITHUB_TOKEN      - GitHub token (org members: read, issues: read & write)
    ANTHROPIC_API_KEY - Anthropic API key for Claude
"""

import argparse
import json
import os
import re
import sys
from datetime import datetime, timedelta

try:
    import requests
except ImportError:  # keep the module importable without the dependency (e.g. for unit tests)
    requests = None

GITHUB_API = "https://api.github.com"
REPO_OWNER = "zed-industries"
REPO_NAME = "zed"
TRACKING_ISSUE_NUMBER = 46355
STAFF_TEAM_SLUG = "staff"

# requests has no default timeout; without one a stalled connection would hang the CI job
REQUEST_TIMEOUT = 30
# the Claude API can legitimately take a while to respond, so give it more headroom
CLAUDE_TIMEOUT = 120

# Populated under __main__; declared at module level so the API helpers below
# can reference it even when the module is merely imported.
GITHUB_HEADERS = {}

# area prefixes to collapse in taxonomy (show summary instead of all sub-labels)
PREFIXES_TO_COLLAPSE = ["languages", "parity", "tooling"]

# stopwords to filter from title keyword searches (short words handled by len > 2 filter)
STOPWORDS = {
    "after", "all", "also", "and", "any", "but", "can't", "does", "doesn't",
    "don't", "for", "from", "have", "just", "not", "only", "some", "that",
    "the", "this", "when", "while", "with", "won't", "work", "working", "zed",
}


def log(message):
    """Print to stderr so it doesn't interfere with JSON output on stdout."""
    print(message, file=sys.stderr)


def github_api_get(path, params=None):
    """Fetch JSON from the GitHub API. Raises on non-2xx status."""
    url = f"{GITHUB_API}/{path.lstrip('/')}"
    response = requests.get(url, headers=GITHUB_HEADERS, params=params, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response.json()


def github_search_issues(query, per_page=15):
    """Search issues, returning most recently created first."""
    params = {"q": query, "sort": "created", "order": "desc", "per_page": per_page}
    return github_api_get("/search/issues", params).get("items", [])


def check_team_membership(org, team_slug, username):
    """Check if user is an active member of a team."""
    try:
        data = github_api_get(f"/orgs/{org}/teams/{team_slug}/memberships/{username}")
        return data.get("state") == "active"
    except requests.HTTPError as e:
        # 404 means "not a member" (or membership not visible to this token)
        if e.response.status_code == 404:
            return False
        raise


def post_comment(issue_number: int, body):
    """Post a comment on the given issue. Raises on non-2xx status."""
    url = f"{GITHUB_API.rstrip('/')}/repos/{REPO_OWNER}/{REPO_NAME}/issues/{issue_number}/comments"
    response = requests.post(url, headers=GITHUB_HEADERS, json={"body": body}, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    log(f"  Posted comment on #{issue_number}")


def build_duplicate_comment(matches):
    """Build the comment body for potential duplicates."""
    match_list = "\n".join(f"- #{m['number']}" for m in matches)
    explanations = "\n\n".join(f"**#{m['number']}:** {m['explanation']}" for m in matches)

    return f"""This issue appears to be a duplicate of:

{match_list}

**If this is indeed a duplicate:**
Please close this issue and subscribe to the linked issue for updates (select "Close as not planned" → "Duplicate")

**If this is a different issue:**
No action needed. A maintainer will review this shortly.

<details>
<summary>Why were these issues selected?</summary>

{explanations}

</details>

---
This is an automated analysis and might be incorrect."""


def call_claude(api_key, system, user_content, max_tokens=1024):
    """Send a message to Claude and return the text response. Raises on non-2xx status."""
    response = requests.post(
        "https://api.anthropic.com/v1/messages",
        headers={
            "x-api-key": api_key,
            "anthropic-version": "2023-06-01",
            "content-type": "application/json",
        },
        json={
            "model": "claude-sonnet-4-20250514",
            "max_tokens": max_tokens,
            "temperature": 0.0,
            "system": system,
            "messages": [{"role": "user", "content": user_content}],
        },
        timeout=CLAUDE_TIMEOUT,
    )
    response.raise_for_status()
    data = response.json()

    usage = data.get("usage", {})
    log(f"  Token usage - Input: {usage.get('input_tokens', 'N/A')}, Output: {usage.get('output_tokens', 'N/A')}")

    content = data.get("content", [])
    if content and content[0].get("type") == "text":
        return content[0].get("text") or ""
    return ""


def fetch_issue(issue_number: int):
    """Fetch issue from GitHub and return as a dict."""
    log(f"Fetching issue #{issue_number}")

    issue_data = github_api_get(f"/repos/{REPO_OWNER}/{REPO_NAME}/issues/{issue_number}")
    issue = {
        "number": issue_number,
        "title": issue_data["title"],
        "body": issue_data.get("body") or "",
        "author": (issue_data.get("user") or {}).get("login") or "",
        # "type" is the GitHub issue type (Bug / Crash / Feature ...), may be absent
        "type": (issue_data.get("type") or {}).get("name"),
    }

    log(f"  Title: {issue['title']}\n  Type: {issue['type']}\n  Author: {issue['author']}")
    return issue


def should_skip(issue):
    """Check if issue should be skipped in duplicate detection process."""
    if issue["type"] not in ["Bug", "Crash"]:
        log(f"  Skipping: issue type '{issue['type']}' is not a bug/crash report")
        return True

    if issue["author"] and check_team_membership(REPO_OWNER, STAFF_TEAM_SLUG, issue["author"]):
        log(f"  Skipping: author '{issue['author']}' is a {STAFF_TEAM_SLUG} member")
        return True

    return False


def fetch_area_labels():
    """Fetch area:* labels from the repository. Returns list of {name, description} dicts."""
    log("Fetching area labels")

    labels = []
    page = 1
    while page_labels := github_api_get(
        f"/repos/{REPO_OWNER}/{REPO_NAME}/labels",
        params={"per_page": 100, "page": page},
    ):
        labels.extend(page_labels)
        page += 1

    # label["name"][5:] removes the "area:" prefix
    area_labels = [
        {"name": label["name"][5:], "description": label.get("description") or ""}
        for label in labels
        if label["name"].startswith("area:")
    ]

    log(f"  Found {len(area_labels)} area labels")
    return area_labels


def format_taxonomy_for_claude(area_labels):
    """Format area labels into a string for Claude, collapsing certain prefixes."""
    lines = set()

    for area in area_labels:
        name = area["name"]
        collapsible_prefix = next(
            (p for p in PREFIXES_TO_COLLAPSE if name.startswith(f"{p}/")), None)

        if collapsible_prefix:
            lines.add(f"- {collapsible_prefix}/* (multiple specific sub-labels exist)")
        else:
            desc = area["description"]
            lines.add(f"- {name}: {desc}" if desc else f"- {name}")

    return "\n".join(sorted(lines))


def detect_areas(anthropic_key, issue, taxonomy):
    """Use Claude to detect relevant areas for the issue. Returns a list of area names."""
    log("Detecting areas with Claude")

    system_prompt = """You analyze GitHub issues to identify which area labels apply.

Given an issue and a taxonomy of areas, output ONLY a comma-separated list of matching area names.

- Output at most 3 areas, ranked by relevance
- Use exact area names from the taxonomy
- If no areas clearly match, output: none
- For languages/*, tooling/*, or parity/*, use the specific sub-label (e.g., "languages/rust",
tooling/eslint, parity/vscode)

Example outputs:
- "editor, parity/vim"
- "ai, ai/agent panel"
- "none"
"""

    user_content = f"""## Area Taxonomy
{taxonomy}

# Issue Title
{issue['title']}

# Issue Body
{issue['body'][:4000]}"""

    response = call_claude(anthropic_key, system_prompt, user_content, max_tokens=100).strip()
    log(f"  Detected areas: {response}")

    if response.lower() == "none":
        return []
    # filter out empty fragments so an empty/malformed reply yields [] rather than [""]
    return [area.strip() for area in response.split(",") if area.strip()]


def parse_duplicate_magnets():
    """Parse known duplicate magnets from tracking issue #46355.

    Returns a list of magnets sorted by duplicate count (most duplicated first).
    Magnets only have number, areas, and dupe_count — use enrich_magnets() to fetch
    title and body_preview for the ones you need.
    """
    log(f"Parsing duplicate magnets from #{TRACKING_ISSUE_NUMBER}")

    issue_data = github_api_get(f"/repos/{REPO_OWNER}/{REPO_NAME}/issues/{TRACKING_ISSUE_NUMBER}")
    body = issue_data.get("body") or ""

    # parse the issue body
    # format: ## area_name
    #         - [N dupes] https://github.com/zed-industries/zed/issues/NUMBER
    magnets = {}  # number -> {number, areas, dupe_count}
    current_area = None

    for line in body.split("\n"):
        # check for area header
        if line.startswith("## "):
            current_area = line[3:].strip()
            continue

        if not current_area or not line.startswith("-") or "/issues/" not in line:
            continue

        # parse: - [N dupes] https://github.com/.../issues/NUMBER
        try:
            dupe_count = int(line.split("[")[1].split()[0])
            number = int(line.split("/issues/")[1].split()[0].rstrip(")"))
        except (ValueError, IndexError):
            continue

        # skip "(unlabeled)": these magnets should match everything
        is_unlabeled = current_area == "(unlabeled)"

        if number in magnets:
            if not is_unlabeled:
                magnets[number]["areas"].append(current_area)
        else:
            magnets[number] = {
                "number": number,
                "areas": [] if is_unlabeled else [current_area],
                "dupe_count": dupe_count,
            }

    magnet_list = sorted(magnets.values(), key=lambda m: m["dupe_count"], reverse=True)
    log(f"  Parsed {len(magnet_list)} duplicate magnets")
    return magnet_list


def enrich_magnets(magnets):
    """Fetch title and body_preview for magnets from the API (mutates the dicts in place)."""
    log(f"  Fetching details for {len(magnets)} magnets")
    for magnet in magnets:
        data = github_api_get(f"/repos/{REPO_OWNER}/{REPO_NAME}/issues/{magnet['number']}")
        magnet["title"] = data["title"]
        magnet["body_preview"] = (data.get("body") or "")[:500]


def areas_match(detected, magnet_area):
    """Check if detected area matches magnet area. Matches broadly across hierarchy levels."""
    return (
        detected == magnet_area
        or magnet_area.startswith(f"{detected}/")
        or detected.startswith(f"{magnet_area}/")
    )


def filter_magnets_by_areas(magnets, detected_areas):
    """Filter magnets based on detected areas. With no detected areas, keep everything."""
    if not detected_areas:
        return magnets

    detected_set = set(detected_areas)

    def matches(magnet):
        # unlabeled magnets (empty areas) match everything
        if not magnet["areas"]:
            return True
        return any(
            areas_match(detected, magnet_area)
            for detected in detected_set
            for magnet_area in magnet["areas"]
        )

    return list(filter(matches, magnets))


def search_for_similar_issues(issue, detected_areas, max_searches=6):
    """Search for similar issues that might be duplicates.

    Searches by title keywords, area labels (last 60 days), and error patterns.
    max_searches caps the total number of queries to keep token usage and context size under control.
    """
    log("Searching for similar issues")

    sixty_days_ago = (datetime.now() - timedelta(days=60)).strftime("%Y-%m-%d")
    base_query = f"repo:{REPO_OWNER}/{REPO_NAME} is:issue is:open"
    seen_issues = {}
    queries = []

    title_keywords = [word for word in issue["title"].split() if word.lower() not in STOPWORDS and len(word) > 2]

    if title_keywords:
        keywords_query = " ".join(title_keywords)
        queries.append(("title_keywords", f"{base_query} {keywords_query}"))

    for area in detected_areas:
        queries.append(("area_label", f'{base_query} label:"area:{area}" created:>{sixty_days_ago}'))

    # error pattern search: capture 5–90 chars after keyword, colon optional
    error_pattern = r"(?i:\b(?:error|panicked|panic|failed)\b)\s*([^\n]{5,90})"
    match = re.search(error_pattern, issue["body"])
    if match:
        # strip embedded double quotes: the snippet is interpolated into a quoted phrase below
        error_snippet = match.group(1).strip().replace('"', "")
        queries.append(("error_pattern", f'{base_query} in:body "{error_snippet}"'))

    for search_type, query in queries[:max_searches]:
        log(f"  Search ({search_type}): {query}")
        try:
            results = github_search_issues(query, per_page=15)
            for item in results:
                number = item["number"]
                if number != issue["number"] and number not in seen_issues:
                    body = item.get("body") or ""
                    seen_issues[number] = {
                        "number": number,
                        "title": item["title"],
                        "state": item.get("state", ""),
                        "created_at": item.get("created_at", ""),
                        "body_preview": body[:500],
                        "source": search_type,
                    }
        except requests.RequestException as e:
            # a failed search query is not fatal; continue with the other queries
            log(f"  Search failed: {e}")

    similar_issues = list(seen_issues.values())
    log(f"  Found {len(similar_issues)} similar issues")
    return similar_issues


def analyze_duplicates(anthropic_key, issue, magnets, search_results):
    """Use Claude to analyze potential duplicates. Returns (matches, summary)."""
    log("Analyzing duplicates with Claude")

    top_magnets = magnets[:10]
    enrich_magnets(top_magnets)
    magnet_numbers = {m["number"] for m in top_magnets}

    candidates = [
        {"number": m["number"], "title": m["title"], "body_preview": m["body_preview"], "source": "known_duplicate_magnet"}
        for m in top_magnets
    ] + [
        {"number": r["number"], "title": r["title"], "body_preview": r["body_preview"], "source": "search_result"}
        for r in search_results[:10]
        if r["number"] not in magnet_numbers
    ]

    if not candidates:
        return [], "No candidates to analyze"

    system_prompt = """You analyze GitHub issues to identify potential duplicates.

Given a new issue and a list of existing issues, identify which existing issues might be duplicates.

For each potential duplicate, assess confidence:
- "high": Very likely the same issue (same root cause, same symptoms)
- "medium": Possibly related (likely to be the same root cause)
- Do NOT include tangentially related issues (same general area but probably different issues)

Output only valid JSON (no markdown code blocks) with this structure:
{
  "matches": [
    {
      "number": 12345,
      "confidence": "high|medium",
      "explanation": "Brief explanation of why this might be a duplicate"
    }
  ],
  "summary": "One sentence summary of findings"
}

Only include matches with "high" or "medium" confidence. Return empty matches array if none found."""

    user_content = f"""## New Issue #{issue['number']}
**Title:** {issue['title']}

**Body:**
{issue['body'][:3000]}

## Existing Issues to Compare
{json.dumps(candidates, indent=2)}"""

    response = call_claude(anthropic_key, system_prompt, user_content, max_tokens=2048)

    # models sometimes wrap JSON in markdown fences despite the instructions — strip them
    cleaned = response.strip()
    if cleaned.startswith("```"):
        cleaned = re.sub(r"^```(?:json)?\n?", "", cleaned)
        cleaned = re.sub(r"\n?```$", "", cleaned)

    try:
        data = json.loads(cleaned)
    except json.JSONDecodeError as e:
        log(f"  Failed to parse response: {e}")
        log(f"  Raw response: {response}")
        return [], "Failed to parse analysis"

    matches = data.get("matches", [])
    summary = data.get("summary", "Analysis complete")
    log(f"  Found {len(matches)} potential matches")
    return matches, summary


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Identify potential duplicate issues")
    parser.add_argument("issue_number", type=int, help="Issue number to analyze")
    parser.add_argument("--dry-run", action="store_true", help="Skip posting comment, just log what would be posted")
    args = parser.parse_args()

    github_token = os.environ.get("GITHUB_TOKEN")
    anthropic_key = os.environ.get("ANTHROPIC_API_KEY")

    if not github_token:
        log("Error: GITHUB_TOKEN not set")
        sys.exit(1)
    if not anthropic_key:
        log("Error: ANTHROPIC_API_KEY not set")
        sys.exit(1)

    GITHUB_HEADERS.update({
        "Authorization": f"Bearer {github_token}",
        "Accept": "application/vnd.github+json",
        "X-GitHub-Api-Version": "2022-11-28",
    })

    issue = fetch_issue(args.issue_number)
    if should_skip(issue):
        print(json.dumps({"skipped": True}))
        sys.exit(0)

    # detect areas
    taxonomy = format_taxonomy_for_claude(fetch_area_labels())
    detected_areas = detect_areas(anthropic_key, issue, taxonomy)

    # search for potential duplicates
    all_magnets = parse_duplicate_magnets()
    relevant_magnets = filter_magnets_by_areas(all_magnets, detected_areas)
    search_results = search_for_similar_issues(issue, detected_areas)

    # analyze potential duplicates
    if relevant_magnets or search_results:
        matches, summary = analyze_duplicates(anthropic_key, issue, relevant_magnets, search_results)
    else:
        matches, summary = [], "No potential duplicates to analyze"

    # post comment if high-confidence matches found; .get guards against a missing key
    high_confidence_matches = [m for m in matches if m.get("confidence") == "high"]
    commented = False

    if high_confidence_matches:
        comment_body = build_duplicate_comment(high_confidence_matches)
        if args.dry_run:
            log("Dry run - would post comment:\n" + "-" * 40 + "\n" + comment_body + "\n" + "-" * 40)
        else:
            log("Posting comment for high-confidence match(es)")
            try:
                post_comment(issue["number"], comment_body)
                commented = True
            except requests.RequestException as e:
                # a failed comment is non-fatal: the JSON result below still records the matches
                log(f"  Failed to post comment: {e}")

    print(json.dumps({
        "skipped": False,
        "issue": {
            "number": issue["number"],
            "title": issue["title"],
            "author": issue["author"],
            "type": issue["type"],
        },
        "detected_areas": detected_areas,
        "magnets_count": len(relevant_magnets),
        "search_results_count": len(search_results),
        "matches": matches,
        "summary": summary,
        "commented": commented,
    }))
#!/usr/bin/env python3
"""
Find open issues that have the most duplicates filed against them and update
a GitHub issue with the results.

Queries open issues and looks for MarkedAsDuplicateEvent in their timelines.
Only includes issues that have been re-reported at least twice (2+ duplicates
closed against them). Groups results by area: label. The output is formatted
as markdown with issue URLs (GitHub renders the titles automatically).

This script is run regularly by the update_duplicate_magnets.yml workflow.

Requires: requests (pip install requests)
GitHub token permissions: issues:write

Usage:
    # Print to stdout only for testing:
    python github-find-top-duplicated-bugs.py --github-token ghp_xxx

    # Update a GitHub issue:
    python github-find-top-duplicated-bugs.py --github-token ghp_xxx --issue-number 46355
"""

import argparse
import os
import sys
from collections import Counter, defaultdict

try:
    import requests
except ImportError:  # keep the module importable without the dependency (e.g. for unit tests)
    requests = None

OWNER = "zed-industries"
REPO = "zed"

GRAPHQL_URL = "https://api.github.com/graphql"
REST_API_URL = "https://api.github.com"

# requests has no default timeout; without one a stalled connection would hang the workflow
REQUEST_TIMEOUT = 30

# populated under __main__; module-level so the helpers below can reference it
headers = None

ISSUES_WITH_DUPLICATES_QUERY = """
query($owner: String!, $repo: String!, $cursor: String) {
  repository(owner: $owner, name: $repo) {
    issues(
      first: 100
      after: $cursor
      states: [OPEN]
      orderBy: {field: UPDATED_AT, direction: DESC}
    ) {
      pageInfo {
        hasNextPage
        endCursor
      }
      nodes {
        number
        url
        labels(first: 20) {
          nodes {
            name
          }
        }
        timelineItems(first: 100, itemTypes: [MARKED_AS_DUPLICATE_EVENT]) {
          nodes {
            ... on MarkedAsDuplicateEvent {
              duplicate {
                ... on Issue {
                  number
                  state
                }
              }
            }
          }
        }
      }
    }
  }
}
"""


def log(message):
    """Print progress to stderr so it doesn't mix with the markdown body printed to stdout."""
    print(message, file=sys.stderr)


def extract_duplicate_info(issue):
    """Extract duplicate count and info from an issue. Returns None if < 2 duplicates.

    Only duplicates in the CLOSED state count; entries whose "duplicate" field is
    missing or null (e.g. the inline fragment did not match) are skipped.
    """
    seen_duplicates = set()
    for event in issue["timelineItems"]["nodes"]:
        try:
            if event["duplicate"]["state"] == "CLOSED":
                seen_duplicates.add(event["duplicate"]["number"])
        except (KeyError, TypeError):
            continue

    if len(seen_duplicates) < 2:
        return None

    label_names = [label["name"] for label in issue["labels"]["nodes"]]
    areas = [name.replace("area:", "") for name in label_names if name.startswith("area:")]

    return {
        "number": issue["number"],
        "url": issue["url"],
        "areas": areas if areas else ["(unlabeled)"],
        "duplicate_count": len(seen_duplicates),
    }


def fetch_canonical_issues_with_duplicates(max_pages=100):
    """Fetch open issues and count how many duplicates point to each."""
    log(f"Finding open issues with the most duplicates in {OWNER}/{REPO}")

    cursor = None
    duplicate_magnets = []
    total_issues_scanned = 0

    for page in range(max_pages):
        response = requests.post(
            GRAPHQL_URL,
            headers=headers,
            json={
                "query": ISSUES_WITH_DUPLICATES_QUERY,
                "variables": {"owner": OWNER, "repo": REPO, "cursor": cursor},
            },
            timeout=REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        data = response.json()

        if "errors" in data:
            log(f"GraphQL errors: {data['errors']}")
            break

        issues = data["data"]["repository"]["issues"]
        total_issues_scanned += len(issues["nodes"])

        for issue in issues["nodes"]:
            if info := extract_duplicate_info(issue):
                duplicate_magnets.append(info)

        page_info = issues["pageInfo"]
        if not page_info["hasNextPage"]:
            log(f"Done: scanned {total_issues_scanned} open issues")
            break
        cursor = page_info["endCursor"]

        log(
            f"Page {page + 1}: scanned {total_issues_scanned} open issues, "
            f"{len(duplicate_magnets)} have duplicates"
        )

    return duplicate_magnets


def build_markdown_body(duplicate_magnets):
    """Group results by area and build markdown body for the GitHub issue.

    NOTE: the output format is parsed by parse_duplicate_magnets() in
    github-check-new-issue-for-duplicates.py — update that if you change this.
    """
    by_area = defaultdict(list)
    area_totals = Counter()
    for info in duplicate_magnets:
        for area in info["areas"]:
            by_area[area].append(info)
            area_totals[area] += info["duplicate_count"]

    lines = [
        "These are the issues that are frequently re-reported. "
        "The list is generated regularly by running a script."
    ]

    # areas with the most total duplicates come first; within an area, most-duplicated first
    for area, _ in area_totals.most_common():
        issues = sorted(by_area[area], key=lambda x: x["duplicate_count"], reverse=True)

        lines.append("")
        lines.append(f"## {area}")
        lines.append("")

        for info in issues:
            lines.append(
                f"- [{info['duplicate_count']:2d} dupes] {info['url']}"
            )

    return "\n".join(lines)


def update_github_issue(issue_number, body):
    """Update the body of a GitHub issue."""
    url = f"{REST_API_URL}/repos/{OWNER}/{REPO}/issues/{issue_number}"
    response = requests.patch(url, headers=headers, json={"body": body}, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    log(f"Updated issue #{issue_number}")


def parse_args():
    parser = argparse.ArgumentParser(
        description="Find open issues with the most duplicates filed against them."
    )
    parser.add_argument(
        "--github-token",
        default=os.environ.get("GITHUB_TOKEN"),
        help="GitHub token (or set GITHUB_TOKEN env var)",
    )
    parser.add_argument(
        "--issue-number",
        type=int,
        help="GitHub issue number to update (if not provided, prints to stdout)",
    )
    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()

    if not args.github_token:
        log("Error: --github-token is required (or set GITHUB_TOKEN env var)")
        sys.exit(1)

    headers = {
        "Authorization": f"Bearer {args.github_token}",
        "Content-Type": "application/json",
    }

    if duplicate_magnets := fetch_canonical_issues_with_duplicates():
        body = build_markdown_body(duplicate_magnets)
        if args.issue_number:
            update_github_issue(args.issue_number, body)
        else:
            print(body)