github-check-new-issue-for-duplicates.py

  1#!/usr/bin/env python3
  2"""
  3Comment on newly opened issues that might be duplicates of an existing issue.
  4
  5This script is run by a GitHub Actions workflow when a new bug or crash report
  6is opened. It:
  71. Checks eligibility (must be bug/crash type, non-staff author)
  82. Detects relevant areas using Claude + the area label taxonomy
  93. Parses known "duplicate magnets" from tracking issue #46355
 104. Searches for similar recent issues by title keywords, area labels, and error patterns
 115. Asks Claude to analyze potential duplicates (magnets + search results)
 126. Posts a comment on the issue if high-confidence duplicates are found
 13
 14Requires:
 15    requests (pip install requests)
 16
 17Usage:
 18    python github-check-new-issue-for-duplicates.py <issue_number>
 19
 20Environment variables:
 21    GITHUB_TOKEN       - GitHub token (org members: read, issues: read & write)
 22    ANTHROPIC_API_KEY  - Anthropic API key for Claude
 23
 24"""
 25
 26import argparse
 27import json
 28import os
 29import re
 30import sys
 31from datetime import datetime, timedelta
 32
 33import requests
 34
# GitHub / repository configuration.
GITHUB_API = "https://api.github.com"
REPO_OWNER = "zed-industries"
REPO_NAME = "zed"
# Tracking issue whose body lists known "duplicate magnets" (frequently-duplicated issues).
TRACKING_ISSUE_NUMBER = 46355
# Issues authored by active members of this team are skipped entirely.
STAFF_TEAM_SLUG = "staff"

# area prefixes to collapse in taxonomy (show summary instead of all sub-labels)
PREFIXES_TO_COLLAPSE = ["languages", "parity", "tooling"]

# stopwords to filter from title keyword searches (short words handled by len > 2 filter)
STOPWORDS = {
    "after", "all", "also", "and", "any", "but", "can't", "does", "doesn't",
    "don't", "for", "from", "have", "just", "not", "only", "some", "that",
    "the", "this", "when", "while", "with", "won't", "work", "working", "zed",
}
 50
 51
 52def log(message):
 53    """Print to stderr so it doesn't interfere with JSON output on stdout."""
 54    print(message, file=sys.stderr)
 55
 56
 57def github_api_get(path, params=None):
 58    """Fetch JSON from the GitHub API. Raises on non-2xx status."""
 59    url = f"{GITHUB_API}/{path.lstrip('/')}"
 60    response = requests.get(url, headers=GITHUB_HEADERS, params=params)
 61    response.raise_for_status()
 62    return response.json()
 63
 64
 65def github_search_issues(query, per_page=15):
 66    """Search issues, returning most recently created first."""
 67    params = {"q": query, "sort": "created", "order": "desc", "per_page": per_page}
 68    return github_api_get("/search/issues", params).get("items", [])
 69
 70
 71def check_team_membership(org, team_slug, username):
 72    """Check if user is an active member of a team."""
 73    try:
 74        data = github_api_get(f"/orgs/{org}/teams/{team_slug}/memberships/{username}")
 75        return data.get("state") == "active"
 76    except requests.HTTPError as e:
 77        if e.response.status_code == 404:
 78            return False
 79        raise
 80
 81
 82def post_comment(issue_number: int, body):
 83    url = f"{GITHUB_API.rstrip('/')}/repos/{REPO_OWNER}/{REPO_NAME}/issues/{issue_number}/comments"
 84    response = requests.post(url, headers=GITHUB_HEADERS, json={"body": body})
 85    response.raise_for_status()
 86    log(f"  Posted comment on #{issue_number}")
 87
 88
 89def build_duplicate_comment(matches):
 90    """Build the comment body for potential duplicates."""
 91    match_list = "\n".join(f"- #{m['number']}" for m in matches)
 92    explanations = "\n\n".join(
 93        f"**#{m['number']}:** {m['explanation']}\n\n**Shared root cause:** {m['shared_root_cause']}"
 94        if m.get('shared_root_cause')
 95        else f"**#{m['number']}:** {m['explanation']}"
 96        for m in matches
 97    )
 98
 99    return f"""This issue appears to be a duplicate of:
100
101{match_list}
102
103**If this is indeed a duplicate:**
104Please close this issue and subscribe to the linked issue for updates (select "Close as not planned""Duplicate")
105
106**If this is a different issue:**
107No action needed. A maintainer will review this shortly.
108
109<details>
110<summary>Why were these issues selected?</summary>
111
112{explanations}
113
114</details>
115
116---
117<sub>This is an automated analysis and might be incorrect.</sub>"""
118
119
def call_claude(api_key, system, user_content, max_tokens=1024):
    """Send one user message to Claude and return the reply text.

    Raises requests.HTTPError on a non-2xx API response. Returns "" when
    the first content block is missing or not a text block.
    """
    request_headers = {
        "x-api-key": api_key,
        "anthropic-version": "2023-06-01",
        "content-type": "application/json",
    }
    payload = {
        "model": "claude-sonnet-4-20250514",
        "max_tokens": max_tokens,
        "temperature": 0.0,
        "system": system,
        "messages": [{"role": "user", "content": user_content}],
    }
    response = requests.post(
        "https://api.anthropic.com/v1/messages",
        headers=request_headers,
        json=payload,
    )
    response.raise_for_status()
    data = response.json()

    usage = data.get("usage", {})
    log(f"  Token usage - Input: {usage.get('input_tokens', 'N/A')}, Output: {usage.get('output_tokens', 'N/A')}")

    blocks = data.get("content", [])
    if not blocks or blocks[0].get("type") != "text":
        return ""
    return blocks[0].get("text") or ""
147
148
def fetch_issue(issue_number: int):
    """Fetch an issue and return it as a trimmed dict.

    Keys: number, title, body (never None), author login (may be ""),
    and type name (e.g. "Bug"/"Crash", or None when untyped).
    """
    log(f"Fetching issue #{issue_number}")

    raw = github_api_get(f"/repos/{REPO_OWNER}/{REPO_NAME}/issues/{issue_number}")
    user = raw.get("user") or {}
    issue_type = raw.get("type") or {}
    issue = {
        "number": issue_number,
        "title": raw["title"],
        "body": raw.get("body") or "",
        "author": user.get("login") or "",
        "type": issue_type.get("name"),
    }

    log(f"  Title: {issue['title']}\n  Type: {issue['type']}\n  Author: {issue['author']}")
    return issue
164
165
def should_skip(issue):
    """Return True when the issue is not eligible for duplicate detection.

    Skips anything that is not a Bug/Crash report, and anything authored
    by a member of the staff team.
    """
    issue_type = issue["type"]
    if issue_type not in ("Bug", "Crash"):
        log(f"  Skipping: issue type '{issue_type}' is not a bug/crash report")
        return True

    author = issue["author"]
    if author and check_team_membership(REPO_OWNER, STAFF_TEAM_SLUG, author):
        log(f"  Skipping: author '{author}' is a {STAFF_TEAM_SLUG} member")
        return True

    return False
177
178
def fetch_area_labels():
    """Fetch area:* labels from the repository. Returns list of {name, description} dicts."""
    log("Fetching area labels")

    all_labels = []
    page = 1
    # paginate until GitHub returns an empty page
    while True:
        batch = github_api_get(
            f"/repos/{REPO_OWNER}/{REPO_NAME}/labels",
            params={"per_page": 100, "page": page},
        )
        if not batch:
            break
        all_labels.extend(batch)
        page += 1

    area_labels = []
    for label in all_labels:
        full_name = label["name"]
        if not full_name.startswith("area:"):
            continue
        # drop the "area:" prefix (5 chars)
        area_labels.append({
            "name": full_name[5:],
            "description": label.get("description") or "",
        })

    log(f"  Found {len(area_labels)} area labels")
    return area_labels
201
202
def format_taxonomy_for_claude(area_labels):
    """Format area labels into a string for Claude, collapsing certain prefixes.

    Labels under a PREFIXES_TO_COLLAPSE prefix are merged into a single
    "prefix/*" summary line; the result is sorted and deduplicated.
    """
    entries = set()

    for area in area_labels:
        name = area["name"]
        prefix = None
        for candidate in PREFIXES_TO_COLLAPSE:
            if name.startswith(candidate + "/"):
                prefix = candidate
                break

        if prefix is not None:
            entries.add(f"- {prefix}/* (multiple specific sub-labels exist)")
        elif area["description"]:
            entries.add(f"- {name}: {area['description']}")
        else:
            entries.add(f"- {name}")

    return "\n".join(sorted(entries))
219
220
def detect_areas(anthropic_key, issue, taxonomy):
    """Use Claude to detect relevant areas for the issue.

    Returns a list of area names (the prompt caps it at 3), or [] when
    Claude answers "none".
    """
    log("Detecting areas with Claude")

    system_prompt = """You analyze GitHub issues to identify which area labels apply.

Given an issue and a taxonomy of areas, output ONLY a comma-separated list of matching area names.
- Output at most 3 areas, ranked by relevance
- Use exact area names from the taxonomy
- If no areas clearly match, output: none
- For languages/*, tooling/*, or parity/*, use the specific sub-label (e.g., "languages/rust",
tooling/eslint, parity/vscode)

Example outputs:
- "editor, parity/vim"
- "ai, ai/agent panel"
- "none"
"""

    user_content = f"""## Area Taxonomy
{taxonomy}

# Issue Title
{issue['title']}

# Issue Body
{issue['body'][:4000]}"""

    response = call_claude(anthropic_key, system_prompt, user_content, max_tokens=100).strip()
    log(f"  Detected areas: {response}")

    if response.lower() == "none":
        return []
    # Drop empty entries: a trailing comma or blank reply would otherwise
    # yield "" areas, which later produce broken label:"area:" search queries.
    return [area.strip() for area in response.split(",") if area.strip()]
255
256
def parse_duplicate_magnets():
    """Parse known duplicate magnets from tracking issue #46355.

    Returns a list of magnets sorted by duplicate count (most duplicated first).
    Magnets only have number, areas, and dupe_count — use enrich_magnets() to fetch
    title and body_preview for the ones you need.
    """
    log(f"Parsing duplicate magnets from #{TRACKING_ISSUE_NUMBER}")

    tracking = github_api_get(f"/repos/{REPO_OWNER}/{REPO_NAME}/issues/{TRACKING_ISSUE_NUMBER}")
    body = tracking.get("body") or ""

    # Body format:
    #   ## area_name
    #   -   [N dupes] https://github.com/zed-industries/zed/issues/NUMBER
    magnets = {}  # issue number -> {number, areas, dupe_count}
    current_area = None

    for line in body.split("\n"):
        if line.startswith("## "):
            current_area = line[3:].strip()
            continue

        is_magnet_line = current_area and line.startswith("-") and "/issues/" in line
        if not is_magnet_line:
            continue

        # extract "[N dupes]" count and the trailing issue number
        try:
            dupe_count = int(line.split("[")[1].split()[0])
            number = int(line.split("/issues/")[1].split()[0].rstrip(")"))
        except (ValueError, IndexError):
            continue

        # "(unlabeled)" magnets carry no areas so they match every issue
        unlabeled = current_area == "(unlabeled)"

        existing = magnets.get(number)
        if existing is None:
            magnets[number] = {
                "number": number,
                "areas": [] if unlabeled else [current_area],
                "dupe_count": dupe_count,
            }
        elif not unlabeled:
            existing["areas"].append(current_area)

    ordered = sorted(magnets.values(), key=lambda m: m["dupe_count"], reverse=True)
    log(f"  Parsed {len(ordered)} duplicate magnets")
    return ordered
307
308
def enrich_magnets(magnets):
    """Fetch title and body_preview for each magnet (mutates the dicts in place)."""
    log(f"  Fetching details for {len(magnets)} magnets")
    for magnet in magnets:
        details = github_api_get(f"/repos/{REPO_OWNER}/{REPO_NAME}/issues/{magnet['number']}")
        body = details.get("body") or ""
        magnet["title"] = details["title"]
        magnet["body_preview"] = body[:1000]
316
317
def areas_match(detected, magnet_area):
    """Check if detected area matches magnet area. Matches broadly across hierarchy levels."""
    if detected == magnet_area:
        return True
    # parent/child match in either direction, e.g. "languages" vs "languages/rust"
    return magnet_area.startswith(detected + "/") or detected.startswith(magnet_area + "/")
325
326
def filter_magnets_by_areas(magnets, detected_areas):
    """Filter magnets based on detected areas.

    With no detected areas every magnet is kept. Magnets with an empty
    area list (unlabeled) always match; otherwise a magnet is kept when
    any of its areas matches any detected area at any hierarchy level.
    """
    if not detected_areas:
        return magnets

    def hierarchy_match(a, b):
        # same area, or parent/child in either direction (inlined areas_match logic)
        return a == b or b.startswith(a + "/") or a.startswith(b + "/")

    kept = []
    for magnet in magnets:
        magnet_areas = magnet["areas"]
        if not magnet_areas:
            kept.append(magnet)
            continue
        if any(
            hierarchy_match(detected, magnet_area)
            for detected in detected_areas
            for magnet_area in magnet_areas
        ):
            kept.append(magnet)
    return kept
345
346
def search_for_similar_issues(issue, detected_areas, max_searches=6):
    """Search for similar issues that might be duplicates.

    Searches by title keywords, area labels (last 60 days), and error patterns.
    max_searches caps the total number of queries to keep token usage and context size under control.

    Returns a deduplicated list of dicts (number/title/state/created_at/
    body_preview/source), excluding the issue being analyzed.
    """
    log("Searching for similar issues")

    sixty_days_ago = (datetime.now() - timedelta(days=60)).strftime("%Y-%m-%d")
    base_query = f"repo:{REPO_OWNER}/{REPO_NAME} is:issue is:open"
    seen_issues = {}
    queries = []

    # title keywords: drop stopwords and words of 2 chars or fewer
    title_keywords = [word for word in issue["title"].split() if word.lower() not in STOPWORDS and len(word) > 2]

    if title_keywords:
        keywords_query = " ".join(title_keywords)
        queries.append(("title_keywords", f"{base_query} {keywords_query}"))

    for area in detected_areas:
        queries.append(("area_label", f'{base_query} label:"area:{area}" created:>{sixty_days_ago}'))

    # error pattern search: capture 5–90 chars after keyword, colon optional
    error_pattern = r"(?i:\b(?:error|panicked|panic|failed)\b)\s*([^\n]{5,90})"
    match = re.search(error_pattern, issue["body"])
    if match:
        # Strip embedded double quotes: a quote inside the snippet would close
        # the quoted in:body phrase early and corrupt the search query syntax.
        error_snippet = match.group(1).strip().replace('"', "")
        if error_snippet:
            queries.append(("error_pattern", f'{base_query} in:body "{error_snippet}"'))

    for search_type, query in queries[:max_searches]:
        log(f"  Search ({search_type}): {query}")
        try:
            results = github_search_issues(query, per_page=15)
        except requests.RequestException as e:
            # best-effort: one failed search should not abort the whole run
            log(f"  Search failed: {e}")
            continue
        for item in results:
            number = item["number"]
            # skip the issue under analysis and anything already collected
            if number == issue["number"] or number in seen_issues:
                continue
            body = item.get("body") or ""
            seen_issues[number] = {
                "number": number,
                "title": item["title"],
                "state": item.get("state", ""),
                "created_at": item.get("created_at", ""),
                "body_preview": body[:1000],
                "source": search_type,
            }

    similar_issues = list(seen_issues.values())
    log(f"  Found {len(similar_issues)} similar issues")
    return similar_issues
398
399
def analyze_duplicates(anthropic_key, issue, magnets, search_results):
    """Use Claude to analyze potential duplicates.

    Sends up to 10 known duplicate magnets plus up to 10 search results
    (magnets take precedence on overlap) and returns (matches, summary).
    Returns ([], reason) when there are no candidates or the model's JSON
    cannot be parsed.
    """
    log("Analyzing duplicates with Claude")

    # only enrich the magnets we actually send (one API call each)
    top_magnets = magnets[:10]
    enrich_magnets(top_magnets)
    magnet_numbers = {m["number"] for m in top_magnets}

    candidates = [
        {"number": m["number"], "title": m["title"], "body_preview": m["body_preview"], "source": "known_duplicate_magnet"}
        for m in top_magnets
    ] + [
        {"number": r["number"], "title": r["title"], "body_preview": r["body_preview"], "source": "search_result"}
        for r in search_results[:10]
        if r["number"] not in magnet_numbers
    ]

    if not candidates:
        return [], "No candidates to analyze"

    system_prompt = """You analyze GitHub issues to identify potential duplicates.

Given a new issue and a list of existing issues, identify which existing issues are duplicates — meaning
they are caused by the SAME BUG in the code, not just similar symptoms.

CRITICAL DISTINCTION — shared symptoms vs shared root cause:
- "models missing", "can't sign in", "editor hangs", "venv not detected" are SYMPTOMS that many
  different bugs can produce. Two reports of the same symptom are NOT duplicates unless you can
  identify a specific shared root cause.
- A duplicate means: if a developer fixed the existing issue, the new issue would also be fixed.
- If the issues just happen to be in the same feature area, or describe similar-sounding problems
  with different specifics (different error messages, different triggers, different platforms, different
  configurations), they are NOT duplicates.

For each potential duplicate, assess confidence:
- "high": Almost certainly the same bug. You can name a specific shared root cause, and the
  reproduction steps / error messages / triggers are consistent.
- "medium": Likely the same bug based on specific technical details, but some uncertainty remains.
- Do NOT include issues that merely share symptoms, affect the same feature area, or sound similar
  at a surface level.

Examples of things that are NOT duplicates:
- Two issues about "Copilot models not showing" — one caused by a Zed update breaking the model list,
  the other caused by the user's plan not including those models.
- Two issues about "Zed hangs" — one triggered by network drives, the other by large projects.
- Two issues about "can't sign in" — one caused by a missing system package, the other by a server-side error.

Output only valid JSON (no markdown code blocks) with this structure:
{
  "matches": [
    {
      "number": 12345,
      "confidence": "high|medium",
      "shared_root_cause": "The specific bug/root cause shared by both issues",
      "explanation": "Brief explanation with concrete evidence from both issues"
    }
  ],
  "summary": "One sentence summary of findings"
}

When in doubt, return an empty matches array. A false positive (flagging a non-duplicate) is much
worse than a false negative (missing a real duplicate), because it wastes the time of both the
issue author and the maintainers.

Return empty matches array if none found or if you can only identify shared symptoms."""

    user_content = f"""## New Issue #{issue['number']}
**Title:** {issue['title']}

**Body:**
{issue['body'][:3000]}

## Existing Issues to Compare
{json.dumps(candidates, indent=2)}"""

    response = call_claude(anthropic_key, system_prompt, user_content, max_tokens=2048)

    # The prompt forbids markdown fences, but strip one defensively in case
    # the model wraps its JSON anyway — otherwise json.loads would fail.
    cleaned = response.strip()
    if cleaned.startswith("```"):
        cleaned = re.sub(r"^```[a-zA-Z]*\s*", "", cleaned)
        cleaned = re.sub(r"\s*```$", "", cleaned)

    try:
        data = json.loads(cleaned)
    except json.JSONDecodeError as e:
        log(f"  Failed to parse response: {e}")
        log(f"  Raw response: {response}")
        return [], "Failed to parse analysis"

    matches = data.get("matches", [])
    summary = data.get("summary", "Analysis complete")
    log(f"  Found {len(matches)} potential matches")
    return matches, summary
488
489
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Identify potential duplicate issues")
    parser.add_argument("issue_number", type=int, help="Issue number to analyze")
    parser.add_argument("--dry-run", action="store_true", help="Skip posting comment, just log what would be posted")
    args = parser.parse_args()

    github_token = os.environ.get("GITHUB_TOKEN")
    anthropic_key = os.environ.get("ANTHROPIC_API_KEY")

    if not github_token:
        log("Error: GITHUB_TOKEN not set")
        sys.exit(1)
    if not anthropic_key:
        log("Error: ANTHROPIC_API_KEY not set")
        sys.exit(1)

    # Defined here but read as a module global by every github_* helper above.
    GITHUB_HEADERS = {
        "Authorization": f"Bearer {github_token}",
        "Accept": "application/vnd.github+json",
        "X-GitHub-Api-Version": "2022-11-28",
    }

    issue = fetch_issue(args.issue_number)
    if should_skip(issue):
        print(json.dumps({"skipped": True}))
        sys.exit(0)

    # detect areas
    taxonomy = format_taxonomy_for_claude(fetch_area_labels())
    detected_areas = detect_areas(anthropic_key, issue, taxonomy)

    # search for potential duplicates
    all_magnets = parse_duplicate_magnets()
    relevant_magnets = filter_magnets_by_areas(all_magnets, detected_areas)
    search_results = search_for_similar_issues(issue, detected_areas)

    # analyze potential duplicates
    if relevant_magnets or search_results:
        matches, summary = analyze_duplicates(anthropic_key, issue, relevant_magnets, search_results)
    else:
        matches, summary = [], "No potential duplicates to analyze"

    # Post comment if high-confidence matches found. Use .get(): the match
    # dicts come from model-generated JSON and may omit the "confidence" key.
    high_confidence_matches = [m for m in matches if m.get("confidence") == "high"]
    commented = False

    if high_confidence_matches:
        comment_body = build_duplicate_comment(high_confidence_matches)
        if args.dry_run:
            log("Dry run - would post comment:\n" + "-" * 40 + "\n" + comment_body + "\n" + "-" * 40)
        else:
            log("Posting comment for high-confidence match(es)")
            try:
                post_comment(issue["number"], comment_body)
                commented = True
            except requests.RequestException as e:
                # a failed comment still leaves a usable JSON result below
                log(f"  Failed to post comment: {e}")

    # machine-readable result on stdout (all logging goes to stderr)
    print(json.dumps({
        "skipped": False,
        "issue": {
            "number": issue["number"],
            "title": issue["title"],
            "author": issue["author"],
            "type": issue["type"],
        },
        "detected_areas": detected_areas,
        "magnets_count": len(relevant_magnets),
        "search_results_count": len(search_results),
        "matches": matches,
        "summary": summary,
        "commented": commented,
    }))