1#!/usr/bin/env python3
2"""
3Comment on newly opened issues that might be duplicates of an existing issue.
4
5This script is run by a GitHub Actions workflow when a new bug or crash report
6is opened. It:
71. Checks eligibility (must be bug/crash type, non-staff author)
82. Detects relevant areas using Claude + the area label taxonomy
93. Parses known "duplicate magnets" from tracking issue #46355
104. Searches for similar recent issues by title keywords, area labels, and error patterns
115. Asks Claude to analyze potential duplicates (magnets + search results)
126. Posts a comment on the issue if high-confidence duplicates are found
13
14Requires:
15 requests (pip install requests)
16
17Usage:
18 python github-check-new-issue-for-duplicates.py <issue_number>
19
20Environment variables:
21 GITHUB_TOKEN - GitHub token (org members: read, issues: read & write)
22 ANTHROPIC_API_KEY - Anthropic API key for Claude
23
24"""
25
26import argparse
27import json
28import os
29import re
30import sys
31from datetime import datetime, timedelta
32
33import requests
34
# GitHub API endpoint and the repository all requests below target.
GITHUB_API = "https://api.github.com"
REPO_OWNER = "zed-industries"
REPO_NAME = "zed"
# Issue whose body lists known "duplicate magnets" (parsed by parse_duplicate_magnets()).
TRACKING_ISSUE_NUMBER = 46355
# Org team slug checked in should_skip(): issues authored by members are skipped.
STAFF_TEAM_SLUG = "staff"

# area prefixes to collapse in taxonomy (show summary instead of all sub-labels)
PREFIXES_TO_COLLAPSE = ["languages", "parity", "tooling"]

# stopwords to filter from title keyword searches (short words handled by len > 2 filter)
STOPWORDS = {
    "after", "all", "also", "and", "any", "but", "can't", "does", "doesn't",
    "don't", "for", "from", "have", "just", "not", "only", "some", "that",
    "the", "this", "when", "while", "with", "won't", "work", "working", "zed",
}
50
51
def log(message):
    """Write a diagnostic message to stderr (stdout is reserved for JSON output)."""
    sys.stderr.write(f"{message}\n")
55
56
def github_api_get(path, params=None):
    """Fetch JSON from the GitHub API. Raises on non-2xx status or timeout.

    Args:
        path: API path, with or without a leading slash.
        params: optional dict of query-string parameters.
    """
    url = f"{GITHUB_API}/{path.lstrip('/')}"
    # timeout: a request without one can hang forever and stall the whole workflow run
    response = requests.get(url, headers=GITHUB_HEADERS, params=params, timeout=30)
    response.raise_for_status()
    return response.json()
63
64
def github_search_issues(query, per_page=15):
    """Search issues, returning most recently created first."""
    search_params = {
        "q": query,
        "sort": "created",
        "order": "desc",
        "per_page": per_page,
    }
    result = github_api_get("/search/issues", search_params)
    return result.get("items", [])
69
70
def check_team_membership(org, team_slug, username):
    """Return True iff `username` is an active member of the given org team."""
    membership_path = f"/orgs/{org}/teams/{team_slug}/memberships/{username}"
    try:
        membership = github_api_get(membership_path)
    except requests.HTTPError as err:
        # 404 simply means "not a member"; any other HTTP error is a real failure
        if err.response.status_code == 404:
            return False
        raise
    return membership.get("state") == "active"
80
81
def post_comment(issue_number: int, body):
    """Post `body` as a comment on the given issue. Raises on non-2xx status or timeout."""
    url = f"{GITHUB_API.rstrip('/')}/repos/{REPO_OWNER}/{REPO_NAME}/issues/{issue_number}/comments"
    # timeout: a request without one can hang forever and stall the whole workflow run
    response = requests.post(url, headers=GITHUB_HEADERS, json={"body": body}, timeout=30)
    response.raise_for_status()
    log(f" Posted comment on #{issue_number}")
87
88
def build_duplicate_comment(matches):
    """Render the comment body listing likely-duplicate issues with explanations."""
    bullet_lines = []
    detail_blocks = []
    for match in matches:
        bullet_lines.append(f"- #{match['number']}")
        detail = f"**#{match['number']}:** {match['explanation']}"
        root_cause = match.get('shared_root_cause')
        if root_cause:
            detail += f"\n\n**Shared root cause:** {root_cause}"
        detail_blocks.append(detail)

    match_list = "\n".join(bullet_lines)
    explanations = "\n\n".join(detail_blocks)

    return f"""This issue appears to be a duplicate of:

{match_list}

**If this is indeed a duplicate:**
Please close this issue and subscribe to the linked issue for updates (select "Close as not planned" → "Duplicate")

**If this is a different issue:**
No action needed. A maintainer will review this shortly.

<details>
<summary>Why were these issues selected?</summary>

{explanations}

</details>

---
<sub>This is an automated analysis and might be incorrect.</sub>"""
118
119
def call_claude(api_key, system, user_content, max_tokens=1024):
    """Send a message to Claude and return the text response. Raises on non-2xx status or timeout.

    Args:
        api_key: Anthropic API key.
        system: system prompt string.
        user_content: content for the single user message.
        max_tokens: cap on response tokens.
    """
    response = requests.post(
        "https://api.anthropic.com/v1/messages",
        headers={
            "x-api-key": api_key,
            "anthropic-version": "2023-06-01",
            "content-type": "application/json",
        },
        json={
            "model": "claude-sonnet-4-20250514",
            "max_tokens": max_tokens,
            # temperature 0 for reproducible, parseable output
            "temperature": 0.0,
            "system": system,
            "messages": [{"role": "user", "content": user_content}],
        },
        # LLM responses can be slow; bound the wait so a stalled request can't hang the job
        timeout=120,
    )
    response.raise_for_status()
    data = response.json()

    usage = data.get("usage", {})
    log(f" Token usage - Input: {usage.get('input_tokens', 'N/A')}, Output: {usage.get('output_tokens', 'N/A')}")

    # the API returns a list of content blocks; we only use a leading text block
    content = data.get("content", [])
    if content and content[0].get("type") == "text":
        return content[0].get("text") or ""
    return ""
147
148
def fetch_issue(issue_number: int):
    """Fetch an issue from GitHub and normalize it into a plain dict."""
    log(f"Fetching issue #{issue_number}")

    data = github_api_get(f"/repos/{REPO_OWNER}/{REPO_NAME}/issues/{issue_number}")
    user = data.get("user") or {}
    issue_type = data.get("type") or {}
    issue = {
        "number": issue_number,
        "title": data["title"],
        "body": data.get("body") or "",
        "author": user.get("login") or "",
        "type": issue_type.get("name"),
    }

    log(f" Title: {issue['title']}\n Type: {issue['type']}\n Author: {issue['author']}")
    return issue
164
165
def should_skip(issue):
    """Check if issue should be skipped in duplicate detection process."""
    issue_type = issue["type"]
    if issue_type != "Bug" and issue_type != "Crash":
        log(f" Skipping: issue type '{issue_type}' is not a bug/crash report")
        return True

    author = issue["author"]
    # unauthenticated/ghost authors ("" login) are never treated as staff
    if author and check_team_membership(REPO_OWNER, STAFF_TEAM_SLUG, author):
        log(f" Skipping: author '{author}' is a {STAFF_TEAM_SLUG} member")
        return True

    return False
177
178
def fetch_area_labels():
    """Fetch area:* labels from the repository. Returns list of {name, description} dicts."""
    log("Fetching area labels")

    all_labels = []
    page = 1
    while True:
        batch = github_api_get(
            f"/repos/{REPO_OWNER}/{REPO_NAME}/labels",
            params={"per_page": 100, "page": page},
        )
        if not batch:
            break
        all_labels.extend(batch)
        page += 1

    area_labels = []
    for label in all_labels:
        full_name = label["name"]
        if full_name.startswith("area:"):
            area_labels.append({
                # drop the leading "area:" prefix (5 characters)
                "name": full_name[5:],
                "description": label.get("description") or "",
            })

    log(f" Found {len(area_labels)} area labels")
    return area_labels
201
202
def format_taxonomy_for_claude(area_labels):
    """Format area labels into a string for Claude, collapsing certain prefixes."""
    entries = set()

    for area in area_labels:
        name = area["name"]
        matched_prefix = None
        for candidate in PREFIXES_TO_COLLAPSE:
            if name.startswith(f"{candidate}/"):
                matched_prefix = candidate
                break

        if matched_prefix is not None:
            # collapse e.g. all languages/* labels into a single summary line
            entries.add(f"- {matched_prefix}/* (multiple specific sub-labels exist)")
        elif area["description"]:
            entries.add(f"- {name}: {area['description']}")
        else:
            entries.add(f"- {name}")

    return "\n".join(sorted(entries))
219
220
def detect_areas(anthropic_key, issue, taxonomy):
    """Use Claude to detect relevant areas for the issue.

    Returns a list of area names (possibly empty). The body is truncated to
    4000 chars to bound token usage.
    """
    log("Detecting areas with Claude")

    system_prompt = """You analyze GitHub issues to identify which area labels apply.

Given an issue and a taxonomy of areas, output ONLY a comma-separated list of matching area names.
- Output at most 3 areas, ranked by relevance
- Use exact area names from the taxonomy
- If no areas clearly match, output: none
- For languages/*, tooling/*, or parity/*, use the specific sub-label (e.g., "languages/rust",
tooling/eslint, parity/vscode)

Example outputs:
- "editor, parity/vim"
- "ai, ai/agent panel"
- "none"
"""

    user_content = f"""## Area Taxonomy
{taxonomy}

# Issue Title
{issue['title']}

# Issue Body
{issue['body'][:4000]}"""

    response = call_claude(anthropic_key, system_prompt, user_content, max_tokens=100).strip()
    log(f" Detected areas: {response}")

    if response.lower() == "none":
        return []
    # filter empty segments: an empty response or a trailing comma would otherwise
    # yield "" entries that produce bogus label searches downstream
    return [area.strip() for area in response.split(",") if area.strip()]
255
256
def parse_duplicate_magnets():
    """Parse known duplicate magnets from tracking issue #46355.

    Returns a list of magnets sorted by duplicate count (most duplicated first).
    Magnets only have number, areas, and dupe_count — use enrich_magnets() to fetch
    title and body_preview for the ones you need.
    """
    log(f"Parsing duplicate magnets from #{TRACKING_ISSUE_NUMBER}")

    tracking = github_api_get(f"/repos/{REPO_OWNER}/{REPO_NAME}/issues/{TRACKING_ISSUE_NUMBER}")
    body_text = tracking.get("body") or ""

    # body format: ## area_name
    #              - [N dupes] https://github.com/zed-industries/zed/issues/NUMBER
    magnets_by_number = {}  # issue number -> {number, areas, dupe_count}
    current_area = None

    for raw_line in body_text.split("\n"):
        # an "## " header switches the area that following bullets belong to
        if raw_line.startswith("## "):
            current_area = raw_line[3:].strip()
            continue

        is_magnet_line = current_area and raw_line.startswith("-") and "/issues/" in raw_line
        if not is_magnet_line:
            continue

        # extract: - [N dupes] https://github.com/.../issues/NUMBER
        try:
            dupe_count = int(raw_line.split("[")[1].split()[0])
            number = int(raw_line.split("/issues/")[1].split()[0].rstrip(")"))
        except (ValueError, IndexError):
            continue

        # "(unlabeled)" magnets carry no areas so they match everything downstream
        unlabeled = current_area == "(unlabeled)"

        existing = magnets_by_number.get(number)
        if existing is not None:
            if not unlabeled:
                existing["areas"].append(current_area)
        else:
            magnets_by_number[number] = {
                "number": number,
                "areas": [] if unlabeled else [current_area],
                "dupe_count": dupe_count,
            }

    ordered = sorted(magnets_by_number.values(), key=lambda m: m["dupe_count"], reverse=True)
    log(f" Parsed {len(ordered)} duplicate magnets")
    return ordered
307
308
def enrich_magnets(magnets):
    """Fetch title and body_preview for magnets from the API (mutates in place)."""
    log(f" Fetching details for {len(magnets)} magnets")
    for magnet in magnets:
        issue_data = github_api_get(
            f"/repos/{REPO_OWNER}/{REPO_NAME}/issues/{magnet['number']}")
        magnet["title"] = issue_data["title"]
        full_body = issue_data.get("body") or ""
        magnet["body_preview"] = full_body[:1000]
316
317
def areas_match(detected, magnet_area):
    """Check if detected area matches magnet area. Matches broadly across hierarchy levels."""
    if detected == magnet_area:
        return True
    # parent/child match across the "/" hierarchy, in either direction
    return magnet_area.startswith(detected + "/") or detected.startswith(magnet_area + "/")
325
326
def filter_magnets_by_areas(magnets, detected_areas):
    """Filter magnets based on detected areas."""
    if not detected_areas:
        return magnets

    def hierarchy_match(a, b):
        # same area, or parent/child across the "/" hierarchy in either direction
        return a == b or b.startswith(a + "/") or a.startswith(b + "/")

    kept = []
    for magnet in magnets:
        magnet_areas = magnet["areas"]
        # unlabeled magnets (empty areas) match everything
        if not magnet_areas:
            kept.append(magnet)
            continue
        if any(hierarchy_match(detected, area)
               for detected in detected_areas
               for area in magnet_areas):
            kept.append(magnet)
    return kept
345
346
def search_for_similar_issues(issue, detected_areas, max_searches=6):
    """Search for similar issues that might be duplicates.

    Searches by title keywords, area labels (last 60 days), and error patterns.
    max_searches caps the total number of queries to keep token usage and context size under control.
    """
    log("Searching for similar issues")

    cutoff_date = (datetime.now() - timedelta(days=60)).strftime("%Y-%m-%d")
    base_query = f"repo:{REPO_OWNER}/{REPO_NAME} is:issue is:open"

    planned_queries = []

    # keyword search: drop stopwords and very short words from the title
    meaningful_words = []
    for word in issue["title"].split():
        if word.lower() not in STOPWORDS and len(word) > 2:
            meaningful_words.append(word)
    if meaningful_words:
        planned_queries.append(("title_keywords", f"{base_query} {' '.join(meaningful_words)}"))

    for area in detected_areas:
        planned_queries.append(
            ("area_label", f'{base_query} label:"area:{area}" created:>{cutoff_date}'))

    # error pattern search: capture 5–90 chars after keyword, colon optional
    error_match = re.search(
        r"(?i:\b(?:error|panicked|panic|failed)\b)\s*([^\n]{5,90})", issue["body"])
    if error_match:
        snippet = error_match.group(1).strip()
        planned_queries.append(("error_pattern", f'{base_query} in:body "{snippet}"'))

    found = {}  # issue number -> summary dict; first search to find an issue wins
    for search_type, query in planned_queries[:max_searches]:
        log(f" Search ({search_type}): {query}")
        try:
            items = github_search_issues(query, per_page=15)
        except requests.RequestException as e:
            log(f" Search failed: {e}")
            continue
        for item in items:
            number = item["number"]
            if number == issue["number"] or number in found:
                continue
            found[number] = {
                "number": number,
                "title": item["title"],
                "state": item.get("state", ""),
                "created_at": item.get("created_at", ""),
                "body_preview": (item.get("body") or "")[:1000],
                "source": search_type,
            }

    results = list(found.values())
    log(f" Found {len(results)} similar issues")
    return results
398
399
def analyze_duplicates(anthropic_key, issue, magnets, search_results):
    """Use Claude to analyze potential duplicates.

    Returns (matches, summary): matches is a list of dicts with number,
    confidence, explanation and optionally shared_root_cause.
    """
    log("Analyzing duplicates with Claude")

    # limit candidates (top 10 magnets + top 10 search results) to bound token usage
    top_magnets = magnets[:10]
    enrich_magnets(top_magnets)
    magnet_numbers = {m["number"] for m in top_magnets}

    candidates = [
        {"number": m["number"], "title": m["title"], "body_preview": m["body_preview"], "source": "known_duplicate_magnet"}
        for m in top_magnets
    ] + [
        {"number": r["number"], "title": r["title"], "body_preview": r["body_preview"], "source": "search_result"}
        for r in search_results[:10]
        if r["number"] not in magnet_numbers
    ]

    if not candidates:
        return [], "No candidates to analyze"

    system_prompt = """You analyze GitHub issues to identify potential duplicates.

Given a new issue and a list of existing issues, identify which existing issues are duplicates — meaning
they are caused by the SAME BUG in the code, not just similar symptoms.

CRITICAL DISTINCTION — shared symptoms vs shared root cause:
- "models missing", "can't sign in", "editor hangs", "venv not detected" are SYMPTOMS that many
  different bugs can produce. Two reports of the same symptom are NOT duplicates unless you can
  identify a specific shared root cause.
- A duplicate means: if a developer fixed the existing issue, the new issue would also be fixed.
- If the issues just happen to be in the same feature area, or describe similar-sounding problems
  with different specifics (different error messages, different triggers, different platforms, different
  configurations), they are NOT duplicates.

For each potential duplicate, assess confidence:
- "high": Almost certainly the same bug. You can name a specific shared root cause, and the
  reproduction steps / error messages / triggers are consistent.
- "medium": Likely the same bug based on specific technical details, but some uncertainty remains.
- Do NOT include issues that merely share symptoms, affect the same feature area, or sound similar
  at a surface level.

Examples of things that are NOT duplicates:
- Two issues about "Copilot models not showing" — one caused by a Zed update breaking the model list,
  the other caused by the user's plan not including those models.
- Two issues about "Zed hangs" — one triggered by network drives, the other by large projects.
- Two issues about "can't sign in" — one caused by a missing system package, the other by a server-side error.

Output only valid JSON (no markdown code blocks) with this structure:
{
  "matches": [
    {
      "number": 12345,
      "confidence": "high|medium",
      "shared_root_cause": "The specific bug/root cause shared by both issues",
      "explanation": "Brief explanation with concrete evidence from both issues"
    }
  ],
  "summary": "One sentence summary of findings"
}

When in doubt, return an empty matches array. A false positive (flagging a non-duplicate) is much
worse than a false negative (missing a real duplicate), because it wastes the time of both the
issue author and the maintainers.

Return empty matches array if none found or if you can only identify shared symptoms."""

    user_content = f"""## New Issue #{issue['number']}
**Title:** {issue['title']}

**Body:**
{issue['body'][:3000]}

## Existing Issues to Compare
{json.dumps(candidates, indent=2)}"""

    response = call_claude(anthropic_key, system_prompt, user_content, max_tokens=2048).strip()

    # models sometimes wrap JSON in ```json fences despite the instructions; strip
    # them so a well-formed answer isn't discarded as a parse failure
    if response.startswith("```"):
        response = re.sub(r"^```[a-zA-Z]*\s*", "", response)
        response = re.sub(r"\s*```$", "", response)

    try:
        data = json.loads(response)
    except json.JSONDecodeError as e:
        log(f" Failed to parse response: {e}")
        log(f" Raw response: {response}")
        return [], "Failed to parse analysis"

    matches = data.get("matches", [])
    summary = data.get("summary", "Analysis complete")
    log(f" Found {len(matches)} potential matches")
    return matches, summary
488
489
490if __name__ == "__main__":
491 parser = argparse.ArgumentParser(description="Identify potential duplicate issues")
492 parser.add_argument("issue_number", type=int, help="Issue number to analyze")
493 parser.add_argument("--dry-run", action="store_true", help="Skip posting comment, just log what would be posted")
494 args = parser.parse_args()
495
496 github_token = os.environ.get("GITHUB_TOKEN")
497 anthropic_key = os.environ.get("ANTHROPIC_API_KEY")
498
499 if not github_token:
500 log("Error: GITHUB_TOKEN not set")
501 sys.exit(1)
502 if not anthropic_key:
503 log("Error: ANTHROPIC_API_KEY not set")
504 sys.exit(1)
505
506 GITHUB_HEADERS = {
507 "Authorization": f"Bearer {github_token}",
508 "Accept": "application/vnd.github+json",
509 "X-GitHub-Api-Version": "2022-11-28",
510 }
511
512 issue = fetch_issue(args.issue_number)
513 if should_skip(issue):
514 print(json.dumps({"skipped": True}))
515 sys.exit(0)
516
517 # detect areas
518 taxonomy = format_taxonomy_for_claude(fetch_area_labels())
519 detected_areas = detect_areas(anthropic_key, issue, taxonomy)
520
521 # search for potential duplicates
522 all_magnets = parse_duplicate_magnets()
523 relevant_magnets = filter_magnets_by_areas(all_magnets, detected_areas)
524 search_results = search_for_similar_issues(issue, detected_areas)
525
526 # analyze potential duplicates
527 if relevant_magnets or search_results:
528 matches, summary = analyze_duplicates(anthropic_key, issue, relevant_magnets, search_results)
529 else:
530 matches, summary = [], "No potential duplicates to analyze"
531
532 # post comment if high-confidence matches found
533 high_confidence_matches = [m for m in matches if m["confidence"] == "high"]
534 commented = False
535
536 if high_confidence_matches:
537 comment_body = build_duplicate_comment(high_confidence_matches)
538 if args.dry_run:
539 log("Dry run - would post comment:\n" + "-" * 40 + "\n" + comment_body + "\n" + "-" * 40)
540 else:
541 log("Posting comment for high-confidence match(es)")
542 try:
543 post_comment(issue["number"], comment_body)
544 commented = True
545 except requests.RequestException as e:
546 log(f" Failed to post comment: {e}")
547
548 print(json.dumps({
549 "skipped": False,
550 "issue": {
551 "number": issue["number"],
552 "title": issue["title"],
553 "author": issue["author"],
554 "type": issue["type"],
555 },
556 "detected_areas": detected_areas,
557 "magnets_count": len(relevant_magnets),
558 "search_results_count": len(search_results),
559 "matches": matches,
560 "summary": summary,
561 "commented": commented,
562 }))