1#!/usr/bin/env python3
2"""
3Comment on newly opened issues that might be duplicates of an existing issue.
4
5This script is run by a GitHub Actions workflow when a new bug or crash report
6is opened. It:
71. Checks eligibility (must be bug/crash type, non-staff author)
82. Detects relevant areas using Claude + the area label taxonomy
93. Parses known "duplicate magnets" from tracking issue #46355
104. Searches for similar recent issues by title keywords, area labels, and error patterns
115. Asks Claude to analyze potential duplicates (magnets + search results)
126. Posts a comment on the issue if high-confidence duplicates are found
13
14Requires:
15 requests (pip install requests)
16
17Usage:
18 python github-check-new-issue-for-duplicates.py <issue_number>
19
20Environment variables:
21 GITHUB_TOKEN - GitHub token (org members: read, issues: read & write)
22 ANTHROPIC_API_KEY - Anthropic API key for Claude
23
24"""
25
26import argparse
27import json
28import os
29import re
30import sys
31from datetime import datetime, timedelta
32
33import requests
34
# --- Repository / API configuration ---
GITHUB_API = "https://api.github.com"
REPO_OWNER = "zed-industries"
REPO_NAME = "zed"
# issue whose body lists known "duplicate magnets" (parsed by parse_duplicate_magnets)
TRACKING_ISSUE_NUMBER = 46355
# org team whose members' issues are skipped (see should_skip)
STAFF_TEAM_SLUG = "staff"

# area prefixes to collapse in taxonomy (show summary instead of all sub-labels)
PREFIXES_TO_COLLAPSE = ["languages", "parity", "tooling"]

# stopwords to filter from title keyword searches (short words handled by len > 2 filter)
STOPWORDS = {
    "after", "all", "also", "and", "any", "but", "can't", "does", "doesn't",
    "don't", "for", "from", "have", "just", "not", "only", "some", "that",
    "the", "this", "when", "while", "with", "won't", "work", "working", "zed",
}
50
51
def log(message):
    """Emit a diagnostic line on stderr; stdout stays reserved for JSON output."""
    sys.stderr.write(f"{message}\n")
55
56
def github_api_get(path, params=None):
    """Fetch JSON from the GitHub API.

    Args:
        path: API path, with or without a leading slash.
        params: optional dict of query-string parameters.

    Returns:
        The decoded JSON response body.

    Raises:
        requests.HTTPError: on a non-2xx status.
        requests.Timeout: if the server does not respond within the timeout.
    """
    url = f"{GITHUB_API}/{path.lstrip('/')}"
    # a timeout keeps a stalled connection from hanging the CI job forever
    response = requests.get(url, headers=GITHUB_HEADERS, params=params, timeout=30)
    response.raise_for_status()
    return response.json()
63
64
def github_search_issues(query, per_page=15):
    """Search issues, returning most recently created first."""
    search_params = {
        "q": query,
        "sort": "created",
        "order": "desc",
        "per_page": per_page,
    }
    result = github_api_get("/search/issues", search_params)
    return result.get("items", [])
69
70
def check_team_membership(org, team_slug, username):
    """Check if user is an active member of a team.

    A 404 from the membership endpoint means "not a member" and is treated
    as False rather than an error; any other HTTP failure propagates.
    """
    membership_path = f"/orgs/{org}/teams/{team_slug}/memberships/{username}"
    try:
        membership = github_api_get(membership_path)
    except requests.HTTPError as err:
        if err.response.status_code == 404:
            return False
        raise
    return membership.get("state") == "active"
80
81
def post_comment(issue_number: int, body):
    """Post a comment on the given issue.

    Args:
        issue_number: issue to comment on.
        body: markdown comment body.

    Raises:
        requests.HTTPError: on a non-2xx status.
        requests.Timeout: if the server does not respond within the timeout.
    """
    url = f"{GITHUB_API.rstrip('/')}/repos/{REPO_OWNER}/{REPO_NAME}/issues/{issue_number}/comments"
    # timeout prevents the workflow from hanging on a stalled connection
    response = requests.post(url, headers=GITHUB_HEADERS, json={"body": body}, timeout=30)
    response.raise_for_status()
    log(f" Posted comment on #{issue_number}")
87
88
def build_duplicate_comment(matches):
    """Render the markdown comment body listing potential duplicates."""
    bullets = []
    details = []
    for match in matches:
        bullets.append(f"- #{match['number']}")
        details.append(f"**#{match['number']}:** {match['explanation']}")

    match_list = "\n".join(bullets)
    explanations = "\n\n".join(details)

    return f"""This issue appears to be a duplicate of:

{match_list}

**If this is indeed a duplicate:**
Please close this issue and subscribe to the linked issue for updates (select "Close as not planned" → "Duplicate")

**If this is a different issue:**
No action needed. A maintainer will review this shortly.

<details>
<summary>Why were these issues selected?</summary>

{explanations}

</details>

---
<sub>This is an automated analysis and might be incorrect.</sub>"""
113
114
def call_claude(api_key, system, user_content, max_tokens=1024):
    """Send a message to Claude and return the text response.

    Args:
        api_key: Anthropic API key.
        system: system prompt.
        user_content: user message content.
        max_tokens: generation cap passed through to the API.

    Returns:
        The text of the first content block, or "" when the response has no
        text content.

    Raises:
        requests.HTTPError: on a non-2xx status.
        requests.Timeout: if the API does not respond within the timeout.
    """
    response = requests.post(
        "https://api.anthropic.com/v1/messages",
        headers={
            "x-api-key": api_key,
            "anthropic-version": "2023-06-01",
            "content-type": "application/json",
        },
        json={
            "model": "claude-sonnet-4-20250514",
            "max_tokens": max_tokens,
            "temperature": 0.0,  # deterministic output for reproducible analysis
            "system": system,
            "messages": [{"role": "user", "content": user_content}],
        },
        # generation can be slow, but a stalled connection should not hang the job
        timeout=120,
    )
    response.raise_for_status()
    data = response.json()

    usage = data.get("usage", {})
    log(f" Token usage - Input: {usage.get('input_tokens', 'N/A')}, Output: {usage.get('output_tokens', 'N/A')}")

    content = data.get("content", [])
    if content and content[0].get("type") == "text":
        return content[0].get("text") or ""
    return ""
142
143
def fetch_issue(issue_number: int):
    """Fetch an issue from GitHub and normalize it into a plain dict."""
    log(f"Fetching issue #{issue_number}")

    raw = github_api_get(f"/repos/{REPO_OWNER}/{REPO_NAME}/issues/{issue_number}")
    user = raw.get("user") or {}
    type_info = raw.get("type") or {}
    issue = {
        "number": issue_number,
        "title": raw["title"],
        "body": raw.get("body") or "",
        "author": user.get("login") or "",
        "type": type_info.get("name"),
    }

    log(f" Title: {issue['title']}\n Type: {issue['type']}\n Author: {issue['author']}")
    return issue
159
160
def should_skip(issue):
    """Return True when the issue is out of scope for duplicate detection."""
    issue_type = issue["type"]
    if issue_type not in ["Bug", "Crash"]:
        log(f" Skipping: issue type '{issue_type}' is not a bug/crash report")
        return True

    author = issue["author"]
    if author and check_team_membership(REPO_OWNER, STAFF_TEAM_SLUG, author):
        log(f" Skipping: author '{author}' is a {STAFF_TEAM_SLUG} member")
        return True

    return False
172
173
def fetch_area_labels():
    """Fetch area:* labels from the repository. Returns list of {name, description} dicts."""
    log("Fetching area labels")

    all_labels = []
    page = 1
    while True:
        batch = github_api_get(
            f"/repos/{REPO_OWNER}/{REPO_NAME}/labels",
            params={"per_page": 100, "page": page},
        )
        if not batch:
            break
        all_labels.extend(batch)
        page += 1

    area_labels = []
    for label in all_labels:
        full_name = label["name"]
        if not full_name.startswith("area:"):
            continue
        area_labels.append({
            "name": full_name[5:],  # drop the "area:" prefix
            "description": label.get("description") or "",
        })

    log(f" Found {len(area_labels)} area labels")
    return area_labels
196
197
def format_taxonomy_for_claude(area_labels):
    """Format area labels into a string for Claude, collapsing certain prefixes."""
    entries = set()

    for area in area_labels:
        name = area["name"]

        matched_prefix = None
        for prefix in PREFIXES_TO_COLLAPSE:
            if name.startswith(f"{prefix}/"):
                matched_prefix = prefix
                break

        if matched_prefix is not None:
            # summarize the whole family instead of listing every sub-label
            entries.add(f"- {matched_prefix}/* (multiple specific sub-labels exist)")
        elif area["description"]:
            entries.add(f"- {name}: {area['description']}")
        else:
            entries.add(f"- {name}")

    return "\n".join(sorted(entries))
214
215
def detect_areas(anthropic_key, issue, taxonomy):
    """Ask Claude which area labels apply to the issue; returns a list of names."""
    log("Detecting areas with Claude")

    system_prompt = """You analyze GitHub issues to identify which area labels apply.

Given an issue and a taxonomy of areas, output ONLY a comma-separated list of matching area names.
- Output at most 3 areas, ranked by relevance
- Use exact area names from the taxonomy
- If no areas clearly match, output: none
- For languages/*, tooling/*, or parity/*, use the specific sub-label (e.g., "languages/rust",
tooling/eslint, parity/vscode)

Example outputs:
- "editor, parity/vim"
- "ai, ai/agent panel"
- "none"
"""

    user_content = f"""## Area Taxonomy
{taxonomy}

# Issue Title
{issue['title']}

# Issue Body
{issue['body'][:4000]}"""

    answer = call_claude(anthropic_key, system_prompt, user_content, max_tokens=100).strip()
    log(f" Detected areas: {answer}")

    return [] if answer.lower() == "none" else [part.strip() for part in answer.split(",")]
250
251
def parse_duplicate_magnets():
    """Parse known duplicate magnets from tracking issue #46355.

    Returns a list of magnets sorted by duplicate count (most duplicated first).
    Magnets only have number, areas, and dupe_count — use enrich_magnets() to fetch
    title and body_preview for the ones you need.
    """
    log(f"Parsing duplicate magnets from #{TRACKING_ISSUE_NUMBER}")

    tracking = github_api_get(f"/repos/{REPO_OWNER}/{REPO_NAME}/issues/{TRACKING_ISSUE_NUMBER}")
    body_text = tracking.get("body") or ""

    # body format:
    #   ## area_name
    #   - [N dupes] https://github.com/zed-industries/zed/issues/NUMBER
    magnets = {}  # issue number -> {number, areas, dupe_count}
    current_area = None

    for raw_line in body_text.split("\n"):
        if raw_line.startswith("## "):
            current_area = raw_line[3:].strip()
            continue

        # only bullet lines with an issue URL under a known area are candidates
        if not current_area or not raw_line.startswith("-") or "/issues/" not in raw_line:
            continue

        # parse: - [N dupes] https://github.com/.../issues/NUMBER
        try:
            count = int(raw_line.split("[")[1].split()[0])
            issue_no = int(raw_line.split("/issues/")[1].split()[0].rstrip(")"))
        except (ValueError, IndexError):
            continue  # malformed line; ignore it

        # "(unlabeled)" magnets carry no areas so they match everything
        unlabeled = current_area == "(unlabeled)"

        existing = magnets.get(issue_no)
        if existing is None:
            magnets[issue_no] = {
                "number": issue_no,
                "areas": [] if unlabeled else [current_area],
                "dupe_count": count,
            }
        elif not unlabeled:
            existing["areas"].append(current_area)

    magnet_list = sorted(magnets.values(), key=lambda m: m["dupe_count"], reverse=True)
    log(f" Parsed {len(magnet_list)} duplicate magnets")
    return magnet_list
302
303
def enrich_magnets(magnets):
    """Fetch title and body_preview for magnets from the API (mutates in place)."""
    log(f" Fetching details for {len(magnets)} magnets")
    for entry in magnets:
        details = github_api_get(f"/repos/{REPO_OWNER}/{REPO_NAME}/issues/{entry['number']}")
        entry["title"] = details["title"]
        preview_source = details.get("body") or ""
        entry["body_preview"] = preview_source[:500]
311
312
def areas_match(detected, magnet_area):
    """Check if detected area matches magnet area. Matches broadly across hierarchy levels."""
    if detected == magnet_area:
        return True
    # either side may be the parent of the other (e.g. "ai" vs "ai/agent panel")
    return magnet_area.startswith(f"{detected}/") or detected.startswith(f"{magnet_area}/")
320
321
def filter_magnets_by_areas(magnets, detected_areas):
    """Keep only magnets whose areas overlap the detected areas."""
    if not detected_areas:
        # nothing detected: no basis to filter, keep every magnet
        return magnets

    def is_relevant(magnet):
        magnet_areas = magnet["areas"]
        # unlabeled magnets (empty areas) match everything
        if not magnet_areas:
            return True
        for detected in detected_areas:
            for magnet_area in magnet_areas:
                if areas_match(detected, magnet_area):
                    return True
        return False

    return [magnet for magnet in magnets if is_relevant(magnet)]
340
341
def search_for_similar_issues(issue, detected_areas, max_searches=6):
    """Search for similar issues that might be duplicates.

    Searches by title keywords, area labels (last 60 days), and error patterns.

    Args:
        issue: normalized issue dict from fetch_issue().
        detected_areas: area names from detect_areas().
        max_searches: caps the total number of queries to keep token usage and
            context size under control.

    Returns:
        List of candidate dicts (number, title, state, created_at,
        body_preview, source), deduplicated across searches.
    """
    log("Searching for similar issues")

    # NOTE(review): datetime.now() is the runner's local time, so the 60-day
    # window can be off by up to a day depending on the CI timezone — harmless
    # for a fuzzy search window.
    sixty_days_ago = (datetime.now() - timedelta(days=60)).strftime("%Y-%m-%d")
    base_query = f"repo:{REPO_OWNER}/{REPO_NAME} is:issue is:open"
    seen_issues = {}
    queries = []

    title_keywords = [word for word in issue["title"].split() if word.lower() not in STOPWORDS and len(word) > 2]

    if title_keywords:
        keywords_query = " ".join(title_keywords)
        queries.append(("title_keywords", f"{base_query} {keywords_query}"))

    for area in detected_areas:
        queries.append(("area_label", f'{base_query} label:"area:{area}" created:>{sixty_days_ago}'))

    # error pattern search: capture 5–90 chars after keyword, colon optional
    error_pattern = r"(?i:\b(?:error|panicked|panic|failed)\b)\s*([^\n]{5,90})"
    match = re.search(error_pattern, issue["body"])
    if match:
        # strip embedded double quotes: they would terminate the quoted
        # in:body phrase early and corrupt the search query
        error_snippet = match.group(1).strip().replace('"', "")
        if error_snippet:
            queries.append(("error_pattern", f'{base_query} in:body "{error_snippet}"'))

    for search_type, query in queries[:max_searches]:
        log(f" Search ({search_type}): {query}")
        try:
            results = github_search_issues(query, per_page=15)
            for item in results:
                number = item["number"]
                # skip the issue under analysis and anything already collected
                if number != issue["number"] and number not in seen_issues:
                    body = item.get("body") or ""
                    seen_issues[number] = {
                        "number": number,
                        "title": item["title"],
                        "state": item.get("state", ""),
                        "created_at": item.get("created_at", ""),
                        "body_preview": body[:500],
                        "source": search_type,
                    }
        except requests.RequestException as e:
            # best-effort: a failed search should not abort the other queries
            log(f" Search failed: {e}")

    similar_issues = list(seen_issues.values())
    log(f" Found {len(similar_issues)} similar issues")
    return similar_issues
393
394
def analyze_duplicates(anthropic_key, issue, magnets, search_results):
    """Use Claude to analyze potential duplicates.

    Combines the top known duplicate magnets with search results (deduplicated
    against the magnets), asks Claude to compare them against the new issue,
    and parses the JSON verdict.

    Returns:
        (matches, summary): matches is a list of {number, confidence,
        explanation} dicts; summary is a one-line human-readable string.
    """
    log("Analyzing duplicates with Claude")

    top_magnets = magnets[:10]
    enrich_magnets(top_magnets)
    magnet_numbers = {m["number"] for m in top_magnets}

    candidates = [
        {"number": m["number"], "title": m["title"], "body_preview": m["body_preview"], "source": "known_duplicate_magnet"}
        for m in top_magnets
    ] + [
        {"number": r["number"], "title": r["title"], "body_preview": r["body_preview"], "source": "search_result"}
        for r in search_results[:10]
        if r["number"] not in magnet_numbers
    ]

    if not candidates:
        return [], "No candidates to analyze"

    system_prompt = """You analyze GitHub issues to identify potential duplicates.

Given a new issue and a list of existing issues, identify which existing issues might be duplicates.

For each potential duplicate, assess confidence:
- "high": Very likely the same issue (same root cause, same symptoms)
- "medium": Possibly related (likely to be the same root cause)
- Do NOT include tangentially related issues (same general area but probably different issues)

Output only valid JSON (no markdown code blocks) with this structure:
{
  "matches": [
    {
      "number": 12345,
      "confidence": "high|medium",
      "explanation": "Brief explanation of why this might be a duplicate"
    }
  ],
  "summary": "One sentence summary of findings"
}

Only include matches with "high" or "medium" confidence. Return empty matches array if none found."""

    user_content = f"""## New Issue #{issue['number']}
**Title:** {issue['title']}

**Body:**
{issue['body'][:3000]}

## Existing Issues to Compare
{json.dumps(candidates, indent=2)}"""

    response = call_claude(anthropic_key, system_prompt, user_content, max_tokens=2048)

    # Despite the "no markdown code blocks" instruction, models occasionally
    # wrap the JSON in ``` fences; strip them before parsing.
    cleaned = response.strip()
    if cleaned.startswith("```"):
        newline_at = cleaned.find("\n")
        cleaned = cleaned[newline_at + 1:] if newline_at != -1 else ""
        if cleaned.rstrip().endswith("```"):
            cleaned = cleaned.rstrip()[:-3]

    try:
        data = json.loads(cleaned)
    except json.JSONDecodeError as e:
        log(f" Failed to parse response: {e}")
        log(f" Raw response: {response}")
        return [], "Failed to parse analysis"

    matches = data.get("matches", [])
    summary = data.get("summary", "Analysis complete")
    log(f" Found {len(matches)} potential matches")
    return matches, summary
460
461
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Identify potential duplicate issues")
    parser.add_argument("issue_number", type=int, help="Issue number to analyze")
    parser.add_argument("--dry-run", action="store_true", help="Skip posting comment, just log what would be posted")
    args = parser.parse_args()

    github_token = os.environ.get("GITHUB_TOKEN")
    anthropic_key = os.environ.get("ANTHROPIC_API_KEY")

    if not github_token:
        log("Error: GITHUB_TOKEN not set")
        sys.exit(1)
    if not anthropic_key:
        log("Error: ANTHROPIC_API_KEY not set")
        sys.exit(1)

    # NOTE: the module-level API helpers read this as a global, so it only
    # exists when the file runs as a script; importing the module and calling
    # those helpers directly would raise NameError.
    GITHUB_HEADERS = {
        "Authorization": f"Bearer {github_token}",
        "Accept": "application/vnd.github+json",
        "X-GitHub-Api-Version": "2022-11-28",
    }

    issue = fetch_issue(args.issue_number)
    if should_skip(issue):
        print(json.dumps({"skipped": True}))
        sys.exit(0)

    # detect areas
    taxonomy = format_taxonomy_for_claude(fetch_area_labels())
    detected_areas = detect_areas(anthropic_key, issue, taxonomy)

    # search for potential duplicates
    all_magnets = parse_duplicate_magnets()
    relevant_magnets = filter_magnets_by_areas(all_magnets, detected_areas)
    search_results = search_for_similar_issues(issue, detected_areas)

    # analyze potential duplicates
    if relevant_magnets or search_results:
        matches, summary = analyze_duplicates(anthropic_key, issue, relevant_magnets, search_results)
    else:
        matches, summary = [], "No potential duplicates to analyze"

    # post comment if high-confidence matches found
    # (.get() guards against a model response that omits the "confidence" key)
    high_confidence_matches = [m for m in matches if m.get("confidence") == "high"]
    commented = False

    if high_confidence_matches:
        comment_body = build_duplicate_comment(high_confidence_matches)
        if args.dry_run:
            log("Dry run - would post comment:\n" + "-" * 40 + "\n" + comment_body + "\n" + "-" * 40)
        else:
            log("Posting comment for high-confidence match(es)")
            try:
                post_comment(issue["number"], comment_body)
                commented = True
            except requests.RequestException as e:
                # commenting is best-effort; still emit the JSON result below
                log(f" Failed to post comment: {e}")

    # machine-readable result on stdout (all logging goes to stderr via log())
    print(json.dumps({
        "skipped": False,
        "issue": {
            "number": issue["number"],
            "title": issue["title"],
            "author": issue["author"],
            "type": issue["type"],
        },
        "detected_areas": detected_areas,
        "magnets_count": len(relevant_magnets),
        "search_results_count": len(search_results),
        "matches": matches,
        "summary": summary,
        "commented": commented,
    }))