1#!/usr/bin/env python3
2"""
3Find open issues that have the most duplicates filed against them and update
4a GitHub issue with the results.
5
6Queries open issues and looks for MarkedAsDuplicateEvent in their timelines.
7Only includes issues that have been re-reported at least twice (2+ duplicates
8closed against them). Groups results by area: label. The output is formatted
9as markdown with issue URLs (GitHub renders the titles automatically).
10
11This script is run regularly by the update_duplicate_magnets.yml workflow.
12
13Requires: requests (pip install requests)
14GitHub token permissions: issues:write
15
16Usage:
17 # Print to stdout only for testing:
18 python github-find-top-duplicated-bugs.py --github-token ghp_xxx
19
20 # Update a GitHub issue:
21 python github-find-top-duplicated-bugs.py --github-token ghp_xxx --issue-number 46355
22"""
23
24import argparse
25import os
26import sys
27from collections import Counter, defaultdict
28
29import requests
30
# Repository whose issues are scanned and updated.
OWNER = "zed-industries"
REPO = "zed"

GRAPHQL_URL = "https://api.github.com/graphql"
REST_API_URL = "https://api.github.com"

# Request headers shared by all HTTP helpers below; populated in __main__
# once the token is known.
headers = None

# Pages through open issues (most recently updated first). For each issue we
# pull up to 100 MarkedAsDuplicateEvent timeline items so we can count how
# many other issues were marked as duplicates of it. The inline fragments
# are required because `duplicate` is a union (Issue | PullRequest).
ISSUES_WITH_DUPLICATES_QUERY = """
query($owner: String!, $repo: String!, $cursor: String) {
  repository(owner: $owner, name: $repo) {
    issues(
      first: 100
      after: $cursor
      states: [OPEN]
      orderBy: {field: UPDATED_AT, direction: DESC}
    ) {
      pageInfo {
        hasNextPage
        endCursor
      }
      nodes {
        number
        url
        labels(first: 20) {
          nodes {
            name
          }
        }
        timelineItems(first: 100, itemTypes: [MARKED_AS_DUPLICATE_EVENT]) {
          nodes {
            ... on MarkedAsDuplicateEvent {
              duplicate {
                ... on Issue {
                  number
                  state
                }
              }
            }
          }
        }
      }
    }
  }
}
"""
77
78
def extract_duplicate_info(issue):
    """Summarize the duplicates closed against a single GraphQL issue node.

    Counts the distinct CLOSED issues that were marked as duplicates of
    this issue. Returns None if fewer than 2 such duplicates exist;
    otherwise a dict with keys "number", "url", "areas" (the issue's
    "area:" label suffixes, or ["(unlabeled)"]) and "duplicate_count".
    """
    seen_duplicates = set()
    for event in issue["timelineItems"]["nodes"]:
        try:
            # Only count duplicates that stayed closed; an OPEN duplicate
            # was likely marked by mistake or has been reopened.
            if event["duplicate"]["state"] == "CLOSED":
                seen_duplicates.add(event["duplicate"]["number"])
        except (KeyError, TypeError):
            # The duplicate may be a PullRequest (the inline fragment only
            # selects Issue fields) or the reference may be null; skip it.
            continue

    if len(seen_duplicates) < 2:
        return None

    labels = [label["name"] for label in issue["labels"]["nodes"]]
    # Strip only the leading "area:" prefix. (str.replace would also mangle
    # any later occurrence of "area:" inside a label name.)
    prefix = "area:"
    areas = [label[len(prefix):] for label in labels if label.startswith(prefix)]

    return {
        "number": issue["number"],
        "url": issue["url"],
        "areas": areas if areas else ["(unlabeled)"],
        "duplicate_count": len(seen_duplicates),
    }
101
102
def fetch_canonical_issues_with_duplicates(max_pages=100):
    """Fetch open issues and count how many closed duplicates point to each.

    Pages through the repository's open issues (newest-updated first) via
    the GraphQL API, scanning at most `max_pages` pages of 100 issues, and
    collects the summaries produced by extract_duplicate_info().

    Returns a list of summary dicts for issues with 2+ closed duplicates.
    Raises requests.HTTPError on a non-2xx response.
    """
    print(f"Finding open issues with the most duplicates in {OWNER}/{REPO}")

    cursor = None
    duplicate_magnets = []
    total_issues_scanned = 0

    for page in range(max_pages):
        response = requests.post(
            GRAPHQL_URL,
            headers=headers,
            json={
                "query": ISSUES_WITH_DUPLICATES_QUERY,
                "variables": {"owner": OWNER, "repo": REPO, "cursor": cursor},
            },
            # requests has no default timeout; without one a stalled
            # connection would hang the CI workflow indefinitely.
            timeout=30,
        )
        response.raise_for_status()
        data = response.json()

        # GraphQL reports errors in the body alongside HTTP 200. Keep
        # whatever was collected so far (best-effort) instead of failing.
        if "errors" in data:
            print(f"GraphQL errors: {data['errors']}")
            break

        issues = data["data"]["repository"]["issues"]
        total_issues_scanned += len(issues["nodes"])

        for issue in issues["nodes"]:
            if info := extract_duplicate_info(issue):
                duplicate_magnets.append(info)

        page_info = issues["pageInfo"]
        if not page_info["hasNextPage"]:
            print(f"Done: scanned {total_issues_scanned} open issues")
            break
        cursor = page_info["endCursor"]

        print(
            f"Page {page + 1}: scanned {total_issues_scanned} open issues, "
            f"{len(duplicate_magnets)} have duplicates"
        )

    return duplicate_magnets
146
147
def build_markdown_body(duplicate_magnets):
    """Group results by area and render the markdown body for the issue.

    Areas are ordered by their total duplicate count (descending); within
    each area, issues are ordered by their own duplicate count.

    NOTE: the output format is parsed by fetch_duplicate_magnets() in
    github-check-new-issue-for-duplicates.py — update that if you change this.
    """
    grouped = defaultdict(list)
    area_weight = Counter()
    for entry in duplicate_magnets:
        for area in entry["areas"]:
            grouped[area].append(entry)
            area_weight[area] += entry["duplicate_count"]

    parts = [
        "These are the issues that are frequently re-reported. "
        "The list is generated regularly by running a script."
    ]

    for area, _total in area_weight.most_common():
        parts.extend(["", f"## {area}", ""])
        ranked = sorted(
            grouped[area],
            key=lambda entry: entry["duplicate_count"],
            reverse=True,
        )
        parts.extend(
            f"- [{entry['duplicate_count']:2d} dupes] {entry['url']}"
            for entry in ranked
        )

    return "\n".join(parts)
179
180
def update_github_issue(issue_number, body):
    """Replace the body of GitHub issue #issue_number via the REST API.

    Raises requests.HTTPError if the PATCH request fails.
    """
    url = f"{REST_API_URL}/repos/{OWNER}/{REPO}/issues/{issue_number}"
    # requests has no default timeout; without one a stalled connection
    # would hang the CI workflow indefinitely.
    response = requests.patch(url, headers=headers, json={"body": body}, timeout=30)
    response.raise_for_status()
    print(f"Updated issue #{issue_number}")
187
188
def parse_args():
    """Parse command-line options.

    --github-token falls back to the GITHUB_TOKEN environment variable;
    --issue-number is optional (omitting it means print-only mode).
    """
    cli = argparse.ArgumentParser(
        description="Find open issues with the most duplicates filed against them."
    )
    token_default = os.environ.get("GITHUB_TOKEN")
    cli.add_argument(
        "--github-token",
        default=token_default,
        help="GitHub token (or set GITHUB_TOKEN env var)",
    )
    cli.add_argument(
        "--issue-number",
        type=int,
        help="GitHub issue number to update (if not provided, prints to stdout)",
    )
    return cli.parse_args()
204
205
if __name__ == "__main__":
    args = parse_args()

    # A token is required even for print-only runs: the GraphQL endpoint
    # rejects unauthenticated requests.
    if not args.github_token:
        print("Error: --github-token is required (or set GITHUB_TOKEN env var)")
        sys.exit(1)

    # This rebinds the module-level `headers` global that the request
    # helpers above read — it must stay at module scope, not inside a main().
    headers = {
        "Authorization": f"Bearer {args.github_token}",
        "Content-Type": "application/json",
    }

    # With --issue-number, publish the report; otherwise dry-run to stdout.
    # If no duplicate magnets were found (or the scan errored out early),
    # do nothing rather than overwrite the issue with an empty report.
    if duplicate_magnets := fetch_canonical_issues_with_duplicates():
        body = build_markdown_body(duplicate_magnets)
        if args.issue_number:
            update_github_issue(args.issue_number, body)
        else:
            print(body)