1#!/usr/bin/env python3
2"""
3Track the effectiveness of the duplicate-detection bot by classifying issues
4into outcome categories on a GitHub Projects v2 board.
5
6Subcommands:
7 classify-closed <issue_number> <closer_login> <state_reason>
8 Classify a closed issue and add it to the project board.
9
10 classify-open
11 Classify open, triaged, bot-commented issues and add them to
12 the project board as Noise.
13
14Requires:
15 requests (pip install requests)
16
17Environment variables:
18 GITHUB_TOKEN - GitHub App token
19 PROJECT_NUMBER - GitHub Projects v2 board number (default: 76, override for local testing)
20"""
21
22import argparse
23import functools
24import os
25import re
26import sys
27
28import requests
29
# GitHub endpoints; the GraphQL endpoint lives under the REST base URL.
GITHUB_API = "https://api.github.com"
GRAPHQL_URL = f"{GITHUB_API}/graphql"

# Repository whose issues are classified, and the team slug used for staff checks.
REPO_OWNER = "zed-industries"
REPO_NAME = "zed"
STAFF_TEAM_SLUG = "staff"

# Identity of the duplicate-detection bot: the app slug is used in search
# queries, the "[bot]" login is what appears as a comment author.
BOT_APP_SLUG = "zed-community-bot"
BOT_LOGIN = f"{BOT_APP_SLUG}[bot]"
# Opening words that identify a bot comment as a duplicate suggestion.
BOT_COMMENT_PREFIX = "This issue appears to be a duplicate of"
# Lower bound on issue creation date used by the open-issue search.
BOT_START_DATE = "2026-02-18"

NEEDS_TRIAGE_LABEL = "state:needs triage"
DEFAULT_PROJECT_NUMBER = 76
# state_reason values that are written to the board's "Closed as" field.
VALID_CLOSED_AS_VALUES = {"completed", "duplicate", "not_planned"}
42
43
def github_api_get(path, params=None, *, timeout=30):
    """Send an authenticated GET to the GitHub REST API and return the decoded JSON.

    Args:
        path: API path relative to GITHUB_API; a leading slash is optional.
        params: Optional query-string parameters.
        timeout: Seconds to wait for the server before giving up. requests
            applies no timeout by default, so without this a stalled
            connection would block the script indefinitely.

    Returns:
        The parsed JSON body of the response.

    Raises:
        requests.HTTPError: If the response status is 4xx or 5xx.
        requests.Timeout: If the server does not answer within `timeout`.
    """
    url = f"{GITHUB_API}/{path.lstrip('/')}"
    response = requests.get(
        url,
        headers=GITHUB_HEADERS,
        params=params,
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()
49
50
def github_search_issues(query):
    """Run an issue search and return the first page of hits, newest first.

    At most 100 items are returned; a response without an "items" key
    yields an empty list.
    """
    # Pagination is skipped on purpose: the oldest issues are on the board already.
    search_params = {
        "q": query,
        "sort": "created",
        "order": "desc",
        "per_page": 100,
    }
    payload = github_api_get("/search/issues", search_params)
    return payload.get("items", [])
56
57
def is_staff_member(username):
    """Check if user is an active member of the staff team.

    Args:
        username: GitHub login to look up. An empty or missing login (this
            file substitutes "" when an issue has no user object) is treated
            as "not staff" without calling the API.

    Returns:
        True only when the team membership exists and its state is "active";
        False when the login is empty or the API answers 404.

    Raises:
        requests.HTTPError: For any API failure other than 404.
    """
    if not username:
        # Nothing to look up; this also avoids requesting the malformed
        # ".../memberships/" URL and depending on it returning 404.
        return False
    try:
        data = github_api_get(
            f"/orgs/{REPO_OWNER}/teams/{STAFF_TEAM_SLUG}/memberships/{username}"
        )
        return data.get("state") == "active"
    except requests.HTTPError as error:
        # 404 means "no membership"; anything else is a real failure.
        if error.response.status_code == 404:
            return False
        raise
69
70
def fetch_issue(issue_number):
    """Fetch an issue over REST and return the few fields this script needs.

    Returns a dict with keys "number", "node_id", "author" (empty string
    when the issue has no user object) and "type_name" (None when the issue
    has no type).
    """
    payload = github_api_get(f"/repos/{REPO_OWNER}/{REPO_NAME}/issues/{issue_number}")
    node_id = payload["node_id"]
    user = payload.get("user") or {}
    issue_type = payload.get("type") or {}
    return {
        "number": issue_number,
        "node_id": node_id,
        "author": user.get("login", ""),
        "type_name": issue_type.get("name"),
    }
79
80
def get_bot_duplicate_comment(issue_number):
    """Get the bot's duplicate-detection comment body from an issue.

    Walks the issue's comments page by page and returns the body of the
    first comment authored by BOT_LOGIN that starts with BOT_COMMENT_PREFIX.

    Returns the comment body if found, else None.
    """
    comments_path = f"/repos/{REPO_OWNER}/{REPO_NAME}/issues/{issue_number}/comments"
    per_page = 100
    page = 1
    while True:
        comments = github_api_get(comments_path, {"per_page": per_page, "page": page})
        for comment in comments:
            author = (comment.get("user") or {}).get("login", "")
            # `or ""` also covers an explicit null body, mirroring the
            # null-safe handling of `user` above.
            body = comment.get("body") or ""
            if author == BOT_LOGIN and body.startswith(BOT_COMMENT_PREFIX):
                return body
        # A short (or empty) page is the last one, so stop here instead of
        # spending another request on a page that is known to be empty.
        if len(comments) < per_page:
            return None
        page += 1
96
97
def parse_suggested_issues(comment_body):
    """Return the issue numbers listed in the bot's comment, in order of appearance.

    Only lines that begin with '- #<digits>' (for example '- #12345') count.
    """
    bullet_pattern = re.compile(r"^- #(\d+)", re.MULTILINE)
    return [int(hit.group(1)) for hit in bullet_pattern.finditer(comment_body)]
101
102
def github_api_graphql(query, variables=None, *, timeout=30):
    """Execute a GitHub GraphQL query and return its "data" payload.

    Args:
        query: GraphQL query or mutation text.
        variables: Optional mapping of GraphQL variables; sent as {} when omitted.
        timeout: Seconds to wait for the server before giving up. requests
            applies no timeout by default, so without this a stalled
            connection would block the script indefinitely.

    Returns:
        The value of the "data" key of the response.

    Raises:
        requests.HTTPError: If the HTTP status is 4xx or 5xx.
        requests.Timeout: If the server does not answer within `timeout`.
        RuntimeError: If the response body contains an "errors" key.
    """
    response = requests.post(
        GRAPHQL_URL,
        headers=GITHUB_HEADERS,
        json={"query": query, "variables": variables or {}},
        timeout=timeout,
    )
    response.raise_for_status()
    data = response.json()
    # GraphQL reports query-level failures inside a successful HTTP response.
    if "errors" in data:
        raise RuntimeError(f"GraphQL errors: {data['errors']}")
    return data["data"]
115
116
def get_closed_as_duplicate_of(issue_number):
    """Return the number of the issue this one was marked a duplicate of, or None.

    Requests the last 10 MarkedAsDuplicateEvent timeline items and scans
    them newest-first, returning the first canonical issue number found.

    Note: a "closed as duplicate" issue does not always carry a
    MarkedAsDuplicateEvent. When the closer used the "Close as duplicate"
    button without separately marking the duplicate relationship, no event
    exists and the result is None; the caller then flags the item for
    manual review.
    """
    timeline_query = """
        query($owner: String!, $repo: String!, $number: Int!) {
          repository(owner: $owner, name: $repo) {
            issue(number: $number) {
              timelineItems(last: 10, itemTypes: [MARKED_AS_DUPLICATE_EVENT]) {
                nodes {
                  ... on MarkedAsDuplicateEvent {
                    canonical { ... on Issue { number } }
                  }
                }
              }
            }
          }
        }
        """
    variables = {"owner": REPO_OWNER, "repo": REPO_NAME, "number": issue_number}
    result = github_api_graphql(timeline_query, variables)
    events = result["repository"]["issue"]["timelineItems"]["nodes"]
    # Newest event first; skip events whose canonical is missing or has no number.
    return next(
        (
            canonical_number
            for event in reversed(events)
            if (canonical_number := (event.get("canonical") or {}).get("number"))
        ),
        None,
    )
151
152
@functools.lru_cache
def get_project_config():
    """Load the project board's ID plus the IDs of its fields and options.

    The result is cached, so the board is queried at most once per run.

    Returns:
        {"project_id": <id>, "fields": {<field name>: {"id": <field id>,
        "options": {<option name>: <option id>}}}}; the "options" entry is
        present only for fields that reported options (single-select fields).
    """
    board_query = """
        query($org: String!, $number: Int!) {
          organization(login: $org) {
            projectV2(number: $number) {
              id
              fields(first: 30) {
                nodes {
                  ... on ProjectV2SingleSelectField { id name options { id name } }
                  ... on ProjectV2Field { id name }
                }
              }
            }
          }
        }
        """
    result = github_api_graphql(board_query, {"org": REPO_OWNER, "number": PROJECT_NUMBER})
    project = result["organization"]["projectV2"]
    project_id = project["id"]

    fields = {}
    for node in project["fields"]["nodes"]:
        field_name = node.get("name")
        if field_name:
            entry = {"id": node["id"]}
            if "options" in node:
                entry["options"] = {opt["name"]: opt["id"] for opt in node["options"]}
            fields[field_name] = entry
        # Nodes without a name (field kinds the query has no fragment for) are ignored.

    print(f" Project config loaded: {len(fields)} fields")
    return {"project_id": project_id, "fields": fields}
190
191
def find_project_item(issue_node_id):
    """Look up the issue's item on the configured project board.

    Inspects the first 20 project items attached to the issue and returns
    the ID of the one whose project number equals PROJECT_NUMBER, or None
    when there is no such item.
    """
    lookup_query = (
        "query($id: ID!) { node(id: $id) { ... on Issue { "
        "projectItems(first: 20) { nodes { id project { number } } } } } }"
    )
    result = github_api_graphql(lookup_query, {"id": issue_node_id})
    board_items = result["node"]["projectItems"]["nodes"]
    return next(
        (
            entry["id"]
            for entry in board_items
            if entry["project"]["number"] == PROJECT_NUMBER
        ),
        None,
    )
205
206
def add_project_item(issue_node_id):
    """Attach the issue to the project board and return the created item's ID."""
    add_mutation = """
        mutation($projectId: ID!, $contentId: ID!) {
          addProjectV2ItemById(input: {projectId: $projectId, contentId: $contentId}) {
            item { id }
          }
        }
        """
    project_id = get_project_config()["project_id"]
    variables = {"projectId": project_id, "contentId": issue_node_id}
    result = github_api_graphql(add_mutation, variables)
    return result["addProjectV2ItemById"]["item"]["id"]
221
222
def set_field_value(item_id, field_name, value):
    """Write one field value onto a project board item.

    Fields that carry options are treated as single-select and `value` is
    matched against the option names; every other field is written as text
    using str(value). An unknown field or unknown option only prints a
    warning and leaves the item untouched.
    """
    board = get_project_config()
    field = board["fields"].get(field_name)
    if not field:
        print(f" Warning: field '{field_name}' not found on project board")
        return

    if "options" not in field:
        # Plain text field.
        payload = {"text": str(value)}
    else:
        # Single-select field: translate the option name into its ID.
        option_id = field["options"].get(value)
        if not option_id:
            print(f" Warning: option '{value}' not found for field '{field_name}'")
            return
        payload = {"singleSelectOptionId": option_id}

    update_mutation = """
        mutation($projectId: ID!, $itemId: ID!, $fieldId: ID!, $value: ProjectV2FieldValue!) {
          updateProjectV2ItemFieldValue(input: {
            projectId: $projectId
            itemId: $itemId
            fieldId: $fieldId
            value: $value
          }) {
            projectV2Item { id }
          }
        }
        """
    variables = {
        "projectId": board["project_id"],
        "itemId": item_id,
        "fieldId": field["id"],
        "value": payload,
    }
    github_api_graphql(update_mutation, variables)
262
263
def add_or_update_project_item(issue_node_id, outcome, closed_as=None, status="Auto-classified", notes=None):
    """Ensure the issue is on the project board and write its classification fields.

    "Outcome" and "Status" are always written. "Closed as" is written only
    when `closed_as` is one of VALID_CLOSED_AS_VALUES, and "Notes" only when
    `notes` is non-empty; fields that are not written keep whatever value an
    existing item already had.

    Returns the project item ID.
    """
    existing_id = find_project_item(issue_node_id)
    if existing_id:
        item_id = existing_id
        print(f" Issue already on board, updating (item {item_id})")
    else:
        item_id = add_project_item(issue_node_id)
        print(f" Added to project board (item {item_id})")

    # Collect the writes first, then apply them in a fixed order.
    field_updates = [("Outcome", outcome), ("Status", status)]
    if closed_as and closed_as in VALID_CLOSED_AS_VALUES:
        field_updates.append(("Closed as", closed_as))
    if notes:
        field_updates.append(("Notes", notes))

    for field_name, field_value in field_updates:
        set_field_value(item_id, field_name, field_value)

    return item_id
283
284
def classify_closed(issue_number, closer_login, state_reason):
    """Work out the outcome category of a closed issue and record it on the board.

    Issues authored by staff are skipped. Otherwise the outcome depends on
    whether the bot left a duplicate comment, whether the author closed the
    issue themselves, and whether it was closed as a duplicate. An empty
    `state_reason` is treated as "unknown".
    """
    state_reason = state_reason or "unknown"
    print(f"Classifying closed issue #{issue_number}")
    print(f" Closer: {closer_login}, state_reason: {state_reason}")

    issue = fetch_issue(issue_number)
    author = issue["author"]
    print(f" Author: {author}, type: {issue['type_name']}")

    if is_staff_member(author):
        print(f" Skipping: author '{author}' is a staff member")
        return

    bot_comment = get_bot_duplicate_comment(issue_number)
    bot_commented = bot_comment is not None
    print(f" Bot commented: {bot_commented}")

    if bot_commented:
        if closer_login == author:
            classify_as_success(issue, state_reason)
        else:
            # Only authors, staff, and triagers can close issues, so
            # a non-author closer is always someone with elevated permissions.
            classify_non_author_closed(issue, bot_comment, state_reason)
    elif state_reason == "duplicate":
        classify_as_missed_opportunity(issue)
    else:
        print(" Skipping: no bot comment and not closed as duplicate")
315
316
def classify_as_success(issue, state_reason):
    """Record a Success: the author closed their own issue after the bot commented.

    Closing as "duplicate" is auto-classified; any other reason is recorded
    as Success but flagged "Needs review" with a note naming the reason.
    """
    if state_reason == "duplicate":
        status, notes = "Auto-classified", None
        print(f" -> Success (closed as {state_reason})")
    else:
        # Could be closed for an unrelated reason; flag for review.
        notes = f"Author closed as {state_reason}"
        status = "Needs review"
        print(f" -> Possible Success, needs review ({notes})")

    add_or_update_project_item(
        issue["node_id"],
        outcome="Success",
        closed_as=state_reason,
        status=status,
        notes=notes,
    )
338
339
def classify_non_author_closed(issue, bot_comment, state_reason):
    """Handle a bot-commented issue closed by someone other than its author.

    A "duplicate" close is handed to classify_as_assist; every other reason
    is recorded as Noise and flagged "Needs review".
    """
    if state_reason == "duplicate":
        classify_as_assist(issue, bot_comment)
        return

    notes = f"Closed by staff/triager as {state_reason}, not duplicate"
    print(f" -> Possible Noise, needs review ({notes})")
    add_or_update_project_item(
        issue["node_id"],
        outcome="Noise",
        closed_as=state_reason,
        status="Needs review",
        notes=notes,
    )
354
355
def classify_as_assist(issue, bot_comment):
    """Record an Assist: a non-author closed the issue as duplicate after the bot commented.

    The item is auto-classified only when the issue it was marked a
    duplicate of is among the bot's suggestions. A mismatch, an unknown
    original, or an unparseable bot comment is flagged "Needs review" with
    an explanatory note.
    """
    suggested = parse_suggested_issues(bot_comment)
    try:
        original = get_closed_as_duplicate_of(issue["number"])
    except (requests.RequestException, RuntimeError) as error:
        # A failed lookup is treated the same as "no original found".
        print(f" Warning: failed to get the original-for the duplicate issue: {error}")
        original = None

    if not original:
        status = "Needs review"
        notes = "Could not determine original issue from timeline"
        print(f" -> Possible Assist, needs review ({notes})")
    elif not suggested:
        status = "Needs review"
        notes = f"Closed as dup of #{original}; could not parse bot suggestions"
        print(f" -> Possible Assist, needs review ({notes})")
    elif original in suggested:
        status = "Auto-classified"
        notes = None
        print(f" -> Assist (original #{original} matches bot suggestion)")
    else:
        status = "Needs review"
        suggested_str = ", ".join(f"#{candidate}" for candidate in suggested)
        notes = f"Bot suggested {suggested_str}; closed as dup of #{original}"
        print(f" -> Possible Assist, needs review ({notes})")

    add_or_update_project_item(
        issue["node_id"],
        outcome="Assist",
        closed_as="duplicate",
        status=status,
        notes=notes,
    )
386
387
def classify_as_missed_opportunity(issue):
    """Record a Missed opportunity: closed as duplicate without any bot comment."""
    print(" -> Missed opportunity")
    node_id = issue["node_id"]
    add_or_update_project_item(
        node_id,
        outcome="Missed opportunity",
        closed_as="duplicate",
        status="Auto-classified",
    )
393
394
def classify_open():
    """Sweep open, triaged, bot-commented issues and add them to the board as Noise.

    Candidates come from a single search page. An issue is skipped when its
    type is not Bug or Crash, its author is staff, it is already on the
    board, or no bot duplicate comment is found. A failure on one issue is
    counted and reported without stopping the sweep.
    """
    print("Classifying open issues")

    query = (
        f"repo:{REPO_OWNER}/{REPO_NAME} is:issue is:open "
        f"commenter:app/{BOT_APP_SLUG} "
        f'-label:"{NEEDS_TRIAGE_LABEL}" '
        f"created:>={BOT_START_DATE}"
    )
    print(f" Search query: {query}")

    candidates = github_search_issues(query)
    print(f" Found {len(candidates)} candidate issues")

    added = skipped = errors = 0
    for candidate in candidates:
        number = candidate["number"]
        try:
            type_name = (candidate.get("type") or {}).get("name")
            author = (candidate.get("user") or {}).get("login", "")
            node_id = candidate["node_id"]

            # Checks run cheapest-first; each later check costs API calls
            # and only runs when the earlier ones did not skip the issue.
            if type_name not in ("Bug", "Crash"):
                skip_reason = f"type is {type_name}"
            elif is_staff_member(author):
                skip_reason = f"author {author} is staff"
            elif find_project_item(node_id):
                skip_reason = "already on the board"
            elif not get_bot_duplicate_comment(number):
                skip_reason = "no bot duplicate comment found"
            else:
                skip_reason = None

            if skip_reason:
                print(f" #{number}: skipping, {skip_reason}")
                skipped += 1
            else:
                print(f" #{number}: adding as Noise")
                add_or_update_project_item(node_id, outcome="Noise", status="Auto-classified")
                added += 1
        except Exception as error:  # broad catch: one issue failing shouldn't stop the sweep
            print(f" #{number}: error processing issue, skipping: {error}")
            errors += 1

    print(f" Done: added {added}, skipped {skipped}, errors {errors}")
438
439
if __name__ == "__main__":
    # Command line: one subcommand per classification mode.
    parser = argparse.ArgumentParser(
        description="Track duplicate bot effectiveness on a GitHub project board.",
    )
    commands = parser.add_subparsers(dest="command", required=True)

    closed_parser = commands.add_parser(
        "classify-closed",
        help="Classify a closed issue and add it to the project board.",
    )
    closed_parser.add_argument("issue_number", type=int)
    closed_parser.add_argument("closer_login")
    closed_parser.add_argument("state_reason")

    commands.add_parser(
        "classify-open",
        help="Classify open, triaged, bot-commented issues as Noise.",
    )

    # Arguments are parsed before the environment is checked.
    args = parser.parse_args()

    # The functions above read GITHUB_HEADERS and PROJECT_NUMBER as module
    # globals, so both must be assigned here before any of them is called.
    GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "")
    if not GITHUB_TOKEN:
        print("Error: GITHUB_TOKEN environment variable is required")
        sys.exit(1)

    project_number_env = os.environ.get("PROJECT_NUMBER", "")
    if not project_number_env:
        PROJECT_NUMBER = DEFAULT_PROJECT_NUMBER
    else:
        try:
            PROJECT_NUMBER = int(project_number_env)
        except ValueError:
            print(f"Error: PROJECT_NUMBER must be an integer, got '{project_number_env}'")
            sys.exit(1)

    GITHUB_HEADERS = {
        "Authorization": f"token {GITHUB_TOKEN}",
        "Accept": "application/vnd.github+json",
    }

    if args.command == "classify-open":
        classify_open()
    elif args.command == "classify-closed":
        classify_closed(args.issue_number, args.closer_login, args.state_reason)