1#!/usr/bin/env python3
2"""
3Track the effectiveness of the duplicate-detection bot by classifying issues
4into outcome categories on a GitHub Projects v2 board.
5
6Subcommands:
7 classify-closed <issue_number> <closer_login> <state_reason>
8 Classify a closed issue and add it to the project board.
9
10 classify-open
11 Classify open, triaged, bot-commented issues and add them to
12 the project board as Noise.
13
14Requires:
15 requests (pip install requests)
16
17Environment variables:
18 GITHUB_TOKEN - GitHub App token
19 PROJECT_NUMBER - GitHub Projects v2 board number (default: 76, override for local testing)
20"""
21
22import argparse
23import functools
24import os
25import re
26import sys
27from datetime import datetime, timezone
28
29import requests
30
# Base URL for REST calls (see github_api_get) and the GraphQL endpoint
# (see github_api_graphql).
GITHUB_API = "https://api.github.com"
GRAPHQL_URL = "https://api.github.com/graphql"
# Repository whose issues are classified. REPO_OWNER is also used as the
# organization login for the team-membership and project-board lookups.
REPO_OWNER = "zed-industries"
REPO_NAME = "zed"
# Team slug queried by is_staff_member().
STAFF_TEAM_SLUG = "staff"
# BOT_LOGIN is compared against comment authors; BOT_APP_SLUG is used in the
# `commenter:app/...` search qualifier.
BOT_LOGIN = "zed-community-bot[bot]"
BOT_APP_SLUG = "zed-community-bot"
# A bot comment only counts as a duplicate-detection comment when its body
# starts with this prefix.
BOT_COMMENT_PREFIX = "This issue appears to be a duplicate of"
# Lower bound for the `created:>=` qualifier in the classify-open search.
BOT_START_DATE = "2026-02-18"
# Issues carrying this label are excluded from the classify-open search.
NEEDS_TRIAGE_LABEL = "state:needs triage"
# Board number used when the PROJECT_NUMBER environment variable is not set.
DEFAULT_PROJECT_NUMBER = 76
# Only these close reasons are written to the board's "Closed as" field.
VALID_CLOSED_AS_VALUES = {"duplicate", "not_planned", "completed"}
# Add a new tuple when you deploy a new version of the bot that you want to
# keep track of (e.g. the prompt gets a rewrite or the model gets swapped).
# Newest first, please. The datetime is the deployment time (merge to main).
BOT_VERSION_TIMELINE = [
    ("v2", datetime(2026, 2, 26, 14, 9, tzinfo=timezone.utc)),
    ("v1", datetime(2026, 2, 18, tzinfo=timezone.utc)),
]
50
51
def bot_version_for_time(date_string):
    """Map an ISO 8601 timestamp to the bot version deployed at that moment.

    A trailing "Z" is rewritten to "+00:00" before parsing.
    BOT_VERSION_TIMELINE is scanned in its listed order and the first entry
    whose deployment time is not after the timestamp wins. If no entry
    qualifies, the version of the last entry in the list is returned.
    """
    moment = datetime.fromisoformat(date_string.replace("Z", "+00:00"))
    matching_versions = (
        name for name, deployed_at in BOT_VERSION_TIMELINE if moment >= deployed_at
    )
    for name in matching_versions:
        return name
    return BOT_VERSION_TIMELINE[-1][0]
59
60
def github_api_get(path, params=None, timeout=30):
    """Send a GET request to the GitHub REST API and return the decoded JSON.

    Args:
        path: API path relative to GITHUB_API; leading slashes are stripped.
        params: Optional query-string parameters.
        timeout: Seconds to wait for the server before giving up. requests
            applies no timeout by default, so without one a stalled
            connection would hang the script indefinitely.

    Returns:
        The parsed JSON body of the response.

    Raises:
        requests.HTTPError: If the response status is 4xx or 5xx.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    url = f"{GITHUB_API}/{path.lstrip('/')}"
    response = requests.get(
        url,
        headers=GITHUB_HEADERS,
        params=params,
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()
66
67
def github_search_issues(query):
    """Run an issue search and return the first page of hits, newest first.

    Only a single page (up to 100 items) is requested. Pagination is skipped
    on purpose: the oldest issues are on the board already.
    """
    search_params = {
        "q": query,
        "sort": "created",
        "order": "desc",
        "per_page": 100,
    }
    payload = github_api_get("/search/issues", search_params)
    return payload.get("items", [])
73
74
def is_staff_member(username):
    """Report whether the user holds an active membership in the staff team.

    A 404 from the membership endpoint is treated as "not a member"; any
    other HTTP error is re-raised to the caller.
    """
    membership_path = (
        f"/orgs/{REPO_OWNER}/teams/{STAFF_TEAM_SLUG}/memberships/{username}"
    )
    try:
        membership = github_api_get(membership_path)
    except requests.HTTPError as error:
        if error.response.status_code != 404:
            raise
        return False
    return membership.get("state") == "active"
86
87
def fetch_issue(issue_number):
    """Fetch one issue over REST and return the fields the classifier needs.

    The result has the keys "number", "node_id", "author" (the login, or ""
    when the user or login is missing) and "type_name" (None when the issue
    has no type).
    """
    payload = github_api_get(f"/repos/{REPO_OWNER}/{REPO_NAME}/issues/{issue_number}")
    node_id = payload["node_id"]
    user = payload.get("user") or {}
    issue_type = payload.get("type") or {}
    return {
        "number": issue_number,
        "node_id": node_id,
        "author": user.get("login", ""),
        "type_name": issue_type.get("name"),
    }
96
97
def get_bot_comment_with_time(issue_number):
    """Find the bot's duplicate-detection comment on an issue.

    Comments are fetched page by page (100 per page) until an empty page is
    returned. The first comment authored by BOT_LOGIN whose body starts with
    BOT_COMMENT_PREFIX is used.

    Returns {"body": str, "created_at": str} if found, else None.
    """
    endpoint = f"/repos/{REPO_OWNER}/{REPO_NAME}/issues/{issue_number}/comments"
    page_number = 1
    while True:
        batch = github_api_get(endpoint, {"per_page": 100, "page": page_number})
        if not batch:
            # An empty page means every comment has been examined.
            return None
        for entry in batch:
            login = (entry.get("user") or {}).get("login", "")
            text = entry.get("body", "")
            if login == BOT_LOGIN and text.startswith(BOT_COMMENT_PREFIX):
                return {"body": text, "created_at": entry.get("created_at", "")}
        page_number += 1
113
114
def parse_suggested_issues(comment_body):
    """Return the issue numbers listed in the bot's comment.

    Only lines that begin with '- #' followed by digits count (for example
    '- #12345'); the numbers are returned as ints in order of appearance.
    """
    bullet_pattern = re.compile(r"^- #(\d+)", re.MULTILINE)
    return [int(hit.group(1)) for hit in bullet_pattern.finditer(comment_body)]
118
119
def github_api_graphql(query, variables=None, timeout=30):
    """Execute a GitHub GraphQL query or mutation and return its "data" payload.

    Args:
        query: The GraphQL document to send.
        variables: Optional mapping of GraphQL variables; an empty mapping is
            sent when omitted.
        timeout: Seconds to wait for the server before giving up. requests
            applies no timeout by default, so without one a stalled
            connection would hang the script indefinitely.

    Returns:
        The value of the "data" key of the JSON response.

    Raises:
        requests.HTTPError: If the response status is 4xx or 5xx.
        requests.Timeout: If the server does not respond within ``timeout``.
        RuntimeError: If the response body contains an "errors" key.
    """
    response = requests.post(
        GRAPHQL_URL,
        headers=GITHUB_HEADERS,
        json={"query": query, "variables": variables or {}},
        timeout=timeout,
    )
    response.raise_for_status()
    data = response.json()
    # GraphQL reports failures in the body, so a successful HTTP status alone
    # does not mean the operation succeeded.
    if "errors" in data:
        raise RuntimeError(f"GraphQL errors: {data['errors']}")
    return data["data"]
132
133
def get_closed_as_duplicate_of(issue_number):
    """Return the number of the issue this one was marked a duplicate of.

    Queries the last 10 MarkedAsDuplicateEvent timeline items and walks them
    from the end of the list backwards, returning the first canonical issue
    number found, or None when there is none.

    Note: not every "closed as duplicate" issue has a MarkedAsDuplicateEvent.
    If the closer used the "Close as duplicate" button without separately
    marking the duplicate relationship, no event exists and None is returned;
    the caller then flags the item for manual review.
    """
    timeline_query = """
        query($owner: String!, $repo: String!, $number: Int!) {
          repository(owner: $owner, name: $repo) {
            issue(number: $number) {
              timelineItems(last: 10, itemTypes: [MARKED_AS_DUPLICATE_EVENT]) {
                nodes {
                  ... on MarkedAsDuplicateEvent {
                    canonical { ... on Issue { number } }
                  }
                }
              }
            }
          }
        }
        """
    variables = {"owner": REPO_OWNER, "repo": REPO_NAME, "number": issue_number}
    data = github_api_graphql(timeline_query, variables)

    events = data["repository"]["issue"]["timelineItems"]["nodes"]
    canonical_numbers = (
        (event.get("canonical") or {}).get("number") for event in reversed(events)
    )
    # Skip events whose canonical item carries no issue number.
    return next((number for number in canonical_numbers if number), None)
168
169
@functools.lru_cache
def get_project_config():
    """Load the project board's ID plus the IDs of its fields and options.

    Returns a dict shaped like
    {"project_id": ..., "fields": {name: {"id": ..., "options": {name: id}}}}
    where "options" is present only for fields that report options. The
    result is cached, so the board is queried once per process.
    """
    config_query = """
        query($org: String!, $number: Int!) {
          organization(login: $org) {
            projectV2(number: $number) {
              id
              fields(first: 30) {
                nodes {
                  ... on ProjectV2SingleSelectField { id name options { id name } }
                  ... on ProjectV2Field { id name }
                }
              }
            }
          }
        }
        """
    data = github_api_graphql(
        config_query, {"org": REPO_OWNER, "number": PROJECT_NUMBER}
    )
    project = data["organization"]["projectV2"]
    project_id = project["id"]

    fields = {}
    for node in project["fields"]["nodes"]:
        field_name = node.get("name")
        if field_name:
            # Nodes without a name are ignored.
            entry = {"id": node["id"]}
            if "options" in node:
                entry["options"] = {
                    choice["name"]: choice["id"] for choice in node["options"]
                }
            fields[field_name] = entry

    config = {"project_id": project_id, "fields": fields}
    print(f" Project config loaded: {len(config['fields'])} fields")
    return config
207
208
def find_project_item(issue_node_id):
    """Look up the issue's item on our project board.

    Inspects the first 20 project items attached to the issue and returns the
    ID of the one belonging to project PROJECT_NUMBER, or None if there is
    no such item.
    """
    lookup_query = "query($id: ID!) { node(id: $id) { ... on Issue { projectItems(first: 20) { nodes { id project { number } } } } } }"
    data = github_api_graphql(lookup_query, {"id": issue_node_id})
    board_items = data["node"]["projectItems"]["nodes"]
    return next(
        (
            entry["id"]
            for entry in board_items
            if entry["project"]["number"] == PROJECT_NUMBER
        ),
        None,
    )
222
223
def add_project_item(issue_node_id):
    """Put an issue on the project board and return the ID of the new item."""
    project_id = get_project_config()["project_id"]
    add_mutation = """
        mutation($projectId: ID!, $contentId: ID!) {
          addProjectV2ItemById(input: {projectId: $projectId, contentId: $contentId}) {
            item { id }
          }
        }
        """
    variables = {"projectId": project_id, "contentId": issue_node_id}
    result = github_api_graphql(add_mutation, variables)
    return result["addProjectV2ItemById"]["item"]["id"]
238
239
def set_field_value(item_id, field_name, value):
    """Write one field value onto a project board item.

    Fields that expose options are treated as single-select and ``value`` is
    resolved to an option ID; every other field is written as text using
    ``str(value)``. An unknown field or option only prints a warning and
    leaves the item untouched.
    """
    config = get_project_config()
    field = config["fields"].get(field_name)
    if not field:
        print(f" Warning: field '{field_name}' not found on project board")
        return

    if "options" not in field:
        # Plain text field.
        payload = {"text": str(value)}
    else:
        # Single-select field: translate the option name into its ID.
        option_id = field["options"].get(value)
        if not option_id:
            print(f" Warning: option '{value}' not found for field '{field_name}'")
            return
        payload = {"singleSelectOptionId": option_id}

    update_mutation = """
        mutation($projectId: ID!, $itemId: ID!, $fieldId: ID!, $value: ProjectV2FieldValue!) {
          updateProjectV2ItemFieldValue(input: {
            projectId: $projectId
            itemId: $itemId
            fieldId: $fieldId
            value: $value
          }) {
            projectV2Item { id }
          }
        }
        """
    variables = {
        "projectId": config["project_id"],
        "itemId": item_id,
        "fieldId": field["id"],
        "value": payload,
    }
    github_api_graphql(update_mutation, variables)
279
280
def add_or_update_project_item(issue_node_id, outcome, closed_as=None, status="Auto-classified", notes=None, bot_comment_time=None):
    """Ensure the issue is on the project board and set its field values.

    "Outcome" and "Status" are always written. "Closed as" is written only
    for values in VALID_CLOSED_AS_VALUES, "Notes" only when notes are given,
    and "Bot version" only when a bot comment timestamp is given. Returns the
    project item ID.
    """
    item_id = find_project_item(issue_node_id)
    if not item_id:
        item_id = add_project_item(issue_node_id)
        print(f" Added to project board (item {item_id})")
    else:
        print(f" Issue already on board, updating (item {item_id})")

    for field_name, field_value in (("Outcome", outcome), ("Status", status)):
        set_field_value(item_id, field_name, field_value)

    if closed_as and closed_as in VALID_CLOSED_AS_VALUES:
        set_field_value(item_id, "Closed as", closed_as)

    if notes:
        set_field_value(item_id, "Notes", notes)

    if bot_comment_time:
        version = bot_version_for_time(bot_comment_time)
        set_field_value(item_id, "Bot version", version)

    return item_id
303
304
def classify_closed(issue_number, closer_login, state_reason):
    """Decide the outcome for a closed issue and record it on the board.

    Issues authored by staff are skipped. Otherwise the outcome depends on
    whether the bot commented, whether the author closed the issue, and
    whether it was closed as a duplicate. An empty ``state_reason`` is
    treated as "unknown".
    """
    state_reason = state_reason or "unknown"
    print(f"Classifying closed issue #{issue_number}")
    print(f" Closer: {closer_login}, state_reason: {state_reason}")

    issue = fetch_issue(issue_number)
    author = issue["author"]
    print(f" Author: {author}, type: {issue['type_name']}")

    if is_staff_member(author):
        print(f" Skipping: author '{author}' is a staff member")
        return

    bot_comment = get_bot_comment_with_time(issue_number)
    print(f" Bot commented: {bot_comment is not None}")

    if bot_comment is None:
        if state_reason == "duplicate":
            classify_as_missed_opportunity(issue)
        else:
            print(" Skipping: no bot comment and not closed as duplicate")
        return

    if closer_login == author:
        classify_as_success(issue, bot_comment, state_reason)
    else:
        # Only authors, staff, and triagers can close issues, so
        # a non-author closer is always someone with elevated permissions.
        classify_non_author_closed(issue, bot_comment, state_reason)
335
336
def classify_as_success(issue, bot_comment, state_reason):
    """Record a Success: the author closed their own issue after the bot commented.

    Closing as "duplicate" is auto-classified. Any other close reason may be
    unrelated to the bot, so the item is flagged "Needs review" with a note.
    """
    if state_reason == "duplicate":
        status, notes = "Auto-classified", None
        print(f" -> Success (closed as {state_reason})")
    else:
        status, notes = "Needs review", f"Author closed as {state_reason}"
        print(f" -> Possible Success, needs review ({notes})")

    add_or_update_project_item(
        issue["node_id"],
        outcome="Success",
        closed_as=state_reason,
        status=status,
        notes=notes,
        bot_comment_time=bot_comment["created_at"],
    )
359
360
def classify_non_author_closed(issue, bot_comment, state_reason):
    """Handle an issue the bot commented on that someone other than the author closed.

    A close as "duplicate" is passed on to classify_as_assist; any other
    reason is recorded as Noise and flagged "Needs review".
    """
    if state_reason == "duplicate":
        classify_as_assist(issue, bot_comment)
        return

    notes = f"Closed by staff/triager as {state_reason}, not duplicate"
    print(f" -> Possible Noise, needs review ({notes})")
    add_or_update_project_item(
        issue["node_id"],
        outcome="Noise",
        closed_as=state_reason,
        status="Needs review",
        notes=notes,
        bot_comment_time=bot_comment["created_at"],
    )
376
377
def classify_as_assist(issue, bot_comment):
    """Record an Assist: closed as duplicate by a non-author after the bot commented.

    The item is auto-classified only when the issue it was closed as a
    duplicate of is among the bot's suggestions. A mismatch, an unknown
    original, or unparseable suggestions all yield "Needs review" with an
    explanatory note.
    """
    suggested = parse_suggested_issues(bot_comment["body"])
    try:
        original = get_closed_as_duplicate_of(issue["number"])
    except (requests.RequestException, RuntimeError) as error:
        # Best effort: a failed lookup is handled like a missing original.
        original = None
        print(f" Warning: failed to get the original-for the duplicate issue: {error}")

    if not original:
        status = "Needs review"
        notes = "Could not determine original issue from timeline"
        print(f" -> Possible Assist, needs review ({notes})")
    elif not suggested:
        status = "Needs review"
        notes = f"Closed as dup of #{original}; could not parse bot suggestions"
        print(f" -> Possible Assist, needs review ({notes})")
    elif original in suggested:
        status = "Auto-classified"
        notes = None
        print(f" -> Assist (original #{original} matches bot suggestion)")
    else:
        status = "Needs review"
        suggested_str = ", ".join(f"#{number}" for number in suggested)
        notes = f"Bot suggested {suggested_str}; closed as dup of #{original}"
        print(f" -> Possible Assist, needs review ({notes})")

    add_or_update_project_item(
        issue["node_id"],
        outcome="Assist",
        closed_as="duplicate",
        status=status,
        notes=notes,
        bot_comment_time=bot_comment["created_at"],
    )
409
410
def classify_as_missed_opportunity(issue):
    """Record a Missed opportunity: closed as duplicate without any bot comment."""
    print(" -> Missed opportunity")
    add_or_update_project_item(
        issue["node_id"],
        outcome="Missed opportunity",
        closed_as="duplicate",
        status="Auto-classified",
    )
416
417
def classify_open():
    """Sweep open, triaged, bot-commented issues and record them as Noise.

    Each search hit is skipped when its type is not Bug or Crash, when its
    author is staff, when it is already on the board, or when no bot
    duplicate comment is found. The checks run in that order and stop at the
    first one that applies. A failure on one issue is counted and reported
    without stopping the sweep.
    """
    print("Classifying open issues")

    search_terms = [
        f"repo:{REPO_OWNER}/{REPO_NAME} is:issue is:open",
        f"commenter:app/{BOT_APP_SLUG}",
        f'-label:"{NEEDS_TRIAGE_LABEL}"',
        f"created:>={BOT_START_DATE}",
    ]
    query = " ".join(search_terms)
    print(f" Search query: {query}")

    candidates = github_search_issues(query)
    print(f" Found {len(candidates)} candidate issues")

    added = 0
    skipped = 0
    errors = 0
    for candidate in candidates:
        number = candidate["number"]
        try:
            type_name = (candidate.get("type") or {}).get("name")
            author = (candidate.get("user") or {}).get("login", "")
            node_id = candidate["node_id"]

            # Cheapest checks first; each later check costs an API call.
            bot_comment = None
            if type_name not in ("Bug", "Crash"):
                skip_reason = f"type is {type_name}"
            elif is_staff_member(author):
                skip_reason = f"author {author} is staff"
            elif find_project_item(node_id):
                skip_reason = "already on the board"
            else:
                bot_comment = get_bot_comment_with_time(number)
                skip_reason = None if bot_comment else "no bot duplicate comment found"

            if skip_reason:
                print(f" #{number}: skipping, {skip_reason}")
                skipped += 1
                continue

            print(f" #{number}: adding as Noise")
            add_or_update_project_item(
                node_id,
                outcome="Noise",
                status="Auto-classified",
                bot_comment_time=bot_comment["created_at"],
            )
            added += 1
        except Exception as error:  # broad catch: one issue failing shouldn't stop the sweep
            print(f" #{number}: error processing issue, skipping: {error}")
            errors += 1

    print(f" Done: added {added}, skipped {skipped}, errors {errors}")
463
464
if __name__ == "__main__":
    # Command-line interface: one subcommand per classification mode.
    cli = argparse.ArgumentParser(
        description="Track duplicate bot effectiveness on a GitHub project board.",
    )
    commands = cli.add_subparsers(dest="command", required=True)

    closed_command = commands.add_parser(
        "classify-closed",
        help="Classify a closed issue and add it to the project board.",
    )
    closed_command.add_argument("issue_number", type=int)
    closed_command.add_argument("closer_login")
    closed_command.add_argument("state_reason")

    commands.add_parser(
        "classify-open",
        help="Classify open, triaged, bot-commented issues as Noise.",
    )

    args = cli.parse_args()

    # The token is mandatory; stop before any API call is attempted.
    GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "")
    if not GITHUB_TOKEN:
        print("Error: GITHUB_TOKEN environment variable is required")
        sys.exit(1)

    # PROJECT_NUMBER falls back to the default board when unset or empty.
    project_number_text = os.environ.get("PROJECT_NUMBER", "")
    if not project_number_text:
        PROJECT_NUMBER = DEFAULT_PROJECT_NUMBER
    else:
        try:
            PROJECT_NUMBER = int(project_number_text)
        except ValueError:
            print(f"Error: PROJECT_NUMBER must be an integer, got '{project_number_text}'")
            sys.exit(1)

    # Headers shared by every REST and GraphQL request in this script.
    GITHUB_HEADERS = {
        "Authorization": f"token {GITHUB_TOKEN}",
        "Accept": "application/vnd.github+json",
    }

    if args.command == "classify-open":
        classify_open()
    elif args.command == "classify-closed":
        classify_closed(args.issue_number, args.closer_login, args.state_reason)