1#!/usr/bin/env python3
2"""
3Track the effectiveness of the duplicate-detection bot by classifying issues
4into outcome categories on a GitHub Projects v2 board.
5
6Subcommands:
7 classify-closed <issue_number> <closer_login> <state_reason>
8 Classify a closed issue and add it to the project board.
9
10 classify-open
11 Classify open, triaged, bot-commented issues and add them to
12 the project board as Noise.
13
14Requires:
15 requests (pip install requests)
16
17Environment variables:
18 GITHUB_TOKEN - GitHub App token
19 PROJECT_NUMBER - GitHub Projects v2 board number (default: 76, override for local testing)
20"""
21
22import argparse
23import functools
24import os
25import re
26import sys
27from datetime import datetime, timezone
28
29import requests
30
GITHUB_API = "https://api.github.com"
GRAPHQL_URL = "https://api.github.com/graphql"
REPO_OWNER = "zed-industries"
REPO_NAME = "zed"
# Org team whose members' issues are excluded from classification.
STAFF_TEAM_SLUG = "staff"
# GitHub Apps comment under "<slug>[bot]"; the app slug is what search
# queries use (commenter:app/<slug>).
BOT_LOGIN = "zed-community-bot[bot]"
BOT_APP_SLUG = "zed-community-bot"
# Prefix that identifies the bot's duplicate-detection comment.
BOT_COMMENT_PREFIX = "This issue appears to be a duplicate of"
# Only issues created on/after this date are searched (matches the v1
# deployment date in BOT_VERSION_TIMELINE below).
BOT_START_DATE = "2026-02-18"
NEEDS_TRIAGE_LABEL = "state:needs triage"
DEFAULT_PROJECT_NUMBER = 76
# state_reason values accepted for the board's "Closed as" field.
VALID_CLOSED_AS_VALUES = {"duplicate", "not_planned", "completed"}
# Add a new tuple when you deploy a new version of the bot that you want to
# keep track of (e.g. the prompt gets a rewrite or the model gets swapped).
# Newest first, please. The datetime is for the deployment time (merge to main).
BOT_VERSION_TIMELINE = [
    ("v2", datetime(2026, 2, 26, 14, 9, tzinfo=timezone.utc)),
    ("v1", datetime(2026, 2, 18, tzinfo=timezone.utc)),
]
50
51
def bot_version_for_time(date_string):
    """Map an ISO 8601 timestamp to the bot version deployed at that time.

    BOT_VERSION_TIMELINE is ordered newest-first, so the first entry whose
    deployment time is at or before the timestamp wins; timestamps that
    predate every entry fall back to the oldest version.
    """
    # fromisoformat() on older Pythons rejects a trailing "Z", so normalize it.
    moment = datetime.fromisoformat(date_string.replace("Z", "+00:00"))
    return next(
        (version for version, deployed_at in BOT_VERSION_TIMELINE if moment >= deployed_at),
        BOT_VERSION_TIMELINE[-1][0],
    )
59
60
def github_api_get(path, params=None):
    """GET a REST endpoint (path relative to the API root) and return parsed JSON.

    Raises requests.HTTPError for non-2xx responses.
    """
    response = requests.get(
        f"{GITHUB_API}/{path.lstrip('/')}",
        headers=GITHUB_HEADERS,
        params=params,
    )
    response.raise_for_status()
    return response.json()
66
67
def github_search_issues(query):
    """Search issues, returning most recently created first (one page, max 100)."""
    # Pagination is deliberately skipped: the oldest issues are on the board already.
    search_params = {"q": query, "sort": "created", "order": "desc", "per_page": 100}
    return github_api_get("/search/issues", search_params).get("items", [])
73
74
def is_staff_member(username):
    """Return True if username is an active member of the staff team.

    GitHub answers 404 for non-members, which counts as "not staff";
    any other HTTP failure propagates to the caller.
    """
    membership_path = f"/orgs/{REPO_OWNER}/teams/{STAFF_TEAM_SLUG}/memberships/{username}"
    try:
        membership = github_api_get(membership_path)
    except requests.HTTPError as error:
        if error.response.status_code == 404:
            return False
        raise
    return membership.get("state") == "active"
86
87
def fetch_issue(issue_number):
    """Fetch one issue and distill it to the fields the classifiers need."""
    raw = github_api_get(f"/repos/{REPO_OWNER}/{REPO_NAME}/issues/{issue_number}")
    user = raw.get("user") or {}
    issue_type = raw.get("type") or {}
    return {
        "number": issue_number,
        "node_id": raw["node_id"],
        "author": user.get("login", ""),
        "type_name": issue_type.get("name"),
        "created_at": raw.get("created_at", ""),
    }
97
98
def get_bot_comment_with_time(issue_number):
    """Get the bot's duplicate-detection comment and its timestamp from an issue.

    Walks the comment pages in order and returns {"body": str, "created_at": str}
    for the first comment authored by the bot that starts with the
    duplicate-detection prefix, or None if the bot never commented.
    """
    path = f"/repos/{REPO_OWNER}/{REPO_NAME}/issues/{issue_number}/comments"
    page = 1
    while True:
        batch = github_api_get(path, {"per_page": 100, "page": page})
        if not batch:
            return None
        for entry in batch:
            commenter = (entry.get("user") or {}).get("login", "")
            text = entry.get("body", "")
            if commenter == BOT_LOGIN and text.startswith(BOT_COMMENT_PREFIX):
                return {"body": text, "created_at": entry.get("created_at", "")}
        page += 1
114
115
def parse_suggested_issues(comment_body):
    """Extract issue numbers from the bot's comment (lines like '- #12345')."""
    suggestion_line = re.compile(r"^- #(\d+)", re.MULTILINE)
    return list(map(int, suggestion_line.findall(comment_body)))
119
120
def github_api_graphql(query, variables=None, partial_errors_ok=False):
    """POST a GraphQL query and return the "data" payload.

    Raises RuntimeError when the response carries GraphQL-level errors,
    unless partial_errors_ok is set AND the response still has a "data"
    member — then the errors are printed and the partial data returned.
    """
    payload = {"query": query, "variables": variables or {}}
    response = requests.post(GRAPHQL_URL, headers=GITHUB_HEADERS, json=payload)
    response.raise_for_status()
    body = response.json()
    if "errors" in body:
        if not partial_errors_ok or "data" not in body:
            raise RuntimeError(f"GraphQL errors: {body['errors']}")
        print(f" GraphQL partial errors (ignored): {body['errors']}")
    return body["data"]
135
136
def find_canonical_among(duplicate_number, candidates):
    """Check if any candidate issue has duplicate_number marked as a duplicate.

    The MarkedAsDuplicateEvent lives on the canonical issue's timeline, not the
    duplicate's. So to find which canonical issue our duplicate was closed against,
    we check each candidate's timeline for a MarkedAsDuplicateEvent whose
    `duplicate` field matches our issue.

    Returns the matching canonical issue number, or None.
    """
    if not candidates:
        return None

    # One aliased sub-query per candidate so a single request covers them all;
    # partial_errors_ok tolerates individual candidates that fail to resolve.
    issue_queries = "\n ".join(
        f'issue_{number}: issue(number: {number}) {{'
        f' timelineItems(last: 50, itemTypes: [MARKED_AS_DUPLICATE_EVENT]) {{'
        f' nodes {{ ... on MarkedAsDuplicateEvent {{ duplicate {{ ... on Issue {{ number }} }} }} }} }} }}'
        for number in candidates
    )
    # Bug fix: the candidate numbers are inlined into the query text above, so
    # no $numbers variable may be declared. GraphQL's "All Variables Used"
    # validation rule rejects a document that declares an unused variable,
    # which would fail the entire request (no data, so partial_errors_ok
    # could not rescue it either).
    data = github_api_graphql(
        """
        query($owner: String!, $repo: String!) {
          repository(owner: $owner, name: $repo) {
            PLACEHOLDER
          }
        }
        """.replace("PLACEHOLDER", issue_queries),
        {"owner": REPO_OWNER, "repo": REPO_NAME},
        partial_errors_ok=True,
    )

    repo = data["repository"]
    for candidate in candidates:
        issue_data = repo.get(f"issue_{candidate}")
        if not issue_data:
            # This candidate errored out (e.g. it is a PR or was deleted).
            continue
        for node in issue_data["timelineItems"]["nodes"]:
            dup_number = (node.get("duplicate") or {}).get("number")
            if dup_number == duplicate_number:
                return candidate
    return None
177
178
@functools.lru_cache
def get_project_config():
    """Fetch and memoize the project board's ID, field IDs, and option IDs.

    Returns {"project_id": ..., "fields": {name: {"id": ..., "options": ...}}};
    only single-select fields carry an "options" map of option name -> option ID.
    """
    data = github_api_graphql(
        """
        query($org: String!, $number: Int!) {
          organization(login: $org) {
            projectV2(number: $number) {
              id
              fields(first: 30) {
                nodes {
                  ... on ProjectV2SingleSelectField { id name options { id name } }
                  ... on ProjectV2Field { id name }
                }
              }
            }
          }
        }
        """,
        {"org": REPO_OWNER, "number": PROJECT_NUMBER},
    )
    project = data["organization"]["projectV2"]

    fields = {}
    for node in project["fields"]["nodes"]:
        field_name = node.get("name")
        if not field_name:
            # Nodes of other field types match neither fragment and come back empty.
            continue
        entry = {"id": node["id"]}
        if "options" in node:
            entry["options"] = {opt["name"]: opt["id"] for opt in node["options"]}
        fields[field_name] = entry

    print(f" Project config loaded: {len(fields)} fields")
    return {"project_id": project["id"], "fields": fields}
216
217
def find_project_item(issue_node_id):
    """Check if an issue is already on our project board.

    Returns the project item ID if found, or None.
    """
    data = github_api_graphql(
        "query($id: ID!) { node(id: $id) { ... on Issue { projectItems(first: 20) { nodes { id project { number } } } } } }",
        {"id": issue_node_id},
    )
    board_items = data["node"]["projectItems"]["nodes"]
    return next(
        (entry["id"] for entry in board_items if entry["project"]["number"] == PROJECT_NUMBER),
        None,
    )
231
232
def add_project_item(issue_node_id):
    """Add an issue to the project board. Returns the new item ID."""
    project_id = get_project_config()["project_id"]
    result = github_api_graphql(
        """
        mutation($projectId: ID!, $contentId: ID!) {
          addProjectV2ItemById(input: {projectId: $projectId, contentId: $contentId}) {
            item { id }
          }
        }
        """,
        {"projectId": project_id, "contentId": issue_node_id},
    )
    return result["addProjectV2ItemById"]["item"]["id"]
247
248
def set_field_value(item_id, field_name, value):
    """Set a single field value on a project board item.

    Single-select fields are matched by option name; all other fields are
    written as text. Unknown fields or options are warned about and skipped
    rather than raised, so one bad field name cannot abort a classification.
    """
    config = get_project_config()
    field = config["fields"].get(field_name)
    if not field:
        print(f" Warning: field '{field_name}' not found on project board")
        return

    if "options" not in field:
        # plain text field
        graphql_value = {"text": str(value)}
    else:
        # single-select: translate the human-readable option name into its ID
        option_id = field["options"].get(value)
        if not option_id:
            print(f" Warning: option '{value}' not found for field '{field_name}'")
            return
        graphql_value = {"singleSelectOptionId": option_id}

    github_api_graphql(
        """
        mutation($projectId: ID!, $itemId: ID!, $fieldId: ID!, $value: ProjectV2FieldValue!) {
          updateProjectV2ItemFieldValue(input: {
            projectId: $projectId
            itemId: $itemId
            fieldId: $fieldId
            value: $value
          }) {
            projectV2Item { id }
          }
        }
        """,
        {
            "projectId": config["project_id"],
            "itemId": item_id,
            "fieldId": field["id"],
            "value": graphql_value,
        },
    )
288
289
def add_or_update_project_item(issue_node_id, outcome, closed_as=None, status="Auto-classified", notes=None, bot_comment_time=None):
    """Add an issue to the project board (or update it if already there), setting field values.

    Returns the project item ID. Optional fields ("Closed as", "Notes",
    "Bot version") are only written when a usable value was supplied.
    """
    existing_item = find_project_item(issue_node_id)
    if existing_item:
        item_id = existing_item
        print(f" Issue already on board, updating (item {item_id})")
    else:
        item_id = add_project_item(issue_node_id)
        print(f" Added to project board (item {item_id})")

    set_field_value(item_id, "Outcome", outcome)
    set_field_value(item_id, "Status", status)

    if closed_as and closed_as in VALID_CLOSED_AS_VALUES:
        set_field_value(item_id, "Closed as", closed_as)
    if notes:
        set_field_value(item_id, "Notes", notes)
    if bot_comment_time:
        set_field_value(item_id, "Bot version", bot_version_for_time(bot_comment_time))

    return item_id
312
313
def classify_closed(issue_number, closer_login, state_reason):
    """Classify a closed issue and add/update it on the project board.

    Dispatch: author self-close after a bot comment -> Success path;
    non-author close after a bot comment -> staff/triager path; closed as
    duplicate with no bot comment -> missed opportunity; otherwise skipped.
    Staff-authored issues are always skipped.
    """
    state_reason = state_reason or "unknown"
    print(f"Classifying closed issue #{issue_number}")
    print(f" Closer: {closer_login}, state_reason: {state_reason}")

    issue = fetch_issue(issue_number)
    author = issue["author"]
    print(f" Author: {author}, type: {issue['type_name']}")

    if is_staff_member(author):
        print(f" Skipping: author '{author}' is a staff member")
        return

    bot_comment = get_bot_comment_with_time(issue_number)
    print(f" Bot commented: {bot_comment is not None}")

    if bot_comment is None:
        if state_reason == "duplicate":
            classify_as_missed_opportunity(issue)
        else:
            print(" Skipping: no bot comment and not closed as duplicate")
    elif closer_login == author:
        classify_as_success(issue, bot_comment, state_reason)
    else:
        # Only authors, staff, and triagers can close issues, so
        # a non-author closer is always someone with elevated permissions.
        classify_non_author_closed(issue, bot_comment, state_reason)
344
345
def classify_as_success(issue, bot_comment, state_reason):
    """Author closed their own issue after the bot commented."""
    if state_reason == "duplicate":
        status, notes = "Auto-classified", None
        print(f" -> Success (closed as {state_reason})")
    else:
        # could be closed for an unrelated reason; flag for review
        status = "Needs review"
        notes = f"Author closed as {state_reason}"
        print(f" -> Possible Success, needs review ({notes})")

    add_or_update_project_item(
        issue["node_id"],
        outcome="Success",
        closed_as=state_reason,
        status=status,
        notes=notes,
        bot_comment_time=bot_comment["created_at"],
    )
368
369
def classify_non_author_closed(issue, bot_comment, state_reason):
    """Non-author (staff or triager) closed an issue the bot had commented on."""
    if state_reason == "duplicate":
        classify_as_assist(issue, bot_comment)
        return

    # Closed for some other reason even though the bot flagged it: the bot's
    # comment was probably noise, but a human should confirm.
    notes = f"Closed by staff/triager as {state_reason}, not duplicate"
    print(f" -> Possible Noise, needs review ({notes})")
    add_or_update_project_item(
        issue["node_id"],
        outcome="Noise",
        closed_as=state_reason,
        status="Needs review",
        notes=notes,
        bot_comment_time=bot_comment["created_at"],
    )
385
386
def classify_as_assist(issue, bot_comment):
    """Staff member closed as duplicate after the bot commented. Check if the dup matches."""
    suggested = parse_suggested_issues(bot_comment["body"])
    if not suggested:
        print(" -> Assist, needs review (could not parse bot suggestions)")
        add_or_update_project_item(
            issue["node_id"],
            outcome="Assist",
            closed_as="duplicate",
            status="Needs review",
            notes="Could not parse bot suggestions",
            bot_comment_time=bot_comment["created_at"],
        )
        return

    canonical = None
    try:
        canonical = find_canonical_among(issue["number"], suggested)
    except (requests.RequestException, RuntimeError) as error:
        # Best-effort lookup: fall through to the needs-review branch below.
        print(f" Warning: failed to query candidate timelines: {error}")

    if canonical:
        status, notes = "Auto-classified", None
        print(f" -> Assist (original #{canonical} matches bot suggestion)")
    else:
        status = "Needs review"
        suggested_str = ", ".join(f"#{number}" for number in suggested)
        notes = f"Bot suggested {suggested_str}; none matched as canonical"
        print(f" -> Possible Assist, needs review ({notes})")

    add_or_update_project_item(
        issue["node_id"],
        outcome="Assist",
        closed_as="duplicate",
        status=status,
        notes=notes,
        bot_comment_time=bot_comment["created_at"],
    )
417
418
def classify_as_missed_opportunity(issue):
    """Issue closed as duplicate but the bot never commented."""
    print(" -> Missed opportunity")
    # No bot comment exists to date the version from, so the issue's creation
    # time stands in — presumably close to when the bot would have run.
    add_or_update_project_item(
        issue["node_id"],
        outcome="Missed opportunity",
        closed_as="duplicate",
        status="Auto-classified",
        bot_comment_time=issue["created_at"],
    )
425
426
def classify_open():
    """Classify open, triaged, bot-commented issues as Noise.

    Sweeps the most recent search results; an issue is added as Noise unless
    it is the wrong type, staff-authored, already on the board, or missing
    the bot's duplicate comment. Per-issue failures are counted, not fatal.
    """
    print("Classifying open issues")

    query = (
        f"repo:{REPO_OWNER}/{REPO_NAME} is:issue is:open "
        f"commenter:app/{BOT_APP_SLUG} "
        f'-label:"{NEEDS_TRIAGE_LABEL}" '
        f"created:>={BOT_START_DATE}"
    )
    print(f" Search query: {query}")

    results = github_search_issues(query)
    print(f" Found {len(results)} candidate issues")

    added = skipped = errors = 0
    for item in results:
        number = item["number"]
        try:
            type_name = (item.get("type") or {}).get("name")
            author = (item.get("user") or {}).get("login", "")
            node_id = item["node_id"]

            # Guards run in the same order as before: cheap local checks
            # first, then the API-backed ones.
            bot_comment = None
            if type_name not in ("Bug", "Crash"):
                skip_reason = f"type is {type_name}"
            elif is_staff_member(author):
                skip_reason = f"author {author} is staff"
            elif find_project_item(node_id):
                skip_reason = "already on the board"
            elif not (bot_comment := get_bot_comment_with_time(number)):
                skip_reason = "no bot duplicate comment found"
            else:
                skip_reason = None

            if skip_reason:
                print(f" #{number}: skipping, {skip_reason}")
                skipped += 1
                continue

            print(f" #{number}: adding as Noise")
            add_or_update_project_item(
                node_id,
                outcome="Noise",
                status="Auto-classified",
                bot_comment_time=bot_comment["created_at"],
            )
            added += 1
        except Exception as error:  # broad catch: one issue failing shouldn't stop the sweep
            print(f" #{number}: error processing issue, skipping: {error}")
            errors += 1

    print(f" Done: added {added}, skipped {skipped}, errors {errors}")
472
473
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Track duplicate bot effectiveness on a GitHub project board.",
    )
    subparsers = parser.add_subparsers(dest="command", required=True)

    closed_parser = subparsers.add_parser(
        "classify-closed",
        help="Classify a closed issue and add it to the project board.",
    )
    closed_parser.add_argument("issue_number", type=int)
    closed_parser.add_argument("closer_login")
    closed_parser.add_argument("state_reason")

    subparsers.add_parser(
        "classify-open",
        help="Classify open, triaged, bot-commented issues as Noise.",
    )

    args = parser.parse_args()

    # These module-level globals are read by the API helpers above, so they
    # must be bound before any command runs.
    GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "")
    if not GITHUB_TOKEN:
        print("Error: GITHUB_TOKEN environment variable is required")
        sys.exit(1)

    if raw_project_number := os.environ.get("PROJECT_NUMBER", ""):
        try:
            PROJECT_NUMBER = int(raw_project_number)
        except ValueError:
            print(f"Error: PROJECT_NUMBER must be an integer, got '{raw_project_number}'")
            sys.exit(1)
    else:
        PROJECT_NUMBER = DEFAULT_PROJECT_NUMBER

    GITHUB_HEADERS = {
        "Authorization": f"token {GITHUB_TOKEN}",
        "Accept": "application/vnd.github+json",
    }

    if args.command == "classify-closed":
        classify_closed(args.issue_number, args.closer_login, args.state_reason)
    else:
        classify_open()