1#!/usr/bin/env python3
2"""
3Track the effectiveness of the duplicate-detection bot by classifying issues
4into outcome categories on a GitHub Projects v2 board.
5
6Subcommands:
7 classify-closed <issue_number> <closer_login> <state_reason>
8 Classify a closed issue and add it to the project board.
9
10 classify-open
11 Classify open, triaged, bot-commented issues and add them to
12 the project board as Noise.
13
14Requires:
15 requests (pip install requests)
16
17Environment variables:
18 GITHUB_TOKEN - GitHub App token
19 PROJECT_NUMBER - GitHub Projects v2 board number (default: 76, override for local testing)
20"""
21
22import argparse
23import functools
24import os
25import re
26import sys
27from datetime import datetime, timezone
28
29import requests
30
GITHUB_API = "https://api.github.com"
GRAPHQL_URL = "https://api.github.com/graphql"
REPO_OWNER = "zed-industries"
REPO_NAME = "zed"
# Org team whose members' issues are excluded from classification.
STAFF_TEAM_SLUG = "staff"
# Comment author login as reported by the REST API for the GitHub App.
BOT_LOGIN = "zed-community-bot[bot]"
# App slug used in issue-search qualifiers (commenter:app/<slug>).
BOT_APP_SLUG = "zed-community-bot"
# Prefix that identifies the bot's duplicate-detection comment body.
BOT_COMMENT_PREFIX = "This issue appears to be a duplicate of"
# Earliest date the bot was live; bounds the classify-open search window.
BOT_START_DATE = "2026-02-18"
NEEDS_TRIAGE_LABEL = "state:needs triage"
DEFAULT_PROJECT_NUMBER = 76
# state_reason values that map onto the board's "Closed as" single-select options.
VALID_CLOSED_AS_VALUES = {"duplicate", "not_planned", "completed"}
# Add a new tuple when you deploy a new version of the bot that you want to
# keep track of (e.g. the prompt gets a rewrite or the model gets swapped).
# Newest first, please. The datetime is for the deployment time (merge to main).
BOT_VERSION_TIMELINE = [
    ("v2", datetime(2026, 2, 26, 14, 9, tzinfo=timezone.utc)),
    ("v1", datetime(2026, 2, 18, tzinfo=timezone.utc)),
]
50
51
def bot_version_for_time(date_string):
    """Return the bot version that was active at the given ISO 8601 timestamp."""
    moment = datetime.fromisoformat(date_string.replace("Z", "+00:00"))
    # The timeline is ordered newest-first, so the first deployment at or
    # before the timestamp is the version that was live then.
    active = next(
        (version for version, deployed in BOT_VERSION_TIMELINE if moment >= deployed),
        None,
    )
    # Timestamps predating every deployment are attributed to the oldest version.
    return active if active is not None else BOT_VERSION_TIMELINE[-1][0]
59
60
def github_api_get(path, params=None):
    """GET a GitHub REST endpoint and return the decoded JSON body.

    Raises requests.HTTPError on non-2xx responses.
    """
    endpoint = GITHUB_API + "/" + path.lstrip("/")
    resp = requests.get(endpoint, headers=GITHUB_HEADERS, params=params)
    resp.raise_for_status()
    return resp.json()
66
67
def github_search_issues(query):
    """Search issues, returning most recently created first."""
    # not handling pagination on purpose: the oldest issues are on the board already
    result = github_api_get(
        "/search/issues",
        {"q": query, "sort": "created", "order": "desc", "per_page": 100},
    )
    return result.get("items", [])
73
74
def is_staff_member(username):
    """Check if user is an active member of the staff team."""
    membership_path = (
        f"/orgs/{REPO_OWNER}/teams/{STAFF_TEAM_SLUG}/memberships/{username}"
    )
    try:
        membership = github_api_get(membership_path)
    except requests.HTTPError as error:
        # A 404 means "not a member" rather than a real API failure.
        if error.response.status_code == 404:
            return False
        raise
    return membership.get("state") == "active"
86
87
def fetch_issue(issue_number):
    """Fetch a minimal view of an issue: node id, author login, and type name."""
    raw = github_api_get(f"/repos/{REPO_OWNER}/{REPO_NAME}/issues/{issue_number}")
    # "user" and "type" can be null in the API payload, hence the or-{} guards.
    user = raw.get("user") or {}
    issue_type = raw.get("type") or {}
    return {
        "number": issue_number,
        "node_id": raw["node_id"],
        "author": user.get("login", ""),
        "type_name": issue_type.get("name"),
    }
96
97
def get_bot_comment_with_time(issue_number):
    """Get the bot's duplicate-detection comment and its timestamp from an issue.

    Returns {"body": str, "created_at": str} if found, else None.
    """
    path = f"/repos/{REPO_OWNER}/{REPO_NAME}/issues/{issue_number}/comments"
    page = 1
    while True:
        batch = github_api_get(path, {"per_page": 100, "page": page})
        if not batch:
            # Ran out of comment pages without finding the bot's comment.
            return None
        for entry in batch:
            login = (entry.get("user") or {}).get("login", "")
            text = entry.get("body", "")
            if login == BOT_LOGIN and text.startswith(BOT_COMMENT_PREFIX):
                return {"body": text, "created_at": entry.get("created_at", "")}
        page += 1
113
114
def parse_suggested_issues(comment_body):
    """Extract issue numbers from the bot's comment (lines like '- #12345')."""
    # Only markdown bullets at the start of a line count as suggestions.
    bullet = re.compile(r"^- #(\d+)", re.MULTILINE)
    return list(map(int, bullet.findall(comment_body)))
118
119
def github_api_graphql(query, variables=None, partial_errors_ok=False):
    """Execute a GitHub GraphQL query. Raises on errors unless partial_errors_ok is set."""
    payload = {"query": query, "variables": variables or {}}
    response = requests.post(GRAPHQL_URL, headers=GITHUB_HEADERS, json=payload)
    response.raise_for_status()
    body = response.json()
    errors = body.get("errors")
    if errors:
        # Batched queries can fail for a subset of nodes while still returning
        # data; callers opt into tolerating that via partial_errors_ok.
        if partial_errors_ok and "data" in body:
            print(f" GraphQL partial errors (ignored): {errors}")
        else:
            raise RuntimeError(f"GraphQL errors: {errors}")
    return body["data"]
134
135
def find_canonical_among(duplicate_number, candidates):
    """Check if any candidate issue has duplicate_number marked as a duplicate.

    The MarkedAsDuplicateEvent lives on the canonical issue's timeline, not the
    duplicate's. So to find which canonical issue our duplicate was closed against,
    we check each candidate's timeline for a MarkedAsDuplicateEvent whose
    `duplicate` field matches our issue.

    Returns the matching canonical issue number, or None.
    """
    if not candidates:
        return None

    # Batch all candidates into one query via aliased issue fields. The numbers
    # are interpolated directly: they are ints parsed from the bot's comment,
    # so there is no injection risk.
    issue_fields = "\n    ".join(
        f'issue_{number}: issue(number: {number}) {{'
        f' timelineItems(last: 50, itemTypes: [MARKED_AS_DUPLICATE_EVENT]) {{'
        f' nodes {{ ... on MarkedAsDuplicateEvent {{ duplicate {{ ... on Issue {{ number }} }} }} }} }} }}'
        for number in candidates
    )
    # Bug fix: the query previously declared a `$numbers: [Int!]!` variable
    # (and passed it in the variables dict) that was never referenced in the
    # query body. The GraphQL spec's "All Variables Used" validation rule makes
    # an unused variable definition an error, so the whole batched lookup was
    # rejected before returning any data. Declare only what we actually use.
    query = """
    query($owner: String!, $repo: String!) {
      repository(owner: $owner, name: $repo) {
        PLACEHOLDER
      }
    }
    """.replace("PLACEHOLDER", issue_fields)

    data = github_api_graphql(
        query,
        {"owner": REPO_OWNER, "repo": REPO_NAME},
        # Individual candidates may have been deleted/transferred; tolerate
        # per-alias failures and inspect whatever data came back.
        partial_errors_ok=True,
    )

    repo = data["repository"]
    for candidate in candidates:
        issue_data = repo.get(f"issue_{candidate}")
        if not issue_data:
            # This candidate's sub-query failed (partial error); skip it.
            continue
        for node in issue_data["timelineItems"]["nodes"]:
            dup_number = (node.get("duplicate") or {}).get("number")
            if dup_number == duplicate_number:
                return candidate
    return None
176
177
@functools.lru_cache
def get_project_config():
    """Fetch the project board's ID, field IDs, and option IDs.

    Cached for the process lifetime: the board layout does not change mid-run.
    """
    data = github_api_graphql(
        """
        query($org: String!, $number: Int!) {
          organization(login: $org) {
            projectV2(number: $number) {
              id
              fields(first: 30) {
                nodes {
                  ... on ProjectV2SingleSelectField { id name options { id name } }
                  ... on ProjectV2Field { id name }
                }
              }
            }
          }
        }
        """,
        {"org": REPO_OWNER, "number": PROJECT_NUMBER},
    )
    project = data["organization"]["projectV2"]

    fields = {}
    for node in project["fields"]["nodes"]:
        field_name = node.get("name")
        if not field_name:
            # Field types not matched by either fragment come back as empty nodes.
            continue
        entry = {"id": node["id"]}
        if "options" in node:
            # Single-select fields carry their option name -> option id map.
            entry["options"] = {opt["name"]: opt["id"] for opt in node["options"]}
        fields[field_name] = entry

    config = {"project_id": project["id"], "fields": fields}
    print(f" Project config loaded: {len(config['fields'])} fields")
    return config
215
216
def find_project_item(issue_node_id):
    """Check if an issue is already on our project board.

    Returns the project item ID if found, or None.
    """
    query = (
        "query($id: ID!) { node(id: $id) { ... on Issue { projectItems(first: 20) "
        "{ nodes { id project { number } } } } } }"
    )
    data = github_api_graphql(query, {"id": issue_node_id})
    items = data["node"]["projectItems"]["nodes"]
    # An issue can sit on several boards; only ours counts.
    matches = (item["id"] for item in items if item["project"]["number"] == PROJECT_NUMBER)
    return next(matches, None)
230
231
def add_project_item(issue_node_id):
    """Add an issue to the project board. Returns the new item ID."""
    mutation = """
    mutation($projectId: ID!, $contentId: ID!) {
      addProjectV2ItemById(input: {projectId: $projectId, contentId: $contentId}) {
        item { id }
      }
    }
    """
    variables = {
        "projectId": get_project_config()["project_id"],
        "contentId": issue_node_id,
    }
    result = github_api_graphql(mutation, variables)
    return result["addProjectV2ItemById"]["item"]["id"]
246
247
def set_field_value(item_id, field_name, value):
    """Set a single field value on a project board item.

    Silently warns (rather than raising) on unknown fields or options so one
    bad field name doesn't abort a classification run.
    """
    config = get_project_config()
    field = config["fields"].get(field_name)
    if field is None:
        print(f" Warning: field '{field_name}' not found on project board")
        return

    if "options" not in field:
        # text field
        field_value = {"text": str(value)}
    else:
        # single-select field: translate the human-readable option name to its ID
        option_id = field["options"].get(value)
        if not option_id:
            print(f" Warning: option '{value}' not found for field '{field_name}'")
            return
        field_value = {"singleSelectOptionId": option_id}

    mutation = """
    mutation($projectId: ID!, $itemId: ID!, $fieldId: ID!, $value: ProjectV2FieldValue!) {
      updateProjectV2ItemFieldValue(input: {
        projectId: $projectId
        itemId: $itemId
        fieldId: $fieldId
        value: $value
      }) {
        projectV2Item { id }
      }
    }
    """
    github_api_graphql(
        mutation,
        {
            "projectId": config["project_id"],
            "itemId": item_id,
            "fieldId": field["id"],
            "value": field_value,
        },
    )
287
288
def add_or_update_project_item(issue_node_id, outcome, closed_as=None, status="Auto-classified", notes=None, bot_comment_time=None):
    """Add an issue to the project board (or update it if already there), setting field values.

    Returns the project item ID.
    """
    existing = find_project_item(issue_node_id)
    if existing is None:
        item_id = add_project_item(issue_node_id)
        print(f" Added to project board (item {item_id})")
    else:
        item_id = existing
        print(f" Issue already on board, updating (item {item_id})")

    set_field_value(item_id, "Outcome", outcome)
    set_field_value(item_id, "Status", status)

    # Optional fields: only written when a meaningful value was supplied.
    if closed_as and closed_as in VALID_CLOSED_AS_VALUES:
        set_field_value(item_id, "Closed as", closed_as)
    if notes:
        set_field_value(item_id, "Notes", notes)
    if bot_comment_time:
        set_field_value(item_id, "Bot version", bot_version_for_time(bot_comment_time))

    return item_id
311
312
def classify_closed(issue_number, closer_login, state_reason):
    """Classify a closed issue and add/update it on the project board."""
    state_reason = state_reason or "unknown"
    print(f"Classifying closed issue #{issue_number}")
    print(f" Closer: {closer_login}, state_reason: {state_reason}")

    issue = fetch_issue(issue_number)
    author = issue["author"]
    print(f" Author: {author}, type: {issue['type_name']}")

    # Staff-authored issues are out of scope for effectiveness tracking.
    if is_staff_member(author):
        print(f" Skipping: author '{author}' is a staff member")
        return

    bot_comment = get_bot_comment_with_time(issue_number)
    print(f" Bot commented: {bot_comment is not None}")

    if bot_comment is None:
        if state_reason == "duplicate":
            classify_as_missed_opportunity(issue)
        else:
            print(" Skipping: no bot comment and not closed as duplicate")
        return

    if closer_login == author:
        classify_as_success(issue, bot_comment, state_reason)
    else:
        # Only authors, staff, and triagers can close issues, so
        # a non-author closer is always someone with elevated permissions.
        classify_non_author_closed(issue, bot_comment, state_reason)
343
344
def classify_as_success(issue, bot_comment, state_reason):
    """Author closed their own issue after the bot commented."""
    if state_reason == "duplicate":
        status, notes = "Auto-classified", None
        print(f" -> Success (closed as {state_reason})")
    else:
        # could be closed for an unrelated reason; flag for review
        status = "Needs review"
        notes = f"Author closed as {state_reason}"
        print(f" -> Possible Success, needs review ({notes})")

    add_or_update_project_item(
        issue["node_id"],
        outcome="Success",
        closed_as=state_reason,
        status=status,
        notes=notes,
        bot_comment_time=bot_comment["created_at"],
    )
367
368
def classify_non_author_closed(issue, bot_comment, state_reason):
    """Non-author (staff or triager) closed an issue the bot had commented on."""
    if state_reason == "duplicate":
        classify_as_assist(issue, bot_comment)
        return

    notes = f"Closed by staff/triager as {state_reason}, not duplicate"
    print(f" -> Possible Noise, needs review ({notes})")
    add_or_update_project_item(
        issue["node_id"],
        outcome="Noise",
        closed_as=state_reason,
        status="Needs review",
        notes=notes,
        bot_comment_time=bot_comment["created_at"],
    )
384
385
def classify_as_assist(issue, bot_comment):
    """Staff member closed as duplicate after the bot commented. Check if the dup matches."""
    suggested = parse_suggested_issues(bot_comment["body"])
    if not suggested:
        print(" -> Assist, needs review (could not parse bot suggestions)")
        add_or_update_project_item(
            issue["node_id"],
            outcome="Assist",
            closed_as="duplicate",
            status="Needs review",
            notes="Could not parse bot suggestions",
            bot_comment_time=bot_comment["created_at"],
        )
        return

    canonical = None
    try:
        canonical = find_canonical_among(issue["number"], suggested)
    except (requests.RequestException, RuntimeError) as error:
        # Best effort: fall through to "needs review" rather than crash the run.
        print(f" Warning: failed to query candidate timelines: {error}")

    if canonical:
        status, notes = "Auto-classified", None
        print(f" -> Assist (original #{canonical} matches bot suggestion)")
    else:
        status = "Needs review"
        suggested_str = ", ".join(f"#{number}" for number in suggested)
        notes = f"Bot suggested {suggested_str}; none matched as canonical"
        print(f" -> Possible Assist, needs review ({notes})")

    add_or_update_project_item(
        issue["node_id"],
        outcome="Assist",
        closed_as="duplicate",
        status=status,
        notes=notes,
        bot_comment_time=bot_comment["created_at"],
    )
416
417
def classify_as_missed_opportunity(issue):
    """Issue closed as duplicate but the bot never commented."""
    print(" -> Missed opportunity")
    add_or_update_project_item(
        issue["node_id"],
        outcome="Missed opportunity",
        closed_as="duplicate",
        status="Auto-classified",
    )
423
424
def classify_open():
    """Classify open, triaged, bot-commented issues as Noise."""
    print("Classifying open issues")

    query = (
        f"repo:{REPO_OWNER}/{REPO_NAME} is:issue is:open "
        f"commenter:app/{BOT_APP_SLUG} "
        f'-label:"{NEEDS_TRIAGE_LABEL}" '
        f"created:>={BOT_START_DATE}"
    )
    print(f" Search query: {query}")

    results = github_search_issues(query)
    print(f" Found {len(results)} candidate issues")

    added = skipped = errors = 0
    for entry in results:
        number = entry["number"]
        try:
            type_name = (entry.get("type") or {}).get("name")
            author = (entry.get("user") or {}).get("login", "")
            node_id = entry["node_id"]

            # Cheapest checks first; each later check hits the API.
            skip_reason = None
            bot_comment = None
            if type_name not in ("Bug", "Crash"):
                skip_reason = f"type is {type_name}"
            elif is_staff_member(author):
                skip_reason = f"author {author} is staff"
            elif find_project_item(node_id):
                skip_reason = "already on the board"
            else:
                bot_comment = get_bot_comment_with_time(number)
                if not bot_comment:
                    skip_reason = "no bot duplicate comment found"

            if skip_reason:
                print(f" #{number}: skipping, {skip_reason}")
                skipped += 1
                continue

            print(f" #{number}: adding as Noise")
            add_or_update_project_item(
                node_id,
                outcome="Noise",
                status="Auto-classified",
                bot_comment_time=bot_comment["created_at"],
            )
            added += 1
        except Exception as error:  # broad catch: one issue failing shouldn't stop the sweep
            print(f" #{number}: error processing issue, skipping: {error}")
            errors += 1

    print(f" Done: added {added}, skipped {skipped}, errors {errors}")
470
471
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Track duplicate bot effectiveness on a GitHub project board.",
    )
    subparsers = parser.add_subparsers(dest="command", required=True)

    closed_parser = subparsers.add_parser(
        "classify-closed",
        help="Classify a closed issue and add it to the project board.",
    )
    closed_parser.add_argument("issue_number", type=int)
    closed_parser.add_argument("closer_login")
    closed_parser.add_argument("state_reason")

    subparsers.add_parser(
        "classify-open",
        help="Classify open, triaged, bot-commented issues as Noise.",
    )

    args = parser.parse_args()

    # These module-level globals are read by the API helper functions above.
    GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "")
    if not GITHUB_TOKEN:
        print("Error: GITHUB_TOKEN environment variable is required")
        sys.exit(1)

    raw_project_number = os.environ.get("PROJECT_NUMBER", "")
    try:
        PROJECT_NUMBER = (
            int(raw_project_number) if raw_project_number else DEFAULT_PROJECT_NUMBER
        )
    except ValueError:
        print(f"Error: PROJECT_NUMBER must be an integer, got '{raw_project_number}'")
        sys.exit(1)

    GITHUB_HEADERS = {
        "Authorization": f"token {GITHUB_TOKEN}",
        "Accept": "application/vnd.github+json",
    }

    # argparse enforces required=True, so args.command is one of these two.
    if args.command == "classify-closed":
        classify_closed(args.issue_number, args.closer_login, args.state_reason)
    else:
        classify_open()