1#!/usr/bin/env python3
2"""
3Track the effectiveness of the duplicate-detection bot by classifying issues
4into outcome categories on a GitHub Projects v2 board.
5
6Subcommands:
7 classify-closed <issue_number> <closer_login> <state_reason>
8 Classify a closed issue and add it to the project board.
9
10 classify-open
11 Classify open, triaged, bot-commented issues and add them to
12 the project board as Noise.
13
14Requires:
15 requests (pip install requests)
16
17Environment variables:
18 GITHUB_TOKEN - GitHub App token
19 PROJECT_NUMBER - GitHub Projects v2 board number (default: 76, override for local testing)
20"""
21
22import argparse
23import functools
24import os
25import re
26import sys
27
28import requests
29
# Base URLs for the GitHub REST and GraphQL APIs.
GITHUB_API = "https://api.github.com"
GRAPHQL_URL = "https://api.github.com/graphql"
# Repository whose issues are classified. The owner is also used as the
# organization login for the staff-team lookup and the project-board query.
REPO_OWNER = "zed-industries"
REPO_NAME = "zed"
# Team whose active members are treated as staff.
STAFF_TEAM_SLUG = "staff"
# Login that authors the bot's comments, and the app slug used in the
# `commenter:app/...` search qualifier.
BOT_LOGIN = "zed-community-bot[bot]"
BOT_APP_SLUG = "zed-community-bot"
# A bot comment counts as a duplicate-detection comment when its body starts
# with this text.
BOT_COMMENT_PREFIX = "This issue appears to be a duplicate of"
# The classify-open search only considers issues created on or after this date.
BOT_START_DATE = "2026-02-18"
# Issues carrying this label are excluded from the classify-open search.
NEEDS_TRIAGE_LABEL = "state:needs triage"
# Board number used when the PROJECT_NUMBER environment variable is not set.
DEFAULT_PROJECT_NUMBER = 76
# Close reasons that are written to the board's "Closed as" field; any other
# value leaves that field untouched.
VALID_CLOSED_AS_VALUES = {"duplicate", "not_planned", "completed"}
# Bump this when the duplicate-detection bot's behavior changes in a way that
# could affect outcome rates (e.g. prompt rewrites, model swaps, candidate
# filtering changes). Don't bump for unrelated changes like comment formatting.
BOT_VERSION = "v2"
46
47
def github_api_get(path, params=None, timeout=30):
    """Send a GET request to the GitHub REST API and return the decoded JSON.

    Args:
        path: API path relative to the REST base URL; a leading slash is optional.
        params: Optional query-string parameters.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests waits indefinitely on a stalled connection.

    Raises:
        requests.HTTPError: If the response status is 4xx/5xx.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    url = f"{GITHUB_API}/{path.lstrip('/')}"
    response = requests.get(
        url,
        headers=GITHUB_HEADERS,
        params=params,
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()
53
54
def github_search_issues(query):
    """Run an issue search and return the first page of hits, newest first.

    Only a single page (at most 100 items) is requested. Pagination is
    skipped on purpose: the oldest issues are on the board already.
    """
    search_params = {
        "q": query,
        "sort": "created",
        "order": "desc",
        "per_page": 100,
    }
    payload = github_api_get("/search/issues", search_params)
    return payload.get("items", [])
60
61
def is_staff_member(username):
    """Check if user is an active member of the staff team.

    Args:
        username: GitHub login to look up. An empty or missing login (callers
            fall back to "" when an issue has no user object) is never staff.

    Returns:
        True when the team membership exists and its state is "active";
        False when the login is empty or the membership lookup returns 404.

    Raises:
        requests.HTTPError: For any HTTP error other than 404.
    """
    if not username:
        # Avoid building a malformed ".../memberships/" URL and spending an
        # API call on a login that cannot exist.
        return False

    try:
        data = github_api_get(
            f"/orgs/{REPO_OWNER}/teams/{STAFF_TEAM_SLUG}/memberships/{username}"
        )
    except requests.HTTPError as error:
        # 404 means "no such membership"; anything else is a real failure.
        if error.response.status_code == 404:
            return False
        raise
    return data.get("state") == "active"
73
74
def fetch_issue(issue_number):
    """Fetch an issue over REST and keep only the fields the classifier needs.

    Returns a dict with ``number``, ``node_id``, ``author`` (the login, or ""
    when the issue has no user object) and ``type_name`` (None when the issue
    has no type).
    """
    payload = github_api_get(f"/repos/{REPO_OWNER}/{REPO_NAME}/issues/{issue_number}")
    node_id = payload["node_id"]
    user = payload.get("user") or {}
    issue_type = payload.get("type") or {}
    return dict(
        number=issue_number,
        node_id=node_id,
        author=user.get("login", ""),
        type_name=issue_type.get("name"),
    )
83
84
def get_bot_duplicate_comment(issue_number):
    """Get the bot's duplicate-detection comment body from an issue.

    Walks the issue's comments page by page and returns the body of the first
    comment authored by the bot that starts with the duplicate-detection
    prefix.

    Returns the comment body if found, else None.
    """
    comments_path = f"/repos/{REPO_OWNER}/{REPO_NAME}/issues/{issue_number}/comments"
    per_page = 100
    page = 1
    while True:
        comments = github_api_get(comments_path, {"per_page": per_page, "page": page})
        for comment in comments:
            author = (comment.get("user") or {}).get("login", "")
            # `or ""` also covers an explicit null body, which `.get(key, "")`
            # would pass through as None and crash `.startswith`.
            body = comment.get("body") or ""
            if author == BOT_LOGIN and body.startswith(BOT_COMMENT_PREFIX):
                return body
        if len(comments) < per_page:
            # A short (or empty) page is the last one; skip the extra request.
            return None
        page += 1
100
101
def parse_suggested_issues(comment_body):
    """Return the issue numbers listed in the bot's comment, in order of appearance.

    Only lines that begin with ``- #<digits>`` (e.g. '- #12345') are counted.
    """
    bullet_pattern = re.compile(r"^- #(\d+)", re.MULTILINE)
    return [int(found.group(1)) for found in bullet_pattern.finditer(comment_body)]
105
106
def github_api_graphql(query, variables=None, timeout=30):
    """Execute a GitHub GraphQL query and return its ``data`` payload.

    Args:
        query: GraphQL query or mutation document.
        variables: Optional mapping of GraphQL variables; sent as {} when omitted.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests waits indefinitely on a stalled connection.

    Raises:
        requests.HTTPError: If the HTTP status is 4xx/5xx.
        requests.Timeout: If the server does not respond within ``timeout``.
        RuntimeError: If the response body contains an "errors" entry.
    """
    response = requests.post(
        GRAPHQL_URL,
        headers=GITHUB_HEADERS,
        json={"query": query, "variables": variables or {}},
        timeout=timeout,
    )
    response.raise_for_status()
    data = response.json()
    # An "errors" entry in the body is treated as a failure regardless of the
    # HTTP status.
    if "errors" in data:
        raise RuntimeError(f"GraphQL errors: {data['errors']}")
    return data["data"]
119
120
def get_closed_as_duplicate_of(issue_number):
    """Return the number of the issue this one was marked as a duplicate of.

    Queries the last 10 MarkedAsDuplicateEvent timeline items and scans them
    from the end of the list backwards, returning the first canonical issue
    number found, or None when there is none.

    Note: not all "closed as duplicate" issues have a MarkedAsDuplicateEvent.
    If the closer used the "Close as duplicate" button without separately
    marking the duplicate relationship, no event is created and this returns
    None. The caller handles this by flagging the item for manual review.
    """
    timeline_query = """
        query($owner: String!, $repo: String!, $number: Int!) {
          repository(owner: $owner, name: $repo) {
            issue(number: $number) {
              timelineItems(last: 10, itemTypes: [MARKED_AS_DUPLICATE_EVENT]) {
                nodes {
                  ... on MarkedAsDuplicateEvent {
                    canonical { ... on Issue { number } }
                  }
                }
              }
            }
          }
        }
        """
    variables = {"owner": REPO_OWNER, "repo": REPO_NAME, "number": issue_number}
    result = github_api_graphql(timeline_query, variables)
    events = result["repository"]["issue"]["timelineItems"]["nodes"]

    # Walk the events back to front.
    for event in events[::-1]:
        canonical = event.get("canonical") or {}
        canonical_number = canonical.get("number")
        if canonical_number:
            return canonical_number
    return None
155
156
@functools.lru_cache
def get_project_config():
    """Load the project board's ID together with its field and option IDs.

    The result is cached, so the board is queried at most once per process.

    Returns a dict shaped like
    ``{"project_id": ..., "fields": {name: {"id": ..., "options": {name: id}}}}``;
    the ``options`` entry is present only for fields that report options.
    Only the first 30 fields of the board are requested.
    """
    board_query = """
        query($org: String!, $number: Int!) {
          organization(login: $org) {
            projectV2(number: $number) {
              id
              fields(first: 30) {
                nodes {
                  ... on ProjectV2SingleSelectField { id name options { id name } }
                  ... on ProjectV2Field { id name }
                }
              }
            }
          }
        }
        """
    result = github_api_graphql(board_query, {"org": REPO_OWNER, "number": PROJECT_NUMBER})
    board = result["organization"]["projectV2"]

    fields_by_name = {}
    config = {"project_id": board["id"], "fields": fields_by_name}
    for node in board["fields"]["nodes"]:
        field_name = node.get("name")
        if field_name:
            entry = {"id": node["id"]}
            if "options" in node:
                entry["options"] = {choice["name"]: choice["id"] for choice in node["options"]}
            fields_by_name[field_name] = entry

    print(f" Project config loaded: {len(config['fields'])} fields")
    return config
194
195
def find_project_item(issue_node_id):
    """Look up the issue's item on our project board.

    Inspects the issue's first 20 project items and returns the ID of the one
    whose project number equals PROJECT_NUMBER, or None if there is no match.
    """
    lookup_query = "query($id: ID!) { node(id: $id) { ... on Issue { projectItems(first: 20) { nodes { id project { number } } } } } }"
    result = github_api_graphql(lookup_query, {"id": issue_node_id})
    board_items = result["node"]["projectItems"]["nodes"]
    return next(
        (entry["id"] for entry in board_items if entry["project"]["number"] == PROJECT_NUMBER),
        None,
    )
209
210
def add_project_item(issue_node_id):
    """Put the issue on the project board and return the ID of the created item."""
    project_id = get_project_config()["project_id"]
    add_mutation = """
        mutation($projectId: ID!, $contentId: ID!) {
          addProjectV2ItemById(input: {projectId: $projectId, contentId: $contentId}) {
            item { id }
          }
        }
        """
    variables = {"projectId": project_id, "contentId": issue_node_id}
    result = github_api_graphql(add_mutation, variables)
    return result["addProjectV2ItemById"]["item"]["id"]
225
226
def set_field_value(item_id, field_name, value):
    """Write one field of a project board item.

    Fields that carry options are treated as single-select and ``value`` is
    resolved to an option ID by name; every other field is written as text
    using ``str(value)``. An unknown field or option only prints a warning and
    leaves the item untouched.
    """
    board = get_project_config()
    target = board["fields"].get(field_name)
    if not target:
        print(f" Warning: field '{field_name}' not found on project board")
        return

    if "options" not in target:
        # text field
        payload = {"text": str(value)}
    else:
        # single-select field
        chosen_option = target["options"].get(value)
        if not chosen_option:
            print(f" Warning: option '{value}' not found for field '{field_name}'")
            return
        payload = {"singleSelectOptionId": chosen_option}

    update_mutation = """
        mutation($projectId: ID!, $itemId: ID!, $fieldId: ID!, $value: ProjectV2FieldValue!) {
          updateProjectV2ItemFieldValue(input: {
            projectId: $projectId
            itemId: $itemId
            fieldId: $fieldId
            value: $value
          }) {
            projectV2Item { id }
          }
        }
        """
    variables = {
        "projectId": board["project_id"],
        "itemId": item_id,
        "fieldId": target["id"],
        "value": payload,
    }
    github_api_graphql(update_mutation, variables)
266
267
def add_or_update_project_item(issue_node_id, outcome, closed_as=None, status="Auto-classified", notes=None):
    """Ensure the issue is on the project board and write its classification fields.

    Reuses the existing board item when there is one, otherwise adds the
    issue. "Outcome", "Status" and "Bot version" are always written;
    "Closed as" only for a recognized close reason and "Notes" only when
    notes are given. Returns the board item ID.
    """
    item_id = find_project_item(issue_node_id)
    if not item_id:
        item_id = add_project_item(issue_node_id)
        print(f" Added to project board (item {item_id})")
    else:
        print(f" Issue already on board, updating (item {item_id})")

    # Collect the writes first; they are applied in this order.
    field_updates = [("Outcome", outcome), ("Status", status)]
    if closed_as and closed_as in VALID_CLOSED_AS_VALUES:
        field_updates.append(("Closed as", closed_as))
    if notes:
        field_updates.append(("Notes", notes))
    field_updates.append(("Bot version", BOT_VERSION))

    for field_name, field_value in field_updates:
        set_field_value(item_id, field_name, field_value)

    return item_id
289
290
def classify_closed(issue_number, closer_login, state_reason):
    """Work out the outcome for a closed issue and record it on the project board.

    Issues authored by staff are skipped. Otherwise the outcome depends on
    whether the bot left a duplicate-detection comment, who closed the issue,
    and the close reason (a missing reason is treated as "unknown").
    """
    state_reason = state_reason or "unknown"
    print(f"Classifying closed issue #{issue_number}")
    print(f" Closer: {closer_login}, state_reason: {state_reason}")

    issue = fetch_issue(issue_number)
    author = issue["author"]
    print(f" Author: {author}, type: {issue['type_name']}")

    if is_staff_member(author):
        print(f" Skipping: author '{author}' is a staff member")
        return

    bot_comment = get_bot_duplicate_comment(issue_number)
    bot_commented = bot_comment is not None
    print(f" Bot commented: {bot_commented}")

    if not bot_commented:
        if state_reason == "duplicate":
            classify_as_missed_opportunity(issue)
        else:
            print(" Skipping: no bot comment and not closed as duplicate")
        return

    if closer_login == author:
        classify_as_success(issue, state_reason)
    else:
        # Only authors, staff, and triagers can close issues, so
        # a non-author closer is always someone with elevated permissions.
        classify_non_author_closed(issue, bot_comment, state_reason)
321
322
def classify_as_success(issue, state_reason):
    """Record a Success: the author closed their own issue after the bot commented.

    A close reason other than "duplicate" is still recorded as Success but is
    flagged "Needs review" with a note.
    """
    if state_reason == "duplicate":
        status, notes = "Auto-classified", None
        print(f" -> Success (closed as {state_reason})")
    else:
        # could be closed for an unrelated reason; flag for review
        status = "Needs review"
        notes = f"Author closed as {state_reason}"
        print(f" -> Possible Success, needs review ({notes})")

    add_or_update_project_item(
        issue["node_id"],
        outcome="Success",
        closed_as=state_reason,
        status=status,
        notes=notes,
    )
344
345
def classify_non_author_closed(issue, bot_comment, state_reason):
    """Handle an issue the bot commented on that was closed by someone other than its author.

    A "duplicate" close goes through the Assist check; any other close reason
    is recorded as Noise and flagged for review.
    """
    if state_reason == "duplicate":
        classify_as_assist(issue, bot_comment)
        return

    notes = f"Closed by staff/triager as {state_reason}, not duplicate"
    print(f" -> Possible Noise, needs review ({notes})")
    add_or_update_project_item(
        issue["node_id"],
        outcome="Noise",
        closed_as=state_reason,
        status="Needs review",
        notes=notes,
    )
360
361
def classify_as_assist(issue, bot_comment):
    """Record an Assist: a non-author closed the issue as duplicate after the bot commented.

    The item is auto-classified only when the issue it was marked a duplicate
    of is one of the bot's suggestions; a mismatch, an unknown original, or
    unparseable suggestions are flagged "Needs review" with a note.
    """
    suggested = parse_suggested_issues(bot_comment)
    try:
        original = get_closed_as_duplicate_of(issue["number"])
    except (requests.RequestException, RuntimeError) as error:
        # Best effort: a failed lookup falls through to "Needs review".
        original = None
        print(f" Warning: failed to get the original-for the duplicate issue: {error}")

    status = "Needs review"
    if not original:
        notes = "Could not determine original issue from timeline"
    elif not suggested:
        notes = f"Closed as dup of #{original}; could not parse bot suggestions"
    elif original in suggested:
        status = "Auto-classified"
        notes = None
    else:
        suggested_str = ", ".join(f"#{number}" for number in suggested)
        notes = f"Bot suggested {suggested_str}; closed as dup of #{original}"

    if status == "Auto-classified":
        print(f" -> Assist (original #{original} matches bot suggestion)")
    else:
        print(f" -> Possible Assist, needs review ({notes})")

    add_or_update_project_item(
        issue["node_id"],
        outcome="Assist",
        closed_as="duplicate",
        status=status,
        notes=notes,
    )
392
393
def classify_as_missed_opportunity(issue):
    """Record a Missed opportunity: closed as duplicate without any bot comment."""
    print(" -> Missed opportunity")
    add_or_update_project_item(
        issue["node_id"],
        outcome="Missed opportunity",
        closed_as="duplicate",
        status="Auto-classified",
    )
399
400
def classify_open():
    """Sweep open, triaged, bot-commented issues and add them to the board as Noise.

    Candidates come from one search page. Each is skipped when it is not a
    Bug/Crash, its author is staff, it is already on the board, or no bot
    duplicate comment is found. These checks run in that order and stop at
    the first hit. A failure on one issue is counted and logged without
    aborting the sweep.
    """
    print("Classifying open issues")

    query = (
        f"repo:{REPO_OWNER}/{REPO_NAME} is:issue is:open "
        f"commenter:app/{BOT_APP_SLUG} "
        f'-label:"{NEEDS_TRIAGE_LABEL}" '
        f"created:>={BOT_START_DATE}"
    )
    print(f" Search query: {query}")

    candidates = github_search_issues(query)
    print(f" Found {len(candidates)} candidate issues")

    added = skipped = errors = 0
    for candidate in candidates:
        number = candidate["number"]
        try:
            type_name = (candidate.get("type") or {}).get("name")
            author = (candidate.get("user") or {}).get("login", "")
            node_id = candidate["node_id"]

            # Cheapest check first; each later check costs an API call.
            if type_name not in ("Bug", "Crash"):
                skip_reason = f"type is {type_name}"
            elif is_staff_member(author):
                skip_reason = f"author {author} is staff"
            elif find_project_item(node_id):
                skip_reason = "already on the board"
            elif not get_bot_duplicate_comment(number):
                skip_reason = "no bot duplicate comment found"
            else:
                skip_reason = None

            if skip_reason:
                print(f" #{number}: skipping, {skip_reason}")
                skipped += 1
                continue

            print(f" #{number}: adding as Noise")
            add_or_update_project_item(node_id, outcome="Noise", status="Auto-classified")
            added += 1
        except Exception as error:  # broad catch: one issue failing shouldn't stop the sweep
            print(f" #{number}: error processing issue, skipping: {error}")
            errors += 1

    print(f" Done: added {added}, skipped {skipped}, errors {errors}")
444
445
if __name__ == "__main__":
    # Command-line interface: one subcommand per classification mode.
    parser = argparse.ArgumentParser(
        description="Track duplicate bot effectiveness on a GitHub project board.",
    )
    subparsers = parser.add_subparsers(dest="command", required=True)

    closed_parser = subparsers.add_parser(
        "classify-closed",
        help="Classify a closed issue and add it to the project board.",
    )
    closed_parser.add_argument("issue_number", type=int)
    closed_parser.add_argument("closer_login")
    closed_parser.add_argument("state_reason")

    subparsers.add_parser(
        "classify-open",
        help="Classify open, triaged, bot-commented issues as Noise.",
    )

    args = parser.parse_args()

    # The names below are module-level globals read by the API helpers, so
    # they must be assigned here rather than inside a function.
    GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "")
    if not GITHUB_TOKEN:
        print("Error: GITHUB_TOKEN environment variable is required")
        sys.exit(1)

    raw_project_number = os.environ.get("PROJECT_NUMBER", "")
    try:
        # An unset or empty variable falls back to the default board.
        PROJECT_NUMBER = int(raw_project_number) if raw_project_number else DEFAULT_PROJECT_NUMBER
    except ValueError:
        print(f"Error: PROJECT_NUMBER must be an integer, got '{raw_project_number}'")
        sys.exit(1)

    GITHUB_HEADERS = {
        "Authorization": f"token {GITHUB_TOKEN}",
        "Accept": "application/vnd.github+json",
    }

    if args.command == "classify-open":
        classify_open()
    elif args.command == "classify-closed":
        classify_closed(args.issue_number, args.closer_login, args.state_reason)