github-track-duplicate-bot-effectiveness.py

  1#!/usr/bin/env python3
  2"""
  3Track the effectiveness of the duplicate-detection bot by classifying issues
  4into outcome categories on a GitHub Projects v2 board.
  5
  6Subcommands:
  7    classify-closed <issue_number> <closer_login> <state_reason>
  8        Classify a closed issue and add it to the project board.
  9
 10    classify-open
 11        Classify open, triaged, bot-commented issues and add them to
 12        the project board as Noise.
 13
 14Requires:
 15    requests (pip install requests)
 16
 17Environment variables:
 18    GITHUB_TOKEN     - GitHub App token
 19    PROJECT_NUMBER   - GitHub Projects v2 board number (default: 76, override for local testing)
 20"""
 21
 22import argparse
 23import functools
 24import os
 25import re
 26import sys
 27from datetime import datetime, timezone
 28
 29import requests
 30
GITHUB_API = "https://api.github.com"  # REST API root
GRAPHQL_URL = "https://api.github.com/graphql"  # GraphQL endpoint (Projects v2 is GraphQL-only)
REPO_OWNER = "zed-industries"
REPO_NAME = "zed"
STAFF_TEAM_SLUG = "staff"  # org team whose members' issues are excluded from tracking
BOT_LOGIN = "zed-community-bot[bot]"  # comment author login as the REST API reports it
BOT_APP_SLUG = "zed-community-bot"  # app slug, used in search "commenter:app/..." qualifiers
BOT_COMMENT_PREFIX = "This issue appears to be a duplicate of"  # identifies the bot's dup comment
BOT_START_DATE = "2026-02-18"  # first bot deployment; issues created earlier are never candidates
NEEDS_TRIAGE_LABEL = "state:needs triage"  # issues still carrying this label are not yet triaged
DEFAULT_PROJECT_NUMBER = 76  # production board; override with PROJECT_NUMBER env var for testing
VALID_CLOSED_AS_VALUES = {"duplicate", "not_planned", "completed"}  # board's "Closed as" options
 43# Add a new tuple when you deploy a new version of the bot that you want to
 44# keep track of (e.g. the prompt gets a rewrite or the model gets swapped).
 45# Newest first, please. The datetime is for the deployment time (merge to maain).
 46BOT_VERSION_TIMELINE = [
 47    ("v2", datetime(2026, 2, 26, 14, 9, tzinfo=timezone.utc)),
 48    ("v1", datetime(2026, 2, 18, tzinfo=timezone.utc)),
 49]
 50
 51
 52def bot_version_for_time(date_string):
 53    """Return the bot version that was active at the given ISO 8601 timestamp."""
 54    timestamp = datetime.fromisoformat(date_string.replace("Z", "+00:00"))
 55    for version, deployed in BOT_VERSION_TIMELINE:
 56        if timestamp >= deployed:
 57            return version
 58    return BOT_VERSION_TIMELINE[-1][0]
 59
 60
 61def github_api_get(path, params=None):
 62    url = f"{GITHUB_API}/{path.lstrip('/')}"
 63    response = requests.get(url, headers=GITHUB_HEADERS, params=params)
 64    response.raise_for_status()
 65    return response.json()
 66
 67
 68def github_search_issues(query):
 69    """Search issues, returning most recently created first."""
 70    # not handling pagination on purpose: the oldest issues are on the board already
 71    params = {"q": query, "sort": "created", "order": "desc", "per_page": 100}
 72    return github_api_get("/search/issues", params).get("items", [])
 73
 74
 75def is_staff_member(username):
 76    """Check if user is an active member of the staff team."""
 77    try:
 78        data = github_api_get(
 79            f"/orgs/{REPO_OWNER}/teams/{STAFF_TEAM_SLUG}/memberships/{username}"
 80        )
 81        return data.get("state") == "active"
 82    except requests.HTTPError as error:
 83        if error.response.status_code == 404:
 84            return False
 85        raise
 86
 87
 88def fetch_issue(issue_number):
 89    data = github_api_get(f"/repos/{REPO_OWNER}/{REPO_NAME}/issues/{issue_number}")
 90    return {
 91        "number": issue_number,
 92        "node_id": data["node_id"],
 93        "author": (data.get("user") or {}).get("login", ""),
 94        "type_name": (data.get("type") or {}).get("name"),
 95    }
 96
 97
 98def get_bot_comment_with_time(issue_number):
 99    """Get the bot's duplicate-detection comment and its timestamp from an issue.
100
101    Returns {"body": str, "created_at": str} if found, else None.
102    """
103    comments_path = f"/repos/{REPO_OWNER}/{REPO_NAME}/issues/{issue_number}/comments"
104    page = 1
105    while comments := github_api_get(comments_path, {"per_page": 100, "page": page}):
106        for comment in comments:
107            author = (comment.get("user") or {}).get("login", "")
108            body = comment.get("body", "")
109            if author == BOT_LOGIN and body.startswith(BOT_COMMENT_PREFIX):
110                return {"body": body, "created_at": comment.get("created_at", "")}
111        page += 1
112    return None
113
114
def parse_suggested_issues(comment_body):
    """Extract issue numbers from the bot's comment (lines like '- #12345')."""
    bullet = re.compile(r"^- #(\d+)", re.MULTILINE)
    return [int(number) for number in bullet.findall(comment_body)]
118
119
def github_api_graphql(query, variables=None, partial_errors_ok=False, timeout=30):
    """Execute a GitHub GraphQL query. Raises on errors unless partial_errors_ok is set.

    Args:
        query: GraphQL document string.
        variables: Optional dict of GraphQL variables.
        partial_errors_ok: When True, tolerate responses carrying both
            "data" and "errors" (log the errors and return the data anyway).
        timeout: Seconds before the HTTP request is aborted. `requests` has
            no default timeout, so an explicit bound prevents indefinite hangs.

    Returns:
        The "data" portion of the GraphQL response.

    Raises:
        requests.HTTPError: on a non-2xx HTTP response.
        RuntimeError: on GraphQL-level errors (unless partial_errors_ok
            and partial data is present).
    """
    response = requests.post(
        GRAPHQL_URL,
        headers=GITHUB_HEADERS,
        json={"query": query, "variables": variables or {}},
        timeout=timeout,
    )
    response.raise_for_status()
    data = response.json()
    if "errors" in data:
        if not partial_errors_ok or "data" not in data:
            raise RuntimeError(f"GraphQL errors: {data['errors']}")
        print(f"  GraphQL partial errors (ignored): {data['errors']}")
    return data["data"]
134
135
def find_canonical_among(duplicate_number, candidates):
    """Check if any candidate issue has duplicate_number marked as a duplicate.

    The MarkedAsDuplicateEvent lives on the canonical issue's timeline, not the
    duplicate's. So to find which canonical issue our duplicate was closed against,
    we check each candidate's timeline for a MarkedAsDuplicateEvent whose
    `duplicate` field matches our issue.

    Returns the matching canonical issue number, or None.
    """
    if not candidates:
        return None

    # One aliased sub-query per candidate. The issue numbers are ints, so
    # interpolating them directly into the document is injection-safe. Note:
    # the query must NOT declare a variable it doesn't use (the previous
    # version declared `$numbers: [Int!]!`) -- the GraphQL "All Variables
    # Used" validation rule rejects such documents outright.
    sub_queries = "\n            ".join(
        f'issue_{number}: issue(number: {number}) {{'
        f' timelineItems(last: 50, itemTypes: [MARKED_AS_DUPLICATE_EVENT]) {{'
        f' nodes {{ ... on MarkedAsDuplicateEvent {{ duplicate {{ ... on Issue {{ number }} }} }} }} }} }}'
        for number in candidates
    )
    data = github_api_graphql(
        """
        query($owner: String!, $repo: String!) {
          repository(owner: $owner, name: $repo) {
            PLACEHOLDER
          }
        }
        """.replace("PLACEHOLDER", sub_queries),
        {"owner": REPO_OWNER, "repo": REPO_NAME},
        # a deleted/transferred candidate shouldn't sink the whole batch
        partial_errors_ok=True,
    )

    repo = data["repository"]
    for candidate in candidates:
        issue_data = repo.get(f"issue_{candidate}")
        if not issue_data:
            continue
        for node in issue_data["timelineItems"]["nodes"]:
            dup_number = (node.get("duplicate") or {}).get("number")
            if dup_number == duplicate_number:
                return candidate
    return None
176
177
@functools.lru_cache
def get_project_config():
    """Fetch the project board's ID, field IDs, and option IDs."""
    data = github_api_graphql(
        """
        query($org: String!, $number: Int!) {
          organization(login: $org) {
            projectV2(number: $number) {
              id
              fields(first: 30) {
                nodes {
                  ... on ProjectV2SingleSelectField { id name options { id name } }
                  ... on ProjectV2Field { id name }
                }
              }
            }
          }
        }
        """,
        {"org": REPO_OWNER, "number": PROJECT_NUMBER},
    )
    project = data["organization"]["projectV2"]

    # Index fields by display name; single-select fields additionally get a
    # name -> option-id map so callers can set options by label.
    fields = {}
    for node in project["fields"]["nodes"]:
        field_name = node.get("name")
        if not field_name:
            continue
        entry = {"id": node["id"]}
        if "options" in node:
            entry["options"] = {opt["name"]: opt["id"] for opt in node["options"]}
        fields[field_name] = entry

    config = {"project_id": project["id"], "fields": fields}
    print(f"  Project config loaded: {len(config['fields'])} fields")
    return config
215
216
def find_project_item(issue_node_id):
    """Check if an issue is already on our project board.

    Returns the project item ID if found, or None.
    """
    data = github_api_graphql(
        "query($id: ID!) { node(id: $id) { ... on Issue { projectItems(first: 20) { nodes { id project { number } } } } } }",
        {"id": issue_node_id},
    )
    # The issue may sit on several boards; pick the item on ours, if any.
    items = data["node"]["projectItems"]["nodes"]
    return next(
        (entry["id"] for entry in items if entry["project"]["number"] == PROJECT_NUMBER),
        None,
    )
230
231
def add_project_item(issue_node_id):
    """Add an issue to the project board. Returns the new item ID."""
    project_id = get_project_config()["project_id"]
    result = github_api_graphql(
        """
        mutation($projectId: ID!, $contentId: ID!) {
          addProjectV2ItemById(input: {projectId: $projectId, contentId: $contentId}) {
            item { id }
          }
        }
        """,
        {"projectId": project_id, "contentId": issue_node_id},
    )
    return result["addProjectV2ItemById"]["item"]["id"]
246
247
def set_field_value(item_id, field_name, value):
    """Set a single field value on a project board item."""
    config = get_project_config()
    field = config["fields"].get(field_name)
    if not field:
        print(f"  Warning: field '{field_name}' not found on project board")
        return

    if "options" not in field:
        # plain text field: GraphQL expects the value under a "text" key
        payload = {"text": str(value)}
    else:
        # single-select field: translate the human-readable option name to its ID
        option_id = field["options"].get(value)
        if not option_id:
            print(f"  Warning: option '{value}' not found for field '{field_name}'")
            return
        payload = {"singleSelectOptionId": option_id}

    github_api_graphql(
        """
        mutation($projectId: ID!, $itemId: ID!, $fieldId: ID!, $value: ProjectV2FieldValue!) {
          updateProjectV2ItemFieldValue(input: {
            projectId: $projectId
            itemId: $itemId
            fieldId: $fieldId
            value: $value
          }) {
            projectV2Item { id }
          }
        }
        """,
        {
            "projectId": config["project_id"],
            "itemId": item_id,
            "fieldId": field["id"],
            "value": payload,
        },
    )
287
288
def add_or_update_project_item(issue_node_id, outcome, closed_as=None, status="Auto-classified", notes=None, bot_comment_time=None):
    """Add an issue to the project board (or update it if already there), setting field values."""
    existing = find_project_item(issue_node_id)
    if existing is None:
        item_id = add_project_item(issue_node_id)
        print(f"  Added to project board (item {item_id})")
    else:
        item_id = existing
        print(f"  Issue already on board, updating (item {item_id})")

    # Outcome and Status are always written; the rest only when provided.
    set_field_value(item_id, "Outcome", outcome)
    set_field_value(item_id, "Status", status)
    if closed_as and closed_as in VALID_CLOSED_AS_VALUES:
        set_field_value(item_id, "Closed as", closed_as)
    if notes:
        set_field_value(item_id, "Notes", notes)
    if bot_comment_time:
        set_field_value(item_id, "Bot version", bot_version_for_time(bot_comment_time))

    return item_id
311
312
def classify_closed(issue_number, closer_login, state_reason):
    """Classify a closed issue and add/update it on the project board."""
    state_reason = state_reason or "unknown"
    print(f"Classifying closed issue #{issue_number}")
    print(f"  Closer: {closer_login}, state_reason: {state_reason}")

    issue = fetch_issue(issue_number)
    author = issue["author"]
    print(f"  Author: {author}, type: {issue['type_name']}")

    # Staff-filed issues are out of scope for effectiveness tracking.
    if is_staff_member(author):
        print(f"  Skipping: author '{author}' is a staff member")
        return

    bot_comment = get_bot_comment_with_time(issue_number)
    print(f"  Bot commented: {bot_comment is not None}")

    if bot_comment is not None:
        if closer_login == author:
            classify_as_success(issue, bot_comment, state_reason)
        else:
            # Only authors, staff, and triagers can close issues, so
            # a non-author closer is always someone with elevated permissions.
            classify_non_author_closed(issue, bot_comment, state_reason)
    elif state_reason == "duplicate":
        classify_as_missed_opportunity(issue)
    else:
        print("  Skipping: no bot comment and not closed as duplicate")
343
344
def classify_as_success(issue, bot_comment, state_reason):
    """Author closed their own issue after the bot commented."""
    if state_reason == "duplicate":
        status, notes = "Auto-classified", None
        print(f"  -> Success (closed as {state_reason})")
    else:
        # could be closed for an unrelated reason; flag for review
        status, notes = "Needs review", f"Author closed as {state_reason}"
        print(f"  -> Possible Success, needs review ({notes})")

    add_or_update_project_item(
        issue["node_id"],
        outcome="Success",
        closed_as=state_reason,
        status=status,
        notes=notes,
        bot_comment_time=bot_comment["created_at"],
    )
367
368
def classify_non_author_closed(issue, bot_comment, state_reason):
    """Non-author (staff or triager) closed an issue the bot had commented on."""
    if state_reason == "duplicate":
        classify_as_assist(issue, bot_comment)
        return

    # Closed for some other reason: the bot's comment likely wasn't useful.
    notes = f"Closed by staff/triager as {state_reason}, not duplicate"
    print(f"  -> Possible Noise, needs review ({notes})")
    add_or_update_project_item(
        issue["node_id"],
        outcome="Noise",
        closed_as=state_reason,
        status="Needs review",
        notes=notes,
        bot_comment_time=bot_comment["created_at"],
    )
384
385
def classify_as_assist(issue, bot_comment):
    """Staff member closed as duplicate after the bot commented. Check if the dup matches."""
    suggested = parse_suggested_issues(bot_comment["body"])
    if not suggested:
        print("  -> Assist, needs review (could not parse bot suggestions)")
        add_or_update_project_item(
            issue["node_id"],
            outcome="Assist",
            closed_as="duplicate",
            status="Needs review",
            notes="Could not parse bot suggestions",
            bot_comment_time=bot_comment["created_at"],
        )
        return

    # Best-effort: if the timeline lookup fails, fall through to "needs review".
    canonical = None
    try:
        canonical = find_canonical_among(issue["number"], suggested)
    except (requests.RequestException, RuntimeError) as error:
        print(f"  Warning: failed to query candidate timelines: {error}")

    if canonical:
        status, notes = "Auto-classified", None
        print(f"  -> Assist (original #{canonical} matches bot suggestion)")
    else:
        status = "Needs review"
        suggested_str = ", ".join(f"#{number}" for number in suggested)
        notes = f"Bot suggested {suggested_str}; none matched as canonical"
        print(f"  -> Possible Assist, needs review ({notes})")

    add_or_update_project_item(
        issue["node_id"],
        outcome="Assist",
        closed_as="duplicate",
        status=status,
        notes=notes,
        bot_comment_time=bot_comment["created_at"],
    )
416
417
def classify_as_missed_opportunity(issue):
    """Issue closed as duplicate but the bot never commented."""
    print("  -> Missed opportunity")
    add_or_update_project_item(
        issue["node_id"],
        outcome="Missed opportunity",
        closed_as="duplicate",
        status="Auto-classified",
    )
423
424
def classify_open():
    """Classify open, triaged, bot-commented issues as Noise."""
    print("Classifying open issues")

    query = (
        f"repo:{REPO_OWNER}/{REPO_NAME} is:issue is:open "
        f"commenter:app/{BOT_APP_SLUG} "
        f'-label:"{NEEDS_TRIAGE_LABEL}" '
        f"created:>={BOT_START_DATE}"
    )
    print(f"  Search query: {query}")

    candidates = github_search_issues(query)
    print(f"  Found {len(candidates)} candidate issues")

    added = skipped = errors = 0
    for candidate in candidates:
        number = candidate["number"]
        try:
            type_name = (candidate.get("type") or {}).get("name")
            author = (candidate.get("user") or {}).get("login", "")
            node_id = candidate["node_id"]

            # Cheapest check first; the later checks each cost API calls.
            bot_comment = None
            if type_name not in ("Bug", "Crash"):
                skip_reason = f"type is {type_name}"
            elif is_staff_member(author):
                skip_reason = f"author {author} is staff"
            elif find_project_item(node_id):
                skip_reason = "already on the board"
            else:
                bot_comment = get_bot_comment_with_time(number)
                skip_reason = None if bot_comment else "no bot duplicate comment found"

            if skip_reason:
                print(f"  #{number}: skipping, {skip_reason}")
                skipped += 1
                continue

            print(f"  #{number}: adding as Noise")
            add_or_update_project_item(
                node_id,
                outcome="Noise",
                status="Auto-classified",
                bot_comment_time=bot_comment["created_at"],
            )
            added += 1
        except Exception as error:  # broad catch: one issue failing shouldn't stop the sweep
            print(f"  #{number}: error processing issue, skipping: {error}")
            errors += 1

    print(f"  Done: added {added}, skipped {skipped}, errors {errors}")
470
471
if __name__ == "__main__":
    # CLI: one subcommand per classification sweep.
    parser = argparse.ArgumentParser(
        description="Track duplicate bot effectiveness on a GitHub project board.",
    )
    subparsers = parser.add_subparsers(dest="command", required=True)

    closed_cmd = subparsers.add_parser(
        "classify-closed",
        help="Classify a closed issue and add it to the project board.",
    )
    closed_cmd.add_argument("issue_number", type=int)
    closed_cmd.add_argument("closer_login")
    closed_cmd.add_argument("state_reason")

    subparsers.add_parser(
        "classify-open",
        help="Classify open, triaged, bot-commented issues as Noise.",
    )

    args = parser.parse_args()

    # Credentials and board selection come from the environment. These stay
    # module-level: the API helpers read GITHUB_HEADERS / PROJECT_NUMBER as globals.
    GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "")
    if not GITHUB_TOKEN:
        print("Error: GITHUB_TOKEN environment variable is required")
        sys.exit(1)

    PROJECT_NUMBER = DEFAULT_PROJECT_NUMBER
    raw_project_number = os.environ.get("PROJECT_NUMBER", "")
    if raw_project_number:
        try:
            PROJECT_NUMBER = int(raw_project_number)
        except ValueError:
            print(f"Error: PROJECT_NUMBER must be an integer, got '{raw_project_number}'")
            sys.exit(1)

    GITHUB_HEADERS = {
        "Authorization": f"token {GITHUB_TOKEN}",
        "Accept": "application/vnd.github+json",
    }

    if args.command == "classify-closed":
        classify_closed(args.issue_number, args.closer_login, args.state_reason)
    elif args.command == "classify-open":
        classify_open()