github-track-duplicate-bot-effectiveness.py

  1#!/usr/bin/env python3
  2"""
  3Track the effectiveness of the duplicate-detection bot by classifying issues
  4into outcome categories on a GitHub Projects v2 board.
  5
  6Subcommands:
  7    classify-closed <issue_number> <closer_login> <state_reason>
  8        Classify a closed issue and add it to the project board.
  9
 10    classify-open
 11        Classify open, triaged, bot-commented issues and add them to
 12        the project board as Noise.
 13
 14Requires:
 15    requests (pip install requests)
 16
 17Environment variables:
 18    GITHUB_TOKEN     - GitHub App token
 19    PROJECT_NUMBER   - GitHub Projects v2 board number (default: 76, override for local testing)
 20"""
 21
 22import argparse
 23import functools
 24import os
 25import re
 26import sys
 27from datetime import datetime, timezone
 28
 29import requests
 30
GITHUB_API = "https://api.github.com"
GRAPHQL_URL = "https://api.github.com/graphql"
REPO_OWNER = "zed-industries"
REPO_NAME = "zed"
STAFF_TEAM_SLUG = "staff"
# Login and app slug of the duplicate-detection bot whose comments we classify.
BOT_LOGIN = "zed-community-bot[bot]"
BOT_APP_SLUG = "zed-community-bot"
# Prefix used to recognize the bot's duplicate-suggestion comment.
BOT_COMMENT_PREFIX = "This issue appears to be a duplicate of"
# Date the bot first went live; open-issue searches are limited to this range.
BOT_START_DATE = "2026-02-18"
NEEDS_TRIAGE_LABEL = "state:needs triage"
DEFAULT_PROJECT_NUMBER = 76
# "Closed as" single-select options that exist on the project board.
VALID_CLOSED_AS_VALUES = {"duplicate", "not_planned", "completed"}
# Add a new tuple when you deploy a new version of the bot that you want to
# keep track of (e.g. the prompt gets a rewrite or the model gets swapped).
# Newest first, please. The datetime is for the deployment time (merge to main).
BOT_VERSION_TIMELINE = [
    ("v2", datetime(2026, 2, 26, 14, 9, tzinfo=timezone.utc)),
    ("v1", datetime(2026, 2, 18, tzinfo=timezone.utc)),
]
 50
 51
 52def bot_version_for_time(date_string):
 53    """Return the bot version that was active at the given ISO 8601 timestamp."""
 54    timestamp = datetime.fromisoformat(date_string.replace("Z", "+00:00"))
 55    for version, deployed in BOT_VERSION_TIMELINE:
 56        if timestamp >= deployed:
 57            return version
 58    return BOT_VERSION_TIMELINE[-1][0]
 59
 60
 61def github_api_get(path, params=None):
 62    url = f"{GITHUB_API}/{path.lstrip('/')}"
 63    response = requests.get(url, headers=GITHUB_HEADERS, params=params)
 64    response.raise_for_status()
 65    return response.json()
 66
 67
 68def github_search_issues(query):
 69    """Search issues, returning most recently created first."""
 70    # not handling pagination on purpose: the oldest issues are on the board already
 71    params = {"q": query, "sort": "created", "order": "desc", "per_page": 100}
 72    return github_api_get("/search/issues", params).get("items", [])
 73
 74
 75def is_staff_member(username):
 76    """Check if user is an active member of the staff team."""
 77    try:
 78        data = github_api_get(
 79            f"/orgs/{REPO_OWNER}/teams/{STAFF_TEAM_SLUG}/memberships/{username}"
 80        )
 81        return data.get("state") == "active"
 82    except requests.HTTPError as error:
 83        if error.response.status_code == 404:
 84            return False
 85        raise
 86
 87
 88def fetch_issue(issue_number):
 89    data = github_api_get(f"/repos/{REPO_OWNER}/{REPO_NAME}/issues/{issue_number}")
 90    return {
 91        "number": issue_number,
 92        "node_id": data["node_id"],
 93        "author": (data.get("user") or {}).get("login", ""),
 94        "type_name": (data.get("type") or {}).get("name"),
 95        "created_at": data.get("created_at", ""),
 96    }
 97
 98
 99def get_bot_comment_with_time(issue_number):
100    """Get the bot's duplicate-detection comment and its timestamp from an issue.
101
102    Returns {"body": str, "created_at": str} if found, else None.
103    """
104    comments_path = f"/repos/{REPO_OWNER}/{REPO_NAME}/issues/{issue_number}/comments"
105    page = 1
106    while comments := github_api_get(comments_path, {"per_page": 100, "page": page}):
107        for comment in comments:
108            author = (comment.get("user") or {}).get("login", "")
109            body = comment.get("body", "")
110            if author == BOT_LOGIN and body.startswith(BOT_COMMENT_PREFIX):
111                return {"body": body, "created_at": comment.get("created_at", "")}
112        page += 1
113    return None
114
115
def parse_suggested_issues(comment_body):
    """Extract issue numbers from the bot's comment (lines like '- #12345')."""
    bullet = re.compile(r"^- #(\d+)", re.MULTILINE)
    return [int(match.group(1)) for match in bullet.finditer(comment_body)]
119
120
def github_api_graphql(query, variables=None, partial_errors_ok=False):
    """Execute a GitHub GraphQL query. Raises on errors unless partial_errors_ok is set."""
    payload = {"query": query, "variables": variables or {}}
    response = requests.post(GRAPHQL_URL, headers=GITHUB_HEADERS, json=payload)
    response.raise_for_status()
    body = response.json()
    if "errors" in body:
        # With partial_errors_ok we tolerate errors as long as some data came
        # back; a response with errors and no data is always fatal.
        fatal = not partial_errors_ok or "data" not in body
        if fatal:
            raise RuntimeError(f"GraphQL errors: {body['errors']}")
        print(f"  GraphQL partial errors (ignored): {body['errors']}")
    return body["data"]
135
136
def find_canonical_among(duplicate_number, candidates):
    """Check if any candidate issue has duplicate_number marked as a duplicate.

    The MarkedAsDuplicateEvent lives on the canonical issue's timeline, not the
    duplicate's. So to find which canonical issue our duplicate was closed against,
    we check each candidate's timeline for a MarkedAsDuplicateEvent whose
    `duplicate` field matches our issue.

    Returns the matching canonical issue number, or None.
    """
    if not candidates:
        return None

    # Build one aliased sub-query per candidate issue. Aliases can't start
    # with a digit, hence the "issue_" prefix.
    fragments = "\n            ".join(
        f'issue_{number}: issue(number: {number}) {{'
        f' timelineItems(last: 50, itemTypes: [MARKED_AS_DUPLICATE_EVENT]) {{'
        f' nodes {{ ... on MarkedAsDuplicateEvent {{ duplicate {{ ... on Issue {{ number }} }} }} }} }} }}'
        for number in candidates
    )
    # The candidate numbers are inlined in the aliased sub-queries above, so
    # the operation must not declare a variable for them: GraphQL validation
    # ("All Variables Used") rejects operations that declare an unused
    # variable, and GitHub's endpoint enforces that rule.
    data = github_api_graphql(
        """
        query($owner: String!, $repo: String!) {
          repository(owner: $owner, name: $repo) {
            PLACEHOLDER
          }
        }
        """.replace("PLACEHOLDER", fragments),
        {"owner": REPO_OWNER, "repo": REPO_NAME},
        # Some candidates may be inaccessible (e.g. deleted); keep whatever
        # partial data comes back.
        partial_errors_ok=True,
    )

    repo = data["repository"]
    for candidate in candidates:
        issue_data = repo.get(f"issue_{candidate}")
        if not issue_data:
            continue
        for node in issue_data["timelineItems"]["nodes"]:
            dup_number = (node.get("duplicate") or {}).get("number")
            if dup_number == duplicate_number:
                return candidate
    return None
177
178
@functools.lru_cache
def get_project_config():
    """Fetch the project board's ID, field IDs, and option IDs.

    Cached for the lifetime of the process: the board layout is static
    during a single run.
    """
    data = github_api_graphql(
        """
        query($org: String!, $number: Int!) {
          organization(login: $org) {
            projectV2(number: $number) {
              id
              fields(first: 30) {
                nodes {
                  ... on ProjectV2SingleSelectField { id name options { id name } }
                  ... on ProjectV2Field { id name }
                }
              }
            }
          }
        }
        """,
        {"org": REPO_OWNER, "number": PROJECT_NUMBER},
    )
    project = data["organization"]["projectV2"]

    fields = {}
    for node in project["fields"]["nodes"]:
        field_name = node.get("name")
        if not field_name:
            continue
        entry = {"id": node["id"]}
        # Single-select fields carry an options list; map option name -> ID.
        if "options" in node:
            entry["options"] = {opt["name"]: opt["id"] for opt in node["options"]}
        fields[field_name] = entry

    config = {"project_id": project["id"], "fields": fields}
    print(f"  Project config loaded: {len(config['fields'])} fields")
    return config
216
217
def find_project_item(issue_node_id):
    """Check if an issue is already on our project board.

    Returns the project item ID if found, or None.
    """
    data = github_api_graphql(
        "query($id: ID!) { node(id: $id) { ... on Issue { projectItems(first: 20) { nodes { id project { number } } } } } }",
        {"id": issue_node_id},
    )
    items = data["node"]["projectItems"]["nodes"]
    # An issue can sit on several boards; only ours counts.
    matching = (item["id"] for item in items if item["project"]["number"] == PROJECT_NUMBER)
    return next(matching, None)
231
232
def add_project_item(issue_node_id):
    """Add an issue to the project board. Returns the new item ID."""
    mutation = """
        mutation($projectId: ID!, $contentId: ID!) {
          addProjectV2ItemById(input: {projectId: $projectId, contentId: $contentId}) {
            item { id }
          }
        }
        """
    variables = {
        "projectId": get_project_config()["project_id"],
        "contentId": issue_node_id,
    }
    result = github_api_graphql(mutation, variables)
    return result["addProjectV2ItemById"]["item"]["id"]
247
248
def set_field_value(item_id, field_name, value):
    """Set a single field value on a project board item."""
    config = get_project_config()
    field = config["fields"].get(field_name)
    if field is None:
        print(f"  Warning: field '{field_name}' not found on project board")
        return

    if "options" not in field:
        # text field
        field_value = {"text": str(value)}
    else:
        # single-select field: translate the human-readable option name to its ID
        option_id = field["options"].get(value)
        if not option_id:
            print(f"  Warning: option '{value}' not found for field '{field_name}'")
            return
        field_value = {"singleSelectOptionId": option_id}

    github_api_graphql(
        """
        mutation($projectId: ID!, $itemId: ID!, $fieldId: ID!, $value: ProjectV2FieldValue!) {
          updateProjectV2ItemFieldValue(input: {
            projectId: $projectId
            itemId: $itemId
            fieldId: $fieldId
            value: $value
          }) {
            projectV2Item { id }
          }
        }
        """,
        {
            "projectId": config["project_id"],
            "itemId": item_id,
            "fieldId": field["id"],
            "value": field_value,
        },
    )
288
289
def add_or_update_project_item(issue_node_id, outcome, closed_as=None, status="Auto-classified", notes=None, bot_comment_time=None):
    """Add an issue to the project board (or update it if already there), setting field values."""
    existing_item = find_project_item(issue_node_id)
    if existing_item:
        item_id = existing_item
        print(f"  Issue already on board, updating (item {item_id})")
    else:
        item_id = add_project_item(issue_node_id)
        print(f"  Added to project board (item {item_id})")

    # Outcome and Status are always written; the rest only when supplied.
    set_field_value(item_id, "Outcome", outcome)
    set_field_value(item_id, "Status", status)

    if closed_as and closed_as in VALID_CLOSED_AS_VALUES:
        set_field_value(item_id, "Closed as", closed_as)
    if notes:
        set_field_value(item_id, "Notes", notes)
    if bot_comment_time:
        set_field_value(item_id, "Bot version", bot_version_for_time(bot_comment_time))

    return item_id
312
313
def classify_closed(issue_number, closer_login, state_reason):
    """Classify a closed issue and add/update it on the project board."""
    state_reason = state_reason or "unknown"
    print(f"Classifying closed issue #{issue_number}")
    print(f"  Closer: {closer_login}, state_reason: {state_reason}")

    issue = fetch_issue(issue_number)
    author = issue["author"]
    print(f"  Author: {author}, type: {issue['type_name']}")

    # Staff-filed issues are out of scope for effectiveness tracking.
    if is_staff_member(author):
        print(f"  Skipping: author '{author}' is a staff member")
        return

    bot_comment = get_bot_comment_with_time(issue_number)
    print(f"  Bot commented: {bot_comment is not None}")

    if bot_comment is not None:
        if closer_login == author:
            classify_as_success(issue, bot_comment, state_reason)
        else:
            # Only authors, staff, and triagers can close issues, so
            # a non-author closer is always someone with elevated permissions.
            classify_non_author_closed(issue, bot_comment, state_reason)
    elif state_reason == "duplicate":
        classify_as_missed_opportunity(issue)
    else:
        print("  Skipping: no bot comment and not closed as duplicate")
344
345
def classify_as_success(issue, bot_comment, state_reason):
    """Author closed their own issue after the bot commented."""
    if state_reason == "duplicate":
        status = "Auto-classified"
        notes = None
        print(f"  -> Success (closed as {state_reason})")
    else:
        # could be closed for an unrelated reason; flag for review
        status = "Needs review"
        notes = f"Author closed as {state_reason}"
        print(f"  -> Possible Success, needs review ({notes})")

    add_or_update_project_item(
        issue["node_id"],
        outcome="Success",
        closed_as=state_reason,
        status=status,
        notes=notes,
        bot_comment_time=bot_comment["created_at"],
    )
368
369
def classify_non_author_closed(issue, bot_comment, state_reason):
    """Non-author (staff or triager) closed an issue the bot had commented on."""
    if state_reason == "duplicate":
        # Closed as duplicate: this is potentially an Assist.
        classify_as_assist(issue, bot_comment)
        return

    notes = f"Closed by staff/triager as {state_reason}, not duplicate"
    print(f"  -> Possible Noise, needs review ({notes})")
    add_or_update_project_item(
        issue["node_id"],
        outcome="Noise",
        closed_as=state_reason,
        status="Needs review",
        notes=notes,
        bot_comment_time=bot_comment["created_at"],
    )
385
386
def classify_as_assist(issue, bot_comment):
    """Staff member closed as duplicate after the bot commented. Check if the dup matches."""
    suggested = parse_suggested_issues(bot_comment["body"])
    comment_time = bot_comment["created_at"]

    if not suggested:
        print("  -> Assist, needs review (could not parse bot suggestions)")
        add_or_update_project_item(
            issue["node_id"],
            outcome="Assist",
            closed_as="duplicate",
            status="Needs review",
            notes="Could not parse bot suggestions",
            bot_comment_time=comment_time,
        )
        return

    # Best effort: if the timeline lookup fails we still record the Assist,
    # just flagged for a human to double-check.
    try:
        canonical = find_canonical_among(issue["number"], suggested)
    except (requests.RequestException, RuntimeError) as error:
        print(f"  Warning: failed to query candidate timelines: {error}")
        canonical = None

    if canonical:
        status = "Auto-classified"
        notes = None
        print(f"  -> Assist (original #{canonical} matches bot suggestion)")
    else:
        status = "Needs review"
        listed = ", ".join(f"#{number}" for number in suggested)
        notes = f"Bot suggested {listed}; none matched as canonical"
        print(f"  -> Possible Assist, needs review ({notes})")

    add_or_update_project_item(
        issue["node_id"],
        outcome="Assist",
        closed_as="duplicate",
        status=status,
        notes=notes,
        bot_comment_time=comment_time,
    )
417
418
def classify_as_missed_opportunity(issue):
    """Issue closed as duplicate but the bot never commented."""
    print("  -> Missed opportunity")
    add_or_update_project_item(
        issue["node_id"],
        outcome="Missed opportunity",
        closed_as="duplicate",
        status="Auto-classified",
        # No bot comment exists, so approximate the bot version from the
        # issue's creation time instead.
        bot_comment_time=issue["created_at"],
    )
425
426
def classify_open():
    """Classify open, triaged, bot-commented issues as Noise."""
    print("Classifying open issues")

    query = (
        f"repo:{REPO_OWNER}/{REPO_NAME} is:issue is:open "
        f"commenter:app/{BOT_APP_SLUG} "
        f'-label:"{NEEDS_TRIAGE_LABEL}" '
        f"created:>={BOT_START_DATE}"
    )
    print(f"  Search query: {query}")

    results = github_search_issues(query)
    print(f"  Found {len(results)} candidate issues")

    added = skipped = errors = 0
    for item in results:
        number = item["number"]
        try:
            type_name = (item.get("type") or {}).get("name")
            author = (item.get("user") or {}).get("login", "")
            node_id = item["node_id"]

            # Work through the skip conditions cheapest-first; the last two
            # involve extra API calls.
            bot_comment = None
            if type_name not in ("Bug", "Crash"):
                skip_reason = f"type is {type_name}"
            elif is_staff_member(author):
                skip_reason = f"author {author} is staff"
            elif find_project_item(node_id):
                skip_reason = "already on the board"
            elif not (bot_comment := get_bot_comment_with_time(number)):
                skip_reason = "no bot duplicate comment found"
            else:
                skip_reason = None

            if skip_reason:
                print(f"  #{number}: skipping, {skip_reason}")
                skipped += 1
                continue

            print(f"  #{number}: adding as Noise")
            add_or_update_project_item(
                node_id,
                outcome="Noise",
                status="Auto-classified",
                bot_comment_time=bot_comment["created_at"],
            )
            added += 1
        except Exception as error:  # broad catch: one issue failing shouldn't stop the sweep
            print(f"  #{number}: error processing issue, skipping: {error}")
            errors += 1

    print(f"  Done: added {added}, skipped {skipped}, errors {errors}")
472
473
if __name__ == "__main__":
    # CLI entry point: parse the subcommand, then populate the module-level
    # globals (GITHUB_TOKEN, PROJECT_NUMBER, GITHUB_HEADERS) that the API
    # helpers above read at call time.
    parser = argparse.ArgumentParser(
        description="Track duplicate bot effectiveness on a GitHub project board.",
    )
    subparsers = parser.add_subparsers(dest="command", required=True)

    classify_parser = subparsers.add_parser(
        "classify-closed",
        help="Classify a closed issue and add it to the project board.",
    )
    classify_parser.add_argument("issue_number", type=int)
    classify_parser.add_argument("closer_login")
    classify_parser.add_argument("state_reason")

    subparsers.add_parser(
        "classify-open",
        help="Classify open, triaged, bot-commented issues as Noise.",
    )

    args = parser.parse_args()

    GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "")
    if not GITHUB_TOKEN:
        print("Error: GITHUB_TOKEN environment variable is required")
        sys.exit(1)

    # PROJECT_NUMBER may be overridden via the environment for local testing
    # (see the module docstring); otherwise fall back to the production board.
    raw_project_number = os.environ.get("PROJECT_NUMBER", "")
    if raw_project_number:
        try:
            PROJECT_NUMBER = int(raw_project_number)
        except ValueError:
            print(f"Error: PROJECT_NUMBER must be an integer, got '{raw_project_number}'")
            sys.exit(1)
    else:
        PROJECT_NUMBER = DEFAULT_PROJECT_NUMBER

    GITHUB_HEADERS = {
        "Authorization": f"token {GITHUB_TOKEN}",
        "Accept": "application/vnd.github+json",
    }

    if args.command == "classify-closed":
        classify_closed(args.issue_number, args.closer_login, args.state_reason)
    elif args.command == "classify-open":
        classify_open()