github-track-duplicate-bot-effectiveness.py

  1#!/usr/bin/env python3
  2"""
  3Track the effectiveness of the duplicate-detection bot by classifying issues
  4into outcome categories on a GitHub Projects v2 board.
  5
  6Subcommands:
  7    classify-closed <issue_number> <closer_login> <state_reason>
  8        Classify a closed issue and add it to the project board.
  9
 10    classify-open
 11        Classify open, triaged, bot-commented issues and add them to
 12        the project board as Noise.
 13
 14Requires:
 15    requests (pip install requests)
 16
 17Environment variables:
 18    GITHUB_TOKEN     - GitHub App token
 19    PROJECT_NUMBER   - GitHub Projects v2 board number (default: 76, override for local testing)
 20"""
 21
 22import argparse
 23import functools
 24import os
 25import re
 26import sys
 27from datetime import datetime, timezone
 28
 29import requests
 30
# GitHub endpoints: REST base URL and the single GraphQL endpoint.
GITHUB_API = "https://api.github.com"
GRAPHQL_URL = "https://api.github.com/graphql"
# Repository whose issues are classified.
REPO_OWNER = "zed-industries"
REPO_NAME = "zed"
# Org team whose members are excluded as issue authors.
STAFF_TEAM_SLUG = "staff"
# The bot's comment author login (REST) and app slug (search qualifier).
BOT_LOGIN = "zed-community-bot[bot]"
BOT_APP_SLUG = "zed-community-bot"
# A comment is recognized as the bot's duplicate suggestion by this prefix.
BOT_COMMENT_PREFIX = "This issue appears to be a duplicate of"
# Only issues created on/after this date are considered in classify-open.
BOT_START_DATE = "2026-02-18"
# Issues still carrying this label are excluded from the open-issue sweep.
NEEDS_TRIAGE_LABEL = "state:needs triage"
# Projects v2 board number, overridable via the PROJECT_NUMBER env var.
DEFAULT_PROJECT_NUMBER = 76
# The only values accepted for the board's "Closed as" single-select field.
VALID_CLOSED_AS_VALUES = {"duplicate", "not_planned", "completed"}
 43# Add a new tuple when you deploy a new version of the bot that you want to
 44# keep track of (e.g. the prompt gets a rewrite or the model gets swapped).
 45# Newest first, please. The datetime is for the deployment time (merge to maain).
 46BOT_VERSION_TIMELINE = [
 47    ("v2", datetime(2026, 2, 26, 14, 9, tzinfo=timezone.utc)),
 48    ("v1", datetime(2026, 2, 18, tzinfo=timezone.utc)),
 49]
 50
 51
 52def bot_version_for_time(date_string):
 53    """Return the bot version that was active at the given ISO 8601 timestamp."""
 54    timestamp = datetime.fromisoformat(date_string.replace("Z", "+00:00"))
 55    for version, deployed in BOT_VERSION_TIMELINE:
 56        if timestamp >= deployed:
 57            return version
 58    return BOT_VERSION_TIMELINE[-1][0]
 59
 60
 61def github_api_get(path, params=None):
 62    url = f"{GITHUB_API}/{path.lstrip('/')}"
 63    response = requests.get(url, headers=GITHUB_HEADERS, params=params)
 64    response.raise_for_status()
 65    return response.json()
 66
 67
 68def github_search_issues(query):
 69    """Search issues, returning most recently created first."""
 70    # not handling pagination on purpose: the oldest issues are on the board already
 71    params = {"q": query, "sort": "created", "order": "desc", "per_page": 100}
 72    return github_api_get("/search/issues", params).get("items", [])
 73
 74
 75def is_staff_member(username):
 76    """Check if user is an active member of the staff team."""
 77    try:
 78        data = github_api_get(
 79            f"/orgs/{REPO_OWNER}/teams/{STAFF_TEAM_SLUG}/memberships/{username}"
 80        )
 81        return data.get("state") == "active"
 82    except requests.HTTPError as error:
 83        if error.response.status_code == 404:
 84            return False
 85        raise
 86
 87
 88def fetch_issue(issue_number):
 89    data = github_api_get(f"/repos/{REPO_OWNER}/{REPO_NAME}/issues/{issue_number}")
 90    return {
 91        "number": issue_number,
 92        "node_id": data["node_id"],
 93        "author": (data.get("user") or {}).get("login", ""),
 94        "type_name": (data.get("type") or {}).get("name"),
 95    }
 96
 97
 98def get_bot_comment_with_time(issue_number):
 99    """Get the bot's duplicate-detection comment and its timestamp from an issue.
100
101    Returns {"body": str, "created_at": str} if found, else None.
102    """
103    comments_path = f"/repos/{REPO_OWNER}/{REPO_NAME}/issues/{issue_number}/comments"
104    page = 1
105    while comments := github_api_get(comments_path, {"per_page": 100, "page": page}):
106        for comment in comments:
107            author = (comment.get("user") or {}).get("login", "")
108            body = comment.get("body", "")
109            if author == BOT_LOGIN and body.startswith(BOT_COMMENT_PREFIX):
110                return {"body": body, "created_at": comment.get("created_at", "")}
111        page += 1
112    return None
113
114
def parse_suggested_issues(comment_body):
    """Extract issue numbers from the bot's comment (lines like '- #12345')."""
    pattern = re.compile(r"^- #(\d+)", re.MULTILINE)
    return [int(number) for number in pattern.findall(comment_body)]
118
119
def github_api_graphql(query, variables=None):
    """Execute a GitHub GraphQL query. Raises on errors.

    Raises requests.HTTPError for transport failures and RuntimeError when
    the GraphQL response carries an "errors" payload.
    """
    payload = {"query": query, "variables": variables or {}}
    response = requests.post(GRAPHQL_URL, headers=GITHUB_HEADERS, json=payload)
    response.raise_for_status()
    body = response.json()
    if "errors" in body:
        raise RuntimeError(f"GraphQL errors: {body['errors']}")
    return body["data"]
132
133
def get_closed_as_duplicate_of(issue_number):
    """Get the issue number this issue was closed as a duplicate of.

    Uses the timeline to find the most recent MarkedAsDuplicateEvent.
    Returns the original issue number, or None.

    Note: not all "closed as duplicate" issues have a MarkedAsDuplicateEvent.
    If the closer used the "Close as duplicate" button without separately
    marking the duplicate relationship, no event is created and this returns
    None. The caller handles this by flagging the item for manual review.
    """
    data = github_api_graphql(
        """
        query($owner: String!, $repo: String!, $number: Int!) {
          repository(owner: $owner, name: $repo) {
            issue(number: $number) {
              timelineItems(last: 10, itemTypes: [MARKED_AS_DUPLICATE_EVENT]) {
                nodes {
                  ... on MarkedAsDuplicateEvent {
                    canonical { ... on Issue { number } }
                  }
                }
              }
            }
          }
        }
        """,
        {"owner": REPO_OWNER, "repo": REPO_NAME, "number": issue_number},
    )
    events = data["repository"]["issue"]["timelineItems"]["nodes"]
    # Walk newest-to-oldest so the latest marking wins.
    for event in reversed(events):
        canonical = event.get("canonical") or {}
        number = canonical.get("number")
        if number:
            return number
    return None
168
169
@functools.lru_cache
def get_project_config():
    """Fetch the project board's ID, field IDs, and option IDs.

    Cached: the board layout doesn't change within a run, so the GraphQL
    round-trip happens at most once per process.
    """
    data = github_api_graphql(
        """
        query($org: String!, $number: Int!) {
          organization(login: $org) {
            projectV2(number: $number) {
              id
              fields(first: 30) {
                nodes {
                  ... on ProjectV2SingleSelectField { id name options { id name } }
                  ... on ProjectV2Field { id name }
                }
              }
            }
          }
        }
        """,
        {"org": REPO_OWNER, "number": PROJECT_NUMBER},
    )
    project = data["organization"]["projectV2"]

    fields = {}
    for node in project["fields"]["nodes"]:
        field_name = node.get("name")
        if not field_name:
            continue
        entry = {"id": node["id"]}
        # Single-select fields carry an option list; map label -> option ID.
        if "options" in node:
            entry["options"] = {option["name"]: option["id"] for option in node["options"]}
        fields[field_name] = entry

    config = {"project_id": project["id"], "fields": fields}
    print(f"  Project config loaded: {len(config['fields'])} fields")
    return config
207
208
def find_project_item(issue_node_id):
    """Check if an issue is already on our project board.

    Returns the project item ID if found, or None.
    """
    data = github_api_graphql(
        "query($id: ID!) { node(id: $id) { ... on Issue { projectItems(first: 20) { nodes { id project { number } } } } } }",
        {"id": issue_node_id},
    )
    items = data["node"]["projectItems"]["nodes"]
    # An issue can sit on several boards; pick the item on ours, if any.
    matching = (item["id"] for item in items if item["project"]["number"] == PROJECT_NUMBER)
    return next(matching, None)
222
223
def add_project_item(issue_node_id):
    """Add an issue to the project board. Returns the new item ID."""
    board = get_project_config()
    mutation = """
        mutation($projectId: ID!, $contentId: ID!) {
          addProjectV2ItemById(input: {projectId: $projectId, contentId: $contentId}) {
            item { id }
          }
        }
        """
    result = github_api_graphql(
        mutation,
        {"projectId": board["project_id"], "contentId": issue_node_id},
    )
    return result["addProjectV2ItemById"]["item"]["id"]
238
239
def set_field_value(item_id, field_name, value):
    """Set a single field value on a project board item.

    Unknown fields or options are warned about and skipped rather than
    raising, so one bad field name doesn't abort a classification.
    """
    config = get_project_config()
    field = config["fields"].get(field_name)
    if field is None:
        print(f"  Warning: field '{field_name}' not found on project board")
        return

    if "options" not in field:
        # Plain text field: stringify whatever we were given.
        field_value = {"text": str(value)}
    else:
        # Single-select field: translate the option label into its ID.
        option_id = field["options"].get(value)
        if not option_id:
            print(f"  Warning: option '{value}' not found for field '{field_name}'")
            return
        field_value = {"singleSelectOptionId": option_id}

    github_api_graphql(
        """
        mutation($projectId: ID!, $itemId: ID!, $fieldId: ID!, $value: ProjectV2FieldValue!) {
          updateProjectV2ItemFieldValue(input: {
            projectId: $projectId
            itemId: $itemId
            fieldId: $fieldId
            value: $value
          }) {
            projectV2Item { id }
          }
        }
        """,
        {
            "projectId": config["project_id"],
            "itemId": item_id,
            "fieldId": field["id"],
            "value": field_value,
        },
    )
279
280
def add_or_update_project_item(issue_node_id, outcome, closed_as=None, status="Auto-classified", notes=None, bot_comment_time=None):
    """Add an issue to the project board (or update it if already there), setting field values.

    Returns the project item ID.
    """
    existing_item = find_project_item(issue_node_id)
    if existing_item:
        item_id = existing_item
        print(f"  Issue already on board, updating (item {item_id})")
    else:
        item_id = add_project_item(issue_node_id)
        print(f"  Added to project board (item {item_id})")

    set_field_value(item_id, "Outcome", outcome)
    set_field_value(item_id, "Status", status)

    # "Closed as" is a single-select; only write values the board accepts.
    if closed_as and closed_as in VALID_CLOSED_AS_VALUES:
        set_field_value(item_id, "Closed as", closed_as)
    if notes:
        set_field_value(item_id, "Notes", notes)
    if bot_comment_time:
        set_field_value(item_id, "Bot version", bot_version_for_time(bot_comment_time))

    return item_id
303
304
def classify_closed(issue_number, closer_login, state_reason):
    """Classify a closed issue and add/update it on the project board."""
    state_reason = state_reason or "unknown"
    print(f"Classifying closed issue #{issue_number}")
    print(f"  Closer: {closer_login}, state_reason: {state_reason}")

    issue = fetch_issue(issue_number)
    author = issue["author"]
    print(f"  Author: {author}, type: {issue['type_name']}")

    # Staff-authored issues are out of scope for bot effectiveness tracking.
    if is_staff_member(author):
        print(f"  Skipping: author '{author}' is a staff member")
        return

    bot_comment = get_bot_comment_with_time(issue_number)
    print(f"  Bot commented: {bot_comment is not None}")

    if bot_comment is not None:
        if closer_login == author:
            classify_as_success(issue, bot_comment, state_reason)
        else:
            # Only authors, staff, and triagers can close issues, so
            # a non-author closer is always someone with elevated permissions.
            classify_non_author_closed(issue, bot_comment, state_reason)
    elif state_reason == "duplicate":
        classify_as_missed_opportunity(issue)
    else:
        print("  Skipping: no bot comment and not closed as duplicate")
335
336
def classify_as_success(issue, bot_comment, state_reason):
    """Author closed their own issue after the bot commented."""
    if state_reason == "duplicate":
        status, notes = "Auto-classified", None
        print(f"  -> Success (closed as {state_reason})")
    else:
        # Could have been closed for an unrelated reason; flag for review.
        status = "Needs review"
        notes = f"Author closed as {state_reason}"
        print(f"  -> Possible Success, needs review ({notes})")

    add_or_update_project_item(
        issue["node_id"],
        outcome="Success",
        closed_as=state_reason,
        status=status,
        notes=notes,
        bot_comment_time=bot_comment["created_at"],
    )
359
360
def classify_non_author_closed(issue, bot_comment, state_reason):
    """Non-author (staff or triager) closed an issue the bot had commented on."""
    if state_reason == "duplicate":
        classify_as_assist(issue, bot_comment)
        return

    # Closed for some other reason: the bot's comment likely didn't help.
    notes = f"Closed by staff/triager as {state_reason}, not duplicate"
    print(f"  -> Possible Noise, needs review ({notes})")
    add_or_update_project_item(
        issue["node_id"],
        outcome="Noise",
        closed_as=state_reason,
        status="Needs review",
        notes=notes,
        bot_comment_time=bot_comment["created_at"],
    )
376
377
def classify_as_assist(issue, bot_comment):
    """Staff member closed as duplicate after the bot commented. Check if the dup matches.

    Compares the original issue recorded on the timeline against the issue
    numbers the bot suggested in its comment. A direct match auto-classifies
    as an Assist; anything ambiguous is flagged for manual review.
    """
    suggested = parse_suggested_issues(bot_comment["body"])
    original = None
    try:
        original = get_closed_as_duplicate_of(issue["number"])
    except (requests.RequestException, RuntimeError) as error:
        # Best-effort: a timeline lookup failure just downgrades to review below.
        # (Message fixed: was "the original-for the duplicate issue".)
        print(f"  Warning: failed to get the original for the duplicate issue: {error}")

    if original and suggested:
        if original in suggested:
            status = "Auto-classified"
            notes = None
            print(f"  -> Assist (original #{original} matches bot suggestion)")
        else:
            status = "Needs review"
            suggested_str = ", ".join(f"#{number}" for number in suggested)
            notes = f"Bot suggested {suggested_str}; closed as dup of #{original}"
            print(f"  -> Possible Assist, needs review ({notes})")
    else:
        # Couldn't determine the original, or no suggestions were parsed.
        status = "Needs review"
        if not original:
            notes = "Could not determine original issue from timeline"
        else:
            notes = f"Closed as dup of #{original}; could not parse bot suggestions"
        print(f"  -> Possible Assist, needs review ({notes})")

    add_or_update_project_item(
        issue["node_id"],
        outcome="Assist",
        closed_as="duplicate",
        status=status,
        notes=notes,
        bot_comment_time=bot_comment["created_at"],
    )
409
410
def classify_as_missed_opportunity(issue):
    """Issue closed as duplicate but the bot never commented."""
    print("  -> Missed opportunity")
    # No bot comment exists, so there is no bot version to record.
    add_or_update_project_item(
        issue["node_id"],
        outcome="Missed opportunity",
        closed_as="duplicate",
        status="Auto-classified",
    )
416
417
def classify_open():
    """Classify open, triaged, bot-commented issues as Noise."""
    print("Classifying open issues")

    query = (
        f"repo:{REPO_OWNER}/{REPO_NAME} is:issue is:open "
        f"commenter:app/{BOT_APP_SLUG} "
        f'-label:"{NEEDS_TRIAGE_LABEL}" '
        f"created:>={BOT_START_DATE}"
    )
    print(f"  Search query: {query}")

    results = github_search_issues(query)
    print(f"  Found {len(results)} candidate issues")

    added = skipped = errors = 0
    for item in results:
        number = item["number"]
        try:
            type_name = (item.get("type") or {}).get("name")
            author = (item.get("user") or {}).get("login", "")
            node_id = item["node_id"]

            # Cheapest checks first; each later check costs an API call.
            bot_comment = None
            if type_name not in ("Bug", "Crash"):
                skip_reason = f"type is {type_name}"
            elif is_staff_member(author):
                skip_reason = f"author {author} is staff"
            elif find_project_item(node_id):
                skip_reason = "already on the board"
            else:
                bot_comment = get_bot_comment_with_time(number)
                skip_reason = None if bot_comment else "no bot duplicate comment found"

            if skip_reason:
                print(f"  #{number}: skipping, {skip_reason}")
                skipped += 1
                continue

            print(f"  #{number}: adding as Noise")
            add_or_update_project_item(
                node_id,
                outcome="Noise",
                status="Auto-classified",
                bot_comment_time=bot_comment["created_at"],
            )
            added += 1
        except Exception as error:  # broad catch: one issue failing shouldn't stop the sweep
            print(f"  #{number}: error processing issue, skipping: {error}")
            errors += 1

    print(f"  Done: added {added}, skipped {skipped}, errors {errors}")
463
464
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Track duplicate bot effectiveness on a GitHub project board.",
    )
    subparsers = parser.add_subparsers(dest="command", required=True)

    closed_parser = subparsers.add_parser(
        "classify-closed",
        help="Classify a closed issue and add it to the project board.",
    )
    closed_parser.add_argument("issue_number", type=int)
    closed_parser.add_argument("closer_login")
    closed_parser.add_argument("state_reason")

    subparsers.add_parser(
        "classify-open",
        help="Classify open, triaged, bot-commented issues as Noise.",
    )

    args = parser.parse_args()

    # These module-level names are read by the API helpers above.
    GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "")
    if not GITHUB_TOKEN:
        print("Error: GITHUB_TOKEN environment variable is required")
        sys.exit(1)

    raw_project_number = os.environ.get("PROJECT_NUMBER", "")
    if not raw_project_number:
        PROJECT_NUMBER = DEFAULT_PROJECT_NUMBER
    else:
        try:
            PROJECT_NUMBER = int(raw_project_number)
        except ValueError:
            print(f"Error: PROJECT_NUMBER must be an integer, got '{raw_project_number}'")
            sys.exit(1)

    GITHUB_HEADERS = {
        "Authorization": f"token {GITHUB_TOKEN}",
        "Accept": "application/vnd.github+json",
    }

    if args.command == "classify-closed":
        classify_closed(args.issue_number, args.closer_login, args.state_reason)
    elif args.command == "classify-open":
        classify_open()