github-track-duplicate-bot-effectiveness.py

  1#!/usr/bin/env python3
  2"""
  3Track the effectiveness of the duplicate-detection bot by classifying issues
  4into outcome categories on a GitHub Projects v2 board.
  5
  6Subcommands:
  7    classify-closed <issue_number> <closer_login> <state_reason>
  8        Classify a closed issue and add it to the project board.
  9
 10    classify-open
 11        Classify open, triaged, bot-commented issues and add them to
 12        the project board as Noise.
 13
 14Requires:
 15    requests (pip install requests)
 16
 17Environment variables:
 18    GITHUB_TOKEN     - GitHub App token
 19    PROJECT_NUMBER   - GitHub Projects v2 board number (default: 76, override for local testing)
 20"""
 21
 22import argparse
 23import functools
 24import os
 25import re
 26import sys
 27
 28import requests
 29
 30GITHUB_API = "https://api.github.com"
 31GRAPHQL_URL = "https://api.github.com/graphql"
 32REPO_OWNER = "zed-industries"
 33REPO_NAME = "zed"
 34STAFF_TEAM_SLUG = "staff"
 35BOT_LOGIN = "zed-community-bot[bot]"
 36BOT_APP_SLUG = "zed-community-bot"
 37BOT_COMMENT_PREFIX = "This issue appears to be a duplicate of"
 38BOT_START_DATE = "2026-02-18"
 39NEEDS_TRIAGE_LABEL = "state:needs triage"
 40DEFAULT_PROJECT_NUMBER = 76
 41VALID_CLOSED_AS_VALUES = {"duplicate", "not_planned", "completed"}
 42# Bump this when the duplicate-detection bot's behavior changes in a way that
 43# could affect outcome rates (e.g. prompt rewrites, model swaps, candidate
 44# filtering changes). Don't bump for unrelated changes like comment formatting.
 45BOT_VERSION = "v2"
 46
 47
 48def github_api_get(path, params=None):
 49    url = f"{GITHUB_API}/{path.lstrip('/')}"
 50    response = requests.get(url, headers=GITHUB_HEADERS, params=params)
 51    response.raise_for_status()
 52    return response.json()
 53
 54
 55def github_search_issues(query):
 56    """Search issues, returning most recently created first."""
 57    # not handling pagination on purpose: the oldest issues are on the board already
 58    params = {"q": query, "sort": "created", "order": "desc", "per_page": 100}
 59    return github_api_get("/search/issues", params).get("items", [])
 60
 61
 62def is_staff_member(username):
 63    """Check if user is an active member of the staff team."""
 64    try:
 65        data = github_api_get(
 66            f"/orgs/{REPO_OWNER}/teams/{STAFF_TEAM_SLUG}/memberships/{username}"
 67        )
 68        return data.get("state") == "active"
 69    except requests.HTTPError as error:
 70        if error.response.status_code == 404:
 71            return False
 72        raise
 73
 74
 75def fetch_issue(issue_number):
 76    data = github_api_get(f"/repos/{REPO_OWNER}/{REPO_NAME}/issues/{issue_number}")
 77    return {
 78        "number": issue_number,
 79        "node_id": data["node_id"],
 80        "author": (data.get("user") or {}).get("login", ""),
 81        "type_name": (data.get("type") or {}).get("name"),
 82    }
 83
 84
 85def get_bot_duplicate_comment(issue_number):
 86    """Get the bot's duplicate-detection comment body from an issue.
 87
 88    Returns the comment body if found, else None.
 89    """
 90    comments_path = f"/repos/{REPO_OWNER}/{REPO_NAME}/issues/{issue_number}/comments"
 91    page = 1
 92    while comments := github_api_get(comments_path, {"per_page": 100, "page": page}):
 93        for comment in comments:
 94            author = (comment.get("user") or {}).get("login", "")
 95            body = comment.get("body", "")
 96            if author == BOT_LOGIN and body.startswith(BOT_COMMENT_PREFIX):
 97                return body
 98        page += 1
 99    return None
100
101
def parse_suggested_issues(comment_body):
    """Return the issue numbers listed in the bot's comment, in order.

    Only lines that begin with '- #<digits>' (e.g. '- #12345') are counted.
    """
    bullet_pattern = re.compile(r"^- #(\d+)", re.MULTILINE)
    numbers = []
    for found in bullet_pattern.finditer(comment_body):
        numbers.append(int(found.group(1)))
    return numbers
105
106
def github_api_graphql(query, variables=None, timeout=30):
    """Execute a GitHub GraphQL query or mutation.

    Args:
        query: GraphQL document to send.
        variables: Optional mapping of GraphQL variables; sent as {} if omitted.
        timeout: Seconds to wait for the server before giving up. Without an
            explicit timeout, requests waits indefinitely on a stalled
            connection, which would hang the whole run.

    Returns:
        The "data" member of the GraphQL response.

    Raises:
        requests.HTTPError: If the HTTP response has a 4xx/5xx status.
        requests.Timeout: If the server does not respond within `timeout`.
        RuntimeError: If the response body contains an "errors" member.
    """
    response = requests.post(
        GRAPHQL_URL,
        headers=GITHUB_HEADERS,
        json={"query": query, "variables": variables or {}},
        timeout=timeout,
    )
    response.raise_for_status()
    payload = response.json()
    # GraphQL reports query-level failures in the body, not via HTTP status.
    if "errors" in payload:
        raise RuntimeError(f"GraphQL errors: {payload['errors']}")
    return payload["data"]
119
120
def get_closed_as_duplicate_of(issue_number):
    """Return the number of the issue this one was marked a duplicate of.

    Queries the last 10 MarkedAsDuplicateEvent timeline items and returns the
    canonical issue number of the most recent one that has it, or None.

    Note: not all "closed as duplicate" issues have a MarkedAsDuplicateEvent.
    If the closer used the "Close as duplicate" button without separately
    marking the duplicate relationship, no event is created and this returns
    None. The caller handles this by flagging the item for manual review.
    """
    variables = {"owner": REPO_OWNER, "repo": REPO_NAME, "number": issue_number}
    result = github_api_graphql(
        """
        query($owner: String!, $repo: String!, $number: Int!) {
          repository(owner: $owner, name: $repo) {
            issue(number: $number) {
              timelineItems(last: 10, itemTypes: [MARKED_AS_DUPLICATE_EVENT]) {
                nodes {
                  ... on MarkedAsDuplicateEvent {
                    canonical { ... on Issue { number } }
                  }
                }
              }
            }
          }
        }
        """,
        variables,
    )
    events = result["repository"]["issue"]["timelineItems"]["nodes"]

    # Walk the events in reverse so the latest usable one wins; "canonical"
    # may be null or lack a number, which yields None and is skipped.
    canonical_numbers = (
        (event.get("canonical") or {}).get("number") for event in reversed(events)
    )
    return next((number for number in canonical_numbers if number), None)
155
156
@functools.lru_cache
def get_project_config():
    """Load the project board's ID plus the IDs of its fields and options.

    Returns a dict of the form
    {"project_id": ..., "fields": {name: {"id": ..., "options": {name: id}}}},
    where "options" is present only for fields that returned options.
    The result is cached, so the board is queried at most once per run.
    """
    response = github_api_graphql(
        """
        query($org: String!, $number: Int!) {
          organization(login: $org) {
            projectV2(number: $number) {
              id
              fields(first: 30) {
                nodes {
                  ... on ProjectV2SingleSelectField { id name options { id name } }
                  ... on ProjectV2Field { id name }
                }
              }
            }
          }
        }
        """,
        {"org": REPO_OWNER, "number": PROJECT_NUMBER},
    )
    board = response["organization"]["projectV2"]

    fields = {}
    for node in board["fields"]["nodes"]:
        field_name = node.get("name")
        if not field_name:
            # Nodes without a name (presumably field types that neither
            # fragment in the query selects) carry nothing usable.
            continue
        entry = {"id": node["id"]}
        if "options" in node:
            entry["options"] = {opt["name"]: opt["id"] for opt in node["options"]}
        fields[field_name] = entry

    config = {"project_id": board["id"], "fields": fields}
    print(f"  Project config loaded: {len(fields)} fields")
    return config
194
195
def find_project_item(issue_node_id):
    """Look up the issue's item on our project board.

    Inspects up to 20 of the issue's project items and returns the ID of the
    one whose project number equals PROJECT_NUMBER, or None if there is none.
    """
    lookup_query = "query($id: ID!) { node(id: $id) { ... on Issue { projectItems(first: 20) { nodes { id project { number } } } } } }"
    response = github_api_graphql(lookup_query, {"id": issue_node_id})
    board_items = response["node"]["projectItems"]["nodes"]
    return next(
        (
            entry["id"]
            for entry in board_items
            if entry["project"]["number"] == PROJECT_NUMBER
        ),
        None,
    )
209
210
def add_project_item(issue_node_id):
    """Create a project board item for the issue and return the item's ID."""
    project_id = get_project_config()["project_id"]
    variables = {"projectId": project_id, "contentId": issue_node_id}
    result = github_api_graphql(
        """
        mutation($projectId: ID!, $contentId: ID!) {
          addProjectV2ItemById(input: {projectId: $projectId, contentId: $contentId}) {
            item { id }
          }
        }
        """,
        variables,
    )
    new_item = result["addProjectV2ItemById"]["item"]
    return new_item["id"]
225
226
def set_field_value(item_id, field_name, value):
    """Write one field value onto a project board item.

    Fields that have options are treated as single-select and `value` must be
    an option name; all other fields are written as text (`str(value)`).
    Prints a warning and does nothing if the field or the option is unknown.
    """
    board = get_project_config()
    target = board["fields"].get(field_name)
    if not target:
        print(f"  Warning: field '{field_name}' not found on project board")
        return

    if "options" not in target:
        # No options recorded for this field: send the value as plain text.
        payload = {"text": str(value)}
    else:
        # Single-select field: translate the option name into its ID.
        chosen_option = target["options"].get(value)
        if not chosen_option:
            print(f"  Warning: option '{value}' not found for field '{field_name}'")
            return
        payload = {"singleSelectOptionId": chosen_option}

    variables = {
        "projectId": board["project_id"],
        "itemId": item_id,
        "fieldId": target["id"],
        "value": payload,
    }
    github_api_graphql(
        """
        mutation($projectId: ID!, $itemId: ID!, $fieldId: ID!, $value: ProjectV2FieldValue!) {
          updateProjectV2ItemFieldValue(input: {
            projectId: $projectId
            itemId: $itemId
            fieldId: $fieldId
            value: $value
          }) {
            projectV2Item { id }
          }
        }
        """,
        variables,
    )
266
267
def add_or_update_project_item(issue_node_id, outcome, closed_as=None, status="Auto-classified", notes=None):
    """Ensure the issue is on the project board and write its field values.

    Reuses the existing board item when there is one, otherwise creates it.
    Always sets "Outcome", "Status" and "Bot version"; sets "Closed as" only
    for values in VALID_CLOSED_AS_VALUES and "Notes" only when notes are given.
    Returns the project item ID.
    """
    existing_item = find_project_item(issue_node_id)
    if not existing_item:
        item_id = add_project_item(issue_node_id)
        print(f"  Added to project board (item {item_id})")
    else:
        item_id = existing_item
        print(f"  Issue already on board, updating (item {item_id})")

    # Collect the writes first, then apply them in this order.
    updates = [("Outcome", outcome), ("Status", status)]
    if closed_as and closed_as in VALID_CLOSED_AS_VALUES:
        updates.append(("Closed as", closed_as))
    if notes:
        updates.append(("Notes", notes))
    updates.append(("Bot version", BOT_VERSION))

    for field_name, field_value in updates:
        set_field_value(item_id, field_name, field_value)

    return item_id
289
290
def classify_closed(issue_number, closer_login, state_reason):
    """Decide the outcome for a closed issue and record it on the project board.

    Issues authored by staff are skipped. Otherwise the outcome depends on
    whether the bot left a duplicate comment, whether the author closed the
    issue themselves, and whether it was closed as a duplicate.
    """
    state_reason = state_reason or "unknown"
    print(f"Classifying closed issue #{issue_number}")
    print(f"  Closer: {closer_login}, state_reason: {state_reason}")

    issue = fetch_issue(issue_number)
    author = issue["author"]
    print(f"  Author: {author}, type: {issue['type_name']}")

    if is_staff_member(author):
        print(f"  Skipping: author '{author}' is a staff member")
        return

    bot_comment = get_bot_duplicate_comment(issue_number)
    bot_commented = bot_comment is not None
    print(f"  Bot commented: {bot_commented}")

    if not bot_commented:
        if state_reason == "duplicate":
            classify_as_missed_opportunity(issue)
        else:
            print("  Skipping: no bot comment and not closed as duplicate")
        return

    if closer_login == author:
        classify_as_success(issue, state_reason)
    else:
        # Only authors, staff, and triagers can close issues, so
        # a non-author closer is always someone with elevated permissions.
        classify_non_author_closed(issue, bot_comment, state_reason)
321
322
def classify_as_success(issue, state_reason):
    """Record a Success: the author closed their own issue after the bot commented.

    Closing as "duplicate" is auto-classified; any other reason is flagged
    for review with a note stating the reason.
    """
    if state_reason == "duplicate":
        status, notes = "Auto-classified", None
        print(f"  -> Success (closed as {state_reason})")
    else:
        # The close may be unrelated to the bot's suggestion; ask a human.
        status = "Needs review"
        notes = f"Author closed as {state_reason}"
        print(f"  -> Possible Success, needs review ({notes})")

    add_or_update_project_item(
        issue["node_id"],
        outcome="Success",
        closed_as=state_reason,
        status=status,
        notes=notes,
    )
344
345
def classify_non_author_closed(issue, bot_comment, state_reason):
    """Handle a bot-commented issue closed by someone other than its author.

    A "duplicate" close is handed to classify_as_assist; any other reason is
    recorded as Noise that needs review.
    """
    if state_reason == "duplicate":
        classify_as_assist(issue, bot_comment)
        return

    notes = f"Closed by staff/triager as {state_reason}, not duplicate"
    print(f"  -> Possible Noise, needs review ({notes})")
    add_or_update_project_item(
        issue["node_id"],
        outcome="Noise",
        closed_as=state_reason,
        status="Needs review",
        notes=notes,
    )
360
361
def classify_as_assist(issue, bot_comment):
    """Record an Assist: a non-author closed the issue as duplicate after the bot commented.

    Auto-classified only when the issue it was closed as a duplicate of is
    among the bot's suggestions; every other case is flagged for review with
    a note explaining why.
    """
    suggested = parse_suggested_issues(bot_comment)

    try:
        original = get_closed_as_duplicate_of(issue["number"])
    except (requests.RequestException, RuntimeError) as error:
        # Best effort: a failed lookup is treated like "no original found".
        original = None
        print(f"  Warning: failed to get the original-for the duplicate issue: {error}")

    if not original:
        status = "Needs review"
        notes = "Could not determine original issue from timeline"
    elif not suggested:
        status = "Needs review"
        notes = f"Closed as dup of #{original}; could not parse bot suggestions"
    elif original in suggested:
        status = "Auto-classified"
        notes = None
    else:
        status = "Needs review"
        suggested_str = ", ".join(f"#{number}" for number in suggested)
        notes = f"Bot suggested {suggested_str}; closed as dup of #{original}"

    if notes is None:
        print(f"  -> Assist (original #{original} matches bot suggestion)")
    else:
        print(f"  -> Possible Assist, needs review ({notes})")

    add_or_update_project_item(
        issue["node_id"],
        outcome="Assist",
        closed_as="duplicate",
        status=status,
        notes=notes,
    )
392
393
def classify_as_missed_opportunity(issue):
    """Record a Missed opportunity: closed as duplicate without any bot comment."""
    print("  -> Missed opportunity")
    add_or_update_project_item(
        issue["node_id"],
        outcome="Missed opportunity",
        closed_as="duplicate",
        status="Auto-classified",
    )
399
400
def classify_open():
    """Sweep open, triaged, bot-commented issues and add them to the board as Noise.

    Candidates come from a single search. An issue is skipped when its type
    is not Bug/Crash, its author is staff, it is already on the board, or no
    bot duplicate comment is found. Per-issue failures are counted and logged
    without stopping the sweep.
    """
    print("Classifying open issues")

    query = (
        f"repo:{REPO_OWNER}/{REPO_NAME} is:issue is:open "
        f"commenter:app/{BOT_APP_SLUG} "
        f'-label:"{NEEDS_TRIAGE_LABEL}" '
        f"created:>={BOT_START_DATE}"
    )
    print(f"  Search query: {query}")

    candidates = github_search_issues(query)
    print(f"  Found {len(candidates)} candidate issues")

    added = skipped = errors = 0
    for candidate in candidates:
        number = candidate["number"]
        try:
            type_name = (candidate.get("type") or {}).get("name")
            author = (candidate.get("user") or {}).get("login", "")
            node_id = candidate["node_id"]

            # Checks run cheapest first; each later one costs API requests
            # and only runs if the earlier ones did not already skip.
            if type_name not in ("Bug", "Crash"):
                skip_reason = f"type is {type_name}"
            elif is_staff_member(author):
                skip_reason = f"author {author} is staff"
            elif find_project_item(node_id):
                skip_reason = "already on the board"
            elif not get_bot_duplicate_comment(number):
                skip_reason = "no bot duplicate comment found"
            else:
                skip_reason = None

            if skip_reason:
                print(f"  #{number}: skipping, {skip_reason}")
                skipped += 1
                continue

            print(f"  #{number}: adding as Noise")
            add_or_update_project_item(node_id, outcome="Noise", status="Auto-classified")
            added += 1
        except Exception as error:  # broad catch: one issue failing shouldn't stop the sweep
            print(f"  #{number}: error processing issue, skipping: {error}")
            errors += 1

    print(f"  Done: added {added}, skipped {skipped}, errors {errors}")
444
445
# Script entry point: parse the subcommand, read configuration from the
# environment into the module-level names the functions above rely on
# (GITHUB_HEADERS, PROJECT_NUMBER), then dispatch.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Track duplicate bot effectiveness on a GitHub project board.",
    )
    subparsers = parser.add_subparsers(dest="command", required=True)

    classify_parser = subparsers.add_parser(
        "classify-closed",
        help="Classify a closed issue and add it to the project board.",
    )
    classify_parser.add_argument("issue_number", type=int)
    classify_parser.add_argument("closer_login")
    classify_parser.add_argument("state_reason")

    subparsers.add_parser(
        "classify-open",
        help="Classify open, triaged, bot-commented issues as Noise.",
    )

    args = parser.parse_args()

    # Fatal configuration errors go through sys.exit(message): the message is
    # written to stderr (not mixed into normal stdout output) and the exit
    # status is 1, the same status as before.
    GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "")
    if not GITHUB_TOKEN:
        sys.exit("Error: GITHUB_TOKEN environment variable is required")

    raw_project_number = os.environ.get("PROJECT_NUMBER", "")
    if raw_project_number:
        try:
            PROJECT_NUMBER = int(raw_project_number)
        except ValueError:
            sys.exit(f"Error: PROJECT_NUMBER must be an integer, got '{raw_project_number}'")
    else:
        PROJECT_NUMBER = DEFAULT_PROJECT_NUMBER

    GITHUB_HEADERS = {
        "Authorization": f"token {GITHUB_TOKEN}",
        "Accept": "application/vnd.github+json",
    }

    if args.command == "classify-closed":
        classify_closed(args.issue_number, args.closer_login, args.state_reason)
    elif args.command == "classify-open":
        classify_open()