github-track-duplicate-bot-effectiveness.py

  1#!/usr/bin/env python3
  2"""
  3Track the effectiveness of the duplicate-detection bot by classifying issues
  4into outcome categories on a GitHub Projects v2 board.
  5
  6Subcommands:
  7    classify-closed <issue_number> <closer_login> <state_reason>
  8        Classify a closed issue and add it to the project board.
  9
 10    classify-open
 11        Classify open, triaged, bot-commented issues and add them to
 12        the project board as Noise.
 13
 14Requires:
 15    requests (pip install requests)
 16
 17Environment variables:
 18    GITHUB_TOKEN     - GitHub App token
 19    PROJECT_NUMBER   - GitHub Projects v2 board number (default: 76, override for local testing)
 20"""
 21
 22import argparse
 23import functools
 24import os
 25import re
 26import sys
 27
 28import requests
 29
# Base URLs of the GitHub REST and GraphQL APIs.
GITHUB_API = "https://api.github.com"
GRAPHQL_URL = "https://api.github.com/graphql"
# Repository whose issues are classified. REPO_OWNER is also used as the
# organization login for the staff-team and project-board lookups.
REPO_OWNER = "zed-industries"
REPO_NAME = "zed"
# Team whose active members count as staff (see is_staff_member).
STAFF_TEAM_SLUG = "staff"
# Comment-author login used to recognize the bot's comments.
BOT_LOGIN = "zed-community-bot[bot]"
# App slug used in the `commenter:app/...` search qualifier in classify_open.
BOT_APP_SLUG = "zed-community-bot"
# A bot comment only counts as a duplicate-detection comment if its body
# starts with this text.
BOT_COMMENT_PREFIX = "This issue appears to be a duplicate of"
# classify_open only searches for issues created on or after this date.
BOT_START_DATE = "2026-02-18"
# Issues carrying this label are excluded from the classify_open search.
NEEDS_TRIAGE_LABEL = "state:needs triage"
# Board number used when the PROJECT_NUMBER environment variable is unset.
DEFAULT_PROJECT_NUMBER = 76
# State reasons that may be written to the board's "Closed as" field; any
# other value is left unset by add_or_update_project_item.
VALID_CLOSED_AS_VALUES = {"duplicate", "not_planned", "completed"}
 42
 43
 44def github_api_get(path, params=None):
 45    url = f"{GITHUB_API}/{path.lstrip('/')}"
 46    response = requests.get(url, headers=GITHUB_HEADERS, params=params)
 47    response.raise_for_status()
 48    return response.json()
 49
 50
 51def github_search_issues(query):
 52    """Search issues, returning most recently created first."""
 53    # not handling pagination on purpose: the oldest issues are on the board already
 54    params = {"q": query, "sort": "created", "order": "desc", "per_page": 100}
 55    return github_api_get("/search/issues", params).get("items", [])
 56
 57
 58def is_staff_member(username):
 59    """Check if user is an active member of the staff team."""
 60    try:
 61        data = github_api_get(
 62            f"/orgs/{REPO_OWNER}/teams/{STAFF_TEAM_SLUG}/memberships/{username}"
 63        )
 64        return data.get("state") == "active"
 65    except requests.HTTPError as error:
 66        if error.response.status_code == 404:
 67            return False
 68        raise
 69
 70
 71def fetch_issue(issue_number):
 72    data = github_api_get(f"/repos/{REPO_OWNER}/{REPO_NAME}/issues/{issue_number}")
 73    return {
 74        "number": issue_number,
 75        "node_id": data["node_id"],
 76        "author": (data.get("user") or {}).get("login", ""),
 77        "type_name": (data.get("type") or {}).get("name"),
 78    }
 79
 80
 81def get_bot_duplicate_comment(issue_number):
 82    """Get the bot's duplicate-detection comment body from an issue.
 83
 84    Returns the comment body if found, else None.
 85    """
 86    comments_path = f"/repos/{REPO_OWNER}/{REPO_NAME}/issues/{issue_number}/comments"
 87    page = 1
 88    while comments := github_api_get(comments_path, {"per_page": 100, "page": page}):
 89        for comment in comments:
 90            author = (comment.get("user") or {}).get("login", "")
 91            body = comment.get("body", "")
 92            if author == BOT_LOGIN and body.startswith(BOT_COMMENT_PREFIX):
 93                return body
 94        page += 1
 95    return None
 96
 97
 98def parse_suggested_issues(comment_body):
 99    """Extract issue numbers from the bot's comment (lines like '- #12345')."""
100    return [int(match) for match in re.findall(r"^- #(\d+)", comment_body, re.MULTILINE)]
101
102
def github_api_graphql(query, variables=None, timeout=30):
    """Execute a GitHub GraphQL query or mutation.

    Args:
        query: GraphQL document to send.
        variables: Optional mapping of GraphQL variables; sent as {} when omitted.
        timeout: Seconds to wait for the server before giving up. requests
            applies no timeout by default, so without one a stalled
            connection would block the script indefinitely.

    Returns:
        The "data" member of the GraphQL response.

    Raises:
        requests.HTTPError: If the HTTP response status is 4xx or 5xx.
        requests.Timeout: If the server does not respond within `timeout`.
        RuntimeError: If the response body contains an "errors" member.
    """
    response = requests.post(
        GRAPHQL_URL,
        headers=GITHUB_HEADERS,
        json={"query": query, "variables": variables or {}},
        timeout=timeout,
    )
    response.raise_for_status()
    data = response.json()
    # The HTTP status alone does not reveal GraphQL-level failures, so the
    # body is checked for an "errors" member as well.
    if "errors" in data:
        raise RuntimeError(f"GraphQL errors: {data['errors']}")
    return data["data"]
115
116
def get_closed_as_duplicate_of(issue_number):
    """Look up which issue this one was marked as a duplicate of.

    Queries the last 10 MarkedAsDuplicateEvent timeline items and returns the
    canonical issue number of the most recent one that has it, or None when
    no event provides a number.

    Note: a "closed as duplicate" issue does not necessarily have a
    MarkedAsDuplicateEvent. If the closer used the "Close as duplicate"
    button without separately marking the duplicate relationship, no event
    is created and the result is None. The caller handles that case by
    flagging the item for manual review.
    """
    query = """
        query($owner: String!, $repo: String!, $number: Int!) {
          repository(owner: $owner, name: $repo) {
            issue(number: $number) {
              timelineItems(last: 10, itemTypes: [MARKED_AS_DUPLICATE_EVENT]) {
                nodes {
                  ... on MarkedAsDuplicateEvent {
                    canonical { ... on Issue { number } }
                  }
                }
              }
            }
          }
        }
        """
    variables = {"owner": REPO_OWNER, "repo": REPO_NAME, "number": issue_number}
    response = github_api_graphql(query, variables)
    events = response["repository"]["issue"]["timelineItems"]["nodes"]

    # Walk the events from last to first so the most recent marking wins.
    for event in events[::-1]:
        canonical = event.get("canonical") or {}
        canonical_number = canonical.get("number")
        if canonical_number:
            return canonical_number
    return None
151
152
@functools.lru_cache
def get_project_config():
    """Load the project board's ID plus the IDs of its fields and options.

    Returns a dict shaped like
    {"project_id": ..., "fields": {name: {"id": ..., "options": {name: id}}}},
    where "options" is present only for fields that report options. The
    result is cached, so the board is queried once per process.
    """
    query = """
        query($org: String!, $number: Int!) {
          organization(login: $org) {
            projectV2(number: $number) {
              id
              fields(first: 30) {
                nodes {
                  ... on ProjectV2SingleSelectField { id name options { id name } }
                  ... on ProjectV2Field { id name }
                }
              }
            }
          }
        }
        """
    response = github_api_graphql(query, {"org": REPO_OWNER, "number": PROJECT_NUMBER})
    project = response["organization"]["projectV2"]
    project_id = project["id"]

    fields_by_name = {}
    for node in project["fields"]["nodes"]:
        field_name = node.get("name")
        if field_name:
            # Nodes without a name are skipped entirely.
            entry = {"id": node["id"]}
            if "options" in node:
                entry["options"] = {
                    choice["name"]: choice["id"] for choice in node["options"]
                }
            fields_by_name[field_name] = entry

    config = {"project_id": project_id, "fields": fields_by_name}
    print(f"  Project config loaded: {len(config['fields'])} fields")
    return config
190
191
def find_project_item(issue_node_id):
    """Look for an issue on our project board.

    Inspects up to 20 of the issue's project items and returns the ID of the
    first one whose project number equals PROJECT_NUMBER, or None.
    """
    query = "query($id: ID!) { node(id: $id) { ... on Issue { projectItems(first: 20) { nodes { id project { number } } } } } }"
    response = github_api_graphql(query, {"id": issue_node_id})
    board_items = response["node"]["projectItems"]["nodes"]
    return next(
        (
            entry["id"]
            for entry in board_items
            if entry["project"]["number"] == PROJECT_NUMBER
        ),
        None,
    )
205
206
def add_project_item(issue_node_id):
    """Put an issue on the project board and return the ID of the created item."""
    project_id = get_project_config()["project_id"]
    mutation = """
        mutation($projectId: ID!, $contentId: ID!) {
          addProjectV2ItemById(input: {projectId: $projectId, contentId: $contentId}) {
            item { id }
          }
        }
        """
    variables = {"projectId": project_id, "contentId": issue_node_id}
    result = github_api_graphql(mutation, variables)
    return result["addProjectV2ItemById"]["item"]["id"]
221
222
def set_field_value(item_id, field_name, value):
    """Write one field value on a project board item.

    Fields that have options are treated as single-select and `value` must
    be the name of an option; every other field is written as text. An
    unknown field or option only prints a warning and nothing is written.
    """
    board = get_project_config()
    field_info = board["fields"].get(field_name)
    if not field_info:
        print(f"  Warning: field '{field_name}' not found on project board")
        return

    if "options" not in field_info:
        # text field
        payload = {"text": str(value)}
    else:
        # single-select field
        option_id = field_info["options"].get(value)
        if not option_id:
            print(f"  Warning: option '{value}' not found for field '{field_name}'")
            return
        payload = {"singleSelectOptionId": option_id}

    mutation = """
        mutation($projectId: ID!, $itemId: ID!, $fieldId: ID!, $value: ProjectV2FieldValue!) {
          updateProjectV2ItemFieldValue(input: {
            projectId: $projectId
            itemId: $itemId
            fieldId: $fieldId
            value: $value
          }) {
            projectV2Item { id }
          }
        }
        """
    variables = {
        "projectId": board["project_id"],
        "itemId": item_id,
        "fieldId": field_info["id"],
        "value": payload,
    }
    github_api_graphql(mutation, variables)
262
263
def add_or_update_project_item(issue_node_id, outcome, closed_as=None, status="Auto-classified", notes=None):
    """Ensure the issue is on the project board and write its field values.

    "Outcome" and "Status" are always written. "Closed as" is written only
    for values in VALID_CLOSED_AS_VALUES, and "Notes" only when non-empty.
    Returns the project item ID.
    """
    item_id = find_project_item(issue_node_id)
    if not item_id:
        item_id = add_project_item(issue_node_id)
        print(f"  Added to project board (item {item_id})")
    else:
        print(f"  Issue already on board, updating (item {item_id})")

    # Collect the writes first, then apply them in a fixed order.
    updates = [("Outcome", outcome), ("Status", status)]
    if closed_as and closed_as in VALID_CLOSED_AS_VALUES:
        updates.append(("Closed as", closed_as))
    if notes:
        updates.append(("Notes", notes))

    for field_name, field_value in updates:
        set_field_value(item_id, field_name, field_value)

    return item_id
283
284
def classify_closed(issue_number, closer_login, state_reason):
    """Decide the outcome for a closed issue and record it on the project board.

    Issues authored by staff are skipped, as are issues with no bot comment
    that were not closed as duplicate.
    """
    if not state_reason:
        state_reason = "unknown"
    print(f"Classifying closed issue #{issue_number}")
    print(f"  Closer: {closer_login}, state_reason: {state_reason}")

    issue = fetch_issue(issue_number)
    author = issue["author"]
    print(f"  Author: {author}, type: {issue['type_name']}")

    if is_staff_member(author):
        print(f"  Skipping: author '{author}' is a staff member")
        return

    bot_comment = get_bot_duplicate_comment(issue_number)
    bot_commented = bot_comment is not None
    print(f"  Bot commented: {bot_commented}")

    if not bot_commented:
        if state_reason == "duplicate":
            classify_as_missed_opportunity(issue)
        else:
            print("  Skipping: no bot comment and not closed as duplicate")
        return

    if closer_login == author:
        classify_as_success(issue, state_reason)
    else:
        # Only authors, staff, and triagers can close issues, so
        # a non-author closer is always someone with elevated permissions.
        classify_non_author_closed(issue, bot_comment, state_reason)
315
316
def classify_as_success(issue, state_reason):
    """Record a Success: the author closed their own issue after the bot commented.

    Anything other than a "duplicate" close is flagged for review, because
    the author may have closed the issue for an unrelated reason.
    """
    closed_as_duplicate = state_reason == "duplicate"
    if closed_as_duplicate:
        status, notes = "Auto-classified", None
        print(f"  -> Success (closed as {state_reason})")
    else:
        status, notes = "Needs review", f"Author closed as {state_reason}"
        print(f"  -> Possible Success, needs review ({notes})")

    add_or_update_project_item(
        issue["node_id"],
        outcome="Success",
        closed_as=state_reason,
        status=status,
        notes=notes,
    )
338
339
def classify_non_author_closed(issue, bot_comment, state_reason):
    """Handle an issue the bot commented on that someone other than the author closed.

    A "duplicate" close goes through the Assist check; any other reason is
    recorded as Noise and flagged for review.
    """
    if state_reason == "duplicate":
        classify_as_assist(issue, bot_comment)
        return

    notes = f"Closed by staff/triager as {state_reason}, not duplicate"
    print(f"  -> Possible Noise, needs review ({notes})")
    add_or_update_project_item(
        issue["node_id"],
        outcome="Noise",
        closed_as=state_reason,
        status="Needs review",
        notes=notes,
    )
354
355
def classify_as_assist(issue, bot_comment):
    """Record an Assist: a non-author closed the issue as duplicate after the bot commented.

    The item is auto-classified only when the issue it was marked a duplicate
    of is among the bot's suggestions; every other case is flagged for review
    with a note explaining why.
    """
    suggested = parse_suggested_issues(bot_comment)
    try:
        original = get_closed_as_duplicate_of(issue["number"])
    except (requests.RequestException, RuntimeError) as error:
        print(f"  Warning: failed to get the original-for the duplicate issue: {error}")
        original = None

    status = "Needs review"
    if not original:
        notes = "Could not determine original issue from timeline"
    elif not suggested:
        notes = f"Closed as dup of #{original}; could not parse bot suggestions"
    elif original in suggested:
        status = "Auto-classified"
        notes = None
    else:
        suggested_str = ", ".join(f"#{number}" for number in suggested)
        notes = f"Bot suggested {suggested_str}; closed as dup of #{original}"

    # Notes are absent only on a confirmed match.
    if notes is None:
        print(f"  -> Assist (original #{original} matches bot suggestion)")
    else:
        print(f"  -> Possible Assist, needs review ({notes})")

    add_or_update_project_item(
        issue["node_id"],
        outcome="Assist",
        closed_as="duplicate",
        status=status,
        notes=notes,
    )
386
387
def classify_as_missed_opportunity(issue):
    """Record a Missed opportunity: closed as duplicate without any bot comment."""
    print("  -> Missed opportunity")
    add_or_update_project_item(
        issue["node_id"],
        outcome="Missed opportunity",
        closed_as="duplicate",
        status="Auto-classified",
    )
393
394
def classify_open():
    """Sweep open, triaged, bot-commented issues and add them to the board as Noise.

    A candidate is skipped when its type is not Bug or Crash, its author is
    staff, it is already on the board, or it has no bot duplicate comment.
    A failure on one issue is counted and reported without stopping the sweep.
    """
    print("Classifying open issues")

    query = " ".join(
        [
            f"repo:{REPO_OWNER}/{REPO_NAME} is:issue is:open",
            f"commenter:app/{BOT_APP_SLUG}",
            f'-label:"{NEEDS_TRIAGE_LABEL}"',
            f"created:>={BOT_START_DATE}",
        ]
    )
    print(f"  Search query: {query}")

    results = github_search_issues(query)
    print(f"  Found {len(results)} candidate issues")

    added = skipped = errors = 0
    for item in results:
        number = item["number"]
        try:
            type_name = (item.get("type") or {}).get("name")
            author = (item.get("user") or {}).get("login", "")
            node_id = item["node_id"]

            # Checks run in this order so that each remote lookup happens
            # only if every earlier check passed.
            if type_name not in ("Bug", "Crash"):
                skip_reason = f"type is {type_name}"
            elif is_staff_member(author):
                skip_reason = f"author {author} is staff"
            elif find_project_item(node_id):
                skip_reason = "already on the board"
            elif not get_bot_duplicate_comment(number):
                skip_reason = "no bot duplicate comment found"
            else:
                skip_reason = None

            if skip_reason:
                print(f"  #{number}: skipping, {skip_reason}")
                skipped += 1
            else:
                print(f"  #{number}: adding as Noise")
                add_or_update_project_item(node_id, outcome="Noise", status="Auto-classified")
                added += 1
        except Exception as error:  # broad catch: one issue failing shouldn't stop the sweep
            print(f"  #{number}: error processing issue, skipping: {error}")
            errors += 1

    print(f"  Done: added {added}, skipped {skipped}, errors {errors}")
438
439
if __name__ == "__main__":
    # Command-line entry point: parse the subcommand, read configuration from
    # the environment into the module-level globals the helpers rely on
    # (GITHUB_TOKEN, PROJECT_NUMBER, GITHUB_HEADERS), then dispatch.
    parser = argparse.ArgumentParser(
        description="Track duplicate bot effectiveness on a GitHub project board.",
    )
    subparsers = parser.add_subparsers(dest="command", required=True)

    classify_parser = subparsers.add_parser(
        "classify-closed",
        help="Classify a closed issue and add it to the project board.",
    )
    classify_parser.add_argument("issue_number", type=int)
    classify_parser.add_argument("closer_login")
    classify_parser.add_argument("state_reason")

    subparsers.add_parser(
        "classify-open",
        help="Classify open, triaged, bot-commented issues as Noise.",
    )

    args = parser.parse_args()

    GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "")
    if not GITHUB_TOKEN:
        # sys.exit with a string writes it to stderr and exits with status 1,
        # keeping fatal errors out of the regular progress output on stdout.
        sys.exit("Error: GITHUB_TOKEN environment variable is required")

    raw_project_number = os.environ.get("PROJECT_NUMBER", "")
    if raw_project_number:
        try:
            PROJECT_NUMBER = int(raw_project_number)
        except ValueError:
            sys.exit(
                f"Error: PROJECT_NUMBER must be an integer, got '{raw_project_number}'"
            )
    else:
        PROJECT_NUMBER = DEFAULT_PROJECT_NUMBER

    GITHUB_HEADERS = {
        "Authorization": f"token {GITHUB_TOKEN}",
        "Accept": "application/vnd.github+json",
    }

    if args.command == "classify-closed":
        classify_closed(args.issue_number, args.closer_login, args.state_reason)
    elif args.command == "classify-open":
        classify_open()