Add duplicate bot effectiveness tracking (w/github project) (#49879)

Lena created

Add a script that classifies open and closed issues that the duplicate
bot commented (or not commented) on and puts them into the appropriate
columns of the dedicated github project board. Add a workflow that calls
that script for every closed issue and also on a schedule (that's for
the open ones).

If you're reading this some time way later and there's no bot running
around the repository leaving comments like “This issue appears to be a
duplicate of...”, you can delete these files.

Release Notes:

- N/A

Change summary

.github/workflows/track_duplicate_bot_effectiveness.yml |  89 ++
script/github-track-duplicate-bot-effectiveness.py      | 484 +++++++++++
2 files changed, 573 insertions(+)

Detailed changes

.github/workflows/track_duplicate_bot_effectiveness.yml 🔗

@@ -0,0 +1,89 @@
+name: Track duplicate bot effectiveness
+
+# Triggers: every issue close event, a recurring schedule, and manual dispatch.
+on:
+  issues:
+    types: [closed]
+  schedule:
+    # `*/2` in the day-of-month field matches days 1, 3, 5, ... of each month,
+    # so the job runs on consecutive days when a month ends on the 31st.
+    - cron: "0 8 */2 * *" # 8 AM UTC on odd days of the month (roughly every 2 days)
+  workflow_dispatch:
+
+permissions:
+  contents: read
+
+jobs:
+  # Classifies the one issue whose close event triggered the workflow.
+  # Runs only in zed-industries/zed, skips pull requests, and requires the
+  # issue type to be Bug or Crash.
+  classify-closed-issue:
+    if: >
+      github.event_name == 'issues' &&
+      github.repository == 'zed-industries/zed' &&
+      github.event.issue.pull_request == null &&
+      github.event.issue.type != null &&
+      (github.event.issue.type.name == 'Bug' || github.event.issue.type.name == 'Crash')
+    runs-on: ubuntu-latest
+    timeout-minutes: 5
+    steps:
+      # Sparse checkout: only the tracking script is fetched.
+      - name: Checkout repository
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          sparse-checkout: script/github-track-duplicate-bot-effectiveness.py
+          sparse-checkout-cone-mode: false
+
+      # Creates a GitHub App token; it is passed to the script as GITHUB_TOKEN below.
+      - name: Get github app token
+        id: get-app-token
+        uses: actions/create-github-app-token@bef1eaf1c0ac2b148ee2a0a74c65fbe6db0631f1 # v1.11.7
+        with:
+          app-id: ${{ secrets.ZED_COMMUNITY_BOT_APP_ID }}
+          private-key: ${{ secrets.ZED_COMMUNITY_BOT_PRIVATE_KEY }}
+          owner: zed-industries
+
+      - name: Set up Python
+        uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
+        with:
+          python-version: "3.12"
+
+      # `requests` is the script's only third-party dependency.
+      - name: Install dependencies
+        run: pip install requests
+
+      # Event fields reach the script through environment variables and quoted
+      # shell expansions rather than being templated into the command line.
+      - name: Classify closed issue
+        env:
+          GITHUB_TOKEN: ${{ steps.get-app-token.outputs.token }}
+          ISSUE_NUMBER: ${{ github.event.issue.number }}
+          CLOSER_LOGIN: ${{ github.event.sender.login }}
+          STATE_REASON: ${{ github.event.issue.state_reason }}
+        run: |
+          python script/github-track-duplicate-bot-effectiveness.py \
+            classify-closed "$ISSUE_NUMBER" "$CLOSER_LOGIN" "$STATE_REASON"
+  # Sweeps open issues; runs only for scheduled or manually dispatched
+  # workflow runs in zed-industries/zed.
+  classify-open:
+    if: >
+      github.repository == 'zed-industries/zed' &&
+      (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
+    runs-on: ubuntu-latest
+    timeout-minutes: 10
+    steps:
+      # Sparse checkout: only the tracking script is fetched.
+      - name: Checkout repository
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          sparse-checkout: script/github-track-duplicate-bot-effectiveness.py
+          sparse-checkout-cone-mode: false
+
+      # Creates a GitHub App token; it is passed to the script as GITHUB_TOKEN below.
+      - name: Get github app token
+        id: get-app-token
+        uses: actions/create-github-app-token@bef1eaf1c0ac2b148ee2a0a74c65fbe6db0631f1 # v1.11.7
+        with:
+          app-id: ${{ secrets.ZED_COMMUNITY_BOT_APP_ID }}
+          private-key: ${{ secrets.ZED_COMMUNITY_BOT_PRIVATE_KEY }}
+          owner: zed-industries
+
+      - name: Set up Python
+        uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
+        with:
+          python-version: "3.12"
+
+      # `requests` is the script's only third-party dependency.
+      - name: Install dependencies
+        run: pip install requests
+
+      # No issue-specific inputs: the script finds candidate issues via search.
+      - name: Classify open issues
+        env:
+          GITHUB_TOKEN: ${{ steps.get-app-token.outputs.token }}
+        run: |
+          python script/github-track-duplicate-bot-effectiveness.py classify-open

script/github-track-duplicate-bot-effectiveness.py 🔗

@@ -0,0 +1,484 @@
+#!/usr/bin/env python3
+"""
+Track the effectiveness of the duplicate-detection bot by classifying issues
+into outcome categories on a GitHub Projects v2 board.
+
+Subcommands:
+    classify-closed <issue_number> <closer_login> <state_reason>
+        Classify a closed issue and add it to the project board.
+
+    classify-open
+        Classify open, triaged, bot-commented issues and add them to
+        the project board as Noise.
+
+Requires:
+    requests (pip install requests)
+
+Environment variables:
+    GITHUB_TOKEN     - GitHub App token
+    PROJECT_NUMBER   - GitHub Projects v2 board number (default: 76, override for local testing)
+"""
+
+import argparse
+import functools
+import os
+import re
+import sys
+
+import requests
+
+# Base URLs for the GitHub REST and GraphQL APIs.
+GITHUB_API = "https://api.github.com"
+GRAPHQL_URL = "https://api.github.com/graphql"
+# Repository whose issues are classified; REPO_OWNER is also the organization
+# queried for the staff team and the project board.
+REPO_OWNER = "zed-industries"
+REPO_NAME = "zed"
+# Issues authored by active members of this team are skipped.
+STAFF_TEAM_SLUG = "staff"
+# BOT_LOGIN is matched against comment authors; BOT_APP_SLUG is used in the
+# `commenter:app/...` search qualifier.
+BOT_LOGIN = "zed-community-bot[bot]"
+BOT_APP_SLUG = "zed-community-bot"
+# A bot comment counts as a duplicate suggestion only if it starts with this text.
+BOT_COMMENT_PREFIX = "This issue appears to be a duplicate of"
+# Lower bound (`created:>=`) for the open-issue search.
+BOT_START_DATE = "2026-02-18"
+# Open issues that still carry this label are excluded from the search.
+NEEDS_TRIAGE_LABEL = "state:needs triage"
+# Board number used when the PROJECT_NUMBER environment variable is unset.
+DEFAULT_PROJECT_NUMBER = 76
+# Only these state_reason values are written to the board's "Closed as" field.
+VALID_CLOSED_AS_VALUES = {"duplicate", "not_planned", "completed"}
+
+
+def github_api_get(path, params=None, timeout=30):
+    """Send a GET request to the GitHub REST API and return the decoded JSON.
+
+    Args:
+        path: API path relative to GITHUB_API; a leading slash is optional.
+        params: Optional query-string parameters.
+        timeout: Seconds to wait on the server before giving up, so that a
+            stalled connection fails instead of blocking the run.
+
+    Raises:
+        requests.HTTPError: If the response has a 4xx/5xx status.
+        requests.Timeout: If the server does not answer within `timeout`.
+    """
+    url = f"{GITHUB_API}/{path.lstrip('/')}"
+    response = requests.get(
+        url, headers=GITHUB_HEADERS, params=params, timeout=timeout
+    )
+    response.raise_for_status()
+    return response.json()
+
+
+def github_search_issues(query):
+    """Run an issue search and return the first page of hits, newest first."""
+    # Deliberately a single page of 100: the oldest issues are on the board already.
+    search_params = {
+        "q": query,
+        "sort": "created",
+        "order": "desc",
+        "per_page": 100,
+    }
+    payload = github_api_get("/search/issues", search_params)
+    return payload.get("items", [])
+
+
+def is_staff_member(username):
+    """Check if user is an active member of the staff team.
+
+    Returns False without calling the API when `username` is empty (the
+    issue had no user attached), and False when the membership endpoint
+    answers 404. Any other HTTP error is re-raised.
+    """
+    if not username:
+        # An empty login would otherwise build a ".../memberships/" URL.
+        return False
+    try:
+        data = github_api_get(
+            f"/orgs/{REPO_OWNER}/teams/{STAFF_TEAM_SLUG}/memberships/{username}"
+        )
+        return data.get("state") == "active"
+    except requests.HTTPError as error:
+        if error.response.status_code == 404:
+            return False
+        raise
+
+
+def fetch_issue(issue_number):
+    """Fetch an issue over REST and keep only the fields the classifier uses.
+
+    Returns a dict with the issue number (as passed in), its node ID, the
+    author's login ("" when the issue has no user) and the issue type name
+    (None when the issue has no type).
+    """
+    payload = github_api_get(f"/repos/{REPO_OWNER}/{REPO_NAME}/issues/{issue_number}")
+    node_id = payload["node_id"]
+    user = payload.get("user") or {}
+    issue_type = payload.get("type") or {}
+    return {
+        "number": issue_number,
+        "node_id": node_id,
+        "author": user.get("login", ""),
+        "type_name": issue_type.get("name"),
+    }
+
+
+def get_bot_duplicate_comment(issue_number):
+    """Get the bot's duplicate-detection comment body from an issue.
+
+    Walks the issue's comments page by page and returns the body of the
+    first comment authored by BOT_LOGIN that starts with
+    BOT_COMMENT_PREFIX. Returns None if no such comment exists.
+    """
+    comments_path = f"/repos/{REPO_OWNER}/{REPO_NAME}/issues/{issue_number}/comments"
+    per_page = 100
+    page = 1
+    while comments := github_api_get(comments_path, {"per_page": per_page, "page": page}):
+        for comment in comments:
+            author = (comment.get("user") or {}).get("login", "")
+            # "body" may be present but null; treat that as an empty body
+            # rather than failing on None.startswith().
+            body = comment.get("body") or ""
+            if author == BOT_LOGIN and body.startswith(BOT_COMMENT_PREFIX):
+                return body
+        if len(comments) < per_page:
+            # A short page is the last one; skip the extra empty-page request.
+            break
+        page += 1
+    return None
+
+
+def parse_suggested_issues(comment_body):
+    """Return the issue numbers listed in the bot's comment, in order.
+
+    Only lines that begin with '- #<digits>' (e.g. '- #12345') are counted.
+    """
+    bullet_pattern = re.compile(r"^- #(\d+)", re.MULTILINE)
+    return [int(found.group(1)) for found in bullet_pattern.finditer(comment_body)]
+
+
+def github_api_graphql(query, variables=None, timeout=30):
+    """Execute a GitHub GraphQL query and return its "data" payload.
+
+    Args:
+        query: GraphQL query or mutation text.
+        variables: Optional dict of GraphQL variables (defaults to empty).
+        timeout: Seconds to wait on the server before giving up, so that a
+            stalled connection fails instead of blocking the run.
+
+    Raises:
+        requests.HTTPError: If the HTTP response has a 4xx/5xx status.
+        requests.Timeout: If the server does not answer within `timeout`.
+        RuntimeError: If the response body contains an "errors" entry.
+    """
+    response = requests.post(
+        GRAPHQL_URL,
+        headers=GITHUB_HEADERS,
+        json={"query": query, "variables": variables or {}},
+        timeout=timeout,
+    )
+    response.raise_for_status()
+    data = response.json()
+    if "errors" in data:
+        raise RuntimeError(f"GraphQL errors: {data['errors']}")
+    return data["data"]
+
+
+def get_closed_as_duplicate_of(issue_number):
+    """Return the number of the issue this one was marked a duplicate of.
+
+    Requests the last 10 MarkedAsDuplicateEvent timeline items and scans
+    them in reverse of the returned order, returning the first canonical
+    issue number found, or None if there is none.
+
+    Note: a "closed as duplicate" issue does not always carry such an
+    event. If the closer used the "Close as duplicate" button without
+    separately marking the duplicate relationship, no event exists and
+    this returns None; the caller then flags the item for manual review.
+    """
+    timeline_query = """
+        query($owner: String!, $repo: String!, $number: Int!) {
+          repository(owner: $owner, name: $repo) {
+            issue(number: $number) {
+              timelineItems(last: 10, itemTypes: [MARKED_AS_DUPLICATE_EVENT]) {
+                nodes {
+                  ... on MarkedAsDuplicateEvent {
+                    canonical { ... on Issue { number } }
+                  }
+                }
+              }
+            }
+          }
+        }
+        """
+    variables = {"owner": REPO_OWNER, "repo": REPO_NAME, "number": issue_number}
+    response = github_api_graphql(timeline_query, variables)
+    events = response["repository"]["issue"]["timelineItems"]["nodes"]
+    # An event's canonical may be missing or empty; those events are skipped.
+    canonical_numbers = (
+        (event.get("canonical") or {}).get("number") for event in reversed(events)
+    )
+    return next((number for number in canonical_numbers if number), None)
+
+
+@functools.lru_cache
+def get_project_config():
+    """Load the board's project ID plus a name -> field-info map (cached).
+
+    Each field-info dict has the field "id"; single-select fields also get
+    an "options" dict mapping option name to option ID. Only the first 30
+    fields are requested, and fields without a name are ignored.
+    """
+    fields_query = """
+        query($org: String!, $number: Int!) {
+          organization(login: $org) {
+            projectV2(number: $number) {
+              id
+              fields(first: 30) {
+                nodes {
+                  ... on ProjectV2SingleSelectField { id name options { id name } }
+                  ... on ProjectV2Field { id name }
+                }
+              }
+            }
+          }
+        }
+        """
+    response = github_api_graphql(
+        fields_query, {"org": REPO_OWNER, "number": PROJECT_NUMBER}
+    )
+    project = response["organization"]["projectV2"]
+    project_id = project["id"]
+
+    def describe(node):
+        # Single-select fields carry an "options" list; other fields do not.
+        info = {"id": node["id"]}
+        if "options" in node:
+            info["options"] = {choice["name"]: choice["id"] for choice in node["options"]}
+        return info
+
+    fields_by_name = {
+        node["name"]: describe(node)
+        for node in project["fields"]["nodes"]
+        if node.get("name")
+    }
+    config = {"project_id": project_id, "fields": fields_by_name}
+
+    print(f"  Project config loaded: {len(config['fields'])} fields")
+    return config
+
+
+def find_project_item(issue_node_id):
+    """Look up the issue's item on the tracking board.
+
+    Inspects the first 20 project items attached to the issue and returns
+    the ID of the one whose project number equals PROJECT_NUMBER, or None
+    when there is no such item.
+    """
+    response = github_api_graphql(
+        "query($id: ID!) { node(id: $id) { ... on Issue { projectItems(first: 20) { nodes { id project { number } } } } } }",
+        {"id": issue_node_id},
+    )
+    board_items = response["node"]["projectItems"]["nodes"]
+    matching_ids = (
+        entry["id"]
+        for entry in board_items
+        if entry["project"]["number"] == PROJECT_NUMBER
+    )
+    return next(matching_ids, None)
+
+
+def add_project_item(issue_node_id):
+    """Create a board item for the issue and return the new item's ID."""
+    project_id = get_project_config()["project_id"]
+    add_mutation = """
+        mutation($projectId: ID!, $contentId: ID!) {
+          addProjectV2ItemById(input: {projectId: $projectId, contentId: $contentId}) {
+            item { id }
+          }
+        }
+        """
+    result = github_api_graphql(
+        add_mutation, {"projectId": project_id, "contentId": issue_node_id}
+    )
+    return result["addProjectV2ItemById"]["item"]["id"]
+
+
+def set_field_value(item_id, field_name, value):
+    """Write one field of a board item.
+
+    Fields that have options are set by option name; any other field is
+    written as text. An unknown field or option name only prints a warning
+    and leaves the item untouched.
+    """
+    board = get_project_config()
+    field = board["fields"].get(field_name)
+    if not field:
+        print(f"  Warning: field '{field_name}' not found on project board")
+        return
+
+    if "options" not in field:
+        # text field
+        payload = {"text": str(value)}
+    else:
+        # single-select field: translate the option name into its ID
+        choice_id = field["options"].get(value)
+        if not choice_id:
+            print(f"  Warning: option '{value}' not found for field '{field_name}'")
+            return
+        payload = {"singleSelectOptionId": choice_id}
+
+    update_mutation = """
+        mutation($projectId: ID!, $itemId: ID!, $fieldId: ID!, $value: ProjectV2FieldValue!) {
+          updateProjectV2ItemFieldValue(input: {
+            projectId: $projectId
+            itemId: $itemId
+            fieldId: $fieldId
+            value: $value
+          }) {
+            projectV2Item { id }
+          }
+        }
+        """
+    variables = {
+        "projectId": board["project_id"],
+        "itemId": item_id,
+        "fieldId": field["id"],
+        "value": payload,
+    }
+    github_api_graphql(update_mutation, variables)
+
+
+def add_or_update_project_item(issue_node_id, outcome, closed_as=None, status="Auto-classified", notes=None):
+    """Ensure the issue has a board item and write its fields.
+
+    "Outcome" and "Status" are always written. "Closed as" is written only
+    for values in VALID_CLOSED_AS_VALUES, and "Notes" only when non-empty.
+    Returns the board item ID.
+    """
+    item_id = find_project_item(issue_node_id)
+    if not item_id:
+        item_id = add_project_item(issue_node_id)
+        print(f"  Added to project board (item {item_id})")
+    else:
+        print(f"  Issue already on board, updating (item {item_id})")
+
+    # Collect the writes first, then apply them in a fixed order.
+    updates = [("Outcome", outcome), ("Status", status)]
+    if closed_as and closed_as in VALID_CLOSED_AS_VALUES:
+        updates.append(("Closed as", closed_as))
+    if notes:
+        updates.append(("Notes", notes))
+
+    for field_name, field_value in updates:
+        set_field_value(item_id, field_name, field_value)
+
+    return item_id
+
+
+def classify_closed(issue_number, closer_login, state_reason):
+    """Decide the outcome for one closed issue and record it on the board.
+
+    Issues authored by staff are skipped, as are issues with no bot comment
+    that were not closed as duplicate. An empty state_reason is treated as
+    "unknown".
+    """
+    state_reason = state_reason or "unknown"
+    print(f"Classifying closed issue #{issue_number}")
+    print(f"  Closer: {closer_login}, state_reason: {state_reason}")
+
+    issue = fetch_issue(issue_number)
+    author = issue["author"]
+    print(f"  Author: {author}, type: {issue['type_name']}")
+
+    if is_staff_member(author):
+        print(f"  Skipping: author '{author}' is a staff member")
+        return
+
+    bot_comment = get_bot_duplicate_comment(issue_number)
+    bot_commented = bot_comment is not None
+    print(f"  Bot commented: {bot_commented}")
+
+    if not bot_commented:
+        if state_reason == "duplicate":
+            classify_as_missed_opportunity(issue)
+        else:
+            print("  Skipping: no bot comment and not closed as duplicate")
+        return
+
+    if closer_login == author:
+        classify_as_success(issue, state_reason)
+    else:
+        # Only authors, staff, and triagers can close issues, so
+        # a non-author closer is always someone with elevated permissions.
+        classify_non_author_closed(issue, bot_comment, state_reason)
+
+
+def classify_as_success(issue, state_reason):
+    """Record a Success: the author closed their own bot-commented issue.
+
+    Closing as duplicate is auto-classified; any other reason is recorded
+    as Success but marked "Needs review" with an explanatory note.
+    """
+    if state_reason == "duplicate":
+        status, notes = "Auto-classified", None
+        print(f"  -> Success (closed as {state_reason})")
+    else:
+        # could be closed for an unrelated reason; flag for review
+        status, notes = "Needs review", f"Author closed as {state_reason}"
+        print(f"  -> Possible Success, needs review ({notes})")
+
+    add_or_update_project_item(
+        issue["node_id"],
+        outcome="Success",
+        closed_as=state_reason,
+        status=status,
+        notes=notes,
+    )
+
+
+def classify_non_author_closed(issue, bot_comment, state_reason):
+    """Handle a bot-commented issue closed by someone other than its author.
+
+    Closed as duplicate goes through the Assist check; any other reason is
+    recorded as Noise and marked "Needs review".
+    """
+    if state_reason == "duplicate":
+        classify_as_assist(issue, bot_comment)
+        return
+
+    notes = f"Closed by staff/triager as {state_reason}, not duplicate"
+    print(f"  -> Possible Noise, needs review ({notes})")
+    add_or_update_project_item(
+        issue["node_id"],
+        outcome="Noise",
+        closed_as=state_reason,
+        status="Needs review",
+        notes=notes,
+    )
+
+
+def classify_as_assist(issue, bot_comment):
+    """Record an Assist: a non-author closed a bot-commented issue as duplicate.
+
+    Auto-classified only when the issue it was marked a duplicate of is
+    among the bot's suggestions. A mismatch, an unknown original, or
+    unparseable suggestions are marked "Needs review" with a note.
+    """
+    suggested = parse_suggested_issues(bot_comment)
+    try:
+        original = get_closed_as_duplicate_of(issue["number"])
+    except (requests.RequestException, RuntimeError) as error:
+        # Best effort: a failed lookup is handled like "no original found".
+        original = None
+        print(f"  Warning: failed to get the original-for the duplicate issue: {error}")
+
+    if not original:
+        status = "Needs review"
+        notes = "Could not determine original issue from timeline"
+    elif not suggested:
+        status = "Needs review"
+        notes = f"Closed as dup of #{original}; could not parse bot suggestions"
+    elif original in suggested:
+        status = "Auto-classified"
+        notes = None
+    else:
+        status = "Needs review"
+        suggested_str = ", ".join(f"#{number}" for number in suggested)
+        notes = f"Bot suggested {suggested_str}; closed as dup of #{original}"
+
+    if status == "Auto-classified":
+        print(f"  -> Assist (original #{original} matches bot suggestion)")
+    else:
+        print(f"  -> Possible Assist, needs review ({notes})")
+
+    add_or_update_project_item(
+        issue["node_id"], outcome="Assist", closed_as="duplicate", status=status, notes=notes)
+
+
+def classify_as_missed_opportunity(issue):
+    """Record a Missed opportunity: closed as duplicate with no bot comment."""
+    print("  -> Missed opportunity")
+    add_or_update_project_item(
+        issue["node_id"],
+        outcome="Missed opportunity",
+        closed_as="duplicate",
+        status="Auto-classified",
+    )
+
+
+def classify_open():
+    """Add open, triaged, bot-commented Bug/Crash issues to the board as Noise.
+
+    Searches for candidates, then skips any whose type is not Bug or Crash,
+    whose author is staff, that are already on the board, or that have no
+    bot duplicate comment. A failure on one issue is counted and logged,
+    and the sweep continues.
+    """
+    print("Classifying open issues")
+
+    query = " ".join(
+        [
+            f"repo:{REPO_OWNER}/{REPO_NAME}",
+            "is:issue",
+            "is:open",
+            f"commenter:app/{BOT_APP_SLUG}",
+            f'-label:"{NEEDS_TRIAGE_LABEL}"',
+            f"created:>={BOT_START_DATE}",
+        ]
+    )
+    print(f"  Search query: {query}")
+
+    results = github_search_issues(query)
+    print(f"  Found {len(results)} candidate issues")
+
+    added = skipped = errors = 0
+    for item in results:
+        number = item["number"]
+        try:
+            type_name = (item.get("type") or {}).get("name")
+            author = (item.get("user") or {}).get("login", "")
+            node_id = item["node_id"]
+
+            # Checks run in this order and stop at the first hit, so later
+            # (API-backed) checks are not made for already-skipped issues.
+            if type_name not in ("Bug", "Crash"):
+                skip_reason = f"type is {type_name}"
+            elif is_staff_member(author):
+                skip_reason = f"author {author} is staff"
+            elif find_project_item(node_id):
+                skip_reason = "already on the board"
+            elif not get_bot_duplicate_comment(number):
+                skip_reason = "no bot duplicate comment found"
+            else:
+                skip_reason = None
+
+            if skip_reason:
+                print(f"  #{number}: skipping, {skip_reason}")
+                skipped += 1
+                continue
+
+            print(f"  #{number}: adding as Noise")
+            add_or_update_project_item(node_id, outcome="Noise", status="Auto-classified")
+            added += 1
+        except Exception as error:  # broad catch: one issue failing shouldn't stop the sweep
+            print(f"  #{number}: error processing issue, skipping: {error}")
+            errors += 1
+
+    print(f"  Done: added {added}, skipped {skipped}, errors {errors}")
+
+
+if __name__ == "__main__":
+    # Command line: one subcommand per classification mode.
+    parser = argparse.ArgumentParser(
+        description="Track duplicate bot effectiveness on a GitHub project board.",
+    )
+    subparsers = parser.add_subparsers(dest="command", required=True)
+
+    classify_parser = subparsers.add_parser(
+        "classify-closed",
+        help="Classify a closed issue and add it to the project board.",
+    )
+    classify_parser.add_argument("issue_number", type=int)
+    classify_parser.add_argument("closer_login")
+    classify_parser.add_argument("state_reason")
+
+    subparsers.add_parser(
+        "classify-open",
+        help="Classify open, triaged, bot-commented issues as Noise.",
+    )
+
+    args = parser.parse_args()
+
+    # The names assigned below are module globals read by the functions above.
+    GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "")
+    if not GITHUB_TOKEN:
+        print("Error: GITHUB_TOKEN environment variable is required")
+        sys.exit(1)
+
+    # An unset or empty PROJECT_NUMBER falls back to the default board.
+    raw_project_number = os.environ.get("PROJECT_NUMBER", "")
+    try:
+        PROJECT_NUMBER = (
+            int(raw_project_number) if raw_project_number else DEFAULT_PROJECT_NUMBER
+        )
+    except ValueError:
+        print(f"Error: PROJECT_NUMBER must be an integer, got '{raw_project_number}'")
+        sys.exit(1)
+
+    GITHUB_HEADERS = {
+        "Authorization": f"token {GITHUB_TOKEN}",
+        "Accept": "application/vnd.github+json",
+    }
+
+    if args.command == "classify-open":
+        classify_open()
+    elif args.command == "classify-closed":
+        classify_closed(args.issue_number, args.closer_login, args.state_reason)