From 6741a1df286528a5480256eb1f9301a178f6846f Mon Sep 17 00:00:00 2001
From: Lena <241371603+zelenenka@users.noreply.github.com>
Date: Mon, 23 Feb 2026 14:22:29 +0100
Subject: [PATCH] Add duplicate bot effectiveness tracking (w/github project) (#49879)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a script that classifies open and closed issues that the duplicate bot commented (or not commented) on and puts them into the appropriate columns of the dedicated github project board.

Add a workflow that calls that script for every closed issue and also on a schedule (that's for the open ones).

If you're reading this some time way later and there's no bot running around the repository leaving comments like “This issue appears to be a duplicate of...”, you can delete these files.

Release Notes:

- N/A
---
 .../track_duplicate_bot_effectiveness.yml     |  89 ++++
 ...ithub-track-duplicate-bot-effectiveness.py | 488 ++++++++++++++++++
 2 files changed, 577 insertions(+)
 create mode 100644 .github/workflows/track_duplicate_bot_effectiveness.yml
 create mode 100644 script/github-track-duplicate-bot-effectiveness.py

diff --git a/.github/workflows/track_duplicate_bot_effectiveness.yml b/.github/workflows/track_duplicate_bot_effectiveness.yml
new file mode 100644
index 0000000000000000000000000000000000000000..fa1c80616cb6133a7a4cad8841bbaad03115ff58
--- /dev/null
+++ b/.github/workflows/track_duplicate_bot_effectiveness.yml
@@ -0,0 +1,89 @@
+name: Track duplicate bot effectiveness
+
+on:
+  issues:
+    types: [closed]
+  schedule:
+    - cron: "0 8 */2 * *" # every 2 days at 8 AM UTC
+  workflow_dispatch:
+
+permissions:
+  contents: read
+
+jobs:
+  classify-closed-issue:
+    if: >
+      github.event_name == 'issues' &&
+      github.repository == 'zed-industries/zed' &&
+      github.event.issue.pull_request == null &&
+      github.event.issue.type != null &&
+      (github.event.issue.type.name == 'Bug' || github.event.issue.type.name == 'Crash')
+    runs-on: ubuntu-latest
+    timeout-minutes: 5
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          sparse-checkout: script/github-track-duplicate-bot-effectiveness.py
+          sparse-checkout-cone-mode: false
+
+      - name: Get github app token
+        id: get-app-token
+        uses: actions/create-github-app-token@bef1eaf1c0ac2b148ee2a0a74c65fbe6db0631f1 # v1.11.7
+        with:
+          app-id: ${{ secrets.ZED_COMMUNITY_BOT_APP_ID }}
+          private-key: ${{ secrets.ZED_COMMUNITY_BOT_PRIVATE_KEY }}
+          owner: zed-industries
+
+      - name: Set up Python
+        uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
+        with:
+          python-version: "3.12"
+
+      - name: Install dependencies
+        run: pip install requests
+
+      - name: Classify closed issue
+        env:
+          GITHUB_TOKEN: ${{ steps.get-app-token.outputs.token }}
+          ISSUE_NUMBER: ${{ github.event.issue.number }}
+          CLOSER_LOGIN: ${{ github.event.sender.login }}
+          STATE_REASON: ${{ github.event.issue.state_reason }}
+        run: |
+          python script/github-track-duplicate-bot-effectiveness.py \
+            classify-closed "$ISSUE_NUMBER" "$CLOSER_LOGIN" "$STATE_REASON"
+
+  classify-open:
+    if: >
+      github.repository == 'zed-industries/zed' &&
+      (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
+    runs-on: ubuntu-latest
+    timeout-minutes: 10
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          sparse-checkout: script/github-track-duplicate-bot-effectiveness.py
+          sparse-checkout-cone-mode: false
+
+      - name: Get github app token
+        id: get-app-token
+        uses: actions/create-github-app-token@bef1eaf1c0ac2b148ee2a0a74c65fbe6db0631f1 # v1.11.7
+        with:
+          app-id: ${{ secrets.ZED_COMMUNITY_BOT_APP_ID }}
+          private-key: ${{ secrets.ZED_COMMUNITY_BOT_PRIVATE_KEY }}
+          owner: zed-industries
+
+      - name: Set up Python
+        uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
+        with:
+          python-version: "3.12"
+
+      - name: Install dependencies
+        run: pip install requests
+
+      - name: Classify open issues
+        env:
+          GITHUB_TOKEN: ${{ steps.get-app-token.outputs.token }}
+        run: |
+          python script/github-track-duplicate-bot-effectiveness.py classify-open
diff --git a/script/github-track-duplicate-bot-effectiveness.py b/script/github-track-duplicate-bot-effectiveness.py
new file mode 100644
index 0000000000000000000000000000000000000000..a3056e856da717783ca0fc6538a131bb8a1d1d73
--- /dev/null
+++ b/script/github-track-duplicate-bot-effectiveness.py
@@ -0,0 +1,488 @@
+#!/usr/bin/env python3
+"""
+Track the effectiveness of the duplicate-detection bot by classifying issues
+into outcome categories on a GitHub Projects v2 board.
+
+Subcommands:
+    classify-closed <issue_number> <closer_login> <state_reason>
+        Classify a closed issue and add it to the project board.
+
+    classify-open
+        Classify open, triaged, bot-commented issues and add them to
+        the project board as Noise.
+
+Requires:
+    requests (pip install requests)
+
+Environment variables:
+    GITHUB_TOKEN - GitHub App token
+    PROJECT_NUMBER - GitHub Projects v2 board number (default: 76, override for local testing)
+"""
+
+import argparse
+import functools
+import os
+import re
+import sys
+
+import requests
+
+GITHUB_API = "https://api.github.com"
+GRAPHQL_URL = "https://api.github.com/graphql"
+REPO_OWNER = "zed-industries"
+REPO_NAME = "zed"
+STAFF_TEAM_SLUG = "staff"
+BOT_LOGIN = "zed-community-bot[bot]"
+BOT_APP_SLUG = "zed-community-bot"
+BOT_COMMENT_PREFIX = "This issue appears to be a duplicate of"
+BOT_START_DATE = "2026-02-18"
+NEEDS_TRIAGE_LABEL = "state:needs triage"
+DEFAULT_PROJECT_NUMBER = 76
+VALID_CLOSED_AS_VALUES = {"duplicate", "not_planned", "completed"}
+REQUEST_TIMEOUT_SECONDS = 30  # without a timeout, requests can block indefinitely
+
+
+def github_api_get(path, params=None):
+    """GET a GitHub REST API path and return the decoded JSON. Raises on HTTP errors."""
+    url = f"{GITHUB_API}/{path.lstrip('/')}"
+    response = requests.get(url, headers=GITHUB_HEADERS, params=params, timeout=REQUEST_TIMEOUT_SECONDS)
+    response.raise_for_status()
+    return response.json()
+
+
+def github_search_issues(query):
+    """Search issues, returning most recently created first."""
+    # not handling pagination on purpose: the oldest issues are on the board already
+    params = {"q": query, "sort": "created", "order": "desc", "per_page": 100}
+    return github_api_get("/search/issues", params).get("items", [])
+
+
+def is_staff_member(username):
+    """Check if user is an active member of the staff team."""
+    try:
+        data = github_api_get(
+            f"/orgs/{REPO_OWNER}/teams/{STAFF_TEAM_SLUG}/memberships/{username}"
+        )
+        return data.get("state") == "active"
+    except requests.HTTPError as error:
+        if error.response.status_code == 404:
+            return False
+        raise
+
+
+def fetch_issue(issue_number):
+    """Fetch an issue and return its number, node ID, author login, and type name."""
+    data = github_api_get(f"/repos/{REPO_OWNER}/{REPO_NAME}/issues/{issue_number}")
+    return {
+        "number": issue_number,
+        "node_id": data["node_id"],
+        "author": (data.get("user") or {}).get("login", ""),
+        "type_name": (data.get("type") or {}).get("name"),
+    }
+
+
+def get_bot_duplicate_comment(issue_number):
+    """Get the bot's duplicate-detection comment body from an issue.
+
+    Returns the comment body if found, else None.
+    """
+    comments_path = f"/repos/{REPO_OWNER}/{REPO_NAME}/issues/{issue_number}/comments"
+    page = 1
+    while comments := github_api_get(comments_path, {"per_page": 100, "page": page}):
+        for comment in comments:
+            author = (comment.get("user") or {}).get("login", "")
+            body = comment.get("body") or ""
+            if author == BOT_LOGIN and body.startswith(BOT_COMMENT_PREFIX):
+                return body
+        page += 1
+    return None
+
+
+def parse_suggested_issues(comment_body):
+    """Extract issue numbers from the bot's comment (lines like '- #12345')."""
+    return [int(match) for match in re.findall(r"^- #(\d+)", comment_body, re.MULTILINE)]
+
+
+def github_api_graphql(query, variables=None):
+    """Execute a GitHub GraphQL query. Raises on errors."""
+    response = requests.post(
+        GRAPHQL_URL,
+        headers=GITHUB_HEADERS,
+        json={"query": query, "variables": variables or {}},
+        timeout=REQUEST_TIMEOUT_SECONDS,
+    )
+    response.raise_for_status()
+    data = response.json()
+    if "errors" in data:
+        raise RuntimeError(f"GraphQL errors: {data['errors']}")
+    return data["data"]
+
+
+def get_closed_as_duplicate_of(issue_number):
+    """Get the issue number this issue was closed as a duplicate of.
+
+    Uses the timeline to find the most recent MarkedAsDuplicateEvent.
+    Returns the original issue number, or None.
+
+    Note: not all "closed as duplicate" issues have a MarkedAsDuplicateEvent.
+    If the closer used the "Close as duplicate" button without separately
+    marking the duplicate relationship, no event is created and this returns
+    None. The caller handles this by flagging the item for manual review.
+    """
+    data = github_api_graphql(
+        """
+        query($owner: String!, $repo: String!, $number: Int!) {
+          repository(owner: $owner, name: $repo) {
+            issue(number: $number) {
+              timelineItems(last: 10, itemTypes: [MARKED_AS_DUPLICATE_EVENT]) {
+                nodes {
+                  ... on MarkedAsDuplicateEvent {
+                    canonical { ... on Issue { number } }
+                  }
+                }
+              }
+            }
+          }
+        }
+        """,
+        {"owner": REPO_OWNER, "repo": REPO_NAME, "number": issue_number},
+    )
+    nodes = data["repository"]["issue"]["timelineItems"]["nodes"]
+    for node in reversed(nodes):
+        if original := (node.get("canonical") or {}).get("number"):
+            return original
+    return None
+
+
+@functools.lru_cache
+def get_project_config():
+    """Fetch the project board's ID, field IDs, and option IDs."""
+    data = github_api_graphql(
+        """
+        query($org: String!, $number: Int!) {
+          organization(login: $org) {
+            projectV2(number: $number) {
+              id
+              fields(first: 30) {
+                nodes {
+                  ... on ProjectV2SingleSelectField { id name options { id name } }
+                  ... on ProjectV2Field { id name }
+                }
+              }
+            }
+          }
+        }
+        """,
+        {"org": REPO_OWNER, "number": PROJECT_NUMBER},
+    )
+    project = data["organization"]["projectV2"]
+
+    config = {"project_id": project["id"], "fields": {}}
+    for field_node in project["fields"]["nodes"]:
+        name = field_node.get("name")
+        if not name:
+            continue
+        field_info = {"id": field_node["id"]}
+        if "options" in field_node:
+            field_info["options"] = {
+                option["name"]: option["id"] for option in field_node["options"]
+            }
+        config["fields"][name] = field_info
+
+    print(f" Project config loaded: {len(config['fields'])} fields")
+    return config
+
+
+def find_project_item(issue_node_id):
+    """Check if an issue is already on our project board.
+
+    Returns the project item ID if found, or None.
+    """
+    data = github_api_graphql(
+        "query($id: ID!) { node(id: $id) { ... on Issue { projectItems(first: 20) { nodes { id project { number } } } } } }",
+        {"id": issue_node_id},
+    )
+    for item in data["node"]["projectItems"]["nodes"]:
+        if item["project"]["number"] == PROJECT_NUMBER:
+            return item["id"]
+    return None
+
+
+def add_project_item(issue_node_id):
+    """Add an issue to the project board. Returns the new item ID."""
+    config = get_project_config()
+    data = github_api_graphql(
+        """
+        mutation($projectId: ID!, $contentId: ID!) {
+          addProjectV2ItemById(input: {projectId: $projectId, contentId: $contentId}) {
+            item { id }
+          }
+        }
+        """,
+        {"projectId": config["project_id"], "contentId": issue_node_id},
+    )
+    return data["addProjectV2ItemById"]["item"]["id"]
+
+
+def set_field_value(item_id, field_name, value):
+    """Set a single field value on a project board item."""
+    config = get_project_config()
+    field = config["fields"].get(field_name)
+    if not field:
+        print(f" Warning: field '{field_name}' not found on project board")
+        return
+
+    if "options" in field:
+        # single-select field
+        option_id = field["options"].get(value)
+        if not option_id:
+            print(f" Warning: option '{value}' not found for field '{field_name}'")
+            return
+        field_value = {"singleSelectOptionId": option_id}
+    else:
+        # text field
+        field_value = {"text": str(value)}
+
+    github_api_graphql(
+        """
+        mutation($projectId: ID!, $itemId: ID!, $fieldId: ID!, $value: ProjectV2FieldValue!) {
+          updateProjectV2ItemFieldValue(input: {
+            projectId: $projectId
+            itemId: $itemId
+            fieldId: $fieldId
+            value: $value
+          }) {
+            projectV2Item { id }
+          }
+        }
+        """,
+        {
+            "projectId": config["project_id"],
+            "itemId": item_id,
+            "fieldId": field["id"],
+            "value": field_value,
+        },
+    )
+
+
+def add_or_update_project_item(issue_node_id, outcome, closed_as=None, status="Auto-classified", notes=None):
+    """Add an issue to the project board (or update it if already there), setting field values."""
+    item_id = find_project_item(issue_node_id)
+    if item_id:
+        print(f" Issue already on board, updating (item {item_id})")
+    else:
+        item_id = add_project_item(issue_node_id)
+        print(f" Added to project board (item {item_id})")
+
+    set_field_value(item_id, "Outcome", outcome)
+    set_field_value(item_id, "Status", status)
+
+    if closed_as and closed_as in VALID_CLOSED_AS_VALUES:
+        set_field_value(item_id, "Closed as", closed_as)
+
+    if notes:
+        set_field_value(item_id, "Notes", notes)
+
+    return item_id
+
+
+def classify_closed(issue_number, closer_login, state_reason):
+    """Classify a closed issue and add/update it on the project board."""
+    state_reason = state_reason or "unknown"
+    print(f"Classifying closed issue #{issue_number}")
+    print(f" Closer: {closer_login}, state_reason: {state_reason}")
+
+    issue = fetch_issue(issue_number)
+    author = issue["author"]
+    print(f" Author: {author}, type: {issue['type_name']}")
+
+    if is_staff_member(author):
+        print(f" Skipping: author '{author}' is a staff member")
+        return
+
+    bot_comment = get_bot_duplicate_comment(issue_number)
+    bot_commented = bot_comment is not None
+    print(f" Bot commented: {bot_commented}")
+
+    closer_is_author = closer_login == author
+
+    if bot_commented and closer_is_author:
+        classify_as_success(issue, state_reason)
+    elif bot_commented and not closer_is_author:
+        # Only authors, staff, and triagers can close issues, so
+        # a non-author closer is always someone with elevated permissions.
+        classify_non_author_closed(issue, bot_comment, state_reason)
+    elif not bot_commented and state_reason == "duplicate":
+        classify_as_missed_opportunity(issue)
+    else:
+        print(" Skipping: no bot comment and not closed as duplicate")
+
+
+def classify_as_success(issue, state_reason):
+    """Author closed their own issue after the bot commented."""
+    if state_reason == "duplicate":
+        status = "Auto-classified"
+        notes = None
+    else:
+        # could be closed for an unrelated reason; flag for review
+        status = "Needs review"
+        notes = f"Author closed as {state_reason}"
+
+    if status == "Auto-classified":
+        print(f" -> Success (closed as {state_reason})")
+    else:
+        print(f" -> Possible Success, needs review ({notes})")
+    add_or_update_project_item(
+        issue["node_id"],
+        outcome="Success",
+        closed_as=state_reason,
+        status=status,
+        notes=notes,
+    )
+
+
+def classify_non_author_closed(issue, bot_comment, state_reason):
+    """Non-author (staff or triager) closed an issue the bot had commented on."""
+    if state_reason == "duplicate":
+        classify_as_assist(issue, bot_comment)
+    else:
+        notes = f"Closed by staff/triager as {state_reason}, not duplicate"
+        print(f" -> Possible Noise, needs review ({notes})")
+        add_or_update_project_item(
+            issue["node_id"],
+            outcome="Noise",
+            closed_as=state_reason,
+            status="Needs review",
+            notes=notes,
+        )
+
+
+def classify_as_assist(issue, bot_comment):
+    """Staff member closed as duplicate after the bot commented. Check if the dup matches."""
+    suggested = parse_suggested_issues(bot_comment)
+    original = None
+    try:
+        original = get_closed_as_duplicate_of(issue["number"])
+    except (requests.RequestException, RuntimeError) as error:
+        print(f" Warning: failed to get the original-for the duplicate issue: {error}")
+
+    if original and suggested:
+        if original in suggested:
+            status = "Auto-classified"
+            notes = None
+            print(f" -> Assist (original #{original} matches bot suggestion)")
+        else:
+            status = "Needs review"
+            suggested_str = ", ".join(f"#{number}" for number in suggested)
+            notes = f"Bot suggested {suggested_str}; closed as dup of #{original}"
+            print(f" -> Possible Assist, needs review ({notes})")
+    else:
+        # couldn't determine original or no suggestions parsed
+        status = "Needs review"
+        if not original:
+            notes = "Could not determine original issue from timeline"
+        else:
+            notes = f"Closed as dup of #{original}; could not parse bot suggestions"
+        print(f" -> Possible Assist, needs review ({notes})")
+
+    add_or_update_project_item(
+        issue["node_id"], outcome="Assist", closed_as="duplicate", status=status, notes=notes)
+
+
+def classify_as_missed_opportunity(issue):
+    """Issue closed as duplicate but the bot never commented."""
+    print(" -> Missed opportunity")
+    add_or_update_project_item(
+        issue["node_id"], outcome="Missed opportunity", closed_as="duplicate", status="Auto-classified")
+
+
+def classify_open():
+    """Classify open, triaged, bot-commented issues as Noise."""
+    print("Classifying open issues")
+
+    query = (
+        f"repo:{REPO_OWNER}/{REPO_NAME} is:issue is:open "
+        f"commenter:app/{BOT_APP_SLUG} "
+        f'-label:"{NEEDS_TRIAGE_LABEL}" '
+        f"created:>={BOT_START_DATE}"
+    )
+    print(f" Search query: {query}")
+
+    results = github_search_issues(query)
+    print(f" Found {len(results)} candidate issues")
+
+    added, skipped, errors = 0, 0, 0
+    for item in results:
+        number = item["number"]
+        try:
+            type_name = (item.get("type") or {}).get("name")
+            author = (item.get("user") or {}).get("login", "")
+            node_id = item["node_id"]
+
+            skip_reason = (
+                f"type is {type_name}" if type_name not in ("Bug", "Crash")
+                else f"author {author} is staff" if is_staff_member(author)
+                else "already on the board" if find_project_item(node_id)
+                else "no bot duplicate comment found" if not get_bot_duplicate_comment(number)
+                else None
+            )
+            if skip_reason:
+                print(f" #{number}: skipping, {skip_reason}")
+                skipped += 1
+                continue
+
+            print(f" #{number}: adding as Noise")
+            add_or_update_project_item(node_id, outcome="Noise", status="Auto-classified")
+            added += 1
+        except Exception as error:  # broad catch: one issue failing shouldn't stop the sweep
+            print(f" #{number}: error processing issue, skipping: {error}")
+            errors += 1
+
+    print(f" Done: added {added}, skipped {skipped}, errors {errors}")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Track duplicate bot effectiveness on a GitHub project board.",
+    )
+    subparsers = parser.add_subparsers(dest="command", required=True)
+
+    classify_parser = subparsers.add_parser(
+        "classify-closed",
+        help="Classify a closed issue and add it to the project board.",
+    )
+    classify_parser.add_argument("issue_number", type=int)
+    classify_parser.add_argument("closer_login")
+    classify_parser.add_argument("state_reason")
+
+    subparsers.add_parser(
+        "classify-open",
+        help="Classify open, triaged, bot-commented issues as Noise.",
+    )
+
+    args = parser.parse_args()
+
+    GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "")
+    if not GITHUB_TOKEN:
+        print("Error: GITHUB_TOKEN environment variable is required")
+        sys.exit(1)
+
+    raw_project_number = os.environ.get("PROJECT_NUMBER", "")
+    if raw_project_number:
+        try:
+            PROJECT_NUMBER = int(raw_project_number)
+        except ValueError:
+            print(f"Error: PROJECT_NUMBER must be an integer, got '{raw_project_number}'")
+            sys.exit(1)
+    else:
+        PROJECT_NUMBER = DEFAULT_PROJECT_NUMBER
+
+    GITHUB_HEADERS = {
+        "Authorization": f"token {GITHUB_TOKEN}",
+        "Accept": "application/vnd.github+json",
+    }
+
+    if args.command == "classify-closed":
+        classify_closed(args.issue_number, args.closer_login, args.state_reason)
+    elif args.command == "classify-open":
+        classify_open()