1#!/usr/bin/env python3
2"""
3Find open issues that have the most duplicates filed against them and update
4a GitHub issue with the results.
5
6Queries open issues and looks for MarkedAsDuplicateEvent in their timelines.
7Only includes issues that have been re-reported at least twice (2+ duplicates
8closed against them). Groups results by area: label. The output is formatted
9as markdown with issue URLs (GitHub renders the titles automatically).
10
11This script is run regularly by the update_duplicate_magnets.yml workflow.
12
13Requires: requests (pip install requests)
14GitHub token permissions: issues:write
15
16Usage:
17 # Print to stdout only for testing:
18 python github-find-top-duplicated-bugs.py --github-token ghp_xxx
19
20 # Update a GitHub issue:
21 python github-find-top-duplicated-bugs.py --github-token ghp_xxx --issue-number 46355
22"""
23
24import argparse
25import os
26import sys
27from collections import Counter, defaultdict
28
29import requests
30
# Repository whose issues are scanned and updated.
OWNER = "zed-industries"
REPO = "zed"

GRAPHQL_URL = "https://api.github.com/graphql"
REST_API_URL = "https://api.github.com"

# Request headers shared by all HTTP helpers below; populated in __main__
# once the token is known.
headers = None

# Pages through open issues (most recently updated first). For each issue we
# pull up to 100 MarkedAsDuplicateEvent timeline items so we can count how
# many other issues were marked as duplicates of it. The inline fragments
# are required because `duplicate` is a union (Issue | PullRequest).
ISSUES_WITH_DUPLICATES_QUERY = """
query($owner: String!, $repo: String!, $cursor: String) {
  repository(owner: $owner, name: $repo) {
    issues(
      first: 100
      after: $cursor
      states: [OPEN]
      orderBy: {field: UPDATED_AT, direction: DESC}
    ) {
      pageInfo {
        hasNextPage
        endCursor
      }
      nodes {
        number
        url
        labels(first: 20) {
          nodes {
            name
          }
        }
        timelineItems(first: 100, itemTypes: [MARKED_AS_DUPLICATE_EVENT]) {
          nodes {
            ... on MarkedAsDuplicateEvent {
              duplicate {
                ... on Issue {
                  number
                  state
                }
              }
            }
          }
        }
      }
    }
  }
}
"""
77
78
def extract_duplicate_info(issue):
    """Summarize the duplicates closed against a single GraphQL issue node.

    Counts the distinct CLOSED issues that were marked as duplicates of
    this issue. Returns None if fewer than 2 such duplicates exist;
    otherwise a dict with keys "number", "url", "areas" (the issue's
    "area:" label suffixes, or ["(unlabeled)"]) and "duplicate_count".
    """
    seen_duplicates = set()
    for event in issue["timelineItems"]["nodes"]:
        try:
            # Only count duplicates that stayed closed; an OPEN duplicate
            # was likely marked by mistake or has been reopened.
            if event["duplicate"]["state"] == "CLOSED":
                seen_duplicates.add(event["duplicate"]["number"])
        except (KeyError, TypeError):
            # The duplicate may be a PullRequest (the inline fragment only
            # selects Issue fields) or the reference may be null; skip it.
            continue

    if len(seen_duplicates) < 2:
        return None

    labels = [label["name"] for label in issue["labels"]["nodes"]]
    # Strip only the leading "area:" prefix. (str.replace would also mangle
    # any later occurrence of "area:" inside a label name.)
    prefix = "area:"
    areas = [label[len(prefix):] for label in labels if label.startswith(prefix)]

    return {
        "number": issue["number"],
        "url": issue["url"],
        "areas": areas if areas else ["(unlabeled)"],
        "duplicate_count": len(seen_duplicates),
    }
101
102
def fetch_canonical_issues_with_duplicates(max_pages=100):
    """Fetch open issues and count how many closed duplicates point to each.

    Pages through the repository's open issues (newest-updated first) via
    the GraphQL API, scanning at most `max_pages` pages of 100 issues, and
    collects the summaries produced by extract_duplicate_info().

    Returns a list of summary dicts for issues with 2+ closed duplicates.
    Raises requests.HTTPError on a non-2xx response.
    """
    print(f"Finding open issues with the most duplicates in {OWNER}/{REPO}")

    cursor = None
    duplicate_magnets = []
    total_issues_scanned = 0

    for page in range(max_pages):
        response = requests.post(
            GRAPHQL_URL,
            headers=headers,
            json={
                "query": ISSUES_WITH_DUPLICATES_QUERY,
                "variables": {"owner": OWNER, "repo": REPO, "cursor": cursor},
            },
            # requests has no default timeout; without one a stalled
            # connection would hang the CI workflow indefinitely.
            timeout=30,
        )
        response.raise_for_status()
        data = response.json()

        # GraphQL reports errors in the body alongside HTTP 200. Keep
        # whatever was collected so far (best-effort) instead of failing.
        if "errors" in data:
            print(f"GraphQL errors: {data['errors']}")
            break

        issues = data["data"]["repository"]["issues"]
        total_issues_scanned += len(issues["nodes"])

        for issue in issues["nodes"]:
            if info := extract_duplicate_info(issue):
                duplicate_magnets.append(info)

        page_info = issues["pageInfo"]
        if not page_info["hasNextPage"]:
            print(f"Done: scanned {total_issues_scanned} open issues")
            break
        cursor = page_info["endCursor"]

        print(
            f"Page {page + 1}: scanned {total_issues_scanned} open issues, "
            f"{len(duplicate_magnets)} have duplicates"
        )

    return duplicate_magnets
146
147
def build_markdown_body(duplicate_magnets):
    """Group results by area and render the markdown body for the issue.

    Areas are ordered by their total duplicate count (descending); within
    each area, issues are ordered by their own duplicate count.

    NOTE: the output format is parsed by fetch_duplicate_magnets() in
    github-check-new-issue-for-duplicates.py — update that if you change this.
    """
    grouped = defaultdict(list)
    area_weight = Counter()
    for entry in duplicate_magnets:
        for area in entry["areas"]:
            grouped[area].append(entry)
            area_weight[area] += entry["duplicate_count"]

    parts = [
        "These are the issues that are frequently re-reported. "
        "The list is generated regularly by running a script."
    ]

    for area, _total in area_weight.most_common():
        parts.extend(["", f"## {area}", ""])
        ranked = sorted(
            grouped[area],
            key=lambda entry: entry["duplicate_count"],
            reverse=True,
        )
        parts.extend(
            f"- [{entry['duplicate_count']:2d} dupes] {entry['url']}"
            for entry in ranked
        )

    return "\n".join(parts)
179
180
def update_github_issue(issue_number, body):
    """Replace the body of GitHub issue #issue_number via the REST API.

    Raises requests.HTTPError if the PATCH request fails.
    """
    url = f"{REST_API_URL}/repos/{OWNER}/{REPO}/issues/{issue_number}"
    # requests has no default timeout; without one a stalled connection
    # would hang the CI workflow indefinitely.
    response = requests.patch(url, headers=headers, json={"body": body}, timeout=30)
    response.raise_for_status()
    print(f"Updated issue #{issue_number}")
187
188
def parse_args():
    """Parse command-line options.

    --github-token falls back to the GITHUB_TOKEN environment variable;
    --issue-number is optional (omitting it means print-only mode).
    """
    cli = argparse.ArgumentParser(
        description="Find open issues with the most duplicates filed against them."
    )
    token_default = os.environ.get("GITHUB_TOKEN")
    cli.add_argument(
        "--github-token",
        default=token_default,
        help="GitHub token (or set GITHUB_TOKEN env var)",
    )
    cli.add_argument(
        "--issue-number",
        type=int,
        help="GitHub issue number to update (if not provided, prints to stdout)",
    )
    return cli.parse_args()
204
205
if __name__ == "__main__":
    args = parse_args()

    # A token is required even for print-only runs: the GraphQL endpoint
    # rejects unauthenticated requests.
    if not args.github_token:
        print("Error: --github-token is required (or set GITHUB_TOKEN env var)")
        sys.exit(1)

    # This rebinds the module-level `headers` global that the request
    # helpers above read — it must stay at module scope, not inside a main().
    headers = {
        "Authorization": f"Bearer {args.github_token}",
        "Content-Type": "application/json",
    }

    # With --issue-number, publish the report; otherwise dry-run to stdout.
    # If no duplicate magnets were found (or the scan errored out early),
    # do nothing rather than overwrite the issue with an empty report.
    if duplicate_magnets := fetch_canonical_issues_with_duplicates():
        body = build_markdown_body(duplicate_magnets)
        if args.issue_number:
            update_github_issue(args.issue_number, body)
        else:
            print(body)