diff --git a/script/github-check-new-issue-for-duplicates.py b/script/github-check-new-issue-for-duplicates.py index c8978d421b0c68779dfcd98f628ff1af6f57c91e..245d9aaa9a85bcab686b15bef6dbec7b7aaf6545 100644 --- a/script/github-check-new-issue-for-duplicates.py +++ b/script/github-check-new-issue-for-duplicates.py @@ -89,7 +89,12 @@ def post_comment(issue_number: int, body): def build_duplicate_comment(matches): """Build the comment body for potential duplicates.""" match_list = "\n".join(f"- #{m['number']}" for m in matches) - explanations = "\n\n".join(f"**#{m['number']}:** {m['explanation']}" for m in matches) + explanations = "\n\n".join( + f"**#{m['number']}:** {m['explanation']}\n\n**Shared root cause:** {m['shared_root_cause']}" + if m.get('shared_root_cause') + else f"**#{m['number']}:** {m['explanation']}" + for m in matches + ) return f"""This issue appears to be a duplicate of: @@ -307,7 +312,7 @@ def enrich_magnets(magnets): for magnet in magnets: data = github_api_get(f"/repos/{REPO_OWNER}/{REPO_NAME}/issues/{magnet['number']}") magnet["title"] = data["title"] - magnet["body_preview"] = (data.get("body") or "")[:500] + magnet["body_preview"] = (data.get("body") or "")[:1000] def areas_match(detected, magnet_area): @@ -381,7 +386,7 @@ def search_for_similar_issues(issue, detected_areas, max_searches=6): "title": item["title"], "state": item.get("state", ""), "created_at": item.get("created_at", ""), - "body_preview": body[:500], + "body_preview": body[:1000], "source": search_type, } except requests.RequestException as e: @@ -414,12 +419,30 @@ def analyze_duplicates(anthropic_key, issue, magnets, search_results): system_prompt = """You analyze GitHub issues to identify potential duplicates. -Given a new issue and a list of existing issues, identify which existing issues might be duplicates. +Given a new issue and a list of existing issues, identify which existing issues are duplicates — meaning +they are caused by the SAME BUG in the code, not just similar symptoms. + +CRITICAL DISTINCTION — shared symptoms vs shared root cause: +- "models missing", "can't sign in", "editor hangs", "venv not detected" are SYMPTOMS that many + different bugs can produce. Two reports of the same symptom are NOT duplicates unless you can + identify a specific shared root cause. +- A duplicate means: if a developer fixed the existing issue, the new issue would also be fixed. +- If the issues just happen to be in the same feature area, or describe similar-sounding problems + with different specifics (different error messages, different triggers, different platforms, different + configurations), they are NOT duplicates. For each potential duplicate, assess confidence: -- "high": Very likely the same issue (same root cause, same symptoms) -- "medium": Possibly related (likely to be the same root cause) -- Do NOT include tangentially related issues (same general area but probably different issues) +- "high": Almost certainly the same bug. You can name a specific shared root cause, and the + reproduction steps / error messages / triggers are consistent. +- "medium": Likely the same bug based on specific technical details, but some uncertainty remains. +- Do NOT include issues that merely share symptoms, affect the same feature area, or sound similar + at a surface level. + +Examples of things that are NOT duplicates: +- Two issues about "Copilot models not showing" — one caused by a Zed update breaking the model list, + the other caused by the user's plan not including those models. +- Two issues about "Zed hangs" — one triggered by network drives, the other by large projects. +- Two issues about "can't sign in" — one caused by a missing system package, the other by a server-side error. Output only valid JSON (no markdown code blocks) with this structure: { @@ -427,13 +450,18 @@ Output only valid JSON (no markdown code blocks) with this structure: { "number": 12345, "confidence": "high|medium", - "explanation": "Brief explanation of why this might be a duplicate" + "shared_root_cause": "The specific bug/root cause shared by both issues", + "explanation": "Brief explanation with concrete evidence from both issues" } ], "summary": "One sentence summary of findings" } -Only include matches with "high" or "medium" confidence. Return empty matches array if none found.""" +When in doubt, return an empty matches array. A false positive (flagging a non-duplicate) is much +worse than a false negative (missing a real duplicate), because it wastes the time of both the +issue author and the maintainers. + +Return empty matches array if none found or if you can only identify shared symptoms.""" user_content = f"""## New Issue #{issue['number']} **Title:** {issue['title']} diff --git a/script/github-track-duplicate-bot-effectiveness.py b/script/github-track-duplicate-bot-effectiveness.py index a3056e856da717783ca0fc6538a131bb8a1d1d73..ca1ec5a9165bb9264dac1ad3fba7345a12d90f55 100644 --- a/script/github-track-duplicate-bot-effectiveness.py +++ b/script/github-track-duplicate-bot-effectiveness.py @@ -39,6 +39,10 @@ BOT_START_DATE = "2026-02-18" NEEDS_TRIAGE_LABEL = "state:needs triage" DEFAULT_PROJECT_NUMBER = 76 VALID_CLOSED_AS_VALUES = {"duplicate", "not_planned", "completed"} +# Bump this when the duplicate-detection bot's behavior changes in a way that +# could affect outcome rates (e.g. prompt rewrites, model swaps, candidate +# filtering changes). Don't bump for unrelated changes like comment formatting. +BOT_VERSION = "v2" def github_api_get(path, params=None): @@ -279,6 +283,8 @@ def add_or_update_project_item(issue_node_id, outcome, closed_as=None, status="A if notes: set_field_value(item_id, "Notes", notes) + set_field_value(item_id, "Bot version", BOT_VERSION) + return item_id