1use crate::{example::read_example_files, metrics};
2use anyhow::Context as _;
3use clap::Args;
4use std::fmt::Write as _;
5use std::path::PathBuf;
6
// Command-line arguments for the token-match debug report generator; consumed by
// `run_token_match_debug`.
//
// NOTE: the `///` comments on the fields below are picked up by clap's derive macro as the
// per-flag `--help` text, so they are user-facing strings. Maintainer notes in this block
// deliberately use plain `//` comments so the help output is unchanged.
#[derive(Args, Debug, Clone)]
#[command(
    about = "Generate token-match debug HTML for expected vs predicted patches",
    after_help = r#"EXAMPLES:
 # Debug all examples from a jsonl dataset
 ep token-match-debug examples.jsonl

 # Write HTML files to a specific directory
 ep token-match-debug examples.jsonl --output-dir out/token-debug

 # Keep only the best expected patch per prediction
 ep token-match-debug examples.jsonl --best-only

 # Limit generated files
 ep token-match-debug examples.jsonl --limit 50
"#
)]
pub struct TokenMatchDebugArgs {
    /// Directory where HTML reports are written.
    // Defaults to the relative path `token-match-debug`; `run_token_match_debug` creates it
    // (and any missing parents) with `create_dir_all` before writing reports.
    #[arg(long, default_value = "token-match-debug")]
    pub output_dir: PathBuf,

    /// Only emit one report per prediction (best matching expected patch).
    // When set, the single report is the one selected by `best_expected_patch_report`.
    #[arg(long, default_value_t = false)]
    pub best_only: bool,

    /// Maximum number of reports to write.
    // `None` (flag omitted) means no cap; the cap counts reports across all examples.
    #[arg(long)]
    pub limit: Option<usize>,
}
37
38pub fn run_token_match_debug(args: &TokenMatchDebugArgs, inputs: &[PathBuf]) -> anyhow::Result<()> {
39 let stdin_path = PathBuf::from("-");
40 let inputs = if inputs.is_empty() {
41 std::slice::from_ref(&stdin_path)
42 } else {
43 inputs
44 };
45
46 let examples = read_example_files(inputs);
47 std::fs::create_dir_all(&args.output_dir).with_context(|| {
48 format!(
49 "failed to create output directory '{}'",
50 args.output_dir.display()
51 )
52 })?;
53
54 let mut written = 0usize;
55 for example in &examples {
56 let expected_patches = example.spec.expected_patches_with_cursor_positions();
57 if expected_patches.is_empty() || example.predictions.is_empty() {
58 continue;
59 }
60
61 for (prediction_index, prediction) in example.predictions.iter().enumerate() {
62 let Some(actual_patch) = prediction.actual_patch.as_deref() else {
63 continue;
64 };
65 if actual_patch.trim().is_empty() {
66 continue;
67 }
68
69 if args.best_only {
70 if let Some((expected_index, report)) =
71 best_expected_patch_report(&expected_patches, actual_patch)
72 {
73 let html = render_report_html(
74 &example.spec.name,
75 prediction_index,
76 expected_index,
77 &expected_patches[expected_index].0,
78 actual_patch,
79 &report,
80 );
81
82 let path = args.output_dir.join(report_filename(
83 &example.spec.filename(),
84 prediction_index,
85 expected_index,
86 ));
87 std::fs::write(&path, html)
88 .with_context(|| format!("failed to write report '{}'", path.display()))?;
89 written += 1;
90 if args.limit.is_some_and(|limit| written >= limit) {
91 eprintln!(
92 "Wrote {} report(s) to {}",
93 written,
94 args.output_dir.display()
95 );
96 return Ok(());
97 }
98 }
99 continue;
100 }
101
102 for (expected_index, (expected_patch, _)) in expected_patches.iter().enumerate() {
103 let report = metrics::token_match_debug_report(expected_patch, actual_patch);
104 let html = render_report_html(
105 &example.spec.name,
106 prediction_index,
107 expected_index,
108 expected_patch,
109 actual_patch,
110 &report,
111 );
112 let path = args.output_dir.join(report_filename(
113 &example.spec.filename(),
114 prediction_index,
115 expected_index,
116 ));
117
118 std::fs::write(&path, html)
119 .with_context(|| format!("failed to write report '{}'", path.display()))?;
120 written += 1;
121
122 if args.limit.is_some_and(|limit| written >= limit) {
123 eprintln!(
124 "Wrote {} report(s) to {}",
125 written,
126 args.output_dir.display()
127 );
128 return Ok(());
129 }
130 }
131 }
132 }
133
134 eprintln!(
135 "Wrote {} report(s) to {}",
136 written,
137 args.output_dir.display()
138 );
139 Ok(())
140}
141
142fn best_expected_patch_report(
143 expected_patches: &[(String, Option<usize>)],
144 actual_patch: &str,
145) -> Option<(usize, metrics::TokenMatchDebugReport)> {
146 let mut best: Option<(usize, metrics::TokenMatchDebugReport)> = None;
147 for (index, (expected_patch, _)) in expected_patches.iter().enumerate() {
148 let report = metrics::token_match_debug_report(expected_patch, actual_patch);
149 match &best {
150 Some((_, current)) => {
151 if metrics::compare_classification_metrics(&report.metrics, ¤t.metrics)
152 .is_gt()
153 {
154 best = Some((index, report));
155 }
156 }
157 None => best = Some((index, report)),
158 }
159 }
160 best
161}
162
/// Builds the file name of a report:
/// `<example_name>__prediction-<prediction_index>__expected-<expected_index>.html`.
///
/// `example_name` is inserted verbatim; no sanitizing is done here.
fn report_filename(example_name: &str, prediction_index: usize, expected_index: usize) -> String {
    format!("{example_name}__prediction-{prediction_index}__expected-{expected_index}.html")
}
169
170fn render_report_html(
171 example_name: &str,
172 prediction_index: usize,
173 expected_index: usize,
174 expected_patch: &str,
175 actual_patch: &str,
176 report: &metrics::TokenMatchDebugReport,
177) -> String {
178 let mut html = String::new();
179
180 let precision = report.metrics.precision() * 100.0;
181 let recall = report.metrics.recall() * 100.0;
182 let f1 = report.metrics.f1() * 100.0;
183
184 let _ = write!(
185 html,
186 r#"<!doctype html>
187<html lang="en">
188<head>
189<meta charset="utf-8" />
190<meta name="viewport" content="width=device-width, initial-scale=1" />
191<title>Token Match Debug</title>
192<style>
193:root {{
194 color-scheme: light dark;
195 --bg: #0f1115;
196 --panel: #161a22;
197 --muted: #9ca3af;
198 --text: #e5e7eb;
199 --tp: #22c55e33;
200 --fp: #ef444433;
201 --fn: #f59e0b33;
202 --border: #2a3140;
203}}
204* {{ box-sizing: border-box; }}
205body {{
206 margin: 0;
207 font-family: ui-sans-serif, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
208 background: var(--bg);
209 color: var(--text);
210}}
211main {{
212 max-width: 1400px;
213 margin: 0 auto;
214 padding: 20px;
215}}
216h1, h2, h3 {{
217 margin: 0 0 10px;
218}}
219.meta {{
220 color: var(--muted);
221 margin-bottom: 16px;
222}}
223.grid {{
224 display: grid;
225 gap: 16px;
226}}
227.grid.two {{
228 grid-template-columns: repeat(2, minmax(0, 1fr));
229}}
230.panel {{
231 border: 1px solid var(--border);
232 background: var(--panel);
233 border-radius: 10px;
234 padding: 12px;
235}}
236.metrics {{
237 display: flex;
238 gap: 18px;
239 flex-wrap: wrap;
240}}
241.metric {{
242 min-width: 160px;
243}}
244.metric .label {{
245 color: var(--muted);
246 font-size: 12px;
247}}
248.metric .value {{
249 font-weight: 700;
250 font-size: 20px;
251}}
252pre {{
253 white-space: pre-wrap;
254 word-break: break-word;
255 margin: 0;
256 font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, monospace;
257 font-size: 12px;
258 line-height: 1.5;
259}}
260.legend {{
261 display: flex;
262 gap: 12px;
263 font-size: 12px;
264 color: var(--muted);
265 flex-wrap: wrap;
266}}
267.swatch {{
268 display: inline-block;
269 width: 12px;
270 height: 12px;
271 border-radius: 2px;
272 margin-right: 4px;
273 vertical-align: middle;
274}}
275.tp {{ background: var(--tp); }}
276.fp {{ background: var(--fp); }}
277.fn {{ background: var(--fn); }}
278.token.tp {{ background: var(--tp); }}
279.token.fp {{ background: var(--fp); }}
280.token.fn {{ background: var(--fn); }}
281.token {{
282 border-radius: 3px;
283}}
284.section-title {{
285 margin-bottom: 8px;
286 color: var(--muted);
287 font-size: 12px;
288 text-transform: uppercase;
289 letter-spacing: 0.05em;
290}}
291</style>
292</head>
293<body>
294<main>
295 <h1>Token Match Debug</h1>
296 <p class="meta">Example: {example_name} · Prediction #{prediction_index} · Expected Patch #{expected_index}</p>
297
298 <section class="panel">
299 <div class="metrics">
300 <div class="metric"><div class="label">Precision</div><div class="value">{precision:.1}%</div></div>
301 <div class="metric"><div class="label">Recall</div><div class="value">{recall:.1}%</div></div>
302 <div class="metric"><div class="label">F1</div><div class="value">{f1:.1}%</div></div>
303 <div class="metric"><div class="label">TP</div><div class="value">{tp}</div></div>
304 <div class="metric"><div class="label">FP</div><div class="value">{fp}</div></div>
305 <div class="metric"><div class="label">FN</div><div class="value">{fn}</div></div>
306 </div>
307 <div class="legend" style="margin-top: 10px;">
308 <span><span class="swatch tp"></span>True Positive</span>
309 <span><span class="swatch fp"></span>False Positive</span>
310 <span><span class="swatch fn"></span>False Negative</span>
311 </div>
312 </section>
313
314 <div class="grid two" style="margin-top: 16px;">
315 <section class="panel">
316 <div class="section-title">Expected patch</div>
317 <pre>{expected_patch}</pre>
318 </section>
319 <section class="panel">
320 <div class="section-title">Actual patch</div>
321 <pre>{actual_patch}</pre>
322 </section>
323 </div>
324
325 <div class="grid two" style="margin-top: 16px;">
326 <section class="panel">
327 <h3>Deleted-side token alignment</h3>
328 <div class="section-title">Expected deleted text</div>
329 <pre>{expected_deleted_text}</pre>
330 <div class="section-title" style="margin-top: 10px;">Actual deleted text</div>
331 <pre>{actual_deleted_text}</pre>
332 <div class="section-title" style="margin-top: 10px;">Expected deleted tokens (FN highlighted)</div>
333 <pre>{deleted_expected_tokens}</pre>
334 <div class="section-title" style="margin-top: 10px;">Actual deleted tokens (FP highlighted)</div>
335 <pre>{deleted_actual_tokens}</pre>
336 </section>
337
338 <section class="panel">
339 <h3>Inserted-side token alignment</h3>
340 <div class="section-title">Expected inserted text</div>
341 <pre>{expected_inserted_text}</pre>
342 <div class="section-title" style="margin-top: 10px;">Actual inserted text</div>
343 <pre>{actual_inserted_text}</pre>
344 <div class="section-title" style="margin-top: 10px;">Expected inserted tokens (FN highlighted)</div>
345 <pre>{inserted_expected_tokens}</pre>
346 <div class="section-title" style="margin-top: 10px;">Actual inserted tokens (FP highlighted)</div>
347 <pre>{inserted_actual_tokens}</pre>
348 </section>
349 </div>
350</main>
351</body>
352</html>"#,
353 example_name = escape_html(example_name),
354 prediction_index = prediction_index,
355 expected_index = expected_index,
356 precision = precision,
357 recall = recall,
358 f1 = f1,
359 tp = report.metrics.true_positives,
360 fp = report.metrics.false_positives,
361 fn = report.metrics.false_negatives,
362 expected_patch = escape_html(expected_patch),
363 actual_patch = escape_html(actual_patch),
364 expected_deleted_text = escape_html(&report.expected_deleted_text),
365 actual_deleted_text = escape_html(&report.actual_deleted_text),
366 expected_inserted_text = escape_html(&report.expected_inserted_text),
367 actual_inserted_text = escape_html(&report.actual_inserted_text),
368 deleted_expected_tokens = render_classified_tokens(&report.deleted.expected_tokens),
369 deleted_actual_tokens = render_classified_tokens(&report.deleted.actual_tokens),
370 inserted_expected_tokens = render_classified_tokens(&report.inserted.expected_tokens),
371 inserted_actual_tokens = render_classified_tokens(&report.inserted.actual_tokens),
372 );
373
374 html
375}
376
377fn render_classified_tokens(tokens: &[metrics::ClassifiedToken]) -> String {
378 let mut result = String::new();
379 for token in tokens {
380 let class = match token.class {
381 metrics::TokenClass::TruePositive => "tp",
382 metrics::TokenClass::FalsePositive => "fp",
383 metrics::TokenClass::FalseNegative => "fn",
384 };
385 let escaped = escape_html(&token.token);
386 let _ = write!(result, r#"<span class="token {class}">{escaped}</span>"#);
387 }
388 result
389}
390
/// Escapes `input` for safe inclusion in HTML text or a quoted attribute value.
///
/// The five HTML-significant characters are replaced by entities:
/// `&` → `&amp;`, `<` → `&lt;`, `>` → `&gt;`, `"` → `&quot;`, `'` → `&#39;`.
/// Every other character is copied unchanged. The input is processed one character at a
/// time, so the `&` of an emitted entity is never escaped a second time.
fn escape_html(input: &str) -> String {
    // `input.len()` is a lower bound; the buffer grows if anything needs escaping.
    let mut result = String::with_capacity(input.len());
    for character in input.chars() {
        match character {
            '&' => result.push_str("&amp;"),
            '<' => result.push_str("&lt;"),
            '>' => result.push_str("&gt;"),
            '"' => result.push_str("&quot;"),
            '\'' => result.push_str("&#39;"),
            _ => result.push(character),
        }
    }
    result
}