token_match_debug.rs

  1use crate::{example::read_example_files, metrics};
  2use anyhow::Context as _;
  3use clap::Args;
  4use std::fmt::Write as _;
  5use std::path::PathBuf;
  6
// Arguments for the token-match debug report generator (invoked as
// `ep token-match-debug` in the usage examples below).
//
// NOTE: plain `//` comments are used for these notes on purpose. The `///`
// comments on the fields are consumed by clap's derive macro as the per-flag
// help text, so editing them changes the `--help` output.
#[derive(Args, Debug, Clone)]
#[command(
    about = "Generate token-match debug HTML for expected vs predicted patches",
    after_help = r#"EXAMPLES:
  # Debug all examples from a jsonl dataset
  ep token-match-debug examples.jsonl

  # Write HTML files to a specific directory
  ep token-match-debug examples.jsonl --output-dir out/token-debug

  # Keep only the best expected patch per prediction
  ep token-match-debug examples.jsonl --best-only

  # Limit generated files
  ep token-match-debug examples.jsonl --limit 50
"#
)]
pub struct TokenMatchDebugArgs {
    // Defaults to the relative path `token-match-debug`.
    /// Directory where HTML reports are written.
    #[arg(long, default_value = "token-match-debug")]
    pub output_dir: PathBuf,

    // When unset, one report is produced for every expected patch of every
    // prediction.
    /// Only emit one report per prediction (best matching expected patch).
    #[arg(long, default_value_t = false)]
    pub best_only: bool,

    // `None` (flag omitted) means the number of reports is not capped.
    /// Maximum number of reports to write.
    #[arg(long)]
    pub limit: Option<usize>,
}
 37
 38pub fn run_token_match_debug(args: &TokenMatchDebugArgs, inputs: &[PathBuf]) -> anyhow::Result<()> {
 39    let stdin_path = PathBuf::from("-");
 40    let inputs = if inputs.is_empty() {
 41        std::slice::from_ref(&stdin_path)
 42    } else {
 43        inputs
 44    };
 45
 46    let examples = read_example_files(inputs);
 47    std::fs::create_dir_all(&args.output_dir).with_context(|| {
 48        format!(
 49            "failed to create output directory '{}'",
 50            args.output_dir.display()
 51        )
 52    })?;
 53
 54    let mut written = 0usize;
 55    for example in &examples {
 56        let expected_patches = example.spec.expected_patches_with_cursor_positions();
 57        if expected_patches.is_empty() || example.predictions.is_empty() {
 58            continue;
 59        }
 60
 61        for (prediction_index, prediction) in example.predictions.iter().enumerate() {
 62            let Some(actual_patch) = prediction.actual_patch.as_deref() else {
 63                continue;
 64            };
 65            if actual_patch.trim().is_empty() {
 66                continue;
 67            }
 68
 69            if args.best_only {
 70                if let Some((expected_index, report)) =
 71                    best_expected_patch_report(&expected_patches, actual_patch)
 72                {
 73                    let html = render_report_html(
 74                        &example.spec.name,
 75                        prediction_index,
 76                        expected_index,
 77                        &expected_patches[expected_index].0,
 78                        actual_patch,
 79                        &report,
 80                    );
 81
 82                    let path = args.output_dir.join(report_filename(
 83                        &example.spec.filename(),
 84                        prediction_index,
 85                        expected_index,
 86                    ));
 87                    std::fs::write(&path, html)
 88                        .with_context(|| format!("failed to write report '{}'", path.display()))?;
 89                    written += 1;
 90                    if args.limit.is_some_and(|limit| written >= limit) {
 91                        eprintln!(
 92                            "Wrote {} report(s) to {}",
 93                            written,
 94                            args.output_dir.display()
 95                        );
 96                        return Ok(());
 97                    }
 98                }
 99                continue;
100            }
101
102            for (expected_index, (expected_patch, _)) in expected_patches.iter().enumerate() {
103                let report = metrics::token_match_debug_report(expected_patch, actual_patch);
104                let html = render_report_html(
105                    &example.spec.name,
106                    prediction_index,
107                    expected_index,
108                    expected_patch,
109                    actual_patch,
110                    &report,
111                );
112                let path = args.output_dir.join(report_filename(
113                    &example.spec.filename(),
114                    prediction_index,
115                    expected_index,
116                ));
117
118                std::fs::write(&path, html)
119                    .with_context(|| format!("failed to write report '{}'", path.display()))?;
120                written += 1;
121
122                if args.limit.is_some_and(|limit| written >= limit) {
123                    eprintln!(
124                        "Wrote {} report(s) to {}",
125                        written,
126                        args.output_dir.display()
127                    );
128                    return Ok(());
129                }
130            }
131        }
132    }
133
134    eprintln!(
135        "Wrote {} report(s) to {}",
136        written,
137        args.output_dir.display()
138    );
139    Ok(())
140}
141
142fn best_expected_patch_report(
143    expected_patches: &[(String, Option<usize>)],
144    actual_patch: &str,
145) -> Option<(usize, metrics::TokenMatchDebugReport)> {
146    let mut best: Option<(usize, metrics::TokenMatchDebugReport)> = None;
147    for (index, (expected_patch, _)) in expected_patches.iter().enumerate() {
148        let report = metrics::token_match_debug_report(expected_patch, actual_patch);
149        match &best {
150            Some((_, current)) => {
151                if metrics::compare_classification_metrics(&report.metrics, &current.metrics)
152                    .is_gt()
153                {
154                    best = Some((index, report));
155                }
156            }
157            None => best = Some((index, report)),
158        }
159    }
160    best
161}
162
/// Builds the file name of a single report:
/// `<example_name>__prediction-<prediction_index>__expected-<expected_index>.html`.
fn report_filename(example_name: &str, prediction_index: usize, expected_index: usize) -> String {
    format!("{example_name}__prediction-{prediction_index}__expected-{expected_index}.html")
}
169
170fn render_report_html(
171    example_name: &str,
172    prediction_index: usize,
173    expected_index: usize,
174    expected_patch: &str,
175    actual_patch: &str,
176    report: &metrics::TokenMatchDebugReport,
177) -> String {
178    let mut html = String::new();
179
180    let precision = report.metrics.precision() * 100.0;
181    let recall = report.metrics.recall() * 100.0;
182    let f1 = report.metrics.f1() * 100.0;
183
184    let _ = write!(
185        html,
186        r#"<!doctype html>
187<html lang="en">
188<head>
189<meta charset="utf-8" />
190<meta name="viewport" content="width=device-width, initial-scale=1" />
191<title>Token Match Debug</title>
192<style>
193:root {{
194  color-scheme: light dark;
195  --bg: #0f1115;
196  --panel: #161a22;
197  --muted: #9ca3af;
198  --text: #e5e7eb;
199  --tp: #22c55e33;
200  --fp: #ef444433;
201  --fn: #f59e0b33;
202  --border: #2a3140;
203}}
204* {{ box-sizing: border-box; }}
205body {{
206  margin: 0;
207  font-family: ui-sans-serif, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
208  background: var(--bg);
209  color: var(--text);
210}}
211main {{
212  max-width: 1400px;
213  margin: 0 auto;
214  padding: 20px;
215}}
216h1, h2, h3 {{
217  margin: 0 0 10px;
218}}
219.meta {{
220  color: var(--muted);
221  margin-bottom: 16px;
222}}
223.grid {{
224  display: grid;
225  gap: 16px;
226}}
227.grid.two {{
228  grid-template-columns: repeat(2, minmax(0, 1fr));
229}}
230.panel {{
231  border: 1px solid var(--border);
232  background: var(--panel);
233  border-radius: 10px;
234  padding: 12px;
235}}
236.metrics {{
237  display: flex;
238  gap: 18px;
239  flex-wrap: wrap;
240}}
241.metric {{
242  min-width: 160px;
243}}
244.metric .label {{
245  color: var(--muted);
246  font-size: 12px;
247}}
248.metric .value {{
249  font-weight: 700;
250  font-size: 20px;
251}}
252pre {{
253  white-space: pre-wrap;
254  word-break: break-word;
255  margin: 0;
256  font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, monospace;
257  font-size: 12px;
258  line-height: 1.5;
259}}
260.legend {{
261  display: flex;
262  gap: 12px;
263  font-size: 12px;
264  color: var(--muted);
265  flex-wrap: wrap;
266}}
267.swatch {{
268  display: inline-block;
269  width: 12px;
270  height: 12px;
271  border-radius: 2px;
272  margin-right: 4px;
273  vertical-align: middle;
274}}
275.tp {{ background: var(--tp); }}
276.fp {{ background: var(--fp); }}
277.fn {{ background: var(--fn); }}
278.token.tp {{ background: var(--tp); }}
279.token.fp {{ background: var(--fp); }}
280.token.fn {{ background: var(--fn); }}
281.token {{
282  border-radius: 3px;
283}}
284.section-title {{
285  margin-bottom: 8px;
286  color: var(--muted);
287  font-size: 12px;
288  text-transform: uppercase;
289  letter-spacing: 0.05em;
290}}
291</style>
292</head>
293<body>
294<main>
295  <h1>Token Match Debug</h1>
296  <p class="meta">Example: {example_name} · Prediction #{prediction_index} · Expected Patch #{expected_index}</p>
297
298  <section class="panel">
299    <div class="metrics">
300      <div class="metric"><div class="label">Precision</div><div class="value">{precision:.1}%</div></div>
301      <div class="metric"><div class="label">Recall</div><div class="value">{recall:.1}%</div></div>
302      <div class="metric"><div class="label">F1</div><div class="value">{f1:.1}%</div></div>
303      <div class="metric"><div class="label">TP</div><div class="value">{tp}</div></div>
304      <div class="metric"><div class="label">FP</div><div class="value">{fp}</div></div>
305      <div class="metric"><div class="label">FN</div><div class="value">{fn}</div></div>
306    </div>
307    <div class="legend" style="margin-top: 10px;">
308      <span><span class="swatch tp"></span>True Positive</span>
309      <span><span class="swatch fp"></span>False Positive</span>
310      <span><span class="swatch fn"></span>False Negative</span>
311    </div>
312  </section>
313
314  <div class="grid two" style="margin-top: 16px;">
315    <section class="panel">
316      <div class="section-title">Expected patch</div>
317      <pre>{expected_patch}</pre>
318    </section>
319    <section class="panel">
320      <div class="section-title">Actual patch</div>
321      <pre>{actual_patch}</pre>
322    </section>
323  </div>
324
325  <div class="grid two" style="margin-top: 16px;">
326    <section class="panel">
327      <h3>Deleted-side token alignment</h3>
328      <div class="section-title">Expected deleted text</div>
329      <pre>{expected_deleted_text}</pre>
330      <div class="section-title" style="margin-top: 10px;">Actual deleted text</div>
331      <pre>{actual_deleted_text}</pre>
332      <div class="section-title" style="margin-top: 10px;">Expected deleted tokens (FN highlighted)</div>
333      <pre>{deleted_expected_tokens}</pre>
334      <div class="section-title" style="margin-top: 10px;">Actual deleted tokens (FP highlighted)</div>
335      <pre>{deleted_actual_tokens}</pre>
336    </section>
337
338    <section class="panel">
339      <h3>Inserted-side token alignment</h3>
340      <div class="section-title">Expected inserted text</div>
341      <pre>{expected_inserted_text}</pre>
342      <div class="section-title" style="margin-top: 10px;">Actual inserted text</div>
343      <pre>{actual_inserted_text}</pre>
344      <div class="section-title" style="margin-top: 10px;">Expected inserted tokens (FN highlighted)</div>
345      <pre>{inserted_expected_tokens}</pre>
346      <div class="section-title" style="margin-top: 10px;">Actual inserted tokens (FP highlighted)</div>
347      <pre>{inserted_actual_tokens}</pre>
348    </section>
349  </div>
350</main>
351</body>
352</html>"#,
353        example_name = escape_html(example_name),
354        prediction_index = prediction_index,
355        expected_index = expected_index,
356        precision = precision,
357        recall = recall,
358        f1 = f1,
359        tp = report.metrics.true_positives,
360        fp = report.metrics.false_positives,
361        fn = report.metrics.false_negatives,
362        expected_patch = escape_html(expected_patch),
363        actual_patch = escape_html(actual_patch),
364        expected_deleted_text = escape_html(&report.expected_deleted_text),
365        actual_deleted_text = escape_html(&report.actual_deleted_text),
366        expected_inserted_text = escape_html(&report.expected_inserted_text),
367        actual_inserted_text = escape_html(&report.actual_inserted_text),
368        deleted_expected_tokens = render_classified_tokens(&report.deleted.expected_tokens),
369        deleted_actual_tokens = render_classified_tokens(&report.deleted.actual_tokens),
370        inserted_expected_tokens = render_classified_tokens(&report.inserted.expected_tokens),
371        inserted_actual_tokens = render_classified_tokens(&report.inserted.actual_tokens),
372    );
373
374    html
375}
376
377fn render_classified_tokens(tokens: &[metrics::ClassifiedToken]) -> String {
378    let mut result = String::new();
379    for token in tokens {
380        let class = match token.class {
381            metrics::TokenClass::TruePositive => "tp",
382            metrics::TokenClass::FalsePositive => "fp",
383            metrics::TokenClass::FalseNegative => "fn",
384        };
385        let escaped = escape_html(&token.token);
386        let _ = write!(result, r#"<span class="token {class}">{escaped}</span>"#);
387    }
388    result
389}
390
/// Returns `input` with the five HTML-significant characters replaced by
/// entities: `&` → `&amp;`, `<` → `&lt;`, `>` → `&gt;`, `"` → `&quot;` and
/// `'` → `&#39;`. Every other character is copied through unchanged.
fn escape_html(input: &str) -> String {
    // The output is at least as long as the input, so reserve that up front.
    let mut escaped = String::with_capacity(input.len());
    for ch in input.chars() {
        let entity = match ch {
            '&' => "&amp;",
            '<' => "&lt;",
            '>' => "&gt;",
            '"' => "&quot;",
            '\'' => "&#39;",
            _ => {
                escaped.push(ch);
                continue;
            }
        };
        escaped.push_str(entity);
    }
    escaped
}