From 0ac17526687bf11007f0fbb5c3b2ff463ce47293 Mon Sep 17 00:00:00 2001
From: Joseph Mearman <joseph@mearman.co.uk>
Date: Tue, 9 Sep 2025 15:39:09 +0100
Subject: [PATCH] terminal: Sanitize trailing periods in URL detection (#37684)

Fixes #12338, related to #37616

This change improves URL detection in the terminal by removing trailing
periods that appear to be sentence punctuation rather than part of the
URL structure. It builds upon the parentheses sanitization work from
#37076 by consolidating both approaches into a unified
`sanitize_url_punctuation` function.

## Changes
- Combines parentheses and period sanitization into a single
`sanitize_url_punctuation` function
- Uses optimized single traversal with `fold()` for parentheses counting
(addressing code review feedback)
- Removes trailing periods using heuristics to distinguish sentence
punctuation from legitimate URL components
- Removes multiple trailing periods (always considered punctuation)
- Removes single trailing periods when they appear after alphanumeric
characters or slashes
- Preserves periods that are part of legitimate URL structure (e.g.,
version numbers, IP addresses, subdomains)
- Maintains existing parentheses balancing logic from #37076

## Implementation Details
- **Parentheses handling**: Counts opening and closing parentheses,
removes trailing `)` when unbalanced
- **Period handling**: Uses `take_while()` iterator for efficient period
counting
- **Performance**: Single pass counting with optimized loop to avoid
redundant work
- **Code clarity**: Uses let-else pattern for readable conditional logic

## Testing
- Added comprehensive test coverage for both parentheses and period
sanitization
- Tests cover balanced vs unbalanced parentheses cases
- Tests cover various period scenarios including legitimate URL periods
vs sentence punctuation
- All existing tests continue to pass

## Release Notes

- Improved terminal URL detection by further trimming trailing
punctuation. URLs ending with periods (like
`https://example.com.`) and unbalanced parentheses (like
`https://example.com/path)`) are now properly detected without including
the trailing punctuation.
---
 crates/terminal/src/terminal_hyperlinks.rs | 149 ++++++++++++++++++++-
 1 file changed, 146 insertions(+), 3 deletions(-)
diff --git a/crates/terminal/src/terminal_hyperlinks.rs b/crates/terminal/src/terminal_hyperlinks.rs
index 4126b3d948072e8d8d24b84fce4574fba889e492..2d3d356b4663a8aa271dda8d36d5fab720228527 100644
--- a/crates/terminal/src/terminal_hyperlinks.rs
+++ b/crates/terminal/src/terminal_hyperlinks.rs
@@ -79,7 +79,8 @@ pub(super) fn find_from_grid_point<T: EventListener>(
         Some((url, true, url_match))
     } else if let Some(url_match) = regex_match_at(term, point, &mut regex_searches.url_regex) {
         let url = term.bounds_to_string(*url_match.start(), *url_match.end());
-        Some((url, true, url_match))
+        let (sanitized_url, sanitized_match) = sanitize_url_punctuation(url, url_match, term);
+        Some((sanitized_url, true, sanitized_match))
     } else if let Some(python_match) =
         regex_match_at(term, point, &mut regex_searches.python_file_line_regex)
     {
@@ -164,6 +165,63 @@ pub(super) fn find_from_grid_point<T: EventListener>(
     })
 }
 
+fn sanitize_url_punctuation<T: EventListener>(
+    url: String,
+    url_match: Match,
+    term: &Term<T>,
+) -> (String, Match) {
+    let mut sanitized_url = url;
+    let mut chars_trimmed = 0;
+
+    // First, handle parentheses balancing using single traversal
+    let (open_parens, close_parens) =
+        sanitized_url
+            .chars()
+            .fold((0, 0), |(opens, closes), c| match c {
+                '(' => (opens + 1, closes),
+                ')' => (opens, closes + 1),
+                _ => (opens, closes),
+            });
+
+    // Trim unbalanced closing parentheses
+    if close_parens > open_parens {
+        let mut remaining_close = close_parens;
+        while sanitized_url.ends_with(')') && remaining_close > open_parens {
+            sanitized_url.pop();
+            chars_trimmed += 1;
+            remaining_close -= 1;
+        }
+    }
+
+    // Handle trailing periods
+    if sanitized_url.ends_with('.') {
+        let trailing_periods = sanitized_url
+            .chars()
+            .rev()
+            .take_while(|&c| c == '.')
+            .count();
+
+        if trailing_periods > 1 {
+            sanitized_url.truncate(sanitized_url.len() - trailing_periods);
+            chars_trimmed += trailing_periods;
+        } else if trailing_periods == 1
+            && let Some(second_last_char) = sanitized_url.chars().rev().nth(1)
+            && (second_last_char.is_alphanumeric() || second_last_char == '/')
+        {
+            sanitized_url.pop();
+            chars_trimmed += 1;
+        }
+    }
+
+    if chars_trimmed > 0 {
+        let new_end = url_match.end().sub(term, Boundary::Grid, chars_trimmed);
+        let sanitized_match = Match::new(*url_match.start(), new_end);
+        (sanitized_url, sanitized_match)
+    } else {
+        (sanitized_url, url_match)
+    }
+}
+
 fn is_path_surrounded_by_common_symbols(path: &str) -> bool {
     // Avoid detecting `[]` or `()` strings as paths, surrounded by common symbols
     path.len() > 2
@@ -203,8 +261,8 @@ mod tests {
     use super::*;
     use alacritty_terminal::{
         event::VoidListener,
-        index::{Boundary, Point as AlacPoint},
-        term::{Config, cell::Flags, test::TermSize},
+        index::{Boundary, Column, Line, Point as AlacPoint},
+        term::{Config, cell::Flags, search::Match, test::TermSize},
         vte::ansi::Handler,
     };
     use std::{cell::RefCell, ops::RangeInclusive, path::PathBuf};
@@ -233,6 +291,91 @@ mod tests {
         );
     }
 
+    #[test]
+    fn test_url_parentheses_sanitization() {
+        // Test our sanitize_url_parentheses function directly
+        let test_cases = vec![
+            // Cases that should be sanitized (unbalanced parentheses)
+            ("https://www.google.com/)", "https://www.google.com/"),
+            ("https://example.com/path)", "https://example.com/path"),
+            ("https://test.com/))", "https://test.com/"),
+            // Cases that should NOT be sanitized (balanced parentheses)
+            (
+                "https://en.wikipedia.org/wiki/Example_(disambiguation)",
+                "https://en.wikipedia.org/wiki/Example_(disambiguation)",
+            ),
+            ("https://test.com/(hello)", "https://test.com/(hello)"),
+            (
+                "https://example.com/path(1)(2)",
+                "https://example.com/path(1)(2)",
+            ),
+            // Edge cases
+            ("https://test.com/", "https://test.com/"),
+            ("https://example.com", "https://example.com"),
+        ];
+
+        for (input, expected) in test_cases {
+            // Create a minimal terminal for testing
+            let term = Term::new(Config::default(), &TermSize::new(80, 24), VoidListener);
+
+            // Create a dummy match that spans the entire input
+            let start_point = AlacPoint::new(Line(0), Column(0));
+            let end_point = AlacPoint::new(Line(0), Column(input.len()));
+            let dummy_match = Match::new(start_point, end_point);
+
+            let (result, _) = sanitize_url_punctuation(input.to_string(), dummy_match, &term);
+            assert_eq!(result, expected, "Failed for input: {}", input);
+        }
+    }
+
+    #[test]
+    fn test_url_periods_sanitization() {
+        // Test URLs with trailing periods (sentence punctuation)
+        let test_cases = vec![
+            // Cases that should be sanitized (trailing periods likely punctuation)
+            ("https://example.com.", "https://example.com"),
+            (
+                "https://github.com/zed-industries/zed.",
+                "https://github.com/zed-industries/zed",
+            ),
+            (
+                "https://example.com/path/file.html.",
+                "https://example.com/path/file.html",
+            ),
+            (
+                "https://example.com/file.pdf.",
+                "https://example.com/file.pdf",
+            ),
+            ("https://example.com:8080.", "https://example.com:8080"),
+            ("https://example.com..", "https://example.com"),
+            (
+                "https://en.wikipedia.org/wiki/C.E.O.",
+                "https://en.wikipedia.org/wiki/C.E.O",
+            ),
+            // Cases that should NOT be sanitized (periods are part of URL structure)
+            (
+                "https://example.com/v1.0/api",
+                "https://example.com/v1.0/api",
+            ),
+            ("https://192.168.1.1", "https://192.168.1.1"),
+            ("https://sub.domain.com", "https://sub.domain.com"),
+        ];
+
+        for (input, expected) in test_cases {
+            // Create a minimal terminal for testing
+            let term = Term::new(Config::default(), &TermSize::new(80, 24), VoidListener);
+
+            // Create a dummy match that spans the entire input
+            let start_point = AlacPoint::new(Line(0), Column(0));
+            let end_point = AlacPoint::new(Line(0), Column(input.len()));
+            let dummy_match = Match::new(start_point, end_point);
+
+            // This test should initially fail since we haven't implemented period sanitization yet
+            let (result, _) = sanitize_url_punctuation(input.to_string(), dummy_match, &term);
+            assert_eq!(result, expected, "Failed for input: {}", input);
+        }
+    }
+
     #[test]
     fn test_word_regex() {
         re_test(