terminal: Sanitize URLs with characters that cannot be last (#43559)

Marco Mihai Condrache created

Closes #43345

The list of characters comes from the linkify crate, which is already
used for URL detection in the editor:


https://github.com/robinst/linkify/blob/5239e12e26c633f42323e51ed81b0ff534528077/src/url.rs#L228

Release Notes:

- Improved url links detection in terminals.

---------

Signed-off-by: Marco Mihai Condrache <52580954+marcocondrache@users.noreply.github.com>

Change summary

crates/terminal/src/terminal_hyperlinks.rs | 80 ++++++++++++++---------
1 file changed, 49 insertions(+), 31 deletions(-)

Detailed changes

crates/terminal/src/terminal_hyperlinks.rs 🔗

@@ -160,8 +160,8 @@ fn sanitize_url_punctuation<T: EventListener>(
     let mut sanitized_url = url;
     let mut chars_trimmed = 0;
 
-    // First, handle parentheses balancing using single traversal
-    let (open_parens, close_parens) =
+    // Count parentheses in the URL
+    let (open_parens, mut close_parens) =
         sanitized_url
             .chars()
             .fold((0, 0), |(opens, closes), c| match c {
@@ -170,33 +170,27 @@ fn sanitize_url_punctuation<T: EventListener>(
                 _ => (opens, closes),
             });
 
-    // Trim unbalanced closing parentheses
-    if close_parens > open_parens {
-        let mut remaining_close = close_parens;
-        while sanitized_url.ends_with(')') && remaining_close > open_parens {
-            sanitized_url.pop();
-            chars_trimmed += 1;
-            remaining_close -= 1;
-        }
-    }
+    // Remove trailing characters that shouldn't be at the end of URLs
+    while let Some(last_char) = sanitized_url.chars().last() {
+        let should_remove = match last_char {
+            // These may be part of a URL but not at the end. It's not that the spec
+            // doesn't allow them, but they are frequently used in plain text as delimiters
+            // where they're not meant to be part of the URL.
+            '.' | ',' | ':' | ';' => true,
+            '(' => true,
+            ')' if close_parens > open_parens => {
+                close_parens -= 1;
+
+                true
+            }
+            _ => false,
+        };
 
-    // Handle trailing periods
-    if sanitized_url.ends_with('.') {
-        let trailing_periods = sanitized_url
-            .chars()
-            .rev()
-            .take_while(|&c| c == '.')
-            .count();
-
-        if trailing_periods > 1 {
-            sanitized_url.truncate(sanitized_url.len() - trailing_periods);
-            chars_trimmed += trailing_periods;
-        } else if trailing_periods == 1
-            && let Some(second_last_char) = sanitized_url.chars().rev().nth(1)
-            && (second_last_char.is_alphanumeric() || second_last_char == '/')
-        {
+        if should_remove {
             sanitized_url.pop();
             chars_trimmed += 1;
+        } else {
+            break;
         }
     }
 
@@ -413,6 +407,8 @@ mod tests {
             ("https://www.google.com/)", "https://www.google.com/"),
             ("https://example.com/path)", "https://example.com/path"),
             ("https://test.com/))", "https://test.com/"),
+            ("https://test.com/(((", "https://test.com/"),
+            ("https://test.com/(test)(", "https://test.com/(test)"),
             // Cases that should NOT be sanitized (balanced parentheses)
             (
                 "https://en.wikipedia.org/wiki/Example_(disambiguation)",
@@ -443,10 +439,10 @@ mod tests {
     }
 
     #[test]
-    fn test_url_periods_sanitization() {
-        // Test URLs with trailing periods (sentence punctuation)
+    fn test_url_punctuation_sanitization() {
+        // Test URLs with trailing punctuation (sentence/text punctuation)
+        // The sanitize_url_punctuation function removes ., ,, :, ;, from the end
         let test_cases = vec![
-            // Cases that should be sanitized (trailing periods likely punctuation)
             ("https://example.com.", "https://example.com"),
             (
                 "https://github.com/zed-industries/zed.",
@@ -466,13 +462,36 @@ mod tests {
                 "https://en.wikipedia.org/wiki/C.E.O.",
                 "https://en.wikipedia.org/wiki/C.E.O",
             ),
-            // Cases that should NOT be sanitized (periods are part of URL structure)
+            ("https://example.com,", "https://example.com"),
+            ("https://example.com/path,", "https://example.com/path"),
+            ("https://example.com,,", "https://example.com"),
+            ("https://example.com:", "https://example.com"),
+            ("https://example.com/path:", "https://example.com/path"),
+            ("https://example.com::", "https://example.com"),
+            ("https://example.com;", "https://example.com"),
+            ("https://example.com/path;", "https://example.com/path"),
+            ("https://example.com;;", "https://example.com"),
+            ("https://example.com.,", "https://example.com"),
+            ("https://example.com.:;", "https://example.com"),
+            ("https://example.com!.", "https://example.com!"),
+            ("https://example.com/).", "https://example.com/"),
+            ("https://example.com/);", "https://example.com/"),
+            ("https://example.com/;)", "https://example.com/"),
             (
                 "https://example.com/v1.0/api",
                 "https://example.com/v1.0/api",
             ),
             ("https://192.168.1.1", "https://192.168.1.1"),
             ("https://sub.domain.com", "https://sub.domain.com"),
+            (
+                "https://example.com?query=value",
+                "https://example.com?query=value",
+            ),
+            ("https://example.com?a=1&b=2", "https://example.com?a=1&b=2"),
+            (
+                "https://example.com/path:8080",
+                "https://example.com/path:8080",
+            ),
         ];
 
         for (input, expected) in test_cases {
@@ -484,7 +503,6 @@ mod tests {
             let end_point = AlacPoint::new(Line(0), Column(input.len()));
             let dummy_match = Match::new(start_point, end_point);
 
-            // This test should initially fail since we haven't implemented period sanitization yet
             let (result, _) = sanitize_url_punctuation(input.to_string(), dummy_match, &term);
             assert_eq!(result, expected, "Failed for input: {}", input);
         }