[terminal] Consider "main.cs(20,5)" to be a single clickable word (#19004)

Shish created

[terminal] Consider "main.cs(20,5)" to be a single clickable word

First, adding unit tests for the regexes because I'm not certain how
these regexes are _intended_ to work, and unit tests work nicely as
demonstrations of intended behaviour.

The comment string, and the regex itself, seem to imply that
"main.cs(20,5)" is supposed be a single "word" (for the purposes of
being clicked on)... but the regex doesn't actually work like that. This
PR makes it work :)

(I don't know _why_ "word with an optional `(\d+,\d+)` on the end"
doesn't match the full string, while "word with a required `(\d+,\d+)`
on the end" _does_ match the full string - aren't regexes supposed to
match as much as possible, so it should take the optional extra whenever
the extra exists? Either way, "word with a required (\d+,\d+), or word
by itself" has the correct behaviour, as demonstrated by the unit test)

Release Notes:

- N/A

Change summary

Cargo.lock                      |  1 
crates/terminal/Cargo.toml      |  1 
crates/terminal/src/terminal.rs | 69 ++++++++++++++++++++++++++++++----
3 files changed, 63 insertions(+), 8 deletions(-)

Detailed changes

Cargo.lock 🔗

@@ -11407,6 +11407,7 @@ dependencies = [
  "gpui",
  "libc",
  "rand 0.8.5",
+ "regex",
  "release_channel",
  "schemars",
  "serde",

crates/terminal/src/terminal.rs 🔗

@@ -307,6 +307,11 @@ impl Display for TerminalError {
 // https://github.com/alacritty/alacritty/blob/cb3a79dbf6472740daca8440d5166c1d4af5029e/extra/man/alacritty.5.scd?plain=1#L207-L213
 const DEFAULT_SCROLL_HISTORY_LINES: usize = 10_000;
 const MAX_SCROLL_HISTORY_LINES: usize = 100_000;
+const URL_REGEX: &str = r#"(ipfs:|ipns:|magnet:|mailto:|gemini://|gopher://|https://|http://|news:|file://|git://|ssh:|ftp://)[^\u{0000}-\u{001F}\u{007F}-\u{009F}<>"\s{-}\^⟨⟩`]+"#;
+// Optional suffix matches MSBuild diagnostic suffixes for path parsing in PathLikeWithPosition
+// https://learn.microsoft.com/en-us/visualstudio/msbuild/msbuild-diagnostic-format-for-tasks
+const WORD_REGEX: &str =
+    r#"[\$\+\w.\[\]:/\\@\-~()]+(?:\((?:\d+|\d+,\d+)\))|[\$\+\w.\[\]:/\\@\-~()]+"#;
 
 pub struct TerminalBuilder {
     terminal: Terminal,
@@ -425,12 +430,6 @@ impl TerminalBuilder {
         let pty_tx = event_loop.channel();
         let _io_thread = event_loop.spawn(); // DANGER
 
-        let url_regex = RegexSearch::new(r#"(ipfs:|ipns:|magnet:|mailto:|gemini://|gopher://|https://|http://|news:|file://|git://|ssh:|ftp://)[^\u{0000}-\u{001F}\u{007F}-\u{009F}<>"\s{-}\^⟨⟩`]+"#).unwrap();
-        // Optional suffix matches MSBuild diagnostic suffixes for path parsing in PathLikeWithPosition
-        // https://learn.microsoft.com/en-us/visualstudio/msbuild/msbuild-diagnostic-format-for-tasks
-        let word_regex =
-            RegexSearch::new(r#"[\$\+\w.\[\]:/\\@\-~()]+(?:\((?:\d+|\d+,\d+)\))?"#).unwrap();
-
         let terminal = Terminal {
             task,
             pty_tx: Notifier(pty_tx),
@@ -450,8 +449,8 @@ impl TerminalBuilder {
             selection_phase: SelectionPhase::Ended,
             secondary_pressed: false,
             hovered_word: false,
-            url_regex,
-            word_regex,
+            url_regex: RegexSearch::new(URL_REGEX).unwrap(),
+            word_regex: RegexSearch::new(WORD_REGEX).unwrap(),
             vi_mode_enabled: false,
         };
 
@@ -2066,4 +2065,58 @@ mod tests {
             ..Default::default()
         }
     }
+
+    fn re_test(re: &str, hay: &str, expected: Vec<&str>) {
+        let results: Vec<_> = regex::Regex::new(re)
+            .unwrap()
+            .find_iter(hay)
+            .map(|m| m.as_str())
+            .collect();
+        assert_eq!(results, expected);
+    }
+    #[test]
+    fn test_url_regex() {
+        re_test(
+            crate::URL_REGEX,
+            "test http://example.com test mailto:bob@example.com train",
+            vec!["http://example.com", "mailto:bob@example.com"],
+        );
+    }
+    #[test]
+    fn test_word_regex() {
+        re_test(
+            crate::WORD_REGEX,
+            "hello, world! \"What\" is this?",
+            vec!["hello", "world", "What", "is", "this"],
+        );
+    }
+    #[test]
+    fn test_word_regex_with_linenum() {
+        // filename(line) and filename(line,col) as used in MSBuild output
+        // should be considered a single "word", even though comma is
+        // usually a word separator
+        re_test(
+            crate::WORD_REGEX,
+            "a Main.cs(20) b",
+            vec!["a", "Main.cs(20)", "b"],
+        );
+        re_test(
+            crate::WORD_REGEX,
+            "Main.cs(20,5) Error desc",
+            vec!["Main.cs(20,5)", "Error", "desc"],
+        );
+        // filename:line:col is a popular format for unix tools
+        re_test(
+            crate::WORD_REGEX,
+            "a Main.cs:20:5 b",
+            vec!["a", "Main.cs:20:5", "b"],
+        );
+        // Some tools output "filename:line:col:message", which currently isn't
+        // handled correctly, but might be in the future
+        re_test(
+            crate::WORD_REGEX,
+            "Main.cs:20:5:Error desc",
+            vec!["Main.cs:20:5:Error", "desc"],
+        );
+    }
 }