@@ -160,8 +160,8 @@ fn sanitize_url_punctuation<T: EventListener>(
let mut sanitized_url = url;
let mut chars_trimmed = 0;
- // First, handle parentheses balancing using single traversal
- let (open_parens, close_parens) =
+ // Count parentheses in the URL
+ let (open_parens, mut close_parens) =
sanitized_url
.chars()
.fold((0, 0), |(opens, closes), c| match c {
@@ -170,33 +170,27 @@ fn sanitize_url_punctuation<T: EventListener>(
_ => (opens, closes),
});
- // Trim unbalanced closing parentheses
- if close_parens > open_parens {
- let mut remaining_close = close_parens;
- while sanitized_url.ends_with(')') && remaining_close > open_parens {
- sanitized_url.pop();
- chars_trimmed += 1;
- remaining_close -= 1;
- }
- }
+ // Remove trailing characters that shouldn't be at the end of URLs
+ while let Some(last_char) = sanitized_url.chars().last() {
+ let should_remove = match last_char {
+ // These may be part of a URL but not at the end. It's not that the spec
+ // doesn't allow them, but they are frequently used in plain text as delimiters
+ // where they're not meant to be part of the URL.
+ '.' | ',' | ':' | ';' => true,
+ '(' => true,
+ ')' if close_parens > open_parens => {
+ close_parens -= 1;
+
+ true
+ }
+ _ => false,
+ };
- // Handle trailing periods
- if sanitized_url.ends_with('.') {
- let trailing_periods = sanitized_url
- .chars()
- .rev()
- .take_while(|&c| c == '.')
- .count();
-
- if trailing_periods > 1 {
- sanitized_url.truncate(sanitized_url.len() - trailing_periods);
- chars_trimmed += trailing_periods;
- } else if trailing_periods == 1
- && let Some(second_last_char) = sanitized_url.chars().rev().nth(1)
- && (second_last_char.is_alphanumeric() || second_last_char == '/')
- {
+ if should_remove {
sanitized_url.pop();
chars_trimmed += 1;
+ } else {
+ break;
}
}
@@ -413,6 +407,8 @@ mod tests {
("https://www.google.com/)", "https://www.google.com/"),
("https://example.com/path)", "https://example.com/path"),
("https://test.com/))", "https://test.com/"),
+ ("https://test.com/(((", "https://test.com/"),
+ ("https://test.com/(test)(", "https://test.com/(test)"),
// Cases that should NOT be sanitized (balanced parentheses)
(
"https://en.wikipedia.org/wiki/Example_(disambiguation)",
@@ -443,10 +439,10 @@ mod tests {
}
#[test]
- fn test_url_periods_sanitization() {
- // Test URLs with trailing periods (sentence punctuation)
+ fn test_url_punctuation_sanitization() {
+ // Test URLs with trailing punctuation (sentence/text punctuation)
+ // The sanitize_url_punctuation function removes ., ,, :, ;, from the end
let test_cases = vec![
- // Cases that should be sanitized (trailing periods likely punctuation)
("https://example.com.", "https://example.com"),
(
"https://github.com/zed-industries/zed.",
@@ -466,13 +462,36 @@ mod tests {
"https://en.wikipedia.org/wiki/C.E.O.",
"https://en.wikipedia.org/wiki/C.E.O",
),
- // Cases that should NOT be sanitized (periods are part of URL structure)
+ ("https://example.com,", "https://example.com"),
+ ("https://example.com/path,", "https://example.com/path"),
+ ("https://example.com,,", "https://example.com"),
+ ("https://example.com:", "https://example.com"),
+ ("https://example.com/path:", "https://example.com/path"),
+ ("https://example.com::", "https://example.com"),
+ ("https://example.com;", "https://example.com"),
+ ("https://example.com/path;", "https://example.com/path"),
+ ("https://example.com;;", "https://example.com"),
+ ("https://example.com.,", "https://example.com"),
+ ("https://example.com.:;", "https://example.com"),
+ ("https://example.com!.", "https://example.com!"),
+ ("https://example.com/).", "https://example.com/"),
+ ("https://example.com/);", "https://example.com/"),
+ ("https://example.com/;)", "https://example.com/"),
(
"https://example.com/v1.0/api",
"https://example.com/v1.0/api",
),
("https://192.168.1.1", "https://192.168.1.1"),
("https://sub.domain.com", "https://sub.domain.com"),
+ (
+ "https://example.com?query=value",
+ "https://example.com?query=value",
+ ),
+ ("https://example.com?a=1&b=2", "https://example.com?a=1&b=2"),
+ (
+ "https://example.com/path:8080",
+ "https://example.com/path:8080",
+ ),
];
for (input, expected) in test_cases {
@@ -484,7 +503,6 @@ mod tests {
let end_point = AlacPoint::new(Line(0), Column(input.len()));
let dummy_match = Match::new(start_point, end_point);
- // This test should initially fail since we haven't implemented period sanitization yet
let (result, _) = sanitize_url_punctuation(input.to_string(), dummy_match, &term);
assert_eq!(result, expected, "Failed for input: {}", input);
}