language: Select language based on longest matching path extension (#29716)

claytonrcarter created

Closes #8408  
Closes #10997

This is a reboot of [my original
PR](https://github.com/zed-industries/zed/pull/11697) from last year. I
believe that I've addressed all the comments raised in that original
review, but Zed has changed a lot in the past year, so I'm sure there
will be some new stuff to consider too.

- updates the language matching and lookup to consider not just "does
the suffix/glob match" but also "... and is it the longest such match"
- adds a new `LanguageCustomFileTypes` struct to pass user globs from
settings to the registry
- _minor/unrelated:_ updates a test for the JS extension that wasn't
actually testing what is intended to
- _minor/unrelated:_ removed 2 redundant path extensions from the JS
lang extension

**Languages that may use this**

- Laravel Blade templates use the `blade.php` compound extension
-
[apparently](https://github.com/zed-industries/zed/issues/10765#issuecomment-2091293304)
Angular uses `component.html`
  - see also https://github.com/zed-industries/extensions/issues/169
- _hypothetically_ someone could publish a "JS test" extension w/ custom
highlights and/or snippets; many JS tests use `test.js` or `spec.js`

**Verifying these changes**

I added a number of assertions for this new behavior, and I also
confirmed that the (recently patched) [Laravel Blade
extension](https://github.com/bajrangCoder/zed-laravel-blade) opens as
expected for `blade.php` files, whereas on `main` it does not.

cc @maxbrunsfeld (reviewed my original PR last year), @osiewicz and
@MrSubidubi (have recently been in this part of the code)

Release Notes:

- Added support for "compound" file extensions in language extensions,
such `blade.php` and `component.html`. Closes #8408 and #10997.

Change summary

crates/language/src/buffer_tests.rs         |  61 ++++++++
crates/language/src/language_registry.rs    | 168 ++++++++++++++++------
crates/languages/src/typescript/config.toml |   2 
3 files changed, 178 insertions(+), 53 deletions(-)

Detailed changes

crates/language/src/buffer_tests.rs 🔗

@@ -83,6 +83,17 @@ fn test_select_language(cx: &mut App) {
         },
         Some(tree_sitter_rust::LANGUAGE.into()),
     )));
+    registry.add(Arc::new(Language::new(
+        LanguageConfig {
+            name: "Rust with longer extension".into(),
+            matcher: LanguageMatcher {
+                path_suffixes: vec!["longer.rs".to_string()],
+                ..Default::default()
+            },
+            ..Default::default()
+        },
+        Some(tree_sitter_rust::LANGUAGE.into()),
+    )));
     registry.add(Arc::new(Language::new(
         LanguageConfig {
             name: LanguageName::new("Make"),
@@ -109,6 +120,14 @@ fn test_select_language(cx: &mut App) {
         Some("Make".into())
     );
 
+    // matching longer, compound extension, part of which could also match another lang
+    assert_eq!(
+        registry
+            .language_for_file(&file("src/lib.longer.rs"), None, cx)
+            .map(|l| l.name()),
+        Some("Rust with longer extension".into())
+    );
+
     // matching filename
     assert_eq!(
         registry
@@ -181,7 +200,11 @@ async fn test_language_for_file_with_custom_file_types(cx: &mut TestAppContext)
         init_settings(cx, |settings| {
             settings.file_types.extend([
                 ("TypeScript".into(), vec!["js".into()]),
-                ("C++".into(), vec!["c".into()]),
+                (
+                    "JavaScript".into(),
+                    vec!["*longer.ts".into(), "ecmascript".into()],
+                ),
+                ("C++".into(), vec!["c".into(), "*.dev".into()]),
                 (
                     "Dockerfile".into(),
                     vec!["Dockerfile".into(), "Dockerfile.*".into()],
@@ -204,7 +227,7 @@ async fn test_language_for_file_with_custom_file_types(cx: &mut TestAppContext)
         LanguageConfig {
             name: "TypeScript".into(),
             matcher: LanguageMatcher {
-                path_suffixes: vec!["js".to_string()],
+                path_suffixes: vec!["ts".to_string(), "ts.ecmascript".to_string()],
                 ..Default::default()
             },
             ..Default::default()
@@ -237,6 +260,21 @@ async fn test_language_for_file_with_custom_file_types(cx: &mut TestAppContext)
         languages.add(Arc::new(Language::new(config, None)));
     }
 
+    // matches system-provided lang extension
+    let language = cx
+        .read(|cx| languages.language_for_file(&file("foo.ts"), None, cx))
+        .unwrap();
+    assert_eq!(language.name(), "TypeScript".into());
+    let language = cx
+        .read(|cx| languages.language_for_file(&file("foo.ts.ecmascript"), None, cx))
+        .unwrap();
+    assert_eq!(language.name(), "TypeScript".into());
+    let language = cx
+        .read(|cx| languages.language_for_file(&file("foo.cpp"), None, cx))
+        .unwrap();
+    assert_eq!(language.name(), "C++".into());
+
+    // user configured lang extension, same length as system-provided
     let language = cx
         .read(|cx| languages.language_for_file(&file("foo.js"), None, cx))
         .unwrap();
@@ -245,6 +283,25 @@ async fn test_language_for_file_with_custom_file_types(cx: &mut TestAppContext)
         .read(|cx| languages.language_for_file(&file("foo.c"), None, cx))
         .unwrap();
     assert_eq!(language.name(), "C++".into());
+
+    // user configured lang extension, longer than system-provided
+    let language = cx
+        .read(|cx| languages.language_for_file(&file("foo.longer.ts"), None, cx))
+        .unwrap();
+    assert_eq!(language.name(), "JavaScript".into());
+
+    // user configured lang extension, shorter than system-provided
+    let language = cx
+        .read(|cx| languages.language_for_file(&file("foo.ecmascript"), None, cx))
+        .unwrap();
+    assert_eq!(language.name(), "JavaScript".into());
+
+    // user configured glob matches
+    let language = cx
+        .read(|cx| languages.language_for_file(&file("c-plus-plus.dev"), None, cx))
+        .unwrap();
+    assert_eq!(language.name(), "C++".into());
+    // should match Dockerfile.* => Dockerfile, not *.dev => C++
     let language = cx
         .read(|cx| languages.language_for_file(&file("Dockerfile.dev"), None, cx))
         .unwrap();

crates/language/src/language_registry.rs 🔗

@@ -16,8 +16,6 @@ use futures::{
 };
 use globset::GlobSet;
 use gpui::{App, BackgroundExecutor, SharedString};
-use itertools::FoldWhile::{Continue, Done};
-use itertools::Itertools;
 use lsp::LanguageServerId;
 use parking_lot::{Mutex, RwLock};
 use postage::watch;
@@ -173,18 +171,12 @@ impl AvailableLanguage {
     }
 }
 
-#[derive(Copy, Clone, Default, PartialEq, Eq, PartialOrd, Ord)]
+#[derive(Copy, Clone, Default)]
 enum LanguageMatchPrecedence {
     #[default]
     Undetermined,
-    PathOrContent,
-    UserConfigured,
-}
-
-impl LanguageMatchPrecedence {
-    fn best_possible_match(&self) -> bool {
-        *self == LanguageMatchPrecedence::UserConfigured
-    }
+    PathOrContent(usize),
+    UserConfigured(usize),
 }
 
 enum AvailableGrammar {
@@ -626,9 +618,14 @@ impl LanguageRegistry {
     ) -> impl Future<Output = Result<Arc<Language>>> + use<> {
         let name = UniCase::new(name);
         let rx = self.get_or_load_language(|language_name, _, current_best_match| {
-            (current_best_match < LanguageMatchPrecedence::PathOrContent
-                && UniCase::new(&language_name.0) == name)
-                .then_some(LanguageMatchPrecedence::PathOrContent)
+            match current_best_match {
+                LanguageMatchPrecedence::Undetermined if UniCase::new(&language_name.0) == name => {
+                    Some(LanguageMatchPrecedence::PathOrContent(name.len()))
+                }
+                LanguageMatchPrecedence::Undetermined
+                | LanguageMatchPrecedence::UserConfigured(_)
+                | LanguageMatchPrecedence::PathOrContent(_) => None,
+            }
         });
         async move { rx.await? }
     }
@@ -655,13 +652,23 @@ impl LanguageRegistry {
     ) -> impl Future<Output = Result<Arc<Language>>> {
         let string = UniCase::new(string);
         let rx = self.get_or_load_language(|name, config, current_best_match| {
-            (current_best_match < LanguageMatchPrecedence::PathOrContent
-                && (UniCase::new(&name.0) == string
+            let name_matches = || {
+                UniCase::new(&name.0) == string
                     || config
                         .path_suffixes
                         .iter()
-                        .any(|suffix| UniCase::new(suffix) == string)))
-            .then_some(LanguageMatchPrecedence::PathOrContent)
+                        .any(|suffix| UniCase::new(suffix) == string)
+            };
+
+            match current_best_match {
+                LanguageMatchPrecedence::Undetermined => {
+                    name_matches().then_some(LanguageMatchPrecedence::PathOrContent(string.len()))
+                }
+                LanguageMatchPrecedence::PathOrContent(len) => (string.len() > len
+                    && name_matches())
+                .then_some(LanguageMatchPrecedence::PathOrContent(string.len())),
+                LanguageMatchPrecedence::UserConfigured(_) => None,
+            }
         });
         async move { rx.await? }
     }
@@ -717,10 +724,9 @@ impl LanguageRegistry {
         // and no other extension which is not the desired behavior here,
         // as we want `.zshrc` to result in extension being `Some("zshrc")`
         let extension = filename.and_then(|filename| filename.split('.').next_back());
-        let path_suffixes = [extension, filename, path.to_str()];
-        let path_suffixes_candidates = path_suffixes
+        let path_suffixes = [extension, filename, path.to_str()]
             .iter()
-            .filter_map(|suffix| suffix.map(globset::Candidate::new))
+            .filter_map(|suffix| suffix.map(|suffix| (suffix, globset::Candidate::new(suffix))))
             .collect::<SmallVec<[_; 3]>>();
         let content = LazyCell::new(|| {
             content.map(|content| {
@@ -731,20 +737,37 @@ impl LanguageRegistry {
         });
         self.find_matching_language(move |language_name, config, current_best_match| {
             let path_matches_default_suffix = || {
-                config
-                    .path_suffixes
-                    .iter()
-                    .any(|suffix| path_suffixes.contains(&Some(suffix.as_str())))
+                let len =
+                    config
+                        .path_suffixes
+                        .iter()
+                        .fold(0, |acc: usize, path_suffix: &String| {
+                            let ext = ".".to_string() + path_suffix;
+
+                            let matched_suffix_len = path_suffixes
+                                .iter()
+                                .find(|(suffix, _)| suffix.ends_with(&ext) || suffix == path_suffix)
+                                .map(|(suffix, _)| suffix.len());
+
+                            match matched_suffix_len {
+                                Some(len) => acc.max(len),
+                                None => acc,
+                            }
+                        });
+                (len > 0).then_some(len)
             };
+
             let path_matches_custom_suffix = || {
                 user_file_types
                     .and_then(|types| types.get(language_name.as_ref()))
-                    .map_or(false, |custom_suffixes| {
-                        path_suffixes_candidates
+                    .map_or(None, |custom_suffixes| {
+                        path_suffixes
                             .iter()
-                            .any(|suffix| custom_suffixes.is_match_candidate(suffix))
+                            .find(|(_, candidate)| custom_suffixes.is_match_candidate(candidate))
+                            .map(|(suffix, _)| suffix.len())
                     })
             };
+
             let content_matches = || {
                 config.first_line_pattern.as_ref().map_or(false, |pattern| {
                     content
@@ -756,17 +779,29 @@ impl LanguageRegistry {
             // Only return a match for the given file if we have a better match than
             // the current one.
             match current_best_match {
-                LanguageMatchPrecedence::PathOrContent | LanguageMatchPrecedence::Undetermined
-                    if path_matches_custom_suffix() =>
-                {
-                    Some(LanguageMatchPrecedence::UserConfigured)
+                LanguageMatchPrecedence::PathOrContent(current_len) => {
+                    if let Some(len) = path_matches_custom_suffix() {
+                        // >= because user config should win tie with system ext len
+                        (len >= current_len).then_some(LanguageMatchPrecedence::UserConfigured(len))
+                    } else if let Some(len) = path_matches_default_suffix() {
+                        // >= because user config should win tie with system ext len
+                        (len >= current_len).then_some(LanguageMatchPrecedence::PathOrContent(len))
+                    } else {
+                        None
+                    }
                 }
-                LanguageMatchPrecedence::Undetermined
-                    if path_matches_default_suffix() || content_matches() =>
-                {
-                    Some(LanguageMatchPrecedence::PathOrContent)
+                LanguageMatchPrecedence::Undetermined => {
+                    if let Some(len) = path_matches_custom_suffix() {
+                        Some(LanguageMatchPrecedence::UserConfigured(len))
+                    } else if let Some(len) = path_matches_default_suffix() {
+                        Some(LanguageMatchPrecedence::PathOrContent(len))
+                    } else if content_matches() {
+                        Some(LanguageMatchPrecedence::PathOrContent(1))
+                    } else {
+                        None
+                    }
                 }
-                _ => None,
+                LanguageMatchPrecedence::UserConfigured(_) => None,
             }
         })
     }
@@ -784,28 +819,61 @@ impl LanguageRegistry {
             .available_languages
             .iter()
             .rev()
-            .fold_while(None, |best_language_match, language| {
+            .fold(None, |best_language_match, language| {
                 let current_match_type = best_language_match
                     .as_ref()
                     .map_or(LanguageMatchPrecedence::default(), |(_, score)| *score);
                 let language_score =
                     callback(&language.name, &language.matcher, current_match_type);
-                debug_assert!(
-                    language_score.is_none_or(|new_score| new_score > current_match_type),
-                    "Matching callback should only return a better match than the current one"
-                );
-
-                match language_score {
-                    Some(new_score) if new_score.best_possible_match() => {
-                        Done(Some((language.clone(), new_score)))
+
+                match (language_score, current_match_type) {
+                    // no current best, so our candidate is better
+                    (
+                        Some(
+                            LanguageMatchPrecedence::PathOrContent(_)
+                            | LanguageMatchPrecedence::UserConfigured(_),
+                        ),
+                        LanguageMatchPrecedence::Undetermined,
+                    ) => language_score.map(|new_score| (language.clone(), new_score)),
+
+                    // our candidate is better only if the name is longer
+                    (
+                        Some(LanguageMatchPrecedence::PathOrContent(new_len)),
+                        LanguageMatchPrecedence::PathOrContent(current_len),
+                    )
+                    | (
+                        Some(LanguageMatchPrecedence::UserConfigured(new_len)),
+                        LanguageMatchPrecedence::UserConfigured(current_len),
+                    )
+                    | (
+                        Some(LanguageMatchPrecedence::PathOrContent(new_len)),
+                        LanguageMatchPrecedence::UserConfigured(current_len),
+                    ) => {
+                        if new_len > current_len {
+                            language_score.map(|new_score| (language.clone(), new_score))
+                        } else {
+                            best_language_match
+                        }
                     }
-                    Some(new_score) if current_match_type < new_score => {
-                        Continue(Some((language.clone(), new_score)))
+
+                    // our candidate is better if the name is longer or equal to
+                    (
+                        Some(LanguageMatchPrecedence::UserConfigured(new_len)),
+                        LanguageMatchPrecedence::PathOrContent(current_len),
+                    ) => {
+                        if new_len >= current_len {
+                            language_score.map(|new_score| (language.clone(), new_score))
+                        } else {
+                            best_language_match
+                        }
+                    }
+
+                    // no candidate, use current best
+                    (None, _) | (Some(LanguageMatchPrecedence::Undetermined), _) => {
+                        best_language_match
                     }
-                    _ => Continue(best_language_match),
                 }
             })
-            .into_inner()
             .map(|(available_language, _)| available_language);
         drop(state);
         available_language

crates/languages/src/typescript/config.toml 🔗

@@ -1,6 +1,6 @@
 name = "TypeScript"
 grammar = "typescript"
-path_suffixes = ["ts", "cts", "d.cts", "d.mts", "mts"]
+path_suffixes = ["ts", "cts", "mts"]
 first_line_pattern = '^#!.*\b(?:deno run|ts-node|bun|tsx)\b'
 line_comments = ["// "]
 block_comment = ["/*", "*/"]