WIP: Search language injections also by file extension

Antonio Scandurra created

There are still a few things left:

1. Add a test to verify we can successfully locate a language by its extension
2. Add a test to reproduce the bug where changing the fenced code block language
   won't reparse the block with the new language
3. Reparse injections for which we couldn't find a language when the language
   registry changes.
4. Check why the Markdown grammar considers the trailing triple backtick as
   `(code_fence_content)`, as opposed to being part of the outer Markdown.

Change summary

Cargo.lock                        |  1 +
crates/language/Cargo.toml        |  3 ++-
crates/language/src/language.rs   | 15 +++++++++++++++
crates/language/src/syntax_map.rs | 27 +++++++++++++++++++++++++--
4 files changed, 43 insertions(+), 3 deletions(-)

Detailed changes

Cargo.lock 🔗

@@ -3153,6 +3153,7 @@ dependencies = [
  "tree-sitter-html",
  "tree-sitter-javascript",
  "tree-sitter-json 0.19.0",
+ "tree-sitter-markdown",
  "tree-sitter-python",
  "tree-sitter-ruby",
  "tree-sitter-rust",

crates/language/Cargo.toml 🔗

@@ -66,12 +66,13 @@ util = { path = "../util", features = ["test-support"] }
 ctor = "0.1"
 env_logger = "0.9"
 rand = "0.8.3"
+tree-sitter-embedded-template = "*"
 tree-sitter-html = "*"
 tree-sitter-javascript = "*"
 tree-sitter-json = "*"
+tree-sitter-markdown = { git = "https://github.com/MDeiml/tree-sitter-markdown", rev = "330ecab87a3e3a7211ac69bbadc19eabecdb1cca" }
 tree-sitter-rust = "*"
 tree-sitter-python = "*"
 tree-sitter-typescript = "*"
 tree-sitter-ruby = "*"
-tree-sitter-embedded-template = "*"
 unindent = "0.1.7"

crates/language/src/language.rs 🔗

@@ -476,6 +476,21 @@ impl LanguageRegistry {
             .cloned()
     }
 
+    pub fn language_for_extension(&self, extension: &str) -> Option<Arc<Language>> {
+        let extension = UniCase::new(extension);
+        self.languages
+            .read()
+            .iter()
+            .find(|language| {
+                language
+                    .config
+                    .path_suffixes
+                    .iter()
+                    .any(|suffix| UniCase::new(suffix) == extension)
+            })
+            .cloned()
+    }
+
     pub fn to_vec(&self) -> Vec<Arc<Language>> {
         self.languages.read().iter().cloned().collect()
     }

crates/language/src/syntax_map.rs 🔗

@@ -1015,8 +1015,10 @@ fn get_injections(
                 });
 
             if let Some(language_name) = language_name {
-                if let Some(language) = language_registry.language_for_name(language_name.as_ref())
-                {
+                let language = language_registry
+                    .language_for_name(&language_name)
+                    .or_else(|| language_registry.language_for_extension(&language_name));
+                if let Some(language) = language {
                     result = true;
                     let range = text.anchor_before(content_range.start)
                         ..text.anchor_after(content_range.end);
@@ -2255,6 +2257,7 @@ mod tests {
         registry.add(Arc::new(ruby_lang()));
         registry.add(Arc::new(html_lang()));
         registry.add(Arc::new(erb_lang()));
+        registry.add(Arc::new(markdown_lang()));
         let language = registry.language_for_name(language_name).unwrap();
         let mut buffer = Buffer::new(0, 0, Default::default());
 
@@ -2393,6 +2396,26 @@ mod tests {
         .unwrap()
     }
 
+    fn markdown_lang() -> Language {
+        Language::new(
+            LanguageConfig {
+                name: "Markdown".into(),
+                path_suffixes: vec!["md".into()],
+                ..Default::default()
+            },
+            Some(tree_sitter_markdown::language()),
+        )
+        .with_injection_query(
+            r#"
+                (fenced_code_block
+                (info_string
+                    (language) @language)
+                (code_fence_content) @content)
+            "#,
+        )
+        .unwrap()
+    }
+
     fn range_for_text(buffer: &Buffer, text: &str) -> Range<usize> {
         let start = buffer.as_rope().to_string().find(text).unwrap();
         start..start + text.len()