Improve handling of `injection.combined` injections in `SyntaxSnapshot::layers_for_range` (#32145)

Ben Kunkle created

Closes #27596

The problem in this case was incorrect identification of which language
(layer) contains the selection.

Language layer selection incorrectly assumed that the deepest
`SyntaxLayer` containing a range was the most specific. This worked for
Markdown (base document + injected subtrees) but failed for PHP, where
`injection.combined` injections are used to make HTML logically function
as the base layer, despite being at a greater depth in the layer stack.
This caused HTML to be incorrectly identified as the most specific
language for PHP ranges.

The solution is to track included sub-ranges for syntax layers and
filter out layers that don't contain a sub-range covering the desired
range. The top-level layer is never filtered out, so that gaps between
sibling nodes always have a fallback language; the top-level layer is
likely more correct than the default language settings.

Release Notes:

- Fixed an issue in PHP where PHP language settings would
occasionally be overridden by HTML language settings.

Change summary

crates/language/src/buffer.rs     | 21 +++++++++++++++++++++
crates/language/src/syntax_map.rs | 33 ++++++++++++++++++++++++++++-----
2 files changed, 49 insertions(+), 5 deletions(-)

Detailed changes

crates/language/src/buffer.rs 🔗

@@ -1387,9 +1387,30 @@ impl Buffer {
     /// Returns the [`Language`] at the given location.
     pub fn language_at<D: ToOffset>(&self, position: D) -> Option<Arc<Language>> {
         let offset = position.to_offset(self);
+        let mut is_first = true;
+        let start_anchor = self.anchor_before(offset);
+        let end_anchor = self.anchor_after(offset);
         self.syntax_map
             .lock()
             .layers_for_range(offset..offset, &self.text, false)
+            .filter(|layer| {
+                if is_first {
+                    is_first = false;
+                    return true;
+                }
+                let any_sub_ranges_contain_range = layer
+                    .included_sub_ranges
+                    .map(|sub_ranges| {
+                        sub_ranges.iter().any(|sub_range| {
+                            let is_before_start = sub_range.end.cmp(&start_anchor, self).is_lt();
+                            let is_after_end = sub_range.start.cmp(&end_anchor, self).is_gt();
+                            !is_before_start && !is_after_end
+                        })
+                    })
+                    .unwrap_or(true);
+                let result = any_sub_ranges_contain_range;
+                return result;
+            })
             .last()
             .map(|info| info.language.clone())
             .or_else(|| self.language.clone())

crates/language/src/syntax_map.rs 🔗

@@ -94,6 +94,7 @@ enum SyntaxLayerContent {
     Parsed {
         tree: tree_sitter::Tree,
         language: Arc<Language>,
+        included_sub_ranges: Option<Vec<Range<Anchor>>>,
     },
     Pending {
         language_name: Arc<str>,
@@ -122,6 +123,7 @@ impl SyntaxLayerContent {
 pub struct SyntaxLayer<'a> {
     /// The language for this layer.
     pub language: &'a Arc<Language>,
+    pub included_sub_ranges: Option<&'a [Range<Anchor>]>,
     pub(crate) depth: usize,
     tree: &'a Tree,
     pub(crate) offset: (usize, tree_sitter::Point),
@@ -621,7 +623,7 @@ impl SyntaxSnapshot {
                             grammar,
                             text.as_rope(),
                             step_start_byte,
-                            included_ranges,
+                            &included_ranges,
                             Some(old_tree.clone()),
                         );
                         match result {
@@ -674,7 +676,7 @@ impl SyntaxSnapshot {
                             grammar,
                             text.as_rope(),
                             step_start_byte,
-                            included_ranges,
+                            &included_ranges,
                             None,
                         );
                         match result {
@@ -717,7 +719,21 @@ impl SyntaxSnapshot {
                         );
                     }
 
-                    SyntaxLayerContent::Parsed { tree, language }
+                    let included_sub_ranges: Option<Vec<Range<Anchor>>> =
+                        (included_ranges.len() > 1).then_some(
+                            included_ranges
+                                .into_iter()
+                                .map(|r| {
+                                    text.anchor_before(r.start_byte + step_start_byte)
+                                        ..text.anchor_after(r.end_byte + step_start_byte)
+                                })
+                                .collect(),
+                        );
+                    SyntaxLayerContent::Parsed {
+                        tree,
+                        language,
+                        included_sub_ranges,
+                    }
                 }
                 ParseStepLanguage::Pending { name } => SyntaxLayerContent::Pending {
                     language_name: name,
@@ -783,6 +799,7 @@ impl SyntaxSnapshot {
             [SyntaxLayer {
                 language,
                 tree,
+                included_sub_ranges: None,
                 depth: 0,
                 offset: (0, tree_sitter::Point::new(0, 0)),
             }]
@@ -867,13 +884,19 @@ impl SyntaxSnapshot {
         iter::from_fn(move || {
             while let Some(layer) = cursor.item() {
                 let mut info = None;
-                if let SyntaxLayerContent::Parsed { tree, language } = &layer.content {
+                if let SyntaxLayerContent::Parsed {
+                    tree,
+                    language,
+                    included_sub_ranges,
+                } = &layer.content
+                {
                     let layer_start_offset = layer.range.start.to_offset(buffer);
                     let layer_start_point = layer.range.start.to_point(buffer).to_ts_point();
                     if include_hidden || !language.config.hidden {
                         info = Some(SyntaxLayer {
                             tree,
                             language,
+                            included_sub_ranges: included_sub_ranges.as_deref(),
                             depth: layer.depth,
                             offset: (layer_start_offset, layer_start_point),
                         });
@@ -1231,7 +1254,7 @@ fn parse_text(
     grammar: &Grammar,
     text: &Rope,
     start_byte: usize,
-    ranges: Vec<tree_sitter::Range>,
+    ranges: &[tree_sitter::Range],
     old_tree: Option<Tree>,
 ) -> anyhow::Result<Tree> {
     with_parser(|parser| {