excerpt_ranges.rs

  1use std::ops::Range;
  2
  3use serde::{Deserialize, Serialize};
  4
  5use crate::estimate_tokens;
  6
/// Pre-computed byte offset ranges within `cursor_excerpt` for different
/// editable and context token budgets. Allows the server to select the
/// appropriate ranges for whichever model it uses.
#[derive(Clone, Debug, Default, PartialEq, Hash, Serialize, Deserialize)]
pub struct ExcerptRanges {
    /// Editable region computed with a 150-token budget.
    pub editable_150: Range<usize>,
    /// Editable region computed with a 180-token budget.
    pub editable_180: Range<usize>,
    /// Editable region computed with a 350-token budget.
    pub editable_350: Range<usize>,
    /// Editable region computed with a 512-token budget.
    /// NOTE(review): wrapped in `Option`, presumably because this field was
    /// added after the others — confirm against the (de)serialization path.
    pub editable_512: Option<Range<usize>>,
    /// Context boundary when using editable_150 with 350 tokens of additional context.
    pub editable_150_context_350: Range<usize>,
    /// Context boundary when using editable_180 with 350 tokens of additional context.
    pub editable_180_context_350: Range<usize>,
    /// Context boundary when using editable_350 with 150 tokens of additional context.
    pub editable_350_context_150: Range<usize>,
    /// Context boundary when using editable_350 with 512 tokens of additional context.
    pub editable_350_context_512: Option<Range<usize>>,
    /// Context boundary when using editable_350 with 1024 tokens of additional context.
    pub editable_350_context_1024: Option<Range<usize>>,
    /// Context boundary with a 4096-token context budget (expanded around editable_350).
    pub context_4096: Option<Range<usize>>,
    /// Context boundary with an 8192-token context budget (expanded around editable_350).
    pub context_8192: Option<Range<usize>>,
}
 31
 32/// Builds an `ExcerptRanges` by computing editable and context ranges for each
 33/// budget combination, using the syntax-aware logic in
 34/// `compute_editable_and_context_ranges`.
 35pub fn compute_legacy_excerpt_ranges(
 36    cursor_excerpt: &str,
 37    cursor_offset: usize,
 38    syntax_ranges: &[Range<usize>],
 39) -> ExcerptRanges {
 40    let compute = |editable_tokens, context_tokens| {
 41        compute_editable_and_context_ranges(
 42            cursor_excerpt,
 43            cursor_offset,
 44            syntax_ranges,
 45            editable_tokens,
 46            context_tokens,
 47        )
 48    };
 49
 50    let (editable_150, editable_150_context_350) = compute(150, 350);
 51    let (editable_180, editable_180_context_350) = compute(180, 350);
 52    let (editable_350, editable_350_context_150) = compute(350, 150);
 53    let (editable_512, _) = compute(512, 0);
 54    let (_, editable_350_context_512) = compute(350, 512);
 55    let (_, editable_350_context_1024) = compute(350, 1024);
 56    let (_, context_4096) = compute(350, 4096);
 57    let (_, context_8192) = compute(350, 8192);
 58
 59    ExcerptRanges {
 60        editable_150,
 61        editable_180,
 62        editable_350,
 63        editable_512: Some(editable_512),
 64        editable_150_context_350,
 65        editable_180_context_350,
 66        editable_350_context_150,
 67        editable_350_context_512: Some(editable_350_context_512),
 68        editable_350_context_1024: Some(editable_350_context_1024),
 69        context_4096: Some(context_4096),
 70        context_8192: Some(context_8192),
 71    }
 72}
 73
 74/// Given the cursor excerpt text, cursor offset, and the syntax node ranges
 75/// containing the cursor (innermost to outermost), compute the editable range
 76/// and context range as byte offset ranges within `cursor_excerpt`.
 77///
 78/// This is the server-side equivalent of `compute_excerpt_ranges` in
 79/// `edit_prediction::cursor_excerpt`, but operates on plain text with
 80/// pre-computed syntax boundaries instead of a `BufferSnapshot`.
 81pub fn compute_editable_and_context_ranges(
 82    cursor_excerpt: &str,
 83    cursor_offset: usize,
 84    syntax_ranges: &[Range<usize>],
 85    editable_token_limit: usize,
 86    context_token_limit: usize,
 87) -> (Range<usize>, Range<usize>) {
 88    let line_starts = compute_line_starts(cursor_excerpt);
 89    let cursor_row = offset_to_row(&line_starts, cursor_offset);
 90    let max_row = line_starts.len().saturating_sub(1) as u32;
 91
 92    let editable_range = compute_editable_range_from_text(
 93        cursor_excerpt,
 94        &line_starts,
 95        cursor_row,
 96        max_row,
 97        syntax_ranges,
 98        editable_token_limit,
 99    );
100
101    let context_range = expand_context_from_text(
102        cursor_excerpt,
103        &line_starts,
104        max_row,
105        &editable_range,
106        syntax_ranges,
107        context_token_limit,
108    );
109
110    (editable_range, context_range)
111}
112
/// Returns the byte offset of the start of each line in `text`.
/// Always contains at least one entry (offset 0 for the first line).
fn compute_line_starts(text: &str) -> Vec<usize> {
    std::iter::once(0)
        .chain(text.bytes().enumerate().filter_map(|(index, byte)| {
            // Each newline starts a new line at the following byte.
            if byte == b'\n' { Some(index + 1) } else { None }
        }))
        .collect()
}
122
/// Maps a byte offset to the row (line index) containing it.
/// Offsets past the end clamp to the last row.
fn offset_to_row(line_starts: &[usize], offset: usize) -> u32 {
    // Number of line starts at-or-before `offset`, minus one, is the row index.
    line_starts.partition_point(|&start| start <= offset).saturating_sub(1) as u32
}
129
/// Byte offset where `row` begins; rows past the end fall back to 0.
fn row_start_offset(line_starts: &[usize], row: u32) -> usize {
    match line_starts.get(row as usize) {
        Some(&start) => start,
        None => 0,
    }
}
133
/// Byte offset just past the last content byte of `row`, excluding its
/// trailing newline. The final row (and rows past the end) end at `text.len()`.
fn row_end_offset(text: &str, line_starts: &[usize], row: u32) -> usize {
    match line_starts.get(row as usize + 1) {
        // The byte before the next row's start is this row's newline.
        Some(&next_start) => next_start.saturating_sub(1).min(text.len()),
        None => text.len(),
    }
}
142
143fn row_range_to_byte_range(
144    text: &str,
145    line_starts: &[usize],
146    start_row: u32,
147    end_row: u32,
148) -> Range<usize> {
149    let start = row_start_offset(line_starts, start_row);
150    let end = row_end_offset(text, line_starts, end_row);
151    start..end
152}
153
154fn estimate_tokens_for_row_range(
155    text: &str,
156    line_starts: &[usize],
157    start_row: u32,
158    end_row: u32,
159) -> usize {
160    let mut tokens = 0;
161    for row in start_row..end_row {
162        let row_len = row_end_offset(text, line_starts, row)
163            .saturating_sub(row_start_offset(line_starts, row));
164        tokens += estimate_tokens(row_len).max(1);
165    }
166    tokens
167}
168
169fn line_token_count_from_text(text: &str, line_starts: &[usize], row: u32) -> usize {
170    let row_len =
171        row_end_offset(text, line_starts, row).saturating_sub(row_start_offset(line_starts, row));
172    estimate_tokens(row_len).max(1)
173}
174
175/// Returns syntax boundaries (as row ranges) that contain the given row range
176/// and extend beyond it, ordered from smallest to largest.
177fn containing_syntax_boundaries_from_ranges(
178    line_starts: &[usize],
179    syntax_ranges: &[Range<usize>],
180    start_row: u32,
181    end_row: u32,
182) -> Vec<(u32, u32)> {
183    let mut boundaries = Vec::new();
184    let mut last: Option<(u32, u32)> = None;
185
186    // syntax_ranges is innermost to outermost, so iterate in order.
187    for range in syntax_ranges {
188        let node_start_row = offset_to_row(line_starts, range.start);
189        let node_end_row = offset_to_row(line_starts, range.end);
190
191        // Skip nodes that don't extend beyond the current range.
192        if node_start_row >= start_row && node_end_row <= end_row {
193            continue;
194        }
195
196        let rows = (node_start_row, node_end_row);
197        if last == Some(rows) {
198            continue;
199        }
200
201        last = Some(rows);
202        boundaries.push(rows);
203    }
204
205    boundaries
206}
207
/// Computes the editable byte range around `cursor_row` within `text`,
/// spending an estimated `token_limit` tokens across three phases:
/// symmetric expansion from the cursor, syntax-boundary expansion, then
/// line-wise fill in the least-expanded direction.
fn compute_editable_range_from_text(
    text: &str,
    line_starts: &[usize],
    cursor_row: u32,
    max_row: u32,
    syntax_ranges: &[Range<usize>],
    token_limit: usize,
) -> Range<usize> {
    // Phase 1: Expand symmetrically from cursor using 75% of budget.
    let initial_budget = (token_limit * 3) / 4;
    let (mut start_row, mut end_row, mut remaining_tokens) =
        expand_symmetric(text, line_starts, cursor_row, max_row, initial_budget);

    // Return the reserved 25% of the budget (plus any unspent phase-1 tokens)
    // to the pool for phases 2 and 3.
    remaining_tokens += token_limit.saturating_sub(initial_budget);

    let original_start = start_row;
    let original_end = end_row;

    // Phase 2: Expand to syntax boundaries that fit within budget.
    // Boundaries are ordered smallest to largest; stop at the first that
    // doesn't fit, since every later one is at least as large.
    let boundaries =
        containing_syntax_boundaries_from_ranges(line_starts, syntax_ranges, start_row, end_row);
    for (boundary_start, boundary_end) in &boundaries {
        let tokens_for_start = if *boundary_start < start_row {
            estimate_tokens_for_row_range(text, line_starts, *boundary_start, start_row)
        } else {
            0
        };
        // `end_row + 1 .. boundary_end + 1` converts the inclusive row span
        // below the current range into a half-open range of new rows.
        let tokens_for_end = if *boundary_end > end_row {
            estimate_tokens_for_row_range(text, line_starts, end_row + 1, *boundary_end + 1)
        } else {
            0
        };

        // Only take a boundary if BOTH sides fit; partial expansion to a
        // syntax node would defeat its purpose.
        let total_needed = tokens_for_start + tokens_for_end;
        if total_needed <= remaining_tokens {
            if *boundary_start < start_row {
                start_row = *boundary_start;
            }
            if *boundary_end > end_row {
                end_row = *boundary_end;
            }
            remaining_tokens = remaining_tokens.saturating_sub(total_needed);
        } else {
            break;
        }
    }

    // Phase 3: Continue line-wise in the direction we expanded least.
    let expanded_up = original_start.saturating_sub(start_row);
    let expanded_down = end_row.saturating_sub(original_end);
    let prefer_up = expanded_up <= expanded_down;

    (start_row, end_row, _) = expand_linewise(
        text,
        line_starts,
        start_row,
        end_row,
        max_row,
        remaining_tokens,
        prefer_up,
    );

    row_range_to_byte_range(text, line_starts, start_row, end_row)
}
272
/// Expands the editable range outward into a context range, spending up to
/// `context_token_limit` estimated tokens. Prefers syntax-boundary expansion;
/// falls back to upward-biased line-wise expansion only when no syntax
/// boundary fit the budget.
fn expand_context_from_text(
    text: &str,
    line_starts: &[usize],
    max_row: u32,
    editable_range: &Range<usize>,
    syntax_ranges: &[Range<usize>],
    context_token_limit: usize,
) -> Range<usize> {
    // Start from the rows covered by the editable range.
    let mut start_row = offset_to_row(line_starts, editable_range.start);
    let mut end_row = offset_to_row(line_starts, editable_range.end);
    let mut remaining_tokens = context_token_limit;
    let mut did_syntax_expand = false;

    // Boundaries are ordered smallest to largest; stop at the first that
    // doesn't fit, since every later one is at least as large.
    let boundaries =
        containing_syntax_boundaries_from_ranges(line_starts, syntax_ranges, start_row, end_row);
    for (boundary_start, boundary_end) in &boundaries {
        let tokens_for_start = if *boundary_start < start_row {
            estimate_tokens_for_row_range(text, line_starts, *boundary_start, start_row)
        } else {
            0
        };
        // `end_row + 1 .. boundary_end + 1` converts the inclusive row span
        // below the current range into a half-open range of new rows.
        let tokens_for_end = if *boundary_end > end_row {
            estimate_tokens_for_row_range(text, line_starts, end_row + 1, *boundary_end + 1)
        } else {
            0
        };

        // Only take a boundary if BOTH sides fit within the remaining budget.
        let total_needed = tokens_for_start + tokens_for_end;
        if total_needed <= remaining_tokens {
            if *boundary_start < start_row {
                start_row = *boundary_start;
            }
            if *boundary_end > end_row {
                end_row = *boundary_end;
            }
            remaining_tokens = remaining_tokens.saturating_sub(total_needed);
            did_syntax_expand = true;
        } else {
            break;
        }
    }

    // Only expand line-wise if no syntax expansion occurred.
    if !did_syntax_expand {
        (start_row, end_row, _) = expand_linewise(
            text,
            line_starts,
            start_row,
            end_row,
            max_row,
            remaining_tokens,
            true,
        );
    }

    row_range_to_byte_range(text, line_starts, start_row, end_row)
}
330
331fn expand_symmetric(
332    text: &str,
333    line_starts: &[usize],
334    cursor_row: u32,
335    max_row: u32,
336    mut token_budget: usize,
337) -> (u32, u32, usize) {
338    let mut start_row = cursor_row;
339    let mut end_row = cursor_row;
340
341    let cursor_line_tokens = line_token_count_from_text(text, line_starts, cursor_row);
342    token_budget = token_budget.saturating_sub(cursor_line_tokens);
343
344    loop {
345        let can_expand_up = start_row > 0;
346        let can_expand_down = end_row < max_row;
347
348        if token_budget == 0 || (!can_expand_up && !can_expand_down) {
349            break;
350        }
351
352        if can_expand_down {
353            let next_row = end_row + 1;
354            let line_tokens = line_token_count_from_text(text, line_starts, next_row);
355            if line_tokens <= token_budget {
356                end_row = next_row;
357                token_budget = token_budget.saturating_sub(line_tokens);
358            } else {
359                break;
360            }
361        }
362
363        if can_expand_up && token_budget > 0 {
364            let next_row = start_row - 1;
365            let line_tokens = line_token_count_from_text(text, line_starts, next_row);
366            if line_tokens <= token_budget {
367                start_row = next_row;
368                token_budget = token_budget.saturating_sub(line_tokens);
369            } else {
370                break;
371            }
372        }
373    }
374
375    (start_row, end_row, token_budget)
376}
377
378fn expand_linewise(
379    text: &str,
380    line_starts: &[usize],
381    mut start_row: u32,
382    mut end_row: u32,
383    max_row: u32,
384    mut remaining_tokens: usize,
385    prefer_up: bool,
386) -> (u32, u32, usize) {
387    loop {
388        let can_expand_up = start_row > 0;
389        let can_expand_down = end_row < max_row;
390
391        if remaining_tokens == 0 || (!can_expand_up && !can_expand_down) {
392            break;
393        }
394
395        let mut expanded = false;
396
397        if prefer_up {
398            if can_expand_up {
399                let next_row = start_row - 1;
400                let line_tokens = line_token_count_from_text(text, line_starts, next_row);
401                if line_tokens <= remaining_tokens {
402                    start_row = next_row;
403                    remaining_tokens = remaining_tokens.saturating_sub(line_tokens);
404                    expanded = true;
405                }
406            }
407            if can_expand_down && remaining_tokens > 0 {
408                let next_row = end_row + 1;
409                let line_tokens = line_token_count_from_text(text, line_starts, next_row);
410                if line_tokens <= remaining_tokens {
411                    end_row = next_row;
412                    remaining_tokens = remaining_tokens.saturating_sub(line_tokens);
413                    expanded = true;
414                }
415            }
416        } else {
417            if can_expand_down {
418                let next_row = end_row + 1;
419                let line_tokens = line_token_count_from_text(text, line_starts, next_row);
420                if line_tokens <= remaining_tokens {
421                    end_row = next_row;
422                    remaining_tokens = remaining_tokens.saturating_sub(line_tokens);
423                    expanded = true;
424                }
425            }
426            if can_expand_up && remaining_tokens > 0 {
427                let next_row = start_row - 1;
428                let line_tokens = line_token_count_from_text(text, line_starts, next_row);
429                if line_tokens <= remaining_tokens {
430                    start_row = next_row;
431                    remaining_tokens = remaining_tokens.saturating_sub(line_tokens);
432                    expanded = true;
433                }
434            }
435        }
436
437        if !expanded {
438            break;
439        }
440    }
441
442    (start_row, end_row, remaining_tokens)
443}