1use std::ops::Range;
2
3use serde::{Deserialize, Serialize};
4
5use crate::estimate_tokens;
6
/// Pre-computed byte offset ranges within `cursor_excerpt` for different
/// editable and context token budgets. Allows the server to select the
/// appropriate ranges for whichever model it uses.
#[derive(Clone, Debug, Default, PartialEq, Hash, Serialize, Deserialize)]
pub struct ExcerptRanges {
    /// Editable region computed with a 150-token budget.
    pub editable_150: Range<usize>,
    /// Editable region computed with a 180-token budget.
    pub editable_180: Range<usize>,
    /// Editable region computed with a 350-token budget.
    pub editable_350: Range<usize>,
    /// Editable region computed with a 512-token budget.
    /// Optional, presumably so older serialized payloads without this field
    /// still deserialize — TODO confirm.
    pub editable_512: Option<Range<usize>>,
    /// Context boundary when using editable_150 with 350 tokens of additional context.
    pub editable_150_context_350: Range<usize>,
    /// Context boundary when using editable_180 with 350 tokens of additional context.
    pub editable_180_context_350: Range<usize>,
    /// Context boundary when using editable_350 with 150 tokens of additional context.
    pub editable_350_context_150: Range<usize>,
    /// Context boundary when using editable_350 with 512 tokens of additional context.
    pub editable_350_context_512: Option<Range<usize>>,
    /// Context boundary when using editable_350 with 1024 tokens of additional context.
    pub editable_350_context_1024: Option<Range<usize>>,
    /// Context boundary for a 4096-token context budget (computed with a 350-token editable region).
    pub context_4096: Option<Range<usize>>,
    /// Context boundary for an 8192-token context budget (computed with a 350-token editable region).
    pub context_8192: Option<Range<usize>>,
}
31
32/// Builds an `ExcerptRanges` by computing editable and context ranges for each
33/// budget combination, using the syntax-aware logic in
34/// `compute_editable_and_context_ranges`.
35pub fn compute_legacy_excerpt_ranges(
36 cursor_excerpt: &str,
37 cursor_offset: usize,
38 syntax_ranges: &[Range<usize>],
39) -> ExcerptRanges {
40 let compute = |editable_tokens, context_tokens| {
41 compute_editable_and_context_ranges(
42 cursor_excerpt,
43 cursor_offset,
44 syntax_ranges,
45 editable_tokens,
46 context_tokens,
47 )
48 };
49
50 let (editable_150, editable_150_context_350) = compute(150, 350);
51 let (editable_180, editable_180_context_350) = compute(180, 350);
52 let (editable_350, editable_350_context_150) = compute(350, 150);
53 let (editable_512, _) = compute(512, 0);
54 let (_, editable_350_context_512) = compute(350, 512);
55 let (_, editable_350_context_1024) = compute(350, 1024);
56 let (_, context_4096) = compute(350, 4096);
57 let (_, context_8192) = compute(350, 8192);
58
59 ExcerptRanges {
60 editable_150,
61 editable_180,
62 editable_350,
63 editable_512: Some(editable_512),
64 editable_150_context_350,
65 editable_180_context_350,
66 editable_350_context_150,
67 editable_350_context_512: Some(editable_350_context_512),
68 editable_350_context_1024: Some(editable_350_context_1024),
69 context_4096: Some(context_4096),
70 context_8192: Some(context_8192),
71 }
72}
73
74/// Given the cursor excerpt text, cursor offset, and the syntax node ranges
75/// containing the cursor (innermost to outermost), compute the editable range
76/// and context range as byte offset ranges within `cursor_excerpt`.
77///
78/// This is the server-side equivalent of `compute_excerpt_ranges` in
79/// `edit_prediction::cursor_excerpt`, but operates on plain text with
80/// pre-computed syntax boundaries instead of a `BufferSnapshot`.
81pub fn compute_editable_and_context_ranges(
82 cursor_excerpt: &str,
83 cursor_offset: usize,
84 syntax_ranges: &[Range<usize>],
85 editable_token_limit: usize,
86 context_token_limit: usize,
87) -> (Range<usize>, Range<usize>) {
88 let line_starts = compute_line_starts(cursor_excerpt);
89 let cursor_row = offset_to_row(&line_starts, cursor_offset);
90 let max_row = line_starts.len().saturating_sub(1) as u32;
91
92 let editable_range = compute_editable_range_from_text(
93 cursor_excerpt,
94 &line_starts,
95 cursor_row,
96 max_row,
97 syntax_ranges,
98 editable_token_limit,
99 );
100
101 let context_range = expand_context_from_text(
102 cursor_excerpt,
103 &line_starts,
104 max_row,
105 &editable_range,
106 syntax_ranges,
107 context_token_limit,
108 );
109
110 (editable_range, context_range)
111}
112
/// Returns the byte offset of the start of every line in `text`.
/// Always contains at least `[0]`; each `\n` contributes the offset of the
/// byte that follows it.
fn compute_line_starts(text: &str) -> Vec<usize> {
    std::iter::once(0)
        .chain(
            text.bytes()
                .enumerate()
                .filter(|&(_, byte)| byte == b'\n')
                .map(|(index, _)| index + 1),
        )
        .collect()
}
122
/// Maps a byte `offset` to the row (0-based) that contains it.
/// Offsets past the end of the text clamp to the last row.
fn offset_to_row(line_starts: &[usize], offset: usize) -> u32 {
    // The first line start strictly greater than `offset` marks the next row;
    // the row containing `offset` is the one just before it.
    line_starts
        .partition_point(|&start| start <= offset)
        .saturating_sub(1) as u32
}
129
/// Byte offset where `row` begins; an out-of-range row yields 0.
fn row_start_offset(line_starts: &[usize], row: u32) -> usize {
    match line_starts.get(row as usize) {
        Some(&start) => start,
        None => 0,
    }
}
133
/// Byte offset where `row` ends, excluding its trailing newline.
/// The final row ends at the end of the text.
fn row_end_offset(text: &str, line_starts: &[usize], row: u32) -> usize {
    line_starts
        .get(row as usize + 1)
        // The next row starts right after this row's newline, so back up one
        // byte (clamped to the text length for safety).
        .map_or(text.len(), |&next_start| {
            next_start.saturating_sub(1).min(text.len())
        })
}
142
143fn row_range_to_byte_range(
144 text: &str,
145 line_starts: &[usize],
146 start_row: u32,
147 end_row: u32,
148) -> Range<usize> {
149 let start = row_start_offset(line_starts, start_row);
150 let end = row_end_offset(text, line_starts, end_row);
151 start..end
152}
153
154fn estimate_tokens_for_row_range(
155 text: &str,
156 line_starts: &[usize],
157 start_row: u32,
158 end_row: u32,
159) -> usize {
160 let mut tokens = 0;
161 for row in start_row..end_row {
162 let row_len = row_end_offset(text, line_starts, row)
163 .saturating_sub(row_start_offset(line_starts, row));
164 tokens += estimate_tokens(row_len).max(1);
165 }
166 tokens
167}
168
169fn line_token_count_from_text(text: &str, line_starts: &[usize], row: u32) -> usize {
170 let row_len =
171 row_end_offset(text, line_starts, row).saturating_sub(row_start_offset(line_starts, row));
172 estimate_tokens(row_len).max(1)
173}
174
175/// Returns syntax boundaries (as row ranges) that contain the given row range
176/// and extend beyond it, ordered from smallest to largest.
177fn containing_syntax_boundaries_from_ranges(
178 line_starts: &[usize],
179 syntax_ranges: &[Range<usize>],
180 start_row: u32,
181 end_row: u32,
182) -> Vec<(u32, u32)> {
183 let mut boundaries = Vec::new();
184 let mut last: Option<(u32, u32)> = None;
185
186 // syntax_ranges is innermost to outermost, so iterate in order.
187 for range in syntax_ranges {
188 let node_start_row = offset_to_row(line_starts, range.start);
189 let node_end_row = offset_to_row(line_starts, range.end);
190
191 // Skip nodes that don't extend beyond the current range.
192 if node_start_row >= start_row && node_end_row <= end_row {
193 continue;
194 }
195
196 let rows = (node_start_row, node_end_row);
197 if last == Some(rows) {
198 continue;
199 }
200
201 last = Some(rows);
202 boundaries.push(rows);
203 }
204
205 boundaries
206}
207
/// Computes the editable region (a byte range of whole lines) around
/// `cursor_row`, keeping its estimated size within `token_limit` tokens.
///
/// Expansion runs in three phases: symmetric line-wise growth around the
/// cursor, growth to enclosing syntax boundaries, then a final line-wise pass
/// with any leftover budget.
fn compute_editable_range_from_text(
    text: &str,
    line_starts: &[usize],
    cursor_row: u32,
    max_row: u32,
    syntax_ranges: &[Range<usize>],
    token_limit: usize,
) -> Range<usize> {
    // Phase 1: Expand symmetrically from cursor using 75% of budget.
    // The remaining 25% is held back for the syntax-boundary phase below.
    let initial_budget = (token_limit * 3) / 4;
    let (mut start_row, mut end_row, mut remaining_tokens) =
        expand_symmetric(text, line_starts, cursor_row, max_row, initial_budget);

    // Return the reserved 25% (plus whatever phase 1 left unused) to the pool.
    remaining_tokens += token_limit.saturating_sub(initial_budget);

    // Remember the phase-1 extent so phase 3 can tell which direction grew less.
    let original_start = start_row;
    let original_end = end_row;

    // Phase 2: Expand to syntax boundaries that fit within budget.
    // Boundaries come back innermost-first; stop at the first that doesn't fit.
    let boundaries =
        containing_syntax_boundaries_from_ranges(line_starts, syntax_ranges, start_row, end_row);
    for (boundary_start, boundary_end) in &boundaries {
        // Token cost of the rows that would be added above the current range.
        let tokens_for_start = if *boundary_start < start_row {
            estimate_tokens_for_row_range(text, line_starts, *boundary_start, start_row)
        } else {
            0
        };
        // Token cost of the rows added below; boundary rows are inclusive,
        // hence the `+ 1`s to form the half-open row range.
        let tokens_for_end = if *boundary_end > end_row {
            estimate_tokens_for_row_range(text, line_starts, end_row + 1, *boundary_end + 1)
        } else {
            0
        };

        let total_needed = tokens_for_start + tokens_for_end;
        if total_needed <= remaining_tokens {
            if *boundary_start < start_row {
                start_row = *boundary_start;
            }
            if *boundary_end > end_row {
                end_row = *boundary_end;
            }
            remaining_tokens = remaining_tokens.saturating_sub(total_needed);
        } else {
            break;
        }
    }

    // Phase 3: Continue line-wise in the direction we expanded least.
    let expanded_up = original_start.saturating_sub(start_row);
    let expanded_down = end_row.saturating_sub(original_end);
    let prefer_up = expanded_up <= expanded_down;

    (start_row, end_row, _) = expand_linewise(
        text,
        line_starts,
        start_row,
        end_row,
        max_row,
        remaining_tokens,
        prefer_up,
    );

    row_range_to_byte_range(text, line_starts, start_row, end_row)
}
272
/// Expands the editable range outward into surrounding context, spending up to
/// `context_token_limit` estimated tokens. Syntax-boundary expansion is tried
/// first; plain line-wise expansion is used only when no boundary fit.
fn expand_context_from_text(
    text: &str,
    line_starts: &[usize],
    max_row: u32,
    editable_range: &Range<usize>,
    syntax_ranges: &[Range<usize>],
    context_token_limit: usize,
) -> Range<usize> {
    // Start from the rows covered by the editable range and grow outward, so
    // the context range always contains the editable range.
    let mut start_row = offset_to_row(line_starts, editable_range.start);
    let mut end_row = offset_to_row(line_starts, editable_range.end);
    let mut remaining_tokens = context_token_limit;
    let mut did_syntax_expand = false;

    // Try enclosing syntax nodes, innermost first; stop at the first one that
    // doesn't fit the remaining budget.
    let boundaries =
        containing_syntax_boundaries_from_ranges(line_starts, syntax_ranges, start_row, end_row);
    for (boundary_start, boundary_end) in &boundaries {
        // Token cost of the rows that would be added above the current range.
        let tokens_for_start = if *boundary_start < start_row {
            estimate_tokens_for_row_range(text, line_starts, *boundary_start, start_row)
        } else {
            0
        };
        // Token cost of the rows added below; boundary rows are inclusive,
        // hence the `+ 1`s to form the half-open row range.
        let tokens_for_end = if *boundary_end > end_row {
            estimate_tokens_for_row_range(text, line_starts, end_row + 1, *boundary_end + 1)
        } else {
            0
        };

        let total_needed = tokens_for_start + tokens_for_end;
        if total_needed <= remaining_tokens {
            if *boundary_start < start_row {
                start_row = *boundary_start;
            }
            if *boundary_end > end_row {
                end_row = *boundary_end;
            }
            remaining_tokens = remaining_tokens.saturating_sub(total_needed);
            did_syntax_expand = true;
        } else {
            break;
        }
    }

    // Only expand line-wise if no syntax expansion occurred.
    // NOTE(review): when a syntax expansion did occur, any leftover budget is
    // deliberately left unspent — presumably to keep the context aligned to a
    // syntactic unit; confirm against the client-side equivalent.
    if !did_syntax_expand {
        (start_row, end_row, _) = expand_linewise(
            text,
            line_starts,
            start_row,
            end_row,
            max_row,
            remaining_tokens,
            true,
        );
    }

    row_range_to_byte_range(text, line_starts, start_row, end_row)
}
330
/// Expands a row range outward from `cursor_row`, alternating one line down
/// then one line up per iteration, until `token_budget` is exhausted or no
/// direction can grow.
///
/// Returns `(start_row, end_row, leftover_budget)`; both rows are inclusive.
fn expand_symmetric(
    text: &str,
    line_starts: &[usize],
    cursor_row: u32,
    max_row: u32,
    mut token_budget: usize,
) -> (u32, u32, usize) {
    let mut start_row = cursor_row;
    let mut end_row = cursor_row;

    // The cursor's own line is always included and charged first (the budget
    // saturates at zero if the line alone exceeds it).
    let cursor_line_tokens = line_token_count_from_text(text, line_starts, cursor_row);
    token_budget = token_budget.saturating_sub(cursor_line_tokens);

    loop {
        let can_expand_up = start_row > 0;
        let can_expand_down = end_row < max_row;

        if token_budget == 0 || (!can_expand_up && !can_expand_down) {
            break;
        }

        // Each iteration tries downward first.
        if can_expand_down {
            let next_row = end_row + 1;
            let line_tokens = line_token_count_from_text(text, line_starts, next_row);
            if line_tokens <= token_budget {
                end_row = next_row;
                token_budget = token_budget.saturating_sub(line_tokens);
            } else {
                // NOTE(review): a downward line that doesn't fit aborts the
                // whole expansion, even if an upward line might still fit —
                // confirm this asymmetry is intended.
                break;
            }
        }

        // Then upward, if budget remains after the downward step.
        if can_expand_up && token_budget > 0 {
            let next_row = start_row - 1;
            let line_tokens = line_token_count_from_text(text, line_starts, next_row);
            if line_tokens <= token_budget {
                start_row = next_row;
                token_budget = token_budget.saturating_sub(line_tokens);
            } else {
                break;
            }
        }
    }

    (start_row, end_row, token_budget)
}
377
378fn expand_linewise(
379 text: &str,
380 line_starts: &[usize],
381 mut start_row: u32,
382 mut end_row: u32,
383 max_row: u32,
384 mut remaining_tokens: usize,
385 prefer_up: bool,
386) -> (u32, u32, usize) {
387 loop {
388 let can_expand_up = start_row > 0;
389 let can_expand_down = end_row < max_row;
390
391 if remaining_tokens == 0 || (!can_expand_up && !can_expand_down) {
392 break;
393 }
394
395 let mut expanded = false;
396
397 if prefer_up {
398 if can_expand_up {
399 let next_row = start_row - 1;
400 let line_tokens = line_token_count_from_text(text, line_starts, next_row);
401 if line_tokens <= remaining_tokens {
402 start_row = next_row;
403 remaining_tokens = remaining_tokens.saturating_sub(line_tokens);
404 expanded = true;
405 }
406 }
407 if can_expand_down && remaining_tokens > 0 {
408 let next_row = end_row + 1;
409 let line_tokens = line_token_count_from_text(text, line_starts, next_row);
410 if line_tokens <= remaining_tokens {
411 end_row = next_row;
412 remaining_tokens = remaining_tokens.saturating_sub(line_tokens);
413 expanded = true;
414 }
415 }
416 } else {
417 if can_expand_down {
418 let next_row = end_row + 1;
419 let line_tokens = line_token_count_from_text(text, line_starts, next_row);
420 if line_tokens <= remaining_tokens {
421 end_row = next_row;
422 remaining_tokens = remaining_tokens.saturating_sub(line_tokens);
423 expanded = true;
424 }
425 }
426 if can_expand_up && remaining_tokens > 0 {
427 let next_row = start_row - 1;
428 let line_tokens = line_token_count_from_text(text, line_starts, next_row);
429 if line_tokens <= remaining_tokens {
430 start_row = next_row;
431 remaining_tokens = remaining_tokens.saturating_sub(line_tokens);
432 expanded = true;
433 }
434 }
435 }
436
437 if !expanded {
438 break;
439 }
440 }
441
442 (start_row, end_row, remaining_tokens)
443}