Set cache breakpoint on second-to-last message (#27632)

Richard Feldman created

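Here's sample `dbg!` output of token usage after this change, for a small
agent thread. The second request writes the prompt prefix to the cache
(`cache_creation_input_tokens`), and later requests read it back
(`cache_read_input_tokens`) while adding only a small increment to the cache: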

```
[crates/assistant2/src/thread.rs:1092:25] &usage = TokenUsage {
    input_tokens: 5354,
    output_tokens: 184,
    cache_creation_input_tokens: 0,
    cache_read_input_tokens: 0,
}
[crates/assistant2/src/thread.rs:1092:25] &usage = TokenUsage {
    input_tokens: 54,
    output_tokens: 132,
    cache_creation_input_tokens: 5518,
    cache_read_input_tokens: 0,
}
[crates/assistant2/src/thread.rs:1092:25] &usage = TokenUsage {
    input_tokens: 54,
    output_tokens: 113,
    cache_creation_input_tokens: 166,
    cache_read_input_tokens: 5518,
}
[crates/assistant2/src/thread.rs:1092:25] &usage = TokenUsage {
    input_tokens: 291,
    output_tokens: 181,
    cache_creation_input_tokens: 147,
    cache_read_input_tokens: 5684,
}
```

Release Notes:

- N/A

Change summary

crates/assistant2/src/thread.rs | 7 +++++++
1 file changed, 7 insertions(+)

Detailed changes

crates/assistant2/src/thread.rs 🔗

@@ -857,6 +857,13 @@ impl Thread {
             request.messages.push(request_message);
         }
 
+        // Set a cache breakpoint at the second-to-last message.
+        // https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching
+        let breakpoint_index = request.messages.len() - 2;
+        for (index, message) in request.messages.iter_mut().enumerate() {
+            message.cache = index == breakpoint_index;
+        }
+
         if !referenced_context_ids.is_empty() {
             let mut context_message = LanguageModelRequestMessage {
                 role: Role::User,