1import type { ExtensionAPI, ExtensionContext } from "@mariozechner/pi-coding-agent";
2import * as fs from "node:fs";
3import * as path from "node:path";
4import * as os from "node:os";
5import { randomUUID } from "node:crypto";
6
// Extension revision. Bump it on every user-facing behaviour change; the value
// is written to the extension log and sent on the wire, so a report can be tied
// to the exact revision that produced it.
//   2.0.0 — honest in-flight chip (tools#33 / inference_frontend#3954): neutral
//           "working…" label, silent grace window, tightened MCR gating.
//   2.1.0 — version is sent as X-NW-MCR-Ext-Version so the gateway can log
//           which client revision served a request (server logs previously had
//           no way to tell a user's extension version).
const EXTENSION_VERSION = "2.1.0";

// How many leading user messages act as the anchor: computeDropRange never
// drops anything at or before the Nth user message.
const MCR_ANCHOR_USER_MESSAGES = 3;

// JSON-lines log written by nwlog: ~/.pi/agent/extensions/neuralwatt-mcr.log
const LOG_DIR_SEGMENTS = [".pi", "agent", "extensions"];
const LOG_FILE = path.join(
  os.homedir(),
  ...LOG_DIR_SEGMENTS,
  "neuralwatt-mcr.log",
);
25
/**
 * Append one JSON line of the form `{ ts, event, ...data }` to LOG_FILE.
 *
 * Best-effort only: any failure (serialisation or file write) is swallowed so
 * that logging can never break the extension.
 *
 * @param event Event name stored under the `event` key.
 * @param data  Extra fields merged into the record after `ts` and `event`.
 */
function nwlog(event: string, data: Record<string, unknown> = {}): void {
  try {
    const record = { ts: new Date().toISOString(), event, ...data };
    fs.appendFileSync(LOG_FILE, `${JSON.stringify(record)}\n`);
  } catch {
    // Deliberately ignored — a logging failure must not surface to the user.
  }
}
35
/**
 * MCR session metadata as reported by the gateway, either through the
 * `x-mcr-*` response headers or through the `mcr` object of a message body.
 */
interface MCRMetadata {
  /** Session fingerprint (`x-mcr-session-fp` header / `mcr.session_fp`). */
  session_fp: string;
  /** `x-mcr-stored-through` / `mcr.stored_through`; 0 when not reported. */
  stored_through: number;
  /**
   * `x-mcr-safe-drop-before` / `mcr.safe_drop_before`; used as the exclusive
   * upper bound of the message range the context hook drops. 0 when not
   * reported.
   */
  safe_drop_before: number;
}
41
/** Per-response MCR statistics nested under `energy.mcr` in a message body. */
interface EnergyMcrStats {
  session_turns: number;
  context_tokens: number;
  compaction_triggered: boolean;
  /** Fraction in [0, 1] as rendered by the status bar (`APC nn%`). */
  apc_hit_rate?: number;
  /** Numerator of the `compact nn%` ratio shown in the status bar. */
  mcr_compacted_tokens?: number;
  /** Denominator of the `compact nn%` ratio shown in the status bar. */
  mcr_original_tokens?: number;
}

/** Energy accounting attached to an assistant message body (`energy`). */
interface EnergyData {
  /** Energy attributed to this response, in joules. */
  energy_joules: number;
  /** Present only when the body carried an `energy.mcr` object. */
  mcr?: EnergyMcrStats;
}
53
/** Mutable per-session bookkeeping shared by the event handlers in this file. */
interface SessionState {
  // ── MCR protocol state (latest values reported by the gateway) ──
  /** Session fingerprint; null until the gateway reports one. */
  sessionFp: string | null;
  /** Latest `safe_drop_before`; 0 or less means the context hook drops nothing. */
  safeDropBefore: number;
  /** Latest `stored_through`. */
  storedThrough: number;

  // ── Energy / usage accounting ──
  /** Running sum of `energy_joules` for the session. */
  totalEnergyJoules: number;
  sessionTurns: number;
  contextTokens: number;
  lastMcrMeta: MCRMetadata | null;
  lastEnergy: EnergyData | null;

  // ── In-flight request UX ──
  // Wall-clock time at which the current MCR request was sent; cleared on the
  // first ``message_update`` / ``message_end``. The chip uses it to show a
  // neutral "working…" indicator on long waits.
  //
  // HONESTY NOTE (tools#33 / inference_frontend#3954): this is a wall-clock
  // proxy, NOT a real compaction signal. The extension cannot observe whether
  // the gateway is actually compacting (see ``markRequestSent`` for why Pi's
  // API does not surface the gateway's ``mcr-status`` SSE frames), so the chip
  // must NOT claim "optimizing context" — it may only say a request is in
  // flight. It also stays silent for the first few seconds so ordinary turns
  // (the ~96% with no compaction) show nothing at all.
  inFlightSince: number | null;
  /** Handle of the interval that refreshes the chip while a request is in flight. */
  inFlightTickerHandle: NodeJS.Timeout | null;
}
77
// Module-wide session state. Handlers mutate it in place; the session_start
// and session_tree handlers reset its fields to these same initial values.
const state: SessionState = {
  // MCR protocol state
  sessionFp: null,
  safeDropBefore: 0,
  storedThrough: 0,
  // Energy / usage accounting
  totalEnergyJoules: 0,
  sessionTurns: 0,
  contextTokens: 0,
  lastMcrMeta: null,
  lastEnergy: null,
  // In-flight indicator
  inFlightSince: null,
  inFlightTickerHandle: null,
};
90
// Refresh period of the chip while the in-flight indicator is active, so the
// elapsed counter visibly advances. Half a second feels live while staying
// well below the refresh budget of Pi's footer.
const IN_FLIGHT_TICK_MS = 500;

// Honesty grace window (tools#33 / inference_frontend#3954): the in-flight
// indicator is suppressed for the first few seconds of every request.
//
// Large MCR prompts (100k+ tokens) have a naturally long prefill/TTFT — 10-60s
// — on EVERY turn, and ~96% of turns do not compact at all. The old chip
// labelled that ordinary latency "optimizing context…" for the whole window,
// which read as "MCR is making every prompt slow" and drove churn. Prefill and
// compaction cannot be told apart here (no SSE phase signal reaches the
// extension), so the only honest option is to say nothing until the wait is
// long enough that the user wants reassurance the model isn't hung — and even
// then to use a neutral label that does not assert MCR is doing work.
const IN_FLIGHT_GRACE_MS = 6 * 1000;
107
108// Recognise ONLY the MCR-backed aliases. The MCR pipeline (server-side
109// compaction + 1M virtual context + the X-MCR-* response protocol) runs only
110// for the `neuralwatt/…-long` aliases; the base-model and fast/flex IDs route
111// straight to the provider with no compaction.
112//
113// tools#33: the earlier predicate also matched `zai-org/`, `moonshotai/`,
114// `glm-5`, and `kimi-k2`, so every fast/flex alias and direct base-model call
115// lit the in-flight chip and ran the context-drop / fp handlers — none of which
116// apply off-MCR. Nico reported "optimizing context" on non-long models. The
117// client always selects the alias (never the forwarded base name), so matching
118// the alias shape is sufficient and correct. This narrows every call site
119// (chip gating, context-drop, fp-set, compaction-suppression) to MCR-only,
120// which is the intended behaviour for all of them.
/**
 * True only for MCR-backed aliases, i.e. ids of the shape `neuralwatt/…-long`.
 *
 * Both parts are required. With an OR, every `neuralwatt/` id (including the
 * fast/flex aliases) and every unrelated `*-long` id would count as MCR, which
 * would light the in-flight chip and run the context-drop / fp / compaction
 * handlers on requests that never go through the MCR pipeline.
 *
 * @param modelId Model id selected by the client ("" when unknown).
 */
function isMCRModel(modelId: string): boolean {
  return modelId.includes("neuralwatt/") && modelId.endsWith("-long");
}
124
/**
 * Build MCR metadata from the `x-mcr-*` response headers.
 *
 * Returns null when `x-mcr-session-fp` is absent or empty. The two numeric
 * headers default to 0 when missing. A malformed or negative value is also
 * coerced to 0: `parseInt` would otherwise yield NaN, and NaN passes every
 * `<= 0` guard downstream, which makes the context hook drop everything after
 * the anchor message.
 *
 * @param headers Response headers keyed by lower-case name.
 */
function extractMCRFromHeaders(
  headers: Record<string, string>,
): MCRMetadata | null {
  const fp = headers["x-mcr-session-fp"];
  if (!fp) return null;

  const toIndex = (raw: string | undefined): number => {
    const parsed = parseInt(raw || "0", 10);
    return Number.isFinite(parsed) && parsed > 0 ? parsed : 0;
  };

  return {
    session_fp: fp,
    stored_through: toIndex(headers["x-mcr-stored-through"]),
    safe_drop_before: toIndex(headers["x-mcr-safe-drop-before"]),
  };
}
139
/**
 * Read an integer-valued header, returning null when it is absent, empty or
 * not parseable as a finite integer.
 *
 * null is kept distinct from 0 on purpose (issue #3371 follow-up):
 *   - absent header: the gateway did not run the M1.b ref-recovery code path
 *     (older deploy, non-MCR request, or the path bailed before emitting);
 *   - 0: ref-recovery ran but had nothing to surface.
 *
 * @param headers Response headers keyed by name.
 * @param name    Header to read.
 */
function parseOptionalIntHeader(
  headers: Record<string, string>,
  name: string,
): number | null {
  const raw = headers[name];
  if (raw == null || raw === "") return null;
  const parsed = parseInt(raw, 10);
  if (!Number.isFinite(parsed)) return null;
  return parsed;
}
155
/**
 * Build MCR metadata from the `mcr` object of a message body.
 *
 * Returns null unless `body.mcr` is an object whose `session_fp` is a string.
 * `stored_through` and `safe_drop_before` fall back to 0 when they are not
 * numbers.
 */
function extractMCRFromBody(
  body: Record<string, unknown>,
): MCRMetadata | null {
  const raw = body.mcr;
  if (!raw || typeof raw !== "object") return null;

  const {
    session_fp: fp,
    stored_through: storedThrough,
    safe_drop_before: safeDropBefore,
  } = raw as Record<string, unknown>;
  if (typeof fp !== "string") return null;

  return {
    session_fp: fp,
    stored_through: typeof storedThrough === "number" ? storedThrough : 0,
    safe_drop_before: typeof safeDropBefore === "number" ? safeDropBefore : 0,
  };
}
170
/**
 * Build energy data from the `energy` object of a message body.
 *
 * Returns null when `body.energy` is missing or not an object. Numeric fields
 * that are not numbers fall back to 0 (required fields) or undefined (optional
 * fields); `compaction_triggered` is true only for a literal `true`. The `mcr`
 * member is set only when `energy.mcr` is an object.
 */
function extractEnergyFromBody(
  body: Record<string, unknown>,
): EnergyData | null {
  const rawEnergy = body.energy;
  if (!rawEnergy || typeof rawEnergy !== "object") return null;
  const energyFields = rawEnergy as Record<string, unknown>;

  // Keep a value only when it is already a number.
  const asNumber = (value: unknown): number | undefined =>
    typeof value === "number" ? value : undefined;

  const parsed: EnergyData = {
    energy_joules: asNumber(energyFields.energy_joules) ?? 0,
  };

  const rawStats = energyFields.mcr;
  if (rawStats && typeof rawStats === "object") {
    const stats = rawStats as Record<string, unknown>;
    parsed.mcr = {
      session_turns: asNumber(stats.session_turns) ?? 0,
      context_tokens: asNumber(stats.context_tokens) ?? 0,
      compaction_triggered: stats.compaction_triggered === true,
      apc_hit_rate: asNumber(stats.apc_hit_rate),
      mcr_compacted_tokens: asNumber(stats.mcr_compacted_tokens),
      mcr_original_tokens: asNumber(stats.mcr_original_tokens),
    };
  }
  return parsed;
}
205
/**
 * Render an energy amount for the status bar: "500mJ" below 1 J, "12.3J"
 * below 1 kJ, "1.50kJ" otherwise.
 *
 * The unit is chosen from the rounded text, not the raw value, so a value just
 * under a threshold rolls over to the next unit instead of rendering as
 * "1000mJ" or "1000.0J".
 *
 * @param joules Energy in joules.
 */
function formatEnergy(joules: number): string {
  const milliText = (joules * 1000).toFixed(0);
  if (joules < 1 && Number(milliText) < 1000) return `${milliText}mJ`;

  const jouleText = joules.toFixed(1);
  if (joules < 1000 && Number(jouleText) < 1000) return `${jouleText}J`;

  return `${(joules / 1000).toFixed(2)}kJ`;
}
211
/**
 * Render an in-flight elapsed-time stamp for the chip. The formats are kept
 * short so the chip cannot grow wide enough to push other footer widgets
 * off-screen: under 10 s "1.4s", from 10 s "12s", from 60 s "1m 5s".
 *
 * Tenths are truncated rather than rounded, so 9.95–9.99 s renders as "9.9s"
 * instead of the over-wide "10.0s". A negative or NaN input (for example after
 * a wall-clock step backwards) is treated as 0.
 *
 * @param ms Elapsed time in milliseconds.
 */
function formatElapsed(ms: number): string {
  const elapsedMs = ms > 0 ? ms : 0;
  if (elapsedMs < 10000) {
    return `${(Math.floor(elapsedMs / 100) / 10).toFixed(1)}s`;
  }
  const seconds = Math.floor(elapsedMs / 1000);
  if (seconds < 60) return `${seconds}s`;
  const minutes = Math.floor(seconds / 60);
  return `${minutes}m ${seconds % 60}s`;
}
222
/**
 * Return the role/type marker of an entry.
 *
 * Pi's outbound HTTP payload uses the OpenAI-shape ``role`` field
 * (``user``/``assistant``/``tool``/``system``), while Pi's internal
 * session-log records sometimes serialise the same information as ``type``.
 * ``role`` wins when present; ``type`` is the fallback, so the extension works
 * with whichever shape the ``context`` hook hands over.
 */
function entryRole(entry: { role?: string; type?: string }): string | undefined {
  const { role, type } = entry;
  if (role !== undefined && role !== null) return role;
  return type;
}
231
/**
 * Return the index of the `nAnchors`-th user message, or -1 when `entries`
 * holds fewer user messages than that.
 *
 * The index is in the FULL message list — every message counted, whatever its
 * role — which is the same indexing space as the server's `safe_drop_before`.
 * An earlier version returned an index into the user+assistant subset; mixing
 * the two spaces in `computeDropRange` made `context_drop` silently remove the
 * most recent user prompts.
 *
 * @param entries  Messages in send order.
 * @param nAnchors 1-based ordinal of the user message to locate.
 */
function findAnchorFloor(
  entries: Array<{ role?: string; type?: string }>,
  nAnchors: number,
): number {
  let usersSeen = 0;
  for (const [index, entry] of entries.entries()) {
    if (entryRole(entry) !== "user") continue;
    usersSeen += 1;
    if (usersSeen === nAnchors) return index;
  }
  return -1;
}
253
// Client-side mirror of the server's X-NW-Conversation-ID validation
// (services/mcr_v3_session.py::validate_client_conversation_id):
//
//   * non-empty, ≤ 256 chars
//   * every code point printable (no control / whitespace beyond plain space)
//
// Sanitising here means a malformed Pi session id can never bounce off the
// server as HTTP 400 — the worst case is falling back to the in-process UUID.
const MAX_CONVERSATION_ID_LEN = 256;

// Characters Python's `str.isprintable()` rejects: the whole Unicode "Other"
// group (Cc controls incl. DEL and the C1 range, Cf format characters such as
// zero-width space, Cs lone surrogates, Co, Cn) and every "Separator" (Zs, Zl,
// Zp) except the plain ASCII space.
const NON_PRINTABLE_CONVERSATION_ID_CHAR = /\p{C}|(?! )\p{Z}/u;

/**
 * True when `value` would pass the server-side conversation-id validation.
 *
 * The length check counts UTF-16 code units, which is never smaller than the
 * code-point count, so it can only be stricter than a code-point limit.
 */
function isWellFormedConversationId(value: string): boolean {
  if (!value || value.length > MAX_CONVERSATION_ID_LEN) return false;
  return !NON_PRINTABLE_CONVERSATION_ID_CHAR.test(value);
}
276
// Fallback conversation id, created on first use and cached so that repeated
// calls return the same value (stable across auto-compact within one `pi`
// invocation, different across invocations). null means "not generated yet".
let uuidFallback: string | null = null;

/** Return the cached fallback id, generating it on the first call. */
function getUuidFallback(): string {
  const cached = uuidFallback;
  if (cached) return cached;
  const fresh = randomUUID();
  uuidFallback = fresh;
  return fresh;
}
286
/** Where a resolved conversation id came from. */
type ConversationIdSource = "pi-session" | "uuid-fallback";

/**
 * Pick the conversation id to send with outbound requests.
 *
 * Pi's own session id is preferred (stable across auto-compact within one `pi`
 * session, distinct across sessions, already a string). It is used only when
 * it is a string that passes `isWellFormedConversationId`. If it is missing,
 * malformed, or reading it throws, the cached UUID fallback is returned.
 *
 * @returns The id together with the source it was taken from.
 */
function resolveConversationId(
  ctx: ExtensionContext,
): { id: string; source: ConversationIdSource } {
  let candidate: unknown;
  try {
    candidate = ctx.sessionManager?.getSessionId?.();
  } catch {
    // Treat a throwing session manager the same as "no session id".
    candidate = undefined;
  }

  if (typeof candidate === "string" && isWellFormedConversationId(candidate)) {
    return { id: candidate, source: "pi-session" };
  }
  return { id: getUuidFallback(), source: "uuid-fallback" };
}
307
/**
 * Compute the half-open index range `[dropStart, dropEnd)` of messages that
 * may be dropped from `entries`.
 *
 * The range starts right after the anchor (the MCR_ANCHOR_USER_MESSAGES-th
 * user message) and ends at `safeDropBefore`, clamped to `entries.length`.
 * `[0, 0]` means "drop nothing" and is returned when:
 *   - `safeDropBefore` is not a finite positive number (a NaN bound would
 *     otherwise pass every comparison below and let the caller drop everything
 *     after the anchor),
 *   - there are fewer user messages than the anchor count, or
 *   - the resulting range is empty.
 *
 * @param entries        Messages in send order (full index space).
 * @param safeDropBefore Server-provided exclusive upper bound.
 */
function computeDropRange(
  entries: Array<{ role?: string; type?: string }>,
  safeDropBefore: number,
): [number, number] {
  if (!Number.isFinite(safeDropBefore) || safeDropBefore <= 0) return [0, 0];
  const anchorIdx = findAnchorFloor(entries, MCR_ANCHOR_USER_MESSAGES);
  if (anchorIdx < 0) return [0, 0];
  const dropStart = anchorIdx + 1;
  // Round a fractional bound down so the range can only shrink, never grow.
  const dropEnd = Math.min(Math.floor(safeDropBefore), entries.length);
  if (dropEnd <= dropStart) return [0, 0];
  return [dropStart, dropEnd];
}
320
321export default function (pi: ExtensionAPI) {
322 const MCR_STATUS_KEY = "nw-mcr";
323 const ENERGY_STATUS_KEY = "nw-energy";
324
325 // ── Outbound header wiring (X-NW-Conversation-ID, X-NW-MCR-Ext-Version) ──
326 // pi-coding-agent's `before_provider_request` is a *body* hook — the
327 // earlier `payload.headers[...]` mutation reached extension memory only,
328 // never the HTTP wire. The documented per-request header path is
329 // `pi.registerProvider({ headers })`, whose values are env-var NAMES that
330 // the SDK re-reads from `process.env` on every stream
331 // (dist/core/resolve-config-value.js). Net: real HTTP headers, no body
332 // touch, no APC impact. Both headers below ride this same mechanism.
333 //
334 // Boot order: we seed the env var with a UUID at extension load so any
335 // request fired before the first `before_provider_request` tick still
336 // carries *some* id. The hook upgrades it to Pi's stable per-invocation
337 // session id (see resolveConversationId). Subsequent requests in the
338 // same `pi` invocation reuse the upgraded value — invocation-stable
339 // session_fp by construction.
340 const CONV_ID_ENV = "X_NW_CONVERSATION_ID";
341 if (!process.env[CONV_ID_ENV]) {
342 process.env[CONV_ID_ENV] = getUuidFallback();
343 }
344 // X-NW-MCR-Ext-Version (2.1.0): surface the client extension
345 // version on the wire so the gateway can log which revision served a
346 // request — server logs previously had no way to tell a user's version.
347 // Unlike the conversation id, the version is static for the life of the
348 // process (it never changes at runtime), so we seed it once at load and
349 // never touch it again — no upgrade-on-hook logic needed.
350 const EXT_VERSION_ENV = "X_NW_MCR_EXT_VERSION";
351 process.env[EXT_VERSION_ENV] = EXTENSION_VERSION;
352 // `apiKey` mirrors the env-var name from ~/.pi/agent/models.json so the
353 // partial config doesn't shadow the existing auth. `storeProviderRequestConfig`
354 // doesn't merge with the models.json-derived entry (different map), so we
355 // have to re-state any field we need; only apiKey here, since baseUrl/api
356 // flow through via the override-only branch of applyProviderConfig.
357 pi.registerProvider("neuralwatt", {
358 apiKey: "NEURALWATT_API_KEY",
359 headers: {
360 "X-NW-Conversation-ID": CONV_ID_ENV,
361 "X-NW-MCR-Ext-Version": EXT_VERSION_ENV,
362 },
363 });
364
365 function updateStatusBar(ctx: ExtensionContext) {
366 // In-flight indicator (tools#33 / inference_frontend#3954). HONESTY RULES:
367 //
368 // * The extension cannot observe whether the gateway is compacting (Pi
369 // doesn't surface the ``mcr-status`` SSE frames — see markRequestSent).
370 // So we NEVER claim "optimizing context"; we only ever say a request is
371 // in flight ("working…").
372 // * We stay completely silent for the first IN_FLIGHT_GRACE_MS so normal
373 // turns — including the long-but-uncompacted prefill that dominates real
374 // usage — show nothing. Only a genuinely long wait surfaces the neutral
375 // reassurance that the model hasn't hung.
376 // * Once real model output starts, ``markStreamProducing`` clears
377 // ``inFlightSince`` and the chip reverts to the standard MCR view.
378 const inFlightElapsedMs =
379 state.inFlightSince !== null ? Date.now() - state.inFlightSince : 0;
380 if (
381 state.inFlightSince !== null &&
382 inFlightElapsedMs >= IN_FLIGHT_GRACE_MS
383 ) {
384 const fpPrefix = state.sessionFp
385 ? `MCR ${state.sessionFp.slice(0, 8)} | `
386 : "MCR | ";
387 ctx.ui.setStatus(
388 MCR_STATUS_KEY,
389 `${fpPrefix}working… ${formatElapsed(inFlightElapsedMs)}`,
390 );
391 } else if (state.sessionFp) {
392 const parts: string[] = [`MCR ${state.sessionFp.slice(0, 8)}`];
393 if (state.safeDropBefore > 0) {
394 parts.push(`drop<${state.safeDropBefore}`);
395 }
396 ctx.ui.setStatus(MCR_STATUS_KEY, parts.join(" | "));
397 } else {
398 ctx.ui.setStatus(MCR_STATUS_KEY, "");
399 }
400
401 if (state.totalEnergyJoules > 0) {
402 const parts: string[] = [`⚡ ${formatEnergy(state.totalEnergyJoules)}`];
403 if (state.lastEnergy?.mcr) {
404 const m = state.lastEnergy.mcr;
405 if (m.apc_hit_rate !== undefined) {
406 parts.push(`APC ${(m.apc_hit_rate * 100).toFixed(0)}%`);
407 }
408 if (m.mcr_compacted_tokens && m.mcr_original_tokens) {
409 const ratio = m.mcr_compacted_tokens / m.mcr_original_tokens;
410 parts.push(`compact ${(ratio * 100).toFixed(0)}%`);
411 }
412 }
413 ctx.ui.setStatus(ENERGY_STATUS_KEY, parts.join(" | "));
414 } else {
415 ctx.ui.setStatus(ENERGY_STATUS_KEY, "");
416 }
417 }
418
419 // Lifecycle helpers for the in-flight indicator. We bracket the wait window
420 // with ``markRequestSent`` (on before_provider_request) and
421 // ``markStreamProducing`` (on the first message_update / message_end that
422 // carries real model output). The elapsed counter advances via a
423 // ``setInterval`` that re-calls ``updateStatusBar`` every ~0.5s.
424 //
425 // ── WHY THIS IS A NEUTRAL "working…" PROXY, NOT A REAL MCR PHASE SIGNAL ──
426 //
427 // The gateway (inference_frontend #3916) emits the ground truth as
428 // ``event: mcr-status`` SSE frames carrying {phase: compacting | warming |
429 // idle, elapsed_ms}. The ideal chip would show "optimizing context…" ONLY
430 // while a ``compacting`` phase is live. That is NOT achievable from a Pi
431 // extension on the version this targets (Pi v0.72/0.73), verified against
432 // the published type defs:
433 //
434 // 1. The only stream hook is ``message_update``, whose payload is
435 // ``assistantMessageEvent: AssistantMessageEvent`` (pi-ai types.d.ts).
436 // That union is closed to text/thinking/toolcall/start/done/error —
437 // there is no member that can carry a raw ``mcr-status`` frame.
438 // 2. There is no "raw SSE frame" / "stream event" hook anywhere on
439 // ``ExtensionAPI`` (pi-coding-agent extensions/types.d.ts): the event
440 // surface is session/agent/turn/message/tool lifecycle only.
441 // 3. The neuralwatt provider streams through pi-ai's openai-completions
442 // handler, which consumes the response via the official ``openai`` SDK
443 // (``client.chat.completions.create(...).withResponse()``). The SDK's
444 // decoder yields only typed ``chat.completion.chunk`` objects; any SSE
445 // frame with ``event: mcr-status`` is dropped by the SDK before pi-ai —
446 // and therefore the extension — can ever see it.
447 //
448 // So the extension genuinely cannot tell prefill from compaction. Claiming
449 // "optimizing context" would be a lie on the ~96% of turns where nothing is
450 // compacted (the churn driver in #3954). The honest behaviour is: neutral
451 // "working…" label, only after a grace window. Path A (real phase-driven
452 // chip) is blocked on an upstream Pi capability: a hook that surfaces raw
453 // provider SSE events (or an OpenAI-compat passthrough for unknown event
454 // types) to extensions. Track that as a Pi feature request; revisit here
455 // once it ships.
456 function markRequestSent(ctx: ExtensionContext) {
457 state.inFlightSince = Date.now();
458 if (state.inFlightTickerHandle !== null) {
459 clearInterval(state.inFlightTickerHandle);
460 }
461 state.inFlightTickerHandle = setInterval(() => {
462 // Defensive: stop ticking if the in-flight flag was cleared
463 // out-of-band (e.g. session_start reset).
464 if (state.inFlightSince === null) {
465 if (state.inFlightTickerHandle !== null) {
466 clearInterval(state.inFlightTickerHandle);
467 state.inFlightTickerHandle = null;
468 }
469 return;
470 }
471 updateStatusBar(ctx);
472 }, IN_FLIGHT_TICK_MS);
473 updateStatusBar(ctx);
474 }
475
476 function markStreamProducing(ctx: ExtensionContext) {
477 if (state.inFlightSince === null) return;
478 state.inFlightSince = null;
479 if (state.inFlightTickerHandle !== null) {
480 clearInterval(state.inFlightTickerHandle);
481 state.inFlightTickerHandle = null;
482 }
483 updateStatusBar(ctx);
484 }
485
486 // Upgrade X_NW_CONVERSATION_ID to Pi's stable session-id as early as
487 // possible — `session_start` fires before any provider request and is the
488 // earliest hook with ctx. Without this, the first request of each fresh
489 // `pi -p` invocation would carry the boot-time UUID (different per
490 // process) and produce a cold-cache session_fp that drags APC averages
491 // down on `--continue` chains.
492 pi.on("session_start", async (_event, ctx) => {
493 const { id: conversationId } = resolveConversationId(ctx);
494 process.env[CONV_ID_ENV] = conversationId;
495 });
496
497 pi.on("after_provider_response", async (event, ctx) => {
498 const modelId = ctx.model?.id || "";
499 if (!isMCRModel(modelId)) return;
500
501 const headers = event.headers as Record<string, string>;
502 const mcrFromHeaders = extractMCRFromHeaders(headers);
503
504 nwlog("after_provider_response", {
505 model: modelId,
506 header_fp: headers["x-mcr-session-fp"] ?? null,
507 header_safe_drop_before: headers["x-mcr-safe-drop-before"] ?? null,
508 header_stored_through: headers["x-mcr-stored-through"] ?? null,
509 parsed: mcrFromHeaders,
510 });
511
512 // M1.b ref-recovery observability (issue #3371 follow-up). The
513 // gateway's recent_ref_expansion path runs on the gateway->backend
514 // payload, which is invisible to Pi — without these headers in the
515 // log we can't tell "ref-recovery ran with no refs available" apart
516 // from "ref-recovery never ran". Counts only, no preview/sha values.
517 // Server-side emission: API_Gateway/app/services/mcr_anthropic_native_proxy.py::_add_mcr_refs_headers
518 const refsRecovered = parseOptionalIntHeader(headers, "x-mcr-refs-recovered");
519 const refsInForward = parseOptionalIntHeader(headers, "x-mcr-refs-in-forward");
520 const refsSkippedBudget = parseOptionalIntHeader(
521 headers,
522 "x-mcr-refs-skipped-budget",
523 );
524 const refsSkippedMissing = parseOptionalIntHeader(
525 headers,
526 "x-mcr-refs-skipped-missing",
527 );
528 const recoveryTokensAdded = parseOptionalIntHeader(
529 headers,
530 "x-mcr-recovery-tokens-added",
531 );
532 const manifestEntries = parseOptionalIntHeader(
533 headers,
534 "x-mcr-manifest-entries",
535 );
536
537 if (
538 refsRecovered !== null ||
539 refsInForward !== null ||
540 refsSkippedBudget !== null ||
541 refsSkippedMissing !== null ||
542 recoveryTokensAdded !== null ||
543 manifestEntries !== null
544 ) {
545 nwlog("mcr_refs", {
546 refs_recovered: refsRecovered,
547 refs_in_forward: refsInForward,
548 refs_skipped_budget: refsSkippedBudget,
549 refs_skipped_missing: refsSkippedMissing,
550 recovery_tokens_added: recoveryTokensAdded,
551 manifest_entries: manifestEntries,
552 });
553 }
554
555 if (mcrFromHeaders) {
556 state.sessionFp = mcrFromHeaders.session_fp;
557 state.safeDropBefore = mcrFromHeaders.safe_drop_before;
558 state.storedThrough = mcrFromHeaders.stored_through;
559 state.lastMcrMeta = mcrFromHeaders;
560 }
561
562 updateStatusBar(ctx);
563 });
564
565 // Clear the in-flight indicator on the first message_update for an MCR
566 // model — at that point real model tokens are flowing, the "is it hung?"
567 // perception is gone, and the chip should revert to the standard
568 // MCR-fingerprint view. The MessageUpdateEvent fires for any assistant
569 // streaming update (text/thinking/toolcall deltas); we don't need to narrow
570 // further since any of these prove the wait is over.
571 pi.on("message_update", async (event, ctx) => {
572 if (event.message.role !== "assistant") return;
573 if (!isMCRModel(ctx.model?.id || "")) return;
574 markStreamProducing(ctx);
575 });
576
577 pi.on("message_end", async (event, ctx) => {
578 if (event.message.role !== "assistant") return;
579 if (!isMCRModel(ctx.model?.id || "")) return;
580
581 // Backstop — if a response was short enough that no message_update ever
582 // fired, message_end is the latest possible chance to clear the indicator
583 // before it gets stale.
584 markStreamProducing(ctx);
585
586 const msg = event.message as Record<string, unknown>;
587
588 const mcrFromBody = extractMCRFromBody(msg);
589 if (mcrFromBody) {
590 nwlog("message_end_mcr_body", { parsed: mcrFromBody });
591 state.sessionFp = mcrFromBody.session_fp;
592 state.safeDropBefore = mcrFromBody.safe_drop_before;
593 state.storedThrough = mcrFromBody.stored_through;
594 state.lastMcrMeta = mcrFromBody;
595 }
596
597 const energy = extractEnergyFromBody(msg);
598 if (energy) {
599 nwlog("message_end_energy", {
600 energy_joules: energy.energy_joules,
601 cumulative_joules: state.totalEnergyJoules + energy.energy_joules,
602 session_turns: energy.mcr?.session_turns,
603 context_tokens: energy.mcr?.context_tokens,
604 compaction_triggered: energy.mcr?.compaction_triggered,
605 apc_hit_rate: energy.mcr?.apc_hit_rate,
606 mcr_compacted_tokens: energy.mcr?.mcr_compacted_tokens,
607 mcr_original_tokens: energy.mcr?.mcr_original_tokens,
608 });
609 state.totalEnergyJoules += energy.energy_joules;
610 state.lastEnergy = energy;
611 pi.appendEntry("neuralwatt-energy", { energy_joules: energy.energy_joules });
612 if (energy.mcr) {
613 state.sessionTurns = energy.mcr.session_turns;
614 state.contextTokens = energy.mcr.context_tokens;
615 }
616 }
617
618 updateStatusBar(ctx);
619 });
620
621 pi.on("context", async (event, ctx) => {
622 const modelId = ctx.model?.id || "";
623 const numMsgs = event.messages.length;
624
625 if (!state.sessionFp) {
626 nwlog("context_skip", {
627 reason: "no_session_fp",
628 model: modelId,
629 num_msgs: numMsgs,
630 });
631 return;
632 }
633 if (state.safeDropBefore <= 0) {
634 nwlog("context_skip", {
635 reason: "safe_drop_before_zero",
636 model: modelId,
637 num_msgs: numMsgs,
638 safe_drop_before: state.safeDropBefore,
639 session_fp: state.sessionFp,
640 });
641 return;
642 }
643 if (!isMCRModel(modelId)) {
644 nwlog("context_skip", {
645 reason: "not_mcr_model",
646 model: modelId,
647 num_msgs: numMsgs,
648 });
649 return;
650 }
651
652 const [dropStart, dropEnd] = computeDropRange(
653 event.messages as Array<{ type: string }>,
654 state.safeDropBefore,
655 );
656
657 if (dropEnd <= dropStart) {
658 nwlog("context_no_drop", {
659 reason: "empty_range",
660 drop_start: dropStart,
661 drop_end: dropEnd,
662 safe_drop_before: state.safeDropBefore,
663 num_msgs: numMsgs,
664 session_fp: state.sessionFp,
665 });
666 return;
667 }
668
669 // Drop messages in the full-index range [dropStart, dropEnd). Both
670 // bounds are in the same indexing space as `event.messages` and as the
671 // server's `safe_drop_before` (every message counted, all roles). The
672 // earlier version maintained a separate user+assistant-subset counter
673 // and compared it against full-index bounds, which silently nuked the
674 // most-recent user prompts (#bug discovered 2026-05-16 — see commit
675 // message for the trace).
676 const clampedEnd = Math.min(dropEnd, event.messages.length);
677 const filtered = event.messages.filter(
678 (_: unknown, i: number) => i < dropStart || i >= clampedEnd,
679 );
680 const droppedCount = numMsgs - filtered.length;
681
682 if (droppedCount === 0) {
683 nwlog("context_no_drop", {
684 reason: "no_indices_matched",
685 drop_start: dropStart,
686 drop_end: clampedEnd,
687 safe_drop_before: state.safeDropBefore,
688 num_msgs: numMsgs,
689 session_fp: state.sessionFp,
690 });
691 return;
692 }
693
694 nwlog("context_drop", {
695 drop_start: dropStart,
696 drop_end: clampedEnd,
697 safe_drop_before: state.safeDropBefore,
698 num_msgs_before: numMsgs,
699 num_msgs_after: filtered.length,
700 dropped: droppedCount,
701 session_fp: state.sessionFp,
702 });
703
704 return { messages: filtered };
705 });
706
707 pi.on("before_provider_request", async (event, ctx) => {
708 if (!isMCRModel(ctx.model?.id || "")) return;
709
710 const payload = event.payload as Record<string, unknown>;
711
712 // Start the in-flight indicator. It stays silent for IN_FLIGHT_GRACE_MS;
713 // only on a genuinely long wait does it surface a neutral
714 // "working… 12s" so the chat UI isn't indistinguishable from a hung
715 // model. It deliberately does NOT claim "optimizing context" — the
716 // extension can't observe whether compaction is happening (see
717 // ``markRequestSent`` for the Pi-API limitation that blocks the real
718 // phase signal).
719 markRequestSent(ctx);
720
721 // Upgrade the X_NW_CONVERSATION_ID env var (seeded with a UUID at
722 // extension load) to Pi's stable per-invocation session id. The SDK
723 // re-reads process.env on every stream, so this propagates to the
724 // next outbound request as the real X-NW-Conversation-ID HTTP header
725 // — no body touch. See the header-wiring block at the top of this fn
726 // and pi-header-surface.md for the full mechanism trace.
727 const { id: conversationId, source: conversationIdSource } =
728 resolveConversationId(ctx);
729 process.env[CONV_ID_ENV] = conversationId;
730 nwlog("conversation_id_attached", {
731 conversation_id_prefix: conversationId.slice(0, 8),
732 source: conversationIdSource,
733 });
734
735 // X-MCR-Session-FP fast-path hint: previously set via the same
736 // body-only mechanism that never reached the wire. Currently a no-op;
737 // diagnostic log retained so we still see when the gateway has
738 // assigned an fp. Revisit via registerProvider if we want it back.
739 if (state.sessionFp) {
740 nwlog("before_provider_request_fp_set", {
741 session_fp: state.sessionFp,
742 safe_drop_before: state.safeDropBefore,
743 });
744 }
745
746 // DEBUG (#3323 follow-up): capture the actual outbound payload shape
747 // so we can diagnose why the deployed pin-final-user-turn fix isn't
748 // firing for real Pi sessions. Logs shape only — no message content.
749 try {
750 const msgs = (payload.body as { messages?: Array<Record<string, unknown>> } | undefined)?.messages
751 ?? (payload.messages as Array<Record<string, unknown>> | undefined)
752 ?? [];
753 const roles = msgs.map((m) => String(m.role ?? m.type ?? "?"));
754 const lastN = msgs.slice(-5).map((m) => {
755 const role = String(m.role ?? m.type ?? "?");
756 const contentField = m.content;
757 let contentKind: string;
758 let contentLen: number;
759 if (typeof contentField === "string") {
760 contentKind = "string";
761 contentLen = contentField.length;
762 } else if (Array.isArray(contentField)) {
763 contentKind = "array[" + contentField.length + "]";
764 contentLen = contentField.reduce((acc, b) => acc + JSON.stringify(b).length, 0);
765 } else if (contentField === null || contentField === undefined) {
766 contentKind = "null";
767 contentLen = 0;
768 } else {
769 contentKind = typeof contentField;
770 contentLen = JSON.stringify(contentField).length;
771 }
772 const blockTypes = Array.isArray(contentField)
773 ? (contentField as Array<Record<string, unknown>>).map((b) => String(b.type ?? "?"))
774 : null;
775 return { role, contentKind, contentLen, blockTypes, hasToolCallId: "tool_call_id" in m };
776 });
777 const roleCounts: Record<string, number> = {};
778 for (const r of roles) roleCounts[r] = (roleCounts[r] || 0) + 1;
779 nwlog("outbound_payload_shape", {
780 session_fp: state.sessionFp,
781 num_messages: msgs.length,
782 role_distribution: roleCounts,
783 last_5_messages: lastN,
784 stream: Boolean((payload.body as { stream?: boolean } | undefined)?.stream ?? payload.stream),
785 payload_keys: Object.keys(payload),
786 });
787 } catch (err) {
788 nwlog("outbound_payload_shape_error", {
789 session_fp: state.sessionFp,
790 error: err instanceof Error ? err.message : String(err),
791 });
792 }
793 });
794
795 pi.on("session_before_compact", async (_event, ctx) => {
796 if (!isMCRModel(ctx.model?.id || "")) return;
797 if (state.sessionFp) {
798 nwlog("compaction_cancelled", { session_fp: state.sessionFp });
799 return { cancel: true };
800 }
801 });
802
803 pi.on("session_start", async (_event, ctx) => {
804 nwlog("session_start", { extension_version: EXTENSION_VERSION });
805 state.sessionFp = null;
806 state.safeDropBefore = 0;
807 state.storedThrough = 0;
808 state.totalEnergyJoules = 0;
809 state.sessionTurns = 0;
810 state.contextTokens = 0;
811 state.lastMcrMeta = null;
812 state.lastEnergy = null;
813 // Clear any in-flight indicator left over from a prior
814 // session (defensive — session_start would normally fire after any
815 // active request has completed, but a forced restart or fork can land
816 // here while inFlightSince is still set).
817 state.inFlightSince = null;
818 if (state.inFlightTickerHandle !== null) {
819 clearInterval(state.inFlightTickerHandle);
820 state.inFlightTickerHandle = null;
821 }
822 // Reset the UUID fallback so a new Pi session gets a fresh conversation
823 // id when getSessionId() isn't usable. The Pi session id itself rotates
824 // on its own.
825 uuidFallback = null;
826 ctx.ui.setStatus(MCR_STATUS_KEY, "");
827 ctx.ui.setStatus(ENERGY_STATUS_KEY, "");
828 });
829
830 pi.on("session_tree", async (_event, ctx) => {
831 // Branch navigation invalidates MCR session state — sessionFp and
832 // safeDropBefore are tied to a specific message sequence that no
833 // longer matches the new branch. Clear everything and let the
834 // next server response repopulate. Energy is replayed from the
835 // session log (same as the main provider's session_tree handler).
836 state.sessionFp = null;
837 state.safeDropBefore = 0;
838 state.storedThrough = 0;
839 state.totalEnergyJoules = 0;
840 state.sessionTurns = 0;
841 state.contextTokens = 0;
842 state.lastMcrMeta = null;
843 state.lastEnergy = null;
844
845 // Replay energy events from the session log for the new branch.
846 for (const entry of ctx.sessionManager.getBranch()) {
847 if (
848 entry.type === "custom" &&
849 entry.customType === "neuralwatt-energy" &&
850 typeof entry.data === "object" &&
851 entry.data
852 ) {
853 state.totalEnergyJoules += (entry.data as { energy_joules: number }).energy_joules || 0;
854 }
855 }
856
857 nwlog("session_tree", { total_energy_replayed: state.totalEnergyJoules });
858 updateStatusBar(ctx);
859 });
860
861 pi.on("session_shutdown", async (_event, ctx) => {
862 nwlog("session_shutdown", {
863 final_session_fp: state.sessionFp,
864 total_energy_joules: state.totalEnergyJoules,
865 session_turns: state.sessionTurns,
866 });
867 // Tear down the in-flight ticker so the interval handle
868 // doesn't outlive the session.
869 state.inFlightSince = null;
870 if (state.inFlightTickerHandle !== null) {
871 clearInterval(state.inFlightTickerHandle);
872 state.inFlightTickerHandle = null;
873 }
874 ctx.ui.setStatus(MCR_STATUS_KEY, "");
875 ctx.ui.setStatus(ENERGY_STATUS_KEY, "");
876 });
877}