@@ -0,0 +1,877 @@
+import type { ExtensionAPI, ExtensionContext } from "@mariozechner/pi-coding-agent";
+import * as fs from "node:fs";
+import * as path from "node:path";
+import * as os from "node:os";
+import { randomUUID } from "node:crypto";
+
+// Bump on any user-facing behaviour change. Surfaced in the extension log so a
+// session's behaviour can be tied to a specific revision when triaging reports.
+// 2.0.0 — honest in-flight chip (tools#33 / inference_frontend#3954):
+// neutral "working…" label, silent grace window, tightened MCR gating.
+// 2.1.0 — send the extension version on the wire as X-NW-MCR-Ext-Version so
+// the gateway can log which client revision served a request (server
+// logs previously had no way to tell a user's extension version).
+const EXTENSION_VERSION = "2.1.0";
+
+const MCR_ANCHOR_USER_MESSAGES = 3;
+
+const LOG_FILE = path.join(
+ os.homedir(),
+ ".pi",
+ "agent",
+ "extensions",
+ "neuralwatt-mcr.log",
+);
+
+function nwlog(event: string, data: Record<string, unknown> = {}): void {
+ try {
+ const line =
+ JSON.stringify({ ts: new Date().toISOString(), event, ...data }) + "\n";
+ fs.appendFileSync(LOG_FILE, line);
+ } catch {
+ // never break the extension on logging failure
+ }
+}
+
+interface MCRMetadata {
+ session_fp: string;
+ stored_through: number;
+ safe_drop_before: number;
+}
+
+interface EnergyData {
+ energy_joules: number;
+ mcr?: {
+ session_turns: number;
+ context_tokens: number;
+ compaction_triggered: boolean;
+ apc_hit_rate?: number;
+ mcr_compacted_tokens?: number;
+ mcr_original_tokens?: number;
+ };
+}
+
+interface SessionState {
+ sessionFp: string | null;
+ safeDropBefore: number;
+ storedThrough: number;
+ totalEnergyJoules: number;
+ sessionTurns: number;
+ contextTokens: number;
+ lastMcrMeta: MCRMetadata | null;
+ lastEnergy: EnergyData | null;
+ // In-flight request UX. When an MCR request is sent we record the wall-clock
+ // send time so the chip can surface a neutral "working…" indicator on long
+ // waits, and cleared on the first ``message_update`` / ``message_end``.
+ //
+ // HONESTY NOTE (tools#33 / inference_frontend#3954): this is a wall-clock
+ // proxy, NOT a real compaction signal. The extension cannot observe whether
+ // the gateway is actually compacting — see ``markRequestSent`` for why Pi's
+ // API doesn't surface the gateway's ``mcr-status`` SSE frames. So the chip
+ // must NOT claim "optimizing context"; it can only honestly say a request is
+ // in flight. We also stay silent for the first few seconds so ordinary turns
+ // (the ~96% with no compaction) show nothing at all.
+ inFlightSince: number | null;
+ inFlightTickerHandle: NodeJS.Timeout | null;
+}
+
+const state: SessionState = {
+ sessionFp: null,
+ safeDropBefore: 0,
+ storedThrough: 0,
+ totalEnergyJoules: 0,
+ sessionTurns: 0,
+ contextTokens: 0,
+ lastMcrMeta: null,
+ lastEnergy: null,
+ inFlightSince: null,
+ inFlightTickerHandle: null,
+};
+
+// How often to refresh the chip while the in-flight indicator is showing, so
+// the elapsed counter advances visibly. 500ms is fast enough to feel live but
+// well below the refresh budget of Pi's footer.
+const IN_FLIGHT_TICK_MS = 500;
+
+// Honesty grace window (tools#33 / inference_frontend#3954): suppress the
+// in-flight indicator for the first few seconds of every request. Large MCR
+// prompts (100k+ tokens) have a naturally long prefill/TTFT — 10-60s — on
+// EVERY turn, with no compaction happening on the ~96% that don't compact.
+// The old chip labelled that ordinary latency "optimizing context…" for the
+// whole window, which read as "MCR is making every prompt slow" and drove
+// churn. We can't tell prefill from compaction (no SSE phase signal reaches
+// the extension), so the only honest move is: say nothing until the wait is
+// long enough that the user genuinely wants reassurance the model isn't hung,
+// and even then use a neutral label that doesn't assert MCR is doing work.
+const IN_FLIGHT_GRACE_MS = 6000;
+
+// Recognise ONLY the MCR-backed aliases. The MCR pipeline (server-side
+// compaction + 1M virtual context + the X-MCR-* response protocol) runs only
+// for the `neuralwatt/…-long` aliases; the base-model and fast/flex IDs route
+// straight to the provider with no compaction.
+//
+// tools#33: the earlier predicate also matched `zai-org/`, `moonshotai/`,
+// `glm-5`, and `kimi-k2`, so every fast/flex alias and direct base-model call
+// lit the in-flight chip and ran the context-drop / fp handlers — none of which
+// apply off-MCR. Nico reported "optimizing context" on non-long models. The
+// client always selects the alias (never the forwarded base name), so matching
+// the alias shape is sufficient and correct. This narrows every call site
+// (chip gating, context-drop, fp-set, compaction-suppression) to MCR-only,
+// which is the intended behaviour for all of them.
+function isMCRModel(modelId: string): boolean {
+ return modelId.includes("neuralwatt/") || modelId.endsWith("-long");
+}
+
+function extractMCRFromHeaders(
+ headers: Record<string, string>,
+): MCRMetadata | null {
+ const fp = headers["x-mcr-session-fp"];
+ if (!fp) return null;
+ return {
+ session_fp: fp,
+ stored_through: parseInt(headers["x-mcr-stored-through"] || "0", 10),
+ safe_drop_before: parseInt(
+ headers["x-mcr-safe-drop-before"] || "0",
+ 10,
+ ),
+ };
+}
+
+// Parse a header value as an integer if present, else null. We keep null
+// distinct from 0 because:
+// * absent header -> gateway did not run the M1.b ref-recovery code path
+// (older deploy, non-MCR request, or path bailed before emitting)
+// * 0 -> ref-recovery ran but had nothing to surface
+// This distinction is the whole point of issue #3371 follow-up.
+function parseOptionalIntHeader(
+ headers: Record<string, string>,
+ name: string,
+): number | null {
+ const raw = headers[name];
+ if (raw === undefined || raw === null || raw === "") return null;
+ const n = parseInt(raw, 10);
+ return Number.isFinite(n) ? n : null;
+}
+
+function extractMCRFromBody(
+ body: Record<string, unknown>,
+): MCRMetadata | null {
+ const mcr = body.mcr as Record<string, unknown> | undefined;
+ if (!mcr || typeof mcr !== "object" || typeof mcr.session_fp !== "string")
+ return null;
+ return {
+ session_fp: mcr.session_fp as string,
+ stored_through:
+ typeof mcr.stored_through === "number" ? mcr.stored_through : 0,
+ safe_drop_before:
+ typeof mcr.safe_drop_before === "number" ? mcr.safe_drop_before : 0,
+ };
+}
+
+function extractEnergyFromBody(
+ body: Record<string, unknown>,
+): EnergyData | null {
+ const energy = body.energy as Record<string, unknown> | undefined;
+ if (!energy || typeof energy !== "object") return null;
+ const result: EnergyData = {
+ energy_joules:
+ typeof energy.energy_joules === "number" ? energy.energy_joules : 0,
+ };
+ if (energy.mcr && typeof energy.mcr === "object") {
+ const m = energy.mcr as Record<string, unknown>;
+ result.mcr = {
+ session_turns:
+ typeof m.session_turns === "number" ? m.session_turns : 0,
+ context_tokens:
+ typeof m.context_tokens === "number" ? m.context_tokens : 0,
+ compaction_triggered:
+ typeof m.compaction_triggered === "boolean"
+ ? m.compaction_triggered
+ : false,
+ apc_hit_rate:
+ typeof m.apc_hit_rate === "number" ? m.apc_hit_rate : undefined,
+ mcr_compacted_tokens:
+ typeof m.mcr_compacted_tokens === "number"
+ ? m.mcr_compacted_tokens
+ : undefined,
+ mcr_original_tokens:
+ typeof m.mcr_original_tokens === "number"
+ ? m.mcr_original_tokens
+ : undefined,
+ };
+ }
+ return result;
+}
+
+function formatEnergy(joules: number): string {
+ if (joules < 1) return `${(joules * 1000).toFixed(0)}mJ`;
+ if (joules < 1000) return `${joules.toFixed(1)}J`;
+ return `${(joules / 1000).toFixed(2)}kJ`;
+}
+
+// Render an in-flight elapsed-time stamp for the chip. Short formats (<10s →
+// "1.4s", >=10s → "12s", >=60s → "1m 5s") keep the chip from growing wide
+// enough to push other footer widgets off-screen.
+function formatElapsed(ms: number): string {
+ if (ms < 10000) return `${(ms / 1000).toFixed(1)}s`;
+ const seconds = Math.floor(ms / 1000);
+ if (seconds < 60) return `${seconds}s`;
+ const minutes = Math.floor(seconds / 60);
+ return `${minutes}m ${seconds % 60}s`;
+}
+
+// Extract the role/type marker from an entry. Pi's outbound HTTP payload
+// uses OpenAI-shape ``role`` (``user``/``assistant``/``tool``/``system``);
+// Pi's internal session-log records sometimes serialize the same field as
+// ``type``. Read both so this extension works regardless of which shape
+// the agent-runtime hands us via the ``context`` hook event.
+function entryRole(entry: { role?: string; type?: string }): string | undefined {
+ return entry.role ?? entry.type;
+}
+
+// Returns the FULL message index of the Nth user message (matching the
+// indexing space of the server's `safe_drop_before`, which counts every
+// message — user/assistant/tool/system — in send order). The earlier
+// version returned a user+assistant-subset index, which mixed index
+// spaces with `safe_drop_before` in `computeDropRange` and caused
+// `context_drop` to silently nuke the most-recent user prompts when the
+// upper bound exceeded the user+assistant subset size.
+function findAnchorFloor(
+ entries: Array<{ role?: string; type?: string }>,
+ nAnchors: number,
+): number {
+ let userCount = 0;
+ for (let i = 0; i < entries.length; i++) {
+ const role = entryRole(entries[i]);
+ if (role === "user") {
+ userCount++;
+ if (userCount === nAnchors) return i;
+ }
+ }
+ return -1;
+}
+
+// Server-side validation for X-NW-Conversation-ID (mirrors
+// services/mcr_v3_session.py::validate_client_conversation_id):
+//
+// * non-empty, ≤ 256 chars
+// * all code points printable (no control / whitespace beyond plain space)
+//
+// We sanitize here so a malformed Pi session id can never bounce off the
+// server as HTTP 400 — the worst case is we fall back to the in-process UUID.
+const MAX_CONVERSATION_ID_LEN = 256;
+
+function isWellFormedConversationId(value: string): boolean {
+ if (!value || value.length === 0 || value.length > MAX_CONVERSATION_ID_LEN) {
+ return false;
+ }
+ // Reject any control character (matches server's `not c.isprintable()` rule).
+ // We allow plain ASCII space (0x20) and everything ≥ 0x20 except 0x7F (DEL).
+ for (let i = 0; i < value.length; i++) {
+ const code = value.charCodeAt(i);
+ if (code < 0x20 || code === 0x7f) return false;
+ }
+ return true;
+}
+
+// Per-process fallback id. Generated lazily so it stays stable across
+// auto-compact within one `pi` invocation, and differs across invocations.
+let uuidFallback: string | null = null;
+function getUuidFallback(): string {
+ if (!uuidFallback) {
+ uuidFallback = randomUUID();
+ }
+ return uuidFallback;
+}
+
+type ConversationIdSource = "pi-session" | "uuid-fallback";
+
+function resolveConversationId(
+ ctx: ExtensionContext,
+): { id: string; source: ConversationIdSource } {
+ // Preferred source: Pi's own session id. Stable across auto-compact in a
+ // single `pi` session, distinct across sessions, naturally string-form.
+ try {
+ const piSessionId = ctx.sessionManager?.getSessionId?.();
+ if (
+ typeof piSessionId === "string" &&
+ isWellFormedConversationId(piSessionId)
+ ) {
+ return { id: piSessionId, source: "pi-session" };
+ }
+ } catch {
+ // fall through to uuid fallback
+ }
+ return { id: getUuidFallback(), source: "uuid-fallback" };
+}
+
+function computeDropRange(
+ entries: Array<{ role?: string; type?: string }>,
+ safeDropBefore: number,
+): [number, number] {
+ if (safeDropBefore <= 0) return [0, 0];
+ const anchorIdx = findAnchorFloor(entries, MCR_ANCHOR_USER_MESSAGES);
+ if (anchorIdx < 0) return [0, 0];
+ const dropStart = anchorIdx + 1;
+ const dropEnd = safeDropBefore;
+ if (dropEnd <= dropStart) return [0, 0];
+ return [dropStart, dropEnd];
+}
+
+export default function (pi: ExtensionAPI) {
+ const MCR_STATUS_KEY = "nw-mcr";
+ const ENERGY_STATUS_KEY = "nw-energy";
+
+ // ── Outbound header wiring (X-NW-Conversation-ID, X-NW-MCR-Ext-Version) ──
+ // pi-coding-agent's `before_provider_request` is a *body* hook — the
+ // earlier `payload.headers[...]` mutation reached extension memory only,
+ // never the HTTP wire. The documented per-request header path is
+ // `pi.registerProvider({ headers })`, whose values are env-var NAMES that
+ // the SDK re-reads from `process.env` on every stream
+ // (dist/core/resolve-config-value.js). Net: real HTTP headers, no body
+ // touch, no APC impact. Both headers below ride this same mechanism.
+ //
+ // Boot order: we seed the env var with a UUID at extension load so any
+ // request fired before the first `before_provider_request` tick still
+ // carries *some* id. The hook upgrades it to Pi's stable per-invocation
+ // session id (see resolveConversationId). Subsequent requests in the
+ // same `pi` invocation reuse the upgraded value — invocation-stable
+ // session_fp by construction.
+ const CONV_ID_ENV = "X_NW_CONVERSATION_ID";
+ if (!process.env[CONV_ID_ENV]) {
+ process.env[CONV_ID_ENV] = getUuidFallback();
+ }
+ // X-NW-MCR-Ext-Version (2.1.0): surface the client extension
+ // version on the wire so the gateway can log which revision served a
+ // request — server logs previously had no way to tell a user's version.
+ // Unlike the conversation id, the version is static for the life of the
+ // process (it never changes at runtime), so we seed it once at load and
+ // never touch it again — no upgrade-on-hook logic needed.
+ const EXT_VERSION_ENV = "X_NW_MCR_EXT_VERSION";
+ process.env[EXT_VERSION_ENV] = EXTENSION_VERSION;
+ // `apiKey` mirrors the env-var name from ~/.pi/agent/models.json so the
+ // partial config doesn't shadow the existing auth. `storeProviderRequestConfig`
+ // doesn't merge with the models.json-derived entry (different map), so we
+ // have to re-state any field we need; only apiKey here, since baseUrl/api
+ // flow through via the override-only branch of applyProviderConfig.
+ pi.registerProvider("neuralwatt", {
+ apiKey: "NEURALWATT_API_KEY",
+ headers: {
+ "X-NW-Conversation-ID": CONV_ID_ENV,
+ "X-NW-MCR-Ext-Version": EXT_VERSION_ENV,
+ },
+ });
+
+ function updateStatusBar(ctx: ExtensionContext) {
+ // In-flight indicator (tools#33 / inference_frontend#3954). HONESTY RULES:
+ //
+ // * The extension cannot observe whether the gateway is compacting (Pi
+ // doesn't surface the ``mcr-status`` SSE frames — see markRequestSent).
+ // So we NEVER claim "optimizing context"; we only ever say a request is
+ // in flight ("working…").
+ // * We stay completely silent for the first IN_FLIGHT_GRACE_MS so normal
+ // turns — including the long-but-uncompacted prefill that dominates real
+ // usage — show nothing. Only a genuinely long wait surfaces the neutral
+ // reassurance that the model hasn't hung.
+ // * Once real model output starts, ``markStreamProducing`` clears
+ // ``inFlightSince`` and the chip reverts to the standard MCR view.
+ const inFlightElapsedMs =
+ state.inFlightSince !== null ? Date.now() - state.inFlightSince : 0;
+ if (
+ state.inFlightSince !== null &&
+ inFlightElapsedMs >= IN_FLIGHT_GRACE_MS
+ ) {
+ const fpPrefix = state.sessionFp
+ ? `MCR ${state.sessionFp.slice(0, 8)} | `
+ : "MCR | ";
+ ctx.ui.setStatus(
+ MCR_STATUS_KEY,
+ `${fpPrefix}working… ${formatElapsed(inFlightElapsedMs)}`,
+ );
+ } else if (state.sessionFp) {
+ const parts: string[] = [`MCR ${state.sessionFp.slice(0, 8)}`];
+ if (state.safeDropBefore > 0) {
+ parts.push(`drop<${state.safeDropBefore}`);
+ }
+ ctx.ui.setStatus(MCR_STATUS_KEY, parts.join(" | "));
+ } else {
+ ctx.ui.setStatus(MCR_STATUS_KEY, "");
+ }
+
+ if (state.totalEnergyJoules > 0) {
+ const parts: string[] = [`⚡ ${formatEnergy(state.totalEnergyJoules)}`];
+ if (state.lastEnergy?.mcr) {
+ const m = state.lastEnergy.mcr;
+ if (m.apc_hit_rate !== undefined) {
+ parts.push(`APC ${(m.apc_hit_rate * 100).toFixed(0)}%`);
+ }
+ if (m.mcr_compacted_tokens && m.mcr_original_tokens) {
+ const ratio = m.mcr_compacted_tokens / m.mcr_original_tokens;
+ parts.push(`compact ${(ratio * 100).toFixed(0)}%`);
+ }
+ }
+ ctx.ui.setStatus(ENERGY_STATUS_KEY, parts.join(" | "));
+ } else {
+ ctx.ui.setStatus(ENERGY_STATUS_KEY, "");
+ }
+ }
+
+ // Lifecycle helpers for the in-flight indicator. We bracket the wait window
+ // with ``markRequestSent`` (on before_provider_request) and
+ // ``markStreamProducing`` (on the first message_update / message_end that
+ // carries real model output). The elapsed counter advances via a
+ // ``setInterval`` that re-calls ``updateStatusBar`` every ~0.5s.
+ //
+ // ── WHY THIS IS A NEUTRAL "working…" PROXY, NOT A REAL MCR PHASE SIGNAL ──
+ //
+ // The gateway (inference_frontend #3916) emits the ground truth as
+ // ``event: mcr-status`` SSE frames carrying {phase: compacting | warming |
+ // idle, elapsed_ms}. The ideal chip would show "optimizing context…" ONLY
+ // while a ``compacting`` phase is live. That is NOT achievable from a Pi
+ // extension on the version this targets (Pi v0.72/0.73), verified against
+ // the published type defs:
+ //
+ // 1. The only stream hook is ``message_update``, whose payload is
+ // ``assistantMessageEvent: AssistantMessageEvent`` (pi-ai types.d.ts).
+ // That union is closed to text/thinking/toolcall/start/done/error —
+ // there is no member that can carry a raw ``mcr-status`` frame.
+ // 2. There is no "raw SSE frame" / "stream event" hook anywhere on
+ // ``ExtensionAPI`` (pi-coding-agent extensions/types.d.ts): the event
+ // surface is session/agent/turn/message/tool lifecycle only.
+ // 3. The neuralwatt provider streams through pi-ai's openai-completions
+ // handler, which consumes the response via the official ``openai`` SDK
+ // (``client.chat.completions.create(...).withResponse()``). The SDK's
+ // decoder yields only typed ``chat.completion.chunk`` objects; any SSE
+ // frame with ``event: mcr-status`` is dropped by the SDK before pi-ai —
+ // and therefore the extension — can ever see it.
+ //
+ // So the extension genuinely cannot tell prefill from compaction. Claiming
+ // "optimizing context" would be a lie on the ~96% of turns where nothing is
+ // compacted (the churn driver in #3954). The honest behaviour is: neutral
+ // "working…" label, only after a grace window. Path A (real phase-driven
+ // chip) is blocked on an upstream Pi capability: a hook that surfaces raw
+ // provider SSE events (or an OpenAI-compat passthrough for unknown event
+ // types) to extensions. Track that as a Pi feature request; revisit here
+ // once it ships.
+ function markRequestSent(ctx: ExtensionContext) {
+ state.inFlightSince = Date.now();
+ if (state.inFlightTickerHandle !== null) {
+ clearInterval(state.inFlightTickerHandle);
+ }
+ state.inFlightTickerHandle = setInterval(() => {
+ // Defensive: stop ticking if the in-flight flag was cleared
+ // out-of-band (e.g. session_start reset).
+ if (state.inFlightSince === null) {
+ if (state.inFlightTickerHandle !== null) {
+ clearInterval(state.inFlightTickerHandle);
+ state.inFlightTickerHandle = null;
+ }
+ return;
+ }
+ updateStatusBar(ctx);
+ }, IN_FLIGHT_TICK_MS);
+ updateStatusBar(ctx);
+ }
+
+ function markStreamProducing(ctx: ExtensionContext) {
+ if (state.inFlightSince === null) return;
+ state.inFlightSince = null;
+ if (state.inFlightTickerHandle !== null) {
+ clearInterval(state.inFlightTickerHandle);
+ state.inFlightTickerHandle = null;
+ }
+ updateStatusBar(ctx);
+ }
+
+ // Upgrade X_NW_CONVERSATION_ID to Pi's stable session-id as early as
+ // possible — `session_start` fires before any provider request and is the
+ // earliest hook with ctx. Without this, the first request of each fresh
+ // `pi -p` invocation would carry the boot-time UUID (different per
+ // process) and produce a cold-cache session_fp that drags APC averages
+ // down on `--continue` chains.
+ pi.on("session_start", async (_event, ctx) => {
+ const { id: conversationId } = resolveConversationId(ctx);
+ process.env[CONV_ID_ENV] = conversationId;
+ });
+
+ pi.on("after_provider_response", async (event, ctx) => {
+ const modelId = ctx.model?.id || "";
+ if (!isMCRModel(modelId)) return;
+
+ const headers = event.headers as Record<string, string>;
+ const mcrFromHeaders = extractMCRFromHeaders(headers);
+
+ nwlog("after_provider_response", {
+ model: modelId,
+ header_fp: headers["x-mcr-session-fp"] ?? null,
+ header_safe_drop_before: headers["x-mcr-safe-drop-before"] ?? null,
+ header_stored_through: headers["x-mcr-stored-through"] ?? null,
+ parsed: mcrFromHeaders,
+ });
+
+ // M1.b ref-recovery observability (issue #3371 follow-up). The
+ // gateway's recent_ref_expansion path runs on the gateway->backend
+ // payload, which is invisible to Pi — without these headers in the
+ // log we can't tell "ref-recovery ran with no refs available" apart
+ // from "ref-recovery never ran". Counts only, no preview/sha values.
+ // Server-side emission: API_Gateway/app/services/mcr_anthropic_native_proxy.py::_add_mcr_refs_headers
+ const refsRecovered = parseOptionalIntHeader(headers, "x-mcr-refs-recovered");
+ const refsInForward = parseOptionalIntHeader(headers, "x-mcr-refs-in-forward");
+ const refsSkippedBudget = parseOptionalIntHeader(
+ headers,
+ "x-mcr-refs-skipped-budget",
+ );
+ const refsSkippedMissing = parseOptionalIntHeader(
+ headers,
+ "x-mcr-refs-skipped-missing",
+ );
+ const recoveryTokensAdded = parseOptionalIntHeader(
+ headers,
+ "x-mcr-recovery-tokens-added",
+ );
+ const manifestEntries = parseOptionalIntHeader(
+ headers,
+ "x-mcr-manifest-entries",
+ );
+
+ if (
+ refsRecovered !== null ||
+ refsInForward !== null ||
+ refsSkippedBudget !== null ||
+ refsSkippedMissing !== null ||
+ recoveryTokensAdded !== null ||
+ manifestEntries !== null
+ ) {
+ nwlog("mcr_refs", {
+ refs_recovered: refsRecovered,
+ refs_in_forward: refsInForward,
+ refs_skipped_budget: refsSkippedBudget,
+ refs_skipped_missing: refsSkippedMissing,
+ recovery_tokens_added: recoveryTokensAdded,
+ manifest_entries: manifestEntries,
+ });
+ }
+
+ if (mcrFromHeaders) {
+ state.sessionFp = mcrFromHeaders.session_fp;
+ state.safeDropBefore = mcrFromHeaders.safe_drop_before;
+ state.storedThrough = mcrFromHeaders.stored_through;
+ state.lastMcrMeta = mcrFromHeaders;
+ }
+
+ updateStatusBar(ctx);
+ });
+
+ // Clear the in-flight indicator on the first message_update for an MCR
+ // model — at that point real model tokens are flowing, the "is it hung?"
+ // perception is gone, and the chip should revert to the standard
+ // MCR-fingerprint view. The MessageUpdateEvent fires for any assistant
+ // streaming update (text/thinking/toolcall deltas); we don't need to narrow
+ // further since any of these prove the wait is over.
+ pi.on("message_update", async (event, ctx) => {
+ if (event.message.role !== "assistant") return;
+ if (!isMCRModel(ctx.model?.id || "")) return;
+ markStreamProducing(ctx);
+ });
+
+ pi.on("message_end", async (event, ctx) => {
+ if (event.message.role !== "assistant") return;
+ if (!isMCRModel(ctx.model?.id || "")) return;
+
+ // Backstop — if a response was short enough that no message_update ever
+ // fired, message_end is the latest possible chance to clear the indicator
+ // before it gets stale.
+ markStreamProducing(ctx);
+
+ const msg = event.message as Record<string, unknown>;
+
+ const mcrFromBody = extractMCRFromBody(msg);
+ if (mcrFromBody) {
+ nwlog("message_end_mcr_body", { parsed: mcrFromBody });
+ state.sessionFp = mcrFromBody.session_fp;
+ state.safeDropBefore = mcrFromBody.safe_drop_before;
+ state.storedThrough = mcrFromBody.stored_through;
+ state.lastMcrMeta = mcrFromBody;
+ }
+
+ const energy = extractEnergyFromBody(msg);
+ if (energy) {
+ nwlog("message_end_energy", {
+ energy_joules: energy.energy_joules,
+ cumulative_joules: state.totalEnergyJoules + energy.energy_joules,
+ session_turns: energy.mcr?.session_turns,
+ context_tokens: energy.mcr?.context_tokens,
+ compaction_triggered: energy.mcr?.compaction_triggered,
+ apc_hit_rate: energy.mcr?.apc_hit_rate,
+ mcr_compacted_tokens: energy.mcr?.mcr_compacted_tokens,
+ mcr_original_tokens: energy.mcr?.mcr_original_tokens,
+ });
+ state.totalEnergyJoules += energy.energy_joules;
+ state.lastEnergy = energy;
+ pi.appendEntry("neuralwatt-energy", { energy_joules: energy.energy_joules });
+ if (energy.mcr) {
+ state.sessionTurns = energy.mcr.session_turns;
+ state.contextTokens = energy.mcr.context_tokens;
+ }
+ }
+
+ updateStatusBar(ctx);
+ });
+
+ pi.on("context", async (event, ctx) => {
+ const modelId = ctx.model?.id || "";
+ const numMsgs = event.messages.length;
+
+ if (!state.sessionFp) {
+ nwlog("context_skip", {
+ reason: "no_session_fp",
+ model: modelId,
+ num_msgs: numMsgs,
+ });
+ return;
+ }
+ if (state.safeDropBefore <= 0) {
+ nwlog("context_skip", {
+ reason: "safe_drop_before_zero",
+ model: modelId,
+ num_msgs: numMsgs,
+ safe_drop_before: state.safeDropBefore,
+ session_fp: state.sessionFp,
+ });
+ return;
+ }
+ if (!isMCRModel(modelId)) {
+ nwlog("context_skip", {
+ reason: "not_mcr_model",
+ model: modelId,
+ num_msgs: numMsgs,
+ });
+ return;
+ }
+
+ const [dropStart, dropEnd] = computeDropRange(
+ event.messages as Array<{ type: string }>,
+ state.safeDropBefore,
+ );
+
+ if (dropEnd <= dropStart) {
+ nwlog("context_no_drop", {
+ reason: "empty_range",
+ drop_start: dropStart,
+ drop_end: dropEnd,
+ safe_drop_before: state.safeDropBefore,
+ num_msgs: numMsgs,
+ session_fp: state.sessionFp,
+ });
+ return;
+ }
+
+ // Drop messages in the full-index range [dropStart, dropEnd). Both
+ // bounds are in the same indexing space as `event.messages` and as the
+ // server's `safe_drop_before` (every message counted, all roles). The
+ // earlier version maintained a separate user+assistant-subset counter
+ // and compared it against full-index bounds, which silently nuked the
+ // most-recent user prompts (#bug discovered 2026-05-16 — see commit
+ // message for the trace).
+ const clampedEnd = Math.min(dropEnd, event.messages.length);
+ const filtered = event.messages.filter(
+ (_: unknown, i: number) => i < dropStart || i >= clampedEnd,
+ );
+ const droppedCount = numMsgs - filtered.length;
+
+ if (droppedCount === 0) {
+ nwlog("context_no_drop", {
+ reason: "no_indices_matched",
+ drop_start: dropStart,
+ drop_end: clampedEnd,
+ safe_drop_before: state.safeDropBefore,
+ num_msgs: numMsgs,
+ session_fp: state.sessionFp,
+ });
+ return;
+ }
+
+ nwlog("context_drop", {
+ drop_start: dropStart,
+ drop_end: clampedEnd,
+ safe_drop_before: state.safeDropBefore,
+ num_msgs_before: numMsgs,
+ num_msgs_after: filtered.length,
+ dropped: droppedCount,
+ session_fp: state.sessionFp,
+ });
+
+ return { messages: filtered };
+ });
+
+ pi.on("before_provider_request", async (event, ctx) => {
+ if (!isMCRModel(ctx.model?.id || "")) return;
+
+ const payload = event.payload as Record<string, unknown>;
+
+ // Start the in-flight indicator. It stays silent for IN_FLIGHT_GRACE_MS;
+ // only on a genuinely long wait does it surface a neutral
+ // "working… 12s" so the chat UI isn't indistinguishable from a hung
+ // model. It deliberately does NOT claim "optimizing context" — the
+ // extension can't observe whether compaction is happening (see
+ // ``markRequestSent`` for the Pi-API limitation that blocks the real
+ // phase signal).
+ markRequestSent(ctx);
+
+ // Upgrade the X_NW_CONVERSATION_ID env var (seeded with a UUID at
+ // extension load) to Pi's stable per-invocation session id. The SDK
+ // re-reads process.env on every stream, so this propagates to the
+ // next outbound request as the real X-NW-Conversation-ID HTTP header
+ // — no body touch. See the header-wiring block at the top of this fn
+ // and pi-header-surface.md for the full mechanism trace.
+ const { id: conversationId, source: conversationIdSource } =
+ resolveConversationId(ctx);
+ process.env[CONV_ID_ENV] = conversationId;
+ nwlog("conversation_id_attached", {
+ conversation_id_prefix: conversationId.slice(0, 8),
+ source: conversationIdSource,
+ });
+
+ // X-MCR-Session-FP fast-path hint: previously set via the same
+ // body-only mechanism that never reached the wire. Currently a no-op;
+ // diagnostic log retained so we still see when the gateway has
+ // assigned an fp. Revisit via registerProvider if we want it back.
+ if (state.sessionFp) {
+ nwlog("before_provider_request_fp_set", {
+ session_fp: state.sessionFp,
+ safe_drop_before: state.safeDropBefore,
+ });
+ }
+
+ // DEBUG (#3323 follow-up): capture the actual outbound payload shape
+ // so we can diagnose why the deployed pin-final-user-turn fix isn't
+ // firing for real Pi sessions. Logs shape only — no message content.
+ try {
+ const msgs = (payload.body as { messages?: Array<Record<string, unknown>> } | undefined)?.messages
+ ?? (payload.messages as Array<Record<string, unknown>> | undefined)
+ ?? [];
+ const roles = msgs.map((m) => String(m.role ?? m.type ?? "?"));
+ const lastN = msgs.slice(-5).map((m) => {
+ const role = String(m.role ?? m.type ?? "?");
+ const contentField = m.content;
+ let contentKind: string;
+ let contentLen: number;
+ if (typeof contentField === "string") {
+ contentKind = "string";
+ contentLen = contentField.length;
+ } else if (Array.isArray(contentField)) {
+ contentKind = "array[" + contentField.length + "]";
+ contentLen = contentField.reduce((acc, b) => acc + JSON.stringify(b).length, 0);
+ } else if (contentField === null || contentField === undefined) {
+ contentKind = "null";
+ contentLen = 0;
+ } else {
+ contentKind = typeof contentField;
+ contentLen = JSON.stringify(contentField).length;
+ }
+ const blockTypes = Array.isArray(contentField)
+ ? (contentField as Array<Record<string, unknown>>).map((b) => String(b.type ?? "?"))
+ : null;
+ return { role, contentKind, contentLen, blockTypes, hasToolCallId: "tool_call_id" in m };
+ });
+ const roleCounts: Record<string, number> = {};
+ for (const r of roles) roleCounts[r] = (roleCounts[r] || 0) + 1;
+ nwlog("outbound_payload_shape", {
+ session_fp: state.sessionFp,
+ num_messages: msgs.length,
+ role_distribution: roleCounts,
+ last_5_messages: lastN,
+ stream: Boolean((payload.body as { stream?: boolean } | undefined)?.stream ?? payload.stream),
+ payload_keys: Object.keys(payload),
+ });
+ } catch (err) {
+ nwlog("outbound_payload_shape_error", {
+ session_fp: state.sessionFp,
+ error: err instanceof Error ? err.message : String(err),
+ });
+ }
+ });
+
+ pi.on("session_before_compact", async (_event, ctx) => {
+ if (!isMCRModel(ctx.model?.id || "")) return;
+ if (state.sessionFp) {
+ nwlog("compaction_cancelled", { session_fp: state.sessionFp });
+ return { cancel: true };
+ }
+ });
+
+ pi.on("session_start", async (_event, ctx) => {
+ nwlog("session_start", { extension_version: EXTENSION_VERSION });
+ state.sessionFp = null;
+ state.safeDropBefore = 0;
+ state.storedThrough = 0;
+ state.totalEnergyJoules = 0;
+ state.sessionTurns = 0;
+ state.contextTokens = 0;
+ state.lastMcrMeta = null;
+ state.lastEnergy = null;
+ // Clear any in-flight indicator left over from a prior
+ // session (defensive — session_start would normally fire after any
+ // active request has completed, but a forced restart or fork can land
+ // here while inFlightSince is still set).
+ state.inFlightSince = null;
+ if (state.inFlightTickerHandle !== null) {
+ clearInterval(state.inFlightTickerHandle);
+ state.inFlightTickerHandle = null;
+ }
+ // Reset the UUID fallback so a new Pi session gets a fresh conversation
+ // id when getSessionId() isn't usable. The Pi session id itself rotates
+ // on its own.
+ uuidFallback = null;
+ ctx.ui.setStatus(MCR_STATUS_KEY, "");
+ ctx.ui.setStatus(ENERGY_STATUS_KEY, "");
+ });
+
+ pi.on("session_tree", async (_event, ctx) => {
+ // Branch navigation invalidates MCR session state — sessionFp and
+ // safeDropBefore are tied to a specific message sequence that no
+ // longer matches the new branch. Clear everything and let the
+ // next server response repopulate. Energy is replayed from the
+ // session log (same as the main provider's session_tree handler).
+ state.sessionFp = null;
+ state.safeDropBefore = 0;
+ state.storedThrough = 0;
+ state.totalEnergyJoules = 0;
+ state.sessionTurns = 0;
+ state.contextTokens = 0;
+ state.lastMcrMeta = null;
+ state.lastEnergy = null;
+
+ // Replay energy events from the session log for the new branch.
+ for (const entry of ctx.sessionManager.getBranch()) {
+ if (
+ entry.type === "custom" &&
+ entry.customType === "neuralwatt-energy" &&
+ typeof entry.data === "object" &&
+ entry.data
+ ) {
+ state.totalEnergyJoules += (entry.data as { energy_joules: number }).energy_joules || 0;
+ }
+ }
+
+ nwlog("session_tree", { total_energy_replayed: state.totalEnergyJoules });
+ updateStatusBar(ctx);
+ });
+
+ pi.on("session_shutdown", async (_event, ctx) => {
+ nwlog("session_shutdown", {
+ final_session_fp: state.sessionFp,
+ total_energy_joules: state.totalEnergyJoules,
+ session_turns: state.sessionTurns,
+ });
+ // Tear down the in-flight ticker so the interval handle
+ // doesn't outlive the session.
+ state.inFlightSince = null;
+ if (state.inFlightTickerHandle !== null) {
+ clearInterval(state.inFlightTickerHandle);
+ state.inFlightTickerHandle = null;
+ }
+ ctx.ui.setStatus(MCR_STATUS_KEY, "");
+ ctx.ui.setStatus(ENERGY_STATUS_KEY, "");
+ });
+}