pi: add NW MCR, update config

Amolith created

Change summary

dot_config/pi/extensions/neuralwatt-mcr.ts | 877 ++++++++++++++++++++++++
dot_config/pi/models.json                  |  48 +
dot_config/pi/settings.json                |   6 
3 files changed, 928 insertions(+), 3 deletions(-)

Detailed changes

dot_config/pi/extensions/neuralwatt-mcr.ts 🔗

@@ -0,0 +1,877 @@
+import type { ExtensionAPI, ExtensionContext } from "@mariozechner/pi-coding-agent";
+import * as fs from "node:fs";
+import * as path from "node:path";
+import * as os from "node:os";
+import { randomUUID } from "node:crypto";
+
+// Bump on any user-facing behaviour change. Surfaced in the extension log so a
+// session's behaviour can be tied to a specific revision when triaging reports.
+//   2.0.0 — honest in-flight chip (tools#33 / inference_frontend#3954):
+//           neutral "working…" label, silent grace window, tightened MCR gating.
+//   2.1.0 — send the extension version on the wire as X-NW-MCR-Ext-Version so
+//           the gateway can log which client revision served a request (server
+//           logs previously had no way to tell a user's extension version).
+const EXTENSION_VERSION = "2.1.0";
+
+const MCR_ANCHOR_USER_MESSAGES = 3;
+
+const LOG_FILE = path.join(
+  os.homedir(),
+  ".pi",
+  "agent",
+  "extensions",
+  "neuralwatt-mcr.log",
+);
+
+function nwlog(event: string, data: Record<string, unknown> = {}): void {
+  try {
+    const line =
+      JSON.stringify({ ts: new Date().toISOString(), event, ...data }) + "\n";
+    fs.appendFileSync(LOG_FILE, line);
+  } catch {
+    // never break the extension on logging failure
+  }
+}
+
+interface MCRMetadata {
+  session_fp: string;
+  stored_through: number;
+  safe_drop_before: number;
+}
+
+interface EnergyData {
+  energy_joules: number;
+  mcr?: {
+    session_turns: number;
+    context_tokens: number;
+    compaction_triggered: boolean;
+    apc_hit_rate?: number;
+    mcr_compacted_tokens?: number;
+    mcr_original_tokens?: number;
+  };
+}
+
+interface SessionState {
+  sessionFp: string | null;
+  safeDropBefore: number;
+  storedThrough: number;
+  totalEnergyJoules: number;
+  sessionTurns: number;
+  contextTokens: number;
+  lastMcrMeta: MCRMetadata | null;
+  lastEnergy: EnergyData | null;
+  // In-flight request UX. When an MCR request is sent we record the wall-clock
+  // send time so the chip can surface a neutral "working…" indicator on long
+  // waits, and cleared on the first ``message_update`` / ``message_end``.
+  //
+  // HONESTY NOTE (tools#33 / inference_frontend#3954): this is a wall-clock
+  // proxy, NOT a real compaction signal. The extension cannot observe whether
+  // the gateway is actually compacting — see ``markRequestSent`` for why Pi's
+  // API doesn't surface the gateway's ``mcr-status`` SSE frames. So the chip
+  // must NOT claim "optimizing context"; it can only honestly say a request is
+  // in flight. We also stay silent for the first few seconds so ordinary turns
+  // (the ~96% with no compaction) show nothing at all.
+  inFlightSince: number | null;
+  inFlightTickerHandle: NodeJS.Timeout | null;
+}
+
+const state: SessionState = {
+  sessionFp: null,
+  safeDropBefore: 0,
+  storedThrough: 0,
+  totalEnergyJoules: 0,
+  sessionTurns: 0,
+  contextTokens: 0,
+  lastMcrMeta: null,
+  lastEnergy: null,
+  inFlightSince: null,
+  inFlightTickerHandle: null,
+};
+
+// How often to refresh the chip while the in-flight indicator is showing, so
+// the elapsed counter advances visibly. 500ms is fast enough to feel live but
+// well below the refresh budget of Pi's footer.
+const IN_FLIGHT_TICK_MS = 500;
+
+// Honesty grace window (tools#33 / inference_frontend#3954): suppress the
+// in-flight indicator for the first few seconds of every request. Large MCR
+// prompts (100k+ tokens) have a naturally long prefill/TTFT — 10-60s — on
+// EVERY turn, with no compaction happening on the ~96% that don't compact.
+// The old chip labelled that ordinary latency "optimizing context…" for the
+// whole window, which read as "MCR is making every prompt slow" and drove
+// churn. We can't tell prefill from compaction (no SSE phase signal reaches
+// the extension), so the only honest move is: say nothing until the wait is
+// long enough that the user genuinely wants reassurance the model isn't hung,
+// and even then use a neutral label that doesn't assert MCR is doing work.
+const IN_FLIGHT_GRACE_MS = 6000;
+
+// Recognise ONLY the MCR-backed aliases. The MCR pipeline (server-side
+// compaction + 1M virtual context + the X-MCR-* response protocol) runs only
+// for the `neuralwatt/…-long` aliases; the base-model and fast/flex IDs route
+// straight to the provider with no compaction.
+//
+// tools#33: the earlier predicate also matched `zai-org/`, `moonshotai/`,
+// `glm-5`, and `kimi-k2`, so every fast/flex alias and direct base-model call
+// lit the in-flight chip and ran the context-drop / fp handlers — none of which
+// apply off-MCR. Nico reported "optimizing context" on non-long models. The
+// client always selects the alias (never the forwarded base name), so matching
+// the alias shape is sufficient and correct. This narrows every call site
+// (chip gating, context-drop, fp-set, compaction-suppression) to MCR-only,
+// which is the intended behaviour for all of them.
+function isMCRModel(modelId: string): boolean {
+  return modelId.includes("neuralwatt/") || modelId.endsWith("-long");
+}
+
+function extractMCRFromHeaders(
+  headers: Record<string, string>,
+): MCRMetadata | null {
+  const fp = headers["x-mcr-session-fp"];
+  if (!fp) return null;
+  return {
+    session_fp: fp,
+    stored_through: parseInt(headers["x-mcr-stored-through"] || "0", 10),
+    safe_drop_before: parseInt(
+      headers["x-mcr-safe-drop-before"] || "0",
+      10,
+    ),
+  };
+}
+
+// Parse a header value as an integer if present, else null. We keep null
+// distinct from 0 because:
+//   * absent header  -> gateway did not run the M1.b ref-recovery code path
+//     (older deploy, non-MCR request, or path bailed before emitting)
+//   * 0              -> ref-recovery ran but had nothing to surface
+// This distinction is the whole point of issue #3371 follow-up.
+function parseOptionalIntHeader(
+  headers: Record<string, string>,
+  name: string,
+): number | null {
+  const raw = headers[name];
+  if (raw === undefined || raw === null || raw === "") return null;
+  const n = parseInt(raw, 10);
+  return Number.isFinite(n) ? n : null;
+}
+
+function extractMCRFromBody(
+  body: Record<string, unknown>,
+): MCRMetadata | null {
+  const mcr = body.mcr as Record<string, unknown> | undefined;
+  if (!mcr || typeof mcr !== "object" || typeof mcr.session_fp !== "string")
+    return null;
+  return {
+    session_fp: mcr.session_fp as string,
+    stored_through:
+      typeof mcr.stored_through === "number" ? mcr.stored_through : 0,
+    safe_drop_before:
+      typeof mcr.safe_drop_before === "number" ? mcr.safe_drop_before : 0,
+  };
+}
+
+function extractEnergyFromBody(
+  body: Record<string, unknown>,
+): EnergyData | null {
+  const energy = body.energy as Record<string, unknown> | undefined;
+  if (!energy || typeof energy !== "object") return null;
+  const result: EnergyData = {
+    energy_joules:
+      typeof energy.energy_joules === "number" ? energy.energy_joules : 0,
+  };
+  if (energy.mcr && typeof energy.mcr === "object") {
+    const m = energy.mcr as Record<string, unknown>;
+    result.mcr = {
+      session_turns:
+        typeof m.session_turns === "number" ? m.session_turns : 0,
+      context_tokens:
+        typeof m.context_tokens === "number" ? m.context_tokens : 0,
+      compaction_triggered:
+        typeof m.compaction_triggered === "boolean"
+          ? m.compaction_triggered
+          : false,
+      apc_hit_rate:
+        typeof m.apc_hit_rate === "number" ? m.apc_hit_rate : undefined,
+      mcr_compacted_tokens:
+        typeof m.mcr_compacted_tokens === "number"
+          ? m.mcr_compacted_tokens
+          : undefined,
+      mcr_original_tokens:
+        typeof m.mcr_original_tokens === "number"
+          ? m.mcr_original_tokens
+          : undefined,
+    };
+  }
+  return result;
+}
+
+function formatEnergy(joules: number): string {
+  if (joules < 1) return `${(joules * 1000).toFixed(0)}mJ`;
+  if (joules < 1000) return `${joules.toFixed(1)}J`;
+  return `${(joules / 1000).toFixed(2)}kJ`;
+}
+
+// Render an in-flight elapsed-time stamp for the chip. Short formats (<10s →
+// "1.4s", >=10s → "12s", >=60s → "1m 5s") keep the chip from growing wide
+// enough to push other footer widgets off-screen.
+function formatElapsed(ms: number): string {
+  if (ms < 10000) return `${(ms / 1000).toFixed(1)}s`;
+  const seconds = Math.floor(ms / 1000);
+  if (seconds < 60) return `${seconds}s`;
+  const minutes = Math.floor(seconds / 60);
+  return `${minutes}m ${seconds % 60}s`;
+}
+
+// Extract the role/type marker from an entry. Pi's outbound HTTP payload
+// uses OpenAI-shape ``role`` (``user``/``assistant``/``tool``/``system``);
+// Pi's internal session-log records sometimes serialize the same field as
+// ``type``. Read both so this extension works regardless of which shape
+// the agent-runtime hands us via the ``context`` hook event.
+function entryRole(entry: { role?: string; type?: string }): string | undefined {
+  return entry.role ?? entry.type;
+}
+
+// Returns the FULL message index of the Nth user message (matching the
+// indexing space of the server's `safe_drop_before`, which counts every
+// message — user/assistant/tool/system — in send order). The earlier
+// version returned a user+assistant-subset index, which mixed index
+// spaces with `safe_drop_before` in `computeDropRange` and caused
+// `context_drop` to silently nuke the most-recent user prompts when the
+// upper bound exceeded the user+assistant subset size.
+function findAnchorFloor(
+  entries: Array<{ role?: string; type?: string }>,
+  nAnchors: number,
+): number {
+  let userCount = 0;
+  for (let i = 0; i < entries.length; i++) {
+    const role = entryRole(entries[i]);
+    if (role === "user") {
+      userCount++;
+      if (userCount === nAnchors) return i;
+    }
+  }
+  return -1;
+}
+
+// Server-side validation for X-NW-Conversation-ID (mirrors
+// services/mcr_v3_session.py::validate_client_conversation_id):
+//
+//   * non-empty, ≤ 256 chars
+//   * all code points printable (no control / whitespace beyond plain space)
+//
+// We sanitize here so a malformed Pi session id can never bounce off the
+// server as HTTP 400 — the worst case is we fall back to the in-process UUID.
+const MAX_CONVERSATION_ID_LEN = 256;
+
+function isWellFormedConversationId(value: string): boolean {
+  if (!value || value.length === 0 || value.length > MAX_CONVERSATION_ID_LEN) {
+    return false;
+  }
+  // Reject any control character (matches server's `not c.isprintable()` rule).
+  // We allow plain ASCII space (0x20) and everything ≥ 0x20 except 0x7F (DEL).
+  for (let i = 0; i < value.length; i++) {
+    const code = value.charCodeAt(i);
+    if (code < 0x20 || code === 0x7f) return false;
+  }
+  return true;
+}
+
+// Per-process fallback id. Generated lazily so it stays stable across
+// auto-compact within one `pi` invocation, and differs across invocations.
+let uuidFallback: string | null = null;
+function getUuidFallback(): string {
+  if (!uuidFallback) {
+    uuidFallback = randomUUID();
+  }
+  return uuidFallback;
+}
+
+type ConversationIdSource = "pi-session" | "uuid-fallback";
+
+function resolveConversationId(
+  ctx: ExtensionContext,
+): { id: string; source: ConversationIdSource } {
+  // Preferred source: Pi's own session id. Stable across auto-compact in a
+  // single `pi` session, distinct across sessions, naturally string-form.
+  try {
+    const piSessionId = ctx.sessionManager?.getSessionId?.();
+    if (
+      typeof piSessionId === "string" &&
+      isWellFormedConversationId(piSessionId)
+    ) {
+      return { id: piSessionId, source: "pi-session" };
+    }
+  } catch {
+    // fall through to uuid fallback
+  }
+  return { id: getUuidFallback(), source: "uuid-fallback" };
+}
+
+function computeDropRange(
+  entries: Array<{ role?: string; type?: string }>,
+  safeDropBefore: number,
+): [number, number] {
+  if (safeDropBefore <= 0) return [0, 0];
+  const anchorIdx = findAnchorFloor(entries, MCR_ANCHOR_USER_MESSAGES);
+  if (anchorIdx < 0) return [0, 0];
+  const dropStart = anchorIdx + 1;
+  const dropEnd = safeDropBefore;
+  if (dropEnd <= dropStart) return [0, 0];
+  return [dropStart, dropEnd];
+}
+
+export default function (pi: ExtensionAPI) {
+  const MCR_STATUS_KEY = "nw-mcr";
+  const ENERGY_STATUS_KEY = "nw-energy";
+
+  // ── Outbound header wiring (X-NW-Conversation-ID, X-NW-MCR-Ext-Version) ──
+  // pi-coding-agent's `before_provider_request` is a *body* hook — the
+  // earlier `payload.headers[...]` mutation reached extension memory only,
+  // never the HTTP wire. The documented per-request header path is
+  // `pi.registerProvider({ headers })`, whose values are env-var NAMES that
+  // the SDK re-reads from `process.env` on every stream
+  // (dist/core/resolve-config-value.js). Net: real HTTP headers, no body
+  // touch, no APC impact. Both headers below ride this same mechanism.
+  //
+  // Boot order: we seed the env var with a UUID at extension load so any
+  // request fired before the first `before_provider_request` tick still
+  // carries *some* id. The hook upgrades it to Pi's stable per-invocation
+  // session id (see resolveConversationId). Subsequent requests in the
+  // same `pi` invocation reuse the upgraded value — invocation-stable
+  // session_fp by construction.
+  const CONV_ID_ENV = "X_NW_CONVERSATION_ID";
+  if (!process.env[CONV_ID_ENV]) {
+    process.env[CONV_ID_ENV] = getUuidFallback();
+  }
+  // X-NW-MCR-Ext-Version (2.1.0): surface the client extension
+  // version on the wire so the gateway can log which revision served a
+  // request — server logs previously had no way to tell a user's version.
+  // Unlike the conversation id, the version is static for the life of the
+  // process (it never changes at runtime), so we seed it once at load and
+  // never touch it again — no upgrade-on-hook logic needed.
+  const EXT_VERSION_ENV = "X_NW_MCR_EXT_VERSION";
+  process.env[EXT_VERSION_ENV] = EXTENSION_VERSION;
+  // `apiKey` mirrors the env-var name from ~/.pi/agent/models.json so the
+  // partial config doesn't shadow the existing auth. `storeProviderRequestConfig`
+  // doesn't merge with the models.json-derived entry (different map), so we
+  // have to re-state any field we need; only apiKey here, since baseUrl/api
+  // flow through via the override-only branch of applyProviderConfig.
+  pi.registerProvider("neuralwatt", {
+    apiKey: "NEURALWATT_API_KEY",
+    headers: {
+      "X-NW-Conversation-ID": CONV_ID_ENV,
+      "X-NW-MCR-Ext-Version": EXT_VERSION_ENV,
+    },
+  });
+
+  function updateStatusBar(ctx: ExtensionContext) {
+    // In-flight indicator (tools#33 / inference_frontend#3954). HONESTY RULES:
+    //
+    //  * The extension cannot observe whether the gateway is compacting (Pi
+    //    doesn't surface the ``mcr-status`` SSE frames — see markRequestSent).
+    //    So we NEVER claim "optimizing context"; we only ever say a request is
+    //    in flight ("working…").
+    //  * We stay completely silent for the first IN_FLIGHT_GRACE_MS so normal
+    //    turns — including the long-but-uncompacted prefill that dominates real
+    //    usage — show nothing. Only a genuinely long wait surfaces the neutral
+    //    reassurance that the model hasn't hung.
+    //  * Once real model output starts, ``markStreamProducing`` clears
+    //    ``inFlightSince`` and the chip reverts to the standard MCR view.
+    const inFlightElapsedMs =
+      state.inFlightSince !== null ? Date.now() - state.inFlightSince : 0;
+    if (
+      state.inFlightSince !== null &&
+      inFlightElapsedMs >= IN_FLIGHT_GRACE_MS
+    ) {
+      const fpPrefix = state.sessionFp
+        ? `MCR ${state.sessionFp.slice(0, 8)} | `
+        : "MCR | ";
+      ctx.ui.setStatus(
+        MCR_STATUS_KEY,
+        `${fpPrefix}working… ${formatElapsed(inFlightElapsedMs)}`,
+      );
+    } else if (state.sessionFp) {
+      const parts: string[] = [`MCR ${state.sessionFp.slice(0, 8)}`];
+      if (state.safeDropBefore > 0) {
+        parts.push(`drop<${state.safeDropBefore}`);
+      }
+      ctx.ui.setStatus(MCR_STATUS_KEY, parts.join(" | "));
+    } else {
+      ctx.ui.setStatus(MCR_STATUS_KEY, "");
+    }
+
+    if (state.totalEnergyJoules > 0) {
+      const parts: string[] = [`⚡ ${formatEnergy(state.totalEnergyJoules)}`];
+      if (state.lastEnergy?.mcr) {
+        const m = state.lastEnergy.mcr;
+        if (m.apc_hit_rate !== undefined) {
+          parts.push(`APC ${(m.apc_hit_rate * 100).toFixed(0)}%`);
+        }
+        if (m.mcr_compacted_tokens && m.mcr_original_tokens) {
+          const ratio = m.mcr_compacted_tokens / m.mcr_original_tokens;
+          parts.push(`compact ${(ratio * 100).toFixed(0)}%`);
+        }
+      }
+      ctx.ui.setStatus(ENERGY_STATUS_KEY, parts.join(" | "));
+    } else {
+      ctx.ui.setStatus(ENERGY_STATUS_KEY, "");
+    }
+  }
+
+  // Lifecycle helpers for the in-flight indicator. We bracket the wait window
+  // with ``markRequestSent`` (on before_provider_request) and
+  // ``markStreamProducing`` (on the first message_update / message_end that
+  // carries real model output). The elapsed counter advances via a
+  // ``setInterval`` that re-calls ``updateStatusBar`` every ~0.5s.
+  //
+  // ── WHY THIS IS A NEUTRAL "working…" PROXY, NOT A REAL MCR PHASE SIGNAL ──
+  //
+  // The gateway (inference_frontend #3916) emits the ground truth as
+  // ``event: mcr-status`` SSE frames carrying {phase: compacting | warming |
+  // idle, elapsed_ms}. The ideal chip would show "optimizing context…" ONLY
+  // while a ``compacting`` phase is live. That is NOT achievable from a Pi
+  // extension on the version this targets (Pi v0.72/0.73), verified against
+  // the published type defs:
+  //
+  //   1. The only stream hook is ``message_update``, whose payload is
+  //      ``assistantMessageEvent: AssistantMessageEvent`` (pi-ai types.d.ts).
+  //      That union is closed to text/thinking/toolcall/start/done/error —
+  //      there is no member that can carry a raw ``mcr-status`` frame.
+  //   2. There is no "raw SSE frame" / "stream event" hook anywhere on
+  //      ``ExtensionAPI`` (pi-coding-agent extensions/types.d.ts): the event
+  //      surface is session/agent/turn/message/tool lifecycle only.
+  //   3. The neuralwatt provider streams through pi-ai's openai-completions
+  //      handler, which consumes the response via the official ``openai`` SDK
+  //      (``client.chat.completions.create(...).withResponse()``). The SDK's
+  //      decoder yields only typed ``chat.completion.chunk`` objects; any SSE
+  //      frame with ``event: mcr-status`` is dropped by the SDK before pi-ai —
+  //      and therefore the extension — can ever see it.
+  //
+  // So the extension genuinely cannot tell prefill from compaction. Claiming
+  // "optimizing context" would be a lie on the ~96% of turns where nothing is
+  // compacted (the churn driver in #3954). The honest behaviour is: neutral
+  // "working…" label, only after a grace window. Path A (real phase-driven
+  // chip) is blocked on an upstream Pi capability: a hook that surfaces raw
+  // provider SSE events (or an OpenAI-compat passthrough for unknown event
+  // types) to extensions. Track that as a Pi feature request; revisit here
+  // once it ships.
+  function markRequestSent(ctx: ExtensionContext) {
+    state.inFlightSince = Date.now();
+    if (state.inFlightTickerHandle !== null) {
+      clearInterval(state.inFlightTickerHandle);
+    }
+    state.inFlightTickerHandle = setInterval(() => {
+      // Defensive: stop ticking if the in-flight flag was cleared
+      // out-of-band (e.g. session_start reset).
+      if (state.inFlightSince === null) {
+        if (state.inFlightTickerHandle !== null) {
+          clearInterval(state.inFlightTickerHandle);
+          state.inFlightTickerHandle = null;
+        }
+        return;
+      }
+      updateStatusBar(ctx);
+    }, IN_FLIGHT_TICK_MS);
+    updateStatusBar(ctx);
+  }
+
+  function markStreamProducing(ctx: ExtensionContext) {
+    if (state.inFlightSince === null) return;
+    state.inFlightSince = null;
+    if (state.inFlightTickerHandle !== null) {
+      clearInterval(state.inFlightTickerHandle);
+      state.inFlightTickerHandle = null;
+    }
+    updateStatusBar(ctx);
+  }
+
+  // Upgrade X_NW_CONVERSATION_ID to Pi's stable session-id as early as
+  // possible — `session_start` fires before any provider request and is the
+  // earliest hook with ctx. Without this, the first request of each fresh
+  // `pi -p` invocation would carry the boot-time UUID (different per
+  // process) and produce a cold-cache session_fp that drags APC averages
+  // down on `--continue` chains.
+  pi.on("session_start", async (_event, ctx) => {
+    const { id: conversationId } = resolveConversationId(ctx);
+    process.env[CONV_ID_ENV] = conversationId;
+  });
+
+  pi.on("after_provider_response", async (event, ctx) => {
+    const modelId = ctx.model?.id || "";
+    if (!isMCRModel(modelId)) return;
+
+    const headers = event.headers as Record<string, string>;
+    const mcrFromHeaders = extractMCRFromHeaders(headers);
+
+    nwlog("after_provider_response", {
+      model: modelId,
+      header_fp: headers["x-mcr-session-fp"] ?? null,
+      header_safe_drop_before: headers["x-mcr-safe-drop-before"] ?? null,
+      header_stored_through: headers["x-mcr-stored-through"] ?? null,
+      parsed: mcrFromHeaders,
+    });
+
+    // M1.b ref-recovery observability (issue #3371 follow-up). The
+    // gateway's recent_ref_expansion path runs on the gateway->backend
+    // payload, which is invisible to Pi — without these headers in the
+    // log we can't tell "ref-recovery ran with no refs available" apart
+    // from "ref-recovery never ran". Counts only, no preview/sha values.
+    // Server-side emission: API_Gateway/app/services/mcr_anthropic_native_proxy.py::_add_mcr_refs_headers
+    const refsRecovered = parseOptionalIntHeader(headers, "x-mcr-refs-recovered");
+    const refsInForward = parseOptionalIntHeader(headers, "x-mcr-refs-in-forward");
+    const refsSkippedBudget = parseOptionalIntHeader(
+      headers,
+      "x-mcr-refs-skipped-budget",
+    );
+    const refsSkippedMissing = parseOptionalIntHeader(
+      headers,
+      "x-mcr-refs-skipped-missing",
+    );
+    const recoveryTokensAdded = parseOptionalIntHeader(
+      headers,
+      "x-mcr-recovery-tokens-added",
+    );
+    const manifestEntries = parseOptionalIntHeader(
+      headers,
+      "x-mcr-manifest-entries",
+    );
+
+    if (
+      refsRecovered !== null ||
+      refsInForward !== null ||
+      refsSkippedBudget !== null ||
+      refsSkippedMissing !== null ||
+      recoveryTokensAdded !== null ||
+      manifestEntries !== null
+    ) {
+      nwlog("mcr_refs", {
+        refs_recovered: refsRecovered,
+        refs_in_forward: refsInForward,
+        refs_skipped_budget: refsSkippedBudget,
+        refs_skipped_missing: refsSkippedMissing,
+        recovery_tokens_added: recoveryTokensAdded,
+        manifest_entries: manifestEntries,
+      });
+    }
+
+    if (mcrFromHeaders) {
+      state.sessionFp = mcrFromHeaders.session_fp;
+      state.safeDropBefore = mcrFromHeaders.safe_drop_before;
+      state.storedThrough = mcrFromHeaders.stored_through;
+      state.lastMcrMeta = mcrFromHeaders;
+    }
+
+    updateStatusBar(ctx);
+  });
+
+  // Clear the in-flight indicator on the first message_update for an MCR
+  // model — at that point real model tokens are flowing, the "is it hung?"
+  // perception is gone, and the chip should revert to the standard
+  // MCR-fingerprint view. The MessageUpdateEvent fires for any assistant
+  // streaming update (text/thinking/toolcall deltas); we don't need to narrow
+  // further since any of these prove the wait is over.
+  pi.on("message_update", async (event, ctx) => {
+    if (event.message.role !== "assistant") return;
+    if (!isMCRModel(ctx.model?.id || "")) return;
+    markStreamProducing(ctx);
+  });
+
+  pi.on("message_end", async (event, ctx) => {
+    if (event.message.role !== "assistant") return;
+    if (!isMCRModel(ctx.model?.id || "")) return;
+
+    // Backstop — if a response was short enough that no message_update ever
+    // fired, message_end is the latest possible chance to clear the indicator
+    // before it gets stale.
+    markStreamProducing(ctx);
+
+    const msg = event.message as Record<string, unknown>;
+
+    const mcrFromBody = extractMCRFromBody(msg);
+    if (mcrFromBody) {
+      nwlog("message_end_mcr_body", { parsed: mcrFromBody });
+      state.sessionFp = mcrFromBody.session_fp;
+      state.safeDropBefore = mcrFromBody.safe_drop_before;
+      state.storedThrough = mcrFromBody.stored_through;
+      state.lastMcrMeta = mcrFromBody;
+    }
+
+    const energy = extractEnergyFromBody(msg);
+    if (energy) {
+      nwlog("message_end_energy", {
+        energy_joules: energy.energy_joules,
+        cumulative_joules: state.totalEnergyJoules + energy.energy_joules,
+        session_turns: energy.mcr?.session_turns,
+        context_tokens: energy.mcr?.context_tokens,
+        compaction_triggered: energy.mcr?.compaction_triggered,
+        apc_hit_rate: energy.mcr?.apc_hit_rate,
+        mcr_compacted_tokens: energy.mcr?.mcr_compacted_tokens,
+        mcr_original_tokens: energy.mcr?.mcr_original_tokens,
+      });
+      state.totalEnergyJoules += energy.energy_joules;
+      state.lastEnergy = energy;
+      pi.appendEntry("neuralwatt-energy", { energy_joules: energy.energy_joules });
+      if (energy.mcr) {
+        state.sessionTurns = energy.mcr.session_turns;
+        state.contextTokens = energy.mcr.context_tokens;
+      }
+    }
+
+    updateStatusBar(ctx);
+  });
+
+  pi.on("context", async (event, ctx) => {
+    const modelId = ctx.model?.id || "";
+    const numMsgs = event.messages.length;
+
+    if (!state.sessionFp) {
+      nwlog("context_skip", {
+        reason: "no_session_fp",
+        model: modelId,
+        num_msgs: numMsgs,
+      });
+      return;
+    }
+    if (state.safeDropBefore <= 0) {
+      nwlog("context_skip", {
+        reason: "safe_drop_before_zero",
+        model: modelId,
+        num_msgs: numMsgs,
+        safe_drop_before: state.safeDropBefore,
+        session_fp: state.sessionFp,
+      });
+      return;
+    }
+    if (!isMCRModel(modelId)) {
+      nwlog("context_skip", {
+        reason: "not_mcr_model",
+        model: modelId,
+        num_msgs: numMsgs,
+      });
+      return;
+    }
+
+    const [dropStart, dropEnd] = computeDropRange(
+      event.messages as Array<{ type: string }>,
+      state.safeDropBefore,
+    );
+
+    if (dropEnd <= dropStart) {
+      nwlog("context_no_drop", {
+        reason: "empty_range",
+        drop_start: dropStart,
+        drop_end: dropEnd,
+        safe_drop_before: state.safeDropBefore,
+        num_msgs: numMsgs,
+        session_fp: state.sessionFp,
+      });
+      return;
+    }
+
+    // Drop messages in the full-index range [dropStart, dropEnd). Both
+    // bounds are in the same indexing space as `event.messages` and as the
+    // server's `safe_drop_before` (every message counted, all roles). The
+    // earlier version maintained a separate user+assistant-subset counter
+    // and compared it against full-index bounds, which silently nuked the
+    // most-recent user prompts (#bug discovered 2026-05-16 — see commit
+    // message for the trace).
+    const clampedEnd = Math.min(dropEnd, event.messages.length);
+    const filtered = event.messages.filter(
+      (_: unknown, i: number) => i < dropStart || i >= clampedEnd,
+    );
+    const droppedCount = numMsgs - filtered.length;
+
+    if (droppedCount === 0) {
+      nwlog("context_no_drop", {
+        reason: "no_indices_matched",
+        drop_start: dropStart,
+        drop_end: clampedEnd,
+        safe_drop_before: state.safeDropBefore,
+        num_msgs: numMsgs,
+        session_fp: state.sessionFp,
+      });
+      return;
+    }
+
+    nwlog("context_drop", {
+      drop_start: dropStart,
+      drop_end: clampedEnd,
+      safe_drop_before: state.safeDropBefore,
+      num_msgs_before: numMsgs,
+      num_msgs_after: filtered.length,
+      dropped: droppedCount,
+      session_fp: state.sessionFp,
+    });
+
+    return { messages: filtered };
+  });
+
+  pi.on("before_provider_request", async (event, ctx) => {
+    if (!isMCRModel(ctx.model?.id || "")) return;
+
+    const payload = event.payload as Record<string, unknown>;
+
+    // Start the in-flight indicator. It stays silent for IN_FLIGHT_GRACE_MS;
+    // only on a genuinely long wait does it surface a neutral
+    // "working… 12s" so the chat UI isn't indistinguishable from a hung
+    // model. It deliberately does NOT claim "optimizing context" — the
+    // extension can't observe whether compaction is happening (see
+    // ``markRequestSent`` for the Pi-API limitation that blocks the real
+    // phase signal).
+    markRequestSent(ctx);
+
+    // Upgrade the X_NW_CONVERSATION_ID env var (seeded with a UUID at
+    // extension load) to Pi's stable per-invocation session id. The SDK
+    // re-reads process.env on every stream, so this propagates to the
+    // next outbound request as the real X-NW-Conversation-ID HTTP header
+    // — no body touch. See the header-wiring block at the top of this fn
+    // and pi-header-surface.md for the full mechanism trace.
+    const { id: conversationId, source: conversationIdSource } =
+      resolveConversationId(ctx);
+    process.env[CONV_ID_ENV] = conversationId;
+    nwlog("conversation_id_attached", {
+      conversation_id_prefix: conversationId.slice(0, 8),
+      source: conversationIdSource,
+    });
+
+    // X-MCR-Session-FP fast-path hint: previously set via the same
+    // body-only mechanism that never reached the wire. Currently a no-op;
+    // diagnostic log retained so we still see when the gateway has
+    // assigned an fp. Revisit via registerProvider if we want it back.
+    if (state.sessionFp) {
+      nwlog("before_provider_request_fp_set", {
+        session_fp: state.sessionFp,
+        safe_drop_before: state.safeDropBefore,
+      });
+    }
+
+    // DEBUG (#3323 follow-up): capture the actual outbound payload shape
+    // so we can diagnose why the deployed pin-final-user-turn fix isn't
+    // firing for real Pi sessions. Logs shape only — no message content.
+    try {
+      const msgs = (payload.body as { messages?: Array<Record<string, unknown>> } | undefined)?.messages
+        ?? (payload.messages as Array<Record<string, unknown>> | undefined)
+        ?? [];
+      const roles = msgs.map((m) => String(m.role ?? m.type ?? "?"));
+      const lastN = msgs.slice(-5).map((m) => {
+        const role = String(m.role ?? m.type ?? "?");
+        const contentField = m.content;
+        let contentKind: string;
+        let contentLen: number;
+        if (typeof contentField === "string") {
+          contentKind = "string";
+          contentLen = contentField.length;
+        } else if (Array.isArray(contentField)) {
+          contentKind = "array[" + contentField.length + "]";
+          contentLen = contentField.reduce((acc, b) => acc + JSON.stringify(b).length, 0);
+        } else if (contentField === null || contentField === undefined) {
+          contentKind = "null";
+          contentLen = 0;
+        } else {
+          contentKind = typeof contentField;
+          contentLen = JSON.stringify(contentField).length;
+        }
+        const blockTypes = Array.isArray(contentField)
+          ? (contentField as Array<Record<string, unknown>>).map((b) => String(b.type ?? "?"))
+          : null;
+        return { role, contentKind, contentLen, blockTypes, hasToolCallId: "tool_call_id" in m };
+      });
+      const roleCounts: Record<string, number> = {};
+      for (const r of roles) roleCounts[r] = (roleCounts[r] || 0) + 1;
+      nwlog("outbound_payload_shape", {
+        session_fp: state.sessionFp,
+        num_messages: msgs.length,
+        role_distribution: roleCounts,
+        last_5_messages: lastN,
+        stream: Boolean((payload.body as { stream?: boolean } | undefined)?.stream ?? payload.stream),
+        payload_keys: Object.keys(payload),
+      });
+    } catch (err) {
+      nwlog("outbound_payload_shape_error", {
+        session_fp: state.sessionFp,
+        error: err instanceof Error ? err.message : String(err),
+      });
+    }
+  });
+
+  pi.on("session_before_compact", async (_event, ctx) => {
+    if (!isMCRModel(ctx.model?.id || "")) return;
+    if (state.sessionFp) {
+      nwlog("compaction_cancelled", { session_fp: state.sessionFp });
+      return { cancel: true };
+    }
+  });
+
+  pi.on("session_start", async (_event, ctx) => {
+    nwlog("session_start", { extension_version: EXTENSION_VERSION });
+    state.sessionFp = null;
+    state.safeDropBefore = 0;
+    state.storedThrough = 0;
+    state.totalEnergyJoules = 0;
+    state.sessionTurns = 0;
+    state.contextTokens = 0;
+    state.lastMcrMeta = null;
+    state.lastEnergy = null;
+    // Clear any in-flight indicator left over from a prior
+    // session (defensive — session_start would normally fire after any
+    // active request has completed, but a forced restart or fork can land
+    // here while inFlightSince is still set).
+    state.inFlightSince = null;
+    if (state.inFlightTickerHandle !== null) {
+      clearInterval(state.inFlightTickerHandle);
+      state.inFlightTickerHandle = null;
+    }
+    // Reset the UUID fallback so a new Pi session gets a fresh conversation
+    // id when getSessionId() isn't usable. The Pi session id itself rotates
+    // on its own.
+    uuidFallback = null;
+    ctx.ui.setStatus(MCR_STATUS_KEY, "");
+    ctx.ui.setStatus(ENERGY_STATUS_KEY, "");
+  });
+
+  pi.on("session_tree", async (_event, ctx) => {
+    // Branch navigation invalidates MCR session state — sessionFp and
+    // safeDropBefore are tied to a specific message sequence that no
+    // longer matches the new branch. Clear everything and let the
+    // next server response repopulate. Energy is replayed from the
+    // session log (same as the main provider's session_tree handler).
+    state.sessionFp = null;
+    state.safeDropBefore = 0;
+    state.storedThrough = 0;
+    state.totalEnergyJoules = 0;
+    state.sessionTurns = 0;
+    state.contextTokens = 0;
+    state.lastMcrMeta = null;
+    state.lastEnergy = null;
+
+    // Replay energy events from the session log for the new branch.
+    for (const entry of ctx.sessionManager.getBranch()) {
+      if (
+        entry.type === "custom" &&
+        entry.customType === "neuralwatt-energy" &&
+        typeof entry.data === "object" &&
+        entry.data
+      ) {
+        state.totalEnergyJoules += (entry.data as { energy_joules: number }).energy_joules || 0;
+      }
+    }
+
+    nwlog("session_tree", { total_energy_replayed: state.totalEnergyJoules });
+    updateStatusBar(ctx);
+  });
+
+  pi.on("session_shutdown", async (_event, ctx) => {
+    nwlog("session_shutdown", {
+      final_session_fp: state.sessionFp,
+      total_energy_joules: state.totalEnergyJoules,
+      session_turns: state.sessionTurns,
+    });
+    // Tear down the in-flight ticker so the interval handle
+    // doesn't outlive the session.
+    state.inFlightSince = null;
+    if (state.inFlightTickerHandle !== null) {
+      clearInterval(state.inFlightTickerHandle);
+      state.inFlightTickerHandle = null;
+    }
+    ctx.ui.setStatus(MCR_STATUS_KEY, "");
+    ctx.ui.setStatus(ENERGY_STATUS_KEY, "");
+  });
+}

dot_config/pi/models.json 🔗

@@ -0,0 +1,48 @@
+{
+  "providers": {
+    "neuralwatt": {
+      "baseUrl": "https://api.neuralwatt.com/v1",
+      "api": "openai-completions",
+      "apiKey": "$NEURALWATT_API_KEY",
+      "compat": {
+        "supportsDeveloperRole": false,
+        "supportsReasoningEffort": false,
+        "maxTokensField": "max_tokens"
+      },
+        {
+          "id": "neuralwatt/glm-5.1-long",
+          "name": "GLM-5.1 Long (MCR 1M)",
+          "reasoning": true,
+          "input": [
+            "text"
+          ],
+          "contextWindow": 1048576,
+          "maxTokens": 16384,
+          "cost": {
+            "input": 0,
+            "output": 0,
+            "cacheRead": 0,
+            "cacheWrite": 0
+          }
+        },
+        {
+          "id": "neuralwatt/kimi-k2.6-long",
+          "name": "Kimi K2.6 Long (MCR 1M)",
+          "reasoning": true,
+          "input": [
+            "text",
+            "image"
+          ],
+          "contextWindow": 1048576,
+          "maxTokens": 16384,
+          "cost": {
+            "input": 0,
+            "output": 0,
+            "cacheRead": 0,
+            "cacheWrite": 0
+          }
+        }
+      ]
+    }
+  }
+}

dot_config/pi/settings.json 🔗

@@ -1,8 +1,8 @@
 {
-  "lastChangelogVersion": "0.75.5",
+  "lastChangelogVersion": "0.78.0",
   "defaultThinkingLevel": "high",
-  "defaultProvider": "plexus",
-  "defaultModel": "glm-5.1",
+  "defaultProvider": "neuralwatt",
+  "defaultModel": "neuralwatt/glm-5.1-long",
   "quietStartup": true,
   "autocompleteMaxVisible": 10,
   "steeringMode": "all",