From dbe958ee8cd95ace900f77d34252fe5ee5c0b379 Mon Sep 17 00:00:00 2001 From: Amolith Date: Sun, 31 May 2026 10:24:13 -0600 Subject: [PATCH] pi: add NW MCR, update config --- dot_config/pi/extensions/neuralwatt-mcr.ts | 877 +++++++++++++++++++++ dot_config/pi/models.json | 48 ++ dot_config/pi/settings.json | 6 +- 3 files changed, 928 insertions(+), 3 deletions(-) create mode 100644 dot_config/pi/extensions/neuralwatt-mcr.ts create mode 100644 dot_config/pi/models.json diff --git a/dot_config/pi/extensions/neuralwatt-mcr.ts b/dot_config/pi/extensions/neuralwatt-mcr.ts new file mode 100644 index 0000000000000000000000000000000000000000..cf883ae1a9b5ba2045e9f386e8c9faac95196664 --- /dev/null +++ b/dot_config/pi/extensions/neuralwatt-mcr.ts @@ -0,0 +1,877 @@ +import type { ExtensionAPI, ExtensionContext } from "@mariozechner/pi-coding-agent"; +import * as fs from "node:fs"; +import * as path from "node:path"; +import * as os from "node:os"; +import { randomUUID } from "node:crypto"; + +// Bump on any user-facing behaviour change. Surfaced in the extension log so a +// session's behaviour can be tied to a specific revision when triaging reports. +// 2.0.0 — honest in-flight chip (tools#33 / inference_frontend#3954): +// neutral "working…" label, silent grace window, tightened MCR gating. +// 2.1.0 — send the extension version on the wire as X-NW-MCR-Ext-Version so +// the gateway can log which client revision served a request (server +// logs previously had no way to tell a user's extension version). +const EXTENSION_VERSION = "2.1.0"; + +const MCR_ANCHOR_USER_MESSAGES = 3; + +const LOG_FILE = path.join( + os.homedir(), + ".pi", + "agent", + "extensions", + "neuralwatt-mcr.log", +); + +function nwlog(event: string, data: Record = {}): void { + try { + const line = + JSON.stringify({ ts: new Date().toISOString(), event, ...data }) + "\n"; + fs.appendFileSync(LOG_FILE, line); + } catch { + // never break the extension on logging failure + } +} + +interface MCRMetadata { + session_fp: string; + stored_through: number; + safe_drop_before: number; +} + +interface EnergyData { + energy_joules: number; + mcr?: { + session_turns: number; + context_tokens: number; + compaction_triggered: boolean; + apc_hit_rate?: number; + mcr_compacted_tokens?: number; + mcr_original_tokens?: number; + }; +} + +interface SessionState { + sessionFp: string | null; + safeDropBefore: number; + storedThrough: number; + totalEnergyJoules: number; + sessionTurns: number; + contextTokens: number; + lastMcrMeta: MCRMetadata | null; + lastEnergy: EnergyData | null; + // In-flight request UX. When an MCR request is sent we record the wall-clock + // send time so the chip can surface a neutral "working…" indicator on long + // waits, and cleared on the first ``message_update`` / ``message_end``. + // + // HONESTY NOTE (tools#33 / inference_frontend#3954): this is a wall-clock + // proxy, NOT a real compaction signal. The extension cannot observe whether + // the gateway is actually compacting — see ``markRequestSent`` for why Pi's + // API doesn't surface the gateway's ``mcr-status`` SSE frames. So the chip + // must NOT claim "optimizing context"; it can only honestly say a request is + // in flight. We also stay silent for the first few seconds so ordinary turns + // (the ~96% with no compaction) show nothing at all. + inFlightSince: number | null; + inFlightTickerHandle: NodeJS.Timeout | null; +} + +const state: SessionState = { + sessionFp: null, + safeDropBefore: 0, + storedThrough: 0, + totalEnergyJoules: 0, + sessionTurns: 0, + contextTokens: 0, + lastMcrMeta: null, + lastEnergy: null, + inFlightSince: null, + inFlightTickerHandle: null, +}; + +// How often to refresh the chip while the in-flight indicator is showing, so +// the elapsed counter advances visibly. 500ms is fast enough to feel live but +// well below the refresh budget of Pi's footer. +const IN_FLIGHT_TICK_MS = 500; + +// Honesty grace window (tools#33 / inference_frontend#3954): suppress the +// in-flight indicator for the first few seconds of every request. Large MCR +// prompts (100k+ tokens) have a naturally long prefill/TTFT — 10-60s — on +// EVERY turn, with no compaction happening on the ~96% that don't compact. +// The old chip labelled that ordinary latency "optimizing context…" for the +// whole window, which read as "MCR is making every prompt slow" and drove +// churn. We can't tell prefill from compaction (no SSE phase signal reaches +// the extension), so the only honest move is: say nothing until the wait is +// long enough that the user genuinely wants reassurance the model isn't hung, +// and even then use a neutral label that doesn't assert MCR is doing work. +const IN_FLIGHT_GRACE_MS = 6000; + +// Recognise ONLY the MCR-backed aliases. The MCR pipeline (server-side +// compaction + 1M virtual context + the X-MCR-* response protocol) runs only +// for the `neuralwatt/…-long` aliases; the base-model and fast/flex IDs route +// straight to the provider with no compaction. +// +// tools#33: the earlier predicate also matched `zai-org/`, `moonshotai/`, +// `glm-5`, and `kimi-k2`, so every fast/flex alias and direct base-model call +// lit the in-flight chip and ran the context-drop / fp handlers — none of which +// apply off-MCR. Nico reported "optimizing context" on non-long models. The +// client always selects the alias (never the forwarded base name), so matching +// the alias shape is sufficient and correct. This narrows every call site +// (chip gating, context-drop, fp-set, compaction-suppression) to MCR-only, +// which is the intended behaviour for all of them. +function isMCRModel(modelId: string): boolean { + return modelId.includes("neuralwatt/") || modelId.endsWith("-long"); +} + +function extractMCRFromHeaders( + headers: Record, +): MCRMetadata | null { + const fp = headers["x-mcr-session-fp"]; + if (!fp) return null; + return { + session_fp: fp, + stored_through: parseInt(headers["x-mcr-stored-through"] || "0", 10), + safe_drop_before: parseInt( + headers["x-mcr-safe-drop-before"] || "0", + 10, + ), + }; +} + +// Parse a header value as an integer if present, else null. We keep null +// distinct from 0 because: +// * absent header -> gateway did not run the M1.b ref-recovery code path +// (older deploy, non-MCR request, or path bailed before emitting) +// * 0 -> ref-recovery ran but had nothing to surface +// This distinction is the whole point of issue #3371 follow-up. +function parseOptionalIntHeader( + headers: Record, + name: string, +): number | null { + const raw = headers[name]; + if (raw === undefined || raw === null || raw === "") return null; + const n = parseInt(raw, 10); + return Number.isFinite(n) ? n : null; +} + +function extractMCRFromBody( + body: Record, +): MCRMetadata | null { + const mcr = body.mcr as Record | undefined; + if (!mcr || typeof mcr !== "object" || typeof mcr.session_fp !== "string") + return null; + return { + session_fp: mcr.session_fp as string, + stored_through: + typeof mcr.stored_through === "number" ? mcr.stored_through : 0, + safe_drop_before: + typeof mcr.safe_drop_before === "number" ? mcr.safe_drop_before : 0, + }; +} + +function extractEnergyFromBody( + body: Record, +): EnergyData | null { + const energy = body.energy as Record | undefined; + if (!energy || typeof energy !== "object") return null; + const result: EnergyData = { + energy_joules: + typeof energy.energy_joules === "number" ? energy.energy_joules : 0, + }; + if (energy.mcr && typeof energy.mcr === "object") { + const m = energy.mcr as Record; + result.mcr = { + session_turns: + typeof m.session_turns === "number" ? m.session_turns : 0, + context_tokens: + typeof m.context_tokens === "number" ? m.context_tokens : 0, + compaction_triggered: + typeof m.compaction_triggered === "boolean" + ? m.compaction_triggered + : false, + apc_hit_rate: + typeof m.apc_hit_rate === "number" ? m.apc_hit_rate : undefined, + mcr_compacted_tokens: + typeof m.mcr_compacted_tokens === "number" + ? m.mcr_compacted_tokens + : undefined, + mcr_original_tokens: + typeof m.mcr_original_tokens === "number" + ? m.mcr_original_tokens + : undefined, + }; + } + return result; +} + +function formatEnergy(joules: number): string { + if (joules < 1) return `${(joules * 1000).toFixed(0)}mJ`; + if (joules < 1000) return `${joules.toFixed(1)}J`; + return `${(joules / 1000).toFixed(2)}kJ`; +} + +// Render an in-flight elapsed-time stamp for the chip. Short formats (<10s → +// "1.4s", >=10s → "12s", >=60s → "1m 5s") keep the chip from growing wide +// enough to push other footer widgets off-screen. +function formatElapsed(ms: number): string { + if (ms < 10000) return `${(ms / 1000).toFixed(1)}s`; + const seconds = Math.floor(ms / 1000); + if (seconds < 60) return `${seconds}s`; + const minutes = Math.floor(seconds / 60); + return `${minutes}m ${seconds % 60}s`; +} + +// Extract the role/type marker from an entry. Pi's outbound HTTP payload +// uses OpenAI-shape ``role`` (``user``/``assistant``/``tool``/``system``); +// Pi's internal session-log records sometimes serialize the same field as +// ``type``. Read both so this extension works regardless of which shape +// the agent-runtime hands us via the ``context`` hook event. +function entryRole(entry: { role?: string; type?: string }): string | undefined { + return entry.role ?? entry.type; +} + +// Returns the FULL message index of the Nth user message (matching the +// indexing space of the server's `safe_drop_before`, which counts every +// message — user/assistant/tool/system — in send order). The earlier +// version returned a user+assistant-subset index, which mixed index +// spaces with `safe_drop_before` in `computeDropRange` and caused +// `context_drop` to silently nuke the most-recent user prompts when the +// upper bound exceeded the user+assistant subset size. +function findAnchorFloor( + entries: Array<{ role?: string; type?: string }>, + nAnchors: number, +): number { + let userCount = 0; + for (let i = 0; i < entries.length; i++) { + const role = entryRole(entries[i]); + if (role === "user") { + userCount++; + if (userCount === nAnchors) return i; + } + } + return -1; +} + +// Server-side validation for X-NW-Conversation-ID (mirrors +// services/mcr_v3_session.py::validate_client_conversation_id): +// +// * non-empty, ≤ 256 chars +// * all code points printable (no control / whitespace beyond plain space) +// +// We sanitize here so a malformed Pi session id can never bounce off the +// server as HTTP 400 — the worst case is we fall back to the in-process UUID. +const MAX_CONVERSATION_ID_LEN = 256; + +function isWellFormedConversationId(value: string): boolean { + if (!value || value.length === 0 || value.length > MAX_CONVERSATION_ID_LEN) { + return false; + } + // Reject any control character (matches server's `not c.isprintable()` rule). + // We allow plain ASCII space (0x20) and everything ≥ 0x20 except 0x7F (DEL). + for (let i = 0; i < value.length; i++) { + const code = value.charCodeAt(i); + if (code < 0x20 || code === 0x7f) return false; + } + return true; +} + +// Per-process fallback id. Generated lazily so it stays stable across +// auto-compact within one `pi` invocation, and differs across invocations. +let uuidFallback: string | null = null; +function getUuidFallback(): string { + if (!uuidFallback) { + uuidFallback = randomUUID(); + } + return uuidFallback; +} + +type ConversationIdSource = "pi-session" | "uuid-fallback"; + +function resolveConversationId( + ctx: ExtensionContext, +): { id: string; source: ConversationIdSource } { + // Preferred source: Pi's own session id. Stable across auto-compact in a + // single `pi` session, distinct across sessions, naturally string-form. + try { + const piSessionId = ctx.sessionManager?.getSessionId?.(); + if ( + typeof piSessionId === "string" && + isWellFormedConversationId(piSessionId) + ) { + return { id: piSessionId, source: "pi-session" }; + } + } catch { + // fall through to uuid fallback + } + return { id: getUuidFallback(), source: "uuid-fallback" }; +} + +function computeDropRange( + entries: Array<{ role?: string; type?: string }>, + safeDropBefore: number, +): [number, number] { + if (safeDropBefore <= 0) return [0, 0]; + const anchorIdx = findAnchorFloor(entries, MCR_ANCHOR_USER_MESSAGES); + if (anchorIdx < 0) return [0, 0]; + const dropStart = anchorIdx + 1; + const dropEnd = safeDropBefore; + if (dropEnd <= dropStart) return [0, 0]; + return [dropStart, dropEnd]; +} + +export default function (pi: ExtensionAPI) { + const MCR_STATUS_KEY = "nw-mcr"; + const ENERGY_STATUS_KEY = "nw-energy"; + + // ── Outbound header wiring (X-NW-Conversation-ID, X-NW-MCR-Ext-Version) ── + // pi-coding-agent's `before_provider_request` is a *body* hook — the + // earlier `payload.headers[...]` mutation reached extension memory only, + // never the HTTP wire. The documented per-request header path is + // `pi.registerProvider({ headers })`, whose values are env-var NAMES that + // the SDK re-reads from `process.env` on every stream + // (dist/core/resolve-config-value.js). Net: real HTTP headers, no body + // touch, no APC impact. Both headers below ride this same mechanism. + // + // Boot order: we seed the env var with a UUID at extension load so any + // request fired before the first `before_provider_request` tick still + // carries *some* id. The hook upgrades it to Pi's stable per-invocation + // session id (see resolveConversationId). Subsequent requests in the + // same `pi` invocation reuse the upgraded value — invocation-stable + // session_fp by construction. + const CONV_ID_ENV = "X_NW_CONVERSATION_ID"; + if (!process.env[CONV_ID_ENV]) { + process.env[CONV_ID_ENV] = getUuidFallback(); + } + // X-NW-MCR-Ext-Version (2.1.0): surface the client extension + // version on the wire so the gateway can log which revision served a + // request — server logs previously had no way to tell a user's version. + // Unlike the conversation id, the version is static for the life of the + // process (it never changes at runtime), so we seed it once at load and + // never touch it again — no upgrade-on-hook logic needed. + const EXT_VERSION_ENV = "X_NW_MCR_EXT_VERSION"; + process.env[EXT_VERSION_ENV] = EXTENSION_VERSION; + // `apiKey` mirrors the env-var name from ~/.pi/agent/models.json so the + // partial config doesn't shadow the existing auth. `storeProviderRequestConfig` + // doesn't merge with the models.json-derived entry (different map), so we + // have to re-state any field we need; only apiKey here, since baseUrl/api + // flow through via the override-only branch of applyProviderConfig. + pi.registerProvider("neuralwatt", { + apiKey: "NEURALWATT_API_KEY", + headers: { + "X-NW-Conversation-ID": CONV_ID_ENV, + "X-NW-MCR-Ext-Version": EXT_VERSION_ENV, + }, + }); + + function updateStatusBar(ctx: ExtensionContext) { + // In-flight indicator (tools#33 / inference_frontend#3954). HONESTY RULES: + // + // * The extension cannot observe whether the gateway is compacting (Pi + // doesn't surface the ``mcr-status`` SSE frames — see markRequestSent). + // So we NEVER claim "optimizing context"; we only ever say a request is + // in flight ("working…"). + // * We stay completely silent for the first IN_FLIGHT_GRACE_MS so normal + // turns — including the long-but-uncompacted prefill that dominates real + // usage — show nothing. Only a genuinely long wait surfaces the neutral + // reassurance that the model hasn't hung. + // * Once real model output starts, ``markStreamProducing`` clears + // ``inFlightSince`` and the chip reverts to the standard MCR view. + const inFlightElapsedMs = + state.inFlightSince !== null ? Date.now() - state.inFlightSince : 0; + if ( + state.inFlightSince !== null && + inFlightElapsedMs >= IN_FLIGHT_GRACE_MS + ) { + const fpPrefix = state.sessionFp + ? `MCR ${state.sessionFp.slice(0, 8)} | ` + : "MCR | "; + ctx.ui.setStatus( + MCR_STATUS_KEY, + `${fpPrefix}working… ${formatElapsed(inFlightElapsedMs)}`, + ); + } else if (state.sessionFp) { + const parts: string[] = [`MCR ${state.sessionFp.slice(0, 8)}`]; + if (state.safeDropBefore > 0) { + parts.push(`drop<${state.safeDropBefore}`); + } + ctx.ui.setStatus(MCR_STATUS_KEY, parts.join(" | ")); + } else { + ctx.ui.setStatus(MCR_STATUS_KEY, ""); + } + + if (state.totalEnergyJoules > 0) { + const parts: string[] = [`⚡ ${formatEnergy(state.totalEnergyJoules)}`]; + if (state.lastEnergy?.mcr) { + const m = state.lastEnergy.mcr; + if (m.apc_hit_rate !== undefined) { + parts.push(`APC ${(m.apc_hit_rate * 100).toFixed(0)}%`); + } + if (m.mcr_compacted_tokens && m.mcr_original_tokens) { + const ratio = m.mcr_compacted_tokens / m.mcr_original_tokens; + parts.push(`compact ${(ratio * 100).toFixed(0)}%`); + } + } + ctx.ui.setStatus(ENERGY_STATUS_KEY, parts.join(" | ")); + } else { + ctx.ui.setStatus(ENERGY_STATUS_KEY, ""); + } + } + + // Lifecycle helpers for the in-flight indicator. We bracket the wait window + // with ``markRequestSent`` (on before_provider_request) and + // ``markStreamProducing`` (on the first message_update / message_end that + // carries real model output). The elapsed counter advances via a + // ``setInterval`` that re-calls ``updateStatusBar`` every ~0.5s. + // + // ── WHY THIS IS A NEUTRAL "working…" PROXY, NOT A REAL MCR PHASE SIGNAL ── + // + // The gateway (inference_frontend #3916) emits the ground truth as + // ``event: mcr-status`` SSE frames carrying {phase: compacting | warming | + // idle, elapsed_ms}. The ideal chip would show "optimizing context…" ONLY + // while a ``compacting`` phase is live. That is NOT achievable from a Pi + // extension on the version this targets (Pi v0.72/0.73), verified against + // the published type defs: + // + // 1. The only stream hook is ``message_update``, whose payload is + // ``assistantMessageEvent: AssistantMessageEvent`` (pi-ai types.d.ts). + // That union is closed to text/thinking/toolcall/start/done/error — + // there is no member that can carry a raw ``mcr-status`` frame. + // 2. There is no "raw SSE frame" / "stream event" hook anywhere on + // ``ExtensionAPI`` (pi-coding-agent extensions/types.d.ts): the event + // surface is session/agent/turn/message/tool lifecycle only. + // 3. The neuralwatt provider streams through pi-ai's openai-completions + // handler, which consumes the response via the official ``openai`` SDK + // (``client.chat.completions.create(...).withResponse()``). The SDK's + // decoder yields only typed ``chat.completion.chunk`` objects; any SSE + // frame with ``event: mcr-status`` is dropped by the SDK before pi-ai — + // and therefore the extension — can ever see it. + // + // So the extension genuinely cannot tell prefill from compaction. Claiming + // "optimizing context" would be a lie on the ~96% of turns where nothing is + // compacted (the churn driver in #3954). The honest behaviour is: neutral + // "working…" label, only after a grace window. Path A (real phase-driven + // chip) is blocked on an upstream Pi capability: a hook that surfaces raw + // provider SSE events (or an OpenAI-compat passthrough for unknown event + // types) to extensions. Track that as a Pi feature request; revisit here + // once it ships. + function markRequestSent(ctx: ExtensionContext) { + state.inFlightSince = Date.now(); + if (state.inFlightTickerHandle !== null) { + clearInterval(state.inFlightTickerHandle); + } + state.inFlightTickerHandle = setInterval(() => { + // Defensive: stop ticking if the in-flight flag was cleared + // out-of-band (e.g. session_start reset). + if (state.inFlightSince === null) { + if (state.inFlightTickerHandle !== null) { + clearInterval(state.inFlightTickerHandle); + state.inFlightTickerHandle = null; + } + return; + } + updateStatusBar(ctx); + }, IN_FLIGHT_TICK_MS); + updateStatusBar(ctx); + } + + function markStreamProducing(ctx: ExtensionContext) { + if (state.inFlightSince === null) return; + state.inFlightSince = null; + if (state.inFlightTickerHandle !== null) { + clearInterval(state.inFlightTickerHandle); + state.inFlightTickerHandle = null; + } + updateStatusBar(ctx); + } + + // Upgrade X_NW_CONVERSATION_ID to Pi's stable session-id as early as + // possible — `session_start` fires before any provider request and is the + // earliest hook with ctx. Without this, the first request of each fresh + // `pi -p` invocation would carry the boot-time UUID (different per + // process) and produce a cold-cache session_fp that drags APC averages + // down on `--continue` chains. + pi.on("session_start", async (_event, ctx) => { + const { id: conversationId } = resolveConversationId(ctx); + process.env[CONV_ID_ENV] = conversationId; + }); + + pi.on("after_provider_response", async (event, ctx) => { + const modelId = ctx.model?.id || ""; + if (!isMCRModel(modelId)) return; + + const headers = event.headers as Record; + const mcrFromHeaders = extractMCRFromHeaders(headers); + + nwlog("after_provider_response", { + model: modelId, + header_fp: headers["x-mcr-session-fp"] ?? null, + header_safe_drop_before: headers["x-mcr-safe-drop-before"] ?? null, + header_stored_through: headers["x-mcr-stored-through"] ?? null, + parsed: mcrFromHeaders, + }); + + // M1.b ref-recovery observability (issue #3371 follow-up). The + // gateway's recent_ref_expansion path runs on the gateway->backend + // payload, which is invisible to Pi — without these headers in the + // log we can't tell "ref-recovery ran with no refs available" apart + // from "ref-recovery never ran". Counts only, no preview/sha values. + // Server-side emission: API_Gateway/app/services/mcr_anthropic_native_proxy.py::_add_mcr_refs_headers + const refsRecovered = parseOptionalIntHeader(headers, "x-mcr-refs-recovered"); + const refsInForward = parseOptionalIntHeader(headers, "x-mcr-refs-in-forward"); + const refsSkippedBudget = parseOptionalIntHeader( + headers, + "x-mcr-refs-skipped-budget", + ); + const refsSkippedMissing = parseOptionalIntHeader( + headers, + "x-mcr-refs-skipped-missing", + ); + const recoveryTokensAdded = parseOptionalIntHeader( + headers, + "x-mcr-recovery-tokens-added", + ); + const manifestEntries = parseOptionalIntHeader( + headers, + "x-mcr-manifest-entries", + ); + + if ( + refsRecovered !== null || + refsInForward !== null || + refsSkippedBudget !== null || + refsSkippedMissing !== null || + recoveryTokensAdded !== null || + manifestEntries !== null + ) { + nwlog("mcr_refs", { + refs_recovered: refsRecovered, + refs_in_forward: refsInForward, + refs_skipped_budget: refsSkippedBudget, + refs_skipped_missing: refsSkippedMissing, + recovery_tokens_added: recoveryTokensAdded, + manifest_entries: manifestEntries, + }); + } + + if (mcrFromHeaders) { + state.sessionFp = mcrFromHeaders.session_fp; + state.safeDropBefore = mcrFromHeaders.safe_drop_before; + state.storedThrough = mcrFromHeaders.stored_through; + state.lastMcrMeta = mcrFromHeaders; + } + + updateStatusBar(ctx); + }); + + // Clear the in-flight indicator on the first message_update for an MCR + // model — at that point real model tokens are flowing, the "is it hung?" + // perception is gone, and the chip should revert to the standard + // MCR-fingerprint view. The MessageUpdateEvent fires for any assistant + // streaming update (text/thinking/toolcall deltas); we don't need to narrow + // further since any of these prove the wait is over. + pi.on("message_update", async (event, ctx) => { + if (event.message.role !== "assistant") return; + if (!isMCRModel(ctx.model?.id || "")) return; + markStreamProducing(ctx); + }); + + pi.on("message_end", async (event, ctx) => { + if (event.message.role !== "assistant") return; + if (!isMCRModel(ctx.model?.id || "")) return; + + // Backstop — if a response was short enough that no message_update ever + // fired, message_end is the latest possible chance to clear the indicator + // before it gets stale. + markStreamProducing(ctx); + + const msg = event.message as Record; + + const mcrFromBody = extractMCRFromBody(msg); + if (mcrFromBody) { + nwlog("message_end_mcr_body", { parsed: mcrFromBody }); + state.sessionFp = mcrFromBody.session_fp; + state.safeDropBefore = mcrFromBody.safe_drop_before; + state.storedThrough = mcrFromBody.stored_through; + state.lastMcrMeta = mcrFromBody; + } + + const energy = extractEnergyFromBody(msg); + if (energy) { + nwlog("message_end_energy", { + energy_joules: energy.energy_joules, + cumulative_joules: state.totalEnergyJoules + energy.energy_joules, + session_turns: energy.mcr?.session_turns, + context_tokens: energy.mcr?.context_tokens, + compaction_triggered: energy.mcr?.compaction_triggered, + apc_hit_rate: energy.mcr?.apc_hit_rate, + mcr_compacted_tokens: energy.mcr?.mcr_compacted_tokens, + mcr_original_tokens: energy.mcr?.mcr_original_tokens, + }); + state.totalEnergyJoules += energy.energy_joules; + state.lastEnergy = energy; + pi.appendEntry("neuralwatt-energy", { energy_joules: energy.energy_joules }); + if (energy.mcr) { + state.sessionTurns = energy.mcr.session_turns; + state.contextTokens = energy.mcr.context_tokens; + } + } + + updateStatusBar(ctx); + }); + + pi.on("context", async (event, ctx) => { + const modelId = ctx.model?.id || ""; + const numMsgs = event.messages.length; + + if (!state.sessionFp) { + nwlog("context_skip", { + reason: "no_session_fp", + model: modelId, + num_msgs: numMsgs, + }); + return; + } + if (state.safeDropBefore <= 0) { + nwlog("context_skip", { + reason: "safe_drop_before_zero", + model: modelId, + num_msgs: numMsgs, + safe_drop_before: state.safeDropBefore, + session_fp: state.sessionFp, + }); + return; + } + if (!isMCRModel(modelId)) { + nwlog("context_skip", { + reason: "not_mcr_model", + model: modelId, + num_msgs: numMsgs, + }); + return; + } + + const [dropStart, dropEnd] = computeDropRange( + event.messages as Array<{ type: string }>, + state.safeDropBefore, + ); + + if (dropEnd <= dropStart) { + nwlog("context_no_drop", { + reason: "empty_range", + drop_start: dropStart, + drop_end: dropEnd, + safe_drop_before: state.safeDropBefore, + num_msgs: numMsgs, + session_fp: state.sessionFp, + }); + return; + } + + // Drop messages in the full-index range [dropStart, dropEnd). Both + // bounds are in the same indexing space as `event.messages` and as the + // server's `safe_drop_before` (every message counted, all roles). The + // earlier version maintained a separate user+assistant-subset counter + // and compared it against full-index bounds, which silently nuked the + // most-recent user prompts (#bug discovered 2026-05-16 — see commit + // message for the trace). + const clampedEnd = Math.min(dropEnd, event.messages.length); + const filtered = event.messages.filter( + (_: unknown, i: number) => i < dropStart || i >= clampedEnd, + ); + const droppedCount = numMsgs - filtered.length; + + if (droppedCount === 0) { + nwlog("context_no_drop", { + reason: "no_indices_matched", + drop_start: dropStart, + drop_end: clampedEnd, + safe_drop_before: state.safeDropBefore, + num_msgs: numMsgs, + session_fp: state.sessionFp, + }); + return; + } + + nwlog("context_drop", { + drop_start: dropStart, + drop_end: clampedEnd, + safe_drop_before: state.safeDropBefore, + num_msgs_before: numMsgs, + num_msgs_after: filtered.length, + dropped: droppedCount, + session_fp: state.sessionFp, + }); + + return { messages: filtered }; + }); + + pi.on("before_provider_request", async (event, ctx) => { + if (!isMCRModel(ctx.model?.id || "")) return; + + const payload = event.payload as Record; + + // Start the in-flight indicator. It stays silent for IN_FLIGHT_GRACE_MS; + // only on a genuinely long wait does it surface a neutral + // "working… 12s" so the chat UI isn't indistinguishable from a hung + // model. It deliberately does NOT claim "optimizing context" — the + // extension can't observe whether compaction is happening (see + // ``markRequestSent`` for the Pi-API limitation that blocks the real + // phase signal). + markRequestSent(ctx); + + // Upgrade the X_NW_CONVERSATION_ID env var (seeded with a UUID at + // extension load) to Pi's stable per-invocation session id. The SDK + // re-reads process.env on every stream, so this propagates to the + // next outbound request as the real X-NW-Conversation-ID HTTP header + // — no body touch. See the header-wiring block at the top of this fn + // and pi-header-surface.md for the full mechanism trace. + const { id: conversationId, source: conversationIdSource } = + resolveConversationId(ctx); + process.env[CONV_ID_ENV] = conversationId; + nwlog("conversation_id_attached", { + conversation_id_prefix: conversationId.slice(0, 8), + source: conversationIdSource, + }); + + // X-MCR-Session-FP fast-path hint: previously set via the same + // body-only mechanism that never reached the wire. Currently a no-op; + // diagnostic log retained so we still see when the gateway has + // assigned an fp. Revisit via registerProvider if we want it back. + if (state.sessionFp) { + nwlog("before_provider_request_fp_set", { + session_fp: state.sessionFp, + safe_drop_before: state.safeDropBefore, + }); + } + + // DEBUG (#3323 follow-up): capture the actual outbound payload shape + // so we can diagnose why the deployed pin-final-user-turn fix isn't + // firing for real Pi sessions. Logs shape only — no message content. + try { + const msgs = (payload.body as { messages?: Array> } | undefined)?.messages + ?? (payload.messages as Array> | undefined) + ?? []; + const roles = msgs.map((m) => String(m.role ?? m.type ?? "?")); + const lastN = msgs.slice(-5).map((m) => { + const role = String(m.role ?? m.type ?? "?"); + const contentField = m.content; + let contentKind: string; + let contentLen: number; + if (typeof contentField === "string") { + contentKind = "string"; + contentLen = contentField.length; + } else if (Array.isArray(contentField)) { + contentKind = "array[" + contentField.length + "]"; + contentLen = contentField.reduce((acc, b) => acc + JSON.stringify(b).length, 0); + } else if (contentField === null || contentField === undefined) { + contentKind = "null"; + contentLen = 0; + } else { + contentKind = typeof contentField; + contentLen = JSON.stringify(contentField).length; + } + const blockTypes = Array.isArray(contentField) + ? (contentField as Array>).map((b) => String(b.type ?? "?")) + : null; + return { role, contentKind, contentLen, blockTypes, hasToolCallId: "tool_call_id" in m }; + }); + const roleCounts: Record = {}; + for (const r of roles) roleCounts[r] = (roleCounts[r] || 0) + 1; + nwlog("outbound_payload_shape", { + session_fp: state.sessionFp, + num_messages: msgs.length, + role_distribution: roleCounts, + last_5_messages: lastN, + stream: Boolean((payload.body as { stream?: boolean } | undefined)?.stream ?? payload.stream), + payload_keys: Object.keys(payload), + }); + } catch (err) { + nwlog("outbound_payload_shape_error", { + session_fp: state.sessionFp, + error: err instanceof Error ? err.message : String(err), + }); + } + }); + + pi.on("session_before_compact", async (_event, ctx) => { + if (!isMCRModel(ctx.model?.id || "")) return; + if (state.sessionFp) { + nwlog("compaction_cancelled", { session_fp: state.sessionFp }); + return { cancel: true }; + } + }); + + pi.on("session_start", async (_event, ctx) => { + nwlog("session_start", { extension_version: EXTENSION_VERSION }); + state.sessionFp = null; + state.safeDropBefore = 0; + state.storedThrough = 0; + state.totalEnergyJoules = 0; + state.sessionTurns = 0; + state.contextTokens = 0; + state.lastMcrMeta = null; + state.lastEnergy = null; + // Clear any in-flight indicator left over from a prior + // session (defensive — session_start would normally fire after any + // active request has completed, but a forced restart or fork can land + // here while inFlightSince is still set). + state.inFlightSince = null; + if (state.inFlightTickerHandle !== null) { + clearInterval(state.inFlightTickerHandle); + state.inFlightTickerHandle = null; + } + // Reset the UUID fallback so a new Pi session gets a fresh conversation + // id when getSessionId() isn't usable. The Pi session id itself rotates + // on its own. + uuidFallback = null; + ctx.ui.setStatus(MCR_STATUS_KEY, ""); + ctx.ui.setStatus(ENERGY_STATUS_KEY, ""); + }); + + pi.on("session_tree", async (_event, ctx) => { + // Branch navigation invalidates MCR session state — sessionFp and + // safeDropBefore are tied to a specific message sequence that no + // longer matches the new branch. Clear everything and let the + // next server response repopulate. Energy is replayed from the + // session log (same as the main provider's session_tree handler). + state.sessionFp = null; + state.safeDropBefore = 0; + state.storedThrough = 0; + state.totalEnergyJoules = 0; + state.sessionTurns = 0; + state.contextTokens = 0; + state.lastMcrMeta = null; + state.lastEnergy = null; + + // Replay energy events from the session log for the new branch. + for (const entry of ctx.sessionManager.getBranch()) { + if ( + entry.type === "custom" && + entry.customType === "neuralwatt-energy" && + typeof entry.data === "object" && + entry.data + ) { + state.totalEnergyJoules += (entry.data as { energy_joules: number }).energy_joules || 0; + } + } + + nwlog("session_tree", { total_energy_replayed: state.totalEnergyJoules }); + updateStatusBar(ctx); + }); + + pi.on("session_shutdown", async (_event, ctx) => { + nwlog("session_shutdown", { + final_session_fp: state.sessionFp, + total_energy_joules: state.totalEnergyJoules, + session_turns: state.sessionTurns, + }); + // Tear down the in-flight ticker so the interval handle + // doesn't outlive the session. + state.inFlightSince = null; + if (state.inFlightTickerHandle !== null) { + clearInterval(state.inFlightTickerHandle); + state.inFlightTickerHandle = null; + } + ctx.ui.setStatus(MCR_STATUS_KEY, ""); + ctx.ui.setStatus(ENERGY_STATUS_KEY, ""); + }); +} diff --git a/dot_config/pi/models.json b/dot_config/pi/models.json new file mode 100644 index 0000000000000000000000000000000000000000..a7cdd3ddb1ab9bb83df5abf9472cde6e7bea42f2 --- /dev/null +++ b/dot_config/pi/models.json @@ -0,0 +1,48 @@ +{ + "providers": { + "neuralwatt": { + "baseUrl": "https://api.neuralwatt.com/v1", + "api": "openai-completions", + "apiKey": "$NEURALWATT_API_KEY", + "compat": { + "supportsDeveloperRole": false, + "supportsReasoningEffort": false, + "maxTokensField": "max_tokens" + }, + { + "id": "neuralwatt/glm-5.1-long", + "name": "GLM-5.1 Long (MCR 1M)", + "reasoning": true, + "input": [ + "text" + ], + "contextWindow": 1048576, + "maxTokens": 16384, + "cost": { + "input": 0, + "output": 0, + "cacheRead": 0, + "cacheWrite": 0 + } + }, + { + "id": "neuralwatt/kimi-k2.6-long", + "name": "Kimi K2.6 Long (MCR 1M)", + "reasoning": true, + "input": [ + "text", + "image" + ], + "contextWindow": 1048576, + "maxTokens": 16384, + "cost": { + "input": 0, + "output": 0, + "cacheRead": 0, + "cacheWrite": 0 + } + } + ] + } + } +} diff --git a/dot_config/pi/settings.json b/dot_config/pi/settings.json index fd2a8459c106333d55d5bacf73697073ca0c3dd2..9689cd8fa9806525c6bf0a547bf1a647bbb5fb8d 100644 --- a/dot_config/pi/settings.json +++ b/dot_config/pi/settings.json @@ -1,8 +1,8 @@ { - "lastChangelogVersion": "0.75.5", + "lastChangelogVersion": "0.78.0", "defaultThinkingLevel": "high", - "defaultProvider": "plexus", - "defaultModel": "glm-5.1", + "defaultProvider": "neuralwatt", + "defaultModel": "neuralwatt/glm-5.1-long", "quietStartup": true, "autocompleteMaxVisible": 10, "steeringMode": "all",