From ea8f291585c7447feda734c02ecf90eff226838f Mon Sep 17 00:00:00 2001
From: Christian Rocha <christian@rocha.is>
Date: Thu, 30 Apr 2026 12:49:11 -0400
Subject: [PATCH] fix(openai): handle media tool results (#221)

This fix applies to media tool results for every downstream provider
that builds on `providers/openai` (Hyper, OpenRouter, Vercel, MiniMax,
Copilot, Azure, `openai-compat`).

Tool results with image or audio content fell through the
MessageRoleTool switch in both the Chat Completions and Responses
paths, leaving a dangling `tool_call` with no matching tool message and
400-ing every subsequent turn. This fix emits a text tool message with
a placeholder so the pairing stays valid, then attaches the media in
a synthetic follow-up user message so vision- and audio-capable models
still see it. It also warns on unknown tool result content types so
new variants can't silently drop messages again.
---
 providers/openai/language_model_hooks.go     |  70 ++++++
 providers/openai/responses_language_model.go |  42 ++++
 providers/openai/tool_result_media_test.go   | 247 +++++++++++++++++++
 3 files changed, 359 insertions(+)
 create mode 100644 providers/openai/tool_result_media_test.go

diff --git a/providers/openai/language_model_hooks.go b/providers/openai/language_model_hooks.go
index e1131bc771f9c34aff8488c45a8102e8694290b8..7da22c9dc5c30658213ec317a7ab740cebc8e571 100644
--- a/providers/openai/language_model_hooks.go
+++ b/providers/openai/language_model_hooks.go
@@ -564,6 +564,41 @@ func DefaultToPrompt(prompt fantasy.Prompt, _, _ string) ([]openai.ChatCompletio
 						continue
 					}
 					messages = append(messages, openai.ToolMessage(output.Error.Error(), toolResultPart.ToolCallID))
+				case fantasy.ToolResultContentTypeMedia:
+					output, ok := fantasy.AsToolResultOutputType[fantasy.ToolResultOutputContentMedia](toolResultPart.Output)
+					if !ok {
+						warnings = append(warnings, fantasy.CallWarning{
+							Type:    fantasy.CallWarningTypeOther,
+							Message: "tool result output does not have the right type",
+						})
+						continue
+					}
+					// OpenAI Chat Completions tool messages cannot carry image
+					// or audio content directly; the SDK's content union only
+					// accepts text. To keep the tool_call/tool_result pairing
+					// valid while still surfacing the media to vision-capable
+					// models, emit a text tool message with a placeholder (or
+					// any accompanying text) and follow it with a synthetic
+					// user message holding the actual media content part.
+					placeholder := output.Text
+					if placeholder == "" {
+						placeholder = fmt.Sprintf("The tool returned %s content; see the following user message.", output.MediaType)
+					}
+					messages = append(messages, openai.ToolMessage(placeholder, toolResultPart.ToolCallID))
+					mediaPart, mediaWarning, emit := toolResultMediaUserPart(output)
+					if mediaWarning != nil {
+						warnings = append(warnings, *mediaWarning)
+					}
+					if emit {
+						messages = append(messages, openai.UserMessage(
+							[]openai.ChatCompletionContentPartUnionParam{mediaPart},
+						))
+					}
+				default:
+					warnings = append(warnings, fantasy.CallWarning{
+						Type:    fantasy.CallWarningTypeOther,
+						Message: fmt.Sprintf("tool result output type %q not supported", toolResultPart.Output.GetType()),
+					})
 				}
 			}
 		}
@@ -571,6 +606,41 @@ func DefaultToPrompt(prompt fantasy.Prompt, _, _ string) ([]openai.ChatCompletio
 	return messages, warnings
 }
 
+// toolResultMediaUserPart converts media returned by a tool into a content
+// part for a Chat Completions user message. Images become data-URL image
+// parts and WAV/MP3 audio becomes input-audio parts; for any other media
+// type it returns a zero part, a warning, and false so the caller skips it.
+func toolResultMediaUserPart(output fantasy.ToolResultOutputContentMedia) (openai.ChatCompletionContentPartUnionParam, *fantasy.CallWarning, bool) {
+	mediaType := output.MediaType
+	if strings.HasPrefix(mediaType, "image/") {
+		// Images are inlined as a base64 data URL.
+		image := openai.ChatCompletionContentPartImageParam{
+			ImageURL: openai.ChatCompletionContentPartImageImageURLParam{
+				URL: "data:" + mediaType + ";base64," + output.Data,
+			},
+		}
+		return openai.ChatCompletionContentPartUnionParam{OfImageURL: &image}, nil, true
+	}
+
+	var audioFormat string
+	switch mediaType {
+	case "audio/wav":
+		audioFormat = "wav"
+	case "audio/mpeg", "audio/mp3":
+		audioFormat = "mp3"
+	default:
+		unsupported := fantasy.CallWarning{
+			Type:    fantasy.CallWarningTypeOther,
+			Message: fmt.Sprintf("tool result media type %s not supported, sending text placeholder only", mediaType),
+		}
+		return openai.ChatCompletionContentPartUnionParam{}, &unsupported, false
+	}
+	audio := openai.ChatCompletionContentPartInputAudioParam{
+		InputAudio: openai.ChatCompletionContentPartInputAudioInputAudioParam{Data: output.Data, Format: audioFormat},
+	}
+	return openai.ChatCompletionContentPartUnionParam{OfInputAudio: &audio}, nil, true
+}
+
 func hasVisibleUserContent(content []openai.ChatCompletionContentPartUnionParam) bool {
 	for _, part := range content {
 		if part.OfText != nil || part.OfImageURL != nil || part.OfInputAudio != nil || part.OfFile != nil {
diff --git a/providers/openai/responses_language_model.go b/providers/openai/responses_language_model.go
index bd61a68ba0f2d2608c1c6512ada5ade2dae92c14..06da0aa6e3d869fffe618bf0cd61c30ea5fa35e8 100644
--- a/providers/openai/responses_language_model.go
+++ b/providers/openai/responses_language_model.go
@@ -611,6 +611,7 @@ func toResponsesPrompt(prompt fantasy.Prompt, systemMessageMode string, store bo
 				}
 
 				var outputStr string
+				var followupParts responses.ResponseInputMessageContentListParam
 
 				switch toolResultPart.Output.GetType() {
 				case fantasy.ToolResultContentTypeText:
@@ -633,9 +634,50 @@ func toResponsesPrompt(prompt fantasy.Prompt, systemMessageMode string, store bo
 						continue
 					}
 					outputStr = output.Error.Error()
+				case fantasy.ToolResultContentTypeMedia:
+					output, ok := fantasy.AsToolResultOutputType[fantasy.ToolResultOutputContentMedia](toolResultPart.Output)
+					if !ok {
+						warnings = append(warnings, fantasy.CallWarning{
+							Type:    fantasy.CallWarningTypeOther,
+							Message: "tool result output does not have the right type",
+						})
+						continue
+					}
+					// The Responses API function_call_output only accepts a
+					// string. Emit a text placeholder (preserving any
+					// accompanying text) so the tool_call/tool_result pairing
+					// stays valid, then attach the media as a synthetic user
+					// input_image so vision-capable models still receive it.
+					outputStr = output.Text
+					if outputStr == "" {
+						outputStr = fmt.Sprintf("The tool returned %s content; see the following user message.", output.MediaType)
+					}
+					if strings.HasPrefix(output.MediaType, "image/") {
+						imageURL := fmt.Sprintf("data:%s;base64,%s", output.MediaType, output.Data)
+						followupParts = append(followupParts, responses.ResponseInputContentUnionParam{
+							OfInputImage: &responses.ResponseInputImageParam{
+								Type:     "input_image",
+								ImageURL: param.NewOpt(imageURL),
+							},
+						})
+					} else {
+						warnings = append(warnings, fantasy.CallWarning{
+							Type:    fantasy.CallWarningTypeOther,
+							Message: fmt.Sprintf("tool result media type %s not supported, sending text placeholder only", output.MediaType),
+						})
+					}
+				default:
+					warnings = append(warnings, fantasy.CallWarning{
+						Type:    fantasy.CallWarningTypeOther,
+						Message: fmt.Sprintf("tool result output type %q not supported", toolResultPart.Output.GetType()),
+					})
+					continue
 				}
 
 				input = append(input, responses.ResponseInputItemParamOfFunctionCallOutput(toolResultPart.ToolCallID, outputStr))
+				if len(followupParts) > 0 {
+					input = append(input, responses.ResponseInputItemParamOfMessage(followupParts, responses.EasyInputMessageRoleUser))
+				}
 			}
 		}
 	}
diff --git a/providers/openai/tool_result_media_test.go b/providers/openai/tool_result_media_test.go
new file mode 100644
index 0000000000000000000000000000000000000000..cc7424cf024141847d5aca19f35f05118dbc1db8
--- /dev/null
+++ b/providers/openai/tool_result_media_test.go
@@ -0,0 +1,247 @@
+package openai
+
+import (
+	"encoding/base64"
+	"testing"
+
+	"charm.land/fantasy"
+	"github.com/stretchr/testify/require"
+)
+
+// Tool messages in the OpenAI Chat Completions API cannot carry image or audio
+// content directly — the SDK's content union only accepts text. When a tool
+// returns media, DefaultToPrompt must still emit a text tool message so the
+// tool_call/tool_result pairing stays valid, and attach the media to a
+// synthetic follow-up user message so vision- and audio-capable models can see
+// it.
+
+func TestDefaultToPrompt_MediaToolResult_ImagePNG(t *testing.T) {
+	t.Parallel()
+	// Image result without Text: placeholder tool message, then a user message with a data URL.
+	imageData := base64.StdEncoding.EncodeToString([]byte{0, 1, 2, 3})
+	prompt := fantasy.Prompt{
+		{
+			Role: fantasy.MessageRoleAssistant,
+			Content: []fantasy.MessagePart{
+				fantasy.ToolCallPart{ToolCallID: "img-1", ToolName: "view", Input: "{}"},
+			},
+		},
+		{
+			Role: fantasy.MessageRoleTool,
+			Content: []fantasy.MessagePart{
+				fantasy.ToolResultPart{
+					ToolCallID: "img-1",
+					Output: fantasy.ToolResultOutputContentMedia{
+						Data:      imageData,
+						MediaType: "image/png",
+					},
+				},
+			},
+		},
+	}
+
+	messages, warnings := DefaultToPrompt(prompt, "openrouter", "anthropic/claude-opus-4.7")
+
+	require.Empty(t, warnings)
+	// Assistant tool call + text tool message + synthetic user image message.
+	require.Len(t, messages, 3)
+
+	toolMsg := messages[1].OfTool
+	require.NotNil(t, toolMsg)
+	require.Equal(t, "img-1", toolMsg.ToolCallID)
+	require.Contains(t, toolMsg.Content.OfString.Value, "image/png")
+
+	userMsg := messages[2].OfUser
+	require.NotNil(t, userMsg)
+	require.Len(t, userMsg.Content.OfArrayOfContentParts, 1)
+	imagePart := userMsg.Content.OfArrayOfContentParts[0].OfImageURL
+	require.NotNil(t, imagePart)
+	require.Equal(t, "data:image/png;base64,"+imageData, imagePart.ImageURL.URL)
+}
+
+func TestDefaultToPrompt_MediaToolResult_PrefersAccompanyingText(t *testing.T) {
+	t.Parallel()
+	// A non-empty Text on the media output is used verbatim as the tool message content.
+	imageData := base64.StdEncoding.EncodeToString([]byte{9, 9, 9})
+	prompt := fantasy.Prompt{
+		{
+			Role: fantasy.MessageRoleAssistant,
+			Content: []fantasy.MessagePart{
+				fantasy.ToolCallPart{ToolCallID: "img-2", ToolName: "view", Input: "{}"},
+			},
+		},
+		{
+			Role: fantasy.MessageRoleTool,
+			Content: []fantasy.MessagePart{
+				fantasy.ToolResultPart{
+					ToolCallID: "img-2",
+					Output: fantasy.ToolResultOutputContentMedia{
+						Data:      imageData,
+						MediaType: "image/jpeg",
+						Text:      "Screenshot of the blockquote element.",
+					},
+				},
+			},
+		},
+	}
+
+	messages, warnings := DefaultToPrompt(prompt, "openrouter", "anthropic/claude-opus-4.7")
+
+	require.Empty(t, warnings)
+	require.Len(t, messages, 3)
+	require.Equal(t, "Screenshot of the blockquote element.", messages[1].OfTool.Content.OfString.Value)
+}
+
+func TestDefaultToPrompt_MediaToolResult_AudioWAV(t *testing.T) {
+	t.Parallel()
+	// An audio/wav result is forwarded as a user input-audio part with Format "wav".
+	audio := base64.StdEncoding.EncodeToString([]byte("fake-wav-bytes"))
+	prompt := fantasy.Prompt{
+		{
+			Role: fantasy.MessageRoleAssistant,
+			Content: []fantasy.MessagePart{
+				fantasy.ToolCallPart{ToolCallID: "audio-1", ToolName: "record", Input: "{}"},
+			},
+		},
+		{
+			Role: fantasy.MessageRoleTool,
+			Content: []fantasy.MessagePart{
+				fantasy.ToolResultPart{
+					ToolCallID: "audio-1",
+					Output: fantasy.ToolResultOutputContentMedia{
+						Data:      audio,
+						MediaType: "audio/wav",
+					},
+				},
+			},
+		},
+	}
+
+	messages, warnings := DefaultToPrompt(prompt, "openai", "gpt-4o-audio")
+
+	require.Empty(t, warnings)
+	require.Len(t, messages, 3)
+	require.NotNil(t, messages[1].OfTool)
+	userMsg := messages[2].OfUser
+	require.NotNil(t, userMsg)
+	require.Len(t, userMsg.Content.OfArrayOfContentParts, 1)
+	audioPart := userMsg.Content.OfArrayOfContentParts[0].OfInputAudio
+	require.NotNil(t, audioPart)
+	require.Equal(t, audio, audioPart.InputAudio.Data)
+	require.Equal(t, "wav", audioPart.InputAudio.Format)
+}
+
+func TestDefaultToPrompt_MediaToolResult_UnsupportedMediaType(t *testing.T) {
+	t.Parallel()
+	// video/mp4 is neither an image nor a handled audio type: expect the tool message and one warning.
+	prompt := fantasy.Prompt{
+		{
+			Role: fantasy.MessageRoleAssistant,
+			Content: []fantasy.MessagePart{
+				fantasy.ToolCallPart{ToolCallID: "vid-1", ToolName: "record", Input: "{}"},
+			},
+		},
+		{
+			Role: fantasy.MessageRoleTool,
+			Content: []fantasy.MessagePart{
+				fantasy.ToolResultPart{
+					ToolCallID: "vid-1",
+					Output: fantasy.ToolResultOutputContentMedia{
+						Data:      "AAAA",
+						MediaType: "video/mp4",
+					},
+				},
+			},
+		},
+	}
+
+	messages, warnings := DefaultToPrompt(prompt, "openai", "gpt-5")
+
+	// Assistant tool call + text tool message, but no synthetic user message.
+	require.Len(t, messages, 2)
+	require.NotNil(t, messages[1].OfTool)
+	require.Equal(t, "vid-1", messages[1].OfTool.ToolCallID)
+	require.Len(t, warnings, 1)
+	require.Contains(t, warnings[0].Message, "video/mp4")
+}
+
+func TestToResponsesPrompt_MediaToolResult_ImagePNG(t *testing.T) {
+	t.Parallel()
+	// Responses path: an image result yields a function_call_output plus a user input-image message.
+	imageData := base64.StdEncoding.EncodeToString([]byte{7, 7, 7, 7})
+	prompt := fantasy.Prompt{
+		{
+			Role: fantasy.MessageRoleAssistant,
+			Content: []fantasy.MessagePart{
+				fantasy.ToolCallPart{ToolCallID: "img-resp-1", ToolName: "view", Input: "{}"},
+			},
+		},
+		{
+			Role: fantasy.MessageRoleTool,
+			Content: []fantasy.MessagePart{
+				fantasy.ToolResultPart{
+					ToolCallID: "img-resp-1",
+					Output: fantasy.ToolResultOutputContentMedia{
+						Data:      imageData,
+						MediaType: "image/png",
+					},
+				},
+			},
+		},
+	}
+
+	input, warnings := toResponsesPrompt(prompt, "system", false)
+
+	require.Empty(t, warnings)
+	// Assistant function call + function_call_output + synthetic user image
+	// message.
+	require.Len(t, input, 3)
+
+	funcOut := input[1].OfFunctionCallOutput
+	require.NotNil(t, funcOut)
+	require.Equal(t, "img-resp-1", funcOut.CallID)
+	require.Contains(t, funcOut.Output.OfString.Value, "image/png")
+
+	userMsg := input[2].OfMessage
+	require.NotNil(t, userMsg)
+	parts := userMsg.Content.OfInputItemContentList
+	require.Len(t, parts, 1)
+	imagePart := parts[0].OfInputImage
+	require.NotNil(t, imagePart)
+	require.Equal(t, "data:image/png;base64,"+imageData, imagePart.ImageURL.Value)
+}
+
+func TestToResponsesPrompt_MediaToolResult_UnsupportedMediaType(t *testing.T) {
+	t.Parallel()
+	// Responses path: non-image media keeps the function_call_output and produces one warning.
+	prompt := fantasy.Prompt{
+		{
+			Role: fantasy.MessageRoleAssistant,
+			Content: []fantasy.MessagePart{
+				fantasy.ToolCallPart{ToolCallID: "vid-resp-1", ToolName: "record", Input: "{}"},
+			},
+		},
+		{
+			Role: fantasy.MessageRoleTool,
+			Content: []fantasy.MessagePart{
+				fantasy.ToolResultPart{
+					ToolCallID: "vid-resp-1",
+					Output: fantasy.ToolResultOutputContentMedia{
+						Data:      "AAAA",
+						MediaType: "video/mp4",
+					},
+				},
+			},
+		},
+	}
+
+	input, warnings := toResponsesPrompt(prompt, "system", false)
+
+	// Assistant function call + function_call_output, but no synthetic user
+	// message, since video/mp4 does not have the image/ prefix.
+	require.Len(t, input, 2)
+	require.NotNil(t, input[1].OfFunctionCallOutput)
+	require.Equal(t, "vid-resp-1", input[1].OfFunctionCallOutput.CallID)
+	require.Len(t, warnings, 1)
+	require.Contains(t, warnings[0].Message, "video/mp4")
+}