From ea8f291585c7447feda734c02ecf90eff226838f Mon Sep 17 00:00:00 2001 From: Christian Rocha Date: Thu, 30 Apr 2026 12:49:11 -0400 Subject: [PATCH] fix(openai): handle media tool results (#221) This fix applies to media tool results for every downstream provider that builds on `providers/openai` (Hyper, OpenRouter, Vercel, MiniMax, Copilot, Azure, `openai-compat`). Tool results with image or audio content fell through the MessageRoleTool switch in both the Chat Completions and Responses paths, leaving a dangling `tool_call` with no matching tool message and causing an HTTP 400 error on every subsequent turn. This fix emits a text tool message with a placeholder so the pairing stays valid, then attaches the media in a synthetic follow-up user message so vision- and audio-capable models still see it. It also warns on unknown tool result content types so new variants can't silently drop messages again. --- providers/openai/language_model_hooks.go | 70 ++++++ providers/openai/responses_language_model.go | 42 ++++ providers/openai/tool_result_media_test.go | 247 +++++++++++++++++++ 3 files changed, 359 insertions(+) create mode 100644 providers/openai/tool_result_media_test.go diff --git a/providers/openai/language_model_hooks.go b/providers/openai/language_model_hooks.go index e1131bc771f9c34aff8488c45a8102e8694290b8..7da22c9dc5c30658213ec317a7ab740cebc8e571 100644 --- a/providers/openai/language_model_hooks.go +++ b/providers/openai/language_model_hooks.go @@ -564,6 +564,41 @@ func DefaultToPrompt(prompt fantasy.Prompt, _, _ string) ([]openai.ChatCompletio continue } messages = append(messages, openai.ToolMessage(output.Error.Error(), toolResultPart.ToolCallID)) + case fantasy.ToolResultContentTypeMedia: + output, ok := fantasy.AsToolResultOutputType[fantasy.ToolResultOutputContentMedia](toolResultPart.Output) + if !ok { + warnings = append(warnings, fantasy.CallWarning{ + Type: fantasy.CallWarningTypeOther, + Message: "tool result output does not have the right type", + }) + continue + 
} + // OpenAI Chat Completions tool messages cannot carry image + // or audio content directly; the SDK's content union only + // accepts text. To keep the tool_call/tool_result pairing + // valid while still surfacing the media to vision-capable + // models, emit a text tool message with a placeholder (or + // any accompanying text) and follow it with a synthetic + // user message holding the actual media content part. + placeholder := output.Text + if placeholder == "" { + placeholder = fmt.Sprintf("The tool returned %s content; see the following user message.", output.MediaType) + } + messages = append(messages, openai.ToolMessage(placeholder, toolResultPart.ToolCallID)) + mediaPart, mediaWarning, emit := toolResultMediaUserPart(output) + if mediaWarning != nil { + warnings = append(warnings, *mediaWarning) + } + if emit { + messages = append(messages, openai.UserMessage( + []openai.ChatCompletionContentPartUnionParam{mediaPart}, + )) + } + default: + warnings = append(warnings, fantasy.CallWarning{ + Type: fantasy.CallWarningTypeOther, + Message: fmt.Sprintf("tool result output type %q not supported", toolResultPart.Output.GetType()), + }) } } } @@ -571,6 +606,41 @@ func DefaultToPrompt(prompt fantasy.Prompt, _, _ string) ([]openai.ChatCompletio return messages, warnings } +// toolResultMediaUserPart maps a tool-result media output to an OpenAI chat +// completions user content part. It returns the content part, an optional +// warning, and whether the caller should emit the returned part. 
+func toolResultMediaUserPart(output fantasy.ToolResultOutputContentMedia) (openai.ChatCompletionContentPartUnionParam, *fantasy.CallWarning, bool) { + switch { + case strings.HasPrefix(output.MediaType, "image/"): + data := "data:" + output.MediaType + ";base64," + output.Data + imageBlock := openai.ChatCompletionContentPartImageParam{ + ImageURL: openai.ChatCompletionContentPartImageImageURLParam{URL: data}, + } + return openai.ChatCompletionContentPartUnionParam{OfImageURL: &imageBlock}, nil, true + case output.MediaType == "audio/wav": + audioBlock := openai.ChatCompletionContentPartInputAudioParam{ + InputAudio: openai.ChatCompletionContentPartInputAudioInputAudioParam{ + Data: output.Data, + Format: "wav", + }, + } + return openai.ChatCompletionContentPartUnionParam{OfInputAudio: &audioBlock}, nil, true + case output.MediaType == "audio/mpeg" || output.MediaType == "audio/mp3": + audioBlock := openai.ChatCompletionContentPartInputAudioParam{ + InputAudio: openai.ChatCompletionContentPartInputAudioInputAudioParam{ + Data: output.Data, + Format: "mp3", + }, + } + return openai.ChatCompletionContentPartUnionParam{OfInputAudio: &audioBlock}, nil, true + default: + return openai.ChatCompletionContentPartUnionParam{}, &fantasy.CallWarning{ + Type: fantasy.CallWarningTypeOther, + Message: fmt.Sprintf("tool result media type %s not supported, sending text placeholder only", output.MediaType), + }, false + } +} + func hasVisibleUserContent(content []openai.ChatCompletionContentPartUnionParam) bool { for _, part := range content { if part.OfText != nil || part.OfImageURL != nil || part.OfInputAudio != nil || part.OfFile != nil { diff --git a/providers/openai/responses_language_model.go b/providers/openai/responses_language_model.go index bd61a68ba0f2d2608c1c6512ada5ade2dae92c14..06da0aa6e3d869fffe618bf0cd61c30ea5fa35e8 100644 --- a/providers/openai/responses_language_model.go +++ b/providers/openai/responses_language_model.go @@ -611,6 +611,7 @@ func 
toResponsesPrompt(prompt fantasy.Prompt, systemMessageMode string, store bo } var outputStr string + var followupParts responses.ResponseInputMessageContentListParam switch toolResultPart.Output.GetType() { case fantasy.ToolResultContentTypeText: @@ -633,9 +634,50 @@ func toResponsesPrompt(prompt fantasy.Prompt, systemMessageMode string, store bo continue } outputStr = output.Error.Error() + case fantasy.ToolResultContentTypeMedia: + output, ok := fantasy.AsToolResultOutputType[fantasy.ToolResultOutputContentMedia](toolResultPart.Output) + if !ok { + warnings = append(warnings, fantasy.CallWarning{ + Type: fantasy.CallWarningTypeOther, + Message: "tool result output does not have the right type", + }) + continue + } + // The Responses API function_call_output only accepts a + // string. Emit a text placeholder (preserving any + // accompanying text) so the tool_call/tool_result pairing + // stays valid, then attach the media as a synthetic user + // input_image so vision-capable models still receive it. 
+ outputStr = output.Text + if outputStr == "" { + outputStr = fmt.Sprintf("The tool returned %s content; see the following user message.", output.MediaType) + } + if strings.HasPrefix(output.MediaType, "image/") { + imageURL := fmt.Sprintf("data:%s;base64,%s", output.MediaType, output.Data) + followupParts = append(followupParts, responses.ResponseInputContentUnionParam{ + OfInputImage: &responses.ResponseInputImageParam{ + Type: "input_image", + ImageURL: param.NewOpt(imageURL), + }, + }) + } else { + warnings = append(warnings, fantasy.CallWarning{ + Type: fantasy.CallWarningTypeOther, + Message: fmt.Sprintf("tool result media type %s not supported, sending text placeholder only", output.MediaType), + }) + } + default: + warnings = append(warnings, fantasy.CallWarning{ + Type: fantasy.CallWarningTypeOther, + Message: fmt.Sprintf("tool result output type %q not supported", toolResultPart.Output.GetType()), + }) + continue } input = append(input, responses.ResponseInputItemParamOfFunctionCallOutput(toolResultPart.ToolCallID, outputStr)) + if len(followupParts) > 0 { + input = append(input, responses.ResponseInputItemParamOfMessage(followupParts, responses.EasyInputMessageRoleUser)) + } } } } diff --git a/providers/openai/tool_result_media_test.go b/providers/openai/tool_result_media_test.go new file mode 100644 index 0000000000000000000000000000000000000000..cc7424cf024141847d5aca19f35f05118dbc1db8 --- /dev/null +++ b/providers/openai/tool_result_media_test.go @@ -0,0 +1,247 @@ +package openai + +import ( + "encoding/base64" + "testing" + + "charm.land/fantasy" + "github.com/stretchr/testify/require" +) + +// Tool messages in the OpenAI Chat Completions API cannot carry image or audio +// content directly — the SDK's content union only accepts text. 
When a tool +// returns media, DefaultToPrompt must still emit a text tool message so the +// tool_call/tool_result pairing stays valid, and attach the media to a +// synthetic follow-up user message so vision- and audio-capable models can see +// it. + +func TestDefaultToPrompt_MediaToolResult_ImagePNG(t *testing.T) { + t.Parallel() + + imageData := base64.StdEncoding.EncodeToString([]byte{0, 1, 2, 3}) + prompt := fantasy.Prompt{ + { + Role: fantasy.MessageRoleAssistant, + Content: []fantasy.MessagePart{ + fantasy.ToolCallPart{ToolCallID: "img-1", ToolName: "view", Input: "{}"}, + }, + }, + { + Role: fantasy.MessageRoleTool, + Content: []fantasy.MessagePart{ + fantasy.ToolResultPart{ + ToolCallID: "img-1", + Output: fantasy.ToolResultOutputContentMedia{ + Data: imageData, + MediaType: "image/png", + }, + }, + }, + }, + } + + messages, warnings := DefaultToPrompt(prompt, "openrouter", "anthropic/claude-opus-4.7") + + require.Empty(t, warnings) + // Assistant tool call + text tool message + synthetic user image message. 
+ require.Len(t, messages, 3) + + toolMsg := messages[1].OfTool + require.NotNil(t, toolMsg) + require.Equal(t, "img-1", toolMsg.ToolCallID) + require.Contains(t, toolMsg.Content.OfString.Value, "image/png") + + userMsg := messages[2].OfUser + require.NotNil(t, userMsg) + require.Len(t, userMsg.Content.OfArrayOfContentParts, 1) + imagePart := userMsg.Content.OfArrayOfContentParts[0].OfImageURL + require.NotNil(t, imagePart) + require.Equal(t, "data:image/png;base64,"+imageData, imagePart.ImageURL.URL) +} + +func TestDefaultToPrompt_MediaToolResult_PrefersAccompanyingText(t *testing.T) { + t.Parallel() + + imageData := base64.StdEncoding.EncodeToString([]byte{9, 9, 9}) + prompt := fantasy.Prompt{ + { + Role: fantasy.MessageRoleAssistant, + Content: []fantasy.MessagePart{ + fantasy.ToolCallPart{ToolCallID: "img-2", ToolName: "view", Input: "{}"}, + }, + }, + { + Role: fantasy.MessageRoleTool, + Content: []fantasy.MessagePart{ + fantasy.ToolResultPart{ + ToolCallID: "img-2", + Output: fantasy.ToolResultOutputContentMedia{ + Data: imageData, + MediaType: "image/jpeg", + Text: "Screenshot of the blockquote element.", + }, + }, + }, + }, + } + + messages, warnings := DefaultToPrompt(prompt, "openrouter", "anthropic/claude-opus-4.7") + + require.Empty(t, warnings) + require.Len(t, messages, 3) + require.Equal(t, "Screenshot of the blockquote element.", messages[1].OfTool.Content.OfString.Value) +} + +func TestDefaultToPrompt_MediaToolResult_AudioWAV(t *testing.T) { + t.Parallel() + + audio := base64.StdEncoding.EncodeToString([]byte("fake-wav-bytes")) + prompt := fantasy.Prompt{ + { + Role: fantasy.MessageRoleAssistant, + Content: []fantasy.MessagePart{ + fantasy.ToolCallPart{ToolCallID: "audio-1", ToolName: "record", Input: "{}"}, + }, + }, + { + Role: fantasy.MessageRoleTool, + Content: []fantasy.MessagePart{ + fantasy.ToolResultPart{ + ToolCallID: "audio-1", + Output: fantasy.ToolResultOutputContentMedia{ + Data: audio, + MediaType: "audio/wav", + }, + }, + }, + }, + } 
+ + messages, warnings := DefaultToPrompt(prompt, "openai", "gpt-4o-audio") + + require.Empty(t, warnings) + require.Len(t, messages, 3) + require.NotNil(t, messages[1].OfTool) + userMsg := messages[2].OfUser + require.NotNil(t, userMsg) + require.Len(t, userMsg.Content.OfArrayOfContentParts, 1) + audioPart := userMsg.Content.OfArrayOfContentParts[0].OfInputAudio + require.NotNil(t, audioPart) + require.Equal(t, audio, audioPart.InputAudio.Data) + require.Equal(t, "wav", audioPart.InputAudio.Format) +} + +func TestDefaultToPrompt_MediaToolResult_UnsupportedMediaType(t *testing.T) { + t.Parallel() + + prompt := fantasy.Prompt{ + { + Role: fantasy.MessageRoleAssistant, + Content: []fantasy.MessagePart{ + fantasy.ToolCallPart{ToolCallID: "vid-1", ToolName: "record", Input: "{}"}, + }, + }, + { + Role: fantasy.MessageRoleTool, + Content: []fantasy.MessagePart{ + fantasy.ToolResultPart{ + ToolCallID: "vid-1", + Output: fantasy.ToolResultOutputContentMedia{ + Data: "AAAA", + MediaType: "video/mp4", + }, + }, + }, + }, + } + + messages, warnings := DefaultToPrompt(prompt, "openai", "gpt-5") + + // Assistant tool call + text tool message, but no synthetic user image. 
+ require.Len(t, messages, 2) + require.NotNil(t, messages[1].OfTool) + require.Equal(t, "vid-1", messages[1].OfTool.ToolCallID) + require.Len(t, warnings, 1) + require.Contains(t, warnings[0].Message, "video/mp4") +} + +func TestToResponsesPrompt_MediaToolResult_ImagePNG(t *testing.T) { + t.Parallel() + + imageData := base64.StdEncoding.EncodeToString([]byte{7, 7, 7, 7}) + prompt := fantasy.Prompt{ + { + Role: fantasy.MessageRoleAssistant, + Content: []fantasy.MessagePart{ + fantasy.ToolCallPart{ToolCallID: "img-resp-1", ToolName: "view", Input: "{}"}, + }, + }, + { + Role: fantasy.MessageRoleTool, + Content: []fantasy.MessagePart{ + fantasy.ToolResultPart{ + ToolCallID: "img-resp-1", + Output: fantasy.ToolResultOutputContentMedia{ + Data: imageData, + MediaType: "image/png", + }, + }, + }, + }, + } + + input, warnings := toResponsesPrompt(prompt, "system", false) + + require.Empty(t, warnings) + // Assistant function call + function_call_output + synthetic user image + // message. + require.Len(t, input, 3) + + funcOut := input[1].OfFunctionCallOutput + require.NotNil(t, funcOut) + require.Equal(t, "img-resp-1", funcOut.CallID) + require.Contains(t, funcOut.Output.OfString.Value, "image/png") + + userMsg := input[2].OfMessage + require.NotNil(t, userMsg) + parts := userMsg.Content.OfInputItemContentList + require.Len(t, parts, 1) + imagePart := parts[0].OfInputImage + require.NotNil(t, imagePart) + require.Equal(t, "data:image/png;base64,"+imageData, imagePart.ImageURL.Value) +} + +func TestToResponsesPrompt_MediaToolResult_UnsupportedMediaType(t *testing.T) { + t.Parallel() + + prompt := fantasy.Prompt{ + { + Role: fantasy.MessageRoleAssistant, + Content: []fantasy.MessagePart{ + fantasy.ToolCallPart{ToolCallID: "vid-resp-1", ToolName: "record", Input: "{}"}, + }, + }, + { + Role: fantasy.MessageRoleTool, + Content: []fantasy.MessagePart{ + fantasy.ToolResultPart{ + ToolCallID: "vid-resp-1", + Output: fantasy.ToolResultOutputContentMedia{ + Data: "AAAA", + 
MediaType: "video/mp4", + }, + }, + }, + }, + } + + input, warnings := toResponsesPrompt(prompt, "system", false) + + // Assistant function call + function_call_output, but no synthetic user + // image message. + require.Len(t, input, 2) + require.NotNil(t, input[1].OfFunctionCallOutput) + require.Equal(t, "vid-resp-1", input[1].OfFunctionCallOutput.CallID) + require.Len(t, warnings, 1) + require.Contains(t, warnings[0].Message, "video/mp4") +}