tool_result_media_test.go

  1package openai
  2
  3import (
  4	"encoding/base64"
  5	"testing"
  6
  7	"charm.land/fantasy"
  8	"github.com/stretchr/testify/require"
  9)
 10
 11// Tool messages in the OpenAI Chat Completions API cannot carry image or audio
 12// content directly — the SDK's content union only accepts text. When a tool
 13// returns media, DefaultToPrompt must still emit a text tool message so the
 14// tool_call/tool_result pairing stays valid, and attach the media to a
 15// synthetic follow-up user message so vision- and audio-capable models can see
 16// it.
 17
 18func TestDefaultToPrompt_MediaToolResult_ImagePNG(t *testing.T) {
 19	t.Parallel()
 20
 21	imageData := base64.StdEncoding.EncodeToString([]byte{0, 1, 2, 3})
 22	prompt := fantasy.Prompt{
 23		{
 24			Role: fantasy.MessageRoleAssistant,
 25			Content: []fantasy.MessagePart{
 26				fantasy.ToolCallPart{ToolCallID: "img-1", ToolName: "view", Input: "{}"},
 27			},
 28		},
 29		{
 30			Role: fantasy.MessageRoleTool,
 31			Content: []fantasy.MessagePart{
 32				fantasy.ToolResultPart{
 33					ToolCallID: "img-1",
 34					Output: fantasy.ToolResultOutputContentMedia{
 35						Data:      imageData,
 36						MediaType: "image/png",
 37					},
 38				},
 39			},
 40		},
 41	}
 42
 43	messages, warnings := DefaultToPrompt(prompt, "openrouter", "anthropic/claude-opus-4.7")
 44
 45	require.Empty(t, warnings)
 46	// Assistant tool call + text tool message + synthetic user image message.
 47	require.Len(t, messages, 3)
 48
 49	toolMsg := messages[1].OfTool
 50	require.NotNil(t, toolMsg)
 51	require.Equal(t, "img-1", toolMsg.ToolCallID)
 52	require.Contains(t, toolMsg.Content.OfString.Value, "image/png")
 53
 54	userMsg := messages[2].OfUser
 55	require.NotNil(t, userMsg)
 56	require.Len(t, userMsg.Content.OfArrayOfContentParts, 1)
 57	imagePart := userMsg.Content.OfArrayOfContentParts[0].OfImageURL
 58	require.NotNil(t, imagePart)
 59	require.Equal(t, "data:image/png;base64,"+imageData, imagePart.ImageURL.URL)
 60}
 61
 62func TestDefaultToPrompt_MediaToolResult_PrefersAccompanyingText(t *testing.T) {
 63	t.Parallel()
 64
 65	imageData := base64.StdEncoding.EncodeToString([]byte{9, 9, 9})
 66	prompt := fantasy.Prompt{
 67		{
 68			Role: fantasy.MessageRoleAssistant,
 69			Content: []fantasy.MessagePart{
 70				fantasy.ToolCallPart{ToolCallID: "img-2", ToolName: "view", Input: "{}"},
 71			},
 72		},
 73		{
 74			Role: fantasy.MessageRoleTool,
 75			Content: []fantasy.MessagePart{
 76				fantasy.ToolResultPart{
 77					ToolCallID: "img-2",
 78					Output: fantasy.ToolResultOutputContentMedia{
 79						Data:      imageData,
 80						MediaType: "image/jpeg",
 81						Text:      "Screenshot of the blockquote element.",
 82					},
 83				},
 84			},
 85		},
 86	}
 87
 88	messages, warnings := DefaultToPrompt(prompt, "openrouter", "anthropic/claude-opus-4.7")
 89
 90	require.Empty(t, warnings)
 91	require.Len(t, messages, 3)
 92	require.Equal(t, "Screenshot of the blockquote element.", messages[1].OfTool.Content.OfString.Value)
 93}
 94
 95func TestDefaultToPrompt_MediaToolResult_AudioWAV(t *testing.T) {
 96	t.Parallel()
 97
 98	audio := base64.StdEncoding.EncodeToString([]byte("fake-wav-bytes"))
 99	prompt := fantasy.Prompt{
100		{
101			Role: fantasy.MessageRoleAssistant,
102			Content: []fantasy.MessagePart{
103				fantasy.ToolCallPart{ToolCallID: "audio-1", ToolName: "record", Input: "{}"},
104			},
105		},
106		{
107			Role: fantasy.MessageRoleTool,
108			Content: []fantasy.MessagePart{
109				fantasy.ToolResultPart{
110					ToolCallID: "audio-1",
111					Output: fantasy.ToolResultOutputContentMedia{
112						Data:      audio,
113						MediaType: "audio/wav",
114					},
115				},
116			},
117		},
118	}
119
120	messages, warnings := DefaultToPrompt(prompt, "openai", "gpt-4o-audio")
121
122	require.Empty(t, warnings)
123	require.Len(t, messages, 3)
124	require.NotNil(t, messages[1].OfTool)
125	userMsg := messages[2].OfUser
126	require.NotNil(t, userMsg)
127	require.Len(t, userMsg.Content.OfArrayOfContentParts, 1)
128	audioPart := userMsg.Content.OfArrayOfContentParts[0].OfInputAudio
129	require.NotNil(t, audioPart)
130	require.Equal(t, audio, audioPart.InputAudio.Data)
131	require.Equal(t, "wav", audioPart.InputAudio.Format)
132}
133
134func TestDefaultToPrompt_MediaToolResult_UnsupportedMediaType(t *testing.T) {
135	t.Parallel()
136
137	prompt := fantasy.Prompt{
138		{
139			Role: fantasy.MessageRoleAssistant,
140			Content: []fantasy.MessagePart{
141				fantasy.ToolCallPart{ToolCallID: "vid-1", ToolName: "record", Input: "{}"},
142			},
143		},
144		{
145			Role: fantasy.MessageRoleTool,
146			Content: []fantasy.MessagePart{
147				fantasy.ToolResultPart{
148					ToolCallID: "vid-1",
149					Output: fantasy.ToolResultOutputContentMedia{
150						Data:      "AAAA",
151						MediaType: "video/mp4",
152					},
153				},
154			},
155		},
156	}
157
158	messages, warnings := DefaultToPrompt(prompt, "openai", "gpt-5")
159
160	// Assistant tool call + text tool message, but no synthetic user image.
161	require.Len(t, messages, 2)
162	require.NotNil(t, messages[1].OfTool)
163	require.Equal(t, "vid-1", messages[1].OfTool.ToolCallID)
164	require.Len(t, warnings, 1)
165	require.Contains(t, warnings[0].Message, "video/mp4")
166}
167
168func TestToResponsesPrompt_MediaToolResult_ImagePNG(t *testing.T) {
169	t.Parallel()
170
171	imageData := base64.StdEncoding.EncodeToString([]byte{7, 7, 7, 7})
172	prompt := fantasy.Prompt{
173		{
174			Role: fantasy.MessageRoleAssistant,
175			Content: []fantasy.MessagePart{
176				fantasy.ToolCallPart{ToolCallID: "img-resp-1", ToolName: "view", Input: "{}"},
177			},
178		},
179		{
180			Role: fantasy.MessageRoleTool,
181			Content: []fantasy.MessagePart{
182				fantasy.ToolResultPart{
183					ToolCallID: "img-resp-1",
184					Output: fantasy.ToolResultOutputContentMedia{
185						Data:      imageData,
186						MediaType: "image/png",
187					},
188				},
189			},
190		},
191	}
192
193	input, warnings := toResponsesPrompt(prompt, "system", false)
194
195	require.Empty(t, warnings)
196	// Assistant function call + function_call_output + synthetic user image
197	// message.
198	require.Len(t, input, 3)
199
200	funcOut := input[1].OfFunctionCallOutput
201	require.NotNil(t, funcOut)
202	require.Equal(t, "img-resp-1", funcOut.CallID)
203	require.Contains(t, funcOut.Output.OfString.Value, "image/png")
204
205	userMsg := input[2].OfMessage
206	require.NotNil(t, userMsg)
207	parts := userMsg.Content.OfInputItemContentList
208	require.Len(t, parts, 1)
209	imagePart := parts[0].OfInputImage
210	require.NotNil(t, imagePart)
211	require.Equal(t, "data:image/png;base64,"+imageData, imagePart.ImageURL.Value)
212}
213
214func TestToResponsesPrompt_MediaToolResult_UnsupportedMediaType(t *testing.T) {
215	t.Parallel()
216
217	prompt := fantasy.Prompt{
218		{
219			Role: fantasy.MessageRoleAssistant,
220			Content: []fantasy.MessagePart{
221				fantasy.ToolCallPart{ToolCallID: "vid-resp-1", ToolName: "record", Input: "{}"},
222			},
223		},
224		{
225			Role: fantasy.MessageRoleTool,
226			Content: []fantasy.MessagePart{
227				fantasy.ToolResultPart{
228					ToolCallID: "vid-resp-1",
229					Output: fantasy.ToolResultOutputContentMedia{
230						Data:      "AAAA",
231						MediaType: "video/mp4",
232					},
233				},
234			},
235		},
236	}
237
238	input, warnings := toResponsesPrompt(prompt, "system", false)
239
240	// Assistant function call + function_call_output, but no synthetic user
241	// image message.
242	require.Len(t, input, 2)
243	require.NotNil(t, input[1].OfFunctionCallOutput)
244	require.Equal(t, "vid-resp-1", input[1].OfFunctionCallOutput.CallID)
245	require.Len(t, warnings, 1)
246	require.Contains(t, warnings[0].Message, "video/mp4")
247}