audiotranscription.go

  1// File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
  2
  3package openai
  4
  5import (
  6	"bytes"
  7	"context"
  8	"encoding/json"
  9	"io"
 10	"mime/multipart"
 11	"net/http"
 12
 13	"github.com/openai/openai-go/internal/apiform"
 14	"github.com/openai/openai-go/internal/apijson"
 15	"github.com/openai/openai-go/internal/requestconfig"
 16	"github.com/openai/openai-go/option"
 17	"github.com/openai/openai-go/packages/param"
 18	"github.com/openai/openai-go/packages/resp"
 19	"github.com/openai/openai-go/packages/ssestream"
 20	"github.com/openai/openai-go/shared/constant"
 21)
 22
// AudioTranscriptionService contains methods and other services that help with
// interacting with the openai API.
//
// Note, unlike clients, this service does not read variables from the environment
// automatically. You should not instantiate this service directly; use the
// [NewAudioTranscriptionService] method instead.
type AudioTranscriptionService struct {
	// Options holds the request options stored on the service. The New and
	// NewStreaming methods place them ahead of any call-specific options.
	Options []option.RequestOption
}
 32
 33// NewAudioTranscriptionService generates a new service that applies the given
 34// options to each request. These options are applied after the parent client's
 35// options (if there is one), and before any request-specific options.
 36func NewAudioTranscriptionService(opts ...option.RequestOption) (r AudioTranscriptionService) {
 37	r = AudioTranscriptionService{}
 38	r.Options = opts
 39	return
 40}
 41
 42// Transcribes audio into the input language.
 43func (r *AudioTranscriptionService) New(ctx context.Context, body AudioTranscriptionNewParams, opts ...option.RequestOption) (res *Transcription, err error) {
 44	opts = append(r.Options[:], opts...)
 45	path := "audio/transcriptions"
 46	err = requestconfig.ExecuteNewRequest(ctx, http.MethodPost, path, body, &res, opts...)
 47	return
 48}
 49
 50// Transcribes audio into the input language.
 51func (r *AudioTranscriptionService) NewStreaming(ctx context.Context, body AudioTranscriptionNewParams, opts ...option.RequestOption) (stream *ssestream.Stream[TranscriptionStreamEventUnion]) {
 52	var (
 53		raw *http.Response
 54		err error
 55	)
 56	opts = append(r.Options[:], opts...)
 57	opts = append([]option.RequestOption{option.WithJSONSet("stream", true)}, opts...)
 58	path := "audio/transcriptions"
 59	err = requestconfig.ExecuteNewRequest(ctx, http.MethodPost, path, body, &raw, opts...)
 60	return ssestream.NewStream[TranscriptionStreamEventUnion](ssestream.NewDecoder(raw), err)
 61}
 62
// Transcription represents a transcription response returned by the model,
// based on the provided input.
type Transcription struct {
	// The transcribed text.
	Text string `json:"text,required"`
	// The log probabilities of the tokens in the transcription. Only returned with the
	// models `gpt-4o-transcribe` and `gpt-4o-mini-transcribe` if `logprobs` is added
	// to the `include` array.
	Logprobs []TranscriptionLogprob `json:"logprobs"`
	// Metadata for the response, check the presence of optional fields with the
	// [resp.Field.IsPresent] method. The unexported raw field backs RawJSON.
	JSON struct {
		Text        resp.Field
		Logprobs    resp.Field
		ExtraFields map[string]resp.Field
		raw         string
	} `json:"-"`
}
 81
 82// Returns the unmodified JSON received from the API
 83func (r Transcription) RawJSON() string { return r.JSON.raw }
 84func (r *Transcription) UnmarshalJSON(data []byte) error {
 85	return apijson.UnmarshalRoot(data, r)
 86}
 87
// TranscriptionLogprob describes one token of a transcription together with
// its bytes and log probability. It is the element type of
// [Transcription.Logprobs].
type TranscriptionLogprob struct {
	// The token in the transcription.
	Token string `json:"token"`
	// The bytes of the token.
	Bytes []float64 `json:"bytes"`
	// The log probability of the token.
	Logprob float64 `json:"logprob"`
	// Metadata for the response, check the presence of optional fields with the
	// [resp.Field.IsPresent] method. The unexported raw field backs RawJSON.
	JSON struct {
		Token       resp.Field
		Bytes       resp.Field
		Logprob     resp.Field
		ExtraFields map[string]resp.Field
		raw         string
	} `json:"-"`
}
105
106// Returns the unmodified JSON received from the API
107func (r TranscriptionLogprob) RawJSON() string { return r.JSON.raw }
108func (r *TranscriptionLogprob) UnmarshalJSON(data []byte) error {
109	return apijson.UnmarshalRoot(data, r)
110}
111
// TranscriptionInclude names an additional piece of information that can be
// requested in a transcription response; it is the element type of the Include
// field of [AudioTranscriptionNewParams].
type TranscriptionInclude string

const (
	// TranscriptionIncludeLogprobs is the "logprobs" include value.
	TranscriptionIncludeLogprobs TranscriptionInclude = "logprobs"
)
117
// TranscriptionStreamEventUnion contains all possible properties and values from
// [TranscriptionTextDeltaEvent], [TranscriptionTextDoneEvent].
//
// Use the [TranscriptionStreamEventUnion.AsAny] method to switch on the variant.
//
// Use the methods beginning with 'As' to cast the union to one of its variants.
type TranscriptionStreamEventUnion struct {
	// This field is from variant [TranscriptionTextDeltaEvent].
	Delta string `json:"delta"`
	// Any of "transcript.text.delta", "transcript.text.done".
	// AsAny switches on this value to pick the variant.
	Type string `json:"type"`
	// This field is a union of [[]TranscriptionTextDeltaEventLogprob],
	// [[]TranscriptionTextDoneEventLogprob]
	Logprobs TranscriptionStreamEventUnionLogprobs `json:"logprobs"`
	// This field is from variant [TranscriptionTextDoneEvent].
	Text string `json:"text"`
	// Metadata for the response. The unexported raw field backs RawJSON and is
	// the input the 'As' methods decode from.
	JSON struct {
		Delta    resp.Field
		Type     resp.Field
		Logprobs resp.Field
		Text     resp.Field
		raw      string
	} `json:"-"`
}
142
143// Use the following switch statement to find the correct variant
144//
145//	switch variant := TranscriptionStreamEventUnion.AsAny().(type) {
146//	case TranscriptionTextDeltaEvent:
147//	case TranscriptionTextDoneEvent:
148//	default:
149//	  fmt.Errorf("no variant present")
150//	}
151func (u TranscriptionStreamEventUnion) AsAny() any {
152	switch u.Type {
153	case "transcript.text.delta":
154		return u.AsTranscriptTextDelta()
155	case "transcript.text.done":
156		return u.AsTranscriptTextDone()
157	}
158	return nil
159}
160
161func (u TranscriptionStreamEventUnion) AsTranscriptTextDelta() (v TranscriptionTextDeltaEvent) {
162	apijson.UnmarshalRoot(json.RawMessage(u.JSON.raw), &v)
163	return
164}
165
166func (u TranscriptionStreamEventUnion) AsTranscriptTextDone() (v TranscriptionTextDoneEvent) {
167	apijson.UnmarshalRoot(json.RawMessage(u.JSON.raw), &v)
168	return
169}
170
171// Returns the unmodified JSON received from the API
172func (u TranscriptionStreamEventUnion) RawJSON() string { return u.JSON.raw }
173
174func (r *TranscriptionStreamEventUnion) UnmarshalJSON(data []byte) error {
175	return apijson.UnmarshalRoot(data, r)
176}
177
// TranscriptionStreamEventUnionLogprobs is an implicit subunion of
// [TranscriptionStreamEventUnion]. TranscriptionStreamEventUnionLogprobs provides
// convenient access to the sub-properties of the union.
//
// For type safety it is recommended to directly use a variant of the
// [TranscriptionStreamEventUnion].
//
// If the underlying value is not a json object, one of the following properties
// will be valid: OfTranscriptionTextDeltaEventLogprobs,
// OfTranscriptionTextDoneEventLogprobs.
type TranscriptionStreamEventUnionLogprobs struct {
	// This field will be present if the value is a
	// [[]TranscriptionTextDeltaEventLogprob] instead of an object.
	OfTranscriptionTextDeltaEventLogprobs []TranscriptionTextDeltaEventLogprob `json:",inline"`
	// This field will be present if the value is a
	// [[]TranscriptionTextDoneEventLogprob] instead of an object.
	OfTranscriptionTextDoneEventLogprobs []TranscriptionTextDoneEventLogprob `json:",inline"`
	// Metadata for the response, with one entry per variant field plus the raw
	// JSON text.
	JSON                                 struct {
		OfTranscriptionTextDeltaEventLogprobs resp.Field
		OfTranscriptionTextDoneEventLogprobs  resp.Field
		raw                                   string
	} `json:"-"`
}
201
202func (r *TranscriptionStreamEventUnionLogprobs) UnmarshalJSON(data []byte) error {
203	return apijson.UnmarshalRoot(data, r)
204}
205
// TranscriptionTextDeltaEvent is emitted when there is an additional text
// delta. This is also the first event emitted when the transcription starts.
// Only emitted when you
// [create a transcription](https://platform.openai.com/docs/api-reference/audio/create-transcription)
// with the `Stream` parameter set to `true`.
type TranscriptionTextDeltaEvent struct {
	// The text delta that was additionally transcribed.
	Delta string `json:"delta,required"`
	// The type of the event. Always `transcript.text.delta`.
	Type constant.TranscriptTextDelta `json:"type,required"`
	// The log probabilities of the delta. Only included if you
	// [create a transcription](https://platform.openai.com/docs/api-reference/audio/create-transcription)
	// with the `include[]` parameter set to `logprobs`.
	Logprobs []TranscriptionTextDeltaEventLogprob `json:"logprobs"`
	// Metadata for the response, check the presence of optional fields with the
	// [resp.Field.IsPresent] method. The unexported raw field backs RawJSON.
	JSON struct {
		Delta       resp.Field
		Type        resp.Field
		Logprobs    resp.Field
		ExtraFields map[string]resp.Field
		raw         string
	} `json:"-"`
}
229
230// Returns the unmodified JSON received from the API
231func (r TranscriptionTextDeltaEvent) RawJSON() string { return r.JSON.raw }
232func (r *TranscriptionTextDeltaEvent) UnmarshalJSON(data []byte) error {
233	return apijson.UnmarshalRoot(data, r)
234}
235
236type TranscriptionTextDeltaEventLogprob struct {
237	// The token that was used to generate the log probability.
238	Token string `json:"token"`
239	// The bytes that were used to generate the log probability.
240	Bytes []interface{} `json:"bytes"`
241	// The log probability of the token.
242	Logprob float64 `json:"logprob"`
243	// Metadata for the response, check the presence of optional fields with the
244	// [resp.Field.IsPresent] method.
245	JSON struct {
246		Token       resp.Field
247		Bytes       resp.Field
248		Logprob     resp.Field
249		ExtraFields map[string]resp.Field
250		raw         string
251	} `json:"-"`
252}
253
254// Returns the unmodified JSON received from the API
255func (r TranscriptionTextDeltaEventLogprob) RawJSON() string { return r.JSON.raw }
256func (r *TranscriptionTextDeltaEventLogprob) UnmarshalJSON(data []byte) error {
257	return apijson.UnmarshalRoot(data, r)
258}
259
// TranscriptionTextDoneEvent is emitted when the transcription is complete.
// Contains the complete transcription text. Only emitted when you
// [create a transcription](https://platform.openai.com/docs/api-reference/audio/create-transcription)
// with the `Stream` parameter set to `true`.
type TranscriptionTextDoneEvent struct {
	// The text that was transcribed.
	Text string `json:"text,required"`
	// The type of the event. Always `transcript.text.done`.
	Type constant.TranscriptTextDone `json:"type,required"`
	// The log probabilities of the individual tokens in the transcription. Only
	// included if you
	// [create a transcription](https://platform.openai.com/docs/api-reference/audio/create-transcription)
	// with the `include[]` parameter set to `logprobs`.
	Logprobs []TranscriptionTextDoneEventLogprob `json:"logprobs"`
	// Metadata for the response, check the presence of optional fields with the
	// [resp.Field.IsPresent] method. The unexported raw field backs RawJSON.
	JSON struct {
		Text        resp.Field
		Type        resp.Field
		Logprobs    resp.Field
		ExtraFields map[string]resp.Field
		raw         string
	} `json:"-"`
}
284
285// Returns the unmodified JSON received from the API
286func (r TranscriptionTextDoneEvent) RawJSON() string { return r.JSON.raw }
287func (r *TranscriptionTextDoneEvent) UnmarshalJSON(data []byte) error {
288	return apijson.UnmarshalRoot(data, r)
289}
290
291type TranscriptionTextDoneEventLogprob struct {
292	// The token that was used to generate the log probability.
293	Token string `json:"token"`
294	// The bytes that were used to generate the log probability.
295	Bytes []interface{} `json:"bytes"`
296	// The log probability of the token.
297	Logprob float64 `json:"logprob"`
298	// Metadata for the response, check the presence of optional fields with the
299	// [resp.Field.IsPresent] method.
300	JSON struct {
301		Token       resp.Field
302		Bytes       resp.Field
303		Logprob     resp.Field
304		ExtraFields map[string]resp.Field
305		raw         string
306	} `json:"-"`
307}
308
309// Returns the unmodified JSON received from the API
310func (r TranscriptionTextDoneEventLogprob) RawJSON() string { return r.JSON.raw }
311func (r *TranscriptionTextDoneEventLogprob) UnmarshalJSON(data []byte) error {
312	return apijson.UnmarshalRoot(data, r)
313}
314
// AudioTranscriptionNewParams holds the request parameters accepted by
// [AudioTranscriptionService.New] and [AudioTranscriptionService.NewStreaming].
// It is encoded as a multipart form by its MarshalMultipart method.
type AudioTranscriptionNewParams struct {
	// The audio file object (not file name) to transcribe, in one of these formats:
	// flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
	File io.Reader `json:"file,required" format:"binary"`
	// ID of the model to use. The options are `gpt-4o-transcribe`,
	// `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source
	// Whisper V2 model).
	Model AudioModel `json:"model,omitzero,required"`
	// The language of the input audio. Supplying the input language in
	// [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
	// format will improve accuracy and latency.
	Language param.Opt[string] `json:"language,omitzero"`
	// An optional text to guide the model's style or continue a previous audio
	// segment. The
	// [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
	// should match the audio language.
	Prompt param.Opt[string] `json:"prompt,omitzero"`
	// The sampling temperature, between 0 and 1. Higher values like 0.8 will make the
	// output more random, while lower values like 0.2 will make it more focused and
	// deterministic. If set to 0, the model will use
	// [log probability](https://en.wikipedia.org/wiki/Log_probability) to
	// automatically increase the temperature until certain thresholds are hit.
	Temperature param.Opt[float64] `json:"temperature,omitzero"`
	// Additional information to include in the transcription response. `logprobs` will
	// return the log probabilities of the tokens in the response to understand the
	// model's confidence in the transcription. `logprobs` only works with
	// response_format set to `json` and only with the models `gpt-4o-transcribe` and
	// `gpt-4o-mini-transcribe`.
	Include []TranscriptionInclude `json:"include,omitzero"`
	// The format of the output, in one of these options: `json`, `text`, `srt`,
	// `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`,
	// the only supported format is `json`.
	//
	// Any of "json", "text", "srt", "verbose_json", "vtt".
	ResponseFormat AudioResponseFormat `json:"response_format,omitzero"`
	// The timestamp granularities to populate for this transcription.
	// `response_format` must be set to `verbose_json` to use timestamp granularities.
	// Either or both of these options are supported: `word`, or `segment`. Note: There
	// is no additional latency for segment timestamps, but generating word timestamps
	// incurs additional latency.
	TimestampGranularities []string `json:"timestamp_granularities,omitzero"`
	// paramObj is declared elsewhere in the package; presumably it supplies the
	// IsNull method used by IsPresent — TODO confirm.
	paramObj
}
358
359// IsPresent returns true if the field's value is not omitted and not the JSON
360// "null". To check if this field is omitted, use [param.IsOmitted].
361func (f AudioTranscriptionNewParams) IsPresent() bool { return !param.IsOmitted(f) && !f.IsNull() }
362
363func (r AudioTranscriptionNewParams) MarshalMultipart() (data []byte, contentType string, err error) {
364	buf := bytes.NewBuffer(nil)
365	writer := multipart.NewWriter(buf)
366	err = apiform.MarshalRoot(r, writer)
367	if err != nil {
368		writer.Close()
369		return nil, "", err
370	}
371	err = writer.Close()
372	if err != nil {
373		return nil, "", err
374	}
375	return buf.Bytes(), writer.FormDataContentType(), nil
376}