
Commit 1cb00f8
Committed Dec 17, 2024
docs: add examples + guidance on Realtime API support
1 parent 8829c32 commit 1cb00f8

5 files changed: +493 −1
 

README.md (+61)
@@ -258,6 +258,67 @@ We recommend that you always instantiate a client (e.g., with `client = OpenAI()
- It's harder to mock for testing purposes
- It's not possible to control cleanup of network connections

## Realtime API beta

The Realtime API enables you to build low-latency, multi-modal conversational experiences. It currently supports text and audio as both input and output, as well as [function calling](https://platform.openai.com/docs/guides/function-calling) through a WebSocket connection.

Under the hood the SDK uses the [`websockets`](https://websockets.readthedocs.io/en/stable/) library to manage connections.

The Realtime API works through a combination of client-sent events and server-sent events. Clients can send events to do things like update session configuration or send text and audio inputs. Server events confirm when audio responses have completed, or when a text response from the model has been received. A full event reference can be found [here](https://platform.openai.com/docs/api-reference/realtime-client-events) and a guide can be found [here](https://platform.openai.com/docs/guides/realtime).

Basic text-based example:

```py
import asyncio
from openai import AsyncOpenAI

async def main():
    client = AsyncOpenAI()

    async with client.beta.realtime.connect(model="gpt-4o-realtime-preview-2024-10-01") as connection:
        await connection.session.update(session={'modalities': ['text']})

        await connection.conversation.item.create(
            item={
                "type": "message",
                "role": "user",
                "content": [{"type": "input_text", "text": "Say hello!"}],
            }
        )
        await connection.response.create()

        async for event in connection:
            if event.type == 'response.text.delta':
                print(event.delta, flush=True, end="")

            elif event.type == 'response.text.done':
                print()

            elif event.type == "response.done":
                break

asyncio.run(main())
```

However, the real magic of the Realtime API is handling audio inputs and outputs; see this example [TUI script](https://github.com/stainless-sdks/openai-python/blob/robert/realtime-docs-preview/examples/realtime/push_to_talk_app.py) for a fully fledged example.
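
As a smaller taste, here is a minimal sketch of streaming audio input yourself, assuming you already have 24kHz mono PCM16 chunks to send (the `pcm16_chunks` iterable below is hypothetical) and have disabled server-side turn detection:

```py
import base64

async def send_audio(connection, pcm16_chunks):
    # append each chunk of raw PCM16 audio to the session's input buffer
    for chunk in pcm16_chunks:  # hypothetical source of 24kHz mono PCM16 bytes
        await connection.input_audio_buffer.append(
            audio=base64.b64encode(chunk).decode("utf-8")
        )

    # with `turn_detection` disabled, you must commit the buffer and
    # explicitly ask the model to start responding
    await connection.input_audio_buffer.commit()
    await connection.response.create()
```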

### Realtime error handling

Whenever an error occurs, the Realtime API will send an [`error` event](https://platform.openai.com/docs/guides/realtime/realtime-api-beta#handling-errors) and the connection will stay open and remain usable. This means you need to handle it yourself, as *no errors are raised directly* by the SDK when an `error` event comes in.

```py
client = AsyncOpenAI()

async with client.beta.realtime.connect(model="gpt-4o-realtime-preview-2024-10-01") as connection:
    ...
    async for event in connection:
        if event.type == 'error':
            print(event.error.type)
            print(event.error.code)
            print(event.error.event_id)
            print(event.error.message)
```
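
If you'd rather fail fast, one option (a sketch, not part of the SDK itself) is to convert the server-sent `error` event into your own exception; `RealtimeError` here is a hypothetical wrapper class:

```py
class RealtimeError(Exception):
    """Hypothetical exception for surfacing server-reported Realtime errors."""

async with client.beta.realtime.connect(model="gpt-4o-realtime-preview-2024-10-01") as connection:
    async for event in connection:
        if event.type == 'error':
            # turn the server-sent error event into a client-side exception
            raise RealtimeError(f"{event.error.type} ({event.error.code}): {event.error.message}")
```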

## Using types

Nested request parameters are [TypedDicts](https://docs.python.org/3/library/typing.html#typing.TypedDict). Responses are [Pydantic models](https://docs.pydantic.dev) which also provide helper methods for things like:

examples/realtime/audio_util.py (new file, +142)
```py
from __future__ import annotations

import io
import base64
import asyncio
import threading
from typing import Callable, Awaitable

import numpy as np
import pyaudio
import sounddevice as sd
from pydub import AudioSegment

from openai.resources.beta.realtime.realtime import AsyncRealtimeConnection

CHUNK_LENGTH_S = 0.05  # 50ms
SAMPLE_RATE = 24000
FORMAT = pyaudio.paInt16
CHANNELS = 1

# pyright: reportUnknownMemberType=false, reportUnknownVariableType=false, reportUnknownArgumentType=false


def audio_to_pcm16_base64(audio_bytes: bytes) -> bytes:
    # load the audio file from the byte stream
    audio = AudioSegment.from_file(io.BytesIO(audio_bytes))
    print(f"Loaded audio: {audio.frame_rate=} {audio.channels=} {audio.sample_width=} {audio.frame_width=}")
    # resample to 24kHz mono pcm16
    pcm_audio = audio.set_frame_rate(SAMPLE_RATE).set_channels(CHANNELS).set_sample_width(2).raw_data
    return pcm_audio


class AudioPlayerAsync:
    def __init__(self):
        self.queue = []
        self.lock = threading.Lock()
        self.stream = sd.OutputStream(
            callback=self.callback,
            samplerate=SAMPLE_RATE,
            channels=CHANNELS,
            dtype=np.int16,
            blocksize=int(CHUNK_LENGTH_S * SAMPLE_RATE),
        )
        self.playing = False
        self._frame_count = 0

    def callback(self, outdata, frames, time, status):  # noqa
        with self.lock:
            data = np.empty(0, dtype=np.int16)

            # get next item from queue if there is still space in the buffer
            while len(data) < frames and len(self.queue) > 0:
                item = self.queue.pop(0)
                frames_needed = frames - len(data)
                data = np.concatenate((data, item[:frames_needed]))
                if len(item) > frames_needed:
                    self.queue.insert(0, item[frames_needed:])

            self._frame_count += len(data)

            # fill the rest of the frames with zeros if there is no more data
            if len(data) < frames:
                data = np.concatenate((data, np.zeros(frames - len(data), dtype=np.int16)))

        outdata[:] = data.reshape(-1, 1)

    def reset_frame_count(self):
        self._frame_count = 0

    def get_frame_count(self):
        return self._frame_count

    def add_data(self, data: bytes):
        with self.lock:
            # bytes is pcm16 single channel audio data, convert to numpy array
            np_data = np.frombuffer(data, dtype=np.int16)
            self.queue.append(np_data)
            if not self.playing:
                self.start()

    def start(self):
        self.playing = True
        self.stream.start()

    def stop(self):
        self.playing = False
        self.stream.stop()
        with self.lock:
            self.queue = []

    def terminate(self):
        self.stream.close()


async def send_audio_worker_sounddevice(
    connection: AsyncRealtimeConnection,
    should_send: Callable[[], bool] | None = None,
    start_send: Callable[[], Awaitable[None]] | None = None,
):
    sent_audio = False

    device_info = sd.query_devices()
    print(device_info)

    read_size = int(SAMPLE_RATE * 0.02)

    stream = sd.InputStream(
        channels=CHANNELS,
        samplerate=SAMPLE_RATE,
        dtype="int16",
    )
    stream.start()

    try:
        while True:
            if stream.read_available < read_size:
                await asyncio.sleep(0)
                continue

            data, _ = stream.read(read_size)

            if should_send() if should_send else True:
                if not sent_audio and start_send:
                    await start_send()
                await connection.send(
                    {"type": "input_audio_buffer.append", "audio": base64.b64encode(data).decode("utf-8")}
                )
                sent_audio = True

            elif sent_audio:
                print("Done, triggering inference")
                await connection.send({"type": "input_audio_buffer.commit"})
                await connection.send({"type": "response.create", "response": {}})
                sent_audio = False

            await asyncio.sleep(0)

    except KeyboardInterrupt:
        pass
    finally:
        stream.stop()
        stream.close()
```

examples/realtime/push_to_talk_app.py (new file, +281)
```py
#!/usr/bin/env uv run
####################################################################
# Sample TUI app with a push to talk interface to the Realtime API #
# If you have `uv` installed and the `OPENAI_API_KEY`              #
# environment variable set, you can run this example with just    #
#                                                                  #
# `./examples/realtime/push_to_talk_app.py`                        #
####################################################################
#
# /// script
# requires-python = ">=3.9"
# dependencies = [
#     "textual",
#     "numpy",
#     "pyaudio",
#     "pydub",
#     "sounddevice",
#     "openai[realtime]",
# ]
#
# [tool.uv.sources]
# openai = { path = "../../", editable = true }
# ///
from __future__ import annotations

import base64
import asyncio
from typing import Any, cast
from typing_extensions import override

from textual import events
from audio_util import CHANNELS, SAMPLE_RATE, AudioPlayerAsync
from textual.app import App, ComposeResult
from textual.widgets import Button, Static, RichLog
from textual.reactive import reactive
from textual.containers import Container

from openai import AsyncOpenAI
from openai.types.beta.realtime.session import Session
from openai.resources.beta.realtime.realtime import AsyncRealtimeConnection


class SessionDisplay(Static):
    """A widget that shows the current session ID."""

    session_id = reactive("")

    @override
    def render(self) -> str:
        return f"Session ID: {self.session_id}" if self.session_id else "Connecting..."


class AudioStatusIndicator(Static):
    """A widget that shows the current audio recording status."""

    is_recording = reactive(False)

    @override
    def render(self) -> str:
        status = (
            "🔴 Recording... (Press K to stop)" if self.is_recording else "⚪ Press K to start recording (Q to quit)"
        )
        return status


class RealtimeApp(App[None]):
    CSS = """
    Screen {
        background: #1a1b26;  /* Dark blue-grey background */
    }

    Container {
        border: double rgb(91, 164, 91);
    }

    Horizontal {
        width: 100%;
    }

    #input-container {
        height: 5;  /* Explicit height for input container */
        margin: 1 1;
        padding: 1 2;
    }

    Input {
        width: 80%;
        height: 3;  /* Explicit height for input */
    }

    Button {
        width: 20%;
        height: 3;  /* Explicit height for button */
    }

    #bottom-pane {
        width: 100%;
        height: 82%;  /* Reduced to make room for session display */
        border: round rgb(205, 133, 63);
        content-align: center middle;
    }

    #status-indicator {
        height: 3;
        content-align: center middle;
        background: #2a2b36;
        border: solid rgb(91, 164, 91);
        margin: 1 1;
    }

    #session-display {
        height: 3;
        content-align: center middle;
        background: #2a2b36;
        border: solid rgb(91, 164, 91);
        margin: 1 1;
    }

    Static {
        color: white;
    }
    """

    client: AsyncOpenAI
    should_send_audio: asyncio.Event
    audio_player: AudioPlayerAsync
    last_audio_item_id: str | None
    connection: AsyncRealtimeConnection | None
    session: Session | None
    connected: asyncio.Event

    def __init__(self) -> None:
        super().__init__()
        self.connection = None
        self.session = None
        self.client = AsyncOpenAI()
        self.audio_player = AudioPlayerAsync()
        self.last_audio_item_id = None
        self.should_send_audio = asyncio.Event()
        self.connected = asyncio.Event()

    @override
    def compose(self) -> ComposeResult:
        """Create child widgets for the app."""
        with Container():
            yield SessionDisplay(id="session-display")
            yield AudioStatusIndicator(id="status-indicator")
            yield RichLog(id="bottom-pane", wrap=True, highlight=True, markup=True)

    async def on_mount(self) -> None:
        self.run_worker(self.handle_realtime_connection())
        self.run_worker(self.send_mic_audio())

    async def handle_realtime_connection(self) -> None:
        async with self.client.beta.realtime.connect(model="gpt-4o-realtime-preview-2024-10-01") as conn:
            self.connection = conn
            self.connected.set()

            # note: this is the default and can be omitted
            # if you want to manually handle VAD yourself, then set `'turn_detection': None`
            await conn.session.update(session={"turn_detection": {"type": "server_vad"}})

            acc_items: dict[str, Any] = {}

            async for event in conn:
                if event.type == "session.created":
                    self.session = event.session
                    session_display = self.query_one(SessionDisplay)
                    assert event.session.id is not None
                    session_display.session_id = event.session.id
                    continue

                if event.type == "session.updated":
                    self.session = event.session
                    continue

                if event.type == "response.audio.delta":
                    if event.item_id != self.last_audio_item_id:
                        self.audio_player.reset_frame_count()
                        self.last_audio_item_id = event.item_id

                    bytes_data = base64.b64decode(event.delta)
                    self.audio_player.add_data(bytes_data)
                    continue

                if event.type == "response.audio_transcript.delta":
                    try:
                        text = acc_items[event.item_id]
                    except KeyError:
                        acc_items[event.item_id] = event.delta
                    else:
                        acc_items[event.item_id] = text + event.delta

                    # Clear and update the entire content because RichLog otherwise treats each delta as a new line
                    bottom_pane = self.query_one("#bottom-pane", RichLog)
                    bottom_pane.clear()
                    bottom_pane.write(acc_items[event.item_id])
                    continue

    async def _get_connection(self) -> AsyncRealtimeConnection:
        await self.connected.wait()
        assert self.connection is not None
        return self.connection

    async def send_mic_audio(self) -> None:
        import sounddevice as sd  # type: ignore

        sent_audio = False

        device_info = sd.query_devices()
        print(device_info)

        read_size = int(SAMPLE_RATE * 0.02)

        stream = sd.InputStream(
            channels=CHANNELS,
            samplerate=SAMPLE_RATE,
            dtype="int16",
        )
        stream.start()

        status_indicator = self.query_one(AudioStatusIndicator)

        try:
            while True:
                if stream.read_available < read_size:
                    await asyncio.sleep(0)
                    continue

                await self.should_send_audio.wait()
                status_indicator.is_recording = True

                data, _ = stream.read(read_size)

                connection = await self._get_connection()
                if not sent_audio:
                    asyncio.create_task(connection.send({"type": "response.cancel"}))
                    sent_audio = True

                await connection.input_audio_buffer.append(audio=base64.b64encode(cast(Any, data)).decode("utf-8"))

                await asyncio.sleep(0)
        except KeyboardInterrupt:
            pass
        finally:
            stream.stop()
            stream.close()

    async def on_key(self, event: events.Key) -> None:
        """Handle key press events."""
        if event.key == "enter":
            self.query_one(Button).press()
            return

        if event.key == "q":
            self.exit()
            return

        if event.key == "k":
            status_indicator = self.query_one(AudioStatusIndicator)
            if status_indicator.is_recording:
                self.should_send_audio.clear()
                status_indicator.is_recording = False

                if self.session and self.session.turn_detection is None:
                    # The default in the API is that the model will automatically detect when the user has
                    # stopped talking and then start responding itself.
                    #
                    # However if we're in manual `turn_detection` mode then we need to
                    # manually tell the model to commit the audio buffer and start responding.
                    conn = await self._get_connection()
                    await conn.input_audio_buffer.commit()
                    await conn.response.create()
            else:
                self.should_send_audio.set()
                status_indicator.is_recording = True


if __name__ == "__main__":
    app = RealtimeApp()
    app.run()
```

mypy.ini (+4, −1)

```diff
@@ -8,7 +8,10 @@ show_error_codes = True
 #
 # We also exclude our `tests` as mypy doesn't always infer
 # types correctly and Pyright will still catch any type errors.
-exclude = ^(src/openai/_files\.py|src/openai/_utils/_logs\.py|_dev/.*\.py|tests/.*)$
+
+# realtime examples use inline `uv` script dependencies
+# which means it can't be type checked
+exclude = ^(src/openai/_files\.py|_dev/.*\.py|tests/.*|src/openai/_utils/_logs\.py|examples/realtime/audio_util\.py|examples/realtime/push_to_talk_app\.py)$
 
 strict_equality = True
 implicit_reexport = True
```

pyproject.toml (+5)

```diff
@@ -157,6 +157,11 @@ exclude = [
     "_dev",
     ".venv",
     ".nox",
+
+    # uses inline `uv` script dependencies
+    # which means it can't be type checked
+    "examples/realtime/audio_util.py",
+    "examples/realtime/push_to_talk_app.py"
 ]
 
 reportImplicitOverride = true
```
