Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 423655c

Browse files
authoredMar 20, 2025··
feat: add audio helpers
* add audio helpers * update ignore, lockfile, add execute * fix examples, lint errors * lint and export errors * temp: ignore type errors
1 parent ab5192d commit 423655c

11 files changed

+341
-28
lines changed
 

‎.gitignore

+4
Original file line numberDiff line numberDiff line change
@@ -14,3 +14,7 @@ dist
1414
.envrc
1515
codegen.log
1616
Brewfile.lock.json
17+
18+
.DS_Store
19+
20+
examples/*.mp3

‎examples/audio.py

-26
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
#!/usr/bin/env rye run python
22

3-
import time
43
from pathlib import Path
54

65
from openai import OpenAI
@@ -12,8 +11,6 @@
1211

1312

1413
def main() -> None:
15-
stream_to_speakers()
16-
1714
# Create text-to-speech audio file
1815
with openai.audio.speech.with_streaming_response.create(
1916
model="tts-1",
@@ -37,28 +34,5 @@ def main() -> None:
3734
print(translation.text)
3835

3936

40-
def stream_to_speakers() -> None:
41-
import pyaudio
42-
43-
player_stream = pyaudio.PyAudio().open(format=pyaudio.paInt16, channels=1, rate=24000, output=True)
44-
45-
start_time = time.time()
46-
47-
with openai.audio.speech.with_streaming_response.create(
48-
model="tts-1",
49-
voice="alloy",
50-
response_format="pcm", # similar to WAV, but without a header chunk at the start.
51-
input="""I see skies of blue and clouds of white
52-
The bright blessed days, the dark sacred nights
53-
And I think to myself
54-
What a wonderful world""",
55-
) as response:
56-
print(f"Time to first byte: {int((time.time() - start_time) * 1000)}ms")
57-
for chunk in response.iter_bytes(chunk_size=1024):
58-
player_stream.write(chunk)
59-
60-
print(f"Done in {int((time.time() - start_time) * 1000)}ms.")
61-
62-
6337
if __name__ == "__main__":
6438
main()

‎examples/speech_to_text.py

+25
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
#!/usr/bin/env rye run python
2+
3+
import asyncio
4+
5+
from openai import AsyncOpenAI
6+
from openai.helpers import Microphone
7+
8+
# The client reads OPENAI_API_KEY from the environment.
openai = AsyncOpenAI()


async def main() -> None:
    """Record from the microphone with a 10 second timeout and print the transcription."""
    print("Recording for the next 10 seconds...")
    microphone = Microphone(timeout=10)
    captured_audio = await microphone.record()
    print("Recording complete")

    # `record()` is called without `return_ndarray`, so its result is passed
    # straight through as the `file` argument.
    result = await openai.audio.transcriptions.create(model="whisper-1", file=captured_audio)
    print(result.text)


if __name__ == "__main__":
    asyncio.run(main())

‎examples/text_to_speech.py

+31
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
#!/usr/bin/env rye run python
2+
3+
import time
4+
import asyncio
5+
6+
from openai import AsyncOpenAI
7+
from openai.helpers import LocalAudioPlayer
8+
9+
# The client reads OPENAI_API_KEY from the environment.
openai = AsyncOpenAI()


async def main() -> None:
    """Stream a short text-to-speech clip to the local audio player and print timings."""
    started_at = time.time()

    def elapsed_ms() -> int:
        # Whole milliseconds since `main` started.
        return int((time.time() - started_at) * 1000)

    async with openai.audio.speech.with_streaming_response.create(
        model="tts-1",
        voice="alloy",
        response_format="pcm",  # similar to WAV, but without a header chunk at the start.
        input="""I see skies of blue and clouds of white
The bright blessed days, the dark sacred nights
And I think to myself
What a wonderful world""",
    ) as response:
        print(f"Time to first byte: {elapsed_ms()}ms")
        await LocalAudioPlayer().play(response)
        print(f"Time to play: {elapsed_ms()}ms")


if __name__ == "__main__":
    asyncio.run(main())

‎pyproject.toml

+2
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@ dependencies = [
1616
"sniffio",
1717
"tqdm > 4",
1818
"jiter>=0.4.0, <1",
19+
"sounddevice>=0.5.1",
20+
"numpy>=2.0.2",
1921
]
2022
requires-python = ">= 3.8"
2123
classifiers = [

‎requirements-dev.lock

+5-2
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ certifi==2023.7.22
3333
# via requests
3434
cffi==1.16.0
3535
# via cryptography
36+
# via sounddevice
3637
charset-normalizer==3.3.2
3738
# via requests
3839
click==8.1.7
@@ -92,7 +93,7 @@ nest-asyncio==1.6.0
9293
nodeenv==1.8.0
9394
# via pyright
9495
nox==2023.4.22
95-
numpy==1.26.3
96+
numpy==2.0.2
9697
# via openai
9798
# via pandas
9899
# via pandas-stubs
@@ -102,7 +103,7 @@ packaging==23.2
102103
# via black
103104
# via nox
104105
# via pytest
105-
pandas==2.1.4
106+
pandas==2.2.3
106107
# via openai
107108
pandas-stubs==2.1.4.231227
108109
# via openai
@@ -154,6 +155,8 @@ sniffio==1.3.0
154155
# via trio
155156
sortedcontainers==2.4.0
156157
# via trio
158+
sounddevice==0.5.1
159+
# via openai
157160
time-machine==2.9.0
158161
toml==0.10.2
159162
# via inline-snapshot

‎requirements.lock

+6
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@ anyio==4.1.0
1818
certifi==2023.7.22
1919
# via httpcore
2020
# via httpx
21+
cffi==1.17.1
22+
# via sounddevice
2123
distro==1.8.0
2224
# via openai
2325
exceptiongroup==1.2.2
@@ -41,6 +43,8 @@ pandas==2.2.3
4143
# via openai
4244
pandas-stubs==2.2.2.240807
4345
# via openai
46+
pycparser==2.22
47+
# via cffi
4448
pydantic==2.10.3
4549
# via openai
4650
pydantic-core==2.27.1
@@ -54,6 +58,8 @@ six==1.16.0
5458
sniffio==1.3.0
5559
# via anyio
5660
# via openai
61+
sounddevice==0.5.1
62+
# via openai
5763
tqdm==4.66.5
5864
# via openai
5965
types-pytz==2024.2.0.20241003

‎src/openai/helpers.py

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
from .helpers.microphone import Microphone
2+
from .helpers.local_audio_player import LocalAudioPlayer
3+
4+
__all__ = ["LocalAudioPlayer", "Microphone"]

‎src/openai/helpers/__init__.py

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
from .microphone import Microphone
2+
from .local_audio_player import LocalAudioPlayer
3+
4+
__all__ = ["Microphone", "LocalAudioPlayer"]
+162
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
1+
# mypy: ignore-errors
2+
import queue
3+
import asyncio
4+
from typing import Any, Union, Callable, AsyncGenerator, cast
5+
6+
import numpy as np
7+
import sounddevice as sd # type: ignore
8+
import numpy.typing as npt
9+
10+
from .. import _legacy_response
11+
from .._response import StreamedBinaryAPIResponse, AsyncStreamedBinaryAPIResponse
12+
13+
SAMPLE_RATE = 24000
14+
15+
16+
class LocalAudioPlayer:
    """Play audio through a ``sounddevice`` output stream at ``SAMPLE_RATE`` Hz.

    Audio is handed to the device as float32 frames shaped
    ``(frames, channels)``. int16 input is scaled into float32 by dividing
    by 32767.
    """

    def __init__(
        self,
        should_stop: Union[Callable[[], bool], None] = None,
    ):
        """Create a player.

        Args:
            should_stop: Optional zero-argument callable polled from the audio
                callback of ``play``; playback stops once it returns true.
        """
        self.channels = 1
        self.dtype = np.float32
        self.should_stop = should_stop

    def _as_frames(
        self,
        samples: Union[npt.NDArray[np.float32], npt.NDArray[np.int16]],
    ) -> npt.NDArray[Any]:
        """Return ``samples`` in the ``(frames, channels)`` layout the callbacks write."""
        if samples.dtype == np.int16 and self.dtype == np.float32:
            return (samples.astype(np.float32) / 32767.0).reshape(-1, self.channels)
        if samples.ndim == 1:
            # A flat array cannot be sliced into the 2-D output buffer, so give
            # it an explicit channel axis. Arrays that are already 2-D are left
            # untouched so callers keep control over their own layout.
            return samples.reshape(-1, self.channels)
        return samples

    async def _tts_response_to_buffer(
        self,
        response: Union[
            _legacy_response.HttpxBinaryResponseContent,
            AsyncStreamedBinaryAPIResponse,
            StreamedBinaryAPIResponse,
        ],
    ) -> npt.NDArray[np.float32]:
        """Read ``response`` fully and decode its bytes as int16 samples.

        Returns:
            A float32 array of shape ``(frames, 1)``.
        """
        chunks: list[bytes]
        if isinstance(
            response,
            (_legacy_response.HttpxBinaryResponseContent, StreamedBinaryAPIResponse),
        ):
            chunks = [chunk for chunk in response.iter_bytes(chunk_size=1024) if chunk]
        else:
            chunks = [chunk async for chunk in response.iter_bytes(chunk_size=1024) if chunk]

        audio_bytes = b"".join(chunks)
        # `np.frombuffer` rejects buffers whose size is not a multiple of the
        # item size; drop a dangling half-sample instead of failing.
        sample_width = np.dtype(np.int16).itemsize
        usable = len(audio_bytes) - (len(audio_bytes) % sample_width)
        audio_np = np.frombuffer(audio_bytes[:usable], dtype=np.int16).astype(np.float32) / 32767.0
        return audio_np.reshape(-1, 1)

    async def play(
        self,
        input: Union[
            npt.NDArray[np.int16],
            npt.NDArray[np.float32],
            _legacy_response.HttpxBinaryResponseContent,
            AsyncStreamedBinaryAPIResponse,
            StreamedBinaryAPIResponse,
        ],
    ) -> None:
        """Play ``input`` and return once the audio callback signals completion.

        Args:
            input: Either an int16/float32 ndarray (1-D arrays are treated as
                ``self.channels``-channel audio) or a binary speech response
                whose bytes are int16 samples.

        Raises:
            ValueError: If ``input`` is an ndarray with an unsupported dtype.
        """
        audio_content: npt.NDArray[np.float32]
        if isinstance(input, np.ndarray):
            convertible_int16 = input.dtype == np.int16 and self.dtype == np.float32
            if not convertible_int16 and input.dtype != np.float32:
                raise ValueError(f"Unsupported dtype: {input.dtype}")
            audio_content = cast(npt.NDArray[np.float32], self._as_frames(input))
        else:
            audio_content = await self._tts_response_to_buffer(input)

        loop = asyncio.get_running_loop()
        event = asyncio.Event()
        idx = 0

        def callback(
            outdata: npt.NDArray[np.float32],
            frame_count: int,
            _time_info: Any,
            _status: Any,
        ):
            nonlocal idx

            remainder = len(audio_content) - idx
            if remainder == 0 or (callable(self.should_stop) and self.should_stop()):
                # The buffer handed to the callback is uninitialised; silence it
                # so nothing stale is emitted while the stream winds down.
                outdata.fill(0)
                # The callback runs outside the event loop thread, hence the
                # thread-safe hand-off.
                loop.call_soon_threadsafe(event.set)
                raise sd.CallbackStop
            valid_frames = min(frame_count, remainder)
            outdata[:valid_frames] = audio_content[idx : idx + valid_frames]
            outdata[valid_frames:] = 0
            idx += valid_frames

        stream = sd.OutputStream(
            samplerate=SAMPLE_RATE,
            callback=callback,
            dtype=audio_content.dtype,
            channels=audio_content.shape[1],
        )
        with stream:
            await event.wait()

    async def play_stream(
        self,
        buffer_stream: AsyncGenerator[Union[npt.NDArray[np.float32], npt.NDArray[np.int16], None], None],
    ) -> None:
        """Play audio chunks as they are produced by ``buffer_stream``.

        Args:
            buffer_stream: Async generator of int16/float32 chunks. A ``None``
                item, or exhaustion of the generator, ends playback.

        Raises:
            Exception: Whatever ``buffer_stream`` raised, re-raised after
                playback has been shut down.
        """
        loop = asyncio.get_running_loop()
        event = asyncio.Event()
        buffer_queue: queue.Queue[Union[npt.NDArray[np.float32], npt.NDArray[np.int16], None]] = queue.Queue(maxsize=50)

        async def buffer_producer():
            try:
                async for buffer in buffer_stream:
                    if buffer is None:
                        break
                    # `Queue.put` blocks when the queue is full, so run it off
                    # the event loop thread.
                    await loop.run_in_executor(None, buffer_queue.put, buffer)
            finally:
                # Always signal completion, even when the generator raised;
                # otherwise the callback would never set `event`.
                await loop.run_in_executor(None, buffer_queue.put, None)

        def callback(
            outdata: npt.NDArray[np.float32],
            frame_count: int,
            _time_info: Any,
            _status: Any,
        ):
            nonlocal current_buffer, buffer_pos

            frames_written = 0
            while frames_written < frame_count:
                if current_buffer is None or buffer_pos >= len(current_buffer):
                    try:
                        next_buffer = buffer_queue.get(timeout=0.1)
                    except queue.Empty:
                        # Nothing available yet: pad with silence and try
                        # again on the next callback.
                        outdata[frames_written:] = 0
                        return

                    if next_buffer is None:
                        current_buffer = None
                        # Silence the unfilled tail before stopping.
                        outdata[frames_written:] = 0
                        loop.call_soon_threadsafe(event.set)
                        raise sd.CallbackStop

                    current_buffer = self._as_frames(next_buffer)
                    buffer_pos = 0

                remaining_frames = len(current_buffer) - buffer_pos
                frames_to_write = min(frame_count - frames_written, remaining_frames)
                outdata[frames_written : frames_written + frames_to_write] = current_buffer[
                    buffer_pos : buffer_pos + frames_to_write
                ]
                buffer_pos += frames_to_write
                frames_written += frames_to_write

        current_buffer = None
        buffer_pos = 0

        producer_task = asyncio.create_task(buffer_producer())

        with sd.OutputStream(
            samplerate=SAMPLE_RATE,
            channels=self.channels,
            dtype=self.dtype,
            callback=callback,
        ):
            await event.wait()

        await producer_task

‎src/openai/helpers/microphone.py

+98
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
# mypy: ignore-errors
2+
import io
3+
import time
4+
import wave
5+
import asyncio
6+
from typing import Any, Type, Union, Generic, TypeVar, Callable, overload
7+
from typing_extensions import Literal
8+
9+
import numpy as np
10+
import sounddevice as sd # type: ignore
11+
import numpy.typing as npt
12+
13+
from openai._types import FileTypes, FileContent
14+
15+
SAMPLE_RATE = 24000
16+
17+
DType = TypeVar("DType", bound=np.generic)
18+
19+
20+
class Microphone(Generic[DType]):
    """Record audio from a ``sounddevice`` input stream at ``SAMPLE_RATE`` Hz."""

    def __init__(
        self,
        channels: int = 1,
        dtype: Type[DType] = np.int16,
        should_record: Union[Callable[[], bool], None] = None,
        timeout: Union[float, None] = None,
    ):
        """Configure a recorder.

        Args:
            channels: Number of input channels to capture.
            dtype: Sample dtype requested from the input stream.
            should_record: Optional zero-argument callable polled from the
                audio callback; recording stops once it returns false.
            timeout: Optional maximum recording duration in seconds.
        """
        self.channels = channels
        self.dtype = dtype
        self.should_record = should_record
        self.buffer_chunks = []
        self.timeout = timeout
        self.has_record_function = callable(should_record)

    def _ndarray_to_wav(self, audio_data: npt.NDArray[DType]) -> FileTypes:
        """Encode ``audio_data`` as an in-memory WAV upload tuple.

        Returns:
            ``("audio.wav", buffer, "audio/wav")`` with ``buffer`` rewound to
            the start.
        """
        pcm = np.asarray(audio_data)
        if np.issubdtype(pcm.dtype, np.floating):
            # The `wave` module only writes integer PCM. Writing float samples
            # verbatim would label IEEE floats as PCM, so convert them to
            # 16-bit PCM first.
            pcm = (np.clip(pcm, -1.0, 1.0) * 32767.0).astype(np.int16)

        buffer: FileContent = io.BytesIO()
        with wave.open(buffer, "wb") as wav_file:
            wav_file.setnchannels(self.channels)
            wav_file.setsampwidth(pcm.dtype.itemsize)
            wav_file.setframerate(SAMPLE_RATE)
            wav_file.writeframes(pcm.tobytes())
        buffer.seek(0)
        return ("audio.wav", buffer, "audio/wav")

    @overload
    async def record(self, return_ndarray: Literal[True]) -> npt.NDArray[DType]: ...

    @overload
    async def record(self, return_ndarray: Literal[False]) -> FileTypes: ...

    @overload
    async def record(self, return_ndarray: None = ...) -> FileTypes: ...

    async def record(self, return_ndarray: Union[bool, None] = False) -> Union[npt.NDArray[DType], FileTypes]:
        """Record until ``timeout`` elapses or ``should_record`` returns false.

        Args:
            return_ndarray: When true, return the captured samples as an
                ndarray; otherwise return a WAV upload tuple.

        Returns:
            The concatenated recording, shaped ``(frames, channels)`` when
            returned as an ndarray (``(0, channels)`` if nothing was captured).
        """
        loop = asyncio.get_running_loop()
        event = asyncio.Event()
        self.buffer_chunks: list[npt.NDArray[DType]] = []
        start_time = time.perf_counter()

        def callback(
            indata: npt.NDArray[DType],
            _frame_count: int,
            _time_info: Any,
            _status: Any,
        ):
            timed_out = self.timeout is not None and (time.perf_counter() - start_time) > self.timeout
            # `should_record` is only consulted when the timeout has not fired.
            if timed_out or (callable(self.should_record) and not self.should_record()):
                # The callback runs outside the event loop thread, hence the
                # thread-safe hand-off.
                loop.call_soon_threadsafe(event.set)
                raise sd.CallbackStop

            # Copy because the chunk is kept after the callback returns.
            self.buffer_chunks.append(indata.copy())

        stream = sd.InputStream(
            callback=callback,
            dtype=self.dtype,
            samplerate=SAMPLE_RATE,
            channels=self.channels,
        )
        with stream:
            await event.wait()

        # Concatenate all chunks into a single buffer; keep the
        # (frames, channels) layout even when nothing was captured.
        concatenated_chunks: npt.NDArray[DType] = (
            np.concatenate(self.buffer_chunks, axis=0)
            if self.buffer_chunks
            else np.empty((0, self.channels), dtype=self.dtype)
        )

        if return_ndarray:
            return concatenated_chunks
        return self._ndarray_to_wav(concatenated_chunks)

0 commit comments

Comments
 (0)
Please sign in to comment.