feat: add audio helpers

kwhinnery-openai · web-flow · commit 423655ca9077 · 2025-03-20T16:31:58.000Z
* add audio helpers

* update ignore, lockfile, add execute

* fix examples, lint errors

* lint and export errors

* temp: ignore type errors
diff --git a/.gitignore b/.gitignore
@@ -14,3 +14,7 @@ dist
 .envrc
 codegen.log
 Brewfile.lock.json
+
+.DS_Store
+
+examples/*.mp3
diff --git a/examples/audio.py b/examples/audio.py
@@ -1,6 +1,5 @@
 #!/usr/bin/env rye run python
 
-import time
 from pathlib import Path
 
 from openai import OpenAI
@@ -12,8 +11,6 @@
 
 
 def main() -> None:
-    stream_to_speakers()
-
     # Create text-to-speech audio file
     with openai.audio.speech.with_streaming_response.create(
         model="tts-1",
@@ -37,28 +34,5 @@ def main() -> None:
     print(translation.text)
 
 
-def stream_to_speakers() -> None:
-    import pyaudio
-
-    player_stream = pyaudio.PyAudio().open(format=pyaudio.paInt16, channels=1, rate=24000, output=True)
-
-    start_time = time.time()
-
-    with openai.audio.speech.with_streaming_response.create(
-        model="tts-1",
-        voice="alloy",
-        response_format="pcm",  # similar to WAV, but without a header chunk at the start.
-        input="""I see skies of blue and clouds of white
-                The bright blessed days, the dark sacred nights
-                And I think to myself
-                What a wonderful world""",
-    ) as response:
-        print(f"Time to first byte: {int((time.time() - start_time) * 1000)}ms")
-        for chunk in response.iter_bytes(chunk_size=1024):
-            player_stream.write(chunk)
-
-    print(f"Done in {int((time.time() - start_time) * 1000)}ms.")
-
-
 if __name__ == "__main__":
     main()
diff --git a/examples/speech_to_text.py b/examples/speech_to_text.py
@@ -0,0 +1,25 @@
+#!/usr/bin/env rye run python
+
+import asyncio
+
+from openai import AsyncOpenAI
+from openai.helpers import Microphone
+
+# gets OPENAI_API_KEY from your environment variables
+openai = AsyncOpenAI()
+
+
+async def main() -> None:
+    """Record from the microphone for ten seconds and print its transcription."""
+    print("Recording for the next 10 seconds...")
+    mic = Microphone(timeout=10)
+    audio_file = await mic.record()
+    print("Recording complete")
+
+    # `record()` returns a WAV upload tuple by default, which is passed straight through as `file`.
+    result = await openai.audio.transcriptions.create(file=audio_file, model="whisper-1")
+    print(result.text)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/examples/text_to_speech.py b/examples/text_to_speech.py
@@ -0,0 +1,31 @@
+#!/usr/bin/env rye run python
+
+import time
+import asyncio
+
+from openai import AsyncOpenAI
+from openai.helpers import LocalAudioPlayer
+
+# gets OPENAI_API_KEY from your environment variables
+openai = AsyncOpenAI()
+
+
+async def main() -> None:
+    """Request speech as raw PCM, play it locally, and print two elapsed-time readings."""
+    start_time = time.time()
+
+    lyrics = """I see skies of blue and clouds of white
+                The bright blessed days, the dark sacred nights
+                And I think to myself
+                What a wonderful world"""
+    speech_request = openai.audio.speech.with_streaming_response.create(
+        input=lyrics,
+        model="tts-1",
+        voice="alloy",
+        response_format="pcm",  # similar to WAV, but without a header chunk at the start.
+    )
+
+    async with speech_request as response:
+        # First reading: taken as soon as the streaming response object is available.
+        print(f"Time to first byte: {int((time.time() - start_time) * 1000)}ms")
+        await LocalAudioPlayer().play(response)
+        # Second reading: taken after `play` has returned.
+        print(f"Time to play: {int((time.time() - start_time) * 1000)}ms")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/pyproject.toml b/pyproject.toml
@@ -16,6 +16,8 @@ dependencies = [
     "sniffio",
     "tqdm > 4",
     "jiter>=0.4.0, <1",
+    "sounddevice>=0.5.1",  # NOTE(review): listed as a core dependency, so every install pulls in the audio stack - consider an optional extra
+    "numpy>=2.0.2",  # NOTE(review): numpy 2.x appears to require Python >= 3.9 while requires-python below is ">= 3.8" - confirm
 ]
 requires-python = ">= 3.8"
 classifiers = [
diff --git a/requirements-dev.lock b/requirements-dev.lock
@@ -33,6 +33,7 @@ certifi==2023.7.22
     # via requests
 cffi==1.16.0
     # via cryptography
+    # via sounddevice
 charset-normalizer==3.3.2
     # via requests
 click==8.1.7
@@ -92,7 +93,7 @@ nest-asyncio==1.6.0
 nodeenv==1.8.0
     # via pyright
 nox==2023.4.22
-numpy==1.26.3
+numpy==2.0.2
     # via openai
     # via pandas
     # via pandas-stubs
@@ -102,7 +103,7 @@ packaging==23.2
     # via black
     # via nox
     # via pytest
-pandas==2.1.4
+pandas==2.2.3
     # via openai
 pandas-stubs==2.1.4.231227
     # via openai
@@ -154,6 +155,8 @@ sniffio==1.3.0
     # via trio
 sortedcontainers==2.4.0
     # via trio
+sounddevice==0.5.1
+    # via openai
 time-machine==2.9.0
 toml==0.10.2
     # via inline-snapshot
diff --git a/requirements.lock b/requirements.lock
@@ -18,6 +18,8 @@ anyio==4.1.0
 certifi==2023.7.22
     # via httpcore
     # via httpx
+cffi==1.17.1
+    # via sounddevice
 distro==1.8.0
     # via openai
 exceptiongroup==1.2.2
@@ -41,6 +43,8 @@ pandas==2.2.3
     # via openai
 pandas-stubs==2.2.2.240807
     # via openai
+pycparser==2.22
+    # via cffi
 pydantic==2.10.3
     # via openai
 pydantic-core==2.27.1
@@ -54,6 +58,8 @@ six==1.16.0
 sniffio==1.3.0
     # via anyio
     # via openai
+sounddevice==0.5.1
+    # via openai
 tqdm==4.66.5
     # via openai
 types-pytz==2024.2.0.20241003
diff --git a/src/openai/helpers.py b/src/openai/helpers.py
@@ -0,0 +1,4 @@
+# NOTE(review): this commit also adds the package `src/openai/helpers/__init__.py`
+# exporting the same two names. A package directory is normally resolved ahead of
+# a sibling `helpers.py`, so this module looks unreachable - confirm, and consider
+# dropping this file in favour of the package.
+from .helpers.microphone import Microphone
+from .helpers.local_audio_player import LocalAudioPlayer
+
+__all__ = ["LocalAudioPlayer", "Microphone"]
diff --git a/src/openai/helpers/__init__.py b/src/openai/helpers/__init__.py
@@ -0,0 +1,4 @@
+from .microphone import Microphone
+from .local_audio_player import LocalAudioPlayer
+
+__all__ = ["Microphone", "LocalAudioPlayer"]
diff --git a/src/openai/helpers/local_audio_player.py b/src/openai/helpers/local_audio_player.py
@@ -0,0 +1,162 @@
+# mypy: ignore-errors
+import queue
+import asyncio
+from typing import Any, Union, Callable, AsyncGenerator, cast
+
+import numpy as np
+import sounddevice as sd  # type: ignore
+import numpy.typing as npt
+
+from .. import _legacy_response
+from .._response import StreamedBinaryAPIResponse, AsyncStreamedBinaryAPIResponse
+
+SAMPLE_RATE = 24000
+
+
+class LocalAudioPlayer:
+    """Play audio through a ``sounddevice.OutputStream`` at ``SAMPLE_RATE``.
+
+    The player is configured for one channel of float32 samples; int16 input is
+    rescaled to float32 before it is written to the stream.
+
+    Args:
+        should_stop: Optional zero-argument callable. :meth:`play` polls it from
+            the audio callback and ends playback once it returns ``True``.
+    """
+
+    def __init__(
+        self,
+        should_stop: Union[Callable[[], bool], None] = None,
+    ):
+        self.channels = 1
+        self.dtype = np.float32
+        self.should_stop = should_stop
+
+    async def _tts_response_to_buffer(
+        self,
+        response: Union[
+            _legacy_response.HttpxBinaryResponseContent,
+            AsyncStreamedBinaryAPIResponse,
+            StreamedBinaryAPIResponse,
+        ],
+    ) -> npt.NDArray[np.float32]:
+        """Read a whole binary response and decode it as 16-bit PCM.
+
+        Returns:
+            A float32 array of shape ``(frames, 1)``: the int16 samples divided
+            by 32767.
+        """
+        chunks: list[bytes] = []
+        if isinstance(response, (_legacy_response.HttpxBinaryResponseContent, StreamedBinaryAPIResponse)):
+            # Synchronous response types expose a plain iterator.
+            for chunk in response.iter_bytes(chunk_size=1024):
+                if chunk:
+                    chunks.append(chunk)
+        else:
+            async for chunk in response.iter_bytes(chunk_size=1024):
+                if chunk:
+                    chunks.append(chunk)
+
+        audio_bytes = b"".join(chunks)
+        audio_np = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) / 32767.0
+        audio_np = audio_np.reshape(-1, 1)
+        return audio_np
+
+    async def play(
+        self,
+        input: Union[
+            npt.NDArray[np.int16],
+            npt.NDArray[np.float32],
+            _legacy_response.HttpxBinaryResponseContent,
+            AsyncStreamedBinaryAPIResponse,
+            StreamedBinaryAPIResponse,
+        ],
+    ) -> None:
+        """Play a complete clip and return once playback has finished or been stopped.
+
+        Args:
+            input: Either an int16/float32 sample array or a binary speech
+                response, which is first read in full and decoded as 16-bit PCM.
+
+        Raises:
+            ValueError: If ``input`` is an array whose dtype is neither int16
+                nor float32.
+        """
+        audio_content: npt.NDArray[np.float32]
+        if isinstance(input, np.ndarray):
+            if input.dtype == np.int16 and self.dtype == np.float32:
+                audio_content = (input.astype(np.float32) / 32767.0).reshape(-1, self.channels)
+            elif input.dtype == np.float32:
+                audio_content = cast(npt.NDArray[np.float32], input)
+                if audio_content.ndim == 1:
+                    # A flat sample vector has no channel axis; without one the
+                    # `shape[1]` lookup below fails. Arrays that already have a
+                    # channel axis are used as given.
+                    audio_content = audio_content.reshape(-1, self.channels)
+            else:
+                raise ValueError(f"Unsupported dtype: {input.dtype}")
+        else:
+            audio_content = await self._tts_response_to_buffer(input)
+
+        loop = asyncio.get_running_loop()
+        event = asyncio.Event()
+        idx = 0
+
+        def callback(
+            outdata: npt.NDArray[np.float32],
+            frame_count: int,
+            _time_info: Any,
+            _status: Any,
+        ) -> None:
+            nonlocal idx
+
+            remainder = len(audio_content) - idx
+            if remainder == 0 or (callable(self.should_stop) and self.should_stop()):
+                # The output buffer has to be written in full even when the
+                # callback stops the stream, so hand back silence.
+                outdata[:] = 0
+                # The callback runs off the event loop thread; wake `play` safely.
+                loop.call_soon_threadsafe(event.set)
+                raise sd.CallbackStop
+            valid_frames = min(frame_count, remainder)
+            outdata[:valid_frames] = audio_content[idx : idx + valid_frames]
+            outdata[valid_frames:] = 0
+            idx += valid_frames
+
+        stream = sd.OutputStream(
+            samplerate=SAMPLE_RATE,
+            callback=callback,
+            dtype=audio_content.dtype,
+            channels=audio_content.shape[1],
+        )
+        with stream:
+            await event.wait()
+
+    async def play_stream(
+        self,
+        buffer_stream: AsyncGenerator[Union[npt.NDArray[np.float32], npt.NDArray[np.int16], None], None],
+    ) -> None:
+        """Play audio chunks as they are produced by an async generator.
+
+        Chunks are handed to the audio callback through a bounded queue. A
+        ``None`` item, or exhaustion of the generator, ends playback. An
+        exception raised by the generator also ends playback and is then
+        re-raised from this coroutine.
+
+        Args:
+            buffer_stream: Async generator yielding int16 or float32 sample
+                arrays, optionally terminated by ``None``.
+        """
+        loop = asyncio.get_running_loop()
+        event = asyncio.Event()
+        buffer_queue: queue.Queue[Union[npt.NDArray[np.float32], npt.NDArray[np.int16], None]] = queue.Queue(maxsize=50)
+
+        async def buffer_producer():
+            try:
+                async for buffer in buffer_stream:
+                    if buffer is None:
+                        break
+                    # `put` blocks while the queue is full, so run it off the event loop.
+                    await loop.run_in_executor(None, buffer_queue.put, buffer)
+            finally:
+                # Always signal completion, even if the generator raised;
+                # otherwise the callback never stops and `event.wait()` hangs.
+                await loop.run_in_executor(None, buffer_queue.put, None)
+
+        current_buffer = None
+        buffer_pos = 0
+
+        def callback(
+            outdata: npt.NDArray[np.float32],
+            frame_count: int,
+            _time_info: Any,
+            _status: Any,
+        ) -> None:
+            nonlocal current_buffer, buffer_pos
+
+            frames_written = 0
+            while frames_written < frame_count:
+                if current_buffer is None or buffer_pos >= len(current_buffer):
+                    try:
+                        current_buffer = buffer_queue.get(timeout=0.1)
+                    except queue.Empty:
+                        # Nothing queued yet: pad with silence and retry on the next callback.
+                        outdata[frames_written:] = 0
+                        return
+
+                    if current_buffer is None:
+                        # End-of-stream sentinel. Fill the rest of the output
+                        # buffer with silence before stopping the stream.
+                        outdata[frames_written:] = 0
+                        loop.call_soon_threadsafe(event.set)
+                        raise sd.CallbackStop
+                    buffer_pos = 0
+
+                    if current_buffer.dtype == np.int16 and self.dtype == np.float32:
+                        current_buffer = (current_buffer.astype(np.float32) / 32767.0).reshape(-1, self.channels)
+                    elif current_buffer.ndim == 1:
+                        # Give flat chunks a channel axis so they can be
+                        # assigned into the (frames, channels) output slice.
+                        current_buffer = current_buffer.reshape(-1, self.channels)
+
+                remaining_frames = len(current_buffer) - buffer_pos
+                frames_to_write = min(frame_count - frames_written, remaining_frames)
+                outdata[frames_written : frames_written + frames_to_write] = current_buffer[
+                    buffer_pos : buffer_pos + frames_to_write
+                ]
+                buffer_pos += frames_to_write
+                frames_written += frames_to_write
+
+        producer_task = asyncio.create_task(buffer_producer())
+
+        with sd.OutputStream(
+            samplerate=SAMPLE_RATE,
+            channels=self.channels,
+            dtype=self.dtype,
+            callback=callback,
+        ):
+            await event.wait()
+
+        # Surfaces any exception the producer hit while reading `buffer_stream`.
+        await producer_task
diff --git a/src/openai/helpers/microphone.py b/src/openai/helpers/microphone.py

Original file line number	Diff line number	Diff line change
`@@ -16,6 +16,8 @@ dependencies = [`
`16`	`16`	`"sniffio",`
`17`	`17`	`"tqdm > 4",`
`18`	`18`	`"jiter>=0.4.0, <1",`
	`19`	`+ "sounddevice>=0.5.1",`
	`20`	`+ "numpy>=2.0.2",`
`19`	`21`	`]`
`20`	`22`	`requires-python = ">= 3.8"`
`21`	`23`	`classifiers = [`
-Original file line number
+Diff line change
 +# mypy: ignore-errors
 +import io
 +import time
 +import wave
 +import asyncio
 +from typing import Any, Type, Union, Generic, TypeVar, Callable, overload
 +from typing_extensions import Literal
++
 +import numpy as np
 +import sounddevice as sd  # type: ignore
 +import numpy.typing as npt
++
 +from openai._types import FileTypes, FileContent
++
 +SAMPLE_RATE = 24000
++
 +DType = TypeVar("DType", bound=np.generic)
++
++
 +class Microphone(Generic[DType]):
 +    """Record audio from a ``sounddevice.InputStream`` at ``SAMPLE_RATE``.
 +
 +    Args:
 +        channels: Number of input channels to open.
 +        dtype: NumPy sample type requested from the stream.
 +        should_record: Optional zero-argument callable polled on every audio
 +            callback; recording stops the first time it returns a falsy value.
 +        timeout: Optional limit in seconds, checked on every audio callback.
 +    """
++
 +    def __init__(
 +        self,
 +        channels: int = 1,
 +        dtype: Type[DType] = np.int16,
 +        should_record: Union[Callable[[], bool], None] = None,
 +        timeout: Union[float, None] = None,
 +    ):
 +        self.channels = channels
 +        self.dtype = dtype
 +        self.should_record = should_record
 +        # Reset at the start of every `record()` call.
 +        self.buffer_chunks = []
 +        self.timeout = timeout
 +        # Not read anywhere else in this class as shown.
 +        self.has_record_function = callable(should_record)
++
 +    def _ndarray_to_wav(self, audio_data: npt.NDArray[DType]) -> FileTypes:
 +        """Wrap raw samples in an in-memory WAV file.
 +
 +        Returns:
 +            The tuple ``("audio.wav", buffer, "audio/wav")`` with the buffer
 +            rewound to the start.
 +        """
 +        buffer: FileContent = io.BytesIO()
 +        with wave.open(buffer, "w") as wav_file:
 +            wav_file.setnchannels(self.channels)
 +            # Sample width in bytes is taken from the dtype's item size.
 +            # NOTE(review): a float dtype would presumably be written as if it
 +            # were integer PCM of the same width - confirm before using non-int dtypes.
 +            wav_file.setsampwidth(np.dtype(self.dtype).itemsize)
 +            wav_file.setframerate(SAMPLE_RATE)
 +            wav_file.writeframes(audio_data.tobytes())
 +        buffer.seek(0)
 +        return ("audio.wav", buffer, "audio/wav")
++
 +    @overload
 +    async def record(self, return_ndarray: Literal[True]) -> npt.NDArray[DType]: ...
++
 +    @overload
 +    async def record(self, return_ndarray: Literal[False]) -> FileTypes: ...
++
 +    @overload
 +    async def record(self, return_ndarray: None = ...) -> FileTypes: ...
++
 +    async def record(self, return_ndarray: Union[bool, None] = False) -> Union[npt.NDArray[DType], FileTypes]:
 +        """Record until the timeout elapses or ``should_record`` returns falsy.
 +
 +        If neither ``timeout`` nor ``should_record`` was supplied, nothing in
 +        this method sets the completion event, so the call waits indefinitely.
 +
 +        Args:
 +            return_ndarray: When truthy, return the concatenated sample array;
 +                otherwise return the WAV tuple built by ``_ndarray_to_wav``.
 +        """
 +        loop = asyncio.get_event_loop()
 +        event = asyncio.Event()
 +        self.buffer_chunks: list[npt.NDArray[DType]] = []
 +        start_time = time.perf_counter()
++
 +        def callback(
 +            indata: npt.NDArray[DType],
 +            _frame_count: int,
 +            _time_info: Any,
 +            _status: Any,
 +        ):
 +            # The timeout is only evaluated when a callback fires, so the
 +            # recording can overrun it by up to one callback interval.
 +            execution_time = time.perf_counter() - start_time
 +            reached_recording_timeout = execution_time > self.timeout if self.timeout is not None else False
 +            if reached_recording_timeout:
 +                # Wake the awaiting coroutine via the loop's thread-safe hook.
 +                loop.call_soon_threadsafe(event.set)
 +                raise sd.CallbackStop
++
 +            should_be_recording = self.should_record() if callable(self.should_record) else True
 +            if not should_be_recording:
 +                loop.call_soon_threadsafe(event.set)
 +                raise sd.CallbackStop
++
 +            # Copy the block before storing it rather than keeping a reference to `indata`.
 +            self.buffer_chunks.append(indata.copy())
++
 +        stream = sd.InputStream(
 +            callback=callback,
 +            dtype=self.dtype,
 +            samplerate=SAMPLE_RATE,
 +            channels=self.channels,
 +        )
 +        with stream:
 +            await event.wait()
++
 +        # Concatenate all chunks into a single buffer, handle empty case
 +        # (the empty case yields a 1-D zero-length array of `self.dtype`).
 +        concatenated_chunks: npt.NDArray[DType] = (
 +            np.concatenate(self.buffer_chunks, axis=0)
 +            if len(self.buffer_chunks) > 0
 +            else np.array([], dtype=self.dtype)
 +        )
++
 +        if return_ndarray:
 +            return concatenated_chunks
 +        else:
 +            return self._ndarray_to_wav(concatenated_chunks)