disler · timothyjoh · Oct 6, 2024 · Oct 6, 2024 · Oct 8, 2024 · Oct 8, 2024
diff --git a/.env.sample b/.env.sample
@@ -1,3 +1,5 @@
 OPENAI_API_KEY=
 PERSONALIZATION_FILE=./personalization.json
+BROWSER_CUSTOMIZATION_FILE=./browser.json
 SCRATCH_PAD_DIR=./scratchpad
+RUN_TIME_TABLE_LOG_JSON=runtime_time_table.jsonl
diff --git a/README.md b/README.md
@@ -1,7 +1,9 @@
 # POC Python Realtime API o1 assistant
 > This is a proof of concept for using the OpenAI's [Realtime API](https://openai.com/index/introducing-the-realtime-api/) to chain tools, call o1-preview & o1-mini, [structure output](https://openai.com/index/introducing-structured-outputs-in-the-api/) responses, and glimpse into the future of **AI assistant powered engineering**.
 >
-> See video where we [use this POC](https://youtu.be/vN0t-kcPOXo)
+> See video where we [use and discuss this POC](https://youtu.be/vN0t-kcPOXo)
+>
+> This codebase is a v0 proof of concept (POC). It's buggy, but it contains the core ideas for realtime personal AI assistants & AI agents.
 
 <img src="./images/ada-is-back.png" alt="realtime-assistant" style="max-width: 800px;">
 
@@ -63,6 +65,7 @@ The codebase includes various utility functions for tasks such as structured out
 ## Improvements
 > Up for a challenge? Here are some ideas on how to improve the experience:
 
+- Organize code.
 - Add interruption handling. Current version prevents it for simplicity.
 - Add transcript logging.
 - Make personalization.json a pydantic type.

diff --git a/browser.json b/browser.json
@@ -0,0 +1,17 @@
+{
+  "browser_urls": [
+    "https://aider.chat",
+    "https://simonwillison.net",
+    "https://hackernews.com",
+    "https://chat.openai.com",
+    "https://notebooklm.google.com",
+    "https://google.com",
+    "https://youtube.com",
+    "https://twitter.com",
+    "https://claude.ai/chat",
+    "https://changelog.cursor.com",
+    "https://gemini.google.com/u/1/",
+    "https://openai.com/index/introducing-the-realtime-api/"
+  ],
+  "browser": "open -a /Applications/Google\\ Chrome.app %s"
+}
diff --git a/personalization.json b/personalization.json
@@ -1,19 +1,4 @@
 {
-  "browser_urls": [
-    "https://aider.chat",
-    "https://simonwillison.net",
-    "https://hackernews.com",
-    "https://chat.openai.com",
-    "https://notebooklm.google.com",
-    "https://google.com",
-    "https://youtube.com",
-    "https://twitter.com",
-    "https://claude.ai/chat",
-    "https://changelog.cursor.com",
-    "https://gemini.google.com/u/1/",
-    "https://openai.com/index/introducing-the-realtime-api/"
-  ],
-  "browser": "chrome",
   "ai_assistant_name": "Ada",
   "human_name": "Dan"
-}
+}
diff --git a/src/realtime_api_async_python/audio/__init__.py b/src/realtime_api_async_python/audio/__init__.py
diff --git a/src/realtime_api_async_python/audio/bidirectional_audio.py b/src/realtime_api_async_python/audio/bidirectional_audio.py
@@ -0,0 +1,118 @@
+import pyaudio
+import queue
+import asyncio
+from ..utils.logging import logging
+
+# Audio recording parameters
+CHUNK = 1024
+FORMAT = pyaudio.paInt16
+CHANNELS = 1
+RATE = 24000
+
+class BidirectionalAudio:
+    """Capture microphone audio into a queue, pausing while a response arrives.
+
+    A PyAudio input stream hands each captured chunk to `callback`, which
+    queues it only while `is_recording` is set and `is_receiving` is not.
+    """
+
+    def __init__(self):
+        """Initialise the capture state, then open the callback-driven stream."""
+        # Everything `callback` touches is created BEFORE the stream is opened:
+        # the callback may fire as soon as the stream exists, and would raise
+        # AttributeError if these attributes were assigned afterwards.
+        self.queue = queue.Queue()
+        self.is_recording = False
+        self.is_receiving = False
+        self.p = pyaudio.PyAudio()
+        self.stream = self.p.open(
+            format=FORMAT,
+            channels=CHANNELS,
+            rate=RATE,
+            input=True,
+            frames_per_buffer=CHUNK,
+            stream_callback=self.callback,
+        )
+        logging.info("AsyncMicrophone initialized")
+
+    def callback(self, in_data, frame_count, time_info, status):
+        """Stream callback: queue `in_data` while capture is active.
+
+        Always returns `(None, pyaudio.paContinue)`.
+        """
+        if self.is_recording and not self.is_receiving:
+            self.queue.put(in_data)
+        return (None, pyaudio.paContinue)
+
+    def start_recording(self):
+        """Allow captured chunks to be queued."""
+        self.is_recording = True
+        logging.info("Started recording")
+
+    def stop_recording(self):
+        """Stop queuing captured chunks."""
+        self.is_recording = False
+        logging.info("Stopped recording")
+
+    def start_receiving(self):
+        """Mark a response as in progress and switch recording off."""
+        self.is_receiving = True
+        self.is_recording = False
+        logging.info("Started receiving assistant response")
+
+    def stop_receiving(self):
+        """Clear the receiving flag; recording is not switched back on here."""
+        self.is_receiving = False
+        logging.info("Stopped receiving assistant response")
+
+    def get_audio_data(self):
+        """Drain the queue and return its chunks joined, or None if there are none."""
+        chunks = []
+        try:
+            # get_nowait() never blocks, unlike the empty()-then-get() pairing.
+            while True:
+                chunks.append(self.queue.get_nowait())
+        except queue.Empty:
+            pass
+        # join() builds the result in one pass instead of repeated bytes `+=`.
+        return b"".join(chunks) or None
+
+    def close(self):
+        """Stop and close the input stream and terminate the PyAudio instance."""
+        self.stream.stop_stream()
+        self.stream.close()
+        self.p.terminate()
+        logging.info("AsyncMicrophone closed")
+
+async def play_audio(audio_data):
+    """Play `audio_data` on a freshly opened output stream, then release it.
+
+    The audio is followed by 200 ms of silence and a 0.5 s pause. The stream
+    and the PyAudio instance are released even when playback raises or the
+    coroutine is cancelled.
+
+    NOTE(review): `stream.write` runs directly in the coroutine, so it
+    presumably blocks the event loop during playback -- confirm this is intended.
+    """
+    p = pyaudio.PyAudio()
+    try:
+        stream = p.open(format=FORMAT, channels=CHANNELS, rate=RATE, output=True)
+        try:
+            stream.write(audio_data)
+
+            # Trailing silence prevents popping and abruptly cut-off sounds.
+            silence_duration = 0.2  # 200ms
+            silence_frames = int(RATE * silence_duration)
+            # 2 bytes per sample for 16-bit audio
+            silence = b"\x00" * (silence_frames * CHANNELS * 2)
+            stream.write(silence)
+
+            # Brief pause before closing so the audio is fully played.
+            await asyncio.sleep(0.5)
+        finally:
+            stream.stop_stream()
+            stream.close()
+    finally:
+        p.terminate()
+    logging.debug("Audio playback completed")
+