
Commit fbb68b4

committed: adding some random example tests from a while back.
1 parent 8b15265 commit fbb68b4

File tree

tests/gpt4omini_tts.py
tests/realtime_openai_voiceE_chat.py
tests/test_tars.py

3 files changed: 259 additions, 0 deletions

tests/gpt4omini_tts.py

Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@
import openai
import whisper
import pyttsx3

# Initialize Whisper model
model = whisper.load_model(
    "base"
)  # You can choose 'base', 'small', 'medium', or 'large'


# Function for Speech-to-Text using Whisper
def speech_to_text(audio_path):
    result = model.transcribe(audio_path)
    return result["text"]


# Initialize TTS engine (pyttsx3)
engine = pyttsx3.init()


# Function to speak text
def text_to_speech(text):
    engine.say(text)
    engine.runAndWait()


# Example usage
if __name__ == "__main__":
    # Convert speech to text from an audio file
    audio_file = "your_audio_file.wav"  # Path to the audio file
    print("Converting speech to text...")
    text = speech_to_text(audio_file)
    print("Recognized Text:", text)

    # Use GPT-4o mini to generate a response (for now, just a placeholder example)
    # Ideally, replace this with a GPT-4o-mini call for a response based on 'text'
    gpt_response = "This is the response from GPT-4o mini."

    # Convert the GPT response to speech
    print("Speaking the response...")
    text_to_speech(gpt_response)
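
The GPT response above is a hard-coded placeholder. If the script should actually answer the transcribed text, a minimal sketch of the chat-completion call might look like the following, assuming the openai>=1.x Python client and an OPENAI_API_KEY environment variable (the helper name get_gpt_reply is hypothetical):

# Sketch only: swap the placeholder gpt_response for a real chat completion.
from openai import OpenAI

client = OpenAI()

def get_gpt_reply(user_text: str) -> str:
    # Send the transcribed text as a single user message and return the reply text.
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": user_text}],
    )
    return completion.choices[0].message.content

# In the __main__ block above, this would replace the placeholder:
# gpt_response = get_gpt_reply(text)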

tests/realtime_openai_voiceE_chat.py

Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@
import websocket
import pyaudio
import numpy as np
import json
import threading
import os

import asyncio
from openai import AsyncOpenAI


async def main():
    client = AsyncOpenAI()

    async with client.beta.realtime.connect(
        model="gpt-4o-mini-realtime-preview"
    ) as connection:
        await connection.session.update(session={"modalities": ["text"]})

        await connection.conversation.item.create(
            item={
                "type": "message",
                "role": "user",
                "content": [{"type": "input_text", "text": "Say hello!"}],
            }
        )
        await connection.response.create()

        async for event in connection:
            if event.type == "response.text.delta":
                print(event.delta, flush=True, end="")

            elif event.type == "response.text.done":
                print()

            elif event.type == "response.done":
                break


asyncio.run(main())
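
The script prints the streamed text as it arrives and needs OPENAI_API_KEY set to run. For tests it can be handier to collect the deltas into a single string; a sketch of that variation, using only the event types already handled above (the helper name collect_text_response is hypothetical):

# Sketch: accumulate the streamed text instead of printing it.
async def collect_text_response(connection) -> str:
    chunks = []
    async for event in connection:
        if event.type == "response.text.delta":
            chunks.append(event.delta)
        elif event.type == "response.done":
            break
    return "".join(chunks)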

tests/test_tars.py

Lines changed: 178 additions & 0 deletions
@@ -0,0 +1,178 @@
import os
import base64
import requests
import json
import torch
import pyautogui  # For taking screenshots
import time
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from screeninfo import get_monitors
from npcsh.llm_funcs import get_openai_response


def get_screen_resolution():
    monitor = get_monitors()[0]  # Get primary monitor
    return monitor.width, monitor.height


from PIL import Image


def capture_screenshot() -> str:
    """Captures a screenshot and saves it to a specified path."""
    screenshot_path = "screenshot.png"
    screenshot = pyautogui.screenshot()

    # Resize screenshot to fit model's pixel range
    desired_width = 1280  # Adjust as needed based on max_pixels range
    desired_height = int(
        (desired_width * screenshot.height) / screenshot.width
    )  # Maintain aspect ratio
    screenshot = screenshot.resize((desired_width, desired_height))

    screenshot.save(screenshot_path)
    return screenshot_path


# Adjust processor for specific pixel range
min_pixels = 256 * 28 * 28
max_pixels = 1280 * 28 * 28


def encode_image_to_base64(image_path: str) -> str:
    """Encodes an image file to a base64 string."""
    with open(image_path, "rb") as image_file:
        encoded_string = base64.b64encode(image_file.read()).decode("utf-8")
    return f"data:image/png;base64,{encoded_string}"


def get_tars_response(command: str, model_name: str) -> str:
    """Generates a response from the UI-TARS model based on the command and screenshot image."""
    # model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    #     model_name, torch_dtype="auto", device_map="auto"
    # )
    # processor = AutoProcessor.from_pretrained(model_name)

    # capture the current screen
    im = capture_screenshot()
    image_data = encode_image_to_base64(im)
    prompt = (
        f"""You are a GUI agent. You are given a task and your action history,
with screenshots. You need to perform the next action or set of actions to complete the task.
here is the task you must complete: {command}
"""
        + r"""
click(start_box='<|box_start|>(x1,y1)<|box_end|>')
left_double(start_box='<|box_start|>(x1,y1)<|box_end|>')
right_single(start_box='<|box_start|>(x1,y1)<|box_end|>')
drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>')
hotkey(key='')
type(content='') #If you want to submit your input, use "\
" at the end of `content`.
scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left')
wait() #Sleep for 5s and take a screenshot to check for any changes.
finished()
call_user() # Submit the task and call the user when the task is unsolvable, or when you need the user's help.

your response should be a list of actions to perform in the order they should be performed.
Provide a single json object with the following format:
{ "actions": ['action1', 'action2', 'action3'] }
Do not provide any additional text or markdown formatting.
"""
    )

    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {"url": image_data},
                },
                {"type": "text", "text": command},
            ],
        }
    ]

    # tars:
    """text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = processor(text=text, padding=True, return_tensors="pt").to(model.device)
    generated_ids = model.generate(**inputs, max_new_tokens=128)
    output_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
    return output_text[0]
    """
    gpt4o_response = get_openai_response(
        prompt, model="gpt-4o-mini", messages=messages, format="json"
    )

    return gpt4o_response


def execute_actions(actions: list):
    """Executes the actions received from the model using pyautogui."""
    for action in actions:
        if action.startswith("click"):
            x, y = map(int, action[action.find("(") + 1 : action.find(")")].split(","))
            pyautogui.click(x, y)
        elif action.startswith("left_double"):
            x, y = map(int, action[action.find("(") + 1 : action.find(")")].split(","))
            pyautogui.doubleClick(x, y)
        elif action.startswith("right_single"):
            x, y = map(int, action[action.find("(") + 1 : action.find(")")].split(","))
            pyautogui.rightClick(x, y)
        elif action.startswith("drag"):
            coords = list(
                map(
                    int,
                    action[action.find("(") + 1 : action.find(")")]
                    .replace("(", "")
                    .replace(")", "")
                    .split(","),
                )
            )
            pyautogui.moveTo(coords[0], coords[1])
            pyautogui.dragTo(coords[2], coords[3], duration=0.5)
        elif action.startswith("type"):
            text = action.split("('")[1].split("')")[0]
            pyautogui.write(text, interval=0.05)
        elif action.startswith("hotkey"):
            key = action.split("('")[1].split("')")[0]
            pyautogui.hotkey(key)
        elif action.startswith("scroll"):
            direction = action.split("('")[1].split("')")[0]
            amount = -100 if direction == "down" else 100
            pyautogui.scroll(amount)
        elif action.startswith("wait"):
            time.sleep(5)
        elif action.startswith("finished"):
            print("Task completed.")


def ui_tars_control_loop(model_name: str):
    """Main loop for interacting with the user and executing commands via UI-TARS."""
    print("UI-TARS Control Loop Started.")
    screen_width, screen_height = get_screen_resolution()
    print(f"Screen resolution: {screen_width}x{screen_height}")

    while True:
        command = input("Enter your command (or type 'exit' to quit): ")
        if command.lower() == "exit":
            print("Exiting UI-TARS Control Loop.")
            break

        # get_tars_response captures its own screenshot internally,
        # so it only needs the command and model name.
        tars_result = get_tars_response(command, model_name)
        print(f"UI-TARS Response: {tars_result}")

        try:
            actions = json.loads(tars_result).get("actions", [])
            execute_actions(actions)
        except json.JSONDecodeError:
            print("Error parsing actions from UI-TARS response.")


if __name__ == "__main__":
    MODEL_NAME = "ui-tars-7B"  # Replace with your actual UI-TARS model name
    ui_tars_control_loop(MODEL_NAME)
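
Note that execute_actions parses coordinates by slicing between the first "(" and ")", which only works for bare forms like click(100,200); the start_box='<|box_start|>(x1,y1)<|box_end|>' format described in the prompt would make int() fail. A hedged, regex-based sketch that tolerates both forms (parse_coordinates is a hypothetical helper, not part of the file above):

# Sketch: pull every integer coordinate pair out of an action string,
# whether it looks like click(100,200) or
# click(start_box='<|box_start|>(100,200)<|box_end|>').
import re

def parse_coordinates(action: str) -> list[tuple[int, int]]:
    pairs = re.findall(r"\((-?\d+)\s*,\s*(-?\d+)\)", action)
    return [(int(x), int(y)) for x, y in pairs]

# Example:
# parse_coordinates("drag(start_box='<|box_start|>(10,20)<|box_end|>', end_box='<|box_start|>(30,40)<|box_end|>')")
# -> [(10, 20), (30, 40)]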
