First, the demo:
vl_agent_demo
The code follows.
(0) Set up the working directory:
Your working directory should be laid out as follows (the utils folder and qwenvl_agent.py are both adapted from the official Qwen2.5-VL cookbook):

```
YourProj/
├── utils/
│   └── agent_function_call.py
├── mobile_agent.py
└── qwenvl_agent.py
```
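The two scripts below pull in ppadb, uiautomator2, the OpenAI SDK, qwen-agent, transformers, and Pillow. A minimal install sketch (the PyPI package names are my assumption; pin versions to match your environment):

```
pip install pure-python-adb uiautomator2 openai qwen-agent transformers pillow
```

You also need a phone or emulator with USB debugging enabled and visible to `adb devices`.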
(1) The main script, mobile_agent.py:
```python
import os
import time
import json
import base64

from ppadb.client import Client as AdbClient
import uiautomator2 as u2

from qwenvl_agent import perform_gui_grounding_with_api


class Android_VL_Agent:
    def __init__(self):
        self.client = AdbClient(host="127.0.0.1", port=5037)
        self.device_serial = None
        self.u2_device = None
        self.SCREENSHOT_PATH = None
        self.QWEN_MODEL_ID = 'qwen2.5-vl-7b-instruct'
        self.__set_up()

    @staticmethod
    def check_adb_service():
        try:
            result = os.popen("adb devices").read()
            if "List of devices attached" in result:
                return True
            # ADB is not running yet: start the server and check once more
            os.system("adb start-server")
            time.sleep(5)  # wait for the ADB server to start
            result = os.popen("adb devices").read()
            return "List of devices attached" in result
        except Exception:
            print("Failed to start the ADB server")
            return False

    @staticmethod
    def encode_image(image_path):
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode("utf-8")

    @staticmethod
    def info_parser(info):
        # Extract and decode the JSON body between <tool_call> and </tool_call>
        try:
            body = info.split("<tool_call>")[1].split("</tool_call>")[0]
            return json.loads(body)
        except Exception as e:
            print(f"Parsing failed: {str(e)}")
            return None

    # Start-up: connect to the first attached device
    def __set_up(self):
        assert self.check_adb_service()
        devices = self.client.devices()
        self.device_serial = devices[0].serial if devices else None
        self.u2_device = u2.connect(self.device_serial)
        self.SCREENSHOT_PATH = "screenshot.png"

    # Single-tap event
    def __single_point_event(self, x, y):
        try:
            self.u2_device.click(x, y)
            return True
        except Exception as e:
            print(f"Tap failed: {str(e)}")
            return False

    # Type text into the currently focused field
    def __input_content(self, content):
        try:
            self.u2_device.send_keys(content)
            return True
        except Exception as e:
            print(f"Input failed: {str(e)}")
            return False

    # Take a screenshot and save it
    def __screenshot(self):
        try:
            # Remove the previous screenshot first
            if os.path.exists(self.SCREENSHOT_PATH):
                os.remove(self.SCREENSHOT_PATH)
            screenshot = self.u2_device.screenshot()
            screenshot.save(self.SCREENSHOT_PATH)
            # screenshot.show()
            return True
        except Exception as e:
            print(f"Screenshot failed: {str(e)}")
            return False

    def __Qwen_vl_agent(self, query):
        output_info = perform_gui_grounding_with_api(self.SCREENSHOT_PATH, query, self.QWEN_MODEL_ID)
        # print(output_info)
        parsed = self.info_parser(str(output_info))
        return parsed["arguments"] if parsed else None

    def __action(self, result):
        if result is None:
            return
        if "click" in result["action"]:
            coordinate = result["coordinate"]
            self.__single_point_event(coordinate[0], coordinate[1])
        elif "type" in result["action"]:
            self.__input_content(result["text"])

    def run(self, query):
        # Reconnect to the device
        self.u2_device = u2.connect(self.device_serial)
        # Perceive: take a screenshot
        self.__screenshot()
        # Understand: ask the VL model for the next action
        result = self.__Qwen_vl_agent(query)
        print(result)
        # Act: execute the returned action
        self.__action(result)

    def __call__(self, query):
        self.run(query)


if __name__ == "__main__":
    agent = Android_VL_Agent()
    # pause between steps, in seconds
    timestep = 2
    name = "名字"        # placeholder: contact name
    message = "信息"     # placeholder: message text
    agent.run("打开微信")  # "Open WeChat"
    time.sleep(timestep)
    agent.run(f"点击和{name}聊天框的顶部区域进入聊天界面")  # "Tap the chat entry with {name} to open the conversation"
    time.sleep(timestep)
    agent.run("点击屏幕底部的输入框部分进入输入界面")  # "Tap the input box at the bottom of the screen"
    time.sleep(timestep)
    agent.run(f"在聊天框输入内容:{message}")  # "Type into the chat box: {message}"
    time.sleep(timestep)
    agent.run("点击右侧发送按钮中心位置发送消息")  # "Tap the center of the Send button on the right"
```
(2) The helper module, qwenvl_agent.py:
```python
import json
import base64
import warnings

from openai import OpenAI
from qwen_agent.llm.fncall_prompts.nous_fncall_prompt import (
    NousFnCallPrompt,
    Message,
    ContentItem,
)
from PIL import Image, ImageDraw, ImageColor
from transformers.models.qwen2_vl.image_processing_qwen2_vl_fast import smart_resize

warnings.filterwarnings("ignore")

from utils.agent_function_call import ComputerUse


def draw_point(image: Image.Image, point: list, color=None):
    # Draw a semi-transparent disc with a small green center dot at `point`
    if isinstance(color, str):
        try:
            color = ImageColor.getrgb(color)
            color = color + (128,)
        except ValueError:
            color = (255, 0, 0, 128)
    else:
        color = (255, 0, 0, 128)

    overlay = Image.new('RGBA', image.size, (255, 255, 255, 0))
    overlay_draw = ImageDraw.Draw(overlay)
    radius = min(image.size) * 0.05
    x, y = point
    overlay_draw.ellipse(
        [(x - radius, y - radius), (x + radius, y + radius)],
        fill=color)
    center_radius = radius * 0.1
    overlay_draw.ellipse(
        [(x - center_radius, y - center_radius),
         (x + center_radius, y + center_radius)],
        fill=(0, 255, 0, 255))
    image = image.convert('RGBA')
    combined = Image.alpha_composite(image, overlay)
    return combined.convert('RGB')


def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")


def perform_gui_grounding_with_api(screenshot_path, user_query, model_id, min_pixels=3136, max_pixels=12845056):
    """
    Perform GUI grounding with a Qwen VL model: interpret a user query against a screenshot.

    Args:
        screenshot_path (str): Path to the screenshot image
        user_query (str): User's query/instruction
        model_id (str): DashScope model ID to call
        min_pixels: Minimum pixels for the image
        max_pixels: Maximum pixels for the image

    Returns:
        str: The model's raw output text, including the <tool_call> block
    """
    # Open and process the image
    input_image = Image.open(screenshot_path)
    base64_image = encode_image(screenshot_path)

    client = OpenAI(
        # If the DASHSCOPE_API_KEY environment variable is not configured, replace
        # the line below with your DashScope API key: api_key="sk-xxx".
        # Keys are issued at https://bailian.console.alibabacloud.com/?apiKey=1
        api_key="xxx",
        base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
    )

    resized_height, resized_width = smart_resize(
        input_image.height,
        input_image.width,
        min_pixels=min_pixels,
        max_pixels=max_pixels,
    )

    # Initialize the computer-use tool definition
    computer_use = ComputerUse(
        cfg={"display_width_px": resized_width, "display_height_px": resized_height}
    )

    # Build the messages
    system_message = NousFnCallPrompt.preprocess_fncall_messages(
        messages=[
            Message(role="system", content=[ContentItem(text="You are a helpful assistant.")]),
        ],
        functions=[computer_use.function],
        lang=None,
    )
    system_message = system_message[0].model_dump()
    messages = [
        {
            "role": "system",
            "content": [
                {"type": "text", "text": msg["text"]} for msg in system_message["content"]
            ],
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "min_pixels": min_pixels,
                    "max_pixels": max_pixels,
                    # Pass base64 image data. The MIME type (image/{format}) must match
                    # the actual image format:
                    #   PNG:  f"data:image/png;base64,{base64_image}"
                    #   JPEG: f"data:image/jpeg;base64,{base64_image}"
                    #   WEBP: f"data:image/webp;base64,{base64_image}"
                    # The agent saves screenshot.png, so use the PNG content type here.
                    "image_url": {"url": f"data:image/png;base64,{base64_image}"},
                },
                {"type": "text", "text": user_query},
            ],
        }
    ]
    # print(json.dumps(messages, indent=4))

    completion = client.chat.completions.create(
        model=model_id,
        messages=messages,
    )
    output_text = completion.choices[0].message.content

    # Parse the action (and optionally visualize it)
    # print(output_text)
    action = json.loads(output_text.split('<tool_call>\n')[1].split('\n</tool_call>')[0])
    # display_image = input_image.resize((resized_width, resized_height))
    # display_image = draw_point(input_image, action['arguments']['coordinate'], color='green')

    return output_text


if __name__ == "__main__":
    screenshot = "screenshot.png"
    user_query = '在聊天框输入内容:下午好!'  # "Type into the chat box: Good afternoon!"
    model_id = "qwen2.5-vl-7b-instruct"
    output_text = perform_gui_grounding_with_api(screenshot, user_query, model_id)
    print(output_text)
```
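One caveat: ComputerUse is configured with the smart_resize dimensions, so the model grounds coordinates in the resized image (the commented-out draw_point call resizes the screenshot to match), while uiautomator2 clicks in physical screen pixels. With max_pixels this large, smart_resize only snaps each side to a multiple of 28, so the drift is small, but an explicit mapping is a few lines. A sketch (rescale_point is a hypothetical helper, not part of the original code):

```python
from transformers.models.qwen2_vl.image_processing_qwen2_vl_fast import smart_resize

def rescale_point(coord, screen_w, screen_h, min_pixels=3136, max_pixels=12845056):
    # Hypothetical helper: map a model coordinate from smart_resize space
    # back to physical screen pixels before calling u2_device.click().
    resized_h, resized_w = smart_resize(screen_h, screen_w,
                                        min_pixels=min_pixels, max_pixels=max_pixels)
    x, y = coord
    return int(x * screen_w / resized_w), int(y * screen_h / resized_h)

# Example: a 1080x2400 screen snaps to 1092x2408, so a model tap at
# (546, 1204) maps back to (540, 1200) on the device.
```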