Building a Mobile Phone Agent with Qwen-VL

Demo first:

[Demo video: vl_agent_demo]

The code is as follows:

(0) Set up the working directory:

Your working directory should be laid out as follows:

The utils folder and qwenvl_agent.py are both adapted from:

GitHub - QwenLM/Qwen2.5-VL: Qwen2.5-VL is the multimodal large language model series developed by the Qwen team, Alibaba Cloud. https://github.com/QwenLM/Qwen2.5-VL

YourProj/
├── utils/
│   └── agent_function_call.py
├── mobile_agent.py
└── qwenvl_agent.py
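Before running anything, make sure the packages the two scripts import are installed. A quick check sketch; the pip package names in the comment (pure-python-adb for ppadb, uiautomator2, openai, qwen-agent, transformers, pillow) are my assumption based on the imports below:

```python
# Minimal environment check: verify that every package imported by the two
# scripts in this post can be found. Install anything missing, e.g.:
#   pip install pure-python-adb uiautomator2 openai qwen-agent transformers pillow
import importlib.util

for mod in ["ppadb", "uiautomator2", "openai", "qwen_agent", "transformers", "PIL"]:
    status = "OK" if importlib.util.find_spec(mod) else "MISSING"
    print(f"{mod}: {status}")
```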

(1) The entry script, mobile_agent.py:

```python
import os
import time
import json
from ppadb.client import Client as AdbClient
import uiautomator2 as u2
import base64
from qwenvl_agent import perform_gui_grounding_with_api


class Android_VL_Agent:

    def __init__(self):
        self.client = AdbClient(host="127.0.0.1", port=5037)
        self.device_serial = None
        self.u2_device = None
        self.SCREENSHOT_PATH = None
        self.QWEN_MODEL_ID = 'qwen2.5-vl-7b-instruct'
        self.__set_up()

    @staticmethod
    def check_adb_service():
        try:
            result = os.popen("adb devices").read()
            if "List of devices attached" in result:
                return True
            else:
                os.system("adb start-server")
                time.sleep(5)  # wait for the ADB service to start
                result = os.popen("adb devices").read()
                if "List of devices attached" in result:
                    return True
                else:
                    return False
        except Exception:
            print("ADB服务启动失败")
            return False
    @staticmethod
    def encode_image(image_path):
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode("utf-8")

    @staticmethod
    def info_parser(info):
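        # Expected model output (illustrative example; the exact fields follow the
        # computer-use tool schema declared in qwenvl_agent.py):
        # <tool_call>
        # {"name": "computer_use", "arguments": {"action": "click", "coordinate": [540, 1200]}}
        # </tool_call>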
        try:
            body = info.split("<tool_call>")[1].split("</tool_call>")[0]
            return json.loads(body)
        except Exception as e:
            print(f"Failed to parse tool call: {str(e)}")
            return None

    # Set up the device connection
    def __set_up(self):
        assert self.check_adb_service()
        devices = self.client.devices()
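        # Note: this grabs the first device listed by `adb devices`; with several
        # devices attached, pick a specific serial instead.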
        self.device_serial = devices[0].serial if devices else None
        self.u2_device = u2.connect(self.device_serial)
        self.SCREENSHOT_PATH = "screenshot.png"

    # Single-tap event
    def __single_point_event(self, x, y):
        try:
            self.u2_device.click(x, y)
            return True
        except Exception as e:
            print(f"单点失败: {str(e)}")
            return False

    # Type text content
    def __input_content(self, content):
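        # Note: send_keys types into the currently focused text field, so the input
        # box must already be focused (e.g. by a previous tap) before this runs.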
        try:
            self.u2_device.send_keys(content)
            return True
        except Exception as e:
            print(f"输入失败: {str(e)}")
            return False

    # Take a screenshot and save it
    def __screenshot(self):
        try:
            # Remove the previous screenshot
            if os.path.exists(self.SCREENSHOT_PATH):
                os.remove(self.SCREENSHOT_PATH)
            screenshot = self.u2_device.screenshot()
            screenshot.save(self.SCREENSHOT_PATH)
            # screenshot.show()
            return True
        except Exception as e:
            print(f"截图失败: {str(e)}")
            return False

    def __Qwen_vl_agent(self, query):
        output_info = perform_gui_grounding_with_api(self.SCREENSHOT_PATH, query, self.QWEN_MODEL_ID)
        parsed = self.info_parser(str(output_info))
        if parsed is None:
            raise ValueError("Model output did not contain a valid <tool_call> block")
        return parsed["arguments"]

    def __action(self, result):
        if "click" in result["action"]:
            coordinate = result["coordinate"]
            self.__single_point_event(coordinate[0],coordinate[1])
        elif "type" in result["action"]:
            self.__input_content(result["text"])

    def run(self, query):
        # Reconnect to the device
        self.u2_device = u2.connect(self.device_serial)
        # Perceive: capture the current screen
        self.__screenshot()
        # Understand: ask the model for the next action
        result = self.__Qwen_vl_agent(query)
        print(result)
        # Act: execute the action on the device
        self.__action(result)

    def __call__(self, query):
        self.run(query)

if __name__ == "__main__":
    agent = Android_VL_Agent()
    # Pause between steps so the UI has time to settle
    timestep = 2
    name = "contact name"
    message = "message text"

    agent.run("Open WeChat")
    time.sleep(timestep)
    agent.run(f"Tap the top area of the chat entry for {name} to open the chat screen")
    time.sleep(timestep)
    agent.run("Tap the input box at the bottom of the screen to bring up the keyboard")
    time.sleep(timestep)
    agent.run(f"Type into the chat box: {message}")
    time.sleep(timestep)
    agent.run("Tap the center of the Send button on the right to send the message")
```
(2) The helper module qwenvl_agent.py:

```python
import os
import json
import base64
from openai import OpenAI
from qwen_agent.llm.fncall_prompts.nous_fncall_prompt import (
    NousFnCallPrompt,
    Message,
    ContentItem,
)
from PIL import Image, ImageDraw, ImageColor
from transformers.models.qwen2_vl.image_processing_qwen2_vl_fast import smart_resize
import warnings
warnings.filterwarnings("ignore")
from utils.agent_function_call import ComputerUse

def draw_point(image: Image.Image, point: list, color=None):
    if isinstance(color, str):
        try:
            color = ImageColor.getrgb(color)
            color = color + (128,)
        except ValueError:
            color = (255, 0, 0, 128)
    else:
        color = (255, 0, 0, 128)

    overlay = Image.new('RGBA', image.size, (255, 255, 255, 0))
    overlay_draw = ImageDraw.Draw(overlay)
    radius = min(image.size) * 0.05
    x, y = point

    overlay_draw.ellipse(
        [(x - radius, y - radius), (x + radius, y + radius)],
        fill=color)

    center_radius = radius * 0.1
    overlay_draw.ellipse(
        [(x - center_radius, y - center_radius),
         (x + center_radius, y + center_radius)],
        fill=(0, 255, 0, 255))

    image = image.convert('RGBA')
    combined = Image.alpha_composite(image, overlay)
    return combined.convert('RGB')

def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")

def perform_gui_grounding_with_api(screenshot_path, user_query, model_id, min_pixels=3136, max_pixels=12845056):
    """
    Perform GUI grounding using Qwen model to interpret user query on a screenshot.

    Args:
        screenshot_path (str): Path to the screenshot image
        user_query (str): User's query/instruction
        model_id (str): Name of the Qwen model to call via the DashScope API
        min_pixels: Minimum pixels for the image
        max_pixels: Maximum pixels for the image

    Returns:
        str: The model's raw output text, including the <tool_call> block
    """

    # Open and process image
    input_image = Image.open(screenshot_path)
    base64_image = encode_image(screenshot_path)
    client = OpenAI(
        # Reads the DashScope API key from the environment; alternatively hard-code it
        # here, e.g. api_key="sk-xxx" (keys are issued at https://bailian.console.alibabacloud.com/?apiKey=1)
        api_key=os.getenv("DASHSCOPE_API_KEY"),
        base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
    )
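    # smart_resize snaps the screenshot's dimensions to the grid the vision encoder
    # uses (multiples of its patch size) while keeping the pixel count within
    # [min_pixels, max_pixels]; the model grounds coordinates in this resized space.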
    resized_height, resized_width = smart_resize(
        input_image.height,
        input_image.width,
        min_pixels=min_pixels,
        max_pixels=max_pixels,
    )

    # Initialize computer use function
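    # (ComputerUse, from the cookbook's utils/agent_function_call.py, declares the
    # function-calling schema the model may invoke; display_width/height tell it the
    # coordinate space of the screenshot it will see.)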
    computer_use = ComputerUse(
        cfg={"display_width_px": resized_width, "display_height_px": resized_height}
    )

    # Build messages
    system_message = NousFnCallPrompt.preprocess_fncall_messages(
        messages=[
            Message(role="system", content=[ContentItem(text="You are a helpful assistant.")]),
        ],
        functions=[computer_use.function],
        lang=None,
    )
    system_message = system_message[0].model_dump()
    messages = [
        {
            "role": "system",
            "content": [
                {"type": "text", "text": msg["text"]} for msg in system_message["content"]
            ],
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "min_pixels": min_pixels,
                    "max_pixels": max_pixels,
                    # Pass the image as base64 data; the MIME type (image/{format})
                    # must match the actual image format:
                    # PNG image:  f"data:image/png;base64,{base64_image}"
                    # JPEG image: f"data:image/jpeg;base64,{base64_image}"
                    # WEBP image: f"data:image/webp;base64,{base64_image}"
                    # The screenshot is saved as PNG, so the PNG MIME type is used here.
                    "image_url": {"url": f"data:image/png;base64,{base64_image}"},
                },
                {"type": "text", "text": user_query},
            ],
        }
    ]
    # print(json.dumps(messages, indent=4))
    completion = client.chat.completions.create(
        model=model_id,
        messages=messages,
    )
    output_text = completion.choices[0].message.content

    # Parse the tool call to validate the output format; drawing the point is optional
    action = json.loads(output_text.split('<tool_call>\n')[1].split('\n</tool_call>')[0])
    # display_image = draw_point(input_image, action['arguments']['coordinate'], color='green')

    return output_text


if __name__ == "__main__":
    screenshot = "screenshot.png"
    user_query = "Type into the chat box: Good afternoon!"
    model_id = "qwen2.5-vl-7b-instruct"
    output_text = perform_gui_grounding_with_api(screenshot, user_query, model_id)
    print(output_text)
```
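To eyeball the grounding result, you can draw the predicted point onto the screenshot with the `draw_point` helper defined above. A small usage sketch (file names are the ones used in this demo; as noted earlier, the model's coordinates are close to native resolution for phone screenshots, so drawing them on the original image is a reasonable approximation):

```python
import json
from PIL import Image
from qwenvl_agent import draw_point, perform_gui_grounding_with_api

output_text = perform_gui_grounding_with_api("screenshot.png", "Open WeChat", "qwen2.5-vl-7b-instruct")
action = json.loads(output_text.split('<tool_call>\n')[1].split('\n</tool_call>')[0])
annotated = draw_point(Image.open("screenshot.png"), action["arguments"]["coordinate"], color="green")
annotated.save("screenshot_annotated.png")
```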