Building a Mobile Phone Agent with Qwen-VL

Demo first:

[Demo video: vl_agent_demo]

The code is as follows:

(0) Set up the working directory:

Your working directory should be laid out as follows:

The utils folder and qwenvl_agent.py are both adapted from:

GitHub - QwenLM/Qwen2.5-VL: Qwen2.5-VL is the multimodal large language model series developed by the Qwen team, Alibaba Cloud. https://github.com/QwenLM/Qwen2.5-VL

YourProj/
├── utils/
│   └── agent_function_call.py
├── mobile_agent.py
└── qwenvl_agent.py
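Before running anything, make sure the packages the two scripts import are installed. A quick check sketch; the pip package names in the comment (pure-python-adb for ppadb, uiautomator2, openai, qwen-agent, transformers, pillow) are my assumption based on the imports below:

```python
# Minimal environment check: verify that every package imported by the two
# scripts in this post can be found. Install anything missing, e.g.:
#   pip install pure-python-adb uiautomator2 openai qwen-agent transformers pillow
import importlib.util

for mod in ["ppadb", "uiautomator2", "openai", "qwen_agent", "transformers", "PIL"]:
    status = "OK" if importlib.util.find_spec(mod) else "MISSING"
    print(f"{mod}: {status}")
```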

(1) The entry script, mobile_agent.py:

```python
import os
import time
import json
from ppadb.client import Client as AdbClient
import uiautomator2 as u2
import base64
from qwenvl_agent import perform_gui_grounding_with_api


class Android_VL_Agent:

    def __init__(self):
        self.client = AdbClient(host="127.0.0.1", port=5037)
        self.device_serial = None
        self.u2_device = None
        self.SCREENSHOT_PATH = None
        self.QWEN_MODEL_ID = 'qwen2.5-vl-7b-instruct'
        self.__set_up()

    @staticmethod
    def check_adb_service():
        try:
            result = os.popen("adb devices").read()
            if "List of devices attached" in result:
                return True
            else:
                os.system("adb start-server")
                time.sleep(5)  # wait for the ADB service to start
                result = os.popen("adb devices").read()
                if "List of devices attached" in result:
                    return True
                else:
                    return False
        except Exception:
            print("ADB服务启动失败")
            return False
    @staticmethod
    def encode_image(image_path):
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode("utf-8")

    @staticmethod
    def info_parser(info):
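        # Expected model output (illustrative example; the exact fields follow the
        # computer-use tool schema declared in qwenvl_agent.py):
        # <tool_call>
        # {"name": "computer_use", "arguments": {"action": "click", "coordinate": [540, 1200]}}
        # </tool_call>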
        try:
            body = info.split("<tool_call>")[1].split("</tool_call>")[0]
            return json.loads(body)
        except Exception as e:
            print(f"Failed to parse tool call: {str(e)}")
            return None

    # Set up the device connection
    def __set_up(self):
        assert self.check_adb_service()
        devices = self.client.devices()
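        # Note: this grabs the first device listed by `adb devices`; with several
        # devices attached, pick a specific serial instead.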
        self.device_serial = devices[0].serial if devices else None
        self.u2_device = u2.connect(self.device_serial)
        self.SCREENSHOT_PATH = "screenshot.png"

    # Single-tap event
    def __single_point_event(self, x, y):
        try:
            self.u2_device.click(x, y)
            return True
        except Exception as e:
            print(f"单点失败: {str(e)}")
            return False

    # Type text content
    def __input_content(self, content):
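        # Note: send_keys types into the currently focused text field, so the input
        # box must already be focused (e.g. by a previous tap) before this runs.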
        try:
            self.u2_device.send_keys(content)
            return True
        except Exception as e:
            print(f"输入失败: {str(e)}")
            return False

    # Take a screenshot and save it
    def __screenshot(self):
        try:
            # Remove the previous screenshot
            if os.path.exists(self.SCREENSHOT_PATH):
                os.remove(self.SCREENSHOT_PATH)
            screenshot = self.u2_device.screenshot()
            screenshot.save(self.SCREENSHOT_PATH)
            # screenshot.show()
            return True
        except Exception as e:
            print(f"截图失败: {str(e)}")
            return False

    def __Qwen_vl_agent(self, query):
        output_info = perform_gui_grounding_with_api(self.SCREENSHOT_PATH, query, self.QWEN_MODEL_ID)
        parsed = self.info_parser(str(output_info))
        if parsed is None:
            raise ValueError("Model output did not contain a valid <tool_call> block")
        return parsed["arguments"]

    def __action(self, result):
        if "click" in result["action"]:
            coordinate = result["coordinate"]
            self.__single_point_event(coordinate[0],coordinate[1])
        elif "type" in result["action"]:
            self.__input_content(result["text"])

    def run(self, query):
        # Reconnect to the device
        self.u2_device = u2.connect(self.device_serial)
        # Perceive: capture the current screen
        self.__screenshot()
        # Understand: ask the model for the next action
        result = self.__Qwen_vl_agent(query)
        print(result)
        # Act: execute the action on the device
        self.__action(result)

    def __call__(self, query):
        self.run(query)

if __name__ == "__main__":
    agent = Android_VL_Agent()
    # Pause between steps so the UI has time to settle
    timestep = 2
    name = "contact name"
    message = "message text"

    agent.run("Open WeChat")
    time.sleep(timestep)
    agent.run(f"Tap the top area of the chat entry for {name} to open the chat screen")
    time.sleep(timestep)
    agent.run("Tap the input box at the bottom of the screen to bring up the keyboard")
    time.sleep(timestep)
    agent.run(f"Type into the chat box: {message}")
    time.sleep(timestep)
    agent.run("Tap the center of the Send button on the right to send the message")
```
(2) The helper module qwenvl_agent.py:

```python
import os
import json
import base64
from openai import OpenAI
from qwen_agent.llm.fncall_prompts.nous_fncall_prompt import (
    NousFnCallPrompt,
    Message,
    ContentItem,
)
from PIL import Image, ImageDraw, ImageColor
from transformers.models.qwen2_vl.image_processing_qwen2_vl_fast import smart_resize
import warnings
warnings.filterwarnings("ignore")
from utils.agent_function_call import ComputerUse

def draw_point(image: Image.Image, point: list, color=None):
    if isinstance(color, str):
        try:
            color = ImageColor.getrgb(color)
            color = color + (128,)
        except ValueError:
            color = (255, 0, 0, 128)
    else:
        color = (255, 0, 0, 128)

    overlay = Image.new('RGBA', image.size, (255, 255, 255, 0))
    overlay_draw = ImageDraw.Draw(overlay)
    radius = min(image.size) * 0.05
    x, y = point

    overlay_draw.ellipse(
        [(x - radius, y - radius), (x + radius, y + radius)],
        fill=color)

    center_radius = radius * 0.1
    overlay_draw.ellipse(
        [(x - center_radius, y - center_radius),
         (x + center_radius, y + center_radius)],
        fill=(0, 255, 0, 255))

    image = image.convert('RGBA')
    combined = Image.alpha_composite(image, overlay)
    return combined.convert('RGB')

def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")

def perform_gui_grounding_with_api(screenshot_path, user_query, model_id, min_pixels=3136, max_pixels=12845056):
    """
    Perform GUI grounding using Qwen model to interpret user query on a screenshot.

    Args:
        screenshot_path (str): Path to the screenshot image
        user_query (str): User's query/instruction
        model_id (str): Name of the Qwen model to call via the DashScope API
        min_pixels: Minimum pixels for the image
        max_pixels: Maximum pixels for the image

    Returns:
        str: The model's raw output text, including the <tool_call> block
    """

    # Open and process image
    input_image = Image.open(screenshot_path)
    base64_image = encode_image(screenshot_path)
    client = OpenAI(
        # Reads the DashScope API key from the environment; alternatively hard-code it
        # here, e.g. api_key="sk-xxx" (keys are issued at https://bailian.console.alibabacloud.com/?apiKey=1)
        api_key=os.getenv("DASHSCOPE_API_KEY"),
        base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
    )
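    # smart_resize snaps the screenshot's dimensions to the grid the vision encoder
    # uses (multiples of its patch size) while keeping the pixel count within
    # [min_pixels, max_pixels]; the model grounds coordinates in this resized space.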
    resized_height, resized_width = smart_resize(
        input_image.height,
        input_image.width,
        min_pixels=min_pixels,
        max_pixels=max_pixels,
    )

    # Initialize computer use function
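    # (ComputerUse, from the cookbook's utils/agent_function_call.py, declares the
    # function-calling schema the model may invoke; display_width/height tell it the
    # coordinate space of the screenshot it will see.)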
    computer_use = ComputerUse(
        cfg={"display_width_px": resized_width, "display_height_px": resized_height}
    )

    # Build messages
    system_message = NousFnCallPrompt.preprocess_fncall_messages(
        messages=[
            Message(role="system", content=[ContentItem(text="You are a helpful assistant.")]),
        ],
        functions=[computer_use.function],
        lang=None,
    )
    system_message = system_message[0].model_dump()
    messages = [
        {
            "role": "system",
            "content": [
                {"type": "text", "text": msg["text"]} for msg in system_message["content"]
            ],
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "min_pixels": min_pixels,
                    "max_pixels": max_pixels,
                    # Pass the image as base64 data; the MIME type (image/{format})
                    # must match the actual image format:
                    # PNG image:  f"data:image/png;base64,{base64_image}"
                    # JPEG image: f"data:image/jpeg;base64,{base64_image}"
                    # WEBP image: f"data:image/webp;base64,{base64_image}"
                    # The screenshot is saved as PNG, so the PNG MIME type is used here.
                    "image_url": {"url": f"data:image/png;base64,{base64_image}"},
                },
                {"type": "text", "text": user_query},
            ],
        }
    ]
    # print(json.dumps(messages, indent=4))
    completion = client.chat.completions.create(
        model=model_id,
        messages=messages,
    )
    output_text = completion.choices[0].message.content

    # Parse the tool call to validate the output format; drawing the point is optional
    action = json.loads(output_text.split('<tool_call>\n')[1].split('\n</tool_call>')[0])
    # display_image = draw_point(input_image, action['arguments']['coordinate'], color='green')

    return output_text


if __name__ == "__main__":
    screenshot = "screenshot.png"
    user_query = "Type into the chat box: Good afternoon!"
    model_id = "qwen2.5-vl-7b-instruct"
    output_text = perform_gui_grounding_with_api(screenshot, user_query, model_id)
    print(output_text)
```
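To eyeball the grounding result, you can draw the predicted point onto the screenshot with the `draw_point` helper defined above. A small usage sketch (file names are the ones used in this demo; as noted earlier, the model's coordinates are close to native resolution for phone screenshots, so drawing them on the original image is a reasonable approximation):

```python
import json
from PIL import Image
from qwenvl_agent import draw_point, perform_gui_grounding_with_api

output_text = perform_gui_grounding_with_api("screenshot.png", "Open WeChat", "qwen2.5-vl-7b-instruct")
action = json.loads(output_text.split('<tool_call>\n')[1].split('\n</tool_call>')[0])
annotated = draw_point(Image.open("screenshot.png"), action["arguments"]["coordinate"], color="green")
annotated.save("screenshot_annotated.png")
```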