Spaces:

junyangwang0410
/

Mobile-Agent

Running

File size: 6,661 Bytes

1e96bca

import os
import base64
from io import BytesIO
from PIL import Image

from MobileAgent.api import inference_chat
from MobileAgent.prompt_no_input import get_action_prompt, get_reflect_prompt, get_memory_prompt, get_process_prompt
from MobileAgent.chat import init_action_chat, init_reflect_chat, init_memory_chat, add_response, add_response_two_image

from dashscope import MultiModalConversation
import dashscope
import concurrent


# Endpoint and credential handed to ``inference_chat`` for every chat-model
# call in this module. Both are read from the environment once at import time
# and are ``None`` when the corresponding variable ('url' / 'token') is unset.
API_url = os.environ.get('url')
token = os.environ.get('token')


def base64_to_pil(base64_string):
    """Decode a base64 string (optionally a ``data:image...`` URI) into a PIL image.

    Args:
        base64_string: Base64 text. When it starts with ``data:image`` only
            the part after the last comma is decoded.

    Returns:
        The image object produced by ``Image.open`` on the decoded bytes.
    """
    encoded = base64_string
    if encoded.startswith('data:image'):
        # Drop the "data:image/...;base64," header and keep the payload.
        encoded = encoded.rpartition(',')[2]
    raw_bytes = base64.b64decode(encoded)
    return Image.open(BytesIO(raw_bytes))


def process_image(image, query):
    """Ask the ``qwen-vl-plus`` model to answer ``query`` about a local image.

    Args:
        image: Path of the image file; it is sent to the model as
            ``"file://" + image``.
        query: Text prompt sent together with the image.

    Returns:
        The ``text`` of the first content item of the first choice in the
        response, or the fallback string ``"This is an icon."`` when that
        field cannot be extracted from the response.
    """
    # The API key is read from the 'qwen' environment variable on every call.
    dashscope.api_key = os.environ.get('qwen')
    messages = [{
        'role': 'user',
        'content': [
            {'image': "file://" + image},
            {'text': query},
        ],
    }]
    response = MultiModalConversation.call(model="qwen-vl-plus", messages=messages)

    try:
        return response['output']['choices'][0]['message']['content'][0]["text"]
    except Exception:
        # Deliberate best-effort fallback for responses that lack the expected
        # structure. ``Exception`` (instead of a bare ``except``) keeps
        # KeyboardInterrupt / SystemExit propagating.
        return "This is an icon."


# Working directories used by mobile_agent_infer: request screenshots are saved
# under "screenshot/" and caption images under "temp/". ``exist_ok=True`` makes
# creation safe when another process creates the directory at the same time
# (the previous exists()-then-mkdir() sequence could raise FileExistsError).
for _work_dir in ("screenshot", "temp"):
    os.makedirs(_work_dir, exist_ok=True)


def _save_png(encoded_image, path):
    """Decode a base64 image via ``base64_to_pil``, save it to ``path`` as PNG, return ``path``."""
    base64_to_pil(encoded_image).save(path, "PNG")
    return path


def _run_caption(json_data):
    """Handle the "caption" task: save each uploaded image and query ``process_image`` on it."""
    # Imported here because a bare ``import concurrent`` does not load the
    # ``futures`` submodule; relying on another package to have imported it
    # first is fragile.
    import concurrent.futures

    query = json_data["query"]
    local_images = []
    for image in json_data["images"]:
        # The name comes from the request payload: keep only its final path
        # component so the file cannot be written outside "temp/".
        image_name = os.path.basename(image["image_name"])
        local_images.append(_save_png(image["image_file"], "temp/" + image_name))

    icon_map = {}
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [executor.submit(process_image, path, query) for path in local_images]
        # All calls run concurrently; results are collected in submission
        # order so the 1-based keys are inserted deterministically.
        for index, future in enumerate(futures, start=1):
            icon_map[index] = future.result()

    return {"icon_map": icon_map}


def _run_planning(json_data):
    """Handle the "planning" task: build the process prompt and query 'gpt-4-turbo'."""
    instruction = json_data["instruction"]
    thought_history = json_data["thought_history"]
    summary_history = json_data["summary_history"]
    action_history = json_data["action_history"]
    completed_requirements = json_data["completed_requirements"]
    add_info = json_data["add_info"]

    prompt_planning = get_process_prompt(instruction, thought_history, summary_history, action_history,
                                         completed_requirements, add_info)
    chat_planning = init_memory_chat()
    chat_planning = add_response("user", prompt_planning, chat_planning)
    output_planning = inference_chat(chat_planning, 'gpt-4-turbo', API_url, token)

    return {"planning": output_planning}


def _run_decision(json_data):
    """Handle the "decision" task: ask 'gpt-4o' for the next action and, optionally, a memory note."""
    image_path = _save_png(json_data["screenshot_file"], "screenshot/screenshot_local.png")

    instruction = json_data["instruction"]
    perception_infos = json_data["perception_infos"]
    width = json_data["width"]
    height = json_data["height"]
    summary_history = json_data["summary_history"]
    action_history = json_data["action_history"]
    summary = json_data["summary"]
    action = json_data["action"]
    add_info = json_data["add_info"]
    error_flag = json_data["error_flag"]
    completed_requirements = json_data["completed_requirements"]
    memory = json_data["memory"]
    memory_switch = json_data["memory_switch"]
    insight = json_data["insight"]

    prompt_action = get_action_prompt(instruction, perception_infos, width, height, summary_history,
                                      action_history, summary, action, add_info, error_flag,
                                      completed_requirements, memory)
    chat_action = init_action_chat()
    chat_action = add_response("user", prompt_action, chat_action, image_path)
    output_action = inference_chat(chat_action, 'gpt-4o', API_url, token)
    if output_action == "No token":
        # The chat call reported "No token": return it as-is and skip the
        # follow-up memory request.
        return {"decision": "No token", "memory": None}
    chat_action = add_response("assistant", output_action, chat_action)

    output_memory = None
    if memory_switch:
        # The memory prompt is appended to the same conversation, so the model
        # sees its own action output before answering.
        prompt_memory = get_memory_prompt(insight)
        chat_action = add_response("user", prompt_memory, chat_action)
        output_memory = inference_chat(chat_action, 'gpt-4o', API_url, token)

    return {"decision": output_action, "memory": output_memory}


def _run_reflection(json_data):
    """Handle the "reflection" task: send the previous and current screenshots to 'gpt-4o'."""
    image_path = _save_png(json_data["screenshot_file"], "screenshot/screenshot_local.png")
    last_image_path = _save_png(json_data["last_screenshot_file"], "screenshot/last_screenshot_local.png")

    instruction = json_data["instruction"]
    last_perception_infos = json_data["last_perception_infos"]
    perception_infos = json_data["perception_infos"]
    width = json_data["width"]
    height = json_data["height"]
    summary = json_data["summary"]
    action = json_data["action"]
    add_info = json_data["add_info"]

    prompt_reflect = get_reflect_prompt(instruction, last_perception_infos, perception_infos, width, height,
                                        summary, action, add_info)
    chat_reflect = init_reflect_chat()
    chat_reflect = add_response_two_image("user", prompt_reflect, chat_reflect, [last_image_path, image_path])
    output_reflect = inference_chat(chat_reflect, 'gpt-4o', API_url, token)

    return {"reflection": output_reflect}


def mobile_agent_infer(json_data):
    """Dispatch one Mobile-Agent request according to ``json_data["task"]``.

    Args:
        json_data: Mapping describing the request. ``"task"`` selects the
            handler; the remaining keys read depend on the task:

            * ``"caption"``: ``query`` and ``images`` (each item with
              ``image_name`` and base64 ``image_file``).
            * ``"planning"``: ``instruction``, ``thought_history``,
              ``summary_history``, ``action_history``,
              ``completed_requirements``, ``add_info``.
            * ``"decision"``: base64 ``screenshot_file`` plus
              ``instruction``, ``perception_infos``, ``width``, ``height``,
              ``summary_history``, ``action_history``, ``summary``,
              ``action``, ``add_info``, ``error_flag``,
              ``completed_requirements``, ``memory``, ``memory_switch``,
              ``insight``.
            * ``"reflection"``: base64 ``screenshot_file`` and
              ``last_screenshot_file`` plus ``instruction``,
              ``last_perception_infos``, ``perception_infos``, ``width``,
              ``height``, ``summary``, ``action``, ``add_info``.

    Returns:
        A dict whose key depends on the task: ``{"icon_map": {1: ..., ...}}``,
        ``{"planning": ...}``, ``{"decision": ..., "memory": ...}`` or
        ``{"reflection": ...}``. A missing or unrecognised task yields
        ``{"error": ...}``.

    Raises:
        KeyError: If a key required by the selected task is absent.

    NOTE(review): "decision" and "reflection" write to fixed file names under
    "screenshot/", so overlapping requests would overwrite each other's
    images -- confirm whether callers serialise requests.
    """
    # ``.get`` so that a payload without "task" receives the same error
    # response as an unknown task instead of raising KeyError.
    task = json_data.get("task")
    if task == "caption":
        return _run_caption(json_data)
    if task == "planning":
        return _run_planning(json_data)
    if task == "decision":
        return _run_decision(json_data)
    if task == "reflection":
        return _run_reflection(json_data)
    return {"error": "The task must be in \"caption\", \"planning\", \"decision\" and \"reflection\"."}