import os
import re

import gradio as gr
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys

from lavague.ActionEngine import ActionEngine
from lavague.defaults import DefaultLocalLLM, DefaultLLM
from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.llms.huggingface import HuggingFaceInferenceAPI

MAX_CHARS = 1500

# Use this action_engine instead to run inference locally
# action_engine = ActionEngine(llm=DefaultLocalLLM())

api_key = os.getenv("AZURE_OPENAI_KEY")
api_version = "2023-05-15"
azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
model = "gpt-4"
deployment_name = "gpt-4-turbo"

llm = AzureOpenAI(
    model=model,
    deployment_name=deployment_name,
    api_key=api_key,
    azure_endpoint=azure_endpoint,
    api_version=api_version,
    temperature=0.0,
)

action_engine = ActionEngine(llm=llm)

# Set up Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")  # Ensure GUI is off
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--window-size=1600,900")

# Set the path to chrome/chromedriver as per your configuration
homedir = os.path.expanduser("~")
chrome_options.binary_location = "./chrome-linux64/chrome"
webdriver_service = Service("./chromedriver-linux64/chromedriver")

title = """

<div align="center">
  <h1>🌊 Welcome to LaVague</h1>
  <p>Redefining internet surfing by transforming natural language instructions into seamless browser interactions.</p>
</div>

""" # Choose Chrome Browser driver = webdriver.Chrome(service=webdriver_service, options=chrome_options) # action_engine = ActionEngine(llm, embedder) def process_url(url): driver.get(url) driver.save_screenshot("screenshot.png") # This function is supposed to fetch and return the image from the URL. # Placeholder function: replace with actual image fetching logic. return "screenshot.png" def process_instruction(query, url_input): if url_input != driver.current_url: driver.get(url_input) state = driver.page_source query_engine = action_engine.get_query_engine(state) streaming_response = query_engine.query(query) source_nodes = streaming_response.get_formatted_sources(MAX_CHARS) response = "" for text in streaming_response.response_gen: # do something with text as they arrive. response += text yield response, source_nodes import re def extract_first_python_code(markdown_text): # Pattern to match the first ```python ``` code block pattern = r"```python(.*?)```" # Using re.DOTALL to make '.' match also newlines match = re.search(pattern, markdown_text, re.DOTALL) if match: # Return the first matched group, which is the code inside the ```python ``` return match.group(1).strip() else: # Return None if no match is found return None def exec_code(code, source_nodes, full_code): print(code) code = extract_first_python_code(code) html = driver.page_source try: exec(code) output = "Successful code execution" status = """

<p style="color: green; font-size: 20px; font-weight: bold;">Success!</p>

""" full_code += code except Exception as e: output = f"Error in code execution: {str(e)}" status = """

<p style="color: red; font-size: 20px; font-weight: bold;">Failure! Open the Debug tab for more information</p>

""" return output, code, html, status, full_code def update_image_display(img): driver.save_screenshot("screenshot.png") url = driver.current_url return "screenshot.png", url def show_processing_message(): return "Processing..." def update_image_display(img): driver.save_screenshot("screenshot.png") url = driver.current_url return "screenshot.png", url base_url = "https://huggingface.co/" instructions = ["Click on the Datasets item on the menu, between Models and Spaces", "Click on the search bar 'Filter by name', type 'The Stack', and press 'Enter'", "Scroll by 500 pixels",] with gr.Blocks() as demo: with gr.Tab("LaVague"): with gr.Row(): gr.HTML(title) with gr.Row(): url_input = gr.Textbox(value=base_url, label="Enter URL and press 'Enter' to load the page.") with gr.Row(): with gr.Column(scale=7): image_display = gr.Image(label="Browser", interactive=False) with gr.Column(scale=3): with gr.Accordion(label="Full code", open=False): full_code = gr.Code(value="", language="python", interactive=False) code_display = gr.Code(label="Generated code", language="python", lines=5, interactive=True) status_html = gr.HTML() with gr.Row(): with gr.Column(scale=8): text_area = gr.Textbox(label="Enter instructions and press 'Enter' to generate code.") gr.Examples(examples=instructions, inputs=text_area) with gr.Tab("Debug"): with gr.Row(): with gr.Column(): log_display = gr.Textbox(interactive=False, lines=20) with gr.Column(): source_display = gr.Code(language="html", label="Retrieved nodes", interactive=False, lines=20) with gr.Row(): with gr.Accordion(label="Full HTML", open=False): full_html = gr.Code(language="html", label="Full HTML", interactive=False, lines=20) # Linking components url_input.submit(process_url, inputs=url_input, outputs=image_display) text_area.submit(show_processing_message, outputs=[status_html]).then( process_instruction, inputs=[text_area, url_input], outputs=[code_display, source_display] ).then( exec_code, inputs=[code_display, source_display, full_code], outputs=[log_display, code_display, full_html, status_html, full_code] ).then( update_image_display, inputs=image_display, outputs=[image_display, url_input] ) demo.launch(debug=True)