sandz7 committed on
Commit 46e059f · 1 Parent(s): 48e85e0

commit with 2 UI

Files changed (2)
  1. app.py +180 -7
  2. steps.txt +3 -2
app.py CHANGED
@@ -1,19 +1,192 @@
  import torch
- import subprocess
  import gradio as gr
- import os
  import openai
  import base64
- import numpy as np

  API_KEY = os.getenv('OPEN_AI_API_KEY')
- from TTS.api import TTS
- tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to('cuda')

  DESCRIPTION = '''
  <div>
- <h1 style="text-align: center;">Atoo 🦜</h1>
- <p style="text-align: center;">This carries a Multi-Speaker and Multi-lingual Model by <a href="https://github.com/coqui-ai/TTS"><b>coqui-ai</b></a></p>
  </div>
  '''
  import torch
+ from diffusers import DiffusionPipeline
  import gradio as gr
+ import numpy as np
  import openai
+ import os
+ import spaces
  import base64

+ # Setup logging
+ # logging.basicConfig(level=logging.DEBUG)
+ # logger = logging.getLogger(__name__)
+
+ # Retrieve the OpenAI API key from the environment
  API_KEY = os.getenv('OPEN_AI_API_KEY')

  DESCRIPTION = '''
  <div>
+ <h1 style="text-align: center;">Book-Reader</h1>
+ <p style="text-align: center;">This contains a Stable Diffusion model from <a href="https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0"><b>stabilityai/stable-diffusion-xl-base-1.0</b></a></p>
+ <p style="text-align: center;">For instructions on how to use the models, <a href="https://huggingface.co/spaces/sandz7/chimera/blob/main/README.md"><b>view this</b></a></p>
  </div>
  '''

+ # Load both the SDXL base and refiner pipelines onto the GPU
+ base = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16, use_safetensors=True, variant="fp16").to("cuda:0")
+ refiner = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-refiner-1.0",
+                                             text_encoder_2=base.text_encoder_2,
+                                             vae=base.vae,
+                                             torch_dtype=torch.float16,
+                                             use_safetensors=True,
+                                             variant="fp16").to("cuda:0")
+
+ chat_mode = {}
+
+ def encode_image(image_path):
+     chat_mode["the_mode"] = "diffusing"
+     with open(image_path, "rb") as image_file:
+         return base64.b64encode(image_file.read()).decode('utf-8')
+
+ def generation(message, history):
+     """
+     Generates a response based on the input message and optionally an image.
+     """
+     global chat_mode
+     image_path = None
+     if "files" in message and message["files"]:
+         if type(message["files"][-1]) == dict:
+             image_path = message["files"][-1]["path"]
+         else:
+             image_path = message["files"][-1]
+     else:
+         for hist in history:
+             if type(hist[0]) == tuple:
+                 image_path = hist[0][0]
+
+     input_prompt = message if isinstance(message, str) else message.get("text", "")
+
+     if image_path is None:
+         chat_mode["mode"] = "text"
+         client = openai.OpenAI(api_key=API_KEY)
+         stream = client.chat.completions.create(
+             model="gpt-3.5-turbo",
+             messages=[{"role": "system", "content": "You are a helpful assistant called 'chimera'."},
+                       {"role": "user", "content": input_prompt}],
+             stream=True,
+         )
+         return stream
+     else:
+         chat_mode["mode"] = "image"
+         base64_image = encode_image(image_path=image_path)
+         client = openai.OpenAI(api_key=API_KEY)
+         stream = client.chat.completions.create(
+             model="gpt-4o",
+             messages=[{"role": "system", "content": "You are a helpful assistant called 'chimera'."},
+                       {"role": "user", "content": [
+                           {"type": "text", "text": input_prompt},
+                           {"type": "image_url", "image_url": {
+                               "url": f"data:image/jpeg;base64,{base64_image}"
+                           }}
+                       ]}],
+             stream=True,
+         )
+         return stream
+
+ # Function that takes a text prompt and generates an image with the SDXL base and refiner pipelines
+ @spaces.GPU(duration=120)
+ def diffusing(prompt: str,
+               n_steps: int,
+               denoising: float):
+     """
+     Runs the prompt through the base pipeline up to the chosen denoising fraction,
+     then hands the latents to the refiner to produce the final image.
+     """
+
+     # Generate latents from the text prompt with the base model
+     image_base = base(
+         prompt=prompt,
+         num_inference_steps=n_steps,
+         denoising_end=denoising,
+         output_type="latent"
+     ).images
+
+     # Refine the latents into the final image
+     image = refiner(
+         prompt=prompt,
+         num_inference_steps=n_steps,
+         denoising_start=denoising,
+         image=image_base
+     ).images[0]
+
+     return image
+
+ def check_cuda_availability():
+     if torch.cuda.is_available():
+         return f"GPU: {torch.cuda.get_device_name(0)}"
+     else:
+         return "No CUDA device found."
+
+ # Image created from diffusing
+ image_created = {}
+
+ @spaces.GPU(duration=120)
+ def bot_comms(message, history):
+     """
+     Handles communication between Gradio and the models.
+     """
+
+     # ensures message is a dictionary
+     if not isinstance(message, dict):
+         message = {"text": message}
+
+     if message["text"] == "check cuda":
+         yield check_cuda_availability()
+         return
+
+     buffer = ""
+     gpt_outputs = []
+     stream = generation(message, history)
+
+     for chunk in stream:
+         if chunk.choices[0].delta.content is not None:
+             text = chunk.choices[0].delta.content
+             if text:
+                 gpt_outputs.append(text)
+                 buffer += text
+                 yield "".join(gpt_outputs)
+
+ chat_input = gr.MultimodalTextbox(interactive=True, file_types=["images"], placeholder="Enter your question or upload an image.", show_label=False)
+
+ with gr.Blocks(fill_height=True) as demo:
+     with gr.Row():
+         # Diffusing
+         with gr.Column():
+             gr.Markdown(DESCRIPTION)
+             image_prompt = gr.Textbox(label="Image Prompt")
+             output_image = gr.Image(label="Generated Image")
+             generate_image_button = gr.Button("Generate Image")
+             # generate_image_button.click(fn=diffusing, inputs=image_prompt, outputs=output_image)
+             with gr.Accordion(label="⚙️ Parameters", open=False):
+                 steps_slider = gr.Slider(
+                     minimum=20,
+                     maximum=100,
+                     step=1,
+                     value=40,
+                     label="Number of Inference Steps"
+                 )
+                 denoising_slider = gr.Slider(
+                     minimum=0.0,
+                     maximum=1.0,
+                     step=0.1,
+                     value=0.8,
+                     label="High Noise Fraction"
+                 )
+             generate_image_button.click(
+                 fn=diffusing,
+                 inputs=[image_prompt, steps_slider, denoising_slider],
+                 outputs=output_image
+             )
+         with gr.Column():
+             # GPT-3.5
+             gr.Markdown('''
+             <div>
+             <h1 style="text-align: center;">Smart Reader</h1>
+             <p style="text-align: center;">This contains a Generative LLM from <a href="https://openai.com/"><b>OpenAI</b></a> called GPT-3.5-Turbo and Vision.</p>
+             <p style="text-align: center;">For instructions on how to use the models, <a href="https://huggingface.co/spaces/sandz7/chimera/blob/main/README.md"><b>view this</b></a></p>
+             </div>
+             ''')
+             chat = gr.ChatInterface(fn=bot_comms,
+                                     multimodal=True,
+                                     textbox=chat_input)
+
+ demo.launch()
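
For reference, a minimal sketch of exercising the image path above outside the Gradio UI, assuming the base and refiner pipelines have loaded on a CUDA device; the prompt text, parameter values, and output filename are illustrative only:

# Hypothetical direct call, mirroring what the "Generate Image" button wires up:
# 40 inference steps, with the base handing its latents to the refiner at 80% of the schedule.
image = diffusing("a watercolor painting of a lighthouse at dawn", n_steps=40, denoising=0.8)
image.save("lighthouse.png")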
steps.txt CHANGED
@@ -1,2 +1,3 @@
- > Add an LLM with a multimodal model to understand images and text, in order to pass the base prompt with instructions to the model and make a reformatted
- version of the original prompt to pass to the text-to-speech
+ The message was passed to the API as a plain string even when it was sent as an image to be encoded; it needs to reach the API in a form it can understand
+
+ > Use OpenAI Vision instead, since the content in the message was being misinterpreted
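
For reference, a minimal sketch of the request shape this note points to, using the OpenAI Python client as in app.py; the file name and prompt are placeholders. The image cannot go in as a raw string: it is base64-encoded and wrapped in an image_url content part so the vision model treats it as an image rather than text:

import base64
import os
import openai

client = openai.OpenAI(api_key=os.getenv('OPEN_AI_API_KEY'))

# Placeholder image file; encode it as base64 for the data URI
with open("page.jpg", "rb") as f:
    encoded = base64.b64encode(f.read()).decode("utf-8")

response = client.chat.completions.create(
    model="gpt-4o",
    messages=[{"role": "user", "content": [
        {"type": "text", "text": "Describe this page."},
        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encoded}"}},
    ]}],
)
print(response.choices[0].message.content)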