sandz7 committed on
Commit 46e059f · 1 Parent(s): 48e85e0

commit with 2 UI

Files changed (2)
  1. app.py +180 -7
  2. steps.txt +3 -2
app.py CHANGED
@@ -1,19 +1,192 @@
  import torch
- import subprocess
  import gradio as gr
- import os
  import openai
  import base64
- import numpy as np

  API_KEY = os.getenv('OPEN_AI_API_KEY')
- from TTS.api import TTS
- tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to('cuda')

  DESCRIPTION = '''
  <div>
- <h1 style="text-align: center;">Atoo 🦜</h1>
- <p style="text-align: center;">This carries a Multi-Speaker and Multi-lingual Model by <a href="https://github.com/coqui-ai/TTS"><b>coqui-ai</b></a></p>
  </div>
  '''
  import torch
+ from diffusers import DiffusionPipeline
  import gradio as gr
+ import numpy as np
  import openai
+ import os
+ import spaces
  import base64

+ # Setup logging
+ # logging.basicConfig(level=logging.DEBUG)
+ # logger = logging.getLogger(__name__)
+
+ # Retrieve the OpenAI API key from the environment
  API_KEY = os.getenv('OPEN_AI_API_KEY')

  DESCRIPTION = '''
  <div>
+ <h1 style="text-align: center;">Book-Reader</h1>
+ <p style="text-align: center;">This contains a Stable Diffusion model from <a href="https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0"><b>stabilityai/stable-diffusion-xl-base-1.0</b></a></p>
+ <p style="text-align: center;">For instructions on how to use the models, <a href="https://huggingface.co/spaces/sandz7/chimera/blob/main/README.md"><b>view this</b></a></p>
  </div>
  '''

+ # Load both the SDXL base and refiner pipelines onto the GPU
+ base = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16, use_safetensors=True, variant="fp16").to("cuda:0")
+ refiner = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-refiner-1.0",
+                                             text_encoder_2=base.text_encoder_2,
+                                             vae=base.vae,
+                                             torch_dtype=torch.float16,
+                                             use_safetensors=True,
+                                             variant="fp16").to("cuda:0")
+
+ chat_mode = {}
+
+ def encode_image(image_path):
+     chat_mode["the_mode"] = "diffusing"
+     with open(image_path, "rb") as image_file:
+         return base64.b64encode(image_file.read()).decode('utf-8')
+
+ def generation(message, history):
+     """
+     Generates a response based on the input message and optionally an image.
+     """
+     global chat_mode
+     image_path = None
+     if "files" in message and message["files"]:
+         if type(message["files"][-1]) == dict:
+             image_path = message["files"][-1]["path"]
+         else:
+             image_path = message["files"][-1]
+     else:
+         for hist in history:
+             if type(hist[0]) == tuple:
+                 image_path = hist[0][0]
+
+     input_prompt = message if isinstance(message, str) else message.get("text", "")
+
+     if image_path is None:
+         chat_mode["mode"] = "text"
+         client = openai.OpenAI(api_key=API_KEY)
+         stream = client.chat.completions.create(
+             model="gpt-3.5-turbo",
+             messages=[{"role": "system", "content": "You are a helpful assistant called 'chimera'."},
+                       {"role": "user", "content": input_prompt}],
+             stream=True,
+         )
+         return stream
+     else:
+         chat_mode["mode"] = "image"
+         base64_image = encode_image(image_path=image_path)
+         client = openai.OpenAI(api_key=API_KEY)
+         stream = client.chat.completions.create(
+             model="gpt-4o",
+             messages=[{"role": "system", "content": "You are a helpful assistant called 'chimera'."},
+                       {"role": "user", "content": [
+                           {"type": "text", "text": input_prompt},
+                           {"type": "image_url", "image_url": {
+                               "url": f"data:image/jpeg;base64,{base64_image}"
+                           }}
+                       ]}],
+             stream=True,
+         )
+         return stream
+
+ # Function that takes a text prompt and generates an image with the SDXL base and refiner pipelines
+ @spaces.GPU(duration=120)
+ def diffusing(prompt: str,
+               n_steps: int,
+               denoising: float):
+     """
+     Runs the prompt through the base pipeline up to the chosen denoising fraction,
+     then hands the latents to the refiner to produce the final image.
+     """
+
+     # Generate latents from the text prompt with the base model
+     image_base = base(
+         prompt=prompt,
+         num_inference_steps=n_steps,
+         denoising_end=denoising,
+         output_type="latent"
+     ).images
+
+     # Refine the latents into the final image
+     image = refiner(
+         prompt=prompt,
+         num_inference_steps=n_steps,
+         denoising_start=denoising,
+         image=image_base
+     ).images[0]
+
+     return image
+
+ def check_cuda_availability():
+     if torch.cuda.is_available():
+         return f"GPU: {torch.cuda.get_device_name(0)}"
+     else:
+         return "No CUDA device found."
+
+ # Image created from diffusing
+ image_created = {}
+
+ @spaces.GPU(duration=120)
+ def bot_comms(message, history):
+     """
+     Handles communication between Gradio and the models.
+     """
+
+     # ensures message is a dictionary
+     if not isinstance(message, dict):
+         message = {"text": message}
+
+     if message["text"] == "check cuda":
+         yield check_cuda_availability()
+         return
+
+     buffer = ""
+     gpt_outputs = []
+     stream = generation(message, history)
+
+     for chunk in stream:
+         if chunk.choices[0].delta.content is not None:
+             text = chunk.choices[0].delta.content
+             if text:
+                 gpt_outputs.append(text)
+                 buffer += text
+                 yield "".join(gpt_outputs)
+
+ chat_input = gr.MultimodalTextbox(interactive=True, file_types=["images"], placeholder="Enter your question or upload an image.", show_label=False)
+
+ with gr.Blocks(fill_height=True) as demo:
+     with gr.Row():
+         # Diffusing
+         with gr.Column():
+             gr.Markdown(DESCRIPTION)
+             image_prompt = gr.Textbox(label="Image Prompt")
+             output_image = gr.Image(label="Generated Image")
+             generate_image_button = gr.Button("Generate Image")
+             # generate_image_button.click(fn=diffusing, inputs=image_prompt, outputs=output_image)
+             with gr.Accordion(label="⚙️ Parameters", open=False):
+                 steps_slider = gr.Slider(
+                     minimum=20,
+                     maximum=100,
+                     step=1,
+                     value=40,
+                     label="Number of Inference Steps"
+                 )
+                 denoising_slider = gr.Slider(
+                     minimum=0.0,
+                     maximum=1.0,
+                     step=0.1,
+                     value=0.8,
+                     label="High Noise Fraction"
+                 )
+             generate_image_button.click(
+                 fn=diffusing,
+                 inputs=[image_prompt, steps_slider, denoising_slider],
+                 outputs=output_image
+             )
+         with gr.Column():
+             # GPT-3.5
+             gr.Markdown('''
+             <div>
+             <h1 style="text-align: center;">Smart Reader</h1>
+             <p style="text-align: center;">This contains a Generative LLM from <a href="https://openai.com/"><b>OpenAI</b></a> called GPT-3.5-Turbo and Vision.</p>
+             <p style="text-align: center;">For instructions on how to use the models, <a href="https://huggingface.co/spaces/sandz7/chimera/blob/main/README.md"><b>view this</b></a></p>
+             </div>
+             ''')
+             chat = gr.ChatInterface(fn=bot_comms,
+                                     multimodal=True,
+                                     textbox=chat_input)
+
+ demo.launch()
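
For reference, a minimal sketch of exercising the image path above outside the Gradio UI, assuming the base and refiner pipelines have loaded on a CUDA device; the prompt text, parameter values, and output filename are illustrative only:

# Hypothetical direct call, mirroring what the "Generate Image" button wires up:
# 40 inference steps, with the base handing its latents to the refiner at 80% of the schedule.
image = diffusing("a watercolor painting of a lighthouse at dawn", n_steps=40, denoising=0.8)
image.save("lighthouse.png")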
steps.txt CHANGED
@@ -1,2 +1,3 @@
- > Add an LLM with a multimodal model to understand images and text, in order to pass the base prompt with instructions to the model and make a reformatted
- version of the original prompt to pass to the text-to-speech
+ The message was passed to the API as a plain string even when it was sent as an image to be encoded; it needs to reach the API in a form it can understand
+
+ > Use OpenAI Vision instead, since the content in the message was being misinterpreted
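
For reference, a minimal sketch of the request shape this note points to, using the OpenAI Python client as in app.py; the file name and prompt are placeholders. The image cannot go in as a raw string: it is base64-encoded and wrapped in an image_url content part so the vision model treats it as an image rather than text:

import base64
import os
import openai

client = openai.OpenAI(api_key=os.getenv('OPEN_AI_API_KEY'))

# Placeholder image file; encode it as base64 for the data URI
with open("page.jpg", "rb") as f:
    encoded = base64.b64encode(f.read()).decode("utf-8")

response = client.chat.completions.create(
    model="gpt-4o",
    messages=[{"role": "user", "content": [
        {"type": "text", "text": "Describe this page."},
        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encoded}"}},
    ]}],
)
print(response.choices[0].message.content)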