import gradio as gr
import threading
import codecs
#from ast import literal_eval
from datetime import datetime
import os

os.environ['TRANSFORMERS_CACHE'] = '/data/.modelcache/huggingface/hub/'
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:516"

from transformers import BloomTokenizerFast
from petals.client import DistributedBloomForCausalLM
import torch
import gc

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
TORCH_DTYPE = torch.bfloat16
MODEL_NAMES = ["bigscience/bloom-petals", "bigscience/bloomz-petals"]

# Shared state: per-model (tokenizer, model) pairs and the latest generations.
models = {}
output = {}


def gen_thread(model_name, inputs, max_new_tokens, min_length, temperature, top_p, repetition_penalty):
    """Run generation for a single model and store the decoded completion in `output`."""
    global output
    n_input_tokens = inputs.shape[1]
    outputs = models[model_name][1].generate(inputs,
                                             max_new_tokens=max_new_tokens,
                                             min_length=min_length,
                                             do_sample=True,
                                             temperature=temperature,
                                             top_p=top_p,
                                             repetition_penalty=repetition_penalty)
    output[model_name] = models[model_name][0].decode(outputs[0, n_input_tokens:])


def to_md(text):
    # Render newlines as HTML line breaks for Markdown output.
    return text.replace("\n", "<br>")
") def infer( prompt, min_length=2, max_new_tokens=10, temperature=0.1, top_p=1.0, repetition_penalty = 1.0, stop="\n", num_completions=1, seed=42, ): #gc.collect() #torch.cuda.empty_cache() if not models: for model_name in MODEL_NAMES: tokenizer = BloomTokenizerFast.from_pretrained(model_name) model = DistributedBloomForCausalLM.from_pretrained(model_name, torch_dtype=TORCH_DTYPE) model = model.to(DEVICE) models[model_name] = tokenizer, model max_new_tokens = int(max_new_tokens) num_completions = int(num_completions) temperature = float(temperature) top_p = float(top_p) stop = stop.split(";") repetition_penalty = float(repetition_penalty) seed = seed assert 1 <= max_new_tokens <= 384 assert 0 <= min_length <= max_new_tokens assert 1 <= num_completions <= 5 assert 0.0 <= temperature <= 1.0 assert 0.0 <= top_p <= 1.0 assert 0.9 <= repetition_penalty <= 3.0 if temperature == 0.0: temperature = 0.01 if prompt == "": prompt = " " threads = list() print(f"START -> ({datetime.now()})\n") print(f"PROMPT ({datetime.now()}):\n-------\n{prompt}\n") for model_name in MODEL_NAMES: inputs = models[model_name][0](prompt, return_tensors="pt")["input_ids"].to(DEVICE) x = threading.Thread(target=gen_thread, args=(model_name, inputs, max_new_tokens, min_length, temperature, top_p, repetition_penalty)) threads.append(x) x.start() #n_input_tokens = inputs.shape[1] # outputs = models[model_name][1].generate(inputs, # max_new_tokens=max_new_tokens, # min_length=min_length, # do_sample=True, # temperature=temperature, # top_p=top_p, # repetition_penalty=repetition_penalty # ) #output[model_name] = models[model_name][0].decode(outputs[0, n_input_tokens:]) #output[model_name] = outputs[len(prompt):] # Join Threads for model_name, thread in enumerate(threads): print(f"waiting on: {model_name}\n") thread.join() print(f"{model_name} thread done\n") for model_name in MODEL_NAMES: stop = codecs.getdecoder("unicode_escape")(stop[0])[0] stop = [x.strip(' ') for x in stop.split(',')] for stop_word in stop: if stop_word != '' and stop_word in output[model_name]: output[model_name] = output[model_name][:output[model_name].find(stop_word)] print(f"--- START: {model_name} --- \n{output[model_name]}\n--- END {model_name} ---\n\n") print(f"DONE -> ({datetime.now()})\n") return output[MODEL_NAMES[0]], output[MODEL_NAMES[1]] examples = [ [ # Question Answering '''Please answer the following question: Question: What is the capital of Germany? Answer:''', 1, 3, 0.2, 1.0, 1.0, "\\n,"], [ # Natural Language Interface '''Given a pair of sentences, choose whether the two sentences agree (entailment)/disagree (contradiction) with each other. Possible labels: 1. entailment 2. contradiction Sentence 1: The skier was on the edge of the ramp. Sentence 2: The skier was dressed in winter clothes. Label: entailment Sentence 1: The boy skated down the staircase railing. Sentence 2: The boy is a newbie skater. Label: contradiction Sentence 1: Two middle-aged people stand by a golf hole. Sentence 2: A couple riding in a golf cart. 
Label:''', 1, 2, 0.2, 1.0, 1.0, "\\n,"] ] def main(): iface = gr.Interface( fn=infer, allow_flagging="never", inputs=[ gr.Textbox(lines=20), # prompt gr.Slider(0, 256, value=1), #min_length gr.Slider(1, 384, value=20), # max_tokens gr.Slider(0.0, 1.0, value=0.2), # temperature gr.Slider(0.0, 1.0, value=0.9), # top_p gr.Slider(0.9, 3.0, value=1.0), # repetition penalty gr.Textbox(lines=1, value="\\n,") # stop ], outputs=[gr.Textbox(lines=7, label="BLOOM OUTPUT:"), gr.Textbox(lines=7,label="BLOOMZ OUTPUT:")], examples=examples, cache_examples=True, title="BLOOM vs BLOOMZ", description='''

Compare outputs of the BLOOM and BLOOMZ 176 billion parameter models using the [Petals](https://petals.ml/) network. Please consider joining the Petals network to help speed up inference.

Big thanks to [RFTCapital](https://www.rftcapital.com) for providing initial compute resources.

'''
    )
    iface.launch(debug=True, share=False)


if __name__ == '__main__':
    main()