RinInori committed
Commit 370703b
1 Parent(s): e9fdaca

Update app.py

Files changed (1)
  1. app.py +120 -42
app.py CHANGED
@@ -2,54 +2,132 @@ import torch
from peft import PeftModel
import transformers
import gradio as gr
- from transformers import GenerationConfig, LlamaForCausalLM, LlamaTokenizer
- from transformers import Trainer
+
+ assert (
+     "LlamaTokenizer" in transformers._import_structure["models.llama"]
+ ), "LLaMA is now in HuggingFace's main branch.\nPlease reinstall it: pip uninstall transformers && pip install git+https://github.com/huggingface/transformers.git"
+ from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig
+
+ tokenizer = LlamaTokenizer.from_pretrained("decapoda-research/llama-7b-hf")

BASE_MODEL = "TheBloke/vicuna-7B-1.1-HF"
+ LORA_WEIGHTS = "RinInori/vicuna_finetuned_6_sentiments"  # Fine-tuned Alpaca model for sentiment analysis

- model = LlamaForCausalLM.from_pretrained(
-     BASE_MODEL,
-     torch_dtype=torch.float16,
-     load_in_8bit=True,
-     device_map = "auto",
-     offload_folder="./cache",
- )
+ if torch.cuda.is_available():
+     device = "cuda"
+ else:
+     device = "cpu"
+ try:
+     if torch.backends.mps.is_available():
+         device = "mps"
+ except:
+     pass
+
+ if device == "cuda":
+     model = LlamaForCausalLM.from_pretrained(
+         BASE_MODEL,
+         load_in_8bit=False,
+         torch_dtype=torch.float16,
+         device_map="auto",
+     )
+     model = PeftModel.from_pretrained(
+         model, LORA_WEIGHTS, torch_dtype=torch.float16, force_download=True
+     )
+ elif device == "mps":
+     model = LlamaForCausalLM.from_pretrained(
+         BASE_MODEL,
+         device_map={"": device},
+         torch_dtype=torch.float16,
+     )
+     model = PeftModel.from_pretrained(
+         model,
+         LORA_WEIGHTS,
+         device_map={"": device},
+         torch_dtype=torch.float16,
+     )
+ else:
+     model = LlamaForCausalLM.from_pretrained(
+         BASE_MODEL, device_map={"": device}, low_cpu_mem_usage=True
+     )
+     model = PeftModel.from_pretrained(
+         model,
+         LORA_WEIGHTS,
+         device_map={"": device},
+     )

- tokenizer = LlamaTokenizer.from_pretrained(BASE_MODEL)
- tokenizer.pad_token_id = 0
- tokenizer.padding_side = "left"

- def format_prompt(prompt: str) -> str:
-     return f"### Human: {prompt}\n### Assistant:"
+ def generate_prompt(instruction, input=None):
+     if input:
+         return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
+ ### Instruction:
+ {instruction}
+ ### Input:
+ {input}
+ ### Response:"""
+     else:
+         return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
+ ### Instruction:
+ {instruction}
+ ### Response:"""

- generation_config = GenerationConfig(
+ if device != "cpu":
+     model.half()
+ model.eval()
+ if torch.__version__ >= "2":
+     model = torch.compile(model)
+
+
+ def evaluate(
+     instruction,
+     input=None,
+     temperature=0.1,
+     top_p=0.75,
+     top_k=40,
+     num_beams=4,
    max_new_tokens=128,
-     temperature=0.2,
-     repetition_penalty=1.0,
- )
+     **kwargs,
+ ):
+     prompt = generate_prompt(instruction, input)
+     inputs = tokenizer(prompt, return_tensors="pt")
+     input_ids = inputs["input_ids"].to(device)
+     generation_config = GenerationConfig(
+         temperature=temperature,
+         top_p=top_p,
+         top_k=top_k,
+         num_beams=num_beams,
+         **kwargs,
+     )
+     with torch.no_grad():
+         generation_output = model.generate(
+             input_ids=input_ids,
+             generation_config=generation_config,
+             return_dict_in_generate=True,
+             output_scores=True,
+             max_new_tokens=max_new_tokens,
+         )
+     s = generation_output.sequences[0]
+     output = tokenizer.decode(s)
+     return output.split("### Response:")[1].strip()

- def generate_text(prompt: str):
-     formatted_prompt = format_prompt(prompt)
-
-     inputs = tokenizer(
-         formatted_prompt,
-         padding=False,
-         add_special_tokens=False,
-         return_tensors="pt"
-     ).to(model.device)
-
-     with torch.inference_mode():
-         tokens = model.generate(**inputs, generation_config=generation_config)
-
-     response = tokenizer.decode(tokens[0], skip_special_tokens=True)
-     assistant_index = response.find("### Assistant:") + len("### Assistant:")
-     return response[assistant_index:].strip()
-
- iface = gr.Interface(
-     fn=generate_text,
-     inputs="text",
-     outputs="text",
-     title="Chatbot",
-     description="This vicuna app is using this model: https://huggingface.co/TheBloke/vicuna-7B-1.1-HF"
+ g = gr.Interface(
+     fn=evaluate,
+     inputs=[
+         gr.inputs.Textbox(
+             label="Instruction",
+             placeholder="Type your instruction here...",
+             lines=3
+         ),
+         gr.inputs.Textbox(
+             label="Input",
+             placeholder="Type additional input here...",
+             lines=3
+         ),
+     ],
+     outputs=gr.outputs.Textbox(label="Output"),
+     title="Instruction-based Text Generation",
+     description="Enter an instruction and optional input, and the model will generate a response based on the instruction.",
+     theme="default",
)
- iface.launch()
+
+ if __name__ == "__main__":
+     g.launch()
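
As a quick reference for how the added pieces fit together, below is a minimal, self-contained sketch of the new prompt template and the "### Response:" post-processing used by evaluate(). The instruction text and the mocked model output are illustrative only; the real app feeds the prompt through the LoRA-adapted Vicuna model loaded above.

# Illustrative sketch only: mirrors the prompt template and post-processing
# added in app.py, with a mocked model output instead of a real generate() call.

def generate_prompt(instruction, input=None):
    if input:
        return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
{instruction}
### Input:
{input}
### Response:"""
    return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
{instruction}
### Response:"""

# Hypothetical example inputs (not part of the commit).
prompt = generate_prompt(
    "Classify the sentiment of this sentence.",
    "The service was slow but the food was excellent.",
)

# model.generate() returns the prompt followed by the completion;
# the app keeps only the text after the "### Response:" marker.
mock_output = prompt + " mixed, leaning positive"
print(mock_output.split("### Response:")[1].strip())  # -> "mixed, leaning positive"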