RinInori committed
Commit 370703b
1 Parent(s): e9fdaca

Update app.py

Files changed (1)
  1. app.py +120 -42
app.py CHANGED
@@ -2,54 +2,132 @@ import torch
from peft import PeftModel
import transformers
import gradio as gr
- from transformers import GenerationConfig, LlamaForCausalLM, LlamaTokenizer
- from transformers import Trainer
+
+ assert (
+     "LlamaTokenizer" in transformers._import_structure["models.llama"]
+ ), "LLaMA is now in HuggingFace's main branch.\nPlease reinstall it: pip uninstall transformers && pip install git+https://github.com/huggingface/transformers.git"
+ from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig
+
+ tokenizer = LlamaTokenizer.from_pretrained("decapoda-research/llama-7b-hf")

BASE_MODEL = "TheBloke/vicuna-7B-1.1-HF"
+ LORA_WEIGHTS = "RinInori/vicuna_finetuned_6_sentiments"  # Fine-tuned Alpaca model for sentiment analysis

- model = LlamaForCausalLM.from_pretrained(
-     BASE_MODEL,
-     torch_dtype=torch.float16,
-     load_in_8bit=True,
-     device_map = "auto",
-     offload_folder="./cache",
- )
+ if torch.cuda.is_available():
+     device = "cuda"
+ else:
+     device = "cpu"
+ try:
+     if torch.backends.mps.is_available():
+         device = "mps"
+ except:
+     pass
+
+ if device == "cuda":
+     model = LlamaForCausalLM.from_pretrained(
+         BASE_MODEL,
+         load_in_8bit=False,
+         torch_dtype=torch.float16,
+         device_map="auto",
+     )
+     model = PeftModel.from_pretrained(
+         model, LORA_WEIGHTS, torch_dtype=torch.float16, force_download=True
+     )
+ elif device == "mps":
+     model = LlamaForCausalLM.from_pretrained(
+         BASE_MODEL,
+         device_map={"": device},
+         torch_dtype=torch.float16,
+     )
+     model = PeftModel.from_pretrained(
+         model,
+         LORA_WEIGHTS,
+         device_map={"": device},
+         torch_dtype=torch.float16,
+     )
+ else:
+     model = LlamaForCausalLM.from_pretrained(
+         BASE_MODEL, device_map={"": device}, low_cpu_mem_usage=True
+     )
+     model = PeftModel.from_pretrained(
+         model,
+         LORA_WEIGHTS,
+         device_map={"": device},
+     )

- tokenizer = LlamaTokenizer.from_pretrained(BASE_MODEL)
- tokenizer.pad_token_id = 0
- tokenizer.padding_side = "left"

- def format_prompt(prompt: str) -> str:
-     return f"### Human: {prompt}\n### Assistant:"
+ def generate_prompt(instruction, input=None):
+     if input:
+         return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
+ ### Instruction:
+ {instruction}
+ ### Input:
+ {input}
+ ### Response:"""
+     else:
+         return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
+ ### Instruction:
+ {instruction}
+ ### Response:"""

- generation_config = GenerationConfig(
+ if device != "cpu":
+     model.half()
+ model.eval()
+ if torch.__version__ >= "2":
+     model = torch.compile(model)
+
+
+ def evaluate(
+     instruction,
+     input=None,
+     temperature=0.1,
+     top_p=0.75,
+     top_k=40,
+     num_beams=4,
    max_new_tokens=128,
-     temperature=0.2,
-     repetition_penalty=1.0,
- )
+     **kwargs,
+ ):
+     prompt = generate_prompt(instruction, input)
+     inputs = tokenizer(prompt, return_tensors="pt")
+     input_ids = inputs["input_ids"].to(device)
+     generation_config = GenerationConfig(
+         temperature=temperature,
+         top_p=top_p,
+         top_k=top_k,
+         num_beams=num_beams,
+         **kwargs,
+     )
+     with torch.no_grad():
+         generation_output = model.generate(
+             input_ids=input_ids,
+             generation_config=generation_config,
+             return_dict_in_generate=True,
+             output_scores=True,
+             max_new_tokens=max_new_tokens,
+         )
+     s = generation_output.sequences[0]
+     output = tokenizer.decode(s)
+     return output.split("### Response:")[1].strip()

- def generate_text(prompt: str):
-     formatted_prompt = format_prompt(prompt)
-
-     inputs = tokenizer(
-         formatted_prompt,
-         padding=False,
-         add_special_tokens=False,
-         return_tensors="pt"
-     ).to(model.device)
-
-     with torch.inference_mode():
-         tokens = model.generate(**inputs, generation_config=generation_config)
-
-     response = tokenizer.decode(tokens[0], skip_special_tokens=True)
-     assistant_index = response.find("### Assistant:") + len("### Assistant:")
-     return response[assistant_index:].strip()
-
- iface = gr.Interface(
-     fn=generate_text,
-     inputs="text",
-     outputs="text",
-     title="Chatbot",
-     description="This vicuna app is using this model: https://huggingface.co/TheBloke/vicuna-7B-1.1-HF"
+ g = gr.Interface(
+     fn=evaluate,
+     inputs=[
+         gr.inputs.Textbox(
+             label="Instruction",
+             placeholder="Type your instruction here...",
+             lines=3
+         ),
+         gr.inputs.Textbox(
+             label="Input",
+             placeholder="Type additional input here...",
+             lines=3
+         ),
+     ],
+     outputs=gr.outputs.Textbox(label="Output"),
+     title="Instruction-based Text Generation",
+     description="Enter an instruction and optional input, and the model will generate a response based on the instruction.",
+     theme="default",
)
- iface.launch()
+
+ if __name__ == "__main__":
+     g.launch()
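
As a quick reference for how the added pieces fit together, below is a minimal, self-contained sketch of the new prompt template and the "### Response:" post-processing used by evaluate(). The instruction text and the mocked model output are illustrative only; the real app feeds the prompt through the LoRA-adapted Vicuna model loaded above.

# Illustrative sketch only: mirrors the prompt template and post-processing
# added in app.py, with a mocked model output instead of a real generate() call.

def generate_prompt(instruction, input=None):
    if input:
        return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
{instruction}
### Input:
{input}
### Response:"""
    return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
{instruction}
### Response:"""

# Hypothetical example inputs (not part of the commit).
prompt = generate_prompt(
    "Classify the sentiment of this sentence.",
    "The service was slow but the food was excellent.",
)

# model.generate() returns the prompt followed by the completion;
# the app keeps only the text after the "### Response:" marker.
mock_output = prompt + " mixed, leaning positive"
print(mock_output.split("### Response:")[1].strip())  # -> "mixed, leaning positive"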