DR-Rakshitha committed on
Commit 0c8c4f5
1 Parent(s): 7217acf

Update app.py

Files changed (1)
  1. app.py +110 -18
app.py CHANGED
@@ -1,26 +1,118 @@
- import gradio as gr
- from transformers import AutoModelForCausalLM, AutoTokenizer

- # # Specify the directory containing the model and tokenizer
- # model_name = "gpt4all"  # Make sure this matches the actual model directory
- # model_path = f"./"  # Path to the model directory

- # # Initialize the GPT-4 model and tokenizer
- # model = AutoModelForCausalLM.from_pretrained(model_path)
- # tokenizer = AutoTokenizer.from_pretrained(model_path)

- from gpt4all import GPT4All
- model = GPT4All("wizardlm-13b-v1.1-superhot-8k.ggmlv3.q4_0.bin")
- # output = model.generate("How to go to the hospital?")
- # print(output)

- def generate_text(input_text):
-     # input_ids = tokenizer(input_text, return_tensors="pt").input_ids
-     # generated_ids = model.generate(input_ids)
-     # generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

-     output = model.generate(input_text)
-     return output

+ # import gradio as gr
+ # from transformers import AutoModelForCausalLM, AutoTokenizer

+ # from gpt4all import GPT4All
+ # model = GPT4All("wizardlm-13b-v1.1-superhot-8k.ggmlv3.q4_0.bin")

+ #-------------------------------------------------------------------------------
+ !pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7
+ import os
+ import torch
+ from datasets import load_dataset
+ from transformers import (
+     AutoModelForCausalLM,
+     AutoTokenizer,
+     BitsAndBytesConfig,
+     HfArgumentParser,
+     TrainingArguments,
+     pipeline,
+     logging,
+ )
+ from peft import LoraConfig, PeftModel
+ from trl import SFTTrainer
+ # ------------------------------------------------------------------------------

+ # LoRA attention dimension
+ lora_r = 64

+ # Alpha parameter for LoRA scaling
+ lora_alpha = 16

+ # Dropout probability for LoRA layers
+ lora_dropout = 0.1

+ ################################################################################
+ # bitsandbytes parameters
+ ################################################################################

+ # Activate 4-bit precision base model loading
+ use_4bit = True

+ # Compute dtype for 4-bit base models
+ bnb_4bit_compute_dtype = "float16"

+ # Quantization type (fp4 or nf4)
+ bnb_4bit_quant_type = "nf4"

+ # Activate nested quantization for 4-bit base models (double quantization)
+ use_nested_quant = False

+ # Load the entire model on the GPU 0
+ device_map = {"": 0}

+ #-------------------------------------------------------------------------------
+ model_name = "DR-DRR/Model_001"
+ model_basename = "pytorch_model-00001-of-00002.bin"  # the model is in bin format

+ #-------------------------------------------------------------------------------

+ # Load tokenizer and model with QLoRA configuration
+ compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

+ bnb_config = BitsAndBytesConfig(
+     load_in_4bit=use_4bit,
+     bnb_4bit_quant_type=bnb_4bit_quant_type,
+     bnb_4bit_compute_dtype=compute_dtype,
+     bnb_4bit_use_double_quant=use_nested_quant,
+ )

+ # Check GPU compatibility with bfloat16
+ if compute_dtype == torch.float16 and use_4bit:
+     major, _ = torch.cuda.get_device_capability()
+     if major >= 8:
+         print("=" * 80)
+         print("Your GPU supports bfloat16: accelerate training with bf16=True")
+         print("=" * 80)

+ # Load base model
+ model = AutoModelForCausalLM.from_pretrained(
+     model_name,
+     quantization_config=bnb_config,
+     device_map=device_map
+ )
+ model.config.use_cache = False
+ model.config.pretraining_tp = 1

+ # Load LLaMA tokenizer
+ tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+ tokenizer.pad_token = tokenizer.eos_token
+ tokenizer.padding_side = "right"  # Fix weird overflow issue with fp16 training

+ # Load LoRA configuration
+ peft_config = LoraConfig(
+     lora_alpha=lora_alpha,
+     lora_dropout=lora_dropout,
+     r=lora_r,
+     bias="none",
+     task_type="CAUSAL_LM",
+ )

+ #-------------------------------------------------------------------------------
+ # Ignore warnings
+ logging.set_verbosity(logging.CRITICAL)

+ # Run text generation pipeline with our next model
+ # prompt = "What is a large language model?"
+ # pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
+ # result = pipe(f"<s>[INST] {prompt} [/INST]")
+ # print(result[0]['generated_text'])


+ def generate_text(prompt):
+     # output = model.generate(input_text)
+     pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
+     result = pipe(f"<s>[INST] {prompt} [/INST]")
+     return result

  text_generation_interface = gr.Interface(
      fn=generate_text,
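
Note on the added code: the diff is truncated at the gr.Interface( call, the new file leaves "import gradio as gr" commented out even though gr.Interface is still called, and the "!pip install ..." line is notebook shell syntax that raises a SyntaxError when app.py runs as a plain Python script (on Spaces those pins normally go in requirements.txt). The snippet below is a minimal sketch, not part of the commit, of how the remaining Gradio wiring around generate_text is typically completed; the input/output components, title, and launch arguments are assumptions.

# Sketch only (assumption, not taken from this commit): complete the Gradio
# wiring around the generate_text function defined in the committed file.
import gradio as gr  # required for gr.Interface; the committed file comments this out

text_generation_interface = gr.Interface(
    fn=generate_text,                      # pipeline-backed function defined above
    inputs=gr.Textbox(label="Prompt"),     # assumed: a single free-text prompt
    outputs=gr.Textbox(label="Response"),  # assumed: pipeline output rendered as text
    title="Text generation demo",          # assumed title
)

if __name__ == "__main__":
    text_generation_interface.launch()     # default host/port; share=True would expose a public link

Since the transformers text-generation pipeline returns a list of dicts, having generate_text return result[0]["generated_text"] instead of the full list would display only the generated string in the output box.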