Caslow committed
Commit 78ad200 · 1 Parent(s): 9efc447
Files changed (1): inference.py (+10 -9)

inference.py CHANGED
@@ -21,9 +21,12 @@ def load_model(
     Returns:
         Tuple[FastLanguageModel, any]: Tuple containing the model and tokenizer
     """
-    device = torch.device("cpu")
 
-    model_name = "lora_model"
+    try:
+        from transformers import BitsAndBytesConfig
+        bnb_config = BitsAndBytesConfig(load_in_4bit=False)
+    except ImportError:
+        bnb_config = None
 
     tokenizer = AutoTokenizer.from_pretrained(model_name)
 
@@ -31,7 +34,8 @@ def load_model(
31
  pretrained_model_name_or_path=model_name,
32
  device_map="cpu",
33
  torch_dtype=torch.float32, # Use float32 for CPU
34
- low_cpu_mem_usage=True # Helps with memory efficiency
 
35
  )
36
 
37
  model.eval() # Set model to evaluation mode
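
Taken together, the two load_model hunks above leave the function reading roughly as follows. This is a sketch reassembled from the changed lines only: the model class (AutoModelForCausalLM here), the exact signature, and the return statement are assumptions, since the diff does not show them.

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

def load_model(model_name: str):
    """
    Returns:
        Tuple[FastLanguageModel, any]: Tuple containing the model and tokenizer
    """
    # Guarded import, as in the hunk above: fall back to no quantization
    # config when BitsAndBytesConfig cannot be imported.
    try:
        from transformers import BitsAndBytesConfig
        bnb_config = BitsAndBytesConfig(load_in_4bit=False)
    except ImportError:
        bnb_config = None

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        pretrained_model_name_or_path=model_name,
        device_map="cpu",
        torch_dtype=torch.float32,  # Use float32 for CPU
        low_cpu_mem_usage=True,  # Helps with memory efficiency
        quantization_config=bnb_config,  # None when the import failed
    )
    model.eval()  # Set model to evaluation mode
    return model, tokenizer
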
@@ -91,10 +95,10 @@ def generate_response(
     # text_streamer = TextStreamer(tokenizer, skip_prompt=skip_prompt)
     inputs = tokenizer(inputs, return_tensors="pt").to(device)
     outputs = model.generate(
-        **inputs,
+        inputs,
         max_length=2000,
+        do_sample=False # Deterministic generation
         # num_return_sequences=1,
-        # do_sample=False # Deterministic generation
         # streamer=text_streamer,
         # max_new_tokens=max_new_tokens,
         # use_cache=True,
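
One detail a reviewer might flag in the generate_response hunk: the old call unpacked the tokenizer output with **inputs, so input_ids and attention_mask arrived as keyword arguments, while the new call passes the whole BatchEncoding positionally. In transformers, generate's first positional parameter is the tensor of input ids, so the positional form most likely wants the tensor itself rather than the encoding object. A hypothetical sketch of the contrast (the prompt text and the variable name enc are illustrative):

enc = tokenizer("Hello, world", return_tensors="pt")

# Old form: unpack, so input_ids and attention_mask arrive as kwargs.
outputs = model.generate(**enc, max_length=2000, do_sample=False)

# Closest positional equivalent: pass the input-ids tensor explicitly,
# not the BatchEncoding object itself.
outputs = model.generate(enc["input_ids"], max_length=2000, do_sample=False)
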
@@ -125,10 +129,7 @@ def main(
 
     # Load model
     model, tokenizer = load_model(
-        model_name=MODEL_PATH,
-        max_seq_length=max_seq_length,
-        dtype=dtype,
-        load_in_4bit=load_in_4bit
+        model_name=MODEL_PATH
     )
 
     # Prepare input
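
The final hunk trims the call in main to a single keyword argument, which implies the max_seq_length, dtype, and load_in_4bit parameters were dropped from load_model's signature. A minimal usage sketch of the new call site (the MODEL_PATH value and generate_response's signature are assumptions, not shown in this diff):

MODEL_PATH = "lora_model"  # Illustrative; the real constant is defined elsewhere in the file.

model, tokenizer = load_model(model_name=MODEL_PATH)

# Hypothetical downstream call; generate_response's parameters are assumed.
print(generate_response(model, tokenizer, "Hello!"))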
 