Caslow committed
Commit d177a85 · 1 Parent(s): c49fe71

modify to cpu

Files changed (1): inference.py +12 -4
inference.py CHANGED

@@ -21,13 +21,16 @@ def load_model(
     Returns:
         Tuple[FastLanguageModel, any]: Tuple containing the model and tokenizer
     """
+    device = torch.device("cpu")
+
     model_name = "lora_model"
 
     tokenizer = AutoTokenizer.from_pretrained(model_name)
 
     model = AutoModelForCausalLM.from_pretrained(
-        pretrained_model_name_or_path=model_name
-    )
+        pretrained_model_name_or_path=model_name,
+        torch_dtype=torch.float32  # Use float32 for CPU
+    ).to(device)
 
     model.eval()  # Set model to evaluation mode
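Applied to the file, this first hunk leaves load_model looking roughly like the sketch below. The imports, the function signature, and the return statement are assumptions (the hunk starts inside the docstring and ends before the return); the transformers calls themselves match the diff.

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

def load_model():  # signature assumed; the diff does not show it
    device = torch.device("cpu")

    model_name = "lora_model"

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # float32 is the safe dtype on CPU, where half-precision kernels are limited
    model = AutoModelForCausalLM.from_pretrained(
        pretrained_model_name_or_path=model_name,
        torch_dtype=torch.float32,
    ).to(device)

    model.eval()  # Set model to evaluation mode
    return model, tokenizer  # return assumed from the docstring's Tuple[...]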
 
@@ -80,11 +83,16 @@ def generate_response(
     Returns:
         str: Generated response
     """
+
+    device = torch.device("cpu")
+
     # text_streamer = TextStreamer(tokenizer, skip_prompt=skip_prompt)
-    inputs = tokenizer(inputs, return_tensors="pt").to("cuda" if torch.cuda.is_available() else "cpu")
+    inputs = tokenizer(inputs, return_tensors="pt").to(device)
     outputs = model.generate(
         **inputs,
-        max_length=2000
+        max_length=2000,
+        # num_return_sequences=1,
+        # do_sample=False  # Deterministic generation
         # streamer=text_streamer,
         # max_new_tokens=max_new_tokens,
         # use_cache=True,
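For context, a minimal sketch of generate_response after this second hunk. The signature, the no_grad wrapper, and the decode step are assumptions here; the hunk ends before the function returns.

import torch

def generate_response(model, tokenizer, inputs):  # signature assumed
    device = torch.device("cpu")

    # Tokenize the prompt and keep the tensors on CPU
    inputs = tokenizer(inputs, return_tensors="pt").to(device)

    with torch.no_grad():  # assumption: skip gradient tracking for inference
        outputs = model.generate(
            **inputs,
            max_length=2000,
        )

    # Decode step assumed; the diff does not show the return
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

Note that max_length=2000 caps prompt plus generated tokens combined, whereas the commented-out max_new_tokens alternative would cap only the newly generated tokens.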