modify to cpu

inference.py (CHANGED, +12 -4)
@@ -21,13 +21,16 @@ def load_model(
         Returns:
             Tuple[FastLanguageModel, any]: Tuple containing the model and tokenizer
         """
+    device = torch.device("cpu")
+
     model_name = "lora_model"
 
     tokenizer = AutoTokenizer.from_pretrained(model_name)
 
     model = AutoModelForCausalLM.from_pretrained(
-        pretrained_model_name_or_path=model_name
-    )
+        pretrained_model_name_or_path=model_name,
+        torch_dtype=torch.float32  # Use float32 for CPU
+    ).to(device)
 
     model.eval()  # Set model to evaluation mode
 
@@ -80,11 +83,16 @@ def generate_response(
         Returns:
             str: Generated response
         """
+
+    device = torch.device("cpu")
+
     # text_streamer = TextStreamer(tokenizer, skip_prompt=skip_prompt)
-    inputs = tokenizer(inputs, return_tensors="pt").to(
+    inputs = tokenizer(inputs, return_tensors="pt").to(device)
     outputs = model.generate(
         **inputs,
-        max_length=2000
+        max_length=2000,
+        # num_return_sequences=1,
+        # do_sample=False  # Deterministic generation
         # streamer=text_streamer,
         # max_new_tokens=max_new_tokens,
         # use_cache=True,
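For readability, here is a sketch of what the two functions look like after this commit. The diff only shows the function bodies, so the imports, signatures, and return statements below are assumptions, as is the torch.no_grad() guard; they are illustrative, not taken verbatim from the Space.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

def load_model():
    """Load the model and tokenizer for CPU inference (signature assumed)."""
    device = torch.device("cpu")

    model_name = "lora_model"

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    model = AutoModelForCausalLM.from_pretrained(
        pretrained_model_name_or_path=model_name,
        torch_dtype=torch.float32  # Use float32 for CPU
    ).to(device)

    model.eval()  # Set model to evaluation mode
    return model, tokenizer

def generate_response(model, tokenizer, inputs):
    """Tokenize the prompt on CPU and return the generated text (signature assumed)."""
    device = torch.device("cpu")

    inputs = tokenizer(inputs, return_tensors="pt").to(device)
    with torch.no_grad():  # no gradients needed at inference time (my addition, not in the diff)
        outputs = model.generate(
            **inputs,
            max_length=2000,
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

Pinning torch_dtype to float32 is the key part of the change: checkpoints saved in float16 for GPU use run slowly on CPU, and some CPU kernels do not support half precision at all, so casting up front avoids runtime dtype errors.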