Failed to load the model

#13
by Raulol19 - opened
🥲 Failed to load the model

Failed to load model

llama.cpp error: 'error loading model vocabulary: unknown pre-tokenizer type: 'deepseek-r1-qwen''

on LM Studio 0.3.8
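(For context: this error typically means the llama.cpp build bundled with the app predates support for that pre-tokenizer, so updating the app or its llama.cpp runtime is the usual fix. The name comes from the GGUF file's tokenizer.ggml.pre metadata key. A minimal sketch for checking that key with the gguf Python package is below; the file path is a placeholder and the field-access details can differ between gguf versions.)

from gguf import GGUFReader  # pip install gguf

reader = GGUFReader("path/to/DeepSeek-R1-Distill-Qwen-1.5B.gguf")  # placeholder path
field = reader.fields.get("tokenizer.ggml.pre")
if field is None:
    print("No tokenizer.ggml.pre key in this GGUF")
else:
    # For a plain string field, the last part holds the raw UTF-8 bytes of the value
    print(bytes(field.parts[-1]).decode("utf-8"))  # e.g. "deepseek-r1-qwen"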

If you are trying to load the DeepSeek R1 Qwen distill, this is a script I created and it works fine; I can chat with the model.

# -*- coding: utf-8 -*-
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Set the path to the locally downloaded model directory
MODEL_PATH = "F:/Deepseek1.5"  # Change this to your actual path

# Load the tokenizer from the local directory
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

# Load the model from local files
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    trust_remote_code=True  # Enables loading custom DeepSeek model code
)

# Move the model to the available device
model.to(device)

def chat_with_model(history, max_length=150, temperature=0.7, top_p=0.9, repetition_penalty=1.2):
    """Generate a chatbot response while keeping the conversation history bounded."""

    # Keep only the most recent characters of the history (prevents unbounded buildup)
    MAX_HISTORY_LENGTH = 1000  # Adjust based on available memory
    if len(history) > MAX_HISTORY_LENGTH:
        history = history[-MAX_HISTORY_LENGTH:]

    inputs = tokenizer(history, return_tensors="pt", truncation=True, max_length=4096).to(device)

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=max_length,
            do_sample=True,                         # Needed for temperature/top_p to take effect
            temperature=temperature,                # Controls randomness (higher = more creative)
            top_p=top_p,                            # Nucleus sampling for diversity
            repetition_penalty=repetition_penalty,  # Reduces repeated phrases
            pad_token_id=tokenizer.eos_token_id     # Avoids the missing-pad-token warning
        )

    # Extract only the newly generated text by skipping the prompt tokens
    new_tokens = output[0][inputs["input_ids"].shape[-1]:]
    response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

    # Stop the model from going off-track
    response = response.split("\n")[0]  # Keep only the first response line

    return response

# Interactive chat loop
if __name__ == "__main__":
    print("\n🤖 DeepSeek Chatbot: Type 'exit', 'quit', or 'bye' to end the chat.\n")

    history = ""  # Keeps the conversation history
    while True:
        user_input = input("You: ")

        if user_input.lower() in ["exit", "quit", "bye"]:
            print("DeepSeek: Goodbye! 👋")
            break

        # Append user input to history
        history += f"\nYou: {user_input}\nDeepSeek:"

        # Generate response
        response = chat_with_model(history)

        # Display response
        print(f"DeepSeek: {response}")

        # Append response to history for continuity
        history += f" {response}"
