---
license: apache-2.0
language:
- en
- zh
base_model:
- meta-llama/Llama-3.2-3B-Instruct
library_name: transformers
tags:
- CoT
- LongCoT
- o1
pipeline_tag: text-generation
---
# Llama-3.2-3B-LongCoT
A small model with **LongCoT** capability.
![Example Image](images/example.jpg)
## Features
- Fine-tuned on high-quality synthetic data.
- Adapts automatically: the model decides whether to use LongCoT based on the complexity of the question.
- Strong at mathematics and reasoning.
## Benchmark
| Benchmark | Llama-3.2-3B-Instruct | Llama-3.2-3B-LongCoT |
|-----------|-----------------------|----------------------|
| Math | 35.5 | **52.0** |
| GSM8K | 77.3 | **82.3** |
## Inference
Example of streaming inference:
```python
import time

import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TextStreamer,
)

# Model ID from Hugging Face
model_id = "Kadins/Llama-3.2-3B-LongCoT"

# Load the pre-trained model with appropriate data type and device mapping
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,  # Use bfloat16 for optimized performance
    device_map="auto",           # Automatically map the model to available devices
)

# Load the tokenizer associated with the model
tokenizer = AutoTokenizer.from_pretrained(model_id)


def stream_chat(messages, max_new_tokens=8192, top_p=0.95, temperature=0.6):
    """
    Generates a response using streaming inference.

    Args:
        messages (list): A list of dictionaries containing the conversation prompt.
        max_new_tokens (int): Maximum number of tokens to generate.
        top_p (float): Nucleus sampling parameter for controlling diversity.
        temperature (float): Sampling temperature to control response creativity.
    """
    # Prepare the input by applying the chat template and tokenizing
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,  # Ensure the output is a dictionary
    ).to(model.device)  # Move the inputs to the same device as the model

    # Initialize the TextStreamer for real-time output
    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # Record the start time for performance measurement
    start_time = time.time()

    # Generate the response with streaming enabled
    model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        repetition_penalty=1.1,
        top_p=top_p,
        temperature=temperature,
        streamer=streamer,  # Print generated tokens as they are produced
    )

    # Calculate and print the total response time
    total_time = time.time() - start_time
    print(f"\n--- Response finished in {total_time:.2f} seconds ---")


def chat_loop():
    """
    Initiates an interactive chat session with the model.

    Continuously reads user input and generates model responses until the user exits.
    """
    while True:
        # Start each turn with a system message
        messages = [
            {"role": "system", "content": "You are a reasoning expert and helpful assistant."},
        ]

        # Prompt the user for input
        user_input = input("\nUser: ")
        if user_input.strip().lower() in ["exit", "quit"]:
            print("Exiting chat...")
            break

        # Append the user's message to the conversation
        messages.append({"role": "user", "content": user_input})

        print("Assistant: ", end="", flush=True)
        # Generate and stream the assistant's response
        stream_chat(messages)
        # Note: the assistant's reply is streamed directly to the console and is not
        # stored, so storing it in the conversation history requires additional
        # handling (see the sketch after this code block).


if __name__ == "__main__":
    # Start the interactive chat loop when the script is executed
    chat_loop()
```
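As the comment at the end of `chat_loop` notes, the streamed reply is not kept, so each turn is independent. If you want multi-turn history, one possible approach is to switch from `TextStreamer` to `TextIteratorStreamer`, run `generate` in a background thread, and collect the streamed chunks into a string that you append back to `messages`. The sketch below is not part of the original script; the helper name `stream_chat_with_history` is made up here for illustration and reuses the `model` and `tokenizer` loaded above.

```python
import threading
from transformers import TextIteratorStreamer

def stream_chat_with_history(messages, max_new_tokens=8192, top_p=0.95, temperature=0.6):
    """Stream the reply to the console and also return it as a string (sketch)."""
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(model.device)

    # TextIteratorStreamer yields decoded text chunks instead of printing them itself
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        repetition_penalty=1.1,
        top_p=top_p,
        temperature=temperature,
        streamer=streamer,
    )

    # Run generation in a background thread so the main thread can consume the stream
    thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    reply = ""
    for chunk in streamer:
        print(chunk, end="", flush=True)
        reply += chunk
    thread.join()
    print()
    return reply

# Inside the chat loop you could then keep the history across turns:
#   reply = stream_chat_with_history(messages)
#   messages.append({"role": "assistant", "content": reply})
```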