import streamlit as st
from datasets import load_dataset
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
import torch
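# Load the school-math QA dataset and flatten it into (question, answer) pairs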
ds = load_dataset("higgsfield/school-math-questions")
qa_pairs = [(item['prompt'], item['completion']) for item in ds['train']]
class MathDataset(torch.utils.data.Dataset):
    def __init__(self, qa_pairs, tokenizer, max_length=128):
        self.qa_pairs = qa_pairs
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.qa_pairs)

    def __getitem__(self, idx):
        question, answer = self.qa_pairs[idx]
        # For causal LM fine-tuning, labels must align token-for-token with
        # the inputs, so tokenize the full "Q: ... A: ..." text as one sequence
        full_text = f"Q: {question} A: {answer.strip()}"
        input_ids = self.tokenizer.encode(
            full_text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        ).squeeze(0)
        labels = input_ids.clone()
        # Mask padding positions so they are ignored by the loss
        labels[labels == self.tokenizer.pad_token_id] = -100
        return {
            "input_ids": input_ids,
            "labels": labels,
        }
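# GPT-2 ships without a pad token, so reuse the EOS token for padding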
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained(model_name)
math_dataset = MathDataset(qa_pairs, tokenizer)
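# Each dataset item is a dict of fixed-length (max_length=128) tensors,
# which the Trainer's default collator can batch directly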
# Set training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_steps=10,
    save_total_limit=2,
)
# Create a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=math_dataset,
)
# Fine-tune the model
trainer.train()
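# Persist the fine-tuned weights and tokenizer so the chatbot below can load
# them; without this, the bot would reload the base GPT-2 checkpoint. (Note:
# Streamlit reruns the whole script on every interaction, so in practice you
# would train once offline and only load the saved model in the app.)
trainer.save_model("./results")
tokenizer.save_pretrained("./results")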
class MathChatBot:
    def __init__(self, model_name="./results"):
        # Load the fine-tuned checkpoint saved above; pass "gpt2" to fall
        # back to the base model
        self.tokenizer = GPT2Tokenizer.from_pretrained(model_name)
        self.model = GPT2LMHeadModel.from_pretrained(model_name)
        self.model.eval()

    def get_response(self, question):
        input_text = f"Q: {question} A:"
        input_ids = self.tokenizer.encode(input_text, return_tensors="pt")
        with torch.no_grad():
            output = self.model.generate(
                input_ids, max_length=50, num_return_sequences=1,
                pad_token_id=self.tokenizer.eos_token_id,
            )
        answer = self.tokenizer.decode(output[0], skip_special_tokens=True)
        # Keep only the text generated after the "A:" marker
        return answer.split("A:")[-1].strip()
# Usage
if __name__ == "__main__":
    bot = MathChatBot()
    user_input = st.text_area("Enter your question:")
    if user_input:
        response = bot.get_response(user_input)
        st.write(f"Bot: {response}")