import os
import csv
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
# Synthetic sentiment dataset generator.
#
# Loads a causal language model from a local directory, asks it to write
# short articles with a requested sentiment, and appends each generated
# article together with its integer label to a CSV file.

# Local directory containing the model weights and tokenizer files.
model_path = "C:\\models\\llama-3-8b-Instruct-bnb-4bit"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

# Total number of samples to generate in one run.
num_samples = 100000
# CSV file that receives the (text, label) rows; rows are appended.
output_file = 'raw_data.csv'

# Integer class label -> sentiment wording inserted into the prompt.
sentiment_labels = {
    0: "very positive",
    1: "positive",
    2: "somewhat positive",
    3: "neutral",
    4: "somewhat negative",
    5: "negative",
    6: "very negative",
}

# The header is written only when starting a fresh file (missing or empty),
# so re-running the script keeps appending to existing data without
# inserting a second header row.
needs_header = (
    not os.path.exists(output_file) or os.path.getsize(output_file) == 0
)

# Open the CSV once for the whole run instead of once per sample.
with open(output_file, 'a', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    if needs_header:
        writer.writerow(['text', 'label'])
        file.flush()

    for i in range(num_samples):
        # Cycle through the labels so every class gets an equal share.
        label = i % len(sentiment_labels)
        sentiment = sentiment_labels[label]
        prompt = (
            "Generate a short article on a random topic and writing style, "
            f"ensuring the sentiment is {sentiment}. "
            "Write nothing but the article text. "
            "Do not include the sentiment in the text of the article."
        )
        print(f"Generating sample {i+1}/{num_samples}: {prompt}")

        # Calling the tokenizer (rather than .encode) also yields the
        # attention mask; both tensors are moved to the model's device so
        # generation does not fail on a CPU/GPU mismatch.
        inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
        input_ids = inputs['input_ids']

        # NOTE: max_length counts the prompt tokens as well as the newly
        # generated ones (see the transformers generation documentation).
        output = model.generate(
            input_ids,
            attention_mask=inputs['attention_mask'],
            max_length=200,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            temperature=0.7,
        )

        # Drop the prompt by token count, not by character count: the
        # decoded prompt is not guaranteed to match the original string
        # character for character, so slicing the decoded text by
        # len(prompt) can cut in the wrong place.
        prompt_token_count = input_ids.shape[-1]
        generated_ids = output[0][prompt_token_count:]
        new_tokens = tokenizer.decode(
            generated_ids, skip_special_tokens=True
        ).strip()

        writer.writerow([new_tokens, label])
        # Flush after every row so already-generated samples are not lost
        # if the long-running job is interrupted.
        file.flush()

print(f"Data generation completed. Data appended to {output_file}")