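# Synthetic sentiment dataset generator: prompts a local Llama 3 model to
# write short articles at seven sentiment levels and appends each article,
# together with its integer label, to a CSV file.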
import os
import csv

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

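# Local 4-bit quantized Llama 3 8B Instruct checkpoint. Loading a bnb-4bit
# model requires the bitsandbytes package; device_map="auto" additionally
# assumes accelerate is installed and lets it place the weights on the GPU
# if one is available.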
model_path = "C:\\models\\llama-3-8b-Instruct-bnb-4bit"
|
|
tokenizer = AutoTokenizer.from_pretrained(model_path)
|
|
model = AutoModelForCausalLM.from_pretrained(model_path)
num_samples = 100000
output_file = 'raw_data.csv'

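# Seven-point sentiment scale. The loop below cycles through these labels in
# order, so the finished dataset is class-balanced across all seven classes.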
sentiment_labels = {
    0: "very positive",
    1: "positive",
    2: "somewhat positive",
    3: "neutral",
    4: "somewhat negative",
    5: "negative",
    6: "very negative",
}

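# Write the CSV header only once; because samples are appended below, the
# script can be stopped and re-run without clobbering earlier output.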
if not os.path.exists(output_file):
    with open(output_file, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['text', 'label'])

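# Main generation loop: one article per iteration.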
for i in range(num_samples):
    label = i % len(sentiment_labels)
    sentiment = sentiment_labels[label]

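    # The prompt is sent as plain text. Llama 3 Instruct checkpoints are
    # normally driven through tokenizer.apply_chat_template; the raw-string
    # prompt kept here is simpler but may follow instructions less reliably.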
    prompt = (
        "Generate a short article on a random topic and writing style, "
        f"ensuring the sentiment is {sentiment}. Write nothing but the "
        "article text. Do not include the sentiment in the text of the article."
    )
    print(f"Generating sample {i+1}/{num_samples}: {prompt}")

    # Move the prompt tokens to the same device as the model weights.
    input_ids = tokenizer.encode(prompt, return_tensors='pt').to(model.device)

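    # Sample up to 200 new tokens with top-k/top-p sampling. pad_token_id is
    # pinned to EOS because the Llama tokenizer defines no pad token, which
    # would otherwise trigger a warning from generate().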
    with torch.inference_mode():
        output = model.generate(
            input_ids,
            max_new_tokens=200,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated tokens, skipping the echoed prompt;
    # slicing at the token level is more reliable than slicing the decoded
    # string by len(prompt), which can drift when decoding is not exact.
    new_tokens = tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True).strip()

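    # Append each sample as soon as it is generated so that progress is
    # preserved if the long-running loop is interrupted.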
    with open(output_file, 'a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow([new_tokens, label])

print(f"Data generation completed. Data appended to {output_file}")