import torch
from datasets import load_dataset
from transformers import BertModel, BertTokenizer, AdamW
import numpy as np
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import jieba

# Load the dataset.
# Label mapping: 0 joy, 1 anger, 2 disgust, 3 sadness.
# Corpus: 361744 Weibo posts total -- joy: 199496, anger: 51714,
# disgust: 55267, sadness: 55267.


class Dataset(torch.utils.data.Dataset):
    """Wrap the HuggingFace dataset stored under ``./dataset``.

    Each item is a ``(text, label)`` pair where ``text`` is the raw review
    string and ``label`` is the integer emotion class (mapping above).
    """

    def __init__(self, split):
        # `split` selects e.g. 'train' from the on-disk dataset.
        self.dataset = load_dataset(path='./dataset', split=split)

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, i):
        text = self.dataset[i]['review']
        label = self.dataset[i]['label']
        return text, label


dataset = Dataset('train')

# 80/20 random split into train / test subsets.
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(
    dataset, [train_size, test_size]
)

# Basic length statistics: print the longest sample, the mean length,
# and all lengths in descending order.
lengths = [len(s[0]) for s in dataset]
print(dataset[lengths.index(max(lengths))])
lengths = sorted(lengths, reverse=True)
print(np.mean(lengths))
print(lengths)
print('数据示例:', dataset[0], '数据条数:', len(dataset),
      '训练集:', len(train_dataset), '测试集:', len(test_dataset))

# Matplotlib setup so Chinese glyphs render correctly.
plt.rcParams["font.sans-serif"] = ['Simhei']
plt.rcParams["axes.unicode_minus"] = False

wc = WordCloud(
    # A CJK-capable font must be set, otherwise glyphs render as boxes.
    font_path=r'C:\Windows\Fonts\方正粗黑宋简体',
    # Background colour and canvas size.
    background_color='white',
    width=500,
    height=350,
    # Font-size range.
    max_font_size=50,
    min_font_size=10,
    mode='RGBA'
    # colormap='pink'
)

# Build the word-cloud corpus from the label-1 (anger) samples only.
# BUGFIX: the original appended a single generator object to a list
# (never materialized) and built the text with quadratic `n = n + ...`
# string concatenation; collect tokens in a list and join once instead.
tokens = []
for s in dataset:
    if s[1] == 1:
        for tok in jieba.lcut(s[0]):
            # Skip single characters and two hand-picked stop words.
            if len(tok) != 1 and str(tok) != '太慢' and str(tok) != '一般':
                tokens.append(str(tok))
# Leading space matches the original accumulation pattern exactly.
n = " " + " ".join(tokens) if tokens else ""

# Generate the word cloud and save it; the PNG keeps the configured
# pixel dimensions and is sharper than the matplotlib preview below.
wc.generate(n)
wc.to_file(r"wordcloud3.png")

# Preview the image in a named figure with the axes hidden.
plt.figure("jay")
plt.imshow(wc)
plt.axis("off")

# --- DataLoader setup -------------------------------------------------

# Chinese BERT tokenizer used by collate_fn below.
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='bert-base-chinese'
)


def collate_fn(data):
    """Tokenize a batch of ``(text, label)`` pairs for BERT.

    Texts are padded/truncated to 140 tokens. Returns
    ``(input_ids, attention_mask, token_type_ids, labels)`` where
    ``labels`` is a ``torch.LongTensor``.
    """
    input_sentences = [i[0] for i in data]
    labels = [i[1] for i in data]
    encoded = tokenizer.batch_encode_plus(
        batch_text_or_text_pairs=input_sentences,
        truncation=True,
        padding='max_length',
        max_length=140,
        return_tensors='pt',
        return_length=True,
    )
    input_ids = encoded['input_ids']
    attention_mask = encoded['attention_mask']
    token_type_ids = encoded['token_type_ids']
    labels = torch.LongTensor(labels)
    return input_ids, attention_mask, token_type_ids, labels


loader_train = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=100,
    collate_fn=collate_fn,
    shuffle=True,
    drop_last=True,
)
loader_test = torch.utils.data.DataLoader(
    dataset=test_dataset,
    batch_size=100,
    collate_fn=collate_fn,
    shuffle=True,
    drop_last=True,
)