from transformers import BertTokenizer, BertForMaskedLM
import torch
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
import numpy as np
import matplotlib.pyplot as plt
# Earlier iteration of this script, kept for reference: it evaluated the model with
# sentence-level precision/recall/F1 and a confusion matrix instead of match buckets.
# # Step 1: Prepare the dataset
# # Load your training and validation datasets
# def read_data(file_path):
#     with open(file_path, 'r', encoding='utf-8') as file:
#         data = file.readlines()
#     return data
# src_train = read_data('src_train.txt')  # Original sentences for training
# tgt_train = read_data('tgt_train.txt')  # Corresponding simplified sentences for training
# src_valid = read_data('src_valid.txt')  # Original sentences for validation
# tgt_valid = read_data('tgt_valid.txt')  # Corresponding simplified sentences for validation
# # Step 2: Fine-tune the BERT model
# tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-french-wwm-cased')
# model = BertForMaskedLM.from_pretrained('dbmdz/bert-base-french-wwm-cased')
# # Fine-tune the model on your training dataset
# # You need to define the training loop here
# # Step 3: Evaluate the model
# def evaluate_model(model, tokenizer, src_valid, tgt_valid):
#     predicted_sentences = []
#     true_labels = []
#     for src_sentence, tgt_sentence in zip(src_valid, tgt_valid):
#         # Tokenize and get predictions
#         tokenized_sentence = tokenizer.encode(src_sentence, return_tensors='pt')
#         with torch.no_grad():
#             outputs = model(tokenized_sentence)
#         predictions = outputs.logits[0].argmax(dim=-1).cpu().numpy()
#         # Decode predicted sentence
#         predicted_sentence = tokenizer.decode(predictions, skip_special_tokens=True)
#         # Append to lists
#         predicted_sentences.append(predicted_sentence)
#         true_labels.append(tgt_sentence)
#     # Calculate evaluation metrics (each distinct sentence is treated as a class
#     # label, which is crude: any non-identical prediction counts as a miss)
#     precision = precision_score(true_labels, predicted_sentences, average='weighted')
#     recall = recall_score(true_labels, predicted_sentences, average='weighted')
#     f1 = f1_score(true_labels, predicted_sentences, average='weighted')
#     # Create confusion matrix
#     labels = np.unique(true_labels)
#     cm = confusion_matrix(true_labels, predicted_sentences, labels=labels)
#     return precision, recall, f1, cm, labels, predicted_sentences
# # The result is bound to `cm` rather than `confusion_matrix` so the sklearn function
# # is not shadowed; `labels` and `predicted_sentences` are returned because the
# # analysis and plotting code below needs them at module level.
# precision, recall, f1, cm, labels, predicted_sentences = evaluate_model(model, tokenizer, src_valid, tgt_valid)
# print("Precision:", precision)
# print("Recall:", recall)
# print("F1 Score:", f1)
# print("Confusion Matrix:")
# print(cm)
# # Step 4: Analyze the results
# # Bucket sentences by word-level match percentage (>70%, >50%, <20%)
# def match_percentage(sentence1, sentence2):
#     words1, words2 = sentence1.split(), sentence2.split()
#     if not words1:
#         return 0.0
#     common = sum(1 for x, y in zip(words1, words2) if x == y)
#     return common / len(words1)
# matches_70 = 0
# matches_50 = 0
# matches_20 = 0
# for pred, true in zip(predicted_sentences, tgt_valid):
#     percentage = match_percentage(pred, true)
#     if percentage > 0.7:
#         matches_70 += 1
#     if percentage > 0.5:
#         matches_50 += 1
#     if percentage < 0.2:
#         matches_20 += 1
# print("Number of sentences with >70% match:", matches_70)
# print("Number of sentences with >50% match:", matches_50)
# print("Number of sentences with <20% match:", matches_20)
# # Save confusion matrix as image
# plt.figure(figsize=(8, 6))
# plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
# plt.title('Confusion Matrix')
# plt.colorbar()
# tick_marks = np.arange(len(labels))
# plt.xticks(tick_marks, labels, rotation=45)
# plt.yticks(tick_marks, labels)
# plt.xlabel('Predicted Label')
# plt.ylabel('True Label')
# plt.tight_layout()
# plt.savefig('confusion_matrix.png')
# Step 1: Prepare the dataset
# Load your training and validation datasets
def read_data(file_path):
    # One sentence per line; strip the trailing newline so exact-match comparisons work
    with open(file_path, 'r', encoding='utf-8') as file:
        return [line.strip() for line in file]

def read_picto_ids(file_path):
    # One whitespace-separated list of integer pictogram IDs per line
    with open(file_path, 'r', encoding='utf-8') as file:
        return [list(map(int, line.split())) for line in file]
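# Example: a picto-ID line such as "12 345 6789" is parsed into [12, 345, 6789]
# (one list of integer pictogram identifiers per sentence).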
src_train = read_data(r'C:\Users\LENOVO\Downloads\ToPicto\data\Text-to-Picto\train_files\src_train.txt') # File containing original sentences for training
tgt_train = read_data(r'C:\Users\LENOVO\Downloads\ToPicto\data\Text-to-Picto\train_files\tgt_train.txt') # File containing corresponding simplified sentences for training
picto_train = read_picto_ids(r'C:\Users\LENOVO\Downloads\ToPicto\data\Text-to-Picto\train_files\picto_id_train.txt') # File containing picto IDs for training
src_valid = read_data(r'C:\Users\LENOVO\Downloads\ToPicto\data\Text-to-Picto\valid_files\src_valid.txt') # File containing original sentences for validation
tgt_valid = read_data(r'C:\Users\LENOVO\Downloads\ToPicto\data\Text-to-Picto\valid_files\tgt_valid.txt') # File containing corresponding simplified sentences for validation
picto_valid = read_picto_ids(r'C:\Users\LENOVO\Downloads\ToPicto\data\Text-to-Picto\valid_files\picto_id_valid.txt') # File containing picto IDs for validation
# Now src_train, tgt_train, and picto_train are lists containing the sentences and picto IDs from the files.
# Step 2: Fine-tune the model on the training data (a minimal loop is sketched below)
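# A minimal masked-language-model fine-tuning sketch (an assumption, not a prescribed
# training loop): random 15% masking via transformers' DataCollatorForLanguageModeling
# plus AdamW. Hyperparameters (epochs, batch_size, lr) are illustrative, not tuned.
from torch.utils.data import DataLoader
from transformers import DataCollatorForLanguageModeling

def fine_tune_mlm(model, tokenizer, sentences, epochs=1, batch_size=8, lr=5e-5):
    # Tokenize the whole corpus up front and pad to a common length (fine for a sketch)
    encodings = tokenizer(sentences, truncation=True, padding=True)
    examples = [{'input_ids': ids} for ids in encodings['input_ids']]
    # The collator masks 15% of tokens and builds the matching labels tensor
    collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)
    loader = DataLoader(examples, batch_size=batch_size, shuffle=True, collate_fn=collator)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    model.train()
    for _ in range(epochs):
        for batch in loader:
            optimizer.zero_grad()
            outputs = model(**batch)  # batch carries masked input_ids and labels
            outputs.loss.backward()
            optimizer.step()
    model.eval()
    return model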
# Step 3: Evaluate the model
def evaluate_model(model, tokenizer, src_valid, tgt_valid, picto_valid):
    predicted_sentences = []
    true_labels = []
    for src_sentence, tgt_sentence in zip(src_valid, tgt_valid):
        # Tokenize and take the most likely token at every position
        tokenized_sentence = tokenizer.encode(src_sentence, return_tensors='pt', truncation=True)
        with torch.no_grad():
            outputs = model(tokenized_sentence)
        predictions = outputs.logits[0].argmax(dim=-1).cpu().numpy()
        # Decode the predicted sentence
        predicted_sentence = tokenizer.decode(predictions, skip_special_tokens=True)
        predicted_sentences.append(predicted_sentence)
        true_labels.append(tgt_sentence)
    # Bucket sentences by word-level match percentage against the reference.
    # The model predicts text rather than pictogram IDs, so the comparison is done on
    # words; picto_valid stays in the signature for a future ID-level metric.
    accuracies = {"100%": 0, "70%": 0, "50%": 0, "20%": 0}
    for pred, true in zip(predicted_sentences, true_labels):
        if pred == true:
            accuracies["100%"] += 1
            continue
        pred_words, true_words = pred.split(), true.split()
        if not pred_words or not true_words:
            continue
        match_count = sum(1 for x, y in zip(pred_words, true_words) if x == y)
        match_percentage = match_count / max(len(pred_words), len(true_words))
        if match_percentage >= 0.7:
            accuracies["70%"] += 1
        elif match_percentage >= 0.5:
            accuracies["50%"] += 1
        elif match_percentage >= 0.2:
            accuracies["20%"] += 1
    return accuracies
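# Worked example of the bucketing: pred "je veux manger" vs. true "je veux dormir"
# match at 2 of 3 word positions, a 0.67 match, so the sentence lands in the "50%" bucket.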
# Load a French masked LM for evaluation. CamembertForMaskedLM is used (rather than
# the bare CamembertModel) because evaluate_model reads outputs.logits from an LM head.
# You can replace this checkpoint with any other CamemBERT model,
# e.g. "camembert-base" or "camembert/camembert-large".
from transformers import CamembertForMaskedLM, CamembertTokenizer
tokenizer = CamembertTokenizer.from_pretrained("camembert/camembert-base-wikipedia-4gb")
model = CamembertForMaskedLM.from_pretrained("camembert/camembert-base-wikipedia-4gb")
model.eval()
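# Optionally fine-tune before evaluating (see the fine_tune_mlm sketch in Step 2).
# Which split to train on depends on the task framing, so the call is left commented:
# model = fine_tune_mlm(model, tokenizer, tgt_train)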
accuracies = evaluate_model(model, tokenizer, src_valid, tgt_valid, picto_valid)
print("Accuracies based on picto IDs:")
print(accuracies)
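# Usage note: the raw counts can also be reported as fractions of the validation set.
total = len(src_valid)
print({bucket: count / total for bucket, count in accuracies.items()})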