gaurav0026 committed on
Commit
4a9b381
·
verified ·
1 Parent(s): 3ebd95a

upload model

Browse files
Files changed (1) hide show
  1. app.py +119 -0
app.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import T5ForConditionalGeneration, T5Tokenizer, AutoModel, AutoTokenizer
2
+ import torch
3
+ from sklearn.metrics.pairwise import cosine_similarity
4
+ import numpy as np
5
+ import gradio as gr
6
+ from collections import Counter
7
+ import pandas as pd
8
+
9
+
10
+
11
+
12
# Pick the compute device once; both models below are moved onto it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# T5 paraphrase checkpoint, paired with the stock t5-base tokenizer.
model = T5ForConditionalGeneration.from_pretrained('ramsrigouthamg/t5_paraphraser').to(device)
tokenizer = T5Tokenizer.from_pretrained('t5-base')

# MiniLM sentence encoder and its tokenizer, used for semantic similarity scoring.
embed_tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
embed_model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2').to(device)
23
+
24
def get_sentence_embedding(sentence):
    """Return a mean-pooled sentence embedding from the MiniLM encoder.

    The text is tokenized with ``embed_tokenizer``, run through
    ``embed_model`` with gradient tracking disabled, and the final-layer
    token vectors are averaged into a single vector per input row.

    Args:
        sentence: The text to embed. A list of strings is also accepted by
            the tokenizer, in which case one row is returned per string.

    Returns:
        A tensor on ``device`` holding the attention-mask-weighted mean of
        ``last_hidden_state`` along the token dimension.
    """
    inputs = embed_tokenizer(
        sentence,
        return_tensors="pt",
        padding=True,
        truncation=True,  # keep over-long text within the encoder's position limit
    ).to(device)

    with torch.no_grad():
        token_states = embed_model(**inputs).last_hidden_state

    # Average over real tokens only. A plain ``.mean(dim=1)`` would also count
    # padding positions, skewing every shorter sentence in a batched call.
    # For a single sentence the mask is all ones, so the result is unchanged.
    mask = inputs["attention_mask"].unsqueeze(-1).to(token_states.dtype)
    summed = (token_states * mask).sum(dim=1)
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return summed / token_counts
30
+
31
def paraphrase_sentence(sentence):
    """Generate five sampled paraphrases of ``sentence``.

    The sentence is prefixed with a statement-style instruction, encoded with
    the T5 tokenizer, and passed to ``model.generate`` with top-k / top-p
    sampling.

    Args:
        sentence: The input text to paraphrase.

    Returns:
        A single string with one paraphrase per line, each prefixed by its
        1-based index in the form ``"<n>. <paraphrase>"``.
    """
    # Instruction prefix asking for statement-like output.
    prompt = "rephrase as a statement: " + sentence
    encoded = tokenizer.encode_plus(prompt, padding=False, return_tensors="pt")
    ids = encoded["input_ids"].to(device)
    mask = encoded["attention_mask"].to(device)

    sampling_options = dict(
        do_sample=True,
        max_length=128,
        top_k=40,  # narrower candidate pool for less randomness
        top_p=0.85,  # tighter nucleus for more focused sampling
        early_stopping=True,
        num_return_sequences=5,  # five paraphrases per call
    )
    sequences = model.generate(input_ids=ids, attention_mask=mask, **sampling_options)

    # Decode every generated sequence, then number the results from 1.
    decoded = [
        tokenizer.decode(seq, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        for seq in sequences
    ]
    return "\n".join(f"{rank}. {text}" for rank, text in enumerate(decoded, 1))
56
+
57
def _strip_numbering(line):
    """Drop a leading ``"<n>. "`` list marker from ``line`` when one is present."""
    marker, separator, remainder = line.partition(". ")
    if separator and marker.isdigit():
        return remainder
    # No numeric marker: keep the text intact instead of raising or cutting
    # the line at the first sentence boundary.
    return line


def calculate_precision_recall_accuracy(sentences):
    """Print and return average similarity, precision and recall of paraphrases.

    For every sentence, paraphrases are generated with ``paraphrase_sentence``
    and each one is compared with the original in two ways:

    * cosine similarity between the two ``get_sentence_embedding`` vectors
      (reported as "accuracy");
    * token-overlap precision and recall, where tokens are the lower-cased,
      whitespace-split words (punctuation stays attached to its word).

    Args:
        sentences: Iterable of input sentences to evaluate.

    Returns:
        A tuple ``(overall_accuracy, avg_precision, avg_recall)`` of
        percentages. ``(0.0, 0.0, 0.0)`` is returned when no paraphrase was
        evaluated (for example, when ``sentences`` is empty).
    """
    similarity_sum = 0.0
    precision_sum = 0.0
    recall_sum = 0.0
    paraphrase_count = 0

    for sentence in sentences:
        numbered_lines = paraphrase_sentence(sentence).split("\n")

        # Per-sentence invariants, computed once rather than per paraphrase.
        original_embedding = get_sentence_embedding(sentence).cpu()
        original_tokens = Counter(sentence.lower().split())
        original_total = sum(original_tokens.values())

        for line in numbered_lines:
            # Remove the "<n>. " numbering before evaluation.
            paraphrase = _strip_numbering(line)

            paraphrase_embedding = get_sentence_embedding(paraphrase).cpu()
            similarity = cosine_similarity(original_embedding, paraphrase_embedding)[0][0]
            similarity_sum += float(similarity)

            # Multiset intersection counts each shared token at most as often
            # as it appears in both texts.
            paraphrase_tokens = Counter(paraphrase.lower().split())
            paraphrase_total = sum(paraphrase_tokens.values())
            overlap = sum((paraphrase_tokens & original_tokens).values())

            precision_sum += overlap / paraphrase_total if paraphrase_total else 0
            recall_sum += overlap / original_total if original_total else 0
            paraphrase_count += 1

    if paraphrase_count == 0:
        # Nothing to average; avoid dividing by zero.
        print("No paraphrases were evaluated; metrics are unavailable.")
        return 0.0, 0.0, 0.0

    overall_accuracy = (similarity_sum / paraphrase_count) * 100
    avg_precision = (precision_sum / paraphrase_count) * 100
    avg_recall = (recall_sum / paraphrase_count) * 100

    print(f"Overall Model Accuracy (Semantic Similarity): {overall_accuracy:.2f}%")
    print(f"Average Precision (Token Overlap): {avg_precision:.2f}%")
    print(f"Average Recall (Token Overlap): {avg_recall:.2f}%")

    return overall_accuracy, avg_precision, avg_recall
96
+
97
# Gradio UI: one text box in, the numbered paraphrases out.
iface = gr.Interface(
    fn=paraphrase_sentence,
    inputs="text",
    outputs="text",
    title="PARA-GEN (T5 Paraphraser)",
    description="Enter a sentence, and the model will generate five numbered paraphrases in statement form.",
)

# Sentences used for the console-only metric report.
test_sentences = [
    "The quick brown fox jumps over the lazy dog.",
    "Artificial intelligence is transforming industries.",
    "The weather is sunny and warm today.",
    "He enjoys reading books on machine learning.",
    "The stock market fluctuates daily due to various factors.",
]

# Only evaluate and serve when executed as a script, so importing this module
# does not trigger generation runs or start a blocking web server.
if __name__ == "__main__":
    # Print overall accuracy, precision, and recall for the test sentences.
    # These metrics go to the console and are not shown in the Gradio UI.
    calculate_precision_recall_accuracy(test_sentences)

    # Launch the Gradio app without a public share link.
    iface.launch(share=False)