theArijitDas committed on
Commit
517a261
·
verified ·
1 Parent(s): 875978a

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +12 -0
  2. description_validator.py +65 -0
  3. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from description_validator import Description_Validator
2
+ import gradio as gr
3
+
4
# Load the embedding model once at start-up so every request reuses it.
description_validator = Description_Validator(model_name="DistilRoBERTa-v1")

# Two free-text inputs (the descriptions to compare) and one numeric output
# (their cosine similarity as returned by `similarity_score`).
iface = gr.Interface(
    fn=description_validator.similarity_score,
    inputs=["text", "text"],
    outputs="number",
    title="Product Description Similarity Calculator",
    description="Enter two product descriptions to calculate their similarity.",
)

# Only start the web server when this file is executed as a script, so that
# importing the module (tests, tooling) does not block on a running server.
if __name__ == "__main__":
    iface.launch()
description_validator.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sentence_transformers import SentenceTransformer
2
+ from transformers import AutoTokenizer
3
+ from sklearn.metrics.pairwise import cosine_similarity
4
+ import numpy as np
5
+
6
+ from warnings import filterwarnings
7
# Silence every warning raised after this point (blanket filter).
filterwarnings("ignore")

# Registry of the supported sentence-embedding models, keyed by the short
# name used throughout this module. Each entry records:
#   model_size - size label of the model as a human-readable string
#   model_url  - model identifier passed to the model/tokenizer loaders
#   efficiency - qualitative speed label
#   chunk_size - number of tokens per chunk when splitting long texts
models_info = {
    "MPNet-base-v2": {
        "model_size": "420MB",
        "model_url": "sentence-transformers/all-mpnet-base-v2",
        "efficiency": "Moderate",
        "chunk_size": 512,
    },
    "DistilRoBERTa-v1": {
        "model_size": "263MB",
        "model_url": "sentence-transformers/all-distilroberta-v1",
        "efficiency": "High",
        "chunk_size": 512,
    },
    "MiniLM-L12-v2": {
        "model_size": "118MB",
        "model_url": "sentence-transformers/all-MiniLM-L12-v2",
        "efficiency": "High",
        "chunk_size": 512,
    },
    "MiniLM-L6-v2": {
        "model_size": "82MB",
        "model_url": "sentence-transformers/all-MiniLM-L6-v2",
        "efficiency": "Very High",
        "chunk_size": 512,
    },
}

# Short names of all supported models, in registry (insertion) order.
models = list(models_info)
36
+
37
class Description_Validator:
    """Score how semantically similar two product descriptions are.

    Each description is tokenized, split into fixed-size token chunks,
    every chunk is embedded with a sentence-transformer model, and the chunk
    embeddings are averaged into one vector per description. The score is
    the cosine similarity of the two averaged vectors.
    """

    def __init__(self, model_name=None):
        """Load the model and tokenizer registered under *model_name*.

        Args:
            model_name: Key of the module-level ``models_info`` registry.
                ``None`` selects ``"DistilRoBERTa-v1"``.

        Raises:
            KeyError: If *model_name* is not a key of ``models_info``.
        """
        if model_name is None:
            model_name = "DistilRoBERTa-v1"
        if model_name not in models_info:
            raise KeyError(
                f"Unknown model {model_name!r}; choose one of {list(models_info)}"
            )

        self.model_info = models_info[model_name]
        model_url = self.model_info["model_url"]

        self.model = SentenceTransformer(model_url)
        self.tokenizer = AutoTokenizer.from_pretrained(model_url)
        self.chunk_size = self._effective_chunk_size(self.model_info["chunk_size"])

    def _effective_chunk_size(self, configured):
        """Clamp *configured* so that a chunk fits the model's input window.

        ``SentenceTransformer.encode`` truncates inputs longer than the
        model's ``max_seq_length`` and adds special tokens itself, so a chunk
        larger than ``max_seq_length - special tokens`` would silently lose
        its tail. When the model reports no limit the configured value is
        returned unchanged.
        """
        limit = getattr(self.model, "max_seq_length", None)
        if not limit:
            return configured
        reserved = self.tokenizer.num_special_tokens_to_add(pair=False)
        return max(1, min(configured, limit - reserved))

    def tokenize_and_chunk(self, text):
        """Return the token ids of *text* split into ``chunk_size`` pieces.

        Special tokens are not added here; they are added later when each
        chunk is encoded. ``None`` is treated like an empty string. Returns
        an empty list when the text produces no tokens.
        """
        if text is None:
            text = ""
        tokens = self.tokenizer(
            text, truncation=False, add_special_tokens=False
        )["input_ids"]
        step = self.chunk_size
        return [tokens[start:start + step] for start in range(0, len(tokens), step)]

    def get_average_embedding(self, text):
        """Return the mean of the embeddings of all chunks of *text*.

        A text that yields no tokens (empty, whitespace-only or ``None``)
        is embedded as the empty string, so the result is always a finite
        1-D vector instead of the NaN that averaging zero chunks produces.
        """
        token_chunks = self.tokenize_and_chunk(text)
        if not token_chunks:
            return np.asarray(self.model.encode("", show_progress_bar=False))

        # Chunks are decoded back to text because ``encode`` takes strings;
        # the model re-tokenizes them, which may not round-trip exactly.
        chunk_texts = [self.tokenizer.decode(chunk) for chunk in token_chunks]
        # One batched call instead of one model call per chunk.
        chunk_embeddings = self.model.encode(chunk_texts, show_progress_bar=False)
        return np.mean(chunk_embeddings, axis=0)

    def similarity_score(self, desc1, desc2):
        """Return the cosine similarity of *desc1* and *desc2* as a float."""
        embedding1 = self.get_average_embedding(desc1).reshape(1, -1)
        embedding2 = self.get_average_embedding(desc2).reshape(1, -1)
        similarity = cosine_similarity(embedding1, embedding2)
        # Convert the NumPy scalar to a built-in float for callers that
        # serialize or type-check the result.
        return float(similarity[0][0])
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ sentence_transformers
2
+ transformers
3
+ scikit-learn
4
+ numpy==1.25.2
5
+ gradio