theArijitDas
committed on
Upload 3 files
Browse files- app.py +12 -0
- description_validator.py +65 -0
- requirements.txt +5 -0
app.py
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from description_validator import Description_Validator
|
2 |
+
import gradio as gr
|
3 |
+
|
4 |
+
# One shared validator instance backs every request made through the UI.
description_validator = Description_Validator(model_name="DistilRoBERTa-v1")

# Two free-text inputs in, one numeric similarity score out.
iface = gr.Interface(
    fn=description_validator.similarity_score,
    inputs=["text", "text"],
    outputs="number",
    title="Product Description Similarity Calculator",
    description="Enter two product descriptions to calculate their similarity.",
)

# Start the Gradio app.
iface.launch()
|
description_validator.py
ADDED
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from sentence_transformers import SentenceTransformer
|
2 |
+
from transformers import AutoTokenizer
|
3 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
4 |
+
import numpy as np
|
5 |
+
|
6 |
+
from warnings import filterwarnings
|
7 |
+
filterwarnings("ignore")
|
8 |
+
|
9 |
+
# (short name, model size, model URL/repo id, efficiency label, chunk size)
_MODEL_TABLE = (
    ("MPNet-base-v2", "420MB", "sentence-transformers/all-mpnet-base-v2", "Moderate", 512),
    ("DistilRoBERTa-v1", "263MB", "sentence-transformers/all-distilroberta-v1", "High", 512),
    ("MiniLM-L12-v2", "118MB", "sentence-transformers/all-MiniLM-L12-v2", "High", 512),
    ("MiniLM-L6-v2", "82MB", "sentence-transformers/all-MiniLM-L6-v2", "Very High", 512),
)

# Per-model settings keyed by short name; "chunk_size" is the number of
# tokens per chunk used when long text is split before embedding.
models_info = {
    short_name: {
        "model_size": model_size,
        "model_url": model_url,
        "efficiency": efficiency,
        "chunk_size": chunk_size,
    }
    for short_name, model_size, model_url, efficiency, chunk_size in _MODEL_TABLE
}

# Short model names, in the same order as the table above.
models = list(models_info)
|
36 |
+
|
37 |
+
class Description_Validator:
    """Score the similarity of two texts with sentence embeddings.

    Each text is tokenized, split into fixed-size token chunks, every chunk
    is embedded, and the chunk embeddings are averaged into one vector per
    text. The score is the cosine similarity of the two averaged vectors.
    """

    def __init__(self, model_name=None):
        """Load the embedding model and tokenizer for *model_name*.

        Args:
            model_name: A key of ``models_info``. ``None`` selects
                ``"DistilRoBERTa-v1"``.

        Raises:
            KeyError: If *model_name* is not a key of ``models_info``.
        """
        if model_name is None:
            model_name = "DistilRoBERTa-v1"

        try:
            self.model_info = models_info[model_name]
        except KeyError:
            # Same exception type as a plain lookup, but with a usable message.
            raise KeyError(
                f"Unknown model {model_name!r}; expected one of {list(models_info)}"
            ) from None
        model_url = self.model_info["model_url"]

        self.model = SentenceTransformer(model_url)
        self.tokenizer = AutoTokenizer.from_pretrained(model_url)
        self.chunk_size = self._effective_chunk_size(self.model_info["chunk_size"])

    def _effective_chunk_size(self, configured):
        """Clamp *configured* so one chunk plus special tokens fits the model."""
        limit = self.model.get_max_seq_length()
        if not limit:
            return max(1, configured)
        # Chunks are produced without special tokens, but ``encode`` adds them
        # back and truncates at the model limit, so leave room for them.
        room = limit - self.tokenizer.num_special_tokens_to_add(pair=False)
        return max(1, min(configured, room))

    def tokenize_and_chunk(self, text):
        """Split *text* into consecutive lists of at most ``chunk_size`` token ids.

        Returns:
            A list of token-id lists; empty when *text* yields no tokens.
        """
        # No truncation: the whole text must be covered by the chunks.
        # Special tokens are left out so they do not end up mid-text.
        token_ids = self.tokenizer(
            text, truncation=False, add_special_tokens=False
        )["input_ids"]
        size = self.chunk_size
        return [token_ids[start:start + size] for start in range(0, len(token_ids), size)]

    def get_average_embedding(self, text):
        """Return the mean of the chunk embeddings of *text* as a 1-D array.

        ``None`` is treated as an empty string. Text that yields no tokens is
        embedded as-is, so the result is always a finite vector rather than NaN.
        """
        text = "" if text is None else str(text)
        # NOTE: decoding and re-encoding a chunk may shift token counts slightly.
        chunk_texts = [
            self.tokenizer.decode(chunk) for chunk in self.tokenize_and_chunk(text)
        ]
        if not chunk_texts:
            chunk_texts = [text]
        # Encode all chunks in one batched call.
        chunk_embeddings = self.model.encode(chunk_texts, show_progress_bar=False)
        return np.mean(np.asarray(chunk_embeddings), axis=0)

    def similarity_score(self, desc1, desc2):
        """Return the cosine similarity of *desc1* and *desc2* as a float."""
        embedding1 = self.get_average_embedding(desc1).reshape(1, -1)
        embedding2 = self.get_average_embedding(desc2).reshape(1, -1)
        similarity = cosine_similarity(embedding1, embedding2)
        # Convert the NumPy scalar to a built-in float for callers and UIs.
        return float(similarity[0][0])
|
requirements.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
sentence_transformers
|
2 |
+
transformers
|
3 |
+
scikit-learn
|
4 |
+
numpy==1.25.2
|
5 |
+
gradio
|