# Swahili sentence-similarity demo (Gradio + SentenceTransformer).
# Snapshot metadata carried over from the file viewer this source was copied
# from: file size 4,909 bytes; apparent commit ids 50fdd96, 062d5b7, 8e1de57.
from functools import lru_cache

import gradio as gr
import numpy as np
from sentence_transformers import SentenceTransformer, util

# Available models, as (Hugging Face owner, repository name) pairs.
# The repository name doubles as the display name shown to the user.
_MODEL_REPOS = (
    ("sartifyllc", "AviLaBSE"),
    ("sartifyllc", "MultiLinguSwahili-serengeti-E250-nli-matryoshka"),
    ("Mollel", "MultiLinguSwahili-bert-base-sw-cased-nli-matryoshka"),
    ("sartifyllc", "swahili-paraphrase-multilingual-mpnet-base-v2-nli-matryoshka"),
    ("sartifyllc", "bge-base-swahili-matryoshka"),
    ("sartifyllc", "MultiLinguSwahili-bge-small-en-v1.5-nli-matryoshka"),
)

# Maps display name -> full "owner/repository" model id, in the order above.
model_dict = {repo: f"{owner}/{repo}" for owner, repo in _MODEL_REPOS}

# Function to load the selected model.
# The cache is bounded so that switching between models does not keep every
# model resident in memory at once; repeated requests for the same model
# reuse the already-constructed instance instead of rebuilding it per call.
@lru_cache(maxsize=2)
def load_model(model_name):
    """Return the SentenceTransformer registered under *model_name*.

    Args:
        model_name: A key of ``model_dict`` (the display name of the model).

    Returns:
        The ``SentenceTransformer`` built from ``model_dict[model_name]``.
        The same instance is returned for repeated calls while it remains
        in the cache.

    Raises:
        KeyError: If *model_name* is not a key of ``model_dict``.
    """
    return SentenceTransformer(model_dict[model_name])

def _is_blank(text):
    """Return True when *text* is None, empty, or whitespace-only."""
    return text is None or not str(text).strip()


# Function to compute sentence-level (and, for two sentences, word-level) similarity
def predict(model_name, mode, sentence1, sentence2=None, sentence3=None, sentence4=None, dimension="64"):
    """Compute cosine similarities between sentences on truncated embeddings.

    Args:
        model_name: Name passed to ``load_model`` to obtain the encoder.
        mode: ``"Compare one to three"`` compares ``sentence1`` against
            ``sentence2``..``sentence4``; any other value compares
            ``sentence1`` with ``sentence2``. Word-level scores are added
            only when mode is exactly ``"Compare two sentences"``.
        sentence1: Reference sentence.
        sentence2: Second sentence (required in every mode).
        sentence3: Third sentence (required in one-to-three mode).
        sentence4: Fourth sentence (required in one-to-three mode).
        dimension: Number of leading embedding components to keep, as an
            int or a string convertible with ``int()``.

    Returns:
        A dict with the keys ``"Selected Dimension"``, ``"Input Sentences"``
        and ``"Similarity Scores"`` (plus ``"Word-level Similarities"`` in
        two-sentence mode). When a required sentence is missing, a tuple
        ``(message, {})`` is returned instead.

    Raises:
        ValueError: If *dimension* is not an integer or is smaller than 1.
    """
    dimension = int(dimension)
    if dimension < 1:
        # A zero or negative slice bound would silently produce empty or
        # oddly truncated vectors instead of the requested prefix.
        raise ValueError(f"dimension must be a positive integer, got {dimension}")

    # Validate before loading the model so a bad request is rejected cheaply.
    # Empty text inputs arrive as "" rather than None, so blank strings are
    # treated as missing as well.
    one_to_three = mode == "Compare one to three"
    if one_to_three:
        if any(_is_blank(s) for s in (sentence2, sentence3, sentence4)):
            return "Please provide three sentences for comparison.", {}
        sentences = [sentence1, sentence2, sentence3, sentence4]
    else:
        if _is_blank(sentence2):
            return "Please provide the second sentence for comparison.", {}
        sentences = [sentence1, sentence2]

    model = load_model(model_name)
    result = {
        "Selected Dimension": dimension,
        "Input Sentences": {
            "Sentence 1": sentence1,
            "Sentence 2": sentence2,
            "Sentence 3": sentence3,
            "Sentence 4": sentence4,
        },
        "Similarity Scores": {},
    }

    # Keep only the leading `dimension` components of every embedding.
    embeddings = model.encode(sentences)[..., :dimension]

    if one_to_three:
        # Row 0 holds sentence1 versus each of the three candidates.
        scores = util.cos_sim(embeddings[0], embeddings[1:])[0]
        result["Similarity Scores"] = {
            f"Sentence {number}": float(score)
            for number, score in enumerate(scores, start=2)
        }
    else:
        result["Similarity Scores"] = {
            "Similarity Score": float(util.cos_sim(embeddings[0], embeddings[1]))
        }

    # Word-level similarity
    if mode == "Compare two sentences":
        words1 = sentence1.split()
        words2 = sentence2.split()
        word_similarities = {}
        if words1 and words2:
            word_embeddings1 = model.encode(words1)[..., :dimension]
            word_embeddings2 = model.encode(words2)[..., :dimension]
            # One matrix call instead of one cos_sim call per word pair;
            # entry [i, j] compares words1[i] with words2[j].
            pairwise = util.cos_sim(word_embeddings1, word_embeddings2)
            word_similarities = {
                f"{w1} - {w2}": float(pairwise[i, j])
                for i, w1 in enumerate(words1)
                for j, w2 in enumerate(words2)
            }
        result["Word-level Similarities"] = word_similarities

    return result

# Define inputs and outputs for Gradio interface.
# The component order must match predict()'s positional parameters
# (model_name, mode, sentence1..sentence4, dimension) and the column order
# of every row in `examples`.
model_name = gr.Dropdown(choices=list(model_dict.keys()), value="AviLaBSE", label="Model")
mode_input = gr.Radio(
    choices=["Compare two sentences", "Compare one to three"],
    value="Compare two sentences",
    label="Mode",
)
sentence1_input = gr.Textbox(lines=2, placeholder="Enter the first sentence here...", label="Sentence 1")
sentence2_input = gr.Textbox(lines=2, placeholder="Enter the second sentence here...", label="Sentence 2 (or first of three for mode)")
sentence3_input = gr.Textbox(lines=2, placeholder="Enter the third sentence here...", label="Sentence 3")
sentence4_input = gr.Textbox(lines=2, placeholder="Enter the fourth sentence here...", label="Sentence 4")
# NOTE(review): the offered sizes are an assumption (only "64" appears in the
# examples) - confirm against the dimensions the listed models were trained on.
dimension_input = gr.Dropdown(
    choices=["768", "512", "256", "128", "64"],
    value="64",
    label="Embedding Dimension",
)

inputs = [
    model_name,
    mode_input,
    sentence1_input,
    sentence2_input,
    sentence3_input,
    sentence4_input,
    dimension_input,
]
outputs = gr.JSON(label="Detailed Similarity Scores")

# Each row: [model, mode, sentence 1, sentence 2, sentence 3, sentence 4, dimension].
examples = [
    ["MultiLinguSwahili-serengeti-E250-nli-matryoshka", "Compare one to three", "Mtoto mdogo anaruka mikononi mwa mwanamke aliyevalia suti nyeusi ya kuogelea akiwa kwenye dimbwi.", "Mtoto akiruka mikononi mwa mwanamke aliyevalia suti ya kuogelea kwenye dimbwi.", "Mama na binti wakinunua viatu.", "Mtu anashindana katika mashindano ya mbio.", "64"],
    ["MultiLinguSwahili-serengeti-E250-nli-matryoshka", "Compare two sentences", "Mwanamume na mwanamke wachanga waliovaa mikoba wanaweka au kuondoa kitu kutoka kwenye mti mweupe wa zamani, huku watu wengine wamesimama au wameketi nyuma.", "tai huruka", None, None, "64"]
]

# Create Gradio interface
gr.Interface(
    fn=predict,
    title="Swahili Sentence Similarity with Matryoshka Model",
    description="Compute the semantic similarity between Swahili sentences using various SentenceTransformer models.",
    inputs=inputs,
    examples=examples,
    outputs=outputs,
    cache_examples=False,
    article="Author: Michael Mollel. Model from Hugging Face Hub (sartify.com): [Swahili-Nli-Matryoshka](https://huggingface.co/sartifyllc/MultiLinguSwahili-serengeti-E250-nli-matryoshka)",
).launch(debug=True, share=True)