import streamlit as st
import tensorflow as tf
import sentencepiece as spm
import numpy as np
from scipy.spatial.distance import cosine
import pandas as pd
from openTSNE import TSNE
import plotly.graph_objects as go

st.set_page_config(layout="wide")
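
# Trim Streamlit's default page padding so the tables and plots get more room.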
st.markdown("""
<style>
.block-container {
    padding-top: 1rem;
    padding-bottom: 0rem;
    padding-left: 1rem;
    padding-right: 1rem;
}
</style>
""", unsafe_allow_html=True)
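
# Load the SentencePiece tokenizer and the TFLite embedding model at module level.
# Note: Streamlit reruns the script on every interaction, so wrapping these loaders
# in @st.cache_resource is a common way to avoid reloading the model each time.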
tflite_model_path = "model.tflite"
spm_model_path = "sentencepiece.model"

sp = spm.SentencePieceProcessor()
sp.load(spm_model_path)

interpreter = tf.lite.Interpreter(model_path=tflite_model_path)
interpreter.allocate_tensors()

input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()
required_input_length = 64
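

# Helper functions: tokenise text into a fixed-length id sequence, run the TFLite
# interpreter to produce an embedding, and score embedding pairs with cosine similarity.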
def preprocess_text(text, sp, required_length):
    input_ids = sp.encode(text, out_type=int)
    input_ids = input_ids[:required_length] + [0] * (required_length - len(input_ids))
    return np.array(input_ids, dtype=np.int32).reshape(1, -1)


def generate_embeddings(text):
    input_data = preprocess_text(text, sp, required_input_length)
    interpreter.set_tensor(input_details[0]['index'], input_data)
    interpreter.invoke()
    embedding = interpreter.get_tensor(output_details[0]['index'])
    return embedding.flatten()


def calculate_similarity(embedding1, embedding2):
    return 1 - cosine(embedding1, embedding2)
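

# Preset example sentences used to pre-populate the two input fields.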
preset_sentences_a = [
    "Dan Petrovic predicted conversational search in 2013.",
    "Understanding user intent is key to effective SEO.",
    "Dejan SEO has been a leader in data-driven SEO.",
    "Machine learning is transforming search engines.",
    "The future of search is AI-driven and personalized.",
    "Search algorithms are evolving to better match user intent.",
    "AI technologies enhance digital marketing strategies."
]

preset_sentences_b = [
    "Advances in machine learning reshape how search engines operate.",
    "Personalized content is becoming more prevalent with AI.",
    "Customer behavior insights are crucial for marketing strategies.",
    "Dan Petrovic anticipated the rise of chat-based search interactions.",
    "Dejan SEO is recognized for innovative SEO research and analysis.",
    "Quantum computing is advancing rapidly in the tech world.",
    "Studying user behavior can improve the effectiveness of online ads."
]
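
# Seed session state with the presets so the text areas start pre-filled;
# the Clear Fields button empties them.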
if "input_text_a" not in st.session_state:
    st.session_state["input_text_a"] = "\n".join(preset_sentences_a)
if "input_text_b" not in st.session_state:
    st.session_state["input_text_b"] = "\n".join(preset_sentences_b)

if st.button("Clear Fields"):
    st.session_state["input_text_a"] = ""
    st.session_state["input_text_b"] = ""

col1, col2 = st.columns(2)

with col1:
    st.subheader("Set A Sentences")
    input_text_a = st.text_area("Set A", value=st.session_state["input_text_a"], height=200)

with col2:
    st.subheader("Set B Sentences")
    input_text_b = st.text_area("Set B", value=st.session_state["input_text_b"], height=200)
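
# Controls: number of t-SNE optimisation iterations and the cosine-similarity
# threshold used both for matching sentences and for drawing connection lines.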
iterations = st.slider("Number of t-SNE Iterations (Higher values = more refined clusters)", 250, 1000, step=250)

similarity_threshold = st.slider("Similarity Threshold", 0.0, 1.0, 0.5, 0.05)
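
# Main action: embed every sentence in both sets, compute pairwise similarities,
# then report matches and visualise the embedding space.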
if st.button("Calculate Similarity"):
    sentences_a = [line.strip() for line in input_text_a.split("\n") if line.strip()]
    sentences_b = [line.strip() for line in input_text_b.split("\n") if line.strip()]

    if len(sentences_a) > 0 and len(sentences_b) > 0:
        embeddings_a = [generate_embeddings(sentence) for sentence in sentences_a]
        embeddings_b = [generate_embeddings(sentence) for sentence in sentences_b]

        all_sentences = sentences_a + sentences_b
        all_embeddings = np.array(embeddings_a + embeddings_b)
        labels = ["Set A"] * len(sentences_a) + ["Set B"] * len(sentences_b)
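
        # Cosine similarity between every Set A / Set B sentence pair.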
        similarity_matrix = np.zeros((len(sentences_a), len(sentences_b)))
        for i, emb_a in enumerate(embeddings_a):
            for j, emb_b in enumerate(embeddings_b):
                similarity_matrix[i, j] = calculate_similarity(emb_a, emb_b)
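
        # Greedy one-to-one matching: rank all A-B pairs by similarity and pair each
        # sentence with its best still-unused counterpart above the threshold.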
        used_a = set()
        used_b = set()
        matches = []
        pairs = []
        for i in range(len(sentences_a)):
            for j in range(len(sentences_b)):
                pairs.append((i, j, similarity_matrix[i, j]))

        pairs.sort(key=lambda x: x[2], reverse=True)

        for i, j, sim in pairs:
            if i not in used_a and j not in used_b and sim >= similarity_threshold:
                matches.append((i, j, sim))
                used_a.add(i)
                used_b.add(j)

        if len(matches) == 0:
            st.warning("No sentence pairs exceeded the similarity threshold.")
        else:
            df_matches = pd.DataFrame(
                [
                    (i+1, sentences_a[i], j+1, sentences_b[j], round(sim, 3))
                    for (i, j, sim) in matches
                ],
                columns=["Set A Order", "Set A Sentence", "Set B Order", "Set B Sentence", "Similarity"]
            )
            st.subheader("Matched Sentences (Above Threshold)")
            st.dataframe(df_matches, use_container_width=True)
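
        # Project all embeddings to 3D with t-SNE for the scatter plot; perplexity
        # is capped so it stays valid for small inputs.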
        perplexity_value = min(5, len(all_sentences) - 1)

        tsne = TSNE(
            n_components=3,
            perplexity=perplexity_value,
            n_iter=iterations,
            initialization="pca",
            random_state=42
        )
        tsne_results = tsne.fit(all_embeddings)

        df_tsne = pd.DataFrame({
            "Sentence": all_sentences,
            "Set": labels,
            "X": tsne_results[:, 0],
            "Y": tsne_results[:, 1],
            "Z": tsne_results[:, 2]
        })

        fig = go.Figure()
        fig.add_trace(go.Scatter3d(
            x=df_tsne[df_tsne["Set"] == "Set A"]["X"],
            y=df_tsne[df_tsne["Set"] == "Set A"]["Y"],
            z=df_tsne[df_tsne["Set"] == "Set A"]["Z"],
            text=df_tsne[df_tsne["Set"] == "Set A"]["Sentence"],
            mode='markers',
            name='Set A',
            marker=dict(size=5, color='blue')
        ))

        fig.add_trace(go.Scatter3d(
            x=df_tsne[df_tsne["Set"] == "Set B"]["X"],
            y=df_tsne[df_tsne["Set"] == "Set B"]["Y"],
            z=df_tsne[df_tsne["Set"] == "Set B"]["Z"],
            text=df_tsne[df_tsne["Set"] == "Set B"]["Sentence"],
            mode='markers',
            name='Set B',
            marker=dict(size=5, color='red')
        ))
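
        # Connect every A-B pair above the threshold; line opacity encodes the similarity score.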
        for i, emb_a in enumerate(embeddings_a):
            pos_a = tsne_results[i]
            for j, emb_b in enumerate(embeddings_b):
                sim = similarity_matrix[i, j]
                if sim >= similarity_threshold:
                    pos_b = tsne_results[j + len(sentences_a)]
                    fig.add_trace(go.Scatter3d(
                        x=[pos_a[0], pos_b[0]],
                        y=[pos_a[1], pos_b[1]],
                        z=[pos_a[2], pos_b[2]],
                        mode='lines',
                        line=dict(color=f'rgba(150,150,150,{sim})', width=2),
                        name=f'Similarity: {sim:.2f}',
                        showlegend=False
                    ))

        fig.update_layout(
            title="3D Visualization of Sentence Similarity with Connections",
            width=1200,
            height=800,
            scene=dict(
                xaxis_title="t-SNE Dimension 1",
                yaxis_title="t-SNE Dimension 2",
                zaxis_title="t-SNE Dimension 3"
            )
        )
        st.plotly_chart(fig)
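
        # Show the full similarity matrix as an annotated heatmap (rows = Set A, columns = Set B).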
        fig_heatmap = go.Figure(data=go.Heatmap(
            z=similarity_matrix,
            x=[f"B{i+1}" for i in range(len(sentences_b))],
            y=[f"A{i+1}" for i in range(len(sentences_a))],
            colorscale="Viridis",
            text=np.round(similarity_matrix, 2),
            texttemplate="%{text}",
            textfont={"size": 10},
            hoverongaps=False
        ))

        fig_heatmap.update_layout(
            title="Similarity Heatmap between Set A and Set B",
            width=None,
            height=400,
            margin=dict(l=20, r=20, t=40, b=20),
            xaxis_title="Set B Sentences",
            yaxis_title="Set A Sentences"
        )

        st.plotly_chart(fig_heatmap)

    else:
        st.warning("Please enter sentences in both Set A and Set B.")
|