import streamlit as st
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel
import torch

# Load Romanian BERT model and tokenizer
model_name = 'dumitrescustefan/bert-base-romanian-cased-v1'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Load pre-saved embeddings and sentences
saved_embeddings = np.load("sentence_embeddings.npy")
sentences = np.load("sentences.npy", allow_pickle=True)  # allow_pickle in case the sentences were saved as an object array

# Function to get a sentence embedding (CLS token of the last hidden state)
def get_sentence_embedding(sentence, model, tokenizer):
    inputs = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)
    cls_embedding = outputs.last_hidden_state[:, 0, :]  # CLS token embedding
    return cls_embedding.numpy()

# Streamlit UI
st.title("Sentence Similarity with Pre-trained BERT")
st.write("Enter a sentence in Romanian to find similar sentences.")

# User input
user_input = st.text_input("Your sentence")

# Check if user input exists
if user_input:
    # Embed the user input
    user_embedding = get_sentence_embedding(user_input, model, tokenizer)

    # Compute similarity with saved embeddings (flattened to 2-D in case of an extra batch axis)
    similarities = cosine_similarity(user_embedding, saved_embeddings.reshape(saved_embeddings.shape[0], -1))

    # Get the top 5 most similar sentences
    top_n = 5
    top_indices = np.argsort(similarities[0])[::-1][:top_n]

    st.write("Top similar sentences:")

    # Display the most similar sentences with similarity scores
    for idx in top_indices:
        st.write(f"Sentence: {sentences[idx]}")
        st.write(f"Similarity score: {similarities[0][idx]:.4f}")
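
# --- Companion sketch (assumption): one way the "sentence_embeddings.npy" and
# "sentences.npy" files loaded above could be precomputed offline. The corpus
# file name "sentences.txt" (one Romanian sentence per line) and the batch size
# are hypothetical; adapt them to however the corpus is actually stored.
#
# import numpy as np
# import torch
# from transformers import AutoTokenizer, AutoModel
#
# model_name = 'dumitrescustefan/bert-base-romanian-cased-v1'
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModel.from_pretrained(model_name)
#
# # Read the corpus: one sentence per line
# with open("sentences.txt", encoding="utf-8") as f:
#     corpus = [line.strip() for line in f if line.strip()]
#
# embeddings = []
# batch_size = 32
# with torch.no_grad():
#     for start in range(0, len(corpus), batch_size):
#         batch = corpus[start:start + batch_size]
#         inputs = tokenizer(batch, return_tensors='pt', padding=True, truncation=True, max_length=128)
#         outputs = model(**inputs)
#         # CLS token embedding for each sentence in the batch
#         embeddings.append(outputs.last_hidden_state[:, 0, :].numpy())
#
# # Stack into a (num_sentences, hidden_size) matrix and save alongside the raw sentences
# np.save("sentence_embeddings.npy", np.vstack(embeddings))
# np.save("sentences.npy", np.array(corpus))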