Spaces:

albertmartinez
/

sentence-transformers

Sleeping

App Files Files Community

sentence-transformers / sts.py

albertmartinez

Added Semantic Textual Similarity

1822f54 7 months ago

raw

history blame

2.5 kB

	import time
	import pandas as pd
	import polars as pl
	import torch
	from datasets import Dataset
	from sentence_transformers import SentenceTransformer

	def sts(data1, data2, threshold: float = 0.96):
	    """Find highly similar sentence pairs across two CSV files.

	    Each CSV is read with pandas: its first row is treated as a header and
	    discarded, the column is renamed to ``text``, and malformed lines are
	    skipped. Every sentence is embedded with the
	    ``sentence-transformers/all-MiniLM-L6-v2`` model (OpenVINO backend) and
	    all pairs are scored with ``model.similarity``.

	    Args:
	        data1: Path or file-like object accepted by ``pandas.read_csv``.
	        data2: Path or file-like object accepted by ``pandas.read_csv``.
	        threshold: Only pairs whose score, rounded to 4 decimals, is strictly
	            greater than this value are returned. Defaults to 0.96.

	    Returns:
	        A polars DataFrame with columns ``score`` (Float32), ``setences1``
	        and ``sentences2``, sorted by ``score`` in descending order. The
	        misspelled name ``setences1`` is kept on purpose because callers may
	        select that column by name.

	    Side effects:
	        Prints the elapsed time as ``HH:MM:SS`` to stdout (skipped when an
	        input is empty).
	    """
	    started = time.perf_counter()  # monotonic clock: safe for measuring durations

	    sentences1 = Dataset.from_pandas(pd.read_csv(data1, on_bad_lines='skip', header=0, names=["text"]))
	    sentences2 = Dataset.from_pandas(pd.read_csv(data2, on_bad_lines='skip', header=0, names=["text"]))

	    # Nothing to compare: return an empty, correctly typed frame instead of
	    # loading the model and handing empty embeddings to the similarity step.
	    if len(sentences1) == 0 or len(sentences2) == 0:
	        return pl.DataFrame(
	            schema={"score": pl.Float32, "setences1": pl.String, "sentences2": pl.String}
	        )

	    device = "cuda" if torch.cuda.is_available() else "cpu"
	    # NOTE(review): trust_remote_code=True permits running code shipped in the
	    # model repository; confirm this model actually requires it.
	    model = SentenceTransformer(
	        "sentence-transformers/all-MiniLM-L6-v2",
	        backend="openvino",
	        model_kwargs={"file_name": "openvino/openvino_model.xml"},
	        device=device,
	        trust_remote_code=True,
	    )

	    embeddings1 = model.encode(
	        sentences1["text"], normalize_embeddings=True, batch_size=1024, show_progress_bar=True
	    )
	    embeddings2 = model.encode(
	        sentences2["text"], normalize_embeddings=True, batch_size=1024, show_progress_bar=True
	    )

	    # Rows correspond to sentences from data1, columns to sentences from data2.
	    similarity_matrix = model.similarity(embeddings1, embeddings2)

	    df_pd = pd.DataFrame(similarity_matrix)
	    df = pl.from_dataframe(df_pd.__dataframe__())

	    # Reshape the matrix into (row_index, column_index, score) triples and drop
	    # low-scoring pairs *before* attaching the sentence text, so the joins
	    # below only touch the few rows that survive the threshold.
	    matches = (
	        df.with_row_index(name="row_index")
	        .unpivot(index="row_index", variable_name="column_index", value_name="score")
	        .with_columns(
	            pl.col("row_index").cast(pl.UInt64),
	            # Column labels arrive as strings ("0", "1", ...); make them join keys.
	            pl.col("column_index").cast(pl.UInt64),
	            pl.col("score").round(4).cast(pl.Float32),
	        )
	        .filter(pl.col("score") > threshold)
	    )

	    result = (
	        matches
	        .join(_indexed_sentences(sentences1, "row_index", "setences1"), on="row_index")
	        .join(_indexed_sentences(sentences2, "column_index", "sentences2"), on="column_index")
	        .select(["score", "setences1", "sentences2"])
	        .sort(["score"], descending=True)
	    )

	    elapsed_time = time.perf_counter() - started
	    print('Execution time:', time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))

	    return result


	def _indexed_sentences(sentences, index_name, text_name):
	    """Return a polars frame pairing each sentence with its UInt64 position.

	    The ``text`` column is renamed to ``text_name`` up front so the caller's
	    joins do not depend on polars' automatic ``_right`` suffix.
	    """
	    return (
	        pl.DataFrame(sentences.to_pandas())
	        .select(pl.col("text").alias(text_name))
	        .with_row_index(name=index_name)
	        .with_columns(pl.col(index_name).cast(pl.UInt64))
	    )