import time

import pandas as pd
import polars as pl
import torch
from datasets import Dataset
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import paraphrase_mining


def _empty_pairs():
    """Return a zero-row frame with the same columns ``mining`` produces."""
    return pl.DataFrame(
        schema={"score": pl.Float32, "sentence_1": pl.Utf8, "sentence_2": pl.Utf8}
    )


def _report_elapsed(started):
    """Print the time elapsed since *started* (a ``time.perf_counter`` value)."""
    elapsed_time = time.perf_counter() - started
    print('Execution time:', time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))


def mining(path, *, threshold=0.96):
    """Mine near-duplicate sentence pairs from a one-column CSV file.

    The CSV at *path* is read with its first row treated as a header and its
    column renamed to ``"text"``; malformed lines are skipped. Every sentence
    is embedded with ``sentence-transformers/all-MiniLM-L6-v2`` (OpenVINO
    backend) and all sentence pairs are scored with ``paraphrase_mining``.

    Args:
        path: Location of the CSV file, passed straight to ``pandas.read_csv``.
        threshold: Keep only pairs whose rounded score is strictly greater
            than this value. Defaults to ``0.96``.

    Returns:
        A polars ``DataFrame`` with columns ``score`` (Float32, rounded to
        three decimals), ``sentence_1`` and ``sentence_2`` (the sentence
        texts), sorted by ``score`` in descending order. The frame is empty
        when the file holds fewer than two sentences or no pair passes the
        threshold.

    Side effects:
        Prints the elapsed execution time to stdout.
    """
    # perf_counter is monotonic, so the duration cannot be skewed by
    # wall-clock adjustments the way time.time() can.
    started = time.perf_counter()

    data = Dataset.from_pandas(
        pd.read_csv(path, on_bad_lines='skip', header=0, names=["text"])
    )
    n_sentences = len(data)

    # A pair needs at least two sentences. Bail out before loading the model:
    # corpus_chunk_size would be 0 for an empty file, and a single sentence
    # yields no pairs at all.
    if n_sentences < 2:
        _report_elapsed(started)
        return _empty_pairs()

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = SentenceTransformer(
        "sentence-transformers/all-MiniLM-L6-v2",
        backend="openvino",
        model_kwargs={"file_name": "openvino/openvino_model.xml"},
        device=device,
        # NOTE(review): trust_remote_code allows code shipped with the model
        # repository to run locally. Kept to preserve behaviour; confirm it
        # is actually required for this model.
        trust_remote_code=True,
    )

    paraphrases = paraphrase_mining(
        model,
        data["text"],
        corpus_chunk_size=n_sentences,
        show_progress_bar=True,
        batch_size=1024,
        max_pairs=n_sentences ** 2,
    )
    if not paraphrases:
        _report_elapsed(started)
        return _empty_pairs()

    # Each mined entry is [score, index_1, index_2]; name the columns up
    # front instead of relying on stringified integer column labels.
    pairs = pl.from_pandas(
        pd.DataFrame(paraphrases, columns=["score", "sentence_1", "sentence_2"])
    )

    # Round, filter and sort first so that the text lookup below only touches
    # the pairs that are actually returned.
    pairs = (
        pairs.with_columns([pl.col("score").round(3).cast(pl.Float32)])
        .filter(pl.col("score") > threshold)
        .sort(["score"], descending=True)
    )

    # Replace the positional indices by the sentences they refer to.
    texts = pl.DataFrame(data.to_pandas()).select(pl.col("text")).to_series()
    result = pairs.with_columns([
        texts[pairs["sentence_1"].cast(pl.Int32)].alias("sentence_1"),
        texts[pairs["sentence_2"].cast(pl.Int32)].alias("sentence_2"),
    ])

    _report_elapsed(started)
    return result