albertmartinez committed on
Commit
1822f54
·
1 Parent(s): fb3abe1

Added Semantic Textual Similarity

Browse files
Files changed (5) hide show
  1. app.py +89 -44
  2. mining.py +46 -0
  3. requirements.txt +1 -0
  4. sts.py +56 -0
  5. utils.py +25 -0
app.py CHANGED
@@ -1,61 +1,106 @@
1
- from pathlib import Path
 
2
  import gradio as gr
3
- import pandas as pd
4
- import polars as pl
5
- from datasets import Dataset
6
- from sentence_transformers import SentenceTransformer
7
- from sentence_transformers.util import paraphrase_mining
8
- import torch
9
 
 
10
 
11
- def upload_file(filepath):
12
- name = Path(filepath).name
13
- return [gr.UploadButton(visible=False), gr.DownloadButton(label=f"Download {name}", value=filepath, visible=True)]
 
 
 
 
 
14
 
 
 
 
15
 
16
- def getData(path):
17
- #data = Dataset.from_csv(path, column_names=["text"])
18
- data = Dataset.from_pandas(pd.read_csv(path, on_bad_lines='skip', names=["text"]))
19
- device = "cuda" if torch.cuda.is_available() else "cpu"
20
- model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2",
21
- backend="openvino",
22
- device=device,
23
- trust_remote_code=True)
24
 
25
- paraphrases = paraphrase_mining(
26
- model,
27
- data["text"],
28
- corpus_chunk_size=len(data),
29
- show_progress_bar=True,
30
- batch_size=1024,
31
- max_pairs=len(data) ** 2
32
- )
33
 
34
- df_pd = pd.DataFrame(paraphrases)
35
- df = pl.from_pandas(df_pd)
36
- df = df.rename({"0": "score", "1": "sentence_1", "2": "sentence_2"})
 
 
37
 
38
- union_df = pl.DataFrame(data.to_pandas())
 
39
 
40
- df = df.with_columns([
41
- pl.col("score").round(3).cast(pl.Float32),
42
- union_df.select(pl.col("text")).to_series()[df["sentence_1"].cast(pl.Int32)].alias("sentence_1"),
43
- union_df.select(pl.col("text")).to_series()[df["sentence_2"].cast(pl.Int32)].alias("sentence_2"),
44
- ]).filter(pl.col("score") > 0.96).sort(["score"], descending=True)
45
 
46
- data = pl.from_arrow(data.data.table)
 
 
 
 
 
 
 
 
 
47
 
48
- return [data, df]
 
49
 
 
 
 
 
 
50
 
51
- with gr.Blocks() as demo:
52
- with gr.Column():
53
- upload_button = gr.UploadButton(label="upload csv", file_types=['.csv'], file_count="single")
54
- output_data = gr.Dataframe(headers=["text"], col_count=1, label="Uploaded Data")
55
- output_paraphrases = gr.Dataframe(headers=["score", "sentence_1", "sentence_2"], type="polars",
56
- label="Paraphrase Mining Results")
 
 
 
 
 
 
 
57
 
58
- upload_button.upload(fn=getData, inputs=upload_button, outputs=[output_data, output_paraphrases])
 
 
 
 
 
 
 
 
 
 
 
 
 
59
 
60
  if __name__ == "__main__":
 
 
 
 
 
 
 
61
  demo.launch()
 
1
+ import multiprocessing
2
+ import threading
3
  import gradio as gr
4
+ from mining import mining
5
+ from sts import sts
6
+ from utils import getDataFrame, save_to_csv, delete_folder_periodically
 
 
 
7
 
8
# Maximum number of upload events of one kind that may run at the same time.
CONCURRENCY_LIMIT = 5

# NOTE(review): the nesting below was reconstructed from the call structure;
# confirm the placement of the result/download widgets against the live UI.
with gr.Blocks() as demo:
    # ---- Tab 1: find paraphrases inside a single list of sentences ----
    with gr.Tab("Paraphrase Mining"):
        with gr.Row():
            gr.Markdown(
                "### Paraphrase mining is the task of finding paraphrases (texts with identical / similar meaning) in a large corpus of sentences")
        with gr.Row():
            with gr.Column():
                gr.Markdown("#### sentences")

                upload_button_sentences = gr.UploadButton(label="upload sentences csv", file_types=['.csv'],
                                                          file_count="single")
                output_data_sentences = gr.Dataframe(headers=["text"], col_count=1, label="sentences data")

                # Preview the uploaded file as soon as it arrives.
                upload_button_sentences.upload(fn=getDataFrame, inputs=upload_button_sentences,
                                               outputs=output_data_sentences, concurrency_limit=CONCURRENCY_LIMIT)

        with gr.Row():
            with gr.Column():
                submit_button_mining = gr.Button("Submit", variant="primary")
        with gr.Row():
            with gr.Column():
                output_mining = gr.Dataframe(headers=["score", "sentence_1", "sentence_2"], type="polars",
                                             label="Mining")

                # The upload button's value is the uploaded file path, which
                # is what ``mining`` reads.
                submit_button_mining.click(
                    fn=mining,
                    inputs=upload_button_sentences,
                    outputs=output_mining
                )

                download_button = gr.Button("Download Results as CSV", variant="huggingface")
                download_file = gr.File(label="Downloadable File")

                download_button.click(
                    fn=save_to_csv,
                    inputs=output_mining,
                    outputs=download_file
                )

    # ---- Tab 2: compare every sentence of one list with every sentence of another ----
    with gr.Tab("Semantic Textual Similarity"):
        with gr.Row():  # Row for the title
            gr.Markdown(
                "### Semantic Textual Similarity (STS), we want to produce embeddings for all texts involved and calculate the similarities between them")
        with gr.Row():  # First row of two columns
            with gr.Column():
                gr.Markdown("#### sentences 1")
                upload_button_sentences1 = gr.UploadButton(label="upload sentences 1 csv", file_types=['.csv'],
                                                           file_count="single")
                output_data_sentences1 = gr.Dataframe(headers=["text"], col_count=1, label="sentences 1 data")

                upload_button_sentences1.upload(fn=getDataFrame, inputs=upload_button_sentences1,
                                                outputs=output_data_sentences1, concurrency_limit=CONCURRENCY_LIMIT)

            with gr.Column():
                gr.Markdown("#### sentences 2")
                upload_button_sentences2 = gr.UploadButton(label="upload sentences 2 csv", file_types=['.csv'],
                                                           file_count="single")
                output_data_sentences2 = gr.Dataframe(headers=["text"], col_count=1, label="sentences 2 data")

                upload_button_sentences2.upload(fn=getDataFrame, inputs=upload_button_sentences2,
                                                outputs=output_data_sentences2, concurrency_limit=CONCURRENCY_LIMIT)

        with gr.Row():
            with gr.Column():
                submit_button_sts = gr.Button("Submit", variant="primary")

        with gr.Row():
            with gr.Column():
                gr.Markdown("#### STS Results")

                # Label previously read "Semantic Textual Similarit" (truncated).
                output_sts = gr.Dataframe(headers=["score", "sentence_1", "sentence_2"], type="polars",
                                          label="Semantic Textual Similarity")

                submit_button_sts.click(
                    fn=sts,
                    inputs=[upload_button_sentences1, upload_button_sentences2],
                    outputs=output_sts
                )

                download_button = gr.Button("Download Results as CSV", variant="huggingface")
                download_file = gr.File(label="Downloadable File")

                download_button.click(
                    fn=save_to_csv,
                    inputs=output_sts,
                    outputs=download_file
                )

if __name__ == "__main__":
    multiprocessing.set_start_method("spawn")

    # Background janitor: wipes the folder that save_to_csv writes into every
    # 1800 seconds. daemon=True so it never blocks interpreter shutdown.
    folder_path = "data"
    thread = threading.Thread(target=delete_folder_periodically, args=(folder_path, 1800), daemon=True)
    thread.start()

    print(gr.__version__)
    demo.launch()
mining.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ import pandas as pd
3
+ import polars as pl
4
+ import torch
5
+ from datasets import Dataset
6
+ from sentence_transformers import SentenceTransformer
7
+ from sentence_transformers.util import paraphrase_mining
8
+
9
def mining(path, threshold=0.96):
    """Find highly similar sentence pairs inside a single CSV file.

    The file is read as one ``text`` column; its first row is treated as a
    header and replaced by that column name. Every sentence is embedded with
    ``sentence-transformers/all-MiniLM-L6-v2`` and candidate pairs are
    produced by ``paraphrase_mining``.

    Args:
        path: Path of the CSV file to analyse.
        threshold: Only pairs whose score (rounded to 3 decimals) is strictly
            greater than this value are kept. Defaults to 0.96.

    Returns:
        A polars DataFrame with the columns ``score``, ``sentence_1`` and
        ``sentence_2``, sorted by descending score. The frame is empty (but
        keeps those columns) when the file holds fewer than two sentences or
        no pair is found.
    """
    st = time.time()
    data = Dataset.from_pandas(pd.read_csv(path, on_bad_lines='skip', header=0, names=["text"]))

    empty_result = pl.DataFrame(
        schema={"score": pl.Float32, "sentence_1": pl.Utf8, "sentence_2": pl.Utf8}
    )
    # A pair needs two sentences. Bailing out here also avoids passing
    # corpus_chunk_size=0 to paraphrase_mining and loading the model for nothing.
    if len(data) < 2:
        return empty_result

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = SentenceTransformer(
        "sentence-transformers/all-MiniLM-L6-v2",
        backend="openvino",
        model_kwargs={"file_name": "openvino/openvino_model.xml"},
        device=device,
        trust_remote_code=True,
    )

    # One chunk covering the whole corpus and max_pairs = n^2 means no
    # candidate pair is dropped before the threshold filter below.
    paraphrases = paraphrase_mining(
        model,
        data["text"],
        corpus_chunk_size=len(data),
        show_progress_bar=True,
        batch_size=1024,
        max_pairs=len(data) ** 2,
    )
    if not paraphrases:
        return empty_result

    # Each entry is [score, index_1, index_2]; name the columns explicitly
    # instead of relying on the positional labels being turned into "0", "1", "2".
    df = pl.from_pandas(pd.DataFrame(paraphrases, columns=["score", "sentence_1", "sentence_2"]))

    # Build the text lookup once; it is used for both index columns.
    texts = pl.DataFrame(data.to_pandas())["text"]

    df = df.with_columns([
        pl.col("score").round(3).cast(pl.Float32),
        texts[df["sentence_1"].cast(pl.Int32)].alias("sentence_1"),
        texts[df["sentence_2"].cast(pl.Int32)].alias("sentence_2"),
    ]).filter(pl.col("score") > threshold).sort(["score"], descending=True)

    elapsed_time = time.time() - st
    print('Execution time:', time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))

    return df
requirements.txt CHANGED
@@ -4,3 +4,4 @@ pandas
4
  polars
5
  datasets
6
  sentence-transformers[openvino,onnx-gpu,onnx]
 
 
4
  polars
5
  datasets
6
  sentence-transformers[openvino,onnx-gpu,onnx]
7
+ gradio
sts.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ import pandas as pd
3
+ import polars as pl
4
+ import torch
5
+ from datasets import Dataset
6
+ from sentence_transformers import SentenceTransformer
7
+
8
def sts(data1, data2, threshold=0.96):
    """Score every sentence of one CSV file against every sentence of another.

    Both files are read as a single ``text`` column (their first row is
    treated as a header and replaced by that column name). The sentences are
    embedded with ``sentence-transformers/all-MiniLM-L6-v2`` and compared with
    ``model.similarity``.

    Args:
        data1: Path of the CSV file holding the first list of sentences.
        data2: Path of the CSV file holding the second list of sentences.
        threshold: Only pairs whose score (rounded to 4 decimals) is strictly
            greater than this value are kept. Defaults to 0.96.

    Returns:
        A polars DataFrame with the columns ``score``, ``sentence_1`` and
        ``sentence_2``, sorted by descending score. The frame is empty (but
        keeps those columns) when either file contains no sentences.
    """
    st = time.time()

    sentences1 = Dataset.from_pandas(pd.read_csv(data1, on_bad_lines='skip', header=0, names=["text"]))
    sentences2 = Dataset.from_pandas(pd.read_csv(data2, on_bad_lines='skip', header=0, names=["text"]))

    # Nothing to compare: return before loading the model.
    if len(sentences1) == 0 or len(sentences2) == 0:
        return pl.DataFrame(
            schema={"score": pl.Float32, "sentence_1": pl.Utf8, "sentence_2": pl.Utf8}
        )

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = SentenceTransformer(
        "sentence-transformers/all-MiniLM-L6-v2",
        backend="openvino",
        model_kwargs={"file_name": "openvino/openvino_model.xml"},
        device=device,
        trust_remote_code=True,
    )

    embeddings1 = model.encode(sentences1["text"], normalize_embeddings=True, batch_size=1024,
                               show_progress_bar=True)
    embeddings2 = model.encode(sentences2["text"], normalize_embeddings=True, batch_size=1024,
                               show_progress_bar=True)

    # Rows follow sentences1, columns follow sentences2.
    similarity_matrix = model.similarity(embeddings1, embeddings2)

    df_pd = pd.DataFrame(similarity_matrix)
    dfi = df_pd.__dataframe__()
    df = pl.from_dataframe(dfi)

    # Wide matrix -> long (row_index, column_index, score). Round and filter
    # *before* joining the text so the joins only touch surviving pairs; the
    # joins are inner joins, so the final rows are the same as filtering last.
    # Index columns are cast to UInt64 on both sides so the join keys match.
    df_long = (
        df.with_row_index(name="row_index")
        .with_columns(pl.col("row_index").cast(pl.UInt64))
        .unpivot(index="row_index", variable_name="column_index", value_name="score")
        .with_columns([
            pl.col("column_index").cast(pl.UInt64),
            pl.col("score").round(4).cast(pl.Float32),
        ])
        .filter(pl.col("score") > threshold)
    )

    # Name the text columns up front (matching the UI headers and mining())
    # rather than depending on the "_right" suffix a join would add.
    df_sentences1 = (
        pl.DataFrame(sentences1.to_pandas())
        .rename({"text": "sentence_1"})
        .with_row_index(name="row_index")
        .with_columns(pl.col("row_index").cast(pl.UInt64))
    )
    df_sentences2 = (
        pl.DataFrame(sentences2.to_pandas())
        .rename({"text": "sentence_2"})
        .with_row_index(name="column_index")
        .with_columns(pl.col("column_index").cast(pl.UInt64))
    )

    result = (
        df_long
        .join(df_sentences1, on="row_index")
        .join(df_sentences2, on="column_index")
        .select(["score", "sentence_1", "sentence_2"])
        .sort(["score"], descending=True)
    )

    elapsed_time = time.time() - st
    print('Execution time:', time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))

    return result
utils.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import uuid
3
+ import shutil
4
+ import pandas as pd
5
+ import polars as pl
6
+ import time
7
+
8
def getDataFrame(path):
    """Load a CSV file as a one-column (``text``) polars DataFrame.

    The first row of the file is treated as a header and replaced by the
    column name ``text``; malformed lines are skipped.
    """
    return pl.from_pandas(
        pd.read_csv(path, names=["text"], header=0, on_bad_lines='skip')
    )
11
+
12
def save_to_csv(dataframe):
    """Write *dataframe* to a uniquely named file under ``data/``.

    Args:
        dataframe: Object exposing ``is_empty()`` and
            ``write_csv(path, separator=...)`` (e.g. a polars DataFrame),
            or ``None``.

    Returns:
        The path of the written file, or ``None`` when there is nothing to
        save (``dataframe`` is ``None`` or empty). No folder or file is
        created in that case.
    """
    # Explicit early exit: the empty case must not fall through to a
    # ``return csv_path`` with ``csv_path`` never assigned.
    if dataframe is None or dataframe.is_empty():
        return None

    folder_path = "data"
    os.makedirs(folder_path, exist_ok=True)
    # Random file name so concurrent downloads never overwrite each other.
    csv_path = f"{folder_path}/{uuid.uuid4()}.csv"
    # NOTE(review): the content is tab-separated although the file is named
    # .csv - kept as is, confirm this is what users expect.
    dataframe.write_csv(csv_path, separator="\t")
    return csv_path
19
+
20
def _reset_folder(path):
    """Remove *path* with everything below it, then recreate it empty."""
    try:
        shutil.rmtree(path)
    except FileNotFoundError:
        # Already gone (never created, or removed between two passes).
        pass
    os.makedirs(path, exist_ok=True)


def delete_folder_periodically(path, interval=3600):
    """Empty the folder *path* forever, once every *interval* seconds.

    This function never returns; it is meant to be the target of a
    background thread. The first clean-up happens immediately.

    Args:
        path: Folder to wipe and recreate on every pass.
        interval: Seconds to sleep between two passes. Defaults to 3600.
    """
    while True:
        try:
            _reset_folder(path)
        except OSError as exc:
            # A failed pass (file in use, permissions, ...) must not end the
            # loop, otherwise the clean-up would silently stop for good.
            print(f"Could not clean folder {path!r}: {exc}")
        time.sleep(interval)