Figea committed on
Commit
813ce50
·
1 Parent(s): c9f9492

Add Dockerfile and synonyms_final_vf.py

Browse files
Files changed (2) hide show
  1. Dockerfile +22 -0
  2. src/synonyms_final_vf.py +125 -0
Dockerfile ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
FROM python:3.12-slim

# System libraries required for image/video processing (ffmpeg + X deps).
RUN apt-get update && \
    apt-get upgrade -y && \
    apt-get install ffmpeg libsm6 libxext6 -y && \
    apt-get clean

# Install the dependencies
COPY requirements.txt /
RUN pip install --no-cache-dir -r requirements.txt

# Copy the code files
COPY src /

# Listen on port 7860 (the port gunicorn binds to below)
EXPOSE 7860

# Define the working dir in the container
WORKDIR /

# Command to start the app
CMD ["gunicorn", "--bind", "0.0.0.0:7860", "main:app"]
src/synonyms_final_vf.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import pandas as pd
3
+ import spacy
4
+ import numpy as np
5
+ from sklearn.cluster import DBSCAN
6
+ from sklearn.metrics.pairwise import cosine_distances
7
+ import matplotlib.pyplot as plt
8
+ import nltk
9
+ from nltk.corpus import wordnet
10
+
11
def load_data(file_path):
    """Load the dataset and return its unique gloss values.

    Parameters:
        file_path: path (or file-like object) of a semicolon-delimited CSV
            containing a "gloss" column.

    Returns:
        The unique values of the "gloss" column, in order of first appearance.
    """
    frame = pd.read_csv(file_path, delimiter=";")
    return frame["gloss"].unique()
21
+
22
def initialize_spacy_model(model_name="en_core_web_md"):
    """Load and return the requested spaCy language model."""
    nlp_model = spacy.load(model_name)
    return nlp_model
24
+
25
def download_wordnet():
    """Download the WordNet corpus (used later for antonym lookups)."""
    nltk.download('wordnet')
30
+
31
def generate_word_vectors(words, model):
    """Return a 2-D array whose rows are the model's vectors for *words*.

    Parameters:
        words: iterable of strings.
        model: a spaCy-like callable; model(word).vector yields the embedding.
    """
    vectors = [model(word).vector for word in words]
    return np.array(vectors)
33
+
34
def plot_k_distance_graph(distances, k):
    """Plot each point's sorted k-th nearest-neighbour distance.

    This is the standard heuristic for choosing DBSCAN's eps: look for the
    "elbow" in the curve.

    Parameters:
        distances: square pairwise-distance matrix.
        k: which neighbour's distance to plot (column k of the row-sorted matrix).
    """
    kth_nearest = np.sort(distances, axis=1)[:, k]
    kth_nearest = np.sort(kth_nearest)
    plt.figure(figsize=(10, 5))
    plt.plot(kth_nearest)
    plt.xlabel('Points sorted by distance')
    plt.ylabel(f'{k}-th Nearest Neighbor Distance')
    plt.title(f'k-distance Graph for k={k}')
    plt.grid(True)
    plt.show()
44
+
45
def perform_dbscan_clustering(word_vectors, eps, min_samples=5):
    """Fit a cosine-metric DBSCAN on the word vectors and return the fitted model."""
    clusterer = DBSCAN(metric='cosine', eps=eps, min_samples=min_samples)
    # DBSCAN.fit returns self, so this is the fitted estimator.
    return clusterer.fit(word_vectors)
49
+
50
def create_cluster_mapping(words, dbscan_labels):
    """Group words by their DBSCAN cluster label.

    Parameters:
        words: iterable of words, positionally aligned with dbscan_labels.
        dbscan_labels: iterable of integer cluster labels (-1 = noise).

    Returns:
        dict mapping each cluster label to the list of its words,
        preserving input order within each cluster.
    """
    cluster_to_words = {}
    for word, cluster in zip(words, dbscan_labels):
        # setdefault replaces the explicit "if cluster not in ..." dance.
        cluster_to_words.setdefault(cluster, []).append(word)
    return cluster_to_words
57
+
58
def find_antonyms(word):
    """Return the set of WordNet antonym lemma names for *word*.

    Scans every synset and lemma of the word. The original kept only the
    FIRST antonym of each lemma (`lemma.antonyms()[0]`); collecting all of
    them lets the caller exclude every known antonym, not just one.

    Parameters:
        word: the word to look up.

    Returns:
        set of antonym lemma names (possibly empty).
    """
    antonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            antonyms.update(ant.name() for ant in lemma.antonyms())
    return antonyms
65
+
66
def find_synonyms_in_cluster(word, model, cluster_to_words, dbscan_model):
    """Return the word most similar to *word* within its DBSCAN cluster,
    excluding the word itself and its WordNet antonyms.

    Parameters:
        word: the query word.
        model: spaCy-like callable; model(w) returns a Doc with .vector and
            .similarity().
        cluster_to_words: mapping cluster label -> list of member words.
        dbscan_model: kept for interface compatibility; unused. DBSCAN has no
            predict(), and the original `dbscan_model.fit_predict([word_vector])`
            RE-FIT the estimator on a single point — corrupting the fitted
            clustering and always labelling the point noise (-1) whenever
            min_samples > 1.

    Returns:
        The most similar non-antonym cluster member, or None if none exists.
    """
    # Look up the cluster that actually contains the query word.
    cluster_label = next(
        (label for label, members in cluster_to_words.items() if word in members),
        -1,
    )
    cluster_words = cluster_to_words.get(cluster_label, [])

    if not cluster_words:
        return None

    antonyms = find_antonyms(word)
    word_doc = model(word)  # hoisted: the original re-ran model(word) per candidate
    similarities = [
        (dict_word, model(dict_word).similarity(word_doc))
        for dict_word in cluster_words
        if dict_word != word and dict_word not in antonyms
    ]

    if not similarities:
        return None

    return max(similarities, key=lambda item: item[1])[0]
85
+
86
def display_clusters(cluster_to_words):
    """Print every cluster's members; label -1 is reported as noise."""
    for cluster_label, words in cluster_to_words.items():
        if cluster_label == -1:  # noise points get their own heading
            print(f"Noise: {words}")
        else:
            print(f"Cluster {cluster_label}: {words}")
92
+
93
def main(file_path, model_name="en_core_web_md", eps=0.23, min_samples=5, k=5):
    """Build the gloss clustering pipeline end to end.

    Loads the glosses, embeds them with spaCy, clusters the vectors with
    DBSCAN, and stores the results in module-level globals.

    Parameters:
        file_path: path to the semicolon-delimited CSV with a "gloss" column.
        model_name: spaCy model used to embed the glosses.
        eps: DBSCAN neighbourhood radius (cosine distance).
        min_samples: DBSCAN core-point threshold.
        k: neighbour index for the (commented-out) k-distance diagnostic plot.
    """
    # Exposed as globals so the commented-out snippets after the __main__
    # guard can reuse the fitted objects once main() has run.
    global nlp, cluster_to_words, dbscan

    dict_2000 = load_data(file_path)
    nlp = initialize_spacy_model(model_name)
    download_wordnet()

    word_vectors = generate_word_vectors(dict_2000, nlp)

    # Uncomment to eyeball the k-distance curve when tuning eps:
    # distances = cosine_distances(word_vectors)
    # plot_k_distance_graph(distances, k)

    dbscan = perform_dbscan_clustering(word_vectors, eps, min_samples)
    cluster_to_words = create_cluster_mapping(dict_2000, dbscan.labels_)
107
+
108
# Entry point: run the whole clustering pipeline on the WLASL gloss file.
if __name__ == "__main__":
    main("filtered_WLASL.csv")

##TEST##
#target_word = "unhappy"
#synonym = find_synonyms_in_cluster(target_word, nlp, cluster_to_words, dbscan)
#print(f"The most similar word to '{target_word}' is '{synonym}'")

##If you want to see clusters##
#num_clusters = len(set(dbscan.labels_)) - (1 if -1 in dbscan.labels_ else 0)
#print(f"Number of clusters: {num_clusters}")

#cluster_label = dbscan.fit_predict([nlp("unhappy").vector])[0]
#same_cluster_words = cluster_to_words.get(cluster_label, [])
#print(f"Words in the same cluster as 'unhappy': {same_cluster_words}")

#display_clusters(cluster_to_words)
125
+