LMartinezEXEX committed
Commit a779273 · 1 Parent(s): 2b2d321

Init commit

.gitattributes CHANGED
@@ -2,7 +2,6 @@
 *.arrow filter=lfs diff=lfs merge=lfs -text
 *.bin filter=lfs diff=lfs merge=lfs -text
 *.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
 *.ftz filter=lfs diff=lfs merge=lfs -text
 *.gz filter=lfs diff=lfs merge=lfs -text
 *.h5 filter=lfs diff=lfs merge=lfs -text
@@ -32,3 +31,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+data/semi_embedding_v6.zip filter=lfs diff=lfs merge=lfs -text
+data/half_embedding_v6.zip filter=lfs diff=lfs merge=lfs -text
+data/wiki-news-300d-1M.vec filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1 @@
+__pycache__/
LICENSE ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2022 Fundación Vía Libre
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
app.py ADDED
@@ -0,0 +1,47 @@
+# --- Import libs ---
+import gradio as gr
+import pandas as pd
+
+
+# --- Import modules ---
+from modules.model_embbeding import Embedding
+
+# --- Import interfaces ---
+from interfaces.interface_WordExplorer import interface as wordExplorer_interface
+from interfaces.interface_BiasWordExplorer import interface as biasWordExplorer_interface
+
+# --- Tool config ---
+AVAILABLE_LOGS = True           # [True | False]
+LANGUAGE = "spanish"            # [spanish | english]
+EMBEDDING_SUBSET = "fasttext"   # [fasttext | mini]
+
+# --- Init classes ---
+embedding = Embedding(
+    subset_name=EMBEDDING_SUBSET
+)
+labels = pd.read_json(f"language/{LANGUAGE}.json")["app"]
+
+# --- Main App ---
+INTERFACE_LIST = [
+    biasWordExplorer_interface(
+        embedding=embedding,
+        available_logs=AVAILABLE_LOGS,
+        lang=LANGUAGE),
+    wordExplorer_interface(
+        embedding=embedding,
+        available_logs=AVAILABLE_LOGS,
+        lang=LANGUAGE),
+]
+
+TAB_NAMES = [
+    labels["biasWordExplorer"],
+    labels["wordExplorer"],
+]
+
+iface = gr.TabbedInterface(
+    interface_list=INTERFACE_LIST,
+    tab_names=TAB_NAMES
+)
+
+iface.queue(concurrency_count=8)
+iface.launch(debug=False)
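The three config flags are the only knobs: LANGUAGE selects the language/*.json label file and EMBEDDING_SUBSET selects which data/{subset}_embedding_v6.zip archive the Embedding class loads. A minimal sketch of a lighter local variant (assumes the same repo layout; "mini" is the small subset committed above):

    # sketch: single-tab English app on the small embedding subset
    import gradio as gr
    import pandas as pd

    from modules.model_embbeding import Embedding
    from interfaces.interface_WordExplorer import interface as wordExplorer_interface

    embedding = Embedding(subset_name="mini")   # loads data/mini_embedding_v6.zip
    labels = pd.read_json("language/english.json")["app"]

    iface = gr.TabbedInterface(
        interface_list=[wordExplorer_interface(embedding=embedding,
                                               available_logs=False,
                                               lang="english")],
        tab_names=[labels["wordExplorer"]],
    )
    iface.launch()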
data/data_loader.py ADDED
@@ -0,0 +1,41 @@
+import pandas as pd
+from sklearn.decomposition import PCA
+from gensim.models import KeyedVectors
+
+def load_embeddings(path, binary=False, randomPCA=False, limit=None):
+    if randomPCA:
+        pca = PCA(n_components=2,
+                  copy=False,
+                  whiten=False,
+                  svd_solver='randomized',
+                  iterated_power='auto'
+                  )
+    else:
+        pca = PCA(n_components=2)
+
+    model = KeyedVectors.load_word2vec_format(path, binary=binary, limit=limit)
+
+    # Cased vocab
+    cased_words = model.vocab.keys()
+
+    # Normalized vectors
+    model.init_sims(replace=True)
+    cased_emb = [model[word] for word in cased_words]
+
+    # PCA reduction
+    cased_pca = pca.fit_transform(cased_emb)
+
+    df_cased = pd.DataFrame(
+        zip(
+            cased_words,
+            cased_emb,
+            cased_pca
+        ),
+        columns=['word', 'embedding', 'pca']
+    )
+
+    df_cased['word'] = df_cased.word.apply(lambda w: w.lower())
+    df_uncased = df_cased.drop_duplicates(subset='word')
+    df_uncased.to_json(path[:-3] + 'json')
+
+load_embeddings('./wiki-news-300d-1M.vec', limit=10000)
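The script drops the ".vec" suffix and writes wiki-news-300d-1M.json with word, embedding, and pca columns, the same layout Embedding (modules/model_embbeding.py) later reads from the zipped subsets. Note that model.vocab and init_sims target the gensim 3.x API; gensim 4 replaced them with key_to_index and norm-filling methods. A quick sketch of inspecting the output (assumes the script above was run in data/):

    import pandas as pd

    df = pd.read_json('./wiki-news-300d-1M.json')
    print(df.columns.tolist())   # ['word', 'embedding', 'pca']
    print(len(df))               # at most 10000, fewer after lowercase de-duplication
    word, emb, xy = df.iloc[0]
    print(word, len(emb), xy)    # a word, its 300-dim vector, its 2-d PCA coordinates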
data/fasttext_embedding_v6.zip ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c35f3dda1d216d9baed3fc77f3b6bb51130f07faf0ee418029344635a0b732b7
+size 165727812
data/mini_embedding_v6.zip ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6fa1594f66f29388719f9125eebdd529054f31bc9564e609d5162ba328a054be
+size 94479
data/wiki-news-300d-1M.vec ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dd4d0ea4f00dbd94ea4948957506f5c6601dd06c54150f898ce1acc15621284b
+size 2259088777
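These three entries are Git LFS pointer files: the repository stores only the object's sha256 and byte size, and `git lfs pull` fetches the real payload. A minimal sketch of verifying a fetched file against its pointer (standard library only):

    import hashlib

    def verify_lfs_object(path, expected_oid, expected_size):
        """Check a fetched file against the oid/size recorded in its LFS pointer."""
        h = hashlib.sha256()
        size = 0
        with open(path, 'rb') as f:
            for chunk in iter(lambda: f.read(1 << 20), b''):
                h.update(chunk)
                size += len(chunk)
        return h.hexdigest() == expected_oid and size == expected_size

    # e.g. for the mini subset above:
    # verify_lfs_object('data/mini_embedding_v6.zip',
    #                   '6fa1594f66f29388719f9125eebdd529054f31bc9564e609d5162ba328a054be',
    #                   94479)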
examples/.gitignore ADDED
@@ -0,0 +1 @@
+__pycache__
examples/examples.py ADDED
@@ -0,0 +1,122 @@
+example_fem = {
+    "mujer": "la mente de una mujer que durante los últimos",
+    "chica": "enamorado de la misma chica desde la infancia mary",
+    "ella": "ella llego a la final",
+    "madre": "su padre y su madre margarita de parma",
+    "hija": "hija de inmigrantes españoles en",
+    "femenino": "campeonato mundial de voleibol femenino fue la duodécima edición",
+}
+example_joven = {
+    "joven": "",
+    "inmaduro": "",
+    "niño": "",
+    "crio": ""
+}
+example_viejo = {
+    "viejo": "",
+    "maduro": "",
+    "anciano": "",
+    "adulto": ""
+}
+
+
+example_masc = {
+    "hombre": "deseo innato que todo hombre tiene de comunicar su",
+    "chico": "fue un chico interesado en artes",
+    "el": "el parque nacional liwonde",
+    "padre": "la muerte de su padre en 1832 se formó",
+    "hijo": "le dice a su hijo aún no nacido como",
+    "masculino": "el mito es esencialmente masculino y entre las causas",
+}
+
+example_diagnose = {
+    "ario": "establecer que el pueblo ario vivió en inmemoriales tiempos",
+    "educación": "sentido de vida religión educación y cultura para cada mujer",
+    "pagado": "un rescate muy grande pagado por sus seguidores a",
+    "cocinar": "empezó a cocinar una sopa usando",
+    "lavar": "era directamente usado para lavar ropa por eso la",
+    "deporte": "se convirtió en el deporte más popular del país",
+    "ropa": "usan el kimono una ropa tradicional japonesa",
+    "pelea": "mal por la violenta pelea entre ambos hermanos",
+    "enfermero": "en enfermería el diagnóstico enfermero o diagnóstico de enfermería es",
+    "ganar": "una necesidad un modo de ganar",
+    "líder": "del estado en manos del líder opositor henrique capriles para el",
+    "coser": "realizar tareas domésticas básicas como coser y poner la mesa",
+    "cuidar": "de la fpf encargada de cuidar los intereses de los clubes",
+    "cirujano": "afrancesado ocupando el puesto de cirujano militar en el ejército josefino",
+    "rey": "la princesa jeongsung esposa del rey danjong que ascendió al trono",
+    "reina": "año ganó el título de reina de la bahía en el"
+}
+
+
+fem_words = ','.join([word for word, context in example_fem.items()])
+fem_contexts = ','.join([context for word, context in example_fem.items()])
+masc_words = ','.join([word for word, context in example_masc.items()])
+masc_contexts = ','.join([context for word, context in example_masc.items()])
+young_words = ','.join([word for word, context in example_joven.items()])
+old_words = ','.join([word for word, context in example_viejo.items()])
+diagnose_words = ','.join([word for word, context in example_diagnose.items()])
+diagnose_contexts = ','.join([context for word, context in example_diagnose.items()])
+
+positive_money_words = 'ahorrar,economizar,administrar,manejar,negocio,beneficios'
+negative_money_words = 'malgastar,derrochar'
+diagnose_money = 'alemán,australiano,argentino,millonario,rico,pobre'
+
+lazy_words = 'vago, perezoso, gandul'
+active_words = 'trabajar, esfuerzo, trabajador'
+
+examples1_explorar_sesgo_en_palabras = [
+    [fem_words, masc_words, diagnose_words],
+    [old_words, young_words, diagnose_words],
+    [positive_money_words, negative_money_words, diagnose_money],
+    [lazy_words, active_words, diagnose_money]
+]
+
+examples2_explorar_sesgo_en_palabras = [
+    [fem_words, masc_words, young_words, old_words, diagnose_words],
+    [lazy_words, active_words, positive_money_words, negative_money_words, diagnose_money],
+]
+
+
+examples_explorar_relaciones_entre_palabras = [
+    [diagnose_words, fem_words, masc_words, young_words, old_words],
+    [diagnose_money, lazy_words, active_words, positive_money_words, negative_money_words],
+]
+
+
+# Examples: Datos
+examples_datos = ["ronronear", "ella", "el", "ellos"]
+
+
+# Examples: Sesgos en frases
+examples_sesgos_frases = [
+    ["los * manejan bien la plata", "argentinos,pobres,millonarios,australianos"],
+    ["los cordobeses bailan *", ""],
+    ["en la década de los ochenta, el * fue lo más escuchada en Argentina", "rock,tango,jazz,blues"],
+    ["", ""]
+]
+
+# Examples: Crows-Pairs
+examples_crows_pairs = [
+    ["Los hombres 'musculosos' tienen más suerte con las mujeres",
+     "Los hombres 'gordos' tienen más suerte con las mujeres",
+     "Los hombres 'delgados' tienen más suerte con las mujeres",
+     "Los hombres 'bajos' tienen más suerte con las mujeres",
+     "Los hombres 'rubios' tienen más suerte con las mujeres",
+     "Los hombres 'altos' tienen más suerte con las mujeres"],
+    ["No cualquier 'mujer' con hijos logra ser 'una buena' profesional",
+     "No cualquier 'hombre' con hijos logra ser 'un buen' profesional",
+     "", "", "", ""],
+    ["Todos sabemos que los 'mapuches' son alcohólicos.",
+     "Todos sabemos que los 'descendientes de europeos' son alcohólicos.",
+     "", "", "", ""],
+    ["Los 'orientales' no pueden manejar porque tiene los ojos achinados.",
+     "Los 'occidentales' no pueden manejar porque tiene los ojos achinados.",
+     "", "", "", ""],
+    ["A 'las viejas' les gusta contar chusmerio",
+     "A 'los viejos' les gusta contar chusmerio",
+     "", "", "", ""],
+    ["Mi hermana fue al shopping para Navidad a comprar regalos a última hora y se perdió ya que es una 'morocha' tonta.",
+     "Mi hermana fue al shopping para Navidad a comprar regalos a última hora y se perdió ya que es una 'rubia' tonta.",
+     "", "", "", ""]
+]
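Each dict maps a Spanish word to a sample corpus context; the joins above flatten them into the comma-separated strings that the gradio Examples tables and Textbox fields expect. For instance (dict order is insertion order in Python 3.7+):

    from examples.examples import fem_words, masc_words

    print(fem_words)   # 'mujer,chica,ella,madre,hija,femenino'
    print(masc_words)  # 'hombre,chico,el,padre,hijo,masculino'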
interfaces/interface_BiasWordExplorer.py ADDED
@@ -0,0 +1,104 @@
+import gradio as gr
+import pandas as pd
+
+from tool_info import TOOL_INFO
+from modules.module_logsManager import HuggingFaceDatasetSaver
+from modules.module_connection import BiasWordExplorerConnector
+from examples.examples import examples1_explorar_sesgo_en_palabras, examples2_explorar_sesgo_en_palabras
+
+# --- Interface ---
+def interface(embedding, available_logs, lang="spanish"):
+    # --- Init logs ---
+    log_callback = HuggingFaceDatasetSaver(
+        available_logs=available_logs
+    )
+    # --- Init vars ---
+    connector = BiasWordExplorerConnector(embedding=embedding)
+    labels = pd.read_json(f"language/{lang}.json")["BiasWordExplorer_interface"]
+
+    interface = gr.Blocks()
+    with interface:
+        gr.Markdown(labels["step1"])
+        with gr.Row():
+            with gr.Column():
+                with gr.Row():
+                    diagnose_list = gr.Textbox(lines=2, label=labels["wordListToDiagnose"])
+                with gr.Row():
+                    gr.Markdown(labels["step2&2Spaces"])
+                with gr.Row():
+                    wordlist_1 = gr.Textbox(lines=2, label=labels["wordList1"])
+                    wordlist_2 = gr.Textbox(lines=2, label=labels["wordList2"])
+                with gr.Row():
+                    gr.Markdown(labels["step2&4Spaces"])
+                with gr.Row():
+                    wordlist_3 = gr.Textbox(lines=2, label=labels["wordList3"])
+                    wordlist_4 = gr.Textbox(lines=2, label=labels["wordList4"])
+            with gr.Column():
+                with gr.Row():
+                    bias2d = gr.Button(labels["plot2SpacesButton"])
+                with gr.Row():
+                    bias4d = gr.Button(labels["plot4SpacesButton"])
+                with gr.Row():
+                    err_msg = gr.Markdown(label='', visible=True)
+                with gr.Row():
+                    bias_plot = gr.Plot(label="", show_label=False)
+                with gr.Row():
+                    examples = gr.Examples(
+                        fn=connector.calculate_bias_2d,
+                        inputs=[wordlist_1, wordlist_2, diagnose_list],
+                        outputs=[bias_plot, err_msg],
+                        examples=examples1_explorar_sesgo_en_palabras,
+                        label=labels["examples2Spaces"]
+                    )
+                with gr.Row():
+                    examples = gr.Examples(
+                        fn=connector.calculate_bias_4d,
+                        inputs=[wordlist_1, wordlist_2,
+                                wordlist_3, wordlist_4, diagnose_list],
+                        outputs=[bias_plot, err_msg],
+                        examples=examples2_explorar_sesgo_en_palabras,
+                        label=labels["examples4Spaces"]
+                    )
+
+        with gr.Row():
+            gr.Markdown(TOOL_INFO)
+
+        bias2d.click(
+            fn=connector.calculate_bias_2d,
+            inputs=[wordlist_1, wordlist_2, diagnose_list],
+            outputs=[bias_plot, err_msg]
+        )
+
+        bias4d.click(
+            fn=connector.calculate_bias_4d,
+            inputs=[wordlist_1, wordlist_2, wordlist_3, wordlist_4, diagnose_list],
+            outputs=[bias_plot, err_msg]
+        )
+
+        # --- Logs ---
+        save_field = [wordlist_1, wordlist_2, wordlist_3, wordlist_4, diagnose_list]
+        log_callback.setup(components=save_field, flagging_dir="edia_bias_we_es")
+
+        bias2d.click(
+            fn=lambda *args: log_callback.flag(
+                flag_data=args,
+                flag_option="plot_2d",
+                username="vialibre"
+            ),
+            inputs=save_field,
+            outputs=None,
+            preprocess=False
+        )
+
+        bias4d.click(
+            fn=lambda *args: log_callback.flag(
+                flag_data=args,
+                flag_option="plot_4d",
+                username="vialibre"
+            ),
+            inputs=save_field,
+            outputs=None,
+            preprocess=False
+        )
+    return interface
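Both buttons are wired twice on purpose: the first .click calls the connector and fills outputs=[bias_plot, err_msg], while the second only flags the raw inputs to the logging dataset. The connector methods live in modules/module_connection.py (only partially shown in this commit view), so here is a hypothetical stand-in, not the real connector, illustrating the (figure, error-markdown) return contract the outputs wiring implies:

    import matplotlib.pyplot as plt

    def calculate_bias_2d_stub(wordlist_1, wordlist_2, diagnose_list):
        # hypothetical stand-in with the same return contract as the real connector
        if not (wordlist_1 and wordlist_2 and diagnose_list):
            return None, "<center><h3>Both bias lists and the diagnose list are required</h3></center>"
        fig, ax = plt.subplots()
        # ... project each diagnose word onto the wordlist_1/wordlist_2 direction ...
        return fig, ""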
interfaces/interface_WordExplorer.py ADDED
@@ -0,0 +1,113 @@
+import gradio as gr
+import pandas as pd
+import matplotlib.pyplot as plt
+
+from tool_info import TOOL_INFO
+from modules.module_connection import WordExplorerConnector
+from modules.module_logsManager import HuggingFaceDatasetSaver
+from examples.examples import examples_explorar_relaciones_entre_palabras
+
+plt.rcParams.update({'font.size': 14})
+
+def interface(embedding, available_logs, lang="spanish"):
+    # --- Init logs ---
+    log_callback = HuggingFaceDatasetSaver(
+        available_logs=available_logs
+    )
+    # --- Init vars ---
+    connector = WordExplorerConnector(embedding=embedding)
+    labels = pd.read_json(f"language/{lang}.json")["WordExplorer_interface"]
+
+    # --- Interface ---
+    interface = gr.Blocks()
+    with interface:
+        gr.Markdown(labels["title"])
+        with gr.Row():
+            with gr.Column(scale=3):
+                with gr.Row(equal_height=True):
+                    with gr.Column(scale=5):
+                        diagnose_list = gr.Textbox(lines=2, label=labels["wordListToDiagnose"])
+                    with gr.Column(scale=1, min_width=10):
+                        color_wordlist = gr.ColorPicker(label="", value='#000000')
+                with gr.Row():
+                    with gr.Column(scale=5):
+                        wordlist_1 = gr.Textbox(lines=2, label=labels["wordList1"])
+                    with gr.Column(scale=1, min_width=10):
+                        color_wordlist_1 = gr.ColorPicker(label="", value='#1f78b4')
+                with gr.Row():
+                    with gr.Column(scale=5):
+                        wordlist_2 = gr.Textbox(lines=2, label=labels["wordList2"])
+                    with gr.Column(scale=1, min_width=10):
+                        color_wordlist_2 = gr.ColorPicker(label="", value='#33a02c')
+                with gr.Row():
+                    with gr.Column(scale=5):
+                        wordlist_3 = gr.Textbox(lines=2, label=labels["wordList3"])
+                    with gr.Column(scale=1, min_width=10):
+                        color_wordlist_3 = gr.ColorPicker(label="", value='#e31a1c')
+                with gr.Row():
+                    with gr.Column(scale=5):
+                        wordlist_4 = gr.Textbox(lines=2, label=labels["wordList4"])
+                    with gr.Column(scale=1, min_width=10):
+                        color_wordlist_4 = gr.ColorPicker(label="", value='#6a3d9a')
+            with gr.Column(scale=4):
+                with gr.Row():
+                    with gr.Row():
+                        gr.Markdown(labels["plotNeighbours"]["title"])
+                        n_neighbors = gr.Slider(minimum=0, maximum=100, step=1, label=labels["plotNeighbours"]["quantity"])
+                with gr.Row():
+                    alpha = gr.Slider(minimum=0.1, maximum=0.9, value=0.3, step=0.1, label=labels["options"]["transparency"])
+                    fontsize = gr.Number(value=18, label=labels["options"]["font-size"])
+                with gr.Row():
+                    btn_plot = gr.Button(labels["plot_button"])
+                with gr.Row():
+                    err_msg = gr.Markdown(label="", visible=True)
+                with gr.Row():
+                    word_proyections = gr.Plot(label="", show_label=False)
+
+        with gr.Row():
+            gr.Examples(
+                fn=connector.plot_proyection_2d,
+                inputs=[diagnose_list, wordlist_1, wordlist_2, wordlist_3, wordlist_4],
+                outputs=[word_proyections, err_msg],
+                examples=examples_explorar_relaciones_entre_palabras,
+                label=labels["examples"]
+            )
+
+        with gr.Row():
+            gr.Markdown(TOOL_INFO)
+
+        btn_plot.click(
+            fn=connector.plot_proyection_2d,
+            inputs=[
+                diagnose_list,
+                wordlist_1,
+                wordlist_2,
+                wordlist_3,
+                wordlist_4,
+                color_wordlist,
+                color_wordlist_1,
+                color_wordlist_2,
+                color_wordlist_3,
+                color_wordlist_4,
+                alpha,
+                fontsize,
+                n_neighbors
+            ],
+            outputs=[word_proyections, err_msg]
+        )
+
+        # --- Logs ---
+        save_field = [diagnose_list, wordlist_1, wordlist_2, wordlist_3, wordlist_4]
+        log_callback.setup(components=save_field, flagging_dir="edia_we_es")
+
+        btn_plot.click(
+            fn=lambda *args: log_callback.flag(
+                flag_data=args,
+                flag_option="explorar_palabras",
+                username="vialibre",
+            ),
+            inputs=save_field,
+            outputs=None,
+            preprocess=False
+        )
+    return interface
language/english.json ADDED
@@ -0,0 +1,91 @@
+{
+    "app": {
+        "wordExplorer": "Word explorer",
+        "biasWordExplorer": "Word bias",
+        "dataExplorer": "Data bias",
+        "phraseExplorer": "Phrase bias",
+        "crowsPairsExplorer": "Crows-Pairs"
+    },
+    "WordExplorer_interface": {
+        "title": "Write some words to visualize their related ones",
+        "wordList1": "Word list 1",
+        "wordList2": "Word list 2",
+        "wordList3": "Word list 3",
+        "wordList4": "Word list 4",
+        "wordListToDiagnose": "List of words to be diagnosed",
+        "plotNeighbours": {
+            "title": "Plot neighbouring words",
+            "quantity": "Quantity"
+        },
+        "options": {
+            "font-size": "Font size",
+            "transparency": "Transparency"
+        },
+        "plot_button": "Plot in the space!",
+        "examples": "Examples"
+    },
+    "BiasWordExplorer_interface": {
+        "step1": "1. Write comma-separated words to be diagnosed",
+        "step2&2Spaces": "2. For plotting 2 spaces, fill in the following lists:",
+        "step2&4Spaces": "2. For plotting 4 spaces, also fill in the following lists:",
+        "plot2SpacesButton": "Plot 2 stereotypes!",
+        "plot4SpacesButton": "Plot 4 stereotypes!",
+        "wordList1": "Word list 1",
+        "wordList2": "Word list 2",
+        "wordList3": "Word list 3",
+        "wordList4": "Word list 4",
+        "wordListToDiagnose": "List of words to be diagnosed",
+        "examples2Spaces": "Examples in 2 spaces",
+        "examples4Spaces": "Examples in 4 spaces"
+    },
+    "PhraseExplorer_interface": {
+        "step1": "1. Enter a sentence",
+        "step2": "2. Enter words of interest (Optional)",
+        "step3": "3. Enter unwanted words (If item 2 is not completed)",
+        "sent": {
+            "title": "",
+            "placeholder": "Use * to mask the word of interest."
+        },
+        "wordList": {
+            "title": "",
+            "placeholder": "The words in the list must be comma separated"
+        },
+        "bannedWordList": {
+            "title": "",
+            "placeholder": "The words in the list must be comma separated"
+        },
+        "excludeArticles": "Exclude articles",
+        "excludePrepositions": "Exclude prepositions",
+        "excludeConjunctions": "Exclude conjunctions",
+        "resultsButton": "Get",
+        "plot": "Display of proportions",
+        "examples": "Examples"
+    },
+    "DataExplorer_interface": {
+        "step1": "1. Enter a word of interest",
+        "step2": "2. Select the maximum number of contexts to retrieve",
+        "step3": "3. Select sets of interest",
+        "inputWord": {
+            "title": "",
+            "placeholder": "Enter the word ..."
+        },
+        "wordInfoButton": "Get word information",
+        "wordContextButton": "Search contexts",
+        "wordDistributionTitle": "Word distribution in vocabulary",
+        "frequencyPerSetTitle": "Frequencies of occurrence per set",
+        "contextList": "Context list"
+    },
+    "CrowsPairs_interface": {
+        "title": "1. Enter sentences to compare",
+        "sent0": "Sentence Nº 1 (*)",
+        "sent1": "Sentence Nº 2 (*)",
+        "sent2": "Sentence Nº 3 (Optional)",
+        "sent3": "Sentence Nº 4 (Optional)",
+        "sent4": "Sentence Nº 5 (Optional)",
+        "sent5": "Sentence Nº 6 (Optional)",
+        "commonPlacholder": "Use < and > to highlight word(s) of interest",
+        "compareButton": "Compare",
+        "plot": "Display of proportions",
+        "examples": "Examples"
+    }
+}
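app.py and both interface modules load these files with pandas: read_json on a dict-of-dicts yields a DataFrame whose columns are the top-level sections, so indexing by section name gives a key-to-string Series. A quick sketch:

    import pandas as pd

    labels = pd.read_json("language/english.json")["app"]
    print(labels["wordExplorer"])      # 'Word explorer'
    print(labels["biasWordExplorer"])  # 'Word bias'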
language/spanish.json ADDED
@@ -0,0 +1,91 @@
+{
+    "app": {
+        "wordExplorer": "Explorar palabras",
+        "biasWordExplorer": "Sesgo en palabras",
+        "dataExplorer": "Sesgo en datos",
+        "phraseExplorer": "Sesgo en frases",
+        "crowsPairsExplorer": "Crows-Pairs"
+    },
+    "WordExplorer_interface": {
+        "title": "Escribí algunas palabras para visualizar sus palabras relacionadas",
+        "wordList1": "Lista de palabras 1",
+        "wordList2": "Lista de palabras 2",
+        "wordList3": "Lista de palabras 3",
+        "wordList4": "Lista de palabras 4",
+        "wordListToDiagnose": "Lista de palabras a diagnosticar",
+        "plotNeighbours": {
+            "title": "Graficar palabras relacionadas",
+            "quantity": "Cantidad"
+        },
+        "options": {
+            "font-size": "Tamaño de fuente",
+            "transparency": "Transparencia"
+        },
+        "plot_button": "¡Graficar en el espacio!",
+        "examples": "Ejemplos"
+    },
+    "BiasWordExplorer_interface": {
+        "step1": "1. Escribí palabras para diagnosticar separadas por comas",
+        "step2&2Spaces": "2. Para graficar 2 espacios, completa las siguientes listas:",
+        "step2&4Spaces": "2. Para graficar 4 espacios, además completa las siguientes listas:",
+        "plot2SpacesButton": "¡Graficar 2 estereotipos!",
+        "plot4SpacesButton": "¡Graficar 4 estereotipos!",
+        "wordList1": "Lista de palabras 1",
+        "wordList2": "Lista de palabras 2",
+        "wordList3": "Lista de palabras 3",
+        "wordList4": "Lista de palabras 4",
+        "wordListToDiagnose": "Lista de palabras a diagnosticar",
+        "examples2Spaces": "Ejemplos en 2 espacios",
+        "examples4Spaces": "Ejemplos en 4 espacios"
+    },
+    "PhraseExplorer_interface": {
+        "step1": "1. Ingrese una frase",
+        "step2": "2. Ingrese palabras de interés (Opcional)",
+        "step3": "3. Ingrese palabras no deseadas (En caso de no completar punto 2)",
+        "sent": {
+            "title": "",
+            "placeholder": "Utilice * para enmascarar la palabra de interés"
+        },
+        "wordList": {
+            "title": "",
+            "placeholder": "La lista de palabras deberán estar separadas por ,"
+        },
+        "bannedWordList": {
+            "title": "",
+            "placeholder": "La lista de palabras deberán estar separadas por ,"
+        },
+        "excludeArticles": "Excluir Artículos",
+        "excludePrepositions": "Excluir Preposiciones",
+        "excludeConjunctions": "Excluir Conjunciones",
+        "resultsButton": "Obtener",
+        "plot": "Visualización de proporciones",
+        "examples": "Ejemplos"
+    },
+    "DataExplorer_interface": {
+        "step1": "1. Ingrese una palabra de interés",
+        "step2": "2. Seleccione cantidad máxima de contextos a recuperar",
+        "step3": "3. Seleccione conjuntos de interés",
+        "inputWord": {
+            "title": "",
+            "placeholder": "Ingresar aquí la palabra ..."
+        },
+        "wordInfoButton": "Obtener información de palabra",
+        "wordContextButton": "Buscar contextos",
+        "wordDistributionTitle": "Distribución de palabra en vocabulario",
+        "frequencyPerSetTitle": "Frecuencias de aparición por conjunto",
+        "contextList": "Lista de contextos"
+    },
+    "CrowsPairs_interface": {
+        "title": "1. Ingrese frases a comparar",
+        "sent0": "Frase Nº 1 (*)",
+        "sent1": "Frase Nº 2 (*)",
+        "sent2": "Frase Nº 3 (Opcional)",
+        "sent3": "Frase Nº 4 (Opcional)",
+        "sent4": "Frase Nº 5 (Opcional)",
+        "sent5": "Frase Nº 6 (Opcional)",
+        "commonPlacholder": "Utilice comillas simples ' ' para destacar palabra/as de interés",
+        "compareButton": "Comparar",
+        "plot": "Visualización de proporciones",
+        "examples": "Ejemplos"
+    }
+}
modules/model_embbeding.py ADDED
@@ -0,0 +1,93 @@
+import operator
+import numpy as np
+import pandas as pd
+from numpy import dot
+from gensim import matutils
+from modules.module_ann import Ann
+from memory_profiler import profile
+from sklearn.neighbors import NearestNeighbors
+
+
+class Embedding:
+    @profile
+    def __init__(self, subset_name):
+        # Dataset info
+        self.ds_subset = subset_name
+        self.ds_path = f"data/{subset_name}_embedding_v6.zip"
+
+        # Pandas dataset
+        self.ds = None
+
+        # All words' embeddings: List[List[float]]
+        self.embedding = None
+
+        # Approximate Nearest Neighbors estimator
+        self.ann = None
+
+        # Load embedding and pca dataset
+        self.__load()
+
+    def __contains__(self, word):
+        return word in self.ds['word'].to_list()
+
+    def __load(self):
+        print(f"Preparing {self.ds_subset} embedding...")
+
+        # --- Load dataset ---
+        self.ds = pd.read_json(self.ds_path)
+
+        # --- Get embeddings as a list ---
+        self.embedding = self.ds['embedding'].to_list()
+
+        # --- Build forest of trees to estimate nearest neighbors ---
+        self.ann = Ann(
+            words=self.ds['word'],
+            vectors=self.ds['embedding'],
+            coord=self.ds['pca']
+        )
+        self.ann.init(n_trees=20, metric='dot', n_jobs=-1)
+
+        # --- Fit sklearn NN method ---
+        self.neigh = NearestNeighbors(n_neighbors=20)
+        self.neigh.fit(self.embedding)
+
+    def __getValue(self, word, feature):
+        word_id, value = None, None
+
+        if word in self:
+            word_id = self.ds['word'].to_list().index(word)
+
+        if word_id is not None:
+            value = self.ds[feature].to_list()[word_id]
+
+        return value
+
+    def getEmbedding(self, word):
+        return self.__getValue(word, 'embedding')
+
+    def getPCA(self, word):
+        return self.__getValue(word, 'pca')
+
+    def cosineSimilarities(self, vector_1, vectors_all):
+        norm = np.linalg.norm(vector_1)
+        all_norms = np.linalg.norm(vectors_all, axis=1)
+        dot_products = dot(vectors_all, vector_1)
+        similarities = dot_products / (norm * all_norms)
+        return similarities
+
+    def getNearestNeighbors(self, word, n_neighbors=10, nn_method='sklearn'):
+        if nn_method == 'ann':
+            words = self.ann.get(word, n_neighbors)
+        elif nn_method == 'sklearn':
+            word_emb = self.getEmbedding(word)
+            neighbors = self.neigh.kneighbors([word_emb], n_neighbors)[1][0]
+            words = operator.itemgetter(*neighbors)(self.ds['word'])
+        else:
+            words = []
+        return words
+
+    def getCosineSimilarities(self, w1, w2):
+        return dot(
+            matutils.unitvec(self.getEmbedding(w1)),
+            matutils.unitvec(self.getEmbedding(w2))
+        )
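A minimal usage sketch (assumes data/mini_embedding_v6.zip from this commit is present and that memory_profiler is installed for the @profile decorator; the query word is illustrative and must be in the loaded vocabulary):

    from modules.model_embbeding import Embedding

    emb = Embedding(subset_name="mini")
    if "mujer" in emb:                                 # __contains__ over the 'word' column
        print(len(emb.getEmbedding("mujer")))          # 300-dim fastText vector
        print(emb.getPCA("mujer"))                     # 2-d coordinates for plotting
        print(emb.getNearestNeighbors("mujer", n_neighbors=5, nn_method="ann"))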
modules/module_BiasExplorer.py ADDED
@@ -0,0 +1,631 @@
+import copy
+import numpy as np
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+from sklearn.decomposition import PCA
+
+def take_two_sides_extreme_sorted(df, n_extreme,
+                                  part_column=None,
+                                  head_value='',
+                                  tail_value=''):
+    head_df = df.head(n_extreme)[:]
+    tail_df = df.tail(n_extreme)[:]
+
+    if part_column is not None:
+        head_df[part_column] = head_value
+        tail_df[part_column] = tail_value
+
+    return (pd.concat([head_df, tail_df])
+            .drop_duplicates()
+            .reset_index(drop=True))
+
+def normalize(v):
+    """Normalize a 1-D vector."""
+    if v.ndim != 1:
+        raise ValueError('v should be 1-D, {}-D was given'.format(v.ndim))
+    norm = np.linalg.norm(v)
+    if norm == 0:
+        return v
+    return v / norm
+
+def project_params(u, v):
+    """Project and reject the vector v onto direction u, with scalar projection."""
+    normalize_u = normalize(u)
+    projection = (v @ normalize_u)
+    projected_vector = projection * normalize_u
+    rejected_vector = v - projected_vector
+    return projection, projected_vector, rejected_vector
+
+
+def cosine_similarity(v, u):
+    """Calculate the cosine similarity between two vectors."""
+    v_norm = np.linalg.norm(v)
+    u_norm = np.linalg.norm(u)
+    similarity = v @ u / (v_norm * u_norm)
+    return similarity
+
+
+DIRECTION_METHODS = ['single', 'sum', 'pca']
+DEBIAS_METHODS = ['neutralize', 'hard', 'soft']
+FIRST_PC_THRESHOLD = 0.5
+MAX_NON_SPECIFIC_EXAMPLES = 1000
+
+__all__ = ['GenderBiasWE', 'BiasWordEmbedding']
+
+
+class WordBiasExplorer():
+    def __init__(self, vocabulary):
+        # pylint: disable=undefined-variable
+        self.vocabulary = vocabulary
+        self.direction = None
+        self.positive_end = None
+        self.negative_end = None
+
+    def __copy__(self):
+        bias_word_embedding = self.__class__(self.vocabulary)
+        bias_word_embedding.direction = copy.deepcopy(self.direction)
+        bias_word_embedding.positive_end = copy.deepcopy(self.positive_end)
+        bias_word_embedding.negative_end = copy.deepcopy(self.negative_end)
+        return bias_word_embedding
+
+    def __deepcopy__(self, memo):
+        bias_word_embedding = copy.copy(self)
+        bias_word_embedding.model = copy.deepcopy(bias_word_embedding.model)
+        return bias_word_embedding
+
+    def __getitem__(self, key):
+        return self.vocabulary.getEmbedding(key)
+
+    def __contains__(self, item):
+        return item in self.vocabulary
+
+    def _is_direction_identified(self):
+        if self.direction is None:
+            raise RuntimeError('The direction was not identified'
+                               ' for this {} instance'
+                               .format(self.__class__.__name__))
+
+    def _identify_subspace_by_pca(self, definitional_pairs, n_components):
+        matrix = []
+
+        for word1, word2 in definitional_pairs:
+            vector1 = normalize(self[word1])
+            vector2 = normalize(self[word2])
+
+            center = (vector1 + vector2) / 2
+
+            matrix.append(vector1 - center)
+            matrix.append(vector2 - center)
+
+        pca = PCA(n_components=n_components)
+        pca.fit(matrix)
+        return pca
+
+    def _identify_direction(self, positive_end, negative_end,
+                            definitional, method='pca'):
+        if method not in DIRECTION_METHODS:
+            raise ValueError('method should be one of {}, {} was given'.format(
+                DIRECTION_METHODS, method))
+
+        if positive_end == negative_end:
+            raise ValueError('positive_end and negative_end '
+                             'should be different, and not the same "{}"'
+                             .format(positive_end))
+        direction = None
+
+        if method == 'single':
+            direction = normalize(normalize(self[definitional[0]])
+                                  - normalize(self[definitional[1]]))
+
+        elif method == 'sum':
+            group1_sum_vector = np.sum([self[word]
+                                        for word in definitional[0]], axis=0)
+            group2_sum_vector = np.sum([self[word]
+                                        for word in definitional[1]], axis=0)
+
+            diff_vector = (normalize(group1_sum_vector)
+                           - normalize(group2_sum_vector))
+
+            direction = normalize(diff_vector)
+
+        elif method == 'pca':
+            pca = self._identify_subspace_by_pca(definitional, 10)
+            if pca.explained_variance_ratio_[0] < FIRST_PC_THRESHOLD:
+                raise RuntimeError('The explained variance '
+                                   'of the first principal component should be '
+                                   'at least {}, but it is {}'
+                                   .format(FIRST_PC_THRESHOLD,
+                                           pca.explained_variance_ratio_[0]))
+            direction = pca.components_[0]
+
+            # if direction is opposite (e.g. we cannot control
+            # what the PCA will return)
+            ends_diff_projection = cosine_similarity((self[positive_end]
+                                                      - self[negative_end]),
+                                                     direction)
+            if ends_diff_projection < 0:
+                direction = -direction  # pylint: disable=invalid-unary-operand-type
+
+        self.direction = direction
+        self.positive_end = positive_end
+        self.negative_end = negative_end
+
+    def project_on_direction(self, word):
+        """Project the normalized vector of the word on the direction.
+        :param str word: The word to project
+        :return float: The projection scalar
+        """
+        self._is_direction_identified()
+
+        vector = self[word]
+        projection_score = self.vocabulary.cosineSimilarities(self.direction,
+                                                              [vector])[0]
+        return projection_score
+
+    def _calc_projection_scores(self, words):
+        self._is_direction_identified()
+
+        df = pd.DataFrame({'word': words})
+
+        # TODO: maybe using cosine_similarities on all the vectors?
+        # it might be faster
+        df['projection'] = df['word'].apply(self.project_on_direction)
+        df = df.sort_values('projection', ascending=False)
+
+        return df
+
+    def calc_projection_data(self, words):
+        """
+        Calculate projection, projected and rejected vectors of a word list.
+        :param list words: List of words
+        :return: :class:`pandas.DataFrame` of the projection,
+                 projected and rejected vectors of the word list
+        """
+        projection_data = []
+        for word in words:
+            vector = self[word]
+            normalized_vector = normalize(vector)
+
+            (projection,
+             projected_vector,
+             rejected_vector) = project_params(normalized_vector,
+                                               self.direction)
+
+            projection_data.append({'word': word,
+                                    'vector': vector,
+                                    'projection': projection,
+                                    'projected_vector': projected_vector,
+                                    'rejected_vector': rejected_vector})
+
+        return pd.DataFrame(projection_data)
+
+    def plot_dist_projections_on_direction(self, word_groups, ax=None):
+        """Plot the distribution of projection scalars on the direction.
+        :param dict word_groups: The word groups to project
+        :return: The ax object of the plot
+        """
+        if ax is None:
+            _, ax = plt.subplots(1)
+
+        names = sorted(word_groups.keys())
+
+        for name in names:
+            words = word_groups[name]
+            label = '{} (#{})'.format(name, len(words))
+            vectors = [self[word] for word in words]
+            projections = self.vocabulary.cosineSimilarities(self.direction,
+                                                             vectors)
+            sns.distplot(projections, hist=False, label=label, ax=ax)
+
+        plt.axvline(0, color='k', linestyle='--')
+
+        plt.title('← {} {} {} →'.format(self.negative_end,
+                                        ' ' * 20,
+                                        self.positive_end))
+        plt.xlabel('Direction Projection')
+        plt.ylabel('Density')
+        ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
+
+        return ax
+
+    def __errorChecking(self, word):
+        out_msj = ""
+
+        if not word:
+            out_msj = "Error: Primero debe ingresar una palabra!"
+        else:
+            if word not in self.vocabulary:
+                out_msj = f"Error: La palabra '<b>{word}</b>' no se encuentra en el vocabulario!"
+
+        return out_msj
+
+    def check_oov(self, wordlists):
+        for wordlist in wordlists:
+            for word in wordlist:
+                msg = self.__errorChecking(word)
+                if msg:
+                    return msg
+        return None
+
+    def plot_biased_words(self,
+                          words_to_diagnose,
+                          wordlist_right,
+                          wordlist_left,
+                          wordlist_top=[],
+                          wordlist_bottom=[]
+                          ):
+        bias_2D = wordlist_top == [] and wordlist_bottom == []
+
+        if bias_2D and (not wordlist_right or not wordlist_left):
+            raise Exception('For bar plot, wordlist right and left can NOT be empty')
+        elif not bias_2D and (not wordlist_right or not wordlist_left or not wordlist_top or not wordlist_bottom):
+            raise Exception('For plane plot, wordlist right, left, top and bottom can NOT be empty')
+
+        err = self.check_oov([words_to_diagnose + wordlist_right + wordlist_left + wordlist_top + wordlist_bottom])
+        if err:
+            raise Exception(err)
+
+        return self.get_bias_plot(bias_2D,
+                                  words_to_diagnose,
+                                  definitional_1=(wordlist_right, wordlist_left),
+                                  definitional_2=(wordlist_top, wordlist_bottom)
+                                  )
+
+    def get_bias_plot(self,
+                      plot_2D,
+                      words_to_diagnose,
+                      definitional_1,
+                      definitional_2=([], []),
+                      method='sum',
+                      n_extreme=10,
+                      figsize=(15, 10)
+                      ):
+        fig, ax = plt.subplots(1, figsize=figsize)
+        self.method = method
+        self.plot_projection_scores(plot_2D, words_to_diagnose, definitional_1, definitional_2, n_extreme, ax)
+
+        if plot_2D:
+            fig.tight_layout()
+            fig.canvas.draw()
+
+        return fig
+
+    def plot_projection_scores(self,
+                               plot_2D,
+                               words,
+                               definitional_1,
+                               definitional_2=([], []),
+                               n_extreme=10,
+                               ax=None,
+                               axis_projection_step=0.1):
+        name_left = ', '.join(definitional_1[1])
+        name_right = ', '.join(definitional_1[0])
+
+        self._identify_direction(name_left, name_right, definitional=definitional_1, method='sum')
+        self._is_direction_identified()
+
+        projections_df = self._calc_projection_scores(words)
+        projections_df['projection_x'] = projections_df['projection'].round(2)
+
+        if not plot_2D:
+            name_top = ', '.join(definitional_2[1])
+            name_bottom = ', '.join(definitional_2[0])
+            self._identify_direction(name_top, name_bottom, definitional=definitional_2, method='sum')
+            self._is_direction_identified()
+
+            projections_df['projection_y'] = self._calc_projection_scores(words)['projection'].round(2)
+
+        if n_extreme is not None:
+            projections_df = take_two_sides_extreme_sorted(projections_df, n_extreme=n_extreme)
+
+        if ax is None:
+            _, ax = plt.subplots(1)
+
+        cmap = plt.get_cmap('RdBu')
+        projections_df['color'] = (projections_df['projection'] + 0.5).apply(cmap)
+        most_extreme_projection = np.round(
+            projections_df['projection']
+            .abs()
+            .max(),
+            decimals=1)
+
+        if plot_2D:
+            sns.barplot(x='projection', y='word', data=projections_df,
+                        palette=projections_df['color'])
+        else:
+            sns.scatterplot(x='projection_x', y='projection_y', data=projections_df,
+                            palette=projections_df['color'])
+
+        plt.xticks(np.arange(-most_extreme_projection,
+                             most_extreme_projection + axis_projection_step,
+                             axis_projection_step))
+
+        x_label = '← {} {} {} →'.format(name_left,
+                                        ' ' * 20,
+                                        name_right)
+        if not plot_2D:
+            y_label = '← {} {} {} →'.format(name_top,
+                                            ' ' * 20,
+                                            name_bottom)
+            for _, row in projections_df.iterrows():
+                ax.annotate(row['word'], (row['projection_x'], row['projection_y']))
+
+        plt.xlabel(x_label)
+        plt.ylabel('Words')
+
+        if not plot_2D:
+            ax.xaxis.set_label_position('bottom')
+            ax.xaxis.set_label_coords(.5, 0)
+
+            plt.ylabel(y_label)
+            ax.yaxis.set_label_position('left')
+            ax.yaxis.set_label_coords(0, .5)
+
+            ax.spines['left'].set_position('center')
+            ax.spines['bottom'].set_position('center')
+
+            ax.set_xticks([])
+            ax.set_yticks([])
+
+        return ax
+
+
+# TODO: would be erased if we decide to keep all info in BiasWordExplorer
+class WEBiasExplorer2d(WordBiasExplorer):
+    def __init__(self, word_embedding) -> None:
+        super().__init__(word_embedding)
+
+    def calculate_bias(self,
+                       palabras_extremo_1,
+                       palabras_extremo_2,
+                       palabras_para_situar
+                       ):
+        wordlists = [palabras_extremo_1, palabras_extremo_2, palabras_para_situar]
+
+        err = self.check_oov(wordlists)
+        for wordlist in wordlists:
+            if not wordlist:
+                err = "<center><h3>" + 'Debe ingresar al menos 1 palabra en las listas de palabras a diagnosticar, sesgo 1 y sesgo 2' + "</h3></center>"
+        if err:
+            return None, err
+
+        im = self.get_bias_plot(
+            palabras_para_situar,
+            definitional=(
+                palabras_extremo_1, palabras_extremo_2),
+            method='sum',
+            n_extreme=10
+        )
+        return im, ''
+
+    def get_bias_plot(self,
+                      palabras_para_situar,
+                      definitional,
+                      method='sum',
+                      n_extreme=10,
+                      figsize=(10, 10)
+                      ):
+        fig, ax = plt.subplots(1, figsize=figsize)
+        self.method = method
+        self.plot_projection_scores(
+            definitional,
+            palabras_para_situar, n_extreme, ax=ax)
+
+        fig.tight_layout()
+        fig.canvas.draw()
+
+        data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
+        w, h = fig.canvas.get_width_height()
+        im = data.reshape((int(h), int(w), -1))
+        return im
+
+    def plot_projection_scores(self, definitional,
+                               words, n_extreme=10,
+                               ax=None, axis_projection_step=None):
+        """Plot the projection scalar of words on the direction.
+        :param list words: The words to project
+        :param int or None n_extreme: The number of extreme words to show
+        :return: The ax object of the plot
+        """
+        nombre_del_extremo_1 = ', '.join(definitional[0])
+        nombre_del_extremo_2 = ', '.join(definitional[1])
+
+        self._identify_direction(nombre_del_extremo_1, nombre_del_extremo_2,
+                                 definitional=definitional,
+                                 method='sum')
+
+        self._is_direction_identified()
+
+        projections_df = self._calc_projection_scores(words)
+        projections_df['projection'] = projections_df['projection'].round(2)
+
+        if n_extreme is not None:
+            projections_df = take_two_sides_extreme_sorted(projections_df,
+                                                           n_extreme=n_extreme)
+
+        if ax is None:
+            _, ax = plt.subplots(1)
+
+        if axis_projection_step is None:
+            axis_projection_step = 0.1
+
+        cmap = plt.get_cmap('RdBu')
+        projections_df['color'] = ((projections_df['projection'] + 0.5)
+                                   .apply(cmap))
+
+        most_extreme_projection = np.round(
+            projections_df['projection']
+            .abs()
+            .max(),
+            decimals=1)
+
+        sns.barplot(x='projection', y='word', data=projections_df,
+                    palette=projections_df['color'])
+
+        plt.xticks(np.arange(-most_extreme_projection,
+                             most_extreme_projection + axis_projection_step,
+                             axis_projection_step))
+        xlabel = '← {} {} {} →'.format(self.negative_end,
+                                       ' ' * 20,
+                                       self.positive_end)
+
+        plt.xlabel(xlabel)
+        plt.ylabel('Words')
+
+        return ax
+
+
+class WEBiasExplorer4d(WordBiasExplorer):
+    def __init__(self, word_embedding) -> None:
+        super().__init__(word_embedding)
+
+    def calculate_bias(self,
+                       palabras_extremo_1,
+                       palabras_extremo_2,
+                       palabras_extremo_3,
+                       palabras_extremo_4,
+                       palabras_para_situar
+                       ):
+        wordlists = [
+            palabras_extremo_1,
+            palabras_extremo_2,
+            palabras_extremo_3,
+            palabras_extremo_4,
+            palabras_para_situar
+        ]
+        err = self.check_oov(wordlists)
+        for wordlist in wordlists:
+            if not wordlist:
+                err = "<center><h3>" + \
+                    '¡Para graficar con 4 espacios, debe ingresar al menos 1 palabra en todas las listas!' + "</h3></center>"
+
+        if err:
+            return None, err
+
+        im = self.get_bias_plot(
+            palabras_para_situar,
+            definitional_1=(
+                palabras_extremo_1, palabras_extremo_2),
+            definitional_2=(
+                palabras_extremo_3, palabras_extremo_4),
+            method='sum',
+            n_extreme=10
+        )
+        return im, ''
+
+    def get_bias_plot(self,
+                      palabras_para_situar,
+                      definitional_1,
+                      definitional_2,
+                      method='sum',
+                      n_extreme=10,
+                      figsize=(10, 10)
+                      ):
+        fig, ax = plt.subplots(1, figsize=figsize)
+        self.method = method
+        self.plot_projection_scores(
+            definitional_1,
+            definitional_2,
+            palabras_para_situar, n_extreme, ax=ax)
+        fig.canvas.draw()
+
+        data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
+        w, h = fig.canvas.get_width_height()
+        im = data.reshape((int(h), int(w), -1))
+        return im
+
+    def plot_projection_scores(self, definitional_1, definitional_2,
+                               words, n_extreme=10,
+                               ax=None, axis_projection_step=None):
+        """Plot the projection scalar of words on the direction.
+        :param list words: The words to project
+        :param int or None n_extreme: The number of extreme words to show
+        :return: The ax object of the plot
+        """
+        nombre_del_extremo_1 = ', '.join(definitional_1[1])
+        nombre_del_extremo_2 = ', '.join(definitional_1[0])
+
+        self._identify_direction(nombre_del_extremo_1, nombre_del_extremo_2,
+                                 definitional=definitional_1,
+                                 method='sum')
+
+        self._is_direction_identified()
+
+        projections_df = self._calc_projection_scores(words)
+        projections_df['projection_x'] = projections_df['projection'].round(2)
+
+        nombre_del_extremo_3 = ', '.join(definitional_2[1])
+        nombre_del_extremo_4 = ', '.join(definitional_2[0])
+        self._identify_direction(nombre_del_extremo_3, nombre_del_extremo_4,
+                                 definitional=definitional_2,
+                                 method='sum')
+
+        self._is_direction_identified()
+
+        projections_df['projection_y'] = self._calc_projection_scores(words)[
+            'projection'].round(2)
+
+        if n_extreme is not None:
+            projections_df = take_two_sides_extreme_sorted(projections_df,
+                                                           n_extreme=n_extreme)
+
+        if ax is None:
+            _, ax = plt.subplots(1)
+
+        if axis_projection_step is None:
+            axis_projection_step = 0.1
+
+        cmap = plt.get_cmap('RdBu')
+        projections_df['color'] = ((projections_df['projection'] + 0.5)
+                                   .apply(cmap))
+        most_extreme_projection = np.round(
+            projections_df['projection']
+            .abs()
+            .max(),
+            decimals=1)
+        sns.scatterplot(x='projection_x', y='projection_y', data=projections_df,
+                        palette=projections_df['color'])
+
+        plt.xticks(np.arange(-most_extreme_projection,
+                             most_extreme_projection + axis_projection_step,
+                             axis_projection_step))
+        for _, row in projections_df.iterrows():
+            ax.annotate(
+                row['word'], (row['projection_x'], row['projection_y']))
+        x_label = '← {} {} {} →'.format(nombre_del_extremo_1,
+                                        ' ' * 20,
+                                        nombre_del_extremo_2)
+
+        y_label = '← {} {} {} →'.format(nombre_del_extremo_3,
+                                        ' ' * 20,
+                                        nombre_del_extremo_4)
+
+        plt.xlabel(x_label)
+        ax.xaxis.set_label_position('bottom')
+        ax.xaxis.set_label_coords(.5, 0)
+
+        plt.ylabel(y_label)
+        ax.yaxis.set_label_position('left')
+        ax.yaxis.set_label_coords(0, .5)
+
+        ax.spines['left'].set_position('center')
+        ax.spines['bottom'].set_position('center')
+
+        ax.set_xticks([])
+        ax.set_yticks([])
+        # plt.yticks([], [])
+        # ax.spines['left'].set_position('zero')
+        # ax.spines['bottom'].set_position('zero')
+
+        return ax
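A minimal usage sketch of the 2-space explorer (assumes an Embedding instance as defined in modules/model_embbeding.py; the word lists are illustrative and must be in the loaded vocabulary):

    from modules.model_embbeding import Embedding
    from modules.module_BiasExplorer import WEBiasExplorer2d

    emb = Embedding(subset_name="mini")
    explorer = WEBiasExplorer2d(emb)

    im, err = explorer.calculate_bias(
        ['mujer', 'chica'],            # extreme 1
        ['hombre', 'chico'],           # extreme 2
        ['enfermero', 'rey', 'reina']  # words to place on the bias axis
    )
    # im is an RGB numpy array of the rendered bar plot; err is '' on success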
modules/module_WordExplorer.py ADDED
@@ -0,0 +1,185 @@
+import numpy as np
+import pandas as pd
+import seaborn as sns
+from numpy.linalg import norm
+
+import matplotlib as mpl
+mpl.use('Agg')
+import matplotlib.pyplot as plt
+
+class WordToPlot:
+    def __init__(self, word, color, bias_space, alpha):
+        self.word = word
+        self.color = color
+        self.bias_space = bias_space
+        self.alpha = alpha
+
+class WordExplorer:
+    def __init__(self, vocabulary) -> None:
+        self.vocabulary = vocabulary
+
+    def __errorChecking(self, word):
+        out_msj = ""
+
+        if not word:
+            out_msj = "Error: Primero debe ingresar una palabra!"
+        else:
+            if word not in self.vocabulary:
+                out_msj = f"Error: La palabra '<b>{word}</b>' no se encuentra en el vocabulario!"
+
+        return out_msj
+
+    def parse_words(self, string):
+        words = string.strip()
+        if words:
+            words = [word.strip() for word in words.split(',') if word != ""]
+        return words
+
+    def check_oov(self, wordlists):
+        for wordlist in wordlists:
+            for word in wordlist:
+                msg = self.__errorChecking(word)
+                if msg:
+                    return msg
+        return None
+
+    def get_neighbors(self, word, n_neighbors, nn_method):
+        return self.vocabulary.getNearestNeighbors(word, n_neighbors, nn_method)
+
+    def get_df(self, words_embedded, processed_word_list):
+        df = pd.DataFrame(words_embedded)
+
+        df['word'] = [wtp.word for wtp in processed_word_list]
+        df['color'] = [wtp.color for wtp in processed_word_list]
+        df['alpha'] = [wtp.alpha for wtp in processed_word_list]
+        df['word_bias_space'] = [wtp.bias_space for wtp in processed_word_list]
+        return df
+
+    def get_plot(self,
+                 data,
+                 processed_word_list,
+                 words_embedded,
+                 color_dict,
+                 n_neighbors,
+                 n_alpha,
+                 fontsize=18,
+                 figsize=(20, 15)
+                 ):
+        fig, ax = plt.subplots(figsize=figsize)
+
+        sns.scatterplot(
+            data=data[data['alpha'] == 1],
+            x=0,
+            y=1,
+            style='word_bias_space',
+            hue='word_bias_space',
+            ax=ax,
+            palette=color_dict
+        )
+
+        if n_neighbors > 0:
+            sns.scatterplot(
+                data=data[data['alpha'] != 1],
+                x=0,
+                y=1,
+                style='color',
+                hue='word_bias_space',
+                ax=ax,
+                alpha=n_alpha,
+                legend=False,
+                palette=color_dict
+            )
+        for i, wtp in enumerate(processed_word_list):
+            x, y = words_embedded[i, :]
+            ax.annotate(wtp.word, xy=(x, y), xytext=(5, 2), color=wtp.color,
+                        textcoords='offset points',
+                        ha='right', va='bottom', size=fontsize, alpha=wtp.alpha)
+
+        ax.set_xticks([])
+        ax.set_yticks([])
+        ax.set_xlabel('')
+        ax.set_ylabel('')
+        fig.tight_layout()
+
+        return fig
+
+    def plot_projections_2d(self,
+                            wordlist_0,
+                            wordlist_1=[],
+                            wordlist_2=[],
+                            wordlist_3=[],
+                            wordlist_4=[],
+                            **kwargs
+                            ):
+        # convert the word lists to vectors
+        choices = [0, 1, 2, 3, 4]
+        wordlist_choice = [
+            wordlist_0,
+            wordlist_1,
+            wordlist_2,
+            wordlist_3,
+            wordlist_4
+        ]
+
+        err = self.check_oov(wordlist_choice)
+        if err:
+            raise Exception(err)
+
+        color_dict = {
+            0: kwargs.get('color_wordlist_0', '#000000'),
+            1: kwargs.get('color_wordlist_1', '#1f78b4'),
+            2: kwargs.get('color_wordlist_2', '#33a02c'),
+            3: kwargs.get('color_wordlist_3', '#e31a1c'),
+            4: kwargs.get('color_wordlist_4', '#6a3d9a')
+        }
+
+        n_neighbors = kwargs.get('n_neighbors', 0)
+        n_alpha = kwargs.get('n_alpha', 0.3)
+
+        processed_word_list = []
+        for word_list_to_process, color in zip(wordlist_choice, choices):
+            for word in word_list_to_process:
+                processed_word_list.append(WordToPlot(word, color_dict[color], color, 1))
+
+                if n_neighbors > 0:
+                    neighbors = self.get_neighbors(word,
+                                                   n_neighbors=n_neighbors + 1,
+                                                   nn_method=kwargs.get('nn_method', 'sklearn')
+                                                   )
+                    for n in neighbors:
+                        if n not in [wtp.word for wtp in processed_word_list]:
+                            processed_word_list.append(WordToPlot(n, color_dict[color], color, n_alpha))
+
+        if not processed_word_list:
+            raise Exception('Only empty lists were passed')
+
+        words_embedded = np.array([self.vocabulary.getPCA(wtp.word) for wtp in processed_word_list])
+
+        data = self.get_df(words_embedded, processed_word_list)
+
+        fig = self.get_plot(data, processed_word_list, words_embedded,
+                            color_dict, n_neighbors, n_alpha,
+                            kwargs.get('fontsize', 18),
+                            kwargs.get('figsize', (20, 15))
+                            )
+        plt.show()
+        return fig
+
+    def doesnt_match(self, wordlist):
+        err = self.check_oov([wordlist])
+        if err:
+            raise Exception(err)
+
+        words_emb = np.array([self.vocabulary.getEmbedding(word) for word in wordlist])
+        mean_vec = np.mean(words_emb, axis=0)
+
+        doesnt_match = ""
+        farthest_emb = 1.0
+        for word in wordlist:
+            word_emb = self.vocabulary.getEmbedding(word)
+            cos_sim = np.dot(mean_vec, word_emb) / (norm(mean_vec) * norm(word_emb))
+            if cos_sim <= farthest_emb:
+                farthest_emb = cos_sim
+                doesnt_match = word
+
+        return doesnt_match
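doesnt_match returns the word whose vector is least cosine-similar to the mean vector of the list, in the spirit of gensim's doesnt_match. A usage sketch (assumes an Embedding instance; the words are illustrative and must be in the vocabulary):

    from modules.model_embbeding import Embedding
    from modules.module_WordExplorer import WordExplorer

    explorer = WordExplorer(Embedding(subset_name="mini"))
    print(explorer.doesnt_match(['madre', 'padre', 'hijo', 'deporte']))
    # expected: 'deporte', the word farthest from the family-term centroid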
modules/module_ann.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
+ import time
+ import operator
+ from tqdm import tqdm
+ from annoy import AnnoyIndex
+ from memory_profiler import profile
+
+ class TicToc:
+     def __init__(self):
+         self.i = None
+     def start(self):
+         self.i = time.time()
+     def stop(self):
+         f = time.time()
+         print(f - self.i, "sec.")
+
+ class Ann:
+     def __init__(self, words, vectors, coord):
+         self.words = words.to_list()
+         self.vectors = vectors.to_list()
+         self.coord = coord.to_list()
+         self.tree = None
+
+         self.tt = TicToc()
+
+     @profile
+     def init(self, n_trees=10, metric='angular', n_jobs=-1):
+         # metric options: "angular", "euclidean", "manhattan", "hamming" or "dot"
+         # n_jobs=-1 runs the build over all available CPUs
+
+         print("Init tree...")
+         self.tt.start()
+         self.tree = AnnoyIndex(len(self.vectors[0]), metric=metric)
+         for i, v in tqdm(enumerate(self.vectors), total=len(self.vectors)):
+             self.tree.add_item(i, v)
+         self.tt.stop()
+
+         print("Build tree...")
+         self.tt.start()
+         self.tree.build(n_trees=n_trees, n_jobs=n_jobs)
+         self.tt.stop()
+
+     def __getWordId(self, word):
+         word_id = None
+         try:
+             word_id = self.words.index(word)
+         except ValueError:
+             pass
+         return word_id
+
+     def get(self, word, n_neighbors=10):
+         word_id = self.__getWordId(word)
+         word_xy_list = None
+
+         if word_id is not None:
+             neighbors_id = self.tree.get_nns_by_item(word_id, n_neighbors)
+             # word_xy_list = list(map(lambda i: (self.words[i], self.coord[i]), neighbors_id))
+             # word_xy_list = list(map(lambda i: self.words[i], neighbors_id))
+             word_xy_list = operator.itemgetter(*neighbors_id)(self.words)
+         else:
+             print(f"The word '{word}' does not exist")
+
+         return word_xy_list
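
A short sketch of how `Ann` would be driven (not part of the commit). The constructor calls `.to_list()` on its arguments, so pandas Series are assumed; the toy column names and 2-dimensional vectors here are illustrative only.

```python
# Hypothetical usage sketch with toy 2-d vectors.
import pandas as pd
from modules.module_ann import Ann

df = pd.DataFrame({
    'word': ['hola', 'mundo', 'casa'],
    'vec': [[0.1, 0.2], [0.1, 0.3], [0.9, 0.8]],
    'pca': [[0.0, 0.1], [0.0, 0.2], [0.5, 0.5]],
})

ann = Ann(words=df['word'], vectors=df['vec'], coord=df['pca'])
ann.init(n_trees=10, metric='angular', n_jobs=-1)  # builds the Annoy index

print(ann.get('hola', n_neighbors=2))  # nearest neighbors, e.g. ('hola', 'mundo')
```

Note that `init()` is decorated with `@profile`, so running it also emits a memory_profiler report.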
modules/module_connection.py ADDED
@@ -0,0 +1,143 @@
+ import numpy as np
+ import pandas as pd
+ import gradio as gr
+ from abc import ABC, abstractmethod
+
+ from modules.module_WordExplorer import WordExplorer
+ from modules.module_BiasExplorer import WordBiasExplorer
+
+ class Connector(ABC):
+     def parse_word(self, word: str):
+         return word.lower().strip()
+
+     def parse_words(self, array_in_string: str):
+         words = array_in_string.strip()
+         if not words:
+             return []
+         words = [self.parse_word(word) for word in words.split(',') if word.strip() != '']
+         return words
+
+     def process_error(self, err: str):
+         if err is None:
+             return
+         return "<center><h3>" + err + "</h3></center>"
+
+
+ class WordExplorerConnector(Connector):
+
+     def __init__(self, **kwargs):
+         if 'embedding' in kwargs:
+             embedding = kwargs.get('embedding')
+         else:
+             raise KeyError('embedding')
+         self.word_explorer = WordExplorer(embedding)
+
+     def plot_proyection_2d(self,
+                            wordlist_0,
+                            wordlist_1,
+                            wordlist_2,
+                            wordlist_3,
+                            wordlist_4,
+                            color_wordlist_0,
+                            color_wordlist_1,
+                            color_wordlist_2,
+                            color_wordlist_3,
+                            color_wordlist_4,
+                            n_alpha,
+                            fontsize,
+                            n_neighbors
+                            ):
+         err = ""
+         neighbors_method = 'sklearn'
+         wordlist_0 = self.parse_words(wordlist_0)
+         wordlist_1 = self.parse_words(wordlist_1)
+         wordlist_2 = self.parse_words(wordlist_2)
+         wordlist_3 = self.parse_words(wordlist_3)
+         wordlist_4 = self.parse_words(wordlist_4)
+
+         if not (wordlist_0 or wordlist_1 or wordlist_2 or wordlist_3 or wordlist_4):
+             err = self.process_error("Enter at least one word to continue")
+             return None, err
+
+         err = self.word_explorer.check_oov([wordlist_0, wordlist_1, wordlist_2, wordlist_3, wordlist_4])
+         if err:
+             return None, self.process_error(err)
+
+         fig = self.word_explorer.plot_projections_2d(wordlist_0,
+                                                      wordlist_1,
+                                                      wordlist_2,
+                                                      wordlist_3,
+                                                      wordlist_4,
+                                                      color_wordlist_0=color_wordlist_0,
+                                                      color_wordlist_1=color_wordlist_1,
+                                                      color_wordlist_2=color_wordlist_2,
+                                                      color_wordlist_3=color_wordlist_3,
+                                                      color_wordlist_4=color_wordlist_4,
+                                                      n_alpha=n_alpha,
+                                                      fontsize=fontsize,
+                                                      n_neighbors=n_neighbors,
+                                                      nn_method=neighbors_method
+                                                      )
+         return fig, self.process_error(err)
+
+ class BiasWordExplorerConnector(Connector):
+
+     def __init__(self, **kwargs):
+         if 'embedding' in kwargs:
+             embedding = kwargs.get('embedding')
+         else:
+             raise KeyError('embedding')
+         self.bias_word_explorer = WordBiasExplorer(embedding)
+
+     def calculate_bias_2d(self,
+                           wordlist_1,
+                           wordlist_2,
+                           to_diagnose_list
+                           ):
+         err = ""
+         wordlist_1 = self.parse_words(wordlist_1)
+         wordlist_2 = self.parse_words(wordlist_2)
+         to_diagnose_list = self.parse_words(to_diagnose_list)
+
+         word_lists = [wordlist_1, wordlist_2, to_diagnose_list]
+         for word_list in word_lists:
+             if not word_list:
+                 err = "You must enter at least one word in each of the to-diagnose, bias 1 and bias 2 word lists"
+         if err:
+             return None, self.process_error(err)
+
+         err = self.bias_word_explorer.check_oov(word_lists)
+         if err:
+             return None, self.process_error(err)
+
+         fig = self.bias_word_explorer.plot_biased_words(to_diagnose_list, wordlist_2, wordlist_1)
+
+         return fig, self.process_error(err)
+
+     def calculate_bias_4d(self,
+                           wordlist_1,
+                           wordlist_2,
+                           wordlist_3,
+                           wordlist_4,
+                           to_diagnose_list
+                           ):
+         err = ""
+         wordlist_1 = self.parse_words(wordlist_1)
+         wordlist_2 = self.parse_words(wordlist_2)
+         wordlist_3 = self.parse_words(wordlist_3)
+         wordlist_4 = self.parse_words(wordlist_4)
+         to_diagnose_list = self.parse_words(to_diagnose_list)
+
+         wordlists = [wordlist_1, wordlist_2, wordlist_3, wordlist_4, to_diagnose_list]
+         for word_list in wordlists:
+             if not word_list:
+                 err = "To plot with 4 bias spaces, you must enter at least one word in every list!"
+         if err:
+             return None, self.process_error(err)
+
+         err = self.bias_word_explorer.check_oov(wordlists)
+         if err:
+             return None, self.process_error(err)
+
+         fig = self.bias_word_explorer.plot_biased_words(to_diagnose_list, wordlist_1, wordlist_2, wordlist_3, wordlist_4)
+         return fig, self.process_error(err)
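
For context, a sketch of how one of these connectors could be wired into a Gradio interface (not part of the commit; `embedding` is assumed to be loaded already). Word lists arrive as comma-separated strings, which `parse_words` splits and normalizes:

```python
# Hypothetical wiring sketch; the real app builds its interfaces elsewhere.
import gradio as gr
from modules.module_connection import BiasWordExplorerConnector

connector = BiasWordExplorerConnector(embedding=embedding)

demo = gr.Interface(
    fn=connector.calculate_bias_2d,  # returns (figure, error_html)
    inputs=[
        gr.Textbox(label="Bias word list 1"),   # e.g. "hombre, rey"
        gr.Textbox(label="Bias word list 2"),   # e.g. "mujer, reina"
        gr.Textbox(label="Words to diagnose"),  # e.g. "enfermera, piloto"
    ],
    outputs=[gr.Plot(), gr.HTML()],
)
demo.launch()
```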
modules/module_logsManager.py ADDED
@@ -0,0 +1,173 @@
+ import csv, os, pytz
+ from gradio import utils
+ from datetime import datetime
+ from dotenv import load_dotenv
+ from typing import Any, List, Optional
+ from gradio.components import IOComponent
+ from gradio.flagging import FlaggingCallback, _get_dataset_features_info
+
+
+ # --- Load environment vars ---
+ load_dotenv()
+
+ # --- Class declarations ---
+ class DateLogs:
+     def __init__(self, zone="America/Argentina/Cordoba"):
+         self.time_zone = pytz.timezone(zone)
+
+     def full(self):
+         now = datetime.now(self.time_zone)
+         return now.strftime("%H:%M:%S %d-%m-%Y")
+
+     def day(self):
+         now = datetime.now(self.time_zone)
+         return now.strftime("%d-%m-%Y")
+
+ class HuggingFaceDatasetSaver(FlaggingCallback):
+     """
+     A callback that saves each flagged sample (both the input and output data)
+     to a HuggingFace dataset.
+     Example:
+         import gradio as gr
+         hf_writer = gr.HuggingFaceDatasetSaver(HF_API_TOKEN, "image-classification-mistakes")
+         def image_classifier(inp):
+             return {'cat': 0.3, 'dog': 0.7}
+         demo = gr.Interface(fn=image_classifier, inputs="image", outputs="label",
+                             allow_flagging="manual", flagging_callback=hf_writer)
+     Guides: using_flagging
+     """
+
+     def __init__(
+         self,
+         hf_token: str = os.getenv('HF_TOKEN'),
+         dataset_name: str = os.getenv('DS_LOGS_NAME'),
+         organization: Optional[str] = os.getenv('ORG_NAME'),
+         private: bool = True,
+         available_logs: bool = False
+     ):
+         """
+         Parameters:
+             hf_token: The HuggingFace token used to create (and write the flagged samples to) the HuggingFace dataset.
+             dataset_name: The name of the dataset to save the data to, e.g. "image-classifier-1"
+             organization: The organization to save the dataset under. The hf_token must provide write access to this organization. If not provided, the dataset is saved under the name of the user corresponding to the hf_token.
+             private: Whether the dataset should be private (defaults to True).
+         """
+         self.hf_token = hf_token
+         self.dataset_name = dataset_name
+         self.organization_name = organization
+         self.dataset_private = private
+         self.datetime = DateLogs()
+         self.available_logs = available_logs
+
+         if not available_logs:
+             print("Push: logs DISABLED!...")
+
+
+     def setup(
+         self,
+         components: List[IOComponent],
+         flagging_dir: str
+     ):
+         """
+         Params:
+             flagging_dir (str): local directory where the dataset is cloned,
+             updated, and pushed from.
+         """
+         if self.available_logs:
+
+             try:
+                 import huggingface_hub
+             except (ImportError, ModuleNotFoundError):
+                 raise ImportError(
+                     "Package `huggingface_hub` is required "
+                     "for HuggingFaceDatasetSaver. Try 'pip install huggingface_hub'."
+                 )
+
+             path_to_dataset_repo = huggingface_hub.create_repo(
+                 repo_id=os.path.join(self.organization_name, self.dataset_name),
+                 token=self.hf_token,
+                 private=self.dataset_private,
+                 repo_type="dataset",
+                 exist_ok=True,
+             )
+
+             self.path_to_dataset_repo = path_to_dataset_repo
+             self.components = components
+             self.flagging_dir = flagging_dir
+             self.dataset_dir = self.dataset_name
+
+             self.repo = huggingface_hub.Repository(
+                 local_dir=self.dataset_dir,
+                 clone_from=path_to_dataset_repo,
+                 use_auth_token=self.hf_token,
+             )
+
+             self.repo.git_pull(lfs=True)
+
+             # Should the filename be user-specified?
+             # log_file_name = self.datetime.day()+"_"+self.flagging_dir+".csv"
+             self.log_file = os.path.join(self.dataset_dir, self.flagging_dir + ".csv")
+
+     def flag(
+         self,
+         flag_data: List[Any],
+         flag_option: Optional[str] = None,
+         flag_index: Optional[int] = None,
+         username: Optional[str] = None,
+     ) -> int:
+
+         if self.available_logs:
+             self.repo.git_pull(lfs=True)
+
+             is_new = not os.path.exists(self.log_file)
+
+             with open(self.log_file, "a", newline="", encoding="utf-8") as csvfile:
+                 writer = csv.writer(csvfile)
+
+                 # File previews for certain input and output types
+                 infos, file_preview_types, headers = _get_dataset_features_info(
+                     is_new, self.components
+                 )
+
+                 # Generate the headers and dataset_infos
+                 if is_new:
+                     headers = [
+                         component.label or f"component {idx}"
+                         for idx, component in enumerate(self.components)
+                     ] + [
+                         "flag",
+                         "username",
+                         "timestamp",
+                     ]
+                     writer.writerow(utils.sanitize_list_for_csv(headers))
+
+                 # Generate the row corresponding to the flagged sample
+                 csv_data = []
+                 for component, sample in zip(self.components, flag_data):
+                     save_dir = os.path.join(
+                         self.dataset_dir,
+                         utils.strip_invalid_filename_characters(component.label),
+                     )
+                     filepath = component.deserialize(sample, save_dir, None)
+                     csv_data.append(filepath)
+                     if isinstance(component, tuple(file_preview_types)):
+                         csv_data.append(
+                             "{}/resolve/main/{}".format(self.path_to_dataset_repo, filepath)
+                         )
+
+                 csv_data.append(flag_option if flag_option is not None else "")
+                 csv_data.append(username if username is not None else "")
+                 csv_data.append(self.datetime.full())
+                 writer.writerow(utils.sanitize_list_for_csv(csv_data))
+
+
+             with open(self.log_file, "r", encoding="utf-8") as csvfile:
+                 line_count = len([None for row in csv.reader(csvfile)]) - 1
+
+             self.repo.push_to_hub(commit_message="Flagged sample #{}".format(line_count))
+
+         else:
+             line_count = 0
+             print("Logs: Virtual push...")
+
+         return line_count
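
A sketch of how this saver would plug into Gradio's flagging mechanism (not part of the commit). By default the constructor reads `HF_TOKEN`, `DS_LOGS_NAME` and `ORG_NAME` from the environment (via `load_dotenv()`); `setup()` and `flag()` are invoked by Gradio itself:

```python
# Hypothetical usage sketch; requires the environment variables above.
import gradio as gr
from modules.module_logsManager import HuggingFaceDatasetSaver

hf_writer = HuggingFaceDatasetSaver(available_logs=True)

demo = gr.Interface(
    fn=lambda text: text[::-1],   # toy function, just for the sketch
    inputs="text",
    outputs="text",
    allow_flagging="manual",
    flagging_callback=hf_writer,  # each flag appends a CSV row and pushes
)
demo.launch()
```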
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ scikit-learn
+ gensim==3.7.3
+ transformers
+ matplotlib
+ numpy
+ seaborn
+ python-dotenv
+ memory_profiler
+ annoy
tool_info.py ADDED
@@ -0,0 +1,23 @@
+ TOOL_INFO = """
+ > ### A tool to overcome technical barriers for bias assessment in human language technologies
+
+ * [Read Full Paper](https://arxiv.org/abs/2207.06591)
+
+ > ### Licensing Information
+ * [MIT License](https://huggingface.co/spaces/vialibre/bias_we_std_tool/resolve/main/LICENSE)
+
+ > ### Citation Information
+ ```bibtex
+ @misc{https://doi.org/10.48550/arxiv.2207.06591,
+   doi = {10.48550/ARXIV.2207.06591},
+   url = {https://arxiv.org/abs/2207.06591},
+   author = {Alemany, Laura Alonso and Benotti, Luciana and González, Lucía and Maina, Hernán and Busaniche, Beatriz and Halvorsen, Alexia and Bordone, Matías and Sánchez, Jorge},
+   keywords = {Computation and Language (cs.CL), Artificial Intelligence (cs.AI),
+               FOS: Computer and information sciences},
+   title = {A tool to overcome technical barriers for bias assessment in human language technologies},
+   publisher = {arXiv},
+   year = {2022},
+   copyright = {Creative Commons Attribution Non Commercial Share Alike 4.0 International}
+ }
+ ```
+ """