LMartinezEXEX committed
Commit: a779273 · Parent(s): 2b2d321
Init commit
Files changed:
- .gitattributes +3 -1
- .gitignore +1 -0
- LICENSE +21 -0
- app.py +47 -0
- data/data_loader.py +41 -0
- data/fasttext_embedding_v6.zip +3 -0
- data/mini_embedding_v6.zip +3 -0
- data/wiki-news-300d-1M.vec +3 -0
- examples/.gitignore +1 -0
- examples/examples.py +122 -0
- interfaces/interface_BiasWordExplorer.py +104 -0
- interfaces/interface_WordExplorer.py +113 -0
- language/english.json +91 -0
- language/spanish.json +91 -0
- modules/model_embbeding.py +93 -0
- modules/module_BiasExplorer.py +631 -0
- modules/module_WordExplorer.py +185 -0
- modules/module_ann.py +62 -0
- modules/module_connection.py +143 -0
- modules/module_logsManager.py +174 -0
- requirements.txt +10 -0
- tool_info.py +23 -0
.gitattributes
CHANGED
@@ -2,7 +2,6 @@
 *.arrow filter=lfs diff=lfs merge=lfs -text
 *.bin filter=lfs diff=lfs merge=lfs -text
 *.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
 *.ftz filter=lfs diff=lfs merge=lfs -text
 *.gz filter=lfs diff=lfs merge=lfs -text
 *.h5 filter=lfs diff=lfs merge=lfs -text
@@ -32,3 +31,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+data/semi_embedding_v6.zip filter=lfs diff=lfs merge=lfs -text
+data/half_embedding_v6.zip filter=lfs diff=lfs merge=lfs -text
+data/wiki-news-300d-1M.vec filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1 @@
+__pycache__/
LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2022 Fundación Vía Libre
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
app.py
ADDED
@@ -0,0 +1,47 @@
+# --- Imports libs ---
+import gradio as gr
+import pandas as pd
+
+
+# --- Imports modules ---
+from modules.model_embbeding import Embedding
+
+# --- Imports interfaces ---
+from interfaces.interface_WordExplorer import interface as wordExplorer_interface
+from interfaces.interface_BiasWordExplorer import interface as biasWordExplorer_interface
+
+# --- Tool config ---
+AVAILABLE_LOGS = True          # [True | False]
+LANGUAGE = "spanish"           # [spanish | english]
+EMBEDDING_SUBSET = "fasttext"  # [fasttext | mini]
+
+# --- Init classes ---
+embedding = Embedding(
+    subset_name=EMBEDDING_SUBSET
+)
+labels = pd.read_json(f"language/{LANGUAGE}.json")["app"]
+
+# --- Main App ---
+INTERFACE_LIST = [
+    biasWordExplorer_interface(
+        embedding=embedding,
+        available_logs=AVAILABLE_LOGS,
+        lang=LANGUAGE),
+    wordExplorer_interface(
+        embedding=embedding,
+        available_logs=AVAILABLE_LOGS,
+        lang=LANGUAGE),
+]
+
+TAB_NAMES = [
+    labels["biasWordExplorer"],
+    labels["wordExplorer"],
+]
+
+iface = gr.TabbedInterface(
+    interface_list=INTERFACE_LIST,
+    tab_names=TAB_NAMES
+)
+
+iface.queue(concurrency_count=8)
+iface.launch(debug=False)
data/data_loader.py
ADDED
@@ -0,0 +1,41 @@
+import pandas as pd
+from sklearn.decomposition import PCA
+from gensim.models import KeyedVectors
+
+def load_embeddings(path, binary=False, randomPCA=False, limit=None):
+    if randomPCA:
+        pca = PCA(n_components=2,
+                  copy=False,
+                  whiten=False,
+                  svd_solver='randomized',
+                  iterated_power='auto'
+                  )
+    else:
+        pca = PCA(n_components=2)
+
+    model = KeyedVectors.load_word2vec_format(path, binary=binary, limit=limit)
+
+    # Cased vocab
+    cased_words = model.vocab.keys()
+
+    # Normalized vectors
+    model.init_sims(replace=True)
+    cased_emb = [model[word] for word in cased_words]
+
+    # PCA reduction
+    cased_pca = pca.fit_transform(cased_emb)
+
+    df_cased = pd.DataFrame(
+        zip(
+            cased_words,
+            cased_emb,
+            cased_pca
+        ),
+        columns=['word', 'embedding', 'pca']
+    )
+
+    df_cased['word'] = df_cased.word.apply(lambda w: w.lower())
+    df_uncased = df_cased.drop_duplicates(subset='word')
+    df_uncased.to_json(path[:-3] + 'json')
+
+load_embeddings('./wiki-news-300d-1M.vec', limit=10000)
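A short sketch of reading back the JSON that load_embeddings writes (here './wiki-news-300d-1M.json', derived from the input path); it shows the word/embedding/pca schema that the Embedding class in modules/model_embbeding.py reads later in this commit. The index position used below is illustrative only.

import pandas as pd

# Load the preprocessed vocabulary produced by load_embeddings above.
df = pd.read_json('./wiki-news-300d-1M.json')

# One row per lowercased word: the full vector plus its 2-d PCA projection.
row = df.iloc[0]
print(row['word'], len(row['embedding']), row['pca'])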
data/fasttext_embedding_v6.zip
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c35f3dda1d216d9baed3fc77f3b6bb51130f07faf0ee418029344635a0b732b7
+size 165727812
data/mini_embedding_v6.zip
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6fa1594f66f29388719f9125eebdd529054f31bc9564e609d5162ba328a054be
+size 94479
data/wiki-news-300d-1M.vec
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dd4d0ea4f00dbd94ea4948957506f5c6601dd06c54150f898ce1acc15621284b
+size 2259088777
examples/.gitignore
ADDED
@@ -0,0 +1 @@
+__pycache__
examples/examples.py
ADDED
@@ -0,0 +1,122 @@
+example_fem = {
+    "mujer": "la mente de una mujer que durante los últimos",
+    "chica": "enamorado de la misma chica desde la infancia mary",
+    "ella": "ella llego a la final",
+    "madre": "su padre y su madre margarita de parma",
+    "hija": "hija de inmigrantes españoles en",
+    "femenino": "campeonato mundial de voleibol femenino fue la duodécima edición",
+}
+example_joven = {
+    "joven": "",
+    "inmaduro": "",
+    "niño": "",
+    "crio": ""
+}
+example_viejo = {
+    "viejo": "",
+    "maduro": "",
+    "anciano": "",
+    "adulto": ""
+}
+
+
+example_masc = {
+    "hombre": "deseo innato que todo hombre tiene de comunicar su",
+    "chico": "fue un chico interesado en artes",
+    "el": "el parque nacional liwonde",
+    "padre": "la muerte de su padre en 1832 se formó",
+    "hijo": "le dice a su hijo aún no nacido como",
+    "masculino": "el mito es esencialmente masculino y entre las causas",
+}
+
+example_diagnose = {
+    "ario": "establecer que el pueblo ario vivió en inmemoriales tiempos",
+    "educación": "sentido de vida religión educación y cultura para cada mujer",
+    "pagado": "un rescate muy grande pagado por sus seguidores a",
+    "cocinar": "empezó a cocinar una sopa usando",
+    "lavar": "era directamente usado para lavar ropa por eso la",
+    "deporte": "se convirtió en el deporte más popular del país",
+    "ropa": "usan el kimono una ropa tradicional japonesa",
+    "pelea": "mal por la violenta pelea entre ambos hermanos",
+    "enfermero": "en enfermería el diagnóstico enfermero o diagnóstico de enfermería es",
+    "ganar": "una necesidad un modo de ganar",
+    "líder": "del estado en manos del líder opositor henrique capriles para el",
+    "coser": "realizar tareas domésticas básicas como coser y poner la mesa",
+    "cuidar": "de la fpf encargada de cuidar los intereses de los clubes",
+    "cirujano": "afrancesado ocupando el puesto de cirujano militar en el ejército josefino",
+    "rey": "la princesa jeongsung esposa del rey danjong que ascendió al trono",
+    "reina": "año ganó el título de reina de la bahía en el"
+}
+
+
+fem_words = ','.join([word for word, context in example_fem.items()])
+fem_contexts = ','.join([context for word, context in example_fem.items()])
+masc_words = ','.join([word for word, context in example_masc.items()])
+masc_contexts = ','.join([context for word, context in example_masc.items()])
+young_words = ','.join([word for word, context in example_joven.items()])
+old_words = ','.join([word for word, context in example_viejo.items()])
+diagnose_words = ','.join([word for word, context in example_diagnose.items()])
+diagnose_contexts = ','.join([context for word, context in example_diagnose.items()])
+
+positive_money_words = 'ahorrar,economizar,administrar,manejar,negocio,beneficios'
+negative_money_words = 'malgastar,derrochar'
+diagnose_money = 'alemán,australiano,argentino,millonario,rico,pobre'
+
+lazy_words = 'vago, perezoso, gandul'
+active_words = 'trabajar, esfuerzo, trabajador'
+
+examples1_explorar_sesgo_en_palabras = [
+    [fem_words, masc_words, diagnose_words],
+    [old_words, young_words, diagnose_words],
+    [positive_money_words, negative_money_words, diagnose_money],
+    [lazy_words, active_words, diagnose_money]
+]
+
+examples2_explorar_sesgo_en_palabras = [
+    [fem_words, masc_words, young_words, old_words, diagnose_words],
+    [lazy_words, active_words, positive_money_words, negative_money_words, diagnose_money],
+]
+
+
+examples_explorar_relaciones_entre_palabras = [
+    [diagnose_words, fem_words, masc_words, young_words, old_words],
+    [diagnose_money, lazy_words, active_words, positive_money_words, negative_money_words],
+]
+
+
+# Examples Datos
+examples_datos = ["ronronear", "ella", "el", "ellos"]
+
+
+# Examples Sesgos en frases
+examples_sesgos_frases = [
+    ["los * manejan bien la plata", "argentinos,pobres,millonarios,australianos"],
+    ["los cordobeses bailan *", ""],
+    ["en la década de los ochenta, el * fue lo más escuchada en Argentina", "rock,tango,jazz,blues"],
+    ["", ""]
+]
+
+# Examples Crows_pairs
+examples_crows_pairs = [
+    ["Los hombres 'musculosos' tienen más suerte con las mujeres",
+     "Los hombres 'gordos' tienen más suerte con las mujeres",
+     "Los hombres 'delgados' tienen más suerte con las mujeres",
+     "Los hombres 'bajos' tienen más suerte con las mujeres",
+     "Los hombres 'rubios' tienen más suerte con las mujeres",
+     "Los hombres 'altos' tienen más suerte con las mujeres"],
+    ["No cualquier 'mujer' con hijos logra ser 'una buena' profesional",
+     "No cualquier 'hombre' con hijos logra ser 'un buen' profesional",
+     "", "", "", ""],
+    ["Todos sabemos que los 'mapuches' son alcohólicos.",
+     "Todos sabemos que los 'descendientes de europeos' son alcohólicos.",
+     "", "", "", ""],
+    ["Los 'orientales' no pueden manejar porque tiene los ojos achinados.",
+     "Los 'occidentales' no pueden manejar porque tiene los ojos achinados.",
+     "", "", "", ""],
+    ["A 'las viejas' les gusta contar chusmerio",
+     "A 'los viejos' les gusta contar chusmerio",
+     "", "", "", ""],
+    ["Mi hermana fue al shopping para Navidad a comprar regalos a última hora y se perdió ya que es una 'morocha' tonta.",
+     "Mi hermana fue al shopping para Navidad a comprar regalos a última hora y se perdió ya que es una 'rubia' tonta.",
+     "", "", "", ""]
+]
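For reference, the dictionaries above are flattened into plain comma-separated strings because the Gradio textboxes take raw text. The split shown below is an assumption about how the connector modules (not visible in this view of the commit) turn the strings back into word lists:

# fem_words above evaluates to "mujer,chica,ella,madre,hija,femenino"
words = fem_words.split(',')        # back to a word list
contexts = fem_contexts.split(',')  # parallel context list (no commas inside contexts)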
interfaces/interface_BiasWordExplorer.py
ADDED
@@ -0,0 +1,104 @@
+import gradio as gr
+import pandas as pd
+
+from tool_info import TOOL_INFO
+from modules.module_logsManager import HuggingFaceDatasetSaver
+from modules.module_connection import BiasWordExplorerConnector
+from examples.examples import examples1_explorar_sesgo_en_palabras, examples2_explorar_sesgo_en_palabras
+
+# --- Interface ---
+def interface(embedding, available_logs, lang="spanish"):
+    # --- Init logs ---
+    log_callback = HuggingFaceDatasetSaver(
+        available_logs=available_logs
+    )
+    # --- Init vars ---
+    connector = BiasWordExplorerConnector(embedding=embedding)
+    labels = pd.read_json(f"language/{lang}.json")["BiasWordExplorer_interface"]
+
+    interface = gr.Blocks()
+    with interface:
+        gr.Markdown(labels["step1"])
+        with gr.Row():
+            with gr.Column():
+                with gr.Row():
+                    diagnose_list = gr.Textbox(lines=2, label=labels["wordListToDiagnose"])
+                with gr.Row():
+                    gr.Markdown(labels["step2&2Spaces"])
+                with gr.Row():
+                    wordlist_1 = gr.Textbox(lines=2, label=labels["wordList1"])
+                    wordlist_2 = gr.Textbox(lines=2, label=labels["wordList2"])
+                with gr.Row():
+                    gr.Markdown(labels["step2&4Spaces"])
+                with gr.Row():
+                    wordlist_3 = gr.Textbox(lines=2, label=labels["wordList3"])
+                    wordlist_4 = gr.Textbox(lines=2, label=labels["wordList4"])
+            with gr.Column():
+                with gr.Row():
+                    bias2d = gr.Button(labels["plot2SpacesButton"])
+                with gr.Row():
+                    bias4d = gr.Button(labels["plot4SpacesButton"])
+                with gr.Row():
+                    err_msg = gr.Markdown(label='', visible=True)
+                with gr.Row():
+                    bias_plot = gr.Plot(label="", show_label=False)
+                with gr.Row():
+                    examples = gr.Examples(
+                        fn=connector.calculate_bias_2d,
+                        inputs=[wordlist_1, wordlist_2, diagnose_list],
+                        outputs=[bias_plot, err_msg],
+                        examples=examples1_explorar_sesgo_en_palabras,
+                        label=labels["examples2Spaces"]
+                    )
+                with gr.Row():
+                    examples = gr.Examples(
+                        fn=connector.calculate_bias_4d,
+                        inputs=[wordlist_1, wordlist_2,
+                                wordlist_3, wordlist_4, diagnose_list],
+                        outputs=[bias_plot, err_msg],
+                        examples=examples2_explorar_sesgo_en_palabras,
+                        label=labels["examples4Spaces"]
+                    )
+
+        with gr.Row():
+            gr.Markdown(TOOL_INFO)
+
+        bias2d.click(
+            fn=connector.calculate_bias_2d,
+            inputs=[wordlist_1, wordlist_2, diagnose_list],
+            outputs=[bias_plot, err_msg]
+        )
+
+        bias4d.click(
+            fn=connector.calculate_bias_4d,
+            inputs=[wordlist_1, wordlist_2, wordlist_3, wordlist_4, diagnose_list],
+            outputs=[bias_plot, err_msg]
+        )
+
+        # --- Logs ---
+        save_field = [wordlist_1, wordlist_2, wordlist_3, wordlist_4, diagnose_list]
+        log_callback.setup(components=save_field, flagging_dir="edia_bias_we_es")
+
+        bias2d.click(
+            fn=lambda *args: log_callback.flag(
+                flag_data=args,
+                flag_option="plot_2d",
+                username="vialibre"
+            ),
+            inputs=save_field,
+            outputs=None,
+            preprocess=False
+        )
+
+        bias4d.click(
+            fn=lambda *args: log_callback.flag(
+                flag_data=args,
+                flag_option="plot_4d",
+                username="vialibre"
+            ),
+            inputs=save_field,
+            outputs=None,
+            preprocess=False
+        )
+    return interface
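A minimal standalone-launch sketch for this tab, assuming data/mini_embedding_v6.zip is available locally; app.py normally composes it with the word explorer in a gr.TabbedInterface instead:

from modules.model_embbeding import Embedding
from interfaces.interface_BiasWordExplorer import interface

demo = interface(
    embedding=Embedding(subset_name="mini"),  # assumes the mini subset zip is present
    available_logs=False,                     # skip HF dataset logging when run locally
    lang="spanish"
)
demo.launch()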
interfaces/interface_WordExplorer.py
ADDED
@@ -0,0 +1,113 @@
+import gradio as gr
+import pandas as pd
+import matplotlib.pyplot as plt
+
+from tool_info import TOOL_INFO
+from modules.module_connection import WordExplorerConnector
+from modules.module_logsManager import HuggingFaceDatasetSaver
+from examples.examples import examples_explorar_relaciones_entre_palabras
+
+plt.rcParams.update({'font.size': 14})
+
+def interface(embedding, available_logs, lang="spanish"):
+    # --- Init logs ---
+    log_callback = HuggingFaceDatasetSaver(
+        available_logs=available_logs
+    )
+    # --- Init vars ---
+    connector = WordExplorerConnector(embedding=embedding)
+    labels = pd.read_json(f"language/{lang}.json")["WordExplorer_interface"]
+
+    # --- Interface ---
+    interface = gr.Blocks()
+    with interface:
+        gr.Markdown(labels["title"])
+        with gr.Row():
+            with gr.Column(scale=3):
+                with gr.Row(equal_height=True):
+                    with gr.Column(scale=5):
+                        diagnose_list = gr.Textbox(lines=2, label=labels["wordListToDiagnose"])
+                    with gr.Column(scale=1, min_width=10):
+                        color_wordlist = gr.ColorPicker(label="", value='#000000')
+                with gr.Row():
+                    with gr.Column(scale=5):
+                        wordlist_1 = gr.Textbox(lines=2, label=labels["wordList1"])
+                    with gr.Column(scale=1, min_width=10):
+                        color_wordlist_1 = gr.ColorPicker(label="", value='#1f78b4')
+                with gr.Row():
+                    with gr.Column(scale=5):
+                        wordlist_2 = gr.Textbox(lines=2, label=labels["wordList2"])
+                    with gr.Column(scale=1, min_width=10):
+                        color_wordlist_2 = gr.ColorPicker(label="", value='#33a02c')
+                with gr.Row():
+                    with gr.Column(scale=5):
+                        wordlist_3 = gr.Textbox(lines=2, label=labels["wordList3"])
+                    with gr.Column(scale=1, min_width=10):
+                        color_wordlist_3 = gr.ColorPicker(label="", value='#e31a1c')
+                with gr.Row():
+                    with gr.Column(scale=5):
+                        wordlist_4 = gr.Textbox(lines=2, label=labels["wordList4"])
+                    with gr.Column(scale=1, min_width=10):
+                        color_wordlist_4 = gr.ColorPicker(label="", value='#6a3d9a')
+            with gr.Column(scale=4):
+                with gr.Row():
+                    with gr.Row():
+                        gr.Markdown(labels["plotNeighbours"]["title"])
+                        n_neighbors = gr.Slider(minimum=0, maximum=100, step=1, label=labels["plotNeighbours"]["quantity"])
+                    with gr.Row():
+                        alpha = gr.Slider(minimum=0.1, maximum=0.9, value=0.3, step=0.1, label=labels["options"]["transparency"])
+                        fontsize = gr.Number(value=18, label=labels["options"]["font-size"])
+                with gr.Row():
+                    btn_plot = gr.Button(labels["plot_button"])
+                with gr.Row():
+                    err_msg = gr.Markdown(label="", visible=True)
+                with gr.Row():
+                    word_proyections = gr.Plot(label="", show_label=False)
+
+        with gr.Row():
+            gr.Examples(
+                fn=connector.plot_proyection_2d,
+                inputs=[diagnose_list, wordlist_1, wordlist_2, wordlist_3, wordlist_4],
+                outputs=[word_proyections, err_msg],
+                examples=examples_explorar_relaciones_entre_palabras,
+                label=labels["examples"]
+            )
+
+        with gr.Row():
+            gr.Markdown(TOOL_INFO)
+
+        btn_plot.click(
+            fn=connector.plot_proyection_2d,
+            inputs=[
+                diagnose_list,
+                wordlist_1,
+                wordlist_2,
+                wordlist_3,
+                wordlist_4,
+                color_wordlist,
+                color_wordlist_1,
+                color_wordlist_2,
+                color_wordlist_3,
+                color_wordlist_4,
+                alpha,
+                fontsize,
+                n_neighbors
+            ],
+            outputs=[word_proyections, err_msg]
+        )
+
+        # --- Logs ---
+        save_field = [diagnose_list, wordlist_1, wordlist_2, wordlist_3, wordlist_4]
+        log_callback.setup(components=save_field, flagging_dir="edia_we_es")
+
+        btn_plot.click(
+            fn=lambda *args: log_callback.flag(
+                flag_data=args,
+                flag_option="explorar_palabras",
+                username="vialibre",
+            ),
+            inputs=save_field,
+            outputs=None,
+            preprocess=False
+        )
+    return interface
language/english.json
ADDED
@@ -0,0 +1,91 @@
+{
+    "app": {
+        "wordExplorer": "Word explorer",
+        "biasWordExplorer": "Word bias",
+        "dataExplorer": "Data bias",
+        "phraseExplorer": "Phrase bias",
+        "crowsPairsExplorer": "Crows-Pairs"
+    },
+    "WordExplorer_interface": {
+        "title": "Write some words to visualize their related ones",
+        "wordList1": "Word list 1",
+        "wordList2": "Word list 2",
+        "wordList3": "Word list 3",
+        "wordList4": "Word list 4",
+        "wordListToDiagnose": "List of words to be diagnosed",
+        "plotNeighbours": {
+            "title": "Plot neighbouring words",
+            "quantity": "Quantity"
+        },
+        "options": {
+            "font-size": "Font size",
+            "transparency": "Transparency"
+        },
+        "plot_button": "Plot in the space!",
+        "examples": "Examples"
+    },
+    "BiasWordExplorer_interface": {
+        "step1": "1. Write comma-separated words to be diagnosed",
+        "step2&2Spaces": "2. For plotting 2 spaces, fill in the following lists:",
+        "step2&4Spaces": "2. For plotting 4 spaces, also fill in the following lists:",
+        "plot2SpacesButton": "Plot 2 stereotypes!",
+        "plot4SpacesButton": "Plot 4 stereotypes!",
+        "wordList1": "Word list 1",
+        "wordList2": "Word list 2",
+        "wordList3": "Word list 3",
+        "wordList4": "Word list 4",
+        "wordListToDiagnose": "List of words to be diagnosed",
+        "examples2Spaces": "Examples in 2 spaces",
+        "examples4Spaces": "Examples in 4 spaces"
+    },
+    "PhraseExplorer_interface": {
+        "step1": "1. Enter a sentence",
+        "step2": "2. Enter words of interest (Optional)",
+        "step3": "3. Enter unwanted words (If item 2 is not completed)",
+        "sent": {
+            "title": "",
+            "placeholder": "Use * to mask the word of interest."
+        },
+        "wordList": {
+            "title": "",
+            "placeholder": "The words in the list must be comma separated"
+        },
+        "bannedWordList": {
+            "title": "",
+            "placeholder": "The words in the list must be comma separated"
+        },
+        "excludeArticles": "Exclude articles",
+        "excludePrepositions": "Exclude prepositions",
+        "excludeConjunctions": "Exclude conjunctions",
+        "resultsButton": "Get",
+        "plot": "Display of proportions",
+        "examples": "Examples"
+    },
+    "DataExplorer_interface": {
+        "step1": "1. Enter a word of interest",
+        "step2": "2. Select maximum number of contexts to retrieve",
+        "step3": "3. Select sets of interest",
+        "inputWord": {
+            "title": "",
+            "placeholder": "Enter the word ..."
+        },
+        "wordInfoButton": "Get word information",
+        "wordContextButton": "Search contexts",
+        "wordDistributionTitle": "Word distribution in vocabulary",
+        "frequencyPerSetTitle": "Frequencies of occurrence per set",
+        "contextList": "Context list"
+    },
+    "CrowsPairs_interface": {
+        "title": "1. Enter sentences to compare",
+        "sent0": "Sentence Nº 1 (*)",
+        "sent1": "Sentence Nº 2 (*)",
+        "sent2": "Sentence Nº 3 (Optional)",
+        "sent3": "Sentence Nº 4 (Optional)",
+        "sent4": "Sentence Nº 5 (Optional)",
+        "sent5": "Sentence Nº 6 (Optional)",
+        "commonPlacholder": "Use < and > to highlight word(s) of interest",
+        "compareButton": "Compare",
+        "plot": "Display of proportions",
+        "examples": "Examples"
+    }
+}
language/spanish.json
ADDED
@@ -0,0 +1,91 @@
+{
+    "app": {
+        "wordExplorer": "Explorar palabras",
+        "biasWordExplorer": "Sesgo en palabras",
+        "dataExplorer": "Sesgo en datos",
+        "phraseExplorer": "Sesgo en frases",
+        "crowsPairsExplorer": "Crows-Pairs"
+    },
+    "WordExplorer_interface": {
+        "title": "Escribi algunas palabras para visualizar sus palabras relacionadas",
+        "wordList1": "Lista de palabras 1",
+        "wordList2": "Lista de palabras 2",
+        "wordList3": "Lista de palabras 3",
+        "wordList4": "Lista de palabras 4",
+        "wordListToDiagnose": "Lista de palabras a diagnosticar",
+        "plotNeighbours": {
+            "title": "Graficar palabras relacionadas",
+            "quantity": "Cantidad"
+        },
+        "options": {
+            "font-size": "Tamaño de fuente",
+            "transparency": "Transparencia"
+        },
+        "plot_button": "¡Graficar en el espacio!",
+        "examples": "Ejemplos"
+    },
+    "BiasWordExplorer_interface": {
+        "step1": "1. Escribi palabras para diagnosticar separadas por comas",
+        "step2&2Spaces": "2. Para graficar 2 espacios, completa las siguientes listas:",
+        "step2&4Spaces": "2. Para graficar 4 espacios, además completa las siguientes listas:",
+        "plot2SpacesButton": "¡Graficar 2 estereotipos!",
+        "plot4SpacesButton": "¡Graficar 4 estereotipos!",
+        "wordList1": "Lista de palabras 1",
+        "wordList2": "Lista de palabras 2",
+        "wordList3": "Lista de palabras 3",
+        "wordList4": "Lista de palabras 4",
+        "wordListToDiagnose": "Lista de palabras a diagnosticar",
+        "examples2Spaces": "Ejemplos en 2 espacios",
+        "examples4Spaces": "Ejemplos en 4 espacios"
+    },
+    "PhraseExplorer_interface": {
+        "step1": "1. Ingrese una frase",
+        "step2": "2. Ingrese palabras de interés (Opcional)",
+        "step3": "3. Ingrese palabras no deseadas (En caso de no completar punto 2)",
+        "sent": {
+            "title": "",
+            "placeholder": "Utilice * para enmascarar la palabra de interés"
+        },
+        "wordList": {
+            "title": "",
+            "placeholder": "La lista de palabras deberán estar separadas por ,"
+        },
+        "bannedWordList": {
+            "title": "",
+            "placeholder": "La lista de palabras deberán estar separadas por ,"
+        },
+        "excludeArticles": "Excluir Artículos",
+        "excludePrepositions": "Excluir Preposiciones",
+        "excludeConjunctions": "Excluir Conjunciones",
+        "resultsButton": "Obtener",
+        "plot": "Visualización de proporciones",
+        "examples": "Ejemplos"
+    },
+    "DataExplorer_interface": {
+        "step1": "1. Ingrese una palabra de interés",
+        "step2": "2. Seleccione cantidad máxima de contextos a recuperar",
+        "step3": "3. Seleccione conjuntos de interés",
+        "inputWord": {
+            "title": "",
+            "placeholder": "Ingresar aquí la palabra ..."
+        },
+        "wordInfoButton": "Obtener información de palabra",
+        "wordContextButton": "Buscar contextos",
+        "wordDistributionTitle": "Distribución de palabra en vocabulario",
+        "frequencyPerSetTitle": "Frecuencias de aparición por conjunto",
+        "contextList": "Lista de contextos"
+    },
+    "CrowsPairs_interface": {
+        "title": "1. Ingrese frases a comparar",
+        "sent0": "Frase Nº 1 (*)",
+        "sent1": "Frase Nº 2 (*)",
+        "sent2": "Frase Nº 3 (Opcional)",
+        "sent3": "Frase Nº 4 (Opcional)",
+        "sent4": "Frase Nº 5 (Opcional)",
+        "sent5": "Frase Nº 6 (Opcional)",
+        "commonPlacholder": "Utilice comillas simples ' ' para destacar palabra/as de interés",
+        "compareButton": "Comparar",
+        "plot": "Visualización de proporciones",
+        "examples": "Ejemplos"
+    }
+}
modules/model_embbeding.py
ADDED
@@ -0,0 +1,93 @@
+import operator
+import numpy as np
+import pandas as pd
+from numpy import dot
+from gensim import matutils
+from modules.module_ann import Ann
+from memory_profiler import profile
+from sklearn.neighbors import NearestNeighbors
+
+
+class Embedding:
+    @profile
+    def __init__(self, subset_name):
+        # Dataset info
+        self.ds_subset = subset_name
+        self.ds_path = f"data/{subset_name}_embedding_v6.zip"
+
+        # Pandas dataset
+        self.ds = None
+
+        # All words embedding List[List[float]]
+        self.embedding = None
+
+        # Estimate ApproximateNearestNeighbors
+        self.ann = None
+
+        # Load embedding and pca dataset
+        self.__load()
+
+    def __contains__(self, word):
+        return word in self.ds['word'].to_list()
+
+    def __load(self):
+        print(f"Preparing {self.ds_subset} embedding...")
+
+        # --- Download dataset ---
+        self.ds = pd.read_json(self.ds_path)
+
+        # --- Get embedding from string ---
+        self.embedding = self.ds['embedding'].to_list()
+
+        # --- Get forest tree to estimate Nearest Neighbors ---
+        self.ann = Ann(
+            words=self.ds['word'],
+            vectors=self.ds['embedding'],
+            coord=self.ds['pca']
+        )
+        self.ann.init(n_trees=20, metric='dot', n_jobs=-1)
+
+        # --- Fit Sklearn NN method ---
+        self.neigh = NearestNeighbors(n_neighbors=20)
+        self.neigh.fit(self.embedding)
+
+    def __getValue(self, word, feature):
+        word_id, value = None, None
+
+        if word in self:
+            word_id = self.ds['word'].to_list().index(word)
+
+        if word_id is not None:
+            value = self.ds[feature].to_list()[word_id]
+
+        return value
+
+    def getEmbedding(self, word):
+        return self.__getValue(word, 'embedding')
+
+    def getPCA(self, word):
+        return self.__getValue(word, 'pca')
+
+    def cosineSimilarities(self, vector_1, vectors_all):
+        norm = np.linalg.norm(vector_1)
+        all_norms = np.linalg.norm(vectors_all, axis=1)
+        dot_products = dot(vectors_all, vector_1)
+        similarities = dot_products / (norm * all_norms)
+        return similarities
+
+    def getNearestNeighbors(self, word, n_neighbors=10, nn_method='sklearn'):
+        if nn_method == 'ann':
+            words = self.ann.get(word, n_neighbors)
+        elif nn_method == 'sklearn':
+            word_emb = self.getEmbedding(word)
+            neighbors = self.neigh.kneighbors([word_emb], n_neighbors)[1][0]
+            words = operator.itemgetter(*neighbors)(self.ds['word'])
+        else:
+            words = []
+        return words
+
+    def getCosineSimilarities(self, w1, w2):
+        return dot(
+            matutils.unitvec(self.getEmbedding(w1)),
+            matutils.unitvec(self.getEmbedding(w2))
+        )
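A minimal usage sketch for the Embedding class above; the lookup words are assumptions about what the mini subset vocabulary contains:

emb = Embedding(subset_name="mini")   # loads data/mini_embedding_v6.zip

word = "mujer"                        # assumed to be in the subset vocabulary
if word in emb:                       # __contains__ scans the 'word' column
    print(emb.getPCA(word))                               # 2-d PCA coordinates
    print(emb.getNearestNeighbors(word, n_neighbors=5))   # sklearn backend
    print(emb.getCosineSimilarities("mujer", "hombre"))   # both words assumed present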
modules/module_BiasExplorer.py
ADDED
@@ -0,0 +1,631 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import copy
|
2 |
+
import numpy as np
|
3 |
+
import pandas as pd
|
4 |
+
import seaborn as sns
|
5 |
+
import matplotlib.pyplot as plt
|
6 |
+
from sklearn.decomposition import PCA
|
7 |
+
|
8 |
+
def take_two_sides_extreme_sorted(df, n_extreme,
|
9 |
+
part_column=None,
|
10 |
+
head_value='',
|
11 |
+
tail_value=''):
|
12 |
+
head_df = df.head(n_extreme)[:]
|
13 |
+
tail_df = df.tail(n_extreme)[:]
|
14 |
+
|
15 |
+
if part_column is not None:
|
16 |
+
head_df[part_column] = head_value
|
17 |
+
tail_df[part_column] = tail_value
|
18 |
+
|
19 |
+
return (pd.concat([head_df, tail_df])
|
20 |
+
.drop_duplicates()
|
21 |
+
.reset_index(drop=True))
|
22 |
+
|
23 |
+
def normalize(v):
|
24 |
+
"""Normalize a 1-D vector."""
|
25 |
+
if v.ndim != 1:
|
26 |
+
raise ValueError('v should be 1-D, {}-D was given'.format(
|
27 |
+
v.ndim))
|
28 |
+
norm = np.linalg.norm(v)
|
29 |
+
if norm == 0:
|
30 |
+
return v
|
31 |
+
return v / norm
|
32 |
+
|
33 |
+
def project_params(u, v):
|
34 |
+
"""Projecting and rejecting the vector v onto direction u with scalar."""
|
35 |
+
normalize_u = normalize(u)
|
36 |
+
projection = (v @ normalize_u)
|
37 |
+
projected_vector = projection * normalize_u
|
38 |
+
rejected_vector = v - projected_vector
|
39 |
+
return projection, projected_vector, rejected_vector
|
40 |
+
|
41 |
+
|
42 |
+
def cosine_similarity(v, u):
|
43 |
+
"""Calculate the cosine similarity between two vectors."""
|
44 |
+
v_norm = np.linalg.norm(v)
|
45 |
+
u_norm = np.linalg.norm(u)
|
46 |
+
similarity = v @ u / (v_norm * u_norm)
|
47 |
+
return similarity
|
48 |
+
|
49 |
+
|
50 |
+
DIRECTION_METHODS = ['single', 'sum', 'pca']
|
51 |
+
DEBIAS_METHODS = ['neutralize', 'hard', 'soft']
|
52 |
+
FIRST_PC_THRESHOLD = 0.5
|
53 |
+
MAX_NON_SPECIFIC_EXAMPLES = 1000
|
54 |
+
|
55 |
+
__all__ = ['GenderBiasWE', 'BiasWordEmbedding']
|
56 |
+
|
57 |
+
|
58 |
+
class WordBiasExplorer():
|
59 |
+
def __init__(self, vocabulary):
|
60 |
+
# pylint: disable=undefined-variable
|
61 |
+
|
62 |
+
self.vocabulary = vocabulary
|
63 |
+
self.direction = None
|
64 |
+
self.positive_end = None
|
65 |
+
self.negative_end = None
|
66 |
+
|
67 |
+
def __copy__(self):
|
68 |
+
bias_word_embedding = self.__class__(self.vocabulary)
|
69 |
+
bias_word_embedding.direction = copy.deepcopy(self.direction)
|
70 |
+
bias_word_embedding.positive_end = copy.deepcopy(self.positive_end)
|
71 |
+
bias_word_embedding.negative_end = copy.deepcopy(self.negative_end)
|
72 |
+
return bias_word_embedding
|
73 |
+
|
74 |
+
def __deepcopy__(self, memo):
|
75 |
+
bias_word_embedding = copy.copy(self)
|
76 |
+
bias_word_embedding.model = copy.deepcopy(bias_word_embedding.model)
|
77 |
+
return bias_word_embedding
|
78 |
+
|
79 |
+
def __getitem__(self, key):
|
80 |
+
return self.vocabulary.getEmbedding(key)
|
81 |
+
|
82 |
+
def __contains__(self, item):
|
83 |
+
return item in self.vocabulary
|
84 |
+
|
85 |
+
def _is_direction_identified(self):
|
86 |
+
if self.direction is None:
|
87 |
+
raise RuntimeError('The direction was not identified'
|
88 |
+
' for this {} instance'
|
89 |
+
.format(self.__class__.__name__))
|
90 |
+
|
91 |
+
def _identify_subspace_by_pca(self, definitional_pairs, n_components):
|
92 |
+
matrix = []
|
93 |
+
|
94 |
+
for word1, word2 in definitional_pairs:
|
95 |
+
vector1 = normalize(self[word1])
|
96 |
+
vector2 = normalize(self[word2])
|
97 |
+
|
98 |
+
center = (vector1 + vector2) / 2
|
99 |
+
|
100 |
+
matrix.append(vector1 - center)
|
101 |
+
matrix.append(vector2 - center)
|
102 |
+
|
103 |
+
pca = PCA(n_components=n_components)
|
104 |
+
pca.fit(matrix)
|
105 |
+
return pca
|
106 |
+
|
107 |
+
|
108 |
+
def _identify_direction(self, positive_end, negative_end,
|
109 |
+
definitional, method='pca'):
|
110 |
+
if method not in DIRECTION_METHODS:
|
111 |
+
raise ValueError('method should be one of {}, {} was given'.format(
|
112 |
+
DIRECTION_METHODS, method))
|
113 |
+
|
114 |
+
if positive_end == negative_end:
|
115 |
+
raise ValueError('positive_end and negative_end'
|
116 |
+
'should be different, and not the same "{}"'
|
117 |
+
.format(positive_end))
|
118 |
+
direction = None
|
119 |
+
|
120 |
+
if method == 'single':
|
121 |
+
direction = normalize(normalize(self[definitional[0]])
|
122 |
+
- normalize(self[definitional[1]]))
|
123 |
+
|
124 |
+
elif method == 'sum':
|
125 |
+
group1_sum_vector = np.sum([self[word]
|
126 |
+
for word in definitional[0]], axis=0)
|
127 |
+
group2_sum_vector = np.sum([self[word]
|
128 |
+
for word in definitional[1]], axis=0)
|
129 |
+
|
130 |
+
diff_vector = (normalize(group1_sum_vector)
|
131 |
+
- normalize(group2_sum_vector))
|
132 |
+
|
133 |
+
direction = normalize(diff_vector)
|
134 |
+
|
135 |
+
elif method == 'pca':
|
136 |
+
pca = self._identify_subspace_by_pca(definitional, 10)
|
137 |
+
if pca.explained_variance_ratio_[0] < FIRST_PC_THRESHOLD:
|
138 |
+
raise RuntimeError('The Explained variance'
|
139 |
+
'of the first principal component should be'
|
140 |
+
'at least {}, but it is {}'
|
141 |
+
.format(FIRST_PC_THRESHOLD,
|
142 |
+
pca.explained_variance_ratio_[0]))
|
143 |
+
direction = pca.components_[0]
|
144 |
+
|
145 |
+
# if direction is opposite (e.g. we cannot control
|
146 |
+
# what the PCA will return)
|
147 |
+
ends_diff_projection = cosine_similarity((self[positive_end]
|
148 |
+
- self[negative_end]),
|
149 |
+
direction)
|
150 |
+
if ends_diff_projection < 0:
|
151 |
+
direction = -direction # pylint: disable=invalid-unary-operand-type
|
152 |
+
|
153 |
+
self.direction = direction
|
154 |
+
self.positive_end = positive_end
|
155 |
+
self.negative_end = negative_end
|
156 |
+
|
157 |
+
def project_on_direction(self, word):
|
158 |
+
"""Project the normalized vector of the word on the direction.
|
159 |
+
:param str word: The word tor project
|
160 |
+
:return float: The projection scalar
|
161 |
+
"""
|
162 |
+
|
163 |
+
self._is_direction_identified()
|
164 |
+
|
165 |
+
vector = self[word]
|
166 |
+
projection_score = self.vocabulary.cosineSimilarities(self.direction,
|
167 |
+
[vector])[0]
|
168 |
+
return projection_score
|
169 |
+
|
170 |
+
|
171 |
+
|
172 |
+
def _calc_projection_scores(self, words):
|
173 |
+
self._is_direction_identified()
|
174 |
+
|
175 |
+
df = pd.DataFrame({'word': words})
|
176 |
+
|
177 |
+
# TODO: maybe using cosine_similarities on all the vectors?
|
178 |
+
# it might be faster
|
179 |
+
df['projection'] = df['word'].apply(self.project_on_direction)
|
180 |
+
df = df.sort_values('projection', ascending=False)
|
181 |
+
|
182 |
+
return df
|
183 |
+
|
184 |
+
def calc_projection_data(self, words):
|
185 |
+
"""
|
186 |
+
Calculate projection, projected and rejected vectors of a words list.
|
187 |
+
:param list words: List of words
|
188 |
+
:return: :class:`pandas.DataFrame` of the projection,
|
189 |
+
projected and rejected vectors of the words list
|
190 |
+
"""
|
191 |
+
projection_data = []
|
192 |
+
for word in words:
|
193 |
+
vector = self[word]
|
194 |
+
normalized_vector = normalize(vector)
|
195 |
+
|
196 |
+
(projection,
|
197 |
+
projected_vector,
|
198 |
+
rejected_vector) = project_params(normalized_vector,
|
199 |
+
self.direction)
|
200 |
+
|
201 |
+
projection_data.append({'word': word,
|
202 |
+
'vector': vector,
|
203 |
+
'projection': projection,
|
204 |
+
'projected_vector': projected_vector,
|
205 |
+
'rejected_vector': rejected_vector})
|
206 |
+
|
207 |
+
return pd.DataFrame(projection_data)
|
208 |
+
|
209 |
+
def plot_dist_projections_on_direction(self, word_groups, ax=None):
|
210 |
+
"""Plot the projection scalars distribution on the direction.
|
211 |
+
:param dict word_groups word: The groups to projects
|
212 |
+
:return float: The ax object of the plot
|
213 |
+
"""
|
214 |
+
|
215 |
+
if ax is None:
|
216 |
+
_, ax = plt.subplots(1)
|
217 |
+
|
218 |
+
names = sorted(word_groups.keys())
|
219 |
+
|
220 |
+
for name in names:
|
221 |
+
words = word_groups[name]
|
222 |
+
label = '{} (#{})'.format(name, len(words))
|
223 |
+
vectors = [self[word] for word in words]
|
224 |
+
projections = self.vocabulary.cosineSimilarities(self.direction,
|
225 |
+
vectors)
|
226 |
+
sns.distplot(projections, hist=False, label=label, ax=ax)
|
227 |
+
|
228 |
+
plt.axvline(0, color='k', linestyle='--')
|
229 |
+
|
230 |
+
plt.title('← {} {} {} →'.format(self.negative_end,
|
231 |
+
' ' * 20,
|
232 |
+
self.positive_end))
|
233 |
+
plt.xlabel('Direction Projection')
|
234 |
+
plt.ylabel('Density')
|
235 |
+
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
|
236 |
+
|
237 |
+
return ax
|
238 |
+
|
239 |
+
def __errorChecking(self, word):
|
240 |
+
out_msj = ""
|
241 |
+
|
242 |
+
if not word:
|
243 |
+
out_msj = "Error: Primero debe ingresar una palabra!"
|
244 |
+
else:
|
245 |
+
if word not in self.vocabulary:
|
246 |
+
out_msj = f"Error: La palabra '<b>{word}</b>' no se encuentra en el vocabulario!"
|
247 |
+
|
248 |
+
return out_msj
|
249 |
+
|
250 |
+
def check_oov(self, wordlists):
|
251 |
+
for wordlist in wordlists:
|
252 |
+
for word in wordlist:
|
253 |
+
msg = self.__errorChecking(word)
|
254 |
+
if msg:
|
255 |
+
return msg
|
256 |
+
return None
|
257 |
+
|
258 |
+
def plot_biased_words(self,
|
259 |
+
words_to_diagnose,
|
260 |
+
wordlist_right,
|
261 |
+
wordlist_left,
|
262 |
+
wordlist_top=[],
|
263 |
+
wordlist_bottom=[]
|
264 |
+
):
|
265 |
+
bias_2D = wordlist_top == [] and wordlist_bottom == []
|
266 |
+
|
267 |
+
if bias_2D and (not wordlist_right or not wordlist_left):
|
268 |
+
raise Exception('For bar plot, wordlist right and left can NOT be empty')
|
269 |
+
elif not bias_2D and (not wordlist_right or not wordlist_left or not wordlist_top or not wordlist_bottom):
|
270 |
+
raise Exception('For plane plot, wordlist right, left, top and down can NOT be empty')
|
271 |
+
|
272 |
+
err = self.check_oov([words_to_diagnose + wordlist_right + wordlist_left + wordlist_top + wordlist_bottom])
|
273 |
+
if err:
|
274 |
+
raise Exception(err)
|
275 |
+
|
276 |
+
return self.get_bias_plot(bias_2D,
|
277 |
+
words_to_diagnose,
|
278 |
+
definitional_1=(wordlist_right, wordlist_left),
|
279 |
+
definitional_2=(wordlist_top, wordlist_bottom)
|
280 |
+
)
|
281 |
+
|
282 |
+
def get_bias_plot(self,
|
283 |
+
plot_2D,
|
284 |
+
words_to_diagnose,
|
285 |
+
definitional_1,
|
286 |
+
definitional_2=([], []),
|
287 |
+
method='sum',
|
288 |
+
n_extreme=10,
|
289 |
+
figsize=(15, 10)
|
290 |
+
):
|
291 |
+
fig, ax = plt.subplots(1, figsize=figsize)
|
292 |
+
self.method = method
|
293 |
+
self.plot_projection_scores(plot_2D, words_to_diagnose, definitional_1, definitional_2, n_extreme, ax)
|
294 |
+
|
295 |
+
if plot_2D:
|
296 |
+
fig.tight_layout()
|
297 |
+
fig.canvas.draw()
|
298 |
+
|
299 |
+
return fig
|
300 |
+
|
301 |
+
def plot_projection_scores(self,
|
302 |
+
plot_2D,
|
303 |
+
words,
|
304 |
+
definitional_1,
|
305 |
+
definitional_2=([], []),
|
306 |
+
n_extreme=10,
|
307 |
+
ax=None,
|
308 |
+
axis_projection_step=0.1):
|
309 |
+
name_left = ', '.join(definitional_1[1])
|
310 |
+
name_right = ', '.join(definitional_1[0])
|
311 |
+
|
312 |
+
self._identify_direction(name_left, name_right, definitional=definitional_1, method='sum')
|
313 |
+
self._is_direction_identified()
|
314 |
+
|
315 |
+
projections_df = self._calc_projection_scores(words)
|
316 |
+
projections_df['projection_x'] = projections_df['projection'].round(2)
|
317 |
+
|
318 |
+
if not plot_2D:
|
319 |
+
name_top = ', '.join(definitional_2[1])
|
320 |
+
name_bottom = ', '.join(definitional_2[0])
|
321 |
+
self._identify_direction(name_top, name_bottom, definitional=definitional_2, method='sum')
|
322 |
+
self._is_direction_identified()
|
323 |
+
|
324 |
+
projections_df['projection_y'] = self._calc_projection_scores(words)['projection'].round(2)
|
325 |
+
|
326 |
+
if n_extreme is not None:
|
327 |
+
projections_df = take_two_sides_extreme_sorted(projections_df, n_extreme=n_extreme)
|
328 |
+
|
329 |
+
if ax is None:
|
330 |
+
_, ax = plt.subplots(1)
|
331 |
+
|
332 |
+
cmap = plt.get_cmap('RdBu')
|
333 |
+
projections_df['color'] = ((projections_df['projection'] + 0.5).apply(cmap))
|
334 |
+
most_extream_projection = np.round(
|
335 |
+
projections_df['projection']
|
336 |
+
.abs()
|
337 |
+
.max(),
|
338 |
+
decimals=1)
|
339 |
+
|
340 |
+
if plot_2D:
|
341 |
+
sns.barplot(x='projection', y='word', data=projections_df,
|
342 |
+
palette=projections_df['color'])
|
343 |
+
else:
|
344 |
+
sns.scatterplot(x='projection_x', y='projection_y', data=projections_df,
|
345 |
+
palette=projections_df['color'])
|
346 |
+
|
347 |
+
plt.xticks(np.arange(-most_extream_projection,
|
348 |
+
most_extream_projection + axis_projection_step,
|
349 |
+
axis_projection_step))
|
350 |
+
|
351 |
+
x_label = '← {} {} {} →'.format(name_left,
|
352 |
+
' ' * 20,
|
353 |
+
name_right)
|
354 |
+
if not plot_2D:
|
355 |
+
y_label = '← {} {} {} →'.format(name_top,
|
356 |
+
' ' * 20,
|
357 |
+
name_bottom)
|
358 |
+
for _, row in (projections_df.iterrows()):
|
359 |
+
ax.annotate(row['word'], (row['projection_x'], row['projection_y']))
|
360 |
+
|
361 |
+
plt.xlabel(x_label)
|
362 |
+
plt.ylabel('Words')
|
363 |
+
|
364 |
+
if not plot_2D:
|
365 |
+
ax.xaxis.set_label_position('bottom')
|
366 |
+
ax.xaxis.set_label_coords(.5, 0)
|
367 |
+
|
368 |
+
plt.ylabel(y_label)
|
369 |
+
ax.yaxis.set_label_position('left')
|
370 |
+
ax.yaxis.set_label_coords(0, .5)
|
371 |
+
|
372 |
+
ax.spines['left'].set_position('center')
|
373 |
+
ax.spines['bottom'].set_position('center')
|
374 |
+
|
375 |
+
ax.set_xticks([])
|
376 |
+
ax.set_yticks([])
|
377 |
+
|
378 |
+
return ax
|
379 |
+
|
380 |
+
# TODO: Would be erased if decided to keep all info in BiasWordExplorer
|
381 |
+
class WEBiasExplorer2d(WordBiasExplorer):
|
382 |
+
def __init__(self, word_embedding) -> None:
|
383 |
+
super().__init__(word_embedding)
|
384 |
+
|
385 |
+
def calculate_bias( self,
|
386 |
+
palabras_extremo_1,
|
387 |
+
palabras_extremo_2,
|
388 |
+
palabras_para_situar
|
389 |
+
):
|
390 |
+
wordlists = [palabras_extremo_1, palabras_extremo_2, palabras_para_situar]
|
391 |
+
|
392 |
+
err = self.check_oov(wordlists)
|
393 |
+
for wordlist in wordlists:
|
394 |
+
if not wordlist:
|
395 |
+
err = "<center><h3>" + 'Debe ingresar al menos 1 palabra en las lista de palabras a diagnosticar, sesgo 1 y sesgo 2' + "<center><h3>"
|
396 |
+
if err:
|
397 |
+
return None, err
|
398 |
+
|
399 |
+
im = self.get_bias_plot(
|
400 |
+
palabras_para_situar,
|
401 |
+
definitional=(
|
402 |
+
palabras_extremo_1, palabras_extremo_2),
|
403 |
+
method='sum',
|
404 |
+
n_extreme=10
|
405 |
+
)
|
406 |
+
return im, ''
|
407 |
+
|
408 |
+
def get_bias_plot(self,
|
409 |
+
palabras_para_situar,
|
410 |
+
definitional,
|
411 |
+
method='sum',
|
412 |
+
n_extreme=10,
|
413 |
+
figsize=(10, 10)
|
414 |
+
):
|
415 |
+
|
416 |
+
fig, ax = plt.subplots(1, figsize=figsize)
|
417 |
+
self.method = method
|
418 |
+
self.plot_projection_scores(
|
419 |
+
definitional,
|
420 |
+
palabras_para_situar, n_extreme, ax=ax,)
|
421 |
+
|
422 |
+
fig.tight_layout()
|
423 |
+
fig.canvas.draw()
|
424 |
+
|
425 |
+
data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
|
426 |
+
w, h = fig.canvas.get_width_height()
|
427 |
+
im = data.reshape((int(h), int(w), -1))
|
428 |
+
return im
|
429 |
+
|
430 |
+
def plot_projection_scores(self, definitional,
|
431 |
+
words, n_extreme=10,
|
432 |
+
ax=None, axis_projection_step=None):
|
433 |
+
"""Plot the projection scalar of words on the direction.
|
434 |
+
:param list words: The words tor project
|
435 |
+
:param int or None n_extreme: The number of extreme words to show
|
436 |
+
:return: The ax object of the plot
|
437 |
+
"""
|
438 |
+
nombre_del_extremo_1 = ', '.join(definitional[0])
|
439 |
+
nombre_del_extremo_2 = ', '.join(definitional[1])
|
440 |
+
|
441 |
+
self._identify_direction(nombre_del_extremo_1, nombre_del_extremo_2,
|
442 |
+
definitional=definitional,
|
443 |
+
method='sum')
|
444 |
+
|
445 |
+
self._is_direction_identified()
|
446 |
+
|
447 |
+
projections_df = self._calc_projection_scores(words)
|
448 |
+
projections_df['projection'] = projections_df['projection'].round(2)
|
449 |
+
|
450 |
+
if n_extreme is not None:
|
451 |
+
projections_df = take_two_sides_extreme_sorted(projections_df,
|
452 |
+
n_extreme=n_extreme)
|
453 |
+
|
454 |
+
if ax is None:
|
455 |
+
_, ax = plt.subplots(1)
|
456 |
+
|
457 |
+
if axis_projection_step is None:
|
458 |
+
axis_projection_step = 0.1
|
459 |
+
|
460 |
+
cmap = plt.get_cmap('RdBu')
|
461 |
+
projections_df['color'] = ((projections_df['projection'] + 0.5)
|
462 |
+
.apply(cmap))
|
463 |
+
|
464 |
+
most_extream_projection = np.round(
|
465 |
+
projections_df['projection']
|
466 |
+
.abs()
|
467 |
+
.max(),
|
468 |
+
decimals=1)
|
469 |
+
|
470 |
+
sns.barplot(x='projection', y='word', data=projections_df,
|
471 |
+
palette=projections_df['color'])
|
472 |
+
|
473 |
+
plt.xticks(np.arange(-most_extream_projection,
|
474 |
+
most_extream_projection + axis_projection_step,
|
475 |
+
axis_projection_step))
|
476 |
+
xlabel = ('← {} {} {} →'.format(self.negative_end,
|
477 |
+
' ' * 20,
|
478 |
+
self.positive_end))
|
479 |
+
|
480 |
+
plt.xlabel(xlabel)
|
481 |
+
plt.ylabel('Words')
|
482 |
+
|
483 |
+
return ax
|
484 |
+
|
485 |
+
|
class WEBiasExplorer4d(WordBiasExplorer):
    def __init__(self, word_embedding) -> None:
        super().__init__(word_embedding)

    def calculate_bias(self,
                       palabras_extremo_1,
                       palabras_extremo_2,
                       palabras_extremo_3,
                       palabras_extremo_4,
                       palabras_para_situar
                       ):
        wordlists = [
            palabras_extremo_1,
            palabras_extremo_2,
            palabras_extremo_3,
            palabras_extremo_4,
            palabras_para_situar
        ]
        for wordlist in wordlists:
            if not wordlist:
                # Every list must hold at least one word to span the four ends.
                err = "<center><h3>" + \
                    '¡Para graficar con 4 espacios, debe ingresar al menos 1 palabra en todas las listas!' + \
                    "</h3></center>"
                return None, err

        err = self.check_oov(wordlists)
        if err:
            return None, err

        im = self.get_bias_plot(
            palabras_para_situar,
            definitional_1=(palabras_extremo_1, palabras_extremo_2),
            definitional_2=(palabras_extremo_3, palabras_extremo_4),
            method='sum',
            n_extreme=10
        )
        return im, ''

    def get_bias_plot(self,
                      palabras_para_situar,
                      definitional_1,
                      definitional_2,
                      method='sum',
                      n_extreme=10,
                      figsize=(10, 10)
                      ):

        fig, ax = plt.subplots(1, figsize=figsize)
        self.method = method
        self.plot_projection_scores(
            definitional_1,
            definitional_2,
            palabras_para_situar, n_extreme, ax=ax)
        fig.canvas.draw()

        # Render the figure to an RGB numpy array so the UI can show it as an image.
        data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
        w, h = fig.canvas.get_width_height()
        im = data.reshape((int(h), int(w), -1))
        return im

    def plot_projection_scores(self, definitional_1, definitional_2,
                               words, n_extreme=10,
                               ax=None, axis_projection_step=None):
        """Plot the projection scalars of words on two bias directions.

        :param list words: The words to project
        :param int or None n_extreme: The number of extreme words to show
        :return: The ax object of the plot
        """

        nombre_del_extremo_1 = ', '.join(definitional_1[1])
        nombre_del_extremo_2 = ', '.join(definitional_1[0])

        self._identify_direction(nombre_del_extremo_1, nombre_del_extremo_2,
                                 definitional=definitional_1,
                                 method='sum')

        self._is_direction_identified()

        projections_df = self._calc_projection_scores(words)
        projections_df['projection_x'] = projections_df['projection'].round(2)

        nombre_del_extremo_3 = ', '.join(definitional_2[1])
        nombre_del_extremo_4 = ', '.join(definitional_2[0])
        self._identify_direction(nombre_del_extremo_3, nombre_del_extremo_4,
                                 definitional=definitional_2,
                                 method='sum')

        self._is_direction_identified()

        projections_df['projection_y'] = self._calc_projection_scores(words)[
            'projection'].round(2)

        if n_extreme is not None:
            projections_df = take_two_sides_extreme_sorted(projections_df,
                                                           n_extreme=n_extreme)

        if ax is None:
            _, ax = plt.subplots(1)

        if axis_projection_step is None:
            axis_projection_step = 0.1

        cmap = plt.get_cmap('RdBu')
        projections_df['color'] = ((projections_df['projection'] + 0.5)
                                   .apply(cmap))
        most_extreme_projection = np.round(
            projections_df['projection']
            .abs()
            .max(),
            decimals=1)
        sns.scatterplot(x='projection_x', y='projection_y', data=projections_df,
                        palette=projections_df['color'])

        plt.xticks(np.arange(-most_extreme_projection,
                             most_extreme_projection + axis_projection_step,
                             axis_projection_step))
        for _, row in projections_df.iterrows():
            ax.annotate(
                row['word'], (row['projection_x'], row['projection_y']))
        x_label = '← {} {} {} →'.format(nombre_del_extremo_1,
                                        ' ' * 20,
                                        nombre_del_extremo_2)

        y_label = '← {} {} {} →'.format(nombre_del_extremo_3,
                                        ' ' * 20,
                                        nombre_del_extremo_4)

        plt.xlabel(x_label)
        ax.xaxis.set_label_position('bottom')
        ax.xaxis.set_label_coords(.5, 0)

        plt.ylabel(y_label)
        ax.yaxis.set_label_position('left')
        ax.yaxis.set_label_coords(0, .5)

        # Draw the axes through the origin so the four bias ends read as quadrants.
        ax.spines['left'].set_position('center')
        ax.spines['bottom'].set_position('center')

        ax.set_xticks([])
        ax.set_yticks([])

        return ax
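For reference, a minimal usage sketch of `WEBiasExplorer4d`, assuming an `embedding` object compatible with `WordBiasExplorer` (the example words are illustrative, and building `embedding` happens elsewhere, e.g. in modules/model_embbeding.py):

```python
# Hypothetical sketch; `embedding` must expose whatever WordBiasExplorer expects.
from modules.module_BiasExplorer import WEBiasExplorer4d

explorer = WEBiasExplorer4d(embedding)
im, err = explorer.calculate_bias(
    ['mujer'], ['hombre'],        # ends of the first bias direction
    ['rico'], ['pobre'],          # ends of the second bias direction
    ['enfermera', 'ingeniero']    # words to place in the 2D bias plane
)
if err:
    print(err)                    # HTML-formatted message on empty or OOV lists
```

`calculate_bias` returns the rendered plot as an RGB array plus an error string, which is why the Gradio interfaces can feed its output straight into an image component.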
modules/module_WordExplorer.py
ADDED
@@ -0,0 +1,185 @@
import numpy as np
import pandas as pd
import seaborn as sns
from numpy.linalg import norm

import matplotlib as mpl
mpl.use('Agg')  # headless backend: figures are rendered off-screen for the web UI
import matplotlib.pyplot as plt


class WordToPlot:
    def __init__(self, word, color, bias_space, alpha):
        self.word = word
        self.color = color
        self.bias_space = bias_space
        self.alpha = alpha


class WordExplorer:
    def __init__(self, vocabulary) -> None:
        self.vocabulary = vocabulary

    def __errorChecking(self, word):
        out_msj = ""

        if not word:
            out_msj = "Error: Primero debe ingresar una palabra!"
        elif word not in self.vocabulary:
            out_msj = f"Error: La palabra '<b>{word}</b>' no se encuentra en el vocabulario!"

        return out_msj

    def parse_words(self, string):
        words = string.strip()
        if not words:
            return []
        return [word.strip() for word in words.split(',') if word != ""]

    def check_oov(self, wordlists):
        for wordlist in wordlists:
            for word in wordlist:
                msg = self.__errorChecking(word)
                if msg:
                    return msg
        return None

    def get_neighbors(self, word, n_neighbors, nn_method):
        return self.vocabulary.getNearestNeighbors(word, n_neighbors, nn_method)

    def get_df(self, words_embedded, processed_word_list):
        df = pd.DataFrame(words_embedded)

        df['word'] = [wtp.word for wtp in processed_word_list]
        df['color'] = [wtp.color for wtp in processed_word_list]
        df['alpha'] = [wtp.alpha for wtp in processed_word_list]
        df['word_bias_space'] = [wtp.bias_space for wtp in processed_word_list]
        return df

    def get_plot(self,
                 data,
                 processed_word_list,
                 words_embedded,
                 color_dict,
                 n_neighbors,
                 n_alpha,
                 fontsize=18,
                 figsize=(20, 15)
                 ):
        fig, ax = plt.subplots(figsize=figsize)

        # User-provided words are drawn fully opaque...
        sns.scatterplot(
            data=data[data['alpha'] == 1],
            x=0,
            y=1,
            style='word_bias_space',
            hue='word_bias_space',
            ax=ax,
            palette=color_dict
        )

        # ...while their nearest neighbors, if requested, are drawn translucent.
        if n_neighbors > 0:
            sns.scatterplot(
                data=data[data['alpha'] != 1],
                x=0,
                y=1,
                style='color',
                hue='word_bias_space',
                ax=ax,
                alpha=n_alpha,
                legend=False,
                palette=color_dict
            )
        for i, wtp in enumerate(processed_word_list):
            x, y = words_embedded[i, :]
            ax.annotate(wtp.word, xy=(x, y), xytext=(5, 2), color=wtp.color,
                        textcoords='offset points',
                        ha='right', va='bottom', size=fontsize, alpha=wtp.alpha)

        ax.set_xticks([])
        ax.set_yticks([])
        ax.set_xlabel('')
        ax.set_ylabel('')
        fig.tight_layout()

        return fig

    def plot_projections_2d(self,
                            wordlist_0,
                            wordlist_1=[],
                            wordlist_2=[],
                            wordlist_3=[],
                            wordlist_4=[],
                            **kwargs
                            ):
        # Convert the word lists to 2D points (PCA projections) for plotting.
        choices = [0, 1, 2, 3, 4]
        wordlist_choice = [
            wordlist_0,
            wordlist_1,
            wordlist_2,
            wordlist_3,
            wordlist_4
        ]

        err = self.check_oov(wordlist_choice)
        if err:
            raise Exception(err)

        color_dict = {
            0: kwargs.get('color_wordlist_0', '#000000'),
            1: kwargs.get('color_wordlist_1', '#1f78b4'),
            2: kwargs.get('color_wordlist_2', '#33a02c'),
            3: kwargs.get('color_wordlist_3', '#e31a1c'),
            4: kwargs.get('color_wordlist_4', '#6a3d9a')
        }

        n_neighbors = kwargs.get('n_neighbors', 0)
        n_alpha = kwargs.get('n_alpha', 0.3)

        processed_word_list = []
        for word_list_to_process, color in zip(wordlist_choice, choices):
            for word in word_list_to_process:
                processed_word_list.append(WordToPlot(word, color_dict[color], color, 1))

                if n_neighbors > 0:
                    neighbors = self.get_neighbors(word,
                                                   n_neighbors=n_neighbors + 1,
                                                   nn_method=kwargs.get('nn_method', 'sklearn')
                                                   )
                    for n in neighbors:
                        if n not in [wtp.word for wtp in processed_word_list]:
                            processed_word_list.append(WordToPlot(n, color_dict[color], color, n_alpha))

        if not processed_word_list:
            raise Exception('Only empty lists were passed')

        words_embedded = np.array([self.vocabulary.getPCA(wtp.word) for wtp in processed_word_list])

        data = self.get_df(words_embedded, processed_word_list)

        fig = self.get_plot(data, processed_word_list, words_embedded,
                            color_dict, n_neighbors, n_alpha,
                            kwargs.get('fontsize', 18),
                            kwargs.get('figsize', (20, 15))
                            )
        plt.show()
        return fig

    def doesnt_match(self, wordlist):
        err = self.check_oov([wordlist])
        if err:
            raise Exception(err)

        words_emb = np.array([self.vocabulary.getEmbedding(word) for word in wordlist])
        mean_vec = np.mean(words_emb, axis=0)

        # The odd one out is the word whose embedding has the lowest cosine
        # similarity to the mean of all the embeddings.
        doesnt_match = ""
        farthest_emb = 1.0
        for word in wordlist:
            word_emb = self.vocabulary.getEmbedding(word)
            cos_sim = np.dot(mean_vec, word_emb) / (norm(mean_vec) * norm(word_emb))
            if cos_sim <= farthest_emb:
                farthest_emb = cos_sim
                doesnt_match = word

        return doesnt_match
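A minimal usage sketch for `WordExplorer`, assuming a vocabulary/embedding object exposing the `getPCA`, `getEmbedding` and `getNearestNeighbors` methods used above (the example words are illustrative):

```python
# Hypothetical sketch; `embedding` is assumed to be loaded elsewhere.
from modules.module_WordExplorer import WordExplorer

explorer = WordExplorer(embedding)
fig = explorer.plot_projections_2d(
    ['rey', 'reina'],        # wordlist_0, drawn in black by default
    ['doctor', 'doctora'],   # wordlist_1
    n_neighbors=2,           # also plot 2 translucent neighbors per word
    n_alpha=0.3
)
# Odd-one-out by cosine distance to the mean vector; likely 'mesa' here.
print(explorer.doesnt_match(['perro', 'gato', 'mesa']))
```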
modules/module_ann.py
ADDED
@@ -0,0 +1,62 @@
import time
import operator
from tqdm import tqdm
from annoy import AnnoyIndex
from memory_profiler import profile


class TicToc:
    def __init__(self):
        self.i = None

    def start(self):
        self.i = time.time()

    def stop(self):
        f = time.time()
        print(f - self.i, "sec.")


class Ann:
    def __init__(self, words, vectors, coord):
        self.words = words.to_list()
        self.vectors = vectors.to_list()
        self.coord = coord.to_list()
        self.tree = None

        self.tt = TicToc()

    @profile
    def init(self, n_trees=10, metric='angular', n_jobs=-1):
        # metric options: "angular", "euclidean", "manhattan", "hamming", or "dot"
        # n_jobs=-1 builds the trees on all available CPUs

        print("Init tree...")
        self.tt.start()
        self.tree = AnnoyIndex(len(self.vectors[0]), metric=metric)
        for i, v in tqdm(enumerate(self.vectors), total=len(self.vectors)):
            self.tree.add_item(i, v)
        self.tt.stop()

        print("Build tree...")
        self.tt.start()
        self.tree.build(n_trees=n_trees, n_jobs=n_jobs)
        self.tt.stop()

    def __getWordId(self, word):
        word_id = None
        try:
            word_id = self.words.index(word)
        except ValueError:
            pass
        return word_id

    def get(self, word, n_neighbors=10):
        word_id = self.__getWordId(word)
        word_xy_list = None

        if word_id is not None:
            neighbor_ids = self.tree.get_nns_by_item(word_id, n_neighbors)
            word_xy_list = operator.itemgetter(*neighbor_ids)(self.words)
        else:
            print(f"The word '{word}' does not exist")

        return word_xy_list
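A minimal sketch of how `Ann` would be fed, assuming a pandas DataFrame with `word`, `embedding` and `pca` columns, as the `to_list()` calls in the constructor suggest (the column names are assumptions, not fixed by this commit):

```python
# Hypothetical sketch; `df` holds one row per vocabulary word.
from modules.module_ann import Ann

ann = Ann(words=df['word'], vectors=df['embedding'], coord=df['pca'])
ann.init(n_trees=20, metric='angular', n_jobs=-1)  # build once, query many times
print(ann.get('mujer', n_neighbors=10))            # nearest words, or None if OOV
```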
modules/module_connection.py
ADDED
@@ -0,0 +1,143 @@
from abc import ABC

from modules.module_WordExplorer import WordExplorer
from modules.module_BiasExplorer import WordBiasExplorer


class Connector(ABC):
    def parse_word(self, word: str):
        return word.lower().strip()

    def parse_words(self, array_in_string: str):
        words = array_in_string.strip()
        if not words:
            return []
        return [self.parse_word(word) for word in words.split(',') if word.strip() != '']

    def process_error(self, err: str):
        if not err:
            return None
        return "<center><h3>" + err + "</h3></center>"


class WordExplorerConnector(Connector):

    def __init__(self, **kwargs):
        if 'embedding' in kwargs:
            embedding = kwargs.get('embedding')
        else:
            raise KeyError('embedding')
        self.word_explorer = WordExplorer(embedding)

    def plot_proyection_2d(self,
                           wordlist_0,
                           wordlist_1,
                           wordlist_2,
                           wordlist_3,
                           wordlist_4,
                           color_wordlist_0,
                           color_wordlist_1,
                           color_wordlist_2,
                           color_wordlist_3,
                           color_wordlist_4,
                           n_alpha,
                           fontsize,
                           n_neighbors
                           ):
        err = ""
        neighbors_method = 'sklearn'
        wordlist_0 = self.parse_words(wordlist_0)
        wordlist_1 = self.parse_words(wordlist_1)
        wordlist_2 = self.parse_words(wordlist_2)
        wordlist_3 = self.parse_words(wordlist_3)
        wordlist_4 = self.parse_words(wordlist_4)

        if not (wordlist_0 or wordlist_1 or wordlist_2 or wordlist_3 or wordlist_4):
            err = self.process_error("Ingresa al menos 1 palabra para continuar")
            return None, err

        err = self.word_explorer.check_oov([wordlist_0, wordlist_1, wordlist_2, wordlist_3, wordlist_4])
        if err:
            return None, self.process_error(err)

        fig = self.word_explorer.plot_projections_2d(wordlist_0,
                                                     wordlist_1,
                                                     wordlist_2,
                                                     wordlist_3,
                                                     wordlist_4,
                                                     color_wordlist_0=color_wordlist_0,
                                                     color_wordlist_1=color_wordlist_1,
                                                     color_wordlist_2=color_wordlist_2,
                                                     color_wordlist_3=color_wordlist_3,
                                                     color_wordlist_4=color_wordlist_4,
                                                     n_alpha=n_alpha,
                                                     fontsize=fontsize,
                                                     n_neighbors=n_neighbors,
                                                     nn_method=neighbors_method
                                                     )
        return fig, self.process_error(err)


class BiasWordExplorerConnector(Connector):

    def __init__(self, **kwargs):
        if 'embedding' in kwargs:
            embedding = kwargs.get('embedding')
        else:
            raise KeyError('embedding')
        self.bias_word_explorer = WordBiasExplorer(embedding)

    def calculate_bias_2d(self,
                          wordlist_1,
                          wordlist_2,
                          to_diagnose_list
                          ):
        err = ""
        wordlist_1 = self.parse_words(wordlist_1)
        wordlist_2 = self.parse_words(wordlist_2)
        to_diagnose_list = self.parse_words(to_diagnose_list)

        word_lists = [wordlist_1, wordlist_2, to_diagnose_list]
        for wordlist in word_lists:
            if not wordlist:
                err = "Debe ingresar al menos 1 palabra en las listas de palabras a diagnosticar, sesgo 1 y sesgo 2"
        if err:
            return None, self.process_error(err)

        err = self.bias_word_explorer.check_oov(word_lists)
        if err:
            return None, self.process_error(err)

        fig = self.bias_word_explorer.plot_biased_words(to_diagnose_list, wordlist_2, wordlist_1)

        return fig, self.process_error(err)

    def calculate_bias_4d(self,
                          wordlist_1,
                          wordlist_2,
                          wordlist_3,
                          wordlist_4,
                          to_diagnose_list
                          ):
        err = ""
        wordlist_1 = self.parse_words(wordlist_1)
        wordlist_2 = self.parse_words(wordlist_2)
        wordlist_3 = self.parse_words(wordlist_3)
        wordlist_4 = self.parse_words(wordlist_4)
        to_diagnose_list = self.parse_words(to_diagnose_list)

        wordlists = [wordlist_1, wordlist_2, wordlist_3, wordlist_4, to_diagnose_list]
        for wordlist in wordlists:
            if not wordlist:
                err = "¡Para graficar con 4 espacios, debe ingresar al menos 1 palabra en todas las listas!"
        if err:
            return None, self.process_error(err)

        err = self.bias_word_explorer.check_oov(wordlists)
        if err:
            return None, self.process_error(err)

        fig = self.bias_word_explorer.plot_biased_words(to_diagnose_list, wordlist_1, wordlist_2, wordlist_3, wordlist_4)
        return fig, self.process_error(err)
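These connectors are the glue between the raw comma-separated text fields in the Gradio UI and the explorer classes. A rough wiring sketch under that assumption (the actual layout lives in the interfaces/*.py files added in this commit; `embedding` is assumed to be already loaded):

```python
# Illustrative only; see interfaces/interface_WordExplorer.py for the real UI.
from modules.module_connection import WordExplorerConnector

connector = WordExplorerConnector(embedding=embedding)
# Returns (figure_or_None, html_error_or_None), matching a Plot + HTML output pair.
fig, err = connector.plot_proyection_2d(
    "rey, reina", "", "", "", "",
    "#000000", "#1f78b4", "#33a02c", "#e31a1c", "#6a3d9a",
    0.3, 18, 0
)
```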
modules/module_logsManager.py
ADDED
@@ -0,0 +1,174 @@
import csv, os, pytz
from gradio import utils
from datetime import datetime
from dotenv import load_dotenv
from typing import Any, List, Optional
from gradio.components import IOComponent
from gradio.flagging import FlaggingCallback, _get_dataset_features_info


# --- Load environment vars ---
load_dotenv()


# --- Classes declaration ---
class DateLogs:
    def __init__(self, zone="America/Argentina/Cordoba"):
        self.time_zone = pytz.timezone(zone)

    def full(self):
        now = datetime.now(self.time_zone)
        return now.strftime("%H:%M:%S %d-%m-%Y")

    def day(self):
        now = datetime.now(self.time_zone)
        return now.strftime("%d-%m-%Y")


class HuggingFaceDatasetSaver(FlaggingCallback):
    """
    A callback that saves each flagged sample (both the input and output data)
    to a HuggingFace dataset.
    Example:
        import gradio as gr
        hf_writer = gr.HuggingFaceDatasetSaver(HF_API_TOKEN, "image-classification-mistakes")
        def image_classifier(inp):
            return {'cat': 0.3, 'dog': 0.7}
        demo = gr.Interface(fn=image_classifier, inputs="image", outputs="label",
                            allow_flagging="manual", flagging_callback=hf_writer)
    Guides: using_flagging
    """

    def __init__(
        self,
        hf_token: str = os.getenv('HF_TOKEN'),
        dataset_name: str = os.getenv('DS_LOGS_NAME'),
        organization: Optional[str] = os.getenv('ORG_NAME'),
        private: bool = True,
        available_logs: bool = False
    ):
        """
        Parameters:
            hf_token: The HuggingFace token to use to create (and write the flagged sample to) the HuggingFace dataset.
            dataset_name: The name of the dataset to save the data to, e.g. "image-classifier-1"
            organization: The organization to save the dataset under. The hf_token must provide write access to this organization. If not provided, saved under the name of the user corresponding to the hf_token.
            private: Whether the dataset should be private (defaults to True).
            available_logs: Whether to actually push logs to the Hub; when False, flag() only prints and returns 0.
        """
        self.hf_token = hf_token
        self.dataset_name = dataset_name
        self.organization_name = organization
        self.dataset_private = private
        self.datetime = DateLogs()
        self.available_logs = available_logs

        if not available_logs:
            print("Push: logs DISABLED!...")

    def setup(
        self,
        components: List[IOComponent],
        flagging_dir: str
    ):
        """
        Params:
            flagging_dir (str): local directory where the dataset is cloned,
            updated, and pushed from.
        """
        if self.available_logs:

            try:
                import huggingface_hub
            except (ImportError, ModuleNotFoundError):
                raise ImportError(
                    "Package `huggingface_hub` not found is needed "
                    "for HuggingFaceDatasetSaver. Try 'pip install huggingface_hub'."
                )

            path_to_dataset_repo = huggingface_hub.create_repo(
                repo_id=os.path.join(self.organization_name, self.dataset_name),
                token=self.hf_token,
                private=self.dataset_private,
                repo_type="dataset",
                exist_ok=True,
            )

            self.path_to_dataset_repo = path_to_dataset_repo
            self.components = components
            self.flagging_dir = flagging_dir
            self.dataset_dir = self.dataset_name

            self.repo = huggingface_hub.Repository(
                local_dir=self.dataset_dir,
                clone_from=path_to_dataset_repo,
                use_auth_token=self.hf_token,
            )

            self.repo.git_pull(lfs=True)

            # Should filename be user-specified?
            # log_file_name = self.datetime.day()+"_"+self.flagging_dir+".csv"
            self.log_file = os.path.join(self.dataset_dir, self.flagging_dir + ".csv")

    def flag(
        self,
        flag_data: List[Any],
        flag_option: Optional[str] = None,
        flag_index: Optional[int] = None,
        username: Optional[str] = None,
    ) -> int:

        if self.available_logs:
            self.repo.git_pull(lfs=True)

            is_new = not os.path.exists(self.log_file)

            with open(self.log_file, "a", newline="", encoding="utf-8") as csvfile:
                writer = csv.writer(csvfile)

                # File previews for certain input and output types
                infos, file_preview_types, headers = _get_dataset_features_info(
                    is_new, self.components
                )

                # Generate the headers and dataset_infos
                if is_new:
                    headers = [
                        component.label or f"component {idx}"
                        for idx, component in enumerate(self.components)
                    ] + [
                        "flag",
                        "username",
                        "timestamp",
                    ]
                    writer.writerow(utils.sanitize_list_for_csv(headers))

                # Generate the row corresponding to the flagged sample
                csv_data = []
                for component, sample in zip(self.components, flag_data):
                    save_dir = os.path.join(
                        self.dataset_dir,
                        utils.strip_invalid_filename_characters(component.label),
                    )
                    filepath = component.deserialize(sample, save_dir, None)
                    csv_data.append(filepath)
                    if isinstance(component, tuple(file_preview_types)):
                        csv_data.append(
                            "{}/resolve/main/{}".format(self.path_to_dataset_repo, filepath)
                        )

                csv_data.append(flag_option if flag_option is not None else "")
                csv_data.append(username if username is not None else "")
                csv_data.append(self.datetime.full())
                writer.writerow(utils.sanitize_list_for_csv(csv_data))

            with open(self.log_file, "r", encoding="utf-8") as csvfile:
                line_count = len([None for row in csv.reader(csvfile)]) - 1

            self.repo.push_to_hub(commit_message="Flagged sample #{}".format(line_count))

        else:
            line_count = 0
            print("Logs: Virtual push...")

        return line_count
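The saver reads `HF_TOKEN`, `DS_LOGS_NAME` and `ORG_NAME` from the environment (or a local `.env` file loaded by `python-dotenv`), so enabling it is just a matter of constructing it with logging turned on; a minimal sketch, assuming those variables are set:

```python
# Minimal sketch; HF_TOKEN, DS_LOGS_NAME and ORG_NAME must be set in the
# environment or in a .env file (placeholders, never commit real tokens).
from modules.module_logsManager import HuggingFaceDatasetSaver

hf_writer = HuggingFaceDatasetSaver(available_logs=True)  # env vars picked up by default
# Then pass it to gr.Interface(..., allow_flagging="manual", flagging_callback=hf_writer)
```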
requirements.txt
ADDED
@@ -0,0 +1,10 @@
scikit-learn
gensim==3.7.3
transformers
matplotlib
numpy
seaborn
python-dotenv
memory_profiler
annoy
tool_info.py
ADDED
@@ -0,0 +1,23 @@
TOOL_INFO = """
> ### A tool to overcome technical barriers for bias assessment in human language technologies

* [Read Full Paper](https://arxiv.org/abs/2207.06591)

> ### Licensing Information
* [MIT License](https://huggingface.co/spaces/vialibre/vialibre/bias_we_std_tool/resolve/main/LICENSE)

> ### Citation Information
```bibtex
@misc{https://doi.org/10.48550/arxiv.2207.06591,
  doi = {10.48550/ARXIV.2207.06591},
  url = {https://arxiv.org/abs/2207.06591},
  author = {Alemany, Laura Alonso and Benotti, Luciana and González, Lucía and Maina, Hernán and Busaniche, Beatriz and Halvorsen, Alexia and Bordone, Matías and Sánchez, Jorge},
  keywords = {Computation and Language (cs.CL), Artificial Intelligence (cs.AI), FOS: Computer and information sciences},
  title = {A tool to overcome technical barriers for bias assessment in human language technologies},
  publisher = {arXiv},
  year = {2022},
  copyright = {Creative Commons Attribution Non Commercial Share Alike 4.0 International}
}
```
"""