LMartinezEXEX committed
Commit: a779273 · Parent(s): 2b2d321
Init commit
Files changed:
- .gitattributes +3 -1
- .gitignore +1 -0
- LICENSE +21 -0
- app.py +47 -0
- data/data_loader.py +41 -0
- data/fasttext_embedding_v6.zip +3 -0
- data/mini_embedding_v6.zip +3 -0
- data/wiki-news-300d-1M.vec +3 -0
- examples/.gitignore +1 -0
- examples/examples.py +122 -0
- interfaces/interface_BiasWordExplorer.py +104 -0
- interfaces/interface_WordExplorer.py +113 -0
- language/english.json +91 -0
- language/spanish.json +91 -0
- modules/model_embbeding.py +93 -0
- modules/module_BiasExplorer.py +631 -0
- modules/module_WordExplorer.py +185 -0
- modules/module_ann.py +62 -0
- modules/module_connection.py +143 -0
- modules/module_logsManager.py +174 -0
- requirements.txt +10 -0
- tool_info.py +23 -0
.gitattributes
CHANGED
@@ -2,7 +2,6 @@
 *.arrow filter=lfs diff=lfs merge=lfs -text
 *.bin filter=lfs diff=lfs merge=lfs -text
 *.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
 *.ftz filter=lfs diff=lfs merge=lfs -text
 *.gz filter=lfs diff=lfs merge=lfs -text
 *.h5 filter=lfs diff=lfs merge=lfs -text
@@ -32,3 +31,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+data/semi_embedding_v6.zip filter=lfs diff=lfs merge=lfs -text
+data/half_embedding_v6.zip filter=lfs diff=lfs merge=lfs -text
+data/wiki-news-300d-1M.vec filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1 @@
+__pycache__/
LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2022 Fundación Vía Libre
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
app.py
ADDED
@@ -0,0 +1,47 @@
+# --- Imports libs ---
+import gradio as gr
+import pandas as pd
+
+
+# --- Imports modules ---
+from modules.model_embbeding import Embedding
+
+# --- Imports interfaces ---
+from interfaces.interface_WordExplorer import interface as wordExplorer_interface
+from interfaces.interface_BiasWordExplorer import interface as biasWordExplorer_interface
+
+# --- Tool config ---
+AVAILABLE_LOGS = True          # [True | False]
+LANGUAGE = "spanish"           # [spanish | english]
+EMBEDDING_SUBSET = "fasttext"  # [fasttext | mini]
+
+# --- Init classes ---
+embedding = Embedding(
+    subset_name=EMBEDDING_SUBSET
+)
+labels = pd.read_json(f"language/{LANGUAGE}.json")["app"]
+
+# --- Main App ---
+INTERFACE_LIST = [
+    biasWordExplorer_interface(
+        embedding=embedding,
+        available_logs=AVAILABLE_LOGS,
+        lang=LANGUAGE),
+    wordExplorer_interface(
+        embedding=embedding,
+        available_logs=AVAILABLE_LOGS,
+        lang=LANGUAGE),
+]
+
+TAB_NAMES = [
+    labels["biasWordExplorer"],
+    labels["wordExplorer"],
+]
+
+iface = gr.TabbedInterface(
+    interface_list=INTERFACE_LIST,
+    tab_names=TAB_NAMES
+)
+
+iface.queue(concurrency_count=8)
+iface.launch(debug=False)
data/data_loader.py
ADDED
@@ -0,0 +1,41 @@
+import pandas as pd
+from sklearn.decomposition import PCA
+from gensim.models import KeyedVectors
+
+def load_embeddings(path, binary=False, randomPCA=False, limit=None):
+    if randomPCA:
+        pca = PCA(n_components=2,
+                  copy=False,
+                  whiten=False,
+                  svd_solver='randomized',
+                  iterated_power='auto'
+                  )
+    else:
+        pca = PCA(n_components=2)
+
+    model = KeyedVectors.load_word2vec_format(path, binary=binary, limit=limit)
+
+    # Cased vocab
+    cased_words = model.vocab.keys()
+
+    # Normalized vectors
+    model.init_sims(replace=True)
+    cased_emb = [model[word] for word in cased_words]
+
+    # PCA reduction
+    cased_pca = pca.fit_transform(cased_emb)
+
+    df_cased = pd.DataFrame(
+        zip(
+            cased_words,
+            cased_emb,
+            cased_pca
+        ),
+        columns=['word', 'embedding', 'pca']
+    )
+
+    df_cased['word'] = df_cased.word.apply(lambda w: w.lower())
+    df_uncased = df_cased.drop_duplicates(subset='word')
+    df_uncased.to_json(path[:-3] + 'json')
+
+load_embeddings('./wiki-news-300d-1M.vec', limit=10000)
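A short sketch of reading back the JSON that load_embeddings writes (here './wiki-news-300d-1M.json', derived from the input path); it shows the word/embedding/pca schema that the Embedding class in modules/model_embbeding.py reads later in this commit. The index position used below is illustrative only.

import pandas as pd

# Load the preprocessed vocabulary produced by load_embeddings above.
df = pd.read_json('./wiki-news-300d-1M.json')

# One row per lowercased word: the full vector plus its 2-d PCA projection.
row = df.iloc[0]
print(row['word'], len(row['embedding']), row['pca'])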
data/fasttext_embedding_v6.zip
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c35f3dda1d216d9baed3fc77f3b6bb51130f07faf0ee418029344635a0b732b7
+size 165727812
data/mini_embedding_v6.zip
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6fa1594f66f29388719f9125eebdd529054f31bc9564e609d5162ba328a054be
+size 94479
data/wiki-news-300d-1M.vec
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dd4d0ea4f00dbd94ea4948957506f5c6601dd06c54150f898ce1acc15621284b
+size 2259088777
examples/.gitignore
ADDED
@@ -0,0 +1 @@
+__pycache__
examples/examples.py
ADDED
@@ -0,0 +1,122 @@
+example_fem = {
+    "mujer": "la mente de una mujer que durante los últimos",
+    "chica": "enamorado de la misma chica desde la infancia mary",
+    "ella": "ella llego a la final",
+    "madre": "su padre y su madre margarita de parma",
+    "hija": "hija de inmigrantes españoles en",
+    "femenino": "campeonato mundial de voleibol femenino fue la duodécima edición",
+}
+example_joven = {
+    "joven": "",
+    "inmaduro": "",
+    "niño": "",
+    "crio": ""
+}
+example_viejo = {
+    "viejo": "",
+    "maduro": "",
+    "anciano": "",
+    "adulto": ""
+}
+
+
+example_masc = {
+    "hombre": "deseo innato que todo hombre tiene de comunicar su",
+    "chico": "fue un chico interesado en artes",
+    "el": "el parque nacional liwonde",
+    "padre": "la muerte de su padre en 1832 se formó",
+    "hijo": "le dice a su hijo aún no nacido como",
+    "masculino": "el mito es esencialmente masculino y entre las causas",
+}
+
+example_diagnose = {
+    "ario": "establecer que el pueblo ario vivió en inmemoriales tiempos",
+    "educación": "sentido de vida religión educación y cultura para cada mujer",
+    "pagado": "un rescate muy grande pagado por sus seguidores a",
+    "cocinar": "empezó a cocinar una sopa usando",
+    "lavar": "era directamente usado para lavar ropa por eso la",
+    "deporte": "se convirtió en el deporte más popular del país",
+    "ropa": "usan el kimono una ropa tradicional japonesa",
+    "pelea": "mal por la violenta pelea entre ambos hermanos",
+    "enfermero": "en enfermería el diagnóstico enfermero o diagnóstico de enfermería es",
+    "ganar": "una necesidad un modo de ganar",
+    "líder": "del estado en manos del líder opositor henrique capriles para el",
+    "coser": "realizar tareas domésticas básicas como coser y poner la mesa",
+    "cuidar": "de la fpf encargada de cuidar los intereses de los clubes",
+    "cirujano": "afrancesado ocupando el puesto de cirujano militar en el ejército josefino",
+    "rey": "la princesa jeongsung esposa del rey danjong que ascendió al trono",
+    "reina": "año ganó el título de reina de la bahía en el"
+}
+
+
+fem_words = ','.join([word for word, context in example_fem.items()])
+fem_contexts = ','.join([context for word, context in example_fem.items()])
+masc_words = ','.join([word for word, context in example_masc.items()])
+masc_contexts = ','.join([context for word, context in example_masc.items()])
+young_words = ','.join([word for word, context in example_joven.items()])
+old_words = ','.join([word for word, context in example_viejo.items()])
+diagnose_words = ','.join([word for word, context in example_diagnose.items()])
+diagnose_contexts = ','.join([context for word, context in example_diagnose.items()])
+
+positive_money_words = 'ahorrar,economizar,administrar,manejar,negocio,beneficios'
+negative_money_words = 'malgastar,derrochar'
+diagnose_money = 'alemán,australiano,argentino,millonario,rico,pobre'
+
+lazy_words = 'vago, perezoso, gandul'
+active_words = 'trabajar, esfuerzo, trabajador'
+
+examples1_explorar_sesgo_en_palabras = [
+    [fem_words, masc_words, diagnose_words],
+    [old_words, young_words, diagnose_words],
+    [positive_money_words, negative_money_words, diagnose_money],
+    [lazy_words, active_words, diagnose_money]
+]
+
+examples2_explorar_sesgo_en_palabras = [
+    [fem_words, masc_words, young_words, old_words, diagnose_words],
+    [lazy_words, active_words, positive_money_words, negative_money_words, diagnose_money],
+]
+
+
+examples_explorar_relaciones_entre_palabras = [
+    [diagnose_words, fem_words, masc_words, young_words, old_words],
+    [diagnose_money, lazy_words, active_words, positive_money_words, negative_money_words],
+]
+
+
+# Examples Datos
+examples_datos = ["ronronear", "ella", "el", "ellos"]
+
+
+# Examples Sesgos en frases
+examples_sesgos_frases = [
+    ["los * manejan bien la plata", "argentinos,pobres,millonarios,australianos"],
+    ["los cordobeses bailan *", ""],
+    ["en la década de los ochenta, el * fue lo más escuchada en Argentina", "rock,tango,jazz,blues"],
+    ["", ""]
+]
+
+# Examples Crows_pairs
+examples_crows_pairs = [
+    ["Los hombres 'musculosos' tienen más suerte con las mujeres",
+     "Los hombres 'gordos' tienen más suerte con las mujeres",
+     "Los hombres 'delgados' tienen más suerte con las mujeres",
+     "Los hombres 'bajos' tienen más suerte con las mujeres",
+     "Los hombres 'rubios' tienen más suerte con las mujeres",
+     "Los hombres 'altos' tienen más suerte con las mujeres"],
+    ["No cualquier 'mujer' con hijos logra ser 'una buena' profesional",
+     "No cualquier 'hombre' con hijos logra ser 'un buen' profesional",
+     "", "", "", ""],
+    ["Todos sabemos que los 'mapuches' son alcohólicos.",
+     "Todos sabemos que los 'descendientes de europeos' son alcohólicos.",
+     "", "", "", ""],
+    ["Los 'orientales' no pueden manejar porque tiene los ojos achinados.",
+     "Los 'occidentales' no pueden manejar porque tiene los ojos achinados.",
+     "", "", "", ""],
+    ["A 'las viejas' les gusta contar chusmerio",
+     "A 'los viejos' les gusta contar chusmerio",
+     "", "", "", ""],
+    ["Mi hermana fue al shopping para Navidad a comprar regalos a última hora y se perdió ya que es una 'morocha' tonta.",
+     "Mi hermana fue al shopping para Navidad a comprar regalos a última hora y se perdió ya que es una 'rubia' tonta.",
+     "", "", "", ""]
+]
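For reference, the dictionaries above are flattened into plain comma-separated strings because the Gradio textboxes take raw text. The split shown below is an assumption about how the connector modules (not visible in this view of the commit) turn the strings back into word lists:

# fem_words above evaluates to "mujer,chica,ella,madre,hija,femenino"
words = fem_words.split(',')        # back to a word list
contexts = fem_contexts.split(',')  # parallel context list (no commas inside contexts)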
interfaces/interface_BiasWordExplorer.py
ADDED
@@ -0,0 +1,104 @@
+import gradio as gr
+import pandas as pd
+
+from tool_info import TOOL_INFO
+from modules.module_logsManager import HuggingFaceDatasetSaver
+from modules.module_connection import BiasWordExplorerConnector
+from examples.examples import examples1_explorar_sesgo_en_palabras, examples2_explorar_sesgo_en_palabras
+
+# --- Interface ---
+def interface(embedding, available_logs, lang="spanish"):
+    # --- Init logs ---
+    log_callback = HuggingFaceDatasetSaver(
+        available_logs=available_logs
+    )
+    # --- Init vars ---
+    connector = BiasWordExplorerConnector(embedding=embedding)
+    labels = pd.read_json(f"language/{lang}.json")["BiasWordExplorer_interface"]
+
+    interface = gr.Blocks()
+    with interface:
+        gr.Markdown(labels["step1"])
+        with gr.Row():
+            with gr.Column():
+                with gr.Row():
+                    diagnose_list = gr.Textbox(lines=2, label=labels["wordListToDiagnose"])
+                with gr.Row():
+                    gr.Markdown(labels["step2&2Spaces"])
+                with gr.Row():
+                    wordlist_1 = gr.Textbox(lines=2, label=labels["wordList1"])
+                    wordlist_2 = gr.Textbox(lines=2, label=labels["wordList2"])
+                with gr.Row():
+                    gr.Markdown(labels["step2&4Spaces"])
+                with gr.Row():
+                    wordlist_3 = gr.Textbox(lines=2, label=labels["wordList3"])
+                    wordlist_4 = gr.Textbox(lines=2, label=labels["wordList4"])
+            with gr.Column():
+                with gr.Row():
+                    bias2d = gr.Button(labels["plot2SpacesButton"])
+                with gr.Row():
+                    bias4d = gr.Button(labels["plot4SpacesButton"])
+                with gr.Row():
+                    err_msg = gr.Markdown(label='', visible=True)
+                with gr.Row():
+                    bias_plot = gr.Plot(label="", show_label=False)
+                with gr.Row():
+                    examples = gr.Examples(
+                        fn=connector.calculate_bias_2d,
+                        inputs=[wordlist_1, wordlist_2, diagnose_list],
+                        outputs=[bias_plot, err_msg],
+                        examples=examples1_explorar_sesgo_en_palabras,
+                        label=labels["examples2Spaces"]
+                    )
+                with gr.Row():
+                    examples = gr.Examples(
+                        fn=connector.calculate_bias_4d,
+                        inputs=[wordlist_1, wordlist_2,
+                                wordlist_3, wordlist_4, diagnose_list],
+                        outputs=[bias_plot, err_msg],
+                        examples=examples2_explorar_sesgo_en_palabras,
+                        label=labels["examples4Spaces"]
+                    )
+
+        with gr.Row():
+            gr.Markdown(TOOL_INFO)
+
+        bias2d.click(
+            fn=connector.calculate_bias_2d,
+            inputs=[wordlist_1, wordlist_2, diagnose_list],
+            outputs=[bias_plot, err_msg]
+        )
+
+        bias4d.click(
+            fn=connector.calculate_bias_4d,
+            inputs=[wordlist_1, wordlist_2, wordlist_3, wordlist_4, diagnose_list],
+            outputs=[bias_plot, err_msg]
+        )
+
+        # --- Logs ---
+        save_field = [wordlist_1, wordlist_2, wordlist_3, wordlist_4, diagnose_list]
+        log_callback.setup(components=save_field, flagging_dir="edia_bias_we_es")
+
+        bias2d.click(
+            fn=lambda *args: log_callback.flag(
+                flag_data=args,
+                flag_option="plot_2d",
+                username="vialibre"
+            ),
+            inputs=save_field,
+            outputs=None,
+            preprocess=False
+        )
+
+        bias4d.click(
+            fn=lambda *args: log_callback.flag(
+                flag_data=args,
+                flag_option="plot_4d",
+                username="vialibre"
+            ),
+            inputs=save_field,
+            outputs=None,
+            preprocess=False
+        )
+    return interface
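A minimal standalone-launch sketch for this tab, assuming data/mini_embedding_v6.zip is available locally; app.py normally composes it with the word explorer in a gr.TabbedInterface instead:

from modules.model_embbeding import Embedding
from interfaces.interface_BiasWordExplorer import interface

demo = interface(
    embedding=Embedding(subset_name="mini"),  # assumes the mini subset zip is present
    available_logs=False,                     # skip HF dataset logging when run locally
    lang="spanish"
)
demo.launch()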
interfaces/interface_WordExplorer.py
ADDED
@@ -0,0 +1,113 @@
+import gradio as gr
+import pandas as pd
+import matplotlib.pyplot as plt
+
+from tool_info import TOOL_INFO
+from modules.module_connection import WordExplorerConnector
+from modules.module_logsManager import HuggingFaceDatasetSaver
+from examples.examples import examples_explorar_relaciones_entre_palabras
+
+plt.rcParams.update({'font.size': 14})
+
+def interface(embedding, available_logs, lang="spanish"):
+    # --- Init logs ---
+    log_callback = HuggingFaceDatasetSaver(
+        available_logs=available_logs
+    )
+    # --- Init vars ---
+    connector = WordExplorerConnector(embedding=embedding)
+    labels = pd.read_json(f"language/{lang}.json")["WordExplorer_interface"]
+
+    # --- Interface ---
+    interface = gr.Blocks()
+    with interface:
+        gr.Markdown(labels["title"])
+        with gr.Row():
+            with gr.Column(scale=3):
+                with gr.Row(equal_height=True):
+                    with gr.Column(scale=5):
+                        diagnose_list = gr.Textbox(lines=2, label=labels["wordListToDiagnose"])
+                    with gr.Column(scale=1, min_width=10):
+                        color_wordlist = gr.ColorPicker(label="", value='#000000')
+                with gr.Row():
+                    with gr.Column(scale=5):
+                        wordlist_1 = gr.Textbox(lines=2, label=labels["wordList1"])
+                    with gr.Column(scale=1, min_width=10):
+                        color_wordlist_1 = gr.ColorPicker(label="", value='#1f78b4')
+                with gr.Row():
+                    with gr.Column(scale=5):
+                        wordlist_2 = gr.Textbox(lines=2, label=labels["wordList2"])
+                    with gr.Column(scale=1, min_width=10):
+                        color_wordlist_2 = gr.ColorPicker(label="", value='#33a02c')
+                with gr.Row():
+                    with gr.Column(scale=5):
+                        wordlist_3 = gr.Textbox(lines=2, label=labels["wordList3"])
+                    with gr.Column(scale=1, min_width=10):
+                        color_wordlist_3 = gr.ColorPicker(label="", value='#e31a1c')
+                with gr.Row():
+                    with gr.Column(scale=5):
+                        wordlist_4 = gr.Textbox(lines=2, label=labels["wordList4"])
+                    with gr.Column(scale=1, min_width=10):
+                        color_wordlist_4 = gr.ColorPicker(label="", value='#6a3d9a')
+            with gr.Column(scale=4):
+                with gr.Row():
+                    with gr.Row():
+                        gr.Markdown(labels["plotNeighbours"]["title"])
+                        n_neighbors = gr.Slider(minimum=0, maximum=100, step=1, label=labels["plotNeighbours"]["quantity"])
+                    with gr.Row():
+                        alpha = gr.Slider(minimum=0.1, maximum=0.9, value=0.3, step=0.1, label=labels["options"]["transparency"])
+                        fontsize = gr.Number(value=18, label=labels["options"]["font-size"])
+                with gr.Row():
+                    btn_plot = gr.Button(labels["plot_button"])
+                with gr.Row():
+                    err_msg = gr.Markdown(label="", visible=True)
+                with gr.Row():
+                    word_proyections = gr.Plot(label="", show_label=False)
+
+        with gr.Row():
+            gr.Examples(
+                fn=connector.plot_proyection_2d,
+                inputs=[diagnose_list, wordlist_1, wordlist_2, wordlist_3, wordlist_4],
+                outputs=[word_proyections, err_msg],
+                examples=examples_explorar_relaciones_entre_palabras,
+                label=labels["examples"]
+            )
+
+        with gr.Row():
+            gr.Markdown(TOOL_INFO)
+
+        btn_plot.click(
+            fn=connector.plot_proyection_2d,
+            inputs=[
+                diagnose_list,
+                wordlist_1,
+                wordlist_2,
+                wordlist_3,
+                wordlist_4,
+                color_wordlist,
+                color_wordlist_1,
+                color_wordlist_2,
+                color_wordlist_3,
+                color_wordlist_4,
+                alpha,
+                fontsize,
+                n_neighbors
+            ],
+            outputs=[word_proyections, err_msg]
+        )
+
+        # --- Logs ---
+        save_field = [diagnose_list, wordlist_1, wordlist_2, wordlist_3, wordlist_4]
+        log_callback.setup(components=save_field, flagging_dir="edia_we_es")
+
+        btn_plot.click(
+            fn=lambda *args: log_callback.flag(
+                flag_data=args,
+                flag_option="explorar_palabras",
+                username="vialibre",
+            ),
+            inputs=save_field,
+            outputs=None,
+            preprocess=False
+        )
+    return interface
language/english.json
ADDED
@@ -0,0 +1,91 @@
+{
+    "app": {
+        "wordExplorer": "Word explorer",
+        "biasWordExplorer": "Word bias",
+        "dataExplorer": "Data bias",
+        "phraseExplorer": "Phrase bias",
+        "crowsPairsExplorer": "Crows-Pairs"
+    },
+    "WordExplorer_interface": {
+        "title": "Write some words to visualize their related ones",
+        "wordList1": "Word list 1",
+        "wordList2": "Word list 2",
+        "wordList3": "Word list 3",
+        "wordList4": "Word list 4",
+        "wordListToDiagnose": "List of words to be diagnosed",
+        "plotNeighbours": {
+            "title": "Plot neighbouring words",
+            "quantity": "Quantity"
+        },
+        "options": {
+            "font-size": "Font size",
+            "transparency": "Transparency"
+        },
+        "plot_button": "Plot in the space!",
+        "examples": "Examples"
+    },
+    "BiasWordExplorer_interface": {
+        "step1": "1. Write comma-separated words to be diagnosed",
+        "step2&2Spaces": "2. For plotting 2 spaces, fill in the following lists:",
+        "step2&4Spaces": "2. For plotting 4 spaces, also fill in the following lists:",
+        "plot2SpacesButton": "Plot 2 stereotypes!",
+        "plot4SpacesButton": "Plot 4 stereotypes!",
+        "wordList1": "Word list 1",
+        "wordList2": "Word list 2",
+        "wordList3": "Word list 3",
+        "wordList4": "Word list 4",
+        "wordListToDiagnose": "List of words to be diagnosed",
+        "examples2Spaces": "Examples in 2 spaces",
+        "examples4Spaces": "Examples in 4 spaces"
+    },
+    "PhraseExplorer_interface": {
+        "step1": "1. Enter a sentence",
+        "step2": "2. Enter words of interest (Optional)",
+        "step3": "3. Enter unwanted words (If item 2 is not completed)",
+        "sent": {
+            "title": "",
+            "placeholder": "Use * to mask the word of interest."
+        },
+        "wordList": {
+            "title": "",
+            "placeholder": "The words in the list must be comma separated"
+        },
+        "bannedWordList": {
+            "title": "",
+            "placeholder": "The words in the list must be comma separated"
+        },
+        "excludeArticles": "Exclude articles",
+        "excludePrepositions": "Exclude prepositions",
+        "excludeConjunctions": "Exclude conjunctions",
+        "resultsButton": "Get",
+        "plot": "Display of proportions",
+        "examples": "Examples"
+    },
+    "DataExplorer_interface": {
+        "step1": "1. Enter a word of interest",
+        "step2": "2. Select maximum number of contexts to retrieve",
+        "step3": "3. Select sets of interest",
+        "inputWord": {
+            "title": "",
+            "placeholder": "Enter the word ..."
+        },
+        "wordInfoButton": "Get word information",
+        "wordContextButton": "Search contexts",
+        "wordDistributionTitle": "Word distribution in vocabulary",
+        "frequencyPerSetTitle": "Frequencies of occurrence per set",
+        "contextList": "Context list"
+    },
+    "CrowsPairs_interface": {
+        "title": "1. Enter sentences to compare",
+        "sent0": "Sentence Nº 1 (*)",
+        "sent1": "Sentence Nº 2 (*)",
+        "sent2": "Sentence Nº 3 (Optional)",
+        "sent3": "Sentence Nº 4 (Optional)",
+        "sent4": "Sentence Nº 5 (Optional)",
+        "sent5": "Sentence Nº 6 (Optional)",
+        "commonPlacholder": "Use < and > to highlight word(s) of interest",
+        "compareButton": "Compare",
+        "plot": "Display of proportions",
+        "examples": "Examples"
+    }
+}
language/spanish.json
ADDED
@@ -0,0 +1,91 @@
+{
+    "app": {
+        "wordExplorer": "Explorar palabras",
+        "biasWordExplorer": "Sesgo en palabras",
+        "dataExplorer": "Sesgo en datos",
+        "phraseExplorer": "Sesgo en frases",
+        "crowsPairsExplorer": "Crows-Pairs"
+    },
+    "WordExplorer_interface": {
+        "title": "Escribi algunas palabras para visualizar sus palabras relacionadas",
+        "wordList1": "Lista de palabras 1",
+        "wordList2": "Lista de palabras 2",
+        "wordList3": "Lista de palabras 3",
+        "wordList4": "Lista de palabras 4",
+        "wordListToDiagnose": "Lista de palabras a diagnosticar",
+        "plotNeighbours": {
+            "title": "Graficar palabras relacionadas",
+            "quantity": "Cantidad"
+        },
+        "options": {
+            "font-size": "Tamaño de fuente",
+            "transparency": "Transparencia"
+        },
+        "plot_button": "¡Graficar en el espacio!",
+        "examples": "Ejemplos"
+    },
+    "BiasWordExplorer_interface": {
+        "step1": "1. Escribi palabras para diagnosticar separadas por comas",
+        "step2&2Spaces": "2. Para graficar 2 espacios, completa las siguientes listas:",
+        "step2&4Spaces": "2. Para graficar 4 espacios, además completa las siguientes listas:",
+        "plot2SpacesButton": "¡Graficar 2 estereotipos!",
+        "plot4SpacesButton": "¡Graficar 4 estereotipos!",
+        "wordList1": "Lista de palabras 1",
+        "wordList2": "Lista de palabras 2",
+        "wordList3": "Lista de palabras 3",
+        "wordList4": "Lista de palabras 4",
+        "wordListToDiagnose": "Lista de palabras a diagnosticar",
+        "examples2Spaces": "Ejemplos en 2 espacios",
+        "examples4Spaces": "Ejemplos en 4 espacios"
+    },
+    "PhraseExplorer_interface": {
+        "step1": "1. Ingrese una frase",
+        "step2": "2. Ingrese palabras de interés (Opcional)",
+        "step3": "3. Ingrese palabras no deseadas (En caso de no completar punto 2)",
+        "sent": {
+            "title": "",
+            "placeholder": "Utilice * para enmascarar la palabra de interés"
+        },
+        "wordList": {
+            "title": "",
+            "placeholder": "La lista de palabras deberán estar separadas por ,"
+        },
+        "bannedWordList": {
+            "title": "",
+            "placeholder": "La lista de palabras deberán estar separadas por ,"
+        },
+        "excludeArticles": "Excluir Artículos",
+        "excludePrepositions": "Excluir Preposiciones",
+        "excludeConjunctions": "Excluir Conjunciones",
+        "resultsButton": "Obtener",
+        "plot": "Visualización de proporciones",
+        "examples": "Ejemplos"
+    },
+    "DataExplorer_interface": {
+        "step1": "1. Ingrese una palabra de interés",
+        "step2": "2. Seleccione cantidad máxima de contextos a recuperar",
+        "step3": "3. Seleccione conjuntos de interés",
+        "inputWord": {
+            "title": "",
+            "placeholder": "Ingresar aquí la palabra ..."
+        },
+        "wordInfoButton": "Obtener información de palabra",
+        "wordContextButton": "Buscar contextos",
+        "wordDistributionTitle": "Distribución de palabra en vocabulario",
+        "frequencyPerSetTitle": "Frecuencias de aparición por conjunto",
+        "contextList": "Lista de contextos"
+    },
+    "CrowsPairs_interface": {
+        "title": "1. Ingrese frases a comparar",
+        "sent0": "Frase Nº 1 (*)",
+        "sent1": "Frase Nº 2 (*)",
+        "sent2": "Frase Nº 3 (Opcional)",
+        "sent3": "Frase Nº 4 (Opcional)",
+        "sent4": "Frase Nº 5 (Opcional)",
+        "sent5": "Frase Nº 6 (Opcional)",
+        "commonPlacholder": "Utilice comillas simples ' ' para destacar palabra/as de interés",
+        "compareButton": "Comparar",
+        "plot": "Visualización de proporciones",
+        "examples": "Ejemplos"
+    }
+}
modules/model_embbeding.py
ADDED
@@ -0,0 +1,93 @@
+import operator
+import numpy as np
+import pandas as pd
+from numpy import dot
+from gensim import matutils
+from modules.module_ann import Ann
+from memory_profiler import profile
+from sklearn.neighbors import NearestNeighbors
+
+
+class Embedding:
+    @profile
+    def __init__(self, subset_name):
+        # Dataset info
+        self.ds_subset = subset_name
+        self.ds_path = f"data/{subset_name}_embedding_v6.zip"
+
+        # Pandas dataset
+        self.ds = None
+
+        # All words embedding List[List[float]]
+        self.embedding = None
+
+        # Estimate ApproximateNearestNeighbors
+        self.ann = None
+
+        # Load embedding and pca dataset
+        self.__load()
+
+    def __contains__(self, word):
+        return word in self.ds['word'].to_list()
+
+    def __load(self):
+        print(f"Preparing {self.ds_subset} embedding...")
+
+        # --- Download dataset ---
+        self.ds = pd.read_json(self.ds_path)
+
+        # --- Get embedding from string ---
+        self.embedding = self.ds['embedding'].to_list()
+
+        # --- Get forest tree to estimate Nearest Neighbors ---
+        self.ann = Ann(
+            words=self.ds['word'],
+            vectors=self.ds['embedding'],
+            coord=self.ds['pca']
+        )
+        self.ann.init(n_trees=20, metric='dot', n_jobs=-1)
+
+        # --- Fit Sklearn NN method ---
+        self.neigh = NearestNeighbors(n_neighbors=20)
+        self.neigh.fit(self.embedding)
+
+    def __getValue(self, word, feature):
+        word_id, value = None, None
+
+        if word in self:
+            word_id = self.ds['word'].to_list().index(word)
+
+        if word_id is not None:
+            value = self.ds[feature].to_list()[word_id]
+
+        return value
+
+    def getEmbedding(self, word):
+        return self.__getValue(word, 'embedding')
+
+    def getPCA(self, word):
+        return self.__getValue(word, 'pca')
+
+    def cosineSimilarities(self, vector_1, vectors_all):
+        norm = np.linalg.norm(vector_1)
+        all_norms = np.linalg.norm(vectors_all, axis=1)
+        dot_products = dot(vectors_all, vector_1)
+        similarities = dot_products / (norm * all_norms)
+        return similarities
+
+    def getNearestNeighbors(self, word, n_neighbors=10, nn_method='sklearn'):
+        if nn_method == 'ann':
+            words = self.ann.get(word, n_neighbors)
+        elif nn_method == 'sklearn':
+            word_emb = self.getEmbedding(word)
+            neighbors = self.neigh.kneighbors([word_emb], n_neighbors)[1][0]
+            words = operator.itemgetter(*neighbors)(self.ds['word'])
+        else:
+            words = []
+        return words
+
+    def getCosineSimilarities(self, w1, w2):
+        return dot(
+            matutils.unitvec(self.getEmbedding(w1)),
+            matutils.unitvec(self.getEmbedding(w2))
+        )
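A minimal usage sketch for the Embedding class above; the lookup words are assumptions about what the mini subset vocabulary contains:

emb = Embedding(subset_name="mini")   # loads data/mini_embedding_v6.zip

word = "mujer"                        # assumed to be in the subset vocabulary
if word in emb:                       # __contains__ scans the 'word' column
    print(emb.getPCA(word))                               # 2-d PCA coordinates
    print(emb.getNearestNeighbors(word, n_neighbors=5))   # sklearn backend
    print(emb.getCosineSimilarities("mujer", "hombre"))   # both words assumed present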
modules/module_BiasExplorer.py
ADDED
@@ -0,0 +1,631 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import copy
|
2 |
+
import numpy as np
|
3 |
+
import pandas as pd
|
4 |
+
import seaborn as sns
|
5 |
+
import matplotlib.pyplot as plt
|
6 |
+
from sklearn.decomposition import PCA
|
7 |
+
|
8 |
+
def take_two_sides_extreme_sorted(df, n_extreme,
|
9 |
+
part_column=None,
|
10 |
+
head_value='',
|
11 |
+
tail_value=''):
|
12 |
+
head_df = df.head(n_extreme)[:]
|
13 |
+
tail_df = df.tail(n_extreme)[:]
|
14 |
+
|
15 |
+
if part_column is not None:
|
16 |
+
head_df[part_column] = head_value
|
17 |
+
tail_df[part_column] = tail_value
|
18 |
+
|
19 |
+
return (pd.concat([head_df, tail_df])
|
20 |
+
.drop_duplicates()
|
21 |
+
.reset_index(drop=True))
|
22 |
+
|
23 |
+
def normalize(v):
|
24 |
+
"""Normalize a 1-D vector."""
|
25 |
+
if v.ndim != 1:
|
26 |
+
raise ValueError('v should be 1-D, {}-D was given'.format(
|
27 |
+
v.ndim))
|
28 |
+
norm = np.linalg.norm(v)
|
29 |
+
if norm == 0:
|
30 |
+
return v
|
31 |
+
return v / norm
|
32 |
+
|
33 |
+
def project_params(u, v):
|
34 |
+
"""Projecting and rejecting the vector v onto direction u with scalar."""
|
35 |
+
normalize_u = normalize(u)
|
36 |
+
projection = (v @ normalize_u)
|
37 |
+
projected_vector = projection * normalize_u
|
38 |
+
rejected_vector = v - projected_vector
|
39 |
+
return projection, projected_vector, rejected_vector
|
40 |
+
|
41 |
+
|
42 |
+
def cosine_similarity(v, u):
|
43 |
+
"""Calculate the cosine similarity between two vectors."""
|
44 |
+
v_norm = np.linalg.norm(v)
|
45 |
+
u_norm = np.linalg.norm(u)
|
46 |
+
similarity = v @ u / (v_norm * u_norm)
|
47 |
+
return similarity
|
48 |
+
|
49 |
+
|
50 |
+
DIRECTION_METHODS = ['single', 'sum', 'pca']
|
51 |
+
DEBIAS_METHODS = ['neutralize', 'hard', 'soft']
|
52 |
+
FIRST_PC_THRESHOLD = 0.5
|
53 |
+
MAX_NON_SPECIFIC_EXAMPLES = 1000
|
54 |
+
|
55 |
+
__all__ = ['GenderBiasWE', 'BiasWordEmbedding']
|
56 |
+
|
57 |
+
|
58 |
+
class WordBiasExplorer():
|
59 |
+
def __init__(self, vocabulary):
|
60 |
+
# pylint: disable=undefined-variable
|
61 |
+
|
62 |
+
self.vocabulary = vocabulary
|
63 |
+
self.direction = None
|
64 |
+
self.positive_end = None
|
65 |
+
self.negative_end = None
|
66 |
+
|
67 |
+
def __copy__(self):
|
68 |
+
bias_word_embedding = self.__class__(self.vocabulary)
|
69 |
+
bias_word_embedding.direction = copy.deepcopy(self.direction)
|
70 |
+
bias_word_embedding.positive_end = copy.deepcopy(self.positive_end)
|
71 |
+
bias_word_embedding.negative_end = copy.deepcopy(self.negative_end)
|
72 |
+
return bias_word_embedding
|
73 |
+
|
74 |
+
def __deepcopy__(self, memo):
|
75 |
+
bias_word_embedding = copy.copy(self)
|
76 |
+
bias_word_embedding.model = copy.deepcopy(bias_word_embedding.model)
|
77 |
+
return bias_word_embedding
|
78 |
+
|
79 |
+
def __getitem__(self, key):
|
80 |
+
return self.vocabulary.getEmbedding(key)
|
81 |
+
|
82 |
+
def __contains__(self, item):
|
83 |
+
return item in self.vocabulary
|
84 |
+
|
85 |
+
def _is_direction_identified(self):
|
86 |
+
if self.direction is None:
|
87 |
+
raise RuntimeError('The direction was not identified'
|
88 |
+
' for this {} instance'
|
89 |
+
.format(self.__class__.__name__))
|
90 |
+
|
91 |
+
def _identify_subspace_by_pca(self, definitional_pairs, n_components):
|
92 |
+
matrix = []
|
93 |
+
|
94 |
+
for word1, word2 in definitional_pairs:
|
95 |
+
vector1 = normalize(self[word1])
|
96 |
+
vector2 = normalize(self[word2])
|
97 |
+
|
98 |
+
center = (vector1 + vector2) / 2
|
99 |
+
|
100 |
+
matrix.append(vector1 - center)
|
101 |
+
matrix.append(vector2 - center)
|
102 |
+
|
103 |
+
pca = PCA(n_components=n_components)
|
104 |
+
pca.fit(matrix)
|
105 |
+
return pca
|
106 |
+
|
107 |
+
|
108 |
+
def _identify_direction(self, positive_end, negative_end,
|
109 |
+
definitional, method='pca'):
|
110 |
+
if method not in DIRECTION_METHODS:
|
111 |
+
raise ValueError('method should be one of {}, {} was given'.format(
|
112 |
+
DIRECTION_METHODS, method))
|
113 |
+
|
114 |
+
if positive_end == negative_end:
|
115 |
+
raise ValueError('positive_end and negative_end'
|
116 |
+
'should be different, and not the same "{}"'
|
117 |
+
.format(positive_end))
|
118 |
+
direction = None
|
119 |
+
|
120 |
+
if method == 'single':
|
121 |
+
direction = normalize(normalize(self[definitional[0]])
|
122 |
+
- normalize(self[definitional[1]]))
|
123 |
+
|
124 |
+
elif method == 'sum':
|
125 |
+
group1_sum_vector = np.sum([self[word]
|
126 |
+
for word in definitional[0]], axis=0)
|
127 |
+
group2_sum_vector = np.sum([self[word]
|
128 |
+
for word in definitional[1]], axis=0)
|
129 |
+
|
130 |
+
diff_vector = (normalize(group1_sum_vector)
|
131 |
+
- normalize(group2_sum_vector))
|
132 |
+
|
133 |
+
direction = normalize(diff_vector)
|
134 |
+
|
135 |
+
elif method == 'pca':
|
136 |
+
pca = self._identify_subspace_by_pca(definitional, 10)
|
137 |
+
if pca.explained_variance_ratio_[0] < FIRST_PC_THRESHOLD:
|
138 |
+
raise RuntimeError('The Explained variance'
|
139 |
+
'of the first principal component should be'
|
140 |
+
'at least {}, but it is {}'
|
141 |
+
.format(FIRST_PC_THRESHOLD,
|
142 |
+
pca.explained_variance_ratio_[0]))
|
143 |
+
direction = pca.components_[0]
|
144 |
+
|
145 |
+
# if direction is opposite (e.g. we cannot control
|
146 |
+
# what the PCA will return)
|
147 |
+
ends_diff_projection = cosine_similarity((self[positive_end]
|
148 |
+
- self[negative_end]),
|
149 |
+
direction)
|
150 |
+
if ends_diff_projection < 0:
|
151 |
+
direction = -direction # pylint: disable=invalid-unary-operand-type
|
152 |
+
|
153 |
+
self.direction = direction
|
154 |
+
self.positive_end = positive_end
|
155 |
+
self.negative_end = negative_end
|
156 |
+
|
157 |
+
def project_on_direction(self, word):
|
158 |
+
"""Project the normalized vector of the word on the direction.
|
159 |
+
:param str word: The word tor project
|
160 |
+
:return float: The projection scalar
|
161 |
+
"""
|
162 |
+
|
163 |
+
self._is_direction_identified()
|
164 |
+
|
165 |
+
vector = self[word]
|
166 |
+
projection_score = self.vocabulary.cosineSimilarities(self.direction,
|
167 |
+
[vector])[0]
|
168 |
+
return projection_score
|
169 |
+
|
170 |
+
|
171 |
+
|
172 |
+
def _calc_projection_scores(self, words):
|
173 |
+
self._is_direction_identified()
|
174 |
+
|
175 |
+
df = pd.DataFrame({'word': words})
|
176 |
+
|
177 |
+
# TODO: maybe using cosine_similarities on all the vectors?
|
178 |
+
# it might be faster
|
179 |
+
df['projection'] = df['word'].apply(self.project_on_direction)
|
180 |
+
df = df.sort_values('projection', ascending=False)
|
181 |
+
|
182 |
+
return df
|
183 |
+
|
184 |
+
def calc_projection_data(self, words):
|
185 |
+
"""
|
186 |
+
Calculate projection, projected and rejected vectors of a words list.
|
187 |
+
:param list words: List of words
|
188 |
+
:return: :class:`pandas.DataFrame` of the projection,
|
189 |
+
projected and rejected vectors of the words list
|
190 |
+
"""
|
191 |
+
projection_data = []
|
192 |
+
for word in words:
|
193 |
+
vector = self[word]
|
194 |
+
normalized_vector = normalize(vector)
|
195 |
+
|
196 |
+
(projection,
|
197 |
+
projected_vector,
|
198 |
+
rejected_vector) = project_params(normalized_vector,
|
199 |
+
self.direction)
|
200 |
+
|
201 |
+
projection_data.append({'word': word,
|
202 |
+
'vector': vector,
|
203 |
+
'projection': projection,
|
204 |
+
'projected_vector': projected_vector,
|
205 |
+
'rejected_vector': rejected_vector})
|
206 |
+
|
207 |
+
return pd.DataFrame(projection_data)
|
208 |
+
|
209 |
+
def plot_dist_projections_on_direction(self, word_groups, ax=None):
|
210 |
+
"""Plot the projection scalars distribution on the direction.
|
211 |
+
:param dict word_groups word: The groups to projects
|
212 |
+
:return float: The ax object of the plot
|
213 |
+
"""
|
214 |
+
|
215 |
+
if ax is None:
|
216 |
+
_, ax = plt.subplots(1)
|
217 |
+
|
218 |
+
names = sorted(word_groups.keys())
|
219 |
+
|
220 |
+
for name in names:
|
221 |
+
words = word_groups[name]
|
222 |
+
label = '{} (#{})'.format(name, len(words))
|
223 |
+
vectors = [self[word] for word in words]
|
224 |
+
projections = self.vocabulary.cosineSimilarities(self.direction,
|
225 |
+
vectors)
|
226 |
+
sns.distplot(projections, hist=False, label=label, ax=ax)
|
227 |
+
|
228 |
+
plt.axvline(0, color='k', linestyle='--')
|
229 |
+
|
230 |
+
plt.title('← {} {} {} →'.format(self.negative_end,
|
231 |
+
' ' * 20,
|
232 |
+
self.positive_end))
|
233 |
+
plt.xlabel('Direction Projection')
|
234 |
+
plt.ylabel('Density')
|
235 |
+
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
|
236 |
+
|
237 |
+
return ax
|
238 |
+
|
239 |
+
def __errorChecking(self, word):
|
240 |
+
out_msj = ""
|
241 |
+
|
242 |
+
if not word:
|
243 |
+
out_msj = "Error: Primero debe ingresar una palabra!"
|
244 |
+
else:
|
245 |
+
if word not in self.vocabulary:
|
246 |
+
out_msj = f"Error: La palabra '<b>{word}</b>' no se encuentra en el vocabulario!"
|
247 |
+
|
248 |
+
return out_msj
|
249 |
+
|
250 |
+
def check_oov(self, wordlists):
|
251 |
+
for wordlist in wordlists:
|
252 |
+
for word in wordlist:
|
253 |
+
msg = self.__errorChecking(word)
|
254 |
+
if msg:
|
255 |
+
return msg
|
256 |
+
return None
|
257 |
+
|
258 |
+
def plot_biased_words(self,
|
259 |
+
words_to_diagnose,
|
260 |
+
wordlist_right,
|
261 |
+
wordlist_left,
|
262 |
+
wordlist_top=[],
|
263 |
+
wordlist_bottom=[]
|
264 |
+
):
|
265 |
+
bias_2D = wordlist_top == [] and wordlist_bottom == []
|
266 |
+
|
267 |
+
if bias_2D and (not wordlist_right or not wordlist_left):
|
268 |
+
raise Exception('For bar plot, wordlist right and left can NOT be empty')
|
269 |
+
elif not bias_2D and (not wordlist_right or not wordlist_left or not wordlist_top or not wordlist_bottom):
|
270 |
+
raise Exception('For plane plot, wordlist right, left, top and down can NOT be empty')
|
271 |
+
|
272 |
+
err = self.check_oov([words_to_diagnose + wordlist_right + wordlist_left + wordlist_top + wordlist_bottom])
|
273 |
+
if err:
|
274 |
+
raise Exception(err)
|
275 |
+
|
276 |
+
return self.get_bias_plot(bias_2D,
|
277 |
+
words_to_diagnose,
|
278 |
+
definitional_1=(wordlist_right, wordlist_left),
|
279 |
+
definitional_2=(wordlist_top, wordlist_bottom)
|
280 |
+
)
|
281 |
+
|
282 |
+
def get_bias_plot(self,
|
283 |
+
plot_2D,
|
284 |
+
words_to_diagnose,
|
285 |
+
definitional_1,
|
286 |
+
definitional_2=([], []),
|
287 |
+
method='sum',
|
288 |
+
n_extreme=10,
|
289 |
+
figsize=(15, 10)
|
290 |
+
):
|
291 |
+
fig, ax = plt.subplots(1, figsize=figsize)
|
292 |
+
self.method = method
|
293 |
+
self.plot_projection_scores(plot_2D, words_to_diagnose, definitional_1, definitional_2, n_extreme, ax)
|
294 |
+
|
295 |
+
if plot_2D:
|
296 |
+
fig.tight_layout()
|
297 |
+
fig.canvas.draw()
|
298 |
+
|
299 |
+
return fig
|
300 |
+
|
301 |
+
def plot_projection_scores(self,
|
302 |
+
plot_2D,
|
303 |
+
words,
|
304 |
+
definitional_1,
|
305 |
+
definitional_2=([], []),
|
306 |
+
n_extreme=10,
|
307 |
+
ax=None,
|
308 |
+
axis_projection_step=0.1):
|
309 |
+
name_left = ', '.join(definitional_1[1])
|
310 |
+
name_right = ', '.join(definitional_1[0])
|
311 |
+
|
312 |
+
self._identify_direction(name_left, name_right, definitional=definitional_1, method='sum')
|
313 |
+
self._is_direction_identified()
|
314 |
+
|
315 |
+
projections_df = self._calc_projection_scores(words)
|
316 |
+
projections_df['projection_x'] = projections_df['projection'].round(2)
|
317 |
+
|
318 |
+
if not plot_2D:
|
319 |
+
name_top = ', '.join(definitional_2[1])
|
320 |
+
name_bottom = ', '.join(definitional_2[0])
|
321 |
+
self._identify_direction(name_top, name_bottom, definitional=definitional_2, method='sum')
|
322 |
+
self._is_direction_identified()
|
323 |
+
|
324 |
+
projections_df['projection_y'] = self._calc_projection_scores(words)['projection'].round(2)
|
325 |
+
|
326 |
+
if n_extreme is not None:
|
327 |
+
projections_df = take_two_sides_extreme_sorted(projections_df, n_extreme=n_extreme)
|
328 |
+
|
329 |
+
if ax is None:
|
330 |
+
_, ax = plt.subplots(1)
|
331 |
+
|
332 |
+
cmap = plt.get_cmap('RdBu')
|
333 |
+
projections_df['color'] = ((projections_df['projection'] + 0.5).apply(cmap))
|
334 |
+
most_extream_projection = np.round(
|
335 |
+
projections_df['projection']
|
336 |
+
.abs()
|
337 |
+
.max(),
|
338 |
+
decimals=1)
|
339 |
+
|
340 |
+
if plot_2D:
|
341 |
+
sns.barplot(x='projection', y='word', data=projections_df,
|
342 |
+
palette=projections_df['color'])
|
343 |
+
else:
|
344 |
+
sns.scatterplot(x='projection_x', y='projection_y', data=projections_df,
|
345 |
+
palette=projections_df['color'])
|
346 |
+
|
347 |
+
plt.xticks(np.arange(-most_extream_projection,
|
348 |
+
most_extream_projection + axis_projection_step,
|
349 |
+
axis_projection_step))
|
350 |
+
|
351 |
+
x_label = '← {} {} {} →'.format(name_left,
|
352 |
+
' ' * 20,
|
353 |
+
name_right)
|
354 |
+
if not plot_2D:
|
355 |
+
y_label = '← {} {} {} →'.format(name_top,
|
356 |
+
' ' * 20,
|
357 |
+
name_bottom)
|
358 |
+
for _, row in (projections_df.iterrows()):
|
359 |
+
ax.annotate(row['word'], (row['projection_x'], row['projection_y']))
|
360 |
+
|
361 |
+
plt.xlabel(x_label)
|
362 |
+
plt.ylabel('Words')
|
363 |
+
|
364 |
+
if not plot_2D:
|
365 |
+
ax.xaxis.set_label_position('bottom')
|
366 |
+
ax.xaxis.set_label_coords(.5, 0)
|
367 |
+
|
368 |
+
plt.ylabel(y_label)
|
369 |
+
ax.yaxis.set_label_position('left')
|
370 |
+
ax.yaxis.set_label_coords(0, .5)
|
371 |
+
|
372 |
+
ax.spines['left'].set_position('center')
|
373 |
+
ax.spines['bottom'].set_position('center')
|
374 |
+
|
375 |
+
ax.set_xticks([])
|
376 |
+
ax.set_yticks([])
|
377 |
+
|
378 |
+
return ax
|
379 |
+
|
380 |
+
# TODO: Would be erased if decided to keep all info in BiasWordExplorer
|
381 |
+
class WEBiasExplorer2d(WordBiasExplorer):
|
382 |
+
def __init__(self, word_embedding) -> None:
|
383 |
+
super().__init__(word_embedding)
|
384 |
+
|
385 |
+
def calculate_bias( self,
|
386 |
+
palabras_extremo_1,
|
387 |
+
palabras_extremo_2,
|
388 |
+
palabras_para_situar
|
389 |
+
):
|
390 |
+
wordlists = [palabras_extremo_1, palabras_extremo_2, palabras_para_situar]
|
391 |
+
|
392 |
+
err = self.check_oov(wordlists)
|
393 |
+
for wordlist in wordlists:
|
394 |
+
if not wordlist:
|
395 |
+
err = "<center><h3>" + 'Debe ingresar al menos 1 palabra en las lista de palabras a diagnosticar, sesgo 1 y sesgo 2' + "<center><h3>"
|
396 |
+
if err:
|
397 |
+
return None, err
|
398 |
+
|
399 |
+
im = self.get_bias_plot(
|
400 |
+
palabras_para_situar,
|
401 |
+
definitional=(
|
402 |
+
palabras_extremo_1, palabras_extremo_2),
|
403 |
+
method='sum',
|
404 |
+
n_extreme=10
|
405 |
+
)
|
406 |
+
return im, ''
|
407 |
+
|
408 |
+
def get_bias_plot(self,
|
409 |
+
palabras_para_situar,
|
410 |
+
definitional,
|
411 |
+
method='sum',
|
412 |
+
n_extreme=10,
|
413 |
+
figsize=(10, 10)
|
414 |
+
):
|
415 |
+
|
416 |
+
fig, ax = plt.subplots(1, figsize=figsize)
|
417 |
+
self.method = method
|
418 |
+
self.plot_projection_scores(
|
419 |
+
definitional,
|
420 |
+
palabras_para_situar, n_extreme, ax=ax,)
|
421 |
+
|
422 |
+
fig.tight_layout()
|
423 |
+
fig.canvas.draw()
|
424 |
+
|
425 |
+
data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
|
426 |
+
w, h = fig.canvas.get_width_height()
|
427 |
+
im = data.reshape((int(h), int(w), -1))
|
428 |
+
return im
|
429 |
+
|
430 |
+
def plot_projection_scores(self, definitional,
|
431 |
+
words, n_extreme=10,
|
432 |
+
ax=None, axis_projection_step=None):
|
433 |
+
"""Plot the projection scalar of words on the direction.
|
434 |
+
:param list words: The words tor project
|
435 |
+
:param int or None n_extreme: The number of extreme words to show
|
436 |
+
:return: The ax object of the plot
|
437 |
+
"""
|
438 |
+
nombre_del_extremo_1 = ', '.join(definitional[0])
|
439 |
+
nombre_del_extremo_2 = ', '.join(definitional[1])
|
440 |
+
|
441 |
+
self._identify_direction(nombre_del_extremo_1, nombre_del_extremo_2,
|
442 |
+
definitional=definitional,
|
443 |
+
method='sum')
|
444 |
+
|
445 |
+
self._is_direction_identified()
|
446 |
+
|
447 |
+
projections_df = self._calc_projection_scores(words)
|
448 |
+
projections_df['projection'] = projections_df['projection'].round(2)
|
449 |
+
|
450 |
+
if n_extreme is not None:
|
451 |
+
projections_df = take_two_sides_extreme_sorted(projections_df,
|
452 |
+
n_extreme=n_extreme)
|
453 |
+
|
454 |
+
if ax is None:
|
455 |
+
_, ax = plt.subplots(1)
|
456 |
+
|
457 |
+
if axis_projection_step is None:
|
458 |
+
axis_projection_step = 0.1
|
459 |
+
|
460 |
+
cmap = plt.get_cmap('RdBu')
|
461 |
+
projections_df['color'] = ((projections_df['projection'] + 0.5)
|
462 |
+
.apply(cmap))
|
463 |
+
|
464 |
+
most_extream_projection = np.round(
|
465 |
+
projections_df['projection']
|
466 |
+
.abs()
|
467 |
+
.max(),
|
468 |
+
decimals=1)
|
469 |
+
|
470 |
+
sns.barplot(x='projection', y='word', data=projections_df,
|
471 |
+
palette=projections_df['color'])
|
472 |
+
|
473 |
+
plt.xticks(np.arange(-most_extream_projection,
|
474 |
+
most_extream_projection + axis_projection_step,
|
475 |
+
axis_projection_step))
|
476 |
+
xlabel = ('← {} {} {} →'.format(self.negative_end,
|
477 |
+
' ' * 20,
|
478 |
+
self.positive_end))
|
479 |
+
|
480 |
+
plt.xlabel(xlabel)
|
481 |
+
plt.ylabel('Words')
|
482 |
+
|
483 |
+
return ax
|
484 |
+
|
485 |
+
|
class WEBiasExplorer4d(WordBiasExplorer):
    def __init__(self, word_embedding) -> None:
        super().__init__(word_embedding)

    def calculate_bias(self,
                       palabras_extremo_1,
                       palabras_extremo_2,
                       palabras_extremo_3,
                       palabras_extremo_4,
                       palabras_para_situar
                       ):
        wordlists = [
            palabras_extremo_1,
            palabras_extremo_2,
            palabras_extremo_3,
            palabras_extremo_4,
            palabras_para_situar
        ]
        for wordlist in wordlists:
            if not wordlist:
                # Every list must hold at least one word to span the four ends.
                err = "<center><h3>" + \
                    '¡Para graficar con 4 espacios, debe ingresar al menos 1 palabra en todas las listas!' + \
                    "</h3></center>"
                return None, err

        err = self.check_oov(wordlists)
        if err:
            return None, err

        im = self.get_bias_plot(
            palabras_para_situar,
            definitional_1=(palabras_extremo_1, palabras_extremo_2),
            definitional_2=(palabras_extremo_3, palabras_extremo_4),
            method='sum',
            n_extreme=10
        )
        return im, ''

    def get_bias_plot(self,
                      palabras_para_situar,
                      definitional_1,
                      definitional_2,
                      method='sum',
                      n_extreme=10,
                      figsize=(10, 10)
                      ):

        fig, ax = plt.subplots(1, figsize=figsize)
        self.method = method
        self.plot_projection_scores(
            definitional_1,
            definitional_2,
            palabras_para_situar, n_extreme, ax=ax)
        fig.canvas.draw()

        # Render the figure to an RGB numpy array so the UI can show it as an image.
        data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
        w, h = fig.canvas.get_width_height()
        im = data.reshape((int(h), int(w), -1))
        return im

    def plot_projection_scores(self, definitional_1, definitional_2,
                               words, n_extreme=10,
                               ax=None, axis_projection_step=None):
        """Plot the projection scalars of words on two bias directions.

        :param list words: The words to project
        :param int or None n_extreme: The number of extreme words to show
        :return: The ax object of the plot
        """

        nombre_del_extremo_1 = ', '.join(definitional_1[1])
        nombre_del_extremo_2 = ', '.join(definitional_1[0])

        self._identify_direction(nombre_del_extremo_1, nombre_del_extremo_2,
                                 definitional=definitional_1,
                                 method='sum')

        self._is_direction_identified()

        projections_df = self._calc_projection_scores(words)
        projections_df['projection_x'] = projections_df['projection'].round(2)

        nombre_del_extremo_3 = ', '.join(definitional_2[1])
        nombre_del_extremo_4 = ', '.join(definitional_2[0])
        self._identify_direction(nombre_del_extremo_3, nombre_del_extremo_4,
                                 definitional=definitional_2,
                                 method='sum')

        self._is_direction_identified()

        projections_df['projection_y'] = self._calc_projection_scores(words)[
            'projection'].round(2)

        if n_extreme is not None:
            projections_df = take_two_sides_extreme_sorted(projections_df,
                                                           n_extreme=n_extreme)

        if ax is None:
            _, ax = plt.subplots(1)

        if axis_projection_step is None:
            axis_projection_step = 0.1

        cmap = plt.get_cmap('RdBu')
        projections_df['color'] = ((projections_df['projection'] + 0.5)
                                   .apply(cmap))
        most_extreme_projection = np.round(
            projections_df['projection']
            .abs()
            .max(),
            decimals=1)
        sns.scatterplot(x='projection_x', y='projection_y', data=projections_df,
                        palette=projections_df['color'])

        plt.xticks(np.arange(-most_extreme_projection,
                             most_extreme_projection + axis_projection_step,
                             axis_projection_step))
        for _, row in projections_df.iterrows():
            ax.annotate(
                row['word'], (row['projection_x'], row['projection_y']))
        x_label = '← {} {} {} →'.format(nombre_del_extremo_1,
                                        ' ' * 20,
                                        nombre_del_extremo_2)

        y_label = '← {} {} {} →'.format(nombre_del_extremo_3,
                                        ' ' * 20,
                                        nombre_del_extremo_4)

        plt.xlabel(x_label)
        ax.xaxis.set_label_position('bottom')
        ax.xaxis.set_label_coords(.5, 0)

        plt.ylabel(y_label)
        ax.yaxis.set_label_position('left')
        ax.yaxis.set_label_coords(0, .5)

        # Draw the axes through the origin so the four bias ends read as quadrants.
        ax.spines['left'].set_position('center')
        ax.spines['bottom'].set_position('center')

        ax.set_xticks([])
        ax.set_yticks([])

        return ax
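For reference, a minimal usage sketch of `WEBiasExplorer4d`, assuming an `embedding` object compatible with `WordBiasExplorer` (the example words are illustrative, and building `embedding` happens elsewhere, e.g. in modules/model_embbeding.py):

```python
# Hypothetical sketch; `embedding` must expose whatever WordBiasExplorer expects.
from modules.module_BiasExplorer import WEBiasExplorer4d

explorer = WEBiasExplorer4d(embedding)
im, err = explorer.calculate_bias(
    ['mujer'], ['hombre'],        # ends of the first bias direction
    ['rico'], ['pobre'],          # ends of the second bias direction
    ['enfermera', 'ingeniero']    # words to place in the 2D bias plane
)
if err:
    print(err)                    # HTML-formatted message on empty or OOV lists
```

`calculate_bias` returns the rendered plot as an RGB array plus an error string, which is why the Gradio interfaces can feed its output straight into an image component.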
modules/module_WordExplorer.py
ADDED
@@ -0,0 +1,185 @@
import numpy as np
import pandas as pd
import seaborn as sns
from numpy.linalg import norm

import matplotlib as mpl
mpl.use('Agg')  # headless backend: figures are rendered off-screen for the web UI
import matplotlib.pyplot as plt


class WordToPlot:
    def __init__(self, word, color, bias_space, alpha):
        self.word = word
        self.color = color
        self.bias_space = bias_space
        self.alpha = alpha


class WordExplorer:
    def __init__(self, vocabulary) -> None:
        self.vocabulary = vocabulary

    def __errorChecking(self, word):
        out_msj = ""

        if not word:
            out_msj = "Error: Primero debe ingresar una palabra!"
        elif word not in self.vocabulary:
            out_msj = f"Error: La palabra '<b>{word}</b>' no se encuentra en el vocabulario!"

        return out_msj

    def parse_words(self, string):
        words = string.strip()
        if not words:
            return []
        return [word.strip() for word in words.split(',') if word != ""]

    def check_oov(self, wordlists):
        for wordlist in wordlists:
            for word in wordlist:
                msg = self.__errorChecking(word)
                if msg:
                    return msg
        return None

    def get_neighbors(self, word, n_neighbors, nn_method):
        return self.vocabulary.getNearestNeighbors(word, n_neighbors, nn_method)

    def get_df(self, words_embedded, processed_word_list):
        df = pd.DataFrame(words_embedded)

        df['word'] = [wtp.word for wtp in processed_word_list]
        df['color'] = [wtp.color for wtp in processed_word_list]
        df['alpha'] = [wtp.alpha for wtp in processed_word_list]
        df['word_bias_space'] = [wtp.bias_space for wtp in processed_word_list]
        return df

    def get_plot(self,
                 data,
                 processed_word_list,
                 words_embedded,
                 color_dict,
                 n_neighbors,
                 n_alpha,
                 fontsize=18,
                 figsize=(20, 15)
                 ):
        fig, ax = plt.subplots(figsize=figsize)

        # User-provided words are drawn fully opaque...
        sns.scatterplot(
            data=data[data['alpha'] == 1],
            x=0,
            y=1,
            style='word_bias_space',
            hue='word_bias_space',
            ax=ax,
            palette=color_dict
        )

        # ...while their nearest neighbors, if requested, are drawn translucent.
        if n_neighbors > 0:
            sns.scatterplot(
                data=data[data['alpha'] != 1],
                x=0,
                y=1,
                style='color',
                hue='word_bias_space',
                ax=ax,
                alpha=n_alpha,
                legend=False,
                palette=color_dict
            )
        for i, wtp in enumerate(processed_word_list):
            x, y = words_embedded[i, :]
            ax.annotate(wtp.word, xy=(x, y), xytext=(5, 2), color=wtp.color,
                        textcoords='offset points',
                        ha='right', va='bottom', size=fontsize, alpha=wtp.alpha)

        ax.set_xticks([])
        ax.set_yticks([])
        ax.set_xlabel('')
        ax.set_ylabel('')
        fig.tight_layout()

        return fig

    def plot_projections_2d(self,
                            wordlist_0,
                            wordlist_1=[],
                            wordlist_2=[],
                            wordlist_3=[],
                            wordlist_4=[],
                            **kwargs
                            ):
        # Convert the word lists to 2D points (PCA projections) for plotting.
        choices = [0, 1, 2, 3, 4]
        wordlist_choice = [
            wordlist_0,
            wordlist_1,
            wordlist_2,
            wordlist_3,
            wordlist_4
        ]

        err = self.check_oov(wordlist_choice)
        if err:
            raise Exception(err)

        color_dict = {
            0: kwargs.get('color_wordlist_0', '#000000'),
            1: kwargs.get('color_wordlist_1', '#1f78b4'),
            2: kwargs.get('color_wordlist_2', '#33a02c'),
            3: kwargs.get('color_wordlist_3', '#e31a1c'),
            4: kwargs.get('color_wordlist_4', '#6a3d9a')
        }

        n_neighbors = kwargs.get('n_neighbors', 0)
        n_alpha = kwargs.get('n_alpha', 0.3)

        processed_word_list = []
        for word_list_to_process, color in zip(wordlist_choice, choices):
            for word in word_list_to_process:
                processed_word_list.append(WordToPlot(word, color_dict[color], color, 1))

                if n_neighbors > 0:
                    neighbors = self.get_neighbors(word,
                                                   n_neighbors=n_neighbors + 1,
                                                   nn_method=kwargs.get('nn_method', 'sklearn')
                                                   )
                    for n in neighbors:
                        if n not in [wtp.word for wtp in processed_word_list]:
                            processed_word_list.append(WordToPlot(n, color_dict[color], color, n_alpha))

        if not processed_word_list:
            raise Exception('Only empty lists were passed')

        words_embedded = np.array([self.vocabulary.getPCA(wtp.word) for wtp in processed_word_list])

        data = self.get_df(words_embedded, processed_word_list)

        fig = self.get_plot(data, processed_word_list, words_embedded,
                            color_dict, n_neighbors, n_alpha,
                            kwargs.get('fontsize', 18),
                            kwargs.get('figsize', (20, 15))
                            )
        plt.show()
        return fig

    def doesnt_match(self, wordlist):
        err = self.check_oov([wordlist])
        if err:
            raise Exception(err)

        words_emb = np.array([self.vocabulary.getEmbedding(word) for word in wordlist])
        mean_vec = np.mean(words_emb, axis=0)

        # The odd one out is the word whose embedding has the lowest cosine
        # similarity to the mean of all the embeddings.
        doesnt_match = ""
        farthest_emb = 1.0
        for word in wordlist:
            word_emb = self.vocabulary.getEmbedding(word)
            cos_sim = np.dot(mean_vec, word_emb) / (norm(mean_vec) * norm(word_emb))
            if cos_sim <= farthest_emb:
                farthest_emb = cos_sim
                doesnt_match = word

        return doesnt_match
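A minimal usage sketch for `WordExplorer`, assuming a vocabulary/embedding object exposing the `getPCA`, `getEmbedding` and `getNearestNeighbors` methods used above (the example words are illustrative):

```python
# Hypothetical sketch; `embedding` is assumed to be loaded elsewhere.
from modules.module_WordExplorer import WordExplorer

explorer = WordExplorer(embedding)
fig = explorer.plot_projections_2d(
    ['rey', 'reina'],        # wordlist_0, drawn in black by default
    ['doctor', 'doctora'],   # wordlist_1
    n_neighbors=2,           # also plot 2 translucent neighbors per word
    n_alpha=0.3
)
# Odd-one-out by cosine distance to the mean vector; likely 'mesa' here.
print(explorer.doesnt_match(['perro', 'gato', 'mesa']))
```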
modules/module_ann.py
ADDED
@@ -0,0 +1,62 @@
import time
import operator
from tqdm import tqdm
from annoy import AnnoyIndex
from memory_profiler import profile


class TicToc:
    def __init__(self):
        self.i = None

    def start(self):
        self.i = time.time()

    def stop(self):
        f = time.time()
        print(f - self.i, "sec.")


class Ann:
    def __init__(self, words, vectors, coord):
        self.words = words.to_list()
        self.vectors = vectors.to_list()
        self.coord = coord.to_list()
        self.tree = None

        self.tt = TicToc()

    @profile
    def init(self, n_trees=10, metric='angular', n_jobs=-1):
        # metric options: "angular", "euclidean", "manhattan", "hamming", or "dot"
        # n_jobs=-1 builds the trees on all available CPUs

        print("Init tree...")
        self.tt.start()
        self.tree = AnnoyIndex(len(self.vectors[0]), metric=metric)
        for i, v in tqdm(enumerate(self.vectors), total=len(self.vectors)):
            self.tree.add_item(i, v)
        self.tt.stop()

        print("Build tree...")
        self.tt.start()
        self.tree.build(n_trees=n_trees, n_jobs=n_jobs)
        self.tt.stop()

    def __getWordId(self, word):
        word_id = None
        try:
            word_id = self.words.index(word)
        except ValueError:
            pass
        return word_id

    def get(self, word, n_neighbors=10):
        word_id = self.__getWordId(word)
        word_xy_list = None

        if word_id is not None:
            neighbor_ids = self.tree.get_nns_by_item(word_id, n_neighbors)
            word_xy_list = operator.itemgetter(*neighbor_ids)(self.words)
        else:
            print(f"The word '{word}' does not exist")

        return word_xy_list
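A minimal sketch of how `Ann` would be fed, assuming a pandas DataFrame with `word`, `embedding` and `pca` columns, as the `to_list()` calls in the constructor suggest (the column names are assumptions, not fixed by this commit):

```python
# Hypothetical sketch; `df` holds one row per vocabulary word.
from modules.module_ann import Ann

ann = Ann(words=df['word'], vectors=df['embedding'], coord=df['pca'])
ann.init(n_trees=20, metric='angular', n_jobs=-1)  # build once, query many times
print(ann.get('mujer', n_neighbors=10))            # nearest words, or None if OOV
```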
modules/module_connection.py
ADDED
@@ -0,0 +1,143 @@
from abc import ABC

from modules.module_WordExplorer import WordExplorer
from modules.module_BiasExplorer import WordBiasExplorer


class Connector(ABC):
    def parse_word(self, word: str):
        return word.lower().strip()

    def parse_words(self, array_in_string: str):
        words = array_in_string.strip()
        if not words:
            return []
        return [self.parse_word(word) for word in words.split(',') if word.strip() != '']

    def process_error(self, err: str):
        if not err:
            return None
        return "<center><h3>" + err + "</h3></center>"


class WordExplorerConnector(Connector):

    def __init__(self, **kwargs):
        if 'embedding' in kwargs:
            embedding = kwargs.get('embedding')
        else:
            raise KeyError('embedding')
        self.word_explorer = WordExplorer(embedding)

    def plot_proyection_2d(self,
                           wordlist_0,
                           wordlist_1,
                           wordlist_2,
                           wordlist_3,
                           wordlist_4,
                           color_wordlist_0,
                           color_wordlist_1,
                           color_wordlist_2,
                           color_wordlist_3,
                           color_wordlist_4,
                           n_alpha,
                           fontsize,
                           n_neighbors
                           ):
        err = ""
        neighbors_method = 'sklearn'
        wordlist_0 = self.parse_words(wordlist_0)
        wordlist_1 = self.parse_words(wordlist_1)
        wordlist_2 = self.parse_words(wordlist_2)
        wordlist_3 = self.parse_words(wordlist_3)
        wordlist_4 = self.parse_words(wordlist_4)

        if not (wordlist_0 or wordlist_1 or wordlist_2 or wordlist_3 or wordlist_4):
            err = self.process_error("Ingresa al menos 1 palabra para continuar")
            return None, err

        err = self.word_explorer.check_oov([wordlist_0, wordlist_1, wordlist_2, wordlist_3, wordlist_4])
        if err:
            return None, self.process_error(err)

        fig = self.word_explorer.plot_projections_2d(wordlist_0,
                                                     wordlist_1,
                                                     wordlist_2,
                                                     wordlist_3,
                                                     wordlist_4,
                                                     color_wordlist_0=color_wordlist_0,
                                                     color_wordlist_1=color_wordlist_1,
                                                     color_wordlist_2=color_wordlist_2,
                                                     color_wordlist_3=color_wordlist_3,
                                                     color_wordlist_4=color_wordlist_4,
                                                     n_alpha=n_alpha,
                                                     fontsize=fontsize,
                                                     n_neighbors=n_neighbors,
                                                     nn_method=neighbors_method
                                                     )
        return fig, self.process_error(err)


class BiasWordExplorerConnector(Connector):

    def __init__(self, **kwargs):
        if 'embedding' in kwargs:
            embedding = kwargs.get('embedding')
        else:
            raise KeyError('embedding')
        self.bias_word_explorer = WordBiasExplorer(embedding)

    def calculate_bias_2d(self,
                          wordlist_1,
                          wordlist_2,
                          to_diagnose_list
                          ):
        err = ""
        wordlist_1 = self.parse_words(wordlist_1)
        wordlist_2 = self.parse_words(wordlist_2)
        to_diagnose_list = self.parse_words(to_diagnose_list)

        word_lists = [wordlist_1, wordlist_2, to_diagnose_list]
        for wordlist in word_lists:
            if not wordlist:
                err = "Debe ingresar al menos 1 palabra en las listas de palabras a diagnosticar, sesgo 1 y sesgo 2"
        if err:
            return None, self.process_error(err)

        err = self.bias_word_explorer.check_oov(word_lists)
        if err:
            return None, self.process_error(err)

        fig = self.bias_word_explorer.plot_biased_words(to_diagnose_list, wordlist_2, wordlist_1)

        return fig, self.process_error(err)

    def calculate_bias_4d(self,
                          wordlist_1,
                          wordlist_2,
                          wordlist_3,
                          wordlist_4,
                          to_diagnose_list
                          ):
        err = ""
        wordlist_1 = self.parse_words(wordlist_1)
        wordlist_2 = self.parse_words(wordlist_2)
        wordlist_3 = self.parse_words(wordlist_3)
        wordlist_4 = self.parse_words(wordlist_4)
        to_diagnose_list = self.parse_words(to_diagnose_list)

        wordlists = [wordlist_1, wordlist_2, wordlist_3, wordlist_4, to_diagnose_list]
        for wordlist in wordlists:
            if not wordlist:
                err = "¡Para graficar con 4 espacios, debe ingresar al menos 1 palabra en todas las listas!"
        if err:
            return None, self.process_error(err)

        err = self.bias_word_explorer.check_oov(wordlists)
        if err:
            return None, self.process_error(err)

        fig = self.bias_word_explorer.plot_biased_words(to_diagnose_list, wordlist_1, wordlist_2, wordlist_3, wordlist_4)
        return fig, self.process_error(err)
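These connectors are the glue between the raw comma-separated text fields in the Gradio UI and the explorer classes. A rough wiring sketch under that assumption (the actual layout lives in the interfaces/*.py files added in this commit; `embedding` is assumed to be already loaded):

```python
# Illustrative only; see interfaces/interface_WordExplorer.py for the real UI.
from modules.module_connection import WordExplorerConnector

connector = WordExplorerConnector(embedding=embedding)
# Returns (figure_or_None, html_error_or_None), matching a Plot + HTML output pair.
fig, err = connector.plot_proyection_2d(
    "rey, reina", "", "", "", "",
    "#000000", "#1f78b4", "#33a02c", "#e31a1c", "#6a3d9a",
    0.3, 18, 0
)
```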
modules/module_logsManager.py
ADDED
@@ -0,0 +1,174 @@
import csv, os, pytz
from gradio import utils
from datetime import datetime
from dotenv import load_dotenv
from typing import Any, List, Optional
from gradio.components import IOComponent
from gradio.flagging import FlaggingCallback, _get_dataset_features_info


# --- Load environment vars ---
load_dotenv()


# --- Classes declaration ---
class DateLogs:
    def __init__(self, zone="America/Argentina/Cordoba"):
        self.time_zone = pytz.timezone(zone)

    def full(self):
        now = datetime.now(self.time_zone)
        return now.strftime("%H:%M:%S %d-%m-%Y")

    def day(self):
        now = datetime.now(self.time_zone)
        return now.strftime("%d-%m-%Y")


class HuggingFaceDatasetSaver(FlaggingCallback):
    """
    A callback that saves each flagged sample (both the input and output data)
    to a HuggingFace dataset.
    Example:
        import gradio as gr
        hf_writer = gr.HuggingFaceDatasetSaver(HF_API_TOKEN, "image-classification-mistakes")
        def image_classifier(inp):
            return {'cat': 0.3, 'dog': 0.7}
        demo = gr.Interface(fn=image_classifier, inputs="image", outputs="label",
                            allow_flagging="manual", flagging_callback=hf_writer)
    Guides: using_flagging
    """

    def __init__(
        self,
        hf_token: str = os.getenv('HF_TOKEN'),
        dataset_name: str = os.getenv('DS_LOGS_NAME'),
        organization: Optional[str] = os.getenv('ORG_NAME'),
        private: bool = True,
        available_logs: bool = False
    ):
        """
        Parameters:
            hf_token: The HuggingFace token to use to create (and write the flagged sample to) the HuggingFace dataset.
            dataset_name: The name of the dataset to save the data to, e.g. "image-classifier-1"
            organization: The organization to save the dataset under. The hf_token must provide write access to this organization. If not provided, saved under the name of the user corresponding to the hf_token.
            private: Whether the dataset should be private (defaults to True).
            available_logs: Whether to actually push logs to the Hub; when False, flag() only prints and returns 0.
        """
        self.hf_token = hf_token
        self.dataset_name = dataset_name
        self.organization_name = organization
        self.dataset_private = private
        self.datetime = DateLogs()
        self.available_logs = available_logs

        if not available_logs:
            print("Push: logs DISABLED!...")

    def setup(
        self,
        components: List[IOComponent],
        flagging_dir: str
    ):
        """
        Params:
            flagging_dir (str): local directory where the dataset is cloned,
            updated, and pushed from.
        """
        if self.available_logs:

            try:
                import huggingface_hub
            except (ImportError, ModuleNotFoundError):
                raise ImportError(
                    "Package `huggingface_hub` not found is needed "
                    "for HuggingFaceDatasetSaver. Try 'pip install huggingface_hub'."
                )

            path_to_dataset_repo = huggingface_hub.create_repo(
                repo_id=os.path.join(self.organization_name, self.dataset_name),
                token=self.hf_token,
                private=self.dataset_private,
                repo_type="dataset",
                exist_ok=True,
            )

            self.path_to_dataset_repo = path_to_dataset_repo
            self.components = components
            self.flagging_dir = flagging_dir
            self.dataset_dir = self.dataset_name

            self.repo = huggingface_hub.Repository(
                local_dir=self.dataset_dir,
                clone_from=path_to_dataset_repo,
                use_auth_token=self.hf_token,
            )

            self.repo.git_pull(lfs=True)

            # Should filename be user-specified?
            # log_file_name = self.datetime.day()+"_"+self.flagging_dir+".csv"
            self.log_file = os.path.join(self.dataset_dir, self.flagging_dir + ".csv")

    def flag(
        self,
        flag_data: List[Any],
        flag_option: Optional[str] = None,
        flag_index: Optional[int] = None,
        username: Optional[str] = None,
    ) -> int:

        if self.available_logs:
            self.repo.git_pull(lfs=True)

            is_new = not os.path.exists(self.log_file)

            with open(self.log_file, "a", newline="", encoding="utf-8") as csvfile:
                writer = csv.writer(csvfile)

                # File previews for certain input and output types
                infos, file_preview_types, headers = _get_dataset_features_info(
                    is_new, self.components
                )

                # Generate the headers and dataset_infos
                if is_new:
                    headers = [
                        component.label or f"component {idx}"
                        for idx, component in enumerate(self.components)
                    ] + [
                        "flag",
                        "username",
                        "timestamp",
                    ]
                    writer.writerow(utils.sanitize_list_for_csv(headers))

                # Generate the row corresponding to the flagged sample
                csv_data = []
                for component, sample in zip(self.components, flag_data):
                    save_dir = os.path.join(
                        self.dataset_dir,
                        utils.strip_invalid_filename_characters(component.label),
                    )
                    filepath = component.deserialize(sample, save_dir, None)
                    csv_data.append(filepath)
                    if isinstance(component, tuple(file_preview_types)):
                        csv_data.append(
                            "{}/resolve/main/{}".format(self.path_to_dataset_repo, filepath)
                        )

                csv_data.append(flag_option if flag_option is not None else "")
                csv_data.append(username if username is not None else "")
                csv_data.append(self.datetime.full())
                writer.writerow(utils.sanitize_list_for_csv(csv_data))

            with open(self.log_file, "r", encoding="utf-8") as csvfile:
                line_count = len([None for row in csv.reader(csvfile)]) - 1

            self.repo.push_to_hub(commit_message="Flagged sample #{}".format(line_count))

        else:
            line_count = 0
            print("Logs: Virtual push...")

        return line_count
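The saver reads `HF_TOKEN`, `DS_LOGS_NAME` and `ORG_NAME` from the environment (or a local `.env` file loaded by `python-dotenv`), so enabling it is just a matter of constructing it with logging turned on; a minimal sketch, assuming those variables are set:

```python
# Minimal sketch; HF_TOKEN, DS_LOGS_NAME and ORG_NAME must be set in the
# environment or in a .env file (placeholders, never commit real tokens).
from modules.module_logsManager import HuggingFaceDatasetSaver

hf_writer = HuggingFaceDatasetSaver(available_logs=True)  # env vars picked up by default
# Then pass it to gr.Interface(..., allow_flagging="manual", flagging_callback=hf_writer)
```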
requirements.txt
ADDED
@@ -0,0 +1,10 @@
scikit-learn
gensim==3.7.3
transformers
matplotlib
numpy
seaborn
python-dotenv
memory_profiler
annoy
tool_info.py
ADDED
@@ -0,0 +1,23 @@
TOOL_INFO = """
> ### A tool to overcome technical barriers for bias assessment in human language technologies

* [Read Full Paper](https://arxiv.org/abs/2207.06591)

> ### Licensing Information
* [MIT License](https://huggingface.co/spaces/vialibre/vialibre/bias_we_std_tool/resolve/main/LICENSE)

> ### Citation Information
```bibtex
@misc{https://doi.org/10.48550/arxiv.2207.06591,
  doi = {10.48550/ARXIV.2207.06591},
  url = {https://arxiv.org/abs/2207.06591},
  author = {Alemany, Laura Alonso and Benotti, Luciana and González, Lucía and Maina, Hernán and Busaniche, Beatriz and Halvorsen, Alexia and Bordone, Matías and Sánchez, Jorge},
  keywords = {Computation and Language (cs.CL), Artificial Intelligence (cs.AI), FOS: Computer and information sciences},
  title = {A tool to overcome technical barriers for bias assessment in human language technologies},
  publisher = {arXiv},
  year = {2022},
  copyright = {Creative Commons Attribution Non Commercial Share Alike 4.0 International}
}
```
"""