Spaces:

poltextlab
/

babel_machine

Running

App Files Files Community

kovacsvi committed 6 days ago

Commit

4bba8df

1 Parent(s): 6d39e54

up-to-date prod demo

Browse files

Files changed (15) hide show

README.md +2 -2
app.py +36 -25
interfaces/cap.py +27 -41
interfaces/cap_minor.py +83 -0
interfaces/emotion.py +20 -6
interfaces/emotion9.py +65 -0
interfaces/illframes.py +116 -0
interfaces/manifesto.py +27 -47
interfaces/ner.py +7 -2
interfaces/ontolisst.py +96 -0
interfaces/sentiment.py +31 -8
interfaces/utils.py +12 -0
label_dicts.py +520 -1
requirements.txt +2 -1
utils.py +47 -11

README.md CHANGED Viewed

@@ -1,6 +1,6 @@
 ---
 title: Babel Machine Demo Dev
-emoji: 💻
 colorFrom: pink
 colorTo: indigo
 sdk: gradio
@@ -10,4 +10,4 @@ pinned: false
 short_description: CAP, Manifesto, sentiment, emotion classification
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
 title: Babel Machine Demo Dev
+emoji: 📊
 colorFrom: pink
 colorTo: indigo
 sdk: gradio
 short_description: CAP, Manifesto, sentiment, emotion classification
 ---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py CHANGED Viewed

@@ -1,47 +1,58 @@
 import os
-PATH = '/data/'
-os.environ['TRANSFORMERS_CACHE'] = PATH
-os.environ['HF_HOME'] = PATH
-os.environ['HF_DATASETS_CACHE'] = PATH
-os.environ['TORCH_HOME'] = PATH
 import gradio as gr
-from spacy import glossary
 from interfaces.cap import demo as cap_demo
 from interfaces.manifesto import demo as manifesto_demo
 from interfaces.sentiment import demo as sentiment_demo
 from interfaces.emotion import demo as emotion_demo
 from interfaces.ner import demo as ner_demo
 from interfaces.ner import download_models as download_spacy_models
-from utils import download_hf_models
-entities = ["CARDINAL", "DATE", "EVENT", "FAC", "GPE", "LANGUAGE", "LAW", "LOC", "MONEY", "NORP", "ORDINAL", "ORG", "PERCENT", "PERSON", "PRODUCT", "QUANTITY", "TIME", "WORK_OF_ART"]
-ent_dict = glossary.GLOSSARY
-ent_sum = [f'{ent} = {ent_dict[ent]}' for ent in entities ]
-with gr.Blocks() as demo:
     gr.Markdown(
         f"""
-        <div style="display: block; text-align: left; padding:0; margin:0;">
-            <h1 style="text-align: center">Babel Machine Demo</h1>
-            <p>This is a demo for text classification using language models finetuned on data labeled by <a href="https://www.comparativeagendas.net/">CAP</a>, <a href="https://manifesto-project.wzb.eu/">Manifesto Project</a>, sentiment, and emotion coding systems.<br>
-            For the coding of complete datasets, please visit the official <a href="https://babel.poltextlab.com/">Babel Machine</a> site.</p>
-            <p> For named entity recognition the following labels are used: </p>
-            <ul>
-            <li> {'</li> <li>'.join(ent_sum)} </li>
-            </ul>
         </div>
         """)
     gr.TabbedInterface(
-        interface_list=[cap_demo, manifesto_demo, sentiment_demo, emotion_demo, ner_demo],
-        tab_names=["CAP", "Manifesto", "Sentiment (3)", "Emotions (8)", "Named Entity Recognition"],
     )
 if __name__ == "__main__":
-    download_hf_models()
-    download_spacy_models()
     demo.launch()
-# TODO: add all languages & domains

 import os
 import gradio as gr
 from interfaces.cap import demo as cap_demo
+from interfaces.cap_minor import demo as cap_minor_demo
 from interfaces.manifesto import demo as manifesto_demo
 from interfaces.sentiment import demo as sentiment_demo
 from interfaces.emotion import demo as emotion_demo
 from interfaces.ner import demo as ner_demo
 from interfaces.ner import download_models as download_spacy_models
+from interfaces.illframes import demo as illframes_demo
+from interfaces.ontolisst import demo as ontolisst_demo
+from interfaces.emotion9 import demo as e9_demo
+from utils import download_hf_models, df_h, set_hf_cache_dir
+css = """
+/* Make only the active tab bold */
+.svelte-1uw5tnk[aria-selected="true"] {
+    font-weight: bold;
+    background: linear-gradient(to bottom right, var(--primary-100), var(--primary-300));
+    color: var(--primary-600)
+}
+"""
+with gr.Blocks(css=css) as demo:
     gr.Markdown(
         f"""
+        <style>
+        @import 'https://fonts.googleapis.com/css?family=Source+Sans+Pro:300,400';
+        </style>
+        <div style="display: block; text-align: left; padding:0; margin:0;font-family: "Source Sans Pro", Helvetica, sans-serif;">
+            <h1 style="text-align: center;font-size: 17pt;">Babel Machine Demo</h1>
+            <p style="font-size: 14pt;">This is a demo for text classification using language models finetuned on data labeled by <a href="https://www.comparativeagendas.net/">CAP</a>, <a href="https://manifesto-project.wzb.eu/">Manifesto Project</a>, sentiment, emotion coding and Named Entity Recognition systems.
+            For the coding of complete datasets, please visit the official <a href="https://babel.poltextlab.com/">Babel Machine</a> site.<br>
+            Please note that the sentiment (3) and emotions (6) models have been trained using parliamentary speech data, so the results for generic sentences may not be reliable. The emotions (9) models have been trained using <a href="https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/C9SAIX">this dataset</a>. It contains labeled parliamentary speeches and social media data. Under-represented categories were enriched with synthetic data.<br>
+            <br>
+            The models listed for Manifesto, Sentiment (3) and Emotions (6) tasks are a beta version and thus not publicly available,
+            the Hugging Face link will not work for them for the time being. We expect a public version after tests and improvements in the Fall.
+            Please feel free to check back for model updates, or reach out to us at that point if you wish to ask about a specific model.
+            </p>
         </div>
         """)
     gr.TabbedInterface(
+        interface_list=[cap_demo, cap_minor_demo, manifesto_demo, sentiment_demo, emotion_demo, e9_demo,illframes_demo, ner_demo,  ontolisst_demo],
+        tab_names=["CAP", "CAP Minor Codes", "Manifesto", "Sentiment (3)", "Emotions (6)","Emotions (9)", "ILLFRAMES", "Named Entity Recognition",  "ONTOLISST"]
     )
 if __name__ == "__main__":
+    set_hf_cache_dir("/data")
+    download_spacy_models()
+    download_hf_models() # does this affect the build?
+    df_h() # debug -> check disk space before launching demo
     demo.launch()
+# TODO: add all languages & domains

interfaces/cap.py CHANGED Viewed

@@ -7,26 +7,16 @@ import pandas as pd
 from transformers import AutoModelForSequenceClassification
 from transformers import AutoTokenizer
 from huggingface_hub import HfApi
-from huggingface_hub.utils._errors import RepositoryNotFoundError
 from label_dicts import CAP_NUM_DICT, CAP_LABEL_NAMES
 HF_TOKEN = os.environ["hf_read"]
 languages = [
-    "Danish",
-    "Dutch",
     "English",
-    "French",
-    "German",
-    "Hungarian",
-    "Italian",
-    "Polish",
-    "Portuguese",
-    "Spanish",
-    "Czech",
-    "Slovak",
-    "Norwegian"
 ]
 domains = {
@@ -48,12 +38,19 @@ def check_huggingface_path(checkpoint_path: str):
         hf_api = HfApi(token=HF_TOKEN)
         hf_api.model_info(checkpoint_path, token=HF_TOKEN)
         return True
-    except RepositoryNotFoundError:
         return False
 def build_huggingface_path(language: str, domain: str):
     base_path = "xlm-roberta-large"
-    lang_domain_path = f"poltextlab/{base_path}-{language}-{domain}-cap-v3"
     lang_path = f"poltextlab/{base_path}-{language}-cap-v3"
     path_map = {
@@ -75,48 +72,31 @@ def build_huggingface_path(language: str, domain: str):
     except (AttributeError, FileNotFoundError):
         value = None
-    if value and value in path_map:
-        model_path = path_map[value]
-        if check_huggingface_path(model_path):
-            # if the model is available on Huggingface, return the path
-            return model_path
-        else:
-            # if the model is not available on Huggingface, look for other models
-            filtered_path_map = {k: v for k, v in path_map.items() if k != value}
-            for k, v in filtered_path_map.items():
-                if check_huggingface_path(v):
-                    return v
-    elif check_huggingface_path(lang_domain_path):
-        return lang_domain_path
-    elif check_huggingface_path(lang_path):
-        return lang_path
     else:
         return "poltextlab/xlm-roberta-large-pooled-cap"
 def predict(text, model_id, tokenizer_id):
     device = torch.device("cpu")
-    gr.Info("Loading model")
-    model = AutoModelForSequenceClassification.from_pretrained(model_id, low_cpu_mem_usage=True, device_map="auto", token=HF_TOKEN)
-    gr.Info("Loading tokenizer")
     tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
-    #gr.Info("Mapping model to device")
-    #model.to(device)
-    gr.Info("Tokenizing")
     inputs = tokenizer(text,
                        max_length=256,
                        truncation=True,
                        padding="do_not_pad",
                        return_tensors="pt").to(device)
-    gr.Info("model.eval()")
     model.eval()
-    gr.Info("Prediction")
     with torch.no_grad():
         logits = model(**inputs).logits
-    gr.Info("Softmax")
     probs = torch.nn.functional.softmax(logits, dim=1).cpu().numpy().flatten()
     output_pred = {f"[{CAP_NUM_DICT[i]}] {CAP_LABEL_NAMES[CAP_NUM_DICT[i]]}": probs[i] for i in np.argsort(probs)[::-1]}
     output_info = f'<p style="text-align: center; display: block">Prediction was made using the <a href="https://huggingface.co/{model_id}">{model_id}</a> model.</p>'
@@ -126,11 +106,17 @@ def predict_cap(text, language, domain):
     domain = domains[domain]
     model_id = build_huggingface_path(language, domain)
     tokenizer_id = "xlm-roberta-large"
     return predict(text, model_id, tokenizer_id)
 demo = gr.Interface(
     fn=predict_cap,
     inputs=[gr.Textbox(lines=6, label="Input"),
             gr.Dropdown(languages, label="Language"),
             gr.Dropdown(domains.keys(), label="Domain")],
-    outputs=[gr.Label(num_top_classes=5, label="Output"), gr.Markdown()])

 from transformers import AutoModelForSequenceClassification
 from transformers import AutoTokenizer
 from huggingface_hub import HfApi
 from label_dicts import CAP_NUM_DICT, CAP_LABEL_NAMES
+from .utils import is_disk_full
 HF_TOKEN = os.environ["hf_read"]
 languages = [
     "English",
+    "Multilingual"
 ]
 domains = {
         hf_api = HfApi(token=HF_TOKEN)
         hf_api.model_info(checkpoint_path, token=HF_TOKEN)
         return True
+    except:
         return False
 def build_huggingface_path(language: str, domain: str):
+    language = language.lower()
     base_path = "xlm-roberta-large"
+    if language == "english" and (domain == "media" or domain == "legislative"):
+        lang_domain_path = f"poltextlab/{base_path}-{language}-{domain}-cap-v4"
+        return lang_domain_path
+    else:
+        lang_domain_path = f"poltextlab/{base_path}-{language}-{domain}-cap-v3"
     lang_path = f"poltextlab/{base_path}-{language}-cap-v3"
     path_map = {
     except (AttributeError, FileNotFoundError):
         value = None
+    if language == 'english':
+        model_path = lang_path
+    else:
+        model_path = "poltextlab/xlm-roberta-large-pooled-cap"
+    if check_huggingface_path(model_path):
+        return model_path
     else:
         return "poltextlab/xlm-roberta-large-pooled-cap"
 def predict(text, model_id, tokenizer_id):
     device = torch.device("cpu")
+    model = AutoModelForSequenceClassification.from_pretrained(model_id, low_cpu_mem_usage=True, device_map="auto", offload_folder="offload", token=HF_TOKEN)
     tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
     inputs = tokenizer(text,
                        max_length=256,
                        truncation=True,
                        padding="do_not_pad",
                        return_tensors="pt").to(device)
     model.eval()
     with torch.no_grad():
         logits = model(**inputs).logits
     probs = torch.nn.functional.softmax(logits, dim=1).cpu().numpy().flatten()
     output_pred = {f"[{CAP_NUM_DICT[i]}] {CAP_LABEL_NAMES[CAP_NUM_DICT[i]]}": probs[i] for i in np.argsort(probs)[::-1]}
     output_info = f'<p style="text-align: center; display: block">Prediction was made using the <a href="https://huggingface.co/{model_id}">{model_id}</a> model.</p>'
     domain = domains[domain]
     model_id = build_huggingface_path(language, domain)
     tokenizer_id = "xlm-roberta-large"
+    if is_disk_full():
+        os.system('rm -rf /data/models*')
+        os.system('rm -r ~/.cache/huggingface/hub')
     return predict(text, model_id, tokenizer_id)
 demo = gr.Interface(
+    title="CAP Babel Demo",
     fn=predict_cap,
     inputs=[gr.Textbox(lines=6, label="Input"),
             gr.Dropdown(languages, label="Language"),
             gr.Dropdown(domains.keys(), label="Domain")],
+    outputs=[gr.Label(num_top_classes=5, label="Output"), gr.Markdown()])

interfaces/cap_minor.py ADDED Viewed

	@@ -0,0 +1,83 @@

+import gradio as gr
+import os
+import torch
+import numpy as np
+import pandas as pd
+from transformers import AutoModelForSequenceClassification
+from transformers import AutoTokenizer
+from huggingface_hub import HfApi
+from label_dicts import CAP_MIN_NUM_DICT, CAP_MIN_LABEL_NAMES
+from .utils import is_disk_full
+HF_TOKEN = os.environ["hf_read"]
+languages = [
+    "Multilingual",
+]
+domains = {
+    "media": "media",
+    "social media": "social",
+    "parliamentary speech": "parlspeech",
+    "legislative documents": "legislative",
+    "executive speech": "execspeech",
+    "executive order": "execorder",
+    "party programs": "party",
+    "judiciary": "judiciary",
+    "budget": "budget",
+    "public opinion": "publicopinion",
+    "local government agenda": "localgovernment"
+}
+def check_huggingface_path(checkpoint_path: str):
+    try:
+        hf_api = HfApi(token=HF_TOKEN)
+        hf_api.model_info(checkpoint_path, token=HF_TOKEN)
+        return True
+    except:
+        return False
+def build_huggingface_path(language: str, domain: str):
+    return "poltextlab/xlm-roberta-large-pooled-cap-minor"
+def predict(text, model_id, tokenizer_id):
+    device = torch.device("cpu")
+    model = AutoModelForSequenceClassification.from_pretrained(model_id, low_cpu_mem_usage=True, device_map="auto", offload_folder="offload", token=HF_TOKEN)
+    tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
+    inputs = tokenizer(text,
+                       max_length=256,
+                       truncation=True,
+                       padding="do_not_pad",
+                       return_tensors="pt").to(device)
+    model.eval()
+    with torch.no_grad():
+        logits = model(**inputs).logits
+    probs = torch.nn.functional.softmax(logits, dim=1).cpu().numpy().flatten()
+    output_pred = {f"[{CAP_MIN_NUM_DICT[i]}] {CAP_MIN_LABEL_NAMES[CAP_MIN_NUM_DICT[i]]}": probs[i] for i in np.argsort(probs)[::-1]}
+    output_info = f'<p style="text-align: center; display: block">Prediction was made using the <a href="https://huggingface.co/{model_id}">{model_id}</a> model.</p>'
+    return output_pred, output_info
+def predict_cap(text, language, domain):
+    domain = domains[domain]
+    model_id = build_huggingface_path(language, domain)
+    tokenizer_id = "xlm-roberta-large"
+    if is_disk_full():
+        os.system('rm -rf /data/models*')
+        os.system('rm -r ~/.cache/huggingface/hub')
+    return predict(text, model_id, tokenizer_id)
+demo = gr.Interface(
+    title="CAP Minor Topics Babel Demo",
+    fn=predict_cap,
+    inputs=[gr.Textbox(lines=6, label="Input"),
+            gr.Dropdown(languages, label="Language"),
+            gr.Dropdown(domains.keys(), label="Domain")],
+    outputs=[gr.Label(num_top_classes=5, label="Output"), gr.Markdown()])

interfaces/emotion.py CHANGED Viewed

@@ -9,18 +9,25 @@ from huggingface_hub import HfApi
 from label_dicts import MANIFESTO_LABEL_NAMES
 HF_TOKEN = os.environ["hf_read"]
 languages = [
-    "Czech", "English", "French", "German", "Hungarian", "Italian"
 ]
 def build_huggingface_path(language: str):
-    return "poltextlab/xlm-roberta-large-pooled-emotions"
 def predict(text, model_id, tokenizer_id):
     device = torch.device("cpu")
-    model = AutoModelForSequenceClassification.from_pretrained(model_id, token=HF_TOKEN)
     tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
     model.to(device)
@@ -39,13 +46,20 @@ def predict(text, model_id, tokenizer_id):
     output_info = f'<p style="text-align: center; display: block">Prediction was made using the <a href="https://huggingface.co/{model_id}">{model_id}</a> model.</p>'
     return output_pred, output_info
-def predict_cap(text, language):
     model_id = build_huggingface_path(language)
     tokenizer_id = "xlm-roberta-large"
     return predict(text, model_id, tokenizer_id)
 demo = gr.Interface(
     fn=predict_cap,
     inputs=[gr.Textbox(lines=6, label="Input"),
-            gr.Dropdown(languages, label="Language")],
-    outputs=[gr.Label(num_top_classes=5, label="Output"), gr.Markdown()])

 from label_dicts import MANIFESTO_LABEL_NAMES
+from .utils import is_disk_full
 HF_TOKEN = os.environ["hf_read"]
 languages = [
+    "Czech", "English", "French", "German", "Hungarian", "Polish", "Slovak"
 ]
+domains = {
+    "parliamentary speech": "parlspeech",
+}
 def build_huggingface_path(language: str):
+    if language == "Czech" or language == "Slovak":
+        return "visegradmedia-emotion/Emotion_RoBERTa_pooled_V4"
+    return "poltextlab/xlm-roberta-large-pooled-MORES"
 def predict(text, model_id, tokenizer_id):
     device = torch.device("cpu")
+    model = AutoModelForSequenceClassification.from_pretrained(model_id, low_cpu_mem_usage=True, device_map="auto", offload_folder="offload", token=HF_TOKEN)
     tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
     model.to(device)
     output_info = f'<p style="text-align: center; display: block">Prediction was made using the <a href="https://huggingface.co/{model_id}">{model_id}</a> model.</p>'
     return output_pred, output_info
+def predict_cap(text, language, domain):
     model_id = build_huggingface_path(language)
     tokenizer_id = "xlm-roberta-large"
+    if is_disk_full():
+        os.system('rm -rf /data/models*')
+        os.system('rm -r ~/.cache/huggingface/hub')
     return predict(text, model_id, tokenizer_id)
 demo = gr.Interface(
+    title="Emotions (6) Babel Demo",
     fn=predict_cap,
     inputs=[gr.Textbox(lines=6, label="Input"),
+            gr.Dropdown(languages, label="Language"),
+            gr.Dropdown(domains.keys(), label="Domain")],
+    outputs=[gr.Label(num_top_classes=5, label="Output"), gr.Markdown()])

interfaces/emotion9.py ADDED Viewed

	@@ -0,0 +1,65 @@

+import gradio as gr
+import os
+import torch
+import numpy as np
+from transformers import AutoModelForSequenceClassification
+from transformers import AutoTokenizer
+from huggingface_hub import HfApi
+from label_dicts import EMOTION9_LABEL_NAMES
+from .utils import is_disk_full
+HF_TOKEN = os.environ["hf_read"]
+languages = [
+    "Czech", "English", "German", "Hungarian", "Polish", "Slovak"
+]
+domains = {
+    "parliamentary speech": "parlspeech",
+}
+def build_huggingface_path(language: str):
+    language = language.lower()
+    return f"poltextlab/xlm-roberta-large-pooled-{language}-emotions9"
+def predict(text, model_id, tokenizer_id):
+    device = torch.device("cpu")
+    model = AutoModelForSequenceClassification.from_pretrained(model_id, low_cpu_mem_usage=True, token=HF_TOKEN)
+    tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
+    inputs = tokenizer(text,
+                       max_length=512,
+                       truncation=True,
+                       padding="do_not_pad",
+                       return_tensors="pt").to(device)
+    model.eval()
+    with torch.no_grad():
+        logits = model(**inputs).logits
+    probs = torch.nn.functional.softmax(logits, dim=1).cpu().numpy().flatten()
+    NUMS_DICT = {i: key for i, key in enumerate(sorted(EMOTION9_LABEL_NAMES.keys()))}
+    output_pred = {f"[{NUMS_DICT[i]}] {EMOTION9_LABEL_NAMES[NUMS_DICT[i]]}": probs[i] for i in np.argsort(probs)[::-1]}
+    output_info = f'<p style="text-align: center; display: block">Prediction was made using the <a href="https://huggingface.co/{model_id}">{model_id}</a> model.</p>'
+    return output_pred, output_info
+def predict_e6(text, language, domain):
+    model_id = build_huggingface_path(language)
+    tokenizer_id = "xlm-roberta-large"
+    if is_disk_full():
+        os.system('rm -rf /data/models*')
+        os.system('rm -r ~/.cache/huggingface/hub')
+    return predict(text, model_id, tokenizer_id)
+demo = gr.Interface(
+    title="Emotions (9) Babel Demo",
+    fn=predict_e6,
+    inputs=[gr.Textbox(lines=6, label="Input"),
+            gr.Dropdown(languages, label="Language"),
+            gr.Dropdown(domains.keys(), label="Domain")],
+    outputs=[gr.Label(num_top_classes=5, label="Output"), gr.Markdown()])

interfaces/illframes.py ADDED Viewed

	@@ -0,0 +1,116 @@

+import gradio as gr
+import os
+import torch
+import numpy as np
+import pandas as pd
+from transformers import AutoModelForSequenceClassification
+from transformers import AutoTokenizer
+from huggingface_hub import HfApi
+from label_dicts import ILLFRAMES_MIGRATION_LABEL_NAMES, ILLFRAMES_COVID_LABEL_NAMES, ILLFRAMES_WAR_LABEL_NAMES
+from .utils import is_disk_full
+HF_TOKEN = os.environ["hf_read"]
+languages = [
+    "English"
+]
+domains = {
+    "Covid": "covid",
+    "Migration": "migration",
+    "War": "war"
+}
+# --- DEBUG ---
+import shutil
+def convert_size(size):
+    for unit in ['B', 'KB', 'MB', 'GB', 'TB', 'PB']:
+        if size < 1024:
+            return f"{size:.2f} {unit}"
+        size /= 1024
+def get_disk_space(path="/"):
+    total, used, free = shutil.disk_usage(path)
+    return {
+        "Total": convert_size(total),
+        "Used": convert_size(used),
+        "Free": convert_size(free)
+    }
+# ---
+def check_huggingface_path(checkpoint_path: str):
+    try:
+        hf_api = HfApi(token=HF_TOKEN)
+        hf_api.model_info(checkpoint_path, token=HF_TOKEN)
+        return True
+    except:
+        return False
+def build_huggingface_path(domain: str):
+    return f"poltextlab/xlm-roberta-large-english-ILLFRAMES-{domain}"
+def predict(text, model_id, tokenizer_id, label_names):
+    device = torch.device("cpu")
+    try:
+        model = AutoModelForSequenceClassification.from_pretrained(model_id, low_cpu_mem_usage=True, offload_folder="offload", device_map="auto", token=HF_TOKEN)
+    except:
+        disk_space = get_disk_space('/data/')
+        print("Disk Space Error:")
+        for key, value in disk_space.items():
+            print(f"{key}: {value}")
+        shutil.rmtree("/data")
+        model = AutoModelForSequenceClassification.from_pretrained(model_id, low_cpu_mem_usage=True, device_map="auto", token=HF_TOKEN, force_download=True)
+    tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
+    inputs = tokenizer(text,
+                       max_length=256,
+                       truncation=True,
+                       padding="do_not_pad",
+                       return_tensors="pt").to(device)
+    model.eval()
+    with torch.no_grad():
+        logits = model(**inputs).logits
+    probs = torch.nn.functional.softmax(logits, dim=1).cpu().numpy().flatten()
+    NUMS_DICT = {i: key for i, key in enumerate(sorted(label_names.keys()))}
+    output_pred = {f"[{NUMS_DICT[i]}] {label_names[NUMS_DICT[i]]}": probs[i] for i in np.argsort(probs)[::-1]}
+    output_info = f'<p style="text-align: center; display: block">Prediction was made using the <a href="https://huggingface.co/{model_id}">{model_id}</a> model.</p>'
+    return output_pred, output_info
+def predict_illframes(text, language, domain):
+    domain = domains[domain]
+    model_id = build_huggingface_path(domain)
+    tokenizer_id = "xlm-roberta-large"
+    if domain == "migration":
+        label_names = ILLFRAMES_MIGRATION_LABEL_NAMES
+    elif domain == "covid":
+        label_names = ILLFRAMES_COVID_LABEL_NAMES
+    elif domain == "war":
+        label_names = ILLFRAMES_WAR_LABEL_NAMES
+    if is_disk_full():
+        os.system('rm -rf /data/models*')
+        os.system('rm -r ~/.cache/huggingface/hub')
+    return predict(text, model_id, tokenizer_id, label_names)
+demo = gr.Interface(
+    title="ILLFRAMES Babel Demo",
+    fn=predict_illframes,
+    inputs=[gr.Textbox(lines=6, label="Input"),
+            gr.Dropdown(languages, label="Language"),
+            gr.Dropdown(domains.keys(), label="Domain")],
+    outputs=[gr.Label(num_top_classes=5, label="Output"), gr.Markdown()])

interfaces/manifesto.py CHANGED Viewed

@@ -1,4 +1,3 @@
-import time
 import gradio as gr
 import os
@@ -10,20 +9,7 @@ from huggingface_hub import HfApi
 from label_dicts import MANIFESTO_LABEL_NAMES
-class RuntimeMeasure:
-    def __init__(self, msg):
-        self.msg = msg
-    def __enter__(self):
-        self.start_time = time.time()
-        return self
-    def __exit__(self, exc_type, exc_value, traceback):
-        end_time = time.time()
-        runtime = end_time - self.start_time
-        gr.Info(f"{self.msg}: {runtime} seconds")
-def m(msg):
-    return RuntimeMeasure(msg)
 HF_TOKEN = os.environ["hf_read"]
@@ -39,44 +25,38 @@ def build_huggingface_path(language: str):
     return "poltextlab/xlm-roberta-large-manifesto"
 def predict(text, model_id, tokenizer_id):
-    gr.Info("\n".join(os.listdir("/data/")))
     device = torch.device("cpu")
-    with m("Loading model"):
-        model = AutoModelForSequenceClassification.from_pretrained(model_id, low_cpu_mem_usage=True, device_map="auto", token=HF_TOKEN)
-    with m("Loading tokenizer"):
-        tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
-    with m("Tokenizing"):
-        inputs = tokenizer(text,
-                           max_length=256,
-                           truncation=True,
-                           padding="do_not_pad",
-                           return_tensors="pt").to(device)
-    with m("model.eval()"):
-        model.eval()
-    with m("Inference"):
-        with torch.no_grad():
-            logits = model(**inputs).logits
-    with m("Softmax"):
-        probs = torch.nn.functional.softmax(logits, dim=1).cpu().numpy().flatten()
-    with m("Output formatting"):
-        output_pred = {f"[{model.config.id2label[i]}] {MANIFESTO_LABEL_NAMES[int(model.config.id2label[i])]}": probs[i] for i in np.argsort(probs)[::-1]}
-        output_info = f'<p style="text-align: center; display: block">Prediction was made using the <a href="https://huggingface.co/{model_id}">{model_id}</a> model.</p>'
     return output_pred, output_info
 def predict_cap(text, language):
-    with m("WHOLE PROCESS"):
-        model_id = build_huggingface_path(language)
-        tokenizer_id = "xlm-roberta-large"
-        prediction = predict(text, model_id, tokenizer_id)
-    return prediction
 demo = gr.Interface(
     fn=predict_cap,
     inputs=[gr.Textbox(lines=6, label="Input"),
             gr.Dropdown(languages, label="Language")],
-    outputs=[gr.Label(num_top_classes=5, label="Output"), gr.Markdown()])

 import gradio as gr
 import os
 from label_dicts import MANIFESTO_LABEL_NAMES
+from .utils import is_disk_full
 HF_TOKEN = os.environ["hf_read"]
     return "poltextlab/xlm-roberta-large-manifesto"
 def predict(text, model_id, tokenizer_id):
     device = torch.device("cpu")
+    model = AutoModelForSequenceClassification.from_pretrained(model_id, low_cpu_mem_usage=True, device_map="auto", offload_folder="offload", token=HF_TOKEN)
+    tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
+    inputs = tokenizer(text,
+                       max_length=256,
+                       truncation=True,
+                       padding="do_not_pad",
+                       return_tensors="pt").to(device)
+    model.eval()
+    with torch.no_grad():
+        logits = model(**inputs).logits
+    probs = torch.nn.functional.softmax(logits, dim=1).cpu().numpy().flatten()
+    output_pred = {f"[{model.config.id2label[i]}] {MANIFESTO_LABEL_NAMES[int(model.config.id2label[i])]}": probs[i] for i in np.argsort(probs)[::-1]}
+    output_info = f'<p style="text-align: center; display: block">Prediction was made using the <a href="https://huggingface.co/{model_id}">{model_id}</a> model.</p>'
     return output_pred, output_info
 def predict_cap(text, language):
+    model_id = build_huggingface_path(language)
+    tokenizer_id = "xlm-roberta-large"
+    if is_disk_full():
+        os.system('rm -rf /data/models*')
+        os.system('rm -r ~/.cache/huggingface/hub')
+    return predict(text, model_id, tokenizer_id)
 demo = gr.Interface(
+    title="Manifesto Babel Demo",
     fn=predict_cap,
     inputs=[gr.Textbox(lines=6, label="Input"),
             gr.Dropdown(languages, label="Language")],
+    outputs=[gr.Label(num_top_classes=5, label="Output"), gr.Markdown()])

interfaces/ner.py CHANGED Viewed

@@ -9,6 +9,8 @@ from transformers import AutoModelForSequenceClassification
 from transformers import AutoTokenizer
 from huggingface_hub import HfApi
 languages = [
     "English", "Hungarian", "Multilingual"
 ]
@@ -34,13 +36,16 @@ def named_entity_recognition(text, language):
     pipeline = huspacy.load() if model_id.startswith("hu") else spacy.load(model_id)
     doc = pipeline(text)
     entities = [{"entity":ent.label_, "start":ent.start_char, "end":ent.end_char} for ent in doc.ents]
     output = {"text":text, "entities":entities}
     model_id_hf = f"huspacy/{model_id}" if model_id.startswith("hu") else f"spacy/{model_id}"
-    output_info = f'<p style="text-align: center; display: block">Prediction was made using the <a href="https://huggingface.co/{model_id_hf}">{model_id_hf}</a> model.</p>'
     return output, output_info
 demo = gr.Interface(
     fn=named_entity_recognition,
     inputs=[gr.Textbox(lines=6, label="Input"),
             gr.Dropdown(languages, label="Language")],
-    outputs=[gr.HighlightedText(label='Output'), gr.Markdown()])

 from transformers import AutoTokenizer
 from huggingface_hub import HfApi
+from spacy.glossary import GLOSSARY as NER_DICT
 languages = [
     "English", "Hungarian", "Multilingual"
 ]
     pipeline = huspacy.load() if model_id.startswith("hu") else spacy.load(model_id)
     doc = pipeline(text)
     entities = [{"entity":ent.label_, "start":ent.start_char, "end":ent.end_char} for ent in doc.ents]
+    labels_used = [ent.label_ for ent in doc.ents]
+    legend = '<p style="text-align: left; display: block">Legend:</p><ul style="text-align: left; display: block">'+"".join([f"<li> <b>{label}</b> = <i>{NER_DICT[label]}</i> </li>" for label in set(labels_used)])+"</ul>"
     output = {"text":text, "entities":entities}
     model_id_hf = f"huspacy/{model_id}" if model_id.startswith("hu") else f"spacy/{model_id}"
+    output_info = legend + f'<p style="text-align: center; display: block">Prediction was made using the <a href="https://huggingface.co/{model_id_hf}">{model_id_hf}</a> model.</p> <ul>'
     return output, output_info
 demo = gr.Interface(
+    title="NER Babel Demo",
     fn=named_entity_recognition,
     inputs=[gr.Textbox(lines=6, label="Input"),
             gr.Dropdown(languages, label="Language")],
+    outputs=[gr.HighlightedText(label='Output'), gr.Markdown()])

interfaces/ontolisst.py ADDED Viewed

	@@ -0,0 +1,96 @@

+import gradio as gr
+import os
+import torch
+import numpy as np
+from transformers import AutoModelForSequenceClassification
+from transformers import AutoTokenizer
+from huggingface_hub import HfApi
+HF_TOKEN = os.environ["hf_read"]
+languages = [
+    "English"
+]
+from label_dicts import ONTOLISST_LABEL_NAMES
+from .utils import is_disk_full
+# --- DEBUG ---
+import shutil
def convert_size(size):
    """Format a byte count as a human-readable string with two decimals.

    The value is divided by 1024 until it drops below 1024 or the largest
    unit (PB) is reached.

    Args:
        size: Size in bytes.

    Returns:
        A string such as ``"1.50 KB"``. Values of 1024 PB or more are
        reported in PB instead of falling through and returning ``None``.
    """
    units = ('B', 'KB', 'MB', 'GB', 'TB', 'PB')
    for unit in units[:-1]:
        if size < 1024:
            return f"{size:.2f} {unit}"
        size /= 1024
    # Largest unit: always return a string, however big the value is.
    return f"{size:.2f} {units[-1]}"
def get_disk_space(path="/"):
    """Report disk usage for ``path`` as human-readable strings.

    Args:
        path: Path passed to ``shutil.disk_usage``.

    Returns:
        A dict with the keys ``"Total"``, ``"Used"`` and ``"Free"``, each
        formatted by ``convert_size``.
    """
    usage = shutil.disk_usage(path)  # (total, used, free), in that order
    return {
        label: convert_size(amount)
        for label, amount in zip(("Total", "Used", "Free"), usage)
    }
+# ---
def build_huggingface_path(language: str):
    """Return the Hugging Face repo id of the ONTOLISST classifier.

    ``language`` is accepted for signature parity with the other interfaces
    but does not influence the result.
    """
    model_repo = "poltextlab/xlm-roberta-large_ontolisst_v1"
    return model_repo
def predict(text, model_id, tokenizer_id):
    """Classify ``text`` with the ONTOLISST model and return the top label.

    Args:
        text: Input text to classify.
        model_id: Hugging Face repo id of the sequence-classification model.
        tokenizer_id: Hugging Face repo id of the tokenizer.

    Returns:
        A ``(label, info_html)`` tuple: ``label`` is the name of the most
        probable class from ``ONTOLISST_LABEL_NAMES`` (or the raw class index
        if it has no entry), and ``info_html`` is an HTML paragraph linking to
        the model that produced the prediction.
    """
    device = torch.device("cpu")
    model = AutoModelForSequenceClassification.from_pretrained(
        model_id,
        low_cpu_mem_usage=True,
        device_map="auto",
        offload_folder="offload",
        token=HF_TOKEN,
    )
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)

    # --- DEBUG --- log how much room is left on the /data volume
    disk_space = get_disk_space('/data/')
    print("Disk Space Info:")
    for key, value in disk_space.items():
        print(f"{key}: {value}")
    # ---

    model.to(device)

    # Inputs are truncated to 256 tokens and left unpadded (single sequence).
    inputs = tokenizer(text,
                       max_length=256,
                       truncation=True,
                       padding="do_not_pad",
                       return_tensors="pt").to(device)

    model.eval()
    with torch.no_grad():
        logits = model(**inputs).logits

    probs = torch.nn.functional.softmax(logits, dim=1).cpu().numpy().flatten()

    # Use the argmax index directly. The previous `{4: 2, 5: 1}.get(..., 0)`
    # remap (the same one used by the 3-class sentiment interface) collapsed
    # every prediction onto labels 0, 1 or 2, so the other ONTOLISST labels
    # could never be returned.
    # NOTE(review): assumes the model's class indices line up with the keys of
    # ONTOLISST_LABEL_NAMES -- confirm against the model's id2label config.
    predicted_class_id = int(probs.argmax())
    output_pred = ONTOLISST_LABEL_NAMES.get(predicted_class_id, predicted_class_id)

    output_info = f'<p style="text-align: center; display: block">Prediction was made using the <a href="https://huggingface.co/{model_id}">{model_id}</a> model.</p>'
    return output_pred, output_info
def predict_cap(text, language):
    """Gradio callback: resolve the model for ``language`` and classify ``text``.

    When ``is_disk_full()`` reports low disk space, the model directories are
    removed via shell commands before the prediction runs.

    Returns:
        Whatever ``predict`` returns for the resolved model and the
        ``xlm-roberta-large`` tokenizer.
    """
    model_id = build_huggingface_path(language)

    if is_disk_full():
        # Drop previously stored model files to make room for the next load.
        for cleanup_cmd in ('rm -rf /data/models*', 'rm -r ~/.cache/huggingface/hub'):
            os.system(cleanup_cmd)

    return predict(text, model_id, "xlm-roberta-large")
+demo = gr.Interface(
+    title="ONTOLISST Babel Demo",
+    fn=predict_cap,
+    inputs=[gr.Textbox(lines=6, label="Input"),
+            gr.Dropdown(languages, label="Language")],
+    outputs=[gr.Label(num_top_classes=3, label="Output"), gr.Markdown()])

interfaces/sentiment.py CHANGED Viewed

@@ -9,23 +9,33 @@ from huggingface_hub import HfApi
 from label_dicts import MANIFESTO_LABEL_NAMES
 HF_TOKEN = os.environ["hf_read"]
 languages = [
-    "Czech", "English", "French", "German", "Hungarian", "Italian"
 ]
 def build_huggingface_path(language: str):
-    return "poltextlab/xlm-roberta-large-pooled-sentiment"
 def predict(text, model_id, tokenizer_id):
     device = torch.device("cpu")
-    model = AutoModelForSequenceClassification.from_pretrained(model_id, token=HF_TOKEN)
     tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
     model.to(device)
     inputs = tokenizer(text,
-                       max_length=512,
                        truncation=True,
                        padding="do_not_pad",
                        return_tensors="pt").to(device)
@@ -35,17 +45,30 @@ def predict(text, model_id, tokenizer_id):
         logits = model(**inputs).logits
     probs = torch.nn.functional.softmax(logits, dim=1).cpu().numpy().flatten()
-    output_pred = {model.config.id2label[i]: probs[i] for i in np.argsort(probs)[::-1]}
     output_info = f'<p style="text-align: center; display: block">Prediction was made using the <a href="https://huggingface.co/{model_id}">{model_id}</a> model.</p>'
     return output_pred, output_info
-def predict_cap(text, language):
     model_id = build_huggingface_path(language)
     tokenizer_id = "xlm-roberta-large"
     return predict(text, model_id, tokenizer_id)
 demo = gr.Interface(
     fn=predict_cap,
     inputs=[gr.Textbox(lines=6, label="Input"),
-            gr.Dropdown(languages, label="Language")],
-    outputs=[gr.Label(num_top_classes=3, label="Output"), gr.Markdown()])

 from label_dicts import MANIFESTO_LABEL_NAMES
+from .utils import is_disk_full
 HF_TOKEN = os.environ["hf_read"]
 languages = [
+    "Czech", "English", "French", "German", "Hungarian", "Polish", "Slovak"
 ]
+domains = {
+    "parliamentary speech": "parlspeech",
+}
+SENTIMENT_LABEL_NAMES = {0: "Negative", 1: "No sentiment or Neutral sentiment", 2: "Positive"}
 def build_huggingface_path(language: str):
+    if language == "Czech" or language == "Slovak":
+        return "visegradmedia-emotion/Emotion_RoBERTa_pooled_V4"
+    return "poltextlab/xlm-roberta-large-pooled-MORES"
 def predict(text, model_id, tokenizer_id):
     device = torch.device("cpu")
+    model = AutoModelForSequenceClassification.from_pretrained(model_id, low_cpu_mem_usage=True, device_map="auto", offload_folder="offload", token=HF_TOKEN)
     tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
     model.to(device)
     inputs = tokenizer(text,
+                       max_length=256,
                        truncation=True,
                        padding="do_not_pad",
                        return_tensors="pt").to(device)
         logits = model(**inputs).logits
     probs = torch.nn.functional.softmax(logits, dim=1).cpu().numpy().flatten()
+    predicted_class_id = probs.argmax()
+    predicted_class_id = {4: 2, 5: 1}.get(predicted_class_id, 0)
+    output_pred = SENTIMENT_LABEL_NAMES.get(predicted_class_id, predicted_class_id)
     output_info = f'<p style="text-align: center; display: block">Prediction was made using the <a href="https://huggingface.co/{model_id}">{model_id}</a> model.</p>'
     return output_pred, output_info
def predict_cap(text, language, domain):
    """Gradio callback: resolve the model for ``language`` and classify ``text``.

    ``domain`` is received from the UI but is not used when choosing the
    model. When ``is_disk_full()`` reports low disk space, the model
    directories are removed via shell commands before the prediction runs.

    Returns:
        Whatever ``predict`` returns for the resolved model and the
        ``xlm-roberta-large`` tokenizer.
    """
    model_id = build_huggingface_path(language)

    if is_disk_full():
        # Drop previously stored model files to make room for the next load.
        for cleanup_cmd in ('rm -rf /data/models*', 'rm -r ~/.cache/huggingface/hub'):
            os.system(cleanup_cmd)

    return predict(text, model_id, "xlm-roberta-large")
 demo = gr.Interface(
+    title="Sentiment (3) Babel Demo",
     fn=predict_cap,
     inputs=[gr.Textbox(lines=6, label="Input"),
+            gr.Dropdown(languages, label="Language"),
+            gr.Dropdown(domains.keys(), label="Domain")],
+    outputs=[gr.Label(num_top_classes=3, label="Output"), gr.Markdown()])

interfaces/utils.py ADDED Viewed

	@@ -0,0 +1,12 @@

+import shutil
def is_disk_full(min_free_space_in_GB=10, path="/"):
    """Tell whether the filesystem holding ``path`` is running out of space.

    Args:
        min_free_space_in_GB: Minimum free space, in GiB (1024**3 bytes),
            required for the disk to count as "not full".
        path: Location whose filesystem is inspected. Defaults to ``"/"``
            (the previous hard-coded value) so existing callers are
            unaffected; pass e.g. ``"/data"`` to check another volume.

    Returns:
        ``True`` when less than ``min_free_space_in_GB`` is free (a cleanup is
        due), ``False`` otherwise. A short status line is printed either way.
    """
    free_gb = shutil.disk_usage(path).free / (1024 ** 3)
    if free_gb < min_free_space_in_GB:
        print('clean up!')
        return True
    print(f'enough space available ({free_gb} GB)')
    return False

label_dicts.py CHANGED Viewed

@@ -21,6 +21,220 @@ CAP_NUM_DICT = {
     19: 21,
     20: 23,
     21: 999,
 }
 CAP_LABEL_NAMES = {
@@ -48,6 +262,245 @@ CAP_LABEL_NAMES = {
     999: "No Policy Content"
 }
 MANIFESTO_LABEL_NAMES = {
     0: "No Policy Goal",
     999: "No Policy Goal",
@@ -107,4 +560,70 @@ MANIFESTO_LABEL_NAMES = {
     704: "Middle Class and Professional Groups",
     705: "Underprivileged Minority Groups",
     706: "Non-economic Demographic Groups"
-}

     19: 21,
     20: 23,
     21: 999,
+    22: 999, # had to do this because of some language-domain models (e.g. english media)
+}
+CAP_MIN_NUM_DICT = {
+    0: 100,
+    1: 101,
+    2: 103,
+    3: 104,
+    4: 105,
+    5: 107,
+    6: 108,
+    7: 110,
+    8: 199,
+    9: 200,
+    10: 201,
+    11: 202,
+    12: 204,
+    13: 205,
+    14: 206,
+    15: 207,
+    16: 208,
+    17: 209,
+    18: 299,
+    19: 300,
+    20: 301,
+    21: 302,
+    22: 321,
+    23: 322,
+    24: 323,
+    25: 324,
+    26: 325,
+    27: 331,
+    28: 332,
+    29: 333,
+    30: 334,
+    31: 335,
+    32: 341,
+    33: 342,
+    34: 398,
+    35: 399,
+    36: 400,
+    37: 401,
+    38: 402,
+    39: 403,
+    40: 404,
+    41: 405,
+    42: 408,
+    43: 498,
+    44: 499,
+    45: 500,
+    46: 501,
+    47: 502,
+    48: 503,
+    49: 504,
+    50: 505,
+    51: 506,
+    52: 529,
+    53: 599,
+    54: 600,
+    55: 601,
+    56: 602,
+    57: 603,
+    58: 604,
+    59: 606,
+    60: 607,
+    61: 698,
+    62: 699,
+    63: 700,
+    64: 701,
+    65: 703,
+    66: 704,
+    67: 705,
+    68: 707,
+    69: 708,
+    70: 709,
+    71: 711,
+    72: 798,
+    73: 799,
+    74: 800,
+    75: 801,
+    76: 802,
+    77: 803,
+    78: 805,
+    79: 806,
+    80: 807,
+    81: 898,
+    82: 899,
+    83: 900,
+    84: 1000,
+    85: 1001,
+    86: 1002,
+    87: 1003,
+    88: 1005,
+    89: 1007,
+    90: 1010,
+    91: 1098,
+    92: 1099,
+    93: 1200,
+    94: 1201,
+    95: 1202,
+    96: 1203,
+    97: 1204,
+    98: 1205,
+    99: 1206,
+    100: 1207,
+    101: 1208,
+    102: 1210,
+    103: 1211,
+    104: 1227,
+    105: 1299,
+    106: 1300,
+    107: 1302,
+    108: 1303,
+    109: 1304,
+    110: 1305,
+    111: 1308,
+    112: 1399,
+    113: 1400,
+    114: 1401,
+    115: 1403,
+    116: 1404,
+    117: 1405,
+    118: 1406,
+    119: 1407,
+    120: 1408,
+    121: 1409,
+    122: 1498,
+    123: 1499,
+    124: 1500,
+    125: 1501,
+    126: 1502,
+    127: 1504,
+    128: 1505,
+    129: 1507,
+    130: 1520,
+    131: 1521,
+    132: 1522,
+    133: 1523,
+    134: 1524,
+    135: 1525,
+    136: 1526,
+    137: 1598,
+    138: 1599,
+    139: 1600,
+    140: 1602,
+    141: 1603,
+    142: 1604,
+    143: 1605,
+    144: 1606,
+    145: 1608,
+    146: 1610,
+    147: 1611,
+    148: 1612,
+    149: 1614,
+    150: 1615,
+    151: 1616,
+    152: 1617,
+    153: 1619,
+    154: 1620,
+    155: 1698,
+    156: 1699,
+    157: 1700,
+    158: 1701,
+    159: 1704,
+    160: 1705,
+    161: 1706,
+    162: 1707,
+    163: 1708,
+    164: 1709,
+    165: 1798,
+    166: 1799,
+    167: 1800,
+    168: 1802,
+    169: 1803,
+    170: 1804,
+    171: 1806,
+    172: 1807,
+    173: 1808,
+    174: 1899,
+    175: 1900,
+    176: 1901,
+    177: 1902,
+    178: 1905,
+    179: 1906,
+    180: 1910,
+    181: 1921,
+    182: 1925,
+    183: 1926,
+    184: 1927,
+    185: 1929,
+    186: 1999,
+    187: 2000,
+    188: 2001,
+    189: 2002,
+    190: 2003,
+    191: 2004,
+    192: 2005,
+    193: 2006,
+    194: 2007,
+    195: 2008,
+    196: 2009,
+    197: 2010,
+    198: 2011,
+    199: 2012,
+    200: 2013,
+    201: 2014,
+    202: 2015,
+    203: 2030,
+    204: 2099,
+    205: 2100,
+    206: 2101,
+    207: 2102,
+    208: 2103,
+    209: 2104
 }
 CAP_LABEL_NAMES = {
     999: "No Policy Content"
 }
+CAP_MIN_LABEL_NAMES = {
+    # 1. Macroeconomics
+    100: "General",
+    101: "Interest Rates",
+    103: "Unemployment Rate",
+    104: "Monetary Policy",
+    105: "National Budget",
+    107: "Tax Code",
+    108: "Industrial Policy",
+    110: "Price Control",
+    199: "Other",
+    # 2. Civil Rights
+    200: "General",
+    201: "Minority Discrimination",
+    202: "Gender Discrimination",
+    204: "Age Discrimination",
+    205: "Handicap Discrimination",
+    206: "Voting Rights",
+    207: "Freedom of Speech",
+    208: "Right to Privacy",
+    209: "Anti-Government",
+    299: "Other",
+    # 3. Health
+    300: "General",
+    301: "Health Care Reform",
+    302: "Insurance",
+    321: "Drug Industry",
+    322: "Medical Facilities",
+    323: "Insurance Providers",
+    324: "Medical Liability",
+    325: "Manpower",
+    331: "Disease Prevention",
+    332: "Infants and Children",
+    333: "Mental Health",
+    334: "Long-term Care",
+    335: "Drug Coverage and Cost",
+    341: "Tobacco Abuse",
+    342: "Drug and Alcohol Abuse",
+    398: "R&D",
+    399: "Other",
+    # 4. Agriculture
+    400: "General",
+    401: "Trade",
+    402: "Subsidies to Farmers",
+    403: "Food Inspection & Safety",
+    404: "Food Marketing & Promotion",
+    405: "Animal and Crop Disease",
+    408: "Fisheries & Fishing",
+    498: "R&D",
+    499: "Other",
+    # 5. Labor
+    500: "General",
+    501: "Worker Safety",
+    502: "Employment Training",
+    503: "Employee Benefits",
+    504: "Labor Unions",
+    505: "Fair Labor Standards",
+    506: "Youth Employment",
+    529: "Migrant and Seasonal",
+    599: "Other",
+    # 6. Education
+    600: "General",
+    601: "Higher",
+    602: "Elementary & Secondary",
+    603: "Underprivileged",
+    604: "Vocational",
+    606: "Special",
+    607: "Excellence",
+    698: "R&D",
+    699: "Other",
+    # 7. Environment
+    700: "General",
+    701: "Drinking Water",
+    703: "Waste Disposal",
+    704: "Hazardous Waste",
+    705: "Air Pollution",
+    707: "Recycling",
+    708: "Indoor Hazards",
+    709: "Species & Forest",
+    711: "Land and Water Conservation",
+    798: "R&D",
+    799: "Other",
+    # 8. Energy
+    800: "General",
+    801: "Nuclear",
+    802: "Electricity",
+    803: "Natural Gas & Oil",
+    805: "Coal",
+    806: "Alternative & Renewable",
+    807: "Conservation",
+    898: "R&D",
+    899: "Other",
+    # 9. Immigration
+    900: "Immigration",
+    # 10. Transportation
+    1000: "General",
+    1001: "Mass",
+    1002: "Highways",
+    1003: "Air Travel",
+    1005: "Railroad Travel",
+    1007: "Maritime",
+    1010: "Infrastructure",
+    1098: "R&D",
+    1099: "Other",
+    # 12. Law and Crime
+    1200: "General",
+    1201: "Agencies",
+    1202: "White Collar Crime",
+    1203: "Illegal Drugs",
+    1204: "Court Administration",
+    1205: "Prisons",
+    1206: "Juvenile Crime",
+    1207: "Child Abuse",
+    1208: "Family Issues",
+    1210: "Criminal & Civil Code",
+    1211: "Crime Control",
+    1227: "Police",
+    1299: "Other",
+    # 13. Social Welfare
+    1300: "General",
+    1302: "Low-Income Assistance",
+    1303: "Elderly Assistance",
+    1304: "Disabled Assistance",
+    1305: "Volunteer Associations",
+    1308: "Child Care",
+    1399: "Other",
+    # 14. Housing
+    1400: "General",
+    1401: "Community Development",
+    1403: "Urban Development",
+    1404: "Rural Housing",
+    1405: "Rural Development",
+    1406: "Low-Income Assistance",
+    1407: "Veterans",
+    1408: "Elderly",
+    1409: "Homeless",
+    1498: "R&D",
+    1499: "Other",
+    # 15. Domestic Commerce
+    1500: "General",
+    1501: "Banking",
+    1502: "Securities & Commodities",
+    1504: "Consumer Finance",
+    1505: "Insurance Regulation",
+    1507: "Bankruptcy",
+    1520: "Corporate Management",
+    1521: "Small Businesses",
+    1522: "Copyrights and Patents",
+    1523: "Disaster Relief",
+    1524: "Tourism",
+    1525: "Consumer Safety",
+    1526: "Sports Regulation",
+    1598: "R&D",
+    1599: "Other",
+    # 16. Defense
+    1600: "General",
+    1602: "Alliances",
+    1603: "Intelligence",
+    1604: "Readiness",
+    1605: "Nuclear Arms",
+    1606: "Military Aid",
+    1608: "Personnel Issues",
+    1610: "Procurement",
+    1611: "Installations & Land",
+    1612: "Reserve Forces",
+    1614: "Hazardous Waste",
+    1615: "Civil",
+    1616: "Civilian Personnel",
+    1617: "Contractors",
+    1619: "Foreign Operations",
+    1620: "Claims against Military",
+    1698: "R&D",
+    1699: "Other",
+    # 17. Technology
+    1700: "General",
+    1701: "Space",
+    1704: "Commercial Use of Space",
+    1705: "Science Transfer",
+    1706: "Telecommunications",
+    1707: "Broadcast",
+    1708: "Weather Forecasting",
+    1709: "Computers",
+    1798: "R&D",
+    1799: "Other",
+    # 18. Foreign Trade
+    1800: "General",
+    1802: "Trade Agreements",
+    1803: "Exports",
+    1804: "Private Investments",
+    1806: "Competitiveness",
+    1807: "Tariff & Imports",
+    1808: "Exchange Rates",
+    1899: "Other",
+    # 19. International Affairs
+    1900: "General",
+    1901: "Foreign Aid",
+    1902: "Resources Exploitation",
+    1905: "Developing Countries",
+    1906: "International Finance",
+    1910: "Western Europe",
+    1921: "Specific Country",
+    1925: "Human Rights",
+    1926: "Organizations",
+    1927: "Terrorism",
+    1929: "Diplomats",
+    1999: "Other",
+    # 20. Government Operations
+    2000: "General",
+    2001: "Intergovernmental Relations",
+    2002: "Bureaucracy",
+    2003: "Postal Service",
+    2004: "Employees",
+    2005: "Appointments",
+    2006: "Currency",
+    2007: "Procurement & Contractors",
+    2008: "Property Management",
+    2009: "Tax Administration",
+    2010: "Scandals",
+    2011: "Branch Relations",
+    2012: "Political Campaigns",
+    2013: "Census & Statistics",
+    2014: "Capital City",
+    2015: "Claims against the government",
+    2030: "National Holidays",
+    2099: "Other",
+    # 21. Public Lands
+    2100: "General",
+    2101: "National Parks",
+    2102: "Indigenous Affairs",
+    2103: "Public Lands",
+    2104: "Water Resources",
+    2105: "Dependencies & Territories",
+    2199: "Other",
+    # 23. Culture
+    2300: "General",
+    # NPC
+    9999: "No Policy Content",
+}
 MANIFESTO_LABEL_NAMES = {
     0: "No Policy Goal",
     999: "No Policy Goal",
     704: "Middle Class and Professional Groups",
     705: "Underprivileged Minority Groups",
     706: "Non-economic Demographic Groups"
+}
# Display names for the migration frame codes.
ILLFRAMES_MIGRATION_LABEL_NAMES = {
    901: "Culture Under Attack",
    902: "Economic Burden",
    903: "Illegals and Fraudsters",
    904: "Extradition Necessity",
    905: "Nation State Should Decide",  # was misspelled "Nation tate"
    906: "Administrative Burden",
    907: "General System Failure",
    908: "Security Threat",
    909: "Criminals",
    910: "Welfare State Overload",
    999: "None of Them",
}
+ILLFRAMES_COVID_LABEL_NAMES = {
+    310: "Skepticism",
+    311: "Great Reset and Elite Control",
+    312: "Undermining the Economy",
+    313: "Medical Choice",
+    314: "Media Fabrication",
+    315: "Threatening Way of Life",
+    399: "None of Them",
+}
# Display names for the war frame codes (there is no entry for code 106).
ILLFRAMES_WAR_LABEL_NAMES = {
    101: 'Identity and Cultural Threat',
    102: 'Economic Fallout/Domestic Welfare Neglected',
    103: 'Violation of Russian Sovereignty/Western geopolitical meddling',
    104: 'Illegitimate and corrupt Ukraine leadership',
    # Spelling fixed: "agressive" -> "aggressive".
    105: 'Ukrainians and Ukraine are a military threat and aggressive war-mongerer that threaten EU stability and security',
    107: 'Western Propaganda and Civilian Suffering',
    108: 'Historical Betrayal of Russia',
    109: 'Ukraine/Nazi Allegation',
    110: "None of Them"
}
+ONTOLISST_LABEL_NAMES = {
+    0: 'Demographics',
+    1: 'Housing and local environment (Housing and environment)',
+    2: 'Physical health',
+    3: 'Mental health and mental processes',
+    4: 'Healthcare',
+    5: 'Health behaviour (Health and lifestyle)',
+    6: 'Family and social networks',
+    7: 'Education',
+    8: 'Employment and income (Employment and pensions)',
+    9: 'Expectation, attitudes and beliefs (Attitudes and beliefs)',
+    10: 'Child development',
+    11: 'Life events',
+    12: 'Omics',
+    13: 'Pregnancy',
+    14: 'Administration',
+    15: 'COVID19'
+}
+EMOTION9_LABEL_NAMES = {
+    0: "Anger",
+    1: "Fear",
+    2: "Disgust",
+    3: "Sadness",
+    4: "Joy",
+    5: "Enthusiasm",
+    6: "Hope",
+    7: "Pride",
+    8: "None of Them",
+}

requirements.txt CHANGED Viewed

@@ -4,4 +4,5 @@ transformers==4.39.1
 sentencepiece==0.2.0
 accelerate
 spacy
-huspacy

 sentencepiece==0.2.0
 accelerate
 spacy
+huspacy
+numpy==1.26.4

utils.py CHANGED Viewed

@@ -1,36 +1,72 @@
 import os
-from transformers import AutoTokenizer, AutoModelForSequenceClassification
-"""
-from interfaces.manifesto import languages as languages_manifesto
-from interfaces.manifesto import languages as languages_manifesto
-from interfaces.manifesto import languages as languages_manifesto
-"""
 from interfaces.cap import languages as languages_cap
 from interfaces.cap import domains as domains_cap
 from interfaces.cap import build_huggingface_path as hf_cap_path
 from interfaces.manifesto import build_huggingface_path as hf_manifesto_path
 from interfaces.sentiment import build_huggingface_path as hf_sentiment_path
 from interfaces.emotion import build_huggingface_path as hf_emotion_path
 HF_TOKEN = os.environ["hf_read"]
 # should be a temporary solution
-models = [hf_manifesto_path(""), hf_sentiment_path(""), hf_emotion_path("")]
 for language in languages_cap:
     for domain in domains_cap:
         models.append(hf_cap_path(language, domain))
 tokenizers = ["xlm-roberta-large"]
 def download_hf_models():
     for model_id in models:
-        model = AutoModelForSequenceClassification.from_pretrained(model_id, low_cpu_mem_usage=True, device_map="auto",
                                                                    token=HF_TOKEN)
-        del model
     for tokenizer_id in tokenizers:
-        tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
-        del tokenizer

 import os
+import shutil
+import subprocess
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
 from interfaces.cap import languages as languages_cap
 from interfaces.cap import domains as domains_cap
+from interfaces.emotion9 import languages as languages_emotion9
+from interfaces.illframes import domains as domains_illframes
 from interfaces.cap import build_huggingface_path as hf_cap_path
+from interfaces.cap_minor import build_huggingface_path as hf_cap_minor_path
 from interfaces.manifesto import build_huggingface_path as hf_manifesto_path
 from interfaces.sentiment import build_huggingface_path as hf_sentiment_path
 from interfaces.emotion import build_huggingface_path as hf_emotion_path
+from interfaces.emotion9 import build_huggingface_path as hf_emotion9_path
+from interfaces.ontolisst import build_huggingface_path as hf_ontlisst_path
+from interfaces.illframes import build_huggingface_path as hf_illframes_path
+from interfaces.ontolisst import build_huggingface_path as hf_ontolisst_path
 HF_TOKEN = os.environ["hf_read"]
 # should be a temporary solution
+models = [hf_manifesto_path(""), hf_sentiment_path(""), hf_emotion_path(""), hf_cap_minor_path("", ""), hf_ontolisst_path("")]
+# it gets more difficult with cap
+domains_cap = list(domains_cap.values())
 for language in languages_cap:
     for domain in domains_cap:
         models.append(hf_cap_path(language, domain))
+# emotion9
+for language in languages_emotion9:
+    models.append(hf_emotion9_path(language))
+# illframes (domains is a dict for some reason?)
+for domain in domains_illframes.values():
+    models.append(hf_illframes_path(domain))
 tokenizers = ["xlm-roberta-large"]
 def download_hf_models():
     for model_id in models:
+        AutoModelForSequenceClassification.from_pretrained(model_id, low_cpu_mem_usage=True, device_map="auto", offload_folder="offload",
                                                                    token=HF_TOKEN)
     for tokenizer_id in tokenizers:
+        AutoTokenizer.from_pretrained(tokenizer_id)
def df_h():
    """Run ``df -H`` and print its standard output."""
    completed = subprocess.run(["df", "-H"], capture_output=True, text=True)
    report = completed.stdout
    print(report)
def set_hf_cache_dir(path:str):
    """Point the Hugging Face / Torch cache environment variables at ``path``.

    Sets ``TRANSFORMERS_CACHE``, ``HF_HOME``, ``HF_DATASETS_CACHE`` and
    ``TORCH_HOME`` in ``os.environ`` to the same directory.
    """
    cache_vars = ('TRANSFORMERS_CACHE', 'HF_HOME', 'HF_DATASETS_CACHE', 'TORCH_HOME')
    for var_name in cache_vars:
        os.environ[var_name] = path
def is_disk_full(min_free_space_in_GB=10, path="/"):
    """Tell whether the filesystem holding ``path`` is running out of space.

    Args:
        min_free_space_in_GB: Minimum free space, in GiB (1024**3 bytes),
            required for the disk to count as "not full".
        path: Location whose filesystem is inspected. Defaults to ``"/"``
            (the previous hard-coded value) so existing callers are
            unaffected; pass e.g. ``"/data"`` to check another volume.

    Returns:
        ``True`` when less than ``min_free_space_in_GB`` is free, ``False``
        otherwise.
    """
    free_gb = shutil.disk_usage(path).free / (1024 ** 3)
    return free_gb < min_free_space_in_GB