kovacsvi committed on
Commit
4bba8df
·
1 Parent(s): 6d39e54

up-to-date prod demo

Browse files
README.md CHANGED
@@ -1,6 +1,6 @@
1
  ---
2
  title: Babel Machine Demo Dev
3
- emoji: 💻
4
  colorFrom: pink
5
  colorTo: indigo
6
  sdk: gradio
@@ -10,4 +10,4 @@ pinned: false
10
  short_description: CAP, Manifesto, sentiment, emotion classification
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
  title: Babel Machine Demo Dev
3
+ emoji: 📊
4
  colorFrom: pink
5
  colorTo: indigo
6
  sdk: gradio
 
10
  short_description: CAP, Manifesto, sentiment, emotion classification
11
  ---
12
 
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -1,47 +1,58 @@
1
  import os
2
- PATH = '/data/'
3
- os.environ['TRANSFORMERS_CACHE'] = PATH
4
- os.environ['HF_HOME'] = PATH
5
- os.environ['HF_DATASETS_CACHE'] = PATH
6
- os.environ['TORCH_HOME'] = PATH
7
-
8
  import gradio as gr
9
 
10
- from spacy import glossary
11
  from interfaces.cap import demo as cap_demo
 
12
  from interfaces.manifesto import demo as manifesto_demo
13
  from interfaces.sentiment import demo as sentiment_demo
14
  from interfaces.emotion import demo as emotion_demo
15
  from interfaces.ner import demo as ner_demo
16
  from interfaces.ner import download_models as download_spacy_models
17
- from utils import download_hf_models
 
 
 
 
 
18
 
19
- entities = ["CARDINAL", "DATE", "EVENT", "FAC", "GPE", "LANGUAGE", "LAW", "LOC", "MONEY", "NORP", "ORDINAL", "ORG", "PERCENT", "PERSON", "PRODUCT", "QUANTITY", "TIME", "WORK_OF_ART"]
20
- ent_dict = glossary.GLOSSARY
21
- ent_sum = [f'{ent} = {ent_dict[ent]}' for ent in entities ]
 
 
 
 
 
22
 
23
- with gr.Blocks() as demo:
24
  gr.Markdown(
25
  f"""
26
- <div style="display: block; text-align: left; padding:0; margin:0;">
27
- <h1 style="text-align: center">Babel Machine Demo</h1>
28
- <p>This is a demo for text classification using language models finetuned on data labeled by <a href="https://www.comparativeagendas.net/">CAP</a>, <a href="https://manifesto-project.wzb.eu/">Manifesto Project</a>, sentiment, and emotion coding systems.<br>
29
- For the coding of complete datasets, please visit the official <a href="https://babel.poltextlab.com/">Babel Machine</a> site.</p>
30
- <p> For named entity recognition the following labels are used: </p>
31
- <ul>
32
- <li> {'</li> <li>'.join(ent_sum)} </li>
33
- </ul>
 
 
 
 
 
34
  </div>
35
  """)
36
 
37
  gr.TabbedInterface(
38
- interface_list=[cap_demo, manifesto_demo, sentiment_demo, emotion_demo, ner_demo],
39
- tab_names=["CAP", "Manifesto", "Sentiment (3)", "Emotions (8)", "Named Entity Recognition"],
40
  )
41
 
42
  if __name__ == "__main__":
43
- download_hf_models()
44
- download_spacy_models()
 
 
45
  demo.launch()
46
 
47
- # TODO: add all languages & domains
 
1
  import os
 
 
 
 
 
 
2
  import gradio as gr
3
 
 
4
  from interfaces.cap import demo as cap_demo
5
+ from interfaces.cap_minor import demo as cap_minor_demo
6
  from interfaces.manifesto import demo as manifesto_demo
7
  from interfaces.sentiment import demo as sentiment_demo
8
  from interfaces.emotion import demo as emotion_demo
9
  from interfaces.ner import demo as ner_demo
10
  from interfaces.ner import download_models as download_spacy_models
11
+ from interfaces.illframes import demo as illframes_demo
12
+ from interfaces.ontolisst import demo as ontolisst_demo
13
+ from interfaces.emotion9 import demo as e9_demo
14
+ from utils import download_hf_models, df_h, set_hf_cache_dir
15
+
16
+
17
 
18
+ css = """
19
+ /* Make only the active tab bold */
20
+ .svelte-1uw5tnk[aria-selected="true"] {
21
+ font-weight: bold;
22
+ background: linear-gradient(to bottom right, var(--primary-100), var(--primary-300));
23
+ color: var(--primary-600)
24
+ }
25
+ """
26
 
27
+ with gr.Blocks(css=css) as demo:
28
  gr.Markdown(
29
  f"""
30
+ <style>
31
+ @import 'https://fonts.googleapis.com/css?family=Source+Sans+Pro:300,400';
32
+ </style>
33
+ <div style="display: block; text-align: left; padding:0; margin:0;font-family: "Source Sans Pro", Helvetica, sans-serif;">
34
+ <h1 style="text-align: center;font-size: 17pt;">Babel Machine Demo</h1>
35
+ <p style="font-size: 14pt;">This is a demo for text classification using language models finetuned on data labeled by <a href="https://www.comparativeagendas.net/">CAP</a>, <a href="https://manifesto-project.wzb.eu/">Manifesto Project</a>, sentiment, emotion coding and Named Entity Recognition systems.
36
+ For the coding of complete datasets, please visit the official <a href="https://babel.poltextlab.com/">Babel Machine</a> site.<br>
37
+ Please note that the sentiment (3) and emotions (6) models have been trained using parliamentary speech data, so the results for generic sentences may not be reliable. The emotions (9) models have been trained using <a href="https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/C9SAIX">this dataset</a>. It contains labeled parliamentary speeches and social media data. Under-represented categories were enriched with synthetic data.<br>
38
+ <br>
39
+ The models listed for Manifesto, Sentiment (3) and Emotions (6) tasks are a beta version and thus not publicly available,
40
+ the Hugging Face link will not work for them for the time being. We expect a public version after tests and improvements in the Fall.
41
+ Please feel free to check back for model updates, or reach out to us at that point if you wish to ask about a specific model.
42
+ </p>
43
  </div>
44
  """)
45
 
46
  gr.TabbedInterface(
47
+ interface_list=[cap_demo, cap_minor_demo, manifesto_demo, sentiment_demo, emotion_demo, e9_demo,illframes_demo, ner_demo, ontolisst_demo],
48
+ tab_names=["CAP", "CAP Minor Codes", "Manifesto", "Sentiment (3)", "Emotions (6)","Emotions (9)", "ILLFRAMES", "Named Entity Recognition", "ONTOLISST"]
49
  )
50
 
51
  if __name__ == "__main__":
52
+ set_hf_cache_dir("/data")
53
+ download_spacy_models()
54
+ download_hf_models() # does this affect the build?
55
+ df_h() # debug -> check disk space before launching demo
56
  demo.launch()
57
 
58
+ # TODO: add all languages & domains
interfaces/cap.py CHANGED
@@ -7,26 +7,16 @@ import pandas as pd
7
  from transformers import AutoModelForSequenceClassification
8
  from transformers import AutoTokenizer
9
  from huggingface_hub import HfApi
10
- from huggingface_hub.utils._errors import RepositoryNotFoundError
11
 
12
  from label_dicts import CAP_NUM_DICT, CAP_LABEL_NAMES
13
 
 
 
14
  HF_TOKEN = os.environ["hf_read"]
15
 
16
  languages = [
17
- "Danish",
18
- "Dutch",
19
  "English",
20
- "French",
21
- "German",
22
- "Hungarian",
23
- "Italian",
24
- "Polish",
25
- "Portuguese",
26
- "Spanish",
27
- "Czech",
28
- "Slovak",
29
- "Norwegian"
30
  ]
31
 
32
  domains = {
@@ -48,12 +38,19 @@ def check_huggingface_path(checkpoint_path: str):
48
  hf_api = HfApi(token=HF_TOKEN)
49
  hf_api.model_info(checkpoint_path, token=HF_TOKEN)
50
  return True
51
- except RepositoryNotFoundError:
52
  return False
53
 
54
  def build_huggingface_path(language: str, domain: str):
 
55
  base_path = "xlm-roberta-large"
56
- lang_domain_path = f"poltextlab/{base_path}-{language}-{domain}-cap-v3"
 
 
 
 
 
 
57
  lang_path = f"poltextlab/{base_path}-{language}-cap-v3"
58
 
59
  path_map = {
@@ -75,48 +72,31 @@ def build_huggingface_path(language: str, domain: str):
75
  except (AttributeError, FileNotFoundError):
76
  value = None
77
 
78
- if value and value in path_map:
79
- model_path = path_map[value]
80
- if check_huggingface_path(model_path):
81
- # if the model is available on Huggingface, return the path
82
- return model_path
83
- else:
84
- # if the model is not available on Huggingface, look for other models
85
- filtered_path_map = {k: v for k, v in path_map.items() if k != value}
86
- for k, v in filtered_path_map.items():
87
- if check_huggingface_path(v):
88
- return v
89
- elif check_huggingface_path(lang_domain_path):
90
- return lang_domain_path
91
- elif check_huggingface_path(lang_path):
92
- return lang_path
93
  else:
94
  return "poltextlab/xlm-roberta-large-pooled-cap"
95
 
96
  def predict(text, model_id, tokenizer_id):
97
  device = torch.device("cpu")
98
- gr.Info("Loading model")
99
- model = AutoModelForSequenceClassification.from_pretrained(model_id, low_cpu_mem_usage=True, device_map="auto", token=HF_TOKEN)
100
- gr.Info("Loading tokenizer")
101
  tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
102
- #gr.Info("Mapping model to device")
103
- #model.to(device)
104
 
105
- gr.Info("Tokenizing")
106
  inputs = tokenizer(text,
107
  max_length=256,
108
  truncation=True,
109
  padding="do_not_pad",
110
  return_tensors="pt").to(device)
111
-
112
- gr.Info("model.eval()")
113
  model.eval()
114
 
115
- gr.Info("Prediction")
116
  with torch.no_grad():
117
  logits = model(**inputs).logits
118
 
119
- gr.Info("Softmax")
120
  probs = torch.nn.functional.softmax(logits, dim=1).cpu().numpy().flatten()
121
  output_pred = {f"[{CAP_NUM_DICT[i]}] {CAP_LABEL_NAMES[CAP_NUM_DICT[i]]}": probs[i] for i in np.argsort(probs)[::-1]}
122
  output_info = f'<p style="text-align: center; display: block">Prediction was made using the <a href="https://huggingface.co/{model_id}">{model_id}</a> model.</p>'
@@ -126,11 +106,17 @@ def predict_cap(text, language, domain):
126
  domain = domains[domain]
127
  model_id = build_huggingface_path(language, domain)
128
  tokenizer_id = "xlm-roberta-large"
 
 
 
 
 
129
  return predict(text, model_id, tokenizer_id)
130
 
131
  demo = gr.Interface(
 
132
  fn=predict_cap,
133
  inputs=[gr.Textbox(lines=6, label="Input"),
134
  gr.Dropdown(languages, label="Language"),
135
  gr.Dropdown(domains.keys(), label="Domain")],
136
- outputs=[gr.Label(num_top_classes=5, label="Output"), gr.Markdown()])
 
7
  from transformers import AutoModelForSequenceClassification
8
  from transformers import AutoTokenizer
9
  from huggingface_hub import HfApi
 
10
 
11
  from label_dicts import CAP_NUM_DICT, CAP_LABEL_NAMES
12
 
13
+ from .utils import is_disk_full
14
+
15
  HF_TOKEN = os.environ["hf_read"]
16
 
17
  languages = [
 
 
18
  "English",
19
+ "Multilingual"
 
 
 
 
 
 
 
 
 
20
  ]
21
 
22
  domains = {
 
38
  hf_api = HfApi(token=HF_TOKEN)
39
  hf_api.model_info(checkpoint_path, token=HF_TOKEN)
40
  return True
41
+ except:
42
  return False
43
 
44
  def build_huggingface_path(language: str, domain: str):
45
+ language = language.lower()
46
  base_path = "xlm-roberta-large"
47
+
48
+ if language == "english" and (domain == "media" or domain == "legislative"):
49
+ lang_domain_path = f"poltextlab/{base_path}-{language}-{domain}-cap-v4"
50
+ return lang_domain_path
51
+ else:
52
+ lang_domain_path = f"poltextlab/{base_path}-{language}-{domain}-cap-v3"
53
+
54
  lang_path = f"poltextlab/{base_path}-{language}-cap-v3"
55
 
56
  path_map = {
 
72
  except (AttributeError, FileNotFoundError):
73
  value = None
74
 
75
+ if language == 'english':
76
+ model_path = lang_path
77
+ else:
78
+ model_path = "poltextlab/xlm-roberta-large-pooled-cap"
79
+
80
+ if check_huggingface_path(model_path):
81
+ return model_path
 
 
 
 
 
 
 
 
82
  else:
83
  return "poltextlab/xlm-roberta-large-pooled-cap"
84
 
85
  def predict(text, model_id, tokenizer_id):
86
  device = torch.device("cpu")
87
+ model = AutoModelForSequenceClassification.from_pretrained(model_id, low_cpu_mem_usage=True, device_map="auto", offload_folder="offload", token=HF_TOKEN)
 
 
88
  tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
 
 
89
 
 
90
  inputs = tokenizer(text,
91
  max_length=256,
92
  truncation=True,
93
  padding="do_not_pad",
94
  return_tensors="pt").to(device)
 
 
95
  model.eval()
96
 
 
97
  with torch.no_grad():
98
  logits = model(**inputs).logits
99
 
 
100
  probs = torch.nn.functional.softmax(logits, dim=1).cpu().numpy().flatten()
101
  output_pred = {f"[{CAP_NUM_DICT[i]}] {CAP_LABEL_NAMES[CAP_NUM_DICT[i]]}": probs[i] for i in np.argsort(probs)[::-1]}
102
  output_info = f'<p style="text-align: center; display: block">Prediction was made using the <a href="https://huggingface.co/{model_id}">{model_id}</a> model.</p>'
 
106
  domain = domains[domain]
107
  model_id = build_huggingface_path(language, domain)
108
  tokenizer_id = "xlm-roberta-large"
109
+
110
+ if is_disk_full():
111
+ os.system('rm -rf /data/models*')
112
+ os.system('rm -r ~/.cache/huggingface/hub')
113
+
114
  return predict(text, model_id, tokenizer_id)
115
 
116
  demo = gr.Interface(
117
+ title="CAP Babel Demo",
118
  fn=predict_cap,
119
  inputs=[gr.Textbox(lines=6, label="Input"),
120
  gr.Dropdown(languages, label="Language"),
121
  gr.Dropdown(domains.keys(), label="Domain")],
122
+ outputs=[gr.Label(num_top_classes=5, label="Output"), gr.Markdown()])
interfaces/cap_minor.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+
3
+ import os
4
+ import torch
5
+ import numpy as np
6
+ import pandas as pd
7
+ from transformers import AutoModelForSequenceClassification
8
+ from transformers import AutoTokenizer
9
+ from huggingface_hub import HfApi
10
+
11
+ from label_dicts import CAP_MIN_NUM_DICT, CAP_MIN_LABEL_NAMES
12
+
13
+ from .utils import is_disk_full
14
+
15
+ HF_TOKEN = os.environ["hf_read"]
16
+
17
+ languages = [
18
+ "Multilingual",
19
+ ]
20
+
21
+ domains = {
22
+ "media": "media",
23
+ "social media": "social",
24
+ "parliamentary speech": "parlspeech",
25
+ "legislative documents": "legislative",
26
+ "executive speech": "execspeech",
27
+ "executive order": "execorder",
28
+ "party programs": "party",
29
+ "judiciary": "judiciary",
30
+ "budget": "budget",
31
+ "public opinion": "publicopinion",
32
+ "local government agenda": "localgovernment"
33
+ }
34
+
35
+ def check_huggingface_path(checkpoint_path: str):
36
+ try:
37
+ hf_api = HfApi(token=HF_TOKEN)
38
+ hf_api.model_info(checkpoint_path, token=HF_TOKEN)
39
+ return True
40
+ except:
41
+ return False
42
+
43
+ def build_huggingface_path(language: str, domain: str):
44
+ return "poltextlab/xlm-roberta-large-pooled-cap-minor"
45
+
46
+ def predict(text, model_id, tokenizer_id):
47
+ device = torch.device("cpu")
48
+ model = AutoModelForSequenceClassification.from_pretrained(model_id, low_cpu_mem_usage=True, device_map="auto", offload_folder="offload", token=HF_TOKEN)
49
+ tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
50
+
51
+ inputs = tokenizer(text,
52
+ max_length=256,
53
+ truncation=True,
54
+ padding="do_not_pad",
55
+ return_tensors="pt").to(device)
56
+ model.eval()
57
+
58
+ with torch.no_grad():
59
+ logits = model(**inputs).logits
60
+
61
+ probs = torch.nn.functional.softmax(logits, dim=1).cpu().numpy().flatten()
62
+ output_pred = {f"[{CAP_MIN_NUM_DICT[i]}] {CAP_MIN_LABEL_NAMES[CAP_MIN_NUM_DICT[i]]}": probs[i] for i in np.argsort(probs)[::-1]}
63
+ output_info = f'<p style="text-align: center; display: block">Prediction was made using the <a href="https://huggingface.co/{model_id}">{model_id}</a> model.</p>'
64
+ return output_pred, output_info
65
+
66
+ def predict_cap(text, language, domain):
67
+ domain = domains[domain]
68
+ model_id = build_huggingface_path(language, domain)
69
+ tokenizer_id = "xlm-roberta-large"
70
+
71
+ if is_disk_full():
72
+ os.system('rm -rf /data/models*')
73
+ os.system('rm -r ~/.cache/huggingface/hub')
74
+
75
+ return predict(text, model_id, tokenizer_id)
76
+
77
+ demo = gr.Interface(
78
+ title="CAP Minor Topics Babel Demo",
79
+ fn=predict_cap,
80
+ inputs=[gr.Textbox(lines=6, label="Input"),
81
+ gr.Dropdown(languages, label="Language"),
82
+ gr.Dropdown(domains.keys(), label="Domain")],
83
+ outputs=[gr.Label(num_top_classes=5, label="Output"), gr.Markdown()])
interfaces/emotion.py CHANGED
@@ -9,18 +9,25 @@ from huggingface_hub import HfApi
9
 
10
  from label_dicts import MANIFESTO_LABEL_NAMES
11
 
 
 
12
  HF_TOKEN = os.environ["hf_read"]
13
 
14
  languages = [
15
- "Czech", "English", "French", "German", "Hungarian", "Italian"
16
  ]
 
 
 
17
 
18
  def build_huggingface_path(language: str):
19
- return "poltextlab/xlm-roberta-large-pooled-emotions"
 
 
20
 
21
  def predict(text, model_id, tokenizer_id):
22
  device = torch.device("cpu")
23
- model = AutoModelForSequenceClassification.from_pretrained(model_id, token=HF_TOKEN)
24
  tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
25
  model.to(device)
26
 
@@ -39,13 +46,20 @@ def predict(text, model_id, tokenizer_id):
39
  output_info = f'<p style="text-align: center; display: block">Prediction was made using the <a href="https://huggingface.co/{model_id}">{model_id}</a> model.</p>'
40
  return output_pred, output_info
41
 
42
- def predict_cap(text, language):
43
  model_id = build_huggingface_path(language)
44
  tokenizer_id = "xlm-roberta-large"
 
 
 
 
 
45
  return predict(text, model_id, tokenizer_id)
46
 
47
  demo = gr.Interface(
 
48
  fn=predict_cap,
49
  inputs=[gr.Textbox(lines=6, label="Input"),
50
- gr.Dropdown(languages, label="Language")],
51
- outputs=[gr.Label(num_top_classes=5, label="Output"), gr.Markdown()])
 
 
9
 
10
  from label_dicts import MANIFESTO_LABEL_NAMES
11
 
12
+ from .utils import is_disk_full
13
+
14
  HF_TOKEN = os.environ["hf_read"]
15
 
16
  languages = [
17
+ "Czech", "English", "French", "German", "Hungarian", "Polish", "Slovak"
18
  ]
19
+ domains = {
20
+ "parliamentary speech": "parlspeech",
21
+ }
22
 
23
  def build_huggingface_path(language: str):
24
+ if language == "Czech" or language == "Slovak":
25
+ return "visegradmedia-emotion/Emotion_RoBERTa_pooled_V4"
26
+ return "poltextlab/xlm-roberta-large-pooled-MORES"
27
 
28
  def predict(text, model_id, tokenizer_id):
29
  device = torch.device("cpu")
30
+ model = AutoModelForSequenceClassification.from_pretrained(model_id, low_cpu_mem_usage=True, device_map="auto", offload_folder="offload", token=HF_TOKEN)
31
  tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
32
  model.to(device)
33
 
 
46
  output_info = f'<p style="text-align: center; display: block">Prediction was made using the <a href="https://huggingface.co/{model_id}">{model_id}</a> model.</p>'
47
  return output_pred, output_info
48
 
49
+ def predict_cap(text, language, domain):
50
  model_id = build_huggingface_path(language)
51
  tokenizer_id = "xlm-roberta-large"
52
+
53
+ if is_disk_full():
54
+ os.system('rm -rf /data/models*')
55
+ os.system('rm -r ~/.cache/huggingface/hub')
56
+
57
  return predict(text, model_id, tokenizer_id)
58
 
59
  demo = gr.Interface(
60
+ title="Emotions (6) Babel Demo",
61
  fn=predict_cap,
62
  inputs=[gr.Textbox(lines=6, label="Input"),
63
+ gr.Dropdown(languages, label="Language"),
64
+ gr.Dropdown(domains.keys(), label="Domain")],
65
+ outputs=[gr.Label(num_top_classes=5, label="Output"), gr.Markdown()])
interfaces/emotion9.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+
3
+ import os
4
+ import torch
5
+ import numpy as np
6
+ from transformers import AutoModelForSequenceClassification
7
+ from transformers import AutoTokenizer
8
+ from huggingface_hub import HfApi
9
+
10
+ from label_dicts import EMOTION9_LABEL_NAMES
11
+
12
+ from .utils import is_disk_full
13
+
14
+ HF_TOKEN = os.environ["hf_read"]
15
+
16
+ languages = [
17
+ "Czech", "English", "German", "Hungarian", "Polish", "Slovak"
18
+ ]
19
+ domains = {
20
+ "parliamentary speech": "parlspeech",
21
+ }
22
+
23
+ def build_huggingface_path(language: str):
24
+ language = language.lower()
25
+ return f"poltextlab/xlm-roberta-large-pooled-{language}-emotions9"
26
+
27
+ def predict(text, model_id, tokenizer_id):
28
+ device = torch.device("cpu")
29
+ model = AutoModelForSequenceClassification.from_pretrained(model_id, low_cpu_mem_usage=True, token=HF_TOKEN)
30
+ tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
31
+
32
+ inputs = tokenizer(text,
33
+ max_length=512,
34
+ truncation=True,
35
+ padding="do_not_pad",
36
+ return_tensors="pt").to(device)
37
+ model.eval()
38
+
39
+ with torch.no_grad():
40
+ logits = model(**inputs).logits
41
+
42
+ probs = torch.nn.functional.softmax(logits, dim=1).cpu().numpy().flatten()
43
+
44
+ NUMS_DICT = {i: key for i, key in enumerate(sorted(EMOTION9_LABEL_NAMES.keys()))}
45
+ output_pred = {f"[{NUMS_DICT[i]}] {EMOTION9_LABEL_NAMES[NUMS_DICT[i]]}": probs[i] for i in np.argsort(probs)[::-1]}
46
+ output_info = f'<p style="text-align: center; display: block">Prediction was made using the <a href="https://huggingface.co/{model_id}">{model_id}</a> model.</p>'
47
+ return output_pred, output_info
48
+
49
+ def predict_e6(text, language, domain):
50
+ model_id = build_huggingface_path(language)
51
+ tokenizer_id = "xlm-roberta-large"
52
+
53
+ if is_disk_full():
54
+ os.system('rm -rf /data/models*')
55
+ os.system('rm -r ~/.cache/huggingface/hub')
56
+
57
+ return predict(text, model_id, tokenizer_id)
58
+
59
+ demo = gr.Interface(
60
+ title="Emotions (9) Babel Demo",
61
+ fn=predict_e6,
62
+ inputs=[gr.Textbox(lines=6, label="Input"),
63
+ gr.Dropdown(languages, label="Language"),
64
+ gr.Dropdown(domains.keys(), label="Domain")],
65
+ outputs=[gr.Label(num_top_classes=5, label="Output"), gr.Markdown()])
interfaces/illframes.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+
3
+ import os
4
+ import torch
5
+ import numpy as np
6
+ import pandas as pd
7
+ from transformers import AutoModelForSequenceClassification
8
+ from transformers import AutoTokenizer
9
+ from huggingface_hub import HfApi
10
+
11
+ from label_dicts import ILLFRAMES_MIGRATION_LABEL_NAMES, ILLFRAMES_COVID_LABEL_NAMES, ILLFRAMES_WAR_LABEL_NAMES
12
+
13
+ from .utils import is_disk_full
14
+
15
+ HF_TOKEN = os.environ["hf_read"]
16
+
17
+ languages = [
18
+ "English"
19
+ ]
20
+
21
+ domains = {
22
+ "Covid": "covid",
23
+ "Migration": "migration",
24
+ "War": "war"
25
+ }
26
+
27
+
28
+ # --- DEBUG ---
29
+ import shutil
30
+
31
+ def convert_size(size):
32
+ for unit in ['B', 'KB', 'MB', 'GB', 'TB', 'PB']:
33
+ if size < 1024:
34
+ return f"{size:.2f} {unit}"
35
+ size /= 1024
36
+
37
+ def get_disk_space(path="/"):
38
+ total, used, free = shutil.disk_usage(path)
39
+
40
+ return {
41
+ "Total": convert_size(total),
42
+ "Used": convert_size(used),
43
+ "Free": convert_size(free)
44
+ }
45
+
46
+ # ---
47
+
48
+ def check_huggingface_path(checkpoint_path: str):
49
+ try:
50
+ hf_api = HfApi(token=HF_TOKEN)
51
+ hf_api.model_info(checkpoint_path, token=HF_TOKEN)
52
+ return True
53
+ except:
54
+ return False
55
+
56
+ def build_huggingface_path(domain: str):
57
+ return f"poltextlab/xlm-roberta-large-english-ILLFRAMES-{domain}"
58
+
59
+ def predict(text, model_id, tokenizer_id, label_names):
60
+ device = torch.device("cpu")
61
+ try:
62
+ model = AutoModelForSequenceClassification.from_pretrained(model_id, low_cpu_mem_usage=True, offload_folder="offload", device_map="auto", token=HF_TOKEN)
63
+ except:
64
+ disk_space = get_disk_space('/data/')
65
+ print("Disk Space Error:")
66
+ for key, value in disk_space.items():
67
+ print(f"{key}: {value}")
68
+
69
+ shutil.rmtree("/data")
70
+ model = AutoModelForSequenceClassification.from_pretrained(model_id, low_cpu_mem_usage=True, device_map="auto", token=HF_TOKEN, force_download=True)
71
+
72
+ tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
73
+
74
+ inputs = tokenizer(text,
75
+ max_length=256,
76
+ truncation=True,
77
+ padding="do_not_pad",
78
+ return_tensors="pt").to(device)
79
+ model.eval()
80
+
81
+ with torch.no_grad():
82
+ logits = model(**inputs).logits
83
+
84
+ probs = torch.nn.functional.softmax(logits, dim=1).cpu().numpy().flatten()
85
+
86
+ NUMS_DICT = {i: key for i, key in enumerate(sorted(label_names.keys()))}
87
+
88
+ output_pred = {f"[{NUMS_DICT[i]}] {label_names[NUMS_DICT[i]]}": probs[i] for i in np.argsort(probs)[::-1]}
89
+ output_info = f'<p style="text-align: center; display: block">Prediction was made using the <a href="https://huggingface.co/{model_id}">{model_id}</a> model.</p>'
90
+ return output_pred, output_info
91
+
92
+ def predict_illframes(text, language, domain):
93
+ domain = domains[domain]
94
+ model_id = build_huggingface_path(domain)
95
+ tokenizer_id = "xlm-roberta-large"
96
+
97
+ if domain == "migration":
98
+ label_names = ILLFRAMES_MIGRATION_LABEL_NAMES
99
+ elif domain == "covid":
100
+ label_names = ILLFRAMES_COVID_LABEL_NAMES
101
+ elif domain == "war":
102
+ label_names = ILLFRAMES_WAR_LABEL_NAMES
103
+
104
+ if is_disk_full():
105
+ os.system('rm -rf /data/models*')
106
+ os.system('rm -r ~/.cache/huggingface/hub')
107
+
108
+ return predict(text, model_id, tokenizer_id, label_names)
109
+
110
+ demo = gr.Interface(
111
+ title="ILLFRAMES Babel Demo",
112
+ fn=predict_illframes,
113
+ inputs=[gr.Textbox(lines=6, label="Input"),
114
+ gr.Dropdown(languages, label="Language"),
115
+ gr.Dropdown(domains.keys(), label="Domain")],
116
+ outputs=[gr.Label(num_top_classes=5, label="Output"), gr.Markdown()])
interfaces/manifesto.py CHANGED
@@ -1,4 +1,3 @@
1
- import time
2
  import gradio as gr
3
 
4
  import os
@@ -10,20 +9,7 @@ from huggingface_hub import HfApi
10
 
11
  from label_dicts import MANIFESTO_LABEL_NAMES
12
 
13
- class RuntimeMeasure:
14
- def __init__(self, msg):
15
- self.msg = msg
16
-
17
- def __enter__(self):
18
- self.start_time = time.time()
19
- return self
20
-
21
- def __exit__(self, exc_type, exc_value, traceback):
22
- end_time = time.time()
23
- runtime = end_time - self.start_time
24
- gr.Info(f"{self.msg}: {runtime} seconds")
25
- def m(msg):
26
- return RuntimeMeasure(msg)
27
 
28
  HF_TOKEN = os.environ["hf_read"]
29
 
@@ -39,44 +25,38 @@ def build_huggingface_path(language: str):
39
  return "poltextlab/xlm-roberta-large-manifesto"
40
 
41
  def predict(text, model_id, tokenizer_id):
42
- gr.Info("\n".join(os.listdir("/data/")))
43
-
44
  device = torch.device("cpu")
45
- with m("Loading model"):
46
- model = AutoModelForSequenceClassification.from_pretrained(model_id, low_cpu_mem_usage=True, device_map="auto", token=HF_TOKEN)
47
- with m("Loading tokenizer"):
48
- tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
49
-
50
- with m("Tokenizing"):
51
- inputs = tokenizer(text,
52
- max_length=256,
53
- truncation=True,
54
- padding="do_not_pad",
55
- return_tensors="pt").to(device)
56
- with m("model.eval()"):
57
- model.eval()
58
-
59
- with m("Inference"):
60
- with torch.no_grad():
61
- logits = model(**inputs).logits
62
-
63
- with m("Softmax"):
64
- probs = torch.nn.functional.softmax(logits, dim=1).cpu().numpy().flatten()
65
-
66
- with m("Output formatting"):
67
- output_pred = {f"[{model.config.id2label[i]}] {MANIFESTO_LABEL_NAMES[int(model.config.id2label[i])]}": probs[i] for i in np.argsort(probs)[::-1]}
68
- output_info = f'<p style="text-align: center; display: block">Prediction was made using the <a href="https://huggingface.co/{model_id}">{model_id}</a> model.</p>'
69
  return output_pred, output_info
70
 
71
  def predict_cap(text, language):
72
- with m("WHOLE PROCESS"):
73
- model_id = build_huggingface_path(language)
74
- tokenizer_id = "xlm-roberta-large"
75
- prediction = predict(text, model_id, tokenizer_id)
76
- return prediction
 
 
 
77
 
78
  demo = gr.Interface(
 
79
  fn=predict_cap,
80
  inputs=[gr.Textbox(lines=6, label="Input"),
81
  gr.Dropdown(languages, label="Language")],
82
- outputs=[gr.Label(num_top_classes=5, label="Output"), gr.Markdown()])
 
 
1
  import gradio as gr
2
 
3
  import os
 
9
 
10
  from label_dicts import MANIFESTO_LABEL_NAMES
11
 
12
+ from .utils import is_disk_full
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
  HF_TOKEN = os.environ["hf_read"]
15
 
 
25
  return "poltextlab/xlm-roberta-large-manifesto"
26
 
27
  def predict(text, model_id, tokenizer_id):
 
 
28
  device = torch.device("cpu")
29
+ model = AutoModelForSequenceClassification.from_pretrained(model_id, low_cpu_mem_usage=True, device_map="auto", offload_folder="offload", token=HF_TOKEN)
30
+ tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
31
+
32
+ inputs = tokenizer(text,
33
+ max_length=256,
34
+ truncation=True,
35
+ padding="do_not_pad",
36
+ return_tensors="pt").to(device)
37
+ model.eval()
38
+
39
+ with torch.no_grad():
40
+ logits = model(**inputs).logits
41
+
42
+ probs = torch.nn.functional.softmax(logits, dim=1).cpu().numpy().flatten()
43
+ output_pred = {f"[{model.config.id2label[i]}] {MANIFESTO_LABEL_NAMES[int(model.config.id2label[i])]}": probs[i] for i in np.argsort(probs)[::-1]}
44
+ output_info = f'<p style="text-align: center; display: block">Prediction was made using the <a href="https://huggingface.co/{model_id}">{model_id}</a> model.</p>'
 
 
 
 
 
 
 
 
45
  return output_pred, output_info
46
 
47
  def predict_cap(text, language):
48
+ model_id = build_huggingface_path(language)
49
+ tokenizer_id = "xlm-roberta-large"
50
+
51
+ if is_disk_full():
52
+ os.system('rm -rf /data/models*')
53
+ os.system('rm -r ~/.cache/huggingface/hub')
54
+
55
+ return predict(text, model_id, tokenizer_id)
56
 
57
  demo = gr.Interface(
58
+ title="Manifesto Babel Demo",
59
  fn=predict_cap,
60
  inputs=[gr.Textbox(lines=6, label="Input"),
61
  gr.Dropdown(languages, label="Language")],
62
+ outputs=[gr.Label(num_top_classes=5, label="Output"), gr.Markdown()])
interfaces/ner.py CHANGED
@@ -9,6 +9,8 @@ from transformers import AutoModelForSequenceClassification
9
  from transformers import AutoTokenizer
10
  from huggingface_hub import HfApi
11
 
 
 
12
  languages = [
13
  "English", "Hungarian", "Multilingual"
14
  ]
@@ -34,13 +36,16 @@ def named_entity_recognition(text, language):
34
  pipeline = huspacy.load() if model_id.startswith("hu") else spacy.load(model_id)
35
  doc = pipeline(text)
36
  entities = [{"entity":ent.label_, "start":ent.start_char, "end":ent.end_char} for ent in doc.ents]
 
 
37
  output = {"text":text, "entities":entities}
38
  model_id_hf = f"huspacy/{model_id}" if model_id.startswith("hu") else f"spacy/{model_id}"
39
- output_info = f'<p style="text-align: center; display: block">Prediction was made using the <a href="https://huggingface.co/{model_id_hf}">{model_id_hf}</a> model.</p>'
40
  return output, output_info
41
 
42
  demo = gr.Interface(
 
43
  fn=named_entity_recognition,
44
  inputs=[gr.Textbox(lines=6, label="Input"),
45
  gr.Dropdown(languages, label="Language")],
46
- outputs=[gr.HighlightedText(label='Output'), gr.Markdown()])
 
9
  from transformers import AutoTokenizer
10
  from huggingface_hub import HfApi
11
 
12
+ from spacy.glossary import GLOSSARY as NER_DICT
13
+
14
  languages = [
15
  "English", "Hungarian", "Multilingual"
16
  ]
 
36
  pipeline = huspacy.load() if model_id.startswith("hu") else spacy.load(model_id)
37
  doc = pipeline(text)
38
  entities = [{"entity":ent.label_, "start":ent.start_char, "end":ent.end_char} for ent in doc.ents]
39
+ labels_used = [ent.label_ for ent in doc.ents]
40
+ legend = '<p style="text-align: left; display: block">Legend:</p><ul style="text-align: left; display: block">'+"".join([f"<li> <b>{label}</b> = <i>{NER_DICT[label]}</i> </li>" for label in set(labels_used)])+"</ul>"
41
  output = {"text":text, "entities":entities}
42
  model_id_hf = f"huspacy/{model_id}" if model_id.startswith("hu") else f"spacy/{model_id}"
43
+ output_info = legend + f'<p style="text-align: center; display: block">Prediction was made using the <a href="https://huggingface.co/{model_id_hf}">{model_id_hf}</a> model.</p> <ul>'
44
  return output, output_info
45
 
46
  demo = gr.Interface(
47
+ title="NER Babel Demo",
48
  fn=named_entity_recognition,
49
  inputs=[gr.Textbox(lines=6, label="Input"),
50
  gr.Dropdown(languages, label="Language")],
51
+ outputs=[gr.HighlightedText(label='Output'), gr.Markdown()])
interfaces/ontolisst.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+
3
+ import os
4
+ import torch
5
+ import numpy as np
6
+ from transformers import AutoModelForSequenceClassification
7
+ from transformers import AutoTokenizer
8
+ from huggingface_hub import HfApi
9
+
10
+ HF_TOKEN = os.environ["hf_read"]
11
+
12
+ languages = [
13
+ "English"
14
+ ]
15
+
16
+ from label_dicts import ONTOLISST_LABEL_NAMES
17
+
18
+ from .utils import is_disk_full
19
+
20
+ # --- DEBUG ---
21
+ import shutil
22
+
23
def convert_size(size):
    """Format a byte count as a human-readable string with two decimals.

    The value is divided by 1024 until it drops below 1024 or the largest
    known unit (PB) is reached.

    Args:
        size: Number of bytes (int or float).

    Returns:
        A string such as ``"1.50 KB"``.
    """
    units = ['B', 'KB', 'MB', 'GB', 'TB', 'PB']
    for unit in units[:-1]:
        if size < 1024:
            return f"{size:.2f} {unit}"
        size /= 1024
    # Whatever is left is reported in the largest unit; previously a value
    # that was still >= 1024 at this point fell off the loop and the
    # function returned None.
    return f"{size:.2f} {units[-1]}"
28
+
29
def get_disk_space(path="/"):
    """Report total, used and free space of the filesystem holding *path*.

    Args:
        path: Any path on the filesystem to inspect (defaults to ``"/"``).

    Returns:
        A dict with the keys ``"Total"``, ``"Used"`` and ``"Free"``, each
        mapped to the string produced by ``convert_size``.
    """
    usage = shutil.disk_usage(path)
    report = {}
    for label, amount in (("Total", usage.total), ("Used", usage.used), ("Free", usage.free)):
        report[label] = convert_size(amount)
    return report
37
+
38
+ # ---
39
+
40
+
41
def build_huggingface_path(language: str):
    """Return the Hugging Face repository id of the ONTOLISST model.

    The same repository id is returned for every *language*; the argument
    is not inspected.
    """
    model_repo = "poltextlab/xlm-roberta-large_ontolisst_v1"
    return model_repo
43
+
44
def predict(text, model_id, tokenizer_id):
    """Classify *text* with an ONTOLISST sequence-classification model.

    Args:
        text: Input text to classify.
        model_id: Hugging Face repository id of the fine-tuned model.
        tokenizer_id: Hugging Face repository id of the tokenizer.

    Returns:
        A ``(output_pred, output_info)`` tuple. ``output_pred`` maps label
        names to probabilities, highest first, which is the dict form
        ``gr.Label`` accepts. ``output_info`` is an HTML snippet linking to
        the model that produced the prediction.
    """
    device = torch.device("cpu")
    model = AutoModelForSequenceClassification.from_pretrained(model_id, low_cpu_mem_usage=True, device_map="auto", offload_folder="offload", token=HF_TOKEN)
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)

    # --- DEBUG ---
    # Log how much room is left on the data volume after loading the model.
    disk_space = get_disk_space('/data/')
    print("Disk Space Info:")
    for key, value in disk_space.items():
        print(f"{key}: {value}")
    # ---

    model.to(device)

    inputs = tokenizer(text,
                       max_length=256,
                       truncation=True,
                       padding="do_not_pad",
                       return_tensors="pt").to(device)
    model.eval()

    with torch.no_grad():
        logits = model(**inputs).logits

    probs = torch.nn.functional.softmax(logits, dim=1).cpu().numpy().flatten()

    # The previous {4: 2, 5: 1} remap belonged to the sentiment interface; here
    # it collapsed every class into ids 0/1/2 and produced wrong labels.
    # NOTE(review): assumes the model's class indices line up with the keys of
    # ONTOLISST_LABEL_NAMES - confirm against the model config.
    output_pred = {
        ONTOLISST_LABEL_NAMES.get(int(class_id), str(int(class_id))): float(probs[class_id])
        for class_id in np.argsort(probs)[::-1]
    }

    output_info = f'<p style="text-align: center; display: block">Prediction was made using the <a href="https://huggingface.co/{model_id}">{model_id}</a> model.</p>'
    return output_pred, output_info
80
+
81
def predict_cap(text, language):
    """Gradio callback: resolve the model for *language* and classify *text*.

    When ``is_disk_full()`` reports that space is low, the cached models are
    removed with shell commands before the prediction runs.

    Returns:
        Whatever ``predict`` returns for the resolved model and the
        ``xlm-roberta-large`` tokenizer.
    """
    model_id = build_huggingface_path(language)

    if is_disk_full():
        # Free space by dropping cached model files; exit codes are ignored.
        for cleanup_command in ('rm -rf /data/models*', 'rm -r ~/.cache/huggingface/hub'):
            os.system(cleanup_command)

    return predict(text, model_id, "xlm-roberta-large")
90
+
91
+ demo = gr.Interface(
92
+ title="ONTOLISST Babel Demo",
93
+ fn=predict_cap,
94
+ inputs=[gr.Textbox(lines=6, label="Input"),
95
+ gr.Dropdown(languages, label="Language")],
96
+ outputs=[gr.Label(num_top_classes=3, label="Output"), gr.Markdown()])
interfaces/sentiment.py CHANGED
@@ -9,23 +9,33 @@ from huggingface_hub import HfApi
9
 
10
  from label_dicts import MANIFESTO_LABEL_NAMES
11
 
 
 
12
  HF_TOKEN = os.environ["hf_read"]
13
 
14
  languages = [
15
- "Czech", "English", "French", "German", "Hungarian", "Italian"
16
  ]
 
 
 
 
 
 
17
 
18
  def build_huggingface_path(language: str):
19
- return "poltextlab/xlm-roberta-large-pooled-sentiment"
 
 
20
 
21
  def predict(text, model_id, tokenizer_id):
22
  device = torch.device("cpu")
23
- model = AutoModelForSequenceClassification.from_pretrained(model_id, token=HF_TOKEN)
24
  tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
25
  model.to(device)
26
 
27
  inputs = tokenizer(text,
28
- max_length=512,
29
  truncation=True,
30
  padding="do_not_pad",
31
  return_tensors="pt").to(device)
@@ -35,17 +45,30 @@ def predict(text, model_id, tokenizer_id):
35
  logits = model(**inputs).logits
36
 
37
  probs = torch.nn.functional.softmax(logits, dim=1).cpu().numpy().flatten()
38
- output_pred = {model.config.id2label[i]: probs[i] for i in np.argsort(probs)[::-1]}
 
 
 
 
 
 
39
  output_info = f'<p style="text-align: center; display: block">Prediction was made using the <a href="https://huggingface.co/{model_id}">{model_id}</a> model.</p>'
40
  return output_pred, output_info
41
 
42
- def predict_cap(text, language):
43
  model_id = build_huggingface_path(language)
44
  tokenizer_id = "xlm-roberta-large"
 
 
 
 
 
45
  return predict(text, model_id, tokenizer_id)
46
 
47
  demo = gr.Interface(
 
48
  fn=predict_cap,
49
  inputs=[gr.Textbox(lines=6, label="Input"),
50
- gr.Dropdown(languages, label="Language")],
51
- outputs=[gr.Label(num_top_classes=3, label="Output"), gr.Markdown()])
 
 
9
 
10
  from label_dicts import MANIFESTO_LABEL_NAMES
11
 
12
+ from .utils import is_disk_full
13
+
14
  HF_TOKEN = os.environ["hf_read"]
15
 
16
  languages = [
17
+ "Czech", "English", "French", "German", "Hungarian", "Polish", "Slovak"
18
  ]
19
+ domains = {
20
+ "parliamentary speech": "parlspeech",
21
+ }
22
+
23
+ SENTIMENT_LABEL_NAMES = {0: "Negative", 1: "No sentiment or Neutral sentiment", 2: "Positive"}
24
+
25
 
26
def build_huggingface_path(language: str):
    """Pick the Hugging Face repository id used for *language*.

    "Czech" and "Slovak" are served by a separate repository; every other
    value falls back to the pooled model.
    """
    dedicated_model_languages = ("Czech", "Slovak")
    if language in dedicated_model_languages:
        model_repo = "visegradmedia-emotion/Emotion_RoBERTa_pooled_V4"
    else:
        model_repo = "poltextlab/xlm-roberta-large-pooled-MORES"
    return model_repo
30
 
31
  def predict(text, model_id, tokenizer_id):
32
  device = torch.device("cpu")
33
+ model = AutoModelForSequenceClassification.from_pretrained(model_id, low_cpu_mem_usage=True, device_map="auto", offload_folder="offload", token=HF_TOKEN)
34
  tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
35
  model.to(device)
36
 
37
  inputs = tokenizer(text,
38
+ max_length=256,
39
  truncation=True,
40
  padding="do_not_pad",
41
  return_tensors="pt").to(device)
 
45
  logits = model(**inputs).logits
46
 
47
  probs = torch.nn.functional.softmax(logits, dim=1).cpu().numpy().flatten()
48
+ predicted_class_id = probs.argmax()
49
+ predicted_class_id = {4: 2, 5: 1}.get(predicted_class_id, 0)
50
+
51
+
52
+ output_pred = SENTIMENT_LABEL_NAMES.get(predicted_class_id, predicted_class_id)
53
+
54
+
55
  output_info = f'<p style="text-align: center; display: block">Prediction was made using the <a href="https://huggingface.co/{model_id}">{model_id}</a> model.</p>'
56
  return output_pred, output_info
57
 
58
def predict_cap(text, language, domain):
    """Gradio callback: resolve the model for *language* and classify *text*.

    *domain* is accepted because the interface passes it, but it is not used
    in this function. When ``is_disk_full()`` reports that space is low, the
    cached models are removed with shell commands before predicting.

    Returns:
        Whatever ``predict`` returns for the resolved model and the
        ``xlm-roberta-large`` tokenizer.
    """
    model_id = build_huggingface_path(language)

    if is_disk_full():
        # Free space by dropping cached model files; exit codes are ignored.
        for cleanup_command in ('rm -rf /data/models*', 'rm -r ~/.cache/huggingface/hub'):
            os.system(cleanup_command)

    return predict(text, model_id, "xlm-roberta-large")
67
 
68
  demo = gr.Interface(
69
+ title="Sentiment (3) Babel Demo",
70
  fn=predict_cap,
71
  inputs=[gr.Textbox(lines=6, label="Input"),
72
+ gr.Dropdown(languages, label="Language"),
73
+ gr.Dropdown(domains.keys(), label="Domain")],
74
+ outputs=[gr.Label(num_top_classes=3, label="Output"), gr.Markdown()])
interfaces/utils.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import shutil
2
+
3
def is_disk_full(min_free_space_in_GB=10, path="/"):
    """Tell whether the filesystem holding *path* is low on free space.

    Args:
        min_free_space_in_GB: Minimum free space, in GiB, that still counts
            as "enough".
        path: Any path on the filesystem to check. Defaults to ``"/"`` (the
            previous hard-coded value) so existing callers are unaffected;
            pass the cache directory to check the volume that is cleaned.

    Returns:
        ``True`` when less than *min_free_space_in_GB* is free, else ``False``.
    """
    free_gb = shutil.disk_usage(path).free / (1024 ** 3)

    if free_gb >= min_free_space_in_GB:
        print(f'enough space available ({free_gb} GB)')
        return False

    print('clean up!')
    return True
label_dicts.py CHANGED
@@ -21,6 +21,220 @@ CAP_NUM_DICT = {
21
  19: 21,
22
  20: 23,
23
  21: 999,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  }
25
 
26
  CAP_LABEL_NAMES = {
@@ -48,6 +262,245 @@ CAP_LABEL_NAMES = {
48
  999: "No Policy Content"
49
  }
50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  MANIFESTO_LABEL_NAMES = {
52
  0: "No Policy Goal",
53
  999: "No Policy Goal",
@@ -107,4 +560,70 @@ MANIFESTO_LABEL_NAMES = {
107
  704: "Middle Class and Professional Groups",
108
  705: "Underprivileged Minority Groups",
109
  706: "Non-economic Demographic Groups"
110
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  19: 21,
22
  20: 23,
23
  21: 999,
24
+ 22: 999, # had to do this because of some language-domain models (e.g. english media)
25
+ }
26
+
27
+ CAP_MIN_NUM_DICT = {
28
+ 0: 100,
29
+ 1: 101,
30
+ 2: 103,
31
+ 3: 104,
32
+ 4: 105,
33
+ 5: 107,
34
+ 6: 108,
35
+ 7: 110,
36
+ 8: 199,
37
+ 9: 200,
38
+ 10: 201,
39
+ 11: 202,
40
+ 12: 204,
41
+ 13: 205,
42
+ 14: 206,
43
+ 15: 207,
44
+ 16: 208,
45
+ 17: 209,
46
+ 18: 299,
47
+ 19: 300,
48
+ 20: 301,
49
+ 21: 302,
50
+ 22: 321,
51
+ 23: 322,
52
+ 24: 323,
53
+ 25: 324,
54
+ 26: 325,
55
+ 27: 331,
56
+ 28: 332,
57
+ 29: 333,
58
+ 30: 334,
59
+ 31: 335,
60
+ 32: 341,
61
+ 33: 342,
62
+ 34: 398,
63
+ 35: 399,
64
+ 36: 400,
65
+ 37: 401,
66
+ 38: 402,
67
+ 39: 403,
68
+ 40: 404,
69
+ 41: 405,
70
+ 42: 408,
71
+ 43: 498,
72
+ 44: 499,
73
+ 45: 500,
74
+ 46: 501,
75
+ 47: 502,
76
+ 48: 503,
77
+ 49: 504,
78
+ 50: 505,
79
+ 51: 506,
80
+ 52: 529,
81
+ 53: 599,
82
+ 54: 600,
83
+ 55: 601,
84
+ 56: 602,
85
+ 57: 603,
86
+ 58: 604,
87
+ 59: 606,
88
+ 60: 607,
89
+ 61: 698,
90
+ 62: 699,
91
+ 63: 700,
92
+ 64: 701,
93
+ 65: 703,
94
+ 66: 704,
95
+ 67: 705,
96
+ 68: 707,
97
+ 69: 708,
98
+ 70: 709,
99
+ 71: 711,
100
+ 72: 798,
101
+ 73: 799,
102
+ 74: 800,
103
+ 75: 801,
104
+ 76: 802,
105
+ 77: 803,
106
+ 78: 805,
107
+ 79: 806,
108
+ 80: 807,
109
+ 81: 898,
110
+ 82: 899,
111
+ 83: 900,
112
+ 84: 1000,
113
+ 85: 1001,
114
+ 86: 1002,
115
+ 87: 1003,
116
+ 88: 1005,
117
+ 89: 1007,
118
+ 90: 1010,
119
+ 91: 1098,
120
+ 92: 1099,
121
+ 93: 1200,
122
+ 94: 1201,
123
+ 95: 1202,
124
+ 96: 1203,
125
+ 97: 1204,
126
+ 98: 1205,
127
+ 99: 1206,
128
+ 100: 1207,
129
+ 101: 1208,
130
+ 102: 1210,
131
+ 103: 1211,
132
+ 104: 1227,
133
+ 105: 1299,
134
+ 106: 1300,
135
+ 107: 1302,
136
+ 108: 1303,
137
+ 109: 1304,
138
+ 110: 1305,
139
+ 111: 1308,
140
+ 112: 1399,
141
+ 113: 1400,
142
+ 114: 1401,
143
+ 115: 1403,
144
+ 116: 1404,
145
+ 117: 1405,
146
+ 118: 1406,
147
+ 119: 1407,
148
+ 120: 1408,
149
+ 121: 1409,
150
+ 122: 1498,
151
+ 123: 1499,
152
+ 124: 1500,
153
+ 125: 1501,
154
+ 126: 1502,
155
+ 127: 1504,
156
+ 128: 1505,
157
+ 129: 1507,
158
+ 130: 1520,
159
+ 131: 1521,
160
+ 132: 1522,
161
+ 133: 1523,
162
+ 134: 1524,
163
+ 135: 1525,
164
+ 136: 1526,
165
+ 137: 1598,
166
+ 138: 1599,
167
+ 139: 1600,
168
+ 140: 1602,
169
+ 141: 1603,
170
+ 142: 1604,
171
+ 143: 1605,
172
+ 144: 1606,
173
+ 145: 1608,
174
+ 146: 1610,
175
+ 147: 1611,
176
+ 148: 1612,
177
+ 149: 1614,
178
+ 150: 1615,
179
+ 151: 1616,
180
+ 152: 1617,
181
+ 153: 1619,
182
+ 154: 1620,
183
+ 155: 1698,
184
+ 156: 1699,
185
+ 157: 1700,
186
+ 158: 1701,
187
+ 159: 1704,
188
+ 160: 1705,
189
+ 161: 1706,
190
+ 162: 1707,
191
+ 163: 1708,
192
+ 164: 1709,
193
+ 165: 1798,
194
+ 166: 1799,
195
+ 167: 1800,
196
+ 168: 1802,
197
+ 169: 1803,
198
+ 170: 1804,
199
+ 171: 1806,
200
+ 172: 1807,
201
+ 173: 1808,
202
+ 174: 1899,
203
+ 175: 1900,
204
+ 176: 1901,
205
+ 177: 1902,
206
+ 178: 1905,
207
+ 179: 1906,
208
+ 180: 1910,
209
+ 181: 1921,
210
+ 182: 1925,
211
+ 183: 1926,
212
+ 184: 1927,
213
+ 185: 1929,
214
+ 186: 1999,
215
+ 187: 2000,
216
+ 188: 2001,
217
+ 189: 2002,
218
+ 190: 2003,
219
+ 191: 2004,
220
+ 192: 2005,
221
+ 193: 2006,
222
+ 194: 2007,
223
+ 195: 2008,
224
+ 196: 2009,
225
+ 197: 2010,
226
+ 198: 2011,
227
+ 199: 2012,
228
+ 200: 2013,
229
+ 201: 2014,
230
+ 202: 2015,
231
+ 203: 2030,
232
+ 204: 2099,
233
+ 205: 2100,
234
+ 206: 2101,
235
+ 207: 2102,
236
+ 208: 2103,
237
+ 209: 2104
238
  }
239
 
240
  CAP_LABEL_NAMES = {
 
262
  999: "No Policy Content"
263
  }
264
 
265
+ CAP_MIN_LABEL_NAMES = {
266
+ # 1. Macroeconomics
267
+ 100: "General",
268
+ 101: "Interest Rates",
269
+ 103: "Unemployment Rate",
270
+ 104: "Monetary Policy",
271
+ 105: "National Budget",
272
+ 107: "Tax Code",
273
+ 108: "Industrial Policy",
274
+ 110: "Price Control",
275
+ 199: "Other",
276
+ # 2. Civil Rights
277
+ 200: "General",
278
+ 201: "Minority Discrimination",
279
+ 202: "Gender Discrimination",
280
+ 204: "Age Discrimination",
281
+ 205: "Handicap Discrimination",
282
+ 206: "Voting Rights",
283
+ 207: "Freedom of Speech",
284
+ 208: "Right to Privacy",
285
+ 209: "Anti-Government",
286
+ 299: "Other",
287
+ # 3. Health
288
+ 300: "General",
289
+ 301: "Health Care Reform",
290
+ 302: "Insurance",
291
+ 321: "Drug Industry",
292
+ 322: "Medical Facilities",
293
+ 323: "Insurance Providers",
294
+ 324: "Medical Liability",
295
+ 325: "Manpower",
296
+ 331: "Disease Prevention",
297
+ 332: "Infants and Children",
298
+ 333: "Mental Health",
299
+ 334: "Long-term Care",
300
+ 335: "Drug Coverage and Cost",
301
+ 341: "Tobacco Abuse",
302
+ 342: "Drug and Alcohol Abuse",
303
+ 398: "R&D",
304
+ 399: "Other",
305
+ # 4. Agriculture
306
+ 400: "General",
307
+ 401: "Trade",
308
+ 402: "Subsidies to Farmers",
309
+ 403: "Food Inspection & Safety",
310
+ 404: "Food Marketing & Promotion",
311
+ 405: "Animal and Crop Disease",
312
+ 408: "Fisheries & Fishing",
313
+ 498: "R&D",
314
+ 499: "Other",
315
+ # 5. Labor
316
+ 500: "General",
317
+ 501: "Worker Safety",
318
+ 502: "Employment Training",
319
+ 503: "Employee Benefits",
320
+ 504: "Labor Unions",
321
+ 505: "Fair Labor Standards",
322
+ 506: "Youth Employment",
323
+ 529: "Migrant and Seasonal",
324
+ 599: "Other",
325
+ # 6. Education
326
+ 600: "General",
327
+ 601: "Higher",
328
+ 602: "Elementary & Secondary",
329
+ 603: "Underprivileged",
330
+ 604: "Vocational",
331
+ 606: "Special",
332
+ 607: "Excellence",
333
+ 698: "R&D",
334
+ 699: "Other",
335
+ # 7. Environment
336
+ 700: "General",
337
+ 701: "Drinking Water",
338
+ 703: "Waste Disposal",
339
+ 704: "Hazardous Waste",
340
+ 705: "Air Pollution",
341
+ 707: "Recycling",
342
+ 708: "Indoor Hazards",
343
+ 709: "Species & Forest",
344
+ 711: "Land and Water Conservation",
345
+ 798: "R&D",
346
+ 799: "Other",
347
+ # 8. Energy
348
+ 800: "General",
349
+ 801: "Nuclear",
350
+ 802: "Electricity",
351
+ 803: "Natural Gas & Oil",
352
+ 805: "Coal",
353
+ 806: "Alternative & Renewable",
354
+ 807: "Conservation",
355
+ 898: "R&D",
356
+ 899: "Other",
357
+ # 9. Immigration
358
+ 900: "Immigration",
359
+ # 10. Transportation
360
+ 1000: "General",
361
+ 1001: "Mass",
362
+ 1002: "Highways",
363
+ 1003: "Air Travel",
364
+ 1005: "Railroad Travel",
365
+ 1007: "Maritime",
366
+ 1010: "Infrastructure",
367
+ 1098: "R&D",
368
+ 1099: "Other",
369
+ # 12. Law and Crime
370
+ 1200: "General",
371
+ 1201: "Agencies",
372
+ 1202: "White Collar Crime",
373
+ 1203: "Illegal Drugs",
374
+ 1204: "Court Administration",
375
+ 1205: "Prisons",
376
+ 1206: "Juvenile Crime",
377
+ 1207: "Child Abuse",
378
+ 1208: "Family Issues",
379
+ 1210: "Criminal & Civil Code",
380
+ 1211: "Crime Control",
381
+ 1227: "Police",
382
+ 1299: "Other",
383
+ # 13. Social Welfare
384
+ 1300: "General",
385
+ 1302: "Low-Income Assistance",
386
+ 1303: "Elderly Assistance",
387
+ 1304: "Disabled Assistance",
388
+ 1305: "Volunteer Associations",
389
+ 1308: "Child Care",
390
+ 1399: "Other",
391
+ # 14. Housing
392
+ 1400: "General",
393
+ 1401: "Community Development",
394
+ 1403: "Urban Development",
395
+ 1404: "Rural Housing",
396
+ 1405: "Rural Development",
397
+ 1406: "Low-Income Assistance",
398
+ 1407: "Veterans",
399
+ 1408: "Elderly",
400
+ 1409: "Homeless",
401
+ 1498: "R&D",
402
+ 1499: "Other",
403
+ # 15. Domestic Commerce
404
+ 1500: "General",
405
+ 1501: "Banking",
406
+ 1502: "Securities & Commodities",
407
+ 1504: "Consumer Finance",
408
+ 1505: "Insurance Regulation",
409
+ 1507: "Bankruptcy",
410
+ 1520: "Corporate Management",
411
+ 1521: "Small Businesses",
412
+ 1522: "Copyrights and Patents",
413
+ 1523: "Disaster Relief",
414
+ 1524: "Tourism",
415
+ 1525: "Consumer Safety",
416
+ 1526: "Sports Regulation",
417
+ 1598: "R&D",
418
+ 1599: "Other",
419
+ # 16. Defense
420
+ 1600: "General",
421
+ 1602: "Alliances",
422
+ 1603: "Intelligence",
423
+ 1604: "Readiness",
424
+ 1605: "Nuclear Arms",
425
+ 1606: "Military Aid",
426
+ 1608: "Personnel Issues",
427
+ 1610: "Procurement",
428
+ 1611: "Installations & Land",
429
+ 1612: "Reserve Forces",
430
+ 1614: "Hazardous Waste",
431
+ 1615: "Civil",
432
+ 1616: "Civilian Personnel",
433
+ 1617: "Contractors",
434
+ 1619: "Foreign Operations",
435
+ 1620: "Claims against Military",
436
+ 1698: "R&D",
437
+ 1699: "Other",
438
+ # 17. Technology
439
+ 1700: "General",
440
+ 1701: "Space",
441
+ 1704: "Commercial Use of Space",
442
+ 1705: "Science Transfer",
443
+ 1706: "Telecommunications",
444
+ 1707: "Broadcast",
445
+ 1708: "Weather Forecasting",
446
+ 1709: "Computers",
447
+ 1798: "R&D",
448
+ 1799: "Other",
449
+ # 18. Foreign Trade
450
+ 1800: "General",
451
+ 1802: "Trade Agreements",
452
+ 1803: "Exports",
453
+ 1804: "Private Investments",
454
+ 1806: "Competitiveness",
455
+ 1807: "Tariff & Imports",
456
+ 1808: "Exchange Rates",
457
+ 1899: "Other",
458
+ # 19. International Affairs
459
+ 1900: "General",
460
+ 1901: "Foreign Aid",
461
+ 1902: "Resources Exploitation",
462
+ 1905: "Developing Countries",
463
+ 1906: "International Finance",
464
+ 1910: "Western Europe",
465
+ 1921: "Specific Country",
466
+ 1925: "Human Rights",
467
+ 1926: "Organizations",
468
+ 1927: "Terrorism",
469
+ 1929: "Diplomats",
470
+ 1999: "Other",
471
+ # 20. Government Operations
472
+ 2000: "General",
473
+ 2001: "Intergovernmental Relations",
474
+ 2002: "Bureaucracy",
475
+ 2003: "Postal Service",
476
+ 2004: "Employees",
477
+ 2005: "Appointments",
478
+ 2006: "Currency",
479
+ 2007: "Procurement & Contractors",
480
+ 2008: "Property Management",
481
+ 2009: "Tax Administration",
482
+ 2010: "Scandals",
483
+ 2011: "Branch Relations",
484
+ 2012: "Political Campaigns",
485
+ 2013: "Census & Statistics",
486
+ 2014: "Capital City",
487
+ 2015: "Claims against the government",
488
+ 2030: "National Holidays",
489
+ 2099: "Other",
490
+ # 21. Public Lands
491
+ 2100: "General",
492
+ 2101: "National Parks",
493
+ 2102: "Indigenous Affairs",
494
+ 2103: "Public Lands",
495
+ 2104: "Water Resources",
496
+ 2105: "Dependencies & Territories",
497
+ 2199: "Other",
498
+ # 23. Culture
499
+ 2300: "General",
500
+ # NPC
501
+ 9999: "No Policy Content",
502
+ }
503
+
504
  MANIFESTO_LABEL_NAMES = {
505
  0: "No Policy Goal",
506
  999: "No Policy Goal",
 
560
  704: "Middle Class and Professional Groups",
561
  705: "Underprivileged Minority Groups",
562
  706: "Non-economic Demographic Groups"
563
+ }
564
+
565
# Display names for the migration frame codes.
ILLFRAMES_MIGRATION_LABEL_NAMES = {
    901: "Culture Under Attack",
    902: "Economic Burden",
    903: "Illegals and Fraudsters",
    904: "Extradition Necessity",
    905: "Nation State Should Decide",  # was misspelled "Nation tate"
    906: "Administrative Burden",
    907: "General System Failure",
    908: "Security Threat",
    909: "Criminals",
    910: "Welfare State Overload",
    999: "None of Them",
}
578
+
579
+ ILLFRAMES_COVID_LABEL_NAMES = {
580
+ 310: "Skepticism",
581
+ 311: "Great Reset and Elite Control",
582
+ 312: "Undermining the Economy",
583
+ 313: "Medical Choice",
584
+ 314: "Media Fabrication",
585
+ 315: "Threatening Way of Life",
586
+ 399: "None of Them",
587
+ }
588
+
589
# Display names for the war frame codes.
# NOTE(review): there is no entry for code 106 - confirm this is intentional.
ILLFRAMES_WAR_LABEL_NAMES = {
    101: 'Identity and Cultural Threat',
    102: 'Economic Fallout/Domestic Welfare Neglected',
    103: 'Violation of Russian Sovereignty/Western geopolitical meddling',
    104: 'Illegitimate and corrupt Ukraine leadership',
    # Spelling fixed: was "agressive war-mongerer".
    105: 'Ukrainians and Ukraine are a military threat and aggressive warmonger that threaten EU stability and security',
    107: 'Western Propaganda and Civilian Suffering',
    108: 'Historical Betrayal of Russia',
    109: 'Ukraine/Nazi Allegation',
    110: "None of Them"
}
600
+
601
+ ONTOLISST_LABEL_NAMES = {
602
+ 0: 'Demographics',
603
+ 1: 'Housing and local environment (Housing and environment)',
604
+ 2: 'Physical health',
605
+ 3: 'Mental health and mental processes',
606
+ 4: 'Healthcare',
607
+ 5: 'Health behaviour (Health and lifestyle)',
608
+ 6: 'Family and social networks',
609
+ 7: 'Education',
610
+ 8: 'Employment and income (Employment and pensions)',
611
+ 9: 'Expectation, attitudes and beliefs (Attitudes and beliefs)',
612
+ 10: 'Child development',
613
+ 11: 'Life events',
614
+ 12: 'Omics',
615
+ 13: 'Pregnancy',
616
+ 14: 'Administration',
617
+ 15: 'COVID19'
618
+ }
619
+ EMOTION9_LABEL_NAMES = {
620
+ 0: "Anger",
621
+ 1: "Fear",
622
+ 2: "Disgust",
623
+ 3: "Sadness",
624
+ 4: "Joy",
625
+ 5: "Enthusiasm",
626
+ 6: "Hope",
627
+ 7: "Pride",
628
+ 8: "None of Them",
629
+ }
requirements.txt CHANGED
@@ -4,4 +4,5 @@ transformers==4.39.1
4
  sentencepiece==0.2.0
5
  accelerate
6
  spacy
7
- huspacy
 
 
4
  sentencepiece==0.2.0
5
  accelerate
6
  spacy
7
+ huspacy
8
+ numpy==1.26.4
utils.py CHANGED
@@ -1,36 +1,72 @@
1
  import os
2
- from transformers import AutoTokenizer, AutoModelForSequenceClassification
 
3
 
4
- """
5
- from interfaces.manifesto import languages as languages_manifesto
6
- from interfaces.manifesto import languages as languages_manifesto
7
- from interfaces.manifesto import languages as languages_manifesto
8
- """
9
 
10
  from interfaces.cap import languages as languages_cap
11
  from interfaces.cap import domains as domains_cap
12
 
 
 
 
 
13
  from interfaces.cap import build_huggingface_path as hf_cap_path
 
14
  from interfaces.manifesto import build_huggingface_path as hf_manifesto_path
15
  from interfaces.sentiment import build_huggingface_path as hf_sentiment_path
16
  from interfaces.emotion import build_huggingface_path as hf_emotion_path
 
 
 
 
17
 
18
  HF_TOKEN = os.environ["hf_read"]
19
 
20
  # should be a temporary solution
21
- models = [hf_manifesto_path(""), hf_sentiment_path(""), hf_emotion_path("")]
 
 
 
22
  for language in languages_cap:
23
  for domain in domains_cap:
24
  models.append(hf_cap_path(language, domain))
 
 
 
 
 
 
 
 
25
 
26
  tokenizers = ["xlm-roberta-large"]
27
 
28
  def download_hf_models():
29
  for model_id in models:
30
- model = AutoModelForSequenceClassification.from_pretrained(model_id, low_cpu_mem_usage=True, device_map="auto",
31
  token=HF_TOKEN)
32
- del model
33
  for tokenizer_id in tokenizers:
34
- tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
35
- del tokenizer
 
 
 
 
 
 
 
 
 
 
 
 
36
 
 
 
 
 
 
 
 
 
 
1
  import os
2
+ import shutil
3
+ import subprocess
4
 
5
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
 
 
 
 
6
 
7
  from interfaces.cap import languages as languages_cap
8
  from interfaces.cap import domains as domains_cap
9
 
10
+ from interfaces.emotion9 import languages as languages_emotion9
11
+
12
+ from interfaces.illframes import domains as domains_illframes
13
+
14
  from interfaces.cap import build_huggingface_path as hf_cap_path
15
+ from interfaces.cap_minor import build_huggingface_path as hf_cap_minor_path
16
  from interfaces.manifesto import build_huggingface_path as hf_manifesto_path
17
  from interfaces.sentiment import build_huggingface_path as hf_sentiment_path
18
  from interfaces.emotion import build_huggingface_path as hf_emotion_path
19
+ from interfaces.emotion9 import build_huggingface_path as hf_emotion9_path
20
+ from interfaces.ontolisst import build_huggingface_path as hf_ontlisst_path
21
+ from interfaces.illframes import build_huggingface_path as hf_illframes_path
22
+ from interfaces.ontolisst import build_huggingface_path as hf_ontolisst_path
23
 
24
  HF_TOKEN = os.environ["hf_read"]
25
 
26
  # should be a temporary solution
27
+ models = [hf_manifesto_path(""), hf_sentiment_path(""), hf_emotion_path(""), hf_cap_minor_path("", ""), hf_ontolisst_path("")]
28
+
29
+ # it gets more difficult with cap
30
+ domains_cap = list(domains_cap.values())
31
  for language in languages_cap:
32
  for domain in domains_cap:
33
  models.append(hf_cap_path(language, domain))
34
+
35
+ # emotion9
36
+ for language in languages_emotion9:
37
+ models.append(hf_emotion9_path(language))
38
+
39
+ # illframes (domains is a dict for some reason?)
40
+ for domain in domains_illframes.values():
41
+ models.append(hf_illframes_path(domain))
42
 
43
  tokenizers = ["xlm-roberta-large"]
44
 
45
def download_hf_models():
    """Load every entry of ``models`` and ``tokenizers`` once and discard it.

    NOTE(review): presumably this exists to populate the local Hugging Face
    cache ahead of the first request - confirm.
    """
    model_kwargs = {
        "low_cpu_mem_usage": True,
        "device_map": "auto",
        "offload_folder": "offload",
        "token": HF_TOKEN,
    }
    for repo_id in models:
        AutoModelForSequenceClassification.from_pretrained(repo_id, **model_kwargs)
    for repo_id in tokenizers:
        AutoTokenizer.from_pretrained(repo_id)
51
+
52
+
53
def df_h():
    """Run ``df -H`` and print its captured standard output."""
    completed = subprocess.run(
        ["df", "-H"],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )
    print(completed.stdout)
56
+
57
+
58
def set_hf_cache_dir(path:str):
    """Set the four cache-related environment variables to *path*.

    The variables written are ``TRANSFORMERS_CACHE``, ``HF_HOME``,
    ``HF_DATASETS_CACHE`` and ``TORCH_HOME``.
    """
    cache_variables = ('TRANSFORMERS_CACHE', 'HF_HOME', 'HF_DATASETS_CACHE', 'TORCH_HOME')
    os.environ.update({variable: path for variable in cache_variables})
63
+
64
 
65
def is_disk_full(min_free_space_in_GB=10, path="/"):
    """Tell whether the filesystem holding *path* is low on free space.

    Args:
        min_free_space_in_GB: Minimum free space, in GiB, that still counts
            as "enough".
        path: Any path on the filesystem to check. Defaults to ``"/"`` (the
            previous hard-coded value) so existing callers are unaffected.

    Returns:
        ``True`` when less than *min_free_space_in_GB* is free, else ``False``.
    """
    free_gb = shutil.disk_usage(path).free / (1024 ** 3)
    return not free_gb >= min_free_space_in_GB