kovacsvi committed
Commit · 4bba8df
1 Parent(s): 6d39e54

up-to-date prod demo
- README.md +2 -2
- app.py +36 -25
- interfaces/cap.py +27 -41
- interfaces/cap_minor.py +83 -0
- interfaces/emotion.py +20 -6
- interfaces/emotion9.py +65 -0
- interfaces/illframes.py +116 -0
- interfaces/manifesto.py +27 -47
- interfaces/ner.py +7 -2
- interfaces/ontolisst.py +96 -0
- interfaces/sentiment.py +31 -8
- interfaces/utils.py +12 -0
- label_dicts.py +520 -1
- requirements.txt +2 -1
- utils.py +47 -11
README.md CHANGED
@@ -1,6 +1,6 @@
 ---
 title: Babel Machine Demo Dev
-emoji:
+emoji: π
 colorFrom: pink
 colorTo: indigo
 sdk: gradio
@@ -10,4 +10,4 @@ pinned: false
 short_description: CAP, Manifesto, sentiment, emotion classification
 ---
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py CHANGED
@@ -1,47 +1,58 @@
 import os
-PATH = '/data/'
-os.environ['TRANSFORMERS_CACHE'] = PATH
-os.environ['HF_HOME'] = PATH
-os.environ['HF_DATASETS_CACHE'] = PATH
-os.environ['TORCH_HOME'] = PATH
-
 import gradio as gr
 
-from spacy import glossary
 from interfaces.cap import demo as cap_demo
+from interfaces.cap_minor import demo as cap_minor_demo
 from interfaces.manifesto import demo as manifesto_demo
 from interfaces.sentiment import demo as sentiment_demo
 from interfaces.emotion import demo as emotion_demo
 from interfaces.ner import demo as ner_demo
 from interfaces.ner import download_models as download_spacy_models
-from
+from interfaces.illframes import demo as illframes_demo
+from interfaces.ontolisst import demo as ontolisst_demo
+from interfaces.emotion9 import demo as e9_demo
+from utils import download_hf_models, df_h, set_hf_cache_dir
 
+css = """
+/* Make only the active tab bold */
+.svelte-1uw5tnk[aria-selected="true"] {
+    font-weight: bold;
+    background: linear-gradient(to bottom right, var(--primary-100), var(--primary-300));
+    color: var(--primary-600)
+}
+"""
 
-with gr.Blocks() as demo:
+with gr.Blocks(css=css) as demo:
     gr.Markdown(
         f"""
-        <
-        <
-        <
-        <
-        </
+        <style>
+        @import 'https://fonts.googleapis.com/css?family=Source+Sans+Pro:300,400';
+        </style>
+        <div style="display: block; text-align: left; padding:0; margin:0;font-family: "Source Sans Pro", Helvetica, sans-serif;">
+        <h1 style="text-align: center;font-size: 17pt;">Babel Machine Demo</h1>
+        <p style="font-size: 14pt;">This is a demo for text classification using language models finetuned on data labeled by <a href="https://www.comparativeagendas.net/">CAP</a>, <a href="https://manifesto-project.wzb.eu/">Manifesto Project</a>, sentiment, emotion coding and Named Entity Recognition systems.
+        For the coding of complete datasets, please visit the official <a href="https://babel.poltextlab.com/">Babel Machine</a> site.<br>
+        Please note that the sentiment (3) and emotions (6) models have been trained using parliamentary speech data, so the results for generic sentences may not be reliable. The emotions (9) models have been trained using <a href="https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/C9SAIX">this dataset</a>. It contains labeled parliamentary speeches and social media data. Under-represented categories were enriched with synthetic data.<br>
+        <br>
+        The models listed for Manifesto, Sentiment (3) and Emotions (6) tasks are a beta version and thus not publicly available,
+        the Hugging Face link will not work for them for the time being. We expect a public version after tests and improvements in the Fall.
+        Please feel free to check back for model updates, or reach out to us at that point if you wish to ask about a specific model.
+        </p>
         </div>
         """)
 
     gr.TabbedInterface(
-        interface_list=[cap_demo, manifesto_demo, sentiment_demo, emotion_demo, ner_demo],
-        tab_names=["CAP", "Manifesto", "Sentiment (3)", "Emotions (
+        interface_list=[cap_demo, cap_minor_demo, manifesto_demo, sentiment_demo, emotion_demo, e9_demo, illframes_demo, ner_demo, ontolisst_demo],
+        tab_names=["CAP", "CAP Minor Codes", "Manifesto", "Sentiment (3)", "Emotions (6)", "Emotions (9)", "ILLFRAMES", "Named Entity Recognition", "ONTOLISST"]
    )
 
 if __name__ == "__main__":
-
-    download_spacy_models()
+    set_hf_cache_dir("/data")
+    download_spacy_models()
+    download_hf_models() # does this affect the build?
+    df_h() # debug -> check disk space before launching demo
     demo.launch()
 
-# TODO: add all languages & domains
+# TODO: add all languages & domains

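Note: the new app.py imports download_hf_models, df_h, and set_hf_cache_dir from utils, but the updated utils.py is only partially visible at the end of this commit. A minimal sketch of what such helpers could look like, assuming they simply point the Hugging Face/Torch caches at the persistent /data volume and print disk usage (only the names come from the import; the bodies below are assumptions, not the committed code):

# Hypothetical sketch only; not the actual utils.py from this commit.
import os
import shutil

def set_hf_cache_dir(path: str) -> None:
    # Redirect the Hugging Face / Torch caches (the old app.py set these env vars inline).
    for var in ("HF_HOME", "TRANSFORMERS_CACHE", "HF_DATASETS_CACHE", "TORCH_HOME"):
        os.environ[var] = path

def df_h(path: str = "/") -> None:
    # Print total/used/free space in GB, roughly like `df -h`.
    total, used, free = shutil.disk_usage(path)
    gb = 1024 ** 3
    print(f"Total: {total / gb:.1f} GB | Used: {used / gb:.1f} GB | Free: {free / gb:.1f} GB")
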
interfaces/cap.py CHANGED
@@ -7,26 +7,16 @@ import pandas as pd
 from transformers import AutoModelForSequenceClassification
 from transformers import AutoTokenizer
 from huggingface_hub import HfApi
-from huggingface_hub.utils._errors import RepositoryNotFoundError
 
 from label_dicts import CAP_NUM_DICT, CAP_LABEL_NAMES
 
+from .utils import is_disk_full
+
 HF_TOKEN = os.environ["hf_read"]
 
 languages = [
-    "Danish",
-    "Dutch",
     "English",
-    "
-    "German",
-    "Hungarian",
-    "Italian",
-    "Polish",
-    "Portuguese",
-    "Spanish",
-    "Czech",
-    "Slovak",
-    "Norwegian"
+    "Multilingual"
 ]
 
 domains = {
@@ -48,12 +38,19 @@ def check_huggingface_path(checkpoint_path: str):
         hf_api = HfApi(token=HF_TOKEN)
         hf_api.model_info(checkpoint_path, token=HF_TOKEN)
         return True
-    except
+    except:
         return False
 
 def build_huggingface_path(language: str, domain: str):
+    language = language.lower()
     base_path = "xlm-roberta-large"
-
+
+    if language == "english" and (domain == "media" or domain == "legislative"):
+        lang_domain_path = f"poltextlab/{base_path}-{language}-{domain}-cap-v4"
+        return lang_domain_path
+    else:
+        lang_domain_path = f"poltextlab/{base_path}-{language}-{domain}-cap-v3"
+
     lang_path = f"poltextlab/{base_path}-{language}-cap-v3"
 
     path_map = {
@@ -75,48 +72,31 @@ def build_huggingface_path(language: str, domain: str):
     except (AttributeError, FileNotFoundError):
         value = None
 
-    if
-        model_path =
-
-
-
-    filtered_path_map = {k: v for k, v in path_map.items() if k != value}
-    for k, v in filtered_path_map.items():
-        if check_huggingface_path(v):
-            return v
-    elif check_huggingface_path(lang_domain_path):
-        return lang_domain_path
-    elif check_huggingface_path(lang_path):
-        return lang_path
+    if language == 'english':
+        model_path = lang_path
+    else:
+        model_path = "poltextlab/xlm-roberta-large-pooled-cap"
+
+    if check_huggingface_path(model_path):
+        return model_path
     else:
         return "poltextlab/xlm-roberta-large-pooled-cap"
 
 def predict(text, model_id, tokenizer_id):
     device = torch.device("cpu")
-
-    model = AutoModelForSequenceClassification.from_pretrained(model_id, low_cpu_mem_usage=True, device_map="auto", token=HF_TOKEN)
-    gr.Info("Loading tokenizer")
+    model = AutoModelForSequenceClassification.from_pretrained(model_id, low_cpu_mem_usage=True, device_map="auto", offload_folder="offload", token=HF_TOKEN)
     tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
-    #gr.Info("Mapping model to device")
-    #model.to(device)
 
-    gr.Info("Tokenizing")
     inputs = tokenizer(text,
                        max_length=256,
                        truncation=True,
                        padding="do_not_pad",
                        return_tensors="pt").to(device)
-
-    gr.Info("model.eval()")
     model.eval()
 
-    gr.Info("Prediction")
     with torch.no_grad():
         logits = model(**inputs).logits
 
-    gr.Info("Softmax")
     probs = torch.nn.functional.softmax(logits, dim=1).cpu().numpy().flatten()
     output_pred = {f"[{CAP_NUM_DICT[i]}] {CAP_LABEL_NAMES[CAP_NUM_DICT[i]]}": probs[i] for i in np.argsort(probs)[::-1]}
     output_info = f'<p style="text-align: center; display: block">Prediction was made using the <a href="https://huggingface.co/{model_id}">{model_id}</a> model.</p>'
@@ -126,11 +106,17 @@ def predict_cap(text, language, domain):
     domain = domains[domain]
     model_id = build_huggingface_path(language, domain)
     tokenizer_id = "xlm-roberta-large"
+
+    if is_disk_full():
+        os.system('rm -rf /data/models*')
+        os.system('rm -r ~/.cache/huggingface/hub')
+
     return predict(text, model_id, tokenizer_id)
 
 demo = gr.Interface(
+    title="CAP Babel Demo",
     fn=predict_cap,
     inputs=[gr.Textbox(lines=6, label="Input"),
             gr.Dropdown(languages, label="Language"),
             gr.Dropdown(domains.keys(), label="Domain")],
-    outputs=[gr.Label(num_top_classes=5, label="Output"), gr.Markdown()])
+    outputs=[gr.Label(num_top_classes=5, label="Output"), gr.Markdown()])

interfaces/cap_minor.py ADDED
@@ -0,0 +1,83 @@
+import gradio as gr
+
+import os
+import torch
+import numpy as np
+import pandas as pd
+from transformers import AutoModelForSequenceClassification
+from transformers import AutoTokenizer
+from huggingface_hub import HfApi
+
+from label_dicts import CAP_MIN_NUM_DICT, CAP_MIN_LABEL_NAMES
+
+from .utils import is_disk_full
+
+HF_TOKEN = os.environ["hf_read"]
+
+languages = [
+    "Multilingual",
+]
+
+domains = {
+    "media": "media",
+    "social media": "social",
+    "parliamentary speech": "parlspeech",
+    "legislative documents": "legislative",
+    "executive speech": "execspeech",
+    "executive order": "execorder",
+    "party programs": "party",
+    "judiciary": "judiciary",
+    "budget": "budget",
+    "public opinion": "publicopinion",
+    "local government agenda": "localgovernment"
+}
+
+def check_huggingface_path(checkpoint_path: str):
+    try:
+        hf_api = HfApi(token=HF_TOKEN)
+        hf_api.model_info(checkpoint_path, token=HF_TOKEN)
+        return True
+    except:
+        return False
+
+def build_huggingface_path(language: str, domain: str):
+    return "poltextlab/xlm-roberta-large-pooled-cap-minor"
+
+def predict(text, model_id, tokenizer_id):
+    device = torch.device("cpu")
+    model = AutoModelForSequenceClassification.from_pretrained(model_id, low_cpu_mem_usage=True, device_map="auto", offload_folder="offload", token=HF_TOKEN)
+    tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
+
+    inputs = tokenizer(text,
+                       max_length=256,
+                       truncation=True,
+                       padding="do_not_pad",
+                       return_tensors="pt").to(device)
+    model.eval()
+
+    with torch.no_grad():
+        logits = model(**inputs).logits
+
+    probs = torch.nn.functional.softmax(logits, dim=1).cpu().numpy().flatten()
+    output_pred = {f"[{CAP_MIN_NUM_DICT[i]}] {CAP_MIN_LABEL_NAMES[CAP_MIN_NUM_DICT[i]]}": probs[i] for i in np.argsort(probs)[::-1]}
+    output_info = f'<p style="text-align: center; display: block">Prediction was made using the <a href="https://huggingface.co/{model_id}">{model_id}</a> model.</p>'
+    return output_pred, output_info
+
+def predict_cap(text, language, domain):
+    domain = domains[domain]
+    model_id = build_huggingface_path(language, domain)
+    tokenizer_id = "xlm-roberta-large"
+
+    if is_disk_full():
+        os.system('rm -rf /data/models*')
+        os.system('rm -r ~/.cache/huggingface/hub')
+
+    return predict(text, model_id, tokenizer_id)
+
+demo = gr.Interface(
+    title="CAP Minor Topics Babel Demo",
+    fn=predict_cap,
+    inputs=[gr.Textbox(lines=6, label="Input"),
+            gr.Dropdown(languages, label="Language"),
+            gr.Dropdown(domains.keys(), label="Domain")],
+    outputs=[gr.Label(num_top_classes=5, label="Output"), gr.Markdown()])

interfaces/emotion.py CHANGED
@@ -9,18 +9,25 @@ from huggingface_hub import HfApi
 
 from label_dicts import MANIFESTO_LABEL_NAMES
 
+from .utils import is_disk_full
+
 HF_TOKEN = os.environ["hf_read"]
 
 languages = [
-    "Czech", "English", "French", "German", "Hungarian", "
+    "Czech", "English", "French", "German", "Hungarian", "Polish", "Slovak"
 ]
+domains = {
+    "parliamentary speech": "parlspeech",
+}
 
 def build_huggingface_path(language: str):
-
+    if language == "Czech" or language == "Slovak":
+        return "visegradmedia-emotion/Emotion_RoBERTa_pooled_V4"
+    return "poltextlab/xlm-roberta-large-pooled-MORES"
 
 def predict(text, model_id, tokenizer_id):
     device = torch.device("cpu")
-    model = AutoModelForSequenceClassification.from_pretrained(model_id, token=HF_TOKEN)
+    model = AutoModelForSequenceClassification.from_pretrained(model_id, low_cpu_mem_usage=True, device_map="auto", offload_folder="offload", token=HF_TOKEN)
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
    model.to(device)
 
@@ -39,13 +46,20 @@ def predict(text, model_id, tokenizer_id):
     output_info = f'<p style="text-align: center; display: block">Prediction was made using the <a href="https://huggingface.co/{model_id}">{model_id}</a> model.</p>'
     return output_pred, output_info
 
-def predict_cap(text, language):
+def predict_cap(text, language, domain):
     model_id = build_huggingface_path(language)
     tokenizer_id = "xlm-roberta-large"
+
+    if is_disk_full():
+        os.system('rm -rf /data/models*')
+        os.system('rm -r ~/.cache/huggingface/hub')
+
     return predict(text, model_id, tokenizer_id)
 
 demo = gr.Interface(
+    title="Emotions (6) Babel Demo",
     fn=predict_cap,
     inputs=[gr.Textbox(lines=6, label="Input"),
-            gr.Dropdown(languages, label="Language")
-
+            gr.Dropdown(languages, label="Language"),
+            gr.Dropdown(domains.keys(), label="Domain")],
+    outputs=[gr.Label(num_top_classes=5, label="Output"), gr.Markdown()])

interfaces/emotion9.py ADDED
@@ -0,0 +1,65 @@
+import gradio as gr
+
+import os
+import torch
+import numpy as np
+from transformers import AutoModelForSequenceClassification
+from transformers import AutoTokenizer
+from huggingface_hub import HfApi
+
+from label_dicts import EMOTION9_LABEL_NAMES
+
+from .utils import is_disk_full
+
+HF_TOKEN = os.environ["hf_read"]
+
+languages = [
+    "Czech", "English", "German", "Hungarian", "Polish", "Slovak"
+]
+domains = {
+    "parliamentary speech": "parlspeech",
+}
+
+def build_huggingface_path(language: str):
+    language = language.lower()
+    return f"poltextlab/xlm-roberta-large-pooled-{language}-emotions9"
+
+def predict(text, model_id, tokenizer_id):
+    device = torch.device("cpu")
+    model = AutoModelForSequenceClassification.from_pretrained(model_id, low_cpu_mem_usage=True, token=HF_TOKEN)
+    tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
+
+    inputs = tokenizer(text,
+                       max_length=512,
+                       truncation=True,
+                       padding="do_not_pad",
+                       return_tensors="pt").to(device)
+    model.eval()
+
+    with torch.no_grad():
+        logits = model(**inputs).logits
+
+    probs = torch.nn.functional.softmax(logits, dim=1).cpu().numpy().flatten()
+
+    NUMS_DICT = {i: key for i, key in enumerate(sorted(EMOTION9_LABEL_NAMES.keys()))}
+    output_pred = {f"[{NUMS_DICT[i]}] {EMOTION9_LABEL_NAMES[NUMS_DICT[i]]}": probs[i] for i in np.argsort(probs)[::-1]}
+    output_info = f'<p style="text-align: center; display: block">Prediction was made using the <a href="https://huggingface.co/{model_id}">{model_id}</a> model.</p>'
+    return output_pred, output_info
+
+def predict_e6(text, language, domain):
+    model_id = build_huggingface_path(language)
+    tokenizer_id = "xlm-roberta-large"
+
+    if is_disk_full():
+        os.system('rm -rf /data/models*')
+        os.system('rm -r ~/.cache/huggingface/hub')
+
+    return predict(text, model_id, tokenizer_id)
+
+demo = gr.Interface(
+    title="Emotions (9) Babel Demo",
+    fn=predict_e6,
+    inputs=[gr.Textbox(lines=6, label="Input"),
+            gr.Dropdown(languages, label="Language"),
+            gr.Dropdown(domains.keys(), label="Domain")],
+    outputs=[gr.Label(num_top_classes=5, label="Output"), gr.Markdown()])

interfaces/illframes.py ADDED
@@ -0,0 +1,116 @@
+import gradio as gr
+
+import os
+import torch
+import numpy as np
+import pandas as pd
+from transformers import AutoModelForSequenceClassification
+from transformers import AutoTokenizer
+from huggingface_hub import HfApi
+
+from label_dicts import ILLFRAMES_MIGRATION_LABEL_NAMES, ILLFRAMES_COVID_LABEL_NAMES, ILLFRAMES_WAR_LABEL_NAMES
+
+from .utils import is_disk_full
+
+HF_TOKEN = os.environ["hf_read"]
+
+languages = [
+    "English"
+]
+
+domains = {
+    "Covid": "covid",
+    "Migration": "migration",
+    "War": "war"
+}
+
+# --- DEBUG ---
+import shutil
+
+def convert_size(size):
+    for unit in ['B', 'KB', 'MB', 'GB', 'TB', 'PB']:
+        if size < 1024:
+            return f"{size:.2f} {unit}"
+        size /= 1024
+
+def get_disk_space(path="/"):
+    total, used, free = shutil.disk_usage(path)
+
+    return {
+        "Total": convert_size(total),
+        "Used": convert_size(used),
+        "Free": convert_size(free)
+    }
+
+# ---
+
+def check_huggingface_path(checkpoint_path: str):
+    try:
+        hf_api = HfApi(token=HF_TOKEN)
+        hf_api.model_info(checkpoint_path, token=HF_TOKEN)
+        return True
+    except:
+        return False
+
+def build_huggingface_path(domain: str):
+    return f"poltextlab/xlm-roberta-large-english-ILLFRAMES-{domain}"
+
+def predict(text, model_id, tokenizer_id, label_names):
+    device = torch.device("cpu")
+    try:
+        model = AutoModelForSequenceClassification.from_pretrained(model_id, low_cpu_mem_usage=True, offload_folder="offload", device_map="auto", token=HF_TOKEN)
+    except:
+        disk_space = get_disk_space('/data/')
+        print("Disk Space Error:")
+        for key, value in disk_space.items():
+            print(f"{key}: {value}")
+
+        shutil.rmtree("/data")
+        model = AutoModelForSequenceClassification.from_pretrained(model_id, low_cpu_mem_usage=True, device_map="auto", token=HF_TOKEN, force_download=True)
+
+    tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
+
+    inputs = tokenizer(text,
+                       max_length=256,
+                       truncation=True,
+                       padding="do_not_pad",
+                       return_tensors="pt").to(device)
+    model.eval()
+
+    with torch.no_grad():
+        logits = model(**inputs).logits
+
+    probs = torch.nn.functional.softmax(logits, dim=1).cpu().numpy().flatten()
+
+    NUMS_DICT = {i: key for i, key in enumerate(sorted(label_names.keys()))}
+
+    output_pred = {f"[{NUMS_DICT[i]}] {label_names[NUMS_DICT[i]]}": probs[i] for i in np.argsort(probs)[::-1]}
+    output_info = f'<p style="text-align: center; display: block">Prediction was made using the <a href="https://huggingface.co/{model_id}">{model_id}</a> model.</p>'
+    return output_pred, output_info
+
+def predict_illframes(text, language, domain):
+    domain = domains[domain]
+    model_id = build_huggingface_path(domain)
+    tokenizer_id = "xlm-roberta-large"
+
+    if domain == "migration":
+        label_names = ILLFRAMES_MIGRATION_LABEL_NAMES
+    elif domain == "covid":
+        label_names = ILLFRAMES_COVID_LABEL_NAMES
+    elif domain == "war":
+        label_names = ILLFRAMES_WAR_LABEL_NAMES
+
+    if is_disk_full():
+        os.system('rm -rf /data/models*')
+        os.system('rm -r ~/.cache/huggingface/hub')
+
+    return predict(text, model_id, tokenizer_id, label_names)
+
+demo = gr.Interface(
+    title="ILLFRAMES Babel Demo",
+    fn=predict_illframes,
+    inputs=[gr.Textbox(lines=6, label="Input"),
+            gr.Dropdown(languages, label="Language"),
+            gr.Dropdown(domains.keys(), label="Domain")],
+    outputs=[gr.Label(num_top_classes=5, label="Output"), gr.Markdown()])

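The predict_illframes wrapper above selects the label dictionary with an if/elif chain over the domain key; an equivalent lookup-table form (just a sketch of an alternative, behaviour unchanged) would be:

# Sketch: same selection as the if/elif chain in predict_illframes.
LABEL_NAMES_BY_DOMAIN = {
    "migration": ILLFRAMES_MIGRATION_LABEL_NAMES,
    "covid": ILLFRAMES_COVID_LABEL_NAMES,
    "war": ILLFRAMES_WAR_LABEL_NAMES,
}
label_names = LABEL_NAMES_BY_DOMAIN[domain]  # domain already mapped via the domains dict
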
interfaces/manifesto.py CHANGED
@@ -1,4 +1,3 @@
-import time
 import gradio as gr
 
 import os
@@ -10,20 +9,7 @@ from huggingface_hub import HfApi
 
 from label_dicts import MANIFESTO_LABEL_NAMES
 
-
-    def __init__(self, msg):
-        self.msg = msg
-
-    def __enter__(self):
-        self.start_time = time.time()
-        return self
-
-    def __exit__(self, exc_type, exc_value, traceback):
-        end_time = time.time()
-        runtime = end_time - self.start_time
-        gr.Info(f"{self.msg}: {runtime} seconds")
-def m(msg):
-    return RuntimeMeasure(msg)
+from .utils import is_disk_full
 
 HF_TOKEN = os.environ["hf_read"]
 
@@ -39,44 +25,38 @@ def build_huggingface_path(language: str):
     return "poltextlab/xlm-roberta-large-manifesto"
 
 def predict(text, model_id, tokenizer_id):
-    gr.Info("\n".join(os.listdir("/data/")))
-
     device = torch.device("cpu")
-        logits = model(**inputs).logits
-
-    with m("Softmax"):
-        probs = torch.nn.functional.softmax(logits, dim=1).cpu().numpy().flatten()
-
-    with m("Output formatting"):
-        output_pred = {f"[{model.config.id2label[i]}] {MANIFESTO_LABEL_NAMES[int(model.config.id2label[i])]}": probs[i] for i in np.argsort(probs)[::-1]}
-        output_info = f'<p style="text-align: center; display: block">Prediction was made using the <a href="https://huggingface.co/{model_id}">{model_id}</a> model.</p>'
+    model = AutoModelForSequenceClassification.from_pretrained(model_id, low_cpu_mem_usage=True, device_map="auto", offload_folder="offload", token=HF_TOKEN)
+    tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
+
+    inputs = tokenizer(text,
+                       max_length=256,
+                       truncation=True,
+                       padding="do_not_pad",
+                       return_tensors="pt").to(device)
+    model.eval()
+
+    with torch.no_grad():
+        logits = model(**inputs).logits
+
+    probs = torch.nn.functional.softmax(logits, dim=1).cpu().numpy().flatten()
+    output_pred = {f"[{model.config.id2label[i]}] {MANIFESTO_LABEL_NAMES[int(model.config.id2label[i])]}": probs[i] for i in np.argsort(probs)[::-1]}
+    output_info = f'<p style="text-align: center; display: block">Prediction was made using the <a href="https://huggingface.co/{model_id}">{model_id}</a> model.</p>'
     return output_pred, output_info
 
 def predict_cap(text, language):
-
-
-
-
-
+    model_id = build_huggingface_path(language)
+    tokenizer_id = "xlm-roberta-large"
+
+    if is_disk_full():
+        os.system('rm -rf /data/models*')
+        os.system('rm -r ~/.cache/huggingface/hub')
+
+    return predict(text, model_id, tokenizer_id)
 
 demo = gr.Interface(
+    title="Manifesto Babel Demo",
     fn=predict_cap,
     inputs=[gr.Textbox(lines=6, label="Input"),
             gr.Dropdown(languages, label="Language")],
-    outputs=[gr.Label(num_top_classes=5, label="Output"), gr.Markdown()])
+    outputs=[gr.Label(num_top_classes=5, label="Output"), gr.Markdown()])

interfaces/ner.py CHANGED
@@ -9,6 +9,8 @@ from transformers import AutoModelForSequenceClassification
 from transformers import AutoTokenizer
 from huggingface_hub import HfApi
 
+from spacy.glossary import GLOSSARY as NER_DICT
+
 languages = [
     "English", "Hungarian", "Multilingual"
 ]
@@ -34,13 +36,16 @@ def named_entity_recognition(text, language):
     pipeline = huspacy.load() if model_id.startswith("hu") else spacy.load(model_id)
     doc = pipeline(text)
     entities = [{"entity":ent.label_, "start":ent.start_char, "end":ent.end_char} for ent in doc.ents]
+    labels_used = [ent.label_ for ent in doc.ents]
+    legend = '<p style="text-align: left; display: block">Legend:</p><ul style="text-align: left; display: block">'+"".join([f"<li> <b>{label}</b> = <i>{NER_DICT[label]}</i> </li>" for label in set(labels_used)])+"</ul>"
     output = {"text":text, "entities":entities}
     model_id_hf = f"huspacy/{model_id}" if model_id.startswith("hu") else f"spacy/{model_id}"
-    output_info = f'<p style="text-align: center; display: block">Prediction was made using the <a href="https://huggingface.co/{model_id_hf}">{model_id_hf}</a> model.</p>'
+    output_info = legend + f'<p style="text-align: center; display: block">Prediction was made using the <a href="https://huggingface.co/{model_id_hf}">{model_id_hf}</a> model.</p> <ul>'
     return output, output_info
 
 demo = gr.Interface(
+    title="NER Babel Demo",
     fn=named_entity_recognition,
     inputs=[gr.Textbox(lines=6, label="Input"),
             gr.Dropdown(languages, label="Language")],
-    outputs=[gr.HighlightedText(label='Output'), gr.Markdown()])
+    outputs=[gr.HighlightedText(label='Output'), gr.Markdown()])

interfaces/ontolisst.py ADDED
@@ -0,0 +1,96 @@
+import gradio as gr
+
+import os
+import torch
+import numpy as np
+from transformers import AutoModelForSequenceClassification
+from transformers import AutoTokenizer
+from huggingface_hub import HfApi
+
+HF_TOKEN = os.environ["hf_read"]
+
+languages = [
+    "English"
+]
+
+from label_dicts import ONTOLISST_LABEL_NAMES
+
+from .utils import is_disk_full
+
+# --- DEBUG ---
+import shutil
+
+def convert_size(size):
+    for unit in ['B', 'KB', 'MB', 'GB', 'TB', 'PB']:
+        if size < 1024:
+            return f"{size:.2f} {unit}"
+        size /= 1024
+
+def get_disk_space(path="/"):
+    total, used, free = shutil.disk_usage(path)
+
+    return {
+        "Total": convert_size(total),
+        "Used": convert_size(used),
+        "Free": convert_size(free)
+    }
+
+# ---
+
+def build_huggingface_path(language: str):
+    return "poltextlab/xlm-roberta-large_ontolisst_v1"
+
+def predict(text, model_id, tokenizer_id):
+    device = torch.device("cpu")
+    model = AutoModelForSequenceClassification.from_pretrained(model_id, low_cpu_mem_usage=True, device_map="auto", offload_folder="offload", token=HF_TOKEN)
+    tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
+
+    # --- DEBUG ---
+
+    disk_space = get_disk_space('/data/')
+    print("Disk Space Info:")
+    for key, value in disk_space.items():
+        print(f"{key}: {value}")
+
+    # ---
+
+    model.to(device)
+
+    inputs = tokenizer(text,
+                       max_length=256,
+                       truncation=True,
+                       padding="do_not_pad",
+                       return_tensors="pt").to(device)
+    model.eval()
+
+    with torch.no_grad():
+        logits = model(**inputs).logits
+
+    probs = torch.nn.functional.softmax(logits, dim=1).cpu().numpy().flatten()
+    predicted_class_id = probs.argmax()
+    predicted_class_id = {4: 2, 5: 1}.get(predicted_class_id, 0)
+
+    output_pred = ONTOLISST_LABEL_NAMES.get(predicted_class_id, predicted_class_id)
+
+    output_info = f'<p style="text-align: center; display: block">Prediction was made using the <a href="https://huggingface.co/{model_id}">{model_id}</a> model.</p>'
+    return output_pred, output_info
+
+def predict_cap(text, language):
+    model_id = build_huggingface_path(language)
+    tokenizer_id = "xlm-roberta-large"
+
+    if is_disk_full():
+        os.system('rm -rf /data/models*')
+        os.system('rm -r ~/.cache/huggingface/hub')
+
+    return predict(text, model_id, tokenizer_id)
+
+demo = gr.Interface(
+    title="ONTOLISST Babel Demo",
+    fn=predict_cap,
+    inputs=[gr.Textbox(lines=6, label="Input"),
+            gr.Dropdown(languages, label="Language")],
+    outputs=[gr.Label(num_top_classes=3, label="Output"), gr.Markdown()])

interfaces/sentiment.py CHANGED
@@ -9,23 +9,33 @@ from huggingface_hub import HfApi
 
 from label_dicts import MANIFESTO_LABEL_NAMES
 
+from .utils import is_disk_full
+
 HF_TOKEN = os.environ["hf_read"]
 
 languages = [
-    "Czech", "English", "French", "German", "Hungarian", "
+    "Czech", "English", "French", "German", "Hungarian", "Polish", "Slovak"
 ]
+domains = {
+    "parliamentary speech": "parlspeech",
+}
+
+SENTIMENT_LABEL_NAMES = {0: "Negative", 1: "No sentiment or Neutral sentiment", 2: "Positive"}
+
 
 def build_huggingface_path(language: str):
-
+    if language == "Czech" or language == "Slovak":
+        return "visegradmedia-emotion/Emotion_RoBERTa_pooled_V4"
+    return "poltextlab/xlm-roberta-large-pooled-MORES"
 
 def predict(text, model_id, tokenizer_id):
     device = torch.device("cpu")
-    model = AutoModelForSequenceClassification.from_pretrained(model_id, token=HF_TOKEN)
+    model = AutoModelForSequenceClassification.from_pretrained(model_id, low_cpu_mem_usage=True, device_map="auto", offload_folder="offload", token=HF_TOKEN)
     tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
     model.to(device)
 
     inputs = tokenizer(text,
-                       max_length=
+                       max_length=256,
                        truncation=True,
                        padding="do_not_pad",
                        return_tensors="pt").to(device)
@@ -35,17 +45,30 @@ def predict(text, model_id, tokenizer_id):
         logits = model(**inputs).logits
 
     probs = torch.nn.functional.softmax(logits, dim=1).cpu().numpy().flatten()
-
+    predicted_class_id = probs.argmax()
+    predicted_class_id = {4: 2, 5: 1}.get(predicted_class_id, 0)
+
+    output_pred = SENTIMENT_LABEL_NAMES.get(predicted_class_id, predicted_class_id)
+
     output_info = f'<p style="text-align: center; display: block">Prediction was made using the <a href="https://huggingface.co/{model_id}">{model_id}</a> model.</p>'
     return output_pred, output_info
 
-def predict_cap(text, language):
+def predict_cap(text, language, domain):
     model_id = build_huggingface_path(language)
     tokenizer_id = "xlm-roberta-large"
+
+    if is_disk_full():
+        os.system('rm -rf /data/models*')
+        os.system('rm -r ~/.cache/huggingface/hub')
+
     return predict(text, model_id, tokenizer_id)
 
 demo = gr.Interface(
+    title="Sentiment (3) Babel Demo",
     fn=predict_cap,
     inputs=[gr.Textbox(lines=6, label="Input"),
-            gr.Dropdown(languages, label="Language")
-
+            gr.Dropdown(languages, label="Language"),
+            gr.Dropdown(domains.keys(), label="Domain")],
+    outputs=[gr.Label(num_top_classes=3, label="Output"), gr.Markdown()])

interfaces/utils.py ADDED
@@ -0,0 +1,12 @@
+import shutil
+
+def is_disk_full(min_free_space_in_GB=10):
+    total, used, free = shutil.disk_usage("/")
+    free_gb = free / (1024 ** 3)
+
+    if free_gb >= min_free_space_in_GB:
+        print(f'enough space available ({free_gb} GB)')
+        return False
+    else:
+        print('clean up!')
+        return True

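For reference, every interface module in this commit calls is_disk_full the same way before loading a model; condensed usage, taken from the predict_* wrappers above:

import os
from interfaces.utils import is_disk_full

# Free the persistent volume and the local Hugging Face cache when less than
# ~10 GB remains, then proceed with the normal model download/load.
if is_disk_full():
    os.system('rm -rf /data/models*')
    os.system('rm -r ~/.cache/huggingface/hub')
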
label_dicts.py
CHANGED
@@ -21,6 +21,220 @@ CAP_NUM_DICT = {
|
|
21 |
19: 21,
|
22 |
20: 23,
|
23 |
21: 999,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
}
|
25 |
|
26 |
CAP_LABEL_NAMES = {
|
@@ -48,6 +262,245 @@ CAP_LABEL_NAMES = {
|
|
48 |
999: "No Policy Content"
|
49 |
}
|
50 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
MANIFESTO_LABEL_NAMES = {
|
52 |
0: "No Policy Goal",
|
53 |
999: "No Policy Goal",
|
@@ -107,4 +560,70 @@ MANIFESTO_LABEL_NAMES = {
|
|
107 |
704: "Middle Class and Professional Groups",
|
108 |
705: "Underprivileged Minority Groups",
|
109 |
706: "Non-economic Demographic Groups"
|
110 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
19: 21,
|
22 |
20: 23,
|
23 |
21: 999,
|
24 |
+
22: 999, # had to do this because of some language-domain models (e.g. english media)
|
25 |
+
}
|
26 |
+
|
27 |
+
CAP_MIN_NUM_DICT = {
|
28 |
+
0: 100,
|
29 |
+
1: 101,
|
30 |
+
2: 103,
|
31 |
+
3: 104,
|
32 |
+
4: 105,
|
33 |
+
5: 107,
|
34 |
+
6: 108,
|
35 |
+
7: 110,
|
36 |
+
8: 199,
|
37 |
+
9: 200,
|
38 |
+
10: 201,
|
39 |
+
11: 202,
|
40 |
+
12: 204,
|
41 |
+
13: 205,
|
42 |
+
14: 206,
|
43 |
+
15: 207,
|
44 |
+
16: 208,
|
45 |
+
17: 209,
|
46 |
+
18: 299,
|
47 |
+
19: 300,
|
48 |
+
20: 301,
|
49 |
+
21: 302,
|
50 |
+
22: 321,
|
51 |
+
23: 322,
|
52 |
+
24: 323,
|
53 |
+
25: 324,
|
54 |
+
26: 325,
|
55 |
+
27: 331,
|
56 |
+
28: 332,
|
57 |
+
29: 333,
|
58 |
+
30: 334,
|
59 |
+
31: 335,
|
60 |
+
32: 341,
|
61 |
+
33: 342,
|
62 |
+
34: 398,
|
63 |
+
35: 399,
|
64 |
+
36: 400,
|
65 |
+
37: 401,
|
66 |
+
38: 402,
|
67 |
+
39: 403,
|
68 |
+
40: 404,
|
69 |
+
41: 405,
|
70 |
+
42: 408,
|
71 |
+
43: 498,
|
72 |
+
44: 499,
|
73 |
+
45: 500,
|
74 |
+
46: 501,
|
75 |
+
47: 502,
|
76 |
+
48: 503,
|
77 |
+
49: 504,
|
78 |
+
50: 505,
|
79 |
+
51: 506,
|
80 |
+
52: 529,
|
81 |
+
53: 599,
|
82 |
+
54: 600,
|
83 |
+
55: 601,
|
84 |
+
56: 602,
|
85 |
+
57: 603,
|
86 |
+
58: 604,
|
87 |
+
59: 606,
|
88 |
+
60: 607,
|
89 |
+
61: 698,
|
90 |
+
62: 699,
|
91 |
+
63: 700,
|
92 |
+
64: 701,
|
93 |
+
65: 703,
|
94 |
+
66: 704,
|
95 |
+
67: 705,
|
96 |
+
68: 707,
|
97 |
+
69: 708,
|
98 |
+
70: 709,
|
99 |
+
71: 711,
|
100 |
+
72: 798,
|
101 |
+
73: 799,
|
102 |
+
74: 800,
|
103 |
+
75: 801,
|
104 |
+
76: 802,
|
105 |
+
77: 803,
|
106 |
+
78: 805,
|
107 |
+
79: 806,
|
108 |
+
80: 807,
|
109 |
+
81: 898,
|
110 |
+
82: 899,
|
111 |
+
83: 900,
|
112 |
+
84: 1000,
|
113 |
+
85: 1001,
|
114 |
+
86: 1002,
|
115 |
+
87: 1003,
|
116 |
+
88: 1005,
|
117 |
+
89: 1007,
|
118 |
+
90: 1010,
|
119 |
+
91: 1098,
|
120 |
+
92: 1099,
|
121 |
+
93: 1200,
|
122 |
+
94: 1201,
|
123 |
+
95: 1202,
|
124 |
+
96: 1203,
|
125 |
+
97: 1204,
|
126 |
+
98: 1205,
|
127 |
+
99: 1206,
|
128 |
+
100: 1207,
|
129 |
+
101: 1208,
|
130 |
+
102: 1210,
|
131 |
+
103: 1211,
|
132 |
+
104: 1227,
|
133 |
+
105: 1299,
|
134 |
+
106: 1300,
|
135 |
+
107: 1302,
|
136 |
+
108: 1303,
|
137 |
+
109: 1304,
|
138 |
+
110: 1305,
|
139 |
+
111: 1308,
|
140 |
+
112: 1399,
|
141 |
+
113: 1400,
|
142 |
+
114: 1401,
|
143 |
+
115: 1403,
|
144 |
+
116: 1404,
|
145 |
+
117: 1405,
|
146 |
+
118: 1406,
|
147 |
+
119: 1407,
|
148 |
+
120: 1408,
|
149 |
+
121: 1409,
|
150 |
+
122: 1498,
|
151 |
+
123: 1499,
|
152 |
+
124: 1500,
|
153 |
+
125: 1501,
|
154 |
+
126: 1502,
|
155 |
+
127: 1504,
|
156 |
+
128: 1505,
|
157 |
+
129: 1507,
|
158 |
+
130: 1520,
|
159 |
+
131: 1521,
|
160 |
+
132: 1522,
|
161 |
+
133: 1523,
|
162 |
+
134: 1524,
|
163 |
+
135: 1525,
|
164 |
+
136: 1526,
|
165 |
+
137: 1598,
|
166 |
+
138: 1599,
|
167 |
+
139: 1600,
|
168 |
+
140: 1602,
|
169 |
+
141: 1603,
|
170 |
+
142: 1604,
|
171 |
+
143: 1605,
|
172 |
+
144: 1606,
|
173 |
+
145: 1608,
|
174 |
+
146: 1610,
|
175 |
+
147: 1611,
|
176 |
+
148: 1612,
|
177 |
+
149: 1614,
|
178 |
+
150: 1615,
|
179 |
+
151: 1616,
|
180 |
+
152: 1617,
|
181 |
+
153: 1619,
|
182 |
+
154: 1620,
|
183 |
+
155: 1698,
|
184 |
+
156: 1699,
|
185 |
+
157: 1700,
|
186 |
+
158: 1701,
|
187 |
+
159: 1704,
|
188 |
+
160: 1705,
|
189 |
+
161: 1706,
|
190 |
+
162: 1707,
|
191 |
+
163: 1708,
|
192 |
+
164: 1709,
|
193 |
+
165: 1798,
|
194 |
+
166: 1799,
|
195 |
+
167: 1800,
|
196 |
+
168: 1802,
|
197 |
+
169: 1803,
|
198 |
+
170: 1804,
|
199 |
+
171: 1806,
|
200 |
+
172: 1807,
|
201 |
+
173: 1808,
|
202 |
+
174: 1899,
|
203 |
+
175: 1900,
|
204 |
+
176: 1901,
|
205 |
+
177: 1902,
|
206 |
+
178: 1905,
|
207 |
+
179: 1906,
|
208 |
+
180: 1910,
|
209 |
+
181: 1921,
|
210 |
+
182: 1925,
|
211 |
+
183: 1926,
|
212 |
+
184: 1927,
|
213 |
+
185: 1929,
|
214 |
+
186: 1999,
|
215 |
+
187: 2000,
|
216 |
+
188: 2001,
|
217 |
+
189: 2002,
|
218 |
+
190: 2003,
|
219 |
+
191: 2004,
|
220 |
+
192: 2005,
|
221 |
+
193: 2006,
|
222 |
+
194: 2007,
|
223 |
+
195: 2008,
|
224 |
+
196: 2009,
|
225 |
+
197: 2010,
|
226 |
+
198: 2011,
|
227 |
+
199: 2012,
|
228 |
+
200: 2013,
|
229 |
+
201: 2014,
|
230 |
+
202: 2015,
|
231 |
+
203: 2030,
|
232 |
+
204: 2099,
|
233 |
+
205: 2100,
|
234 |
+
206: 2101,
|
235 |
+
207: 2102,
|
236 |
+
208: 2103,
|
237 |
+
209: 2104
|
238 |
}
|
239 |
|
240 |
CAP_LABEL_NAMES = {
|
|
|
262 |
999: "No Policy Content"
|
263 |
}
|
264 |
|
265 |
+
CAP_MIN_LABEL_NAMES = {
|
266 |
+
# 1. Macroeconomics
|
267 |
+
100: "General",
|
268 |
+
101: "Interest Rates",
|
269 |
+
103: "Unemployment Rate",
|
270 |
+
104: "Monetary Policy",
|
271 |
+
105: "National Budget",
|
272 |
+
107: "Tax Code",
|
273 |
+
108: "Industrial Policy",
|
274 |
+
110: "Price Control",
|
275 |
+
199: "Other",
|
276 |
+
# 2. Civil Rights
|
277 |
+
200: "General",
|
278 |
+
201: "Minority Discrimination",
|
279 |
+
202: "Gender Discrimination",
|
280 |
+
204: "Age Discrimination",
|
281 |
+
205: "Handicap Discrimination",
|
282 |
+
206: "Voting Rights",
|
283 |
+
207: "Freedom of Speech",
|
284 |
+
208: "Right to Privacy",
|
285 |
+
209: "Anti-Government",
|
286 |
+
299: "Other",
|
287 |
+
# 3. Health
|
288 |
+
300: "General",
|
289 |
+
301: "Health Care Reform",
|
290 |
+
302: "Insurance",
|
291 |
+
321: "Drug Industry",
|
292 |
+
322: "Medical Facilities",
|
293 |
+
323: "Insurance Providers",
|
294 |
+
324: "Medical Liability",
|
295 |
+
325: "Manpower",
|
296 |
+
331: "Disease Prevention",
|
297 |
+
332: "Infants and Children",
|
298 |
+
333: "Mental Health",
|
299 |
+
334: "Long-term Care",
|
300 |
+
335: "Drug Coverage and Cost",
|
301 |
+
341: "Tobacco Abuse",
|
302 |
+
342: "Drug and Alcohol Abuse",
|
303 |
+
398: "R&D",
|
304 |
+
399: "Other",
|
305 |
+
# 4. Agriculture
|
306 |
+
400: "General",
|
307 |
+
401: "Trade",
|
308 |
+
402: "Subsidies to Farmers",
|
309 |
+
403: "Food Inspection & Safety",
|
310 |
+
404: "Food Marketing & Promotion",
|
311 |
+
405: "Animal and Crop Disease",
|
312 |
+
408: "Fisheries & Fishing",
|
313 |
+
498: "R&D",
|
314 |
+
499: "Other",
|
315 |
+
# 5. Labor
|
316 |
+
500: "General",
|
317 |
+
501: "Worker Safety",
|
318 |
+
502: "Employment Training",
|
319 |
+
503: "Employee Benefits",
|
320 |
+
504: "Labor Unions",
|
321 |
+
505: "Fair Labor Standards",
|
322 |
+
506: "Youth Employment",
|
323 |
+
529: "Migrant and Seasonal",
|
324 |
+
599: "Other",
|
325 |
+
# 6. Education
|
326 |
+
600: "General",
|
327 |
+
601: "Higher",
|
328 |
+
602: "Elementary & Secondary",
|
329 |
+
603: "Underprivileged",
|
330 |
+
604: "Vocational",
|
331 |
+
606: "Special",
|
332 |
+
607: "Excellence",
|
333 |
+
698: "R&D",
|
334 |
+
699: "Other",
|
335 |
+
# 7. Environment
|
336 |
+
700: "General",
|
337 |
+
701: "Drinking Water",
|
338 |
+
703: "Waste Disposal",
|
339 |
+
704: "Hazardous Waste",
|
340 |
+
705: "Air Pollution",
|
341 |
+
707: "Recycling",
|
342 |
+
708: "Indoor Hazards",
|
343 |
+
709: "Species & Forest",
|
344 |
+
711: "Land and Water Conservation",
|
345 |
+
798: "R&D",
|
346 |
+
799: "Other",
|
347 |
+
# 8. Energy
|
348 |
+
800: "General",
|
349 |
+
801: "Nuclear",
|
350 |
+
802: "Electricity",
|
351 |
+
803: "Natural Gas & Oil",
|
352 |
+
805: "Coal",
|
353 |
+
806: "Alternative & Renewable",
|
354 |
+
807: "Conservation",
|
355 |
+
898: "R&D",
|
356 |
+
899: "Other",
|
357 |
+
# 9. Immigration
|
358 |
+
900: "Immigration",
|
359 |
+
# 10. Transportation
|
360 |
+
1000: "General",
|
361 |
+
1001: "Mass",
|
362 |
+
1002: "Highways",
|
363 |
+
1003: "Air Travel",
|
364 |
+
1005: "Railroad Travel",
|
365 |
+
1007: "Maritime",
|
366 |
+
1010: "Infrastructure",
|
367 |
+
1098: "R&D",
|
368 |
+
1099: "Other",
|
369 |
+
# 12. Law and Crime
|
370 |
+
1200: "General",
|
371 |
+
1201: "Agencies",
|
372 |
+
1202: "White Collar Crime",
|
373 |
+
1203: "Illegal Drugs",
|
374 |
+
1204: "Court Administration",
|
375 |
+
1205: "Prisons",
|
376 |
+
1206: "Juvenile Crime",
|
377 |
+
1207: "Child Abuse",
|
378 |
+
1208: "Family Issues",
|
379 |
+
1210: "Criminal & Civil Code",
|
380 |
+
1211: "Crime Control",
|
381 |
+
1227: "Police",
|
382 |
+
1299: "Other",
|
383 |
+
# 13. Social Welfare
|
384 |
+
1300: "General",
|
385 |
+
1302: "Low-Income Assistance",
|
386 |
+
1303: "Elderly Assistance",
|
387 |
+
1304: "Disabled Assistance",
|
388 |
+
1305: "Volunteer Associations",
|
389 |
+
1308: "Child Care",
|
390 |
+
1399: "Other",
|
391 |
+
# 14. Housing
|
392 |
+
1400: "General",
|
393 |
+
1401: "Community Development",
|
394 |
+
1403: "Urban Development",
|
395 |
+
1404: "Rural Housing",
|
396 |
+
1405: "Rural Development",
|
397 |
+
1406: "Low-Income Assistance",
|
398 |
+
1407: "Veterans",
|
399 |
+
1408: "Elderly",
|
400 |
+
1409: "Homeless",
|
401 |
+
1498: "R&D",
|
402 |
+
1499: "Other",
|
403 |
+
# 15. Domestic Commerce
|
404 |
+
1500: "General",
|
405 |
+
1501: "Banking",
|
406 |
+
1502: "Securities & Commodities",
|
407 |
+
1504: "Consumer Finance",
|
408 |
+
1505: "Insurance Regulation",
|
409 |
+
1507: "Bankruptcy",
|
410 |
+
1520: "Corporate Management",
|
411 |
+
1521: "Small Businesses",
|
412 |
+
1522: "Copyrights and Patents",
|
413 |
+
1523: "Disaster Relief",
|
414 |
+
1524: "Tourism",
|
415 |
+
1525: "Consumer Safety",
|
416 |
+
1526: "Sports Regulation",
|
417 |
+
1598: "R&D",
|
418 |
+
1599: "Other",
|
419 |
+
# 16. Defense
|
420 |
+
1600: "General",
|
421 |
+
1602: "Alliances",
|
422 |
+
1603: "Intelligence",
|
423 |
+
1604: "Readiness",
|
424 |
+
1605: "Nuclear Arms",
|
425 |
+
1606: "Military Aid",
|
426 |
+
1608: "Personnel Issues",
|
427 |
+
1610: "Procurement",
|
428 |
+
1611: "Installations & Land",
|
429 |
+
1612: "Reserve Forces",
|
430 |
+
1614: "Hazardous Waste",
|
431 |
+
1615: "Civil",
|
432 |
+
1616: "Civilian Personnel",
|
433 |
+
1617: "Contractors",
|
434 |
+
1619: "Foreign Operations",
|
435 |
+
1620: "Claims against Military",
|
436 |
+
1698: "R&D",
|
437 |
+
1699: "Other",
|
438 |
+
# 17. Technology
|
439 |
+
1700: "General",
|
440 |
+
1701: "Space",
|
441 |
+
1704: "Commercial Use of Space",
|
442 |
+
1705: "Science Transfer",
|
443 |
+
1706: "Telecommunications",
|
444 |
+
1707: "Broadcast",
|
445 |
+
1708: "Weather Forecasting",
|
446 |
+
1709: "Computers",
|
447 |
+
1798: "R&D",
|
448 |
+
1799: "Other",
|
449 |
+
# 18. Foreign Trade
|
450 |
+
1800: "General",
|
451 |
+
1802: "Trade Agreements",
|
452 |
+
1803: "Exports",
|
453 |
+
1804: "Private Investments",
|
454 |
+
1806: "Competitiveness",
|
455 |
+
1807: "Tariff & Imports",
|
456 |
+
1808: "Exchange Rates",
|
457 |
+
1899: "Other",
|
458 |
+
# 19. International Affairs
|
459 |
+
1900: "General",
|
460 |
+
1901: "Foreign Aid",
|
461 |
+
1902: "Resources Exploitation",
|
462 |
+
1905: "Developing Countries",
|
463 |
+
1906: "International Finance",
|
464 |
+
1910: "Western Europe",
|
465 |
+
1921: "Specific Country",
|
466 |
+
1925: "Human Rights",
|
467 |
+
1926: "Organizations",
|
468 |
+
1927: "Terrorism",
|
469 |
+
1929: "Diplomats",
|
470 |
+
1999: "Other",
|
471 |
+
# 20. Government Operations
|
472 |
+
2000: "General",
|
473 |
+
2001: "Intergovernmental Relations",
|
474 |
+
2002: "Bureaucracy",
|
475 |
+
2003: "Postal Service",
|
476 |
+
2004: "Employees",
|
477 |
+
2005: "Appointments",
|
478 |
+
2006: "Currency",
|
479 |
+
2007: "Procurement & Contractors",
|
480 |
+
2008: "Property Management",
|
481 |
+
2009: "Tax Administration",
|
482 |
+
2010: "Scandals",
|
483 |
+
2011: "Branch Relations",
|
484 |
+
2012: "Political Campaigns",
|
485 |
+
2013: "Census & Statistics",
|
486 |
+
2014: "Capital City",
|
487 |
+
2015: "Claims against the government",
|
488 |
+
2030: "National Holidays",
|
489 |
+
2099: "Other",
|
490 |
+
# 21. Public Lands
|
491 |
+
2100: "General",
|
492 |
+
2101: "National Parks",
|
493 |
+
2102: "Indigenous Affairs",
|
494 |
+
2103: "Public Lands",
|
495 |
+
2104: "Water Resources",
|
496 |
+
2105: "Dependencies & Territories",
|
497 |
+
2199: "Other",
|
498 |
+
# 23. Culture
|
499 |
+
2300: "General",
|
500 |
+
# NPC
|
501 |
+
9999: "No Policy Content",
|
502 |
+
}
|
503 |
+
|
MANIFESTO_LABEL_NAMES = {
    0: "No Policy Goal",
    999: "No Policy Goal",

    704: "Middle Class and Professional Groups",
    705: "Underprivileged Minority Groups",
    706: "Non-economic Demographic Groups"
+}
+
+ILLFRAMES_MIGRATION_LABEL_NAMES = {
+    901: "Culture Under Attack",
+    902: "Economic Burden",
+    903: "Illegals and Fraudsters",
+    904: "Extradition Necessity",
+    905: "Nation State Should Decide",
+    906: "Administrative Burden",
+    907: "General System Failure",
+    908: "Security Threat",
+    909: "Criminals",
+    910: "Welfare State Overload",
+    999: "None of Them",
+}
+
+ILLFRAMES_COVID_LABEL_NAMES = {
+    310: "Skepticism",
+    311: "Great Reset and Elite Control",
+    312: "Undermining the Economy",
+    313: "Medical Choice",
+    314: "Media Fabrication",
+    315: "Threatening Way of Life",
+    399: "None of Them",
+}
+
+ILLFRAMES_WAR_LABEL_NAMES = {
+    101: 'Identity and Cultural Threat',
+    102: 'Economic Fallout/Domestic Welfare Neglected',
+    103: 'Violation of Russian Sovereignty/Western geopolitical meddling',
+    104: 'Illegitimate and corrupt Ukraine leadership',
+    105: 'Ukrainians and Ukraine are a military threat and aggressive war-mongerer that threaten EU stability and security',
+    107: 'Western Propaganda and Civilian Suffering',
+    108: 'Historical Betrayal of Russia',
+    109: 'Ukraine/Nazi Allegation',
+    110: "None of Them"
+}
+
+ONTOLISST_LABEL_NAMES = {
+    0: 'Demographics',
+    1: 'Housing and local environment (Housing and environment)',
+    2: 'Physical health',
+    3: 'Mental health and mental processes',
+    4: 'Healthcare',
+    5: 'Health behaviour (Health and lifestyle)',
+    6: 'Family and social networks',
+    7: 'Education',
+    8: 'Employment and income (Employment and pensions)',
+    9: 'Expectation, attitudes and beliefs (Attitudes and beliefs)',
+    10: 'Child development',
+    11: 'Life events',
+    12: 'Omics',
+    13: 'Pregnancy',
+    14: 'Administration',
+    15: 'COVID19'
+}
+EMOTION9_LABEL_NAMES = {
+    0: "Anger",
+    1: "Fear",
+    2: "Disgust",
+    3: "Sadness",
+    4: "Joy",
+    5: "Enthusiasm",
+    6: "Hope",
+    7: "Pride",
+    8: "None of Them",
+}
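
These dictionaries map the integer class IDs returned by the classifiers to human-readable labels. For illustration only (not part of this commit), a minimal sketch of decoding a prediction with EMOTION9_LABEL_NAMES; the logits are dummy values and numpy is the version pinned in requirements.txt:

import numpy as np

EMOTION9_LABEL_NAMES = {
    0: "Anger", 1: "Fear", 2: "Disgust", 3: "Sadness", 4: "Joy",
    5: "Enthusiasm", 6: "Hope", 7: "Pride", 8: "None of Them",
}

# Dummy logits standing in for a model's output over the nine classes.
logits = np.array([0.1, 2.3, -0.5, 0.0, 1.7, 0.2, 0.9, -1.1, 0.4])

# Softmax, then take the most probable class and look up its label.
probs = np.exp(logits - logits.max())
probs /= probs.sum()
pred_id = int(probs.argmax())
print(EMOTION9_LABEL_NAMES[pred_id], round(float(probs[pred_id]), 3))
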
requirements.txt
CHANGED
@@ -4,4 +4,5 @@ transformers==4.39.1
sentencepiece==0.2.0
accelerate
spacy
-huspacy
+huspacy
+numpy==1.26.4
utils.py
CHANGED
@@ -1,36 +1,72 @@
import os
-
+import shutil
+import subprocess

-
-from interfaces.manifesto import languages as languages_manifesto
-from interfaces.manifesto import languages as languages_manifesto
-from interfaces.manifesto import languages as languages_manifesto
-"""
+from transformers import AutoTokenizer, AutoModelForSequenceClassification

from interfaces.cap import languages as languages_cap
from interfaces.cap import domains as domains_cap

+from interfaces.emotion9 import languages as languages_emotion9
+
+from interfaces.illframes import domains as domains_illframes
+
from interfaces.cap import build_huggingface_path as hf_cap_path
+from interfaces.cap_minor import build_huggingface_path as hf_cap_minor_path
from interfaces.manifesto import build_huggingface_path as hf_manifesto_path
from interfaces.sentiment import build_huggingface_path as hf_sentiment_path
from interfaces.emotion import build_huggingface_path as hf_emotion_path
+from interfaces.emotion9 import build_huggingface_path as hf_emotion9_path
+from interfaces.ontolisst import build_huggingface_path as hf_ontlisst_path
+from interfaces.illframes import build_huggingface_path as hf_illframes_path
+from interfaces.ontolisst import build_huggingface_path as hf_ontolisst_path

HF_TOKEN = os.environ["hf_read"]

# should be a temporary solution
-models = [hf_manifesto_path(""), hf_sentiment_path(""), hf_emotion_path("")]
+models = [hf_manifesto_path(""), hf_sentiment_path(""), hf_emotion_path(""), hf_cap_minor_path("", ""), hf_ontolisst_path("")]
+
+# it gets more difficult with cap
+domains_cap = list(domains_cap.values())
for language in languages_cap:
    for domain in domains_cap:
        models.append(hf_cap_path(language, domain))
+
+# emotion9
+for language in languages_emotion9:
+    models.append(hf_emotion9_path(language))
+
+# illframes (domains is a dict for some reason?)
+for domain in domains_illframes.values():
+    models.append(hf_illframes_path(domain))

tokenizers = ["xlm-roberta-large"]

def download_hf_models():
    for model_id in models:
-
+        AutoModelForSequenceClassification.from_pretrained(model_id, low_cpu_mem_usage=True, device_map="auto", offload_folder="offload",
                                                            token=HF_TOKEN)
-        del model
    for tokenizer_id in tokenizers:
-
-
+        AutoTokenizer.from_pretrained(tokenizer_id)
+
+
+def df_h():
+    result = subprocess.run(["df", "-H"], capture_output=True, text=True)
+    print(result.stdout)
+
+
+def set_hf_cache_dir(path:str):
+    os.environ['TRANSFORMERS_CACHE'] = path
+    os.environ['HF_HOME'] = path
+    os.environ['HF_DATASETS_CACHE'] = path
+    os.environ['TORCH_HOME'] = path
+

+def is_disk_full(min_free_space_in_GB=10):
+    total, used, free = shutil.disk_usage("/")
+    free_gb = free / (1024 ** 3)
+
+    if free_gb >= min_free_space_in_GB:
+        return False
+    else:
+        return True
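
For illustration only (not part of this commit): a minimal sketch of how the new helpers above might be combined at startup, assuming the Space's environment (the hf_read secret is set and a writable cache path exists; "/data/" is just an example value):

from utils import set_hf_cache_dir, is_disk_full, download_hf_models, df_h

set_hf_cache_dir("/data/")                     # example path for the HF/torch caches
if not is_disk_full(min_free_space_in_GB=10):  # skip pre-download when disk space is low
    download_hf_models()
df_h()                                         # print `df -H` output for debugging
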