Model Card for German-Emotions

This model is essentially a German version of arpanghoshal/EmoRoBERTa. We translated the go_emotions dataset into German and fine-tuned the FacebookAI/xlm-roberta-base model on it. The model therefore classifies German transcripts into 28 emotion labels ('admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral'). A paper will be published soon.

Model Details

Model type: text-classification
Language(s) (NLP): German
License: apache-2.0
Finetuned from model: FacebookAI/xlm-roberta-base
Hyperparameters:
- Epochs: 10
- learning_rate: 3e-5
- weight_decay: 0.01
Metrics:
- accuracy: 0.41
- f1: 0.45
- kappa: 0.42

Classification Metrics

Emotion	Sentiment	F1	Cohen’s Kappa
admiration	positive	0.64	0.601
amusement	positive	0.78	0.767
anger	negative	0.38	0.358
annoyance	negative	0.27	0.229
approval	positive	0.34	0.293
caring	positive	0.38	0.365
confusion	negative	0.40	0.378
curiosity	positive	0.51	0.486
desire	positive	0.39	0.387
disappointment	negative	0.19	0.170
disapproval	negative	0.32	0.286
disgust	negative	0.41	0.395
embarrassment	negative	0.37	0.367
excitement	positive	0.35	0.339
fear	negative	0.59	0.584
gratitude	positive	0.89	0.882
grief	negative	0.31	0.307
joy	positive	0.51	0.499
love	positive	0.73	0.721
nervousness	negative	0.28	0.276
optimism	positive	0.53	0.512
pride	positive	0.30	0.299
realization	positive	0.17	0.150
relief	positive	0.27	0.266
remorse	negative	0.55	0.545
sadness	negative	0.50	0.488
surprise	neutral	0.53	0.514
neutral	neutral	0.60	0.410

How to Get Started with the Model

Use the code below to get started with the model.

# pip install transformers[torch]
# pip install pandas transformers numpy tqdm openpyxl
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer
import numpy as np
from tqdm import tqdm
import time
import os
from transformers import DataCollatorWithPadding
import json

# Base directory that holds the local model folder ('Modell') and the data
# folder ('Datensatz'). Adjust base_path to your own environment.
base_path = "/share/users/staff/c/clalk/Emotionen"
model_path = os.path.join(base_path, 'Modell')
file_path = os.path.join(base_path, 'Datensatz')

MODEL = "ChrisLalk/German-Emotions"
tokenizer = AutoTokenizer.from_pretrained(MODEL, do_lower_case=False)

# Use the local checkpoint when it exists; otherwise fall back to the Hub
# repository so the snippet also runs on machines without the local folder.
model_source = model_path if os.path.isdir(model_path) else MODEL
model = AutoModelForSequenceClassification.from_pretrained(
    model_source,
    from_tf=False,
    from_flax=False,
    trust_remote_code=False,
    num_labels=28,
    # NOTE(review): with ignore_mismatched_sizes=True a checkpoint whose
    # classification head does not have 28 outputs is loaded with a newly
    # initialised head instead of raising an error -- confirm that the
    # checkpoint really has 28 labels.
    ignore_mismatched_sizes=True,
)
# Pads each batch to the length of its longest sequence using the tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Switch into the data folder so the relative file name below resolves there.
os.chdir(file_path)

# Read the speech turns, discard the leftover 'Unnamed: 0' column if the
# sheet contains one, and renumber the rows from 0.
df_full = (
    pd.read_excel("speech_turns_pat.xlsx", index_col=None)
    .drop(columns=['Unnamed: 0'], errors='ignore')
    .reset_index(drop=True)
)

# Tokenization and inference function
def infer_texts(texts):
    """Return rounded sigmoid scores for ``texts``.

    ``texts`` is handed to the module-level ``tokenizer`` (padding and
    truncation enabled). Predictions are obtained through a ``Trainer``
    built around the module-level ``model`` and ``data_collator``, passed
    through a sigmoid, rounded to three decimals and returned as a nested
    list with one inner list per tokenized input.
    """
    encoded = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)

    class _EncodedDataset:
        """Minimal indexable wrapper around the tokenizer output."""

        def __init__(self, encodings):
            self.encodings = encodings

        def __len__(self):
            return len(self.encodings["input_ids"])

        def __getitem__(self, position):
            # One example: the entry at ``position`` from every encoded field.
            return {name: values[position] for name, values in self.encodings.items()}

    runner = Trainer(model=model, data_collator=data_collator)
    output = runner.predict(_EncodedDataset(encoded))
    probabilities = torch.sigmoid(torch.Tensor(output.predictions))
    return np.round(probabilities.numpy(), 3).tolist()

start_time = time.time()
df = df_full

# Collect one result record per row in a list. Only the "Patient" column (the
# text) is passed to the model; the other columns read below (Class, session,
# short_id, long_id, Gesamtscore_hscl, srs_ges) are copied into the record
# unchanged. The text is stored under the key "Sentence".
results = []
for index, row in tqdm(df.iterrows(), total=df.shape[0]):
    patient_texts = row['Patient']
    prob_list = infer_texts(patient_texts)
    results.append({
        # Built with an f-string so that a non-string session value (e.g. a
        # number read from Excel) does not raise TypeError on concatenation.
        "File": f"{row['Class']}_{row['session']}",
        "Class": row['Class'],
        "session": row['session'],
        "short_id": row["short_id"],
        "long_id": row["long_id"],
        "Sentence": patient_texts,
        # infer_texts returns a nested list; keep its first entry for this row.
        "Prediction": prob_list[0],
        "hscl-11": row["Gesamtscore_hscl"],
        "srs": row["srs_ges"],
    })

# Convert the records to a DataFrame and write them to a JSON file.
df_results = pd.DataFrame(results)
df_results.to_json("emo_speech_turn_inference.json")

end_time = time.time()
elapsed_time = end_time - start_time
print(f"Elapsed time: {elapsed_time:.2f} seconds")
print(df_results)

# Label names assigned, in this order, to the columns of the prediction table.
col_names = [
    'admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring',
    'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval',
    'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief',
    'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization',
    'relief', 'remorse', 'sadness', 'surprise', 'neutral',
]

# Expand each row's prediction list into one column per label, keeping the
# row index of df_results.
emo_df = pd.DataFrame(
    df_results['Prediction'].tolist(),
    index=df_results.index,
    columns=col_names,
)
print(emo_df)

ChrisLalk
/

German-Emotions

Model Card for German-Emotions

Model Details

Classification Metrics

How to Get Started with the Model

Model tree for ChrisLalk/German-Emotions

Dataset used to train ChrisLalk/German-Emotions