---
license: apache-2.0
datasets: google-research-datasets/go_emotions
base_model: FacebookAI/xlm-roberta-base
language:
- de
metrics:
- f1_macro: 0.45
- accuracy: 0.41
- kappa: 0.42
pipeline_tag: text-classification
tags:
- medical
model_description: >-
  This model is essentially a German version of arpanghoshal/EmoRoBERTa. We
  translated the go_emotions dataset into German and fine-tuned the
  FacebookAI/xlm-roberta-base model on it. The model therefore classifies 28
  emotions in German transcripts ('admiration', 'amusement', 'anger',
  'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire',
  'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement',
  'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism',
  'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise',
  'neutral'). A paper describing the model will be published soon.
---

# Model Card for G-E5-rman-Emotions

This model is essentially a German version of arpanghoshal/EmoRoBERTa. We translated the go_emotions dataset into German and fine-tuned the FacebookAI/xlm-roberta-base model on it. The model therefore classifies **28 emotions** in German transcripts (**'admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral'**). A paper describing the model will be published soon.

## Model Details

- **Model type:** text-classification
- **Language(s) (NLP):** German
- **License:** apache-2.0
- **Finetuned from model:** intfloat/multilingual-e5-large
- **Hyperparameters:**
  - Epochs: 10
  - learning_rate: 3e-5
  - weight_decay: 0.01
- **Metrics:**
  - accuracy: 0.41
  - f1 (macro): 0.45
  - kappa: 0.42

---

## How to Get Started with the Model

Use the code below to get started with the model.

```python # pip install transformers[torch] # pip install pandas, transformers, numpy, tqdm, openpyxl import pandas as pd import torch from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer import numpy as np from tqdm import tqdm import time import os from transformers import DataCollatorWithPadding import json # create base path and input and output path for the model folder and the file folder base_path = "/share/users/staff/c/clalk/Emotionen" model_path = os.path.join(base_path, 'Modell') file_path = os.path.join(base_path, 'Datensatz') MODEL = "FacebookAI/xlm-roberta-base" tokenizer = AutoTokenizer.from_pretrained(MODEL, do_lower_case=False) model = AutoModelForSequenceClassification.from_pretrained( model_path, from_tf=False, from_flax=False, trust_remote_code=False, num_labels=28, ignore_mismatched_sizes=True ) data_collator = DataCollatorWithPadding(tokenizer=tokenizer) # Path to the file os.chdir(file_path) df_full = pd.read_excel("speech_turns_pat.xlsx", index_col=None) if 'Unnamed: 0' in df_full.columns: df_full = df_full.drop(columns=['Unnamed: 0']) df_full.reset_index(drop=True, inplace=True) # Tokenization and inference function def infer_texts(texts): tokenized_texts = tokenizer(texts, return_tensors="pt", padding=True, truncation=True) class SimpleDataset: def __init__(self, tokenized_texts): self.tokenized_texts = tokenized_texts def __len__(self): return len(self.tokenized_texts["input_ids"]) def __getitem__(self, idx): return {k: v[idx] for k, v in self.tokenized_texts.items()} test_dataset = SimpleDataset(tokenized_texts) trainer = Trainer(model=model, data_collator=data_collator) predictions = trainer.predict(test_dataset) sigmoid = torch.nn.Sigmoid() probs = sigmoid(torch.Tensor(predictions.predictions)) return np.round(np.array(probs), 3).tolist() start_time = time.time() df = df_full # Save results in a dict, here the df contains the additional variables File, Class, session, short_id, long_id, Prediction, 
hscl-11, and srs. # However, only the "Sentence" column with the text is relevant for the pipeline. results = [] for index, row in tqdm(df.iterrows(), total=df.shape[0]): patient_texts = row['Patient'] prob_list = infer_texts(patient_texts) results.append({ "File": row['Class']+"_"+row['session'], "Class": row['Class'], "session": row['session'], "short_id": row["short_id"], "long_id": row["long_id"], "Sentence": patient_texts, "Prediction": prob_list[0], "hscl-11": row["Gesamtscore_hscl"], "srs": row["srs_ges"], }) # Convert results to df df_results = pd.DataFrame(results) df_results.to_json("emo_speech_turn_inference.json") end_time = time.time() elapsed_time = end_time - start_time print(f"Elapsed time: {elapsed_time:.2f} seconds") print(df_results) emo_df = pd.DataFrame(df_results['Prediction'].tolist(), index=df_results["Class"].index) col_names = ['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral'] emo_df.columns = col_names print(emo_df) ```