Spaces:
Running
Running
import gradio as gr | |
import pandas as pd | |
import os | |
import plotly.express as px | |
import numpy as np | |
# Path constant; nothing in the visible part of this file reads it.
# NOTE(review): confirm whether it is still needed before removing.
datadir = 'data/emissions/complete'

# Model ids that encoder_check() labels 'Task-specific Seq2Seq'.
seq2seq_finetuned = [
    'sshleifer/distilbart-xsum-12-6',
    'sshleifer/distilbart-cnn-12-6',
    'sshleifer/distilbart-cnn-6-6',
    'pszemraj/led-large-book-summary',
    'google/pegasus-xsum',
    'google/pegasus-large',
    'google/pegasus-multi_news',
    'facebook/bart-large-cnn',
    'ainize/bart-base-cnn',
]

# One fixed colour per model-type label (the labels encoder_check() returns),
# so a type keeps the same colour in every plot that uses this map.
color_discrete_map = {
    'Task-specific Encoder': '#636EFA',
    'Multi-purpose Seq2Seq': '#AB63FA',
    'Multi-purpose Decoder': '#00CC96',
    'Task-specific Seq2Seq': '#EF553B',
}
def multi_check(mname):
    """Return the architecture family of a multi-purpose model.

    Args:
        mname: Model identifier string (substring-matched, case-sensitive).

    Returns:
        'Seq2Seq' if the name contains 'flan', 'Decoder' if it contains
        'bloomz', and None for any other name. 'flan' is checked first, so
        it wins if both substrings are present.
    """
    if 'flan' in mname:
        return 'Seq2Seq'
    if 'bloomz' in mname:
        return 'Decoder'
    # Neither marker matched: keep the historical "no label" result explicit.
    return None
def encoder_check(mname):
    """Classify a model id into one of the four model-type labels.

    The checks run in this order, and the first match wins:

    1. name contains 'flan'              -> 'Multi-purpose Seq2Seq'
    2. name is in ``seq2seq_finetuned``  -> 'Task-specific Seq2Seq'
    3. name contains 'bloomz'            -> 'Multi-purpose Decoder'
    4. anything else                     -> 'Task-specific Encoder'

    Args:
        mname: Model identifier string.

    Returns:
        One of the four labels above; these are the keys of
        ``color_discrete_map``.
    """
    if 'flan' in mname:
        return 'Multi-purpose Seq2Seq'
    if mname in seq2seq_finetuned:
        return 'Task-specific Seq2Seq'
    if 'bloomz' in mname:
        return 'Multi-purpose Decoder'
    return 'Task-specific Encoder'
# Data loading
model_param_df = pd.read_csv('data/model_parameters.csv', header=0)
model_performance_df = pd.read_csv('data/performance.csv', header=0)
emissions_df = pd.read_csv('data/co2_data.csv', header=0)
modalities_df = pd.read_csv('data/modalities_data.csv', header=0)

# Split the emissions rows on whether the task name contains 'zero'.
# `.copy()` gives each subset its own data, so the column assignments below
# modify an independent frame instead of a slice of `emissions_df`
# (avoids pandas' SettingWithCopyWarning / silent no-op writes).
_is_zeroshot = emissions_df['task'].str.contains('zero')

finetuned_df = emissions_df[~_is_zeroshot].copy()
# regex=False: '_' is a literal; do not depend on the version-specific default.
finetuned_df['task'] = finetuned_df['task'].str.replace('_', ' ', regex=False)

zeroshot_df = emissions_df[_is_zeroshot].copy()
zeroshot_df['task'] = zeroshot_df['task'].str.replace('_', ' ', regex=False)
zeroshot_df['architecture_type'] = zeroshot_df['model'].apply(multi_check)

# Average the numeric measurements per (model, task) pair.
# numeric_only=True: pandas >= 2.0 raises on non-numeric columns otherwise.
# The emissions-side 'task' key is dropped before the merge.
# NOTE(review): the 'task' column filtered on below is presumably supplied by
# performance.csv; confirm against that file.
grouped_df = (
    emissions_df.groupby(['model', 'task'])
    .mean(numeric_only=True)
    .reset_index()
    .drop('task', axis=1)
)

performance_all = pd.merge(grouped_df, model_performance_df, on='model')
performance_all['type'] = performance_all['model'].apply(encoder_check)
# log1p keeps the value finite when emissions are 0; used as marker size.
performance_all['log_emissions'] = np.log1p(performance_all["query emissions (g)"])

# Columns every per-task evaluation frame carries after its metric columns.
_shared_cols = ["query emissions (g)", 'model', 'type', 'num_params', 'log_emissions']

sent_df = performance_all.loc[
    performance_all['task'].isin(['sentiment']),
    ['imdb (acc)', 'sst2 (acc)', 'tomatoes (acc)'] + _shared_cols,
]
qa_df = performance_all.loc[
    performance_all['task'].isin(['qa']),
    ['sciq (acc)', 'squad (f1)', 'squad_v2 (f1, has answer)'] + _shared_cols,
]
summ_df = performance_all.loc[
    performance_all['task'].isin(['summarization']),
    ['samsum (rouge)', 'xsum (rouge)', 'cnn (rouge)'] + _shared_cols,
]
# Figure loading

# Hover text shared by the figures that pass custom_data=['model', 'task'].
_HOVER_MODEL_TASK = "<br>".join([
    "Model: %{customdata[0]}",
    "Task: %{customdata[1]}",
])


def _performance_scatter(df, metrics, yaxis_title):
    """Build one accuracy-vs-size scatter plot for the evaluation section.

    Args:
        df: Frame with the metric columns plus 'num_params', 'type',
            'log_emissions' and 'model'.
        metrics: Metric column names plotted together on the y axis.
        yaxis_title: Label for the y axis.

    Returns:
        The configured plotly figure.
    """
    fig = px.scatter(
        df,
        y=metrics,
        x="num_params",
        color="type",
        # Same map for all three plots so a model type keeps one colour.
        color_discrete_map=color_discrete_map,
        size="log_emissions",
        log_x=True,
        # A list (not a bare string) is accepted by every plotly version.
        hover_data=["model"],
    )
    fig.update_layout(legend=dict(y=-0.4, x=0.3))
    fig.update_layout(yaxis_title=yaxis_title)
    return fig


# fig0: emissions vs. model size, every model in the study.
fig0 = px.scatter(emissions_df, x="num_params", y="query emissions (g)",
                  color="model", log_x=True, log_y=True)
fig0.update_layout(xaxis={'categoryorder': 'mean ascending'})
fig0.update_layout(yaxis_title='Total carbon emitted (g)',
                   xaxis_title='Number of Parameters')

# fig1: energy per task for the non-zero-shot rows.
# custom_data is required: the hovertemplate below reads customdata[0]/[1],
# which are empty unless these columns are attached to the traces.
fig1 = px.scatter(finetuned_df, x="task", y="query_energy (kWh)", color="model",
                  log_y=True, custom_data=['model', 'task'])
fig1.update_layout(xaxis={'categoryorder': 'mean ascending'})
# NOTE(review): the plotted column is named "(kWh)" but the axis title says
# "(Wh)" - confirm the real unit of the data and align one with the other.
fig1.update_layout(yaxis_title='Total energy used (Wh)',
                   xaxis_title='Task')
fig1.update_traces(hovertemplate=_HOVER_MODEL_TASK)

# fig2: emissions vs. model size, coloured by modality.
fig2 = px.scatter(modalities_df, x="num_params", y="query emissions (g)",
                  color="modality", log_x=True, log_y=True,
                  custom_data=['model', 'task'])
fig2.update_traces(hovertemplate=_HOVER_MODEL_TASK)
fig2.update_layout(xaxis_title='Model size (number of parameters)',
                   yaxis_title='Model emissions (g of CO<sub>2</sub>)')

# fig3: zero-shot emissions per model, coloured by architecture family.
fig3 = px.scatter(zeroshot_df, x="model", y="query emissions (g)",
                  color="architecture_type", size='num_params', log_y=True)
fig3.update_layout(xaxis={'categoryorder': 'mean ascending'})
fig3.update_layout(yaxis_title='Model emissions (g of CO<sub>2</sub>)',
                   xaxis_title='Model')

# fig4: zero-shot emissions per dataset, coloured by model.
fig4 = px.scatter(zeroshot_df, x="dataset", y="query emissions (g)",
                  color="model", size='num_params', log_y=True)
fig4.update_layout(xaxis={'categoryorder': 'mean ascending'})
# The x axis shows datasets, so label it accordingly (was 'Model').
fig4.update_layout(yaxis_title='Model emissions (g of CO<sub>2</sub>)',
                   xaxis_title='Dataset')

# fig5-fig7: task performance vs. model size; marker size is log emissions.
fig5 = _performance_scatter(
    sent_df,
    ['imdb (acc)', 'sst2 (acc)', 'tomatoes (acc)'],
    'Text Classification Accuracy',
)
fig6 = _performance_scatter(
    qa_df,
    ['sciq (acc)', 'squad (f1)', 'squad_v2 (f1, has answer)'],
    'QA accuracy/F1',
)
fig7 = _performance_scatter(
    summ_df,
    ['samsum (rouge)', 'xsum (rouge)', 'cnn (rouge)'],
    'Summarization Rouge Score',
)
# Gradio page: intro text, then one section per figure group.
# NOTE(review): the `with` nesting below is reconstructed from context because
# the source indentation was lost; confirm that the section headings between
# rows sit directly under the Blocks container.
demo = gr.Blocks()

with demo:
    # NOTE(review): the emojis were mis-encoded in the source. The laptop and
    # lightning bolt are recoverable; the first one (globe) is a best guess.
    gr.Markdown("# CO2 Inference Demo 🌍 💻 ⚡")
    gr.Markdown(
        "### TL;DR - We ran a series of experiments to measure the energy efficiency and carbon emissions of different "
        "models from the HuggingFace Hub, and to see how different tasks and models compare. "
        "We found that multi-purpose, generative models are orders of magnitude more energy-intensive than task-specific systems "
        "for a variety of tasks, even for models with a similar number of parameters"
    )
    gr.Markdown("### Explore the plots below to get more insights about the different models and tasks from our study.")
    with gr.Accordion("More details about our methodology:", open=False):
        gr.Markdown(
            "We chose ten ML tasks: text classification, token classification, question answering, "
            "masked language modeling, text generation, summarization, image classification, object detection, "
            "image captioning and image generation. For each of the tasks, we chose three of the most downloaded datasets and 8 of the most "
            "downloaded models from the Hugging Face Hub. We ran each of the models ten times over a 1,000-sample subset of each of the datasets "
            "and measured the energy consumed and carbon emitted."
        )

    with gr.Row():
        with gr.Column():
            gr.Markdown("## All models from our study (carbon)")
            gr.Markdown("### Double click on the model name in the list on the right to isolate its datapoints:")
            gr.Markdown("The axes of the plot are in logarithmic scale, meaning that the difference between the least carbon-intensive and the most carbon-intensive models is over 9,000 times!")
            gr.Plot(fig0)

    with gr.Row():
        with gr.Column():
            gr.Markdown("## Task-by-task comparison (energy)")
            gr.Markdown("### Grouping the models by task, we can see different patterns emerge:")
            gr.Markdown(
                "Image generation is by far the most energy- and carbon-intensive task from the ones studied, and text classification "
                "is the least."
            )
            gr.Plot(fig1)

    with gr.Row():
        with gr.Column():
            gr.Markdown("## Modality comparison (carbon)")
            gr.Markdown("### Grouping the models by their modality shows different characteristics:")
            gr.Markdown(
                "We can see that tasks involving images (image-to-text, image-to-category) require more energy and emit more carbon "
                "than ones involving text."
            )
            gr.Plot(fig2)

    gr.Markdown("## Multi-task model comparison (carbon)")
    gr.Markdown("### Looking at the emissions of multi-task models, we can see that decoder-only models tend to emit more carbon compared to sequence-to-sequence ones.")
    gr.Markdown("### This pattern varies depending on the dataset and task - for summarization datasets (the 3 rightmost ones), the difference between models is less obvious.")
    with gr.Row():
        with gr.Column():
            gr.Plot(fig3)
        with gr.Column():
            gr.Plot(fig4)

    gr.Markdown("## Evaluations (accuracy vs carbon)")
    gr.Markdown("### Single-task models are, ceteris paribus, less carbon-intensive than multi-task models for all 3 tasks we looked at: ")
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Sentiment Analysis")
            gr.Plot(fig5)
        with gr.Column():
            gr.Markdown("### Question Answering")
            gr.Plot(fig6)
        with gr.Column():
            gr.Markdown("### Summarization")
            gr.Plot(fig7)

# Start the web server when the script runs (kept at module level, as before).
demo.launch()