File size: 7,812 Bytes
c12bd84 dfa14a8 843a5ef a79afe8 c12bd84 03ade34 c12bd84 a5fb364 43b4e29 ca8d4b9 43b4e29 8488477 e3642ff 8488477 e3642ff 8488477 e3642ff a34a60b c671de9 e3642ff 43b4e29 8488477 a34a60b 6a7ad7c a34a60b 8488477 337b761 8488477 337b761 b94ee8f 337b761 8488477 337b761 8488477 337b761 7ed3839 337b761 7ed3839 ca8d4b9 a79afe8 ca8d4b9 a5fb364 ca8d4b9 7ed3839 a79afe8 7ed3839 c671de9 a5fb364 c671de9 a5fb364 c671de9 a5fb364 a79afe8 4fbdb10 a79afe8 a5fb364 4fbdb10 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 |
import streamlit as st
import pandas as pd
import plotly.express as px
from result_data_processor import ResultDataProcessor
import matplotlib.pyplot as plt
import numpy as np
def plot_top_n(df, target_column, n=10):
top_n = df.nlargest(n, target_column)
# Initialize the bar plot
fig, ax1 = plt.subplots(figsize=(10, 5))
# Set width for each bar and their positions
width = 0.28
ind = np.arange(len(top_n))
# Plot target_column and MMLU_average on the primary y-axis with adjusted positions
ax1.bar(ind - width, top_n[target_column], width=width, color='blue', label=target_column)
ax1.bar(ind, top_n['MMLU_average'], width=width, color='orange', label='MMLU_average')
# Set the primary y-axis labels and title
ax1.set_title(f'Top {n} performing models on {target_column}')
ax1.set_xlabel('Model')
ax1.set_ylabel('Score')
# Create a secondary y-axis for Parameters
ax2 = ax1.twinx()
# Plot Parameters as bars on the secondary y-axis with adjusted position
ax2.bar(ind + width, top_n['Parameters'], width=width, color='red', label='Parameters')
# Set the secondary y-axis labels
ax2.set_ylabel('Parameters', color='red')
ax2.tick_params(axis='y', labelcolor='red')
# Set the x-ticks and their labels
ax1.set_xticks(ind)
ax1.set_xticklabels(top_n.index, rotation=45, ha="right")
# Adjust the legend
fig.tight_layout()
fig.legend(loc='center left', bbox_to_anchor=(1, 0.5))
# Show the plot
st.pyplot(fig)
data_provider = ResultDataProcessor()
# st.title('Model Evaluation Results including MMLU by task')
st.title('MMLU-by-Task Evaluation Results for 500+ Open Source Models')
st.markdown("""***Last updated August 7th***""")
st.markdown("""
Hugging Face has run evaluations on over 500 open source models and provides results on a
[publicly available leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) and [dataset](https://huggingface.co/datasets/open-llm-leaderboard/results).
The leaderboard currently displays the overall result for MMLU. This page shows individual accuracy scores for all 57 tasks of the MMLU evaluation.
[Preliminary analysis of MMLU-by-Task data](https://coreymorrisdata.medium.com/preliminary-analysis-of-mmlu-evaluation-data-insights-from-500-open-source-models-e67885aa364b)
""")
filters = st.checkbox('Select Models and Evaluations')
# Create defaults for selected columns and models
selected_columns = data_provider.data.columns.tolist()
selected_models = data_provider.data.index.tolist()
if filters:
# Create checkboxes for each column
selected_columns = st.multiselect(
'Select Columns',
data_provider.data.columns.tolist(),
default=selected_columns
)
selected_models = st.multiselect(
'Select Models',
data_provider.data.index.tolist(),
default=selected_models
)
# Get the filtered data
st.header('Sortable table')
filtered_data = data_provider.get_data(selected_models)
# sort the table by the MMLU_average column
filtered_data = filtered_data.sort_values(by=['MMLU_average'], ascending=False)
st.dataframe(filtered_data[selected_columns])
# CSV download
filtered_data.index.name = "Model Name"
csv = filtered_data.to_csv(index=True)
st.download_button(
label="Download data as CSV",
data=csv,
file_name="model_evaluation_results.csv",
mime="text/csv",
)
def create_plot(df, arc_column, moral_column, models=None):
if models is not None:
df = df[df.index.isin(models)]
# remove rows with NaN values
df = df.dropna(subset=[arc_column, moral_column])
plot_data = pd.DataFrame({
'Model': df.index,
arc_column: df[arc_column],
moral_column: df[moral_column],
})
plot_data['color'] = 'purple'
fig = px.scatter(plot_data, x=arc_column, y=moral_column, color='color', hover_data=['Model'], trendline="ols")
fig.update_layout(showlegend=False,
xaxis_title=arc_column,
yaxis_title=moral_column,
xaxis = dict(),
yaxis = dict())
# Add a dashed line at 0.25 for the moral columns
x_min = df[arc_column].min()
x_max = df[arc_column].max()
y_min = df[moral_column].min()
y_max = df[moral_column].max()
if arc_column.startswith('MMLU'):
fig.add_shape(
type='line',
x0=0.25, x1=0.25,
y0=y_min, y1=y_max,
line=dict(
color='red',
width=2,
dash='dash'
)
)
if moral_column.startswith('MMLU'):
fig.add_shape(
type='line',
x0=x_min, x1=x_max,
y0=0.25, y1=0.25,
line=dict(
color='red',
width=2,
dash='dash'
)
)
return fig
# Custom scatter plots
st.header('Custom scatter plots')
st.write("The dashed red line represents the random chance performance of 0.25")
selected_x_column = st.selectbox('Select x-axis', filtered_data.columns.tolist(), index=0)
selected_y_column = st.selectbox('Select y-axis', filtered_data.columns.tolist(), index=3)
if selected_x_column != selected_y_column: # Avoid creating a plot with the same column on both axes
fig = create_plot(filtered_data, selected_x_column, selected_y_column)
st.plotly_chart(fig)
else:
st.write("Please select different columns for the x and y axes.")
# end of custom scatter plots
st.markdown("## Notable findings and plots")
st.markdown("### Moral Scenarios Performance")
fig = create_plot(filtered_data, 'MMLU_average', 'MMLU_moral_scenarios')
st.plotly_chart(fig)
fig = create_plot(filtered_data, 'Parameters', 'MMLU_moral_scenarios')
st.plotly_chart(fig)
fig = px.histogram(filtered_data, x="MMLU_moral_scenarios", marginal="rug", hover_data=filtered_data.columns)
st.plotly_chart(fig)
st.header('Abstract Algebra Performance')
st.write("Small models showed surprisingly strong performance on the abstract algebra task. A 6 Billion parameter model is tied for the best performance on this task and there are a number of other small models in the top 10.")
# Usage example:
plot_top_n(filtered_data, 'MMLU_abstract_algebra', 10)
fig = create_plot(filtered_data, 'Parameters', 'MMLU_abstract_algebra')
st.plotly_chart(fig)
st.markdown("***Thank you to hugging face for running the evaluations and supplying the data as well as the original authors of the evaluations.***")
st.markdown("""
# References
1. Edward Beeching, Clémentine Fourrier, Nathan Habib, Sheon Han, Nathan Lambert, Nazneen Rajani, Omar Sanseviero, Lewis Tunstall, Thomas Wolf. (2023). *Open LLM Leaderboard*. Hugging Face. [link](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard)
2. Gao, Leo et al. (2021). *A framework for few-shot language model evaluation*. Zenodo. [link](https://doi.org/10.5281/zenodo.5371628)
3. Peter Clark, Isaac Cowhey, Oren Etzioni, Tushar Khot, Ashish Sabharwal, Carissa Schoenick, Oyvind Tafjord. (2018). *Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge*. arXiv. [link](https://arxiv.org/abs/1803.05457)
4. Rowan Zellers, Ari Holtzman, Yonatan Bisk, Ali Farhadi, Yejin Choi. (2019). *HellaSwag: Can a Machine Really Finish Your Sentence?*. arXiv. [link](https://arxiv.org/abs/1905.07830)
5. Dan Hendrycks, Collin Burns, Steven Basart, Andy Zou, Mantas Mazeika, Dawn Song, Jacob Steinhardt. (2021). *Measuring Massive Multitask Language Understanding*. arXiv. [link](https://arxiv.org/abs/2009.03300)
6. Stephanie Lin, Jacob Hilton, Owain Evans. (2022). *TruthfulQA: Measuring How Models Mimic Human Falsehoods*. arXiv. [link](https://arxiv.org/abs/2109.07958)
""")
|