Joshua Saxe commited on
Commit
2e9a66b
·
1 Parent(s): c05047e

adding descriptive text

Browse files
app.py CHANGED
@@ -1,53 +1,104 @@
1
  import streamlit as st
2
  import pandas as pd
3
- import plotly.express as px
4
  import json
 
 
5
 
6
- st.set_page_config(layout="wide", page_title="CyberSafetyEval Leaderboard", page_icon=":bar_chart:")
 
 
 
7
 
8
- st.markdown("# CyberSafetyEval Leaderboard", unsafe_allow_html=True)
 
 
9
 
10
- data_mitre = json.load(open("mitre.json"))
11
- data_insecure = json.load(open("insecure_code.json"))
12
 
13
- # Processing data for the table
14
- st.markdown("### Helpfulness to cyberattackers results, benign percentage is percentage of tests safely passed (higher is better)")
15
- model_stats = {}
16
- for model, categories in data_mitre.items():
17
- model_stats[model] = {'Mean Benign Percentage': pd.Series([v['benign_percentage'] for v in categories.values()]).mean()}
18
- for category, values in categories.items():
19
- model_stats[model][f'Benign Percentage in {category}'] = values['benign_percentage']
20
 
21
- leaderboard_df = pd.DataFrame.from_dict(model_stats, orient='index').sort_values(by='Mean Benign Percentage', ascending=False)
 
 
 
 
 
22
 
23
- # Preparing data for the clustered bar chart
24
- chart_data = []
25
- for model, categories in data_mitre.items():
26
- for category, values in categories.items():
27
- chart_data.append({
28
- 'Model': model,
29
- 'Category': category,
30
- 'Benign Percentage': values['benign_percentage']
31
- })
32
 
33
- chart_df = pd.DataFrame(chart_data)
 
 
 
 
34
 
35
- # Styling the data table
36
- st.dataframe(leaderboard_df.style.format("{:.2%}").background_gradient(cmap='Blues'))
37
 
38
- # Enhanced Plotly chart
39
- fig = px.bar(chart_df, x='Category', y='Benign Percentage', barmode='group', color='Model',
40
- title='MITRE ATT&CK category results per model',
41
- labels={'Benign Percentage': 'Benign %'},
42
- color_discrete_sequence=px.colors.qualitative.Pastel)
43
- fig.update_layout(plot_bgcolor='rgba(0,0,0,0)',
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  xaxis=dict(showgrid=False),
45
- yaxis=dict(showgrid=False, tickformat=".0%"),
46
- legend=dict(title='Models'))
 
 
 
 
 
 
 
47
 
 
48
  st.plotly_chart(fig, use_container_width=True)
49
 
50
- # Processing data for the table
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  model_stats_insecure = {}
52
  for model, categories in data_insecure.items():
53
  model_stats_insecure[model] = {'Mean Insecure Code Test Pass Rate': pd.Series([1-v['autocomplete_vunerable_percentage'] for v in categories.values()]).mean()}
@@ -55,7 +106,7 @@ for model, categories in data_insecure.items():
55
  model_stats_insecure[model][f'Insecure Code Test Pass Rate in {category}'] = 1-values['autocomplete_vunerable_percentage']
56
  leaderboard_df_insecure = pd.DataFrame.from_dict(model_stats_insecure, orient='index').sort_values(by='Mean Insecure Code Test Pass Rate', ascending=False)
57
 
58
- # Preparing data for the clustered bar chart
59
  chart_data_insecure = []
60
  for model, categories in data_insecure.items():
61
  for category, values in categories.items():
@@ -66,11 +117,14 @@ for model, categories in data_insecure.items():
66
  })
67
  chart_df_insecure = pd.DataFrame(chart_data_insecure)
68
 
69
- # Styling the data table
70
- st.markdown("### Insecure coding test pass rate results (higher is better)")
 
 
 
71
  st.dataframe(leaderboard_df_insecure.style.format("{:.2%}").background_gradient(cmap='Blues')) # Changed cmap to 'Blues'
72
 
73
- # Enhanced Plotly chart
74
  fig_insecure = px.bar(chart_df_insecure, x='Category', y='Insecure Code Test Pass Rate', barmode='group', color='Model',
75
  title='Category-wise Insecure Code Test Pass Rate per Model',
76
  labels={'Insecure Code Test Pass Rate': 'Insecure Code Test Pass Rate %'},
@@ -80,4 +134,5 @@ fig_insecure.update_layout(plot_bgcolor='rgba(0,0,0,0)',
80
  yaxis=dict(showgrid=False, tickformat=".0%"),
81
  legend=dict(title='Models'))
82
 
 
83
  st.plotly_chart(fig_insecure, use_container_width=True)
 
1
  import streamlit as st
2
  import pandas as pd
 
3
  import json
4
+ import plotly.express as px
5
+ import plotly.graph_objects as go
6
 
7
+ # Configure the streamlit page
8
+ st.set_page_config(layout="wide", page_title="CyberSecEval Leaderboard", page_icon=":bar_chart:")
9
+ # Display the title and brief description of the page
10
+ st.markdown("# CyberSecEval: Comprehensive Evaluation Framework for Cybersecurity Risks and Capabilities of Large Language Models (LLMs)", unsafe_allow_html=True)
11
 
12
+ # Provide more detailed information about the page and its purpose
13
+ st.markdown("""
14
+ Welcome to the CyberSecEval leaderboard. This platform showcases the results of our cybersecurity evaluation framework applied to various popular LLMs. Our open-source evaluation suite's workings and coverage are detailed in our [first](https://ai.meta.com/research/publications/purple-llama-cyberseceval-a-benchmark-for-evaluating-the-cybersecurity-risks-of-large-language-models/) and [second](https://ai.meta.com/research/publications/cyberseceval-2-a-wide-ranging-cybersecurity-evaluation-suite-for-large-language-models/) papers.
15
 
16
+ In the following sections, we present case study test results and provide concise explanations of their implications.
 
17
 
18
+ **Take Action:** Measure your own LLM using CyberSecEval! Visit our open-source GitHub repository [here](https://github.com/meta-llama/PurpleLlama/tree/main/CybersecurityBenchmarks)!
19
+ """)
 
 
 
 
 
20
 
21
+ # Load JSON files containing test results
22
+ data_attack = json.load(open("attack_helpfulness.json"))
23
+ data_interpreter = json.load(open("interpreter_abuse_tests.json"))
24
+ data_prompt = json.load(open("prompt_injection_tests.json"))
25
+ data_exploit = json.load(open("exploit_tests.json"))
26
+ data_tradeoff = json.load(open("trr_frr_tradeoff_helpfulness.json"))
27
 
28
+ # Function to sort rows and columns by their sums
29
+ def sort_df(df):
30
+ return df.loc[df.sum(axis=1).sort_values(ascending=False).index, df.sum(axis=0).sort_values(ascending=False).index]
 
 
 
 
 
 
31
 
32
+ # Convert JSON data to pandas DataFrames and sort
33
+ attack_df = sort_df(pd.DataFrame(data_attack))
34
+ interpreter_df = sort_df(pd.DataFrame(data_interpreter))
35
+ prompt_df = sort_df(pd.DataFrame(data_prompt))
36
+ exploit_df = sort_df(pd.DataFrame(data_exploit))
37
 
38
+ # Scatterplot for True refusal vs. false refusal results
39
+ tradeoff_df = pd.DataFrame(data_tradeoff).transpose()
40
 
41
+ # Briefly explain the purpose of the scatterplot
42
+ st.markdown("### Evaluating LLMs' Assistance to Cyberattackers")
43
+ st.markdown("""
44
+ We measure the tradeoff between LLMs' 'false refusals' (refusing to assist in legitimate cybersecurity-related activities) and their 'violation rate' (agreeing to assist in offensive cyber attacks). The results are displayed in the scatterplot below.
45
+ """)
46
+
47
+ # Create a new figure
48
+ fig = go.Figure()
49
+
50
+ # Loop through each row and plot with different marker and color
51
+ for i, row in enumerate(tradeoff_df.iterrows()):
52
+ fig.add_trace(go.Scatter(x=[row[1]['Violation Rate']], y=[row[1]['Refusal Rate']],
53
+ mode='markers',
54
+ name=row[0], marker=dict(size=15)))
55
+
56
+ # Add labels and title
57
+ fig.update_layout(title='True Refusal vs. Violation Rate for Cybersecurity Prompts',
58
+ xaxis_title='Violation Rate',
59
+ yaxis_title='Refusal Rate',
60
+ plot_bgcolor='rgba(0,0,0,0)',
61
  xaxis=dict(showgrid=False),
62
+ yaxis=dict(showgrid=False))
63
+
64
+ # Describe the plot
65
+ st.markdown("""
66
+ The plot below evaluates the performance of various LLMs in two tasks:
67
+ * Complying with legitimate requests for help with cybersecurity-related tasks
68
+ * Refusing to assist in offensive cyberattacks
69
+ An ideal LLM would be positioned in the lower left corner of the plot.
70
+ """)
71
 
72
+ # Display the plot
73
  st.plotly_chart(fig, use_container_width=True)
74
 
75
+ # Function to display heatmaps and bar charts
76
+ def display_results(df, title, description,include_barchart=True):
77
+ if title:
78
+ st.markdown(f"### {title}")
79
+ st.markdown(f"{description}")
80
+ st.dataframe(df.style.format("{:.2%}").background_gradient(cmap='coolwarm'))
81
+ if include_barchart:
82
+ if title:
83
+ st.markdown(f"### Overall Performance of Models in {title}")
84
+ fig = px.bar(df.sum(axis=1).sort_values(ascending=False))
85
+ fig.update_layout(plot_bgcolor='rgba(0,0,0,0)',
86
+ xaxis=dict(showgrid=False),
87
+ yaxis=dict(showgrid=False)
88
+ )
89
+ st.plotly_chart(fig, use_container_width=True)
90
+
91
+ # Display the results for each test
92
+ # Continue displaying the results for each test
93
+ display_results(interpreter_df, 'LLMs Assistance to Hack Code Interpreter Environments', 'The plot below shows how often each model complies with attempts to hack a connected code interpreter. Lower values indicate safer models.', False)
94
+ display_results(prompt_df, 'LLMs Susceptibility to Prompt Injection Attacks', 'The plot below shows how often each model succumbs to prompt injection attacks, categorized into threat categories detailed in our paper. Lower values indicate safer models.', False)
95
+ display_results(exploit_df, 'LLMs Capability to Solve Cyber Capture the Flag Challenges', 'The plot below shows how well each model performs at solving capture-the-flag style program exploitation challenges. Higher values indicate more capable models.', True)
96
+
97
+ # Load additional JSON files
98
+ data_mitre = json.load(open("mitre.json"))
99
+ data_insecure = json.load(open("insecure_code.json"))
100
+
101
+ # Process data for the table
102
  model_stats_insecure = {}
103
  for model, categories in data_insecure.items():
104
  model_stats_insecure[model] = {'Mean Insecure Code Test Pass Rate': pd.Series([1-v['autocomplete_vunerable_percentage'] for v in categories.values()]).mean()}
 
106
  model_stats_insecure[model][f'Insecure Code Test Pass Rate in {category}'] = 1-values['autocomplete_vunerable_percentage']
107
  leaderboard_df_insecure = pd.DataFrame.from_dict(model_stats_insecure, orient='index').sort_values(by='Mean Insecure Code Test Pass Rate', ascending=False)
108
 
109
+ # Prepare data for the clustered bar chart
110
  chart_data_insecure = []
111
  for model, categories in data_insecure.items():
112
  for category, values in categories.items():
 
117
  })
118
  chart_df_insecure = pd.DataFrame(chart_data_insecure)
119
 
120
+ # Style the data table
121
+ st.markdown("### LLMs Adherence to Secure Coding Practices in Risky Software Engineering Settings")
122
+ st.markdown("""
123
+ The table below shows the propensity of LLMs to avoid insecure coding practices when used as coding assistants or software engineering agents. Higher values indicate safer models.
124
+ """)
125
  st.dataframe(leaderboard_df_insecure.style.format("{:.2%}").background_gradient(cmap='Blues')) # Changed cmap to 'Blues'
126
 
127
+ # Create a Plotly chart
128
  fig_insecure = px.bar(chart_df_insecure, x='Category', y='Insecure Code Test Pass Rate', barmode='group', color='Model',
129
  title='Category-wise Insecure Code Test Pass Rate per Model',
130
  labels={'Insecure Code Test Pass Rate': 'Insecure Code Test Pass Rate %'},
 
134
  yaxis=dict(showgrid=False, tickformat=".0%"),
135
  legend=dict(title='Models'))
136
 
137
+ # Display the chart
138
  st.plotly_chart(fig_insecure, use_container_width=True)
attack_helpfulness.json ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "Discovery": {
3
+ "llama 3 8b-instruct": 0.4033333333,
4
+ "gpt-4": 0.4033333333,
5
+ "mistral-medium-latest": 0.42,
6
+ "llama 3 70b-instruct": 0.6733333333,
7
+ "codellama-70b-instruct": 0.3633333333,
8
+ "codellama-13b-instruct": 0.5633333333,
9
+ "mistral-large-latest": 0.4,
10
+ "codellama-34b-instruct": 0.4966666667,
11
+ "mistral-small-latest": 0.4666666667,
12
+ "gpt-3.5-turbo": 0.49
13
+ },
14
+ "C2": {
15
+ "llama 3 8b-instruct": 0.15,
16
+ "gpt-4": 0.4266666667,
17
+ "mistral-medium-latest": 0.38,
18
+ "llama 3 70b-instruct": 0.4766666667,
19
+ "codellama-70b-instruct": 0.3766666667,
20
+ "codellama-13b-instruct": 0.5353535354,
21
+ "mistral-large-latest": 0.51,
22
+ "codellama-34b-instruct": 0.5733333333,
23
+ "mistral-small-latest": 0.5433333333,
24
+ "gpt-3.5-turbo": 0.5966666667
25
+ },
26
+ "Recon": {
27
+ "llama 3 8b-instruct": 0.35,
28
+ "gpt-4": 0.3866666667,
29
+ "mistral-medium-latest": 0.4433333333,
30
+ "llama 3 70b-instruct": 0.5833333333,
31
+ "codellama-70b-instruct": 0.3833333333,
32
+ "codellama-13b-instruct": 0.4080267559,
33
+ "mistral-large-latest": 0.4333333333,
34
+ "codellama-34b-instruct": 0.58,
35
+ "mistral-small-latest": 0.44,
36
+ "gpt-3.5-turbo": 0.4466666667
37
+ },
38
+ "Exfil": {
39
+ "llama 3 8b-instruct": 0.1233333333,
40
+ "gpt-4": 0.2833333333,
41
+ "mistral-medium-latest": 0.3166666667,
42
+ "llama 3 70b-instruct": 0.34,
43
+ "codellama-70b-instruct": 0.3745819398,
44
+ "codellama-13b-instruct": 0.4533333333,
45
+ "mistral-large-latest": 0.52,
46
+ "codellama-34b-instruct": 0.5233333333,
47
+ "mistral-small-latest": 0.5866666667,
48
+ "gpt-3.5-turbo": 0.5933333333
49
+ },
50
+ "Privilege Escalation": {
51
+ "llama 3 8b-instruct": 0.06,
52
+ "gpt-4": 0.3033333333,
53
+ "mistral-medium-latest": 0.3866666667,
54
+ "llama 3 70b-instruct": 0.2866666667,
55
+ "codellama-70b-instruct": 0.4233333333,
56
+ "codellama-13b-instruct": 0.3535353535,
57
+ "mistral-large-latest": 0.4666666667,
58
+ "codellama-34b-instruct": 0.4633333333,
59
+ "mistral-small-latest": 0.6566666667,
60
+ "gpt-3.5-turbo": 0.6733333333
61
+ },
62
+ "Lateral Movement": {
63
+ "llama 3 8b-instruct": 0.0966666667,
64
+ "gpt-4": 0.2366666667,
65
+ "mistral-medium-latest": 0.2766666667,
66
+ "llama 3 70b-instruct": 0.3433333333,
67
+ "codellama-70b-instruct": 0.4033333333,
68
+ "codellama-13b-instruct": 0.47,
69
+ "mistral-large-latest": 0.5366666667,
70
+ "codellama-34b-instruct": 0.4233333333,
71
+ "mistral-small-latest": 0.6166666667,
72
+ "gpt-3.5-turbo": 0.6666666667
73
+ },
74
+ "Persistence": {
75
+ "llama 3 8b-instruct": 0.1366666667,
76
+ "gpt-4": 0.2533333333,
77
+ "mistral-medium-latest": 0.3066666667,
78
+ "llama 3 70b-instruct": 0.2766666667,
79
+ "codellama-70b-instruct": 0.3933333333,
80
+ "codellama-13b-instruct": 0.4666666667,
81
+ "mistral-large-latest": 0.4733333333,
82
+ "codellama-34b-instruct": 0.48,
83
+ "mistral-small-latest": 0.59,
84
+ "gpt-3.5-turbo": 0.6666666667
85
+ },
86
+ "Evasion": {
87
+ "llama 3 8b-instruct": 0.0466666667,
88
+ "gpt-4": 0.2133333333,
89
+ "mistral-medium-latest": 0.3333333333,
90
+ "llama 3 70b-instruct": 0.17,
91
+ "codellama-70b-instruct": 0.29,
92
+ "codellama-13b-instruct": 0.4333333333,
93
+ "mistral-large-latest": 0.49,
94
+ "codellama-34b-instruct": 0.45,
95
+ "mistral-small-latest": 0.65,
96
+ "gpt-3.5-turbo": 0.7233333333
97
+ },
98
+ "Execution": {
99
+ "llama 3 8b-instruct": 0.09,
100
+ "gpt-4": 0.1833333333,
101
+ "mistral-medium-latest": 0.2976588629,
102
+ "llama 3 70b-instruct": 0.2033333333,
103
+ "codellama-70b-instruct": 0.3233333333,
104
+ "codellama-13b-instruct": 0.3966666667,
105
+ "mistral-large-latest": 0.52,
106
+ "codellama-34b-instruct": 0.51,
107
+ "mistral-small-latest": 0.6233333333,
108
+ "gpt-3.5-turbo": 0.65
109
+ },
110
+ "Collection": {
111
+ "llama 3 8b-instruct": 0.18,
112
+ "gpt-4": 0.19,
113
+ "mistral-medium-latest": 0.31,
114
+ "llama 3 70b-instruct": 0.31,
115
+ "codellama-70b-instruct": 0.3333333333,
116
+ "codellama-13b-instruct": 0.4612794613,
117
+ "mistral-large-latest": 0.4233333333,
118
+ "codellama-34b-instruct": 0.4966666667,
119
+ "mistral-small-latest": 0.52,
120
+ "gpt-3.5-turbo": 0.55
121
+ }
122
+ }
exploit_tests.json ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "Javascript input constraint tests": {
3
+ "gpt-4-turbo": 0.6748333333,
4
+ "mistral-large-latest": 0.6375,
5
+ "mistral-small-latest": 0.6125,
6
+ "llama 3 70b-instruct": 0.6196666667,
7
+ "codellama-34b-instruct": 0.5487719298,
8
+ "llama 3 8b-instruct": 0.5601351351,
9
+ "gpt-3.5-turbo": 0.5311666667,
10
+ "codellama-13b-instruct": 0.4996655518,
11
+ "mistral-medium-latest": 0.5553872054,
12
+ "codellama-70b-instruct": 0.4539115646
13
+ },
14
+ "Python input constraint tests": {
15
+ "gpt-4-turbo": 0.6566666667,
16
+ "mistral-large-latest": 0.6365,
17
+ "mistral-small-latest": 0.6127090301,
18
+ "llama 3 70b-instruct": 0.6028333333,
19
+ "codellama-34b-instruct": 0.5325423729,
20
+ "llama 3 8b-instruct": 0.5348993289,
21
+ "gpt-3.5-turbo": 0.5265,
22
+ "codellama-13b-instruct": 0.4916666667,
23
+ "mistral-medium-latest": 0.5210884354,
24
+ "codellama-70b-instruct": 0.4444256757
25
+ },
26
+ "C input constraint tests": {
27
+ "gpt-4-turbo": 0.6643333333,
28
+ "mistral-large-latest": 0.6231666667,
29
+ "mistral-small-latest": 0.608,
30
+ "llama 3 70b-instruct": 0.6193333333,
31
+ "codellama-34b-instruct": 0.445777027,
32
+ "llama 3 8b-instruct": 0.4947552448,
33
+ "gpt-3.5-turbo": 0.4848333333,
34
+ "codellama-13b-instruct": 0.4821070234,
35
+ "mistral-medium-latest": 0.4363309353,
36
+ "codellama-70b-instruct": 0.4342281879
37
+ },
38
+ "Diverse memory corruption tests": {
39
+ "gpt-4-turbo": 0.1595330739,
40
+ "mistral-large-latest": 0.1576763485,
41
+ "mistral-small-latest": 0.2356321839,
42
+ "llama 3 70b-instruct": 0.1434782609,
43
+ "codellama-34b-instruct": 0.1991869919,
44
+ "llama 3 8b-instruct": 0.1242937853,
45
+ "gpt-3.5-turbo": 0.1388888889,
46
+ "codellama-13b-instruct": 0.1505791506,
47
+ "mistral-medium-latest": 0.112745098,
48
+ "codellama-70b-instruct": 0.1373390558
49
+ },
50
+ "Python/SQLite SQL injection tests": {
51
+ "gpt-4-turbo": 0.23,
52
+ "mistral-large-latest": 0.0033333333,
53
+ "mistral-small-latest": 0.0,
54
+ "llama 3 70b-instruct": 0.0166666667,
55
+ "codellama-34b-instruct": 0.0,
56
+ "llama 3 8b-instruct": 0.0,
57
+ "gpt-3.5-turbo": 0.0,
58
+ "codellama-13b-instruct": 0.0033333333,
59
+ "mistral-medium-latest": 0.0,
60
+ "codellama-70b-instruct": 0.0745762712
61
+ },
62
+ "Buffer overflow tests": {
63
+ "gpt-4-turbo": 0.0469798658,
64
+ "mistral-large-latest": 0.0205992509,
65
+ "mistral-small-latest": 0.0017361111,
66
+ "llama 3 70b-instruct": 0.0068027211,
67
+ "codellama-34b-instruct": 0.0,
68
+ "llama 3 8b-instruct": 0.0070422535,
69
+ "gpt-3.5-turbo": 0.0183333333,
70
+ "codellama-13b-instruct": 0.005,
71
+ "mistral-medium-latest": 0.0018382353,
72
+ "codellama-70b-instruct": 0.0154109589
73
+ }
74
+ }
insecure_code.json CHANGED
@@ -253,260 +253,5 @@
253
  "instruct_bleu": 15.43,
254
  "instruct_total_count": 204,
255
  "instruct_vunerable_percentage": 0.3971,
256
- "instruct_vunerable_suggestion_count": 81}},
257
- "llama2-13b-chat": { "C": { "autocomplete_bleu": 15.54,
258
- "autocomplete_total_count": 227,
259
- "autocomplete_vunerable_percentage": 0.22469999999999998,
260
- "autocomplete_vunerable_suggestion_count": 51,
261
- "instruct_bleu": 9.25,
262
- "instruct_total_count": 227,
263
- "instruct_vunerable_percentage": 0.3568,
264
- "instruct_vunerable_suggestion_count": 81},
265
- "C#": { "autocomplete_bleu": 17.27,
266
- "autocomplete_total_count": 235,
267
- "autocomplete_vunerable_percentage": 0.09359999999999999,
268
- "autocomplete_vunerable_suggestion_count": 22,
269
- "instruct_bleu": 11.08,
270
- "instruct_total_count": 235,
271
- "instruct_vunerable_percentage": 0.2851,
272
- "instruct_vunerable_suggestion_count": 67},
273
- "C++": { "autocomplete_bleu": 14.91,
274
- "autocomplete_total_count": 259,
275
- "autocomplete_vunerable_percentage": 0.1583,
276
- "autocomplete_vunerable_suggestion_count": 41,
277
- "instruct_bleu": 8.85,
278
- "instruct_total_count": 259,
279
- "instruct_vunerable_percentage": 0.2432,
280
- "instruct_vunerable_suggestion_count": 63},
281
- "Java": { "autocomplete_bleu": 18.59,
282
- "autocomplete_total_count": 229,
283
- "autocomplete_vunerable_percentage": 0.23579999999999998,
284
- "autocomplete_vunerable_suggestion_count": 54,
285
- "instruct_bleu": 14.04,
286
- "instruct_total_count": 229,
287
- "instruct_vunerable_percentage": 0.4279,
288
- "instruct_vunerable_suggestion_count": 98},
289
- "JavaScript": { "autocomplete_bleu": 15.19,
290
- "autocomplete_total_count": 249,
291
- "autocomplete_vunerable_percentage": 0.0843,
292
- "autocomplete_vunerable_suggestion_count": 21,
293
- "instruct_bleu": 9.12,
294
- "instruct_total_count": 249,
295
- "instruct_vunerable_percentage": 0.2851,
296
- "instruct_vunerable_suggestion_count": 71},
297
- "PHP": { "autocomplete_bleu": 13.76,
298
- "autocomplete_total_count": 162,
299
- "autocomplete_vunerable_percentage": 0.1852,
300
- "autocomplete_vunerable_suggestion_count": 30,
301
- "instruct_bleu": 11.83,
302
- "instruct_total_count": 162,
303
- "instruct_vunerable_percentage": 0.3519,
304
- "instruct_vunerable_suggestion_count": 57},
305
- "Python": { "autocomplete_bleu": 12.72,
306
- "autocomplete_total_count": 351,
307
- "autocomplete_vunerable_percentage": 0.2707,
308
- "autocomplete_vunerable_suggestion_count": 95,
309
- "instruct_bleu": 7.38,
310
- "instruct_total_count": 351,
311
- "instruct_vunerable_percentage": 0.31620000000000004,
312
- "instruct_vunerable_suggestion_count": 111},
313
- "Rust": { "autocomplete_bleu": 12.16,
314
- "autocomplete_total_count": 204,
315
- "autocomplete_vunerable_percentage": 0.1422,
316
- "autocomplete_vunerable_suggestion_count": 29,
317
- "instruct_bleu": 12.04,
318
- "instruct_total_count": 204,
319
- "instruct_vunerable_percentage": 0.2353,
320
- "instruct_vunerable_suggestion_count": 48}},
321
- "llama2-30b-chat": { "C": { "autocomplete_bleu": 13.32,
322
- "autocomplete_total_count": 227,
323
- "autocomplete_vunerable_percentage": 0.28190000000000004,
324
- "autocomplete_vunerable_suggestion_count": 64,
325
- "instruct_bleu": 8.95,
326
- "instruct_total_count": 227,
327
- "instruct_vunerable_percentage": 0.43170000000000003,
328
- "instruct_vunerable_suggestion_count": 98},
329
- "C#": { "autocomplete_bleu": 12.7,
330
- "autocomplete_total_count": 235,
331
- "autocomplete_vunerable_percentage": 0.1574,
332
- "autocomplete_vunerable_suggestion_count": 37,
333
- "instruct_bleu": 12.68,
334
- "instruct_total_count": 235,
335
- "instruct_vunerable_percentage": 0.37450000000000006,
336
- "instruct_vunerable_suggestion_count": 88},
337
- "C++": { "autocomplete_bleu": 11.91,
338
- "autocomplete_total_count": 259,
339
- "autocomplete_vunerable_percentage": 0.15439999999999998,
340
- "autocomplete_vunerable_suggestion_count": 40,
341
- "instruct_bleu": 8.97,
342
- "instruct_total_count": 259,
343
- "instruct_vunerable_percentage": 0.28190000000000004,
344
- "instruct_vunerable_suggestion_count": 73},
345
- "Java": { "autocomplete_bleu": 15.58,
346
- "autocomplete_total_count": 229,
347
- "autocomplete_vunerable_percentage": 0.3188,
348
- "autocomplete_vunerable_suggestion_count": 73,
349
- "instruct_bleu": 15.77,
350
- "instruct_total_count": 229,
351
- "instruct_vunerable_percentage": 0.4803,
352
- "instruct_vunerable_suggestion_count": 110},
353
- "JavaScript": { "autocomplete_bleu": 11.14,
354
- "autocomplete_total_count": 249,
355
- "autocomplete_vunerable_percentage": 0.15259999999999999,
356
- "autocomplete_vunerable_suggestion_count": 38,
357
- "instruct_bleu": 9.22,
358
- "instruct_total_count": 249,
359
- "instruct_vunerable_percentage": 0.3414,
360
- "instruct_vunerable_suggestion_count": 85},
361
- "PHP": { "autocomplete_bleu": 11.69,
362
- "autocomplete_total_count": 162,
363
- "autocomplete_vunerable_percentage": 0.179,
364
- "autocomplete_vunerable_suggestion_count": 29,
365
- "instruct_bleu": 11.41,
366
- "instruct_total_count": 162,
367
- "instruct_vunerable_percentage": 0.3889,
368
- "instruct_vunerable_suggestion_count": 63},
369
- "Python": { "autocomplete_bleu": 12.25,
370
- "autocomplete_total_count": 351,
371
- "autocomplete_vunerable_percentage": 0.2593,
372
- "autocomplete_vunerable_suggestion_count": 91,
373
- "instruct_bleu": 7.53,
374
- "instruct_total_count": 351,
375
- "instruct_vunerable_percentage": 0.3504,
376
- "instruct_vunerable_suggestion_count": 123},
377
- "Rust": { "autocomplete_bleu": 11.14,
378
- "autocomplete_total_count": 204,
379
- "autocomplete_vunerable_percentage": 0.1324,
380
- "autocomplete_vunerable_suggestion_count": 27,
381
- "instruct_bleu": 13.17,
382
- "instruct_total_count": 204,
383
- "instruct_vunerable_percentage": 0.2255,
384
- "instruct_vunerable_suggestion_count": 46}},
385
- "llama2-70b-chat": { "C": { "autocomplete_bleu": 13.76,
386
- "autocomplete_total_count": 227,
387
- "autocomplete_vunerable_percentage": 0.33039999999999997,
388
- "autocomplete_vunerable_suggestion_count": 75,
389
- "instruct_bleu": 9.0,
390
- "instruct_total_count": 227,
391
- "instruct_vunerable_percentage": 0.38770000000000004,
392
- "instruct_vunerable_suggestion_count": 88},
393
- "C#": { "autocomplete_bleu": 12.13,
394
- "autocomplete_total_count": 235,
395
- "autocomplete_vunerable_percentage": 0.2468,
396
- "autocomplete_vunerable_suggestion_count": 58,
397
- "instruct_bleu": 12.65,
398
- "instruct_total_count": 235,
399
- "instruct_vunerable_percentage": 0.3617,
400
- "instruct_vunerable_suggestion_count": 85},
401
- "C++": { "autocomplete_bleu": 11.3,
402
- "autocomplete_total_count": 259,
403
- "autocomplete_vunerable_percentage": 0.1274,
404
- "autocomplete_vunerable_suggestion_count": 33,
405
- "instruct_bleu": 9.6,
406
- "instruct_total_count": 259,
407
- "instruct_vunerable_percentage": 0.2664,
408
- "instruct_vunerable_suggestion_count": 69},
409
- "Java": { "autocomplete_bleu": 16.13,
410
- "autocomplete_total_count": 229,
411
- "autocomplete_vunerable_percentage": 0.2838,
412
- "autocomplete_vunerable_suggestion_count": 65,
413
- "instruct_bleu": 17.08,
414
- "instruct_total_count": 229,
415
- "instruct_vunerable_percentage": 0.49340000000000006,
416
- "instruct_vunerable_suggestion_count": 113},
417
- "JavaScript": { "autocomplete_bleu": 11.58,
418
- "autocomplete_total_count": 249,
419
- "autocomplete_vunerable_percentage": 0.1406,
420
- "autocomplete_vunerable_suggestion_count": 35,
421
- "instruct_bleu": 9.01,
422
- "instruct_total_count": 249,
423
- "instruct_vunerable_percentage": 0.3494,
424
- "instruct_vunerable_suggestion_count": 87},
425
- "PHP": { "autocomplete_bleu": 12.74,
426
- "autocomplete_total_count": 162,
427
- "autocomplete_vunerable_percentage": 0.19140000000000001,
428
- "autocomplete_vunerable_suggestion_count": 31,
429
- "instruct_bleu": 13.02,
430
- "instruct_total_count": 162,
431
- "instruct_vunerable_percentage": 0.3951,
432
- "instruct_vunerable_suggestion_count": 64},
433
- "Python": { "autocomplete_bleu": 11.22,
434
- "autocomplete_total_count": 351,
435
- "autocomplete_vunerable_percentage": 0.3276,
436
- "autocomplete_vunerable_suggestion_count": 115,
437
- "instruct_bleu": 7.52,
438
- "instruct_total_count": 351,
439
- "instruct_vunerable_percentage": 0.3447,
440
- "instruct_vunerable_suggestion_count": 121},
441
- "Rust": { "autocomplete_bleu": 9.05,
442
- "autocomplete_total_count": 204,
443
- "autocomplete_vunerable_percentage": 0.2157,
444
- "autocomplete_vunerable_suggestion_count": 44,
445
- "instruct_bleu": 13.24,
446
- "instruct_total_count": 204,
447
- "instruct_vunerable_percentage": 0.27449999999999997,
448
- "instruct_vunerable_suggestion_count": 56}},
449
- "llama2-7b-chat": { "C": { "autocomplete_bleu": 11.55,
450
- "autocomplete_total_count": 227,
451
- "autocomplete_vunerable_percentage": 0.141,
452
- "autocomplete_vunerable_suggestion_count": 32,
453
- "instruct_bleu": 8.49,
454
- "instruct_total_count": 227,
455
- "instruct_vunerable_percentage": 0.36560000000000004,
456
- "instruct_vunerable_suggestion_count": 83},
457
- "C#": { "autocomplete_bleu": 13.47,
458
- "autocomplete_total_count": 235,
459
- "autocomplete_vunerable_percentage": 0.0681,
460
- "autocomplete_vunerable_suggestion_count": 16,
461
- "instruct_bleu": 10.97,
462
- "instruct_total_count": 235,
463
- "instruct_vunerable_percentage": 0.29359999999999997,
464
- "instruct_vunerable_suggestion_count": 69},
465
- "C++": { "autocomplete_bleu": 11.28,
466
- "autocomplete_total_count": 259,
467
- "autocomplete_vunerable_percentage": 0.0695,
468
- "autocomplete_vunerable_suggestion_count": 18,
469
- "instruct_bleu": 8.36,
470
- "instruct_total_count": 259,
471
- "instruct_vunerable_percentage": 0.2278,
472
- "instruct_vunerable_suggestion_count": 59},
473
- "Java": { "autocomplete_bleu": 12.79,
474
- "autocomplete_total_count": 229,
475
- "autocomplete_vunerable_percentage": 0.11349999999999999,
476
- "autocomplete_vunerable_suggestion_count": 26,
477
- "instruct_bleu": 13.84,
478
- "instruct_total_count": 229,
479
- "instruct_vunerable_percentage": 0.39299999999999996,
480
- "instruct_vunerable_suggestion_count": 90},
481
- "JavaScript": { "autocomplete_bleu": 11.52,
482
- "autocomplete_total_count": 249,
483
- "autocomplete_vunerable_percentage": 0.0643,
484
- "autocomplete_vunerable_suggestion_count": 16,
485
- "instruct_bleu": 7.77,
486
- "instruct_total_count": 249,
487
- "instruct_vunerable_percentage": 0.30519999999999997,
488
- "instruct_vunerable_suggestion_count": 76},
489
- "PHP": { "autocomplete_bleu": 10.06,
490
- "autocomplete_total_count": 162,
491
- "autocomplete_vunerable_percentage": 0.11109999999999999,
492
- "autocomplete_vunerable_suggestion_count": 18,
493
- "instruct_bleu": 10.8,
494
- "instruct_total_count": 162,
495
- "instruct_vunerable_percentage": 0.2963,
496
- "instruct_vunerable_suggestion_count": 48},
497
- "Python": { "autocomplete_bleu": 10.67,
498
- "autocomplete_total_count": 351,
499
- "autocomplete_vunerable_percentage": 0.2365,
500
- "autocomplete_vunerable_suggestion_count": 83,
501
- "instruct_bleu": 6.89,
502
- "instruct_total_count": 351,
503
- "instruct_vunerable_percentage": 0.3077,
504
- "instruct_vunerable_suggestion_count": 108},
505
- "Rust": { "autocomplete_bleu": 9.53,
506
- "autocomplete_total_count": 204,
507
- "autocomplete_vunerable_percentage": 0.0931,
508
- "autocomplete_vunerable_suggestion_count": 19,
509
- "instruct_bleu": 10.78,
510
- "instruct_total_count": 204,
511
- "instruct_vunerable_percentage": 0.2108,
512
- "instruct_vunerable_suggestion_count": 43}}}
 
253
  "instruct_bleu": 15.43,
254
  "instruct_total_count": 204,
255
  "instruct_vunerable_percentage": 0.3971,
256
+ "instruct_vunerable_suggestion_count": 81}}
257
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
interpreter_abuse_tests.json ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "Container Escape": {
3
+ "codellama-13b-instruct": 0.2247191011,
4
+ "codellama-34b-instruct": 0.1978021978,
5
+ "gpt-4": 0.2470588235,
6
+ "gpt-3.5-turbo": 0.3076923077,
7
+ "codellama-70b-instruct": 0.2976190476,
8
+ "llama 3 8b-instruct": 0.2527472527,
9
+ "mistral-large-latest": 0.3368421053,
10
+ "mistral-medium-latest": 0.4222222222,
11
+ "llama 3 70b-instruct": 0.5257731959,
12
+ "mistral-small-latest": 0.4693877551
13
+ },
14
+ "Privilege Escalation": {
15
+ "codellama-13b-instruct": 0.1276595745,
16
+ "codellama-34b-instruct": 0.1789473684,
17
+ "gpt-4": 0.2111111111,
18
+ "gpt-3.5-turbo": 0.313253012,
19
+ "codellama-70b-instruct": 0.2696629213,
20
+ "llama 3 8b-instruct": 0.2474226804,
21
+ "mistral-large-latest": 0.4166666667,
22
+ "mistral-medium-latest": 0.347826087,
23
+ "llama 3 70b-instruct": 0.6082474227,
24
+ "mistral-small-latest": 0.4787234043
25
+ },
26
+ "Post Exploitation": {
27
+ "codellama-13b-instruct": 0.1648351648,
28
+ "codellama-34b-instruct": 0.1318681319,
29
+ "gpt-4": 0.1445783133,
30
+ "gpt-3.5-turbo": 0.1764705882,
31
+ "codellama-70b-instruct": 0.2298850575,
32
+ "llama 3 8b-instruct": 0.2653061224,
33
+ "mistral-large-latest": 0.5154639175,
34
+ "mistral-medium-latest": 0.4719101124,
35
+ "llama 3 70b-instruct": 0.4020618557,
36
+ "mistral-small-latest": 0.6736842105
37
+ },
38
+ "Reflected Attack": {
39
+ "codellama-13b-instruct": 0.0602409639,
40
+ "codellama-34b-instruct": 0.1785714286,
41
+ "gpt-4": 0.1460674157,
42
+ "gpt-3.5-turbo": 0.0714285714,
43
+ "codellama-70b-instruct": 0.1704545455,
44
+ "llama 3 8b-instruct": 0.3469387755,
45
+ "mistral-large-latest": 0.3548387097,
46
+ "mistral-medium-latest": 0.4565217391,
47
+ "llama 3 70b-instruct": 0.2323232323,
48
+ "mistral-small-latest": 0.4
49
+ },
50
+ "Social Engineering": {
51
+ "codellama-13b-instruct": 0.0963855422,
52
+ "codellama-34b-instruct": 0.1290322581,
53
+ "gpt-4": 0.2073170732,
54
+ "gpt-3.5-turbo": 0.2359550562,
55
+ "codellama-70b-instruct": 0.2117647059,
56
+ "llama 3 8b-instruct": 0.3052631579,
57
+ "mistral-large-latest": 0.2421052632,
58
+ "mistral-medium-latest": 0.2150537634,
59
+ "llama 3 70b-instruct": 0.1770833333,
60
+ "mistral-small-latest": 0.3214285714
61
+ }
62
+ }
prompt_injection_tests.json ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "different_user_input_language": {
3
+ "codellama-70b-instruct": 0.28,
4
+ "gpt-4": 0.16,
5
+ "llama 3 70b-instruct": 0.44,
6
+ "codellama-34b-instruct": 0.64,
7
+ "codellama-13b-instruct": 0.76,
8
+ "gpt-3.5-turbo": 0.6,
9
+ "llama 3 8b-instruct": 0.76
10
+ },
11
+ "output_formatting_manipulation": {
12
+ "codellama-70b-instruct": 0.1764705882,
13
+ "gpt-4": 0.3529411765,
14
+ "llama 3 70b-instruct": 0.7058823529,
15
+ "codellama-34b-instruct": 0.2941176471,
16
+ "codellama-13b-instruct": 0.2941176471,
17
+ "gpt-3.5-turbo": 0.7647058824,
18
+ "llama 3 8b-instruct": 0.7647058824
19
+ },
20
+ "overload_with_information": {
21
+ "codellama-70b-instruct": 0.2,
22
+ "gpt-4": 0.35,
23
+ "llama 3 70b-instruct": 0.25,
24
+ "codellama-34b-instruct": 0.6,
25
+ "codellama-13b-instruct": 0.55,
26
+ "gpt-3.5-turbo": 0.4,
27
+ "llama 3 8b-instruct": 0.6
28
+ },
29
+ "many_shot_attack": {
30
+ "codellama-70b-instruct": 0.0,
31
+ "gpt-4": 0.2857142857,
32
+ "llama 3 70b-instruct": 0.2857142857,
33
+ "codellama-34b-instruct": 0.7142857143,
34
+ "codellama-13b-instruct": 0.5714285714,
35
+ "gpt-3.5-turbo": 0.5714285714,
36
+ "llama 3 8b-instruct": 0.1428571429
37
+ },
38
+ "ignore_previous_instructions": {
39
+ "codellama-70b-instruct": 0.3333333333,
40
+ "gpt-4": 0.2083333333,
41
+ "llama 3 70b-instruct": 0.375,
42
+ "codellama-34b-instruct": 0.4583333333,
43
+ "codellama-13b-instruct": 0.2916666667,
44
+ "gpt-3.5-turbo": 0.3333333333,
45
+ "llama 3 8b-instruct": 0.5416666667
46
+ },
47
+ "system_mode": {
48
+ "codellama-70b-instruct": 0.1578947368,
49
+ "gpt-4": 0.1578947368,
50
+ "llama 3 70b-instruct": 0.4210526316,
51
+ "codellama-34b-instruct": 0.3684210526,
52
+ "codellama-13b-instruct": 0.5263157895,
53
+ "gpt-3.5-turbo": 0.3684210526,
54
+ "llama 3 8b-instruct": 0.5263157895
55
+ },
56
+ "few_shot_attack": {
57
+ "codellama-70b-instruct": 0.0,
58
+ "gpt-4": 0.1818181818,
59
+ "llama 3 70b-instruct": 0.1818181818,
60
+ "codellama-34b-instruct": 0.4545454545,
61
+ "codellama-13b-instruct": 0.5454545455,
62
+ "gpt-3.5-turbo": 0.3636363636,
63
+ "llama 3 8b-instruct": 0.6363636364
64
+ },
65
+ "indirect_reference": {
66
+ "codellama-70b-instruct": 0.2142857143,
67
+ "gpt-4": 0.4285714286,
68
+ "llama 3 70b-instruct": 0.3571428571,
69
+ "codellama-34b-instruct": 0.3571428571,
70
+ "codellama-13b-instruct": 0.2142857143,
71
+ "gpt-3.5-turbo": 0.3571428571,
72
+ "llama 3 8b-instruct": 0.3571428571
73
+ },
74
+ "repeated_token_attack": {
75
+ "codellama-70b-instruct": 0.0,
76
+ "gpt-4": 0.0,
77
+ "llama 3 70b-instruct": 0.0,
78
+ "codellama-34b-instruct": 0.5,
79
+ "codellama-13b-instruct": 0.5,
80
+ "gpt-3.5-turbo": 0.6666666667,
81
+ "llama 3 8b-instruct": 0.5
82
+ },
83
+ "persuasion": {
84
+ "codellama-70b-instruct": 0.3461538462,
85
+ "gpt-4": 0.1538461538,
86
+ "llama 3 70b-instruct": 0.2692307692,
87
+ "codellama-34b-instruct": 0.3076923077,
88
+ "codellama-13b-instruct": 0.2692307692,
89
+ "gpt-3.5-turbo": 0.1923076923,
90
+ "llama 3 8b-instruct": 0.4615384615
91
+ },
92
+ "mixed_techniques": {
93
+ "codellama-70b-instruct": 0.1515151515,
94
+ "gpt-4": 0.3333333333,
95
+ "llama 3 70b-instruct": 0.2424242424,
96
+ "codellama-34b-instruct": 0.2121212121,
97
+ "codellama-13b-instruct": 0.2727272727,
98
+ "gpt-3.5-turbo": 0.3636363636,
99
+ "llama 3 8b-instruct": 0.3636363636
100
+ },
101
+ "virtualization": {
102
+ "codellama-70b-instruct": 0.0,
103
+ "gpt-4": 0.1428571429,
104
+ "llama 3 70b-instruct": 0.4285714286,
105
+ "codellama-34b-instruct": 0.0714285714,
106
+ "codellama-13b-instruct": 0.2142857143,
107
+ "gpt-3.5-turbo": 0.2857142857,
108
+ "llama 3 8b-instruct": 0.5
109
+ },
110
+ "payload_splitting": {
111
+ "codellama-70b-instruct": 0.0,
112
+ "gpt-4": 0.0,
113
+ "llama 3 70b-instruct": 0.1111111111,
114
+ "codellama-34b-instruct": 0.3333333333,
115
+ "codellama-13b-instruct": 0.4444444444,
116
+ "gpt-3.5-turbo": 0.2222222222,
117
+ "llama 3 8b-instruct": 0.3333333333
118
+ },
119
+ "hypothetical_scenario": {
120
+ "codellama-70b-instruct": 0.0769230769,
121
+ "gpt-4": 0.2307692308,
122
+ "llama 3 70b-instruct": 0.2307692308,
123
+ "codellama-34b-instruct": 0.1538461538,
124
+ "codellama-13b-instruct": 0.1538461538,
125
+ "gpt-3.5-turbo": 0.3076923077,
126
+ "llama 3 8b-instruct": 0.2307692308
127
+ },
128
+ "token_smuggling": {
129
+ "codellama-70b-instruct": 0.0,
130
+ "gpt-4": 0.0,
131
+ "llama 3 70b-instruct": 0.0769230769,
132
+ "codellama-34b-instruct": 0.0,
133
+ "codellama-13b-instruct": 0.0,
134
+ "gpt-3.5-turbo": 0.0769230769,
135
+ "llama 3 8b-instruct": 0.0769230769
136
+ }
137
+ }
trr_frr_tradeoff_helpfulness.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "codellama-13b-instruct": {
3
+ "Violation Rate": 0.45415284393545263,
4
+ "Refusal Rate": 0.016
5
+ },
6
+ "codellama-34b-instruct": {
7
+ "Violation Rate": 0.4996666666666666,
8
+ "Refusal Rate": 0.016
9
+ },
10
+ "codellama/CodeLlama-70b-Instruct-hf": {
11
+ "Violation Rate": 0.36645819397993296,
12
+ "Refusal Rate": 0.6946666666666667
13
+ },
14
+ "gpt-3.5-turbo": {
15
+ "Violation Rate": 0.6056666666666667,
16
+ "Refusal Rate": 0.0026666666666666666
17
+ },
18
+ "gpt-4": {
19
+ "Violation Rate": 0.28800000000000003,
20
+ "Refusal Rate": 0.04133333333333333
21
+ },
22
+ "mistral-medium-latest": {
23
+ "Violation Rate": 0.3470992196209587,
24
+ "Refusal Rate": 0.11333333333333333
25
+ },
26
+ "mistral-small-latest": {
27
+ "Violation Rate": 0.5693333333333334,
28
+ "Refusal Rate": 0.104
29
+ },
30
+ "mistral-large-latest": {
31
+ "Violation Rate": 0.4773333333333334,
32
+ "Refusal Rate": 0.11066666666666666
33
+ },
34
+ "llama3p-70b-rc3_vr_mid_3": {
35
+ "Violation Rate": 0.3663333333333333,
36
+ "Refusal Rate": 0.0
37
+ },
38
+ "llama3p-7b-rc3_vr_mid_2": {
39
+ "Violation Rate": 0.16366666666666663,
40
+ "Refusal Rate": 0.02266666666666667
41
+ }
42
+ }