Spaces:
Runtime error
Runtime error
Commit
·
696341e
1
Parent(s):
053ade8
display plot; use funix 0.6.2
Browse files- Dockerfile +0 -1
- app/app.py +12 -14
- app/app_utils.py +35 -29
- app/requirements.txt +1 -1
Dockerfile
CHANGED
@@ -6,7 +6,6 @@ COPY ./app/vectara_theme.py /app/vectara_theme.py
|
|
6 |
COPY ./app/requirements.txt /app/requirements.txt
|
7 |
COPY ./app/app.py /app/app.py
|
8 |
COPY ./app/app_utils.py /app/app_utils.py
|
9 |
-
# COPY ./app/results.json /app/results.json
|
10 |
|
11 |
RUN apt-get update && apt-get install -y git-lfs
|
12 |
|
|
|
6 |
COPY ./app/requirements.txt /app/requirements.txt
|
7 |
COPY ./app/app.py /app/app.py
|
8 |
COPY ./app/app_utils.py /app/app_utils.py
|
|
|
9 |
|
10 |
RUN apt-get update && apt-get install -y git-lfs
|
11 |
|
app/app.py
CHANGED
@@ -18,7 +18,7 @@ results_df = load_results()
|
|
18 |
direction="column",
|
19 |
autorun="always",
|
20 |
theme="vectara",
|
21 |
-
|
22 |
# output_layout=[
|
23 |
# [{"return_index": 0, "width": 0.3}],
|
24 |
# [{"return_index": 1, "width": 0.7}],
|
@@ -26,13 +26,14 @@ results_df = load_results()
|
|
26 |
)
|
27 |
def leaderboard(
|
28 |
filter_models_by_name: str = ""
|
29 |
-
#
|
30 |
-
) -> Tuple[Markdown, pd.DataFrame]:
|
|
|
31 |
"""# Hughes Hallucination Evaluation Model (HHEM) Leaderboard
|
32 |
|
33 |
Using [Vectara](https://vectara.com/)'s proprietary [HHEM](https://www.vectara.com/blog/hhem-2-1-a-better-hallucination-detection-model), this leaderboard evaluates how often an LLM hallucinates -- containing information not stated in the source document -- when summarizing a document. For an LLM, its hallucination rate is defined as the ratio of summaries that hallucinate to the total number of summaries it generates. HHEM's open source version is available [here](https://huggingface.co/vectara/hallucination_evaluation_model). For more details or to contribute, see [this Github repo](https://github.com/vectara/hallucination-leaderboard).
|
34 |
|
35 |
-
|
36 |
|
37 |
* All LLMs are displayed by default. To filter, enter the names of the models that you want to see in the "Filter Models by Name" field below, separated by commas or semicolons.
|
38 |
* Results are paginated. To page thru, use the `<` or `>` buttons at the bottom right corner of the table.
|
@@ -44,21 +45,18 @@ def leaderboard(
|
|
44 |
"""
|
45 |
df = results_df
|
46 |
|
47 |
-
filter_models_by_name = filter_models_by_name.replace(",", ";")
|
48 |
-
filter_models_by_name
|
49 |
-
if len(filter_models_by_name) > 0:
|
50 |
filter_models_by_name = filter_models_by_name.split(";")
|
|
|
51 |
filter_models_by_name = [name for name in filter_models_by_name if name != ""]
|
52 |
df = df.copy()
|
53 |
-
df = df[df["
|
54 |
|
55 |
if len(df) == 0: # return an empty DF and an empty figure
|
56 |
-
|
57 |
-
return Markdown("No models found"), pd.DataFrame()
|
58 |
|
59 |
-
return Markdown(""), df
|
60 |
|
61 |
fig = visualize_leaderboard(df)
|
62 |
-
|
63 |
-
# return df, fig
|
64 |
-
# return Markdown(""), fig, df
|
|
|
18 |
direction="column",
|
19 |
autorun="always",
|
20 |
theme="vectara",
|
21 |
+
matplotlib_format="svg",
|
22 |
# output_layout=[
|
23 |
# [{"return_index": 0, "width": 0.3}],
|
24 |
# [{"return_index": 1, "width": 0.7}],
|
|
|
26 |
)
|
27 |
def leaderboard(
|
28 |
filter_models_by_name: str = ""
|
29 |
+
# filter_models_by_name: List[Literal["all", "anthropic", "google", "meta", "openai", "xai", "qwen"]] = ["all"]
|
30 |
+
) -> Tuple[Markdown, matplotlib.figure.Figure, pd.DataFrame]:
|
31 |
+
# ) -> Tuple[Markdown, pd.DataFrame]:
|
32 |
"""# Hughes Hallucination Evaluation Model (HHEM) Leaderboard
|
33 |
|
34 |
Using [Vectara](https://vectara.com/)'s proprietary [HHEM](https://www.vectara.com/blog/hhem-2-1-a-better-hallucination-detection-model), this leaderboard evaluates how often an LLM hallucinates -- containing information not stated in the source document -- when summarizing a document. For an LLM, its hallucination rate is defined as the ratio of summaries that hallucinate to the total number of summaries it generates. HHEM's open source version is available [here](https://huggingface.co/vectara/hallucination_evaluation_model). For more details or to contribute, see [this Github repo](https://github.com/vectara/hallucination-leaderboard).
|
35 |
|
36 |
+
**Usage:**
|
37 |
|
38 |
* All LLMs are displayed by default. To filter, enter the names of the models that you want to see in the "Filter Models by Name" field below, separated by commas or semicolons.
|
39 |
* Results are paginated. To page thru, use the `<` or `>` buttons at the bottom right corner of the table.
|
|
|
45 |
"""
|
46 |
df = results_df
|
47 |
|
48 |
+
filter_models_by_name = filter_models_by_name.replace(",", ";").replace(" ", "")
|
49 |
+
if len(filter_models_by_name) > 0 and "all" not in filter_models_by_name:
|
|
|
50 |
filter_models_by_name = filter_models_by_name.split(";")
|
51 |
+
# filter_models_by_name = [name for name in filter_models_by_name if name != "all"]
|
52 |
filter_models_by_name = [name for name in filter_models_by_name if name != ""]
|
53 |
df = df.copy()
|
54 |
+
df = df[df["LLM_lower_case"].str.contains("|".join(filter_models_by_name), na=False)]
|
55 |
|
56 |
if len(df) == 0: # return an empty DF and an empty figure
|
57 |
+
return Markdown(f"No models found matching: {filter_models_by_name}"), matplotlib.figure.Figure(), pd.DataFrame()
|
|
|
58 |
|
59 |
+
# return Markdown(""), df
|
60 |
|
61 |
fig = visualize_leaderboard(df)
|
62 |
+
return Markdown(""), fig, df[["LLM", "Hallucination %", "Answer %", "Avg Summary Words"]]
|
|
|
|
app/app_utils.py
CHANGED
@@ -5,6 +5,7 @@ from huggingface_hub import Repository
|
|
5 |
import pandas as pd
|
6 |
import matplotlib.pyplot as plt
|
7 |
import matplotlib.figure
|
|
|
8 |
from sklearn.preprocessing import MinMaxScaler
|
9 |
|
10 |
# import dotenv
|
@@ -103,7 +104,7 @@ def load_results(
|
|
103 |
print(f"Using pre-dumped results from {results_json}")
|
104 |
|
105 |
results = json.load(open(results_json, "r"))
|
106 |
-
print(results)
|
107 |
|
108 |
results_df = pd.DataFrame(results)
|
109 |
results_df = results_df.sort_values(by="Hallucination %", ascending=True)
|
@@ -113,6 +114,8 @@ def load_results(
|
|
113 |
|
114 |
for column in ["Hallucination %", "Answer %", "Avg Summary Words"]:
|
115 |
results_df[column] = results_df[column].apply(lambda x: round(x, 3))
|
|
|
|
|
116 |
|
117 |
return results_df
|
118 |
|
@@ -130,54 +133,57 @@ def determine_font_size(LLM: str, hallucination_percent: float) -> int:
|
|
130 |
return 9
|
131 |
|
132 |
def determine_font_color(hallucination_percent: float) -> str:
|
133 |
-
if hallucination_percent < 0.
|
134 |
-
return 'white'
|
135 |
-
elif hallucination_percent < 0.65:
|
136 |
return 'black'
|
137 |
else:
|
138 |
return 'white'
|
139 |
|
140 |
-
def
|
141 |
-
# determine the x position of the LLM name
|
142 |
-
# For an LLM, its bar length is 10 * its hallucination %
|
143 |
-
# if the LLM name cannot fit in the bar, move it to the left
|
144 |
-
# if the LLM name can fit in the bar, let its x position be 0.01
|
145 |
-
|
146 |
name_length = len(LLM)
|
147 |
print ("LLM: ", LLM, "hallu_rate: ", hallucination_percent, "name_length: ", name_length)
|
148 |
|
149 |
-
hallu_rate_to_bar_length_ratio =
|
150 |
bar_length = hallu_rate_to_bar_length_ratio * hallucination_percent
|
151 |
-
if name_length
|
152 |
-
return 0.01
|
153 |
-
else:
|
154 |
-
return hallucination_percent
|
155 |
-
|
156 |
def visualize_leaderboard(df: pd.DataFrame) -> matplotlib.figure.Figure:
|
157 |
-
fig = plt.figure(figsize=(
|
158 |
# plot using LLM as x-axis and Hallucination % as y-axis
|
159 |
# make bars horizontal
|
160 |
plot_df = df.head(10)
|
161 |
plot_df["normalized_hallucination_rate"] = min_max_scaler.fit_transform(plot_df[["Hallucination %"]])
|
162 |
|
163 |
plt.barh(plot_df["LLM"], plot_df["Hallucination %"], color=plt.cm.jet(plot_df["normalized_hallucination_rate"]))
|
|
|
|
|
|
|
|
|
|
|
164 |
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
|
|
|
|
175 |
# plt.yticks([])
|
176 |
plt.tight_layout()
|
177 |
|
|
|
|
|
|
|
178 |
plt.xticks(fontsize=9)
|
179 |
-
|
180 |
-
plt.title("Grounded Hallucination Rate of Best LLMs", fontsize=
|
181 |
plt.gca().spines['top'].set_visible(False)
|
182 |
plt.gca().spines['right'].set_visible(False)
|
183 |
plt.gca().spines['left'].set_visible(False)
|
|
|
5 |
import pandas as pd
|
6 |
import matplotlib.pyplot as plt
|
7 |
import matplotlib.figure
|
8 |
+
from datetime import datetime
|
9 |
from sklearn.preprocessing import MinMaxScaler
|
10 |
|
11 |
# import dotenv
|
|
|
104 |
print(f"Using pre-dumped results from {results_json}")
|
105 |
|
106 |
results = json.load(open(results_json, "r"))
|
107 |
+
# print(results)
|
108 |
|
109 |
results_df = pd.DataFrame(results)
|
110 |
results_df = results_df.sort_values(by="Hallucination %", ascending=True)
|
|
|
114 |
|
115 |
for column in ["Hallucination %", "Answer %", "Avg Summary Words"]:
|
116 |
results_df[column] = results_df[column].apply(lambda x: round(x, 3))
|
117 |
+
|
118 |
+
results_df["LLM_lower_case"] = results_df["LLM"].str.lower()
|
119 |
|
120 |
return results_df
|
121 |
|
|
|
133 |
return 9
|
134 |
|
135 |
def determine_font_color(hallucination_percent: float) -> str:
|
136 |
+
if 0.25 < hallucination_percent < 0.65:
|
|
|
|
|
137 |
return 'black'
|
138 |
else:
|
139 |
return 'white'
|
140 |
|
141 |
+
def determine_llm_x_position_and_font_color(LLM: str, hallucination_percent: float) -> float:
|
|
|
|
|
|
|
|
|
|
|
142 |
name_length = len(LLM)
|
143 |
print ("LLM: ", LLM, "hallu_rate: ", hallucination_percent, "name_length: ", name_length)
|
144 |
|
145 |
+
hallu_rate_to_bar_length_ratio = 5
|
146 |
bar_length = hallu_rate_to_bar_length_ratio * hallucination_percent
|
147 |
+
if name_length < bar_length:
|
148 |
+
return 0.01, determine_font_color(hallucination_percent)
|
149 |
+
else: # to the right of the bar, black anyway
|
150 |
+
return hallucination_percent, 'black'
|
151 |
+
|
152 |
def visualize_leaderboard(df: pd.DataFrame) -> matplotlib.figure.Figure:
|
153 |
+
fig = plt.figure(figsize=(8, 4))
|
154 |
# plot using LLM as x-axis and Hallucination % as y-axis
|
155 |
# make bars horizontal
|
156 |
plot_df = df.head(10)
|
157 |
plot_df["normalized_hallucination_rate"] = min_max_scaler.fit_transform(plot_df[["Hallucination %"]])
|
158 |
|
159 |
plt.barh(plot_df["LLM"], plot_df["Hallucination %"], color=plt.cm.jet(plot_df["normalized_hallucination_rate"]))
|
160 |
+
|
161 |
+
# plot_df["LLM_x_position"], plot_df["font_color"] = zip(*plot_df.apply(
|
162 |
+
# lambda row: determine_llm_x_position_and_font_color(row["LLM"], row["Hallucination %"]),
|
163 |
+
# axis=1
|
164 |
+
# ))
|
165 |
|
166 |
+
for i, row in plot_df.iterrows():
|
167 |
+
plt.text(
|
168 |
+
# row["LLM_x_position"],
|
169 |
+
row["Hallucination %"] + 0.025,
|
170 |
+
row["LLM"],
|
171 |
+
row["Hallucination %"],
|
172 |
+
# f"{row['LLM']}",
|
173 |
+
ha='left',
|
174 |
+
va='center',
|
175 |
+
fontsize=9,
|
176 |
+
# color=row["font_color"]
|
177 |
+
)
|
178 |
# plt.yticks([])
|
179 |
plt.tight_layout()
|
180 |
|
181 |
+
# add margin to the right of the plot
|
182 |
+
plt.subplots_adjust(right=0.95)
|
183 |
+
|
184 |
plt.xticks(fontsize=9)
|
185 |
+
plt.xlabel(f"Copyright (2025) Vectara, Inc. Plot generated on: {datetime.now().strftime('%B %d, %Y')}", fontsize=9)
|
186 |
+
plt.title("Grounded Hallucination Rate of Best LLMs", fontsize=12)
|
187 |
plt.gca().spines['top'].set_visible(False)
|
188 |
plt.gca().spines['right'].set_visible(False)
|
189 |
plt.gca().spines['left'].set_visible(False)
|
app/requirements.txt
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
funix==0.6.
|
2 |
pandas
|
3 |
huggingface_hub
|
4 |
matplotlib
|
|
|
1 |
+
funix==0.6.2
|
2 |
pandas
|
3 |
huggingface_hub
|
4 |
matplotlib
|