forrestbao committed on
Commit
696341e
·
1 Parent(s): 053ade8

display plot; use funix 0.6.2

Browse files
Files changed (4) hide show
  1. Dockerfile +0 -1
  2. app/app.py +12 -14
  3. app/app_utils.py +35 -29
  4. app/requirements.txt +1 -1
Dockerfile CHANGED
@@ -6,7 +6,6 @@ COPY ./app/vectara_theme.py /app/vectara_theme.py
6
  COPY ./app/requirements.txt /app/requirements.txt
7
  COPY ./app/app.py /app/app.py
8
  COPY ./app/app_utils.py /app/app_utils.py
9
- # COPY ./app/results.json /app/results.json
10
 
11
  RUN apt-get update && apt-get install -y git-lfs
12
 
 
6
  COPY ./app/requirements.txt /app/requirements.txt
7
  COPY ./app/app.py /app/app.py
8
  COPY ./app/app_utils.py /app/app_utils.py
 
9
 
10
  RUN apt-get update && apt-get install -y git-lfs
11
 
app/app.py CHANGED
@@ -18,7 +18,7 @@ results_df = load_results()
18
  direction="column",
19
  autorun="always",
20
  theme="vectara",
21
- figure_to_image= True,
22
  # output_layout=[
23
  # [{"return_index": 0, "width": 0.3}],
24
  # [{"return_index": 1, "width": 0.7}],
@@ -26,13 +26,14 @@ results_df = load_results()
26
  )
27
  def leaderboard(
28
  filter_models_by_name: str = ""
29
- # ) -> Tuple[Markdown, matplotlib.figure.Figure, pd.DataFrame]:
30
- ) -> Tuple[Markdown, pd.DataFrame]:
 
31
  """# Hughes Hallucination Evaluation Model (HHEM) Leaderboard
32
 
33
  Using [Vectara](https://vectara.com/)'s proprietary [HHEM](https://www.vectara.com/blog/hhem-2-1-a-better-hallucination-detection-model), this leaderboard evaluates how often an LLM hallucinates -- containing information not stated in the source document -- when summarizing a document. For an LLM, its hallucination rate is defined as the ratio of summaries that hallucinate to the total number of summaries it generates. HHEM's open source version is available [here](https://huggingface.co/vectara/hallucination_evaluation_model). For more details or to contribute, see [this Github repo](https://github.com/vectara/hallucination-leaderboard).
34
 
35
- ## Usage
36
 
37
  * All LLMs are displayed by default. To filter, enter the names of the models that you want to see in the "Filter Models by Name" field below, separated by commas or semicolons.
38
  * Results are paginated. To page thru, use the `<` or `>` buttons at the bottom right corner of the table.
@@ -44,21 +45,18 @@ def leaderboard(
44
  """
45
  df = results_df
46
 
47
- filter_models_by_name = filter_models_by_name.replace(",", ";")
48
- filter_models_by_name = filter_models_by_name.replace(" ", "")
49
- if len(filter_models_by_name) > 0:
50
  filter_models_by_name = filter_models_by_name.split(";")
 
51
  filter_models_by_name = [name for name in filter_models_by_name if name != ""]
52
  df = df.copy()
53
- df = df[df["LLM"].str.contains("|".join(filter_models_by_name), na=False)]
54
 
55
  if len(df) == 0: # return an empty DF and an empty figure
56
- # return pd.DataFrame(), matplotlib.figure.Figure(), Markdown("No models found")
57
- return Markdown("No models found"), pd.DataFrame()
58
 
59
- return Markdown(""), df
60
 
61
  fig = visualize_leaderboard(df)
62
-
63
- # return df, fig
64
- # return Markdown(""), fig, df
 
18
  direction="column",
19
  autorun="always",
20
  theme="vectara",
21
+ matplotlib_format="svg",
22
  # output_layout=[
23
  # [{"return_index": 0, "width": 0.3}],
24
  # [{"return_index": 1, "width": 0.7}],
 
26
  )
27
  def leaderboard(
28
  filter_models_by_name: str = ""
29
+ # filter_models_by_name: List[Literal["all", "anthropic", "google", "meta", "openai", "xai", "qwen"]] = ["all"]
30
+ ) -> Tuple[Markdown, matplotlib.figure.Figure, pd.DataFrame]:
31
+ # ) -> Tuple[Markdown, pd.DataFrame]:
32
  """# Hughes Hallucination Evaluation Model (HHEM) Leaderboard
33
 
34
  Using [Vectara](https://vectara.com/)'s proprietary [HHEM](https://www.vectara.com/blog/hhem-2-1-a-better-hallucination-detection-model), this leaderboard evaluates how often an LLM hallucinates -- containing information not stated in the source document -- when summarizing a document. For an LLM, its hallucination rate is defined as the ratio of summaries that hallucinate to the total number of summaries it generates. HHEM's open source version is available [here](https://huggingface.co/vectara/hallucination_evaluation_model). For more details or to contribute, see [this Github repo](https://github.com/vectara/hallucination-leaderboard).
35
 
36
+ **Usage:**
37
 
38
  * All LLMs are displayed by default. To filter, enter the names of the models that you want to see in the "Filter Models by Name" field below, separated by commas or semicolons.
39
  * Results are paginated. To page thru, use the `<` or `>` buttons at the bottom right corner of the table.
 
45
  """
46
  df = results_df
47
 
48
+ filter_models_by_name = filter_models_by_name.replace(",", ";").replace(" ", "")
49
+ if len(filter_models_by_name) > 0 and "all" not in filter_models_by_name:
 
50
  filter_models_by_name = filter_models_by_name.split(";")
51
+ # filter_models_by_name = [name for name in filter_models_by_name if name != "all"]
52
  filter_models_by_name = [name for name in filter_models_by_name if name != ""]
53
  df = df.copy()
54
+ df = df[df["LLM_lower_case"].str.contains("|".join(filter_models_by_name), na=False)]
55
 
56
  if len(df) == 0: # return an empty DF and an empty figure
57
+ return Markdown(f"No models found matching: {filter_models_by_name}"), matplotlib.figure.Figure(), pd.DataFrame()
 
58
 
59
+ # return Markdown(""), df
60
 
61
  fig = visualize_leaderboard(df)
62
+ return Markdown(""), fig, df[["LLM", "Hallucination %", "Answer %", "Avg Summary Words"]]
 
 
app/app_utils.py CHANGED
@@ -5,6 +5,7 @@ from huggingface_hub import Repository
5
  import pandas as pd
6
  import matplotlib.pyplot as plt
7
  import matplotlib.figure
 
8
  from sklearn.preprocessing import MinMaxScaler
9
 
10
  # import dotenv
@@ -103,7 +104,7 @@ def load_results(
103
  print(f"Using pre-dumped results from {results_json}")
104
 
105
  results = json.load(open(results_json, "r"))
106
- print(results)
107
 
108
  results_df = pd.DataFrame(results)
109
  results_df = results_df.sort_values(by="Hallucination %", ascending=True)
@@ -113,6 +114,8 @@ def load_results(
113
 
114
  for column in ["Hallucination %", "Answer %", "Avg Summary Words"]:
115
  results_df[column] = results_df[column].apply(lambda x: round(x, 3))
 
 
116
 
117
  return results_df
118
 
@@ -130,54 +133,57 @@ def determine_font_size(LLM: str, hallucination_percent: float) -> int:
130
  return 9
131
 
132
  def determine_font_color(hallucination_percent: float) -> str:
133
- if hallucination_percent < 0.3:
134
- return 'white'
135
- elif hallucination_percent < 0.65:
136
  return 'black'
137
  else:
138
  return 'white'
139
 
140
- def determine_llm_x_position(LLM: str, hallucination_percent: float) -> float:
141
- # determine the x position of the LLM name
142
- # For an LLM, it's bar length is 10* its hallucination %
143
- # if the LLM name cannot fit in the bar, move it to the left
144
- # if the LLM name can fit in the bar, let its x position be 0.01
145
-
146
  name_length = len(LLM)
147
  print ("LLM: ", LLM, "hallu_rate: ", hallucination_percent, "name_length: ", name_length)
148
 
149
- hallu_rate_to_bar_length_ratio = 10
150
  bar_length = hallu_rate_to_bar_length_ratio * hallucination_percent
151
- if name_length > bar_length:
152
- return 0.01
153
- else:
154
- return hallucination_percent
155
-
156
  def visualize_leaderboard(df: pd.DataFrame) -> matplotlib.figure.Figure:
157
- fig = plt.figure(figsize=(5, 4))
158
  # plot using LLM as x-axis and Hallucination % as y-axis
159
  # make bars horizontal
160
  plot_df = df.head(10)
161
  plot_df["normalized_hallucination_rate"] = min_max_scaler.fit_transform(plot_df[["Hallucination %"]])
162
 
163
  plt.barh(plot_df["LLM"], plot_df["Hallucination %"], color=plt.cm.jet(plot_df["normalized_hallucination_rate"]))
 
 
 
 
 
164
 
165
- # for i, row in plot_df.iterrows():
166
- # plt.text(
167
- # determine_llm_x_position(row["LLM"], row["Hallucination %"]),
168
- # row["LLM"],
169
- # f"{row['LLM']}",
170
- # ha='left',
171
- # va='center',
172
- # fontsize=9,
173
- # color=determine_font_color(row["normalized_hallucination_rate"])
174
- # )
 
 
175
  # plt.yticks([])
176
  plt.tight_layout()
177
 
 
 
 
178
  plt.xticks(fontsize=9)
179
- # plt.xlabel("Hallucination %", fontsize=9)
180
- plt.title("Grounded Hallucination Rate of Best LLMs", fontsize=9)
181
  plt.gca().spines['top'].set_visible(False)
182
  plt.gca().spines['right'].set_visible(False)
183
  plt.gca().spines['left'].set_visible(False)
 
5
  import pandas as pd
6
  import matplotlib.pyplot as plt
7
  import matplotlib.figure
8
+ from datetime import datetime
9
  from sklearn.preprocessing import MinMaxScaler
10
 
11
  # import dotenv
 
104
  print(f"Using pre-dumped results from {results_json}")
105
 
106
  results = json.load(open(results_json, "r"))
107
+ # print(results)
108
 
109
  results_df = pd.DataFrame(results)
110
  results_df = results_df.sort_values(by="Hallucination %", ascending=True)
 
114
 
115
  for column in ["Hallucination %", "Answer %", "Avg Summary Words"]:
116
  results_df[column] = results_df[column].apply(lambda x: round(x, 3))
117
+
118
+ results_df["LLM_lower_case"] = results_df["LLM"].str.lower()
119
 
120
  return results_df
121
 
 
133
  return 9
134
 
135
  def determine_font_color(hallucination_percent: float) -> str:
136
+ if 0.25 < hallucination_percent < 0.65:
 
 
137
  return 'black'
138
  else:
139
  return 'white'
140
 
141
+ def determine_llm_x_position_and_font_color(LLM: str, hallucination_percent: float) -> float:
 
 
 
 
 
142
  name_length = len(LLM)
143
  print ("LLM: ", LLM, "hallu_rate: ", hallucination_percent, "name_length: ", name_length)
144
 
145
+ hallu_rate_to_bar_length_ratio = 5
146
  bar_length = hallu_rate_to_bar_length_ratio * hallucination_percent
147
+ if name_length < bar_length:
148
+ return 0.01, determine_font_color(hallucination_percent)
149
+ else: # to the right of the bar, black anyway
150
+ return hallucination_percent, 'black'
151
+
152
  def visualize_leaderboard(df: pd.DataFrame) -> matplotlib.figure.Figure:
153
+ fig = plt.figure(figsize=(8, 4))
154
  # plot using LLM as x-axis and Hallucination % as y-axis
155
  # make bars horizontal
156
  plot_df = df.head(10)
157
  plot_df["normalized_hallucination_rate"] = min_max_scaler.fit_transform(plot_df[["Hallucination %"]])
158
 
159
  plt.barh(plot_df["LLM"], plot_df["Hallucination %"], color=plt.cm.jet(plot_df["normalized_hallucination_rate"]))
160
+
161
+ # plot_df["LLM_x_position"], plot_df["font_color"] = zip(*plot_df.apply(
162
+ # lambda row: determine_llm_x_position_and_font_color(row["LLM"], row["Hallucination %"]),
163
+ # axis=1
164
+ # ))
165
 
166
+ for i, row in plot_df.iterrows():
167
+ plt.text(
168
+ # row["LLM_x_position"],
169
+ row["Hallucination %"] + 0.025,
170
+ row["LLM"],
171
+ row["Hallucination %"],
172
+ # f"{row['LLM']}",
173
+ ha='left',
174
+ va='center',
175
+ fontsize=9,
176
+ # color=row["font_color"]
177
+ )
178
  # plt.yticks([])
179
  plt.tight_layout()
180
 
181
+ # add margin to the right of the plot
182
+ plt.subplots_adjust(right=0.95)
183
+
184
  plt.xticks(fontsize=9)
185
+ plt.xlabel(f"Copyright (2025) Vectara, Inc. Plot generated on: {datetime.now().strftime('%B %d, %Y')}", fontsize=9)
186
+ plt.title("Grounded Hallucination Rate of Best LLMs", fontsize=12)
187
  plt.gca().spines['top'].set_visible(False)
188
  plt.gca().spines['right'].set_visible(False)
189
  plt.gca().spines['left'].set_visible(False)
app/requirements.txt CHANGED
@@ -1,4 +1,4 @@
1
- funix==0.6.1
2
  pandas
3
  huggingface_hub
4
  matplotlib
 
1
+ funix==0.6.2
2
  pandas
3
  huggingface_hub
4
  matplotlib