Spaces:

vectara
/

leaderboard

Runtime error

App Files Files Community

forrestbao committed 10 days ago

Commit

696341e

1 Parent(s): 053ade8

display plot; use funix 0.6.2

Browse files

Files changed (4) hide show

Dockerfile +0 -1
app/app.py +12 -14
app/app_utils.py +35 -29
app/requirements.txt +1 -1

Dockerfile CHANGED Viewed

@@ -6,7 +6,6 @@ COPY ./app/vectara_theme.py /app/vectara_theme.py
 COPY ./app/requirements.txt /app/requirements.txt
 COPY ./app/app.py /app/app.py
 COPY ./app/app_utils.py /app/app_utils.py
-# COPY ./app/results.json /app/results.json
 RUN apt-get update && apt-get install -y git-lfs

 COPY ./app/requirements.txt /app/requirements.txt
 COPY ./app/app.py /app/app.py
 COPY ./app/app_utils.py /app/app_utils.py
 RUN apt-get update && apt-get install -y git-lfs

app/app.py CHANGED Viewed

@@ -18,7 +18,7 @@ results_df = load_results()
     direction="column",
     autorun="always",
     theme="vectara",
-    figure_to_image= True,
     # output_layout=[
     #     [{"return_index": 0, "width": 0.3}],
     #     [{"return_index": 1, "width": 0.7}],
@@ -26,13 +26,14 @@ results_df = load_results()
 )
 def leaderboard(
     filter_models_by_name: str = ""
-# ) -> Tuple[Markdown, matplotlib.figure.Figure, pd.DataFrame]:
-) -> Tuple[Markdown, pd.DataFrame]:
     """# Hughes Hallucination Evaluation Model (HHEM) Leaderboard
     Using [Vectara](https://vectara.com/)'s proprietary [HHEM](https://www.vectara.com/blog/hhem-2-1-a-better-hallucination-detection-model), this leaderboard evaluates how often an LLM hallucinates -- containing information not stated in the source document -- when summarizing a document. For an LLM, its hallucination rate is defined as the ratio of summaries that hallucinate to the total number of summaries it generates. HHEM's open source version is available [here](https://huggingface.co/vectara/hallucination_evaluation_model).    For more details or to contribute, see [this Github repo](https://github.com/vectara/hallucination-leaderboard).
-    ## Usage
     * All LLMs are displayed by default. To filter, enter the names of the models that you want to see in the "Filter Models by Name" field below, separated by commas or semicolons.
     * Results are paginated. To page thru, use the `<` or `>` buttons at the bottom right corner of the table.
@@ -44,21 +45,18 @@ def leaderboard(
     """
     df = results_df
-    filter_models_by_name = filter_models_by_name.replace(",", ";")
-    filter_models_by_name = filter_models_by_name.replace(" ", "")
-    if len(filter_models_by_name) > 0:
         filter_models_by_name = filter_models_by_name.split(";")
         filter_models_by_name = [name for name in filter_models_by_name if name != ""]
         df = df.copy()
-        df = df[df["LLM"].str.contains("|".join(filter_models_by_name), na=False)]
         if len(df) == 0: # return an empty DF and an empty figure
-            # return pd.DataFrame(), matplotlib.figure.Figure(), Markdown("No models found")
-            return Markdown("No models found"), pd.DataFrame()
-    return Markdown(""), df
     fig = visualize_leaderboard(df)
-    # return df, fig
-    # return Markdown(""), fig, df

     direction="column",
     autorun="always",
     theme="vectara",
+    matplotlib_format="svg",
     # output_layout=[
     #     [{"return_index": 0, "width": 0.3}],
     #     [{"return_index": 1, "width": 0.7}],
 )
 def leaderboard(
     filter_models_by_name: str = ""
+    # filter_models_by_name: List[Literal["all", "anthropic", "google", "meta", "openai", "xai", "qwen"]] = ["all"]
+) -> Tuple[Markdown, matplotlib.figure.Figure, pd.DataFrame]:
+# ) -> Tuple[Markdown, pd.DataFrame]:
     """# Hughes Hallucination Evaluation Model (HHEM) Leaderboard
     Using [Vectara](https://vectara.com/)'s proprietary [HHEM](https://www.vectara.com/blog/hhem-2-1-a-better-hallucination-detection-model), this leaderboard evaluates how often an LLM hallucinates -- containing information not stated in the source document -- when summarizing a document. For an LLM, its hallucination rate is defined as the ratio of summaries that hallucinate to the total number of summaries it generates. HHEM's open source version is available [here](https://huggingface.co/vectara/hallucination_evaluation_model).    For more details or to contribute, see [this Github repo](https://github.com/vectara/hallucination-leaderboard).
+    **Usage:**
     * All LLMs are displayed by default. To filter, enter the names of the models that you want to see in the "Filter Models by Name" field below, separated by commas or semicolons.
     * Results are paginated. To page thru, use the `<` or `>` buttons at the bottom right corner of the table.
     """
     df = results_df
+    filter_models_by_name = filter_models_by_name.replace(",", ";").replace(" ", "")
+    if len(filter_models_by_name) > 0 and "all" not in filter_models_by_name:
         filter_models_by_name = filter_models_by_name.split(";")
+        # filter_models_by_name = [name for name in filter_models_by_name if name != "all"]
         filter_models_by_name = [name for name in filter_models_by_name if name != ""]
         df = df.copy()
+        df = df[df["LLM_lower_case"].str.contains("|".join(filter_models_by_name), na=False)]
         if len(df) == 0: # return an empty DF and an empty figure
+            return Markdown(f"No models found matching: {filter_models_by_name}"), matplotlib.figure.Figure(), pd.DataFrame()
+    # return Markdown(""), df
     fig = visualize_leaderboard(df)
+    return Markdown(""), fig, df[["LLM", "Hallucination %", "Answer %", "Avg Summary Words"]]

app/app_utils.py CHANGED Viewed

@@ -5,6 +5,7 @@ from huggingface_hub import Repository
 import pandas as pd
 import matplotlib.pyplot as plt
 import matplotlib.figure
 from sklearn.preprocessing import MinMaxScaler
 # import dotenv
@@ -103,7 +104,7 @@ def load_results(
         print(f"Using pre-dumped results from {results_json}")
     results = json.load(open(results_json, "r"))
-    print(results)
     results_df = pd.DataFrame(results)
     results_df = results_df.sort_values(by="Hallucination %", ascending=True)
@@ -113,6 +114,8 @@ def load_results(
     for column in ["Hallucination %", "Answer %", "Avg Summary Words"]:
         results_df[column] = results_df[column].apply(lambda x: round(x, 3))
     return results_df
@@ -130,54 +133,57 @@ def determine_font_size(LLM: str, hallucination_percent: float) -> int:
         return 9
 def determine_font_color(hallucination_percent: float) -> str:
-    if hallucination_percent < 0.3:
-        return 'white'
-    elif hallucination_percent < 0.65:
         return 'black'
     else:
         return 'white'
-def determine_llm_x_position(LLM: str, hallucination_percent: float) -> float:
-    # determine the x position of the LLM name
-    # For an LLM, it's bar length is 10* its hallucination %
-    # if the LLM name cannot fit in the bar, move it to the left
-    # if the LLM name can fit in the bar, let its x position be 0.01
     name_length = len(LLM)
     print ("LLM: ", LLM, "hallu_rate: ", hallucination_percent, "name_length: ", name_length)
-    hallu_rate_to_bar_length_ratio = 10
     bar_length = hallu_rate_to_bar_length_ratio * hallucination_percent
-    if name_length > bar_length:
-        return 0.01
-    else:
-        return hallucination_percent
 def visualize_leaderboard(df: pd.DataFrame) -> matplotlib.figure.Figure:
-    fig = plt.figure(figsize=(5, 4))
     # plot using LLM as x-axis and Hallucination % as y-axis
     # make bars horizontal
     plot_df = df.head(10)
     plot_df["normalized_hallucination_rate"] = min_max_scaler.fit_transform(plot_df[["Hallucination %"]])
     plt.barh(plot_df["LLM"], plot_df["Hallucination %"], color=plt.cm.jet(plot_df["normalized_hallucination_rate"]))
-    # for i, row in plot_df.iterrows():
-    #     plt.text(
-    #         determine_llm_x_position(row["LLM"], row["Hallucination %"]),
-    #         row["LLM"],
-    #         f"{row['LLM']}",
-    #         ha='left',
-    #         va='center',
-    #         fontsize=9,
-    #         color=determine_font_color(row["normalized_hallucination_rate"])
-    #     )
     # plt.yticks([])
     plt.tight_layout()
     plt.xticks(fontsize=9)
-    # plt.xlabel("Hallucination %", fontsize=9)
-    plt.title("Grounded Hallucination Rate of Best LLMs", fontsize=9)
     plt.gca().spines['top'].set_visible(False)
     plt.gca().spines['right'].set_visible(False)
     plt.gca().spines['left'].set_visible(False)

 import pandas as pd
 import matplotlib.pyplot as plt
 import matplotlib.figure
+from datetime import datetime
 from sklearn.preprocessing import MinMaxScaler
 # import dotenv
         print(f"Using pre-dumped results from {results_json}")
     results = json.load(open(results_json, "r"))
+    # print(results)
     results_df = pd.DataFrame(results)
     results_df = results_df.sort_values(by="Hallucination %", ascending=True)
     for column in ["Hallucination %", "Answer %", "Avg Summary Words"]:
         results_df[column] = results_df[column].apply(lambda x: round(x, 3))
+    results_df["LLM_lower_case"] = results_df["LLM"].str.lower()
     return results_df
         return 9
 def determine_font_color(hallucination_percent: float) -> str:
+    if 0.25 < hallucination_percent < 0.65:
         return 'black'
     else:
         return 'white'
+def determine_llm_x_position_and_font_color(LLM: str, hallucination_percent: float) -> float:
     name_length = len(LLM)
     print ("LLM: ", LLM, "hallu_rate: ", hallucination_percent, "name_length: ", name_length)
+    hallu_rate_to_bar_length_ratio = 5
     bar_length = hallu_rate_to_bar_length_ratio * hallucination_percent
+    if name_length < bar_length:
+        return 0.01, determine_font_color(hallucination_percent)
+    else: # to the right of the bar, black anyway
+        return hallucination_percent, 'black'
 def visualize_leaderboard(df: pd.DataFrame) -> matplotlib.figure.Figure:
+    fig = plt.figure(figsize=(8, 4))
     # plot using LLM as x-axis and Hallucination % as y-axis
     # make bars horizontal
     plot_df = df.head(10)
     plot_df["normalized_hallucination_rate"] = min_max_scaler.fit_transform(plot_df[["Hallucination %"]])
     plt.barh(plot_df["LLM"], plot_df["Hallucination %"], color=plt.cm.jet(plot_df["normalized_hallucination_rate"]))
+    # plot_df["LLM_x_position"], plot_df["font_color"] = zip(*plot_df.apply(
+    #     lambda row: determine_llm_x_position_and_font_color(row["LLM"], row["Hallucination %"]),
+    #     axis=1
+    # ))
+    for i, row in plot_df.iterrows():
+        plt.text(
+            # row["LLM_x_position"],
+            row["Hallucination %"] + 0.025,
+            row["LLM"],
+            row["Hallucination %"],
+            # f"{row['LLM']}",
+            ha='left',
+            va='center',
+            fontsize=9,
+            # color=row["font_color"]
+        )
     # plt.yticks([])
     plt.tight_layout()
+    # add margin to the right of the plot
+    plt.subplots_adjust(right=0.95)
     plt.xticks(fontsize=9)
+    plt.xlabel(f"Copyright (2025) Vectara, Inc. Plot generated on: {datetime.now().strftime('%B %d, %Y')}", fontsize=9)
+    plt.title("Grounded Hallucination Rate of Best LLMs", fontsize=12)
     plt.gca().spines['top'].set_visible(False)
     plt.gca().spines['right'].set_visible(False)
     plt.gca().spines['left'].set_visible(False)

app/requirements.txt CHANGED Viewed

@@ -1,4 +1,4 @@
-funix==0.6.1
 pandas
 huggingface_hub
 matplotlib

+funix==0.6.2
 pandas
 huggingface_hub
 matplotlib