inflaton committed
Commit 251efe4
Parent: 8a58456

initial working code

Files changed (7)
  1. .env.example +81 -0
  2. .gitignore +149 -0
  3. app.py +109 -25
  4. eval_modules/calc_repetitions_v2e.py +1333 -0
  5. eval_modules/utils.py +262 -0
  6. ms_macro.json +0 -0
  7. requirements.txt +20 -1
.env.example ADDED
@@ -0,0 +1,81 @@
1
+ LLM_MODEL_TYPE=huggingface
2
+ # LLM_MODEL_TYPE=openai
3
+ # LLM_MODEL_TYPE=hftgi
4
+ # LLM_MODEL_TYPE=ollama
5
+ # LLM_MODEL_TYPE=google
6
+ # LLM_MODEL_TYPE=vllm
7
+
8
+ HUGGINGFACE_AUTH_TOKEN=
9
+
10
+ HFTGI_SERVER_URL=
11
+
12
+ OPENAI_API_KEY=
13
+
14
+ GOOGLE_API_KEY=
15
+
16
+ # if unset, default to "gpt-3.5-turbo"
17
+ OPENAI_MODEL_NAME=
18
+
19
+ # GEMINI_MODEL_NAME=gemini-1.5-pro-latest
20
+
21
+ # OLLAMA_MODEL_NAME=orca2:7b
22
+ # OLLAMA_MODEL_NAME=mistral:7b
23
+ # OLLAMA_MODEL_NAME=gemma:7b
24
+ # OLLAMA_MODEL_NAME=llama2:7b
25
+ OLLAMA_MODEL_NAME=llama3:8b
26
+
27
+ OLLAMA_RP=1.15
28
+ HF_RP=1.15
29
+
30
+ LANGCHAIN_DEBUG=false
31
+ BATCH_SIZE=1
32
+ APPLY_CHAT_TEMPLATE_FOR_RAG=true
33
+
34
+ # cpu, mps or cuda:0 - if unset, use whatever device is detected
35
+ HF_EMBEDDINGS_DEVICE_TYPE=
36
+ HF_PIPELINE_DEVICE_TYPE=
37
+
38
+ # uncomment one of the below to load corresponding quantized model
39
+ # LOAD_QUANTIZED_MODEL=4bit
40
+ # LOAD_QUANTIZED_MODEL=8bit
41
+
42
+ QA_WITH_RAG=true
43
+ # QA_WITH_RAG=false
44
+
45
+ RETRIEVER_TYPE=questions_file
46
+ # RETRIEVER_TYPE=vectorstore
47
+
48
+ QUESTIONS_FILE_PATH="./data/datasets/ms_macro.json"
49
+
50
+ DISABLE_MODEL_PRELOADING=true
51
+ CHAT_HISTORY_ENABLED=false
52
+ SHOW_PARAM_SETTINGS=false
53
+ SHARE_GRADIO_APP=false
54
+
55
+ # if unset, default to "hkunlp/instructor-xl"
56
+ HF_EMBEDDINGS_MODEL_NAME="hkunlp/instructor-large"
57
+
58
+ # number of cpu cores - used to set n_threads for GPT4ALL & LlamaCpp models
59
+ NUMBER_OF_CPU_CORES=
60
+
61
+ USING_TORCH_BFLOAT16=true
62
+
63
+ # HUGGINGFACE_MODEL_NAME_OR_PATH="databricks/dolly-v2-3b"
64
+ # HUGGINGFACE_MODEL_NAME_OR_PATH="databricks/dolly-v2-7b"
65
+ # HUGGINGFACE_MODEL_NAME_OR_PATH="databricks/dolly-v2-12b"
66
+ # HUGGINGFACE_MODEL_NAME_OR_PATH="TheBloke/wizardLM-7B-HF"
67
+ # HUGGINGFACE_MODEL_NAME_OR_PATH="TheBloke/vicuna-7B-1.1-HF"
68
+ # HUGGINGFACE_MODEL_NAME_OR_PATH="nomic-ai/gpt4all-j"
69
+ # HUGGINGFACE_MODEL_NAME_OR_PATH="nomic-ai/gpt4all-falcon"
70
+ # HUGGINGFACE_MODEL_NAME_OR_PATH="lmsys/fastchat-t5-3b-v1.0"
71
+ # HUGGINGFACE_MODEL_NAME_OR_PATH="meta-llama/Llama-2-7b-chat-hf"
72
+ # HUGGINGFACE_MODEL_NAME_OR_PATH="meta-llama/Llama-2-13b-chat-hf"
73
+ # HUGGINGFACE_MODEL_NAME_OR_PATH="meta-llama/Llama-2-70b-chat-hf"
74
+ # HUGGINGFACE_MODEL_NAME_OR_PATH="meta-llama/Meta-Llama-3-8B-Instruct"
75
+ # HUGGINGFACE_MODEL_NAME_OR_PATH="meta-llama/Meta-Llama-3-70B-Instruct"
76
+ # HUGGINGFACE_MODEL_NAME_OR_PATH="microsoft/Orca-2-7b"
77
+ # HUGGINGFACE_MODEL_NAME_OR_PATH="microsoft/Orca-2-13b"
78
+ HUGGINGFACE_MODEL_NAME_OR_PATH="google/gemma-1.1-2b-it"
79
+ # HUGGINGFACE_MODEL_NAME_OR_PATH="google/gemma-1.1-7b-it"
80
+ # HUGGINGFACE_MODEL_NAME_OR_PATH="microsoft/Phi-3-mini-128k-instruct"
81
+ # HUGGINGFACE_MODEL_NAME_OR_PATH="mistralai/Mistral-7B-Instruct-v0.2"
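[Editor's note: a minimal sketch of how the settings above are typically consumed at application startup. It assumes python-dotenv is available (the requirements.txt changes are not shown in this view, so treat that import as an assumption); the variable names come from .env.example and the fallback values are illustrative only.]

import os

from dotenv import load_dotenv  # assumption: python-dotenv is a project dependency

# Load key=value pairs from a local .env file (copied from .env.example) into os.environ.
load_dotenv()

llm_model_type = os.getenv("LLM_MODEL_TYPE", "huggingface")
qa_with_rag = os.getenv("QA_WITH_RAG", "true").lower() == "true"
questions_file_path = os.getenv("QUESTIONS_FILE_PATH") or "./data/datasets/ms_macro.json"
hf_model = os.getenv("HUGGINGFACE_MODEL_NAME_OR_PATH", "google/gemma-1.1-2b-it")

print(f"model type: {llm_model_type}, RAG: {qa_with_rag}")
print(f"questions file: {questions_file_path}, HF model: {hf_model}")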
.gitignore ADDED
@@ -0,0 +1,149 @@
1
+ *.out
2
+ *.log
3
+ pdfs/
4
+ .vscode/
5
+
6
+ # Byte-compiled / optimized / DLL files
7
+ __pycache__/
8
+ *.py[cod]
9
+ *$py.class
10
+
11
+ # C extensions
12
+ *.so
13
+
14
+ # Distribution / packaging
15
+ .Python
16
+ build/
17
+ develop-eggs/
18
+ dist/
19
+ downloads/
20
+ eggs/
21
+ .eggs/
22
+ lib/
23
+ lib64/
24
+ parts/
25
+ sdist/
26
+ var/
27
+ wheels/
28
+ pip-wheel-metadata/
29
+ share/python-wheels/
30
+ *.egg-info/
31
+ .installed.cfg
32
+ *.egg
33
+ MANIFEST
34
+
35
+ # PyInstaller
36
+ # Usually these files are written by a python script from a template
37
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
38
+ *.manifest
39
+ *.spec
40
+
41
+ # Installer logs
42
+ pip-log.txt
43
+ pip-delete-this-directory.txt
44
+
45
+ # Unit test / coverage reports
46
+ htmlcov/
47
+ .tox/
48
+ .nox/
49
+ .coverage
50
+ .coverage.*
51
+ .cache
52
+ nosetests.xml
53
+ coverage.xml
54
+ *.cover
55
+ *.py,cover
56
+ .hypothesis/
57
+ .pytest_cache/
58
+
59
+ # Translations
60
+ *.mo
61
+ *.pot
62
+
63
+ # Django stuff:
64
+ # *.log
65
+ local_settings.py
66
+ db.sqlite3
67
+ db.sqlite3-journal
68
+
69
+ # Flask stuff:
70
+ instance/
71
+ .webassets-cache
72
+
73
+ # Scrapy stuff:
74
+ .scrapy
75
+
76
+ # Sphinx documentation
77
+ docs/_build/
78
+
79
+ # PyBuilder
80
+ target/
81
+
82
+ # Jupyter Notebook
83
+ .ipynb_checkpoints
84
+
85
+ # IPython
86
+ profile_default/
87
+ ipython_config.py
88
+
89
+ # pyenv
90
+ .python-version
91
+
92
+ # pipenv
93
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
94
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
95
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
96
+ # install all needed dependencies.
97
+ #Pipfile.lock
98
+
99
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
100
+ __pypackages__/
101
+
102
+ # Celery stuff
103
+ celerybeat-schedule
104
+ celerybeat.pid
105
+
106
+ # SageMath parsed files
107
+ *.sage.py
108
+
109
+ # Environments
110
+ .env
111
+ .venv
112
+ env/
113
+ venv/
114
+ ENV/
115
+ env.bak/
116
+ venv.bak/
117
+
118
+ # Spyder project settings
119
+ .spyderproject
120
+ .spyproject
121
+
122
+ # Rope project settings
123
+ .ropeproject
124
+
125
+ # mkdocs documentation
126
+ /site
127
+
128
+ # mypy
129
+ .mypy_cache/
130
+ .dmypy.json
131
+ dmypy.json
132
+
133
+ # Pyre type checker
134
+ .pyre/
135
+
136
+ # JetBrains
137
+ .idea
138
+
139
+ *.db
140
+
141
+ .DS_Store
142
+
143
+ vectorstore.pkl
144
+ langchain.readthedocs.io/
145
+
146
+ models/
147
+ data/logs/hftgi-2024-03-18.txt
148
+ qa_*_all_results.csv
149
+ qa_*_test_results.csv
app.py CHANGED
@@ -1,54 +1,141 @@
 
 
1
  import gradio as gr
2
  from huggingface_hub import InferenceClient
3
 
4
  """
5
  For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
6
  """
7
- client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
 
 
8
 
9
 
10
- def respond(
11
  message,
12
  history: list[tuple[str, str]],
13
  system_message,
14
- max_tokens,
15
- temperature,
16
- top_p,
 
 
17
  ):
18
- messages = [{"role": "system", "content": system_message}]
19
 
20
- for val in history:
21
- if val[0]:
22
- messages.append({"role": "user", "content": val[0]})
23
- if val[1]:
24
- messages.append({"role": "assistant", "content": val[1]})
25
 
 
26
  messages.append({"role": "user", "content": message})
27
 
28
- response = ""
29
 
 
30
  for message in client.chat_completion(
31
  messages,
32
  max_tokens=max_tokens,
33
  stream=True,
34
  temperature=temperature,
 
 
35
  top_p=top_p,
 
36
  ):
37
- token = message.choices[0].delta.content
 
38
 
39
- response += token
40
- yield response
41
 
42
 
43
- """
44
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
45
- """
46
  demo = gr.ChatInterface(
47
- respond,
 
 
 
 
 
48
  additional_inputs=[
49
  gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
50
- gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
51
- gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
52
  gr.Slider(
53
  minimum=0.1,
54
  maximum=1.0,
@@ -58,7 +145,4 @@ demo = gr.ChatInterface(
58
  ),
59
  ],
60
  )
61
-
62
-
63
- if __name__ == "__main__":
64
- demo.launch()
 
1
+ import json
2
+ import os
3
  import gradio as gr
4
  from huggingface_hub import InferenceClient
5
+ from eval_modules.utils import calc_bleu_rouge_scores
6
+ from eval_modules.calc_repetitions_v2e import detect_repetitions
7
+
8
+ questions_file_path = os.getenv("QUESTIONS_FILE_PATH") or "./ms_macro.json"
9
+
10
+ questions = json.loads(open(questions_file_path).read())
11
+ examples = [[question["question"].strip()] for question in questions]
12
+ print(f"Loaded {len(examples)} examples")
13
+
14
+ qa_system_prompt = "Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer."
15
 
16
  """
17
  For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
18
  """
19
+ # client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
20
+ # client = InferenceClient("HuggingFaceH4/zephyr-7b-gemma-v0.1")
21
+ client = InferenceClient("microsoft/Phi-3.5-mini-instruct")
22
 
23
 
24
+ def chat(
25
  message,
26
  history: list[tuple[str, str]],
27
  system_message,
28
+ temperature=0,
29
+ frequency_penalty=0,
30
+ presence_penalty=0,
31
+ max_tokens=256,
32
+ top_p=0.95,
33
  ):
34
+ chat = []
35
+ for item in history:
36
+ chat.append({"role": "user", "content": item[0]})
37
+ if item[1] is not None:
38
+ chat.append({"role": "assistant", "content": item[1]})
39
+
40
+ index = -1
41
+ if [message] in examples:
42
+ index = examples.index([message])
43
+ message = f"{qa_system_prompt}\n\n{questions[index]['context']}\n\nQuestion: {message}"
44
+ print("RAG prompt:", message)
45
 
46
+ chat.append({"role": "user", "content": message})
 
 
 
 
47
 
48
+ messages = [{"role": "system", "content": system_message}]
49
  messages.append({"role": "user", "content": message})
50
 
51
+ partial_text = ""
52
 
53
+ finish_reason = None
54
  for message in client.chat_completion(
55
  messages,
56
  max_tokens=max_tokens,
57
  stream=True,
58
  temperature=temperature,
59
+ frequency_penalty=None, # frequency_penalty,
60
+ presence_penalty=None, # presence_penalty,
61
  top_p=top_p,
62
+ seed=42,
63
  ):
64
+ finish_reason = message.choices[0].finish_reason
65
+ # print("finish_reason:", finish_reason)
66
 
67
+ if finish_reason is None:
68
+ new_text = message.choices[0].delta.content
69
+ partial_text += new_text
70
+ yield partial_text
71
+ else:
72
+ break
73
+
74
+ answer = partial_text
75
+ (whitespace_score, repetition_score, total_repetitions) = detect_repetitions(answer)
76
+ partial_text += "\n\nRepetition Metrics:\n"
77
+ partial_text += f"1. Whitespace Score: {whitespace_score:.3f}\n"
78
+ partial_text += f"1. Repetition Score: {repetition_score:.3f}\n"
79
+ partial_text += f"1. Total Repetitions: {total_repetitions:.3f}\n"
80
+ partial_text += (
81
+ f"1. Non-Repetitive Ratio: {1 - total_repetitions / len(answer):.3f}\n"
82
+ )
83
+
84
+ if index >= 0: # RAG
85
+ key = (
86
+ "wellFormedAnswers"
87
+ if "wellFormedAnswers" in questions[index]
88
+ else "answers"
89
+ )
90
+ scores = calc_bleu_rouge_scores([answer], [questions[index][key]], debug=True)
91
+
92
+ partial_text += "\n\n Performance Metrics:\n"
93
+ partial_text += f'1. BLEU-1: {scores["bleu_scores"]["bleu"]:.3f}\n'
94
+ partial_text += f'1. RougeL: {scores["rouge_scores"]["rougeL"]:.3f}\n'
95
+
96
+ partial_text += f"\n\nGround truth: {questions[index][key][0]}\n"
97
+
98
+ partial_text += f"\n\nThe text generation has ended because: {finish_reason}\n"
99
+
100
+ yield partial_text
101
 
102
 
 
 
 
103
  demo = gr.ChatInterface(
104
+ fn=chat,
105
+ examples=examples,
106
+ cache_examples=False,
107
+ additional_inputs_accordion=gr.Accordion(
108
+ label="⚙️ Parameters", open=False, render=False
109
+ ),
110
  additional_inputs=[
111
  gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
112
+ gr.Slider(
113
+ minimum=0, maximum=2, step=0.1, value=0, label="Temperature", render=False
114
+ ),
115
+ gr.Slider(
116
+ minimum=-2,
117
+ maximum=2,
118
+ step=0.1,
119
+ value=0,
120
+ label="Frequency Penalty",
121
+ render=False,
122
+ ),
123
+ gr.Slider(
124
+ minimum=-2,
125
+ maximum=2,
126
+ step=0.1,
127
+ value=0,
128
+ label="Presence Penalty",
129
+ render=False,
130
+ ),
131
+ gr.Slider(
132
+ minimum=128,
133
+ maximum=4096,
134
+ step=1,
135
+ value=512,
136
+ label="Max new tokens",
137
+ render=False,
138
+ ),
139
  gr.Slider(
140
  minimum=0.1,
141
  maximum=1.0,
 
145
  ),
146
  ],
147
  )
148
+ demo.launch()
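[Editor's note: a short sketch, not part of the commit, of how the new app.py scores a completed answer. detect_repetitions is defined in eval_modules/calc_repetitions_v2e.py below; calc_bleu_rouge_scores lives in eval_modules/utils.py, whose body is truncated in this view, so its return structure is inferred from the way app.py uses it.]

from eval_modules.calc_repetitions_v2e import detect_repetitions
from eval_modules.utils import calc_bleu_rouge_scores

answer = "Paris is the capital of France. Paris is the capital of France."
ground_truth = ["Paris is the capital of France."]

# detect_repetitions returns (non-word-char repetition count, text repetition count, total)
whitespace_score, repetition_score, total_repetitions = detect_repetitions(answer)
non_repetitive_ratio = 1 - total_repetitions / len(answer)

# calc_bleu_rouge_scores takes lists of predictions and references, as in app.py
scores = calc_bleu_rouge_scores([answer], [ground_truth], debug=True)

print(f"repetitions: {total_repetitions}, non-repetitive ratio: {non_repetitive_ratio:.3f}")
print(f"BLEU-1: {scores['bleu_scores']['bleu']:.3f}, ROUGE-L: {scores['rouge_scores']['rougeL']:.3f}")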
 
 
 
eval_modules/calc_repetitions_v2e.py ADDED
@@ -0,0 +1,1333 @@
1
+ import os
2
+ import re
3
+ import math
4
+ import pandas as pd
5
+ import numpy as np
6
+ import matplotlib.pyplot as plt
7
+ import matplotlib.ticker as mtick
8
+ import seaborn as sns
9
+ import nltk
10
+ import evaluate
11
+ import traceback
12
+
13
+ bert_score = evaluate.load("bertscore")
14
+ meteor = evaluate.load("meteor")
15
+
16
+ print(f"loading: {__file__}")
17
+
18
+ # pattern_non_word_char_repetition = re.compile(r"\s{5,}")
19
+ # pattern_text_repetitions = re.compile(r"(.{5}.*)\s*((\1)\s*)+", re.M | re.DOTALL)
20
+
21
+ # final version
22
+ pattern_non_word_char_repetition = re.compile(r"[\s\W]{5,}")
23
+ pattern_text_repetitions = re.compile(
24
+ r"(?P<repeat>.{5}.*?)(?:[\s\W]*(?P=repeat))+", re.M | re.DOTALL | re.IGNORECASE
25
+ )
26
+ # Explanation of the Regex Pattern:
27
+ # (?P<repeat>.{5}.*?): Captures any sequence of characters with minimal length of 5 and names this group repeat.
28
+ # .*?: Matches zero or more characters, non-greedily (as few as possible).
29
+ # (?:[\s\W]*(?P=repeat))+: A non-capturing group that matches one or more repetitions of:
30
+ # [\s\W]*: Zero or more whitespace or non-word characters (spaces, punctuation, etc.).
31
+ # (?P=repeat): A backreference to the named group repeat.
32
+
33
+
34
+ def del_non_word_char_repetition(text, debug=False):
35
+ count = 0
36
+
37
+ if isinstance(text, str):
38
+ if debug:
39
+ print("----detect non-word characters repetition----")
40
+ count = len(text)
41
+ text = pattern_non_word_char_repetition.sub("\t", text)
42
+ count -= len(text)
43
+ if debug and count:
44
+ print(f"removed non-word characters repetition: {count}")
45
+ return text, count
46
+
47
+
48
+ # final version for repetition detection
49
+ def detect_text_repetitions(text, debug=False):
50
+ count = 0
51
+
52
+ if isinstance(text, str):
53
+ if debug:
54
+ print("----detect text repetitions----")
55
+ matches = pattern_text_repetitions.finditer(text)
56
+ for match in matches:
57
+ if debug:
58
+ print(match)
59
+ for groupNum in range(0, len(match.groups())):
60
+ groupNum = groupNum + 1
61
+ print(
62
+ "Group {groupNum} found at {start}-{end}: `{group}`".format(
63
+ groupNum=groupNum,
64
+ start=match.start(groupNum),
65
+ end=match.end(groupNum),
66
+ group=match.group(groupNum),
67
+ )
68
+ )
69
+
70
+ start, end = match.span()
71
+ count += end - start - len(match.group(1))
72
+
73
+ return count
74
+
75
+
76
+ def detect_repetitions(text, debug=False):
77
+ if isinstance(text, str) is False:
78
+ return 0, 0, 0
79
+ text, count_non_word_char_repetition = del_non_word_char_repetition(
80
+ text, debug=debug
81
+ )
82
+ count_text_repetitions = detect_text_repetitions(text, debug=debug)
83
+ total_repetitions = count_non_word_char_repetition + count_text_repetitions
84
+
85
+ result = (count_non_word_char_repetition, count_text_repetitions, total_repetitions)
86
+
87
+ if debug:
88
+ print(result)
89
+ return result
90
+
91
+
92
+ def detect_scores(
93
+ row, debug=False, answer_col="answer", ground_truth_col="ground_truth"
94
+ ):
95
+ newline_score, repetition_score, total_repetitions = detect_repetitions(
96
+ row[answer_col], debug=debug
97
+ )
98
+
99
+ if ground_truth_col:
100
+ ground_truth_newline_score, ground_truth_repetition_score, _ = (
101
+ detect_repetitions(row[ground_truth_col], debug=debug)
102
+ )
103
+
104
+ newline_score -= ground_truth_newline_score
105
+ if newline_score < 0:
106
+ newline_score = 0
107
+
108
+ repetition_score -= ground_truth_repetition_score
109
+ if repetition_score < 0:
110
+ repetition_score = 0
111
+
112
+ total_repetitions = newline_score + repetition_score
113
+
114
+ return pd.Series([newline_score, repetition_score, total_repetitions])
115
+
116
+
117
+ def load_with_newline_and_repetition_scores(result_file, force_recalculate=False):
118
+ print(f"loading result file: {result_file}")
119
+ df = pd.read_csv(result_file, comment="#", on_bad_lines="warn")
120
+
121
+ if (
122
+ force_recalculate
123
+ or "newline_score" not in df.columns
124
+ or "repetition_score" not in df.columns
125
+ or "total_repetitions" not in df.columns
126
+ or "nrr" not in df.columns
127
+ or "rr" not in df.columns
128
+ ):
129
+ if (
130
+ force_recalculate
131
+ or "newline_score" not in df.columns
132
+ or "repetition_score" not in df.columns
133
+ or "total_repetitions" not in df.columns
134
+ ):
135
+ df[["newline_score", "repetition_score", "total_repetitions"]] = df.apply(
136
+ detect_scores, axis=1
137
+ )
138
+
139
+ df["answer_len"] = df["answer"].apply(
140
+ lambda x: len(x) if isinstance(x, str) else 0
141
+ )
142
+
143
+ df["nrr"] = df.apply(
144
+ lambda x: (
145
+ 1
146
+ if x["answer_len"] == 0
147
+ else 1 - (x["newline_score"] + x["repetition_score"]) / x["answer_len"]
148
+ ),
149
+ axis=1,
150
+ )
151
+
152
+ df["rr"] = df["nrr"].apply(lambda x: 1 - x)
153
+
154
+ df.to_csv(result_file, index=False)
155
+
156
+ return df
157
+
158
+
159
+ def replace_last(source_string, old_string, new_string):
160
+ head, _sep, tail = source_string.rpartition(old_string)
161
+ return head + new_string + tail
162
+
163
+
164
+ def load_for_repetition_penalty(
165
+ csv_result_file, repetition_penalty, force_recalculate=False
166
+ ):
167
+ result_file = replace_last(
168
+ csv_result_file, ".csv", f"_RP_{repetition_penalty:.3f}.csv"
169
+ )
170
+ return load_with_newline_and_repetition_scores(
171
+ result_file, force_recalculate=force_recalculate
172
+ )
173
+
174
+
175
+ rap_penalty_functions = {
176
+ "linear": lambda x: x,
177
+ "quadratic": lambda x: x * x,
178
+ "cubic": lambda x: x * x * x,
179
+ "logarithmic": lambda x: math.log(x + 1, 2),
180
+ "exponential": lambda x: math.exp(x - 1),
181
+ }
182
+
183
+
184
+ def calc_adjusted_performance(f, r, l=1, penalty_function="cubic"):
185
+ n = 1 - r / l if l > 0 else 0
186
+ return f * rap_penalty_functions[penalty_function](n)
187
+
188
+
189
+ def calculate_adjusted_performance(row):
190
+ r = row["total_repetitions"]
191
+ l = row["answer_len"]
192
+ adjusted_precision = calc_adjusted_performance(row["precision"], r, l)
193
+ adjusted_recall = calc_adjusted_performance(row["recall"], r, l)
194
+ return pd.Series([adjusted_precision, adjusted_recall])
195
+
196
+
197
+ def load_performance_df(csv_result_file, repetition_penalty):
198
+ result_file = replace_last(
199
+ csv_result_file, ".csv", f"_RP_{repetition_penalty:.3f}-t2_evaluated.json"
200
+ )
201
+ result_file = result_file.replace("/results/", "/eval/")
202
+ print(f"loading json file: {result_file}")
203
+ df = pd.read_json(result_file)
204
+
205
+ return df
206
+
207
+
208
+ def calculate_performance_score(
209
+ csv_result_file, repetition_penalty, force_recalculate=False
210
+ ):
211
+ result_file = replace_last(
212
+ csv_result_file, ".csv", f"_rpp_{repetition_penalty:.2f}.csv"
213
+ )
214
+
215
+ if os.path.exists(result_file):
216
+ print(f"loading result file: {result_file}")
217
+ df = load_with_newline_and_repetition_scores(
218
+ result_file, force_recalculate=force_recalculate
219
+ )
220
+ else:
221
+ print(f"re-creating result file: {result_file}")
222
+ df = pd.DataFrame()
223
+ force_recalculate = True
224
+
225
+ if force_recalculate or "f2" in df.columns or "f1" not in df.columns:
226
+ try:
227
+ perf_df = load_performance_df(csv_result_file, repetition_penalty)
228
+ df.drop(
229
+ columns=[
230
+ "precision",
231
+ "recall",
232
+ "f1",
233
+ "f2",
234
+ "entities_in_answer",
235
+ "entities_in_question",
236
+ "word_count",
237
+ ],
238
+ errors="ignore",
239
+ inplace=True,
240
+ )
241
+
242
+ df["id"] = perf_df["id"]
243
+ df["question"] = perf_df["question"]
244
+ df["answer"] = perf_df["pred_answer"]
245
+ df["word_count"] = df["answer"].apply(
246
+ lambda x: len(nltk.word_tokenize(x)) if isinstance(x, str) else 0
247
+ )
248
+ df["ground_truth"] = perf_df["ground_truth"]
249
+
250
+ df["eval_gemini_1.0_pro"] = perf_df["eval_gemini_1.0_pro"]
251
+ df["precision"] = perf_df["score"].apply(lambda x: x[0])
252
+ df["recall"] = perf_df["score"].apply(lambda x: x[1])
253
+ df["f1"] = perf_df["score"].apply(lambda x: x[2])
254
+ except Exception as e:
255
+ print(f"\tignored error: {e}")
256
+ # traceback.print_exc()
257
+
258
+ df[["newline_score", "repetition_score", "total_repetitions"]] = df.apply(
259
+ detect_scores, axis=1
260
+ )
261
+ df["answer_len"] = df["answer"].apply(
262
+ lambda x: len(x) if isinstance(x, str) else 0
263
+ )
264
+
265
+ df[["adjusted_precision", "adjusted_recall"]] = df.apply(
266
+ calculate_adjusted_performance, axis=1
267
+ )
268
+
269
+ df.to_csv(result_file, index=False)
270
+ print(f"performance scores saved to result file: {result_file}")
271
+
272
+ # print(f"df len: {len(df)}")
273
+
274
+ return df
275
+
276
+
277
+ def adjust_perf_scores_with_repetition_penalty(result, precision, recall):
278
+ newline_score = [
279
+ df["newline_score"].mean() for df in result["df_list_repetition_penalty"]
280
+ ]
281
+
282
+ repetition_score = [
283
+ df["repetition_score"].mean() for df in result["df_list_repetition_penalty"]
284
+ ]
285
+
286
+ answer_len = [
287
+ df["answer_len"].mean() for df in result["df_list_repetition_penalty"]
288
+ ]
289
+
290
+ precision = [
291
+ calc_adjusted_performance(f, n + r, l)
292
+ for f, n, r, l in zip(precision, newline_score, repetition_score, answer_len)
293
+ ]
294
+ recall = [
295
+ calc_adjusted_performance(f, n + r, l)
296
+ for f, n, r, l in zip(recall, newline_score, repetition_score, answer_len)
297
+ ]
298
+
299
+ return precision, recall
300
+
301
+
302
+ def plot_performance_scores(
303
+ result,
304
+ models=None,
305
+ title="Performance",
306
+ ):
307
+ if models is None:
308
+ models = result.keys()
309
+ for model in models:
310
+ print(f"model: {model}")
311
+ df = result[model]["df_overall"]
312
+
313
+ # Calculate the statistics
314
+ precision = [
315
+ df["precision"].mean() for df in result[model]["df_list_repetition_penalty"]
316
+ ]
317
+ recall = [
318
+ df["recall"].mean() for df in result[model]["df_list_repetition_penalty"]
319
+ ]
320
+ f1 = [2 * (p * r) / (p + r) for p, r in zip(precision, recall)]
321
+ best_f1 = max(f1)
322
+ best_f1_index = f1.index(best_f1)
323
+
324
+ precision, recall = adjust_perf_scores_with_repetition_penalty(
325
+ result[model], precision, recall
326
+ )
327
+ afrp = [2 * (p * r) / (p + r) for p, r in zip(precision, recall)]
328
+
329
+ # f1 = [df["f1"].mean() for df in result[model]["df_list_repetition_penalty"]]
330
+ best_afrp = max(afrp)
331
+ best_afrp_index = afrp.index(best_afrp)
332
+
333
+ adjusted_precision = [
334
+ df["adjusted_precision"].mean()
335
+ for df in result[model]["df_list_repetition_penalty"]
336
+ ]
337
+ adjusted_recall = [
338
+ df["adjusted_recall"].mean()
339
+ for df in result[model]["df_list_repetition_penalty"]
340
+ ]
341
+ afrp2 = [
342
+ 2 * (p * r) / (p + r) for p, r in zip(adjusted_precision, adjusted_recall)
343
+ ]
344
+ best_afrp2 = max(afrp2)
345
+ best_afrp2_index = afrp2.index(best_afrp2)
346
+
347
+ repetition_penalties = list(df["repetition_penalty"])
348
+
349
+ # line plot for precision, recall, f1
350
+ plt.figure(figsize=(10, 6))
351
+
352
+ plt.axvspan(
353
+ repetition_penalties[best_f1_index] - 0.01,
354
+ repetition_penalties[best_f1_index] + 0.01,
355
+ alpha=0.5,
356
+ edgecolor="none",
357
+ facecolor="blue",
358
+ )
359
+
360
+ # plt.axvspan(
361
+ # repetition_penalties[best_afrp2_index] - 0.01,
362
+ # repetition_penalties[best_afrp2_index] + 0.01,
363
+ # alpha=0.5,
364
+ # edgecolor="none",
365
+ # facecolor="green",
366
+ # )
367
+
368
+ plt.axvspan(
369
+ repetition_penalties[best_afrp_index] - 0.01,
370
+ repetition_penalties[best_afrp_index] + 0.01,
371
+ alpha=0.5,
372
+ edgecolor="none",
373
+ facecolor="orange",
374
+ )
375
+
376
+ plt.plot(repetition_penalties, f1, label="F1", marker="D", color="blue")
377
+ # plt.plot(
378
+ # repetition_penalties,
379
+ # afrp2,
380
+ # label="Per-question RAP - F1",
381
+ # marker="s",
382
+ # color="green",
383
+ # )
384
+ plt.plot(
385
+ repetition_penalties,
386
+ afrp,
387
+ label="RAP - F1",
388
+ marker="o",
389
+ color="orange",
390
+ )
391
+ plt.xlabel("Repetition Penalties")
392
+ plt.ylabel("Score")
393
+ # plt.xlim(0.99, 1.31)
394
+ # y in percentage
395
+ plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
396
+ plt.title(f"{model} {title}")
397
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
398
+
399
+ plt.show()
400
+
401
+
402
+ def plot_best_afrp(
403
+ result,
404
+ models=None,
405
+ title="Models with Best RAP - F1",
406
+ ref_result=None,
407
+ ):
408
+ # Initialize lists to store the statistics
409
+ model_names = []
410
+ best_f1 = []
411
+ best_afrp = []
412
+ best_repetition_penalty = []
413
+ best_mtr = []
414
+
415
+ if models is None:
416
+ models = result.keys()
417
+ for model in models:
418
+ print(f"model: {model}")
419
+ df = result[model]["df_overall"]
420
+
421
+ # Calculate the statistics
422
+ precision = [
423
+ df["precision"].mean() for df in result[model]["df_list_repetition_penalty"]
424
+ ]
425
+ recall = [
426
+ df["recall"].mean() for df in result[model]["df_list_repetition_penalty"]
427
+ ]
428
+ # f1 = [df["f1"].mean() for df in result[model]["df_list_repetition_penalty"]]
429
+ f1 = [2 * (p * r) / (p + r) for p, r in zip(precision, recall)]
430
+
431
+ newline_score = [
432
+ df["newline_score"].mean()
433
+ for df in result[model]["df_list_repetition_penalty"]
434
+ ]
435
+ # print(f"newline_score: {newline_score}")
436
+
437
+ repetition_score = [
438
+ df["repetition_score"].mean()
439
+ for df in result[model]["df_list_repetition_penalty"]
440
+ ]
441
+ # print(f"repetition_score: {repetition_score}")
442
+
443
+ answer_len = [
444
+ df["answer_len"].mean()
445
+ for df in result[model]["df_list_repetition_penalty"]
446
+ ]
447
+
448
+ afrp = [
449
+ calc_adjusted_performance(f, n + r, l)
450
+ for f, n, r, l in zip(f1, newline_score, repetition_score, answer_len)
451
+ ]
452
+
453
+ best_afrp.append(max(afrp))
454
+ best_afrp_index = afrp.index(best_afrp[-1])
455
+ best_repetition_penalty.append(df["repetition_penalty"][best_afrp_index])
456
+
457
+ best_f1.append(f1[best_afrp_index])
458
+ best_mtr.append(
459
+ newline_score[best_afrp_index] + repetition_score[best_afrp_index]
460
+ )
461
+
462
+ # print(
463
+ # f"best repetition penalty: {best_repetition_penalty[-1]}, best afrp: {best_afrp[-1]}, f1: {best_f1[-1]}"
464
+ # )
465
+
466
+ df = result[model]["df_list_repetition_penalty"][best_afrp_index]
467
+
468
+ model_names.append(
469
+ f"{model} (RP={best_repetition_penalty[-1]})"
470
+ ) # Add the model name to the list
471
+
472
+ if ref_result is not None:
473
+ print("ref_result:", ref_result)
474
+ for model in ref_result.keys():
475
+ model_names.append(model)
476
+ df = pd.read_csv(ref_result[model])
477
+ # df = df[df["id"].isin(wikidata_df["id"])]
478
+
479
+ p = df["precision"].mean()
480
+ r = df["recall"].mean()
481
+
482
+ f1 = 2 * p * r / (p + r) if p + r > 0 else 0
483
+ best_f1.append(f1)
484
+ best_afrp.append(f1)
485
+ best_mtr.append(0)
486
+
487
+ print("model_names:", model_names)
488
+ # print("best_f1:", best_f1)
489
+ # print("best_afrp:", best_afrp)
490
+
491
+ # Create a DataFrame with the statistics
492
+ data = pd.DataFrame(
493
+ {
494
+ "Model": model_names,
495
+ "RAP - F1": best_afrp,
496
+ "F1": best_f1,
497
+ }
498
+ )
499
+
500
+ # Melt the DataFrame to a long format
501
+ data_melted = data.melt(id_vars="Model", var_name="Metric", value_name="Score")
502
+
503
+ # Pivot the DataFrame to a wide format
504
+ data_pivoted = data_melted.pivot(index="Metric", columns="Model", values="Score")
505
+
506
+ # make sure the columns are following the order of the models
507
+ data_pivoted = data_pivoted[model_names]
508
+
509
+ # make sure three groups in the order of precision, recall, f1
510
+ data_pivoted = data_pivoted.reindex(["RAP - F1", "F1"])
511
+
512
+ # Plot the statistics
513
+ plt.figure(figsize=(15, 6))
514
+ ax = data_pivoted.plot(kind="bar", ax=plt.gca(), width=0.9)
515
+ plt.title(title)
516
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
517
+
518
+ # Set the rotation of the x-axis labels to 0 degrees
519
+ plt.xticks(rotation=0)
520
+
521
+ # Format the y-axis to display as percentage
522
+ ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
523
+
524
+ # get the max value of the y-axis
525
+ a1 = max(best_afrp)
526
+ a2 = max(best_f1)
527
+
528
+ max_value = max([a1, a2]) * 1.12
529
+ print("max_value:", max_value)
530
+
531
+ # Set the y-axis limit up to 70%
532
+ ax.set_ylim(0, max_value)
533
+
534
+ # Add the values above each bar
535
+ for p in ax.patches:
536
+ ax.annotate(
537
+ f"{p.get_height() * 100:.1f}",
538
+ (p.get_x() + p.get_width() / 2.0, p.get_height()),
539
+ ha="center",
540
+ va="bottom",
541
+ xytext=(0, 10),
542
+ textcoords="offset points",
543
+ rotation=90,
544
+ )
545
+
546
+ plt.show()
547
+ return data_pivoted, best_mtr
548
+
549
+
550
+ def plot_best_performance(
551
+ result,
552
+ models=None,
553
+ title="Models with Best F1 Score",
554
+ adjusted_f1=False,
555
+ ref_result=None,
556
+ ):
557
+ # Initialize lists to store the statistics
558
+ model_names = []
559
+ best_precision = []
560
+ best_recall = []
561
+ best_f1 = []
562
+ best_repetition_penalty = []
563
+ best_mtr = []
564
+
565
+ if models is None:
566
+ models = result.keys()
567
+ for model in models:
568
+ print(f"model: {model}")
569
+ df = result[model]["df_overall"]
570
+
571
+ # Calculate the statistics
572
+ precision = [
573
+ df["precision"].mean() for df in result[model]["df_list_repetition_penalty"]
574
+ ]
575
+ recall = [
576
+ df["recall"].mean() for df in result[model]["df_list_repetition_penalty"]
577
+ ]
578
+ newline_score = [
579
+ df["newline_score"].mean()
580
+ for df in result[model]["df_list_repetition_penalty"]
581
+ ]
582
+
583
+ repetition_score = [
584
+ df["repetition_score"].mean()
585
+ for df in result[model]["df_list_repetition_penalty"]
586
+ ]
587
+
588
+ if adjusted_f1:
589
+ precision, recall = adjust_perf_scores_with_repetition_penalty(
590
+ result[model], precision, recall
591
+ )
592
+
593
+ # f1 = [df["f1"].mean() for df in result[model]["df_list_repetition_penalty"]]
594
+ f1 = [2 * (p * r) / (p + r) for p, r in zip(precision, recall)]
595
+
596
+ best_f1.append(max(f1))
597
+ best_f1_index = f1.index(best_f1[-1])
598
+ best_repetition_penalty.append(df["repetition_penalty"][best_f1_index])
599
+
600
+ best_precision.append(precision[best_f1_index])
601
+ best_recall.append(recall[best_f1_index])
602
+ best_mtr.append(newline_score[best_f1_index] + repetition_score[best_f1_index])
603
+
604
+ print(
605
+ f"best repetition penalty: {best_repetition_penalty[-1]}, best f1: {best_f1[-1]}, precision: {best_precision[-1]}, recall: {best_recall[-1]}"
606
+ )
607
+
608
+ df = result[model]["df_list_repetition_penalty"][best_f1_index]
609
+
610
+ model_names.append(
611
+ f"{model} (RP={best_repetition_penalty[-1]})"
612
+ ) # Add the model name to the list
613
+
614
+ # print sum for columns: newline_score, repetition_score
615
+ print(
616
+ f"newline_score: {df['newline_score'].sum()}, repetition_score: {df['repetition_score'].sum()}"
617
+ )
618
+
619
+ if ref_result is not None:
620
+ print("ref_result:", ref_result)
621
+ for model in ref_result.keys():
622
+ model_names.append(model)
623
+ df = pd.read_csv(ref_result[model])
624
+ # df = df[df["id"].isin(wikidata_df["id"])]
625
+
626
+ best_precision.append(df["precision"].mean())
627
+ best_recall.append(df["recall"].mean())
628
+ f1 = (
629
+ 2
630
+ * (best_precision[-1] * best_recall[-1])
631
+ / (best_precision[-1] + best_recall[-1])
632
+ )
633
+ # best_f1.append(df["f1"].mean())
634
+ best_f1.append(f1)
635
+ best_mtr.append(0)
636
+
637
+ # Create a DataFrame with the statistics
638
+ data = (
639
+ pd.DataFrame(
640
+ {
641
+ "Model": model_names,
642
+ "Adjusted Precision with RP": best_precision,
643
+ "Adjusted Recall with RP": best_recall,
644
+ "Adjusted F1 with RP": best_f1,
645
+ }
646
+ )
647
+ if adjusted_f1
648
+ else pd.DataFrame(
649
+ {
650
+ "Model": model_names,
651
+ "Precision": best_precision,
652
+ "Recall": best_recall,
653
+ "F1": best_f1,
654
+ }
655
+ )
656
+ )
657
+ columns = list(data.columns)
658
+
659
+ # Melt the DataFrame to a long format
660
+ data_melted = data.melt(id_vars="Model", var_name="Metric", value_name="Score")
661
+
662
+ # Pivot the DataFrame to a wide format
663
+ data_pivoted = data_melted.pivot(index="Metric", columns="Model", values="Score")
664
+
665
+ # make sure the columns are following the order of the models
666
+ data_pivoted = data_pivoted[model_names]
667
+
668
+ # make sure three groups in the order of precision, recall, f1
669
+ data_pivoted = data_pivoted.reindex(columns[1:])
670
+
671
+ # Plot the statistics
672
+ plt.figure(figsize=(10, 6))
673
+ ax = data_pivoted.plot(kind="bar", ax=plt.gca(), width=0.9)
674
+ plt.title(title)
675
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
676
+
677
+ # Set the rotation of the x-axis labels to 0 degrees
678
+ plt.xticks(rotation=0)
679
+
680
+ # Format the y-axis to display as percentage
681
+ ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
682
+
683
+ # get the max value of the y-axis
684
+ a1 = max(best_precision)
685
+ a2 = max(best_recall)
686
+ a3 = max(best_f1)
687
+
688
+ max_value = max([a1, a2, a3]) * 1.12
689
+ print("max_value:", max_value)
690
+
691
+ # Set the y-axis limit up to 70%
692
+ ax.set_ylim(0, max_value)
693
+
694
+ # Add the values above each bar
695
+ for p in ax.patches:
696
+ ax.annotate(
697
+ f"{p.get_height() * 100:.1f}",
698
+ (p.get_x() + p.get_width() / 2.0, p.get_height()),
699
+ ha="center",
700
+ va="bottom",
701
+ xytext=(0, 10),
702
+ textcoords="offset points",
703
+ rotation=90,
704
+ )
705
+
706
+ plt.show()
707
+ return data_pivoted, best_mtr
708
+
709
+
710
+ def plot_best_performance_ms_macro(
711
+ result,
712
+ models=None,
713
+ title="Models with Best RAP - Performance",
714
+ ref_result=None,
715
+ skip_generic_prompt=False,
716
+ include_adjusted_performance=True,
717
+ ):
718
+ # Initialize lists to store the statistics
719
+ model_names = []
720
+ best_f1 = []
721
+ best_afrp = []
722
+ best_repetition_penalty = []
723
+ best_bleu1 = []
724
+ best_rougeL = []
725
+ best_mtr = []
726
+
727
+ if models is None:
728
+ models = result.keys()
729
+ for model in models:
730
+ if skip_generic_prompt and "generic prompt" in model:
731
+ continue
732
+ print(f"model: {model}")
733
+ df = result[model]["df_overall"]
734
+
735
+ # Calculate the statistics
736
+ bleu1 = [x for x in df["bleu1"]]
737
+ rougeL = [x for x in df["rougeL"]]
738
+ f1 = [2 * (p * r) / (p + r) for p, r in zip(bleu1, rougeL)]
739
+
740
+ newline_score = [
741
+ df["newline_score"].mean()
742
+ for df in result[model]["df_list_repetition_penalty"]
743
+ ]
744
+ # print(f"newline_score: {newline_score}")
745
+
746
+ repetition_score = [
747
+ df["repetition_score"].mean()
748
+ for df in result[model]["df_list_repetition_penalty"]
749
+ ]
750
+ # print(f"repetition_score: {repetition_score}")
751
+
752
+ answer_len = [
753
+ df["answer_len"].mean()
754
+ for df in result[model]["df_list_repetition_penalty"]
755
+ ]
756
+
757
+ afrp = [
758
+ calc_adjusted_performance(f, n + r, l)
759
+ for f, n, r, l in zip(f1, newline_score, repetition_score, answer_len)
760
+ ]
761
+
762
+ best_afrp.append(max(afrp if include_adjusted_performance else f1))
763
+ best_afrp_index = (
764
+ afrp.index(best_afrp[-1])
765
+ if include_adjusted_performance
766
+ else f1.index(best_afrp[-1])
767
+ )
768
+ best_repetition_penalty.append(df["repetition_penalty"][best_afrp_index])
769
+
770
+ best_f1.append(f1[best_afrp_index])
771
+ best_bleu1.append(bleu1[best_afrp_index])
772
+ best_rougeL.append(rougeL[best_afrp_index])
773
+ best_mtr.append(
774
+ newline_score[best_afrp_index] + repetition_score[best_afrp_index]
775
+ )
776
+
777
+ # print(
778
+ # f"best repetition penalty: {best_repetition_penalty[-1]}, best afrp: {best_afrp[-1]}, f1: {best_f1[-1]}"
779
+ # )
780
+
781
+ df = result[model]["df_list_repetition_penalty"][best_afrp_index]
782
+
783
+ model_names.append(
784
+ f"{model} (RP={best_repetition_penalty[-1]})"
785
+ ) # Add the model name to the list
786
+
787
+ if ref_result is not None:
788
+ print("ref_result:", ref_result)
789
+ for model in ref_result.keys():
790
+ model_names.append(model)
791
+ df = pd.read_csv(ref_result[model], comment="#", on_bad_lines="warn")
792
+ # df = df[df["id"].isin(wikidata_df["id"])]
793
+
794
+ p = df["bleu1"][0]
795
+ best_bleu1.append(p)
796
+
797
+ r = df["rougeL"][0]
798
+ best_rougeL.append(r)
799
+
800
+ f1 = 2 * p * r / (p + r) if p + r > 0 else 0
801
+ best_f1.append(f1)
802
+ best_afrp.append(f1)
803
+ best_mtr.append(0)
804
+
805
+ # print("model_names:", model_names)
806
+ # print("best_f1:", best_f1)
807
+ # print("best_afrp:", best_afrp)
808
+
809
+ # Create a DataFrame with the statistics
810
+ data = (
811
+ pd.DataFrame(
812
+ {
813
+ "Model": model_names,
814
+ "RAP - Perf Score": best_afrp,
815
+ "Overall Perf Score": best_f1,
816
+ }
817
+ )
818
+ if include_adjusted_performance
819
+ else pd.DataFrame(
820
+ {
821
+ "Model": model_names,
822
+ "Bleu-1": best_bleu1,
823
+ "Rouge-L": best_rougeL,
824
+ "Overall Perf Score": best_f1,
825
+ }
826
+ )
827
+ )
828
+
829
+ # Melt the DataFrame to a long format
830
+ data_melted = data.melt(id_vars="Model", var_name="Metric", value_name="Score")
831
+
832
+ # Pivot the DataFrame to a wide format
833
+ data_pivoted = data_melted.pivot(index="Metric", columns="Model", values="Score")
834
+
835
+ # make sure the columns are following the order of the models
836
+ data_pivoted = data_pivoted[model_names]
837
+
838
+ columns = list(data.columns)
839
+ data_pivoted = data_pivoted.reindex(columns[1:])
840
+
841
+ # Plot the statistics
842
+ plt.figure(figsize=(10, 6))
843
+ ax = data_pivoted.plot(kind="bar", ax=plt.gca(), width=0.9)
844
+ plt.title(title)
845
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
846
+
847
+ # Set the rotation of the x-axis labels to 0 degrees
848
+ plt.xticks(rotation=0)
849
+
850
+ # Format the y-axis to display as percentage
851
+ ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
852
+
853
+ # get the max value of the y-axis
854
+ a1 = max(best_afrp)
855
+ a2 = max(best_f1)
856
+ a3 = max(best_bleu1)
857
+ a4 = max(best_rougeL)
858
+
859
+ max_value = (
860
+ max([a1, a2] if include_adjusted_performance else [a1, a2, a3, a4]) * 1.12
861
+ )
862
+ print("max_value:", max_value)
863
+
864
+ # Set the y-axis limit up to 70%
865
+ ax.set_ylim(0, max_value)
866
+
867
+ # Add the values above each bar
868
+ for p in ax.patches:
869
+ ax.annotate(
870
+ f"{p.get_height() * 100:.1f}",
871
+ (p.get_x() + p.get_width() / 2.0, p.get_height()),
872
+ ha="center",
873
+ va="bottom",
874
+ xytext=(0, 10),
875
+ textcoords="offset points",
876
+ rotation=90,
877
+ )
878
+
879
+ plt.show()
880
+ return data_pivoted, best_mtr
881
+
882
+
883
+ all_open_source_models = [
884
+ "gemma-1.1-2b-it",
885
+ "Phi-3-mini-128k-instruct",
886
+ "gemma-1.1-7b-it",
887
+ "Llama-2-7b-chat-hf",
888
+ "Mistral-7B-Instruct-v0.2",
889
+ "Meta-Llama-3-8B-Instruct",
890
+ "Llama-2-13b-chat-hf",
891
+ "Llama-2-70b-chat-hf",
892
+ "Meta-Llama-3-70B-Instruct",
893
+ ]
894
+
895
+
896
+ def load_for_repetition_penalty_ms_macro(
897
+ csv_result_file, repetition_penalty, force_recalculate=False
898
+ ):
899
+ result_file = replace_last(
900
+ csv_result_file, ".csv", f"_rpp_{repetition_penalty:.2f}.csv"
901
+ )
902
+ df = load_with_newline_and_repetition_scores(
903
+ result_file, force_recalculate=force_recalculate
904
+ )
905
+
906
+ return df
907
+
908
+
909
+ # MS MACRO
910
+ def plot_performance_scores_ms_macro(
911
+ result,
912
+ models=None,
913
+ title="Performance",
914
+ ):
915
+ if models is None:
916
+ models = result.keys()
917
+ for model in models:
918
+ print(f"model: {model}")
919
+ df = result[model]["df_overall"]
920
+ # print(result[model]["df_list_repetition_penalty"][0].describe())
921
+
922
+ # Calculate the statistics
923
+ bleu1 = list(df["bleu1"])
924
+ rougeL = list(df["rougeL"])
925
+ f1 = [2 * (p * r) / (p + r) for p, r in zip(bleu1, rougeL)]
926
+ best_f1 = max(f1)
927
+ best_f1_index = f1.index(best_f1)
928
+
929
+ bleu1, rougeL = adjust_perf_scores_with_repetition_penalty(
930
+ result[model], bleu1, rougeL
931
+ )
932
+ afrp = [2 * (p * r) / (p + r) for p, r in zip(bleu1, rougeL)]
933
+
934
+ # f1 = [df["f1"].mean() for df in result[model]["df_list_repetition_penalty"]]
935
+ best_afrp = max(afrp)
936
+ best_afrp_index = afrp.index(best_afrp)
937
+
938
+ repetition_penalties = list(df["repetition_penalty"])
939
+
940
+ # line plot for precision, recall, f1
941
+ plt.figure(figsize=(10, 6))
942
+
943
+ plt.axvspan(
944
+ repetition_penalties[best_f1_index] - 0.01,
945
+ repetition_penalties[best_f1_index] + 0.01,
946
+ alpha=0.5,
947
+ edgecolor="none",
948
+ facecolor="blue",
949
+ )
950
+
951
+ plt.axvspan(
952
+ repetition_penalties[best_afrp_index] - 0.01,
953
+ repetition_penalties[best_afrp_index] + 0.01,
954
+ alpha=0.5,
955
+ edgecolor="none",
956
+ facecolor="orange",
957
+ )
958
+
959
+ plt.plot(
960
+ repetition_penalties,
961
+ f1,
962
+ label="Overall Perf Score",
963
+ marker="D",
964
+ color="blue",
965
+ )
966
+ plt.plot(
967
+ repetition_penalties,
968
+ afrp,
969
+ label="RAP - Perf Score",
970
+ marker="o",
971
+ color="orange",
972
+ )
973
+
974
+ plt.xlabel("Repetition Penalties")
975
+ plt.ylabel("Score")
976
+ # plt.xlim(0.99, 1.31)
977
+ # y in percentage
978
+ plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
979
+ plt.title(f"{model} {title}")
980
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
981
+
982
+ plt.show()
983
+
984
+
985
+ def plot_repetition_factors(result, groups):
986
+ for group in groups:
987
+ # Plot the statistics
988
+ plt.figure(figsize=(10, 6))
989
+
990
+ max_value = 0
991
+ for model in result.keys():
992
+ if not group in model.lower():
993
+ continue
994
+ print(f"model: {model}")
995
+ df = result[model]["df_overall"]
996
+ repetition_panelties = [
997
+ repetition_penalty for repetition_penalty in df["repetition_penalty"]
998
+ ]
999
+
1000
+ mean_score = [
1001
+ df["total_repetitions"].mean()
1002
+ for df in result[model]["df_list_repetition_penalty"]
1003
+ ]
1004
+
1005
+ sns.lineplot(x=repetition_panelties, y=mean_score, label=model)
1006
+
1007
+ new_max = max(mean_score)
1008
+ if new_max > max_value:
1009
+ max_value = new_max
1010
+
1011
+ max_value = max_value * 1.05
1012
+ # if max_value < 1.5:
1013
+ # max_value = 1.5
1014
+ # set ylimit
1015
+ plt.ylim(0, max_value)
1016
+
1017
+ # show grid
1018
+ plt.grid(True)
1019
+ plt.xlabel("Repetition Penalties")
1020
+ plt.ylabel("Mean Total Repetitions")
1021
+ plt.title("Mean Total Repetitions vs Repetition Penalties")
1022
+ plt.legend()
1023
+
1024
+ plt.show()
1025
+
1026
+
1027
+ def plot_repetition_factors_by_group(result, group_filter=None):
1028
+ markers = ["D", "o", "s", "x"]
1029
+ colors = ["blue", "orange", "green", "red"]
1030
+
1031
+ # Plot the statistics
1032
+ plt.figure(figsize=(10, 6))
1033
+ index = 0
1034
+ max_value = 0
1035
+
1036
+ for model in result.keys():
1037
+ if group_filter is not None and group_filter not in model:
1038
+ continue
1039
+
1040
+ print(f"model: {model}")
1041
+
1042
+ df = result[model]["df_overall"]
1043
+ repetition_panelties = [
1044
+ repetition_penalty for repetition_penalty in df["repetition_penalty"]
1045
+ ]
1046
+
1047
+ # Calculate the statistics
1048
+ mean_score = [
1049
+ df["total_repetitions"].mean()
1050
+ for df in result[model]["df_list_repetition_penalty"]
1051
+ ]
1052
+ if len(mean_score) != len(repetition_panelties):
1053
+ print(
1054
+ f"model: {model} has different length of repetition penalties and mean score"
1055
+ )
1056
+ print("repetition_panelties:", len(repetition_panelties))
1057
+ print("mean_score:", len(mean_score))
1058
+ continue
1059
+
1060
+ new_max = max(mean_score)
1061
+ if new_max > max_value:
1062
+ max_value = new_max
1063
+
1064
+ sns.lineplot(
1065
+ x=repetition_panelties,
1066
+ y=mean_score,
1067
+ label=model,
1068
+ marker=markers[index],
1069
+ color=colors[index],
1070
+ )
1071
+
1072
+ index += 1
1073
+
1074
+ max_value = max_value * 1.05
1075
+ # if max_value < 1.5:
1076
+ # max_value = 1.5
1077
+ # set ylimit
1078
+ plt.ylim(0, max_value)
1079
+ max_value = 0
1080
+
1081
+ plt.xlabel("Repetition Penalties")
1082
+ plt.ylabel("Mean Total Repetitions")
1083
+ plt.title("Mean Total Repetitions vs Repetition Penalties")
1084
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
1085
+
1086
+ plt.show()
1087
+
1088
+
1089
+ ms_marco_csv_result_files = [
1090
+ "data/results_v2/gemma-1.1-2b-it(RAG - Generic Prompt)_mm.csv",
1091
+ "data/results_v2/gemma-1.1-2b-it(RAG - Chat Template)_mm.csv",
1092
+ "data/results_v2/gemma-1.1-2b-it(Non-RAG)_mm.csv",
1093
+ "data/results_v2/Phi-3-mini-128k-instruct(RAG - Generic Prompt)_mm.csv",
1094
+ "data/results_v2/Phi-3-mini-128k-instruct(RAG - Chat Template)_mm.csv",
1095
+ "data/results_v2/Phi-3-mini-128k-instruct(Non-RAG)_mm.csv",
1096
+ "data/results_v2/gemma-1.1-7b-it(RAG - Generic Prompt)_mm.csv",
1097
+ "data/results_v2/gemma-1.1-7b-it(RAG - Chat Template)_mm.csv",
1098
+ "data/results_v2/gemma-1.1-7b-it(Non-RAG)_mm.csv",
1099
+ "data/results_v2/Llama-2-7b-chat-hf(RAG - Generic Prompt)_mm.csv",
1100
+ "data/results_v2/Llama-2-7b-chat-hf(RAG - Chat Template)_mm.csv",
1101
+ "data/results_v2/Llama-2-7b-chat-hf(Non-RAG)_mm.csv",
1102
+ "data/results_v2/Mistral-7B-Instruct-v0.2(RAG - Generic Prompt)_mm.csv",
1103
+ "data/results_v2/Mistral-7B-Instruct-v0.2(RAG - Chat Template)_mm.csv",
1104
+ "data/results_v2/Mistral-7B-Instruct-v0.2(Non-RAG)_mm.csv",
1105
+ "data/results_v2/Meta-Llama-3-8B-Instruct(RAG - Generic Prompt)_mm.csv",
1106
+ "data/results_v2/Meta-Llama-3-8B-Instruct(RAG - Chat Template)_mm.csv",
1107
+ "data/results_v2/Meta-Llama-3-8B-Instruct(Non-RAG)_mm.csv",
1108
+ "data/results_v2/Llama-2-13b-chat-hf(RAG - Generic Prompt)_mm.csv",
1109
+ "data/results_v2/Llama-2-13b-chat-hf(RAG - Chat Template)_mm.csv",
1110
+ "data/results_v2/Llama-2-13b-chat-hf(Non-RAG)_mm.csv",
1111
+ "data/results_v2/Llama-2-70b-chat-hf(RAG - Generic Prompt)_mm.csv",
1112
+ "data/results_v2/Llama-2-70b-chat-hf(RAG - Chat Template)_mm.csv",
1113
+ "data/results_v2/Llama-2-70b-chat-hf(Non-RAG)_mm.csv",
1114
+ "data/results_v2/Meta-Llama-3-70B-Instruct(RAG - Generic Prompt)_mm.csv",
1115
+ "data/results_v2/Meta-Llama-3-70B-Instruct(RAG - Chat Template)_mm.csv",
1116
+ "data/results_v2/Meta-Llama-3-70B-Instruct(Non-RAG)_mm.csv",
1117
+ ]
1118
+
1119
+ webqsp_csv_result_files = [
1120
+ "data/results_v2/gemma-1.1-2b-it(RAG - Generic Prompt)_wd.csv",
1121
+ "data/results_v2/gemma-1.1-2b-it(RAG - Chat Template)_wd.csv",
1122
+ "data/results_v2/gemma-1.1-2b-it(Non-RAG)_wd.csv",
1123
+ "data/results_v2/Phi-3-mini-128k-instruct(RAG - Generic Prompt)_wd.csv",
1124
+ "data/results_v2/Phi-3-mini-128k-instruct(RAG - Chat Template)_wd.csv",
1125
+ "data/results_v2/Phi-3-mini-128k-instruct(Non-RAG)_wd.csv",
1126
+ "data/results_v2/gemma-1.1-7b-it(RAG - Generic Prompt)_wd.csv",
1127
+ "data/results_v2/gemma-1.1-7b-it(RAG - Chat Template)_wd.csv",
1128
+ "data/results_v2/gemma-1.1-7b-it(Non-RAG)_wd.csv",
1129
+ "data/results_v2/Llama-2-7b-chat-hf(RAG - Generic Prompt)_wd.csv",
1130
+ "data/results_v2/Llama-2-7b-chat-hf(RAG - Chat Template)_wd.csv",
1131
+ "data/results_v2/Llama-2-7b-chat-hf(Non-RAG)_wd.csv",
1132
+ "data/results_v2/Mistral-7B-Instruct-v0.2(RAG - Generic Prompt)_wd.csv",
1133
+ "data/results_v2/Mistral-7B-Instruct-v0.2(RAG - Chat Template)_wd.csv",
1134
+ "data/results_v2/Mistral-7B-Instruct-v0.2(Non-RAG)_wd.csv",
1135
+ "data/results_v2/Meta-Llama-3-8B-Instruct(RAG - Generic Prompt)_wd.csv",
1136
+ "data/results_v2/Meta-Llama-3-8B-Instruct(RAG - Chat Template)_wd.csv",
1137
+ "data/results_v2/Meta-Llama-3-8B-Instruct(Non-RAG)_wd.csv",
1138
+ "data/results_v2/Llama-2-13b-chat-hf(RAG - Generic Prompt)_wd.csv",
1139
+ "data/results_v2/Llama-2-13b-chat-hf(RAG - Chat Template)_wd.csv",
1140
+ "data/results_v2/Llama-2-13b-chat-hf(Non-RAG)_wd.csv",
1141
+ "data/results_v2/Llama-2-70b-chat-hf(RAG - Generic Prompt)_wd.csv",
1142
+ "data/results_v2/Llama-2-70b-chat-hf(RAG - Chat Template)_wd.csv",
1143
+ "data/results_v2/Llama-2-70b-chat-hf(Non-RAG)_wd.csv",
1144
+ "data/results_v2/Meta-Llama-3-70B-Instruct(RAG - Generic Prompt)_wd.csv",
1145
+ "data/results_v2/Meta-Llama-3-70B-Instruct(RAG - Chat Template)_wd.csv",
1146
+ "data/results_v2/Meta-Llama-3-70B-Instruct(Non-RAG)_wd.csv",
1147
+ ]
1148
+
1149
+
1150
+ def calc_rap_scores(
1151
+ result, precision="precision", recall="recall", penalty_function="cubic"
1152
+ ):
1153
+ newline_score = [
1154
+ df["newline_score"].mean() for df in result["df_list_repetition_penalty"]
1155
+ ]
1156
+
1157
+ repetition_score = [
1158
+ df["repetition_score"].mean() for df in result["df_list_repetition_penalty"]
1159
+ ]
1160
+
1161
+ if precision in result["df_list_repetition_penalty"][0].columns:
1162
+ precision = [
1163
+ df[precision].mean() for df in result["df_list_repetition_penalty"]
1164
+ ]
1165
+ recall = [df[recall].mean() for df in result["df_list_repetition_penalty"]]
1166
+ else:
1167
+ precision = result["df_overall"][precision]
1168
+ recall = result["df_overall"][recall]
1169
+
1170
+ f1 = [2 * (p * r) / (p + r) for p, r in zip(precision, recall)]
1171
+
1172
+ nrr = [
1173
+ 1 - (n + r) / s
1174
+ for f, n, r, s in zip(
1175
+ f1, newline_score, repetition_score, result["df_overall"]["answer_len"]
1176
+ )
1177
+ ]
1178
+
1179
+ rap = [
1180
+ calc_adjusted_performance(f, 1 - n, penalty_function=penalty_function)
1181
+ for f, n in zip(f1, nrr)
1182
+ ]
1183
+
1184
+ return newline_score, repetition_score, f1, rap, nrr
1185
+
1186
+
1187
+ def get_model_name(csv_result_file):
1188
+ parts = re.split(r"[_/]", csv_result_file)
1189
+ print(f"parts: {parts}")
1190
+ model_name = parts[3]
1191
+ return model_name
1192
+
1193
+
1194
+ def load_webqsp_result(
1195
+ csv_result_files, force_recalculate=False, save=False, penalty_function="cubic"
1196
+ ):
1197
+ result = {}
1198
+ for i, csv_result_file in enumerate(csv_result_files):
1199
+ try:
1200
+ df = pd.read_csv(csv_result_file)
1201
+ model_name = get_model_name(csv_result_file)
1202
+ print(f"\tmodel_name: {model_name}")
1203
+
1204
+ dfs = [
1205
+ calculate_performance_score(
1206
+ csv_result_file,
1207
+ repetition_penalty,
1208
+ force_recalculate=force_recalculate,
1209
+ )
1210
+ for repetition_penalty in df["repetition_penalty"]
1211
+ ]
1212
+
1213
+ answer_lens = []
1214
+ for df_rpp in dfs:
1215
+ answer_lens.append(df_rpp["answer_len"].mean())
1216
+ df["answer_len"] = answer_lens
1217
+
1218
+ result[model_name] = {
1219
+ "df_overall": df,
1220
+ "df_list_repetition_penalty": dfs,
1221
+ "file": csv_result_file,
1222
+ }
1223
+ newline_score, repetition_score, perf, rap, nrr = calc_rap_scores(
1224
+ result[model_name], penalty_function=penalty_function
1225
+ )
1226
+ df["newline_score"] = newline_score
1227
+ df["repetition_score"] = repetition_score
1228
+ df["total_repetitions"] = df["newline_score"] + df["repetition_score"]
1229
+ df["perf"] = perf
1230
+ df["nrr"] = nrr
1231
+ df["rap"] = rap
1232
+ df["rr"] = df["nrr"].apply(lambda x: 1 - x)
1233
+ df["rrp"] = df["rr"].apply(lambda x: x * 100)
1234
+ if save:
1235
+ df.to_csv(csv_result_file, index=False)
1236
+ except Exception as e:
1237
+ print(f"Error: {e}")
1238
+ traceback.print_exc()
1239
+
1240
+ return result
1241
+
1242
+
+ def load_ms_marco_result(
+     csv_result_files,
+     force_recalculate=False,
+     calc_bertscore=True,
+     save=False,
+     penalty_function="cubic",
+ ):
+     result = {}
+     for csv_result_file in csv_result_files:
+         try:
+             df = pd.read_csv(csv_result_file)
+             model_name = get_model_name(csv_result_file)
+             print(f"\tmodel_name: {model_name}")
+
+             dfs = [
+                 load_for_repetition_penalty_ms_macro(
+                     csv_result_file,
+                     repetition_penalty,
+                     force_recalculate=force_recalculate,
+                 )
+                 for repetition_penalty in df["repetition_penalty"]
+             ]
+
+             answer_lens = []
+             for df_rpp in dfs:
+                 answer_lens.append(df_rpp["answer_len"].mean())
+             df["answer_len"] = answer_lens
+
+             col = "bert_score" if calc_bertscore else "meteor"
+             score_unavailable = col not in df.columns
+
+             if score_unavailable:
+                 save = True
+                 bert_meteor_scores = []
+                 bert_score_references = None
+                 for df_rpp in dfs:
+                     if calc_bertscore:
+                         bert_meteor_score = 0
+
+                         for i, row in df_rpp.iterrows():
+                             answer = row["answer"]
+                             if not isinstance(answer, str):
+                                 answer = ""
+                             bert_meteor_score += bert_score.compute(
+                                 predictions=[answer],
+                                 references=[row["ground_truth"][0]],
+                                 lang="en",
+                                 model_type="microsoft/deberta-large-mnli",
+                             )["f1"][0]
+                         # get average of bertscore
+                         bert_meteor_score = bert_meteor_score / len(df_rpp)
+
+                         print(f"bert_score: {bert_meteor_score}")
+                     else:
+                         bert_meteor_score = meteor.compute(
+                             predictions=df_rpp["answer"],
+                             references=df_rpp["ground_truth"],
+                         )["meteor"]
+
+                     bert_meteor_scores.append(bert_meteor_score)
+
+                 df[col] = bert_meteor_scores
+
+             result[model_name] = {
+                 "df_overall": df,
+                 "df_list_repetition_penalty": dfs,
+                 "file": csv_result_file,
+             }
+             newline_score, repetition_score, perf, rap, nrr = calc_rap_scores(
+                 result[model_name],
+                 precision=col,
+                 recall=col,
+                 penalty_function=penalty_function,
+             )
+             df["newline_score"] = newline_score
+             df["repetition_score"] = repetition_score
+             df["total_repetitions"] = df["newline_score"] + df["repetition_score"]
+             df["perf"] = perf
+             df["nrr"] = nrr
+             df["rap"] = rap
+             df["rr"] = df["nrr"].apply(lambda x: 1 - x)
+             df["rrp"] = df["rr"].apply(lambda x: x * 100)
+
+             if save:
+                 df.to_csv(csv_result_file, index=False)
+         except Exception as e:
+             print("An error occurred:", e)
+             traceback.print_exc()
+             print(f"csv_result_file: {csv_result_file}")
+
+     return result
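
For orientation, a minimal sketch of how these loaders might be driven from a notebook; the CSV path below is a hypothetical placeholder, and only the keys and columns shown come from the code above:

    csv_files = ["data/results/ms-macro_gemma-1.1-2b-it_rp1.10.csv"]  # hypothetical path
    result = load_ms_marco_result(csv_files, calc_bertscore=True, penalty_function="cubic")
    summary = result["gemma-1.1-2b-it"]["df_overall"]  # one row per repetition_penalty value
    print(summary[["repetition_penalty", "perf", "rap", "rrp"]])

load_webqsp_result follows the same calling pattern but scores each run with calculate_performance_score instead of BERTScore/METEOR.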
eval_modules/utils.py ADDED
@@ -0,0 +1,262 @@
+ # -*- coding:utf-8 -*-
+ from __future__ import annotations
+
+ import json
+ import logging
+ import os
+ import platform
+ import re
+ from pathlib import Path
+ import evaluate
+ import pandas as pd
+ import requests
+ import torch
+ from tqdm import tqdm
+
+
+ class LogRecord(logging.LogRecord):
+     def getMessage(self):
+         msg = self.msg
+         if self.args:
+             if isinstance(self.args, dict):
+                 msg = msg.format(**self.args)
+             else:
+                 msg = msg.format(*self.args)
+         return msg
+
+
+ class Logger(logging.Logger):
+     def makeRecord(
+         self,
+         name,
+         level,
+         fn,
+         lno,
+         msg,
+         args,
+         exc_info,
+         func=None,
+         extra=None,
+         sinfo=None,
+     ):
+         rv = LogRecord(name, level, fn, lno, msg, args, exc_info, func, sinfo)
+         if extra is not None:
+             for key in extra:
+                 rv.__dict__[key] = extra[key]
+         return rv
+
+
+ def init_settings():
+     logging.setLoggerClass(Logger)
+     logging.basicConfig(
+         level=logging.WARNING,
+         format="%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s",
+     )
+
+
+ def remove_extra_spaces(text):
+     return re.sub(" +", " ", text.strip())
+
+
+ def print_llm_response(llm_response, debug_retrieval=True):
+     answer = llm_response["answer"] if "answer" in llm_response else None
+     if answer is None:
+         answer = llm_response["response"] if "response" in llm_response else None
+
+     if answer is not None:
+         print("\n\n***Answer:")
+         print(answer)
+
+     source_documents = (
+         llm_response["source_documents"] if "source_documents" in llm_response else None
+     )
+     if source_documents is None:
+         source_documents = (
+             llm_response["sourceDocs"] if "sourceDocs" in llm_response else None
+         )
+
+     if debug_retrieval and source_documents is not None:
+         print("\nSources:")
+         for index, source in enumerate(source_documents):
+             metadata = source["metadata"] if "metadata" in source else source.metadata
+             if "page" in metadata:
+                 print(f" Page: {metadata['page']}", end="")
+
+             print(
+                 f" Source {index + 1}: "
+                 + str(metadata["url"] if "url" in metadata else metadata["source"])
+             )
+             print(
+                 source["page_content"]
+                 if "page_content" in source
+                 else source.page_content
+             )
+
+     if "chat_history" in llm_response:
+         print("\nChat History:")
+         print(llm_response["chat_history"])
+
+
+ def get_device_types():
+     print("Running on: ", platform.platform())
+     print("MPS is", "NOT" if not torch.backends.mps.is_available() else "", "available")
+     print("CUDA is", "NOT" if not torch.cuda.is_available() else "", "available")
+     device_type_available = "cpu"
+
+     if not torch.backends.mps.is_available():
+         if not torch.backends.mps.is_built():
+             print(
+                 "MPS not available because the current PyTorch install was not "
+                 "built with MPS enabled."
+             )
+         else:
+             print(
+                 "MPS not available because the current MacOS version is not 12.3+ "
+                 "and/or you do not have an MPS-enabled device on this machine."
+             )
+     else:
+         device_type_available = "mps"
+
+     if torch.cuda.is_available():
+         print("CUDA is available, we have found ", torch.cuda.device_count(), " GPU(s)")
+         print(torch.cuda.get_device_name(0))
+         print("CUDA version: " + torch.version.cuda)
+         device_type_available = f"cuda:{torch.cuda.current_device()}"
+
+     return (
+         os.environ.get("HF_EMBEDDINGS_DEVICE_TYPE") or device_type_available,
+         os.environ.get("HF_PIPELINE_DEVICE_TYPE") or device_type_available,
+     )
+
+
+ def ensure_model_is_downloaded(llm_model_type):
+     if llm_model_type.startswith("gpt4all"):
+         local_path = (
+             os.environ.get("GPT4ALL_J_MODEL_PATH")
+             if llm_model_type == "gpt4all-j"
+             else os.environ.get("GPT4ALL_MODEL_PATH")
+         )
+         url = (
+             os.environ.get("GPT4ALL_J_DOWNLOAD_LINK")
+             if llm_model_type == "gpt4all-j"
+             else os.environ.get("GPT4ALL_DOWNLOAD_LINK")
+         )
+     elif llm_model_type == "llamacpp":
+         local_path = os.environ.get("LLAMACPP_MODEL_PATH")
+         url = os.environ.get("LLAMACPP_DOWNLOAD_LINK")
+     elif llm_model_type == "ctransformers":
+         local_path = os.environ.get("CTRANSFORMERS_MODEL_PATH")
+         url = os.environ.get("CTRANSFORMERS_DOWNLOAD_LINK")
+     else:
+         raise ValueError(f"wrong model type: {llm_model_type}")
+
+     path = Path(local_path)
+
+     if path.is_file():
+         print(f"model: {local_path} exists")
+     else:
+         print(f"downloading model: {local_path} from {url} ...")
+         path.parent.mkdir(parents=True, exist_ok=True)
+
+         # send a GET request to the URL to download the file. Stream since it's large
+         response = requests.get(url, stream=True)
+
+         # open the file in binary mode and write the contents of the response to it in chunks
+         # This is a large file, so be prepared to wait.
+         with open(local_path, "wb") as f:
+             for chunk in tqdm(response.iter_content(chunk_size=8192)):
+                 if chunk:
+                     f.write(chunk)
+
+     return local_path
+
+
+ bleu = evaluate.load("bleu")
+ rouge = evaluate.load("rouge")
+
+
+ def calc_bleu_rouge_scores(predictions, references, debug=False):
+     if debug:
+         print("predictions:", predictions)
+         print("references:", references)
+
+     bleu_scores = bleu.compute(
+         predictions=predictions, references=references, max_order=1
+     )
+     rouge_scores = rouge.compute(predictions=predictions, references=references)
+     result = {"bleu_scores": bleu_scores, "rouge_scores": rouge_scores}
+
+     if debug:
+         print("result:", result)
+
+     return result
+
+
+ def calc_metrics(df):
+     predictions = [df["answer"][i] for i in range(len(df))]
+     references = [df["ground_truth"][i] for i in range(len(df))]
+
+     return calc_bleu_rouge_scores(predictions, references)
+
+
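+ # The three patterns below drive detect_repetitions: pattern_abnormal_newlines flags
+ # runs of five or more consecutive newlines, pattern_text_repetitions flags a
+ # word-starting span that is immediately repeated, and exception_pattern exempts
+ # simple duplicated "word." sequences from being counted.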
+ pattern_abnormal_newlines = re.compile(r"\n{5,}")
+ pattern_text_repetitions = re.compile(r"\b(\w.+?)\b(\1+)", re.M | re.DOTALL)
+ exception_pattern = re.compile(r"(\w+\.)\1")
+
+
+ # final version for repetition detection
+ def detect_repetitions(
+     text, debug=False, pattern_text_repetitions=pattern_text_repetitions
+ ):
+     subtotals = [0, 0]
+
+     if isinstance(text, str):
+         patterns = [pattern_abnormal_newlines, pattern_text_repetitions]
+         for i, pattern in enumerate(patterns):
+             if debug:
+                 print(
+                     f"----detect {'abnormal newlines' if i == 0 else 'text repetitions'}----"
+                 )
+             matches = pattern.finditer(text)
+             for match in matches:
+                 if debug:
+                     print(match)
+                     for groupNum in range(0, len(match.groups())):
+                         groupNum = groupNum + 1
+                         print(
+                             "Group {groupNum} found at {start}-{end}: `{group}`".format(
+                                 groupNum=groupNum,
+                                 start=match.start(groupNum),
+                                 end=match.end(groupNum),
+                                 group=match.group(groupNum),
+                             )
+                         )
+
+                 if exception_pattern.match(match[0]):
+                     if debug:
+                         print("ignored: ", match[0])
+                     continue
+
+                 start, end = match.span()
+                 subtotals[i] += end - start
+
+     result = (subtotals[0], subtotals[1], subtotals[0] + subtotals[1])
+
+     if debug:
+         print(result)
+     return result
+
+
+ def detect_abnormal_newlines(text, debug=False):
+     return detect_repetitions(text, debug=debug)[0]
+
+
+ def detect_text_repetitions(text, debug=False):
+     return detect_repetitions(text, debug=debug)[1]
+
+
+ def detect_repetition_scores(text, debug=False):
+     newline_score, repetition_score, total_repetitions = detect_repetitions(
+         text, debug=debug
+     )
+     return pd.Series([newline_score, repetition_score, total_repetitions])
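
A minimal usage sketch for the repetition detector above (the sample string is illustrative, and the import assumes eval_modules is importable as a package):

    from eval_modules.utils import detect_repetitions

    # counts characters covered by abnormal newlines (5+ in a row) and by repeated text
    newline_chars, repeated_chars, total = detect_repetitions("yes yes yes yes")
    # here the newline count stays 0, while the repeated "yes " span is counted
    # in characters, giving roughly (0, 12, 12)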
ms_macro.json ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt CHANGED
@@ -1 +1,20 @@
- huggingface_hub==0.25.2
+ huggingface_hub==0.26.0
+ nltk==3.8.1
+ langchain==0.1.16
+ langchain-openai==0.1.3
+ langchain_google_genai==1.0.2
+ transformers==4.40.1
+ accelerate==0.29.3
+ python-dotenv==1.0.1
+ gradio==4.44.1
+ black==24.4.0
+ InstructorEmbedding==1.0.1
+ sentence-transformers==2.2.2
+ chardet==5.2.0
+ sentencepiece==0.1.98
+ evaluate==0.4.3
+ rouge_score==0.1.2
+ pytest==8.2.1
+ seaborn==0.13.2
+ tenacity==8.3.0
+ bert_score==0.3.13