nouamanetazi (HF Staff) committed
Commit df7cbb5 · verified · 1 Parent(s): f6cd590

Upload folder using huggingface_hub
.env ADDED
@@ -0,0 +1,3 @@
+ SYNC_DB=true
+ DATASET_ID=your-username/llm-arena-db
+ SPACE_ID=your-username/llm-arena
.pytest_cache/.gitignore ADDED
@@ -0,0 +1,2 @@
+ # Created by pytest automatically.
+ *
.pytest_cache/CACHEDIR.TAG ADDED
@@ -0,0 +1,4 @@
+ Signature: 8a477f597d28d172789f06886806bc55
+ # This file is a cache directory tag created by pytest.
+ # For information about cache directory tags, see:
+ # https://bford.info/cachedir/spec.html
.pytest_cache/README.md ADDED
@@ -0,0 +1,8 @@
+ # pytest cache directory #
+
+ This directory contains data from the pytest's cache plugin,
+ which provides the `--lf` and `--ff` options, as well as the `cache` fixture.
+
+ **Do not** commit this to version control.
+
+ See [the docs](https://docs.pytest.org/en/stable/how-to/cache.html) for more information.
.pytest_cache/v/cache/lastfailed ADDED
@@ -0,0 +1,3 @@
+ {
+   "GenAI-Arena/arena_elo/simple_test.py": true
+ }
.pytest_cache/v/cache/stepwise ADDED
@@ -0,0 +1 @@
+ []
README.md CHANGED
@@ -1,12 +1,6 @@
  ---
- title: Darija Chatbot Arena
- emoji: 💻
- colorFrom: blue
- colorTo: yellow
+ title: darija-chatbot-arena
+ app_file: app.py
  sdk: gradio
  sdk_version: 5.9.1
- app_file: app.py
- pinned: false
  ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,5 @@
+ import gradio as gr
+ from app.ui import app
+
+ if __name__ == "__main__":
+     app.launch(debug=True)
app/__init__.py ADDED
@@ -0,0 +1 @@
+ from .ui import app
app/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (196 Bytes)
app/__pycache__/config.cpython-310.pyc ADDED
Binary file (576 Bytes)
app/__pycache__/db.cpython-310.pyc ADDED
Binary file (1.89 kB)
app/__pycache__/init.cpython-310.pyc ADDED
Binary file (532 Bytes)
app/__pycache__/leaderboard.cpython-310.pyc ADDED
Binary file (2.08 kB)
app/__pycache__/llm.cpython-310.pyc ADDED
Binary file (1.58 kB)
app/__pycache__/messages.cpython-310.pyc ADDED
Binary file (4.94 kB)
app/__pycache__/models.cpython-310.pyc ADDED
Binary file (387 Bytes)
app/__pycache__/synth.cpython-310.pyc ADDED
Binary file (4.97 kB)
app/__pycache__/ui.cpython-310.pyc ADDED
Binary file (583 Bytes)
app/__pycache__/ui_battle.cpython-310.pyc ADDED
Binary file (3 kB)
app/__pycache__/ui_leaderboard.cpython-310.pyc ADDED
Binary file (951 Bytes)
app/__pycache__/ui_vote.cpython-310.pyc ADDED
Binary file (1.9 kB)
app/__pycache__/utils.cpython-310.pyc ADDED
Binary file (307 Bytes)
app/__pycache__/vote.cpython-310.pyc ADDED
Binary file (4.16 kB)
app/config.py ADDED
@@ -0,0 +1,27 @@
+ import os
+
+ #########################
+ # General Configuration #
+ #########################
+
+ DB_NAME = "database.db"
+ DB_PATH = f"/data/{DB_NAME}" if os.path.isdir("/data") else DB_NAME
+
+ # LLM Models Configuration
+ AVAILABLE_MODELS = {
+     "GPT-4": "gpt4",
+     "Claude-3": "claude3",
+     "Gemini-Pro": "gemini",
+     "Mixtral": "mixtral",
+     "Llama-2": "llama2",
+     # Add more models as needed
+ }
+
+ # General Configuration
+ MAX_PROMPT_LENGTH = 2000
+ MIN_PROMPT_LENGTH = 10
+
+ # Sync settings
+ SYNC_DB = True if os.getenv('SYNC_DB') else False
+ DB_DATASET_ID = os.getenv('DATASET_ID')
+ SPACE_ID = os.getenv('SPACE_ID')
app/db.py ADDED
@@ -0,0 +1,61 @@
+ import sqlite3
+ from .config import *
+ import os
+ import shutil
+ from huggingface_hub import hf_hub_download
+
+ def download_db():
+     if not os.path.isfile(DB_PATH):
+         print("Downloading DB...")
+         try:
+             cache_path = hf_hub_download(repo_id=DB_DATASET_ID, repo_type='dataset', filename=DB_NAME)
+             shutil.copyfile(cache_path, DB_PATH)
+             print("Downloaded DB")
+         except Exception as e:
+             print("Error while downloading DB:", e)
+
+ def get_db():
+     return sqlite3.connect(DB_PATH)
+
+ def create_db():
+     conn = get_db()
+     cursor = conn.cursor()
+
+     # Model table - stores model info and vote counts
+     cursor.execute('''
+         CREATE TABLE IF NOT EXISTS model (
+             name TEXT UNIQUE,
+             upvote INTEGER DEFAULT 0,
+             downvote INTEGER DEFAULT 0
+         );
+     ''')
+
+     # Vote table - stores individual votes with prompts and responses
+     cursor.execute('''
+         CREATE TABLE IF NOT EXISTS vote (
+             id INTEGER PRIMARY KEY AUTOINCREMENT,
+             username TEXT,
+             model TEXT,
+             vote INTEGER,
+             prompt TEXT,
+             response TEXT,
+             timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+         );
+     ''')
+
+     # Battle log table - stores battle outcomes with both responses
+     cursor.execute('''
+         CREATE TABLE IF NOT EXISTS votelog (
+             id INTEGER PRIMARY KEY AUTOINCREMENT,
+             username TEXT,
+             chosen TEXT,
+             rejected TEXT,
+             prompt TEXT,
+             chosen_response TEXT,
+             rejected_response TEXT,
+             timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+         );
+     ''')
+
+     conn.commit()
+     cursor.close()
app/init.py ADDED
@@ -0,0 +1,20 @@
+ from .config import *
+ from .db import *
+ from huggingface_hub import CommitScheduler
+ from pathlib import Path
+ import os
+
+ scheduler = None
+
+ if SYNC_DB:
+     download_db()
+     # Sync the local DB with the remote repo every 5 minutes (only if a change is detected)
+     scheduler = CommitScheduler(
+         repo_id=DB_DATASET_ID,
+         repo_type="dataset",
+         folder_path=Path(DB_PATH).parent,
+         every=5,
+         allow_patterns=DB_NAME,
+     )
+
+ create_db()
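
With SYNC_DB enabled, the CommitScheduler above pushes the folder containing the database file to the dataset repo on a timer. A minimal sketch of how callers are expected to guard writes against a concurrent push, assuming the `scheduler` object exported here (app/vote.py follows this pattern):

```python
# Sketch only: hold the scheduler lock while committing so a scheduled
# push never uploads a half-written database file.
from app.init import scheduler
from app.db import get_db

def commit_safely(conn):
    if scheduler:                 # SYNC_DB enabled: a CommitScheduler is running
        with scheduler.lock:
            conn.commit()
    else:                         # no sync configured: plain commit
        conn.commit()
```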
app/leaderboard.py ADDED
@@ -0,0 +1,52 @@
+ from .config import *
+ from .db import *
+ from .models import *
+
+ import pandas as pd
+
+ def get_leaderboard(reveal_prelim=False):
+     print("Getting leaderboard data...")
+     conn = get_db()
+     cursor = conn.cursor()
+
+     sql = '''
+         SELECT name,
+                SUM(CASE WHEN vote = 1 THEN 1 ELSE 0 END) as upvote,
+                SUM(CASE WHEN vote = -1 THEN 1 ELSE 0 END) as downvote
+         FROM model
+         LEFT JOIN vote ON model.name = vote.model
+         GROUP BY name
+     '''
+
+     print("Executing SQL query...")
+     cursor.execute(sql)
+     data = cursor.fetchall()
+     df = pd.DataFrame(data, columns=['name', 'upvote', 'downvote'])
+     df['votes'] = df['upvote'] + df['downvote']
+     print(f"Initial dataframe has {len(df)} models")
+
+     if not reveal_prelim:
+         print(f"Filtering out models with 100 votes or fewer... ({len(df[df['votes'] <= 100])} models will be filtered)")
+         df = df[df['votes'] > 100]  # Minimum vote threshold
+         print(f"After filtering: {len(df)} models remain")
+
+     print(f"Calculating ELO scores for {len(df)} models...")
+     # Calculate ELO scores
+     df['score'] = 1200  # Base ELO
+     for i in range(len(df)):
+         for j in range(len(df)):
+             if i != j:
+                 expected_a = 1 / (1 + 10 ** ((df['score'].iloc[j] - df['score'].iloc[i]) / 400))
+                 expected_b = 1 / (1 + 10 ** ((df['score'].iloc[i] - df['score'].iloc[j]) / 400))
+                 actual_a = df['upvote'].iloc[i] / df['votes'].iloc[i] if df['votes'].iloc[i] > 0 else 0.5
+                 actual_b = df['upvote'].iloc[j] / df['votes'].iloc[j] if df['votes'].iloc[j] > 0 else 0.5
+                 df.iloc[i, df.columns.get_loc('score')] += 32 * (actual_a - expected_a)
+                 df.iloc[j, df.columns.get_loc('score')] += 32 * (actual_b - expected_b)
+
+     df['score'] = round(df['score'])
+     df = df.sort_values(by='score', ascending=False)
+     df['order'] = ['#' + str(i+1) for i in range(len(df))]
+
+     print(f"Returning final leaderboard data with {len(df)} models...")
+     # Return only the columns we want to display
+     return df[['order', 'name', 'score', 'votes']].values.tolist()
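
The nested loop above applies one Elo-style update per ordered pair of models, starting everyone at 1200 and using each model's overall upvote ratio as the "match" outcome. A small worked sketch of the underlying expected-score formula (numbers are illustrative, not from the database):

```python
# Elo expected score: E_a = 1 / (1 + 10 ** ((R_b - R_a) / 400))
# With both models at the base rating of 1200, E_a = E_b = 0.5,
# so a model whose upvote ratio beats 0.5 gains points (K = 32).
R_a, R_b = 1200, 1200
K = 32

expected_a = 1 / (1 + 10 ** ((R_b - R_a) / 400))   # 0.5
actual_a = 0.8                                      # hypothetical 80% upvote ratio
R_a += K * (actual_a - expected_a)                  # 1200 + 32 * 0.3 = 1209.6
print(round(R_a))                                   # 1210
```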
app/llm.py ADDED
@@ -0,0 +1,46 @@
+ import threading
+ import gradio as gr
+ from .config import *
+ from .models import *
+ from .utils import *
+ import random
+
+ def generate_response(prompt, model_name):
+     """
+     Replace this with actual API calls to your LLM endpoints
+     """
+     # Placeholder implementation
+     responses = {
+         "gpt4": "This is a simulated GPT-4 response",
+         "claude3": "This is a simulated Claude-3 response",
+         "gemini": "This is a simulated Gemini-Pro response",
+         "mixtral": "This is a simulated Mixtral response",
+         "llama2": "This is a simulated Llama-2 response"
+     }
+     return responses.get(model_name, "Model not found")
+
+ def get_responses(prompt, model_a, model_b):
+     results = {}
+
+     def predict_and_store(prompt, model, result_storage):
+         try:
+             if model in AVAILABLE_MODELS:
+                 result = generate_response(prompt, AVAILABLE_MODELS[model])
+             else:
+                 result = generate_response(prompt, model)
+             result_storage[model] = result
+         except Exception as e:
+             raise gr.Error(f'Unable to generate response: {str(e)}')
+
+     thread1 = threading.Thread(target=predict_and_store, args=(prompt, model_a, results))
+     thread2 = threading.Thread(target=predict_and_store, args=(prompt, model_b, results))
+
+     thread1.start()
+     thread2.start()
+     thread1.join()
+     thread2.join()
+
+     return results[model_a], results[model_b]
+
+ def random_models():
+     return random.sample(list(AVAILABLE_MODELS.keys()), 2)
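
`generate_response` is a placeholder, and its docstring asks for it to be swapped for real API calls. One possible sketch, assuming the models are served through the Hugging Face Inference API via `huggingface_hub.InferenceClient`; the model IDs, the `HOSTED_MODELS` mapping, and the `HF_TOKEN` handling are illustrative and not part of this repo:

```python
# Hypothetical replacement for generate_response(): call a hosted model
# through huggingface_hub's InferenceClient instead of returning canned text.
import os
from huggingface_hub import InferenceClient

# Illustrative mapping from the repo's short model names to hosted model IDs.
HOSTED_MODELS = {
    "mixtral": "mistralai/Mixtral-8x7B-Instruct-v0.1",
    "llama2": "meta-llama/Llama-2-70b-chat-hf",
}

def generate_response(prompt, model_name):
    model_id = HOSTED_MODELS.get(model_name)
    if model_id is None:
        return "Model not found"
    client = InferenceClient(model=model_id, token=os.getenv("HF_TOKEN"))
    out = client.chat_completion(
        messages=[{"role": "user", "content": prompt}],
        max_tokens=512,
    )
    return out.choices[0].message.content
```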
app/messages.py ADDED
@@ -0,0 +1,82 @@
+ from .config import *
+
+ ############
+ # Messages #
+ ############
+
+ MUST_BE_LOGGEDIN = "Please login with Hugging Face to participate in the TTS Arena."
+ DESCR = """
+ # TTS Arena: Benchmarking TTS Models in the Wild
+ Vote to help the community find the best available text-to-speech model!
+ """.strip()
+ BATTLE_INSTR = """
+ ## Battle
+ Choose 2 candidates and vote on which one is better! Currently in beta.
+ * Input text (English only) to synthesize audio (or press 🎲 for random text).
+ * Listen to the two audio clips, one after the other.
+ * Vote on which audio sounds more natural to you.
+ """
+ INSTR = """
+ ## Vote
+ * Input text (English only) to synthesize audio (or press 🎲 for random text).
+ * Listen to the two audio clips, one after the other.
+ * Vote on which audio sounds more natural to you.
+ * _Note: Model names are revealed after the vote is cast._
+ Note: It may take up to 30 seconds to synthesize audio.
+ """.strip()
+ request = ""
+ if SPACE_ID:
+     request = f"""
+ ### Request a model
+ Please [create a Discussion](https://huggingface.co/spaces/{SPACE_ID}/discussions/new) to request a model.
+ """
+ ABOUT = f"""
+ ## About
+ The TTS Arena evaluates leading speech synthesis models. It is inspired by LMsys's [Chatbot Arena](https://chat.lmsys.org/).
+ ### Motivation
+ The field of speech synthesis has long lacked an accurate method to measure the quality of different models. Objective metrics like WER (word error rate) are unreliable measures of model quality, and subjective measures such as MOS (mean opinion score) are typically small-scale experiments conducted with few listeners. As a result, these measurements are generally not useful for comparing two models of roughly similar quality. To address these drawbacks, we are inviting the community to rank models in an easy-to-use interface, and opening it up to the public in order to make both the opportunity to rank models, as well as the results, more easily accessible to everyone.
+ ### The Arena
+ The leaderboard allows a user to enter text, which will be synthesized by two models. After listening to each sample, the user can vote on which model sounds more natural. Due to the risks of human bias and abuse, model names are revealed only after a vote is submitted.
+ ### Credits
+ Thank you to the following individuals who helped make this project possible:
+ * VB ([Twitter](https://twitter.com/reach_vb) / [Hugging Face](https://huggingface.co/reach-vb))
+ * Clémentine Fourrier ([Twitter](https://twitter.com/clefourrier) / [Hugging Face](https://huggingface.co/clefourrier))
+ * Lucain Pouget ([Twitter](https://twitter.com/Wauplin) / [Hugging Face](https://huggingface.co/Wauplin))
+ * Yoach Lacombe ([Twitter](https://twitter.com/yoachlacombe) / [Hugging Face](https://huggingface.co/ylacombe))
+ * Main Horse ([Twitter](https://twitter.com/main_horse) / [Hugging Face](https://huggingface.co/main-horse))
+ * Sanchit Gandhi ([Twitter](https://twitter.com/sanchitgandhi99) / [Hugging Face](https://huggingface.co/sanchit-gandhi))
+ * Apolinário Passos ([Twitter](https://twitter.com/multimodalart) / [Hugging Face](https://huggingface.co/multimodalart))
+ * Pedro Cuenca ([Twitter](https://twitter.com/pcuenq) / [Hugging Face](https://huggingface.co/pcuenq))
+ {request}
+ ### Privacy statement
+ We may store text you enter and generated audio. We store a unique ID for each session. You agree that we may collect, share, and/or publish any data you input for research and/or commercial purposes.
+ ### License
+ Generated audio clips cannot be redistributed and may be used for personal, non-commercial use only.
+ Random sentences are sourced from a filtered subset of the [Harvard Sentences](https://www.cs.columbia.edu/~hgs/audio/harvard.html).
+ """.strip()
+ LDESC = """
+ ## 🏆 Leaderboard
+ Vote to help the community determine the best language models.
+ The leaderboard displays models in descending order based on votes cast by the community.
+ Important: In order to help keep results fair, the leaderboard hides results by default until the number of votes passes a threshold.
+ Tick `Show preliminary results` to show models with few votes. Please note that preliminary results may be inaccurate.
+ """.strip()
+ ABOUT_MD = """
+ # 🤖 LLM Arena
+
+ A platform for comparing and ranking different Large Language Models through human feedback.
+
+ ## How it works
+
+ 1. **Battle Mode**: Compare responses from two different LLMs side-by-side and vote for the better one
+ 2. **Leaderboard**: See how models rank against each other based on user votes
+ 3. **Fair Comparison**: Models are randomly selected and anonymized during voting to prevent bias
+
+ ## Contributing
+
+ Want to add a new model? Check out our [GitHub repository](link-to-repo) for instructions.
+
+ ## License
+
+ This project is licensed under the MIT License. Individual models may have their own licenses.
+ """
app/models.py ADDED
@@ -0,0 +1,19 @@
+ # Models to include in the leaderboard
+ AVAILABLE_MODELS = {
+     "GPT-4": "gpt4",
+     "Claude-3": "claude3",
+     "Gemini-Pro": "gemini",
+     "Mixtral": "mixtral",
+     "Llama-2": "llama2",
+     # Add more models as needed
+ }
+
+ # Model name mapping for display
+ model_names = {
+     "gpt4": "GPT-4",
+     "claude3": "Claude-3",
+     "gemini": "Gemini-Pro",
+     "mixtral": "Mixtral",
+     "llama2": "Llama-2",
+     # Add more mappings as needed
+ }
app/ui.py ADDED
@@ -0,0 +1,12 @@
+ import gradio as gr
+ from .config import *
+ from .messages import *
+ from .ui_battle import *
+ from .ui_leaderboard import *
+
+ with gr.Blocks() as about:
+     gr.Markdown(ABOUT_MD)
+
+ with gr.Blocks(css="footer {visibility: hidden}", title="LLM Arena") as app:
+     gr.Markdown(ABOUT_MD)
+     gr.TabbedInterface([battle, leaderboard, about], ['Battle', 'Leaderboard', 'About'])
app/ui_battle.py ADDED
@@ -0,0 +1,132 @@
+ import gradio as gr
+ from .config import *
+ from .vote import *
+ from .messages import *
+ from .llm import *
+ import random
+
+ def disable():
+     return [gr.update(interactive=False), gr.update(interactive=False)]
+ def enable():
+     return [gr.update(interactive=True), gr.update(interactive=True)]
+
+ with gr.Blocks() as battle:
+     battle_useridstate = gr.State()
+
+     gr.Markdown("## 🤖 LLM Battle\nCompare two AI responses and vote for the better one!")
+
+     with gr.Group():
+         with gr.Row():
+             prompt = gr.Textbox(
+                 container=False,
+                 show_label=False,
+                 placeholder="Enter your prompt here...",
+                 lines=3,
+                 max_lines=10,
+                 scale=9999999,
+                 min_width=0
+             )
+             random_prompt = gr.Button('🎲', scale=0, min_width=0, variant='tool')
+
+     btn = gr.Button("Generate Responses", variant='primary')
+
+     with gr.Row(visible=False) as response_row:
+         with gr.Column():
+             with gr.Group():
+                 response1 = gr.Textbox(
+                     label="Model A Response",
+                     lines=8,
+                     max_lines=8,
+                     interactive=False
+                 )
+                 a_better = gr.Button("A is better", variant='primary')
+                 model1_name = gr.Textbox(
+                     interactive=False,
+                     show_label=False,
+                     container=False,
+                     value="Vote to reveal model A",
+                     text_align="center",
+                     visible=False
+                 )
+         with gr.Column():
+             with gr.Group():
+                 response2 = gr.Textbox(
+                     label="Model B Response",
+                     lines=8,
+                     max_lines=8,
+                     interactive=False
+                 )
+                 b_better = gr.Button("B is better", variant='primary')
+                 model2_name = gr.Textbox(
+                     interactive=False,
+                     show_label=False,
+                     container=False,
+                     value="Vote to reveal model B",
+                     text_align="center",
+                     visible=False
+                 )
+
+     def generate_responses(prompt):
+         if len(prompt.strip()) < MIN_PROMPT_LENGTH:
+             raise gr.Error(f"Prompt must be at least {MIN_PROMPT_LENGTH} characters")
+         if len(prompt.strip()) > MAX_PROMPT_LENGTH:
+             raise gr.Error(f"Prompt must be less than {MAX_PROMPT_LENGTH} characters")
+
+         model_a, model_b = random_models()
+         resp_a, resp_b = get_responses(prompt, model_a, model_b)
+
+         return [
+             resp_a,                                     # response1
+             resp_b,                                     # response2
+             model_a,                                    # model1_name (actual model identifier)
+             model_b,                                    # model2_name (actual model identifier)
+             gr.update(visible=True),                    # response_row
+             gr.update(interactive=True, visible=True),  # a_better
+             gr.update(interactive=True, visible=True),  # b_better
+             gr.update(visible=False),                   # model1_name visibility
+             gr.update(visible=False)                    # model2_name visibility
+         ]
+
+     # Event handlers
+     btn.click(
+         fn=generate_responses,
+         inputs=[prompt],
+         outputs=[
+             response1,
+             response2,
+             model1_name,
+             model2_name,
+             response_row,
+             a_better,
+             b_better,
+             model1_name,
+             model2_name
+         ]
+     )
+
+     a_better.click(
+         fn=a_is_better,
+         inputs=[model1_name, model2_name, battle_useridstate, prompt, response1, response2],
+         outputs=[a_better, b_better, model1_name, model2_name]
+     )
+
+     b_better.click(
+         fn=b_is_better,
+         inputs=[model1_name, model2_name, battle_useridstate, prompt, response1, response2],
+         outputs=[a_better, b_better, model1_name, model2_name]
+     )
+
+     def get_random_prompt():
+         prompts = [
+             "What are the key differences between Python and JavaScript?",
+             "Explain quantum computing in simple terms.",
+             "Write a short story about a robot learning to feel emotions.",
+             "What are the pros and cons of remote work?",
+             "Explain how blockchain technology works."
+         ]
+         return random.choice(prompts)
+
+     random_prompt.click(
+         fn=get_random_prompt,
+         outputs=[prompt]
+     )
app/ui_leaderboard.py ADDED
@@ -0,0 +1,46 @@
+ import gradio as gr
+ from .config import *
+ from .leaderboard import *
+ from .messages import *
+
+ with gr.Blocks() as leaderboard:
+     gr.Markdown(LDESC)
+
+     df = gr.Dataframe(
+         headers=['Rank', 'Model', 'Score', 'Total Votes'],
+         interactive=False,
+         wrap=True,
+         column_widths=['80px', '200px', '100px', '100px']
+     )
+
+     reloadbtn = gr.Button("🔄 Refresh")
+
+     reveal_prelim = gr.Checkbox(
+         label="Show preliminary results",
+         info="Include models with few votes",
+         value=True
+     )
+
+     def update_leaderboard(reveal_prelim):
+         return get_leaderboard(reveal_prelim)
+
+     # Update on checkbox changes
+     reveal_prelim.change(
+         update_leaderboard,
+         inputs=[reveal_prelim],
+         outputs=[df]
+     )
+
+     # Update on refresh button click
+     reloadbtn.click(
+         update_leaderboard,
+         inputs=[reveal_prelim],
+         outputs=[df]
+     )
+
+     # Initial load
+     leaderboard.load(
+         update_leaderboard,
+         inputs=[reveal_prelim],
+         outputs=[df]
+     )
app/utils.py ADDED
@@ -0,0 +1,6 @@
+ import uuid
+
+ def mkuuid(uid):
+     if not uid:
+         uid = uuid.uuid4()
+     return uid
app/vote.py ADDED
@@ -0,0 +1,124 @@
+ from .utils import *
+ from .config import *
+ from .models import *
+ from .db import *
+ from .init import *
+
+ import gradio as gr
+
+ # Vote
+
+ def upvote_model(model, uname, prompt="", response=""):
+     print("Establishing database connection for upvoting.")
+     conn = get_db()
+     cursor = conn.cursor()
+     print(f"Updating upvote count for model: {model}")
+     cursor.execute('UPDATE model SET upvote = upvote + 1 WHERE name = ?', (model,))
+     if cursor.rowcount == 0:
+         print(f"No existing entry found for model '{model}'. Inserting new model with upvote=1 and downvote=0.")
+         cursor.execute('INSERT OR REPLACE INTO model (name, upvote, downvote) VALUES (?, 1, 0)', (model,))
+     print(f"Inserting vote record: username={uname}, model={model}, vote=1, prompt={prompt}, response={response}")
+     cursor.execute('INSERT INTO vote (username, model, vote, prompt, response) VALUES (?, ?, ?, ?, ?)',
+                    (uname, model, 1, prompt, response))
+     print("Committing upvote transaction.")
+     conn.commit()
+     print("Closing cursor after upvoting.")
+     cursor.close()
+
+ def downvote_model(model, uname, prompt="", response=""):
+     print("Establishing database connection for downvoting.")
+     conn = get_db()
+     cursor = conn.cursor()
+     print(f"Updating downvote count for model: {model}")
+     cursor.execute('UPDATE model SET downvote = downvote + 1 WHERE name = ?', (model,))
+     if cursor.rowcount == 0:
+         print(f"No existing entry found for model '{model}'. Inserting new model with upvote=0 and downvote=1.")
+         cursor.execute('INSERT OR REPLACE INTO model (name, upvote, downvote) VALUES (?, 0, 1)', (model,))
+     print(f"Inserting vote record: username={uname}, model={model}, vote=-1, prompt={prompt}, response={response}")
+     cursor.execute('INSERT INTO vote (username, model, vote, prompt, response) VALUES (?, ?, ?, ?, ?)',
+                    (uname, model, -1, prompt, response))
+     print("Committing downvote transaction.")
+     conn.commit()
+     print("Closing cursor after downvoting.")
+     cursor.close()
+
+ # Battle Mode
+
+ def a_is_better(model1, model2, userid, prompt="", response1="", response2=""):
+     print("Processing vote: A is better.")
+     print(f"Comparing models: {model1} vs {model2}")
+     if not model1 in AVAILABLE_MODELS.keys() and not model1 in AVAILABLE_MODELS.values():
+         print(f"Model '{model1}' is not available. Raising error.")
+         raise gr.Error('Sorry, please try voting again.')
+     userid = mkuuid(userid)
+     print(f"Generated UUID for user: {userid}")
+     if model1 and model2:
+         print("Establishing database connection for voting.")
+         conn = get_db()
+         cursor = conn.cursor()
+         print(f"Inserting votelog: username={userid}, chosen={model1}, rejected={model2}, prompt={prompt}, chosen_response={response1}, rejected_response={response2}")
+         cursor.execute('INSERT INTO votelog (username, chosen, rejected, prompt, chosen_response, rejected_response) VALUES (?, ?, ?, ?, ?, ?)',
+                        (str(userid), model1, model2, prompt, response1, response2))
+         if scheduler:
+             print("Scheduler detected. Acquiring scheduler lock before committing.")
+             with scheduler.lock:
+                 print("Committing votelog transaction with scheduler lock.")
+                 conn.commit()
+         else:
+             print("Committing votelog transaction without scheduler lock.")
+             conn.commit()
+         print("Closing cursor after logging vote.")
+         cursor.close()
+         print(f"Upvoting model: {model1}")
+         upvote_model(model1, str(userid), prompt, response1)
+         print(f"Downvoting model: {model2}")
+         downvote_model(model2, str(userid), prompt, response2)
+     print("Reloading UI after voting.")
+     return reload(model1, model2, userid, chose_a=True)
+
+ def b_is_better(model1, model2, userid, prompt="", response1="", response2=""):
+     print("Processing vote: B is better.")
+     print(f"Comparing models: {model1} vs {model2}")
+     if not model1 in AVAILABLE_MODELS.keys() and not model1 in AVAILABLE_MODELS.values():
+         print(f"Model '{model1}' is not available. Raising error.")
+         raise gr.Error('Sorry, please try voting again.')
+     userid = mkuuid(userid)
+     print(f"Generated UUID for user: {userid}")
+     if model1 and model2:
+         print("Establishing database connection for voting.")
+         conn = get_db()
+         cursor = conn.cursor()
+         print(f"Inserting votelog: username={userid}, chosen={model2}, rejected={model1}, prompt={prompt}, chosen_response={response2}, rejected_response={response1}")
+         cursor.execute('INSERT INTO votelog (username, chosen, rejected, prompt, chosen_response, rejected_response) VALUES (?, ?, ?, ?, ?, ?)',
+                        (str(userid), model2, model1, prompt, response2, response1))
+         if scheduler:
+             print("Scheduler detected. Acquiring scheduler lock before committing.")
+             with scheduler.lock:
+                 print("Committing votelog transaction with scheduler lock.")
+                 conn.commit()
+         else:
+             print("Committing votelog transaction without scheduler lock.")
+             conn.commit()
+         print("Closing cursor after logging vote.")
+         cursor.close()
+         print(f"Upvoting model: {model2}")
+         upvote_model(model2, str(userid), prompt, response2)
+         print(f"Downvoting model: {model1}")
+         downvote_model(model1, str(userid), prompt, response1)
+     print("Reloading UI after voting.")
+     return reload(model1, model2, userid, chose_b=True)
+
+ # Reload
+
+ def reload(chosenmodel1=None, chosenmodel2=None, userid=None, chose_a=False, chose_b=False):
+     out = [
+         gr.update(interactive=False),  # a_better
+         gr.update(interactive=False),  # b_better
+         gr.update(value=f"Selected: {chosenmodel1}" if chose_a else chosenmodel1,
+                   interactive=False,
+                   visible=True),  # model1_name
+         gr.update(value=f"Selected: {chosenmodel2}" if chose_b else chosenmodel2,
+                   interactive=False,
+                   visible=True)  # model2_name
+     ]
+     return out
database.db ADDED
Binary file (24.6 kB)
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ pandas
+ huggingface_hub
+ # sqlite3 is part of the Python standard library and does not need to be installed via pip
scripts/view_db.py ADDED
@@ -0,0 +1,61 @@
+ import sqlite3
+ import pandas as pd
+ from tabulate import tabulate
+
+ def view_db_content():
+     conn = sqlite3.connect("database.db")
+
+     # Get models and their votes
+     print("\n=== Models and Vote Counts ===")
+     models_df = pd.read_sql_query("""
+         SELECT
+             name,
+             upvote,
+             downvote,
+             CAST(upvote AS FLOAT) / NULLIF(upvote + downvote, 0) * 100 as win_rate,
+             upvote + downvote as total_votes
+         FROM model
+         ORDER BY win_rate DESC
+     """, conn)
+     print(tabulate(models_df, headers='keys', tablefmt='psql', showindex=False))
+
+     # Get recent votes with response previews
+     print("\n=== Recent Votes ===")
+     votes_df = pd.read_sql_query("""
+         SELECT
+             username,
+             model,
+             CASE
+                 WHEN vote = 1 THEN 'upvote'
+                 ELSE 'downvote'
+             END as vote_type,
+             substr(prompt, 1, 50) || '...' as prompt_preview,
+             substr(response, 1, 50) || '...' as response_preview,
+             datetime(timestamp, 'localtime') as local_time
+         FROM vote
+         ORDER BY timestamp DESC
+         LIMIT 10
+     """, conn)
+     print(tabulate(votes_df, headers='keys', tablefmt='psql', showindex=False))
+
+     # Get recent battles with response previews
+     print("\n=== Recent Battles ===")
+     battles_df = pd.read_sql_query("""
+         SELECT
+             username,
+             chosen as winner,
+             rejected as loser,
+             substr(prompt, 1, 50) || '...' as prompt_preview,
+             substr(chosen_response, 1, 50) || '...' as winner_response,
+             substr(rejected_response, 1, 50) || '...' as loser_response,
+             datetime(timestamp, 'localtime') as local_time
+         FROM votelog
+         ORDER BY timestamp DESC
+         LIMIT 10
+     """, conn)
+     print(tabulate(battles_df, headers='keys', tablefmt='psql', showindex=False))
+
+     conn.close()
+
+ if __name__ == "__main__":
+     view_db_content()