
Start evaluating your model in 3 lines

You can start evaluating your model with TLEM in 3 lines; TLEM is designed to work without installation.

import evaluate

suite = evaluate.EvaluationSuite.load("SUSTech/tlem", download_mode="force_redownload")
suite.load("gsm8k")  # You can check the available datasets via suite.supported_datasets
suite.run(pipe := lambda x: x)
<class 'evaluate_modules.metrics.sustech--tlem.a09e0e4b7368f89944eb7781a52f3519caa4ffb8677312fbb90e48a613c8efdc.tlem.ReasoningMetric'>

{'gsm8k': 0.022744503411675512}

The lambda function indicates a model pipeline that takes a list of strings as input and returns a list of strings as output. You can use any model you want, as long as it can be wrapped in this way. We use the popular vLLM and OpenAI APIs as examples:

import aiohttp
from openai import AsyncOpenAI

# Week-long client timeout so long evaluation runs are not cut off
session = aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=60 * 60 * 24 * 7))
url = "xxx"  # host:port of your inference server
client = AsyncOpenAI(**{"base_url": f"http://{url}/v1/", "api_key": "EMPTY"})


@suite.utils.async_pipe
async def chatgpt(msg):
    input = f"### Human: {msg}\n\n### Assistant: "
    try:
        resp = await client.completions.create(
            model="gpt-3.5-turbo",
            max_tokens=None,
            prompt=input,
            temperature=0,
        )
        return resp.choices[0].text
    except Exception as e:
        return "OpenAI Error"


@suite.utils.async_pipe
async def vllm(msg):
    input = f"### Human: {msg}\n\n### Assistant: "
    data = {
        "prompt": input,
        "max_tokens": 4096,
        "n": 1,
        "temperature": 0,
    }

    try:
        async with session.post(f"http://{url}/generate", json=data) as response:
            response_json = await response.json()
            # Strip the echoed prompt from the returned text
            return response_json["text"][0][len(input) :]
    except Exception as e:
        return "Vllm Error"
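For a model that runs in-process, any callable with the same list-of-strings-in, list-of-strings-out contract works, just like the identity lambda above. Here is a minimal sketch using the Hugging Face transformers pipeline; the model name and generation settings are placeholders, not part of TLEM:

from transformers import pipeline

# Placeholder model; swap in any causal LM you like.
generator = pipeline("text-generation", model="gpt2")

def local_pipe(msgs):
    # list[str] in -> list[str] out, the contract suite.run expects
    results = generator(list(msgs), max_new_tokens=256, return_full_text=False)
    return [r[0]["generated_text"] for r in results]

suite.run(local_pipe)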
import pandas as pd

task = suite[0]
# task.outputs is available after suite.run or task.run
pd.DataFrame({"input": task.samples, "label": task.labels, "output": task.outputs})
      input                                              label                                              output
...   ...                                                ...                                                ...
1316  Mark's car breaks down and he needs to get a n...  The discount on the radiator was 400*.8=$<<400...  Mark's car breaks down and he needs to get a n...
1317  ...                                                ...                                                ...
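Note that with the identity pipe, the output column simply echoes the input. For quick error analysis you can filter the frame directly; exact string comparison is only a rough proxy, since the ReasoningMetric parses the final answer out of each completion rather than comparing raw texts. A minimal sketch:

# Rough error analysis: verbatim mismatch is a proxy, not the metric itself
df = pd.DataFrame({"input": task.samples, "label": task.labels, "output": task.outputs})
mismatched = df[df["label"] != df["output"]]
print(f"{len(mismatched)}/{len(df)} outputs differ from their labels verbatim")
mismatched.head()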
task.metric(task.labels, task.labels)
{'gsm8k': 1.0}
task.metric(task.outputs, task.labels)
{'gsm8k': 0.022744503411675512}

You can easily add your own task through the Task class, either by instantiating it directly or by subclassing it. For example, to evaluate the model's ability to generate a specific type of text, you can do it this way:

task = suite.task_class(
    dataset_name=("gsm8k", "main"),
    input_column="question",
    label_column="answer",
    metric_name="evaluate-metric/competition_math",
)
task.run(pipe)

<class 'evaluate_modules.metrics.evaluate-metric--competition_math.b85814e0172dae97fa4bd6eff6f33caba2ff9547860acabd50222c6dee474a24.competition_math.CompetitionMathMetric'>

{'accuracy': 0.0}

The metric can be hosted in any Hugging Face space; TLEM is designed to be decentralized, allowing you to run evaluations on private datasets without needing to contribute your code back to TLEM. You can also define the metric locally:
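The original snippet for this step is not preserved here. As a minimal sketch, assuming suite.task_class also accepts a plain callable in place of a hub path (an assumption, not a documented signature), a local exact-match metric could look like this; the (outputs, labels) -> dict shape mirrors the task.metric calls above:

# Hypothetical local metric: the dict-returning signature mirrors task.metric,
# but passing a callable as metric_name is an assumption about the API.
def exact_match(outputs, labels):
    correct = sum(o.strip() == l.strip() for o, l in zip(outputs, labels))
    return {"exact_match": correct / len(labels)}

task = suite.task_class(
    dataset_name=("gsm8k", "main"),
    input_column="question",
    label_column="answer",
    metric_name=exact_match,  # assumed: a callable instead of a hub path
)
task.run(pipe)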

Figure 1: TLEM leaderboard