Spaces:

alfraser
/

llm-arch

Runtime error

App Files Files Community

alfraser committed on Jan 24, 2024

Commit

82130cb

1 Parent(s): 8f424fc

Added the test reporting structure

Browse files

Files changed (2) hide show

pages/040_Test_Reporter.py +55 -1
src/testing.py +148 -2

pages/040_Test_Reporter.py CHANGED Viewed

@@ -1,9 +1,63 @@
 import streamlit as st
 from src.st_helpers import st_setup
 if st_setup('LLM Arch'):
     summary = st.container()
     with summary:
         st.write("# Test Reporter")
-        st.info("Just a placeholder for now - need to implement the end user test suite selection and reporting")

+import pandas as pd
+import plotly.express as px
 import streamlit as st
+from src.architectures import *
 from src.st_helpers import st_setup
+from src.testing import TestGroup
def show_stats(for_test_group: str):
    """
    Render the summary statistics and charts for a single test group

    Writes a heading and a one-line summary, then three collapsible charts:
    a box plot of end to end elapsed time, a box plot of response length and
    a stacked bar chart of the mean elapsed time per architecture step.

    :param for_test_group: the test group tag, looked up via TestGroup.for_test_group_tag
    """
    test_group = TestGroup.for_test_group_tag(for_test_group)
    # The comment can be None as well as empty, so fall back on any falsy value
    title = test_group.comment if test_group.comment else "No comment provided for group"
    st.write(f"### {title}")
    st.write(f"Total of {test_group.num_tests} tests over {test_group.num_archs} architectures ({test_group.num_tests_per_arch} per architecture).")
    stats = test_group.summary_stats_by_arch()

    def render(fig) -> None:
        # Shared finishing for every chart: vertical architecture labels, full width
        fig.update_xaxes(tickangle=-90)
        st.plotly_chart(fig, use_container_width=True)

    with st.expander("**Elapsed End to End Time (seconds)**"):
        # Elapsed values are divided by 1000 to present them as seconds
        rows = [[arch['arch_name'], e / 1000] for arch in stats for e in arch['elapsed']]
        df = pd.DataFrame(rows, columns=['Architecture', 'Elapsed time'])
        render(px.box(df, x="Architecture", y="Elapsed time"))

    with st.expander("**Average response length (count of characters)**"):
        rows = [[arch['arch_name'], rl] for arch in stats for rl in arch['response_len']]
        df = pd.DataFrame(rows, columns=['Architecture', 'Response length'])
        render(px.box(df, x="Architecture", y="Response length"))

    with st.expander("**Mean elapsed time by architecture step (seconds)**"):
        rows = [[arch['arch_name'], step['step_name'], step['mean_elapsed'] / 1000]
                for arch in stats for step in arch['steps']]
        df = pd.DataFrame(rows, columns=['Architecture', 'Step', 'Mean elapsed time'])
        render(px.bar(df, x='Architecture', y='Mean elapsed time', color='Step', barmode='stack'))
 if st_setup('LLM Arch'):
     summary = st.container()
     with summary:
         st.write("# Test Reporter")
+        TestGroup.load_all()
+        selector, display = st.columns([2, 3])
+        with selector:
+            test_groups = list(TestGroup.all.values())
+            test_groups.sort(key=lambda x: -x.start)
+            options = [f'{tg.test_group}: {tg.comment}' for tg in test_groups]
+            if selected := st.radio('**Pick a test set to review**', options=options, index=None):
+                with display:
+                    show_stats(selected.split(":")[0])

src/testing.py CHANGED Viewed

@@ -1,15 +1,18 @@
 import json
 import os
 from random import choices
-from typing import List
 from src.common import data_dir
 class TestGenerator:
     """
-    Wrapper to hold testing questions and serve up examples
     """
     questions: List[str] = None
@@ -36,4 +39,147 @@ class TestGenerator:
         """
         Return n random questions
         """
         return choices(cls.questions, k=n)

+from __future__ import annotations  # For self-referencing annotations
 import json
 import os
 from random import choices
+from typing import List, Dict, Optional
+from src.architectures import Architecture
 from src.common import data_dir
 class TestGenerator:
     """
+    Wrapper class to hold testing questions and serve up examples
     """
     questions: List[str] = None
         """
         Return n random questions
         """
+        cls.load_questions()
         return choices(cls.questions, k=n)
class ArchitectureRequestRecord:
    """
    Representation of the test data associated with each invocation of an architecture
    """
    # Cache of every record built by load_all; None until load_all has run
    all: Optional[List["ArchitectureRequestRecord"]] = None

    class ArchStep:
        """
        Inner class to just hold the name and timing data of one step
        """
        def __init__(self, name: str, start: int, end: int):
            self.name = name
            self.start = start
            self.end = end
            self.elapsed = end - start

    def __init__(self, arch: str, response_len: int, start: int, end: int,
                 elapsed: int, tags: List[str], test_group: Optional[str],
                 comment: str, steps: List["ArchitectureRequestRecord.ArchStep"]):
        self.arch = arch
        self.response_len = response_len
        self.start = start
        self.end = end
        self.elapsed = elapsed
        self.tags = tags
        self.test_group = test_group
        self.comment = comment
        self.steps = steps

    @classmethod
    def from_dict(cls, test: Dict) -> "ArchitectureRequestRecord":
        """
        Build a record from one trace dictionary

        Reads the keys 'architecture', 'request' -> 'response_evolution',
        'trace' -> 'steps' (each with 'name', 'start_ms', 'end_ms'),
        'test_tags' and 'test_comment'. The trace must hold at least one step
        as the overall start and end are taken from the first and last step.

        :param test: the trace dictionary
        :return: a new instance of the class the method was called on
        """
        trace_steps = test['trace']['steps']
        start = trace_steps[0]['start_ms']
        end = trace_steps[-1]['end_ms']

        # The response length is that of the last entry in the response evolution;
        # a request with no recorded response counts as length zero rather than failing
        evolution = test['request']['response_evolution']
        response_len = len(evolution[-1]) if evolution else 0

        # If several tags start with "TestGroup" the last one wins
        tags = test['test_tags']
        test_group = None
        for tag in tags:
            if tag.startswith("TestGroup"):
                test_group = tag

        steps = [cls.ArchStep(s['name'], s['start_ms'], s['end_ms']) for s in trace_steps]
        # Construct through cls so that subclasses get instances of their own type
        return cls(test['architecture'], response_len, start, end, end - start,
                   tags, test_group, test['test_comment'], steps)

    @classmethod
    def load_all(cls, reload=False) -> None:
        """
        Load all the traces from json trace log

        Does nothing if the records are already cached, unless reload is set.

        :param reload: rebuild the cache even if it is already populated
        """
        if cls.all is None or reload:
            cls.all = [cls.from_dict(trace) for trace in Architecture.get_trace_records()]
class TestGroup:
    """
    A collection of architecture request records which share a test group tag,
    along with summary information and statistics over those records
    """
    # Cache of test groups keyed by test group tag; None until load_all has run
    all: Optional[Dict[str, "TestGroup"]] = None

    def __init__(self, test_group: str):
        self.arch_request_records: List["ArchitectureRequestRecord"] = []
        self.test_group = test_group
        # The values below are filled in as records are added through add_record
        self.comment = None
        self.start = None
        self.end = None
        self.elapsed = None
        self.architectures = set()

    @property
    def num_archs(self) -> int:
        """
        Number of distinct architectures seen in the group
        """
        return len(self.architectures)

    @property
    def num_tests(self) -> int:
        """
        Number of records in the group
        """
        return len(self.arch_request_records)

    @property
    def num_tests_per_arch(self) -> int:
        """
        Number of tests per architecture, rounded down; zero for an empty group
        """
        # Should always divide evenly but floor to an int just in case
        if not self.architectures:
            return 0
        return self.num_tests // self.num_archs

    def arch_request_records_by_arch(self) -> Dict[str, List["ArchitectureRequestRecord"]]:
        """
        Group the records by architecture name, keeping the order they were added
        """
        grouped = {}
        for arr in self.arch_request_records:
            grouped.setdefault(arr.arch, []).append(arr)
        return grouped

    def summary_stats_by_arch(self) -> List[Dict]:
        """
        Build one statistics dictionary per architecture, sorted by architecture name

        Each dictionary holds 'arch_name', 'elapsed' (one value per record),
        'response_len' (one value per record) and 'steps', a list of
        {'step_name', 'mean_elapsed'} dictionaries. The step names and the number
        of steps are taken from the first record of the architecture, and steps
        are matched across records by position, so every record of an architecture
        needs at least as many steps as its first record.
        """
        by_arch = self.arch_request_records_by_arch()
        stats = []
        for arch_name in sorted(by_arch):
            recs = by_arch[arch_name]
            steps = []
            for i, first_step in enumerate(recs[0].steps):
                total_elapsed = sum(rec.steps[i].elapsed for rec in recs)
                steps.append({'step_name': first_step.name,
                              'mean_elapsed': total_elapsed / len(recs)})
            stats.append({'arch_name': arch_name,
                          'elapsed': [rec.elapsed for rec in recs],
                          'response_len': [rec.response_len for rec in recs],
                          'steps': steps})
        return stats

    def add_record(self, arr: "ArchitectureRequestRecord") -> None:
        """
        Add a record to the group and update the group level summary values

        The group comment is taken from the first record which supplies one, and
        the start and end are widened to cover the new record.

        :param arr: the record to add
        :raises ValueError: if the record's test group differs from this group's
        """
        if arr.test_group != self.test_group:
            raise ValueError("Attempted to group a test record into the wrong group")
        self.arch_request_records.append(arr)
        self.architectures.add(arr.arch)
        if self.comment is None:
            self.comment = arr.comment
        if self.start is None or self.start > arr.start:
            self.start = arr.start
        if self.end is None or self.end < arr.end:
            self.end = arr.end
        self.elapsed = self.end - self.start

    @classmethod
    def load_all(cls, reload=False):
        """
        Build the cache of test groups from all the architecture request records

        Records without a test group are ignored. Does nothing if the cache is
        already populated, unless reload is set.

        :param reload: rebuild the records and the groups even if already cached
        """
        if cls.all is None or reload:
            ArchitectureRequestRecord.load_all(reload=reload)
            groups = {}
            for arr in ArchitectureRequestRecord.all:
                if arr.test_group is None:
                    continue
                if arr.test_group not in groups:
                    groups[arr.test_group] = TestGroup(arr.test_group)
                groups[arr.test_group].add_record(arr)
            cls.all = groups

    @classmethod
    def for_test_group_tag(cls, test_group_tag: str) -> "TestGroup":
        """
        Return the test group for a tag, loading the groups first if needed

        :param test_group_tag: the test group tag to look up
        :raises KeyError: if no group exists for the tag
        """
        cls.load_all()
        return cls.all[test_group_tag]