alfraser committed on
Commit
82130cb
·
1 Parent(s): 8f424fc

Added the test reporting structure

Browse files
Files changed (2) hide show
  1. pages/040_Test_Reporter.py +55 -1
  2. src/testing.py +148 -2
pages/040_Test_Reporter.py CHANGED
@@ -1,9 +1,63 @@
 
 
1
  import streamlit as st
2
 
 
3
  from src.st_helpers import st_setup
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
 
5
  if st_setup('LLM Arch'):
6
  summary = st.container()
7
  with summary:
8
  st.write("# Test Reporter")
9
- st.info("Just a placeholder for now - need to implement the end user test suite selection and reporting")
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import plotly.express as px
3
  import streamlit as st
4
 
5
+ from src.architectures import *
6
  from src.st_helpers import st_setup
7
+ from src.testing import TestGroup
8
+
9
def _render_chart(title: str, rows: list, columns: list, chart, **chart_kwargs) -> None:
    """
    Draw one plotly express chart inside a collapsible streamlit expander.

    :param title: label shown on the expander
    :param rows: list of row lists used to build the DataFrame
    :param columns: column names for the DataFrame, in the same order as each row
    :param chart: plotly express chart factory to call (e.g. px.box or px.bar)
    :param chart_kwargs: keyword arguments forwarded unchanged to the chart factory
    """
    with st.expander(title):
        df = pd.DataFrame(rows, columns=columns)
        fig = chart(df, **chart_kwargs)
        # Tick labels are rotated to vertical for every chart on this page
        fig.update_xaxes(tickangle=-90)
        st.plotly_chart(fig, use_container_width=True)


def show_stats(for_test_group: str):
    """
    Write the headline numbers and the three summary charts for a single test group.

    :param for_test_group: the test group tag, passed to TestGroup.for_test_group_tag
    """
    test_group = TestGroup.for_test_group_tag(for_test_group)
    # The group comment can be None as well as an empty string, so test truthiness
    # rather than comparing to "" (which would let a None comment render as "None")
    title = test_group.comment or "No comment provided for group"
    st.write(f"### {title}")
    st.write(f"Total of {test_group.num_tests} tests over {test_group.num_archs} architectures ({test_group.num_tests_per_arch} per architecture).")

    stats = test_group.summary_stats_by_arch()

    # Elapsed values are divided by 1000 to present them under the "seconds" titles
    _render_chart(
        "**Elapsed End to End Time (seconds)**",
        [[arch['arch_name'], e / 1000] for arch in stats for e in arch['elapsed']],
        ['Architecture', 'Elapsed time'],
        px.box, x="Architecture", y="Elapsed time",
    )

    _render_chart(
        "**Average response length (count of characters)**",
        [[arch['arch_name'], rl] for arch in stats for rl in arch['response_len']],
        ['Architecture', 'Response length'],
        px.box, x="Architecture", y="Response length",
    )

    _render_chart(
        "**Mean elapsed time by architecture step (seconds)**",
        [[arch['arch_name'], step['step_name'], step['mean_elapsed'] / 1000]
         for arch in stats for step in arch['steps']],
        ['Architecture', 'Step', 'Mean elapsed time'],
        px.bar, x='Architecture', y='Mean elapsed time', color='Step', barmode='stack',
    )
46
+
47
 
48
# Page body: only rendered when st_setup returns a truthy value
if st_setup('LLM Arch'):
    summary = st.container()
    with summary:
        st.write("# Test Reporter")
        TestGroup.load_all()
        selector, display = st.columns([2, 3])

        with selector:
            # Largest start value first, i.e. the most recently started group on top
            test_groups = sorted(TestGroup.all.values(), key=lambda tg: tg.start, reverse=True)
            tags = [tg.test_group for tg in test_groups]
            # The tag itself is the option value and format_func only builds the label,
            # so the selection never has to be parsed back out of the display string
            # (splitting the label on ":" breaks if a tag ever contains a colon)
            selected = st.radio(
                '**Pick a test set to review**',
                options=tags,
                index=None,
                format_func=lambda tag: f'{tag}: {TestGroup.all[tag].comment}',
            )
            if selected:
                with display:
                    show_stats(selected)
62
+
63
+
src/testing.py CHANGED
@@ -1,15 +1,18 @@
 
 
1
  import json
2
  import os
3
 
4
  from random import choices
5
- from typing import List
6
 
 
7
  from src.common import data_dir
8
 
9
 
10
  class TestGenerator:
11
  """
12
- Wrapper to hold testing questions and serve up examples
13
  """
14
  questions: List[str] = None
15
 
@@ -36,4 +39,147 @@ class TestGenerator:
36
  """
37
  Return n random questions
38
  """
 
39
  return choices(cls.questions, k=n)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations # For self-referencing annotations
2
+
3
  import json
4
  import os
5
 
6
  from random import choices
7
+ from typing import List, Dict, Optional
8
 
9
+ from src.architectures import Architecture
10
  from src.common import data_dir
11
 
12
 
13
  class TestGenerator:
14
  """
15
+ Wrapper class to hold testing questions and serve up examples
16
  """
17
  questions: List[str] = None
18
 
 
39
  """
40
  Return n random questions
41
  """
42
+ cls.load_questions()
43
  return choices(cls.questions, k=n)
44
+
45
+
46
class ArchitectureRequestRecord:
    """
    Representation of the test data associated with each invocation of an architecture
    """
    # Cache of every record built by load_all(); None until the first load
    all: Optional[List["ArchitectureRequestRecord"]] = None

    class ArchStep:
        """
        Inner class to just hold the name and timings of one step of a trace
        """
        def __init__(self, name: str, start: int, end: int):
            self.name = name
            self.start = start
            self.end = end
            self.elapsed = end - start

    def __init__(self, arch: str, response_len: int, start: int, end: int,
                 elapsed: int, tags: List[str], test_group: Optional[str],
                 comment: str, steps: List["ArchitectureRequestRecord.ArchStep"]):
        self.arch = arch
        self.response_len = response_len
        self.start = start
        self.end = end
        self.elapsed = elapsed
        self.tags = tags
        self.test_group = test_group
        self.comment = comment
        self.steps = steps

    @classmethod
    def from_dict(cls, test: Dict) -> "ArchitectureRequestRecord":
        """
        Build a record from one trace dictionary.

        Reads the keys 'architecture', 'request' -> 'response_evolution', 'trace' -> 'steps'
        (each step having 'name', 'start_ms' and 'end_ms'), 'test_tags' and 'test_comment'.
        A missing key raises KeyError and an empty step or response list raises IndexError.
        """
        trace_steps = test['trace']['steps']
        # The record spans from the start of the first step to the end of the last step
        start = trace_steps[0]['start_ms']
        end = trace_steps[-1]['end_ms']

        tags = test['test_tags']
        # If several tags carry the TestGroup prefix, the last one wins
        group_tags = [tag for tag in tags if tag.startswith("TestGroup")]
        test_group = group_tags[-1] if group_tags else None

        steps = [cls.ArchStep(s['name'], s['start_ms'], s['end_ms']) for s in trace_steps]

        # Construct via cls (not the hard-coded class name) so subclasses get their own type
        return cls(
            test['architecture'],
            len(test['request']['response_evolution'][-1]),  # length of the final response
            start,
            end,
            end - start,
            tags,
            test_group,
            test['test_comment'],
            steps,
        )

    @classmethod
    def load_all(cls, reload=False) -> None:
        """
        Load all the traces from json trace log

        Results are cached on the class; pass reload=True to rebuild the cache.
        """
        if cls.all is None or reload:
            cls.all = [cls.from_dict(trace) for trace in Architecture.get_trace_records()]
104
+
105
+
106
class TestGroup:
    """
    A set of architecture request records which share the same test group tag,
    with summary values (comment, start, end, elapsed) maintained as records are added
    """
    # Cache of test group tag -> TestGroup built by load_all(); None until the first load
    all: Optional[Dict[str, "TestGroup"]] = None

    def __init__(self, test_group: str):
        self.arch_request_records: List["ArchitectureRequestRecord"] = []
        self.test_group = test_group
        self.comment = None
        self.start = None
        self.end = None
        self.elapsed = None
        self.architectures = set()

    @property
    def num_archs(self) -> int:
        """Number of distinct architectures seen in this group"""
        return len(self.architectures)

    @property
    def num_tests(self) -> int:
        """Total number of records in this group"""
        return len(self.arch_request_records)

    @property
    def num_tests_per_arch(self) -> int:
        """Records per architecture (floored), or 0 for a group with no records yet"""
        if not self.architectures:
            # Avoid dividing by zero on an empty group
            return 0
        # Should always divide evenly but floor just in case
        return self.num_tests // self.num_archs

    def arch_request_records_by_arch(self) -> Dict[str, List["ArchitectureRequestRecord"]]:
        """Return the records of this group keyed by architecture name, in insertion order"""
        grouped = {}
        for arr in self.arch_request_records:
            grouped.setdefault(arr.arch, []).append(arr)
        return grouped

    def summary_stats_by_arch(self) -> List[Dict]:
        """
        Return one stats dictionary per architecture, sorted by architecture name.

        Each dictionary holds 'arch_name', 'elapsed' (one value per record), 'response_len'
        (one value per record) and 'steps' - a list of {'step_name', 'mean_elapsed'} following
        the step list of the first record for that architecture.
        """
        grouped = self.arch_request_records_by_arch()
        stats = []
        for arch_name in sorted(grouped):
            recs = grouped[arch_name]
            step_stats = []
            for i, first_step in enumerate(recs[0].steps):
                # Average over the records which actually have this step, so a record with
                # a shorter trace does not raise IndexError
                timings = [rec.steps[i].elapsed for rec in recs if i < len(rec.steps)]
                step_stats.append({'step_name': first_step.name,
                                   'mean_elapsed': sum(timings) / len(timings)})
            stats.append({'arch_name': arch_name,
                          'elapsed': [rec.elapsed for rec in recs],
                          'response_len': [rec.response_len for rec in recs],
                          'steps': step_stats})
        return stats

    def add_record(self, arr: "ArchitectureRequestRecord") -> None:
        """
        Add a record to this group and widen the group's start/end window to include it.

        :raises ValueError: if the record carries a different test group tag
        """
        if arr.test_group != self.test_group:
            raise ValueError("Attempted to group a test record into the wrong group")
        self.arch_request_records.append(arr)
        self.architectures.add(arr.arch)
        # The first non-None comment seen becomes the comment for the whole group
        if self.comment is None:
            self.comment = arr.comment
        if self.start is None or self.start > arr.start:
            self.start = arr.start
        if self.end is None or self.end < arr.end:
            self.end = arr.end
        self.elapsed = self.end - self.start

    @classmethod
    def load_all(cls, reload=False):
        """
        Build the tag -> TestGroup cache from all architecture request records.

        Records without a test group are ignored. Results are cached on the class;
        pass reload=True to rebuild (which also reloads the underlying records).
        """
        if cls.all is None or reload:
            ArchitectureRequestRecord.load_all(reload=reload)
            records = {}
            for arr in ArchitectureRequestRecord.all:
                if arr.test_group is None:
                    continue
                if arr.test_group not in records:
                    records[arr.test_group] = cls(arr.test_group)
                records[arr.test_group].add_record(arr)
            cls.all = records

    @classmethod
    def for_test_group_tag(cls, test_group_tag: str) -> "TestGroup":
        """
        Return the group for the given tag, loading the cache first if needed.

        :raises KeyError: if no group exists for the tag
        """
        cls.load_all()
        return cls.all[test_group_tag]