Spaces:

alfraser
/

llm-arch

Runtime error

File size: 5,418 Bytes

c319c31
ab87be2
 
c319c31
b897a48
ab87be2
 
c319c31
bb7db2c
ab87be2
 
 
c319c31
 
 
 
 
 
 
 
 
 
f3f6cf6
c319c31
f9e1dd5
fc8884e
 
c319c31
 
 
 
 
 
 
 
bb7db2c
 
 
fc8884e
 
c319c31
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f3f6cf6
c319c31
f9e1dd5
fc8884e
 
c319c31
 
 
 
 
 
 
 
b897a48
bb7db2c
 
b897a48
c319c31
 
bb7db2c
 
 
 
fc8884e
 
bb7db2c
 
 
 
 
c319c31
bb7db2c
 
c319c31
 
 
 
 
 
 
 
bb7db2c
c319c31
 
ab87be2
 
 
 
c319c31

import regex as re
import streamlit as st

from pandas import DataFrame
from random import sample
from src.architectures import *
from src.common import generate_group_tag
from src.datatypes import *
from src.testing import TestGenerator, batch_test
from src.st_helpers import st_setup


# Componentise different test options
def display_custom_test():
    st.write("## Run a new custom test")
    st.write("### Comment:")
    comment = st.text_input("Optional comment for the test", key="custom_test_comment")

    st.write("### Architectures to include:")
    selected_archs = st.multiselect(label="Architectures", options=[a.name for a in Architecture.architectures], key="custom_test_archs")

    st.write("### Number of questions to ask:")
    q_count = st.slider(label="Number of questions", min_value=1, max_value=TestGenerator.question_count(), step=1, key="custom_q_count")

    st.write("### Number of threads to use for testing:")
    thread_count = st.slider(label="Number of threads", min_value=1, max_value=64, step=1, value=16, key="custom_thread_slider")

    st.write("### Tag:")
    tag = generate_group_tag()
    st.write(f'Test will be tagged as "{tag}" - record this for easy searching later')

    total_tests = len(selected_archs) * q_count
    st.write("### Run:")
    st.write(f"**{total_tests}** total tests will be run")
    if st.button("**Run**", disabled=(total_tests == 0), key="custom_test_button"):
        with st.spinner():
            questions = TestGenerator.get_random_questions(q_count)
            batch_test(questions=questions, architectures=selected_archs,
                       trace_tags=[tag, "TestRunner"], trace_comment=comment,
                       num_workers=thread_count)


def display_pricing_fact_test():
    def get_question_price_pairs():
        DataLoader.load_data()
        pairs = []
        for p in Product.all.values():
            price = p.price
            product_name = p.name
            category_name = p.category.lower_singular_name
            if category_name == "tv":
                category_name = "TV"
            question = f'How much is the {product_name} {category_name}?'
            pairs.append((question, price))
        return pairs

    def get_price_from_response(response: str) -> float:
        prices = re.findall('\$[,\d]+\.\d\d', response)
        if len(prices) == 0:
            return -0.1
        return float(prices[0][1:].replace(',',''))

    st.write("## Run a pricing fact test")
    st.write("### Comment:")
    comment = st.text_input("Optional comment for the test", key="pricing_test_comment")

    st.write("### Architectures to include:")
    selected_archs = st.multiselect(label="Architectures", options=[a.name for a in Architecture.architectures], key="pricing_test_archs")

    question_price_pairs = get_question_price_pairs()
    st.write("### Number of questions to ask:")
    q_count = st.slider(label="Number of questions", min_value=1, max_value=len(question_price_pairs), step=1, key="pricing_q_count")

    st.write("### Number of threads to use for testing:")
    thread_count = st.slider(label="Number of threads", min_value=1, max_value=64, step=1, value=16, key="pricing_thread_slider")

    st.write("### Tag:")
    tag = generate_group_tag()
    st.write(f'Test will be tagged as "{tag}" - record this for easy searching later')

    total_tests = len(selected_archs) * q_count
    st.write("### Run:")
    st.write(f"**{total_tests}** total tests will be run")
    if st.button("**Run**", disabled=(total_tests == 0), key="pricing_test_button"):
        question_price_pairs = sample(question_price_pairs, k=q_count)
        question_price_dict = {qpp[0]: qpp[1] for qpp in question_price_pairs}
        questions = list(question_price_dict.keys())

        answer_stats = {}
        for arch_name in selected_archs:
            answer_stats[arch_name] = [0, 0]  # [correct, incorrect]

        with st.spinner():
            results: List[Tuple[str, str, str]] = batch_test(questions=questions, architectures=selected_archs,
                                                             trace_tags=[tag, "TestRunner"], trace_comment=comment,
                                                             num_workers=thread_count)
            for arch, query, response in results:
                target_price = question_price_dict[query]
                answer_price = get_price_from_response(response)
                if target_price == answer_price:
                    answer_stats[arch][0] += 1
                else:
                    answer_stats[arch][1] += 1

        table_data = []
        for arch_name in selected_archs:
            correct = answer_stats[arch_name][0]
            incorrect = answer_stats[arch_name][1]
            total = correct + incorrect
            percent_correct = round(correct / total * 100, 1)
            table_data.append([arch_name, correct, incorrect, total, f'{percent_correct:.1f}%'])
        df = DataFrame(table_data, columns=['Architecture', 'Correct', 'Incorrect', 'Total', '% Correct'])
        st.table(df.assign(no_index='').set_index('no_index'))


if Architecture.architectures is None:
    Architecture.load_architectures()

if st_setup('LLM Arch'):
    st.write("# Test Runner")
    with st.expander("Pricing Fact Tests"):
        display_pricing_fact_test()
    with st.expander("Custom Tests"):
        display_custom_test()