import argparse
import matplotlib.pyplot as plt
import json
import time
import numpy as np
import torch
from pathlib import Path
from utils.babyai_utils.baby_agent import load_agent
from utils.storage import get_status, get_model_dir
from utils.env import make_env
from utils.other import seed
from models import *
from scipy import stats

print("Wrong script. This is from VIGIL")
exit()
start = time.time()

# Parse arguments
parser = argparse.ArgumentParser()
parser.add_argument("--seed", type=int, default=0,
                    help="random seed (default: 0)")
parser.add_argument("--random-agent", action="store_true", default=False,
                    help="use random actions instead of the trained policy")
parser.add_argument("--argmax", action="store_true", default=False,
                    help="select the action with highest probability (default: False)")
parser.add_argument("--episodes", type=int, default=1000,
                    help="number of episodes to test")
parser.add_argument("--test-p", type=float, default=0.05,
                    help="p-value threshold for the significance tests")
parser.add_argument("--n-seeds", type=int, default=16,
                    help="number of training seeds to evaluate")
parser.add_argument("--subsample-step", type=int, default=1,
                    help="subsample step for checkpoints")
parser.add_argument("--start-step", type=int, default=1,
                    help="at which step to start the curves")
args = parser.parse_args()
# Set seed for all randomness sources
seed(args.seed)

assert args.seed == 1
assert not args.argmax
# assert args.num_frames == 28000000
# assert args.episodes == 1000

test_p = args.test_p
n_seeds = args.n_seeds
subsample_step = args.subsample_step
start_step = args.start_step

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}\n")
# what to load
models_to_evaluate = [
    "25-03_RERUN_WizardGuide_lang64_mm_baby_short_rec_env_MiniGrid-TalkItOutNoLiar-8x8-v0_multi-modal-babyai11-agent_arch_original_endpool_res_custom-ppo-2_exploration-bonus-params_5_50",
    "25-03_RERUN_WizardTwoGuides_lang64_mm_baby_short_rec_env_MiniGrid-TalkItOut-8x8-v0_multi-modal-babyai11-agent_arch_original_endpool_res_custom-ppo-2_exploration-bonus-params_5_50"
]
print("evaluating models: ", models_to_evaluate)
# what to put in the legend
label_parser_dict = {
    "RERUN_WizardGuide_lang64_no_explo": "Abl-MH-BabyAI",
    "RERUN_WizardTwoGuides_lang64_no_explo": "MH-BabyAI",
    "RERUN_WizardGuide_lang64_mm_baby_short_rec_env": "Abl-MH-BabyAI-ExpBonus",
    "RERUN_WizardTwoGuides_lang64_mm_baby_short_rec_env": "MH-BabyAI-ExpBonus",
    "RERUN_WizardGuide_lang64_deaf_no_explo": "Abl-Deaf-MH-BabyAI",
    "RERUN_WizardTwoGuides_lang64_deaf_no_explo": "Deaf-MH-BabyAI",
    "RERUN_WizardGuide_lang64_bow": "Abl-MH-BabyAI-ExpBonus-BOW",
    "RERUN_WizardTwoGuides_lang64_bow": "MH-BabyAI-ExpBonus-BOW",
    "RERUN_WizardGuide_lang64_no_mem": "Abl-MH-BabyAI-ExpBonus-no-mem",
    "RERUN_WizardTwoGuides_lang64_no_mem": "MH-BabyAI-ExpBonus-no-mem",
    "RERUN_WizardGuide_lang64_bigru": "Abl-MH-BabyAI-ExpBonus-bigru",
    "RERUN_WizardTwoGuides_lang64_bigru": "MH-BabyAI-ExpBonus-bigru",
    "RERUN_WizardGuide_lang64_attgru": "Abl-MH-BabyAI-ExpBonus-attgru",
    "RERUN_WizardTwoGuides_lang64_attgru": "MH-BabyAI-ExpBonus-attgru",
    "RERUN_WizardGuide_lang64_curr_dial": "Abl-MH-BabyAI-ExpBonus-current-dialogue",
    "RERUN_WizardTwoGuides_lang64_curr_dial": "MH-BabyAI-ExpBonus-current-dialogue",
    "RERUN_WizardTwoGuides_lang64_mm_baby_short_rec_100M": "MH-BabyAI-ExpBonus-100M"
}
# how to do stat tests
compare = {
    "MH-BabyAI-ExpBonus": "Abl-MH-BabyAI-ExpBonus",
}
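# The pairs in `compare` are tested checkpoint-by-checkpoint further below with Welch's
# t-test (scipy.stats.ttest_ind with equal_var=False); significant differences are
# marked with an "x" on the curves.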
COLORS = ["red", "blue", "green", "black", "purple", "brown", "orange", "gray"]
label_color_dict = {l: c for l, c in zip(label_parser_dict.values(), COLORS)}

test_set_check_path = Path("test_set_check_{}_nep_{}.json".format(args.seed, args.episodes))
def calc_perf_for_seed(i, model_name, num_frames, seed, argmax, episodes, random_agent=False):
    """Evaluate training seed `i` of `model_name` at checkpoint `num_frames`.

    Returns (mean reward, mean success rate) over `episodes` evaluation episodes.
    """
    print("seed {}".format(i))
    model = Path(model_name) / str(i)
    model_dir = get_model_dir(model)

    if test_set_check_path.exists():
        with open(test_set_check_path, "r") as f:
            check_loaded = json.load(f)
        print("check loaded")
    else:
        print("check not loaded")
        check_loaded = None

    # Load environment
    with open(model_dir + "/config.json") as f:
        conf = json.load(f)
    env_name = conf["env"]
    env = make_env(env_name, seed)
    print("Environment loaded\n")

    # Load agent
    agent = load_agent(env, model_dir, argmax, num_frames)
    status = get_status(model_dir, num_frames)
    assert status["num_frames"] == num_frames
    print("Agent loaded\n")

    check = {}
    seed_rewards = []
    for episode in range(episodes):
        print("[{}/{}]: ".format(episode, episodes), end="", flush=True)
        obs = env.reset()
        # check that the test envs are the same across seeds
        if episode in check:
            assert check[episode] == int(obs['image'].sum())
        else:
            check[episode] = int(obs['image'].sum())
        if check_loaded is not None:
            # compare against the saved test-set check (JSON keys are strings)
            assert check_loaded[str(episode)] == int(obs['image'].sum())
        while True:
            if random_agent:
                action = agent.get_random_action(obs)
            else:
                action = agent.get_action(obs)
            obs, reward, done, _ = env.step(action)
            print(".", end="", flush=True)
            agent.analyze_feedback(reward, done)
            if done:
                seed_rewards.append(reward)
                break
        print()

    seed_rewards = np.array(seed_rewards)
    seed_success_rates = seed_rewards > 0

    if not test_set_check_path.exists():
        with open(test_set_check_path, "w") as f:
            json.dump(check, f)
        print("check saved")

    print("seed success rate:", seed_success_rates.mean())
    print("seed reward:", seed_rewards.mean())
    return seed_rewards.mean(), seed_success_rates.mean()

def get_available_steps(model):
    model_dir = Path(get_model_dir(model))
    per_seed_available_steps = {}
    for seed_dir in model_dir.glob("*"):
        per_seed_available_steps[seed_dir] = sorted([
            int(str(p.with_suffix("")).split("status_")[-1])
            for p in seed_dir.glob("status_*")
        ])
    num_steps = min([len(steps) for steps in per_seed_available_steps.values()])
    steps = list(per_seed_available_steps.values())[0][:num_steps]
    for available_steps in per_seed_available_steps.values():
        s_steps = available_steps[:num_steps]
        assert steps == s_steps
    return steps
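# Note: get_available_steps relies on checkpoints being stored as
# <model_dir>/<training seed>/status_<num_frames>.<ext>; the exact extension does not
# matter because with_suffix("") strips it before the step count is parsed.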

def plot_with_shade(subplot_nb, ax, x, y, err, color, shade_color, label,
                    legend=False, leg_size=30, leg_loc='best', title=None,
                    ylim=[0, 100], xlim=[0, 40], leg_args={}, leg_linewidth=8.0, linewidth=7.0, ticksize=30,
                    zorder=None, xlabel='perf', ylabel='env steps', smooth_factor=1000):
    # plt.rcParams.update({'font.size': 15})
    ax.locator_params(axis='x', nbins=6)
    ax.locator_params(axis='y', nbins=5)
    ax.tick_params(axis='both', which='major', labelsize=ticksize)

    # smoothing
    def smooth(x_, n=50):
        return np.array([x_[max(i - n, 0):i + 1].mean() for i in range(len(x_))])

    if smooth_factor > 0:
        y = smooth(y, n=smooth_factor)
        err = smooth(err, n=smooth_factor)

    ax.plot(x, y, color=color, label=label, linewidth=linewidth, zorder=zorder)
    ax.fill_between(x, y - err, y + err, color=shade_color, alpha=0.2)

    if legend:
        leg = ax.legend(loc=leg_loc, fontsize=leg_size, **leg_args)  # 34
        for legobj in leg.legendHandles:
            legobj.set_linewidth(leg_linewidth)

    ax.set_xlabel(xlabel, fontsize=30)
    if subplot_nb == 0:
        ax.set_ylabel(ylabel, fontsize=30)
    ax.set_xlim(xmin=xlim[0], xmax=xlim[1])
    ax.set_ylim(bottom=ylim[0], top=ylim[1])
    if title:
        ax.set_title(title, fontsize=22)
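# Note: smooth() above is a trailing running mean over (up to) the previous `n` points;
# the plotting call further below passes smooth_factor=0, which disables smoothing entirely.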

def label_parser(label, label_parser_dict):
    if sum([1 for k, v in label_parser_dict.items() if k in label]) != 1:
        print("ERROR")
        print(label)
        exit()
    for k, v in label_parser_dict.items():
        if k in label:
            return v
    return label
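# For the two runs listed in models_to_evaluate, label_parser yields
# "Abl-MH-BabyAI-ExpBonus" (WizardGuide / TalkItOutNoLiar) and
# "MH-BabyAI-ExpBonus" (WizardTwoGuides / TalkItOut), exactly the pair referenced
# in `compare`.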

f, ax = plt.subplots(1, 1, figsize=(10.0, 6.0))
ax = [ax]

performances = {}
per_seed_performances = {}
stds = {}

label_parser_dict_reverse = {v: k for k, v in label_parser_dict.items()}
assert len(label_parser_dict_reverse) == len(label_parser_dict)

label_to_model = {}
# evaluate and draw curves
for model in models_to_evaluate:
    label = label_parser(model, label_parser_dict)
    label_to_model[label] = model
    color = label_color_dict[label]

    performances[label] = []
    per_seed_performances[label] = []
    stds[label] = []

    steps = get_available_steps(model)
    steps = steps[::subsample_step]
    steps = [s for s in steps if s > start_step]
    print("steps:", steps)

    for step in steps:
        results = []
        for s in range(n_seeds):
            results.append(calc_perf_for_seed(
                s,
                model_name=model,
                num_frames=step,
                seed=args.seed,
                argmax=args.argmax,
                episodes=args.episodes,
            ))
        rewards, success_rates = zip(*results)
        rewards = np.array(rewards)
        success_rates = np.array(success_rates)

        per_seed_performances[label].append(success_rates)
        performances[label].append(success_rates.mean())
        stds[label].append(success_rates.std())

    means = np.array(performances[label])
    err = np.array(stds[label])
    label = label_parser(str(model), label_parser_dict)

    max_steps = np.max(steps)
    min_steps = np.min(steps)
    min_y = 0.0
    max_y = 1.0
    ylabel = "performance"
    smooth_factor = 0

    plot_with_shade(0, ax[0], steps, means, err, color, color, label,
                    legend=True, xlim=[min_steps, max_steps], ylim=[min_y, max_y],
                    leg_size=20, xlabel="Env steps (millions)", ylabel=ylabel, linewidth=5.0,
                    smooth_factor=smooth_factor)

assert len(label_to_model) == len(models_to_evaluate)

def get_compatible_steps(model1, model2, subsample_step):
    steps_1 = get_available_steps(model1)[::subsample_step]
    steps_2 = get_available_steps(model2)[::subsample_step]
    min_steps = min(len(steps_1), len(steps_2))
    steps_1 = steps_1[:min_steps]
    steps_2 = steps_2[:min_steps]
    assert steps_1 == steps_2
    return steps_1

# stat tests
for k, v in compare.items():
    dist_1_steps = per_seed_performances[k]
    dist_2_steps = per_seed_performances[v]

    model_k = label_to_model[k]
    model_v = label_to_model[v]

    steps = get_compatible_steps(model_k, model_v, subsample_step)
    steps = [s for s in steps if s > start_step]

    for step, dist_1, dist_2 in zip(steps, dist_1_steps, dist_2_steps):
        assert len(dist_1) == n_seeds
        assert len(dist_2) == n_seeds

        p = stats.ttest_ind(
            dist_1,
            dist_2,
            equal_var=False
        ).pvalue

        if np.isnan(p):
            from IPython import embed; embed()

        if p < test_p:
            plt.scatter(step, 0.8, color=label_color_dict[k], s=50, marker="x")

        print("{} (m:{}) <---> {} (m:{}) = p: {} result: {}".format(
            k, np.mean(dist_1), v, np.mean(dist_2), p,
            "Distributions different(p={})".format(test_p) if p < test_p else "Distributions same(p={})".format(test_p)
        ))
    print()

f.savefig('graphics/test.png')
f.savefig('graphics/test.svg')
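# Illustrative invocation (the script name below is a placeholder, and the guard
# print/exit at the top of the file would have to be removed first):
#   python evaluate_stat_tests.py --seed 1 --n-seeds 16 --episodes 1000 --test-p 0.05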