Spaces:
Running
Running
import argparse | |
import matplotlib.pyplot as plt | |
import json | |
import time | |
import numpy as np | |
import torch | |
from pathlib import Path | |
from utils.babyai_utils.baby_agent import load_agent | |
from utils.storage import get_status | |
from utils.env import make_env | |
from utils.other import seed | |
from utils.storage import get_model_dir | |
from models import * | |
from scipy import stats | |
# NOTE: this script is deliberately disabled: it announces that it belongs to
# another project (VIGIL) and stops before doing any work. Everything below
# only runs if these two lines are removed.
print("Wrong script. This is from VIGIL")
exit()

start = time.time()

# Parse arguments
parser = argparse.ArgumentParser()
parser.add_argument("--seed", type=int, default=0,
                    help="random seed (default: 0)")
parser.add_argument("--random-agent", action="store_true", default=False,
                    help="random actions")
parser.add_argument("--argmax", action="store_true", default=False,
                    help="select the action with highest probability (default: False)")
parser.add_argument("--episodes", type=int, default=1000,
                    help="number of episodes to test")
parser.add_argument("--test-p", type=float, default=0.05,
                    help="p value")
# The help text used to be a copy of the --episodes one; this option is the
# number of training seeds evaluated per model.
parser.add_argument("--n-seeds", type=int, default=16,
                    help="number of training seeds to evaluate per model")
parser.add_argument("--subsample-step", type=int, default=1,
                    help="subsample step")
parser.add_argument("--start-step", type=int, default=1,
                    help="at which step to start the curves")
args = parser.parse_args()

# Set seed for all randomness sources
seed(args.seed)

# The evaluation protocol is pinned: only seed 1 with sampled (non-argmax)
# actions is accepted. Note that the --seed default (0) fails the first
# assert, so --seed 1 always has to be passed explicitly.
assert args.seed == 1
assert not args.argmax
# assert args.num_frames == 28000000
# assert args.episodes == 1000

# Short aliases used by the evaluation and statistical-test code
test_p = args.test_p
n_seeds = args.n_seeds
subsample_step = args.subsample_step
start_step = args.start_step

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}\n")
# what to load: names of the trained models to evaluate
models_to_evaluate = [
    "25-03_RERUN_WizardGuide_lang64_mm_baby_short_rec_env_MiniGrid-TalkItOutNoLiar-8x8-v0_multi-modal-babyai11-agent_arch_original_endpool_res_custom-ppo-2_exploration-bonus-params_5_50",
    "25-03_RERUN_WizardTwoGuides_lang64_mm_baby_short_rec_env_MiniGrid-TalkItOut-8x8-v0_multi-modal-babyai11-agent_arch_original_endpool_res_custom-ppo-2_exploration-bonus-params_5_50",
]
print("evaluating models: ", models_to_evaluate)

# what to put in the legend: {substring of the model name: legend label}
label_parser_dict = {
    "RERUN_WizardGuide_lang64_no_explo": "Abl-MH-BabyAI",
    "RERUN_WizardTwoGuides_lang64_no_explo": "MH-BabyAI",

    "RERUN_WizardGuide_lang64_mm_baby_short_rec_env": "Abl-MH-BabyAI-ExpBonus",
    "RERUN_WizardTwoGuides_lang64_mm_baby_short_rec_env": "MH-BabyAI-ExpBonus",

    "RERUN_WizardGuide_lang64_deaf_no_explo": "Abl-Deaf-MH-BabyAI",
    "RERUN_WizardTwoGuides_lang64_deaf_no_explo": "Deaf-MH-BabyAI",

    "RERUN_WizardGuide_lang64_bow": "Abl-MH-BabyAI-ExpBonus-BOW",
    "RERUN_WizardTwoGuides_lang64_bow": "MH-BabyAI-ExpBonus-BOW",

    "RERUN_WizardGuide_lang64_no_mem": "Abl-MH-BabyAI-ExpBonus-no-mem",
    "RERUN_WizardTwoGuides_lang64_no_mem": "MH-BabyAI-ExpBonus-no-mem",

    "RERUN_WizardGuide_lang64_bigru": "Abl-MH-BabyAI-ExpBonus-bigru",
    "RERUN_WizardTwoGuides_lang64_bigru": "MH-BabyAI-ExpBonus-bigru",

    "RERUN_WizardGuide_lang64_attgru": "Abl-MH-BabyAI-ExpBonus-attgru",
    "RERUN_WizardTwoGuides_lang64_attgru": "MH-BabyAI-ExpBonus-attgru",

    "RERUN_WizardGuide_lang64_curr_dial": "Abl-MH-BabyAI-ExpBonus-current-dialogue",
    "RERUN_WizardTwoGuides_lang64_curr_dial": "MH-BabyAI-ExpBonus-current-dialogue",

    "RERUN_WizardTwoGuides_lang64_mm_baby_short_rec_100M": "MH-BabyAI-ExpBonus-100M",
}

# how to do stat tests: {label: label it is compared against}
compare = {
    "MH-BabyAI-ExpBonus": "Abl-MH-BabyAI-ExpBonus",
}

# "purple" was misspelled ("purpule"), which matplotlib rejects as a colour.
COLORS = ["red", "blue", "green", "black", "purple", "brown", "orange", "gray"]

# One colour per legend label. There are more labels than colours, so the
# palette is cycled; a plain zip() would silently leave the later labels
# without a colour and make their lookup raise KeyError. The first
# len(COLORS) labels keep exactly the colours they had before.
label_color_dict = {
    label: COLORS[idx % len(COLORS)]
    for idx, label in enumerate(label_parser_dict.values())
}

# File caching per-episode fingerprints of the test set for this seed/size
test_set_check_path = Path("test_set_check_{}_nep_{}.json".format(args.seed, args.episodes))
def calc_perf_for_seed(i, model_name, num_frames, seed, argmax, episodes, random_agent=False):
    """Evaluate one training seed of a model and return its average scores.

    Args:
        i: index of the training seed; selects the sub-directory
            ``model_name/i``.
        model_name: name of the model, resolved through ``get_model_dir``.
        num_frames: checkpoint to load; must equal the ``num_frames`` stored
            in the status returned by ``get_status``.
        seed: seed passed to ``make_env``.
        argmax: forwarded to ``load_agent``.
        episodes: number of episodes to run.
        random_agent: when True, actions come from
            ``agent.get_random_action`` instead of ``agent.get_action``.

    Returns:
        Tuple ``(mean_reward, success_rate)``; an episode is a success when
        the reward received on its final step is strictly positive.

    Raises:
        AssertionError: if the loaded status does not match ``num_frames``,
            or if an episode's initial observation differs from the
            fingerprint stored in ``test_set_check_path``.
    """
    print("seed {}".format(i))
    model = Path(model_name) / str(i)
    model_dir = get_model_dir(model)

    # Fingerprints saved by an earlier run (if any), used to verify that
    # every evaluation sees the same sequence of test episodes.
    if test_set_check_path.exists():
        with open(test_set_check_path, "r") as fh:
            check_loaded = json.load(fh)
        print("check loaded")
    else:
        print("check not loaded")
        check_loaded = None

    # Load environment
    with open(Path(model_dir) / "config.json") as fh:
        conf = json.load(fh)
    env_name = conf["env"]
    env = make_env(env_name, seed)
    print("Environment loaded\n")

    # load agent
    agent = load_agent(env, model_dir, argmax, num_frames)
    status = get_status(model_dir, num_frames)
    assert status["num_frames"] == num_frames
    print("Agent loaded\n")

    check = {}
    seed_rewards = []
    for episode in range(episodes):
        print("[{}/{}]: ".format(episode, episodes), end="", flush=True)
        obs = env.reset()

        # check envs are the same during seeds: the sum of the initial image
        # observation serves as a cheap fingerprint of the episode.
        fingerprint = int(obs['image'].sum())
        check[episode] = fingerprint
        if check_loaded is not None:
            # Compare against the *stored* fingerprint. The previous code
            # compared the fresh value with itself, so it could never fail.
            # JSON object keys are strings, hence str(episode).
            assert check_loaded[str(episode)] == fingerprint, (
                "episode {} differs from the stored test set".format(episode)
            )

        while True:
            if random_agent:
                action = agent.get_random_action(obs)
            else:
                action = agent.get_action(obs)
            obs, reward, done, _ = env.step(action)
            print(".", end="", flush=True)
            agent.analyze_feedback(reward, done)
            if done:
                # Only the reward of the final step is recorded.
                seed_rewards.append(reward)
                break
        print()

    seed_rewards = np.array(seed_rewards)
    seed_success_rates = seed_rewards > 0

    # The first evaluation to finish stores the fingerprints for later runs.
    if not test_set_check_path.exists():
        with open(test_set_check_path, "w") as fh:
            json.dump(check, fh)
        print("check saved")

    print("seed success rate:", seed_success_rates.mean())
    print("seed reward:", seed_rewards.mean())
    return seed_rewards.mean(), seed_success_rates.mean()
def get_available_steps(model):
    """Return the checkpoint steps available for every seed of ``model``.

    Each sub-directory of the model directory is treated as one seed. The
    step of a checkpoint is the integer following ``status_`` in the name of
    a ``status_*`` file (file suffix ignored). Only the first ``k`` sorted
    steps are kept, where ``k`` is the smallest number of checkpoints any
    seed has.

    Args:
        model: model name, resolved through ``get_model_dir``.

    Returns:
        Sorted list of integer steps shared by all seeds.

    Raises:
        ValueError: if the model directory contains no seed directory.
        AssertionError: if seeds disagree on their first ``k`` steps.
    """
    model_dir = Path(get_model_dir(model))

    # Skip plain files: globbing inside a file yields nothing, which would
    # register a "seed" with zero checkpoints and silently truncate the
    # result to an empty list.
    per_seed_available_steps = {
        seed_dir: sorted(
            int(p.stem.split("status_")[-1])
            for p in seed_dir.glob("status_*")
        )
        for seed_dir in model_dir.glob("*")
        if seed_dir.is_dir()
    }
    if not per_seed_available_steps:
        raise ValueError("no seed directories found in {}".format(model_dir))

    num_steps = min(len(found) for found in per_seed_available_steps.values())
    steps = next(iter(per_seed_available_steps.values()))[:num_steps]

    # Every seed must expose the same checkpoints over the shared prefix.
    for seed_dir, available_steps in per_seed_available_steps.items():
        assert steps == available_steps[:num_steps], (
            "checkpoint steps of {} differ from the other seeds".format(seed_dir)
        )
    return steps
def plot_with_shade(subplot_nb, ax, x, y, err, color, shade_color, label,
                    legend=False, leg_size=30, leg_loc='best', title=None,
                    ylim=(0, 100), xlim=(0, 40), leg_args=None, leg_linewidth=8.0, linewidth=7.0, ticksize=30,
                    zorder=None, xlabel='perf', ylabel='env steps', smooth_factor=1000):
    """Draw a curve with a shaded ``y +/- err`` band on ``ax``.

    Args:
        subplot_nb: index of the subplot; the y-label is only set for 0.
        ax: axes object to draw on.
        x, y, err: x values, curve values and half-width of the shaded band.
        color, shade_color: colours of the line and of the band.
        label: legend entry of the curve.
        legend: whether to draw the legend.
        leg_size, leg_loc, leg_args, leg_linewidth: legend font size,
            location, extra keyword arguments and handle line width.
        title: optional axes title.
        ylim, xlim: ``(low, high)`` axis limits.
        linewidth, zorder: passed to ``ax.plot``.
        ticksize: font size of the major tick labels.
        xlabel, ylabel: axis labels.
        smooth_factor: if > 0, ``y`` and ``err`` are replaced by a trailing
            mean over the current and up to ``smooth_factor`` previous points.
    """
    # plt.rcParams.update({'font.size': 15})
    ax.locator_params(axis='x', nbins=6)
    ax.locator_params(axis='y', nbins=5)
    ax.tick_params(axis='both', which='major', labelsize=ticksize)

    # Accept plain sequences too: the arithmetic and slicing below need arrays.
    y = np.asarray(y)
    err = np.asarray(err)

    # smoothing
    def smooth(x_, n=50):
        return np.array([x_[max(i - n, 0):i + 1].mean() for i in range(len(x_))])

    if smooth_factor > 0:
        y = smooth(y, n=smooth_factor)
        err = smooth(err, n=smooth_factor)

    ax.plot(x, y, color=color, label=label, linewidth=linewidth, zorder=zorder)
    ax.fill_between(x, y - err, y + err, color=shade_color, alpha=0.2)

    if legend:
        leg = ax.legend(loc=leg_loc, fontsize=leg_size, **(leg_args or {}))  # 34
        # Newer matplotlib exposes `legend_handles`; older releases only
        # have `legendHandles`. Support both.
        if hasattr(leg, "legend_handles"):
            handles = leg.legend_handles
        else:
            handles = leg.legendHandles
        for legobj in handles:
            legobj.set_linewidth(leg_linewidth)

    ax.set_xlabel(xlabel, fontsize=30)
    if subplot_nb == 0:
        ax.set_ylabel(ylabel, fontsize=30)

    ax.set_xlim(left=xlim[0], right=xlim[1])
    ax.set_ylim(bottom=ylim[0], top=ylim[1])
    if title:
        ax.set_title(title, fontsize=22)
def label_parser(label, label_parser_dict):
    """Translate a model name into its legend label.

    Args:
        label: model name to translate.
        label_parser_dict: mapping ``{substring: legend label}``.

    Returns:
        The legend label of the single key that occurs as a substring of
        ``label``.

    Raises:
        SystemExit: with exit status 1, after printing ``ERROR`` and the
            offending name, when zero or several keys match.
    """
    matches = [v for k, v in label_parser_dict.items() if k in label]
    if len(matches) != 1:
        print("ERROR")
        print(label)
        # Exit with a failure status; the site-provided exit() used before
        # reported success (status 0) and is not guaranteed to exist.
        raise SystemExit(1)
    return matches[0]
f, ax = plt.subplots(1, 1, figsize=(10.0, 6.0))
ax = [ax]

# Per-label results, one entry per evaluated step
performances = {}           # mean success rate over seeds
per_seed_performances = {}  # array of per-seed success rates
stds = {}                   # standard deviation over seeds

# Legend labels must be unique, otherwise results would overwrite each other.
label_parser_dict_reverse = {v: k for k, v in label_parser_dict.items()}
assert len(label_parser_dict_reverse) == len(label_parser_dict)

label_to_model = {}

# evaluate and draw curves
for model in models_to_evaluate:
    label = label_parser(model, label_parser_dict)
    label_to_model[label] = model
    color = label_color_dict[label]

    performances[label] = []
    per_seed_performances[label] = []
    stds[label] = []

    # Checkpoints to evaluate: subsample first, then drop the early ones.
    steps = get_available_steps(model)
    steps = steps[::subsample_step]
    steps = [s for s in steps if s > start_step]
    print("steps:", steps)

    for step in steps:
        results = [
            calc_perf_for_seed(
                s,
                model_name=model,
                num_frames=step,
                seed=args.seed,
                argmax=args.argmax,
                episodes=args.episodes,
                # --random-agent was parsed but never forwarded before.
                random_agent=args.random_agent,
            )
            for s in range(n_seeds)
        ]
        rewards, success_rates = zip(*results)
        rewards = np.array(rewards)
        success_rates = np.array(success_rates)

        per_seed_performances[label].append(success_rates)
        performances[label].append(success_rates.mean())
        stds[label].append(success_rates.std())

    means = np.array(performances[label])
    err = np.array(stds[label])

    max_steps = np.max(steps)
    min_steps = np.min(steps)
    min_y = 0.0
    max_y = 1.0
    ylabel = "performance"
    smooth_factor = 0  # curves are drawn unsmoothed

    plot_with_shade(0, ax[0], steps, means, err, color, color, label,
                    legend=True, xlim=[min_steps, max_steps], ylim=[min_y, max_y],
                    leg_size=20, xlabel="Env steps (millions)", ylabel=ylabel, linewidth=5.0, smooth_factor=smooth_factor)

# Two models mapping to the same label would have been merged above.
assert len(label_to_model) == len(models_to_evaluate)
def get_compatible_steps(model1, model2, subsample_step):
    """Return the subsampled checkpoint steps that two models share.

    The available steps of each model are subsampled with stride
    ``subsample_step`` and both lists are cut to the length of the shorter
    one.

    Raises:
        AssertionError: if the two truncated lists are not identical.
    """
    first, second = (
        get_available_steps(name)[::subsample_step]
        for name in (model1, model2)
    )

    # Keep only the prefix both models can provide.
    shared_len = min(len(first), len(second))
    first, second = first[:shared_len], second[:shared_len]

    assert first == second
    return first
# stat tests: for every pair in `compare`, run Welch's t-test (unequal
# variances) on the per-seed success rates at each evaluated step and mark
# the steps where the difference is significant.
for k, v in compare.items():
    dist_1_steps = per_seed_performances[k]
    dist_2_steps = per_seed_performances[v]

    model_k = label_to_model[k]
    model_v = label_to_model[v]
    steps = get_compatible_steps(model_k, model_v, subsample_step)
    steps = [s for s in steps if s > start_step]

    for step, dist_1, dist_2 in zip(steps, dist_1_steps, dist_2_steps):
        assert len(dist_1) == n_seeds
        assert len(dist_2) == n_seeds

        p = stats.ttest_ind(
            dist_1,
            dist_2,
            equal_var=False
        ).pvalue

        if np.isnan(p):
            # An undefined p-value (presumably from degenerate samples) used
            # to open an interactive IPython shell, which blocks unattended
            # runs. Report it instead; NaN fails `p < test_p` below, so the
            # step is treated as not significant.
            print("WARNING: t-test p-value is NaN at step {} ({} vs {})".format(step, k, v))

        if p < test_p:
            plt.scatter(step, 0.8, color=label_color_dict[k], s=50, marker="x")

        print("{} (m:{}) <---> {} (m:{}) = p: {} result: {}".format(
            k, np.mean(dist_1), v, np.mean(dist_2), p,
            "Distributions different(p={})".format(test_p) if p < test_p else "Distributions same(p={})".format(test_p)
        ))
    print()

# Make sure the output directory exists before saving the figure.
Path("graphics").mkdir(parents=True, exist_ok=True)
f.savefig('graphics/test.png')
f.savefig('graphics/test.svg')