import argparse
import matplotlib.pyplot as plt
import json
import time
import numpy as np
import torch
from pathlib import Path
from utils.babyai_utils.baby_agent import load_agent
from utils.storage import get_status, get_model_dir
from utils.env import make_env
from utils.other import seed
from models import *
from scipy import stats

print("Wrong script. This is from VIGIL")
exit()
start = time.time()

# Parse arguments
parser = argparse.ArgumentParser()
parser.add_argument("--seed", type=int, default=0,
                    help="random seed (default: 0)")
parser.add_argument("--random-agent", action="store_true", default=False,
                    help="use random actions instead of the trained policy")
parser.add_argument("--argmax", action="store_true", default=False,
                    help="select the action with highest probability (default: False)")
parser.add_argument("--episodes", type=int, default=1000,
                    help="number of episodes to test")
parser.add_argument("--test-p", type=float, default=0.05,
                    help="p-value threshold for the significance tests")
parser.add_argument("--n-seeds", type=int, default=16,
                    help="number of training seeds to evaluate")
parser.add_argument("--subsample-step", type=int, default=1,
                    help="subsample step for checkpoints")
parser.add_argument("--start-step", type=int, default=1,
                    help="at which step to start the curves")
args = parser.parse_args()
# Set seed for all randomness sources
seed(args.seed)

assert args.seed == 1
assert not args.argmax
# assert args.num_frames == 28000000
# assert args.episodes == 1000

test_p = args.test_p
n_seeds = args.n_seeds
subsample_step = args.subsample_step
start_step = args.start_step

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}\n")
# what to load
models_to_evaluate = [
    "25-03_RERUN_WizardGuide_lang64_mm_baby_short_rec_env_MiniGrid-TalkItOutNoLiar-8x8-v0_multi-modal-babyai11-agent_arch_original_endpool_res_custom-ppo-2_exploration-bonus-params_5_50",
    "25-03_RERUN_WizardTwoGuides_lang64_mm_baby_short_rec_env_MiniGrid-TalkItOut-8x8-v0_multi-modal-babyai11-agent_arch_original_endpool_res_custom-ppo-2_exploration-bonus-params_5_50"
]
print("evaluating models: ", models_to_evaluate)
# what to put in the legend
label_parser_dict = {
    "RERUN_WizardGuide_lang64_no_explo": "Abl-MH-BabyAI",
    "RERUN_WizardTwoGuides_lang64_no_explo": "MH-BabyAI",
    "RERUN_WizardGuide_lang64_mm_baby_short_rec_env": "Abl-MH-BabyAI-ExpBonus",
    "RERUN_WizardTwoGuides_lang64_mm_baby_short_rec_env": "MH-BabyAI-ExpBonus",
    "RERUN_WizardGuide_lang64_deaf_no_explo": "Abl-Deaf-MH-BabyAI",
    "RERUN_WizardTwoGuides_lang64_deaf_no_explo": "Deaf-MH-BabyAI",
    "RERUN_WizardGuide_lang64_bow": "Abl-MH-BabyAI-ExpBonus-BOW",
    "RERUN_WizardTwoGuides_lang64_bow": "MH-BabyAI-ExpBonus-BOW",
    "RERUN_WizardGuide_lang64_no_mem": "Abl-MH-BabyAI-ExpBonus-no-mem",
    "RERUN_WizardTwoGuides_lang64_no_mem": "MH-BabyAI-ExpBonus-no-mem",
    "RERUN_WizardGuide_lang64_bigru": "Abl-MH-BabyAI-ExpBonus-bigru",
    "RERUN_WizardTwoGuides_lang64_bigru": "MH-BabyAI-ExpBonus-bigru",
    "RERUN_WizardGuide_lang64_attgru": "Abl-MH-BabyAI-ExpBonus-attgru",
    "RERUN_WizardTwoGuides_lang64_attgru": "MH-BabyAI-ExpBonus-attgru",
    "RERUN_WizardGuide_lang64_curr_dial": "Abl-MH-BabyAI-ExpBonus-current-dialogue",
    "RERUN_WizardTwoGuides_lang64_curr_dial": "MH-BabyAI-ExpBonus-current-dialogue",
    "RERUN_WizardTwoGuides_lang64_mm_baby_short_rec_100M": "MH-BabyAI-ExpBonus-100M"
}
# how to do stat tests
compare = {
    "MH-BabyAI-ExpBonus": "Abl-MH-BabyAI-ExpBonus",
}
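# The pairs in `compare` are tested checkpoint-by-checkpoint further below with Welch's
# t-test (scipy.stats.ttest_ind with equal_var=False); significant differences are
# marked with an "x" on the curves.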
COLORS = ["red", "blue", "green", "black", "purple", "brown", "orange", "gray"]
label_color_dict = {l: c for l, c in zip(label_parser_dict.values(), COLORS)}

test_set_check_path = Path("test_set_check_{}_nep_{}.json".format(args.seed, args.episodes))
def calc_perf_for_seed(i, model_name, num_frames, seed, argmax, episodes, random_agent=False):
    """Evaluate training seed `i` of `model_name` at checkpoint `num_frames`.

    Returns (mean reward, mean success rate) over `episodes` evaluation episodes.
    """
    print("seed {}".format(i))
    model = Path(model_name) / str(i)
    model_dir = get_model_dir(model)

    if test_set_check_path.exists():
        with open(test_set_check_path, "r") as f:
            check_loaded = json.load(f)
        print("check loaded")
    else:
        print("check not loaded")
        check_loaded = None

    # Load environment
    with open(model_dir + "/config.json") as f:
        conf = json.load(f)
    env_name = conf["env"]
    env = make_env(env_name, seed)
    print("Environment loaded\n")

    # Load agent
    agent = load_agent(env, model_dir, argmax, num_frames)
    status = get_status(model_dir, num_frames)
    assert status["num_frames"] == num_frames
    print("Agent loaded\n")

    check = {}
    seed_rewards = []
    for episode in range(episodes):
        print("[{}/{}]: ".format(episode, episodes), end="", flush=True)
        obs = env.reset()
        # check that the test envs are the same across seeds
        if episode in check:
            assert check[episode] == int(obs['image'].sum())
        else:
            check[episode] = int(obs['image'].sum())
        if check_loaded is not None:
            # compare against the saved test-set check (JSON keys are strings)
            assert check_loaded[str(episode)] == int(obs['image'].sum())
        while True:
            if random_agent:
                action = agent.get_random_action(obs)
            else:
                action = agent.get_action(obs)
            obs, reward, done, _ = env.step(action)
            print(".", end="", flush=True)
            agent.analyze_feedback(reward, done)
            if done:
                seed_rewards.append(reward)
                break
        print()

    seed_rewards = np.array(seed_rewards)
    seed_success_rates = seed_rewards > 0

    if not test_set_check_path.exists():
        with open(test_set_check_path, "w") as f:
            json.dump(check, f)
        print("check saved")

    print("seed success rate:", seed_success_rates.mean())
    print("seed reward:", seed_rewards.mean())
    return seed_rewards.mean(), seed_success_rates.mean()

def get_available_steps(model):
    model_dir = Path(get_model_dir(model))
    per_seed_available_steps = {}
    for seed_dir in model_dir.glob("*"):
        per_seed_available_steps[seed_dir] = sorted([
            int(str(p.with_suffix("")).split("status_")[-1])
            for p in seed_dir.glob("status_*")
        ])
    num_steps = min([len(steps) for steps in per_seed_available_steps.values()])
    steps = list(per_seed_available_steps.values())[0][:num_steps]
    for available_steps in per_seed_available_steps.values():
        s_steps = available_steps[:num_steps]
        assert steps == s_steps
    return steps
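# Note: get_available_steps relies on checkpoints being stored as
# <model_dir>/<training seed>/status_<num_frames>.<ext>; the exact extension does not
# matter because with_suffix("") strips it before the step count is parsed.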

def plot_with_shade(subplot_nb, ax, x, y, err, color, shade_color, label,
                    legend=False, leg_size=30, leg_loc='best', title=None,
                    ylim=[0, 100], xlim=[0, 40], leg_args={}, leg_linewidth=8.0, linewidth=7.0, ticksize=30,
                    zorder=None, xlabel='perf', ylabel='env steps', smooth_factor=1000):
    # plt.rcParams.update({'font.size': 15})
    ax.locator_params(axis='x', nbins=6)
    ax.locator_params(axis='y', nbins=5)
    ax.tick_params(axis='both', which='major', labelsize=ticksize)

    # smoothing
    def smooth(x_, n=50):
        return np.array([x_[max(i - n, 0):i + 1].mean() for i in range(len(x_))])

    if smooth_factor > 0:
        y = smooth(y, n=smooth_factor)
        err = smooth(err, n=smooth_factor)

    ax.plot(x, y, color=color, label=label, linewidth=linewidth, zorder=zorder)
    ax.fill_between(x, y - err, y + err, color=shade_color, alpha=0.2)

    if legend:
        leg = ax.legend(loc=leg_loc, fontsize=leg_size, **leg_args)  # 34
        for legobj in leg.legendHandles:
            legobj.set_linewidth(leg_linewidth)

    ax.set_xlabel(xlabel, fontsize=30)
    if subplot_nb == 0:
        ax.set_ylabel(ylabel, fontsize=30)
    ax.set_xlim(xmin=xlim[0], xmax=xlim[1])
    ax.set_ylim(bottom=ylim[0], top=ylim[1])
    if title:
        ax.set_title(title, fontsize=22)
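# Note: smooth() above is a trailing running mean over (up to) the previous `n` points;
# the plotting call further below passes smooth_factor=0, which disables smoothing entirely.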

def label_parser(label, label_parser_dict):
    if sum([1 for k, v in label_parser_dict.items() if k in label]) != 1:
        print("ERROR")
        print(label)
        exit()
    for k, v in label_parser_dict.items():
        if k in label:
            return v
    return label
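# For the two runs listed in models_to_evaluate, label_parser yields
# "Abl-MH-BabyAI-ExpBonus" (WizardGuide / TalkItOutNoLiar) and
# "MH-BabyAI-ExpBonus" (WizardTwoGuides / TalkItOut), exactly the pair referenced
# in `compare`.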

f, ax = plt.subplots(1, 1, figsize=(10.0, 6.0))
ax = [ax]

performances = {}
per_seed_performances = {}
stds = {}

label_parser_dict_reverse = {v: k for k, v in label_parser_dict.items()}
assert len(label_parser_dict_reverse) == len(label_parser_dict)

label_to_model = {}
# evaluate and draw curves
for model in models_to_evaluate:
    label = label_parser(model, label_parser_dict)
    label_to_model[label] = model
    color = label_color_dict[label]

    performances[label] = []
    per_seed_performances[label] = []
    stds[label] = []

    steps = get_available_steps(model)
    steps = steps[::subsample_step]
    steps = [s for s in steps if s > start_step]
    print("steps:", steps)

    for step in steps:
        results = []
        for s in range(n_seeds):
            results.append(calc_perf_for_seed(
                s,
                model_name=model,
                num_frames=step,
                seed=args.seed,
                argmax=args.argmax,
                episodes=args.episodes,
            ))
        rewards, success_rates = zip(*results)
        rewards = np.array(rewards)
        success_rates = np.array(success_rates)

        per_seed_performances[label].append(success_rates)
        performances[label].append(success_rates.mean())
        stds[label].append(success_rates.std())

    means = np.array(performances[label])
    err = np.array(stds[label])
    label = label_parser(str(model), label_parser_dict)

    max_steps = np.max(steps)
    min_steps = np.min(steps)
    min_y = 0.0
    max_y = 1.0
    ylabel = "performance"
    smooth_factor = 0

    plot_with_shade(0, ax[0], steps, means, err, color, color, label,
                    legend=True, xlim=[min_steps, max_steps], ylim=[min_y, max_y],
                    leg_size=20, xlabel="Env steps (millions)", ylabel=ylabel, linewidth=5.0,
                    smooth_factor=smooth_factor)

assert len(label_to_model) == len(models_to_evaluate)

def get_compatible_steps(model1, model2, subsample_step):
    steps_1 = get_available_steps(model1)[::subsample_step]
    steps_2 = get_available_steps(model2)[::subsample_step]
    min_steps = min(len(steps_1), len(steps_2))
    steps_1 = steps_1[:min_steps]
    steps_2 = steps_2[:min_steps]
    assert steps_1 == steps_2
    return steps_1

# stat tests
for k, v in compare.items():
    dist_1_steps = per_seed_performances[k]
    dist_2_steps = per_seed_performances[v]

    model_k = label_to_model[k]
    model_v = label_to_model[v]

    steps = get_compatible_steps(model_k, model_v, subsample_step)
    steps = [s for s in steps if s > start_step]

    for step, dist_1, dist_2 in zip(steps, dist_1_steps, dist_2_steps):
        assert len(dist_1) == n_seeds
        assert len(dist_2) == n_seeds

        p = stats.ttest_ind(
            dist_1,
            dist_2,
            equal_var=False
        ).pvalue

        if np.isnan(p):
            from IPython import embed; embed()

        if p < test_p:
            plt.scatter(step, 0.8, color=label_color_dict[k], s=50, marker="x")

        print("{} (m:{}) <---> {} (m:{}) = p: {} result: {}".format(
            k, np.mean(dist_1), v, np.mean(dist_2), p,
            "Distributions different(p={})".format(test_p) if p < test_p else "Distributions same(p={})".format(test_p)
        ))
    print()

f.savefig('graphics/test.png')
f.savefig('graphics/test.svg')
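# Illustrative invocation (the script name below is a placeholder, and the guard
# print/exit at the top of the file would have to be removed first):
#   python evaluate_stat_tests.py --seed 1 --n-seeds 16 --episodes 1000 --test-p 0.05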