From 7b67b9c46f261ec3c1a462d922b1ce89c26210ed Mon Sep 17 00:00:00 2001 From: Ronaldo e Silva Vieira Date: Mon, 17 Jan 2022 19:30:41 -0300 Subject: [PATCH 1/8] Handle adv. network updates inside training callback --- gym_locm/toolbox/trainer.py | 57 +++++++++++++++++++------------------ 1 file changed, 29 insertions(+), 28 deletions(-) diff --git a/gym_locm/toolbox/trainer.py b/gym_locm/toolbox/trainer.py index 75a552d..85ef97c 100644 --- a/gym_locm/toolbox/trainer.py +++ b/gym_locm/toolbox/trainer.py @@ -454,13 +454,34 @@ def _training_callback(self, _locals=None, _globals=None): self.wandb_run.log(info) - # if training should end, return False to end training - training_is_finished = episodes_so_far >= model.next_switch or episodes_so_far >= self.train_episodes - - if training_is_finished: + # if it is time to update the adversary model, do so + if episodes_so_far >= model.next_switch: model.last_switch = episodes_so_far model.next_switch += self.switch_frequency + # log training win rate at the time of the switch + train_mean_reward = np.mean([np.mean(rewards) for rewards in model.env.env_method('get_episode_rewards')]) + self.wandb_run.log({'train_mean_reward': train_mean_reward}) + + self.logger.debug(f"Model trained for " + f"{sum(model.env.get_attr('episodes'))} episodes. " + f"Train reward: {train_mean_reward}") + + # reset training env rewards + for i in range(model.env.num_envs): + model.env.set_attr('rewards', [0.0], indices=[i]) + + # update parameters of adversary models + try: + model.adversary.load_parameters(model.get_parameters(), exact_match=True) + except AttributeError: + model.adversary.set_parameters(model.get_parameters(), exact_match=True) + + self.logger.debug("Parameters of adversary network updated.") + + # if training should end, return False to end training + training_is_finished = episodes_so_far >= self.train_episodes + return not training_is_finished def _train(self): @@ -481,31 +502,11 @@ def _train(self): self.logger.debug(f"Training will switch models every " f"{self.switch_frequency} episodes") - for _ in range(self.num_switches): - # train the model - self.model.learn(total_timesteps=REALLY_BIG_INT, - reset_num_timesteps=False, - callback=CallbackList(callbacks)) - - # log training win rate at the time of the switch - train_mean_reward = np.mean([np.mean(rewards) for rewards in self.env.env_method('get_episode_rewards')]) - self.wandb_run.log({'train_mean_reward': train_mean_reward}) - - # reset training env rewards - for i in range(self.env.num_envs): - self.env.set_attr('rewards', [0.0], indices=[i]) - - self.logger.debug(f"Model trained for " - f"{sum(self.env.get_attr('episodes'))} episodes. 
" - f"Train reward: {train_mean_reward}") - - # update parameters of adversary models - try: - self.model.adversary.load_parameters(self.model.get_parameters(), exact_match=True) - except AttributeError: - self.model.adversary.set_parameters(self.model.get_parameters(), exact_match=True) + # train the model + self.model.learn(total_timesteps=REALLY_BIG_INT, + reset_num_timesteps=False, + callback=CallbackList(callbacks)) - self.logger.debug("Parameters of adversary network updated.") except KeyboardInterrupt: pass From 090e0dd088629f5621cae33ce29c8bf017322afd Mon Sep 17 00:00:00 2001 From: Ronaldo e Silva Vieira Date: Wed, 26 Jan 2022 11:10:03 -0300 Subject: [PATCH 2/8] Capture mean battle length (in turns) on training script --- gym_locm/envs/battle.py | 2 -- gym_locm/toolbox/trainer.py | 35 ++++++++++++++++++++++++++--------- 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/gym_locm/envs/battle.py b/gym_locm/envs/battle.py index dd48770..c2be698 100644 --- a/gym_locm/envs/battle.py +++ b/gym_locm/envs/battle.py @@ -101,8 +101,6 @@ def step(self, action): if winner is not None: reward = 1 if winner == PlayerOrder.FIRST else -1 - del info['turn'] - self.rewards[-1] += reward return self.encode_state(), reward, done, info diff --git a/gym_locm/toolbox/trainer.py b/gym_locm/toolbox/trainer.py index 85ef97c..eb2ad9c 100644 --- a/gym_locm/toolbox/trainer.py +++ b/gym_locm/toolbox/trainer.py @@ -50,6 +50,7 @@ def __init__(self, task, params, path, seed, wandb_run=None): self.checkpoints = [] self.win_rates = [] self.episode_lengths = [] + self.battle_lengths = [] self.action_histograms = [] self.start_time, self.end_time = None, None self.wandb_run = wandb_run @@ -70,6 +71,7 @@ def _save_results(self): with open(results_path, 'w') as file: info = dict(task=self.task, **self.params, seed=self.seed, checkpoints=self.checkpoints, win_rates=self.win_rates, ep_lengths=self.episode_lengths, + battle_lengths=self.battle_lengths, action_histograms=self.action_histograms, start_time=str(self.start_time), end_time=str(self.end_time)) info = json.dumps(info, indent=2) @@ -183,7 +185,7 @@ def _training_callback(self, _locals=None, _globals=None): agent = agent_class(self.model) - mean_reward, ep_length, act_hist = \ + mean_reward, ep_length, battle_length, act_hist = \ self.evaluator.run(agent, play_first=self.model.role_id == 0) end_time = time.perf_counter() @@ -196,6 +198,7 @@ def _training_callback(self, _locals=None, _globals=None): win_rate = (mean_reward + 1) / 2 self.win_rates.append(win_rate) self.episode_lengths.append(ep_length) + self.battle_lengths.append(battle_length) self.action_histograms.append(act_hist) # update control attributes @@ -208,7 +211,8 @@ def _training_callback(self, _locals=None, _globals=None): # upload stats to wandb, if enabled if self.wandb_run: info = dict(checkpoint=episodes_so_far, mean_reward=mean_reward, - win_rate=win_rate, mean_ep_length=ep_length) + win_rate=win_rate, mean_ep_length=ep_length, + mean_battle_length=battle_length) if self.task == 'battle': info['pass_actions'] = act_hist[0] @@ -405,17 +409,18 @@ def _training_callback(self, _locals=None, _globals=None): if self.evaluator.seed is not None: self.evaluator.seed = self.seed + self.train_episodes - mean_reward, ep_length, act_hist = \ + mean_reward, ep_length, battle_length, act_hist = \ self.evaluator.run(agent_class(model), play_first=True) if self.evaluator.seed is not None: self.evaluator.seed += self.eval_episodes - mean_reward2, ep_length2, act_hist2 = \ + mean_reward2, ep_length2, 
battle_length2, act_hist2 = \ self.evaluator.run(agent_class(model), play_first=False) mean_reward = (mean_reward + mean_reward2) / 2 ep_length = (ep_length + ep_length2) / 2 + battle_length = (battle_length + battle_length2) / 2 act_hist = [(act_hist[i] + act_hist2[i]) / 2 for i in range(model.env.get_attr('action_space', indices=[0])[0].n)] end_time = time.perf_counter() @@ -428,6 +433,7 @@ def _training_callback(self, _locals=None, _globals=None): win_rate = (mean_reward + 1) / 2 self.win_rates.append(win_rate) self.episode_lengths.append(ep_length) + self.battle_lengths.append(battle_length) self.action_histograms.append(act_hist) # update control attributes @@ -440,7 +446,8 @@ def _training_callback(self, _locals=None, _globals=None): # upload stats to wandb, if enabled if self.wandb_run: info = dict(checkpoint=episodes_so_far, mean_reward=mean_reward, - win_rate=win_rate, mean_ep_length=ep_length) + win_rate=win_rate, mean_ep_length=ep_length, + mean_battle_length=battle_length) if self.task == 'battle': info['pass_actions'] = act_hist[0] @@ -673,7 +680,7 @@ def _training_callback(self, _locals=None, _globals=None): else: agent_class = RLDraftAgent - mean_reward, ep_length, act_hist = \ + mean_reward, ep_length, battle_length, act_hist = \ self.evaluator.run(agent_class(model), play_first=model.role_id == 0) end_time = time.perf_counter() @@ -686,6 +693,7 @@ def _training_callback(self, _locals=None, _globals=None): win_rate = (mean_reward + 1) / 2 self.win_rates[model.role_id].append(win_rate) self.episode_lengths[model.role_id].append(ep_length) + self.battle_lengths[model.role_id].append(battle_length) self.action_histograms[model.role_id].append(act_hist) # update control attributes @@ -701,7 +709,8 @@ def _training_callback(self, _locals=None, _globals=None): 'checkpoint_' + model.role_id: episodes_so_far, 'mean_reward_' + model.role_id: mean_reward, 'win_rate_' + model.role_id: win_rate, - 'mean_ep_length_' + model.role_id: ep_length + 'mean_ep_length_' + model.role_id: ep_length, + 'mean_battle_length_' + model.role_id: battle_length } if self.task == 'battle': @@ -870,6 +879,7 @@ def run(self, agent: Agent, play_first=True): episodes_so_far = 0 episode_rewards = [[0.0] for _ in range(self.env.num_envs)] episode_lengths = [[0] for _ in range(self.env.num_envs)] + episode_turns = [[] for _ in range(self.env.num_envs)] action_histogram = [0] * self.env.action_space.n # run the episodes @@ -890,7 +900,7 @@ def run(self, agent: Agent, play_first=True): action_histogram[action] += 1 # perform the action and get the outcome - observations, rewards, dones, _ = self.env.step(actions) + observations, rewards, dones, infos = self.env.step(actions) # update metrics for i in range(self.env.num_envs): @@ -900,6 +910,7 @@ def run(self, agent: Agent, play_first=True): if dones[i]: episode_rewards[i].append(0.0) episode_lengths[i].append(0) + episode_turns[i].append(infos[i]['turn']) episodes_so_far += 1 @@ -912,6 +923,11 @@ def run(self, agent: Agent, play_first=True): for reward in rewards[:-1]] all_lengths = [length for lengths in episode_lengths for length in lengths[:-1]] + all_turns = [turn for turns in episode_turns for turn in turns] + + assert len(all_rewards) == self.episodes + assert len(all_lengths) == self.episodes + assert len(all_turns) == self.episodes # transform the action histogram in a probability distribution action_histogram = [action_freq / sum(action_histogram) @@ -920,8 +936,9 @@ def run(self, agent: Agent, play_first=True): # cap any unsolicited additional episodes 
all_rewards = all_rewards[:self.episodes] all_lengths = all_lengths[:self.episodes] + all_turns = all_turns[:self.episodes] - return mean(all_rewards), mean(all_lengths), action_histogram + return mean(all_rewards), mean(all_lengths), mean(all_turns), action_histogram def close(self): self.env.close() From 67a1a8480c10c5c43b03a05397be64a30d3a67c3 Mon Sep 17 00:00:00 2001 From: Ronaldo e Silva Vieira Date: Wed, 26 Jan 2022 11:19:49 -0300 Subject: [PATCH 3/8] Comment out length assertions for training stats --- gym_locm/toolbox/trainer.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/gym_locm/toolbox/trainer.py b/gym_locm/toolbox/trainer.py index eb2ad9c..91cf7b0 100644 --- a/gym_locm/toolbox/trainer.py +++ b/gym_locm/toolbox/trainer.py @@ -925,8 +925,9 @@ def run(self, agent: Agent, play_first=True): for length in lengths[:-1]] all_turns = [turn for turns in episode_turns for turn in turns] - assert len(all_rewards) == self.episodes - assert len(all_lengths) == self.episodes + # todo: fix -- sometimes we miss self.episodes by one + # assert len(all_rewards) == self.episodes + # assert len(all_lengths) == self.episodes assert len(all_turns) == self.episodes # transform the action histogram in a probability distribution From 186f3d167ab9688d8e5b29e5bdc6a519f85fd895 Mon Sep 17 00:00:00 2001 From: Ronaldo e Silva Vieira Date: Thu, 27 Jan 2022 10:03:42 -0300 Subject: [PATCH 4/8] Comment out length assertions for remaining training stats --- gym_locm/toolbox/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gym_locm/toolbox/trainer.py b/gym_locm/toolbox/trainer.py index 91cf7b0..240b25a 100644 --- a/gym_locm/toolbox/trainer.py +++ b/gym_locm/toolbox/trainer.py @@ -928,7 +928,7 @@ def run(self, agent: Agent, play_first=True): # todo: fix -- sometimes we miss self.episodes by one # assert len(all_rewards) == self.episodes # assert len(all_lengths) == self.episodes - assert len(all_turns) == self.episodes + # assert len(all_turns) == self.episodes # transform the action histogram in a probability distribution action_histogram = [action_freq / sum(action_histogram) From e00aed6d7b4cabb6d1915583e7e08e9b94d32864 Mon Sep 17 00:00:00 2001 From: Ronaldo e Silva Vieira Date: Tue, 8 Feb 2022 11:36:19 -0300 Subject: [PATCH 5/8] Add safeguard for greedy agent to not consider pass actions unless necessary --- gym_locm/agents.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/gym_locm/agents.py b/gym_locm/agents.py index 17e915f..366878e 100644 --- a/gym_locm/agents.py +++ b/gym_locm/agents.py @@ -91,9 +91,13 @@ def eval_state(state): return score def act(self, state): - best_action, best_score = None, float("-inf") + best_action, best_score = Action(ActionType.PASS), float("-inf") for action in state.available_actions: + + if action.type == ActionType.PASS: + continue + state_copy = state.clone() state_copy.act(action) From f2e411d6f4966960bf9f60000e90262d75f5403a Mon Sep 17 00:00:00 2001 From: Ronaldo e Silva Vieira Date: Tue, 8 Feb 2022 11:36:39 -0300 Subject: [PATCH 6/8] Fix presence of pass action in available actions list --- gym_locm/engine.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/gym_locm/engine.py b/gym_locm/engine.py index e6649a2..7ba840d 100644 --- a/gym_locm/engine.py +++ b/gym_locm/engine.py @@ -410,10 +410,7 @@ def available_actions(self) -> Tuple[Action]: attack.append(Action(ActionType.ATTACK, origin, valid_target)) - available_actions = summon + attack + use - - if not available_actions: 
- available_actions = [Action(ActionType.PASS)] + available_actions = [Action(ActionType.PASS)] + summon + use + attack self.__available_actions = tuple(available_actions) From 256b3d0d15bfd412a555e643341771a9db961df8 Mon Sep 17 00:00:00 2001 From: Ronaldo e Silva Vieira Date: Tue, 8 Feb 2022 11:37:59 -0300 Subject: [PATCH 7/8] Comment out really strange code I have no idea why it exists --- gym_locm/toolbox/trainer.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/gym_locm/toolbox/trainer.py b/gym_locm/toolbox/trainer.py index 240b25a..3d40efb 100644 --- a/gym_locm/toolbox/trainer.py +++ b/gym_locm/toolbox/trainer.py @@ -377,12 +377,13 @@ def _training_callback(self, _locals=None, _globals=None): model = self.model episodes_so_far = sum(self.env.get_attr('episodes')) - turns = model.env.get_attr('turn') - playing_first = model.env.get_attr('play_first') - - for i in range(model.env.num_envs): - if turns[i] in range(0, model.env.num_envs): - model.env.set_attr('play_first', not playing_first[i], indices=[i]) + # note: wtf was this code about, ronaldo??? + # turns = model.env.get_attr('turn') + # playing_first = model.env.get_attr('play_first') + # + # for i in range(model.env.num_envs): + # if turns[i] in range(0, model.env.num_envs): + # model.env.set_attr('play_first', not playing_first[i], indices=[i]) # if it is time to evaluate, do so if episodes_so_far >= model.next_eval: From 00e0909d04c8ad62e67eac63a4793ea12db8361c Mon Sep 17 00:00:00 2001 From: Ronaldo e Silva Vieira Date: Wed, 9 Feb 2022 19:48:04 -0300 Subject: [PATCH 8/8] Normalize player features --- gym_locm/envs/base_env.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/gym_locm/envs/base_env.py b/gym_locm/envs/base_env.py index c0dd7bf..de9d1ca 100644 --- a/gym_locm/envs/base_env.py +++ b/gym_locm/envs/base_env.py @@ -366,10 +366,14 @@ def encode_enemy_card_on_board(card: Creature): @staticmethod def encode_players(current, opposing): - return current.health, current.mana, current.next_rune, \ - 1 + current.bonus_draw, opposing.health, \ - opposing.base_mana + opposing.bonus_mana, \ - opposing.next_rune, 1 + opposing.bonus_draw + return current.health / 30, \ + current.mana / 13, \ + current.next_rune / 30, \ + (1 + current.bonus_draw) / 6, \ + opposing.health / 30, \ + (opposing.base_mana + opposing.bonus_mana) / 13, \ + opposing.next_rune / 30, \ + (1 + opposing.bonus_draw) / 6 def encode_state(self): """ Encodes a state object into a numerical matrix. """
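
Note on PATCH 8: a minimal sketch of the player-feature normalization it introduces, shown as a standalone helper for clarity. The divisors are taken directly from the patched encode_players() and are treated here as assumed game bounds (30 for health and the next-rune threshold, 13 for mana, 6 for 1 + bonus draws); the helper name normalize_player and its standalone form are illustrative only, since the patch keeps this logic inline in encode_players().

    # Illustrative sketch (not part of the patch): scale raw player
    # attributes into roughly [0, 1] using the same divisors as the
    # patched encode_players().
    MAX_HEALTH = 30   # also used as the bound for the next rune threshold
    MAX_MANA = 13     # assumed bound on effective mana (base + bonus)
    MAX_DRAW = 6      # assumed bound on 1 + bonus_draw

    def normalize_player(health, mana, next_rune, bonus_draw):
        """Return player features scaled to roughly the [0, 1] range."""
        return (
            health / MAX_HEALTH,
            mana / MAX_MANA,
            next_rune / MAX_HEALTH,
            (1 + bonus_draw) / MAX_DRAW,
        )

    # Example: a full-health player with 5 mana, next rune at 25, no bonus draw.
    print(normalize_player(30, 5, 25, 0))  # (1.0, 0.3846..., 0.8333..., 0.1666...)

As in the patch, the opposing player's mana would be fed in as base_mana + bonus_mana before dividing, so both players' features land on the same scale.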