Skip to content

Commit 8985a5b

Browse files
authored
Merge pull request #34 from koulanurag/checkers
- one-hot representation for items
- reward matched with original paper
2 parents 1b0f9db + da89bb2 commit 8985a5b

File tree

2 files changed

+64
-32
lines changed

2 files changed

+64
-32
lines changed

ma_gym/envs/checkers/checkers.py

+20-14
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,8 @@
1515

1616
class Checkers(gym.Env):
1717
"""
18-
The map contains apples and lemons. The first player (red) is very sensitive and scores 5 for
19-
the team for an apple (green square) and −5 for a lemon (orange square). The second (blue), less sensitive
18+
The map contains apples and lemons. The first player (red) is very sensitive and scores 10 for
19+
the team for an apple (green square) and −10 for a lemon (orange square). The second (blue), less sensitive
2020
player scores 1 for the team for an apple and −1 for a lemon. There is a wall of lemons between the
2121
players and the apples. Apples and lemons disappear when collected, and the environment resets
2222
when all apples are eaten. It is important that the sensitive agent eats the apples while the less sensitive
@@ -35,16 +35,16 @@ def __init__(self, full_observable=False, step_cost=-0.01, max_steps=100):
3535
self.full_observable = full_observable
3636

3737
self.action_space = MultiAgentActionSpace([spaces.Discrete(5) for _ in range(self.n_agents)])
38-
self._obs_high = np.array([1.0, 1.0] + [max(OBSERVATION_MEANING.keys())] * 9, dtype=np.float32)
39-
self._obs_low = np.array([0.0, 0.0] + [min(OBSERVATION_MEANING.keys())] * 9, dtype=np.float32)
38+
self._obs_high = np.ones(2 + (3 * 3 * 5))
39+
self._obs_low = np.zeros(2 + (3 * 3 * 5))
4040
if self.full_observable:
4141
self._obs_high = np.tile(self._obs_high, self.n_agents)
4242
self._obs_low = np.tile(self._obs_low, self.n_agents)
43-
self.observation_space = MultiAgentObservationSpace(
44-
[spaces.Box(self._obs_low, self._obs_high) for _ in range(self.n_agents)])
43+
self.observation_space = MultiAgentObservationSpace([spaces.Box(self._obs_low, self._obs_high)
44+
for _ in range(self.n_agents)])
4545

4646
self.init_agent_pos = {0: [0, self._grid_shape[1] - 2], 1: [2, self._grid_shape[1] - 2]}
47-
self.agent_reward = {0: {'lemon': -5, 'apple': 5},
47+
self.agent_reward = {0: {'lemon': -10, 'apple': 10},
4848
1: {'lemon': -1, 'apple': 1}}
4949

5050
self.agent_prev_pos = None
@@ -107,19 +107,19 @@ def get_agent_obs(self):
107107

108108
# add 3 x3 mask around the agent current location and share neighbours
109109
# ( in practice: this information may not be so critical since the map never changes)
110-
_agent_i_neighbour = np.zeros((3, 3))
110+
_agent_i_neighbour = np.zeros((3, 3, 5))
111111
for r in range(pos[0] - 1, pos[0] + 2):
112112
for c in range(pos[1] - 1, pos[1] + 2):
113113
if self.is_valid((r, c)):
114-
item = 0
114+
item = [0, 0, 0, 0, 0]
115115
if PRE_IDS['lemon'] in self._full_obs[r][c]:
116-
item = 1
116+
item[ITEM_ONE_HOT_INDEX['lemon']] = 1
117117
elif PRE_IDS['apple'] in self._full_obs[r][c]:
118-
item = 2
118+
item[ITEM_ONE_HOT_INDEX['apple']] = 1
119119
elif PRE_IDS['agent'] in self._full_obs[r][c]:
120-
item = 3
120+
item[ITEM_ONE_HOT_INDEX[self._full_obs[r][c]]] = 1
121121
elif PRE_IDS['wall'] in self._full_obs[r][c]:
122-
item = -1
122+
item[ITEM_ONE_HOT_INDEX['wall']] = 1
123123
_agent_i_neighbour[r - (pos[0] - 1)][c - (pos[1] - 1)] = item
124124
_agent_i_obs += _agent_i_neighbour.flatten().tolist()
125125

@@ -267,7 +267,6 @@ def close(self):
267267
# each pre-id should be unique and single char
268268
PRE_IDS = {
269269
'agent': 'A',
270-
'prey': 'P',
271270
'wall': 'W',
272271
'empty': '0',
273272
'lemon': 'Y', # yellow color
@@ -278,6 +277,13 @@ def close(self):
278277
0: 'red',
279278
1: 'blue'
280279
}
280+
ITEM_ONE_HOT_INDEX = {
281+
'lemon': 0,
282+
'apple': 1,
283+
'A1': 2,
284+
'A2': 3,
285+
'wall': 4,
286+
}
281287
WALL_COLOR = 'black'
282288
LEMON_COLOR = 'yellow'
283289
APPLE_COLOR = 'green'

tests/envs/test_checkers.py

+44-18
Original file line numberDiff line numberDiff line change
@@ -27,14 +27,14 @@ def test_reset(env):
2727

2828
# add agent 1 obs
2929
agent_1_obs = [0.0, 0.86]
30-
agent_1_obs += np.array([[0, 0, 0],
31-
[1, 3, 0],
32-
[2, 0, 0]]).flatten().tolist()
30+
agent_1_obs += np.array([[[0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0]],
31+
[[1, 0, 0, 0, 0], [0, 0, 1, 0, 0], [0, 0, 0, 0, 0]],
32+
[[0, 1, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0]]]).flatten().tolist()
3333
# add agent 2 obs
3434
agent_2_obs = [0.67, 0.86]
35-
agent_2_obs += np.array([[2, 0, 0],
36-
[1, 3, 0],
37-
[0, 0, 0]]).flatten().tolist()
35+
agent_2_obs += np.array([[[0, 1, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0]],
36+
[[1, 0, 0, 0, 0], [0, 0, 0, 1, 0], [0, 0, 0, 0, 0]],
37+
[[0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0]]]).flatten().tolist()
3838

3939
init_obs_n = [agent_1_obs, agent_2_obs]
4040

@@ -49,20 +49,23 @@ def test_reset(env):
4949

5050
@pytest.mark.parametrize('pos,valid',
5151
[((-1, -1), False), ((-1, 0), False), ((-1, 8), False), ((3, 8), False)])
52-
def test_is_valid(env, pos, valid):
52+
def test_pos_validity(env, pos, valid):
5353
assert env.is_valid(pos) == valid
5454

5555

5656
@pytest.mark.parametrize('action_n,output',
5757
[([1, 1], # action
58-
([[0.0, 0.71, 0.0, 0.0, 0.0, 2.0, 3.0, 0.0, 1.0, 2.0, 0.0],
59-
[0.67, 0.71, 1.0, 2.0, 0.0, 2.0, 3.0, 0.0, 0.0, 0.0, 0.0]], # obs
58+
([[0.0, 0.71, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
59+
0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
60+
[0.67, 0.71, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
61+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
6062
{'lemon': 7, 'apple': 9}))]) # food_count
6163
def test_step(env, action_n, output):
6264
env.reset()
6365
target_obs_n, food_count = output
6466
obs_n, reward_n, done_n, info = env.step(action_n)
6567

68+
assert obs_n == target_obs_n, 'observation does not match . Expected {}. Got {}'.format(target_obs_n, obs_n)
6669
for k, v in food_count.items():
6770
assert info['food_count'][k] == food_count[k], '{} does not match'.format(k)
6871
assert env._step_count == 1
@@ -99,18 +102,18 @@ def test_observation_space(env):
99102
assert env.observation_space.contains(env.observation_space.sample())
100103

101104

102-
@parametrize_plus('env', [fixture_ref(env),
103-
fixture_ref(env_full)])
104-
def test_rollout(env):
105+
@parametrize_plus('env', [fixture_ref(env)])
106+
def test_rollout_env(env):
105107
actions = [[1, 1], [1, 1], [1, 1], [1, 1], [1, 1], [1, 1], [1, 1], [1, 1],
106108
[0, 4], [3, 4], [3, 4], [3, 4], [3, 4], [3, 4]]
107-
target_rewards = [[-5.01, -1.01], [4.99, 0.99], [-5.01, -1.01], [4.99, 0.99],
108-
[-5.01, -1.01], [4.99, 0.99], [-0.01, -0.01], [-0.01, -0.01],
109-
[-5.01, -0.01], [4.99, -0.01], [-5.01, -0.01], [4.99, -0.01],
110-
[-5.01, -0.01], [4.99, -0.01]]
111-
for episode_i in range(2):
109+
target_rewards = [[-10.01, -1.01], [9.99, 0.99], [-10.01, -1.01], [9.99, 0.99],
110+
[-10.01, -1.01], [9.99, 0.99], [-0.01, -0.01], [-0.01, -0.01],
111+
[-10.01, -0.01], [9.99, -0.01], [-10.01, -0.01], [9.99, -0.01],
112+
[-10.01, -0.01], [9.99, -0.01]]
112113

113-
env.reset()
114+
for episode_i in range(1): # multiple episode to validate the seq. again on reset.
115+
116+
obs = env.reset()
114117
done = [False for _ in range(env.n_agents)]
115118
for step_i in range(len(actions)):
116119
obs, reward_n, done, _ = env.step(actions[step_i])
@@ -132,3 +135,26 @@ def test_max_steps(env):
132135
step_i += 1
133136
assert step_i == env._max_steps
134137
assert done == [True for _m in range(env.n_agents)]
138+
139+
140+
@parametrize_plus('env', [fixture_ref(env),
141+
fixture_ref(env_full)])
142+
def test_collision(env):
143+
for episode_i in range(2):
144+
env.reset()
145+
obs_1, reward_n, done, _ = env.step([0, 2])
146+
obs_2, reward_n, done, _ = env.step([0, 2])
147+
148+
assert obs_1 == obs_2
149+
150+
151+
@parametrize_plus('env', [fixture_ref(env),
152+
fixture_ref(env_full)])
153+
def test_revisit_fruit_cell(env):
154+
for episode_i in range(2):
155+
env.reset()
156+
obs_1, reward_1, done, _ = env.step([1, 1])
157+
obs_2, reward_2, done, _ = env.step([3, 3])
158+
obs_3, reward_3, done, _ = env.step([1, 1])
159+
160+
assert reward_1 != reward_3

0 commit comments

Comments
 (0)