2021-04-13-Tue-Study
Apr 13, 2021
study
RL
Finding the best policy when you don't know the MDP
Policy
The optimal policy depends on the environment.
Target policy
The policy we are trying to learn and improve
Behavior policy
The policy that actually interacts with the environment to collect experience
On Policy
When the target policy and the behavior policy are the same
ex) SARSA
Off Policy
When the target policy and the behavior policy are different
1. You can reuse past experiences.
2. You can learn from human data.
3. One-to-many and many-to-one learning is possible (one agent's experience can train several policies, and many agents' experience can train one policy).
ex) Q Learning
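A minimal sketch of the difference (the helper names and the dict-based q table here are illustrative, not taken from the grid-world code below): SARSA bootstraps from the action its own behavior policy actually takes next, while Q-learning bootstraps from the greedy (max) action, no matter how the experience was collected.
import random
from collections import defaultdict

GAMMA = 1.0  # discount factor (the grid-world examples below effectively use gamma = 1)

def epsilon_greedy(q, s, eps, n_actions=4):
    # behavior policy: random with probability eps, greedy otherwise
    if random.random() < eps:
        return random.randint(0, n_actions - 1)
    return max(range(n_actions), key=lambda a: q[(s, a)])

def sarsa_target(q, r, s_prime, eps):
    # on-policy target: uses the action the behavior policy will actually take in s'
    a_prime = epsilon_greedy(q, s_prime, eps)
    return r + GAMMA * q[(s_prime, a_prime)]

def q_learning_target(q, r, s_prime, n_actions=4):
    # off-policy target: uses the best (greedy) action in s', whatever the behavior policy does
    return r + GAMMA * max(q[(s_prime, a)] for a in range(n_actions))

q = defaultdict(float)  # q[(state, action)] -> value, all zero at the start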
Monte Carlo Control
Policy iteration is problematic in the model-free setting:
1. Iterative policy evaluation cannot be used in the evaluation phase.
Its update (the step-2 form of the Bellman expectation equation) requires plugging in the MDP's reward function and transition probabilities, which we do not know when model-free.
2. A greedy policy cannot be constructed in the policy improvement phase.
Without the transition model we do not know which state each action leads to from the current state, so the greedy action cannot be computed from v(s) alone.
Resolution
1. Use Monte Carlo (or Temporal Difference) in the policy evaluation stage.
2. Use q(s,a) instead of v(s) in the policy improvement stage.
With q(s,a), the greedy action can be selected without knowing anything about the MDP.
Synthesis
Estimate q(s,a) with MC, then improve the policy greedily on top of it (the update rule is sketched below).
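Written as an update rule (a standard formulation; alpha is the learning rate, G_t the return observed from time t onward, gamma the discount factor; the MC code below uses alpha = 0.01 and gamma = 1):
Q(s_t, a_t) \leftarrow Q(s_t, a_t) + \alpha \left( G_t - Q(s_t, a_t) \right), \qquad G_t = r_{t+1} + \gamma r_{t+2} + \gamma^2 r_{t+3} + \cdots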
Exploration
E-greedy
For the agent to find the optimal policy, it must explore the states of the given MDP sufficiently.
E-greedy prevents some states from never being visited during training;
in other words, it is a mechanism that guarantees a certain amount of exploration.
With a small probability epsilon, choose an action at random;
with the remaining probability (1 - epsilon), choose the greedy action as usual.
Early in training the agent needs to gather enough information about the environment;
once learning has progressed, it should mostly exploit the information it has already gathered.
--> Decaying E-greedy
Used in the policy improvement phase
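A minimal sketch of decaying e-greedy action selection (standalone illustrative functions; the agents below implement the same idea inside select_action and anneal_eps):
import random

def epsilon_greedy_action(q_values, eps):
    # q_values: list of Q estimates for each action in the current state
    if random.random() < eps:
        return random.randint(0, len(q_values) - 1)  # explore with probability eps
    return max(range(len(q_values)), key=lambda a: q_values[a])  # exploit: pick the greedy action

def decay_eps(eps, step=0.03, floor=0.1):
    # anneal eps after each episode, but keep a minimum amount of exploration
    return max(eps - step, floor)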
Loop
1. Run one episode to gather experience.
2. Update the q(s,a) table with the collected data.
3. Build an improved e-greedy policy from the updated table.
MC Code
import random
import numpy as np

class GridWorld():
    # 5x7 grid; walls occupy column 2 (rows 0-2) and column 4 (rows 2-4), the goal is at (4, 6)
    def __init__(self):
        self.x = 0
        self.y = 0

    def step(self, a):
        # Action 0: Left, Action 1: Up, Action 2: Right, Action 3: Down
        if a == 0:
            self.move_left()
        elif a == 1:
            self.move_up()
        elif a == 2:
            self.move_right()
        elif a == 3:
            self.move_down()

        reward = -1  # Reward is always fixed at -1
        done = self.is_done()
        return (self.x, self.y), reward, done

    def move_left(self):
        if self.y == 0:
            pass
        elif self.y == 3 and self.x in [0, 1, 2]:
            pass
        elif self.y == 5 and self.x in [2, 3, 4]:
            pass
        else:
            self.y -= 1

    def move_right(self):
        if self.y == 1 and self.x in [0, 1, 2]:
            pass
        elif self.y == 3 and self.x in [2, 3, 4]:
            pass
        elif self.y == 6:
            pass
        else:
            self.y += 1

    def move_up(self):
        if self.x == 0:
            pass
        elif self.x == 3 and self.y == 2:
            pass
        else:
            self.x -= 1

    def move_down(self):
        if self.x == 4:
            pass
        elif self.x == 1 and self.y == 4:
            pass
        else:
            self.x += 1

    def is_done(self):
        if self.x == 4 and self.y == 6:  # End when the goal (4, 6) is reached
            return True
        else:
            return False

    def reset(self):
        self.x = 0
        self.y = 0
        return (self.x, self.y)

class QAgent():
    def __init__(self):
        self.q_table = np.zeros((5, 7, 4))  # Table of q values, all initialized to zero
        self.eps = 0.9
        self.alpha = 0.01

    def select_action(self, s):
        # Select an action with eps-greedy
        x, y = s
        coin = random.random()
        if coin < self.eps:
            action = random.randint(0, 3)
        else:
            action_val = self.q_table[x, y, :]
            action = np.argmax(action_val)
        return action

    def update_table(self, history):
        # Receive the history of one episode and update the q table
        cum_reward = 0
        for transition in history[::-1]:
            s, a, r, s_prime = transition
            x, y = s
            # Update using the Monte Carlo method
            self.q_table[x, y, a] = self.q_table[x, y, a] + self.alpha * (cum_reward - self.q_table[x, y, a])
            cum_reward = cum_reward + r

    def anneal_eps(self):
        self.eps -= 0.03
        self.eps = max(self.eps, 0.1)

    def show_table(self):
        # Show which action has the highest q value at each location
        q_lst = self.q_table.tolist()
        data = np.zeros((5, 7))
        for row_idx in range(len(q_lst)):
            row = q_lst[row_idx]
            for col_idx in range(len(row)):
                col = row[col_idx]
                action = np.argmax(col)
                data[row_idx, col_idx] = action
        print(data)

def main():
    env = GridWorld()
    agent = QAgent()

    for n_epi in range(1000):  # train for a total of 1,000 episodes
        done = False
        history = []

        s = env.reset()
        while not done:  # until the episode ends
            a = agent.select_action(s)
            s_prime, r, done = env.step(a)
            history.append((s, a, r, s_prime))
            s = s_prime
        agent.update_table(history)  # update the agent with the episode history
        agent.anneal_eps()

    agent.show_table()  # print the result of training

if __name__ == '__main__':
    main()
Temporal Difference Control
SARSA
An approach that estimates Q using TD.
Choose action a in state s, receive reward r, arrive at state s', and then choose the next action a' in s': the tuple (S, A, R, S', A') is what gives SARSA its name.
The update cycle differs from MC:
update_table is called at the end of every step, not at the end of an episode.
It must still be ensured that the various states are visited sufficiently many times.
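The SARSA update applied at each step (standard form; the code below uses a learning rate of 0.1 and an implicit gamma of 1):
Q(s, a) \leftarrow Q(s, a) + \alpha \left( r + \gamma\, Q(s', a') - Q(s, a) \right)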
Q Learning
Finding the optimal policy using TD, with an off-policy update.
It underlies research that combines reinforcement learning with deep learning to great effect,
e.g. the 2015 paper "Human-level control through deep reinforcement learning" (DQN).
Target policy: greedy
Behavior policy: e-greedy
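The only change from SARSA is the target: Q-learning bootstraps from the best action in s' rather than the action the behavior policy actually takes (standard form, matching the Q-learning code further below):
Q(s, a) \leftarrow Q(s, a) + \alpha \left( r + \gamma \max_{a'} Q(s', a') - Q(s, a) \right)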
SARSA Code
import random
import numpy as np

class GridWorld():
    # 5x7 grid; walls occupy column 2 (rows 0-2) and column 4 (rows 2-4), the goal is at (4, 6)
    def __init__(self):
        self.x = 0
        self.y = 0

    def step(self, a):
        # Action 0: Left, Action 1: Up, Action 2: Right, Action 3: Down
        if a == 0:
            self.move_left()
        elif a == 1:
            self.move_up()
        elif a == 2:
            self.move_right()
        elif a == 3:
            self.move_down()

        reward = -1  # Reward is always fixed at -1
        done = self.is_done()
        return (self.x, self.y), reward, done

    def move_left(self):
        if self.y == 0:
            pass
        elif self.y == 3 and self.x in [0, 1, 2]:
            pass
        elif self.y == 5 and self.x in [2, 3, 4]:
            pass
        else:
            self.y -= 1

    def move_right(self):
        if self.y == 1 and self.x in [0, 1, 2]:
            pass
        elif self.y == 3 and self.x in [2, 3, 4]:
            pass
        elif self.y == 6:
            pass
        else:
            self.y += 1

    def move_up(self):
        if self.x == 0:
            pass
        elif self.x == 3 and self.y == 2:
            pass
        else:
            self.x -= 1

    def move_down(self):
        if self.x == 4:
            pass
        elif self.x == 1 and self.y == 4:
            pass
        else:
            self.x += 1

    def is_done(self):
        if self.x == 4 and self.y == 6:  # End when the goal (4, 6) is reached
            return True
        else:
            return False

    def reset(self):
        self.x = 0
        self.y = 0
        return (self.x, self.y)

class QAgent():
    def __init__(self):
        self.q_table = np.zeros((5, 7, 4))  # Likewise, initialize the q table to 0
        self.eps = 0.9

    def select_action(self, s):
        # Select an action with eps-greedy
        x, y = s
        coin = random.random()
        if coin < self.eps:
            action = random.randint(0, 3)
        else:
            action_val = self.q_table[x, y, :]
            action = np.argmax(action_val)
        return action

    def update_table(self, transition):
        s, a, r, s_prime = transition
        x, y = s
        next_x, next_y = s_prime
        a_prime = self.select_action(s_prime)  # action to be selected in s' (not an action actually taken yet)
        # SARSA update rule
        self.q_table[x, y, a] = self.q_table[x, y, a] + 0.1 * (r + self.q_table[next_x, next_y, a_prime] - self.q_table[x, y, a])

    def anneal_eps(self):
        self.eps -= 0.03
        self.eps = max(self.eps, 0.1)

    def show_table(self):
        # Show which action has the highest q value at each location
        q_lst = self.q_table.tolist()
        data = np.zeros((5, 7))
        for row_idx in range(len(q_lst)):
            row = q_lst[row_idx]
            for col_idx in range(len(row)):
                col = row[col_idx]
                action = np.argmax(col)
                data[row_idx, col_idx] = action
        print(data)

def main():
    env = GridWorld()
    agent = QAgent()

    for n_epi in range(1000):
        done = False

        s = env.reset()
        while not done:
            a = agent.select_action(s)
            s_prime, r, done = env.step(a)
            agent.update_table((s, a, r, s_prime))  # update after every step
            s = s_prime
        agent.anneal_eps()

    agent.show_table()

if __name__ == '__main__':
    main()
Q-Learning Code
import random
import numpy as np

class GridWorld():
    # 5x7 grid; walls occupy column 2 (rows 0-2) and column 4 (rows 2-4), the goal is at (4, 6)
    def __init__(self):
        self.x = 0
        self.y = 0

    def step(self, a):
        # Action 0: Left, Action 1: Up, Action 2: Right, Action 3: Down
        if a == 0:
            self.move_left()
        elif a == 1:
            self.move_up()
        elif a == 2:
            self.move_right()
        elif a == 3:
            self.move_down()

        reward = -1  # Reward is always fixed at -1
        done = self.is_done()
        return (self.x, self.y), reward, done

    def move_left(self):
        if self.y == 0:
            pass
        elif self.y == 3 and self.x in [0, 1, 2]:
            pass
        elif self.y == 5 and self.x in [2, 3, 4]:
            pass
        else:
            self.y -= 1

    def move_right(self):
        if self.y == 1 and self.x in [0, 1, 2]:
            pass
        elif self.y == 3 and self.x in [2, 3, 4]:
            pass
        elif self.y == 6:
            pass
        else:
            self.y += 1

    def move_up(self):
        if self.x == 0:
            pass
        elif self.x == 3 and self.y == 2:
            pass
        else:
            self.x -= 1

    def move_down(self):
        if self.x == 4:
            pass
        elif self.x == 1 and self.y == 4:
            pass
        else:
            self.x += 1

    def is_done(self):
        if self.x == 4 and self.y == 6:  # End when the goal (4, 6) is reached
            return True
        else:
            return False

    def reset(self):
        self.x = 0
        self.y = 0
        return (self.x, self.y)

class QAgent():
    def __init__(self):
        self.q_table = np.zeros((5, 7, 4))  # Likewise, initialize the q table to 0
        self.eps = 0.9

    def select_action(self, s):
        # Select an action with eps-greedy
        x, y = s
        coin = random.random()
        if coin < self.eps:
            action = random.randint(0, 3)
        else:
            action_val = self.q_table[x, y, :]
            action = np.argmax(action_val)
        return action

    def update_table(self, transition):
        s, a, r, s_prime = transition
        x, y = s
        next_x, next_y = s_prime
        # Q-learning update rule: bootstrap from the best action in s'
        self.q_table[x, y, a] = self.q_table[x, y, a] + 0.1 * (r + np.amax(self.q_table[next_x, next_y, :]) - self.q_table[x, y, a])

    def anneal_eps(self):
        self.eps -= 0.01  # In Q-learning, epsilon is annealed more slowly
        self.eps = max(self.eps, 0.2)

    def show_table(self):
        # Show which action has the highest q value at each location
        q_lst = self.q_table.tolist()
        data = np.zeros((5, 7))
        for row_idx in range(len(q_lst)):
            row = q_lst[row_idx]
            for col_idx in range(len(row)):
                col = row[col_idx]
                action = np.argmax(col)
                data[row_idx, col_idx] = action
        print(data)

def main():
    env = GridWorld()
    agent = QAgent()

    for n_epi in range(1000):
        done = False

        s = env.reset()
        while not done:
            a = agent.select_action(s)
            s_prime, r, done = env.step(a)
            agent.update_table((s, a, r, s_prime))  # update after every step
            s = s_prime
        agent.anneal_eps()

    agent.show_table()

if __name__ == '__main__':
    main()