2021-04-13-Tue-Study


RL

Finding the best policy when you don't know the MDP

Policy

The optimal policy depends on the environment.
Target policy
    The policy we are trying to learn and improve
Behavior policy
    The policy that gathers experience by interacting with the real environment
On-policy
    The target policy and the behavior policy are the same
    ex) SARSA
Off-policy
    The target policy and the behavior policy are different
    1. Past experiences can be reused (see the sketch after this list).
    2. It can learn from human-generated data.
    3. One-to-many and many-to-one learning are possible.
    ex) Q-learning
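
The sketch below illustrates advantage 1: because the Q-learning target r + gamma * max_a' q(s', a') does not depend on which action the behavior policy actually took at s', transitions collected earlier (or by a person) can simply be replayed. This is only a minimal sketch; the replay list, grid shape, and step sizes are illustrative and not taken from the code further down.

import numpy as np

q = np.zeros((5, 7, 4))   # q[x, y, a]: 5x7 grid, 4 actions (illustrative shape)
gamma, alpha = 1.0, 0.1

# A few stored transitions (s, a, r, s') gathered earlier by any behavior policy
replay = [((0, 0), 2, -1, (0, 1)),
          ((0, 1), 3, -1, (1, 1))]

for (x, y), a, r, (nx, ny) in replay:
    target = r + gamma * np.max(q[nx, ny, :])    # greedy target policy at s'
    q[x, y, a] += alpha * (target - q[x, y, a])  # move q(s,a) toward the target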

Monte Carlo Control

Policy iteration is problematic to use when model-free:
    1. Iterative policy evaluation cannot be used in the evaluation phase.
        Step 2 of the Bellman expectation equation requires plugging in information about the MDP
            (the reward function and transition probabilities), which we do not have.
    2. A greedy policy cannot be created in the policy improvement phase.
        When selecting each action in the current state, we do not know which state comes next.

Resolution
    1. Use Monte Carlo or Temporal Difference in the policy evaluation stage.
    2. Use q(s,a) instead of v(s) in the policy improvement stage.
        With q(s,a), a greedy action can be selected without knowing anything about the MDP.

Synthesis
    In short, find q(s,a) using MC (a minimal sketch of the update follows below).
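
A minimal sketch of that evaluation step, assuming a discount factor of 1 and a small learning rate alpha; the dict-based q_table and the mc_update name are illustrative, not from the code below.

def mc_update(q_table, episode, alpha=0.01):
    # episode: list of (state, action, reward) tuples in the order they occurred
    G = 0
    for s, a, r in reversed(episode):              # walk the episode backwards
        G = r + G                                  # return observed from (s, a) onward
        old = q_table.get((s, a), 0.0)
        q_table[(s, a)] = old + alpha * (G - old)  # nudge q(s,a) toward the sampled return
    return q_table

# e.g. q = mc_update({}, [((0, 0), 2, -1), ((0, 1), 2, -1)])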

Exploration
    ε-greedy
        For the agent to find the optimal solution,
            it must explore the states of the given MDP sufficiently;
            otherwise some states may never be visited during training.
        In other words, an algorithm that guarantees a certain amount of exploration.

        With a small probability ε, choose an action at random;
            with the remaining probability (1-ε), select the greedy action as usual.

        At first, the agent needs to gather enough information about the environment;
            once learning has progressed to some extent,
            it should make the best choice based on the information already obtained.
            --> Decaying ε-greedy
    Used in the policy improvement phase (a minimal sketch follows below).
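
A minimal sketch of decaying ε-greedy selection; the start value, decay step, and floor are illustrative (the code further down uses 0.9, 0.03, and 0.1).

import random
import numpy as np

def select_action(q_row, eps):
    # q_row: q values of all actions in the current state
    if random.random() < eps:                    # explore with probability eps
        return random.randint(0, len(q_row) - 1)
    return int(np.argmax(q_row))                 # otherwise act greedily

def anneal(eps, step=0.03, floor=0.1):
    # decaying epsilon-greedy: shrink eps toward a floor after each episode
    return max(eps - step, floor)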

Algorithm
    Loop
        Gather experience for one episode
        Update the q(s,a) table with the experienced data
        Create an improved ε-greedy policy using the updated table

MC Coding

import random
import numpy as np

class GridWorld():
    def __init__(self):
        self.x=0
        self.y=0
    
    def step(self, a):
        # Action 0: Left, Action 1: Up, Action 2: Right, Action 3: Down
        if a==0:
            self.move_left()
        elif a==1:
            self.move_up()
        elif a==2:
            self.move_right()
        elif a==3:
            self.move_down()

        reward = -1 # Reward is always fixed at -1
        done = self.is_done()
        return (self.x, self.y), reward, done

    def move_left(self):
        if self.y==0:
            pass
        elif self.y==3 and self.x in [0,1,2]:
            pass
        elif self.y==5 and self.x in [2,3,4]:
            pass
        else:
            self.y -= 1

    def move_right(self):
        if self.y==1 and self.x in [0,1,2]:
            pass
        elif self.y==3 and self.x in [2,3,4]:
            pass
        elif self.y==6:
            pass
        else:
            self.y += 1
      
    def move_up(self):
        if self.x==0:
            pass
        elif self.x==3 and self.y==2:
            pass
        else:
            self.x -= 1

    def move_down(self):
        if self.x==4:
            pass
        elif self.x==1 and self.y==4:
            pass
        else:
            self.x+=1

    def is_done(self):
        if self.x==4 and self.y==6: # End when the target point (4,6) is reached
            return True
        else:
            return False
      
    def reset(self):
        self.x = 0
        self.y = 0
        return (self.x, self.y)

class QAgent():
    def __init__(self):
        self.q_table = np.zeros((5, 7, 4)) # Variable to store q value. All initialized to zero.
        self.eps = 0.9
        self.alpha = 0.01
        
    def select_action(self, s):
        # Select action with eps-greedy
        x, y = s
        coin = random.random()
        if coin < self.eps:
            action = random.randint(0,3)
        else:
            action_val = self.q_table[x,y,:]
            action = np.argmax(action_val)
        return action

    def update_table(self, history):
        # Receive the history of one episode and update the q table with the MC return
        cum_reward = 0
        for transition in history[::-1]:
            s, a, r, s_prime = transition
            x, y = s
            cum_reward = cum_reward + r # return observed from (s, a) onward (gamma is 1)
            # Monte Carlo update: move q(s,a) toward the observed return
            self.q_table[x,y,a] = self.q_table[x,y,a] + self.alpha * (cum_reward - self.q_table[x,y,a])
    def anneal_eps(self):
        self.eps -= 0.03
        self.eps = max(self.eps, 0.1)

    def show_table(self):
        # Function to show which action had the highest q value at each location
        q_lst = self.q_table.tolist()
        data = np.zeros((5,7))
        for row_idx in range(len(q_lst)):
            row = q_lst[row_idx]
            for col_idx in range(len(row)):
                col = row[col_idx]
                action = np.argmax(col)
                data[row_idx, col_idx] = action
        print(data)
      
def main():
    env = GridWorld()
    agent = QAgent()

    for n_epi in range(1000): # training for a total of 1,000 episodes
        done = False
        history = []

        s = env.reset()
        while not done: # Until the end of an episode
            a = agent.select_action(s)
            s_prime, r, done = env.step(a)
            history.append((s, a, r, s_prime))
            s = s_prime
        agent.update_table(history) # Update agent using history
        agent.anneal_eps()

    agent.show_table() # Print the result of training

if __name__ == '__main__':
    main()

Temporal Difference Control

SARSA

Approach to calculating Q using TD
    Choose action a in state s, receive reward r, arrive at state s',
        and then choose the next action a' in state s': the tuple (s, a, r, s', a') gives SARSA its name.

The update cycle is different from MC:
    update_table is called at the end of every step rather than at the end of an episode (a minimal sketch follows below).
    The various states must still be visited sufficiently many times.
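
A minimal sketch of the one-step SARSA update, assuming gamma = 1 and a dict-based q table; the names are illustrative, and the full grid-world version appears in the SARSA code below.

def sarsa_update(q, s, a, r, s_prime, a_prime, alpha=0.1, gamma=1.0):
    # q: dict keyed by (state, action); missing entries default to 0
    target = r + gamma * q.get((s_prime, a_prime), 0.0)   # bootstrap with the next action's value
    q[(s, a)] = q.get((s, a), 0.0) + alpha * (target - q.get((s, a), 0.0))
    return q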

Q Learning

Finding the best policy using TD.
Combining it with deep learning produced remarkable results:
    see the 2015 paper "Human-level control through deep reinforcement learning".
Target policy: greedy
Behavior policy: ε-greedy (a minimal sketch of the update follows below)
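
A minimal sketch of the Q-learning update under the same assumptions as the SARSA sketch above (gamma = 1, dict-based q table): the target maximizes over the next actions (greedy target policy) even though the data comes from an ε-greedy behavior policy.

def q_learning_update(q, s, a, r, s_prime, actions, alpha=0.1, gamma=1.0):
    # q: dict keyed by (state, action); `actions` lists the actions available at s'
    best_next = max(q.get((s_prime, a2), 0.0) for a2 in actions)  # greedy target policy
    q[(s, a)] = q.get((s, a), 0.0) + alpha * (r + gamma * best_next - q.get((s, a), 0.0))
    return q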

SARSA Code

import random
import numpy as np

class GridWorld():
    def __init__(self):
        self.x=0
        self.y=0
    
    def step(self, a):
        # Action 0: Left, Action 1: Up, Action 2: Right, Action 3: Down
        if a==0:
            self.move_left()
        elif a==1:
            self.move_up()
        elif a==2:
            self.move_right()
        elif a==3:
            self.move_down()

        reward = -1 # Reward is always fixed at -1
        done = self.is_done()
        return (self.x, self.y), reward, done

    def move_left(self):
        if self.y==0:
            pass
        elif self.y==3 and self.x in [0,1,2]:
            pass
        elif self.y==5 and self.x in [2,3,4]:
            pass
        else:
            self.y -= 1

    def move_right(self):
        if self.y==1 and self.x in [0,1,2]:
            pass
        elif self.y==3 and self.x in [2,3,4]:
            pass
        elif self.y==6:
            pass
        else:
            self.y += 1
      
    def move_up(self):
        if self.x==0:
            pass
        elif self.x==3 and self.y==2:
            pass
        else:
            self.x -= 1

    def move_down(self):
        if self.x==4:
            pass
        elif self.x==1 and self.y==4:
            pass
        else:
            self.x+=1

    def is_done(self):
        if self.x==4 and self.y==6: # End when the target point (4,6) is reached
            return True
        else:
            return False
      
    def reset(self):
        self.x = 0
        self.y = 0
        return (self.x, self.y)

class QAgent():
    def __init__(self):
        self.q_table = np.zeros((5, 7, 4)) # Likewise, initialize the Q table to 0
        self.eps = 0.9

    def select_action(self, s):
        # Select action with eps-greedy
        x, y = s
        coin = random.random()
        if coin < self.eps:
            action = random.randint(0,3)
        else:
            action_val = self.q_table[x,y,:]
            action = np.argmax(action_val)
        return action

    def update_table(self, transition):
        s, a, r, s_prime = transition
        x, y = s
        next_x, next_y = s_prime
        a_prime = self.select_action(s_prime) # sample the next action a' at s' (not necessarily the action executed on the next step)
        # SARSA update: bootstrap with q(s', a'); gamma is implicitly 1
        self.q_table[x,y,a] = self.q_table[x,y,a] + 0.1 * (r + self.q_table[next_x,next_y,a_prime] - self.q_table[x,y,a])

    def anneal_eps(self):
        self.eps -= 0.03
        self.eps = max(self.eps, 0.1)

    def show_table(self):
        q_lst = self.q_table.tolist()
        data = np.zeros((5,7))
        for row_idx in range(len(q_lst)):
            row = q_lst[row_idx]
            for col_idx in range(len(row)):
                col = row[col_idx]
                action = np.argmax(col)
                data[row_idx, col_idx] = action
        print(data)

      
def main():
    env = GridWorld()
    agent = QAgent()

    for n_epi in range(1000):
        done = False

        s = env.reset()
        while not done:
            a = agent.select_action(s)
            s_prime, r, done = env.step(a)
            agent.update_table((s,a,r,s_prime))
            s = s_prime
        agent.anneal_eps()

    agent.show_table()


if __name__ == '__main__':
    main()

Q Learning Code

import random
import numpy as np

class GridWorld():
    def __init__(self):
        self.x=0
        self.y=0
    
    def step(self, a):
        # Action 0: Left, Action 1: Up, Action 2: Right, Action 3: Down
        if a==0:
            self.move_left()
        elif a==1:
            self.move_up()
        elif a==2:
            self.move_right()
        elif a==3:
            self.move_down()

        reward = -1 # Reward is always fixed at -1
        done = self.is_done()
        return (self.x, self.y), reward, done

    def move_left(self):
        if self.y==0:
            pass
        elif self.y==3 and self.x in [0,1,2]:
            pass
        elif self.y==5 and self.x in [2,3,4]:
            pass
        else:
            self.y -= 1

    def move_right(self):
        if self.y==1 and self.x in [0,1,2]:
            pass
        elif self.y==3 and self.x in [2,3,4]:
            pass
        elif self.y==6:
            pass
        else:
            self.y += 1
      
    def move_up(self):
        if self.x==0:
            pass
        elif self.x==3 and self.y==2:
            pass
        else:
            self.x -= 1

    def move_down(self):
        if self.x==4:
            pass
        elif self.x==1 and self.y==4:
            pass
        else:
            self.x+=1

    def is_done(self):
        if self.x==4 and self.y==6:
            return True
        else:
            return False
      
    def reset(self):
        self.x = 0
        self.y = 0
        return (self.x, self.y)

class QAgent():
    def __init__(self):
        self.q_table = np.zeros((5, 7, 4)) # Likewise, initialize the Q table to 0
        self.eps = 0.9

    def select_action(self, s):
        # Select action with eps-greedy
        x, y = s
        coin = random.random()
        if coin < self.eps:
            action = random.randint(0,3)
        else:
            action_val = self.q_table[x,y,:]
            action = np.argmax(action_val)
        return action

    def update_table(self, transition):
        s, a, r, s_prime = transition
        x, y = s
        next_x, next_y = s_prime
        # Q-learning update: bootstrap with max_a' q(s', a'), so no next action needs to be sampled; gamma is implicitly 1
        self.q_table[x,y,a] = self.q_table[x,y,a] + 0.1 * (r + np.amax(self.q_table[next_x,next_y,:]) - self.q_table[x,y,a])

    def anneal_eps(self):
        self.eps -= 0.01 # In Q-learning, epsilon is decayed more slowly.
        self.eps = max(self.eps, 0.2)

    def show_table(self):
        q_lst = self.q_table.tolist()
        data = np.zeros((5,7))
        for row_idx in range(len(q_lst)):
            row = q_lst[row_idx]
            for col_idx in range(len(row)):
                col = row[col_idx]
                action = np.argmax(col)
                data[row_idx, col_idx] = action
        print(data)
      

def main():
    env = GridWorld()
    agent = QAgent()

    for n_epi in range(1000):
        done = False

        s = env.reset()
        while not done:
            a = agent.select_action(s)
            s_prime, r, done = env.step(a)
            agent.update_table((s,a,r,s_prime))
            s = s_prime
        agent.anneal_eps()

    agent.show_table()

if __name__ == '__main__':
    main()