Actor-Critic (AC): Basics

Published by 風雨瀟瀟一書生 on 2020-12-04

1. Algorithm Description

1.1 The Generalized Gradient Formula

From the PG algorithm in the previous chapter, the quantity that policy-gradient methods care about is:

\nabla_{\theta} J(\theta)=\mathbb{E}_{\tau}\left[\sum_{t=0}^{\infty} \nabla_{\theta} \log p\left(a_{t} \mid s_{t}, \theta\right) R(\tau)\right], \quad R(\tau)=\sum_{t=0}^{\infty} r_{t}

In its general form, the policy gradient can therefore be written as:

g=\mathbb{E}\left[\sum_{t=0}^{\infty} \Psi_{t}\, \nabla_{\theta} \log p\left(a_{t} \mid s_{t}, \theta\right)\right]

where \Psi_{t} can take any of the following forms:

1. \Psi_{t}=\sum_{t=0}^{\infty} r_{t}: the total return of the trajectory.

2. \Psi_{t}=\sum_{t^{\prime}=t}^{\infty} r_{t^{\prime}}: the return following action a_{t} (counting only from this action onward, since earlier rewards are unaffected by it).

3. \Psi_{t}=\sum_{t^{\prime}=t}^{\infty} r_{t^{\prime}}-b\left(s_{t}\right): the cumulative return with a baseline subtracted.

4. \Psi_{t}=Q^{\pi}\left(s_{t}, a_{t}\right): the state-action value function.

5. \Psi_{t}=A^{\pi}\left(s_{t}, a_{t}\right)=Q^{\pi}\left(s_{t}, a_{t}\right)-V^{\pi}\left(s_{t}\right): the advantage function.

6. \Psi_{t}=r_{t}+\gamma V^{\pi}\left(s_{t+1}\right)-V^{\pi}\left(s_{t}\right): the TD error.

Looking at the generalized policy gradient in two parts: \Psi_{t} is an evaluator (the critic) that scores how good the policy p\left(a_{t} \mid s_{t}, \theta\right) (the actor) is. PG uses the cumulative return of a whole trajectory to evaluate the policy, but that evaluation is coarse and imprecise. Below, the TD error (form 6) is used to build the critic and the actor.
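With form (6), and writing V for the critic's estimate of the state value, the per-step gradient estimate used in the rest of this post becomes:

\delta_{t}=r_{t}+\gamma V\left(s_{t+1}\right)-V\left(s_{t}\right), \qquad \nabla_{\theta} J(\theta) \approx \delta_{t}\, \nabla_{\theta} \log p\left(a_{t} \mid s_{t}, \theta\right)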

For the actor, the network input is the current state and the output is the probability of each action (the same as the policy network in PG). The action a_{t} sampled from the policy network is applied to the environment, which produces the next state s_{t+1} and the reward r_{t}.

For the critic, the intuition is that it evaluates how good the chosen action was. Using the TD error (form 6): the critic network maps the current state to a value estimate V(s); given the reward and the next state, it then computes the evaluation of the action, namely the TD error \delta_{t}=r_{t}+\gamma V\left(s_{t+1}\right)-V\left(s_{t}\right).

Putting the two together: the critic produces an evaluation of the action (the TD error), which is fed into the actor's update (note how the actor's loss is constructed).
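The interaction can be summarized by the following minimal sketch; the actor and critic objects here are hypothetical stand-ins with the same interface as the classes implemented in Section 2:

# One-step actor-critic loop (sketch): the critic scores each transition with
# the TD error, and the actor is updated with that score immediately.
s = env.reset()
while True:
    a = actor.choose_action(s)           # actor: state -> sampled action
    s_, r, done, info = env.step(a)      # environment returns next state and reward
    td_error = critic.learn(s, r, s_)    # critic: compute delta_t and update V(s)
    actor.learn(s, a, td_error)          # actor: log-probability weighted by delta_t
    s = s_
    if done:
        break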

1.2 Network Losses

Actor: maximize the TD-error-weighted log-probability, i.e. minimize

L_{\text{actor}}=-\log p\left(a_{t} \mid s_{t}, \theta\right)\, \delta_{t}

Critic: minimize the squared TD error

L_{\text{critic}}=\delta_{t}^{2}=\left(r_{t}+\gamma V\left(s_{t+1}\right)-V\left(s_{t}\right)\right)^{2}
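The TD error is a valid stand-in for the advantage because, taking the expectation over the next state under the true value function V^{\pi}, it equals the advantage:

\mathbb{E}\left[\delta_{t} \mid s_{t}, a_{t}\right]=\mathbb{E}\left[r_{t}+\gamma V^{\pi}\left(s_{t+1}\right) \mid s_{t}, a_{t}\right]-V^{\pi}\left(s_{t}\right)=Q^{\pi}\left(s_{t}, a_{t}\right)-V^{\pi}\left(s_{t}\right)=A^{\pi}\left(s_{t}, a_{t}\right)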

 

2. Code

"""
Using:
tensorflow 1.0
gym 0.8.0
"""

import numpy as np
import tensorflow as tf
import gym
import os
import pandas as pd
from matplotlib import pyplot as plt
'''
The TD error is used as the evaluation (advantage) signal.

CartPole environment
# two actions: push the cart to the left or to the right
# the state is four-dimensional
'''

np.random.seed(2)
tf.set_random_seed(2)  # reproducible

# Superparameters
OUTPUT_GRAPH = True
MAX_EPISODE = 3000
DISPLAY_REWARD_THRESHOLD = 300  # renders environment if total episode reward is greater than this threshold
SAVE_REWARD_THRESHOLD = 600
MAX_EP_STEPS = 1000   # maximum time step in one episode
RENDER = False  # rendering wastes time
GAMMA = 0.9     # reward discount in TD error
LR_A = 0.001    # learning rate for actor
LR_C = 0.01     # learning rate for critic

env = gym.make('CartPole-v0')
env.seed(1)  # reproducible
env = env.unwrapped

N_F = env.observation_space.shape[0]
N_A = env.action_space.n

class Actor(object):
    def __init__(self, sess, n_features, n_actions, lr=0.001):
        self.sess = sess

        self.s = tf.placeholder(tf.float32, [1, n_features], "state")
        self.a = tf.placeholder(tf.int32, None, "act")
        self.td_error = tf.placeholder(tf.float32, None, "td_error")  # TD error supplied by the critic

        with tf.variable_scope('Actor'):
            l1 = tf.layers.dense(
                inputs=self.s,
                units=20,    # number of hidden units
                activation=tf.nn.relu,
                kernel_initializer=tf.random_normal_initializer(0., .1),    # weights
                bias_initializer=tf.constant_initializer(0.1),  # biases
                name='l1'
            )

            self.acts_prob = tf.layers.dense(
                inputs=l1,
                units=n_actions,    # output units
                activation=tf.nn.softmax,   # get action probabilities
                kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
                bias_initializer=tf.constant_initializer(0.1),  # biases
                name='acts_prob'
            )

        with tf.variable_scope('exp_v'):
            log_prob = tf.log(self.acts_prob[0, self.a])
            self.exp_v = tf.reduce_mean(log_prob * self.td_error)  # advantage (TD_error) guided loss
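            # note: self.td_error is a placeholder, so no gradient flows from the actor's
            # loss back into the critic; the TD error only acts as a weight on log_prob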

        with tf.variable_scope('train'):
            self.train_op = tf.train.AdamOptimizer(lr).minimize(-self.exp_v)  # minimize(-exp_v) = maximize(exp_v)

    def learn(self, s, a, td):
        s = s[np.newaxis, :]
        feed_dict = {self.s: s, self.a: a, self.td_error: td}
        _, exp_v = self.sess.run([self.train_op, self.exp_v], feed_dict)
        return exp_v

    def choose_action(self, s):
        s = s[np.newaxis, :]
        probs = self.sess.run(self.acts_prob, {self.s: s})   # get probabilities for all actions
        return np.random.choice(np.arange(probs.shape[1]), p=probs.ravel())   # return an int sampled from the action distribution

class Critic(object):
    def __init__(self, sess, n_features, lr=0.01):
        self.sess = sess

        self.s = tf.placeholder(tf.float32, [1, n_features], "state")
        self.v_ = tf.placeholder(tf.float32, [1, 1], "v_next")  # V(s_{t+1}), computed by a separate forward pass in learn()
        self.r = tf.placeholder(tf.float32, None, 'r')           # immediate reward

        with tf.variable_scope('Critic'):
            l1 = tf.layers.dense(
                inputs=self.s,
                units=20,  # number of hidden units
                activation=tf.nn.relu,  # or None
                # said to have to be linear to ensure the actor converges,
                # but a linear approximator hardly seems to learn the correct Q
                kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
                bias_initializer=tf.constant_initializer(0.1),  # biases
                name='l1'
            )

            self.v = tf.layers.dense(
                inputs=l1,
                units=1,  # output units
                activation=None,
                kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
                bias_initializer=tf.constant_initializer(0.1),  # biases
                name='V'
            )

        with tf.variable_scope('squared_TD_error'):
            self.td_error = self.r + GAMMA * self.v_ - self.v
            self.loss = tf.square(self.td_error)    # TD_error = (r + gamma * V_next) - V_eval
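            # minimizing the squared TD error pushes V(s) toward the bootstrapped target r + GAMMA * V(s_)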
        with tf.variable_scope('train'):
            self.train_op = tf.train.AdamOptimizer(lr).minimize(self.loss)

    def learn(self, s, r, s_):
        s, s_ = s[np.newaxis, :], s_[np.newaxis, :]

        v_ = self.sess.run(self.v, {self.s: s_})
        td_error, _ = self.sess.run([self.td_error, self.train_op],
                                          {self.s: s, self.v_: v_, self.r: r})
        return td_error

sess = tf.Session()

actor = Actor(sess, n_features=N_F, n_actions=N_A, lr=LR_A)
critic = Critic(sess, n_features=N_F, lr=LR_C)     # we need a good teacher, so the teacher should learn faster than the actor

saver = tf.train.Saver()
sess.run(tf.global_variables_initializer())

# Restore the most recent checkpoint from ./logs/ if one exists
if os.path.exists('./logs/') and len(os.listdir('./logs/')) != 0:
    meta_files = sorted(f for f in os.listdir('./logs/') if f.endswith('.meta'))
    if meta_files:
        file_name = './logs/' + meta_files[-1][:-5]  # drop the '.meta' suffix
        saver.restore(sess, file_name)

if OUTPUT_GRAPH:
    tf.summary.FileWriter("logs/", sess.graph)

#   Training
sum_episode_reward = 0
Episode_rewards = []
res = []
for i_episode in range(MAX_EPISODE):
    s = env.reset()
    t = 0
    track_r = []
    while True:
        # if RENDER: env.render()
        a = actor.choose_action(s)

        s_, r, done, info = env.step(a)

        if done: r = -20    # reward shaping: fixed penalty when the episode terminates (pole fell or cart left the track)

        track_r.append(r)

        # one-step update: learn from every transition immediately
        td_error = critic.learn(s, r, s_)  # gradient = grad[r + gamma * V(s_) - V(s)]
        actor.learn(s, a, td_error)     # true_gradient = grad[logPi(s,a) * td_error]

        s = s_
        t += 1

        if done or t >= MAX_EP_STEPS:   # episode ended or the maximum number of steps was reached
            ep_rs_sum = sum(track_r)    # total return of this episode

            if 'running_reward' not in globals():   # globals() returns the dict of global variables
                running_reward = ep_rs_sum
            else:
                # running_reward = running_reward * 0.95 + ep_rs_sum * 0.05
                running_reward = ep_rs_sum    # score of this episode
            # if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True  # rendering

            if ep_rs_sum > SAVE_REWARD_THRESHOLD:
                saver.save(sess, save_path='./logs/AC_CartPole_net.ckpt')
                SAVE_REWARD_THRESHOLD = ep_rs_sum

            Episode_rewards.append(running_reward)
            res.append([i_episode, running_reward])
            # print("episode:", i_episode, "  reward:", int(running_reward))
            break

    sum_episode_reward = sum_episode_reward + running_reward
    if i_episode > 0 and i_episode % 10 == 0:

        print("episode:", i_episode, "  reward:", running_reward)
        # print("episode:", i_episode, "  Average reward:", sum_episode_reward / 10)
        sum_episode_reward = 0

# Episode_rewards holds the score of every episode, MAX_EPISODE entries in total
max_ep_reward = []
min_ep_reward = []
ave_ep_reward = []
num = int(MAX_EPISODE // 10)   # integer quotient
for i_ep in range(num):
    i_ep = int(i_ep)
    max_ep_reward.append(np.max(Episode_rewards[i_ep*10:(i_ep+1)*10]))
    min_ep_reward.append(np.min(Episode_rewards[i_ep*10:(i_ep+1)*10]))
    ave_ep_reward.append(np.mean(Episode_rewards[i_ep*10:(i_ep+1)*10]))

#  Testing
print("#########  Testing  #########")
test_episodes = 20
test_MAX_EP_STEPS = 2000
test_Episode_rewards = []
# restore the best model saved during training
file_name = './logs/AC_CartPole_net.ckpt'
saver.restore(sess, file_name)

for i_test_episode in range(test_episodes):
    s = env.reset()
    t = 0
    track_r = []
    while True:
        # if RENDER: env.render()
        a = actor.choose_action(s)
        s_, r, done, info = env.step(a)
        if done: r = -20
        track_r.append(r)
        s = s_
        t += 1

        if done or t >= test_MAX_EP_STEPS:   # episode ended or the maximum number of steps was reached
            test_ep_rs_sum = sum(track_r)    # total return of this episode

            if 'running_reward' not in globals():   # globals() returns the dict of global variables
                running_reward = test_ep_rs_sum
            else:
                # running_reward = running_reward * 0.95 + ep_rs_sum * 0.05
                running_reward = test_ep_rs_sum    # score of this test episode
            # if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True  # rendering
            test_Episode_rewards.append(running_reward)
            print("Test_episode:", i_test_episode, "Test  reward:", running_reward)
            break


plt.figure(1)
plt.title("Training")
res = np.array(res)
plt.plot(res[:,0],res[:,1])

plt.figure(2)
plt.title("Training (per 10 episodes)")
plt.plot(max_ep_reward, label='max')
plt.plot(min_ep_reward, label='min')
plt.plot(ave_ep_reward, label='mean')
plt.legend()

plt.figure(3)
plt.title("Testing")
plt.plot(test_Episode_rewards)

plt.show()

Notes:

1. The two networks take the same input, the current state; they differ in their outputs and in how their losses are constructed.

2. Pay particular attention to how the actor's loss is constructed: the TD error enters only as a weight on the log-probability, and no gradient flows through it into the critic.

3. The training results are not ideal: the episode return does not converge and fluctuates considerably.

4. As both the algorithm and the code show, AC updates the networks after every single environment step; the two networks are trained alternately, in step with each other.