Actor-Critic(AC)——基礎篇
一、演算法描述
1.1 廣義梯度公式
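根據上一章PG演算法可以得知,對於策略梯度的方法,我們關注的是目標函式對策略引數的梯度:
$$\nabla_\theta J(\theta) = \mathbb{E}_{\tau\sim\pi_\theta}\Big[\sum_{t=0}^{T} \nabla_\theta \log \pi_\theta(a_t \mid s_t)\, R(\tau)\Big]$$
因此從廣義的策略理論上,策略梯度可以寫成:
$$g = \mathbb{E}\Big[\sum_{t=0}^{\infty} \Psi_t\, \nabla_\theta \log \pi_\theta(a_t \mid s_t)\Big]$$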
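其中 $\Psi_t$(廣義策略梯度中與 $\nabla_\theta \log \pi_\theta(a_t \mid s_t)$ 相乘的權重項)可以表示為以下幾種:
1、$\sum_{t=0}^{\infty} r_t$,軌跡的總回報。
2、$\sum_{t'=t}^{\infty} r_{t'}$,動作後的總回報(從該動作往後算,可以看成前期的對後面沒有影響)
3、$\sum_{t'=t}^{\infty} r_{t'} - b(s_t)$,加入基線的累計回報
4、$Q^{\pi}(s_t, a_t)$,狀態-動作值函式
5、$A^{\pi}(s_t, a_t)$,優勢函式
6、$r_t + \gamma V^{\pi}(s_{t+1}) - V^{\pi}(s_t)$,TD誤差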
將廣義的策略梯度分開來看,其中與 $\nabla_\theta \log \pi_\theta(a_t \mid s_t)$ 相乘的那一項 $\Psi_t$ 是一個評價器(critic),評估策略的好壞(策略為actor)。在PG中使用的是一條軌跡的累計回報來評價策略的好壞,但這種評價比較粗糙,不夠精確。以下以TD誤差為例,構建critic和actor。
對於actor來說,網路的輸入是當前的狀態,網路的輸出是各動作的概率(與PG中的策略網路相同),利用策略網路選出的動作,生成下一刻的狀態和回報值
對於critic來說,直觀的理解是評估動作的好壞。以TD為例(6),critic網路的輸入即為當前狀態,回報值和下一時刻的狀態,網路的輸出應為對動作的評估結果,即TD誤差。
兩者聯合在一起,critic給出動作的評價(TD),輸入到actor的網路中(注意actor網路誤差的構建)。
1.2 網路誤差
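Actor:
$$L_{actor} = -\log \pi_\theta(a_t \mid s_t)\,\delta_t$$
Critic:
$$L_{critic} = \delta_t^{2},\qquad \delta_t = r_t + \gamma V(s_{t+1}) - V(s_t)$$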
二、程式碼
Using:
tensorflow 1.0
gym 0.8.0
"""
import numpy as np
import tensorflow as tf
import gym
import os
import pandas as pd
from matplotlib import pyplot as plt
'''
TD-error作為評價指標
CartPole 環境
# action有兩個,即向左或向右移動小車
# state是四維
'''
# Seed NumPy and TensorFlow so that runs are reproducible.
np.random.seed(2)
tf.set_random_seed(2)

# ---- Hyperparameters -------------------------------------------------------
# Logging / display switches.
OUTPUT_GRAPH = True
RENDER = False  # rendering wastes time
DISPLAY_REWARD_THRESHOLD = 300  # render once the episode reward exceeds this

# Episode limits.
MAX_EPISODE = 3000
MAX_EP_STEPS = 1000  # maximum time steps in one episode

# A checkpoint is written when an episode reward exceeds this value.
SAVE_REWARD_THRESHOLD = 600

# Learning settings.
GAMMA = 0.9   # reward discount used in the TD error
LR_A = 0.001  # learning rate for the actor
LR_C = 0.01   # learning rate for the critic

# ---- Environment -----------------------------------------------------------
env = gym.make('CartPole-v0')
env.seed(1)  # reproducible
env = env.unwrapped

N_F = env.observation_space.shape[0]  # number of state features
N_A = env.action_space.n              # number of discrete actions
class Actor(object):
    """Policy network that maps one state to a distribution over actions.

    The graph takes a single state of shape ``[1, n_features]``, passes it
    through one 20-unit ReLU layer and a softmax layer with ``n_actions``
    outputs. The training objective is ``log pi(a|s) * td_error``, which is
    maximised by minimising its negative with Adam.

    Args:
        sess: TensorFlow session used to run the graph.
        n_features: Size of the state vector.
        n_actions: Number of discrete actions.
        lr: Learning rate for the Adam optimizer.
    """

    def __init__(self, sess, n_features, n_actions, lr=0.001):
        self.sess = sess

        # Inputs: one state, the action taken, and the TD error for that step.
        self.s = tf.placeholder(tf.float32, [1, n_features], "state")
        self.a = tf.placeholder(tf.int32, None, "act")
        self.td_error = tf.placeholder(tf.float32, None, "td_error")

        with tf.variable_scope('Actor'):
            weight_init = tf.random_normal_initializer(0., .1)
            bias_init = tf.constant_initializer(0.1)

            hidden = tf.layers.dense(
                inputs=self.s,
                units=20,
                activation=tf.nn.relu,
                kernel_initializer=weight_init,
                bias_initializer=bias_init,
                name='l1',
            )
            # Softmax turns the output layer into action probabilities.
            self.acts_prob = tf.layers.dense(
                inputs=hidden,
                units=n_actions,
                activation=tf.nn.softmax,
                kernel_initializer=weight_init,
                bias_initializer=bias_init,
                name='acts_prob',
            )

        with tf.variable_scope('exp_v'):
            # Log-probability of the chosen action, weighted by the TD error.
            log_prob = tf.log(self.acts_prob[0, self.a])
            self.exp_v = tf.reduce_mean(log_prob * self.td_error)

        with tf.variable_scope('train'):
            # Minimising -exp_v is the same as maximising exp_v.
            optimizer = tf.train.AdamOptimizer(lr)
            self.train_op = optimizer.minimize(-self.exp_v)

    def learn(self, s, a, td):
        """Run one optimisation step and return the value of ``exp_v``.

        Args:
            s: State array; a leading batch axis is added before feeding.
            a: Action that was taken in ``s``.
            td: TD error supplied for this transition.
        """
        feed = {
            self.s: s[np.newaxis, :],
            self.a: a,
            self.td_error: td,
        }
        fetched = self.sess.run([self.train_op, self.exp_v], feed)
        return fetched[1]

    def choose_action(self, s):
        """Sample an action index from the policy's probabilities for ``s``."""
        state_row = s[np.newaxis, :]
        probs = self.sess.run(self.acts_prob, {self.s: state_row})
        candidates = np.arange(probs.shape[1])
        return np.random.choice(candidates, p=probs.ravel())
class Critic(object):
    """State-value network trained on the squared one-step TD error.

    The graph takes a single state of shape ``[1, n_features]``, passes it
    through one 20-unit ReLU layer and a linear layer with one output, V(s).
    The TD error is ``r + GAMMA * v_next - V(s)``, where ``GAMMA`` is read from
    module scope and ``v_next`` is fed in as a constant.

    Args:
        sess: TensorFlow session used to run the graph.
        n_features: Size of the state vector.
        lr: Learning rate for the Adam optimizer.
    """

    def __init__(self, sess, n_features, lr=0.01):
        self.sess = sess

        # Inputs: current state, value of the next state, and the reward.
        self.s = tf.placeholder(tf.float32, [1, n_features], "state")
        self.v_ = tf.placeholder(tf.float32, [1, 1], "v_next")
        self.r = tf.placeholder(tf.float32, None, 'r')

        with tf.variable_scope('Critic'):
            weight_init = tf.random_normal_initializer(0., .1)
            bias_init = tf.constant_initializer(0.1)

            # Original author's note: the layer would have to be linear to
            # guarantee convergence of the actor, but a linear approximator
            # seems to hardly learn the correct values, so ReLU is used.
            hidden = tf.layers.dense(
                inputs=self.s,
                units=20,
                activation=tf.nn.relu,
                kernel_initializer=weight_init,
                bias_initializer=bias_init,
                name='l1',
            )
            self.v = tf.layers.dense(
                inputs=hidden,
                units=1,
                activation=None,
                kernel_initializer=weight_init,
                bias_initializer=bias_init,
                name='V',
            )

        with tf.variable_scope('squared_TD_error'):
            # TD_error = (r + gamma * V_next) - V_eval
            self.td_error = self.r + GAMMA * self.v_ - self.v
            self.loss = tf.square(self.td_error)

        with tf.variable_scope('train'):
            optimizer = tf.train.AdamOptimizer(lr)
            self.train_op = optimizer.minimize(self.loss)

    def learn(self, s, r, s_):
        """Run one optimisation step and return the TD error.

        Args:
            s: Current state array.
            r: Reward received for the transition.
            s_: Next state array.

        NOTE(review): V(s_) is always evaluated and used as the bootstrap
        value; this method has no way to know whether ``s_`` is terminal.
        """
        current, following = s[np.newaxis, :], s_[np.newaxis, :]
        # Evaluate V(s_) first, then feed it back in as a fixed target value.
        v_next = self.sess.run(self.v, {self.s: following})
        feed = {self.s: current, self.v_: v_next, self.r: r}
        fetched = self.sess.run([self.td_error, self.train_op], feed)
        return fetched[0]
# Build the session and both networks, then resume from a checkpoint if any.
sess = tf.Session()
actor = Actor(sess, n_features=N_F, n_actions=N_A, lr=LR_A)
# We need a good teacher, so the critic should learn faster than the actor.
critic = Critic(sess, n_features=N_F, lr=LR_C)
saver = tf.train.Saver()
sess.run(tf.global_variables_initializer())

# Make sure the log directory exists: it is listed below and is also the
# target directory for checkpoints written during training.
if not os.path.isdir('./logs/'):
    os.makedirs('./logs/')

# Only ".meta" files identify a checkpoint. The directory may also hold other
# files (for example the graph event file written below), so an empty match
# must simply skip the restore instead of failing.
# Sorting makes the choice deterministic when several checkpoints exist.
meta_files = sorted(
    name for name in os.listdir('./logs/') if name.endswith('.meta')
)
if meta_files:
    # Strip the ".meta" suffix to obtain the checkpoint prefix.
    file_name = './logs/' + meta_files[-1][:-len('.meta')]
    saver.restore(sess, file_name)

if OUTPUT_GRAPH:
    tf.summary.FileWriter("logs/", sess.graph)
# Training: the critic and the actor are both updated after every single step.
Episode_rewards = []  # total reward of each episode, in episode order
res = []              # [episode index, total reward] rows used for plotting

for i_episode in range(MAX_EPISODE):
    s = env.reset()
    t = 0
    track_r = []
    while True:
        a = actor.choose_action(s)
        s_, r, done, info = env.step(a)
        if done:
            r = -20  # penalise the step that ends the episode
        track_r.append(r)

        # One-step update:
        #   critic: gradient = grad[r + gamma * V(s_) - V(s)]
        #   actor:  true_gradient = grad[logPi(s, a) * td_error]
        td_error = critic.learn(s, r, s_)
        actor.learn(s, a, td_error)

        s = s_
        t += 1
        # Stop when the simulation ends or the step limit is reached.
        if done or t >= MAX_EP_STEPS:
            break

    ep_rs_sum = sum(track_r)    # total reward of this trajectory
    running_reward = ep_rs_sum  # score of this episode (no smoothing applied)

    # Keep only the best model: save when the reward beats the current
    # threshold, then raise the threshold to that reward.
    if ep_rs_sum > SAVE_REWARD_THRESHOLD:
        saver.save(sess, save_path='./logs/AC_CartPole_net.ckpt')
        SAVE_REWARD_THRESHOLD = ep_rs_sum

    Episode_rewards.append(running_reward)
    res.append([i_episode, running_reward])

    if i_episode > 0 and i_episode % 10 == 0:
        print("episode:", i_episode, " reward:", running_reward)

# Episode_rewards holds the score of every episode (MAX_EPISODE entries).
# Summarise it in consecutive windows of 10 episodes.
max_ep_reward = []
min_ep_reward = []
ave_ep_reward = []
num = MAX_EPISODE // 10  # number of complete 10-episode windows
for i_ep in range(num):
    window = Episode_rewards[i_ep * 10:(i_ep + 1) * 10]
    max_ep_reward.append(np.max(window))
    min_ep_reward.append(np.min(window))
    ave_ep_reward.append(np.mean(window))
# Testing: run the policy without any learning updates.
print("######### 測試 #########")
test_episodes = 20
test_MAX_EP_STEPS = 2000
test_Episode_rewards = []

# Load the best model saved during training. A checkpoint is only written
# when an episode beats SAVE_REWARD_THRESHOLD, so it may not exist; in that
# case evaluate the weights currently held by the session instead of failing.
file_name = './logs/AC_CartPole_net.ckpt'
if os.path.exists(file_name + '.meta'):
    saver.restore(sess, file_name)
else:
    print("No checkpoint found at", file_name,
          "- testing with the current in-memory weights")

for i_test_episode in range(test_episodes):
    s = env.reset()
    t = 0
    track_r = []
    while True:
        a = actor.choose_action(s)
        s_, r, done, info = env.step(a)
        if done:
            r = -20  # same terminal penalty as in training
        track_r.append(r)
        s = s_
        t += 1
        # Stop when the simulation ends or the step limit is reached.
        if done or t >= test_MAX_EP_STEPS:
            break

    test_ep_rs_sum = sum(track_r)  # total reward of this trajectory
    test_Episode_rewards.append(test_ep_rs_sum)
    print("Test_episode:", i_test_episode, "Test reward:", test_ep_rs_sum)
# Figure 1: reward of every training episode.
res = np.array(res)
episode_index, episode_reward = res[:, 0], res[:, 1]
plt.figure(1)
plt.title("Training")
plt.plot(episode_index, episode_reward)

# Figure 2: max / min / mean reward per 10-episode window.
plt.figure(2)
plt.title("Training")
for curve in (max_ep_reward, min_ep_reward, ave_ep_reward):
    plt.plot(curve)

# Figure 3: reward of every test episode.
plt.figure(3)
plt.title("Testing")
plt.plot(test_Episode_rewards)

plt.show()
注意事項:
1、兩個網路的輸入層結構是一樣的,都是當前的狀態,不同在於各自的輸出和誤差的構建
2、尤其注意actor網路誤差的構建
3、訓練的效果並不理想,回報函式不收斂,波動較大。
4、從演算法和程式碼中都能看到AC是更新一步狀態就訓練一次網路,這兩個網路是同步交替訓練的。
相關文章
- Python基礎篇-Python基礎01Python
- AC自動機 提高篇
- vuex - 基礎篇Vue
- Docker|基礎篇Docker
- Hbase基礎篇
- Python基礎篇Python
- Java基礎篇Java
- redis基礎篇Redis
- Maven——基礎篇Maven
- Git——基礎篇Git
- Nginx-基礎篇Nginx
- React基礎篇1React
- React基礎篇2React
- JAVA精髓(基礎篇)Java
- 爬蟲基礎篇爬蟲
- 初探TypeScript 基礎篇TypeScript
- Docker-基礎篇Docker
- Sass/Scss 基礎篇CSS
- CSS基礎篇(一)CSS
- Java基礎-面相物件篇Java物件
- Python基礎學習篇Python
- python基礎篇實戰Python
- vue系列基礎篇(一)Vue
- RN API基礎操作篇API
- JS 基礎篇(代理模式)JS模式
- (2020)JAVA基礎篇(一)Java
- java基礎篇之多型Java多型
- Android 面試基礎篇Android面試
- 穀粒商城-基礎篇
- Java基礎-併發篇Java
- PowerShell 筆記 - 基礎篇筆記
- Kafka基礎入門篇Kafka
- Go 基礎篇之 MapGo
- 傑裡之AC695AC696 的定時斷電記憶【篇】
- iOS逆向之旅(基礎篇) — 彙編(一)— 彙編基礎iOS
- 前端之路---入坑篇之基礎中的基礎html前端HTML
- JS基礎入門篇( 一 )JS
- 揭秘Flutter Hot Reload(基礎篇)Flutter