TensorFlow Reinforcement Learning
Reinforcement Learning
- Reinforcement learning has a wide range of applications. It assumes there is an agent in a system; the agent makes specific decisions, poorly performing behavior is penalized, and well-performing behavior is rewarded.
Setup code
# Suppress warnings raised while running the Python code
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import os
def reset_graph(seed=42):
    tf.reset_default_graph()
    tf.set_random_seed(seed)
    np.random.seed(seed)
    return
# with tf.Session( config=tf.ConfigProto(gpu_options=gpu_options) ) as sess:
with tf.Session( ) as sess:
    print( sess.run( tf.constant(1) ) )
1
Policy Search
- A policy is the algorithm the model uses to decide which action to take. There is no restriction on what that algorithm is; for example, the input could be an image and the output the action to take. If the policy involves randomness, it is called a stochastic policy.
- One way to train for such a policy-search task is to try parameters across the whole policy space and keep the best-performing ones; in practice, however, the policy space is usually far too large for this to work (a minimal random-search sketch is given right after this list).
- Another way to search the policy space is to use genetic algorithms.
- A third approach is to compute the gradient of the rewards with respect to the policy parameters and then adjust the parameters in the direction that increases the rewards, i.e. gradient ascent. This is known as policy gradients (PG).
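- As a minimal sketch of the brute-force search idea above (not part of the original notebook): sample random parameter vectors for a simple linear policy, evaluate each one on CartPole, and keep the best. The linear-policy form and the sampling range are assumptions made purely for illustration; the environment calls use the same legacy gym API as the rest of this notebook.
import gym
import numpy as np

def run_episode(env, weights, max_steps=200):
    # Linear policy: accelerate right (action 1) if weights.dot(obs) > 0, else left (action 0)
    obs = env.reset()
    total_reward = 0
    for _ in range(max_steps):
        action = 1 if np.dot(weights, obs) > 0 else 0
        obs, reward, done, info = env.step(action)
        total_reward += reward
        if done:
            break
    return total_reward

env = gym.make("CartPole-v0")
best_reward, best_weights = -np.inf, None
for _ in range(100):                               # try 100 random parameter vectors
    weights = np.random.uniform(-1.0, 1.0, size=4)
    reward = run_episode(env, weights)
    if reward > best_reward:
        best_reward, best_weights = reward, weights
print("best reward over 100 random policies:", best_reward)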
OpenAI Gym
- OpenAI Gym is a toolkit that provides many simulated environments in which you can train agents and develop RL algorithms.
import gym
from IPython import display
env = gym.make( "CartPole-v0" )
# In CartPole-v0 the observation has 4 components: horizontal position, velocity, angle from vertical, and angular velocity
obs = env.reset()
print( "obs : ", obs )
WARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.
obs : [-0.02977874 0.02100402 -0.03886023 0.03856819]
from PIL import Image, ImageDraw
import matplotlib.animation as animation
# With the approach below, there is no need to install xvfb
try:
    from pyglet.gl import gl_info
    openai_cart_pole_rendering = True   # no problem, let's use OpenAI gym's rendering function
except Exception:
    openai_cart_pole_rendering = False  # probably no X server available, let's use our own rendering function
def render_cart_pole(env, obs):
    if openai_cart_pole_rendering:
        # use OpenAI gym's rendering function
        return env.render(mode="rgb_array")
    else:
        # rendering for the cart pole environment (in case OpenAI gym can't do it)
        img_w = 600
        img_h = 400
        cart_w = img_w // 12
        cart_h = img_h // 15
        pole_len = img_h // 3.5
        pole_w = img_w // 80 + 1
        x_width = 2
        max_ang = 0.2
        bg_col = (255, 255, 255)
        cart_col = 0x000000 # Blue Green Red
        pole_col = 0x669acc # Blue Green Red
        pos, vel, ang, ang_vel = obs
        img = Image.new('RGB', (img_w, img_h), bg_col)
        draw = ImageDraw.Draw(img)
        cart_x = pos * img_w // x_width + img_w // x_width
        cart_y = img_h * 95 // 100
        top_pole_x = cart_x + pole_len * np.sin(ang)
        top_pole_y = cart_y - cart_h // 2 - pole_len * np.cos(ang)
        draw.line((0, cart_y, img_w, cart_y), fill=0)
        draw.rectangle((cart_x - cart_w // 2, cart_y - cart_h // 2, cart_x + cart_w // 2, cart_y + cart_h // 2), fill=cart_col) # draw cart
        draw.line((cart_x, cart_y - cart_h // 2, top_pole_x, top_pole_y), fill=pole_col, width=pole_w) # draw pole
        return np.array(img)
def plot_cart_pole(env, obs):
    plt.close() # or else nbagg sometimes plots in the previous cell
    img = render_cart_pole(env, obs)
    plt.imshow(img)
    plt.axis("off")
    plt.show()
    return
def update_scene(num, frames, patch):
    patch.set_data(frames[num])
    return patch,
def plot_animation(frames, repeat=False, interval=40):
    plt.close() # or else nbagg sometimes plots in the previous cell
    fig = plt.figure()
    patch = plt.imshow(frames[0])
    plt.axis('off')
    return animation.FuncAnimation(fig, update_scene, fargs=(frames, patch), frames=len(frames), repeat=repeat, interval=interval)
plot_cart_pole(env, obs)
print(env.action_space) # the agent created above has 2 possible actions: 0 (accelerate left) or 1 (accelerate right)
Discrete(2)
obs = env.reset()
print(obs)
# step(1) means accelerate to the right
obs, reward, done, info = env.step(1)
print( obs )
[-0.01147464 0.00686224 -0.04214929 -0.03729966]
[-0.0113374 0.20256248 -0.04289529 -0.34297752]
obs = env.reset()
frames = []
max_steps = 1000
change_steps = 10
angles = []
for step in range( max_steps ):
    img = render_cart_pole( env, obs )
    frames.append( img )
    position, velocity, angle, angular_velocity = obs
    print( "current status : ", obs )
    angles.append( angle )
    if angle < 0:
        action = 0
    else:
        action = 1
    obs, reward, done, info = env.step(action)
    if done:
        break
current status : [ 0.03557839 0.01374288 0.02786276 -0.01108547]
current status : [ 0.03585325 0.20845439 0.02764105 -0.29484879]
current status : [ 0.04002234 0.40317159 0.02174408 -0.57868758]
current status : [ 0.04808577 0.59798216 0.01017032 -0.86444211]
current status : [ 0.06004541 0.7929642 -0.00711852 -1.15391002]
current status : [ 0.0759047 0.59793582 -0.03019672 -0.86346768]
current status : [ 0.08786341 0.40323769 -0.04746607 -0.58043012]
current status : [ 0.09592817 0.20881188 -0.05907467 -0.30306967]
current status : [ 0.1001044 0.01457943 -0.06513607 -0.02958707]
current status : [ 0.10039599 -0.17955089 -0.06572781 0.24185443]
current status : [ 0.09680497 -0.37367539 -0.06089072 0.51310237]
current status : [ 0.08933147 -0.56788925 -0.05062867 0.78599244]
current status : [ 0.07797368 -0.76228042 -0.03490882 1.0623271 ]
current status : [ 0.06272807 -0.95692321 -0.01366228 1.3438523 ]
current status : [ 0.04358961 -1.15187066 0.01321476 1.6322296 ]
current status : [ 0.0205522 -0.95690635 0.04585936 1.34369369]
current status : [ 0.00141407 -0.76239033 0.07273323 1.06570434]
current status : [-0.01383374 -0.56830228 0.09404732 0.79670671]
current status : [-0.02519978 -0.37458801 0.10998145 0.53502857]
current status : [-0.03269154 -0.18117051 0.12068202 0.27892462]
current status : [-0.03631495 0.01204167 0.12626051 0.02660872]
current status : [-0.03607412 0.20514808 0.12679269 -0.22372307]
current status : [-0.03197116 0.3982513 0.12231823 -0.47387547]
current status : [-0.02400613 0.59145275 0.11284072 -0.72564042]
current status : [-0.01217708 0.78484852 0.09832791 -0.98078435]
current status : [ 0.00351989 0.97852494 0.07871222 -1.24103385]
current status : [ 0.02309039 1.17255307 0.05389155 -1.50805742]
current status : [ 0.04654145 1.36698187 0.0237304 -1.78344076]
current status : [ 0.07388109 1.56182922 -0.01193842 -2.06865342]
current status : [ 0.10511767 1.36683066 -0.05331149 -1.77968624]
current status : [ 0.13245429 1.17234758 -0.08890521 -1.50404222]
current status : [ 0.15590124 0.97840986 -0.11898606 -1.24038829]
current status : [ 0.17546944 0.78499952 -0.14379382 -0.98722345]
current status : [ 0.19116943 0.59206511 -0.16353829 -0.74293846]
current status : [ 0.20301073 0.3995327 -0.17839706 -0.5058594 ]
current status : [ 0.21100138 0.20731387 -0.18851425 -0.27427808]
current status : [ 0.21514766 0.01531152 -0.19399981 -0.0464724 ]
current status : [ 0.21545389 -0.17657605 -0.19492926 0.17928067]
current status : [ 0.21192237 -0.36845204 -0.19134364 0.40469689]
current status : [ 0.20455333 -0.56041869 -0.18324971 0.63147929]
current status : [ 0.19334496 -0.75257502 -0.17062012 0.86131339]
current status : [ 0.17829345 -0.94501418 -0.15339385 1.09586142]
current status : [ 0.15939317 -1.13782025 -0.13147662 1.33675355]
current status : [ 0.13663677 -1.33106382 -0.10474155 1.58557432]
current status : [ 0.11001549 -1.52479607 -0.07303007 1.84384237]
current status : [ 0.07951957 -1.71904073 -0.03615322 2.11298095]
current status : [ 0.04513875 -1.91378346 0.0061064 2.39427744]
current status : [ 0.00686308 -1.71871547 0.05399195 2.10347617]
current status : [-0.02751122 -1.52417484 0.09606147 1.82795638]
current status : [-0.05799472 -1.33023996 0.1326206 1.56659389]
current status : [-0.08459952 -1.13692847 0.16395248 1.31804919]
current status : [-0.10733809 -0.94421495 0.19031346 1.08084174]
video = plot_animation(frames, interval=1000)
plt.show()
plt.plot( angles, 'r' )
plt.xlabel("step")
plt.ylabel("angle")
plt.show()
- As the plot above shows, the system becomes unstable as the steps increase, so this simple control policy cannot stabilize the inverted pendulum. We can try some other control strategies.
Neural Network Policies
- Feed the observations into a neural network, have it output the probability of the agent taking each possible action, and then pick the action at random according to those probabilities.
- In supervised learning, a NN can be optimized against known targets with something like cross-entropy, but in RL the only signal available to guide the decisions is the reward. Rewards usually arrive with a long delay: the reward for a decision may only show up long after the decision was made. A common solution is to score an action by summing the rewards that come after it, applying a decay (discount) rate r at each subsequent step; if r is 0 this reduces to using only the current step's reward, while an r close to 1 makes later rewards count almost as much as immediate ones.
- This scheme has a problem: a good action that happens to be followed by many bad actions will receive a low score and will be chosen less often. To obtain more reliable scores, we therefore run many episodes and normalize the scores (subtract the mean and divide by the standard deviation, as the code below does); a small numeric illustration follows this list.
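- To make the discounting concrete, here is a tiny hand-worked illustration (the reward sequence and the decay rate 0.8 are chosen only for this example; the full version is implemented by discount_rewards further below). With rewards [10, 0, -50] following an action, that action's score is 10 + 0.8·0 + 0.8²·(−50) = −22.
rewards = [10, 0, -50]        # illustrative rewards observed after one action
discount_rate = 0.8           # assumed decay rate for this example
score = sum(r * discount_rate**i for i, r in enumerate(rewards))
print(score)                  # -22.0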
Policy Gradients (PG)
- PG algorithms adjust the policy parameters so that the policy is nudged toward higher rewards. One early family of PG algorithms is the REINFORCE algorithms; the main steps are:
- First let the agent play for a while under the NN policy, and at each step compute the gradients that would make the chosen action even more likely, but do not apply them yet.
- After running several episodes, compute each action's score (the discounted, normalized reward described above).
- If an action's score is positive, adjust the parameters so that the action becomes more likely in the future; if the score is negative, adjust the parameters so that it becomes less likely.
- Compute the resulting gradient vector (the score-weighted average of the stored gradients) and apply it with a gradient descent step.
# 1. Specify the network architecture
n_inputs = 4 # == env.observation_space.shape[0]
n_hidden = 4 # it's a simple task, we don't need more than this
n_outputs = 1 # only outputs the probability of accelerating left
initializer = tf.contrib.layers.variance_scaling_initializer()
# 2. Build the neural network
X = tf.placeholder(tf.float32, shape=[None, n_inputs])
hidden = tf.layers.dense(X, n_hidden, activation=tf.nn.elu,
kernel_initializer=initializer)
outputs = tf.layers.dense(hidden, n_outputs, activation=tf.nn.sigmoid,
kernel_initializer=initializer)
# 3. Select a random action based on the estimated probabilities
p_left_and_right = tf.concat(axis=1, values=[outputs, 1 - outputs])
action = tf.multinomial(tf.log(p_left_and_right), num_samples=1)
init = tf.global_variables_initializer()
n_max_steps = 1000
frames = []
angles =[]
env.reset()
with tf.Session() as sess:
    init.run()
    obs = env.reset()
    for step in range(n_max_steps):
        img = render_cart_pole(env, obs)
        frames.append(img)
        action_val = action.eval(feed_dict={X: obs.reshape(1, n_inputs)})
        obs, reward, done, info = env.step(action_val[0][0])
        angles.append( obs[2] )
        if done:
            break
env.close()
plt.plot( angles )
[<matplotlib.lines.Line2D at 0x7fb1c00efd30>]
- In the run above the network is never trained with any feedback, so the agent eventually becomes unstable.
reset_graph()
n_inputs = 4
n_hidden = 4
n_outputs = 1
learning_rate = 0.01
initializer = tf.contrib.layers.variance_scaling_initializer()
X = tf.placeholder(tf.float32, shape=[None, n_inputs])
y = tf.placeholder(tf.float32, shape=[None, n_outputs])
hidden = tf.layers.dense(X, n_hidden, activation=tf.nn.elu, kernel_initializer=initializer)
logits = tf.layers.dense(hidden, n_outputs)
outputs = tf.nn.sigmoid(logits) # probability of action 0 (left)
p_left_and_right = tf.concat(axis=1, values=[outputs, 1 - outputs])
action = tf.multinomial(tf.log(p_left_and_right), num_samples=1)
cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=logits)
optimizer = tf.train.AdamOptimizer(learning_rate)
training_op = optimizer.minimize(cross_entropy)
init = tf.global_variables_initializer()
saver = tf.train.Saver()
n_environments = 10
n_iterations = 1000
envs = [gym.make("CartPole-v0") for _ in range(n_environments)]
observations = [env.reset() for env in envs]
angles = np.zeros( (n_iterations, n_environments) )
with tf.Session() as sess:
    init.run()
    for iteration in range(n_iterations):
        target_probas = np.array([([1.] if obs[2] < 0 else [0.]) for obs in observations]) # if angle<0 we want proba(left)=1., or else proba(left)=0.
        action_val, _ = sess.run([action, training_op], feed_dict={X: np.array(observations), y: target_probas})
        for env_index, env in enumerate(envs):
            obs, reward, done, info = env.step(action_val[env_index][0])
            angles[iteration, env_index] = obs[2]
            observations[env_index] = obs if not done else env.reset()
    saver.save(sess, "./models/RL/policy_net_basic.ckpt")
for env in envs:
    env.close()
WARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype. (printed once per environment, 10 times in total)
plt.figure( figsize=(10,4) )
plt.plot( angles )
plt.show()
- In the run above, cross-entropy against hard-coded targets is used as the cost function. All of these agents end up oscillating steadily within a limited range, so the system performs better than the previous model, which had no cost function.
reset_graph()
n_inputs = 4
n_hidden = 4
n_outputs = 1
learning_rate = 0.01
initializer = tf.contrib.layers.variance_scaling_initializer()
X = tf.placeholder(tf.float32, shape=[None, n_inputs])
hidden = tf.layers.dense(X, n_hidden, activation=tf.nn.elu, kernel_initializer=initializer)
logits = tf.layers.dense(hidden, n_outputs)
outputs = tf.nn.sigmoid(logits) # probability of action 0 (left)
p_left_and_right = tf.concat(axis=1, values=[outputs, 1 - outputs])
action = tf.multinomial(tf.log(p_left_and_right), num_samples=1)
y = 1. - tf.to_float(action)
cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=logits)
optimizer = tf.train.AdamOptimizer(learning_rate)
grads_and_vars = optimizer.compute_gradients(cross_entropy)
gradients = [grad for grad, variable in grads_and_vars]
gradient_placeholders = []
grads_and_vars_feed = []
# One placeholder per gradient tensor: the gradients gathered during the rollouts will be
# scaled by the action scores and averaged in NumPy, then fed back through these
# placeholders and applied with apply_gradients()
for grad, variable in grads_and_vars:
    gradient_placeholder = tf.placeholder(tf.float32, shape=grad.get_shape())
    gradient_placeholders.append(gradient_placeholder)
    grads_and_vars_feed.append((gradient_placeholder, variable))
training_op = optimizer.apply_gradients(grads_and_vars_feed)
init = tf.global_variables_initializer()
saver = tf.train.Saver()
def discount_rewards(rewards, discount_rate):
    discounted_rewards = np.zeros(len(rewards))
    cumulative_rewards = 0
    for step in reversed(range(len(rewards))):
        cumulative_rewards = rewards[step] + cumulative_rewards * discount_rate
        discounted_rewards[step] = cumulative_rewards
    return discounted_rewards
def discount_and_normalize_rewards(all_rewards, discount_rate):
    all_discounted_rewards = [discount_rewards(rewards, discount_rate) for rewards in all_rewards]
    flat_rewards = np.concatenate(all_discounted_rewards)
    reward_mean = flat_rewards.mean()
    reward_std = flat_rewards.std()
    return [(discounted_rewards - reward_mean)/reward_std for discounted_rewards in all_discounted_rewards]
# Check that the reward discounting and normalization defined above behave as expected
print(discount_rewards([10, 0, -50], discount_rate=0.8))
print( discount_and_normalize_rewards([[10, 0, -50], [10, 20]], discount_rate=0.8) )
[-22. -40. -50.]
[array([-0.28435071, -0.86597718, -1.18910299]), array([1.26665318, 1.0727777 ])]
env = gym.make("CartPole-v0")
n_games_per_update = 10
n_max_steps = 1000
n_iterations = 250
save_iterations = 10
discount_rate = 0.95
with tf.Session() as sess:
    init.run()
    for iteration in range(n_iterations):
        print("\rIteration: {}".format(iteration), end="")
        all_rewards = []
        all_gradients = []
        for game in range(n_games_per_update):
            current_rewards = []
            current_gradients = []
            obs = env.reset()
            for step in range(n_max_steps):
                action_val, gradients_val = sess.run([action, gradients], feed_dict={X: obs.reshape(1, n_inputs)})
                obs, reward, done, info = env.step(action_val[0][0])
                current_rewards.append(reward)
                current_gradients.append(gradients_val)
                if done:
                    break
            all_rewards.append(current_rewards)
            all_gradients.append(current_gradients)
        all_rewards = discount_and_normalize_rewards(all_rewards, discount_rate=discount_rate)
        feed_dict = {}
        for var_index, gradient_placeholder in enumerate(gradient_placeholders):
            # weight each step's gradient by that action's normalized score, then average over all games and steps
            mean_gradients = np.mean([reward * all_gradients[game_index][step][var_index]
                                      for game_index, rewards in enumerate(all_rewards)
                                      for step, reward in enumerate(rewards)], axis=0)
            feed_dict[gradient_placeholder] = mean_gradients
        sess.run(training_op, feed_dict=feed_dict)
        if iteration % save_iterations == 0:
            saver.save(sess, "./models/RL/policy_net_pg.ckpt")
WARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.
Iteration: 249
def get_angle(model_path, action, X, n_max_steps = 1000):
    angles = []
    env = gym.make("CartPole-v0")
    obs = env.reset()
    with tf.Session() as sess:
        saver.restore(sess, model_path)
        for step in range(n_max_steps):
            action_val = action.eval(feed_dict={X: obs.reshape(1, n_inputs)})
            obs, reward, done, info = env.step(action_val[0][0])
            angles.append( obs[2] )
            if done:
                break
    env.close()
    return angles
angles = get_angle( "./models/RL/policy_net_pg.ckpt", action, X )
plt.plot( angles, 'r' )
plt.show()
WARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.
INFO:tensorflow:Restoring parameters from ./models/RL/policy_net_pg.ckpt
- With the policy gradient approach, the pole's angle also ends up stabilizing (oscillating within a limited range).
Markov Decision Processes
- A Markov process is one in which the system's state at the next time step depends only on its current state. The system has a number of possible states, and at any time step each state has certain probabilities of transitioning to the other states; these probabilities form a matrix known as the state transition matrix (a small sketch follows below).
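- As a minimal sketch (not part of the original notebook) of a state transition matrix and of sampling a Markov chain from it: the 3-state matrix below is an assumed example; row i holds the probabilities of moving from state i to every state, so each row sums to 1, and the next state depends only on the current one.
import numpy as np

# Assumed 3-state example: transition_probs[i][j] = P(next state = j | current state = i)
transition_probs = np.array([
    [0.7, 0.2, 0.1],   # from state 0
    [0.0, 0.5, 0.5],   # from state 1
    [0.3, 0.3, 0.4],   # from state 2
])

def sample_markov_chain(transition_probs, start_state=0, n_steps=10, seed=42):
    rnd = np.random.RandomState(seed)
    state = start_state
    states = [state]
    for _ in range(n_steps):
        # the Markov property: the next state depends only on the current state
        state = rnd.choice(len(transition_probs), p=transition_probs[state])
        states.append(state)
    return states

print(sample_markov_chain(transition_probs))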