Reinforcement Learning (RL)

  • Reinforcement learning has a wide range of applications. It assumes there is an agent acting inside a system; the agent makes decisions, and behavior that performs well is rewarded while behavior that performs poorly is penalized.

Setup code

# Suppress Python warnings
import warnings
warnings.filterwarnings("ignore")

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import os


def reset_graph(seed=42):
    tf.reset_default_graph()
    tf.set_random_seed(seed)
    np.random.seed(seed)
    return

# with tf.Session( config=tf.ConfigProto(gpu_options=gpu_options) ) as sess:
with tf.Session() as sess:
    print( sess.run( tf.constant(1) ) )
1

Policy Search

  • A policy is the algorithm the agent uses to decide its actions; any algorithm will do. For example, the input could be an image and the output the action to take. If the policy involves randomness, it is called a stochastic policy.
  • To train such a policy-search task, we could in principle try all parameters in the policy space and keep the best-performing ones, but the policy space is usually so large that this is rarely feasible (a brute-force random search along these lines is sketched after this list).
  • Another way to explore the policy space is to use genetic algorithms.
  • A third option is to compute the gradient of the rewards with respect to the policy parameters and then adjust the parameters in the direction that increases the rewards, i.e. gradient ascent. This approach is called policy gradients (PG).
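
As a rough, hand-rolled illustration of brute-force policy search (not part of the original notebook; run_episode, the linear policy, and the 100-policy budget are arbitrary choices), the sketch below samples random linear policies for CartPole-v0 and keeps the one with the highest total reward:

import gym
import numpy as np

def run_episode(env, weights, max_steps=200):
    # Total reward of one episode under a deterministic linear policy
    obs = env.reset()
    total_reward = 0
    for _ in range(max_steps):
        # Accelerate right if the weighted sum of the 4 observations is positive
        action = 1 if np.dot(weights, obs) > 0 else 0
        obs, reward, done, info = env.step(action)
        total_reward += reward
        if done:
            break
    return total_reward

env = gym.make("CartPole-v0")
best_reward, best_weights = -np.inf, None
for episode in range(100):                      # evaluate 100 random policies
    weights = np.random.uniform(-1, 1, size=4)  # one weight per observation component
    total = run_episode(env, weights)
    if total > best_reward:
        best_reward, best_weights = total, weights
print("best total reward:", best_reward)
env.close()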

OpenAI Gym

  • OpenAI Gym is a toolkit that provides many simulated environments in which agents can be trained and RL algorithms developed.
import gym
from IPython import display
env = gym.make( "CartPole-v0" )
# In CartPole-v0 the observation has 4 components: horizontal position, velocity, angle from vertical, and angular velocity
obs = env.reset()
print( "obs : ", obs )
WARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.
obs :  [-0.02977874  0.02100402 -0.03886023  0.03856819]
from PIL import Image, ImageDraw
import matplotlib.animation as animation
# With the fallback below, there is no need to install xvfb
try:
    from pyglet.gl import gl_info
    openai_cart_pole_rendering = True   # no problem, let's use OpenAI gym's rendering function
except Exception:
    openai_cart_pole_rendering = False  # probably no X server available, let's use our own rendering function

def render_cart_pole(env, obs):
    if openai_cart_pole_rendering:
        # use OpenAI gym's rendering function
        return env.render(mode="rgb_array")
    else:
        # rendering for the cart pole environment (in case OpenAI gym can't do it)
        img_w = 600
        img_h = 400
        cart_w = img_w // 12
        cart_h = img_h // 15
        pole_len = img_h // 3.5
        pole_w = img_w // 80 + 1
        x_width = 2
        max_ang = 0.2
        bg_col = (255, 255, 255)
        cart_col = 0x000000 # Blue Green Red
        pole_col = 0x669acc # Blue Green Red

        pos, vel, ang, ang_vel = obs
        img = Image.new('RGB', (img_w, img_h), bg_col)
        draw = ImageDraw.Draw(img)
        cart_x = pos * img_w // x_width + img_w // x_width
        cart_y = img_h * 95 // 100
        top_pole_x = cart_x + pole_len * np.sin(ang)
        top_pole_y = cart_y - cart_h // 2 - pole_len * np.cos(ang)
        draw.line((0, cart_y, img_w, cart_y), fill=0)
        draw.rectangle((cart_x - cart_w // 2, cart_y - cart_h // 2, cart_x + cart_w // 2, cart_y + cart_h // 2), fill=cart_col) # draw cart
        draw.line((cart_x, cart_y - cart_h // 2, top_pole_x, top_pole_y), fill=pole_col, width=pole_w) # draw pole
        return np.array(img)

def plot_cart_pole(env, obs):
    plt.close()  # or else nbagg sometimes plots in the previous cell
    img = render_cart_pole(env, obs)
    plt.imshow(img)
    plt.axis("off")
    plt.show()
    return

def update_scene(num, frames, patch):
    patch.set_data(frames[num])
    return patch,

def plot_animation(frames, repeat=False, interval=40):
    plt.close()  # or else nbagg sometimes plots in the previous cell
    fig = plt.figure()
    patch = plt.imshow(frames[0])
    plt.axis('off')
    return animation.FuncAnimation(fig, update_scene, fargs=(frames, patch), frames=len(frames), repeat=repeat, interval=interval)
plot_cart_pole(env, obs)
print(env.action_space) # the agent created above has 2 possible actions: 0 (accelerate left) or 1 (accelerate right)

[Figure: rendered CartPole environment]

Discrete(2)
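
For reference, the observation and action spaces can also be inspected directly; a small sketch (not in the original notebook), reusing the env created above:

print(env.observation_space)       # Box(4,): position, velocity, angle, angular velocity
print(env.observation_space.high)  # upper bound of each observation component
print(env.observation_space.low)   # lower bound of each observation component
print(env.action_space.sample())   # a random valid action, here 0 or 1
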
obs = env.reset()
print(obs)
# step(1) means accelerate to the right
obs, reward, done, info = env.step(1)
print( obs )
[-0.01147464  0.00686224 -0.04214929 -0.03729966]
[-0.0113374   0.20256248 -0.04289529 -0.34297752]
obs = env.reset()
frames = []
max_steps = 1000
change_steps = 10
angles = []
for step in range( max_steps ):
    img = render_cart_pole( env, obs )
    frames.append( img )
    position, velocity, angle, angular_velocity = obs
    print( "current status : ", obs )
    angles.append( angle )
    if angle < 0:
        action = 0
    else:
        action = 1
    obs, reward, done, info = env.step(action)
    if done:
        break
current status :  [ 0.03557839  0.01374288  0.02786276 -0.01108547]
current status :  [ 0.03585325  0.20845439  0.02764105 -0.29484879]
current status :  [ 0.04002234  0.40317159  0.02174408 -0.57868758]
current status :  [ 0.04808577  0.59798216  0.01017032 -0.86444211]
current status :  [ 0.06004541  0.7929642  -0.00711852 -1.15391002]
current status :  [ 0.0759047   0.59793582 -0.03019672 -0.86346768]
current status :  [ 0.08786341  0.40323769 -0.04746607 -0.58043012]
current status :  [ 0.09592817  0.20881188 -0.05907467 -0.30306967]
current status :  [ 0.1001044   0.01457943 -0.06513607 -0.02958707]
current status :  [ 0.10039599 -0.17955089 -0.06572781  0.24185443]
current status :  [ 0.09680497 -0.37367539 -0.06089072  0.51310237]
current status :  [ 0.08933147 -0.56788925 -0.05062867  0.78599244]
current status :  [ 0.07797368 -0.76228042 -0.03490882  1.0623271 ]
current status :  [ 0.06272807 -0.95692321 -0.01366228  1.3438523 ]
current status :  [ 0.04358961 -1.15187066  0.01321476  1.6322296 ]
current status :  [ 0.0205522  -0.95690635  0.04585936  1.34369369]
current status :  [ 0.00141407 -0.76239033  0.07273323  1.06570434]
current status :  [-0.01383374 -0.56830228  0.09404732  0.79670671]
current status :  [-0.02519978 -0.37458801  0.10998145  0.53502857]
current status :  [-0.03269154 -0.18117051  0.12068202  0.27892462]
current status :  [-0.03631495  0.01204167  0.12626051  0.02660872]
current status :  [-0.03607412  0.20514808  0.12679269 -0.22372307]
current status :  [-0.03197116  0.3982513   0.12231823 -0.47387547]
current status :  [-0.02400613  0.59145275  0.11284072 -0.72564042]
current status :  [-0.01217708  0.78484852  0.09832791 -0.98078435]
current status :  [ 0.00351989  0.97852494  0.07871222 -1.24103385]
current status :  [ 0.02309039  1.17255307  0.05389155 -1.50805742]
current status :  [ 0.04654145  1.36698187  0.0237304  -1.78344076]
current status :  [ 0.07388109  1.56182922 -0.01193842 -2.06865342]
current status :  [ 0.10511767  1.36683066 -0.05331149 -1.77968624]
current status :  [ 0.13245429  1.17234758 -0.08890521 -1.50404222]
current status :  [ 0.15590124  0.97840986 -0.11898606 -1.24038829]
current status :  [ 0.17546944  0.78499952 -0.14379382 -0.98722345]
current status :  [ 0.19116943  0.59206511 -0.16353829 -0.74293846]
current status :  [ 0.20301073  0.3995327  -0.17839706 -0.5058594 ]
current status :  [ 0.21100138  0.20731387 -0.18851425 -0.27427808]
current status :  [ 0.21514766  0.01531152 -0.19399981 -0.0464724 ]
current status :  [ 0.21545389 -0.17657605 -0.19492926  0.17928067]
current status :  [ 0.21192237 -0.36845204 -0.19134364  0.40469689]
current status :  [ 0.20455333 -0.56041869 -0.18324971  0.63147929]
current status :  [ 0.19334496 -0.75257502 -0.17062012  0.86131339]
current status :  [ 0.17829345 -0.94501418 -0.15339385  1.09586142]
current status :  [ 0.15939317 -1.13782025 -0.13147662  1.33675355]
current status :  [ 0.13663677 -1.33106382 -0.10474155  1.58557432]
current status :  [ 0.11001549 -1.52479607 -0.07303007  1.84384237]
current status :  [ 0.07951957 -1.71904073 -0.03615322  2.11298095]
current status :  [ 0.04513875 -1.91378346  0.0061064   2.39427744]
current status :  [ 0.00686308 -1.71871547  0.05399195  2.10347617]
current status :  [-0.02751122 -1.52417484  0.09606147  1.82795638]
current status :  [-0.05799472 -1.33023996  0.1326206   1.56659389]
current status :  [-0.08459952 -1.13692847  0.16395248  1.31804919]
current status :  [-0.10733809 -0.94421495  0.19031346  1.08084174]
video = plot_animation(frames, interval=1000)
plt.show()
plt.plot( angles, 'r' )
plt.xlabel("step")
plt.ylabel("angle")
plt.show()

[Figure: animation of the cart-pole under the simple sign-of-angle policy]

[Figure: pole angle vs. step for the simple sign-of-angle policy]

  • The plot above shows that the system becomes unstable as the number of steps grows, so this simple control policy cannot stabilize the inverted pendulum. It is worth trying other control policies.

Neural Network Policies

  • Feed the observations into a neural network that outputs the probability of the agent taking each action, then choose the action at random according to those probabilities.
  • In supervised learning, a NN can be optimized with something like a cross-entropy loss, but in RL the only signal available to guide the decisions is the reward. Rewards usually arrive with a long delay, i.e. the reward for a decision may only show up long after the decision was made. A common solution is to evaluate an action based on it and all the rewards that follow it, with later rewards discounted by a discount rate γ; if γ = 0, only the current step's reward is used (a small numerical check of this rule follows this list).
  • This approach has a problem: if a good action happens to be followed by many bad actions, its score is dragged down and it becomes less likely. To get more reliable scores we need to run many episodes and standardize the scores (subtract the mean and divide by the standard deviation).
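
As a quick numerical check of the discounting rule above, here is the hand computation for the reward sequence [10, 0, -50] with a discount rate of 0.8 (the same values used in the test cell further below); the discounted return is R_t = r_t + γ·r_{t+1} + γ²·r_{t+2} + … = r_t + γ·R_{t+1}:

gamma = 0.8
rewards = [10, 0, -50]

# Discounted return of each step, computed directly from the definition
R2 = rewards[2]                                               # -50
R1 = rewards[1] + gamma * rewards[2]                          # 0 + 0.8*(-50)       = -40
R0 = rewards[0] + gamma * rewards[1] + gamma**2 * rewards[2]  # 10 + 0 + 0.64*(-50) = -22
print(R0, R1, R2)  # matches discount_rewards([10, 0, -50], discount_rate=0.8)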

Policy Gradients

  • PG methods adjust the parameters so that the policy is biased toward higher rewards. An early family of PG algorithms is the REINFORCE algorithms; the main steps are:
    • First let the agent run for a while under the NN policy, computing the gradients that would make the chosen actions more likely, but do not apply them yet.
    • After running for a while, compute each action's score.
    • If an action's score is positive, adjust the parameters so that the action becomes more likely; if its score is negative, adjust the parameters so that it becomes less likely.
    • Compute the resulting gradient vectors and iterate with a gradient-descent step.
# 1. Specify the network architecture
n_inputs = 4  # == env.observation_space.shape[0]
n_hidden = 4  # it's a simple task, we don't need more than this
n_outputs = 1 # only outputs the probability of accelerating left
initializer = tf.contrib.layers.variance_scaling_initializer()

# 2. Build the neural network
X = tf.placeholder(tf.float32, shape=[None, n_inputs])
hidden = tf.layers.dense(X, n_hidden, activation=tf.nn.elu,
                         kernel_initializer=initializer)
outputs = tf.layers.dense(hidden, n_outputs, activation=tf.nn.sigmoid,
                          kernel_initializer=initializer)

# 3. Select a random action based on the estimated probabilities
p_left_and_right = tf.concat(axis=1, values=[outputs, 1 - outputs])
action = tf.multinomial(tf.log(p_left_and_right), num_samples=1)

init = tf.global_variables_initializer()

n_max_steps = 1000
frames = []
angles = []
env.reset()
with tf.Session() as sess:
    init.run()
    obs = env.reset()
    for step in range(n_max_steps):
        img = render_cart_pole(env, obs)
        frames.append(img)
        action_val = action.eval(feed_dict={X: obs.reshape(1, n_inputs)})
        obs, reward, done, info = env.step(action_val[0][0])
        angles.append( obs[2] )
        if done:
            break

env.close()
plt.plot( angles )
[<matplotlib.lines.Line2D at 0x7fb1c00efd30>]

[Figure: pole angle vs. step for the untrained network policy]

  • In the run above no training signal is used at all, and the agent eventually becomes unstable.
reset_graph()

n_inputs = 4
n_hidden = 4
n_outputs = 1

learning_rate = 0.01

initializer = tf.contrib.layers.variance_scaling_initializer()

X = tf.placeholder(tf.float32, shape=[None, n_inputs])
y = tf.placeholder(tf.float32, shape=[None, n_outputs])

hidden = tf.layers.dense(X, n_hidden, activation=tf.nn.elu, kernel_initializer=initializer)
logits = tf.layers.dense(hidden, n_outputs)
outputs = tf.nn.sigmoid(logits) # probability of action 0 (left)
p_left_and_right = tf.concat(axis=1, values=[outputs, 1 - outputs])
action = tf.multinomial(tf.log(p_left_and_right), num_samples=1)

cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=logits)
optimizer = tf.train.AdamOptimizer(learning_rate)
training_op = optimizer.minimize(cross_entropy)

init = tf.global_variables_initializer()
saver = tf.train.Saver()

n_environments = 10
n_iterations = 1000

envs = [gym.make("CartPole-v0") for _ in range(n_environments)]
observations = [env.reset() for env in envs]
angles = np.zeros( (n_iterations, n_environments) )
with tf.Session() as sess:
    init.run()
    for iteration in range(n_iterations):
        target_probas = np.array([([1.] if obs[2] < 0 else [0.]) for obs in observations]) # if angle<0 we want proba(left)=1., or else proba(left)=0.
        action_val, _ = sess.run([action, training_op], feed_dict={X: np.array(observations), y: target_probas})
        for env_index, env in enumerate(envs):
            obs, reward, done, info = env.step(action_val[env_index][0])
            angles[iteration, env_index] = obs[2]
            observations[env_index] = obs if not done else env.reset()
    saver.save(sess, "./models/RL/policy_net_basic.ckpt")

for env in envs:
    env.close()
WARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype. (printed once for each of the 10 environments)
plt.figure( figsize=(10,4) )
plt.plot( angles )
plt.show()

[Figure: pole angles of the 10 environments over the training iterations]

  • In the run above, a cross-entropy cost function is used for the optimization; in the end all of these agents oscillate steadily within a bounded range, and the system performs better than the previous model that had no cost function.
reset_graph()

n_inputs = 4
n_hidden = 4
n_outputs = 1

learning_rate = 0.01

initializer = tf.contrib.layers.variance_scaling_initializer()

X = tf.placeholder(tf.float32, shape=[None, n_inputs])

hidden = tf.layers.dense(X, n_hidden, activation=tf.nn.elu, kernel_initializer=initializer)
logits = tf.layers.dense(hidden, n_outputs)
outputs = tf.nn.sigmoid(logits)  # probability of action 0 (left)
p_left_and_right = tf.concat(axis=1, values=[outputs, 1 - outputs])
action = tf.multinomial(tf.log(p_left_and_right), num_samples=1)

y = 1. - tf.to_float(action)
cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=logits)
optimizer = tf.train.AdamOptimizer(learning_rate)
grads_and_vars = optimizer.compute_gradients(cross_entropy)
gradients = [grad for grad, variable in grads_and_vars]
gradient_placeholders = []
grads_and_vars_feed = []
for grad, variable in grads_and_vars:
    gradient_placeholder = tf.placeholder(tf.float32, shape=grad.get_shape())
    gradient_placeholders.append(gradient_placeholder)
    grads_and_vars_feed.append((gradient_placeholder, variable))
training_op = optimizer.apply_gradients(grads_and_vars_feed)

init = tf.global_variables_initializer()
saver = tf.train.Saver()

def discount_rewards(rewards, discount_rate):
    discounted_rewards = np.zeros(len(rewards))
    cumulative_rewards = 0
    for step in reversed(range(len(rewards))):
        cumulative_rewards = rewards[step] + cumulative_rewards * discount_rate
        discounted_rewards[step] = cumulative_rewards
    return discounted_rewards

def discount_and_normalize_rewards(all_rewards, discount_rate):
    all_discounted_rewards = [discount_rewards(rewards, discount_rate) for rewards in all_rewards]
    flat_rewards = np.concatenate(all_discounted_rewards)
    reward_mean = flat_rewards.mean()
    reward_std = flat_rewards.std()
    return [(discounted_rewards - reward_mean)/reward_std for discounted_rewards in all_discounted_rewards]
# Check that the reward discounting and normalization above behave as expected
print(discount_rewards([10, 0, -50], discount_rate=0.8))
print( discount_and_normalize_rewards([[10, 0, -50], [10, 20]], discount_rate=0.8) )
[-22. -40. -50.]
[array([-0.28435071, -0.86597718, -1.18910299]), array([1.26665318, 1.0727777 ])]
env = gym.make("CartPole-v0")

n_games_per_update = 10
n_max_steps = 1000
n_iterations = 250
save_iterations = 10
discount_rate = 0.95

with tf.Session() as sess:
    init.run()
    for iteration in range(n_iterations):
        print("\rIteration: {}".format(iteration), end="")
        all_rewards = []
        all_gradients = []
        for game in range(n_games_per_update):
            current_rewards = []
            current_gradients = []
            obs = env.reset()
            for step in range(n_max_steps):
                action_val, gradients_val = sess.run([action, gradients], feed_dict={X: obs.reshape(1, n_inputs)})
                obs, reward, done, info = env.step(action_val[0][0])
                current_rewards.append(reward)
                current_gradients.append(gradients_val)
                if done:
                    break
            all_rewards.append(current_rewards)
            all_gradients.append(current_gradients)

        all_rewards = discount_and_normalize_rewards(all_rewards, discount_rate=discount_rate)
        feed_dict = {}
        for var_index, gradient_placeholder in enumerate(gradient_placeholders):
            mean_gradients = np.mean([reward * all_gradients[game_index][step][var_index]
                                      for game_index, rewards in enumerate(all_rewards)
                                          for step, reward in enumerate(rewards)], axis=0)
            feed_dict[gradient_placeholder] = mean_gradients
        sess.run(training_op, feed_dict=feed_dict)
        if iteration % save_iterations == 0:
            saver.save(sess, "./models/RL/policy_net_pg.ckpt")
WARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.
Iteration: 249
def get_angle(model_path, action, X, n_max_steps = 1000):
    angles = []
    env = gym.make("CartPole-v0")
    obs = env.reset()
    with tf.Session() as sess:
        saver.restore(sess, model_path)
        for step in range(n_max_steps):
            action_val = action.eval(feed_dict={X: obs.reshape(1, n_inputs)})
            obs, reward, done, info = env.step(action_val[0][0])
            angles.append( obs[2] )
            if done:
                break
    env.close()
    return angles 
angles = get_angle( "./models/RL/policy_net_pg.ckpt", action, X )
plt.plot( angles, 'r' )
plt.show()
WARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.
INFO:tensorflow:Restoring parameters from ./models/RL/policy_net_pg.ckpt

[Figure: pole angle vs. step for the policy trained with policy gradients]

  • With the policy-gradient approach, the pole angle also ends up stable (oscillating within a certain range).

Markov Decision Processes

  • A Markov process is one in which the system's next state depends only on its current state. The system has a number of possible states, and at any time step each state has some probability of switching to the other states; these probabilities make up the state transition matrix, as sketched below.
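
As a small illustration of the transition-matrix idea (the 3-state matrix below is an arbitrary example, not taken from the original text), a Markov chain can be simulated by repeatedly sampling the next state from the current state's row:

import numpy as np

# Row i holds the probabilities of moving from state i to states 0, 1, 2; each row sums to 1.
transition_matrix = np.array([
    [0.7, 0.2, 0.1],
    [0.0, 0.5, 0.5],
    [0.3, 0.3, 0.4],
])

np.random.seed(42)
state = 0
states = [state]
for _ in range(20):
    # The next state depends only on the current state (the Markov property)
    state = np.random.choice(3, p=transition_matrix[state])
    states.append(state)
print(states)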