#假设一个具有9个元素的一维数组,nimo位于第6位(索引从0开始)
#下面用增强学习的思想来让agent找到nimo,并输出是第几步找到的nimo,以及获得的奖励
#本例不涉及到对策略的优化,只是采取ε-贪心(epsilon-greedy)算法达到搜寻目的,并不是完整的增强学习算法

import numpy as np
import random


def main():
    """Locate 'nimo' in a 9-cell 1-D world with an epsilon-greedy agent.

    The agent starts at index 0 and repeatedly chooses 'left' or 'right'.
    With probability 1/9 it explores (uniform random action); otherwise it
    exploits the action with the highest running action-value estimate Qa.
    Prints the step at which 'nimo' (index 6) is found and the accumulated
    reward.

    Returns:
        int: always 0 (script-style exit code).
    """
    print("start learning....")
    en = np.array(['empty', 'empty', 'empty', 'empty', 'empty',
                   'empty', 'nimo', 'empty', 'empty'])
    actions = np.array(['left', 'right'])
    # Float dtype: the incremental-mean update below produces fractions;
    # an int array would silently truncate them to integers.
    Qa = np.zeros(2)
    updateTimes = np.array([0, 0])
    agent = 0        # current position (index into en)
    reward = 0       # accumulated reward over the run
    selectedA = 0
    for t in range(1, 100):  # hard step limit so the loop always terminates
        print("t = ", t)
        print("Qa = ", Qa)
        if en[agent] == 'nimo':
            print("I find nimo at the ", t, "step.")
            print("I get a reward of ", reward, ".")
            break
        maxA = np.where(Qa == max(Qa))  # action(s) with the maximum Qa
        print(maxA)
        select = np.random.randint(1, 10)  # uniform over 1..9 -> P(explore)=1/9
        if select == 1:  # explore: select actions equally
            # NOTE: randint's upper bound is exclusive, so (0, 2) yields
            # 0 or 1 with equal probability — the original (0, 1) always
            # returned 0 and never explored 'right'.
            selectedA = np.random.randint(0, 2)
        else:  # exploit: the action which leads to the maximum Qa
            selectedA = maxA[0][0]
        # Update agent's state.
        print("selectedA = ", actions[selectedA])
        if actions[selectedA] == 'left':
            agent = agent - 1
        else:
            agent = agent + 1
        updateTimes[selectedA] = updateTimes[selectedA] + 1
        # Reward: +100 on 'nimo', 0 elsewhere, -100 for stepping off the grid.
        if agent >= 0 and agent <= 8:  # valid indices are 0..8 (len(en) == 9)
            if en[agent] == 'nimo':
                Ra = 100
            else:
                Ra = 0
        else:
            Ra = -100
            # Clamp the agent back onto the grid so en[agent] at the top of
            # the next iteration stays valid (the original could raise
            # IndexError at en[9], and silently wrapped via en[-1]).
            agent = min(max(agent, 0), 8)
        reward = reward + Ra  # the original never accumulated reward (always printed 0)
        # Incremental sample-average update of the action value.
        Qa[selectedA] = Qa[selectedA] + (Ra - Qa[selectedA]) / updateTimes[selectedA]
    return 0

# Run the demo only when executed as a script, not on import.
if __name__ == '__main__':
    main()

以上代码在理论上思路没有问题,但实际运行时发现 np.random.randint(0,1) 永远返回 0——这并不是基于计算机时钟的伪随机问题,而是因为 randint 的上界是开区间(不包含),randint(0,1) 只能取到 0;应写成 randint(0,2) 才能等概率地取 0 或 1。因此,对随机部分做一下改进。

下面的代码加入了一个图形化显示界面,----*----用*号表示当前agent所在位置。


import numpy as np
import random
import matplotlib.pyplot as plt

def drawNimo(agent):
    enTest = np.array(['_.','_.','_.','_.','_.','_.','_.','_.','_.'])
    if agent>=0 and agent<9:
        enTest[agent] = '*'
        print(enTest)
    elif agent<0:
        print(np.array(['oops!','_.','_.','_.','_.','_.','_.','_.','_.','_.']))
    else:
        print(np.array(['_.','_.','_.','_.','_.','_.','_.','_.','_.','oops!']))

def main():
    """Long-horizon epsilon-greedy search for 'nimo', plotting cumulative reward.

    Runs for `runTimes` steps without episode resets.  At the grid edges the
    only sensible action is forced; otherwise the agent explores with
    probability 0.2 and exploits the highest action-value estimate the rest
    of the time.  Finally plots the cumulative reward curve with matplotlib.

    Returns:
        int: always 0 (script-style exit code).
    """
    print("start learning....")
    runTimes = 1000
    en = np.array(['empty', 'empty', 'empty', 'empty', 'empty',
                   'empty', 'nimo', 'empty', 'empty'])
    actions = np.array(['left', 'right'])
    # Float dtype so the incremental-mean update keeps its fractional part
    # (an int array would truncate the update to 0 and never learn).
    Qa = np.zeros(2)
    updateTimes = np.array([0, 0])
    agent = 0
    reward = np.zeros(runTimes + 1)  # cumulative reward per step
    selectedA = 0
    for t in range(1, runTimes):
        maxA = np.where(Qa == max(Qa))  # action(s) with the maximum Qa
        if agent == 0:
            # At the left edge only 'right' makes sense.  Only the action is
            # forced here; the shared move code below applies it once (the
            # original moved the agent here AND again below — a double step).
            selectedA = 1
        elif agent == 8:
            selectedA = 0  # at the right edge, force 'left'
        else:
            select = np.random.randint(0, 100)  # uniform over 0..99
            if select < 20:  # explore with probability exactly 0.2
                # Choose left or right uniformly (the original's temp>50
                # split was 49/51, and its window 50..70 was 21%, not 20%).
                selectedA = np.random.randint(0, 2)
            else:  # exploit: the action which leads to the maximum Qa
                selectedA = maxA[0][0]
        # Apply the chosen action.
        if actions[selectedA] == 'left':
            agent = agent - 1
        else:
            agent = agent + 1
        updateTimes[selectedA] = updateTimes[selectedA] + 1
        # Reward: +100 for finding nimo, +1 for any other step.  The edge
        # forcing above keeps agent within 0..8, so en[agent] is always valid.
        if en[agent] == 'nimo':
            Ra = 100
            print("I find nimo at the ", t, "step.")
        else:
            Ra = 1
        reward[t] = reward[t - 1] + Ra
        # Incremental sample-average update of the action value.
        Qa[selectedA] = Qa[selectedA] + (Ra - Qa[selectedA]) / updateTimes[selectedA]
    x = np.linspace(1, runTimes, runTimes)
    plt.plot(x, reward[0:runTimes])
    plt.show()
    return 0

# Run the demo only when executed as a script, not on import.
if __name__ == '__main__':
    main()

然而以上程序因为没有对策略进行优化,这种ε-贪心策略较好的结果也只是在1000步中找到了19次nimo。程序运行到100步以后基本处于发散状态。

start learning....
I find nimo at the  5 step.
I find nimo at the  7 step.
I find nimo at the  9 step.
I find nimo at the  11 step.
I find nimo at the  13 step.
I find nimo at the  15 step.
I find nimo at the  17 step.
I find nimo at the  20 step.
I find nimo at the  22 step.
I find nimo at the  25 step.
I find nimo at the  33 step.
I find nimo at the  36 step.
I find nimo at the  48 step.
I find nimo at the  51 step.
I find nimo at the  63 step.
I find nimo at the  65 step.
I find nimo at the  89 step.
I find nimo at the  91 step.
I find nimo at the  94 step.

最差的结果是一次都找不到,最普通的结果是开头找到两次。

start learning....
I find nimo at the  5 step.
I find nimo at the  8 step.

 

Logo

CSDN联合极客时间,共同打造面向开发者的精品内容学习社区,助力成长!

更多推荐