Consider the maze shown in the figure below: white cells are passable and black cells are obstacles. The task is to find a path that starts from the bottom-left corner, avoids the obstacles, and reaches the exit.
The idea behind solving this problem with value iteration is to encode the maze as an MDP: we define the state space $S$, the action space $A$, and the reward function $r$. The maze is encoded as follows:
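For reference, the encoding can be reconstructed from the state numbering and obstacle list used in the code below (bracketed cells are obstacles, cell 0 is the start, cell 14 is the exit):

20   21   22  [23]  24
15   16   17  [18]  19
[10] [11]  12   13   14  (exit)
 5    6    7    8    9
 0    1   [2]  [3]  [4]
(Cell 0 is the start at the bottom-left corner.)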
Each cell of the grid becomes one state, so the state space is $S=\{0,1,2,\dots,24\}$. For an agent walking through the maze, the action space is
$A=\{\text{'north'},\ \text{'south'},\ \text{'east'},\ \text{'west'}\}$, the four directions it can move in; the robot moves exactly one cell per step. The reward is defined as:
$$r(s_t,a_t)=\begin{cases}-1, & \text{the next state } s_{t+1}(s_t,a_t)\text{ is a passable white cell}\\ -100, & \text{the next state } s_{t+1}(s_t,a_t)\text{ is a black obstacle cell}\\ 100, & \text{the next state is the exit, i.e. } s_{t+1}(s_t,a_t)=14\end{cases}$$
In the implementation, the environment is wrapped in a class migong_game(), and the encoding above is carried out in its initializer. Note that P in the code is the (deterministic) transition table, i.e. $s_{t+1}=P(s_t,a_t)$; for example, from state 7 the action 'north' leads to $s_{t+1}=P(7,\text{'north'})=12$.
def __init__(self,epsiodes=100,gamma=1.0,epslion=1e-5):
self.epsiodes = epsiodes
self.gamma = gamma
self.epslion = epslion
self.states = np.arange(0,25)
actions = {'north': 0, 'south': 1, 'east': 2, 'west': 3}
self.actions = actions
self.terminate_states = [2,3,4,10,11,18,23,14]
    # Define the reward function R(s, a)
reward = -1*np.ones((25,4))
    # Cell 2 is an obstacle
reward[1][actions['east']] = -100
reward[7][actions['south']] = -100
reward[3][actions['west']] = -100
    # Cell 3 is an obstacle
reward[2][actions['east']] = -100
reward[8][actions['south']] = -100
reward[4][actions['west']] = -100
    # Cell 4 is an obstacle
reward[3][actions['east']] = -100
reward[9][actions['south']] = -100
    # Cell 10 is an obstacle
reward[5][actions['north']] = -100
reward[11][actions['west']] = -100
reward[15][actions['south']] = -100
    # Cell 11 is an obstacle
reward[6][actions['north']] = -100
reward[12][actions['west']] = -100
reward[16][actions['south']] = -100
reward[10][actions['east']] = -100
    # Cell 18 is an obstacle
reward[17][actions['east']] = -100
reward[19][actions['west']] = -100
reward[13][actions['north']] = -100
reward[23][actions['south']] = -100
    # Cell 23 is an obstacle
reward[22][actions['east']] = -100
reward[24][actions['west']] = -100
reward[18][actions['north']] = -100
    # Entering cell 14 (the exit) gives a positive reward
reward[13][actions['east']] = 100
reward[9][actions['north']] = 100
reward[19][actions['south']] = 100
self.reward = reward
    # Define the state-transition table P(s, a); the location array records the grid numbering (top row 20-24, bottom row 0-4)
location = np.array([[20,21,22,23,24],
[15,16,17,18,19],
[10,11,12,13,14],
[5,6,7,8,9],
[0,1,2,3,4]])
P = -1*np.ones((25,4))
for ix in [1,2,3]:
for iy in [1,2,3]:
loc = ix + 5*iy
P[loc][actions['north']] = ix + 5*(iy + 1)
P[loc][actions['south']] = ix + 5*(iy - 1)
P[loc][actions['east']] = ix + 1 + 5*iy
P[loc][actions['west']] = ix - 1+5*iy
    # The four (non-corner) boundary edges
for ix in [1,2,3]:
iy = 4
loc = ix + 5*iy
P[loc][actions['south']] = ix + 5 * (iy - 1)
P[loc][actions['east']] = ix + 1 + 5 * iy
P[loc][actions['west']] = ix - 1 + 5 * iy
for ix in [1,2,3]:
iy = 0
loc = ix + 5*iy
P[loc][actions['north']] = ix + 5 * (iy + 1)
P[loc][actions['east']] = ix + 1 + 5 * iy
P[loc][actions['west']] = ix - 1 + 5 * iy
for iy in [1,2,3]:
ix = 0
loc = ix + 5*iy
P[loc][actions['north']] = ix + 5 * (iy + 1)
P[loc][actions['south']] = ix + 5 * (iy - 1)
P[loc][actions['east']] = ix + 1 + 5 * iy
for iy in [1,2,3]:
ix = 4
loc = ix + 5*iy
P[loc][actions['north']] = ix + 5 * (iy + 1)
P[loc][actions['south']] = ix + 5 * (iy - 1)
P[loc][actions['west']] = ix - 1 + 5 * iy
    # The four corner cells
P[0][actions['east']] = 1
P[0][actions['north']] = 5
P[20][actions['south']] = 15
P[20][actions['east']] = 21
P[24][actions['west']] = 23
P[24][actions['south']] = 19
P[4][actions['north']] = 9
P[4][actions['west']] = 3
    # Any move that would leave the grid keeps the agent in place
for i in range(P.shape[0]):
for j in range(P.shape[1]):
if P[i][j] == -1:
P[i][j] = i
self.t = P
    # The initial state is 0
self.state = 0
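As a quick sanity check of the tables built above (a hypothetical check, assuming the fragments shown here are assembled into the migong_game class exactly as in the complete listing at the end of the post):

env = migong_game()
print(env.t[13][env.actions['east']])       # 14.0: moving east from cell 13 reaches the exit
print(env.reward[13][env.actions['east']])  # 100.0: reward for entering the exit
print(env.t[0][env.actions['west']])        # 0.0: moving off the grid keeps the agent in place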
The reset() function resets the current state back to state 0, the starting point:
def reset(self):
# self.state = int(random.random()*len(self.states))
self.state = 0
return self.state
The step() function takes the current state $s_t$ and an action $a_t$, and returns the next state $s_{t+1}$, the reward $r_t$, and a flag is_done indicating whether the episode has terminated:
def step(self,action):
    state = self.state
    if state == 14:
        return state,100,True,{}
    obstacle = [2,3,4,10,11,18,23]
    if state in obstacle:
        return state,-100,True,{}
    next_state = self.t[state][self.actions[action]]
    if (next_state in obstacle)or(next_state == 14):
        is_done = True  # the episode ends once the agent steps into an obstacle or the exit
    else:
        is_done = False
    self.state = next_state
    r = self.reward[state][self.actions[action]]
    return next_state,r,is_done,{}
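A single interaction with the environment then looks like this (again assuming the assembled migong_game class):

env = migong_game()
env.reset()
next_state, r, is_done, _ = env.step('north')
print(next_state, r, is_done)   # 5.0 -1.0 False: one step up from the start costs -1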
The prob() function is used to obtain the transition-probability tensor of the form
$$P_{ss'}^{a}=\Pr[\,s_{t+1}=s' \mid s_t=s,\ a_t=a\,]$$
def prob(self):
P = np.zeros((len(self.states),len(self.states),len(self.actions)))
for s in range(len(self.states)):
for a in self.actions:
self.state = s
next_state, r, is_done, _ = self.step(a)
if is_done == False:
P[s,int(next_state),self.actions[a]] = 1
self.P = P
return P
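Note that transitions ending in a terminal cell (an obstacle or the exit) are left out of P, so those rows sum to zero. A quick check (again assuming the assembled class):

env = migong_game()
P = env.prob()
print(P[0, 5, env.actions['north']])        # 1.0: state 0 moves to state 5 under 'north'
print(P[1, :, env.actions['east']].sum())   # 0.0: moving east from 1 enters obstacle 2, so nothing is stored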
The optimal policy is then obtained by repeatedly applying the following value-iteration update:
$$v_{k+1}(s)\leftarrow\max_{a}\Big\{R_{s}^{a}+\gamma\sum_{s'}P_{ss'}^{a}\,v_k(s')\Big\}$$
$$\pi(\cdot\mid s)\leftarrow\arg\max_{a}\Big\{R_{s}^{a}+\gamma\sum_{s'}P_{ss'}^{a}\,v_k(s')\Big\},\qquad \forall s\in S$$
The iteration stops once the condition $\|\mathbf v_{k+1}-\mathbf v_k\|<\varepsilon$ is satisfied.
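Before looking at the loop-based implementation below, note that one synchronous backup of this update can also be written compactly with NumPy (a minimal sketch; bellman_backup is an illustrative helper, not part of the original class; reward has shape (25, 4) and P has shape (25, 25, 4) as produced by prob()):

import numpy as np

def bellman_backup(v, reward, P, gamma):
    # q[s, a] = R(s, a) + gamma * sum_{s'} P[s, s', a] * v[s']
    q = reward + gamma * np.einsum('ska,k->sa', P, v)
    return q.max(axis=1), q.argmax(axis=1)   # updated value function and greedy policy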
# Value iteration
def iterate_value(self):
    v1 = np.zeros(len(self.states))
    v = np.zeros(len(self.states))
    pi = np.zeros(len(self.states))
    v_iter = np.zeros(self.epsiodes)
    for epsiode in range(self.epsiodes):
        for state in self.states:
            action_array = np.zeros(len(self.actions))
            for action in self.actions:
                temp_sum = 0
                for next_state in self.states:
                    temp_sum += v[next_state]*self.P[state,next_state,self.actions[action]]
                action_array[self.actions[action]] = self.reward[state,self.actions[action]] + self.gamma*temp_sum
            v1[state] = np.max(action_array)
            pi[state] = np.argmax(action_array)
        if np.linalg.norm(v - v1) <= self.epslion:
            v_iter[epsiode] = np.linalg.norm(v - v1)
            break
        else:
            v = v1.copy()  # copy, otherwise v and v1 alias the same array and the convergence test is trivially met
    self.pi = pi
    return pi,v_iter,epsiode
The trajectory is then read off from the resulting policy:
def get_trad(self):
road_key = {0:'north',1:'south',2:'east',3:'west'}
    # Reset to the initial position
self.reset()
state = self.state
road = [self.state]
terminate_state = [2,3,4,10,11,18,23,14]
while True:
action_index = self.pi[int(state)]
if road_key[action_index] == 'north':
state = state + 5
elif road_key[action_index] == 'south':
state = state - 5
elif road_key[action_index] == 'east':
state = state + 1
elif road_key[action_index] == 'west':
state = state - 1
road.append(state)
if state in terminate_state:
break
return road
The problem is solved by running the following main block:
if __name__ == "__main__":
    migong = migong_game(gamma=0.8)
    # reset the environment
    migong.reset()
    # build the transition tensor P
    migong.prob()
    # run value iteration
    pi,v_iter,epsiode = migong.iterate_value()
    road = migong.get_trad()
    print("pi:")
    print(pi)
    print("Number of iterations:")
    print(epsiode)
    print('Trajectory:')
    print(road)
pi:
[0. 0. 0. 0. 0. 2. 2. 0. 0. 0. 0. 0. 2. 2. 0. 0. 0. 1. 1. 1. 0. 0. 1. 0.
1.]
Number of iterations:
1
Trajectory:
[0, 5, 6, 7, 12, 13, 14]
Visualizing the trajectory on the maze confirms that it is indeed a shortest path.
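A minimal matplotlib sketch of such a figure (the plot_maze helper is illustrative and not part of the original code; it assumes the 5x5 layout, obstacle list, and exit cell used above):

import numpy as np
import matplotlib.pyplot as plt

def plot_maze(road, obstacles=(2, 3, 4, 10, 11, 18, 23), goal=14, n=5):
    grid = np.zeros((n, n))                  # 0 = free (white) cell
    for s in obstacles:
        grid[s // n, s % n] = 1              # 1 = obstacle (black) cell
    plt.imshow(1 - grid, cmap='gray', origin='lower', vmin=0, vmax=1)
    plt.plot([s % n for s in road], [s // n for s in road], 'r-o')   # trajectory
    plt.scatter(goal % n, goal // n, marker='*', s=200, c='g')       # exit cell
    plt.xticks(range(n)); plt.yticks(range(n))
    plt.show()

plot_maze([0, 5, 6, 7, 12, 13, 14])

For reference, the complete code of the example is listed below.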
import numpy as np
import random
class migong_game():
def __init__(self,epsiodes=100,gamma=1.0,epslion=1e-5):
self.epsiodes = epsiodes
self.gamma = gamma
self.epslion = epslion
self.states = np.arange(0,25)
actions = {'north': 0, 'south': 1, 'east': 2, 'west': 3}
self.actions = actions
self.terminate_states = [2,3,4,10,11,18,23,14]
        # Define the reward function R(s, a)
reward = -1*np.ones((25,4))
        # Cell 2 is an obstacle
reward[1][actions['east']] = -100
reward[7][actions['south']] = -100
reward[3][actions['west']] = -100
        # Cell 3 is an obstacle
reward[2][actions['east']] = -100
reward[8][actions['south']] = -100
reward[4][actions['west']] = -100
        # Cell 4 is an obstacle
reward[3][actions['east']] = -100
reward[9][actions['south']] = -100
        # Cell 10 is an obstacle
reward[5][actions['north']] = -100
reward[11][actions['west']] = -100
reward[15][actions['south']] = -100
        # Cell 11 is an obstacle
reward[6][actions['north']] = -100
reward[12][actions['west']] = -100
reward[16][actions['south']] = -100
reward[10][actions['east']] = -100
        # Cell 18 is an obstacle
reward[17][actions['east']] = -100
reward[19][actions['west']] = -100
reward[13][actions['north']] = -100
reward[23][actions['south']] = -100
        # Cell 23 is an obstacle
reward[22][actions['east']] = -100
reward[24][actions['west']] = -100
reward[18][actions['north']] = -100
        # Entering cell 14 (the exit) gives a positive reward
reward[13][actions['east']] = 100
reward[9][actions['north']] = 100
reward[19][actions['south']] = 100
self.reward = reward
        # Define the state-transition table P(s, a); the location array records the grid numbering (top row 20-24, bottom row 0-4)
location = np.array([[20,21,22,23,24],
[15,16,17,18,19],
[10,11,12,13,14],
[5,6,7,8,9],
[0,1,2,3,4]])
P = -1*np.ones((25,4))
for ix in [1,2,3]:
for iy in [1,2,3]:
loc = ix + 5*iy
P[loc][actions['north']] = ix + 5*(iy + 1)
P[loc][actions['south']] = ix + 5*(iy - 1)
P[loc][actions['east']] = ix + 1 + 5*iy
P[loc][actions['west']] = ix - 1+5*iy
        # The four (non-corner) boundary edges
for ix in [1,2,3]:
iy = 4
loc = ix + 5*iy
P[loc][actions['south']] = ix + 5 * (iy - 1)
P[loc][actions['east']] = ix + 1 + 5 * iy
P[loc][actions['west']] = ix - 1 + 5 * iy
for ix in [1,2,3]:
iy = 0
loc = ix + 5*iy
P[loc][actions['north']] = ix + 5 * (iy + 1)
P[loc][actions['east']] = ix + 1 + 5 * iy
P[loc][actions['west']] = ix - 1 + 5 * iy
for iy in [1,2,3]:
ix = 0
loc = ix + 5*iy
P[loc][actions['north']] = ix + 5 * (iy + 1)
P[loc][actions['south']] = ix + 5 * (iy - 1)
P[loc][actions['east']] = ix + 1 + 5 * iy
for iy in [1,2,3]:
ix = 4
loc = ix + 5*iy
P[loc][actions['north']] = ix + 5 * (iy + 1)
P[loc][actions['south']] = ix + 5 * (iy - 1)
P[loc][actions['west']] = ix - 1 + 5 * iy
        # The four corner cells
P[0][actions['east']] = 1
P[0][actions['north']] = 5
P[20][actions['south']] = 15
P[20][actions['east']] = 21
P[24][actions['west']] = 23
P[24][actions['south']] = 19
P[4][actions['north']] = 9
P[4][actions['west']] = 3
        # Any move that would leave the grid keeps the agent in place
for i in range(P.shape[0]):
for j in range(P.shape[1]):
if P[i][j] == -1:
P[i][j] = i
self.t = P
        # The initial state is 0
self.state = 0
    # Reset the environment to the initial state
def reset(self):
# self.state = int(random.random()*len(self.states))
self.state = 0
return self.state
    # Apply one action
    def step(self,action):
        state = self.state
        if state == 14:
            return state,100,True,{}
        obstacle = [2,3,4,10,11,18,23]
        if state in obstacle:
            return state,-100,True,{}
        next_state = self.t[state][self.actions[action]]
        if (next_state in obstacle)or(next_state == 14):
            is_done = True  # the episode ends once the agent steps into an obstacle or the exit
        else:
            is_done = False
        self.state = next_state
        r = self.reward[state][self.actions[action]]
        return next_state,r,is_done,{}
def prob(self):
P = np.zeros((len(self.states),len(self.states),len(self.actions)))
for s in range(len(self.states)):
for a in self.actions:
self.state = s
next_state, r, is_done, _ = self.step(a)
if is_done == False:
P[s,int(next_state),self.actions[a]] = 1
self.P = P
return P
    # Value iteration
    def iterate_value(self):
        v1 = np.zeros(len(self.states))
        v = np.zeros(len(self.states))
        pi = np.zeros(len(self.states))
        v_iter = np.zeros(self.epsiodes)
        for epsiode in range(self.epsiodes):
            for state in self.states:
                action_array = np.zeros(len(self.actions))
                for action in self.actions:
                    temp_sum = 0
                    for next_state in self.states:
                        temp_sum += v[next_state]*self.P[state,next_state,self.actions[action]]
                    action_array[self.actions[action]] = self.reward[state,self.actions[action]] + self.gamma*temp_sum
                v1[state] = np.max(action_array)
                pi[state] = np.argmax(action_array)
            if np.linalg.norm(v - v1) <= self.epslion:
                v_iter[epsiode] = np.linalg.norm(v - v1)
                break
            else:
                v = v1.copy()  # copy, otherwise v and v1 alias the same array and the convergence test is trivially met
        self.pi = pi
        return pi,v_iter,epsiode
    # Extract the trajectory from the policy
def get_trad(self):
road_key = {0:'north',1:'south',2:'east',3:'west'}
        # Reset to the initial position
self.reset()
state = self.state
road = [self.state]
terminate_state = [2,3,4,10,11,18,23,14]
while True:
action_index = self.pi[int(state)]
if road_key[action_index] == 'north':
state = state + 5
elif road_key[action_index] == 'south':
state = state - 5
elif road_key[action_index] == 'east':
state = state + 1
elif road_key[action_index] == 'west':
state = state - 1
road.append(state)
if state in terminate_state:
break
return road
if __name__ == "__main__":
    migong = migong_game(gamma=0.8)
    # reset the environment
    migong.reset()
    # build the transition tensor P
    migong.prob()
    # run value iteration
    pi,v_iter,epsiode = migong.iterate_value()
    road = migong.get_trad()
    print("pi:")
    print(pi)
    print("Number of iterations:")
    print(epsiode)
    print('Trajectory:')
    print(road)