执行步骤与可视化结果图文并茂。场景设定网格大小5×5坐标以 1-based 书写例如左上角为 (1,1)。奖励规则出格尝试走出边界保持原地不动行为与“原地不动”同值在本例的改进里我们做了并列时偏好不动的处理。进入禁区奖励 −10。进入目标奖励 1。其余格子奖励 0。禁区坐标橙色(2,2), (2,3), (3,3), (4,2), (4,4), (5,2)目标坐标绿色(4,3)动作集合5 个上、下、左、右、原地不动。为了更直观我们提供网格渲染函数将禁区标橙色、目标标绿色并在每个格子里同时显示该格子的价值与当前策略的动作符号↑ ↓ ← → o。核心公式策略评估给定策略 π确定动作\[V_{k1}(s) R(s, a) \gamma\, V_k(s) \]其中a π(s)s是执行a后的下一状态若越界则保持原地且“原地不动”与越界同值。策略改进贪心改进\[\pi_{new}(s) \arg\max_a \Big[ R(s, a) \gamma\, V(s) \Big] \]当不同动作价值相同并列时本例的实现会偏好“原地不动”确保边界不出现“继续向外”的箭头。代码结构关键片段完整代码见本文末尾。以下为关键函数与初始化简述。初始化与奖励矩阵n 5 V0 np.zeros((n, n)) # 初始策略全部不动动作索引 4 policy0 np.full((n, n), 4, dtypeint) # 构建奖励矩阵 r禁区 -10目标 1其它 0 r np.zeros((n, n)) forbidden_positions [(2, 2), (2, 3), (3, 3), (4, 2), (4, 4), (5, 2)] goal_position (4, 3) for (ri, ci) in forbidden_positions: r[ri - 1, ci - 1] -10 r[goal_position[0] - 1, goal_position[1] - 1] 1 gamma 0.9 theta 1e-5策略评估给定策略def policy_evaluation_fixed_policy(V, policy, r, gamma0.9, theta1e-5, max_iter1000): n V.shape[0] V_new V.copy() actions [(-1, 0), (1, 0), (0, -1), (0, 1), (0, 0)] # 上、下、左、右、不动 for it in range(max_iter): delta 0 for i in range(n): for j in range(n): a_idx policy[i, j] di, dj actions[a_idx] ni, nj i di, j dj # 越界则保持原地与不动同值 if ni 0 or ni n or nj 0 or nj n: ni, nj i, j v_temp r[ni, nj] gamma * V_new[ni, nj] delta max(delta, abs(v_temp - V_new[i, j])) V_new[i, j] v_temp if delta theta: break return V_new策略改进贪心并列偏好“原地不动”def policy_improvement_fixed_values(V, r, gamma0.9): n V.shape[0] new_policy np.zeros((n, n), dtypeint) actions [(-1, 0), (1, 0), (0, -1), (0, 1), (0, 0)] for i in range(n): for j in range(n): action_values [] for a_idx, (di, dj) in enumerate(actions): ni, nj i di, j dj # 越界则保持原地与不动同值 if ni 0 or ni n or nj 0 or nj n: ni, nj i, j action_value r[ni, nj] gamma * V[ni, nj] action_values.append(action_value) # tie-breaking偏好“不动”索引 4以避免边界显示外向箭头 max_val max(action_values) best_idxs [idx for idx, val in enumerate(action_values) if val max_val] best_action 4 if 4 in best_idxs else best_idxs[0] new_policy[i, j] best_action return new_policy渲染函数图像def render_policy_value(V, policy, forbiddenNone, goalNone, filenamepolicy_value_grid.png): import matplotlib.pyplot as plt from matplotlib.table import Table n V.shape[0] fig, ax plt.subplots(figsize(8, 8)) ax.set_axis_off() tb Table(ax, bbox[0, 0, 1, 1]) actions_symbols [\u2191, \u2193, \u2190, \u2192, o] forb0 set() if forbidden is not None: forb0 {(r - 1, c - 1) for (r, c) in forbidden} goal0 None if goal is not None: goal0 (goal[0] - 1, goal[1] - 1) for i in range(n): for j in range(n): cell_text f{V[i, j]:.2f}\n{actions_symbols[policy[i, j]]} face white if (i, j) in forb0: face #FFA500 # 橙色禁区 elif goal0 is not None and (i, j) goal0: face #00AA00 # 绿色目标 tb.add_cell(i, j, 1/n, 1/n, textcell_text, loccenter, facecolorface) ax.add_table(tb) plt.title(Policy and Value Function) plt.savefig(filename) plt.close()执行流程与结果渲染初始策略与价值全部不动生成图像render_policy_value(V0, policy0, forbiddenforbidden_positions, goalgoal_position, filenameinitial_policy_value.png)图像第一次策略评估给定策略 policy0更新价值V1 policy_evaluation_fixed_policy(V0, policy0, r, gamma, theta) render_policy_value(V1, policy0, forbiddenforbidden_positions, goalgoal_position, filenamepolicy_value_after_evaluation.png)图像第一次策略改进在 V1 上贪心选动作边界并列偏好不动policy1 policy_improvement_fixed_values(V1, r, gamma) render_policy_value(V1, policy1, forbiddenforbidden_positions, goalgoal_position, filenamepolicy_after_improvement.png)图像一次性整体迭代# 迭代下去直到策略不再改变显示最终的最优策略和价值矩阵 policy_curr policy0 V_curr V0 iter_n 1 while True: iter_n 1 V_next policy_evaluation_fixed_policy(V_curr, policy_curr, r, gamma, theta) policy_next policy_improvement_fixed_values(V_next, r, gamma) if np.array_equal(policy_next, policy_curr): break V_curr, policy_curr V_next, policy_next render_policy_value(V_curr, policy_curr, forbiddenforbidden_positions, goalgoal_position, filenamefinal_policy_value.png) print(f策略迭代共进行了 {iter_n} 次得到最终最优策略和价值矩阵见 final_policy_value.png。)import numpy as np # n * n 网格世界中的策略评估示例, n 5,第一个格子表示为(1,1)最左上方。 # 设定奖励规则 ## 出格奖励-1进入禁区奖励-10进入目标奖励1其余格子奖励0 ## 禁区橙色位置为(2,2),(2,3),(3,3),(4,2),(4,4),(5,2) ## 目标位置(4,3) # 初始化参数 n 5 # 网格大小 n*n V0 np.zeros((n, n)) # 初始价值矩阵 V0全为 0 # 初始化策略为全部不移动 # 动作定义上、下、左、右、不动 # actions [(-1, 0), (1, 0), (0, -1), (0, 1), (0, 0)] # 初始策略设为“不动”对应动作索引 4 policy0 np.full((n, n), 4, dtypeint) # 根据设定生成奖励矩阵 r r np.zeros((n, n)) forbidden_positions [(2, 2), (2, 3), (3, 3), (4, 2), (4, 4), (5, 2)] goal_position (4, 3) # 将坐标从 1-based 转换为 0-based 并赋值 for (ri, ci) in forbidden_positions: r[ri - 1, ci - 1] -10 r[goal_position[0] - 1, goal_position[1] - 1] 1 # 说明出格奖励 -1 属于转移导致的外部奖励不直接体现在静态 r 矩阵中 # 若需要在转移模型中体现应在动态更新时处理。 gamma 0.9 theta 1e-5 # 收敛阈值两次迭代间最大变化小于等于 theta 时停止 # 根据当前价值固定价值矩阵找出最优策略得到新的策略移动方案和r_mat ## 只做一次策略改进 def policy_improvement_fixed_values(V, r, gamma0.9): n V.shape[0] new_policy np.zeros((n, n), dtypeint) actions [(-1, 0), (1, 0), (0, -1), (0, 1), (0, 0)] # 上、下、左、右、不动 for i in range(n): for j in range(n): action_values [] for a_idx, (di, dj) in enumerate(actions): ni, nj i di, j dj # 检查是否出格 if ni 0 or ni n or nj 0 or nj n: ni, nj i, j # 出格则不动 action_value r[ni, nj] gamma * V[ni, nj] action_values.append(action_value) # 平局时优先选择“不动”索引 4使边界显示为 o max_val max(action_values) best_idxs [idx for idx, val in enumerate(action_values) if val max_val] best_action 4 if 4 in best_idxs else best_idxs[0] new_policy[i, j] best_action return new_policy # 在确定的移动策略和初始的价值矩阵计算出新的价值矩阵 def policy_evaluation_fixed_policy(V, policy, r, gamma0.9, theta1e-5, max_iter1000): n V.shape[0] V_new V.copy() actions [(-1, 0), (1, 0), (0, -1), (0, 1), (0, 0)] # 上、下、左、右、不动 for it in range(max_iter): delta 0 for i in range(n): for j in range(n): a_idx policy[i, j] di, dj actions[a_idx] ni, nj i di, j dj # 检查是否出格 if ni 0 or ni n or nj 0 or nj n: ni, nj i, j # 出格则不动 v_temp r[ni, nj] gamma * V_new[ni, nj] delta max(delta, abs(v_temp - V_new[i, j])) V_new[i, j] v_temp if delta theta: break return V_new # 一个画图的函数可视化网格当前的之前的策略和价值矩阵找到的最优的策略和计算得到的价值估计 def render_policy_value(V, policy, forbiddenNone, goalNone, filenamepolicy_value_grid.png): import matplotlib.pyplot as plt from matplotlib.table import Table n V.shape[0] fig, ax plt.subplots(figsize(8, 8)) ax.set_axis_off() tb Table(ax, bbox[0, 0, 1, 1]) actions_symbols [↑, ↓, ←, →, o] # 上、下、左、右、不动 # 处理禁区与目标坐标输入为 1-based转换为 0-based forb0 set() if forbidden is not None: forb0 {(r - 1, c - 1) for (r, c) in forbidden} goal0 None if goal is not None: goal0 (goal[0] - 1, goal[1] - 1) for i in range(n): for j in range(n): cell_text f{V[i, j]:.2f}\n{actions_symbols[policy[i, j]]} face white if (i, j) in forb0: face #FFA500 # 橙色禁区 elif goal0 is not None and (i, j) goal0: face #00AA00 # 绿色目标 tb.add_cell(i, j, 1/n, 1/n, textcell_text, loccenter, facecolorface) ax.add_table(tb) plt.title(Policy and Value Function) plt.savefig(filename) plt.close() # 渲染初始策略和价值矩阵禁区橙色目标绿色 render_policy_value(V0, policy0, forbiddenforbidden_positions, goalgoal_position, filenameinitial_policy_and_initial_value.png) # 执行一次策略评估 V1 policy_evaluation_fixed_policy(V0, policy0, r, gamma, theta) render_policy_value(V1, policy0, forbiddenforbidden_positions, goalgoal_position, filenameinitial_policy_evaluation.png) # 执行一次策略改进 policy1 policy_improvement_fixed_values(V1, r, gamma) render_policy_value(V1, policy1, forbiddenforbidden_positions, goalgoal_position, filenamepolicy_after_first_improvement.png) # 执行一次策略评估 V2 policy_evaluation_fixed_policy(V1, policy1, r, gamma, theta) # 渲染改进后策略对应的价值矩阵 render_policy_value(V2, policy1, forbiddenforbidden_positions, goalgoal_position, filename1st_improved_policy_evaluation.png) # 迭代下去直到策略不再改变显示最终的最优策略和价值矩阵 policy_curr policy0 V_curr V0 iter_n 1 while True: iter_n 1 V_next policy_evaluation_fixed_policy(V_curr, policy_curr, r, gamma, theta) policy_next policy_improvement_fixed_values(V_next, r, gamma) if np.array_equal(policy_next, policy_curr): break V_curr, policy_curr V_next, policy_next render_policy_value(V_curr, policy_curr, forbiddenforbidden_positions, goalgoal_position, filenamefinal_policy_value.png) print(f策略迭代共进行了 {iter_n} 次得到最终最优策略和价值矩阵见 final_policy_value.png。)
网格世界策略评估与策略改进(5 动作)
发布时间:2026/5/23 12:52:39
执行步骤与可视化结果图文并茂。场景设定网格大小5×5坐标以 1-based 书写例如左上角为 (1,1)。奖励规则出格尝试走出边界保持原地不动行为与“原地不动”同值在本例的改进里我们做了并列时偏好不动的处理。进入禁区奖励 −10。进入目标奖励 1。其余格子奖励 0。禁区坐标橙色(2,2), (2,3), (3,3), (4,2), (4,4), (5,2)目标坐标绿色(4,3)动作集合5 个上、下、左、右、原地不动。为了更直观我们提供网格渲染函数将禁区标橙色、目标标绿色并在每个格子里同时显示该格子的价值与当前策略的动作符号↑ ↓ ← → o。核心公式策略评估给定策略 π确定动作\[V_{k1}(s) R(s, a) \gamma\, V_k(s) \]其中a π(s)s是执行a后的下一状态若越界则保持原地且“原地不动”与越界同值。策略改进贪心改进\[\pi_{new}(s) \arg\max_a \Big[ R(s, a) \gamma\, V(s) \Big] \]当不同动作价值相同并列时本例的实现会偏好“原地不动”确保边界不出现“继续向外”的箭头。代码结构关键片段完整代码见本文末尾。以下为关键函数与初始化简述。初始化与奖励矩阵n 5 V0 np.zeros((n, n)) # 初始策略全部不动动作索引 4 policy0 np.full((n, n), 4, dtypeint) # 构建奖励矩阵 r禁区 -10目标 1其它 0 r np.zeros((n, n)) forbidden_positions [(2, 2), (2, 3), (3, 3), (4, 2), (4, 4), (5, 2)] goal_position (4, 3) for (ri, ci) in forbidden_positions: r[ri - 1, ci - 1] -10 r[goal_position[0] - 1, goal_position[1] - 1] 1 gamma 0.9 theta 1e-5策略评估给定策略def policy_evaluation_fixed_policy(V, policy, r, gamma0.9, theta1e-5, max_iter1000): n V.shape[0] V_new V.copy() actions [(-1, 0), (1, 0), (0, -1), (0, 1), (0, 0)] # 上、下、左、右、不动 for it in range(max_iter): delta 0 for i in range(n): for j in range(n): a_idx policy[i, j] di, dj actions[a_idx] ni, nj i di, j dj # 越界则保持原地与不动同值 if ni 0 or ni n or nj 0 or nj n: ni, nj i, j v_temp r[ni, nj] gamma * V_new[ni, nj] delta max(delta, abs(v_temp - V_new[i, j])) V_new[i, j] v_temp if delta theta: break return V_new策略改进贪心并列偏好“原地不动”def policy_improvement_fixed_values(V, r, gamma0.9): n V.shape[0] new_policy np.zeros((n, n), dtypeint) actions [(-1, 0), (1, 0), (0, -1), (0, 1), (0, 0)] for i in range(n): for j in range(n): action_values [] for a_idx, (di, dj) in enumerate(actions): ni, nj i di, j dj # 越界则保持原地与不动同值 if ni 0 or ni n or nj 0 or nj n: ni, nj i, j action_value r[ni, nj] gamma * V[ni, nj] action_values.append(action_value) # tie-breaking偏好“不动”索引 4以避免边界显示外向箭头 max_val max(action_values) best_idxs [idx for idx, val in enumerate(action_values) if val max_val] best_action 4 if 4 in best_idxs else best_idxs[0] new_policy[i, j] best_action return new_policy渲染函数图像def render_policy_value(V, policy, forbiddenNone, goalNone, filenamepolicy_value_grid.png): import matplotlib.pyplot as plt from matplotlib.table import Table n V.shape[0] fig, ax plt.subplots(figsize(8, 8)) ax.set_axis_off() tb Table(ax, bbox[0, 0, 1, 1]) actions_symbols [\u2191, \u2193, \u2190, \u2192, o] forb0 set() if forbidden is not None: forb0 {(r - 1, c - 1) for (r, c) in forbidden} goal0 None if goal is not None: goal0 (goal[0] - 1, goal[1] - 1) for i in range(n): for j in range(n): cell_text f{V[i, j]:.2f}\n{actions_symbols[policy[i, j]]} face white if (i, j) in forb0: face #FFA500 # 橙色禁区 elif goal0 is not None and (i, j) goal0: face #00AA00 # 绿色目标 tb.add_cell(i, j, 1/n, 1/n, textcell_text, loccenter, facecolorface) ax.add_table(tb) plt.title(Policy and Value Function) plt.savefig(filename) plt.close()执行流程与结果渲染初始策略与价值全部不动生成图像render_policy_value(V0, policy0, forbiddenforbidden_positions, goalgoal_position, filenameinitial_policy_value.png)图像第一次策略评估给定策略 policy0更新价值V1 policy_evaluation_fixed_policy(V0, policy0, r, gamma, theta) render_policy_value(V1, policy0, forbiddenforbidden_positions, goalgoal_position, filenamepolicy_value_after_evaluation.png)图像第一次策略改进在 V1 上贪心选动作边界并列偏好不动policy1 policy_improvement_fixed_values(V1, r, gamma) render_policy_value(V1, policy1, forbiddenforbidden_positions, goalgoal_position, filenamepolicy_after_improvement.png)图像一次性整体迭代# 迭代下去直到策略不再改变显示最终的最优策略和价值矩阵 policy_curr policy0 V_curr V0 iter_n 1 while True: iter_n 1 V_next policy_evaluation_fixed_policy(V_curr, policy_curr, r, gamma, theta) policy_next policy_improvement_fixed_values(V_next, r, gamma) if np.array_equal(policy_next, policy_curr): break V_curr, policy_curr V_next, policy_next render_policy_value(V_curr, policy_curr, forbiddenforbidden_positions, goalgoal_position, filenamefinal_policy_value.png) print(f策略迭代共进行了 {iter_n} 次得到最终最优策略和价值矩阵见 final_policy_value.png。)import numpy as np # n * n 网格世界中的策略评估示例, n 5,第一个格子表示为(1,1)最左上方。 # 设定奖励规则 ## 出格奖励-1进入禁区奖励-10进入目标奖励1其余格子奖励0 ## 禁区橙色位置为(2,2),(2,3),(3,3),(4,2),(4,4),(5,2) ## 目标位置(4,3) # 初始化参数 n 5 # 网格大小 n*n V0 np.zeros((n, n)) # 初始价值矩阵 V0全为 0 # 初始化策略为全部不移动 # 动作定义上、下、左、右、不动 # actions [(-1, 0), (1, 0), (0, -1), (0, 1), (0, 0)] # 初始策略设为“不动”对应动作索引 4 policy0 np.full((n, n), 4, dtypeint) # 根据设定生成奖励矩阵 r r np.zeros((n, n)) forbidden_positions [(2, 2), (2, 3), (3, 3), (4, 2), (4, 4), (5, 2)] goal_position (4, 3) # 将坐标从 1-based 转换为 0-based 并赋值 for (ri, ci) in forbidden_positions: r[ri - 1, ci - 1] -10 r[goal_position[0] - 1, goal_position[1] - 1] 1 # 说明出格奖励 -1 属于转移导致的外部奖励不直接体现在静态 r 矩阵中 # 若需要在转移模型中体现应在动态更新时处理。 gamma 0.9 theta 1e-5 # 收敛阈值两次迭代间最大变化小于等于 theta 时停止 # 根据当前价值固定价值矩阵找出最优策略得到新的策略移动方案和r_mat ## 只做一次策略改进 def policy_improvement_fixed_values(V, r, gamma0.9): n V.shape[0] new_policy np.zeros((n, n), dtypeint) actions [(-1, 0), (1, 0), (0, -1), (0, 1), (0, 0)] # 上、下、左、右、不动 for i in range(n): for j in range(n): action_values [] for a_idx, (di, dj) in enumerate(actions): ni, nj i di, j dj # 检查是否出格 if ni 0 or ni n or nj 0 or nj n: ni, nj i, j # 出格则不动 action_value r[ni, nj] gamma * V[ni, nj] action_values.append(action_value) # 平局时优先选择“不动”索引 4使边界显示为 o max_val max(action_values) best_idxs [idx for idx, val in enumerate(action_values) if val max_val] best_action 4 if 4 in best_idxs else best_idxs[0] new_policy[i, j] best_action return new_policy # 在确定的移动策略和初始的价值矩阵计算出新的价值矩阵 def policy_evaluation_fixed_policy(V, policy, r, gamma0.9, theta1e-5, max_iter1000): n V.shape[0] V_new V.copy() actions [(-1, 0), (1, 0), (0, -1), (0, 1), (0, 0)] # 上、下、左、右、不动 for it in range(max_iter): delta 0 for i in range(n): for j in range(n): a_idx policy[i, j] di, dj actions[a_idx] ni, nj i di, j dj # 检查是否出格 if ni 0 or ni n or nj 0 or nj n: ni, nj i, j # 出格则不动 v_temp r[ni, nj] gamma * V_new[ni, nj] delta max(delta, abs(v_temp - V_new[i, j])) V_new[i, j] v_temp if delta theta: break return V_new # 一个画图的函数可视化网格当前的之前的策略和价值矩阵找到的最优的策略和计算得到的价值估计 def render_policy_value(V, policy, forbiddenNone, goalNone, filenamepolicy_value_grid.png): import matplotlib.pyplot as plt from matplotlib.table import Table n V.shape[0] fig, ax plt.subplots(figsize(8, 8)) ax.set_axis_off() tb Table(ax, bbox[0, 0, 1, 1]) actions_symbols [↑, ↓, ←, →, o] # 上、下、左、右、不动 # 处理禁区与目标坐标输入为 1-based转换为 0-based forb0 set() if forbidden is not None: forb0 {(r - 1, c - 1) for (r, c) in forbidden} goal0 None if goal is not None: goal0 (goal[0] - 1, goal[1] - 1) for i in range(n): for j in range(n): cell_text f{V[i, j]:.2f}\n{actions_symbols[policy[i, j]]} face white if (i, j) in forb0: face #FFA500 # 橙色禁区 elif goal0 is not None and (i, j) goal0: face #00AA00 # 绿色目标 tb.add_cell(i, j, 1/n, 1/n, textcell_text, loccenter, facecolorface) ax.add_table(tb) plt.title(Policy and Value Function) plt.savefig(filename) plt.close() # 渲染初始策略和价值矩阵禁区橙色目标绿色 render_policy_value(V0, policy0, forbiddenforbidden_positions, goalgoal_position, filenameinitial_policy_and_initial_value.png) # 执行一次策略评估 V1 policy_evaluation_fixed_policy(V0, policy0, r, gamma, theta) render_policy_value(V1, policy0, forbiddenforbidden_positions, goalgoal_position, filenameinitial_policy_evaluation.png) # 执行一次策略改进 policy1 policy_improvement_fixed_values(V1, r, gamma) render_policy_value(V1, policy1, forbiddenforbidden_positions, goalgoal_position, filenamepolicy_after_first_improvement.png) # 执行一次策略评估 V2 policy_evaluation_fixed_policy(V1, policy1, r, gamma, theta) # 渲染改进后策略对应的价值矩阵 render_policy_value(V2, policy1, forbiddenforbidden_positions, goalgoal_position, filename1st_improved_policy_evaluation.png) # 迭代下去直到策略不再改变显示最终的最优策略和价值矩阵 policy_curr policy0 V_curr V0 iter_n 1 while True: iter_n 1 V_next policy_evaluation_fixed_policy(V_curr, policy_curr, r, gamma, theta) policy_next policy_improvement_fixed_values(V_next, r, gamma) if np.array_equal(policy_next, policy_curr): break V_curr, policy_curr V_next, policy_next render_policy_value(V_curr, policy_curr, forbiddenforbidden_positions, goalgoal_position, filenamefinal_policy_value.png) print(f策略迭代共进行了 {iter_n} 次得到最终最优策略和价值矩阵见 final_policy_value.png。)