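% A simple example of reinforcement learning with Q-learning.
% An agent "o" starts at the left end of a one-dimensional world; the treasure is at the far right.
% Watch how the agent improves its strategy for finding the treasure.
% The versions available online are all written in Python; this one is implemented in MATLAB.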
% Reset the random number generator to its default settings so that every
% run of this script draws the same sequence of random numbers.
rng('default');

% Settings shared with the local functions below through global variables.
global N_STATES ACTIONS
global EPSILON ALPHA GAMMA
global MAX_EPISODES FRESH_TIME

N_STATES     = 6;     % length of the one-dimensional world
ACTIONS      = [1 2]; % available actions: 1 = "left", 2 = "right"
EPSILON      = 0.9;   % probability of taking the greedy (best-known) action
ALPHA        = 0.1;   % learning rate
GAMMA        = 0.9;   % discount factor
MAX_EPISODES = 13;    % number of training episodes
FRESH_TIME   = 0.05;  % pause between redraws of the world, in seconds

% Train, print the learned table, and store it on disk.
q_table = rl();
disp('Q-table:')
disp(q_table)
save('q_table.mat','q_table')
function table=build_q_table(n_states, actions)
% BUILD_Q_TABLE  Create an all-zero Q-table.
%   table = BUILD_Q_TABLE(n_states, actions) returns a matrix with n_states
%   rows and one column per entry of the actions vector, filled with zeros.
    n_actions = length(actions);
    table = zeros(n_states, n_actions);
end
function action_name=choose_action(state, q_table)
% CHOOSE_ACTION  Select an action for a state with an epsilon-greedy rule.
%   action_name = CHOOSE_ACTION(state, q_table) reads row `state` of
%   q_table and returns one element of the global ACTIONS vector.
%
%   Inputs:
%     state   - row index into q_table
%     q_table - matrix of Q-values, one row per state and one column per
%               action
%   Output:
%     action_name - the chosen entry of ACTIONS
%
%   Uses the globals ACTIONS (candidate actions) and EPSILON (probability
%   of acting greedily).
    global ACTIONS EPSILON
    state_actions = q_table(state, :); % Q-values of every action in this state
    if (rand() > EPSILON) || (all(state_actions == 0))
        % Explore: either the random draw exceeded EPSILON or nothing has
        % been learned for this state yet (all Q-values are still zero).
        % Sample uniformly over however many actions are configured instead
        % of assuming there are exactly two.
        action_name = ACTIONS(randi(numel(ACTIONS)));
    else
        % Exploit: take the action with the largest Q-value; max returns
        % the first index when several values tie.
        [~, I] = max(state_actions);
        action_name = ACTIONS(I);
    end
end
function [S_, R]=get_env_feedback(S, A)
% GET_ENV_FEEDBACK  Apply an action to the environment.
%   [S_, R] = GET_ENV_FEEDBACK(S, A) returns the next state S_ and the
%   reward R obtained by taking action A in state S.
%
%   A == 2 moves right; any other value moves left. Moving right from
%   state N_STATES - 1 ends the episode: S_ is set to -1 and R to 1.
%   Moving left from state 1 leaves the agent where it is. Every other
%   move yields a reward of 0.
    global N_STATES
    R = 0; % only the terminating move overrides this
    if A == 2
        % Step to the right.
        S_ = S + 1;
        if S == N_STATES - 1
            % That step reaches the goal: signal termination and pay out.
            S_ = -1;
            R = 1;
        end
    elseif S == 1
        % Step to the left is blocked at the left edge.
        S_ = S;
    else
        % Ordinary step to the left.
        S_ = S - 1;
    end
end
function update_env(S, episode, step_counter)
% UPDATE_ENV  Print the current state of the one-dimensional world.
%   UPDATE_ENV(S, episode, step_counter) displays either the world with
%   the agent drawn as 'o' at position S, or, when S is -1 (episode
%   finished), a summary line with the episode number and step count.
%   It pauses after printing: FRESH_TIME seconds for a normal frame and
%   twice that after the summary line.
    global N_STATES FRESH_TIME
    % The world is a row of '-' cells with the treasure 'T' in the last one.
    track = [repmat('-',1,(N_STATES - 1)) 'T'];
    if S == -1
        % Episode over: report how many steps it took.
        disp(['Episode ' num2str(episode) ': total_steps = ' num2str(step_counter)])
        pause(FRESH_TIME*2)
        return
    end
    % Episode still running: overwrite the agent's cell and show the row.
    track(S) = 'o';
    disp(track)
    pause(FRESH_TIME)
end
function q_table=rl()
global N_STATES ACTIONS ALPHA GAMMA MAX_EPISODES
% 强化学习主循环
q_table = build_q_table(N_STATES, ACTIONS);
for episode =1:MAX_EPISODES
step_counter = 0;
S = 1;
is_terminated = false;
update_env(S, episode, step_counter);
while ~is_terminated
A = choose_action(S, q_table);
[S_, R] = get_env_feedback(S, A) ; % 采取动作获得状态和奖励
q_predict = q_table(S, A);
if S_ ~= -1
q_target = R + GAMMA * max(q_table(S_, :)); % 没有结束
else
q_target = R; % 一局结束了
is_terminated = true; % 更新标记
end
q_table(S, A) = q_table(S, A) + ALPHA * (q_target - q_predict); % 更新
S = S_ ; % 进行下一步
update_env(S, episode, step_counter + 1)
step_counter = step_counter +1;
end
end