这是一种用于评估多臂赌博机问题策略的方法[1]。测试平台的运作方式如下:
- 生成与 10 个动作 $a$ 相关联的 10 个奖励均值 $\mu_a \sim \mathcal{N}(0, 1)$
- 在每次迭代中,允许代理采取某个动作 $a$,并接收奖励 $r \sim \mathcal{N}(\mu_a, 1)$。
我们对 100 组随机采样的 $\mu_a$ 重复此过程。代理的目标是最大化平均奖励。理想情况下,它应该学会哪个动作具有最高的均值,并从该动作中采样。
策略一:$\epsilon$-贪婪
class GreedyEpsilon:
    """Epsilon-greedy agent using sample-average action-value estimates.

    On each step the agent explores (uniformly random action) with
    probability ``eps`` and otherwise exploits the action whose current
    estimate is highest.

    Args:
        n_actions: Number of available actions.
        eps: Probability of taking a random action on a given step.
        reward_fn: Callable taking an action index and returning its reward,
            either as a scalar or as a single-element array-like.
        bias: Initial value assigned to every action-value estimate.
    """

    def __init__(self, n_actions, eps, reward_fn, bias=0.0):
        self.n_actions = n_actions
        # Force a float array: with an integer ``bias`` a dtype-inferred array
        # would be integer and every incremental update would be truncated.
        self.Q = np.full(n_actions, bias, dtype=float)
        self.n_moves = np.zeros((n_actions,))
        self.eps = eps
        self.reward_fn = reward_fn
        self.total_reward = 0.0

    def step(self):
        """Take one action, update its estimate, and return the raw reward.

        Raises:
            ValueError: If ``reward_fn`` returns more than one value.
        """
        if np.random.rand() < self.eps:
            action = np.random.randint(0, self.n_actions)
        else:
            action = np.argmax(self.Q)
        reward = self.reward_fn(action)
        # Reduce a single-element array reward to a plain float so the
        # bookkeeping below never assigns an array into a scalar slot.
        value = np.asarray(reward, dtype=float).item()
        self.n_moves[action] += 1
        # Incremental mean: Q <- Q + (r - Q) / n
        self.Q[action] += 1.0 / self.n_moves[action] * (value - self.Q[action])
        self.total_reward += value
        return reward
策略二:欺骗系统
按照规则,每一步本应只能采样一个动作;但为了展示性能的上限,该模型在每一步都对全部动作各采样一次,并取其中最大的奖励。
class CheatingModel:
    """Upper-bound baseline that samples every action on each step.

    The constructor signature mirrors ``GreedyEpsilon``; ``eps`` and ``bias``
    are accepted but not used.

    Args:
        n_actions: Number of available actions.
        eps: Unused.
        reward_fn: Callable taking an action index and returning its reward.
        bias: Unused.
    """

    def __init__(self, n_actions, eps, reward_fn, bias=0.0):
        self.total_reward = 0.0
        self.reward_fn = reward_fn
        self.n_actions = n_actions

    def step(self):
        """Sample all actions once, keep the largest reward, and return it."""
        best = max(map(self.reward_fn, range(self.n_actions)))
        self.total_reward += best
        return best
结果
Loading...
完整代码 (matplotlib 版本)
import numpy as np
from tqdm import trange
import matplotlib.pyplot as plt
def normal_reward(action, action_to_reward_mu, reward_std, n_samples):
    """Draw rewards for *action* from a normal distribution.

    Args:
        action: Index into ``action_to_reward_mu``.
        action_to_reward_mu: Indexable collection of per-action reward means.
        reward_std: Standard deviation of the reward distribution.
        n_samples: Size argument forwarded to ``np.random.normal``.

    Returns:
        The samples produced by ``np.random.normal``.
    """
    mean = action_to_reward_mu[action]
    return np.random.normal(loc=mean, scale=reward_std, size=n_samples)
class CheatingModel:
    """Upper-bound baseline that samples every action on each step.

    The constructor signature mirrors ``GreedyEpsilon``; ``eps`` and ``bias``
    are accepted but not used.

    Args:
        n_actions: Number of available actions.
        eps: Unused.
        reward_fn: Callable taking an action index and returning its reward.
        bias: Unused.
    """

    def __init__(self, n_actions, eps, reward_fn, bias=0.0):
        self.total_reward = 0.0
        self.reward_fn = reward_fn
        self.n_actions = n_actions

    def step(self):
        """Sample all actions once, keep the largest reward, and return it."""
        best = max(map(self.reward_fn, range(self.n_actions)))
        self.total_reward += best
        return best
class GreedyEpsilon:
    """Epsilon-greedy agent using sample-average action-value estimates.

    On each step the agent explores (uniformly random action) with
    probability ``eps`` and otherwise exploits the action whose current
    estimate is highest.

    Args:
        n_actions: Number of available actions.
        eps: Probability of taking a random action on a given step.
        reward_fn: Callable taking an action index and returning its reward,
            either as a scalar or as a single-element array-like.
        bias: Initial value assigned to every action-value estimate.
    """

    def __init__(self, n_actions, eps, reward_fn, bias=0.0):
        self.n_actions = n_actions
        # Force a float array: with an integer ``bias`` a dtype-inferred array
        # would be integer and every incremental update would be truncated.
        self.Q = np.full(n_actions, bias, dtype=float)
        self.n_moves = np.zeros((n_actions,))
        self.eps = eps
        self.reward_fn = reward_fn
        self.total_reward = 0.0

    def step(self):
        """Take one action, update its estimate, and return the raw reward.

        Raises:
            ValueError: If ``reward_fn`` returns more than one value.
        """
        if np.random.rand() < self.eps:
            action = np.random.randint(0, self.n_actions)
        else:
            action = np.argmax(self.Q)
        reward = self.reward_fn(action)
        # Reduce a single-element array reward to a plain float so the
        # bookkeeping below never assigns an array into a scalar slot.
        value = np.asarray(reward, dtype=float).item()
        self.n_moves[action] += 1
        # Incremental mean: Q <- Q + (r - Q) / n
        self.Q[action] += 1.0 / self.n_moves[action] * (value - self.Q[action])
        self.total_reward += value
        return reward
def _running_average_trace(model, n_steps):
    """Step *model* ``n_steps`` times and return its running mean reward per step."""
    trace = np.empty(n_steps)
    for n in range(n_steps):
        model.step()
        # total_reward may be a 1-element array when the reward function
        # returns one; reduce it to a scalar before storing.
        total = np.asarray(model.total_reward, dtype=float).item()
        trace[n] = total / (n + 1)
    return trace


def main():
    """Run the bandit testbed and plot average-reward curves.

    For each of ``N_DISTRIBUTIONS`` randomly drawn sets of action means, runs
    epsilon-greedy agents over several epsilon values, the cheating baseline,
    and epsilon=0.01 agents over several initial biases. The running-average
    reward curves are averaged across distributions and shown in two
    side-by-side subplots.
    """
    N_ACTIONS = 10
    N_DISTRIBUTIONS = 100
    reward_std = 1.0
    n_steps = 2000

    epsilon_values = [0.0, 0.01, 0.1, 0.2]
    avg_rewards = {epsilon: np.zeros((n_steps,)) for epsilon in epsilon_values}
    avg_rewards["cheating"] = np.zeros((n_steps,))

    bias_values = [0.0, 0.5, 1.0]
    avg_rewards_bias = {bias: np.zeros((n_steps,)) for bias in bias_values}

    for _ in trange(N_DISTRIBUTIONS):
        action_to_reward_mu = np.random.normal(0, 1, (N_ACTIONS,))

        # Bind the current means as a default so the reward function is tied
        # to this distribution rather than to the loop variable.
        def reward_fn(a, mu=action_to_reward_mu):
            return normal_reward(a, mu, reward_std, 1)

        for epsilon in epsilon_values:
            model = GreedyEpsilon(N_ACTIONS, epsilon, reward_fn)
            avg_rewards[epsilon] += _running_average_trace(model, n_steps)

        cheating_model = CheatingModel(N_ACTIONS, 0, reward_fn)
        avg_rewards["cheating"] += _running_average_trace(cheating_model, n_steps)

        for bias in bias_values:
            biased_model = GreedyEpsilon(N_ACTIONS, 0.01, reward_fn, bias=bias)
            avg_rewards_bias[bias] += _running_average_trace(biased_model, n_steps)

    # Normalise exactly once, after every distribution has been accumulated.
    # Dividing inside the loop would shrink earlier contributions again on
    # every later iteration.
    for curve in avg_rewards.values():
        curve /= N_DISTRIBUTIONS
    for curve in avg_rewards_bias.values():
        curve /= N_DISTRIBUTIONS

    plt.figure(figsize=(12, 6))

    plt.subplot(1, 2, 1)
    for epsilon in epsilon_values:
        plt.plot(avg_rewards[epsilon], label=f"epsilon={epsilon}")
    plt.plot(avg_rewards["cheating"], label="作弊模型", linestyle="--")
    plt.legend()
    plt.yscale("log")
    plt.title("Epsilon 结果")

    plt.subplot(1, 2, 2)
    for bias in bias_values:
        plt.plot(avg_rewards_bias[bias], label=f"bias={bias}")
    plt.plot(avg_rewards["cheating"], label="作弊模型", linestyle="--")
    plt.legend()
    plt.yscale("log")
    plt.title("Epsilon=0.01 时的 Bias 结果")

    plt.tight_layout()
    plt.show()


# Run the experiment only when this file is executed as a script.
if __name__ == "__main__":
    main()
1. 《强化学习》,Sutton 等人著 ↩︎