I'm trying to get an actor-critic variant running on Pendulum, but I seem to have run into a peculiar problem.
RuntimeError: Found dtype Double but expected Float
I've seen this error plenty of times before, so I've been through the usual steps and tried changing the dtype of my loss (those attempts are kept in the comments), but it still doesn't work. Can anyone point out how to fix this so I can learn from it?
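For reference, my understanding is that this message shows up when a float32 tensor and a float64 (double) tensor meet in the backward pass. A minimal sketch, separate from my actual code below, that raises the same error:

import torch
import torch.nn.functional as F

pred = torch.zeros(3, requires_grad=True)     # default dtype is float32
target = torch.zeros(3, dtype=torch.float64)  # double

loss = F.mse_loss(pred, target)  # forward works via type promotion
loss.backward()                  # RuntimeError: Found dtype Double but expected Float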
The full code is below:
import gym, os
import numpy as np
from itertools import count
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Normal
from collections import namedtuple
SavedAction = namedtuple('SavedAction', ['log_prob', 'value'])
LOG_SIG_MAX = 2
LOG_SIG_MIN = -20
class ActorCritic(nn.Module):
"""
Implementing both heads of the actor critic model
"""
def __init__(self, state_space, action_space):
super(ActorCritic, self).__init__()
self.state_space = state_space
self.action_space = action_space
# HL 1
self.linear1 = nn.Linear(self.state_space, 128)
# HL 2
self.linear2 = nn.Linear(128, 256)
# Outputs
self.critic_head = nn.Linear(256, 1)
self.action_mean = nn.Linear(256, self.action_space)
self.action_std = nn.Linear(256, self.action_space)
# Saving
self.saved_actions = []
self.rewards = []
# Optimizer
self.optimizer = optim.Adam(self.parameters(), lr = 1e-3)
self.eps = np.finfo(np.float32).eps.item()
    def forward(self, state):
        """
        Forward pass for both actor and critic
        """
        # State to Layer 1
        l1_output = F.relu(self.linear1(state))
        # Layer 1 to Layer 2
        l2_output = F.relu(self.linear2(l1_output))
        # Layer 2 to Action
        mean = self.action_mean(l2_output)
        std = self.action_std(l2_output)
        std = torch.clamp(std, min=LOG_SIG_MIN, max=LOG_SIG_MAX)
        std = std.exp()
        # Layer 2 to Value
        value_est = self.critic_head(l2_output)
        return value_est, mean, std
    def select_action(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0)
        value_est, mean, std = self.forward(state)
        value_est = value_est.reshape(-1)
        # Make prob Normal dist
        dist = Normal(mean, std)
        action = dist.sample()
        action = torch.tanh(action)
        ln_prob = dist.log_prob(action)
        ln_prob = ln_prob.sum()
        self.saved_actions.append(SavedAction(ln_prob, value_est))
        action = action.numpy()
        return action[0]
    def compute_returns(self, gamma):  # This is the error causing code
        """
        Calculate losses and do backprop
        """
        R = 0
        saved_actions = self.saved_actions
        policy_losses = []
        value_losses = []
        returns = []
        for r in self.rewards[::-1]:
            # Discount value
            R = r + gamma*R
            returns.insert(0, R)
        returns = torch.tensor(returns)
        returns = (returns - returns.mean())/(returns.std() + self.eps)
        for (log_prob, value), R in zip(saved_actions, returns):
            advantage = R - value.item()
            advantage = advantage.type(torch.FloatTensor)
            policy_losses.append(-log_prob*advantage)
            value_losses.append(F.mse_loss(value, torch.tensor([R])))
        self.optimizer.zero_grad()
        loss = torch.stack(policy_losses).sum() + torch.stack(value_losses).sum()
        loss = loss.type(torch.FloatTensor)
        loss.backward()
        self.optimizer.step()
        del self.rewards[:]
        del self.saved_actions[:]
env = gym.make("Pendulum-v0")
state_space = env.observation_space.shape[0]
action_space = env.action_space.shape[0]
# Train Expert AC
model = ActorCritic(state_space, action_space)
train = True
if train == True:
    # Main loop
    window = 50
    reward_history = []
    for ep in count():
        state = env.reset()
        ep_reward = 0
        for t in range(1, 1000):
            if ep%50 == 0:
                env.render()
            action = model.select_action(state)
            state, reward, done, _ = env.step(action)
            model.rewards.append(reward)
            ep_reward += reward
            if done:
                break
        print(reward)
        model.compute_returns(0.99)  # Error begins here
        reward_history.append(ep_reward)
        # Result information
        if ep % 50 == 0:
            mean = np.mean(reward_history[-window:])
            print(f"Episode: {ep} Last Reward: {ep_reward} Rolling Mean: {mean}")
        if np.mean(reward_history[-100:]) > 199:
            print(f"Environment solved at episode {ep}, average run length > 200")
            break
The full error log, with some elements modified for privacy. Originally the loop, the actor, the critic, and the main loop were in separate files; comments have been added at the relevant error-causing lines.
Traceback (most recent call last):
File "pendulum.py", line 59, in <module>
model.compute_returns(0.99)
File "/home/x/Software/git/x/x/solvers/actorcritic_cont.py", line 121, in compute_returns
loss.backward()
File "/home/x/.local/lib/python3.8/site-packages/torch/_tensor.py", line 255, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
File "/home/x/.local/lib/python3.8/site-packages/torch/autograd/__init__.py", line 147, in backward
Variable._execution_engine.run_backward(
RuntimeError: Found dtype Double but expected Float
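In case it is useful, this is the kind of check I would drop into compute_returns right before the mse_loss call to see which tensor is the Double (hypothetical debugging lines, not part of the files above):

# inside the zip(saved_actions, returns) loop, just before F.mse_loss(...)
print("value dtype:    ", value.dtype)      # critic output
print("R dtype:        ", R.dtype)          # discounted return target
print("advantage dtype:", advantage.dtype)  # after the .type(torch.FloatTensor) cast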
Posted on 2022-02-23 09:45:02
Answering here in case anyone has a similar problem in the future. In OpenAI Pendulum-v0, the reward is output as a double, so when you compute the returns for the episode you need to convert them into a float tensor.
I did it like so:
returns = torch.tensor(returns)
returns = (returns - returns.mean())/(returns.std()+self.eps)
returns = returns.type(torch.FloatTensor)
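As a follow-up on where the Double actually comes from: the reward comes back from env.step() as a NumPy float64, so torch.tensor(returns) infers float64, and that dtype then flows into torch.tensor([R]) inside the mse_loss call and only errors out at backward time. An equivalent fix, sketched here as an alternative rather than the exact lines I used, is to pin the dtype when the returns tensor is first built:

# Alternative: set the dtype at construction time so everything downstream stays float32
returns = torch.tensor(returns, dtype=torch.float32)
returns = (returns - returns.mean())/(returns.std() + self.eps)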
https://stackoverflow.com/questions/71224852