我得到一个非常类似于this的错误。
我的错误如下:
-----------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-85-5a223a19e3f5> in <module>
8 save_run = 'Yes',
9 return_progress_dict = 'Yes',
---> 10 hide_text = 'No')
<ipython-input-84-023bc49b2138> in train_CNN(model, optimizer, train_dataloader, epochs, run_number, val_dataloader, save_run, return_progress_dict, hide_text)
63 print(labels[0].dtype)
64 print("------------")
---> 65 loss = F.cross_entropy(probs, labels)
66
67 total_loss += loss.item()
/usr/local/anaconda/lib/python3.6/site-packages/torch/nn/functional.py in cross_entropy(input, target, weight, size_average, ignore_index, reduce, reduction, label_smoothing)
2844 if size_average is not None or reduce is not None:
2845 reduction = _Reduction.legacy_get_string(size_average, reduce)
-> 2846 return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index, label_smoothing)
2847
2848
RuntimeError: "nll_loss_forward_reduce_cuda_kernel_2d_index" not implemented for 'Float'
我要确保我的labels
和probs
都有相同的数据类型- float
probs[0].dtype:
torch.float32
probs[0]:
tensor([-0.8244, -0.5771], device='cuda:0', grad_fn=<SelectBackward0>)
probs[0].dtype:
torch.float32
------------
labels:
tensor([0., 0., 0., 0., 0., 1., 0., 1., 0., 1., 1., 0., 0., 1., 1., 1., 1., 1.,
0., 0., 1., 1., 0., 1., 0., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0., 0.,
0., 0., 1., 0., 0., 1., 1., 0., 1., 1., 1., 1., 1., 1., 0., 1., 0., 1.,
1., 1., 0., 0., 1., 0., 1., 1., 1., 1., 0., 0., 1., 0., 1., 0., 1., 0.,
0., 1., 1., 1., 0., 1., 1., 1., 0., 1., 0., 1., 0., 1., 0., 0., 1., 0.,
1., 1., 1., 0., 0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 1., 1., 1., 0.,
1., 1., 1., 0., 1., 0., 0., 0., 0., 0., 1., 0., 1., 1., 1., 0., 0., 0.,
0., 0., 0., 1., 0., 0., 0., 1., 1., 1., 1., 1., 1., 0., 1., 1., 0., 0.,
1., 0., 0., 0., 1., 1.], device='cuda:0')
labels.dtype:
torch.float32
labels[0].dtype:
torch.float32
------------
我的职能如下
def train_CNN(model, optimizer, train_dataloader, epochs, run_number,
val_dataloader=None, save_run=None, return_progress_dict = None, hide_text = None):
# Tracking lowest validation loss
lowest_val_loss = float('inf')
if return_progress_dict == 'Yes':
progress_dict = {run_number: {'Epoch':[], 'Avg_Training_Loss':[], 'Validation_Loss':[], 'Validation_Accuracy':[]} }
# Start training loop
if hide_text != "Yes":
print("Start training...\n")
print(f"{'Epoch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")
print("-"*60)
for epoch_i in range(epochs):
# =======================================
# Training
# =======================================
# Tracking time and loss
t0_epoch = time.time()
total_loss = 0
# Put the model into the training mode
model.train()
for step, batch in enumerate(train_dataloader):
# Load batch to CPU
data, labels = tuple(t.to(device) for t in batch)
#labels = labels.type(torch.LongTensor) #23Feb2022 casting to long as per https://stackoverflow.com/questions/69742930/runtimeerror-nll-loss-forward-reduce-cuda-kernel-2d-index-not-implemented-for
#labels.to(device)
# Zero out any previously calculated gradients
optimizer.zero_grad()
# Perform a forward pass. This will return logits.
probs = model(data)
# Compute loss and accumulate the loss values
print("------------")
print ("probs[0].dtype:")
print(probs[0].dtype)
print ("probs[0]:")
print (probs[0])
#print (probs)
probs=probs.type(torch.cuda.FloatTensor)
#probs=probs.type(torch.cuda.LongTensor)
print ("probs[0].dtype:")
print(probs[0].dtype)
print("------------")
labels = labels.type(torch.cuda.FloatTensor)
#labels=labels.type(torch.cuda.LongTensor)
#labels = labels.type(torch.cuda.DoubleTensor)
#x_cuda = Variable(x, requires_grad=True).cuda()
print("labels:")
print (labels)
print("labels.dtype:")
print (labels.dtype)
print("labels[0].dtype:")
print(labels[0].dtype)
print("------------")
loss = F.cross_entropy(probs, labels)
total_loss += loss.item()
# Perform a backward pass to calculate gradients
loss.backward()
# Update parameters
optimizer.step()
有什么建议吗?我很快就会尝试给出一个可复制的例子。
##########################update 1
我试过labels=labels.type(torch.cuda.LongTensor)
。Probs
仍然是float32
,并且仍然得到错误RuntimeError: "nll_loss_forward_reduce_cuda_kernel_2d_index" not implemented for 'Int'
发布于 2022-02-24 15:56:56
labels
应该是torch.long
类型,而不是torch.float32
类型。
我认为您刚才在代码中注释掉了执行此转换的行。
https://stackoverflow.com/questions/71254428
复制相似问题