In a multi-class, single-label classification setting (logits of shape [B, C], integer targets of shape [B]), a common mistake is to run the model output through a softmax layer before handing it to CrossEntropyLoss, or to softmax manually while also enabling label smoothing. The loss still drops a little at first, but convergence soon stalls, the gradients become vanishingly small, and validation accuracy stays stuck below 70% for a long time. The self-contained script below reproduces both the broken and the correct pattern.
import torch, torch.nn as nn, torch.nn.functional as F

torch.manual_seed(0)

class MLP(nn.Module):
    def __init__(self, d_in=20, d_hidden=64, n_class=5):
        super().__init__()
        self.f = nn.Sequential(nn.Linear(d_in, d_hidden), nn.ReLU(), nn.Linear(d_hidden, n_class))

    def forward(self, x):
        return self.f(x)  # return raw logits

def make_loader(n=6000, bs=64):
    # Build a linearly separable 5-class problem (converges quickly).
    X = torch.randn(n, 20)
    w = torch.randn(20, 5); b = torch.randn(5)
    y = (X @ w + b).argmax(dim=1)
    ds = torch.utils.data.TensorDataset(X, y)
    return torch.utils.data.DataLoader(ds, batch_size=bs, shuffle=True, drop_last=True)

def train(bug=True, steps=300, smoothing=0.1):
    model = MLP()
    opt = torch.optim.AdamW(model.parameters(), lr=3e-3)
    loader = make_loader()
    it = iter(loader)
    losses, accs = [], []
    for step in range(1, steps + 1):
        try:
            x, y = next(it)
        except StopIteration:
            it = iter(loader); x, y = next(it)
        logits = model(x)  # [B, C]
        if bug:
            # Wrong: softmax first, then cross-entropy; stacking label_smoothing on top → double smoothing.
            prob = torch.softmax(logits, dim=-1)
            loss = F.cross_entropy(prob, y, label_smoothing=smoothing)
        else:
            # Correct: pass raw logits to cross-entropy; smoothing is handled inside the function.
            loss = F.cross_entropy(logits, y, label_smoothing=smoothing)
        opt.zero_grad(set_to_none=True)
        loss.backward()
        opt.step()
        with torch.no_grad():
            pred = logits.argmax(dim=1)  # use argmax/softmax only when computing metrics
            acc = (pred == y).float().mean().item()
        losses.append(loss.item()); accs.append(acc)
        if step % 50 == 0:
            # Track the gradient norm and the range of the logits.
            gnorm = 0.0
            for p in model.parameters():
                if p.grad is not None:
                    gnorm += p.grad.norm().item()
            print(f"[{'BUG' if bug else 'FIX'}] step={step:03d} loss={loss.item():.3f} acc={acc:.3f} gnorm≈{gnorm:.2f} "
                  f"logits_range=({float(logits.min()):.2f},{float(logits.max()):.2f})")

if __name__ == "__main__":
    print("== Wrong usage: softmax -> CrossEntropy ==")
    train(bug=True)
    print("\n== Correct usage: logits -> CrossEntropy ==")
    train(bug=False)
Running the script, you will see the contrast described at the top: in the BUG run the loss plateaus well above zero, the reported gnorm stays tiny, and accuracy climbs slowly, while in the FIX run the loss keeps falling and accuracy rises quickly.
Why it hurts: F.cross_entropy already applies log_softmax internally, so feeding it probabilities computes softmax(softmax(logits)). The inner softmax squeezes every value into [0, 1], so the outer softmax sees inputs spread over a range of at most 1 and outputs a nearly uniform distribution: even a perfect model cannot drive the loss to zero (with 5 classes the probability assigned to the true class is capped at e/(e+4) ≈ 0.40, so the loss floors near 0.9), and the Jacobian of the extra softmax shrinks the gradient that reaches the network, more and more as the predictions become confident. Stacking label_smoothing on top smooths the targets as well, which is the "double smoothing" flagged in the code comment.
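To make the mechanism concrete, here is a minimal sketch (my own addition, not part of the original script; batch size and logit scale are arbitrary) that contrasts the two loss paths on one batch of random logits:

import torch, torch.nn.functional as F  # same imports as the script above

logits = (torch.randn(64, 5) * 4).requires_grad_()   # reasonably spread logits
y = torch.randint(0, 5, (64,))

loss_fix = F.cross_entropy(logits, y)                 # correct: raw logits in
g_fix, = torch.autograd.grad(loss_fix, logits)

prob = torch.softmax(logits, dim=-1)                  # buggy: softmax first ...
loss_bug = F.cross_entropy(prob, y)                   # ... then log_softmax again inside
g_bug, = torch.autograd.grad(loss_bug, logits)

print("max prob after one softmax  :", prob.max().item())                      # can approach 1
print("max prob after two softmaxes:", torch.softmax(prob, -1).max().item())   # capped near e/(e+4) ≈ 0.40
print("grad norm, fix vs bug       :", g_fix.norm().item(), g_bug.norm().item())  # bug path is typically far smaller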
The recommended pattern keeps softmax entirely out of the training loss and uses it only for inference-time probabilities:

logits = model(x)                                        # [B, C]
loss = F.cross_entropy(logits, y, label_smoothing=0.1)   # optional smoothing
with torch.no_grad():
    prob = torch.softmax(logits, dim=-1)                 # probabilities only for reporting
    pred = prob.argmax(dim=-1)
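Softmax is monotonic within each row, so argmax over the logits and argmax over the probabilities always agree; the softmax in the block above is only needed when you want actual probabilities (for calibration, thresholds, logging). A quick illustrative check:

logits = torch.randn(8, 5)
assert torch.equal(logits.argmax(dim=-1), logits.softmax(dim=-1).argmax(dim=-1))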
If you genuinely need soft targets (distillation, mixup, manually smoothed labels), compute the cross-entropy against log_softmax yourself instead of feeding probabilities to F.cross_entropy:

def soft_cross_entropy(logits, soft_targets):
    # soft_targets: [B, C], each row a probability distribution over classes
    logp = F.log_softmax(logits, dim=-1)
    return -(soft_targets * logp).sum(dim=-1).mean()
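As a sanity check (my own addition, not from the article), soft_cross_entropy applied to manually smoothed one-hot targets reproduces F.cross_entropy with label_smoothing, and F.kl_div is the KLDivLoss-style alternative mentioned in the summary below:

B, C, eps = 32, 5, 0.1
logits = torch.randn(B, C)
y = torch.randint(0, C, (B,))

# Smooth the one-hot targets by hand: the true class gets 1 - eps + eps/C, the rest eps/C.
smoothed = torch.full((B, C), eps / C)
smoothed.scatter_(1, y.unsqueeze(1), 1 - eps + eps / C)

assert torch.allclose(soft_cross_entropy(logits, smoothed),
                      F.cross_entropy(logits, y, label_smoothing=eps), atol=1e-5)

# KLDivLoss form: input must be log-probabilities, target probabilities.
# It differs from the soft cross-entropy only by the (constant) target entropy, so gradients match.
loss_kl = F.kl_div(F.log_softmax(logits, dim=-1), smoothed, reduction="batchmean")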
Equivalently, F.cross_entropy is just log_softmax followed by nll_loss, which is why an extra softmax in front of it normalizes the distribution twice:

logp = F.log_softmax(logits, dim=-1)
loss = F.nll_loss(logp, y)   # equivalent to F.cross_entropy(logits, y)
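The closing summary mentions a "guard function"; its implementation is not included in this excerpt, but a minimal sketch (the name, thresholds, and placement are my own assumptions) could simply refuse tensors that already look like probabilities:

def assert_is_logits(t, dim=-1, tol=1e-4):
    # Heuristic guard (hypothetical): raise if `t` already looks like a softmax output.
    # Real logits could in principle trip this, but it reliably catches the common mistake.
    looks_like_prob = (t.min() >= 0) and (t.max() <= 1) and torch.allclose(
        t.sum(dim=dim), torch.ones_like(t.sum(dim=dim)), atol=tol)
    if looks_like_prob:
        raise ValueError("cross_entropy expects raw logits, but this tensor looks like probabilities")
    return t

loss = F.cross_entropy(assert_is_logits(logits), y, label_smoothing=0.1)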
Applying softmax by hand before the cross-entropy feels "more intuitive", but it normalizes the distribution twice during training, flattening both the class separation and the gradients. Remove softmax from the training loss path and use it only at evaluation time; for genuinely soft targets, use a soft cross-entropy or KLDivLoss. Combined with the reproducible experiment and the guard function above, this perennially common misuse can be fixed once and for all.