
In an ordinary classification model, the loss dropped normally for the first few steps, and then one of two eerie things happened: either loss/gnorm suddenly spiked and the run diverged, or GPU memory crept higher epoch after epoch until an OOM. The post-mortem showed that the training loop never cleared gradients before each optimizer step, or ran backward more than once on the same graph (with retain_graph=True on top), so gradients were quietly being stacked on top of one another.
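Both failure modes come down to the same PyTorch behavior: backward() accumulates into .grad instead of overwriting it. A minimal sketch with a bare tensor (toy values, independent of the model below) makes the mechanism visible:

import torch

w = torch.ones(3, requires_grad=True)

loss1 = (2 * w).sum()   # d(loss1)/dw = 2 for every element
loss1.backward()
print(w.grad)           # tensor([2., 2., 2.])

loss2 = (3 * w).sum()   # a fresh graph; d(loss2)/dw = 3 for every element
loss2.backward()
print(w.grad)           # tensor([5., 5., 5.])  <- 2 + 3: the old gradient was never cleared

w.grad = None           # what opt.zero_grad(set_to_none=True) does for every parameter
loss3 = (3 * w).sum()
loss3.backward()
print(w.grad)           # tensor([3., 3., 3.])  <- clean again after clearing

The accumulation itself is a feature (it is what makes multi-loss backward and gradient accumulation work at all), which is exactly why forgetting to clear it never raises an error and only shows up as the symptoms above.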
The reproduction below runs on CPU; switch between the two failure modes (and the fix) via --mode to compare them.
import argparse, torch, torch.nn as nn, torch.nn.functional as F

torch.manual_seed(0)

class TinyMLP(nn.Module):
    def __init__(self, din=32, dh=128, ncls=3):
        super().__init__()
        self.f = nn.Sequential(nn.Linear(din, dh), nn.ReLU(), nn.Linear(dh, ncls))
    def forward(self, x):
        return self.f(x)

def make_loader(n=8192, bs=128, din=32, ncls=3):
    # Synthetic data: labels come from a random linear map, so the task is learnable.
    X = torch.randn(n, din)
    W = torch.randn(din, ncls); b = torch.randn(ncls)
    y = (X @ W + b).argmax(1)
    ds = torch.utils.data.TensorDataset(X, y)
    return torch.utils.data.DataLoader(ds, batch_size=bs, shuffle=True, drop_last=True)

def run(mode="no_zero", steps=200):
    model = TinyMLP()
    opt = torch.optim.AdamW(model.parameters(), lr=3e-3)
    loader = make_loader()
    it = iter(loader)
    if mode != "no_zero":
        opt.zero_grad(set_to_none=True)
    for step in range(1, steps + 1):
        try:
            x, y = next(it)
        except StopIteration:   # restart the loader when an epoch ends
            it = iter(loader); x, y = next(it)
        logits = model(x)
        loss_a = F.cross_entropy(logits, y, label_smoothing=0.0)
        loss_b = 0.1 * logits.pow(2).mean()   # small auxiliary penalty, so we have two loss terms
        if mode == "no_zero":
            total = loss_a + loss_b
            total.backward()
            opt.step()
            # bug: no zero_grad, so every step's gradient piles onto the previous ones
        elif mode == "double_backward":
            opt.zero_grad(set_to_none=True)
            loss_a.backward(retain_graph=True)  # keeps the whole graph alive for a second pass
            loss_b.backward()
            opt.step()
        elif mode == "fix":
            opt.zero_grad(set_to_none=True)
            total = loss_a + loss_b
            total.backward()
            opt.step()
        else:
            raise ValueError(mode)
        with torch.no_grad():
            gnorm = 0.0
            for p in model.parameters():
                if p.grad is not None:
                    gnorm += p.grad.norm().item()
        acc = (logits.argmax(1) == y).float().mean().item()
        if step % 25 == 0:
            print(f"[{mode}] step={step:03d} loss={float(loss_a + loss_b):.3f} acc={acc:.3f} gnorm≈{gnorm:.2f}")
if __name__ == "__main__":
    ap = argparse.ArgumentParser()
    ap.add_argument("--mode", choices=["no_zero", "double_backward", "fix"], default="no_zero")
    args = ap.parse_args()
    run(args.mode, steps=200)

Run it once with --mode no_zero and once with --mode fix and you will see the contrast: with no_zero the printed gnorm grows from one log line to the next, because each backward adds onto gradients that were never cleared, and training drifts toward exactly the instability described at the top; with fix both loss and gnorm stay well behaved. (double_backward lands on essentially the same gradients here, since both calls sum into .grad, but it is the retain_graph habit that fed the memory growth in the original incident.) When you hit this in a real project, the quickest check is a small probe that reports whether .grad is already populated at a point where it should still be empty, for example right after zero_grad or at the top of a step:
def probe_grad(model, tag):
    with torch.no_grad():
        grads = [p.grad.norm().item() for n, p in model.named_parameters() if p.grad is not None]
        print(f"[{tag}] has_grad={len(grads) > 0} grad_mean={sum(grads) / max(1, len(grads)):.2f}")

Call it right after zero_grad (it should report has_grad=False) and again just before opt.step(); a sizeable grad_mean where you expected a clean slate is the smoking gun. The fix itself is deliberately boring: pin zero_grad at the top of every optimizer step, compute the combined loss, and call backward exactly once:
for step, (x, y) in enumerate(loader):
    opt.zero_grad(set_to_none=True)   # clearing to None saves memory and makes a forgotten clear easier to expose
    logits = model(x)
    loss = main_loss(logits, y) + aux_regularizer(logits, y)   # your task loss plus any extra terms
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)    # if needed
    opt.step()

If a step genuinely involves several loss terms, sum them into one scalar and backpropagate once instead of calling backward per term with retain_graph=True:

total_loss = loss_a + loss_b + loss_c
total_loss.backward()

And when gradients are supposed to accumulate across micro-batches, make the accumulation explicit: scale each loss by accum_steps and only step and clear every accum_steps iterations:

opt.zero_grad(set_to_none=True)
for i, (x, y) in enumerate(loader):
    loss = criterion(model(x), y) / accum_steps
    loss.backward()
    if (i + 1) % accum_steps == 0:
        opt.step(); opt.zero_grad(set_to_none=True)

Implicit gradient accumulation is one of the easiest pitfalls to overlook, yet one of the most effective at nudging a run off course step by step. Pin the gradient clear at the start of every optimizer step, merge the losses into a single backward, resist reaching for retain_graph out of habit, and pair all of that with a little gradient monitoring and memory watching, and this class of bug is essentially solved once and for all.
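As a concrete form of that memory watching, here is a hedged sketch (the helper name log_cuda_memory, the log_every interval, and its placement right after opt.step() are assumptions for illustration; torch.cuda.memory_allocated and torch.cuda.memory_reserved are the actual PyTorch calls):

import torch

def log_cuda_memory(step, log_every=25):
    # CUDA only; on a CPU run this simply prints nothing.
    if torch.cuda.is_available() and step % log_every == 0:
        alloc = torch.cuda.memory_allocated() / 2**20     # tensors currently allocated, in MiB
        reserved = torch.cuda.memory_reserved() / 2**20   # memory held by the caching allocator, in MiB
        print(f"step={step:04d} cuda_alloc={alloc:.1f}MiB cuda_reserved={reserved:.1f}MiB")

If allocated memory keeps ratcheting up step after step instead of flattening out after the first few iterations, something, typically a retained graph or a stored loss tensor that still carries its history, is outliving the step it belongs to.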
Original-content statement: this article is published on the Tencent Cloud Developer Community with the author's authorization and may not be reproduced without permission.
If you believe it infringes your rights, please contact cloudcommunity@tencent.com to have it removed.