Custom `Torch` utilities
with torch.no_grad():
    tst = nn.Linear(4, 5)
    tst.weight.data.uniform_(-1, 1)
    tst.bias.data.uniform_(-1, 1)
    tst = init_default(tst, func=lambda x: x.data.fill_(1.0))
    test_eq(tst.weight, torch.ones(5, 4))
    test_eq(tst.bias, torch.zeros(5))
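Reading the tests above, `init_default` appears to apply `func` to the module's weight and zero its bias when one exists. A minimal sketch of that behaviour, written here as a hypothetical helper rather than the library's actual code:

import torch.nn as nn

def init_default_sketch(m, func=nn.init.kaiming_normal_):
    # Hypothetical sketch, inferred from the tests above (not the library's code):
    # apply `func` to the weight and zero the bias, when they exist.
    if getattr(m, "weight", None) is not None:
        func(m.weight)
    if getattr(m, "bias", None) is not None:
        m.bias.data.fill_(0.0)
    return m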
with torch.no_grad():
    tst = nn.Linear(4, 5)
    tst.weight.data.uniform_(-1, 1)
    tst.bias.data.uniform_(-1, 1)
    cond_init(tst, func=lambda x: x.data.fill_(1.0))
    test_eq(tst.weight, torch.ones(5, 4))
    test_eq(tst.bias, torch.zeros(5))
tst = nn.BatchNorm2d(5)
init = [tst.weight.clone(), tst.bias.clone()]
cond_init(tst, func=lambda x: x.data.fill_(1.0))
test_eq(tst.weight, init[0])
test_eq(tst.bias, init[1])
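The `BatchNorm2d` weight and bias are left untouched, which suggests `cond_init` only initializes layers that are not normalization layers. A hedged sketch of that condition (a hypothetical helper, and an assumed list of norm types, not the library's implementation):

import torch.nn as nn

_norm_types = (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d, nn.GroupNorm, nn.LayerNorm)

def cond_init_sketch(m, func):
    # Hypothetical sketch: leave normalization layers at their default init,
    # apply `func` to the weight and zero the bias everywhere else.
    if isinstance(m, _norm_types):
        return
    if getattr(m, "weight", None) is not None:
        func(m.weight)
    if getattr(m, "bias", None) is not None:
        m.bias.data.fill_(0.0)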
tst = nn.Sequential(nn.Linear(4, 5), nn.Sequential(nn.Linear(4, 5), nn.Linear(4, 5)))
apply_leaf(tst, partial(init_default, func=lambda x: x.data.fill_(1.0)))
with torch.no_grad():
    for l in [tst[0], *tst[1]]:
        test_eq(l.weight, torch.ones(5, 4))
    for l in [tst[0], *tst[1]]:
        test_eq(l.bias, torch.zeros(5))
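`apply_leaf` walks the module tree so the initialization reaches the layers nested inside the inner `nn.Sequential`. For comparison, PyTorch's built-in `nn.Module.apply` does the same kind of recursive traversal; the snippet below reproduces the effect with an inline fill function (a sketch, not the library's `apply_leaf`):

import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(4, 5), nn.Sequential(nn.Linear(4, 5), nn.Linear(4, 5)))

def fill_ones_zero_bias(m):
    # Containers such as nn.Sequential have no weight, so they are skipped.
    if getattr(m, "weight", None) is not None:
        m.weight.data.fill_(1.0)
    if getattr(m, "bias", None) is not None:
        m.bias.data.fill_(0.0)

model.apply(fill_ones_zero_bias)  # recurses over every submodule, including nested ones
assert torch.equal(model[0].weight, torch.ones(5, 4))
assert torch.equal(model[1][1].bias, torch.zeros(5))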
tst = nn.Sequential(nn.Linear(4, 5), nn.Sequential(nn.Linear(4, 5), nn.BatchNorm1d(5)))
init = [tst[1][1].weight.clone(), tst[1][1].bias.clone()]
apply_init(tst, func=lambda x: x.data.fill_(1.0))
with torch.no_grad():
    for l in [tst[0], tst[1][0]]:
        test_eq(l.weight, torch.ones(5, 4))
    for l in [tst[0], tst[1][0]]:
        test_eq(l.bias, torch.zeros(5))
    test_eq(tst[1][1].weight, init[0])
    test_eq(tst[1][1].bias, init[1])
model = nn.Sequential(nn.Linear(4, 5), nn.BatchNorm1d(5), nn.Linear(5, 1))
We grab the first `BatchNorm` layer and store its running mean:
m = model[1].running_mean.clone()
After a forward pass in training mode, you can see that the running mean has changed:
i = torch.randn(32, 4)
o = model(i)
test_ne(m, model[1].running_mean.detach())
When we use the `set_bn_eval` function, the running statistics will not be changed during training:
model = nn.Sequential(nn.Linear(4, 5), nn.BatchNorm1d(5))
model.train()
m = model[1].running_mean.clone()
set_bn_eval(model)
i = torch.randn(32, 4)
o = model(i)
test_eq(m, model[1].running_mean.detach())
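One way to get this behaviour, shown here as a hypothetical sketch rather than the library's implementation, is to walk the module tree and switch only the batch-norm layers to eval mode, so their running statistics stop updating while the rest of the model keeps training:

import torch.nn as nn

def set_bn_eval_sketch(m: nn.Module) -> None:
    # Hypothetical sketch: only the layers that track running statistics
    # are put into eval mode; everything else stays in training mode.
    for module in m.modules():
        if isinstance(module, (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d)):
            module.eval()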
with torch.no_grad():
    m = nn.Linear(4, 5)
    test_eq(trainable_params(m), [m.weight, m.bias])
    m.weight.requires_grad_(False)
    test_eq(trainable_params(m), [m.bias])
    test_eq(params(m), [m.weight, m.bias])
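The difference between the two helpers, as the tests show, is that `params` returns every parameter while `trainable_params` keeps only those that still require gradients. A one-line sketch of the latter (hypothetical, inferred from the tests):

def trainable_params_sketch(m):
    # Hypothetical sketch: drop parameters frozen with requires_grad_(False).
    return [p for p in m.parameters() if p.requires_grad]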
output = torch.randn(10, 10)
t0 = torch.nn.functional.one_hot(torch.arange(0, 10) % 3, num_classes=10)
t1 = torch.arange(0, 10) % 3
o0 = maybe_convert_to_onehot(t0, output)
o1 = maybe_convert_to_onehot(t1, output)
test_eq(o0.shape, output.shape)
test_eq(o1.shape, output.shape)
test_eq(t0, o0)
We can see that `maybe_convert_to_onehot` converted `t1` to a one-hot encoded tensor but did not change `t0`, because it was already in one-hot encoded form/shape.
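A hedged sketch of the conversion rule this behaviour implies (a hypothetical helper, not the library's code): targets with one dimension fewer than the model output are treated as class indices and one-hot encoded, anything else passes through unchanged.

import torch
import torch.nn.functional as F

def maybe_convert_to_onehot_sketch(target, output):
    # Hypothetical sketch: class indices get one-hot encoded to the
    # output's number of classes; already-encoded targets pass through.
    if target.ndim == output.ndim - 1:
        target = F.one_hot(target.long(), num_classes=output.shape[-1])
    return target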
model = nn.Sequential(nn.Linear(4, 5), nn.BatchNorm1d(5))
param_list = [{"params": [p for p in model.parameters()]}]
param_list_dl, lrs = build_discriminative_lrs(param_list, lr_stop=1e-03)
assert len(param_list_dl) == 1
assert lrs == [1e-03]
assert param_list_dl[0]["lr"] == lrs[0]
model = nn.Sequential(nn.Linear(4, 5), nn.Linear(5, 10))
p1 = [{"params": [p for p in model[0].parameters()]}]
p2 = [{"params": [p for p in model[1].parameters()]}]
param_list = p1 + p2
param_list_dl, lrs = build_discriminative_lrs(param_list, lr_stop=1e-03)
assert len(param_list_dl) == 2
assert len(lrs) == 2
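The resulting groups can be handed straight to a PyTorch optimizer, since each dict already carries its own `lr` entry; for example (using `Adam` purely for illustration):

import torch

opt = torch.optim.Adam(param_list_dl)  # per-group "lr" entries override Adam's default
print([g["lr"] for g in opt.param_groups])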