权重衰减

在损失函数中增加L2范数惩罚项,使参数w的取值足够小,从而防止过拟合。优化目标变为 \(L(\mathbf{w}, b) + \frac{\lambda}{2}\|\mathbf{w}\|^2\)

Dropout

Net定义

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
def dropout_layer(x, dropout):
    """Apply inverted dropout to ``x``.

    Each element is zeroed independently with probability ``dropout``; the
    surviving elements are divided by ``1 - dropout``.

    Args:
        x: Input tensor.
        dropout: Drop probability, in the closed interval [0, 1].

    Returns:
        A tensor with the same shape as ``x``.

    Raises:
        AssertionError: If ``dropout`` lies outside [0, 1].
    """
    # Fail fast on an invalid probability.
    assert 0 <= dropout <= 1
    # Shortcuts that need no random mask; dropout == 1 would otherwise
    # divide by zero below.
    if dropout == 1:
        return torch.zeros_like(x)
    if dropout == 0:
        return x
    # Draw the mask on x's own device: a mask created on the default device
    # cannot be multiplied with an input that lives elsewhere.
    mask = (torch.rand(x.shape, device=x.device) > dropout).float()
    return mask * x / (1 - dropout)

class Net(nn.Module):
    """Multilayer perceptron with two hidden layers and hand-written dropout.

    The forward pass is Linear -> ReLU -> dropout -> Linear -> ReLU ->
    dropout -> Linear. Dropout is delegated to the module-level
    ``dropout_layer`` function, which must be defined before an instance is
    created.

    Dropout is applied only when BOTH the ``is_training`` flag and the
    standard ``nn.Module.training`` flag are true, so either
    ``set_is_training(False)`` or ``net.eval()`` disables it.
    """

    def __init__(self, input_num, hidden1_num, hidden2_num, output_num, dropout_odds, is_training=True) -> None:
        """Build the layers.

        Args:
            input_num: Number of input features per sample.
            hidden1_num: Width of the first hidden layer.
            hidden2_num: Width of the second hidden layer.
            output_num: Number of outputs.
            dropout_odds: Drop probability handed to ``dropout_layer`` after
                each hidden layer.
            is_training: Initial value of the manual dropout switch.
        """
        super().__init__()
        self.input_num = input_num
        self.dropout_odds = dropout_odds
        self.is_training = is_training
        self.flatten = nn.Flatten()
        self.line1 = nn.Linear(input_num, hidden1_num)
        self.line2 = nn.Linear(hidden1_num, hidden2_num)
        self.line3 = nn.Linear(hidden2_num, output_num)
        self.relu = nn.ReLU()
        self.dropout_layer = dropout_layer

    def forward(self, x):
        """Return the network output for the batch ``x``.

        ``x`` is flattened and reshaped to ``(-1, input_num)`` before the
        first linear layer.
        """
        # Honour the module's own train/eval mode as well as the manual flag;
        # otherwise net.eval() would leave dropout switched on.
        use_dropout = self.is_training and self.training

        x = self.flatten(x)
        h1 = self.relu(self.line1(x.reshape((-1, self.input_num))))
        if use_dropout:
            h1 = self.dropout_layer(h1, self.dropout_odds)
        h2 = self.relu(self.line2(h1))
        if use_dropout:
            h2 = self.dropout_layer(h2, self.dropout_odds)
        return self.line3(h2)

    def set_is_training(self, is_training):
        """Set the manual dropout switch (kept for existing callers)."""
        self.is_training = is_training

多项式拟合

生成多项式数据
先随机生成0-1之间均匀分布的数据x(因为选择的激活函数为ReLU,所以希望y不出现负数),然后据此产生多项式特征。由于使用的是多层感知机,预期效果会比线性模型好很多,所以加大了多项式的次数。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
import torch
from torch import nn
import numpy as np
import math
from torch.utils import data
from d2l.torch import Animator, Accumulator
import torchvision
from torchvision import transforms

def get_polynomial_data(w, size=1000, max_degree=20):
    """Generate a noisy synthetic polynomial regression data set.

    Samples ``x`` uniformly from [0, 1) and builds the feature matrix whose
    column ``i`` is ``x**i / i!`` for ``i = 0 .. max_degree - 1``. Labels are
    the features times the coefficient vector (``w`` padded with zeros up to
    ``max_degree``) plus Gaussian noise with standard deviation 0.1.

    Args:
        w: Sequence of true coefficients, lowest degree first.
        size: Number of samples to draw.
        max_degree: Number of polynomial feature columns.

    Returns:
        ``(features, labels)`` as float32 tensors of shape
        ``(size, max_degree)`` and ``(size, 1)``.

    Raises:
        ValueError: If ``w`` has more than ``max_degree`` entries.
    """
    if len(w) > max_degree:
        raise ValueError(
            f"got {len(w)} coefficients but max_degree is only {max_degree}"
        )

    true_w = np.zeros(max_degree)
    true_w[:len(w)] = w

    x = np.random.random(size=(size, 1))
    # gamma(i + 1) == i!; dividing by it keeps the high-degree columns from
    # being dominated by large coefficients.
    factorials = np.array([math.gamma(i + 1) for i in range(max_degree)])
    # Broadcasting gives column i = x**i / i!, i.e. degrees 0 .. max_degree-1.
    features = np.power(x, np.arange(max_degree)) / factorials

    noise = np.random.normal(scale=0.1, size=(size, 1))
    labels = features.dot(true_w.reshape(-1, 1)) + noise

    return (
        torch.tensor(features, dtype=torch.float32),
        torch.tensor(labels, dtype=torch.float32),
    )
# Eight true coefficients; get_polynomial_data pads the remaining degrees
# with zeros.
w = [5, 1.2, 3.4, 5.6, 1.2, 2.5, 6.3, 0.1]
# Two separate calls, so the training and test sets are independent draws.
polydata, label = get_polynomial_data(w)
test_data, test_label = get_polynomial_data(w)
train
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
def train(polydata, label, input_dim, test_data, test_label):
    """Fit a dropout MLP to the polynomial data and plot the loss curves.

    Only the first ``input_dim`` feature columns of ``polydata`` and
    ``test_data`` are used. After every epoch the mean training loss and the
    mean test loss are printed and added to the animator.

    Args:
        polydata: Training feature matrix.
        label: Training labels.
        input_dim: Number of leading feature columns fed to the network.
        test_data: Test feature matrix.
        test_label: Test labels.

    Returns:
        The trained ``Net`` instance.
    """
    def init_weights(m):
        # Check the module itself; comparing the builtin `type` to nn.Linear
        # is never true, so the initialisation would silently be skipped.
        if isinstance(m, nn.Linear):
            nn.init.normal_(m.weight, std=0.01)

    # lr=0.1 made the loss blow up at first.
    lr = 0.001
    num_epochs = 10
    batch_size = 100
    train_set = data.TensorDataset(polydata[:, :input_dim], label)
    train_iter = data.DataLoader(train_set, batch_size=batch_size, shuffle=True, num_workers=4)

    # Linear baseline for comparison: nn.Sequential(nn.Linear(input_dim, 1))
    net = Net(input_dim, 8, 8, 1, 0.5)
    net.apply(init_weights)
    loss = nn.MSELoss(reduction='none')
    updater = torch.optim.SGD(net.parameters(), lr=lr)
    animator = Animator(xlabel='epoch', xlim=[1, num_epochs], ylim=[0, 2], legend=['train loss', 'test loss'])

    test_x = test_data[:, :input_dim]
    for epoch in range(num_epochs):
        # metrics[0]: summed training loss, metrics[1]: number of samples.
        metrics = Accumulator(2)
        for x, y in train_iter:
            batch_loss = loss(net(x).reshape(y.shape), y)
            metrics.add(float(batch_loss.sum()), y.numel())
            updater.zero_grad()
            batch_loss.sum().backward()
            updater.step()

        # Evaluate with dropout switched off and without building a graph,
        # then restore training mode for the next epoch.
        net.set_is_training(False)
        with torch.no_grad():
            test_loss = float(loss(net(test_x), test_label).mean())
        net.set_is_training(True)

        print(metrics[0], metrics[1], test_loss)
        animator.add(epoch + 1, (metrics[0] / metrics[1], test_loss))

    return net

# Train on only the first 4 feature columns of the generated polynomial data.
net = train(polydata, label, 4, test_data, test_label)
分别使用 `net = Net(input_dim, 8, 8, 1, 0.5)`(带dropout的多层感知机)和 `net = nn.Sequential(nn.Linear(input_dim, 1))`(线性模型)进行训练对比:多层感知机的效果没有线性拟合好!

FashionMNIST

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
from d2l import torch as d2l
def train(num_epochs=10, batch_size=256, lr = 0.1):
    """Train the dropout MLP on FashionMNIST via ``d2l.train_ch3``.

    Downloads FashionMNIST into ``../data`` if necessary, builds a
    784-512-512-10 ``Net`` with dropout probability 0.5 and trains it with
    plain SGD.

    Args:
        num_epochs: Number of passes over the training set.
        batch_size: Mini-batch size for both data loaders.
        lr: SGD learning rate.

    Returns:
        The trained ``Net`` instance, so it can be reused for prediction.
    """
    to_tensor = transforms.ToTensor()
    mnist_train = torchvision.datasets.FashionMNIST(
        root='../data', train=True, transform=to_tensor, download=True
    )
    mnist_test = torchvision.datasets.FashionMNIST(
        root='../data', train=False, transform=to_tensor, download=True
    )
    train_iter = data.DataLoader(mnist_train, batch_size, shuffle=True, num_workers=4)
    # Evaluation does not depend on sample order, so the test set is not shuffled.
    test_iter = data.DataLoader(mnist_test, batch_size, shuffle=False, num_workers=4)

    net = Net(28*28, 512, 512, 10, 0.5)
    # NOTE(review): reduction='none' presumably matches what d2l.train_ch3
    # expects (per-sample losses) - confirm against the installed d2l version.
    loss = nn.CrossEntropyLoss(reduction='none')
    updater = torch.optim.SGD(net.parameters(), lr=lr)

    d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, updater)
    return net
# Run FashionMNIST training with the defaults (10 epochs, batch size 256, lr 0.1).
train()

对比[多层感知机及代码实现Pytorch d2l]中的图,可以看出过拟合的情况有所改善。而且这里的test acc是在is_training=True(即dropout仍然生效)的情况下得到的,在真实预测(关闭dropout)的情况下效果可能会更好。