Some of the explanation is in hand-written notes.

## Design

### Architecture diagram
Building on the linear regression model, we add a hidden layer: for input \(X(batch\_size, n)\), the hidden layer is \(H(batch\_size, m) = X(batch\_size, n) \cdot W_1(n, m) + B_1(1, m)\), where \(B_1\) is added via broadcasting.
\(O(batch\_size, k) = H(batch\_size, m) \cdot W_2(m, k) + B_2(1, k)\), where \(B_2\) is added via broadcasting.
The hidden layer needs an activation function. Why?
Without an activation function \(f\), \(O = (XW_1 + B_1)W_2 + B_2 = XW_1W_2 + B_1W_2 + B_2 = XW + B\) with \(W = W_1W_2\) and \(B = B_1W_2 + B_2\), so the network is equivalent to plain linear regression. With an activation function, \(O = f(XW_1 + B_1)W_2 + B_2\), which can no longer be collapsed into a single affine map, eliminating the problem.
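To make the collapse concrete, here is a minimal numerical sketch (my addition; the shapes are arbitrary, and \(B_1\), \(B_2\) are broadcast exactly as described above):

```python
import torch

# arbitrary shapes: batch_size=4, n=5, m=3, k=2
X = torch.randn(4, 5)
W1, B1 = torch.randn(5, 3), torch.randn(1, 3)  # B1 broadcasts over the batch
W2, B2 = torch.randn(3, 2), torch.randn(1, 2)

# two stacked linear layers with no activation...
O_stacked = (X @ W1 + B1) @ W2 + B2
# ...collapse into a single affine map: W = W1 @ W2, B = B1 @ W2 + B2
O_single = X @ (W1 @ W2) + (B1 @ W2 + B2)
print(torch.allclose(O_stacked, O_single, atol=1e-5))  # True (up to float error)

# with a nonlinearity in between, no such collapse exists
O_mlp = torch.relu(X @ W1 + B1) @ W2 + B2
print(torch.allclose(O_mlp, O_single, atol=1e-5))  # False in general
```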

## Activation functions

Common activation functions:

### ReLU

\(ReLU(x) = \max(0, x)\)

What it does: pass

Variant: \(pReLU(x) = \max(0, x) + \alpha \min(0, x)\)

### sigmoid

\(sigmoid(x) = \frac{1}{1+\exp(-x)}\)

### tanh

\(tanh(x) = \frac{1-\exp(-2x)}{1+\exp(-2x)}\)
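As a quick sanity check on these formulas, here is a minimal sketch (my addition, not from the original notes) that implements each definition directly and compares it against the torch built-ins:

```python
import torch

def relu(x):
    # elementwise max(0, x)
    return torch.maximum(torch.zeros_like(x), x)

def sigmoid(x):
    return 1 / (1 + torch.exp(-x))

def tanh(x):
    return (1 - torch.exp(-2 * x)) / (1 + torch.exp(-2 * x))

x = torch.linspace(-5, 5, 101)
print(torch.allclose(relu(x), torch.relu(x)))        # True
print(torch.allclose(sigmoid(x), torch.sigmoid(x)))  # True
print(torch.allclose(tanh(x), torch.tanh(x)))        # True
```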

```python
import torch
import matplotlib.pyplot as plt

# plot relu, sigmoid, tanh, and softmax side by side
x = torch.arange(-10, 10, 0.1)
fig, axes = plt.subplots(1, 4, figsize=(16, 2))
axes = axes.flatten()
axes[0].plot(x, torch.relu(x))
axes[1].plot(x, torch.sigmoid(x))
axes[2].plot(x, torch.tanh(x))
axes[3].plot(x, torch.softmax(x, dim=-1))
plt.show()
```
(Figure: the four activation curves produced by the code above.)

## d2l

### Implementation from scratch

```python
import torch
import torchvision
from torchvision import transforms
from torch.utils import data
import matplotlib.pyplot as plt

mnist_train = torchvision.datasets.FashionMNIST(
    root='../data', train=True, transform=transforms.ToTensor(), download=True
)
mnist_test = torchvision.datasets.FashionMNIST(
    root='../data', train=False, transform=transforms.ToTensor(), download=True
)


# forward pass
def softmax(x):
    # note: exponentiates raw logits, so this is not numerically stable
    x_exp = torch.exp(x)
    return x_exp / x_exp.sum(1, keepdim=True)


def net(x, w1, b1, w2, b2):
    # x: (batch_size, 28*28), w1: (28*28, 256), w2: (256, 10)
    h = torch.matmul(x.reshape((-1, w1.shape[0])), w1) + b1
    # h: (batch_size, 256); ReLU is the hidden-layer activation
    y = torch.matmul(torch.relu(h), w2) + b2
    return softmax(y)


# loss: negative log of the predicted probability of the true class
def cross_entropy(y, y_hat):
    return -torch.log(y_hat[range(len(y_hat)), y])


# minibatch SGD
def sgd(params, lr, batch_size):
    with torch.no_grad():
        for param in params:
            param -= lr * param.grad / batch_size
            param.grad.zero_()


# accuracy over a data iterator
def ac(w1, b1, w2, b2, data_iter, net):
    num_acs = []
    with torch.no_grad():  # no gradients needed for evaluation
        for x, y in data_iter:
            y_hat = net(x, w1, b1, w2, b2)
            maxs, indexs = torch.max(y_hat, dim=1)  # indexs: predicted class
            num_acs.append(y.eq(indexs).sum() / indexs.shape[0])
    return sum(num_acs) / len(num_acs)


# data
batch_size = 256
train_iter = data.DataLoader(
    mnist_train, batch_size, shuffle=True, num_workers=4)
test_iter = data.DataLoader(
    mnist_test, batch_size, shuffle=True, num_workers=4)

# hyperparameters
lr = 0.1
num_epochs = 10
loss = cross_entropy

# parameters
num_input = 28 * 28
num_hidden = 256
num_output = 10
w1 = torch.normal(0, 0.1, (num_input, num_hidden), requires_grad=True)
b1 = torch.zeros(num_hidden, requires_grad=True)
w2 = torch.normal(0, 0.1, (num_hidden, num_output), requires_grad=True)
b2 = torch.zeros(num_output, requires_grad=True)

if __name__ == '__main__':
    train_acs = []
    test_acs = []
    losses = []
    for i in range(num_epochs):
        for x, y in train_iter:
            y_hat = net(x, w1, b1, w2, b2)
            l = loss(y, y_hat)
            l.sum().backward()
            sgd([w1, b1, w2, b2], lr, batch_size)

        train_ac = ac(w1, b1, w2, b2, train_iter, net)
        test_ac = ac(w1, b1, w2, b2, test_iter, net)
        train_acs.append(train_ac)
        test_acs.append(test_ac)
        losses.append(l.sum().detach().numpy())  # loss of the epoch's last batch
        print('epoch:{}, train iter ac:{}, test iter ac:{}'.format(
            i, train_ac, test_ac))
    fig, axes = plt.subplots(1, 2, figsize=(8, 2))
    axes = axes.flatten()
    axes[0].plot(range(num_epochs), losses)
    axes[1].plot(range(num_epochs), train_acs, label='train data')
    axes[1].plot(range(num_epochs), test_acs, label='test data')
    axes[1].legend()
    plt.show()
```

(Figure: loss and accuracy curves.) The accuracy is still not high enough!
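One caveat in the from-scratch version: `softmax` exponentiates the raw logits, which can overflow for large values. A common fix (my addition, not in the original notes) is to subtract the row-wise maximum first; the result is unchanged because \(\frac{e^{x_i - c}}{\sum_j e^{x_j - c}} = \frac{e^{x_i}}{\sum_j e^{x_j}}\):

```python
import torch

def stable_softmax(x):
    # subtracting the per-row max does not change the result,
    # but keeps exp() from overflowing
    x = x - x.max(dim=1, keepdim=True).values
    x_exp = torch.exp(x)
    return x_exp / x_exp.sum(1, keepdim=True)

logits = torch.tensor([[1000.0, 1001.0], [0.0, 1.0]])
print(stable_softmax(logits))  # well-defined, no NaN/inf
```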

### Concise implementation

```python
import torch
import torchvision
from torchvision import transforms
from torch.utils import data
import matplotlib.pyplot as plt
from torch import nn


mnist_train = torchvision.datasets.FashionMNIST(
    root='../data', train=True, transform=transforms.ToTensor(), download=True
)
mnist_test = torchvision.datasets.FashionMNIST(
    root='../data', train=False, transform=transforms.ToTensor(), download=True
)


def init_weights(m):
    # bug fix: the original tested `type == nn.Linear`, which is always False
    if type(m) == nn.Linear:
        nn.init.normal_(m.weight, std=0.01)


# accuracy over a data iterator
def ac(data_iter, net):
    num_acs = []
    with torch.no_grad():  # no gradients needed for evaluation
        for x, y in data_iter:
            y_hat = net(x)
            maxs, indexs = torch.max(y_hat, dim=1)
            num_acs.append(y.eq(indexs).sum() / indexs.shape[0])
    return sum(num_acs) / len(num_acs)


# hyperparameters
batch_size = 256
num_epochs = 10
lr = 0.1
train_iter = data.DataLoader(mnist_train, batch_size, shuffle=True, num_workers=4)
test_iter = data.DataLoader(mnist_test, batch_size, shuffle=True, num_workers=4)
net = nn.Sequential(nn.Flatten(), nn.Linear(28*28, 256), nn.ReLU(), nn.Linear(256, 10))


net.apply(init_weights)
# CrossEntropyLoss applies softmax internally, so the net outputs raw logits
loss = nn.CrossEntropyLoss()
trainer = torch.optim.SGD(net.parameters(), lr=lr)

# train
if __name__ == '__main__':
    train_acs = []
    test_acs = []
    loss_epochs = []
    for i in range(num_epochs):
        loss_epoch = []
        for x, y in train_iter:
            y_hat = net(x)
            l = loss(y_hat, y)
            trainer.zero_grad()
            l.backward()
            trainer.step()

            loss_epoch.append(l.detach().numpy())

        train_ac = ac(train_iter, net)
        test_ac = ac(test_iter, net)
        train_acs.append(train_ac)
        test_acs.append(test_ac)
        loss_epoch = sum(loss_epoch) / len(loss_epoch)
        loss_epochs.append(loss_epoch)
        print('epoch:{}, train iter accuracy:{}, test iter accuracy:{}, loss:{}'.format(
            i, train_ac, test_ac, loss_epoch))
    fig, axes = plt.subplots(1, 2, figsize=(8, 2))
    axes = axes.flatten()
    axes[0].plot(range(num_epochs), loss_epochs)
    axes[1].plot(range(num_epochs), train_acs, label='train data')
    axes[1].plot(range(num_epochs), test_acs, label='test data')
    axes[1].legend()
    plt.show()
```
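The same architecture can also be written as a custom `nn.Module` subclass, which scales better once the forward pass is more than a straight pipeline. A minimal sketch (the class name `MLP` is my choice):

```python
import torch
from torch import nn

class MLP(nn.Module):
    def __init__(self, num_input=28*28, num_hidden=256, num_output=10):
        super().__init__()
        self.flatten = nn.Flatten()
        self.hidden = nn.Linear(num_input, num_hidden)
        self.out = nn.Linear(num_hidden, num_output)

    def forward(self, x):
        h = torch.relu(self.hidden(self.flatten(x)))
        return self.out(h)  # raw logits; CrossEntropyLoss handles softmax

net = MLP()
print(net(torch.zeros(2, 1, 28, 28)).shape)  # torch.Size([2, 10])
```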

The curves have already plateaued, so increasing num_epochs would probably not help much!

## Overfitting

With a small training set, overfitting can occur: the model may memorize the training set rather than fit it. Two remedies: 1. reduce the feature dimension; 2. regularization, which pushes parameter values toward smaller magnitudes.

## Polynomial fitting example
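This section is empty in the original notes; as a placeholder, here is a minimal sketch of the classic experiment (the cubic ground truth, the uniform inputs, and the sample sizes are all my assumptions): fit a deliberately small dataset with a matched-degree polynomial and with a much higher-degree one, then compare train and test loss.

```python
import torch
from torch import nn

torch.manual_seed(0)

# assumed ground truth: cubic polynomial plus Gaussian noise
true_w = torch.tensor([5.0, 1.2, -3.4, 5.6])  # coefficients of 1, x, x^2, x^3

def poly_features(x, degree):
    # columns 1, x, x^2, ..., x^degree
    return torch.cat([x ** i for i in range(degree + 1)], dim=1)

def make_data(n):
    x = torch.rand(n, 1) * 2 - 1  # uniform on [-1, 1] keeps high powers bounded
    y = poly_features(x, 3) @ true_w + 0.1 * torch.randn(n)
    return x, y

def fit(degree, x_train, y_train, x_test, y_test, steps=1000):
    model = nn.Linear(degree + 1, 1, bias=False)  # the x^0 column acts as bias
    loss = nn.MSELoss()
    opt = torch.optim.Adam(model.parameters(), lr=0.01)
    for _ in range(steps):
        l = loss(model(poly_features(x_train, degree)).squeeze(-1), y_train)
        opt.zero_grad()
        l.backward()
        opt.step()
    with torch.no_grad():
        test_l = loss(model(poly_features(x_test, degree)).squeeze(-1), y_test)
    return l.item(), test_l.item()

x_train, y_train = make_data(20)   # deliberately tiny training set
x_test, y_test = make_data(200)
for degree in (3, 20):
    train_l, test_l = fit(degree, x_train, y_train, x_test, y_test)
    print(f'degree {degree}: train loss {train_l:.4f}, test loss {test_l:.4f}')
# with only 20 samples, the degree-20 model has enough capacity to chase the
# noise, so its test loss is typically worse than the degree-3 model's
```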

## Weight decay

pass
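This section is still a stub ("pass" in the original). As a pointer for when it gets filled in: in PyTorch, L2 weight decay can be enabled directly on the optimizer, e.g.:

```python
import torch
from torch import nn

net = nn.Sequential(nn.Flatten(), nn.Linear(28*28, 256), nn.ReLU(), nn.Linear(256, 10))
# weight_decay=lambda adds lambda * w to every gradient step,
# which is equivalent to an L2 penalty on the weights
trainer = torch.optim.SGD(net.parameters(), lr=0.1, weight_decay=0.001)
```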