全代碼
純手擼一個識別mnist手寫數據集的2層DNN網絡,所有庫函數的底層NumPy代碼都已給出,這串代碼直接運行就能跑!不需要其他文件。
如果沒裝TensorFlow和matplotlib的童鞋可以在終端輸入 pip install tensorflow
和 pip install matplotlib
進行安裝。
import numpy as np
import matplotlib.pylab as plt
import tensorflow as tf #引入tensorflow只是為了導(dǎo)入mnist數(shù)據(jù)集
# The large section below defines the helper functions.
def sigmoid(x):
    """Return the logistic sigmoid ``1 / (1 + exp(-x))`` of ``x``, element-wise.

    Args:
        x: Real scalar or array-like.

    Returns:
        A NumPy scalar for scalar input, otherwise an array shaped like ``x``.
    """
    x = np.asarray(x)
    # exp(-|x|) never exceeds 1, so unlike exp(-x) it cannot overflow for
    # large negative inputs.
    decay = np.exp(-np.abs(x))
    # Both branches are algebraically equal to 1 / (1 + exp(-x)).
    result = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with an empty tuple turns a 0-d array back into a scalar and
    # leaves arrays of higher rank as they are.
    return result[()]
def sigmoid_grad(x):
    """Return the derivative of the sigmoid at ``x``.

    With ``s = sigmoid(x)`` the derivative is ``(1 - s) * s``.
    """
    s = sigmoid(x)
    return (1.0 - s) * s
def relu(x):
    """Return the rectified linear unit of ``x``: the element-wise maximum of 0 and ``x``."""
    floor = 0
    return np.maximum(floor, x)
def relu_grad(x):
    """Return the ReLU derivative used by this file: 1 where ``x >= 0``, else 0.

    The result is an integer array; note that ``x == 0`` is assigned slope 1.
    """
    non_negative = x >= 0
    return np.where(non_negative, 1, 0)
def softmax(x):
    """Return the softmax of ``x`` computed along its last axis.

    A 1-D input is treated as a single score vector; for inputs of rank 2 or
    higher every vector along the last axis is normalised independently.

    Args:
        x: Array of scores.

    Returns:
        Array with the same shape as ``x`` whose entries along the last axis
        are non-negative and sum to 1.
    """
    x = np.asarray(x)
    # A 0-d input has no axis to reduce over; fall back to a full reduction.
    axis = -1 if x.ndim > 0 else None
    # Subtracting the maximum leaves the result unchanged mathematically but
    # keeps exp() from overflowing.
    shifted = x - np.max(x, axis=axis, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=axis, keepdims=True)
def mean_squared_error(y, t):
    """Return half the sum of squared differences between ``y`` and ``t``."""
    diff = y - t
    return 0.5 * np.sum(np.square(diff))
def cross_entropy_error(y, t):
    """Return the mean cross-entropy between predictions ``y`` and labels ``t``.

    Args:
        y: Predicted probabilities, either one vector or a 2-D batch of rows.
        t: Labels, either one-hot encoded (same size as ``y``) or class indices.

    Returns:
        The negative log-probability of the correct class, averaged over rows.
    """
    # Promote a single sample to a batch holding one row.
    if y.ndim == 1:
        t = t.reshape(1, -1)
        y = y.reshape(1, -1)

    # One-hot labels have as many entries as y; reduce them to class indices.
    if t.size == y.size:
        t = t.argmax(axis=1)

    n_rows = y.shape[0]
    picked = y[np.arange(n_rows), t]
    # The small constant keeps log() finite when a probability is exactly 0.
    return -np.sum(np.log(picked + 1e-7)) / n_rows
def softmax_loss(X, t):
    """Return the cross-entropy error of ``softmax(X)`` against the labels ``t``."""
    return cross_entropy_error(softmax(X), t)
def numerical_gradient(f, x, h=1e-4):
    """Estimate the gradient of ``f`` at ``x`` with central differences.

    Each element of ``x`` is perturbed in place by ``+h`` and ``-h``, ``f`` is
    evaluated for both, and the element is then restored.

    Args:
        f: Callable taking ``x`` and returning a scalar.
        x: Array to differentiate with respect to. It should have a
            floating-point dtype; with an integer dtype the ``h`` perturbation
            is truncated when it is written back into the array.
        h: Step size of the finite difference.

    Returns:
        Array shaped like ``x`` holding ``(f(x + h) - f(x - h)) / (2 * h)``
        for every element.
    """
    grad = np.zeros_like(x)

    # The iterator is used only to enumerate every index of x.
    it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
    while not it.finished:
        idx = it.multi_index
        original = x[idx]
        try:
            x[idx] = float(original) + h
            f_plus = f(x)   # f(x + h)
            x[idx] = float(original) - h
            f_minus = f(x)  # f(x - h)
        finally:
            # Restore the element even when f raises, so x is never left perturbed.
            x[idx] = original
        grad[idx] = (f_plus - f_minus) / (2 * h)
        it.iternext()

    return grad
class TwoLayerNet:
    """Two-layer fully connected classifier: affine -> ReLU -> affine -> softmax.

    The learnable parameters live in ``self.params`` under the keys ``'W1'``,
    ``'b1'`` (first layer) and ``'W2'``, ``'b2'`` (second layer). Labels ``t``
    may be passed either one-hot encoded or as a 1-D array of class indices.
    """

    def __init__(self, input_size, hidden_size, output_size, weight_init_std=0.01):
        """Create the parameters.

        Args:
            input_size: Number of input features.
            hidden_size: Number of hidden units.
            output_size: Number of output classes.
            weight_init_std: Scale applied to the standard-normal initial weights.
        """
        # Weights start as scaled Gaussian noise, biases as zeros.
        self.params = {}
        self.params['W1'] = weight_init_std * np.random.randn(input_size, hidden_size)
        self.params['b1'] = np.zeros(hidden_size)
        self.params['W2'] = weight_init_std * np.random.randn(hidden_size, output_size)
        self.params['b2'] = np.zeros(output_size)

    def _forward(self, x):
        """Run the forward pass and return ``(a1, z1, y)``.

        ``a1`` is the hidden pre-activation, ``z1`` the hidden activation and
        ``y`` the softmax output.
        """
        W1, W2 = self.params['W1'], self.params['W2']
        b1, b2 = self.params['b1'], self.params['b2']

        a1 = np.dot(x, W1) + b1
        # To experiment with a sigmoid hidden layer, use sigmoid here and
        # sigmoid_grad in gradient().
        z1 = relu(a1)
        a2 = np.dot(z1, W2) + b2
        return a1, z1, softmax(a2)

    def predict(self, x):
        """Return the softmax class probabilities for the inputs ``x``."""
        return self._forward(x)[2]

    def loss(self, x, t):
        """Return the cross-entropy error for inputs ``x`` and labels ``t``."""
        return cross_entropy_error(self.predict(x), t)

    def accuracy(self, x, t):
        """Return the fraction of rows of ``x`` whose predicted class matches ``t``."""
        predicted = np.argmax(self.predict(x), axis=1)
        # One-hot labels are reduced to class indices; index labels are used as is.
        if t.ndim != 1:
            t = np.argmax(t, axis=1)
        return np.sum(predicted == t) / float(x.shape[0])

    def numerical_gradient(self, x, t):
        """Return finite-difference gradients of the loss for every parameter.

        The module-level ``numerical_gradient`` perturbs each parameter array
        in place, which is why the loss closure can ignore its argument.
        """
        loss_W = lambda W: self.loss(x, t)

        return {
            key: numerical_gradient(loss_W, self.params[key])
            for key in ('W1', 'b1', 'W2', 'b2')
        }

    def gradient(self, x, t):
        """Return backpropagated gradients of the loss for every parameter.

        Args:
            x: 2-D batch of inputs, one sample per row.
            t: Labels for the batch, one-hot encoded or class indices.

        Returns:
            Dict with the keys ``'W1'``, ``'b1'``, ``'W2'``, ``'b2'``.
        """
        W2 = self.params['W2']
        batch_num = x.shape[0]

        # forward
        a1, z1, y = self._forward(x)

        # backward: gradient of the mean softmax cross-entropy w.r.t. the scores
        if t.size == y.size:
            dy = (y - t) / batch_num
        else:
            # Index labels: subtract 1 from the probability of the true class.
            dy = y.copy()
            dy[np.arange(batch_num), t] -= 1
            dy = dy / batch_num

        grads = {}
        grads['W2'] = np.dot(z1.T, dy)
        grads['b2'] = np.sum(dy, axis=0)

        da1 = np.dot(dy, W2.T)
        dz1 = relu_grad(a1) * da1
        grads['W1'] = np.dot(x.T, dz1)
        grads['b1'] = np.sum(dz1, axis=0)

        return grads
def _change_one_hot_label(X):
T = np.zeros((X.size, 10))
for idx, row in enumerate(T):
row[X[idx]] = 1
return T
# Let's go: load MNIST, train the network, then plot loss and accuracy.

# Load the data
mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0  # normalize
x_train = x_train.reshape(-1, 784)  # flatten, (60000,28,28) -> (60000,784)
x_test = x_test.reshape(-1, 784)  # flatten, (10000,28,28) -> (10000,784)
# One-hot encode the labels so they have the same shape as the softmax
# output and can be subtracted from it when computing the error.
y_train = _change_one_hot_label(y_train)
y_test = _change_one_hot_label(y_test)

# Two-layer DNN (50 hidden units, 784*50*10) with ReLU activation (can be
# swapped for sigmoid), cross-entropy loss, softmax output and SGD updates.
network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

# Hyperparameters
iters_num = 10000
train_size = x_train.shape[0]
batch_size = 512
learning_rate = 0.05
train_loss_list = []
train_acc_list = []
test_acc_list = []
# Integer division is required here: with a fractional value the check
# `i % iter_per_epoch == 0` below would almost never be true.
iter_per_epoch = max(train_size // batch_size, 1)

# Training
for i in range(iters_num):
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    y_batch = y_train[batch_mask]

    # Gradient via backpropagation. The finite-difference alternative is
    # network.numerical_gradient(x_batch, y_batch).
    grad = network.gradient(x_batch, y_batch)

    # Parameter update
    for key in ('W1', 'b1', 'W2', 'b2'):
        network.params[key] -= learning_rate * grad[key]

    loss = network.loss(x_batch, y_batch)
    train_loss_list.append(loss)

    # Once per epoch, record and print the train and test accuracy
    if i % iter_per_epoch == 0:
        train_acc = network.accuracy(x_train, y_train)
        test_acc = network.accuracy(x_test, y_test)
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
        print(train_acc, test_acc)

# Plot the loss curve
plt.subplot(1, 2, 1)
plt.title('Loss Function Curve')  # figure title
plt.xlabel('Step')  # x-axis label
plt.ylabel('Loss')  # y-axis label
plt.plot(train_loss_list, label="$Loss$")  # loss per step, legend entry "Loss"
plt.legend()  # draw the legend

# Plot the accuracy curves
plt.subplot(1, 2, 2)
plt.title('Acc Curve')  # figure title
plt.xlabel('Epoch')  # x-axis label
plt.ylabel('Acc')  # y-axis label
plt.plot(train_acc_list, label="$train_{acc}$")  # train accuracy per epoch
plt.plot(test_acc_list, label="$test_{acc}$")  # test accuracy per epoch
plt.legend()
plt.show()
總結
簡單的兩層網絡(W個數:784*50+50*10,b個數:50+10),就能實現95%的準確率,且沒有過擬合。
batch_size調大一點loss就不會這麼震盪,訓練周期長一點acc會更大,學習率越大訓練越快,但太大會跑飛,都可以調來玩玩。
上面的激活函數是選了relu,可自己改成sigmoid,代碼裡relu換成sigmoid(relu_grad也要換成sigmoid_grad)就行,事實證明是relu好一點。
上面的優化器是SGD(隨機梯度下降),還有Momentum、AdaGrad、Adam等等,一般用Adam會有更好效果。
所以可以總結神經網絡學習全貌:
前提
神經網絡存在合適的權重和偏置,調整權重和偏置以便擬合訓練數據的
過程稱為“學習”。神經網絡的學習分成下面5個步驟。
步驟1(mini-batch)
從訓練數據中隨機選出一部分數據,這部分數據稱為mini-batch。我們
的目標是減小mini-batch的損失函數的值。
步驟2(計算梯度)
為了減小mini-batch的損失函數的值,需要求出各個權重參數的梯度。
梯度的反方向表示損失函數的值減小最多的方向。
步驟3(更新參數)
將權重參數沿梯度的反方向進行微小更新。
步驟4(算誤差、精度)
每次循環都算一下誤差,若到一次epoch,算一下精度。
步驟5(重複)
重複步驟1、步驟2、步驟3、步驟4。
更多深度學習入門內容可以看看這篇哦《一文極速理解深度學習》。