使用numpy构建多层感知机

本文公式较多,由于简书不支持公式渲染,公式完整版请移步个人博客

import numpy as np

目标

使用numpy实现多层感知机的正向和反向传播

层次构建

全连接层

正向传播

正向传播的公式为:$Y = f(W \times X + b)$,其中,Y为输出,W为权值,b为偏置

反向传播

对于反向传播,已知上一层传回的梯度为dY,对应的反向传播公式为:
$$dX = (W^{T} \times dY) \cdot f'(Y)$$
$$dW = \cfrac{1}{m} dY \times X^{T}$$
$$db = \cfrac{1}{m} \sum dY$$

代码实现

class numpy_fc(object):
    """Fully connected layer computing ``weight . data + bias``.

    Samples are stored as columns: the input is ``(in_channel, batch)`` and
    the output is ``(out_channel, batch)``. ``forward`` caches its input so
    that ``backward`` can build the parameter gradients.
    """

    def __init__(self, in_channel, out_channel, optim):
        """Create the layer parameters.

        Args:
            in_channel: number of input features.
            out_channel: number of output features.
            optim: callable that maps a gradient array to the update which
                ``step`` adds to the matching parameter.
        """
        # Small random weights (scaled by 0.1), zero bias.
        initial_weight = np.random.randn(out_channel, in_channel) * 0.1
        self.weight = np.float64(initial_weight)
        self.bias = np.zeros((out_channel, 1), dtype=np.float64)
        self.optimizer = optim
        # Caches filled by forward/backward.
        self.in_data = np.zeros((1, in_channel))
        self.out_data = None
        self.weight_grad = None
        self.bias_grad = None

    def forward(self, data):
        """Return ``weight . data + bias`` and remember input and output."""
        self.in_data = data
        projected = np.dot(self.weight, data)
        self.out_data = projected + self.bias
        return self.out_data

    def backward(self, grad):
        """Store the parameter gradients and return the input gradient.

        Args:
            grad: gradient of the loss with respect to this layer's output.

        Returns:
            Gradient of the loss with respect to this layer's input.
        """
        input_grad = self.weight.T.dot(grad)
        self.weight_grad = np.dot(grad, self.in_data.T)
        # Sum over the batch axis, kept as a column to match the bias shape.
        per_unit_sum = np.sum(grad, axis=1)
        self.bias_grad = per_unit_sum.reshape(-1, 1)
        return input_grad

    def step(self):
        """Add the optimizer's update for each stored gradient in place."""
        weight_update = self.optimizer(self.weight_grad)
        self.weight += weight_update
        bias_update = self.optimizer(self.bias_grad)
        self.bias += bias_update

代码测试

# Shape check: a 16 -> 8 layer applied to a batch of 10 column vectors.
# The optimizer is None because step() is never called here.
test_fc = numpy_fc(16,8,None)
test_fc_forward = test_fc.forward(np.random.rand(16,10))
print(test_fc_forward.shape)
# The forward output is fed back in as a stand-in upstream gradient; only its
# (8, 10) shape matters for this check.
test_fc_back = test_fc.backward(test_fc_forward)
print(test_fc_back.shape)
# Each parameter gradient is printed next to its parameter for comparison.
print(test_fc.weight_grad.shape,test_fc.weight.shape)
print(test_fc.bias_grad.shape,test_fc.bias.shape)
(8, 10)
(16, 10)
(8, 16) (8, 16)
(8, 1) (8, 1)

激活函数

sigmoid函数

sigmoid函数是常用的二分类问题输出层激活函数,前向传播和反向传播分别如下所示:
$$ sigmoid(x) = \cfrac{1}{1 + e^{-x}}$$
$$ sigmoid'(x) = sigmoid(x) \cdot (1 - sigmoid(x))$$

class numpy_sigmoid(object):
    """Element-wise logistic sigmoid activation layer."""

    def __init__(self):
        # Output of the latest forward pass; backward derives the slope from it.
        self.result = None

    def forward(self, data):
        """Return ``1 / (1 + exp(-data))`` and cache it for ``backward``."""
        decay = np.exp(-data)
        self.result = 1 / (1 + decay)
        return self.result

    def backward(self, grad):
        """Scale ``grad`` by the sigmoid slope ``s * (1 - s)`` of the cached output."""
        activated = self.result
        complement = 1 - activated
        return grad * activated * complement

    def step(self):
        """Nothing to update: the layer has no parameters."""
        return None

relu函数

relu是现阶段最常用的隐层激活函数,前向传播和反向传播如下所示
$$relu(x) = \max\{0, x\}$$
$$
relu'(x)=
\begin{cases}
0 &\mbox{$relu(x) \leq 0$}\\
1 &\mbox{$relu(x) > 0$}
\end{cases}
$$

class numpy_relu(object):
    """Element-wise ReLU activation layer, ``max(0, x)``."""

    def __init__(self):
        # Output of the latest forward pass; backward derives its mask from it.
        self.result = None

    def forward(self, data):
        """Return ``max(data, 0)`` as a new array and cache it.

        The input is left untouched. Clipping it in place would also change
        the array the caller (for example the previous layer's cached output)
        still holds.
        """
        self.result = np.maximum(data, 0)
        return self.result

    def backward(self, grad):
        """Pass ``grad`` through where the forward output was positive.

        The derivative is 1 for positive outputs and 0 elsewhere. The cached
        forward output is only read, never overwritten.
        """
        slope = (self.result > 0).astype(self.result.dtype)
        return grad * slope

    def step(self):
        """Nothing to update: the layer has no parameters."""
        pass

其他组件构建

代价函数

MSE

MSE(均方误差,代码中的函数名沿用了 MES_loss 的拼写)代价函数的前向传播和反向传播为:
$$MSE(y_{pre},y) = \cfrac{1}{m} \sum ( y_{pre} - y )^2$$
$$\cfrac{dMSE}{dy_{pre}} = \cfrac{2}{m} ( y_{pre} - y )$$

def MES_loss(y_pre,y):
    """Squared-error loss and its gradient with respect to ``y_pre``.

    The loss is the plain sum of squared differences (no division by the
    number of samples), and the returned gradient matches that sum.

    Args:
        y_pre: predicted values.
        y: target values, broadcastable against ``y_pre``.

    Returns:
        ``(loss, loss_back)`` where ``loss`` is ``sum((y_pre - y) ** 2)`` and
        ``loss_back`` is ``2 * (y_pre - y)``.
    """
    diff = y_pre - y
    loss = np.sum(diff ** 2)
    # The derivative of diff**2 is 2*diff. Taking the absolute value would
    # drop the sign and push every prediction in the same direction.
    loss_back = 2 * diff
    return loss,loss_back

交叉熵

交叉熵的前向传播和反向传播分别为:
$$cross(y_{pre},y) = - \cfrac{1}{m} \sum^m_{i = 1}(y\log(y_{pre}) + (1-y)\log(1-y_{pre}))$$
$$\cfrac{dcross}{dy_{pre}} = -\cfrac{1}{m}(\cfrac{y}{y_{pre}} - \cfrac{1-y}{1-y_{pre}})$$

def Cross_loss(y_pre,y):
    """Binary cross-entropy loss and its gradient with respect to ``y_pre``.

    The loss is summed over all elements (no division by the number of
    samples), and the gradient matches that sum.

    Args:
        y_pre: predicted probabilities; must lie strictly between 0 and 1,
            otherwise the logarithms and divisions produce inf/nan.
        y: targets, broadcastable against ``y_pre``.

    Returns:
        ``(loss, loss_back)`` with
        ``loss = -sum(y*log(y_pre) + (1-y)*log(1-y_pre))`` and
        ``loss_back = -(y/y_pre - (1-y)/(1-y_pre))``.
    """
    loss = -np.sum(y*np.log(y_pre)+(1-y)*np.log(1-y_pre))
    # d/dy_pre of log(1 - y_pre) is -1/(1 - y_pre), so the second term is
    # subtracted, not added.
    loss_back = y/y_pre - (1-y)/(1-y_pre)
    return loss,-loss_back

带交叉熵的softmax函数

softmax函数是多分类问题常用的输出激活函数,一般与交叉熵代价函数结合使用,组合函数(softmax+交叉熵)的前向传播如下:
$$J(y_{pre},y) = - \sum_i y_i \log(softmax_i(y_{pre}))$$
$$softmax_i(x) = \cfrac{e^{x_i}}{\sum_j e^{x_j}}$$

反向传播如下:
$$ \cfrac{dJ(y_{pre},y)}{dy_{pre}} = softmax(y_{pre}) - y$$

详细推导可参见这里

def Softmax_cross_loss(y_pre,y):
    """Softmax followed by cross-entropy, with the gradient w.r.t. the scores.

    Args:
        y_pre: raw class scores of shape ``(batch, classes)``, one sample per
            row. This is the layout the training loop passes in.
        y: targets of the same shape (one-hot rows in the training loop).

    Returns:
        ``(loss, loss_back)`` where ``loss`` is ``-sum(y * log(softmax))``
        summed over the whole batch and ``loss_back`` is ``softmax - y``,
        with the same shape as ``y_pre``.
    """
    # Normalise over the class axis (axis=1) so each sample's probabilities
    # sum to 1. Subtracting the row maximum leaves softmax unchanged but
    # keeps exp() from overflowing on large scores.
    shifted = y_pre - np.max(y_pre, axis=1, keepdims=True)
    exp_scores = np.exp(shifted)
    normaliser = np.sum(exp_scores, axis=1, keepdims=True)
    softmax = exp_scores / normaliser
    # Take the log analytically so a probability that underflows to 0 does
    # not turn into log(0) = -inf.
    log_softmax = shifted - np.log(normaliser)
    loss = - np.sum(y * log_softmax)
    loss_back = softmax - y
    return loss,loss_back
# Smoke test on random 2x4 inputs. The targets here are random noise rather
# than one-hot labels, so the resulting loss value may be negative.
Softmax_cross_loss(np.random.randn(2,4),np.random.randn(2,4))
(-4.9084963417988003,
 array([[-0.09065384,  0.07506358,  0.32789286,  1.26735185],
        [ 1.93958915,  0.01316283,  1.20922904,  2.87550082]]))

优化器SGD

随机梯度下降优化器是一种比较简单的优化方法,优化公式如下:
$$W_{new} = W_{old} - learning\_rate \times \cfrac{dJ}{dW_{old}}$$

class optim_sgd(object):
    """Plain gradient-descent rule: the update is ``-learning_rate * grad``."""

    def __init__(self, learning_rate):
        """Remember the step size applied to every gradient."""
        super().__init__()
        self.learning_rate = learning_rate

    def __call__(self, grad):
        """Return the update a layer should add to its parameter."""
        negative_rate = -self.learning_rate
        return negative_rate * grad

其他组件

导入数据集——乳腺癌数据集

下载数据集

import re
import pandas as pd
# Location of the breast-cancer-wisconsin data file that read_csv fetches below.
data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data"
# Attribute listing; it is only used as source text for the column names.
data_label = """ 1. Sample code number            1id number
   2. Clump Thickness               1 - 10
   3. Uniformity of Cell Size       1 - 10
   4. Uniformity of Cell Shape      1 - 10
   5. Marginal Adhesion             1 - 10
   6. Single Epithelial Cell Size   1 - 10
   7. Bare Nuclei                   1 - 10
   8. Bland Chromatin               1 - 10
   9. Normal Nucleoli               1 - 10
  10. Mitoses                       1 - 10
  11. Class                         2 for benign, 4 for malignant)
"""
# Extract the attribute names: findall grabs every ". <name> ... <digit>" run,
# x[2:] drops the leading ". ", and the sub() removes each whitespace+digit
# tail. The first entry ends up as "Sample code numberid number", which is the
# column name shown by info() below.
data_label = [re.sub(r"\s+\d","",x[2:]) for x in re.findall(r"\. [\w\s]+\d",data_label)]
# print(data_label)
# The extracted names are passed to read_csv as the column labels.
data = pd.read_csv(data_url,names=data_label)
# data["Bare Nuclei"] = data["Bare Nuclei"].map(int)
print(data.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
Sample code numberid number    699 non-null int64
Clump Thickness                699 non-null int64
Uniformity of Cell Size        699 non-null int64
Uniformity of Cell Shape       699 non-null int64
Marginal Adhesion              699 non-null int64
Single Epithelial Cell Size    699 non-null int64
Bare Nuclei                    699 non-null object
Bland Chromatin                699 non-null int64
Normal Nucleoli                699 non-null int64
Mitoses                        699 non-null int64
Class                          699 non-null int64
dtypes: int64(10), object(1)
memory usage: 60.1+ KB
None

清洗数据集

# Turn every "?" entry into NaN so that dropna can discard the affected rows.
data = data.replace(to_replace="?",value=np.nan)
data = data.dropna(how='any')
# "Bare Nuclei" was loaded as an object column; convert its values to int.
data["Bare Nuclei"] = data["Bare Nuclei"].map(int)
print(data.info())
<class 'pandas.core.frame.DataFrame'>
Int64Index: 683 entries, 0 to 698
Data columns (total 11 columns):
Sample code numberid number    683 non-null int64
Clump Thickness                683 non-null int64
Uniformity of Cell Size        683 non-null int64
Uniformity of Cell Shape       683 non-null int64
Marginal Adhesion              683 non-null int64
Single Epithelial Cell Size    683 non-null int64
Bare Nuclei                    683 non-null int64
Bland Chromatin                683 non-null int64
Normal Nucleoli                683 non-null int64
Mitoses                        683 non-null int64
Class                          683 non-null int64
dtypes: int64(11)
memory usage: 64.0 KB
None

切分数据集

from sklearn.model_selection import train_test_split
# Features are the columns named by data_label[1:10] (the first column, the
# sample code number, is left out); the target is data_label[10], "Class".
# A quarter of the rows is held out for testing, with a fixed random_state.
x_train,x_test,y_train,y_test = train_test_split(data[data_label[1:10]],data[data_label[10]],test_size=0.25,random_state=1)
print(x_train.shape,x_test.shape)
print(y_train.shape)
# Class counts within the training split.
print(pd.value_counts(y_train))
(512, 9) (171, 9)
(512,)
2    333
4    179
Name: Class, dtype: int64

标准化

from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training features only, then apply that same fitted
# scaler to the test features.
ss = StandardScaler()
x_train_ss = ss.fit_transform(x_train)
x_test_ss = ss.transform(x_test)
# x_train_ss = x_train.values
# x_test_ss = x_test.values
# Print the type of the scaled training features.
print(type(x_train_ss))
# print(x_train[:5]/,x_train_ss[:5])
<class 'numpy.ndarray'>

def y_standard(data):
    """Halve ``data`` and subtract one, mapping the class codes 2 / 4 to 0 / 1."""
    halved = data / 2
    return halved - 1
# Convert the 2 / 4 class codes to 0 / 1 and keep only the underlying values.
y_train_ss = y_standard(y_train).values
y_test_ss = y_standard(y_test).values
# Class counts after the conversion, for both splits.
print(pd.value_counts(y_train_ss))
print(pd.value_counts(y_test_ss))
0.0    333
1.0    179
dtype: int64
0.0    111
1.0     60
dtype: int64

制作可迭代数据集

import random
def dataset(data,lable,batch_size=100,epoch=10):
    """Yield ``epoch`` random mini-batches of ``(data, lable)`` rows.

    Each batch draws ``batch_size`` row indices uniformly with replacement,
    so ``epoch`` counts yielded batches rather than full passes over the data.

    Args:
        data: array indexed by row along axis 0.
        lable: labels aligned with the rows of ``data``.
        batch_size: number of rows drawn per batch.
        epoch: number of batches to yield.
    """
    for _batch in range(epoch):
        last_row = data.shape[0] - 1
        picks = []
        for _draw in range(batch_size):
            picks.append(random.randint(0, last_row))
        yield data[picks], lable[picks]
# print(x_train_ss,type(y_train_ss))
# Take only the first batch and print the shapes of its data and labels.
for i in dataset(x_train_ss,y_train_ss,batch_size=100):
    print(i[0].shape,i[1].shape)
    break
(100, 9) (100,)

独热码编码

def onehot(data,tp_num):
    """One-hot encode class indices.

    Args:
        data: array of class indices; entry ``i`` is converted with ``int``.
        tp_num: number of classes, i.e. the width of the encoding.

    Returns:
        Float array of shape ``(data.shape[0], tp_num)`` with a single 1 per row.
    """
    encoded = np.zeros((data.shape[0], tp_num))
    # Each row is a view into `encoded`, so writing to it fills the result.
    for position, row in enumerate(encoded):
        row[int(data[position])] = 1
    return encoded
# Encoding the labels [0, 1] with two classes should give the 2x2 identity.
test_onehot = np.arange(2)
onehot(test_onehot,2)
array([[ 1.,  0.],
       [ 0.,  1.]])

网络

class numpy_network_base(object):
    """Sequential container for layers exposing ``forward``/``backward``/``step``."""

    def __init__(self, network_list):
        """Keep the layers in the order in which data flows through them."""
        self.network = network_list

    def forward(self, x):
        """Feed ``x`` through every layer in order and return the final output."""
        activation = x
        for layer in self.network:
            activation = layer.forward(activation)
        return activation

    def backward(self, grad):
        """Propagate ``grad`` from the last layer back to the first.

        A copy of ``grad`` is passed to the last layer, so the caller's array
        is not the object handed to the layers.
        """
        upstream = grad.copy()
        for layer in reversed(self.network):
            upstream = layer.backward(upstream)
        return upstream

    def step(self):
        """Ask every layer, in order, to apply its parameter update."""
        for layer in self.network:
            layer.step()

准确率计算

def accuracy(y_pre,lable):
    """Fraction of samples whose highest-scoring class equals the label.

    Args:
        y_pre: class scores with the classes along axis 0 (one column per sample).
        lable: true class indices, compared element-wise with the predictions.
    """
    predicted = np.argmax(y_pre, axis=0)
    hits = np.int8(predicted == lable)
    return np.mean(hits)
# Values grow down each column of `a`, so argmax over axis 0 is 3 everywhere;
# with every label equal to 3 the accuracy is 1.0.
a = np.arange(4*8).reshape((4,8))
b = np.ones((1,8)) * 3
accuracy(a,b)
1.0

网络训练与测试

网络搭建

# 9 input features -> 20 hidden units -> ReLU -> 2 class scores. Each fully
# connected layer gets its own SGD optimizer with learning rate 0.001.
network = numpy_network_base([numpy_fc(9,20,optim_sgd(0.001)),numpy_relu(),numpy_fc(20,2,optim_sgd(0.001))])
# dataset() yields one random batch per "epoch", so this runs 10 updates on
# batches of 100 samples.
for i,(din,lable) in enumerate(dataset(x_train_ss,y_train_ss,epoch=10,batch_size=100)):
#     print(din)
    # The layers take samples as columns, hence the transpose of the batch.
    result = network.forward(din.T)
#     print(result)
#     print(np.argmax(result,axis=0),lable)
    # The loss receives the scores as (batch, 2) together with one-hot labels.
    loss,grad = Softmax_cross_loss(result.T,onehot(lable,2))
#     print(loss)
#     print(pd.get_dummies(lable))
#     print(grad.shape)
    # Accuracy on the current batch, measured before the parameter update.
    print(accuracy(result,lable))
    # Transpose the gradient back to the layers' (classes, batch) layout.
    network.backward(grad.T)
    network.step()
0.19
0.32
0.86
0.96
0.94
0.93
0.9
0.96
0.98
0.95

网络测试

# Run the held-out test features through the network in a single forward pass
# (samples as columns) and print the accuracy against the 0 / 1 test labels.
result = network.forward(x_test_ss.T)
print(accuracy(result,y_test_ss))
0.982456140351

推荐阅读更多精彩内容