
# Recurrent Neural Networks

- A very simple recurrent neural network (RNN)
- Backpropagation through time (BPTT)
- The resilient backpropagation (Rprop) optimisation algorithm

# A linear recurrent neural network

```python
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.colors import LogNorm
```


# Define the dataset

The dataset consists of 20 binary input sequences of 10 timesteps each; the target of each sequence is the number of ones it contains, i.e. the sum of its entries.

```python
# Create dataset
nb_of_samples = 20
sequence_len = 10
# Create the sequences
X = np.zeros((nb_of_samples, sequence_len))
for row_idx in range(nb_of_samples):
    X[row_idx,:] = np.around(np.random.rand(sequence_len)).astype(int)
# Create the targets for each sequence
t = np.sum(X, axis=1)
```


# Training with backpropagation through time (BPTT)

### Computing the output of the RNN with the forward step

The `forward_states` function applies the `update_state` function to every timestep with a for loop. Because each step is vectorised over all input sequences, the samples can be processed in parallel. As with a regular neural network, the recurrence needs an initial value to start from; in this tutorial the initial state $s_0$ is set to 0.
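Concretely, the recurrence implemented by `update_state` (and noted in the code comment below) is

$$S_k = S_{k-1} \cdot w_{rec} + x_k \cdot w_x, \qquad S_0 = 0,$$

and the final state $S_n$ (the last column of `S`) serves as the network output $y$.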

```python
# Define the forward step functions
def update_state(xk, sk, wx, wRec):
    """
    Compute state k from the previous state (sk) and current input (xk),
    by use of the input weights (wx) and recursive weights (wRec).
    """
    return xk * wx + sk * wRec

def forward_states(X, wx, wRec):
    """
    Unfold the network and compute all state activations given the input X,
    input weights (wx), and recursive weights (wRec).
    Return the state activations in a matrix; the last column S[:,-1] contains
    the final activations.
    """
    # Initialise the matrix that holds all states for all input sequences.
    # The initial state s0 is set to 0.
    S = np.zeros((X.shape[0], X.shape[1]+1))
    # Use the recurrence relation defined by update_state to update the
    # states through time.
    for k in range(0, X.shape[1]):
        # S[k] = S[k-1] * wRec + X[k] * wx
        S[:,k+1] = update_state(X[:,k], S[:,k], wx, wRec)
    return S

def cost(y, t):
    """
    Return the MSE between the targets t and the outputs y.
    """
    return ((t - y)**2).sum() / nb_of_samples
```
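As a quick sanity check (this snippet is an addition, not part of the original code): with $w_x = w_{rec} = 1$ the recurrence reduces to a running sum, so the network reproduces the targets exactly and the cost is 0.

```python
# Sanity check (illustrative addition): wx = wRec = 1 computes the exact sum.
S = forward_states(X, 1.0, 1.0)
print(np.allclose(S[:,-1], t))  # True
print(cost(S[:,-1], t))         # 0.0
```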


### Computing the gradients with the backward step
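Applying the chain rule to the recurrence $S_k = S_{k-1} \cdot w_{rec} + x_k \cdot w_x$ gives the gradients that the backward step below accumulates. The gradient of the cost $\xi$ flows backwards through the states as

$$\frac{\partial \xi}{\partial S_{k-1}} = \frac{\partial \xi}{\partial S_k} \cdot w_{rec},$$

and the parameter gradients are summed over all timesteps:

$$\frac{\partial \xi}{\partial w_x} = \sum_{k=1}^{n} \frac{\partial \xi}{\partial S_k} \cdot x_k, \qquad \frac{\partial \xi}{\partial w_{rec}} = \sum_{k=1}^{n} \frac{\partial \xi}{\partial S_k} \cdot S_{k-1}.$$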

```python
def output_gradient(y, t):
    """
    Compute the gradient of the MSE cost function with respect to the output y.
    """
    return 2.0 * (y - t) / nb_of_samples

def backward_gradient(X, S, grad_out, wRec):
    """
    Backpropagate the gradient computed at the output (grad_out) through the network.
    Accumulate the parameter gradients for wx and wRec for each layer by addition.
    Return the parameter gradients as a tuple, and the gradients at the output of each layer.
    """
    # Initialise the array that stores the gradients of the cost with respect to the states.
    grad_over_time = np.zeros((X.shape[0], X.shape[1]+1))
    grad_over_time[:,-1] = grad_out
    # Set the gradient accumulations to 0
    wx_grad = 0
    wRec_grad = 0
    for k in range(X.shape[1], 0, -1):
        # Compute the parameter gradients and accumulate the results.
        wx_grad += np.sum(grad_over_time[:,k] * X[:,k-1])
        wRec_grad += np.sum(grad_over_time[:,k] * S[:,k-1])
        # Compute the gradient at the output of the previous layer
        grad_over_time[:,k-1] = grad_over_time[:,k] * wRec
    return (wx_grad, wRec_grad), grad_over_time
```
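The repeated multiplication by $w_{rec}$ in the backward pass is what makes these gradients unstable: $\partial \xi / \partial S_{n-k}$ scales as $w_{rec}^{\,k}$, exploding for $|w_{rec}| > 1$ and vanishing for $|w_{rec}| < 1$ (this is visualised in the plots further below). A minimal illustration of this, added here for clarity and not part of the original code:

```python
# Illustrative addition: compare the gradient at the earliest state for an
# exploding (wRec=2.0) and a vanishing (wRec=0.5) recurrence.
for wRec in [2.0, 0.5]:
    S = forward_states(X, 1.0, wRec)
    _, grad_over_time = backward_gradient(X, S, output_gradient(S[:,-1], t), wRec)
    print('wRec = {}: |gradient at S_0| up to {:.2e}'.format(
        wRec, np.abs(grad_over_time[:,0]).max()))
```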


### Gradient checking
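Each analytical gradient is checked against a numerical estimate computed with central differences:

$$\frac{\partial \xi}{\partial \theta} \approx \frac{\xi(\theta + \epsilon) - \xi(\theta - \epsilon)}{2\epsilon}.$$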

```python
# Perform gradient checking
# Set the weight parameters used during gradient checking
params = [1.2, 1.2]  # [wx, wRec]
# Set the small change to compute the numerical gradient
eps = 1e-7
# Compute the backprop gradients
S = forward_states(X, params[0], params[1])
grad_out = output_gradient(S[:,-1], t)
backprop_grads, grad_over_time = backward_gradient(X, S, grad_out, params[1])
# Compute the numerical gradient for each parameter in the layer
for p_idx, _ in enumerate(params):
    grad_backprop = backprop_grads[p_idx]
    # + eps
    params[p_idx] += eps
    plus_cost = cost(forward_states(X, params[0], params[1])[:,-1], t)
    # - eps
    params[p_idx] -= 2 * eps
    min_cost = cost(forward_states(X, params[0], params[1])[:,-1], t)
    # reset param value
    params[p_idx] += eps
    # Calculate the numerical gradient
    grad_num = (plus_cost - min_cost) / (2 * eps)
    # Raise an error if the numerical gradient is not close to the backprop gradient
    if not np.isclose(grad_num, grad_backprop):
        raise ValueError('Numerical gradient of {:.6f} is not close to '
                         'the backpropagation gradient of {:.6f}!'.format(grad_num, grad_backprop))
print('No gradient errors found')
```


### Parameter update

The functions below plot the cost surface over $(w_x, w_{rec})$, together with a set of annotated points, and show how the gradient at those points evolves backwards through time.

```python
# Define plotting functions

# Define points to annotate (wx, wRec, color)
points = [(2,1,'r'), (1,2,'b'), (1,-2,'g'), (1,0,'c'), (1,0.5,'m'), (1,-0.5,'y')]

def get_cost_surface(w1_low, w1_high, w2_low, w2_high, nb_of_ws, cost_func):
    """Define a grid of weights for which we want to plot the cost."""
    w1 = np.linspace(w1_low, w1_high, num=nb_of_ws)  # Weight 1
    w2 = np.linspace(w2_low, w2_high, num=nb_of_ws)  # Weight 2
    ws1, ws2 = np.meshgrid(w1, w2)  # Generate grid
    cost_ws = np.zeros((nb_of_ws, nb_of_ws))  # Initialize cost matrix
    # Fill the cost matrix for each combination of weights
    for i in range(nb_of_ws):
        for j in range(nb_of_ws):
            cost_ws[i,j] = cost_func(ws1[i,j], ws2[i,j])
    return ws1, ws2, cost_ws

def plot_surface(ax, ws1, ws2, cost_ws):
    """Plot the cost as a function of the weights."""
    surf = ax.contourf(ws1, ws2, cost_ws, levels=np.logspace(-0.2, 8, 30), cmap=cm.pink, norm=LogNorm())
    ax.set_xlabel('$w_{in}$', fontsize=15)
    ax.set_ylabel('$w_{rec}$', fontsize=15)
    return surf

def plot_points(ax, points):
    """Plot the annotation points on the given axis."""
    for wx, wRec, c in points:
        ax.plot(wx, wRec, c+'o', linewidth=2)

def get_cost_surface_figure(cost_func, points):
    """Plot the cost surfaces together with the annotated points."""
    # Plot figures
    fig = plt.figure(figsize=(10, 4))
    # Plot overview of cost function
    ax_1 = fig.add_subplot(1, 2, 1)
    ws1_1, ws2_1, cost_ws_1 = get_cost_surface(-3, 3, -3, 3, 100, cost_func)
    surf_1 = plot_surface(ax_1, ws1_1, ws2_1, cost_ws_1 + 1)
    plot_points(ax_1, points)
    ax_1.set_xlim(-3, 3)
    ax_1.set_ylim(-3, 3)
    # Plot zoom of cost function
    ax_2 = fig.add_subplot(1, 2, 2)
    ws1_2, ws2_2, cost_ws_2 = get_cost_surface(0, 2, 0, 2, 100, cost_func)
    surf_2 = plot_surface(ax_2, ws1_2, ws2_2, cost_ws_2 + 1)
    plot_points(ax_2, points)
    ax_2.set_xlim(0, 2)
    ax_2.set_ylim(0, 2)
    # Show the colorbar
    cax = fig.add_axes([0.85, 0.12, 0.03, 0.78])
    cbar = fig.colorbar(surf_1, ticks=np.logspace(0, 8, 9), cax=cax)
    cbar.ax.set_ylabel('$\\xi$', fontsize=15, rotation=0, labelpad=20)
    cbar.set_ticklabels(['{:.0e}'.format(i) for i in np.logspace(0, 8, 9)])
    fig.suptitle('Cost surface', fontsize=15)
    return fig
```

"""Plot the gradients of the annotated point and how the evolve over time."""
fig = plt.figure(figsize=(6.5, 4))
ax = plt.subplot(111)
# Plot points
for wx, wRec, c in points:
plt.plot(x, np.sum(grad_over_time, axis=0), c+'-', label='({0}, {1})'.format(wx, wRec), linewidth=1, markersize=8)
# Set up plot axis
plt.xticks(x)
plt.yscale('symlog')
plt.yticks([10**8, 10**6, 10**4, 10**2, 0, -10**2, -10**4, -10**6, -10**8])
plt.xlabel('timestep k', fontsize=12)
plt.ylabel('$\\frac{\\partial \\xi}{\\partial S_{k}}$', fontsize=20, rotation=0)
plt.grid()
plt.title('Unstability of gradient in backward propagation.\n(backpropagate from left to right)')
# Set legend
leg = plt.legend(loc='center left', bbox_to_anchor=(1, 0.5), frameon=False, numpoints=1)
leg.set_title('$(w_x, w_{rec})$', prop={'size':15})

"""Helper func to only get the gradient over time from wx and wRec."""
S = forward_states(X, wx, wRec)

```python
# Plot the cost surface and gradients

# Get and plot the cost surface figure with markers
fig = get_cost_surface_figure(lambda w1, w2: cost(forward_states(X, w1, w2)[:,-1], t), points)

# Get the plots of the gradients changing by backpropagating
plot_gradient_over_time(points, get_grad_over_time)
# Show figures
plt.show()
```

*(Figures: "Cost surface" and "Instability of the gradient in backward propagation")*

### Resilient backpropagation (Rprop)

The Rprop algorithm (resilient backpropagation) adapts a separate step size $\Delta_i$ for each parameter and uses only the sign of the gradient, not its magnitude: the step size is increased by a factor $\eta^+$ while the gradient keeps its sign, and decreased by a factor $\eta^-$ when the sign flips.
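In symbols, writing $s_i = \operatorname{sign}(\partial \xi / \partial w_i)$ for the sign of the current gradient, the update implemented below is

$$\Delta_i \leftarrow \begin{cases} \eta^{+}\,\Delta_i & \text{if } s_i = s_i^{\text{prev}},\\ \eta^{-}\,\Delta_i & \text{otherwise,} \end{cases} \qquad w_i \leftarrow w_i - s_i\,\Delta_i.$$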

```python
# Define Rprop optimisation function
def update_rprop(X, t, W, W_prev_sign, W_delta, eta_p, eta_n):
    """
    Update Rprop values in one iteration.
    X: input data.
    t: targets.
    W: Current weight parameters.
    W_prev_sign: Previous sign of the W gradient.
    W_delta: Rprop update values (Delta).
    eta_p, eta_n: Rprop hyperparameters.
    """
    # Perform forward and backward pass to get the gradients
    S = forward_states(X, W[0], W[1])
    grad_out = output_gradient(S[:,-1], t)
    W_grads, _ = backward_gradient(X, S, grad_out, W[1])
    W_sign = np.sign(W_grads)  # Sign of the new gradient
    # Update the Delta (update value) for each weight parameter separately
    for i, _ in enumerate(W):
        if W_sign[i] == W_prev_sign[i]:
            W_delta[i] *= eta_p
        else:
            W_delta[i] *= eta_n
    return W_delta, W_sign
```

```python
# Perform Rprop optimisation

# Set hyperparameters
eta_p = 1.2
eta_n = 0.5

# Set initial parameters
W = [-1.5, 2]  # [wx, wRec]
W_delta = [0.001, 0.001]  # Update values (Delta) for W
W_sign = [0, 0]  # Previous sign of W

ls_of_ws = [(W[0], W[1])]  # List of weights to plot
# Iterate over 500 iterations
for i in range(500):
    # Get the update values and the sign of the last gradient
    W_delta, W_sign = update_rprop(X, t, W, W_sign, W_delta, eta_p, eta_n)
    # Update each weight parameter separately
    for j, _ in enumerate(W):
        W[j] -= W_sign[j] * W_delta[j]
    ls_of_ws.append((W[0], W[1]))  # Add weights to list to plot

print('Final weights are: wx = {0},  wRec = {1}'.format(W[0], W[1]))
```


Final weights are: wx = 1.00135554721, wRec = 0.999674473785
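Rprop converges to $(w_x, w_{rec}) \approx (1, 1)$, which is the exact solution: with both weights equal to 1 the recurrence simply accumulates the running sum of the input sequence.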

```python
# Plot the cost surface with the weights over the iterations.

# Define plot function
def plot_optimisation(ls_of_ws, cost_func):
    """Plot the optimisation iterations on the cost surface."""
    ws1, ws2 = zip(*ls_of_ws)
    # Plot figures
    fig = plt.figure(figsize=(10, 4))
    # Plot overview of cost function
    ax_1 = fig.add_subplot(1, 2, 1)
    ws1_1, ws2_1, cost_ws_1 = get_cost_surface(-3, 3, -3, 3, 100, cost_func)
    surf_1 = plot_surface(ax_1, ws1_1, ws2_1, cost_ws_1 + 1)
    ax_1.plot(ws1, ws2, 'b.')
    ax_1.set_xlim([-3,3])
    ax_1.set_ylim([-3,3])
    # Plot zoom of cost function
    ax_2 = fig.add_subplot(1, 2, 2)
    ws1_2, ws2_2, cost_ws_2 = get_cost_surface(0, 2, 0, 2, 100, cost_func)
    surf_2 = plot_surface(ax_2, ws1_2, ws2_2, cost_ws_2 + 1)
    ax_2.plot(ws1, ws2, 'b.')
    ax_2.set_xlim([0,2])
    ax_2.set_ylim([0,2])
    # Show the colorbar
    cax = fig.add_axes([0.85, 0.12, 0.03, 0.78])
    cbar = fig.colorbar(surf_1, ticks=np.logspace(0, 8, 9), cax=cax)
    cbar.ax.set_ylabel('$\\xi$', fontsize=15)
    cbar.set_ticklabels(['{:.0e}'.format(i) for i in np.logspace(0, 8, 9)])
    plt.suptitle('Cost surface', fontsize=15)
    plt.show()

# Plot the optimisation
plot_optimisation(ls_of_ws, lambda w1, w2: cost(forward_states(X, w1, w2)[:,-1], t))
```

*(Figure: "Cost surface" with the Rprop optimisation steps plotted as blue dots)*

# Test the model

```python
test_inpt = np.array([[0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1]])
print(test_inpt)
test_outpt = forward_states(test_inpt, W[0], W[1])[:,-1]
print('Target output: {:d} vs Model output: {:.2f}'.format(test_inpt.sum(), test_outpt[0]))
```


Target output: 5 vs Model output: 4.99
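Note that the test sequence has 12 elements even though the network was trained on sequences of length 10. Because the same two weights are applied at every timestep, the trained network generalises to sequences of other lengths.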
