Training MNIST with a 2-Layer Neural Network
TOC
- Import Necessary Packages
- Getting MNIST Data Set
- Affine Neuron
- ReLU - Activation Function
- Softmax-With-Loss
- Two Layer Neural Network for MNIST
- Train MNIST Data Set
Import Necessary Packages
- os and pickle are required to read the MNIST data set.
- collections is required to use OrderedDict.
- numpy is required to use several mathematical functions and numpy arrays.
- matplotlib.pyplot is required to draw plots.
import os
import pickle
import collections
import numpy as np
import matplotlib.pyplot as plt
Getting MNIST Data Set
- Before training, the MNIST data set should be prepared.
# File manager
class MnistManager():
    def __init__(self):
        # The number of labels
        self._nb_labels = 10

    # Transform data to one hot encoding format
    def _encode_one_hot(self, X):
        T = np.zeros((X.size, self._nb_labels))
        for idx, row in enumerate(T):
            row[X[idx]] = 1
        return T

    # Get MNIST data which is normalized and one hot encoded
    def getMNIST(self):
        # The pickle file for the MNIST data set
        pklFile = "mnist.pkl"
        if not os.path.exists(pklFile):
            return (0, 0), (0, 0)
        dataset = None
        with open(pklFile, "rb") as f:
            dataset = pickle.load(f)
        # Data normalization
        for key in ("train_img", "test_img"):
            dataset[key] = dataset[key].astype(np.float32)
            dataset[key] /= 255.0
        dataset["train_label"] = self._encode_one_hot(dataset["train_label"])
        dataset["test_label"] = self._encode_one_hot(dataset["test_label"])
        return (dataset["train_img"], dataset["train_label"]), \
               (dataset["test_img"], dataset["test_label"])
Affine Neuron
- The class for the Affine neuron is:
# Affine
class Affine():
    def __init__(self, W, b):
        self._W = W
        self._b = b
        self._X = None
        self._original_x_shape = None
        # Gradients computed in backward(); read by the network
        self.dW = None
        self.db = None

    def forward(self, X):
        self._original_x_shape = X.shape
        X = X.reshape(X.shape[0], -1)
        self._X = X
        out = np.dot(self._X, self._W) + self._b
        return out

    def backward(self, d):
        dx = np.dot(d, self._W.T)
        self.dW = np.dot(self._X.T, d)
        self.db = np.sum(d, axis=0)
        dx = dx.reshape(*self._original_x_shape)
        return dx
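- A quick sanity check of the Affine layer (a minimal sketch with arbitrary small shapes): the forward output has shape (batch, output) and the backward gradient has the same shape as the input.
# Sketch: shape check of the Affine layer with a tiny random batch
W = 0.01 * np.random.randn(4, 3)
b = np.zeros(3)
affine = Affine(W, b)
X = np.random.randn(2, 4)              # batch of 2 samples with 4 features
out = affine.forward(X)
dx = affine.backward(np.ones_like(out))
print(out.shape, dx.shape)             # (2, 3) (2, 4)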
ReLU - Activation Function
- ReLU is used as the activation function of the Affine neurons.
# ReLU
class Relu():
    def __init__(self):
        self._mask = None

    def forward(self, X):
        # Remember which elements were non-positive
        self._mask = (X <= 0)
        out = X.copy()
        out[self._mask] = 0
        return out

    def backward(self, d):
        # Block the gradient where the input was non-positive
        d[self._mask] = 0
        dx = d
        return dx
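- A quick check of ReLU (a minimal sketch): non-positive inputs are zeroed in forward, and their gradients are blocked in backward.
# Sketch: ReLU zeroes non-positive inputs and blocks their gradients
relu = Relu()
X = np.array([[-1.0, 2.0], [3.0, -4.0]])
print(relu.forward(X))                   # [[0. 2.] [3. 0.]]
print(relu.backward(np.ones_like(X)))    # [[0. 1.] [1. 0.]]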
Softmax-With-Loss
- The Softmax-With-Loss neuron turns the scores into output probabilities and calculates the cost.
# Softmax-With-Loss
class SoftmaxWithLoss():
    def __init__(self):
        self._loss = None
        self._Y = None
        self._labels = None

    def _softmax(self, X):
        if X.ndim == 2:
            X = X.T
            # Protect from overflow
            X = X - np.max(X, axis=0)
            Y = np.exp(X) / np.sum(np.exp(X), axis=0)
            return Y.T
        # Protect from overflow
        X = X - np.max(X)
        return np.exp(X) / np.sum(np.exp(X))

    def _cross_entropy_error(self, Y, labels):
        if Y.ndim == 1:
            labels = labels.reshape(1, labels.size)
            Y = Y.reshape(1, Y.size)
        # If the labels are one-hot encoded,
        # convert them to class indices
        if labels.size == Y.size:
            labels = labels.argmax(axis=1)
        batch_size = Y.shape[0]
        log_val = np.log(Y[np.arange(batch_size), labels])
        return -np.sum(log_val) / batch_size

    def forward(self, X, labels):
        self._labels = labels
        self._Y = self._softmax(X)
        self._loss = self._cross_entropy_error(self._Y, self._labels)
        return self._loss

    def backward(self, d=1):
        batch_size = self._labels.shape[0]
        if self._labels.size == self._Y.size:
            # One-hot labels: gradient is (Y - labels) / batch_size
            dx = (self._Y - self._labels) / batch_size
        else:
            # Class-index labels: subtract 1 at the answer positions
            dx = self._Y.copy()
            dx[np.arange(batch_size), self._labels] -= 1
            dx = dx / batch_size
        return dx
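- A quick check of Softmax-With-Loss (a minimal sketch): forward returns a small cross-entropy loss when the largest score matches the one-hot label, and backward returns a gradient with the same shape as the scores.
# Sketch: loss and gradient of Softmax-With-Loss on a tiny batch
swl = SoftmaxWithLoss()
scores = np.array([[2.0, 1.0, 0.1],
                   [0.1, 0.2, 3.0]])
labels = np.array([[1, 0, 0],
                   [0, 0, 1]])           # one-hot labels: classes 0 and 2
print(swl.forward(scores, labels))       # small positive loss
print(swl.backward().shape)              # (2, 3), gradient w.r.t. the scores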
Two Layer Neural Network for MNIST
- For the MNIST data set, the input layer has 784 nodes and the output layer has 10 nodes.
- With a batch size of 100, the input and output matrices each have 100 rows.
- Here, the 2-layer NN is structured as Affine1 → ReLU → Affine2 → Softmax-With-Loss.
- The code is:
# Two Layer Neural Network
class TwoLayerNeuralNetwork():
    def __init__(self, input_size, hidden_size, output_size,
                 weight_init_std=0.01):
        # Dictionary for weights and biases
        self._params = {}
        # randn draws samples from the standard normal distribution
        # (standard deviation 1). Multiplying by weight_init_std gives
        # a random array whose standard deviation is weight_init_std.
        self._params["W1"] = weight_init_std * \
            np.random.randn(input_size, hidden_size)
        self._params["W2"] = weight_init_std * \
            np.random.randn(hidden_size, output_size)
        # Set all biases to zero
        self._params["b1"] = np.zeros(hidden_size)
        self._params["b2"] = np.zeros(output_size)
        # Create the hierarchical layers in order
        self._layers = collections.OrderedDict()
        self._layers["Affine1"] = Affine(self._params["W1"], self._params["b1"])
        self._layers["Relu"] = Relu()
        self._layers["Affine2"] = Affine(self._params["W2"], self._params["b2"])
        self._layers["Output"] = SoftmaxWithLoss()

    # Prediction function
    def predict(self, X):
        for name, layer in self._layers.items():
            if name != "Output":
                X = layer.forward(X)
        return X

    # Cost function
    def cost(self, X, labels):
        Y = self.predict(X)
        return self._layers["Output"].forward(Y, labels)

    # Accuracy function
    def accuracy(self, X, labels):
        Y = self.predict(X)
        Y = np.argmax(Y, axis=1)
        # If the labels are one-hot encoded, convert them to class indices
        if labels.ndim != 1:
            labels = np.argmax(labels, axis=1)
        accuracy = np.sum(Y == labels) / float(X.shape[0])
        return accuracy

    # Gradient function
    def gradient(self, X, labels):
        # Forward
        self.cost(X, labels)
        # Backward
        d = 1
        layers = reversed(list(self._layers.values()))
        for layer in layers:
            d = layer.backward(d)
        # Result
        grads = {}
        grads["W1"] = self._layers["Affine1"].dW
        grads["W2"] = self._layers["Affine2"].dW
        grads["b1"] = self._layers["Affine1"].db
        grads["b2"] = self._layers["Affine2"].db
        return grads

    # Update weights and biases
    def update(self, grads, learning_rate):
        for key in ("W1", "W2", "b1", "b2"):
            self._params[key] -= learning_rate * grads[key]
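- A gradient check is a useful sanity test for the backward passes. The sketch below (with a hypothetical numerical_gradient helper and tiny random data) compares the backpropagated gradient of W2 with a centered-difference numerical gradient; the two should agree to several decimal places if the layers are implemented correctly.
# Sketch: gradient check of W2 against a numerical gradient
def numerical_gradient(f, x, eps=1e-4):
    # Centered difference df/dx, element by element
    grad = np.zeros_like(x)
    it = np.nditer(x, flags=["multi_index"])
    while not it.finished:
        idx = it.multi_index
        orig = x[idx]
        x[idx] = orig + eps
        fxh1 = f()
        x[idx] = orig - eps
        fxh2 = f()
        grad[idx] = (fxh1 - fxh2) / (2 * eps)
        x[idx] = orig
        it.iternext()
    return grad

net = TwoLayerNeuralNetwork(input_size=4, hidden_size=3, output_size=2)
X = np.random.rand(5, 4)
labels = np.eye(2)[np.random.choice(2, 5)]
analytic = net.gradient(X, labels)["W2"]
numeric = numerical_gradient(lambda: net.cost(X, labels), net._params["W2"])
print(np.max(np.abs(analytic - numeric)))    # expect a very small number, e.g. < 1e-6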
Train MNIST Data Set
- The number of training instances is 60000 and the batch size is 100, so one epoch takes 600 iterations.
# Train MNIST
# Create MNIST manager
mnistManager = MnistManager()
# Get MNIST data
(X_train, label_train), (X_test, label_test) = mnistManager.getMNIST()
print("Training Input: {0}".format(X_train.shape))
print("Training Label: {0}".format(label_train.shape))
print("Testing Input: {0}".format(X_test.shape))
print("Testing Label: {0}".format(label_test.shape))
# Create the two layer neural network
network = TwoLayerNeuralNetwork(input_size=784, hidden_size=50, output_size=10)
# Total number of trials
iters_num = 10001
# The number of instances
train_size = X_train.shape[0]
print("Instances: {0}".format(train_size))
# How many instances are used for one training trial
batch_size = 100
# Learning rate
learning_rate = 0.1
# Variables for the graph
train_costs = []
train_accs = []
test_accs = []
# The number of iterations for one epoch
iter_per_epoch = max(train_size / batch_size, 1)
for i in range(iters_num):
    # Stochastic method - randomly choose instances to train on
    batch_mask = np.random.choice(train_size, batch_size)
    # Training input data chosen
    X_batch = X_train[batch_mask]
    # Training label data chosen
    label_batch = label_train[batch_mask]
    # Gradient descent to optimize weights and biases
    grads = network.gradient(X_batch, label_batch)
    # Update weights and biases
    network.update(grads, learning_rate)
    # Check the variation of cost and accuracy once per epoch
    if i % iter_per_epoch == 0:
        # Cost
        cost = network.cost(X_batch, label_batch)
        train_costs.append(cost)
        # Accuracy on training data
        train_acc = network.accuracy(X_train, label_train)
        train_accs.append(train_acc)
        # Accuracy on testing data
        test_acc = network.accuracy(X_test, label_test)
        test_accs.append(test_acc)
        print("Iter: {0}, Cost: {1:0.5f}, Train acc: {2:0.5f}, Test acc: {3:0.5f}"
              .format(i, cost, train_acc, test_acc))
# Draw graph
x = np.arange(len(train_accs))
plt.plot(x, train_accs, label="train acc")
plt.plot(x, test_accs, label="test acc", linestyle="--")
plt.xlabel("trials")
plt.ylabel("accuracy")
plt.title("2-layer Neural Network")
plt.legend(loc="lower right")
plt.show()
Training Input: (60000, 784)
Training Label: (60000, 10)
Testing Input: (10000, 784)
Testing Label: (10000, 10)
Instances: 60000
Iter: 0, Cost: 2.30126, Train acc: 0.12698, Test acc: 0.13270
Iter: 600, Cost: 0.27200, Train acc: 0.90525, Test acc: 0.90800
Iter: 1200, Cost: 0.39566, Train acc: 0.92173, Test acc: 0.92550
Iter: 1800, Cost: 0.13303, Train acc: 0.93557, Test acc: 0.93380
Iter: 2400, Cost: 0.09149, Train acc: 0.94503, Test acc: 0.94150
Iter: 3000, Cost: 0.12236, Train acc: 0.95112, Test acc: 0.94730
Iter: 3600, Cost: 0.14326, Train acc: 0.95478, Test acc: 0.95200
Iter: 4200, Cost: 0.11878, Train acc: 0.96085, Test acc: 0.95630
Iter: 4800, Cost: 0.10453, Train acc: 0.96307, Test acc: 0.95910
Iter: 5400, Cost: 0.17780, Train acc: 0.96685, Test acc: 0.95860
Iter: 6000, Cost: 0.04278, Train acc: 0.96935, Test acc: 0.96150
Iter: 6600, Cost: 0.05189, Train acc: 0.97065, Test acc: 0.96300
Iter: 7200, Cost: 0.06384, Train acc: 0.97180, Test acc: 0.96350
Iter: 7800, Cost: 0.20248, Train acc: 0.97400, Test acc: 0.96680
Iter: 8400, Cost: 0.05188, Train acc: 0.97630, Test acc: 0.96750
Iter: 9000, Cost: 0.03629, Train acc: 0.97742, Test acc: 0.96720
Iter: 9600, Cost: 0.04282, Train acc: 0.97823, Test acc: 0.96940
Image 1. Accuracy of 2 layer Neural Network for MNIST
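- The train_costs list collected during training is not shown in the accuracy plot; below is a minimal sketch to also visualize the cost recorded once per epoch.
# Sketch: draw the training cost recorded once per epoch
x = np.arange(len(train_costs))
plt.plot(x, train_costs, label="train cost")
plt.xlabel("epochs")
plt.ylabel("cost")
plt.title("2-layer Neural Network - Training Cost")
plt.legend(loc="upper right")
plt.show()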