Training MNIST with a 2-Layer Neural Network
TOC
- Import Necessary Packages
- Getting MNIST Data Set
- Affine Neuron
- ReLU - Activation Function
- Softmax-With-Loss
- Two Layer Neural Network for MNIST
- Train MNIST Data Set
Import Necessary Packages
- os and pickle are required to read the MNIST data set.
- collections is required to use OrderedDict.
- numpy is required to use several mathematical functions and numpy arrays.
- matplotlib.pyplot is required to draw plots.
import os
import pickle
import collections
import numpy as np
import matplotlib.pyplot as plt
Getting MNIST Data Set
- Before training, the MNIST data set should be prepared.
# File manager
class MnistManager():
    def __init__(self):
        # The number of labels
        self._nb_labels = 10

    # Transform data to one hot encoding format
    def _encode_one_hot(self, X):
        T = np.zeros((X.size, self._nb_labels))
        for idx, row in enumerate(T):
            row[X[idx]] = 1
        return T

    # Get MNIST data which is normalized and one hot encoded
    def getMNIST(self):
        # The pickle file for the MNIST data set
        pklFile = "mnist.pkl"
        if not os.path.exists(pklFile):
            return (0, 0), (0, 0)
        dataset = None
        with open(pklFile, "rb") as f:
            dataset = pickle.load(f)
        # Data normalization
        for key in ("train_img", "test_img"):
            dataset[key] = dataset[key].astype(np.float32)
            dataset[key] /= 255.0
        dataset["train_label"] = self._encode_one_hot(dataset["train_label"])
        dataset["test_label"] = self._encode_one_hot(dataset["test_label"])
        return (dataset["train_img"], dataset["train_label"]), \
               (dataset["test_img"], dataset["test_label"])
Affine Neuron
- The class for the Affine neuron is:
# Affine
class Affine():
    def __init__(self, W, b):
        self._W = W
        self._b = b
        self._X = None
        self._original_x_shape = None
        # Gradients computed in backward(); read by the network
        self.dW = None
        self.db = None

    def forward(self, X):
        self._original_x_shape = X.shape
        X = X.reshape(X.shape[0], -1)
        self._X = X
        out = np.dot(self._X, self._W) + self._b
        return out

    def backward(self, d):
        dx = np.dot(d, self._W.T)
        self.dW = np.dot(self._X.T, d)
        self.db = np.sum(d, axis=0)
        dx = dx.reshape(*self._original_x_shape)
        return dx
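- A quick sanity check of the Affine layer (a minimal sketch with arbitrary small shapes): the forward output has shape (batch, output) and the backward gradient has the same shape as the input.
# Sketch: shape check of the Affine layer with a tiny random batch
W = 0.01 * np.random.randn(4, 3)
b = np.zeros(3)
affine = Affine(W, b)
X = np.random.randn(2, 4)              # batch of 2 samples with 4 features
out = affine.forward(X)
dx = affine.backward(np.ones_like(out))
print(out.shape, dx.shape)             # (2, 3) (2, 4)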
ReLU - Activation Function
- ReLU is used as the activation function of the Affine neurons.
# ReLU
class Relu():
    def __init__(self):
        self._mask = None

    def forward(self, X):
        # Remember which elements were non-positive
        self._mask = (X <= 0)
        out = X.copy()
        out[self._mask] = 0
        return out

    def backward(self, d):
        # Block the gradient where the input was non-positive
        d[self._mask] = 0
        dx = d
        return dx
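- A quick check of ReLU (a minimal sketch): non-positive inputs are zeroed in forward, and their gradients are blocked in backward.
# Sketch: ReLU zeroes non-positive inputs and blocks their gradients
relu = Relu()
X = np.array([[-1.0, 2.0], [3.0, -4.0]])
print(relu.forward(X))                   # [[0. 2.] [3. 0.]]
print(relu.backward(np.ones_like(X)))    # [[0. 1.] [1. 0.]]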
Softmax-With-Loss
- The Softmax-With-Loss neuron turns the scores into output probabilities and calculates the cost.
# Softmax-With-Loss
class SoftmaxWithLoss():
    def __init__(self):
        self._loss = None
        self._Y = None
        self._labels = None

    def _softmax(self, X):
        if X.ndim == 2:
            X = X.T
            # Protect from overflow
            X = X - np.max(X, axis=0)
            Y = np.exp(X) / np.sum(np.exp(X), axis=0)
            return Y.T
        # Protect from overflow
        X = X - np.max(X)
        return np.exp(X) / np.sum(np.exp(X))

    def _cross_entropy_error(self, Y, labels):
        if Y.ndim == 1:
            labels = labels.reshape(1, labels.size)
            Y = Y.reshape(1, Y.size)
        # If the labels are one-hot encoded,
        # convert them to class indices
        if labels.size == Y.size:
            labels = labels.argmax(axis=1)
        batch_size = Y.shape[0]
        log_val = np.log(Y[np.arange(batch_size), labels])
        return -np.sum(log_val) / batch_size

    def forward(self, X, labels):
        self._labels = labels
        self._Y = self._softmax(X)
        self._loss = self._cross_entropy_error(self._Y, self._labels)
        return self._loss

    def backward(self, d=1):
        batch_size = self._labels.shape[0]
        if self._labels.size == self._Y.size:
            # One-hot labels: gradient is (Y - labels) / batch_size
            dx = (self._Y - self._labels) / batch_size
        else:
            # Class-index labels: subtract 1 at the answer positions
            dx = self._Y.copy()
            dx[np.arange(batch_size), self._labels] -= 1
            dx = dx / batch_size
        return dx
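- A quick check of Softmax-With-Loss (a minimal sketch): forward returns a small cross-entropy loss when the largest score matches the one-hot label, and backward returns a gradient with the same shape as the scores.
# Sketch: loss and gradient of Softmax-With-Loss on a tiny batch
swl = SoftmaxWithLoss()
scores = np.array([[2.0, 1.0, 0.1],
                   [0.1, 0.2, 3.0]])
labels = np.array([[1, 0, 0],
                   [0, 0, 1]])           # one-hot labels: classes 0 and 2
print(swl.forward(scores, labels))       # small positive loss
print(swl.backward().shape)              # (2, 3), gradient w.r.t. the scores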
Two Layer Neural Network for MNIST
- For the MNIST data set, the input layer has 784 nodes and the output layer has 10 nodes.
- With a batch size of 100, the input and output matrices each have 100 rows.
- Here, the 2-layer NN is structured as Affine1 → ReLU → Affine2 → Softmax-With-Loss.
- The code is:
# Two Layer Neural Network
class TwoLayerNeuralNetwork():
    def __init__(self, input_size, hidden_size, output_size,
                 weight_init_std=0.01):
        # Dictionary for weights and biases
        self._params = {}
        # randn draws samples from the standard normal distribution
        # (standard deviation 1). Multiplying by weight_init_std gives
        # a random array whose standard deviation is weight_init_std.
        self._params["W1"] = weight_init_std * \
            np.random.randn(input_size, hidden_size)
        self._params["W2"] = weight_init_std * \
            np.random.randn(hidden_size, output_size)
        # Set all biases to zero
        self._params["b1"] = np.zeros(hidden_size)
        self._params["b2"] = np.zeros(output_size)
        # Create the hierarchical layers in order
        self._layers = collections.OrderedDict()
        self._layers["Affine1"] = Affine(self._params["W1"], self._params["b1"])
        self._layers["Relu"] = Relu()
        self._layers["Affine2"] = Affine(self._params["W2"], self._params["b2"])
        self._layers["Output"] = SoftmaxWithLoss()

    # Prediction function
    def predict(self, X):
        for name, layer in self._layers.items():
            if name != "Output":
                X = layer.forward(X)
        return X

    # Cost function
    def cost(self, X, labels):
        Y = self.predict(X)
        return self._layers["Output"].forward(Y, labels)

    # Accuracy function
    def accuracy(self, X, labels):
        Y = self.predict(X)
        Y = np.argmax(Y, axis=1)
        # If the labels are one-hot encoded, convert them to class indices
        if labels.ndim != 1:
            labels = np.argmax(labels, axis=1)
        accuracy = np.sum(Y == labels) / float(X.shape[0])
        return accuracy

    # Gradient function
    def gradient(self, X, labels):
        # Forward
        self.cost(X, labels)
        # Backward
        d = 1
        layers = reversed(list(self._layers.values()))
        for layer in layers:
            d = layer.backward(d)
        # Result
        grads = {}
        grads["W1"] = self._layers["Affine1"].dW
        grads["W2"] = self._layers["Affine2"].dW
        grads["b1"] = self._layers["Affine1"].db
        grads["b2"] = self._layers["Affine2"].db
        return grads

    # Update weights and biases
    def update(self, grads, learning_rate):
        for key in ("W1", "W2", "b1", "b2"):
            self._params[key] -= learning_rate * grads[key]
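- A gradient check is a useful sanity test for the backward passes. The sketch below (with a hypothetical numerical_gradient helper and tiny random data) compares the backpropagated gradient of W2 with a centered-difference numerical gradient; the two should agree to several decimal places if the layers are implemented correctly.
# Sketch: gradient check of W2 against a numerical gradient
def numerical_gradient(f, x, eps=1e-4):
    # Centered difference df/dx, element by element
    grad = np.zeros_like(x)
    it = np.nditer(x, flags=["multi_index"])
    while not it.finished:
        idx = it.multi_index
        orig = x[idx]
        x[idx] = orig + eps
        fxh1 = f()
        x[idx] = orig - eps
        fxh2 = f()
        grad[idx] = (fxh1 - fxh2) / (2 * eps)
        x[idx] = orig
        it.iternext()
    return grad

net = TwoLayerNeuralNetwork(input_size=4, hidden_size=3, output_size=2)
X = np.random.rand(5, 4)
labels = np.eye(2)[np.random.choice(2, 5)]
analytic = net.gradient(X, labels)["W2"]
numeric = numerical_gradient(lambda: net.cost(X, labels), net._params["W2"])
print(np.max(np.abs(analytic - numeric)))    # expect a very small number, e.g. < 1e-6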
Train MNIST Data Set
- The number of training instances is 60000 and the batch size is 100, so one epoch takes 600 iterations.
# Train MNIST
# Create MNIST manager
mnistManager = MnistManager()
# Get MNIST data
(X_train, label_train), (X_test, label_test) = mnistManager.getMNIST()
print("Training Input: {0}".format(X_train.shape))
print("Training Label: {0}".format(label_train.shape))
print("Testing Input: {0}".format(X_test.shape))
print("Testing Label: {0}".format(label_test.shape))
# Create the two layer neural network
network = TwoLayerNeuralNetwork(input_size=784, hidden_size=50, output_size=10)
# Total number of trials
iters_num = 10001
# The number of instances
train_size = X_train.shape[0]
print("Instances: {0}".format(train_size))
# How many instances are used for one training trial
batch_size = 100
# Learning rate
learning_rate = 0.1
# Variables for the graph
train_costs = []
train_accs = []
test_accs = []
# The number of iterations for one epoch
iter_per_epoch = max(train_size / batch_size, 1)
for i in range(iters_num):
    # Stochastic method - randomly choose instances to train on
    batch_mask = np.random.choice(train_size, batch_size)
    # Training input data chosen
    X_batch = X_train[batch_mask]
    # Training label data chosen
    label_batch = label_train[batch_mask]
    # Gradient descent to optimize weights and biases
    grads = network.gradient(X_batch, label_batch)
    # Update weights and biases
    network.update(grads, learning_rate)
    # Check the variation of cost and accuracy once per epoch
    if i % iter_per_epoch == 0:
        # Cost
        cost = network.cost(X_batch, label_batch)
        train_costs.append(cost)
        # Accuracy on training data
        train_acc = network.accuracy(X_train, label_train)
        train_accs.append(train_acc)
        # Accuracy on testing data
        test_acc = network.accuracy(X_test, label_test)
        test_accs.append(test_acc)
        print("Iter: {0}, Cost: {1:0.5f}, Train acc: {2:0.5f}, Test acc: {3:0.5f}"
              .format(i, cost, train_acc, test_acc))
# Draw graph
x = np.arange(len(train_accs))
plt.plot(x, train_accs, label="train acc")
plt.plot(x, test_accs, label="test acc", linestyle="--")
plt.xlabel("trials")
plt.ylabel("accuracy")
plt.title("2-layer Neural Network")
plt.legend(loc="lower right")
plt.show()
Training Input: (60000, 784)
Training Label: (60000, 10)
Testing Input: (10000, 784)
Testing Label: (10000, 10)
Instances: 60000
Iter: 0, Cost: 2.30126, Train acc: 0.12698, Test acc: 0.13270
Iter: 600, Cost: 0.27200, Train acc: 0.90525, Test acc: 0.90800
Iter: 1200, Cost: 0.39566, Train acc: 0.92173, Test acc: 0.92550
Iter: 1800, Cost: 0.13303, Train acc: 0.93557, Test acc: 0.93380
Iter: 2400, Cost: 0.09149, Train acc: 0.94503, Test acc: 0.94150
Iter: 3000, Cost: 0.12236, Train acc: 0.95112, Test acc: 0.94730
Iter: 3600, Cost: 0.14326, Train acc: 0.95478, Test acc: 0.95200
Iter: 4200, Cost: 0.11878, Train acc: 0.96085, Test acc: 0.95630
Iter: 4800, Cost: 0.10453, Train acc: 0.96307, Test acc: 0.95910
Iter: 5400, Cost: 0.17780, Train acc: 0.96685, Test acc: 0.95860
Iter: 6000, Cost: 0.04278, Train acc: 0.96935, Test acc: 0.96150
Iter: 6600, Cost: 0.05189, Train acc: 0.97065, Test acc: 0.96300
Iter: 7200, Cost: 0.06384, Train acc: 0.97180, Test acc: 0.96350
Iter: 7800, Cost: 0.20248, Train acc: 0.97400, Test acc: 0.96680
Iter: 8400, Cost: 0.05188, Train acc: 0.97630, Test acc: 0.96750
Iter: 9000, Cost: 0.03629, Train acc: 0.97742, Test acc: 0.96720
Iter: 9600, Cost: 0.04282, Train acc: 0.97823, Test acc: 0.96940
Image 1. Accuracy of 2 layer Neural Network for MNIST
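- The train_costs list collected during training is not shown in the accuracy plot; below is a minimal sketch to also visualize the cost recorded once per epoch.
# Sketch: draw the training cost recorded once per epoch
x = np.arange(len(train_costs))
plt.plot(x, train_costs, label="train cost")
plt.xlabel("epochs")
plt.ylabel("cost")
plt.title("2-layer Neural Network - Training Cost")
plt.legend(loc="upper right")
plt.show()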