How to Optimize a DNN with TensorFlow
Adam Optimizer
- Gradient descent (or stochastic gradient descent) is the baseline algorithm for finding optimized weights and biases.
- However, many newer and more efficient optimizers have been proposed, and Adam is one of the most widely used (a sketch of its update rule follows the API example below).
- Fortunately, TensorFlow provides an implementation of Adam, tf.train.AdamOptimizer:
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)
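- Under the hood, Adam keeps exponential moving averages of the gradient and of the squared gradient, corrects their bias, and uses them to scale each parameter update. The following is a minimal NumPy sketch of that update rule, for illustration only; the function name and defaults here are illustrative, not TensorFlow API (AdamOptimizer does all of this internally).
# Illustrative sketch of one Adam update step (not TensorFlow code)
import numpy as np

def adam_update(theta, grad, m, v, t, lr=0.001, beta1=0.9, beta2=0.999, eps=1e-8):
    # Exponential moving averages of the gradient (m) and squared gradient (v)
    m = beta1 * m + (1 - beta1) * grad
    v = beta2 * v + (1 - beta2) * grad ** 2
    # Bias correction for the zero-initialized averages
    m_hat = m / (1 - beta1 ** t)
    v_hat = v / (1 - beta2 ** t)
    # Per-parameter update scaled by the running second moment
    theta = theta - lr * m_hat / (np.sqrt(v_hat) + eps)
    return theta, m, v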
- This is an example of single-layer MNIST training with the Adam optimizer.
# Single layer MNIST with Adam optimizer
import tensorflow as tf
import random
import matplotlib.pyplot as plt
from tensorflow.examples.tutorials.mnist import input_data
# Reproducibility
tf.set_random_seed(777)
# Get MNIST data set
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)
# Parameters
learning_rate = 0.001
epochs = 15
batch_size = 100
# Input place holders
X = tf.placeholder(tf.float32, [None, 784])
Y = tf.placeholder(tf.float32, [None, 10])
# Weights & bias for single layers
W = tf.Variable(tf.random_normal([784, 10]))
b = tf.Variable(tf.random_normal([10]))
# The usual hypothesis would be softmax(affine(X)), but the softmax is
# built into softmax_cross_entropy_with_logits(), so the hypothesis here
# is just the raw logits (the affine transform).
hypothesis = tf.matmul(X, W) + b
# Prediction
prediction = tf.equal(tf.argmax(hypothesis, 1), tf.argmax(Y, 1))
# Cost function
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
logits=hypothesis, labels=Y))
# Accuracy
accuracy = tf.reduce_mean(tf.cast(prediction, tf.float32))
# Optimizer - Adam
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)
# Initialize
sess = tf.Session()
sess.run(tf.global_variables_initializer())
costs = []
accs = []
# Train
for epoch in range(epochs):
    avg_cost = 0
    total_batch = int(mnist.train.num_examples / batch_size)
    for i in range(total_batch):
        batch_xs, batch_ys = mnist.train.next_batch(batch_size)
        feed_dict = {X: batch_xs, Y: batch_ys}
        c, _ = sess.run([cost, optimizer], feed_dict=feed_dict)
        avg_cost += c / total_batch
    costs.append(avg_cost)
    acc = sess.run(accuracy, feed_dict={
        X: mnist.test.images, Y: mnist.test.labels})
    accs.append(acc)
# Print result
print("Final cost: {0}".format(costs[-1]))
print("Final accuracy: {0}".format(accs[-1]))
# Draw graph
plt.plot(range(epochs), costs)
plt.grid()
plt.xlabel("epoch")
plt.ylabel("cost")
plt.title("Costs")
plt.show()
plt.plot(range(epochs), accs)
plt.grid()
plt.xlabel("epoch")
plt.ylabel("accuracy")
plt.title("Accuracies")
plt.show()
Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz
Final cost: 0.41734982719475594
Final accuracy: 0.8978000283241272
Image 1. Cost of single layer MNIST training with Adam optimizer
Image 2. Accuracy of single layer MNIST training with Adam optimizer
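- Because tf.nn.softmax_cross_entropy_with_logits() applies the softmax internally, the model above outputs raw logits. To get class probabilities or predicted labels at inference time, apply the softmax explicitly. A minimal sketch, assuming the session and the tensors from the example above are still available:
# Sketch: probabilities and predicted labels from the logits above
probs = tf.nn.softmax(hypothesis)       # class probabilities
pred_labels = tf.argmax(probs, 1)       # predicted digit per image
p, labels = sess.run([probs, pred_labels],
                     feed_dict={X: mnist.test.images[:5]})
print(labels)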
Multi-Layer Network with Adam Optimizer
- This is an example of multi-layer MNIST training with the Adam optimizer; the hidden layers use ReLU activations. (A more compact formulation using tf.layers.dense is sketched after the results below.)
# Multi layer MNIST with Adam optimizer
import tensorflow as tf
import random
import matplotlib.pyplot as plt
from tensorflow.examples.tutorials.mnist import input_data
# Reproducibility
tf.set_random_seed(777)
# Get MNIST data set
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)
# Parameters
learning_rate = 0.001
epochs = 15
batch_size = 100
# Input place holders
X = tf.placeholder(tf.float32, [None, 784])
Y = tf.placeholder(tf.float32, [None, 10])
# Weights & bias for multiple layers
# ReLU is the activation function for the hidden layers.
W1 = tf.Variable(tf.random_normal([784, 256]))
b1 = tf.Variable(tf.random_normal([256]))
L1 = tf.nn.relu(tf.matmul(X, W1) + b1)
W2 = tf.Variable(tf.random_normal([256, 256]))
b2 = tf.Variable(tf.random_normal([256]))
L2 = tf.nn.relu(tf.matmul(L1, W2) + b2)
W3 = tf.Variable(tf.random_normal([256, 10]))
b3 = tf.Variable(tf.random_normal([10]))
hypothesis = tf.matmul(L2, W3) + b3
# Prediction
prediction = tf.equal(tf.argmax(hypothesis, 1), tf.argmax(Y, 1))
# Cost function
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
logits=hypothesis, labels=Y))
# Accuracy
accuracy = tf.reduce_mean(tf.cast(prediction, tf.float32))
# Optimizer - Adam
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)
# Initialize
sess = tf.Session()
sess.run(tf.global_variables_initializer())
costs = []
accs = []
# Train
for epoch in range(epochs):
    avg_cost = 0
    total_batch = int(mnist.train.num_examples / batch_size)
    for i in range(total_batch):
        batch_xs, batch_ys = mnist.train.next_batch(batch_size)
        feed_dict = {X: batch_xs, Y: batch_ys}
        c, _ = sess.run([cost, optimizer], feed_dict=feed_dict)
        avg_cost += c / total_batch
    costs.append(avg_cost)
    acc = sess.run(accuracy, feed_dict={
        X: mnist.test.images, Y: mnist.test.labels})
    accs.append(acc)
# Print result
print("Final cost: {0}".format(costs[-1]))
print("Final accuracy: {0}".format(accs[-1]))
# Draw graph
plt.plot(range(epochs), costs)
plt.grid()
plt.xlabel("epoch")
plt.ylabel("cost")
plt.title("Costs")
plt.show()
plt.plot(range(epochs), accs)
plt.grid()
plt.xlabel("epoch")
plt.ylabel("accuracy")
plt.title("Accuracies")
plt.show()
Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz
Final cost: 0.796504732984805
Final accuracy: 0.9509000182151794
Image 3. Cost of multi layer MNIST training with Adam optimizer
Image 4. Accuracy of multi layer MNIST training with Adam optimizer
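- The per-layer weight and bias variables above can also be created with the higher-level tf.layers.dense, which bundles the affine transform, its variables, and the activation. A minimal sketch of the same 784-256-256-10 architecture (an alternative formulation, not the code used for the results above):
# Sketch: the same multi-layer network written with tf.layers.dense (TF 1.x)
# Assumes X is the [None, 784] input placeholder defined above
h1 = tf.layers.dense(X, 256, activation=tf.nn.relu)
h2 = tf.layers.dense(h1, 256, activation=tf.nn.relu)
logits = tf.layers.dense(h2, 10)        # raw logits; softmax is applied inside the loss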
Xavier Initializer
- This is an example of multi-layer MNIST training with the Adam optimizer and the Xavier initializer. (What the initializer actually computes is sketched after the results below.)
# Multi layer MNIST with Adam optimizer and Xavier initialization
import tensorflow as tf
import random
import matplotlib.pyplot as plt
from tensorflow.examples.tutorials.mnist import input_data
# Reproducibility
tf.set_random_seed(777)
# Get MNIST data set
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)
# Parameters
learning_rate = 0.001
epochs = 15
batch_size = 100
# Input place holders
X = tf.placeholder(tf.float32, [None, 784])
Y = tf.placeholder(tf.float32, [None, 10])
# Weights & bias for multiple layers
W1 = tf.get_variable("W1_X", shape=[784, 256],
initializer=tf.contrib.layers.xavier_initializer())
b1 = tf.Variable(tf.random_normal([256]))
L1 = tf.nn.relu(tf.matmul(X, W1) + b1)
W2 = tf.get_variable("W2_X", shape=[256, 256],
initializer=tf.contrib.layers.xavier_initializer())
b2 = tf.Variable(tf.random_normal([256]))
L2 = tf.nn.relu(tf.matmul(L1, W2) + b2)
W3 = tf.get_variable("W3_X", shape=[256, 10],
initializer=tf.contrib.layers.xavier_initializer())
b3 = tf.Variable(tf.random_normal([10]))
hypothesis = tf.matmul(L2, W3) + b3
# Prediction
prediction = tf.equal(tf.argmax(hypothesis, 1), tf.argmax(Y, 1))
# Cost function
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
logits=hypothesis, labels=Y))
# Accuracy
accuracy = tf.reduce_mean(tf.cast(prediction, tf.float32))
# Optimizer - Adam
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)
# Initialize
sess = tf.Session()
sess.run(tf.global_variables_initializer())
costs = []
accs = []
# Train
for epoch in range(epochs):
    avg_cost = 0
    total_batch = int(mnist.train.num_examples / batch_size)
    for i in range(total_batch):
        batch_xs, batch_ys = mnist.train.next_batch(batch_size)
        feed_dict = {X: batch_xs, Y: batch_ys}
        c, _ = sess.run([cost, optimizer], feed_dict=feed_dict)
        avg_cost += c / total_batch
    costs.append(avg_cost)
    acc = sess.run(accuracy, feed_dict={
        X: mnist.test.images, Y: mnist.test.labels})
    accs.append(acc)
# Print result
print("Final cost: {0}".format(costs[-1]))
print("Final accuracy: {0}".format(accs[-1]))
# Draw graph
plt.plot(range(epochs), costs)
plt.grid()
plt.xlabel("epoch")
plt.ylabel("cost")
plt.title("Costs")
plt.show()
plt.plot(range(epochs), accs)
plt.grid()
plt.xlabel("epoch")
plt.ylabel("accuracy")
plt.title("Accuracies")
plt.show()
Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz
Final cost: 0.011271840706811581
Final accuracy: 0.9779999852180481
Image 5. Cost of multi layer MNIST training with Adam optimizer and Xavier initializer
Image 6. Accuracy of multi layer MNIST training with Adam optimizer and Xavier initializer
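- For reference, tf.contrib.layers.xavier_initializer() (Glorot initialization) scales the initial weights by the layer's fan-in and fan-out; by default it draws from a uniform distribution with limit sqrt(6 / (fan_in + fan_out)). A minimal NumPy sketch of that default behavior, for illustration only:
# Sketch: Xavier/Glorot uniform initialization
import numpy as np

def xavier_uniform(fan_in, fan_out):
    # Keeps the variance of activations and gradients roughly constant across layers
    limit = np.sqrt(6.0 / (fan_in + fan_out))
    return np.random.uniform(-limit, limit, size=(fan_in, fan_out)).astype(np.float32)

W1_init = xavier_uniform(784, 256)      # same shape as W1 above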
Deep Neural Network with Adam Optimizer and Xavier Initializer
- This is an example of deep neural network MNIST training with the Adam optimizer and the Xavier initializer. (A loop-based way to build the same stack of layers is sketched after the results below.)
# Deep layer MNIST with Adam optimizer and Xavier initialization
import tensorflow as tf
import random
import matplotlib.pyplot as plt
from tensorflow.examples.tutorials.mnist import input_data
# Reproducibility
tf.set_random_seed(777)
# Get MNIST data set
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)
# Parameters
learning_rate = 0.001
epochs = 15
batch_size = 100
# Input place holders
X = tf.placeholder(tf.float32, [None, 784])
Y = tf.placeholder(tf.float32, [None, 10])
# Weights & bias for multiple layers
# Initialization: Xavier
W1 = tf.get_variable("W1_D", shape=[784, 512],
initializer=tf.contrib.layers.xavier_initializer())
b1 = tf.Variable(tf.random_normal([512]))
L1 = tf.nn.relu(tf.matmul(X, W1) + b1)
W2 = tf.get_variable("W2_D", shape=[512, 512],
initializer=tf.contrib.layers.xavier_initializer())
b2 = tf.Variable(tf.random_normal([512]))
L2 = tf.nn.relu(tf.matmul(L1, W2) + b2)
W3 = tf.get_variable("W3_D", shape=[512, 512],
initializer=tf.contrib.layers.xavier_initializer())
b3 = tf.Variable(tf.random_normal([512]))
L3 = tf.nn.relu(tf.matmul(L2, W3) + b3)
W4 = tf.get_variable("W4_D", shape=[512, 512],
initializer=tf.contrib.layers.xavier_initializer())
b4 = tf.Variable(tf.random_normal([512]))
L4 = tf.nn.relu(tf.matmul(L3, W4) + b4)
W5 = tf.get_variable("W5_D", shape=[512, 10],
initializer=tf.contrib.layers.xavier_initializer())
b5 = tf.Variable(tf.random_normal([10]))
hypothesis = tf.matmul(L4, W5) + b5
# Prediction
prediction = tf.equal(tf.argmax(hypothesis, 1), tf.argmax(Y, 1))
# Cost function
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
logits=hypothesis, labels=Y))
# Accuracy
accuracy = tf.reduce_mean(tf.cast(prediction, tf.float32))
# Optimizer - Adam
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)
# Initialize
sess = tf.Session()
sess.run(tf.global_variables_initializer())
costs = []
accs = []
# Train
for epoch in range(epochs):
    avg_cost = 0
    total_batch = int(mnist.train.num_examples / batch_size)
    for i in range(total_batch):
        batch_xs, batch_ys = mnist.train.next_batch(batch_size)
        feed_dict = {X: batch_xs, Y: batch_ys}
        c, _ = sess.run([cost, optimizer], feed_dict=feed_dict)
        avg_cost += c / total_batch
    costs.append(avg_cost)
    acc = sess.run(accuracy, feed_dict={
        X: mnist.test.images, Y: mnist.test.labels})
    accs.append(acc)
# Print result
print("Final cost: {0}".format(costs[-1]))
print("Final accuracy: {0}".format(accs[-1]))
# Draw graph
plt.plot(range(epochs), costs)
plt.grid()
plt.xlabel("epoch")
plt.ylabel("cost")
plt.title("Costs")
plt.show()
plt.plot(range(epochs), accs)
plt.grid()
plt.xlabel("epoch")
plt.ylabel("accuracy")
plt.title("Accuracies")
plt.show()
Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz
Final cost: 0.014999570602045743
Final accuracy: 0.9779000282287598
Image 7. Cost of deep neural network MNIST training with Adam optimizer and Xavier initializer
Image 8. Accuracy of deep neural network MNIST training with Adam optimizer and Xavier initializer
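- The four hidden layers above are defined with repeated blocks; the same graph can be built more compactly in a loop. A minimal sketch (a hypothetical refactoring with the same shapes and initializers; the "_loop" variable names only avoid clashing with the ones already created):
# Sketch: building the 784-512-512-512-512-10 stack in a loop
# Assumes X is the input placeholder from above
sizes = [784, 512, 512, 512, 512, 10]
layer = X
for i in range(len(sizes) - 1):
    W = tf.get_variable("W{}_loop".format(i + 1), shape=[sizes[i], sizes[i + 1]],
                        initializer=tf.contrib.layers.xavier_initializer())
    b = tf.Variable(tf.random_normal([sizes[i + 1]]))
    layer = tf.matmul(layer, W) + b
    if i < len(sizes) - 2:              # ReLU on hidden layers only, not on the logits
        layer = tf.nn.relu(layer)
logits = layer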
Deep Neural Network with Adam Optimizer, Xavier Initializer and Dropout
- This is an example of deep neural network MNIST training with the Adam optimizer, Xavier initializer, and dropout. (A note on how tf.nn.dropout scales activations follows the results below.)
# Deep layer MNIST with Adam optimizer, Xavier initialization and Drop out
import tensorflow as tf
import random
import matplotlib.pyplot as plt
from tensorflow.examples.tutorials.mnist import input_data
# Reproducibility
tf.set_random_seed(777)
# Get MNIST data set
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)
# Parameters
learning_rate = 0.001
epochs = 15
batch_size = 100
# Input place holders
X = tf.placeholder(tf.float32, [None, 784])
Y = tf.placeholder(tf.float32, [None, 10])
# Dropout: keep_prob is 0.7 during training, but should be 1 for testing
keep_prob = tf.placeholder(tf.float32)
# Weights & bias for multiple layers
# Dropout is applied after each hidden layer
W1 = tf.get_variable("W1_DDO", shape=[784, 512],
initializer=tf.contrib.layers.xavier_initializer())
b1 = tf.Variable(tf.random_normal([512]))
L1 = tf.nn.relu(tf.matmul(X, W1) + b1)
L1 = tf.nn.dropout(L1, keep_prob=keep_prob)
W2 = tf.get_variable("W2_DDO", shape=[512, 512],
initializer=tf.contrib.layers.xavier_initializer())
b2 = tf.Variable(tf.random_normal([512]))
L2 = tf.nn.relu(tf.matmul(L1, W2) + b2)
L2 = tf.nn.dropout(L2, keep_prob=keep_prob)
W3 = tf.get_variable("W3_DDO", shape=[512, 512],
initializer=tf.contrib.layers.xavier_initializer())
b3 = tf.Variable(tf.random_normal([512]))
L3 = tf.nn.relu(tf.matmul(L2, W3) + b3)
L3 = tf.nn.dropout(L3, keep_prob=keep_prob)
W4 = tf.get_variable("W4_DDO", shape=[512, 512],
initializer=tf.contrib.layers.xavier_initializer())
b4 = tf.Variable(tf.random_normal([512]))
L4 = tf.nn.relu(tf.matmul(L3, W4) + b4)
L4 = tf.nn.dropout(L4, keep_prob=keep_prob)
W5 = tf.get_variable("W5_DDO", shape=[512, 10],
initializer=tf.contrib.layers.xavier_initializer())
b5 = tf.Variable(tf.random_normal([10]))
hypothesis = tf.matmul(L4, W5) + b5
# Prediction
prediction = tf.equal(tf.argmax(hypothesis, 1), tf.argmax(Y, 1))
# Cost function
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
logits=hypothesis, labels=Y))
# Accuracy
accuracy = tf.reduce_mean(tf.cast(prediction, tf.float32))
# Optimizer - Adam
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)
# Initialize
sess = tf.Session()
sess.run(tf.global_variables_initializer())
costs = []
accs = []
# Train
for epoch in range(epochs):
    avg_cost = 0
    total_batch = int(mnist.train.num_examples / batch_size)
    for i in range(total_batch):
        batch_xs, batch_ys = mnist.train.next_batch(batch_size)
        feed_dict = {X: batch_xs, Y: batch_ys, keep_prob: 0.7}
        c, _ = sess.run([cost, optimizer], feed_dict=feed_dict)
        avg_cost += c / total_batch
    costs.append(avg_cost)
    acc = sess.run(accuracy, feed_dict={
        X: mnist.test.images, Y: mnist.test.labels, keep_prob: 1})
    accs.append(acc)
# Print results
print("Final cost: {0}".format(costs[-1]))
print("Final accuracy: {0}".format(accs[-1]))
# Draw graph
plt.plot(range(epochs), costs)
plt.grid()
plt.xlabel("epoch")
plt.ylabel("cost")
plt.title("Costs")
plt.show()
plt.plot(range(epochs), accs)
plt.grid()
plt.xlabel("epoch")
plt.ylabel("accuracy")
plt.title("Accuracies")
plt.show()
Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz
Final cost: 0.04678738850369966
Final accuracy: 0.9828000068664551
Image 9. Cost of deep neural network MNIST training with Adam optimizer, Xavier initializer and dropout
Image 10. Accuracy of deep neural network MNIST training with Adam optimizer, Xavier initializer and dropout
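- Note that tf.nn.dropout implements "inverted dropout": the units that survive are scaled by 1 / keep_prob during training, so no extra rescaling is needed at test time (where keep_prob is fed as 1). A minimal standalone sketch of that scaling:
# Sketch: surviving units are scaled by 1/keep_prob
x = tf.ones([1, 10])
dropped = tf.nn.dropout(x, keep_prob=0.7)
with tf.Session() as s:
    print(s.run(dropped))               # kept entries are ~1.4286 (= 1/0.7), the rest 0.0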
Batch Normalization
- TBD
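- Until this section is written, here is a minimal sketch of the usual TF 1.x batch normalization pattern (tf.layers.batch_normalization with an explicit training flag plus the UPDATE_OPS dependency). It is only an outline, assuming the placeholders and variables from the earlier examples, and is not part of the experiments above.
# Sketch: batch normalization on one hidden layer (TF 1.x pattern)
# Assumes X, W1, b1, cost and learning_rate from the examples above
is_training = tf.placeholder(tf.bool)

h1 = tf.matmul(X, W1) + b1
h1 = tf.layers.batch_normalization(h1, training=is_training)
h1 = tf.nn.relu(h1)

# The ops that update the moving mean/variance are collected in UPDATE_OPS,
# so the training op must depend on them
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)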