Imports

Code

from tensorflow.keras.datasets import mnist
import numpy as np
import pandas as pd
from matplotlib.pyplot import plot as plt
import sys

Standardization

Code

def standardize(x):
    """Rescale ``x`` to zero mean and unit standard deviation.

    No axis is passed to ``np.mean`` / ``np.std``, so for a NumPy array a
    single global mean and standard deviation are applied to every element.

    Args:
        x: Numeric array supporting broadcasting arithmetic.

    Returns:
        ``(x - mean) / std``. If the standard deviation is exactly zero
        (constant input) the centred values -- all zeros -- are returned
        instead of the NaNs a division by zero would produce.
    """
    centred = x - np.mean(x)
    spread = np.std(x)
    # Only a scalar spread can be tested safely; constant input has zero
    # spread and dividing by it would turn every entry into NaN.
    if np.ndim(spread) == 0 and spread == 0:
        return centred
    return centred / spread

Prediction

Code

def sigmoid(z):
    """Compute the logistic function ``1 / (1 + exp(-z))`` without overflow.

    The naive formula evaluates ``exp(-z)``, which overflows for large
    negative ``z``. Here only ``exp(-|z|)`` is evaluated, which always lies
    in ``(0, 1]``; the two algebraically equivalent forms below are selected
    by the sign of ``z``.

    Args:
        z: Scalar or array-like of real numbers.

    Returns:
        The element-wise sigmoid of ``z`` as a NumPy scalar (scalar input)
        or NumPy array (array input).
    """
    z = np.asarray(z)
    decay = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + e^-z);  z < 0: e^z / (1 + e^z) -- same function.
    out = np.where(z >= 0, 1 / (1 + decay), decay / (1 + decay))
    # np.where yields a 0-d array for scalar input; unwrap it to a scalar.
    return out[()] if out.ndim == 0 else out

Code

def predict(X, W, B):
    """Return the model output ``sigmoid(X . W + B)``.

    Args:
        X: Input features.
        W: Weights, combined with ``X`` through ``np.dot``.
        B: Bias added to the product before the sigmoid.

    Returns:
        The sigmoid activations.
    """
    logits = np.dot(X, W) + B
    return sigmoid(logits)

Cost Function

Code

def cost(y, y_hat):
    """Return the mean binary cross-entropy between ``y`` and ``y_hat``.

    The smallest positive normal float is added inside each logarithm so
    that ``log(0)`` is never evaluated.

    Args:
        y: Target values.
        y_hat: Predicted probabilities, broadcastable against ``y``.

    Returns:
        The mean of the element-wise cross-entropy terms.
    """
    tiny = sys.float_info.min
    positive_term = y * np.log(y_hat + tiny)
    negative_term = (1 - y) * np.log(1 - y_hat + tiny)
    return np.mean(-positive_term - negative_term)

Train

Code

def train(X, y, learning_rate, epochs):
    """Fit the sigmoid model with full-batch gradient descent.

    Args:
        X: Feature matrix with one row per sample.
        y: Targets, either 1-D (one value per sample) or 2-D (one column
            per output, e.g. one-hot labels).
        learning_rate: Step size applied to each gradient.
        epochs: Number of full passes over ``X``.

    Returns:
        A tuple ``(W, B, costs)``: the weights (shape ``(X.shape[1],)`` for
        1-D ``y``, ``(X.shape[1], y.shape[1])`` for 2-D ``y``), the scalar
        bias, and a float array holding the cost recorded every 100th epoch.
    """
    # Work on plain arrays so the arithmetic and the np.mean reduction below
    # always produce ndarray / scalar results, even if pandas objects are
    # passed in.
    X = np.asarray(X)
    y = np.asarray(y)

    # One weight per feature for 1-D targets, one column per output otherwise.
    W = np.random.random((X.shape[1],) + y.shape[1:])
    # NOTE(review): the bias is a single scalar shared by every output
    # column; a per-output bias vector may be what is wanted -- confirm.
    B = np.random.random()

    history = []
    n_samples = len(X)
    for i in range(epochs):
        y_hat = predict(X, W, B)
        error = y_hat - y
        dW = np.dot(X.T, error) / n_samples
        db = np.mean(error)
        W -= learning_rate * dW
        B -= learning_rate * db
        if i % 100 == 0:
            # Cost of the predictions made before this epoch's update.
            costValue = cost(y, y_hat)
            history.append(costValue)
            print(f"Cost at epoch {i}: {costValue}")
    return W, B, np.array(history, dtype=float)

Train with L1 Regularization

Code

def train_L1(X, y, learning_rate, epochs, Lambda):
    """Full-batch gradient descent with an L1 penalty on the weights.

    Args:
        X: Feature matrix with one row per sample.
        y: Targets matching the 1-D weight vector created here.
        learning_rate: Step size applied to each gradient.
        epochs: Number of full passes over ``X``.
        Lambda: Strength of the L1 term; ``Lambda * sign(W)`` is added to
            the weight gradient. The bias is not penalised.

    Returns:
        A tuple ``(W, B, costs)`` where ``costs`` holds the cost recorded
        every 100th epoch.
    """
    W = np.random.random(X.shape[1])
    B = np.random.random()
    costs = np.array([])
    n_samples = len(X)
    for epoch in range(epochs):
        y_hat = predict(X, W, B)
        residual = y_hat - y
        grad_w = np.dot(X.T, residual) / n_samples
        grad_b = np.mean(residual)
        # sign(W) is the subgradient of the L1 norm of W.
        W -= learning_rate * (grad_w + Lambda * np.sign(W))
        B -= learning_rate * grad_b
        if epoch % 100 != 0:
            continue
        costValue = cost(y, y_hat)
        costs = np.append(costs, costValue)
        print(f"Cost at epoch {epoch}: {costValue}")
    return W, B, costs

Mini-Batch Gradient Descent

Code

def train_mini_batch(X, y, learning_rate, epochs, batch_size):
    """Fit the sigmoid model with mini-batch gradient descent.

    Batches are consecutive slices of ``X`` / ``y`` of length ``batch_size``
    (the last one may be shorter); the data is not reshuffled between epochs.

    Args:
        X: Feature matrix with one row per sample.
        y: Targets matching the 1-D weight vector created here.
        learning_rate: Step size applied to each gradient.
        epochs: Number of full passes over ``X``.
        batch_size: Number of samples per parameter update.

    Returns:
        A tuple ``(W, B, costs)`` where ``costs`` holds one full-dataset
        cost per 100th epoch.
    """
    W = np.random.random(X.shape[1])
    B = np.random.random()
    costs = np.array([])
    for i in range(epochs):
        if i % 100 == 0:
            # Log once per epoch on the whole dataset, before this epoch's
            # updates. Comparing the full ``y`` with a single batch's
            # predictions would mismatch in shape whenever
            # batch_size < len(X).
            costValue = cost(y, predict(X, W, B))
            costs = np.append(costs, costValue)
            print(f"Cost at epoch {i}: {costValue}")
        for j in range(0, len(X), batch_size):
            X_batch = X[j : j + batch_size]
            y_batch = y[j : j + batch_size]
            y_hat = predict(X_batch, W, B)
            error = y_hat - y_batch
            dW = np.dot(X_batch.T, error) / len(X_batch)
            db = np.mean(error)
            W -= learning_rate * dW
            B -= learning_rate * db
    return W, B, costs

RMSProp

Code

def train_rms_prop(X, y, learning_rate, epochs, beta, epsilon=sys.float_info.min):
    """Full-batch training with RMSProp-scaled updates.

    A running average of the squared gradient is kept for the weights and
    for the bias; each step divides the gradient by the square root of that
    average plus ``epsilon``.

    Args:
        X: Feature matrix with one row per sample.
        y: Targets matching the 1-D weight vector created here.
        learning_rate: Base step size.
        epochs: Number of full passes over ``X``.
        beta: Decay factor of the squared-gradient running average.
        epsilon: Constant added to the denominator.

    Returns:
        A tuple ``(W, B, costs)`` where ``costs`` holds the cost recorded
        every 100th epoch.
    """
    n_features = X.shape[1]
    W = np.random.random(n_features)
    B = np.random.random()
    costs = np.array([])
    sq_avg_w = np.zeros(n_features)
    sq_avg_b = 0
    keep = 1 - beta
    n_samples = len(X)
    for epoch in range(epochs):
        y_hat = predict(X, W, B)
        residual = y_hat - y
        grad_w = np.dot(X.T, residual) / n_samples
        grad_b = np.mean(residual)
        sq_avg_w = beta * sq_avg_w + keep * grad_w**2
        sq_avg_b = beta * sq_avg_b + keep * grad_b**2
        W -= learning_rate * grad_w / (np.sqrt(sq_avg_w) + epsilon)
        B -= learning_rate * grad_b / (np.sqrt(sq_avg_b) + epsilon)
        if epoch % 100 != 0:
            continue
        costValue = cost(y, y_hat)
        costs = np.append(costs, costValue)
        print(f"Cost at epoch {epoch}: {costValue}")
    return W, B, costs

Adam

Code

def train_adam(X, y, learning_rate, epochs, beta1, beta2, epsilon=sys.float_info.min):
    """Full-batch training with the Adam update rule.

    Running averages of the gradient (first moment) and of the squared
    gradient (second moment) are kept for the weights and the bias, and
    both are bias-corrected by ``1 - beta ** t`` with ``t`` the 1-based
    epoch number before being used in the step.

    Args:
        X: Feature matrix with one row per sample.
        y: Targets matching the 1-D weight vector created here.
        learning_rate: Base step size.
        epochs: Number of full passes over ``X``.
        beta1: Decay factor of the first-moment average.
        beta2: Decay factor of the second-moment average.
        epsilon: Constant added to the denominator.

    Returns:
        A tuple ``(W, B, costs)`` where ``costs`` holds the cost recorded
        every 100th epoch.
    """
    n_features = X.shape[1]
    W = np.random.random(n_features)
    B = np.random.random()
    costs = np.array([])
    mom_w, mom_b = np.zeros(n_features), 0
    sq_w, sq_b = np.zeros(n_features), 0
    n_samples = len(X)
    for epoch in range(epochs):
        y_hat = predict(X, W, B)
        residual = y_hat - y
        grad_w = np.dot(X.T, residual) / n_samples
        grad_b = np.mean(residual)

        # Exponential moving averages of the gradient and its square.
        mom_w = beta1 * mom_w + (1 - beta1) * grad_w
        mom_b = beta1 * mom_b + (1 - beta1) * grad_b
        sq_w = beta2 * sq_w + (1 - beta2) * grad_w**2
        sq_b = beta2 * sq_b + (1 - beta2) * grad_b**2

        # Bias correction for the zero-initialised averages.
        step = epoch + 1
        first_scale = 1 - beta1**step
        second_scale = 1 - beta2**step
        mom_w_hat = mom_w / first_scale
        mom_b_hat = mom_b / first_scale
        sq_w_hat = sq_w / second_scale
        sq_b_hat = sq_b / second_scale

        W -= learning_rate * mom_w_hat / (np.sqrt(sq_w_hat) + epsilon)
        B -= learning_rate * mom_b_hat / (np.sqrt(sq_b_hat) + epsilon)
        if epoch % 100 != 0:
            continue
        costValue = cost(y, y_hat)
        costs = np.append(costs, costValue)
        print(f"Cost at epoch {epoch}: {costValue}")
    return W, B, costs

Test

Code

def test(X, y, W, b):
    """Score the model ``(W, b)`` on ``X`` against the targets ``y``.

    ``predict`` returns sigmoid activations, which practically never equal
    the 0/1 targets exactly, so they are turned into discrete predictions
    before being compared:

    * several output columns: the predicted class is the column with the
      largest activation, and 2-D (one-hot) targets are reduced to their
      class index the same way;
    * a single output: the activation is thresholded at 0.5.

    Args:
        X: Feature matrix with one row per sample.
        y: Targets -- one-hot rows or class indices for multi-output
            models, 0/1 values for single-output models.
        W: Trained weights.
        b: Trained bias.

    Returns:
        The value of ``accuracy`` for the discrete targets and predictions.
    """
    y_hat = np.asarray(predict(X, W, b))
    y_true = np.asarray(y)
    if y_hat.ndim == 2 and y_hat.shape[1] > 1:
        predicted = np.argmax(y_hat, axis=1)
        # Targets that are already 1-D class indices are used as they are.
        actual = np.argmax(y_true, axis=1) if y_true.ndim == 2 else y_true
    else:
        predicted = (y_hat >= 0.5).astype(int)
        actual = y_true
    return accuracy(actual, predicted)

Accuracy

Code

def accuracy(y, y_hat):
    """Return the mean of the element-wise equality ``y == y_hat``.

    The comparison is exact, so both arguments should already hold discrete
    labels; raw probabilities will almost never compare equal to 0/1 targets.

    Args:
        y: Expected labels.
        y_hat: Predicted labels, broadcastable against ``y``.

    Returns:
        The fraction of positions where the two inputs match.
    """
    matches = y == y_hat
    return np.mean(matches)

Loading the data

Code

# Load the MNIST splits; element [0] of each pair is used below as the
# inputs and element [1] as the labels.
trainSet, testSet = mnist.load_data()

Code

# Cast inputs to float32 and labels to int32.
xTrain = trainSet[0].astype("float32")
yTrain = trainSet[1].astype("int32")

xTest = testSet[0].astype("float32")
yTest = testSet[1].astype("int32")

# Flatten every sample into a single feature row.
xTrain = xTrain.reshape(xTrain.shape[0], -1)
xTest = xTest.reshape(xTest.shape[0], -1)

# Standardise both splits with the statistics of the TRAINING split only.
# Scaling the test split by its own mean/std would put it on a different
# scale from the data the model is fitted on.
train_mean = np.mean(xTrain)
train_std = np.std(xTrain)
xTrain = (xTrain - train_mean) / train_std
xTest = (xTest - train_mean) / train_std

# Shuffle each split, applying one permutation to inputs and labels alike
# so that rows stay aligned.
p1 = np.random.permutation(len(xTrain))
p2 = np.random.permutation(len(xTest))

xTrain = xTrain[p1]
yTrain = yTrain[p1]

xTest = xTest[p2]
yTest = yTest[p2]

Code

# Bare expression: shows the preprocessed training inputs when run as a
# notebook cell; it has no effect in a plain script.
xTrain

Code

# Bare expression: shows the training labels when run as a notebook cell;
# it has no effect in a plain script.
yTrain

Code

# One-hot encode the training labels (one indicator column per distinct
# label value) and store the indicators as int32.
yTrain = pd.get_dummies(yTrain).astype("int32")
yTrain

K-Fold Cross-Validation

Code

def k_fold_cross_validation(X, y, K, lr, epochs):
    """Estimate model accuracy with K-fold cross-validation.

    The data is cut into ``K`` consecutive folds of ``len(X) // K`` samples.
    Each fold is held out once for validation while ``train`` is fitted on
    the remaining samples. Samples beyond ``K * (len(X) // K)`` are never
    held out and therefore always take part in training.

    Args:
        X: Feature matrix with one row per sample.
        y: Targets aligned with ``X`` row by row.
        K: Number of folds; must satisfy ``2 <= K <= len(X)``.
        lr: Learning rate forwarded to ``train``.
        epochs: Number of epochs forwarded to ``train``.

    Returns:
        A tuple ``(mean_accuracy, Costs)`` where ``Costs`` is the cost
        history returned by ``train`` for the LAST fold only.

    Raises:
        ValueError: If ``K`` is outside ``2 .. len(X)``; such values would
            leave the training or validation split empty.
    """
    # Plain arrays give the training and validation targets the same type
    # (np.concatenate already turns the training part into an ndarray).
    X = np.asarray(X)
    y = np.asarray(y)
    if not 2 <= K <= len(X):
        raise ValueError(
            f"K must be between 2 and the number of samples ({len(X)}), got {K}"
        )

    fold_size = len(X) // K
    accuracies = []
    for i in range(K):
        start, stop = i * fold_size, (i + 1) * fold_size
        # Everything outside [start, stop) trains; the slice itself validates.
        X_train = np.concatenate([X[:start], X[stop:]])
        Y_train = np.concatenate([y[:start], y[stop:]])
        x_valid = X[start:stop]
        y_valid = y[start:stop]
        # Train the model
        print(f"\nTraining at Iteration {i + 1} of {K}")
        w, b, Costs = train(X_train, Y_train, lr, epochs)
        # Test the model
        print(f"\nTesting at Iteration {i + 1} of {K}")
        acc = test(x_valid, y_valid, w, b)
        print(f"Accuracy at Iteration {i + 1} of {K}: {acc}")
        accuracies.append(acc)
    return np.mean(accuracies), Costs

Running Code

Code

# Run K-fold cross-validation once per candidate learning rate and plot the
# cost history returned for each.
k = 10
iterations = 1000
LRs = [0.1, 0.01, 0.001, 0.0001]
for lr in LRs:
    print(f"K-Fold Cross Validation with {k} Folds and eta {lr}:")
    # `c` is the cost history returned alongside the mean accuracy.
    average_accuracy, c = k_fold_cross_validation(xTrain, yTrain, k, lr, iterations)
    print("\nAverage Accuracy: ", average_accuracy)
    # NOTE: the file imports `matplotlib.pyplot.plot` under the name `plt`,
    # so this call is `plot(...)` itself, not the pyplot module.
    plt(c, label=f"eta = {lr}")