Code
from sklearn.utils import shuffle
from sklearn.naive_bayes import GaussianNB
from sklearn import datasets
from mlxtend.plotting import plot_decision_regions
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
[Figure: Gaussian Naive Bayes decision regions for the three Iris classes, Setosa, Versicolor, and Virginica (accuracy ~93%)]
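The plot above can be recreated with the imports already listed. The snippet below is only a minimal sketch, not the article's exact script: it assumes we fit GaussianNB on just two of the four Iris features (sepal length and petal length) so that plot_decision_regions can draw a 2-D picture of the class boundaries for Setosa, Versicolor, and Virginica.

# minimal sketch (assumed, not the article's exact script)
iris = datasets.load_iris()
# keep only sepal length and petal length so the regions can be drawn in 2-D
X, y = shuffle(iris.data[:, [0, 2]], iris.target, random_state=0)

model = GaussianNB()
model.fit(X, y)

plot_decision_regions(X, y, clf=model, legend=2)
plt.xlabel("sepal length (cm)")
plt.ylabel("petal length (cm)")
plt.title("Gaussian Naive Bayes decision regions on the Iris dataset")
plt.show()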
def split(features, label, valRatio, testRatio):
    # getting the sample sizes
    testSample = int(len(label) * testRatio)
    validationSample = int(len(label) * valRatio + testSample)
    # splitting the data into 3 samples
    x_test, y_test = features[:testSample], label[:testSample]
    x_validation, y_validation = (
        features[testSample:validationSample],
        label[testSample:validationSample],
    )
    x_train, y_train = features[validationSample:], label[validationSample:]
    # returning the samples
    return x_train, y_train, x_test, y_test, x_validation, y_validation
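As a quick sanity check, with the 150-row Iris dataset and the default ratios used below (testRatio=0.3, valRatio=0.3), split puts the first 45 rows in the test set, the next 45 in the validation set, and the remaining 60 in the training set. The call below is illustrative only; it assumes the Iris data has been shuffled first, since load_iris returns the rows grouped by class and split slices them contiguously.

# illustrative usage on shuffled Iris arrays (assumed, not the article's exact code)
iris = datasets.load_iris()
features, labels = shuffle(iris.data, iris.target, random_state=0)
x_train, y_train, x_test, y_test, x_validation, y_validation = split(
    features, labels, valRatio=0.3, testRatio=0.3
)
print(len(y_test), len(y_validation), len(y_train))  # -> 45 45 60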
def train_validate_test_split(features, labels, testRatio=0.3, valRatio=0.3):
    # first, we split the data into 3 sets:
    # 1) a training set to fit the model (x_train, y_train)
    # 2) a validation set to check and tune the model's accuracy (x_validation, y_validation)
    # 3) a test set on which the model predicts labels for unseen data (x_test, y_test)
    x_train, y_train, x_test, y_test, x_validation, y_validation = split(
        features, labels, valRatio, testRatio
    )
    # secondly, we train the model on the training samples
    train(x_train, y_train)
    # thirdly, we validate the model on the validation samples
    validate(x_validation, y_validation)
    # then we let the model predict the labels of the unseen test samples
    predicted_values = test(x_test)
    # finally, we calculate the model's accuracy
    calc_accuracy(predicted_values, y_test)
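train_validate_test_split relies on four helpers (train, validate, test, and calc_accuracy) that are not shown in this section. The block below is only one plausible sketch of them, assuming a single module-level GaussianNB model; it is not the article's actual implementation, but it shows how the four steps fit together and lets the pipeline run end to end.

# assumed helpers (hypothetical sketch, not the article's originals)
model = GaussianNB()

def train(x_train, y_train):
    # fit the Gaussian Naive Bayes model on the training samples
    model.fit(x_train, y_train)

def validate(x_validation, y_validation):
    # check the fitted model's accuracy on the held-out validation samples
    print("validation accuracy:", model.score(x_validation, y_validation))

def test(x_test):
    # predict labels for the unseen test samples
    return model.predict(x_test)

def calc_accuracy(predicted_values, y_test):
    # fraction of test samples whose predicted label matches the true label
    accuracy = np.mean(predicted_values == y_test)
    print("test accuracy:", accuracy)
    return accuracy

# example run on the shuffled Iris data
iris = datasets.load_iris()
features, labels = shuffle(iris.data, iris.target, random_state=0)
train_validate_test_split(features, labels, testRatio=0.3, valRatio=0.3)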