import pandas as pd

# Load the raw dataset from the CSV file and print it for a quick visual check.
df = pd.read_csv('A1_Data_ILG.csv')
print(df)

   experience  skill  athleticism label
0           5      8            5  good
1           4      4            3  good
2           2      2            4   bad
3           0      2            3   bad
4           5      6            5  good
5           1      0            0   bad

# plot the data in A1_Data_ILG.csv

import pandas as pd
import matplotlib.pyplot as plt

# Load the dataset that is going to be visualised.
df = pd.read_csv('A1_Data_ILG.csv')

# One feature column per plot axis.
x1, x2, x3 = df['experience'], df['skill'], df['athleticism']

# The label column drives the marker colour: good -> green, bad -> red.
y = df['label'].replace('good', 'green').replace('bad', 'red')

# Create the figure and a 3D axes object to draw on.
fig = plt.figure()
ax = plt.axes(projection='3d')

# Draw every sample as a marker coloured by its label.
ax.scatter3D(x1, x2, x3, s=100, c=y)

# Title and axis captions.
ax.set_title('Simple 3D Dataset')
ax.set_xlabel('experience')
ax.set_ylabel('skill')
ax.set_zlabel('athleticism')

# Display the finished figure.
plt.show()

# Normalize the Data

# Reload the raw data from disk.
df = pd.read_csv('A1_Data_ILG.csv')

x1, x2, x3 = df['experience'], df['skill'], df['athleticism']
y = df['label']

# Min-max scale each feature column: (value - min) / (max - min).
# assign() returns a new frame, so df itself is left untouched and the
# label column is carried over unchanged.
df_norm = df.assign(
    experience=(x1 - x1.min()) / (x1.max() - x1.min()),
    skill=(x2 - x2.min()) / (x2.max() - x2.min()),
    athleticism=(x3 - x3.min()) / (x3.max() - x3.min()),
)

# Persist the scaled data next to the original file, without the index.
df_norm.to_csv('A1_Data_ILG_norm.csv', index=False)

print('Normalized Data: \n', df_norm)

Normalized Data: 
    experience  skill  athleticism label
0         1.0   1.00          1.0  good
1         0.8   0.50          0.6  good
2         0.4   0.25          0.8   bad
3         0.0   0.25          0.6   bad
4         1.0   0.75          1.0  good
5         0.2   0.00          0.0   bad

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

def read_data(path):
    """read in data from a csv file

    path is the location of the csv file; a string or any path-like
    object accepted by pandas.read_csv (e.g. pathlib.Path) works

    logging is controlled by the module-level debug setting:
    if debug >= 1:
        prints which file is being read
    if debug >= 2:
        also prints the top rows of the datafile

    returns a pandas dataframe of the csv
    """
    if debug >= 1:
        # an f-string formats str and path-like objects alike, whereas
        # concatenating with + raises TypeError for a pathlib.Path
        print(f'Reading in data from {path}...')
    df = pd.read_csv(path)
    if debug >= 2:
        print('Data loaded. Here is a sneak peak: \n', df.head())
    return df


def clean_data(df):
    """clean the data
    Takes in a dataframe df that has a column named 'LABEL' or 'label'
    ('LABEL' is used when both are present)
    Returns two numpy arrays x and y
    x is every other column, in the dataframe's own column order,
        passed through normalize()
    y is the label column passed through convert_labels() and reshaped
        to 2D with a single column
    Raises an Exception when no label column is present
    """
    if debug >= 1:
        print('Now cleaning the data.')
    if debug >= 2:
        print('The shape of the data is: ', df.shape)

    # find the label column; either spelling is accepted
    label_column = next(
        (name for name in ('LABEL', 'label') if name in df.columns), None
    )
    if label_column is None:
        raise Exception('There is no column with the title label in the data')

    # separate labels
    # np.array() copies, so converting the labels cannot write back into df
    y = np.array(df[label_column])
    # drop() keeps the remaining columns in their original order, whereas
    # columns.difference() returns them sorted alphabetically and would
    # silently change which feature each weight belongs to
    x = np.array(df.drop(columns=[label_column]))
    if debug >= 2:
        print('x is: \n', x)
    if debug >= 2:
        print('y is: \n', y)

    # convert labels to 0 or 1
    y = convert_labels(y)
    # convert shape from 1D array to 2D with 1 column
    y = y.reshape(-1, 1)
    if debug >= 2:
        print('New y is : \n', y)

    # normalize x
    x = normalize(x)
    if debug >= 2:
        print('New x is : \n', x)

    # return x and y
    return x, y


def convert_labels(y):
    """converts y labels
    accepts a numpy array of labels (normally holding two distinct values)
    every entry equal to the first entry becomes 0, all others become 1
    the input array is not modified
    an empty input gives an empty result
    returns a new integer numpy array of 0s and 1s with the same shape
    """
    if debug >= 1:
        print('Converting labels to 0,1...')
    labels = np.asarray(y)
    if labels.size == 0:
        return np.zeros(labels.shape, dtype=int)

    # Use the first entry as the reference class. Taking the classes from
    # set(y) made the 0/1 assignment depend on hash ordering, which can
    # change from run to run for string labels.
    reference = labels.flat[0]
    # A single vectorized comparison also avoids relabelling in two passes,
    # where the second pass could overwrite results of the first.
    return (labels != reference).astype(int)

def normalize(x):
    """normalizes x values
    accepts a 2D numpy array of numbers (rows are samples, columns are features)
    rescales every column so its minimum becomes 0 and its maximum becomes 1
    a column whose values are all equal is mapped to 0 instead of NaN
    an array with no rows is returned as an empty float array
    returns a new numpy array of floats with the same shape
    """
    if debug >= 1:
        print('Normalizing x values...')
    values = np.asarray(x, dtype=float)
    if values.shape[0] == 0:
        # min()/max() are undefined without rows; nothing to scale
        return np.empty(values.shape)

    column_min = values.min(axis=0)
    column_range = values.max(axis=0) - column_min
    # a constant column has range 0; divide by 1 there so the result is
    # 0 rather than 0/0
    safe_range = np.where(column_range == 0, 1.0, column_range)
    return (values - column_min) / safe_range

def logistic_regression(x, y, learning_rate=0.1, epochs=100):
    """run logistic regression on the input data
    x is a 2D numpy array of inputs, one row per sample
    y is a 2D numpy array of 0/1 labels with one column and as many rows as x
    learning_rate scales every gradient step
    epochs is the number of passes; epochs + 1 rows are logged because the
        initial weights (all ones) and bias (zero) are logged as row 0

    trains with batch gradient descent on the binary cross entropy loss
    if the module-level export flag is 1 the history is also written to
        A1_Weights_ILG.csv

    returns (weights_df, y_hat)
    weights_df has columns w1..wn, b and lce_avg; each row holds the
        weights and bias at the start of an epoch and the average loss
        they produce
    y_hat is the column of predicted probabilities for the last logged row

    raises an Exception when the shape of y does not match x
    """
    if debug >= 1:
        print('Running logistic regression...')

    # inspect shape of data
    xrows = x.shape[0]
    xcols = x.shape[1]
    yrows = y.shape[0]
    ycols = y.shape[1]

    # validate y is N x 1 dimensions
    if yrows != xrows:
        raise Exception('The size of the label vector does not match the input data')
    if ycols != 1:
        raise Exception('The label vector should only have 1 column')

    # do the maths in floating point; labels may arrive as an object array
    x = x.astype(float)
    y = y.astype(float)

    # column names of the history table
    cols = ['w' + str(i + 1) for i in range(xcols)] + ['b', 'lce_avg']
    # Rows are collected in a list and turned into a dataframe once at the
    # end: growing a dataframe row by row is slow, and chained assignment
    # such as df['lce_avg'][e] = value is not guaranteed to write through.
    history = []

    # initialize the weights and the bias
    w = np.ones((1, xcols))
    b = 0
    if debug >= 2:
        print('The initial weights are: \n', w)
    if debug >= 2:
        print('The initial bias is: ', b)

    # smallest offset that keeps probabilities strictly inside (0, 1)
    eps = np.finfo(float).eps

    for e in range(epochs + 1):

        # compute z = x * w^T + b
        z = x @ w.T + b
        if debug >= 2:
            print('The z vector is: \n', z)

        # sigmoid; z is clipped so np.exp cannot overflow
        y_hat = 1 / (1 + np.exp(-np.clip(z, -500, 500)))
        if debug >= 2:
            print('The y_hat is: \n', y_hat)

        # calculate the loss with binary cross entropy
        # a saturated sigmoid returns exactly 0 or 1 and log(0) would turn
        # the loss into inf/nan, so the logs use clipped probabilities
        p = np.clip(y_hat, eps, 1 - eps)
        lce = (- y * np.log(p) - (1 - y) * np.log(1 - p))
        lce_avg = 1/xrows * np.sum(lce)
        if debug >= 2:
            print('The loss is \n:', lce_avg)

        # log current weights, bias and the loss they produce
        history.append([*w[0], b, lce_avg])
        if debug >= 2:
            print('The weights_df is now:\n', pd.DataFrame(history, columns=cols))

        # calculate the gradient for w
        dlce_dw = 1/xrows * (y_hat - y).T @ x
        if debug >= 2:
            print('The weight gradient is: ', dlce_dw)

        # update the weights
        dw = dlce_dw * learning_rate
        w = w - dw
        if debug >= 2:
            print('The new weights are: ', w)

        # calculate the gradient for b
        dlce_db = 1/xrows * np.sum(y_hat - y)
        if debug >= 2:
            print('The bias gradient is: ', dlce_db)

        # update the bias
        db = dlce_db * learning_rate
        b = b - db
        if debug >= 2:
            print('The new bias is :', b)

        # repeat for e epochs
    weights_df = pd.DataFrame(history, columns=cols)
    if export == 1:
        weights_df.to_csv('A1_Weights_ILG.csv', index=False)
    return weights_df, y_hat

def plot_results(weights_df, y_hat, y):
    """plot the results of the logistic regression
    weights_df is a dataframe with one row per epoch; it must contain a
        'lce_avg' column and that column must be the last one, because
        every column before the last is drawn as a weight/bias curve
    y_hat is a 2D numpy array of predicted probabilities (column 0 is used)
    y is a 2D numpy array of 0/1 labels (column 0 is used)

    shows three figures one after another:
        the loss over the epochs
        the weights and bias over the epochs
        the 2x2 confusion matrix of the rounded predictions vs the labels
    returns nothing
    """
    # plot the error over each epoch from the index column
    if debug >= 1:
        print('Plotting the error over time...')
    _, ax = plt.subplots()
    ax.plot(weights_df.index, weights_df['lce_avg'])
    ax.set_xlabel('Epochs')
    ax.set_ylabel('Loss')
    ax.set_title('Loss over time')
    plt.show()

    # plot the weights and bias over time
    if debug >= 1:
        print('Plotting the weights over time...')
    parameter_columns = weights_df.columns[:-1]
    _, ax = plt.subplots()
    for column in parameter_columns:
        ax.plot(weights_df.index, weights_df[column])
    ax.set_xlabel('Epochs')
    ax.set_ylabel('Weights')
    ax.set_title('Weights over time')
    # add legend
    ax.legend(parameter_columns)
    plt.show()

    # plot a confusion matrix of y_hat vs y
    if debug >= 1:
        print('Plotting the confusion matrix...')
    # round the probabilities to hard 0/1 predictions
    predicted = y_hat.round().astype(int)
    actual = y.astype(int)
    confusion_matrix = pd.crosstab(predicted[:, 0], actual[:, 0], rownames=['Predicted'], colnames=['Actual'])
    # crosstab only creates rows/columns for classes that actually occur;
    # force the full 2x2 layout so the cell loop below cannot run out of
    # bounds when every prediction (or label) falls into one class
    classes = [0, 1]
    confusion_matrix = confusion_matrix.reindex(index=classes, columns=classes, fill_value=0)
    # create a plot of the confusion matrix with a 2x2 grid with colors
    _, ax = plt.subplots()
    ax.matshow(confusion_matrix, cmap=plt.cm.Blues)
    # add labels
    ax.set_xlabel('Actual')
    ax.set_ylabel('Predicted')
    # add numbers
    for i in range(len(classes)):
        for j in range(len(classes)):
            ax.text(j, i, confusion_matrix.iloc[i, j], ha='center', va='center', color='green')
    plt.show()

    # print the weights
    if debug >= 2:
        print('Full weight evolution: \n', weights_df)
    return

def main():
    """entry point of the file
    Runs the whole pipeline in order:
    read the csv, clean it, train the logistic regression, plot the results
    """
    # training settings
    learning_rate = 25
    n_epochs = 20

    raw_data = read_data('A1_Data_ILG.csv')
    x, y = clean_data(raw_data)
    weights_df, y_hat = logistic_regression(x, y, learning_rate, n_epochs)
    plot_results(weights_df, y_hat, y)

# set debug level (read as a global by the functions above)
# 0 = print nothing
# 1 = print important statements
# 2 = print all statements
debug = 1
# export flag: when set to 1, logistic_regression writes the per-epoch
# weights, bias and loss to A1_Weights_ILG.csv
export = 1

# call of the code above!
# both settings must be assigned before this call, because the functions
# look them up as module-level names while they run
main()

Reading in data from A1_Data_ILG.csv...
Now cleaning the data.
Converting labels to 0,1...
Normalizing x values...
Running logistic regression...
Plotting the error over time...

Plotting the weights over time...

Plotting the confusion matrix...

# export to HTML for webpage
import os

# variant without the theme override:
# os.system('jupyter nbconvert --to html mod1.ipynb')
# Runs jupyter nbconvert through the shell to render mod1.ipynb as HTML
# with the dark theme; the value returned by os.system reflects the
# command's exit status.
os.system('jupyter nbconvert --to html mod1.ipynb --HTMLExporter.theme=dark')

[NbConvertApp] Converting notebook mod1.ipynb to html
[NbConvertApp] Writing 795339 bytes to mod1.html

0

Logistic Regression Optimization

1. Dataset

2. General Linear Equation

3. Plot the Data

4. Visual Grouping

5. Sigmoid Function

6. Run the First Epoch

7. Loss Function

8. Try Again with New Weights

9. Calculate the Gradient

10. Run the Second Epoch with Gradient Descent

11. Hand Code a Logistic Regression Model

Conclusion