import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

data = pd.read_csv('data/adult.csv') # read in the file
data.head() # look at the first few rows of the dataframe

data.info() # look at the various columns and their data types

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education.num   32561 non-null  int64 
 5   marital.status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital.gain    32561 non-null  int64 
 11  capital.loss    32561 non-null  int64 
 12  hours.per.week  32561 non-null  int64 
 13  native.country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB

# look for null values and duplicate rows
print("Null values by column:\n", data.isnull().sum())
print("Number of duplicate rows: ", data.duplicated().sum())

Null values by column:
 age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
income            0
dtype: int64
Number of duplicate rows:  24

# clean the data and reinspect
data.drop_duplicates(inplace=True) # drop the duplicate rows
data.replace('?', np.nan, inplace=True) # replace any values with a ? with "NaN" or "Not a Number"
data.dropna(inplace=True) # drop any rows that have NA values
data.info() # inspect data again

<class 'pandas.core.frame.DataFrame'>
Index: 30139 entries, 1 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             30139 non-null  int64 
 1   workclass       30139 non-null  object
 2   fnlwgt          30139 non-null  int64 
 3   education       30139 non-null  object
 4   education.num   30139 non-null  int64 
 5   marital.status  30139 non-null  object
 6   occupation      30139 non-null  object
 7   relationship    30139 non-null  object
 8   race            30139 non-null  object
 9   sex             30139 non-null  object
 10  capital.gain    30139 non-null  int64 
 11  capital.loss    30139 non-null  int64 
 12  hours.per.week  30139 non-null  int64 
 13  native.country  30139 non-null  object
 14  income          30139 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# one hot encode the categorical features
categorical_features = ['workclass', 'education', 'marital.status', 'occupation', 
                        'relationship', 'race', 'sex', 'native.country', 'income']

for feature in categorical_features:
    le = LabelEncoder()
    data[feature] = le.fit_transform(data[feature])

# extract the label column
X = data.drop('income', axis=1)
y = data['income']

# scale the numeric features to each have a mean of 0, std dev of 1
continuous_features = ['age', 'fnlwgt', 'education.num', 'capital.gain', 'capital.loss', 'hours.per.week']
scaler = StandardScaler()
X[continuous_features] = scaler.fit_transform(X[continuous_features])

# split the data into train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim

# convert test and training data to a tensor
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)

# create a TensorDataset within Pytorch
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# wrap the Dataset in a DataLoader to be iterable
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

Using mps device

class NeuralNetwork(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 2),
            nn.ReLU(),
        )

    def forward(self, x):
        x = self.linear_relu_stack(x)
        return x

# figure out the width of the input tensor
print('The shape of the X_train_tensor is:', X_train_tensor.shape)
# let's use the second value, the # of columns
input_dim = X_train_tensor.shape[1]

# instantiate the model
model = NeuralNetwork(input_dim)

The shape of the X_train_tensor is: torch.Size([24111, 14])

# define a loss function
criterion = nn.CrossEntropyLoss() # a go to for classication problems
learning_rate = 0.001 # a standard starting point, use factors of 10
optimizer = optim.Adam(model.parameters(), lr=learning_rate) # Adam Optimizer: https://arxiv.org/abs/1412.6980

# Set the number of times to iterate over the entire training dataset
num_epochs = 30

# Start the training loop, iterating through the dataset `num_epochs` times
for epoch in range(num_epochs):  # Loop over each epoch
    model.train()  # Put the model into training mode (enables features like dropout)
    running_loss = 0.0  # Initialize a variable to keep track of cumulative loss for the epoch

    # Loop through each batch of data in the training dataset
    for inputs, labels in train_loader:  # `inputs` are the features, `labels` are the targets
        optimizer.zero_grad()  # Clear the gradients from the previous step
        
        outputs = model(inputs)  # Perform a forward pass through the model to get predictions
        loss = criterion(outputs, labels)  # Compute the loss between predictions and actual labels
        loss.backward()  # Perform backpropagation to calculate gradients of loss with respect to parameters
        optimizer.step()  # Update model parameters based on the gradients
        
        running_loss += loss.item()  # Accumulate the loss for this batch
    
    # Calculate the average loss for this epoch
    avg_loss = running_loss / len(train_loader)  # Divide total loss by the number of batches
    # Print progress, showing the current epoch and average loss for the epoch
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}')

Epoch [1/30], Loss: 0.4452
Epoch [2/30], Loss: 0.3727
Epoch [3/30], Loss: 0.3559
Epoch [4/30], Loss: 0.3429
Epoch [5/30], Loss: 0.3401
Epoch [6/30], Loss: 0.3345
Epoch [7/30], Loss: 0.3363
Epoch [8/30], Loss: 0.3340
Epoch [9/30], Loss: 0.3323
Epoch [10/30], Loss: 0.3305
Epoch [11/30], Loss: 0.3302
Epoch [12/30], Loss: 0.3278
Epoch [13/30], Loss: 0.3257
Epoch [14/30], Loss: 0.3263
Epoch [15/30], Loss: 0.3248
Epoch [16/30], Loss: 0.3233
Epoch [17/30], Loss: 0.3224
Epoch [18/30], Loss: 0.3201
Epoch [19/30], Loss: 0.3210
Epoch [20/30], Loss: 0.3180
Epoch [21/30], Loss: 0.3163
Epoch [22/30], Loss: 0.3166
Epoch [23/30], Loss: 0.3149
Epoch [24/30], Loss: 0.3134
Epoch [25/30], Loss: 0.3131
Epoch [26/30], Loss: 0.3109
Epoch [27/30], Loss: 0.3128
Epoch [28/30], Loss: 0.3107
Epoch [29/30], Loss: 0.3086
Epoch [30/30], Loss: 0.3082

model.eval()  # Put the model in evaluation mode (disables features like dropout and gradient tracking)
correct = 0  # Initialize a counter for correctly classified samples
total = 0  # Initialize a counter for the total number of samples

with torch.no_grad():  # Disable gradient calculation for efficiency and to save memory
    for inputs, labels in test_loader:  # Loop through each batch in the test dataset
        outputs = model(inputs)  # Perform a forward pass through the model to get predictions
        _, predicted = torch.max(outputs.data, 1)  # Get the class with the highest probability for each sample
        total += labels.size(0)  # Update the total count with the number of samples in this batch
        correct += (predicted == labels).sum().item()  # Increment the correct count for accurate predictions

accuracy = 100 * correct / total  # Calculate accuracy as a percentage
print(f'Accuracy on test data: {accuracy:.2f}%')  # Display the accuracy of the model on the test data

Accuracy on test data: 85.00%

torch.save(model.state_dict(), "models/income.pth")
print("Saved PyTorch Model State to models/income.pth")

Saved PyTorch Model State to model.pth

# The process for loading a model includes re-creating the model structure and loading the state dictionary into it
model = NeuralNetwork(input_dim)
model.load_state_dict(torch.load("models/income.pth", weights_only=True))
model = model.to(device) # move from cpu to gpu if available

# This model can now be used to make predictions.
classes = ["Over $50k", "Under $50k"] # is this correct, or should it be switched?
row_index_to_test = 2

# evaluate the model on this one item from the dataset
model.eval()
x, y = test_dataset[row_index_to_test][0], test_dataset[row_index_to_test][1]
with torch.no_grad():
    x = x.to(device)
    pred = model(x)
    predicted, actual = classes[pred[0].argmax(0)], classes[y]
    print(f'Predicted: "{predicted}", Actual: "{actual}"')

Predicted: "Over $50k", Actual: "Over $50k"

# supress warnings
import warnings
warnings.filterwarnings("ignore")

# export to HTML for webpage
import os
os.system('jupyter nbconvert --to html income-mlp.ipynb --HTMLExporter.theme=dark')

[NbConvertApp] Converting notebook income-mlp.ipynb to html
[NbConvertApp] Writing 334100 bytes to income-mlp.html

0

	age	workclass	fnlwgt	education	education.num	marital.status	occupation	relationship	race	sex	capital.loss	hours.per.week	native.country	income
0	90	?	77053	HS-grad	9	Widowed	?	Not-in-family	White	Female	4356	40	United-States	<=50K
1	82	Private	132870	HS-grad	9	Widowed	Exec-managerial	Not-in-family	White	Female	4356	18	United-States	<=50K
2	66	?	186061	Some-college	10	Widowed	?	Unmarried	Black	Female	4356	40	United-States	<=50K
3	54	Private	140359	7th-8th	4	Divorced	Machine-op-inspct	Unmarried	White	Female	3900	40	United-States	<=50K
4	41	Private	264663	Some-college	10	Separated	Prof-specialty	Own-child	White	Female	3900	40	United-States	<=50K

Introduction to MLPs using US Census Income Data¶

Income Predictor Dataset - US Adult¶

Environment Setup¶

Data Exploration & Cleaning¶

PyTorch & Tensors¶

Datasets & Dataloaders¶

Get Device for Training¶

Build a Machine Learning Model¶

Train a Neural Network¶

Model Evaluation¶

Saving & Loading our Model¶

Assignment¶

Bonus Challenge #1¶

Bonus Challenge #2¶

Bonus Challenge #3¶

Export Notebook to HTML¶