Import Necessary Libraries¶
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# machine learning
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, LSTM, SimpleRNN
# for reading stock data from yahoo'
from pandas_datareader.data import DataReader
import yfinance as yf
from pandas_datareader import data as pdr
# misc
from datetime import datetime
Pull in the Stock Data from Yahoo Finance¶
yf.pdr_override() # not sure what this does
stock = 'SCHK'
end = datetime.now()
print('End date: ', end)
start = datetime(end.year - 6, end.month, end.day)
print('Start date: ', start)
schk = yf.download(stock, start, end)
schk.tail()
End date: 2023-11-24 17:55:10.354131 Start date: 2017-11-24 00:00:00 [*********************100%***********************] 1 of 1 completed
| Open | High | Low | Close | Adj Close | Volume | |
|---|---|---|---|---|---|---|
| Date | ||||||
| 2023-11-17 | 43.380001 | 43.480000 | 43.299999 | 43.419998 | 43.419998 | 160800 |
| 2023-11-20 | 43.369999 | 43.855000 | 43.369999 | 43.779999 | 43.779999 | 179700 |
| 2023-11-21 | 43.669998 | 43.700001 | 43.549999 | 43.660000 | 43.660000 | 198100 |
| 2023-11-22 | 43.810001 | 43.930000 | 43.740002 | 43.860001 | 43.860001 | 165300 |
| 2023-11-24 | 43.810001 | 43.900002 | 43.810001 | 43.900002 | 43.900002 | 56867 |
Descriptive Statistics on the Data¶
# describe the data
print(schk.describe())
# note that weekends are not included, thus 255 days per year roughly
print(schk.info())
Open High Low Close Adj Close \
count 1510.000000 1510.000000 1510.000000 1510.000000 1510.000000
mean 34.789909 34.980387 34.560558 34.780530 33.383337
std 6.629948 6.653950 6.599332 6.629106 7.153823
min 22.240000 22.340000 21.299999 21.700001 20.476364
25% 28.242500 28.402500 28.122500 28.260000 26.230622
50% 35.295000 35.570000 34.957500 35.219999 34.089796
75% 40.877501 41.207500 40.557501 40.822501 39.952845
max 47.000000 47.025002 46.700001 46.849998 45.638546
Volume
count 1.510000e+03
mean 2.113612e+05
std 2.168370e+05
min 2.960000e+04
25% 1.121250e+05
50% 1.619000e+05
75% 2.443750e+05
max 4.797900e+06
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1510 entries, 2017-11-24 to 2023-11-24
Data columns (total 6 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Open 1510 non-null float64
1 High 1510 non-null float64
2 Low 1510 non-null float64
3 Close 1510 non-null float64
4 Adj Close 1510 non-null float64
5 Volume 1510 non-null int64
dtypes: float64(5), int64(1)
memory usage: 82.6 KB
None
# visualize the closing price overtime
plt.figure(figsize=(16,8))
plt.title('SCHK Close Price History')
plt.plot(schk['Close'])
plt.xlabel('Date', fontsize=18)
plt.ylabel('Close Price USD ($)', fontsize=18)
plt.grid(b=True, which='major', color='#666666', linestyle='-')
plt.show()
/var/folders/1p/wbwsvy8x73v0fxb0dfm9pj446vjgl7/T/ipykernel_29499/1599654624.py:7: MatplotlibDeprecationWarning: The 'b' parameter of grid() has been renamed 'visible' since Matplotlib 3.5; support for the old name will be dropped two minor releases later. plt.grid(b=True, which='major', color='#666666', linestyle='-')
# plot sale volume over time
plt.figure(figsize=(16,8))
plt.title('SCHK Sale Volume History')
plt.plot(schk['Volume'])
plt.xlabel('Date', fontsize=18)
plt.ylabel('Sale Volume (Millions of Shares)', fontsize=18)
plt.grid(b=True, which='major', color='#666666', linestyle='-')
plt.show()
/var/folders/1p/wbwsvy8x73v0fxb0dfm9pj446vjgl7/T/ipykernel_29499/4262586195.py:7: MatplotlibDeprecationWarning: The 'b' parameter of grid() has been renamed 'visible' since Matplotlib 3.5; support for the old name will be dropped two minor releases later. plt.grid(b=True, which='major', color='#666666', linestyle='-')
Prepare the Training Data for the RNN and LSTM¶
# filter out just the closing price
df = schk.filter(['Close'])
dataset = df.values
# get the number of rows to train the model on
training_data_len = int(len(dataset) * 0.9)
print('The length of the full dataset is: ', len(dataset))
print('The length of the training dataset is: ', training_data_len)
The length of the full dataset is: 1510 The length of the training dataset is: 1359
# scale the data
max = np.max(dataset)
min = np.min(dataset)
print('The max value is: ', max)
print('The min value is: ', min)
# scale the data
dataset_scaled = (dataset - min) / (max - min)
print(dataset_scaled)
The max value is: 46.849998474121094 The min value is: 21.700000762939453 [[0.15491055] [0.15427433] [0.16421469] ... [0.87316108] [0.88111339] [0.88270389]]
# create the training dataset
# we need a bunch of sequences of length 90 (90 days) for x_train and
# the next day for y_train
# create the scaled training dataset
dataset_train = dataset_scaled[0:training_data_len]
print('The shape of dataset_train is: ', dataset_train.shape)
# create placeholders for the x_train and y_train
x_train = []
y_train = []
# populate the x_train and y_train
# loop over the data in dataset_train from day 0 to end day - 90
# append the data from day 0 to day 89 to x_train
# append the data from day 90 to y_train
for i in range(0, len(dataset_train) - 90):
x_train.append(dataset_train[i:i+90]) # exclusive of day 90
y_train.append(dataset_train[i+90])
# inspect the x_train and y_train to make sure they are correct
# print('x_train is :\n', x_train[0])
# print('y_train is :\n', y_train[0])
# convert the x_train and y_train to numpy arrays
x_train, y_train = np.array(x_train), np.array(y_train)
print('The shape of x_train is: ', x_train.shape)
print('The shape of y_train is: ', y_train.shape)
The shape of dataset_train is: (1359, 1) The shape of x_train is: (1269, 90, 1) The shape of y_train is: (1269, 1)
Prepare the Testing Data for the LSTM and RNN¶
# use the last 20% of the data as the test data
# include the last 90 days of the training data in the test data
dataset_test = dataset_scaled[training_data_len - 90:]
print('The shape of dataset_test is: ', dataset_test.shape)
# create the x_test and y_test
x_test = []
y_test = dataset[training_data_len:] # just the last 20% of the data, unscaled values
# populate the x_test, starting 90s before the end of the training data
for i in range (0, len(dataset_test)-90):
x_test.append(dataset_test[i:i+90])
# convert the x_test to a numpy array
x_test = np.array(x_test)
y_test = np.array(y_test)
print('The shape of x_test is: ', x_test.shape)
print('The shape of y_test is: ', y_test.shape)
# create scaled versions of the x_test and y_test called x_valid and y_valid
x_valid = (x_test - min) / (max - min)
y_valid = (y_test - min) / (max - min)
# unscaled data for analysis
train = df[:training_data_len]
test = df[training_data_len:]
The shape of dataset_test is: (241, 1) The shape of x_test is: (151, 90, 1) The shape of y_test is: (151, 1)
Baseline Benchmarks to Beat¶
Let's look at how we would do if we just used the average value of the stock or the previous day's close to predict the next day's close.
# compare with using the mean value from the training data
train_mean = np.mean(train['Close'])
# compute rmse for y_test using the mean value from the training data
rmse_mean = np.sqrt(np.mean(train_mean - y_test)**2)
print('The RMSE is when using the mean value from the training data: ', rmse_mean)
The RMSE is when using the mean value from the training data: 7.9133179196546495
# Compare with using the previous day's closing price
test['MSE_to_Prev_Close'] = test['Close'].diff()**2
test['Diff_to_Prev_Close'] = test['Close'].diff()
# compute rmse for y_test using the previous day's closing price
rmse_previous = np.sqrt(np.mean(test['MSE_to_Prev_Close']))
print('The RMSE is when using the previous day\'s closing price: ', rmse_previous)
print('The mean difference from the previous day\'s closing price is: ', np.mean(test['Diff_to_Prev_Close'].abs()))
# hide warnings in the cell output
import warnings
warnings.filterwarnings('ignore')
The RMSE is when using the previous day's closing price: 0.3320853754026828 The mean difference from the previous day's closing price is: 0.2650000762939453
/var/folders/1p/wbwsvy8x73v0fxb0dfm9pj446vjgl7/T/ipykernel_29499/2967036786.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy test['MSE_to_Prev_Close'] = test['Close'].diff()**2 /var/folders/1p/wbwsvy8x73v0fxb0dfm9pj446vjgl7/T/ipykernel_29499/2967036786.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy test['Diff_to_Prev_Close'] = test['Close'].diff()
Train an RNN¶
# clear backend session
keras.backend.clear_session()
# instantiate the model
rnn = Sequential()
# add layers
rnn.add(SimpleRNN(
128,
activation='relu',
input_shape=(x_train.shape[1], x_train.shape[2]),
return_sequences=True,
))
rnn.add(SimpleRNN(
64,
activation='relu',
return_sequences=False,
))
rnn.add(Dense(32))
rnn.add(Dense(1)) # output layer
# compile the model
rnn.compile(optimizer='adam', loss='mean_squared_error')
# print the rnn
print(rnn.summary())
2023-11-24 17:55:12.159937: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE4.1 SSE4.2 AVX AVX2 FMA To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
Model: "sequential" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= simple_rnn (SimpleRNN) (None, 90, 128) 16640 _________________________________________________________________ simple_rnn_1 (SimpleRNN) (None, 64) 12352 _________________________________________________________________ dense (Dense) (None, 32) 2080 _________________________________________________________________ dense_1 (Dense) (None, 1) 33 ================================================================= Total params: 31,105 Trainable params: 31,105 Non-trainable params: 0 _________________________________________________________________ None
# train the model
history_rnn = rnn.fit(
x_train,
y_train,
epochs=5,
batch_size=32,
validation_data=(x_valid, y_valid), # this data isn't scaled, needs to be
)
Epoch 1/5
2023-11-24 17:55:12.365020: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)
40/40 [==============================] - 3s 46ms/step - loss: 0.0051 - val_loss: 0.4133 Epoch 2/5 40/40 [==============================] - 2s 39ms/step - loss: 5.3857e-04 - val_loss: 0.4301 Epoch 3/5 40/40 [==============================] - 1s 37ms/step - loss: 4.9881e-04 - val_loss: 0.4498 Epoch 4/5 40/40 [==============================] - 1s 37ms/step - loss: 4.3218e-04 - val_loss: 0.4667 Epoch 5/5 40/40 [==============================] - 1s 35ms/step - loss: 4.1891e-04 - val_loss: 0.4388
Evaluate the RNN¶
# get the predicted values
predictions_rnn = rnn.predict(x_test)
# reverse transform them to compare with y_test
predictions_rnn = predictions_rnn * (max - min) + min
# get the root mean squared error (RMSE)
rmse = np.sqrt(np.mean(predictions_rnn - y_test)**2)
print('The RMSE is: ', rmse)
The RMSE is: 0.14117949529988877
# plot the results
test['PredictionsRNN'] = predictions_rnn
# visualize the data
plt.figure(figsize=(16,8))
plt.title('SCHK Close Price History')
plt.plot(train['Close'])
plt.plot(test[['Close', 'PredictionsRNN']])
plt.xlabel('Date', fontsize=18)
plt.ylabel('Close Price USD ($)', fontsize=18)
plt.legend(['Train', 'Test', 'PredictionsRNN'], loc='lower right')
plt.grid(b=True, which='major', color='#666666', linestyle='-')
plt.show()
Let me see if shifting the predictions systematically will reduce the MSE.
# try shifting them up
shift_rnn = 0.14
predictions_shifted_rnn = predictions_rnn + shift_rnn
# get the root mean squared error (RMSE)
rmse = np.sqrt(np.mean(predictions_shifted_rnn - y_test)**2)
print('The RMSE when predictions are shifted up by', shift_rnn, 'is:', rmse)
The RMSE when predictions are shifted up by 0.14 is: 0.0011801056514512624
Train an LSTM¶
# clear backend
tf.keras.backend.clear_session()
# instantiate the model
lstm = Sequential()
# add layers
lstm.add(LSTM(
128,
return_sequences=True,
input_shape=(x_train.shape[1], x_train.shape[2])
))
lstm.add(LSTM(
64,
return_sequences=False
))
lstm.add(Dense(32))
lstm.add(Dense(1)) # output layer
# compile the lstm
lstm.compile(optimizer='adam', loss='mean_squared_error')
# print the lstm
print(lstm.summary())
Model: "sequential" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= lstm (LSTM) (None, 90, 128) 66560 _________________________________________________________________ lstm_1 (LSTM) (None, 64) 49408 _________________________________________________________________ dense (Dense) (None, 32) 2080 _________________________________________________________________ dense_1 (Dense) (None, 1) 33 ================================================================= Total params: 118,081 Trainable params: 118,081 Non-trainable params: 0 _________________________________________________________________ None
# train the model
history_lstm = lstm.fit(
x_train,
y_train,
batch_size=32,
epochs=5,
validation_data=(x_valid, y_valid)
)
Epoch 1/5 40/40 [==============================] - 9s 144ms/step - loss: 0.0258 - val_loss: 2.1427 Epoch 2/5 40/40 [==============================] - 4s 112ms/step - loss: 0.0016 - val_loss: 2.1746 Epoch 3/5 40/40 [==============================] - 4s 104ms/step - loss: 0.0014 - val_loss: 2.2028 Epoch 4/5 40/40 [==============================] - 5s 116ms/step - loss: 0.0013 - val_loss: 2.1885 Epoch 5/5 40/40 [==============================] - 4s 110ms/step - loss: 0.0013 - val_loss: 2.2123
Evaluate the LSTM¶
# get the predicted values
predictions_lstm = lstm.predict(x_test)
# reverse transform them to compare with y_test
predictions_lstm = predictions_lstm * (max - min) + min
# get the root mean squared error (RMSE)
rmse = np.sqrt(np.mean(predictions_lstm - y_test)**2)
print('The RMSE is: ', rmse)
The RMSE is: 0.13354103138904697
# plot the results
train = df[:training_data_len]
test = df[training_data_len:]
test['Predictions'] = predictions_lstm
# visualize the data
plt.figure(figsize=(16,8))
plt.title('SCHK Close Price History LSTM Predictions')
plt.plot(train['Close'])
plt.plot(test[['Close', 'Predictions']])
plt.xlabel('Date', fontsize=18)
plt.ylabel('Close Price USD ($)', fontsize=18)
plt.legend(['Train', 'Test', 'Predictions'], loc='lower right')
plt.grid(b=True, which='major', color='#666666', linestyle='-')
plt.show()
It appears the predictions are consistently low. Let me try shifting them up to see if this reduces the MSE.
# try shifting them up
shift_lstm = 0.13
predictions_shifted_lstm = predictions_lstm + shift_lstm
# get the root mean squared error (RMSE)
rmse = np.sqrt(np.mean(predictions_shifted_lstm - y_test)**2)
print('The RMSE when predictions are shifted up by', shift_lstm, 'is:', rmse)
The RMSE when predictions are shifted up by 0.13 is: 0.0035399632738126034
Conclusions¶
Our baseline benchmark was to use the average value of the stock for the last 6 years, which gave a terrible MSE of $7.89. However, a more intuitive baseline is to simply use the previous day's closing price and expect that it won't change. This gives a much better MSE of $0.33. However, this isn't that good compared to the mean daily change in the stock price, which is $0.26. I look to the RNN and LSTM to beat this baseline.
The RNN gave an MSE of just $0.14, much better than the base. This is visible on the plot, you can see the orange and green lines tracking quite closely. I was surprised by this, since the RNN does not have a sense of long term dependencies beyond the last 60 days. This tells us that much of the trajectory of a stock is based on the last 60 days.
The LSTM gave an MSE of $0.13, which is only slightly better than the RNN.
What's interesting is that the MSE varied widely from just a few cents to as much as 40 cents depending on the batch size and the number of epochs. Additionally, it was nearly impossible to get the same parameter values twice, even holding the batch and epoch size constant. This tells me that the LSTM and RNN are very sensitive to the initial conditions, which is unfortunate since they are largely random.
Additionally, both models had a consistent bias to underpredict stock prices. On a few runs, it overpredicted, but mostly underpredicted. This is weird since it was visually obvious which direction the prediction needed to be shifted. I tested this by adding a constant to each one and I was able to reduce the MSE for each:
- RNN: $0.14 -> $0.0011 -- two orders of magnitude!
- LSTM: $0.13 -> $0.0035 -- about 1.5 orders of magnitude
While I don't know why the models were consistently biased, I was impressed at their predictive performance after shifting, given that the stock went both up and down during the 7 month period of test data.
Finally, while this seems like I'm able to predict future stock value quite well, I'm using all of the data from the previous 60 days as inputs to the model just to predict the stock price tomorrow. It would be much more interesting as someone who is not a day trader to be able to use the previous year worth of data in order to predict the next one month of stock prices. I would need to modify the model to output a sequence instead of just a single value, and then I would need to modify the training data to include the previous year's worth of data. This would be a fun project for the future.
Export to HTML¶
# export to HTML for webpage
import os
# os.system('jupyter nbconvert --to html mod1.ipynb')
os.system('jupyter nbconvert --to html pt4_rnn_lstm.ipynb --HTMLExporter.theme=dark')
[NbConvertApp] Converting notebook pt4_rnn_lstm.ipynb to html [NbConvertApp] WARNING | Alternative text is missing on 4 image(s). [NbConvertApp] Writing 785445 bytes to pt4_rnn_lstm.html
0
