# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# machine learning
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, LSTM, SimpleRNN

# for reading stock data from yahoo'
from pandas_datareader.data import DataReader
import yfinance as yf
from pandas_datareader import data as pdr

# misc
from datetime import datetime

yf.pdr_override() # not sure what this does

stock = 'SCHK'
end = datetime.now()
print('End date: ', end)
start = datetime(end.year - 6, end.month, end.day)
print('Start date: ', start)

schk = yf.download(stock, start, end)
schk.tail()

End date:  2023-11-24 17:55:10.354131
Start date:  2017-11-24 00:00:00
[*********************100%***********************]  1 of 1 completed

# describe the data
print(schk.describe())

# note that weekends are not included, thus 255 days per year roughly
print(schk.info())

              Open         High          Low        Close    Adj Close  \
count  1510.000000  1510.000000  1510.000000  1510.000000  1510.000000   
mean     34.789909    34.980387    34.560558    34.780530    33.383337   
std       6.629948     6.653950     6.599332     6.629106     7.153823   
min      22.240000    22.340000    21.299999    21.700001    20.476364   
25%      28.242500    28.402500    28.122500    28.260000    26.230622   
50%      35.295000    35.570000    34.957500    35.219999    34.089796   
75%      40.877501    41.207500    40.557501    40.822501    39.952845   
max      47.000000    47.025002    46.700001    46.849998    45.638546   

             Volume  
count  1.510000e+03  
mean   2.113612e+05  
std    2.168370e+05  
min    2.960000e+04  
25%    1.121250e+05  
50%    1.619000e+05  
75%    2.443750e+05  
max    4.797900e+06  
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1510 entries, 2017-11-24 to 2023-11-24
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Open       1510 non-null   float64
 1   High       1510 non-null   float64
 2   Low        1510 non-null   float64
 3   Close      1510 non-null   float64
 4   Adj Close  1510 non-null   float64
 5   Volume     1510 non-null   int64  
dtypes: float64(5), int64(1)
memory usage: 82.6 KB
None

# visualize the closing price overtime
plt.figure(figsize=(16,8))
plt.title('SCHK Close Price History')
plt.plot(schk['Close'])
plt.xlabel('Date', fontsize=18)
plt.ylabel('Close Price USD ($)', fontsize=18)
plt.grid(b=True, which='major', color='#666666', linestyle='-')
plt.show()

/var/folders/1p/wbwsvy8x73v0fxb0dfm9pj446vjgl7/T/ipykernel_29499/1599654624.py:7: MatplotlibDeprecationWarning: The 'b' parameter of grid() has been renamed 'visible' since Matplotlib 3.5; support for the old name will be dropped two minor releases later.
  plt.grid(b=True, which='major', color='#666666', linestyle='-')

# plot sale volume over time
plt.figure(figsize=(16,8))
plt.title('SCHK Sale Volume History')
plt.plot(schk['Volume'])
plt.xlabel('Date', fontsize=18)
plt.ylabel('Sale Volume (Millions of Shares)', fontsize=18)
plt.grid(b=True, which='major', color='#666666', linestyle='-')
plt.show()

/var/folders/1p/wbwsvy8x73v0fxb0dfm9pj446vjgl7/T/ipykernel_29499/4262586195.py:7: MatplotlibDeprecationWarning: The 'b' parameter of grid() has been renamed 'visible' since Matplotlib 3.5; support for the old name will be dropped two minor releases later.
  plt.grid(b=True, which='major', color='#666666', linestyle='-')

# filter out just the closing price
df = schk.filter(['Close'])
dataset = df.values

# get the number of rows to train the model on
training_data_len = int(len(dataset) * 0.9)

print('The length of the full dataset is: ', len(dataset))
print('The length of the training dataset is: ', training_data_len)

The length of the full dataset is:  1510
The length of the training dataset is:  1359

# scale the data
max = np.max(dataset)
min = np.min(dataset)
print('The max value is: ', max)
print('The min value is: ', min)

# scale the data
dataset_scaled = (dataset - min) / (max - min)
print(dataset_scaled)

The max value is:  46.849998474121094
The min value is:  21.700000762939453
[[0.15491055]
 [0.15427433]
 [0.16421469]
 ...
 [0.87316108]
 [0.88111339]
 [0.88270389]]

# create the training dataset
# we need a bunch of sequences of length 90 (90 days) for x_train and
# the next day for y_train

# create the scaled training dataset
dataset_train = dataset_scaled[0:training_data_len]
print('The shape of dataset_train is: ', dataset_train.shape)

# create placeholders for the x_train and y_train
x_train = []
y_train = []

# populate the x_train and y_train
# loop over the data in dataset_train from day 0 to end day - 90
# append the data from day 0 to day 89 to x_train
# append the data from day 90 to y_train
for i in range(0, len(dataset_train) - 90):
    x_train.append(dataset_train[i:i+90]) # exclusive of day 90
    y_train.append(dataset_train[i+90])

# inspect the x_train and y_train to make sure they are correct
# print('x_train is :\n', x_train[0])
# print('y_train is :\n', y_train[0])

# convert the x_train and y_train to numpy arrays
x_train, y_train = np.array(x_train), np.array(y_train)
print('The shape of x_train is: ', x_train.shape)
print('The shape of y_train is: ', y_train.shape)

The shape of dataset_train is:  (1359, 1)
The shape of x_train is:  (1269, 90, 1)
The shape of y_train is:  (1269, 1)

# use the last 20% of the data as the test data
# include the last 90 days of the training data in the test data
dataset_test = dataset_scaled[training_data_len - 90:]
print('The shape of dataset_test is: ', dataset_test.shape)

# create the x_test and y_test
x_test = []
y_test = dataset[training_data_len:] # just the last 20% of the data, unscaled values

# populate the x_test, starting 90s before the end of the training data
for i in range (0, len(dataset_test)-90):
    x_test.append(dataset_test[i:i+90])

# convert the x_test to a numpy array
x_test = np.array(x_test)
y_test = np.array(y_test)
print('The shape of x_test is: ', x_test.shape)
print('The shape of y_test is: ', y_test.shape)

# create scaled versions of the x_test and y_test called x_valid and y_valid
x_valid = (x_test - min) / (max - min)
y_valid = (y_test - min) / (max - min)

# unscaled data for analysis
train = df[:training_data_len]
test = df[training_data_len:]

The shape of dataset_test is:  (241, 1)
The shape of x_test is:  (151, 90, 1)
The shape of y_test is:  (151, 1)

# compare with using the mean value from the training data
train_mean = np.mean(train['Close'])

# compute rmse for y_test using the mean value from the training data
rmse_mean = np.sqrt(np.mean(train_mean - y_test)**2)

print('The RMSE is when using the mean value from the training data: ', rmse_mean)

The RMSE is when using the mean value from the training data:  7.9133179196546495

# Compare with using the previous day's closing price
test['MSE_to_Prev_Close'] = test['Close'].diff()**2
test['Diff_to_Prev_Close'] = test['Close'].diff()

# compute rmse for y_test using the previous day's closing price
rmse_previous = np.sqrt(np.mean(test['MSE_to_Prev_Close']))

print('The RMSE is when using the previous day\'s closing price: ', rmse_previous)
print('The mean difference from the previous day\'s closing price is: ', np.mean(test['Diff_to_Prev_Close'].abs()))

# hide warnings in the cell output
import warnings
warnings.filterwarnings('ignore')

The RMSE is when using the previous day's closing price:  0.3320853754026828
The mean difference from the previous day's closing price is:  0.2650000762939453

/var/folders/1p/wbwsvy8x73v0fxb0dfm9pj446vjgl7/T/ipykernel_29499/2967036786.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['MSE_to_Prev_Close'] = test['Close'].diff()**2
/var/folders/1p/wbwsvy8x73v0fxb0dfm9pj446vjgl7/T/ipykernel_29499/2967036786.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Diff_to_Prev_Close'] = test['Close'].diff()

# clear backend session
keras.backend.clear_session()

# instantiate the model
rnn = Sequential()

# add layers
rnn.add(SimpleRNN(
    128,
    activation='relu',
    input_shape=(x_train.shape[1], x_train.shape[2]),
    return_sequences=True,
))
rnn.add(SimpleRNN(
    64,
    activation='relu',
    return_sequences=False,
))
rnn.add(Dense(32))
rnn.add(Dense(1)) # output layer

# compile the model
rnn.compile(optimizer='adam', loss='mean_squared_error')

# print the rnn
print(rnn.summary())

2023-11-24 17:55:12.159937: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
simple_rnn (SimpleRNN)       (None, 90, 128)           16640     
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, 64)                12352     
_________________________________________________________________
dense (Dense)                (None, 32)                2080      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
=================================================================
Total params: 31,105
Trainable params: 31,105
Non-trainable params: 0
_________________________________________________________________
None

# train the model
history_rnn = rnn.fit(
    x_train,
    y_train,
    epochs=5,
    batch_size=32,
    validation_data=(x_valid, y_valid), # this data isn't scaled, needs to be
)

Epoch 1/5

2023-11-24 17:55:12.365020: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)

40/40 [==============================] - 3s 46ms/step - loss: 0.0051 - val_loss: 0.4133
Epoch 2/5
40/40 [==============================] - 2s 39ms/step - loss: 5.3857e-04 - val_loss: 0.4301
Epoch 3/5
40/40 [==============================] - 1s 37ms/step - loss: 4.9881e-04 - val_loss: 0.4498
Epoch 4/5
40/40 [==============================] - 1s 37ms/step - loss: 4.3218e-04 - val_loss: 0.4667
Epoch 5/5
40/40 [==============================] - 1s 35ms/step - loss: 4.1891e-04 - val_loss: 0.4388

# get the predicted values
predictions_rnn = rnn.predict(x_test)
# reverse transform them to compare with y_test
predictions_rnn = predictions_rnn * (max - min) + min

# get the root mean squared error (RMSE)
rmse = np.sqrt(np.mean(predictions_rnn - y_test)**2)
print('The RMSE is: ', rmse)

The RMSE is:  0.14117949529988877

# plot the results
test['PredictionsRNN'] = predictions_rnn

# visualize the data
plt.figure(figsize=(16,8))
plt.title('SCHK Close Price History')
plt.plot(train['Close'])
plt.plot(test[['Close', 'PredictionsRNN']])
plt.xlabel('Date', fontsize=18)
plt.ylabel('Close Price USD ($)', fontsize=18)
plt.legend(['Train', 'Test', 'PredictionsRNN'], loc='lower right')
plt.grid(b=True, which='major', color='#666666', linestyle='-')
plt.show()

# try shifting them up
shift_rnn = 0.14
predictions_shifted_rnn = predictions_rnn + shift_rnn

# get the root mean squared error (RMSE)
rmse = np.sqrt(np.mean(predictions_shifted_rnn - y_test)**2)
print('The RMSE when predictions are shifted up by', shift_rnn, 'is:', rmse)

The RMSE when predictions are shifted up by 0.14 is: 0.0011801056514512624

# clear backend
tf.keras.backend.clear_session()

# instantiate the model
lstm = Sequential()

# add layers
lstm.add(LSTM(
    128, 
    return_sequences=True, 
    input_shape=(x_train.shape[1], x_train.shape[2])
))
lstm.add(LSTM(
    64, 
    return_sequences=False
))
lstm.add(Dense(32))
lstm.add(Dense(1)) # output layer

# compile the lstm
lstm.compile(optimizer='adam', loss='mean_squared_error')

# print the lstm
print(lstm.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
lstm (LSTM)                  (None, 90, 128)           66560     
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                49408     
_________________________________________________________________
dense (Dense)                (None, 32)                2080      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
=================================================================
Total params: 118,081
Trainable params: 118,081
Non-trainable params: 0
_________________________________________________________________
None

# train the model
history_lstm = lstm.fit(
    x_train, 
    y_train, 
    batch_size=32, 
    epochs=5,
    validation_data=(x_valid, y_valid)
)

Epoch 1/5
40/40 [==============================] - 9s 144ms/step - loss: 0.0258 - val_loss: 2.1427
Epoch 2/5
40/40 [==============================] - 4s 112ms/step - loss: 0.0016 - val_loss: 2.1746
Epoch 3/5
40/40 [==============================] - 4s 104ms/step - loss: 0.0014 - val_loss: 2.2028
Epoch 4/5
40/40 [==============================] - 5s 116ms/step - loss: 0.0013 - val_loss: 2.1885
Epoch 5/5
40/40 [==============================] - 4s 110ms/step - loss: 0.0013 - val_loss: 2.2123

# get the predicted values
predictions_lstm = lstm.predict(x_test)
# reverse transform them to compare with y_test
predictions_lstm = predictions_lstm * (max - min) + min

# get the root mean squared error (RMSE)
rmse = np.sqrt(np.mean(predictions_lstm - y_test)**2)
print('The RMSE is: ', rmse)

The RMSE is:  0.13354103138904697

# plot the results
train = df[:training_data_len]
test = df[training_data_len:]
test['Predictions'] = predictions_lstm

# visualize the data
plt.figure(figsize=(16,8))
plt.title('SCHK Close Price History LSTM Predictions')
plt.plot(train['Close'])
plt.plot(test[['Close', 'Predictions']])
plt.xlabel('Date', fontsize=18)
plt.ylabel('Close Price USD ($)', fontsize=18)
plt.legend(['Train', 'Test', 'Predictions'], loc='lower right')
plt.grid(b=True, which='major', color='#666666', linestyle='-')
plt.show()

# try shifting them up
shift_lstm = 0.13
predictions_shifted_lstm = predictions_lstm + shift_lstm

# get the root mean squared error (RMSE)
rmse = np.sqrt(np.mean(predictions_shifted_lstm - y_test)**2)
print('The RMSE when predictions are shifted up by', shift_lstm, 'is:', rmse)

The RMSE when predictions are shifted up by 0.13 is: 0.0035399632738126034

# export to HTML for webpage
import os

# os.system('jupyter nbconvert --to html mod1.ipynb')
os.system('jupyter nbconvert --to html pt4_rnn_lstm.ipynb --HTMLExporter.theme=dark')

[NbConvertApp] Converting notebook pt4_rnn_lstm.ipynb to html
[NbConvertApp] WARNING | Alternative text is missing on 4 image(s).
[NbConvertApp] Writing 785445 bytes to pt4_rnn_lstm.html

0

	Open	High	Low	Close	Adj Close	Volume
Date
2023-11-17	43.380001	43.480000	43.299999	43.419998	43.419998	160800
2023-11-20	43.369999	43.855000	43.369999	43.779999	43.779999	179700
2023-11-21	43.669998	43.700001	43.549999	43.660000	43.660000	198100
2023-11-22	43.810001	43.930000	43.740002	43.860001	43.860001	165300
2023-11-24	43.810001	43.900002	43.810001	43.900002	43.900002	56867

RNN versus LSTM on Stock Price Prediction¶

Import Necessary Libraries¶

Pull in the Stock Data from Yahoo Finance¶

Descriptive Statistics on the Data¶

Prepare the Training Data for the RNN and LSTM¶

Prepare the Testing Data for the LSTM and RNN¶

Baseline Benchmarks to Beat¶

Train an RNN¶

Evaluate the RNN¶

Train an LSTM¶

Evaluate the LSTM¶

Conclusions¶

Export to HTML¶