The aim of this notebook is to check whether it is possible to predict/evaluate a stock's price trend given a set of features derived from historical price action.
Here we will be using a modified neural net adapted from two articles that evaluate the Iris dataset for flower-species classification. It turns out that the Iris dataset has a structure very similar to our training data for stocks, so the code is more or less reusable.
Compared to the article from blog.quantisti.com, we will be using a neural network instead and a much longer time horizon for the trend predictions.
How the neural net is trained:
Specifically, this is what our training/target condition looks like:
# price above trend multiple days later
df['target_cls'] = np.where(df['Adj Close'].shift(-34) > df.EMA150.shift(-34), 1, 0)
We are performing a binary classification task.
The output of the neural net will be 1 or 0 (Buy or Not Buy).
Based on the given features, the network will try to predict whether the price will be above a specific moving average n days later.
For example, as shown above: above the 150-day Exponential Moving Average 34 days later.
The neural net is never trained on the specific moving average it is trying to predict; it always uses different input features.
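To make the target condition concrete, here is a small, hypothetical helper (not part of the notebook's own code) that parameterizes the horizon and the moving-average column; the notebook itself hard-codes a 34-day horizon and EMA150:
import numpy as np

def make_target(df, horizon=34, ema_col='EMA150'):
    # label = 1 if the adjusted close is above the chosen EMA `horizon` trading days later, else 0
    future_close = df['Adj Close'].shift(-horizon)
    future_ema = df[ema_col].shift(-horizon)
    return np.where(future_close > future_ema, 1, 0)

# equivalent to the hard-coded condition above:
# df['target_cls'] = make_target(df, horizon=34, ema_col='EMA150')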
import talib as ta
import joblib
import pandas as pd
#suppress 'SettingWithCopy' warning
pd.set_option('mode.chained_assignment', None)
#!pip install pandas_datareader
#!pip3 install seaborn
import seaborn as sns
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
# ___library_import_statements___
import pandas as pd
# for pandas_datareader, otherwise it might have issues, sometimes there is some version mismatch
pd.core.common.is_list_like = pd.api.types.is_list_like
# make pandas print dataframes nicely
pd.set_option('expand_frame_repr', False)
import pandas_datareader.data as web
import numpy as np
import matplotlib.pyplot as plt
import datetime
import time
#newest yahoo API
import yfinance as yahoo_finance
#optional
#yahoo_finance.pdr_override()
%matplotlib inline
import talib as ta
import numpy as np
import matplotlib.pyplot as plt
# register datetime converters for matplotlib plotting (this was giving warnings otherwise)
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
# ___variables___
#ticker = 'AAPL'
#ticker = 'TSLA'
#ticker = 'FB'
#ticker = 'MSFT'
#ticker = 'NFLX'
#ticker = 'GOOGL'
ticker = 'BIDU'
#ticker = 'AMZN'
#ticker = 'IBM'
start_time = datetime.datetime(1980, 1, 1)
#end_time = datetime.datetime(2019, 1, 20)
end_time = datetime.datetime.now().date().isoformat() # today
def get_data(ticker):
    # Yahoo gives only daily historical data
    connected = False
    while not connected:
        try:
            df = web.get_data_yahoo(ticker, start=start_time, end=end_time)
            connected = True
            print('connected to yahoo')
        except Exception as e:
            print("type error: " + str(e))
            time.sleep(5)

    # use a numerical integer index instead of the date
    df = df.reset_index()
    #print(df.head(5))
    return df
#df = get_data(ticker)
For each stock we compute several technical indicators: mainly exponential moving averages, Bollinger Bands, RSI and the Parabolic SAR. We will then feed these indicators (or values derived from them) into the neural network as features.
def compute_technical_indicators(df):
    df['EMA5'] = ta.EMA(df['Adj Close'].values, timeperiod=5)
    df['EMA10'] = ta.EMA(df['Adj Close'].values, timeperiod=10)
    df['EMA15'] = ta.EMA(df['Adj Close'].values, timeperiod=15)
    df['EMA20'] = ta.EMA(df['Adj Close'].values, timeperiod=20)
    df['EMA30'] = ta.EMA(df['Adj Close'].values, timeperiod=30)
    df['EMA40'] = ta.EMA(df['Adj Close'].values, timeperiod=40)
    df['EMA50'] = ta.EMA(df['Adj Close'].values, timeperiod=50)
    df['EMA60'] = ta.EMA(df['Adj Close'].values, timeperiod=60)
    df['EMA70'] = ta.EMA(df['Adj Close'].values, timeperiod=70)
    df['EMA80'] = ta.EMA(df['Adj Close'].values, timeperiod=80)
    df['EMA90'] = ta.EMA(df['Adj Close'].values, timeperiod=90)
    df['EMA100'] = ta.EMA(df['Adj Close'].values, timeperiod=100)
    df['EMA150'] = ta.EMA(df['Adj Close'].values, timeperiod=150)
    df['EMA200'] = ta.EMA(df['Adj Close'].values, timeperiod=200)

    df['upperBB'], df['middleBB'], df['lowerBB'] = ta.BBANDS(df['Adj Close'].values, timeperiod=20, nbdevup=2, nbdevdn=2, matype=0)

    df['SAR'] = ta.SAR(df['High'].values, df['Low'].values, acceleration=0.02, maximum=0.2)

    # RSI, plus a version normalized to the 0-1 range
    df['RSI'] = ta.RSI(df['Adj Close'].values, timeperiod=14)
    df['normRSI'] = df['RSI'] / 100.0

    df.tail()
    return df
#df = compute_technical_indicators(df)
def compute_features(df):
    # compute binary features that will be fed into the classifier
    df['aboveEMA5'] = np.where(df['Adj Close'] > df['EMA5'], 1, 0)
    df['aboveEMA10'] = np.where(df['Adj Close'] > df['EMA10'], 1, 0)
    df['aboveEMA15'] = np.where(df['Adj Close'] > df['EMA15'], 1, 0)
    df['aboveEMA20'] = np.where(df['Adj Close'] > df['EMA20'], 1, 0)
    df['aboveEMA30'] = np.where(df['Adj Close'] > df['EMA30'], 1, 0)
    df['aboveEMA40'] = np.where(df['Adj Close'] > df['EMA40'], 1, 0)
    df['aboveEMA50'] = np.where(df['Adj Close'] > df['EMA50'], 1, 0)
    df['aboveEMA60'] = np.where(df['Adj Close'] > df['EMA60'], 1, 0)
    df['aboveEMA70'] = np.where(df['Adj Close'] > df['EMA70'], 1, 0)
    df['aboveEMA80'] = np.where(df['Adj Close'] > df['EMA80'], 1, 0)
    df['aboveEMA90'] = np.where(df['Adj Close'] > df['EMA90'], 1, 0)
    df['aboveEMA100'] = np.where(df['Adj Close'] > df['EMA100'], 1, 0)
    df['aboveEMA150'] = np.where(df['Adj Close'] > df['EMA150'], 1, 0)
    df['aboveEMA200'] = np.where(df['Adj Close'] > df['EMA200'], 1, 0)

    df['aboveUpperBB'] = np.where(df['Adj Close'] > df['upperBB'], 1, 0)
    df['belowLowerBB'] = np.where(df['Adj Close'] < df['lowerBB'], 1, 0)

    df['aboveSAR'] = np.where(df['Adj Close'] > df['SAR'], 1, 0)

    df['oversoldRSI'] = np.where(df['RSI'] < 30, 1, 0)
    df['overboughtRSI'] = np.where(df['RSI'] > 70, 1, 0)

    # very important - clean up NaN values, otherwise prediction does not work
    df = df.fillna(0).copy()

    df.tail()
    return df
#df = compute_features(df)
def plot_train_data(df):
    # plot price
    plt.figure(figsize=(15, 2.5))
    plt.title('Stock data ' + str(ticker))
    plt.plot(df['Date'], df['Adj Close'])
    #plt.title('Price chart (Adj Close) ' + str(ticker))
    plt.show()
    return None
def define_target_condition(df):
    # price higher later - bad predictive results
    #df['target_cls'] = np.where(df['Adj Close'].shift(-34) > df['Adj Close'], 1, 0)

    # price above trend multiple days later
    df['target_cls'] = np.where(df['Adj Close'].shift(-34) > df.EMA150.shift(-34), 1, 0)

    # important, remove NaN values
    df = df.fillna(0).copy()

    df.tail()
    return df
#df = define_target_condition(df)
#plot_train_data(df)
The neural network will be trained on this dataframe. The data will eventually be split into a training and a testing set.
tickers = ['F', 'IBM', 'GE', 'AAPL', 'ADM',
'XOM', 'GM','MMM','KO','PEP','SO','GS']
# 'HAS','PEAK','HPE','HLT','HD','HON','HRL','HST','HPQ','HUM','ILMN',
# 'INTC','ICE','INTU','ISRG','IVZ','IRM','JNJ','JPM','JNPR','K','KMB',
# 'KIM', 'KMI','KSS','KHC', 'KR', 'LB', 'LEG', 'LIN', 'LMT','LOW',
# 'MAR', 'MA','MCD','MDT', 'MRK', 'MET', 'MGM', 'MU','MSFT', 'MAA',
# 'MNST', 'MCO','MS', 'MSI',
# 'MMM', 'ABT','ACN','ATVI','ADBE','AMD','A','AKAM','ARE','GOOG','AMZN','AAL']
# parent dataframe to append to
ticker = 'SPY'
df = get_data(ticker)
df = compute_technical_indicators(df)
df = compute_features(df)
df = define_target_condition(df)
for ticker in tickers:
    t_df = get_data(ticker)
    t_df = compute_technical_indicators(t_df)
    t_df = compute_features(t_df)
    t_df = define_target_condition(t_df)
    # DataFrame.append is deprecated in newer pandas versions, so concatenate instead
    df = pd.concat([df, t_df], ignore_index=True)
predictors_list = ['aboveSAR','aboveUpperBB','belowLowerBB','normRSI','oversoldRSI','overboughtRSI',
'aboveEMA5','aboveEMA10','aboveEMA15','aboveEMA20','aboveEMA30','aboveEMA40',
'aboveEMA50','aboveEMA60','aboveEMA70','aboveEMA80','aboveEMA90','aboveEMA100']
def splitting_and_training(df, predictors_list, test_size=0.3):
    # __predictors__ / __features__
    X = df[predictors_list].fillna(0).values
    #X.tail()

    # __targets__
    y_cls = df.target_cls.fillna(0).values
    #y_cls.tail(10)

    # __train test split__
    from sklearn.model_selection import train_test_split
    y = y_cls
    X_cls_train, X_cls_test, y_cls_train, y_cls_test = train_test_split(X, y, test_size=test_size, random_state=42, stratify=y)

    print(X_cls_train.shape, y_cls_train.shape)
    print(X_cls_test.shape, y_cls_test.shape)

    return X_cls_train, X_cls_test, y_cls_train, y_cls_test
############ START OF MAIN SOURCE FROM KAGGLE ###############
import numpy as np
import torch
from torch import nn
from torch.autograd import Variable
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from keras.utils import to_categorical
import torch.nn.functional as F
class Model(nn.Module):
    def __init__(self, input_dim):
        super(Model, self).__init__()
        self.layer1 = nn.Linear(input_dim, 100)
        self.layer2 = nn.Linear(100, 30)
        self.layer3 = nn.Linear(30, 2)
        self.drop = nn.Dropout(0.2)

    def forward(self, x):
        x = F.relu(self.layer1(x))
        x = self.drop(x)
        x = F.relu(self.layer2(x))
        x = self.drop(x)
        # return raw logits; nn.CrossEntropyLoss applies log-softmax internally,
        # so an explicit softmax here is not needed (argmax over logits still gives the class)
        x = self.layer3(x)
        return x
#features, labels = load_iris(return_X_y=True)
#features[:3]
#labels[:3]
#features_train,features_test, labels_train, labels_test = train_test_split(features, labels, random_state=42, shuffle=True)
# my version
# so far the variables are dataframes, not arrays or tensors
features_train,features_test, labels_train, labels_test = splitting_and_training(df, predictors_list)
features_train[:3]
labels_train[:3]
# make data tensors
features_train = Variable(torch.Tensor(features_train).float())
features_test = Variable(torch.Tensor(features_test).float())
labels_train = Variable(torch.Tensor(labels_train).long())
labels_test = Variable(torch.Tensor(labels_test).long())
x_train = features_train
y_train = labels_train
# Training
model = Model(features_train.shape[1])
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
loss_fn = nn.CrossEntropyLoss()
epochs = 150
def print_(loss):
    print("The loss calculated: ", loss)

# Not using a DataLoader
#######x_train, y_train = Variable(torch.from_numpy(features_train)).float(), Variable(torch.from_numpy(labels_train)).long()
for epoch in range(1, epochs+1):
    print("Epoch #", epoch)
    y_pred = model(x_train)
    loss = loss_fn(y_pred, y_train)
    print_(loss.item())

    # zero gradients, backpropagate, update weights
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
# Prediction
######x_test = Variable(torch.from_numpy(features_test)).float()
x_test = features_test
pred = model(x_test)
pred = pred.detach().numpy()
pred
print ("The accuracy is", accuracy_score(labels_test, np.argmax(pred, axis=1)))
# Checking for first value
np.argmax(model(x_test[0]).detach().numpy(), axis=0)
labels_test[0]
torch.save(model, "iris-pytorch.pkl")
saved_model = torch.load("iris-pytorch.pkl")
np.argmax(saved_model(x_test[0]).detach().numpy(), axis=0)
x_test[0]
for i in x_test[:3]:
    print(i)
    prediction = np.argmax(saved_model(i).detach().numpy(), axis=0)
    print('prediction', prediction)
############ END OF MAIN SOURCE FROM KAGGLE ###############
Let's provide the model with new stock data it was not trained on to see how it performs.
#ticker='BP'
#ticker='ABBV'
#ticker='GILD'
#ticker='NGG'
#ticker='BPY'
ticker='AIR'
def plot_stock_prediction(df, ticker):
    # plot price, moving averages and the predicted Buy signals
    plt.figure(figsize=(30, 7))
    plt.title('Predictive model ' + str(ticker))
    plt.plot(df['Date'], df['Adj Close'], label='Adj Close', alpha=0.2)
    plt.plot(df['Date'], df['EMA10'], label='EMA10', alpha=0.2)
    plt.plot(df['Date'], df['EMA20'], label='EMA20', alpha=0.2)
    plt.plot(df['Date'], df['EMA30'], label='EMA30', alpha=0.2)
    plt.plot(df['Date'], df['EMA40'], label='EMA40', alpha=0.2)
    plt.plot(df['Date'], df['EMA50'], label='EMA50', alpha=0.2)
    plt.plot(df['Date'], df['EMA100'], label='EMA100', alpha=0.2)
    plt.plot(df['Date'], df['EMA150'], label='EMA150', alpha=0.99)
    plt.plot(df['Date'], df['EMA200'], label='EMA200', alpha=0.2)

    plt.scatter(df['Date'], df['Buy']*df['Adj Close'], label='Buy', marker='^', color='magenta', alpha=0.15)
    #plt.scatter(df.index, df['sell_sig'], label='Sell', marker='v')

    plt.legend()
    plt.show()
    return None
new_df = get_data(ticker)
new_df = compute_technical_indicators(new_df)
new_df = compute_features(new_df)
new_df=define_target_condition(new_df)
saved_model = torch.load("iris-pytorch.pkl")
def predict_timeseries(df):
    # making sure we have good dimensions
    # this column will be overwritten below
    df['Buy'] = df['target_cls']

    for i in range(len(df)):
        X_cls_valid = [[df['aboveSAR'][i], df['aboveUpperBB'][i], df['belowLowerBB'][i],
                        df['normRSI'][i], df['oversoldRSI'][i], df['overboughtRSI'][i],
                        df['aboveEMA5'][i], df['aboveEMA10'][i], df['aboveEMA15'][i], df['aboveEMA20'][i],
                        df['aboveEMA30'][i], df['aboveEMA40'][i], df['aboveEMA50'][i],
                        df['aboveEMA60'][i], df['aboveEMA70'][i], df['aboveEMA80'][i], df['aboveEMA90'][i],
                        df['aboveEMA100'][i]]]

        x_test = Variable(torch.Tensor(X_cls_valid).float())

        prediction = np.argmax(saved_model(x_test[0]).detach().numpy(), axis=0)
        # use .loc to avoid chained-assignment issues
        df.loc[i, 'Buy'] = prediction

    print(df.head())
    return df
new_df = predict_timeseries(new_df)
plot_stock_prediction(new_df, ticker)
# zoom in on the data
temp_df = new_df[-3000:-2000]
plot_stock_prediction(temp_df, ticker)
In the testing example (on unseen data) we can see that the model performs quite reasonably at identifying the overall uptrend.
When the markers sit on the price curve, the neural network is signalling 'Buy'; when they sit at the 0 level, it is signalling 'Sell / Don't buy'.
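As an optional variation (not part of the original notebook), the non-buy markers can be masked out instead of being drawn at the 0 level, which makes the signal easier to read on the chart. A minimal sketch, assuming the Date, Adj Close, EMA150 and Buy columns produced above; plot_buy_signals_only is a hypothetical helper name:
import numpy as np
import matplotlib.pyplot as plt

def plot_buy_signals_only(df, ticker):
    # keep the adjusted close only where the model predicted Buy == 1;
    # NaN values are simply skipped by matplotlib's scatter
    buy_prices = np.where(df['Buy'] == 1, df['Adj Close'], np.nan)

    plt.figure(figsize=(30, 7))
    plt.title('Predictive model ' + str(ticker))
    plt.plot(df['Date'], df['Adj Close'], label='Adj Close', alpha=0.2)
    plt.plot(df['Date'], df['EMA150'], label='EMA150', alpha=0.99)
    plt.scatter(df['Date'], buy_prices, label='Buy', marker='^', color='magenta', alpha=0.3)
    plt.legend()
    plt.show()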