# Import the libraries we'll be using
import numpy as np
import statsmodels.api as sm
from statsmodels import regression
import matplotlib.pyplot as plt

# Pull the pricing data for our two stocks and S&P 500
from quantrocket.master import get_securities
from quantrocket import get_prices

# Look up the securities master records for the benchmark and the two stocks
securities = get_securities(symbols=["SPY", "AAPL", "JNJ"], vendors='usstock')

# Sample window for the first set of regressions
start = '2013-01-01'
end = '2015-01-01'

# Request daily closes for those sids, then keep only the 'Close' rows
prices = get_prices(
    'usstock-free-1min',
    data_frequency='daily',
    sids=securities.index.tolist(),
    fields='Close',
    start_date=start,
    end_date=end,
)
closes = prices.loc['Close']

# Relabel the columns from sids to ticker symbols
sids_to_symbols = securities.Symbol.to_dict()
closes = closes.rename(columns=sids_to_symbols)

bench, a1, a2 = closes['SPY'], closes['JNJ'], closes['AAPL']

# Fit the three pairwise regressions (each with an intercept term) and
# report how much variance each one explains
ols = regression.linear_model.OLS
slr12 = ols(a2, sm.add_constant(a1)).fit()
slrb1 = ols(a1, sm.add_constant(bench)).fit()
slrb2 = ols(a2, sm.add_constant(bench)).fit()

print("R-squared values of linear regression")
for label, fit in (
    ("JNJ and AAPL:", slr12),
    ("JNJ and SPY:", slrb1),
    ("AAPL and SPY:", slrb2),
):
    print(label, fit.rsquared)

R-squared values of linear regression
JNJ and AAPL: 0.7561816151426892
JNJ and SPY: 0.9537085501322876
AAPL and SPY: 0.8241935110041808

# Repeat the experiment on a longer window that starts further back
start = '2009-01-01'
end = '2015-01-01'

# Same securities as before; only the date range changes
price_query = dict(
    data_frequency='daily',
    sids=securities.index.tolist(),
    fields='Close',
    start_date=start,
    end_date=end,
)
closes = get_prices('usstock-free-1min', **price_query).loc['Close']
closes = closes.rename(columns=sids_to_symbols)

bench = closes['SPY']
a1 = closes['JNJ']
a2 = closes['AAPL']


def _fit_with_intercept(dependent, independent):
    """Fit OLS of `dependent` on `independent` plus a constant column."""
    design = sm.add_constant(independent)
    return regression.linear_model.OLS(dependent, design).fit()


# Refit the three pairwise regressions and print their R-squared values
slr12 = _fit_with_intercept(a2, a1)
slrb1 = _fit_with_intercept(a1, bench)
slrb2 = _fit_with_intercept(a2, bench)

print("R-squared values of linear regression")
print("JNJ and AAPL:", slr12.rsquared)
print("JNJ and SPY:", slrb1.rsquared)
print("AAPL and SPY:", slrb2.rsquared)

R-squared values of linear regression
JNJ and AAPL: 0.662057671399793
JNJ and SPY: 0.9498743121988567
AAPL and SPY: 0.7888478832212861

# Fetch one year of daily closes for five assets
securities = get_securities(
    symbols=["AAPL", "MSFT", "JNJ", "XOM", "MON"],
    vendors='usstock',
)

start = '2017-01-01'
end = '2018-01-01'

closes = get_prices(
    "usstock-free-1min",
    data_frequency='daily',
    sids=securities.index.tolist(),
    fields='Close',
    start_date=start,
    end_date=end,
).loc['Close']

# Relabel the columns from sids to ticker symbols
sids_to_symbols = securities.Symbol.to_dict()
closes = closes.rename(columns=sids_to_symbols)

# AAPL is the dependent variable; the other four are the explanatory series
y = closes['AAPL']
x1, x2, x3, x4 = (closes[symbol] for symbol in ('MSFT', 'JNJ', 'MON', 'XOM'))

# Simple linear regression: explain y with x1 only.
# add_constant puts the intercept column first, so params are (intercept, slope).
slr = regression.linear_model.OLS(y, sm.add_constant(x1)).fit()
slr_intercept, slr_slope = slr.params.iloc[0], slr.params.iloc[1]
slr_prediction = slr_intercept + slr_slope*x1

# Multiple linear regression: explain y with x1..x4 together
regressors = (x1, x2, x3, x4)
mlr_design = sm.add_constant(np.column_stack(regressors))
mlr = regression.linear_model.OLS(y, mlr_design).fit()

# Rebuild the fitted values term by term: intercept first, then one
# coefficient-weighted regressor at a time, in column order
mlr_prediction = mlr.params.iloc[0]
for coef, regressor in zip(mlr.params.iloc[1:], regressors):
    mlr_prediction = mlr_prediction + coef*regressor

# Report the adjusted R-squared of each model
print('SLR R-squared:', slr.rsquared_adj)
print('MLR R-squared:', mlr.rsquared_adj)

# Overlay the actual series and both in-sample predictions
for series in (y, slr_prediction, mlr_prediction):
    series.plot()
plt.legend(['AAPL', 'SLR', 'MLR']);

SLR R-squared: 0.8434259550549307
MLR R-squared: 0.9170955837349951

# Load a half year of pricing data that the models were NOT fitted on
start = '2018-01-01'
end = '2018-06-01'

closes = get_prices("usstock-free-1min", data_frequency='daily', sids=securities.index.tolist(), fields='Close', start_date=start, end_date=end).loc['Close']
closes = closes.rename(columns=sids_to_symbols)

x1 = closes['MSFT']
x2 = closes['JNJ']
x3 = closes['MON']
x4 = closes['XOM']
y = closes['AAPL']

# Extend our model from before to the new time period by applying the
# previously fitted coefficients to the new prices (no refitting)
slr_prediction2 = slr.params.iloc[0] + slr.params.iloc[1]*x1
mlr_prediction2 = mlr.params.iloc[0] + mlr.params.iloc[1]*x1 + mlr.params.iloc[2]*x2 + mlr.params.iloc[3]*x3 + mlr.params.iloc[4]*x4

# Compute adjusted R-squared over the extended time period:
#     adjusted R^2 = 1 - (n - 1)/(n - k - 1) * SSR/SST
# where k is the number of regressors excluding the intercept. The factor
# therefore differs per model: k = 1 for the SLR, k = 4 for the MLR.
n_obs = len(y)
adj_slr = float(n_obs - 1)/(n_obs - 2)
adj_mlr = float(n_obs - 1)/(n_obs - 5)
adj = adj_mlr  # original name kept; it always held the 4-regressor factor

SST = sum((y - np.mean(y))**2)
SSRs = sum((slr_prediction2 - y)**2)
print('SLR R-squared:', 1 - adj_slr*SSRs/SST)
SSRm = sum((mlr_prediction2 - y)**2)
print('MLR R-squared:', 1 - adj_mlr*SSRm/SST)

# Plot y along with the two different predictions
y.plot()
slr_prediction2.plot()
mlr_prediction2.plot()
plt.legend(['AAPL', 'SLR', 'MLR']);

SLR R-squared: -3.17289751446555
MLR R-squared: -6.131583377986047

# Build two artificial samples over the same 30 indices: the first is the
# index plus Gaussian noise, the second adds the index once more
idx = np.arange(30)
sample1 = idx + 4*np.random.randn(30)
sample2 = sample1 + idx

# Pool both samples into one response vector, with the indices repeated to match
pool = np.concatenate((sample1, sample2))
pooled_idx = np.tile(idx, 2)

# Regress the pooled data on the original indices (with an intercept)
model = regression.linear_model.OLS(pool, sm.add_constant(pooled_idx)).fit()
intercept, slope = model.params[0], model.params[1]

# Show each sample in its own colour, with the single pooled fit in red
for sample, colour in ((sample1, 'b'), (sample2, 'g')):
    plt.scatter(idx, sample, color=colour)
plt.plot(intercept + slope*idx, color='r');

from zipline.pipeline import Pipeline, sharadar, master
from zipline.research import run_pipeline

fundamentals = sharadar.Fundamentals.slice(dimension='ARQ', period_offset=0)

# Pull three fundamentals for Transportation-industry companies whose
# market cap exceeds 1e8
pipeline = Pipeline(
    columns={
        "free_cash_flow": fundamentals.FCF.latest,
        "operating_cash_flow": fundamentals.NCFO.latest,
        "total_revenue": fundamentals.REVENUE.latest
    },
    screen=fundamentals.MARKETCAP.latest > 1e8,
    initial_universe=master.SecuritiesMaster.sharadar_FamaIndustry.latest.eq('Transportation')
)

# Run over December 2015 and keep only the cross-section for the last day
data = run_pipeline(pipeline, start_date="2015-12-01", end_date="2015-12-31", bundle='sharadar-1d')
data = data.xs('2015-12-31')

# Drop missing data. Rebind instead of using inplace=True: `data` is a
# cross-section taken from a larger frame, and mutating such a slice in
# place can trigger pandas' chained-assignment warning.
data = data.dropna()

# Run linear regression and print R-squared value to evaluate the goodness of the fit
unscaled_model = regression.linear_model.OLS(data['operating_cash_flow'],
                                    sm.add_constant(data['free_cash_flow'])).fit()
prediction = unscaled_model.params.iloc[0] + unscaled_model.params.iloc[1]*data['free_cash_flow']
print('R-squared value of model:', unscaled_model.rsquared)

# Plot the raw data for visualization. Keep the artist handles so each
# legend label is bound to its artist explicitly rather than by draw order.
points = plt.scatter(data['free_cash_flow'], data['operating_cash_flow'])
fit_line, = plt.plot(data['free_cash_flow'], prediction)
plt.legend([fit_line, points], ['Model Prediction', 'Data'])
plt.xlabel('Free cash flow')
plt.ylabel('Operating cash flow')
plt.title('Transportation companies with market cap > $100 mil');

R-squared value of model: 0.4888431067969575

# Keep only rows with strictly positive revenue, so the ratios below never
# divide by zero (this also removes any negative-revenue rows)
data = data[data.total_revenue > 0]

# Scale both cash-flow measures by total revenue once and reuse the results
fcf_ratio = data['free_cash_flow']/data['total_revenue']
ocf_ratio = data['operating_cash_flow']/data['total_revenue']

# Run linear regression with inputs scaled by total revenue
scaled_model = regression.linear_model.OLS(
    ocf_ratio,
    sm.add_constant(fcf_ratio), missing='drop').fit()
print('R-squared value of scaled model:', scaled_model.rsquared)

prediction = scaled_model.params.iloc[0] + scaled_model.params.iloc[1]*fcf_ratio

# Plot the scaled data. Keep the artist handles so each legend label is
# bound to its artist explicitly rather than by draw order.
points = plt.scatter(fcf_ratio, ocf_ratio)
fit_line, = plt.plot(fcf_ratio, prediction)
plt.legend([fit_line, points], ['Model Prediction', 'Data'])
plt.xlabel('Free cash flow/Total revenue')
plt.ylabel('Operating cash flow/Total revenue')
plt.title('Transportation companies with market cap > $100 mil');

R-squared value of scaled model: 0.08582368595314782

# Generate normally distributed errors: 100 candidate series plus one target
randos = [np.random.randn(100) for i in range(100)]
y = np.random.randn(100)

# Generate random walks as running totals of the errors. np.cumsum builds
# every prefix sum in a single pass instead of re-summing each prefix;
# list() keeps the containers as plain lists, as before.
randows = [list(np.cumsum(rando)) for rando in randos]
yw = list(np.cumsum(y))

# Plot all the walks
for walk in randows:
    plt.plot(walk, alpha=0.5)

# Compute R-squared of linear regression for each element of randows with yw.
# NOTE(review): no constant column is added here, unlike the other regressions
# in this file; statsmodels documents that R-squared is computed against the
# uncentered total sum of squares when the constant is omitted. Confirm that
# this is intended.
rs = [regression.linear_model.OLS(yw, x).fit().rsquared for x in randows]

# Plot and count the random walks that have R-squared with yw > .8
rcount = 0
for walk, r_squared in zip(randows, rs):
    if r_squared > .8:
        rcount += 1
        plt.plot(walk, alpha=0.5)
print('Linearly related walks out of 100:', rcount)

# Plot yw
plt.plot(yw, color='k');

Linearly related walks out of 100: 32

from scipy.stats import pearsonr

# For every walk, record the p-value (second element of pearsonr's result)
# of its Pearson correlation with yw
ps = []
for walk in randows:
    ps.append(pearsonr(yw, walk)[1])

# Select the walks whose correlation with yw has a p-value below 0.05,
# draw them, and report how many there are
significant_walks = [walk for walk, p_value in zip(randows, ps) if p_value < .05]
for walk in significant_walks:
    plt.plot(walk, alpha=0.5)
pcount = len(significant_walks)
print('Significantly correlated walks out of 100:', pcount)

# Draw the reference walk on top in black
plt.plot(yw, color='k');

Significantly correlated walks out of 100: 90

from statsmodels.tsa.stattools import adfuller

# Augmented Dickey-Fuller test on yw; the null hypothesis is that the series
# has a unit root. The second element of the result is the p-value.
adf_result = adfuller(yw)
adf_pvalue = adf_result[1]
print(adf_pvalue)

0.9393236647801395

Model Misspecification

Exclusion of important variables

Inclusion of unnecessary variables

Errors in independent variables

Incorrect functional form

Pooling different populations

Nonstationary time series

All the walks

Just those correlated with a randomly chosen one

References