import numpy as np
import pandas as pd
from statsmodels import regression, stats
import statsmodels.api as sm
import matplotlib.pyplot as plt
import scipy as sp

def linreg(X,Y):
    """Fit an ordinary least squares line ``Y = a + b * X``.

    Parameters
    ----------
    X : array-like
        Values of the single explanatory variable.
    Y : array-like
        Observations of the dependent variable, one per entry of ``X``.

    Returns
    -------
    tuple
        ``(a, b)``: the intercept followed by the slope of the fitted line.
    """
    # Add a column of 1's so that our model has a constant term
    x = sm.add_constant(X)
    model = regression.linear_model.OLS(Y, x).fit()
    # `params` comes back as a pandas Series when the inputs are pandas
    # objects, and integer keys on a Series are label lookups. Going through
    # an array keeps the access positional whatever the caller passes in.
    params = np.asarray(model.params)
    return params[0], params[1]

# Draw 20 observations from a standard normal distribution. The draws are
# independent of their position in the sample, so any trend found below is
# an artifact of the particular sample.
np.random.seed(107) # Fix seed for random number generation
rand = np.random.randn(20)

# Conduct linear regression on the ordered list of observations,
# using the observation index 0..19 as the explanatory variable
xs = np.arange(20)
a, b = linreg(xs, rand)  # a is the intercept, b is the slope
print('Slope:', b, 'Intercept:', a)

# Plot the raw data and the regression line
plt.scatter(xs, rand, alpha=0.7)
Y_hat = xs * b + a  # fitted values along the regression line
plt.plot(xs, Y_hat, 'r', alpha=0.9);

Slope: 0.009072503822685521 Intercept: -0.4020774408530383

import seaborn

# Plot the same 20-point sample (xs, rand) with seaborn's regression plot,
# which draws the scatter together with a fitted regression line
seaborn.regplot(x=xs, y=rand)

<Axes: >

# Draw more observations: a fresh sample of 100 from the same generator.
# The seed is not reset here, so these values depend on the draws already
# made since np.random.seed was last called.
rand2 = np.random.randn(100)

# Conduct linear regression on the ordered list of observations,
# using the observation index 0..99 as the explanatory variable
xs2 = np.arange(100)
a2, b2 = linreg(xs2, rand2)  # a2 is the intercept, b2 is the slope
print('Slope:', b2, 'Intercept:', a2)

# Plot the raw data and the regression line
plt.scatter(xs2, rand2, alpha=0.7)
Y_hat2 = xs2 * b2 + a2  # fitted values along the regression line
plt.plot(xs2, Y_hat2, 'r', alpha=0.9);

Slope: -0.0005693423631053353 Intercept: 0.009011767319021785

from quantrocket.master import get_securities
from quantrocket import get_prices

# Look up the security ID for XOM; the first index entry of the result is used
exxon_sid = get_securities(symbols="XOM", vendors='usstock').index[0]

# Request daily closing prices for XOM between the start and end dates
start = '2010-01-01'
end = '2016-01-01'
prices = get_prices('usstock-free-1min', data_frequency='daily', sids=exxon_sid, fields='Close', start_date=start, end_date=end)
# Select the 'Close' rows, then the XOM column, leaving a single price series
# (presumably indexed by date -- confirm against the get_prices documentation)
prices = prices.loc['Close'][exxon_sid]

# Manually set the point where we think a structural break occurs
# (a positional offset into `prices`, not a date)
# NOTE(review): this name shadows Python's builtin breakpoint() for the rest
# of the session; consider renaming if nothing else relies on it.
breakpoint = 1150
# Each regression uses its own 0-based observation index as the x variable,
# so the post-break fit starts again from x = 0
xs = np.arange(len(prices))
xs2 = np.arange(breakpoint)
xs3 = np.arange(len(prices) - breakpoint)

# Perform linear regressions on the full data set, the data up to the breakpoint, and the data after
a, b = linreg(xs, prices.values)
a2, b2 = linreg(xs2, prices[:breakpoint].values)
a3, b3 = linreg(xs3, prices[breakpoint:].values)

# Fitted lines, re-attached to the matching slice of the price index so they
# line up with the price series when plotted
Y_hat = pd.Series(xs * b + a, index=prices.index)
Y_hat2 = pd.Series(xs2 * b2 + a2, index=prices.index[:breakpoint])
Y_hat3 = pd.Series(xs3 * b3 + a3, index=prices.index[breakpoint:])

# Plot the raw data with the full-sample fit (yellow) and the two
# sub-sample fits (red)
prices.plot()
Y_hat.plot(color='y')
Y_hat2.plot(color='r')
Y_hat3.plot(color='r')
plt.title('XOM Price')
plt.ylabel('Price');

# CUSUM test for parameter stability, applied to the residuals of the
# full-sample regression of price on the observation index. Element [1] of
# the result is the p-value; a very small value is evidence against the
# hypothesis that the coefficients are stable over the sample.
stats.diagnostic.breaks_cusumolsresid(
    regression.linear_model.OLS(prices, sm.add_constant(xs)).fit().resid)[1]

2.0582432630257646e-61

# Get pricing data for two tech stocks and HD
securities = get_securities(symbols=["MSFT", "AAPL", "HD"], vendors='usstock')

start = '2013-01-01'
end = '2015-01-01'

# Daily closes for all three securities; .loc["Close"] keeps only the Close rows
prices = get_prices('usstock-free-1min', data_frequency='daily', sids=securities.index.tolist(), fields='Close', start_date=start, end_date=end).loc["Close"]

# Relabel the columns from security IDs to ticker symbols
sids_to_symbols = securities.Symbol.to_dict()
prices = prices.rename(columns=sids_to_symbols)

# b1 and b2 are the explanatory series; HD is the dependent variable
b1 = prices['AAPL']
b2 = prices['MSFT']
asset = prices['HD']

# Regress HD on a constant, AAPL and MSFT. The fitted parameters are read
# positionally in that order: constant, AAPL coefficient, MSFT coefficient.
mlr = regression.linear_model.OLS(asset, sm.add_constant(np.column_stack((b1, b2)))).fit()
# In-sample prediction rebuilt from the fitted coefficients
prediction = mlr.params.iloc[0] + mlr.params.iloc[1]*b1 + mlr.params.iloc[2]*b2
print('Constant:', mlr.params.iloc[0], 'MLR beta to AAPL:', mlr.params.iloc[1], 'MLR beta to MSFT', mlr.params.iloc[2])

# Plot the asset pricing data and the regression model prediction, just for fun
asset.plot()
prediction.plot();
plt.ylabel('Price')
plt.legend(['Asset', 'Linear Regression Prediction']);

Constant: 31.67091871476558 MLR beta to AAPL: 0.09478435173254046 MLR beta to MSFT 1.0723677769884103

# Get pricing data for two tech stocks and HD.
# This repeats the regression above with the end date moved from 2015-01-01
# to 2015-04-01, to show how the fitted coefficients change when the sample
# is extended.
securities = get_securities(symbols=["MSFT", "AAPL", "HD"], vendors='usstock')

start = '2013-01-01'
end = '2015-04-01'

# Daily closes for all three securities; .loc["Close"] keeps only the Close rows
prices = get_prices('usstock-free-1min', data_frequency='daily', sids=securities.index.tolist(), fields='Close', start_date=start, end_date=end).loc["Close"]

# Relabel the columns from security IDs to ticker symbols
sids_to_symbols = securities.Symbol.to_dict()
prices = prices.rename(columns=sids_to_symbols)

# b1 and b2 are the explanatory series; HD is the dependent variable
b1 = prices['AAPL']
b2 = prices['MSFT']
asset = prices['HD']

# Regress HD on a constant, AAPL and MSFT. The fitted parameters are read
# positionally in that order: constant, AAPL coefficient, MSFT coefficient.
mlr = regression.linear_model.OLS(asset, sm.add_constant(np.column_stack((b1, b2)))).fit()
# In-sample prediction rebuilt from the fitted coefficients
prediction = mlr.params.iloc[0] + mlr.params.iloc[1]*b1 + mlr.params.iloc[2]*b2
print('Constant:', mlr.params.iloc[0], 'MLR beta to AAPL:', mlr.params.iloc[1], 'MLR beta to MSFT', mlr.params.iloc[2])

# Plot the asset pricing data and the regression model prediction, just for fun
asset.plot()
prediction.plot();
plt.ylabel('Price')
plt.legend(['Asset', 'Linear Regression Prediction']);

Constant: 31.12000732698595 MLR beta to AAPL: 0.565604191584881 MLR beta to MSFT 0.09379442671394965

# Compute Pearson correlation coefficient between the two explanatory
# series (AAPL and MSFT closes) used in the regression above
import scipy.stats  # load the submodule explicitly so sp.stats does not depend on another import's side effects

sp.stats.pearsonr(b1,b2)[0] # Second return value is p-value

0.8597066549901697

Regression Model Instability

Biased noise

Regime changes

Multicollinearity