# Import all the libraries we'll be using
import numpy as np
import statsmodels.api as sm
from statsmodels import regression, stats
import statsmodels
import matplotlib.pyplot as plt

# Jarque-Bera normality check on a sample drawn from a standard normal
residuals = np.random.normal(loc=0, scale=1, size=100)
jb_normal = statsmodels.stats.stattools.jarque_bera(residuals)
pvalue = jb_normal[1]  # second element of the returned tuple is the p-value
print(pvalue)

# Same check on a Poisson sample, which is not normally distributed
residuals = np.random.poisson(size=100)
jb_poisson = statsmodels.stats.stattools.jarque_bera(residuals)
pvalue = jb_poisson[1]
print(pvalue)

0.6940434007675116
0.026179144649025356

# Artificially create dataset with constant variance around a line
xs = np.arange(100)
y1 = xs + 3*np.random.randn(100)

# Get results of linear regression
slr1 = regression.linear_model.OLS(y1, sm.add_constant(xs)).fit()

# Construct the fit line (params[0] is the intercept, params[1] the slope)
fit1 = slr1.params[0] + slr1.params[1]*xs

# Plot data and regression line.
# Each artist carries its own label so the legend cannot be mismatched:
# a positional label list is assigned in artist order, and the scatter is
# drawn before the line here.
plt.scatter(xs, y1, label='Observed')
plt.plot(xs, fit1, label='Predicted')
plt.title('Homoskedastic errors')
plt.legend()
plt.xlabel('X')
plt.ylabel('Y');

# Artificially create dataset with changing variance around a line:
# the noise term is multiplied by xs, so its spread grows with xs
y2 = xs*(1 + .5*np.random.randn(100))

# Perform linear regression
slr2 = regression.linear_model.OLS(y2, sm.add_constant(xs)).fit()
fit2 = slr2.params[0] + slr2.params[1]*xs

# Plot data and regression line.
# Each artist carries its own label so the legend cannot be mismatched:
# a positional label list is assigned in artist order, and the scatter is
# drawn before the line here.
plt.scatter(xs, y2, label='Observed')
plt.plot(xs, fit2, label='Predicted')
plt.title('Heteroskedastic errors')
plt.legend()
plt.xlabel('X')
plt.ylabel('Y')

# Print summary of regression results
slr2.summary()

# Residuals of the two simulated regressions (observed minus fitted line)
residuals1 = y1 - fit1
residuals2 = y2 - fit2

# Jarque-Bera test on each residual series; index 1 of the result is the p-value
jb_pvalue1 = statsmodels.stats.stattools.jarque_bera(residuals1)[1]
jb_pvalue2 = statsmodels.stats.stattools.jarque_bera(residuals2)[1]
print("p-value for residuals1 being normal", jb_pvalue1)
print("p-value for residuals2 being normal", jb_pvalue2)

# Breusch-Pagan test needs the regressors including the constant column;
# index 1 of the result is the p-value that is reported below
xs_with_constant = sm.add_constant(xs)
pvalue1 = stats.diagnostic.het_breuschpagan(residuals1, xs_with_constant)[1]
pvalue2 = stats.diagnostic.het_breuschpagan(residuals2, xs_with_constant)[1]
print("p-value for residuals1 being heteroskedastic", pvalue1)
print("p-value for residuals2 being heteroskedastic", pvalue2)

p-value for residuals1 being normal 0.5343383318360992
p-value for residuals2 being normal 0.0005716800307310119
p-value for residuals1 being heteroskedastic 0.9816910926365803
p-value for residuals2 being heteroskedastic 1.7842313935965266e-06

# Show the original fit first, then the same fit with robust covariance results
for fitted in (slr2, slr2.get_robustcov_results()):
    print(fitted.summary())

                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.439
Model:                            OLS   Adj. R-squared:                  0.433
Method:                 Least Squares   F-statistic:                     76.61
Date:                Thu, 21 Mar 2024   Prob (F-statistic):           6.12e-14
Time:                        15:31:50   Log-Likelihood:                -489.35
No. Observations:                 100   AIC:                             982.7
Df Residuals:                      98   BIC:                             987.9
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -2.1517      6.473     -0.332      0.740     -14.998      10.695
x1             0.9888      0.113      8.753      0.000       0.765       1.213
==============================================================================
Omnibus:                        8.067   Durbin-Watson:                   2.311
Prob(Omnibus):                  0.018   Jarque-Bera (JB):               14.934
Skew:                           0.181   Prob(JB):                     0.000572
Kurtosis:                       4.858   Cond. No.                         114.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.439
Model:                            OLS   Adj. R-squared:                  0.433
Method:                 Least Squares   F-statistic:                     57.56
Date:                Thu, 21 Mar 2024   Prob (F-statistic):           1.91e-11
Time:                        15:31:50   Log-Likelihood:                -489.35
No. Observations:                 100   AIC:                             982.7
Df Residuals:                      98   BIC:                             987.9
Df Model:                           1                                         
Covariance Type:                  HC1                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -2.1517      4.247     -0.507      0.614     -10.581       6.277
x1             0.9888      0.130      7.587      0.000       0.730       1.247
==============================================================================
Omnibus:                        8.067   Durbin-Watson:                   2.311
Prob(Omnibus):                  0.018   Jarque-Bera (JB):               14.934
Skew:                           0.181   Prob(JB):                     0.000572
Kurtosis:                       4.858   Cond. No.                         114.
==============================================================================

Notes:
[1] Standard Errors are heteroscedasticity robust (HC1)

# Load pricing data for an asset
from quantrocket import get_prices
from quantrocket.master import get_securities

start, end = '2009-01-01', '2010-01-01'

# Look up the security id for DAL, then fetch its daily closes
dal_sid = get_securities(symbols="DAL", vendors='usstock').index[0]
prices = get_prices(
    'usstock-learn-1d',
    sids=dal_sid,
    fields='Close',
    start_date=start,
    end_date=end,
)
y = prices.loc['Close'][dal_sid]

# Use the observation index 0..len(y)-1 as the time regressor
x = np.arange(len(y))

# Regress pricing data against time
model = regression.linear_model.OLS(y, sm.add_constant(x)).fit()

# Construct the fit line from the estimated intercept and slope
intercept, slope = model.params.iloc[0], model.params.iloc[1]
prediction = intercept + slope*x

# Plot pricing data and regression line
plt.plot(x, y)
plt.plot(x, prediction, color='r')
plt.legend(['DAL Price', 'Regression Line'])
plt.xlabel('Time')
plt.ylabel('Price')

# Print summary of regression results
model.summary()

# Residuals of the price-vs-time regression
price_residuals = y - prediction

# Q-statistics and their p-values for the first 40 lags, computed separately
# for the raw prices and for the regression residuals. The two results are kept
# under distinct names so the second call does not overwrite the first.
_, prices_qstats, prices_qstat_pvalues = statsmodels.tsa.stattools.acf(y, qstat=True, nlags=40, fft=False)
_, residuals_qstats, residuals_qstat_pvalues = statsmodels.tsa.stattools.acf(price_residuals, qstat=True, nlags=40, fft=False)

print('Prices autocorrelation p-values', prices_qstat_pvalues)
print('Residuals autocorrelation p-values', residuals_qstat_pvalues)

# Jarque-Bera normality test on the same residuals
_, jb_pvalue, _, _ = statsmodels.stats.stattools.jarque_bera(price_residuals)

print('Jarque-Bera p-value that residuals are normally distributed', jb_pvalue)

Prices autocorrelation p-values [6.76516804e-052 1.35164989e-095 2.10501059e-134 1.28223664e-168
 1.14441272e-197 1.41851995e-222 5.03861385e-244 3.87497716e-262
 1.69095431e-277 3.85316686e-290 6.48113696e-300 3.79940719e-307
 0.00000000e+000 0.00000000e+000 0.00000000e+000 0.00000000e+000
 0.00000000e+000 0.00000000e+000 0.00000000e+000 0.00000000e+000
 0.00000000e+000 0.00000000e+000 0.00000000e+000 0.00000000e+000
 0.00000000e+000 0.00000000e+000 0.00000000e+000 0.00000000e+000
 0.00000000e+000 0.00000000e+000 0.00000000e+000 0.00000000e+000
 0.00000000e+000 0.00000000e+000 0.00000000e+000 0.00000000e+000
 0.00000000e+000 0.00000000e+000 0.00000000e+000 0.00000000e+000]
Residuals autocorrelation p-values [6.76516804e-052 1.35164989e-095 2.10501059e-134 1.28223664e-168
 1.14441272e-197 1.41851995e-222 5.03861385e-244 3.87497716e-262
 1.69095431e-277 3.85316686e-290 6.48113696e-300 3.79940719e-307
 0.00000000e+000 0.00000000e+000 0.00000000e+000 0.00000000e+000
 0.00000000e+000 0.00000000e+000 0.00000000e+000 0.00000000e+000
 0.00000000e+000 0.00000000e+000 0.00000000e+000 0.00000000e+000
 0.00000000e+000 0.00000000e+000 0.00000000e+000 0.00000000e+000
 0.00000000e+000 0.00000000e+000 0.00000000e+000 0.00000000e+000
 0.00000000e+000 0.00000000e+000 0.00000000e+000 0.00000000e+000
 0.00000000e+000 0.00000000e+000 0.00000000e+000 0.00000000e+000]
Jarque-Bera p-value that residuals are normally distributed 1.8211247590999288e-26

from math import sqrt

# HAC covariance matrix of the coefficients of the price-vs-time model
cov_mat = stats.sandwich_covariance.cov_hac(model)

# Standard errors reported by the original fit
old_se = model.bse
print('Old standard errors:', old_se.iloc[0], old_se.iloc[1])

# Adjusted standard errors: square roots of the diagonal of the HAC covariance
hac_se = [sqrt(cov_mat[i, i]) for i in range(2)]
print('Adjusted standard errors:', hac_se[0], hac_se[1])

Old standard errors: 0.21740611282231034 0.0014987407749050077
Adjusted standard errors: 0.6966292524753421 0.004512902166098103

# Load pricing data for asset and two market indices
securities = get_securities(symbols=["SPY", "MDY", "HPQ"], vendors='usstock')

start = '2009-01-01'
end = '2010-01-01'

closes = get_prices("usstock-learn-1d", sids=securities.index.tolist(), fields="Close", start_date=start, end_date=end).loc["Close"]

# Relabel the columns from security ids to ticker symbols
sids_to_symbols = securities.Symbol.to_dict()
closes = closes.rename(columns=sids_to_symbols)

b1 = closes["SPY"]
b2 = closes["MDY"]
a = closes['HPQ']

# Run multiple linear regression of the asset on both indices
mlr = regression.linear_model.OLS(a, sm.add_constant(np.column_stack((b1,b2)))).fit()

# Construct fit curve using independent variables and estimated coefficients
mlr_prediction = mlr.params.iloc[0] + mlr.params.iloc[1]*b1 + mlr.params.iloc[2]*b2

# Print regression statistics (the value reported is the adjusted R-squared,
# so the label says so)
print('Adjusted R-squared:', mlr.rsquared_adj)
print('t-statistics of coefficients:\n', mlr.tvalues)

# Plot asset and model
a.plot()
mlr_prediction.plot()
plt.legend(['Asset', 'Model']);
plt.ylabel('Price')

R-squared: 0.9572417101669867
t-statistics of coefficients:
 const   -12.690509
x1        9.287386
x2       -1.555911
dtype: float64

Text(0, 0.5, 'Price')

# Perform linear regression of the asset on the first index only
slr = regression.linear_model.OLS(a, sm.add_constant(b1)).fit()
slr_prediction = slr.params.iloc[0] + slr.params.iloc[1]*b1

# Print fit statistics (the value reported is the adjusted R-squared,
# so the label says so)
print('Adjusted R-squared:', slr.rsquared_adj)
print('t-statistics of coefficients:\n', slr.tvalues)

# Plot asset and model
a.plot()
slr_prediction.plot()
plt.ylabel('Price')
plt.legend(['Asset', 'Model']);

R-squared: 0.9569986961494943
t-statistics of coefficients:
 const   -18.404597
SPY      74.746454
dtype: float64

from scipy.stats import pearsonr

# Construct Anscombe's arrays
x1 = [10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5]
y1 = [8.04, 6.95, 7.58, 8.81, 8.33, 9.96, 7.24, 4.26, 10.84, 4.82, 5.68]
x2 = [10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5]
y2 = [9.14, 8.14, 8.74, 8.77, 9.26, 8.10, 6.13, 3.10, 9.13, 7.26, 4.74]
x3 = [10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5]
y3 = [7.46, 6.77, 12.74, 7.11, 7.81, 8.84, 6.08, 5.39, 8.15, 6.42, 5.73]
x4 = [8, 8, 8, 8, 8, 8, 8, 19, 8, 8, 8]
y4 = [6.58, 5.76, 7.71, 8.84, 8.47, 7.04, 5.25, 12.50, 5.56, 7.91, 6.89]

quartet = [(x1, y1), (x2, y2), (x3, y3), (x4, y4)]

# Perform linear regressions on the datasets
slr1, slr2, slr3, slr4 = [
    regression.linear_model.OLS(yq, sm.add_constant(xq)).fit()
    for xq, yq in quartet
]
quartet_fits = [slr1, slr2, slr3, slr4]

# Print regression coefficients, Pearson r, and R-squared for the 4 datasets
print('Coefficients:', *(fit.params for fit in quartet_fits))
print('Pearson r:', *(pearsonr(xq, yq)[0] for xq, yq in quartet))
print('R-squared:', *(fit.rsquared for fit in quartet_fits))

# Plot the 4 datasets with their regression lines
f, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2,2)
xs = np.arange(20)
for idx, (ax, fit, (xq, yq)) in enumerate(zip((ax1, ax2, ax3, ax4), quartet_fits, quartet), start=1):
    # Pass xs explicitly so the line does not rely on the implicit 0..19 index
    ax.plot(xs, fit.params[0] + fit.params[1]*xs, 'r')
    ax.scatter(xq, yq)
    ax.set_xlabel(f'x{idx}')
    ax.set_ylabel(f'y{idx}')

Cofficients: [3.00009091 0.50009091] [3.00090909 0.5       ] [3.00245455 0.49972727] [3.00172727 0.49990909]
Pearson r: 0.8164205163448399 0.8162365060002426 0.816286739489598 0.8165214368885028
R-squared: 0.666542459508775 0.6662420337274844 0.6663240410665594 0.6667072568984653

Violations of Regression Models

Focus on the Residuals

Residuals not normally-distributed

Testing for normality

Heteroskedasticity

Testing for Heteroskedasticity

Correcting for Heteroskedasticity

Serial correlation of errors

Testing for Autocorrelation

Newey-West

Multicollinearity

Example: Anscombe's quartet

References

Dep. Variable:	y	R-squared:	0.439
Model:	OLS	Adj. R-squared:	0.433
Method:	Least Squares	F-statistic:	76.61
Date:	Thu, 21 Mar 2024	Prob (F-statistic):	6.12e-14
Time:	15:31:50	Log-Likelihood:	-489.35
No. Observations:	100	AIC:	982.7
Df Residuals:	98	BIC:	987.9
Df Model:	1
Covariance Type:	nonrobust

	coef	std err	t	P>|t|	[0.025	0.975]
const	-2.1517	6.473	-0.332	0.740	-14.998	10.695
x1	0.9888	0.113	8.753	0.000	0.765	1.213

Omnibus:	8.067	Durbin-Watson:	2.311
Prob(Omnibus):	0.018	Jarque-Bera (JB):	14.934
Skew:	0.181	Prob(JB):	0.000572
Kurtosis:	4.858	Cond. No.	114.

Dep. Variable:	FIBBG000R7Z112	R-squared:	0.109
Model:	OLS	Adj. R-squared:	0.106
Method:	Least Squares	F-statistic:	30.72
Date:	Thu, 21 Mar 2024	Prob (F-statistic):	7.55e-08
Time:	15:31:50	Log-Likelihood:	-494.80
No. Observations:	252	AIC:	993.6
Df Residuals:	250	BIC:	1001.
Df Model:	1
Covariance Type:	nonrobust

Omnibus:	66.323	Durbin-Watson:	0.049
Prob(Omnibus):	0.000	Jarque-Bera (JB):	118.536
Skew:	1.411	Prob(JB):	1.82e-26
Kurtosis:	4.824	Cond. No.	289.

	coef	std err	t	P>|t|	[0.025	0.975]
const	6.4903	0.217	29.853	0.000	6.062	6.918
x1	0.0083	0.001	5.543	0.000	0.005	0.011