import numpy as np
import pandas as pd
import statsmodels.api as sm
# If the observations are in a dataframe, you can use statsmodels.formula.api to do the regression instead
from statsmodels import regression
import matplotlib.pyplot as plt

# Observed values and the values predicted for them
Y = np.array([1, 3.5, 4, 8, 12])
Y_hat = np.array([1, 3, 5, 7, 9])

# Element-wise prediction error (prediction minus observation)
print('Error ' + str(np.subtract(Y_hat, Y)))

# Square each error so that positive and negative misses cannot cancel
SE = np.square(Y_hat - Y)

print('Squared Error ' + str(SE))
# The total of the squared errors is the quantity OLS minimizes
print('Sum Squared Error ' + str(SE.sum()))

Error [ 0.  -0.5  1.  -1.  -3. ]
Squared Error [0.   0.25 1.   1.   9.  ]
Sum Squared Error 11.25

# X1 is a simple linear ramp: 0, 1, 2, ..., 99
X1 = np.arange(100)

# X2 is a parabola over the same range with X1 added on top
X2 = X1 ** 2 + X1

# The "true" Y is an exact linear combination of X1 and X2 (both coefficients 1)
Y = X1 + X2

# Draw the three curves on shared axes, one legend entry per curve
for curve, curve_label in ((X1, 'X1'), (X2, 'X2'), (Y, 'Y')):
    plt.plot(curve, label=curve_label)
plt.legend();

# Put the independent variables side by side as columns, then prepend a
# column of ones so that the model can fit an intercept
X = sm.add_constant(np.column_stack((X1, X2)))

# Fit ordinary least squares of Y on [1, X1, X2]
results = sm.OLS(Y, X).fit()

# Report the intercept followed by the coefficient of each regressor
for beta_label, beta in zip(('Beta_0:', 'Beta_1:', 'Beta_2:'), results.params):
    print(beta_label, beta)

Beta_0: 3.979039320256561e-12
Beta_1: 1.000000000000262
Beta_2: 0.9999999999999959

# Load pricing data for two arbitrarily-chosen assets and SPY
from quantrocket.master import get_securities
from quantrocket import get_prices

# Look up the security records for SPY and the two stocks
securities = get_securities(symbols=['SPY', 'AAPL', 'JNJ'], vendors='usstock')

# Date range passed to the price query
start, end = '2019-01-01', '2020-01-01'

# Query daily prices for those securities and keep only the Close field
prices = get_prices(
    'usstock-free-1min',
    data_frequency='daily',
    sids=securities.index.tolist(),
    fields='Close',
    start_date=start,
    end_date=end,
).loc['Close']

# Relabel the price columns from sids to ticker symbols
sids_to_symbols = securities['Symbol'].to_dict()
prices = prices.rename(columns=sids_to_symbols)

# One price series per instrument
asset1, asset2, benchmark = (prices[symbol] for symbol in ('AAPL', 'JNJ', 'SPY'))

# First, regress asset1 on asset2 alone; add_constant supplies the intercept column
slr_exog = sm.add_constant(asset2)
slr = sm.OLS(asset1, slr_exog).fit()
# Position 0 holds the intercept, position 1 the slope on asset2
print('SLR beta of asset2:', slr.params.iloc[1])

SLR beta of asset2: 3.013492176301252

# Regress asset1 on asset2 and SPY together, with an intercept column prepended
mlr_exog = sm.add_constant(np.column_stack((asset2, benchmark)))
mlr = sm.OLS(asset1, mlr_exog).fit()

# Fitted parameters in design order: intercept, asset2, benchmark
beta_const, beta_asset2, beta_benchmark = (mlr.params.iloc[k] for k in range(3))

# Rebuild the fitted values by hand from the coefficients
prediction = beta_const + beta_asset2*asset2 + beta_benchmark*benchmark
prediction.name = 'Prediction'

print('MLR beta of asset2:', beta_asset2, '\nMLR beta of S&P 500:', beta_benchmark)

MLR beta of asset2: -0.43441769523091556 
MLR beta of S&P 500: 2.1255544919804126

# Plot the three variables along with the prediction given by the MLR.
# Each Series is drawn against its index, so the plotted values (prices)
# run along the vertical axis; label that axis rather than the x-axis.
asset1.plot()
asset2.plot()
benchmark.plot()
prediction.plot(color='y')
plt.ylabel('Price')
# Anchor the legend just outside the upper-right corner of the axes
plt.legend(bbox_to_anchor=(1,1), loc=2);

# Plot only the dependent variable and the prediction to get a closer look.
# The Series values (prices) are on the vertical axis, so that is the axis
# that gets the 'Price' label.
asset1.plot()
prediction.plot(color='y')
plt.ylabel('Price')
plt.legend();

# Full regression report for the multiple linear regression fit
mlr.summary()

# Four candidate regressors built over the same 100 points
X1 = np.arange(100)                                      # linear ramp 0..99
X2 = X1 ** 2 - X1                                        # parabola minus the ramp
X3 = np.array([np.log(i) for i in range(1, 101)]) + X2   # log(1..100) plus X2
X4 = X1 * 5                                              # exact multiple of X1

# The response is an exact linear combination of the four regressors
Y = 2 * X1 + 0.5 * X2 + 10 * X3 + X4

# Draw every series on shared axes, one legend entry per series
for curve, curve_label in ((X1, 'X1'), (X2, 'X2'), (X3, 'X3'), (X4, 'X4'), (Y, 'Y')):
    plt.plot(curve, label=curve_label)
plt.legend();

# Regress Y on all four candidates at once, with an intercept column prepended
full_design = sm.add_constant(np.column_stack((X1, X2, X3, X4)))
results = sm.OLS(Y, full_design).fit()

# Report the intercept followed by the coefficient of each regressor
beta_labels = ("Beta_0:", "Beta_1:", "Beta_2:", "Beta_3:", "Beta_4:")
for beta_label, beta in zip(beta_labels, results.params):
    print(beta_label, beta)

Beta_0: 1.1596057447604835e-11
Beta_1: 0.2692307692305018
Beta_2: 0.5000000000015916
Beta_3: 10.000000000000455
Beta_4: 1.346153846153797

# Package the candidate regressors as named DataFrame columns and the
# response as a named Series, the form forward_aic is written to accept
candidate_names = ['X1', 'X2', 'X3', 'X4']
data = pd.DataFrame(np.column_stack((X1, X2, X3, X4)), columns=candidate_names)
response = pd.Series(Y, name='Y')

def forward_aic(response, data):
    """Select regressors by greedy forward selection on AIC and return the fit.

    Starting from an intercept-only design, each pass fits one OLS model per
    remaining column of ``data`` (the current design plus that column) and
    keeps the column whose model has the lowest AIC, provided that AIC is
    strictly lower than the AIC of the current selection. The search stops on
    the first pass that brings no improvement, or when no columns remain.

    Parameters
    ----------
    response : array-like or pandas.Series
        Dependent variable, one value per row of ``data``.
    data : pandas.DataFrame
        Candidate explanatory variables, one per column.

    Returns
    -------
    The fitted OLS results for ``response`` regressed on the intercept plus
    the selected columns (intercept only if nothing was selected).
    """
    remaining = list(data.columns)
    # Share data's index so that concatenating columns never misaligns rows
    selected = pd.Series(np.ones(data.shape[0]), index=data.index, name="Intercept")
    current_score = np.inf

    while remaining:
        # Score every remaining candidate when added to the current selection
        scored = []
        for position, element in enumerate(remaining):
            candidate = pd.concat([selected, data[element]], axis=1)
            aic = regression.linear_model.OLS(response, candidate).fit().aic
            scored.append((aic, element, position))

        # Lowest AIC wins; ties fall back to the tuple ordering (name, position)
        best_new_score, best_element, position = min(scored)

        # Stop as soon as the best candidate fails to strictly improve the
        # score; this also covers ties and NaN scores, so the loop terminates
        if not best_new_score < current_score:
            break

        remaining.pop(position)
        selected = pd.concat([selected, data[best_element]], axis=1)
        current_score = best_new_score

    # Refit on the final selection and hand back the results object
    return regression.linear_model.OLS(response, selected).fit()

# Run forward selection over the candidate columns and show the chosen model's report
result = forward_aic(response=Y, data=data)
result.summary()

Dep. Variable:	AAPL	R-squared:	0.910
Model:	OLS	Adj. R-squared:	0.909
Method:	Least Squares	F-statistic:	1259.
Date:	Thu, 21 Mar 2024	Prob (F-statistic):	6.04e-131
Time:	15:25:33	Log-Likelihood:	-950.94
No. Observations:	252	AIC:	1908.
Df Residuals:	249	BIC:	1918.
Df Model:	2
Covariance Type:	nonrobust

	coef	std err	t	P>|t|	[0.025	0.975]
const	-346.9961	17.963	-19.317	0.000	-382.375	-311.618
x1	-0.4344	0.153	-2.834	0.005	-0.736	-0.133
x2	2.1256	0.048	44.739	0.000	2.032	2.219

Omnibus:	1.925	Durbin-Watson:	0.085
Prob(Omnibus):	0.382	Jarque-Bera (JB):	1.704
Skew:	-0.198	Prob(JB):	0.426
Kurtosis:	3.079	Cond. No.	8.54e+03

Dep. Variable:	y	R-squared:	1.000
Model:	OLS	Adj. R-squared:	1.000
Method:	Least Squares	F-statistic:	8.311e+27
Date:	Thu, 21 Mar 2024	Prob (F-statistic):	0.00
Time:	15:26:03	Log-Likelihood:	1865.3
No. Observations:	100	AIC:	-3723.
Df Residuals:	96	BIC:	-3712.
Df Model:	3
Covariance Type:	nonrobust

	coef	std err	t	P>|t|	[0.025	0.975]
Intercept	-3.183e-11	1.35e-09	-0.024	0.981	-2.72e-09	2.65e-09
X3	10.0000	8.18e-10	1.22e+10	0.000	10.000	10.000
X4	1.4000	1.3e-11	1.08e+11	0.000	1.400	1.400
X2	0.5000	8.18e-10	6.11e+08	0.000	0.500	0.500

Multiple Linear Regression

Evaluation

Model Assumptions

Model Selection Example

Omnibus:	13.841	Durbin-Watson:	0.001
Prob(Omnibus):	0.001	Jarque-Bera (JB):	10.110
Skew:	-0.658	Prob(JB):	0.00638
Kurtosis:	2.166	Cond. No.	5.48e+04