import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import covariance

# Simulate a linear relationship Y = 5*X + noise: X is drawn from a standard
# normal and the noise from a normal with standard deviation 3.
X = np.random.normal(size=1000)
epsilon = np.random.normal(loc=0, scale=3, size=X.shape[0])
Y = 5 * X + epsilon

# Covariance from its definition: the mean of the products of each variable's
# deviations from its own mean (this divides by N, not N - 1).
product = (X - X.mean()) * (Y - Y.mean())
expected_value = product.mean()

print('Value of the covariance between X and Y:', expected_value)

Value of the covariance between X and Y: 5.226838721269527

# 2x2 covariance matrix of X and Y: variances on the diagonal, the covariance
# of X and Y off the diagonal.
np.cov(X, Y)

array([[ 1.04749864,  5.23207079],
       [ 5.23207079, 35.57008158]])

# Variances of X and Y with NumPy's default normalisation (divide by N).
print(X.var(), Y.var())

1.0464511390306734 35.53451150113138

# scatter plot of X and y
from statsmodels import regression
import statsmodels.api as sm
def linreg(X, Y):
    """Fit ``Y = a + b*X`` by ordinary least squares and plot the fit.

    Draws a translucent scatter of the raw data and the fitted line (in red)
    on the current matplotlib axes and labels both axes.

    Args:
        X: Array-like of the explanatory variable (NumPy array, list or
            pandas Series).
        Y: Array-like of the dependent variable, same length as ``X``.

    Returns:
        The statsmodels summary of the fitted OLS model.
    """
    # Build the design matrix from a plain array: with pandas input
    # ``add_constant`` returns a DataFrame, which cannot be sliced as
    # ``[:, 1]`` below.
    design = sm.add_constant(np.asarray(X))
    model = regression.linear_model.OLS(Y, design).fit()

    # ``params`` is a label-indexed Series when Y is a pandas object, so
    # convert before indexing by position. The constant column comes first,
    # hence (intercept, slope).
    params = np.asarray(model.params)
    a = params[0]
    b = params[1]
    x_values = design[:, 1]

    # Evaluate the fitted line on an even grid spanning the observed X range.
    x_line = np.linspace(x_values.min(), x_values.max(), 100)
    y_hat = x_line * b + a
    plt.scatter(x_values, Y, alpha=0.3)  # Plot the raw data
    plt.plot(x_line, y_hat, 'r', alpha=0.9)  # Add the regression line, colored in red
    plt.xlabel('X Value')
    plt.ylabel('Y Value')
    return model.summary()

# Fit the regression; linreg draws a translucent scatter and the red fitted
# line on the current axes.
linreg(X, Y)
points = plt.scatter(X, Y)
# Y is regressed on X (Y = 5*X + noise), so the title must say y as a
# function of x.
plt.title('Scatter plot and linear equation of y as a function of x')
plt.xlabel('X')
plt.ylabel('Y')
# Pass the artists explicitly: with labels alone, matplotlib pairs them with
# the axes' artists in an order that depends on its version, which can attach
# the 'Linear equation' label to the scatter instead of the line.
fitted_line = plt.gca().lines[-1]
plt.legend([fitted_line, points], ['Linear equation', 'Scatter Plot']);

from quantrocket.master import get_securities
from quantrocket import get_prices

# Four asset example of the covariance matrix.
symbol_list = ['SBUX', 'AAPL', 'GS', 'GILD']
start_date = '2009-01-01'
end_date = '2009-02-01'

securities = get_securities(symbols=symbol_list, vendors='usstock')

# Closing prices for the selected securities over the window.
prices = get_prices(
    "usstock-learn-1d",
    sids=securities.index.tolist(),
    start_date=start_date,
    end_date=end_date,
    fields='Close',
)

# Period-over-period returns; the first row is dropped because pct_change
# has no previous price for it and leaves it NaN.
returns = prices.loc['Close'].pct_change().iloc[1:]

# Relabel the columns using the securities' Symbol field.
sids_to_symbols = securities['Symbol'].to_dict()
returns = returns.rename(columns=sids_to_symbols)

print('Covariance matrix:')
print(returns.cov())

Covariance matrix:
Sid       AAPL        GS      GILD      SBUX
Sid                                         
AAPL  0.001044  0.001863  0.000288  0.000535
GS    0.001863  0.005755  0.001030  0.001451
GILD  0.000288  0.001030  0.000443  0.000301
SBUX  0.000535  0.001451  0.000301  0.000779

# Getting the return data of assets.
symbols = ['AAPL', 'MSFT', 'BRK.A', 'GE', 'FDX', 'SBUX']
start_date = '2009-01-01'
end_date = '2009-02-01'

securities = get_securities(symbols=symbols, vendors='usstock')

prices = get_prices(
    "usstock-learn-1d",
    sids=securities.index.tolist(),
    start_date=start_date,
    end_date=end_date,
    fields='Close',
)

# Period-over-period returns, without the leading NaN row that pct_change
# produces.
returns = prices.loc['Close'].pct_change().iloc[1:]

# Relabel the columns using the securities' Symbol field.
sids_to_symbols = securities['Symbol'].to_dict()
returns = returns.rename(columns=sids_to_symbols)

returns.head()

# ledoit_wolf returns (shrunk covariance, shrinkage coefficient); only the
# covariance matrix is used from here on.
in_sample_lw, _in_sample_shrinkage = covariance.ledoit_wolf(returns)
print(in_sample_lw)

[[0.00101248 0.00037204 0.00052483 0.00014367 0.00037033 0.00039591]
 [0.00037204 0.00082933 0.00059563 0.00044826 0.00038429 0.0002667 ]
 [0.00052483 0.00059563 0.00156691 0.00023826 0.00049181 0.00035761]
 [0.00014367 0.00044826 0.00023826 0.00141561 0.0003834  0.00030402]
 [0.00037033 0.00038429 0.00049181 0.0003834  0.00082922 0.00030816]
 [0.00039591 0.0002667  0.00035761 0.00030402 0.00030816 0.00081023]]

# Out-of-sample window, starting where the in-sample window ended.
oos_start = '2009-02-01'
oos_end = '2009-03-01'

oos_prices = get_prices(
    "usstock-learn-1d",
    sids=securities.index.tolist(),
    start_date=oos_start,
    end_date=oos_end,
    fields='Close',
)
oos_returns = oos_prices.loc['Close'].pct_change().iloc[1:]
oos_returns = oos_returns.rename(columns=sids_to_symbols)
out_sample_lw, _out_sample_shrinkage = covariance.ledoit_wolf(oos_returns)

# Absolute element-wise gap between the in-sample and out-of-sample
# estimates, summed down each column; the printed figure is the mean of
# those per-column totals.
lw_errors = np.abs(in_sample_lw - out_sample_lw).sum(axis=0)
print("Average Ledoit-Wolf error:", lw_errors.mean())

Average Ledoit-Wolf error: 0.0010022817112747538

# Same error measure for the plain sample covariance: absolute in-sample vs
# out-of-sample gap, summed down each column, then averaged for the printout.
sample_gap = returns.cov().values - oos_returns.cov().values
sample_errors = np.abs(sample_gap).sum(axis=0)
print('Average sample covariance error:', sample_errors.mean())

Average sample covariance error: 0.0013734439945932459

# Mean per-column ratio of sample error to Ledoit-Wolf error, expressed as a
# percentage above parity.
improvement_pct = ((sample_errors / lw_errors).mean() - 1) * 100
print('Error improvement of LW over sample: {0:.2f}%'.format(improvement_pct))

Error improvement of LW over sample: 37.07%

# One box per estimator, built from the per-column error totals.
error_frame = pd.DataFrame({
    'Sample Covariance Error': sample_errors,
    'Ledoit-Wolf Error': lw_errors,
})
sns.boxplot(data=error_frame)
plt.title('Box Plot of Errors')
plt.ylabel('Error');

# Larger universe over a five-month window.
start_date = '2009-01-01'
end_date = '2009-06-01'

symbols = [
    'SPY', 'XLF', 'XLE', 'XLU', 'XLK', 'XLI', 'XLB', 'GE', 'GS', 'BRK.A',
    'JPM', 'AAPL', 'MMM', 'BA', 'CSCO', 'KO', 'DIS', 'DD', 'XOM', 'INTC',
    'IBM', 'NKE', 'MSFT', 'PG', 'UTX', 'HD', 'MCD', 'CVX', 'AXP', 'JNJ',
    'MRK', 'CAT', 'PFE', 'TRV', 'UNH', 'WMT', 'VZ', 'QQQ', 'BAC', 'F',
    'C', 'CMCSA', 'MS', 'ORCL', 'PEP', 'HON', 'GILD', 'LMT', 'UPS', 'HP',
    'FDX', 'GD', 'SBUX',
]

securities = get_securities(symbols=symbols, vendors='usstock')

prices = get_prices(
    "usstock-learn-1d",
    sids=securities.index.tolist(),
    start_date=start_date,
    end_date=end_date,
    fields='Close',
)

# Period-over-period returns, without the leading NaN row that pct_change
# produces.
returns = prices.loc['Close'].pct_change().iloc[1:]

# Relabel the columns using the securities' Symbol field.
sids_to_symbols = securities['Symbol'].to_dict()
returns = returns.rename(columns=sids_to_symbols)

# Index labels of the monthly resample; used below as window boundaries.
dates = returns.resample('M').first().index

# Estimate both covariance matrices on each window between consecutive
# entries of `dates`; the final date is never used as a window end.
sample_covs = []
lw_covs = []

for window_start, window_end in zip(dates[:-2], dates[1:-1]):
    # Label-based row slice from window_start through window_end.
    window = returns.loc[window_start:window_end]
    sample_covs.append(window.cov().values)
    lw_covs.append(covariance.ledoit_wolf(window)[0])

def _mean_abs_change(previous, current):
    """Return the mean over columns of the summed absolute element-wise change."""
    # Sum down each column (axis=0) before averaging. Summing the whole
    # matrix collapses it to a scalar, which turns the mean into a no-op and
    # reports a total rather than a mean.
    return np.mean(np.sum(np.abs(previous - current), axis=0))


# Change of each estimator between consecutive windows.
lw_diffs = [
    _mean_abs_change(previous, current)
    for previous, current in zip(lw_covs[:-1], lw_covs[1:])
]

sample_diffs = [
    _mean_abs_change(previous, current)
    for previous, current in zip(sample_covs[:-1], sample_covs[1:])
]

# One point per pair of consecutive windows, placed at the end date of the
# later window.
diff_dates = dates[2:-1]
for series in (lw_diffs, sample_diffs):
    plt.plot(diff_dates, series)
plt.xlabel('Time')
plt.ylabel('Mean Error')
plt.legend(['Ledoit-Wolf Errors', 'Sample Covariance Errors']);

Sid	AAPL	FDX	GE	MSFT	SBUX	BRK.A
Date
2009-01-05	0.042204	-0.007604	-0.025776	0.009346	0.008130	0.026103
2009-01-06	-0.016494	0.002033	0.013830	0.011696	0.030242	-0.024464
2009-01-07	-0.021608	-0.022316	-0.044484	-0.060212	-0.022505	-0.035968
2009-01-08	0.018569	-0.008939	0.001862	0.031266	0.015015	0.022904
2009-01-09	-0.022869	-0.028507	-0.008674	-0.029821	-0.036489	-0.040020

Estimation of Covariance Matrices

Measuring Volatility

Covariance

The Covariance Matrix

Why does all this matter?

Shrinkage Estimators

Ledoit-Wolf Estimator

Calculating Errors

Comparing to Sample Matrix

Adding More Assets