import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from zipline.pipeline import Pipeline, sharadar, master, EquityPricing
from zipline.pipeline.factors import Returns, AverageDollarVolume
from zipline.research import run_pipeline

# date range for building risk model
start = "2009-01-01"
end = "2011-01-01"

# Screen for liquid, continuously traded stocks. Every condition below must
# hold for an asset to pass.
TradableStocksUS = (
    # Market cap of at least $500M (latest value of the ARQ fundamentals slice)
    (sharadar.Fundamentals.slice(dimension='ARQ', period_offset=0).MARKETCAP.latest >= 500e6)
    # average dollar volume of at least $2.5M over the trailing 200 days
    & (AverageDollarVolume(window_length=200) >= 2.5e6)
    # latest close price > $5
    & (EquityPricing.close.latest > 5)
    # no missing data for 200 days (exclude trading halts, IPOs, etc.):
    # a close must be present, and volume must be positive, on each of them
    & EquityPricing.close.all_present(window_length=200)
    & (EquityPricing.volume.latest > 0).all(window_length=200)
)
# Coarse pre-filter passed to Pipeline(initial_universe=...) alongside the
# screen above.
initial_universe = (
    # common stocks only
    master.SecuritiesMaster.usstock_SecurityType2.latest.eq("Common Stock")
    # primary share only
    # NOTE(review): presumably a null PrimaryShareSid marks a security that is
    # itself the primary share - confirm against the securities master docs.
    & master.SecuritiesMaster.usstock_PrimaryShareSid.latest.isnull()  
)

def tus_returns(start_date, end_date):
    """
    Return period-over-period percentage returns for the tradable universe.

    A pipeline collecting the latest close of every asset that passes the
    TradableStocksUS screen is run between start_date and end_date on the
    'sharadar-1d' bundle. The innermost index level of the result is moved
    into the columns, prices are converted to percentage changes, and the
    first row (which has no earlier price to compare against) is dropped.
    """
    close_pipeline = Pipeline(
        columns={'Close': EquityPricing.close.latest},
        initial_universe=initial_universe,
        screen=TradableStocksUS,
    )
    pipeline_output = run_pipeline(
        close_pipeline,
        start_date=start_date,
        end_date=end_date,
        bundle='sharadar-1d',
    )

    # Pivot the Close column so that the innermost index level becomes columns.
    close_wide = pipeline_output.Close.unstack()

    pct_changes = close_wide.pct_change()
    return pct_changes[1:]


# Returns matrix for the risk-model window, followed by a short size report.
R = tus_returns(start, end)
n_timestamps, n_assets = R.shape
print("The universe we define includes {} assets.".format(n_assets))
print('The number of timestamps is {} from {} to {}.'.format(n_timestamps, start, end))

The universe we define includes 1965 assets.
The number of timestamps is 504 from 2009-01-01 to 2011-01-01.

# Column labels of the returns matrix.
assets = R.columns

# Shorthand for the Sharadar fundamentals dataset sliced with
# dimension='ARQ' and period_offset=0.
Fundamentals = sharadar.Fundamentals.slice(dimension='ARQ', period_offset=0)

def make_pipeline():
    """
    Build the pipeline behind the size and book-to-price factor portfolios.

    Within the TradableStocksUS mask, four groups of 500 stocks are selected:
    the largest and smallest by market cap, and the highest and lowest by
    book-to-price (the reciprocal of the PB ratio). The pipeline is screened
    to the union of the four groups. For every asset it reports the return
    over a 2-row window, both ranking values, and one membership flag per
    group.

    Kept as a standalone function so it can be tested and modified in
    isolation.
    """
    size = Fundamentals.MARKETCAP.latest
    # Book-to-price is the reciprocal of price-to-book.
    value = 1/Fundamentals.PB.latest

    # Extreme groups on each ranking. Note that 'highpb' / 'lowpb' rank on
    # book-to-price, not on PB itself.
    groups = {
        'biggest': size.top(500, mask=TradableStocksUS),
        'smallest': size.bottom(500, mask=TradableStocksUS),
        'highpb': value.top(500, mask=TradableStocksUS),
        'lowpb': value.bottom(500, mask=TradableStocksUS),
    }
    in_any_group = (
        groups['biggest'] | groups['smallest'] | groups['highpb'] | groups['lowpb']
    )

    columns = {
        'returns': Returns(window_length=2),
        'market_cap': size,
        'book_to_price': value,
        **groups,
    }
    return Pipeline(
        columns=columns,
        initial_universe=initial_universe,
        screen=in_any_group,
    )

from quantrocket.master import get_securities
from quantrocket import get_prices

# Running the pipeline over the whole window takes a few minutes.
pipe = make_pipeline()
results = run_pipeline(pipe, start_date=start, end_date=end, bundle='sharadar-1d')

# Mean return of each group's members, computed per value of index level 0.
R_biggest = results.loc[results.biggest, 'returns'].groupby(level=0).mean()
R_smallest = results.loc[results.smallest, 'returns'].groupby(level=0).mean()
R_highpb = results.loc[results.highpb, 'returns'].groupby(level=0).mean()
R_lowpb = results.loc[results.lowpb, 'returns'].groupby(level=0).mean()

# Long-short spreads between the opposite groups.
SMB = R_smallest - R_biggest  # company size
HML = R_highpb - R_lowpb      # company PB ratio

# Combine both spreads, dropping rows where either one is missing.
df = pd.DataFrame({'SMB': SMB, 'HML': HML}, columns=["SMB", "HML"]).dropna()

# Sid of SPY, taken from the first row of the securities master lookup.
SPY = get_securities(symbols='SPY', vendors='usstock').index[0]

MKT = get_prices(
    'sharadar-1d',
    sids=SPY,
    start_date=start,
    end_date=end,
    data_frequency='daily',
    fields='Close',
)
MKT = MKT.loc['Close'][SPY]
# Percentage changes, moved one row later, with the first row dropped. The
# NaN that remains is removed by the dropna() on F below.
# NOTE(review): the one-row shift presumably lines the market return up with
# the pipeline's date labelling - confirm.
MKT = MKT.pct_change().shift()[1:]
MKT = pd.DataFrame({'MKT': MKT})

# All three factor return series side by side, restricted to rows where every
# factor has a value.
F = pd.concat([MKT, df], axis=1).dropna()

# Compound the factor returns and draw one subplot per factor.
ax = ((F + 1).cumprod() - 1).plot(subplots=True, title='Cumulative Fundamental Factors')
for factor_axis in ax[:3]:
    factor_axis.set(ylabel = "daily returns")
plt.show()

# factor exposure
B = pd.DataFrame(index=assets, dtype=np.float32)

from IPython.display import clear_output

x = sm.add_constant(F)

all_assets = {}
for i, asset in enumerate(assets):

    print(f"asset {i+1} of {len(assets)}")
    clear_output(wait=True)
    y = R.loc[:, asset].iloc[1:-1]
    y_inlier = y[np.abs(y - y.mean())<=(3*y.std())]
    x_inlier = x[np.abs(y - y.mean())<=(3*y.std())]
    result = sm.OLS(y_inlier, x_inlier).fit()

    B.loc[asset,"MKT_beta"] = result.params.iloc[1]
    B.loc[asset,"SMB_beta"] = result.params.iloc[2]
    B.loc[asset,"HML_beta"] = result.params.iloc[3]
    all_assets[asset] = y - (x.iloc[:,0] * result.params.iloc[0] +
                            x.iloc[:,1] * result.params.iloc[1] + 
                            x.iloc[:,2] * result.params.iloc[2] +
                            x.iloc[:,3] * result.params.iloc[3])

epsilon = pd.DataFrame(all_assets, index=R.index, dtype=np.float32)

asset 1965 of 1965

# Horizontal bar charts of the first ten rows of B, one panel per beta column.
fig,axes = plt.subplots(3, 1)
ax1,ax2,ax3 =axes

for position, panel in enumerate((ax1, ax2, ax3)):
    B.iloc[0:10, position].plot.barh(
        ax=panel, figsize=[15,15], title=B.columns[position]
    )
    panel.set(xlabel='beta')
plt.show()

from zipline.research import symbol

# Resolve AAPL in the bundle and display its row of factor exposures.
aapl = symbol("AAPL", bundle='sharadar-1d')
B.loc[aapl]

MKT_beta    1.043912
SMB_beta    0.131474
HML_beta   -0.379823
Name: Equity(FIBBG000B9XRY4 [AAPL]), dtype: float64

# Preview the first three rows of the factor returns.
F.head(3)

# Preview the first three rows of the factor exposures.
B.head(3)

# Equal weights as a single row: one entry per column of R, summing to one.
n_assets = R.shape[1]
w = np.ones((1, n_assets)) / n_assets

def compute_common_factor_variance(factors, factor_exposures, w):
    """
    Return the part of portfolio variance explained by the common factors.

    Evaluates ``w . B . V . B^T . w^T`` where ``B`` is the exposure matrix and
    ``V`` is the covariance matrix of the factor returns.

    Parameters
    ----------
    factors : pandas.DataFrame
        Factor returns, one column per factor. Only ``factors.cov()`` is used.
    factor_exposures : array-like
        Exposures with one row per asset and one column per factor, in the
        same factor order as the columns of ``factors``.
    w : numpy.ndarray
        Portfolio weights, one entry per asset. A row vector of shape
        ``(1, n_assets)`` yields a ``(1, 1)`` result.

    Returns
    -------
    numpy.ndarray
        The common factor variance, shaped by the matrix products above.
    """
    # Local names deliberately differ from the module-level B and F frames.
    exposures = np.asarray(factor_exposures)
    factor_cov = np.asarray(factors.cov())

    return w.dot(exposures.dot(factor_cov).dot(exposures.T)).dot(w.T)

# The function returns a 1x1 array; pull out its single element.
common_factor_variance = compute_common_factor_variance(F, B, w)[0, 0]
print("Common Factor Variance: {0}".format(common_factor_variance))

Common Factor Variance: 0.00020233227345977094

def compute_specific_variance(epsilon, w):
    """
    Return the part of portfolio variance due to asset-specific returns.

    Evaluates ``w . D . w^T`` where ``D`` is the diagonal matrix holding the
    sample variance of each column of ``epsilon``.

    Parameters
    ----------
    epsilon : pandas.DataFrame
        Residual (specific) returns, one column per asset.
    w : numpy.ndarray
        Portfolio weights, one entry per asset. A row vector of shape
        ``(1, n_assets)`` yields a ``(1, 1)`` result.

    Returns
    -------
    numpy.ndarray
        The specific variance, shaped by the matrix products above.
    """
    # DataFrame.var() already uses ddof=1 (Bessel's correction), so no further
    # n / (n - 1) rescaling is applied; doing so would correct twice.
    residual_var = np.asarray(epsilon.var())

    # w . diag(v) . w^T without materialising the n x n diagonal matrix:
    # scaling each weight by its variance is the same as multiplying by diag(v).
    return (w * residual_var).dot(w.T)

# The function returns a 1x1 array; pull out its single element.
specific_variance = compute_specific_variance(epsilon, w)[0, 0]
print("Specific Variance: {0}".format(specific_variance))

Specific Variance: 3.0442046290836587e-07

# Share of total (common + specific) variance that comes from the factors.
total_variance = common_factor_variance + specific_variance
common_factor_pct = common_factor_variance / total_variance * 100.0
print("Percentage of Portfolio Variance Due to Common Factor Risk: {0:.2f}%".format(common_factor_pct))

Percentage of Portfolio Variance Due to Common Factor Risk: 99.85%

# Random non-negative weights, one per column of R, rescaled to sum to one.
w_0 = np.random.rand(R.shape[1])
w_0 /= w_0.sum()

# Portfolio-level factor exposure: transposed exposures times the weights.
f = B.T.dot(w_0)
f

MKT_beta    0.914839
SMB_beta    0.535738
HML_beta   -0.097987
dtype: float64

	MKT	SMB	HML
2009-01-06	-0.001178	-0.002559	0.009278
2009-01-07	0.006674	0.010067	0.019556
2009-01-08	-0.029957	-0.001879	-0.012302

	MKT_beta	SMB_beta	HML_beta
asset
Equity(FIBBG000C2V3D6 [A])	1.199859	0.454460	-0.369525
Equity(FIBBG000BBVK30 [AAMRQ])	1.127701	0.638793	-0.906826
Equity(FIBBG000F7RCJ1 [AAP])	0.698091	0.345682	-0.441000

Risk-Constrained Portfolio Optimization

Factor Models

Universe

Factor Returns and Exposures

Calculating the Exposures

Summary of the Setup:

Splitting Variance into Common Factor Risks

Computing Common Factor and Specific Variance:

Risk-Constrained Optimization

References