import numpy as np
import pandas as pd

import statsmodels.api as sm
from statsmodels.tsa.stattools import coint, adfuller

import matplotlib.pyplot as plt

def generate_datapoint(params):
    """Draw a single sample from a normal distribution.

    `params` is an indexable pair: element 0 is the mean and element 1 is
    the standard deviation handed to `np.random.normal`.
    """
    return np.random.normal(loc=params[0], scale=params[1])

# Series A: T independent draws from one fixed normal distribution
# (mean 0, standard deviation 1), so its parameters never change over time.
T = 100
params = (0, 1)

A = pd.Series(
    [generate_datapoint(params) for _ in range(T)],
    index=range(T),
    dtype='float64',
    name='A',
)

plt.plot(A)
plt.xlabel('Time')
plt.ylabel('Value')
plt.legend(['Series A']);

# Series B: same number of points as before, but the mean drifts upward
# by 0.1 per time step, so the distribution depends on t.
T = 100

# Start from a NaN-filled float series and fill it one draw at a time.
B = pd.Series(index=range(T), dtype="float64", name='B')

for t in B.index:
    params = (t * 0.1, 1)
    B[t] = generate_datapoint(params)

plt.plot(B)
plt.xlabel('Time')
plt.ylabel('Value')
plt.legend(['Series B']);

# Overlay the full-sample mean of B as a dashed red line across the plot.
m = B.mean()

plt.plot(B)
plt.hlines(y=m, xmin=0, xmax=len(B), colors='r', linestyles='dashed')
plt.xlabel('Time')
plt.ylabel('Value')
plt.legend(['Series B', 'Mean']);

def check_for_stationarity(X, cutoff=0.01):
    """Run an Augmented Dickey-Fuller test on X and print the verdict.

    The null hypothesis of `adfuller` is that a unit root exists (the
    series is non-stationary), so only a p-value below `cutoff` is taken
    as evidence that the series is stationary.

    Parameters
    ----------
    X : array-like
        Data passed to `adfuller`. If it has a `name` attribute that is
        not None, that name appears in the printed message.
    cutoff : float, optional
        Significance threshold compared against the p-value.

    Returns
    -------
    bool
        True when the p-value is below `cutoff`, otherwise False.
    """
    pvalue = adfuller(X)[1]

    # Plain arrays have no `name` and an unnamed Series has name None;
    # use a placeholder so the report does not fail on string building.
    name = getattr(X, 'name', None)
    label = '<unnamed>' if name is None else name

    is_stationary = bool(pvalue < cutoff)
    verdict = 'stationary' if is_stationary else 'non-stationary'
    print('p-value = {} The series {} is likely {}.'.format(
        str(pvalue), label, verdict))
    return is_stationary

# Test both simulated series: the fixed-parameter A and the drifting B.
for series in (A, B):
    check_for_stationarity(series)

p-value = 1.7369248755844484e-16 The series A is likely stationary.
p-value = 0.862204016214658 The series B is likely non-stationary.

# Series C: the mean now oscillates as sin(t) instead of trending,
# so the parameters still depend on time.
T = 100

C = pd.Series(index=range(T), dtype="float64", name='C')

for t in C.index:
    params = (np.sin(t), 1)
    C[t] = generate_datapoint(params)

plt.plot(C)
plt.xlabel('Time')
plt.ylabel('Value')
plt.legend(['Series C']);

check_for_stationarity(C);

p-value = 2.661870345658161e-10 The series C is likely stationary.

# Plot A again as the starting point.
plt.plot(A)
plt.xlabel('Time')
plt.ylabel('Value')
plt.legend(['Series A']);

# A1 is the running sum of A.
A1 = A.cumsum()

plt.plot(A1)
plt.xlabel('Time')
plt.ylabel('Value')
plt.legend(['Series A1']);

# A2 is the running sum of A1, i.e. A summed twice.
A2 = A1.cumsum()

plt.plot(A2)
plt.xlabel('Time')
plt.ylabel('Value')
plt.legend(['Series A2']);

from quantrocket.master import get_securities
from quantrocket import get_prices

# Look up MSFT and take the first index entry of the result as its sid.
msft_securities = get_securities(symbols='MSFT', vendors='usstock')
MSFT = msft_securities.index[0]

# Daily closing prices for 2010 from the 'usstock-learn-1d' database.
msft_fields = get_prices(
    'usstock-learn-1d',
    data_frequency='daily',
    sids=MSFT,
    fields='Close',
    start_date='2010-01-01',
    end_date='2011-01-01',
)

# Keep only the 'Close' rows and label the column by ticker instead of sid.
prices = msft_fields.loc['Close'].rename(columns={MSFT: 'MSFT'})

X = prices['MSFT']

check_for_stationarity(X);

p-value = 0.293245336891862 The series MSFT is likely non-stationary.

# Raw MSFT closing prices.
plt.plot(X.index, X.values)
plt.ylabel('Price')
plt.legend([X.name]);

# First differences of the price; the first element is dropped because
# diff() has no previous value to subtract there.
X1 = X.diff().iloc[1:].rename(X.name + ' Additive Returns')
check_for_stationarity(X1)
plt.plot(X1.index, X1.values)
plt.ylabel('Additive Returns')
plt.legend([X1.name]);

p-value = 1.9852616550851073e-16 The series MSFT Additive Returns is likely stationary.

# Percentage changes of the price; the first element is dropped because
# pct_change() has no previous value to compare against there.
X1 = X.pct_change().iloc[1:].rename(X.name + ' Multiplicative Returns')
check_for_stationarity(X1)
plt.plot(X1.index, X1.values)
plt.ylabel('Multiplicative Returns')
plt.legend([X1.name]);

p-value = 1.6666604839217122e-16 The series MSFT Multiplicative Returns is likely stationary.

# Number of points in each simulated series
N = 100

# X1: cumulative sum of standard-normal noise, which makes it I(1).
X1 = pd.Series(np.cumsum(np.random.normal(0, 1, N)), name='X1')

# X2: X1 with a fresh layer of standard-normal noise added on top.
X2 = (X1 + np.random.normal(0, 1, N)).rename('X2')

plt.plot(X1)
plt.plot(X2)
plt.xlabel('Time')
plt.ylabel('Series Value')
plt.legend([X1.name, X2.name]);

# First differences of X2, skipping the leading element that diff()
# cannot compute.
Z = X2.diff().iloc[1:].rename('Z')

check_for_stationarity(Z);

p-value = 2.004300713964593e-15 The series Z is likely stationary.

# Z is the difference X2 - X1; by construction of X2 this is exactly the
# noise that was added on top of X1.
Z = (X2 - X1).rename('Z')

plt.plot(Z)
plt.xlabel('Time')
plt.ylabel('Series Value')
plt.legend(['Z']);

check_for_stationarity(Z);

p-value = 1.6888381661562583e-15 The series Z is likely stationary.

# Look up GLD and GDX; the result is indexed by sid and has a Symbol column.
securities = get_securities(symbols=['GLD', 'GDX'], vendors='usstock')

# Daily closing prices for 2010 for both securities.
prices = get_prices(
    'usstock-learn-1d',
    data_frequency='daily',
    sids=securities.index.tolist(),
    fields=['Close'],
    start_date='2010-01-01',
    end_date='2011-01-01').loc['Close']

# Relabel the price columns from sids to ticker symbols.
sids_to_symbols = securities.Symbol.to_dict()
prices = prices.rename(columns=sids_to_symbols)

X1 = prices['GLD']
X2 = prices['GDX']

# Plot each series against its own index rather than reusing X1's index
# for X2 (both come from the same frame, so the picture is unchanged).
plt.plot(X1.index, X1.values)
plt.plot(X2.index, X2.values)
plt.xlabel('Time')
plt.ylabel('Series Value')
plt.legend([X1.name, X2.name]);

# Regress X2 on X1 with an intercept. The constant-augmented design
# matrix gets its own name, so X1 stays the plain GLD price series.
exog = sm.add_constant(X1)
results = sm.OLS(X2, exog).fit()

results.params

const   -17.757124
GLD       0.573269
dtype: float64

# Spread between X2 and X1 scaled by the fitted GLD coefficient.
# The fitted intercept is not subtracted.
b = results.params['GLD']
Z = (X2 - b * X1).rename('Z')

plt.plot(Z.index, Z.values)
plt.xlabel('Time')
plt.ylabel('Series Value')
plt.legend([Z.name]);

check_for_stationarity(Z);

p-value = 0.00016375076700962633 The series Z is likely stationary.

# NOTE: `coint` is already imported at the top of the file; this line
# repeats that import.
from statsmodels.tsa.stattools import coint

# statsmodels' built-in cointegration test applied to the two price series.
# Per the statsmodels documentation, the result is a tuple of
# (test statistic, p-value, critical values) and the null hypothesis is
# that the series are NOT cointegrated.
coint(X1, X2)

(-4.486407756630515,
 0.0012891933169168481,
 array([-3.94060523, -3.36058133, -3.06139039]))

Integration, Cointegration, and Stationarity

Stationarity/Non-Stationarity

Series A

Series B

Why Non-Stationarity is Dangerous

Testing for Stationarity

Order of Integration

Moving Average Representation/Wold's Theorem

Back to Order of Integration

Testing for $I(0)$

Inductively Building Up Orders of Integration

Breaking Down Orders of Integration

Important Take-Away

Real Data

IMPORTANT NOTE

Note: Returns Analysis

Cointegration

Def: Linear Combination

Formal Definition

Example

Intuition

Simulated Data Example

Testing for Cointegration

Real Data Example

This is only a forecast!

Existing Tests