import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import math

# Example of ranking data: tied values share the average of the ranks they span
raw_data = [10, 9, 5, 7, 5]
# .tolist() converts to plain Python floats so the printed list reads
# [5.0, 4.0, ...] regardless of how the installed NumPy reprs its scalars.
ranking = stats.rankdata(raw_data, method='average').tolist()
print('Raw data:', raw_data)
print('Ranking:', ranking)

Raw data: [10, 9, 5, 7, 5]
Ranking: [5.0, 4.0, 1.5, 3.0, 1.5]

## Let's see an example of this
n = 100

def compare_correlation_and_spearman_rank(n, noise):
    """Simulate one noisy monotonic relationship and measure it two ways.

    Draws ``n`` Poisson samples ``X``, builds ``Y = exp(X) + noise * Z`` with
    ``Z`` standard normal, and measures the association between ``X`` and
    ``Y`` with both the Spearman rank correlation and the ordinary
    (Pearson) correlation coefficient.

    Args:
        n: Number of observations to draw.
        noise: Multiplier applied to the Gaussian noise added to ``exp(X)``.

    Returns:
        A tuple ``(r_s, c_c)`` holding the Spearman rank correlation and the
        Pearson correlation coefficient. Both are ``nan`` if a sample has no
        variation.
    """
    X = np.random.poisson(size=n)
    Y = np.exp(X) + noise * np.random.normal(size=n)

    # Tied observations receive the average of the ranks they occupy.
    Xrank = stats.rankdata(X, method='average')
    Yrank = stats.rankdata(Y, method='average')

    # X is integer-valued, so ties are expected. The shortcut
    # 1 - 6*sum(d^2)/(n*(n^2 - 1)) is only exact when there are no ties;
    # the Pearson correlation of the ranks is the general definition and
    # coincides with the shortcut when every rank is distinct.
    r_s = np.corrcoef(Xrank, Yrank)[0, 1]
    c_c = np.corrcoef(X, Y)[0, 1]

    return r_s, c_c

# Repeat the simulation many times to see how each coefficient is distributed
experiments = 1000
# np.empty is the documented allocation routine; every slot is overwritten below
spearman_dist = np.empty(experiments)
correlation_dist = np.empty(experiments)
for i in range(experiments):
    spearman_dist[i], correlation_dist[i] = compare_correlation_and_spearman_rank(n, 1.0)

print('Spearman Rank Coefficient:', np.mean(spearman_dist))
# Compare to the regular correlation coefficient
print('Correlation coefficient:', np.mean(correlation_dist))

Spearman Rank Coefficient: 0.8773016441644164
Correlation coefficient: 0.7722707460760246

# Overlay both sampling distributions on the same axes; the legend entries
# follow the order in which the histograms are drawn.
for dist in (spearman_dist, correlation_dist):
    plt.hist(dist, bins=50, alpha=0.5)
plt.legend(['Spearman Rank', 'Regular Correlation'])
plt.xlabel('Correlation Coefficient')
plt.ylabel('Frequency');

# Sweep the noise level and record the average of each coefficient
n = 100
noises = np.linspace(0, 3, 30)
experiments = 100
# Filled slot by slot in the loop below, so no initial values are needed
spearman = np.empty(len(noises))
correlation = np.empty(len(noises))

for i, noise in enumerate(noises):
    # Run many experiments for each noise setting and average the results
    rank_coef = 0.0
    corr_coef = 0.0
    for _ in range(experiments):
        r_s, c_c = compare_correlation_and_spearman_rank(n, noise)
        rank_coef += r_s
        corr_coef += c_c
    spearman[i] = rank_coef / experiments
    correlation[i] = corr_coef / experiments
    
# Spearman averages are drawn in red, the regular correlation in the default
# color; the legend entries follow the drawing order.
for series, style in ((spearman, {'color': 'r'}), (correlation, {})):
    plt.scatter(noises, series, **style)
plt.legend(['Spearman Rank', 'Regular Correlation'])
plt.xlabel('Amount of Noise')
plt.ylabel('Average Correlation Coefficient')

Text(0, 0.5, 'Average Correlation Coefficient')

n = 100

X = np.random.rand(n)
Xrank = stats.rankdata(X, method='average')
# The second series is X shifted by two positions: two placeholder 1s
# followed by the first n-2 values of X.
Yrank = stats.rankdata([1, 1] + X[:n - 2].tolist(), method='average')

# The sign of each difference is irrelevant because it is squared below
diffs = Xrank - Yrank
r_s = 1 - 6 * np.sum(diffs ** 2) / (n * (n ** 2 - 1))
print(r_s)

0.1037833783378338

# Draw two unrelated uniform samples from a fixed seed so the run is repeatable
np.random.seed(161)
X, Y = np.random.rand(10), np.random.rand(10)

# spearmanr returns the coefficient together with its p-value
r_s = stats.spearmanr(X, Y)
coefficient, p_value = r_s
print('Spearman Rank Coefficient:', coefficient)
print('p-value:', p_value)

Spearman Rank Coefficient: 0.23636363636363633
p-value: 0.5108853175152002

!curl -L 'https://gist.github.com/dursk/82eee65b7d1056b469ab/raw/2b3ad3b3b1b8964a22db73730c24b366e0df51b0/mutual_fund_data.csv' -o 'mutual_fund_data.csv'

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100  1708  100  1708    0     0   5200      0 --:--:-- --:--:-- --:--:--  5200

# Load the downloaded fund table and pull out the two columns being compared
mutual_fund_data = pd.read_csv('mutual_fund_data.csv')
expense, sharpe = (
    mutual_fund_data[column].values
    for column in ('Annual Expense Ratio', 'Three Year Sharpe Ratio')
)

# Scatter the Sharpe ratio against the expense ratio
plt.scatter(expense, sharpe)
plt.xlabel('Expense Ratio')
plt.ylabel('Sharpe Ratio')

# Rank correlation between the two columns, with its p-value
r_S = stats.spearmanr(expense, sharpe)
coefficient, p_value = r_S
print('Spearman Rank Coefficient: ', coefficient)
print('p-value: ', p_value)

Spearman Rank Coefficient:  -0.23757393235503554
p-value:  0.016746509711640854

from quantrocket.master import get_securities
from quantrocket import get_prices

symbol_list = ['AAPL', 'MSFT', 'JNJ', 'HD', 'MON', 'XOM', 'KKD']

securities = get_securities(symbols=symbol_list, vendors='usstock')

# Get the returns over the lookback window
start = '2014-12-01'
end = '2015-01-01'

prices = get_prices(
    'usstock-free-1min',
    data_frequency='daily',
    sids=securities.index.tolist(),
    fields='Close',
    start_date=start,
    end_date=end,
)
# The first row of pct_change() has no prior close to compare against, so drop it
historical_returns = prices.loc['Close'].pct_change().iloc[1:]

sids_to_symbols = securities.Symbol.to_dict()
historical_returns = historical_returns.rename(columns=sids_to_symbols)

# Compute our stock score: the mean daily return of each symbol.
# DataFrame.mean() averages column by column. np.mean(DataFrame) can collapse
# the whole frame into a single scalar on recent pandas, which leaves nothing
# to rank and makes the Spearman coefficient below come out as nan.
scores = historical_returns.mean()
print('Our Scores\n')
print(scores)
print('\n')

# Repeat the calculation over the following (walk-forward) window
start = '2015-01-01'
end = '2015-02-01'
prices = get_prices(
    'usstock-free-1min',
    data_frequency='daily',
    sids=securities.index.tolist(),
    fields='Close',
    start_date=start,
    end_date=end,
)
walk_forward_returns = prices.loc['Close'].pct_change().iloc[1:]
walk_forward_returns = walk_forward_returns.rename(columns=sids_to_symbols)
# Per-symbol mean again, for the same reason as the scores above
walk_forward_returns = walk_forward_returns.mean()
print('The Walk Forward Returns\n')
print(walk_forward_returns)
print('\n')

plt.scatter(scores, walk_forward_returns)
plt.xlabel('Scores')
plt.ylabel('Walk Forward Returns')

# Does the ranking implied by the scores carry over to the next window?
r_s = stats.spearmanr(scores, walk_forward_returns)
print('Correlation Coefficient:' + str(r_s[0]))
print('p-value:' + str(r_s[1]))

Our Scores

-0.0005684113170874466


The Walk Forward Returns

-0.0011568442798346316


Correlation Coefficient:nan
p-value:nan

Measuring Monotonic Relationships

Spearman Rank Correlation

Intuition

Definition

Experiment

Delay in correlation

Built-In Function

Real World Example: Mutual Fund Expense Ratio

Data Source

Real World Use Case: Evaluating a Ranking Model

References