import numpy as np
import pandas as pd

import statsmodels
import statsmodels.api as sm
from statsmodels.tsa.stattools import coint
# Seed NumPy's global random number generator so that the simulated series
# generated below with np.random.normal are the same on every run.
np.random.seed(107)

import matplotlib.pyplot as plt

# Simulate security X: draw 100 daily returns from a normal distribution with
# mean 0 and standard deviation 1.
X_returns = np.random.normal(0, 1, 100) # Generate the daily returns
# Cumulatively sum the returns to get a price path, then shift every price up
# by 50 so the series sits in a reasonable range.
X = pd.Series(np.cumsum(X_returns), name='X') + 50
X.plot();

# Simulate security Y as X plus a constant offset of 5 plus a separate draw of
# normal(0, 1) noise, so Y - X is just 5 plus that noise.
some_noise = np.random.normal(0, 1, 100)
Y = X + 5 + some_noise
Y.name = 'Y'
# Plot both simulated price series on one set of axes.
pd.concat([X, Y], axis=1).plot();

(Y - X).plot() # Plot the spread
plt.axhline((Y - X).mean(), color='red', linestyle='--') # Add the mean
plt.xlabel('Time')
plt.legend(['Price Spread', 'Mean']);

# Compute the p-value of the cointegration test between X and Y.
# The p-value informs us as to whether the spread between the two time series
# is stationary around its mean.
# coint returns three values here; the score and p-value are kept and the
# third is discarded. maxlag=1 is passed straight through to coint.
score, pvalue, _ = coint(X,Y, maxlag=1)
print(pvalue)

2.050341865341208e-16

# Correlation between the two simulated price series, for comparison with the
# cointegration p-value printed above (pandas Series.corr, default settings).
X.corr(Y)

0.9497090646385932

# Correlation without cointegration: two separately drawn return streams with
# different positive means (1 and 2), each cumulated into a price path.
X_returns = np.random.normal(loc=1, scale=1, size=100)
Y_returns = np.random.normal(loc=2, scale=1, size=100)

X_diverging = pd.Series(X_returns.cumsum(), name='X')
Y_diverging = pd.Series(Y_returns.cumsum(), name='Y')

# Plot both paths on the same axes (columns are named 'X' and 'Y').
pd.DataFrame({'X': X_diverging, 'Y': Y_diverging}).plot();

# Report the correlation and the cointegration-test p-value side by side.
print(f'Correlation: {X_diverging.corr(Y_diverging)!s}')
score, pvalue, _ = coint(X_diverging, Y_diverging, maxlag=1)
print(f'Cointegration test p-value: {pvalue!s}')

Correlation: 0.9931343801275689
Cointegration test p-value: 0.8815557674695229

# Y2: 1000 draws of normal(0, 1) noise shifted up by 20.
Y2 = 20 + pd.Series(np.random.normal(loc=0, scale=1, size=1000), name='Y2')

# Y3: a square wave that alternates between 30 and 10 every 100 samples,
# starting at 30. It is built on a copy of Y2 so that it keeps Y2's index,
# name and float dtype while Y2 itself is left untouched.
Y3 = Y2.copy()
Y3[:] = np.tile(np.repeat([30.0, 10.0], 100), 5)

# Overlay the noise series Y2 and the square wave Y3 on the same axes and fix
# the y-range to [0, 40] so both are fully visible.
Y2.plot()
Y3.plot()
plt.ylim([0, 40]);

# correlation is nearly zero
print('Correlation: ' + str(Y2.corr(Y3)))
# Same cointegration test as used for the earlier examples (maxlag=1); only
# the p-value is reported.
score, pvalue, _ = coint(Y2,Y3, maxlag=1)
print('Cointegration test p-value: ' + str(pvalue))

Correlation: -0.04130406958091664
Cointegration test p-value: 0.0

def find_cointegrated_pairs(data, significance=0.05, maxlag=1):
    """Run a cointegration test on every pair of columns in *data*.

    Each unordered pair of columns is tested exactly once with ``coint``
    (column ``i`` against column ``j`` for ``i < j``), so only the upper
    triangle of the returned matrices is filled in.

    Parameters
    ----------
    data : DataFrame-like
        One column per series. Must provide ``shape``, ``keys()`` and column
        lookup by key.
    significance : float, optional
        A pair is reported when its p-value is strictly below this threshold.
        Defaults to 0.05, the value that was previously hard-coded.
    maxlag : int, optional
        Passed through to ``coint``. Defaults to 1, the value that was
        previously hard-coded.

    Returns
    -------
    score_matrix : numpy.ndarray
        ``(n, n)`` array of test scores; entries that were not tested
        (diagonal and lower triangle) stay at 0.
    pvalue_matrix : numpy.ndarray
        ``(n, n)`` array of p-values; entries that were not tested stay at 1.
    pairs : list of tuple
        ``(key_i, key_j)`` for every tested pair whose p-value is below
        ``significance``, in the order the pairs were tested.
    """
    n = data.shape[1]
    score_matrix = np.zeros((n, n))
    pvalue_matrix = np.ones((n, n))
    keys = data.keys()
    pairs = []
    for i in range(n):
        # The first series of the pair does not change in the inner loop.
        first = data[keys[i]]
        for j in range(i + 1, n):
            # coint returns more than two values; only score and p-value
            # are used here.
            score, pvalue = coint(first, data[keys[j]], maxlag=maxlag)[:2]
            score_matrix[i, j] = score
            pvalue_matrix[i, j] = pvalue
            if pvalue < significance:
                pairs.append((keys[i], keys[j]))
    return score_matrix, pvalue_matrix, pairs

from quantrocket.master import get_securities
from quantrocket import get_prices

# Look up the candidate securities by symbol. The index of the returned frame
# is used below as the list of sids, and its Symbol column as display names.
securities = get_securities(symbols=['ABGB', 'ASTI', 'CSUN', 'DQ', 'FSLR','SPY'], vendors='usstock')

# Request daily Close prices for those sids between 2014-02-01 and 2015-01-01,
# then keep only the 'Close' entry of the result via .loc.
prices_df = get_prices(
    'usstock-1d-bundle',
    data_frequency='daily',
    sids=securities.index.tolist(),
    fields=['Close'],
    start_date='2014-02-01', 
    end_date='2015-01-01').loc['Close']

# Relabel the price columns using the securities-index -> Symbol mapping so
# later cells can select series by ticker symbol.
sids_to_symbols = securities.Symbol.to_dict()
prices_df = prices_df.rename(columns=sids_to_symbols)

# Heatmap to show the p-values of the cointegration test between each pair of
# stocks. find_cointegrated_pairs only fills the upper triangle (the rest of
# the p-value matrix stays at 1), and the mask below is True wherever the
# p-value is >= 0.05, so only pairs below that threshold are drawn.
scores, pvalues, pairs = find_cointegrated_pairs(prices_df)
import seaborn
seaborn.heatmap(
    pvalues, 
    xticklabels=prices_df.columns, 
    yticklabels=prices_df.columns, 
    cmap='RdYlGn_r', 
    mask = (pvalues >= 0.05))
# Pairs whose p-value fell below the threshold used by find_cointegrated_pairs.
print(pairs)

[('ABGB', 'FSLR')]

# Select the two price series of the pair reported above; S1 and S2 are used
# by all of the spread calculations that follow.
S1 = prices_df['ABGB']
S2 = prices_df['FSLR']

# Re-run the cointegration test for just this pair (same maxlag=1 as in
# find_cointegrated_pairs) and display its p-value.
score, pvalue, _ = coint(S1, S2, maxlag=1)
pvalue

0.04011151339508293

# Estimate the hedge ratio b by regressing S2 on S1 with an added constant
# column; b is the fitted coefficient on the 'ABGB' column.
results = sm.OLS(S2, sm.add_constant(S1)).fit()
b = results.params['ABGB']

# The spread is what remains of S2 after removing b units of S1.
spread = S2 - b * S1
spread.plot()
plt.axhline(y=spread.mean(), color='black')
plt.legend(['Spread']);

# Alternative to the regression spread: the element-wise ratio of the two
# price series, plotted together with a horizontal line at its mean.
ratio = S1/S2
ratio.plot()
plt.axhline(ratio.mean(), color='black')
plt.legend(['Price Ratio']);

def zscore(series):
    """Standardise *series* by its own mean and standard deviation.

    The mean comes from ``series.mean()`` and the standard deviation from
    ``np.std(series)``; the result is ``(series - mean) / std``.
    """
    centered = series - series.mean()
    scale = np.std(series)
    return centered / scale

# Plot the z-scored spread with reference lines at its mean and at +1 / -1.
# The four legend labels are listed in the order the four artists are drawn.
zscore(spread).plot()
plt.axhline(zscore(spread).mean(), color='black')
plt.axhline(1.0, color='red', linestyle='--')
plt.axhline(-1.0, color='green', linestyle='--')
plt.legend(['Spread z-score', 'Mean', '+1', '-1']);

# Get the spread between the 2 stocks using a rolling hedge ratio.
#
# For each day n from `window` onwards, fit S2 on S1 by OLS (no constant
# term) over the `window` observations strictly before n, and take the single
# fitted coefficient as that day's beta. S2 is the dependent variable so that
# beta is the coefficient the spread formula below expects
# (spread = S2 - beta * S1), matching the direction of the static hedge-ratio
# regression earlier in this file.
window = 30
# The first `window` days have no full look-back window, so their beta is NaN.
# min() keeps the padding from exceeding the series length on short inputs.
rolling_beta = [np.nan] * min(window, len(S1))
for n in range(window, len(S1)):
    # .iloc makes the positional slicing explicit: rows n-window .. n-1.
    y = S2.iloc[(n - window):n]
    X = S1.iloc[(n - window):n]
    # params is indexed by label, so take its only entry by position.
    rolling_beta.append(sm.OLS(y, X).fit().params.iloc[0])

# Align the betas with the price index so the arithmetic below is element-wise
# by date.
rolling_beta = pd.Series(rolling_beta, index=S2.index)

spread = S2 - rolling_beta * S1
spread.name = 'spread'

# Get the 1 day moving average of the price spread.
# With window=1 each value is averaged only with itself.
spread_mavg1 = spread.rolling(window=1).mean()
spread_mavg1.name = 'spread 1d mavg'

# Get the 30 day moving average
spread_mavg30 = spread.rolling(30).mean()
spread_mavg30.name = 'spread 30d mavg'

# Plot both moving averages against their shared index.
plt.plot(spread_mavg1.index, spread_mavg1.values)
plt.plot(spread_mavg30.index, spread_mavg30.values)

plt.legend(['1 Day Spread MAVG', '30 Day Spread MAVG'])

plt.ylabel('Spread');

# Take a rolling 30 day standard deviation
std_30 = spread.rolling(30).std()
std_30.name = 'std 30d'

# Compute the z score for each day: how far the 1 day average sits from the
# 30 day average, measured in units of the rolling 30 day standard deviation.
zscore_30_1 = (spread_mavg1 - spread_mavg30)/std_30
zscore_30_1.name = 'z-score'
zscore_30_1.plot()
# Reference lines at 0 and at +1.
plt.axhline(0, color='black')
plt.axhline(1.0, color='red', linestyle='--');

# Plot the prices scaled down along with the rolling z-score (plotted as-is,
# not negated).
# just divide the stock prices by 10 to make viewing it on the plot easier
plt.plot(S1.index, S1.values/10)
plt.plot(S2.index, S2.values/10)
plt.plot(zscore_30_1.index, zscore_30_1.values)
plt.legend(['S1 Price / 10', 'S2 Price / 10', 'Price Spread Rolling z-Score']);

# Out-of-sample check: fetch the same two symbols again, this time for
# 2015-01-01 to 2016-01-01 (the earlier fetch covered 2014-02-01 to
# 2015-01-01).
securities = get_securities(symbols=['ABGB', 'FSLR'], vendors='usstock')

prices_df = get_prices(
    'usstock-1d-bundle',
    data_frequency='daily',
    sids=securities.index.tolist(),
    fields=['Close'], 
    start_date='2015-01-01', 
    end_date='2016-01-01').loc['Close']

# Relabel the price columns using the securities-index -> Symbol mapping.
sids_to_symbols = securities.Symbol.to_dict()
prices_df = prices_df.rename(columns=sids_to_symbols)

# Rebind S1 and S2 to the out-of-sample price series.
S1 = prices_df['ABGB']
S2 = prices_df['FSLR']

# NOTE(review): every other coint call in this file passes maxlag=1; this one
# does not, so coint's default lag handling applies here. Confirm whether the
# difference is intended before comparing this p-value with the in-sample one.
score, pvalue, _ = coint(S1, S2)
print('p-value:', pvalue)

p-value: 0.9867017226340101

Introduction to Pairs Trading

Generating Two Fake Securities

Cointegration

Testing for Cointegration

Correlation vs. Cointegration

Correlation Without Cointegration

Cointegration Without Correlation

Hedging

The Trick: Where it all comes together

Going Long the Spread

Going Short the Spread

Specific Bets

Finding real securities that behave like this

WARNING: This will incur a large amount of multiple comparisons bias.

Looking for Cointegrated Pairs of Alternative Energy Securities

Calculating the Spread

WARNING

Simple Strategy:

Trading using constantly updating statistics

Moving Averages

Out of Sample Test

Implementation

Further Research