# Pipeline settings shared by every step below.
DB = "usstock-1d" # price database queried with get_prices and passed to the strategies as DB
UNIVERSE = "us-etf" # universe of securities whose prices are loaded
MIN_DOLLAR_VOLUME = 80e6 # $80M USD; minimum mean (close * volume) over the cointegration window
COINTEGRATION_CONFIDENCE_LEVEL = 90 # require cointegration at 90%, 95%, or 99% confidence
COINTEGRATION_START_DATE = "2011-01-01" # cointegration test starts here...
COINTEGRATION_END_DATE = "2011-12-31" # ...and ends here
IN_SAMPLE_END_DATE = "2015-12-31" # in-sample backtest starts at the cointegration end date and ends here

from quantrocket import get_prices

# Load closes and volumes for the whole universe over the cointegration window
prices = get_prices(
    DB,
    universes=UNIVERSE,
    start_date=COINTEGRATION_START_DATE,
    end_date=COINTEGRATION_END_DATE,
    fields=["Close", "Volume"],
)
closes, volumes = prices.loc["Close"], prices.loc["Volume"]

# Mean dollar volume (close * volume) per security over the window
dollar_volumes = closes.mul(volumes).mean()

# Keep the securities that meet the minimum.
# NOTE: the filter is inclusive (>=) even though the message below says ">".
adequate_dollar_volumes = dollar_volumes.loc[dollar_volumes.ge(MIN_DOLLAR_VOLUME)]

print(f"{len(adequate_dollar_volumes.index)} of {len(closes.columns)} have dollar volume > {MIN_DOLLAR_VOLUME}")

90 of 1473 have dollar volume > 80000000.0

import itertools

# Enumerate every unordered pair of the securities that passed the dollar volume filter
all_pairs = [*itertools.combinations(adequate_dollar_volumes.index, 2)]

print(f"formed {len(all_pairs)} total pairs")

formed 4005 total pairs

from statsmodels.tsa.vector_ar.vecm import coint_johansen
import numpy as np
from IPython.display import clear_output

# The 90%, 95%, and 99% critical values for the trace statistic and maximum
# eigenvalue statistic are stored in the first, second, and third column of
# cvt and cvm, respectively. The column does not depend on the pair being
# tested, so it is resolved once before the loop; an unsupported
# COINTEGRATION_CONFIDENCE_LEVEL raises KeyError right away.
confidence_level_cols = {
    90: 0,
    95: 1,
    99: 2
}
confidence_level_col = confidence_level_cols[COINTEGRATION_CONFIDENCE_LEVEL]

# Pairs that pass the Johansen test, as dicts of {sid_1: <sid>, sid_2: <sid>}
cointegrating_pairs = []

for i, (sid_1, sid_2) in enumerate(all_pairs):

    # Display progress (1-based, consistent with the backtest progress message)
    clear_output(wait=True)
    print(f"Running Johansen test on pair {i+1} of {len(all_pairs)}")

    # Only keep dates on which both securities have a close
    pair_closes = closes[[sid_1, sid_2]].dropna()

    # Skip pairs with less than 90 non-null observations
    if len(pair_closes) < 90:
        continue

    # The second and third parameters indicate constant term, with a lag of 1.
    # See Chan, Algorithmic Trading, chapter 2.
    result = coint_johansen(pair_closes, 0, 1)

    trace_crit_value = result.cvt[:, confidence_level_col]
    eigen_crit_value = result.cvm[:, confidence_level_col]

    # The trace statistic and maximum eigenvalue statistic are stored in lr1 and lr2;
    # see if they exceeded the confidence threshold
    if np.all(result.lr1 >= trace_crit_value) and np.all(result.lr2 >= eigen_crit_value):

        cointegrating_pairs.append(dict(
            sid_1=sid_1,
            sid_2=sid_2
        ))

clear_output()

len(cointegrating_pairs)

81

import io
import pandas as pd
from quantrocket.master import download_master_file

# Download Symbol and Name for every sid that has a dollar volume figure
f = io.StringIO()
download_master_file(f, sids=dollar_volumes.index.tolist(), fields=["Symbol","Name"])

# Index the listing by Sid and convert it to a dict of
# {<sid>: {Symbol: <symbol>, Name: <name>}}
securities = pd.read_csv(f, index_col="Sid").to_dict(orient="index")

from quantrocket.moonshot import backtest
from moonchart import DailyPerformance

# One row of in-sample performance per cointegrating pair
all_results = []
for i, pair in enumerate(cointegrating_pairs):

    # Look up the symbol and name of each leg of the pair
    sid_1, sid_2 = pair["sid_1"], pair["sid_2"]
    security_1, security_2 = securities[sid_1], securities[sid_2]
    symbol_1, symbol_2 = security_1["Symbol"], security_2["Symbol"]
    name_1, name_2 = security_1["Name"], security_2["Name"]

    # Display progress
    clear_output(wait=True)
    print(f"Backtesting pair {i+1} of {len(cointegrating_pairs)}: {symbol_1}/{symbol_2} ({name_1} and {name_2})")

    # Run the "pairs" strategy on this pair, from the cointegration end date
    # through the in-sample end date, capturing the results CSV in memory
    f = io.StringIO()
    backtest(
        "pairs",
        start_date=COINTEGRATION_END_DATE,
        end_date=IN_SAMPLE_END_DATE,
        params={"DB": DB, "SIDS": [sid_1, sid_2]},
        filepath_or_buffer=f,
    )

    # Get Sharpe and CAGR (first entry of each)
    perf = DailyPerformance.from_moonshot_csv(f)
    sharpe, cagr = perf.sharpe.iloc[0], perf.cagr.iloc[0]

    all_results.append(dict(
        sid_1=sid_1,
        sid_2=sid_2,
        symbol_1=symbol_1,
        symbol_2=symbol_2,
        name_1=name_1,
        name_2=name_2,
        sharpe=sharpe,
        cagr=cagr,
    ))

clear_output()
results = pd.DataFrame(all_results)

# Rank the pairs by in-sample Sharpe ratio, best first, and keep the top 5
results = results.sort_values(by="sharpe", ascending=False)
best_pairs = results.iloc[:5]
best_pairs

# Strategy codes of the selected pairs, reused by the subsequent backtest
best_pairs_codes = []

# Print one PairsStrategy subclass definition per selected pair
for pair in best_pairs.itertuples(index=False):

    strategy_code = f"pair-{pair.symbol_1.lower()}-{pair.symbol_2.lower()}"
    best_pairs_codes.append(strategy_code)

    subclass_code = f"""
class {pair.symbol_1}_{pair.symbol_2}_Pair(PairsStrategy):
    
    CODE = "{strategy_code}"
    DB = "{DB}"
    SIDS = [
        "{pair.sid_1}", # {pair.symbol_1}
        "{pair.sid_2}" # {pair.symbol_2}
    ]"""
    print(subclass_code)

# Generated definition for the XLK/UCO pair (TECHNOLOGY SELECT SECT SPDR and
# PROSHARES ULTRA BLOOMBERG CR); it follows the template printed by the loop above.
# NOTE(review): PairsStrategy is not defined or imported in the visible source --
# presumably this is meant to be pasted into the module that defines it; confirm.
class XLK_UCO_Pair(PairsStrategy):
    
    CODE = "pair-xlk-uco"
    DB = "usstock-1d"
    SIDS = [
        "FIBBG000BJ7007", # XLK
        "FIBBG000CSVPZ6" # UCO
    ]

# Generated definition for the GDX/QID pair (VANECK GOLD MINERS and
# PROSHARES ULTRASHORT QQQ); it follows the template printed by the loop above.
# NOTE(review): PairsStrategy is not defined or imported in the visible source --
# presumably this is meant to be pasted into the module that defines it; confirm.
class GDX_QID_Pair(PairsStrategy):
    
    CODE = "pair-gdx-qid"
    DB = "usstock-1d"
    SIDS = [
        "FIBBG000PLNQN7", # GDX
        "FIBBG000PT7GJ5" # QID
    ]

# Generated definition for the EWW/MOO pair (ISHARES MSCI MEXICO ETF and
# VANECK AGRIBUSINESS); it follows the template printed by the loop above.
# NOTE(review): PairsStrategy is not defined or imported in the visible source --
# presumably this is meant to be pasted into the module that defines it; confirm.
class EWW_MOO_Pair(PairsStrategy):
    
    CODE = "pair-eww-moo"
    DB = "usstock-1d"
    SIDS = [
        "FIBBG000BK42M9", # EWW
        "FIBBG000KJ4073" # MOO
    ]

# Generated definition for the XLK/GDX pair (TECHNOLOGY SELECT SECT SPDR and
# VANECK GOLD MINERS); it follows the template printed by the loop above.
# NOTE(review): PairsStrategy is not defined or imported in the visible source --
# presumably this is meant to be pasted into the module that defines it; confirm.
class XLK_GDX_Pair(PairsStrategy):
    
    CODE = "pair-xlk-gdx"
    DB = "usstock-1d"
    SIDS = [
        "FIBBG000BJ7007", # XLK
        "FIBBG000PLNQN7" # GDX
    ]

# Generated definition for the XLK/SKF pair (TECHNOLOGY SELECT SECT SPDR and
# PROSHARES ULTSHRT FINANCIALS); it follows the template printed by the loop above.
# NOTE(review): PairsStrategy is not defined or imported in the visible source --
# presumably this is meant to be pasted into the module that defines it; confirm.
class XLK_SKF_Pair(PairsStrategy):
    
    CODE = "pair-xlk-skf"
    DB = "usstock-1d"
    SIDS = [
        "FIBBG000BJ7007", # XLK
        "FIBBG000QXGKF0" # SKF
    ]

# Backtest the selected pair strategies starting at the in-sample end date
# (no end date is given) and write the results to a CSV file
backtest(
    best_pairs_codes,
    start_date=IN_SAMPLE_END_DATE,
    filepath_or_buffer="best_pairs_results.csv",
)

# Render a tear sheet from the saved results
from moonchart import Tearsheet
Tearsheet.from_moonshot_csv("best_pairs_results.csv")

Pairs Selection Pipeline

Variables

Step 1: Filter by dollar volume

Step 2: Find Cointegrating Pairs

Step 3: Run In-Sample Backtests on All Cointegrating Pairs

Step 4: Out-of-sample Backtest

	sid_1	sid_2	symbol_1	symbol_2	name_1	name_2	sharpe	cagr
31	FIBBG000BJ7007	FIBBG000CSVPZ6	XLK	UCO	TECHNOLOGY SELECT SECT SPDR	PROSHARES ULTRA BLOOMBERG CR	1.087876	0.358299
76	FIBBG000PLNQN7	FIBBG000PT7GJ5	GDX	QID	VANECK GOLD MINERS	PROSHARES ULTRASHORT QQQ	0.784192	0.184588
38	FIBBG000BK42M9	FIBBG000KJ4073	EWW	MOO	ISHARES MSCI MEXICO ETF	VANECK AGRIBUSINESS	0.760353	0.075929
34	FIBBG000BJ7007	FIBBG000PLNQN7	XLK	GDX	TECHNOLOGY SELECT SECT SPDR	VANECK GOLD MINERS	0.746456	0.123332
36	FIBBG000BJ7007	FIBBG000QXGKF0	XLK	SKF	TECHNOLOGY SELECT SECT SPDR	PROSHARES ULTSHRT FINANCIALS	0.687180	0.107301