
© Copyright Quantopian Inc.
© Modifications Copyright QuantRocket LLC
Licensed under the Creative Commons Attribution 4.0.

Disclaimer

Residuals Analysis

By Chris Fenaroli and Max Margenot

Linear Regression

Linear regression is one of our most fundamental modeling techniques. We use it to estimate a linear relationship between a set of independent variables $X_i$ and a dependent outcome variable $y$. Our model takes the form of:

$$ y_i = \beta_{0} + \beta_{1} x_{i, 1} + \dots + \beta_{p} x_{i, p} + \epsilon_i = x_i'\beta + \epsilon_i $$

For $i \in \{1, \dots, n\}$, where $n$ is the number of observations. We write this in vector form as:

$$ y = X\beta + \epsilon $$

Where $y$ is an $n \times 1$ vector, $X$ is an $n \times p$ matrix, $\beta$ is a $p \times 1$ vector of coefficients, and $\epsilon$ is a vector of error terms, assumed to be normally distributed with mean zero and constant variance. Typically we call a model with $p = 1$ a simple linear regression and a model with $p > 1$ a multiple linear regression. More background information on regressions can be found in the lectures on simple linear regression and multiple linear regression.

Whenever we build a model, there will be gaps between what the model predicts and what is observed in the sample. The differences between these values are known as the residuals of the model and can be used to check some of the basic assumptions that go into it. The key assumptions to check for are that the relationship is actually linear, that the errors (and hence the residuals) have constant variance (homoscedasticity), and that the errors are not autocorrelated.

We can use the residuals to help diagnose whether the relationship we have estimated is real or spurious.

Statistical error is a similar metric associated with regression analysis, with one important difference: while residuals quantify the gap between a regression model's predictions and the observed sample, statistical error is the difference between an observed value and its unobservable true expected value. We use residuals in an attempt to estimate this error.

Simple Linear Regression

First we'll define a function that performs linear regression and plots the results.
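A minimal sketch of such a helper, assuming statsmodels and matplotlib (the function name linreg and its details are illustrative, not the original notebook code):

```python
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm

def linreg(X, Y):
    """Regress Y on X with an intercept, plot the data and fitted line,
    and return the fitted statsmodels results object."""
    X_with_const = sm.add_constant(X)           # adds the beta_0 column of ones
    model = sm.OLS(Y, X_with_const).fit()
    b0, b1 = model.params                       # intercept and slope

    plt.scatter(X, Y, alpha=0.5)                # raw observations
    x_line = np.linspace(np.min(X), np.max(X), 100)
    plt.plot(x_line, b0 + b1 * x_line, 'r')     # line of best fit
    plt.xlabel('X')
    plt.ylabel('Y')
    return model
```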

Let's define a toy relationship between $X$ and $Y$ that we can model with a linear regression. Here we define the relationship and construct a model on it, drawing the determined line of best fit with the regression parameters.
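For example, generating toy data along these lines (the coefficients, noise level, and seed are illustrative assumptions):

```python
np.random.seed(42)                              # for reproducibility
n = 100
X = np.random.rand(n) * 50                      # independent variable
Y = 2 * X + 10 + np.random.normal(0, 10, n)     # linear relationship plus noise

model = linreg(X, Y)
print(model.params)                             # estimated intercept and slope
```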

This toy example has some generated noise, but all real data will also have noise. This is inherent in sampling from any real-world data-generating process. As a result, our line of best fit will never exactly fit the data (which is why it is only "best", not "perfect"). Having a model that fits every single observation you have is a sure sign of overfitting.

For all fit models, there will be a difference between what the regression model predicts and what was observed, which is where residuals come in.

Residuals

The definition of a residual is the difference between what is observed in the sample and what is predicted by the regression. For any residual $r_i$, we express this as

$$r_i = Y_i - \hat{Y}_i$$

Where $Y_i$ is the observed $Y$-value and $\hat{Y}_i$ is the predicted $Y$-value. We plot these differences on the following graph:
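Here is a sketch of such a plot, using the linreg helper above and drawing each residual as a vertical segment between the observed and fitted values:

```python
model = linreg(X, Y)                 # re-draws the scatter and fitted line
Y_hat = model.predict()              # fitted values

# draw each residual as a vertical segment from the fitted line to the observation
plt.vlines(X, np.minimum(Y, Y_hat), np.maximum(Y, Y_hat), colors='grey', alpha=0.5)
```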

We can pull the residuals directly out of the fit model.
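With a fitted statsmodels model, they are available as an attribute of the results object:

```python
residuals = model.resid      # observed minus fitted values
print(residuals[:5])
```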

Diagnosing Residuals

Many of the assumptions that are necessary to have a valid linear regression model can be checked by identifying patterns in the residuals of that model. We can make a quick visual check by looking at the residual plot of a given model.

With a residual plot, we look at the predicted values of the model versus the residuals themselves. What we want to see is just a cloud of unrelated points, like so:
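A sketch of such a residual plot, using the model fit earlier:

```python
plt.scatter(model.predict(), model.resid, alpha=0.5)
plt.axhline(0, color='red')                 # reference line at residual = 0
plt.xlabel('Predicted values')
plt.ylabel('Residuals')
```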

What we want is a fairly random distribution of residuals. The points should form no discernible pattern. This would indicate that a plain linear model is likely a good fit. If we see any sort of trend, this might indicate the presence of autocorrelation or heteroscedasticity in the model.

Appropriateness of a Linear Model

By looking for patterns in residual plots we can determine whether a linear model is appropriate in the first place. A plain linear regression would not be appropriate for an underlying relationship of the form:

$$Y = \beta_0 + \beta_1 X^2$$

as a linear function would not be able to fully explain the relationship between $X$ and $Y$.

If the relationship is not a good fit for a linear model, the residual plot will show a distinct pattern. In general, a residual plot of a linear regression on a non-linear relationship will show bias and be asymmetrical with respect to the residual = 0 line, while a residual plot of a linear regression on a linear relationship will be roughly symmetrical about the residual = 0 axis.

As an example, let's consider a new relationship between the variables $X$ and $Y$ that incorporates a quadratic term.
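A sketch of such data (the coefficients are illustrative assumptions; a concave quadratic is used so that a straight-line fit leaves the "inverted-U" residual pattern discussed below):

```python
# concave quadratic relationship plus noise
Y_quadratic = -0.5 * X ** 2 + 30 * X + np.random.normal(0, 20, n)
model_quadratic = linreg(X, Y_quadratic)

plt.figure()
plt.scatter(model_quadratic.predict(), model_quadratic.resid, alpha=0.5)
plt.axhline(0, color='red')
plt.xlabel('Predicted values')
plt.ylabel('Residuals')
```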

The "inverted-U" shape shown by the residuals is a sign that a non-linear model might be a better fit than a linear one.

Heteroscedasticity

One of the main assumptions behind a linear regression is that the underlying data has a constant variance. If some parts of the data have a different variance from other parts, the data is not appropriate for a linear regression. Heteroscedasticity refers to data with non-constant variance; homoscedasticity refers to data with constant variance.

Significant heteroscedasticity invalidates linear regression results by biasing the standard errors of the estimated coefficients. As a result, we can't trust the outcomes of significance tests and confidence intervals generated from the model and its parameters.

To avoid these consequences it is important to use residual plots to check for heteroscedasticity and adjust if necessary.

As an example of detecting and correcting heteroscedasticity, let's consider yet another relationship between $X$ and $Y$:
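A sketch of such a relationship (the functional form is an illustrative assumption; the noise term is scaled by $X$ so its variance grows along the $x$-axis, and the series is kept strictly positive for the transformations used later):

```python
# noise scaled by X => variance grows with X (heteroscedastic)
Y_heteroscedastic = 100 + 2 * X + np.random.uniform(-1, 1, n) * X
model_hetero = linreg(X, Y_heteroscedastic)

plt.figure()
plt.scatter(model_hetero.predict(), model_hetero.resid, alpha=0.5)
plt.axhline(0, color='red')
plt.xlabel('Predicted values')
plt.ylabel('Residuals')
```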

Heteroscedasticity often manifests as this spread, giving us a tapered cloud in one direction or another. As we move along in the $x$-axis, the magnitudes of the residuals are clearly increasing. A linear regression is unable to explain this varying variability and the regression standard errors will be biased.

Statistical Methods for Detecting Heteroscedasticity

Generally, we want to back up qualitative observations on a residual plot with a quantitative method. The residual plot led us to believe that the data might be heteroscedastic. Let's confirm that result with a statistical test.

A common way to test for the presence of heteroscedasticity is the Breusch-Pagan hypothesis test. It's good to combine the qualitative analysis of a residual plot with the quantitative analysis of at least one hypothesis test. We could add the White test as well, but for now we will use only Breusch-Pagan to test our relationship above. A function exists in the statsmodels package called het_breuschpagan that simplifies the computation:
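A sketch of the test applied to the model above (variable names follow the earlier snippets):

```python
from statsmodels.stats.diagnostic import het_breuschpagan

# het_breuschpagan takes the residuals and the regressors (including the constant)
lm_stat, lm_pvalue, f_stat, f_pvalue = het_breuschpagan(
    model_hetero.resid, model_hetero.model.exog)
print('Breusch-Pagan p-value:', lm_pvalue)
```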

We set our significance level at $\alpha = 0.05$, so a Breusch-Pagan p-value below $0.05$ tells us that the relationship is heteroscedastic. For more on hypothesis tests and interpreting p-values, refer to the lecture on hypothesis testing. Using a hypothesis test bears the risk of a false positive or a false negative, which is why it can be good to confirm with additional tests if we are skeptical.

Adjusting for Heteroscedasticity

If, after creating a residual plot and conducting tests, you believe you have heteroscedasticity, there are a number of methods you can use to attempt to adjust for it. The three we will focus on are differences analysis, log transformations, and Box-Cox transformations.

Differences Analysis

A differences analysis involves looking at the first-order differences between adjacent values. With this, we are looking at the period-to-period changes of a variable rather than directly at its values. Often, by looking at the differences instead of the raw values, we can remove heteroscedasticity and use the resulting model on the differences.
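Computing the first-order differences is a one-liner with numpy:

```python
# first-order differences: element i is Y[i+1] - Y[i]
Y_heteroscedastic_diff = np.diff(Y_heteroscedastic)
```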

Now that we have stored the first-order differences of Y_heteroscedastic in Y_heteroscedastic_diff, let's repeat the regression and residual plot to see if the heteroscedasticity is still present:
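A sketch, regressing the differences on the corresponding values of $X$ (dropping one observation so the lengths match):

```python
model_diff = linreg(X[1:], Y_heteroscedastic_diff)

plt.figure()
plt.scatter(model_diff.predict(), model_diff.resid, alpha=0.5)
plt.axhline(0, color='red')
plt.xlabel('Predicted values')
plt.ylabel('Residuals')
```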

Note: This new regression was conducted on the differences between data, and therefore the regression output must be back-transformed to reach a prediction in the original scale. Since we regressed the differences, we can add our predicted difference onto the original data to get our estimate:

$$\hat{Y_i} = Y_{i-1} + \hat{Y}_{diff}$$

Logarithmic Transformation

Next, we apply a log transformation to the underlying data. A log transformation compresses larger values more than smaller ones, pulling the residuals closer together and ideally removing heteroscedasticity. In many (though not all) cases, a log transformation is sufficient to stabilize the variance of a relationship.
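Taking the log of the series (which is strictly positive in our sketch):

```python
Y_heteroscedastic_log = np.log(Y_heteroscedastic)
```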

Now that we have stored the log transformed version of Y_heteroscedastic in Y_heteroscedastic_log, let's repeat the regression and residual plot to see if the heteroscedasticity is still present:
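Repeating the regression and residual plot on the log-transformed series:

```python
model_log = linreg(X, Y_heteroscedastic_log)

plt.figure()
plt.scatter(model_log.predict(), model_log.resid, alpha=0.5)
plt.axhline(0, color='red')
plt.xlabel('Predicted values')
plt.ylabel('Residuals')
```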

Note: This new regression was conducted on the log of the original data. This means the scale has been altered and the regression estimates will lie on this transformed scale. To bring the estimates back to the original scale, you must back-transform the values using the inverse of the log:

$$\hat{Y} = e^{\widehat{\log(Y)}}$$

Box-Cox Transformation

Finally, we examine the Box-Cox transformation. The Box-Cox transformation is a powerful method that will work on many types of heteroscedastic relationships. The process works by testing all values of $\lambda$ within the range $[-5, 5]$ to see which makes the output of the following equation closest to being normally distributed: $$ Y^{(\lambda)} = \begin{cases} \frac{Y^{\lambda}-1}{\lambda} & : \lambda \neq 0\\ \log{Y} & : \lambda = 0 \end{cases} $$

The "best" $\lambda$ will be used to transform the series along the above function. Instead of having to do all of this manually, we can simply use the scipy function boxcox. We use this to adjust $Y$ and hopefully remove heteroscedasticity.

Note: The Box-Cox transformation can only be used if all the data is positive.
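A sketch using scipy's boxcox, which returns both the transformed series and the $\lambda$ it selected:

```python
import scipy.stats as stats

Y_heteroscedastic_prime, lmbda = stats.boxcox(Y_heteroscedastic)
print('Chosen lambda:', lmbda)
```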

Now that we have stored the power transformed version of Y_heteroscedastic in Y_heteroscedastic_prime, let's repeat the regression and residual plot to see if the heteroscedasticity is still present:
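Repeating the regression and residual plot on the Box-Cox-transformed series:

```python
model_prime = linreg(X, Y_heteroscedastic_prime)

plt.figure()
plt.scatter(model_prime.predict(), model_prime.resid, alpha=0.5)
plt.axhline(0, color='red')
plt.xlabel('Predicted values')
plt.ylabel('Residuals')
```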

Note: Now that the relationship is not heteroscedastic, a linear regression is appropriate. However, because the data was power transformed, the regression estimates will be on a different scale than the original data. This is why it is important to remember to back-transform results using the inverse of the Box-Cox function:

$$\hat{Y} = (Y^{(\lambda)}\lambda + 1)^{1/\lambda}$$

GARCH Modeling

Another approach to dealing with heteroscedasticity is through a GARCH (generalized autoregressive conditional heteroscedasticity) model. More information can be found in the lecture on GARCH modeling.

Residuals and Autocorrelation

Another assumption behind linear regressions is that the residuals are not autocorrelated. A series is autocorrelated when it is correlated with a delayed version of itself. An example of a potentially autocorrelated time series would be daily high temperatures. Today's temperature gives you information about tomorrow's temperature with reasonable confidence (i.e. if it is 90 °F today, you can be very confident that it will not be below freezing tomorrow). A series of fair die rolls, however, would not be autocorrelated, as seeing one roll gives you no information about what the next might be. Each roll is independent of the last.

In finance, stock prices are usually autocorrelated while stock returns are independent from one day to the next. We represent a time dependency on previous values like so:

$$Y_i = Y_{i-1} + \epsilon$$

If the residuals of a model are autocorrelated, you will be able to make predictions about adjacent residuals. In the case of $Y$, we know the series will be autocorrelated because each value is constructed directly from the previous one, so adjacent residuals will tend to be close to one another.
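A sketch of such a series and its residual plot (constructing $Y$ as a random walk per the equation above; the choice of an unrelated regressor is an illustrative assumption):

```python
# construct Y_i = Y_{i-1} + epsilon, i.e. a random walk
Y_autocorrelated = np.zeros(n)
Y_autocorrelated[0] = 50
for t in range(1, n):
    Y_autocorrelated[t] = Y_autocorrelated[t - 1] + np.random.normal(0, 1)

# regress the autocorrelated series on an arbitrary regressor and inspect residuals
X_auto = np.random.rand(n) * 50
model_auto = linreg(X_auto, Y_autocorrelated)

plt.figure()
plt.scatter(model_auto.predict(), model_auto.resid, alpha=0.5)
plt.axhline(0, color='red')
plt.xlabel('Predicted values')
plt.ylabel('Residuals')
```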

Autocorrelation in the residuals is not visually obvious in this example, so our check is there to make absolutely certain.

Statistical Methods for Detecting Autocorrelation

As with all statistical properties, we require a statistical test to ultimately decide whether there is autocorrelation in our residuals or not. To this end, we use a Ljung-Box test.

A Ljung-Box test is used to detect autocorrelation in a time series. The Ljung-Box test examines autocorrelation at all lag intervals below a specified maximum and returns arrays containing the outputs for every tested lag interval.

Let's use the acorr_ljungbox function in statsmodels to test for autocorrelation in the residuals of our above model. We use a max lag interval of $10$, and see if any of the lags have significant autocorrelation:
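A sketch (recent versions of statsmodels return a DataFrame with a test statistic and p-value for each lag; older versions return arrays):

```python
from statsmodels.stats.diagnostic import acorr_ljungbox

ljung_box = acorr_ljungbox(model_auto.resid, lags=10)
print(ljung_box)
```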

Because the Ljung-Box test yielded a p-value below $0.05$ for at least one lag interval, we can conclude that the residuals of our model are autocorrelated.

Adjusting for Autocorrelation

We can adjust for autocorrelation in many of the same ways that we adjust for heteroscedasticity. Let's see if a model on the first-order differences of $Y$ has autocorrelated residuals:
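A sketch, differencing the series and re-running both the regression and the Ljung-Box test:

```python
Y_autocorrelated_diff = np.diff(Y_autocorrelated)
model_auto_diff = linreg(X_auto[1:], Y_autocorrelated_diff)

# re-test the residuals for autocorrelation
print(acorr_ljungbox(model_auto_diff.resid, lags=10))
```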

Note: This new regression was conducted on the differences between data, and therefore the regression output must be back-transformed to reach a prediction in the original scale. Since we regressed the differences, we can add our predicted difference onto the original data to get our estimate:

$$\hat{Y_i} = Y_{i-1} + \hat{Y_{diff}}$$

We can also perform a log transformation, if we so choose. This process is identical to the one we performed on the heteroscedastic data up above, so we will leave it out this time.

Example: Market Beta Calculation

Let's calculate the market beta between AAPL and SPY using a simple linear regression, and then conduct a residual analysis on the regression to ensure the validity of our results. To regress AAPL and SPY, we will focus on their returns, not their prices, and set SPY returns as our independent variable and AAPL returns as our outcome variable. The regression will give us a line of best fit:

$$\hat{r_{AAPL}} = \hat{\beta_0} + \hat{\beta_1}r_{SPY}$$

The slope of the regression line $\hat{\beta_1}$ will represent our market beta: for every 1% change in the returns of SPY, the predicted returns of AAPL will change by $\hat{\beta_1}$%.

Let's start by conducting the regression on the returns of the two assets.
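A sketch of the regression, assuming aapl_closes and spy_closes are pandas Series of daily closing prices over the same date range (the data-loading step is omitted here):

```python
# compute daily percentage returns from the price series
aapl_returns = aapl_closes.pct_change().dropna()
spy_returns = spy_closes.pct_change().dropna()

# regress AAPL returns (dependent) on SPY returns (independent)
market_model = sm.OLS(aapl_returns, sm.add_constant(spy_returns)).fit()
alpha, beta = market_model.params
print('Estimated market beta:', beta)
```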

Our regression yielded an estimated market beta of 1.36; according to the regression, for every 1% in return we see from the SPY, we should see 1.36% from AAPL.

Now that we have the regression results and residuals, we can conduct our residual analysis. Our first step will be to plot the residuals and look for any red flags:
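Plotting the residuals against the predicted returns:

```python
plt.scatter(market_model.predict(), market_model.resid, alpha=0.5)
plt.axhline(0, color='red')
plt.xlabel('Predicted AAPL returns')
plt.ylabel('Residuals')
```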

By simply observing the distribution of residuals, it does not seem as if there are any abnormalities. The distribution is relatively random and no patterns can be observed (the clustering around the origin results from the tendency of returns to cluster around 0 and is not a red flag). Our qualitative conclusion is that the data is homoscedastic and not autocorrelated, and therefore satisfies the assumptions for linear regression.

Breusch-Pagan Heteroscedasticity Test

Our qualitative assessment of the residual plot is nicely supplemented with a couple statistical tests. Let's begin by testing for heteroscedasticity using a Breusch-Pagan test. Using the het_breuschpagan function from the statsmodels package:
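A sketch, re-using het_breuschpagan as above:

```python
lm_stat, lm_pvalue, f_stat, f_pvalue = het_breuschpagan(
    market_model.resid, market_model.model.exog)
print('Breusch-Pagan p-value:', lm_pvalue)
```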

Because the p-value is greater than 0.05, we do not have enough evidence to reject the null hypothesis that the relationship is homoscedastic. This result matches up with our qualitative conclusion.

Ljung-Box Autocorrelation Test

Let's also check for autocorrelation quantitatively using a Ljung-Box test. Using the acorr_ljungbox function from the statsmodels package and the default maximum lag:
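A sketch, letting acorr_ljungbox choose its default maximum lag:

```python
print(acorr_ljungbox(market_model.resid))
```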

Because the Ljung-Box test yielded p-values above 0.05 for all lags, we can conclude that the residuals are not autocorrelated. This result matches up with our qualitative conclusion.

After having visually assessed the residual plot of the regression and then backing it up using statistical tests, we can conclude that the data satisfies the main assumptions and the linear model is valid.

Next Lecture: Dangers of Overfitting

Back to Introduction


This presentation is for informational purposes only and does not constitute an offer to sell, a solicitation to buy, or a recommendation for any security; nor does it constitute an offer to provide investment advisory or other services by Quantopian, Inc. ("Quantopian") or QuantRocket LLC ("QuantRocket"). Nothing contained herein constitutes investment advice or offers any opinion with respect to the suitability of any security, and any views expressed herein should not be taken as advice to buy, sell, or hold any security or as an endorsement of any security or company. In preparing the information contained herein, neither Quantopian nor QuantRocket has taken into account the investment needs, objectives, and financial circumstances of any particular investor. Any views expressed and data illustrated herein were prepared based upon information believed to be reliable at the time of publication. Neither Quantopian nor QuantRocket makes any guarantees as to their accuracy or completeness. All information is subject to change and may quickly become unreliable for various reasons, including changes in market conditions or economic circumstances.