© Copyright Quantopian Inc.
© Modifications Copyright QuantRocket LLC
Licensed under the Creative Commons Attribution 4.0.
By Evgenia "Jenny" Nitishinskaya, Maxwell Margenot, Delaney Granizo-Mackenzie, and Gilbert Wasserman.
import numpy as np
import pandas as pd
import statsmodels.api as sm
# If the observations are in a dataframe, you can use statsmodels.formula.api to do the regression instead
from statsmodels import regression
import matplotlib.pyplot as plt
Multiple linear regression generalizes linear regression, allowing the dependent variable to be a linear function of multiple independent variables. As before, we assume that the variable $Y$ is a linear function of $X_1,\ldots, X_k$:
$$ Y_i = \beta_0 + \beta_1 X_{1i} + \ldots + \beta_k X_{ki} + \epsilon_i $$

Often in finance the form will be written as follows, but it is just the variable name that changes and otherwise the model is identical.

$$ Y_i = \alpha + \beta_1 X_{1i} + \ldots + \beta_k X_{ki} + \epsilon_i $$

for observations $i = 1,2,\ldots, n$. In order to find the plane (or hyperplane) of best fit, we will use the method of ordinary least squares (OLS), which seeks to minimize the squared error between predictions and observations, $\sum_{i=1}^n \epsilon_i^2$. The square makes positive and negative errors equally bad, and magnifies large errors. It also makes the closed-form math behind linear regression nice, but we won't go into that now. For an example of squared error, see the following.
Let's say Y is our actual data, and Y_hat contains the predictions made by linear regression.
Y = np.array([1, 3.5, 4, 8, 12])
Y_hat = np.array([1, 3, 5, 7, 9])
print('Error ' + str(Y_hat - Y))
# Compute squared error
SE = (Y_hat - Y) ** 2
print('Squared Error ' + str(SE))
print('Sum Squared Error ' + str(np.sum(SE)))
Error [ 0.  -0.5  1.  -1.  -3. ]
Squared Error [0.   0.25 1.   1.   9.  ]
Sum Squared Error 11.25
Once we have used this method to determine the coefficients of the regression, we will be able to use new observed values of $X$ to predict values of $Y$.
Each coefficient $\beta_j$ tells us how much $Y_i$ will change if we change $X_j$ by one while holding all of the other independent variables constant. This lets us separate out the contributions of different effects. This assumes, of course, that the linear model is the correct one.
We start by artificially constructing a $Y$, $X_1$, and $X_2$ in which we know the precise relationship.
# Construct a simple linear curve of 1, 2, 3, ...
X1 = np.arange(100)
# Make a parabola and add X1 to it, this is X2
X2 = np.array([i ** 2 for i in range(100)]) + X1
# This is our real Y, constructed using a linear combination of X1 and X2
Y = X1 + X2
plt.plot(X1, label='X1')
plt.plot(X2, label='X2')
plt.plot(Y, label='Y')
plt.legend();
We can use the same function from statsmodels as we did in the simple linear regression lecture.
# Use column_stack to combine independent variables, then add a column of ones so we can fit an intercept
X = sm.add_constant(np.column_stack((X1, X2)))
# Run the model
results = regression.linear_model.OLS(Y, X).fit()
print('Beta_0:', results.params[0])
print('Beta_1:', results.params[1])
print('Beta_2:', results.params[2])
Beta_0: 1.3642420526593924e-12
Beta_1: 1.0000000000003713
Beta_2: 0.9999999999999948
The same care must be taken with these results as with partial derivatives. The formula for $Y$ is ostensibly
$$X_1 + X_2 = X_1 + (X_1^2 + X_1) = 2 X_1 + X_1^2$$

or $2X_1$ plus a parabola.
However, the coefficient of $X_1$ is 1. That is because $Y$ changes by 1 if we change $X_1$ by 1 while holding $X_2$ constant. Multiple linear regression separates out contributions from different variables.
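To see this concretely, here is a minimal sketch (reusing the X1 and Y constructed above) that regresses $Y$ on $X_1$ alone. Without $X_2$ in the model, the estimated slope also absorbs the parabola's contribution, which co-moves with $X_1$, so it is no longer 1.
# A minimal sketch: regress Y on X1 alone and compare to the multiple regression above
simple = regression.linear_model.OLS(Y, sm.add_constant(X1)).fit()
print('Slope from regressing Y on X1 alone:', simple.params[1])
# The slope differs from 1 because the omitted X2 co-moves with X1,
# so its effect gets folded into the X1 coefficient.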
Similarly, running a linear regression on two securities might give a high $\beta$. However, if we bring in a third security (like SPY, which tracks the S&P 500) as an independent variable, we may find that the correlation between the first two securities is almost entirely due to them both being correlated with the S&P 500. This is useful because the S&P 500 may then be a more reliable predictor of both securities than they were of each other. Controlling for the benchmark in this way lets us better gauge the direct relationship between the two securities and avoid confounding it with their shared exposure to the market.
# Load pricing data for two arbitrarily-chosen assets and SPY
from quantrocket.master import get_securities
from quantrocket import get_prices
securities = get_securities(symbols=['SPY', 'AAPL', 'JNJ'], vendors='usstock')
start = '2019-01-01'
end = '2020-01-01'
prices = get_prices('usstock-free-1min', data_frequency='daily', sids=securities.index.tolist(), fields='Close', start_date=start, end_date=end).loc['Close']
sids_to_symbols = securities.Symbol.to_dict()
prices = prices.rename(columns=sids_to_symbols)
asset1 = prices['AAPL']
asset2 = prices['JNJ']
benchmark = prices['SPY']
# First, run a linear regression on the two assets
slr = regression.linear_model.OLS(asset1, sm.add_constant(asset2)).fit()
print('SLR beta of asset2:', slr.params[1])
SLR beta of asset2: 3.013492176301254
# Run multiple linear regression using asset2 and SPY as independent variables
mlr = regression.linear_model.OLS(asset1, sm.add_constant(np.column_stack((asset2, benchmark)))).fit()
prediction = mlr.params[0] + mlr.params[1]*asset2 + mlr.params[2]*benchmark
prediction.name = 'Prediction'
print('MLR beta of asset2:', mlr.params[1], '\nMLR beta of S&P 500:', mlr.params[2])
MLR beta of asset2: -0.6073823796384996
MLR beta of S&P 500: 2.2676397477901746
The next step after running an analysis is determining if we can even trust the results. A good first step is checking to see if anything looks weird in graphs of the independent variables, dependent variables, and predictions.
# Plot the three variables along with the prediction given by the MLR
asset1.plot()
asset2.plot()
benchmark.plot()
prediction.plot(color='y')
plt.ylabel('Price')
plt.legend(bbox_to_anchor=(1,1), loc=2);
# Plot only the dependent variable and the prediction to get a closer look
asset1.plot()
prediction.plot(color='y')
plt.ylabel('Price')
plt.legend();
We can get some statistics about the fit from the result returned by the regression:
mlr.summary()
|  |  |  |  |
|---|---|---|---|
| Dep. Variable: | AAPL | R-squared: | 0.913 |
| Model: | OLS | Adj. R-squared: | 0.912 |
| Method: | Least Squares | F-statistic: | 1305. |
| Date: | Tue, 25 May 2021 | Prob (F-statistic): | 1.11e-132 |
| Time: | 15:26:24 | Log-Likelihood: | -946.89 |
| No. Observations: | 252 | AIC: | 1900. |
| Df Residuals: | 249 | BIC: | 1910. |
| Df Model: | 2 |  |  |
| Covariance Type: | nonrobust |  |  |

|  | coef | std err | t | P>\|t\| | [0.025 | 0.975] |
|---|---|---|---|---|---|---|
| const | -366.8366 | 17.764 | -20.651 | 0.000 | -401.823 | -331.850 |
| x1 | -0.6074 | 0.153 | -3.977 | 0.000 | -0.908 | -0.307 |
| x2 | 2.2676 | 0.050 | 45.552 | 0.000 | 2.170 | 2.366 |

|  |  |  |  |
|---|---|---|---|
| Omnibus: | 1.760 | Durbin-Watson: | 0.100 |
| Prob(Omnibus): | 0.415 | Jarque-Bera (JB): | 1.445 |
| Skew: | -0.122 | Prob(JB): | 0.485 |
| Kurtosis: | 3.280 | Cond. No. | 8.60e+03 |
The validity of these statistics depends on whether or not the assumptions of the linear regression model are satisfied. These are:

* The independent variables are not random.
* The variance of the error term is constant across observations. This is important for evaluating the goodness of the fit.
* The errors are not autocorrelated. The Durbin-Watson statistic reported in the regression summary detects this; a value close to 2 indicates no autocorrelation.
* The errors are normally distributed. If this does not hold, we cannot use some of the statistics, such as the F-test.
Multiple linear regression also requires an additional assumption:

* There is no exact linear relationship between the independent variables. Otherwise, the coefficients cannot be determined uniquely.
If there is an exact linear relationship between any set of the independent variables (that is, some of them are linear combinations of the others), we have multicollinearity. When the variables are dependent on each other in this manner, the estimated $\beta_i$ coefficients become unreliable. The intuition for this can be found in an extreme example where $X_1$ and $X_2$ are perfectly correlated. In that case, linear regression can split the total coefficient between them in any combination without affecting the predictive capability:
$$ 1X_1 + 0X_2 = 0.5X_1 + 0.5X_2 = 0X_1 + 1X_2 $$

While our coefficients may be uninformative in this situation, the overall model may still be accurate provided that there is a good fit between the independent variables and the dependent variable. The best practice for constructing a model where dependence is a problem is to leave out the less descriptive variables that are correlated with the better ones. This improves the model by reducing the chances of overfitting while bringing the $\beta_i$ estimates closer to their true values.
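These assumptions can be checked directly on the fitted model. Below is a minimal diagnostic sketch (reusing the `mlr` fit, `asset2`, and `benchmark` from above): the Durbin-Watson statistic checks for autocorrelation in the residuals (values near 2 suggest little autocorrelation), the Jarque-Bera test checks normality of the residuals, and variance inflation factors (VIFs) flag multicollinearity, with values above roughly 5-10 commonly treated as a warning sign.
# A minimal diagnostic sketch, reusing mlr, asset2, and benchmark from above
from statsmodels.stats.stattools import durbin_watson, jarque_bera
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Autocorrelation of residuals: values near 2 indicate little autocorrelation
print('Durbin-Watson:', durbin_watson(mlr.resid))

# Normality of residuals: a small p-value suggests the errors are not normally distributed
jb_stat, jb_pvalue, _, _ = jarque_bera(mlr.resid)
print('Jarque-Bera p-value:', jb_pvalue)

# Multicollinearity: variance inflation factor for each independent variable
exog = sm.add_constant(np.column_stack((asset2, benchmark)))
for i, name in zip([1, 2], ['asset2', 'benchmark']):
    print('VIF for %s:' % name, variance_inflation_factor(exog, i))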
If we confirm that the necessary assumptions of the regression model are satisfied, we can safely use the statistics reported to analyze the fit. For example, the $R^2$ value tells us the fraction of the total variation of $Y$ that is explained by the model. When doing multiple linear regression, however, we prefer to use adjusted $R^2$, which corrects for the small increases in $R^2$ that occur when we add more regression variables to the model, even if they are not significantly correlated with the dependent variable. Adjusted $R^2$ is defined as
$$ 1 - (1 - R^2)\frac{n-1}{n-k-1} $$

where $n$ is the number of observations and $k$ is the number of independent variables in the model. Other useful statistics include the F-statistic and the standard error of estimate.
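As a quick sanity check on this formula, the sketch below recomputes adjusted $R^2$ for the earlier multiple regression from its $R^2$, $n$, and $k$, and compares it to the value statsmodels reports directly.
# Recompute adjusted R^2 by hand for the earlier mlr fit and compare to statsmodels
n = mlr.nobs       # number of observations
k = mlr.df_model   # number of independent variables (excluding the constant)
adjusted_r2 = 1 - (1 - mlr.rsquared) * (n - 1) / (n - k - 1)
print('Adjusted R^2 (by hand):', adjusted_r2)
print('Adjusted R^2 (statsmodels):', mlr.rsquared_adj)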
When deciding on the best possible model of your dependent variables, there are several different methods to turn to. If you use too many explanatory variables, you run the risk of overfitting your model, but if you use too few you may end up with a terrible fit. One of the most prominent methods for deciding on a best model is stepwise regression. Forward stepwise regression starts from an empty model and tests each individual variable, selecting the one that results in the best model quality, usually measured with AIC or BIC (lower is better). It then adds the remaining variables one at a time, testing each subsequent combination of explanatory variables in a regression and calculating the AIC or BIC value at each step. At the end of the process, the model with the best quality (according to the given measure) is selected and presented as the final model. This does have limitations, however. It does not test every possible combination of variables, so it may miss the theoretical best model if a particular variable was discarded early in the algorithm. As such, stepwise regression should be used in combination with your best judgment regarding the model.
X1 = np.arange(100)
X2 = np.array([i ** 2 for i in range(100)]) - X1
X3 = np.array([np.log(i) for i in range(1, 101)]) + X2
X4 = 5 * X1
Y = 2 * X1 + 0.5 * X2 + 10 * X3 + X4
plt.plot(X1, label='X1')
plt.plot(X2, label='X2')
plt.plot(X3, label='X3')
plt.plot(X4, label='X4')
plt.plot(Y, label='Y')
plt.legend();
results = regression.linear_model.OLS(Y, sm.add_constant(np.column_stack((X1,X2,X3,X4)))).fit()
print("Beta_0:", results.params[0])
print("Beta_1:", results.params[1])
print("Beta_2:", results.params[2])
print("Beta_3:", results.params[3])
print("Beta_4:", results.params[4])
Beta_0: -6.366462912410498e-12
Beta_1: 0.2692307692305427
Beta_2: 0.4999999999940883
Beta_3: 10.000000000004547
Beta_4: 1.346153846153797
data = pd.DataFrame(np.column_stack((X1,X2,X3,X4)), columns=['X1','X2','X3','X4'])
response = pd.Series(Y, name='Y')
def forward_aic(response, data):
    # Forward stepwise regression using AIC as the quality measure.
    # Works with a pandas DataFrame of explanatory variables and a Series/array response.
    # Initialize some variables
    explanatory = list(data.columns)
    selected = pd.Series(np.ones(data.shape[0]), name="Intercept")
    current_score, best_new_score = np.inf, np.inf
    # Loop while adding a variable keeps improving the model and variables remain
    while current_score == best_new_score and len(explanatory) != 0:
        scores_with_elements = []
        count = 0
        # For each candidate explanatory variable
        for element in explanatory:
            # Make a set of explanatory variables including our current best and the new one
            tmp = pd.concat([selected, data[element]], axis=1)
            # Test the set
            result = regression.linear_model.OLS(response, tmp).fit()
            score = result.aic
            scores_with_elements.append((score, element, count))
            count += 1
        # Sort the scoring list so the lowest (best) AIC ends up last
        scores_with_elements.sort(reverse=True)
        # Get the best new variable
        best_new_score, best_element, index = scores_with_elements.pop()
        if current_score > best_new_score:
            # If it's better than the current best, add it to the selected set
            explanatory.pop(index)
            selected = pd.concat([selected, data[best_element]], axis=1)
            current_score = best_new_score
    # Return the final model
    model = regression.linear_model.OLS(response, selected).fit()
    return model
result = forward_aic(Y, data)
result.summary()
|  |  |  |  |
|---|---|---|---|
| Dep. Variable: | y | R-squared: | 1.000 |
| Model: | OLS | Adj. R-squared: | 1.000 |
| Method: | Least Squares | F-statistic: | 3.092e+26 |
| Date: | Tue, 25 May 2021 | Prob (F-statistic): | 0.00 |
| Time: | 15:26:35 | Log-Likelihood: | 1700.7 |
| No. Observations: | 100 | AIC: | -3393. |
| Df Residuals: | 96 | BIC: | -3383. |
| Df Model: | 3 |  |  |
| Covariance Type: | nonrobust |  |  |

|  | coef | std err | t | P>\|t\| | [0.025 | 0.975] |
|---|---|---|---|---|---|---|
| Intercept | -1.455e-11 | 7.01e-09 | -0.002 | 0.998 | -1.39e-08 | 1.39e-08 |
| X3 | 10.0000 | 4.24e-09 | 2.36e+09 | 0.000 | 10.000 | 10.000 |
| X1 | 0.2692 | 1.3e-11 | 2.08e+10 | 0.000 | 0.269 | 0.269 |
| X2 | 0.5000 | 4.24e-09 | 1.18e+08 | 0.000 | 0.500 | 0.500 |
| X4 | 1.3462 | 6.48e-11 | 2.08e+10 | 0.000 | 1.346 | 1.346 |

|  |  |  |  |
|---|---|---|---|
| Omnibus: | 14.070 | Durbin-Watson: | 0.001 |
| Prob(Omnibus): | 0.001 | Jarque-Bera (JB): | 9.981 |
| Skew: | -0.647 | Prob(JB): | 0.00680 |
| Kurtosis: | 2.152 | Cond. No. | 6.22e+17 |
In the construction of this model, the $X_4$ term is closely related to the $X_1$ term, being simply a scalar multiple of it. However, stepwise regression did not catch this and remove the variable; it simply adjusted the coefficient of the $X_1$ term. Our own judgment would say to leave the $X_4$ term out of the model, showing the limitations of stepwise regression.
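A quick way to confirm this relationship by hand is to check the correlation between $X_1$ and $X_4$ directly; since $X_4$ is just a scalar multiple of $X_1$, their correlation is exactly 1, which is a strong hint to drop one of them before fitting.
# X4 is an exact scalar multiple of X1, so their correlation is exactly 1
print('Correlation of X1 and X4:', np.corrcoef(X1, X4)[0, 1])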
There are other ways to diagnose the health of a model and individual variables with varying degrees of penalty given to more complex models. This will be covered in-depth in a model selection notebook.
This presentation is for informational purposes only and does not constitute an offer to sell, a solicitation to buy, or a recommendation for any security; nor does it constitute an offer to provide investment advisory or other services by Quantopian, Inc. ("Quantopian") or QuantRocket LLC ("QuantRocket"). Nothing contained herein constitutes investment advice or offers any opinion with respect to the suitability of any security, and any views expressed herein should not be taken as advice to buy, sell, or hold any security or as an endorsement of any security or company. In preparing the information contained herein, neither Quantopian nor QuantRocket has taken into account the investment needs, objectives, and financial circumstances of any particular investor. Any views expressed and data illustrated herein were prepared based upon information believed to be reliable at the time of publication. Neither Quantopian nor QuantRocket makes any guarantees as to their accuracy or completeness. All information is subject to change and may quickly become unreliable for various reasons, including changes in market conditions or economic circumstances.