# 如何规避线性回归的陷阱（下）

## 使用变量变换或广义线性模型

# Simulate a gamma-distributed response whose mean depends log-linearly on
# two uniform predictors, then inspect its distribution.
np.random.seed(1)  # fixed seed so the simulated data is reproducible
n_obs = 200
x1 = np.random.uniform(low=-1, high=1, size=n_obs)
x2 = np.random.uniform(low=-1, high=1, size=n_obs)
# NOTE: randn() without a size argument draws a single scalar, so the same
# random offset is added to the linear predictor of every observation.
linear_predictor = 1 + x1 + 2 * x2 + np.random.randn()
mu = np.exp(linear_predictor)
# With shape 2 and scale mu / 2 the gamma mean (shape * scale) equals mu.
y = np.random.gamma(shape=2, scale=mu / 2, size=n_obs)
gamma_data = pd.DataFrame({'X1': x1, 'X2': x2, 'Y': y})

# Histogram of the response
plt.hist(gamma_data['Y'], bins=30)
plt.xlabel('Y')
plt.ylabel('Count')
plt.show()

# Fit an ordinary least squares regression of Y on X1 and X2
non_norm_model = smf.ols(formula='Y ~ X1 + X2', data=gamma_data).fit()
# The fitted results already hold the in-sample predictions and the
# residuals (observed minus fitted), so predict() does not need to be
# called twice on the training data.
fitted = non_norm_model.fittedvalues
resid = non_norm_model.resid
# Plot residuals against fitted values to check the model assumptions
plt.scatter(fitted, resid, alpha=0.5)
plt.xlabel('fitted')
plt.ylabel('residual')
plt.show()

# Transform Y by taking the log of it (vectorised over the whole column)
gamma_data['log_y'] = np.log(gamma_data['Y'])
# Refit linear regression to transformed data
non_norm_model2 = smf.ols(formula='log_y ~ X1 + X2', data=gamma_data).fit()
# Calculate residuals on the log scale; the prediction is computed once and
# reused for both the residuals and the x-axis of the plot
fitted = non_norm_model2.predict(gamma_data[['X1', 'X2']])
resid = gamma_data['log_y'] - fitted
# Plot residuals against fitted values
plt.scatter(fitted, resid, alpha=0.5)
plt.xlabel('fitted')
plt.ylabel('residual')
plt.show()

# Fit a Gamma GLM with a log link to the untransformed response.
# NOTE(review): the original sm.GLM(...) call was truncated after its first
# argument. The design matrix, family and link below are reconstructed from
# how gamma_data is simulated in this file (gamma noise around a log-linear
# mean in X1 and X2) -- confirm against the source article.
glm_exog = sm.add_constant(gamma_data[['X1', 'X2']])  # GLM adds no intercept itself
gamma_model = sm.GLM(
    gamma_data['Y'],
    glm_exog,
    family=sm.families.Gamma(link=sm.families.links.Log()),
).fit()
# Calculate residuals (deviance residuals are the GLM analogue of raw residuals)
resid = gamma_model.resid_deviance
# Plot residuals against the fitted means
plt.scatter(gamma_model.fittedvalues, resid, alpha=0.5)
plt.xlabel('fitted')
plt.ylabel('residual')
plt.show()

## 使用时间序列模型处理自相关

# Only keep data for AAL. The explicit .copy() makes the filtered frame
# independent of the full dataset, so the column assignment below does not
# write into a slice (which raises pandas' SettingWithCopyWarning).
sandp_data = sandp_data[sandp_data['Name'] == 'AAL'].copy()
# Transform date to datetime format
sandp_data['date'] = pd.to_datetime(sandp_data['date'])
# Sort data by date so the line plot runs in chronological order
sandp_data = sandp_data.sort_values(by='date')
# Plot closing price over time
plt.plot(sandp_data['date'], sandp_data['close'])
plt.show()

# Create year and month variables with the vectorised datetime accessor
sandp_data['Year'] = sandp_data['date'].dt.year
sandp_data['Month'] = sandp_data['date'].dt.month
# Fit linear regression to data: linear trend in Year plus one categorical
# level per calendar month
sandp_model = smf.ols(formula='close ~ Year + C(Month)', data=sandp_data).fit()
# Plot data with regression line overlay
trend = sandp_model.predict(sandp_data[['Year', 'Month']])
plt.plot(sandp_data['date'], sandp_data['close'])
plt.plot(sandp_data['date'], trend, color='red')
plt.show()

##### 例如，设Y（t）为原始序列、y（t）为差分后的序列：如果d=1，则y（t）=Y（t）-Y（t-1）；如果d=2，则y（t）=z（t）-z（t-1），其中z（t）=Y（t）-Y（t-1），依此类推。

# plot_predict is a module-level helper in statsmodels >= 0.13, where the
# fitted ARIMA results object no longer has a .plot_predict() method.
from statsmodels.graphics.tsaplots import plot_predict

# Index dataset by date
sandp_data2 = sandp_data.set_index('date')
# Fit ARIMA model to the data. `order` must be passed by keyword: in current
# statsmodels the second positional parameter of ARIMA is `exog`, so a
# positional (5, 1, 0) would be treated as an exogenous regressor.
ts_model = sm.tsa.ARIMA(sandp_data2['close'], order=(5, 1, 0)).fit()
# Plot actual vs fitted values on a shared axis
fig, ax = plt.subplots()
sandp_data2['close'].plot(ax=ax, label='actual')
# start=1 skips the first observation, which has no previous value to
# difference against (d=1) -- NOTE(review): confirm the first in-sample
# prediction is uninformative for this model before relying on this.
plot_predict(ts_model, start=1, dynamic=False, ax=ax)
ax.legend(loc='best')
plt.show()

奥卡姆剃刀（Occam's razor）原理指出：“如果对某一事件存在两种解释，那么所需假设最少的那种解释通常是正确的。”

