# 特征选择概述

garbage in, garbage out（输入的是垃圾，输出的也只能是垃圾——特征质量决定模型上限）

1. feature selection
2. feature extraction

1. 如果`feature`太多的话，模型的复杂度过大，可能会发生过拟合
2. 如果`feature`太多的话，数据变得稀疏，会有`curse of dimensionality`（维度灾难）
3. 如果`feature`太多的话，训练以及预测需要的计算量更大

# 数据预处理

```#放电数据主要包含如下几个特征：
pd_location
signal_width
rise_time
fall_time
peak_voltage
polarity
mean_voltage
rms
sd
skewness
kurtosis
crest
form_factor
MainFreq
phase_angle
T
W
pC

#载入数据
import pandas as pd
#观察数据
d.groupby("pd_class").agg({'T': len})
d.isnull().any()
#训练集和测试集划分
from sklearn.cross_validation import train_test_split
n1 = d.shape[1]-1
X, y = d.iloc[:, 0:n1].values, d.iloc[:, n1].values
X_train, X_test, y_train, y_test = \
train_test_split(X, y, test_size=0.3, random_state=0)
print ("X_train shape:",X_train.shape)
print ("X_test  shape:",X_test.shape)
print ("Y_train shape:",y_train.shape)
print ("Y_test  shape:",y_test.shape)

#数据归一化
from sklearn.preprocessing import StandardScaler
stdsc = StandardScaler()
X_train_std = stdsc.fit_transform(X_train)
X_test_std = stdsc.transform(X_test)```

# L1-regularization

`L1-regularization`的原理以及与`L2`的区别如上图，其求解使用了`coordinate descent`以及`subgradient`的原理，会使一部分`feature`的权重恰好收缩为0，从而天然地起到特征选择的作用。

```['W', 'pC']
['T', 'W', 'pC']
['pd_location', 'kurtosis', 'phase_angle', 'T', 'W', 'pC']
['pd_location', 'skewness', 'kurtosis', 'phase_angle', 'T', 'W', 'pC']```

`l1 path`如下图：

```COLUMNS = d.columns[:-1]
#第一次选权重
weights, params = [], []
for c in np.arange(-4, 6):
lr = LogisticRegression(penalty='l1', C=10**c, random_state=0)
lr.fit(X_train_std, y_train)
weights.append(lr.coef_[1])
params.append(10**c)
weights = np.array(weights)
pd.DataFrame({"c":np.arange(-4, 6),"num":[sum(i!=0) for i in weights]})
#第二次选权重
weights, params = [], []
for c in np.linspace(-2, -1,5):
lr = LogisticRegression(penalty='l1', C=10**c, random_state=0)
lr.fit(X_train_std, y_train)
weights.append(lr.coef_[1])
params.append(10**c)
weights = np.array(weights)
pd.DataFrame({"c":np.linspace(-2, -1,5),"num":[sum(i!=0) for i in weights]})
#第三次选权重
weights, params = [], []
for c in np.linspace(-1.25, -1,10):
lr = LogisticRegression(penalty='l1', C=10**c, random_state=0)
lr.fit(X_train_std, y_train)
weights.append(lr.coef_[1])
params.append(10**c)
weights = np.array(weights)
pd.DataFrame({"c":np.linspace(-1.25, -1,10),"num":[sum(i!=0) for i in weights]})
#查看结果
for i in range (len(weights)):
print (COLUMNS[weights[i] !=0])

#可视化
import matplotlib.pyplot as plt
##颜色的选择用`iWantHue`
colors = ["#6FC9CA",
"#CD5430",
"#CE4DC4",
"#88D54D",
"#3D455E",
"#5A752D",
"#CC436D",
"#603422",
"#D2B43F",
"#CD88BC",
"#C8C28D",
"#71D195",
"#6A89C8",
"#C18170",
"#7F68D1",
"#4A6553",
"#BABAC7",
"#693263"]
##计算权重
weights, params = [], []
for c in np.arange(-2, 10):
lr = LogisticRegression(penalty='l1', C=10**c, random_state=0)
lr.fit(X_train_std, y_train)
weights.append(lr.coef_[1])
params.append(10**c)

weights = np.array(weights)
##开始画图
import matplotlib.pyplot as plt
%matplotlib inline
fig = plt.figure()
ax = plt.subplot(111)
for column, color in zip(range(weights.shape[1]), colors):
plt.plot(params, weights[:, column],
label=pd.columns[column],
color=color)
plt.axhline(0, color='black', linestyle='--', linewidth=5)
plt.xlim([10**(-5), 10**5])
plt.ylabel('weight coefficient')
plt.xlabel('C')
plt.xscale('log')
plt.legend(loc='upper left',markerscale=5)
ax.legend(loc='upper center',
bbox_to_anchor=(1.38, 1.03),
ncol=1, fancybox=True)
plt.savefig('l1_path.png', dpi=600)
plt.show()```

# sequential feature selection

• forward selection
• backward selection
• forward and backward selection

• validation集上的精度
• 特征的p值

```['pd_location' 'signal_width' 'peak_voltage' 'form_factor' 'pC']
['signal_width' 'peak_voltage' 'form_factor' 'pC']
['signal_width' 'peak_voltage' 'pC']
['peak_voltage' 'pC']
['peak_voltage']```

```#定义sequential feature selection函数
from sklearn.base import clone
from itertools import combinations
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score

class SBS():
def __init__(self, estimator, k_features, scoring=accuracy_score,
test_size=0.25, random_state=1):
self.scoring = scoring
self.estimator = clone(estimator)
self.k_features = k_features
self.test_size = test_size
self.random_state = random_state

def fit(self, X, y):

X_train, X_test, y_train, y_test = \
train_test_split(X, y, test_size=self.test_size,
random_state=self.random_state)

dim = X_train.shape[1]
self.indices_ = tuple(range(dim))
self.subsets_ = [self.indices_]
score = self._calc_score(X_train, y_train,
X_test, y_test, self.indices_)
self.scores_ = [score]

while dim > self.k_features:
scores = []
subsets = []

for p in combinations(self.indices_, r=dim-1):
score = self._calc_score(X_train, y_train,
X_test, y_test, p)
scores.append(score)
subsets.append(p)

best = np.argmax(scores)
self.indices_ = subsets[best]
self.subsets_.append(self.indices_)
dim -= 1

self.scores_.append(scores[best])
self.k_score_ = self.scores_[-1]

return self

def transform(self, X):
return X[:, self.indices_]

def _calc_score(self, X_train, y_train, X_test, y_test, indices):
self.estimator.fit(X_train[:, indices], y_train)
y_pred = self.estimator.predict(X_test[:, indices])
score = self.scoring(y_test, y_pred)
return score
#训练
%matplotlib inline
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt

lr = LogisticRegression()

# selecting features
sbs = SBS(lr, k_features=1)
sbs.fit(X_train_std, y_train)

# plotting performance of feature subsets
k_feat = [len(k) for k in sbs.subsets_]

plt.plot(k_feat, sbs.scores_, marker='o')
plt.ylim([0, 1])
plt.ylabel('Accuracy')
plt.xlabel('Number of features')
plt.grid()
plt.tight_layout()
# plt.savefig('./sbs.png', dpi=300)
plt.show()
#看参数
for i in sbs.subsets_:
print (np.array(COLUMNS[[i]]))```

# random forest

``` 1) pd_location                    0.135614
2) signal_width                   0.105168
3) rise_time                      0.104014
4) fall_time                      0.100831
5) peak_voltage                   0.084795
6) polarity                       0.071462
7) mean_voltage                   0.056699
8) rms                            0.056144
9) sd                             0.048083
10) skewness                       0.044781
11) kurtosis                       0.041479
12) crest                          0.036688
13) form_factor                    0.033465
14) MainFreq                       0.029730
15) phase_angle                    0.029136
16) T                              0.011835
17) W                              0.010076
18) pC                             0.000000```

```from sklearn.ensemble import RandomForestClassifier
feat_labels = d.columns[0:-1]
forest = RandomForestClassifier(n_estimators=10000,
random_state=0,
n_jobs=-1)
forest.fit(X_train, y_train)
importances = forest.feature_importances_
indices = np.argsort(importances)[::-1]
for f in range(X_train.shape[1]):
print("%2d) %-*s %f" % (f + 1, 30,
feat_labels[f],
importances[indices[f]]))
plt.title('Feature Importances')
plt.bar(range(X_train.shape[1]),
importances[indices],
color='lightblue',
align='center')
plt.xticks(range(X_train.shape[1]),
feat_labels, rotation=90)
plt.xlim([-1, X_train.shape[1]])
plt.tight_layout()
#plt.savefig('./figures/random_forest.png', dpi=300)
plt.show()

#指定的threshold进行选择
X_selected = forest.transform(X_train, threshold=0.15)
X_selected.shape```

# 参考资料

1. Python Machine Learning (Sebastian Raschka)

0 条评论

• ### LSA概述与实例

LSA概述 Latent Semantic Analysis简单来说，就是将word和document透射到concept space，然后在concept s...

• ### 记忆网络RNN、LSTM与GRU

RNN 结构 训练 应用 RNN Variants LSTM 结构 梯度消失及梯度爆炸 GRU 结构 ? 一般的神经网络输入和输出的维度大小都是固定的，针对序列...

power检验 ?

• ### 使用 RNN 进行情感分析的初学者指南

情感分析可能是最常见的 自然语言处理 的应用之一。我无需去额外强调在客服工具中情感分析的重要性。本文将利用循环神经网络，训练出一个基于 IMDB 数据集的电影评...

• ### 机器学习|模型选择之划分数据集及Sklearn实现

直接将数据集D划分为两个互斥的集合：训练集S和测试集T（D = S∪T，S∩T = ∅），在S上训练模型，用T来评估其测试误差。

GBDT相关知识模块：前向分布算法，负梯度拟合，损失函数，回归，二分类，多分类，正则化。

• ### Oracle Profile文件

一、Profile文件概述:Profiles是Oracle安全策略的一个组成部分,当Oracle建立数据库时,会自动建立名称为Default的profile,当...

• ### 100天搞定机器学习|Day2简单线性回归分析

第一天机器学习100天|Day1数据预处理，我们学习了数据预处理。知道了，数据预处理是机器学习中最基础和最麻烦，未来占用时间最长的一步操作。数据预处理一般有六个...

• ### python 数据分析基础 day13－套嵌字典在数据整理过程中的应用

今天是读《python数据分析基础》的第11天，今天笔记的内容主要是涉及这本书的5.2节。很多时候，业务数据是按照业务这个维度来组织数据的，按5.2节的实例来说...