# 利用scikit-learn进行机器学习：特征工程（一）数据预处理

scikit-learn是Python中专门针对机器学习应用而发展起来的一款优秀的开源机器学习框架。从scikit-learn给出的官方文档我们可以看到，scikit-learn将机器学习内容分为六大块：分类、回归、聚类、降维、模型选择与评估以及数据预处理。其中降维和数据预处理的内容均可视为特征工程的内容。下面就开始小编的特征工程第一讲：数据预处理。

>>>>

from sklearn import preprocessing

import numpy as np

X_train=np.array([[1.,-1.,2.],[2.,0.,0.],[0.,1.,-1.]])

X_scaled=preprocessing.scale(X_train)

X_scaled

array([[ 0. ..., -1.22..., 1.33...],[ 1.22..., 0. ..., -0.26...], [-1.22..., 1.22..., -1.06...]])

X_scaled.mean(axis=0)

array([ 0., 0., 0.])

X_scaled.std(axis=0)

array([ 1., 1., 1.])

scaler=preprocessing.StandardScaler().fit(X_train)

scaler

StandardScaler(copy=True, with_mean=True, with_std=True)

scaler.mean_

array([ 1. ..., 0. ..., 0.33...])

scaler.scale_

array([ 0.81..., 0.81..., 1.24...])

scaler.transform(X_train)

array([[ 0. ..., -1.22..., 1.33...],[ 1.22..., 0. ..., -0.26...],[-1.22..., 1.22..., -1.06...]])

X_test=[[-1.,1.,0.]]

scaler.transform(X_test)

array([[-2.44..., 1.22..., -0.26...]])

X_train=np.array([[1.,-1.,2.],[2.,0.,0.],[0.,1.,-1.]])

min_max_scaler=preprocessing.MinMaxScaler()

X_train_minmax=min_max_scaler.fit_transform(X_train)

X_train_minmax

array([[ 0.5,0.,1.],[1.,0.5,0.33333333],[ 0.,1.,0.]])

>>>>

X=[[1.,-1.,2.],[2.,0.,0.],[0.,1.,-1.]]

X_normalized=preprocessing.normalize(X,norm='l2')

X_normalized

array([[ 0.40..., -0.40..., 0.81...],[ 1. ..., 0. ..., 0. ...],[ 0. ..., 0.70..., -0.70...]])

normalizer=preprocessing.Normalizer().fit(X)

normalizer

Normalizer(copy=True, norm='l2')

normalizer.transform(X)

array([[ 0.40..., -0.40..., 0.81...],[ 1. ..., 0. ..., 0. ...],[ 0. ..., 0.70..., -0.70...]])

normalizer.transform([[-1.,1.,0.]])

array([[-0.70..., 0.70..., 0. ...]])

>>>>

X=[[1.,-1.,2.],[2.,0.,0.],[0.,1.,-1.]]

binarizer=preprocessing.Binarizer().fit(X)

binarizer

Binarizer(copy=True, threshold=0.0)

binarizer.transform(X)

array([[1.,0.,1.],[ 1.,0.,0.],[0.,1.,0.]])

binarizer=preprocessing.Binarizer(threshold=1.1)

binarizer.transform(X)

array([[ 0.,0.,1.],[ 1.,0.,0.],[ 0.,0.,0.]])

>>>>

enc=preprocessing.OneHotEncoder()

enc.fit([[0,0,3],[1,1,0],[0,2,1],[1,0,2]])

OneHotEncoder(categorical_features='all', dtype=<class 'numpy.float64'>, handle_unknown='error', n_values='auto', sparse=True)

enc.transform([[0,1,3]]).toarray()

array([[ 1., 0., 0., 1., 0., 0., 0., 0., 1.]])

>>>>

import numpy as np

from sklearn.preprocessing import Imputer

imp=Imputer(missing_values='NaN',strategy='mean',axis=0)

imp.fit([[1,2],[np.nan,3],[7,6]])

Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)

X=[[np.nan,2],[6,np.nan],[7,6]]

print(imp.transform(X))

[[ 4. 2.][ 6. 3.666][ 7. 6.]]

>>>>

http://scikit-learn.org/stable/modules/preprocessing.html#preprocessing

