文章转载自:Github.com
数据预处理
如图所示,按照以上六步完成数据预处理。
此例用到的数据、代码见文末。
第1步:导入库
# Step 1: import the libraries.
# (Restored: the scraped article stripped the spaces out of both import lines,
# producing invalid syntax.)
import numpy as np
import pandas as pd
第2步:导入数据集
# Step 2: load the CSV and split it into features and target.
dataset = pd.read_csv('../datasets/Data.csv')
X = dataset.iloc[:, :-1].values  # every column except the last (Country, Age, Salary)
Y = dataset.iloc[:, 3].values    # 4th column: 'Purchased'
第3步:处理丢失数据
# Step 3: handle missing data — replace NaN in the numeric columns
# (Age, Salary: columns 1 and 2) with the column mean.
# (Restored: the scraped article fused the import and dropped the value of
# `axis`; the tutorial original uses axis=0, i.e. impute along columns.)
# NOTE(review): Imputer was removed in scikit-learn 0.22; on modern versions
# use sklearn.impute.SimpleImputer(missing_values=np.nan, strategy="mean").
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values="NaN", strategy="mean", axis=0)
imputer = imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])
第4步:解析分类数据
# Step 4: encode the categorical 'Country' column (column 0) as integers.
# (Restored: the scraped article fused the import and dropped the column
# index; the tutorial original indexes column 0.)
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X = LabelEncoder()
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])
创建虚拟变量
# Create dummy (one-hot) variables for the encoded country column 0.
# (Restored: the scraped article dropped the 0 from categorical_features=[0].)
# NOTE(review): categorical_features was removed in scikit-learn 0.22; on
# modern versions use sklearn.compose.ColumnTransformer instead.
onehotencoder = OneHotEncoder(categorical_features=[0])
X = onehotencoder.fit_transform(X).toarray()
# Encode the Yes/No target labels as integers.
labelencoder_Y = LabelEncoder()
Y = labelencoder_Y.fit_transform(Y)
第5步:拆分数据集为训练集合和测试集合
# Step 5: 80/20 train/test split with a fixed seed for reproducibility.
# (Restored: the scraped article fused the import and dropped the value of
# `random_state`; the tutorial original uses random_state=0.)
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
第6步:特征量化
# Step 6: standardize the features (zero mean, unit variance).
# Fit the scaler on the training set only, then reuse it on the test set
# so no test-set statistics leak into training.
# (Restored: the scraped article fused the import line.)
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
数据:
Country,Age,Salary,Purchased
France,44,72000,No
Spain,27,48000,Yes
Germany,30,54000,No
Spain,38,61000,No
Germany,40,,Yes
France,35,58000,Yes
Spain,,52000,No
France,48,79000,Yes
Germany,50,83000,No
France,37,67000,Yes
代码:
# Day 1: Data Preprocessing
# (Restored from the scraped article: the imports had their whitespace
# stripped and several argument values were dropped — axis=0, column
# index 0, categorical_features=[0], random_state=0. All print strings
# are kept exactly as in the original.)

# Step 1: Importing the libraries
import numpy as np
import pandas as pd

# Step 2: Importing dataset
# X = every column except the last (Country, Age, Salary);
# Y = the 4th column ('Purchased').
dataset = pd.read_csv('../datasets/Data.csv')
X = dataset.iloc[:, :-1].values
Y = dataset.iloc[:, 3].values
print("Step 2: Importing dataset")
print("X")
print(X)
print("Y")
print(Y)

# Step 3: Handling the missing data
# Replace NaN in the numeric columns (Age, Salary) with the column mean.
# NOTE(review): Imputer was removed in scikit-learn 0.22; on modern
# versions use sklearn.impute.SimpleImputer(missing_values=np.nan,
# strategy="mean") instead.
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values="NaN", strategy="mean", axis=0)
imputer = imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])
print("---------------------")
print("Step 3: Handling the missing data")
print("step2")
print("X")
print(X)

# Step 4: Encoding categorical data
# Label-encode the 'Country' column (column 0) to integers.
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X = LabelEncoder()
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])
# Creating a dummy variable (one-hot encode column 0).
# NOTE(review): categorical_features was removed in scikit-learn 0.22;
# on modern versions use sklearn.compose.ColumnTransformer instead.
onehotencoder = OneHotEncoder(categorical_features=[0])
X = onehotencoder.fit_transform(X).toarray()
labelencoder_Y = LabelEncoder()
Y = labelencoder_Y.fit_transform(Y)
print("---------------------")
print("Step 4: Encoding categorical data")
print("X")
print(X)
print("Y")
print(Y)

# Step 5: Splitting the datasets into training sets and Test sets
# 80/20 split with a fixed seed for reproducibility.
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
print("---------------------")
print("Step 5: Splitting the datasets into training sets and Test sets")
print("X_train")
print(X_train)
print("X_test")
print(X_test)
print("Y_train")
print(Y_train)
print("Y_test")
print(Y_test)

# Step 6: Feature Scaling
# Standardize the features; fit on the training set only to avoid leakage.
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
print("---------------------")
print("Step 6: Feature Scaling")
print("X_train")
print(X_train)
print("X_test")
print(X_test)
◆◆◆◆◆
领取专属 10元无门槛券
私享最新 技术干货