TVP

# 人工智能：有监督学习分类与回归

1有监督学习与无监督学习

2分类的含义

3数据预处理

- 二值化（Binarization）

- 去均值（Mean removal）

- 缩放（Scaling）

- 归一化（Normalization）

import numpy as np

from sklearn import preprocessing

# Sample data: 4 samples with 3 features each.
input_data = np.array([[5.1, -2.9, 3.3],
                       [-1.2, 7.8, -6.1],
                       [3.9, 0.4, 2.1],
                       [7.3, -9.9, -4.5]])

# Binarize data: feature values above the threshold map to 1, the rest to 0.
data_binarized = preprocessing.Binarizer(threshold=2.1).transform(input_data)
print("\nBinarized data:\n", data_binarized)

Binarized data:

[[1. 0. 1.]

[0. 1. 0.]

[1. 0. 0.]

[1. 0. 0.]]

# Print mean and standard deviation of each feature before scaling.
print("\nBEFORE:")
print("Mean =", input_data.mean(axis=0))
print("Std deviation =", input_data.std(axis=0))

# Remove the mean and scale each feature to unit variance.
data_scaled = preprocessing.scale(input_data)
print("\nAFTER:")
print("Mean =", data_scaled.mean(axis=0))
print("Std deviation =", data_scaled.std(axis=0))

BEFORE:

Mean = [ 3.775 -1.15 -1.3 ]

AFTER:

Std deviation = [1. 1. 1.]

# Min-max scaling: map each feature linearly into the range [0, 1].
data_scaler_minmax = preprocessing.MinMaxScaler(feature_range=(0, 1))
data_scaled_minmax = data_scaler_minmax.fit_transform(input_data)
print("\nMin max scaled data:\n", data_scaled_minmax)

Min max scaled data:

[0. 1. 0. ]

# Normalize data: scale each sample (row) to unit L1 / L2 norm.
data_normalized_l1 = preprocessing.normalize(input_data, norm='l1')
data_normalized_l2 = preprocessing.normalize(input_data, norm='l2')
print("\nL1 normalized data:\n", data_normalized_l1)
print("\nL2 normalized data:\n", data_normalized_l2)

L1 normalized data:

[ 0.609375 0.0625 0.328125 ]

L2 normalized data:

4标签编码

from sklearn import preprocessing

# Sample input labels
input_labels = ['red', 'black', 'red', 'green', 'black', 'yellow', 'white']

# Create label encoder and fit the labels
encoder = preprocessing.LabelEncoder()
encoder.fit(input_labels)

# Print the mapping between string labels and their integer codes
# (classes_ is sorted alphabetically, so black -> 0, green -> 1, ...).
print("\nLabel mapping:")
for i, item in enumerate(encoder.classes_):
    print(item, '-->', i)

Label mapping:

black --> 0

green --> 1

white --> 3

yellow --> 4

# Encode a set of labels using the fitted encoder.
test_labels = ['green', 'red', 'black']
encoded_values = encoder.transform(test_labels)
print("\nLabels =", test_labels)
print("Encoded values =", list(encoded_values))

Labels = ['green','red', 'black']

Encoded values = [1, 2,0]

# Decode a set of integer codes back to their string labels.
encoded_values = [3, 0, 4, 1]
decoded_list = encoder.inverse_transform(encoded_values)
print("\nEncoded values =", encoded_values)
print("Decoded labels =", list(decoded_list))

Encoded values = [3, 0,4, 1]

Decoded labels =['white', 'black', 'yellow', 'green']

5逻辑斯特回归

import numpy as np

from sklearn import linear_model

import matplotlib.pyplot as plt

# Define sample input data: 12 points in 4 classes (3 points per class).
X = np.array([[3.1, 7.2], [4, 6.7], [2.9, 8], [5.1, 4.5], [6, 5], [5.6, 5],
              [3.3, 0.4], [3.9, 0.9], [2.8, 1], [0.5, 3.4], [1, 4], [0.6, 4.9]])
y = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3])

# Create the logistic regression classifier.
classifier = linear_model.LogisticRegression(solver='liblinear', C=1)

# Train the classifier.
classifier.fit(X, y)

# Visualize the performance of the classifier.
# NOTE(review): visualize_classifier must already be defined before this
# runs; in the article it is only defined further down — move the
# definition above this call in a real script.
visualize_classifier(classifier, X, y)

def visualize_classifier(classifier, X, y):
    """Plot the decision regions of a trained 2-D classifier.

    classifier -- fitted estimator exposing predict()
    X          -- array of shape (n_samples, 2) with the input points
    y          -- class labels used to color the training points
    """
    # Define the minimum and maximum values for X and Y that will be
    # used in the mesh grid (pad by 1 unit on each side).
    min_x, max_x = X[:, 0].min() - 1.0, X[:, 0].max() + 1.0
    min_y, max_y = X[:, 1].min() - 1.0, X[:, 1].max() + 1.0

    # Define the step size to use in plotting the mesh grid.
    mesh_step_size = 0.01

    # Define the mesh grid of X and Y values.
    x_vals, y_vals = np.meshgrid(np.arange(min_x, max_x, mesh_step_size),
                                 np.arange(min_y, max_y, mesh_step_size))

    # Run the classifier on every point of the mesh grid.
    output = classifier.predict(np.c_[x_vals.ravel(), y_vals.ravel()])

    # Reshape the flat predictions back to the grid shape.
    output = output.reshape(x_vals.shape)

    # Create a plot.
    plt.figure()

    # Choose a color scheme and draw the decision regions.
    # (This call was missing from the garbled listing; without it the
    # "color scheme" comment has no corresponding code and no regions
    # are drawn.)
    plt.pcolormesh(x_vals, y_vals, output, cmap=plt.cm.gray)

    # Overlay the training points on the plot.
    plt.scatter(X[:, 0], X[:, 1], c=y, s=75, edgecolors='black',
                linewidth=1, cmap=plt.cm.Paired)

    # Specify the boundaries of the plot.
    plt.xlim(x_vals.min(), x_vals.max())
    plt.ylim(y_vals.min(), y_vals.max())

    # Specify the ticks on the X and Y axes.
    plt.xticks((np.arange(int(X[:, 0].min() - 1), int(X[:, 0].max() + 1), 1.0)))
    plt.yticks((np.arange(int(X[:, 1].min() - 1), int(X[:, 1].max() + 1), 1.0)))

    plt.show()

# Retrain with weaker regularization (larger C) to compare decision boundaries.
classifier = linear_model.LogisticRegression(solver='liblinear', C=100)

6朴素贝叶斯分类器

import numpy as np

from sklearn.naive_bayes import GaussianNB

# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20;
# for modern versions use sklearn.model_selection instead.
from sklearn import cross_validation

# Input file containing data
input_file = 'data_multivar_nb.txt'

# Load the comma-separated data; the last column holds the class label.
# (The loadtxt line was missing from the garbled listing — without it
# `data` is undefined.)
data = np.loadtxt(input_file, delimiter=',')
X, y = data[:, :-1], data[:, -1]

# Create Naive Bayes classifier
classifier = GaussianNB()

# Train the classifier
classifier.fit(X, y)

# Predict the values for the training data
y_pred = classifier.predict(X)

# Compute accuracy on the training data itself.
accuracy = 100.0 * (y == y_pred).sum() / X.shape[0]
print("Accuracy of Naive Bayes classifier =", round(accuracy, 2), "%")

# Visualize the performance of the classifier
visualize_classifier(classifier, X, y)

Accuracy of Naive Bayesclassifier = 99.75 %

# Cross validation
# Split data into training and test data (80% / 20%).
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        X, y, test_size=0.2, random_state=3)

classifier_new = GaussianNB()
classifier_new.fit(X_train, y_train)
y_test_pred = classifier_new.predict(X_test)

# Compute accuracy of the classifier on the held-out test split.
accuracy = 100.0 * (y_test == y_test_pred).sum() / X_test.shape[0]
print("Accuracy of the new classifier =", round(accuracy, 2), "%")

# Visualize the performance of the classifier
visualize_classifier(classifier_new, X_test, y_test)

Accuracy of the newclassifier = 100.0 %

# Scoring functions: report cross-validated metrics over 3 folds.
num_folds = 3

accuracy_values = cross_validation.cross_val_score(classifier,
        X, y, scoring='accuracy', cv=num_folds)
print("Accuracy: " + str(round(100 * accuracy_values.mean(), 2)) + "%")

precision_values = cross_validation.cross_val_score(classifier,
        X, y, scoring='precision_weighted', cv=num_folds)
print("Precision: " + str(round(100 * precision_values.mean(), 2)) + "%")

recall_values = cross_validation.cross_val_score(classifier,
        X, y, scoring='recall_weighted', cv=num_folds)
print("Recall: " + str(round(100 * recall_values.mean(), 2)) + "%")

f1_values = cross_validation.cross_val_score(classifier,
        X, y, scoring='f1_weighted', cv=num_folds)
print("F1: " + str(round(100 * f1_values.mean(), 2)) + "%")

Accuracy: 99.75%

Precision: 99.76%

Recall: 99.75%

F1: 99.75%

7混淆矩阵

- 真正例：预测为1的样本，真实数据也是1。

- 真反例：预测为0的样本，真实数据也是0。

- 假正例：预测为1的样本，但真实数据为0。也被称为I型误差。

- 假反例：预测为0的样本，但真实数据为1。也被称为II型误差。

import numpy as np

import matplotlib.pyplot as plt

from sklearn.metrics import confusion_matrix

from sklearn.metrics import classification_report

# Define sample labels
true_labels = [2, 0, 0, 2, 4, 4, 1, 0, 3, 3, 3]
pred_labels = [2, 1, 0, 2, 4, 3, 1, 0, 1, 3, 3]

# Create confusion matrix
confusion_mat = confusion_matrix(true_labels, pred_labels)

# Visualize confusion matrix.
# (The imshow call was missing from the garbled listing; plt.colorbar()
# raises without a previously drawn image.)
plt.imshow(confusion_mat, interpolation='nearest', cmap=plt.cm.gray)
plt.title('Confusion matrix')
plt.colorbar()
ticks = np.arange(5)
plt.xticks(ticks, ticks)
plt.yticks(ticks, ticks)
plt.ylabel('True labels')
plt.xlabel('Predicted labels')
plt.show()

# Classification report: per-class precision / recall / F1.
targets = ['Class-0', 'Class-1', 'Class-2', 'Class-3', 'Class-4']
print('\n', classification_report(true_labels, pred_labels, target_names=targets))

precision recall f1-score support

Class-0 1.00 0.67 0.80 3

Class-1 0.33 1.00 0.50 1

Class-2 1.00 1.00 1.00 2

Class-3 0.67 0.67 0.67 3

Class-4 1.00 0.50 0.67 2

avg / total 0.85 0.73 0.75 11

8支持向量机

import numpy as np

from sklearn import preprocessing

from sklearn.svm import LinearSVC

from sklearn.multiclass import OneVsOneClassifier

# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20;
# for modern versions use sklearn.model_selection instead.
from sklearn import cross_validation

# Input file containing data (UCI census income dataset).
input_file = 'income_data.txt'

# Read the data, keeping at most max_datapoints rows per class so the
# two income classes stay balanced.
X = []
y = []
count_class1 = 0
count_class2 = 0
max_datapoints = 25000

with open(input_file, 'r') as f:
    # (The line loop and the two truncated conditions below were
    # reconstructed from context — the listing was garbled here.)
    for line in f.readlines():
        if count_class1 >= max_datapoints and count_class2 >= max_datapoints:
            break
        if '?' in line:
            continue
        data = line[:-1].split(', ')
        if data[-1] == '<=50K' and count_class1 < max_datapoints:
            X.append(data)
            count_class1 += 1
        if data[-1] == '>50K' and count_class2 < max_datapoints:
            X.append(data)
            count_class2 += 1

# Convert to numpy array
X = np.array(X)

# Convert string data to numerical data: numeric columns pass through,
# each categorical column gets its own LabelEncoder (kept for later reuse
# on test datapoints).
label_encoder = []
X_encoded = np.empty(X.shape)
for i, item in enumerate(X[0]):
    if item.isdigit():
        X_encoded[:, i] = X[:, i]
    else:
        label_encoder.append(preprocessing.LabelEncoder())
        X_encoded[:, i] = label_encoder[-1].fit_transform(X[:, i])

# Last column is the class label; the rest are features.
X = X_encoded[:, :-1].astype(int)
y = X_encoded[:, -1].astype(int)

# Create SVM classifier (one-vs-one over a linear SVM).
classifier = OneVsOneClassifier(LinearSVC(random_state=0))

# Train the classifier
classifier.fit(X, y)

# Cross validation: retrain on an 80/20 split.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        X, y, test_size=0.2, random_state=5)
classifier = OneVsOneClassifier(LinearSVC(random_state=0))
classifier.fit(X_train, y_train)
y_test_pred = classifier.predict(X_test)

# Compute the F1 score of the SVM classifier
f1 = cross_validation.cross_val_score(classifier, X, y,
        scoring='f1_weighted', cv=3)
print("F1 score: " + str(round(100 * f1.mean(), 2)) + "%")

# Predict output for a test datapoint
input_data = ['37', 'Private', '215646', 'HS-grad', '9', 'Never-married',
              'Handlers-cleaners', 'Not-in-family', 'White', 'Male',
              '0', '0', '40', 'United-States']

# Encode test datapoint: `count` walks through the categorical columns in
# the same order the encoders were fitted above.
input_data_encoded = [-1] * len(input_data)
count = 0
for i, item in enumerate(input_data):
    if item.isdigit():
        input_data_encoded[i] = int(input_data[i])
    else:
        input_data_encoded[i] = int(label_encoder[count].transform([input_data[i]]))
        count += 1
input_data_encoded = np.array(input_data_encoded)

# Run classifier on encoded datapoint and print the decoded class label.
predicted_class = classifier.predict(input_data_encoded.reshape(1, -1))
print('The predict result is:')
print(label_encoder[-1].inverse_transform(predicted_class)[0])

F1 score: 70.82%

The predict result is:

9回归

import numpy as np

from sklearn import linear_model

import sklearn.metrics as sm

import matplotlib.pyplot as plt

# Input file containing data
input_file = '../data/data_singlevar_regr.txt'

# Load the comma-separated data; the last column is the target.
# (The loadtxt line was missing from the garbled listing — without it
# `data` is undefined.)
data = np.loadtxt(input_file, delimiter=',')
X, y = data[:, :-1], data[:, -1]

# Train and test split (80% / 20%)
num_training = int(0.8 * len(X))
num_test = len(X) - num_training

# Training data
X_train, y_train = X[:num_training], y[:num_training]

# Test data
X_test, y_test = X[num_training:], y[num_training:]

# Create linear regressor object
regressor = linear_model.LinearRegression()

# Train the model using the training sets
regressor.fit(X_train, y_train)

# Predict the output
y_test_pred = regressor.predict(X_test)

# Plot outputs
plt.scatter(X_test, y_test, color='green')
plt.plot(X_test, y_test_pred, color='black', linewidth=4)
plt.xticks(())
plt.yticks(())
plt.show()

# Compute performance metrics
print("Linear regressor performance:")
print("Mean absolute error =", round(sm.mean_absolute_error(y_test, y_test_pred), 2))
print("Mean squared error =", round(sm.mean_squared_error(y_test, y_test_pred), 2))
print("Median absolute error =", round(sm.median_absolute_error(y_test, y_test_pred), 2))
print("Explain variance score =", round(sm.explained_variance_score(y_test, y_test_pred), 2))
print("R2 score =", round(sm.r2_score(y_test, y_test_pred), 2))

Linear regressorperformance:

Mean absolute error =0.59

Mean squared error =0.49

Median absolute error =0.51

Explain variance score =0.86

R2 score = 0.86

import pickle

# Model persistence
output_model_file = 'model.pkl'

# Save the trained model to disk.
with open(output_model_file, 'wb') as f:
    pickle.dump(regressor, f)

# Load the model back.
# (The pickle.load line was missing from the garbled listing — without it
# `regressor_model` is undefined.)
with open(output_model_file, 'rb') as f:
    regressor_model = pickle.load(f)

# Perform prediction on test data with the reloaded model.
y_test_pred_new = regressor_model.predict(X_test)
print("\nNew mean absolute error =", round(sm.mean_absolute_error(y_test, y_test_pred_new), 2))

New mean absolute error = 0.59

import numpy as np

from sklearn import linear_model

import sklearn.metrics as sm

from sklearn.preprocessing import PolynomialFeatures

# Input file containing data
input_file = '../data/data_multivar_regr.txt'

# Load the data from the input file; the last column is the target.
# (The loadtxt line was missing from the garbled listing.)
data = np.loadtxt(input_file, delimiter=',')
X, y = data[:, :-1], data[:, -1]

# Split data into training and testing (80% / 20%)
num_training = int(0.8 * len(X))
num_test = len(X) - num_training

# Training data
X_train, y_train = X[:num_training], y[:num_training]

# Test data
X_test, y_test = X[num_training:], y[num_training:]

# Create the linear regressor model
linear_regressor = linear_model.LinearRegression()

# Train the model using the training sets
linear_regressor.fit(X_train, y_train)

# Predict the output
y_test_pred = linear_regressor.predict(X_test)

# Measure performance
print("Linear Regressor performance:")
print("Mean absolute error =", round(sm.mean_absolute_error(y_test, y_test_pred), 2))
print("Mean squared error =", round(sm.mean_squared_error(y_test, y_test_pred), 2))
print("Median absolute error =", round(sm.median_absolute_error(y_test, y_test_pred), 2))
print("Explained variance score =", round(sm.explained_variance_score(y_test, y_test_pred), 2))
print("R2 score =", round(sm.r2_score(y_test, y_test_pred), 2))

# Polynomial regression: expand features to degree-10 polynomial terms
# and fit a linear model on the expanded features.
polynomial = PolynomialFeatures(degree=10)
X_train_transformed = polynomial.fit_transform(X_train)

# Compare both models on a single sample datapoint.
datapoint = [[7.75, 6.35, 5.56]]
poly_datapoint = polynomial.fit_transform(datapoint)

poly_linear_model = linear_model.LinearRegression()
poly_linear_model.fit(X_train_transformed, y_train)
print("\nLinear regression:\n", linear_regressor.predict(datapoint))
print("\nPolynomial regression:\n", poly_linear_model.predict(poly_datapoint))

Linear regression:

Polynomial regression:

10支持向量回归机

from sklearn import datasets

from sklearn.svm import SVR

from sklearn.metrics import mean_squared_error, explained_variance_score

from sklearn.utils import shuffle

# Load the dataset.
# (The load line was missing from the garbled listing; the 13-feature test
# datapoint below matches the Boston housing data used by the book.
# NOTE(review): load_boston was removed in scikit-learn 1.2 — substitute
# another regression dataset on modern versions.)
data = datasets.load_boston()

# Shuffle the data so the sequential split below is not order-dependent.
X, y = shuffle(data.data, data.target, random_state=7)

# Split the data into training and testing datasets (80% / 20%).
num_training = int(0.8 * len(X))
X_train, y_train = X[:num_training], y[:num_training]
X_test, y_test = X[num_training:], y[num_training:]

# Create Support Vector Regression model with a linear kernel.
sv_regressor = SVR(kernel='linear', C=1.0, epsilon=0.1)

# Train Support Vector Regressor
sv_regressor.fit(X_train, y_train)

# Evaluate performance of Support Vector Regressor
y_test_pred = sv_regressor.predict(X_test)
mse = mean_squared_error(y_test, y_test_pred)
evs = explained_variance_score(y_test, y_test_pred)
print("\n#### Performance ####")
print("Mean squared error =", round(mse, 2))
print("Explained variance score =", round(evs, 2))

# Test the regressor on a single test datapoint.
test_data = [3.7, 0, 18.4, 1, 0.87, 5.95, 91, 2.5052, 26, 666, 20.2, 351.34, 15.27]
print("\nPredicted price:", sv_regressor.predict([test_data])[0])

[1]普拉提克·乔希（Prateek Joshi）,人工智能：Python实现(影印版)(英文版),东南大学出版社, 2017.

PS：本节内容由陆工大硕士研究生张洋硕翻译，王一鸣校对。

• 发表于:
• 原文链接https://kuaibao.qq.com/s/20180518G1NV7000?refer=cp_1026
• 腾讯「腾讯云开发者社区」是腾讯内容开放平台帐号（企鹅号）传播渠道之一，根据《腾讯内容开放平台服务协议》转载发布内容。
• 如有侵权，请联系 cloudcommunity@tencent.com 删除。

2018-06-14

2018-01-27

2018-06-13

2018-10-23

2021-05-12

2018-01-26

2023-07-16

2019-02-01

2018-06-13

2018-05-30

2021-05-21

2023-01-03

2018-01-26

2018-12-19

2018-12-21

2018-12-11

2019-02-02

2023-10-05

2020-01-14

2018-08-09