# As usual we will start by importing the modules that we need
# This set will get us started, but you will need to add
# others.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.preprocessing import label_binarize
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
# We can now access the dataset
wine = load_wine()
X = wine.data
y = wine.target
nclasses = len(wine.target_names)
colors = "bry"
h = .02 # step size in the mesh
# Split the data into a training set and a test set - this is
# where your implementation will need to start. Maybe you
# will need to work with the train_test_split( ... ) function
X_train, X_test, y_train, y_test = train_test_split(X, y)
# Next we can look at the cross validation. Remember we are
# selecting the two best features through this process.
# Take a look at tutorial 4 for an example implementation
best_performance = 0
best_mean_f1 = 0
best_f1 = 0
best_f2 = 0
for f1 in range(0, 13):
    for f2 in range(0, 13):
        # We need to store the best performance and the features that achieved it.
        # We want 2 features, not 1
        if f1 == f2:
            continue
        features_idx_to_use = [f1, f2]
        clf = SGDClassifier(alpha=0.001, max_iter=100, random_state=42) #COMPLETE
        clf.fit(X_train, y_train) #COMPLETE
        # Return the predictions for the 3-fold cross-validation
        y_predicted = cross_val_predict(clf, X_train, y_train) #COMPLETE
        # Construct the confusion matrices
        conf_mat_train = confusion_matrix(y_train, y_predicted) #COMPLETE
        # Print out the recall, precision and F1 scores
        # There will be a value for each class
        # CV Train
        print("CV Train:", f1, ":", f2, " - ", recall_score(y_train, y_predicted, average=None)) #COMPLETE
        print("CV Train:", f1, ":", f2, " - ", precision_score(y_train, y_predicted, average=None)) #COMPLETE
        print("CV Train:", f1, ":", f2, " - ", f1_score(y_train, y_predicted, average=None)) #COMPLETE
        # Now we need to store the result
        current_f1 = np.mean(f1_score(y_train, y_predicted, average=None)) #COMPLETE
        if current_f1 > best_mean_f1:
            best_f1 = f1
            best_f2 = f2
            best_mean_f1 = current_f1
            best_clf = clf
# Plot the best performing features to visualise them. This will allow us to
# sanity check our cross validation.
fig, ax = plt.subplots(figsize=(10,10))
ax.scatter(X_train[:,best_f1], X_train[:,best_f2], c=y_train, cmap='rainbow', alpha=0.3, s=30)
# Once you have selected the best performing set of features
# in the cross-validation, we can test the best performing
# classifier
y_test_predicted = best_clf.predict(X_test)
conf_mat_test = confusion_matrix(y_test, y_test_predicted)
# Now we can plot a ROC curve and calculate the AUC
y_score = cross_val_predict(clf, X, y, cv=3, method="decision_function")
y_test_bin = label_binarize(y, classes=[0, 1, 2])
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(nclasses):
    fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])
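# One way to actually draw the ROC curves computed above; a sketch added for
# illustration only (the original code stops at computing fpr/tpr/auc):
for i in range(nclasses):
    plt.plot(fpr[i], tpr[i], label=f"class {i} (AUC = {roc_auc[i]:.2f})")
plt.plot([0, 1], [0, 1], ls="--", color="grey")  # chance-level diagonal
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend()
plt.show()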
# Once we have finished with the performance, we can plot
# the classifier boundaries and test points. This one is
# left to you ;)
# create a mesh to plot in
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))
fig, ax = plt.subplots(figsize=(10,10))
# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max].
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
# Put the result into a color plot
Z = Z.reshape(xx.shape)
cs = ax.contourf(xx, yy, Z, cmap=plt.cm.Paired)
ax.axis('tight')
# Plot also the training points
for i, color in zip(clf.classes_, colors):
    idx = np.where(y == i)
    plt.scatter(X[idx, 0], X[idx, 1], c=color, label=wine.target_names[i],
                cmap=plt.cm.Paired, edgecolor='black', s=20)
plt.title("Decision surface of multi-class SGD")
plt.axis('tight')
# Plot the three one-against-all classifiers
xmin, xmax = plt.xlim()
ymin, ymax = plt.ylim()
coef = clf.coef_
intercept = clf.intercept_
# Let's make a function to plot the hyperplanes used by the SVM for
# classification.
def plot_hyperplane(c, color):
    def line(x0):
        return (-(x0 * coef[c, 0]) - intercept[c]) / coef[c, 1]
    plt.plot([xmin, xmax], [line(xmin), line(xmax)],
             ls="--", color=color)
for i, color in zip(clf.classes_, colors):
    plot_hyperplane(i, color)
plt.legend()
plt.show()
I get this error: ValueError: X has 2 features per sample; expecting 13
It is raised at the line Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]).
Also, when I print the recall scores I get repeated output: the same three numbers over and over for every feature pair, when I would expect the values to change as the selected features change.
Posted on 2021-08-27 20:21:05
Your code is correct up to the point where roc_auc is computed. The error arises because the classifier was fitted on all 13 features, so Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]) expects input of shape (102723, 13), whereas the mesh data actually passed to clf.predict has shape (102723, 2).
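You can see the mismatch directly with a quick check (not in the original answer; for a multi-class SGDClassifier, coef_ has one row per class and one column per training feature):
print(np.c_[xx.ravel(), yy.ravel()].shape)  # (102723, 2): two mesh columns
print(clf.coef_.shape)                      # (3, 13): 3 classes x 13 training features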
What you need to do is train the classifier on 2 features only.
To do that, just pick 2 of the features; you cannot plot a 13-dimensional decision surface anyway. After choosing the two features, use only them for the decision-surface visualisation. This also explains why your recall printout repeats the same three numbers: the loop builds features_idx_to_use but never applies it, so clf.fit(X_train, y_train) trains on all 13 features in every iteration and the scores cannot change.
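Applied to the question's selection loop, the fit and the cross-validated predictions would both use the sliced columns; a minimal sketch reusing the question's variable names (not part of the original answer):
X_pair = X_train[:, features_idx_to_use]  # keep only the two candidate feature columns
clf = SGDClassifier(alpha=0.001, max_iter=100, random_state=42)
clf.fit(X_pair, y_train)
y_predicted = cross_val_predict(clf, X_pair, y_train, cv=3)  # 3-fold CV on the same two columns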
So the changed code section becomes:
# Once we have finished on the performance, we can plot the
# the classifier boundaries and test points. This one is
# letf to you ;)
plt.rcParams["figure.figsize"] = (12,5)
# Train on two features
wine = load_wine()
X_new = wine.data
y_new = wine.target
X_new_two = X_new[:, :2]
y_new_two = y_new
print(f'X_new_two.shape={X_new_two.shape}')
print(f'y_new_two.shape={y_new_two.shape}')
# create a mesh to plot in
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))
fig, ax = plt.subplots(figsize=(10,10))
clf = SGDClassifier(alpha=0.001, max_iter=100, random_state=42)
clf.fit(X_new_two, y_new_two)
# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max].
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
# Put the result into a color plot
Z = Z.reshape(xx.shape)
cs = ax.contourf(xx, yy, Z, cmap=plt.cm.Paired)
ax.axis('tight')
# Plot also the training points
for i, color in zip(clf.classes_, colors):
    idx = np.where(y == i)
    plt.scatter(X[idx, 0], X[idx, 1], c=color, label=wine.target_names[i],
                cmap=plt.cm.Paired, edgecolor='black', s=20)
plt.title("Decision surface of multi-class SGD")
plt.axis('tight')
# Plot the three one-against-all classifiers
xmin, xmax = plt.xlim()
ymin, ymax = plt.ylim()
coef = clf.coef_
intercept = clf.intercept_
# Let's make a function to plot the hyperplanes used by the SVM for
# classification.
def plot_hyperplane(c, color):
    def line(x0):
        return (-(x0 * coef[c, 0]) - intercept[c]) / coef[c, 1]
    plt.plot([xmin, xmax], [line(xmin), line(xmax)],
             ls="--", color=color)
for i, color in zip(clf.classes_, colors):
    plot_hyperplane(i, color)
plt.legend()
plt.show()
Output: (figure: "Decision surface of multi-class SGD")
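As a follow-up: the snippet above trains on the first two columns (X_new[:, :2]) for simplicity. If you want the surface for the pair your cross-validation actually selected, the same code works with a different slice; a sketch assuming best_f1 and best_f2 from the question's loop:
X_new_two = X_new[:, [best_f1, best_f2]]  # the winning pair instead of columns 0 and 1
x_min, x_max = X_new_two[:, 0].min() - 1, X_new_two[:, 0].max() + 1
y_min, y_max = X_new_two[:, 1].min() - 1, X_new_two[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))
clf = SGDClassifier(alpha=0.001, max_iter=100, random_state=42)
clf.fit(X_new_two, y_new)
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])  # 2 features in, matching the 2-feature fit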

https://stackoverflow.com/questions/68955335