# As usual we will start by importing the modules that we need
# This set will get us started, but you will need to add
# others.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.preprocessing import label_binarize
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
# We can now access the dataset
wine = load_wine()
X = wine.data
y = wine.target
nclasses = len(wine.target_names)
colors = "bry"
h = .02 # step size in the mesh
# Split the data into a training set and a test set - this is
# where your implementation will need to start. Maybe you
# will need to work with the train_test_split( ... ) function
X_train, X_test, y_train, y_test = train_test_split(X, y)
# Next we can look at the cross validation. Remember we are
# selecting the two best features through this process.
# Take a look at tutorial 4 for an example implementation
best_performance = 0
best_mean_f1 = 0
best_f1 = 0
best_f2 = 0
for f1 in range(0, 13):
    for f2 in range(0, 13):
        # We need to store the best performance and the features that achieved it.
        # We want 2 features, not 1
        if f1 == f2:
            continue
        features_idx_to_use = [f1, f2]
        clf = SGDClassifier(alpha=0.001, max_iter=100, random_state=42) #COMPLETE
        clf.fit(X_train, y_train) #COMPLETE
        # Return the predictions for the 3-fold cross-validation
        y_predicted = cross_val_predict(clf, X_train, y_train) #COMPLETE
        # Construct the confusion matrices
        conf_mat_train = confusion_matrix(y_train, y_predicted) #COMPLETE
        # Print out the recall, precision and F1 scores
        # There will be a value for each class
        # CV Train
        print("CV Train:", f1, ":", f2, " - ", recall_score(y_train, y_predicted, average=None)) #COMPLETE
        print("CV Train:", f1, ":", f2, " - ", precision_score(y_train, y_predicted, average=None)) #COMPLETE
        print("CV Train:", f1, ":", f2, " - ", f1_score(y_train, y_predicted, average=None)) #COMPLETE
        # Now we need to store the result
        current_f1 = np.mean(f1_score(y_train, y_predicted, average=None)) #COMPLETE
        if current_f1 > best_mean_f1:
            best_f1 = f1
            best_f2 = f2
            best_mean_f1 = current_f1
            best_clf = clf
# Plot the best performing features to visualise them. This will allow us to
# sanity check our cross validation.
fig, ax = plt.subplots(figsize=(10,10))
ax.scatter(X_train[:,best_f1], X_train[:,best_f2], c=y_train, cmap='rainbow', alpha=0.3, s=30)
# Once you have selected the best performing set of features
# in the cross-validation, we can test the best performing
# classifier
y_test_predicted = best_clf.predict(X_test)
conf_mat_test = confusion_matrix(y_test, y_test_predicted)
# Now we can plot a ROC curve and calculate the AUC
y_score = cross_val_predict(clf, X, y, cv=3, method="decision_function")
y_test_bin = label_binarize(y, classes=[0, 1, 2])
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(nclasses):
    fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])
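# One way to actually draw the ROC curves computed above; a sketch added for
# illustration only (the original code stops at computing fpr/tpr/auc):
for i in range(nclasses):
    plt.plot(fpr[i], tpr[i], label=f"class {i} (AUC = {roc_auc[i]:.2f})")
plt.plot([0, 1], [0, 1], ls="--", color="grey")  # chance-level diagonal
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend()
plt.show()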
# Once we have finished with the performance, we can plot
# the classifier boundaries and test points. This one is
# left to you ;)
# create a mesh to plot in
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))
fig, ax = plt.subplots(figsize=(10,10))
# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max].
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
# Put the result into a color plot
Z = Z.reshape(xx.shape)
cs = ax.contourf(xx, yy, Z, cmap=plt.cm.Paired)
ax.axis('tight')
# Plot also the training points
for i, color in zip(clf.classes_, colors):
    idx = np.where(y == i)
    plt.scatter(X[idx, 0], X[idx, 1], c=color, label=wine.target_names[i],
                cmap=plt.cm.Paired, edgecolor='black', s=20)
plt.title("Decision surface of multi-class SGD")
plt.axis('tight')
# Plot the three one-against-all classifiers
xmin, xmax = plt.xlim()
ymin, ymax = plt.ylim()
coef = clf.coef_
intercept = clf.intercept_
# Let's make a function to plot the hyperplanes used by the SVM for
# classification.
def plot_hyperplane(c, color):
    def line(x0):
        return (-(x0 * coef[c, 0]) - intercept[c]) / coef[c, 1]
    plt.plot([xmin, xmax], [line(xmin), line(xmax)],
             ls="--", color=color)
for i, color in zip(clf.classes_, colors):
    plot_hyperplane(i, color)
plt.legend()
plt.show()
I get this error: ValueError: X has 2 features per sample; expecting 13
It is raised at the line Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]).
Also, when I print the recall scores I get repeated output: the same three numbers over and over for every feature pair, when I would expect the values to change as the selected features change.
Posted on 2021-08-27 20:21:05
Your code is correct up to the point where roc_auc is computed. The error arises because the classifier was fitted on all 13 features, so Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]) expects input of shape (102723, 13), whereas the mesh data actually passed to clf.predict has shape (102723, 2).
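You can see the mismatch directly with a quick check (not in the original answer; for a multi-class SGDClassifier, coef_ has one row per class and one column per training feature):
print(np.c_[xx.ravel(), yy.ravel()].shape)  # (102723, 2): two mesh columns
print(clf.coef_.shape)                      # (3, 13): 3 classes x 13 training features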
What you need to do is train the classifier on 2 features only.
To do that, just pick 2 of the features; you cannot plot a 13-dimensional decision surface anyway. After choosing the two features, use only them for the decision-surface visualisation. This also explains why your recall printout repeats the same three numbers: the loop builds features_idx_to_use but never applies it, so clf.fit(X_train, y_train) trains on all 13 features in every iteration and the scores cannot change.
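Applied to the question's selection loop, the fit and the cross-validated predictions would both use the sliced columns; a minimal sketch reusing the question's variable names (not part of the original answer):
X_pair = X_train[:, features_idx_to_use]  # keep only the two candidate feature columns
clf = SGDClassifier(alpha=0.001, max_iter=100, random_state=42)
clf.fit(X_pair, y_train)
y_predicted = cross_val_predict(clf, X_pair, y_train, cv=3)  # 3-fold CV on the same two columns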
So the changed code section becomes:
# Once we have finished on the performance, we can plot the
# the classifier boundaries and test points. This one is
# letf to you ;)
plt.rcParams["figure.figsize"] = (12,5)
# Train on two features
wine = load_wine()
X_new = wine.data
y_new = wine.target
X_new_two = X_new[:, :2]
y_new_two = y_new
print(f'X_new_two.shape={X_new_two.shape}')
print(f'y_new_two.shape={y_new_two.shape}')
# create a mesh to plot in
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))
fig, ax = plt.subplots(figsize=(10,10))
clf = SGDClassifier(alpha=0.001, max_iter=100, random_state=42)
clf.fit(X_new_two, y_new_two)
# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max].
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
# Put the result into a color plot
Z = Z.reshape(xx.shape)
cs = ax.contourf(xx, yy, Z, cmap=plt.cm.Paired)
ax.axis('tight')
# Plot also the training points
for i, color in zip(clf.classes_, colors):
    idx = np.where(y == i)
    plt.scatter(X[idx, 0], X[idx, 1], c=color, label=wine.target_names[i],
                cmap=plt.cm.Paired, edgecolor='black', s=20)
plt.title("Decision surface of multi-class SGD")
plt.axis('tight')
# Plot the three one-against-all classifiers
xmin, xmax = plt.xlim()
ymin, ymax = plt.ylim()
coef = clf.coef_
intercept = clf.intercept_
# Let's make a function to plot the hyperplanes used by the SVM for
# classification.
def plot_hyperplane(c, color):
    def line(x0):
        return (-(x0 * coef[c, 0]) - intercept[c]) / coef[c, 1]
    plt.plot([xmin, xmax], [line(xmin), line(xmax)],
             ls="--", color=color)
for i, color in zip(clf.classes_, colors):
    plot_hyperplane(i, color)
plt.legend()
plt.show()
Output: (figure: "Decision surface of multi-class SGD")
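As a follow-up: the snippet above trains on the first two columns (X_new[:, :2]) for simplicity. If you want the surface for the pair your cross-validation actually selected, the same code works with a different slice; a sketch assuming best_f1 and best_f2 from the question's loop:
X_new_two = X_new[:, [best_f1, best_f2]]  # the winning pair instead of columns 0 and 1
x_min, x_max = X_new_two[:, 0].min() - 1, X_new_two[:, 0].max() + 1
y_min, y_max = X_new_two[:, 1].min() - 1, X_new_two[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))
clf = SGDClassifier(alpha=0.001, max_iter=100, random_state=42)
clf.fit(X_new_two, y_new)
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])  # 2 features in, matching the 2-feature fit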

https://stackoverflow.com/questions/68955335