
SGD classification machine learning error

Stack Overflow user
Asked on 2021-08-27 14:43:14
1 answer · 16 views · 0 followers · 0 votes

# As usual we will start by importing the modules that we need
# This set will get us started, but you will need to add
# others.

import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.preprocessing import label_binarize
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split


# We can now access the dataset

wine = load_wine()
X = wine.data
y = wine.target
nclasses = len(wine.target_names)
colors = "bry"
h = .02  # step size in the mesh

# Split the data into a training set and a test set - this is
# where your implementation will need to start. Maybe you  
# will need to work with the train_test_split( ... ) function

X_train, X_test, y_train, y_test =  train_test_split(X, y)

# Next we can look at the cross validation. Remember we are
# selecting the two best features  through this process.
# Take a look at tutorial 4 for an example implementation

best_performance = 0
best_mean_f1 = 0
best_f1 = 0
best_f2 = 0

for f1 in range(0,13):
    for f2 in range(0, 13):
#We need to store the best performance and the features that achieved them.
        # We want 2 features, not 1
        if f1 == f2:
            continue
        
        features_idx_to_use = [f1,f2]
        
        clf = SGDClassifier(alpha=0.001, max_iter=100, random_state=42)       #COMPLETE
        clf.fit(X_train, y_train)   #COMPLETE
        
        # Return the predictions for the 3-fold cross-validation
        
        y_predicted = cross_val_predict(clf, X_train, y_train ) #COMPLETE
        
        # Construct the confusion matrices
        conf_mat_train = confusion_matrix(y_train, y_predicted) #COMPLETE
        
        # Print out the recall, precision and F1 scores
        # There will be a value for each class
        # CV Train
        print("CV Train:",f1,":",f2," - ", recall_score(y_train,y_predicted,average=None))  #COMPLETE
        print("CV Train:",f1,":",f2," - ",precision_score(y_train,y_predicted,average=None)) #COMPLETE
        print("CV Train:",f1,":",f2," - ",f1_score(y_train,y_predicted,average=None))        #COMPLETE

        # Now we need to store the result 
        current_f1 = np.mean(f1_score(y_train,y_predicted,average=None)) #COMPLETE
        if current_f1 > best_mean_f1:
            best_f1 = f1
            best_f2 = f2
            best_mean_f1 = current_f1
            best_clf = clf
              
# Plot the best performing features to visualise them. This will allow us to
# sanity check our cross validation.              
fig, ax = plt.subplots(figsize=(10,10))
ax.scatter(X_train[:,best_f1], X_train[:,best_f2], c=y_train, cmap='rainbow', alpha=0.3, s=30)
        
        
        
# Once you have selected the best performing set of features
# in the cross-validation, we can test the best performing
# classifier
        
y_test_predicted = best_clf.predict(X_test)
conf_mat_test = confusion_matrix(y_test, y_test_predicted)
        
# Now we can plot a ROC curve and calculate the AUC    
y_score = cross_val_predict(clf, X, y, cv=3,method="decision_function")
y_test_bin = label_binarize(y, classes=[0, 1, 2]) 
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(nclasses):
    fpr[i], tpr[i], _ = roc_curve(y_test_bin[:,i], y_score[:,i])
    roc_auc[i] = auc(fpr[i], tpr[i])
    
# Once we have finished with the performance, we can plot
# the classifier boundaries and test points. This one is
# left to you ;)
    
# create a mesh to plot in
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))

fig, ax = plt.subplots(figsize=(10,10))

# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max].
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
# Put the result into a color plot
Z = Z.reshape(xx.shape)
cs = ax.contourf(xx, yy, Z, cmap=plt.cm.Paired)
ax.axis('tight')

# Plot also the training points
for i, color in zip(clf.classes_, colors):
    idx = np.where(y == i)
    plt.scatter(X[idx, 0], X[idx, 1], c=color, label=wine.target_names[i],
                cmap=plt.cm.Paired, edgecolor='black', s=20)
plt.title("Decision surface of multi-class SGD")
plt.axis('tight')


# Plot the three one-against-all classifiers
xmin, xmax = plt.xlim()
ymin, ymax = plt.ylim()
coef = clf.coef_
intercept = clf.intercept_

# Let's make a function to plot the hyperplanes used by the SVM for
# classification.

def plot_hyperplane(c, color):
    def line(x0):
        return (-(x0 * coef[c, 0]) - intercept[c]) / coef[c, 1]

    plt.plot([xmin, xmax], [line(xmin), line(xmax)],
             ls="--", color=color)

for i, color in zip(clf.classes_, colors):
    plot_hyperplane(i, color)
plt.legend()
plt.show()

I am getting this error: ValueError: X has 2 features per sample; expecting 13

It points at line 118, which is Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]).

Also, when I print the recall scores I get repeated output: the same 3 numbers again and again, where I would expect the loop to run through and produce different values for different feature pairs.
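(The repeated scores trace back to the loop itself: features_idx_to_use is built but never used, so every iteration fits on all 13 columns and prints identical numbers. A minimal sketch of actually slicing the candidate pair inside the loop, reusing the variable names above:)

# Hypothetical fix: restrict fitting and cross-validation to the two
# candidate columns, otherwise every (f1, f2) pair sees identical data
X_train_pair = X_train[:, features_idx_to_use]

clf = SGDClassifier(alpha=0.001, max_iter=100, random_state=42)
clf.fit(X_train_pair, y_train)

# cv=3 matches the 3-fold comment in the code above
y_predicted = cross_val_predict(clf, X_train_pair, y_train, cv=3)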


1 Answer

Stack Overflow user
Answered on 2021-08-27 20:21:05

Your code is correct up to the roc_auc computation. The cause of the error is that Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]) expects input of shape (102723, 13), while the data passed in the clf.predict call has shape (102723, 2).
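A quick way to see the mismatch right before the failing call (the row count depends on the mesh step h, and grid is just an illustrative name):

grid = np.c_[xx.ravel(), yy.ravel()]
print(grid.shape)       # e.g. (102723, 2) -- the mesh supplies only two coordinates
print(clf.coef_.shape)  # (3, 13) -- the classifier was fitted on all 13 features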

What you need to do is train the classifier using 2 features as input.

To do this, just pick 2 features; the reason is that you cannot draw a 13-dimensional plot. After selecting those two features, use only them for the decision-surface visualisation.

So the changed part of the code:

# Once we have finished with the performance, we can plot
# the classifier boundaries and test points. This one is
# left to you ;)

plt.rcParams["figure.figsize"] = (12,5)
# Train on two features
wine = load_wine()
X_new = wine.data
y_new = wine.target
X_new_two=X_new[:,:2]
y_new_two=y_new
print(f'X_new_two.shape={X_new_two.shape}')
print(f'y_new_two.shape={y_new_two.shape}')
# create a mesh to plot in
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))

fig, ax = plt.subplots(figsize=(10,10))

clf = SGDClassifier(alpha=0.001, max_iter=100, random_state=42)  
clf.fit(X_new_two, y_new_two)
# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max].
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
# Put the result into a color plot
Z = Z.reshape(xx.shape)
cs = ax.contourf(xx, yy, Z, cmap=plt.cm.Paired)
ax.axis('tight')

# Plot also the training points
for i, color in zip(clf.classes_, colors):
    idx = np.where(y == i)
    plt.scatter(X[idx, 0], X[idx, 1], c=color, label=wine.target_names[i],
                cmap=plt.cm.Paired, edgecolor='black', s=20)
plt.title("Decision surface of multi-class SGD")
plt.axis('tight')


# Plot the three one-against-all classifiers
xmin, xmax = plt.xlim()
ymin, ymax = plt.ylim()
coef = clf.coef_
intercept = clf.intercept_

# Let's make a function to plot the hyperplanes used by the SVM for
# classification.

def plot_hyperplane(c, color):
    def line(x0):
        return (-(x0 * coef[c, 0]) - intercept[c]) / coef[c, 1]

    plt.plot([xmin, xmax], [line(xmin), line(xmax)],
             ls="--", color=color)

for i, color in zip(clf.classes_, colors):
    plot_hyperplane(i, color)
plt.legend()
plt.show()

Output: [decision-surface plot of the three wine classes with the three one-vs-all hyperplanes]
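If you would rather plot the pair found by the cross-validation instead of the first two columns, a variant along these lines should work (it assumes best_f1 and best_f2 from the selection loop in the question, and refits a fresh classifier on just those columns):

# Refit on the cross-validated best pair (best_f1/best_f2 come from the
# question's selection loop) and rebuild the mesh from those two columns
X_pair = X[:, [best_f1, best_f2]]

clf_vis = SGDClassifier(alpha=0.001, max_iter=100, random_state=42)
clf_vis.fit(X_pair, y)

x_min, x_max = X_pair[:, 0].min() - 1, X_pair[:, 0].max() + 1
y_min, y_max = X_pair[:, 1].min() - 1, X_pair[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))
Z = clf_vis.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

The rest of the plotting code can then stay as above, with X_pair and clf_vis in place of X_new_two and clf.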

Votes: 0
Source: https://stackoverflow.com/questions/68955335