前往小程序,Get更优阅读体验!
立即前往
首页
学习
活动
专区
工具
TVP
发布
社区首页 >专栏 >根据已给字符数据,训练逻辑回归、随机森林、SVM,生成ROC和箱线图

根据已给字符数据,训练逻辑回归、随机森林、SVM,生成ROC和箱线图

作者头像
MachineLP
发布2018-01-09 15:43:38
1.7K0
发布2018-01-09 15:43:38
举报
文章被收录于专栏:小鹏的专栏小鹏的专栏

代码下载:here。

已知训练数据如下:

预处理代码如下:

代码语言:javascript
复制
# -*- coding: utf-8 -*-
"""
Created on 2017 11.17
@author: liupeng
"""

import pandas as pd  
import numpy as np  
from sklearn.preprocessing import LabelEncoder  
from sklearn.preprocessing import StandardScaler  
  
train_data = pd.read_csv("bank-full.csv")  

train_data = train_data['age;"job";"marital";"education";"default";"balance";"housing";"loan";"contact";"day";"month";"duration";"campaign";"pdays";"previous";"poutcome";"y"'].str.split(';',expand=True)  

text = '"job";"marital";"education";"default";"balance";"housing";"loan";"contact";"day";"month";"duration";"campaign";"pdays";"previous";"poutcome";"y"'
text = str(text)
text = text.split(';')

train_data = pd.DataFrame(train_data.values[0::, 1::], index=train_data.values[0::, 0].astype(np.int), columns = text)  
train_data.index.name = "age" 
train_data.to_csv("input.csv")  

# print (train_data)


train_data = pd.read_csv("input.csv")  

data = []
y = train_data['age']
data.append(y)
train_data['age'] = y
y = train_data['"job"']
y = LabelEncoder().fit(y).transform(y)  
data.append(y)
train_data['"job"'] = y
y = train_data['"marital"']
y = LabelEncoder().fit(y).transform(y)  
data.append(y)
train_data['"marital"'] = y
y = train_data['"education"']
y = LabelEncoder().fit(y).transform(y)  
data.append(y)
train_data['"education"'] = y
y = train_data['"default"']
y = LabelEncoder().fit(y).transform(y)  
data.append(y)
train_data['"default"'] = y
y = train_data['"balance"']
data.append(y)
train_data['"balance"'] = y
y = train_data['"housing"']
y = LabelEncoder().fit(y).transform(y)  
data.append(y)
train_data['"housing"'] = y
y = train_data['"loan"']
y = LabelEncoder().fit(y).transform(y)  
data.append(y)
train_data['"loan"'] = y
y = train_data['"contact"']
y = LabelEncoder().fit(y).transform(y)  
data.append(y)
train_data['"contact"'] = y
y = train_data['"day"']
data.append(y)
train_data['"day"'] = y
y = train_data['"month"']
y = LabelEncoder().fit(y).transform(y)  
data.append(y)
train_data['"month"'] = y
y = train_data['"duration"']
data.append(y)
train_data['"duration"'] = y
y = train_data['"campaign"']
data.append(y)
train_data['"campaign"'] = y
y = train_data['"pdays"']
data.append(y)
train_data['"pdays"'] = y
y = train_data['"previous"']
data.append(y)
train_data['"previous"'] = y
y = train_data['"poutcome"']
y = LabelEncoder().fit(y).transform(y)  
data.append(y)
train_data['"poutcome"'] = y
y = train_data['"y"']
y = LabelEncoder().fit(y).transform(y)  
data.append(y)
train_data['"y"'] = y

# 将每一列数据进行归一化
train_data = train_data.apply(lambda x: (x - np.min(x)) / (np.max(x) - np.min(x)))  #方法一  
print ('train_data.values[0::, 0]', train_data.values[0::, 0])

train_data = pd.DataFrame(train_data.values[0::, 1::], index=train_data.values[0::, 0], columns = text)  
train_data.index.name = "age" 
train_data.to_csv("train_data.csv")  

print (train_data)

训练测试模型代码如下:

代码语言:javascript
复制
# -*- coding: utf-8 -*-
"""
Created on 2017 11.17
@author: liupeng
"""

import pandas as pd  
import numpy as np  
from sklearn.preprocessing import LabelEncoder  
from sklearn.preprocessing import StandardScaler  


from sklearn import cross_validation
from sklearn.metrics import roc_curve, auc  
from scipy import interp
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
# ROC曲线。
def c_roc(y_pred, y_test):
    mean_tpr = 0.0  
    mean_fpr = np.linspace(0, 1, 1000)  
    all_tpr = [] 
    fpr, tpr, thresholds = roc_curve(y_test, y_pred )
    mean_tpr += interp(mean_fpr, fpr, tpr)          #对mean_tpr在mean_fpr处进行插值,通过scipy包调用interp()函数  
    mean_tpr[0] = 0.0                               #初始处为0  
    roc_auc = auc(fpr, tpr)  
    plt.plot(fpr, tpr, lw=1, label='LogisticRegression %d (area = %0.2f)' % (0, roc_auc)) 
    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')  
    plt.xlim([-0.05, 1.05])  
    plt.ylim([-0.05, 1.05])  
    plt.xlabel('False Positive Rate')  
    plt.ylabel('True Positive Rate')  
    plt.title('Receiver operating characteristic example')  
    plt.legend(loc="lower right")  
    plt.show() 
# 箱线图。
def c_boxplot(data):

    # Creating boxplots 
    fig = plt.figure(1, figsize=(9, 6))
    # Create an axes instance
    ax = fig.add_subplot(111)
    # 合并 X, y。
    #data = np.hstack((X,np.reshape(y,[-1, 1])))
    bp = ax.boxplot(data, patch_artist=True)
    #plt.boxplot(X, sym="o", whis=1.5)
    
    for box in bp['boxes']:
        # change outline color
        box.set( color='#7570b3', linewidth=2)
        # change fill color
        box.set( facecolor = '#1b9e77' )
    
    ## change color and linewidth of the whiskers
    for whisker in bp['whiskers']:
        whisker.set(color='#7570b3', linewidth=2)
    
    ## change color and linewidth of the caps
    for cap in bp['caps']:
        cap.set(color='#7570b3', linewidth=2)
    
    ## change color and linewidth of the medians
    for median in bp['medians']:
        median.set(color='#b2df8a', linewidth=2)
    
    ## change the style of fliers and their fill
    for flier in bp['fliers']:
        flier.set(marker='o', color='#e7298a', alpha=0.5)
    
    # Save the figure
    # fig.savefig('fig1.png', bbox_inches='tight')
    plt.show()


###获取数据和训练---------------------------------------------------------###
# 从 csv中读取训练数据和标签。
train_data = pd.read_csv("train_data.csv")  
print (train_data)

# 分离 训练样本 和 标签。
X = train_data.values[0::, 0:16]
y = train_data.values[0::, 16].astype(np.int)

# 仅取 5000 个样本。
X = X[40000:45000]
y = y[40000:45000]

print (X)
print (y)


# 5折交叉验证。
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size = 0.5, random_state = 0)
print (len(X_train), len(X_test), len(y_train), len(y_test))
#使用6折交叉验证,并且画ROC曲线,也可以用下面产生5折交叉验证,产生5个集合。
#cv = StratifiedKFold(y, n_folds=5) 

################LogisticRegression  
from sklearn.linear_model import LogisticRegression   
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
# 预测
y_pred = classifier.predict(X_test)
y_train_pred = classifier.predict(X_train)
# 计算得分
scores = classifier.score(X_test , y_test)
print ('scores:', scores)

# 画ROC曲线
c_roc(y_pred, y_test)

# 合并 X, y。
data = np.hstack((X,np.reshape(y,[-1, 1])))
# 画 boxplot
c_boxplot(data)

# 计算混淆矩阵
confusion_matrix=confusion_matrix(y_test,y_pred)
print (confusion_matrix)
# 显示混淆矩阵
plt.matshow(confusion_matrix)
plt.title(u'混淆矩阵')
plt.colorbar()
plt.ylabel(u'实际类型')
plt.xlabel(u'预测类型')
plt.show()


###-------------------------------------------------------------------------------------###
# 以上的交叉验证只需要下面即可。
from sklearn.cross_validation import cross_val_score
# 在X,y原始数据上,计算5折交叉验证的准确率
classifier = LogisticRegression()

classifier.fit(X_train, y_train)
scores = classifier.score(X_test , y_test)
scores1 = np.mean(scores)
print ('LogisticRegression Accuracy:',np.mean(scores), scores)
y_pred = classifier.predict(X_test)
# 合并 X, y。
data = np.hstack((X_test,np.reshape(y_test,[-1, 1]), np.reshape(y_pred,[-1, 1])))
# 画 boxplot
c_boxplot(data)


################AdaBoostClassifier 
from sklearn.ensemble import AdaBoostClassifier  

classifier = AdaBoostClassifier(n_estimators=100) #迭代100次  

classifier.fit(X_train, y_train)
scores = classifier.score(X_test , y_test)
scores2 = np.mean(scores)
print ('AdaBoostClassifier Accuracy :',np.mean(scores), scores)
y_pred = classifier.predict(X_test)
# 合并 X, y。
data = np.hstack((X_test,np.reshape(y_test,[-1, 1]), np.reshape(y_pred,[-1, 1])))
# 画 boxplot
c_boxplot(data)


################RandomForestClassifier 
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=100) #迭代100次  

classifier.fit(X_train, y_train)
scores = classifier.score(X_test , y_test)
scores3 = np.mean(scores)
print ('RandomForestClassifier Accuracy :',np.mean(scores), scores)   
y_pred = classifier.predict(X_test)
# 合并 X, y。
data = np.hstack((X_test,np.reshape(y_test,[-1, 1]), np.reshape(y_pred,[-1, 1])))
# 画 boxplot
c_boxplot(data)


###############radial basis functions, and support vector machines
from sklearn import svm

classifier = svm.SVC(kernel='linear')

classifier.fit(X_train, y_train)
scores = classifier.score(X_test , y_test)
scores4 = np.mean(scores)
print ('support vector machines(linear) Accuracy :',np.mean(scores), scores)   
y_pred = classifier.predict(X_test)
# 合并 X, y。
data = np.hstack((X_test,np.reshape(y_test,[-1, 1]), np.reshape(y_pred,[-1, 1])))
# 画 boxplot
c_boxplot(data)


classifier = svm.SVC(kernel='poly', degree=3)

classifier.fit(X_train, y_train)
scores = classifier.score(X_test , y_test)
scores5 = np.mean(scores)
print ('support vector machines(poly) Accuracy :',np.mean(scores), scores)   
y_pred = classifier.predict(X_test)
# 合并 X, y。
data = np.hstack((X_test,np.reshape(y_test,[-1, 1]), np.reshape(y_pred,[-1, 1])))
# 画 boxplot
c_boxplot(data)


classifier = svm.SVC(kernel='rbf')

classifier.fit(X_train, y_train)
scores = classifier.score(X_test , y_test)
scores6 = np.mean(scores)
print ('support vector machines(rbf) Accuracy :',np.mean(scores), scores)   
y_pred = classifier.predict(X_test)
# 合并 X, y。
data = np.hstack((X_test,np.reshape(y_test,[-1, 1]), np.reshape(y_pred,[-1, 1])))
# 画 boxplot
c_boxplot(data)


# 各准确率的比较
data = np.hstack((np.reshape(scores1,[-1, 1]), np.reshape(scores2,[-1, 1]),np.reshape(scores3,[-1, 1]),np.reshape(scores4,[-1, 1]),np.reshape(scores5,[-1, 1]), np.reshape(scores5,[-1, 1])))
# 画 boxplot
c_boxplot(data)
本文参与 腾讯云自媒体分享计划,分享自作者个人站点/博客。
原始发表:2017年12月18日,如有侵权请联系 cloudcommunity@tencent.com 删除

本文分享自 作者个人站点/博客 前往查看

如有侵权,请联系 cloudcommunity@tencent.com 删除。

本文参与 腾讯云自媒体分享计划  ,欢迎热爱写作的你一起参与!

评论
登录后参与评论
0 条评论
热度
最新
推荐阅读
领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档