交叉验证(将所有数据分成 n 等份)
最常用的为10折交叉验证
举例:
4折交叉验证(将数据分成4等份时):
每次取其中1份作为验证集、其余3份作为训练集,轮流进行4次,得到4个准确率,
最后求出这4个准确率的均值,作为模型的评估结果
网格搜索:调参数
对模型预设几种超参数组合,每组超参数都采用交叉验证来进行评估,选出最优参数组合建立模型
API
from sklearn.model_selection import GridSearchCV
# coding=utf8
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
# Console display settings for pandas: up to 100 columns, 500 characters of
# width, and East Asian / ambiguous-width characters counted as double-width
# so that printed columns containing Chinese text stay aligned.
_display_options = {
    'display.max_columns': 100,
    'display.width': 500,
    'display.unicode.ambiguous_as_wide': True,
    'display.unicode.east_asian_width': True,
}
for _option_name, _option_value in _display_options.items():
    pd.set_option(_option_name, _option_value)

# Load the tab-separated data set. header=None means the file carries no
# header row, so the column names are supplied explicitly: three feature
# columns followed by the 'type' label column.
_feature_columns = ['flight', 'icecream', 'game']
df = pd.read_csv(
    r'E:\Python 机器学习\csv\datingTestSet.txt',
    sep='\t',
    header=None,
    names=_feature_columns + ['type'],
)

# Feature matrix as a NumPy array (np.array makes a copy of the values).
df_value = np.array(df[_feature_columns].values)

# Split the data: test_size=0.25 holds out 25% of the rows for evaluation.
x_train, x_test, y_train, y_test = train_test_split(
    df_value,
    df['type'],
    test_size=0.25,
)

# Pre-processing: standardise every feature to zero mean and unit standard
# deviation, z = (x - mean) / std. The statistics are learned from the
# training split only and then re-used to transform the test split.
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
# coding=utf8
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
# Console display settings for pandas: column limit, line width, and
# double-width handling of East Asian / ambiguous-width characters.
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 500)
pd.set_option('display.unicode.ambiguous_as_wide', True)
pd.set_option('display.unicode.east_asian_width', True)
# Load the tab-separated data set; header=None means the file has no header
# row, so the four column names are supplied explicitly.
df = pd.read_csv(
r'E:\Python 机器学习\csv\datingTestSet.txt',
sep='\t',
header=None,
names=['flight', 'icecream', 'game', 'type']
)
# Feature matrix: the three feature columns as a NumPy array.
df_value = df[['flight', 'icecream', 'game']].values
df_value = np.array(df_value)
# test_size=0.25 means 25% of the data is held out for evaluation
x_train, x_test, y_train, y_test = train_test_split(df_value, df['type'], test_size=0.25) # split the data
# Pre-processing: standardisation (rescale each feature to mean 0 and
# standard deviation 1)
# Formula: z = (x - mean) / std
scaler = StandardScaler()
# NOTE(review): this copy of the snippet appears to be cut off here -- the
# scaler is created but never fitted, and the line below is a bare expression
# with no effect. Confirm against the complete version of the script.
x_train
实例
# coding=utf8
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
# Console display settings for pandas: up to 100 columns, 500 characters of
# width, and East Asian / ambiguous-width characters counted as double-width
# so that printed columns containing Chinese text stay aligned.
_display_options = {
    'display.max_columns': 100,
    'display.width': 500,
    'display.unicode.ambiguous_as_wide': True,
    'display.unicode.east_asian_width': True,
}
for _option_name, _option_value in _display_options.items():
    pd.set_option(_option_name, _option_value)

# Load the tab-separated data set. header=None means the file carries no
# header row, so the column names are supplied explicitly: three feature
# columns followed by the 'type' label column.
_feature_columns = ['flight', 'icecream', 'game']
df = pd.read_csv(
    r'E:\Python 机器学习\csv\datingTestSet.txt',
    sep='\t',
    header=None,
    names=_feature_columns + ['type'],
)

# Feature matrix as a NumPy array (np.array makes a copy of the values).
df_value = np.array(df[_feature_columns].values)

# Split the data: test_size=0.25 holds out 25% of the rows for evaluation.
x_train, x_test, y_train, y_test = train_test_split(
    df_value,
    df['type'],
    test_size=0.25,
)

# Pre-processing: standardise every feature to zero mean and unit standard
# deviation, z = (x - mean) / std. The statistics are learned from the
# training split only and then re-used to transform the test split.
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
# coding=utf8
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
# Console display settings for pandas: column limit, line width, and
# double-width handling of East Asian / ambiguous-width characters.
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 500)
pd.set_option('display.unicode.ambiguous_as_wide', True)
pd.set_option('display.unicode.east_asian_width', True)
# Load the tab-separated data set; header=None means the file has no header
# row, so the four column names are supplied explicitly.
df = pd.read_csv(
r'E:\Python 机器学习\csv\datingTestSet.txt',
sep='\t',
header=None,
names=['flight', 'icecream', 'game', 'type']
)
# Feature matrix: the three feature columns as a NumPy array.
df_value = df[['flight', 'icecream', 'game']].values
df_value = np.array(df_value)
# test_size=0.25 means 25% of the data is held out for evaluation
x_train, x_test, y_train, y_test = train_test_split(df_value, df['type'], test_size=0.25) # split the data
# Pre-processing: standardisation (rescale each feature to mean 0 and
# standard deviation 1)
# Formula: z = (x - mean) / std
scaler = StandardScaler()
# NOTE(review): this copy of the snippet appears to be cut off here -- the
# scaler is created but never fitted, and the line below is a bare expression
# with no effect. Confirm against the complete version of the script.
x_train
网格搜索
# Grid search over the k-nearest-neighbours classifier.
# Relies on x_train / y_train / x_test / y_test produced by the preceding
# train/test split and scaling step.

# Model whose hyper-parameters are being tuned.
knn = KNeighborsClassifier()
# Candidate hyper-parameter values to try.
param = {'n_neighbors': [3, 5, 10]}
# Evaluate every candidate with 2-fold cross-validation.
cv = 2
# Run the search: each candidate is cross-validated on the training data,
# and the best one is then refitted on the whole training set (refit is the
# GridSearchCV default).
gc = GridSearchCV(knn, param_grid=param, cv=cv)
gc.fit(x_train, y_train)
# Score of the refitted best model on the held-out test split; reported here
# so the value is not computed and then silently discarded.
gc_s = gc.score(x_test, y_test)
print(gc_s)
print(gc.best_score_)  # best mean cross-validation score
print(gc.best_estimator_)  # the best model, including its chosen parameters
print(gc.cv_results_)  # per-candidate, per-fold cross-validation results
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。