我正在尝试对一个流水线(StandardScaler + DecisionTreeClassifier)运行 k 折交叉验证。
首先,我导入数据。
# Load the raw stroke data set.
data = pd.read_csv('train_strokes.csv')

# Preprocessing data
data.drop('id', axis=1, inplace=True)
# round(x) is 0 (falsy) only for ages that round to zero, so those become NaN;
# every other age is kept unchanged.
data['age'] = data['age'].apply(lambda x: x if round(x) else np.nan)
# Keep BMI values strictly between 12 and 45; anything else becomes NaN.
data['bmi'] = data['bmi'].apply(lambda bmi: bmi if 12 < bmi < 45 else np.nan)
# Keep only the 'Female' / 'Male' labels; any other value becomes NaN.
data['gender'] = data['gender'].apply(
    lambda gender: gender if gender in ('Female', 'Male') else np.nan
)
# Sort so that the forward fill below takes the BMI of the preceding row in
# (gender, age, bmi) order.
data.sort_values(['gender', 'age', 'bmi'], inplace=True)
# Assign the filled column back instead of calling ffill(inplace=True) on the
# column selection: a chained in-place call may act on a temporary object and
# never updates the frame when pandas Copy-on-Write is enabled.
data['bmi'] = data['bmi'].ffill()
data.dropna(axis=0, inplace=True)
data.reset_index(drop=True, inplace=True)

# categorical data to numeric value
enc = LabelEncoder()
# The encoder is re-fitted for every column, in the same order as before, so
# after the loop it holds the classes of 'ever_married'.
for column in ('gender', 'work_type', 'Residence_type', 'smoking_status', 'ever_married'):
    data[column] = enc.fit_transform(data[column])

# Next step: separate the features from the target.
# Separate the label column from the predictor columns.
feat = data.drop(columns='stroke')
target = data['stroke']

# Use SMOTE to balance the data.
sm = SMOTE(random_state=1)
feat, target = sm.fit_resample(feat, target)

# Round these columns to whole numbers after resampling.
for col in ('age', 'hypertension', 'heart_disease', 'ever_married'):
    feat[col] = feat[col].map(round)

# split training and test
X_train, X_test, y_train, y_test = train_test_split(
    feat, target, random_state=2, test_size=0.3
)

# The cross-validation loop below is the part the question is about.
# 10-fold cross-validation of a StandardScaler -> DecisionTreeClassifier
# pipeline on the training split.
Kfold = KFold(n_splits=10)
pipeline = make_pipeline(StandardScaler(), DecisionTreeClassifier())

# KFold.split yields *positional* index arrays, so:
#   * the folds must be generated from the object that is actually indexed
#     (X_train, which holds only 70% of the rows of feat), and
#   * rows must be selected with .iloc.
# Plain X_train[train_idx] makes pandas look the integers up as column
# labels, which is what raised the KeyError in the traceback that follows.
# enumerate() also supplies the fold number that the original print call
# hard-coded as 1.
for n_iter, (train_idx, test_idx) in enumerate(Kfold.split(X_train), start=1):
    pipeline.fit(X_train.iloc[train_idx], y_train.iloc[train_idx])
    score = pipeline.score(X_train.iloc[test_idx], y_train.iloc[test_idx])
    print('Fold #{} accuracy{}'.format(n_iter, score))

# Error output produced by the original X_train[train_idx] version:
Traceback (most recent call last):
File "/Users/merb/Documents/Dev/DataScience/TP.py", line 84, in <module>
pipeline.fit(X_train[train_idx], y_train[train_idx])
File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-
packages/pandas/core/frame.py", line 3030, in __getitem__
indexer = self.loc._get_listlike_indexer(key, axis=1, raise_missing=True)[1]
File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-
packages/pandas/core/indexing.py", line 1266, in _get_listlike_indexer
self._validate_read_indexer(keyarr, indexer, axis, raise_missing=raise_missing)
File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-
packages/pandas/core/indexing.py", line 1308, in _validate_read_indexer
raise KeyError(f"None of [{key}] are in the [{axis_name}]")
KeyError: "None of [Int64Index([ 5893, 5894, 5895, 5896, 5897, 5898, 5899, 5900,
5901,\n 5902,\n ...\n 58912, 58913, 58914, 58915,
58916, 58917, 58918, 58919, 58920,\n 58921],\n dtype='int64',
length=53029)] are in the [columns]"
发布于 2021-05-26 17:11:16
对 DataFrame 直接使用 df[indexes] 时,pandas 会把列表中的值当作列名来查找,因此才会出现上面的 KeyError。要按索引标签选择行,应使用 df.loc[indexes];要按整数位置选择行(KFold.split 返回的正是整数位置),则应使用 df.iloc[indexes]。
此外,您还可以阅读 pandas 文档中关于索引和选择数据的页面。
https://stackoverflow.com/questions/67701679
复制相似问题