我有一个同时包含数值型和分类型变量的数据集。目标变量是 grade(等级),我想做一些特征选择,以便在低维空间中可视化数据集的划分。然而,当我计算皮尔逊相关系数时,目标列消失了,这使我无法只挑选出与目标高度相关的特征。
下面是数据的一个样例:
Unnamed: 0 id member_id loan_amnt funded_amnt funded_amnt_inv term int_rate installment grade sub_grade emp_title emp_length home_ownership annual_inc verification_status issue_d loan_status pymnt_plan url desc purpose title zip_code addr_state dti delinq_2yrs earliest_cr_line inq_last_6mths mths_since_last_delinq mths_since_last_record open_acc pub_rec revol_bal revol_util total_acc initial_list_status out_prncp out_prncp_inv total_pymnt total_pymnt_inv total_rec_prncp total_rec_int total_rec_late_fee recoveries collection_recovery_fee last_pymnt_d last_pymnt_amnt next_pymnt_d last_credit_pull_d collections_12_mths_ex_med mths_since_last_major_derog policy_code application_type annual_inc_joint dti_joint verification_status_joint acc_now_delinq tot_coll_amt tot_cur_bal open_acc_6m open_act_il open_il_12m open_il_24m mths_since_rcnt_il total_bal_il il_util open_rv_12m open_rv_24m max_bal_bc all_util total_rev_hi_lim inq_fi total_cu_tl inq_last_12m acc_open_past_24mths avg_cur_bal bc_open_to_buy bc_util chargeoff_within_12_mths delinq_amnt mo_sin_old_il_acct mo_sin_old_rev_tl_op mo_sin_rcnt_rev_tl_op mo_sin_rcnt_tl mort_acc mths_since_recent_bc mths_since_recent_bc_dlq mths_since_recent_inq mths_since_recent_revol_delinq num_accts_ever_120_pd num_actv_bc_tl num_actv_rev_tl num_bc_sats num_bc_tl num_il_tl num_op_rev_tl num_rev_accts num_rev_tl_bal_gt_0 num_sats num_tl_120dpd_2m num_tl_30dpd num_tl_90g_dpd_24m num_tl_op_past_12m pct_tl_nvr_dlq percent_bc_gt_75 pub_rec_bankruptcies tax_liens tot_hi_cred_lim total_bal_ex_mort total_bc_limit total_il_high_credit_limit revol_bal_joint sec_app_earliest_cr_line sec_app_inq_last_6mths sec_app_mort_acc sec_app_open_acc sec_app_revol_util sec_app_open_act_il sec_app_num_rev_accts sec_app_chargeoff_within_12_mths sec_app_collections_12_mths_ex_med sec_app_mths_since_last_major_derog hardship_flag hardship_type hardship_reason hardship_status deferral_term hardship_amount hardship_start_date hardship_end_date 
payment_plan_start_date hardship_length hardship_dpd hardship_loan_status orig_projected_additional_accrued_interest hardship_payoff_balance_amount hardship_last_payment_amount disbursement_method debt_settlement_flag debt_settlement_flag_date settlement_status settlement_date settlement_amount settlement_percentage settlement_term
0 1040017 NaN NaN 14000 14000 14000.0 36 months 12.69 469.63 C C2 Receiving Dock Worker 9 years MORTGAGE 40000.0 Not Verified 2015-10-01 Charged Off n NaN NaN debt_consolidation Debt consolidation 166xx PA 17.07 0.0 Jun-2001 1.0 NaN NaN 5.0 0.0 5848 90.0 15.0 f 0.0 0.0 6057.790000 6057.79 4091.51 1556.41 0.0 409.87 73.7766 Oct-2016 469.63 NaN Jul-2018 0.0 NaN 1 Individual NaN NaN NaN 0.0 0.0 119776.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 6500.0 NaN NaN NaN 4.0 23955.0 2167.0 90.0 0.0 0.0 141.0 172.0 3.0 3.0 1.0 3.0 NaN 3.0 NaN 0.0 3.0 3.0 8.0 8.0 6.0 3.0 8.0 3.0 5.0 NaN 0.0 0.0 2.0 100.0 100.0 0.0 0.0 123292.0 29809.0 6500.0 25992.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN N NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN Cash N NaN NaN NaN NaN NaN NaN
1 1050463 NaN NaN 1000 1000 1000.0 36 months 9.17 31.88 B B2 Portfolio Manager 1 year MORTGAGE 80000.0 Verified 2015-10-01 Fully Paid n NaN NaN credit_card Credit card refinancing 949xx CA 12.51 0.0 Oct-1967 3.0 NaN 22.0 9.0 1.0 7634 37.2 32.0 w 0.0 0.0 1021.730000 1021.73 999.99 21.74 0.0 0.00 0.0000 Feb-2016 27.85 NaN Feb-2017 0.0 NaN 1 Individual NaN NaN NaN 0.0 0.0 53994.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 20500.0 NaN NaN NaN 4.0 5999.0 12866.0 37.2 0.0 0.0 188.0 575.0 4.0 4.0 3.0 4.0 NaN 1.0 NaN 0.0 3.0 3.0 6.0 16.0 9.0 6.0 20.0 3.0 9.0 0.0 0.0 0.0 3.0 100.0 0.0 1.0 0.0 80788.0 53994.0 20500.0 60288.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN N NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN Cash N NaN NaN NaN NaN NaN NaN
下面是计算相关矩阵的代码:
# Pearson correlation matrix of df1, rendered as an annotated heatmap.
# `cor` stays a top-level name because the next step indexes into it.
cor = df1.corr()
plt.figure(figsize=(12, 10))
sns.heatmap(data=cor, annot=True, cmap=plt.cm.Reds)
plt.show()
下面是与输出变量的相关性:
# Correlation with the output variable: take the "grade" column of the
# correlation matrix and drop the sign so only the strength matters.
# NOTE: `cor` only has the columns that df1.corr() kept; when "grade" is not
# one of them, this lookup raises KeyError: 'grade'.
cor_target = abs(cor["grade"])
# Select highly correlated features (absolute correlation above 0.5).
relevant_features = cor_target[cor_target>0.5]
# Bare expression so a notebook cell displays the selected features.
relevant_features
然而,它给出了如下错误:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
3077 try:
-> 3078 return self._engine.get_loc(key)
3079 except KeyError:
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'grade'
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
<ipython-input-141-9f49267b4ee8> in <module>
1 #Correlation with output variable
----> 2 cor_target = abs(cor["grade"])
3 #Selecting highly correlated features
4 relevant_features = cor_target[cor_target>0.5]
5 relevant_features
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
2686 return self._getitem_multilevel(key)
2687 else:
-> 2688 return self._getitem_column(key)
2689
2690 def _getitem_column(self, key):
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py in _getitem_column(self, key)
2693 # get column
2694 if self.columns.is_unique:
-> 2695 return self._get_item_cache(key)
2696
2697 # duplicate columns & possible reduce dimensionality
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\generic.py in _get_item_cache(self, item)
2487 res = cache.get(item)
2488 if res is None:
-> 2489 values = self._data.get(item)
2490 res = self._box_item_values(item, values)
2491 cache[item] = res
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\internals.py in get(self, item, fastpath)
4113
4114 if not isna(item):
-> 4115 loc = self.items.get_loc(item)
4116 else:
4117 indexer = np.arange(len(self.items))[isna(self.items)]
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
3078 return self._engine.get_loc(key)
3079 except KeyError:
-> 3080 return self._engine.get_loc(self._maybe_cast_indexer(key))
3081
3082 indexer = self.get_indexer([key], method=method, tolerance=tolerance)
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'grade'
发布于 2019-09-11 12:43:28
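您的 grade(等级)列似乎是非数值类型,而 corr() 方法会忽略数据中所有非数值类型的列。要验证这一点,可以打印 cor 变量,您会发现该 DataFrame 只包含原始数据中的数值列。
要解决这个问题,可以先将 grade 编码为数值(例如 df1["grade"] = df1["grade"].astype("category").cat.codes,它会把数据中出现的等级按字母顺序映射为 0、1、2…),然后再调用 corr()。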
https://datascience.stackexchange.com/questions/60039
复制相似问题