#!/usr/bin/env python
# coding=utf-8
'''
Author: Yan Errol
Email: 2681506@gmail.com
Wechat: qq260187357
Date: 2019-04-21 22:07
Describe: Data quality analysis and data cleaning
'''
import os

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
# Load the train/test datasets and the sample submission
def read_data(data_path):
    train_data = pd.read_csv(data_path + 'train_dataset.csv')
    test_data = pd.read_csv(data_path + 'test_dataset.csv')
    sample_sub = pd.read_csv(data_path + 'submit_example.csv')
    return train_data, test_data, sample_sub
# Outlier detection via boxplot (data quality analysis)
def show_data(data):
    # plt.rcParams["font.sans-serif"] = ['SimHei']  # enable a Chinese-capable font for labels
    plt.rcParams["axes.unicode_minus"] = False  # render minus signs correctly
    plt.figure()
    data.boxplot()  # boxplot to surface outliers
    plt.title('Boxplot outlier detection')
    plt.tight_layout()
    plt.show()
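# A complementary numeric check (an illustrative sketch, not part of the
# original script): count the values outside the 1.5 * IQR whiskers that the
# boxplot above draws, so outliers can be inspected per column rather than
# only visually.
def iqr_outlier_count(data):
    counts = {}
    for col in data.select_dtypes(include=[np.number]).columns:
        q1, q3 = data[col].quantile(0.25), data[col].quantile(0.75)
        iqr = q3 - q1
        lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
        counts[col] = int(((data[col] < lower) | (data[col] > upper)).sum())
    return counts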
# Summary statistics (feature analysis)
def statistics_analyze(data):
    statistics = data.describe()  # count/mean/std/min/quartiles/max
    # range = max - min
    statistics.loc['range'] = statistics.loc['max'] - statistics.loc['min']
    # coefficient of variation = std / mean
    statistics.loc['cv'] = statistics.loc['std'] / statistics.loc['mean']
    # interquartile range = 75% - 25%
    statistics.loc['iqr'] = statistics.loc['75%'] - statistics.loc['25%']
    print(statistics)
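# Worked example of the derived rows (hypothetical column): with min 0,
# max 100, mean 50, std 10, 25% = 40 and 75% = 60, the rows above give
# range = 100 - 0 = 100, cv = 10 / 50 = 0.2, and iqr = 60 - 40 = 20.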
# Pareto analysis of the score column
def dish_pareto(data):
    data_score = data['score']
    print(data_score)
    data_score = data_score.sort_values(ascending=False).reset_index(drop=True)
    plt.figure()
    data_score.plot(kind='bar')
    plt.ylabel('score')
    # cumulative share of the total score, drawn on a secondary axis
    s = 1.0 * data_score.cumsum() / data_score.sum()
    s.plot(color='r', secondary_y=True, style='-o', linewidth=2)
    plt.show()
# Correlation analysis
def corr_analyze(data):
    # pairwise correlation between variables; the default method is the
    # Pearson correlation coefficient
    corr = data.corr()
    print("Correlation between score and the other features:", corr['score'])
    # covariance matrix
    print("data's cov:", data.cov())
def data_normalization(data):
    # min-max normalization: rescale each column to [0, 1]
    normal_result = (data - data.min()) / (data.max() - data.min())
    # zero-mean (z-score) normalization
    zero_one_nor = (data - data.mean()) / data.std()
    # decimal scaling normalization: divide by the smallest power of 10
    # that bounds each column's absolute maximum
    decimal_number = data / 10 ** np.ceil(np.log10(data.abs().max()))
    # only the min-max result is used downstream
    return normal_result
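# Example of the min-max branch on a toy column: values [0, 5, 10] map to
# (v - 0) / (10 - 0), i.e. [0.0, 0.5, 1.0].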
# Main entry point
def main():
    data_path = '../input/'
    train_data, test_data, sample_sub = read_data(data_path)
train_data.columns = [
'uid',
'true_name_flag',
'age',
'uni_student_flag',
'blk_list_flag',
'4g_unhealth_flag',
'net_age_till_now',
'top_up_month_diff',
'top_up_amount',
'recent_6month_avg_use',
'total_account_fee',
'curr_month_balance',
'curr_overdue_flag',
'cost_sensitivity',
'connect_num',
'freq_shopping_flag',
'recent_3month_shopping_count',
'wanda_flag',
'sam_flag',
'movie_flag',
'tour_flag',
'sport_flag',
'online_shopping_count',
'express_count',
'finance_app_count',
'video_app_count',
'flight_count',
'train_count',
'tour_app_count',
'score']
    test_data.columns = train_data.columns[:-1]
    show_data(train_data)
    show_data(test_data)
    statistics_analyze(train_data)
    statistics_analyze(test_data)
    dish_pareto(train_data)
    corr_analyze(train_data)
    normal_input = train_data.drop('uid', axis=1)
    # normalization, option 1: the hand-rolled helper above
    # normal_result = data_normalization(normal_input)
    # normalization, option 2: sklearn's MinMaxScaler
    mm = MinMaxScaler()
    normal_result = mm.fit_transform(normal_input)
    os.makedirs('../nomal', exist_ok=True)  # ensure the output directory exists
    np.savetxt('../nomal/train_data_nor.csv', normal_result, delimiter=',')
    test_data_normal = test_data.drop('uid', axis=1)
    # the scaler is refit here: test_data has no 'score' column, so the
    # train-fitted scaler's feature count would not match the test frame
    normal_result = mm.fit_transform(test_data_normal)
    np.savetxt('../nomal/test_data_nor.csv', normal_result, delimiter=',')
if __name__ == "__main__":
    main()