# 数据分析小实验(下)

1）workclass

# Collapse the raw workclass levels into a smaller set of groups.

# Every level whose name ends in '-gov' becomes a single 'Government' group.
government_jobs = df['workclass'].str.endswith('-gov')
df.loc[government_jobs, 'workclass'] = 'Government'

# These four levels are merged into one 'Other' bucket.
others = df['workclass'].isin(['Retired', 'Without-pay', 'Never-worked', 'Unknown'])
df.loc[others, 'workclass'] = 'Other'

# Rename the two self-employed levels.
workclass_labels = {"Self-emp-not-inc": "Freelance", "Self-emp-inc": "Proprietor"}
df['workclass'] = df['workclass'].replace(workclass_labels)

2）对education分析

# Group the raw education levels into three stages. The three source lists
# are disjoint and none contains a target label, so the order of the
# assignments below does not matter.

primary = df.education.isin(['Preschool', '1st-4th', '5th-6th'])
df.loc[primary, 'education'] = 'Primary'

secondary = df.education.isin([
    'Some-college', '11th', '9th', '7th-8th', '10th', '12th', 'HS-grad',
    'Assoc-voc',
])
df.loc[secondary, 'education'] = 'Secondary'

# The mask was previously misspelled "teriary".
tertiary = df.education.isin([
    'Bachelors', 'Masters', 'Doctorate', 'Prof-school', 'Assoc-acdm',
])
df.loc[tertiary, 'education'] = 'Tertiary'

3）maritalstatus婚姻状况

# Binary flag: 0 for 'Never-married', 1 for every other marital status.
df['maritalstatus'] = df['maritalstatus'].ne('Never-married').astype(int)

4）occupation职业

# Bucket occupations by the share of their rows whose income is 'large'.

# Rows: occupation; columns: income level; values: row counts.
# fill_value=0 matters: without it an occupation that has no 'large' rows
# gets NaN here, its share becomes NaN, and it ends up in none of the three
# buckets below.
income_counts = df.groupby('occupation').income.value_counts().unstack(fill_value=0)

# Share of 'large' earners per occupation, highest first.
income_per_occupation = (
    income_counts['large'] / income_counts.sum(axis=1)
).sort_values(ascending=False)

# Bucket boundaries: share >= 0.33 -> 'High', share <= 0.20 -> 'Low',
# anything strictly in between -> 'Med'.
HIGH_SHARE = 0.33
LOW_SHARE = 0.20

high_earnings = df.occupation.isin(
    income_per_occupation[income_per_occupation >= HIGH_SHARE].index
)
df.loc[high_earnings, 'occupation'] = 'High'

mid_earnings = df.occupation.isin(
    income_per_occupation[
        (income_per_occupation > LOW_SHARE) & (income_per_occupation < HIGH_SHARE)
    ].index
)
df.loc[mid_earnings, 'occupation'] = 'Med'

low_earnings = df.occupation.isin(
    income_per_occupation[income_per_occupation <= LOW_SHARE].index
)
df.loc[low_earnings, 'occupation'] = 'Low'

5）relationship

# Merge the four listed relationship levels into a single 'Other' level;
# levels not listed here are left unchanged.
non_spouse_levels = ['Not-in-family', 'Own-child', 'Unmarried', 'Other-relative']
not_spouse = df['relationship'].isin(non_spouse_levels)
df.loc[not_spouse, 'relationship'] = 'Other'

6）race种族

7）nativecountry

USA又是一个巨无霸，而且数据涉及很多国家。如果简单地将结果分为USA和非USA，肯定会错失很多关键信息，毕竟国家的发展状况还是会影响到个人收入的。因此可以将国家GDP考虑进来，将GDP按高低分段；或者按发达国家、发展中国家和贫穷国家划分，这些都是可选的方案。嗯，这就交给你们去实践了，本实验就按USA和非USA来处理。

df.nativecountry = (df.nativecountry == 'United-States').astype(int)

Categorical的清理工作已经完成，还剩下最关键的income和continuous没有处理了。

8) continuous连续变量

# Build a binary target from income, evaluate a random forest on the rows
# whose income is known, then use it to fill in the rows whose income is
# missing.

from sklearn.ensemble import RandomForestClassifier as RF
# sklearn.cross_validation was removed from scikit-learn; cross_val_score
# now lives in sklearn.model_selection.
from sklearn.model_selection import cross_val_score

# 1.0 for 'large' income, 0.0 for any other known income. Rows with a
# missing income keep y = NaN: a plain astype(int) would turn them into 0,
# and the notnull()/isnull() split below would then never find a row to
# predict.
df['y'] = (df.income == 'large').astype(float).where(df.income.notnull())

# Only numeric columns are used as features.
data = df.select_dtypes(['int', 'float'])
labelled = df.y.notnull()

X = data[labelled].drop('y', axis=1)
y = df.y[labelled]
X_new = data[~labelled].drop('y', axis=1)

cls = RF(n_estimators=25, n_jobs=-1)

# 5-fold cross-validated ROC AUC. scoring must be passed by keyword:
# in sklearn.model_selection the fourth positional slot is not `scoring`.
cross_val_score(cls, X, y, scoring='roc_auc', cv=5)

# cross_val_score fits clones of the estimator, so `cls` is still unfitted
# here and can be trained on all labelled rows.
cls.fit(X, y)

# Fill in the unknown targets. .loc writes into df itself; the chained form
# df.y[mask] = ... may assign to a temporary copy. Skip the step when there
# is nothing to predict, because predict() rejects an empty input.
if len(X_new):
    df.loc[~labelled, 'y'] = cls.predict(X_new)

'age','sex_male','is_married','is_white','education_tertiary','earning_potential_high','relationship_husband','relationship_wife','hoursperweek'

# Cluster the feature matrix into k groups.
# NOTE(review): KMeans, k and X are not defined in the visible text;
# presumably they come from an omitted part of the article - confirm.
kmeans = KMeans(n_clusters=k)
kmeans.fit(X)

2.0 2751

0.0 2603

1.0 2175

3.0 1451

4.0 1020

KMeans：http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html

Silhouette Score:http://scikit-learn.org/stable/modules/generated/sklearn.metrics.silhouette_score.html

328 篇文章62 人订阅

0 条评论

## 相关文章

702

7534

831

8604

2884

2888

### 行为科学统计第一章知识点总结

1、什么是总体？什么是样本？ 总体是一个研究的所有研究对象的个体的集合。样本是被选择出来的参与研究的特定的个体集合。样本被期望能够代表总体。

1331

1032

1052

2016