# Python机器学习漫游指南

1. Linear Regression 线性回归

2. Logistic Regression 逻辑回归

3. Decision Trees 决策树

4. Support Vector Machines 支持向量机

5. K-Nearest Neighbors k最邻近分类算法

6. Random Forests 随机森林算法

7. K-Means Clustering k-平均算法

8. Principal Components Analysis 主成分分析

import pandas as pd

import matplotlib.pyplot as plt

import numpy as np

import seaborn as sns

%matplotlib inline

DO NOT PANIC，我们要开始Machine Learning漫游了！

## 1. Linear Regression

Simpler is always preferred over complex when performance is comparable.

from sklearn import linear_model

# Name the two columns: X = explanatory variable, Y = response variable.
# (assumes df has exactly two columns loaded earlier — TODO confirm)
df.columns = ['X', 'Y']

# Quick visual check: scatter plot with a fitted regression line.
sns.set_context("notebook", font_scale=1.1)
sns.set_style("ticks")
sns.lmplot(x='X', y='Y', data=df)
plt.ylabel('Response')
plt.xlabel('Explanatory')

linear = linear_model.LinearRegression()

# Hold out the first 20 rows as a test set and train on the rest.
# reshape(-1, 1) converts each 1-D column into the (n_samples, 1)
# shape that scikit-learn estimators expect.
trainX = np.asarray(df.X[20:len(df.X)]).reshape(-1, 1)
trainY = np.asarray(df.Y[20:len(df.Y)]).reshape(-1, 1)
testX = np.asarray(df.X[:20]).reshape(-1, 1)
testY = np.asarray(df.Y[:20]).reshape(-1, 1)

linear.fit(trainX, trainY)

print('Coefficient: \n', linear.coef_)
print('Intercept: \n', linear.intercept_)
# For LinearRegression, score() returns the R² coefficient of determination.
print('R² Value: \n', linear.score(trainX, trainY))

predicted = linear.predict(testX)

## 2. Logistic Regression

from sklearn.linear_model import LogisticRegression

# X = explanatory variable, Y = binary response variable.
df.columns = ['X', 'Y']

# Visual check: scatter plot with a fitted logistic (sigmoid) curve.
sns.set_context("notebook", font_scale=1.1)
sns.set_style("ticks")
sns.regplot(x='X', y='Y', data=df, logistic=True)
plt.ylabel('Probability')
plt.xlabel('Explanatory')

logistic = LogisticRegression()

# Features need the (n_samples, 1) shape; labels need a flat 1-D array.
X = (np.asarray(df.X)).reshape(-1, 1)
Y = (np.asarray(df.Y)).ravel()

logistic.fit(X, Y)

print('Coefficient: \n', logistic.coef_)
print('Intercept: \n', logistic.intercept_)
# For classifiers, score() returns mean accuracy (not R²).
print('Accuracy: \n', logistic.score(X, Y))

## 3. Decision Trees

from sklearn import tree
# sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split

# Four feature columns plus one label column.
df.columns = ['X1', 'X2', 'X3', 'X4', 'Y']

# Split node impurity with the Gini index.
decision = tree.DecisionTreeClassifier(criterion='gini')

X = df.values[:, 0:4]
Y = df.values[:, 4]

# Random 70% / 30% train/test split.
trainX, testX, trainY, testY = train_test_split(X, Y, test_size=0.3)

decision.fit(trainX, trainY)
print('Accuracy: \n', decision.score(testX, testY))

## 4. Support Vector Machines (SVM)

SVM是监督学习中常用的分类模型。可以把它理解成，在分散的点集中，画一条线，让每个集合中最近的点相互之间离得最远。

from sklearn import svm
# train_test_split moved to sklearn.model_selection in scikit-learn 0.20.
from sklearn.model_selection import train_test_split

# Keep only two features (X1, X2) so the decision regions are plottable in 2-D.
df.columns = ['X4', 'X3', 'X1', 'X2', 'Y']
df = df.drop(columns=['X4', 'X3'])

# Default SVC: RBF kernel.
support = svm.SVC()

X = df.values[:, 0:2]
Y = df.values[:, 2]

# Random 70% / 30% train/test split.
trainX, testX, trainY, testY = train_test_split(X, Y, test_size=0.3)

support.fit(trainX, trainY)
print('Accuracy: \n', support.score(testX, testY))
pred = support.predict(testX)

# Scatter plot of the two features, coloured by class label.
sns.set_context("notebook", font_scale=1.1)
sns.set_style("ticks")
sns.lmplot(x='X1', y='X2', scatter=True, fit_reg=False, data=df, hue='Y')
plt.ylabel('X2')
plt.xlabel('X1')

## 5. K-Nearest Neighbors (KNN)

K-nearest neighbour 顾名思义就是找合适的中心点，然后把离这个中心点最近的点集合在一起。所以KNN常被用在数据分类中。

KNN是不是很简单？我们看看如何代码实现吧。

from sklearn.neighbors import KNeighborsClassifier
# train_test_split moved to sklearn.model_selection in scikit-learn 0.20.
from sklearn.model_selection import train_test_split

# Keep only two features so the classes can be plotted in 2-D.
df.columns = ['X1', 'X2', 'X3', 'X4', 'Y']
df = df.drop(columns=['X4', 'X3'])

# Scatter plot of the two features, coloured by class label.
sns.set_context("notebook", font_scale=1.1)
sns.set_style("ticks")
sns.lmplot(x='X1', y='X2', scatter=True, fit_reg=False, data=df, hue='Y')
plt.ylabel('X2')
plt.xlabel('X1')

# Classify each point by majority vote among its 5 nearest neighbours.
neighbors = KNeighborsClassifier(n_neighbors=5)

X = df.values[:, 0:2]
Y = df.values[:, 2]

# Random 70% / 30% train/test split.
trainX, testX, trainY, testY = train_test_split(X, Y, test_size=0.3)

neighbors.fit(trainX, trainY)
print('Accuracy: \n', neighbors.score(testX, testY))
pred = neighbors.predict(testX)

## 6. Random Forests

Forest顾名思义就是很多树tree组成的，random是随机的意思。那么random forest算法就是通过不同的随机树，然后归总。

from sklearn.ensemble import RandomForestClassifier
# train_test_split moved to sklearn.model_selection in scikit-learn 0.20.
from sklearn.model_selection import train_test_split

# Four feature columns plus one label column.
df.columns = ['X1', 'X2', 'X3', 'X4', 'Y']

# Ensemble of decision trees with default hyperparameters.
forest = RandomForestClassifier()

X = df.values[:, 0:4]
Y = df.values[:, 4]

# Random 70% / 30% train/test split.
trainX, testX, trainY, testY = train_test_split(X, Y, test_size=0.3)

forest.fit(trainX, trainY)
print('Accuracy: \n', forest.score(testX, testY))
pred = forest.predict(testX)

## 7. K-Means Clustering

K-Means是非监督学习中常用的分类算法，K指的是分类的类别数目。

from sklearn.cluster import KMeans

# Keep only two features so the clusters can be plotted in 2-D.
df.columns = ['X1', 'X2', 'X3', 'X4', 'Y']
df = df.drop(columns=['X4', 'X3'])

# K-Means is unsupervised: it is fit on the features alone, with k = 3 clusters.
kmeans = KMeans(n_clusters=3)

X = df.values[:, 0:2]
kmeans.fit(X)

# Store each row's assigned cluster so the plot can be coloured by it.
df['Pred'] = kmeans.predict(X)

sns.set_context("notebook", font_scale=1.1)
sns.set_style("ticks")
sns.lmplot(x='X1', y='X2', scatter=True, fit_reg=False, data=df, hue='Pred')

## 8. Principal Components Analysis

from sklearn import decomposition
# Needed by the split below; the original snippet never imported it.
from sklearn.model_selection import train_test_split

# Four feature columns plus one label column.
df.columns = ['X1', 'X2', 'X3', 'X4', 'Y']

# n_components unset: PCA keeps all components.
pca = decomposition.PCA()
# NOTE(review): FactorAnalysis is instantiated for comparison but never used below.
fa = decomposition.FactorAnalysis()

X = df.values[:, 0:4]
Y = df.values[:, 4]

# Unsupervised split: only the feature matrix is divided (70% / 30%).
train, test = train_test_split(X, test_size=0.3)

# Fit the projection on the training features only, then apply the
# same learned transform to the test features.
train_reduced = pca.fit_transform(train)
test_reduced = pca.transform(test)

# Number of principal components kept (printed; a bare expression only
# displays in a notebook, not in a script).
print('Components: \n', pca.n_components_)

https://archive.ics.uci.edu/ml/datasets/Diabetes

https://archive.ics.uci.edu/ml/datasets/Iris

xCOURSE∣干货分享 求职辅导

• 发表于:
• 原文链接http://kuaibao.qq.com/s/20180209G16XEO00?refer=cp_1026
• 腾讯「云+社区」是腾讯内容开放平台帐号（企鹅号）传播渠道之一，根据《腾讯内容开放平台服务协议》转载发布内容。
• 如有侵权，请联系 yunjia_community@tencent.com 删除。

2018-06-19

2018-06-13

2020-06-02

2020-06-02

2020-06-02

2020-06-02