# Python机器学习的练习八：异常检测和推荐系统

## 异常检测

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from scipy.io import loadmat
# %matplotlib inline  -- IPython magic; only valid inside a notebook cell.

# Load the server-statistics dataset. The article omitted this line;
# the exercise ships the data as ex8data1.mat -- TODO confirm the path
# against the course repository.
data = loadmat('data/ex8data1.mat')

X = data['X']
X.shape  # (307, 2) -- two features per server: latency and throughput

fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(X[:, 0], X[:, 1])

def estimate_gaussian(X):
    """Estimate independent per-feature Gaussian parameters.

    Parameters
    ----------
    X : ndarray of shape (m, n)
        Data matrix, one example per row.

    Returns
    -------
    mu : ndarray of shape (n,)
        Per-feature mean.
    sigma : ndarray of shape (n,)
        Per-feature population variance (ddof=0). Note this is the
        *variance*, not the standard deviation.
    """
    mu = X.mean(axis=0)
    sigma = X.var(axis=0)
    return mu, sigma

# mu, sigma = estimate_gaussian(X)
# -> (array([14.11222578, 14.99771051]), array([1.83263141, 1.70974533]))

# Labeled validation set used to tune the anomaly threshold.
Xval = data['Xval']
yval = data['yval']
Xval.shape, yval.shape  # ((307, 2), (307, 1))

from scipy import stats

# Density of the first feature under its fitted Gaussian.
# NOTE(review): stats.norm's second argument is the *standard deviation*,
# but sigma here is the variance returned by estimate_gaussian. The
# article's printed output was produced with this convention, so it is
# preserved -- confirm against the course solution before reusing.
dist = stats.norm(mu[0], sigma[0])
dist.pdf(X[:, 0])[0:50]
# array([0.183842, 0.20221694, 0.21746136, ..., 0.21539967, 0.16913173])

# Per-feature densities of the training set...
p = np.zeros((X.shape[0], X.shape[1]))
p[:, 0] = stats.norm(mu[0], sigma[0]).pdf(X[:, 0])
p[:, 1] = stats.norm(mu[1], sigma[1]).pdf(X[:, 1])
p.shape  # (307, 2)

# ...and of the validation set (same fitted parameters).
pval = np.zeros((Xval.shape[0], Xval.shape[1]))
pval[:, 0] = stats.norm(mu[0], sigma[0]).pdf(Xval[:, 0])
pval[:, 1] = stats.norm(mu[1], sigma[1]).pdf(Xval[:, 1])

def select_threshold(pval, yval):
    """Pick the anomaly threshold epsilon that maximizes F1 on a
    labeled validation set.

    Parameters
    ----------
    pval : ndarray
        Estimated probability density of each validation example.
    yval : ndarray
        Ground-truth labels (1 = anomaly, 0 = normal).

    Returns
    -------
    best_epsilon : float
        Threshold achieving the highest F1 (examples with density below
        it are flagged as anomalies).
    best_f1 : float
        The F1 score at that threshold.
    """
    best_epsilon = 0
    best_f1 = 0

    # Sweep 1000 evenly spaced candidate thresholds across the observed
    # density range.
    step = (pval.max() - pval.min()) / 1000

    for epsilon in np.arange(pval.min(), pval.max(), step):
        preds = pval < epsilon  # low density => predicted anomaly

        tp = np.sum(np.logical_and(preds == 1, yval == 1)).astype(float)
        fp = np.sum(np.logical_and(preds == 1, yval == 0)).astype(float)
        fn = np.sum(np.logical_and(preds == 0, yval == 1)).astype(float)

        # With no true positives, precision/recall degenerate to 0/0.
        # The original code produced NaN here (which never beat best_f1);
        # skipping explicitly is equivalent and avoids runtime warnings.
        if tp == 0:
            continue

        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
        f1 = (2 * precision * recall) / (precision + recall)

        if f1 > best_f1:
            best_f1 = f1
            best_epsilon = epsilon

    return best_epsilon, best_f1

# epsilon, f1 = select_threshold(pval, yval)
# -> (0.0095667060059568421, 0.7142857142857143)

# Indexes of the values considered to be outliers: density below the
# tuned threshold in either feature.
outliers = np.where(p < epsilon)

# Replot the data with the flagged points highlighted in red.
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(X[:, 0], X[:, 1])
ax.scatter(X[outliers[0], 0], X[outliers[0], 1], s=50, color='r', marker='o')

## 协同过滤

from scipy.io import loadmat  # harmless if already imported above

data = loadmat('data/ex8_movies.mat')
# data contains:
#   'Y': (1682, 943) uint8 ratings 1-5, 0 meaning "not rated"
#   'R': (1682, 943) uint8 indicator, R[i, j] = 1 iff user j rated movie i

Y = data['Y']
R = data['R']
Y.shape, R.shape  # ((1682, 943), (1682, 943))

# Mean rating of the second movie over the users who actually rated it.
# NOTE(review): the article wrote Y[1, R[1, :]].mean(), which indexes
# with R's 0/1 *integer* values instead of selecting rated columns; the
# np.where form below selects the rated entries as intended and matches
# the printed result.
Y[1, np.where(R[1, :] == 1)[0]].mean()  # 2.5832449628844114

# Visualize the ratings matrix ("brightness" = rating value).
fig, ax = plt.subplots(figsize=(12, 12))
ax.imshow(Y)
ax.set_xlabel('Users')
ax.set_ylabel('Movies')
fig.tight_layout()

def cost(params, Y, R, num_features):
    """Unregularized collaborative-filtering cost.

    Parameters
    ----------
    params : 1-D array
        Flattened movie-feature matrix X followed by the flattened
        user-parameter matrix Theta.
    Y : (num_movies, num_users) ratings matrix.
    R : (num_movies, num_users) indicator; R[i, j] = 1 iff user j rated
        movie i.
    num_features : int
        Number of latent features per movie / per user.

    Returns
    -------
    J : float
        Half the sum of squared prediction errors over rated entries.
    """
    Y = np.matrix(Y)  # (1682, 943)
    R = np.matrix(R)  # (1682, 943)
    num_movies = Y.shape[0]
    num_users = Y.shape[1]

    # Unflatten the parameter vector into its two matrices.
    X = np.matrix(np.reshape(params[:num_movies * num_features], (num_movies, num_features)))      # (1682, 10)
    Theta = np.matrix(np.reshape(params[num_movies * num_features:], (num_users, num_features)))   # (943, 10)

    # Error only counts on entries the user actually rated (R == 1).
    error = np.multiply((X * Theta.T) - Y, R)
    squared_error = np.power(error, 2)
    J = (1. / 2) * np.sum(squared_error)

    return J

# Pre-trained parameters shipped with the exercise; the article never
# showed this load -- TODO confirm the filename against the repository.
params_data = loadmat('data/ex8_movieParams.mat')

users = 4
movies = 5
features = 3

X = params_data['X']
Theta = params_data['Theta']

# Evaluate the cost on a tiny slice so the value can be checked by hand
# (the exercise's expected answer is 22.22).
X_sub = X[:movies, :features]
Theta_sub = Theta[:users, :features]
Y_sub = Y[:movies, :users]
R_sub = R[:movies, :users]

params = np.concatenate((np.ravel(X_sub), np.ravel(Theta_sub)))

cost(params, Y_sub, R_sub, features)  # 22.224603725685675

def cost(params, Y, R, num_features):
    """Collaborative-filtering cost and gradient (unregularized).

    Same cost as before, but also returns the gradient so the pair can
    be fed to a gradient-based optimizer.

    Returns
    -------
    J : float
        Half the sum of squared errors over rated entries.
    grad : 1-D ndarray
        Flattened gradient of J w.r.t. X, followed by the gradient
        w.r.t. Theta (same layout as ``params``).
    """
    Y = np.matrix(Y)  # (1682, 943)
    R = np.matrix(R)  # (1682, 943)
    num_movies = Y.shape[0]
    num_users = Y.shape[1]

    # Reshape the parameter array into parameter matrices.
    X = np.matrix(np.reshape(params[:num_movies * num_features], (num_movies, num_features)))      # (1682, 10)
    Theta = np.matrix(np.reshape(params[num_movies * num_features:], (num_users, num_features)))   # (943, 10)

    # Compute the cost over rated entries only.
    error = np.multiply((X * Theta.T) - Y, R)
    squared_error = np.power(error, 2)
    J = (1. / 2) * np.sum(squared_error)

    # Gradients. NOTE(review): the article omitted these lines; they are
    # reconstructed from the standard collaborative-filtering derivation
    # and reproduce the article's printed gradient values.
    X_grad = error * Theta       # dJ/dX,     (num_movies, num_features)
    Theta_grad = error.T * X     # dJ/dTheta, (num_users, num_features)

    # Unravel the gradient matrices into a single flat array.
    grad = np.concatenate((np.asarray(X_grad).ravel(), np.asarray(Theta_grad).ravel()))

    return J, grad

# J, grad = cost(params, Y_sub, R_sub, features)
# -> (22.224603725685675,
#     array([-2.52899165,  7.57570308, -1.89979026, ...,  0.,  0.,  0.]))

def cost(params, Y, R, num_features, learning_rate):
    """Regularized collaborative-filtering cost and gradient.

    Parameters
    ----------
    params : 1-D array -- flattened X followed by flattened Theta.
    Y, R : ratings and indicator matrices (num_movies, num_users).
    num_features : int -- latent features per movie / user.
    learning_rate : float
        Regularization strength (lambda in the course notation, despite
        the variable name).

    Returns
    -------
    (J, grad) : regularized cost and its flattened gradient.
    """
    Y = np.matrix(Y)  # (1682, 943)
    R = np.matrix(R)  # (1682, 943)
    num_movies = Y.shape[0]
    num_users = Y.shape[1]

    # Reshape the parameter array into parameter matrices.
    X = np.matrix(np.reshape(params[:num_movies * num_features], (num_movies, num_features)))      # (1682, 10)
    Theta = np.matrix(np.reshape(params[num_movies * num_features:], (num_users, num_features)))   # (943, 10)

    # Data-fit term over rated entries.
    error = np.multiply((X * Theta.T) - Y, R)
    squared_error = np.power(error, 2)
    J = (1. / 2) * np.sum(squared_error)

    # L2 penalty on both parameter matrices.
    J = J + ((learning_rate / 2) * np.sum(np.power(Theta, 2)))
    J = J + ((learning_rate / 2) * np.sum(np.power(X, 2)))

    # Gradients with regularization. NOTE(review): omitted in the
    # article; reconstructed from the standard derivation and matching
    # the printed gradient values.
    X_grad = (error * Theta) + (learning_rate * X)
    Theta_grad = (error.T * X) + (learning_rate * Theta)

    # Unravel the gradient matrices into a single flat array.
    grad = np.concatenate((np.asarray(X_grad).ravel(), np.asarray(Theta_grad).ravel()))

    return J, grad

# J, grad = cost(params, Y_sub, R_sub, features, 1.5)
# -> (31.344056244274221,
#     array([-0.95596339,  6.97535514, -0.10861109, ...,  0.49026541]))

# Build index -> title map from the MovieLens id file.
movie_idx = {}
# NOTE(review): movie_ids.txt contains Latin-1 accented titles; under
# Python 3 the default codec fails on it, hence the explicit encoding.
# The with-statement also closes the file, which the original omitted.
with open('data/movie_ids.txt', encoding='latin-1') as f:
    for line in f:
        tokens = line.split(' ')
        tokens[-1] = tokens[-1][:-1]  # strip the trailing newline
        movie_idx[int(tokens[0]) - 1] = ' '.join(tokens[1:])

# A few ratings of our own so the recommender can score us.
ratings = np.zeros((1682, 1))
ratings[0] = 4
ratings[6] = 3
ratings[11] = 5
ratings[53] = 4
ratings[63] = 5
ratings[65] = 3
ratings[68] = 5
ratings[97] = 2
ratings[182] = 4
ratings[225] = 5
ratings[354] = 5

# One loop replaces eleven copy-pasted print statements; output is identical.
for i in [0, 6, 11, 53, 63, 65, 68, 97, 182, 225, 354]:
    print('Rated {0} with {1} stars.'.format(movie_idx[i], str(int(ratings[i]))))
# Rated Toy Story (1995) with 4 stars.
# Rated Twelve Monkeys (1995) with 3 stars.
# Rated Usual Suspects, The (1995) with 5 stars.
# Rated Outbreak (1995) with 4 stars.
# Rated Shawshank Redemption, The (1994) with 5 stars.
# Rated While You Were Sleeping (1995) with 3 stars.
# Rated Forrest Gump (1994) with 5 stars.
# Rated Silence of the Lambs, The (1991) with 2 stars.
# Rated Alien (1979) with 4 stars.
# Rated Die Hard 2 (1990) with 5 stars.
# Rated Sphere (1998) with 5 stars.

R = data['R']
Y = data['Y']

# Append our ratings as one extra user (an extra column in Y and R).
Y = np.append(Y, ratings, axis=1)
R = np.append(R, ratings != 0, axis=1)

from scipy.optimize import minimize

movies = Y.shape[0]
users = Y.shape[1]
features = 10
learning_rate = 10.

# Random initialization of the latent factors.
X = np.random.random(size=(movies, features))
Theta = np.random.random(size=(users, features))
params = np.concatenate((np.ravel(X), np.ravel(Theta)))

# Mean-normalize each movie's ratings over the users who rated it.
# NOTE(review): a movie with no ratings at all would make .mean() of an
# empty slice produce NaN here -- every movie in this dataset has at
# least one rating, so the original kept no guard.
Ymean = np.zeros((movies, 1))
Ynorm = np.zeros((movies, users))
for i in range(movies):
    idx = np.where(R[i, :] == 1)[0]
    Ymean[i] = Y[i, idx].mean()
    Ynorm[i, idx] = Y[i, idx] - Ymean[i]

# cost returns (J, grad), so jac=True lets CG get both from one call.
fmin = minimize(fun=cost, x0=params, args=(Ynorm, R, features, learning_rate),
                method='CG', jac=True, options={'maxiter': 100})
fmin
# status: 1, success: False ("Maximum number of iterations has been
# exceeded."), nfev/njev: 149, fun: ~38953.88

# Unflatten the optimized parameter vector back into X and Theta.
X = np.matrix(np.reshape(fmin.x[:movies * features], (movies, features)))
Theta = np.matrix(np.reshape(fmin.x[movies * features:], (users, features)))
X.shape, Theta.shape  # ((1682, 10), (944, 10)) -- 944 users after appending ours

# Predicted ratings for every (movie, user); our ratings are the last column.
predictions = X * Theta.T
my_preds = predictions[:, -1] + Ymean  # undo the mean normalization
sorted_preds = np.sort(my_preds, axis=0)[::-1]
sorted_preds[:10]
# matrix([[5.00000264],
#         [5.00000249],
#         [4.99999831],
#         ...
#         [4.99998643]])

# Movie indexes ordered by predicted rating, best first.
idx = np.argsort(my_preds, axis=0)[::-1]
print("Top 10 movie predictions:")
for i in range(10):
    j = int(idx[i])
    print('Predicted rating of {0} for movie {1}.'.format(str(float(my_preds[j])), movie_idx[j]))
# Top 10 movie predictions:
# Predicted rating of 5.00000264002 for movie Prefontaine (1997).
# Predicted rating of 5.00000249142 for movie Santa with Muscles (1996).
# Predicted rating of 4.99999831018 for movie Marlene Dietrich: Shadow and Light (1996).
# Predicted rating of 4.9999967124 for movie Saint of Fort Washington, The (1993).
# Predicted rating of 4.99999658864 for movie They Made Me a Criminal (1939).
# Predicted rating of 4.999992533 for movie Someone Else's America (1995).
# Predicted rating of 4.99999238336 for movie Great Day in Harlem, A (1994).
# Predicted rating of 4.99999149604 for movie Star Kid (1997).
# Predicted rating of 4.99999018592 for movie Aiqing wansui (1994).
# Predicted rating of 4.99998642746 for movie Entertaining Angels: The Dorothy Day Story (1996).

0 条评论

• ### Python机器学习的练习五：神经网络

在这篇文章中，我们将再次处理手写数字数据集，但这次使用反向传播的前馈神经网络。我们将通过反向传播算法实现神经网络成本函数的非正则化和正则化版本以及梯度计算。最后...

• ### Python机器学习的练习四：多元逻辑回归

在本系列的第3部分中，我们实现了简单的和正则化的逻辑回归。但我们的解决方法有一个限制—它只适用于二进制分类。在本文中，我们将在之前的练习中扩展我们的解决方案，以...

• ### Python可视化解析MCMC

马尔可夫链可以定义为一个随机过程Y，其中t时刻各点的值只取决于t-1时刻的值。这意味着随机过程在t时刻有状态x的概率，给定它所有的过去状态，等于在t时刻有状态x...

• ### 100个Numpy练习【3】

翻译：YingJoy 网址： https://www.yingjoy.cn/ 来源： https://github.com/rougier/numpy-100...

• ### 机器学习算法Python实现

目录 一、线性回归 1、代价函数 2、梯度下降算法 3、均值归一化 4、最终运行结果 5、使用scikit-learn库中的线性模型实现 二、逻辑回归 1、代价...

• ### 100道测试题，轻松玩转python的Numpy模块！

Numpy是Python做数据分析所必须要掌握的基础库之一。以下为入门Numpy的100题小练习，原为github上的开源项目，由和鲸社区的小科翻译并整理（保留...

• ### 100道测试题，带你玩转Numpy模块！

Numpy是Python做数据分析所必须要掌握的基础库之一。以下为入门Numpy的100题小练习，原为github上的开源项目，由和鲸社区的小科翻译并整理（保留...

• ### 数据可视化入门

np.array(collection)，collection为序列型对象(list)，嵌套序列 (list of list)

• ### 100道测试题，带你玩转Numpy模块！

Numpy是Python做数据分析所必须要掌握的基础库之一。以下为入门Numpy的100题小练习，原为github上的开源项目，由和鲸社区的小科翻译并整理（保留...

• ### 这100道练习，带你玩转Numpy

Numpy是Python做数据分析所必须要掌握的基础库之一。以下为入门Numpy的100题小练习，原为github上的开源项目，由和鲸社区的小科翻译并整理（保留...