# Python机器学习的练习八：异常检测和推荐系统

## 异常检测

```import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
%matplotlib inline

X= data['X']
X.shape
(307L,2L)
fig, ax= plt.subplots(figsize=(12,8))
ax.scatter(X[:,0], X[:,1]) ```

```python
def estimate_gaussian(X):
    """Estimate an independent Gaussian for every column of ``X``.

    Parameters
    ----------
    X : array whose rows are samples and whose columns are features.

    Returns
    -------
    (mu, sigma) : the per-column mean and the per-column *variance*.

    Note: despite its name, ``sigma`` is computed with ``X.var`` and is
    therefore a variance, not a standard deviation. ``scipy.stats.norm``
    interprets its second argument (``scale``) as a standard deviation, so
    take ``np.sqrt(sigma)`` wherever a standard deviation is required.
    """
    mu = X.mean(axis=0)
    sigma = X.var(axis=0)

    return mu, sigma

mu, sigma= estimate_gaussian(X)
mu, sigma
(array([14.11222578, 14.99771051]), array([1.83263141, 1.70974533]))```

```Xval= data['Xval']
yval= data['yval']

Xval.shape, yval.shape
((307L,2L), (307L,1L))```

`from scipy import stats`
`dist= stats.norm(mu[0], sigma[0]) `
`dist.pdf(X[:,0])[0:50]`
`array([0.183842  , 0.20221694, 0.21746136, 0.19778763, 0.20858956,`
`        0.21652359, 0.16991291, 0.15123542, 0.1163989 , 0.1594734 ,`
`        0.21716057, 0.21760472, 0.20141857, 0.20157497, 0.21711385,`
`        0.21758775, 0.21695576, 0.2138258 , 0.21057069, 0.1173018 ,`
`        0.20765108, 0.21717452, 0.19510663, 0.21702152, 0.17429399,`
`        0.15413455, 0.21000109, 0.20223586, 0.21031898, 0.21313426,`
`        0.16158946, 0.2170794 , 0.17825767, 0.17414633, 0.1264951 ,`
`        0.19723662, 0.14538809, 0.21766361, 0.21191386, 0.21729442,`
`        0.21238912, 0.18799417, 0.21259798, 0.21752767, 0.20616968,`
`        0.21520366, 0.1280081 , 0.21768113, 0.21539967, 0.16913173])`

```p= np.zeros((X.shape[0], X.shape[1]))
p[:,0]= stats.norm(mu[0], sigma[0]).pdf(X[:,0])
p[:,1]= stats.norm(mu[1], sigma[1]).pdf(X[:,1])

p.shape
(307L,2L)```

```pval= np.zeros((Xval.shape[0], Xval.shape[1]))
pval[:,0]= stats.norm(mu[0], sigma[0]).pdf(Xval[:,0])
pval[:,1]= stats.norm(mu[1], sigma[1]).pdf(Xval[:,1])```

```python
def select_threshold(pval, yval):
    """Pick the probability threshold that maximizes F1 on validation data.

    Every sample whose probability is below the candidate ``epsilon`` is
    predicted to be an anomaly; the candidate with the highest F1 score
    against ``yval`` wins.

    Parameters
    ----------
    pval : array of probabilities for the validation samples.
    yval : array of ground-truth labels (1 = anomaly, 0 = normal) that
        broadcasts against ``pval``.

    Returns
    -------
    (best_epsilon, best_f1) : the winning threshold and its F1 score, or
        ``(0, 0)`` when no candidate produces a true positive.
    """
    best_epsilon = 0
    best_f1 = 0

    lowest = pval.min()
    highest = pval.max()
    step = (highest - lowest) / 1000
    if step == 0:
        # All probabilities are identical: there is no range to scan, and
        # np.arange cannot be called with a zero step.
        return best_epsilon, best_f1

    for epsilon in np.arange(lowest, highest, step):
        preds = pval < epsilon

        tp = np.sum(np.logical_and(preds == 1, yval == 1)).astype(float)
        fp = np.sum(np.logical_and(preds == 1, yval == 0)).astype(float)
        fn = np.sum(np.logical_and(preds == 0, yval == 1)).astype(float)

        if tp == 0:
            # Precision/recall would be 0 or 0/0 here, so F1 can never beat
            # the current best; skipping avoids NaNs and runtime warnings.
            continue

        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
        f1 = (2 * precision * recall) / (precision + recall)

        if f1 > best_f1:
            best_f1 = f1
            best_epsilon = epsilon

    return best_epsilon, best_f1

epsilon, f1= select_threshold(pval, yval)
epsilon, f1
(0.0095667060059568421,0.7142857142857143)```

```# indexes of the values considered to be outliers
outliers= np.where(p < epsilon)

fig, ax= plt.subplots(figsize=(12,8))
ax.scatter(X[:,0], X[:,1])
ax.scatter(X[outliers[0],0], X[outliers[0],1], s=50, color='r', marker='o')```

## 协同过滤

```data= loadmat('data/ex8_movies.mat')
data
{'R': array([[1,1,0, ...,1,0,0],
[1,0,0, ...,0,0,1],
[1,0,0, ...,0,0,0],
...,
[0,0,0, ...,0,0,0],
[0,0,0, ...,0,0,0],
[0,0,0, ...,0,0,0]], dtype=uint8),
'Y': array([[5,4,0, ...,5,0,0],
[3,0,0, ...,0,0,5],
[4,0,0, ...,0,0,0],
...,
[0,0,0, ...,0,0,0],
[0,0,0, ...,0,0,0],
[0,0,0, ...,0,0,0]], dtype=uint8),
'__globals__': [],
'__header__':'MATLAB 5.0 MAT-file, Platform: GLNXA64, Created on: Thu Dec  1 17:19:26 2011',
'__version__':'1.0'}```

Y是一个包含1到5评分的数组，R是一个由二进制值组成的“指示器”数组，表明用户是否对某部电影进行了评分。两者应该具有相同的形状。

```Y= data['Y']
R= data['R']
Y.shape, R.shape
((1682L,943L), (1682L,943L))```

```python
# R is stored as uint8, so indexing with R[1, :] directly would treat its
# 0/1 entries as integer column indices instead of a mask. Compare against 1
# to average only the ratings that actually exist for this movie.
# NOTE(review): the value printed below appears to come from the unmasked
# expression -- re-run this cell to refresh it.
Y[1, R[1, :] == 1].mean()
2.5832449628844114```

```fig, ax= plt.subplots(figsize=(12,12))
ax.imshow(Y)
ax.set_xlabel('Users')
ax.set_ylabel('Movies')
fig.tight_layout()```

```python
def cost(params, Y, R, num_features):
    """Collaborative-filtering cost (no regularization, cost only).

    Parameters
    ----------
    params : 1-D array holding the movie feature matrix X followed by the
        user parameter matrix Theta, both flattened row by row.
    Y : (num_movies, num_users) ratings.
    R : (num_movies, num_users) indicator, non-zero where a rating exists.
    num_features : number of columns of both X and Theta.

    Returns
    -------
    J : half the sum of squared errors over the rated entries.
    """
    Y = np.matrix(Y)  # (num_movies, num_users)
    R = np.matrix(R)  # (num_movies, num_users)
    num_movies = Y.shape[0]
    num_users = Y.shape[1]

    # reshape the parameter array into parameter matrices
    split = num_movies * num_features
    X = np.matrix(np.reshape(params[:split], (num_movies, num_features)))
    Theta = np.matrix(np.reshape(params[split:], (num_users, num_features)))

    # compute the cost; multiplying by R zeroes the entries with no rating
    error = np.multiply((X * Theta.T) - Y, R)  # (num_movies, num_users)
    squared_error = np.power(error, 2)
    J = (1. / 2) * np.sum(squared_error)

    return J

```users= 4
movies= 5
features= 3

X= params_data['X']
Theta= params_data['Theta']

X_sub= X[:movies, :features]
Theta_sub= Theta[:users, :features]
Y_sub= Y[:movies, :users]
R_sub= R[:movies, :users]

params= np.concatenate((np.ravel(X_sub), np.ravel(Theta_sub)))

cost(params, Y_sub, R_sub, features)
22.224603725685675```

```python
def cost(params, Y, R, num_features):
    """Collaborative-filtering cost and gradient (no regularization).

    Parameters
    ----------
    params : 1-D array holding the movie feature matrix X followed by the
        user parameter matrix Theta, both flattened row by row.
    Y : (num_movies, num_users) ratings.
    R : (num_movies, num_users) 0/1 indicator of which ratings exist.
    num_features : number of columns of both X and Theta.

    Returns
    -------
    (J, grad) : the scalar cost and a 1-D gradient laid out like ``params``
        (X gradient first, then Theta gradient).
    """
    Y = np.matrix(Y)  # (num_movies, num_users)
    R = np.matrix(R)  # (num_movies, num_users)
    num_movies = Y.shape[0]
    num_users = Y.shape[1]

    # reshape the parameter array into parameter matrices
    split = num_movies * num_features
    X = np.matrix(np.reshape(params[:split], (num_movies, num_features)))
    Theta = np.matrix(np.reshape(params[split:], (num_users, num_features)))

    # compute the cost; multiplying by R zeroes the entries with no rating
    error = np.multiply((X * Theta.T) - Y, R)  # (num_movies, num_users)
    squared_error = np.power(error, 2)
    J = (1. / 2) * np.sum(squared_error)

    # calculate the gradients (error is already masked by the 0/1 matrix R)
    X_grad = error * Theta  # (num_movies, num_features)
    Theta_grad = error.T * X  # (num_users, num_features)

    # unravel the gradient matrices into a single array
    grad = np.concatenate((np.ravel(X_grad), np.ravel(Theta_grad)))

    return J, grad

J, grad= cost(params, Y_sub, R_sub, features)
(22.224603725685675,
array([-2.52899165,  7.57570308, -1.89979026, -0.56819597,
3.35265031, -0.52339845, -0.83240713,  4.91163297,
-0.76677878, -0.38358278,  2.26333698, -0.35334048,
-0.80378006,  4.74271842, -0.74040871,-10.5680202 ,
4.62776019, -7.16004443, -3.05099006,  1.16441367,
-3.47410789,  0.        ,  0.        ,  0.        ,
0.        ,  0.        ,  0.        ]))```

```python
def cost(params, Y, R, num_features, learning_rate):
    """Regularized collaborative-filtering cost and gradient.

    Parameters
    ----------
    params : 1-D array holding the movie feature matrix X followed by the
        user parameter matrix Theta, both flattened row by row.
    Y : (num_movies, num_users) ratings.
    R : (num_movies, num_users) 0/1 indicator of which ratings exist.
    num_features : number of columns of both X and Theta.
    learning_rate : weight of the squared-magnitude penalty on X and Theta
        (it acts as the regularization strength, not as a step size).

    Returns
    -------
    (J, grad) : the scalar cost and a 1-D gradient laid out like ``params``
        (X gradient first, then Theta gradient).
    """
    Y = np.matrix(Y)  # (num_movies, num_users)
    R = np.matrix(R)  # (num_movies, num_users)
    num_movies = Y.shape[0]
    num_users = Y.shape[1]

    # reshape the parameter array into parameter matrices
    split = num_movies * num_features
    X = np.matrix(np.reshape(params[:split], (num_movies, num_features)))
    Theta = np.matrix(np.reshape(params[split:], (num_users, num_features)))

    # compute the cost; multiplying by R zeroes the entries with no rating
    error = np.multiply((X * Theta.T) - Y, R)  # (num_movies, num_users)
    squared_error = np.power(error, 2)
    J = (1. / 2) * np.sum(squared_error)

    # add the cost regularization
    J = J + ((learning_rate / 2) * np.sum(np.power(Theta, 2)))
    J = J + ((learning_rate / 2) * np.sum(np.power(X, 2)))

    # calculate the gradients with regularization
    X_grad = (error * Theta) + (learning_rate * X)
    Theta_grad = (error.T * X) + (learning_rate * Theta)

    # unravel the gradient matrices into a single array
    grad = np.concatenate((np.ravel(X_grad), np.ravel(Theta_grad)))

    return J, grad

J, grad= cost(params, Y_sub, R_sub, features,1.5)
(31.344056244274221,
array([-0.95596339,  6.97535514, -0.10861109,  0.60308088,
2.77421145,  0.25839822,  0.12985616,  4.0898522 ,
-0.89247334,  0.29684395,  1.06300933,  0.66738144,
0.60252677,  4.90185327, -0.19747928,-10.13985478,
2.10136256, -6.76563628, -2.29347024,  0.48244098,
-2.99791422, -0.64787484, -0.71820673,  1.27006666,
1.09289758, -0.40784086,  0.49026541]))```

```python
# Map zero-based movie index -> title. Each line of the file is
# "<1-based id> <title words...>".
movie_idx = {}
with open('data/movie_ids.txt') as f:
    for line in f:
        # Strip only the line terminator, so a final line without a newline
        # does not lose its last character.
        tokens = line.rstrip('\n').split(' ')
        movie_idx[int(tokens[0]) - 1] = ' '.join(tokens[1:])

ratings= np.zeros((1682,1))

ratings[0]= 4
ratings[6]= 3
ratings[11]= 5
ratings[53]= 4
ratings[63]= 5
ratings[65]= 3
ratings[68]= 5
ratings[97]= 2
ratings[182]= 4
ratings[225]= 5
ratings[354]= 5

print('Rated {0} with {1} stars.'.format(movie_idx[0],str(int(ratings[0]))))
print('Rated {0} with {1} stars.'.format(movie_idx[6],str(int(ratings[6]))))
print('Rated {0} with {1} stars.'.format(movie_idx[11],str(int(ratings[11]))))
print('Rated {0} with {1} stars.'.format(movie_idx[53],str(int(ratings[53]))))
print('Rated {0} with {1} stars.'.format(movie_idx[63],str(int(ratings[63]))))
print('Rated {0} with {1} stars.'.format(movie_idx[65],str(int(ratings[65]))))
print('Rated {0} with {1} stars.'.format(movie_idx[68],str(int(ratings[68]))))
print('Rated {0} with {1} stars.'.format(movie_idx[97],str(int(ratings[97]))))
print('Rated {0} with {1} stars.'.format(movie_idx[182],str(int(ratings[182]))))
print('Rated {0} with {1} stars.'.format(movie_idx[225],str(int(ratings[225]))))
print('Rated {0} with {1} stars.'.format(movie_idx[354],str(int(ratings[354]))))
Rated Toy Story (1995) with 4 stars.
Rated Twelve Monkeys (1995) with 3 stars.
Rated Usual Suspects, The (1995) with 5 stars.
Rated Outbreak (1995) with 4 stars.
Rated Shawshank Redemption, The (1994) with 5 stars.
Rated While You Were Sleeping (1995) with 3 stars.
Rated Forrest Gump (1994) with 5 stars.
Rated Silence of the Lambs, The (1991) with 2 stars.
Rated Alien (1979) with 4 stars.
Rated Die Hard 2 (1990) with 5 stars.
Rated Sphere (1998) with 5 stars.```

```R= data['R']
Y= data['Y']

Y= np.append(Y, ratings, axis=1)
R= np.append(R, ratings != 0, axis=1)```

```from scipy.optimize import minimize

movies= Y.shape[0]
users= Y.shape[1]
features= 10
learning_rate= 10.

X= np.random.random(size=(movies, features))
Theta= np.random.random(size=(users, features))
params= np.concatenate((np.ravel(X), np.ravel(Theta)))

Ymean= np.zeros((movies,1))
Ynorm= np.zeros((movies, users))

# Mean-normalize each movie's ratings using only the entries that were rated.
for i in range(movies):
    idx = np.where(R[i, :] == 1)[0]
    if idx.size == 0:
        # No ratings for this movie: leave its mean and normalized row at 0
        # rather than taking the mean of an empty selection (NaN).
        continue
    Ymean[i] = Y[i, idx].mean()
    Ynorm[i, idx] = Y[i, idx] - Ymean[i]

fmin= minimize(fun=cost, x0=params, args=(Ynorm, R, features, learning_rate),
method='CG', jac=True, options={'maxiter':100})
fmin
status:1
success:False
njev:149
nfev:149
fun:38953.88249706676
x: array([-0.07177334,-0.08315075, 0.1081135 , ..., 0.1817828 ,
0.16873062, 0.03383596])
message:'Maximum number of iterations has been exceeded.'
jac: array([0.01833555, 0.07377974, 0.03999323, ...,-0.00970181,
0.00758961,-0.01181811])```

`X= np.matrix(np.reshape(fmin.x[:movies* features], (movies, features))) `
`Theta= np.matrix(np.reshape(fmin.x[movies* features:], (users, features)))`
`X.shape, Theta.shape`
`((1682L,10L), (944L,10L))`

```predictions= X* Theta.T
my_preds= predictions[:,-1]+ Ymean
sorted_preds= np.sort(my_preds, axis=0)[::-1]
sorted_preds[:10]
matrix([[5.00000264],
[5.00000249],
[4.99999831],
[4.99999671],
[4.99999659],
[4.99999253],
[4.99999238],
[4.9999915 ],
[4.99999019],
[4.99998643]]```

```idx= np.argsort(my_preds, axis=0)[::-1]
print("Top 10 movie predictions:")
# idx holds movie indices ordered by descending predicted rating.
for i in range(10):
    j = int(idx[i])
    print('Predicted rating of {0} for movie {1}.'.format(str(float(my_preds[j])), movie_idx[j]))
Top 10 movie predictions:
Predicted rating of 5.00000264002 for movie Prefontaine (1997).
Predicted rating of 5.00000249142 for movie Santa with Muscles (1996).
Predicted rating of 4.99999831018 for movie Marlene Dietrich: Shadow and Light (1996) .
Predicted rating of 4.9999967124 for movie Saint of Fort Washington, The (1993).
Predicted rating of 4.99999658864 for movie They Made Me a Criminal (1939).
Predicted rating of 4.999992533 for movie Someone Else's America (1995).
Predicted rating of 4.99999238336 for movie Great Day in Harlem, A (1994).
Predicted rating of 4.99999149604 for movie Star Kid (1997).
Predicted rating of 4.99999018592 for movie Aiqing wansui (1994).
Predicted rating of 4.99998642746 for movie Entertaining Angels: The Dorothy Day Story (1996).```

