# 二.Item Based and User Based

## 1.原理

Item-based和User-Based是CF算法中最基础的两个了，其算法思想很intuitive：

User-based就是把与你有相同爱好的用户所喜欢的物品(并且你还没有评过分)推荐给你

Item-based则与之相反，把和你之前喜欢的物品近似的物品推荐给你：

## 2.实现

Python 代码：

1. from __future__ import division
2. import numpy as np
3. import scipy as sp
class Item_based_C:
    """Item-based collaborative filtering.

    Predicts a user's rating of an item as the similarity-weighted average of
    the ratings that user gave to other items, with item-item Pearson
    correlation as the similarity measure.
    """

    def __init__(self, X):
        """Build the rating indexes.

        X: array-like of (user_id, item_id, rating) rows.
        """
        self.X = np.array(X)
        print("the input data size is ", self.X.shape)
        self.movie_user = {}  # item_id -> {user_id: rating}
        self.user_movie = {}  # user_id -> {item_id: rating}
        self.ave = np.mean(self.X[:, 2])  # global mean rating, fallback prediction
        for i in range(self.X.shape[0]):
            uid = self.X[i][0]
            mid = self.X[i][1]
            rat = self.X[i][2]
            self.movie_user.setdefault(mid, {})
            self.user_movie.setdefault(uid, {})
            self.movie_user[mid][uid] = rat
            self.user_movie[uid][mid] = rat
        self.similarity = {}  # symmetric cache: m1 -> {m2: similarity}

    def sim_cal(self, m1, m2):
        """Pearson correlation between items m1 and m2 over their common raters.

        Results are cached symmetrically.  NOTE(review): -1 doubles as the
        "not yet computed" sentinel, so a genuine similarity of exactly -1
        would be recomputed on every call (kept for compatibility).
        """
        self.similarity.setdefault(m1, {})
        self.similarity.setdefault(m2, {})
        self.movie_user.setdefault(m1, {})
        self.movie_user.setdefault(m2, {})
        self.similarity[m1].setdefault(m2, -1)
        self.similarity[m2].setdefault(m1, -1)
        if self.similarity[m1][m2] != -1:
            return self.similarity[m1][m2]
        # users who rated both items
        si = {}
        for user in self.movie_user[m1]:
            if user in self.movie_user[m2]:
                si[user] = 1
        n = len(si)
        if n == 0:
            # no common rater: optimistically treat the items as fully similar
            self.similarity[m1][m2] = 1
            self.similarity[m2][m1] = 1
            return 1
        s1 = np.array([self.movie_user[m1][u] for u in si])
        s2 = np.array([self.movie_user[m2][u] for u in si])
        sum1 = np.sum(s1)
        sum2 = np.sum(s2)
        sum1Sq = np.sum(s1 ** 2)
        sum2Sq = np.sum(s2 ** 2)
        pSum = np.sum(s1 * s2)
        # Pearson correlation: covariance over the product of std deviations
        num = pSum - (sum1 * sum2 / n)
        den = np.sqrt((sum1Sq - sum1 ** 2 / n) * (sum2Sq - sum2 ** 2 / n))
        if den == 0:
            # zero variance on one side: correlation undefined, treat as 0
            self.similarity[m1][m2] = 0
            self.similarity[m2][m1] = 0
            return 0
        self.similarity[m1][m2] = num / den
        self.similarity[m2][m1] = num / den
        return num / den

    def pred(self, uid, mid):
        """Predict user uid's rating of item mid.

        Weighted average of uid's ratings by item similarity; items with
        negative similarity are skipped.  Falls back to the global mean
        when no positively-similar co-rated item exists.
        """
        sim_accumulate = 0.0
        rat_acc = 0.0
        for item in self.user_movie[uid]:
            sim = self.sim_cal(item, mid)
            if sim < 0:
                continue
            rat_acc += sim * self.user_movie[uid][item]
            sim_accumulate += sim
        if sim_accumulate == 0:  # no same user rated, return average rating of the data
            return self.ave
        return rat_acc / sim_accumulate

    def test(self, test_X):
        """Predict every (uid, mid) in test_X, print the RMSE, return the predictions."""
        test_X = np.array(test_X)
        output = []
        sums = 0
        print("the test data size is ", test_X.shape)
        for i in range(test_X.shape[0]):
            pre = self.pred(test_X[i][0], test_X[i][1])
            output.append(pre)
            sums += (pre - test_X[i][2]) ** 2
        rmse = np.sqrt(sums / test_X.shape[0])
        print("the rmse on test data is ", rmse)
        return output

sim_cal()为相似度计算,pred(uid,mid)预测uid号用户对mid号电影评分,然后我们在test()中计算RMSE,来看看结果:

# 三.matrix factorization model 和 Baseline Predictors

## 2.Baseline Predictors

Baseline Predictors就简单多了，我们设定μ是平均值，然后分别用bi和bu来代表具体用户和物品的“偏好”，也就是

# 四.SVD and ++ and so on

## 1.SVD及其衍生算法的原理

SVD算法其实就是上面介绍的matrix factorization的一种，加上baseline predictor算是一种优化而已，最简单的SVD是优化下面的Loss function：

### SVD++

R(u)表示用户rate过的电影，这样加入参数后使模型更加复杂了，但其效果也就更好了，具体的优化过程就不贴了，反正还是那样，对Loss function求导而已。

## 2.SVD的实现

Python 代码：

from __future__ import division

1. import numpy as np
2. import scipy as sp
3. from numpy.random import random
class SVD_C:
    """Biased matrix-factorization rating model (SVD-style), trained with SGD.

    Prediction: r_hat(u, i) = mu + b_i + b_u + q_i . p_u, clipped to [1, 5].
    """

    def __init__(self, X, k=20):
        """
        X: array-like of (user_id, item_id, rating) rows.
        k: length of the latent factor vectors.
        """
        self.X = np.array(X)
        self.k = k
        self.ave = np.mean(self.X[:, 2])  # global mean, baseline of every prediction
        print("the input data size is ", self.X.shape)
        self.bi = {}  # item biases
        self.bu = {}  # user biases
        self.qi = {}  # item latent vectors, shape (k, 1)
        self.pu = {}  # user latent vectors, shape (k, 1)
        self.movie_user = {}
        self.user_movie = {}
        for i in range(self.X.shape[0]):
            uid = self.X[i][0]
            mid = self.X[i][1]
            rat = self.X[i][2]
            self.movie_user.setdefault(mid, {})
            self.user_movie.setdefault(uid, {})
            self.movie_user[mid][uid] = rat
            self.user_movie[uid][mid] = rat
            self.bi.setdefault(mid, 0)
            self.bu.setdefault(uid, 0)
            # small random initialization of the factors
            self.qi.setdefault(mid, random((self.k, 1)) / 10 * (np.sqrt(self.k)))
            self.pu.setdefault(uid, random((self.k, 1)) / 10 * (np.sqrt(self.k)))

    def pred(self, uid, mid):
        """Predict uid's rating of mid, clipped to the valid range [1, 5].

        Unseen users/items get zero bias and zero factors, i.e. the global mean.
        """
        self.bi.setdefault(mid, 0)
        self.bu.setdefault(uid, 0)
        self.qi.setdefault(mid, np.zeros((self.k, 1)))
        self.pu.setdefault(uid, np.zeros((self.k, 1)))
        # Fixed: `== None` on an ndarray compares elementwise and raises
        # "truth value is ambiguous"; identity check is required here.
        if self.qi[mid] is None:
            self.qi[mid] = np.zeros((self.k, 1))
        if self.pu[uid] is None:
            self.pu[uid] = np.zeros((self.k, 1))
        ans = self.ave + self.bi[mid] + self.bu[uid] + np.sum(self.qi[mid] * self.pu[uid])
        if ans > 5:
            return 5
        elif ans < 1:
            return 1
        return ans

    def train(self, steps=20, gamma=0.04, Lambda=0.15):
        """SGD over shuffled ratings.

        gamma: learning rate, decayed by 0.93 each step.
        Lambda: L2 regularization strength.
        """
        for step in range(steps):
            print('the ', step, '-th step is running')
            rmse_sum = 0.0
            kk = np.random.permutation(self.X.shape[0])
            for j in range(self.X.shape[0]):
                i = kk[j]
                uid = self.X[i][0]
                mid = self.X[i][1]
                rat = self.X[i][2]
                eui = rat - self.pred(uid, mid)
                rmse_sum += eui ** 2
                self.bu[uid] += gamma * (eui - Lambda * self.bu[uid])
                self.bi[mid] += gamma * (eui - Lambda * self.bi[mid])
                # Fixed: must snapshot qi before the in-place update; the
                # original kept an alias, so pu's gradient used the already
                # updated qi instead of the old value.
                temp = self.qi[mid].copy()
                self.qi[mid] += gamma * (eui * self.pu[uid] - Lambda * self.qi[mid])
                self.pu[uid] += gamma * (eui * temp - Lambda * self.pu[uid])
            gamma = gamma * 0.93
            print("the rmse of this step on train data is ", np.sqrt(rmse_sum / self.X.shape[0]))

    def test(self, test_X):
        """Predict every (uid, mid) in test_X, print the RMSE, return the predictions."""
        output = []
        sums = 0
        test_X = np.array(test_X)
        for i in range(test_X.shape[0]):
            pre = self.pred(test_X[i][0], test_X[i][1])
            output.append(pre)
            sums += (pre - test_X[i][2]) ** 2
        rmse = np.sqrt(sums / test_X.shape[0])
        print("the rmse on test data is ", rmse)
        return output

Python 使用示例：

# Fit a 30-factor SVD model on the training split, then report test RMSE.
model = SVD_C(train_X, 30)
model.train()
model.test(test_X)

# 五.CF with RBM

## 2.实现

Python 代码：

1. from __future__ import division
2. import numpy as np
3. import scipy as sp
4. from numpy.random import normal,random,uniform
5. '''''
6. this code still have some problem,I can only get 0.98 rmse on movielens data
7. If you can figure it out,PLEASE!!! tell me .
8. '''
class TEMP:
    """Per-minibatch accumulators for the RBM's contrastive-divergence statistics."""

    def __init__(self):
        self.AccVH = None    # accumulated <v_ik * h_j> statistics (positive minus negative phase)
        self.CountVH = None  # contribution counts per (item, rating, hidden) weight
        self.AccV = None     # accumulated visible activations
        # Fixed: the original assigned `self.temp.CountV` / `self.temp.AccH`,
        # which raises AttributeError -- there is no `self.temp` attribute.
        self.CountV = None   # contribution counts per visible unit
        self.AccH = None     # accumulated hidden activations
class CF_RMB_C:
    """Conditional RBM collaborative filter.

    Visible units are per-item softmaxes over the Rate rating levels; the
    hidden layer is additionally conditioned (via Dij) on which items the
    user rated.  Defaults match the MovieLens-100k dimensions.
    """

    def __init__(self, X, UserNum=943, HiddenNum=30, ItemNum=1682, Rate=5):
        """
        X: (user_id, item_id, rating) rows; ids are 1-based, ratings in 1..Rate.
        """
        self.X = np.array(X)
        self.HiddenNum = HiddenNum
        self.ItemNum = ItemNum
        self.UserNum = UserNum
        self.Rate = Rate
        self.movie_user = {}
        self.user_movie = {}
        self.bik = np.zeros((self.ItemNum, self.Rate))  # visible softmax biases
        self.Momentum = {}
        self.Momentum['bik'] = np.zeros((self.ItemNum, self.Rate))
        self.UMatrix = np.zeros((self.UserNum, self.ItemNum))  # 1 where user rated item
        self.V = np.zeros((self.ItemNum, self.Rate))  # template for visible-layer arrays
        for i in range(self.X.shape[0]):
            uid = self.X[i][0] - 1  # convert 1-based ids to 0-based indices
            mid = self.X[i][1] - 1
            rat = self.X[i][2] - 1
            self.UMatrix[uid][mid] = 1
            self.bik[mid][rat] += 1  # count ratings here; converted to log-probs below
            self.movie_user.setdefault(mid, {})
            self.user_movie.setdefault(uid, {})
            self.movie_user[mid][uid] = rat
            self.user_movie[uid][mid] = rat
        self.W = normal(0, 0.01, (self.ItemNum, self.Rate, HiddenNum))
        self.Momentum['W'] = np.zeros(self.W.shape)
        self.initialize_bik()
        # Fixed: `.flatten(1)` (legacy int order argument) is rejected by
        # modern numpy; every use here flattens data that is already 1-D or
        # a single column, so plain `.flatten()` is equivalent.
        self.bj = np.zeros((HiddenNum, 1)).flatten()  # hidden biases
        self.Momentum['bj'] = np.zeros(self.bj.shape)
        self.Dij = np.zeros((self.ItemNum, self.HiddenNum))  # conditional weights on the rated-indicator
        self.Momentum['Dij'] = np.zeros((self.ItemNum, self.HiddenNum))

    def initialize_bik(self):
        """Turn the rating counts in bik into log empirical rating frequencies."""
        for i in range(self.ItemNum):
            total = np.sum(self.bik[i])
            if total > 0:
                for k in range(self.Rate):
                    if self.bik[i][k] == 0:
                        self.bik[i][k] = -10  # strongly discourage never-observed ratings
                    else:
                        self.bik[i][k] = np.log(self.bik[i][k] / total)

    def test(self, test_X):
        """Predict every (uid, mid) in test_X (1-based ids), print RMSE, return predictions."""
        output = []
        sums = 0
        test_X = np.array(test_X)
        for i in range(test_X.shape[0]):
            pre = self.pred(test_X[i][0] - 1, test_X[i][1] - 1)
            output.append(pre)
            sums += (pre - test_X[i][2]) ** 2
        rmse = np.sqrt(sums / test_X.shape[0])
        print("the rmse on test data is ", rmse)
        return output

    def pred(self, uid, mid):
        """Expected rating (in 1..Rate) of item mid by user uid from one mean-field pass."""
        V = self.clamp_user(uid)
        pj = self.update_hidden(V, uid)
        vp = self.update_visible(pj, uid, mid)
        ans = 0
        for i in range(self.Rate):
            ans += vp[i] * (i + 1)  # expectation over the rating softmax
        return ans

    def clamp_user(self, uid):
        """One-hot visible configuration for uid's observed ratings (zeros elsewhere)."""
        V = np.zeros(self.V.shape)
        for i in self.user_movie[uid]:
            V[i][self.user_movie[uid][i]] = 1
        return V

    def train(self, para, test_X, cd_steps=3, batch_size=30, numEpoch=100, Err=0.00001):
        """Mini-batch CD-k training.

        para: dict of hyperparameters -- learning rates ('W', 'bik', 'bj', 'D'),
        'Momentum' and 'weight_cost'.  Evaluates on test_X after each epoch.
        """
        for epo in range(numEpoch):
            print('the ', epo, '-th epoch is running')
            kk = np.random.permutation(range(self.UserNum))
            bt_count = 0
            while bt_count <= self.UserNum:
                btend = min(self.UserNum, bt_count + batch_size)
                users = kk[bt_count:btend]
                # Fixed: the original had `temp = TEMP` (the class object, not
                # an instance), silently turning the accumulators into shared
                # class attributes.
                temp = TEMP()
                temp.AccVH = np.zeros(self.W.shape)
                temp.CountVH = np.zeros(self.W.shape)
                temp.AccV = np.zeros(self.V.shape)
                temp.CountV = np.zeros(self.V.shape)
                temp.AccH = np.zeros(self.bj.shape)
                watched = np.zeros(self.UMatrix[0].shape)  # items seen in this batch
                for user in users:
                    watched[self.UMatrix[user] == 1] = 1
                    sv = self.clamp_user(user)
                    pj = self.update_hidden(sv, user)
                    temp = self.accum_temp(sv, pj, temp, user)  # positive phase
                    for step in range(cd_steps):  # Gibbs chain for the negative phase
                        sh = self.sample_hidden(pj)
                        vp = self.update_visible(sh, user)
                        sv = self.sample_visible(vp, user)
                        pj = self.update_hidden(sv, user)
                    temp = self.deaccum_temp(sv, pj, temp, user)  # negative phase
                self.updateall(temp, batch_size, para, watched)
                bt_count += batch_size
            self.test(test_X)

    def accum_temp(self, V, pj, temp, uid):
        """Add the positive-phase statistics of one user to the accumulators."""
        for i in self.user_movie[uid]:
            temp.AccVH[i] += np.dot(V[i].reshape(-1, 1), pj.reshape(1, -1))
            temp.CountVH[i] += 1
            temp.AccV[i] += V[i]
            temp.CountV[i] += 1
        temp.AccH += pj
        return temp

    def deaccum_temp(self, V, pj, temp, uid):
        """Subtract the negative-phase statistics of one user from the accumulators."""
        for i in self.user_movie[uid]:
            temp.AccVH[i] -= np.dot(V[i].reshape(-1, 1), pj.reshape(1, -1))
            temp.AccV[i] -= V[i]
        temp.AccH -= pj
        return temp

    def updateall(self, temp, batch_size, para, watched):
        """Apply one momentum-SGD update from the accumulated CD statistics."""
        deltaW = np.zeros(temp.CountVH.shape)
        deltaBik = np.zeros(temp.CountV.shape)
        mask_vh = temp.CountVH != 0  # only update weights that received gradient
        mask_v = temp.CountV != 0
        deltaW[mask_vh] = temp.AccVH[mask_vh] / temp.CountVH[mask_vh]
        deltaBik[mask_v] = temp.AccV[mask_v] / temp.CountV[mask_v]
        deltaBj = temp.AccH / batch_size
        self.Momentum['W'][mask_vh] = self.Momentum['W'][mask_vh] * para['Momentum']
        self.Momentum['W'][mask_vh] += para['W'] * (deltaW[mask_vh] - para['weight_cost'] * self.W[mask_vh])
        self.W[mask_vh] += self.Momentum['W'][mask_vh]
        self.Momentum['bik'][mask_v] = self.Momentum['bik'][mask_v] * para['Momentum']
        self.Momentum['bik'][mask_v] += para['bik'] * deltaBik[mask_v]
        self.bik[mask_v] += self.Momentum['bik'][mask_v]
        self.Momentum['bj'] = self.Momentum['bj'] * para['Momentum']
        self.Momentum['bj'] += para['bj'] * deltaBj
        self.bj += self.Momentum['bj']
        for i in range(self.ItemNum):
            if watched[i] == 1:
                self.Momentum['Dij'][i] = self.Momentum['Dij'][i] * para['Momentum']
                self.Momentum['Dij'][i] += para['D'] * temp.AccH / batch_size
                self.Dij[i] += self.Momentum['Dij'][i]
        # NOTE(review): debugging leftover -- turns numpy warnings into
        # exceptions globally; kept for behavioral compatibility.
        np.seterr(all='raise')

    def update_hidden(self, V, uid):
        """p(h_j = 1 | V) restricted to the items the user rated, conditioned on UMatrix."""
        r = self.UMatrix[uid]
        hp = None
        for i in self.user_movie[uid]:
            # Fixed: `hp == None` is elementwise on ndarrays; identity check required.
            if hp is None:
                hp = np.dot(V[i], self.W[i]).flatten()
            else:
                hp += np.dot(V[i], self.W[i]).flatten()
        pj = 1 / (1 + np.exp(-self.bj - hp + np.dot(r, self.Dij).flatten()))
        return pj

    def sample_hidden(self, pj):
        """Sample binary hidden states from their Bernoulli probabilities pj."""
        # vectorized equivalent of the elementwise uniform-threshold loop
        return (uniform(size=pj.shape) < pj).astype(float)

    def update_visible(self, sh, uid, mid=None):
        """Softmax visible probabilities given hidden state sh.

        With mid=None, returns the full (ItemNum, Rate) array over uid's rated
        items; otherwise returns the Rate-vector for the single item mid.
        """
        if mid is None:
            vp = np.zeros(self.V.shape)
            for i in self.user_movie[uid]:
                vp[i] = np.exp(self.bik[i] + np.dot(self.W[i], sh))
                vp[i] = vp[i] / np.sum(vp[i])
            return vp
        vp = np.exp(self.bik[mid] + np.dot(self.W[mid], sh))
        vp = vp / np.sum(vp)
        return vp

    def sample_visible(self, vp, uid):
        """Sample one-hot rating configurations from the softmax probabilities vp."""
        sv = np.zeros(self.V.shape)
        for i in self.user_movie[uid]:
            r = uniform()
            k = 0
            for k in range(self.Rate):  # inverse-CDF sampling over the Rate levels
                r -= vp[i][k]
                if r <= 0:
                    break
            sv[i][k] = 1
        return sv

822 篇文章 · 234 人订阅

0 条评论

## 相关文章

### 最近下载的以及一些朋友共享的图像方面的论文备份。

过完年以后，一直忙于各种杂事和杂务，本来可以做研究的时间被带崽和加班所占据，偶尔有闲看到一些好文章也只能先备份在那里在等日后有空了在去研究。还有就是QQ里的...

26990

22580

### KDD2018 | 电商搜索场景中的强化排序学习：形式化、理论分析以及应用

（1）对电商搜索场景中的多步排序问题进行形式化描述，定义搜索会话马尔科夫决策过程问题模型（Search Session Markov Decision Proc...

27420

### 【AI可能真的要代替插画师了】复旦同济用cGAN生成动画人物

【新智元导读】复旦大学、同济、CMU等的研究者使用cGAN生成各种属性的二次元人物头像，效果非常令人印象深刻。生成的图片质量非常之高，本文作者认为这项工作如果加...

51250

### 机器学习人工学weekly-2018/3/17

1. PyTorch构架分析 PyTorch – Internal Architecture Tour 链接：http://blog.christianper...

32370

### 机械版CG 实验6 简单光照明模型实现

Phong光照明模型是由物体表面上一点P反射到视点的光强I为环境光的反射光强Ie、理想漫反射光强Id、和镜面反射光Is的总和，即

12410

15530

13510

### 机器学习人工学weekly-2018/7/1

Building the Software 2.0 Stack by Andrej Karpathy from Tesla

14740

### Github 项目推荐 | 基于 PyTorch，面向 AI 系统加速研究与开发的深度学习框架

TorchFusion 基于 PyTorch 并且完全兼容纯 PyTorch 和其他 PyTorch 软件包，它供了一个全面的可扩展训练框架，可以轻松用开发者的...

15320