# 基于协同过滤的推荐引擎（实战部分）

## 数据处理

#### 数据读取

```python
import numpy as np
import pandas as pd
in_file = '/Users/liukaixin/MachineLearning/dataset/ml-latest-small/ratings.csv'

print(len(full_data)) # 看看数据总条数```

#### 添加预测列

```python
import time
# Original ratings column, kept as the reference values.
real_rating = full_data['rating']
# Independent NumPy copy of the same column; a leading share of every
# user's entries in this copy is set to zero further below.
predict_rating = np.array(real_rating)
def get_change_index_points(full_dataframe, key):
    """Return the index labels of the rows where each value of *key* first appears.

    Args:
        full_dataframe: DataFrame containing the column named by ``key``.
        key: column name to de-duplicate on (the tutorial passes 'userId').

    Returns:
        A NumPy array with the index label of the first row of every
        distinct ``key`` value, in row order. When the frame has a default
        integer index and rows with the same key are adjacent (assumed by
        the callers here), these are the positions where the key changes.
    """
    return np.array(full_dataframe.drop_duplicates([key]).index)
start = time.clock()
points = get_change_index_points(full_data, 'userId')
end = time.clock()
print "run time: %f s" % (end - start)
print(len(points))```

`full_dataframe.drop_duplicates([key])`的意思是根据key去重（每个key只保留第一次出现的那一行），这个操作是pandas优化过的，速度很快，得到的还是DataFrame，只要取index并转成array，就得到每个用户第一条记录的下标，也就是userId的改变点。接下来写按比例置零的函数（下文比例取0.3，约1/3）。

```python
# 预测列置0，zero_percent是每个用户前多大比例的评分置0（例如0.3表示前30%）
def change_predict_data(predict_data, zero_percent, change_points=None):
    """Zero the leading share of every user's ratings, in place.

    Args:
        predict_data: mutable 1-D sequence of ratings (NumPy array or list).
            Each user's ratings are assumed to occupy one contiguous run.
        zero_percent: fraction of each run to zero, e.g. 0.3 for the
            first 30%. The count is truncated towards zero with int().
        change_points: positions at which a new user's run starts. Defaults
            to the module-level ``points`` computed earlier in this file.

    Returns:
        ``predict_data`` itself, after modification.
    """
    if change_points is None:
        change_points = points
    # The change points only mark where runs *start*, so without a closing
    # boundary the final user's run would never be processed.
    boundaries = list(change_points) + [len(predict_data)]
    run_start = 0
    for boundary in boundaries:
        zero_count = int((boundary - run_start) * zero_percent)
        for offset in range(zero_count):
            predict_data[run_start + offset] = 0
        run_start = boundary
    return predict_data
import time
start = time.clock()
predict_rating = change_predict_data(predict_rating, 0.3)
end = time.clock()
print "run time: %f s" % (end - start)  # run time: 0.031788 s
full_data.insert(4,'predict_rating',predict_rating)
print(full_data[:20])```

## 获得要比较的两个列向量

```# full_data:原DataFrame
# mov_id:要预测的movie id
# ref_mov_id:对照的movie id
# 返回值colA是要预测的列的rating，colB是对照列的rating，由于评分预测接受的传值是矩阵的列向量，所以转成矩阵
def get_colA_and_colB(full_data, mov_id, ref_mov_id):
colA = []
colB = []
movies = full_data[full_data['movieId'] == mov_id] # 相当于sql查询movie id是mov_id的所有数据，所以得到的是一个DataFrame
for i, movie in movies.iterrows(): # 遍历要预测的电影
if movie['predict_rating'] == 0.0: # 去掉没打分的电影
continue
user_id = movie['userId'] # 找打过分的记录，看是谁打的分
comp_user_movies = full_data[full_data['userId'] == user_id] # 找到打过分的用户的所有电影
rating = []
for i, mov in comp_user_movies.iterrows(): # 遍历电影，看他是否也看过要预测的电影，如果看过，则把打分加入colB
if mov['movieId'] == ref_mov_id:
rating.append(mov['predict_rating'])
if len(rating) == 0:
continue
else:
colA.append([movie['predict_rating']])
colB.append(rating)
return np.mat(colA), np.mat(colB)```

## 计算预测评分

```python
# 欧氏距离
def eulid_sim(colA, colB):
    """Return a Euclidean-distance similarity in (0, 1].

    Computed as ``1 / (1 + ||colA - colB||)``, so identical inputs give 1.0
    and the value shrinks as the distance grows.
    """
    return 1.0 / (1.0 + np.linalg.norm(colA - colB))
def pears_sim(colA, colB):
    """Return the Pearson correlation of two columns rescaled to [0, 1].

    The correlation coefficient in [-1, 1] is mapped with ``0.5 + 0.5 * r``.
    Inputs with fewer than three rows are treated as perfectly similar
    and yield 1.0.

    NOTE(review): np.corrcoef produces nan when either column is constant;
    that nan is returned unchanged.
    """
    if len(colA) < 3:
        return 1.0
    # rowvar=0: each column is a variable and each row an observation.
    return 0.5 + 0.5 * np.corrcoef(colA, colB, rowvar=0)[0][1]
# 余弦相似度
def cos_sim(colA, colB):
if (colA.shape==(1,0)):
return 0
num = float(colA.T * colB) # colA和colB都是列向量，shape一样，都形如[[1],[2],[3],[4]]，两个shape一样不能相乘，需要将其中一个转为行向量
denom = np.linalg.norm(colA) * np.linalg.norm(colB)
return 0.5 + 0.5 * (num / denom)```

```python
# full_datas:原数据DataFrame
# user_id:要推荐的用户id
# movie_to_pre_id：要预测评分的电影id
# est：选择的相似度计算函数
def calculate_score(full_datas, user_id, movie_to_pre_id, est):
    """Predict a user's rating of one movie as a similarity-weighted average.

    Args:
        full_datas: DataFrame with 'userId', 'movieId' and 'predict_rating'
            columns.
        user_id: id of the user to predict for.
        movie_to_pre_id: id of the movie whose rating is predicted.
        est: similarity function called as ``est(colA, colB)``.

    Returns:
        ``sum(similarity * rating) / sum(similarity)`` over the user's
        movies whose 'predict_rating' is non-zero, or 0.0 when the
        similarities sum to zero (for example, the user has no usable
        ratings).
    """
    user_movies = full_datas[full_datas['userId'] == user_id]
    sim_total = 0.0
    rat_sim_total = 0.0
    for _, movie in user_movies.iterrows():
        user_rating = movie['predict_rating']
        if user_rating == 0.0:  # 0 marks a rating that is hidden / missing
            continue
        # Use the frame passed in as the argument, not the module-level one.
        colA, colB = get_colA_and_colB(
            full_datas, movie_to_pre_id, movie['movieId'])
        similarity = est(colA, colB)
        sim_total += similarity
        rat_sim_total += similarity * user_rating
    if sim_total == 0:
        return 0.0
    return rat_sim_total / sim_total

print('eulid_sim = %f' % calculate_score(full_data, 12, 1028, eulid_sim))
print('pearson_sim = %f' % calculate_score(full_data, 12, 1028, pears_sim))
print('cos_sim = %f' % calculate_score(full_data, 12, 1028, cos_sim))```

## 推荐

```def new_recommend(full_datas, user_id):
user_movies = full_datas[full_datas['userId'] == user_id]
score_df = pd.DataFrame(columns=['movieId','predict_rating', 'real_rating'])
score = []
movid = []
rel_rat = []
for i, data in user_movies.iterrows():
if data['predict_rating'] == 0:
score.append(calculate_score(full_datas, user_id, data['movieId'], eulid_sim))
movid.append(data['movieId'])
rel_rat.append(data['rating'])
#     score.sort()
score_df['movieId'] = movid
score_df['predict_rating'] = score
score_df['real_rating'] = rel_rat
return score_df.sort_values(by='predict_rating',ascending=False)
print(new_recommend(full_data, 1)) # 看第一个用户的预测评分```

100 篇文章31 人订阅

0 条评论

5539

4016

2333

1.5K2

3474

2.5K3

1051

47812

4286

3698