所用数据:MovieLens 1M 数据集(ml-1m),grouplens.org/datasets/movielens/
数据读取
""" @author: LiShiHang @software: PyCharm @file: data_preprocessing.py @time: 2019/4/3 21:12 @desc: """ import pandas as pd import numpy as np np.set_printoptions(threshold=np.inf) def read_data(dir="../data/ml-1m/"): """ 将data/ml-1m文件夹中的dat文件转化为csv :param dir: :return: """ #################################### f_users = pd.read_table( dir + "users.dat", engine="python", sep="::", names=[ 'userID', 'Gender', 'Age', 'Occupation', 'Zip-code']) f_users.to_csv("users.csv", index=False) f_users.info() ########################################### f_ratings = pd.read_table(dir + "ratings.dat", engine="python", sep="::", names=['UserID', 'MovieID', 'Rating', 'Timestamp']) f_ratings.to_csv("ratings.csv", index=False) f_ratings.info() ################################################ f_movies = pd.read_table(dir + "movies.dat", engine="python", sep="::", names=['MovieID', 'Title', 'Genres']) f_movies.to_csv("movies.csv", index=False) f_movies.info() print("finish.") if __name__ == '__main__': read_data()
模型构建
""" @author: LiShiHang @software: PyCharm @file: user_cf.py @time: 2019/4/4 8:31 @desc: """ import math import data_preprocessing import pandas as pd class UserCF(): def __init__(self, path_rating): self.data_rating = pd.read_csv(path_rating) def calc_user_sim(self, item1, item2): """ 计算相似度 :param item1: :param item2: :return: """ cosine = len(set(item1) & set(item2)) / math.sqrt(len(item1) * len(item2)) return cosine def get_users_topN(self, source_user_id, N): """ 得到N个相似用户 :param source_user_id: :param topN: :return: [[用户ID,相似度]] """ source_user_movies = self.data_rating[self.data_rating['UserID'] == source_user_id]["MovieID"] # 目标用户看过的电影ID others_id = [ i for i in set( self.data_rating['UserID']) if i != source_user_id] # 其他用户ID others_movies = [ self.data_rating[self.data_rating['UserID'] == i]["MovieID"] for i in others_id] # [[其他用户看的电影ID] 其他用户ID] sim_list = [ self.calc_user_sim( source_user_movies, movies) for movies in others_movies] # 根据目标用户和其他用户 看过的电影ID计算相似度 sim_list = sorted(zip(others_id, sim_list), key=lambda x: x[1], reverse=True) return sim_list[:N] def get_candidate(self, source_user_id): """ 得到候选电影清单 :param source_user_id: :return: """ source_user_movies = set( self.data_rating[self.data_rating['UserID'] == source_user_id]["MovieID"]) # 目标用户看过的电影ID others_movies = set( self.data_rating[self.data_rating['UserID'] != source_user_id]["MovieID"]) candidate_movies = others_movies - source_user_movies return candidate_movies def get_item_topN(self, top_n_users, candidate_movies, topN): """ 得到推荐电影列表 :param top_n_users: :param candidate_movies: :param topN: :return: """ top_n_users_data = [self.data_rating[self.data_rating['UserID'] == i] for i, _ in top_n_users] # 相似用户数据 interest_item = [] for cm in candidate_movies: # 候选电影 tmp = [] for user_data in top_n_users_data: # 相似用户,[[用户ID,相似度]] if cm in user_data["MovieID"].values: tmp.append(user_data[user_data["MovieID"] == cm]['Rating'].values[0] / 5.0) else: tmp.append(0) interest = sum([top_n_users[i][1] * 
tmp[i] for i in range(len(top_n_users))]) # 相似用户对每个候选电影的感兴趣度(评分) interest_item.append((cm, interest)) interest_item = sorted(interest_item, key=lambda x: x[1], reverse=True) return interest_item[:topN] if __name__ == '__main__': ucf = UserCF("ratings.csv") ui = 1 top_n_users = ucf.get_users_topN(ui, 10) candidate = ucf.get_candidate(ui) top_n_movies = ucf.get_item_topN(top_n_users, candidate, 10) print(top_n_movies) # 推荐的电影ID,推荐程度 # 显示电影名 movies = pd.read_csv("movies.csv") # print("*" * 20) # for i in ucf.data_rating[ucf.data_rating["UserID"]==ui]["MovieID"]: # 目标用户看过的电影名称 # print(*movies[movies["MovieID"]==i].values[0]) print("*" * 20) for i, j in top_n_movies: # 推荐看的电影名称 print(*movies[movies["MovieID"] == i].values[0], j)
由于最近较忙,博客不做说明。
参考链接:
blog.csdn.net/sinat_33741…
github.com/lpty/recomm…
www.cnblogs.com/tbiiann/
github.com/apachecn/Re…
声明:本站所有文章,如无特殊说明或标注,均为本站原创发布。任何个人或组织,在未征得本站同意时,禁止复制、盗用、采集、发布本站内容到任何网站、书籍等各类媒体平台。如若本站内容侵犯了原著者的合法权益,可联系我们进行处理。
评论(0)