当前位置:   article > 正文

Item2vec_movielens item2vec

movielens item2vec

数据结构:

 

数据集:在参考链接里面,因为预训练时间太长,所以数据集换成ml-latest-small,取出里面的ratings.csv和movies.csv。 

链接:

https://github.com/rexrex9/kb4recMovielensDataProcess

注:每个文件夹下的original文件夹下面是原始数据。

代码: 

index_2.py

  1. # 参考链接: https://blog.csdn.net/fuzi2012/article/details/91345164
  2. import pandas as pd
  3. import numpy as np
  4. df_movies=pd.read_csv('../data2/movies.csv')
  5. df_ratings=pd.read_csv('../data2/ratings.csv')
  6. Id_title=pd.Series(df_movies.title.values,index=df_movies.movieId.values).to_dict()
  7. Title_id=pd.Series(df_movies.movieId.values,index=df_movies.title).to_dict()
  8. # print(type(Title_id))
  9. # print(Title_id)
  10. # index = ['Bob', 'Steve', 'Jeff', 'Ryan', 'Jeff', 'Ryan']
  11. # obj = pd.Series([4, 7, -5, 3, 7, np.nan],index = index)
  12. # print(obj)
  13. for df in list((df_movies,df_ratings)):
  14. rand_idx=np.random.choice(len(df),5,replace=False)
  15. # print(df.iloc[rand_idx,:])
  16. import matplotlib.pyplot as plt
  17. # import plotly.plotly as py
  18. plt.figure(figsize=(8, 6))
  19. ax = plt.subplot(111)
  20. ax.set_title("Distribution of Movie Ratings", fontsize=16)
  21. ax.spines["top"].set_visible(False)
  22. ax.spines["right"].set_visible(False)
  23. plt.xticks(fontsize=12)
  24. plt.yticks(fontsize=12)
  25. plt.xlabel("Movie Rating", fontsize=14)
  26. plt.ylabel("Count", fontsize=14)
  27. plt.hist(df_ratings['rating'], color="#3F5D7D")
  28. # plt.show()
  29. #划分数据集
  30. from sklearn.model_selection import train_test_split
  31. df_ratings_train, df_ratings_test= train_test_split(df_ratings,
  32. stratify=df_ratings['userId'],
  33. random_state = 15688,
  34. test_size=0.30)
  35. # print("Number of training data: "+str(len(df_ratings_train)))
  36. # print("Number of test data: "+str(len(df_ratings_test)))
  37. #评分>4,设为1,否则为0
  38. def rating_splitter(df):
  39. df['liked']=np.where(df['rating']>=4,1,0)
  40. df['movieId']=df['movieId'].astype('str')#转换数组的类型
  41. gp_user_like=df.groupby(['liked','userId'])
  42. return ([gp_user_like.get_group(gp)['movieId'].tolist() for gp in gp_user_like.groups])
  43. pd.options.mode.chained_assignment = None
  44. splitted_movies =rating_splitter(df_ratings_train)
  45. print(splitted_movies)
  46. # 放入word2vec里面进行训练
  47. import warnings
  48. warnings.filterwarnings(action='ignore',category=UserWarning,module='gensim')
  49. import gensim
  50. assert gensim.models.word2vec.FAST_VERSION>-1
  51. import random
  52. #将训练数据打乱
  53. for movie_list in splitted_movies:
  54. random.shuffle(movie_list)
  55. # 喂入模型,进行训练
  56. from gensim.models import Word2Vec
  57. import datetime
  58. start = datetime.datetime.now()
  59. #这个model得换一个名称,否则报错,所以将其注释
  60. # model = Word2Vec(sentences = splitted_movies, # We will supply the pre-processed list of moive lists to this parameter
  61. # iter = 5, # epoch
  62. # min_count = 10, # a movie has to appear more than 10 times to be keeped
  63. # # size = 200, # size of the hidden layer
  64. # workers = 4, # specify the number of threads to be used for training
  65. # sg = 1, # Defines the training algorithm. We will use skip-gram so 1 is chosen.
  66. # # hs = 0, # Set to 0, as we are applying negative sampling.
  67. # # negative = 5, # If > 0, negative sampling will be used. We will use a value of 5.
  68. # window = 20)
  69. #
  70. # print("Time passed: " + str(datetime.datetime.now()-start))
  71. # Word2Vec.save('item2vec_2021.h5')
  72. model_w2v_sg = Word2Vec(sentences = splitted_movies,
  73. iter = 10, # epoch
  74. min_count = 5, # a movie has to appear more than 5 times to be keeped
  75. size = 300, # size of the hidden layer
  76. workers = 4, # specify the number of threads to be used for training
  77. sg = 1,
  78. hs = 0,
  79. negative = 5,
  80. window = 20)
  81. print("Time passed: " + str(datetime.datetime.now()-start))
  82. model_w2v_sg.save('item2vec_word2vecSg_2021')
  83. # del model_w2v_sg
  84. #加载模型
  85. import warnings
  86. warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
  87. from gensim.models import Word2Vec
  88. model = Word2Vec.load('item2vec_word2vecSg_2021')
  89. # word_vectors = model.wv
  90. for key in model.wv.vocab:
  91. print(key) # 词
  92. print(model.wv.vocab[key])

model_2.py

  1. from index_2 import Title_id,model_w2v_sg,df_movies,df_ratings_train, df_ratings_test
  2. import requests
  3. import re
  4. from bs4 import BeautifulSoup
  5. from gensim.models import Word2Vec
  6. import pandas as pd
  7. import numpy as np
  8. model = Word2Vec.load('E:\推荐系统\Embedding\item2vec\code\item2vec_word2vecSg_2021')
  9. df_movies=pd.read_csv('../data2/movies.csv')
  10. df_ratings=pd.read_csv('../data2/ratings.csv')
  11. def refine_search(search_term):
  12. """
  13. Refine the movie name to be recognized by the recommender
  14. Args:
  15. search_term (string): Search Term
  16. Returns:
  17. refined_term (string): a name that can be search in the dataset
  18. """
  19. target_url = "http://www.imdb.com/find?ref_=nv_sr_fn&q=" + "+".join(search_term.split()) + "&s=tt"
  20. html = requests.get(target_url).content
  21. parsed_html = BeautifulSoup(html, 'html.parser')
  22. for tag in parsed_html.find_all('td', class_="result_text"):
  23. search_result = re.findall('fn_tt_tt_1">(.*)</a>(.*)</td>', str(tag))
  24. if search_result:
  25. if search_result[0][0].split()[0] == "The":
  26. str_frac = " ".join(search_result[0][0].split()[1:]) + ", " + search_result[0][0].split()[0]
  27. refined_name = str_frac + " " + search_result[0][1].strip()
  28. else:
  29. refined_name = search_result[0][0] + " " + search_result[0][1].strip()
  30. return refined_name
  31. def produce_list_of_movieId(list_of_movieName, useRefineSearch=False):
  32. """
  33. Turn a list of movie name into a list of movie ids. The movie names has to be exactly the same as they are in the dataset.
  34. Ambiguous movie names can be supplied if useRefineSearch is set to True
  35. Args:
  36. list_of_movieName (List): A list of movie names.
  37. useRefineSearch (boolean): Ambiguous movie names can be supplied if useRefineSearch is set to True
  38. Returns:
  39. list_of_movie_id (List of strings): A list of movie ids.
  40. """
  41. try:
  42. list_of_movie_id = []
  43. for movieName in list_of_movieName:
  44. if useRefineSearch:
  45. movieName = refine_search(movieName)
  46. print("Refined Name: " + movieName)
  47. if movieName in Title_id.keys():
  48. list_of_movie_id.append(str(Title_id[movieName]))
  49. except:
  50. produce_list_of_movieId(list_of_movieName, useRefineSearch=False)
  51. return list_of_movie_id
  52. def recommender(positive_list=None, negative_list=None, useRefineSearch=False, topn=20):
  53. recommend_movie_ls = []
  54. if positive_list:
  55. positive_list = produce_list_of_movieId(positive_list, useRefineSearch)
  56. if negative_list:
  57. negative_list = produce_list_of_movieId(negative_list, useRefineSearch)
  58. for movieId, prob in model_w2v_sg.wv.most_similar_cosmul(positive=positive_list, negative=negative_list, topn=topn):
  59. recommend_movie_ls.append(movieId)
  60. return recommend_movie_ls
  61. ls = recommender(positive_list=["Sabrina (1995)"], useRefineSearch=False, topn=5)
  62. # print('Recommendation Result based on "Up (2009)":')
  63. print(df_movies[df_movies['movieId'].isin(ls)])
  64. #评估模型
  65. def user_liked_movies_builder(model, df, for_prediction=False):
  66. df['liked'] = np.where(df['rating'] >= 4, 1, 0)
  67. df['movieId'] = df['movieId'].astype('str')
  68. df_liked = df[df['liked'] == 1]
  69. if for_prediction:
  70. df_liked = df[df['movieId'].isin(model.wv.vocab.keys())]
  71. user_liked_movies = df_liked.groupby('userId').agg({'movieId': lambda x: x.tolist()})['movieId'].to_dict()
  72. return user_liked_movies
  73. def scores_at_m(model, user_liked_movies_test, user_liked_movies_training, topn=10):
  74. sum_liked = 0
  75. sum_correct = 0
  76. sum_total = 0
  77. common_users = set(user_liked_movies_test.keys()).intersection(set(user_liked_movies_training.keys()))
  78. for userid in common_users:
  79. current_test_set = set(user_liked_movies_test[userid])
  80. pred = [pred_result[0] for pred_result in
  81. model.wv.most_similar_cosmul(positive=user_liked_movies_training[userid], topn=topn)]
  82. sum_correct += len(set(pred).intersection(current_test_set))
  83. sum_liked += len(current_test_set)
  84. precision_at_m = sum_correct / (topn * len(common_users))
  85. recall_at_m = sum_correct / sum_liked
  86. f1 = 2 / ((1 / precision_at_m) + (1 / recall_at_m))
  87. return [precision_at_m, recall_at_m, f1]
  88. pd.options.mode.chained_assignment = None
  89. user_liked_movies_train = user_liked_movies_builder(model, df_ratings_train, for_prediction=True)
  90. user_liked_movies_test = user_liked_movies_builder(model, df_ratings_test)
  91. model = Word2Vec.load('item2vec_word2vecSg_2021')
  92. model_score_sg1 = scores_at_m(model, user_liked_movies_test, user_liked_movies_train)
  93. del model
  94. print("Respectively, the [precision, recall, F-1 score] at 10 for our model are:")
  95. print(model_score_sg1)

结果:

 

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/一键难忘520/article/detail/752540
推荐阅读
相关标签
  

闽ICP备14008679号