赞
踩
数据集:在参考链接里面,因为预训练时间太长,所以数据集换成ml-latest-small,取出里面的ratings.csv和movies.csv。
链接:
https://github.com/rexrex9/kb4recMovielensDataProcess
注:每个文件夹下的original下面是原始数据。
index_2.py
# Reference: https://blog.csdn.net/fuzi2012/article/details/91345164
import pandas as pd
import numpy as np

df_movies = pd.read_csv('../data2/movies.csv')
df_ratings = pd.read_csv('../data2/ratings.csv')

# Lookup tables between movieId and title (both directions).
Id_title = pd.Series(df_movies.title.values, index=df_movies.movieId.values).to_dict()
Title_id = pd.Series(df_movies.movieId.values, index=df_movies.title).to_dict()

# Peek at 5 random rows of each frame (printing left disabled).
for frame in (df_movies, df_ratings):
    sample_rows = np.random.choice(len(frame), 5, replace=False)
    # print(frame.iloc[sample_rows, :])
-
import matplotlib.pyplot as plt

# Histogram of the rating distribution.
plt.figure(figsize=(8, 6))
ax = plt.subplot(111)
ax.set_title("Distribution of Movie Ratings", fontsize=16)
# Hide the top/right frame lines for a cleaner look.
for side in ("top", "right"):
    ax.spines[side].set_visible(False)

plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.xlabel("Movie Rating", fontsize=14)
plt.ylabel("Count", fontsize=14)

plt.hist(df_ratings['rating'], color="#3F5D7D")
# plt.show()
-
# Split the ratings into train/test, stratified per user so every
# user shows up in both splits.
from sklearn.model_selection import train_test_split

df_ratings_train, df_ratings_test = train_test_split(
    df_ratings,
    stratify=df_ratings['userId'],
    random_state=15688,
    test_size=0.30,
)

# print("Number of training data: " + str(len(df_ratings_train)))
# print("Number of test data: " + str(len(df_ratings_test)))
# A rating >= 4 counts as "liked" (1), anything lower as 0.
def rating_splitter(df):
    """Label each rating as liked/not-liked and return, for every
    (liked, userId) group, that user's movie ids as a list of strings."""
    df['liked'] = np.where(df['rating'] >= 4, 1, 0)
    df['movieId'] = df['movieId'].astype('str')  # word2vec expects string tokens
    grouped = df.groupby(['liked', 'userId'])
    sentences = []
    for group_key in grouped.groups:
        sentences.append(grouped.get_group(group_key)['movieId'].tolist())
    return sentences
# Silence SettingWithCopyWarning: rating_splitter mutates its input frame.
pd.options.mode.chained_assignment = None
splitted_movies = rating_splitter(df_ratings_train)
print(splitted_movies)
-
# Prepare the movie lists for word2vec training.
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
import gensim
# Require the C/Cython fast path, otherwise training is far too slow.
assert gensim.models.word2vec.FAST_VERSION > -1

import random
# Shuffle inside each list so token order carries no accidental bias.
for movie_list in splitted_movies:
    random.shuffle(movie_list)
-
# Feed the lists to the model and train a skip-gram item2vec.
from gensim.models import Word2Vec
import datetime

start = datetime.datetime.now()
# NOTE(review): gensim 3.x parameter names; gensim 4 renamed
# `iter` -> `epochs` and `size` -> `vector_size`.
model_w2v_sg = Word2Vec(
    sentences=splitted_movies,
    iter=10,        # training epochs
    min_count=5,    # a movie must appear at least 5 times to be kept
    size=300,       # dimensionality of the embedding / hidden layer
    workers=4,      # number of training threads
    sg=1,           # 1 = skip-gram training algorithm
    hs=0,           # no hierarchical softmax ...
    negative=5,     # ... negative sampling with 5 noise words instead
    window=20)

print("Time passed: " + str(datetime.datetime.now() - start))
model_w2v_sg.save('item2vec_word2vecSg_2021')
# del model_w2v_sg
-
# Reload the trained model from disk and dump its vocabulary.
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')

from gensim.models import Word2Vec

model = Word2Vec.load('item2vec_word2vecSg_2021')
# word_vectors = model.wv
# NOTE(review): `wv.vocab` is the gensim 3.x API (removed in gensim 4,
# which uses `wv.key_to_index`) — confirm the installed gensim version.
for key in model.wv.vocab:
    print(key)                    # the movieId token
    print(model.wv.vocab[key])    # its Vocab record (count, index, ...)
-

model_2.py
from index_2 import Title_id, model_w2v_sg, df_movies, df_ratings_train, df_ratings_test
import requests
import re
from bs4 import BeautifulSoup
from gensim.models import Word2Vec
import pandas as pd
import numpy as np

# Raw string for the absolute Windows path: unescaped backslashes in a
# normal string literal are fragile (and a SyntaxWarning on Python 3.12+).
model = Word2Vec.load(r'E:\推荐系统\Embedding\item2vec\code\item2vec_word2vecSg_2021')
df_movies = pd.read_csv('../data2/movies.csv')
df_ratings = pd.read_csv('../data2/ratings.csv')
-
def refine_search(search_term):
    """
    Refine a free-form movie name into the form used by the dataset.

    Queries IMDB's title search and, when the title starts with "The",
    rewrites it into the dataset's "Name, The (year)" convention.

    Args:
        search_term (string): Search term.
    Returns:
        refined_name (string): A name that can be searched in the dataset,
        or None when IMDB returns no usable result.
    """
    target_url = "http://www.imdb.com/find?ref_=nv_sr_fn&q=" + "+".join(search_term.split()) + "&s=tt"
    # Timeout so a stalled IMDB request cannot hang the recommender forever.
    html = requests.get(target_url, timeout=10).content
    parsed_html = BeautifulSoup(html, 'html.parser')
    for tag in parsed_html.find_all('td', class_="result_text"):
        search_result = re.findall('fn_tt_tt_1">(.*)</a>(.*)</td>', str(tag))
        if search_result:
            if search_result[0][0].split()[0] == "The":
                # "The Matrix (1999)" -> "Matrix, The (1999)"
                str_frac = " ".join(search_result[0][0].split()[1:]) + ", " + search_result[0][0].split()[0]
                refined_name = str_frac + " " + search_result[0][1].strip()
            else:
                refined_name = search_result[0][0] + " " + search_result[0][1].strip()
            return refined_name
    # No match found — make the implicit None explicit for callers.
    return None
-
-
def produce_list_of_movieId(list_of_movieName, useRefineSearch=False):
    """
    Turn a list of movie names into a list of movie ids.

    The movie names have to match the dataset exactly unless
    useRefineSearch is True, in which case each name is first normalised
    through an IMDB lookup (refine_search).

    Args:
        list_of_movieName (List): A list of movie names.
        useRefineSearch (boolean): Allow ambiguous movie names.
    Returns:
        list_of_movie_id (List of strings): A list of movie ids.
    """
    try:
        list_of_movie_id = []
        for movieName in list_of_movieName:
            if useRefineSearch:
                movieName = refine_search(movieName)
                print("Refined Name: " + movieName)
            if movieName in Title_id.keys():
                list_of_movie_id.append(str(Title_id[movieName]))
    except Exception:
        # The IMDB lookup can fail (network / parse error). BUGFIX: the
        # original called the exact-match fallback but discarded its result
        # and returned the partial list; it also recursed unconditionally,
        # risking infinite recursion when the exact path itself raised.
        if useRefineSearch:
            return produce_list_of_movieId(list_of_movieName, useRefineSearch=False)
        raise
    return list_of_movie_id
-
-
def recommender(positive_list=None, negative_list=None, useRefineSearch=False, topn=20):
    """Return the `topn` movie ids most similar to the positive examples
    (and least similar to the negative ones) under the item2vec model."""
    if positive_list:
        positive_list = produce_list_of_movieId(positive_list, useRefineSearch)
    if negative_list:
        negative_list = produce_list_of_movieId(negative_list, useRefineSearch)
    similar = model_w2v_sg.wv.most_similar_cosmul(
        positive=positive_list, negative=negative_list, topn=topn)
    return [movie_id for movie_id, _score in similar]
-
# Demo: top-5 recommendations seeded with a single liked movie.
ls = recommender(positive_list=["Sabrina (1995)"], useRefineSearch=False, topn=5)
# print('Recommendation Result based on "Up (2009)":')
print(df_movies[df_movies['movieId'].isin(ls)])
# Model evaluation helpers.
def user_liked_movies_builder(model, df, for_prediction=False):
    """
    Build {userId: [movieId, ...]} of the movies each user liked (rating >= 4).

    Args:
        model: trained word2vec model; only its vocabulary is consulted.
        df (DataFrame): ratings with userId / movieId / rating columns
            (mutated in place: adds 'liked', stringifies 'movieId').
        for_prediction (bool): when True, additionally drop liked movies
            missing from the model vocabulary (they cannot be fed to it).
    Returns:
        dict: userId -> list of movieId strings.
    """
    df['liked'] = np.where(df['rating'] >= 4, 1, 0)
    df['movieId'] = df['movieId'].astype('str')
    df_liked = df[df['liked'] == 1]
    if for_prediction:
        # BUGFIX: filter the liked subset, not the whole frame — the original
        # re-filtered `df`, silently discarding the liked-only condition.
        df_liked = df_liked[df_liked['movieId'].isin(model.wv.vocab.keys())]

    user_liked_movies = df_liked.groupby('userId').agg({'movieId': lambda x: x.tolist()})['movieId'].to_dict()

    return user_liked_movies
-
-
def scores_at_m(model, user_liked_movies_test, user_liked_movies_training, topn=10):
    """
    Compute precision/recall/F1 at `topn` over users present in both splits.

    For each common user, recommend `topn` movies from their liked training
    movies and count hits against their liked test movies.

    Args:
        model: word2vec model exposing wv.most_similar_cosmul.
        user_liked_movies_test (dict): userId -> liked movieIds (test split).
        user_liked_movies_training (dict): userId -> liked movieIds (train split).
        topn (int): recommendation list length.
    Returns:
        list: [precision_at_m, recall_at_m, f1].
    """
    sum_liked = 0
    sum_correct = 0
    common_users = set(user_liked_movies_test) & set(user_liked_movies_training)

    for userid in common_users:
        current_test_set = set(user_liked_movies_test[userid])
        pred = [item for item, _score in
                model.wv.most_similar_cosmul(positive=user_liked_movies_training[userid], topn=topn)]
        sum_correct += len(set(pred).intersection(current_test_set))
        sum_liked += len(current_test_set)
    # Aggregate once after the loop (the per-iteration recomputation in the
    # original was wasted work); dropped the unused sum_total counter.
    precision_at_m = sum_correct / (topn * len(common_users))
    recall_at_m = sum_correct / sum_liked
    f1 = 2 / ((1 / precision_at_m) + (1 / recall_at_m))
    return [precision_at_m, recall_at_m, f1]
-
# Silence SettingWithCopyWarning: the builder mutates its input frames.
pd.options.mode.chained_assignment = None
# Ground-truth like-lists: training restricted to the model vocabulary.
user_liked_movies_train = user_liked_movies_builder(model, df_ratings_train, for_prediction=True)
user_liked_movies_test = user_liked_movies_builder(model, df_ratings_test)

# Reload the skip-gram model and score it at top-10.
model = Word2Vec.load('item2vec_word2vecSg_2021')
model_score_sg1 = scores_at_m(model, user_liked_movies_test, user_liked_movies_train)
del model

print("Respectively, the [precision, recall, F-1 score] at 10 for our model are:")
print(model_score_sg1)

Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。