当前位置:   article > 正文

Item2vec_movielens item2vec

movielens item2vec

数据结构:

 

数据集:在参考链接里面,因为预训练时间太长,所以数据集换成ml-latest-small,取出里面的ratings.csv和movies.csv。 

链接:

https://github.com/rexrex9/kb4recMovielensDataProcess

注:每个文件夹下的original文件夹下面是原始数据。

代码: 

index_2.py

  1. # 参考链接: https://blog.csdn.net/fuzi2012/article/details/91345164
  2. import pandas as pd
  3. import numpy as np
  4. df_movies=pd.read_csv('../data2/movies.csv')
  5. df_ratings=pd.read_csv('../data2/ratings.csv')
  6. Id_title=pd.Series(df_movies.title.values,index=df_movies.movieId.values).to_dict()
  7. Title_id=pd.Series(df_movies.movieId.values,index=df_movies.title).to_dict()
  8. # print(type(Title_id))
  9. # print(Title_id)
  10. # index = ['Bob', 'Steve', 'Jeff', 'Ryan', 'Jeff', 'Ryan']
  11. # obj = pd.Series([4, 7, -5, 3, 7, np.nan],index = index)
  12. # print(obj)
  13. for df in list((df_movies,df_ratings)):
  14. rand_idx=np.random.choice(len(df),5,replace=False)
  15. # print(df.iloc[rand_idx,:])
  16. import matplotlib.pyplot as plt
  17. # import plotly.plotly as py
  18. plt.figure(figsize=(8, 6))
  19. ax = plt.subplot(111)
  20. ax.set_title("Distribution of Movie Ratings", fontsize=16)
  21. ax.spines["top"].set_visible(False)
  22. ax.spines["right"].set_visible(False)
  23. plt.xticks(fontsize=12)
  24. plt.yticks(fontsize=12)
  25. plt.xlabel("Movie Rating", fontsize=14)
  26. plt.ylabel("Count", fontsize=14)
  27. plt.hist(df_ratings['rating'], color="#3F5D7D")
  28. # plt.show()
  29. #划分数据集
  30. from sklearn.model_selection import train_test_split
  31. df_ratings_train, df_ratings_test= train_test_split(df_ratings,
  32. stratify=df_ratings['userId'],
  33. random_state = 15688,
  34. test_size=0.30)
  35. # print("Number of training data: "+str(len(df_ratings_train)))
  36. # print("Number of test data: "+str(len(df_ratings_test)))
  37. #评分>4,设为1,否则为0
  38. def rating_splitter(df):
  39. df['liked']=np.where(df['rating']>=4,1,0)
  40. df['movieId']=df['movieId'].astype('str')#转换数组的类型
  41. gp_user_like=df.groupby(['liked','userId'])
  42. return ([gp_user_like.get_group(gp)['movieId'].tolist() for gp in gp_user_like.groups])
  43. pd.options.mode.chained_assignment = None
  44. splitted_movies =rating_splitter(df_ratings_train)
  45. print(splitted_movies)
  46. # 放入word2vec里面进行训练
  47. import warnings
  48. warnings.filterwarnings(action='ignore',category=UserWarning,module='gensim')
  49. import gensim
  50. assert gensim.models.word2vec.FAST_VERSION>-1
  51. import random
  52. #将训练数据打乱
  53. for movie_list in splitted_movies:
  54. random.shuffle(movie_list)
  55. # 喂入模型,进行训练
  56. from gensim.models import Word2Vec
  57. import datetime
  58. start = datetime.datetime.now()
  59. #这个model得换一个名称,否则报错,所以将其注释
  60. # model = Word2Vec(sentences = splitted_movies, # We will supply the pre-processed list of moive lists to this parameter
  61. # iter = 5, # epoch
  62. # min_count = 10, # a movie has to appear more than 10 times to be keeped
  63. # # size = 200, # size of the hidden layer
  64. # workers = 4, # specify the number of threads to be used for training
  65. # sg = 1, # Defines the training algorithm. We will use skip-gram so 1 is chosen.
  66. # # hs = 0, # Set to 0, as we are applying negative sampling.
  67. # # negative = 5, # If > 0, negative sampling will be used. We will use a value of 5.
  68. # window = 20)
  69. #
  70. # print("Time passed: " + str(datetime.datetime.now()-start))
  71. # Word2Vec.save('item2vec_2021.h5')
  72. model_w2v_sg = Word2Vec(sentences = splitted_movies,
  73. iter = 10, # epoch
  74. min_count = 5, # a movie has to appear more than 5 times to be keeped
  75. size = 300, # size of the hidden layer
  76. workers = 4, # specify the number of threads to be used for training
  77. sg = 1,
  78. hs = 0,
  79. negative = 5,
  80. window = 20)
  81. print("Time passed: " + str(datetime.datetime.now()-start))
  82. model_w2v_sg.save('item2vec_word2vecSg_2021')
  83. # del model_w2v_sg
  84. #加载模型
  85. import warnings
  86. warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
  87. from gensim.models import Word2Vec
  88. model = Word2Vec.load('item2vec_word2vecSg_2021')
  89. # word_vectors = model.wv
  90. for key in model.wv.vocab:
  91. print(key) # 词
  92. print(model.wv.vocab[key])

model_2.py

  1. from index_2 import Title_id,model_w2v_sg,df_movies,df_ratings_train, df_ratings_test
  2. import requests
  3. import re
  4. from bs4 import BeautifulSoup
  5. from gensim.models import Word2Vec
  6. import pandas as pd
  7. import numpy as np
  8. model = Word2Vec.load('E:\推荐系统\Embedding\item2vec\code\item2vec_word2vecSg_2021')
  9. df_movies=pd.read_csv('../data2/movies.csv')
  10. df_ratings=pd.read_csv('../data2/ratings.csv')
  11. def refine_search(search_term):
  12. """
  13. Refine the movie name to be recognized by the recommender
  14. Args:
  15. search_term (string): Search Term
  16. Returns:
  17. refined_term (string): a name that can be search in the dataset
  18. """
  19. target_url = "http://www.imdb.com/find?ref_=nv_sr_fn&q=" + "+".join(search_term.split()) + "&s=tt"
  20. html = requests.get(target_url).content
  21. parsed_html = BeautifulSoup(html, 'html.parser')
  22. for tag in parsed_html.find_all('td', class_="result_text"):
  23. search_result = re.findall('fn_tt_tt_1">(.*)</a>(.*)</td>', str(tag))
  24. if search_result:
  25. if search_result[0][0].split()[0] == "The":
  26. str_frac = " ".join(search_result[0][0].split()[1:]) + ", " + search_result[0][0].split()[0]
  27. refined_name = str_frac + " " + search_result[0][1].strip()
  28. else:
  29. refined_name = search_result[0][0] + " " + search_result[0][1].strip()
  30. return refined_name
  31. def produce_list_of_movieId(list_of_movieName, useRefineSearch=False):
  32. """
  33. Turn a list of movie name into a list of movie ids. The movie names has to be exactly the same as they are in the dataset.
  34. Ambiguous movie names can be supplied if useRefineSearch is set to True
  35. Args:
  36. list_of_movieName (List): A list of movie names.
  37. useRefineSearch (boolean): Ambiguous movie names can be supplied if useRefineSearch is set to True
  38. Returns:
  39. list_of_movie_id (List of strings): A list of movie ids.
  40. """
  41. try:
  42. list_of_movie_id = []
  43. for movieName in list_of_movieName:
  44. if useRefineSearch:
  45. movieName = refine_search(movieName)
  46. print("Refined Name: " + movieName)
  47. if movieName in Title_id.keys():
  48. list_of_movie_id.append(str(Title_id[movieName]))
  49. except:
  50. produce_list_of_movieId(list_of_movieName, useRefineSearch=False)
  51. return list_of_movie_id
  52. def recommender(positive_list=None, negative_list=None, useRefineSearch=False, topn=20):
  53. recommend_movie_ls = []
  54. if positive_list:
  55. positive_list = produce_list_of_movieId(positive_list, useRefineSearch)
  56. if negative_list:
  57. negative_list = produce_list_of_movieId(negative_list, useRefineSearch)
  58. for movieId, prob in model_w2v_sg.wv.most_similar_cosmul(positive=positive_list, negative=negative_list, topn=topn):
  59. recommend_movie_ls.append(movieId)
  60. return recommend_movie_ls
  61. ls = recommender(positive_list=["Sabrina (1995)"], useRefineSearch=False, topn=5)
  62. # print('Recommendation Result based on "Up (2009)":')
  63. print(df_movies[df_movies['movieId'].isin(ls)])
  64. #评估模型
  65. def user_liked_movies_builder(model, df, for_prediction=False):
  66. df['liked'] = np.where(df['rating'] >= 4, 1, 0)
  67. df['movieId'] = df['movieId'].astype('str')
  68. df_liked = df[df['liked'] == 1]
  69. if for_prediction:
  70. df_liked = df[df['movieId'].isin(model.wv.vocab.keys())]
  71. user_liked_movies = df_liked.groupby('userId').agg({'movieId': lambda x: x.tolist()})['movieId'].to_dict()
  72. return user_liked_movies
  73. def scores_at_m(model, user_liked_movies_test, user_liked_movies_training, topn=10):
  74. sum_liked = 0
  75. sum_correct = 0
  76. sum_total = 0
  77. common_users = set(user_liked_movies_test.keys()).intersection(set(user_liked_movies_training.keys()))
  78. for userid in common_users:
  79. current_test_set = set(user_liked_movies_test[userid])
  80. pred = [pred_result[0] for pred_result in
  81. model.wv.most_similar_cosmul(positive=user_liked_movies_training[userid], topn=topn)]
  82. sum_correct += len(set(pred).intersection(current_test_set))
  83. sum_liked += len(current_test_set)
  84. precision_at_m = sum_correct / (topn * len(common_users))
  85. recall_at_m = sum_correct / sum_liked
  86. f1 = 2 / ((1 / precision_at_m) + (1 / recall_at_m))
  87. return [precision_at_m, recall_at_m, f1]
  88. pd.options.mode.chained_assignment = None
  89. user_liked_movies_train = user_liked_movies_builder(model, df_ratings_train, for_prediction=True)
  90. user_liked_movies_test = user_liked_movies_builder(model, df_ratings_test)
  91. model = Word2Vec.load('item2vec_word2vecSg_2021')
  92. model_score_sg1 = scores_at_m(model, user_liked_movies_test, user_liked_movies_train)
  93. del model
  94. print("Respectively, the [precision, recall, F-1 score] at 10 for our model are:")
  95. print(model_score_sg1)

结果:

 

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/一键难忘520/article/detail/752540
推荐阅读
相关标签
  

闽ICP备14008679号