# Accumulate the total play count of every user from the raw triplet file
# and save the per-user totals, most active users first.
output_dict = {}
with open(data_home + 'train_triplets.txt') as f:
    for line in f:
        # Tab-separated fields: field 0 is the user, field 2 the play count.
        fields = line.split('\t')
        user = fields[0]
        play_count = int(fields[2])
        # Add this row's plays to the user's running total. The original
        # never wrote back into output_dict, so the result was always empty.
        output_dict[user] = output_dict.get(user, 0) + play_count
output_list = [{'user': k, 'play_count': v} for k, v in output_dict.items()]
# Explicit columns keep the sort below valid even when the input file is empty.
play_count_df = pd.DataFrame(output_list, columns=['user', 'play_count'])
# Sort users by total plays, highest first (low-activity users are filtered
# out later). The sorted frame must stay bound to play_count_df, because
# later steps take play_count_df.head(...) to pick the top users.
play_count_df = play_count_df.sort_values(by='play_count', ascending=False)
play_count_df.to_csv(path_or_buf='user_playcount_df.csv', index=False)
# Accumulate the total play count of every song from the raw triplet file
# and save the per-song totals, most played songs first.
output_dict = {}
with open(data_home + 'train_triplets.txt') as f:
    for line in f:
        # Tab-separated fields: field 1 is the song, field 2 the play count.
        fields = line.split('\t')
        song = fields[1]
        play_count = int(fields[2])
        # Add this row's plays to the song's running total. The original
        # never wrote back into output_dict, so the result was always empty.
        output_dict[song] = output_dict.get(song, 0) + play_count
output_list = [{'song': k, 'play_count': v} for k, v in output_dict.items()]
# Explicit columns keep the sort below valid even when the input file is empty.
song_count_df = pd.DataFrame(output_list, columns=['song', 'play_count'])
# Sort songs by total plays, highest first (rarely played songs are filtered
# out later).
song_count_df = song_count_df.sort_values(by='play_count', ascending=False)
song_count_df.to_csv(path_or_buf='song_playcount_df.csv', index=False)
# Total number of plays across all songs.
total_play_count = song_count_df.play_count.sum()

# The 100,000 most active users. The frame is sorted here explicitly so the
# selection does not depend on play_count_df already being ordered; a stable
# sort leaves an already-sorted frame untouched.
play_count_subset = play_count_df.sort_values(
    by='play_count', ascending=False, kind='stable').head(n=100000)
# Share (in percent) of all plays that come from those top users.
print((float(play_count_subset.play_count.sum()) / total_play_count) * 100)

# The 30,000 most played songs (song_count_df is sorted by play_count,
# descending). This name was previously used below without being defined.
song_count_subset = song_count_df.head(n=30000)
# Share (in percent) of all plays covered by those top songs. This is a bare
# notebook-style expression: it is evaluated but not printed in a script.
(float(song_count_subset.play_count.sum()) / total_play_count) * 100

user_subset = list(play_count_subset.user)
song_subset = list(song_count_subset.song)

# Reload the full triplet file and keep only rows whose user AND song are in
# the selected subsets.
triplet_dataset = pd.read_csv(filepath_or_buffer=data_home + 'train_triplets.txt', sep='\t',
                              header=None, names=['user', 'song', 'play_count'])
triplet_dataset_sub = triplet_dataset[triplet_dataset.user.isin(user_subset)]
triplet_dataset_sub_song = triplet_dataset_sub[triplet_dataset_sub.song.isin(song_subset)]
triplet_dataset_sub_song.to_csv(path_or_buf=data_home + 'triplet_dataset_sub_song.csv', index=False)
.db 文件需要先稍作处理,转换成 CSV 文件。
# Export the metadata of the selected songs from the SQLite database to CSV.
conn = sqlite3.connect(data_home + 'track_metadata.db')
try:
    cur = conn.cursor()
    # Lists the tables in the database; the result is not fetched here.
    cur.execute("SELECT name FROM sqlite_master WHERE type='table'")
    track_metadata_df = pd.read_sql(con=conn, sql='select * from songs')
finally:
    # Always release the database handle, even if a query fails.
    conn.close()
# Keep only the metadata rows of songs that survived the subset selection.
track_metadata_df_sub = track_metadata_df[track_metadata_df.song_id.isin(song_subset)]
track_metadata_df_sub.to_csv(path_or_buf=data_home + 'track_metadata_df_sub.csv', index=False)  # write the CSV file
(30447, 14)
# Reload the filtered play triplets and the filtered song metadata from disk.
triplet_dataset_sub_song = pd.read_csv(
    filepath_or_buffer=data_home + 'triplet_dataset_sub_song.csv',
    encoding="ISO-8859-1",
)
track_metadata_df_sub = pd.read_csv(
    filepath_or_buffer=data_home + 'track_metadata_df_sub.csv',
    encoding="ISO-8859-1",
)
# Keep a single metadata row per song id.
track_metadata_df_sub = track_metadata_df_sub.drop_duplicates(subset=['song_id'])
# Attach the metadata to every play record (left join: all play rows are kept).
triplet_dataset_sub_song_merged = triplet_dataset_sub_song.merge(
    track_metadata_df_sub,
    how='left',
    left_on='song',
    right_on='song_id',
)
# From here on the play count column is called listen_count.
triplet_dataset_sub_song_merged = triplet_dataset_sub_song_merged.rename(
    columns={'play_count': 'listen_count'})
import matplotlib.pyplot as plt
import numpy as np

plt.rcdefaults()

# Total listen count per song title.
popular_songs = triplet_dataset_sub_song_merged[['title', 'listen_count']].groupby('title').sum().reset_index()
# The 20 titles with the highest totals.
popular_songs_top_20 = popular_songs.sort_values('listen_count', ascending=False).head(n=20)

# Plain lists are convenient for plotting.
objects = (list(popular_songs_top_20['title']))
# Bar positions along the x axis.
y_pos = np.arange(len(objects))
# Bar heights. The original read `popular_song_top_20` here, an undefined
# name, which raised NameError.
performance = list(popular_songs_top_20['listen_count'])

# Draw the bar chart.
plt.bar(y_pos, performance, align='center', alpha=0.5)
plt.xticks(y_pos, objects, rotation='vertical')
plt.ylabel('Item count')
plt.title('Most popular songs')
plt.show()
# Total listen count per release, then the 20 releases with the highest totals.
popular_release = (
    triplet_dataset_sub_song_merged[['release', 'listen_count']]
    .groupby('release')
    .sum()
    .reset_index()
)
popular_release_top_20 = popular_release.sort_values('listen_count', ascending=False).head(n=20)

objects = (list(popular_release_top_20['release']))
y_pos = np.arange(len(objects))
performance = list(popular_release_top_20['listen_count'])

plt.bar(y_pos, performance, align='center', alpha=0.5)
plt.xticks(y_pos, objects, rotation='vertical')
plt.ylabel('Item count')
plt.title('Most popular Release')
# Render and finish this figure; without it, later plotting calls in a
# script would keep drawing onto the same axes.
plt.show()
# Total listen count per artist, then the 20 artists with the highest totals.
popular_artist = (
    triplet_dataset_sub_song_merged[['artist_name', 'listen_count']]
    .groupby('artist_name')
    .sum()
    .reset_index()
)
popular_artist_top_20 = popular_artist.sort_values('listen_count', ascending=False).head(n=20)

objects = (list(popular_artist_top_20['artist_name']))
y_pos = np.arange(len(objects))
performance = list(popular_artist_top_20['listen_count'])

plt.bar(y_pos, performance, align='center', alpha=0.5)
plt.xticks(y_pos, objects, rotation='vertical')
plt.ylabel('Item count')
plt.title('Most popular Artist')
# Render and finish this figure; without it, later plotting calls in a
# script would keep drawing onto the same axes.
plt.show()
# For every user, count the play rows that carry a title, and order users
# from the largest count to the smallest.
user_song_count_distribution = (
    triplet_dataset_sub_song_merged[['user', 'title']]
    .groupby('user')
    .count()
    .reset_index()
    .sort_values(by='title', ascending=False)
)
count 99996.000000
mean 107.749890
std 79.742561
min 1.000000
25% 53.000000
50% 89.000000
75% 141.000000
max 1189.000000
Name: title, dtype: float64
# Histogram (50 bins) of the per-user title counts.
x = user_song_count_distribution.title
n, bins, patches = plt.hist(x, 50, facecolor='green', alpha=0.75)
plt.xlabel('Play Counts')
plt.ylabel('Num of Users')
plt.title(r'$\mathrm{Histogram\ of\ User\ Play\ Count\ Distribution}\ $')
# Render the figure; outside a notebook nothing is displayed without this.
plt.show()
# Split the merged play data into 60% training rows and 40% test rows.
# The fixed random_state makes the split reproducible.
triplet_dataset_sub_song_merged_set = triplet_dataset_sub_song_merged
train_data, test_data = train_test_split(
    triplet_dataset_sub_song_merged_set,
    random_state=0,
    test_size=0.40,
)
def create_popularity_recommendation(train_data, user_id, item_id, top_n=20):
    """Return the most popular items, scored by how many rows mention them.

    Args:
        train_data: DataFrame holding one row per (user, item) interaction.
        user_id: name of the user column; its non-null values are counted.
        item_id: name of the item column to group by.
        top_n: number of items to return (defaults to 20, the value that
            was previously hard-coded).

    Returns:
        DataFrame with columns ``item_id``, ``score`` and ``Rank`` holding
        the ``top_n`` highest-scoring items, best first.
    """
    # Score of an item = number of rows with a user value for that item.
    train_data_grouped = train_data.groupby([item_id]).agg({user_id: 'count'}).reset_index()
    train_data_grouped = train_data_grouped.rename(columns={user_id: 'score'})
    # Highest score first; ties are broken by item in ascending order.
    train_data_sort = train_data_grouped.sort_values(['score', item_id], ascending=[False, True])
    # method='first' gives tied scores distinct ranks in their sorted order.
    train_data_sort['Rank'] = train_data_sort['score'].rank(ascending=False, method='first')
    popularity_recommendations = train_data_sort.head(top_n)
    return popularity_recommendations
# Popularity baseline over the whole merged play table, grouped by title.
recommendations = create_popularity_recommendation(
    triplet_dataset_sub_song_merged, 'user', 'title')

# Narrow the catalogue to the first 5,000 rows of song_count_df and rebuild
# the user/song id lists from the current subsets.
song_count_subset = song_count_df.head(n=5000)
user_subset = list(play_count_subset['user'])
song_subset = list(song_count_subset['song'])

# Keep only the play rows whose song is in the narrowed catalogue.
triplet_dataset_sub_song_merged_sub = triplet_dataset_sub_song_merged.loc[
    triplet_dataset_sub_song_merged['song'].isin(song_subset)
]
# Thanks to Siraj Raval for this module
# Refer to https://github.com/llSourcell/recommender_live for more details
import numpy as np
import pandas


# Class for Popularity based Recommender System model
class popularity_recommender_py():
    """Recommend the same globally most popular items to every user."""

    def __init__(self):
        self.train_data = None
        self.user_id = None
        self.item_id = None
        self.popularity_recommendations = None

    def create(self, train_data, user_id, item_id):
        """Build the model: score every item and cache the top 10.

        Args:
            train_data: DataFrame with one row per (user, item) interaction.
            user_id: name of the user column.
            item_id: name of the item column.
        """
        self.train_data = train_data
        self.user_id = user_id
        self.item_id = item_id

        # Recommendation score of an item = number of user rows for it.
        train_data_grouped = train_data.groupby([self.item_id]).agg({self.user_id: 'count'}).reset_index()
        train_data_grouped = train_data_grouped.rename(columns={user_id: 'score'})

        # Highest score first; ties are broken by item in ascending order.
        train_data_sort = train_data_grouped.sort_values(['score', self.item_id], ascending=[False, True])

        # Rank derived from the score (ties get distinct ranks in order).
        train_data_sort['Rank'] = train_data_sort['score'].rank(ascending=False, method='first')

        # Keep the top 10 recommendations.
        self.popularity_recommendations = train_data_sort.head(10)

    def recommend(self, user_id):
        """Return the cached top items with a leading ``user_id`` column.

        The cached frame is copied first so that tagging the result with
        ``user_id`` does not modify the model's stored recommendations.
        """
        user_recommendations = self.popularity_recommendations.copy()

        # Column naming the user these recommendations are generated for.
        user_recommendations['user_id'] = user_id

        # Move the freshly added (last) column to the front.
        cols = user_recommendations.columns.tolist()
        cols = cols[-1:] + cols[:-1]
        return user_recommendations[cols]


# Class for Item similarity based Recommender System model
class item_similarity_recommender_py():
    """Recommend items whose listeners overlap with the user's items.

    Similarity between two items is the Jaccard index of their user sets.
    """

    def __init__(self):
        self.train_data = None
        self.user_id = None
        self.item_id = None
        self.cooccurence_matrix = None
        self.songs_dict = None
        self.rev_songs_dict = None
        self.item_similarity_recommendations = None

    def get_user_items(self, user):
        """Return the list of unique items (songs) of the given user."""
        user_data = self.train_data[self.train_data[self.user_id] == user]
        return list(user_data[self.item_id].unique())

    def get_item_users(self, item):
        """Return the set of unique users of the given item (song)."""
        item_data = self.train_data[self.train_data[self.item_id] == item]
        return set(item_data[self.user_id].unique())

    def get_all_items_train_data(self):
        """Return the list of unique items (songs) in the training data."""
        return list(self.train_data[self.item_id].unique())

    def _listeners_by_item(self):
        """Map every item in the training data to the set of its users.

        Built with a single group-by so the data is not re-filtered once
        per item.
        """
        grouped = self.train_data.groupby(self.item_id, sort=False)[self.user_id].unique()
        return {item: set(users) for item, users in grouped.items()}

    def construct_cooccurence_matrix(self, user_songs, all_songs):
        """Return the len(user_songs) x len(all_songs) similarity matrix.

        Procedure:
          1. take the songs of the selected user (``user_songs``);
          2. look up which users listened to each of those songs;
          3. for every song in ``all_songs``, compute its Jaccard index with
             each of the user's songs from the intersection and union of
             the two listener sets.

        Entry ``[j, i]`` is the similarity of ``user_songs[j]`` and
        ``all_songs[i]``; it is 0 when the songs share no listener.
        """
        listeners = self._listeners_by_item()
        no_listeners = set()

        # Listener sets of the user's songs (empty for unknown songs).
        user_songs_users = [listeners.get(song, no_listeners) for song in user_songs]

        # np.matrix is kept so callers receive the same type as before.
        cooccurence_matrix = np.matrix(np.zeros(shape=(len(user_songs), len(all_songs))), float)

        for i, song in enumerate(all_songs):
            users_i = listeners.get(song, no_listeners)
            for j, users_j in enumerate(user_songs_users):
                common = len(users_i & users_j)
                if common:
                    cooccurence_matrix[j, i] = float(common) / float(len(users_i | users_j))

        return cooccurence_matrix

    def generate_top_recommendations(self, user, cooccurence_matrix, all_songs, user_songs):
        """Turn a similarity matrix into at most 10 ranked recommendations.

        Returns:
            DataFrame with columns ``user_id``, ``song``, ``score``,
            ``rank``; or ``-1`` (after printing a message) when nothing can
            be recommended.
        """
        print("Non zero values in cooccurence_matrix :%d" % np.count_nonzero(cooccurence_matrix))

        # Mean similarity of every candidate song to the user's songs.
        if cooccurence_matrix.shape[0] == 0:
            # No user songs: nothing to average (avoids a 0/0 division).
            user_sim_scores = []
        else:
            user_sim_scores = cooccurence_matrix.sum(axis=0) / float(cooccurence_matrix.shape[0])
            user_sim_scores = np.array(user_sim_scores)[0].tolist()

        # (score, song index) pairs, best score first.
        sort_index = sorted(((e, i) for i, e in enumerate(list(user_sim_scores))), reverse=True)

        columns = ['user_id', 'song', 'score', 'rank']
        df = pandas.DataFrame(columns=columns)

        # Membership tests against a set instead of rescanning the list.
        already_listened = set(user_songs)

        # Fill the dataframe with the top 10 item based recommendations.
        rank = 1
        for score, song_index in sort_index:
            if rank > 10:
                # Ten rows collected; the remaining candidates are not needed.
                break
            if np.isnan(score):
                continue
            song = all_songs[song_index]
            if song in already_listened:
                continue
            df.loc[len(df)] = [user, song, score, rank]
            rank = rank + 1

        # Handle the case where there are no recommendations.
        if df.shape[0] == 0:
            print("The current user has no songs for training the item similarity based recommendation model.")
            return -1
        return df

    def create(self, train_data, user_id, item_id):
        """Store the training data and the user/item column names."""
        self.train_data = train_data
        self.user_id = user_id
        self.item_id = item_id

    def recommend(self, user):
        """Return item-similarity recommendations for ``user``.

        Returns whatever ``generate_top_recommendations`` returns: a
        DataFrame, or ``-1`` when no recommendation can be made.
        """
        # A. All unique songs of this user.
        user_songs = self.get_user_items(user)
        print("No. of unique songs for the user: %d" % len(user_songs))

        # B. All unique items (songs) in the training data.
        all_songs = self.get_all_items_train_data()
        print("no. of unique songs in the training set: %d" % len(all_songs))

        # C. Item cooccurence matrix of size len(user_songs) X len(songs).
        cooccurence_matrix = self.construct_cooccurence_matrix(user_songs, all_songs)

        # D. Use the cooccurence matrix to make recommendations.
        df_recommendations = self.generate_top_recommendations(user, cooccurence_matrix, all_songs, user_songs)
        return df_recommendations

    def get_similar_items(self, item_list):
        """Return items similar to the items in ``item_list``.

        Works like ``recommend`` but starts from an explicit item list; the
        ``user_id`` column of the result holds an empty string.
        """
        user_songs = item_list

        # B. All unique items (songs) in the training data.
        all_songs = self.get_all_items_train_data()
        print("no. of unique songs in the training set: %d" % len(all_songs))

        # C. Item cooccurence matrix of size len(user_songs) X len(songs).
        cooccurence_matrix = self.construct_cooccurence_matrix(user_songs, all_songs)

        # D. Use the cooccurence matrix to make recommendations.
        user = ""
        df_recommendations = self.generate_top_recommendations(user, cooccurence_matrix, all_songs, user_songs)
        return df_recommendations
奇异值分解(Singular Value Decomposition,SVD)是矩阵分解中的一个经典方法,接下来我们的推荐就可以用 SVD 来进行计算。奇异值分解的基本出发点跟我们之前讲的隐语义模型有些类似,都是将大矩阵转换成小矩阵的组合,基本形式如下图所示:
重新计算 U、S、V 的乘积得到 A2,再来比较 A2 和 A 的差异:差异是有的,但并不大,所以我们可以用 A2 来近似代替 A:
# Total listen count of every user. The summed column is renamed to
# total_listen_count: the division below reads that column, and without the
# rename the merge would also join on listen_count.
triplet_dataset_sub_song_merged_sum_df = (
    triplet_dataset_sub_song_merged[['user', 'listen_count']]
    .groupby('user')
    .sum()
    .reset_index()
    .rename(columns={'listen_count': 'total_listen_count'})
)
# Attach each user's total to all of that user's play rows.
triplet_dataset_sub_song_merged = pd.merge(
    triplet_dataset_sub_song_merged,
    triplet_dataset_sub_song_merged_sum_df,
    on='user',
)
# Share of the user's total listening that went to this particular song.
triplet_dataset_sub_song_merged['fractional_play_count'] = (
    triplet_dataset_sub_song_merged['listen_count']
    / triplet_dataset_sub_song_merged['total_listen_count']
)
# Peek at one user's rows. This is a bare notebook-style expression: it is
# evaluated but not printed in a script.
triplet_dataset_sub_song_merged[triplet_dataset_sub_song_merged.user =='d6589314c0a9bcbca4fee0c93b14bc402363afea'][['user','song','listen_count','fractional_play_count']].head()
from scipy.sparse import coo_matrix

# Build a sparse user x song matrix of fractional play counts.
small_set = triplet_dataset_sub_song_merged

# One row per distinct user / song; reset_index keeps the original row label
# in a column called 'index', renamed right below.
user_codes = small_set['user'].drop_duplicates().reset_index()
song_codes = small_set['song'].drop_duplicates().reset_index()
user_codes = user_codes.rename(columns={'index': 'user_index'})
song_codes = song_codes.rename(columns={'index': 'song_index'})

# Consecutive integer codes (0..n-1) used as matrix coordinates.
song_codes['so_index_value'] = song_codes.index.tolist()
user_codes['us_index_value'] = user_codes.index.tolist()

# Attach the codes to every play row (joined on the shared columns).
small_set = small_set.merge(song_codes, how='left')
small_set = small_set.merge(user_codes, how='left')

mat_candidate = small_set[['us_index_value', 'so_index_value', 'fractional_play_count']]
data_array = mat_candidate['fractional_play_count'].values
row_array = mat_candidate['us_index_value'].values
col_array = mat_candidate['so_index_value'].values

# Rows are user codes, columns are song codes.
data_sparse = coo_matrix((data_array, (row_array, col_array)), dtype=float)
<99996x30000 sparse matrix of type '<class 'numpy.float64'>'
with 10774558 stored elements in COOrdinate format>
# Show the code row assigned to one specific user (notebook-style
# expression: evaluated, but not printed when run as a script).
user_codes.loc[user_codes['user'] == '2a2f776cbac6df64d6cb505e7e834e01684673b6']
import math as mt
from scipy.sparse.linalg import * #used for matrix multiplication
from scipy.sparse.linalg import svds
from scipy.sparse import csc_matrix
def compute_svd(urm, K):
    """Compute a truncated SVD of ``urm`` with ``K`` singular values.

    Args:
        urm: sparse matrix to decompose (passed straight to ``svds``).
        K: number of singular values/vectors to compute.

    Returns:
        Tuple ``(U, S, Vt)`` of float32 ``csc_matrix`` objects, where ``S``
        is a K x K diagonal matrix.
    """
    U, s, Vt = svds(urm, K)

    # NOTE(review): the diagonal holds the SQUARE ROOTS of the singular
    # values, so U*S*Vt is not the plain rank-K reconstruction of urm.
    # Kept exactly as in the original; confirm this is intended.
    S = np.diag(np.sqrt(s)).astype(np.float32)

    U = csc_matrix(U, dtype=np.float32)
    S = csc_matrix(S, dtype=np.float32)
    Vt = csc_matrix(Vt, dtype=np.float32)
    return U, S, Vt


def compute_estimated_matrix(urm, U, S, Vt, uTest, K, test):
    """Return, for each user in ``uTest``, the best-scoring item indices.

    Args:
        urm: unused; kept for interface compatibility.
        U, S, Vt: factors as returned by ``compute_svd``.
        uTest: iterable of row indices (users) to score.
        K: unused; kept for interface compatibility.
        test: unused; kept for interface compatibility.

    Returns:
        Integer array of shape ``(U.shape[0], min(250, Vt.shape[1]))``.
        Row ``u`` holds item indices ordered by decreasing estimated score
        for every ``u`` in ``uTest``; all other rows stay zero.
    """
    rightTerm = S.dot(Vt)

    # Sizes come from the factors rather than from module-level globals.
    n_users = U.shape[0]
    n_items = Vt.shape[1]
    # Never ask for more recommendations than there are items.
    max_recommendation = min(250, n_items)

    # Item indices must be stored as integers: float16 cannot represent
    # every integer above 2048, which silently corrupted large indices.
    recomendRatings = np.zeros(shape=(n_users, max_recommendation), dtype=np.int64)

    for userTest in uTest:
        prod = U[userTest, :].dot(rightTerm)
        # Only this user's row is materialised instead of a dense
        # users x items matrix. Scores are rounded to float16 as before so
        # the resulting ordering is unchanged.
        scores = np.asarray(prod.todense(), dtype=np.float16).ravel()
        recomendRatings[userTest, :] = (-scores).argsort()[:max_recommendation]
    return recomendRatings
# Number of singular values/vectors to compute.
# NOTE(review): K was used below without ever being assigned in this script;
# 50 is an assumed default - confirm the intended value.
K = 50
urm = data_sparse
MAX_PID = urm.shape[1]
MAX_UID = urm.shape[0]

U, S, Vt = compute_svd(urm, K)

# Row indices (user codes) to generate recommendations for.
uTest = [4, 5, 6, 7, 8, 873, 23]
uTest_recommended_items = compute_estimated_matrix(urm, U, S, Vt, uTest, K, True)

for user in uTest:
    print("Recommendation for user with user id {}". format(user))
    # Number the ten best items 1..10; the original never incremented the
    # counter, so every line was labelled as number 1.
    for rank_value, i in enumerate(uTest_recommended_items[user, 0:10], start=1):
        # Resolve the song code back to its title and artist.
        song_details = small_set[small_set.so_index_value == i].drop_duplicates('so_index_value')[['title','artist_name']]
        print("The number {} recommended song is {} BY {}".format(rank_value, list(song_details['title'])[0],list(song_details['artist_name'])[0]))
Recommendation for user with user id 4 The number 1 recommended song is Fireflies BY Charttraxx Karaoke The number 2 recommended song is Hey_ Soul Sister BY Train The number 3 recommended song is OMG BY Usher featuring will.i.am The number 4 recommended song is Lucky (Album Version) BY Jason Mraz & Colbie Caillat The number 5 recommended song is Vanilla Twilight BY Owl City The number 6 recommended song is Crumpshit BY Philippe Rochard The number 7 recommended song is Billionaire [feat. Bruno Mars] (Explicit Album Version) BY Travie McCoy The number 8 recommended song is Love Story BY Taylor Swift The number 9 recommended song is TULENLIEKKI BY M.A. Numminen The number 10 recommended song is Use Somebody BY Kings Of Leon Recommendation for user with user id 5 The number 1 recommended song is Sehr kosmisch BY Harmonia The number 2 recommended song is Ain't Misbehavin BY Sam Cooke The number 3 recommended song is Dog Days Are Over (Radio Edit) BY Florence + The Machine The number 4 recommended song is Revelry BY Kings Of Leon The number 5 recommended song is Undo BY Björk The number 6 recommended song is Cosmic Love BY Florence + The Machine The number 7 recommended song is Home BY Edward Sharpe & The Magnetic Zeros The number 8 recommended song is You've Got The Love BY Florence + The Machine The number 9 recommended song is Bring Me To Life BY Evanescence The number 10 recommended song is Tighten Up BY The Black Keys Recommendation for user with user id 6 The number 1 recommended song is Crumpshit BY Philippe Rochard The number 2 recommended song is Marry Me BY Train The number 3 recommended song is Hey_ Soul Sister BY Train The number 4 recommended song is Lucky (Album Version) BY Jason Mraz & Colbie Caillat The number 5 recommended song is One On One BY the bird and the bee The number 6 recommended song is I Never Told You BY Colbie Caillat The number 7 recommended song is Canada BY Five Iron Frenzy The number 8 recommended song is Fireflies BY Charttraxx Karaoke 
The number 9 recommended song is TULENLIEKKI BY M.A. Numminen The number 10 recommended song is Bring Me To Life BY Evanescence Recommendation for user with user id 7 The number 1 recommended song is Behind The Sea [Live In Chicago] BY Panic At The Disco The number 2 recommended song is The City Is At War (Album Version) BY Cobra Starship The number 3 recommended song is Dead Souls BY Nine Inch Nails The number 4 recommended song is Una Confusion BY LU The number 5 recommended song is Home BY Edward Sharpe & The Magnetic Zeros The number 6 recommended song is Climbing Up The Walls BY Radiohead The number 7 recommended song is Tighten Up BY The Black Keys The number 8 recommended song is Tive Sim BY Cartola The number 9 recommended song is West One (Shine On Me) BY The Ruts The number 10 recommended song is Cosmic Love BY Florence + The Machine Recommendation for user with user id 8 The number 1 recommended song is Undo BY Björk The number 2 recommended song is Canada BY Five Iron Frenzy The number 3 recommended song is Better To Reign In Hell BY Cradle Of Filth The number 4 recommended song is Unite (2009 Digital Remaster) BY Beastie Boys The number 5 recommended song is Behind The Sea [Live In Chicago] BY Panic At The Disco The number 6 recommended song is Rockin' Around The Christmas Tree BY Brenda Lee The number 7 recommended song is Devil's Slide BY Joe Satriani The number 8 recommended song is Revelry BY Kings Of Leon The number 9 recommended song is 16 Candles BY The Crests The number 10 recommended song is Catch You Baby (Steve Pitron & Max Sanna Radio Edit) BY Lonnie Gordon Recommendation for user with user id 873 The number 1 recommended song is The Scientist BY Coldplay The number 2 recommended song is Yellow BY Coldplay The number 3 recommended song is Clocks BY Coldplay The number 4 recommended song is Fix You BY Coldplay The number 5 recommended song is In My Place BY Coldplay The number 6 recommended song is Shiver BY Coldplay The number 7 recommended 
song is Speed Of Sound BY Coldplay The number 8 recommended song is Creep (Explicit) BY Radiohead The number 9 recommended song is Sparks BY Coldplay The number 10 recommended song is Use Somebody BY Kings Of Leon Recommendation for user with user id 23 The number 1 recommended song is Garden Of Eden BY Guns N' Roses The number 2 recommended song is Don't Speak BY John Dahlbäck The number 3 recommended song is Master Of Puppets BY Metallica The number 4 recommended song is TULENLIEKKI BY M.A. Numminen The number 5 recommended song is Bring Me To Life BY Evanescence The number 6 recommended song is Kryptonite BY 3 Doors Down The number 7 recommended song is Make Her Say BY Kid Cudi / Kanye West / Common The number 8 recommended song is Night Village BY Deep Forest The number 9 recommended song is Better To Reign In Hell BY Cradle Of Filth The number 10 recommended song is Xanadu BY Olivia Newton-John;Electric Light Orchestra
