赞
踩
一.数据集
train.dat共393366项,第一维是用户的代号,第二维是商品代号,第三项是用户对该商品的评分,第四项是评论数,第五项是评论内容。
test.dat第一维是用户代号,第二维是商品代号。
二.简单思路及实现过程
这里简单先贴出代码实现,后期再慢慢补上具体过程说明~
1.数据预处理
去除部分无关常用词,这里调用nltk包,将评论词词根化等,进行评论数据的预处理,存为newdat.dat
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
def textPrecessing(text):
    """Preprocess one review: tokenize, drop English stopwords, keep only
    noun tokens (POS tags starting with 'NN'), stem them, and return the
    result as a single space-joined string.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        Space-joined stemmed noun tokens.
    """
    word_list = nltk.word_tokenize(text)
    # PERF FIX: the original evaluated stopwords.words('english') inside the
    # comprehension condition, i.e. once per token (each call re-reads the
    # corpus and membership on a list is O(n)).  Build the set once per call.
    stop_set = set(stopwords.words('english'))
    filtered = [w for w in word_list if w not in stop_set]
    tagged = nltk.pos_tag(filtered)
    nouns = [w for w, pos in tagged if pos.startswith('NN')]
    stemmer = PorterStemmer()
    return " ".join(stemmer.stem(w) for w in nouns)
def split_word():
    """Preprocess the raw training file.

    Reads train.dat, keeps the first four space-separated fields (user,
    item, rating, review count) untouched, runs the remaining review text
    through textPrecessing(), and writes one cleaned line per record to
    newdat.dat.
    """
    cleaned = []
    with open("E:/project/o/comdata/train.dat", encoding='utf-8') as f:
        # Stream line by line instead of materializing f.readlines().
        for raw in f:
            fields = raw.strip("\n").split(" ")
            head = ' '.join(fields[0:4])          # user, item, rating, count
            body = textPrecessing(" ".join(fields[4:]))
            cleaned.append(' '.join([head, body]))
    with open("E:/project/o/comdata/newdat.dat", "w", encoding='utf-8') as w:
        for line in cleaned:
            w.write(line)
            w.write('\n')
    # BUG FIX: the original called f.close()/w.close() after the `with`
    # blocks; `with` already closes both handles, so the calls are removed.
# Load the training set into traindata (first four fields) and commentdata
# (stoplist-filtered review words).  traindata/commentdata/stoplist are
# module-level names defined elsewhere in the file.
# BUG FIX(review): the original looped `for j in range(303366)` although the
# dataset description says train.dat has 393366 records, silently dropping
# the last ~90k lines.  Iterating the file directly reads every line.
with open('train.dat', encoding='UTF-8') as file_object:
    for line in file_object:
        line = line.strip('\n')
        if not line:
            continue
        fields = line.split(' ')                     # split once, reuse
        traindata.append(fields[0:4])                # user, item, rating, count
        # Review words with the stoplist filtered out.
        commentdata.append([w for w in fields[4:] if w not in stoplist])
2.获取对应的用户名-id,物品-id字典,构建用户id-物品id 评分表
# Map raw user names and item names to dense integer ids (in order of first
# appearance, matching the original behavior), then build the user-by-item
# rating matrix.
# CLEANUP: the original kept a duplicate-user counter `uu` that was never
# read, tested membership twice per user, and made two separate passes for
# the two dictionaries; all folded into one pass here.
useridlist = {}   # user name -> row index
itemidlist = {}   # item name -> column index
for record in traindata:
    user, item = record[0], record[1]
    if user not in useridlist:
        useridlist[user] = len(useridlist)
    if item not in itemidlist:
        itemidlist[item] = len(itemidlist)
scorematr = np.zeros((len(useridlist), len(itemidlist)))  # rating matrix (u x i)
for record in traindata:
    i = useridlist[record[0]]
    j = itemidlist[record[1]]
    scorematr[i][j] = int(record[2])
3.尝试LFM法
def Lfm(T,k):
alpha = np.float32(0.05)
lambda_ = np.float32(0.08)
m, n = T.shape
u = np.float32(np.random.rand(m,k))
v = np.float32(np.random.ran77dn(k,n))
du=np.float32(0)
dv=np.float32(0)
for t in range(500):
for i in range(m):
for j in range(n):
if math.fabs(T[i][j]) > 1e-4:
err = T[i][j] - np.dot(u[i],v[:,j])
for t in range(k):
du = err * v[t][j] - lambda_ * u[i][t]
dv = err * u[i][t] - lambda_ * v[t][j]
u[i][t] += alpha * du; v[t][j] += alpha * dv
return u,v
#u,v=Lfm(scorearr) print(np.dot(u,v))
4.尝试LDA法/word2vec法基于评论预测
① LDA 模型
# Build the gensim dictionary over the review corpus and train a 20-topic
# LDA model.
dictionary = corpora.Dictionary(commentdata)
# BUG FIX: Dictionary.filter_tokens expects *token ids* (integers) in
# bad_ids, not the words themselves; the original passed strings, which
# filtered nothing.  Map the common words through token2id first.
_common_words = ['i', 'the', 'is', 'and', 'very', 'a', 'an', 'had',
                 'about', 'for', 'it', 'if', 'of', 'to']
dictionary.filter_tokens(
    bad_ids=[dictionary.token2id[w] for w in _common_words
             if w in dictionary.token2id])
dictionary.filter_extremes(no_above=40/100)  # drop tokens in >40% of docs
# dictionary.filter_n_most_frequent(8)
print(dictionary.token2id)                   # dump the vocabulary mapping
corpus = [dictionary.doc2bow(text) for text in commentdata]
print(corpus[3:5])
print("corpus----")
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=20)
# Persist the trained LDA model for reuse.
lda.save("./lda.model")
# lda=LdaModel.load("./lda.model")
② word2vec 模型
model="./pinglun.model"
if os.path.exists(model):
model=gensim.models.Word2Vec.load("./pinglun.model")
else :
model = gensim.models.word2vec.Word2Vec(commentdata, size=200,
workers=7,min_count=2) #训练 word2vec模型
model.save("./pinglun.model")
(1)求θuij
① LDA方法的θ取法
def getHuik(doc_topic, n, bo):
    """Build the per-(user, item) topic distribution from LDA output.

    Parameters
    ----------
    doc_topic : sequence
        Per-review topic distributions; doc_topic[c][t] is assumed to be a
        (topic_id, probability) pair for review c — confirm against the LDA
        output format actually passed in.
    n : int
        Number of topics kept per review.
    bo : ndarray
        Output matrix (num_reviews x n); filled in place with the same
        probabilities, indexed by review id.

    Returns
    -------
    ndarray, shape (u0, i0, n) — u0/i0 are module-level user/item counts.
    """
    Huikarr = np.zeros((u0, i0, n))
    for ui in traindata:
        u = int(useridlist[ui[0]])
        i = int(itemidlist[ui[1]])
        comid = Bui[u][i]                 # review id for this (user, item) pair
        item = doc_topic[int(comid)]
        # BUG FIX: the loop bound was hard-coded to 5 although the array is
        # allocated with n topics; use n so the two always agree.
        for topid in range(n):
            prob = float(item[topid][1])  # probability of topic `topid`
            Huikarr[u][i][topid] = prob
            bo[int(comid)][topid] = prob
    return Huikarr
② word2vec的θ取法
def getHuik(doc_topic):
    """Build Huikarr[u][i][topic] from word2vec similarities.

    For each topic word, take its 50 most-similar words; then for every
    (user, item) review, sum the similarities of the review words that fall
    in that set.  Fills the module-level Huikarr in place and returns it.
    """
    # PERF FIX: load the model once instead of once per topic.
    model = gensim.models.Word2Vec.load("./pinglun.model")
    for top in topic:
        topid = topictokenlist[top]
        temp = topic[topid]                      # topic's representative word
        # BUG FIX: most_similar returns (word, score) pairs, but the original
        # tested `item in simlar` against the pairs, so it never matched.
        # Keep just the words, in a set for O(1) membership.
        simwords = {w for w, _ in model.most_similar(temp, topn=50)}
        for ui in traindata:
            u = int(useridlist[ui[0]])
            i = int(itemidlist[ui[1]])
            comid = Bui[u][i]                    # review id for (user, item)
            # BUG FIX: the accumulator was reset inside the word loop, so only
            # the last word's similarity survived; initialize once per review
            # so it really is the *sum* the original comment promised.
            tempcom = 0
            for word in commentdata[int(comid)]:
                if word in simwords:
                    tempcom += model.similarity(word, temp)
            Huikarr[u][i][topid] = tempcom
    return Huikarr
(1)求puij ‘
def getpuj0(u, j, Huij=None):
    """Unnormalized preference of user u for topic j.

    Sums Huij[u][i][j] over the item index i of every training record and
    divides by Cu[u] (module-level per-user normalizer).

    Parameters
    ----------
    u : int
        User index.
    j : int
        Topic index.
    Huij : ndarray, optional
        Precomputed H array.  When omitted it is rebuilt via getHuik(),
        which is expensive — pass it in when calling repeatedly (the
        original rebuilt it on *every* call).

    Returns
    -------
    float
    """
    if Huij is None:
        Huij = getHuik()  # NOTE(review): the getHuik variants above take
                          # arguments — confirm which definition is in scope.
    total = 0
    for record in traindata:
        i = int(itemidlist[record[1]])  # item index of this training record
        total += Huij[u][i][j]
    return total / Cu[u]
# P'[u][j]: unnormalized user-topic preference table.
Puj0 = np.zeros((len(useridlist), len(topictokenlist)))

def getPuj0():
    """Fill the module-level Puj0 table via getpuj0 and return it."""
    n_users = len(useridlist)
    n_topics = len(topictokenlist)
    for u in range(n_users):
        for j in range(n_topics):
            Puj0[u][j] = getpuj0(u, j)
    return Puj0
(3)求 puj
def getPuj1():
    """Row-normalize Puj0 into the module-level Puj1 table and return it.

    Each user's topic weights are divided by their sum so every row of
    Puj1 is a probability distribution.
    """
    n_users = len(useridlist)
    n_topics = len(topictokenlist)
    # Per-user normalizer: the sum of that user's topic weights.
    bottom = np.zeros((n_users))
    for u in range(n_users):
        bottom[u] = sum(Puj0[u][j] for j in range(n_topics))
    for u in range(n_users):
        for j in range(n_topics):
            Puj1[u][j] = Puj0[u][j] / bottom[u]
    return Puj1
(2)求qij’
def getqij0(i, j, Huij=None):
    """Unnormalized weight of topic j for item i.

    Sums Huij[u][i][j] over every user and divides by Ci[i] (module-level
    per-item normalizer).

    Parameters
    ----------
    i : int
        Item index.
    j : int
        Topic index.
    Huij : ndarray, optional
        Precomputed H array.  When omitted it is rebuilt via getHuik(),
        which is expensive — pass it in when calling repeatedly (the
        original rebuilt it on *every* call).

    Returns
    -------
    float
    """
    if Huij is None:
        Huij = getHuik()  # NOTE(review): confirm which getHuik variant is in scope.
    total = 0
    for u in range(len(useridlist)):
        total += Huij[u][i][j]
    return total / Ci[i]
# j
# Q'[i][j]: unnormalized item-topic weight table.
Qij0 = np.zeros((len(itemidlist), len(topictokenlist)))

def getQij0():
    """Fill the module-level Qij0 table via getqij0 and return it."""
    n_topics = len(topictokenlist)
    for i, row in enumerate(Qij0):
        for j in range(n_topics):
            row[j] = getqij0(i, j)
    return Qij0
(4)求qij
def getQij1():
    """Row-normalize Qij0 into the module-level Qij1 table and return it.

    Each item's topic weights are divided by their sum so every row of
    Qij1 is a probability distribution.
    """
    n_items = len(itemidlist)
    n_topics = len(topictokenlist)
    # Per-item normalizer: the sum of that item's topic weights.
    bottom = np.zeros((n_items))
    for i in range(n_items):
        bottom[i] = sum(Qij0[i][j] for j in range(n_topics))
    for i in range(n_items):
        for j in range(n_topics):
            Qij1[i][j] = Qij0[i][j] / bottom[i]
    return Qij1
(5)产生用户u未评论物品i的主题分布
def getTuij0(Huij):
    """Unnormalized topic distribution of each (user, item) pair.

    T0[c][j] = Puj1[u][j] * Qij1[i][j], where c = Bui[u][i] is the review
    id of the pair — the same review-id-by-topic layout getTuij_input
    consumes downstream.

    Parameters
    ----------
    Huij : ndarray
        Passed through to getQij1/getPuj1.

    Returns
    -------
    ndarray, shape (393366, 5).
    """
    # BUG FIX: the original allocated this (393366, 5) table, then shadowed
    # it with an unused 3-D (users x items x 5) array, and indexed with the
    # undefined name `m` instead of the local `mm`.  Keep the 2-D table
    # (the shape the downstream code indexes by review id) and use `mm`.
    Tuij0 = np.zeros((393366, 5))
    Qij1 = getQij1(Huij)  # NOTE(review): the getQij1/getPuj1 defined above
    Puj1 = getPuj1(Huij)  # take no arguments — confirm which variant is meant.
    for u in range(len(useridlist)):
        for i in range(len(itemidlist)):
            mm = int(Bui[u][i])  # review id of this (user, item) pair
            for j in range(5):
                Tuij0[mm][j] = Puj1[u][j] * Qij1[i][j]
    return Tuij0
def getTuij_input(Huij):
    """Normalize getTuij0's per-review topic rows to sum to 1.

    Parameters
    ----------
    Huij : ndarray
        Passed through to getTuij0.

    Returns
    -------
    ndarray, shape (393366, 5)
        Row c is the normalized topic distribution of review c.
    """
    Tuij_input = np.zeros((393366, 5))
    bottom = np.zeros((393366, 1))   # per-review normalizer
    Tuij0 = getTuij0(Huij)
    for u in range(len(useridlist)):
        for i in range(len(itemidlist)):
            # BUG FIX: the original indexed `bottom` with the raw Bui value
            # (`jj`), a float, which NumPy rejects as an index; cast once
            # and reuse for both the row read and the normalizer write.
            hh = int(Bui[u][i])
            row_sum = 0
            for j in range(5):
                row_sum += Tuij0[hh][j]
            bottom[hh] = row_sum
    for u in range(len(useridlist)):
        for i in range(len(itemidlist)):
            # BUG FIX: `tok` was also an uncast float index; hoisted out of
            # the inner loop since it is invariant in j.
            tok = int(Bui[u][i])
            for j in range(5):
                Tuij_input[tok][j] = Tuij0[tok][j] / bottom[tok]
    return Tuij_input
(6)线性回归与预测
# Linear-regression scoring: fit rating ~ per-review topic distribution on
# the training reviews, then predict a score for each (user, item) test pair.
bo=np.zeros((393366,5))   # per-review topic probabilities; filled in place by getHuik
m=getHuik(doc_topic,5,bo) # H[u][i][topic] array (also populates bo as a side effect)
clf= sklearn.linear_model.LinearRegression()
# NOTE(review): `mark` (the training ratings) is defined elsewhere in the
# file — confirm its rows align with bo's review-id ordering.
clf.fit(bo,mark)
Test_input=getTuij_input(m)  # normalized topic distribution per review id
print("结果:")
for item in testdata:
    # item=testdata[1]
    print(item)
    # NOTE(review): these lookups raise KeyError for users/items that never
    # appear in the training set — confirm test.dat is covered.
    u=int(useridlist[item[0]])
    i=int(itemidlist[item[1]])
    # NOTE(review): Test_input is 2-D (review id x topic), so Test_input[u][i]
    # is a single scalar, while sklearn's predict expects a 2-D sample array —
    # confirm the intended row lookup here.
    print(clf.predict(Test_input[u][i])) # print the predicted rating
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。