赞
踩
import time
import numpy as np
import math

# --- Module-level state shared by the functions below -----------------------
# word -> occurrence count after read_file(); buildHFMTree() then overwrites
# each value with the word's leaf index in the Huffman tree.
wordHash = {}
# Number of distinct words (leaves of the Huffman tree).
wordNum = 0
# Number of context words taken on each side of the centre word.
window = 2
# Corpus tokens in reading order.
words = []
# Dimensionality of the word vectors and of the inner-node parameter vectors.
vecSize = 100
# Learning rate.
u = 0.1
# Index of the last training round; train() runs rounds 0..t inclusive.
t = 500


# Store the corpus words in the map
def read_file(path="test.txt"):
    """Read a whitespace-tokenised corpus and count word occurrences.

    Fills the module-level ``words`` (all tokens of all lines, in order),
    ``wordHash`` (word -> count) and ``wordNum`` (distinct word count).
    Any state left over from a previous call is discarded.

    Args:
        path: Corpus file, read as UTF-8. Defaults to ``"test.txt"``.

    Raises:
        OSError: If the file cannot be opened.
    """
    global wordNum
    # Mutate the containers in place so every existing reference to them
    # (e.g. names bound by ``from module import words``) sees the new data.
    wordHash.clear()
    words.clear()
    with open(path, encoding="utf-8") as f:
        for sentence in f:
            # split() without an argument drops the trailing newline and the
            # empty tokens that split(" ") produces on repeated spaces.
            words.extend(sentence.split())
    for word in words:
        wordHash[word] = wordHash.get(word, 0) + 1
    wordNum = len(wordHash)


# Build the Huffman coding for the words
def buildHFMTree():
    """Build a Huffman tree over the vocabulary in ``wordHash``.

    Expects ``wordHash`` to map word -> count (the state left by
    ``read_file``). As a side effect every value in ``wordHash`` is replaced
    by the word's leaf index; leaves are numbered 0..V-1 in order of
    decreasing count and inner nodes V..2V-2 in creation order, the last one
    being the root.

    Returns:
        (pos, parent): two lists of length 2*V. ``parent[i]`` is the index of
        node i's parent (``None`` for the root and for unused slots) and
        ``pos[i]`` is the branch bit of node i under its parent (0 for the
        lighter child, 1 for the heavier one).
    """
    vocab = sorted(wordHash.items(), key=lambda item: item[1], reverse=True)
    leafCount = len(vocab)
    length = leafCount * 2
    # Inner nodes that do not exist yet weigh +inf so they can never be
    # picked ahead of a real node, whatever the word counts are.
    weight = [math.inf] * length
    parent = [None] * length
    pos = [None] * length
    for index, (word, count) in enumerate(vocab):
        wordHash[word] = index
        weight[index] = count

    # Leaves are sorted by decreasing weight and inner nodes are created in
    # non-decreasing weight order, so the two lightest unmerged nodes are
    # always found at one of these two cursors.
    leaf = leafCount - 1   # lightest leaf not merged yet (moves left)
    node = leafCount       # lightest inner node not merged yet (moves right)
    for new in range(leafCount, 2 * leafCount - 1):
        pair = []
        for _ in range(2):
            if leaf >= 0 and weight[leaf] < weight[node]:
                pair.append(leaf)
                leaf -= 1
            else:
                pair.append(node)
                node += 1
        lighter, heavier = pair
        weight[new] = weight[lighter] + weight[heavier]
        pos[lighter] = 0
        pos[heavier] = 1
        parent[lighter] = new
        parent[heavier] = new
    return pos, parent


def sigmiod(n):
    """Return the logistic sigmoid of ``n`` (scalar or array).

    Only exp(-|n|) is evaluated, so large positive or negative inputs yield
    exactly 1.0 / 0.0 instead of overflowing to nan.
    """
    n = np.asarray(n, dtype=float)
    e = np.exp(-np.abs(n))
    result = np.where(n >= 0, 1.0 / (1.0 + e), e / (1.0 + e))
    # [()] turns a 0-d array back into a scalar and leaves n-d arrays as is.
    return result[()]


def getHFMCode(word, pos, parent):
    """Print and return the Huffman code of ``word``.

    The bits are listed from the leaf upwards (leaf-side bit first).

    Raises:
        KeyError: If ``word`` is not in the vocabulary.
    """
    i = wordHash[word]
    code = []
    while parent[i] is not None:
        code.append(pos[i])
        i = parent[i]
    print("单词'"+word+"'的哈弗曼编码:"+str(code))
    return code


def updataParam(word, pos, parent, ansVec, projVec, paramVec):
    """Compute one word's gradient step along its Huffman path.

    Args:
        word: Centre word being predicted.
        pos, parent: Tree description returned by ``buildHFMTree``.
        ansVec: ``projVec.dot(paramVec)``, one score per inner node.
        projVec: Averaged context vector.
        paramVec: Inner-node parameters, one column per inner node.

    Returns:
        (projChange, paramChange, ll): the update to add to each context word
        vector, the update to add to ``paramVec``, and the log-likelihood of
        ``word`` under the current parameters. Both updates are already
        scaled by the learning rate ``u``.
    """
    i = wordHash[word]
    ll = 0
    paramChange = np.zeros(paramVec.shape)
    projChange = np.zeros(projVec.shape)
    while parent[i] is not None:
        # Inner nodes are numbered from wordNum upwards; shift to a column.
        column = parent[i] - wordNum
        d = pos[i]
        n = ansVec[column]
        # log(sigmoid(n)) = -logaddexp(0, -n) and
        # log(1 - sigmoid(n)) = -logaddexp(0, n); both stay finite, so no
        # exception handling is needed around the logarithms.
        ll -= float((1 - d) * np.logaddexp(0.0, -n) + d * np.logaddexp(0.0, n))
        m = 1 - d - sigmiod(n)
        paramChange[:, column] += u * m * projVec
        projChange += u * m * paramVec[:, column]
        i = parent[i]
    return projChange, paramChange, ll


def initVec():
    """Create the initial word vectors and inner-node parameters.

    Returns:
        (wordVec, paramVec): ``wordVec`` has shape (wordNum, vecSize) with
        entries uniform in [-0.5/vecSize, 0.5/vecSize); ``paramVec`` is all
        zeros with shape (vecSize, wordNum - 1), or zero columns when the
        vocabulary has fewer than two words.
    """
    wordVec = (np.random.random((wordNum, vecSize)) - 0.5) / vecSize
    paramVec = np.zeros((vecSize, max(wordNum - 1, 0)))
    return wordVec, paramVec


def contextPositions(i, total):
    """Return the corpus positions within ``window`` of ``i``, excluding ``i``.

    Positions are clipped to ``0 .. total - 1`` and listed left to right.
    """
    first = max(i - window, 0)
    last = min(i + window, total - 1)
    return [j for j in range(first, last + 1) if j != i]


def train(wordVec, paramVec, pos, parent, epochs=None):
    """Train CBOW with hierarchical softmax using full-batch updates.

    ``wordVec`` and ``paramVec`` are updated in place. Within a round all
    gradients are computed against the same parameters and applied together
    at the end of the round.

    Args:
        wordVec: Word vectors, one row per leaf index.
        paramVec: Inner-node parameters, one column per inner node.
        pos, parent: Tree description returned by ``buildHFMTree``.
        epochs: Index of the last round; rounds 0..epochs are run. Defaults
            to the module-level ``t``.
    """
    if epochs is None:
        epochs = t
    total = len(words)
    # The context rows of every position never change between rounds.
    contexts = [
        [wordHash[words[j]] for j in contextPositions(i, total)]
        for i in range(total)
    ]
    for k in range(epochs + 1):
        paramChange = np.zeros(paramVec.shape)
        wordChange = np.zeros(wordVec.shape)
        for i, context in enumerate(contexts):
            if not context:
                # A word without neighbours has nothing to average.
                continue
            projVec = wordVec[context].mean(axis=0)
            ansVec = projVec.dot(paramVec)
            projChange1, paramChange1, ll = updataParam(
                words[i], pos, parent, ansVec, projVec, paramVec
            )
            for row in context:
                wordChange[row] += projChange1
            paramChange += paramChange1
            if k % 100 == 0:
                print("第"+str(k)+"轮训练中,单词"+words[i]+"的损失为:"+str(ll))
        wordVec += wordChange
        paramVec += paramChange


if __name__ == "__main__":
    print("开始读取单词")
    read_file()
    print("读取单词结束")
    print("开始构建哈弗曼树")
    pos, parent = buildHFMTree()
    print("构建完成")
    # Uncomment to inspect the code of a single word:
    # getHFMCode('interpretations', pos, parent)
    print("开始初始化单词向量")
    wordVec, paramVec = initVec()
    print("单词向量初始完成")
    print("准备训练参数")
    train(wordVec, paramVec, pos, parent)
语料:
In the near future the translation history will only be viewable when you log in to your account and it will be centrally managed in my activity record. This upgrade will clear the previous history so if you want the system to record certain translations for future review please be sure to save the translation results
运行结果:
不足
相对于源码,本实现尚未包含负采样、多线程以及指数运算近似等用于提升性能的功能。
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。