赞
踩
NLP 使用有一段时间了,代码总是写得有点凌乱,想总结一下:
1. 在使用时,希望这个是单例,python的单例实现;
2. 通过cmdline来控制;
3. 在一个类中实现加载数据、加载词典及模型的功能:
3.1 构建一个语料词典;
3.2 训练tfidf模型及基于它的关键词抽取;
3.3 训练word2vec及基于它的相关词汇的提取;
3.4 基于关键词特征的simhash的文章编码,形成文章的指纹;并实现基于文章指纹的文章相似度计算。
TF:
The weight of a term that occurs in a document is simply proportional to the term frequency.
IDF:
The specificity of a term can be quantified as an inverse function of the number of documents in which it occurs.
详细的参见[4] tf–idf https://en.wikipedia.org/wiki/Tf%E2%80%93idf
simhash 是 Google 用来处理海量文本去重的算法。simhash 将文档转换成一个 64 位的 hash 码,然后根据两个 hash 码之间的海明距离 D 来判断文章是否相似;根据经验,当 D 小于 3 时,可以判断两个文档相似。[1]
def singleton(cls):
    """Class decorator that turns ``cls`` into a lazily created singleton.

    The first call to the decorated name builds the instance with the
    arguments of that call; every later call returns the same instance and
    ignores whatever arguments it was given.

    :param cls: the class to wrap.
    :return: a factory function that replaces the class name.
    """
    instances = {}

    def wrapper(*args, **kwargs):
        if cls not in instances:
            instances[cls] = cls(*args, **kwargs)
            # Logged on creation only; a log call placed after the
            # ``return`` below would be unreachable.
            logging.info('singleton size %d' % (len(instances)))
        return instances[cls]

    return wrapper
from zhon.hanzi import non_stops
# Register the custom vocabulary file with jieba before any segmentation.
jieba.load_userdict(DICT_PATH)

# Stop words, one per line.  Kept as a dict (only the keys matter) so that
# membership tests are hash lookups; ``with`` closes the file handle that the
# bare ``open(...)`` in a comprehension would leave open.
with open(STOPWORDS_PATH, 'r', encoding='utf-8') as _stop_file:
    STOPS_LIST = dict.fromkeys(line.strip() for line in _stop_file)

# ASCII punctuation followed by zhon's ``non_stops`` characters.
# NOTE(review): zhon also exposes ``stops`` (sentence-ending marks), which
# are not included here -- confirm they are covered by the stop-word list.
C_PUNCTUATION = string.punctuation + non_stops
def cut_txt(txt, r_type=1):
    """Segment ``txt`` with jieba and drop stop words and punctuation.

    :param txt: raw text to segment.
    :param r_type: ``2`` returns the tokens as a list; any other value
        returns them joined by single spaces.
    :return: list of tokens, or one space-joined string, depending on
        ``r_type``.  Tokens are returned exactly as jieba produced them
        (the left-strip is used for filtering only).
    """
    tokens = []
    for word in jieba.cut(txt, cut_all=False):
        stripped = word.lstrip()
        # Drop whitespace-only tokens explicitly.  ``str.lstrip`` never
        # returns None, so an ``is not None`` test cannot filter anything.
        if not stripped:
            continue
        if stripped in STOPS_LIST:
            continue
        # C_PUNCTUATION is a string, so this is a substring test: it also
        # rejects multi-character runs that appear contiguously in it.
        if stripped in C_PUNCTUATION:
            continue
        tokens.append(word)
    return tokens if r_type == 2 else " ".join(tokens)
# 词典数据加载
# coding=utf-8
import logging
import time
import numpy as np
from gensim import corpora, models
from gensim.models import Word2Vec
class ForDict(object):
    """Re-iterable reader over a tokenised corpus file for dictionary building.

    Each iteration yields one line of the file as a list of lower-cased,
    whitespace-separated tokens (an empty list for a blank line).  The file
    is opened afresh on every ``iter()`` call, so the object can be iterated
    more than once.
    """

    def __init__(self, file):
        """
        :param file: path of the UTF-8 text file to read.
        """
        self.file = file

    def __iter__(self):
        # ``with`` closes the handle once iteration finishes or the
        # generator is discarded, instead of leaking it.
        with open(self.file, 'r', encoding='utf-8') as corpus:
            for line in corpus:
                yield line.lower().split()
# 对Word2Vec加载数据
class ForWord2Vec(object):
    """Re-iterable sentence reader that feeds a corpus file to Word2Vec.

    Each iteration yields one line of the file as a list of
    whitespace-separated tokens.  Unlike ``ForDict`` the tokens are not
    lower-cased.  The file is opened afresh on every ``iter()`` call, so the
    object can be iterated more than once.
    """

    def __init__(self, in_file):
        """
        :param in_file: path of the UTF-8 text file to read.
        """
        self.in_file = in_file

    def __iter__(self):
        # ``with`` closes the handle once iteration finishes or the
        # generator is discarded, instead of leaking it.
        with open(self.in_file, encoding='utf-8') as corpus:
            for line in corpus:
                yield line.split()
@singleton
class NlpModel(object):
    """Train, load and query the corpus dictionary, TF-IDF and word2vec models.

    The class is wrapped by ``singleton``: the first ``NlpModel(...)`` call
    creates the instance and later calls return that same instance, ignoring
    their arguments.  Models are loaded from disk lazily, the first time a
    method needs them, and then kept on the instance.
    """

    def __init__(self,
                 opts=None,
                 dic_path='%s%s.dict' % (MODEL_PATH, 'all'),
                 tfidf_path='%s%s.tfidf' % (MODEL_PATH, 'all'),
                 word2vec_path='%s%s.w2v' % (MODEL_PATH, 'all')):
        """
        :param opts: run options; either an object with attributes (such as
            the values returned by ``MODEL_OP.parse_args``) or a plain dict.
            Only ``is_load_data`` is read.  ``None`` is treated as
            ``{'is_load_data': False}``.
        :param dic_path: file the dictionary is saved to / loaded from.
        :param tfidf_path: file the TF-IDF model is saved to / loaded from.
        :param word2vec_path: file the word2vec model is saved to / loaded
            from.
        """
        # A fresh dict per instance instead of a shared mutable default.
        self.opts = {'is_load_data': False} if opts is None else opts
        self.has_load_data = False
        self.dic = None
        self.dic_path = dic_path
        self.tfidf = None
        self.tfidf_path = tfidf_path
        # (sic) attribute name kept as-is; it is not read anywhere in this class.
        self.has_comebined = False
        self.word2vec = None
        self.word2vec_path = word2vec_path

    def _wants_data_load(self):
        """Return the ``is_load_data`` option for both dict and object ``opts``."""
        if isinstance(self.opts, dict):
            return bool(self.opts.get('is_load_data', False))
        return bool(getattr(self.opts, 'is_load_data', False))

    def _load_dic(self):
        """Load the dictionary from ``self.dic_path`` unless already loaded."""
        logging.info('load dict...')
        if not self.dic:
            self.dic = corpora.Dictionary.load(self.dic_path)
        else:
            logging.info('has loaded dic..')

    def _load_tfidf(self):
        """Load the TF-IDF model from ``self.tfidf_path`` unless already loaded."""
        if not self.tfidf:
            self.tfidf = models.TfidfModel.load(self.tfidf_path)
        else:
            logging.info('has loaded tfidf.')

    def _load_word2vec(self):
        """Load the word2vec model from ``self.word2vec_path`` unless already loaded."""
        if not self.word2vec:
            self.word2vec = Word2Vec.load(self.word2vec_path)
        else:
            logging.info('word2vec has loaded.')

    def _load_data(self):
        """Fetch and prepare the corpus once, if ``is_load_data`` is set.

        Calls ``load_data()`` and ``combine_cut_data()``, which are defined
        outside this class.
        NOTE(review): presumably these download the raw data and write the
        combined ``all.cut`` corpus file -- confirm against their definitions.
        """
        if (not self.has_load_data) and self._wants_data_load():
            logging.info('load data...')
            load_data()
            combine_cut_data()
            self.has_load_data = True
        else:
            logging.info('has downed data')

    def train_corpus_dic(self):
        """Build the corpus dictionary from ``all.cut`` and save it.

        The dictionary is written to ``self.dic_path``.

        :return: None
        """
        t0 = time.time()
        # 1. Load the data (only when requested through the options).
        self._load_data()
        # 2. Build the dictionary.
        logging.info('create dictionary.')
        dict_data = ForDict('%s%s.cut' % (CORPUS_PATH, 'all'))
        dictionary = corpora.Dictionary(dict_data)
        # 3. Save the dictionary to the configured path.
        logging.info('save dictionary.')
        dictionary.save(self.dic_path)
        # 4. Log the summary.
        logging.info('number of words:%d; number of num_docs:%d; numbert of num_pos:%d; cost time:%f' % (
            len(dictionary), dictionary.num_docs, dictionary.num_pos, time.time() - t0))

    def train_tfidf(self, opts=None):
        """Fit a TF-IDF model on the saved dictionary and save it.

        The model is written to ``self.tfidf_path``.  ``identity_a`` and
        ``df2idf_a`` are weighting functions defined outside this class.

        :param opts: unused; kept for backward compatibility.
        :return: None
        """
        t0 = time.time()
        logging.info('do_keyword begin...')
        # 1. Load the dictionary.
        self._load_dic()
        # 2. Compute TF-IDF.
        logging.info('compute tfidf...')
        tfidf = models.TfidfModel(dictionary=self.dic, wlocal=identity_a, wglobal=df2idf_a)
        # 3. Save the model to the configured path.
        logging.info('save tfidf...')
        tfidf.save(self.tfidf_path)
        logging.info('do_keyword end.cost:%f' % (time.time() - t0))

    def train_word2vec(self):
        """Train a word2vec model on ``all.cut`` and save it.

        The model is written to ``self.word2vec_path``.

        :return: None
        """
        t0 = time.time()
        # 1. Load the data (only when requested through the options).
        self._load_data()
        # 2. Train word2vec.
        logging.info('begin word2vec...')
        sentences = ForWord2Vec('%s%s.cut' % (CORPUS_PATH, 'all'))
        # NOTE(review): ``size`` is the gensim 3.x keyword; gensim 4 renamed
        # it to ``vector_size`` -- confirm the installed version.
        m_word2vec = Word2Vec(sentences, size=100, window=5, min_count=5, workers=4)
        # 3. Save the model to the configured path.
        m_word2vec.save(self.word2vec_path)
        logging.info('end word2vec. cost:%fs' % (time.time() - t0))

    def get_keyword(self, doc_str=None, top_k=10, is_blank=False):
        """Extract the highest-weighted TF-IDF keywords of a document.

        :param doc_str: document text.
        :param top_k: maximum number of keywords to return.
        :param is_blank: ``True`` if ``doc_str`` is already tokenised and
            space-separated; otherwise it is segmented with ``cut_txt``.
        :return: list of ``(word, weight)`` pairs sorted by descending
            weight, or ``None`` when ``doc_str`` is empty or ``None``.
        """
        t0 = time.time()
        logging.info('do_keyword begin...')
        # 1. Load the dictionary.
        self._load_dic()
        # 2. Load the TF-IDF model.
        logging.info('load tfidf...')
        self._load_tfidf()
        # 3. Compute the keywords.
        rs = None
        if doc_str:
            logging.info('leng of doc_str:%s' % (len(doc_str)))
            tokens = doc_str.split() if is_blank else cut_txt(doc_str, r_type=2)
            vec_bow = self.dic.doc2bow(tokens, allow_update=False)
            ranked = sorted(self.tfidf[vec_bow], key=lambda pair: pair[1], reverse=True)
            # Slicing already clamps to the available length.
            rs = [(self.dic[token_id], weight) for token_id, weight in ranked[:top_k]]
        logging.info('do_keyword end.cost time:%f' % (time.time() - t0))
        return rs

    def _string_hash(self, source):
        """Hash ``source`` to a 64-character string of ``'0'``/``'1'``.

        :param source: string to hash.
        :return: the low 64 bits of the hash, most significant bit first.
            An empty ``source`` gives 64 zeros, so the return type is always
            ``str`` and callers can iterate over it.
        """
        if source == "":
            return '0' * 64
        x = ord(source[0]) << 7
        m = 1000003
        mask = 2 ** 128 - 1
        for c in source:
            x = ((x * m) ^ ord(c)) & mask
        x ^= len(source)
        # ``x`` is never negative here (it is masked above), so no sign or
        # ``-1`` handling is needed before formatting.
        return format(x, 'b').zfill(64)[-64:]

    def get_simhash(self, content, top_k=200):
        """Compute the simhash fingerprint of ``content``.

        Each keyword's 64-bit hash adds ``+weight`` to every bit position
        holding ``'1'`` and ``-weight`` to every position holding ``'0'``;
        a position whose total is positive becomes ``'1'`` in the result.

        :param content: document text.
        :param top_k: number of TF-IDF keywords used as features.
        :return: string of ``'0'``/``'1'`` characters; ``'00'`` when no
            keyword could be extracted; ``''`` when an error occurred (the
            error is logged, not raised).
        """
        t0 = time.time()
        logging.info('get simhash.')
        bits = []
        try:
            keywords = self.get_keyword(content, top_k=top_k, is_blank=False)
            # ``get_keyword`` returns None for empty content; treat it like
            # "no features" instead of failing on iteration.
            if not keywords:
                return '00'
            rows = []
            for feature, weight in keywords:
                weight = round(weight, 6)
                rows.append([weight if bit == '1' else -weight
                             for bit in self._string_hash(feature)])
            totals = np.sum(np.array(rows), axis=0)
            bits = ['1' if total > 0 else '0' for total in totals]
        except Exception as e:
            # Deliberate best effort: log and fall through to return ''.
            logging.error('Simhash.simhash err:%s' % e)
        finally:
            logging.info('get hash end.cost time:%f' % (time.time() - t0))
        return ''.join(bits)

    def get_word_similar(self, word='婴儿', topn=10):
        """Return the words most similar to ``word`` in the word2vec model.

        :param word: query word.
        :param topn: number of similar words to return.
        :return: whatever ``wv.similar_by_word`` returns.
            NOTE(review): presumably a list of ``(word, similarity)`` pairs,
            and a ``KeyError`` for out-of-vocabulary words -- confirm
            against the installed gensim version.
        """
        t0 = time.time()
        self._load_word2vec()
        rs = self.word2vec.wv.similar_by_word(word, topn=topn)
        logging.info('word similar end.cost time %s' % (time.time() - t0))
        return rs
from optparse import OptionParser
usage = "usage: %prog [options] arg1 arg2"

# Command-line options, one entry per option: (flags, add_option keywords).
_OPTION_SPECS = (
    (("-t", "--type"),
     dict(dest='type',
          default='keyword',
          help="types[keyword,dict,tfidf,word2vec]")),
    (("-o", "--output"),
     dict(dest="model",
          default="test.model",
          help="output model file name")),
    (("-l", "--is_load_data"),
     dict(action="store_true",
          dest="is_load_data",
          default=False,
          help="whether will be loaded data or not")),
)

# Module-level parser used by the ``__main__`` section.
MODEL_OP = OptionParser(usage=usage)
for _flags, _settings in _OPTION_SPECS:
    MODEL_OP.add_option(*_flags, **_settings)
import logging
# Logging setup.  The root logger writes DEBUG and above to a dated file
# under LOGPATH (opened with mode LOGMODE); a second stream handler also
# emits INFO and above.  Both use the same record layout.
_LOG_FORMAT = '%(asctime)s %(levelname)-8s: %(threadName)s %(module)s::%(filename)s::%(funcName)s[line:%(lineno)d] %(message)s'
_LOG_DATEFMT = '%Y-%m-%d %H:%M:%S'

logging.basicConfig(
    level=logging.DEBUG,
    format=_LOG_FORMAT,
    datefmt=_LOG_DATEFMT,
    filemode=LOGMODE,
    filename='%s/maodel_%s.log' % (LOGPATH, datetime.datetime.now().strftime('%Y-%m-%d')),
)

formatter = logging.Formatter(_LOG_FORMAT, datefmt=_LOG_DATEFMT)
console = logging.StreamHandler()
console.setLevel(logging.INFO)
console.setFormatter(formatter)
logging.getLogger('').addHandler(console)

# Start-up banner, emitted once at import time.
logging.info("""
train...train...train...
* * * * * *
- - -
train...train...train...
""")
if __name__ == '__main__':
    # Entry point: optionally train one model (selected with -t/--type), then
    # run a small demo of simhash, keyword extraction and similar-word lookup.
    logging.info('start args:%s', str(sys.argv))
    (opts, args) = MODEL_OP.parse_args(sys.argv[1:])
    # Validate the command line before building the model.
    # OptionParser.error() prints the message and exits by itself, so no
    # explicit sys.exit() is needed afterwards.
    if len(args) > 0:
        MODEL_OP.error("this script takes no arguments.")
    model = NlpModel(opts)

    if opts.type == 'dict':  # corpus dictionary
        logging.info('dict')
        model.train_corpus_dic()
    elif opts.type == 'tfidf':  # TF-IDF model
        logging.info('tfidf')
        model.train_tfidf()
    elif opts.type == 'word2vec':  # word2vec model
        logging.info('word2vec')
        model.train_word2vec()
    # Any other type (including the default 'keyword') trains nothing.

    # Demo document, shared by the simhash and keyword examples below.
    # ``model`` is reused as-is: NlpModel is a singleton, so calling
    # NlpModel() again would return this same instance.
    sample_doc = """
夏季怎么吃才能不犯困 ;钾是人体内不可缺少的元素,一般成年人体内的含钾元素150g左右,其作用主要是维持神经、肌肉的正常功能因此,人体一旦缺钾,正常的运动就会受到影响夏季缺钾不仅精力和体力下降,而且耐热能力也会降低,使人感到倦怠无力严重缺钾时,可导致人体内酸碱平衡失调、代谢紊乱、心律失常,全身肌肉无力、懒动此时,有些人为了使自己少出汗而过量地饮用盐开水殊不知,这样做又容易加重心脏负担,使体内钾、钠平衡失调而适当补充钾元素则有利于改善体内钾、钠平衡,既可以防止血压上升,又可防止血压过低下面介绍一些含钾元素较高的食物困了怎么办?告诉你几种防犯困食物一、粮食中,以荞麦、玉米、红薯、大豆等含钾元素较高二、水果中,以香蕉含钾元素最丰富三、蔬菜中,以菠菜、苋菜、香菜、油菜、甘蓝、芹菜、大葱、青蒜、莴笋、土豆、山药、鲜豌豆、毛豆等含钾元素较高四、海藻类,含钾元素相当丰富,如紫菜每百克含钾1640毫克,是含钠的175倍;海带含钾是含钠的22倍;羊栖菜含钾是钠的3.1倍因此,紫菜汤、紫菜蒸鱼、紫菜肉丸、凉拌海带丝、海带炖肉等都是夏季补钾菜肴的上品特别提醒司机:在生活中,服用有些药物后,可能会出现不同程度的疲倦、嗜睡、困乏和精神不振等,因此在服药后宜稍事休息或小睡,不宜马上驾车,尤其是夏季,驾车族本来就容易犯晕,更要当心“犯困药”可引起驾车族嗜睡或犯困的药有:抗感冒药、抗过敏药、镇静催眠药、抗偏头痛药和治胃反酸药等对驾车族而言,生病时既要吃药,又要保证行车安全,因此合理用药显得格外重要特别需要提醒的是,在上车前4小时尽量不要服药,或是服药后休息6小时再开车;对易产生嗜睡或昏迷的药,服用最佳时间为睡前半小时,既减少对日常生活所带来的不便,又能促进睡眠有些抗感冒药分为日片或夜片,日片不含抗过敏药,极少引起嗜睡,白天宜尽量选用白片对已知有不良反应但离不开的药,上车前可减半量服用,等休息时再补足全量"大师"王林因病死亡,王林大师是怎么死的?2017年泰国10大女网红比中国女网红好看一百倍14岁女酒吧坐台‘事业线’外露一点不害臊身体暴露
"""
    print(model.get_simhash(content=sample_doc))
    print(model.get_keyword(doc_str=sample_doc, is_blank=False))

    # Query words for the similar-word demo; uncomment to try more.
    ws = [
        # '感冒',
        # '高血压'
        # '维生素',
        # '乙肝',
        '婴儿',
        # '小猴子',
        # '营养品',
        # '盆腔炎',
        # '咽喉炎',
        # '高尿酸',
        # '高胆固醇血症'
    ]
    for w in ws:
        print(w)
        for neighbour in model.get_word_similar(w, topn=30):
            print(neighbour)
结果:
0010010001000010000000101100110011100011010010101000100010111111
[('含钾', 0.3054520633861392), ('钾', 0.28989261631141955), ('元素', 0.28500796381274923), ('驾车', 0.23984964041886256), ('犯困', 0.22359016443278812), ('缺钾', 0.20313506276231016), ('日片', 0.19394250188386772), ('药', 0.18726227727860778), ('嗜睡', 0.18180214856953114), ('夏季', 0.14477385758300293)]
对于word2vec:
婴儿
('新生儿', 0.8387792110443115)
('早产儿', 0.78364098072052)
('宝宝', 0.7606385946273804)
('小宝宝', 0.7359695434570312)
('婴幼儿', 0.7072071433067322)
('幼儿', 0.6674544811248779)
('婴儿期', 0.657639741897583)
('宝贝', 0.6358252763748169)
('体重儿', 0.6267213821411133)
('足月儿', 0.6226769685745239)
('男婴', 0.6199154853820801)
('小孩', 0.6098051071166992)
('孩子', 0.6038563847541809)
('乳母', 0.5991641283035278)
('胎儿', 0.5983285903930664)
('小儿', 0.5968100428581238)
('刚出生', 0.5956158638000488)
('出生', 0.594602108001709)
('儿童', 0.589705765247345)
('喂养', 0.5812875032424927)
('母乳', 0.5803599953651428)
('母乳喂养', 0.5779118537902832)
('崽', 0.5771138668060303)
('月龄', 0.5713974237442017)
('婴', 0.5667303204536438)
('患儿', 0.5656975507736206)
('配方奶粉', 0.5621417760848999)
('新生儿期', 0.5526482462882996)
('母亲', 0.5463709831237793)
('孩童', 0.5433803796768188)
[1] simhash算法原理及实现
https://yanyiwu.com/work/2014/01/30/simhash-shi-xian-xiang-jie.html
[2] Part 3: the simhash algorithm
http://matpalm.com/resemblance/simhash/
[3] Similarity Estimation Techniques from Rounding Algorithms
http://www.cs.princeton.edu/courses/archive/spring04/cos598B/bib/CharikarEstim.pdf
[4] tf–idf
https://en.wikipedia.org/wiki/Tf%E2%80%93idf
[5] Luhn, Hans Peter (1957). “A Statistical Approach to Mechanized Encoding and Searching of Literary Information” (PDF). IBM Journal of Research and Development. IBM. 1 (4): 315. doi:10.1147/rd.14.0309. Retrieved 2 March 2015. There is also the probability that the more frequently a notion and combination of notions occur, the more importance the author attaches to them as reflecting the essence of his overall idea.
[6] Spärck Jones, K. (1972). “A Statistical Interpretation of Term Specificity and Its Application in Retrieval”. Journal of Documentation. 28: 11–21. doi:10.1108/eb026526.
[7] Hans Peter Luhn
https://en.wikipedia.org/wiki/Hans_Peter_Luhn
[8] Karen Spärck Jones
https://en.wikipedia.org/wiki/Karen_Sp%C3%A4rck_Jones
[happyprince , http://blog.csdn.net/ld326/article/details/79117241]
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。