当前位置:   article > 正文

NLP16-总结之一[dict,tfidf,word2vec,关键词,simhash]_tfidf word2vec适用

tfidf word2vec适用

一, 介绍

NLP使用有一段时间了,代码总是写得有点凌乱,想总结一下:
1. 在使用时,希望这个是单例,python的单例实现;
2. 通过cmdline来控制;
3. 在一个类中提供加载数据、加载词典及模型的功能:
3.1 构建一个语料词典;
3.2 训练tfidf模型及基于它的关键词抽取;
3.3 训练word2vec及基于它的相关词汇的提取;
3.4 基于关键词特征的simhash的文章编码,形成文章的指纹;并实现基于文章指纹的文章相似度计算。

二,TF-IDF

TF:
The weight of a term that occurs in a document is simply proportional to the term frequency.
IDF:
The specificity of a term can be quantified as an inverse function of the number of documents in which it occurs.
这里写图片描述
详细的参见[4] tf–idf https://en.wikipedia.org/wiki/Tf%E2%80%93idf

三,Simhash

simhash是google用来处理海量文本去重的算法。simhash将文档转换成一个64位的hash码,然后通过两个hash码之间的海明距离D来判断文章是否相似;根据经验,当D小于3时,可以判断两个文档相似。[1]
这里写图片描述

四、基于装饰器的单例

def singleton(cls):
    """Class decorator that turns ``cls`` into a lazily created singleton.

    The decorated name is rebound to a factory function. The first call
    builds the instance with whatever arguments it receives; every later
    call returns that same instance and ignores its arguments.

    :param cls: the class to wrap
    :return: the factory function that stands in for the class
    """
    created = {}

    def wrapper(*args, **kwargs):
        try:
            return created[cls]
        except KeyError:
            pass
        # First request: construct outside the handler above so that a
        # failing constructor is reported on its own.
        instance = created[cls] = cls(*args, **kwargs)
        return instance

    # Executed once, at decoration time, before any instance exists.
    logging.info('singleton size %d' % (len(created)))
    return wrapper
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10

五,分词

from zhon.hanzi import non_stops
# Register the custom vocabulary file with jieba before any segmentation runs.
jieba.load_userdict(DICT_PATH)
# Stop words, one per line of STOPWORDS_PATH. They are stored as dict keys
# (every value is None) so membership tests are hash lookups. The file is read
# inside ``with`` so the handle is closed as soon as the words are loaded.
with open(STOPWORDS_PATH, 'r', encoding='utf-8') as _stops_file:
    STOPS_LIST = {}.fromkeys([line.strip() for line in _stops_file])
# One string holding ASCII punctuation followed by zhon's ``non_stops``
# characters (presumably Chinese non-sentence-ending punctuation; confirm in zhon).
C_PUNCTUATION = string.punctuation + non_stops
def cut_txt(txt, r_type=1):
    """Segment ``txt`` with jieba and drop empty, stop-word and punctuation tokens.

    :param txt: text to segment
    :param r_type: ``2`` returns the kept tokens as a list; any other value
        returns them joined by single spaces
    :return: list of tokens, or one space-joined string, depending on ``r_type``
    """
    kept = []
    for word in jieba.cut(txt, cut_all=False):
        # Filter on the left-stripped form, but keep the token as jieba produced it.
        probe = word.lstrip()
        # ``lstrip`` always returns a str, so blank tokens are rejected by
        # truthiness rather than by comparison with None.
        if not probe:
            continue
        # C_PUNCTUATION is a single string, so ``in`` is a substring test here.
        if probe in STOPS_LIST or probe in C_PUNCTUATION:
            continue
        kept.append(word)
    return kept if r_type == 2 else " ".join(kept)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21

六、model类

# 词典数据加载
# coding=utf-8
import logging
import time

import numpy as np
from gensim import corpora, models
from gensim.models import Word2Vec

class ForDict(object):
    """Re-iterable reader of a tokenised corpus file, used to build the dictionary.

    Every iteration reopens the file and yields, for each line, the list of
    its lower-cased, whitespace-separated tokens.
    """

    def __init__(self, file):
        """
        :param file: path of the UTF-8 text file to read
        """
        self.file = file

    def __iter__(self):
        # ``with`` releases the handle once the generator is exhausted,
        # closed or garbage-collected, instead of leaking it.
        with open(self.file, 'r', encoding='utf-8') as handle:
            for line in handle:
                yield line.lower().split()

# Sentence source for Word2Vec training
class ForWord2Vec(object):
    """Re-iterable reader that yields one token list per line of a corpus file.

    Every iteration reopens the file; lines are split on whitespace and the
    tokens are left in their original case.
    """

    def __init__(self, in_file):
        """
        :param in_file: path of the UTF-8 text file to read
        """
        self.in_file = in_file

    def __iter__(self):
        # ``with`` releases the handle once the generator is exhausted,
        # closed or garbage-collected, instead of leaking it.
        with open(self.in_file, encoding='utf-8') as handle:
            for line in handle:
                yield line.split()


@singleton
class NlpModel(object):
    """Single entry point for the corpus dictionary, TF-IDF and word2vec models.

    Training methods build a model from the segmented corpus and save it to
    the path configured on the instance. Query methods load the saved models
    on first use and keep them on the instance.
    """

    def __init__(self,
                 opts=None,
                 dic_path='%s%s.dict' % (MODEL_PATH, 'all'),
                 tfidf_path='%s%s.tfidf' % (MODEL_PATH, 'all'),
                 word2vec_path='%s%s.w2v' % (MODEL_PATH, 'all')):
        """
        :param opts: run options, either a dict or an object with attributes
            (such as parsed command-line options). Only ``is_load_data`` is
            read. ``None`` means ``{'is_load_data': False}``.
        :param dic_path: file the corpus dictionary is saved to and loaded from
        :param tfidf_path: file the TF-IDF model is saved to and loaded from
        :param word2vec_path: file the word2vec model is saved to and loaded from
        """
        # A fresh dict per instance avoids sharing one mutable default object.
        self.opts = {'is_load_data': False} if opts is None else opts
        # Set once load_data()/combine_cut_data() have run for this instance.
        self.has_load_data = False

        self.dic = None
        self.dic_path = dic_path

        self.tfidf = None
        self.tfidf_path = tfidf_path

        # Not read anywhere inside this class.
        self.has_comebined = False

        self.word2vec = None
        self.word2vec_path = word2vec_path

    def _wants_data_load(self):
        """Return the ``is_load_data`` option, whether opts is a dict or an object."""
        opts = self.opts
        if isinstance(opts, dict):
            return bool(opts.get('is_load_data', False))
        return bool(getattr(opts, 'is_load_data', False))

    def _load_dic(self):
        """Load the corpus dictionary from ``dic_path`` unless already loaded."""
        logging.info('load dict...')
        if self.dic is None:
            self.dic = corpora.Dictionary.load(self.dic_path)
        else:
            logging.info('has loaded dic..')

    def _load_tfidf(self):
        """Load the TF-IDF model from ``tfidf_path`` unless already loaded."""
        if self.tfidf is None:
            self.tfidf = models.TfidfModel.load(self.tfidf_path)
        else:
            logging.info('has loaded tfidf.')

    def _load_word2vec(self):
        """Load the word2vec model from ``word2vec_path`` unless already loaded."""
        if self.word2vec is None:
            self.word2vec = Word2Vec.load(self.word2vec_path)
        else:
            logging.info('word2vec has loaded.')

    def _load_data(self):
        """Run the data download and merge steps once, if the options ask for it."""
        if (not self.has_load_data) and self._wants_data_load():
            logging.info('load data...')
            load_data()
            combine_cut_data()
            self.has_load_data = True
        else:
            # Also reached when loading is simply disabled in the options.
            logging.info('has downed data')

    def train_corpus_dic(self):
        """Build the corpus dictionary from the segmented corpus and save it.

        The dictionary is written to ``self.dic_path``.

        :return: None
        """
        t0 = time.time()
        # 1. Fetch the raw data when requested.
        self._load_data()

        # 2. Build the dictionary from the segmented corpus file.
        logging.info('create dictionary.')
        dict_data = ForDict('%s%s.cut' % (CORPUS_PATH, 'all'))
        dictionary = corpora.Dictionary(dict_data)

        # 3. Save it to the configured path, where _load_dic() reads it back.
        logging.info('save dictionary.')
        dictionary.save(self.dic_path)

        # 4. Log corpus statistics and elapsed time.
        logging.info('number of words:%d; number of num_docs:%d; numbert of num_pos:%d; cost time:%f' % (
            len(dictionary.keys()), dictionary.num_docs, dictionary.num_pos, time.time() - t0))

    def train_tfidf(self, opts=None):
        """Compute the TF-IDF model from the saved dictionary and save it.

        The model is written to ``self.tfidf_path``.

        :param opts: unused; kept so existing callers keep working
        :return: None
        """
        t0 = time.time()
        logging.info('do_keyword begin...')

        # 1. Load the dictionary.
        self._load_dic()

        # 2. Compute TF-IDF with the project's local and global weight functions.
        logging.info('compute tfidf...')
        tfidf = models.TfidfModel(dictionary=self.dic, wlocal=identity_a, wglobal=df2idf_a)

        # 3. Save it to the configured path, where _load_tfidf() reads it back.
        logging.info('save tfidf...')
        tfidf.save(self.tfidf_path)
        logging.info('do_keyword end.cost:%f' % (time.time() - t0))

    def train_word2vec(self):
        """Train word2vec on the segmented corpus and save the model.

        The model is written to ``self.word2vec_path``.

        :return: None
        """
        t0 = time.time()
        # 1. Fetch the raw data when requested.
        self._load_data()

        # 2. Train on the segmented corpus, one sentence per line.
        logging.info('begin word2vec...')
        sentences = ForWord2Vec('%s%s.cut' % (CORPUS_PATH, 'all'))
        # NOTE(review): ``size`` is the keyword used by gensim releases before
        # 4.0 (later renamed ``vector_size``); confirm the pinned gensim version.
        m_word2vec = Word2Vec(sentences, size=100, window=5, min_count=5, workers=4)

        # 3. Save it to the configured path, where _load_word2vec() reads it back.
        m_word2vec.save(self.word2vec_path)
        logging.info('end word2vec. cost:%fs' % (time.time() - t0))

    def get_keyword(self, doc_str=None, top_k=10, is_blank=False):
        """Extract the highest-weighted TF-IDF terms of a document.

        :param doc_str: document text; when empty or None nothing is computed
        :param top_k: maximum number of keywords to return
        :param is_blank: True if ``doc_str`` is already segmented and
            whitespace-separated; False to segment it with ``cut_txt``
        :return: list of ``(word, weight)`` pairs sorted by descending weight,
            or None when ``doc_str`` is empty
        """
        t0 = time.time()
        logging.info('do_keyword begin...')
        # Load the dictionary and the TF-IDF model on first use.
        self._load_dic()
        logging.info('load tfidf...')
        self._load_tfidf()

        rs = None
        if doc_str:
            logging.info('leng of doc_str:%s' % (len(doc_str)))
            tokens = doc_str.split() if is_blank else cut_txt(doc_str, r_type=2)
            vec_bow = self.dic.doc2bow(tokens, allow_update=False)
            ranked = sorted(self.tfidf[vec_bow], key=lambda pair: pair[1], reverse=True)
            # Slicing already copes with top_k larger than the number of terms.
            rs = [(self.dic[token_id], weight) for token_id, weight in ranked[:top_k]]
        logging.info('do_keyword end.cost time:%f' % (time.time() - t0))
        return rs

    def _string_hash(self, source):
        """Hash a string to a 64-character string of '0'/'1'.

        :param source: text to hash
        :return: the low 64 bits of the hash, zero-padded on the left; the
            empty string maps to 64 zeros so the result is always the same
            type and length
        """
        if source == "":
            return '0' * 64
        x = ord(source[0]) << 7
        m = 1000003
        mask = 2 ** 128 - 1
        for c in source:
            x = ((x * m) ^ ord(c)) & mask
        x ^= len(source)
        # x is never negative here, so formatting the low 64 bits gives the
        # zero-padded binary text directly.
        return format(x & (2 ** 64 - 1), '064b')

    def get_simhash(self, content, top_k=200):
        """Compute the simhash fingerprint of a document.

        Each keyword is hashed to 64 bits; its rounded TF-IDF weight is added
        for every 1 bit and subtracted for every 0 bit, and the sign of each
        column sum gives one bit of the fingerprint.

        :param content: document text
        :param top_k: number of keywords used as features
        :return: string of '0'/'1', one character per hash bit; ``''`` when
            ``content`` is empty or an error occurred, ``'00'`` when the
            document yields no keywords
        """
        t0 = time.time()
        logging.info('get simhash.')
        simhash = []
        try:
            keywords = self.get_keyword(content, top_k=top_k, is_blank=False)
            if keywords is None:
                # Empty content: there is nothing to fingerprint.
                logging.warning('get simhash: empty content.')
                return ''
            if not keywords:
                return '00'
            weighted = []
            for feature, weight in keywords:
                weight = round(weight, 6)
                weighted.append([weight if bit == '1' else -weight
                                 for bit in self._string_hash(feature)])
            totals = np.sum(np.array(weighted), axis=0)
            simhash = ['1' if total > 0 else '0' for total in totals]
        except Exception as e:
            # Deliberately best-effort: log the failure and return ''.
            logging.error('Simhash.simhash err:%s' % e)
        finally:
            logging.info('get hash end.cost time:%f' % (time.time() - t0))
        return ''.join(simhash)

    def get_word_similar(self, word='婴儿', topn=10):
        """Return the words most similar to ``word`` according to word2vec.

        :param word: query word
        :param topn: number of similar words to return
        :return: whatever ``similar_by_word`` returns (presumably a list of
            ``(word, similarity)`` pairs; confirm against the gensim version)
        """
        t0 = time.time()
        self._load_word2vec()
        rs = self.word2vec.wv.similar_by_word(word, topn=topn)
        logging.info('word similar end.cost time %s' % (time.time() - t0))
        return rs
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43
  • 44
  • 45
  • 46
  • 47
  • 48
  • 49
  • 50
  • 51
  • 52
  • 53
  • 54
  • 55
  • 56
  • 57
  • 58
  • 59
  • 60
  • 61
  • 62
  • 63
  • 64
  • 65
  • 66
  • 67
  • 68
  • 69
  • 70
  • 71
  • 72
  • 73
  • 74
  • 75
  • 76
  • 77
  • 78
  • 79
  • 80
  • 81
  • 82
  • 83
  • 84
  • 85
  • 86
  • 87
  • 88
  • 89
  • 90
  • 91
  • 92
  • 93
  • 94
  • 95
  • 96
  • 97
  • 98
  • 99
  • 100
  • 101
  • 102
  • 103
  • 104
  • 105
  • 106
  • 107
  • 108
  • 109
  • 110
  • 111
  • 112
  • 113
  • 114
  • 115
  • 116
  • 117
  • 118
  • 119
  • 120
  • 121
  • 122
  • 123
  • 124
  • 125
  • 126
  • 127
  • 128
  • 129
  • 130
  • 131
  • 132
  • 133
  • 134
  • 135
  • 136
  • 137
  • 138
  • 139
  • 140
  • 141
  • 142
  • 143
  • 144
  • 145
  • 146
  • 147
  • 148
  • 149
  • 150
  • 151
  • 152
  • 153
  • 154
  • 155
  • 156
  • 157
  • 158
  • 159
  • 160
  • 161
  • 162
  • 163
  • 164
  • 165
  • 166
  • 167
  • 168
  • 169
  • 170
  • 171
  • 172
  • 173
  • 174
  • 175
  • 176
  • 177
  • 178
  • 179
  • 180
  • 181
  • 182
  • 183
  • 184
  • 185
  • 186
  • 187
  • 188
  • 189
  • 190
  • 191
  • 192
  • 193
  • 194
  • 195
  • 196
  • 197
  • 198
  • 199
  • 200
  • 201
  • 202
  • 203
  • 204
  • 205
  • 206
  • 207
  • 208
  • 209
  • 210
  • 211
  • 212
  • 213
  • 214
  • 215
  • 216
  • 217
  • 218
  • 219
  • 220
  • 221
  • 222
  • 223
  • 224
  • 225
  • 226
  • 227
  • 228
  • 229
  • 230
  • 231
  • 232
  • 233
  • 234
  • 235

七、参数解析

from optparse import OptionParser

usage = "usage: %prog [options] arg1 arg2"
MODEL_OP = OptionParser(usage=usage)

# Every supported option as (flags, settings), in registration order:
#   -t/--type          which step to run
#   -o/--output        output model file name
#   -l/--is_load_data  switch that turns data loading on
_OPTION_SPECS = (
    (("-t", "--type"),
     dict(dest='type',
          default='keyword',
          help="types[keyword,dict,tfidf,word2vec]")),
    (("-o", "--output"),
     dict(dest="model",
          default="test.model",
          help="output model file name")),
    (("-l", "--is_load_data"),
     dict(action="store_true",
          dest="is_load_data",
          default=False,
          help="whether will be loaded data or not")),
)
for _flags, _settings in _OPTION_SPECS:
    MODEL_OP.add_option(*_flags, **_settings)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18

八, 设置日志

import logging
# Logging setup: everything from DEBUG up goes to a dated file under LOGPATH,
# and INFO and above is echoed to the console in the same layout.
_LOG_FORMAT = '%(asctime)s %(levelname)-8s: %(threadName)s %(module)s::%(filename)s::%(funcName)s[line:%(lineno)d] %(message)s'
_LOG_DATE_FORMAT = '%Y-%m-%d %H:%M:%S'

logging.basicConfig(
    filename='%s/maodel_%s.log' % (LOGPATH, datetime.datetime.now().strftime('%Y-%m-%d')),
    filemode=LOGMODE,
    level=logging.DEBUG,
    format=_LOG_FORMAT,
    datefmt=_LOG_DATE_FORMAT,
)

# Console handler attached to the root logger.
console = logging.StreamHandler()
console.setLevel(logging.INFO)
formatter = logging.Formatter(_LOG_FORMAT, datefmt=_LOG_DATE_FORMAT)
console.setFormatter(formatter)
logging.getLogger('').addHandler(console)

# Start-up banner.
logging.info("""
   train...train...train...
   *   *   *   *   *   *
     -       -       -
   train...train...train...
""")
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23

九. Main方法

if __name__ == '__main__':
    # Command-line entry point: parse the options, then run the requested
    # training step.
    logging.info('start args:%s', str(sys.argv))
    (opts, args) = MODEL_OP.parse_args(sys.argv[1:])

    model = NlpModel(opts)
    if args:
        # Positional arguments are not accepted.
        MODEL_OP.error("this script takes no arguments.")
        sys.exit(1)

    # Training step for each supported --type value. Any other value,
    # including the default 'keyword', runs nothing.
    trainers = {
        'dict': model.train_corpus_dic,  # corpus dictionary
        'tfidf': model.train_tfidf,  # TF-IDF model
        'word2vec': model.train_word2vec,  # word2vec model
    }
    train = trainers.get(opts.type)
    if train is not None:
        logging.info(opts.type)
        train()
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20

十、测试

# Obtain the model instance without passing any options.
model = NlpModel()
# Print the simhash fingerprint of a sample article.
# NOTE: the literal opens with four quote characters, so the text handed to
# the model begins with a stray '"' followed by a newline.
print(model.get_simhash(content=""""
夏季怎么吃才能不犯困 ;钾是人体内不可缺少的元素,一般成年人体内的含钾元素150g左右,其作用主要是维持神经、肌肉的正常功能因此,人体一旦缺钾,正常的运动就会受到影响夏季缺钾不仅精力和体力下降,而且耐热能力也会降低,使人感到倦怠无力严重缺钾时,可导致人体内酸碱平衡失调、代谢紊乱、心律失常,全身肌肉无力、懒动此时,有些人为了使自己少出汗而过量地饮用盐开水殊不知,这样做又容易加重心脏负担,使体内钾、钠平衡失调而适当补充钾元素则有利于改善体内钾、钠平衡,既可以防止血压上升,又可防止血压过低下面介绍一些含钾元素较高的食物困了怎么办?告诉你几种防犯困食物一、粮食中,以荞麦、玉米、红薯、大豆等含钾元素较高二、水果中,以香蕉含钾元素最丰富三、蔬菜中,以菠菜、苋菜、香菜、油菜、甘蓝、芹菜、大葱、青蒜、莴笋、土豆、山药、鲜豌豆、毛豆等含钾元素较高四、海藻类,含钾元素相当丰富,如紫菜每百克含钾1640毫克,是含钠的175倍;海带含钾是含钠的22倍;羊栖菜含钾是钠的3.1倍因此,紫菜汤、紫菜蒸鱼、紫菜肉丸、凉拌海带丝、海带炖肉等都是夏季补钾菜肴的上品特别提醒司机:在生活中,服用有些药物后,可能会出现不同程度的疲倦、嗜睡、困乏和精神不振等,因此在服药后宜稍事休息或小睡,不宜马上驾车,尤其是夏季,驾车族本来就容易犯晕,更要当心“犯困药”可引起驾车族嗜睡或犯困的药有:抗感冒药、抗过敏药、镇静催眠药、抗偏头痛药和治胃反酸药等对驾车族而言,生病时既要吃药,又要保证行车安全,因此合理用药显得格外重要特别需要提醒的是,在上车前4小时尽量不要服药,或是服药后休息6小时再开车;对易产生嗜睡或昏迷的药,服用最佳时间为睡前半小时,既减少对日常生活所带来的不便,又能促进睡眠有些抗感冒药分为日片或夜片,日片不含抗过敏药,极少引起嗜睡,白天宜尽量选用白片对已知有不良反应但离不开的药,上车前可减半量服用,等休息时再补足全量"大师"王林因病死亡,王林大师是怎么死的?2017年泰国10大女网红比中国女网红好看一百倍14岁女酒吧坐台‘事业线’外露一点不害臊身体暴露
"""))

# Print the keywords of a sample article (default top_k). The text is raw,
# so is_blank=False has it segmented before scoring. The literal opens with
# four quote characters here as well.
print(model.get_keyword(doc_str=""""
夏季怎么吃才能不犯困 ;钾是人体内不可缺少的元素,一般成年人体内的含钾元素150g左右,其作用主要是维持神经、肌肉的正常功能因此,人体一旦缺钾,正常的运动就会受到影响夏季缺钾不仅精力和体力下降,而且耐热能力也会降低,使人感到倦怠无力严重缺钾时,可导致人体内酸碱平衡失调、代谢紊乱、心律失常,全身肌肉无力、懒动此时,有些人为了使自己少出汗而过量地饮用盐开水殊不知,这样做又容易加重心脏负担,使体内钾、钠平衡失调而适当补充钾元素则有利于改善体内钾、钠平衡,既可以防止血压上升,又可防止血压过低下面介绍一些含钾元素较高的食物困了怎么办?告诉你几种防犯困食物一、粮食中,以荞麦、玉米、红薯、大豆等含钾元素较高二、水果中,以香蕉含钾元素最丰富三、蔬菜中,以菠菜、苋菜、香菜、油菜、甘蓝、芹菜、大葱、青蒜、莴笋、土豆、山药、鲜豌豆、毛豆等含钾元素较高四、海藻类,含钾元素相当丰富,如紫菜每百克含钾1640毫克,是含钠的175倍;海带含钾是含钠的22倍;羊栖菜含钾是钠的3.1倍因此,紫菜汤、紫菜蒸鱼、紫菜肉丸、凉拌海带丝、海带炖肉等都是夏季补钾菜肴的上品特别提醒司机:在生活中,服用有些药物后,可能会出现不同程度的疲倦、嗜睡、困乏和精神不振等,因此在服药后宜稍事休息或小睡,不宜马上驾车,尤其是夏季,驾车族本来就容易犯晕,更要当心“犯困药”可引起驾车族嗜睡或犯困的药有:抗感冒药、抗过敏药、镇静催眠药、抗偏头痛药和治胃反酸药等对驾车族而言,生病时既要吃药,又要保证行车安全,因此合理用药显得格外重要特别需要提醒的是,在上车前4小时尽量不要服药,或是服药后休息6小时再开车;对易产生嗜睡或昏迷的药,服用最佳时间为睡前半小时,既减少对日常生活所带来的不便,又能促进睡眠有些抗感冒药分为日片或夜片,日片不含抗过敏药,极少引起嗜睡,白天宜尽量选用白片对已知有不良反应但离不开的药,上车前可减半量服用,等休息时再补足全量"大师"王林因病死亡,王林大师是怎么死的?2017年泰国10大女网红比中国女网红好看一百倍14岁女酒吧坐台‘事业线’外露一点不害臊身体暴露
""", is_blank=False))
# Query words for the word2vec similarity demo. Only the uncommented entry is
# run; the others are kept as ready-made alternatives.
ws = [
    # '感冒',
    # '高血压'
    # '维生素',
    # '乙肝',
    '婴儿',
    # '小猴子',
    # '营养品',
    # '盆腔炎',
    # '咽喉炎',
    # '高尿酸',
    # '高胆固醇血症'
]
for w in ws:
    # Print the query word, then its 30 nearest neighbours, one per line.
    print(w)
    a = model.get_word_similar(w, topn=30)
    for neighbour in a:
        print(neighbour)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
结果:
0010010001000010000000101100110011100011010010101000100010111111
[('含钾', 0.3054520633861392), ('钾', 0.28989261631141955), ('元素', 0.28500796381274923), ('驾车', 0.23984964041886256), ('犯困', 0.22359016443278812), ('缺钾', 0.20313506276231016), ('日片', 0.19394250188386772), ('药', 0.18726227727860778), ('嗜睡', 0.18180214856953114), ('夏季', 0.14477385758300293)]
对于word2vec:
婴儿
('新生儿', 0.8387792110443115)
('早产儿', 0.78364098072052)
('宝宝', 0.7606385946273804)
('小宝宝', 0.7359695434570312)
('婴幼儿', 0.7072071433067322)
('幼儿', 0.6674544811248779)
('婴儿期', 0.657639741897583)
('宝贝', 0.6358252763748169)
('体重儿', 0.6267213821411133)
('足月儿', 0.6226769685745239)
('男婴', 0.6199154853820801)
('小孩', 0.6098051071166992)
('孩子', 0.6038563847541809)
('乳母', 0.5991641283035278)
('胎儿', 0.5983285903930664)
('小儿', 0.5968100428581238)
('刚出生', 0.5956158638000488)
('出生', 0.594602108001709)
('儿童', 0.589705765247345)
('喂养', 0.5812875032424927)
('母乳', 0.5803599953651428)
('母乳喂养', 0.5779118537902832)
('崽', 0.5771138668060303)
('月龄', 0.5713974237442017)
('婴', 0.5667303204536438)
('患儿', 0.5656975507736206)
('配方奶粉', 0.5621417760848999)
('新生儿期', 0.5526482462882996)
('母亲', 0.5463709831237793)
('孩童', 0.5433803796768188)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35

十一、参考

[1] simhash算法原理及实现
https://yanyiwu.com/work/2014/01/30/simhash-shi-xian-xiang-jie.html
[2]part 3: the simhash algorithm
http://matpalm.com/resemblance/simhash/
[3] Similarity Estimation Techniques from Rounding Algorithms
http://www.cs.princeton.edu/courses/archive/spring04/cos598B/bib/CharikarEstim.pdf
[4] tf–idf
https://en.wikipedia.org/wiki/Tf%E2%80%93idf
[5] Luhn, Hans Peter (1957). “A Statistical Approach to Mechanized Encoding and Searching of Literary Information” (PDF). IBM Journal of research and development. IBM. 1 (4): 315.doi:10.1147/rd.14.0309. Retrieved 2 March 2015. There is also the probability that the more frequently a notion and combination of notions occur, the more importance the author attaches to them as reflecting the essence of his overall idea.
[6] Spärck Jones, K. (1972). “A Statistical Interpretation of Term Specificity and Its Application in Retrieval”. Journal of Documentation. 28: 11–21. doi:10.1108/eb026526.
[7] Hans Peter Luhn
https://en.wikipedia.org/wiki/Hans_Peter_Luhn
[8]Karen Spärck Jones: https://en.wikipedia.org/wiki/Karen_Sp%C3%A4rck_Jones

[happyprince , http://blog.csdn.net/ld326/article/details/79117241]

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/AllinToyou/article/detail/353469
推荐阅读
相关标签
  

闽ICP备14008679号