赞
踩
def add_endings(text):
    """Split *text* into sentences and wrap each in <s> ... </s> markers.

    Args:
        text: the article as a single string.
    Returns:
        The rebuilt article as one string, where every sentence is emitted
        as "<s> sentence </s> " (note the trailing space), matching the
        token layout expected by preprocess().
    """
    sentences = nltk.sent_tokenize(text)
    # str.join is linear; the original repeated += was quadratic.
    return "".join("<s> " + sentence + " </s> " for sentence in sentences)


def preprocess(text):
    """Normalize an article and return its token list.

    Lowercases the text, inserts <s>/</s> sentence-boundary markers, then
    splits on punctuation and whitespace.

    Args:
        text: the raw article string.
    Returns:
        List of tokens.  The split pattern deliberately excludes '<', '>'
        and '/', so the boundary markers survive as tokens.  NOTE: the
        split may yield empty strings at the edges; bigrams() skips them.
    """
    text = text.lower()            # case-fold first
    text = add_endings(text)       # sentence-split and add boundary markers
    pattern = re.compile(r"[-\[\]()\t\n.,;!?“”‘'\\`~\s]+")
    return pattern.split(text)


def unigrams(tokens):
    """Build a unigram frequency model.

    Args:
        tokens: normalized, tokenized sequence.
    Returns:
        dict mapping each unique token to its occurrence count in the
        corpus.
    """
    unigram = {}
    for word in tokens:
        unigram[word] = unigram.get(word, 0) + 1
    return unigram


def bigrams(tokens):
    """Build a bigram frequency model.

    Args:
        tokens: normalized, tokenized sequence.
    Returns:
        dict mapping "word1 word2" strings to their occurrence counts.
        Pairs beginning with the sentence-end marker "</s>" or with an
        empty token are skipped.

    Bug fix: the original indexed tokens[index + 1] unconditionally and
    raised IndexError whenever the final token was neither "</s>" nor "";
    the loop now stops at the penultimate token.
    """
    bigram = {}
    for index, word in enumerate(tokens[:-1]):
        if word != "</s>" and word != "":
            bi_grammar = word + " " + tokens[index + 1]
            bigram[bi_grammar] = bigram.get(bi_grammar, 0) + 1
    return bigram
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。