当前位置:   article > 正文

NLP:Python 实现unigram(一元组)和bigram(二元组)_python unigram

python unigram
def add_endings(text):
    """Split *text* into sentences and wrap each with sentence-boundary markers.

    Each sentence is wrapped as ``<s> sentence </s> `` (note the trailing
    space, which keeps markers whitespace-separated for the later regex
    split), and all wrapped sentences are concatenated into one string.

    Parameters
    ----------
    text : str
        Raw article text.

    Returns
    -------
    str
        The text with ``<s>`` / ``</s>`` markers inserted around every sentence.
    """
    sentences = nltk.sent_tokenize(text)
    # str.join builds the result in one pass instead of quadratic `+=`.
    return "".join("<s> " + sentence + " </s> " for sentence in sentences)


def preprocess(text):
    """Normalize raw article text and return its token list.

    Lower-cases the text, inserts ``<s>`` / ``</s>`` sentence markers via
    ``add_endings``, then splits on punctuation/whitespace runs.
    """
    lowered = text.lower()
    marked = add_endings(lowered)  # sentence segmentation + boundary markers
    # print( "经过预处理的文章:\n", marked )
    splitter = re.compile( r"[-\[\]()\t\n.,;!?“”‘'\\`~\s]+" )
    return splitter.split(marked)


def unigrams(tokens):
    """Build a unigram language model from a normalized token sequence.

    Returns a dict mapping each distinct token to the number of times it
    occurs in *tokens*.
    """
    counts = {}
    for token in tokens:
        # dict.get with a default collapses the if/else membership check.
        counts[token] = counts.get(token, 0) + 1
    return counts


def bigrams(tokens):
    """Build a bigram language model from a normalized token sequence.

    Returns a dict mapping each bigram string (``"first second"``) to the
    number of times it occurs in *tokens*. Pairs whose first token is the
    sentence-end marker ``"</s>"`` or the empty string are skipped, so
    bigrams never span a sentence boundary.

    Bug fixed: the original indexed ``tokens[index + 1]`` inside an
    ``enumerate`` loop, which raises ``IndexError`` whenever the final
    token is neither ``"</s>"`` nor ``""``. ``zip(tokens, tokens[1:])``
    stops at the last adjacent pair, so no out-of-range access can occur.
    """
    bigram = {}
    # zip over the sequence and its one-step shift yields adjacent pairs.
    for first, second in zip(tokens, tokens[1:]):
        if first != "</s>" and first != "":
            key = first + " " + second
            bigram[key] = bigram.get(key, 0) + 1
    return bigram
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43
  • 44
  • 45
  • 46
  • 47
本文内容由网友自发贡献,转载请注明出处:【wpsshop博客】
推荐阅读
相关标签
  

闽ICP备14008679号