赞
踩
具体步骤:
import collections  # built-in container types; Counter is used for token frequencies
import re  # regular expressions, used to normalize the raw text


def read_time_machine(path='/home/kesci/input/timemachine7163/timemachine.txt'):
    """Read the corpus file and return its lines in normalized form.

    Every line is stripped of surrounding whitespace, lower-cased, and each
    run of characters other than ``a``-``z`` is replaced by a single space.

    Args:
        path: Location of the text file. Defaults to the dataset path used
            by the original notebook.

    Returns:
        A list with one normalized string per line of the file.
    """
    with open(path, 'r') as f:
        return [re.sub('[^a-z]+', ' ', line.strip().lower()) for line in f]


def tokenize(sentences, token='word'):
    """Split sentences into word or char tokens.

    Args:
        sentences: Iterable of strings.
        token: ``'word'`` splits each sentence on single spaces;
            ``'char'`` splits each sentence into its characters.

    Returns:
        A list of token lists, one per sentence. For any other ``token``
        value an error message is printed and ``None`` is returned.
    """
    if token == 'word':
        return [sentence.split(' ') for sentence in sentences]
    if token == 'char':
        return [list(sentence) for sentence in sentences]
    # NOTE(review): kept as print + None so existing callers do not start
    # seeing an exception; consider raising ValueError instead.
    print('ERROR: unkown token type '+token)
    return None


def count_corpus(sentences):
    """Count how often each token occurs across all tokenized sentences.

    Args:
        sentences: Iterable of token sequences.

    Returns:
        A ``collections.Counter`` mapping token -> number of occurrences.
    """
    return collections.Counter(tk for st in sentences for tk in st)


class Vocab(object):
    """Two-way mapping between tokens and integer indices.

    Attributes are plain instance fields:
      * ``token_freqs``  - list of ``(token, frequency)`` pairs.
      * ``idx_to_token`` - list where position ``i`` holds the token with index ``i``.
      * ``token_to_idx`` - dict mapping each token to its index.
      * ``unk``          - index returned for tokens missing from the vocabulary.
      * ``pad``, ``bos``, ``eos`` - only set when ``use_special_tokens`` is true.
    """

    def __init__(self, tokens, min_freq=0, use_special_tokens=False):
        """Build the vocabulary from tokenized sentences.

        Args:
            tokens: Iterable of token sequences (as produced by ``tokenize``).
            min_freq: Tokens occurring fewer than this many times are left out.
            use_special_tokens: When true, reserve indices 0-3 for padding,
                begin-of-sentence, end-of-sentence and unknown; otherwise
                only index 0 is reserved for unknown.
        """
        counter = count_corpus(tokens)  # token -> frequency
        self.token_freqs = list(counter.items())
        if use_special_tokens:
            # padding, begin of sentence, end of sentence, unknown
            self.pad, self.bos, self.eos, self.unk = (0, 1, 2, 3)
            self.idx_to_token = ['<pad>', '<bos>', '<eos>', '<unk>']
        else:
            self.unk = 0
            self.idx_to_token = ['<unk>']
        # Counter keys are already unique, so only the reserved tokens can
        # clash; a set lookup avoids rescanning the growing list per token.
        reserved = set(self.idx_to_token)
        self.idx_to_token += [token for token, freq in self.token_freqs
                              if freq >= min_freq and token not in reserved]
        self.token_to_idx = {token: idx
                             for idx, token in enumerate(self.idx_to_token)}

    def __len__(self):
        """Return the number of entries, reserved tokens included."""
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        """Map a token, or a (possibly nested) list/tuple of tokens, to indices.

        Tokens that are not in the vocabulary map to ``self.unk``.
        """
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        """Map an index, or a list/tuple of indices, back to tokens."""
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]


if __name__ == '__main__':
    # Load the corpus and report how many lines it has.
    lines = read_time_machine()
    print('# sentences %d' % len(lines))

    # Tokenize into words and show the first two tokenized lines.
    tokens = tokenize(lines)
    print(tokens[0:2])

    # Build the vocabulary and show its first ten (token, index) pairs.
    vocab = Vocab(tokens)
    print(list(vocab.token_to_idx.items())[0:10])

    # Convert two tokenized lines into their index sequences.
    for i in range(8, 10):
        print('words:', tokens[i])
        print('indices:', vocab[tokens[i]])
上述分词方式的缺点:
标点符号通常可以提供语义信息,但上述方法直接将其丢弃了
类似 "shouldn't"、"Mr." 这样的词会被错误地处理
# Sample sentence containing an abbreviation ("Mr.") and a contraction ("doesn't").
text = "Mr. Chen doesn't agree with my suggestion."

# --- spaCy: run the loaded pipeline on the text and print each token's text ---
import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp(text)
spacy_tokens = [tok.text for tok in doc]
print(spacy_tokens)

# --- NLTK: tokenize the same sentence with word_tokenize ---
from nltk import data
from nltk.tokenize import word_tokenize

# Add the local NLTK data directory to the search path before tokenizing.
data.path.append('/home/kesci/input/nltk_data3784/nltk_data')
nltk_tokens = word_tokenize(text)
print(nltk_tokens)
语言模型的目标:评估序列是否合理,即计算该序列的概率
语言模型:
需要的数学知识:条件概率;马尔可夫假设
马尔可夫假设是指一个词的出现只与前面 $n$ 个词相关,即 $n$ 阶马尔可夫链(Markov chain of order $n$)。如果 $n=1$,那么有 $P(w_3 \mid w_1, w_2) = P(w_3 \mid w_2)$。基于 $n-1$ 阶马尔可夫链,我们可以将语言模型改写为
$$P(w_1, w_2, \ldots, w_T) = \prod_{t=1}^{T} P(w_t \mid w_{t-(n-1)}, \ldots, w_{t-1}).$$
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。