import logging

import numpy
import torch

from allennlp.common.checks import ConfigurationError
from allennlp.common.tqdm import Tqdm
from allennlp.data.vocabulary import Vocabulary
from allennlp.modules.token_embedders import TokenEmbedder
from allennlp.modules.token_embedders.embedding import EmbeddingsTextFile

logger = logging.getLogger(__name__)


@TokenEmbedder.register("glove_embedding")  # corresponds to the "type" under "tokens" in the JSON config
class GloVeEmbedding(TokenEmbedder):
    # GloVeEmbedding is identical to allennlp's Embedding class, except that the function it
    # uses to read the pretrained embedding file has to be rewritten. The class body (copied
    # unchanged from allennlp.modules.token_embedders.embedding) is omitted here; only
    # _read_embeddings_from_text_file is modified, in the 3 places marked below.
    ...


def _read_embeddings_from_text_file(
    file_uri: str, embedding_dim: int, vocab: Vocabulary, namespace: str = "tokens"
) -> torch.FloatTensor:
    """
    Read pre-trained word vectors from an eventually compressed text file, possibly contained
    inside an archive with multiple files. The text file is assumed to be utf-8 encoded with
    space-separated fields: [word] [dim 1] [dim 2] ...

    Lines that contain more numerical tokens than `embedding_dim` raise a warning and are skipped.

    The remainder of the docstring is identical to `_read_pretrained_embeddings_file`.
    """
    tokens_to_keep = set(vocab.get_index_to_token_vocabulary(namespace).values())
    vocab_size = vocab.get_vocab_size(namespace)
    char_embeddings = {}  # 1. add a char-embedding accumulator
    embeddings = {}

    # First we read the embeddings from the file, only keeping vectors for the words we need.
    logger.info("Reading pretrained embeddings from file")

    with EmbeddingsTextFile(file_uri) as embeddings_file:
        for line in Tqdm.tqdm(embeddings_file):
            token = line.split(" ", 1)[0]
            if token in tokens_to_keep:
                fields = line.rstrip().split(" ")
                if len(fields) - 1 != embedding_dim:
                    # Sometimes there are funny unicode parsing problems that lead to different
                    # fields lengths (e.g., a word with a unicode space character that splits
                    # into more than one column). We skip those lines. Note that if you have
                    # some kind of long header, this could result in all of your lines getting
                    # skipped. It's hard to check for that here; you just have to look in the
                    # embedding_misses_file and at the model summary to make sure things look
                    # like they are supposed to.
                    logger.warning(
                        "Found line with wrong number of dimensions (expected: %d; actual: %d): %s",
                        embedding_dim,
                        len(fields) - 1,
                        line,
                    )
                    continue

                vector = numpy.asarray(fields[1:], dtype="float32")
                # 2. for every character of the token, accumulate (vector sum, count) over all
                #    pretrained words the character appears in
                for char in list(token):
                    if char in char_embeddings:
                        char_embeddings[char] = (
                            char_embeddings[char][0] + vector,
                            char_embeddings[char][1] + 1,
                        )
                    else:
                        char_embeddings[char] = (vector, 1)
                embeddings[token] = vector

    if not embeddings:
        raise ConfigurationError(
            "No embeddings of correct dimension found; you probably "
            "misspecified your embedding_dim parameter, or didn't "
            "pre-populate your Vocabulary"
        )

    # char vector = accumulated vector sum / number of occurrences
    char_embeddings = {
        char: char_embeddings[char][0] / char_embeddings[char][1] for char in char_embeddings
    }
    chars = set(char_embeddings.keys())

    all_embeddings = numpy.asarray(list(embeddings.values()))
    embeddings_mean = float(numpy.mean(all_embeddings))
    embeddings_std = float(numpy.std(all_embeddings))
    # Now we initialize the weight matrix for an embedding layer, starting with random vectors,
    # then filling in the word vectors we just read.
    logger.info("Initializing pre-trained embedding layer")
    embedding_matrix = torch.FloatTensor(vocab_size, embedding_dim).normal_(
        embeddings_mean, embeddings_std
    )
    num_tokens_found = 0
    index_to_token = vocab.get_index_to_token_vocabulary(namespace)
    for i in range(vocab_size):
        token = index_to_token[i]

        # If we don't have a pre-trained vector for this word, we'll just leave this row alone,
        # so the word has a random initialization.
        if token in embeddings:
            embedding_matrix[i] = torch.FloatTensor(embeddings[token])
            num_tokens_found += 1
        elif len(set(token) - chars) == 0:
            # 3. if every character of the token has a character vector, use the sum of those
            #    character vectors as the word vector, and count the token as covered by the
            #    pretrained file
            embedding_matrix[i] = torch.FloatTensor(
                [char_embeddings[char] for char in list(token)]
            ).sum(dim=-2)
            num_tokens_found += 1
        else:
            logger.debug(
                "Token %s was not found in the embedding file. Initialising randomly.", token
            )

    logger.info(
        "Pretrained embeddings were found for %d out of %d tokens", num_tokens_found, vocab_size
    )

    return embedding_matrix
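To make the character-composition logic concrete, here is a minimal standalone sketch (toy 2-dimensional vectors and made-up tokens, not part of the original code) of what modifications 2 and 3 compute: each character vector is the average of the vectors of the pretrained words it appears in, and an out-of-vocabulary token whose characters are all covered gets the sum of its character vectors.

import numpy as np

# Toy pretrained vectors (2-dimensional for illustration; real GloVe here is 300-dimensional).
pretrained = {
    "cat": np.array([1.0, 0.0], dtype="float32"),
    "act": np.array([0.0, 1.0], dtype="float32"),
}

# Modification 2: accumulate (vector sum, count) per character.
char_sums = {}
for token, vec in pretrained.items():
    for ch in token:
        s, n = char_sums.get(ch, (np.zeros_like(vec), 0))
        char_sums[ch] = (s + vec, n + 1)

# Character vector = accumulated sum / number of occurrences.
char_vecs = {ch: s / n for ch, (s, n) in char_sums.items()}

# Modification 3: an OOV token whose characters are all known gets the
# sum of its character vectors instead of a random initialization.
oov = "tac"
if set(oov) <= set(char_vecs):
    vec = sum(char_vecs[ch] for ch in oov)
    print(vec)  # [1.5 1.5] with the toy numbers above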
{
    "token_embedders": {
        "tokens": {
            "type": "glove_embedding",
            "trainable": false,
            "embedding_dim": 300,
            "pretrained_file": "path/glove.6B.300d.txt"
        }
    }
}
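In a complete training config this token_embedders block does not sit at the top level; it goes under the model's text_field_embedder. A minimal sketch of the nesting (the surrounding model fields are placeholders, not from the original config):

"model": {
    "type": "...",
    "text_field_embedder": {
        "token_embedders": {
            "tokens": {
                "type": "glove_embedding",
                "trainable": false,
                "embedding_dim": 300,
                "pretrained_file": "path/glove.6B.300d.txt"
            }
        }
    }
}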
allennlp train -s allennlp_model/model -f --include-package allennlp_model.embedder.glove_embedder allennlp_model/run_glove_embedder.json
--include-package path1.path2.py_file_name  (the argument is a dotted Python module path: the package directories, then the .py file name without its extension)
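For the command above, this corresponds to a project layout roughly like the following (inferred from the paths in the command; the original post does not show the directory tree explicitly):

allennlp_model/
    embedder/
        glove_embedder.py        # defines GloVeEmbedding and the modified reader
    run_glove_embedder.json      # the config shown above
    model/                       # serialization directory created by -s (overwritten when -f is given)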