**Preface:** I had the good fortune to take part in translating *Real-World Natural Language Processing*, mostly handling the rough-draft translation, and I learned a great deal from it. I was thrilled when my printed copy arrived, but when I scanned the QR code on the back of the book I found that the "source code" inside is just plain text that cannot be clicked or downloaded. So I decided to post the code here and run all of it myself. Some of the packages are outdated and can no longer be installed, so I have also included the code that ran successfully for me along with the version numbers, as shown below. Feel free to use this as a reference.
!pip install allennlp==2.10.1
!pip install allennlp-models==2.10.1
!pip install overrides==7.4.0
!pip install spaCy==3.7.2
Chapter 1 mainly introduces what NLP is and how NLP relates to other areas of AI. There is no code.
https://colab.research.google.com/github/mhagiwara/realworldnlp/blob/master/examples/sentiment/sst_classifier.ipynb
!pip install allennlp==2.10.1
!pip install allennlp-models==2.10.1
!pip install overrides==7.4.0
!pip install spaCy==3.7.2
!git clone https://github.com/mhagiwara/realworldnlp.git
%cd realworldnlp
from itertools import chain
from typing import Dict

import numpy as np
import torch
import torch.optim as optim
from allennlp.data.data_loaders import MultiProcessDataLoader
from allennlp.data.samplers import BucketBatchSampler
from allennlp.data.vocabulary import Vocabulary
from allennlp.models import Model
from allennlp.modules.seq2vec_encoders import Seq2VecEncoder, PytorchSeq2VecWrapper
from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.nn.util import get_text_field_mask
from allennlp.training import GradientDescentTrainer
from allennlp.training.metrics import CategoricalAccuracy, F1Measure
from allennlp_models.classification.dataset_readers.stanford_sentiment_tree_bank import \
    StanfordSentimentTreeBankDatasetReader

from realworldnlp.predictors import SentenceClassifierPredictor
EMBEDDING_DIM = 128
HIDDEN_DIM = 128
# Model in AllenNLP represents a model that is trained.
class LstmClassifier(Model):
    def __init__(self,
                 embedder: TextFieldEmbedder,
                 encoder: Seq2VecEncoder,
                 vocab: Vocabulary,
                 positive_label: str = '4') -> None:
        super().__init__(vocab)
        # We need the embeddings to convert word IDs to their vector representations
        self.embedder = embedder

        self.encoder = encoder

        # After converting a sequence of vectors to a single vector, we feed it into
        # a fully-connected linear layer to reduce the dimension to the total number of labels.
        self.linear = torch.nn.Linear(in_features=encoder.get_output_dim(),
                                      out_features=vocab.get_vocab_size('labels'))

        # Monitor the metrics - we use accuracy, as well as prec, rec, f1 for 4 (very positive)
        positive_index = vocab.get_token_index(positive_label, namespace='labels')
        self.accuracy = CategoricalAccuracy()
        self.f1_measure = F1Measure(positive_index)

        # We use the cross entropy loss because this is a classification task.
        # Note that PyTorch's CrossEntropyLoss combines softmax and log likelihood loss,
        # which makes it unnecessary to add a separate softmax layer.
        self.loss_function = torch.nn.CrossEntropyLoss()

    # Instances are fed to forward after batching.
    # Fields are passed through arguments with the same name.
    def forward(self,
                tokens: Dict[str, torch.Tensor],
                label: torch.Tensor = None) -> torch.Tensor:
        # In deep NLP, when sequences of tensors in different lengths are batched together,
        # shorter sequences get padded with zeros to make them equal length.
        # Masking is the process to ignore extra zeros added by padding
        mask = get_text_field_mask(tokens)

        # Forward pass
        embeddings = self.embedder(tokens)
        encoder_out = self.encoder(embeddings, mask)
        logits = self.linear(encoder_out)

        # In AllenNLP, the output of forward() is a dictionary.
        # Your output dictionary must contain a "loss" key for your model to be trained.
        output = {"logits": logits}
        if label is not None:
            self.accuracy(logits, label)
            self.f1_measure(logits, label)
            output["loss"] = self.loss_function(logits, label)

        return output

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        return {'accuracy': self.accuracy.get_metric(reset),
                **self.f1_measure.get_metric(reset)}
reader = StanfordSentimentTreeBankDatasetReader()
train_path = 'https://s3.amazonaws.com/realworldnlpbook/data/stanfordSentimentTreebank/trees/train.txt'
dev_path = 'https://s3.amazonaws.com/realworldnlpbook/data/stanfordSentimentTreebank/trees/dev.txt'
sampler = BucketBatchSampler(batch_size=32, sorting_keys=["tokens"])
train_data_loader = MultiProcessDataLoader(reader, train_path, batch_sampler=sampler)
dev_data_loader = MultiProcessDataLoader(reader, dev_path, batch_sampler=sampler)
# You can optionally specify the minimum count of tokens/labels.
# `min_count={'tokens':3}` here means that any tokens that appear less than three times
# will be ignored and not included in the vocabulary.
vocab = Vocabulary.from_instances(chain(train_data_loader.iter_instances(), dev_data_loader.iter_instances()),
min_count={'tokens': 3})
train_data_loader.index_with(vocab)
dev_data_loader.index_with(vocab)
token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
embedding_dim=EMBEDDING_DIM)
# BasicTextFieldEmbedder takes a dict - we need an embedding just for tokens,
# not for labels, which are used as-is as the "answer" of the sentence classification
word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
# Seq2VecEncoder is a neural network abstraction that takes a sequence of something
# (usually a sequence of embedded word vectors), processes it, and returns a single
# vector. Oftentimes this is an RNN-based architecture (e.g., LSTM or GRU), but
# AllenNLP also supports CNNs and other simple architectures (for example,
# just averaging over the input vectors).
encoder = PytorchSeq2VecWrapper(
torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))
model = LstmClassifier(word_embeddings, encoder, vocab)
optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)
trainer = GradientDescentTrainer(
model=model,
optimizer=optimizer,
data_loader=train_data_loader,
validation_data_loader=dev_data_loader,
patience=10,
num_epochs=20,
cuda_device=-1)
trainer.train()
predictor = SentenceClassifierPredictor(model, dataset_reader=reader)
logits = predictor.predict('This is the best movie ever!')['logits']
label_id = np.argmax(logits)
print(model.vocab.get_token_from_index(label_id, 'labels'))
Final result:
https://colab.research.google.com/github/mhagiwara/realworldnlp/blob/master/examples/tokenization.ipynb
!pip install nltk==3.8.1
!pip install spacy==3.6.1
import nltk
nltk.download('punkt')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
s = '''Good muffins cost $3.88\nin New York. Please buy me two of them.\n\nThanks.'''
# word_tokenize() uses TreebankWordTokenizer internally
word_tokenize(s)
from nltk.tokenize import sent_tokenize
# sent_tokenizer() uses PunktSentenceTokenizer internally
sent_tokenize(s)
import spacy
nlp = spacy.load('en_core_web_sm')
doc = nlp(s)
[token.text for token in doc]
[sent.text.strip() for sent in doc.sents]
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
words = ['caresses', 'flies', 'dies', 'mules', 'denied',
'died', 'agreed', 'owned', 'humbled', 'sized',
'meetings', 'stating', 'siezing', 'itemization',
'sensational', 'traditional', 'reference', 'colonizer',
'plotted']
[stemmer.stem(word) for word in words]
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
[lemmatizer.lemmatize(word) for word in words]
doc = nlp(' '.join(words))
[token.lemma_ for token in doc]
https://colab.research.google.com/github/mhagiwara/realworldnlp/blob/master/examples/embeddings/word2vec.ipynb
!pip install allennlp==2.10.1
!pip install overrides==4.1.2
!git clone https://github.com/mhagiwara/realworldnlp.git
%cd realworldnlp
from collections import Counter
import torch
import torch.optim as optim
from allennlp.data.data_loaders import SimpleDataLoader
from allennlp.data.vocabulary import Vocabulary
from allennlp.models import Model
from allennlp.modules.token_embedders import Embedding
from allennlp.training import GradientDescentTrainer
from torch.nn import CosineSimilarity
from torch.nn import functional
from examples.embeddings.word2vec import SkipGramReader
EMBEDDING_DIM = 256
BATCH_SIZE = 256
class SkipGramModel(Model):
def __init__(self, vocab, embedding_in):
super().__init__(vocab)
self.embedding_in = embedding_in
self.linear = torch.nn.Linear(
in_features=EMBEDDING_DIM,
out_features=vocab.get_vocab_size('token_out'),
bias=False)
def forward(self, token_in, token_out):
embedded_in = self.embedding_in(token_in)
logits = self.linear(embedded_in)
loss = functional.cross_entropy(logits, token_out)
return {'loss': loss}
def get_related(token: str, embedding: Model, vocab: Vocabulary, num_synonyms: int = 10):
"""Given a token, return a list of top N most similar words to the token."""
token_id = vocab.get_token_index(token, 'token_in')
token_vec = embedding.weight[token_id]
cosine = CosineSimilarity(dim=0)
sims = Counter()
for index, token in vocab.get_index_to_token_vocabulary('token_in').items():
sim = cosine(token_vec, embedding.weight[index]).item()
sims[token] = sim
return sims.most_common(num_synonyms)
reader = SkipGramReader()
text8 = reader.read('https://realworldnlpbook.s3.amazonaws.com/data/text8/text8')
text8 = list(text8)
print(len(text8))
text8 = text8[:1000000]
vocab = Vocabulary.from_instances(
text8, min_count={'token_in': 5, 'token_out': 5})
data_loader = SimpleDataLoader(text8, batch_size=BATCH_SIZE)
data_loader.index_with(vocab)
embedding_in = Embedding(num_embeddings=vocab.get_vocab_size('token_in'),
embedding_dim=EMBEDDING_DIM)
model = SkipGramModel(vocab=vocab,
embedding_in=embedding_in)
optimizer = optim.Adam(model.parameters())
trainer = GradientDescentTrainer(
model=model,
optimizer=optimizer,
data_loader=data_loader,
num_epochs=5,
cuda_device=-1)
trainer.train()
print(get_related('one', embedding_in, vocab))
print(get_related('december', embedding_in, vocab))
https://github.com/mhagiwara/realworldnlp/blob/master/examples/embeddings/word2vec.py
!pip install allennlp==2.10.1
!pip install overrides==4.1.2
!pip install torch==2.0.1
import math
import random
from collections import Counter

import numpy as np
import torch
import torch.optim as optim
from allennlp.common.file_utils import cached_path
from allennlp.data.data_loaders import SimpleDataLoader
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.fields import LabelField
from allennlp.data.instance import Instance
from allennlp.data.vocabulary import Vocabulary
from allennlp.models import Model
from allennlp.modules.token_embedders import Embedding
from allennlp.training import GradientDescentTrainer
from overrides import overrides
from scipy.stats import spearmanr
from torch.nn import CosineSimilarity
from torch.nn import functional

EMBEDDING_DIM = 256
BATCH_SIZE = 256
CUDA_DEVICE = -1


@DatasetReader.register("skip_gram")
class SkipGramReader(DatasetReader):
    def __init__(self, window_size=5, vocab: Vocabulary=None):
        """A DatasetReader for reading a plain text corpus and producing instances
        for the SkipGram model.

        When vocab is not None, this runs sub-sampling of frequent words as described in
        (Mikolov et al. 2013).
        """
        super().__init__()
        self.window_size = window_size
        self.reject_probs = None
        if vocab:
            self.reject_probs = {}
            threshold = 1.e-3
            token_counts = vocab._retained_counter['token_in']  # HACK
            total_counts = sum(token_counts.values())
            for _, token in vocab.get_index_to_token_vocabulary('token_in').items():
                counts = token_counts[token]
                if counts > 0:
                    normalized_counts = counts / total_counts
                    reject_prob = 1. - math.sqrt(threshold / normalized_counts)
                    reject_prob = max(0., reject_prob)
                else:
                    reject_prob = 0.
                self.reject_probs[token] = reject_prob

    def _subsample_tokens(self, tokens):
        """Given a list of tokens, runs sub-sampling.

        Returns a new list of tokens where rejected tokens are replaced by Nones.
        """
        new_tokens = []
        for token in tokens:
            reject_prob = self.reject_probs.get(token, 0.)
            if random.random() <= reject_prob:
                new_tokens.append(None)
            else:
                new_tokens.append(token)

        return new_tokens

    @overrides
    def _read(self, file_path: str):
        with open(cached_path(file_path), "r") as text_file:
            for line in text_file:
                tokens = line.strip().split(' ')
                tokens = tokens[:1000000]  # TODO: remove

                if self.reject_probs:
                    tokens = self._subsample_tokens(tokens)
                    print(tokens[:200])  # for debugging

                for i, token in enumerate(tokens):
                    if token is None:
                        continue

                    token_in = LabelField(token, label_namespace='token_in')

                    for j in range(i - self.window_size, i + self.window_size + 1):
                        if j < 0 or i == j or j > len(tokens) - 1:
                            continue

                        if tokens[j] is None:
                            continue

                        token_out = LabelField(tokens[j], label_namespace='token_out')
                        yield Instance({'token_in': token_in, 'token_out': token_out})


class SkipGramModel(Model):
    def __init__(self, vocab, embedding_in, cuda_device=-1):
        super().__init__(vocab)
        self.embedding_in = embedding_in
        self.linear = torch.nn.Linear(
            in_features=EMBEDDING_DIM,
            out_features=vocab.get_vocab_size('token_out'),
            bias=False)
        if cuda_device > -1:
            self.linear = self.linear.to(cuda_device)

    def forward(self, token_in, token_out):
        embedded_in = self.embedding_in(token_in)
        logits = self.linear(embedded_in)
        loss = functional.cross_entropy(logits, token_out)

        return {'loss': loss}


class SkipGramNegativeSamplingModel(Model):
    def __init__(self, vocab, embedding_in, embedding_out, neg_samples=10, cuda_device=-1):
        super().__init__(vocab)
        self.embedding_in = embedding_in
        self.embedding_out = embedding_out
        self.neg_samples = neg_samples
        self.cuda_device = cuda_device

        # Pre-compute probability for negative sampling
        token_to_probs = {}
        token_counts = vocab._retained_counter['token_in']  # HACK
        total_counts = sum(token_counts.values())
        total_probs = 0.
        for token, counts in token_counts.items():
            unigram_freq = counts / total_counts
            unigram_freq = math.pow(unigram_freq, 3 / 4)
            token_to_probs[token] = unigram_freq
            total_probs += unigram_freq

        self.neg_sample_probs = np.ndarray((vocab.get_vocab_size('token_in'),))
        for token_id, token in vocab.get_index_to_token_vocabulary('token_in').items():
            self.neg_sample_probs[token_id] = token_to_probs.get(token, 0) / total_probs

    def forward(self, token_in, token_out):
        batch_size = token_out.shape[0]

        # Calculate loss for positive examples
        embedded_in = self.embedding_in(token_in)
        embedded_out = self.embedding_out(token_out)
        inner_positive = torch.mul(embedded_in, embedded_out).sum(dim=1)
        log_prob = functional.logsigmoid(inner_positive)

        # Generate negative examples
        negative_out = np.random.choice(a=self.vocab.get_vocab_size('token_in'),
                                        size=batch_size * self.neg_samples,
                                        p=self.neg_sample_probs)
        negative_out = torch.LongTensor(negative_out).view(batch_size, self.neg_samples)
        if self.cuda_device > -1:
            negative_out = negative_out.to(self.cuda_device)

        # Subtract loss for negative examples
        embedded_negative_out = self.embedding_out(negative_out)
        inner_negative = torch.bmm(embedded_negative_out, embedded_in.unsqueeze(2)).squeeze()
        log_prob += functional.logsigmoid(-1. * inner_negative).sum(dim=1)

        return {'loss': -log_prob.sum() / batch_size}


def write_embeddings(embedding: Embedding, file_path, vocab: Vocabulary):
    with open(file_path, mode='w') as f:
        for index, token in vocab.get_index_to_token_vocabulary('token_in').items():
            values = ['{:.5f}'.format(val) for val in embedding.weight[index]]
            f.write(' '.join([token] + values))
            f.write('\n')


def get_synonyms(token: str, embedding: Model, vocab: Vocabulary, num_synonyms: int = 10):
    """Given a token, return a list of top N most similar words to the token."""
    token_id = vocab.get_token_index(token, 'token_in')
    token_vec = embedding.weight[token_id]
    cosine = CosineSimilarity(dim=0)
    sims = Counter()

    for index, token in vocab.get_index_to_token_vocabulary('token_in').items():
        sim = cosine(token_vec, embedding.weight[index]).item()
        sims[token] = sim

    return sims.most_common(num_synonyms)


def read_simlex999():
    simlex999 = []
    with open('data/SimLex-999/SimLex-999.txt') as f:
        next(f)
        for line in f:
            fields = line.strip().split('\t')
            word1, word2, _, sim = fields[:4]
            sim = float(sim)
            simlex999.append((word1, word2, sim))

    return simlex999


def evaluate_embeddings(embedding, vocab: Vocabulary):
    cosine = CosineSimilarity(dim=0)

    simlex999 = read_simlex999()
    sims_pred = []
    oov_count = 0
    for word1, word2, sim in simlex999:
        word1_id = vocab.get_token_index(word1, 'token_in')
        if word1_id == 1:
            sims_pred.append(0.)
            oov_count += 1
            continue
        word2_id = vocab.get_token_index(word2, 'token_in')
        if word2_id == 1:
            sims_pred.append(0.)
            oov_count += 1
            continue

        sim_pred = cosine(embedding.weight[word1_id],
                          embedding.weight[word2_id]).item()
        sims_pred.append(sim_pred)

    assert len(sims_pred) == len(simlex999)
    print('# of OOV words: {} / {}'.format(oov_count, len(simlex999)))

    return spearmanr(sims_pred, [sim for _, _, sim in simlex999])


def main():
    reader = SkipGramReader()
    text8 = reader.read('https://realworldnlpbook.s3.amazonaws.com/data/text8/text8')

    vocab = Vocabulary.from_instances(
        text8, min_count={'token_in': 5, 'token_out': 5})

    reader = SkipGramReader(vocab=vocab)
    text8 = reader.read('https://realworldnlpbook.s3.amazonaws.com/data/text8/text8')

    data_loader = SimpleDataLoader(list(text8), batch_size=BATCH_SIZE)
    data_loader.index_with(vocab)

    embedding_in = Embedding(num_embeddings=vocab.get_vocab_size('token_in'),
                             embedding_dim=EMBEDDING_DIM)
    embedding_out = Embedding(num_embeddings=vocab.get_vocab_size('token_out'),
                              embedding_dim=EMBEDDING_DIM)
    if CUDA_DEVICE > -1:
        embedding_in = embedding_in.to(CUDA_DEVICE)
        embedding_out = embedding_out.to(CUDA_DEVICE)

    # model = SkipGramNegativeSamplingModel(
    #     vocab=vocab,
    #     embedding_in=embedding_in,
    #     embedding_out=embedding_out,
    #     neg_samples=10,
    #     cuda_device=CUDA_DEVICE)

    model = SkipGramModel(vocab=vocab,
                          embedding_in=embedding_in,
                          cuda_device=CUDA_DEVICE)

    optimizer = optim.Adam(model.parameters())

    trainer = GradientDescentTrainer(
        model=model,
        optimizer=optimizer,
        data_loader=data_loader,
        num_epochs=5,
        cuda_device=CUDA_DEVICE)
    trainer.train()

    # write_embeddings(embedding_in, 'data/text8/embeddings.txt', vocab)
    print(get_synonyms('one', embedding_in, vocab))
    print(get_synonyms('december', embedding_in, vocab))
    print(get_synonyms('flower', embedding_in, vocab))
    print(get_synonyms('design', embedding_in, vocab))
    print(get_synonyms('snow', embedding_in, vocab))

    rho = evaluate_embeddings(embedding_in, vocab)
    print('simlex999 speareman correlation: {}'.format(rho))


if __name__ == '__main__':
    main()
https://colab.research.google.com/github/mhagiwara/realworldnlp/blob/master/examples/sentiment/sst_classifier.ipynb
!pip install allennlp==2.10.1
!pip install allennlp-models==2.10.1
!pip install overrides==7.4.0
!pip install spaCy==3.7.2
!pip install torch==2.0.1
!git clone https://github.com/mhagiwara/realworldnlp.git
%cd realworldnlp
from itertools import chain
from typing import Dict

import numpy as np
import torch
import torch.optim as optim
from allennlp.data.data_loaders import MultiProcessDataLoader
from allennlp.data.samplers import BucketBatchSampler
from allennlp.data.vocabulary import Vocabulary
from allennlp.models import Model
from allennlp.modules.seq2vec_encoders import Seq2VecEncoder, PytorchSeq2VecWrapper
from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.nn.util import get_text_field_mask
from allennlp.training import GradientDescentTrainer
from allennlp.training.metrics import CategoricalAccuracy, F1Measure
from allennlp_models.classification.dataset_readers.stanford_sentiment_tree_bank import \
    StanfordSentimentTreeBankDatasetReader

from realworldnlp.predictors import SentenceClassifierPredictor
EMBEDDING_DIM = 128
HIDDEN_DIM = 128
# Model in AllenNLP represents a model that is trained.
class LstmClassifier(Model):
    def __init__(self,
                 embedder: TextFieldEmbedder,
                 encoder: Seq2VecEncoder,
                 vocab: Vocabulary,
                 positive_label: str = '4') -> None:
        super().__init__(vocab)
        # We need the embeddings to convert word IDs to their vector representations
        self.embedder = embedder

        self.encoder = encoder

        # After converting a sequence of vectors to a single vector, we feed it into
        # a fully-connected linear layer to reduce the dimension to the total number of labels.
        self.linear = torch.nn.Linear(in_features=encoder.get_output_dim(),
                                      out_features=vocab.get_vocab_size('labels'))

        # Monitor the metrics - we use accuracy, as well as prec, rec, f1 for 4 (very positive)
        positive_index = vocab.get_token_index(positive_label, namespace='labels')
        self.accuracy = CategoricalAccuracy()
        self.f1_measure = F1Measure(positive_index)

        # We use the cross entropy loss because this is a classification task.
        # Note that PyTorch's CrossEntropyLoss combines softmax and log likelihood loss,
        # which makes it unnecessary to add a separate softmax layer.
        self.loss_function = torch.nn.CrossEntropyLoss()

    # Instances are fed to forward after batching.
    # Fields are passed through arguments with the same name.
    def forward(self,
                tokens: Dict[str, torch.Tensor],
                label: torch.Tensor = None) -> torch.Tensor:
        # In deep NLP, when sequences of tensors in different lengths are batched together,
        # shorter sequences get padded with zeros to make them equal length.
        # Masking is the process to ignore extra zeros added by padding
        mask = get_text_field_mask(tokens)

        # Forward pass
        embeddings = self.embedder(tokens)
        encoder_out = self.encoder(embeddings, mask)
        logits = self.linear(encoder_out)

        # In AllenNLP, the output of forward() is a dictionary.
        # Your output dictionary must contain a "loss" key for your model to be trained.
        output = {"logits": logits}
        if label is not None:
            self.accuracy(logits, label)
            self.f1_measure(logits, label)
            output["loss"] = self.loss_function(logits, label)

        return output

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        return {'accuracy': self.accuracy.get_metric(reset),
                **self.f1_measure.get_metric(reset)}
reader = StanfordSentimentTreeBankDatasetReader()
train_path = 'https://s3.amazonaws.com/realworldnlpbook/data/stanfordSentimentTreebank/trees/train.txt'
dev_path = 'https://s3.amazonaws.com/realworldnlpbook/data/stanfordSentimentTreebank/trees/dev.txt'
sampler = BucketBatchSampler(batch_size=32, sorting_keys=["tokens"])
train_data_loader = MultiProcessDataLoader(reader, train_path, batch_sampler=sampler)
dev_data_loader = MultiProcessDataLoader(reader, dev_path, batch_sampler=sampler)
# You can optionally specify the minimum count of tokens/labels.
# `min_count={'tokens':3}` here means that any tokens that appear less than three times
# will be ignored and not included in the vocabulary.
vocab = Vocabulary.from_instances(chain(train_data_loader.iter_instances(), dev_data_loader.iter_instances()),
min_count={'tokens': 3})
train_data_loader.index_with(vocab)
dev_data_loader.index_with(vocab)
token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
embedding_dim=EMBEDDING_DIM)
# BasicTextFieldEmbedder takes a dict - we need an embedding just for tokens,
# not for labels, which are used as-is as the "answer" of the sentence classification
word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
# Seq2VecEncoder is a neural network abstraction that takes a sequence of something
# (usually a sequence of embedded word vectors), processes it, and returns a single
# vector. Oftentimes this is an RNN-based architecture (e.g., LSTM or GRU), but
# AllenNLP also supports CNNs and other simple architectures (for example,
# just averaging over the input vectors).
encoder = PytorchSeq2VecWrapper(
torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))
model = LstmClassifier(word_embeddings, encoder, vocab)
optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)
trainer = GradientDescentTrainer(
model=model,
optimizer=optimizer,
data_loader=train_data_loader,
validation_data_loader=dev_data_loader,
patience=10,
num_epochs=20,
cuda_device=-1)
trainer.train()
predictor = SentenceClassifierPredictor(model, dataset_reader=reader)
logits = predictor.predict('This is the best movie ever!')['logits']
label_id = np.argmax(logits)
print(model.vocab.get_token_from_index(label_id, 'labels'))
https://colab.research.google.com/github/mhagiwara/realworldnlp/blob/master/examples/pos/pos_tagger.ipynb
!pip install allennlp==2.10.1
!pip install allennlp-models==2.10.1
!pip install overrides==7.4.0
!pip install torch==2.0.1
!git clone https://github.com/mhagiwara/realworldnlp.git
%cd realworldnlp
from itertools import chain
from typing import Dict

import numpy as np
import torch
import torch.optim as optim
from allennlp.data.data_loaders import MultiProcessDataLoader
from allennlp.data.samplers import BucketBatchSampler
from allennlp.data.vocabulary import Vocabulary
from allennlp.models import Model
from allennlp.modules.seq2seq_encoders import Seq2SeqEncoder, PytorchSeq2SeqWrapper
from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.nn.util import get_text_field_mask, sequence_cross_entropy_with_logits
from allennlp.training.metrics import CategoricalAccuracy
from allennlp.training import GradientDescentTrainer
from allennlp_models.structured_prediction.dataset_readers.universal_dependencies import UniversalDependenciesDatasetReader

from realworldnlp.predictors import UniversalPOSPredictor
EMBEDDING_SIZE = 128
HIDDEN_SIZE = 128
class LstmTagger(Model):
    def __init__(self,
                 embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 vocab: Vocabulary) -> None:
        super().__init__(vocab)
        self.embedder = embedder
        self.encoder = encoder
        self.linear = torch.nn.Linear(in_features=encoder.get_output_dim(),
                                      out_features=vocab.get_vocab_size('pos'))
        self.accuracy = CategoricalAccuracy()

    def forward(self,
                words: Dict[str, torch.Tensor],
                pos_tags: torch.Tensor = None,
                **args) -> Dict[str, torch.Tensor]:
        mask = get_text_field_mask(words)

        embeddings = self.embedder(words)
        encoder_out = self.encoder(embeddings, mask)
        tag_logits = self.linear(encoder_out)

        output = {"tag_logits": tag_logits}
        if pos_tags is not None:
            self.accuracy(tag_logits, pos_tags, mask)
            output["loss"] = sequence_cross_entropy_with_logits(tag_logits, pos_tags, mask)

        return output

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        return {"accuracy": self.accuracy.get_metric(reset)}
reader = UniversalDependenciesDatasetReader()
train_path = 'https://s3.amazonaws.com/realworldnlpbook/data/ud-treebanks-v2.3/UD_English-EWT/en_ewt-ud-train.conllu'
dev_path = 'https://s3.amazonaws.com/realworldnlpbook/data/ud-treebanks-v2.3/UD_English-EWT/en_ewt-ud-dev.conllu'
sampler = BucketBatchSampler(batch_size=32, sorting_keys=["words"])
train_data_loader = MultiProcessDataLoader(reader, train_path, batch_sampler=sampler)
dev_data_loader = MultiProcessDataLoader(reader, dev_path, batch_sampler=sampler)
vocab = Vocabulary.from_instances(chain(train_data_loader.iter_instances(),
dev_data_loader.iter_instances()))
train_data_loader.index_with(vocab)
dev_data_loader.index_with(vocab)
token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
embedding_dim=EMBEDDING_SIZE)
word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
encoder = PytorchSeq2SeqWrapper(
torch.nn.LSTM(EMBEDDING_SIZE, HIDDEN_SIZE, batch_first=True))
model = LstmTagger(word_embeddings, encoder, vocab)
optimizer = optim.Adam(model.parameters())
trainer = GradientDescentTrainer(
model=model,
optimizer=optimizer,
data_loader=train_data_loader,
validation_data_loader=dev_data_loader,
patience=10,
num_epochs=10,
cuda_device=-1)
trainer.train()
predictor = UniversalPOSPredictor(model, reader)
tokens = ['The', 'dog', 'ate', 'the', 'apple', '.']
logits = predictor.predict(tokens)['tag_logits']
tag_ids = np.argmax(logits, axis=-1)
[vocab.get_token_from_index(tag_id, 'pos') for tag_id in tag_ids]
https://colab.research.google.com/github/mhagiwara/realworldnlp/blob/master/examples/generation/lm.ipynb
!pip install allennlp==2.10.1
!pip install allennlp-models==2.10.1
!pip install overrides==7.4.0
!pip install torch==2.0.1
!git clone https://github.com/mhagiwara/realworldnlp.git
%cd realworldnlp
import re
from typing import Dict, List, Tuple, Set

import torch
import torch.optim as optim
from allennlp.common.file_utils import cached_path
from allennlp.common.util import START_SYMBOL, END_SYMBOL
from allennlp.data.data_loaders import SimpleDataLoader
from allennlp.data.samplers import BucketBatchSampler
from allennlp.data.fields import TextField
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers import Token, CharacterTokenizer
from allennlp.data.vocabulary import Vocabulary, DEFAULT_PADDING_TOKEN
from allennlp.models import Model
from allennlp.modules.seq2seq_encoders import PytorchSeq2SeqWrapper
from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.nn.util import get_text_field_mask, sequence_cross_entropy_with_logits
from allennlp.training import GradientDescentTrainer
EMBEDDING_SIZE = 32
HIDDEN_SIZE = 256
BATCH_SIZE = 128
def read_dataset(all_chars: Set[str]=None) -> List[List[Token]]:
    """Read a plain text file and return character-tokenized sentences."""
    tokenizer = CharacterTokenizer()
    sentences = []
    with open(cached_path('https://s3.amazonaws.com/realworldnlpbook/data/tatoeba/sentences.eng.10k.txt')) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            line = re.sub(' +', ' ', line)
            tokens = tokenizer.tokenize(line)
            if all_chars:
                tokens = [token for token in tokens if token.text in all_chars]
            sentences.append(tokens)

    return sentences
def tokens_to_lm_instance(tokens: List[Token],
token_indexers: Dict[str, TokenIndexer]):
tokens = list(tokens) # shallow copy
tokens.insert(0, Token(START_SYMBOL))
tokens.append(Token(END_SYMBOL))
input_field = TextField(tokens[:-1], token_indexers)
output_field = TextField(tokens[1:], token_indexers)
return Instance({'input_tokens': input_field,
'output_tokens': output_field})
class RNNLanguageModel(Model):
    def __init__(self,
                 embedder: TextFieldEmbedder,
                 hidden_size: int,
                 max_len: int,
                 vocab: Vocabulary) -> None:
        super().__init__(vocab)

        self.embedder = embedder

        self.rnn = PytorchSeq2SeqWrapper(
            torch.nn.LSTM(EMBEDDING_SIZE, HIDDEN_SIZE, batch_first=True))

        self.hidden2out = torch.nn.Linear(in_features=self.rnn.get_output_dim(),
                                          out_features=vocab.get_vocab_size('tokens'))
        self.hidden_size = hidden_size
        self.max_len = max_len

    def forward(self, input_tokens, output_tokens):
        mask = get_text_field_mask(input_tokens)
        embeddings = self.embedder(input_tokens)
        rnn_hidden = self.rnn(embeddings, mask)
        out_logits = self.hidden2out(rnn_hidden)
        loss = sequence_cross_entropy_with_logits(out_logits,
                                                  output_tokens['tokens']['tokens'],
                                                  mask)

        return {'loss': loss}

    def generate(self) -> Tuple[List[Token], torch.tensor]:
        start_symbol_idx = self.vocab.get_token_index(START_SYMBOL, 'tokens')
        end_symbol_idx = self.vocab.get_token_index(END_SYMBOL, 'tokens')
        padding_symbol_idx = self.vocab.get_token_index(DEFAULT_PADDING_TOKEN, 'tokens')

        log_likelihood = 0.
        words = []
        state = (torch.zeros(1, 1, self.hidden_size), torch.zeros(1, 1, self.hidden_size))

        word_idx = start_symbol_idx

        for i in range(self.max_len):
            tokens = torch.tensor([[word_idx]])

            embeddings = self.embedder({'tokens': {'tokens': tokens}})
            output, state = self.rnn._module(embeddings, state)
            output = self.hidden2out(output)

            log_prob = torch.log_softmax(output[0, 0], dim=0)
            dist = torch.exp(log_prob)

            word_idx = start_symbol_idx

            while word_idx in {start_symbol_idx, padding_symbol_idx}:
                word_idx = torch.multinomial(
                    dist, num_samples=1, replacement=False).item()

            log_likelihood += log_prob[word_idx]

            if word_idx == end_symbol_idx:
                break

            token = Token(text=self.vocab.get_token_from_index(word_idx, 'tokens'))
            words.append(token)

        return words, log_likelihood
all_chars = {END_SYMBOL, START_SYMBOL}
all_chars.update("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ .,!?'-")
train_set = read_dataset(all_chars)
token_counts = {char: 1 for char in all_chars}
vocab = Vocabulary({'tokens': token_counts})
token_indexers = {'tokens': SingleIdTokenIndexer()}
instances = [tokens_to_lm_instance(tokens, token_indexers)
for tokens in train_set]
token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
embedding_dim=EMBEDDING_SIZE)
embedder = BasicTextFieldEmbedder({"tokens": token_embedding})
model = RNNLanguageModel(embedder=embedder,
hidden_size=HIDDEN_SIZE,
max_len=80,
vocab=vocab)
data_loader = SimpleDataLoader(instances, batch_size=BATCH_SIZE)
data_loader.index_with(vocab)
optimizer = optim.Adam(model.parameters(), lr=5.e-3)
trainer = GradientDescentTrainer(
model=model,
optimizer=optimizer,
data_loader=data_loader,
num_epochs=10,
cuda_device=-1)
trainer.train()
def predict(text: str, model: Model) -> float:
tokenizer = CharacterTokenizer()
tokens = tokenizer.tokenize(text)
token_indexers = {'tokens': SingleIdTokenIndexer()}
instance = tokens_to_lm_instance(tokens, token_indexers)
output = model.forward_on_instance(instance)
print(output)
predict('The trip to the beach was ruined by bad weather.', model)
predict('The trip to the beach was ruined by bad dogs.', model)
predict('by weather was trip my bad beach the ruined to.', model)
for _ in range(50):
tokens, _ = model.generate()
print(''.join(token.text for token in tokens))
https://colab.research.google.com/github/mhagiwara/realworldnlp/blob/master/examples/sentiment/sst_cnn_classifier.ipynb
!pip install allennlp==2.10.1
!pip install allennlp-models==2.10.1
!pip install overrides==7.4.0
!pip install spaCy==3.7.2
!git clone https://github.com/mhagiwara/realworldnlp.git
%cd realworldnlp
from itertools import chain
from typing import Dict

import numpy as np
import torch
import torch.optim as optim
from allennlp.data.data_loaders import MultiProcessDataLoader
from allennlp.data.samplers import BucketBatchSampler
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.vocabulary import Vocabulary
from allennlp.models import Model
from allennlp.modules.seq2vec_encoders import Seq2VecEncoder, CnnEncoder
from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.nn.util import get_text_field_mask
from allennlp.training.metrics import CategoricalAccuracy, F1Measure
from allennlp.training import GradientDescentTrainer
from allennlp_models.classification.dataset_readers.stanford_sentiment_tree_bank import \
    StanfordSentimentTreeBankDatasetReader

from realworldnlp.predictors import SentenceClassifierPredictor
EMBEDDING_DIM = 128
HIDDEN_DIM = 128
@Model.register("cnn_classifier")
class CnnClassifier(Model):
    def __init__(self,
                 embedder: TextFieldEmbedder,
                 encoder: Seq2VecEncoder,
                 vocab: Vocabulary,
                 positive_label: str = '4') -> None:
        super().__init__(vocab)
        self.embedder = embedder
        self.encoder = encoder
        self.linear = torch.nn.Linear(in_features=encoder.get_output_dim(),
                                      out_features=vocab.get_vocab_size('labels'))

        positive_index = vocab.get_token_index(positive_label, namespace='labels')
        self.accuracy = CategoricalAccuracy()
        self.f1_measure = F1Measure(positive_index)

        self.loss_function = torch.nn.CrossEntropyLoss()

    def forward(self,
                tokens: Dict[str, torch.Tensor],
                label: torch.Tensor = None) -> torch.Tensor:
        mask = get_text_field_mask(tokens)

        embeddings = self.embedder(tokens)
        encoder_out = self.encoder(embeddings, mask)
        logits = self.linear(encoder_out)

        output = {"logits": logits}
        if label is not None:
            self.accuracy(logits, label)
            self.f1_measure(logits, label)
            output["loss"] = self.loss_function(logits, label)

        return output

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        return {'accuracy': self.accuracy.get_metric(reset),
                **self.f1_measure.get_metric(reset)}
# Note: CnnEncoder (with ngram filter size = 5) requires the padding length >= 5
token_indexer = SingleIdTokenIndexer(token_min_padding_length=5)
reader = StanfordSentimentTreeBankDatasetReader(token_indexers={'tokens': token_indexer})
train_path = 'https://s3.amazonaws.com/realworldnlpbook/data/stanfordSentimentTreebank/trees/train.txt'
dev_path = 'https://s3.amazonaws.com/realworldnlpbook/data/stanfordSentimentTreebank/trees/dev.txt'
sampler = BucketBatchSampler(batch_size=32, sorting_keys=["tokens"])
train_data_loader = MultiProcessDataLoader(reader, train_path, batch_sampler=sampler)
dev_data_loader = MultiProcessDataLoader(reader, dev_path, batch_sampler=sampler)
vocab = Vocabulary.from_instances(chain(train_data_loader.iter_instances(),
dev_data_loader.iter_instances()),
min_count={'tokens': 3})
train_data_loader.index_with(vocab)
dev_data_loader.index_with(vocab)
token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
embedding_dim=EMBEDDING_DIM)
word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
encoder = CnnEncoder(
embedding_dim=EMBEDDING_DIM,
num_filters=8,
ngram_filter_sizes=(2, 3, 4, 5))
model = CnnClassifier(word_embeddings, encoder, vocab)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
trainer = GradientDescentTrainer(
model=model,
optimizer=optimizer,
data_loader=train_data_loader,
validation_data_loader=dev_data_loader,
patience=10,
num_epochs=20,
cuda_device=-1)
trainer.train()
predictor = SentenceClassifierPredictor(model, dataset_reader=reader)
logits = predictor.predict('This is the best movie ever!')['logits']
label_id = np.argmax(logits)
print(model.vocab.get_token_from_index(label_id, 'labels'))
https://colab.research.google.com/github/mhagiwara/realworldnlp/blob/master/examples/generation/transformers.ipynb
!pip install transformers==4.6.1
!pip install sacremoses
from transformers import AutoModelWithLMHead, AutoTokenizer
import torch
# Note: the nucleus-sampling branch below uses F.softmax, so we need this import.
import torch.nn.functional as F

def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
    """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
        Args:
            logits: logits distribution shape (vocabulary size)
            top_k >0: keep only top k tokens with highest probability (top-k filtering).
            top_p >0.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
                Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
        Source: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
    """
    assert logits.dim() == 1  # batch size 1 for now - could be updated for more but the code would be less clear
    top_k = min(top_k, logits.size(-1))  # Safety check
    if top_k > 0:
        # Remove all tokens with a probability less than the last token of the top-k
        indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
        logits[indices_to_remove] = filter_value

    if top_p > 0.0:
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

        # Remove tokens with cumulative probability above the threshold
        sorted_indices_to_remove = cumulative_probs > top_p
        # Shift the indices to the right to keep also the first token above the threshold
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = 0

        indices_to_remove = sorted_indices[sorted_indices_to_remove]
        logits[indices_to_remove] = filter_value
    return logits
def sample_token(output):
logits = output[..., -1, :].squeeze(0)
logits = top_k_top_p_filtering(logits, top_k=10)
log_probs = torch.softmax(logits, dim=-1)
token = torch.multinomial(log_probs, num_samples=1)[0]
return token
Transformer-XL
tokenizer = AutoTokenizer.from_pretrained('transfo-xl-wt103')
model = AutoModelWithLMHead.from_pretrained('transfo-xl-wt103')
generated = tokenizer.encode("On our way to the beach")
context = torch.tensor([generated])
past = None
for i in range(100):
output = model(context, mems=past)
token = sample_token(output.prediction_scores)
generated.append(token.item())
context = token.view(1, -1)
past = output.mems
print(tokenizer.decode(generated))
GPT-2
tokenizer = AutoTokenizer.from_pretrained("gpt2-large")
model = AutoModelWithLMHead.from_pretrained('gpt2-large')
generated = tokenizer.encode("On our way to the beach")
context = torch.tensor([generated])
past = None
for i in range(100):
output = model(context, past_key_values=past)
token = sample_token(output.logits)
generated.append(token.item())
context = token.unsqueeze(0)
past = output.past_key_values
print(tokenizer.decode(generated))
XLM
tokenizer = AutoTokenizer.from_pretrained('xlm-clm-enfr-1024')
model = AutoModelWithLMHead.from_pretrained('xlm-clm-enfr-1024')
generated = [0] # start with just <s>
context = torch.tensor([generated])
lang = 0 # English
for i in range(100):
langs = torch.zeros_like(context).fill_(lang)
output = model(context, langs=langs)
token = sample_token(output.logits)
generated.append(token.item())
context = torch.tensor([generated])
print(tokenizer.decode(generated))
<s>and other risk factors are discussed in our Annual Report on Form 10-K filed with the Securities and Exchange Commission. </s>" We do not know how we will proceed to respond to these matters. " </s>" We have made a full investigation. </s>He is not aware that any other member of the board of directors have been made aware of. </s>He was the first to have been aware of the potential that they had had. " </s>The company had been approached for some time and had been working on an
generated = [0] # start with just <s>
context = torch.tensor([generated])
lang = 1 # French
for i in range(100):
langs = torch.zeros_like(context).fill_(lang)
output = model(context, langs=langs)
token = sample_token(output.logits)
generated.append(token.item())
context = torch.tensor([generated])
print(tokenizer.decode(generated))
<s>, which are also considered to be highly sensitive to the threat of a nuclear warhead. </s>Mais pour le moment, il ne sera ni pour les Etats-Unis ni même à ce niveau ", a déclaré à des journalistes ". </s>En plus, les autres "... il y en a qui vont voir ça comme un problème ", a-t-il lancé. </s>L' objectif d' un jour, c' est de les faire ", a-t-il ajouté. </s>C' est l' un des plus gros succès d' Amazon ", a
https://colab.research.google.com/github/mhagiwara/realworldnlp/blob/master/examples/sentiment/sst_classifier_transformers.ipynb
!pip install transformers==4.6.1
!curl https://s3.amazonaws.com/realworldnlpbook/data/stanfordSentimentTreebank/trees/dev.txt --output dev.txt
!curl https://s3.amazonaws.com/realworldnlpbook/data/stanfordSentimentTreebank/trees/train.txt --output train.txt
import re
import torch
from torch import nn, optim
from transformers import AutoTokenizer, AutoModel, AdamW, get_cosine_schedule_with_warmup
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
BERT_MODEL = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL)
class BertClassifier(nn.Module):
    def __init__(self, model_name, num_labels):
        super(BertClassifier, self).__init__()
        self.bert_model = AutoModel.from_pretrained(model_name)
        self.linear = nn.Linear(self.bert_model.config.hidden_size, num_labels)
        self.loss_function = torch.nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, token_type_ids, label=None):
        bert_out = self.bert_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids)
        logits = self.linear(bert_out.pooler_output)

        loss = None
        if label is not None:
            loss = self.loss_function(logits, label)

        return loss, logits
token_ids = tokenizer.encode('The best movie ever!')
token_ids
tokenizer.decode(token_ids)
result = tokenizer(
['The best movie ever!', 'Aweful movie'],
max_length=10,
pad_to_max_length=True,
truncation=True,
return_tensors='pt')
result
result['input_ids']
result['token_type_ids']
result['attention_mask']
def read_dataset(file_path, batch_size, tokenizer, max_length):
    batches = []
    with open(file_path) as f:
        texts = []
        labels = []
        for line in f:
            text = line.strip()
            label = int(text[1])
            text = re.sub('\)+', '', re.sub('\(\d ', '', text))
            text = text.replace('-LRB-', '(').replace('-RRB-', ')')

            texts.append(text)
            labels.append(label)

            if len(texts) == batch_size:
                batch = tokenizer(
                    texts,
                    max_length=max_length,
                    pad_to_max_length=True,
                    truncation=True,
                    return_tensors='pt')
                batch['label'] = torch.tensor(labels)
                batches.append(batch)
                texts = []
                labels = []

        if texts:
            batch = tokenizer(
                texts,
                max_length=max_length,
                pad_to_max_length=True,
                truncation=True,
                return_tensors='pt')
            batch['label'] = torch.tensor(labels)
            batches.append(batch)

    return batches
train_data = read_dataset('train.txt', batch_size=32, tokenizer=tokenizer, max_length=128)
dev_data = read_dataset('dev.txt', batch_size=32, tokenizer=tokenizer, max_length=128)
len(train_data), len(dev_data)
def move_to(batch, device):
for key in batch.keys():
batch[key] = batch[key].to(device)
model = BertClassifier(model_name=BERT_MODEL, num_labels=5).to(device)
move_to(dev_data[0], device)
model(**dev_data[0])
epochs = 30
optimizer = AdamW(model.parameters(), lr=1e-5)
scheduler = get_cosine_schedule_with_warmup(
optimizer, num_warmup_steps=1000,
num_training_steps=len(train_data) * epochs)
for epoch in range(epochs):
    print(f'epoch = {epoch}')

    model.train()

    losses = []
    total_instances = 0
    correct_instances = 0
    for batch in train_data:
        batch_size = batch['input_ids'].size(0)
        move_to(batch, device)

        optimizer.zero_grad()
        loss, logits = model(**batch)
        loss.backward()

        optimizer.step()
        scheduler.step()

        losses.append(loss)

        total_instances += batch_size
        correct_instances += torch.sum(torch.argmax(logits, dim=-1) == batch['label']).item()

    avr_loss = sum(losses) / len(losses)
    accuracy = correct_instances / total_instances
    print(f'train loss = {avr_loss}, accuracy = {accuracy}')

    losses = []
    total_instances = 0
    correct_instances = 0

    model.eval()

    for batch in dev_data:
        batch_size = batch['input_ids'].size(0)
        move_to(batch, device)

        with torch.no_grad():
            loss, logits = model(**batch)

        losses.append(loss)

        total_instances += batch_size
        correct_instances += torch.sum(torch.argmax(logits, dim=-1) == batch['label']).item()

    avr_loss = sum(losses) / len(losses)
    accuracy = correct_instances / total_instances
    print(f'dev loss = {avr_loss}, accuracy = {accuracy}')
Because I didn't have enough memory, training crashed before even a single epoch finished, so I can only include a screenshot of the corresponding figure from the book.
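If you run into the same out-of-memory problem, two common workarounds are a smaller batch size and gradient accumulation. The sketch below is not from the book; it only shows how the inner training loop above could be adapted, assuming the same model, optimizer, scheduler, move_to, and device, and assuming train_data was re-read with a smaller batch_size (e.g., 8).

# A minimal sketch (my own, not from the book): accumulate gradients over several
# small batches so the effective batch size stays at 32 while peak memory drops.
ACCUM_STEPS = 4  # e.g., batch_size=8 in read_dataset() times 4 accumulation steps

model.train()
optimizer.zero_grad()
for step, batch in enumerate(train_data):
    move_to(batch, device)
    loss, logits = model(**batch)
    (loss / ACCUM_STEPS).backward()   # scale so the summed gradients match a full batch
    if (step + 1) % ACCUM_STEPS == 0:
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()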
https://github.com/mhagiwara/realworldnlp/blob/master/examples/tuning/sst_classifier.jsonnet
https://github.com/mhagiwara/realworldnlp/blob/master/examples/tuning/hparams.json
https://optuna.readthedocs.io/en/stable/reference/visualization/generated/optuna.visualization.plot_contour.html#optuna.visualization.plot_contour
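The two repository files above are the training config and the hyperparameter search space used for tuning, and the last link documents Optuna's plot_contour visualization. As a rough reference only, here is a minimal standalone Optuna sketch (the search space and the dummy objective are my own placeholders, not the book's setup; in a real run the objective would train the SST classifier with the sampled values and return its dev accuracy):

import optuna

def objective(trial):
    lr = trial.suggest_float('lr', 1e-5, 1e-2, log=True)
    embedding_dim = trial.suggest_int('embedding_dim', 64, 256)
    # Placeholder score so the snippet runs on its own; replace with the
    # dev accuracy of a model trained with (lr, embedding_dim).
    dummy_score = 1.0 / (1.0 + abs(lr - 1e-3)) + embedding_dim / 1000.0
    return dummy_score

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)
print(study.best_params)

# The plot_contour page linked above visualizes interactions between parameters:
# optuna.visualization.plot_contour(study, params=['lr', 'embedding_dim'])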
This chapter mainly covers how to deploy and serve NLP models; there is no code.
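Since the chapter itself ships no code, here is a rough serving sketch of my own (not from the book): exposing the SentenceClassifierPredictor from the sentiment notebook above behind a small Flask endpoint, assuming model and reader are already built as in that notebook.

# A minimal serving sketch (my own illustration, not code from the book).
import numpy as np
from flask import Flask, jsonify, request

app = Flask(__name__)
predictor = SentenceClassifierPredictor(model, dataset_reader=reader)

@app.route('/predict', methods=['POST'])
def predict():
    text = request.json['text']
    logits = predictor.predict(text)['logits']
    label_id = int(np.argmax(logits))
    label = model.vocab.get_token_from_index(label_id, 'labels')
    return jsonify({'label': label, 'logits': logits})

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=8000)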