
How to Use a Tokenizer: the `padding` Parameter

The snippets below walk through what `padding` does when encoding a single sentence and then a batch of sentences with a BERT tokenizer.
```python
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn as nn

# the tokenizer must be created before it is used
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

text = "I don't really like working, but I need the money!"
tokens = tokenizer.tokenize(text)
print(tokens)
# ['i', 'don', "'", 't', 'really', 'like', 'working', ',', 'but', 'i', 'need', 'the', 'money', '!']

dic = tokenizer(text, padding="max_length", truncation=True, max_length=39, return_tensors="pt")
# 'input_ids': [101, 1045, 2123, 1005, 1056, 2428, 2066, 2551, 1010, 2021, 1045, 2342,
#               1996, 2769, 999, 102, 0, 0, ..., 0]   (zero-padded up to length 39)
# This shows that calling the tokenizer directly performs tokenization on its own
# while building input_ids; if you only need the tokens, call .tokenize() instead.
```
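Besides `padding="max_length"`, the `padding` argument also accepts `True` (equivalent to `"longest"`), which pads only up to the longest sequence in the current batch. A minimal sketch of the difference, reusing the `tokenizer` from above (the two example sentences are my own):

```python
batch = ["short text", "a somewhat longer piece of text"]

# dynamic padding: pad to the longest sequence in this batch
enc_dynamic = tokenizer(batch, padding=True, return_tensors="pt")
print(enc_dynamic["input_ids"].shape)   # second dim = length of the longest item

# fixed padding: pad every sequence to max_length
enc_fixed = tokenizer(batch, padding="max_length", truncation=True,
                      max_length=16, return_tensors="pt")
print(enc_fixed["input_ids"].shape)     # [2, 16]
```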
The same call on a batch of sentences:

```python
text = ['I like he', 'like he', 'he but', 'but he', 'he like she', 'like', 'she like I', 'I he she']
# a batch of bs = 8 sentences
dic = tokenizer(text, padding="max_length", truncation=True, max_length=10, return_tensors="pt")
input_ids = dic['input_ids']            # [bs, max_length], with [CLS] (101) prepended and [SEP] (102) appended
token_type_ids = dic['token_type_ids']  # [bs, max_length]
attention_mask = dic['attention_mask']  # [bs, max_length]

# Just to see what these actually look like (illustrative values):
# attention_mask
# [[1,1,1,1,1,0,0,0,0,0],
#  [1,1,1,1,0,0,0,0,0,0],
#  ...]
# input_ids
# [[101,1045,1001,2006,102,0,0,0,0,0],
#  [101,1001,2006,102,0,0,0,0,0,0],
#  ...]
```
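To double-check what the padded rows contain, you can map the ids back to tokens; a quick check on the first row of the batch encoded above (the printed output is what I would expect, not taken from the original):

```python
# the first row should show [CLS] at the start, [SEP] after the text, and [PAD] fill
print(tokenizer.convert_ids_to_tokens(input_ids[0].tolist()))
# e.g. ['[CLS]', 'i', 'like', 'he', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']
```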
Feeding these tensors into a pretrained model:

```python
class PretrainedLanguageModel(nn.Module):
    def __init__(self, pretrained_language_model_name):
        super(PretrainedLanguageModel, self).__init__()
        self.model = AutoModel.from_pretrained(pretrained_language_model_name)

    def forward(self, input_ids, token_type_ids, attention_mask):
        output = self.model(input_ids=input_ids,
                            token_type_ids=token_type_ids,
                            attention_mask=attention_mask).last_hidden_state
        return output

model = PretrainedLanguageModel('bert-base-uncased')
output = model(input_ids, token_type_ids, attention_mask)
# the result is always [bs, max_length, hidden_dim]: a representation of the text
print(input_ids.shape)
print(output.shape)
```
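Because padded positions also receive hidden states, a common follow-up is to pool the `[bs, max_length, hidden_dim]` output with the attention mask so that `[PAD]` positions do not contribute. A minimal mean-pooling sketch (`masked_mean_pool` is my own helper, not part of the original code):

```python
def masked_mean_pool(last_hidden_state, attention_mask):
    # zero out pad positions, then average over the real tokens only
    mask = attention_mask.unsqueeze(-1).float()        # [bs, max_length, 1]
    summed = (last_hidden_state * mask).sum(dim=1)     # [bs, hidden_dim]
    counts = mask.sum(dim=1).clamp(min=1e-9)           # [bs, 1]
    return summed / counts                             # [bs, hidden_dim]

sentence_embeddings = masked_mean_pool(output, attention_mask)
print(sentence_embeddings.shape)                       # [bs, hidden_dim]
```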

   
