第3章 基础工具集与常用数据集

3.1 NLTK工具集

3.1.1 常用语料库和词典资源

3.1.2 常用自然语言处理工具集

3.2 LTP工具集

3.2.1 中文分词

3.2.2 其他中文自然语言处理功能

3.3 PyTorch基础

3.3.1 张量的基本概念

3.3.2 张量的基本运算

3.3.3 自动微分

3.3.4 调整张量形状

3.3.5 广播机制

3.3.6 索引与切片

3.3.7 降维与升维

3.4 大规模预训练数据

3.4.1 维基百科数据

3.4.2 原始数据的获取

3.4.3 语料处理方法

3.4.4 Common Crawl数据

3.5 更多数据集


  1. from nltk.corpus import stopwords
  2. print(stopwords.words('english'))
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]



  1. from nltk.corpus import gutenberg
  2. print(gutenberg.raw('austen-emma.txt'))



  1. from nltk.corpus import sentence_polarity
  2. # print(sentence_polarity.categories())
  3. # print(sentence_polarity.words())
  4. # print(sentence_polarity.sents())
  5. a = [(sentence, category) for category in sentence_polarity.categories() for sentence in sentence_polarity.sents(categories=category)]
  6. print(a)



  1. from nltk.corpus import wordnet
  2. syns = wordnet.synsets('bank')
  3. print(syns[0].name())
  4. print(syns[0].definition())
  5. print(syns[1].definition())
  6. print(syns[0].examples())
  7. print(syns[0].hypernyms())
  8. dog = wordnet.synset('dog.n.01')
  9. cat = wordnet.synset('cat.n.01')
  10. print(dog.wup_similarity(cat))
  1. bank.n.01
  2. sloping land (especially the slope beside a body of water)
  3. a financial institution that accepts deposits and channels the money into lending activities
  4. ['they pulled the canoe up on the bank', 'he sat on the bank of the river and watched the currents']
  5. [Synset('slope.n.01')]
  6. 0.8571428571428571


  1. from nltk.corpus import sentiwordnet
  2. print(sentiwordnet.senti_synset('good.a.01'))
<good.a.01: PosScore=0.75 NegScore=0.0>

3.1.2 常用自然语言处理工具集



  1. from nltk.tokenize import sent_tokenize
  2. from nltk.corpus import gutenberg
  3. text = gutenberg.raw('austen-emma.txt')
  4. sentences = sent_tokenize(text)
  5. print(sentences[100])
  1. Mr. Knightley loves to find fault with me, you know--
  2. in a joke--it is all a joke.



  1. from nltk.tokenize import word_tokenize
  2. from nltk.tokenize import sent_tokenize
  3. from nltk.corpus import gutenberg
  4. text = gutenberg.raw('austen-emma.txt')
  5. sentences = sent_tokenize(text)
  6. print(word_tokenize(sentences[100]))
['Mr.', 'Knightley', 'loves', 'to', 'find', 'fault', 'with', 'me', ',', 'you', 'know', '--', 'in', 'a', 'joke', '--', 'it', 'is', 'all', 'a', 'joke', '.']



  1. from nltk import pos_tag
  2. from nltk.tokenize import word_tokenize
  3. results = pos_tag(word_tokenize('They sat by the fire.'))
  4. print(results)
[('They', 'PRP'), ('sat', 'VBP'), ('by', 'IN'), ('the', 'DT'), ('fire', 'NN'), ('.', '.')]
  1. results = pos_tag(word_tokenize('They fire a gun.'))
  2. print(results)
[('They', 'PRP'), ('fire', 'VBP'), ('a', 'DT'), ('gun', 'NN'), ('.', '.')]
  1. import nltk.help
  2. nltk.help.upenn_tagset('NN')
  3. nltk.help.upenn_tagset('VBP')
  4. nltk.help.upenn_tagset()
  1. NN: noun, common, singular or mass
  2. common-carrier cabbage knuckle-duster Casino afghan shed thermostat
  3. investment slide humour falloff slick wind hyena override subhumanity
  4. machinist ...
  5. VBP: verb, present tense, not 3rd person singular
  6. predominate wrap resort sue twist spill cure lengthen brush terminate
  7. appear tend stray glisten obtain comprise detest tease attract
  8. emphasize mold postpone sever return wag ...






3.2 LTP工具集


pip install ltp

3.2.1 中文分词

  1. from ltp import LTP
  2. ltp = LTP() # 默认加载Small模型,首次使用时会自动下载并加载模型
  3. segment, hidden = ltp.seg(['南京市长江大桥。']) # 对句子进行分词,结果使用segment访问,hidden用于访问每个词的隐含层向量,用于后续分析步骤
  4. print(segment) # LTP能够获得正确的分词结果,而不会错误地分为[['南京', '市长', '江大桥', '。']]
[['南京市', '长江大桥', '。']]

3.2.2 其他中文自然语言处理功能


  1. from ltp import LTP
  2. ltp = LTP() # 默认加载Small模型,首次使用时会自动下载并加载模型
  3. # segment, hidden = ltp.seg(['南京市长江大桥。']) # 对句子进行分词,结果使用segment访问,hidden用于访问每个词的隐含层向量,用于后续分析步骤
  4. # print(segment) # LTP能够获得正确的分词结果,而不会错误地分为[['南京', '市长', '江大桥', '。']]
  5. sentences = ltp.sent_split(['南京市长江大桥。', '汤姆生病了。他去了医院。']) # 分句
  6. print(sentences)
  7. segment, hidden = ltp.seg(sentences)
  8. print(segment)
  9. pos_tags = ltp.pos(hidden) # 词性标注
  10. print(pos_tags) # 词性标注的结果为每个词对应的词性,LTP使用的词性标记集与NLTK不尽相同,但基本大同小异
  1. ['南京市长江大桥。', '汤姆生病了。', '他去了医院。']
  2. [['南京市', '长江大桥', '。'], ['汤姆', '生病', '了', '。'], ['他', '去', '了', '医院', '。']]
  3. [['ns', 'ns', 'wp'], ['nh', 'v', 'u', 'wp'], ['r', 'v', 'u', 'n', 'wp']]

3.3 PyTorch基础


3.3.1 张量的基本概念


  1. import torch
  2. print(torch.empty(2, 3)) # 创建一个形状为(2, 3)的空张量(未初始化)
  1. tensor([[0., 0., 0.],
  2. [0., 0., 0.]])
print(torch.rand(2, 3))  # 创建一个形状为(2, 3)的随机张量,每个值从[0, 1)之间的均匀分布中采样
  1. tensor([[0.5289, 0.8055, 0.9490],
  2. [0.7827, 0.0692, 0.5653]])
print(torch.randn(2, 3))  # 创建一个形状为(2, 3)的随机张量,每个值从标准正态分布(均值为0,方差为1)中采样
  1. tensor([[ 0.4637, 0.1505, -0.0608],
  2. [-0.6243, 0.2489, -1.2854]])
print(torch.zeros(2, 3, dtype=torch.long))  # 创建一个形状为(2, 3)的0张量,其中dtype设置张量的数据类型,此处为整数
  1. tensor([[0, 0, 0],
  2. [0, 0, 0]])
print(torch.zeros(2, 3, dtype=torch.float))  # 创建一个形状为(2, 3)的0张量,类型为单精度浮点数(torch.float即32位的torch.float32)
  1. tensor([[0., 0., 0.],
  2. [0., 0., 0.]])
print(torch.tensor([[1.0, 3.8, 2.1], [8.6, 4.0, 2.4]]))  # 通过Python列表创建张量
  1. tensor([[1.0000, 3.8000, 2.1000],
  2. [8.6000, 4.0000, 2.4000]])
print(torch.arange(10))  # 生成包含0至9,共10个数字的张量
tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])


  1. import torch
  2. print(torch.empty(2, 3).cuda()) # 创建一个形状为(2, 3)的空张量(未初始化)
  3. print(torch.rand(2, 3).to('cuda')) # 创建一个形状为(2, 3)的随机张量,每个值从[0, 1)之间的均匀分布中采样
  4. print(torch.randn(2, 3, device='cuda')) # 创建一个形状为(2, 3)的随机张量,每个值从标准正态分布(均值为0,方差为1)中采样
  1. tensor([[-1.6376e+35, nan, 2.3727e-35],
  2. [ nan, -6.6785e-21, nan]], device='cuda:0')
  3. tensor([[0.9834, 0.7833, 0.0906],
  4. [0.8226, 0.6121, 0.7161]], device='cuda:0')
  5. tensor([[-1.1911, -1.8766, -0.5631],
  6. [ 0.1184, -0.5241, 1.5375]], device='cuda:0')

3.3.2 张量的基本运算

  1. import torch
  2. x = torch.tensor([1, 2, 3], dtype=torch.double)
  3. y = torch.tensor([4, 5, 6], dtype=torch.double)
  4. print(x + y)
tensor([5., 7., 9.], dtype=torch.float64)
print(x - y)
tensor([-3., -3., -3.], dtype=torch.float64)
print(x * y)
tensor([ 4., 10., 18.], dtype=torch.float64)
print(x / y)
tensor([0.2500, 0.4000, 0.5000], dtype=torch.float64)
print(x.dot(y))  # 向量x和y的点积
tensor(32., dtype=torch.float64)
print(x.sin())  # 对x按元素求正弦值
tensor([0.8415, 0.9093, 0.1411], dtype=torch.float64)
print(x.exp())  # 对x按元素求e^x
tensor([ 2.7183,  7.3891, 20.0855], dtype=torch.float64)
print(torch.cat((x, y), dim=0))
tensor([1., 2., 3., 4., 5., 6.], dtype=torch.float64)
  1. import torch
  2. x = torch.tensor([[1, 2, 3], [1, 2, 3]], dtype=torch.double)
  3. print(x.mean())
  4. print(x.mean(axis=0))
  5. print(x.mean(axis=1))
  1. tensor(2., dtype=torch.float64)
  2. tensor([1., 2., 3.], dtype=torch.float64)
  3. tensor([2., 2.], dtype=torch.float64)
  import timeit

  import torch


  def mean_matmul_seconds(matrix, number=500):
      """Return the mean seconds taken by one ``matrix.mm(matrix).mm(matrix)``.

      Args:
          matrix: A square 2-D tensor (``mm`` requires matching inner sizes).
          number: How many times the chained product is executed.

      Returns:
          Total elapsed time reported by ``timeit`` divided by ``number``.
      """
      # ``timeit -n 500 <expr>`` is IPython magic syntax and is not valid in a
      # plain Python script; the stdlib ``timeit`` module does the same job.
      total = timeit.timeit(lambda: matrix.mm(matrix).mm(matrix), number=number)
      return total / number


  if __name__ == "__main__":
      M = torch.rand(1000, 1000)
      print('{:.4f}ms'.format(mean_matmul_seconds(M) * 1000))
      # Same workload on a tensor that was moved to the GPU.
      N = torch.rand(1000, 1000).cuda()
      print('{:.4f}ms'.format(mean_matmul_seconds(N) * 1000))


3.3.3 自动微分



  1. import torch
  2. x = torch.tensor([2.], requires_grad=True)
  3. y = torch.tensor([3.], requires_grad=True)
  4. z = (x + y) * (y - 2)
  5. print(z)
  6. z.backward()
  7. print(x.grad, y.grad)
  1. tensor([5.], grad_fn=<MulBackward0>)
  2. tensor([1.]) tensor([6.])

3.3.4 调整张量形状

  1. import torch
  2. x = torch.tensor([1, 2, 3, 4, 5, 6])
  3. print(x, x.shape)
  4. x = x.view(2, 3) # 将x的形状调整为(2, 3)
  5. print(x)
  6. x = x.reshape(2, 3)
  7. print(x)
  8. x = x.view(3, 2) # 将x的形状调整为(3, 2)
  9. print(x)
  10. x = x.reshape(3, 2)
  11. print(x)
  12. x = x.view(-1, 3) # -1位置的大小可以通过其他维的大小推断出,此处为2
  13. print(x)
  14. x = x.reshape(-1, 3)
  15. print(x)
  1. tensor([1, 2, 3, 4, 5, 6]) torch.Size([6])
  2. tensor([[1, 2, 3],
  3. [4, 5, 6]])
  4. tensor([[1, 2, 3],
  5. [4, 5, 6]])
  6. tensor([[1, 2],
  7. [3, 4],
  8. [5, 6]])
  9. tensor([[1, 2],
  10. [3, 4],
  11. [5, 6]])
  12. tensor([[1, 2, 3],
  13. [4, 5, 6]])
  14. tensor([[1, 2, 3],
  15. [4, 5, 6]])
  1. import torch
  2. x = torch.tensor([[1, 2, 3], [4, 5, 6]])
  3. print(x)
  4. x = x.transpose(0, 1)
  5. print(x)
  1. tensor([[1, 2, 3],
  2. [4, 5, 6]])
  3. tensor([[1, 4],
  4. [2, 5],
  5. [3, 6]])
  1. import torch
  2. x = torch.tensor([[[1, 2, 3], [4, 5, 6]]])
  3. print(x, x.shape)
  4. x = x.permute(2, 0, 1)
  5. print(x, x.shape)
  1. tensor([[[1, 2, 3],
  2. [4, 5, 6]]]) torch.Size([1, 2, 3])
  3. tensor([[[1, 4]],
  4. [[2, 5]],
  5. [[3, 6]]]) torch.Size([3, 1, 2])

3.3.5 广播机制

  1. import torch
  2. x = torch.arange(1, 4).view(3, 1)
  3. y = torch.arange(4, 6).view(1, 2)
  4. print(x)
  5. print(y)
  6. print(x + y)
  1. tensor([[1],
  2. [2],
  3. [3]])
  4. tensor([[4, 5]])
  5. tensor([[5, 6],
  6. [6, 7],
  7. [7, 8]])

3.3.6 索引与切片


  1. import torch
  2. x = torch.arange(12).view(3, 4)
  3. print(x)
  4. print(x[1, 3]) # 第2行,第4列的元素(7)
  5. print(x[1]) # 第2行全部元素
  6. print(x[1:3]) # 第2、3两行元素
  7. print(x[:, 2]) # 第3列全部元素
  8. print(x[:, 2:4]) # 第3、4两列元素
  9. x[:, 2:4] = 100 # 第3、4两列元素全部赋值为100
  10. print(x)
  1. tensor([[ 0, 1, 2, 3],
  2. [ 4, 5, 6, 7],
  3. [ 8, 9, 10, 11]])
  4. tensor(7)
  5. tensor([4, 5, 6, 7])
  6. tensor([[ 4, 5, 6, 7],
  7. [ 8, 9, 10, 11]])
  8. tensor([ 2, 6, 10])
  9. tensor([[ 2, 3],
  10. [ 6, 7],
  11. [10, 11]])
  12. tensor([[ 0, 1, 100, 100],
  13. [ 4, 5, 100, 100],
  14. [ 8, 9, 100, 100]])

3.3.7 降维与升维

具体来讲,所谓升维,就是通过调用torch.unsqueeze(input, dim, out=None)函数,对输入张量的dim位置插入维度1,并返回一个新的张量。与索引相同,dim的值也可以为负数。

降维恰好相反,使用torch.squeeze(input, dim=None, out=None)函数,在不指定dim时,张量中形状为1的所有维都将被除去。

  1. import torch
  2. a = torch.tensor([1, 2, 3, 4])
  3. print(a.shape)
  4. b = torch.unsqueeze(a, dim=0) # 将a的第1维升高
  5. print(b, b.shape) # 打印b以及b的形状
  6. b = a.unsqueeze(dim=0) # unsqueeze函数的另一种等价调用方式
  7. print(b, b.shape)
  8. c = b.squeeze() # 对b进行降维,去掉所有形状中为1的维
  9. print(c, c.shape)
  1. torch.Size([4])
  2. tensor([[1, 2, 3, 4]]) torch.Size([1, 4])
  3. tensor([[1, 2, 3, 4]]) torch.Size([1, 4])
  4. tensor([1, 2, 3, 4]) torch.Size([4])

3.4 大规模预训练数据


3.4.1 维基百科数据

3.4.2 原始数据的获取

3.4.3 语料处理方法


pip install wikiextractor
python -m wikiextractor.WikiExtractor 维基百科快照文件



pip install opencc
python convert_t2s.py input_file > output_file
  import sys

  import opencc


  def main():
      """Print every line of the input file after OpenCC 't2s.json' conversion.

      Usage: python convert_t2s.py input_file > output_file
      """
      converter = opencc.OpenCC('t2s.json')
      # sys.argv[0] is the script's own path; the input file is the first
      # real command-line argument.
      with open(sys.argv[1], 'r') as f_in:
          for line in f_in:
              print(converter.convert(line.strip()))


  if __name__ == "__main__":
      main()



python wikidata_cleaning.py input_file > output_file
  import re
  import sys

  # The opening alternative is anchored at the start of the line; "</doc>" is
  # searched anywhere. "<doc id" is left open so that an attribute value such
  # as <doc id="12" ...> still matches.
  _DOC_MARKER = re.compile(r'^(<doc id)|(</doc>)')
  _HTML_TAG = re.compile(r'<[^>]+>', re.S)
  # Code points 0-31 and 127-159.
  _CONTROL_CHARS = re.compile(r'[\x00-\x1f\x7f-\x9f]')


  def remove_empty_paired_punc(in_str):
      """Return in_str without the empty pairs (), 《》, 【】 and []."""
      return in_str.replace('()', '').replace('《》', '').replace('【】', '').replace('[]', '')


  def remove_html_tags(in_str):
      """Return in_str with every <...> span removed."""
      return _HTML_TAG.sub('', in_str)


  def remove_contro_chars(in_str):
      """Return in_str without characters in the ranges 0-31 and 127-159.

      The function name is kept exactly as it was spelled so that existing
      callers continue to work.
      """
      return _CONTROL_CHARS.sub('', in_str)


  def clean_line(line):
      """Strip one line and clean it, leaving <doc id ...> / </doc> lines as is."""
      line = line.strip()
      if _DOC_MARKER.search(line):
          return line
      line = remove_empty_paired_punc(line)
      line = remove_html_tags(line)
      return remove_contro_chars(line)


  def main():
      """Clean the file given as the first command-line argument.

      Usage: python wikidata_cleaning.py input_file > output_file
      """
      # sys.argv[0] is the script itself; the input file is sys.argv[1].
      with open(sys.argv[1], 'r') as f_in:
          for line in f_in:
              print(clean_line(line))


  if __name__ == "__main__":
      main()

3.4.4 Common Crawl数据

3.5 更多数据集

  from pprint import pprint

  from datasets import list_datasets, load_dataset

  # Number of datasets the library knows about.
  datasets_list = list_datasets()
  print(len(datasets_list))
  # Training split of the 'sst' dataset.
  dataset = load_dataset('sst', split='train')
  print(len(dataset))
  # pprint() writes to stdout itself and returns None, so wrapping it in
  # print() would emit an extra "None" line.
  pprint(dataset[0])
  from datasets import list_metrics, load_metric

  # Names of all available metrics.
  metrics_list = list_metrics()
  print(len(metrics_list))
  print(','.join(metrics_list))
  accuracy_metric = load_metric('accuracy')
  # compute() takes the gold labels under the plural keyword `references`.
  results = accuracy_metric.compute(references=[0, 1, 0], predictions=[1, 1, 0])
  print(results)



  import nltk

  emma = nltk.corpus.gutenberg.words('austen-emma.txt')
  stopwords = nltk.corpus.stopwords.words('english')
  print(stopwords)
  # The stop-word list is probed once per corpus token; a set makes each
  # membership test constant-time instead of a scan over the whole list.
  stopword_set = set(stopwords)
  emma = [w.lower() for w in emma]
  emma_without_stopwords = [w for w in emma if w not in stopword_set]
  print(emma_without_stopwords)


  from nltk.corpus import wordnet

  word1 = 'dog'
  word2 = 'cat'
  word1_synsets = wordnet.synsets(word1)
  word2_synsets = wordnet.synsets(word2)
  # Score every pair of senses of the two words.
  scores = [w1.path_similarity(w2) for w1 in word1_synsets for w2 in word2_synsets]
  # path_similarity() may return None for a pair; comparing None with a float
  # inside max() raises TypeError on Python 3, so such pairs are skipped.
  # `default=None` covers the case where no pair produced a score at all.
  result = max((s for s in scores if s is not None), default=None)
  print(result)


  1. import nltk
  2. sentence = ['welcome', 'to', 'harbin', 'institute', 'of', 'technology']
  3. sentence_tag = nltk.pos_tag(sentence)
  4. tag_map = {'NN': 'n', 'NNP': 'n', 'NNS': 'n', 'UH': 'n', 'VB': 'v', 'VBD': 'v', 'VBG': 'v', 'VBN': 'v', 'VBZ': 'v', 'JJ': 'a', 'JJR': 'a', 'JJS': 'a', 'RB': 'r', 'RBR': 'r', 'RBS': 'r', 'RP': 'r', 'WRB': 'r'}
  5. sentence_tag = [(t[0], tag_map[t[1]]) if t[1] in tag_map else (t[0], '') for t in sentence_tag]
  6. sentiment_synsets = [list(nltk.corpus.sentiwordnet.senti_synsets(t[0], t[1])) for t in sentence_tag]
  7. score = sum(sum([x.pos_score() - x.neg_score() for x in s]) / len(s) for s in sentiment_synsets if len(s) !=0)
  8. print(score)


  1. import ltp
  2. sentence = ['南京市长江大桥', '行行行', '结婚的和尚未结婚的确实在干扰分词啊']
  3. ltp_model = ltp.LTP()
  4. segment, _ = ltp_model.seg(sentence)
  5. print(segment)
[['南京市', '长江大桥'], ['行行行'], ['结婚', '的和尚未', '结婚', '的确实在', '干扰分词', '啊']]


  1. import torch
  2. a = torch.randn(2, 3, 4)
  3. print(a)
  4. print(a.shape)
  5. a = a.permute(1, 0, 2)
  6. print(a.shape)
  7. # print(a.view(2, 3, 4))
  1. print(a.view(2, 3, 4))
  2. RuntimeError: view size is not compatible with input tensor's size and stride (at least one dimension spans across two contiguous subspaces). Use .reshape(...) instead.
  1. print(a.reshape(2, 3, 4).shape)
  2. a = torch.rand(3, 4, 5)
  3. b = a.transpose(0, 1).transpose(1, 2)
  4. print(b.shape)
  5. c = a.permute(1, 2, 0)
  6. print(c.shape)
  1. torch.Size([2, 3, 4])
  2. torch.Size([4, 5, 3])
  3. torch.Size([4, 5, 3])


  1. import timeit
  2. it1 = timeit.Timer('M.mm(M).mm(M)', 'import torch\nM = torch.rand(1000, 1000)')
  3. t1 = it1.timeit(500)/500
  4. print('{:.4f}ms'.format(t1*1000))
  5. it2 = timeit.Timer('M.mm(M)', 'import torch\nM = torch.rand(1000, 1000).cuda()')
  6. t2 = it2.timeit(500)/500
  7. print('{:.4f}ms'.format(t2*1000))
  1. 29.2605ms
  2. 0.9476ms

7.下载最新的Common Crawl数据,并实现抽取中文、去重、简繁转换、数据清洗等功能。


  import re

  # Everything that is NOT a character in \u4e00-\u9fa5, an ASCII digit or
  # whitespace. A raw string keeps "\s" from being treated as an (invalid)
  # string-literal escape; the re module resolves \uXXXX and \s itself.
  _NOT_KEPT = re.compile(r'[^\u4e00-\u9fa50-9\s]')


  def translate(str):
      """Return the input with all characters matched by _NOT_KEPT removed.

      The parameter keeps its original name `str` so keyword callers still work.
      """
      return _NOT_KEPT.sub('', str)


  print(translate('你好hello哈哈哈'))


  import sys


  def unique_lines(lines):
      """Yield each stripped line the first time it occurs, in input order.

      Args:
          lines: Any iterable of strings (for example an open text file).
      """
      # A set already does the hash lookup plus equality check that the
      # hand-written hash -> list-of-lines table was reproducing.
      seen = set()
      for line in lines:
          line = line.strip()
          if line not in seen:
              seen.add(line)
              yield line


  def main():
      """Print the distinct lines of the file named by the first argument."""
      # The context manager closes the file even if printing fails part-way.
      with open(sys.argv[1], 'r') as f_in:
          for line in unique_lines(f_in):
              print(line)


  if __name__ == "__main__":
      main()


  import sys

  import opencc


  def main():
      """Print each line of the input file after OpenCC 't2s.json' conversion."""
      converter = opencc.OpenCC('t2s.json')  # conversion configuration file
      # The context manager closes the input file once the loop ends or fails.
      with open(sys.argv[1], 'r') as f_in:
          for line in f_in:
              print(converter.convert(line.strip()))


  if __name__ == "__main__":
      main()


  import re

  # Compiled once at import time instead of on every call.
  _HTML_TAG = re.compile(r'<[^>]+>', re.S)
  # Code points 0-31 and 127-159.
  _CONTROL_CHARS = re.compile(r'[\x00-\x1f\x7f-\x9f]')


  def remove_empty_paired_punc(in_str):
      """Return in_str without the empty pairs (), 《》, 【】 and []."""
      return in_str.replace('()', '').replace('《》', '').replace('【】', '').replace('[]', '')


  def remove_html_tag(in_str):
      """Return in_str with every <...> span removed (a span may cross newlines)."""
      return _HTML_TAG.sub('', in_str)


  def remove_control_chars(in_str):
      """Return in_str without characters in the ranges 0-31 and 127-159."""
      return _CONTROL_CHARS.sub('', in_str)


  import re
  import sys

  # Everything that is NOT a character in \u4e00-\u9fa5, an ASCII digit or
  # whitespace. Raw string: "\s" is not a valid string-literal escape.
  _NOT_KEPT = re.compile(r'[^\u4e00-\u9fa50-9\s]')
  _HTML_TAG = re.compile(r'<[^>]+>', re.S)
  # Code points 0-31 and 127-159.
  _CONTROL_CHARS = re.compile(r'[\x00-\x1f\x7f-\x9f]')


  def translate(str):
      """Return the input with all characters matched by _NOT_KEPT removed.

      The parameter keeps its original name `str` so keyword callers still work.
      """
      return _NOT_KEPT.sub('', str)


  def remove_empty_paired_punc(in_str):
      """Return in_str without the empty pairs (), 《》, 【】 and []."""
      return in_str.replace('()', '').replace('《》', '').replace('【】', '').replace('[]', '')


  def remove_html_tag(in_str):
      """Return in_str with every <...> span removed."""
      return _HTML_TAG.sub('', in_str)


  def remove_control_chars(in_str):
      """Return in_str without characters in the ranges 0-31 and 127-159."""
      return _CONTROL_CHARS.sub('', in_str)


  def main():
      """Clean, filter, convert and de-duplicate the lines of the input file.

      The input path is the first command-line argument; each distinct
      processed line is printed once, in order of first appearance.
      """
      # Imported here so the text helpers above stay importable on machines
      # where the third-party OpenCC package is not installed.
      import opencc

      converter = opencc.OpenCC('t2s.json')  # conversion configuration file
      seen = set()  # processed lines that were already printed
      with open(sys.argv[1], 'r') as f_in:
          for line in f_in:
              line = line.strip()
              # Data cleaning
              line = remove_empty_paired_punc(line)
              line = remove_html_tag(line)
              line = remove_control_chars(line)
              # Keep only the characters allowed by translate()
              line = translate(line)
              # Script conversion with the 't2s.json' configuration
              line = converter.convert(line)
              # De-duplication
              if line not in seen:
                  seen.add(line)
                  print(line)


  if __name__ == "__main__":
      main()

