I. NLP Text Classification Steps
Step 1: Prepare the dataset. X: the sentences; Y: the class labels.
Step 2: Tokenize the sentences and remove stopwords (Chinese has stopwords such as 而且 and punctuation like commas; for English you also need word-form normalization such as stemming/lemmatization).
Step 3: word2idx / word2vec. With word2vec you use a corpus to train a model that maps each word to a vector; given a word, the model returns a vector, and word similarity can be computed from those vectors, which effectively normalizes the words in advance. With word2idx you simply use each word's vocabulary index as the element of the vector (see the toy sketch after this list).
Step 4: Build and train the model.
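As a toy sketch of the difference between the two encodings (an assumed example, not from the original post; the words and vectors are made up):

import numpy as np

# word2idx: each word is represented only by its position in the vocabulary.
vocab = {"<pad>": 0, "篮球": 1, "足球": 2, "经济": 3}
print([vocab[w] for w in ["篮球", "经济"]])   # [1, 3]

# word2vec: each word maps to a dense vector learned from a corpus, so
# similarity between words can be measured, e.g. with cosine similarity.
vectors = {"篮球": np.array([0.9, 0.1]),
           "足球": np.array([0.8, 0.2]),
           "经济": np.array([0.1, 0.9])}
cos = lambda a, b: a @ b / (np.linalg.norm(a) * np.linalg.norm(b))
print(cos(vectors["篮球"], vectors["足球"]))   # high: related words
print(cos(vectors["篮球"], vectors["经济"]))   # low: unrelated words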
II. Code
1. Data preparation + preprocessing
- We use the Toutiao news headline dataset as the dataset for this demo:
- https://github.com/aceimnorstuvwxz/toutiao-text-classfication-dataset
After downloading it, the data needs preprocessing. I personally prefer converting it into a JSON-style dict and splitting it 80/20 into a training set and a test set:
import random

# each raw line looks like: id_!_category_code_!_category_name_!_title_!_keywords
file = open("./toutiao.txt", 'r')
file = file.readlines()
print(file[0], len(file))
print(file[0].split("_!_"))
data = {"train": [],
        "test": [],
        "class_name": {},
        "class_info": {}}
# shuffle data
random.shuffle(file)
max_sentence = 0
for i, line in enumerate(file):
    if i < int(0.8 * len(file)):
        line = line.split("_!_")
        if line[1] not in data["class_name"].keys():
            data["class_name"][line[1]] = line[2]
        if line[1] not in data["class_info"].keys():
            data["class_info"][line[1]] = 1
        else:
            data["class_info"][line[1]] += 1

        data["train"].append({"x": line[3],
                              "y": line[1]})
        max_sentence = len(line[3]) if len(line[3]) > max_sentence else max_sentence
    else:
        line = line.split("_!_")
        if line[1] not in data["class_name"].keys():
            data["class_name"][line[1]] = line[2]
        if line[1] not in data["class_info"].keys():
            data["class_info"][line[1]] = 1
        else:
            data["class_info"][line[1]] += 1

        data["test"].append({"x": line[3],
                             "y": line[1]})
        max_sentence = len(line[3]) if len(line[3]) > max_sentence else max_sentence
data["max_sentence"] = max_sentence
data["num_train"] = len(data["train"])
data["num_test"] = len(data["test"])
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from matplotlib import gridspec

# left panel: train/test split as a pie chart; right panel: sample count per class
fig = plt.figure(figsize=(20, 4.5))
gs = gridspec.GridSpec(1, 2, width_ratios=[1, 2.5])
ax1 = plt.subplot(gs[0])
ax2 = plt.subplot(gs[1])

counts = [data["num_train"], data["num_test"]]

colors = ['silver', 'purple']
explode = (0.1, 0)  # explode 1st slice
labels = ['train', 'test']
ax1.pie(counts, explode=explode, labels=labels, colors=colors, autopct='%1.1f%%', shadow=True, startangle=140)

counts = []
labels = []
for namecode in data["class_name"].keys():
    counts.append(data["class_info"][namecode])
    labels.append(data["class_name"][namecode])
print(len(counts), len(labels))
print(counts)
print(labels)

df = pd.DataFrame({"labels": labels,
                   "counts": counts})
ax2.bar(df["labels"], df["counts"])
ax2.set_title("nums")
ax2.set_ylabel("% nums")
# ax2.set_xticks(rotation=-15)
ax2.set_xticklabels(labels=labels, rotation=-15)

plt.show()
That completes the data analysis and preprocessing.
2. Tokenizing the sentences
import re
import jieba

# load the stopword list (one stopword per line)
stopwords = [i.strip() for i in open('stop_words.txt').readlines()]

def pretty_cut(sentence):
    # keep only Chinese characters, run jieba in full mode, then drop stopwords
    cut_list = jieba.lcut(''.join(re.findall('[\u4e00-\u9fa5]', sentence)), cut_all=True)
    for i in range(len(cut_list) - 1, -1, -1):
        if cut_list[i] in stopwords:
            del cut_list[i]
    return cut_list
The stop_words.txt file can be downloaded from GitHub:
GitHub - goto456/stopwords: commonly used Chinese stopword lists (the HIT list, the Baidu list, etc.)
Using "cn_stopwords.txt" from that repo is enough.
The result: each headline becomes a list of content words with the stopwords removed.
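A hedged usage sketch (the printed tokens depend on your jieba version and dictionary, so treat the output as illustrative only):

sample = data["train"][0]["x"]      # any headline from the dataset
print(sample)
print(pretty_cut(sample))           # a list of content words, stopwords removed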
3. Converting the Chinese text into vectors (several approaches are possible)
Method 1: train a word2vec model with gensim.
import logging
import sys
# import gensim.models as word2vec
from gensim.models.word2vec import LineSentence, logger

from gensim.models import Word2Vec

# tokenize every headline once and cache the result on each sample as "x_jieba"
train_data_wv = []
for sentence in data["train"]:
    jieba_word = " ".join(pretty_cut(sentence["x"]))
    train_data_wv.append(jieba_word)
    sentence["x_jieba"] = jieba_word
for sentence in data["test"]:
    jieba_word = " ".join(pretty_cut(sentence["x"]))
    train_data_wv.append(jieba_word)
    sentence["x_jieba"] = jieba_word

# Word2Vec expects sentences as lists of tokens, so split the joined strings back
tokenized = [s.split(' ') for s in train_data_wv]
train_w2v = Word2Vec(tokenized, window=5, min_count=0, vector_size=50, workers=10)
train_w2v.train(tokenized, total_examples=len(tokenized), epochs=10)
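A hedged usage sketch of the trained model (gensim 4.x keeps the vectors on the .wv attribute; the actual neighbours depend on your run):

some_word = train_data_wv[0].split(' ')[0]        # any word that occurs in the corpus
print(train_w2v.wv[some_word].shape)              # (50,) since vector_size=50
print(train_w2v.wv.most_similar(some_word, topn=5))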
Method 2: build a word2idx vocabulary and pad the sequences.
import numpy as np

# build the vocabulary: index 0 is reserved for padding, real words start at 1
vocabs = {}
index_word = 1
for se in train_data_wv:
    vo = set(se.split(' '))
    for word in vo:
        if word not in vocabs.keys():
            vocabs[word] = index_word
            index_word += 1

def get_pretrain_pad_seq(vocab, sentence, maxlen):
    transformed_sentence = []
    for word in sentence:
        tran_word = vocab.get(word, None)
        if tran_word:
            transformed_sentence.append(tran_word)
        else:
            transformed_sentence.append(107335)   # hard-coded out-of-vocabulary index
    # right-pad with 0 up to maxlen
    transformed_sentence += [0 for _ in range(abs(maxlen - len(sentence)))]
    return np.array(transformed_sentence)

# longest tokenized headline, used as the padded sequence length
max_len = 0
for sentence in train_data_wv:
    max_len = len(sentence.split(' ')) if len(sentence.split(' ')) > max_len else max_len
With Method 2 you normally need vocabulary size + 1 slots; the extra slot is used for padding, usually index 0.
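As a hedged illustration of how get_pretrain_pad_seq behaves (the printed indices depend on your vocabulary; the out-of-vocabulary fallback is the hard-coded 107335 from the function above):

example_tokens = data["train"][0]["x_jieba"].split(' ')
print(example_tokens)
print(get_pretrain_pad_seq(vocabs, example_tokens, max_len))   # word indices followed by 0-padding up to max_len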
Next, the whole dataset is organized and converted into training matrices:
trainX = []
trainY = []
testX = []
testY = []
for sample in data["train"]:
    jieba_word = sample["x_jieba"].split(' ')
    x = get_pretrain_pad_seq(vocabs, jieba_word, max_len)
    trainX.append(x)
    trainY.append(int(sample["y"]))
for sample in data["test"]:
    jieba_word = sample["x_jieba"].split(' ')
    x = get_pretrain_pad_seq(vocabs, jieba_word, max_len)
    testX.append(x)
    testY.append(int(sample["y"]))

trainX = np.array(trainX)
trainY = np.array(trainY)
testX = np.array(testX)
testY = np.array(testY)

import tensorflow as tf
trainY = tf.keras.utils.to_categorical(trainY)
testY = tf.keras.utils.to_categorical(testY)
print(trainX.shape, trainY.shape, testX.shape, testY.shape)
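One thing worth noting (an aside, not from the original post): the category labels in this dataset are three-digit codes rather than 0-14, so to_categorical produces one-hot vectors of length max(code) + 1, which is why class_num=117 is used when building the model below. A denser alternative would be to remap the codes first, for example:

# not applied here, so the rest of the post stays unchanged
label_map = {int(code): i for i, code in enumerate(sorted(data["class_name"]))}
print(label_map)   # maps each three-digit code to a dense index 0..14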
4. Model building and training
from tensorflow.keras import Model
from tensorflow.keras.layers import Embedding, Dense, LSTM


class TextRNN(Model):
    def __init__(self,
                 maxlen,
                 max_features,
                 embedding_dims,
                 class_num=1,
                 last_activation='sigmoid'):
        super(TextRNN, self).__init__()
        self.maxlen = maxlen
        self.max_features = max_features
        self.embedding_dims = embedding_dims
        self.class_num = class_num
        self.last_activation = last_activation
        self.embedding = Embedding(self.max_features, self.embedding_dims, input_length=self.maxlen)
        self.rnn = LSTM(128)  # LSTM or GRU
        self.classifier = Dense(self.class_num, activation=self.last_activation)

    def call(self, inputs):
        if len(inputs.get_shape()) != 2:
            raise ValueError('The rank of inputs of TextRNN must be 2, but now is %d' % len(inputs.get_shape()))
        if inputs.get_shape()[1] != self.maxlen:
            raise ValueError('The maxlen of inputs of TextRNN must be %d, but now is %d' % (self.maxlen, inputs.get_shape()[1]))
        embedding = self.embedding(inputs)
        x = self.rnn(embedding)
        output = self.classifier(x)
        return output
The above is the model code; next comes the training code:
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing import sequence

max_features = 107335  # vocabulary size + 1 (the extra slot is for padding/OOV)
maxlen = 80            # not actually used below; the model is built with max_len computed earlier
batch_size = 32
embedding_dims = 32
epochs = 10


print('Build model...')  # 15 classes, but the label codes are three-digit numbers, hence class_num=117
model = TextRNN(max_len, max_features, embedding_dims, class_num=117, last_activation='softmax')
model.compile('adam', 'categorical_crossentropy', metrics=['accuracy'])

print('Train...')
early_stopping = EarlyStopping(monitor='val_accuracy', patience=3, mode='max')
model.fit(trainX, trainY,
          batch_size=batch_size,
          epochs=epochs,
          callbacks=[early_stopping],
          validation_data=(testX, testY))

print('Test...')
result = model.predict(testX)
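As a hedged follow-up (not in the original post), the softmax probabilities returned by model.predict can be turned back into predicted labels and compared against the test labels:

pred_labels = np.argmax(result, axis=1)     # predicted label per test sample
true_labels = np.argmax(testY, axis=1)
print("test accuracy:", np.mean(pred_labels == true_labels))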
Inside the model, the Embedding layer converts each word's original index into a word vector:
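A minimal sketch of that lookup (an illustration, not from the original post): feeding a batch of padded index sequences through the model's embedding layer turns shape (batch, max_len) into (batch, max_len, embedding_dims).

emb_out = model.embedding(trainX[:2])          # two padded index sequences
print(trainX[:2].shape, "->", emb_out.shape)   # (2, max_len) -> (2, max_len, 32)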