In this project you will build an intent-classification system, mainly using the SVM algorithm. The dataset is the SMP2018 Chinese Human-Computer Dialogue Technology Evaluation data, provided by iFLYTEK (科大讯飞股份有限公司). It can be downloaded from https://worksheets.codalab.org/worksheets/0x27203f932f8341b79841d50ce0fd684f/#; after downloading, unzip the dataset into the root directory of this project. The tasks you are about to complete are: build tfidf features; download the tiny word embedding from https://github.com/Embedding/Chinese-Word-Vectors; build SVM models using the tfidf and word2vec features respectively, and compare the two sets of results.
import numpy as np
import pandas as pd
import jieba  # thanks to this magic package you do not have to worry about how to segment words; refer to this file if you want the details
import requests
import os
from collections import Counter
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models import Word2Vec
from sklearn.svm import SVC
from sklearn import metrics
from tqdm import tqdm
import pickle
import warnings

warnings.filterwarnings('ignore')
# Download the data; the dataset is SMP2018
# The training set is saved to train.json and the test set to test.json in the current directory
# You can also download it with: ! wget https://worksheets.codalab.org/rest/bundles/0x0161fd2fb40d4dd48541c2643d04b0b8/contents/blob/
if not os.path.exists('train.json'):
    trainData = requests.get("https://worksheets.codalab.org/rest/bundles/0x0161fd2fb40d4dd48541c2643d04b0b8/contents/blob/")
    with open("train.json", "wb") as f:
        f.write(trainData.content)

if not os.path.exists('test.json'):
    testData = requests.get("https://worksheets.codalab.org/rest/bundles/0x1f96bc12222641209ad057e762910252/contents/blob/")
    with open("test.json", "wb") as f:
        f.write(testData.content)
# Read the data into DataFrames
train_df = pd.read_json("train.json").transpose()
test_df = pd.read_json("test.json").transpose()
# Take a first look at the data to make sure nothing is wrong
print("train / test shapes:", train_df.shape, test_df.shape)
print("label classes:", train_df.label.unique())  # inspect how many distinct labels there are and what they are
labelName = train_df.label.unique()  # list of all labels
# Map each text label to an integer index  hint: zip, dict

label_index_dict = dict(zip(labelName, range(len(labelName))))

# Inspect the label-to-index mapping

print(label_index_dict)
# Count how many examples each class has  hint: groupby().count() / value_counts()
train_df['label'].value_counts()
# Convert the text labels in the DataFrames to integer indices  hint: map
train_df["labelIndex"] = train_df.label.map(lambda x: label_index_dict.get(x))
test_df["labelIndex"] = test_df.label.map(lambda x: label_index_dict.get(x))
# Segment the query text into words  hint: jieba.cut

# jieba.cut returns a generator, so it needs to be converted  hint: list

def query_cut(query):
    return list(jieba.cut(query))

train_df["queryCut"] = train_df["query"].apply(query_cut)
test_df["queryCut"] = test_df["query"].apply(query_cut)
# Inspect the segmentation result

train_df.head()
# Download a Chinese stop word list and save it to stopWord.json; source: https://github.com/goto456/stopwords/blob/master/%E4%B8%AD%E6%96%87%E5%81%9C%E7%94%A8%E8%AF%8D%E8%A1%A8.txt
if not os.path.exists('stopWord.json'):
    stopWord = requests.get("https://raw.github.com/goto456/stopwords/master/%E4%B8%AD%E6%96%87%E5%81%9C%E7%94%A8%E8%AF%8D%E8%A1%A8.txt")
    with open("stopWord.json", "wb") as f:
        f.write(stopWord.content)
# Read the stop words (the downloaded file contains one stop word per line)

with open("stopWord.json", "r", encoding='utf-8') as f:
    stopWords = f.read().split("\n")
# Inspect the stop words

stopWords[0:30]
# Filter the segmentation result from the previous step with the stop word list

def rm_stop_word(wordList):
    return [word for word in wordList if word not in stopWords]

train_df["queryCutRMStopWord"] = train_df["queryCut"].apply(rm_stop_word)
test_df["queryCutRMStopWord"] = test_df["queryCut"].apply(rm_stop_word)
# Inspect the result after stop word removal

train_df.head()
# Count word frequencies  hint: collections.Counter()
allWords = [word for query in train_df.queryCutRMStopWord for word in query]  # list of all words in the training queries
freWord = dict(Counter(allWords))  # word-frequency dict: word -> occurrence count
# Filter out low-frequency words
highFreWords = {word for word in freWord.keys() if freWord[word] > 3}  # set of words occurring more than 3 times

def rm_low_fre_word(query):
    return [word for word in query if word in highFreWords]

# Remove low-frequency words
train_df["queryFinal"] = train_df["queryCutRMStopWord"].apply(rm_low_fre_word)
test_df["queryFinal"] = test_df["queryCutRMStopWord"].apply(rm_low_fre_word)
TFIDF
The tfidf vectorizer has a few key parameters:
ngram_range: tuple (min_n, max_n). The lower and upper bounds of the range of n-values for the n-grams to be extracted; all values of n with min_n <= n <= max_n are used.
stop_words: string {'english'}, list, or None (default).
If 'english', a built-in English stop word list is used.
If a list, it is assumed to contain stop words, all of which will be removed from the tokens.
If None, no stop word list is used; max_df can instead be set to a value in the range [0.7, 1.0) to automatically detect and filter stop words based on corpus document frequency.
max_df: float in range [0.0, 1.0] or int, optional, default 1.0.
When building the vocabulary, ignore terms whose document frequency is strictly higher than the given threshold (corpus-specific stop words). A float represents a proportion of documents, an integer an absolute count. This parameter is ignored if a vocabulary is given.
min_df: float in range [0.0, 1.0] or int, optional, default 1.
When building the vocabulary, ignore terms whose document frequency is strictly lower than the given threshold. A float represents a proportion of documents, an integer an absolute count. This parameter is ignored if a vocabulary is given.
max_features: optional, default None.
If not None, build a vocabulary that only considers the top max_features terms ordered by term frequency across the corpus. This parameter is ignored if a vocabulary is given.
norm: 'l1', 'l2', or None, optional.
The norm used to normalize term vectors; None means no normalization.
smooth_idf: boolean, optional.
Smooth idf weights by adding 1 to document frequencies, as if an extra document containing every term were seen once; this prevents division by zero.
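For illustration, here is a minimal sketch of how these parameters could be passed to sklearn's TfidfVectorizer; the values below are arbitrary examples, not the settings used later in this project (which builds tfidf with CountVectorizer + TfidfTransformer instead).

from sklearn.feature_extraction.text import TfidfVectorizer

# Illustrative parameter values only; not the configuration used in this project
example_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),   # extract unigrams and bigrams
    max_df=0.9,           # drop terms appearing in more than 90% of documents
    min_df=2,             # drop terms appearing in fewer than 2 documents
    max_features=5000,    # keep at most the 5000 most frequent terms
    norm='l2',            # normalize each document vector to unit L2 norm
    smooth_idf=True,      # add-one smoothing of document frequencies
)
example_tfidf = example_vectorizer.fit_transform(["已 分词 的 文本 一", "已 分词 的 文本 二"])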
# Convert the segmented and filtered text into tfidf features

trainText = [' '.join(query) for query in train_df["queryFinal"]]
testText = [' '.join(query) for query in test_df["queryFinal"]]
allText = trainText + testText

# sklearn tfidf: CountVectorizer + TfidfTransformer, fit on the combined train and test text
vectorizer = CountVectorizer()
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(vectorizer.fit_transform(allText))
# Split the combined tfidf matrix back into the predefined train and test sets
trainLen = len(train_df)
tfidf_array = tfidf.toarray()
train_x_tfidf = tfidf_array[0:trainLen]
test_x_tfidf = tfidf_array[trainLen:]
train_y_tfidf = train_df["labelIndex"]
test_y_tfidf = test_df["labelIndex"]
# Inspect the shapes after splitting
print("train_x_tfidf.shape =", train_x_tfidf.shape)
print("train_y_tfidf.shape =", train_y_tfidf.shape)
print("test_x_tfidf.shape =", test_x_tfidf.shape)
print("test_y_tfidf.shape =", test_y_tfidf.shape)
word embedding
gensim provides many useful utilities for working with word vectors; see the documentation for details: https://radimrehurek.com/gensim/models/word2vec.html (a small gensim training sketch follows the code below).
# Load the pre-trained word vectors; source: https://github.com/Embedding/Chinese-Word-Vectors
with open("tiny_word2vec.pickle", "rb") as f:
    word2vec = pickle.load(f)
# Example word vector
word2vec["今天"]

# Convert each filtered, segmented query into a vector of fixed dimension
vocabulary = word2vec.keys()
embedding_dim = len(word2vec["今天"])  # dimension of the pre-trained vectors

def sentence2vec(query):
    # Average the vectors of all words in the query; words not in the vocabulary
    # are replaced by a random vector with components in [-1, 1)
    if len(query) == 0:
        return np.zeros(embedding_dim)
    vectors = []
    for word in query:
        if word in vocabulary:
            vectors.append(word2vec[word])
        else:
            vectors.append(-1 + 2 * np.random.random(size=embedding_dim))
    return np.array(vectors).mean(axis=0)

# Build the train and test feature matrices from the sentence vectors
train_x_vec = np.vstack(train_df["queryCutRMStopWord"].apply(sentence2vec))
test_x_vec = np.vstack(test_df["queryCutRMStopWord"].apply(sentence2vec))
train_y_vec = train_df["labelIndex"]
test_y_vec = test_df["labelIndex"]
# Inspect the shapes after splitting
print("train_x_vec.shape =", train_x_vec.shape)
print("train_y_vec.shape =", train_y_vec.shape)
print("test_x_vec.shape =", test_x_vec.shape)
print("test_y_vec.shape =", test_y_vec.shape)
4. Support Vector Machines
SVM model (thanks to sklearn); see the documentation for details: https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
Key SVC parameters:
C: penalty parameter of the C-SVC, default 1.0. A larger C penalizes the slack variables more strongly, pushing them toward 0; misclassification is punished harder and the model tries to classify the whole training set correctly, so training accuracy is high but generalization is weak. A smaller C reduces the penalty on misclassification, tolerates some errors as noise, and generalizes better.
kernel: kernel function, default 'rbf'; one of 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed':
linear: u'v
poly: (gamma*u'v + coef0)^degree
rbf: exp(-gamma*|u-v|^2)
sigmoid: tanh(gamma*u'v + coef0)
degree: degree of the 'poly' kernel, default 3; ignored for other kernels.
gamma: kernel coefficient for 'rbf', 'poly' and 'sigmoid'. The old default 'auto' corresponds to 1/n_features; newer scikit-learn versions default to 'scale'.
coef0: constant term of the kernel function; only relevant for 'poly' and 'sigmoid'.
probability: whether to enable probability estimates, default False.
shrinking: whether to use the shrinking heuristic, default True.
decision_function_shape: 'ovo', 'ovr' or None; older scikit-learn versions defaulted to None, current versions default to 'ovr'.
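To make the C/gamma trade-off concrete, here is a minimal sketch of tuning these two parameters with cross-validated grid search on the tfidf features; the grid values are arbitrary illustrations, not tuned settings.

from sklearn.model_selection import GridSearchCV

# Illustrative grid only; searches C and gamma for an RBF SVM using macro F1 as the metric
param_grid = {"C": [0.1, 1, 10], "gamma": [0.01, 0.1, 1]}
grid = GridSearchCV(SVC(kernel="rbf"), param_grid, cv=3, scoring="f1_macro")
grid.fit(train_x_tfidf, train_y_tfidf)
print("best params:", grid.best_params_)
print("best cv macro-F1:", grid.best_score_)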
# Build a linear SVM on the tfidf features  hint: SVC()

tfidfLinearSVM = SVC(kernel='linear')
tfidfLinearSVM.fit(train_x_tfidf, train_y_tfidf)

# Report accuracy and macro F1 on the train and test sets

print('train accuracy %s' % metrics.accuracy_score(train_y_tfidf, tfidfLinearSVM.predict(train_x_tfidf)))
print('train F1_score %s' % metrics.f1_score(train_y_tfidf, tfidfLinearSVM.predict(train_x_tfidf), average="macro"))
print('test accuracy %s' % metrics.accuracy_score(test_y_tfidf, tfidfLinearSVM.predict(test_x_tfidf)))
print('test F1_score %s' % metrics.f1_score(test_y_tfidf, tfidfLinearSVM.predict(test_x_tfidf), average="macro"))
# Build an `rbf` SVM on the tfidf features

tfidfKernelizedSVM = SVC(kernel='rbf')
tfidfKernelizedSVM.fit(train_x_tfidf, train_y_tfidf)

# Report accuracy and macro F1 on the train and test sets

print('train accuracy %s' % metrics.accuracy_score(train_y_tfidf, tfidfKernelizedSVM.predict(train_x_tfidf)))
print('train F1_score %s' % metrics.f1_score(train_y_tfidf, tfidfKernelizedSVM.predict(train_x_tfidf), average="macro"))
print('test accuracy %s' % metrics.accuracy_score(test_y_tfidf, tfidfKernelizedSVM.predict(test_x_tfidf)))
print('test F1_score %s' % metrics.f1_score(test_y_tfidf, tfidfKernelizedSVM.predict(test_x_tfidf), average="macro"))
# Build a linear SVM on the embedding features

word2vecLinearSVM = SVC(kernel='linear')
word2vecLinearSVM.fit(train_x_vec, train_y_vec)

# Report accuracy and macro F1 on the train and test sets

print('train accuracy %s' % metrics.accuracy_score(train_y_vec, word2vecLinearSVM.predict(train_x_vec)))
print('train F1_score %s' % metrics.f1_score(train_y_vec, word2vecLinearSVM.predict(train_x_vec), average="macro"))
print('test accuracy %s' % metrics.accuracy_score(test_y_vec, word2vecLinearSVM.predict(test_x_vec)))
print('test F1_score %s' % metrics.f1_score(test_y_vec, word2vecLinearSVM.predict(test_x_vec), average="macro"))
# Build an `rbf` SVM on the embedding features

word2vecKernelizedSVM = SVC(kernel='rbf')
word2vecKernelizedSVM.fit(train_x_vec, train_y_vec)

# Report accuracy and macro F1 on the train and test sets

print('train accuracy %s' % metrics.accuracy_score(train_y_vec, word2vecKernelizedSVM.predict(train_x_vec)))
print('train F1_score %s' % metrics.f1_score(train_y_vec, word2vecKernelizedSVM.predict(train_x_vec), average="macro"))
print('test accuracy %s' % metrics.accuracy_score(test_y_vec, word2vecKernelizedSVM.predict(test_x_vec)))
print('test F1_score %s' % metrics.f1_score(test_y_vec, word2vecKernelizedSVM.predict(test_x_vec), average="macro"))