import pandas as pd
from bs4 import BeautifulSoup

with open("textClassifier/data/rawData/unlabeledTrainData.tsv", "r", encoding='UTF-8') as f:
    unlabeledTrain = [line.strip().split("\t") for line in f.readlines() if len(line.strip().split("\t")) == 2]
#print (type(unlabeledTrain))
#print (len(unlabeledTrain))
#print (type(unlabeledTrain[0]))
print (unlabeledTrain[1])

with open("textClassifier/data/rawData/labeledTrainData.tsv", "r", encoding='UTF-8') as f:
    labeledTrain = [line.strip().split("\t") for line in f.readlines() if len(line.strip().split("\t")) == 3]
#print (type(labeledTrain))
#print (len(labeledTrain))
#print (labeledTrain[0])
#print (labeledTrain[1])

unlabel = pd.DataFrame(unlabeledTrain[1:], columns=unlabeledTrain[0])
label = pd.DataFrame(labeledTrain[1:], columns=labeledTrain[0])
#print (unlabel)

def cleanReview(subject):
    # Strip HTML tags, remove punctuation, and lowercase the review text
    beau = BeautifulSoup(subject, "html.parser")
    newSubject = beau.get_text()
    newSubject = newSubject.replace("\\", "").replace("\'", "").replace('/', '').replace('"', '').replace(',', '').replace('.', '').replace('?', '').replace('(', '').replace(')', '')
    newSubject = newSubject.strip().split(" ")
    newSubject = [word.lower() for word in newSubject]
    newSubject = " ".join(newSubject)
    return newSubject

unlabel["review"] = unlabel["review"].apply(cleanReview)
label["review"] = label["review"].apply(cleanReview)
print (unlabel.iloc[0][1])

# Merge the labeled and unlabeled reviews into a single Series
newDf = pd.concat([unlabel["review"], label["review"]], axis=0)

# Save as a txt file
newDf.to_csv("wordEmbdiing.txt", index=False)
Process the data directly with pandas; the apply method is recommended because it is fast.
After cleaning, merge the labeled and unlabeled data and save the result to a txt file.
At this point there is only one column left: all reviews!
class gensim.models.word2vec.Word2Vec(sentences=None, corpus_file=None, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=0.001, seed=1, workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=<built-in function hash>, iter=5, null_word=0, trim_rule=None, sorted_vocab=1, batch_words=10000, compute_loss=False, callbacks=(), max_final_vocab=None)
sentences: the corpus to analyze; it can be a list, or it can be streamed from a file (word2vec.LineSentence(filename)).
size: dimensionality of the word vectors, default 100. It generally depends on the corpus size; for a modest corpus, say under 100MB, the default usually works well.
window: the maximum distance between the current word and a context word; the larger it is, the more distant words are still treated as context. Default is 5.
For a small corpus this value can be set smaller.
For typical corpora a value in [5, 10] is recommended.
sg: 0 means CBOW, 1 means Skip-Gram; default is 0.
hs: selects between word2vec's two training objectives. If 0 (and the number of negative samples, negative, is greater than 0), Negative Sampling is used; if 1, Hierarchical Softmax is used. Default is 0, i.e. Negative Sampling.
negative: the number of negative samples when Negative Sampling is used, default 5.
cbow_mean: only used by CBOW when projecting the context. If 0, x_w in the algorithm is the sum of the context word vectors; if 1, it is their mean. The theory write-up describes it with the mean, and I personally prefer the mean for x_w; the default is also 1, and changing it is not recommended.
min_count: the minimum word frequency for a word to get a vector.
Code for training the model:
import logging
import gensim
from gensim.models import word2vec

# Configure logging output
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Read the txt file directly with gensim's APIs; available readers include LineSentence, Text8Corpus, PathLineSentences, etc.
sentences = word2vec.LineSentence("/data4T/share/jiangxinyang848/textClassifier/data/preProcess/wordEmbdiing.txt")

# Train the model: vector length 200, 8 iterations, skip-gram; save the model in bin format
model = gensim.models.Word2Vec(sentences, size=200, sg=1, iter=8)
model.wv.save_word2vec_format("./word2Vec" + ".bin", binary=True)

# Load the bin-format model
wordVec = gensim.models.KeyedVectors.load_word2vec_format("word2Vec.bin", binary=True)
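Once the vectors are loaded, the KeyedVectors object can be inspected directly. A minimal sanity-check sketch, assuming the pre-4.0 gensim API used above and that a common word such as "movie" survived the min_count cutoff (words outside the vocabulary raise a KeyError):

# Number of words that made it into the vocabulary
print(len(wordVec.vocab))

# The trained vector for one word: a 200-dimensional numpy array (size=200 above)
print(wordVec["movie"].shape)
print(wordVec["movie"][:5])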
Advantages:
Disadvantages:
Hangzhou [0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0]
Shanghai [0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0]
Ningbo [0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0]
Beijing [0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0]
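A minimal sketch of how such one-hot vectors are constructed (the tiny vocabulary and its ordering below are made up for illustration):

import numpy as np

# Toy vocabulary; in a real corpus V can easily be tens of thousands of words
vocab = ["Hangzhou", "Shanghai", "Ningbo", "Beijing"]
word_to_index = {word: i for i, word in enumerate(vocab)}

def one_hot(word):
    # a length-V vector with a single 1 at the word's index
    vec = np.zeros(len(vocab), dtype=int)
    vec[word_to_index[word]] = 1
    return vec

print(one_hot("Ningbo"))   # [0 0 1 0]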
Map the word "king" from the space of its potentially very sparse vector into the space of the four-dimensional vector above. This mapping
must satisfy two conditions:
the mapping is injective (look the concept up if it is unfamiliar);
the mapped vector does not lose the information contained in the original vector.
This process is called word embedding.
The resulting vectors carry semantics because the training of the final word vectors brings in each word's context.
You shall know a word by the company it keeps.
For example:
To get the word vector for "learning", the training process takes its context into account,
which makes the vector for "learning" carry semantic information.
Through this kind of operation we can find synonyms; even "cat" and its plural "cats" end up with extremely similar vectors, as the check below shows.
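With the vectors trained earlier this effect is easy to check. A minimal sketch, assuming "cat", "cats", "king", "man" and "woman" all passed the min_count threshold (otherwise a KeyError is raised):

# Plural forms usually land very close to the singular
print(wordVec.similarity("cat", "cats"))

# The classic analogy: vector("king") - vector("man") + vector("woman") is close to vector("queen")
print(wordVec.most_similar(positive=["king", "woman"], negative=["man"], topn=3))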
Now for the main part.
Suppose the vocabulary contains V words,
and N is the embedding dimension; the embedding can then be written as a V x N matrix, one row per word.
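A small numpy sketch of what that means (V, N and the matrix values below are made up): multiplying a one-hot row vector by the V x N embedding matrix simply picks out one row, which is exactly an embedding lookup.

import numpy as np

V, N = 10000, 200                 # vocabulary size and embedding dimension (illustrative values)
W = np.random.randn(V, N)         # stands in for the embedding matrix learned during training

word_index = 42                   # index of some word in the vocabulary
one_hot_vec = np.zeros(V)
one_hot_vec[word_index] = 1

embedding = one_hot_vec @ W       # identical to W[word_index]
print(np.allclose(embedding, W[word_index]))   # True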
#coding=utf-8
from sklearn.feature_extraction.text import TfidfVectorizer

document = ["I have a pen.", "I have an apple."]
tfidf_model = TfidfVectorizer().fit(document)
sparse_result = tfidf_model.transform(document)  # the tf-idf matrix in sparse representation
print(sparse_result)
print(sparse_result.todense())       # convert to a more readable dense matrix
print(tfidf_model.vocabulary_)       # mapping from words to columns

  (0, 3)    0.8148024746671689
  (0, 2)    0.5797386715376657
  (1, 2)    0.4494364165239821
  (1, 1)    0.6316672017376245
  (1, 0)    0.6316672017376245
[[0.         0.         0.57973867 0.81480247]
 [0.6316672  0.6316672  0.44943642 0.        ]]
{'have': 2, 'pen': 3, 'an': 0, 'apple': 1}
from sklearn.feature_extraction.text import TfidfVectorizer

document = ["I have a pen.", "I have an apple."]
tfidf_model = TfidfVectorizer(norm=None).fit(document)
sparse_result = tfidf_model.transform(document)  # the tf-idf matrix in sparse representation
print(sparse_result)
print(sparse_result.todense())       # convert to a more readable dense matrix
print(tfidf_model.vocabulary_)

  (0, 3)    1.4054651081081644
  (0, 2)    1.0
  (1, 2)    1.0
  (1, 1)    1.4054651081081644
  (1, 0)    1.4054651081081644
[[0.         0.         1.         1.40546511]
 [1.40546511 1.40546511 1.         0.        ]]
{'have': 2, 'pen': 3, 'an': 0, 'apple': 1}
vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf = True, max_df = 0.5)
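In the line above, stpwrdlst is a stop-word list supplied by the caller; it is not defined here. A self-contained sketch of the same configuration (the stop words below are placeholders):

from sklearn.feature_extraction.text import TfidfVectorizer

stpwrdlst = ["the", "a", "an", "is", "in", "to"]   # placeholder stop-word list; load your own in practice

vectorizer = TfidfVectorizer(stop_words=stpwrdlst,   # drop these words before counting
                             sublinear_tf=True,      # use 1 + log(tf) instead of the raw term frequency
                             max_df=0.5)             # ignore terms appearing in more than 50% of documents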
CountVectorizer(input='content', encoding='utf-8', decode_error='strict', strip_accents=None, lowercase=True, preprocessor=None, tokenizer=None, stop_words=None, token_pattern='(?u)\b\w\w+\b', ngram_range=(1, 1), analyzer='word', max_df=1.0, min_df=1, max_features=None, vocabulary=None, binary=False, dtype=<class 'numpy.int64'>)
There are many parameters; they cover three processing steps: preprocessing, tokenization, and n-gram generation.
A parameter to set for the n-gram step is ngram_range (see the sketch after the list below).
input
encoding
decode_error
strip_accents
analyzer
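A small sketch of the effect of ngram_range (the two sentences below are made up): with ngram_range=(1, 2) the vocabulary contains both unigrams and bigrams.

from sklearn.feature_extraction.text import CountVectorizer

texts = ["the cat sat", "the cat ran"]

cv = CountVectorizer(ngram_range=(1, 2))   # unigrams and bigrams together
cv.fit(texts)
print(cv.get_feature_names())
# ['cat', 'cat ran', 'cat sat', 'ran', 'sat', 'the', 'the cat']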
from sklearn.feature_extraction.text import CountVectorizer texts=["1 12 3","2 3 4","5 5", '6'] cv = CountVectorizer(analyzer='word',token_pattern="[0-9]+")#创建词袋数据结构 cv_fit=cv.fit_transform(texts) print(cv_fit.toarray()) #.toarray() 是将结果转化为稀疏矩阵矩阵的表示方式; print(cv.get_feature_names()) print(cv.vocabulary_) # {‘dog’:2,'cat':1,'fish':3,'bird':0} 字典形式呈现,key:词,value:词频 print(cv_fit) print(cv_fit.toarray()) #.toarray() 是将结果转化为稀疏矩阵矩阵的表示方式; print(cv_fit.toarray().sum(axis=0)) #每个词在所有文档中的词频 [[1 1 0 1 0 0 0] [0 0 1 1 1 0 0] [0 0 0 0 0 2 0] [0 0 0 0 0 0 1]] ['1', '12', '2', '3', '4', '5', '6'] {'1': 0, '12': 1, '3': 3, '2': 2, '4': 4, '5': 5, '6': 6} (0, 0) 1 (0, 1) 1 (0, 3) 1 (1, 3) 1 (1, 2) 1 (1, 4) 1 (2, 5) 2 (3, 6) 1 [[1 1 0 1 0 0 0] [0 0 1 1 1 0 0] [0 0 0 0 0 2 0] [0 0 0 0 0 0 1]] [1 1 1 2 1 2 1]
from sklearn.feature_extraction.text import CountVectorizer texts=["dog cat fish","dog cat cat","fish bird", 'bird'] cv = CountVectorizer()#创建词袋数据结构 cv_fit=cv.fit_transform(texts) #上述代码等价于下面两行 #cv.fit(texts) #cv_fit=cv.transform(texts) print(cv.get_feature_names()) print(cv.vocabulary_ ) print(cv_fit) print(cv_fit.toarray()) print(cv_fit.toarray().sum(axis=0)) ['bird', 'cat', 'dog', 'fish'] {'dog': 2, 'cat': 1, 'fish': 3, 'bird': 0} (0, 2) 1 (0, 1) 1 (0, 3) 1 (1, 2) 1 (1, 1) 2 (2, 3) 1 (2, 0) 1 (3, 0) 1 [[0 1 1 1] [0 2 1 0] [1 0 0 1] [1 0 0 0]] [2 3 2 2]
from sklearn.feature_extraction.text import TfidfVectorizer
cv=TfidfVectorizer(binary=False,decode_error='ignore',stop_words='english')
vec=cv.fit_transform(['hello world','this is a panda.'])  # pass in a list of sentences
arr=vec.toarray()
print(arr)
[[0.70710678 0. 0.70710678]
[0. 1. 0. ]]
from sklearn.feature_extraction.text import TfidfVectorizer
cv=TfidfVectorizer(binary=False,decode_error='ignore',stop_words=None)
vec=cv.fit_transform(['hello world','this is a panda.'])  # pass in a list of sentences
arr=vec.toarray()
print (arr)
print (cv.get_feature_names())
[[0.70710678 0. 0. 0. 0.70710678]
[0. 0.57735027 0.57735027 0.57735027 0. ]]
['hello', 'is', 'panda', 'this', 'world']
corpus=["I come to China to travel",
"This is a car polupar in China",
"I love tea and Apple ",
"The work is to write some papers in science"]
Without removing stop words, the resulting count vectors are as follows:
[[0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 2 1 0 0]
[0 0 1 1 0 1 1 0 0 1 0 0 0 0 1 0 0 0 0]
[1 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0]
[0 0 0 0 0 1 1 0 1 0 1 1 0 1 0 1 0 1 1]]
In scikit-learn there are two ways to carry out TF-IDF preprocessing.
Vectorize with the CountVectorizer class first, then apply the TfidfTransformer class.
Use TfidfVectorizer directly to do both vectorization and TF-IDF preprocessing.
The first approach: the CountVectorizer + TfidfTransformer combination.
The TF-IDF value of each word in each text:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

corpus=["I come to China to travel",
        "This is a car polupar in China",
        "I love tea and Apple ",
        "The work is to write some papers in science"]

vectorizer=CountVectorizer()
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
print (vectorizer.vocabulary_)
print (tfidf)

{'come': 4, 'to': 15, 'china': 3, 'travel': 16, 'this': 14, 'is': 6, 'car': 2, 'polupar': 9, 'in': 5, 'love': 7, 'tea': 12, 'and': 0, 'apple': 1, 'the': 13, 'work': 17, 'write': 18, 'some': 11, 'papers': 8, 'science': 10}
  (0, 16)    0.4424621378947393
  (0, 15)    0.697684463383976
  (0, 4)     0.4424621378947393
  (0, 3)     0.348842231691988
  (1, 14)    0.45338639737285463
  (1, 9)     0.45338639737285463
  (1, 6)     0.3574550433419527
  (1, 5)     0.3574550433419527
  (1, 3)     0.3574550433419527
  (1, 2)     0.45338639737285463
  (2, 12)    0.5
  (2, 7)     0.5
  (2, 1)     0.5
  (2, 0)     0.5
  (3, 18)    0.3565798233381452
  (3, 17)    0.3565798233381452
  (3, 15)    0.2811316284405006
  (3, 13)    0.3565798233381452
  (3, 11)    0.3565798233381452
  (3, 10)    0.3565798233381452
  (3, 8)     0.3565798233381452
  (3, 6)     0.2811316284405006
  (3, 5)     0.2811316284405006
The second approach: use TfidfVectorizer directly.

from sklearn.feature_extraction.text import TfidfVectorizer

corpus=["I come to China to travel",
        "This is a car polupar in China",
        "I love tea and Apple ",
        "The work is to write some papers in science "]

tfidf2 = TfidfVectorizer()
re = tfidf2.fit_transform(corpus)
print (type(re))
print (re.toarray())
print (tfidf2.get_feature_names())
<class 'scipy.sparse.csr.csr_matrix'> [[0. 0. 0. 0.34884223 0.44246214 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.69768446 0.44246214 0. 0. ] [0. 0. 0.4533864 0.35745504 0. 0.35745504 0.35745504 0. 0. 0.4533864 0. 0. 0. 0. 0.4533864 0. 0. 0. 0. ] [0.5 0.5 0. 0. 0. 0. 0. 0.5 0. 0. 0. 0. 0.5 0. 0. 0. 0. 0. 0. ] [0. 0. 0. 0. 0. 0.28113163 0.28113163 0. 0.35657982 0. 0.35657982 0.35657982 0. 0.35657982 0. 0.28113163 0. 0.35657982 0.35657982]] ['and', 'apple', 'car', 'china', 'come', 'in', 'is', 'love', 'papers', 'polupar', 'science', 'some', 'tea', 'the', 'this', 'to', 'travel', 'work', 'write']