
Random Forest + Word2Vec Text Classification


The dataset comes from a Kaggle sentiment classification task.

1. Load the files

  import pandas as pd

  train = pd.read_csv(r"labeledTrainData\labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)
  unlabeled = pd.read_csv(r"unlabeledTrainData\unlabeledTrainData.tsv", header=0, delimiter="\t", quoting=3)
  test = pd.read_csv(r"testData\testData.tsv", header=0, delimiter="\t", quoting=3)
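Here quoting=3 tells pandas not to treat the many double quotes inside the reviews as field delimiters; it is simply the numeric value of csv.QUOTE_NONE. A quick check with the standard library:

  import csv

  # quoting=3 in read_csv is the same as csv.QUOTE_NONE
  print(csv.QUOTE_NONE)  # 3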

2. Inspect the data format

  print(train.shape)
  print(train.columns.values)
  # indexing train by column name gives a column; uncomment to preview the first review
  # print(train['review'][0])
  print(unlabeled.shape)
  print(unlabeled.columns.values)
  print(test.shape)
  print(test.columns.values)
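For reference, the schema assumed below is that of the original Kaggle "Bag of Words Meets Bags of Popcorn" files; the row counts are not verified here, so treat them as a sanity check rather than ground truth:

  # Assumed schema of the original Kaggle files
  assert list(train.columns) == ["id", "sentiment", "review"]
  assert list(unlabeled.columns) == ["id", "review"]
  assert list(test.columns) == ["id", "review"]
  print(len(train), len(unlabeled), len(test))  # typically 25000 50000 25000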

3. Remove stop words and keep only letters and digits

  import re
  from bs4 import BeautifulSoup
  from nltk.corpus import stopwords
  import nltk

  # run nltk.download('stopwords') and nltk.download('punkt') once if the corpora are missing
  stopwords_ = set(stopwords.words("english"))

  def review_to_words(raw_review, isStopwords=False):
      # strip HTML tags, keep only letters and digits, lowercase, and split into tokens
      delete_label = BeautifulSoup(raw_review, "html.parser").get_text()
      letters_only = re.sub("[^a-zA-Z0-9]", " ", delete_label).lower().split()
      if isStopwords:
          letters_only = [word for word in letters_only if word not in stopwords_]
      return letters_only

  # punkt sentence tokenizer used to split each review into sentences
  tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

  def tokenize_to_sentence(raw_para, tokenizer):
      # split a review into sentences, then turn each sentence into a list of tokens
      raw_sentences = tokenizer.tokenize(raw_para.strip())
      sentences = []
      for sent in raw_sentences:
          if len(sent) > 0:
              sentences.append(review_to_words(sent))
      return sentences

  nums_train = len(train["review"])
  nums_unlabeled = len(unlabeled["review"])
  nums_test = len(test["review"])

  # collect sentences from all three files; Word2Vec can also learn from the unlabeled data
  clean_review_sentence = []
  for i in range(nums_train):
      clean_review_sentence += tokenize_to_sentence(train["review"][i], tokenizer)
      if i % 1000 == 0:
          print("*" * 25, i)
  for i in range(nums_unlabeled):
      clean_review_sentence += tokenize_to_sentence(unlabeled["review"][i], tokenizer)
      if i % 1000 == 0:
          print("*" * 25, i)
  for i in range(nums_test):
      clean_review_sentence += tokenize_to_sentence(test["review"][i], tokenizer)
      if i % 1000 == 0:
          print("*" * 25, i)
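A quick illustration of what the two helpers return on a made-up review (the exact sentence split depends on the punkt tokenizer, so treat the output as indicative):

  sample = "This movie was great! <br /><br />I watched it twice."
  print(review_to_words(sample, isStopwords=True))
  # e.g. ['movie', 'great', 'watched', 'twice']
  print(tokenize_to_sentence(sample, tokenizer))
  # e.g. [['this', 'movie', 'was', 'great'], ['i', 'watched', 'it', 'twice']]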

4. Enable logging

  print(len(clean_review_sentence))

  import logging
  # INFO-level logging lets gensim print its training progress in the next step
  logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

 

5. Train the Word2Vec model

  print(clean_review_sentence[0])

  num_features = 300     # word vector dimensionality
  min_word_count = 40    # ignore words that appear fewer than 40 times
  num_workers = 4        # number of training threads
  context = 10           # context window size
  downsampling = 1e-3    # downsampling of frequent words

  from gensim.models import word2vec

  print("Training model...")
  # note: in gensim >= 4.0 the `size` argument is named `vector_size`
  model = word2vec.Word2Vec(clean_review_sentence, workers=num_workers,
                            size=num_features, min_count=min_word_count,
                            window=context, sample=downsampling)

  # If you don't plan to train the model any further, calling
  # init_sims will make the model much more memory-efficient.
  model.init_sims(replace=True)

  model_name = "300features_40minwords_10context"
  model.save(model_name)
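The saved model can be reloaded later without retraining; a minimal sketch, assuming the same gensim 3.x version that trained it:

  from gensim.models import word2vec

  model = word2vec.Word2Vec.load("300features_40minwords_10context")
  print(len(model.wv.index2word))   # vocabulary size after min_count filtering
  print(model.wv["movie"].shape)    # (300,), assuming "movie" survived the frequency cut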

6. Using the Word2Vec model

  model.doesnt_match("man woman child kitchen".split())
  model.doesnt_match("france england germany berlin".split())
  model.most_similar("awful")
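These calls return their results instead of printing them, so wrapping them in print makes the answers visible. With a model trained on this corpus the odd-one-out queries typically return 'kitchen' and 'berlin', though the exact answers depend on the training run (the .wv accessor avoids the deprecation warning in gensim 3.x):

  print(model.wv.doesnt_match("man woman child kitchen".split()))        # typically 'kitchen'
  print(model.wv.doesnt_match("france england germany berlin".split()))  # typically 'berlin'
  print(model.wv.most_similar("awful")[:5])  # top (word, cosine similarity) pairs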

7. Averaged vector representation of each review

  import numpy as np

  # vocabulary kept by the model after min_count filtering
  index2word_set = set(model.wv.index2word)

  def getAvgFeatureVecs(paras, nums_features):
      counter = 0
      reviews_vecs = []
      for review in paras:
          if counter % 1000 == 0:
              print("*" * 25, counter)
          reviews_vecs.append(makeFeatureVec(review, nums_features))
          counter += 1
      return reviews_vecs

  def makeFeatureVec(review, nums_features):
      # average the vectors of all in-vocabulary words of one review
      features_vec = np.zeros((nums_features,), dtype="float32")
      nwords = 0
      for word in review:
          if word in index2word_set:
              features_vec = np.add(features_vec, model.wv[word])
              nwords += 1
      if nwords != 0:
          features_vec = np.divide(features_vec, nwords)
      return features_vec

  clean_train_reviews = []
  for review in train["review"]:
      clean_train_reviews.append(review_to_words(review))
  trainVec = getAvgFeatureVecs(clean_train_reviews, num_features)

  print("Creating average feature vecs for test reviews")
  clean_test_reviews = []
  for review in test["review"]:
      clean_test_reviews.append(review_to_words(review))
  testVec = getAvgFeatureVecs(clean_test_reviews, num_features)
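trainVec and testVec are plain Python lists of 300-dimensional vectors; stacking them into NumPy arrays makes the shapes easy to verify before fitting the classifier (the row counts assume the original Kaggle files):

  import numpy as np

  trainVec = np.stack(trainVec)
  testVec = np.stack(testVec)
  print(trainVec.shape)  # typically (25000, 300)
  print(testVec.shape)   # typically (25000, 300)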

 

8. Random forest

  from sklearn.ensemble import RandomForestClassifier

  forest = RandomForestClassifier(n_estimators=200)
  print("Fitting a random forest to labeled training data...")
  forest = forest.fit(trainVec, train["sentiment"])
  res = forest.predict(testVec)

  # Write the test results (note: pandas writes comma-separated output despite the .tsv extension)
  output = pd.DataFrame(data={"id": test["id"], "sentiment": res})
  output.to_csv("Word2Vec_AverageVectors.tsv", index=False, quoting=3)
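The Kaggle test labels are not available locally, so a quick way to gauge the model is cross-validation on the training vectors. A minimal sketch with scikit-learn's cross_val_score; scoring with ROC AUC is an assumption chosen to match the competition metric:

  from sklearn.ensemble import RandomForestClassifier
  from sklearn.model_selection import cross_val_score

  # 3-fold cross-validation of the random forest on the averaged word vectors
  clf = RandomForestClassifier(n_estimators=200)
  scores = cross_val_score(clf, trainVec, train["sentiment"], cv=3, scoring="roc_auc")
  print(scores.mean(), scores.std())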
