赞
踩
数据集来自 Kaggle 的情感分类(sentiment classification)任务。
1、加载文件
from pathlib import Path

import pandas as pd

# Parsing options shared by all three files: tab-separated, header on the
# first row, and quoting=3 (the value of csv.QUOTE_NONE) so quote characters
# are read as ordinary text.
_TSV_OPTIONS = dict(header=0, delimiter="\t", quoting=3)

# Paths are joined with pathlib rather than hard-coded backslashes, so the
# same directory/file layout resolves on Windows and on POSIX systems.
train = pd.read_csv(Path("labeledTrainData") / "labeledTrainData.tsv", **_TSV_OPTIONS)
unlabeled = pd.read_csv(Path("unlabeledTrainData") / "unlabeledTrainData.tsv", **_TSV_OPTIONS)
test = pd.read_csv(Path("testData") / "testData.tsv", **_TSV_OPTIONS)
2、输出数据格式
# Print the dimensions and the column names of each data set, in the order
# train, unlabeled, test.
# A frame is indexed by column name first, then by row,
# e.g. train['review'][0] is the first review.
for frame in (train, unlabeled, test):
    print(frame.shape)
    print(frame.columns.values)
3、去除HTML标签、只保留字母和数字,并按句子切分(可选删除停用词)
- import re
- from bs4 import BeautifulSoup
- from nltk.corpus import stopwords
- import nltk
-
# English stop words as a set, so membership tests are O(1).
stopwords_ = set(stopwords.words("english"))


def review_to_words(raw_review, isStopwords=False):
    """Turn one raw review (or sentence) into a list of lowercase tokens.

    HTML markup is stripped with BeautifulSoup, every character that is not
    an ASCII letter or digit is replaced by a space, and the result is
    lowercased and split on whitespace.

    Args:
        raw_review: Text to clean; may contain HTML markup.
        isStopwords: When True, drop tokens found in ``stopwords_``.

    Returns:
        list[str]: The cleaned tokens, in their original order.
    """
    # Name the parser explicitly: without it bs4 picks whichever parser
    # happens to be installed (and warns), so the extracted text could
    # differ from one machine to another.
    text = BeautifulSoup(raw_review, "html.parser").get_text()
    tokens = re.sub("[^a-zA-Z0-9]", " ", text).lower().split()

    if isStopwords:
        tokens = [token for token in tokens if token not in stopwords_]
    return tokens
-
# Pre-trained Punkt sentence splitter for English.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')


def tokenize_to_sentence(raw_para, tokenizer):
    """Split a paragraph into sentences and tokenize each sentence.

    Args:
        raw_para: The raw paragraph text; surrounding whitespace is stripped
            before splitting.
        tokenizer: Object whose ``tokenize`` method returns the sentences
            of a string.

    Returns:
        list[list[str]]: One token list per non-empty sentence, produced by
        ``review_to_words`` with stop words kept.
    """
    return [
        review_to_words(sentence)
        for sentence in tokenizer.tokenize(raw_para.strip())
        if sentence
    ]
-
nums_train = len(train["review"])
nums_unlabeled = len(unlabeled["review"])
nums_test = len(test["review"])

# Every tokenized sentence from all three data sets, in the order
# train, unlabeled, test.
clean_review_sentence = []

# One loop over the frames replaces three copy-pasted ones. Iterating the
# column with enumerate avoids label-based [i] lookups, which only work
# when the frame still has its default 0..n-1 index.
for frame in (train, unlabeled, test):
    for i, review in enumerate(frame["review"]):
        clean_review_sentence.extend(tokenize_to_sentence(review, tokenizer))
        # Progress marker every 1000 reviews.
        if i % 1000 == 0:
            print("*" * 25, i)

4、显示log
# Report how many sentences were collected.
print(len(clean_review_sentence))

import logging

# Configure the root logger at INFO level with a timestamped format.
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)
5、Word2Vec
# Show the first tokenized sentence as a sanity check.
print(clean_review_sentence[0])

# Word2Vec hyper-parameters.
num_features = 300      # passed to Word2Vec as size
min_word_count = 40     # passed to Word2Vec as min_count
num_workers = 4         # passed to Word2Vec as workers
context = 10            # passed to Word2Vec as window
downsampling = 1e-3     # passed to Word2Vec as sample

from gensim.models import word2vec

print("Training model...")
model = word2vec.Word2Vec(
    clean_review_sentence,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
)

# If you don't plan to train the model any further, calling
# init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)

# Save the trained model under a name that records its main settings.
model_name = "300features_40minwords_10context"
model.save(model_name)

6、Word2Vec 应用
# Query the trained vectors through model.wv (as the vocabulary lookup later
# in this script does) instead of the deprecated methods on the model itself,
# and print the results so they are visible when run as a plain script.
print(model.wv.doesnt_match("man woman child kitchen".split()))
print(model.wv.doesnt_match("france england germany berlin".split()))
print(model.wv.most_similar("awful"))
7、句子的向量表示
- import numpy as np
-
- index2word_set = set(model.wv.index2word)
-
def getAvgFeatureVecs(paras, nums_features, word_vectors=None, vocabulary=None):
    """Compute one averaged word vector per tokenized review.

    Args:
        paras: Iterable of reviews, each an iterable of word tokens.
        nums_features: Dimensionality of the word vectors.
        word_vectors: Optional mapping from word to vector; forwarded to
            ``makeFeatureVec``.
        vocabulary: Optional container of known words; forwarded to
            ``makeFeatureVec``.

    Returns:
        list: One float32 array of length ``nums_features`` per review, in
        input order.
    """
    reviews_vecs = []
    for counter, review in enumerate(paras):
        # Progress marker every 1000 reviews.
        if counter % 1000 == 0:
            print("*" * 25, counter)
        reviews_vecs.append(
            makeFeatureVec(review, nums_features, word_vectors, vocabulary)
        )
    return reviews_vecs


def makeFeatureVec(review, nums_features, word_vectors=None, vocabulary=None):
    """Average the vectors of all known words in one review.

    Args:
        review: Iterable of word tokens.
        nums_features: Dimensionality of the word vectors.
        word_vectors: Mapping from word to vector. When omitted, the
            module-level ``model.wv`` is used.
        vocabulary: Container used for the "is this word known" test. When
            omitted it is the module-level ``index2word_set`` if
            ``word_vectors`` was also omitted, otherwise ``word_vectors``
            itself.

    Returns:
        numpy.ndarray: float32 array of length ``nums_features`` holding the
        mean vector of the known words, or all zeros when none is known.
    """
    if word_vectors is None:
        # Fall back to the trained model; model.wv is the same access path
        # the script uses to build index2word_set.
        word_vectors = model.wv
        if vocabulary is None:
            vocabulary = index2word_set
    elif vocabulary is None:
        vocabulary = word_vectors

    features_vec = np.zeros(nums_features, dtype="float32")
    matched = 0  # number of in-vocabulary words seen
    for word in review:
        if word in vocabulary:
            # Accumulate in place instead of allocating a new array per word.
            features_vec += word_vectors[word]
            matched += 1

    # Skip the division when nothing matched to avoid dividing by zero.
    if matched:
        features_vec /= matched
    return features_vec
-
# Tokenize every labeled review (stop words kept) and average its word vectors.
clean_train_reviews = [review_to_words(review) for review in train["review"]]
trainVec = getAvgFeatureVecs(clean_train_reviews, num_features)

# Same treatment for the test reviews.
print("Creating average feature vecs for test reviews")
clean_test_reviews = [review_to_words(review) for review in test["review"]]
testVec = getAvgFeatureVecs(clean_test_reviews, num_features)

8、随机森林
from sklearn.ensemble import RandomForestClassifier

# Random forest with 200 trees, trained on the averaged review vectors.
forest = RandomForestClassifier(n_estimators=200)
print("Fitting a random forest to labeled training data...")
forest = forest.fit(trainVec, train["sentiment"])

# Predict a sentiment label for every test review.
res = forest.predict(testVec)

# Write the test results: one row per review id with its predicted label.
# NOTE(review): the file is named .tsv but to_csv is called with its default
# comma separator — confirm which format is intended.
submission_columns = {"id": test["id"], "sentiment": res}
output = pd.DataFrame(data=submission_columns)
output.to_csv("Word2Vec_AverageVectors.tsv", index=False, quoting=3)
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。