Competition URL: https://www.kaggle.com/c/nlp-getting-started/submit
- """
- __author__:shuangrui Guo
- __description__:
- """
- import pandas as pd
- import numpy as np
- import re
- import nltk
- from nltk.corpus import stopwords
- import matplotlib.pyplot as plt
- from nltk.stem import SnowballStemmer
- import seaborn as sns
- from sklearn.svm import SVC
- from sklearn.metrics import f1_score
- from sklearn.feature_extraction.text import TfidfVectorizer
- from sklearn.pipeline import Pipeline
- from sklearn.model_selection import train_test_split
- from sklearn.feature_selection import SelectKBest ,chi2
-
# Read the datasets
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
# The keyword column has 61 nulls and the location column has 2533 nulls
# print(train.isnull().sum())

# Visualization: show the count on top of each bar
# plt.figure()
# ax = train['target'].value_counts().plot.bar()
# for p in ax.patches:
#     ax.annotate(np.round(p.get_height(), decimals=2),
#                 ((p.get_x() + p.get_width() / 2.0), p.get_height()),
#                 ha='center',
#                 va='center',
#                 xytext=(0, 5),
#                 textcoords='offset points')
# plt.title('True vs False Disaster Tweets')
# plt.xlabel('True vs False')
# plt.xticks(rotation=0)
# plt.show()

# Clean the text
# Use SnowballStemmer to reduce every word in a sentence to its stem
stemmer = SnowballStemmer('english')
stopwords_list = stopwords.words('english')
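# Note: the NLTK stopword corpus must be available locally; if it is not,
# the line above raises a LookupError and a one-time
# nltk.download('stopwords') is needed first.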

def clean_content(string: str):
    cleaned = []
    # Lowercase before the stopword check, so that capitalized words such
    # as 'The' are matched against the all-lowercase stopword list
    temp = re.sub('[^a-zA-Z]', ' ', string).lower().split()
    for word in temp:
        if word not in stopwords_list:
            cleaned.append(stemmer.stem(word))
    return ' '.join(cleaned)

train['cleaned'] = train['text'].apply(clean_content)

# Step 2: remove leftover URL fragments and useless symbols.
# clean_content has already replaced every non-letter with a space, so URLs
# arrive here broken into bare tokens such as 'http t co abc'; the patterns
# below are word-bounded so that words like 'count' are left intact.
def review_cleaning(text):
    text = re.sub(r'[!”#$%&’()*+,\-./:;<=>?\[\]^_`{|}~]', ' ', text)
    text = re.sub(r'https?\S*', ' ', text)      # URL remnants, bare http/https
    text = re.sub(r'\bco\b', ' ', text)         # stray 'co' from t.co links
    text = re.sub(r'\d+', ' ', text)            # digits
    text = re.sub(r'[^a-zA-Z0-9]+', ' ', text)  # any other non-alphanumerics
    text = re.sub(r'\s+', ' ', text)            # collapse repeated whitespace
    return text.strip()

train['cleaned'] = train['cleaned'].apply(review_cleaning)

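# A quick sanity check of the two cleaning stages on a made-up tweet (the
# example string and the expected output are illustrative, not from the
# dataset):
# review_cleaning(clean_content('Buildings are burning near the river http://t.co/abc123'))
# -> 'build burn near river abc'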
# Drop rows whose cleaned text is empty or a single word
# (filter the whole DataFrame with a boolean mask so that the 'cleaned'
# and 'target' columns stay aligned)
train = train[train['cleaned'].str.split().str.len() > 1].reset_index(drop=True)

# Build TF-IDF features
tfidf = TfidfVectorizer(analyzer='word',
                        max_features=10000,
                        ngram_range=(1, 3),
                        stop_words='english')
X = tfidf.fit_transform(train['cleaned'])

# Create training and validation sets
X_train, X_test, y_train, y_test = train_test_split(
    X, train['target'].tolist(), test_size=0.2, stratify=train['target'].tolist())
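# Note: because tfidf was fitted on all rows before the split, the validation
# fold influences the vocabulary and IDF statistics; the Pipeline sketch at
# the end of the post shows a leak-free ordering.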

pipeline = Pipeline([
    # Keep the 6500 features with the highest chi-squared score w.r.t. the label
    ('select_kbest', SelectKBest(chi2, k=6500)),
    # degree only applies to the poly kernel, so it is omitted for rbf;
    # cache_size is the kernel cache in MB (200 is the scikit-learn default)
    ('classifier', SVC(kernel='rbf', random_state=0, verbose=True, gamma=1, C=1,
                       shrinking=True, probability=False, cache_size=200)),
])

model = pipeline.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(f1_score(y_test, y_pred))
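# f1_score defaults to the binary F1 of the positive class (target == 1),
# which is the metric this competition is scored on.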

# Predict on the real test set and save the submission
test['cleaned'] = test['text'].apply(clean_content)
test['cleaned'] = test['cleaned'].apply(review_cleaning)

testing = tfidf.transform(test['cleaned'])
test_pred = model.predict(testing)
test['target'] = test_pred
columns = ['id', 'target']
submission = test[columns]
submission.to_csv('./submission.csv', index=False)
Current shortcomings:
The text-cleaning part still feels somewhat ad hoc.
Splitting the training and validation sets directly on the TF-IDF output is hard to justify.
The role of SelectKBest is unclear to me.
I am not familiar with how Pipeline is used.
A sketch addressing the last three points follows below.
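To answer those last three points for myself, here is a minimal sketch based on the standard scikit-learn API (variable names such as leak_free are mine, not from the original script). SelectKBest(chi2, k=...) scores every TF-IDF column with the chi-squared statistic against the label and keeps only the k highest-scoring columns; Pipeline chains such steps so that each one is fitted on the training data only and merely applied to validation data. Splitting the raw cleaned strings first and putting the vectorizer inside the pipeline therefore also removes the leakage concern from the second point:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

# Split the raw cleaned strings, not the TF-IDF matrix, so that the
# vectorizer and the feature selector see only the training fold during fit.
texts_train, texts_val, y_tr, y_val = train_test_split(
    train['cleaned'], train['target'], test_size=0.2,
    stratify=train['target'], random_state=0)

leak_free = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=10000, ngram_range=(1, 3))),
    ('select', SelectKBest(chi2, k=6500)),  # keep the 6500 best columns by chi2
    ('svc', SVC(kernel='rbf', C=1, gamma=1)),
])
leak_free.fit(texts_train, y_tr)                       # fit on the train fold only
print(f1_score(y_val, leak_free.predict(texts_val)))   # transform, no refitting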