
Kaggle Competition, Getting Started Part 2: Natural Language Processing with Disaster Tweets


Competition page: https://www.kaggle.com/c/nlp-getting-started/submit

  1. """
  2. __author__:shuangrui Guo
  3. __description__:
  4. """
  5. import pandas as pd
  6. import numpy as np
  7. import re
  8. import nltk
  9. from nltk.corpus import stopwords
  10. import matplotlib.pyplot as plt
  11. from nltk.stem import SnowballStemmer
  12. import seaborn as sns
  13. from sklearn.svm import SVC
  14. from sklearn.metrics import f1_score
  15. from sklearn.feature_extraction.text import TfidfVectorizer
  16. from sklearn.pipeline import Pipeline
  17. from sklearn.model_selection import train_test_split
  18. from sklearn.feature_selection import SelectKBest ,chi2
# Load the datasets
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

# keyword has 61 missing values; location has 2533
# print(train.isnull().sum())

# Visualization: bar chart of the target counts, with the count shown on top of each bar
# plt.figure()
# ax = train['target'].value_counts().plot.bar()
# for p in ax.patches:
#     ax.annotate(np.round(p.get_height(), decimals=2),
#                 ((p.get_x() + p.get_width() / 2.0), p.get_height()),
#                 ha='center',
#                 va='center',
#                 xytext=(0, 5),
#                 textcoords='offset points')
# plt.title('True vs False Disaster Tweets')
# plt.xlabel('True vs False')
# plt.xticks(rotation=360)
# plt.show()

# Clean the text
# Use SnowballStemmer to reduce each word in a tweet to its stem
# (run nltk.download('stopwords') once beforehand if the stopword list is missing)
stemmer = SnowballStemmer('english')
stopwords_list = stopwords.words('english')

def clean_content(string: str):
    cleaned = []
    # Keep letters only; lowercase before the stopword check so that
    # capitalized stopwords such as "The" are removed too
    temp = re.sub('[^a-zA-Z]', ' ', string).lower().split()
    for word in temp:
        if word not in stopwords_list:
            cleaned.append(stemmer.stem(word))
    return ' '.join(cleaned)

train['cleaned'] = train['text'].apply(clean_content)

# Step 2: remove leftover symbols, URLs, and digits
def review_cleaning(text):
    text = re.sub(r'([!"#$%&\'()*+,\-./:;<=>?\[\]^_`{|}~])', ' ', text)
    # Strip whole URLs first, then any bare 'http'/'https' fragments
    text = re.sub(r'https?\S+', ' ', text)
    text = re.sub(r'https?', ' ', text)
    text = re.sub(r'\bco\b', ' ', text)  # t.co shortener residue; word-bounded so 'co' inside words survives
    text = re.sub(r'\d+', ' ', text)
    text = re.sub(r'[^a-zA-Z0-9]+', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text

train['cleaned'] = train['cleaned'].apply(review_cleaning)

# Drop rows whose cleaned text contains at most one word,
# filtering the whole DataFrame so the labels stay aligned
train = train[train['cleaned'].str.split().str.len() > 1].reset_index(drop=True)

# Create the train/validation split
# train['cleaned'] = train['cleaned'].values
# Build the TF-IDF features
tfidf = TfidfVectorizer(analyzer='word',
                        max_features=10000,
                        ngram_range=(1, 3),
                        stop_words='english')
X = tfidf.fit_transform(train['cleaned'])
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    train['target'].tolist(),
                                                    test_size=0.2,
                                                    stratify=train['target'].tolist())

# chi2 feature selection followed by an RBF-kernel SVM
pipeline = Pipeline(
    [('select_kbest', SelectKBest(chi2, k=6500)),
     ('classifier', SVC(kernel='rbf', random_state=0, verbose=True, gamma=1, C=1,
                        shrinking=True, probability=False, cache_size=5))]
)
model = pipeline.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(f1_score(y_test, y_pred))

# Predict on the real test set and save the submission
test['cleaned'] = test['text'].apply(clean_content)
test['cleaned'] = test['cleaned'].apply(review_cleaning)
testing = tfidf.transform(test['cleaned'])
test_pred = model.predict(testing)
test['target'] = test_pred
columns = ['id', 'target']
submission = test[columns]
submission.to_csv('./submission.csv', index=False)

Current shortcomings:

The text-cleaning part feels somewhat odd.
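One reason it feels odd is that clean_content and review_cleaning overlap heavily: both strip non-letters and collapse whitespace, so the second pass mostly re-cleans already-clean text. A single-pass cleaner is easier to reason about. A minimal sketch, where the regexes and the sample tweet are my own illustration rather than anything required by the competition:

import re
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

stemmer = SnowballStemmer('english')
stop = set(stopwords.words('english'))

def clean_tweet(text: str) -> str:
    text = text.lower()
    text = re.sub(r'https?\S+|www\.\S+', ' ', text)  # drop URLs before touching punctuation
    text = re.sub(r'[^a-z]+', ' ', text)             # keep lowercase letters only
    tokens = [stemmer.stem(w) for w in text.split() if w not in stop]
    return ' '.join(tokens)

print(clean_tweet('Forest fire near La Ronge Sask. Canada https://t.co/abc123'))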

I don't understand why the train/validation split is done directly on the TF-IDF output.
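Mechanically it works, because train_test_split just slices rows of the sparse matrix. The catch is that the vectorizer was fitted on all rows, so the validation fold's document frequencies influence the vocabulary and IDF weights, which is a mild form of leakage. A common pattern is to split the raw text first and fit the vectorizer on the training fold only. A sketch reusing the train DataFrame from the script above:

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Split the raw text first ...
X_tr, X_val, y_tr, y_val = train_test_split(
    train['cleaned'], train['target'],
    test_size=0.2, stratify=train['target'], random_state=0)

# ... then learn the vocabulary and IDF weights from the training fold only
tfidf = TfidfVectorizer(max_features=10000, ngram_range=(1, 3))
X_tr_vec = tfidf.fit_transform(X_tr)
X_val_vec = tfidf.transform(X_val)   # reuse the fitted vocabulary on the validation fold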

The purpose of SelectKBest is unclear to me.
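SelectKBest scores each feature column against the labels with the supplied scoring function (chi2 here, which suits the non-negative TF-IDF values) and keeps only the k highest-scoring columns, so the SVM sees 6,500 instead of 10,000 features. A standalone sketch reusing X_train/y_train from the script; get_feature_names_out assumes scikit-learn >= 1.0:

from sklearn.feature_selection import SelectKBest, chi2

selector = SelectKBest(chi2, k=6500)
X_train_sel = selector.fit_transform(X_train, y_train)  # score, rank, keep top k
print(X_train.shape, '->', X_train_sel.shape)           # e.g. (n, 10000) -> (n, 6500)

# Which n-grams survived can be recovered through the vectorizer's vocabulary
kept = selector.get_support()
print(tfidf.get_feature_names_out()[kept][:10])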

I'm not familiar with how Pipeline is used.
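A Pipeline chains transformers with a final estimator behind one fit/predict interface: fit runs fit_transform through each transformer and fits the last step on the result, while predict only runs transform before predicting, so the feature selection learned on the training fold is reapplied, never refitted, at prediction time. A sketch of the same two steps (one could also move the TfidfVectorizer inside the pipeline so raw text goes in directly):

from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.svm import SVC

pipe = Pipeline([
    ('select', SelectKBest(chi2, k=6500)),    # transformer: fit_transform on fit, transform on predict
    ('clf', SVC(kernel='rbf', C=1, gamma=1))  # final estimator
])

# Equivalent to:
#   X_sel = pipe.named_steps['select'].fit_transform(X_train, y_train)
#   pipe.named_steps['clf'].fit(X_sel, y_train)
pipe.fit(X_train, y_train)

# At predict time the stored selection is only applied, not relearned
y_hat = pipe.predict(X_test)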
