
Implementing K-means Text Clustering in Python


Contents

1. Data

2. Code

        2.1 Loading stop words

        2.2 Loading the data

        2.3 Computing the TF-IDF vectors

        2.4 Training

3. Complete code


1. Data

        The data was scraped from Baidu Tieba with a crawler (how the crawling was done is not covered here) and saved to a txt file, one sentence per line. Each sentence is then segmented into words and converted to a vector, and finally K-means clusters the sentences and the results are written out.

2. Code

        2.1 Loading stop words

                The stop_words directory contains several stop word lists, so we loop over every file and merge all the stop words.

def defined_stop_words():
    all_stop_words = []
    for file in os.listdir(r'D:\Gitlab\extract_key\stop_words'):
        # Read one stop word file
        filepath = fr'D:\Gitlab\extract_key\stop_words\{file}'
        with open(filepath, 'r', encoding='utf-8') as fp:
            all_line = fp.readlines()
            for line in all_line:
                all_stop_words.append(line.replace('\n', ''))
    return all_stop_words
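                Because every token is later tested for membership in this collection, returning a set makes each lookup O(1) instead of O(n). A minimal variant (my sketch, not from the original post; it keeps the author's local path):

def defined_stop_words_set():
    all_stop_words = set()
    for file in os.listdir(r'D:\Gitlab\extract_key\stop_words'):
        filepath = fr'D:\Gitlab\extract_key\stop_words\{file}'
        with open(filepath, 'r', encoding='utf-8') as fp:
            for line in fp:
                # strip() also removes a trailing '\r' left by Windows-edited files
                all_stop_words.add(line.strip())
    return all_stop_words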

        2.2 Loading the data

                This step filters the raw data, segments it with jieba, and removes stop words, which is a fairly standard preprocessing pipeline.

def loadDataset(filepath):
    '''Load the text dataset'''
    dataset = []
    key_list = ['公司','项目','专业投资团队','元宇宙投资项目','养老项目','养老服务','老年产品','高回报','理财','募集','贷款','抵押','利息','保险','包赔','高利贷']
    with open(filepath, 'r', encoding='utf-8') as fp:
        all_line = fp.readlines()
        for line in all_line:
            dataset.append(line.replace('\n', ''))
    # print(len(dataset))
    # # Optionally sample a random subset
    # dataset = random.sample(dataset, 10000)
    # print(len(dataset))
    # Load stop words
    stop_words = defined_stop_words()
    all_sen = []
    original_sen = []
    for sen in list(set(dataset)):
        # Keep only sentences that contain at least one keyword
        for key in key_list:
            if operator.contains(sen, key):
                sentence = ""
                # Segment with jieba
                word = jieba_postag(sen)
                for w in word:
                    # Drop stop words
                    if w.word not in stop_words:
                        sentence += w.word + ' '
                all_sen.append(sentence)
                original_sen.append(sen)
                break
    # Original sentences and their segmented counterparts
    return original_sen, all_sen
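                For context, jieba.posseg yields pair objects that expose .word (the token) and .flag (its part-of-speech tag), which is why the loop above reads w.word. A quick check (the sample sentence is made up):

import jieba.posseg as pseg

for w in pseg.cut('公司推出高回报理财项目'):
    # Each item carries the token and its POS tag, e.g. 公司 n
    print(w.word, w.flag)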

        2.3 Computing the TF-IDF vectors

                X holds the TF-IDF vectors of the input dataset; choose the parameters to suit your data.

def transform(dataset, n_features=1000):
    # max_df=0.5 drops terms that appear in more than half of the documents;
    # min_df=2 drops terms that appear in fewer than two documents
    vectorizer = TfidfVectorizer(max_df=0.5, max_features=n_features, min_df=2, use_idf=True)
    X = vectorizer.fit_transform(dataset)
    return X, vectorizer
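                To see what transform returns, here is a toy run (the toy corpus is made up; with five documents, max_df=0.5 and min_df=2 together keep only terms that occur in exactly two of them):

docs = ['apple banana', 'apple cherry', 'banana date', 'cherry date', 'egg fig']
X, vectorizer = transform(docs, n_features=500)
print(X.shape)  # (5, 4): apple, banana, cherry, date survive; egg and fig fail min_df
print(vectorizer.get_feature_names_out())  # use get_feature_names() on scikit-learn < 1.0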

        2.4 Training

                K-means is used here with a hand-picked k, which is a rather under-considered choice.

def train(X, vectorizer, true_k=10, minibatch=False):
    # Train k-means either on mini-batches or on the full data
    if minibatch:
        km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
                             init_size=1000, batch_size=1000, verbose=False)
    else:
        km = KMeans(n_clusters=true_k, init='k-means++', max_iter=300, n_init=1,
                    verbose=False)
    km.fit(X)
    # Save the model
    save_model_file(km, 'Kmeans.pkl')
    result = list(km.predict(X))
    print('Cluster distribution:')
    print(dict([(i, result.count(i)) for i in result]))
    # km.score returns the negative inertia; negate it to report a positive value
    return -km.score(X), result
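                Since a hand-picked k is admittedly weak, one standard alternative (my addition, not from the post) is to scan candidate values and keep the k with the best silhouette score. A sketch, assuming X is the sparse TF-IDF matrix from transform and pick_k is a hypothetical helper:

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

def pick_k(X, candidates=range(2, 21)):
    # Try each candidate k and return the one with the highest silhouette score;
    # n_init=1 keeps the scan fast, raise it for more stable results
    best_k, best_score = None, -1.0
    for k in candidates:
        labels = KMeans(n_clusters=k, init='k-means++', n_init=1).fit_predict(X)
        score = silhouette_score(X, labels)
        if score > best_score:
            best_k, best_score = k, score
    return best_k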

3. Complete code

        Parts of this need adjusting to your actual data; what is done here is text clustering. Blindly fixing k at 100 is not very reasonable. If you are interested, look at the Canopy algorithm, which can suggest a good k from the dataset itself; Canopy + K-means may well work better (a minimal Canopy sketch follows the full listing below).

from __future__ import print_function
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans, MiniBatchKMeans
import pandas as pd
import os
import jieba.posseg as pseg
import operator
import random
import joblib  # sklearn.externals.joblib was removed in scikit-learn 0.23

def save_model_file(model, save_model_name):
    joblib.dump(model, save_model_name)

def jieba_postag(text):
    # POS-tagging segmentation; each yielded item has .word and .flag
    words = pseg.cut(text)
    return words

def defined_stop_words():
    all_stop_words = []
    for file in os.listdir(r'D:\Gitlab\extract_key\stop_words'):
        # Read one stop word file
        filepath = fr'D:\Gitlab\extract_key\stop_words\{file}'
        with open(filepath, 'r', encoding='utf-8') as fp:
            all_line = fp.readlines()
            for line in all_line:
                all_stop_words.append(line.replace('\n', ''))
    return all_stop_words

def loadDataset(filepath):
    '''Load the text dataset'''
    dataset = []
    key_list = ['公司','项目','专业投资团队','元宇宙投资项目','养老项目','养老服务','老年产品','高回报','理财','募集','贷款','抵押','利息','保险','包赔','高利贷']
    with open(filepath, 'r', encoding='utf-8') as fp:
        all_line = fp.readlines()
        for line in all_line:
            dataset.append(line.replace('\n', ''))
    # print(len(dataset))
    # # Optionally sample a random subset
    # dataset = random.sample(dataset, 10000)
    # print(len(dataset))
    stop_words = defined_stop_words()
    all_sen = []
    original_sen = []
    for sen in list(set(dataset)):
        # Keep only sentences that contain at least one keyword
        for key in key_list:
            if operator.contains(sen, key):
                sentence = ""
                # Segment with jieba
                word = jieba_postag(sen)
                for w in word:
                    # Drop stop words
                    if w.word not in stop_words:
                        sentence += w.word + ' '
                all_sen.append(sentence)
                original_sen.append(sen)
                break
    return original_sen, all_sen

def transform(dataset, n_features=1000):
    vectorizer = TfidfVectorizer(max_df=0.5, max_features=n_features, min_df=2, use_idf=True)
    X = vectorizer.fit_transform(dataset)
    return X, vectorizer

def train(X, vectorizer, true_k=10, minibatch=False):
    # Train k-means either on mini-batches or on the full data
    if minibatch:
        km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
                             init_size=1000, batch_size=1000, verbose=False)
    else:
        km = KMeans(n_clusters=true_k, init='k-means++', max_iter=300, n_init=1,
                    verbose=False)
    km.fit(X)
    # Save the model
    save_model_file(km, 'Kmeans.pkl')
    result = list(km.predict(X))
    print('Cluster distribution:')
    print(dict([(i, result.count(i)) for i in result]))
    # km.score returns the negative inertia; negate it to report a positive value
    return -km.score(X), result

def test():
    '''Run the full pipeline'''
    # Load the data
    filepath = r'D:\Gitlab\extract_key\all.txt'
    original_data, dataset = loadDataset(filepath)
    X, vectorizer = transform(dataset, n_features=500)
    train_score, class_result = train(X, vectorizer, true_k=100)
    score = train_score / len(dataset)  # average inertia per sentence
    print(score)
    abc_dict = {
        'original_sentence': original_data,
        'class': class_result,
        'cut_words': dataset
    }
    result = pd.DataFrame(abc_dict)
    # print(result)
    result.to_csv('result.csv', index=False)

if __name__ == '__main__':
    test()
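        As for the Canopy idea mentioned above: with a loose threshold t1 and a tight threshold t2 (t1 > t2), repeatedly take a remaining point as a canopy center, group every point within t1 into that canopy, and remove points within t2 from the candidate pool; the number of canopies then suggests a k for K-means. A minimal numpy sketch (my addition, not from the original post; the thresholds are illustrative and data-dependent, and densifying the TF-IDF matrix can be memory-heavy on large corpora):

import numpy as np

def canopy(points, t1, t2):
    '''points: (n, d) dense array; requires t1 > t2; returns [(center_index, member_indices)]'''
    canopies = []
    remaining = list(range(len(points)))
    while remaining:
        center = remaining.pop(0)
        dists = np.linalg.norm(points[remaining] - points[center], axis=1)
        # Points within the loose threshold join this canopy
        members = [remaining[i] for i, d in enumerate(dists) if d < t1]
        canopies.append((center, members))
        # Points within the tight threshold can no longer become centers
        remaining = [remaining[i] for i, d in enumerate(dists) if d >= t2]
    return canopies

# Usage sketch: true_k = len(canopy(X.toarray(), t1=1.2, t2=0.8))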
