The Tieba data was collected with a web crawler (the crawling itself is not covered here) and saved to a txt file, one sentence per line. The next steps are to segment each sentence and turn it into a vector, then cluster the vectors with k-means and write out the results.
The stop_words directory contains several stop-word lists, so they all need to be read and merged in a loop.
```python
def defined_stop_words():
    all_stop_words = []
    for i, file in enumerate(os.listdir(r'D:\Gitlab\extract_key\stop_words')):
        # read one stop-word file
        filepath = fr'D:\Gitlab\extract_key\stop_words\{file}'
        with open(filepath, 'r', encoding='utf-8') as fp:
            all_line = fp.readlines()
            for line in all_line:
                all_stop_words.append(line.replace('\n', ''))
    return all_stop_words
```
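One small aside: the later check `w.word not in stop_words` scans the merged list once per token. Returning a set instead makes that lookup constant-time; a minimal variant (the name `defined_stop_words_set` is mine, the directory path is the same as above):

```python
def defined_stop_words_set():
    """Variant of defined_stop_words() that returns a set for O(1) membership checks."""
    all_stop_words = set()
    for file in os.listdir(r'D:\Gitlab\extract_key\stop_words'):
        filepath = fr'D:\Gitlab\extract_key\stop_words\{file}'
        with open(filepath, 'r', encoding='utf-8') as fp:
            for line in fp:
                all_stop_words.add(line.strip())
    return all_stop_words
```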
This part filters the raw data, segments each sentence with jieba, and removes stop words, which is a fairly standard preprocessing pipeline.
```python
def loadDataset(filepath):
    '''Load the text dataset.'''
    dataset = []
    key_list = ['公司','项目','专业投资团队','元宇宙投资项目','养老项目','养老服务','老年产品','高回报','理财','募集','贷款','抵押','利息','保险','包赔','高利贷']
    with open(filepath, 'r', encoding='utf-8') as fp:
        all_line = fp.readlines()
        for line in all_line:
            dataset.append(line.replace('\n', ''))

    # print(len(dataset))
    # # optionally take a random sample
    # dataset = random.sample(dataset, 10000)
    # print(len(dataset))

    # load stop words
    stop_words = defined_stop_words()
    all_sen = []
    original_sen = []
    for sen in list(set(dataset)):
        # keep only sentences containing at least one keyword
        for key in key_list:
            if operator.contains(sen, key):
                sentence = ""
                # segment with jieba
                word = jieba_postag(sen)
                for w in word:
                    # drop stop words
                    if w.word not in stop_words:
                        sentence += w.word + ' '
                all_sen.append(sentence)
                original_sen.append(sen)
                break
    # original sentences, and their segmented counterparts
    return original_sen, all_sen
```
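For reference, `jieba.posseg.cut` yields pair objects exposing `.word` and `.flag`, which is why the loop above reads `w.word`. A quick illustration (the example sentence is made up):

```python
import jieba.posseg as pseg

# Illustrative only: print each token and its part-of-speech tag
for w in pseg.cut('高回报理财项目,专业投资团队'):
    print(w.word, w.flag)
```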
This returns the TF-IDF vectors X for the input dataset; pick parameters that suit your data.
```python
def transform(dataset, n_features=1000):
    vectorizer = TfidfVectorizer(max_df=0.5, max_features=n_features, min_df=2, use_idf=True)
    X = vectorizer.fit_transform(dataset)
    return X, vectorizer
```
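In case the parameters are unfamiliar: with these settings `max_df=0.5` drops terms appearing in more than half of the sentences, `min_df=2` drops terms appearing in only one, and `max_features` caps the vocabulary at the most frequent remaining terms. Note also that the default `token_pattern` only keeps tokens of two or more characters, so single-character words from the jieba output are silently discarded. A quick check on toy, already-segmented input (the example strings are made up):

```python
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ['养老 项目 高回报', '理财 项目 贷款', '养老 服务 保险', '项目 募集 利息']
vec = TfidfVectorizer(max_df=0.5, max_features=1000, min_df=2, use_idf=True)
X = vec.fit_transform(docs)
print(X.shape)                      # (4, number_of_surviving_terms)
print(vec.get_feature_names_out())  # vocabulary after max_df/min_df filtering
                                    # (get_feature_names() on older scikit-learn)
```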
Here k-means is used with a hand-picked k, which is a somewhat under-considered choice (more on that below).
```python
def train(X, vectorizer, true_k=10, minibatch=False):
    # train k-means either on mini-batches or on the full data
    if minibatch:
        km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
                             init_size=1000, batch_size=1000, verbose=False)
    else:
        km = KMeans(n_clusters=true_k, init='k-means++', max_iter=300, n_init=1,
                    verbose=False)
    km.fit(X)
    # save the model
    save_model_file(km, 'Kmeans.pkl')

    result = list(km.predict(X))
    print('Cluster distribution:')
    print(dict([(i, result.count(i)) for i in result]))
    return -km.score(X), result
```
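Because the model is persisted with joblib, it can be reloaded later to label new sentences, as long as the same fitted `vectorizer` is reused (the script above does not actually save the vectorizer, so treat this as a sketch of the idea):

```python
import joblib

km = joblib.load('Kmeans.pkl')       # written by save_model_file() above
new_sentences = ['高回报 理财 项目']   # already segmented, space-separated
labels = km.predict(vectorizer.transform(new_sentences))  # assumes the fitted vectorizer is still in scope
print(labels)
```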
Depending on the actual data, parts of this need adjusting; the task here is text clustering, and blindly fixing k at 100 is not very reasonable. If you are interested, look at the Canopy algorithm, which can suggest a suitable k from the dataset itself; Canopy + k-means may well work better (a rough sketch follows after the full listing below).
Putting it all together, the full script:

```python
from __future__ import print_function
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans, MiniBatchKMeans
import pandas as pd
import sys
import os
import jieba.posseg as pseg
import operator
import random
import joblib   # sklearn.externals.joblib was removed in newer scikit-learn; use the standalone package


def save_model_file(model, save_model_name):
    joblib.dump(model, save_model_name)


def jieba_postag(text):
    words = pseg.cut(text)
    return words


def defined_stop_words():
    all_stop_words = []
    for i, file in enumerate(os.listdir(r'D:\Gitlab\extract_key\stop_words')):
        # read one stop-word file
        filepath = fr'D:\Gitlab\extract_key\stop_words\{file}'
        with open(filepath, 'r', encoding='utf-8') as fp:
            all_line = fp.readlines()
            for line in all_line:
                all_stop_words.append(line.replace('\n', ''))
    return all_stop_words


def loadDataset(filepath):
    '''Load the text dataset.'''
    dataset = []
    key_list = ['公司','项目','专业投资团队','元宇宙投资项目','养老项目','养老服务','老年产品','高回报','理财','募集','贷款','抵押','利息','保险','包赔','高利贷']
    with open(filepath, 'r', encoding='utf-8') as fp:
        all_line = fp.readlines()
        for line in all_line:
            dataset.append(line.replace('\n', ''))

    # print(len(dataset))
    # # optionally take a random sample
    # dataset = random.sample(dataset, 10000)
    # print(len(dataset))

    # load stop words
    stop_words = defined_stop_words()
    all_sen = []
    original_sen = []
    for sen in list(set(dataset)):
        # keep only sentences containing at least one keyword
        for key in key_list:
            if operator.contains(sen, key):
                sentence = ""
                # segment with jieba
                word = jieba_postag(sen)
                for w in word:
                    # drop stop words
                    if w.word not in stop_words:
                        sentence += w.word + ' '
                all_sen.append(sentence)
                original_sen.append(sen)
                break

    return original_sen, all_sen


def transform(dataset, n_features=1000):
    vectorizer = TfidfVectorizer(max_df=0.5, max_features=n_features, min_df=2, use_idf=True)
    X = vectorizer.fit_transform(dataset)
    return X, vectorizer


def train(X, vectorizer, true_k=10, minibatch=False):
    # train k-means either on mini-batches or on the full data
    if minibatch:
        km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
                             init_size=1000, batch_size=1000, verbose=False)
    else:
        km = KMeans(n_clusters=true_k, init='k-means++', max_iter=300, n_init=1,
                    verbose=False)
    km.fit(X)
    # save the model
    save_model_file(km, 'Kmeans.pkl')

    result = list(km.predict(X))
    print('Cluster distribution:')
    print(dict([(i, result.count(i)) for i in result]))
    return -km.score(X), result


def test():
    '''Run the pipeline and pick parameters.'''
    # read the data
    filepath = r'D:\Gitlab\extract_key\all.txt'
    original_data, dataset = loadDataset(filepath)

    X, vectorizer = transform(dataset, n_features=500)
    train_score, class_result = train(X, vectorizer, true_k=100)
    score = train_score / len(dataset)
    print(score)

    abc_dict = {
        'original_sentence': original_data,
        'class': class_result,
        'cut_words': dataset
    }
    result = pd.DataFrame(abc_dict)
    # print(result)

    result.to_csv('result.csv', index=False)


if __name__ == '__main__':
    test()
```
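Picking up the Canopy suggestion from earlier: below is a minimal sketch of a Canopy pass over the TF-IDF matrix using cosine distance. The function name and the thresholds `t1`/`t2` are my own choices and would need tuning on the real data; the point is only that the number of canopies gives a data-driven starting value for `true_k` instead of a hard-coded 100.

```python
import numpy as np
from sklearn.metrics.pairwise import cosine_distances

def canopy(X, t1=0.9, t2=0.6, seed=42):
    """One Canopy pass over sparse TF-IDF rows X; returns a list of canopies (lists of row indices)."""
    rng = np.random.default_rng(seed)
    remaining = list(range(X.shape[0]))
    canopies = []
    while remaining:
        centre = remaining[rng.integers(len(remaining))]           # random candidate as canopy centre
        dists = cosine_distances(X[centre], X[remaining]).ravel()
        canopies.append([p for p, d in zip(remaining, dists) if d < t1])  # loose threshold: canopy members
        remaining = [p for p, d in zip(remaining, dists) if d > t2]       # tight threshold: drop from candidate pool
    return canopies
```

Usage would be roughly `true_k = len(canopy(X))` in `test()`, followed by the existing `train(X, vectorizer, true_k=true_k)` call.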