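# NOTE: "赞" / "踩" ("like" / "dislike") are button labels captured from the
# web page this listing was copied from; they are not part of the program.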
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @File : kmeans.py
# @Author: 田智凯
# @Date : 2020/3/19
# @Desc : k-means clustering analysis of science-and-technology achievement projects
from __future__ import print_function

import time
from collections import Counter

import matplotlib.pyplot as plt
import pymssql
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.feature_extraction.text import TfidfVectorizer


# Read the data from the SQL Server database
def get_dbdata():
    """Load the documents to cluster from the SQL Server database.

    Connects to the ``test`` database on 127.0.0.1 and reads the
    ``guanjianci`` column of the ``julei_test`` table (presumably keyword
    strings, judging by the column name -- confirm against the schema).

    Returns:
        list: the first column of every row returned by the query, in
        cursor order. The list is also printed to stdout.
    """
    # NOTE(review): host and credentials are hard-coded; consider moving
    # them to configuration.
    conn_read = pymssql.connect("127.0.0.1", "sa", "###", "test", charset="GBK")
    try:
        cursor = conn_read.cursor()
        try:
            cursor.execute("select guanjianci from julei_test")
            dataset = [row[0] for row in cursor]
        finally:
            # Release the cursor even if the query or the fetch fails.
            cursor.close()
    finally:
        # Release the connection even if anything above raised.
        conn_read.close()
    print(dataset)
    return dataset


def transform(dataset, n_features=1000):
    """Vectorize the documents with TF-IDF.

    Args:
        dataset: iterable of document strings.
        n_features: upper bound on the vocabulary size (``max_features``).

    Returns:
        tuple: ``(X, vectorizer)`` where ``X`` is the result of
        ``fit_transform(dataset)`` and ``vectorizer`` is the fitted
        ``TfidfVectorizer``.
    """
    # max_df=0.5 drops terms found in more than half of the documents and
    # min_df=2 drops terms found in fewer than two documents.
    vectorizer = TfidfVectorizer(
        max_df=0.5, max_features=n_features, min_df=2, use_idf=True
    )
    X = vectorizer.fit_transform(dataset)
    return X, vectorizer


def train(X, vectorizer, true_k=10, minibatch=False, showLable=False):
    """Fit k-means on ``X`` and report the resulting clustering.

    Args:
        X: feature matrix, as returned by ``transform``.
        vectorizer: the fitted vectorizer that produced ``X``; used only to
            map feature indices back to terms when ``showLable`` is true.
        true_k: number of clusters.
        minibatch: if true, train ``MiniBatchKMeans`` (sampled batches)
            instead of ``KMeans`` on the full data.
        showLable: if true, print the ten highest-weighted terms of every
            cluster centre.

    Returns:
        The negated ``km.score(X)``. scikit-learn documents ``score`` as the
        opposite of the k-means objective, so lower values mean a tighter
        clustering.
    """
    if minibatch:
        km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
                             init_size=1000, batch_size=1000, verbose=False)
    else:
        km = KMeans(n_clusters=true_k, init='k-means++', max_iter=300, n_init=1,
                    verbose=False)
    km.fit(X)

    if showLable:
        print("Top terms per cluster:")
        # For every centre, feature indices ordered by descending weight.
        order_centroids = km.cluster_centers_.argsort()[:, ::-1]
        # get_feature_names() was removed in scikit-learn 1.2; prefer the
        # replacement and fall back only on older releases.
        if hasattr(vectorizer, "get_feature_names_out"):
            terms = vectorizer.get_feature_names_out()
        else:
            terms = vectorizer.get_feature_names()
        print(vectorizer.get_stop_words())
        for i in range(true_k):
            top_terms = ['%s' % terms[ind] for ind in order_centroids[i, :10]]
            # Separate the terms with spaces; printed back to back they are
            # impossible to tell apart.
            print("Cluster %d: %s" % (i, " ".join(top_terms)))

    # Counter tallies the labels in a single pass (the previous
    # result.count(i)-per-element approach was quadratic) and keeps the
    # first-occurrence key order.
    distribution = Counter(km.predict(X).tolist())
    print('Cluster distribution:')
    print(dict(distribution))
    return -km.score(X)


# Choosing the number of clusters k
def k_determin(k_min=3, k_max=200):
    """Sweep the number of clusters and plot the error curve to help pick k.

    For every k in ``range(k_min, k_max)`` the data is clustered with
    ``train`` and the returned error divided by the number of documents is
    printed and plotted against k.

    Args:
        k_min: smallest number of clusters to try (inclusive).
        k_max: end of the sweep (exclusive). Adjust to the size of the data.

    Raises:
        ValueError: if no documents were loaded.
    """
    dataset = get_dbdata()
    n_docs = len(dataset)
    if not n_docs:
        raise ValueError("no documents loaded; cannot search for k")
    print("%d documents" % n_docs)
    X, vectorizer = transform(dataset, n_features=500)

    # k-means needs at least as many samples as clusters, so never sweep
    # past the number of documents.
    stop = min(k_max, n_docs + 1)
    true_ks = list(range(k_min, stop))
    scores = []
    for k in true_ks:
        score = train(X, vectorizer, true_k=k) / n_docs
        print(k, score)
        scores.append(score)

    plt.figure(figsize=(8, 4))
    plt.plot(true_ks, scores, label="error", color="red", linewidth=1)
    # The x axis is the number of clusters tried, not the feature count.
    plt.xlabel("n_clusters")
    plt.ylabel("error")
    plt.legend()
    plt.show()


def main(true_k=25):
    """Cluster the data with the chosen k and print the result.

    Prints the top terms per cluster and the cluster distribution (via
    ``train`` with ``showLable=True``), then the error per document.

    Args:
        true_k: number of clusters to use.

    Raises:
        ValueError: if no documents were loaded.
    """
    dataset = get_dbdata()
    if not dataset:
        raise ValueError("no documents loaded; nothing to cluster")
    X, vectorizer = transform(dataset, n_features=500)
    score = train(X, vectorizer, true_k=true_k, showLable=True) / len(dataset)
    print(score)


if __name__ == '__main__':
    start = time.time()
    # Run k_determin() first to choose k from the error curve, then pass
    # that value to main().
    main()
    elapsed = time.time() - start
    print('程序运行时间', elapsed)
# Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。 (All rights reserved; web-page footer, not part of the program.)