赞
踩
Python 3.7.6 (default, Jan 8 2020, 20:23:39) [MSC v.1916 64 bit (AMD64)]
Type "copyright", "credits" or "license" for more information.
IPython 7.12.0 -- An enhanced Interactive Python.
import pandas as pd
df = pd.read_csv(r'F:\Python\data_clean.csv')
df.head()
Out[3]:
ATM_POS TBM CSC
0 -0.852354 -0.294938 0.143935
1 -0.333078 -0.244334 0.939343
2 0.918067 0.593787 2.349496
3 -0.741847 -0.210507 -0.521592
4 -0.499703 -0.492714 -0.367629
##########################K-Means 聚类的第一种方式######################
####不进行变量分布的正太转换--用于寻找异常值
var = ["ATM_POS","TBM","CSC"] # var: variable-变量
skew_var = {}
for i in var:
skew_var[i]=abs(df[i].skew()) # .skew() 求该变量的偏度
skew=pd.Series(skew_var).sort_values(ascending=False)
skew
Out[4]:
TBM 51.881233
CSC 6.093417
ATM_POS 2.097633
dtype: float64
# 进行k-means聚类
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=3) # n_clusters=3 表示聚成3类
# kmeans = cluster.KMeans(n_clusters=3, init='random', n_init=1)
result = kmeans.fit(df)
result
Out[5]:
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
n_clusters=3, n_init=10, n_jobs=None, precompute_distances='auto',
random_state=None, tol=0.0001, verbose=0)
# 对分类结果进行解读
model_data_l = df.join(pd.DataFrame(result.labels_))
# .labels_ 表示这一个数据点属于什么类
model_data_l = model_data_l.rename(columns={0: "clustor"})
model_data_l.sample(10)
Out[6]:
ATM_POS TBM CSC clustor
77679 0.936609 0.009762 1.564585 1
32603 -0.209777 -0.215745 -0.076762 0
14039 1.623835 0.302078 2.613356 2
74096 -0.691668 -0.261177 -0.476536 0
99545 -0.883165 0.177141 -0.502084 0
78975 2.620059 0.312637 -0.399348 1
50228 -0.641716 -0.215492 -0.516080 0
44720 -0.887047 -0.409208 -0.179773 0
9908 -0.883445 0.108431 -0.502625 0
24114 1.216122 -0.316941 0.019256 1
# 饼图呈现
import matplotlib
model_data_l.clustor.value_counts().plot(kind = 'pie')
Out[7]: <matplotlib.axes._subplots.AxesSubplot at 0x1d50b13dd08>
################### k-means聚类的第二种方式 ###################
####进行变量分布的正态转换--用于客户细分
# 进行变量分布的正态转换
import numpy as np
from sklearn import preprocessing
quantile_transformer = \
preprocessing.QuantileTransformer(output_distribution='normal',
random_state=0) # 正态转换
## 插播:转换的方式有很多种,每种都会涉及一些咋看起来比较晦涩的统计学公式
## 但请不要担心,每种代码其实都是比较固定的,
## 这里使用 QT 转换(每种转换的原理和特点优劣等可参考网络资源)
df_trans = quantile_transformer.fit_transform(df)
df_trans = pd.DataFrame(df_trans)
# 因为 .fit_transform 转换出来的数据类型为 Series,
## 所以用 pandas 给 DataFrame 化一下
df_trans = df_trans.rename(columns={0: "ATM_POS", 1: "TBM", 2: "CSC"})
df_trans.head()
Out[8]:
ATM_POS TBM CSC
0 -0.501859 -0.265036 0.770485
1 0.097673 -0.154031 1.316637
2 0.952085 1.168354 1.845934
3 -0.333179 -0.084688 -1.780166
4 -0.071278 -0.888898 -0.066404
# 检验一下偏度:发现几乎都为 0 了
var = ["ATM_POS","TBM","CSC"]
skew_var = {}
# 循环计算偏度:发现都差不多等于 0 了。
for i in var:
skew_var[i] = abs(df_trans[i].skew())
skew = pd.Series(skew_var).sort_values(ascending=False)
skew
Out[9]:
ATM_POS 0.006430
CSC 0.000474
TBM 0.000046
dtype: float64
kmeans = KMeans(n_clusters=4) # 这次聚成 4 类
result = kmeans.fit(df_trans)
model_data_l = df_trans.join(pd.DataFrame(result.labels_))
model_data_l = model_data_l.rename(columns={0: "clustor"})
model_data_l.head()
Out[10]:
ATM_POS TBM CSC clustor
0 -0.501859 -0.265036 0.770485 3
1 0.097673 -0.154031 1.316637 2
2 0.952085 1.168354 1.845934 2
3 -0.333179 -0.084688 -1.780166 1
4 -0.071278 -0.888898 -0.066404 3
model_data_l.clustor.value_counts().plot(kind='pie',
shadow=True, autopct='%.2f%%') # 两位小数百分比
Out[11]: <matplotlib.axes._subplots.AxesSubplot at 0x1d50b1f2708>
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。