赞
踩
import numpy as np from scipy.stats import multivariate_normal from sklearn.mixture import GaussianMixture from mpl_toolkits.mplot3d import Axes3D import matplotlib as mpl import matplotlib.pyplot as plt from sklearn.metrics.pairwise import pairwise_distances_argmin mpl.rcParams['font.sans-serif'] = [u'SimHei'] mpl.rcParams['axes.unicode_minus'] = False if __name__ == '__main__': style = 'myself' np.random.seed(0) # 构造均值与方差 mu1_fact = (0, 0, 0) # numpy.diag()返回一个矩阵的对角线元素,或者创建一个对角阵( diagonal array.) cov1_fact = np.diag((1, 2, 3)) # 依据指定的均值和协方差生成高斯分布数据 data1 = np.random.multivariate_normal(mu1_fact, cov1_fact, 400) # print(data1) mu2_fact = (2, 2, 1) cov2_fact = np.array(((1, 1, 3), (1, 2, 1), (0, 0, 1))) data2 = np.random.multivariate_normal(mu2_fact, cov2_fact, 100) # print(data2) # 纵向叠加矩阵 data = np.vstack((data1, data2)) # print(data) y = np.array([True] * 400 + [False] * 100) if style == 'sklearn': # 高斯分布 # n_components :混合元素(聚类)的数量,默认为1 # covariance_type:描述要使用的协方差参数类型的字符串,必选一个(‘full’ , ‘tied’, ‘diag’, ‘spherical’),默认为full, # full:每个混合元素有它公用的协方差矩阵;即表示不要求方差相等,不要求方差平行坐标轴 # tol:float类型, 默认值: 0.001.收敛阈值,当平均增益低于这个值时迭代停止 # max_iter:最大迭代次数,默认为100 g = GaussianMixture(n_components=2, covariance_type='full', tol=1e-6, max_iter=1000) g.fit(data) # weights_ : array-like, shape (n_components,),每个混合元素权重 # means_ : array-like, shape (n_components, n_features),每个混合元素均值 # covariances_ : array-like,每个混合元素的协方差,它的形状依靠协方差类型 print('类别概率:\t', g.weights_[1]) print('均值:\n', g.means_, '\n') print('方差:\n', g.covariances_, '\n') mu1, mu2 = g.means_ sigma1, sigma2 = g.covariances_ # 自定义的基于高斯分布的EM算法 else: num_iter = 100 n, d = data.shape # 随机指定 # mu1 = np.random.standard_normal(d) # print mu1 # mu2 = np.random.standard_normal(d) # print mu2 # 均值、方差、先验概率pi mu1 = data.min(axis=0) mu2 = data.max(axis=0) # Numpy.identity()的document. 输入n为行数或列数,返回一个n*n的对角阵,对角线元素为1,其余为0 sigma1 = np.identity(d) sigma2 = np.identity(d) pi = 0.5 # EM for i in range(num_iter): # E Step # 两个模型 norm1 = multivariate_normal(mu1, sigma1) norm2 = multivariate_normal(mu2, sigma2) # 先验概率*概率密度 tau1 = pi * norm1.pdf(data) tau2 = (1 - pi) * norm2.pdf(data) # ganma值,对于样本x_i,它由第一个组分生成的概率 gamma = tau1 / (tau1 + tau2) # M Step mu1 = np.dot(gamma, data) / np.sum(gamma) mu2 = np.dot((1 - gamma), data) / np.sum((1 - gamma)) sigma1 = np.dot(gamma * (data - mu1).T, data - mu1) / np.sum(gamma) sigma2 = np.dot((1 - gamma) * (data - mu2).T, data - mu2) / np.sum(1 - gamma) pi = np.sum(gamma) / n # print(i, ":\t", mu1, mu2) print('类别概率:\t', pi) print('均值:\t', mu1, mu2) print('方差:\n', sigma1, '\n\n', sigma2, '\n') # 预测分类 # multivariate_normal根据实际情况生成一个多元正态分布矩阵 norm1 = multivariate_normal(mu1, sigma1) norm2 = multivariate_normal(mu2, sigma2) # 计算在data下的概率密度值 tau1 = norm1.pdf(data) tau2 = norm2.pdf(data) # 面向对象 fig = plt.figure(figsize=(13, 7), facecolor='w') # 3d ax = fig.add_subplot(121, projection='3d') ax.scatter(data[:, 0], data[:, 1], data[:, 2], c='b', s=30, marker='o', depthshade=True) ax.set_xlabel('X') ax.set_ylabel('Y') ax.set_zlabel('Z') ax.set_title(u'原始数据', fontsize=18) ax = fig.add_subplot(122, projection='3d') # pairwise_distances_argmin使用欧几里得距离,返回的是X距离Y最近点的index,如果是[0,1]则没问题。如果[1,0]则反了 order = pairwise_distances_argmin([mu1_fact, mu2_fact], [mu1, mu2], metric='euclidean') print(order) # [1 0] # 调整顺序 if order[0] == 0: c1 = tau1 > tau2 else: c1 = tau1 < tau2 c2 = ~c1 acc = np.mean(y == c1) print(u'准确率:%.2f%%' % (100*acc)) ax.scatter(data[c1, 0], data[c1, 1], data[c1, 2], c='r', s=30, marker='o', depthshade=True) ax.scatter(data[c2, 0], data[c2, 1], data[c2, 2], c='g', s=30, marker='^', depthshade=True) ax.set_xlabel('X') ax.set_ylabel('Y') ax.set_zlabel('Z') ax.set_title(u'EM算法分类', fontsize=18) plt.suptitle(u'EM算法的实现', fontsize=21) plt.subplots_adjust(top=0.90) plt.tight_layout() plt.show()
类别概率: 0.7650337783291882
均值: [-0.123994 -0.02138048 -0.06003756] [1.9076683 1.79622192 1.11752474]
方差:
[[ 0.82563399 -0.10180706 -0.0414597 ]
[-0.10180706 2.15816316 -0.16360603]
[-0.0414597 -0.16360603 2.79283956]]
[[0.69690051 0.90370392 0.73552321]
[0.90370392 1.8856117 0.76747618]
[0.73552321 0.76747618 2.94819132]]
[0 1]
准确率:89.80%
import numpy as np from sklearn.mixture import GaussianMixture from sklearn.model_selection import train_test_split import matplotlib as mpl import matplotlib.colors import matplotlib.pyplot as plt # 指定字体 mpl.rcParams['font.sans-serif'] = [u'SimHei'] mpl.rcParams['axes.unicode_minus'] = False # from matplotlib.font_manager import FontProperties # font_set = FontProperties(fname=r"c:\windows\fonts\simsun.ttc", size=15) # fontproperties=font_set def expand(a, b): d = (b - a) * 0.05 return a-d, b+d if __name__ == '__main__': data = np.loadtxt('HeightWeight.csv', dtype=np.float, delimiter=',', skiprows=1) print(data.shape) y, x = np.split(data, [1, ], axis=1) x, x_test, y, y_test = train_test_split(x, y, train_size=0.6, random_state=0) # 高斯混合模型 gmm = GaussianMixture(n_components=2, covariance_type='full', random_state=0) x_min = np.min(x, axis=0) x_max = np.max(x, axis=0) gmm.fit(x) print('均值 = \n', gmm.means_) print('方差 = \n', gmm.covariances_) y_hat = gmm
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。