赞
踩
熵的定义:$H(X) = -\sum_{i=1}^{n} p_i \log_2 p_i$,其中 $p_i = P(X = x_i)$。
条件熵的定义:$H(Y \mid X) = \sum_{i=1}^{n} p_i \, H(Y \mid X = x_i)$,即按 $X$ 的取值加权的 $Y$ 的熵。
信息增益计算流程:$g(D, A) = H(D) - H(D \mid A)$,即数据集经验熵与给定特征 $A$ 的经验条件熵之差。
import numpy as np
import math


def entropy(y_values):
    """Shannon entropy H(Y) = -sum_k p_k * log2(p_k) of a label array.

    Accepts a 1-D array or an (n, 1) column vector (the original required a
    shape that `np.hstack` could stack; `np.ravel` generalizes this).

    :param y_values: array-like of discrete labels, 1-D or column vector
    :return: entropy in bits (float, >= 0)
    """
    y = np.ravel(y_values)
    e = 0.0
    for val in np.unique(y):
        # p > 0 is guaranteed: np.unique only yields values present in y,
        # so math.log never sees 0.
        p = np.sum(y == val) / len(y)
        e += p * math.log(p, 2)
    return -1 * e


def entropy_condition(x_values, y_values):
    """Information gain g(D, A) = H(D) - H(D|A) of feature x w.r.t. labels y.

    NOTE: despite the name, this returns the information GAIN, not the
    conditional entropy H(Y|X) (kept for backward compatibility with callers).

    :param x_values: feature column, 1-D or (n, 1)
    :param y_values: label column, 1-D or (n, 1)
    :return: information gain in bits (float)
    """
    x = np.ravel(x_values)
    y = np.ravel(y_values)
    ey = entropy(y)
    ey_condition = 0.0
    for x_val in np.unique(x):
        # Weight each subset's label entropy by the subset's probability.
        # Masking y directly avoids the hstack copy the original built.
        mask = x == x_val
        px = np.sum(mask) / len(x)
        ey_condition += px * entropy(y[mask])
    return ey - ey_condition


def entropy_condition_ratio(x_values, y_values):
    """Information gain ratio g(D, A) / H_A(D).

    Normalizing by the feature's own entropy removes the bias of plain
    information gain toward features with many distinct values.

    :param x_values: feature column, 1-D or (n, 1)
    :param y_values: label column, 1-D or (n, 1)
    :return: gain ratio (float); 0.0 for a constant feature, whose gain is 0
             (the original raised ZeroDivisionError in that case)
    """
    hx = entropy(x_values)
    if hx == 0:
        return 0.0
    return entropy_condition(x_values, y_values) / hx
# Toy data set: columns 0..3 are features A1..A4, last column is the label.
xy = np.array([[0,0,0,0,0,1,1,1,1,1,2,2,2,2,2],
               [0,0,1,1,0,0,0,1,0,0,0,0,1,1,0],
               [0,0,0,1,0,0,0,1,1,1,1,1,0,0,0],
               [0,1,1,0,0,0,1,1,2,2,2,1,1,2,0],
               [0,0,1,1,0,0,0,1,1,1,1,1,1,1,0]]).T

# Print the information gain of each feature A1..A4 against the label.
labels = xy[:, -1].reshape(-1, 1)
for feature_idx in range(4):
    print(entropy_condition(xy[:, feature_idx].reshape(-1, 1), labels))
# Same toy data set; features A1..A4 in columns 0..3, label in the last column.
xy = np.array([[0,0,0,0,0,1,1,1,1,1,2,2,2,2,2],
               [0,0,1,1,0,0,0,1,0,0,0,0,1,1,0],
               [0,0,0,1,0,0,0,1,1,1,1,1,0,0,0],
               [0,1,1,0,0,0,1,1,2,2,2,1,1,2,0],
               [0,0,1,1,0,0,0,1,1,1,1,1,1,1,0]]).T

# Print the information gain ratio of each feature A1..A4 against the label.
labels = xy[:, -1].reshape(-1, 1)
for feature_idx in range(4):
    print(entropy_condition_ratio(xy[:, feature_idx].reshape(-1, 1), labels))
$$\operatorname{Gini}(p)=\sum_{k=1}^{K} p_{k}\left(1-p_{k}\right)=1-\sum_{k=1}^{K} p_{k}^{2}$$
$$\operatorname{Gini}(D)=1-\sum_{k=1}^{K}\left(\frac{\left|C_{k}\right|}{|D|}\right)^{2}$$
$$\operatorname{Gini}(D, A)=\frac{\left|D_{1}\right|}{|D|} \operatorname{Gini}\left(D_{1}\right)+\frac{\left|D_{2}\right|}{|D|} \operatorname{Gini}\left(D_{2}\right)$$
def gini(y_values):
    """Gini impurity Gini(D) = 1 - sum_k p_k^2 of a label array.

    Accepts a 1-D array or an (n, 1) column vector (generalized via
    `np.ravel`, consistent with the entropy helpers above).

    :param y_values: array-like of discrete labels, 1-D or column vector
    :return: Gini impurity (float in [0, 1))
    """
    y = np.ravel(y_values)
    g = 0.0
    for val in np.unique(y):
        p = np.sum(y == val) / len(y)
        g += p * p
    return 1 - g


def gini_condition(x_values, y_values):
    """CART binary-split Gini index for every candidate value of a feature.

    For each distinct value v of x, the data is split into {x == v} and
    {x != v} and the split's weighted Gini impurity is computed:
    Gini(D, A=v) = |D1|/|D| * Gini(D1) + |D2|/|D| * Gini(D2).

    :param x_values: feature column, 1-D or (n, 1)
    :param y_values: label column, 1-D or (n, 1)
    :return: dict mapping each feature value to its split Gini index
    """
    x = np.ravel(x_values)
    y = np.ravel(y_values)
    n = len(x)
    g_condition = {}
    for x_val in np.unique(x):
        # Boolean masks replace the original hstack + np.where round trip.
        mask = x == x_val
        y_in, y_out = y[mask], y[~mask]
        g_condition[x_val] = (len(y_in) / n * gini(y_in)
                              + len(y_out) / n * gini(y_out))
    return g_condition
# Same toy data set; features A1..A4 in columns 0..3, label in the last column.
xy = np.array([[0,0,0,0,0,1,1,1,1,1,2,2,2,2,2],
               [0,0,1,1,0,0,0,1,0,0,0,0,1,1,0],
               [0,0,0,1,0,0,0,1,1,1,1,1,0,0,0],
               [0,1,1,0,0,0,1,1,2,2,2,1,1,2,0],
               [0,0,1,1,0,0,0,1,1,1,1,1,1,1,0]]).T

# Print the per-value binary-split Gini index of each feature A1..A4.
labels = xy[:, -1].reshape(-1, 1)
for feature_idx in range(4):
    print(gini_condition(xy[:, feature_idx].reshape(-1, 1), labels))
from sklearn.ensemble import RandomForestClassifier

# Sanity check: compare the hand-computed split criteria above against
# sklearn's impurity-based feature importances on the same toy data.
forest = RandomForestClassifier(random_state=42)
forest.fit(xy[:, :-1], xy[:, -1])
print(forest.feature_importances_)
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。