支持度support := 所有同时买 X 和 Y 的人数(即规则“买了 X 又买 Y”成立的次数)
置信度confidence := $\frac{\text{所有买 X 和 Y 的人数}}{\text{所有买 X 的人数}}$
# 引入库
import numpy as np
from operator import itemgetter
# Prepare the data: synthesize 100 random shopping baskets over 5 products.
# Column order: 0 = bread, 1 = milk, 2 = cheese, 3 = apples, 4 = bananas.
# NOTE(review): the branch nesting below was reconstructed from a
# whitespace-collapsed original -- confirm it matches the intended generator.
dataset_filename = "./data/affinity_dataset.txt"

X = np.zeros((100, 5), dtype='bool')
for row in range(X.shape[0]):
    basket = X[row]  # a view into X, so the assignments below write through
    if np.random.random() < 0.3:
        # A bread winner
        basket[0] = 1
        if np.random.random() < 0.5:
            basket[1] = 1  # who likes milk
        if np.random.random() < 0.2:
            basket[2] = 1  # who likes cheese
        if np.random.random() < 0.25:
            basket[3] = 1  # who likes apples
        if np.random.random() < 0.5:
            basket[4] = 1  # who likes bananas
    elif np.random.random() < 0.5:
        # Not a bread winner, but likes milk
        basket[1] = 1
        if np.random.random() < 0.2:
            basket[2] = 1  # who likes cheese
        if np.random.random() < 0.25:
            basket[3] = 1  # who likes apples
        if np.random.random() < 0.5:
            basket[4] = 1  # who likes bananas
    else:
        # Neither bread nor milk
        if np.random.random() < 0.8:
            basket[2] = 1  # who likes cheese
        if np.random.random() < 0.6:
            basket[3] = 1  # who likes apples
        if np.random.random() < 0.7:
            basket[4] = 1  # who likes bananas
    if not basket.any():
        basket[4] = 1  # must buy something, so gets bananas

# Save the generated matrix as 0/1 integers.
np.savetxt("./data/affinity_dataset.txt", X, fmt='%d')

# Read the data back in; loadtxt returns floats (see the sample output).
X = np.loadtxt(dataset_filename)
n_samples, n_features = X.shape
print(X.shape)
print(X[:5])
# Sample output (no random seed is set, so the values differ between runs):
# (100, 5)
# [[0. 0. 1. 1. 0.]
#  [1. 1. 0. 0. 0.]
#  [1. 0. 0. 1. 1.]
#  [0. 1. 1. 0. 1.]
#  [0. 1. 0. 0. 0.]]
下面 rule_valid 表示买了苹果又买香蕉的有多少人
支持度 := 同时买 X 和 Y 的人数,这里 X 是苹果、Y 是香蕉,所以支持度 = rule_valid
# affinity_dataset.txt is generated data with no header row, so the column
# names are supplied here by hand.
features = ["bread", "milk", "cheese", "apples", "bananas"]

# Count how many people bought apples (column 3).
num_apple_purchases = sum(1 for sample in X if sample[3] == 1)
print("买苹果的有{0}人".format(num_apple_purchases))

# Among apple buyers: those who also bought bananas (column 4) satisfy the
# rule "apples -> bananas"; every other apple buyer violates it.
rule_valid = sum(1 for sample in X if sample[3] == 1 and sample[4] == 1)
rule_invalid = num_apple_purchases - rule_valid
print("买了苹果又买香蕉的有{0}人".format(rule_valid))
print("买了苹果不买香蕉的有{0}人".format(rule_invalid))

# Support is the number of samples that satisfy the rule; confidence is that
# number divided by the number of samples matching the premise.
support = rule_valid
confidence = rule_valid / num_apple_purchases
print("支持度support = {0} 置信度confidence = {1:.3f}.".format(support, confidence))
# Confidence as a percentage.
print("置信度confidence的百分比形式为 {0:.1f}%.".format(100 * confidence))
# Sample output:
# 买苹果的有39人
# 买了苹果又买香蕉的有23人
# 买了苹果不买香蕉的有16人
# 支持度support = 23 置信度confidence = 0.590.
# 置信度confidence的百分比形式为 59.0%.
from collections import defaultdict

# The apples -> bananas rule above is a single case; now tally every ordered
# (premise, conclusion) pair of products.
valid_rules = defaultdict(int)     # (X, Y) -> samples that bought X and Y
invalid_rules = defaultdict(int)   # (X, Y) -> samples that bought X but not Y
num_occurences = defaultdict(int)  # X -> samples that bought X

for sample in X:
    # Only products actually present in this sample can act as a premise.
    purchased = [item for item in range(n_features) if sample[item] != 0]
    for premise in purchased:
        num_occurences[premise] += 1
        for conclusion in range(n_features):
            if conclusion == premise:
                continue  # skip the trivial rule "bought X, also bought X"
            tally = valid_rules if sample[conclusion] == 1 else invalid_rules
            tally[(premise, conclusion)] += 1

support = valid_rules
confidence = defaultdict(float)
for rule, hits in valid_rules.items():
    # Confidence = times the rule held / times its premise occurred.
    confidence[rule] = hits / num_occurences[rule[0]]
# Report every rule together with its confidence and support.
for rule, rule_confidence in confidence.items():
    premise, conclusion = rule
    print("Rule: 买了{0},又买{1}".format(features[premise], features[conclusion]))
    print(" - 置信度Confidence: {0:.3f}".format(rule_confidence))
    print(" - 支持度Support: {0}".format(support[rule]))
    print("")
# Sample output:
# Rule: 买了cheese,又买apples
#  - 置信度Confidence: 0.553
#  - 支持度Support: 26
#
# Rule: 买了apples,又买cheese
#  - 置信度Confidence: 0.667
#  - 支持度Support: 26
#
# Rule: 买了bread,又买milk
#  - 置信度Confidence: 0.619
#  - 支持度Support: 13
#
# Rule: 买了milk,又买bread
#  - 置信度Confidence: 0.265
#  - 支持度Support: 13
#
# Rule: 买了bread,又买apples
#  - 置信度Confidence: 0.286
#  - 支持度Support: 6
#
# Rule: 买了bread,又买bananas
#  - 置信度Confidence: 0.476
#  - 支持度Support: 10
#
# Rule: 买了apples,又买bread
#  - 置信度Confidence: 0.154
#  - 支持度Support: 6
#
# Rule: 买了apples,又买bananas
#  - 置信度Confidence: 0.590
#  - 支持度Support: 23
#
# Rule: 买了bananas,又买bread
#  - 置信度Confidence: 0.185
#  - 支持度Support: 10
#
# Rule: 买了bananas,又买apples
#  - 置信度Confidence: 0.426
#  - 支持度Support: 23
#
# Rule: 买了milk,又买cheese
#  - 置信度Confidence: 0.204
#  - 支持度Support: 10
#
# Rule: 买了milk,又买bananas
#  - 置信度Confidence: 0.429
#  - 支持度Support: 21
#
# Rule: 买了cheese,又买milk
#  - 置信度Confidence: 0.213
#  - 支持度Support: 10
#
# Rule: 买了cheese,又买bananas
#  - 置信度Confidence: 0.532
#  - 支持度Support: 25
#
# Rule: 买了bananas,又买milk
#  - 置信度Confidence: 0.389
#  - 支持度Support: 21
#
# Rule: 买了bananas,又买cheese
#  - 置信度Confidence: 0.463
#  - 支持度Support: 25
#
# Rule: 买了bread,又买cheese
#  - 置信度Confidence: 0.238
#  - 支持度Support: 5
#
# Rule: 买了cheese,又买bread
#  - 置信度Confidence: 0.106
#  - 支持度Support: 5
#
# Rule: 买了milk,又买apples
#  - 置信度Confidence: 0.184
#  - 支持度Support: 9
#
# Rule: 买了apples,又买milk
#  - 置信度Confidence: 0.231
#  - 支持度Support: 9
# Wrap the rule printing in a function so it is easy to reuse.
def print_rule(premise, conclusion, support, confidence, features):
    """Print one "bought X, also bought Y" rule with its confidence and support.

    Parameters
    ----------
    premise: index into ``features`` of the product X that was bought.
    conclusion: index into ``features`` of the product Y that was also bought.
    support: mapping indexed by ``(premise, conclusion)`` giving the support.
    confidence: mapping indexed by ``(premise, conclusion)`` giving the confidence.
    features: sequence of product names, indexed by ``premise`` / ``conclusion``.
    """
    rule = (premise, conclusion)
    report = (
        "Rule: 买了{0},又买{1}".format(features[premise], features[conclusion]),
        " - 置信度Confidence: {0:.3f}".format(confidence[rule]),
        " - 支持度Support: {0}".format(support[rule]),
        "",
    )
    for line in report:
        print(line)


# Example: the rule milk (1) -> apples (3).
premise, conclusion = 1, 3
print_rule(premise, conclusion, support, confidence, features)
# Sample output:
# Rule: 买了milk,又买apples
#  - 置信度Confidence: 0.184
#  - 支持度Support: 9
from pprint import pprint

# Dump every ((premise, conclusion), support) pair.
# NOTE(review): the original heading said "sort by support", but no sorting
# happens here -- the pairs are printed in the dict's own iteration order, as
# the sample output shows.
support_pairs = list(support.items())
pprint(support_pairs)
# Sample output:
# [((2, 3), 26),
#  ((3, 2), 26),
#  ((0, 1), 13),
#  ((1, 0), 13),
#  ((0, 3), 6),
#  ((0, 4), 10),
#  ((3, 0), 6),
#  ((3, 4), 23),
#  ((4, 0), 10),
#  ((4, 3), 23),
#  ((1, 2), 10),
#  ((1, 4), 21),
#  ((2, 1), 10),
#  ((2, 4), 25),
#  ((4, 1), 21),
#  ((4, 2), 25),
#  ((0, 2), 5),
#  ((2, 0), 5),
#  ((1, 3), 9),
#  ((3, 1), 9)]
# Rank the rules by confidence, highest first, and show the top five.
sorted_confidence = sorted(confidence.items(), key=itemgetter(1), reverse=True)
# Slice instead of indexing range(5) so that fewer than five rules does not
# raise IndexError.
for index, (rule, _rule_confidence) in enumerate(sorted_confidence[:5]):
    print("Rule #{0}".format(index + 1))
    (premise, conclusion) = rule
    print_rule(premise, conclusion, support, confidence, features)
# Sample output:
# Rule #1
# Rule: 买了apples,又买cheese
#  - 置信度Confidence: 0.667
#  - 支持度Support: 26
#
# Rule #2
# Rule: 买了bread,又买milk
#  - 置信度Confidence: 0.619
#  - 支持度Support: 13
#
# Rule #3
# Rule: 买了apples,又买bananas
#  - 置信度Confidence: 0.590
#  - 支持度Support: 23
#
# Rule #4
# Rule: 买了cheese,又买apples
#  - 置信度Confidence: 0.553
#  - 支持度Support: 26
#
# Rule #5
# Rule: 买了cheese,又买bananas
#  - 置信度Confidence: 0.532
#  - 支持度Support: 25
OneR 的规则很直接:对某个特征的某个取值,如果训练样本中取该值的 10 个样本里有 6 个属于 A 类,那么凡是取该值的样本都预测为 A 类——也就是按训练数据中出现最多的类别来预测,所以它依赖于训练数据的统计。
不过这样的规则过于简单,下面继续实验会发现准确率只有 60% 左右;鸢尾花数据集共有 3 个类别,完全随机预测的准确率约为 33%,所以还是比随机预测好!
from sklearn.datasets import load_iris
# Alternative: load a local copy that was downloaded into the data folder beforehand.
#X, y = np.loadtxt("X_classification.txt"), np.loadtxt("y_classification.txt")
dataset = load_iris()  # load the iris dataset through scikit-learn
X, y = dataset.data, dataset.target
print(dataset.DESCR)  # print the dataset description
n_samples, n_features = X.shape

# Compute the mean of each attribute (one value per column).
attribute_means = np.mean(X, axis=0)
assert attribute_means.shape == (n_features,)
# Discretise: 1 where a value is at or above its column mean, otherwise 0.
X_d = (X >= attribute_means).astype('int')
# Split the data into a training set and a test set.
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; train_test_split now lives in sklearn.model_selection. Fall back to the
# old module only for very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18
    from sklearn.cross_validation import train_test_split

# Fix the random seed so the split is reproducible (the original note says this
# is done to reproduce the book's results).
random_state = 14
X_train, X_test, y_train, y_test = train_test_split(X_d, y, random_state=random_state)
# These print the shape tuples of the label arrays, e.g. "(112,)".
print("训练集数据有 {} 条".format(y_train.shape))
print("测试集数据有 {} 条".format(y_test.shape))
训练集数据有 (112,) 条
测试集数据有 (38,) 条
from collections import defaultdict
from operator import itemgetter


def train(X, y_true, feature):
    """Compute the predictors and error for a given feature using the OneR algorithm.

    Parameters
    ----------
    X: array [n_samples, n_features]
        The two dimensional array that holds the dataset. Each row is a
        sample, each column is a feature.

    y_true: array [n_samples,]
        The one dimensional array that holds the class values. Corresponds to
        X, such that y_true[i] is the class value for sample X[i].

    feature: int
        Index of the feature to build the rule on.
        0 <= feature < n_features

    Returns
    -------
    predictors: dict mapping feature value -> predicted class
        For each distinct value the feature takes in X, the class that occurs
        most often among the samples having that value.

    error: int
        The number of samples in X that this rule misclassifies, summed over
        all values of the feature (a count, not a ratio).
    """
    _, n_features = X.shape
    assert 0 <= feature < n_features
    # Every distinct value this feature takes in the data.
    values = set(X[:, feature])
    predictors = {}
    total_error = 0
    for current_value in values:
        most_frequent_class, error = train_feature_value(
            X, y_true, feature, current_value)
        predictors[current_value] = most_frequent_class
        total_error += error
    return predictors, total_error


def train_feature_value(X, y_true, feature, value):
    """Find the most frequent class among samples whose feature equals value.

    Parameters
    ----------
    X, y_true, feature: as for ``train``.
    value: the feature value to select samples by.

    Returns
    -------
    most_frequent_class: the class seen most often among the selected samples
        (on a tie, the class counted first wins).
    error: int
        How many of the selected samples belong to any other class.

    Raises
    ------
    IndexError
        If no sample has ``X[i][feature] == value``.
    """
    # Count how often each class occurs among the matching samples.
    class_counts = defaultdict(int)
    for sample, y in zip(X, y_true):
        if sample[feature] == value:
            class_counts[y] += 1
    # Rank the classes by count, largest first; the sort is stable, so ties
    # keep their first-seen order.
    sorted_class_counts = sorted(class_counts.items(), key=itemgetter(1), reverse=True)
    most_frequent_class = sorted_class_counts[0][0]
    # Every matching sample whose true class is not the majority class is an error.
    error = sum(class_count for class_value, class_count in class_counts.items()
                if class_value != most_frequent_class)
    return most_frequent_class, error
# Train one OneR rule per feature: variable -> (predictors, total error).
all_predictors = {}
for variable in range(X_train.shape[1]):
    all_predictors[variable] = train(X_train, y_train, variable)
errors = {variable: error for variable, (mapping, error) in all_predictors.items()}

# Pick the best model and store it as "model": rank the features by error,
# smallest first, and take the first entry.
ranked_errors = sorted(errors.items(), key=itemgetter(1))
best_variable, best_error = ranked_errors[0]
print("最佳模型基于第 {0} 个变量,误差为 {1:.2f}".format(best_variable, best_error))

# The model is the chosen feature index together with its value -> class mapping.
model = {'variable': best_variable,
         'predictor': all_predictors[best_variable][0]}
print(model)
# Sample output:
# 最佳模型基于第 2 个变量,误差为 37.00
# {'variable': 2, 'predictor': {0: 0, 1: 2}}
def predict(X_test, model):
    """Predict a class for every sample using a OneR model.

    Parameters
    ----------
    X_test: iterable of samples; each sample is indexed by the model's
        feature index.
    model: dict with keys 'variable' (index of the feature the rule uses)
        and 'predictor' (mapping from feature value to predicted class).

    Returns
    -------
    y_predicted: numpy array with one predicted class per sample.

    Raises
    ------
    KeyError
        If a sample's feature value has no entry in the predictor mapping.
    """
    variable = model['variable']
    predictor = model['predictor']
    # The feature value is cast to int before the lookup in the mapping.
    y_predicted = np.array([predictor[int(sample[variable])] for sample in X_test])
    return y_predicted
from sklearn.metrics import classification_report

# Predict on the held-out test set and show the raw predictions.
y_predicted = predict(X_test, model)
print(y_predicted)

# Accuracy: the share of predictions equal to the true label, as a percentage.
accuracy = (y_predicted == y_test).mean() * 100
print("在测试集上的准确率 {:.1f}%".format(accuracy))

# Per-class precision / recall / f1 report.
print(classification_report(y_test, y_predicted))
# Sample output:
# [0 0 0 2 2 2 0 2 0 2 2 0 2 2 0 2 0 2 2 2 0 0 0 2 0 2 0 2 2 0 0 0 2 0 2 0 2 2]
# 在测试集上的准确率 65.8%
#              precision    recall  f1-score   support
#           0       0.94      1.00      0.97        17
#           1       0.00      0.00      0.00        13
#           2       0.40      1.00      0.57         8
# avg / total       0.51      0.66      0.55        38

在测试集上的准确率 65.8%;鸢尾花数据集共有 3 个类别,完全随机预测的准确率约为 33%,所以 OneR 比随机预测好不少。
