赞
踩
参考链接:apriori算法 python实现
Apriori算法是挖掘布尔关联规则频繁项集的算法。利用的是Apriori规则:频繁项集的所有非空子集也必须是频繁的。
from itertools import combinations

import numpy as np

# Apriori frequent-itemset mining and association-rule generation.

# Sample transactions. The rows have different lengths, so the array is built
# with dtype=object: without it, NumPy >= 1.24 raises ValueError for a ragged
# nested sequence. The result is a 1-D object array whose elements are lists.
data_set = np.array([
    ['l1', 'l2', 'l5'],
    ['l2', 'l4'],
    ['l2', 'l3'],
    ['l1', 'l2', 'l4'],
    ['l1', 'l3'],
    ['l2', 'l3'],
    ['l1', 'l3'],
    ['l1', 'l2', 'l3', 'l5'],
    ['l1', 'l2', 'l3'],
], dtype=object)


def get_C1(data_set):
    """Return the candidate 1-itemsets of ``data_set``.

    data_set -- iterable of transactions (each an iterable of hashable items)

    Returns a set holding one single-element frozenset per distinct item.
    """
    return {frozenset([item]) for transaction in data_set for item in transaction}


def getLByC(data_set, C, min_support):
    """Keep the candidates in ``C`` whose support reaches ``min_support``.

    data_set    -- sized iterable of transactions
    C           -- iterable of candidate itemsets (frozensets)
    min_support -- minimum support, as a fraction of the transaction count

    Returns a dict mapping each retained itemset to its support. Candidates
    that occur in no transaction are never reported.
    """
    total = float(len(data_set))
    L = {}
    for candidate in C:
        count = sum(1 for transaction in data_set if candidate.issubset(transaction))
        if count == 0:
            # Also keeps the division below away from an empty data set.
            continue
        support = count / total
        if support >= min_support:
            L[candidate] = support
    return L


def getCByL(L, k):
    """Build the candidate k-itemsets from the frequent (k-1)-itemsets.

    L -- dict keyed by the frequent (k-1)-itemsets (frozensets)
    k -- size of the candidates to generate

    Join step: two itemsets are merged when their sorted item lists agree on
    the first ``k - 2`` items. Prune step: a merged candidate is kept only if
    every subset obtained by dropping one item is itself a key of ``L``.

    Returns a set of frozensets.
    """
    # Sort each itemset once instead of on every pairing.
    ordered = [(itemset, sorted(itemset)) for itemset in L]
    prefix_len = k - 2
    C = set()
    # combinations() visits each unordered pair exactly once and never pairs
    # an itemset with itself.
    for (set_a, items_a), (set_b, items_b) in combinations(ordered, 2):
        if items_a[:prefix_len] != items_b[:prefix_len]:
            continue
        candidate = set_a | set_b
        # Membership is tested against the dict itself (hash lookup).
        if all(candidate - {item} in L for item in candidate):
            C.add(candidate)
    return C


def get_L(data_set, k, min_support):
    """Mine the frequent itemsets of size 1 through ``k``.

    data_set    -- sized iterable of transactions
    k           -- largest itemset size to mine
    min_support -- minimum support, as a fraction of the transaction count

    Returns ``(L, support_data)``: ``L`` is a list whose entry ``i`` is the
    dict of frequent (i+1)-itemsets with their supports, and ``support_data``
    is the union of those dicts.
    """
    # Level 1 is seeded from the raw items rather than from a previous level.
    level = getLByC(data_set, get_C1(data_set), min_support)
    L = [level]
    for size in range(2, k + 1):
        level = getLByC(data_set, getCByL(level, size), min_support)
        L.append(level)

    support_data = {}
    for level in L:
        support_data.update(level)
    return L, support_data


def get_rule(L, support_data, min_support, min_conf):
    """Derive the association rules whose confidence reaches ``min_conf``.

    L            -- list of per-size frequent-itemset dicts, smallest first
    support_data -- dict mapping every frequent itemset to its support
    min_support  -- not used here; kept so existing callers keep working
    min_conf     -- minimum confidence for a rule to be reported

    Returns a list of ``(antecedent, consequent, confidence)`` tuples, where
    confidence = support(antecedent | consequent) / support(antecedent).
    """
    big_rules = []
    # Itemsets already visited; since L is ordered by size, every smaller
    # frequent itemset is in here by the time a larger one is processed.
    seen = []
    for level in L:
        for fset in level:
            for sub_set in seen:
                if not sub_set.issubset(fset):
                    continue
                antecedent = fset - sub_set
                conf = support_data[fset] / support_data[antecedent]
                rule = (antecedent, sub_set, conf)
                if conf >= min_conf and rule not in big_rules:
                    big_rules.append(rule)
            seen.append(fset)
    return big_rules


def main():
    """Mine the sample data set and print frequent itemsets and strong rules."""
    min_support = 0.2  # minimum support
    min_conf = 0.7  # minimum confidence
    L, support_data = get_L(data_set, 3, min_support)  # all frequent itemsets
    big_rule = get_rule(L, support_data, min_support, min_conf)  # strong rules

    print('===================所有的频繁项集如下===========================\n')
    for l in L:
        for l_item in l:
            print(l_item, end=' ')
            print('支持度为:%f' % l[l_item])
        print('===================================================')
    for rule in big_rule:
        print(rule[0], '==>', rule[1], '\t\tconf = ', rule[2])


if __name__ == "__main__":
    main()
频繁项集和关联规则结果为:
===================所有的频繁项集如下===========================

frozenset({'l1'}) 支持度为:0.666667
frozenset({'l4'}) 支持度为:0.222222
frozenset({'l5'}) 支持度为:0.222222
frozenset({'l2'}) 支持度为:0.777778
frozenset({'l3'}) 支持度为:0.666667
===================================================
frozenset({'l2', 'l1'}) 支持度为:0.444444
frozenset({'l1', 'l3'}) 支持度为:0.444444
frozenset({'l5', 'l2'}) 支持度为:0.222222
frozenset({'l2', 'l4'}) 支持度为:0.222222
frozenset({'l5', 'l1'}) 支持度为:0.222222
frozenset({'l2', 'l3'}) 支持度为:0.444444
===================================================
frozenset({'l2', 'l1', 'l3'}) 支持度为:0.222222
frozenset({'l2', 'l5', 'l1'}) 支持度为:0.222222
===================================================
frozenset({'l5'}) ==> frozenset({'l2'}) conf = 1.0
frozenset({'l4'}) ==> frozenset({'l2'}) conf = 1.0
frozenset({'l5'}) ==> frozenset({'l1'}) conf = 1.0
frozenset({'l5', 'l2'}) ==> frozenset({'l1'}) conf = 1.0
frozenset({'l5', 'l1'}) ==> frozenset({'l2'}) conf = 1.0
frozenset({'l5'}) ==> frozenset({'l2', 'l1'}) conf = 1.0
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。