赞
踩
基础环境:anaconda3
开发环境
jupyter lab
使用教程:https://blog.csdn.net/tangyi2008/article/details/123761210
安装mlxtend包
pip install mlxtend -i https://pypi.tuna.tsinghua.edu.cn/simple/
购物篮数据集:data/basket.txt
网盘链接:https://pan.baidu.com/s/1SbDJXwGCoCUM4JdU_NTjAQ?pwd=jiau
提取码:jiau
import pandas as pd
import numpy as np
from mlxtend.preprocessing import TransactionEncoder
# Read the basket file: one transaction per line, items separated by tabs.
with open('data/basket.txt') as fp:
    products = [line.strip().split('\t') for line in fp]

# One-hot encode the transactions: one row per transaction, one column per
# item name learned by the encoder.
onehot_enc = TransactionEncoder()
tmp_array = onehot_enc.fit_transform(products)
products_ = pd.DataFrame(tmp_array, columns=onehot_enc.columns_)

# Bare expression so a notebook cell displays the first five rows.
products_.head(5)
from mlxtend.frequent_patterns import apriori
# Mine itemsets with a minimum support of 0.1, labelled by item name instead
# of column index, with no cap on itemset size.
freq_items = apriori(
    products_,
    min_support=0.1,
    use_colnames=True,
    max_len=None,
)
# Record how many items each itemset contains.
freq_items['length'] = freq_items['itemsets'].apply(len)
freq_items.head(5)
from mlxtend.frequent_patterns import association_rules
# Derive association rules from the itemsets, using lift as the metric with
# a threshold of 1.
rules = association_rules(
    freq_items,
    metric='lift',
    min_threshold=1,
)
# Show only the rules whose confidence exceeds 0.5 and whose lift exceeds 1.
rules.loc[(rules['confidence'] > 0.5) & (rules['lift'] > 1)]
# Other evaluation metrics can be tried here as well.
度量指标:
support(A->C) = support(A+C) [aka 'support'], range: [0, 1]
confidence(A->C) = support(A+C) / support(A), range: [0, 1]
lift(A->C) = confidence(A->C) / support(C), range: [0, inf]
leverage(A->C) = support(A->C) - support(A)*support(C), range: [-1, 1]
conviction(A->C) = [1 - support(C)] / [1 - confidence(A->C)], range: [0, inf]
zhangs_metric(A->C) = leverage(A->C) / max(support(A->C)*(1-support(A)), support(A)*(support(C)-support(A->C))), range: [-1, 1]
import pandas as pd
def my_transaction_encoder(file, encoding=None):
    """Load a tab-separated basket file into a boolean one-hot DataFrame.

    Each line of the file is one transaction; item names on a line are
    separated by tab characters.

    Args:
        file: Path of the basket file to read.
        encoding: Text encoding passed to ``open``. ``None`` (the default)
            keeps the platform default encoding.

    Returns:
        A DataFrame with one row per line of the file and one boolean column
        per distinct item name; a cell is True when that transaction
        contains the item. A blank line yields an all-False row.
    """
    with open(file, encoding=encoding) as fp:
        # Empty tokens (blank lines, doubled tabs) are dropped so they do not
        # show up as a phantom item named ''.
        rows = [
            {item: True for item in line.strip().split('\t') if item}
            for line in fp
        ]
    # Items missing from a transaction come out of the constructor as NaN;
    # notna() turns the table into real booleans without depending on
    # fillna() downcasting an object column.
    return pd.DataFrame(rows).notna()
调用测试
# Smoke test: encode the basket file with the hand-written encoder.
my_products = my_transaction_encoder('data/basket.txt')
# Bare expression so a notebook cell displays the resulting table.
my_products
自定义函数,实现k项集 -> k+1项集
def gen_candidate_sets(x):
    """Join k-itemsets into candidate (k+1)-itemsets.

    Every itemset is sorted first. Two itemsets are joined when all their
    items except the last are identical and their last items differ; the
    candidate is the shared prefix followed by the two last items in sorted
    order.

    Args:
        x: Iterable of k-itemsets, each an iterable of mutually comparable
            items (for example tuples of item names).

    Returns:
        A list of tuples, one per joinable pair, in the order the pairs are
        met when scanning ``x`` from left to right.
    """
    ordered = [sorted(itemset) for itemset in x]
    candidates = []
    for pos, left in enumerate(ordered):
        # Only look at itemsets after `left` so each pair is joined once.
        for right in ordered[pos + 1:]:
            if left[:-1] != right[:-1]:
                continue
            if left[-1] == right[-1]:
                # Same itemset twice: joining would not add an item.
                continue
            candidates.append(
                tuple(left[:-1] + sorted([right[-1], left[-1]]))
            )
    return candidates
import numpy as np
def my_find_items(df, min_support):
    """Search a one-hot transaction table for frequent itemsets.

    The search goes level by level: frequent single items first, then
    candidates one item larger built with ``gen_candidate_sets``, until at
    most one frequent itemset is left at a level. Progress is printed for
    each level.

    Args:
        df: DataFrame with one row per transaction and one column per item;
            a truthy cell means the transaction contains the item. Cells are
            converted with ``astype(bool)``, so the table should not contain
            missing values.
        min_support: Support threshold. An itemset is kept only when its
            support is strictly greater than this value.

    Returns:
        A dict mapping each frequent itemset (a tuple of column names) to
        its support, i.e. the fraction of rows containing every item of the
        itemset. Itemsets that fail the threshold are not included.
    """
    n_rows = len(df)
    if n_rows == 0:
        # No transactions: nothing can be frequent, and this avoids 0/0.
        return {}

    # Convert once to real booleans so the row-wise reduction below treats
    # every column the same way, whatever dtype the caller's table has.
    presence = df.astype(bool)

    # Level 1: keep the single items that pass the threshold.
    support_dict = {
        (item,): support
        for item, support in (presence.sum() / n_rows).to_dict().items()
        if support > min_support
    }
    items = list(support_dict)

    # Number of the current search pass.
    cn = 0
    while len(items) > 1:
        cn += 1
        print(f"\n正在进行第{cn}次搜索...")
        # Build the (k+1)-item candidates from the frequent k-itemsets.
        items = gen_candidate_sets(items)
        print(f"数目{len(items)}...")

        # A row supports a candidate when all of its columns are True.
        candidate_support = {
            cols: presence[list(cols)].all(axis=1).sum() / n_rows
            for cols in items
        }
        # Keep only the candidates that pass the threshold, both for the
        # next level and for the result; infrequent candidates must not
        # leak into the returned dictionary.
        frequent = {
            cols: support
            for cols, support in candidate_support.items()
            if support > min_support
        }
        items = list(frequent)
        support_dict.update(frequent)
    return support_dict
调用测试
# Smoke test: search my_products for itemsets, passing 0.1 as min_support.
items = my_find_items(my_products, 0.1)
# Bare expression so a notebook cell displays the returned support dictionary.
items
def my_find_rule(support_dict, confidence):
    """Derive single-consequent association rules from itemset supports.

    For every itemset with at least two items, each item in turn is taken as
    the consequent and the remaining items as the antecedent. The rule's
    confidence is support(itemset) / support(antecedent).

    Args:
        support_dict: Dict mapping itemsets (tuples of items) to their
            support. It must also contain the support of every antecedent,
            i.e. every itemset with one item removed.
        confidence: Confidence threshold. A rule is kept only when its
            confidence is strictly greater than this value.

    Returns:
        A dict keyed by ``"<antecedent tuple> => <consequent>"`` whose values
        are dicts with ``confidence`` and ``support`` entries, ordered by
        confidence and then support, highest first.

    Raises:
        KeyError: If an antecedent is missing from ``support_dict``.
    """
    result = {}
    for itemset, support in support_dict.items():
        if len(itemset) < 2:
            continue
        # Try every item of the itemset as the consequent.
        for i, consequent in enumerate(itemset):
            cond = itemset[:i] + itemset[i + 1:]
            cond_support = support_dict[cond]
            if not cond_support:
                # Confidence is undefined for an antecedent that never occurs.
                continue
            rule_confidence = support / cond_support
            if rule_confidence > confidence:
                result[f'{cond} => {consequent}'] = dict(
                    confidence=rule_confidence,
                    support=support,
                )
    # Dicts keep insertion order, so rebuilding from the sorted pairs is what
    # makes the ordering visible to the caller.
    ranked = sorted(
        result.items(),
        key=lambda kv: (kv[1]["confidence"], kv[1]["support"]),
        reverse=True,
    )
    return dict(ranked)
调用测试
# Smoke test: derive rules from the itemset supports, passing 0.5 as the
# confidence threshold; the bare call lets a notebook cell display the result.
my_find_rule(items, 0.5)
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。