赞
踩
• Last_date: 最后一次交易时间 • Recency: 最后一次交易时间距离今天的间隔天数 • Frequnece: 累计交易次数
• Amount: 累计交易金额
• Intereval: 平均两次交易之间的间隔天数(如果客户只有一次交易字段为空值)
import pandas as pd
import numpy as np
from datetime import datetime
from tqdm import tqdm
# 读取数据
def read_data(path):
data = pd.read_csv(path)
# print(data.head)
# print(data.shape)
return data
数据格式:
customer_id | trans_date | tran_amount |
---|---|---|
CS5295 | 11-Feb-13 | 35 |
CS4768 | 15-Mar-15 | 39 |
# 替换英文月份
def rep_mon(data):
data['trans_date'] = data['trans_date'].str.replace('Jan','01')
data['trans_date'] = data['trans_date'].str.replace('Feb','02')
data['trans_date'] = data['trans_date'].str.replace('Mar','03')
data['trans_date'] = data['trans_date'].str.replace('Apr','04')
data['trans_date'] = data['trans_date'].str.replace('May','05')
data['trans_date'] = data['trans_date'].str.replace('Jun','06')
data['trans_date'] = data['trans_date'].str.replace('Jul','07')
data['trans_date'] = data['trans_date'].str.replace('Aug','08')
data['trans_date'] = data['trans_date'].str.replace('Sep','09')
data['trans_date'] = data['trans_date'].str.replace('Oct','10')
data['trans_date'] = data['trans_date'].str.replace('Nov','11')
data['trans_date'] = data['trans_date'].str.replace('Dec','12')
return data
# 转换时间格式
def tran_date(data):
for i in tqdm(range(len(data))):
data['trans_date'][i] = datetime.strptime(data['trans_date'][i],"%d-%m-%y")
return data
if __name__ == '__main__':
data = read_data(path)
data = rep_mon(data)
data = tran_date(data)
# 根据用户id分组获取最后交易时间
def last_date(data):
result_date = pd.DataFrame(data.groupby('customer_id')['trans_date'].max())
result_date.columns=['last_date']
return result_date
result_data = last_date(data)
# 用今天的日期减去最后交易日期
def recency(result_data):
today = datetime.today()
result_data['recency'] = [0]*len(result_data)
for i in range(len(result_data)):
result_data['recency'][i] = (today-result_data['last_date'][i]).days
return result_data
result_data = recency(result_data)
# 根据用户id分组统计总次数,并利用id与result表连接
def frequence(data,result):
fre = data.groupby('customer_id').tran_amount.count()
result_data = result.merge(fre,how='left',left_on='customer_id',right_on='customer_id')
result_data = result_data.rename(columns={'tran_amount':'frequence'})
return result_data
result_data = frequence(data,result_data)
# 根据用户id分组统计累计交易金额,并利用id与result表连接
def amount(data_,result_):
amo = data.groupby('customer_id').tran_amount.sum()
result_data = result_.merge(amo,how='left',left_on='customer_id',right_on='customer_id')
result_data = result_data.rename(columns={'tran_amount':'amount'})
return result_data
result_data = amount(data,result_data)
# 将原始数据按照用户id分组,将用户id设置为索引
# 计算每个用户两次交易之间的间隔天数,并计算平均值
def inter(data__,result__):
data_group = data.groupby('customer_id').apply(lambda x:x[:])
data_group = data_group.drop(axis=1,columns='customer_id')
data_group = data_group.reset_index('customer_id')
data_group = data_group.set_index('customer_id')
intereval=dict()
id_index = data_group.index.unique()
for i in id_index:
if len(data_group.loc[i])==1:
intereval[i]=np.nan
else:
intereval[i] = np.mean(data_group.loc[i].sort_values('trans_date')['trans_date'].diff()).days
intereval = pd.DataFrame(intereval,index=['intereval']).T
result_data = result__.merge(intereval,how='left',left_on='customer_id',right_on=intereval.index)
return result_data
result_data = inter(data,result_data)
最后输出格式:
customer_id | last_date | recency | frequence | amount | intereval |
---|---|---|---|---|---|
CS1112 | 2015-01-14 | 2599 | 15 | 1012 | 93 |
• 对Recency字段进行分段,分为5段,用1-5编号,每一段内的人数相同,分段后的新字段名Recency_bin
• 对Frequency字段进行分段,分为5段,用1-5编号,每一段内的人数相同,分段后的新字段名Frequency_bin
• 对Amount字段进行分段,按照100为间距分段,即0-100,100-200…
分段编号用1,2,3,…自然数编号,分段后的新字段名Amount_bin
import math
data['Recency_bin'] = pd.cut(data['recency'],5,labels=[1,2,3,4,5])
data['Frequency_bin'] = pd.cut(data['frequence'],5,labels=[1,2,3,4,5])
# 获得amount列最大值,求其能被100整除的向上整数
# 利用cut分箱,划定分箱标准和label值
def amount_bin(data):
max_amo = math.ceil(data['amount'].max()/100)*100
cut_bin = [i for i in range(0,max_amo+100,100)]
cut_bin[0]=-1
label = [i for i in range(1,int(max_amo/100)+1)]
data['Amount_bin'] =pd.cut(data['amount'],cut_bin,labels=label)
return data
data = amount_bin(data)
最后输出格式:
customer_id | last_date | recency | frequence | amount | intereval | Recency_bin | Frequency_bin | Amount_bin |
---|---|---|---|---|---|---|---|---|
CS1112 | 2015-01-14 | 2599 | 15 | 1012 | 93 | 1 | 2 | 11 |
question3 = pd.DataFrame(data.groupby(by=['Recency_bin','Frequency_bin']).count()['customer_id'])
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。